first commit
services/history-v1/storage/lib/blob_store/index.js (new file, 433 lines)
@@ -0,0 +1,433 @@
'use strict'

const config = require('config')
const fs = require('node:fs')
const isValidUtf8 = require('utf-8-validate')
const { ReadableString } = require('@overleaf/stream-utils')

const core = require('overleaf-editor-core')
const objectPersistor = require('@overleaf/object-persistor')
const OError = require('@overleaf/o-error')
const Blob = core.Blob
const TextOperation = core.TextOperation
const containsNonBmpChars = core.util.containsNonBmpChars

const assert = require('../assert')
const blobHash = require('../blob_hash')
const mongodb = require('../mongodb')
const persistor = require('../persistor')
const projectKey = require('../project_key')
const streams = require('../streams')
const postgresBackend = require('./postgres')
const mongoBackend = require('./mongo')
const logger = require('@overleaf/logger')

/** @import { Readable } from 'stream' */

const GLOBAL_BLOBS = new Map()

function makeGlobalKey(hash) {
  return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
}

function makeProjectKey(projectId, hash) {
  return `${projectKey.format(projectId)}/${hash.slice(0, 2)}/${hash.slice(2)}`
}
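// Illustrative example (not part of the original file): for a hash 'abc123...',
// makeGlobalKey splits it as 'ab/c1/23...', while makeProjectKey produces
// '<formatted project id>/ab/c123...'; the project prefix comes from
// projectKey.format (defined in ../project_key).
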
async function uploadBlob(projectId, blob, stream, opts = {}) {
  const bucket = config.get('blobStore.projectBucket')
  const key = makeProjectKey(projectId, blob.getHash())
  logger.debug({ projectId, blob }, 'uploadBlob started')
  try {
    await persistor.sendStream(bucket, key, stream, {
      contentType: 'application/octet-stream',
      ...opts,
    })
  } finally {
    logger.debug({ projectId, blob }, 'uploadBlob finished')
  }
}

function getBlobLocation(projectId, hash) {
  if (GLOBAL_BLOBS.has(hash)) {
    return {
      bucket: config.get('blobStore.globalBucket'),
      key: makeGlobalKey(hash),
    }
  } else {
    return {
      bucket: config.get('blobStore.projectBucket'),
      key: makeProjectKey(projectId, hash),
    }
  }
}

/**
 * Returns the appropriate backend for the given project id
 *
 * Numeric ids use the Postgres backend.
 * Strings of 24 characters use the Mongo backend.
 */
function getBackend(projectId) {
  if (assert.POSTGRES_ID_REGEXP.test(projectId)) {
    return postgresBackend
  } else if (assert.MONGO_ID_REGEXP.test(projectId)) {
    return mongoBackend
  } else {
    throw new OError('bad project id', { projectId })
  }
}
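// Illustrative example (per the doc comment above, ids are only assumptions):
// getBackend('123') selects the Postgres backend, while
// getBackend('5f2b4c1e9d3a7b0012345678') (a 24-character hex id) selects the
// Mongo backend; any other value throws OError('bad project id').
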
async function makeBlobForFile(pathname) {
  const { size: byteLength } = await fs.promises.stat(pathname)
  const hash = await blobHash.fromStream(
    byteLength,
    fs.createReadStream(pathname)
  )
  return new Blob(hash, byteLength)
}

async function getStringLengthOfFile(byteLength, pathname) {
  // We have to read the file into memory to get its UTF-8 length, so don't
  // bother for files that are too large for us to edit anyway.
  if (byteLength > Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
    return null
  }

  // We need to check if the file contains non-BMP or null characters
  let data = await fs.promises.readFile(pathname)
  if (!isValidUtf8(data)) return null
  data = data.toString()
  if (data.length > TextOperation.MAX_STRING_LENGTH) return null
  if (containsNonBmpChars(data)) return null
  if (data.indexOf('\x00') !== -1) return null
  return data.length
}

async function deleteBlobsInBucket(projectId) {
  const bucket = config.get('blobStore.projectBucket')
  const prefix = `${projectKey.format(projectId)}/`
  logger.debug({ projectId }, 'deleteBlobsInBucket started')
  try {
    await persistor.deleteDirectory(bucket, prefix)
  } finally {
    logger.debug({ projectId }, 'deleteBlobsInBucket finished')
  }
}

async function loadGlobalBlobs() {
  const blobs = await mongodb.globalBlobs.find()
  for await (const blob of blobs) {
    GLOBAL_BLOBS.set(blob._id, {
      blob: new Blob(blob._id, blob.byteLength, blob.stringLength),
      demoted: Boolean(blob.demoted),
    })
  }
}

/**
 * Return metadata for all blobs in the given projects
 * @param {Array<string|number>} projectIds
 * @return {Promise<{nBlobs:number, blobs:Map<string,Array<core.Blob>>}>}
 */
async function getProjectBlobsBatch(projectIds) {
  const mongoProjects = []
  const postgresProjects = []
  for (const projectId of projectIds) {
    if (typeof projectId === 'number') {
      postgresProjects.push(projectId)
    } else {
      mongoProjects.push(projectId)
    }
  }
  const [
    { nBlobs: nBlobsPostgres, blobs: blobsPostgres },
    { nBlobs: nBlobsMongo, blobs: blobsMongo },
  ] = await Promise.all([
    postgresBackend.getProjectBlobsBatch(postgresProjects),
    mongoBackend.getProjectBlobsBatch(mongoProjects),
  ])
  for (const [id, blobs] of blobsPostgres.entries()) {
    blobsMongo.set(id.toString(), blobs)
  }
  return { nBlobs: nBlobsPostgres + nBlobsMongo, blobs: blobsMongo }
}

/**
 * @classdesc
 * Fetch and store the content of files using content-addressable hashing. The
 * blob store manages both content and metadata (byte and UTF-8 length) for
 * blobs.
 */
class BlobStore {
  /**
   * @constructor
   * @param {string} projectId the project for which we'd like to find blobs
   */
  constructor(projectId) {
    assert.projectId(projectId)
    this.projectId = projectId
    this.backend = getBackend(this.projectId)
  }

  /**
   * Set up the initial data structure for a given project
   */
  async initialize() {
    await this.backend.initialize(this.projectId)
  }

  /**
   * Write a blob, if one does not already exist, with the given UTF-8 encoded
   * string content.
   *
   * @param {string} string
   * @return {Promise.<core.Blob>}
   */
  async putString(string) {
    assert.string(string, 'bad string')
    const hash = blobHash.fromString(string)

    const existingBlob = await this._findBlobBeforeInsert(hash)
    if (existingBlob != null) {
      return existingBlob
    }
    const newBlob = new Blob(hash, Buffer.byteLength(string), string.length)
    // Note: the ReadableString is to work around a bug in the AWS SDK: it won't
    // allow Body to be blank.
    await uploadBlob(this.projectId, newBlob, new ReadableString(string))
    await this.backend.insertBlob(this.projectId, newBlob)
    return newBlob
  }

  /**
   * Write a blob, if one does not already exist, with the given file (usually a
   * temporary file).
   *
   * @param {string} pathname
   * @return {Promise<core.Blob>}
   */
  async putFile(pathname) {
    assert.string(pathname, 'bad pathname')
    const newBlob = await makeBlobForFile(pathname)
    const existingBlob = await this._findBlobBeforeInsert(newBlob.getHash())
    if (existingBlob != null) {
      return existingBlob
    }
    const stringLength = await getStringLengthOfFile(
      newBlob.getByteLength(),
      pathname
    )
    newBlob.setStringLength(stringLength)
    await this.putBlob(pathname, newBlob)
    return newBlob
  }

  /**
   * Write a new blob; the stringLength must have been added already. It should
   * have been checked that the blob does not exist yet. Consider using
   * {@link putFile} instead of this lower-level method.
   *
   * @param {string} pathname
   * @param {core.Blob} finalizedBlob
   * @return {Promise<void>}
   */
  async putBlob(pathname, finalizedBlob) {
    await uploadBlob(
      this.projectId,
      finalizedBlob,
      fs.createReadStream(pathname)
    )
    await this.backend.insertBlob(this.projectId, finalizedBlob)
  }

  /**
   * Stores an object as a JSON string in a blob.
   *
   * @param {object} obj
   * @returns {Promise.<core.Blob>}
   */
  async putObject(obj) {
    assert.object(obj, 'bad object')
    const string = JSON.stringify(obj)
    return await this.putString(string)
  }

  /**
   * Fetch a blob's content by its hash as a UTF-8 encoded string.
   *
   * @param {string} hash hexadecimal SHA-1 hash
   * @return {Promise.<string>} promise for the content of the file
   */
  async getString(hash) {
    assert.blobHash(hash, 'bad hash')

    const projectId = this.projectId
    logger.debug({ projectId, hash }, 'getString started')
    try {
      const stream = await this.getStream(hash)
      const buffer = await streams.readStreamToBuffer(stream)
      return buffer.toString()
    } finally {
      logger.debug({ projectId, hash }, 'getString finished')
    }
  }

  /**
   * Fetch a JSON encoded blob by its hash and deserialize it.
   *
   * @template [T=unknown]
   * @param {string} hash hexadecimal SHA-1 hash
   * @return {Promise.<T>} promise for the content of the file
   */
  async getObject(hash) {
    assert.blobHash(hash, 'bad hash')
    const projectId = this.projectId
    logger.debug({ projectId, hash }, 'getObject started')
    try {
      const jsonString = await this.getString(hash)
      const object = JSON.parse(jsonString)
      return object
    } catch (error) {
      // Maybe this blob is gzipped. Try to gunzip it.
      // TODO: Remove once we've ensured this is not reached
      const stream = await this.getStream(hash)
      const buffer = await streams.gunzipStreamToBuffer(stream)
      const object = JSON.parse(buffer.toString())
      logger.warn('getObject: Gzipped object in BlobStore')
      return object
    } finally {
      logger.debug({ projectId, hash }, 'getObject finished')
    }
  }

  /**
   * Fetch a blob by its hash as a stream.
   *
   * Note that, according to the AWS SDK docs, this does not retry after initial
   * failure, so the caller must be prepared to retry on errors, if appropriate.
   *
   * @param {string} hash hexadecimal SHA-1 hash
   * @param {Object} opts
   * @return {Promise.<Readable>} a stream to read the file
   */
  async getStream(hash, opts = {}) {
    assert.blobHash(hash, 'bad hash')

    const { bucket, key } = getBlobLocation(this.projectId, hash)
    try {
      const stream = await persistor.getObjectStream(bucket, key, opts)
      return stream
    } catch (err) {
      if (err instanceof objectPersistor.Errors.NotFoundError) {
        throw new Blob.NotFoundError(hash)
      }
      throw err
    }
  }

  /**
   * Read a blob metadata record by hexadecimal hash.
   *
   * @param {string} hash hexadecimal SHA-1 hash
   * @return {Promise<core.Blob | null>}
   */
  async getBlob(hash) {
    assert.blobHash(hash, 'bad hash')
    const globalBlob = GLOBAL_BLOBS.get(hash)
    if (globalBlob != null) {
      return globalBlob.blob
    }
    const blob = await this.backend.findBlob(this.projectId, hash)
    return blob
  }

  async getBlobs(hashes) {
    assert.array(hashes, 'bad hashes')
    const nonGlobalHashes = []
    const blobs = []
    for (const hash of hashes) {
      const globalBlob = GLOBAL_BLOBS.get(hash)
      if (globalBlob != null) {
        blobs.push(globalBlob.blob)
      } else {
        nonGlobalHashes.push(hash)
      }
    }
    if (nonGlobalHashes.length === 0) {
      return blobs // to avoid unnecessary database lookup
    }
    const projectBlobs = await this.backend.findBlobs(
      this.projectId,
      nonGlobalHashes
    )
    blobs.push(...projectBlobs)
    return blobs
  }

  /**
   * Retrieve all blobs associated with the project.
   * @returns {Promise<core.Blob[]>} A promise that resolves to an array of blobs.
   */
  async getProjectBlobs() {
    const projectBlobs = await this.backend.getProjectBlobs(this.projectId)
    return projectBlobs
  }

  /**
   * Delete all blobs that belong to the project.
   */
  async deleteBlobs() {
    await Promise.all([
      this.backend.deleteBlobs(this.projectId),
      deleteBlobsInBucket(this.projectId),
    ])
  }

  async _findBlobBeforeInsert(hash) {
    const globalBlob = GLOBAL_BLOBS.get(hash)
    if (globalBlob != null && !globalBlob.demoted) {
      return globalBlob.blob
    }
    const blob = await this.backend.findBlob(this.projectId, hash)
    return blob
  }

  /**
   * Copy an existing sourceBlob in this project to a target project.
   * @param {Blob} sourceBlob
   * @param {string} targetProjectId
   * @return {Promise<void>}
   */
  async copyBlob(sourceBlob, targetProjectId) {
    assert.instance(sourceBlob, Blob, 'bad sourceBlob')
    assert.projectId(targetProjectId, 'bad targetProjectId')
    const hash = sourceBlob.getHash()
    const sourceProjectId = this.projectId
    const { bucket, key: sourceKey } = getBlobLocation(sourceProjectId, hash)
    const destKey = makeProjectKey(targetProjectId, hash)
    const targetBackend = getBackend(targetProjectId)
    logger.debug({ sourceProjectId, targetProjectId, hash }, 'copyBlob started')
    try {
      await persistor.copyObject(bucket, sourceKey, destKey)
      await targetBackend.insertBlob(targetProjectId, sourceBlob)
    } finally {
      logger.debug(
        { sourceProjectId, targetProjectId, hash },
        'copyBlob finished'
      )
    }
  }
}

module.exports = {
  BlobStore,
  getProjectBlobsBatch,
  loadGlobalBlobs,
  makeProjectKey,
  makeBlobForFile,
  getStringLengthOfFile,
  GLOBAL_BLOBS,
}
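// Usage sketch (illustrative, not part of this commit): assuming this module
// is required as ./blob_store, the persistor and backends are configured, and
// projectId is a valid Mongo or Postgres project id. Writes are deduplicated
// by SHA-1 hash, so putting identical content twice uploads it only once.
//
//   const { BlobStore, loadGlobalBlobs } = require('./blob_store')
//   await loadGlobalBlobs()
//   const blobStore = new BlobStore(projectId)
//   const blob = await blobStore.putString('hello world')
//   const text = await blobStore.getString(blob.getHash())
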
services/history-v1/storage/lib/blob_store/mongo.js (new file, 437 lines)
@@ -0,0 +1,437 @@
// @ts-check
/**
 * Mongo backend for the blob store.
 *
 * Blobs are stored in the projectHistoryBlobs collection. Each project has a
 * document in that collection. That document has a "blobs" subdocument whose
 * fields are buckets of blobs. The key of a bucket is the first three hex
 * digits of the blob hash. The value of the bucket is an array of blobs that
 * match the key.
 *
 * Buckets have a maximum capacity of 8 blobs. When that capacity is exceeded,
 * blobs are stored in a secondary collection: the projectHistoryShardedBlobs
 * collection. This collection shards blobs between 16 documents per project.
 * The shard key is the first hex digit of the hash. The documents are also
 * organized in buckets, but the bucket key is made of hex digits 2, 3 and 4.
 */
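// Worked example (illustrative values, derived from the scheme above): for a
// blob with hash 'a1b2c3...' in project '5f2b4c1e9d3a7b0012345678':
// - primary location: the projectHistoryBlobs document for that project, under
//   the bucket path 'blobs.a1b' (first three hex digits of the hash);
// - if that bucket already holds MAX_BLOBS_IN_BUCKET (8) records, the blob
//   goes to the projectHistoryShardedBlobs document whose _id is the binary
//   value of '5f2b4c1e9d3a7b00123456780a' (project id + '0' + shard 'a'),
//   under the bucket path 'blobs.1b2' (hex digits 2, 3 and 4 of the hash).
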
const { Blob } = require('overleaf-editor-core')
const { ObjectId, Binary, MongoError, ReadPreference } = require('mongodb')
const assert = require('../assert')
const mongodb = require('../mongodb')

const MAX_BLOBS_IN_BUCKET = 8
const DUPLICATE_KEY_ERROR_CODE = 11000

/**
 * @typedef {import('mongodb').ReadPreferenceLike} ReadPreferenceLike
 */

/**
 * Set up the data structures for a given project.
 * @param {string} projectId
 */
async function initialize(projectId) {
  assert.mongoId(projectId, 'bad projectId')
  try {
    await mongodb.blobs.insertOne({
      _id: new ObjectId(projectId),
      blobs: {},
    })
  } catch (err) {
    if (err instanceof MongoError && err.code === DUPLICATE_KEY_ERROR_CODE) {
      return // ignore already initialized case
    }
    throw err
  }
}

/**
 * Return blob metadata for the given project and hash.
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<Blob | null>}
 */
async function findBlob(projectId, hash) {
  assert.mongoId(projectId, 'bad projectId')
  assert.blobHash(hash, 'bad hash')

  const bucket = getBucket(hash)
  const result = await mongodb.blobs.findOne(
    { _id: new ObjectId(projectId) },
    { projection: { _id: 0, bucket: `$${bucket}` } }
  )

  if (result?.bucket == null) {
    return null
  }

  const record = result.bucket.find(blob => blob.h.toString('hex') === hash)
  if (record == null) {
    if (result.bucket.length >= MAX_BLOBS_IN_BUCKET) {
      return await findBlobSharded(projectId, hash)
    } else {
      return null
    }
  }
  return recordToBlob(record)
}

/**
 * Search in the sharded collection for blob metadata
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<Blob | null>}
 */
async function findBlobSharded(projectId, hash) {
  const [shard, bucket] = getShardedBucket(hash)
  const id = makeShardedId(projectId, shard)
  const result = await mongodb.shardedBlobs.findOne(
    { _id: id },
    { projection: { _id: 0, blobs: `$${bucket}` } }
  )
  if (result?.blobs == null) {
    return null
  }
  const record = result.blobs.find(blob => blob.h.toString('hex') === hash)
  if (!record) return null
  return recordToBlob(record)
}

/**
 * Read multiple blob metadata records by hexadecimal hashes.
 * @param {string} projectId
 * @param {Array<string>} hashes
 * @return {Promise<Array<Blob>>}
 */
async function findBlobs(projectId, hashes) {
  assert.mongoId(projectId, 'bad projectId')
  assert.array(hashes, 'bad hashes: not array')
  hashes.forEach(function (hash) {
    assert.blobHash(hash, 'bad hash')
  })

  // Build a set of unique buckets
  const buckets = new Set(hashes.map(getBucket))

  // Get buckets from Mongo
  const projection = { _id: 0 }
  for (const bucket of buckets) {
    projection[bucket] = 1
  }
  const result = await mongodb.blobs.findOne(
    { _id: new ObjectId(projectId) },
    { projection }
  )

  if (result?.blobs == null) {
    return []
  }

  // Build blobs from the query results
  const hashSet = new Set(hashes)
  const blobs = []
  for (const bucket of Object.values(result.blobs)) {
    for (const record of bucket) {
      const hash = record.h.toString('hex')
      if (hashSet.has(hash)) {
        blobs.push(recordToBlob(record))
        hashSet.delete(hash)
      }
    }
  }

  // If we haven't found all the blobs, look in the sharded collection
  if (hashSet.size > 0) {
    const shardedBlobs = await findBlobsSharded(projectId, hashSet)
    blobs.push(...shardedBlobs)
  }

  return blobs
}

/**
 * Search in the sharded collection for blob metadata.
 * @param {string} projectId
 * @param {Set<string>} hashSet
 * @return {Promise<Array<Blob>>}
 */
async function findBlobsSharded(projectId, hashSet) {
  // Build a map of buckets by shard key
  const bucketsByShard = new Map()
  for (const hash of hashSet) {
    const [shard, bucket] = getShardedBucket(hash)
    let buckets = bucketsByShard.get(shard)
    if (buckets == null) {
      buckets = new Set()
      bucketsByShard.set(shard, buckets)
    }
    buckets.add(bucket)
  }

  // Make parallel requests to the shards that might contain the hashes we want
  const requests = []
  for (const [shard, buckets] of bucketsByShard.entries()) {
    const id = makeShardedId(projectId, shard)
    const projection = { _id: 0 }
    for (const bucket of buckets) {
      projection[bucket] = 1
    }
    const request = mongodb.shardedBlobs.findOne({ _id: id }, { projection })
    requests.push(request)
  }
  const results = await Promise.all(requests)

  // Build blobs from the query results
  const blobs = []
  for (const result of results) {
    if (result?.blobs == null) {
      continue
    }

    for (const bucket of Object.values(result.blobs)) {
      for (const record of bucket) {
        const hash = record.h.toString('hex')
        if (hashSet.has(hash)) {
          blobs.push(recordToBlob(record))
        }
      }
    }
  }
  return blobs
}

/**
 * Return metadata for all blobs in the given project
 */
async function getProjectBlobs(projectId) {
  assert.mongoId(projectId, 'bad projectId')

  const result = await mongodb.blobs.findOne(
    { _id: new ObjectId(projectId) },
    { projection: { _id: 0 } }
  )

  if (!result) {
    return []
  }

  // Build blobs from the query results
  const blobs = []
  for (const bucket of Object.values(result.blobs)) {
    for (const record of bucket) {
      blobs.push(recordToBlob(record))
    }
  }

  // Look for all possible sharded blobs

  const minShardedId = makeShardedId(projectId, '0')
  const maxShardedId = makeShardedId(projectId, 'f')
  // @ts-ignore We are using a custom _id here.
  const shardedRecords = mongodb.shardedBlobs.find(
    {
      _id: { $gte: minShardedId, $lte: maxShardedId },
    },
    { projection: { _id: 0 } }
  )

  for await (const shardedRecord of shardedRecords) {
    if (shardedRecord.blobs == null) {
      continue
    }
    for (const bucket of Object.values(shardedRecord.blobs)) {
      for (const record of bucket) {
        blobs.push(recordToBlob(record))
      }
    }
  }

  return blobs
}

/**
 * Return metadata for all blobs in the given projects
 * @param {Array<string>} projectIds
 * @return {Promise<{ nBlobs: number, blobs: Map<string, Array<Blob>> }>}
 */
async function getProjectBlobsBatch(projectIds) {
  for (const project of projectIds) {
    assert.mongoId(project, 'bad projectId')
  }
  let nBlobs = 0
  const blobs = new Map()
  if (projectIds.length === 0) return { nBlobs, blobs }

  // blobs
  {
    const cursor = await mongodb.blobs.find(
      { _id: { $in: projectIds.map(projectId => new ObjectId(projectId)) } },
      { readPreference: ReadPreference.secondaryPreferred }
    )
    for await (const record of cursor) {
      const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
      blobs.set(record._id.toString(), projectBlobs)
      nBlobs += projectBlobs.length
    }
  }

  // sharded blobs
  {
    // @ts-ignore We are using a custom _id here.
    const cursor = await mongodb.shardedBlobs.find(
      {
        _id: {
          $gte: makeShardedId(projectIds[0], '0'),
          $lte: makeShardedId(projectIds[projectIds.length - 1], 'f'),
        },
      },
      { readPreference: ReadPreference.secondaryPreferred }
    )
    for await (const record of cursor) {
      const recordIdHex = record._id.toString('hex')
      const recordProjectId = recordIdHex.slice(0, 24)
      const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
      const found = blobs.get(recordProjectId)
      if (found) {
        found.push(...projectBlobs)
      } else {
        blobs.set(recordProjectId, projectBlobs)
      }
      nBlobs += projectBlobs.length
    }
  }
  return { nBlobs, blobs }
}

/**
 * Add a blob's metadata to the blobs collection after it has been uploaded.
 * @param {string} projectId
 * @param {Blob} blob
 */
async function insertBlob(projectId, blob) {
  assert.mongoId(projectId, 'bad projectId')
  const hash = blob.getHash()
  const bucket = getBucket(hash)
  const record = blobToRecord(blob)
  const result = await mongodb.blobs.updateOne(
    {
      _id: new ObjectId(projectId),
      $expr: {
        $lt: [{ $size: { $ifNull: [`$${bucket}`, []] } }, MAX_BLOBS_IN_BUCKET],
      },
    },
    {
      $addToSet: { [bucket]: record },
    }
  )

  if (result.matchedCount === 0) {
    await insertRecordSharded(projectId, hash, record)
  }
}

/**
 * Add a blob's metadata to the sharded blobs collection.
 * @param {string} projectId
 * @param {string} hash
 * @param {Record} record
 * @return {Promise<void>}
 */
async function insertRecordSharded(projectId, hash, record) {
  const [shard, bucket] = getShardedBucket(hash)
  const id = makeShardedId(projectId, shard)
  await mongodb.shardedBlobs.updateOne(
    { _id: id },
    { $addToSet: { [bucket]: record } },
    { upsert: true }
  )
}

/**
 * Delete all blobs for a given project.
 * @param {string} projectId
 */
async function deleteBlobs(projectId) {
  assert.mongoId(projectId, 'bad projectId')
  await mongodb.blobs.deleteOne({ _id: new ObjectId(projectId) })
  const minShardedId = makeShardedId(projectId, '0')
  const maxShardedId = makeShardedId(projectId, 'f')
  await mongodb.shardedBlobs.deleteMany({
    // @ts-ignore We are using a custom _id here.
    _id: { $gte: minShardedId, $lte: maxShardedId },
  })
}

/**
 * Return the Mongo path to the bucket for the given hash.
 * @param {string} hash
 * @return {string}
 */
function getBucket(hash) {
  return `blobs.${hash.slice(0, 3)}`
}

/**
 * Return the shard key and Mongo path to the bucket for the given hash in the
 * sharded collection.
 * @param {string} hash
 * @return {[string, string]}
 */
function getShardedBucket(hash) {
  const shard = hash.slice(0, 1)
  const bucket = `blobs.${hash.slice(1, 4)}`
  return [shard, bucket]
}

/**
 * Create an _id key for the sharded collection.
 * @param {string} projectId
 * @param {string} shard
 * @return {Binary}
 */
function makeShardedId(projectId, shard) {
  return new Binary(Buffer.from(`${projectId}0${shard}`, 'hex'))
}

/**
 * @typedef {Object} Record
 * @property {Binary} h
 * @property {number} b
 * @property {number} [s]
 */

/**
 * Return the Mongo record for the given blob.
 * @param {Blob} blob
 * @return {Record}
 */
function blobToRecord(blob) {
  const hash = blob.getHash()
  const byteLength = blob.getByteLength()
  const stringLength = blob.getStringLength()
  return {
    h: new Binary(Buffer.from(hash, 'hex')),
    b: byteLength,
    s: stringLength,
  }
}

/**
 * Create a blob from the given Mongo record.
 * @param {Record} record
 * @return {Blob}
 */
function recordToBlob(record) {
  return new Blob(record.h.toString('hex'), record.b, record.s)
}

module.exports = {
  initialize,
  findBlob,
  findBlobs,
  getProjectBlobs,
  getProjectBlobsBatch,
  insertBlob,
  deleteBlobs,
}
services/history-v1/storage/lib/blob_store/postgres.js (new file, 161 lines)
@@ -0,0 +1,161 @@
const { Blob } = require('overleaf-editor-core')
const assert = require('../assert')
const knex = require('../knex')

/**
 * Set up the initial data structures for a project
 */
async function initialize(projectId) {
  // Nothing to do for Postgres
}

/**
 * Return blob metadata for the given project and hash
 */
async function findBlob(projectId, hash) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)
  assert.blobHash(hash, 'bad hash')

  const binaryHash = hashToBuffer(hash)
  const record = await knex('project_blobs')
    .select('hash_bytes', 'byte_length', 'string_length')
    .where({
      project_id: projectId,
      hash_bytes: binaryHash,
    })
    .first()
  return recordToBlob(record)
}

/**
 * Read multiple blob metadata records by hexadecimal hashes.
 *
 * @param {Array.<string>} hashes hexadecimal SHA-1 hashes
 * @return {Promise.<Array.<Blob?>>} no guarantee on order
 */
async function findBlobs(projectId, hashes) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)
  assert.array(hashes, 'bad hashes: not array')
  hashes.forEach(function (hash) {
    assert.blobHash(hash, 'bad hash')
  })

  const binaryHashes = hashes.map(hashToBuffer)

  const records = await knex('project_blobs')
    .select('hash_bytes', 'byte_length', 'string_length')
    .where('project_id', projectId)
    .whereIn('hash_bytes', binaryHashes)

  const blobs = records.map(recordToBlob)
  return blobs
}

/**
 * Return metadata for all blobs in the given project
 */
async function getProjectBlobs(projectId) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)

  const records = await knex('project_blobs')
    .select('hash_bytes', 'byte_length', 'string_length')
    .where({
      project_id: projectId,
    })

  const blobs = records.map(recordToBlob)
  return blobs
}

/**
 * Return metadata for all blobs in the given projects
 * @param {Array<number>} projectIds
 * @return {Promise<{ nBlobs: number, blobs: Map<number, Array<Blob>> }>}
 */
async function getProjectBlobsBatch(projectIds) {
  for (const projectId of projectIds) {
    assert.integer(projectId, 'bad projectId')
  }
  let nBlobs = 0
  const blobs = new Map()
  if (projectIds.length === 0) return { nBlobs, blobs }

  const cursor = knex('project_blobs')
    .select('project_id', 'hash_bytes', 'byte_length', 'string_length')
    .whereIn('project_id', projectIds)
    .stream()
  for await (const record of cursor) {
    const found = blobs.get(record.project_id)
    if (found) {
      found.push(recordToBlob(record))
    } else {
      blobs.set(record.project_id, [recordToBlob(record)])
    }
    nBlobs++
  }
  return { nBlobs, blobs }
}

/**
 * Add a blob's metadata to the blobs table after it has been uploaded.
 */
async function insertBlob(projectId, blob) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)

  await knex('project_blobs')
    .insert(blobToRecord(projectId, blob))
    .onConflict(['project_id', 'hash_bytes'])
    .ignore()
}

/**
 * Delete all blobs for a given project
 */
async function deleteBlobs(projectId) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)

  await knex('project_blobs').where('project_id', projectId).delete()
}

function blobToRecord(projectId, blob) {
  return {
    project_id: projectId,
    hash_bytes: hashToBuffer(blob.hash),
    byte_length: blob.getByteLength(),
    string_length: blob.getStringLength(),
  }
}

function recordToBlob(record) {
  if (!record) return
  return new Blob(
    hashFromBuffer(record.hash_bytes),
    record.byte_length,
    record.string_length
  )
}

function hashToBuffer(hash) {
  if (!hash) return
  return Buffer.from(hash, 'hex')
}

function hashFromBuffer(buffer) {
  if (!buffer) return
  return buffer.toString('hex')
}

module.exports = {
  initialize,
  findBlob,
  findBlobs,
  getProjectBlobs,
  getProjectBlobsBatch,
  insertBlob,
  deleteBlobs,
}
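// Assumed schema (illustrative, not part of this commit): the queries above
// expect a project_blobs table along these lines, created by a separate
// migration; exact column types may differ in the real schema.
//
//   CREATE TABLE project_blobs (
//     project_id    integer NOT NULL,
//     hash_bytes    bytea   NOT NULL,
//     byte_length   integer NOT NULL,
//     string_length integer,
//     PRIMARY KEY (project_id, hash_bytes)
//   );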