first commit

2025-04-24 13:11:28 +08:00
commit ff9c54d5e4
5960 changed files with 834111 additions and 0 deletions

@@ -0,0 +1,433 @@
'use strict'
const config = require('config')
const fs = require('node:fs')
const isValidUtf8 = require('utf-8-validate')
const { ReadableString } = require('@overleaf/stream-utils')
const core = require('overleaf-editor-core')
const objectPersistor = require('@overleaf/object-persistor')
const OError = require('@overleaf/o-error')
const Blob = core.Blob
const TextOperation = core.TextOperation
const containsNonBmpChars = core.util.containsNonBmpChars
const assert = require('../assert')
const blobHash = require('../blob_hash')
const mongodb = require('../mongodb')
const persistor = require('../persistor')
const projectKey = require('../project_key')
const streams = require('../streams')
const postgresBackend = require('./postgres')
const mongoBackend = require('./mongo')
const logger = require('@overleaf/logger')
/** @import { Readable } from 'stream' */
const GLOBAL_BLOBS = new Map()
function makeGlobalKey(hash) {
return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
}
function makeProjectKey(projectId, hash) {
return `${projectKey.format(projectId)}/${hash.slice(0, 2)}/${hash.slice(2)}`
}
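// For example (illustrative hash): given the hash
// '0123456789abcdef0123456789abcdef01234567', the global key is
// '01/23/456789abcdef0123456789abcdef01234567' and the project key is
// '<formatted project key>/01/23456789abcdef0123456789abcdef01234567'.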
async function uploadBlob(projectId, blob, stream, opts = {}) {
const bucket = config.get('blobStore.projectBucket')
const key = makeProjectKey(projectId, blob.getHash())
logger.debug({ projectId, blob }, 'uploadBlob started')
try {
await persistor.sendStream(bucket, key, stream, {
contentType: 'application/octet-stream',
...opts,
})
} finally {
logger.debug({ projectId, blob }, 'uploadBlob finished')
}
}
function getBlobLocation(projectId, hash) {
if (GLOBAL_BLOBS.has(hash)) {
return {
bucket: config.get('blobStore.globalBucket'),
key: makeGlobalKey(hash),
}
} else {
return {
bucket: config.get('blobStore.projectBucket'),
key: makeProjectKey(projectId, hash),
}
}
}
/**
* Returns the appropriate backend for the given project id
*
* Numeric ids use the Postgres backend.
* Strings of 24 characters use the Mongo backend.
*/
function getBackend(projectId) {
if (assert.POSTGRES_ID_REGEXP.test(projectId)) {
return postgresBackend
} else if (assert.MONGO_ID_REGEXP.test(projectId)) {
return mongoBackend
} else {
throw new OError('bad project id', { projectId })
}
}
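// For example (illustrative ids): getBackend('123') returns the Postgres
// backend and getBackend('5f0c1f4c2a3b4c5d6e7f8091') returns the Mongo
// backend; any other id format throws 'bad project id'.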
async function makeBlobForFile(pathname) {
const { size: byteLength } = await fs.promises.stat(pathname)
const hash = await blobHash.fromStream(
byteLength,
fs.createReadStream(pathname)
)
return new Blob(hash, byteLength)
}
async function getStringLengthOfFile(byteLength, pathname) {
// We have to read the file into memory to get its UTF-8 length, so don't
// bother for files that are too large for us to edit anyway.
if (byteLength > Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
return null
}
// We also need to check whether the file contains non-BMP or null characters
let data = await fs.promises.readFile(pathname)
if (!isValidUtf8(data)) return null
data = data.toString()
if (data.length > TextOperation.MAX_STRING_LENGTH) return null
if (containsNonBmpChars(data)) return null
if (data.indexOf('\x00') !== -1) return null
return data.length
}
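// For example (illustrative): a file larger than
// Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND, a file containing a null byte, or a
// typical binary file (invalid UTF-8) all yield null, while a small UTF-8
// text file yields its JavaScript string length.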
async function deleteBlobsInBucket(projectId) {
const bucket = config.get('blobStore.projectBucket')
const prefix = `${projectKey.format(projectId)}/`
logger.debug({ projectId }, 'deleteBlobsInBucket started')
try {
await persistor.deleteDirectory(bucket, prefix)
} finally {
logger.debug({ projectId }, 'deleteBlobsInBucket finished')
}
}
async function loadGlobalBlobs() {
const blobs = await mongodb.globalBlobs.find()
for await (const blob of blobs) {
GLOBAL_BLOBS.set(blob._id, {
blob: new Blob(blob._id, blob.byteLength, blob.stringLength),
demoted: Boolean(blob.demoted),
})
}
}
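// Illustrative shape of a mongodb.globalBlobs document as consumed above
// (values are examples only):
// { _id: '<40-character hex hash>', byteLength: 42, stringLength: 42, demoted: false }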
/**
* Return metadata for all blobs in the given projects
* @param {Array<string|number>} projectIds
* @return {Promise<{nBlobs:number, blobs:Map<string,Array<core.Blob>>}>}
*/
async function getProjectBlobsBatch(projectIds) {
const mongoProjects = []
const postgresProjects = []
for (const projectId of projectIds) {
if (typeof projectId === 'number') {
postgresProjects.push(projectId)
} else {
mongoProjects.push(projectId)
}
}
const [
{ nBlobs: nBlobsPostgres, blobs: blobsPostgres },
{ nBlobs: nBlobsMongo, blobs: blobsMongo },
] = await Promise.all([
postgresBackend.getProjectBlobsBatch(postgresProjects),
mongoBackend.getProjectBlobsBatch(mongoProjects),
])
for (const [id, blobs] of blobsPostgres.entries()) {
blobsMongo.set(id.toString(), blobs)
}
return { nBlobs: nBlobsPostgres + nBlobsMongo, blobs: blobsMongo }
}
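// For example (illustrative ids): getProjectBlobsBatch([123, '5f0c1f4c2a3b4c5d6e7f8091'])
// resolves to { nBlobs, blobs } where blobs is a Map keyed by the string form
// of each project id ('123' and '5f0c1f4c2a3b4c5d6e7f8091').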
/**
* @classdesc
* Fetch and store the content of files using content-addressable hashing. The
* blob store manages both content and metadata (byte and UTF-8 length) for
* blobs.
*/
class BlobStore {
/**
* @constructor
* @param {string} projectId the project for which we'd like to find blobs
*/
constructor(projectId) {
assert.projectId(projectId)
this.projectId = projectId
this.backend = getBackend(this.projectId)
}
/**
* Set up the initial data structure for a given project
*/
async initialize() {
await this.backend.initialize(this.projectId)
}
/**
* Write a blob, if one does not already exist, with the given UTF-8 encoded
* string content.
*
* @param {string} string
* @return {Promise.<core.Blob>}
*/
async putString(string) {
assert.string(string, 'bad string')
const hash = blobHash.fromString(string)
const existingBlob = await this._findBlobBeforeInsert(hash)
if (existingBlob != null) {
return existingBlob
}
const newBlob = new Blob(hash, Buffer.byteLength(string), string.length)
// Note: the ReadableString is to work around a bug in the AWS SDK: it won't
// allow Body to be blank.
await uploadBlob(this.projectId, newBlob, new ReadableString(string))
await this.backend.insertBlob(this.projectId, newBlob)
return newBlob
}
/**
* Write a blob, if one does not already exist, with the given file (usually a
* temporary file).
*
* @param {string} pathname
* @return {Promise<core.Blob>}
*/
async putFile(pathname) {
assert.string(pathname, 'bad pathname')
const newBlob = await makeBlobForFile(pathname)
const existingBlob = await this._findBlobBeforeInsert(newBlob.getHash())
if (existingBlob != null) {
return existingBlob
}
const stringLength = await getStringLengthOfFile(
newBlob.getByteLength(),
pathname
)
newBlob.setStringLength(stringLength)
await this.putBlob(pathname, newBlob)
return newBlob
}
/**
* Write a new blob. The stringLength must have been set already, and the
* caller should have checked that the blob does not exist yet. Consider using
* {@link putFile} instead of this lower-level method.
*
* @param {string} pathname
* @param {core.Blob} finalizedBlob
* @return {Promise<void>}
*/
async putBlob(pathname, finalizedBlob) {
await uploadBlob(
this.projectId,
finalizedBlob,
fs.createReadStream(pathname)
)
await this.backend.insertBlob(this.projectId, finalizedBlob)
}
/**
* Stores an object as a JSON string in a blob.
*
* @param {object} obj
* @returns {Promise.<core.Blob>}
*/
async putObject(obj) {
assert.object(obj, 'bad object')
const string = JSON.stringify(obj)
return await this.putString(string)
}
/**
* Fetch a blob's content by its hash as a UTF-8 encoded string.
*
* @param {string} hash hexadecimal SHA-1 hash
* @return {Promise.<string>} promise for the content of the file
*/
async getString(hash) {
assert.blobHash(hash, 'bad hash')
const projectId = this.projectId
logger.debug({ projectId, hash }, 'getString started')
try {
const stream = await this.getStream(hash)
const buffer = await streams.readStreamToBuffer(stream)
return buffer.toString()
} finally {
logger.debug({ projectId, hash }, 'getString finished')
}
}
/**
* Fetch a JSON encoded blob by its hash and deserialize it.
*
* @template [T=unknown]
* @param {string} hash hexadecimal SHA-1 hash
* @return {Promise.<T>} promise for the content of the file
*/
async getObject(hash) {
assert.blobHash(hash, 'bad hash')
const projectId = this.projectId
logger.debug({ projectId, hash }, 'getObject started')
try {
const jsonString = await this.getString(hash)
const object = JSON.parse(jsonString)
return object
} catch (error) {
// Maybe this blob is gzipped. Try to gunzip it.
// TODO: Remove once we've ensured this is not reached
const stream = await this.getStream(hash)
const buffer = await streams.gunzipStreamToBuffer(stream)
const object = JSON.parse(buffer.toString())
logger.warn('getObject: Gzipped object in BlobStore')
return object
} finally {
logger.debug({ projectId, hash }, 'getObject finished')
}
}
/**
* Fetch a blob by its hash as a stream.
*
* Note that, according to the AWS SDK docs, this does not retry after initial
* failure, so the caller must be prepared to retry on errors, if appropriate.
*
* @param {string} hash hexadecimal SHA-1 hash
* @param {Object} opts
* @return {Promise.<Readable>} a stream to read the file
*/
async getStream(hash, opts = {}) {
assert.blobHash(hash, 'bad hash')
const { bucket, key } = getBlobLocation(this.projectId, hash)
try {
const stream = await persistor.getObjectStream(bucket, key, opts)
return stream
} catch (err) {
if (err instanceof objectPersistor.Errors.NotFoundError) {
throw new Blob.NotFoundError(hash)
}
throw err
}
}
/**
* Read a blob metadata record by hexadecimal hash.
*
* @param {string} hash hexadecimal SHA-1 hash
* @return {Promise<core.Blob | null>}
*/
async getBlob(hash) {
assert.blobHash(hash, 'bad hash')
const globalBlob = GLOBAL_BLOBS.get(hash)
if (globalBlob != null) {
return globalBlob.blob
}
const blob = await this.backend.findBlob(this.projectId, hash)
return blob
}
async getBlobs(hashes) {
assert.array(hashes, 'bad hashes')
const nonGlobalHashes = []
const blobs = []
for (const hash of hashes) {
const globalBlob = GLOBAL_BLOBS.get(hash)
if (globalBlob != null) {
blobs.push(globalBlob.blob)
} else {
nonGlobalHashes.push(hash)
}
}
if (nonGlobalHashes.length === 0) {
return blobs // to avoid unnecessary database lookup
}
const projectBlobs = await this.backend.findBlobs(
this.projectId,
nonGlobalHashes
)
blobs.push(...projectBlobs)
return blobs
}
/**
* Retrieve all blobs associated with the project.
* @returns {Promise<core.Blob[]>} A promise that resolves to an array of blobs.
*/
async getProjectBlobs() {
const projectBlobs = await this.backend.getProjectBlobs(this.projectId)
return projectBlobs
}
/**
* Delete all blobs that belong to the project.
*/
async deleteBlobs() {
await Promise.all([
this.backend.deleteBlobs(this.projectId),
deleteBlobsInBucket(this.projectId),
])
}
async _findBlobBeforeInsert(hash) {
const globalBlob = GLOBAL_BLOBS.get(hash)
if (globalBlob != null && !globalBlob.demoted) {
return globalBlob.blob
}
const blob = await this.backend.findBlob(this.projectId, hash)
return blob
}
/**
* Copy an existing sourceBlob in this project to a target project.
* @param {Blob} sourceBlob
* @param {string} targetProjectId
* @return {Promise<void>}
*/
async copyBlob(sourceBlob, targetProjectId) {
assert.instance(sourceBlob, Blob, 'bad sourceBlob')
assert.projectId(targetProjectId, 'bad targetProjectId')
const hash = sourceBlob.getHash()
const sourceProjectId = this.projectId
const { bucket, key: sourceKey } = getBlobLocation(sourceProjectId, hash)
const destKey = makeProjectKey(targetProjectId, hash)
const targetBackend = getBackend(targetProjectId)
logger.debug({ sourceProjectId, targetProjectId, hash }, 'copyBlob started')
try {
await persistor.copyObject(bucket, sourceKey, destKey)
await targetBackend.insertBlob(targetProjectId, sourceBlob)
} finally {
logger.debug(
{ sourceProjectId, targetProjectId, hash },
'copyBlob finished'
)
}
}
}
module.exports = {
BlobStore,
getProjectBlobsBatch,
loadGlobalBlobs,
makeProjectKey,
makeBlobForFile,
getStringLengthOfFile,
GLOBAL_BLOBS,
}
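
// Illustrative usage sketch (the project id, content and file path below are
// example values only, and error handling is omitted):
async function exampleUsage() {
  const blobStore = new BlobStore('5f0c1f4c2a3b4c5d6e7f8091')
  await blobStore.initialize()
  // Writes are deduplicated by content hash: if the blob already exists, the
  // existing metadata record is returned.
  const blob = await blobStore.putString('Hello, world!')
  // Content is addressed by its SHA-1 hash.
  const text = await blobStore.getString(blob.getHash())
  // Files on disk can be stored the same way; editable UTF-8 files also get a
  // string length recorded in their metadata.
  const fileBlob = await blobStore.putFile('/tmp/example.tex')
  return { text, fileBlob }
}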

@@ -0,0 +1,437 @@
// @ts-check
/**
* Mongo backend for the blob store.
*
* Blobs are stored in the projectHistoryBlobs collection. Each project has a
* document in that collection. That document has a "blobs" subdocument whose
* fields are buckets of blobs. The key of a bucket is the first three hex
* digits of the blob hash. The value of the bucket is an array of blobs that
* match the key.
*
* Buckets have a maximum capacity of 8 blobs. When that capacity is exceeded,
* blobs are stored in a secondary collection: the projectHistoryShardedBlobs
* collection. This collection shards blobs across 16 documents per project.
* The shard key is the first hex digit of the hash. The documents are also
* organized in buckets, but the bucket key is made of hex digits 2, 3 and 4.
*/
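
// Illustrative shape of a projectHistoryBlobs document (hash bytes and lengths
// below are example values only):
//
//   {
//     _id: ObjectId('5f0c1f4c2a3b4c5d6e7f8091'),
//     blobs: {
//       abc: [
//         { h: Binary(<20-byte SHA-1>), b: 42, s: 42 },
//         // ...at most 8 records per bucket; overflow goes to
//         // projectHistoryShardedBlobs
//       ],
//     },
//   }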
const { Blob } = require('overleaf-editor-core')
const { ObjectId, Binary, MongoError, ReadPreference } = require('mongodb')
const assert = require('../assert')
const mongodb = require('../mongodb')
const MAX_BLOBS_IN_BUCKET = 8
const DUPLICATE_KEY_ERROR_CODE = 11000
/**
* @typedef {import('mongodb').ReadPreferenceLike} ReadPreferenceLike
*/
/**
* Set up the data structures for a given project.
* @param {string} projectId
*/
async function initialize(projectId) {
assert.mongoId(projectId, 'bad projectId')
try {
await mongodb.blobs.insertOne({
_id: new ObjectId(projectId),
blobs: {},
})
} catch (err) {
if (err instanceof MongoError && err.code === DUPLICATE_KEY_ERROR_CODE) {
return // ignore already initialized case
}
throw err
}
}
/**
* Return blob metadata for the given project and hash.
* @param {string} projectId
* @param {string} hash
* @return {Promise<Blob | null>}
*/
async function findBlob(projectId, hash) {
assert.mongoId(projectId, 'bad projectId')
assert.blobHash(hash, 'bad hash')
const bucket = getBucket(hash)
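// The projection below aliases the dynamic bucket path (e.g. 'blobs.abc') to a
// fixed 'bucket' field, so the result can be read without recomputing the key.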
const result = await mongodb.blobs.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { _id: 0, bucket: `$${bucket}` } }
)
if (result?.bucket == null) {
return null
}
const record = result.bucket.find(blob => blob.h.toString('hex') === hash)
if (record == null) {
if (result.bucket.length >= MAX_BLOBS_IN_BUCKET) {
return await findBlobSharded(projectId, hash)
} else {
return null
}
}
return recordToBlob(record)
}
/**
* Search in the sharded collection for blob metadata
* @param {string} projectId
* @param {string} hash
* @return {Promise<Blob | null>}
*/
async function findBlobSharded(projectId, hash) {
const [shard, bucket] = getShardedBucket(hash)
const id = makeShardedId(projectId, shard)
const result = await mongodb.shardedBlobs.findOne(
{ _id: id },
{ projection: { _id: 0, blobs: `$${bucket}` } }
)
if (result?.blobs == null) {
return null
}
const record = result.blobs.find(blob => blob.h.toString('hex') === hash)
if (!record) return null
return recordToBlob(record)
}
/**
* Read multiple blob metadata records by hexadecimal hashes.
* @param {string} projectId
* @param {Array<string>} hashes
* @return {Promise<Array<Blob>>}
*/
async function findBlobs(projectId, hashes) {
assert.mongoId(projectId, 'bad projectId')
assert.array(hashes, 'bad hashes: not array')
hashes.forEach(function (hash) {
assert.blobHash(hash, 'bad hash')
})
// Build a set of unique buckets
const buckets = new Set(hashes.map(getBucket))
// Get buckets from Mongo
const projection = { _id: 0 }
for (const bucket of buckets) {
projection[bucket] = 1
}
const result = await mongodb.blobs.findOne(
{ _id: new ObjectId(projectId) },
{ projection }
)
if (result?.blobs == null) {
return []
}
// Build blobs from the query results
const hashSet = new Set(hashes)
const blobs = []
for (const bucket of Object.values(result.blobs)) {
for (const record of bucket) {
const hash = record.h.toString('hex')
if (hashSet.has(hash)) {
blobs.push(recordToBlob(record))
hashSet.delete(hash)
}
}
}
// If we haven't found all the blobs, look in the sharded collection
if (hashSet.size > 0) {
const shardedBlobs = await findBlobsSharded(projectId, hashSet)
blobs.push(...shardedBlobs)
}
return blobs
}
/**
* Search in the sharded collection for blob metadata.
* @param {string} projectId
* @param {Set<string>} hashSet
* @return {Promise<Array<Blob>>}
*/
async function findBlobsSharded(projectId, hashSet) {
// Build a map of buckets by shard key
const bucketsByShard = new Map()
for (const hash of hashSet) {
const [shard, bucket] = getShardedBucket(hash)
let buckets = bucketsByShard.get(shard)
if (buckets == null) {
buckets = new Set()
bucketsByShard.set(shard, buckets)
}
buckets.add(bucket)
}
// Make parallel requests to the shards that might contain the hashes we want
const requests = []
for (const [shard, buckets] of bucketsByShard.entries()) {
const id = makeShardedId(projectId, shard)
const projection = { _id: 0 }
for (const bucket of buckets) {
projection[bucket] = 1
}
const request = mongodb.shardedBlobs.findOne({ _id: id }, { projection })
requests.push(request)
}
const results = await Promise.all(requests)
// Build blobs from the query results
const blobs = []
for (const result of results) {
if (result?.blobs == null) {
continue
}
for (const bucket of Object.values(result.blobs)) {
for (const record of bucket) {
const hash = record.h.toString('hex')
if (hashSet.has(hash)) {
blobs.push(recordToBlob(record))
}
}
}
}
return blobs
}
/**
* Return metadata for all blobs in the given project
*/
async function getProjectBlobs(projectId) {
assert.mongoId(projectId, 'bad projectId')
const result = await mongodb.blobs.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { _id: 0 } }
)
if (!result) {
return []
}
// Build blobs from the query results
const blobs = []
for (const bucket of Object.values(result.blobs)) {
for (const record of bucket) {
blobs.push(recordToBlob(record))
}
}
// Look for all possible sharded blobs
const minShardedId = makeShardedId(projectId, '0')
const maxShardedId = makeShardedId(projectId, 'f')
// @ts-ignore We are using a custom _id here.
const shardedRecords = mongodb.shardedBlobs.find(
{
_id: { $gte: minShardedId, $lte: maxShardedId },
},
{ projection: { _id: 0 } }
)
for await (const shardedRecord of shardedRecords) {
if (shardedRecord.blobs == null) {
continue
}
for (const bucket of Object.values(shardedRecord.blobs)) {
for (const record of bucket) {
blobs.push(recordToBlob(record))
}
}
}
return blobs
}
/**
* Return metadata for all blobs in the given projects
* @param {Array<string>} projectIds
* @return {Promise<{ nBlobs: number, blobs: Map<string, Array<Blob>> }>}
*/
async function getProjectBlobsBatch(projectIds) {
for (const project of projectIds) {
assert.mongoId(project, 'bad projectId')
}
let nBlobs = 0
const blobs = new Map()
if (projectIds.length === 0) return { nBlobs, blobs }
// blobs
{
const cursor = await mongodb.blobs.find(
{ _id: { $in: projectIds.map(projectId => new ObjectId(projectId)) } },
{ readPreference: ReadPreference.secondaryPreferred }
)
for await (const record of cursor) {
const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
blobs.set(record._id.toString(), projectBlobs)
nBlobs += projectBlobs.length
}
}
// sharded blobs
{
// @ts-ignore We are using a custom _id here.
const cursor = await mongodb.shardedBlobs.find(
{
_id: {
$gte: makeShardedId(projectIds[0], '0'),
$lte: makeShardedId(projectIds[projectIds.length - 1], 'f'),
},
},
{ readPreference: ReadPreference.secondaryPreferred }
)
for await (const record of cursor) {
const recordIdHex = record._id.toString('hex')
const recordProjectId = recordIdHex.slice(0, 24)
const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
const found = blobs.get(recordProjectId)
if (found) {
found.push(...projectBlobs)
} else {
blobs.set(recordProjectId, projectBlobs)
}
nBlobs += projectBlobs.length
}
}
return { nBlobs, blobs }
}
/**
* Add a blob's metadata to the blobs collection after it has been uploaded.
* @param {string} projectId
* @param {Blob} blob
*/
async function insertBlob(projectId, blob) {
assert.mongoId(projectId, 'bad projectId')
const hash = blob.getHash()
const bucket = getBucket(hash)
const record = blobToRecord(blob)
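// Try the primary collection first, but only when the target bucket still has
// room (fewer than MAX_BLOBS_IN_BUCKET records). If the bucket is full,
// matchedCount is 0 and the record is inserted into the sharded collection.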
const result = await mongodb.blobs.updateOne(
{
_id: new ObjectId(projectId),
$expr: {
$lt: [{ $size: { $ifNull: [`$${bucket}`, []] } }, MAX_BLOBS_IN_BUCKET],
},
},
{
$addToSet: { [bucket]: record },
}
)
if (result.matchedCount === 0) {
await insertRecordSharded(projectId, hash, record)
}
}
/**
* Add a blob's metadata to the sharded blobs collection.
* @param {string} projectId
* @param {string} hash
* @param {Record} record
* @return {Promise<void>}
*/
async function insertRecordSharded(projectId, hash, record) {
const [shard, bucket] = getShardedBucket(hash)
const id = makeShardedId(projectId, shard)
await mongodb.shardedBlobs.updateOne(
{ _id: id },
{ $addToSet: { [bucket]: record } },
{ upsert: true }
)
}
/**
* Delete all blobs for a given project.
* @param {string} projectId
*/
async function deleteBlobs(projectId) {
assert.mongoId(projectId, 'bad projectId')
await mongodb.blobs.deleteOne({ _id: new ObjectId(projectId) })
const minShardedId = makeShardedId(projectId, '0')
const maxShardedId = makeShardedId(projectId, 'f')
await mongodb.shardedBlobs.deleteMany({
// @ts-ignore We are using a custom _id here.
_id: { $gte: minShardedId, $lte: maxShardedId },
})
}
/**
* Return the Mongo path to the bucket for the given hash.
* @param {string} hash
* @return {string}
*/
function getBucket(hash) {
return `blobs.${hash.slice(0, 3)}`
}
/**
* Return the shard key and Mongo path to the bucket for the given hash in the
* sharded collection.
* @param {string} hash
* @return {[string, string]}
*/
function getShardedBucket(hash) {
const shard = hash.slice(0, 1)
const bucket = `blobs.${hash.slice(1, 4)}`
return [shard, bucket]
}
/**
* Create an _id key for the sharded collection.
* @param {string} projectId
* @param {string} shard
* @return {Binary}
*/
function makeShardedId(projectId, shard) {
return new Binary(Buffer.from(`${projectId}0${shard}`, 'hex'))
}
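
// Worked example (illustrative values): for the hash
// 'abc1230000000000000000000000000000000000' and the project id
// '5f0c1f4c2a3b4c5d6e7f8091':
//   getBucket(hash)               -> 'blobs.abc'
//   getShardedBucket(hash)        -> ['a', 'blobs.bc1']
//   makeShardedId(projectId, 'a') -> Binary of the 13 bytes encoded by the hex
//                                    string '5f0c1f4c2a3b4c5d6e7f80910a'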
/**
* @typedef {Object} Record
* @property {Binary} h
* @property {number} b
* @property {number} [s]
*/
/**
* Return the Mongo record for the given blob.
* @param {Blob} blob
* @return {Record}
*/
function blobToRecord(blob) {
const hash = blob.getHash()
const byteLength = blob.getByteLength()
const stringLength = blob.getStringLength()
return {
h: new Binary(Buffer.from(hash, 'hex')),
b: byteLength,
s: stringLength,
}
}
/**
* Create a blob from the given Mongo record.
* @param {Record} record
* @return {Blob}
*/
function recordToBlob(record) {
return new Blob(record.h.toString('hex'), record.b, record.s)
}
module.exports = {
initialize,
findBlob,
findBlobs,
getProjectBlobs,
getProjectBlobsBatch,
insertBlob,
deleteBlobs,
}

@@ -0,0 +1,161 @@
const { Blob } = require('overleaf-editor-core')
const assert = require('../assert')
const knex = require('../knex')
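
// Assumed schema of the project_blobs table (inferred from the queries below;
// not defined in this file): project_id (integer), hash_bytes (20-byte binary
// SHA-1), byte_length (integer) and string_length (nullable integer), with a
// unique or primary key on (project_id, hash_bytes) backing the
// onConflict(...).ignore() in insertBlob.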
/**
* Set up the initial data structures for a project
*/
async function initialize(projectId) {
// Nothing to do for Postgres
}
/**
* Return blob metadata for the given project and hash
*/
async function findBlob(projectId, hash) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
assert.blobHash(hash, 'bad hash')
const binaryHash = hashToBuffer(hash)
const record = await knex('project_blobs')
.select('hash_bytes', 'byte_length', 'string_length')
.where({
project_id: projectId,
hash_bytes: binaryHash,
})
.first()
return recordToBlob(record)
}
/**
* Read multiple blob metadata records by hexadecimal hashes.
*
* @param {Array.<string>} hashes hexadecimal SHA-1 hashes
* @return {Promise.<Array.<Blob?>>} no guarantee on order
*/
async function findBlobs(projectId, hashes) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
assert.array(hashes, 'bad hashes: not array')
hashes.forEach(function (hash) {
assert.blobHash(hash, 'bad hash')
})
const binaryHashes = hashes.map(hashToBuffer)
const records = await knex('project_blobs')
.select('hash_bytes', 'byte_length', 'string_length')
.where('project_id', projectId)
.whereIn('hash_bytes', binaryHashes)
const blobs = records.map(recordToBlob)
return blobs
}
/**
* Return metadata for all blobs in the given project
*/
async function getProjectBlobs(projectId) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
const records = await knex('project_blobs')
.select('hash_bytes', 'byte_length', 'string_length')
.where({
project_id: projectId,
})
const blobs = records.map(recordToBlob)
return blobs
}
/**
* Return metadata for all blobs in the given projects
* @param {Array<number>} projectIds
* @return {Promise<{ nBlobs: number, blobs: Map<number, Array<Blob>> }>}
*/
async function getProjectBlobsBatch(projectIds) {
for (const projectId of projectIds) {
assert.integer(projectId, 'bad projectId')
}
let nBlobs = 0
const blobs = new Map()
if (projectIds.length === 0) return { nBlobs, blobs }
const cursor = knex('project_blobs')
.select('project_id', 'hash_bytes', 'byte_length', 'string_length')
.whereIn('project_id', projectIds)
.stream()
for await (const record of cursor) {
const found = blobs.get(record.project_id)
if (found) {
found.push(recordToBlob(record))
} else {
blobs.set(record.project_id, [recordToBlob(record)])
}
nBlobs++
}
return { nBlobs, blobs }
}
/**
* Add a blob's metadata to the blobs table after it has been uploaded.
*/
async function insertBlob(projectId, blob) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
await knex('project_blobs')
.insert(blobToRecord(projectId, blob))
.onConflict(['project_id', 'hash_bytes'])
.ignore()
}
/**
* Delete all blobs for a given project
*/
async function deleteBlobs(projectId) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
await knex('project_blobs').where('project_id', projectId).delete()
}
function blobToRecord(projectId, blob) {
return {
project_id: projectId,
hash_bytes: hashToBuffer(blob.hash),
byte_length: blob.getByteLength(),
string_length: blob.getStringLength(),
}
}
function recordToBlob(record) {
if (!record) return
return new Blob(
hashFromBuffer(record.hash_bytes),
record.byte_length,
record.string_length
)
}
function hashToBuffer(hash) {
if (!hash) return
return Buffer.from(hash, 'hex')
}
function hashFromBuffer(buffer) {
if (!buffer) return
return buffer.toString('hex')
}
module.exports = {
initialize,
findBlob,
findBlobs,
getProjectBlobs,
getProjectBlobsBatch,
insertBlob,
deleteBlobs,
}