first commit
services/history-v1/storage/lib/blob_store/index.js (new file, 433 lines)
@@ -0,0 +1,433 @@
'use strict'

const config = require('config')
const fs = require('node:fs')
const isValidUtf8 = require('utf-8-validate')
const { ReadableString } = require('@overleaf/stream-utils')

const core = require('overleaf-editor-core')
const objectPersistor = require('@overleaf/object-persistor')
const OError = require('@overleaf/o-error')
const Blob = core.Blob
const TextOperation = core.TextOperation
const containsNonBmpChars = core.util.containsNonBmpChars

const assert = require('../assert')
const blobHash = require('../blob_hash')
const mongodb = require('../mongodb')
const persistor = require('../persistor')
const projectKey = require('../project_key')
const streams = require('../streams')
const postgresBackend = require('./postgres')
const mongoBackend = require('./mongo')
const logger = require('@overleaf/logger')

/** @import { Readable } from 'stream' */

const GLOBAL_BLOBS = new Map()

function makeGlobalKey(hash) {
  return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
}

function makeProjectKey(projectId, hash) {
  return `${projectKey.format(projectId)}/${hash.slice(0, 2)}/${hash.slice(2)}`
}
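// Illustrative example (not part of the original file): for a hash 'abc123...',
// makeGlobalKey splits it as 'ab/c1/23...', while makeProjectKey produces
// '<formatted project id>/ab/c123...'; the project prefix comes from
// projectKey.format (defined in ../project_key).
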
async function uploadBlob(projectId, blob, stream, opts = {}) {
  const bucket = config.get('blobStore.projectBucket')
  const key = makeProjectKey(projectId, blob.getHash())
  logger.debug({ projectId, blob }, 'uploadBlob started')
  try {
    await persistor.sendStream(bucket, key, stream, {
      contentType: 'application/octet-stream',
      ...opts,
    })
  } finally {
    logger.debug({ projectId, blob }, 'uploadBlob finished')
  }
}

function getBlobLocation(projectId, hash) {
  if (GLOBAL_BLOBS.has(hash)) {
    return {
      bucket: config.get('blobStore.globalBucket'),
      key: makeGlobalKey(hash),
    }
  } else {
    return {
      bucket: config.get('blobStore.projectBucket'),
      key: makeProjectKey(projectId, hash),
    }
  }
}

/**
 * Returns the appropriate backend for the given project id
 *
 * Numeric ids use the Postgres backend.
 * Strings of 24 characters use the Mongo backend.
 */
function getBackend(projectId) {
  if (assert.POSTGRES_ID_REGEXP.test(projectId)) {
    return postgresBackend
  } else if (assert.MONGO_ID_REGEXP.test(projectId)) {
    return mongoBackend
  } else {
    throw new OError('bad project id', { projectId })
  }
}
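// Illustrative example (per the doc comment above, ids are only assumptions):
// getBackend('123') selects the Postgres backend, while
// getBackend('5f2b4c1e9d3a7b0012345678') (a 24-character hex id) selects the
// Mongo backend; any other value throws OError('bad project id').
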
async function makeBlobForFile(pathname) {
  const { size: byteLength } = await fs.promises.stat(pathname)
  const hash = await blobHash.fromStream(
    byteLength,
    fs.createReadStream(pathname)
  )
  return new Blob(hash, byteLength)
}

async function getStringLengthOfFile(byteLength, pathname) {
  // We have to read the file into memory to get its UTF-8 length, so don't
  // bother for files that are too large for us to edit anyway.
  if (byteLength > Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
    return null
  }

  // We need to check if the file contains non-BMP or null characters
  let data = await fs.promises.readFile(pathname)
  if (!isValidUtf8(data)) return null
  data = data.toString()
  if (data.length > TextOperation.MAX_STRING_LENGTH) return null
  if (containsNonBmpChars(data)) return null
  if (data.indexOf('\x00') !== -1) return null
  return data.length
}

async function deleteBlobsInBucket(projectId) {
  const bucket = config.get('blobStore.projectBucket')
  const prefix = `${projectKey.format(projectId)}/`
  logger.debug({ projectId }, 'deleteBlobsInBucket started')
  try {
    await persistor.deleteDirectory(bucket, prefix)
  } finally {
    logger.debug({ projectId }, 'deleteBlobsInBucket finished')
  }
}

async function loadGlobalBlobs() {
  const blobs = await mongodb.globalBlobs.find()
  for await (const blob of blobs) {
    GLOBAL_BLOBS.set(blob._id, {
      blob: new Blob(blob._id, blob.byteLength, blob.stringLength),
      demoted: Boolean(blob.demoted),
    })
  }
}

/**
 * Return metadata for all blobs in the given projects
 * @param {Array<string|number>} projectIds
 * @return {Promise<{nBlobs:number, blobs:Map<string,Array<core.Blob>>}>}
 */
async function getProjectBlobsBatch(projectIds) {
  const mongoProjects = []
  const postgresProjects = []
  for (const projectId of projectIds) {
    if (typeof projectId === 'number') {
      postgresProjects.push(projectId)
    } else {
      mongoProjects.push(projectId)
    }
  }
  const [
    { nBlobs: nBlobsPostgres, blobs: blobsPostgres },
    { nBlobs: nBlobsMongo, blobs: blobsMongo },
  ] = await Promise.all([
    postgresBackend.getProjectBlobsBatch(postgresProjects),
    mongoBackend.getProjectBlobsBatch(mongoProjects),
  ])
  for (const [id, blobs] of blobsPostgres.entries()) {
    blobsMongo.set(id.toString(), blobs)
  }
  return { nBlobs: nBlobsPostgres + nBlobsMongo, blobs: blobsMongo }
}

/**
 * @classdesc
 * Fetch and store the content of files using content-addressable hashing. The
 * blob store manages both content and metadata (byte and UTF-8 length) for
 * blobs.
 */
class BlobStore {
  /**
   * @constructor
   * @param {string} projectId the project for which we'd like to find blobs
   */
  constructor(projectId) {
    assert.projectId(projectId)
    this.projectId = projectId
    this.backend = getBackend(this.projectId)
  }

  /**
   * Set up the initial data structure for a given project
   */
  async initialize() {
    await this.backend.initialize(this.projectId)
  }

  /**
   * Write a blob, if one does not already exist, with the given UTF-8 encoded
   * string content.
   *
   * @param {string} string
   * @return {Promise.<core.Blob>}
   */
  async putString(string) {
    assert.string(string, 'bad string')
    const hash = blobHash.fromString(string)

    const existingBlob = await this._findBlobBeforeInsert(hash)
    if (existingBlob != null) {
      return existingBlob
    }
    const newBlob = new Blob(hash, Buffer.byteLength(string), string.length)
    // Note: the ReadableString is to work around a bug in the AWS SDK: it won't
    // allow Body to be blank.
    await uploadBlob(this.projectId, newBlob, new ReadableString(string))
    await this.backend.insertBlob(this.projectId, newBlob)
    return newBlob
  }

  /**
   * Write a blob, if one does not already exist, with the given file (usually a
   * temporary file).
   *
   * @param {string} pathname
   * @return {Promise<core.Blob>}
   */
  async putFile(pathname) {
    assert.string(pathname, 'bad pathname')
    const newBlob = await makeBlobForFile(pathname)
    const existingBlob = await this._findBlobBeforeInsert(newBlob.getHash())
    if (existingBlob != null) {
      return existingBlob
    }
    const stringLength = await getStringLengthOfFile(
      newBlob.getByteLength(),
      pathname
    )
    newBlob.setStringLength(stringLength)
    await this.putBlob(pathname, newBlob)
    return newBlob
  }

  /**
   * Write a new blob; the stringLength must have been added already. It should
   * have been checked that the blob does not exist yet. Consider using
   * {@link putFile} instead of this lower-level method.
   *
   * @param {string} pathname
   * @param {core.Blob} finalizedBlob
   * @return {Promise<void>}
   */
  async putBlob(pathname, finalizedBlob) {
    await uploadBlob(
      this.projectId,
      finalizedBlob,
      fs.createReadStream(pathname)
    )
    await this.backend.insertBlob(this.projectId, finalizedBlob)
  }

  /**
   * Stores an object as a JSON string in a blob.
   *
   * @param {object} obj
   * @returns {Promise.<core.Blob>}
   */
  async putObject(obj) {
    assert.object(obj, 'bad object')
    const string = JSON.stringify(obj)
    return await this.putString(string)
  }

  /**
   * Fetch a blob's content by its hash as a UTF-8 encoded string.
   *
   * @param {string} hash hexadecimal SHA-1 hash
   * @return {Promise.<string>} promise for the content of the file
   */
  async getString(hash) {
    assert.blobHash(hash, 'bad hash')

    const projectId = this.projectId
    logger.debug({ projectId, hash }, 'getString started')
    try {
      const stream = await this.getStream(hash)
      const buffer = await streams.readStreamToBuffer(stream)
      return buffer.toString()
    } finally {
      logger.debug({ projectId, hash }, 'getString finished')
    }
  }

  /**
   * Fetch a JSON encoded blob by its hash and deserialize it.
   *
   * @template [T=unknown]
   * @param {string} hash hexadecimal SHA-1 hash
   * @return {Promise.<T>} promise for the content of the file
   */
  async getObject(hash) {
    assert.blobHash(hash, 'bad hash')
    const projectId = this.projectId
    logger.debug({ projectId, hash }, 'getObject started')
    try {
      const jsonString = await this.getString(hash)
      const object = JSON.parse(jsonString)
      return object
    } catch (error) {
      // Maybe this blob is gzipped. Try to gunzip it.
      // TODO: Remove once we've ensured this is not reached
      const stream = await this.getStream(hash)
      const buffer = await streams.gunzipStreamToBuffer(stream)
      const object = JSON.parse(buffer.toString())
      logger.warn('getObject: Gzipped object in BlobStore')
      return object
    } finally {
      logger.debug({ projectId, hash }, 'getObject finished')
    }
  }

  /**
   * Fetch a blob by its hash as a stream.
   *
   * Note that, according to the AWS SDK docs, this does not retry after initial
   * failure, so the caller must be prepared to retry on errors, if appropriate.
   *
   * @param {string} hash hexadecimal SHA-1 hash
   * @param {Object} opts
   * @return {Promise.<Readable>} a stream to read the file
   */
  async getStream(hash, opts = {}) {
    assert.blobHash(hash, 'bad hash')

    const { bucket, key } = getBlobLocation(this.projectId, hash)
    try {
      const stream = await persistor.getObjectStream(bucket, key, opts)
      return stream
    } catch (err) {
      if (err instanceof objectPersistor.Errors.NotFoundError) {
        throw new Blob.NotFoundError(hash)
      }
      throw err
    }
  }

  /**
   * Read a blob metadata record by hexadecimal hash.
   *
   * @param {string} hash hexadecimal SHA-1 hash
   * @return {Promise<core.Blob | null>}
   */
  async getBlob(hash) {
    assert.blobHash(hash, 'bad hash')
    const globalBlob = GLOBAL_BLOBS.get(hash)
    if (globalBlob != null) {
      return globalBlob.blob
    }
    const blob = await this.backend.findBlob(this.projectId, hash)
    return blob
  }

  async getBlobs(hashes) {
    assert.array(hashes, 'bad hashes')
    const nonGlobalHashes = []
    const blobs = []
    for (const hash of hashes) {
      const globalBlob = GLOBAL_BLOBS.get(hash)
      if (globalBlob != null) {
        blobs.push(globalBlob.blob)
      } else {
        nonGlobalHashes.push(hash)
      }
    }
    if (nonGlobalHashes.length === 0) {
      return blobs // to avoid unnecessary database lookup
    }
    const projectBlobs = await this.backend.findBlobs(
      this.projectId,
      nonGlobalHashes
    )
    blobs.push(...projectBlobs)
    return blobs
  }

  /**
   * Retrieve all blobs associated with the project.
   * @returns {Promise<core.Blob[]>} A promise that resolves to an array of blobs.
   */
  async getProjectBlobs() {
    const projectBlobs = await this.backend.getProjectBlobs(this.projectId)
    return projectBlobs
  }

  /**
   * Delete all blobs that belong to the project.
   */
  async deleteBlobs() {
    await Promise.all([
      this.backend.deleteBlobs(this.projectId),
      deleteBlobsInBucket(this.projectId),
    ])
  }

  async _findBlobBeforeInsert(hash) {
    const globalBlob = GLOBAL_BLOBS.get(hash)
    if (globalBlob != null && !globalBlob.demoted) {
      return globalBlob.blob
    }
    const blob = await this.backend.findBlob(this.projectId, hash)
    return blob
  }

  /**
   * Copy an existing sourceBlob in this project to a target project.
   * @param {Blob} sourceBlob
   * @param {string} targetProjectId
   * @return {Promise<void>}
   */
  async copyBlob(sourceBlob, targetProjectId) {
    assert.instance(sourceBlob, Blob, 'bad sourceBlob')
    assert.projectId(targetProjectId, 'bad targetProjectId')
    const hash = sourceBlob.getHash()
    const sourceProjectId = this.projectId
    const { bucket, key: sourceKey } = getBlobLocation(sourceProjectId, hash)
    const destKey = makeProjectKey(targetProjectId, hash)
    const targetBackend = getBackend(targetProjectId)
    logger.debug({ sourceProjectId, targetProjectId, hash }, 'copyBlob started')
    try {
      await persistor.copyObject(bucket, sourceKey, destKey)
      await targetBackend.insertBlob(targetProjectId, sourceBlob)
    } finally {
      logger.debug(
        { sourceProjectId, targetProjectId, hash },
        'copyBlob finished'
      )
    }
  }
}

module.exports = {
  BlobStore,
  getProjectBlobsBatch,
  loadGlobalBlobs,
  makeProjectKey,
  makeBlobForFile,
  getStringLengthOfFile,
  GLOBAL_BLOBS,
}
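// Usage sketch (illustrative, not part of this commit): assuming this module
// is required as ./blob_store, the persistor and backends are configured, and
// projectId is a valid Mongo or Postgres project id. Writes are deduplicated
// by SHA-1 hash, so putting identical content twice uploads it only once.
//
//   const { BlobStore, loadGlobalBlobs } = require('./blob_store')
//   await loadGlobalBlobs()
//   const blobStore = new BlobStore(projectId)
//   const blob = await blobStore.putString('hello world')
//   const text = await blobStore.getString(blob.getHash())
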
services/history-v1/storage/lib/blob_store/mongo.js (new file, 437 lines)
@@ -0,0 +1,437 @@
// @ts-check
/**
 * Mongo backend for the blob store.
 *
 * Blobs are stored in the projectHistoryBlobs collection. Each project has a
 * document in that collection. That document has a "blobs" subdocument whose
 * fields are buckets of blobs. The key of a bucket is the first three hex
 * digits of the blob hash. The value of the bucket is an array of blobs that
 * match the key.
 *
 * Buckets have a maximum capacity of 8 blobs. When that capacity is exceeded,
 * blobs are stored in a secondary collection: the projectHistoryShardedBlobs
 * collection. This collection shards blobs between 16 documents per project.
 * The shard key is the first hex digit of the hash. The documents are also
 * organized in buckets, but the bucket key is made of hex digits 2, 3 and 4.
 */
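// Worked example (illustrative values, derived from the scheme above): for a
// blob with hash 'a1b2c3...' in project '5f2b4c1e9d3a7b0012345678':
// - primary location: the projectHistoryBlobs document for that project, under
//   the bucket path 'blobs.a1b' (first three hex digits of the hash);
// - if that bucket already holds MAX_BLOBS_IN_BUCKET (8) records, the blob
//   goes to the projectHistoryShardedBlobs document whose _id is the binary
//   value of '5f2b4c1e9d3a7b00123456780a' (project id + '0' + shard 'a'),
//   under the bucket path 'blobs.1b2' (hex digits 2, 3 and 4 of the hash).
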
const { Blob } = require('overleaf-editor-core')
const { ObjectId, Binary, MongoError, ReadPreference } = require('mongodb')
const assert = require('../assert')
const mongodb = require('../mongodb')

const MAX_BLOBS_IN_BUCKET = 8
const DUPLICATE_KEY_ERROR_CODE = 11000

/**
 * @typedef {import('mongodb').ReadPreferenceLike} ReadPreferenceLike
 */

/**
 * Set up the data structures for a given project.
 * @param {string} projectId
 */
async function initialize(projectId) {
  assert.mongoId(projectId, 'bad projectId')
  try {
    await mongodb.blobs.insertOne({
      _id: new ObjectId(projectId),
      blobs: {},
    })
  } catch (err) {
    if (err instanceof MongoError && err.code === DUPLICATE_KEY_ERROR_CODE) {
      return // ignore already initialized case
    }
    throw err
  }
}

/**
 * Return blob metadata for the given project and hash.
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<Blob | null>}
 */
async function findBlob(projectId, hash) {
  assert.mongoId(projectId, 'bad projectId')
  assert.blobHash(hash, 'bad hash')

  const bucket = getBucket(hash)
  const result = await mongodb.blobs.findOne(
    { _id: new ObjectId(projectId) },
    { projection: { _id: 0, bucket: `$${bucket}` } }
  )

  if (result?.bucket == null) {
    return null
  }

  const record = result.bucket.find(blob => blob.h.toString('hex') === hash)
  if (record == null) {
    if (result.bucket.length >= MAX_BLOBS_IN_BUCKET) {
      return await findBlobSharded(projectId, hash)
    } else {
      return null
    }
  }
  return recordToBlob(record)
}

/**
 * Search in the sharded collection for blob metadata
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<Blob | null>}
 */
async function findBlobSharded(projectId, hash) {
  const [shard, bucket] = getShardedBucket(hash)
  const id = makeShardedId(projectId, shard)
  const result = await mongodb.shardedBlobs.findOne(
    { _id: id },
    { projection: { _id: 0, blobs: `$${bucket}` } }
  )
  if (result?.blobs == null) {
    return null
  }
  const record = result.blobs.find(blob => blob.h.toString('hex') === hash)
  if (!record) return null
  return recordToBlob(record)
}

/**
 * Read multiple blob metadata records by hexadecimal hashes.
 * @param {string} projectId
 * @param {Array<string>} hashes
 * @return {Promise<Array<Blob>>}
 */
async function findBlobs(projectId, hashes) {
  assert.mongoId(projectId, 'bad projectId')
  assert.array(hashes, 'bad hashes: not array')
  hashes.forEach(function (hash) {
    assert.blobHash(hash, 'bad hash')
  })

  // Build a set of unique buckets
  const buckets = new Set(hashes.map(getBucket))

  // Get buckets from Mongo
  const projection = { _id: 0 }
  for (const bucket of buckets) {
    projection[bucket] = 1
  }
  const result = await mongodb.blobs.findOne(
    { _id: new ObjectId(projectId) },
    { projection }
  )

  if (result?.blobs == null) {
    return []
  }

  // Build blobs from the query results
  const hashSet = new Set(hashes)
  const blobs = []
  for (const bucket of Object.values(result.blobs)) {
    for (const record of bucket) {
      const hash = record.h.toString('hex')
      if (hashSet.has(hash)) {
        blobs.push(recordToBlob(record))
        hashSet.delete(hash)
      }
    }
  }

  // If we haven't found all the blobs, look in the sharded collection
  if (hashSet.size > 0) {
    const shardedBlobs = await findBlobsSharded(projectId, hashSet)
    blobs.push(...shardedBlobs)
  }

  return blobs
}

/**
 * Search in the sharded collection for blob metadata.
 * @param {string} projectId
 * @param {Set<string>} hashSet
 * @return {Promise<Array<Blob>>}
 */
async function findBlobsSharded(projectId, hashSet) {
  // Build a map of buckets by shard key
  const bucketsByShard = new Map()
  for (const hash of hashSet) {
    const [shard, bucket] = getShardedBucket(hash)
    let buckets = bucketsByShard.get(shard)
    if (buckets == null) {
      buckets = new Set()
      bucketsByShard.set(shard, buckets)
    }
    buckets.add(bucket)
  }

  // Make parallel requests to the shards that might contain the hashes we want
  const requests = []
  for (const [shard, buckets] of bucketsByShard.entries()) {
    const id = makeShardedId(projectId, shard)
    const projection = { _id: 0 }
    for (const bucket of buckets) {
      projection[bucket] = 1
    }
    const request = mongodb.shardedBlobs.findOne({ _id: id }, { projection })
    requests.push(request)
  }
  const results = await Promise.all(requests)

  // Build blobs from the query results
  const blobs = []
  for (const result of results) {
    if (result?.blobs == null) {
      continue
    }

    for (const bucket of Object.values(result.blobs)) {
      for (const record of bucket) {
        const hash = record.h.toString('hex')
        if (hashSet.has(hash)) {
          blobs.push(recordToBlob(record))
        }
      }
    }
  }
  return blobs
}

/**
 * Return metadata for all blobs in the given project
 */
async function getProjectBlobs(projectId) {
  assert.mongoId(projectId, 'bad projectId')

  const result = await mongodb.blobs.findOne(
    { _id: new ObjectId(projectId) },
    { projection: { _id: 0 } }
  )

  if (!result) {
    return []
  }

  // Build blobs from the query results
  const blobs = []
  for (const bucket of Object.values(result.blobs)) {
    for (const record of bucket) {
      blobs.push(recordToBlob(record))
    }
  }

  // Look for all possible sharded blobs

  const minShardedId = makeShardedId(projectId, '0')
  const maxShardedId = makeShardedId(projectId, 'f')
  // @ts-ignore We are using a custom _id here.
  const shardedRecords = mongodb.shardedBlobs.find(
    {
      _id: { $gte: minShardedId, $lte: maxShardedId },
    },
    { projection: { _id: 0 } }
  )

  for await (const shardedRecord of shardedRecords) {
    if (shardedRecord.blobs == null) {
      continue
    }
    for (const bucket of Object.values(shardedRecord.blobs)) {
      for (const record of bucket) {
        blobs.push(recordToBlob(record))
      }
    }
  }

  return blobs
}

/**
 * Return metadata for all blobs in the given projects
 * @param {Array<string>} projectIds
 * @return {Promise<{ nBlobs: number, blobs: Map<string, Array<Blob>> }>}
 */
async function getProjectBlobsBatch(projectIds) {
  for (const project of projectIds) {
    assert.mongoId(project, 'bad projectId')
  }
  let nBlobs = 0
  const blobs = new Map()
  if (projectIds.length === 0) return { nBlobs, blobs }

  // blobs
  {
    const cursor = await mongodb.blobs.find(
      { _id: { $in: projectIds.map(projectId => new ObjectId(projectId)) } },
      { readPreference: ReadPreference.secondaryPreferred }
    )
    for await (const record of cursor) {
      const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
      blobs.set(record._id.toString(), projectBlobs)
      nBlobs += projectBlobs.length
    }
  }

  // sharded blobs
  {
    // @ts-ignore We are using a custom _id here.
    const cursor = await mongodb.shardedBlobs.find(
      {
        _id: {
          $gte: makeShardedId(projectIds[0], '0'),
          $lte: makeShardedId(projectIds[projectIds.length - 1], 'f'),
        },
      },
      { readPreference: ReadPreference.secondaryPreferred }
    )
    for await (const record of cursor) {
      const recordIdHex = record._id.toString('hex')
      const recordProjectId = recordIdHex.slice(0, 24)
      const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
      const found = blobs.get(recordProjectId)
      if (found) {
        found.push(...projectBlobs)
      } else {
        blobs.set(recordProjectId, projectBlobs)
      }
      nBlobs += projectBlobs.length
    }
  }
  return { nBlobs, blobs }
}

/**
 * Add a blob's metadata to the blobs collection after it has been uploaded.
 * @param {string} projectId
 * @param {Blob} blob
 */
async function insertBlob(projectId, blob) {
  assert.mongoId(projectId, 'bad projectId')
  const hash = blob.getHash()
  const bucket = getBucket(hash)
  const record = blobToRecord(blob)
  const result = await mongodb.blobs.updateOne(
    {
      _id: new ObjectId(projectId),
      $expr: {
        $lt: [{ $size: { $ifNull: [`$${bucket}`, []] } }, MAX_BLOBS_IN_BUCKET],
      },
    },
    {
      $addToSet: { [bucket]: record },
    }
  )

  if (result.matchedCount === 0) {
    await insertRecordSharded(projectId, hash, record)
  }
}

/**
 * Add a blob's metadata to the sharded blobs collection.
 * @param {string} projectId
 * @param {string} hash
 * @param {Record} record
 * @return {Promise<void>}
 */
async function insertRecordSharded(projectId, hash, record) {
  const [shard, bucket] = getShardedBucket(hash)
  const id = makeShardedId(projectId, shard)
  await mongodb.shardedBlobs.updateOne(
    { _id: id },
    { $addToSet: { [bucket]: record } },
    { upsert: true }
  )
}

/**
 * Delete all blobs for a given project.
 * @param {string} projectId
 */
async function deleteBlobs(projectId) {
  assert.mongoId(projectId, 'bad projectId')
  await mongodb.blobs.deleteOne({ _id: new ObjectId(projectId) })
  const minShardedId = makeShardedId(projectId, '0')
  const maxShardedId = makeShardedId(projectId, 'f')
  await mongodb.shardedBlobs.deleteMany({
    // @ts-ignore We are using a custom _id here.
    _id: { $gte: minShardedId, $lte: maxShardedId },
  })
}

/**
 * Return the Mongo path to the bucket for the given hash.
 * @param {string} hash
 * @return {string}
 */
function getBucket(hash) {
  return `blobs.${hash.slice(0, 3)}`
}

/**
 * Return the shard key and Mongo path to the bucket for the given hash in the
 * sharded collection.
 * @param {string} hash
 * @return {[string, string]}
 */
function getShardedBucket(hash) {
  const shard = hash.slice(0, 1)
  const bucket = `blobs.${hash.slice(1, 4)}`
  return [shard, bucket]
}

/**
 * Create an _id key for the sharded collection.
 * @param {string} projectId
 * @param {string} shard
 * @return {Binary}
 */
function makeShardedId(projectId, shard) {
  return new Binary(Buffer.from(`${projectId}0${shard}`, 'hex'))
}

/**
 * @typedef {Object} Record
 * @property {Binary} h
 * @property {number} b
 * @property {number} [s]
 */

/**
 * Return the Mongo record for the given blob.
 * @param {Blob} blob
 * @return {Record}
 */
function blobToRecord(blob) {
  const hash = blob.getHash()
  const byteLength = blob.getByteLength()
  const stringLength = blob.getStringLength()
  return {
    h: new Binary(Buffer.from(hash, 'hex')),
    b: byteLength,
    s: stringLength,
  }
}

/**
 * Create a blob from the given Mongo record.
 * @param {Record} record
 * @return {Blob}
 */
function recordToBlob(record) {
  return new Blob(record.h.toString('hex'), record.b, record.s)
}

module.exports = {
  initialize,
  findBlob,
  findBlobs,
  getProjectBlobs,
  getProjectBlobsBatch,
  insertBlob,
  deleteBlobs,
}
services/history-v1/storage/lib/blob_store/postgres.js (new file, 161 lines)
@@ -0,0 +1,161 @@
const { Blob } = require('overleaf-editor-core')
const assert = require('../assert')
const knex = require('../knex')

/**
 * Set up the initial data structures for a project
 */
async function initialize(projectId) {
  // Nothing to do for Postgres
}

/**
 * Return blob metadata for the given project and hash
 */
async function findBlob(projectId, hash) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)
  assert.blobHash(hash, 'bad hash')

  const binaryHash = hashToBuffer(hash)
  const record = await knex('project_blobs')
    .select('hash_bytes', 'byte_length', 'string_length')
    .where({
      project_id: projectId,
      hash_bytes: binaryHash,
    })
    .first()
  return recordToBlob(record)
}

/**
 * Read multiple blob metadata records by hexadecimal hashes.
 *
 * @param {Array.<string>} hashes hexadecimal SHA-1 hashes
 * @return {Promise.<Array.<Blob?>>} no guarantee on order
 */
async function findBlobs(projectId, hashes) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)
  assert.array(hashes, 'bad hashes: not array')
  hashes.forEach(function (hash) {
    assert.blobHash(hash, 'bad hash')
  })

  const binaryHashes = hashes.map(hashToBuffer)

  const records = await knex('project_blobs')
    .select('hash_bytes', 'byte_length', 'string_length')
    .where('project_id', projectId)
    .whereIn('hash_bytes', binaryHashes)

  const blobs = records.map(recordToBlob)
  return blobs
}

/**
 * Return metadata for all blobs in the given project
 */
async function getProjectBlobs(projectId) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)

  const records = await knex('project_blobs')
    .select('hash_bytes', 'byte_length', 'string_length')
    .where({
      project_id: projectId,
    })

  const blobs = records.map(recordToBlob)
  return blobs
}

/**
 * Return metadata for all blobs in the given projects
 * @param {Array<number>} projectIds
 * @return {Promise<{ nBlobs: number, blobs: Map<number, Array<Blob>> }>}
 */
async function getProjectBlobsBatch(projectIds) {
  for (const projectId of projectIds) {
    assert.integer(projectId, 'bad projectId')
  }
  let nBlobs = 0
  const blobs = new Map()
  if (projectIds.length === 0) return { nBlobs, blobs }

  const cursor = knex('project_blobs')
    .select('project_id', 'hash_bytes', 'byte_length', 'string_length')
    .whereIn('project_id', projectIds)
    .stream()
  for await (const record of cursor) {
    const found = blobs.get(record.project_id)
    if (found) {
      found.push(recordToBlob(record))
    } else {
      blobs.set(record.project_id, [recordToBlob(record)])
    }
    nBlobs++
  }
  return { nBlobs, blobs }
}

/**
 * Add a blob's metadata to the blobs table after it has been uploaded.
 */
async function insertBlob(projectId, blob) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)

  await knex('project_blobs')
    .insert(blobToRecord(projectId, blob))
    .onConflict(['project_id', 'hash_bytes'])
    .ignore()
}

/**
 * Delete all blobs for a given project
 */
async function deleteBlobs(projectId) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)

  await knex('project_blobs').where('project_id', projectId).delete()
}

function blobToRecord(projectId, blob) {
  return {
    project_id: projectId,
    hash_bytes: hashToBuffer(blob.hash),
    byte_length: blob.getByteLength(),
    string_length: blob.getStringLength(),
  }
}

function recordToBlob(record) {
  if (!record) return
  return new Blob(
    hashFromBuffer(record.hash_bytes),
    record.byte_length,
    record.string_length
  )
}

function hashToBuffer(hash) {
  if (!hash) return
  return Buffer.from(hash, 'hex')
}

function hashFromBuffer(buffer) {
  if (!buffer) return
  return buffer.toString('hex')
}

module.exports = {
  initialize,
  findBlob,
  findBlobs,
  getProjectBlobs,
  getProjectBlobsBatch,
  insertBlob,
  deleteBlobs,
}
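// Assumed schema (illustrative, not part of this commit): the queries above
// expect a project_blobs table along these lines, created by a separate
// migration; exact column types may differ in the real schema.
//
//   CREATE TABLE project_blobs (
//     project_id    integer NOT NULL,
//     hash_bytes    bytea   NOT NULL,
//     byte_length   integer NOT NULL,
//     string_length integer,
//     PRIMARY KEY (project_id, hash_bytes)
//   );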