first commit

2025-04-24 13:11:28 +08:00
commit ff9c54d5e4
5960 changed files with 834111 additions and 0 deletions

View File

@@ -0,0 +1,25 @@
exports.BatchBlobStore = require('./lib/batch_blob_store')
exports.blobHash = require('./lib/blob_hash')
exports.HashCheckBlobStore = require('./lib/hash_check_blob_store')
exports.chunkBuffer = require('./lib/chunk_buffer')
exports.chunkStore = require('./lib/chunk_store')
exports.historyStore = require('./lib/history_store').historyStore
exports.knex = require('./lib/knex')
exports.mongodb = require('./lib/mongodb')
exports.redis = require('./lib/redis')
exports.persistChanges = require('./lib/persist_changes')
exports.persistor = require('./lib/persistor')
exports.ProjectArchive = require('./lib/project_archive')
exports.streams = require('./lib/streams')
exports.temp = require('./lib/temp')
exports.zipStore = require('./lib/zip_store')
const { BlobStore, loadGlobalBlobs } = require('./lib/blob_store')
exports.BlobStore = BlobStore
exports.loadGlobalBlobs = loadGlobalBlobs
const { InvalidChangeError } = require('./lib/errors')
exports.InvalidChangeError = InvalidChangeError
const { ChunkVersionConflictError } = require('./lib/chunk_store/errors')
exports.ChunkVersionConflictError = ChunkVersionConflictError

View File

@@ -0,0 +1,76 @@
'use strict'
const OError = require('@overleaf/o-error')
const check = require('check-types')
const { Blob } = require('overleaf-editor-core')
const assert = check.assert
const MONGO_ID_REGEXP = /^[0-9a-f]{24}$/
const POSTGRES_ID_REGEXP = /^[1-9][0-9]{0,9}$/
const MONGO_OR_POSTGRES_ID_REGEXP = /^([0-9a-f]{24}|[1-9][0-9]{0,9})$/
function transaction(transaction, message) {
assert.function(transaction, message)
}
function blobHash(arg, message) {
try {
assert.match(arg, Blob.HEX_HASH_RX, message)
} catch (error) {
throw OError.tag(error, message, { arg })
}
}
/**
* A project id is a string that contains either an integer (for projects stored in Postgres) or 24
* hex digits (for projects stored in Mongo)
*/
function projectId(arg, message) {
try {
assert.match(arg, MONGO_OR_POSTGRES_ID_REGEXP, message)
} catch (error) {
throw OError.tag(error, message, { arg })
}
}
/**
* A chunk id is a string that contains either an integer (for projects stored in Postgres) or 24
* hex digits (for projects stored in Mongo)
*/
function chunkId(arg, message) {
try {
assert.match(arg, MONGO_OR_POSTGRES_ID_REGEXP, message)
} catch (error) {
throw OError.tag(error, message, { arg })
}
}
function mongoId(arg, message) {
try {
assert.match(arg, MONGO_ID_REGEXP, message)
} catch (error) {
throw OError.tag(error, message, { arg })
}
}
function postgresId(arg, message) {
try {
assert.match(arg, POSTGRES_ID_REGEXP, message)
} catch (error) {
throw OError.tag(error, message, { arg })
}
}
module.exports = {
...assert,
transaction,
blobHash,
projectId,
chunkId,
mongoId,
postgresId,
MONGO_ID_REGEXP,
POSTGRES_ID_REGEXP,
}
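// --- Editorial example (not part of this commit) ---
// A minimal sketch of the id formats the helpers above accept; the id values
// are placeholders.
//
//   mongoId('507f1f77bcf86cd799439011', 'bad mongo id') // ok: 24 hex digits
//   postgresId('42', 'bad postgres id')                 // ok: positive integer
//   projectId('42', 'bad project id')                   // ok: either format is accepted
//   postgresId('0042', 'bad postgres id')               // throws: leading zero not allowed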

View File

@@ -0,0 +1,251 @@
// @ts-check
import { backupPersistor, projectBlobsBucket } from './backupPersistor.mjs'
import { GLOBAL_BLOBS, makeProjectKey, BlobStore } from './blob_store/index.js'
import Stream from 'node:stream'
import fs from 'node:fs'
import Crypto from 'node:crypto'
import assert from './assert.js'
import { backedUpBlobs, projects } from './mongodb.js'
import { Binary, ObjectId } from 'mongodb'
import logger from '@overleaf/logger/logging-manager.js'
import { AlreadyWrittenError } from '@overleaf/object-persistor/src/Errors.js'
import metrics from '@overleaf/metrics'
import zLib from 'node:zlib'
import Path from 'node:path'
const HIGHWATER_MARK = 1024 * 1024
/**
* @typedef {import("overleaf-editor-core").Blob} Blob
*/
/**
* @typedef {import("@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor").CachedPerProjectEncryptedS3Persistor} CachedPerProjectEncryptedS3Persistor
*/
/**
* Increment a metric to record the outcome of a backup operation.
*
* @param {"success"|"failure"|"skipped"} status
* @param {"global"|"already_backed_up"|"none"} reason
*/
function recordBackupConclusion(status, reason = 'none') {
metrics.inc('blob_backed_up', 1, { status, reason })
}
/**
* Downloads a blob to a specified directory
*
* @param {string} historyId - The history ID of the project the blob belongs to
* @param {Blob} blob - The blob to download
* @param {string} tmpDir - The directory path where the blob will be downloaded
* @returns {Promise<string>} The full path where the blob was downloaded
*/
export async function downloadBlobToDir(historyId, blob, tmpDir) {
const blobStore = new BlobStore(historyId)
const blobHash = blob.getHash()
const src = await blobStore.getStream(blobHash)
const filePath = Path.join(tmpDir, `${historyId}-${blobHash}`)
try {
const dst = fs.createWriteStream(filePath, {
highWaterMark: HIGHWATER_MARK,
flags: 'wx',
})
await Stream.promises.pipeline(src, dst)
return filePath
} catch (error) {
try {
await fs.promises.unlink(filePath)
} catch {}
throw error
}
}
/**
* Performs the actual upload of the blob to the backup storage.
*
* @param {string} historyId - The history ID of the project the blob belongs to
* @param {Blob} blob - The blob being uploaded
* @param {string} path - The path to the file to upload (should have been stored on disk already)
* @param {CachedPerProjectEncryptedS3Persistor} persistor - The persistor to use for the upload
* @return {Promise<void>}
*/
export async function uploadBlobToBackup(historyId, blob, path, persistor) {
const md5 = Crypto.createHash('md5')
const filePathCompressed = path + '.gz'
let backupSource
let contentEncoding
let size
try {
if (blob.getStringLength()) {
backupSource = filePathCompressed
contentEncoding = 'gzip'
size = 0
await Stream.promises.pipeline(
fs.createReadStream(path, { highWaterMark: HIGHWATER_MARK }),
zLib.createGzip(),
async function* (source) {
for await (const chunk of source) {
size += chunk.byteLength
md5.update(chunk)
yield chunk
}
},
fs.createWriteStream(filePathCompressed, {
highWaterMark: HIGHWATER_MARK,
})
)
} else {
backupSource = path
size = blob.getByteLength()
await Stream.promises.pipeline(
fs.createReadStream(path, { highWaterMark: HIGHWATER_MARK }),
md5
)
}
const key = makeProjectKey(historyId, blob.getHash())
await persistor.sendStream(
projectBlobsBucket,
key,
fs.createReadStream(backupSource, { highWaterMark: HIGHWATER_MARK }),
{
contentEncoding,
contentType: 'application/octet-stream',
contentLength: size,
sourceMd5: md5.digest('hex'),
ifNoneMatch: '*',
}
)
} finally {
if (backupSource === filePathCompressed) {
try {
await fs.promises.rm(filePathCompressed, { force: true })
} catch {}
}
}
}
/**
* Converts a legacy (postgres) historyId to a mongo projectId
*
* @param {string} historyId
* @return {Promise<string>}
* @private
*/
async function _convertLegacyHistoryIdToProjectId(historyId) {
const project = await projects.findOne(
{ 'overleaf.history.id': parseInt(historyId) },
{ projection: { _id: 1 } }
)
if (!project?._id) {
throw new Error('Did not find project for history id')
}
return project?._id?.toString()
}
/**
* Records that a blob was backed up for a project.
*
* @param {string} projectId - projectId for a project (mongo format)
* @param {string} hash
* @return {Promise<void>}
*/
export async function storeBlobBackup(projectId, hash) {
await backedUpBlobs.updateOne(
{ _id: new ObjectId(projectId) },
{ $addToSet: { blobs: new Binary(Buffer.from(hash, 'hex')) } },
{ upsert: true }
)
}
/**
* Determine whether a specific blob has been backed up in this project.
*
* @param {string} projectId
* @param {string} hash
* @return {Promise<*>}
* @private
*/
export async function _blobIsBackedUp(projectId, hash) {
const blobs = await backedUpBlobs.findOne(
{
_id: new ObjectId(projectId),
blobs: new Binary(Buffer.from(hash, 'hex')),
},
{ projection: { _id: 1 } }
)
return blobs?._id
}
/**
* Back up a blob to the global storage and record that it was backed up.
*
* @param {string} historyId - history ID for a project (can be postgres format or mongo format)
* @param {Blob} blob - The blob that is being backed up
* @param {string} tmpPath - The path to a temporary file storing the contents of the blob.
* @param {CachedPerProjectEncryptedS3Persistor} [persistor] - The persistor to use (optional)
* @return {Promise<void>}
*/
export async function backupBlob(historyId, blob, tmpPath, persistor) {
const hash = blob.getHash()
let projectId = historyId
if (assert.POSTGRES_ID_REGEXP.test(historyId)) {
projectId = await _convertLegacyHistoryIdToProjectId(historyId)
}
const globalBlob = GLOBAL_BLOBS.get(hash)
if (globalBlob && !globalBlob.demoted) {
recordBackupConclusion('skipped', 'global')
logger.debug({ projectId, hash }, 'Blob is global - skipping backup')
return
}
try {
if (await _blobIsBackedUp(projectId, hash)) {
recordBackupConclusion('skipped', 'already_backed_up')
logger.debug(
{ projectId, hash },
'Blob already backed up - skipping backup'
)
return
}
} catch (error) {
logger.warn({ error }, 'Failed to check if blob is backed up')
// We'll try anyway - we'll catch the error if it was backed up
}
// If we weren't passed a persistor for this project, create one.
// This will fetch the key from AWS, so it's preferable to use
// the same persistor for all blobs in a project where possible.
if (!persistor) {
logger.debug(
{ historyId, hash },
'warning: persistor not passed to backupBlob'
)
}
persistor ??= await backupPersistor.forProject(
projectBlobsBucket,
makeProjectKey(historyId, '')
)
try {
logger.debug({ projectId, hash }, 'Starting blob backup')
await uploadBlobToBackup(historyId, blob, tmpPath, persistor)
await storeBlobBackup(projectId, hash)
recordBackupConclusion('success')
} catch (error) {
if (error instanceof AlreadyWrittenError) {
logger.debug({ error, projectId, hash }, 'Blob already backed up')
// record that we backed it up already
await storeBlobBackup(projectId, hash)
recordBackupConclusion('failure', 'already_backed_up')
return
}
// eventually queue this for retry - for now this will be fixed by running the script
recordBackupConclusion('failure')
logger.warn({ error, projectId, hash }, 'Failed to upload blob to backup')
} finally {
logger.debug({ projectId, hash }, 'Ended blob backup')
}
}
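// --- Editorial example (not part of this commit) ---
// A sketch of backing up several blobs for one project while reusing a single
// per-project persistor, as the comment in backupBlob() recommends. The
// function name and the `blobs`/`tmpDir` arguments are illustrative.
export async function backupProjectBlobs(historyId, blobs, tmpDir) {
  const persistor = await backupPersistor.forProject(
    projectBlobsBucket,
    makeProjectKey(historyId, '')
  )
  for (const blob of blobs) {
    // download to a temporary file, upload it, then clean up
    const tmpPath = await downloadBlobToDir(historyId, blob, tmpDir)
    try {
      await backupBlob(historyId, blob, tmpPath, persistor)
    } finally {
      await fs.promises.rm(tmpPath, { force: true })
    }
  }
}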

View File

@@ -0,0 +1,93 @@
// @ts-check
import { callbackify } from 'util'
import { ObjectId } from 'mongodb'
import config from 'config'
import OError from '@overleaf/o-error'
import { db } from './mongodb.js'
import projectKey from './project_key.js'
import chunkStore from '../lib/chunk_store/index.js'
import {
backupPersistor,
chunksBucket,
projectBlobsBucket,
} from './backupPersistor.mjs'
const MS_PER_DAY = 24 * 60 * 60 * 1000
const EXPIRE_PROJECTS_AFTER_MS =
parseInt(config.get('minSoftDeletionPeriodDays'), 10) * MS_PER_DAY
const deletedProjectsCollection = db.collection('deletedProjects')
/**
* @param {string} historyId
* @return {Promise<boolean>}
*/
async function projectHasLatestChunk(historyId) {
const chunk = await chunkStore.getBackend(historyId).getLatestChunk(historyId)
return chunk != null
}
export class NotReadyToDelete extends OError {}
/**
* @param {string} projectId
* @return {Promise<void>}
*/
async function deleteProjectBackup(projectId) {
const deletedProject = await deletedProjectsCollection.findOne(
{ 'deleterData.deletedProjectId': new ObjectId(projectId) },
{
projection: {
'deleterData.deletedProjectOverleafHistoryId': 1,
'deleterData.deletedAt': 1,
},
}
)
if (!deletedProject) {
throw new NotReadyToDelete('refusing to delete non-deleted project')
}
const expiresAt =
deletedProject.deleterData.deletedAt.getTime() + EXPIRE_PROJECTS_AFTER_MS
if (expiresAt > Date.now()) {
throw new NotReadyToDelete('refusing to delete non-expired project')
}
const historyId =
deletedProject.deleterData.deletedProjectOverleafHistoryId?.toString()
if (!historyId) {
throw new NotReadyToDelete(
'refusing to delete project with unknown historyId'
)
}
if (await projectHasLatestChunk(historyId)) {
throw new NotReadyToDelete(
'refusing to delete project with remaining chunks'
)
}
const prefix = projectKey.format(historyId) + '/'
await backupPersistor.deleteDirectory(chunksBucket, prefix)
await backupPersistor.deleteDirectory(projectBlobsBucket, prefix)
}
export async function healthCheck() {
const HEALTH_CHECK_PROJECTS = JSON.parse(config.get('healthCheckProjects'))
if (HEALTH_CHECK_PROJECTS.length !== 2) {
throw new Error('expected 2 healthCheckProjects')
}
if (!HEALTH_CHECK_PROJECTS.some(id => id.length === 24)) {
throw new Error('expected mongo id in healthCheckProjects')
}
if (!HEALTH_CHECK_PROJECTS.some(id => id.length < 24)) {
throw new Error('expected postgres id in healthCheckProjects')
}
for (const historyId of HEALTH_CHECK_PROJECTS) {
if (!(await projectHasLatestChunk(historyId))) {
throw new Error(`project has no history: ${historyId}`)
}
}
}
export const healthCheckCb = callbackify(healthCheck)
export const deleteProjectBackupCb = callbackify(deleteProjectBackup)
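// --- Editorial note (not part of this commit) ---
// healthCheck() above expects the 'healthCheckProjects' config entry to be a
// JSON-encoded array of exactly two project ids: one 24-character mongo id and
// one shorter postgres id. A hypothetical value (ids are placeholders):
//
//   healthCheckProjects: '["507f1f77bcf86cd799439011", "42"]'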

View File

@@ -0,0 +1,152 @@
/**
* Provides a generator function to back up project chunks and blobs.
*/
import chunkStore from './chunk_store/index.js'
import {
GLOBAL_BLOBS, // NOTE: must call loadGlobalBlobs() before using this
BlobStore,
} from './blob_store/index.js'
import assert from './assert.js'
async function lookBehindForSeenBlobs(
projectId,
chunk,
lastBackedUpVersion,
seenBlobs
) {
if (chunk.startVersion === 0) {
return // this is the first chunk, no need to check for blobs in the previous chunk
}
if (chunk.startVersion > 0 && lastBackedUpVersion > chunk.startVersion) {
return // the snapshot in this chunk has already been backed up
}
if (
chunk.startVersion > 0 &&
lastBackedUpVersion === chunk.startVersion // same as previousChunk.endVersion
) {
// the snapshot in this chunk has not been backed up
// so we find the set of backed up blobs from the previous chunk
const previousChunk = await chunkStore.loadAtVersion(
projectId,
lastBackedUpVersion
)
const previousChunkHistory = previousChunk.getHistory()
previousChunkHistory.findBlobHashes(seenBlobs)
}
}
/**
* Records blob hashes that have been previously seen in a chunk's history.
*
* @param {Object} chunk - The chunk containing history data
* @param {number} currentBackedUpVersion - The version number that has been backed up
* @param {Set<string>} seenBlobs - Set to collect previously seen blob hashes
* @returns {void}
*/
function recordPreviouslySeenBlobs(chunk, currentBackedUpVersion, seenBlobs) {
// We need to look at the chunk and decide how far we have backed up.
// If we have not backed up this chunk at all, we need to backup the blobs
// in the snapshot. Otherwise we need to backup the blobs in the changes
// that have occurred since the last backup.
const history = chunk.getHistory()
const startVersion = chunk.getStartVersion()
if (currentBackedUpVersion === 0) {
// If we have only backed up version 0 (i.e. the first change)
// then that includes the initial snapshot, so we consider
// the blobs of the initial snapshot as seen. If the project
// has not been backed up at all then currentBackedUpVersion
// will be undefined.
history.snapshot.findBlobHashes(seenBlobs)
} else if (currentBackedUpVersion > startVersion) {
history.snapshot.findBlobHashes(seenBlobs)
for (let i = 0; i < currentBackedUpVersion - startVersion; i++) {
history.changes[i].findBlobHashes(seenBlobs)
}
}
}
/**
* Collects new blob objects that need to be backed up from a given chunk.
*
* @param {Object} chunk - The chunk object containing history data
* @param {Object} blobStore - Storage interface for retrieving blobs
* @param {Set<string>} seenBlobs - Set of blob hashes that have already been processed
* @returns {Promise<Object[]>} Array of blob objects that need to be backed up
* @throws {Error} If blob retrieval fails
*/
async function collectNewBlobsForBackup(chunk, blobStore, seenBlobs) {
/** @type {Set<string>} */
const blobHashes = new Set()
const history = chunk.getHistory()
// Get all the blobs in this chunk, then exclude the seenBlobs and global blobs
history.findBlobHashes(blobHashes)
const blobsToBackup = await blobStore.getBlobs(
[...blobHashes].filter(
hash =>
hash &&
!seenBlobs.has(hash) &&
(!GLOBAL_BLOBS.has(hash) || GLOBAL_BLOBS.get(hash).demoted)
)
)
return blobsToBackup
}
/**
* Asynchronously generates backups for a project based on provided versions.
* @param {string} projectId - The ID of the project's history to back up.
* @param {number} lastBackedUpVersion - The last version that was successfully backed up.
* @yields {AsyncGenerator<{ chunkRecord: object, chunkToBackup: object, chunkBuffer: Buffer, blobsToBackup: object[] }>}
* Yields chunk records and corresponding data needed for backups.
*/
export async function* backupGenerator(projectId, lastBackedUpVersion) {
assert.projectId(projectId, 'bad projectId')
assert.maybe.integer(lastBackedUpVersion, 'bad lastBackedUpVersion')
const blobStore = new BlobStore(projectId)
/** @type {Set<string>} */
const seenBlobs = new Set() // records the blobs that are already backed up
const firstPendingVersion =
lastBackedUpVersion >= 0 ? lastBackedUpVersion + 1 : 0
let isStartingChunk = true
let currentBackedUpVersion = lastBackedUpVersion
const chunkRecordIterator = chunkStore.getProjectChunksFromVersion(
projectId,
firstPendingVersion
)
for await (const chunkRecord of chunkRecordIterator) {
const { chunk, chunkBuffer } = await chunkStore.loadByChunkRecord(
projectId,
chunkRecord
)
if (isStartingChunk) {
await lookBehindForSeenBlobs(
projectId,
chunkRecord,
lastBackedUpVersion,
seenBlobs
)
isStartingChunk = false
}
recordPreviouslySeenBlobs(chunk, currentBackedUpVersion, seenBlobs)
const blobsToBackup = await collectNewBlobsForBackup(
chunk,
blobStore,
seenBlobs
)
yield { chunkRecord, chunkToBackup: chunk, chunkBuffer, blobsToBackup }
// After we generate a backup of this chunk, mark the backed up blobs as seen
blobsToBackup.forEach(blob => seenBlobs.add(blob.getHash()))
currentBackedUpVersion = chunkRecord.endVersion
}
}
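// --- Editorial example (not part of this commit) ---
// A sketch of a consumer driving backupGenerator(); loadGlobalBlobs() must
// have been called beforehand (see the GLOBAL_BLOBS import note above). The
// `helpers.backupBlobs` and `helpers.backupChunk` callbacks are hypothetical
// stand-ins for the actual upload steps.
export async function backupProject(projectId, lastBackedUpVersion, helpers) {
  for await (const {
    chunkRecord,
    chunkToBackup,
    chunkBuffer,
    blobsToBackup,
  } of backupGenerator(projectId, lastBackedUpVersion)) {
    // back up blobs before the chunk that references them
    await helpers.backupBlobs(projectId, blobsToBackup)
    await helpers.backupChunk(projectId, chunkRecord, chunkToBackup, chunkBuffer)
  }
}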

View File

@@ -0,0 +1,121 @@
// @ts-check
import fs from 'node:fs'
import Path from 'node:path'
import _ from 'lodash'
import config from 'config'
import { SecretManagerServiceClient } from '@google-cloud/secret-manager'
import OError from '@overleaf/o-error'
import {
PerProjectEncryptedS3Persistor,
RootKeyEncryptionKey,
} from '@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor.js'
import { HistoryStore } from './history_store.js'
const persistorConfig = _.cloneDeep(config.get('backupPersistor'))
const { chunksBucket, deksBucket, globalBlobsBucket, projectBlobsBucket } =
config.get('backupStore')
export { chunksBucket, globalBlobsBucket, projectBlobsBucket }
function convertKey(key, convertFn) {
if (_.has(persistorConfig, key)) {
_.update(persistorConfig, key, convertFn)
}
}
convertKey('s3SSEC.httpOptions.timeout', s => parseInt(s, 10))
convertKey('s3SSEC.maxRetries', s => parseInt(s, 10))
convertKey('s3SSEC.pathStyle', s => s === 'true')
// array of CA, either inlined or on disk
convertKey('s3SSEC.ca', s =>
JSON.parse(s).map(ca => (ca.startsWith('/') ? fs.readFileSync(ca) : ca))
)
/** @type {() => Promise<string>} */
let getRawRootKeyEncryptionKeys
if ((process.env.NODE_ENV || 'production') === 'production') {
;[persistorConfig.s3SSEC.key, persistorConfig.s3SSEC.secret] = (
await loadFromSecretsManager(
process.env.BACKUP_AWS_CREDENTIALS || '',
'BACKUP_AWS_CREDENTIALS'
)
).split(':')
getRawRootKeyEncryptionKeys = () =>
loadFromSecretsManager(
persistorConfig.keyEncryptionKeys,
'BACKUP_KEY_ENCRYPTION_KEYS'
)
} else {
getRawRootKeyEncryptionKeys = () => persistorConfig.keyEncryptionKeys
}
export const DELETION_ONLY = persistorConfig.keyEncryptionKeys === 'none'
if (DELETION_ONLY) {
// For the backup deleter: it should not encrypt or read data, and deleting does not need a key.
getRawRootKeyEncryptionKeys = () => new Promise(_resolve => {})
}
const PROJECT_FOLDER_REGEX =
/^\d{3}\/\d{3}\/\d{3,}\/$|^[0-9a-f]{3}\/[0-9a-f]{3}\/[0-9a-f]{18}\/$/
/**
* @param {string} bucketName
* @param {string} path
* @return {string}
*/
export function pathToProjectFolder(bucketName, path) {
switch (bucketName) {
case deksBucket:
case chunksBucket:
case projectBlobsBucket:
const projectFolder = Path.join(...path.split('/').slice(0, 3)) + '/'
if (!PROJECT_FOLDER_REGEX.test(projectFolder)) {
throw new OError('invalid project folder', { bucketName, path })
}
return projectFolder
default:
throw new Error(`${bucketName} does not store per-project files`)
}
}
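// Editorial example (not part of this commit): pathToProjectFolder() keeps only
// the first three path segments, reducing a chunk or blob key to its
// per-project prefix. Hypothetical keys:
//
//   pathToProjectFolder(chunksBucket, '123/456/789/000000012')
//     // => '123/456/789/'
//   pathToProjectFolder(projectBlobsBucket, 'abc/def/0123456789abcdef01/ab/cd...')
//     // => 'abc/def/0123456789abcdef01/'
//
// Keys that do not start with a recognisable project folder throw an OError.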
/**
* @param {string} name
* @param {string} label
* @return {Promise<string>}
*/
async function loadFromSecretsManager(name, label) {
const client = new SecretManagerServiceClient()
const [version] = await client.accessSecretVersion({ name })
if (!version.payload?.data) throw new Error(`empty secret: ${label}`)
return version.payload.data.toString()
}
async function getRootKeyEncryptionKeys() {
return JSON.parse(await getRawRootKeyEncryptionKeys()).map(
({ key, salt }) => {
return new RootKeyEncryptionKey(
Buffer.from(key, 'base64'),
Buffer.from(salt, 'base64')
)
}
)
}
export const backupPersistor = new PerProjectEncryptedS3Persistor({
...persistorConfig.s3SSEC,
disableMultiPartUpload: true,
dataEncryptionKeyBucketName: deksBucket,
pathToProjectFolder,
getRootKeyEncryptionKeys,
storageClass: {
[deksBucket]: 'STANDARD',
[chunksBucket]: persistorConfig.tieringStorageClass,
[projectBlobsBucket]: persistorConfig.tieringStorageClass,
},
})
export const backupHistoryStore = new HistoryStore(
backupPersistor,
chunksBucket
)

View File

@@ -0,0 +1,216 @@
// @ts-check
import OError from '@overleaf/o-error'
import chunkStore from '../lib/chunk_store/index.js'
import {
backupPersistor,
chunksBucket,
projectBlobsBucket,
} from './backupPersistor.mjs'
import { Blob, Chunk, History } from 'overleaf-editor-core'
import { BlobStore, GLOBAL_BLOBS, makeProjectKey } from './blob_store/index.js'
import blobHash from './blob_hash.js'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import logger from '@overleaf/logger'
import path from 'node:path'
import projectKey from './project_key.js'
import streams from './streams.js'
import objectPersistor from '@overleaf/object-persistor'
import { getEndDateForRPO } from '../../backupVerifier/utils.mjs'
/**
* @typedef {import("@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor.js").CachedPerProjectEncryptedS3Persistor} CachedPerProjectEncryptedS3Persistor
*/
/**
* @param {string} historyId
* @param {string} hash
*/
export async function verifyBlob(historyId, hash) {
return await verifyBlobs(historyId, [hash])
}
/**
*
* @param {string} historyId
* @return {Promise<CachedPerProjectEncryptedS3Persistor>}
*/
async function getProjectPersistor(historyId) {
try {
return await backupPersistor.forProjectRO(
projectBlobsBucket,
makeProjectKey(historyId, '')
)
} catch (err) {
if (err instanceof NotFoundError) {
throw new BackupCorruptedError('dek does not exist', {}, err)
}
throw err
}
}
/**
* @param {string} historyId
* @param {Array<string>} hashes
* @param {CachedPerProjectEncryptedS3Persistor} [projectCache]
*/
export async function verifyBlobs(historyId, hashes, projectCache) {
if (hashes.length === 0) throw new Error('bug: empty hashes')
if (!projectCache) {
projectCache = await getProjectPersistor(historyId)
}
const blobStore = new BlobStore(historyId)
for (const hash of hashes) {
const path = makeProjectKey(historyId, hash)
const blob = await blobStore.getBlob(hash)
if (!blob) throw new Blob.NotFoundError(hash)
let stream
try {
stream = await projectCache.getObjectStream(projectBlobsBucket, path, {
autoGunzip: true,
})
} catch (err) {
if (err instanceof NotFoundError) {
throw new BackupCorruptedMissingBlobError('missing blob', {
path,
hash,
})
}
throw err
}
const backupHash = await blobHash.fromStream(blob.getByteLength(), stream)
if (backupHash !== hash) {
throw new BackupCorruptedInvalidBlobError(
'hash mismatch for backed up blob',
{
path,
hash,
backupHash,
}
)
}
}
}
/**
* @param {string} historyId
* @param {Date} [endTimestamp]
*/
export async function verifyProjectWithErrorContext(
historyId,
endTimestamp = getEndDateForRPO()
) {
try {
await verifyProject(historyId, endTimestamp)
} catch (err) {
// @ts-ignore err is Error instance
throw OError.tag(err, 'verifyProject', { historyId, endTimestamp })
}
}
/**
*
* @param {string} historyId
* @param {number} startVersion
* @param {CachedPerProjectEncryptedS3Persistor} backupPersistorForProject
* @return {Promise<any>}
*/
async function loadChunk(historyId, startVersion, backupPersistorForProject) {
const key = path.join(
projectKey.format(historyId),
projectKey.pad(startVersion)
)
try {
const buf = await streams.gunzipStreamToBuffer(
await backupPersistorForProject.getObjectStream(chunksBucket, key)
)
return JSON.parse(buf.toString('utf-8'))
} catch (err) {
if (err instanceof objectPersistor.Errors.NotFoundError) {
throw new Chunk.NotPersistedError(historyId)
}
if (err instanceof Error) {
throw OError.tag(err, 'Failed to load chunk', { historyId, startVersion })
}
throw err
}
}
/**
* @param {string} historyId
* @param {Date} endTimestamp
*/
export async function verifyProject(historyId, endTimestamp) {
const backend = chunkStore.getBackend(historyId)
const [first, last] = await Promise.all([
backend.getFirstChunkBeforeTimestamp(historyId, endTimestamp),
backend.getLastActiveChunkBeforeTimestamp(historyId, endTimestamp),
])
// loadChunk() below needs each chunk's startVersion to build the backup key
const chunksRecordsToVerify = [
{
chunkId: first.id,
chunkLabel: 'first',
startVersion: first.startVersion,
},
]
if (first.startVersion !== last.startVersion) {
chunksRecordsToVerify.push({
chunkId: last.id,
chunkLabel: 'last before RPO',
startVersion: last.startVersion,
})
}
const projectCache = await getProjectPersistor(historyId)
const chunks = await Promise.all(
chunksRecordsToVerify.map(async chunk => {
try {
return History.fromRaw(
await loadChunk(historyId, chunk.startVersion, projectCache)
)
} catch (err) {
if (err instanceof Chunk.NotPersistedError) {
throw new BackupRPOViolationChunkNotBackedUpError(
'Backup RPO violation: chunk not backed up',
chunk
)
}
throw err
}
})
)
const seenBlobs = new Set()
const blobsToVerify = []
for (const chunk of chunks) {
/** @type {Set<string>} */
const chunkBlobs = new Set()
chunk.findBlobHashes(chunkBlobs)
let hasAddedBlobFromThisChunk = false
for (const blobHash of chunkBlobs) {
if (seenBlobs.has(blobHash)) continue // old blob
if (GLOBAL_BLOBS.has(blobHash)) continue // global blob
seenBlobs.add(blobHash)
if (!hasAddedBlobFromThisChunk) {
blobsToVerify.push(blobHash)
hasAddedBlobFromThisChunk = true
}
}
}
if (blobsToVerify.length === 0) {
logger.debug(
{
historyId,
chunksRecordsToVerify: chunksRecordsToVerify.map(c => c.chunkId),
},
'chunks contain no blobs to verify'
)
return
}
await verifyBlobs(historyId, blobsToVerify, projectCache)
}
export class BackupCorruptedError extends OError {}
export class BackupRPOViolationError extends OError {}
export class BackupCorruptedMissingBlobError extends BackupCorruptedError {}
export class BackupCorruptedInvalidBlobError extends BackupCorruptedError {}
export class BackupRPOViolationChunkNotBackedUpError extends OError {}
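// --- Editorial example (not part of this commit) ---
// A sketch of how a verification job might map the error classes above to an
// outcome; the function name and return shape are illustrative.
export async function verifyOrReport(historyId) {
  try {
    await verifyProjectWithErrorContext(historyId)
    return { status: 'ok' }
  } catch (err) {
    if (err instanceof BackupRPOViolationChunkNotBackedUpError) {
      return { status: 'rpo-violation', error: err }
    }
    if (err instanceof BackupCorruptedError) {
      return { status: 'corrupted', error: err }
    }
    throw err // unexpected error: let the caller handle it
  }
}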

View File

@@ -0,0 +1,212 @@
const { Binary, ObjectId } = require('mongodb')
const { projects, backedUpBlobs } = require('../mongodb')
const OError = require('@overleaf/o-error')
// List projects with pending backups older than the specified interval
function listPendingBackups(timeIntervalMs = 0, limit = null) {
const cutoffTime = new Date(Date.now() - timeIntervalMs)
const options = {
projection: { 'overleaf.backup.pendingChangeAt': 1 },
sort: { 'overleaf.backup.pendingChangeAt': 1 },
}
// Apply limit if provided
if (limit) {
options.limit = limit
}
const cursor = projects.find(
{
'overleaf.backup.pendingChangeAt': {
$exists: true,
$lt: cutoffTime,
},
},
options
)
return cursor
}
// List projects that have never been backed up and are older than the specified interval
function listUninitializedBackups(timeIntervalMs = 0, limit = null) {
const cutoffTimeInSeconds = (Date.now() - timeIntervalMs) / 1000
const options = {
projection: { _id: 1 },
sort: { _id: 1 },
}
// Apply limit if provided
if (limit) {
options.limit = limit
}
const cursor = projects.find(
{
'overleaf.backup.lastBackedUpVersion': null,
_id: {
$lt: ObjectId.createFromTime(cutoffTimeInSeconds),
},
},
options
)
return cursor
}
// Retrieve the history ID for a given project without giving direct access to the
// projects collection.
async function getHistoryId(projectId) {
const project = await projects.findOne(
{ _id: new ObjectId(projectId) },
{
projection: {
'overleaf.history.id': 1,
},
}
)
if (!project) {
throw new Error('Project not found')
}
return project.overleaf.history.id
}
async function getBackupStatus(projectId) {
const project = await projects.findOne(
{ _id: new ObjectId(projectId) },
{
projection: {
'overleaf.history': 1,
'overleaf.backup': 1,
},
}
)
if (!project) {
throw new Error('Project not found')
}
return {
backupStatus: project.overleaf.backup,
historyId: `${project.overleaf.history.id}`,
currentEndVersion: project.overleaf.history.currentEndVersion,
currentEndTimestamp: project.overleaf.history.currentEndTimestamp,
}
}
async function setBackupVersion(
projectId,
previousBackedUpVersion,
currentBackedUpVersion,
currentBackedUpAt
) {
// FIXME: include a check to handle race conditions
// to make sure only one process updates the version numbers
const result = await projects.updateOne(
{
_id: new ObjectId(projectId),
'overleaf.backup.lastBackedUpVersion': previousBackedUpVersion,
},
{
$set: {
'overleaf.backup.lastBackedUpVersion': currentBackedUpVersion,
'overleaf.backup.lastBackedUpAt': currentBackedUpAt,
},
}
)
if (result.matchedCount === 0 || result.modifiedCount === 0) {
throw new OError('Failed to update backup version', {
previousBackedUpVersion,
currentBackedUpVersion,
currentBackedUpAt,
result,
})
}
}
async function updateCurrentMetadataIfNotSet(projectId, latestChunkMetadata) {
await projects.updateOne(
{
_id: new ObjectId(projectId),
'overleaf.history.currentEndVersion': { $exists: false },
'overleaf.history.currentEndTimestamp': { $exists: false },
},
{
$set: {
'overleaf.history.currentEndVersion': latestChunkMetadata.endVersion,
'overleaf.history.currentEndTimestamp':
latestChunkMetadata.endTimestamp,
},
}
)
}
/**
* Updates the pending change timestamp for a project's backup status
* @param {string} projectId - The ID of the project to update
* @param {Date} backupStartTime - The timestamp to set for pending changes
* @returns {Promise<void>}
*
* If the project's last backed up version matches the current end version,
* the pending change timestamp is removed. Otherwise, it's set to the provided
* backup start time.
*/
async function updatePendingChangeTimestamp(projectId, backupStartTime) {
await projects.updateOne({ _id: new ObjectId(projectId) }, [
{
$set: {
'overleaf.backup.pendingChangeAt': {
$cond: {
if: {
$eq: [
'$overleaf.backup.lastBackedUpVersion',
'$overleaf.history.currentEndVersion',
],
},
then: '$$REMOVE',
else: backupStartTime,
},
},
},
},
])
}
async function getBackedUpBlobHashes(projectId) {
const result = await backedUpBlobs.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { blobs: 1 } }
)
if (!result) {
return new Set()
}
const hashes = result.blobs.map(b => b.buffer.toString('hex'))
return new Set(hashes)
}
async function unsetBackedUpBlobHashes(projectId, hashes) {
const binaryHashes = hashes.map(h => new Binary(Buffer.from(h, 'hex')))
const result = await backedUpBlobs.findOneAndUpdate(
{ _id: new ObjectId(projectId) },
{
$pullAll: {
blobs: binaryHashes,
},
},
{ returnDocument: 'after' }
)
if (result && result.blobs.length === 0) {
await backedUpBlobs.deleteOne({
_id: new ObjectId(projectId),
blobs: { $size: 0 },
})
}
return result
}
module.exports = {
getHistoryId,
getBackupStatus,
setBackupVersion,
updateCurrentMetadataIfNotSet,
updatePendingChangeTimestamp,
listPendingBackups,
listUninitializedBackups,
getBackedUpBlobHashes,
unsetBackedUpBlobHashes,
}
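// --- Editorial example (not part of this commit) ---
// A sketch of a scheduler draining the pending-backup cursor; the one-hour
// age threshold, the batch limit of 100 and the `runBackup` callback are
// hypothetical.
async function processPendingBackups(runBackup) {
  const cursor = listPendingBackups(60 * 60 * 1000, 100)
  for await (const project of cursor) {
    const projectId = project._id.toString()
    const { historyId, backupStatus } = await getBackupStatus(projectId)
    await runBackup(projectId, historyId, backupStatus)
  }
}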

View File

@@ -0,0 +1,40 @@
'use strict'
const BPromise = require('bluebird')
/**
* @constructor
* @param {BlobStore} blobStore
* @classdesc
* Wrapper for BlobStore that pre-fetches blob metadata to avoid making one
* database call per blob lookup.
*/
function BatchBlobStore(blobStore) {
this.blobStore = blobStore
this.blobs = new Map()
}
/**
* Pre-fetch metadata for the given blob hashes.
*
* @param {Array.<string>} hashes
* @return {Promise}
*/
BatchBlobStore.prototype.preload = function batchBlobStorePreload(hashes) {
return BPromise.each(this.blobStore.getBlobs(hashes), blob => {
this.blobs.set(blob.getHash(), blob)
})
}
/**
* @see BlobStore#getBlob
*/
BatchBlobStore.prototype.getBlob = BPromise.method(
function batchBlobStoreGetBlob(hash) {
const blob = this.blobs.get(hash)
if (blob) return blob
return this.blobStore.getBlob(hash)
}
)
module.exports = BatchBlobStore
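// --- Editorial example (not part of this commit) ---
// A sketch of preloading a set of hashes once and then resolving individual
// blobs from the in-memory cache; `blobStore` is a BlobStore instance and
// `hashes` an array of hex hashes supplied by the caller.
async function describeBlobs(blobStore, hashes) {
  const batch = new BatchBlobStore(blobStore)
  await batch.preload(hashes) // one database round trip for all hashes
  for (const hash of hashes) {
    const blob = await batch.getBlob(hash) // served from the preloaded cache
    if (blob) console.log(hash, blob.getByteLength())
  }
}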

View File

@@ -0,0 +1,80 @@
/** @module */
'use strict'
const BPromise = require('bluebird')
const fs = BPromise.promisifyAll(require('node:fs'))
const crypto = require('node:crypto')
const { pipeline } = require('node:stream')
const assert = require('./assert')
function getGitBlobHeader(byteLength) {
return 'blob ' + byteLength + '\x00'
}
function getBlobHash(byteLength) {
const hash = crypto.createHash('sha1')
hash.setEncoding('hex')
hash.update(getGitBlobHeader(byteLength))
return hash
}
/**
* Compute the git blob hash for a blob from a readable stream of its content.
*
* @function
* @param {number} byteLength
* @param {stream.Readable} stream
* @return {Promise.<string>} hexadecimal SHA-1 hash
*/
exports.fromStream = BPromise.method(
function blobHashFromStream(byteLength, stream) {
assert.integer(byteLength, 'blobHash: bad byteLength')
assert.object(stream, 'blobHash: bad stream')
const hash = getBlobHash(byteLength)
return new BPromise(function (resolve, reject) {
pipeline(stream, hash, function (err) {
if (err) {
reject(err)
} else {
hash.end()
resolve(hash.read())
}
})
})
}
)
/**
* Compute the git blob hash for a blob with the given string content.
*
* @param {string} string
* @return {string} hexadecimal SHA-1 hash
*/
exports.fromString = function blobHashFromString(string) {
assert.string(string, 'blobHash: bad string')
const hash = getBlobHash(Buffer.byteLength(string))
hash.update(string, 'utf8')
hash.end()
return hash.read()
}
/**
* Compute the git blob hash for the content of a file
*
* @param {string} pathname
* @return {Promise.<string>} hexadecimal SHA-1 hash
*/
exports.fromFile = function blobHashFromFile(pathname) {
assert.string(pathname, 'blobHash: bad pathname')
function getByteLengthOfFile() {
return fs.statAsync(pathname).then(stat => stat.size)
}
const fromStream = this.fromStream
return getByteLengthOfFile(pathname).then(function (byteLength) {
const stream = fs.createReadStream(pathname)
return fromStream(byteLength, stream)
})
}
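// --- Editorial example (not part of this commit) ---
// A sketch showing that the helpers above agree with each other: hashing a
// string directly and hashing a temporary file with the same content should
// produce the same git blob hash. The temp-file path is illustrative.
const os = require('node:os')
const Path = require('node:path')

async function demoBlobHash() {
  const content = 'hello\n'
  const hashFromString = exports.fromString(content)
  const tmpFile = Path.join(os.tmpdir(), 'blob-hash-demo.txt')
  await fs.writeFileAsync(tmpFile, content)
  const hashFromFile = await exports.fromFile(tmpFile)
  console.log(hashFromString === hashFromFile) // true
}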

View File

@@ -0,0 +1,433 @@
'use strict'
const config = require('config')
const fs = require('node:fs')
const isValidUtf8 = require('utf-8-validate')
const { ReadableString } = require('@overleaf/stream-utils')
const core = require('overleaf-editor-core')
const objectPersistor = require('@overleaf/object-persistor')
const OError = require('@overleaf/o-error')
const Blob = core.Blob
const TextOperation = core.TextOperation
const containsNonBmpChars = core.util.containsNonBmpChars
const assert = require('../assert')
const blobHash = require('../blob_hash')
const mongodb = require('../mongodb')
const persistor = require('../persistor')
const projectKey = require('../project_key')
const streams = require('../streams')
const postgresBackend = require('./postgres')
const mongoBackend = require('./mongo')
const logger = require('@overleaf/logger')
/** @import { Readable } from 'stream' */
const GLOBAL_BLOBS = new Map()
function makeGlobalKey(hash) {
return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
}
function makeProjectKey(projectId, hash) {
return `${projectKey.format(projectId)}/${hash.slice(0, 2)}/${hash.slice(2)}`
}
async function uploadBlob(projectId, blob, stream, opts = {}) {
const bucket = config.get('blobStore.projectBucket')
const key = makeProjectKey(projectId, blob.getHash())
logger.debug({ projectId, blob }, 'uploadBlob started')
try {
await persistor.sendStream(bucket, key, stream, {
contentType: 'application/octet-stream',
...opts,
})
} finally {
logger.debug({ projectId, blob }, 'uploadBlob finished')
}
}
function getBlobLocation(projectId, hash) {
if (GLOBAL_BLOBS.has(hash)) {
return {
bucket: config.get('blobStore.globalBucket'),
key: makeGlobalKey(hash),
}
} else {
return {
bucket: config.get('blobStore.projectBucket'),
key: makeProjectKey(projectId, hash),
}
}
}
/**
* Returns the appropriate backend for the given project id
*
* Numeric ids use the Postgres backend.
* Strings of 24 characters use the Mongo backend.
*/
function getBackend(projectId) {
if (assert.POSTGRES_ID_REGEXP.test(projectId)) {
return postgresBackend
} else if (assert.MONGO_ID_REGEXP.test(projectId)) {
return mongoBackend
} else {
throw new OError('bad project id', { projectId })
}
}
async function makeBlobForFile(pathname) {
const { size: byteLength } = await fs.promises.stat(pathname)
const hash = await blobHash.fromStream(
byteLength,
fs.createReadStream(pathname)
)
return new Blob(hash, byteLength)
}
async function getStringLengthOfFile(byteLength, pathname) {
// We have to read the file into memory to get its UTF-8 length, so don't
// bother for files that are too large for us to edit anyway.
if (byteLength > Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
return null
}
// We need to check if the file contains nonBmp or null characters
let data = await fs.promises.readFile(pathname)
if (!isValidUtf8(data)) return null
data = data.toString()
if (data.length > TextOperation.MAX_STRING_LENGTH) return null
if (containsNonBmpChars(data)) return null
if (data.indexOf('\x00') !== -1) return null
return data.length
}
async function deleteBlobsInBucket(projectId) {
const bucket = config.get('blobStore.projectBucket')
const prefix = `${projectKey.format(projectId)}/`
logger.debug({ projectId }, 'deleteBlobsInBucket started')
try {
await persistor.deleteDirectory(bucket, prefix)
} finally {
logger.debug({ projectId }, 'deleteBlobsInBucket finished')
}
}
async function loadGlobalBlobs() {
const blobs = await mongodb.globalBlobs.find()
for await (const blob of blobs) {
GLOBAL_BLOBS.set(blob._id, {
blob: new Blob(blob._id, blob.byteLength, blob.stringLength),
demoted: Boolean(blob.demoted),
})
}
}
/**
* Return metadata for all blobs in the given projects
* @param {Array<string|number>} projectIds
* @return {Promise<{nBlobs:number, blobs:Map<string,Array<core.Blob>>}>}
*/
async function getProjectBlobsBatch(projectIds) {
const mongoProjects = []
const postgresProjects = []
for (const projectId of projectIds) {
if (typeof projectId === 'number') {
postgresProjects.push(projectId)
} else {
mongoProjects.push(projectId)
}
}
const [
{ nBlobs: nBlobsPostgres, blobs: blobsPostgres },
{ nBlobs: nBlobsMongo, blobs: blobsMongo },
] = await Promise.all([
postgresBackend.getProjectBlobsBatch(postgresProjects),
mongoBackend.getProjectBlobsBatch(mongoProjects),
])
for (const [id, blobs] of blobsPostgres.entries()) {
blobsMongo.set(id.toString(), blobs)
}
return { nBlobs: nBlobsPostgres + nBlobsMongo, blobs: blobsMongo }
}
/**
* @classdesc
* Fetch and store the content of files using content-addressable hashing. The
* blob store manages both content and metadata (byte and UTF-8 length) for
* blobs.
*/
class BlobStore {
/**
* @constructor
* @param {string} projectId the project for which we'd like to find blobs
*/
constructor(projectId) {
assert.projectId(projectId)
this.projectId = projectId
this.backend = getBackend(this.projectId)
}
/**
* Set up the initial data structure for a given project
*/
async initialize() {
await this.backend.initialize(this.projectId)
}
/**
* Write a blob, if one does not already exist, with the given UTF-8 encoded
* string content.
*
* @param {string} string
* @return {Promise.<core.Blob>}
*/
async putString(string) {
assert.string(string, 'bad string')
const hash = blobHash.fromString(string)
const existingBlob = await this._findBlobBeforeInsert(hash)
if (existingBlob != null) {
return existingBlob
}
const newBlob = new Blob(hash, Buffer.byteLength(string), string.length)
// Note: the ReadableString is to work around a bug in the AWS SDK: it won't
// allow Body to be blank.
await uploadBlob(this.projectId, newBlob, new ReadableString(string))
await this.backend.insertBlob(this.projectId, newBlob)
return newBlob
}
/**
* Write a blob, if one does not already exist, with the given file (usually a
* temporary file).
*
* @param {string} pathname
* @return {Promise<core.Blob>}
*/
async putFile(pathname) {
assert.string(pathname, 'bad pathname')
const newBlob = await makeBlobForFile(pathname)
const existingBlob = await this._findBlobBeforeInsert(newBlob.getHash())
if (existingBlob != null) {
return existingBlob
}
const stringLength = await getStringLengthOfFile(
newBlob.getByteLength(),
pathname
)
newBlob.setStringLength(stringLength)
await this.putBlob(pathname, newBlob)
return newBlob
}
/**
* Write a new blob, the stringLength must have been added already. It should
* have been checked that the blob does not exist yet. Consider using
* {@link putFile} instead of this lower-level method.
*
* @param {string} pathname
* @param {core.Blob} finalizedBlob
* @return {Promise<void>}
*/
async putBlob(pathname, finalizedBlob) {
await uploadBlob(
this.projectId,
finalizedBlob,
fs.createReadStream(pathname)
)
await this.backend.insertBlob(this.projectId, finalizedBlob)
}
/**
* Stores an object as a JSON string in a blob.
*
* @param {object} obj
* @returns {Promise.<core.Blob>}
*/
async putObject(obj) {
assert.object(obj, 'bad object')
const string = JSON.stringify(obj)
return await this.putString(string)
}
/**
*
* Fetch a blob's content by its hash as a UTF-8 encoded string.
*
* @param {string} hash hexadecimal SHA-1 hash
* @return {Promise.<string>} promise for the content of the file
*/
async getString(hash) {
assert.blobHash(hash, 'bad hash')
const projectId = this.projectId
logger.debug({ projectId, hash }, 'getString started')
try {
const stream = await this.getStream(hash)
const buffer = await streams.readStreamToBuffer(stream)
return buffer.toString()
} finally {
logger.debug({ projectId, hash }, 'getString finished')
}
}
/**
* Fetch a JSON encoded blob by its hash and deserialize it.
*
* @template [T=unknown]
* @param {string} hash hexadecimal SHA-1 hash
* @return {Promise.<T>} promise for the content of the file
*/
async getObject(hash) {
assert.blobHash(hash, 'bad hash')
const projectId = this.projectId
logger.debug({ projectId, hash }, 'getObject started')
try {
const jsonString = await this.getString(hash)
const object = JSON.parse(jsonString)
return object
} catch (error) {
// Maybe this blob is gzipped. Try to gunzip it.
// TODO: Remove once we've ensured this is not reached
const stream = await this.getStream(hash)
const buffer = await streams.gunzipStreamToBuffer(stream)
const object = JSON.parse(buffer.toString())
logger.warn('getObject: Gzipped object in BlobStore')
return object
} finally {
logger.debug({ projectId, hash }, 'getObject finished')
}
}
/**
* Fetch a blob by its hash as a stream.
*
* Note that, according to the AWS SDK docs, this does not retry after initial
* failure, so the caller must be prepared to retry on errors, if appropriate.
*
* @param {string} hash hexadecimal SHA-1 hash
* @param {Object} opts
* @return {Promise.<Readable>} a stream to read the file
*/
async getStream(hash, opts = {}) {
assert.blobHash(hash, 'bad hash')
const { bucket, key } = getBlobLocation(this.projectId, hash)
try {
const stream = await persistor.getObjectStream(bucket, key, opts)
return stream
} catch (err) {
if (err instanceof objectPersistor.Errors.NotFoundError) {
throw new Blob.NotFoundError(hash)
}
throw err
}
}
/**
* Read a blob metadata record by hexadecimal hash.
*
* @param {string} hash hexadecimal SHA-1 hash
* @return {Promise<core.Blob | null>}
*/
async getBlob(hash) {
assert.blobHash(hash, 'bad hash')
const globalBlob = GLOBAL_BLOBS.get(hash)
if (globalBlob != null) {
return globalBlob.blob
}
const blob = await this.backend.findBlob(this.projectId, hash)
return blob
}
async getBlobs(hashes) {
assert.array(hashes, 'bad hashes')
const nonGlobalHashes = []
const blobs = []
for (const hash of hashes) {
const globalBlob = GLOBAL_BLOBS.get(hash)
if (globalBlob != null) {
blobs.push(globalBlob.blob)
} else {
nonGlobalHashes.push(hash)
}
}
if (nonGlobalHashes.length === 0) {
return blobs // to avoid unnecessary database lookup
}
const projectBlobs = await this.backend.findBlobs(
this.projectId,
nonGlobalHashes
)
blobs.push(...projectBlobs)
return blobs
}
/**
* Retrieve all blobs associated with the project.
* @returns {Promise<core.Blob[]>} A promise that resolves to an array of blobs.
*/
async getProjectBlobs() {
const projectBlobs = await this.backend.getProjectBlobs(this.projectId)
return projectBlobs
}
/**
* Delete all blobs that belong to the project.
*/
async deleteBlobs() {
await Promise.all([
this.backend.deleteBlobs(this.projectId),
deleteBlobsInBucket(this.projectId),
])
}
async _findBlobBeforeInsert(hash) {
const globalBlob = GLOBAL_BLOBS.get(hash)
if (globalBlob != null && !globalBlob.demoted) {
return globalBlob.blob
}
const blob = await this.backend.findBlob(this.projectId, hash)
return blob
}
/**
* Copy an existing sourceBlob in this project to a target project.
* @param {Blob} sourceBlob
* @param {string} targetProjectId
* @return {Promise<void>}
*/
async copyBlob(sourceBlob, targetProjectId) {
assert.instance(sourceBlob, Blob, 'bad sourceBlob')
assert.projectId(targetProjectId, 'bad targetProjectId')
const hash = sourceBlob.getHash()
const sourceProjectId = this.projectId
const { bucket, key: sourceKey } = getBlobLocation(sourceProjectId, hash)
const destKey = makeProjectKey(targetProjectId, hash)
const targetBackend = getBackend(targetProjectId)
logger.debug({ sourceProjectId, targetProjectId, hash }, 'copyBlob started')
try {
await persistor.copyObject(bucket, sourceKey, destKey)
await targetBackend.insertBlob(targetProjectId, sourceBlob)
} finally {
logger.debug(
{ sourceProjectId, targetProjectId, hash },
'copyBlob finished'
)
}
}
}
module.exports = {
BlobStore,
getProjectBlobsBatch,
loadGlobalBlobs,
makeProjectKey,
makeBlobForFile,
getStringLengthOfFile,
GLOBAL_BLOBS,
}
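// --- Editorial example (not part of this commit) ---
// A sketch of the typical write/read round trip; the project id is a
// placeholder mongo-style id, and loadGlobalBlobs() is called once so that
// global blobs are recognised before any lookups.
async function demoBlobStore() {
  await loadGlobalBlobs()
  const blobStore = new BlobStore('507f1f77bcf86cd799439011')
  await blobStore.initialize()
  const blob = await blobStore.putString('hello world')
  const content = await blobStore.getString(blob.getHash())
  console.log(content) // 'hello world'
}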

View File

@@ -0,0 +1,437 @@
// @ts-check
/**
* Mongo backend for the blob store.
*
* Blobs are stored in the projectHistoryBlobs collection. Each project has a
* document in that collection. That document has a "blobs" subdocument whose
* fields are buckets of blobs. The key of a bucket is the first three hex
* digits of the blob hash. The value of the bucket is an array of blobs that
* match the key.
*
* Buckets have a maximum capacity of 8 blobs. When that capacity is exceeded,
* blobs are stored in a secondary collection: the projectHistoryShardedBlobs
* collection. This collection shards blobs between 16 documents per project.
* The shard key is the first hex digit of the hash. The documents are also
* organized in buckets, but the bucket key is made of hex digits 2, 3 and 4.
*/
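// Editorial illustration (not part of this commit) of the layout described
// above, for a placeholder hash 'abc123...' in a placeholder project:
//
//   primary collection:  getBucket('abc123...')        -> 'blobs.abc'
//   sharded collection:  getShardedBucket('abc123...') -> ['a', 'blobs.bc1']
//                        makeShardedId(projectId, 'a') -> Binary(<projectId hex> + '0a')
//
// New blobs go to the sharded collection only once their primary bucket
// already holds MAX_BLOBS_IN_BUCKET (8) entries.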
const { Blob } = require('overleaf-editor-core')
const { ObjectId, Binary, MongoError, ReadPreference } = require('mongodb')
const assert = require('../assert')
const mongodb = require('../mongodb')
const MAX_BLOBS_IN_BUCKET = 8
const DUPLICATE_KEY_ERROR_CODE = 11000
/**
* @typedef {import('mongodb').ReadPreferenceLike} ReadPreferenceLike
*/
/**
* Set up the data structures for a given project.
* @param {string} projectId
*/
async function initialize(projectId) {
assert.mongoId(projectId, 'bad projectId')
try {
await mongodb.blobs.insertOne({
_id: new ObjectId(projectId),
blobs: {},
})
} catch (err) {
if (err instanceof MongoError && err.code === DUPLICATE_KEY_ERROR_CODE) {
return // ignore already initialized case
}
throw err
}
}
/**
* Return blob metadata for the given project and hash.
* @param {string} projectId
* @param {string} hash
* @return {Promise<Blob | null>}
*/
async function findBlob(projectId, hash) {
assert.mongoId(projectId, 'bad projectId')
assert.blobHash(hash, 'bad hash')
const bucket = getBucket(hash)
const result = await mongodb.blobs.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { _id: 0, bucket: `$${bucket}` } }
)
if (result?.bucket == null) {
return null
}
const record = result.bucket.find(blob => blob.h.toString('hex') === hash)
if (record == null) {
if (result.bucket.length >= MAX_BLOBS_IN_BUCKET) {
return await findBlobSharded(projectId, hash)
} else {
return null
}
}
return recordToBlob(record)
}
/**
* Search in the sharded collection for blob metadata
* @param {string} projectId
* @param {string} hash
* @return {Promise<Blob | null>}
*/
async function findBlobSharded(projectId, hash) {
const [shard, bucket] = getShardedBucket(hash)
const id = makeShardedId(projectId, shard)
const result = await mongodb.shardedBlobs.findOne(
{ _id: id },
{ projection: { _id: 0, blobs: `$${bucket}` } }
)
if (result?.blobs == null) {
return null
}
const record = result.blobs.find(blob => blob.h.toString('hex') === hash)
if (!record) return null
return recordToBlob(record)
}
/**
* Read multiple blob metadata records by hexadecimal hashes.
* @param {string} projectId
* @param {Array<string>} hashes
* @return {Promise<Array<Blob>>}
*/
async function findBlobs(projectId, hashes) {
assert.mongoId(projectId, 'bad projectId')
assert.array(hashes, 'bad hashes: not array')
hashes.forEach(function (hash) {
assert.blobHash(hash, 'bad hash')
})
// Build a set of unique buckets
const buckets = new Set(hashes.map(getBucket))
// Get buckets from Mongo
const projection = { _id: 0 }
for (const bucket of buckets) {
projection[bucket] = 1
}
const result = await mongodb.blobs.findOne(
{ _id: new ObjectId(projectId) },
{ projection }
)
if (result?.blobs == null) {
return []
}
// Build blobs from the query results
const hashSet = new Set(hashes)
const blobs = []
for (const bucket of Object.values(result.blobs)) {
for (const record of bucket) {
const hash = record.h.toString('hex')
if (hashSet.has(hash)) {
blobs.push(recordToBlob(record))
hashSet.delete(hash)
}
}
}
// If we haven't found all the blobs, look in the sharded collection
if (hashSet.size > 0) {
const shardedBlobs = await findBlobsSharded(projectId, hashSet)
blobs.push(...shardedBlobs)
}
return blobs
}
/**
* Search in the sharded collection for blob metadata.
* @param {string} projectId
* @param {Set<string>} hashSet
* @return {Promise<Array<Blob>>}
*/
async function findBlobsSharded(projectId, hashSet) {
// Build a map of buckets by shard key
const bucketsByShard = new Map()
for (const hash of hashSet) {
const [shard, bucket] = getShardedBucket(hash)
let buckets = bucketsByShard.get(shard)
if (buckets == null) {
buckets = new Set()
bucketsByShard.set(shard, buckets)
}
buckets.add(bucket)
}
// Make parallel requests to the shards that might contain the hashes we want
const requests = []
for (const [shard, buckets] of bucketsByShard.entries()) {
const id = makeShardedId(projectId, shard)
const projection = { _id: 0 }
for (const bucket of buckets) {
projection[bucket] = 1
}
const request = mongodb.shardedBlobs.findOne({ _id: id }, { projection })
requests.push(request)
}
const results = await Promise.all(requests)
// Build blobs from the query results
const blobs = []
for (const result of results) {
if (result?.blobs == null) {
continue
}
for (const bucket of Object.values(result.blobs)) {
for (const record of bucket) {
const hash = record.h.toString('hex')
if (hashSet.has(hash)) {
blobs.push(recordToBlob(record))
}
}
}
}
return blobs
}
/**
* Return metadata for all blobs in the given project
*/
async function getProjectBlobs(projectId) {
assert.mongoId(projectId, 'bad projectId')
const result = await mongodb.blobs.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { _id: 0 } }
)
if (!result) {
return []
}
// Build blobs from the query results
const blobs = []
for (const bucket of Object.values(result.blobs)) {
for (const record of bucket) {
blobs.push(recordToBlob(record))
}
}
// Look for all possible sharded blobs
const minShardedId = makeShardedId(projectId, '0')
const maxShardedId = makeShardedId(projectId, 'f')
// @ts-ignore We are using a custom _id here.
const shardedRecords = mongodb.shardedBlobs.find(
{
_id: { $gte: minShardedId, $lte: maxShardedId },
},
{ projection: { _id: 0 } }
)
for await (const shardedRecord of shardedRecords) {
if (shardedRecord.blobs == null) {
continue
}
for (const bucket of Object.values(shardedRecord.blobs)) {
for (const record of bucket) {
blobs.push(recordToBlob(record))
}
}
}
return blobs
}
/**
* Return metadata for all blobs in the given projects
* @param {Array<string>} projectIds
* @return {Promise<{ nBlobs: number, blobs: Map<string, Array<Blob>> }>}
*/
async function getProjectBlobsBatch(projectIds) {
for (const project of projectIds) {
assert.mongoId(project, 'bad projectId')
}
let nBlobs = 0
const blobs = new Map()
if (projectIds.length === 0) return { nBlobs, blobs }
// blobs
{
const cursor = await mongodb.blobs.find(
{ _id: { $in: projectIds.map(projectId => new ObjectId(projectId)) } },
{ readPreference: ReadPreference.secondaryPreferred }
)
for await (const record of cursor) {
const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
blobs.set(record._id.toString(), projectBlobs)
nBlobs += projectBlobs.length
}
}
// sharded blobs
{
// @ts-ignore We are using a custom _id here.
const cursor = await mongodb.shardedBlobs.find(
{
_id: {
$gte: makeShardedId(projectIds[0], '0'),
$lte: makeShardedId(projectIds[projectIds.length - 1], 'f'),
},
},
{ readPreference: ReadPreference.secondaryPreferred }
)
for await (const record of cursor) {
const recordIdHex = record._id.toString('hex')
const recordProjectId = recordIdHex.slice(0, 24)
const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
const found = blobs.get(recordProjectId)
if (found) {
found.push(...projectBlobs)
} else {
blobs.set(recordProjectId, projectBlobs)
}
nBlobs += projectBlobs.length
}
}
return { nBlobs, blobs }
}
/**
* Add a blob's metadata to the blobs collection after it has been uploaded.
* @param {string} projectId
* @param {Blob} blob
*/
async function insertBlob(projectId, blob) {
assert.mongoId(projectId, 'bad projectId')
const hash = blob.getHash()
const bucket = getBucket(hash)
const record = blobToRecord(blob)
const result = await mongodb.blobs.updateOne(
{
_id: new ObjectId(projectId),
$expr: {
$lt: [{ $size: { $ifNull: [`$${bucket}`, []] } }, MAX_BLOBS_IN_BUCKET],
},
},
{
$addToSet: { [bucket]: record },
}
)
if (result.matchedCount === 0) {
await insertRecordSharded(projectId, hash, record)
}
}
/**
* Add a blob's metadata to the sharded blobs collection.
* @param {string} projectId
* @param {string} hash
* @param {Record} record
* @return {Promise<void>}
*/
async function insertRecordSharded(projectId, hash, record) {
const [shard, bucket] = getShardedBucket(hash)
const id = makeShardedId(projectId, shard)
await mongodb.shardedBlobs.updateOne(
{ _id: id },
{ $addToSet: { [bucket]: record } },
{ upsert: true }
)
}
/**
* Delete all blobs for a given project.
* @param {string} projectId
*/
async function deleteBlobs(projectId) {
assert.mongoId(projectId, 'bad projectId')
await mongodb.blobs.deleteOne({ _id: new ObjectId(projectId) })
const minShardedId = makeShardedId(projectId, '0')
const maxShardedId = makeShardedId(projectId, 'f')
await mongodb.shardedBlobs.deleteMany({
// @ts-ignore We are using a custom _id here.
_id: { $gte: minShardedId, $lte: maxShardedId },
})
}
/**
* Return the Mongo path to the bucket for the given hash.
* @param {string} hash
* @return {string}
*/
function getBucket(hash) {
return `blobs.${hash.slice(0, 3)}`
}
/**
* Return the shard key and Mongo path to the bucket for the given hash in the
* sharded collection.
* @param {string} hash
* @return {[string, string]}
*/
function getShardedBucket(hash) {
const shard = hash.slice(0, 1)
const bucket = `blobs.${hash.slice(1, 4)}`
return [shard, bucket]
}
/**
* Create an _id key for the sharded collection.
* @param {string} projectId
* @param {string} shard
* @return {Binary}
*/
function makeShardedId(projectId, shard) {
return new Binary(Buffer.from(`${projectId}0${shard}`, 'hex'))
}
/**
* @typedef {Object} Record
* @property {Binary} h
* @property {number} b
* @property {number} [s]
*/
/**
* Return the Mongo record for the given blob.
* @param {Blob} blob
* @return {Record}
*/
function blobToRecord(blob) {
const hash = blob.getHash()
const byteLength = blob.getByteLength()
const stringLength = blob.getStringLength()
return {
h: new Binary(Buffer.from(hash, 'hex')),
b: byteLength,
s: stringLength,
}
}
/**
* Create a blob from the given Mongo record.
* @param {Record} record
* @return {Blob}
*/
function recordToBlob(record) {
return new Blob(record.h.toString('hex'), record.b, record.s)
}
module.exports = {
initialize,
findBlob,
findBlobs,
getProjectBlobs,
getProjectBlobsBatch,
insertBlob,
deleteBlobs,
}

View File

@@ -0,0 +1,161 @@
const { Blob } = require('overleaf-editor-core')
const assert = require('../assert')
const knex = require('../knex')
/**
* Set up the initial data structures for a project
*/
async function initialize(projectId) {
// Nothing to do for Postgres
}
/**
* Return blob metadata for the given project and hash
*/
async function findBlob(projectId, hash) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
assert.blobHash(hash, 'bad hash')
const binaryHash = hashToBuffer(hash)
const record = await knex('project_blobs')
.select('hash_bytes', 'byte_length', 'string_length')
.where({
project_id: projectId,
hash_bytes: binaryHash,
})
.first()
return recordToBlob(record)
}
/**
* Read multiple blob metadata records by hexadecimal hashes.
*
* @param {Array.<string>} hashes hexadecimal SHA-1 hashes
* @return {Promise.<Array.<Blob?>>} no guarantee on order
*/
async function findBlobs(projectId, hashes) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
assert.array(hashes, 'bad hashes: not array')
hashes.forEach(function (hash) {
assert.blobHash(hash, 'bad hash')
})
const binaryHashes = hashes.map(hashToBuffer)
const records = await knex('project_blobs')
.select('hash_bytes', 'byte_length', 'string_length')
.where('project_id', projectId)
.whereIn('hash_bytes', binaryHashes)
const blobs = records.map(recordToBlob)
return blobs
}
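// Because the order of the results is not guaranteed, callers that need to
// look up blobs by hash can re-key the returned array (hypothetical sketch):
//
//   const blobs = await findBlobs(projectId, hashes)
//   const blobsByHash = new Map(blobs.map(blob => [blob.getHash(), blob]))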
/**
* Return metadata for all blobs in the given project
*/
async function getProjectBlobs(projectId) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
const records = await knex('project_blobs')
.select('hash_bytes', 'byte_length', 'string_length')
.where({
project_id: projectId,
})
const blobs = records.map(recordToBlob)
return blobs
}
/**
 * Return metadata for all blobs in the given projects
* @param {Array<number>} projectIds
* @return {Promise<{ nBlobs: number, blobs: Map<number, Array<Blob>> }>}
*/
async function getProjectBlobsBatch(projectIds) {
for (const projectId of projectIds) {
assert.integer(projectId, 'bad projectId')
}
let nBlobs = 0
const blobs = new Map()
if (projectIds.length === 0) return { nBlobs, blobs }
const cursor = knex('project_blobs')
.select('project_id', 'hash_bytes', 'byte_length', 'string_length')
.whereIn('project_id', projectIds)
.stream()
for await (const record of cursor) {
const found = blobs.get(record.project_id)
if (found) {
found.push(recordToBlob(record))
} else {
blobs.set(record.project_id, [recordToBlob(record)])
}
nBlobs++
}
return { nBlobs, blobs }
}
/**
* Add a blob's metadata to the blobs table after it has been uploaded.
*/
async function insertBlob(projectId, blob) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
await knex('project_blobs')
.insert(blobToRecord(projectId, blob))
.onConflict(['project_id', 'hash_bytes'])
.ignore()
}
/**
* Deletes all blobs for a given project
*/
async function deleteBlobs(projectId) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
await knex('project_blobs').where('project_id', projectId).delete()
}
function blobToRecord(projectId, blob) {
return {
project_id: projectId,
hash_bytes: hashToBuffer(blob.hash),
byte_length: blob.getByteLength(),
string_length: blob.getStringLength(),
}
}
function recordToBlob(record) {
if (!record) return
return new Blob(
hashFromBuffer(record.hash_bytes),
record.byte_length,
record.string_length
)
}
function hashToBuffer(hash) {
if (!hash) return
return Buffer.from(hash, 'hex')
}
function hashFromBuffer(buffer) {
if (!buffer) return
return buffer.toString('hex')
}
module.exports = {
initialize,
findBlob,
findBlobs,
getProjectBlobs,
getProjectBlobsBatch,
insertBlob,
deleteBlobs,
}

View File

@@ -0,0 +1,40 @@
'use strict'
/**
* @module storage/lib/chunk_buffer
*/
const chunkStore = require('../chunk_store')
const redisBackend = require('../chunk_store/redis')
const metrics = require('@overleaf/metrics')
/**
* Load the latest Chunk stored for a project, including blob metadata.
*
* @param {string} projectId
* @return {Promise.<Chunk>}
*/
async function loadLatest(projectId) {
const cachedChunk = await redisBackend.getCurrentChunk(projectId)
const chunkRecord = await chunkStore.loadLatestRaw(projectId)
const cachedChunkIsValid = redisBackend.checkCacheValidityWithMetadata(
cachedChunk,
chunkRecord
)
if (cachedChunkIsValid) {
metrics.inc('chunk_buffer.loadLatest', 1, {
status: 'cache-hit',
})
return cachedChunk
} else {
metrics.inc('chunk_buffer.loadLatest', 1, {
status: 'cache-miss',
})
const chunk = await chunkStore.loadLatest(projectId)
await redisBackend.setCurrentChunk(projectId, chunk)
return chunk
}
}
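// A minimal usage sketch (hypothetical caller):
//
//   const chunkBuffer = require('../chunk_buffer')
//   const chunk = await chunkBuffer.loadLatest(projectId)
//   chunk.getEndVersion() // latest version, served from Redis when the cache is fresh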
module.exports = {
loadLatest,
}

View File

@@ -0,0 +1,7 @@
const OError = require('@overleaf/o-error')
class ChunkVersionConflictError extends OError {}
module.exports = {
ChunkVersionConflictError,
}

View File

@@ -0,0 +1,447 @@
// @ts-check
'use strict'
/**
* Manage {@link Chunk} and {@link History} storage.
*
* For storage, chunks are immutable. If we want to update a project with new
* changes, we create a new chunk record and History object and delete the old
* ones. If we compact a project's history, we similarly destroy the old chunk
* (or chunks) and replace them with a new one. This is helpful when using S3,
* because it guarantees only eventual consistency for updates but provides
* stronger consistency guarantees for object creation.
*
* When a chunk record in the database is removed, we save its ID for later
* in the `old_chunks` table, rather than deleting it immediately. This lets us
* use batch deletion to reduce the number of delete requests to S3.
*
* The chunk store also caches data about which blobs are referenced by each
* chunk, which allows us to find unused blobs without loading all of the data
* for all projects from S3. Whenever we create a chunk, we also insert records
* into the `chunk_blobs` table, to help with this bookkeeping.
*/
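// A minimal lifecycle sketch (hypothetical usage of the functions exported
// below):
//
//   const chunkStore = require('.')
//   const projectId = await chunkStore.initializeProject()
//   const chunk = await chunkStore.loadLatest(projectId)
//   // ...build newChunk with the appended changes, then replace the latest chunk:
//   // await chunkStore.update(projectId, chunk.getEndVersion(), newChunk)
//   // Replaced chunk records are kept as "old chunks" and cleaned up later by
//   // deleteOldChunks().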
const config = require('config')
const OError = require('@overleaf/o-error')
const { Chunk, History, Snapshot } = require('overleaf-editor-core')
const assert = require('../assert')
const BatchBlobStore = require('../batch_blob_store')
const { BlobStore } = require('../blob_store')
const { historyStore } = require('../history_store')
const mongoBackend = require('./mongo')
const postgresBackend = require('./postgres')
const { ChunkVersionConflictError } = require('./errors')
const DEFAULT_DELETE_BATCH_SIZE = parseInt(config.get('maxDeleteKeys'), 10)
const DEFAULT_DELETE_TIMEOUT_SECS = 3000 // 50 minutes
const DEFAULT_DELETE_MIN_AGE_SECS = 86400 // 1 day
/**
* Create the initial chunk for a project.
*/
async function initializeProject(projectId, snapshot) {
if (projectId != null) {
assert.projectId(projectId, 'bad projectId')
} else {
projectId = await postgresBackend.generateProjectId()
}
if (snapshot != null) {
assert.instance(snapshot, Snapshot, 'bad snapshot')
} else {
snapshot = new Snapshot()
}
const blobStore = new BlobStore(projectId)
await blobStore.initialize()
const backend = getBackend(projectId)
const chunkRecord = await backend.getLatestChunk(projectId)
if (chunkRecord != null) {
throw new AlreadyInitialized(projectId)
}
const history = new History(snapshot, [])
const chunk = new Chunk(history, 0)
await create(projectId, chunk)
return projectId
}
/**
* Load the blobs referenced in the given history
*/
async function lazyLoadHistoryFiles(history, batchBlobStore) {
const blobHashes = new Set()
history.findBlobHashes(blobHashes)
await batchBlobStore.preload(Array.from(blobHashes))
await history.loadFiles('lazy', batchBlobStore)
}
/**
* Load the latest Chunk stored for a project, including blob metadata.
*
* @param {string} projectId
* @param {Object} [opts]
* @param {boolean} [opts.readOnly]
* @return {Promise<{id: string, startVersion: number, endVersion: number, endTimestamp: Date}>}
*/
async function loadLatestRaw(projectId, opts) {
assert.projectId(projectId, 'bad projectId')
const backend = getBackend(projectId)
const chunkRecord = await backend.getLatestChunk(projectId, opts)
if (chunkRecord == null) {
throw new Chunk.NotFoundError(projectId)
}
return chunkRecord
}
/**
* Load the latest Chunk stored for a project, including blob metadata.
*
* @param {string} projectId
* @return {Promise.<Chunk>}
*/
async function loadLatest(projectId) {
const chunkRecord = await loadLatestRaw(projectId)
const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
const history = History.fromRaw(rawHistory)
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
await lazyLoadHistoryFiles(history, batchBlobStore)
return new Chunk(history, chunkRecord.startVersion)
}
/**
 * Load the chunk that contains the given version, including blob metadata.
*/
async function loadAtVersion(projectId, version) {
assert.projectId(projectId, 'bad projectId')
assert.integer(version, 'bad version')
const backend = getBackend(projectId)
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
const chunkRecord = await backend.getChunkForVersion(projectId, version)
const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
const history = History.fromRaw(rawHistory)
await lazyLoadHistoryFiles(history, batchBlobStore)
return new Chunk(history, chunkRecord.endVersion - history.countChanges())
}
/**
* Load the chunk that contains the version that was current at the given
* timestamp, including blob metadata.
*/
async function loadAtTimestamp(projectId, timestamp) {
assert.projectId(projectId, 'bad projectId')
assert.date(timestamp, 'bad timestamp')
const backend = getBackend(projectId)
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
const chunkRecord = await backend.getChunkForTimestamp(projectId, timestamp)
const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
const history = History.fromRaw(rawHistory)
await lazyLoadHistoryFiles(history, batchBlobStore)
return new Chunk(history, chunkRecord.endVersion - history.countChanges())
}
/**
* Store the chunk and insert corresponding records in the database.
*
* @param {string} projectId
* @param {Chunk} chunk
* @param {Date} [earliestChangeTimestamp]
*/
async function create(projectId, chunk, earliestChangeTimestamp) {
assert.projectId(projectId, 'bad projectId')
assert.instance(chunk, Chunk, 'bad chunk')
assert.maybe.date(earliestChangeTimestamp, 'bad timestamp')
const backend = getBackend(projectId)
const chunkStart = chunk.getStartVersion()
const chunkId = await uploadChunk(projectId, chunk)
const opts = {}
if (chunkStart > 0) {
opts.oldChunkId = await getChunkIdForVersion(projectId, chunkStart - 1)
}
if (earliestChangeTimestamp != null) {
opts.earliestChangeTimestamp = earliestChangeTimestamp
}
await backend.confirmCreate(projectId, chunk, chunkId, opts)
}
/**
* Upload the given chunk to object storage.
*
* This is used by the create and update methods.
*/
async function uploadChunk(projectId, chunk) {
const backend = getBackend(projectId)
const blobStore = new BlobStore(projectId)
const historyStoreConcurrency = parseInt(
config.get('chunkStore.historyStoreConcurrency'),
10
)
const rawHistory = await chunk
.getHistory()
.store(blobStore, historyStoreConcurrency)
const chunkId = await backend.insertPendingChunk(projectId, chunk)
await historyStore.storeRaw(projectId, chunkId, rawHistory)
return chunkId
}
/**
* Extend the project's history by replacing the latest chunk with a new
* chunk.
*
* @param {string} projectId
* @param {number} oldEndVersion
* @param {Chunk} newChunk
* @param {Date} [earliestChangeTimestamp]
* @return {Promise}
*/
async function update(
projectId,
oldEndVersion,
newChunk,
earliestChangeTimestamp
) {
assert.projectId(projectId, 'bad projectId')
assert.integer(oldEndVersion, 'bad oldEndVersion')
assert.instance(newChunk, Chunk, 'bad newChunk')
assert.maybe.date(earliestChangeTimestamp, 'bad timestamp')
const backend = getBackend(projectId)
const oldChunkId = await getChunkIdForVersion(projectId, oldEndVersion)
const newChunkId = await uploadChunk(projectId, newChunk)
const opts = {}
if (earliestChangeTimestamp != null) {
opts.earliestChangeTimestamp = earliestChangeTimestamp
}
await backend.confirmUpdate(projectId, oldChunkId, newChunk, newChunkId, opts)
}
/**
* Find the chunk ID for a given version of a project.
*
* @param {string} projectId
* @param {number} version
* @return {Promise.<string>}
*/
async function getChunkIdForVersion(projectId, version) {
const backend = getBackend(projectId)
const chunkRecord = await backend.getChunkForVersion(projectId, version)
return chunkRecord.id
}
/**
* Find the chunk metadata for a given version of a project.
*
* @param {string} projectId
* @param {number} version
* @return {Promise.<{id: string|number, startVersion: number, endVersion: number}>}
*/
async function getChunkMetadataForVersion(projectId, version) {
const backend = getBackend(projectId)
const chunkRecord = await backend.getChunkForVersion(projectId, version)
return chunkRecord
}
/**
* Get all of a project's chunk ids
*/
async function getProjectChunkIds(projectId) {
const backend = getBackend(projectId)
const chunkIds = await backend.getProjectChunkIds(projectId)
return chunkIds
}
/**
 * Get all of a project's chunks directly
*/
async function getProjectChunks(projectId) {
const backend = getBackend(projectId)
const chunkIds = await backend.getProjectChunks(projectId)
return chunkIds
}
/**
* Load the chunk for a given chunk record, including blob metadata.
*/
async function loadByChunkRecord(projectId, chunkRecord) {
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
const { raw: rawHistory, buffer: chunkBuffer } =
await historyStore.loadRawWithBuffer(projectId, chunkRecord.id)
const history = History.fromRaw(rawHistory)
await lazyLoadHistoryFiles(history, batchBlobStore)
return {
chunk: new Chunk(history, chunkRecord.endVersion - history.countChanges()),
chunkBuffer,
}
}
/**
* Asynchronously retrieves project chunks starting from a specific version.
*
* This generator function yields chunk records for a given project starting from the specified version (inclusive).
* It continues to fetch and yield subsequent chunk records until the end version of the latest chunk metadata is reached.
* If you want to fetch all the chunks *after* a version V, call this function with V+1.
*
* @param {string} projectId - The ID of the project.
* @param {number} version - The starting version to retrieve chunks from.
* @returns {AsyncGenerator<Object, void, undefined>} An async generator that yields chunk records.
*/
async function* getProjectChunksFromVersion(projectId, version) {
const backend = getBackend(projectId)
const latestChunkMetadata = await loadLatestRaw(projectId)
if (!latestChunkMetadata || version > latestChunkMetadata.endVersion) {
return
}
let chunkRecord = await backend.getChunkForVersion(projectId, version)
while (chunkRecord != null) {
yield chunkRecord
if (chunkRecord.endVersion >= latestChunkMetadata.endVersion) {
break
} else {
chunkRecord = await backend.getChunkForVersion(
projectId,
chunkRecord.endVersion + 1
)
}
}
}
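// Example (hypothetical): iterate over every chunk after version `v`:
//
//   for await (const chunkRecord of getProjectChunksFromVersion(projectId, v + 1)) {
//     const { chunk } = await loadByChunkRecord(projectId, chunkRecord)
//     // ...process chunk
//   }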
/**
* Delete the given chunk from the database.
*
 * This doesn't delete the chunk from object storage yet. The chunk is recorded
 * in the old chunks list and removed from object storage later by
 * deleteOldChunks().
*/
async function destroy(projectId, chunkId) {
const backend = getBackend(projectId)
await backend.deleteChunk(projectId, chunkId)
}
/**
* Delete all of a project's chunks from the database.
*/
async function deleteProjectChunks(projectId) {
const backend = getBackend(projectId)
await backend.deleteProjectChunks(projectId)
}
/**
* Delete a given number of old chunks from both the database
* and from object storage.
*
* @param {object} options
* @param {number} [options.batchSize] - number of chunks to delete in each
* batch
* @param {number} [options.maxBatches] - maximum number of batches to process
* @param {number} [options.minAgeSecs] - minimum age of chunks to delete
* @param {number} [options.timeout] - maximum time to spend deleting chunks
*
* @return {Promise<number>} number of chunks deleted
*/
async function deleteOldChunks(options = {}) {
const batchSize = options.batchSize ?? DEFAULT_DELETE_BATCH_SIZE
const maxBatches = options.maxBatches ?? Number.MAX_SAFE_INTEGER
const minAgeSecs = options.minAgeSecs ?? DEFAULT_DELETE_MIN_AGE_SECS
const timeout = options.timeout ?? DEFAULT_DELETE_TIMEOUT_SECS
assert.greater(batchSize, 0)
assert.greater(timeout, 0)
assert.greater(maxBatches, 0)
assert.greaterOrEqual(minAgeSecs, 0)
const timeoutAfter = Date.now() + timeout * 1000
let deletedChunksTotal = 0
for (const backend of [postgresBackend, mongoBackend]) {
for (let i = 0; i < maxBatches; i++) {
if (Date.now() > timeoutAfter) {
break
}
const deletedChunks = await deleteOldChunksBatch(
backend,
batchSize,
minAgeSecs
)
deletedChunksTotal += deletedChunks.length
if (deletedChunks.length !== batchSize) {
// Last batch was incomplete. There probably are no old chunks left
break
}
}
}
return deletedChunksTotal
}
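// Example (hypothetical cleanup job): delete up to 10 batches of chunks that
// were removed at least a day ago, spending at most 5 minutes on it:
//
//   const nDeleted = await deleteOldChunks({
//     maxBatches: 10,
//     minAgeSecs: 86400,
//     timeout: 300,
//   })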
async function deleteOldChunksBatch(backend, count, minAgeSecs) {
assert.greater(count, 0, 'bad count')
assert.greaterOrEqual(minAgeSecs, 0, 'bad minAgeSecs')
const oldChunks = await backend.getOldChunksBatch(count, minAgeSecs)
if (oldChunks.length === 0) {
return []
}
await historyStore.deleteChunks(oldChunks)
await backend.deleteOldChunks(oldChunks.map(chunk => chunk.chunkId))
return oldChunks
}
/**
* Returns the appropriate backend for the given project id
*
* Numeric ids use the Postgres backend.
* Strings of 24 characters use the Mongo backend.
*/
function getBackend(projectId) {
if (assert.POSTGRES_ID_REGEXP.test(projectId)) {
return postgresBackend
} else if (assert.MONGO_ID_REGEXP.test(projectId)) {
return mongoBackend
} else {
throw new OError('bad project id', { projectId })
}
}
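// For example, getBackend('42') returns the Postgres backend, while
// getBackend('507f191e810c19729de860ea') returns the Mongo backend.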
class AlreadyInitialized extends OError {
constructor(projectId) {
super('Project is already initialized', { projectId })
}
}
module.exports = {
getBackend,
initializeProject,
loadLatest,
loadLatestRaw,
loadAtVersion,
loadAtTimestamp,
loadByChunkRecord,
create,
update,
destroy,
getChunkIdForVersion,
getChunkMetadataForVersion,
getProjectChunkIds,
getProjectChunks,
getProjectChunksFromVersion,
deleteProjectChunks,
deleteOldChunks,
AlreadyInitialized,
ChunkVersionConflictError,
}

View File

@@ -0,0 +1,526 @@
// @ts-check
const { ObjectId, ReadPreference, MongoError } = require('mongodb')
const { Chunk } = require('overleaf-editor-core')
const OError = require('@overleaf/o-error')
const assert = require('../assert')
const mongodb = require('../mongodb')
const { ChunkVersionConflictError } = require('./errors')
const DUPLICATE_KEY_ERROR_CODE = 11000
/**
* @import { ClientSession } from 'mongodb'
*/
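// Chunk records in Mongo move through a small state machine, implemented by
// the functions below:
//
//   pending --activateChunk--> active --closeChunk--> closed
//   active  --deleteActiveChunk / deleteChunk / deleteProjectChunks--> deleted
//   deleted and stale pending records are removed by deleteOldChunks() once
//   the corresponding objects have been deleted from object storage.
//
// confirmCreate and confirmUpdate perform their transitions inside a Mongo
// transaction, so readers never observe a project part-way through a chunk
// replacement.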
/**
* Get the latest chunk's metadata from the database
* @param {string} projectId
* @param {Object} [opts]
* @param {boolean} [opts.readOnly]
*/
async function getLatestChunk(projectId, opts = {}) {
assert.mongoId(projectId, 'bad projectId')
const { readOnly = false } = opts
const record = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
},
{
sort: { startVersion: -1 },
readPreference: readOnly
? ReadPreference.secondaryPreferred
: ReadPreference.primary,
}
)
if (record == null) {
return null
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the chunk that contains the given version.
*/
async function getChunkForVersion(projectId, version) {
assert.mongoId(projectId, 'bad projectId')
assert.integer(version, 'bad version')
const record = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
startVersion: { $lte: version },
endVersion: { $gte: version },
},
{ sort: { startVersion: 1 } }
)
if (record == null) {
throw new Chunk.VersionNotFoundError(projectId, version)
}
return chunkFromRecord(record)
}
/**
 * Get the metadata for the first chunk of the project (the chunk starting at
 * version 0) whose end timestamp is at or before the given timestamp, falling
 * back to a deleted chunk if the active first chunk is too recent.
*/
async function getFirstChunkBeforeTimestamp(projectId, timestamp) {
assert.mongoId(projectId, 'bad projectId')
assert.date(timestamp, 'bad timestamp')
const recordActive = await getChunkForVersion(projectId, 0)
if (recordActive && recordActive.endTimestamp <= timestamp) {
return recordActive
}
// fallback to deleted chunk
const recordDeleted = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: 'deleted',
startVersion: 0,
updatedAt: { $lte: timestamp }, // indexed for state=deleted
endTimestamp: { $lte: timestamp },
},
{ sort: { updatedAt: -1 } }
)
if (recordDeleted) {
return chunkFromRecord(recordDeleted)
}
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
/**
* Get the metadata for the chunk that contains the version that was current at
* the given timestamp.
*/
async function getChunkForTimestamp(projectId, timestamp) {
assert.mongoId(projectId, 'bad projectId')
assert.date(timestamp, 'bad timestamp')
const record = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
endTimestamp: { $gte: timestamp },
},
// We use the index on the startVersion for sorting records. This assumes
// that timestamps go up with each version.
{ sort: { startVersion: 1 } }
)
if (record == null) {
// Couldn't find a chunk that had modifications after the given timestamp.
// Fetch the latest chunk instead.
const chunk = await getLatestChunk(projectId)
if (chunk == null) {
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
return chunk
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the chunk that contains the version that was current before
* the given timestamp.
*/
async function getLastActiveChunkBeforeTimestamp(projectId, timestamp) {
assert.mongoId(projectId, 'bad projectId')
assert.date(timestamp, 'bad timestamp')
const record = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
$or: [
{
endTimestamp: {
$lte: timestamp,
},
},
{
endTimestamp: null,
},
],
},
// We use the index on the startVersion for sorting records. This assumes
// that timestamps go up with each version.
{ sort: { startVersion: -1 } }
)
if (record == null) {
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
return chunkFromRecord(record)
}
/**
* Get all of a project's chunk ids
*/
async function getProjectChunkIds(projectId) {
assert.mongoId(projectId, 'bad projectId')
const cursor = mongodb.chunks.find(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
},
{ projection: { _id: 1 } }
)
return await cursor.map(record => record._id).toArray()
}
/**
 * Get all of a project's chunks directly
*/
async function getProjectChunks(projectId) {
assert.mongoId(projectId, 'bad projectId')
const cursor = mongodb.chunks
.find(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
},
{ projection: { state: 0 } }
)
.sort({ startVersion: 1 })
return await cursor.map(chunkFromRecord).toArray()
}
/**
* Insert a pending chunk before sending it to object storage.
*/
async function insertPendingChunk(projectId, chunk) {
assert.mongoId(projectId, 'bad projectId')
assert.instance(chunk, Chunk, 'bad chunk')
const chunkId = new ObjectId()
await mongodb.chunks.insertOne({
_id: chunkId,
projectId: new ObjectId(projectId),
startVersion: chunk.getStartVersion(),
endVersion: chunk.getEndVersion(),
endTimestamp: chunk.getEndTimestamp(),
state: 'pending',
updatedAt: new Date(),
})
return chunkId.toString()
}
/**
* Record that a new chunk was created.
*
* @param {string} projectId
* @param {Chunk} chunk
* @param {string} chunkId
* @param {object} opts
* @param {Date} [opts.earliestChangeTimestamp]
* @param {string} [opts.oldChunkId]
*/
async function confirmCreate(projectId, chunk, chunkId, opts = {}) {
assert.mongoId(projectId, 'bad projectId')
  assert.instance(chunk, Chunk, 'bad chunk')
  assert.mongoId(chunkId, 'bad chunkId')
await mongodb.client.withSession(async session => {
await session.withTransaction(async () => {
if (opts.oldChunkId != null) {
await closeChunk(projectId, opts.oldChunkId, { session })
}
await activateChunk(projectId, chunkId, { session })
await updateProjectRecord(
projectId,
chunk,
opts.earliestChangeTimestamp,
{ session }
)
})
})
}
/**
* Write the metadata to the project record
*/
async function updateProjectRecord(
projectId,
chunk,
earliestChangeTimestamp,
mongoOpts = {}
) {
// record the end version against the project
await mongodb.projects.updateOne(
{
'overleaf.history.id': projectId, // string for Object ids, number for postgres ids
},
{
// always store the latest end version and timestamp for the chunk
$max: {
'overleaf.history.currentEndVersion': chunk.getEndVersion(),
'overleaf.history.currentEndTimestamp': chunk.getEndTimestamp(),
'overleaf.history.updatedAt': new Date(),
},
// store the first pending change timestamp for the chunk, this will
// be cleared every time a backup is completed.
$min: {
'overleaf.backup.pendingChangeAt':
earliestChangeTimestamp || chunk.getEndTimestamp() || new Date(),
},
},
mongoOpts
)
}
/**
* Record that a chunk was replaced by a new one.
*
* @param {string} projectId
* @param {string} oldChunkId
* @param {Chunk} newChunk
* @param {string} newChunkId
* @param {object} [opts]
* @param {Date} [opts.earliestChangeTimestamp]
*/
async function confirmUpdate(
projectId,
oldChunkId,
newChunk,
newChunkId,
opts = {}
) {
assert.mongoId(projectId, 'bad projectId')
assert.mongoId(oldChunkId, 'bad oldChunkId')
assert.instance(newChunk, Chunk, 'bad newChunk')
assert.mongoId(newChunkId, 'bad newChunkId')
await mongodb.client.withSession(async session => {
await session.withTransaction(async () => {
await deleteActiveChunk(projectId, oldChunkId, { session })
await activateChunk(projectId, newChunkId, { session })
await updateProjectRecord(
projectId,
newChunk,
opts.earliestChangeTimestamp,
{ session }
)
})
})
}
/**
* Activate a pending chunk
*
* @param {string} projectId
* @param {string} chunkId
* @param {object} [opts]
* @param {ClientSession} [opts.session]
*/
async function activateChunk(projectId, chunkId, opts = {}) {
assert.mongoId(projectId, 'bad projectId')
assert.mongoId(chunkId, 'bad chunkId')
let result
try {
result = await mongodb.chunks.updateOne(
{
_id: new ObjectId(chunkId),
projectId: new ObjectId(projectId),
state: 'pending',
},
{ $set: { state: 'active', updatedAt: new Date() } },
opts
)
} catch (err) {
if (err instanceof MongoError && err.code === DUPLICATE_KEY_ERROR_CODE) {
throw new ChunkVersionConflictError('chunk start version is not unique', {
projectId,
chunkId,
})
} else {
throw err
}
}
if (result.matchedCount === 0) {
throw new OError('pending chunk not found', { projectId, chunkId })
}
}
/**
* Close a chunk
*
* A closed chunk is one that can't be extended anymore.
*
* @param {string} projectId
* @param {string} chunkId
* @param {object} [opts]
* @param {ClientSession} [opts.session]
*/
async function closeChunk(projectId, chunkId, opts = {}) {
const result = await mongodb.chunks.updateOne(
{
_id: new ObjectId(chunkId),
projectId: new ObjectId(projectId),
state: 'active',
},
{ $set: { state: 'closed' } },
opts
)
if (result.matchedCount === 0) {
throw new ChunkVersionConflictError('unable to close chunk', {
projectId,
chunkId,
})
}
}
/**
* Delete an active chunk
*
* This is used to delete chunks that are in the process of being extended. It
* will refuse to delete chunks that are already closed and can therefore not be
* extended.
*
* @param {string} projectId
* @param {string} chunkId
* @param {object} [opts]
* @param {ClientSession} [opts.session]
*/
async function deleteActiveChunk(projectId, chunkId, opts = {}) {
const updateResult = await mongodb.chunks.updateOne(
{
_id: new ObjectId(chunkId),
projectId: new ObjectId(projectId),
state: 'active',
},
{ $set: { state: 'deleted', updatedAt: new Date() } },
opts
)
if (updateResult.matchedCount === 0) {
throw new ChunkVersionConflictError('unable to delete active chunk', {
projectId,
chunkId,
})
}
}
/**
* Delete a chunk.
*
* @param {string} projectId
* @param {string} chunkId
* @return {Promise}
*/
async function deleteChunk(projectId, chunkId, mongoOpts = {}) {
assert.mongoId(projectId, 'bad projectId')
assert.mongoId(chunkId, 'bad chunkId')
await mongodb.chunks.updateOne(
{ _id: new ObjectId(chunkId), projectId: new ObjectId(projectId) },
{ $set: { state: 'deleted', updatedAt: new Date() } },
mongoOpts
)
}
/**
* Delete all of a project's chunks
*/
async function deleteProjectChunks(projectId) {
assert.mongoId(projectId, 'bad projectId')
await mongodb.chunks.updateMany(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
},
{ $set: { state: 'deleted', updatedAt: new Date() } }
)
}
/**
* Get a batch of old chunks for deletion
*/
async function getOldChunksBatch(count, minAgeSecs) {
const maxUpdatedAt = new Date(Date.now() - minAgeSecs * 1000)
const batch = []
// We need to fetch one state at a time to take advantage of the partial
// indexes on the chunks collection.
//
// Mongo 6.0 allows partial indexes that use the $in operator. When we reach
// that Mongo version, we can create a partial index on both the deleted and
// pending states and simplify this logic a bit.
for (const state of ['deleted', 'pending']) {
if (count === 0) {
// There's no more space in the batch
break
}
const cursor = mongodb.chunks
.find(
{ state, updatedAt: { $lt: maxUpdatedAt } },
{
limit: count,
projection: { _id: 1, projectId: 1 },
}
)
.map(record => ({
chunkId: record._id.toString(),
projectId: record.projectId.toString(),
}))
for await (const record of cursor) {
batch.push(record)
count -= 1
}
}
return batch
}
/**
* Delete a batch of old chunks from the database
*/
async function deleteOldChunks(chunkIds) {
await mongodb.chunks.deleteMany({
_id: { $in: chunkIds.map(id => new ObjectId(id)) },
state: { $in: ['deleted', 'pending'] },
})
}
/**
* Build a chunk metadata object from the database record
*/
function chunkFromRecord(record) {
return {
id: record._id.toString(),
startVersion: record.startVersion,
endVersion: record.endVersion,
endTimestamp: record.endTimestamp,
}
}
module.exports = {
getLatestChunk,
getFirstChunkBeforeTimestamp,
getLastActiveChunkBeforeTimestamp,
getChunkForVersion,
getChunkForTimestamp,
getProjectChunkIds,
getProjectChunks,
insertPendingChunk,
confirmCreate,
confirmUpdate,
updateProjectRecord,
deleteChunk,
deleteProjectChunks,
getOldChunksBatch,
deleteOldChunks,
}

View File

@@ -0,0 +1,487 @@
// @ts-check
const { Chunk } = require('overleaf-editor-core')
const assert = require('../assert')
const knex = require('../knex')
const knexReadOnly = require('../knex_read_only')
const { ChunkVersionConflictError } = require('./errors')
const { updateProjectRecord } = require('./mongo')
const DUPLICATE_KEY_ERROR_CODE = '23505'
/**
* @import { Knex } from 'knex'
*/
/**
* Get the latest chunk's metadata from the database
* @param {string} projectId
* @param {Object} [opts]
* @param {boolean} [opts.readOnly]
*/
async function getLatestChunk(projectId, opts = {}) {
assert.postgresId(projectId, 'bad projectId')
const { readOnly = false } = opts
const record = await (readOnly ? knexReadOnly : knex)('chunks')
.where('doc_id', parseInt(projectId, 10))
.orderBy('end_version', 'desc')
.first()
if (record == null) {
return null
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the chunk that contains the given version.
*
* @param {string} projectId
* @param {number} version
*/
async function getChunkForVersion(projectId, version) {
assert.postgresId(projectId, 'bad projectId')
const record = await knex('chunks')
.where('doc_id', parseInt(projectId, 10))
.where('end_version', '>=', version)
.orderBy('end_version')
.first()
if (!record) {
throw new Chunk.VersionNotFoundError(projectId, version)
}
return chunkFromRecord(record)
}
/**
 * Get the metadata for the first chunk of the project (start version 0) whose
 * end timestamp is at or before the given timestamp, falling back to deleted
 * chunks in `old_chunks`.
*
* @param {string} projectId
* @param {Date} timestamp
*/
async function getFirstChunkBeforeTimestamp(projectId, timestamp) {
assert.date(timestamp, 'bad timestamp')
const recordActive = await getChunkForVersion(projectId, 0)
// projectId must be valid if getChunkForVersion did not throw
if (recordActive && recordActive.endTimestamp <= timestamp) {
return recordActive
}
// fallback to deleted chunk
const recordDeleted = await knex('old_chunks')
.where('doc_id', parseInt(projectId, 10))
.where('start_version', '=', 0)
.where('end_timestamp', '<=', timestamp)
.orderBy('end_version', 'desc')
.first()
if (recordDeleted) {
return chunkFromRecord(recordDeleted)
}
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
/**
 * Get the metadata for the chunk that contains the version that was current
 * before the given timestamp.
*
* @param {string} projectId
* @param {Date} timestamp
*/
async function getLastActiveChunkBeforeTimestamp(projectId, timestamp) {
assert.date(timestamp, 'bad timestamp')
assert.postgresId(projectId, 'bad projectId')
const query = knex('chunks')
.where('doc_id', parseInt(projectId, 10))
.where(function () {
this.where('end_timestamp', '<=', timestamp).orWhere(
'end_timestamp',
null
)
})
.orderBy('end_version', 'desc', 'last')
const record = await query.first()
if (!record) {
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the chunk that contains the version that was current at
* the given timestamp.
*
* @param {string} projectId
* @param {Date} timestamp
*/
async function getChunkForTimestamp(projectId, timestamp) {
assert.postgresId(projectId, 'bad projectId')
  // This query finds the earliest chunk whose end_timestamp is at or after the
  // given timestamp, OR falls back to the latest chunk for the project. The
  // fallback covers the case where the timestamp is more recent than the latest
  // chunk's end_timestamp, which would otherwise return no results.
const whereAfterEndTimestampOrLatestChunk = knex.raw(
'end_timestamp >= ? ' +
'OR id = ( ' +
'SELECT id FROM chunks ' +
'WHERE doc_id = ? ' +
'ORDER BY end_version desc LIMIT 1' +
')',
[timestamp, parseInt(projectId, 10)]
)
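  // The generated SQL is roughly the following (an approximate rendering of
  // the knex query built here, shown for clarity):
  //
  //   SELECT * FROM chunks
  //   WHERE doc_id = ?
  //     AND (end_timestamp >= ?
  //          OR id = (SELECT id FROM chunks WHERE doc_id = ?
  //                   ORDER BY end_version DESC LIMIT 1))
  //   ORDER BY end_version
  //   LIMIT 1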
const record = await knex('chunks')
.where('doc_id', parseInt(projectId, 10))
.where(whereAfterEndTimestampOrLatestChunk)
.orderBy('end_version')
.first()
if (!record) {
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
return chunkFromRecord(record)
}
/**
* Build a chunk metadata object from the database record
*/
function chunkFromRecord(record) {
return {
id: record.id.toString(),
startVersion: record.start_version,
endVersion: record.end_version,
endTimestamp: record.end_timestamp,
}
}
/**
* Get all of a project's chunk ids
*
* @param {string} projectId
*/
async function getProjectChunkIds(projectId) {
assert.postgresId(projectId, 'bad projectId')
const records = await knex('chunks')
.select('id')
.where('doc_id', parseInt(projectId, 10))
return records.map(record => record.id)
}
/**
 * Get all of a project's chunks directly
*
* @param {string} projectId
*/
async function getProjectChunks(projectId) {
assert.postgresId(projectId, 'bad projectId')
const records = await knex('chunks')
.select()
.where('doc_id', parseInt(projectId, 10))
.orderBy('end_version')
return records.map(chunkFromRecord)
}
/**
* Insert a pending chunk before sending it to object storage.
*
* @param {string} projectId
* @param {Chunk} chunk
*/
async function insertPendingChunk(projectId, chunk) {
assert.postgresId(projectId, 'bad projectId')
const result = await knex.first(
knex.raw("nextval('chunks_id_seq'::regclass)::integer as chunkid")
)
const chunkId = result.chunkid
await knex('pending_chunks').insert({
id: chunkId,
doc_id: parseInt(projectId, 10),
end_version: chunk.getEndVersion(),
start_version: chunk.getStartVersion(),
end_timestamp: chunk.getEndTimestamp(),
})
return chunkId.toString()
}
/**
* Record that a new chunk was created.
*
* @param {string} projectId
* @param {Chunk} chunk
* @param {string} chunkId
* @param {object} opts
* @param {Date} [opts.earliestChangeTimestamp]
* @param {string} [opts.oldChunkId]
*/
async function confirmCreate(projectId, chunk, chunkId, opts = {}) {
assert.postgresId(projectId, 'bad projectId')
await knex.transaction(async tx => {
if (opts.oldChunkId != null) {
await _assertChunkIsNotClosed(tx, projectId, opts.oldChunkId)
await _closeChunk(tx, projectId, opts.oldChunkId)
}
await Promise.all([
_deletePendingChunk(tx, projectId, chunkId),
_insertChunk(tx, projectId, chunk, chunkId),
])
await updateProjectRecord(
// The history id in Mongo is an integer for Postgres projects
parseInt(projectId, 10),
chunk,
opts.earliestChangeTimestamp
)
})
}
/**
* Record that a chunk was replaced by a new one.
*
* @param {string} projectId
* @param {string} oldChunkId
* @param {Chunk} newChunk
* @param {string} newChunkId
*/
async function confirmUpdate(
projectId,
oldChunkId,
newChunk,
newChunkId,
opts = {}
) {
assert.postgresId(projectId, 'bad projectId')
await knex.transaction(async tx => {
await _assertChunkIsNotClosed(tx, projectId, oldChunkId)
await _deleteChunks(tx, { doc_id: projectId, id: oldChunkId })
await Promise.all([
_deletePendingChunk(tx, projectId, newChunkId),
_insertChunk(tx, projectId, newChunk, newChunkId),
])
await updateProjectRecord(
// The history id in Mongo is an integer for Postgres projects
parseInt(projectId, 10),
newChunk,
opts.earliestChangeTimestamp
)
})
}
/**
* Delete a pending chunk
*
* @param {Knex} tx
* @param {string} projectId
* @param {string} chunkId
*/
async function _deletePendingChunk(tx, projectId, chunkId) {
await tx('pending_chunks')
.where({
doc_id: parseInt(projectId, 10),
id: parseInt(chunkId, 10),
})
.del()
}
/**
* Adds an active chunk
*
* @param {Knex} tx
* @param {string} projectId
* @param {Chunk} chunk
* @param {string} chunkId
*/
async function _insertChunk(tx, projectId, chunk, chunkId) {
const startVersion = chunk.getStartVersion()
const endVersion = chunk.getEndVersion()
try {
await tx('chunks').insert({
id: parseInt(chunkId, 10),
doc_id: parseInt(projectId, 10),
start_version: startVersion,
end_version: endVersion,
end_timestamp: chunk.getEndTimestamp(),
})
} catch (err) {
if (
err instanceof Error &&
'code' in err &&
err.code === DUPLICATE_KEY_ERROR_CODE
) {
throw new ChunkVersionConflictError(
'chunk start or end version is not unique',
{ projectId, chunkId, startVersion, endVersion }
)
}
throw err
}
}
/**
* Check that a chunk is not closed
*
* This is used to synchronize chunk creations and extensions.
*
* @param {Knex} tx
* @param {string} projectId
* @param {string} chunkId
*/
async function _assertChunkIsNotClosed(tx, projectId, chunkId) {
const record = await tx('chunks')
.forUpdate()
.select('closed')
.where('doc_id', parseInt(projectId, 10))
.where('id', parseInt(chunkId, 10))
.first()
if (!record) {
throw new ChunkVersionConflictError('unable to close chunk: not found', {
projectId,
chunkId,
})
}
if (record.closed) {
throw new ChunkVersionConflictError(
'unable to close chunk: already closed',
{
projectId,
chunkId,
}
)
}
}
/**
* Close a chunk
*
* A closed chunk can no longer be extended.
*
* @param {Knex} tx
* @param {string} projectId
* @param {string} chunkId
*/
async function _closeChunk(tx, projectId, chunkId) {
await tx('chunks')
.update({ closed: true })
.where('doc_id', parseInt(projectId, 10))
.where('id', parseInt(chunkId, 10))
}
/**
* Delete a chunk.
*
* @param {string} projectId
* @param {string} chunkId
*/
async function deleteChunk(projectId, chunkId) {
assert.postgresId(projectId, 'bad projectId')
assert.integer(chunkId, 'bad chunkId')
await _deleteChunks(knex, {
doc_id: parseInt(projectId, 10),
id: parseInt(chunkId, 10),
})
}
/**
* Delete all of a project's chunks
*
* @param {string} projectId
*/
async function deleteProjectChunks(projectId) {
assert.postgresId(projectId, 'bad projectId')
await knex.transaction(async tx => {
await _deleteChunks(knex, { doc_id: parseInt(projectId, 10) })
})
}
/**
* Delete many chunks
*
* @param {Knex} tx
* @param {any} whereClause
*/
async function _deleteChunks(tx, whereClause) {
const rows = await tx('chunks').where(whereClause).del().returning('*')
if (rows.length === 0) {
return
}
const oldChunks = rows.map(row => ({
doc_id: row.doc_id,
chunk_id: row.id,
start_version: row.start_version,
end_version: row.end_version,
end_timestamp: row.end_timestamp,
deleted_at: tx.fn.now(),
}))
await tx('old_chunks').insert(oldChunks)
}
/**
* Get a batch of old chunks for deletion
*
* @param {number} count
* @param {number} minAgeSecs
*/
async function getOldChunksBatch(count, minAgeSecs) {
const maxDeletedAt = new Date(Date.now() - minAgeSecs * 1000)
const records = await knex('old_chunks')
.whereNull('deleted_at')
.orWhere('deleted_at', '<', maxDeletedAt)
.orderBy('chunk_id')
.limit(count)
return records.map(oldChunk => ({
projectId: oldChunk.doc_id.toString(),
chunkId: oldChunk.chunk_id.toString(),
}))
}
/**
* Delete a batch of old chunks from the database
*
* @param {string[]} chunkIds
*/
async function deleteOldChunks(chunkIds) {
await knex('old_chunks')
.whereIn(
'chunk_id',
chunkIds.map(id => parseInt(id, 10))
)
.del()
}
/**
* Generate a new project id
*/
async function generateProjectId() {
const record = await knex.first(
knex.raw("nextval('docs_id_seq'::regclass)::integer as doc_id")
)
return record.doc_id.toString()
}
module.exports = {
getLatestChunk,
getFirstChunkBeforeTimestamp,
getLastActiveChunkBeforeTimestamp,
getChunkForVersion,
getChunkForTimestamp,
getProjectChunkIds,
getProjectChunks,
insertPendingChunk,
confirmCreate,
confirmUpdate,
deleteChunk,
deleteProjectChunks,
getOldChunksBatch,
deleteOldChunks,
generateProjectId,
}

View File

@@ -0,0 +1,254 @@
const metrics = require('@overleaf/metrics')
const logger = require('@overleaf/logger')
const redis = require('../redis')
const rclient = redis.rclientHistory
const { Snapshot, Change, History, Chunk } = require('overleaf-editor-core')
const TEMPORARY_CACHE_LIFETIME = 300 // 5 minutes
const keySchema = {
snapshot({ projectId }) {
return `snapshot:{${projectId}}`
},
startVersion({ projectId }) {
return `snapshot-version:{${projectId}}`
},
changes({ projectId }) {
return `changes:{${projectId}}`
},
}
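// For a project id of 'abc123' (hypothetical), the generated keys are:
//
//   snapshot:{abc123}
//   snapshot-version:{abc123}
//   changes:{abc123}
//
// The curly braces act as a Redis Cluster hash tag, so all three keys hash to
// the same slot and can be accessed together by the Lua scripts defined below.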
rclient.defineCommand('get_current_chunk', {
numberOfKeys: 3,
lua: `
local startVersionValue = redis.call('GET', KEYS[2])
if not startVersionValue then
return nil -- this is a cache-miss
end
local snapshotValue = redis.call('GET', KEYS[1])
local changesValues = redis.call('LRANGE', KEYS[3], 0, -1)
return {snapshotValue, startVersionValue, changesValues}
`,
})
/**
* Retrieves the current chunk of project history from Redis storage
* @param {string} projectId - The unique identifier of the project
* @returns {Promise<Chunk|null>} A Promise that resolves to a Chunk object containing project history,
* or null if retrieval fails
* @throws {Error} If Redis operations fail
*/
async function getCurrentChunk(projectId) {
try {
const result = await rclient.get_current_chunk(
keySchema.snapshot({ projectId }),
keySchema.startVersion({ projectId }),
keySchema.changes({ projectId })
)
if (!result) {
return null // cache-miss
}
const snapshot = Snapshot.fromRaw(JSON.parse(result[0]))
const startVersion = JSON.parse(result[1])
const changes = result[2].map(c => Change.fromRaw(JSON.parse(c)))
const history = new History(snapshot, changes)
const chunk = new Chunk(history, startVersion)
metrics.inc('chunk_store.redis.get_current_chunk', 1, { status: 'success' })
return chunk
} catch (err) {
logger.error({ err, projectId }, 'error getting current chunk from redis')
metrics.inc('chunk_store.redis.get_current_chunk', 1, { status: 'error' })
return null
}
}
rclient.defineCommand('get_current_chunk_metadata', {
numberOfKeys: 2,
lua: `
local startVersionValue = redis.call('GET', KEYS[1])
local changesCount = redis.call('LLEN', KEYS[2])
return {startVersionValue, changesCount}
`,
})
/**
* Retrieves the current chunk metadata for a given project from Redis
* @param {string} projectId - The ID of the project to get metadata for
* @returns {Promise<Object|null>} Object containing startVersion and changesCount if found, null on error or cache miss
 * @property {number} startVersion - The start version of the cached chunk
* @property {number} changesCount - The number of changes in the chunk
*/
async function getCurrentChunkMetadata(projectId) {
try {
const result = await rclient.get_current_chunk_metadata(
keySchema.startVersion({ projectId }),
keySchema.changes({ projectId })
)
if (!result) {
return null // cache-miss
}
const startVersion = JSON.parse(result[0])
const changesCount = parseInt(result[1], 10)
return { startVersion, changesCount }
} catch (err) {
return null
}
}
rclient.defineCommand('set_current_chunk', {
numberOfKeys: 3,
lua: `
local snapshotValue = ARGV[1]
local startVersionValue = ARGV[2]
redis.call('SETEX', KEYS[1], ${TEMPORARY_CACHE_LIFETIME}, snapshotValue)
redis.call('SETEX', KEYS[2], ${TEMPORARY_CACHE_LIFETIME}, startVersionValue)
redis.call('DEL', KEYS[3]) -- clear the old changes list
if #ARGV >= 3 then
redis.call('RPUSH', KEYS[3], unpack(ARGV, 3))
redis.call('EXPIRE', KEYS[3], ${TEMPORARY_CACHE_LIFETIME})
end
`,
})
/**
* Stores the current chunk of project history in Redis
* @param {string} projectId - The ID of the project
* @param {Chunk} chunk - The chunk object containing history data
* @returns {Promise<*>} Returns the result of the Redis operation, or null if an error occurs
* @throws {Error} May throw Redis-related errors which are caught internally
*/
async function setCurrentChunk(projectId, chunk) {
try {
const snapshotKey = keySchema.snapshot({ projectId })
const startVersionKey = keySchema.startVersion({ projectId })
const changesKey = keySchema.changes({ projectId })
const snapshot = chunk.history.snapshot
const startVersion = chunk.startVersion
const changes = chunk.history.changes
await rclient.set_current_chunk(
snapshotKey,
startVersionKey,
changesKey,
JSON.stringify(snapshot.toRaw()),
startVersion,
...changes.map(c => JSON.stringify(c.toRaw()))
)
metrics.inc('chunk_store.redis.set_current_chunk', 1, { status: 'success' })
} catch (err) {
logger.error(
{ err, projectId, chunk },
      'error setting current chunk in redis'
)
metrics.inc('chunk_store.redis.set_current_chunk', 1, { status: 'error' })
return null // while testing we will suppress any errors
}
}
/**
* Checks whether a cached chunk's version metadata matches the current chunk's metadata
* @param {Chunk} cachedChunk - The chunk retrieved from cache
* @param {Chunk} currentChunk - The current chunk to compare against
* @returns {boolean} - Returns true if the chunks have matching start and end versions, false otherwise
*/
function checkCacheValidity(cachedChunk, currentChunk) {
return Boolean(
cachedChunk &&
cachedChunk.getStartVersion() === currentChunk.getStartVersion() &&
cachedChunk.getEndVersion() === currentChunk.getEndVersion()
)
}
/**
* Validates if a cached chunk matches the current chunk metadata by comparing versions
* @param {Object} cachedChunk - The cached chunk object to validate
* @param {Object} currentChunkMetadata - The current chunk metadata to compare against
* @param {number} currentChunkMetadata.startVersion - The starting version number
* @param {number} currentChunkMetadata.endVersion - The ending version number
* @returns {boolean} - True if the cached chunk is valid, false otherwise
*/
function checkCacheValidityWithMetadata(cachedChunk, currentChunkMetadata) {
return Boolean(
cachedChunk &&
cachedChunk.getStartVersion() === currentChunkMetadata.startVersion &&
cachedChunk.getEndVersion() === currentChunkMetadata.endVersion
)
}
/**
* Compares two chunks for equality using stringified JSON comparison
* @param {string} projectId - The ID of the project
* @param {Chunk} cachedChunk - The cached chunk to compare
* @param {Chunk} currentChunk - The current chunk to compare against
* @returns {boolean} - Returns false if either chunk is null/undefined, otherwise returns the comparison result
*/
function compareChunks(projectId, cachedChunk, currentChunk) {
if (!cachedChunk || !currentChunk) {
return false
}
const identical = JSON.stringify(cachedChunk) === JSON.stringify(currentChunk)
if (!identical) {
try {
logger.error(
{
projectId,
cachedChunkStartVersion: cachedChunk.getStartVersion(),
cachedChunkEndVersion: cachedChunk.getEndVersion(),
currentChunkStartVersion: currentChunk.getStartVersion(),
currentChunkEndVersion: currentChunk.getEndVersion(),
},
'chunk cache mismatch'
)
} catch (err) {
// ignore errors while logging
}
}
metrics.inc('chunk_store.redis.compare_chunks', 1, {
status: identical ? 'success' : 'fail',
})
return identical
}
// Define Lua script for atomic cache clearing
rclient.defineCommand('clear_chunk_cache', {
numberOfKeys: 3,
lua: `
-- Delete all keys related to a project's chunk cache atomically
redis.call('DEL', KEYS[1]) -- snapshot key
redis.call('DEL', KEYS[2]) -- startVersion key
redis.call('DEL', KEYS[3]) -- changes key
return 1
`,
})
/**
* Clears all cache entries for a project's chunk data
* @param {string} projectId - The ID of the project whose cache should be cleared
* @returns {Promise<boolean>} A promise that resolves to true if successful, false on error
*/
async function clearCache(projectId) {
try {
const snapshotKey = keySchema.snapshot({ projectId })
const startVersionKey = keySchema.startVersion({ projectId })
const changesKey = keySchema.changes({ projectId })
await rclient.clear_chunk_cache(snapshotKey, startVersionKey, changesKey)
metrics.inc('chunk_store.redis.clear_cache', 1, { status: 'success' })
return true
} catch (err) {
logger.error({ err, projectId }, 'error clearing chunk cache from redis')
metrics.inc('chunk_store.redis.clear_cache', 1, { status: 'error' })
return false
}
}
module.exports = {
getCurrentChunk,
setCurrentChunk,
getCurrentChunkMetadata,
checkCacheValidity,
checkCacheValidityWithMetadata,
compareChunks,
clearCache,
}

View File

@@ -0,0 +1,18 @@
// @ts-check
const { createHash } = require('node:crypto')
/**
* Compute a SHA-1 hash of the content
*
* This is used to validate incoming updates.
*
* @param {string} content
*/
function getContentHash(content) {
const hash = createHash('sha-1')
hash.update(content)
return hash.digest('hex')
}
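// For example, getContentHash('hello world') should return
// '2aae6c35c94fcfb415dbe95f408b9ce91ee846ed', the SHA-1 of that string in hex.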
module.exports = { getContentHash }

View File

@@ -0,0 +1,5 @@
const OError = require('@overleaf/o-error')
class InvalidChangeError extends OError {}
module.exports = { InvalidChangeError }

View File

@@ -0,0 +1,30 @@
const Blob = require('overleaf-editor-core').Blob
const blobHash = require('./blob_hash')
const BPromise = require('bluebird')
// We want to simulate applying all of the operations so we can return the
// resulting hashes to the caller for them to check. To do this, we need to be
// able to take the lazy files in the final snapshot, fetch their content, and
// compute the new content hashes. We don't, however, need to actually store
// that content; we just need to get the hash.
function HashCheckBlobStore(realBlobStore) {
this.realBlobStore = realBlobStore
}
HashCheckBlobStore.prototype.getString = BPromise.method(
function hashCheckBlobStoreGetString(hash) {
return this.realBlobStore.getString(hash)
}
)
HashCheckBlobStore.prototype.putString = BPromise.method(
function hashCheckBlobStorePutString(string) {
return new Blob(
blobHash.fromString(string),
Buffer.byteLength(string),
string.length
)
}
)
module.exports = HashCheckBlobStore
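// A minimal usage sketch (hypothetical):
//
//   const hashCheckStore = new HashCheckBlobStore(new BlobStore(projectId))
//   const blob = await hashCheckStore.putString('some content')
//   blob.getHash() // hash the content would have, without writing it to storage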

View File

@@ -0,0 +1,202 @@
// @ts-check
'use strict'
const core = require('overleaf-editor-core')
const config = require('config')
const path = require('node:path')
const Stream = require('node:stream')
const { promisify } = require('node:util')
const zlib = require('node:zlib')
const OError = require('@overleaf/o-error')
const objectPersistor = require('@overleaf/object-persistor')
const logger = require('@overleaf/logger')
const assert = require('./assert')
const persistor = require('./persistor')
const projectKey = require('./project_key')
const streams = require('./streams')
const Chunk = core.Chunk
const gzip = promisify(zlib.gzip)
const gunzip = promisify(zlib.gunzip)
class LoadError extends OError {
/**
* @param {string} projectId
* @param {string} chunkId
* @param {any} cause
*/
constructor(projectId, chunkId, cause) {
super(
'HistoryStore: failed to load chunk history',
{ projectId, chunkId },
cause
)
this.projectId = projectId
this.chunkId = chunkId
}
}
class StoreError extends OError {
/**
* @param {string} projectId
* @param {string} chunkId
* @param {any} cause
*/
constructor(projectId, chunkId, cause) {
super(
'HistoryStore: failed to store chunk history',
{ projectId, chunkId },
cause
)
this.projectId = projectId
this.chunkId = chunkId
}
}
/**
* @param {string} projectId
* @param {string} chunkId
* @return {string}
*/
function getKey(projectId, chunkId) {
return path.join(projectKey.format(projectId), projectKey.pad(chunkId))
}
/**
 * Store and retrieve raw {@link History} objects from the bucket. Mainly used via the
* {@link ChunkStore}.
*
* Histories are stored as gzipped JSON blobs, keyed on the project ID and the
* ID of the Chunk that owns the history. The project ID is currently redundant,
* but I think it might help in future if we have to shard on project ID, and
* it gives us some chance of reconstructing histories even if there is a
* problem with the chunk metadata in the database.
*
* @class
*/
class HistoryStore {
#persistor
#bucket
constructor(persistor, bucket) {
this.#persistor = persistor
this.#bucket = bucket
}
/**
* Load the raw object for a History.
*
* @param {string} projectId
* @param {string} chunkId
* @return {Promise<import('overleaf-editor-core/lib/types').RawHistory>}
*/
async loadRaw(projectId, chunkId) {
assert.projectId(projectId, 'bad projectId')
assert.chunkId(chunkId, 'bad chunkId')
const key = getKey(projectId, chunkId)
logger.debug({ projectId, chunkId }, 'loadRaw started')
try {
const buf = await streams.gunzipStreamToBuffer(
await this.#persistor.getObjectStream(this.#bucket, key)
)
return JSON.parse(buf.toString('utf-8'))
} catch (err) {
if (err instanceof objectPersistor.Errors.NotFoundError) {
throw new Chunk.NotPersistedError(projectId)
}
throw new LoadError(projectId, chunkId, err)
} finally {
logger.debug({ projectId, chunkId }, 'loadRaw finished')
}
}
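  /**
   * Load the raw object for a History together with the gzipped buffer that was
   * read from object storage.
   *
   * @param {string} projectId
   * @param {string} chunkId
   * @return {Promise<{raw: import('overleaf-editor-core/lib/types').RawHistory, buffer: Buffer}>}
   */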
async loadRawWithBuffer(projectId, chunkId) {
assert.projectId(projectId, 'bad projectId')
assert.chunkId(chunkId, 'bad chunkId')
const key = getKey(projectId, chunkId)
logger.debug({ projectId, chunkId }, 'loadBuffer started')
try {
const buf = await streams.readStreamToBuffer(
await this.#persistor.getObjectStream(this.#bucket, key)
)
const unzipped = await gunzip(buf)
return {
buffer: buf,
raw: JSON.parse(unzipped.toString('utf-8')),
}
} catch (err) {
if (err instanceof objectPersistor.Errors.NotFoundError) {
throw new Chunk.NotPersistedError(projectId)
}
throw new LoadError(projectId, chunkId, err)
} finally {
logger.debug({ projectId, chunkId }, 'loadBuffer finished')
}
}
/**
* Compress and store a {@link History}.
*
* @param {string} projectId
* @param {string} chunkId
* @param {import('overleaf-editor-core/lib/types').RawHistory} rawHistory
*/
async storeRaw(projectId, chunkId, rawHistory) {
assert.projectId(projectId, 'bad projectId')
assert.chunkId(chunkId, 'bad chunkId')
assert.object(rawHistory, 'bad rawHistory')
const key = getKey(projectId, chunkId)
logger.debug({ projectId, chunkId }, 'storeRaw started')
const buf = await gzip(JSON.stringify(rawHistory))
try {
await this.#persistor.sendStream(
this.#bucket,
key,
Stream.Readable.from([buf]),
{
contentType: 'application/json',
contentEncoding: 'gzip',
contentLength: buf.byteLength,
}
)
} catch (err) {
throw new StoreError(projectId, chunkId, err)
} finally {
logger.debug({ projectId, chunkId }, 'storeRaw finished')
}
}
/**
 * Delete multiple chunks from the bucket. Expects an array of objects with
* projectId and chunkId properties
* @param {Array<{projectId: string,chunkId:string}>} chunks
*/
async deleteChunks(chunks) {
logger.debug({ chunks }, 'deleteChunks started')
try {
await Promise.all(
chunks.map(chunk => {
const key = getKey(chunk.projectId, chunk.chunkId)
return this.#persistor.deleteObject(this.#bucket, key)
})
)
} finally {
logger.debug({ chunks }, 'deleteChunks finished')
}
}
}
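// A minimal usage sketch (hypothetical), mirroring how the chunk store uses
// this class; History comes from overleaf-editor-core:
//
//   await historyStore.storeRaw(projectId, chunkId, history.toRaw())
//   const raw = await historyStore.loadRaw(projectId, chunkId)
//   const restored = History.fromRaw(raw)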
module.exports = {
HistoryStore,
historyStore: new HistoryStore(persistor, config.get('chunkStore.bucket')),
}

View File

@@ -0,0 +1,8 @@
// @ts-check
'use strict'
const env = process.env.NODE_ENV || 'development'
const knexfile = require('../../knexfile')
module.exports = require('knex').default(knexfile[env])

View File

@@ -0,0 +1,19 @@
'use strict'
const config = require('config')
const knexfile = require('../../knexfile')
const env = process.env.NODE_ENV || 'development'
if (config.databaseUrlReadOnly) {
module.exports = require('knex')({
...knexfile[env],
pool: {
...knexfile[env].pool,
min: 0,
},
connection: config.databaseUrlReadOnly,
})
} else {
module.exports = require('./knex')
}

View File

@@ -0,0 +1,30 @@
const Metrics = require('@overleaf/metrics')
const config = require('config')
const { MongoClient } = require('mongodb')
const client = new MongoClient(config.mongo.uri)
const db = client.db()
const chunks = db.collection('projectHistoryChunks')
const blobs = db.collection('projectHistoryBlobs')
const globalBlobs = db.collection('projectHistoryGlobalBlobs')
const shardedBlobs = db.collection('projectHistoryShardedBlobs')
const projects = db.collection('projects')
// Temporary collection for tracking progress of backed up old blobs (without a hash).
// The initial sync process will be able to skip over these.
// Schema: _id: projectId, blobs: [Binary]
const backedUpBlobs = db.collection('projectHistoryBackedUpBlobs')
Metrics.mongodb.monitor(client)
module.exports = {
client,
db,
chunks,
blobs,
globalBlobs,
projects,
shardedBlobs,
backedUpBlobs,
}

View File

@@ -0,0 +1,261 @@
// @ts-check
'use strict'
const _ = require('lodash')
const logger = require('@overleaf/logger')
const core = require('overleaf-editor-core')
const Chunk = core.Chunk
const History = core.History
const assert = require('./assert')
const chunkStore = require('./chunk_store')
const { BlobStore } = require('./blob_store')
const { InvalidChangeError } = require('./errors')
const { getContentHash } = require('./content_hash')
function countChangeBytes(change) {
// Note: This is not quite accurate, because the raw change may contain raw
// file info (or conceivably even content) that will not be included in the
// actual stored object.
return Buffer.byteLength(JSON.stringify(change.toRaw()))
}
function totalChangeBytes(changes) {
return changes.length ? _(changes).map(countChangeBytes).sum() : 0
}
// provide a simple timer function
function Timer() {
this.t0 = process.hrtime()
}
Timer.prototype.elapsed = function () {
const dt = process.hrtime(this.t0)
const timeInMilliseconds = (dt[0] + dt[1] * 1e-9) * 1e3
return timeInMilliseconds
}
/**
* Break the given set of changes into zero or more Chunks according to the
* provided limits and store them.
*
* Some other possible improvements:
* 1. This does a lot more JSON serialization than it has to. We may know the
* JSON for the changes before we call this function, so we could in that
* case get the byte size of each change without doing any work. Even if we
* don't know it initially, we could save some computation by caching this
* info rather than recomputing it many times. TBD whether it is worthwhile.
* 2. We don't necessarily have to fetch the latest chunk in order to determine
* that it is full. We could store this in the chunk metadata record. It may
* be worth distinguishing between a Chunk and its metadata record. The
* endVersion may be better suited to the metadata record.
*
* @param {string} projectId
* @param {core.Change[]} allChanges
* @param {Object} limits
* @param {number} clientEndVersion
* @return {Promise<{numberOfChangesPersisted: number, originalEndVersion: number, currentChunk: core.Chunk} | null>}
*/
async function persistChanges(projectId, allChanges, limits, clientEndVersion) {
assert.projectId(projectId)
assert.array(allChanges)
assert.maybe.object(limits)
assert.integer(clientEndVersion)
const blobStore = new BlobStore(projectId)
const earliestChangeTimestamp =
allChanges.length > 0 ? allChanges[0].getTimestamp() : null
let currentChunk
/**
* currentSnapshot tracks the state of the project as each change is applied;
* we use it to check that the changes we are persisting are valid.
*
* @type {core.Snapshot}
*/
let currentSnapshot
let originalEndVersion
let changesToPersist
limits = limits || {}
_.defaults(limits, {
changeBucketMinutes: 60,
maxChanges: 2500,
maxChangeBytes: 5 * 1024 * 1024,
maxChunkChanges: 2000,
maxChunkChangeBytes: 5 * 1024 * 1024,
maxChunkChangeTime: 5000, // warn if filling a chunk takes longer than this (in milliseconds)
})
function checkElapsedTime(timer) {
const timeTaken = timer.elapsed()
if (timeTaken > limits.maxChunkChangeTime) {
console.log('warning: slow chunk', projectId, timeTaken)
}
}
/**
* Add changes to a chunk until the chunk is full
*
* The chunk is full if it reaches a certain number of changes or a certain
* size in bytes
*
* @param {core.Chunk} chunk
* @param {core.Change[]} changes
*/
async function fillChunk(chunk, changes) {
let totalBytes = totalChangeBytes(chunk.getChanges())
let changesPushed = false
while (changes.length > 0) {
if (chunk.getChanges().length >= limits.maxChunkChanges) {
break
}
const change = changes[0]
const changeBytes = countChangeBytes(change)
if (totalBytes + changeBytes > limits.maxChunkChangeBytes) {
break
}
for (const operation of change.iterativelyApplyTo(currentSnapshot, {
strict: true,
})) {
await validateContentHash(operation)
}
chunk.pushChanges([change])
changes.shift()
totalBytes += changeBytes
changesPushed = true
}
return changesPushed
}
/**
* Check that the operation is valid and can be incorporated into the history.
*
* For now, this checks content hashes when they are provided.
*
* @param {core.Operation} operation
*/
async function validateContentHash(operation) {
if (operation instanceof core.EditFileOperation) {
const editOperation = operation.getOperation()
if (
editOperation instanceof core.TextOperation &&
editOperation.contentHash != null
) {
const path = operation.getPathname()
const file = currentSnapshot.getFile(path)
if (file == null) {
throw new InvalidChangeError('file not found for hash validation', {
projectId,
path,
})
}
await file.load('eager', blobStore)
const content = file.getContent({ filterTrackedDeletes: true })
const expectedHash = editOperation.contentHash
const actualHash = content != null ? getContentHash(content) : null
logger.debug({ expectedHash, actualHash }, 'validating content hash')
if (actualHash !== expectedHash) {
throw new InvalidChangeError('content hash mismatch', {
projectId,
path,
expectedHash,
actualHash,
})
}
// Remove the content hash from the change before storing it in the chunk.
// It was only useful for validation.
editOperation.contentHash = null
}
}
}
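/**
 * Load the latest chunk and extend it with as many of the queued changes as
 * fit, verifying that its end version matches the version reported by the
 * client before updating it in place.
 */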
async function extendLastChunkIfPossible() {
const latestChunk = await chunkStore.loadLatest(projectId)
currentChunk = latestChunk
originalEndVersion = latestChunk.getEndVersion()
if (originalEndVersion !== clientEndVersion) {
throw new Chunk.ConflictingEndVersion(
clientEndVersion,
originalEndVersion
)
}
currentSnapshot = latestChunk.getSnapshot().clone()
const timer = new Timer()
currentSnapshot.applyAll(latestChunk.getChanges())
const changesPushed = await fillChunk(currentChunk, changesToPersist)
if (!changesPushed) {
return
}
checkElapsedTime(timer)
await chunkStore.update(
projectId,
originalEndVersion,
currentChunk,
earliestChangeTimestamp
)
}
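/**
 * Create and fill new chunks until every remaining queued change has been
 * persisted.
 */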
async function createNewChunksAsNeeded() {
while (changesToPersist.length > 0) {
const endVersion = currentChunk.getEndVersion()
const history = new History(currentSnapshot.clone(), [])
const chunk = new Chunk(history, endVersion)
const timer = new Timer()
const changesPushed = await fillChunk(chunk, changesToPersist)
if (changesPushed) {
checkElapsedTime(timer)
currentChunk = chunk
await chunkStore.create(projectId, chunk, earliestChangeTimestamp)
} else {
throw new Error('failed to fill empty chunk')
}
}
}
function isOlderThanMinChangeTimestamp(change) {
return change.getTimestamp().getTime() < limits.minChangeTimestamp
}
function isOlderThanMaxChangeTimestamp(change) {
return change.getTimestamp().getTime() < limits.maxChangeTimestamp
}
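// Only changes older than minChangeTimestamp are candidates for persistence.
// They are flushed when any of them is older than maxChangeTimestamp, or when
// the batch exceeds the configured change-count or byte limits.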
const oldChanges = _.filter(allChanges, isOlderThanMinChangeTimestamp)
const anyTooOld = _.some(oldChanges, isOlderThanMaxChangeTimestamp)
const tooManyChanges = oldChanges.length > limits.maxChanges
const tooManyBytes = totalChangeBytes(oldChanges) > limits.maxChangeBytes
if (anyTooOld || tooManyChanges || tooManyBytes) {
changesToPersist = oldChanges
const numberOfChangesToPersist = oldChanges.length
await extendLastChunkIfPossible()
await createNewChunksAsNeeded()
return {
numberOfChangesPersisted: numberOfChangesToPersist,
originalEndVersion,
currentChunk,
}
} else {
return null
}
}
module.exports = persistChanges

View File

@@ -0,0 +1,27 @@
const _ = require('lodash')
const config = require('config')
const metrics = require('@overleaf/metrics')
const objectPersistor = require('@overleaf/object-persistor')
const persistorConfig = _.cloneDeep(config.get('persistor'))
function convertKey(key, convertFn) {
if (_.has(persistorConfig, key)) {
_.update(persistorConfig, key, convertFn)
}
}
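// Persistor settings loaded from config may be strings (e.g. when supplied via
// environment variables); coerce the numeric and boolean options before handing
// the config to the object persistor.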
convertKey('s3.signedUrlExpiryInMs', s => parseInt(s, 10))
convertKey('s3.httpOptions.timeout', s => parseInt(s, 10))
convertKey('s3.maxRetries', s => parseInt(s, 10))
convertKey('s3.pathStyle', s => s === 'true')
convertKey('gcs.unlockBeforeDelete', s => s === 'true')
convertKey('gcs.unsignedUrls', s => s === 'true')
convertKey('gcs.signedUrlExpiryInMs', s => parseInt(s, 10))
convertKey('gcs.deleteConcurrency', s => parseInt(s, 10))
convertKey('gcs.retryOptions.maxRetries', s => parseInt(s, 10))
convertKey('fallback.buckets', s => JSON.parse(s || '{}'))
persistorConfig.Metrics = metrics
module.exports = objectPersistor(persistorConfig)

View File

@@ -0,0 +1,140 @@
// @ts-check
'use strict'
/**
* @import { Snapshot } from 'overleaf-editor-core'
* @import { BlobStore } from '../../storage/lib/blob_store/index'
*/
const Archive = require('archiver')
const BPromise = require('bluebird')
const fs = require('node:fs')
const { pipeline } = require('node:stream')
const core = require('overleaf-editor-core')
const Snapshot = core.Snapshot
const OError = require('@overleaf/o-error')
const assert = require('./assert')
// The maximum safe concurrency appears to be 1.
// https://github.com/overleaf/issues/issues/1909
const FETCH_CONCURRENCY = 1 // number of files to fetch at once
const DEFAULT_ZIP_TIMEOUT = 25000 // ms
class DownloadError extends OError {
constructor(hash) {
super(`ProjectArchive: blob download failed: ${hash}`, { hash })
}
}
class ArchiveTimeout extends OError {
constructor() {
super('ProjectArchive timed out')
}
}
class MissingfileError extends OError {
constructor() {
super('ProjectArchive: attempting to look up a file that does not exist')
}
}
class ProjectArchive {
static ArchiveTimeout = ArchiveTimeout
static MissingfileError = MissingfileError
static DownloadError = DownloadError
/**
* @constructor
* @param {Snapshot} snapshot
* @param {number} [timeout] in ms
* @classdesc
* Writes the project snapshot to a zip file.
*/
constructor(snapshot, timeout) {
assert.instance(snapshot, Snapshot)
this.snapshot = snapshot
this.timeout = timeout || DEFAULT_ZIP_TIMEOUT
}
/**
* Write zip archive to the given file path.
*
* @param {BlobStore} blobStore
* @param {string} zipFilePath
*/
writeZip(blobStore, zipFilePath) {
const snapshot = this.snapshot
const timeout = this.timeout
const startTime = process.hrtime()
const archive = new Archive('zip')
// Convert elapsed seconds and nanoseconds to milliseconds.
function findElapsedMilliseconds() {
const elapsed = process.hrtime(startTime)
return elapsed[0] * 1e3 + elapsed[1] * 1e-6
}
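// Add a single file to the archive: append its in-memory content when
// available, otherwise stream the corresponding blob from the blob store.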
function addFileToArchive(pathname) {
if (findElapsedMilliseconds() > timeout) {
throw new ProjectArchive.ArchiveTimeout()
}
const file = snapshot.getFile(pathname)
if (!file) {
throw new ProjectArchive.MissingfileError()
}
return file.load('eager', blobStore).then(function () {
const content = file.getContent({ filterTrackedDeletes: true })
if (content === null) {
return streamFileToArchive(pathname, file).catch(function (err) {
throw new ProjectArchive.DownloadError(file.getHash()).withCause(
err
)
})
} else {
archive.append(content, { name: pathname })
}
})
}
function streamFileToArchive(pathname, file) {
return new BPromise(function (resolve, reject) {
blobStore
.getStream(file.getHash())
.then(stream => {
stream.on('error', reject)
stream.on('end', resolve)
archive.append(stream, { name: pathname })
})
.catch(reject)
})
}
const addFilesToArchiveAndFinalize = BPromise.map(
snapshot.getFilePathnames(),
addFileToArchive,
{ concurrency: FETCH_CONCURRENCY }
).then(function () {
archive.finalize()
})
const streamArchiveToFile = new BPromise(function (resolve, reject) {
const stream = fs.createWriteStream(zipFilePath)
pipeline(archive, stream, function (err) {
if (err) {
reject(err)
} else {
resolve()
}
})
})
return BPromise.join(streamArchiveToFile, addFilesToArchiveAndFinalize)
}
}
module.exports = ProjectArchive

View File

@@ -0,0 +1,24 @@
// Keep in sync with services/web/app/src/Features/History/project_key.js
const _ = require('lodash')
const path = require('node:path')
//
// The advice in http://docs.aws.amazon.com/AmazonS3/latest/dev/
// request-rate-perf-considerations.html is to avoid sequential key prefixes,
// so we reverse the project ID part of the key as they suggest.
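//
// For example (illustrative ids): a Postgres project id of '123' is padded to
// '000000123' and reversed to '321000000', producing the key '321/000/000'.
// A 24-character Mongo id is reversed and split into 3/3/18 character parts.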
//
function format(projectId) {
const prefix = naiveReverse(pad(projectId))
return path.join(prefix.slice(0, 3), prefix.slice(3, 6), prefix.slice(6))
}
function pad(number) {
return _.padStart(number, 9, '0')
}
function naiveReverse(string) {
return string.split('').reverse().join('')
}
exports.format = format
exports.pad = pad

View File

@@ -0,0 +1,19 @@
const config = require('config')
const redis = require('@overleaf/redis-wrapper')
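// Two clients are created: one for history data and one for locking. Both
// currently read the same 'redis.history' configuration.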
const historyRedisOptions = config.get('redis.history')
const rclientHistory = redis.createClient(historyRedisOptions)
const lockRedisOptions = config.get('redis.history')
const rclientLock = redis.createClient(lockRedisOptions)
async function disconnect() {
await Promise.all([rclientHistory.disconnect(), rclientLock.disconnect()])
}
module.exports = {
rclientHistory,
rclientLock,
redis,
disconnect,
}

View File

@@ -0,0 +1,40 @@
// @ts-check
/**
* Promises are promises and streams are streams, and ne'er the twain shall
* meet.
* @module
*/
'use strict'
const Stream = require('node:stream')
const zlib = require('node:zlib')
const { WritableBuffer } = require('@overleaf/stream-utils')
/**
* Create a promise for the result of reading a stream to a buffer.
*
* @param {Stream.Readable} readStream
* @return {Promise<Buffer>}
*/
async function readStreamToBuffer(readStream) {
const bufferStream = new WritableBuffer()
await Stream.promises.pipeline(readStream, bufferStream)
return bufferStream.contents()
}
exports.readStreamToBuffer = readStreamToBuffer
/**
* Create a promise for the result of un-gzipping a stream to a buffer.
*
* @param {NodeJS.ReadableStream} readStream
* @return {Promise<Buffer>}
*/
async function gunzipStreamToBuffer(readStream) {
const gunzip = zlib.createGunzip()
const bufferStream = new WritableBuffer()
await Stream.promises.pipeline(readStream, gunzip, bufferStream)
return bufferStream.contents()
}
exports.gunzipStreamToBuffer = gunzipStreamToBuffer

View File

@@ -0,0 +1,25 @@
/*
* Taken from renderer/app/helpers/temp.js with minor cosmetic changes.
* Promisify the temp package. The temp package provides a 'track' feature
* that automatically cleans up temp files at process exit, but that is not
* very useful. They also provide a method to trigger cleanup, but that is not
* safe for concurrent use. So, we use a disposer to unlink the file.
*/
const BPromise = require('bluebird')
const fs = BPromise.promisifyAll(require('node:fs'))
const temp = BPromise.promisifyAll(require('temp'))
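// Usage (see zip_store.js): BPromise.using(temp.open('zip'), async fileInfo => { ... })
// closes and unlinks the temp file once the disposer runs.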
exports.open = function (affixes) {
return temp.openAsync(affixes).disposer(function (fileInfo) {
fs.closeAsync(fileInfo.fd)
.then(() => {
return fs.unlinkAsync(fileInfo.path)
})
.catch(function (err) {
if (err.code !== 'ENOENT') {
throw err
}
})
})
}

View File

@@ -0,0 +1,134 @@
'use strict'
const BPromise = require('bluebird')
const config = require('config')
const fs = require('node:fs')
const path = require('node:path')
const OError = require('@overleaf/o-error')
const objectPersistor = require('@overleaf/object-persistor')
const assert = require('./assert')
const { BlobStore } = require('./blob_store')
const persistor = require('./persistor')
const ProjectArchive = require('./project_archive')
const projectKey = require('./project_key')
const temp = require('./temp')
const BUCKET = config.get('zipStore.bucket')
function getZipKey(projectId, version) {
return path.join(
projectKey.format(projectId),
version.toString(),
'project.zip'
)
}
/**
* Store a zip of a given version of a project in the zip bucket.
*
* @class
*/
class ZipStore {
/**
* Generate a signed link to access the zip file.
*
* @param {number | string} projectId
* @param {number} version
* @return {Promise<string>}
*/
async getSignedUrl(projectId, version) {
assert.projectId(projectId, 'bad projectId')
assert.integer(version, 'bad version')
const key = getZipKey(projectId, version)
return await persistor.getRedirectUrl(BUCKET, key)
}
/**
* Generate a zip of the given snapshot.
*
* @param {number | string} projectId
* @param {number} version
* @param {Snapshot} snapshot
*/
async storeZip(projectId, version, snapshot) {
assert.projectId(projectId, 'bad projectId')
assert.integer(version, 'bad version')
assert.object(snapshot, 'bad snapshot')
const zipKey = getZipKey(projectId, version)
if (await isZipPresent()) return
await BPromise.using(temp.open('zip'), async tempFileInfo => {
await zipSnapshot(tempFileInfo.path, snapshot)
await uploadZip(tempFileInfo.path)
})
// If the file is already there, we don't need to build the zip again. If we
// just HEAD the file, there's a race condition, because the zip files
// automatically expire. So, we try to copy the file from itself to itself,
// and if it fails, we know the file didn't exist. If it succeeds, this has
// the effect of re-extending its lifetime.
async function isZipPresent() {
try {
await persistor.copyObject(BUCKET, zipKey, zipKey)
return true
} catch (error) {
if (!(error instanceof objectPersistor.Errors.NotFoundError)) {
console.error(
'storeZip: isZipPresent: unexpected error (except in dev): %s',
error
)
}
return false
}
}
async function zipSnapshot(tempPathname, snapshot) {
const blobStore = new BlobStore(projectId)
const zipTimeoutMs = parseInt(config.get('zipStore.zipTimeoutMs'), 10)
const archive = new ProjectArchive(snapshot, zipTimeoutMs)
try {
await archive.writeZip(blobStore, tempPathname)
} catch (err) {
throw new ZipStore.CreationError(projectId, version).withCause(err)
}
}
async function uploadZip(tempPathname) {
const stream = fs.createReadStream(tempPathname)
try {
await persistor.sendStream(BUCKET, zipKey, stream, {
contentType: 'application/zip',
})
} catch (err) {
throw new ZipStore.UploadError(projectId, version).withCause(err)
}
}
}
}
class CreationError extends OError {
constructor(projectId, version) {
super(`Zip creation failed for ${projectId} version ${version}`, {
projectId,
version,
})
}
}
ZipStore.CreationError = CreationError
class UploadError extends OError {
constructor(projectId, version) {
super(`Zip upload failed for ${projectId} version ${version}`, {
projectId,
version,
})
}
}
ZipStore.UploadError = UploadError
module.exports = new ZipStore()

File diff suppressed because it is too large

View File

@@ -0,0 +1,647 @@
// @ts-check
import Events from 'node:events'
import fs from 'node:fs'
import Stream from 'node:stream'
import { ObjectId } from 'mongodb'
import logger from '@overleaf/logger'
import OError from '@overleaf/o-error'
import { Blob } from 'overleaf-editor-core'
import {
BlobStore,
getStringLengthOfFile,
GLOBAL_BLOBS,
makeBlobForFile,
} from '../lib/blob_store/index.js'
import { db } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import readline from 'node:readline'
import { _blobIsBackedUp, backupBlob } from '../lib/backupBlob.mjs'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import filestorePersistor from '../lib/persistor.js'
import { setTimeout } from 'node:timers/promises'
// Silence warning.
Events.setMaxListeners(20)
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
/**
* @typedef {import("mongodb").Collection} Collection
* @typedef {import("mongodb").Collection<Project>} ProjectsCollection
* @typedef {import("mongodb").Collection<{project: Project}>} DeletedProjectsCollection
*/
/**
* @typedef {Object} FileRef
* @property {ObjectId} _id
* @property {string} hash
*/
/**
* @typedef {Object} Folder
* @property {Array<Folder>} folders
* @property {Array<FileRef>} fileRefs
*/
/**
* @typedef {Object} Project
* @property {ObjectId} _id
* @property {Array<Folder>} rootFolder
* @property {{history: {id: (number|string)}}} overleaf
*/
/**
* @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, FIX_MISSING_HASH: boolean, LOGS: string}}
*/
function parseArgs() {
const args = commandLineArgs([
{ name: 'fixNotFound', type: String, defaultValue: 'true' },
{ name: 'fixDeletePermission', type: String, defaultValue: 'true' },
{ name: 'fixHashMismatch', type: String, defaultValue: 'true' },
{ name: 'fixMissingHash', type: String, defaultValue: 'true' },
{ name: 'logs', type: String, defaultValue: '' },
])
/**
* commandLineArgs cannot handle --foo=false, so go the long way
* @param {string} name
* @return {boolean}
*/
function boolVal(name) {
const v = args[name]
if (['true', 'false'].includes(v)) return v === 'true'
throw new Error(`expected "true" or "false" for boolean option ${name}`)
}
return {
FIX_NOT_FOUND: boolVal('fixNotFound'),
FIX_DELETE_PERMISSION: boolVal('fixDeletePermission'),
FIX_HASH_MISMATCH: boolVal('fixHashMismatch'),
FIX_MISSING_HASH: boolVal('fixMissingHash'),
LOGS: args.logs,
}
}
const {
FIX_HASH_MISMATCH,
FIX_DELETE_PERMISSION,
FIX_NOT_FOUND,
FIX_MISSING_HASH,
LOGS,
} = parseArgs()
if (!LOGS) {
throw new Error('--logs parameter missing')
}
const BUFFER_DIR = fs.mkdtempSync(
process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
)
const USER_FILES_BUCKET_NAME = process.env.USER_FILES_BUCKET_NAME || ''
if (!USER_FILES_BUCKET_NAME) {
throw new Error('env var USER_FILES_BUCKET_NAME is missing')
}
// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode
const STREAM_HIGH_WATER_MARK = parseInt(
process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(),
10
)
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
/** @type {ProjectsCollection} */
const projectsCollection = db.collection('projects')
/** @type {DeletedProjectsCollection} */
const deletedProjectsCollection = db.collection('deletedProjects')
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated, draining queue')
}
class FileDeletedError extends OError {}
/** @type {Map<string,{project: Project, projectSoftDeleted: boolean}>} */
const PROJECT_CACHE = new Map()
/**
* @param {string} projectId
* @return {Promise<{project: Project, projectSoftDeleted: boolean}>}
*/
async function getProject(projectId) {
const cached = PROJECT_CACHE.get(projectId)
if (cached) return cached
let projectSoftDeleted
let project = await projectsCollection.findOne({
_id: new ObjectId(projectId),
})
if (project) {
projectSoftDeleted = false
} else {
const softDeleted = await deletedProjectsCollection.findOne({
'deleterData.deletedProjectId': new ObjectId(projectId),
project: { $exists: true },
})
if (!softDeleted) {
throw new OError('project hard-deleted')
}
project = softDeleted.project
projectSoftDeleted = true
}
PROJECT_CACHE.set(projectId, { projectSoftDeleted, project })
return { projectSoftDeleted, project }
}
/**
* @param {Folder} folder
* @param {string} fileId
* @return {{path: string, fileRef: FileRef, folder: Folder}|null}
*/
function getFileTreePath(folder, fileId) {
if (!folder) return null
let idx = 0
if (Array.isArray(folder.fileRefs)) {
for (const fileRef of folder.fileRefs) {
if (fileRef?._id.toString() === fileId) {
return {
fileRef,
path: `.fileRefs.${idx}`,
folder,
}
}
idx++
}
}
idx = 0
if (Array.isArray(folder.folders)) {
for (const child of folder.folders) {
const match = getFileTreePath(child, fileId)
if (match) {
return {
fileRef: match.fileRef,
folder: match.folder,
path: `.folders.${idx}${match.path}`,
}
}
idx++
}
}
return null
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<{fileRef: FileRef, folder: Folder, fullPath: string, query: Object, projectSoftDeleted: boolean}>}
*/
async function findFile(projectId, fileId) {
const { projectSoftDeleted, project } = await getProject(projectId)
const match = getFileTreePath(project.rootFolder[0], fileId)
if (!match) {
throw new FileDeletedError('file not found in file-tree', {
projectSoftDeleted,
})
}
const { path, fileRef, folder } = match
let fullPath
let query
if (projectSoftDeleted) {
fullPath = `project.rootFolder.0${path}`
query = {
'deleterData.deletedProjectId': new ObjectId(projectId),
[`${fullPath}._id`]: new ObjectId(fileId),
}
} else {
fullPath = `rootFolder.0${path}`
query = {
_id: new ObjectId(projectId),
[`${fullPath}._id`]: new ObjectId(fileId),
}
}
return {
projectSoftDeleted,
query,
fullPath,
fileRef,
folder,
}
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixNotFound(line) {
const { projectId, fileId, bucketName } = JSON.parse(line)
if (bucketName !== USER_FILES_BUCKET_NAME) {
throw new OError('not found case for another bucket')
}
const { projectSoftDeleted, query, fullPath, fileRef, folder } =
await findFile(projectId, fileId)
logger.info({ projectId, fileId, fileRef }, 'removing fileRef')
// Copied from _removeElementFromMongoArray (https://github.com/overleaf/internal/blob/11e09528c153de6b7766d18c3c90d94962190371/services/web/app/src/Features/Project/ProjectEntityMongoUpdateHandler.js)
const nonArrayPath = fullPath.slice(0, fullPath.lastIndexOf('.'))
let result
if (projectSoftDeleted) {
result = await deletedProjectsCollection.updateOne(query, {
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
$inc: { 'project.version': 1 },
})
} else {
result = await projectsCollection.updateOne(query, {
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
$inc: { version: 1 },
})
}
if (result.matchedCount !== 1) {
throw new OError('file-tree write did not match', { result })
}
// Update the cache. The mongo-path of the next file will be off otherwise.
folder.fileRefs = folder.fileRefs.filter(f => !f._id.equals(fileId))
return true
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<void>}
*/
async function setHashInMongo(projectId, fileId, hash) {
const { projectSoftDeleted, query, fullPath, fileRef } = await findFile(
projectId,
fileId
)
if (fileRef.hash === hash) return
logger.info({ projectId, fileId, fileRef, hash }, 'setting fileRef hash')
let result
if (projectSoftDeleted) {
result = await deletedProjectsCollection.updateOne(query, {
$set: { [`${fullPath}.hash`]: hash },
$inc: { 'project.version': 1 },
})
} else {
result = await projectsCollection.updateOne(query, {
$set: { [`${fullPath}.hash`]: hash },
$inc: { version: 1 },
})
}
if (result.matchedCount !== 1) {
throw new OError('file-tree write did not match', { result })
}
fileRef.hash = hash // Update cache for completeness.
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} historyId
* @return {Promise<void>}
*/
async function importRestoredFilestoreFile(projectId, fileId, historyId) {
const filestoreKey = `${projectId}/${fileId}`
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
let s
try {
s = await filestorePersistor.getObjectStream(
USER_FILES_BUCKET_NAME,
filestoreKey
)
} catch (err) {
if (err instanceof NotFoundError) {
throw new OError('missing blob, need to restore filestore file', {
filestoreKey,
})
}
throw err
}
await Stream.promises.pipeline(
s,
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
)
const blobStore = new BlobStore(historyId)
const blob = await blobStore.putFile(path)
await backupBlob(historyId, blob, path)
await setHashInMongo(projectId, fileId, blob.getHash())
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} path
* @return {Promise<Blob>}
*/
async function bufferFilestoreFileToDisk(projectId, fileId, path) {
const filestoreKey = `${projectId}/${fileId}`
try {
await Stream.promises.pipeline(
await filestorePersistor.getObjectStream(
USER_FILES_BUCKET_NAME,
filestoreKey
),
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
)
const blob = await makeBlobForFile(path)
blob.setStringLength(
await getStringLengthOfFile(blob.getByteLength(), path)
)
return blob
} catch (err) {
if (err instanceof NotFoundError) {
throw new OError('missing blob, need to restore filestore file', {
filestoreKey,
})
}
throw err
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<string>}
*/
async function computeFilestoreFileHash(projectId, fileId) {
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
return blob.getHash()
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<void>}
*/
async function uploadFilestoreFile(projectId, fileId) {
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
const hash = blob.getHash()
try {
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
} catch (err) {
if (!(err instanceof Blob.NotFoundError)) throw err
const { project } = await getProject(projectId)
const historyId = project.overleaf.history.id.toString()
const blobStore = new BlobStore(historyId)
await blobStore.putBlob(path, blob)
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixHashMismatch(line) {
const {
projectId,
fileId,
hash: computedHash,
entry: {
hash: fileTreeHash,
ctx: { historyId },
},
} = JSON.parse(line)
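// The log line records a mismatch between the hash stored in the file tree and
// the hash computed from the filestore object. If no blob exists for the
// computed hash, re-import the filestore file; otherwise make sure the blob is
// backed up and the file tree points at the computed hash.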
const blobStore = new BlobStore(historyId)
if (await blobStore.getBlob(fileTreeHash)) {
throw new OError('found existing blob for the file-tree hash; unexpected for a hash mismatch')
}
if (!(await blobStore.getBlob(computedHash))) {
await importRestoredFilestoreFile(projectId, fileId, historyId)
return true
}
return await ensureBlobExistsForFileAndUploadToAWS(
projectId,
fileId,
computedHash
)
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function hashAlreadyUpdatedInFileTree(projectId, fileId, hash) {
const { fileRef } = await findFile(projectId, fileId)
return fileRef.hash === hash
}
/**
* @param {string} projectId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function needsBackingUpToAWS(projectId, hash) {
if (GLOBAL_BLOBS.has(hash)) return false
return !(await _blobIsBackedUp(projectId, hash))
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) {
const { project } = await getProject(projectId)
const historyId = project.overleaf.history.id.toString()
const blobStore = new BlobStore(historyId)
if (
(await hashAlreadyUpdatedInFileTree(projectId, fileId, hash)) &&
(await blobStore.getBlob(hash)) &&
!(await needsBackingUpToAWS(projectId, hash))
) {
return false // already processed
}
const stream = await blobStore.getStream(hash)
const path = `${BUFFER_DIR}/${historyId}_${hash}`
try {
await Stream.promises.pipeline(
stream,
fs.createWriteStream(path, {
highWaterMark: STREAM_HIGH_WATER_MARK,
})
)
const writtenBlob = await makeBlobForFile(path)
writtenBlob.setStringLength(
await getStringLengthOfFile(writtenBlob.getByteLength(), path)
)
if (writtenBlob.getHash() !== hash) {
// Double check download, better safe than sorry.
throw new OError('blob corrupted', { writtenBlob })
}
let blob = await blobStore.getBlob(hash)
if (!blob) {
// Calling blobStore.putBlob would result in the same error again.
// HACK: Skip upload to GCS and finalize putBlob operation directly.
await blobStore.backend.insertBlob(historyId, writtenBlob)
}
await backupBlob(historyId, writtenBlob, path)
} finally {
await fs.promises.rm(path, { force: true })
}
await setHashInMongo(projectId, fileId, hash)
return true
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixDeletePermission(line) {
let { projectId, fileId, hash } = JSON.parse(line)
if (!hash) hash = await computeFilestoreFileHash(projectId, fileId)
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixMissingHash(line) {
let { projectId, _id: fileId } = JSON.parse(line)
const {
fileRef: { hash },
} = await findFile(projectId, fileId)
if (hash) {
// processed, double check
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
await uploadFilestoreFile(projectId, fileId)
return true
}
const CASES = {
'not found': {
match: 'NotFoundError',
flag: FIX_NOT_FOUND,
action: fixNotFound,
},
'hash mismatch': {
match: 'OError: hash mismatch',
flag: FIX_HASH_MISMATCH,
action: fixHashMismatch,
},
'delete permission': {
match: 'storage.objects.delete',
flag: FIX_DELETE_PERMISSION,
action: fixDeletePermission,
},
'missing file hash': {
match: '"bad file hash"',
flag: FIX_MISSING_HASH,
action: fixMissingHash,
},
}
const STATS = {
processedLines: 0,
success: 0,
alreadyProcessed: 0,
fileDeleted: 0,
skipped: 0,
failed: 0,
unmatched: 0,
}
function logStats() {
console.log(
JSON.stringify({
time: new Date(),
gracefulShutdownInitiated,
...STATS,
})
)
}
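// Print progress stats every 10 seconds while the log is processed.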
setInterval(logStats, 10_000)
async function processLog() {
const rl = readline.createInterface({
input: fs.createReadStream(LOGS),
})
nextLine: for await (const line of rl) {
if (gracefulShutdownInitiated) break
STATS.processedLines++
if (
!(
line.includes('"failed to process file"') ||
// Process missing hashes as flagged by find_malformed_filetrees.mjs
line.includes('"bad file-tree path"')
)
) {
continue
}
for (const [name, { match, flag, action }] of Object.entries(CASES)) {
if (!line.includes(match)) continue
if (flag) {
try {
if (await action(line)) {
STATS.success++
} else {
STATS.alreadyProcessed++
}
} catch (err) {
if (err instanceof FileDeletedError) {
STATS.fileDeleted++
logger.info({ err, line }, 'file deleted, skipping')
} else {
STATS.failed++
logger.error({ err, line }, `failed to fix ${name}`)
}
}
} else {
STATS.skipped++
}
continue nextLine
}
STATS.unmatched++
logger.warn({ line }, 'unknown fatal error')
}
}
async function main() {
try {
await processLog()
} finally {
logStats()
try {
await fs.promises.rm(BUFFER_DIR, { recursive: true, force: true })
} catch (err) {
console.error(`Cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
}
}
const { skipped, failed, unmatched } = STATS
await setTimeout(SLEEP_BEFORE_EXIT)
if (failed > 0) {
process.exit(Math.min(failed, 99))
} else if (unmatched > 0) {
process.exit(100)
} else if (skipped > 0) {
process.exit(101)
} else {
process.exit(0)
}
}
await main()

File diff suppressed because it is too large

View File

@@ -0,0 +1,173 @@
// @ts-check
import commandLineArgs from 'command-line-args'
import { backupBlob, downloadBlobToDir } from '../lib/backupBlob.mjs'
import withTmpDir from '../../api/controllers/with_tmp_dir.js'
import {
BlobStore,
GLOBAL_BLOBS,
loadGlobalBlobs,
} from '../lib/blob_store/index.js'
import assert from '../lib/assert.js'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import { setTimeout } from 'node:timers/promises'
import fs from 'node:fs'
await loadGlobalBlobs()
/**
* Gracefully shutdown the process
* @return {Promise<void>}
*/
async function gracefulShutdown() {
console.log('Gracefully shutting down')
await knex.destroy()
await client.close()
await redis.disconnect()
await setTimeout(100)
process.exit()
}
/**
*
* @param {string} row
* @return {BackupBlobJob}
*/
function parseCSVRow(row) {
const [historyId, hash] = row.split(',')
validateBackedUpBlobJob({ historyId, hash })
return { historyId, hash }
}
/**
*
* @param {BackupBlobJob} job
*/
function validateBackedUpBlobJob(job) {
assert.projectId(job.historyId)
assert.blobHash(job.hash)
}
/**
*
* @param {string} path
* @return {Promise<Array<BackupBlobJob>>}
*/
async function readCSV(path) {
let fh
/** @type {Array<BackupBlobJob>} */
const rows = []
try {
fh = await fs.promises.open(path, 'r')
} catch (error) {
console.error(`Could not open file: ${error}`)
throw error
}
for await (const line of fh.readLines()) {
try {
const row = parseCSVRow(line)
if (GLOBAL_BLOBS.has(row.hash)) {
console.log(`Skipping global blob: ${line}`)
continue
}
rows.push(row)
} catch (error) {
console.error(error instanceof Error ? error.message : error)
console.log(`Skipping invalid row: ${line}`)
}
}
return rows
}
/**
* @typedef {Object} BackupBlobJob
* @property {string} hash
* @property {string} historyId
*/
/**
* @param {Object} options
* @param {string} [options.historyId]
* @param {string} [options.hash]
* @param {string} [options.input]
* @return {Promise<Array<BackupBlobJob>>}
*/
async function initialiseJobs({ historyId, hash, input }) {
if (input) {
return await readCSV(input)
}
if (!historyId) {
console.error('historyId is required')
process.exitCode = 1
await gracefulShutdown()
}
if (!hash) {
console.error('hash is required')
process.exitCode = 1
await gracefulShutdown()
}
validateBackedUpBlobJob({ historyId, hash })
if (GLOBAL_BLOBS.has(hash)) {
console.error(`Blob ${hash} is a global blob; not backing up`)
process.exitCode = 1
await gracefulShutdown()
}
return [{ hash, historyId }]
}
/**
*
* @param {string} historyId
* @param {string} hash
* @return {Promise<void>}
*/
export async function downloadAndBackupBlob(historyId, hash) {
const blobStore = new BlobStore(historyId)
const blob = await blobStore.getBlob(hash)
if (!blob) {
throw new Error(`Blob ${hash} could not be loaded`)
}
await withTmpDir(`blob-${hash}`, async tmpDir => {
const filePath = await downloadBlobToDir(historyId, blob, tmpDir)
console.log(`Downloaded blob ${hash} to ${filePath}`)
await backupBlob(historyId, blob, filePath)
console.log('Backed up blob')
})
}
let jobs
const options = commandLineArgs([
{ name: 'historyId', type: String },
{ name: 'hash', type: String },
{ name: 'input', type: String },
])
try {
jobs = await initialiseJobs(options)
} catch (error) {
console.error(error)
await gracefulShutdown()
}
if (!Array.isArray(jobs)) {
// This is mostly to satisfy TypeScript
process.exitCode = 1
await gracefulShutdown()
process.exit(1)
}
for (const { historyId, hash } of jobs) {
try {
await downloadAndBackupBlob(historyId, hash)
} catch (error) {
console.error(error)
process.exitCode = 1
}
}
await gracefulShutdown()

View File

@@ -0,0 +1,153 @@
// @ts-check
import { ObjectId } from 'mongodb'
import { READ_PREFERENCE_SECONDARY } from '@overleaf/mongo-utils/batchedUpdate.js'
import { db, client } from '../lib/mongodb.js'
const projectsCollection = db.collection('projects')
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
// Configuration
const SAMPLE_SIZE_PER_ITERATION = process.argv[2]
? parseInt(process.argv[2], 10)
: 10000
const TARGET_ERROR_PERCENTAGE = process.argv[3]
? parseFloat(process.argv[3])
: 5.0
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated')
}
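// Take a random sample of projects and count how many have a
// lastBackedUpVersion recorded.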
async function takeSample(sampleSize) {
const results = await projectsCollection
.aggregate(
[
{ $sample: { size: sampleSize } },
{
$match: { 'overleaf.backup.lastBackedUpVersion': { $exists: true } },
},
{
$count: 'total',
},
],
{ readPreference: READ_PREFERENCE_SECONDARY }
)
.toArray()
const count = results[0]?.total || 0
return { totalSampled: sampleSize, backedUp: count }
}
function calculateStatistics(
cumulativeSampled,
cumulativeBackedUp,
totalPopulation
) {
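// Count at least one backed-up project so the proportion (and hence the
// standard error) never collapses to zero before the margin of error is computed.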
const proportion = Math.max(1, cumulativeBackedUp) / cumulativeSampled
// Standard error with finite population correction
const fpc = Math.sqrt(
(totalPopulation - cumulativeSampled) / (totalPopulation - 1)
)
const stdError =
Math.sqrt((proportion * (1 - proportion)) / cumulativeSampled) * fpc
// 95% confidence interval is approximately ±1.96 standard errors
const marginOfError = 1.96 * stdError
return {
proportion,
percentage: (proportion * 100).toFixed(2),
marginOfError,
errorPercentage: (marginOfError * 100).toFixed(2),
lowerBound: ((proportion - marginOfError) * 100).toFixed(2),
upperBound: ((proportion + marginOfError) * 100).toFixed(2),
sampleSize: cumulativeSampled,
populationSize: totalPopulation,
}
}
async function main() {
console.log('Date:', new Date().toISOString())
const totalCount = await projectsCollection.estimatedDocumentCount({
readPreference: READ_PREFERENCE_SECONDARY,
})
console.log(
`Total projects in collection (estimated): ${totalCount.toLocaleString()}`
)
console.log(`Target margin of error: ${TARGET_ERROR_PERCENTAGE}%`)
let cumulativeSampled = 0
let cumulativeBackedUp = 0
let currentError = Infinity
let iteration = 0
console.log('Iteration | Total Sampled | % Backed Up | Margin of Error')
console.log('----------|---------------|-------------|----------------')
while (currentError > TARGET_ERROR_PERCENTAGE) {
if (gracefulShutdownInitiated) {
console.log('Graceful shutdown initiated. Exiting sampling loop.')
break
}
iteration++
const { totalSampled, backedUp } = await takeSample(
SAMPLE_SIZE_PER_ITERATION
)
cumulativeSampled += totalSampled
cumulativeBackedUp += backedUp
const stats = calculateStatistics(
cumulativeSampled,
cumulativeBackedUp,
totalCount
)
currentError = parseFloat(stats.errorPercentage)
console.log(
`${iteration.toString().padStart(9)} | ` +
`${cumulativeSampled.toString().padStart(13)} | ` +
`${stats.percentage.padStart(10)}% | ` +
`\u00B1${stats.errorPercentage}%`
)
// Small delay between iterations
await new Promise(resolve => setTimeout(resolve, 100))
}
const finalStats = calculateStatistics(
cumulativeSampled,
cumulativeBackedUp,
totalCount
)
console.log(
`Projects sampled: ${cumulativeSampled.toLocaleString()} out of ${totalCount.toLocaleString()}`
)
console.log(
`Estimated percentage with lastBackedUpVersion: ${finalStats.percentage}%`
)
console.log(
`95% Confidence Interval: ${finalStats.lowerBound}% - ${finalStats.upperBound}%`
)
console.log(`Final Margin of Error: \u00B1${finalStats.errorPercentage}%`)
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exitCode = 1
})
.finally(() => {
client.close().catch(err => console.error('Error closing MongoDB:', err))
})

View File

@@ -0,0 +1,429 @@
import Queue from 'bull'
import config from 'config'
import commandLineArgs from 'command-line-args'
import logger from '@overleaf/logger'
import {
listPendingBackups,
listUninitializedBackups,
getBackupStatus,
} from '../lib/backup_store/index.js'
logger.initialize('backup-queue')
// Use the same redis config as backup_worker
const redisOptions = config.get('redis.queue')
// Create a Bull queue named 'backup'
const backupQueue = new Queue('backup', {
redis: redisOptions,
defaultJobOptions: {
removeOnComplete: true,
removeOnFail: true,
},
})
// Define command-line options
const optionDefinitions = [
{ name: 'clean', type: Boolean },
{ name: 'status', type: Boolean },
{
name: 'add',
type: String,
multiple: true,
description: 'Project IDs or date range in YYYY-MM-DD:YYYY-MM-DD format',
},
{ name: 'monitor', type: Boolean },
{
name: 'queue-pending',
type: Number,
description:
'Find projects with pending changes older than N seconds and add them to the queue',
},
{
name: 'show-pending',
type: Number,
description:
'Show count of pending projects older than N seconds without adding to queue',
},
{
name: 'limit',
type: Number,
description: 'Limit the number of jobs to be added',
},
{
name: 'interval',
type: Number,
description: 'Time in seconds to spread jobs over (default: 300)',
defaultValue: 300,
},
{
name: 'backoff-delay',
type: Number,
description:
'Backoff delay in milliseconds for failed jobs (default: 1000)',
defaultValue: 1000,
},
{
name: 'attempts',
type: Number,
description: 'Number of retry attempts for failed jobs (default: 3)',
defaultValue: 3,
},
{
name: 'warn-threshold',
type: Number,
description: 'Warn about any project exceeding this pending age',
defaultValue: 2 * 3600, // 2 hours
},
{
name: 'verbose',
alias: 'v',
type: Boolean,
description: 'Show detailed information when used with --show-pending',
},
]
// Parse command line arguments
const options = commandLineArgs(optionDefinitions)
const WARN_THRESHOLD = options['warn-threshold']
// Helper to validate date format
function isValidDateFormat(dateStr) {
return /^\d{4}-\d{2}-\d{2}$/.test(dateStr)
}
// Helper to validate the pending time parameter
function validatePendingTime(option, value) {
if (typeof value !== 'number' || value <= 0) {
console.error(
`Error: --${option} requires a positive numeric TIME argument in seconds`
)
console.error(`Example: --${option} 3600`)
process.exit(1)
}
return value
}
// Helper to format the pending time display
function formatPendingTime(timestamp) {
const now = new Date()
const diffMs = now - timestamp
const seconds = Math.floor(diffMs / 1000)
return `${timestamp.toISOString()} (${seconds} seconds ago)`
}
// Helper to add a job to the queue, checking for duplicates
async function addJobWithCheck(queue, data, options) {
const jobId = options.jobId
// Check if the job already exists
const existingJob = await queue.getJob(jobId)
if (existingJob) {
return { job: existingJob, added: false }
} else {
const job = await queue.add(data, options)
return { job, added: true }
}
}
// Setup queue event listeners
function setupMonitoring() {
console.log('Starting queue monitoring. Press Ctrl+C to exit.')
backupQueue.on('global:error', error => {
logger.info({ error }, 'Queue error')
})
backupQueue.on('global:waiting', jobId => {
logger.info({ jobId }, 'job is waiting')
})
backupQueue.on('global:active', jobId => {
logger.info({ jobId }, 'job is now active')
})
backupQueue.on('global:stalled', jobId => {
logger.info({ jobId }, 'job has stalled')
})
backupQueue.on('global:progress', (jobId, progress) => {
logger.info({ jobId, progress }, 'job progress')
})
backupQueue.on('global:completed', (jobId, result) => {
logger.info({ jobId, result }, 'job completed')
})
backupQueue.on('global:failed', (jobId, err) => {
logger.info({ jobId, err }, 'job failed')
})
backupQueue.on('global:paused', () => {
logger.info({}, 'Queue paused')
})
backupQueue.on('global:resumed', () => {
logger.info({}, 'Queue resumed')
})
backupQueue.on('global:cleaned', (jobs, type) => {
logger.info({ jobsCount: jobs.length, type }, 'Jobs cleaned')
})
backupQueue.on('global:drained', () => {
logger.info({}, 'Queue drained')
})
backupQueue.on('global:removed', jobId => {
logger.info({ jobId }, 'Job removed')
})
}
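// Queue a single backup job covering the given YYYY-MM-DD:YYYY-MM-DD date range.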
async function addDateRangeJob(input) {
const [startDate, endDate] = input.split(':')
if (!isValidDateFormat(startDate) || !isValidDateFormat(endDate)) {
console.error(
`Invalid date format for "${input}". Use YYYY-MM-DD:YYYY-MM-DD`
)
return
}
const jobId = `backup-${startDate}-to-${endDate}`
const { job, added } = await addJobWithCheck(
backupQueue,
{ startDate, endDate },
{ jobId }
)
console.log(
`${added ? 'Added' : 'Already exists'}: date range backup job: ${startDate} to ${endDate}, job ID: ${job.id}`
)
}
// Helper to list pending and uninitialized backups
// This function combines the two cursors into a single generator
// to yield projects from both lists
async function* pendingCursor(timeIntervalMs, limit) {
for await (const project of listPendingBackups(timeIntervalMs, limit)) {
yield project
}
for await (const project of listUninitializedBackups(timeIntervalMs, limit)) {
yield project
}
}
// Process pending projects with changes older than the specified seconds
async function processPendingProjects(
age,
showOnly,
limit,
verbose,
jobInterval,
jobOpts = {}
) {
const timeIntervalMs = age * 1000
console.log(
`Finding projects with pending changes older than ${age} seconds${showOnly ? ' (count only)' : ''}`
)
let count = 0
let addedCount = 0
let existingCount = 0
// Pass the limit directly to MongoDB query for better performance
const changeTimes = []
for await (const project of pendingCursor(timeIntervalMs, limit)) {
const projectId = project._id.toHexString()
const pendingAt =
project.overleaf?.backup?.pendingChangeAt || project._id.getTimestamp()
if (pendingAt) {
changeTimes.push(pendingAt)
const pendingAge = Math.floor((Date.now() - pendingAt.getTime()) / 1000)
if (pendingAge > WARN_THRESHOLD) {
try {
const backupStatus = await getBackupStatus(projectId)
logger.warn(
{
projectId,
pendingAt,
pendingAge,
backupStatus,
warnThreshold: WARN_THRESHOLD,
},
`pending change exceeds rpo warning threshold`
)
} catch (err) {
logger.error(
{ projectId, pendingAt, pendingAge },
'Error getting backup status'
)
throw err
}
}
}
if (showOnly && verbose) {
console.log(
`Project: ${projectId} (pending since: ${formatPendingTime(pendingAt)})`
)
} else if (!showOnly) {
const delay = Math.floor(Math.random() * jobInterval * 1000) // add random delay to avoid all jobs running simultaneously
const { job, added } = await addJobWithCheck(
backupQueue,
{ projectId, pendingChangeAt: pendingAt.getTime() },
{ ...jobOpts, delay, jobId: projectId }
)
if (added) {
if (verbose) {
console.log(
`Added job for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
)
}
addedCount++
} else {
if (verbose) {
console.log(
`Job already exists for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
)
}
existingCount++
}
}
count++
if (count % 1000 === 0) {
console.log(
`Processed ${count} projects`,
showOnly ? '' : `(${addedCount} added, ${existingCount} existing)`
)
}
}
// Set oldestChange to undefined if there are no changes
const oldestChange =
changeTimes.length > 0
? changeTimes.reduce((min, time) => (time < min ? time : min))
: undefined
if (showOnly) {
console.log(
`Found ${count} projects with pending changes (not added to queue)`
)
} else {
console.log(`Found ${count} projects with pending changes:`)
console.log(` ${addedCount} jobs added to queue`)
console.log(` ${existingCount} jobs already existed in queue`)
if (oldestChange) {
console.log(` Oldest pending change: ${formatPendingTime(oldestChange)}`)
}
}
}
// Main execution block
async function run() {
const optionCount = [
options.clean,
options.status,
options.add,
options.monitor,
options['queue-pending'] !== undefined,
options['show-pending'] !== undefined,
].filter(Boolean).length
if (optionCount > 1) {
console.error('Only one option can be specified')
process.exit(1)
}
if (options.clean) {
const beforeCounts = await backupQueue.getJobCounts()
console.log('Current queue state:', JSON.stringify(beforeCounts))
console.log('Cleaning completed and failed jobs...')
await backupQueue.clean(1, 'completed')
await backupQueue.clean(1, 'failed')
const afterCounts = await backupQueue.getJobCounts()
console.log('Current queue state:', JSON.stringify(afterCounts))
console.log('Queue cleaned successfully')
} else if (options.status) {
const counts = await backupQueue.getJobCounts()
console.log('Current queue state:', JSON.stringify(counts))
} else if (options.add) {
const inputs = Array.isArray(options.add) ? options.add : [options.add]
for (const input of inputs) {
if (input.includes(':')) {
// Handle date range format
await addDateRangeJob(input)
} else {
// Handle project ID format
const { job, added } = await addJobWithCheck(
backupQueue,
{ projectId: input },
{ jobId: input }
)
console.log(
`${added ? 'Added' : 'Already exists'}: job for project: ${input}, job ID: ${job.id}`
)
}
}
} else if (options.monitor) {
setupMonitoring()
} else if (options['queue-pending'] !== undefined) {
const age = validatePendingTime('queue-pending', options['queue-pending'])
await processPendingProjects(
age,
false,
options.limit,
options.verbose,
options.interval,
{
attempts: options.attempts,
backoff: {
type: 'exponential',
delay: options['backoff-delay'],
},
}
)
} else if (options['show-pending'] !== undefined) {
const age = validatePendingTime('show-pending', options['show-pending'])
await processPendingProjects(age, true, options.limit, options.verbose)
} else {
console.log('Usage:')
console.log(' --clean Clean up completed and failed jobs')
console.log(' --status Show current job counts')
console.log(' --add [projectId] Add a job for the specified projectId')
console.log(
' --add [YYYY-MM-DD:YYYY-MM-DD] Add a job for the specified date range'
)
console.log(' --monitor Monitor queue events')
console.log(
' --queue-pending TIME Find projects with changes older than TIME seconds and add them to the queue'
)
console.log(
' --show-pending TIME Show count of pending projects older than TIME seconds'
)
console.log(' --limit N Limit the number of jobs to be added')
console.log(
' --interval TIME Time interval in seconds to spread jobs over'
)
console.log(
' --backoff-delay TIME Backoff delay in milliseconds for failed jobs (default: 1000)'
)
console.log(
' --attempts N Number of retry attempts for failed jobs (default: 3)'
)
console.log(
' --verbose, -v Show detailed information when used with --show-pending'
)
}
}
// Run and handle errors
run()
.catch(err => {
console.error('Error:', err)
process.exit(1)
})
.then(result => {
// Only exit if not in monitor mode
if (!options.monitor) {
process.exit(0)
}
})

View File

@@ -0,0 +1,144 @@
import Queue from 'bull'
import logger from '@overleaf/logger'
import config from 'config'
import metrics from '@overleaf/metrics'
import {
backupProject,
initializeProjects,
configureBackup,
} from './backup.mjs'
const CONCURRENCY = 15
const WARN_THRESHOLD = 2 * 60 * 60 * 1000 // warn if projects are older than this
const redisOptions = config.get('redis.queue')
const JOB_TIME_BUCKETS = [10, 100, 500, 1000, 5000, 10000, 30000, 60000] // milliseconds
const LAG_TIME_BUCKETS_HRS = [
0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.75, 2, 3, 4, 5, 6,
] // hours
// Configure backup settings to match worker concurrency
configureBackup({ concurrency: 50, useSecondary: true })
// Create a Bull queue named 'backup'
const backupQueue = new Queue('backup', {
redis: redisOptions,
settings: {
lockDuration: 15 * 60 * 1000, // 15 minutes
lockRenewTime: 60 * 1000, // 1 minute
maxStalledCount: 0, // mark stalled jobs as failed
},
})
// Log queue events
backupQueue.on('active', job => {
logger.debug({ job }, 'job is now active')
})
backupQueue.on('completed', (job, result) => {
metrics.inc('backup_worker_job', 1, { status: 'completed' })
logger.debug({ job, result }, 'job completed')
})
backupQueue.on('failed', (job, err) => {
metrics.inc('backup_worker_job', 1, { status: 'failed' })
logger.error({ job, err }, 'job failed')
})
backupQueue.on('waiting', jobId => {
logger.debug({ jobId }, 'job is waiting')
})
backupQueue.on('error', error => {
logger.error({ error }, 'queue error')
})
backupQueue.on('stalled', job => {
logger.error({ job }, 'job has stalled')
})
backupQueue.on('lock-extension-failed', (job, err) => {
logger.error({ job, err }, 'lock extension failed')
})
backupQueue.on('paused', () => {
logger.info('queue paused')
})
backupQueue.on('resumed', () => {
logger.info('queue resumed')
})
// Process jobs
backupQueue.process(CONCURRENCY, async job => {
const { projectId, startDate, endDate } = job.data
if (projectId) {
return await runBackup(projectId, job.data, job)
} else if (startDate && endDate) {
return await runInit(startDate, endDate)
} else {
throw new Error('invalid job data')
}
})
async function runBackup(projectId, data, job) {
const { pendingChangeAt } = data
// record the time it takes to run the backup job
const timer = new metrics.Timer(
'backup_worker_job_duration',
1,
{},
JOB_TIME_BUCKETS
)
const pendingAge = Date.now() - pendingChangeAt
if (pendingAge > WARN_THRESHOLD) {
logger.warn(
{ projectId, pendingAge, job },
'project has been pending for a long time'
)
}
try {
logger.debug({ projectId }, 'processing backup for project')
await backupProject(projectId, {})
metrics.inc('backup_worker_project', 1, {
status: 'success',
})
timer.done()
// record the replication lag (time from change to backup)
if (pendingChangeAt) {
metrics.histogram(
'backup_worker_replication_lag_in_hours',
(Date.now() - pendingChangeAt) / (3600 * 1000),
LAG_TIME_BUCKETS_HRS
)
}
return `backup completed ${projectId}`
} catch (err) {
metrics.inc('backup_worker_project', 1, { status: 'failed' })
logger.error({ projectId, err }, 'backup failed')
throw err // Re-throw to mark job as failed
}
}
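// Run backup initialization over the given start/end date range.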
async function runInit(startDate, endDate) {
try {
logger.info({ startDate, endDate }, 'initializing projects')
await initializeProjects({ 'start-date': startDate, 'end-date': endDate })
return `initialization completed ${startDate} - ${endDate}`
} catch (err) {
logger.error({ startDate, endDate, err }, 'initialization failed')
throw err
}
}
export async function drainQueue() {
logger.info({ queue: backupQueue.name }, 'pausing queue')
await backupQueue.pause(true) // pause this worker and wait for jobs to finish
logger.info({ queue: backupQueue.name }, 'closing queue')
await backupQueue.close()
}
export async function healthCheck() {
const count = await backupQueue.count()
metrics.gauge('backup_worker_queue_length', count)
}

View File

@@ -0,0 +1,69 @@
/**
* A script to export the global blobs from mongo to a CSV file.
*
* node storage/scripts/export_global_blobs.mjs --output global_blobs.csv
*
* The output CSV has the following format:
*
* hash,path,byteLength,stringLength,demoted
*
* hash: the hash of the blob
* path: the path of the blob in the blob store
* byteLength: the byte length of the blob, or empty if unknown
* stringLength: the string length of the blob, or empty if unknown
* demoted: true if the blob has been demoted to a reference, false otherwise
*/
// @ts-check
import { ObjectId } from 'mongodb'
import { GLOBAL_BLOBS, loadGlobalBlobs } from '../lib/blob_store/index.js'
import { client } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import fs from 'node:fs'
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
function parseArgs() {
const args = commandLineArgs([
{
name: 'output',
type: String,
alias: 'o',
},
])
const OUTPUT_STREAM = fs.createWriteStream(args['output'], { flags: 'wx' })
return {
OUTPUT_STREAM,
}
}
const { OUTPUT_STREAM } = parseArgs()
async function main() {
await loadGlobalBlobs()
OUTPUT_STREAM.write('hash,path,byteLength,stringLength,demoted\n')
for (const [hash, { blob, demoted }] of GLOBAL_BLOBS) {
const { hash: blobHash, byteLength, stringLength } = blob
if (blobHash !== hash) {
throw new Error(`hash mismatch: ${hash} !== ${blobHash}`)
}
const path = blobHash.slice(0, 2) + '/' + blobHash.slice(2)
const byteLengthStr = byteLength === null ? '' : byteLength
const stringLengthStr = stringLength === null ? '' : stringLength
OUTPUT_STREAM.write(
`${hash},${path},${byteLengthStr},${stringLengthStr},${demoted}\n`
)
}
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exitCode = 1
})
.finally(() => {
client.close().catch(err => console.error('Error closing MongoDB:', err))
})

View File

@@ -0,0 +1,51 @@
// @ts-check
import { backedUpBlobs } from '../lib/mongodb.js'
import { mongoId } from '../lib/assert.js'
import { ObjectId } from 'mongodb'
import commandLineArgs from 'command-line-args'
const STATS = {
total: 0,
replaced: 0,
skipped: 0,
}
const config = commandLineArgs([
{ name: 'commit', type: Boolean, defaultValue: false },
])
async function processRecord(record) {
STATS.total++
try {
mongoId(record._id)
const newId = new ObjectId(record._id)
if (config.commit) {
await backedUpBlobs.updateOne(
{ _id: newId },
{
$addToSet: { blobs: { $each: record.blobs } },
},
{ upsert: true }
)
await backedUpBlobs.deleteOne({ _id: record._id })
}
STATS.replaced++
} catch (error) {
console.log(error)
STATS.skipped++
}
}
const cursor = backedUpBlobs
.find({ _id: { $type: 'string' } })
.project({ _id: 1, blobs: 1 })
while (await cursor.hasNext()) {
const record = await cursor.next()
await processRecord(record)
}
console.log(
`${!config.commit ? 'DRY RUN: ' : ''}${STATS.total} records: ${STATS.replaced} replaced, ${STATS.skipped} skipped`
)
process.exit()

View File

@@ -0,0 +1,3 @@
UPDATE blobs
SET global = TRUE
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);

View File

@@ -0,0 +1,16 @@
CREATE TABLE global_blobs (
hash_bytes bytea NOT NULL,
byte_length integer NOT NULL,
string_length integer,
global boolean,
CONSTRAINT global_blobs_pkey PRIMARY KEY (hash_bytes),
CONSTRAINT global_blobs_byte_length_non_negative
CHECK (byte_length >= 0),
CONSTRAINT global_blobs_string_length_non_negative
CHECK (string_length IS NULL OR string_length >= 0)
);
INSERT INTO global_blobs (hash_bytes, byte_length, string_length, global)
SELECT hash_bytes, byte_length, string_length, true
FROM blobs
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);

View File

@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO old_blobs;
ALTER TABLE global_blobs RENAME TO blobs;
ALTER TABLE old_blobs
RENAME CONSTRAINT blobs_pkey TO old_blobs_pkey;
ALTER TABLE old_blobs
RENAME CONSTRAINT blobs_byte_length_non_negative
TO old_blobs_byte_length_non_negative;
ALTER TABLE old_blobs
RENAME CONSTRAINT blobs_string_length_non_negative
TO old_blobs_string_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT global_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
RENAME CONSTRAINT global_blobs_byte_length_non_negative
TO blobs_byte_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT global_blobs_string_length_non_negative
TO blobs_string_length_non_negative;
COMMIT;

View File

@@ -0,0 +1,9 @@
Scripts in this directory were used when we cleaned up the global blobs table,
ensuring that it only contained global blobs. The scripts are meant to be run in this order:
* `01-create-blob-hashes-table.sql`
* `02-set-global-flag.sql`
* `03-create-global-blobs-table.sql`
* `04-swap-global-blob-tables.sql`
The `rollback.sql` script can be run to reverse the effect of `04-swap-global-blob-tables.sql`.
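
For illustration, the files could be applied in order from Node with the `pg` client. This is only a sketch: it assumes `pg` is installed, the connection settings come from the usual `PG*` environment variables, and the script runs from this directory.

    const fs = require('node:fs')
    const path = require('node:path')
    const { Client } = require('pg')

    async function run() {
      const client = new Client() // connection settings taken from PG* environment variables
      await client.connect()
      try {
        for (const file of [
          '01-create-blob-hashes-table.sql',
          '02-set-global-flag.sql',
          '03-create-global-blobs-table.sql',
          '04-swap-global-blob-tables.sql',
        ]) {
          await client.query(fs.readFileSync(path.join(__dirname, file), 'utf8'))
        }
      } finally {
        await client.end()
      }
    }

    run().catch(err => {
      console.error(err)
      process.exit(1)
    })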

View File

@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO global_blobs;
ALTER TABLE old_blobs RENAME TO blobs;
ALTER TABLE global_blobs
RENAME CONSTRAINT blobs_pkey TO global_blobs_pkey;
ALTER TABLE global_blobs
RENAME CONSTRAINT blobs_byte_length_non_negative
TO global_blobs_byte_length_non_negative;
ALTER TABLE global_blobs
RENAME CONSTRAINT blobs_string_length_non_negative
TO global_blobs_string_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT old_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
RENAME CONSTRAINT old_blobs_byte_length_non_negative
TO blobs_byte_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT old_blobs_string_length_non_negative
TO blobs_string_length_non_negative;
COMMIT;

View File

@@ -0,0 +1,379 @@
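// Recover doc versions that may have been lost by the
// '20231219081700_move_doc_versions_from_docops_to_docs' migration: compare
// each project's doc versions from history against Redis and Mongo, bump any
// stored version that lags behind, and record projects that need a hard
// resync.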
const fsPromises = require('node:fs/promises')
const { ObjectId } = require('mongodb')
const BPromise = require('bluebird')
const logger = require('@overleaf/logger')
const Settings = require('@overleaf/settings')
const rclient = require('@overleaf/redis-wrapper').createClient(
Settings.redis.documentupdater
)
const mongodb = require('../lib/mongodb')
const { chunkStore } = require('..')
const Events = require('node:events')
// Silence warning.
Events.setMaxListeners(20)
const BATCH_SIZE = 1000
const OPTIONS = {
concurrency: parseInt(process.env.DOC_VERSION_RECOVERY_CONCURRENCY, 10) || 20,
force: process.env.DOC_VERSION_RECOVERY_FORCE === 'true',
'skip-history-failures':
process.env.DOC_VERSION_RECOVERY_SKIP_HISTORY_FAILURES === 'true',
'resyncs-needed-file': process.env.DOC_VERSION_RECOVERY_RESYNCS_NEEDED_FILE,
}
const db = {
deletedProjects: mongodb.db.collection('deletedProjects'),
docs: mongodb.db.collection('docs'),
migrations: mongodb.db.collection('migrations'),
projects: mongodb.db.collection('projects'),
}
const BAD_MIGRATION_NAME =
'20231219081700_move_doc_versions_from_docops_to_docs'
const RECOVERY_FILES_502 = [
'/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log',
'/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log.done',
]
let loggingChain = Promise.resolve()
const projectIdsThatNeedResyncing = []
const unflushedDocIds = new Set()
async function flushLogQueue() {
const logPath = OPTIONS['resyncs-needed-file']
loggingChain = loggingChain.then(async () => {
const batch = projectIdsThatNeedResyncing.splice(0)
if (batch.length === 0) return
try {
await fsPromises.appendFile(logPath, batch.join('\n') + '\n')
} catch (err) {
projectIdsThatNeedResyncing.push(...batch)
logger.err({ err, logPath, batch }, 'Failed to write to log file')
}
})
await loggingChain
}
async function recordProjectNeedsResync(projectId) {
if (OPTIONS['resyncs-needed-file']) {
projectIdsThatNeedResyncing.push(projectId)
await flushLogQueue()
} else {
console.log(`Project ${projectId} needs a hard resync.`)
}
}
async function main() {
const recovery502Ran = await did502RecoveryRun()
await getUnflushedDocIds()
const badMigration = await db.migrations.findOne({ name: BAD_MIGRATION_NAME })
if (unflushedDocIds.size > 0 && !recovery502Ran && badMigration != null) {
// Tell customers that they need to flush
console.log(`
--------------------------------------------------------------------
Detected unflushed changes while recovering doc versions.
Please go back to version 5.0.1 and follow the recovery procedure
for flushing document updates:
https://github.com/overleaf/overleaf/wiki/Doc-version-recovery
--------------------------------------------------------------------`)
process.exit(1)
}
if (OPTIONS.force || recovery502Ran || badMigration != null) {
console.warn('Need to recover doc versions. This will take a while.')
await runRecovery()
await db.migrations.deleteOne({ name: BAD_MIGRATION_NAME })
await delete502RecoveryFiles()
}
console.log('Done.')
}
async function did502RecoveryRun() {
for (const file of RECOVERY_FILES_502) {
try {
await fsPromises.stat(file)
return true
} catch (err) {
// file doesn't exist. continue
}
}
return false
}
async function delete502RecoveryFiles() {
for (const file of RECOVERY_FILES_502) {
try {
await fsPromises.rename(file, file.replace('.log', '-5.0.2.log'))
} catch (err) {
// file doesn't exist. continue
}
}
}
async function runRecovery() {
let batch = []
const summary = {
ignored: 0,
skipped: 0,
deletedUpdatedMongo: 0,
deletedUpdatedRedis: 0,
deletedUpdatedBoth: 0,
deletedIgnored: 0,
updatedMongo: 0,
updatedRedis: 0,
updatedBoth: 0,
}
const processBatchAndLogProgress = async () => {
try {
await BPromise.map(batch, project => processProject(project, summary), {
concurrency: OPTIONS.concurrency,
})
} finally {
console.log(`${summary.updatedRedis} projects updated in Redis`)
console.log(`${summary.updatedMongo} projects updated in Mongo`)
console.log(
`${summary.updatedBoth} projects updated in both Mongo and Redis`
)
console.log(`${summary.ignored} projects had good versions`)
console.log(
`${summary.deletedUpdatedMongo} deleted projects updated in Mongo`
)
console.log(
`${summary.deletedUpdatedRedis} deleted projects updated in Redis`
)
console.log(
`${summary.deletedUpdatedBoth} deleted projects updated in both Mongo and Redis`
)
console.log(
`${summary.deletedIgnored} deleted projects had good versions`
)
console.log(`${summary.skipped} projects skipped`)
}
batch = []
}
await printDBStats()
await initResyncsNeededFile()
for await (const project of getProjects()) {
batch.push(project)
if (batch.length >= BATCH_SIZE) {
await processBatchAndLogProgress()
}
}
for await (const deletedProject of getDeletedProjects()) {
const project = deletedProject.project
project.isDeleted = true
batch.push(project)
if (batch.length >= BATCH_SIZE) {
await processBatchAndLogProgress()
}
}
if (batch.length > 0) {
await processBatchAndLogProgress()
}
await backfillMissingVersions()
}
async function getUnflushedDocIds() {
const batchSize = 1000
let cursor = '0'
do {
const [newCursor, keys] = await rclient.scan(
cursor,
'MATCH',
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: '*' }),
'COUNT',
batchSize
)
for (const key of keys) {
unflushedDocIds.add(key.slice('DocVersion:'.length))
}
cursor = newCursor
} while (cursor !== '0')
}
async function printDBStats() {
const projects = await db.projects.estimatedDocumentCount()
const deletedProjects = await db.deletedProjects.countDocuments()
const docs = await db.docs.estimatedDocumentCount()
console.log(
`Need to check ${projects} projects and up to ${deletedProjects} deleted projects with a total of ${docs} docs.`
)
}
async function initResyncsNeededFile() {
const logPath = OPTIONS['resyncs-needed-file']
if (logPath) {
await fsPromises.writeFile(logPath, '')
await fsPromises.rm(`${logPath}.done`, { force: true })
}
}
function getProjects() {
return db.projects.find({}, { projection: { _id: 1, overleaf: 1 } })
}
function getDeletedProjects() {
return db.deletedProjects.find(
{ 'project.overleaf.history.id': { $exists: true } },
{ projection: { 'project._id': 1, 'project.overleaf': 1 } }
)
}
async function processProject(project, summary) {
const projectId = project._id.toString()
let updatedMongo = false
let updatedRedis = false
try {
const historyDocVersions = await getHistoryDocVersions(project)
for (const { docId, version } of historyDocVersions) {
const update = await fixDocVersion(docId, version)
if (update != null) {
if (update.in === 'mongo') {
updatedMongo = true
} else if (update.in === 'redis') {
updatedRedis = true
}
}
}
if (project.isDeleted) {
if (updatedMongo && updatedRedis) {
summary.deletedUpdatedBoth += 1
} else if (updatedMongo) {
summary.deletedUpdatedMongo += 1
} else if (updatedRedis) {
summary.deletedUpdatedRedis += 1
} else {
summary.deletedIgnored += 1
}
} else {
await recordProjectNeedsResync(projectId)
if (updatedMongo && updatedRedis) {
summary.updatedBoth += 1
} else if (updatedMongo) {
summary.updatedMongo += 1
} else if (updatedRedis) {
summary.updatedRedis += 1
} else {
summary.ignored += 1
}
}
} catch (err) {
logger.error({ err, projectId }, 'Failed to process project')
if (OPTIONS['skip-history-failures']) {
summary.skipped += 1
} else {
throw err
}
}
}
async function getHistoryDocVersions(project) {
const historyId = project.overleaf.history.id
const chunk = await chunkStore.loadLatest(historyId)
if (chunk == null) {
return []
}
const snapshot = chunk.getSnapshot()
const changes = chunk.getChanges()
snapshot.applyAll(changes)
const v2DocVersions = snapshot.getV2DocVersions()
if (v2DocVersions == null) {
return []
}
return Object.entries(v2DocVersions.data).map(([docId, versionInfo]) => ({
docId,
version: versionInfo.v,
}))
}
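// Bring the stored doc version ahead of the latest version seen in history.
// Prefer bumping the unflushed copy in Redis when it exists and is stale;
// otherwise bump the version in Mongo, but only where it is missing or lags
// behind the history version.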
async function fixDocVersion(docId, historyVersion) {
const redisVersion = await getRedisDocVersion(docId)
if (redisVersion != null && historyVersion >= redisVersion) {
await setRedisDocVersion(docId, historyVersion + 1)
return {
in: 'redis',
previousVersion: redisVersion,
newVersion: historyVersion + 1,
}
} else {
const docBeforeUpdate = await db.docs.findOneAndUpdate(
{
_id: new ObjectId(docId),
$or: [
{ version: { $lte: historyVersion } },
{ version: { $exists: false } },
],
},
{ $set: { version: historyVersion + 1 } },
{ projection: { _id: 1, version: 1 } }
)
if (docBeforeUpdate != null) {
return {
in: 'mongo',
previousVersion: docBeforeUpdate.version,
newVersion: historyVersion + 1,
}
} else {
return null
}
}
}
async function getRedisDocVersion(docId) {
if (!unflushedDocIds.has(docId)) {
return null
}
const result = await rclient.get(
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId })
)
if (result == null) {
return null
}
return parseInt(result, 10)
}
async function setRedisDocVersion(docId, version) {
const multi = rclient.multi()
multi.set(
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId }),
version
)
multi.set(`UnflushedTime:{${docId}}`, Date.now(), 'NX')
await multi.exec()
}
/**
* Set all remaining versions to 0
*/
async function backfillMissingVersions() {
console.log('Defaulting version to 0 for remaining docs.')
await db.docs.updateMany(
{ version: { $exists: false } },
{ $set: { version: 0 } }
)
}
main()
.finally(async () => {
console.log('Flushing log queue.')
await flushLogQueue()
})
.then(() => {
process.exit(0)
})
.catch(err => {
console.error(err)
process.exit(1)
})

View File

@@ -0,0 +1,255 @@
/**
* Try to recover a zip of the latest version of a project using only data in
* GCS, where this data may have been (recently) hard deleted (i.e. may exist
* wholely or in part as non-current versions). This should be able to
* retrieve the latest content of a project up to 180 days after it was
* deleted.
*
* Usage:
* node recover_zip.js [--verbose] <HISTORY_ID> <HISTORY_ID> ...
*
* Output:
* Signed URL(s) for the uploaded zip files. Note that these are valid for
* only 24h, to match the lifecycle rule on the zip bucket.
*/
const fs = require('node:fs')
const os = require('node:os')
const path = require('node:path')
const util = require('node:util')
// Something is registering 11 listeners, over the limit of 10, which generates
// a lot of warning noise.
require('node:events').EventEmitter.defaultMaxListeners = 11
const config = require('config')
// We depend on this via object-persistor.
// eslint-disable-next-line import/no-extraneous-dependencies
const { Storage } = require('@google-cloud/storage')
const isValidUtf8 = require('utf-8-validate')
const core = require('overleaf-editor-core')
const projectKey = require('../lib/project_key')
const streams = require('../lib/streams')
const ProjectArchive = require('../lib/project_archive')
const {
values: { verbose: VERBOSE },
positionals: HISTORY_IDS,
} = util.parseArgs({
options: {
verbose: {
type: 'boolean',
default: false,
},
},
allowPositionals: true,
})
if (HISTORY_IDS.length === 0) {
console.error('no history IDs; see usage')
process.exit(1)
}
async function listDeletedChunks(historyId) {
const bucketName = config.get('chunkStore.bucket')
const storage = new Storage()
const [files] = await storage.bucket(bucketName).getFiles({
prefix: projectKey.format(historyId),
versions: true,
})
return files
}
async function findLatestChunk(historyId) {
const files = await listDeletedChunks(historyId)
if (files.length === 0) return null
files.sort((a, b) => {
if (a.name < b.name) return -1
if (a.name > b.name) return 1
return 0
})
return files[files.length - 1]
}
async function downloadLatestChunk(tmp, historyId) {
const latestChunkFile = await findLatestChunk(historyId)
if (!latestChunkFile) throw new Error('no chunk found to recover')
const destination = path.join(tmp, 'latest.json')
await latestChunkFile.download({ destination })
return destination
}
async function loadHistory(historyPathname) {
const data = await fs.promises.readFile(historyPathname)
const rawHistory = JSON.parse(data)
return core.History.fromRaw(rawHistory)
}
async function loadChunk(historyPathname, blobStore) {
const history = await loadHistory(historyPathname)
const blobHashes = new Set()
history.findBlobHashes(blobHashes)
await blobStore.fetchBlobs(blobHashes)
await history.loadFiles('lazy', blobStore)
return new core.Chunk(history, 0)
}
// TODO: it would be nice to export / expose this from BlobStore;
// currently this is a copy of the method there.
async function getStringLengthOfFile(byteLength, pathname) {
// We have to read the file into memory to get its UTF-8 length, so don't
// bother for files that are too large for us to edit anyway.
if (byteLength > core.Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
return null
}
// We need to check if the file contains nonBmp or null characters
let data = await fs.promises.readFile(pathname)
if (!isValidUtf8(data)) return null
data = data.toString()
if (data.length > core.TextOperation.MAX_STRING_LENGTH) return null
if (core.util.containsNonBmpChars(data)) return null
if (data.indexOf('\x00') !== -1) return null
return data.length
}
class RecoveryBlobStore {
constructor(historyId, tmp) {
this.historyId = historyId
this.tmp = tmp
this.blobs = new Map()
}
async fetchBlobs(blobHashes) {
for await (const blobHash of blobHashes) {
await this.fetchBlob(blobHash)
}
}
async fetchBlob(hash) {
if (this.blobs.has(hash)) return
if (VERBOSE) console.log('fetching blob', hash)
const bucketName = config.get('blobStore.projectBucket')
const storage = new Storage()
const [files] = await storage.bucket(bucketName).getFiles({
prefix: this.makeProjectBlobKey(hash),
versions: true,
})
const destination = this.getBlobPathname(hash)
if (files.length === 0) {
await this.fetchGlobalBlob(hash, destination)
} else if (files.length === 1) {
await files[0].download({ destination })
} else {
throw new Error('Multiple versions of blob ' + hash)
}
this.blobs.set(hash, await this.makeBlob(hash, destination))
}
async fetchGlobalBlob(hash, destination) {
const bucketName = config.get('blobStore.globalBucket')
const storage = new Storage()
const file = storage.bucket(bucketName).file(this.makeGlobalBlobKey(hash))
await file.download({ destination })
}
async makeBlob(hash, pathname) {
const stat = await fs.promises.stat(pathname)
const byteLength = stat.size
const stringLength = await getStringLengthOfFile(byteLength, pathname)
return new core.Blob(hash, byteLength, stringLength)
}
async getString(hash) {
const stream = await this.getStream(hash)
const buffer = await streams.readStreamToBuffer(stream)
return buffer.toString()
}
async getStream(hash) {
return fs.createReadStream(this.getBlobPathname(hash))
}
async getBlob(hash) {
return this.blobs.get(hash)
}
getBlobPathname(hash) {
return path.join(this.tmp, hash)
}
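// Global blobs are keyed by hash alone; a (hypothetical) hash 'abcdef0123...'
// maps to the key 'ab/cd/ef0123...'.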
makeGlobalBlobKey(hash) {
return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
}
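// Project blobs are keyed under the project key; the same hypothetical hash
// maps to '<projectKey.format(historyId)>/ab/cdef0123...'.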
makeProjectBlobKey(hash) {
return `${projectKey.format(this.historyId)}/${hash.slice(
0,
2
)}/${hash.slice(2)}`
}
}
async function uploadZip(historyId, zipPathname) {
const bucketName = config.get('zipStore.bucket')
const deadline = 24 * 3600 * 1000 // lifecycle limit on the zips bucket
const storage = new Storage()
const destination = `${historyId}-recovered.zip`
await storage.bucket(bucketName).upload(zipPathname, { destination })
const signedUrls = await storage
.bucket(bucketName)
.file(destination)
.getSignedUrl({
version: 'v4',
action: 'read',
expires: Date.now() + deadline,
})
return signedUrls[0]
}
async function restoreProject(historyId) {
const tmp = await fs.promises.mkdtemp(
path.join(os.tmpdir(), historyId.toString())
)
if (VERBOSE) console.log('recovering', historyId, 'in', tmp)
const latestJsonPathname = await downloadLatestChunk(tmp, historyId)
const blobStore = new RecoveryBlobStore(historyId, tmp)
const chunk = await loadChunk(latestJsonPathname, blobStore)
const snapshot = chunk.getSnapshot()
for (const change of chunk.getChanges()) {
change.applyTo(snapshot)
}
if (VERBOSE) console.log('zipping', historyId)
const zipPathname = path.join(tmp, `${historyId}.zip`)
const zipTimeoutMs = 60 * 1000
const archive = new ProjectArchive(snapshot, zipTimeoutMs)
await archive.writeZip(blobStore, zipPathname)
if (VERBOSE) console.log('uploading', historyId)
return await uploadZip(historyId, zipPathname)
}
async function main() {
for (const historyId of HISTORY_IDS) {
const signedUrl = await restoreProject(historyId)
console.log(signedUrl)
}
}
main().catch(console.error)

View File

@@ -0,0 +1,36 @@
import redis from '@overleaf/redis-wrapper'
import config from 'config'
// Get allowed Redis dbs from config
const redisConfig = config.get('redis')
const allowedDbs = Object.keys(redisConfig)
// Get the Redis db from command line argument or use the first available db as default
const db = process.argv[2]
// Validate redis db
if (!allowedDbs.includes(db)) {
if (db) {
console.error('Invalid redis db:', db)
}
console.error(`Usage: node redis.mjs [${allowedDbs.join('|')}]`)
process.exit(1)
}
// Get redis options based on command line argument
const redisOptions = config.get(`redis.${db}`)
console.log('Using redis db:', db)
console.log('REDIS CONFIG', {
...redisOptions,
password: '*'.repeat(redisOptions.password?.length),
})
const rclient = redis.createClient(redisOptions)
try {
await rclient.healthCheck()
console.log('REDIS HEALTHCHECK SUCCEEDED')
} catch (error) {
console.error('REDIS HEALTHCHECK FAILED', error)
} finally {
await rclient.quit()
}

View File

@@ -0,0 +1,104 @@
// @ts-check
import { readFileSync } from 'node:fs'
import commandLineArgs from 'command-line-args'
import { client } from '../lib/mongodb.js'
import {
getBackedUpBlobHashes,
unsetBackedUpBlobHashes,
} from '../lib/backup_store/index.js'
let gracefulShutdownInitiated = false
// Parse command line arguments
const args = commandLineArgs([
{ name: 'input', type: String, alias: 'i', defaultOption: true },
{ name: 'commit', type: Boolean, defaultValue: false },
])
if (!args.input) {
console.error(
'Usage: node remove_backed_up_blobs.mjs --input <csv-file> [--commit]'
)
process.exit(1)
}
if (!args.commit) {
console.log('Running in dry-run mode. Use --commit to apply changes.')
}
// Signal handling
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
console.warn('Graceful shutdown initiated')
gracefulShutdownInitiated = true
}
// Process CSV and remove blobs
async function main() {
const projectBlobs = new Map()
const lines = readFileSync(args.input, 'utf8').split('\n')
const SHA1_HEX_REGEX = /^[a-f0-9]{40}$/
// Skip header
for (const line of lines.slice(1)) {
if (!line.trim() || gracefulShutdownInitiated) break
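// The path column is a project blob key: three project-key segments followed
// by the first two hex characters of the blob hash and then the remainder.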
const [projectId, path] = line.split(',')
const pathParts = path.split('/')
const hash = pathParts[3] + pathParts[4]
if (!SHA1_HEX_REGEX.test(hash)) {
console.warn(`Invalid SHA1 hash for project ${projectId}: ${hash}`)
continue
}
if (!projectBlobs.has(projectId)) {
projectBlobs.set(projectId, new Set())
}
projectBlobs.get(projectId).add(hash)
}
// Process each project
for (const [projectId, hashes] of projectBlobs) {
if (gracefulShutdownInitiated) break
if (!args.commit) {
console.log(
`DRY-RUN: would remove ${hashes.size} blobs from project ${projectId}`
)
continue
}
try {
const originalHashes = await getBackedUpBlobHashes(projectId)
if (originalHashes.size === 0) {
continue
}
const result = await unsetBackedUpBlobHashes(
projectId,
Array.from(hashes)
)
if (result) {
console.log(
`Project ${projectId}: want to remove ${hashes.size}, removed ${originalHashes.size - result.blobs.length}, ${result.blobs.length} remaining`
)
}
} catch (err) {
console.error(`Error updating project ${projectId}:`, err)
}
}
}
// Run the script
main()
.catch(err => {
console.error('Fatal error:', err)
process.exitCode = 1
})
.finally(() => {
client
.close()
.catch(err => console.error('Error closing MongoDB connection:', err))
})

View File

@@ -0,0 +1,221 @@
// @ts-check
/**
* This script is used to remove blobs that have been backed up under the project ID
* instead of the history ID (where those are different).
*
* This script reads a CSV file with the following format:
* ```
* project_id,hash
* <mongo ID>,<hash>
* ```
*
* The header row is optional. All rows will be checked for conformance to the format.
*/
import commandLineArgs from 'command-line-args'
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
import { makeProjectKey } from '../lib/blob_store/index.js'
import fs from 'node:fs'
import assert from '../lib/assert.js'
import { client } from '../lib/mongodb.js'
import { verifyBlobs } from '../lib/backupVerifier.mjs'
import { setTimeout } from 'node:timers/promises'
import { getHistoryId } from '../lib/backup_store/index.js'
const argsSchema = [
{
name: 'input',
type: String,
},
{
name: 'commit',
type: Boolean,
},
{
name: 'header',
type: Boolean,
},
{
name: 'force',
type: Boolean,
},
{
name: 'verbose',
type: Boolean,
},
]
const args = commandLineArgs(argsSchema)
async function gracefulClose(code = 0) {
await client.close()
process.exit(code)
}
/**
*
* @param {(value: unknown) => void} fn
* @param {unknown} value
* @return {boolean}
*/
function not(fn, value) {
try {
fn(value)
return false
} catch {
return true
}
}
/**
*
* @param {string} row
* @return {{projectId: string, hash: string}}
*/
function parseCSVRow(row) {
const [projectId, hash] = row.split(',')
assert.mongoId(projectId, `invalid projectId ${projectId}`)
assert.blobHash(hash, `invalid hash ${hash}`)
return { projectId, hash }
}
/**
*
* @param {string} path
* @param {boolean} hasHeader
* @return {AsyncGenerator<{projectId: string, hash: string}, void, *>}
*/
async function* readCSV(path, hasHeader) {
let seenHeader = !hasHeader
let fh
try {
fh = await fs.promises.open(path, 'r')
} catch (error) {
console.error(`Could not open file: ${error}`)
return await gracefulClose(1)
}
for await (const line of fh.readLines()) {
if (!seenHeader) {
const [first, second] = line.split(',')
const noDataInHeader =
not(assert.mongoId, first) && not(assert.blobHash, second)
if (!noDataInHeader) {
console.error('Data found in header row')
return await gracefulClose(1)
}
seenHeader = true
continue
}
try {
yield parseCSVRow(line)
} catch (error) {
console.error(error instanceof Error ? error.message : error)
console.info(`Skipping invalid row: ${line}`)
}
}
}
function usage() {
console.info(
'Usage: remove_blobs_from_backup.mjs --input <path> [--commit] [--header] [--force] [--verbose]'
)
}
if (!args.input) {
console.error('--input was missing')
usage()
await gracefulClose(1)
}
/**
*
* @param {string} projectId
* @param {string} hash
* @return {Promise<void>}
*/
async function deleteBlob(projectId, hash) {
const path = makeProjectKey(projectId, hash)
if (args.commit) {
await backupPersistor.deleteObject(projectBlobsBucket, path)
} else {
console.log(`DELETE: ${path}`)
}
}
/**
*
* @param {string} projectId
* @param {string} hash
* @return {Promise<void>}
*/
async function canDeleteBlob(projectId, hash) {
let historyId
try {
historyId = await getHistoryId(projectId)
} catch (error) {
if (args.verbose) {
console.error(error)
}
throw new Error(`No history ID found for project ${projectId}, skipping`)
}
if (historyId === projectId) {
throw new Error(
`Project ID and history ID are the same for ${projectId} - use --force to delete anyway`
)
}
// TODO: fix assert.postgresId to handle integers better and then stop coercing to string below
assert.postgresId(
`${historyId}`,
`History ID ${historyId} does not appear to be for a postgres project`
)
try {
await verifyBlobs(`${historyId}`, [hash])
} catch (error) {
if (args.verbose) {
console.error(error)
}
throw new Error(
`Blob ${hash} is not backed up for project ${projectId} - use --force to delete anyway`
)
}
}
if (!args.commit) {
console.log('DRY RUN: provide --commit to perform operations')
}
if (args.force) {
console.log(
'WARNING: --force is enabled, blobs will be deleted regardless of backup status'
)
await setTimeout(5_000)
}
let deleted = 0
let errors = 0
for await (const { projectId, hash } of readCSV(args.input, args.header)) {
if (!args.force) {
try {
await canDeleteBlob(projectId, hash)
} catch (error) {
console.error(error instanceof Error ? error.message : error)
continue
}
}
try {
await deleteBlob(projectId, hash)
deleted++
} catch (error) {
errors++
console.error(error)
}
}
console.log(`Deleted: ${deleted}`)
console.log(`Errors: ${errors}`)
await gracefulClose()

View File

@@ -0,0 +1,254 @@
import commandLineArgs from 'command-line-args'
import {
loadAtVersion,
getChunkMetadataForVersion,
getProjectChunksFromVersion,
} from '../lib/chunk_store/index.js'
import { client } from '../lib/mongodb.js'
import knex from '../lib/knex.js'
import redis from '../lib/redis.js'
import {
loadGlobalBlobs,
BlobStore,
makeProjectKey,
} from '../lib/blob_store/index.js'
import { TextDecoder } from 'node:util'
import {
backupPersistor,
chunksBucket,
projectBlobsBucket,
} from '../lib/backupPersistor.mjs'
import fs from 'node:fs'
import { pipeline } from 'node:stream/promises'
import os from 'node:os'
import path from 'node:path'
import { createHash } from 'node:crypto'
import projectKey from '../lib/project_key.js'
import { createGunzip } from 'node:zlib'
import { text } from 'node:stream/consumers'
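// Inspect a project's history data: list its chunks, show the chunk for a
// given version (--version), or show a blob (--blob), reading either from
// local storage or from the remote backup (--remote).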
const optionDefinitions = [
{ name: 'historyId', alias: 'p', type: String },
{ name: 'version', alias: 'v', type: Number },
{ name: 'blob', alias: 'b', type: String },
{ name: 'remote', alias: 'r', type: Boolean },
{ name: 'keep', alias: 'k', type: Boolean },
]
function makeChunkKey(projectId, startVersion) {
return path.join(projectKey.format(projectId), projectKey.pad(startVersion))
}
async function listChunks(historyId) {
for await (const chunkRecord of getProjectChunksFromVersion(historyId, 0)) {
console.log('Chunk record:', chunkRecord)
}
}
async function fetchChunkLocal(historyId, version) {
const chunkRecord = await getChunkMetadataForVersion(historyId, version)
const chunk = await loadAtVersion(historyId, version)
return { key: version, chunk, metadata: chunkRecord, source: 'local storage' }
}
async function fetchChunkRemote(historyId, version) {
const chunkRecord = await getChunkMetadataForVersion(historyId, version)
const startVersion = chunkRecord.startVersion
const key = makeChunkKey(historyId, startVersion)
const backupPersistorForProject = await backupPersistor.forProject(
chunksBucket,
key
)
const backupChunkStream = await backupPersistorForProject.getObjectStream(
chunksBucket,
key
)
const backupStr = await text(backupChunkStream.pipe(createGunzip()))
return {
key,
chunk: JSON.parse(backupStr),
metadata: chunkRecord,
source: 'remote backup',
}
}
async function displayChunk(historyId, version, options) {
const { key, chunk, metadata, source } = await (options.remote
? fetchChunkRemote(historyId, version)
: fetchChunkLocal(historyId, version))
console.log('Source:', source)
console.log('Chunk record', metadata)
console.log('Key', key)
// console.log('Number of changes', chunk.getChanges().length)
console.log(JSON.stringify(chunk))
}
async function fetchBlobRemote(historyId, blobHash) {
const backupPersistorForProject = await backupPersistor.forProject(
projectBlobsBucket,
makeProjectKey(historyId, '')
)
const blobKey = makeProjectKey(historyId, blobHash)
return {
stream: await backupPersistorForProject.getObjectStream(
projectBlobsBucket,
blobKey,
{ autoGunzip: true }
),
metadata: { hash: blobHash },
source: 'remote backup',
}
}
async function fetchBlobLocal(historyId, blobHash) {
const blobStore = new BlobStore(historyId)
const blob = await blobStore.getBlob(blobHash)
if (!blob) throw new Error(`Blob ${blobHash} not found`)
return {
stream: await blobStore.getStream(blobHash),
metadata: blob,
source: 'local storage',
}
}
async function displayBlobContent(filepath, metadata, source, blobHash) {
console.log('Source:', source)
console.log('Blob metadata:', metadata)
// Compute git hash using streaming
const stat = fs.statSync(filepath)
const header = `blob ${stat.size}\0`
const hash = createHash('sha1')
hash.update(header)
const hashStream = fs.createReadStream(filepath)
for await (const chunk of hashStream) {
hash.update(chunk)
}
const gitHash = hash.digest('hex')
// Check content type and display preview
const fd = fs.openSync(filepath, 'r')
try {
const headBuf = Buffer.alloc(16)
const tailBuf = Buffer.alloc(16)
try {
// Decode the stream with TextDecoder in fatal mode to check for valid UTF-8
const textStream = fs.createReadStream(filepath)
const decoder = new TextDecoder('utf-8', { fatal: true })
for await (const chunk of textStream) {
decoder.decode(chunk, { stream: true })
}
decoder.decode()
// If we get here, it's valid UTF-8
if (stat.size <= 1024) {
console.log('Content (text):', fs.readFileSync(filepath, 'utf8'))
} else {
console.log('Content (text, truncated):')
console.log(` Length: ${stat.size} bytes`)
fs.readSync(fd, headBuf, 0, 16, 0)
fs.readSync(fd, tailBuf, 0, 16, stat.size - 16)
console.log(
' Content:',
headBuf.toString('utf8') +
' ...(truncated)... ' +
tailBuf.toString('utf8')
)
}
} catch (e) {
// Binary content - show head and tail
console.log('Content (binary):')
console.log(` Length: ${stat.size} bytes`)
if (stat.size <= 32) {
// Small file - read it all
const buf = Buffer.alloc(stat.size)
fs.readSync(fd, buf, 0, stat.size, 0)
const hexBytes = buf.toString('hex').match(/../g).join(' ')
console.log(' Bytes:', hexBytes)
} else {
// Read tail for large files
fs.readSync(fd, headBuf, 0, 16, 0)
fs.readSync(fd, tailBuf, 0, 16, stat.size - 16)
const headHex = headBuf.toString('hex').match(/../g).join(' ')
const tailHex = tailBuf.toString('hex').match(/../g).join(' ')
console.log(' Bytes:', headHex + ' ... ' + tailHex)
}
console.log(' Git-style SHA1:', gitHash)
if (gitHash !== blobHash) {
console.log(' Warning: Git hash differs from blob hash!')
console.log(' Blob hash:', blobHash)
}
}
} finally {
fs.closeSync(fd)
}
}
async function withTempDir(prefix, fn, options = {}) {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), prefix))
try {
return await Promise.resolve(fn(tmpDir))
} finally {
if (!options.keep) {
fs.rmSync(tmpDir, { recursive: true, force: true })
} else {
console.log('Keeping temporary file:', path.join(tmpDir, 'blob'))
}
}
}
async function displayBlob(historyId, blobHash, options) {
try {
const { stream, metadata, source } = await (options.remote
? fetchBlobRemote(historyId, blobHash)
: fetchBlobLocal(historyId, blobHash))
await withTempDir(
'blob-show-',
async tmpDir => {
const tmpPath = path.join(tmpDir, 'blob')
await pipeline(stream, fs.createWriteStream(tmpPath))
await displayBlobContent(tmpPath, metadata, source, blobHash)
},
{ keep: options.keep }
)
} catch (err) {
if (err.code === 'NoSuchKey') {
throw new Error(`Blob ${blobHash} not found in backup`)
}
throw err
}
}
async function main() {
const { historyId, version, blob, remote, keep } =
commandLineArgs(optionDefinitions)
if (!historyId) {
console.error('Error: --historyId is required.')
process.exit(1)
}
await loadGlobalBlobs()
if (version != null) {
await displayChunk(historyId, version, { remote })
} else if (blob != null) {
await displayBlob(historyId, blob, { remote, keep })
} else {
await listChunks(historyId)
}
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exit(1)
})
.finally(() => {
knex.destroy().catch(err => console.error('Error closing Postgres:', err))
client.close().catch(err => console.error('Error closing MongoDB:', err))
redis
.disconnect()
.catch(err => console.error('Error disconnecting Redis:', err))
})

View File

@@ -0,0 +1,153 @@
// @ts-check
import { ObjectId } from 'mongodb'
import knex from '../lib/knex.js'
import {
batchedUpdate,
objectIdFromInput,
READ_PREFERENCE_SECONDARY,
} from '@overleaf/mongo-utils/batchedUpdate.js'
import {
GLOBAL_BLOBS,
loadGlobalBlobs,
makeProjectKey,
} from '../lib/blob_store/index.js'
import {
backedUpBlobs as backedUpBlobsCollection,
db,
client,
} from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import commandLineArgs from 'command-line-args'
import fs from 'node:fs'
const projectsCollection = db.collection('projects')
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
function parseArgs() {
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
const args = commandLineArgs([
{
name: 'BATCH_RANGE_START',
type: String,
defaultValue: PUBLIC_LAUNCH_DATE.toISOString(),
},
{
name: 'BATCH_RANGE_END',
type: String,
defaultValue: new Date().toISOString(),
},
{
name: 'output',
type: String,
alias: 'o',
},
])
const BATCH_RANGE_START = objectIdFromInput(
args['BATCH_RANGE_START']
).toString()
const BATCH_RANGE_END = objectIdFromInput(args['BATCH_RANGE_END']).toString()
if (!args['output']) {
throw new Error('missing --output')
}
const OUTPUT_STREAM = fs.createWriteStream(args['output'])
return {
BATCH_RANGE_START,
BATCH_RANGE_END,
OUTPUT_STREAM,
}
}
const { BATCH_RANGE_START, BATCH_RANGE_END, OUTPUT_STREAM } = parseArgs()
// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
}
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated, draining queue')
}
async function processBatch(batch) {
if (gracefulShutdownInitiated) {
throw new Error('graceful shutdown: aborting batch processing')
}
const N = batch.length
const firstId = batch[0]._id
const lastId = batch[N - 1]._id
const projectCursor = await projectsCollection.find(
{ _id: { $gte: firstId, $lte: lastId } },
{
projection: { _id: 1, 'overleaf.history.id': 1, lastUpdated: 1 },
readPreference: READ_PREFERENCE_SECONDARY,
}
)
const projectMap = new Map()
for await (const project of projectCursor) {
projectMap.set(project._id.toString(), project)
}
for (const project of batch) {
const projectId = project._id.toString()
const projectRecord = projectMap.get(projectId)
if (!projectRecord) {
console.error(`project not found: ${projectId}`)
continue
}
if (!projectRecord.overleaf?.history?.id) {
console.error(`project missing history: ${projectId}`)
continue
}
const historyId = projectRecord.overleaf.history.id.toString()
const prefix = `${projectId},${projectRecord.lastUpdated.toISOString()},`
const hashes = project.blobs.map(blob => blob.toString('hex'))
const projectBlobHashes = hashes.filter(hash => !GLOBAL_BLOBS.has(hash))
if (projectBlobHashes.length < hashes.length) {
console.warn(
`project ${projectId} has ${hashes.length - projectBlobHashes.length} global blobs`
)
}
const rows = projectBlobHashes.map(
hash => prefix + makeProjectKey(historyId, hash) + '\n'
)
OUTPUT_STREAM.write(rows.join(''))
}
}
async function main() {
await loadGlobalBlobs()
OUTPUT_STREAM.write('projectId,lastUpdated,path\n')
await batchedUpdate(
backedUpBlobsCollection,
{},
processBatch,
{},
{},
{ BATCH_RANGE_START, BATCH_RANGE_END }
)
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exitCode = 1
})
.finally(() => {
knex.destroy().catch(err => {
console.error('Error closing Postgres connection:', err)
})
client.close().catch(err => console.error('Error closing MongoDB:', err))
redis.disconnect().catch(err => {
console.error('Error disconnecting Redis:', err)
})
})

View File

@@ -0,0 +1,21 @@
import logger from '@overleaf/logger'
import commandLineArgs from 'command-line-args'
import { verifyBlobs } from '../lib/backupVerifier.mjs'
const { historyId, hashes } = commandLineArgs([
{ name: 'historyId', type: String },
{ name: 'hashes', type: String, multiple: true, defaultOption: true },
])
if (hashes.length === 0) {
throw new Error('missing --hashes flag')
}
try {
await verifyBlobs(historyId, hashes)
console.log('OK')
process.exit(0)
} catch (err) {
logger.err({ err }, 'failed to verify blob')
process.exit(1)
}

View File

@@ -0,0 +1,177 @@
import fs from 'node:fs'
import { makeProjectKey } from '../lib/blob_store/index.js'
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import commandLineArgs from 'command-line-args'
import OError from '@overleaf/o-error'
import assert from '../lib/assert.js'
import { client, projects } from '../lib/mongodb.js'
import { ObjectId } from 'mongodb'
import { setTimeout } from 'node:timers/promises'
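// Check a CSV of backed-up project blob paths against the backup bucket,
// counting projects missing from Mongo, projects without a backup DEK, and
// blobs missing from the backup.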
const { input, verbose } = commandLineArgs([
{ name: 'input', type: String },
{ name: 'verbose', type: Boolean, defaultValue: false },
])
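// Each row starts with an object path; the parser assumes the first three
// segments encode the history id (reversed) and the last two segments hold
// the blob hash split as 2 + 38 hex characters.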
function parseCSVRow(row) {
const [path] = row.split(',')
const pathSegments = path.split('/')
const historyId = `${pathSegments[0]}${pathSegments[1]}${pathSegments[2]}`
.split('')
.reverse()
.join('')
return { historyId, path, hash: `${pathSegments[3]}${pathSegments[4]}` }
}
async function* readCSV(path) {
let fh
try {
fh = await fs.promises.open(path, 'r')
} catch (error) {
console.error(`Could not open file: ${error}`)
throw error
}
for await (const line of fh.readLines()) {
try {
const row = parseCSVRow(line)
yield row
} catch (error) {
console.error(error instanceof Error ? error.message : error)
console.log(`Skipping invalid row: ${line}`)
}
}
}
class MissingDEKError extends OError {}
class InvalidHistoryIdError extends OError {}
class MissingProjectError extends OError {}
class MissingBlobError extends OError {}
async function getProjectPersistor(historyId) {
try {
return await backupPersistor.forProjectRO(
projectBlobsBucket,
makeProjectKey(historyId, '')
)
} catch (err) {
if (err instanceof NotFoundError) {
throw new MissingDEKError('dek does not exist', { historyId }, err)
}
throw err
}
}
async function checkBlobExists(path, historyId) {
const persistor = await getProjectPersistor(historyId)
return await persistor.getObjectSize(projectBlobsBucket, path)
}
let total = 0
const errors = {
invalidProjectId: 0,
notBackedUpProjectId: 0,
missingBlob: 0,
notInMongo: 0,
unknown: 0,
}
const notInMongoProjectIds = new Set()
const notBackedUpProjectIds = new Set()
let stopping = false
process.on('SIGTERM', () => {
console.log('SIGTERM received')
stopping = true
})
process.on('SIGINT', () => {
console.log('SIGINT received')
stopping = true
})
/**
*
* @param {string} historyId
* @param {string} path
* @param {string} hash
* @return {Promise<void>}
*/
async function checkPath(historyId, path, hash) {
try {
assert.mongoId(historyId)
} catch (error) {
throw new InvalidHistoryIdError('invalid history id', { historyId })
}
if (notInMongoProjectIds.has(historyId)) {
throw new MissingProjectError('project not in mongo', { historyId })
}
if (notBackedUpProjectIds.has(historyId)) {
throw new MissingDEKError('project not backed up', { historyId })
}
const project = await projects.findOne({ _id: new ObjectId(historyId) })
if (!project) {
notInMongoProjectIds.add(historyId)
throw new MissingProjectError('project not in mongo', { historyId })
}
try {
await checkBlobExists(path, historyId)
} catch (error) {
if (error instanceof NotFoundError) {
throw new MissingBlobError('missing blob', { historyId, hash })
}
if (error instanceof MissingDEKError) {
notBackedUpProjectIds.add(historyId)
}
throw error
}
}
for await (const line of readCSV(input)) {
if (stopping) break
total++
if (total % 10_000 === 0) {
console.log(`checked ${total}`)
}
const { historyId, path, hash } = line
try {
await checkPath(historyId, path, hash)
if (verbose) {
console.log(`✓ Project ${historyId} has ${hash} backed up`)
}
} catch (error) {
if (error instanceof InvalidHistoryIdError) {
errors.invalidProjectId++
console.warn(`invalid historyId ${historyId}`)
continue
} else if (error instanceof MissingProjectError) {
errors.notInMongo++
console.warn(`✗ project ${historyId} not in mongo`)
continue
} else if (error instanceof MissingDEKError) {
errors.notBackedUpProjectId++
console.error(`✗ Project DEK ${historyId} not found`)
continue
} else if (error instanceof MissingBlobError) {
errors.missingBlob++
console.error(`✗ missing blob ${hash} from project ${historyId}`)
continue
}
errors.unknown++
console.error(error)
}
}
console.log(`total checked: ${total}`)
console.log(`invalid project id: ${errors.invalidProjectId}`)
console.log(`not found in mongo: ${errors.notInMongo}`)
console.log(`missing blob: ${errors.missingBlob}`)
console.log(`project not backed up: ${errors.notBackedUpProjectId}`)
console.log(`unknown errors: ${errors.unknown}`)
await client.close()
await setTimeout(100)
process.exit()

View File

@@ -0,0 +1,35 @@
import commandLineArgs from 'command-line-args'
import { verifyProjectWithErrorContext } from '../lib/backupVerifier.mjs'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import { setTimeout } from 'node:timers/promises'
import { loadGlobalBlobs } from '../lib/blob_store/index.js'
const { historyId } = commandLineArgs([{ name: 'historyId', type: String }])
async function gracefulShutdown(code = process.exitCode) {
await knex.destroy()
await client.close()
await redis.disconnect()
await setTimeout(1_000)
process.exit(code)
}
if (!historyId) {
console.error('missing --historyId')
process.exitCode = 1
await gracefulShutdown()
}
await loadGlobalBlobs()
try {
await verifyProjectWithErrorContext(historyId)
console.log('OK')
} catch (error) {
console.error('error verifying', error)
process.exitCode = 1
} finally {
await gracefulShutdown()
}

View File

@@ -0,0 +1,217 @@
// @ts-check
import commandLineArgs from 'command-line-args'
import {
setWriteMetrics,
verifyProjectsCreatedInDateRange,
verifyRandomProjectSample,
verifyProjectsUpdatedInDateRange,
} from '../../backupVerifier/ProjectVerifier.mjs'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import { setTimeout } from 'node:timers/promises'
import logger from '@overleaf/logger'
import { loadGlobalBlobs } from '../lib/blob_store/index.js'
import { getDatesBeforeRPO } from '../../backupVerifier/utils.mjs'
import { EventEmitter } from 'node:events'
import { mongodb } from '../index.js'
import redis from '../lib/redis.js'
logger.logger.level('fatal')
const usageMessage = [
'Usage: node verify_sampled_projects.mjs [--startDate <start>] [--endDate <end>] [--nProjects <n>] [--verbose] [--usage] [--writeMetrics] [--concurrency <n>] [--strategy <range|random|recent>]',
'strategy: defaults to "range"; startDate and endDate are required for "range" strategy',
].join('\n')
/**
* Gracefully shutdown the process
* @param code
* @return {Promise<void>}
*/
async function gracefulShutdown(code = process.exitCode) {
await knex.destroy()
await client.close()
await redis.disconnect()
await setTimeout(1_000)
process.exit(code)
}
const STATS = {
verifiable: 0,
unverifiable: 0,
}
/**
* @typedef {Object} CLIOptions
* @property {(signal: EventEmitter) => Promise<VerificationJobStatus>} projectVerifier
* @property {boolean} verbose
*/
/**
* @typedef {import('../../backupVerifier/types.d.ts').VerificationJobStatus} VerificationJobStatus
*/
/**
*
* @return {CLIOptions}
*/
function getOptions() {
const {
startDate,
endDate,
concurrency,
writeMetrics,
verbose,
nProjects,
strategy,
usage,
} = commandLineArgs([
{ name: 'startDate', type: String },
{ name: 'endDate', type: String },
{ name: 'concurrency', type: Number, defaultValue: 1 },
{ name: 'verbose', type: Boolean, defaultValue: false },
{ name: 'nProjects', type: Number, defaultValue: 10 },
{ name: 'usage', type: Boolean, defaultValue: false },
{ name: 'writeMetrics', type: Boolean, defaultValue: false },
{ name: 'strategy', type: String, defaultValue: 'range' },
])
if (usage) {
console.log(usageMessage)
process.exit(0)
}
if (!['range', 'random', 'recent'].includes(strategy)) {
throw new Error(`Invalid strategy: ${strategy}`)
}
setWriteMetrics(writeMetrics)
switch (strategy) {
case 'random':
console.log('Verifying random projects')
return {
verbose,
projectVerifier: signal => verifyRandomProjectSample(nProjects, signal),
}
case 'recent':
return {
verbose,
projectVerifier: async signal => {
const { startDate, endDate } = getDatesBeforeRPO(3 * 3600)
return await verifyProjectsUpdatedInDateRange(
startDate,
endDate,
nProjects,
signal
)
},
}
case 'range':
default: {
if (!startDate || !endDate) {
throw new Error(usageMessage)
}
const start = Date.parse(startDate)
const end = Date.parse(endDate)
if (Number.isNaN(start)) {
throw new Error(`Invalid start date: ${startDate}`)
}
if (Number.isNaN(end)) {
throw new Error(`Invalid end date: ${endDate}`)
}
if (verbose) {
console.log(`Verifying from ${startDate} to ${endDate}`)
console.log(`Concurrency: ${concurrency}`)
}
STATS.ranges = 0
return {
projectVerifier: signal =>
verifyProjectsCreatedInDateRange({
startDate: new Date(start),
endDate: new Date(end),
projectsPerRange: nProjects,
concurrency,
signal,
}),
verbose,
}
}
}
}
/**
* @type {CLIOptions}
*/
let options
try {
options = getOptions()
} catch (error) {
console.error(error)
process.exitCode = 1
await gracefulShutdown(1)
process.exit() // just here so the type checker knows that the process will exit
}
const { projectVerifier, verbose } = options
if (verbose) {
logger.logger.level('debug')
}
/**
*
* @param {Array<string>} array
* @param {string} matchString
* @return {number}
*/
function sumStringInstances(array, matchString) {
return array.reduce((total, string) => {
return string === matchString ? total + 1 : total
}, 0)
}
/**
*
* @param {VerificationJobStatus} stats
*/
function displayStats(stats) {
console.log(`Verified projects: ${stats.verified}`)
console.log(`Total projects sampled: ${stats.total}`)
if (stats.errorTypes.length > 0) {
console.log('Errors:')
for (const error of new Set(stats.errorTypes)) {
console.log(`${error}: ${sumStringInstances(stats.errorTypes, error)}`)
}
}
}
const shutdownEmitter = new EventEmitter()
shutdownEmitter.on('shutdown', async () => {
await gracefulShutdown()
})
process.on('SIGTERM', () => {
shutdownEmitter.emit('shutdown')
})
process.on('SIGINT', () => {
shutdownEmitter.emit('shutdown')
})
await loadGlobalBlobs()
try {
const stats = await projectVerifier(shutdownEmitter)
displayStats(stats)
console.log(`completed`)
} catch (error) {
console.error(error)
console.log('completed with errors')
process.exitCode = 1
} finally {
console.log('shutting down')
await gracefulShutdown()
}

View File

@@ -0,0 +1,109 @@
const commandLineArgs = require('command-line-args')
const BPromise = require('bluebird')
const timersPromises = require('node:timers/promises')
const { knex, historyStore } = require('..')
const MAX_POSTGRES_INTEGER = 2147483647
const DEFAULT_BATCH_SIZE = 1000
const DEFAULT_CONCURRENCY = 1
const MAX_RETRIES = 10
const RETRY_DELAY_MS = 5000
async function main() {
const options = parseOptions()
let batchStart = options.minId
while (batchStart <= options.maxId) {
const chunks = await getChunks(batchStart, options.maxId, options.batchSize)
if (chunks.length === 0) {
// No results. We're done.
break
}
const batchEnd = chunks[chunks.length - 1].id
await processBatch(chunks, options)
console.log(`Processed chunks ${batchStart} to ${batchEnd}`)
batchStart = batchEnd + 1
}
}
function parseOptions() {
const args = commandLineArgs([
{ name: 'min-id', type: Number, defaultValue: 1 },
{
name: 'max-id',
type: Number,
defaultValue: MAX_POSTGRES_INTEGER,
},
{ name: 'batch-size', type: Number, defaultValue: DEFAULT_BATCH_SIZE },
{ name: 'concurrency', type: Number, defaultValue: DEFAULT_CONCURRENCY },
])
return {
minId: args['min-id'],
maxId: args['max-id'],
batchSize: args['batch-size'],
concurrency: args.concurrency,
}
}
async function getChunks(minId, maxId, batchSize) {
const chunks = await knex('chunks')
.where('id', '>=', minId)
.andWhere('id', '<=', maxId)
.orderBy('id')
.limit(batchSize)
return chunks
}
async function processBatch(chunks, options) {
let retries = 0
while (true) {
const results = await BPromise.map(chunks, processChunk, {
concurrency: options.concurrency,
})
const failedChunks = results
.filter(result => !result.success)
.map(result => result.chunk)
if (failedChunks.length === 0) {
// All chunks processed. Carry on.
break
}
// Some projects failed. Retry.
retries += 1
if (retries > MAX_RETRIES) {
console.log('Too many retries processing chunks. Giving up.')
process.exit(1)
}
console.log(
`Retrying chunks: ${failedChunks.map(chunk => chunk.id).join(', ')}`
)
await timersPromises.setTimeout(RETRY_DELAY_MS)
chunks = failedChunks
}
}
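// Derive a chunk's start_version from its stored end_version minus the number
// of changes in its raw history, then persist it.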
async function processChunk(chunk) {
try {
const rawHistory = await historyStore.loadRaw(
chunk.doc_id.toString(),
chunk.id
)
const startVersion = chunk.end_version - rawHistory.changes.length
await knex('chunks')
.where('id', chunk.id)
.update({ start_version: startVersion })
return { chunk, success: true }
} catch (err) {
console.error(`Failed to process chunk ${chunk.id}:`, err.stack)
return { chunk, success: false }
}
}
main()
.then(() => {
process.exit()
})
.catch(err => {
console.error(err)
process.exit(1)
})

View File

@@ -0,0 +1,107 @@
/**
* Compress changes for projects that have too many text operations.
*
* Usage:
*
* node tasks/compress_changes.js CSV_FILE
*
* where CSV_FILE contains a list of project ids in the first column
*/
const fs = require('node:fs')
const BPromise = require('bluebird')
const { History } = require('overleaf-editor-core')
const { historyStore, chunkStore } = require('..')
const CONCURRENCY = 10
async function main() {
const filename = process.argv[2]
const projectIds = await readCsv(filename)
const chunks = []
for (const projectId of projectIds) {
const chunkIds = await chunkStore.getProjectChunkIds(projectId)
chunks.push(...chunkIds.map(id => ({ id, projectId })))
}
let totalCompressed = 0
await BPromise.map(
chunks,
async chunk => {
try {
const history = await getHistory(chunk)
const numCompressed = compressChanges(history)
if (numCompressed > 0) {
await storeHistory(chunk, history)
console.log(
`Compressed project ${chunk.projectId}, chunk ${chunk.id}`
)
}
totalCompressed += numCompressed
} catch (err) {
console.log(err)
}
},
{ concurrency: CONCURRENCY }
)
console.log('CHANGES:', totalCompressed)
}
async function readCsv(filename) {
const csv = await fs.promises.readFile(filename, 'utf-8')
const lines = csv.trim().split('\n')
const projectIds = lines.map(line => line.split(',')[0])
return projectIds
}
async function getHistory(chunk) {
const rawHistory = await historyStore.loadRaw(chunk.projectId, chunk.id)
const history = History.fromRaw(rawHistory)
return history
}
async function storeHistory(chunk, history) {
const rawHistory = history.toRaw()
await historyStore.storeRaw(chunk.projectId, chunk.id, rawHistory)
}
function compressChanges(history) {
let numCompressed = 0
for (const change of history.getChanges()) {
const newOperations = compressOperations(change.operations)
if (newOperations.length !== change.operations.length) {
numCompressed++
}
change.setOperations(newOperations)
}
return numCompressed
}
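// Rough illustration (not the exact editor-core API): given operations
// [A, B, C] where A.canBeComposedWith(B) but the composed AB cannot be
// composed with C, the loop below yields [AB, C]. The output is never longer
// than the input, and compressChanges() only counts a change as compressed
// when the operation list actually shrinks.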
function compressOperations(operations) {
if (!operations.length) return []
const newOperations = []
let currentOperation = operations[0]
for (let operationId = 1; operationId < operations.length; operationId++) {
const nextOperation = operations[operationId]
if (currentOperation.canBeComposedWith(nextOperation)) {
currentOperation = currentOperation.compose(nextOperation)
} else {
// currentOperation and nextOperation cannot be composed. Push the
// currentOperation and start over with nextOperation.
newOperations.push(currentOperation)
currentOperation = nextOperation
}
}
newOperations.push(currentOperation)
return newOperations
}
main()
.then(() => {
process.exit()
})
.catch(err => {
console.error(err)
process.exit(1)
})

View File

@@ -0,0 +1,294 @@
#!/usr/bin/env node
const { promisify } = require('node:util')
const BPromise = require('bluebird')
const commandLineArgs = require('command-line-args')
const config = require('config')
const fs = require('node:fs')
const readline = require('node:readline')
const { History } = require('overleaf-editor-core')
const { knex, historyStore, persistor } = require('..')
const projectKey = require('../lib/project_key')
const MAX_POSTGRES_INTEGER = 2147483647
const DEFAULT_BATCH_SIZE = 1000
const MAX_RETRIES = 10
const RETRY_DELAY_MS = 5000
// Obtain a preconfigured GCS client through an undocumented property of
// object-persistor. Sorry about that. We need the GCS client because we use
// operations that are not implemented in object-persistor.
const gcsClient = persistor.storage
const globalBucket = gcsClient.bucket(config.get('blobStore.globalBucket'))
const projectBucket = gcsClient.bucket(config.get('blobStore.projectBucket'))
const delay = promisify(setTimeout)
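// Example invocation (hypothetical script path and values; this diff does not
// show the actual filename). The --global-blobs file is expected to list one
// blob hash per line; hashes found there stay in the global bucket and are
// not copied into per-project storage.
//
//   $ node tasks/copy_blobs_to_projects.js --global-blobs global_blobs.txt \
//       --min-project-id 1 --max-project-id 500000 --concurrency 4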
async function main() {
const options = commandLineArgs([
{ name: 'global-blobs', type: String },
{ name: 'min-project-id', type: Number, defaultValue: 1 },
{
name: 'max-project-id',
type: Number,
defaultValue: MAX_POSTGRES_INTEGER,
},
{ name: 'batch-size', type: Number, defaultValue: DEFAULT_BATCH_SIZE },
{ name: 'concurrency', type: Number, defaultValue: 1 },
])
if (!options['global-blobs']) {
console.error(
'You must specify a global blobs file with the --global-blobs option'
)
process.exit(1)
}
const globalBlobs = await readGlobalBlobs(options['global-blobs'])
const minProjectId = options['min-project-id']
const maxProjectId = options['max-project-id']
const batchSize = options['batch-size']
const concurrency = options.concurrency
console.log(`Keeping ${globalBlobs.size} global blobs`)
await run({ globalBlobs, minProjectId, maxProjectId, batchSize, concurrency })
console.log('Done.')
}
async function readGlobalBlobs(filename) {
const stream = fs.createReadStream(filename)
const reader = readline.createInterface({
input: stream,
crlfDelay: Infinity,
})
const blobs = new Set()
for await (const line of reader) {
blobs.add(line.trim())
}
return blobs
}
async function run(options) {
const { globalBlobs, minProjectId, maxProjectId, batchSize, concurrency } =
options
let batchStart = minProjectId
while (batchStart <= maxProjectId) {
let projectIds = await getProjectIds(batchStart, maxProjectId, batchSize)
if (projectIds.length === 0) {
break
}
const batchEnd = projectIds[projectIds.length - 1]
console.log(`Processing projects ${batchStart} to ${batchEnd}`)
const chunkIdsByProject = await getChunkIdsByProject(projectIds)
let retries = 0
while (true) {
const results = await BPromise.map(
projectIds,
async projectId =>
await processProject(
projectId,
chunkIdsByProject.get(projectId),
globalBlobs
),
{ concurrency }
)
const failedProjectIds = results
.filter(result => !result.success)
.map(result => result.projectId)
if (failedProjectIds.length === 0) {
// All projects were copied successfully. Carry on.
break
}
// Some projects failed. Retry.
retries += 1
if (retries > MAX_RETRIES) {
console.log(
`Too many retries processing projects ${batchStart} to ${batchEnd}. Giving up.`
)
process.exit(1)
}
console.log(`Retrying projects: ${failedProjectIds.join(', ')}`)
await delay(RETRY_DELAY_MS)
projectIds = failedProjectIds
}
// Set up next batch
batchStart = batchEnd + 1
}
}
async function getProjectIds(minProjectId, maxProjectId, batchSize) {
const projectIds = await knex('chunks')
.distinct('doc_id')
.where('doc_id', '>=', minProjectId)
.andWhere('doc_id', '<=', maxProjectId)
.orderBy('doc_id')
.limit(batchSize)
.pluck('doc_id')
return projectIds
}
async function getChunkIdsByProject(projectIds) {
const chunks = await knex('chunks')
.select('id', { projectId: 'doc_id' })
.where('doc_id', 'in', projectIds)
const chunkIdsByProject = new Map()
for (const projectId of projectIds) {
chunkIdsByProject.set(projectId, [])
}
for (const chunk of chunks) {
chunkIdsByProject.get(chunk.projectId).push(chunk.id)
}
return chunkIdsByProject
}
async function processProject(projectId, chunkIds, globalBlobs) {
try {
const blobHashes = await getBlobHashes(projectId, chunkIds)
const projectBlobHashes = blobHashes.filter(hash => !globalBlobs.has(hash))
const gcsSizesByHash = new Map()
for (const blobHash of projectBlobHashes) {
const blobSize = await copyBlobInGcs(projectId, blobHash)
if (blobSize != null) {
gcsSizesByHash.set(blobHash, blobSize)
}
}
const dbSizesByHash = await copyBlobsInDatabase(
projectId,
projectBlobHashes
)
compareBlobSizes(gcsSizesByHash, dbSizesByHash)
return { projectId, success: true }
} catch (err) {
console.error(`Failed to process project ${projectId}:`, err.stack)
return { projectId, success: false }
}
}
function compareBlobSizes(gcsSizesByHash, dbSizesByHash) {
// Throw an error if the database doesn't report as many blobs as GCS
if (dbSizesByHash.size !== gcsSizesByHash.size) {
throw new Error(
`the database reported ${dbSizesByHash.size} blobs copied, but GCS reported ${gcsSizesByHash.size} blobs copied`
)
}
const mismatches = []
for (const [hash, dbSize] of dbSizesByHash.entries()) {
if (gcsSizesByHash.get(hash) !== dbSize) {
mismatches.push(hash)
}
}
if (mismatches.length > 0) {
throw new Error(`blob size mismatch for hashes: ${mismatches.join(', ')}`)
}
}
async function getHistory(projectId, chunkId) {
const rawHistory = await historyStore.loadRaw(projectId, chunkId)
const history = History.fromRaw(rawHistory)
return history
}
async function getBlobHashes(projectId, chunkIds) {
const blobHashes = new Set()
for (const chunkId of chunkIds) {
const history = await getHistory(projectId, chunkId)
history.findBlobHashes(blobHashes)
}
return Array.from(blobHashes)
}
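// Key layout, using a hypothetical 40-character hash "abcdef0123..." and
// project id 1234 (projectKey.format() is assumed to produce the project
// prefix used in GCS):
//
//   global bucket:  ab/cd/ef0123...                           (hash split 2/2/36)
//   project bucket: <projectKey.format(1234)>/ab/cdef0123...  (hash split 2/38)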
async function copyBlobInGcs(projectId, blobHash) {
const globalBlobKey = [
blobHash.slice(0, 2),
blobHash.slice(2, 4),
blobHash.slice(4),
].join('/')
const projectBlobKey = [
projectKey.format(projectId),
blobHash.slice(0, 2),
blobHash.slice(2),
].join('/')
const globalBlobObject = globalBucket.file(globalBlobKey)
const projectBlobObject = projectBucket.file(projectBlobKey)
// Check if the project blob exists
let projectBlobMetadata = null
try {
;[projectBlobMetadata] = await projectBlobObject.getMetadata()
} catch (err) {
if (err.code !== 404) {
throw err
}
}
  // Check if the global blob exists
let globalBlobMetadata = null
try {
;[globalBlobMetadata] = await globalBlobObject.getMetadata()
} catch (err) {
if (err.code !== 404) {
throw err
}
}
if (projectBlobMetadata) {
// Project blob already exists. Compare the metadata if the global blob
// also exists and return early.
if (
globalBlobMetadata != null &&
(globalBlobMetadata.size !== projectBlobMetadata.size ||
globalBlobMetadata.md5Hash !== projectBlobMetadata.md5Hash)
) {
throw new Error(
`Project blob ${blobHash} in project ${projectId} doesn't match global blob`
)
}
return null
}
await globalBlobObject.copy(projectBlobObject)
// Paranoid check that the copy went well. The getMetadata() method returns
// an array, with the metadata in first position.
;[projectBlobMetadata] = await projectBlobObject.getMetadata()
if (
globalBlobMetadata.size !== projectBlobMetadata.size ||
globalBlobMetadata.md5Hash !== projectBlobMetadata.md5Hash
) {
    throw new Error(`Failed to copy blob ${blobHash} to project ${projectId}`)
}
return parseInt(projectBlobMetadata.size, 10)
}
async function copyBlobsInDatabase(projectId, blobHashes) {
const blobSizesByHash = new Map()
if (blobHashes.length === 0) {
return blobSizesByHash
}
const binaryBlobHashes = blobHashes.map(hash => Buffer.from(hash, 'hex'))
const result = await knex.raw(
`INSERT INTO project_blobs (
project_id, hash_bytes, byte_length, string_length
)
SELECT ?, hash_bytes, byte_length, string_length
FROM blobs
WHERE hash_bytes IN (${binaryBlobHashes.map(_ => '?').join(',')})
ON CONFLICT (project_id, hash_bytes) DO NOTHING
RETURNING hash_bytes, byte_length`,
[projectId, ...binaryBlobHashes]
)
for (const row of result.rows) {
blobSizesByHash.set(row.hash_bytes.toString('hex'), row.byte_length)
}
return blobSizesByHash
}
main()
.then(() => {
process.exit()
})
.catch(err => {
console.error(err)
process.exit(1)
})

View File

@@ -0,0 +1,36 @@
#!/usr/bin/env node
'use strict'
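// Example invocation (hypothetical values; --min-age is in seconds, and the
// --timeout unit depends on chunkStore.deleteOldChunks):
//
//   $ node tasks/delete_old_chunks.js --batch-size 100 --max-batches 10 \
//       --min-age 2592000 --timeout 60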
const commandLineArgs = require('command-line-args')
const { chunkStore } = require('../')
async function deleteOldChunks(options) {
const deletedChunksTotal = await chunkStore.deleteOldChunks(options)
console.log(`Deleted ${deletedChunksTotal} old chunks`)
}
exports.deleteOldChunks = deleteOldChunks
if (require.main === module) {
const options = commandLineArgs([
{ name: 'batch-size', type: Number },
{ name: 'max-batches', type: Number },
{ name: 'min-age', type: Number },
{ name: 'timeout', type: Number },
{ name: 'verbose', type: Boolean, alias: 'v', defaultValue: false },
])
deleteOldChunks({
batchSize: options['batch-size'],
maxBatches: options['max-batches'],
timeout: options.timeout,
minAgeSecs: options['min-age'],
})
.then(() => {
process.exit()
})
.catch(err => {
console.error(err)
process.exit(1)
})
}

View File

@@ -0,0 +1,156 @@
#!/usr/bin/env node
'use strict'
const commandLineArgs = require('command-line-args')
const { chunkStore } = require('..')
main()
.then(() => {
process.exit(0)
})
.catch(err => {
console.error(err)
process.exit(1)
})
async function main() {
const opts = commandLineArgs([
{ name: 'project-ids', type: String, multiple: true, defaultOption: true },
{ name: 'save', type: Boolean, defaultValue: false },
{ name: 'help', type: Boolean, defaultValue: false },
])
if (opts.help || opts['project-ids'] == null) {
console.log('Usage: fix_duplicate_versions [--save] PROJECT_ID...')
process.exit()
}
for (const projectId of opts['project-ids']) {
await processProject(projectId, opts.save)
}
if (!opts.save) {
console.log('\nThis was a dry run. Re-run with --save to persist changes.')
}
}
async function processProject(projectId, save) {
console.log(`Project ${projectId}:`)
const chunk = await chunkStore.loadLatest(projectId)
let numChanges = 0
numChanges += removeDuplicateProjectVersions(chunk)
numChanges += removeDuplicateDocVersions(chunk)
console.log(` ${numChanges > 0 ? numChanges : 'no'} changes`)
if (save && numChanges > 0) {
await replaceChunk(projectId, chunk)
}
}
function removeDuplicateProjectVersions(chunk) {
let numChanges = 0
let lastVersion = null
const { snapshot, changes } = chunk.history
if (snapshot.projectVersion != null) {
lastVersion = snapshot.projectVersion
}
for (const change of changes) {
if (change.projectVersion == null) {
// Not a project structure change. Ignore.
continue
}
if (
lastVersion != null &&
!areProjectVersionsIncreasing(lastVersion, change.projectVersion)
) {
// Duplicate. Remove all ops
console.log(
` Removing out-of-order project structure change: ${change.projectVersion} <= ${lastVersion}`
)
change.setOperations([])
delete change.projectVersion
numChanges++
} else {
lastVersion = change.projectVersion
}
}
return numChanges
}
function removeDuplicateDocVersions(chunk) {
let numChanges = 0
const lastVersions = new Map()
const { snapshot, changes } = chunk.history
if (snapshot.v2DocVersions != null) {
    for (const [docId, { v }] of Object.entries(snapshot.v2DocVersions.data)) {
      lastVersions.set(docId, v)
}
}
for (const change of changes) {
if (change.v2DocVersions == null) {
continue
}
// Collect all docs that have problematic versions
const badPaths = []
const badDocIds = []
for (const [docId, { pathname, v }] of Object.entries(
change.v2DocVersions.data
)) {
const lastVersion = lastVersions.get(docId)
if (lastVersion != null && v <= lastVersion) {
// Duplicate. Remove ops related to that doc
console.log(
` Removing out-of-order change for doc ${docId} (${pathname}): ${v} <= ${lastVersion}`
)
badPaths.push(pathname)
badDocIds.push(docId)
numChanges++
} else {
lastVersions.set(docId, v)
}
}
// Remove bad operations
if (badPaths.length > 0) {
change.setOperations(
change.operations.filter(
op => op.pathname == null || !badPaths.includes(op.pathname)
)
)
}
// Remove bad v2 doc versions
for (const docId of badDocIds) {
delete change.v2DocVersions.data[docId]
}
}
return numChanges
}
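// Project versions are "major.minor" strings. Example (hypothetical values):
// '41.3' parses to { major: 41, minor: 3 }, so '41.10' counts as increasing
// relative to '41.3' (10 > 3), whereas a plain string comparison would order
// them the other way. That is why the versions are parsed numerically below.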
function areProjectVersionsIncreasing(v1Str, v2Str) {
const v1 = parseProjectVersion(v1Str)
const v2 = parseProjectVersion(v2Str)
return v2.major > v1.major || (v2.major === v1.major && v2.minor > v1.minor)
}
function parseProjectVersion(version) {
const [major, minor] = version.split('.').map(x => parseInt(x, 10))
if (isNaN(major) || isNaN(minor)) {
throw new Error(`Invalid project version: ${version}`)
}
return { major, minor }
}
async function replaceChunk(projectId, chunk) {
const endVersion = chunk.getEndVersion()
const oldChunkId = await chunkStore.getChunkIdForVersion(
projectId,
endVersion
)
console.log(` Replacing chunk ${oldChunkId}`)
// The chunks table has a unique constraint on doc_id and end_version. Because
// we're replacing a chunk with the same end version, we need to destroy the
// chunk first.
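  // A sketch of the assumed constraint (the actual migration may differ):
  //
  //   ALTER TABLE chunks
  //     ADD CONSTRAINT chunks_doc_id_end_version_key UNIQUE (doc_id, end_version);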
await chunkStore.destroy(projectId, oldChunkId)
await chunkStore.create(projectId, chunk)
}

View File

@@ -0,0 +1 @@
exports.deleteOldChunks = require('./delete_old_chunks').deleteOldChunks