first commit

2025-04-24 13:11:28 +08:00
commit ff9c54d5e4
5960 changed files with 834111 additions and 0 deletions

File diff suppressed because it is too large.


@@ -0,0 +1,647 @@
// @ts-check
import Events from 'node:events'
import fs from 'node:fs'
import Stream from 'node:stream'
import { ObjectId } from 'mongodb'
import logger from '@overleaf/logger'
import OError from '@overleaf/o-error'
import { Blob } from 'overleaf-editor-core'
import {
BlobStore,
getStringLengthOfFile,
GLOBAL_BLOBS,
makeBlobForFile,
} from '../lib/blob_store/index.js'
import { db } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import readline from 'node:readline'
import { _blobIsBackedUp, backupBlob } from '../lib/backupBlob.mjs'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import filestorePersistor from '../lib/persistor.js'
import { setTimeout } from 'node:timers/promises'
// Silence warning.
Events.setMaxListeners(20)
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
/**
* @typedef {import("mongodb").Collection} Collection
* @typedef {import("mongodb").Collection<Project>} ProjectsCollection
* @typedef {import("mongodb").Collection<{project: Project}>} DeletedProjectsCollection
*/
/**
* @typedef {Object} FileRef
* @property {ObjectId} _id
* @property {string} hash
*/
/**
* @typedef {Object} Folder
* @property {Array<Folder>} folders
* @property {Array<FileRef>} fileRefs
*/
/**
* @typedef {Object} Project
* @property {ObjectId} _id
* @property {Array<Folder>} rootFolder
* @property {{history: {id: (number|string)}}} overleaf
*/
/**
* @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, FIX_MISSING_HASH: boolean, LOGS: string}}
*/
function parseArgs() {
const args = commandLineArgs([
{ name: 'fixNotFound', type: String, defaultValue: 'true' },
{ name: 'fixDeletePermission', type: String, defaultValue: 'true' },
{ name: 'fixHashMismatch', type: String, defaultValue: 'true' },
{ name: 'fixMissingHash', type: String, defaultValue: 'true' },
{ name: 'logs', type: String, defaultValue: '' },
])
/**
* commandLineArgs cannot handle --foo=false, so go the long way
* @param {string} name
* @return {boolean}
*/
function boolVal(name) {
const v = args[name]
if (['true', 'false'].includes(v)) return v === 'true'
throw new Error(`expected "true" or "false" for boolean option ${name}`)
}
return {
FIX_HASH_MISMATCH: boolVal('fixHashMismatch'),
FIX_DELETE_PERMISSION: boolVal('fixDeletePermission'),
FIX_NOT_FOUND: boolVal('fixNotFound'),
FIX_MISSING_HASH: boolVal('fixMissingHash'),
LOGS: args.logs,
}
}
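// Example invocation (a sketch; the script path and log file location are
// hypothetical, the flags are the ones parsed above and all fix flags default
// to "true"):
//   node <this-script>.mjs --logs=/tmp/file-errors.jsonl --fixNotFound=false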
const {
FIX_HASH_MISMATCH,
FIX_DELETE_PERMISSION,
FIX_NOT_FOUND,
FIX_MISSING_HASH,
LOGS,
} = parseArgs()
if (!LOGS) {
throw new Error('--logs parameter missing')
}
const BUFFER_DIR = fs.mkdtempSync(
process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
)
const USER_FILES_BUCKET_NAME = process.env.USER_FILES_BUCKET_NAME || ''
if (!USER_FILES_BUCKET_NAME) {
throw new Error('env var USER_FILES_BUCKET_NAME is missing')
}
// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode
const STREAM_HIGH_WATER_MARK = parseInt(
process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(),
10
)
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
/** @type {ProjectsCollection} */
const projectsCollection = db.collection('projects')
/** @type {DeletedProjectsCollection} */
const deletedProjectsCollection = db.collection('deletedProjects')
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated, draining queue')
}
class FileDeletedError extends OError {}
/** @type {Map<string,{project: Project, projectSoftDeleted: boolean}>} */
const PROJECT_CACHE = new Map()
/**
* @param {string} projectId
* @return {Promise<{project: Project, projectSoftDeleted: boolean}>}
*/
async function getProject(projectId) {
const cached = PROJECT_CACHE.get(projectId)
if (cached) return cached
let projectSoftDeleted
let project = await projectsCollection.findOne({
_id: new ObjectId(projectId),
})
if (project) {
projectSoftDeleted = false
} else {
const softDeleted = await deletedProjectsCollection.findOne({
'deleterData.deletedProjectId': new ObjectId(projectId),
project: { $exists: true },
})
if (!softDeleted) {
throw new OError('project hard-deleted')
}
project = softDeleted.project
projectSoftDeleted = true
}
PROJECT_CACHE.set(projectId, { projectSoftDeleted, project })
return { projectSoftDeleted, project }
}
/**
* @param {Folder} folder
* @param {string} fileId
* @return {{path: string, fileRef: FileRef, folder: Folder}|null}
*/
function getFileTreePath(folder, fileId) {
if (!folder) return null
let idx = 0
if (Array.isArray(folder.fileRefs)) {
for (const fileRef of folder.fileRefs) {
if (fileRef?._id.toString() === fileId) {
return {
fileRef,
path: `.fileRefs.${idx}`,
folder,
}
}
idx++
}
}
idx = 0
if (Array.isArray(folder.folders)) {
for (const child of folder.folders) {
const match = getFileTreePath(child, fileId)
if (match) {
return {
fileRef: match.fileRef,
folder: match.folder,
path: `.folders.${idx}${match.path}`,
}
}
idx++
}
}
return null
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<{fileRef: FileRef, folder: Folder, fullPath: string, query: Object, projectSoftDeleted: boolean}>}
*/
async function findFile(projectId, fileId) {
const { projectSoftDeleted, project } = await getProject(projectId)
const match = getFileTreePath(project.rootFolder[0], fileId)
if (!match) {
throw new FileDeletedError('file not found in file-tree', {
projectSoftDeleted,
})
}
const { path, fileRef, folder } = match
let fullPath
let query
if (projectSoftDeleted) {
fullPath = `project.rootFolder.0${path}`
query = {
'deleterData.deletedProjectId': new ObjectId(projectId),
[`${fullPath}._id`]: new ObjectId(fileId),
}
} else {
fullPath = `rootFolder.0${path}`
query = {
_id: new ObjectId(projectId),
[`${fullPath}._id`]: new ObjectId(fileId),
}
}
return {
projectSoftDeleted,
query,
fullPath,
fileRef,
folder,
}
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixNotFound(line) {
const { projectId, fileId, bucketName } = JSON.parse(line)
if (bucketName !== USER_FILES_BUCKET_NAME) {
throw new OError('not found case for another bucket')
}
const { projectSoftDeleted, query, fullPath, fileRef, folder } =
await findFile(projectId, fileId)
logger.info({ projectId, fileId, fileRef }, 'removing fileRef')
// Copied from _removeElementFromMongoArray (https://github.com/overleaf/internal/blob/11e09528c153de6b7766d18c3c90d94962190371/services/web/app/src/Features/Project/ProjectEntityMongoUpdateHandler.js)
const nonArrayPath = fullPath.slice(0, fullPath.lastIndexOf('.'))
let result
if (projectSoftDeleted) {
result = await deletedProjectsCollection.updateOne(query, {
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
$inc: { 'project.version': 1 },
})
} else {
result = await projectsCollection.updateOne(query, {
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
$inc: { version: 1 },
})
}
if (result.matchedCount !== 1) {
throw new OError('file-tree write did not match', { result })
}
// Update the cache. The mongo-path of the next file will be off otherwise.
folder.fileRefs = folder.fileRefs.filter(f => !f._id.equals(fileId))
return true
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<void>}
*/
async function setHashInMongo(projectId, fileId, hash) {
const { projectSoftDeleted, query, fullPath, fileRef } = await findFile(
projectId,
fileId
)
if (fileRef.hash === hash) return
logger.info({ projectId, fileId, fileRef, hash }, 'setting fileRef hash')
let result
if (projectSoftDeleted) {
result = await deletedProjectsCollection.updateOne(query, {
$set: { [`${fullPath}.hash`]: hash },
$inc: { 'project.version': 1 },
})
} else {
result = await projectsCollection.updateOne(query, {
$set: { [`${fullPath}.hash`]: hash },
$inc: { version: 1 },
})
}
if (result.matchedCount !== 1) {
throw new OError('file-tree write did not match', { result })
}
fileRef.hash = hash // Update cache for completeness.
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} historyId
* @return {Promise<void>}
*/
async function importRestoredFilestoreFile(projectId, fileId, historyId) {
const filestoreKey = `${projectId}/${fileId}`
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
let s
try {
s = await filestorePersistor.getObjectStream(
USER_FILES_BUCKET_NAME,
filestoreKey
)
} catch (err) {
if (err instanceof NotFoundError) {
throw new OError('missing blob, need to restore filestore file', {
filestoreKey,
})
}
throw err
}
await Stream.promises.pipeline(
s,
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
)
const blobStore = new BlobStore(historyId)
const blob = await blobStore.putFile(path)
await backupBlob(historyId, blob, path)
await setHashInMongo(projectId, fileId, blob.getHash())
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} path
* @return {Promise<Blob>}
*/
async function bufferFilestoreFileToDisk(projectId, fileId, path) {
const filestoreKey = `${projectId}/${fileId}`
try {
await Stream.promises.pipeline(
await filestorePersistor.getObjectStream(
USER_FILES_BUCKET_NAME,
filestoreKey
),
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
)
const blob = await makeBlobForFile(path)
blob.setStringLength(
await getStringLengthOfFile(blob.getByteLength(), path)
)
return blob
} catch (err) {
if (err instanceof NotFoundError) {
throw new OError('missing blob, need to restore filestore file', {
filestoreKey,
})
}
throw err
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<string>}
*/
async function computeFilestoreFileHash(projectId, fileId) {
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
return blob.getHash()
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<void>}
*/
async function uploadFilestoreFile(projectId, fileId) {
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
const hash = blob.getHash()
try {
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
} catch (err) {
if (!(err instanceof Blob.NotFoundError)) throw err
const { project } = await getProject(projectId)
const historyId = project.overleaf.history.id.toString()
const blobStore = new BlobStore(historyId)
await blobStore.putBlob(path, blob)
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixHashMismatch(line) {
const {
projectId,
fileId,
hash: computedHash,
entry: {
hash: fileTreeHash,
ctx: { historyId },
},
} = JSON.parse(line)
const blobStore = new BlobStore(historyId)
if (await blobStore.getBlob(fileTreeHash)) {
throw new OError('found blob with computed filestore object hash')
}
if (!(await blobStore.getBlob(computedHash))) {
await importRestoredFilestoreFile(projectId, fileId, historyId)
return true
}
return await ensureBlobExistsForFileAndUploadToAWS(
projectId,
fileId,
computedHash
)
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function hashAlreadyUpdatedInFileTree(projectId, fileId, hash) {
const { fileRef } = await findFile(projectId, fileId)
return fileRef.hash === hash
}
/**
* @param {string} projectId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function needsBackingUpToAWS(projectId, hash) {
if (GLOBAL_BLOBS.has(hash)) return false
return !(await _blobIsBackedUp(projectId, hash))
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) {
const { project } = await getProject(projectId)
const historyId = project.overleaf.history.id.toString()
const blobStore = new BlobStore(historyId)
if (
(await hashAlreadyUpdatedInFileTree(projectId, fileId, hash)) &&
(await blobStore.getBlob(hash)) &&
!(await needsBackingUpToAWS(projectId, hash))
) {
return false // already processed
}
const stream = await blobStore.getStream(hash)
const path = `${BUFFER_DIR}/${historyId}_${hash}`
try {
await Stream.promises.pipeline(
stream,
fs.createWriteStream(path, {
highWaterMark: STREAM_HIGH_WATER_MARK,
})
)
const writtenBlob = await makeBlobForFile(path)
writtenBlob.setStringLength(
await getStringLengthOfFile(writtenBlob.getByteLength(), path)
)
if (writtenBlob.getHash() !== hash) {
// Double check download, better safe than sorry.
throw new OError('blob corrupted', { writtenBlob })
}
let blob = await blobStore.getBlob(hash)
if (!blob) {
// Calling blobStore.putBlob would result in the same error again.
// HACK: Skip upload to GCS and finalize putBlob operation directly.
await blobStore.backend.insertBlob(historyId, writtenBlob)
}
await backupBlob(historyId, writtenBlob, path)
} finally {
await fs.promises.rm(path, { force: true })
}
await setHashInMongo(projectId, fileId, hash)
return true
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixDeletePermission(line) {
let { projectId, fileId, hash } = JSON.parse(line)
if (!hash) hash = await computeFilestoreFileHash(projectId, fileId)
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixMissingHash(line) {
let { projectId, _id: fileId } = JSON.parse(line)
const {
fileRef: { hash },
} = await findFile(projectId, fileId)
if (hash) {
// processed, double check
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
await uploadFilestoreFile(projectId, fileId)
return true
}
const CASES = {
'not found': {
match: 'NotFoundError',
flag: FIX_NOT_FOUND,
action: fixNotFound,
},
'hash mismatch': {
match: 'OError: hash mismatch',
flag: FIX_HASH_MISMATCH,
action: fixHashMismatch,
},
'delete permission': {
match: 'storage.objects.delete',
flag: FIX_DELETE_PERMISSION,
action: fixDeletePermission,
},
'missing file hash': {
match: '"bad file hash"',
flag: FIX_MISSING_HASH,
action: fixMissingHash,
},
}
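// Shape of the matched log lines, as consumed by the handlers above (field
// names taken from the JSON.parse destructuring in each fixer):
//   fixNotFound:         { projectId, fileId, bucketName }
//   fixHashMismatch:     { projectId, fileId, hash, entry: { hash, ctx: { historyId } } }
//   fixDeletePermission: { projectId, fileId, hash? }
//   fixMissingHash:      { projectId, _id }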
const STATS = {
processedLines: 0,
success: 0,
alreadyProcessed: 0,
fileDeleted: 0,
skipped: 0,
failed: 0,
unmatched: 0,
}
function logStats() {
console.log(
JSON.stringify({
time: new Date(),
gracefulShutdownInitiated,
...STATS,
})
)
}
setInterval(logStats, 10_000)
async function processLog() {
const rl = readline.createInterface({
input: fs.createReadStream(LOGS),
})
nextLine: for await (const line of rl) {
if (gracefulShutdownInitiated) break
STATS.processedLines++
if (
!(
line.includes('"failed to process file"') ||
// Process missing hashes as flagged by find_malformed_filetrees.mjs
line.includes('"bad file-tree path"')
)
) {
continue
}
for (const [name, { match, flag, action }] of Object.entries(CASES)) {
if (!line.includes(match)) continue
if (flag) {
try {
if (await action(line)) {
STATS.success++
} else {
STATS.alreadyProcessed++
}
} catch (err) {
if (err instanceof FileDeletedError) {
STATS.fileDeleted++
logger.info({ err, line }, 'file deleted, skipping')
} else {
STATS.failed++
logger.error({ err, line }, `failed to fix ${name}`)
}
}
} else {
STATS.skipped++
}
continue nextLine
}
STATS.unmatched++
logger.warn({ line }, 'unknown fatal error')
}
}
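// Exit code convention used by main() below:
//   0     everything matched was fixed or already processed
//   1-99  number of failed fixes (capped at 99)
//   100   no failures, but at least one line matched no known case
//   101   no failures or unmatched lines, but some fixes were skipped by flag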
async function main() {
try {
await processLog()
} finally {
logStats()
try {
await fs.promises.rm(BUFFER_DIR, { recursive: true, force: true })
} catch (err) {
console.error(`Cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
}
}
const { skipped, failed, unmatched } = STATS
await setTimeout(SLEEP_BEFORE_EXIT)
if (failed > 0) {
process.exit(Math.min(failed, 99))
} else if (unmatched > 0) {
process.exit(100)
} else if (skipped > 0) {
process.exit(101)
} else {
process.exit(0)
}
}
await main()

File diff suppressed because it is too large.


@@ -0,0 +1,173 @@
// @ts-check
import commandLineArgs from 'command-line-args'
import { backupBlob, downloadBlobToDir } from '../lib/backupBlob.mjs'
import withTmpDir from '../../api/controllers/with_tmp_dir.js'
import {
BlobStore,
GLOBAL_BLOBS,
loadGlobalBlobs,
} from '../lib/blob_store/index.js'
import assert from '../lib/assert.js'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import { setTimeout } from 'node:timers/promises'
import fs from 'node:fs'
await loadGlobalBlobs()
/**
* Gracefully shutdown the process
* @return {Promise<void>}
*/
async function gracefulShutdown() {
console.log('Gracefully shutting down')
await knex.destroy()
await client.close()
await redis.disconnect()
await setTimeout(100)
process.exit()
}
/**
*
* @param {string} row
* @return {BackupBlobJob}
*/
function parseCSVRow(row) {
const [historyId, hash] = row.split(',')
validateBackedUpBlobJob({ historyId, hash })
return { historyId, hash }
}
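// Example input row (hypothetical values: a 24-character hex history/project
// id followed by a 40-character hex blob hash):
//   5f0c1e2a3b4d5e6f7a8b9c0d,da39a3ee5e6b4b0d3255bfef95601890afd80709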
/**
*
* @param {BackupBlobJob} job
*/
function validateBackedUpBlobJob(job) {
assert.projectId(job.historyId)
assert.blobHash(job.hash)
}
/**
*
* @param {string} path
* @return {Promise<Array<BackupBlobJob>>}
*/
async function readCSV(path) {
let fh
/** @type {Array<BackupBlobJob>} */
const rows = []
try {
fh = await fs.promises.open(path, 'r')
} catch (error) {
console.error(`Could not open file: ${error}`)
throw error
}
for await (const line of fh.readLines()) {
try {
const row = parseCSVRow(line)
if (GLOBAL_BLOBS.has(row.hash)) {
console.log(`Skipping global blob: ${line}`)
continue
}
rows.push(row)
} catch (error) {
console.error(error instanceof Error ? error.message : error)
console.log(`Skipping invalid row: ${line}`)
}
}
return rows
}
/**
* @typedef {Object} BackupBlobJob
* @property {string} hash
* @property {string} historyId
*/
/**
* @param {Object} options
* @property {string} [options.historyId]
* @property {string} [options.hash]
* @property {string} [options.input]
* @return {Promise<Array<BackupBlobJob>>}
*/
async function initialiseJobs({ historyId, hash, input }) {
if (input) {
return await readCSV(input)
}
if (!historyId) {
console.error('historyId is required')
process.exitCode = 1
await gracefulShutdown()
}
if (!hash) {
console.error('hash is required')
process.exitCode = 1
await gracefulShutdown()
}
validateBackedUpBlobJob({ historyId, hash })
if (GLOBAL_BLOBS.has(hash)) {
console.error(`Blob ${hash} is a global blob; not backing up`)
process.exitCode = 1
await gracefulShutdown()
}
return [{ hash, historyId }]
}
/**
*
* @param {string} historyId
* @param {string} hash
* @return {Promise<void>}
*/
export async function downloadAndBackupBlob(historyId, hash) {
const blobStore = new BlobStore(historyId)
const blob = await blobStore.getBlob(hash)
if (!blob) {
throw new Error(`Blob ${hash} could not be loaded`)
}
await withTmpDir(`blob-${hash}`, async tmpDir => {
const filePath = await downloadBlobToDir(historyId, blob, tmpDir)
console.log(`Downloaded blob ${hash} to ${filePath}`)
await backupBlob(historyId, blob, filePath)
console.log('Backed up blob')
})
}
let jobs
const options = commandLineArgs([
{ name: 'historyId', type: String },
{ name: 'hash', type: String },
{ name: 'input', type: String },
])
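// Two ways to run this script (a sketch, derived from initialiseJobs above):
//   --input blobs.csv              back up every "historyId,hash" row in the CSV
//   --historyId ID --hash HASH     back up a single blob
// Global blobs are skipped in both modes.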
try {
jobs = await initialiseJobs(options)
} catch (error) {
console.error(error)
await gracefulShutdown()
}
if (!Array.isArray(jobs)) {
// This is mostly to satisfy typescript
process.exitCode = 1
await gracefulShutdown()
process.exit(1)
}
for (const { historyId, hash } of jobs) {
try {
await downloadAndBackupBlob(historyId, hash)
} catch (error) {
console.error(error)
process.exitCode = 1
}
}
await gracefulShutdown()


@@ -0,0 +1,153 @@
// @ts-check
import { ObjectId } from 'mongodb'
import { READ_PREFERENCE_SECONDARY } from '@overleaf/mongo-utils/batchedUpdate.js'
import { db, client } from '../lib/mongodb.js'
const projectsCollection = db.collection('projects')
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
// Configuration
const SAMPLE_SIZE_PER_ITERATION = process.argv[2]
? parseInt(process.argv[2], 10)
: 10000
const TARGET_ERROR_PERCENTAGE = process.argv[3]
? parseFloat(process.argv[3])
: 5.0
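// Usage (a sketch; the script path is omitted):
//   node <this-script>.mjs [sampleSizePerIteration] [targetErrorPercentage]
// defaulting to a 10000-document sample per iteration and a 5% target margin
// of error.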
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated')
}
async function takeSample(sampleSize) {
const results = await projectsCollection
.aggregate(
[
{ $sample: { size: sampleSize } },
{
$match: { 'overleaf.backup.lastBackedUpVersion': { $exists: true } },
},
{
$count: 'total',
},
],
{ readPreference: READ_PREFERENCE_SECONDARY }
)
.toArray()
const count = results[0]?.total || 0
return { totalSampled: sampleSize, backedUp: count }
}
function calculateStatistics(
cumulativeSampled,
cumulativeBackedUp,
totalPopulation
) {
const proportion = Math.max(1, cumulativeBackedUp) / cumulativeSampled
// Standard error with finite population correction
const fpc = Math.sqrt(
(totalPopulation - cumulativeSampled) / (totalPopulation - 1)
)
const stdError =
Math.sqrt((proportion * (1 - proportion)) / cumulativeSampled) * fpc
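// In symbols (the standard simple-random-sampling result, stated here for
// reference):
//   SE = sqrt(p * (1 - p) / n) * sqrt((N - n) / (N - 1))
// where p is the observed proportion, n the cumulative sample size and N the
// population size; the second factor is the finite population correction.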
// 95% confidence interval is approximately ±1.96 standard errors
const marginOfError = 1.96 * stdError
return {
proportion,
percentage: (proportion * 100).toFixed(2),
marginOfError,
errorPercentage: (marginOfError * 100).toFixed(2),
lowerBound: ((proportion - marginOfError) * 100).toFixed(2),
upperBound: ((proportion + marginOfError) * 100).toFixed(2),
sampleSize: cumulativeSampled,
populationSize: totalPopulation,
}
}
async function main() {
console.log('Date:', new Date().toISOString())
const totalCount = await projectsCollection.estimatedDocumentCount({
readPreference: READ_PREFERENCE_SECONDARY,
})
console.log(
`Total projects in collection (estimated): ${totalCount.toLocaleString()}`
)
console.log(`Target margin of error: ${TARGET_ERROR_PERCENTAGE}%`)
let cumulativeSampled = 0
let cumulativeBackedUp = 0
let currentError = Infinity
let iteration = 0
console.log('Iteration | Total Sampled | % Backed Up | Margin of Error')
console.log('----------|---------------|-------------|----------------')
while (currentError > TARGET_ERROR_PERCENTAGE) {
if (gracefulShutdownInitiated) {
console.log('Graceful shutdown initiated. Exiting sampling loop.')
break
}
iteration++
const { totalSampled, backedUp } = await takeSample(
SAMPLE_SIZE_PER_ITERATION
)
cumulativeSampled += totalSampled
cumulativeBackedUp += backedUp
const stats = calculateStatistics(
cumulativeSampled,
cumulativeBackedUp,
totalCount
)
currentError = parseFloat(stats.errorPercentage)
console.log(
`${iteration.toString().padStart(9)} | ` +
`${cumulativeSampled.toString().padStart(13)} | ` +
`${stats.percentage.padStart(10)}% | ` +
`\u00B1${stats.errorPercentage}%`
)
// Small delay between iterations
await new Promise(resolve => setTimeout(resolve, 100))
}
const finalStats = calculateStatistics(
cumulativeSampled,
cumulativeBackedUp,
totalCount
)
console.log(
`Projects sampled: ${cumulativeSampled.toLocaleString()} out of ${totalCount.toLocaleString()}`
)
console.log(
`Estimated percentage with lastBackedUpVersion: ${finalStats.percentage}%`
)
console.log(
`95% Confidence Interval: ${finalStats.lowerBound}% - ${finalStats.upperBound}%`
)
console.log(`Final Margin of Error: \u00B1${finalStats.errorPercentage}%`)
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exitCode = 1
})
.finally(() => {
client.close().catch(err => console.error('Error closing MongoDB:', err))
})


@@ -0,0 +1,429 @@
import Queue from 'bull'
import config from 'config'
import commandLineArgs from 'command-line-args'
import logger from '@overleaf/logger'
import {
listPendingBackups,
listUninitializedBackups,
getBackupStatus,
} from '../lib/backup_store/index.js'
logger.initialize('backup-queue')
// Use the same redis config as backup_worker
const redisOptions = config.get('redis.queue')
// Create a Bull queue named 'backup'
const backupQueue = new Queue('backup', {
redis: redisOptions,
defaultJobOptions: {
removeOnComplete: true,
removeOnFail: true,
},
})
// Define command-line options
const optionDefinitions = [
{ name: 'clean', type: Boolean },
{ name: 'status', type: Boolean },
{
name: 'add',
type: String,
multiple: true,
description: 'Project IDs or date range in YYYY-MM-DD:YYYY-MM-DD format',
},
{ name: 'monitor', type: Boolean },
{
name: 'queue-pending',
type: Number,
description:
'Find projects with pending changes older than N seconds and add them to the queue',
},
{
name: 'show-pending',
type: Number,
description:
'Show count of pending projects older than N seconds without adding to queue',
},
{
name: 'limit',
type: Number,
description: 'Limit the number of jobs to be added',
},
{
name: 'interval',
type: Number,
description: 'Time in seconds to spread jobs over (default: 300)',
defaultValue: 300,
},
{
name: 'backoff-delay',
type: Number,
description:
'Backoff delay in milliseconds for failed jobs (default: 1000)',
defaultValue: 1000,
},
{
name: 'attempts',
type: Number,
description: 'Number of retry attempts for failed jobs (default: 3)',
defaultValue: 3,
},
{
name: 'warn-threshold',
type: Number,
description: 'Warn about any project exceeding this pending age',
defaultValue: 2 * 3600, // 2 hours
},
{
name: 'verbose',
alias: 'v',
type: Boolean,
description: 'Show detailed information when used with --show-pending',
},
]
// Parse command line arguments
const options = commandLineArgs(optionDefinitions)
const WARN_THRESHOLD = options['warn-threshold']
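// Example invocations (a sketch; the script name is hypothetical, the flags
// are defined above):
//   node backup_queue.mjs --status
//   node backup_queue.mjs --queue-pending 3600 --limit 5000
//   node backup_queue.mjs --add 2024-01-01:2024-01-31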
// Helper to validate date format
function isValidDateFormat(dateStr) {
return /^\d{4}-\d{2}-\d{2}$/.test(dateStr)
}
// Helper to validate the pending time parameter
function validatePendingTime(option, value) {
if (typeof value !== 'number' || value <= 0) {
console.error(
`Error: --${option} requires a positive numeric TIME argument in seconds`
)
console.error(`Example: --${option} 3600`)
process.exit(1)
}
return value
}
// Helper to format the pending time display
function formatPendingTime(timestamp) {
const now = new Date()
const diffMs = now - timestamp
const seconds = Math.floor(diffMs / 1000)
return `${timestamp.toISOString()} (${seconds} seconds ago)`
}
// Helper to add a job to the queue, checking for duplicates
async function addJobWithCheck(queue, data, options) {
const jobId = options.jobId
// Check if the job already exists
const existingJob = await queue.getJob(jobId)
if (existingJob) {
return { job: existingJob, added: false }
} else {
const job = await queue.add(data, options)
return { job, added: true }
}
}
// Setup queue event listeners
function setupMonitoring() {
console.log('Starting queue monitoring. Press Ctrl+C to exit.')
backupQueue.on('global:error', error => {
logger.info({ error }, 'Queue error')
})
backupQueue.on('global:waiting', jobId => {
logger.info({ jobId }, 'job is waiting')
})
backupQueue.on('global:active', jobId => {
logger.info({ jobId }, 'job is now active')
})
backupQueue.on('global:stalled', jobId => {
logger.info({ jobId }, 'job has stalled')
})
backupQueue.on('global:progress', (jobId, progress) => {
logger.info({ jobId, progress }, 'job progress')
})
backupQueue.on('global:completed', (jobId, result) => {
logger.info({ jobId, result }, 'job completed')
})
backupQueue.on('global:failed', (jobId, err) => {
logger.info({ jobId, err }, 'job failed')
})
backupQueue.on('global:paused', () => {
logger.info({}, 'Queue paused')
})
backupQueue.on('global:resumed', () => {
logger.info({}, 'Queue resumed')
})
backupQueue.on('global:cleaned', (jobs, type) => {
logger.info({ jobsCount: jobs.length, type }, 'Jobs cleaned')
})
backupQueue.on('global:drained', () => {
logger.info({}, 'Queue drained')
})
backupQueue.on('global:removed', jobId => {
logger.info({ jobId }, 'Job removed')
})
}
async function addDateRangeJob(input) {
const [startDate, endDate] = input.split(':')
if (!isValidDateFormat(startDate) || !isValidDateFormat(endDate)) {
console.error(
`Invalid date format for "${input}". Use YYYY-MM-DD:YYYY-MM-DD`
)
return
}
const jobId = `backup-${startDate}-to-${endDate}`
const { job, added } = await addJobWithCheck(
backupQueue,
{ startDate, endDate },
{ jobId }
)
console.log(
`${added ? 'Added' : 'Already exists'}: date range backup job: ${startDate} to ${endDate}, job ID: ${job.id}`
)
}
// Helper to list pending and uninitialized backups
// This function combines the two cursors into a single generator
// to yield projects from both lists
async function* pendingCursor(timeIntervalMs, limit) {
for await (const project of listPendingBackups(timeIntervalMs, limit)) {
yield project
}
for await (const project of listUninitializedBackups(timeIntervalMs, limit)) {
yield project
}
}
// Process pending projects with changes older than the specified seconds
async function processPendingProjects(
age,
showOnly,
limit,
verbose,
jobInterval,
jobOpts = {}
) {
const timeIntervalMs = age * 1000
console.log(
`Finding projects with pending changes older than ${age} seconds${showOnly ? ' (count only)' : ''}`
)
let count = 0
let addedCount = 0
let existingCount = 0
// Pass the limit directly to MongoDB query for better performance
const changeTimes = []
for await (const project of pendingCursor(timeIntervalMs, limit)) {
const projectId = project._id.toHexString()
const pendingAt =
project.overleaf?.backup?.pendingChangeAt || project._id.getTimestamp()
if (pendingAt) {
changeTimes.push(pendingAt)
const pendingAge = Math.floor((Date.now() - pendingAt.getTime()) / 1000)
if (pendingAge > WARN_THRESHOLD) {
try {
const backupStatus = await getBackupStatus(projectId)
logger.warn(
{
projectId,
pendingAt,
pendingAge,
backupStatus,
warnThreshold: WARN_THRESHOLD,
},
`pending change exceeds rpo warning threshold`
)
} catch (err) {
logger.error(
{ projectId, pendingAt, pendingAge },
'Error getting backup status'
)
throw err
}
}
}
if (showOnly && verbose) {
console.log(
`Project: ${projectId} (pending since: ${formatPendingTime(pendingAt)})`
)
} else if (!showOnly) {
const delay = Math.floor(Math.random() * jobInterval * 1000) // add random delay to avoid all jobs running simultaneously
const { job, added } = await addJobWithCheck(
backupQueue,
{ projectId, pendingChangeAt: pendingAt.getTime() },
{ ...jobOpts, delay, jobId: projectId }
)
if (added) {
if (verbose) {
console.log(
`Added job for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
)
}
addedCount++
} else {
if (verbose) {
console.log(
`Job already exists for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
)
}
existingCount++
}
}
count++
if (count % 1000 === 0) {
console.log(
`Processed ${count} projects`,
showOnly ? '' : `(${addedCount} added, ${existingCount} existing)`
)
}
}
// Set oldestChange to undefined if there are no changes
const oldestChange =
changeTimes.length > 0
? changeTimes.reduce((min, time) => (time < min ? time : min))
: undefined
if (showOnly) {
console.log(
`Found ${count} projects with pending changes (not added to queue)`
)
} else {
console.log(`Found ${count} projects with pending changes:`)
console.log(` ${addedCount} jobs added to queue`)
console.log(` ${existingCount} jobs already existed in queue`)
if (oldestChange) {
console.log(` Oldest pending change: ${formatPendingTime(oldestChange)}`)
}
}
}
// Main execution block
async function run() {
const optionCount = [
options.clean,
options.status,
options.add,
options.monitor,
options['queue-pending'] !== undefined,
options['show-pending'] !== undefined,
].filter(Boolean).length
if (optionCount > 1) {
console.error('Only one option can be specified')
process.exit(1)
}
if (options.clean) {
const beforeCounts = await backupQueue.getJobCounts()
console.log('Current queue state:', JSON.stringify(beforeCounts))
console.log('Cleaning completed and failed jobs...')
await backupQueue.clean(1, 'completed')
await backupQueue.clean(1, 'failed')
const afterCounts = await backupQueue.getJobCounts()
console.log('Current queue state:', JSON.stringify(afterCounts))
console.log('Queue cleaned successfully')
} else if (options.status) {
const counts = await backupQueue.getJobCounts()
console.log('Current queue state:', JSON.stringify(counts))
} else if (options.add) {
const inputs = Array.isArray(options.add) ? options.add : [options.add]
for (const input of inputs) {
if (input.includes(':')) {
// Handle date range format
await addDateRangeJob(input)
} else {
// Handle project ID format
const { job, added } = await addJobWithCheck(
backupQueue,
{ projectId: input },
{ jobId: input }
)
console.log(
`${added ? 'Added' : 'Already exists'}: job for project: ${input}, job ID: ${job.id}`
)
}
}
} else if (options.monitor) {
setupMonitoring()
} else if (options['queue-pending'] !== undefined) {
const age = validatePendingTime('queue-pending', options['queue-pending'])
await processPendingProjects(
age,
false,
options.limit,
options.verbose,
options.interval,
{
attempts: options.attempts,
backoff: {
type: 'exponential',
delay: options['backoff-delay'],
},
}
)
} else if (options['show-pending'] !== undefined) {
const age = validatePendingTime('show-pending', options['show-pending'])
await processPendingProjects(age, true, options.limit, options.verbose)
} else {
console.log('Usage:')
console.log(' --clean Clean up completed and failed jobs')
console.log(' --status Show current job counts')
console.log(' --add [projectId] Add a job for the specified projectId')
console.log(
' --add [YYYY-MM-DD:YYYY-MM-DD] Add a job for the specified date range'
)
console.log(' --monitor Monitor queue events')
console.log(
' --queue-pending TIME Find projects with changes older than TIME seconds and add them to the queue'
)
console.log(
' --show-pending TIME Show count of pending projects older than TIME seconds'
)
console.log(' --limit N Limit the number of jobs to be added')
console.log(
' --interval TIME Time interval in seconds to spread jobs over'
)
console.log(
' --backoff-delay TIME Backoff delay in milliseconds for failed jobs (default: 1000)'
)
console.log(
' --attempts N Number of retry attempts for failed jobs (default: 3)'
)
console.log(
' --verbose, -v Show detailed information when used with --show-pending'
)
}
}
// Run and handle errors
run()
.catch(err => {
console.error('Error:', err)
process.exit(1)
})
.then(result => {
// Only exit if not in monitor mode
if (!options.monitor) {
process.exit(0)
}
})


@@ -0,0 +1,144 @@
import Queue from 'bull'
import logger from '@overleaf/logger'
import config from 'config'
import metrics from '@overleaf/metrics'
import {
backupProject,
initializeProjects,
configureBackup,
} from './backup.mjs'
const CONCURRENCY = 15
const WARN_THRESHOLD = 2 * 60 * 60 * 1000 // warn if projects are older than this
const redisOptions = config.get('redis.queue')
const JOB_TIME_BUCKETS = [10, 100, 500, 1000, 5000, 10000, 30000, 60000] // milliseconds
const LAG_TIME_BUCKETS_HRS = [
0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.75, 2, 3, 4, 5, 6,
] // hours
// Configure backup settings to match worker concurrency
configureBackup({ concurrency: 50, useSecondary: true })
// Create a Bull queue named 'backup'
const backupQueue = new Queue('backup', {
redis: redisOptions,
settings: {
lockDuration: 15 * 60 * 1000, // 15 minutes
lockRenewTime: 60 * 1000, // 1 minute
maxStalledCount: 0, // mark stalled jobs as failed
},
})
// Log queue events
backupQueue.on('active', job => {
logger.debug({ job }, 'job is now active')
})
backupQueue.on('completed', (job, result) => {
metrics.inc('backup_worker_job', 1, { status: 'completed' })
logger.debug({ job, result }, 'job completed')
})
backupQueue.on('failed', (job, err) => {
metrics.inc('backup_worker_job', 1, { status: 'failed' })
logger.error({ job, err }, 'job failed')
})
backupQueue.on('waiting', jobId => {
logger.debug({ jobId }, 'job is waiting')
})
backupQueue.on('error', error => {
logger.error({ error }, 'queue error')
})
backupQueue.on('stalled', job => {
logger.error({ job }, 'job has stalled')
})
backupQueue.on('lock-extension-failed', (job, err) => {
logger.error({ job, err }, 'lock extension failed')
})
backupQueue.on('paused', () => {
logger.info('queue paused')
})
backupQueue.on('resumed', () => {
logger.info('queue resumed')
})
// Process jobs
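// Two job payload shapes are accepted by the dispatch below:
//   { projectId, pendingChangeAt } - back up a single project
//   { startDate, endDate }         - initialize projects created in a date range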
backupQueue.process(CONCURRENCY, async job => {
const { projectId, startDate, endDate } = job.data
if (projectId) {
return await runBackup(projectId, job.data, job)
} else if (startDate && endDate) {
return await runInit(startDate, endDate)
} else {
throw new Error('invalid job data')
}
})
async function runBackup(projectId, data, job) {
const { pendingChangeAt } = data
// record the time it takes to run the backup job
const timer = new metrics.Timer(
'backup_worker_job_duration',
1,
{},
JOB_TIME_BUCKETS
)
const pendingAge = Date.now() - pendingChangeAt
if (pendingAge > WARN_THRESHOLD) {
logger.warn(
{ projectId, pendingAge, job },
'project has been pending for a long time'
)
}
try {
logger.debug({ projectId }, 'processing backup for project')
await backupProject(projectId, {})
metrics.inc('backup_worker_project', 1, {
status: 'success',
})
timer.done()
// record the replication lag (time from change to backup)
if (pendingChangeAt) {
metrics.histogram(
'backup_worker_replication_lag_in_hours',
(Date.now() - pendingChangeAt) / (3600 * 1000),
LAG_TIME_BUCKETS_HRS
)
}
return `backup completed ${projectId}`
} catch (err) {
metrics.inc('backup_worker_project', 1, { status: 'failed' })
logger.error({ projectId, err }, 'backup failed')
throw err // Re-throw to mark job as failed
}
}
async function runInit(startDate, endDate) {
try {
logger.info({ startDate, endDate }, 'initializing projects')
await initializeProjects({ 'start-date': startDate, 'end-date': endDate })
return `initialization completed ${startDate} - ${endDate}`
} catch (err) {
logger.error({ startDate, endDate, err }, 'initialization failed')
throw err
}
}
export async function drainQueue() {
logger.info({ queue: backupQueue.name }, 'pausing queue')
await backupQueue.pause(true) // pause this worker and wait for jobs to finish
logger.info({ queue: backupQueue.name }, 'closing queue')
await backupQueue.close()
}
export async function healthCheck() {
const count = await backupQueue.count()
metrics.gauge('backup_worker_queue_length', count)
}


@@ -0,0 +1,69 @@
/**
* A script to export the global blobs from mongo to a CSV file.
*
* node storage/scripts/export_global_blobs.mjs --output global_blobs.csv
*
* The output CSV has the following format:
*
* hash,path,byteLength,stringLength,demoted
*
* hash: the hash of the blob
* path: the path of the blob in the blob store
* byteLength: the byte length of the blob, or empty if unknown
* stringLength: the string length of the blob, or empty if unknown
* demoted: true if the blob has been demoted to a reference, false otherwise
*/
// @ts-check
import { ObjectId } from 'mongodb'
import { GLOBAL_BLOBS, loadGlobalBlobs } from '../lib/blob_store/index.js'
import { client } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import fs from 'node:fs'
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
function parseArgs() {
const args = commandLineArgs([
{
name: 'output',
type: String,
alias: 'o',
},
])
const OUTPUT_STREAM = fs.createWriteStream(args['output'], { flags: 'wx' })
return {
OUTPUT_STREAM,
}
}
const { OUTPUT_STREAM } = parseArgs()
async function main() {
await loadGlobalBlobs()
OUTPUT_STREAM.write('hash,path,byteLength,stringLength,demoted\n')
for (const [hash, { blob, demoted }] of GLOBAL_BLOBS) {
const { hash: blobHash, byteLength, stringLength } = blob
if (blobHash !== hash) {
throw new Error(`hash mismatch: ${hash} !== ${blobHash}`)
}
const path = blobHash.slice(0, 2) + '/' + blobHash.slice(2)
const byteLengthStr = byteLength === null ? '' : byteLength
const stringLengthStr = stringLength === null ? '' : stringLength
OUTPUT_STREAM.write(
`${hash},${path},${byteLengthStr},${stringLengthStr},${demoted}\n`
)
}
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exitCode = 1
})
.finally(() => {
client.close().catch(err => console.error('Error closing MongoDB:', err))
})


@@ -0,0 +1,51 @@
// @ts-check
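// Migrates backedUpBlobs records whose _id is still a string to ObjectId keys:
// the blobs array is merged into the ObjectId-keyed record with $addToSet and
// the old string-keyed record is deleted. Dry run by default; pass --commit to
// apply the changes.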
import { backedUpBlobs } from '../lib/mongodb.js'
import { mongoId } from '../lib/assert.js'
import { ObjectId } from 'mongodb'
import commandLineArgs from 'command-line-args'
const STATS = {
total: 0,
replaced: 0,
skipped: 0,
}
const config = commandLineArgs([
{ name: 'commit', type: Boolean, defaultValue: false },
])
async function processRecord(record) {
STATS.total++
try {
mongoId(record._id)
const newId = new ObjectId(record._id)
if (config.commit) {
await backedUpBlobs.updateOne(
{ _id: newId },
{
$addToSet: { blobs: { $each: record.blobs } },
},
{ upsert: true }
)
await backedUpBlobs.deleteOne({ _id: record._id })
}
STATS.replaced++
} catch (error) {
console.log(error)
STATS.skipped++
}
}
const cursor = backedUpBlobs
.find({ _id: { $type: 'string' } })
.project({ _id: 1, blobs: 1 })
while (await cursor.hasNext()) {
const record = await cursor.next()
await processRecord(record)
}
console.log(
`${!config.commit ? 'DRY RUN' : ''} ${STATS.total} records ${STATS.replaced} replaced, ${STATS.skipped} skipped`
)
process.exit()


@@ -0,0 +1,3 @@
UPDATE blobs
SET global = TRUE
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);


@@ -0,0 +1,16 @@
CREATE TABLE global_blobs (
hash_bytes bytea NOT NULL,
byte_length integer NOT NULL,
string_length integer,
global boolean,
CONSTRAINT global_blobs_pkey PRIMARY KEY (hash_bytes),
CONSTRAINT global_blobs_byte_length_non_negative
CHECK (byte_length >= 0),
CONSTRAINT global_blobs_string_length_non_negative
CHECK (string_length IS NULL OR string_length >= 0)
);
INSERT INTO global_blobs (hash_bytes, byte_length, string_length, global)
SELECT hash_bytes, byte_length, string_length, true
FROM blobs
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);


@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO old_blobs;
ALTER TABLE global_blobs RENAME TO blobs;
ALTER TABLE old_blobs
RENAME CONSTRAINT blobs_pkey TO old_blobs_pkey;
ALTER TABLE old_blobs
RENAME CONSTRAINT blobs_byte_length_non_negative
TO old_blobs_byte_length_non_negative;
ALTER TABLE old_blobs
RENAME CONSTRAINT blobs_string_length_non_negative
TO old_blobs_string_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT global_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
RENAME CONSTRAINT global_blobs_byte_length_non_negative
TO blobs_byte_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT global_blobs_string_length_non_negative
TO blobs_string_length_non_negative;
COMMIT;


@@ -0,0 +1,9 @@
Scripts in this directory were used when we cleaned up the global blobs table,
ensuring that it only contained global blobs. The scripts are meant to be run in this order:
* `01-create-blob-hashes-table.sql`
* `02-set-global-flag.sql`
* `03-create-global-blobs-table.sql`
* `04-swap-global-blob-tables.sql`
The `rollback.sql` can be run to reverse the effect of `04-swap-global-blob-tables.sql`.


@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO global_blobs;
ALTER TABLE old_blobs RENAME TO blobs;
ALTER TABLE global_blobs
RENAME CONSTRAINT blobs_pkey TO global_blobs_pkey;
ALTER TABLE global_blobs
RENAME CONSTRAINT blobs_byte_length_non_negative
TO global_blobs_byte_length_non_negative;
ALTER TABLE global_blobs
RENAME CONSTRAINT blobs_string_length_non_negative
TO global_blobs_string_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT old_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
RENAME CONSTRAINT old_blobs_byte_length_non_negative
TO blobs_byte_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT old_blobs_string_length_non_negative
TO blobs_string_length_non_negative;
COMMIT;


@@ -0,0 +1,379 @@
const fsPromises = require('node:fs/promises')
const { ObjectId } = require('mongodb')
const BPromise = require('bluebird')
const logger = require('@overleaf/logger')
const Settings = require('@overleaf/settings')
const rclient = require('@overleaf/redis-wrapper').createClient(
Settings.redis.documentupdater
)
const mongodb = require('../lib/mongodb')
const { chunkStore } = require('..')
const Events = require('node:events')
// Silence warning.
Events.setMaxListeners(20)
const BATCH_SIZE = 1000
const OPTIONS = {
concurrency: parseInt(process.env.DOC_VERSION_RECOVERY_CONCURRENCY, 10) || 20,
force: process.env.DOC_VERSION_RECOVERY_FORCE === 'true',
'skip-history-failures':
process.env.DOC_VERSION_RECOVERY_SKIP_HISTORY_FAILURES === 'true',
'resyncs-needed-file': process.env.DOC_VERSION_RECOVERY_RESYNCS_NEEDED_FILE,
}
const db = {
deletedProjects: mongodb.db.collection('deletedProjects'),
docs: mongodb.db.collection('docs'),
migrations: mongodb.db.collection('migrations'),
projects: mongodb.db.collection('projects'),
}
const BAD_MIGRATION_NAME =
'20231219081700_move_doc_versions_from_docops_to_docs'
const RECOVERY_FILES_502 = [
'/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log',
'/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log.done',
]
let loggingChain = Promise.resolve()
const projectIdsThatNeedResyncing = []
const unflushedDocIds = new Set()
async function flushLogQueue() {
const logPath = OPTIONS['resyncs-needed-file']
loggingChain = loggingChain.then(async () => {
const batch = projectIdsThatNeedResyncing.splice(0)
if (batch.length === 0) return
try {
await fsPromises.appendFile(logPath, batch.join('\n') + '\n')
} catch (err) {
projectIdsThatNeedResyncing.push(...batch)
logger.err({ err, logPath, batch }, 'Failed to write to log file')
}
})
await loggingChain
}
async function recordProjectNeedsResync(projectId) {
if (OPTIONS['resyncs-needed-file']) {
projectIdsThatNeedResyncing.push(projectId)
await flushLogQueue()
} else {
console.log(`Project ${projectId} needs a hard resync.`)
}
}
async function main() {
const recovery502Ran = await did502RecoveryRun()
await getUnflushedDocIds()
const badMigration = await db.migrations.findOne({ name: BAD_MIGRATION_NAME })
if (unflushedDocIds.size > 0 && !recovery502Ran && badMigration != null) {
// Tell customers that they need to flush
console.log(`
--------------------------------------------------------------------
Detected unflushed changes while recovering doc versions.
Please go back to version 5.0.1 and follow the recovery procedure
for flushing document updates:
https://github.com/overleaf/overleaf/wiki/Doc-version-recovery
--------------------------------------------------------------------`)
process.exit(1)
}
if (OPTIONS.force || recovery502Ran || badMigration != null) {
console.warn('Need to recover doc versions. This will take a while.')
await runRecovery()
await db.migrations.deleteOne({ name: BAD_MIGRATION_NAME })
await delete502RecoveryFiles()
}
console.log('Done.')
}
async function did502RecoveryRun() {
for (const file of RECOVERY_FILES_502) {
try {
await fsPromises.stat(file)
return true
} catch (err) {
// file doesn't exist. continue
}
}
return false
}
async function delete502RecoveryFiles() {
for (const file of RECOVERY_FILES_502) {
try {
await fsPromises.rename(file, file.replace('.log', '-5.0.2.log'))
} catch (err) {
// file doesn't exist. continue
}
}
}
async function runRecovery() {
let batch = []
const summary = {
ignored: 0,
skipped: 0,
deletedUpdatedMongo: 0,
deletedUpdatedRedis: 0,
deletedUpdatedBoth: 0,
deletedIgnored: 0,
updatedMongo: 0,
updatedRedis: 0,
updatedBoth: 0,
}
const processBatchAndLogProgress = async () => {
try {
await BPromise.map(batch, project => processProject(project, summary), {
concurrency: OPTIONS.concurrency,
})
} finally {
console.log(`${summary.updatedRedis} projects updated in Redis`)
console.log(`${summary.updatedMongo} projects updated in Mongo`)
console.log(
`${summary.updatedBoth} projects updated in both Mongo and Redis`
)
console.log(`${summary.ignored} projects had good versions`)
console.log(
`${summary.deletedUpdatedMongo} deleted projects updated in Mongo`
)
console.log(
`${summary.deletedUpdatedRedis} deleted projects updated in Redis`
)
console.log(
`${summary.deletedUpdatedBoth} deleted projects updated in both Mongo and Redis`
)
console.log(
`${summary.deletedIgnored} deleted projects had good versions`
)
console.log(`${summary.skipped} projects skipped`)
}
batch = []
}
await printDBStats()
await initResyncsNeededFile()
for await (const project of getProjects()) {
batch.push(project)
if (batch.length >= BATCH_SIZE) {
await processBatchAndLogProgress()
}
}
for await (const deletedProject of getDeletedProjects()) {
const project = deletedProject.project
project.isDeleted = true
batch.push(project)
if (batch.length >= BATCH_SIZE) {
await processBatchAndLogProgress()
}
}
if (batch.length > 0) {
await processBatchAndLogProgress()
}
await backfillMissingVersions()
}
async function getUnflushedDocIds() {
const batchSize = 1000
let cursor = '0'
do {
const [newCursor, keys] = await rclient.scan(
cursor,
'MATCH',
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: '*' }),
'COUNT',
batchSize
)
for (const key of keys) {
unflushedDocIds.add(key.slice('DocVersion:'.length))
}
cursor = newCursor
} while (cursor !== '0')
}
async function printDBStats() {
const projects = await db.projects.estimatedDocumentCount()
const deletedProjects = await db.deletedProjects.countDocuments()
const docs = await db.docs.estimatedDocumentCount()
console.log(
`Need to check ${projects} projects and up to ${deletedProjects} deleted projects with a total of ${docs} docs.`
)
}
async function initResyncsNeededFile() {
const logPath = OPTIONS['resyncs-needed-file']
if (logPath) {
await fsPromises.writeFile(logPath, '')
await fsPromises.rm(`${logPath}.done`, { force: true })
}
}
function getProjects() {
return db.projects.find({}, { projection: { _id: 1, overleaf: 1 } })
}
function getDeletedProjects() {
return db.deletedProjects.find(
{ 'project.overleaf.history.id': { $exists: true } },
{ projection: { 'project._id': 1, 'project.overleaf': 1 } }
)
}
async function processProject(project, summary) {
const projectId = project._id.toString()
let updatedMongo = false
let updatedRedis = false
try {
const historyDocVersions = await getHistoryDocVersions(project)
for (const { docId, version } of historyDocVersions) {
const update = await fixDocVersion(docId, version)
if (update != null) {
if (update.in === 'mongo') {
updatedMongo = true
} else if (update.in === 'redis') {
updatedRedis = true
}
}
}
if (project.isDeleted) {
if (updatedMongo && updatedRedis) {
summary.deletedUpdatedBoth += 1
} else if (updatedMongo) {
summary.deletedUpdatedMongo += 1
} else if (updatedRedis) {
summary.deletedUpdatedRedis += 1
} else {
summary.deletedIgnored += 1
}
} else {
await recordProjectNeedsResync(projectId)
if (updatedMongo && updatedRedis) {
summary.updatedBoth += 1
} else if (updatedMongo) {
summary.updatedMongo += 1
} else if (updatedRedis) {
summary.updatedRedis += 1
} else {
summary.ignored += 1
}
}
} catch (err) {
logger.error({ err, projectId }, 'Failed to process project')
if (OPTIONS['skip-history-failures']) {
summary.skipped += 1
} else {
throw err
}
}
}
async function getHistoryDocVersions(project) {
const historyId = project.overleaf.history.id
const chunk = await chunkStore.loadLatest(historyId)
if (chunk == null) {
return []
}
const snapshot = chunk.getSnapshot()
const changes = chunk.getChanges()
snapshot.applyAll(changes)
const v2DocVersions = snapshot.getV2DocVersions()
if (v2DocVersions == null) {
return []
}
return Object.entries(v2DocVersions.data).map(([docId, versionInfo]) => ({
docId,
version: versionInfo.v,
}))
}
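// Bring the stored doc version ahead of the version recovered from history:
// if the doc has unflushed changes in Redis and its Redis version is not
// already ahead, bump it there to historyVersion + 1; otherwise bump the Mongo
// version when it is missing or <= historyVersion. Returns where the fix was
// applied, or null if no update was needed.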
async function fixDocVersion(docId, historyVersion) {
const redisVersion = await getRedisDocVersion(docId)
if (redisVersion != null && historyVersion >= redisVersion) {
await setRedisDocVersion(docId, historyVersion + 1)
return {
in: 'redis',
previousVersion: redisVersion,
newVersion: historyVersion + 1,
}
} else {
const docBeforeUpdate = await db.docs.findOneAndUpdate(
{
_id: new ObjectId(docId),
$or: [
{ version: { $lte: historyVersion } },
{ version: { $exists: false } },
],
},
{ $set: { version: historyVersion + 1 } },
{ projection: { _id: 1, version: 1 } }
)
if (docBeforeUpdate != null) {
return {
in: 'mongo',
previousVersion: docBeforeUpdate.version,
newVersion: historyVersion + 1,
}
} else {
return null
}
}
}
async function getRedisDocVersion(docId) {
if (!unflushedDocIds.has(docId)) {
return null
}
const result = await rclient.get(
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId })
)
if (result == null) {
return null
}
return parseInt(result, 10)
}
async function setRedisDocVersion(docId, version) {
const multi = rclient.multi()
multi.set(
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId }),
version
)
multi.set(`UnflushedTime:{${docId}}`, Date.now(), 'NX')
await multi.exec()
}
/**
* Set all remaining versions to 0
*/
async function backfillMissingVersions() {
console.log('Defaulting version to 0 for remaining docs.')
await db.docs.updateMany(
{ version: { $exists: false } },
{ $set: { version: 0 } }
)
}
main()
.finally(async () => {
console.log('Flushing log queue.')
await flushLogQueue()
})
.then(() => {
process.exit(0)
})
.catch(err => {
console.error(err)
process.exit(1)
})


@@ -0,0 +1,255 @@
/**
* Try to recover a zip of the latest version of a project using only data in
* GCS, where this data may have been (recently) hard deleted (i.e. may exist
* wholly or in part as non-current versions). This should be able to
* retrieve the latest content of a project up to 180 days after it was
* deleted.
*
* Usage:
* node recover_zip.js [--verbose] <HISTORY_ID> <HISTORY_ID> ...
*
* Output:
* Signed URL(s) for the uploaded zip files. Note that these are valid for
* only 24h, to match the lifecycle rule on the zip bucket.
*/
const fs = require('node:fs')
const os = require('node:os')
const path = require('node:path')
const util = require('node:util')
// Something is registering 11 listeners, over the limit of 10, which generates
// a lot of warning noise.
require('node:events').EventEmitter.defaultMaxListeners = 11
const config = require('config')
// We depend on this via object-persistor.
// eslint-disable-next-line import/no-extraneous-dependencies
const { Storage } = require('@google-cloud/storage')
const isValidUtf8 = require('utf-8-validate')
const core = require('overleaf-editor-core')
const projectKey = require('../lib/project_key')
const streams = require('../lib/streams')
const ProjectArchive = require('../lib/project_archive')
const {
values: { verbose: VERBOSE },
positionals: HISTORY_IDS,
} = util.parseArgs({
options: {
verbose: {
type: 'boolean',
default: false,
},
},
allowPositionals: true,
})
if (HISTORY_IDS.length === 0) {
console.error('no history IDs; see usage')
process.exit(1)
}
async function listDeletedChunks(historyId) {
const bucketName = config.get('chunkStore.bucket')
const storage = new Storage()
const [files] = await storage.bucket(bucketName).getFiles({
prefix: projectKey.format(historyId),
versions: true,
})
return files
}
async function findLatestChunk(historyId) {
const files = await listDeletedChunks(historyId)
if (files.length === 0) return null
files.sort((a, b) => {
if (a.name < b.name) return -1
if (a.name > b.name) return 1
return 0
})
return files[files.length - 1]
}
async function downloadLatestChunk(tmp, historyId) {
const latestChunkFile = await findLatestChunk(historyId)
if (!latestChunkFile) throw new Error('no chunk found to recover')
const destination = path.join(tmp, 'latest.json')
await latestChunkFile.download({ destination })
return destination
}
async function loadHistory(historyPathname) {
const data = await fs.promises.readFile(historyPathname)
const rawHistory = JSON.parse(data)
return core.History.fromRaw(rawHistory)
}
async function loadChunk(historyPathname, blobStore) {
const history = await loadHistory(historyPathname)
const blobHashes = new Set()
history.findBlobHashes(blobHashes)
await blobStore.fetchBlobs(blobHashes)
await history.loadFiles('lazy', blobStore)
return new core.Chunk(history, 0)
}
// TODO: it would be nice to export / expose this from BlobStore;
// currently this is a copy of the method there.
async function getStringLengthOfFile(byteLength, pathname) {
// We have to read the file into memory to get its UTF-8 length, so don't
// bother for files that are too large for us to edit anyway.
if (byteLength > core.Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
return null
}
// We need to check if the file contains nonBmp or null characters
let data = await fs.promises.readFile(pathname)
if (!isValidUtf8(data)) return null
data = data.toString()
if (data.length > core.TextOperation.MAX_STRING_LENGTH) return null
if (core.util.containsNonBmpChars(data)) return null
if (data.indexOf('\x00') !== -1) return null
return data.length
}
class RecoveryBlobStore {
constructor(historyId, tmp) {
this.historyId = historyId
this.tmp = tmp
this.blobs = new Map()
}
async fetchBlobs(blobHashes) {
for await (const blobHash of blobHashes) {
await this.fetchBlob(blobHash)
}
}
async fetchBlob(hash) {
if (this.blobs.has(hash)) return
if (VERBOSE) console.log('fetching blob', hash)
const bucketName = config.get('blobStore.projectBucket')
const storage = new Storage()
const [files] = await storage.bucket(bucketName).getFiles({
prefix: this.makeProjectBlobKey(hash),
versions: true,
})
const destination = this.getBlobPathname(hash)
if (files.length === 0) {
await this.fetchGlobalBlob(hash, destination)
} else if (files.length === 1) {
await files[0].download({ destination })
} else {
throw new Error('Multiple versions of blob ' + hash)
}
this.blobs.set(hash, await this.makeBlob(hash, destination))
}
async fetchGlobalBlob(hash, destination) {
const bucketName = config.get('blobStore.globalBucket')
const storage = new Storage()
const file = storage.bucket(bucketName).file(this.makeGlobalBlobKey(hash))
await file.download({ destination })
}
async makeBlob(hash, pathname) {
const stat = await fs.promises.stat(pathname)
const byteLength = stat.size
const stringLength = await getStringLengthOfFile(byteLength, pathname)
return new core.Blob(hash, byteLength, stringLength)
}
async getString(hash) {
const stream = await this.getStream(hash)
const buffer = await streams.readStreamToBuffer(stream)
return buffer.toString()
}
async getStream(hash) {
return fs.createReadStream(this.getBlobPathname(hash))
}
async getBlob(hash) {
return this.blobs.get(hash)
}
getBlobPathname(hash) {
return path.join(this.tmp, hash)
}
makeGlobalBlobKey(hash) {
return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
}
makeProjectBlobKey(hash) {
return `${projectKey.format(this.historyId)}/${hash.slice(
0,
2
)}/${hash.slice(2)}`
}
}
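// Key layouts produced by the helpers above (illustrative, for a hypothetical
// hash "aabbccdd..."):
//   global blob key:  aa/bb/ccdd...
//   project blob key: <projectKey.format(historyId)>/aa/bbccdd...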
async function uploadZip(historyId, zipPathname) {
const bucketName = config.get('zipStore.bucket')
const deadline = 24 * 3600 * 1000 // lifecycle limit on the zips bucket
const storage = new Storage()
const destination = `${historyId}-recovered.zip`
await storage.bucket(bucketName).upload(zipPathname, { destination })
const signedUrls = await storage
.bucket(bucketName)
.file(destination)
.getSignedUrl({
version: 'v4',
action: 'read',
expires: Date.now() + deadline,
})
return signedUrls[0]
}
async function restoreProject(historyId) {
const tmp = await fs.promises.mkdtemp(
path.join(os.tmpdir(), historyId.toString())
)
if (VERBOSE) console.log('recovering', historyId, 'in', tmp)
const latestJsonPathname = await downloadLatestChunk(tmp, historyId)
const blobStore = new RecoveryBlobStore(historyId, tmp)
const chunk = await loadChunk(latestJsonPathname, blobStore)
const snapshot = chunk.getSnapshot()
for (const change of chunk.getChanges()) {
change.applyTo(snapshot)
}
if (VERBOSE) console.log('zipping', historyId)
const zipPathname = path.join(tmp, `${historyId}.zip`)
const zipTimeoutMs = 60 * 1000
const archive = new ProjectArchive(snapshot, zipTimeoutMs)
await archive.writeZip(blobStore, zipPathname)
if (VERBOSE) console.log('uploading', historyId)
return await uploadZip(historyId, zipPathname)
}
async function main() {
for (const historyId of HISTORY_IDS) {
const signedUrl = await restoreProject(historyId)
console.log(signedUrl)
}
}
main().catch(console.error)

View File

@@ -0,0 +1,36 @@
import redis from '@overleaf/redis-wrapper'
import config from 'config'
// Get allowed Redis dbs from config
const redisConfig = config.get('redis')
const allowedDbs = Object.keys(redisConfig)
// Get the Redis db name from the command line argument
const db = process.argv[2]
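// e.g. `node redis.mjs history` (assuming a db named "history" exists in the config)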
// Validate redis db
if (!allowedDbs.includes(db)) {
if (db) {
console.error('Invalid redis db:', db)
}
console.error(`Usage: node redis.mjs [${allowedDbs.join('|')}]`)
process.exit(1)
}
// Get redis options based on command line argument
const redisOptions = config.get(`redis.${db}`)
console.log('Using redis db:', db)
console.log('REDIS CONFIG', {
...redisOptions,
password: '*'.repeat(redisOptions.password?.length),
})
const rclient = redis.createClient(redisOptions)
try {
await rclient.healthCheck()
console.log('REDIS HEALTHCHECK SUCCEEDED')
} catch (error) {
console.error('REDIS HEALTHCHECK FAILED', error)
} finally {
await rclient.quit()
}

View File

@@ -0,0 +1,104 @@
// @ts-check
import { readFileSync } from 'node:fs'
import commandLineArgs from 'command-line-args'
import { client } from '../lib/mongodb.js'
import {
getBackedUpBlobHashes,
unsetBackedUpBlobHashes,
} from '../lib/backup_store/index.js'
let gracefulShutdownInitiated = false
// Parse command line arguments
const args = commandLineArgs([
{ name: 'input', type: String, alias: 'i', defaultOption: true },
  { name: 'commit', type: Boolean, defaultValue: false },
])
if (!args.input) {
console.error(
'Usage: node remove_backed_up_blobs.mjs --input <csv-file> [--commit]'
)
process.exit(1)
}
if (!args.commit) {
console.log('Running in dry-run mode. Use --commit to apply changes.')
}
// Signal handling
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
console.warn('Graceful shutdown initiated')
gracefulShutdownInitiated = true
}
// Process CSV and remove blobs
async function main() {
const projectBlobs = new Map()
const lines = readFileSync(args.input, 'utf8').split('\n')
const SHA1_HEX_REGEX = /^[a-f0-9]{40}$/
// Skip header
for (const line of lines.slice(1)) {
if (!line.trim() || gracefulShutdownInitiated) break
const [projectId, path] = line.split(',')
const pathParts = path.split('/')
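    // The path appears to be a makeProjectKey() result, i.e.
    // <three-segment history key>/<hash[0:2]>/<hash[2:]>, so re-joining the
    // last two segments recovers the full 40-character blob hash.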
const hash = pathParts[3] + pathParts[4]
if (!SHA1_HEX_REGEX.test(hash)) {
console.warn(`Invalid SHA1 hash for project ${projectId}: ${hash}`)
continue
}
if (!projectBlobs.has(projectId)) {
projectBlobs.set(projectId, new Set())
}
projectBlobs.get(projectId).add(hash)
}
// Process each project
for (const [projectId, hashes] of projectBlobs) {
if (gracefulShutdownInitiated) break
if (!args.commit) {
console.log(
`DRY-RUN: would remove ${hashes.size} blobs from project ${projectId}`
)
continue
}
try {
const originalHashes = await getBackedUpBlobHashes(projectId)
if (originalHashes.size === 0) {
continue
}
const result = await unsetBackedUpBlobHashes(
projectId,
Array.from(hashes)
)
if (result) {
console.log(
`Project ${projectId}: want to remove ${hashes.size}, removed ${originalHashes.size - result.blobs.length}, ${result.blobs.length} remaining`
)
}
} catch (err) {
console.error(`Error updating project ${projectId}:`, err)
}
}
}
// Run the script
main()
.catch(err => {
console.error('Fatal error:', err)
process.exitCode = 1
})
.finally(() => {
client
.close()
.catch(err => console.error('Error closing MongoDB connection:', err))
})

View File

@@ -0,0 +1,221 @@
// @ts-check
/**
* This script is used to remove blobs that have been backed up under the project ID
* instead of the history ID (where those are different).
*
* This script reads a CSV file with the following format:
* ```
* project_id,hash
* <mongo ID>,<hash>
* ```
*
* The header row is optional. All rows will be checked for conformance to the format.
*/
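// Example row (hypothetical values):
//   507f191e810c19729de860ea,0123456789abcdef0123456789abcdef01234567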
import commandLineArgs from 'command-line-args'
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
import { makeProjectKey } from '../lib/blob_store/index.js'
import fs from 'node:fs'
import assert from '../lib/assert.js'
import { client } from '../lib/mongodb.js'
import { verifyBlobs } from '../lib/backupVerifier.mjs'
import { setTimeout } from 'node:timers/promises'
import { getHistoryId } from '../lib/backup_store/index.js'
const argsSchema = [
{
name: 'input',
type: String,
},
{
name: 'commit',
type: Boolean,
},
{
name: 'header',
type: Boolean,
},
{
name: 'force',
type: Boolean,
},
{
name: 'verbose',
type: Boolean,
},
]
const args = commandLineArgs(argsSchema)
async function gracefulClose(code = 0) {
await client.close()
process.exit(code)
}
/**
*
* @param {(value: unknown) => void} fn
* @param {unknown} value
* @return {boolean}
*/
function not(fn, value) {
try {
fn(value)
return false
} catch {
return true
}
}
/**
*
* @param {string} row
* @return {{projectId: string, hash: string}}
*/
function parseCSVRow(row) {
const [projectId, hash] = row.split(',')
assert.mongoId(projectId, `invalid projectId ${projectId}`)
assert.blobHash(hash, `invalid hash ${hash}`)
return { projectId, hash }
}
/**
*
* @param {string} path
* @param {boolean} hasHeader
* @return {AsyncGenerator<{projectId: string, hash: string}, void, *>}
*/
async function* readCSV(path, hasHeader) {
let seenHeader = !hasHeader
let fh
try {
fh = await fs.promises.open(path, 'r')
} catch (error) {
console.error(`Could not open file: ${error}`)
return await gracefulClose(1)
}
for await (const line of fh.readLines()) {
if (!seenHeader) {
const [first, second] = line.split(',')
const noDataInHeader =
not(assert.mongoId, first) && not(assert.blobHash, second)
if (!noDataInHeader) {
console.error('Data found in header row')
return await gracefulClose(1)
}
seenHeader = true
continue
}
try {
yield parseCSVRow(line)
} catch (error) {
console.error(error instanceof Error ? error.message : error)
console.info(`Skipping invalid row: ${line}`)
}
}
}
function usage() {
console.info(
'Usage: remove_blobs_from_backup.mjs --input <path> [--commit] [--header] [--force] [--verbose]'
)
}
if (!args.input) {
console.error('--input was missing')
usage()
await gracefulClose(1)
}
/**
*
* @param {string} projectId
* @param {string} hash
* @return {Promise<void>}
*/
async function deleteBlob(projectId, hash) {
const path = makeProjectKey(projectId, hash)
if (args.commit) {
await backupPersistor.deleteObject(projectBlobsBucket, path)
} else {
console.log(`DELETE: ${path}`)
}
}
/**
*
* @param {string} projectId
* @param {string} hash
* @return {Promise<void>}
*/
async function canDeleteBlob(projectId, hash) {
let historyId
try {
historyId = await getHistoryId(projectId)
} catch (error) {
if (args.verbose) {
console.error(error)
}
throw new Error(`No history ID found for project ${projectId}, skipping`)
}
if (historyId === projectId) {
throw new Error(
`Project ID and history ID are the same for ${projectId} - use --force to delete anyway`
)
}
// TODO: fix assert.postgresId to handle integers better and then stop coercing to string below
assert.postgresId(
`${historyId}`,
`History ID ${historyId} does not appear to be for a postgres project`
)
try {
await verifyBlobs(`${historyId}`, [hash])
} catch (error) {
if (args.verbose) {
console.error(error)
}
throw new Error(
`Blob ${hash} is not backed up for project ${projectId} - use --force to delete anyway`
)
}
}
if (!args.commit) {
console.log('DRY RUN: provide --commit to perform operations')
}
if (args.force) {
console.log(
'WARNING: --force is enabled, blobs will be deleted regardless of backup status'
)
await setTimeout(5_000)
}
let deleted = 0
let errors = 0
for await (const { projectId, hash } of readCSV(args.input, args.header)) {
if (!args.force) {
try {
await canDeleteBlob(projectId, hash)
} catch (error) {
console.error(error instanceof Error ? error.message : error)
continue
}
}
try {
await deleteBlob(projectId, hash)
deleted++
} catch (error) {
errors++
console.error(error)
}
}
console.log(`Deleted: ${deleted}`)
console.log(`Errors: ${errors}`)
await gracefulClose()

View File

@@ -0,0 +1,254 @@
import commandLineArgs from 'command-line-args'
import {
loadAtVersion,
getChunkMetadataForVersion,
getProjectChunksFromVersion,
} from '../lib/chunk_store/index.js'
import { client } from '../lib/mongodb.js'
import knex from '../lib/knex.js'
import redis from '../lib/redis.js'
import {
loadGlobalBlobs,
BlobStore,
makeProjectKey,
} from '../lib/blob_store/index.js'
import { TextDecoder } from 'node:util'
import {
backupPersistor,
chunksBucket,
projectBlobsBucket,
} from '../lib/backupPersistor.mjs'
import fs from 'node:fs'
import { pipeline } from 'node:stream/promises'
import os from 'node:os'
import path from 'node:path'
import { createHash } from 'node:crypto'
import projectKey from '../lib/project_key.js'
import { createGunzip } from 'node:zlib'
import { text } from 'node:stream/consumers'
const optionDefinitions = [
{ name: 'historyId', alias: 'p', type: String },
{ name: 'version', alias: 'v', type: Number },
{ name: 'blob', alias: 'b', type: String },
{ name: 'remote', alias: 'r', type: Boolean },
{ name: 'keep', alias: 'k', type: Boolean },
]
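// Typical invocations (illustrative):
//   node <this-script> -p <historyId>                 list chunk records
//   node <this-script> -p <historyId> -v <version>    dump the chunk containing <version>
//   node <this-script> -p <historyId> -b <blobHash>   dump a blob (-r reads the remote backup, -k keeps the temp file)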
function makeChunkKey(projectId, startVersion) {
return path.join(projectKey.format(projectId), projectKey.pad(startVersion))
}
async function listChunks(historyId) {
for await (const chunkRecord of getProjectChunksFromVersion(historyId, 0)) {
console.log('Chunk record:', chunkRecord)
}
}
async function fetchChunkLocal(historyId, version) {
const chunkRecord = await getChunkMetadataForVersion(historyId, version)
const chunk = await loadAtVersion(historyId, version)
return { key: version, chunk, metadata: chunkRecord, source: 'local storage' }
}
async function fetchChunkRemote(historyId, version) {
const chunkRecord = await getChunkMetadataForVersion(historyId, version)
const startVersion = chunkRecord.startVersion
const key = makeChunkKey(historyId, startVersion)
const backupPersistorForProject = await backupPersistor.forProject(
chunksBucket,
key
)
const backupChunkStream = await backupPersistorForProject.getObjectStream(
chunksBucket,
key
)
const backupStr = await text(backupChunkStream.pipe(createGunzip()))
return {
key,
chunk: JSON.parse(backupStr),
metadata: chunkRecord,
source: 'remote backup',
}
}
async function displayChunk(historyId, version, options) {
const { key, chunk, metadata, source } = await (options.remote
? fetchChunkRemote(historyId, version)
: fetchChunkLocal(historyId, version))
console.log('Source:', source)
console.log('Chunk record', metadata)
console.log('Key', key)
// console.log('Number of changes', chunk.getChanges().length)
console.log(JSON.stringify(chunk))
}
async function fetchBlobRemote(historyId, blobHash) {
const backupPersistorForProject = await backupPersistor.forProject(
projectBlobsBucket,
makeProjectKey(historyId, '')
)
const blobKey = makeProjectKey(historyId, blobHash)
return {
stream: await backupPersistorForProject.getObjectStream(
projectBlobsBucket,
blobKey,
{ autoGunzip: true }
),
metadata: { hash: blobHash },
source: 'remote backup',
}
}
async function fetchBlobLocal(historyId, blobHash) {
const blobStore = new BlobStore(historyId)
const blob = await blobStore.getBlob(blobHash)
if (!blob) throw new Error(`Blob ${blobHash} not found`)
return {
stream: await blobStore.getStream(blobHash),
metadata: blob,
source: 'local storage',
}
}
async function displayBlobContent(filepath, metadata, source, blobHash) {
console.log('Source:', source)
console.log('Blob metadata:', metadata)
// Compute git hash using streaming
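  // (a git blob object hash is sha1 of the header "blob <byte length>\0"
  // followed by the raw file content, as constructed below)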
const stat = fs.statSync(filepath)
const header = `blob ${stat.size}\0`
const hash = createHash('sha1')
hash.update(header)
const hashStream = fs.createReadStream(filepath)
for await (const chunk of hashStream) {
hash.update(chunk)
}
const gitHash = hash.digest('hex')
// Check content type and display preview
const fd = fs.openSync(filepath, 'r')
try {
const headBuf = Buffer.alloc(16)
const tailBuf = Buffer.alloc(16)
try {
      // Stream the file through TextDecoder to check for valid UTF-8
const textStream = fs.createReadStream(filepath)
const decoder = new TextDecoder('utf-8', { fatal: true })
for await (const chunk of textStream) {
decoder.decode(chunk, { stream: true })
}
decoder.decode()
// If we get here, it's valid UTF-8
if (stat.size <= 1024) {
        console.log('Content (text):', fs.readFileSync(filepath, 'utf8'))
} else {
console.log('Content (text, truncated):')
console.log(` Length: ${stat.size} bytes`)
fs.readSync(fd, headBuf, 0, 16, 0)
fs.readSync(fd, tailBuf, 0, 16, stat.size - 16)
console.log(
' Content:',
headBuf.toString('utf8') +
' ...(truncated)... ' +
tailBuf.toString('utf8')
)
}
} catch (e) {
// Binary content - show head and tail
console.log('Content (binary):')
console.log(` Length: ${stat.size} bytes`)
if (stat.size <= 32) {
// Small file - read it all
const buf = Buffer.alloc(stat.size)
fs.readSync(fd, buf, 0, stat.size, 0)
const hexBytes = buf.toString('hex').match(/../g).join(' ')
console.log(' Bytes:', hexBytes)
} else {
// Read tail for large files
fs.readSync(fd, headBuf, 0, 16, 0)
fs.readSync(fd, tailBuf, 0, 16, stat.size - 16)
const headHex = headBuf.toString('hex').match(/../g).join(' ')
const tailHex = tailBuf.toString('hex').match(/../g).join(' ')
console.log(' Bytes:', headHex + ' ... ' + tailHex)
}
console.log(' Git-style SHA1:', gitHash)
if (gitHash !== blobHash) {
        console.log(' Warning: Git hash differs from blob hash!')
console.log(' Blob hash:', blobHash)
}
}
} finally {
fs.closeSync(fd)
}
}
async function withTempDir(prefix, fn, options = {}) {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), prefix))
try {
return await Promise.resolve(fn(tmpDir))
} finally {
if (!options.keep) {
fs.rmSync(tmpDir, { recursive: true, force: true })
} else {
console.log('Keeping temporary file:', path.join(tmpDir, 'blob'))
}
}
}
async function displayBlob(historyId, blobHash, options) {
try {
const { stream, metadata, source } = await (options.remote
? fetchBlobRemote(historyId, blobHash)
: fetchBlobLocal(historyId, blobHash))
await withTempDir(
'blob-show-',
async tmpDir => {
const tmpPath = path.join(tmpDir, 'blob')
await pipeline(stream, fs.createWriteStream(tmpPath))
await displayBlobContent(tmpPath, metadata, source, blobHash)
},
{ keep: options.keep }
)
} catch (err) {
if (err.code === 'NoSuchKey') {
throw new Error(`Blob ${blobHash} not found in backup`)
}
throw err
}
}
async function main() {
const { historyId, version, blob, remote, keep } =
commandLineArgs(optionDefinitions)
if (!historyId) {
console.error('Error: --historyId is required.')
process.exit(1)
}
await loadGlobalBlobs()
if (version != null) {
await displayChunk(historyId, version, { remote })
} else if (blob != null) {
await displayBlob(historyId, blob, { remote, keep })
} else {
await listChunks(historyId)
}
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exit(1)
})
.finally(() => {
knex.destroy().catch(err => console.error('Error closing Postgres:', err))
client.close().catch(err => console.error('Error closing MongoDB:', err))
redis
.disconnect()
.catch(err => console.error('Error disconnecting Redis:', err))
})

View File

@@ -0,0 +1,153 @@
// @ts-check
import { ObjectId } from 'mongodb'
import knex from '../lib/knex.js'
import {
batchedUpdate,
objectIdFromInput,
READ_PREFERENCE_SECONDARY,
} from '@overleaf/mongo-utils/batchedUpdate.js'
import {
GLOBAL_BLOBS,
loadGlobalBlobs,
makeProjectKey,
} from '../lib/blob_store/index.js'
import {
backedUpBlobs as backedUpBlobsCollection,
db,
client,
} from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import commandLineArgs from 'command-line-args'
import fs from 'node:fs'
const projectsCollection = db.collection('projects')
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
function parseArgs() {
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
const args = commandLineArgs([
{
name: 'BATCH_RANGE_START',
type: String,
defaultValue: PUBLIC_LAUNCH_DATE.toISOString(),
},
{
name: 'BATCH_RANGE_END',
type: String,
defaultValue: new Date().toISOString(),
},
{
name: 'output',
type: String,
alias: 'o',
},
])
const BATCH_RANGE_START = objectIdFromInput(
args['BATCH_RANGE_START']
).toString()
const BATCH_RANGE_END = objectIdFromInput(args['BATCH_RANGE_END']).toString()
if (!args['output']) {
throw new Error('missing --output')
}
const OUTPUT_STREAM = fs.createWriteStream(args['output'])
return {
BATCH_RANGE_START,
BATCH_RANGE_END,
OUTPUT_STREAM,
}
}
const { BATCH_RANGE_START, BATCH_RANGE_END, OUTPUT_STREAM } = parseArgs()
// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
}
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated, draining queue')
}
async function processBatch(batch) {
if (gracefulShutdownInitiated) {
throw new Error('graceful shutdown: aborting batch processing')
}
const N = batch.length
const firstId = batch[0]._id
const lastId = batch[N - 1]._id
const projectCursor = await projectsCollection.find(
{ _id: { $gte: firstId, $lte: lastId } },
{
projection: { _id: 1, 'overleaf.history.id': 1, lastUpdated: 1 },
readPreference: READ_PREFERENCE_SECONDARY,
}
)
const projectMap = new Map()
for await (const project of projectCursor) {
projectMap.set(project._id.toString(), project)
}
for (const project of batch) {
const projectId = project._id.toString()
const projectRecord = projectMap.get(projectId)
if (!projectRecord) {
console.error(`project not found: ${projectId}`)
continue
}
if (!projectRecord.overleaf?.history?.id) {
console.error(`project missing history: ${projectId}`)
continue
}
const historyId = projectRecord.overleaf.history.id.toString()
const prefix = `${projectId},${projectRecord.lastUpdated.toISOString()},`
const hashes = project.blobs.map(blob => blob.toString('hex'))
const projectBlobHashes = hashes.filter(hash => !GLOBAL_BLOBS.has(hash))
if (projectBlobHashes.length < hashes.length) {
console.warn(
`project ${projectId} has ${hashes.length - projectBlobHashes.length} global blobs`
)
}
const rows = projectBlobHashes.map(
hash => prefix + makeProjectKey(historyId, hash) + '\n'
)
OUTPUT_STREAM.write(rows.join(''))
}
}
async function main() {
await loadGlobalBlobs()
OUTPUT_STREAM.write('projectId,lastUpdated,path\n')
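  // Each data row is (illustrative):
  //   <projectId>,<lastUpdated ISO timestamp>,<projectKey.format(historyId)>/<hash[0:2]>/<hash[2:]>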
await batchedUpdate(
backedUpBlobsCollection,
{},
processBatch,
{},
{},
{ BATCH_RANGE_START, BATCH_RANGE_END }
)
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exitCode = 1
})
.finally(() => {
knex.destroy().catch(err => {
console.error('Error closing Postgres connection:', err)
})
client.close().catch(err => console.error('Error closing MongoDB:', err))
redis.disconnect().catch(err => {
console.error('Error disconnecting Redis:', err)
})
})

View File

@@ -0,0 +1,21 @@
import logger from '@overleaf/logger'
import commandLineArgs from 'command-line-args'
import { verifyBlobs } from '../lib/backupVerifier.mjs'
const { historyId, hashes } = commandLineArgs([
{ name: 'historyId', type: String },
{ name: 'hashes', type: String, multiple: true, defaultOption: true },
])
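// Expected invocation (illustrative): --historyId <id> plus one or more blob
// hashes as positional arguments (hashes is the default option).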
if (!historyId) {
  throw new Error('missing --historyId flag')
}
if (!hashes || hashes.length === 0) {
  throw new Error('missing --hashes flag')
}
try {
await verifyBlobs(historyId, hashes)
console.log('OK')
process.exit(0)
} catch (err) {
logger.err({ err }, 'failed to verify blob')
process.exit(1)
}

View File

@@ -0,0 +1,177 @@
import fs from 'node:fs'
import { makeProjectKey } from '../lib/blob_store/index.js'
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import commandLineArgs from 'command-line-args'
import OError from '@overleaf/o-error'
import assert from '../lib/assert.js'
import { client, projects } from '../lib/mongodb.js'
import { ObjectId } from 'mongodb'
import { setTimeout } from 'node:timers/promises'
const { input, verbose } = commandLineArgs([
{ name: 'input', type: String },
{ name: 'verbose', type: Boolean, defaultValue: false },
])
function parseCSVRow(row) {
const [path] = row.split(',')
const pathSegments = path.split('/')
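  // projectKey.format() appears to store the zero-padded history id reversed
  // across the first three path segments; reversing their concatenation
  // recovers the numeric history id.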
const historyId = `${pathSegments[0]}${pathSegments[1]}${pathSegments[2]}`
.split('')
.reverse()
.join('')
return { historyId, path, hash: `${pathSegments[3]}${pathSegments[4]}` }
}
async function* readCSV(path) {
let fh
try {
fh = await fs.promises.open(path, 'r')
} catch (error) {
console.error(`Could not open file: ${error}`)
throw error
}
for await (const line of fh.readLines()) {
try {
const row = parseCSVRow(line)
yield row
} catch (error) {
console.error(error instanceof Error ? error.message : error)
console.log(`Skipping invalid row: ${line}`)
}
}
}
class MissingDEKError extends OError {}
class InvalidHistoryIdError extends OError {}
class MissingProjectError extends OError {}
class MissingBlobError extends OError {}
async function getProjectPersistor(historyId) {
try {
return await backupPersistor.forProjectRO(
projectBlobsBucket,
makeProjectKey(historyId, '')
)
} catch (err) {
if (err instanceof NotFoundError) {
throw new MissingDEKError('dek does not exist', { historyId }, err)
}
throw err
}
}
async function checkBlobExists(path, historyId) {
const persistor = await getProjectPersistor(historyId)
return await persistor.getObjectSize(projectBlobsBucket, path)
}
let total = 0
const errors = {
invalidProjectId: 0,
notBackedUpProjectId: 0,
missingBlob: 0,
notInMongo: 0,
unknown: 0,
}
const notInMongoProjectIds = new Set()
const notBackedUpProjectIds = new Set()
let stopping = false
process.on('SIGTERM', () => {
console.log('SIGTERM received')
stopping = true
})
process.on('SIGINT', () => {
console.log('SIGINT received')
stopping = true
})
/**
*
* @param {string} historyId
* @param {string} path
* @param {string} hash
* @return {Promise<void>}
*/
async function checkPath(historyId, path, hash) {
try {
assert.mongoId(historyId)
} catch (error) {
    throw new InvalidHistoryIdError('invalid history id', { historyId })
}
if (notInMongoProjectIds.has(historyId)) {
throw new MissingProjectError('project not in mongo', { historyId })
}
if (notBackedUpProjectIds.has(historyId)) {
throw new MissingDEKError('project not backed up', { historyId })
}
const project = await projects.findOne({ _id: new ObjectId(historyId) })
if (!project) {
notInMongoProjectIds.add(historyId)
throw new MissingProjectError('project not in mongo', { historyId })
}
try {
await checkBlobExists(path, historyId)
} catch (error) {
if (error instanceof NotFoundError) {
throw new MissingBlobError('missing blob', { historyId, hash })
}
if (error instanceof MissingDEKError) {
notBackedUpProjectIds.add(historyId)
}
throw error
}
}
for await (const line of readCSV(input)) {
if (stopping) break
total++
if (total % 10_000 === 0) {
console.log(`checked ${total}`)
}
const { historyId, path, hash } = line
try {
await checkPath(historyId, path, hash)
if (verbose) {
console.log(`✓ Project ${historyId} has ${hash} backed up`)
}
} catch (error) {
if (error instanceof InvalidHistoryIdError) {
errors.invalidProjectId++
console.warn(`invalid historyId ${historyId}`)
continue
} else if (error instanceof MissingProjectError) {
errors.notInMongo++
console.warn(`✗ project ${historyId} not in mongo`)
continue
} else if (error instanceof MissingDEKError) {
errors.notBackedUpProjectId++
console.error(`✗ Project DEK ${historyId} not found`)
continue
} else if (error instanceof MissingBlobError) {
errors.missingBlob++
console.error(`✗ missing blob ${hash} from project ${historyId}`)
continue
}
errors.unknown++
console.error(error)
}
}
console.log(`total checked: ${total}`)
console.log(`invalid project id: ${errors.invalidProjectId}`)
console.log(`not found in mongo: ${errors.notInMongo}`)
console.log(`missing blob: ${errors.missingBlob}`)
console.log(`project not backed up: ${errors.notBackedUpProjectId}`)
console.log(`unknown errors: ${errors.unknown}`)
await client.close()
await setTimeout(100)
process.exit()

View File

@@ -0,0 +1,35 @@
import commandLineArgs from 'command-line-args'
import { verifyProjectWithErrorContext } from '../lib/backupVerifier.mjs'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import { setTimeout } from 'node:timers/promises'
import { loadGlobalBlobs } from '../lib/blob_store/index.js'
const { historyId } = commandLineArgs([{ name: 'historyId', type: String }])
async function gracefulShutdown(code = process.exitCode) {
await knex.destroy()
await client.close()
await redis.disconnect()
await setTimeout(1_000)
process.exit(code)
}
if (!historyId) {
console.error('missing --historyId')
process.exitCode = 1
await gracefulShutdown()
}
await loadGlobalBlobs()
try {
await verifyProjectWithErrorContext(historyId)
console.log('OK')
} catch (error) {
console.error('error verifying', error)
process.exitCode = 1
} finally {
await gracefulShutdown()
}

View File

@@ -0,0 +1,217 @@
// @ts-check
import commandLineArgs from 'command-line-args'
import {
setWriteMetrics,
verifyProjectsCreatedInDateRange,
verifyRandomProjectSample,
verifyProjectsUpdatedInDateRange,
} from '../../backupVerifier/ProjectVerifier.mjs'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import { setTimeout } from 'node:timers/promises'
import logger from '@overleaf/logger'
import { loadGlobalBlobs } from '../lib/blob_store/index.js'
import { getDatesBeforeRPO } from '../../backupVerifier/utils.mjs'
import { EventEmitter } from 'node:events'
import { mongodb } from '../index.js'
import redis from '../lib/redis.js'
logger.logger.level('fatal')
const usageMessage = [
  'Usage: node verify_sampled_projects.mjs [--startDate <start>] [--endDate <end>] [--nProjects <n>] [--verbose] [--usage] [--writeMetrics] [--concurrency <n>] [--strategy <range|random|recent>]',
  'strategy: defaults to "range"; startDate and endDate are required for the "range" strategy',
].join('\n')
/**
* Gracefully shutdown the process
* @param code
* @return {Promise<void>}
*/
async function gracefulShutdown(code = process.exitCode) {
await knex.destroy()
await client.close()
await redis.disconnect()
await setTimeout(1_000)
process.exit(code)
}
const STATS = {
verifiable: 0,
unverifiable: 0,
}
/**
* @typedef {Object} CLIOptions
* @property {(signal: EventEmitter) => Promise<VerificationJobStatus>} projectVerifier
* @property {boolean} verbose
*/
/**
* @typedef {import('../../backupVerifier/types.d.ts').VerificationJobStatus} VerificationJobStatus
*/
/**
*
* @return {CLIOptions}
*/
function getOptions() {
const {
startDate,
endDate,
concurrency,
writeMetrics,
verbose,
nProjects,
strategy,
usage,
} = commandLineArgs([
{ name: 'startDate', type: String },
{ name: 'endDate', type: String },
{ name: 'concurrency', type: Number, defaultValue: 1 },
{ name: 'verbose', type: Boolean, defaultValue: false },
{ name: 'nProjects', type: Number, defaultValue: 10 },
{ name: 'usage', type: Boolean, defaultValue: false },
{ name: 'writeMetrics', type: Boolean, defaultValue: false },
{ name: 'strategy', type: String, defaultValue: 'range' },
])
if (usage) {
console.log(usageMessage)
process.exit(0)
}
if (!['range', 'random', 'recent'].includes(strategy)) {
throw new Error(`Invalid strategy: ${strategy}`)
}
setWriteMetrics(writeMetrics)
switch (strategy) {
case 'random':
console.log('Verifying random projects')
return {
verbose,
projectVerifier: signal => verifyRandomProjectSample(nProjects, signal),
}
case 'recent':
return {
verbose,
projectVerifier: async signal => {
const { startDate, endDate } = getDatesBeforeRPO(3 * 3600)
return await verifyProjectsUpdatedInDateRange(
startDate,
endDate,
nProjects,
signal
)
},
}
case 'range':
default: {
if (!startDate || !endDate) {
throw new Error(usageMessage)
}
const start = Date.parse(startDate)
const end = Date.parse(endDate)
if (Number.isNaN(start)) {
throw new Error(`Invalid start date: ${startDate}`)
}
if (Number.isNaN(end)) {
throw new Error(`Invalid end date: ${endDate}`)
}
if (verbose) {
console.log(`Verifying from ${startDate} to ${endDate}`)
console.log(`Concurrency: ${concurrency}`)
}
STATS.ranges = 0
return {
projectVerifier: signal =>
verifyProjectsCreatedInDateRange({
startDate: new Date(start),
endDate: new Date(end),
projectsPerRange: nProjects,
concurrency,
signal,
}),
verbose,
}
}
}
}
/**
* @type {CLIOptions}
*/
let options
try {
options = getOptions()
} catch (error) {
console.error(error)
process.exitCode = 1
await gracefulShutdown(1)
process.exit() // just here so the type checker knows that the process will exit
}
const { projectVerifier, verbose } = options
if (verbose) {
logger.logger.level('debug')
}
/**
*
* @param {Array<string>} array
* @param {string} matchString
* @return {*}
*/
function sumStringInstances(array, matchString) {
return array.reduce((total, string) => {
return string === matchString ? total + 1 : total
}, 0)
}
/**
*
* @param {VerificationJobStatus} stats
*/
function displayStats(stats) {
console.log(`Verified projects: ${stats.verified}`)
console.log(`Total projects sampled: ${stats.total}`)
if (stats.errorTypes.length > 0) {
console.log('Errors:')
for (const error of new Set(stats.errorTypes)) {
console.log(`${error}: ${sumStringInstances(stats.errorTypes, error)}`)
}
}
}
const shutdownEmitter = new EventEmitter()
shutdownEmitter.on('shutdown', async () => {
await gracefulShutdown()
})
process.on('SIGTERM', () => {
shutdownEmitter.emit('shutdown')
})
process.on('SIGINT', () => {
shutdownEmitter.emit('shutdown')
})
await loadGlobalBlobs()
try {
const stats = await projectVerifier(shutdownEmitter)
displayStats(stats)
console.log(`completed`)
} catch (error) {
console.error(error)
console.log('completed with errors')
process.exitCode = 1
} finally {
console.log('shutting down')
await gracefulShutdown()
}