first commit

commit ff9c54d5e4
2025-04-24 13:11:28 +08:00
5960 changed files with 834111 additions and 0 deletions


@@ -0,0 +1,109 @@
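/**
 * Back-fill the start_version column on the chunks table.
 *
 * For each chunk in the requested id range, the raw history is loaded from the
 * history store and start_version is computed as end_version minus the number
 * of changes in the chunk.
 *
 * Options: --min-id, --max-id, --batch-size, --concurrency
 */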
const commandLineArgs = require('command-line-args')
const BPromise = require('bluebird')
const timersPromises = require('node:timers/promises')
const { knex, historyStore } = require('..')
const MAX_POSTGRES_INTEGER = 2147483647
const DEFAULT_BATCH_SIZE = 1000
const DEFAULT_CONCURRENCY = 1
const MAX_RETRIES = 10
const RETRY_DELAY_MS = 5000
async function main() {
const options = parseOptions()
let batchStart = options.minId
while (batchStart <= options.maxId) {
const chunks = await getChunks(batchStart, options.maxId, options.batchSize)
if (chunks.length === 0) {
// No results. We're done.
break
}
const batchEnd = chunks[chunks.length - 1].id
await processBatch(chunks, options)
console.log(`Processed chunks ${batchStart} to ${batchEnd}`)
batchStart = batchEnd + 1
}
}
function parseOptions() {
const args = commandLineArgs([
{ name: 'min-id', type: Number, defaultValue: 1 },
{
name: 'max-id',
type: Number,
defaultValue: MAX_POSTGRES_INTEGER,
},
{ name: 'batch-size', type: Number, defaultValue: DEFAULT_BATCH_SIZE },
{ name: 'concurrency', type: Number, defaultValue: DEFAULT_CONCURRENCY },
])
return {
minId: args['min-id'],
maxId: args['max-id'],
batchSize: args['batch-size'],
concurrency: args.concurrency,
}
}
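// Fetch the next page of up to batchSize chunks with ids in [minId, maxId],
// ordered by id.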
async function getChunks(minId, maxId, batchSize) {
const chunks = await knex('chunks')
.where('id', '>=', minId)
.andWhere('id', '<=', maxId)
.orderBy('id')
.limit(batchSize)
return chunks
}
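// Process a batch of chunks with limited concurrency, retrying any chunks that
// fail until they all succeed or MAX_RETRIES is exceeded.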
async function processBatch(chunks, options) {
let retries = 0
while (true) {
const results = await BPromise.map(chunks, processChunk, {
concurrency: options.concurrency,
})
const failedChunks = results
.filter(result => !result.success)
.map(result => result.chunk)
if (failedChunks.length === 0) {
// All chunks processed. Carry on.
break
}
    // Some chunks failed. Retry.
retries += 1
if (retries > MAX_RETRIES) {
console.log('Too many retries processing chunks. Giving up.')
process.exit(1)
}
console.log(
`Retrying chunks: ${failedChunks.map(chunk => chunk.id).join(', ')}`
)
await timersPromises.setTimeout(RETRY_DELAY_MS)
chunks = failedChunks
}
}
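// Compute and persist the start version for a single chunk. The start version
// is the chunk's end version minus the number of changes in its raw history.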
async function processChunk(chunk) {
try {
const rawHistory = await historyStore.loadRaw(
chunk.doc_id.toString(),
chunk.id
)
const startVersion = chunk.end_version - rawHistory.changes.length
await knex('chunks')
.where('id', chunk.id)
.update({ start_version: startVersion })
return { chunk, success: true }
} catch (err) {
console.error(`Failed to process chunk ${chunk.id}:`, err.stack)
return { chunk, success: false }
}
}
main()
.then(() => {
process.exit()
})
.catch(err => {
console.error(err)
process.exit(1)
})


@@ -0,0 +1,107 @@
/**
* Compress changes for projects that have too many text operations.
*
* Usage:
*
* node tasks/compress_changes.js CSV_FILE
*
* where CSV_FILE contains a list of project ids in the first column
*/
const fs = require('node:fs')
const BPromise = require('bluebird')
const { History } = require('overleaf-editor-core')
const { historyStore, chunkStore } = require('..')
const CONCURRENCY = 10
async function main() {
const filename = process.argv[2]
const projectIds = await readCsv(filename)
const chunks = []
for (const projectId of projectIds) {
const chunkIds = await chunkStore.getProjectChunkIds(projectId)
chunks.push(...chunkIds.map(id => ({ id, projectId })))
}
let totalCompressed = 0
await BPromise.map(
chunks,
async chunk => {
try {
const history = await getHistory(chunk)
const numCompressed = compressChanges(history)
if (numCompressed > 0) {
await storeHistory(chunk, history)
console.log(
`Compressed project ${chunk.projectId}, chunk ${chunk.id}`
)
}
totalCompressed += numCompressed
} catch (err) {
console.log(err)
}
},
{ concurrency: CONCURRENCY }
)
console.log('CHANGES:', totalCompressed)
}
async function readCsv(filename) {
const csv = await fs.promises.readFile(filename, 'utf-8')
const lines = csv.trim().split('\n')
const projectIds = lines.map(line => line.split(',')[0])
return projectIds
}
async function getHistory(chunk) {
const rawHistory = await historyStore.loadRaw(chunk.projectId, chunk.id)
const history = History.fromRaw(rawHistory)
return history
}
async function storeHistory(chunk, history) {
const rawHistory = history.toRaw()
await historyStore.storeRaw(chunk.projectId, chunk.id, rawHistory)
}
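// Compose consecutive operations within each change where possible. Returns
// the number of changes whose operation list was shortened.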
function compressChanges(history) {
let numCompressed = 0
for (const change of history.getChanges()) {
const newOperations = compressOperations(change.operations)
if (newOperations.length !== change.operations.length) {
numCompressed++
}
change.setOperations(newOperations)
}
return numCompressed
}
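// Greedily compose adjacent operations: keep composing the current operation
// with the next one while canBeComposedWith() allows it, otherwise emit the
// current operation and start a new run with the next one.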
function compressOperations(operations) {
if (!operations.length) return []
const newOperations = []
let currentOperation = operations[0]
for (let operationId = 1; operationId < operations.length; operationId++) {
const nextOperation = operations[operationId]
if (currentOperation.canBeComposedWith(nextOperation)) {
currentOperation = currentOperation.compose(nextOperation)
} else {
// currentOperation and nextOperation cannot be composed. Push the
// currentOperation and start over with nextOperation.
newOperations.push(currentOperation)
currentOperation = nextOperation
}
}
newOperations.push(currentOperation)
return newOperations
}
main()
.then(() => {
process.exit()
})
.catch(err => {
console.error(err)
process.exit(1)
})


@@ -0,0 +1,294 @@
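/**
 * Copy project blobs out of the global blob store into per-project storage:
 * blob objects are copied from the global GCS bucket to the project bucket,
 * and the corresponding rows are copied from the blobs table into the
 * project_blobs table. Blobs listed in the global blobs file stay global and
 * are skipped.
 *
 * Options: --global-blobs (required), --min-project-id, --max-project-id,
 * --batch-size, --concurrency
 */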
#!/usr/bin/env node
const { promisify } = require('node:util')
const BPromise = require('bluebird')
const commandLineArgs = require('command-line-args')
const config = require('config')
const fs = require('node:fs')
const readline = require('node:readline')
const { History } = require('overleaf-editor-core')
const { knex, historyStore, persistor } = require('..')
const projectKey = require('../lib/project_key')
const MAX_POSTGRES_INTEGER = 2147483647
const DEFAULT_BATCH_SIZE = 1000
const MAX_RETRIES = 10
const RETRY_DELAY_MS = 5000
// Obtain a preconfigured GCS client through an undocumented property of
// object-persistor. Sorry about that. We need the GCS client because we use
// operations that are not implemented in object-persistor.
const gcsClient = persistor.storage
const globalBucket = gcsClient.bucket(config.get('blobStore.globalBucket'))
const projectBucket = gcsClient.bucket(config.get('blobStore.projectBucket'))
const delay = promisify(setTimeout)
async function main() {
const options = commandLineArgs([
{ name: 'global-blobs', type: String },
{ name: 'min-project-id', type: Number, defaultValue: 1 },
{
name: 'max-project-id',
type: Number,
defaultValue: MAX_POSTGRES_INTEGER,
},
{ name: 'batch-size', type: Number, defaultValue: DEFAULT_BATCH_SIZE },
{ name: 'concurrency', type: Number, defaultValue: 1 },
])
if (!options['global-blobs']) {
console.error(
'You must specify a global blobs file with the --global-blobs option'
)
process.exit(1)
}
const globalBlobs = await readGlobalBlobs(options['global-blobs'])
const minProjectId = options['min-project-id']
const maxProjectId = options['max-project-id']
const batchSize = options['batch-size']
const concurrency = options.concurrency
console.log(`Keeping ${globalBlobs.size} global blobs`)
await run({ globalBlobs, minProjectId, maxProjectId, batchSize, concurrency })
console.log('Done.')
}
async function readGlobalBlobs(filename) {
const stream = fs.createReadStream(filename)
const reader = readline.createInterface({
input: stream,
crlfDelay: Infinity,
})
const blobs = new Set()
for await (const line of reader) {
blobs.add(line.trim())
}
return blobs
}
async function run(options) {
const { globalBlobs, minProjectId, maxProjectId, batchSize, concurrency } =
options
let batchStart = minProjectId
while (batchStart <= maxProjectId) {
let projectIds = await getProjectIds(batchStart, maxProjectId, batchSize)
if (projectIds.length === 0) {
break
}
const batchEnd = projectIds[projectIds.length - 1]
console.log(`Processing projects ${batchStart} to ${batchEnd}`)
const chunkIdsByProject = await getChunkIdsByProject(projectIds)
let retries = 0
while (true) {
const results = await BPromise.map(
projectIds,
async projectId =>
await processProject(
projectId,
chunkIdsByProject.get(projectId),
globalBlobs
),
{ concurrency }
)
const failedProjectIds = results
.filter(result => !result.success)
.map(result => result.projectId)
if (failedProjectIds.length === 0) {
// All projects were copied successfully. Carry on.
break
}
// Some projects failed. Retry.
retries += 1
if (retries > MAX_RETRIES) {
console.log(
`Too many retries processing projects ${batchStart} to ${batchEnd}. Giving up.`
)
process.exit(1)
}
console.log(`Retrying projects: ${failedProjectIds.join(', ')}`)
await delay(RETRY_DELAY_MS)
projectIds = failedProjectIds
}
// Set up next batch
batchStart = batchEnd + 1
}
}
async function getProjectIds(minProjectId, maxProjectId, batchSize) {
const projectIds = await knex('chunks')
.distinct('doc_id')
.where('doc_id', '>=', minProjectId)
.andWhere('doc_id', '<=', maxProjectId)
.orderBy('doc_id')
.limit(batchSize)
.pluck('doc_id')
return projectIds
}
async function getChunkIdsByProject(projectIds) {
const chunks = await knex('chunks')
.select('id', { projectId: 'doc_id' })
.where('doc_id', 'in', projectIds)
const chunkIdsByProject = new Map()
for (const projectId of projectIds) {
chunkIdsByProject.set(projectId, [])
}
for (const chunk of chunks) {
chunkIdsByProject.get(chunk.projectId).push(chunk.id)
}
return chunkIdsByProject
}
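// Copy all non-global blobs referenced by a project's chunks, both in GCS and
// in the database, then verify that the two copies report the same sizes.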
async function processProject(projectId, chunkIds, globalBlobs) {
try {
const blobHashes = await getBlobHashes(projectId, chunkIds)
const projectBlobHashes = blobHashes.filter(hash => !globalBlobs.has(hash))
const gcsSizesByHash = new Map()
for (const blobHash of projectBlobHashes) {
const blobSize = await copyBlobInGcs(projectId, blobHash)
if (blobSize != null) {
gcsSizesByHash.set(blobHash, blobSize)
}
}
const dbSizesByHash = await copyBlobsInDatabase(
projectId,
projectBlobHashes
)
compareBlobSizes(gcsSizesByHash, dbSizesByHash)
return { projectId, success: true }
} catch (err) {
console.error(`Failed to process project ${projectId}:`, err.stack)
return { projectId, success: false }
}
}
function compareBlobSizes(gcsSizesByHash, dbSizesByHash) {
// Throw an error if the database doesn't report as many blobs as GCS
if (dbSizesByHash.size !== gcsSizesByHash.size) {
throw new Error(
`the database reported ${dbSizesByHash.size} blobs copied, but GCS reported ${gcsSizesByHash.size} blobs copied`
)
}
const mismatches = []
for (const [hash, dbSize] of dbSizesByHash.entries()) {
if (gcsSizesByHash.get(hash) !== dbSize) {
mismatches.push(hash)
}
}
if (mismatches.length > 0) {
throw new Error(`blob size mismatch for hashes: ${mismatches.join(', ')}`)
}
}
async function getHistory(projectId, chunkId) {
const rawHistory = await historyStore.loadRaw(projectId, chunkId)
const history = History.fromRaw(rawHistory)
return history
}
async function getBlobHashes(projectId, chunkIds) {
const blobHashes = new Set()
for (const chunkId of chunkIds) {
const history = await getHistory(projectId, chunkId)
history.findBlobHashes(blobHashes)
}
return Array.from(blobHashes)
}
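// Copy a single blob from the global bucket to the project bucket. Global
// blobs are keyed as xx/yy/rest-of-hash; project blobs are keyed under the
// project key as xx/rest-of-hash. Returns the copied blob's size, or null if
// the project blob already exists.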
async function copyBlobInGcs(projectId, blobHash) {
const globalBlobKey = [
blobHash.slice(0, 2),
blobHash.slice(2, 4),
blobHash.slice(4),
].join('/')
const projectBlobKey = [
projectKey.format(projectId),
blobHash.slice(0, 2),
blobHash.slice(2),
].join('/')
const globalBlobObject = globalBucket.file(globalBlobKey)
const projectBlobObject = projectBucket.file(projectBlobKey)
// Check if the project blob exists
let projectBlobMetadata = null
try {
;[projectBlobMetadata] = await projectBlobObject.getMetadata()
} catch (err) {
if (err.code !== 404) {
throw err
}
}
  // Check whether the global blob exists
let globalBlobMetadata = null
try {
;[globalBlobMetadata] = await globalBlobObject.getMetadata()
} catch (err) {
if (err.code !== 404) {
throw err
}
}
if (projectBlobMetadata) {
// Project blob already exists. Compare the metadata if the global blob
// also exists and return early.
if (
globalBlobMetadata != null &&
(globalBlobMetadata.size !== projectBlobMetadata.size ||
globalBlobMetadata.md5Hash !== projectBlobMetadata.md5Hash)
) {
throw new Error(
`Project blob ${blobHash} in project ${projectId} doesn't match global blob`
)
}
return null
}
await globalBlobObject.copy(projectBlobObject)
// Paranoid check that the copy went well. The getMetadata() method returns
// an array, with the metadata in first position.
;[projectBlobMetadata] = await projectBlobObject.getMetadata()
if (
globalBlobMetadata.size !== projectBlobMetadata.size ||
globalBlobMetadata.md5Hash !== projectBlobMetadata.md5Hash
) {
throw new Error(`Failed to copy blob ${blobHash} to project ${projectId})`)
}
return parseInt(projectBlobMetadata.size, 10)
}
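// Copy blob rows from the blobs table into project_blobs for the given hashes
// in a single INSERT ... SELECT. ON CONFLICT DO NOTHING makes the copy
// idempotent; RETURNING only reports rows that were actually inserted.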
async function copyBlobsInDatabase(projectId, blobHashes) {
const blobSizesByHash = new Map()
if (blobHashes.length === 0) {
return blobSizesByHash
}
const binaryBlobHashes = blobHashes.map(hash => Buffer.from(hash, 'hex'))
const result = await knex.raw(
`INSERT INTO project_blobs (
project_id, hash_bytes, byte_length, string_length
)
SELECT ?, hash_bytes, byte_length, string_length
FROM blobs
WHERE hash_bytes IN (${binaryBlobHashes.map(_ => '?').join(',')})
ON CONFLICT (project_id, hash_bytes) DO NOTHING
RETURNING hash_bytes, byte_length`,
[projectId, ...binaryBlobHashes]
)
for (const row of result.rows) {
blobSizesByHash.set(row.hash_bytes.toString('hex'), row.byte_length)
}
return blobSizesByHash
}
main()
.then(() => {
process.exit()
})
.catch(err => {
console.error(err)
process.exit(1)
})


@@ -0,0 +1,36 @@
#!/usr/bin/env node
'use strict'
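/**
 * Delete old chunks via chunkStore.deleteOldChunks(). Exposed as a module
 * function and runnable as a CLI with the --batch-size, --max-batches,
 * --min-age and --timeout options.
 */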
const commandLineArgs = require('command-line-args')
const { chunkStore } = require('../')
async function deleteOldChunks(options) {
const deletedChunksTotal = await chunkStore.deleteOldChunks(options)
console.log(`Deleted ${deletedChunksTotal} old chunks`)
}
exports.deleteOldChunks = deleteOldChunks
if (require.main === module) {
const options = commandLineArgs([
{ name: 'batch-size', type: Number },
{ name: 'max-batches', type: Number },
{ name: 'min-age', type: Number },
{ name: 'timeout', type: Number },
{ name: 'verbose', type: Boolean, alias: 'v', defaultValue: false },
])
deleteOldChunks({
batchSize: options['batch-size'],
maxBatches: options['max-batches'],
timeout: options.timeout,
minAgeSecs: options['min-age'],
})
.then(() => {
process.exit()
})
.catch(err => {
console.error(err)
process.exit(1)
})
}


@@ -0,0 +1,156 @@
#!/usr/bin/env node
'use strict'
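/**
 * Remove out-of-order project structure versions and per-doc versions from a
 * project's latest chunk. Runs as a dry run unless --save is given.
 */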
const commandLineArgs = require('command-line-args')
const { chunkStore } = require('..')
main()
.then(() => {
process.exit(0)
})
.catch(err => {
console.error(err)
process.exit(1)
})
async function main() {
const opts = commandLineArgs([
{ name: 'project-ids', type: String, multiple: true, defaultOption: true },
{ name: 'save', type: Boolean, defaultValue: false },
{ name: 'help', type: Boolean, defaultValue: false },
])
if (opts.help || opts['project-ids'] == null) {
console.log('Usage: fix_duplicate_versions [--save] PROJECT_ID...')
process.exit()
}
for (const projectId of opts['project-ids']) {
await processProject(projectId, opts.save)
}
if (!opts.save) {
console.log('\nThis was a dry run. Re-run with --save to persist changes.')
}
}
async function processProject(projectId, save) {
console.log(`Project ${projectId}:`)
const chunk = await chunkStore.loadLatest(projectId)
let numChanges = 0
numChanges += removeDuplicateProjectVersions(chunk)
numChanges += removeDuplicateDocVersions(chunk)
console.log(` ${numChanges > 0 ? numChanges : 'no'} changes`)
if (save && numChanges > 0) {
await replaceChunk(projectId, chunk)
}
}
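// Strip the operations from project structure changes whose projectVersion
// does not increase relative to the previous project version seen.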
function removeDuplicateProjectVersions(chunk) {
let numChanges = 0
let lastVersion = null
const { snapshot, changes } = chunk.history
if (snapshot.projectVersion != null) {
lastVersion = snapshot.projectVersion
}
for (const change of changes) {
if (change.projectVersion == null) {
// Not a project structure change. Ignore.
continue
}
if (
lastVersion != null &&
!areProjectVersionsIncreasing(lastVersion, change.projectVersion)
) {
// Duplicate. Remove all ops
console.log(
` Removing out-of-order project structure change: ${change.projectVersion} <= ${lastVersion}`
)
change.setOperations([])
delete change.projectVersion
numChanges++
} else {
lastVersion = change.projectVersion
}
}
return numChanges
}
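// Remove operations and v2 doc version entries for docs whose version does not
// increase relative to the last version seen for that doc.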
function removeDuplicateDocVersions(chunk) {
let numChanges = 0
const lastVersions = new Map()
const { snapshot, changes } = chunk.history
if (snapshot.v2DocVersions != null) {
    // Seed with the doc versions recorded in the snapshot, keyed by doc id
    for (const [docId, { v }] of Object.entries(snapshot.v2DocVersions.data)) {
      lastVersions.set(docId, v)
}
}
for (const change of changes) {
if (change.v2DocVersions == null) {
continue
}
// Collect all docs that have problematic versions
const badPaths = []
const badDocIds = []
for (const [docId, { pathname, v }] of Object.entries(
change.v2DocVersions.data
)) {
const lastVersion = lastVersions.get(docId)
if (lastVersion != null && v <= lastVersion) {
// Duplicate. Remove ops related to that doc
console.log(
` Removing out-of-order change for doc ${docId} (${pathname}): ${v} <= ${lastVersion}`
)
badPaths.push(pathname)
badDocIds.push(docId)
numChanges++
} else {
lastVersions.set(docId, v)
}
}
// Remove bad operations
if (badPaths.length > 0) {
change.setOperations(
change.operations.filter(
op => op.pathname == null || !badPaths.includes(op.pathname)
)
)
}
// Remove bad v2 doc versions
for (const docId of badDocIds) {
delete change.v2DocVersions.data[docId]
}
}
return numChanges
}
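// Project versions are "major.minor" strings; v2 counts as increasing over v1
// only if the (major, minor) pair is strictly greater.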
function areProjectVersionsIncreasing(v1Str, v2Str) {
const v1 = parseProjectVersion(v1Str)
const v2 = parseProjectVersion(v2Str)
return v2.major > v1.major || (v2.major === v1.major && v2.minor > v1.minor)
}
function parseProjectVersion(version) {
const [major, minor] = version.split('.').map(x => parseInt(x, 10))
if (isNaN(major) || isNaN(minor)) {
throw new Error(`Invalid project version: ${version}`)
}
return { major, minor }
}
async function replaceChunk(projectId, chunk) {
const endVersion = chunk.getEndVersion()
const oldChunkId = await chunkStore.getChunkIdForVersion(
projectId,
endVersion
)
console.log(` Replacing chunk ${oldChunkId}`)
// The chunks table has a unique constraint on doc_id and end_version. Because
// we're replacing a chunk with the same end version, we need to destroy the
// chunk first.
await chunkStore.destroy(projectId, oldChunkId)
await chunkStore.create(projectId, chunk)
}


@@ -0,0 +1 @@
exports.deleteOldChunks = require('./delete_old_chunks').deleteOldChunks