first commit

commit ff9c54d5e4
2025-04-24 13:11:28 +08:00
5960 changed files with 834111 additions and 0 deletions


@@ -0,0 +1,109 @@
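/**
 * Back-fill the start_version column on the chunks table.
 *
 * For each chunk in the requested id range, the raw history is loaded from the
 * history store and start_version is computed as end_version minus the number
 * of changes in the chunk.
 *
 * Options: --min-id, --max-id, --batch-size, --concurrency
 */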
const commandLineArgs = require('command-line-args')
const BPromise = require('bluebird')
const timersPromises = require('node:timers/promises')
const { knex, historyStore } = require('..')
const MAX_POSTGRES_INTEGER = 2147483647
const DEFAULT_BATCH_SIZE = 1000
const DEFAULT_CONCURRENCY = 1
const MAX_RETRIES = 10
const RETRY_DELAY_MS = 5000
async function main() {
const options = parseOptions()
let batchStart = options.minId
while (batchStart <= options.maxId) {
const chunks = await getChunks(batchStart, options.maxId, options.batchSize)
if (chunks.length === 0) {
// No results. We're done.
break
}
const batchEnd = chunks[chunks.length - 1].id
await processBatch(chunks, options)
console.log(`Processed chunks ${batchStart} to ${batchEnd}`)
batchStart = batchEnd + 1
}
}
function parseOptions() {
const args = commandLineArgs([
{ name: 'min-id', type: Number, defaultValue: 1 },
{
name: 'max-id',
type: Number,
defaultValue: MAX_POSTGRES_INTEGER,
},
{ name: 'batch-size', type: Number, defaultValue: DEFAULT_BATCH_SIZE },
{ name: 'concurrency', type: Number, defaultValue: DEFAULT_CONCURRENCY },
])
return {
minId: args['min-id'],
maxId: args['max-id'],
batchSize: args['batch-size'],
concurrency: args.concurrency,
}
}
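// Fetch the next page of up to batchSize chunks with ids in [minId, maxId],
// ordered by id.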
async function getChunks(minId, maxId, batchSize) {
const chunks = await knex('chunks')
.where('id', '>=', minId)
.andWhere('id', '<=', maxId)
.orderBy('id')
.limit(batchSize)
return chunks
}
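// Process a batch of chunks with limited concurrency, retrying any chunks that
// fail until they all succeed or MAX_RETRIES is exceeded.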
async function processBatch(chunks, options) {
let retries = 0
while (true) {
const results = await BPromise.map(chunks, processChunk, {
concurrency: options.concurrency,
})
const failedChunks = results
.filter(result => !result.success)
.map(result => result.chunk)
if (failedChunks.length === 0) {
// All chunks processed. Carry on.
break
}
    // Some chunks failed. Retry.
retries += 1
if (retries > MAX_RETRIES) {
console.log('Too many retries processing chunks. Giving up.')
process.exit(1)
}
console.log(
`Retrying chunks: ${failedChunks.map(chunk => chunk.id).join(', ')}`
)
await timersPromises.setTimeout(RETRY_DELAY_MS)
chunks = failedChunks
}
}
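// Compute and persist the start version for a single chunk. The start version
// is the chunk's end version minus the number of changes in its raw history.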
async function processChunk(chunk) {
try {
const rawHistory = await historyStore.loadRaw(
chunk.doc_id.toString(),
chunk.id
)
const startVersion = chunk.end_version - rawHistory.changes.length
await knex('chunks')
.where('id', chunk.id)
.update({ start_version: startVersion })
return { chunk, success: true }
} catch (err) {
console.error(`Failed to process chunk ${chunk.id}:`, err.stack)
return { chunk, success: false }
}
}
main()
.then(() => {
process.exit()
})
.catch(err => {
console.error(err)
process.exit(1)
})


@@ -0,0 +1,107 @@
/**
* Compress changes for projects that have too many text operations.
*
* Usage:
*
* node tasks/compress_changes.js CSV_FILE
*
* where CSV_FILE contains a list of project ids in the first column
*/
const fs = require('node:fs')
const BPromise = require('bluebird')
const { History } = require('overleaf-editor-core')
const { historyStore, chunkStore } = require('..')
const CONCURRENCY = 10
async function main() {
const filename = process.argv[2]
const projectIds = await readCsv(filename)
const chunks = []
for (const projectId of projectIds) {
const chunkIds = await chunkStore.getProjectChunkIds(projectId)
chunks.push(...chunkIds.map(id => ({ id, projectId })))
}
let totalCompressed = 0
await BPromise.map(
chunks,
async chunk => {
try {
const history = await getHistory(chunk)
const numCompressed = compressChanges(history)
if (numCompressed > 0) {
await storeHistory(chunk, history)
console.log(
`Compressed project ${chunk.projectId}, chunk ${chunk.id}`
)
}
totalCompressed += numCompressed
} catch (err) {
console.log(err)
}
},
{ concurrency: CONCURRENCY }
)
console.log('CHANGES:', totalCompressed)
}
async function readCsv(filename) {
const csv = await fs.promises.readFile(filename, 'utf-8')
const lines = csv.trim().split('\n')
const projectIds = lines.map(line => line.split(',')[0])
return projectIds
}
async function getHistory(chunk) {
const rawHistory = await historyStore.loadRaw(chunk.projectId, chunk.id)
const history = History.fromRaw(rawHistory)
return history
}
async function storeHistory(chunk, history) {
const rawHistory = history.toRaw()
await historyStore.storeRaw(chunk.projectId, chunk.id, rawHistory)
}
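// Compose consecutive operations within each change where possible. Returns
// the number of changes whose operation list was shortened.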
function compressChanges(history) {
let numCompressed = 0
for (const change of history.getChanges()) {
const newOperations = compressOperations(change.operations)
if (newOperations.length !== change.operations.length) {
numCompressed++
}
change.setOperations(newOperations)
}
return numCompressed
}
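// Greedily compose adjacent operations: keep composing the current operation
// with the next one while canBeComposedWith() allows it, otherwise emit the
// current operation and start a new run with the next one.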
function compressOperations(operations) {
if (!operations.length) return []
const newOperations = []
let currentOperation = operations[0]
for (let operationId = 1; operationId < operations.length; operationId++) {
const nextOperation = operations[operationId]
if (currentOperation.canBeComposedWith(nextOperation)) {
currentOperation = currentOperation.compose(nextOperation)
} else {
// currentOperation and nextOperation cannot be composed. Push the
// currentOperation and start over with nextOperation.
newOperations.push(currentOperation)
currentOperation = nextOperation
}
}
newOperations.push(currentOperation)
return newOperations
}
main()
.then(() => {
process.exit()
})
.catch(err => {
console.error(err)
process.exit(1)
})


@@ -0,0 +1,294 @@
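/**
 * Copy project blobs out of the global blob store into per-project storage:
 * blob objects are copied from the global GCS bucket to the project bucket,
 * and the corresponding rows are copied from the blobs table into the
 * project_blobs table. Blobs listed in the global blobs file stay global and
 * are skipped.
 *
 * Options: --global-blobs (required), --min-project-id, --max-project-id,
 * --batch-size, --concurrency
 */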
#!/usr/bin/env node
const { promisify } = require('node:util')
const BPromise = require('bluebird')
const commandLineArgs = require('command-line-args')
const config = require('config')
const fs = require('node:fs')
const readline = require('node:readline')
const { History } = require('overleaf-editor-core')
const { knex, historyStore, persistor } = require('..')
const projectKey = require('../lib/project_key')
const MAX_POSTGRES_INTEGER = 2147483647
const DEFAULT_BATCH_SIZE = 1000
const MAX_RETRIES = 10
const RETRY_DELAY_MS = 5000
// Obtain a preconfigured GCS client through an undocumented property of
// object-persistor. Sorry about that. We need the GCS client because we use
// operations that are not implemented in object-persistor.
const gcsClient = persistor.storage
const globalBucket = gcsClient.bucket(config.get('blobStore.globalBucket'))
const projectBucket = gcsClient.bucket(config.get('blobStore.projectBucket'))
const delay = promisify(setTimeout)
async function main() {
const options = commandLineArgs([
{ name: 'global-blobs', type: String },
{ name: 'min-project-id', type: Number, defaultValue: 1 },
{
name: 'max-project-id',
type: Number,
defaultValue: MAX_POSTGRES_INTEGER,
},
{ name: 'batch-size', type: Number, defaultValue: DEFAULT_BATCH_SIZE },
{ name: 'concurrency', type: Number, defaultValue: 1 },
])
if (!options['global-blobs']) {
console.error(
'You must specify a global blobs file with the --global-blobs option'
)
process.exit(1)
}
const globalBlobs = await readGlobalBlobs(options['global-blobs'])
const minProjectId = options['min-project-id']
const maxProjectId = options['max-project-id']
const batchSize = options['batch-size']
const concurrency = options.concurrency
console.log(`Keeping ${globalBlobs.size} global blobs`)
await run({ globalBlobs, minProjectId, maxProjectId, batchSize, concurrency })
console.log('Done.')
}
async function readGlobalBlobs(filename) {
const stream = fs.createReadStream(filename)
const reader = readline.createInterface({
input: stream,
crlfDelay: Infinity,
})
const blobs = new Set()
for await (const line of reader) {
blobs.add(line.trim())
}
return blobs
}
async function run(options) {
const { globalBlobs, minProjectId, maxProjectId, batchSize, concurrency } =
options
let batchStart = minProjectId
while (batchStart <= maxProjectId) {
let projectIds = await getProjectIds(batchStart, maxProjectId, batchSize)
if (projectIds.length === 0) {
break
}
const batchEnd = projectIds[projectIds.length - 1]
console.log(`Processing projects ${batchStart} to ${batchEnd}`)
const chunkIdsByProject = await getChunkIdsByProject(projectIds)
let retries = 0
while (true) {
const results = await BPromise.map(
projectIds,
async projectId =>
await processProject(
projectId,
chunkIdsByProject.get(projectId),
globalBlobs
),
{ concurrency }
)
const failedProjectIds = results
.filter(result => !result.success)
.map(result => result.projectId)
if (failedProjectIds.length === 0) {
// All projects were copied successfully. Carry on.
break
}
// Some projects failed. Retry.
retries += 1
if (retries > MAX_RETRIES) {
console.log(
`Too many retries processing projects ${batchStart} to ${batchEnd}. Giving up.`
)
process.exit(1)
}
console.log(`Retrying projects: ${failedProjectIds.join(', ')}`)
await delay(RETRY_DELAY_MS)
projectIds = failedProjectIds
}
// Set up next batch
batchStart = batchEnd + 1
}
}
async function getProjectIds(minProjectId, maxProjectId, batchSize) {
const projectIds = await knex('chunks')
.distinct('doc_id')
.where('doc_id', '>=', minProjectId)
.andWhere('doc_id', '<=', maxProjectId)
.orderBy('doc_id')
.limit(batchSize)
.pluck('doc_id')
return projectIds
}
async function getChunkIdsByProject(projectIds) {
const chunks = await knex('chunks')
.select('id', { projectId: 'doc_id' })
.where('doc_id', 'in', projectIds)
const chunkIdsByProject = new Map()
for (const projectId of projectIds) {
chunkIdsByProject.set(projectId, [])
}
for (const chunk of chunks) {
chunkIdsByProject.get(chunk.projectId).push(chunk.id)
}
return chunkIdsByProject
}
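// Copy all non-global blobs referenced by a project's chunks, both in GCS and
// in the database, then verify that the two copies report the same sizes.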
async function processProject(projectId, chunkIds, globalBlobs) {
try {
const blobHashes = await getBlobHashes(projectId, chunkIds)
const projectBlobHashes = blobHashes.filter(hash => !globalBlobs.has(hash))
const gcsSizesByHash = new Map()
for (const blobHash of projectBlobHashes) {
const blobSize = await copyBlobInGcs(projectId, blobHash)
if (blobSize != null) {
gcsSizesByHash.set(blobHash, blobSize)
}
}
const dbSizesByHash = await copyBlobsInDatabase(
projectId,
projectBlobHashes
)
compareBlobSizes(gcsSizesByHash, dbSizesByHash)
return { projectId, success: true }
} catch (err) {
console.error(`Failed to process project ${projectId}:`, err.stack)
return { projectId, success: false }
}
}
function compareBlobSizes(gcsSizesByHash, dbSizesByHash) {
// Throw an error if the database doesn't report as many blobs as GCS
if (dbSizesByHash.size !== gcsSizesByHash.size) {
throw new Error(
`the database reported ${dbSizesByHash.size} blobs copied, but GCS reported ${gcsSizesByHash.size} blobs copied`
)
}
const mismatches = []
for (const [hash, dbSize] of dbSizesByHash.entries()) {
if (gcsSizesByHash.get(hash) !== dbSize) {
mismatches.push(hash)
}
}
if (mismatches.length > 0) {
throw new Error(`blob size mismatch for hashes: ${mismatches.join(', ')}`)
}
}
async function getHistory(projectId, chunkId) {
const rawHistory = await historyStore.loadRaw(projectId, chunkId)
const history = History.fromRaw(rawHistory)
return history
}
async function getBlobHashes(projectId, chunkIds) {
const blobHashes = new Set()
for (const chunkId of chunkIds) {
const history = await getHistory(projectId, chunkId)
history.findBlobHashes(blobHashes)
}
return Array.from(blobHashes)
}
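// Copy a single blob from the global bucket to the project bucket. Global
// blobs are keyed as xx/yy/rest-of-hash; project blobs are keyed under the
// project key as xx/rest-of-hash. Returns the copied blob's size, or null if
// the project blob already exists.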
async function copyBlobInGcs(projectId, blobHash) {
const globalBlobKey = [
blobHash.slice(0, 2),
blobHash.slice(2, 4),
blobHash.slice(4),
].join('/')
const projectBlobKey = [
projectKey.format(projectId),
blobHash.slice(0, 2),
blobHash.slice(2),
].join('/')
const globalBlobObject = globalBucket.file(globalBlobKey)
const projectBlobObject = projectBucket.file(projectBlobKey)
// Check if the project blob exists
let projectBlobMetadata = null
try {
;[projectBlobMetadata] = await projectBlobObject.getMetadata()
} catch (err) {
if (err.code !== 404) {
throw err
}
}
  // Check whether the global blob exists
let globalBlobMetadata = null
try {
;[globalBlobMetadata] = await globalBlobObject.getMetadata()
} catch (err) {
if (err.code !== 404) {
throw err
}
}
if (projectBlobMetadata) {
// Project blob already exists. Compare the metadata if the global blob
// also exists and return early.
if (
globalBlobMetadata != null &&
(globalBlobMetadata.size !== projectBlobMetadata.size ||
globalBlobMetadata.md5Hash !== projectBlobMetadata.md5Hash)
) {
throw new Error(
`Project blob ${blobHash} in project ${projectId} doesn't match global blob`
)
}
return null
}
await globalBlobObject.copy(projectBlobObject)
// Paranoid check that the copy went well. The getMetadata() method returns
// an array, with the metadata in first position.
;[projectBlobMetadata] = await projectBlobObject.getMetadata()
if (
globalBlobMetadata.size !== projectBlobMetadata.size ||
globalBlobMetadata.md5Hash !== projectBlobMetadata.md5Hash
) {
throw new Error(`Failed to copy blob ${blobHash} to project ${projectId})`)
}
return parseInt(projectBlobMetadata.size, 10)
}
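// Copy blob rows from the blobs table into project_blobs for the given hashes
// in a single INSERT ... SELECT. ON CONFLICT DO NOTHING makes the copy
// idempotent; RETURNING only reports rows that were actually inserted.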
async function copyBlobsInDatabase(projectId, blobHashes) {
const blobSizesByHash = new Map()
if (blobHashes.length === 0) {
return blobSizesByHash
}
const binaryBlobHashes = blobHashes.map(hash => Buffer.from(hash, 'hex'))
const result = await knex.raw(
`INSERT INTO project_blobs (
project_id, hash_bytes, byte_length, string_length
)
SELECT ?, hash_bytes, byte_length, string_length
FROM blobs
WHERE hash_bytes IN (${binaryBlobHashes.map(_ => '?').join(',')})
ON CONFLICT (project_id, hash_bytes) DO NOTHING
RETURNING hash_bytes, byte_length`,
[projectId, ...binaryBlobHashes]
)
for (const row of result.rows) {
blobSizesByHash.set(row.hash_bytes.toString('hex'), row.byte_length)
}
return blobSizesByHash
}
main()
.then(() => {
process.exit()
})
.catch(err => {
console.error(err)
process.exit(1)
})


@@ -0,0 +1,36 @@
#!/usr/bin/env node
'use strict'
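/**
 * Delete old chunks via chunkStore.deleteOldChunks(). Exposed as a module
 * function and runnable as a CLI with the --batch-size, --max-batches,
 * --min-age and --timeout options.
 */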
const commandLineArgs = require('command-line-args')
const { chunkStore } = require('../')
async function deleteOldChunks(options) {
const deletedChunksTotal = await chunkStore.deleteOldChunks(options)
console.log(`Deleted ${deletedChunksTotal} old chunks`)
}
exports.deleteOldChunks = deleteOldChunks
if (require.main === module) {
const options = commandLineArgs([
{ name: 'batch-size', type: Number },
{ name: 'max-batches', type: Number },
{ name: 'min-age', type: Number },
{ name: 'timeout', type: Number },
{ name: 'verbose', type: Boolean, alias: 'v', defaultValue: false },
])
deleteOldChunks({
batchSize: options['batch-size'],
maxBatches: options['max-batches'],
timeout: options.timeout,
minAgeSecs: options['min-age'],
})
.then(() => {
process.exit()
})
.catch(err => {
console.error(err)
process.exit(1)
})
}


@@ -0,0 +1,156 @@
#!/usr/bin/env node
'use strict'
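/**
 * Remove out-of-order project structure versions and per-doc versions from a
 * project's latest chunk. Runs as a dry run unless --save is given.
 */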
const commandLineArgs = require('command-line-args')
const { chunkStore } = require('..')
main()
.then(() => {
process.exit(0)
})
.catch(err => {
console.error(err)
process.exit(1)
})
async function main() {
const opts = commandLineArgs([
{ name: 'project-ids', type: String, multiple: true, defaultOption: true },
{ name: 'save', type: Boolean, defaultValue: false },
{ name: 'help', type: Boolean, defaultValue: false },
])
if (opts.help || opts['project-ids'] == null) {
console.log('Usage: fix_duplicate_versions [--save] PROJECT_ID...')
process.exit()
}
for (const projectId of opts['project-ids']) {
await processProject(projectId, opts.save)
}
if (!opts.save) {
console.log('\nThis was a dry run. Re-run with --save to persist changes.')
}
}
async function processProject(projectId, save) {
console.log(`Project ${projectId}:`)
const chunk = await chunkStore.loadLatest(projectId)
let numChanges = 0
numChanges += removeDuplicateProjectVersions(chunk)
numChanges += removeDuplicateDocVersions(chunk)
console.log(` ${numChanges > 0 ? numChanges : 'no'} changes`)
if (save && numChanges > 0) {
await replaceChunk(projectId, chunk)
}
}
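// Strip the operations from project structure changes whose projectVersion
// does not increase relative to the previous project version seen.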
function removeDuplicateProjectVersions(chunk) {
let numChanges = 0
let lastVersion = null
const { snapshot, changes } = chunk.history
if (snapshot.projectVersion != null) {
lastVersion = snapshot.projectVersion
}
for (const change of changes) {
if (change.projectVersion == null) {
// Not a project structure change. Ignore.
continue
}
if (
lastVersion != null &&
!areProjectVersionsIncreasing(lastVersion, change.projectVersion)
) {
// Duplicate. Remove all ops
console.log(
` Removing out-of-order project structure change: ${change.projectVersion} <= ${lastVersion}`
)
change.setOperations([])
delete change.projectVersion
numChanges++
} else {
lastVersion = change.projectVersion
}
}
return numChanges
}
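// Remove operations and v2 doc version entries for docs whose version does not
// increase relative to the last version seen for that doc.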
function removeDuplicateDocVersions(chunk) {
let numChanges = 0
const lastVersions = new Map()
const { snapshot, changes } = chunk.history
if (snapshot.v2DocVersions != null) {
    // Seed with the doc versions recorded in the snapshot, keyed by doc id
    for (const [docId, { v }] of Object.entries(snapshot.v2DocVersions.data)) {
      lastVersions.set(docId, v)
}
}
for (const change of changes) {
if (change.v2DocVersions == null) {
continue
}
// Collect all docs that have problematic versions
const badPaths = []
const badDocIds = []
for (const [docId, { pathname, v }] of Object.entries(
change.v2DocVersions.data
)) {
const lastVersion = lastVersions.get(docId)
if (lastVersion != null && v <= lastVersion) {
// Duplicate. Remove ops related to that doc
console.log(
` Removing out-of-order change for doc ${docId} (${pathname}): ${v} <= ${lastVersion}`
)
badPaths.push(pathname)
badDocIds.push(docId)
numChanges++
} else {
lastVersions.set(docId, v)
}
}
// Remove bad operations
if (badPaths.length > 0) {
change.setOperations(
change.operations.filter(
op => op.pathname == null || !badPaths.includes(op.pathname)
)
)
}
// Remove bad v2 doc versions
for (const docId of badDocIds) {
delete change.v2DocVersions.data[docId]
}
}
return numChanges
}
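// Project versions are "major.minor" strings; v2 counts as increasing over v1
// only if the (major, minor) pair is strictly greater.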
function areProjectVersionsIncreasing(v1Str, v2Str) {
const v1 = parseProjectVersion(v1Str)
const v2 = parseProjectVersion(v2Str)
return v2.major > v1.major || (v2.major === v1.major && v2.minor > v1.minor)
}
function parseProjectVersion(version) {
const [major, minor] = version.split('.').map(x => parseInt(x, 10))
if (isNaN(major) || isNaN(minor)) {
throw new Error(`Invalid project version: ${version}`)
}
return { major, minor }
}
async function replaceChunk(projectId, chunk) {
const endVersion = chunk.getEndVersion()
const oldChunkId = await chunkStore.getChunkIdForVersion(
projectId,
endVersion
)
console.log(` Replacing chunk ${oldChunkId}`)
// The chunks table has a unique constraint on doc_id and end_version. Because
// we're replacing a chunk with the same end version, we need to destroy the
// chunk first.
await chunkStore.destroy(projectId, oldChunkId)
await chunkStore.create(projectId, chunk)
}


@@ -0,0 +1 @@
exports.deleteOldChunks = require('./delete_old_chunks').deleteOldChunks