first commit

2025-04-24 13:11:28 +08:00
commit ff9c54d5e4
5960 changed files with 834111 additions and 0 deletions

File diff suppressed because it is too large.


@@ -0,0 +1,647 @@
// @ts-check
import Events from 'node:events'
import fs from 'node:fs'
import Stream from 'node:stream'
import { ObjectId } from 'mongodb'
import logger from '@overleaf/logger'
import OError from '@overleaf/o-error'
import { Blob } from 'overleaf-editor-core'
import {
BlobStore,
getStringLengthOfFile,
GLOBAL_BLOBS,
makeBlobForFile,
} from '../lib/blob_store/index.js'
import { db } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import readline from 'node:readline'
import { _blobIsBackedUp, backupBlob } from '../lib/backupBlob.mjs'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import filestorePersistor from '../lib/persistor.js'
import { setTimeout } from 'node:timers/promises'
// Silence warning.
Events.setMaxListeners(20)
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
/**
* @typedef {import("mongodb").Collection} Collection
* @typedef {import("mongodb").Collection<Project>} ProjectsCollection
* @typedef {import("mongodb").Collection<{project: Project}>} DeletedProjectsCollection
*/
/**
* @typedef {Object} FileRef
* @property {ObjectId} _id
* @property {string} hash
*/
/**
* @typedef {Object} Folder
* @property {Array<Folder>} folders
* @property {Array<FileRef>} fileRefs
*/
/**
* @typedef {Object} Project
* @property {ObjectId} _id
* @property {Array<Folder>} rootFolder
* @property {{history: {id: (number|string)}}} overleaf
*/
/**
* @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, FIX_MISSING_HASH: boolean, LOGS: string}}
*/
function parseArgs() {
const args = commandLineArgs([
{ name: 'fixNotFound', type: String, defaultValue: 'true' },
{ name: 'fixDeletePermission', type: String, defaultValue: 'true' },
{ name: 'fixHashMismatch', type: String, defaultValue: 'true' },
{ name: 'fixMissingHash', type: String, defaultValue: 'true' },
{ name: 'logs', type: String, defaultValue: '' },
])
/**
* commandLineArgs cannot handle --foo=false, so go the long way
* @param {string} name
* @return {boolean}
*/
function boolVal(name) {
const v = args[name]
if (['true', 'false'].includes(v)) return v === 'true'
throw new Error(`expected "true" or "false" for boolean option ${name}`)
}
return {
FIX_HASH_MISMATCH: boolVal('fixHashMismatch'),
FIX_DELETE_PERMISSION: boolVal('fixDeletePermission'),
FIX_NOT_FOUND: boolVal('fixNotFound'),
FIX_MISSING_HASH: boolVal('fixMissingHash'),
LOGS: args.logs,
}
}
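// Example invocation (a sketch; the script path and log file location are
// hypothetical, the flags are the ones parsed above and all fix flags default
// to "true"):
//   node <this-script>.mjs --logs=/tmp/file-errors.jsonl --fixNotFound=false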
const {
FIX_HASH_MISMATCH,
FIX_DELETE_PERMISSION,
FIX_NOT_FOUND,
FIX_MISSING_HASH,
LOGS,
} = parseArgs()
if (!LOGS) {
throw new Error('--logs parameter missing')
}
const BUFFER_DIR = fs.mkdtempSync(
process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
)
const USER_FILES_BUCKET_NAME = process.env.USER_FILES_BUCKET_NAME || ''
if (!USER_FILES_BUCKET_NAME) {
throw new Error('env var USER_FILES_BUCKET_NAME is missing')
}
// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode
const STREAM_HIGH_WATER_MARK = parseInt(
process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(),
10
)
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
/** @type {ProjectsCollection} */
const projectsCollection = db.collection('projects')
/** @type {DeletedProjectsCollection} */
const deletedProjectsCollection = db.collection('deletedProjects')
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated, draining queue')
}
class FileDeletedError extends OError {}
/** @type {Map<string,{project: Project, projectSoftDeleted: boolean}>} */
const PROJECT_CACHE = new Map()
/**
* @param {string} projectId
* @return {Promise<{project: Project, projectSoftDeleted: boolean}>}
*/
async function getProject(projectId) {
const cached = PROJECT_CACHE.get(projectId)
if (cached) return cached
let projectSoftDeleted
let project = await projectsCollection.findOne({
_id: new ObjectId(projectId),
})
if (project) {
projectSoftDeleted = false
} else {
const softDeleted = await deletedProjectsCollection.findOne({
'deleterData.deletedProjectId': new ObjectId(projectId),
project: { $exists: true },
})
if (!softDeleted) {
throw new OError('project hard-deleted')
}
project = softDeleted.project
projectSoftDeleted = true
}
PROJECT_CACHE.set(projectId, { projectSoftDeleted, project })
return { projectSoftDeleted, project }
}
/**
* @param {Folder} folder
* @param {string} fileId
* @return {{path: string, fileRef: FileRef, folder: Folder}|null}
*/
function getFileTreePath(folder, fileId) {
if (!folder) return null
let idx = 0
if (Array.isArray(folder.fileRefs)) {
for (const fileRef of folder.fileRefs) {
if (fileRef?._id.toString() === fileId) {
return {
fileRef,
path: `.fileRefs.${idx}`,
folder,
}
}
idx++
}
}
idx = 0
if (Array.isArray(folder.folders)) {
for (const child of folder.folders) {
const match = getFileTreePath(child, fileId)
if (match) {
return {
fileRef: match.fileRef,
folder: match.folder,
path: `.folders.${idx}${match.path}`,
}
}
idx++
}
}
return null
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<{fileRef: FileRef, folder: Folder, fullPath: string, query: Object, projectSoftDeleted: boolean}>}
*/
async function findFile(projectId, fileId) {
const { projectSoftDeleted, project } = await getProject(projectId)
const match = getFileTreePath(project.rootFolder[0], fileId)
if (!match) {
throw new FileDeletedError('file not found in file-tree', {
projectSoftDeleted,
})
}
const { path, fileRef, folder } = match
let fullPath
let query
if (projectSoftDeleted) {
fullPath = `project.rootFolder.0${path}`
query = {
'deleterData.deletedProjectId': new ObjectId(projectId),
[`${fullPath}._id`]: new ObjectId(fileId),
}
} else {
fullPath = `rootFolder.0${path}`
query = {
_id: new ObjectId(projectId),
[`${fullPath}._id`]: new ObjectId(fileId),
}
}
return {
projectSoftDeleted,
query,
fullPath,
fileRef,
folder,
}
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixNotFound(line) {
const { projectId, fileId, bucketName } = JSON.parse(line)
if (bucketName !== USER_FILES_BUCKET_NAME) {
throw new OError('not found case for another bucket')
}
const { projectSoftDeleted, query, fullPath, fileRef, folder } =
await findFile(projectId, fileId)
logger.info({ projectId, fileId, fileRef }, 'removing fileRef')
// Copied from _removeElementFromMongoArray (https://github.com/overleaf/internal/blob/11e09528c153de6b7766d18c3c90d94962190371/services/web/app/src/Features/Project/ProjectEntityMongoUpdateHandler.js)
const nonArrayPath = fullPath.slice(0, fullPath.lastIndexOf('.'))
let result
if (projectSoftDeleted) {
result = await deletedProjectsCollection.updateOne(query, {
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
$inc: { 'project.version': 1 },
})
} else {
result = await projectsCollection.updateOne(query, {
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
$inc: { version: 1 },
})
}
if (result.matchedCount !== 1) {
throw new OError('file-tree write did not match', { result })
}
// Update the cache. The mongo-path of the next file will be off otherwise.
folder.fileRefs = folder.fileRefs.filter(f => !f._id.equals(fileId))
return true
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<void>}
*/
async function setHashInMongo(projectId, fileId, hash) {
const { projectSoftDeleted, query, fullPath, fileRef } = await findFile(
projectId,
fileId
)
if (fileRef.hash === hash) return
logger.info({ projectId, fileId, fileRef, hash }, 'setting fileRef hash')
let result
if (projectSoftDeleted) {
result = await deletedProjectsCollection.updateOne(query, {
$set: { [`${fullPath}.hash`]: hash },
$inc: { 'project.version': 1 },
})
} else {
result = await projectsCollection.updateOne(query, {
$set: { [`${fullPath}.hash`]: hash },
$inc: { version: 1 },
})
}
if (result.matchedCount !== 1) {
throw new OError('file-tree write did not match', { result })
}
fileRef.hash = hash // Update cache for completeness.
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} historyId
* @return {Promise<void>}
*/
async function importRestoredFilestoreFile(projectId, fileId, historyId) {
const filestoreKey = `${projectId}/${fileId}`
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
let s
try {
s = await filestorePersistor.getObjectStream(
USER_FILES_BUCKET_NAME,
filestoreKey
)
} catch (err) {
if (err instanceof NotFoundError) {
throw new OError('missing blob, need to restore filestore file', {
filestoreKey,
})
}
throw err
}
await Stream.promises.pipeline(
s,
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
)
const blobStore = new BlobStore(historyId)
const blob = await blobStore.putFile(path)
await backupBlob(historyId, blob, path)
await setHashInMongo(projectId, fileId, blob.getHash())
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} path
* @return {Promise<Blob>}
*/
async function bufferFilestoreFileToDisk(projectId, fileId, path) {
const filestoreKey = `${projectId}/${fileId}`
try {
await Stream.promises.pipeline(
await filestorePersistor.getObjectStream(
USER_FILES_BUCKET_NAME,
filestoreKey
),
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
)
const blob = await makeBlobForFile(path)
blob.setStringLength(
await getStringLengthOfFile(blob.getByteLength(), path)
)
return blob
} catch (err) {
if (err instanceof NotFoundError) {
throw new OError('missing blob, need to restore filestore file', {
filestoreKey,
})
}
throw err
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<string>}
*/
async function computeFilestoreFileHash(projectId, fileId) {
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
return blob.getHash()
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<void>}
*/
async function uploadFilestoreFile(projectId, fileId) {
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
const hash = blob.getHash()
try {
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
} catch (err) {
if (!(err instanceof Blob.NotFoundError)) throw err
const { project } = await getProject(projectId)
const historyId = project.overleaf.history.id.toString()
const blobStore = new BlobStore(historyId)
await blobStore.putBlob(path, blob)
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixHashMismatch(line) {
const {
projectId,
fileId,
hash: computedHash,
entry: {
hash: fileTreeHash,
ctx: { historyId },
},
} = JSON.parse(line)
const blobStore = new BlobStore(historyId)
if (await blobStore.getBlob(fileTreeHash)) {
throw new OError('found blob with computed filestore object hash')
}
if (!(await blobStore.getBlob(computedHash))) {
await importRestoredFilestoreFile(projectId, fileId, historyId)
return true
}
return await ensureBlobExistsForFileAndUploadToAWS(
projectId,
fileId,
computedHash
)
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function hashAlreadyUpdatedInFileTree(projectId, fileId, hash) {
const { fileRef } = await findFile(projectId, fileId)
return fileRef.hash === hash
}
/**
* @param {string} projectId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function needsBackingUpToAWS(projectId, hash) {
if (GLOBAL_BLOBS.has(hash)) return false
return !(await _blobIsBackedUp(projectId, hash))
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) {
const { project } = await getProject(projectId)
const historyId = project.overleaf.history.id.toString()
const blobStore = new BlobStore(historyId)
if (
(await hashAlreadyUpdatedInFileTree(projectId, fileId, hash)) &&
(await blobStore.getBlob(hash)) &&
!(await needsBackingUpToAWS(projectId, hash))
) {
return false // already processed
}
const stream = await blobStore.getStream(hash)
const path = `${BUFFER_DIR}/${historyId}_${hash}`
try {
await Stream.promises.pipeline(
stream,
fs.createWriteStream(path, {
highWaterMark: STREAM_HIGH_WATER_MARK,
})
)
const writtenBlob = await makeBlobForFile(path)
writtenBlob.setStringLength(
await getStringLengthOfFile(writtenBlob.getByteLength(), path)
)
if (writtenBlob.getHash() !== hash) {
// Double check download, better safe than sorry.
throw new OError('blob corrupted', { writtenBlob })
}
let blob = await blobStore.getBlob(hash)
if (!blob) {
// Calling blobStore.putBlob would result in the same error again.
// HACK: Skip upload to GCS and finalize putBlob operation directly.
await blobStore.backend.insertBlob(historyId, writtenBlob)
}
await backupBlob(historyId, writtenBlob, path)
} finally {
await fs.promises.rm(path, { force: true })
}
await setHashInMongo(projectId, fileId, hash)
return true
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixDeletePermission(line) {
let { projectId, fileId, hash } = JSON.parse(line)
if (!hash) hash = await computeFilestoreFileHash(projectId, fileId)
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixMissingHash(line) {
let { projectId, _id: fileId } = JSON.parse(line)
const {
fileRef: { hash },
} = await findFile(projectId, fileId)
if (hash) {
// processed, double check
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
await uploadFilestoreFile(projectId, fileId)
return true
}
const CASES = {
'not found': {
match: 'NotFoundError',
flag: FIX_NOT_FOUND,
action: fixNotFound,
},
'hash mismatch': {
match: 'OError: hash mismatch',
flag: FIX_HASH_MISMATCH,
action: fixHashMismatch,
},
'delete permission': {
match: 'storage.objects.delete',
flag: FIX_DELETE_PERMISSION,
action: fixDeletePermission,
},
'missing file hash': {
match: '"bad file hash"',
flag: FIX_MISSING_HASH,
action: fixMissingHash,
},
}
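// Shape of the matched log lines, as consumed by the handlers above (field
// names taken from the JSON.parse destructuring in each fixer):
//   fixNotFound:         { projectId, fileId, bucketName }
//   fixHashMismatch:     { projectId, fileId, hash, entry: { hash, ctx: { historyId } } }
//   fixDeletePermission: { projectId, fileId, hash? }
//   fixMissingHash:      { projectId, _id }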
const STATS = {
processedLines: 0,
success: 0,
alreadyProcessed: 0,
fileDeleted: 0,
skipped: 0,
failed: 0,
unmatched: 0,
}
function logStats() {
console.log(
JSON.stringify({
time: new Date(),
gracefulShutdownInitiated,
...STATS,
})
)
}
setInterval(logStats, 10_000)
async function processLog() {
const rl = readline.createInterface({
input: fs.createReadStream(LOGS),
})
nextLine: for await (const line of rl) {
if (gracefulShutdownInitiated) break
STATS.processedLines++
if (
!(
line.includes('"failed to process file"') ||
// Process missing hashes as flagged by find_malformed_filetrees.mjs
line.includes('"bad file-tree path"')
)
) {
continue
}
for (const [name, { match, flag, action }] of Object.entries(CASES)) {
if (!line.includes(match)) continue
if (flag) {
try {
if (await action(line)) {
STATS.success++
} else {
STATS.alreadyProcessed++
}
} catch (err) {
if (err instanceof FileDeletedError) {
STATS.fileDeleted++
logger.info({ err, line }, 'file deleted, skipping')
} else {
STATS.failed++
logger.error({ err, line }, `failed to fix ${name}`)
}
}
} else {
STATS.skipped++
}
continue nextLine
}
STATS.unmatched++
logger.warn({ line }, 'unknown fatal error')
}
}
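// Exit code convention used by main() below:
//   0     everything matched was fixed or already processed
//   1-99  number of failed fixes (capped at 99)
//   100   no failures, but at least one line matched no known case
//   101   no failures or unmatched lines, but some fixes were skipped by flag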
async function main() {
try {
await processLog()
} finally {
logStats()
try {
await fs.promises.rm(BUFFER_DIR, { recursive: true, force: true })
} catch (err) {
console.error(`Cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
}
}
const { skipped, failed, unmatched } = STATS
await setTimeout(SLEEP_BEFORE_EXIT)
if (failed > 0) {
process.exit(Math.min(failed, 99))
} else if (unmatched > 0) {
process.exit(100)
} else if (skipped > 0) {
process.exit(101)
} else {
process.exit(0)
}
}
await main()

File diff suppressed because it is too large.


@@ -0,0 +1,173 @@
// @ts-check
import commandLineArgs from 'command-line-args'
import { backupBlob, downloadBlobToDir } from '../lib/backupBlob.mjs'
import withTmpDir from '../../api/controllers/with_tmp_dir.js'
import {
BlobStore,
GLOBAL_BLOBS,
loadGlobalBlobs,
} from '../lib/blob_store/index.js'
import assert from '../lib/assert.js'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import { setTimeout } from 'node:timers/promises'
import fs from 'node:fs'
await loadGlobalBlobs()
/**
* Gracefully shutdown the process
* @return {Promise<void>}
*/
async function gracefulShutdown() {
console.log('Gracefully shutting down')
await knex.destroy()
await client.close()
await redis.disconnect()
await setTimeout(100)
process.exit()
}
/**
*
* @param {string} row
* @return {BackupBlobJob}
*/
function parseCSVRow(row) {
const [historyId, hash] = row.split(',')
validateBackedUpBlobJob({ historyId, hash })
return { historyId, hash }
}
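// Example input row (hypothetical values: a 24-character hex history/project
// id followed by a 40-character hex blob hash):
//   5f0c1e2a3b4d5e6f7a8b9c0d,da39a3ee5e6b4b0d3255bfef95601890afd80709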
/**
*
* @param {BackupBlobJob} job
*/
function validateBackedUpBlobJob(job) {
assert.projectId(job.historyId)
assert.blobHash(job.hash)
}
/**
*
* @param {string} path
* @return {Promise<Array<BackupBlobJob>>}
*/
async function readCSV(path) {
let fh
/** @type {Array<BackupBlobJob>} */
const rows = []
try {
fh = await fs.promises.open(path, 'r')
} catch (error) {
console.error(`Could not open file: ${error}`)
throw error
}
for await (const line of fh.readLines()) {
try {
const row = parseCSVRow(line)
if (GLOBAL_BLOBS.has(row.hash)) {
console.log(`Skipping global blob: ${line}`)
continue
}
rows.push(row)
} catch (error) {
console.error(error instanceof Error ? error.message : error)
console.log(`Skipping invalid row: ${line}`)
}
}
return rows
}
/**
* @typedef {Object} BackupBlobJob
* @property {string} hash
* @property {string} historyId
*/
/**
* @param {Object} options
* @property {string} [options.historyId]
* @property {string} [options.hash]
* @property {string} [options.input]
* @return {Promise<Array<BackupBlobJob>>}
*/
async function initialiseJobs({ historyId, hash, input }) {
if (input) {
return await readCSV(input)
}
if (!historyId) {
console.error('historyId is required')
process.exitCode = 1
await gracefulShutdown()
}
if (!hash) {
console.error('hash is required')
process.exitCode = 1
await gracefulShutdown()
}
validateBackedUpBlobJob({ historyId, hash })
if (GLOBAL_BLOBS.has(hash)) {
console.error(`Blob ${hash} is a global blob; not backing up`)
process.exitCode = 1
await gracefulShutdown()
}
return [{ hash, historyId }]
}
/**
*
* @param {string} historyId
* @param {string} hash
* @return {Promise<void>}
*/
export async function downloadAndBackupBlob(historyId, hash) {
const blobStore = new BlobStore(historyId)
const blob = await blobStore.getBlob(hash)
if (!blob) {
throw new Error(`Blob ${hash} could not be loaded`)
}
await withTmpDir(`blob-${hash}`, async tmpDir => {
const filePath = await downloadBlobToDir(historyId, blob, tmpDir)
console.log(`Downloaded blob ${hash} to ${filePath}`)
await backupBlob(historyId, blob, filePath)
console.log('Backed up blob')
})
}
let jobs
const options = commandLineArgs([
{ name: 'historyId', type: String },
{ name: 'hash', type: String },
{ name: 'input', type: String },
])
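// Two ways to run this script (a sketch, derived from initialiseJobs above):
//   --input blobs.csv              back up every "historyId,hash" row in the CSV
//   --historyId ID --hash HASH     back up a single blob
// Global blobs are skipped in both modes.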
try {
jobs = await initialiseJobs(options)
} catch (error) {
console.error(error)
await gracefulShutdown()
}
if (!Array.isArray(jobs)) {
// This is mostly to satisfy typescript
process.exitCode = 1
await gracefulShutdown()
process.exit(1)
}
for (const { historyId, hash } of jobs) {
try {
await downloadAndBackupBlob(historyId, hash)
} catch (error) {
console.error(error)
process.exitCode = 1
}
}
await gracefulShutdown()


@@ -0,0 +1,153 @@
// @ts-check
import { ObjectId } from 'mongodb'
import { READ_PREFERENCE_SECONDARY } from '@overleaf/mongo-utils/batchedUpdate.js'
import { db, client } from '../lib/mongodb.js'
const projectsCollection = db.collection('projects')
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
// Configuration
const SAMPLE_SIZE_PER_ITERATION = process.argv[2]
? parseInt(process.argv[2], 10)
: 10000
const TARGET_ERROR_PERCENTAGE = process.argv[3]
? parseFloat(process.argv[3])
: 5.0
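// Usage (a sketch; the script path is omitted):
//   node <this-script>.mjs [sampleSizePerIteration] [targetErrorPercentage]
// defaulting to a 10000-document sample per iteration and a 5% target margin
// of error.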
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated')
}
async function takeSample(sampleSize) {
const results = await projectsCollection
.aggregate(
[
{ $sample: { size: sampleSize } },
{
$match: { 'overleaf.backup.lastBackedUpVersion': { $exists: true } },
},
{
$count: 'total',
},
],
{ readPreference: READ_PREFERENCE_SECONDARY }
)
.toArray()
const count = results[0]?.total || 0
return { totalSampled: sampleSize, backedUp: count }
}
function calculateStatistics(
cumulativeSampled,
cumulativeBackedUp,
totalPopulation
) {
const proportion = Math.max(1, cumulativeBackedUp) / cumulativeSampled
// Standard error with finite population correction
const fpc = Math.sqrt(
(totalPopulation - cumulativeSampled) / (totalPopulation - 1)
)
const stdError =
Math.sqrt((proportion * (1 - proportion)) / cumulativeSampled) * fpc
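// In symbols (the standard simple-random-sampling result, stated here for
// reference):
//   SE = sqrt(p * (1 - p) / n) * sqrt((N - n) / (N - 1))
// where p is the observed proportion, n the cumulative sample size and N the
// population size; the second factor is the finite population correction.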
// 95% confidence interval is approximately ±1.96 standard errors
const marginOfError = 1.96 * stdError
return {
proportion,
percentage: (proportion * 100).toFixed(2),
marginOfError,
errorPercentage: (marginOfError * 100).toFixed(2),
lowerBound: ((proportion - marginOfError) * 100).toFixed(2),
upperBound: ((proportion + marginOfError) * 100).toFixed(2),
sampleSize: cumulativeSampled,
populationSize: totalPopulation,
}
}
async function main() {
console.log('Date:', new Date().toISOString())
const totalCount = await projectsCollection.estimatedDocumentCount({
readPreference: READ_PREFERENCE_SECONDARY,
})
console.log(
`Total projects in collection (estimated): ${totalCount.toLocaleString()}`
)
console.log(`Target margin of error: ${TARGET_ERROR_PERCENTAGE}%`)
let cumulativeSampled = 0
let cumulativeBackedUp = 0
let currentError = Infinity
let iteration = 0
console.log('Iteration | Total Sampled | % Backed Up | Margin of Error')
console.log('----------|---------------|-------------|----------------')
while (currentError > TARGET_ERROR_PERCENTAGE) {
if (gracefulShutdownInitiated) {
console.log('Graceful shutdown initiated. Exiting sampling loop.')
break
}
iteration++
const { totalSampled, backedUp } = await takeSample(
SAMPLE_SIZE_PER_ITERATION
)
cumulativeSampled += totalSampled
cumulativeBackedUp += backedUp
const stats = calculateStatistics(
cumulativeSampled,
cumulativeBackedUp,
totalCount
)
currentError = parseFloat(stats.errorPercentage)
console.log(
`${iteration.toString().padStart(9)} | ` +
`${cumulativeSampled.toString().padStart(13)} | ` +
`${stats.percentage.padStart(10)}% | ` +
`\u00B1${stats.errorPercentage}%`
)
// Small delay between iterations
await new Promise(resolve => setTimeout(resolve, 100))
}
const finalStats = calculateStatistics(
cumulativeSampled,
cumulativeBackedUp,
totalCount
)
console.log(
`Projects sampled: ${cumulativeSampled.toLocaleString()} out of ${totalCount.toLocaleString()}`
)
console.log(
`Estimated percentage with lastBackedUpVersion: ${finalStats.percentage}%`
)
console.log(
`95% Confidence Interval: ${finalStats.lowerBound}% - ${finalStats.upperBound}%`
)
console.log(`Final Margin of Error: \u00B1${finalStats.errorPercentage}%`)
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exitCode = 1
})
.finally(() => {
client.close().catch(err => console.error('Error closing MongoDB:', err))
})


@@ -0,0 +1,429 @@
import Queue from 'bull'
import config from 'config'
import commandLineArgs from 'command-line-args'
import logger from '@overleaf/logger'
import {
listPendingBackups,
listUninitializedBackups,
getBackupStatus,
} from '../lib/backup_store/index.js'
logger.initialize('backup-queue')
// Use the same redis config as backup_worker
const redisOptions = config.get('redis.queue')
// Create a Bull queue named 'backup'
const backupQueue = new Queue('backup', {
redis: redisOptions,
defaultJobOptions: {
removeOnComplete: true,
removeOnFail: true,
},
})
// Define command-line options
const optionDefinitions = [
{ name: 'clean', type: Boolean },
{ name: 'status', type: Boolean },
{
name: 'add',
type: String,
multiple: true,
description: 'Project IDs or date range in YYYY-MM-DD:YYYY-MM-DD format',
},
{ name: 'monitor', type: Boolean },
{
name: 'queue-pending',
type: Number,
description:
'Find projects with pending changes older than N seconds and add them to the queue',
},
{
name: 'show-pending',
type: Number,
description:
'Show count of pending projects older than N seconds without adding to queue',
},
{
name: 'limit',
type: Number,
description: 'Limit the number of jobs to be added',
},
{
name: 'interval',
type: Number,
description: 'Time in seconds to spread jobs over (default: 300)',
defaultValue: 300,
},
{
name: 'backoff-delay',
type: Number,
description:
'Backoff delay in milliseconds for failed jobs (default: 1000)',
defaultValue: 1000,
},
{
name: 'attempts',
type: Number,
description: 'Number of retry attempts for failed jobs (default: 3)',
defaultValue: 3,
},
{
name: 'warn-threshold',
type: Number,
description: 'Warn about any project exceeding this pending age',
defaultValue: 2 * 3600, // 2 hours
},
{
name: 'verbose',
alias: 'v',
type: Boolean,
description: 'Show detailed information when used with --show-pending',
},
]
// Parse command line arguments
const options = commandLineArgs(optionDefinitions)
const WARN_THRESHOLD = options['warn-threshold']
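// Example invocations (a sketch; the script name is hypothetical, the flags
// are defined above):
//   node backup_queue.mjs --status
//   node backup_queue.mjs --queue-pending 3600 --limit 5000
//   node backup_queue.mjs --add 2024-01-01:2024-01-31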
// Helper to validate date format
function isValidDateFormat(dateStr) {
return /^\d{4}-\d{2}-\d{2}$/.test(dateStr)
}
// Helper to validate the pending time parameter
function validatePendingTime(option, value) {
if (typeof value !== 'number' || value <= 0) {
console.error(
`Error: --${option} requires a positive numeric TIME argument in seconds`
)
console.error(`Example: --${option} 3600`)
process.exit(1)
}
return value
}
// Helper to format the pending time display
function formatPendingTime(timestamp) {
const now = new Date()
const diffMs = now - timestamp
const seconds = Math.floor(diffMs / 1000)
return `${timestamp.toISOString()} (${seconds} seconds ago)`
}
// Helper to add a job to the queue, checking for duplicates
async function addJobWithCheck(queue, data, options) {
const jobId = options.jobId
// Check if the job already exists
const existingJob = await queue.getJob(jobId)
if (existingJob) {
return { job: existingJob, added: false }
} else {
const job = await queue.add(data, options)
return { job, added: true }
}
}
// Setup queue event listeners
function setupMonitoring() {
console.log('Starting queue monitoring. Press Ctrl+C to exit.')
backupQueue.on('global:error', error => {
logger.info({ error }, 'Queue error')
})
backupQueue.on('global:waiting', jobId => {
logger.info({ jobId }, 'job is waiting')
})
backupQueue.on('global:active', jobId => {
logger.info({ jobId }, 'job is now active')
})
backupQueue.on('global:stalled', jobId => {
logger.info({ jobId }, 'job has stalled')
})
backupQueue.on('global:progress', (jobId, progress) => {
logger.info({ jobId, progress }, 'job progress')
})
backupQueue.on('global:completed', (jobId, result) => {
logger.info({ jobId, result }, 'job completed')
})
backupQueue.on('global:failed', (jobId, err) => {
logger.info({ jobId, err }, 'job failed')
})
backupQueue.on('global:paused', () => {
logger.info({}, 'Queue paused')
})
backupQueue.on('global:resumed', () => {
logger.info({}, 'Queue resumed')
})
backupQueue.on('global:cleaned', (jobs, type) => {
logger.info({ jobsCount: jobs.length, type }, 'Jobs cleaned')
})
backupQueue.on('global:drained', () => {
logger.info({}, 'Queue drained')
})
backupQueue.on('global:removed', jobId => {
logger.info({ jobId }, 'Job removed')
})
}
async function addDateRangeJob(input) {
const [startDate, endDate] = input.split(':')
if (!isValidDateFormat(startDate) || !isValidDateFormat(endDate)) {
console.error(
`Invalid date format for "${input}". Use YYYY-MM-DD:YYYY-MM-DD`
)
return
}
const jobId = `backup-${startDate}-to-${endDate}`
const { job, added } = await addJobWithCheck(
backupQueue,
{ startDate, endDate },
{ jobId }
)
console.log(
`${added ? 'Added' : 'Already exists'}: date range backup job: ${startDate} to ${endDate}, job ID: ${job.id}`
)
}
// Helper to list pending and uninitialized backups
// This function combines the two cursors into a single generator
// to yield projects from both lists
async function* pendingCursor(timeIntervalMs, limit) {
for await (const project of listPendingBackups(timeIntervalMs, limit)) {
yield project
}
for await (const project of listUninitializedBackups(timeIntervalMs, limit)) {
yield project
}
}
// Process pending projects with changes older than the specified seconds
async function processPendingProjects(
age,
showOnly,
limit,
verbose,
jobInterval,
jobOpts = {}
) {
const timeIntervalMs = age * 1000
console.log(
`Finding projects with pending changes older than ${age} seconds${showOnly ? ' (count only)' : ''}`
)
let count = 0
let addedCount = 0
let existingCount = 0
// Pass the limit directly to MongoDB query for better performance
const changeTimes = []
for await (const project of pendingCursor(timeIntervalMs, limit)) {
const projectId = project._id.toHexString()
const pendingAt =
project.overleaf?.backup?.pendingChangeAt || project._id.getTimestamp()
if (pendingAt) {
changeTimes.push(pendingAt)
const pendingAge = Math.floor((Date.now() - pendingAt.getTime()) / 1000)
if (pendingAge > WARN_THRESHOLD) {
try {
const backupStatus = await getBackupStatus(projectId)
logger.warn(
{
projectId,
pendingAt,
pendingAge,
backupStatus,
warnThreshold: WARN_THRESHOLD,
},
`pending change exceeds rpo warning threshold`
)
} catch (err) {
logger.error(
{ projectId, pendingAt, pendingAge },
'Error getting backup status'
)
throw err
}
}
}
if (showOnly && verbose) {
console.log(
`Project: ${projectId} (pending since: ${formatPendingTime(pendingAt)})`
)
} else if (!showOnly) {
const delay = Math.floor(Math.random() * jobInterval * 1000) // add random delay to avoid all jobs running simultaneously
const { job, added } = await addJobWithCheck(
backupQueue,
{ projectId, pendingChangeAt: pendingAt.getTime() },
{ ...jobOpts, delay, jobId: projectId }
)
if (added) {
if (verbose) {
console.log(
`Added job for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
)
}
addedCount++
} else {
if (verbose) {
console.log(
`Job already exists for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
)
}
existingCount++
}
}
count++
if (count % 1000 === 0) {
console.log(
`Processed ${count} projects`,
showOnly ? '' : `(${addedCount} added, ${existingCount} existing)`
)
}
}
// Set oldestChange to undefined if there are no changes
const oldestChange =
changeTimes.length > 0
? changeTimes.reduce((min, time) => (time < min ? time : min))
: undefined
if (showOnly) {
console.log(
`Found ${count} projects with pending changes (not added to queue)`
)
} else {
console.log(`Found ${count} projects with pending changes:`)
console.log(` ${addedCount} jobs added to queue`)
console.log(` ${existingCount} jobs already existed in queue`)
if (oldestChange) {
console.log(` Oldest pending change: ${formatPendingTime(oldestChange)}`)
}
}
}
// Main execution block
async function run() {
const optionCount = [
options.clean,
options.status,
options.add,
options.monitor,
options['queue-pending'] !== undefined,
options['show-pending'] !== undefined,
].filter(Boolean).length
if (optionCount > 1) {
console.error('Only one option can be specified')
process.exit(1)
}
if (options.clean) {
const beforeCounts = await backupQueue.getJobCounts()
console.log('Current queue state:', JSON.stringify(beforeCounts))
console.log('Cleaning completed and failed jobs...')
await backupQueue.clean(1, 'completed')
await backupQueue.clean(1, 'failed')
const afterCounts = await backupQueue.getJobCounts()
console.log('Current queue state:', JSON.stringify(afterCounts))
console.log('Queue cleaned successfully')
} else if (options.status) {
const counts = await backupQueue.getJobCounts()
console.log('Current queue state:', JSON.stringify(counts))
} else if (options.add) {
const inputs = Array.isArray(options.add) ? options.add : [options.add]
for (const input of inputs) {
if (input.includes(':')) {
// Handle date range format
await addDateRangeJob(input)
} else {
// Handle project ID format
const { job, added } = await addJobWithCheck(
backupQueue,
{ projectId: input },
{ jobId: input }
)
console.log(
`${added ? 'Added' : 'Already exists'}: job for project: ${input}, job ID: ${job.id}`
)
}
}
} else if (options.monitor) {
setupMonitoring()
} else if (options['queue-pending'] !== undefined) {
const age = validatePendingTime('queue-pending', options['queue-pending'])
await processPendingProjects(
age,
false,
options.limit,
options.verbose,
options.interval,
{
attempts: options.attempts,
backoff: {
type: 'exponential',
delay: options['backoff-delay'],
},
}
)
} else if (options['show-pending'] !== undefined) {
const age = validatePendingTime('show-pending', options['show-pending'])
await processPendingProjects(age, true, options.limit, options.verbose)
} else {
console.log('Usage:')
console.log(' --clean Clean up completed and failed jobs')
console.log(' --status Show current job counts')
console.log(' --add [projectId] Add a job for the specified projectId')
console.log(
' --add [YYYY-MM-DD:YYYY-MM-DD] Add a job for the specified date range'
)
console.log(' --monitor Monitor queue events')
console.log(
' --queue-pending TIME Find projects with changes older than TIME seconds and add them to the queue'
)
console.log(
' --show-pending TIME Show count of pending projects older than TIME seconds'
)
console.log(' --limit N Limit the number of jobs to be added')
console.log(
' --interval TIME Time interval in seconds to spread jobs over'
)
console.log(
' --backoff-delay TIME Backoff delay in milliseconds for failed jobs (default: 1000)'
)
console.log(
' --attempts N Number of retry attempts for failed jobs (default: 3)'
)
console.log(
' --verbose, -v Show detailed information when used with --show-pending'
)
}
}
// Run and handle errors
run()
.catch(err => {
console.error('Error:', err)
process.exit(1)
})
.then(result => {
// Only exit if not in monitor mode
if (!options.monitor) {
process.exit(0)
}
})


@@ -0,0 +1,144 @@
import Queue from 'bull'
import logger from '@overleaf/logger'
import config from 'config'
import metrics from '@overleaf/metrics'
import {
backupProject,
initializeProjects,
configureBackup,
} from './backup.mjs'
const CONCURRENCY = 15
const WARN_THRESHOLD = 2 * 60 * 60 * 1000 // warn if projects are older than this
const redisOptions = config.get('redis.queue')
const JOB_TIME_BUCKETS = [10, 100, 500, 1000, 5000, 10000, 30000, 60000] // milliseconds
const LAG_TIME_BUCKETS_HRS = [
0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.75, 2, 3, 4, 5, 6,
] // hours
// Configure backup settings to match worker concurrency
configureBackup({ concurrency: 50, useSecondary: true })
// Create a Bull queue named 'backup'
const backupQueue = new Queue('backup', {
redis: redisOptions,
settings: {
lockDuration: 15 * 60 * 1000, // 15 minutes
lockRenewTime: 60 * 1000, // 1 minute
maxStalledCount: 0, // mark stalled jobs as failed
},
})
// Log queue events
backupQueue.on('active', job => {
logger.debug({ job }, 'job is now active')
})
backupQueue.on('completed', (job, result) => {
metrics.inc('backup_worker_job', 1, { status: 'completed' })
logger.debug({ job, result }, 'job completed')
})
backupQueue.on('failed', (job, err) => {
metrics.inc('backup_worker_job', 1, { status: 'failed' })
logger.error({ job, err }, 'job failed')
})
backupQueue.on('waiting', jobId => {
logger.debug({ jobId }, 'job is waiting')
})
backupQueue.on('error', error => {
logger.error({ error }, 'queue error')
})
backupQueue.on('stalled', job => {
logger.error({ job }, 'job has stalled')
})
backupQueue.on('lock-extension-failed', (job, err) => {
logger.error({ job, err }, 'lock extension failed')
})
backupQueue.on('paused', () => {
logger.info('queue paused')
})
backupQueue.on('resumed', () => {
logger.info('queue resumed')
})
// Process jobs
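// Two job payload shapes are accepted by the dispatch below:
//   { projectId, pendingChangeAt } - back up a single project
//   { startDate, endDate }         - initialize projects created in a date range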
backupQueue.process(CONCURRENCY, async job => {
const { projectId, startDate, endDate } = job.data
if (projectId) {
return await runBackup(projectId, job.data, job)
} else if (startDate && endDate) {
return await runInit(startDate, endDate)
} else {
throw new Error('invalid job data')
}
})
async function runBackup(projectId, data, job) {
const { pendingChangeAt } = data
// record the time it takes to run the backup job
const timer = new metrics.Timer(
'backup_worker_job_duration',
1,
{},
JOB_TIME_BUCKETS
)
const pendingAge = Date.now() - pendingChangeAt
if (pendingAge > WARN_THRESHOLD) {
logger.warn(
{ projectId, pendingAge, job },
'project has been pending for a long time'
)
}
try {
logger.debug({ projectId }, 'processing backup for project')
await backupProject(projectId, {})
metrics.inc('backup_worker_project', 1, {
status: 'success',
})
timer.done()
// record the replication lag (time from change to backup)
if (pendingChangeAt) {
metrics.histogram(
'backup_worker_replication_lag_in_hours',
(Date.now() - pendingChangeAt) / (3600 * 1000),
LAG_TIME_BUCKETS_HRS
)
}
return `backup completed ${projectId}`
} catch (err) {
metrics.inc('backup_worker_project', 1, { status: 'failed' })
logger.error({ projectId, err }, 'backup failed')
throw err // Re-throw to mark job as failed
}
}
async function runInit(startDate, endDate) {
try {
logger.info({ startDate, endDate }, 'initializing projects')
await initializeProjects({ 'start-date': startDate, 'end-date': endDate })
return `initialization completed ${startDate} - ${endDate}`
} catch (err) {
logger.error({ startDate, endDate, err }, 'initialization failed')
throw err
}
}
export async function drainQueue() {
logger.info({ queue: backupQueue.name }, 'pausing queue')
await backupQueue.pause(true) // pause this worker and wait for jobs to finish
logger.info({ queue: backupQueue.name }, 'closing queue')
await backupQueue.close()
}
export async function healthCheck() {
const count = await backupQueue.count()
metrics.gauge('backup_worker_queue_length', count)
}


@@ -0,0 +1,69 @@
/**
* A script to export the global blobs from mongo to a CSV file.
*
* node storage/scripts/export_global_blobs.mjs --output global_blobs.csv
*
* The output CSV has the following format:
*
* hash,path,byteLength,stringLength,demoted
*
* hash: the hash of the blob
* path: the path of the blob in the blob store
* byteLength: the byte length of the blob, or empty if unknown
* stringLength: the string length of the blob, or empty if unknown
* demoted: true if the blob has been demoted to a reference, false otherwise
*/
// @ts-check
import { ObjectId } from 'mongodb'
import { GLOBAL_BLOBS, loadGlobalBlobs } from '../lib/blob_store/index.js'
import { client } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import fs from 'node:fs'
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
function parseArgs() {
const args = commandLineArgs([
{
name: 'output',
type: String,
alias: 'o',
},
])
const OUTPUT_STREAM = fs.createWriteStream(args['output'], { flags: 'wx' })
return {
OUTPUT_STREAM,
}
}
const { OUTPUT_STREAM } = parseArgs()
async function main() {
await loadGlobalBlobs()
OUTPUT_STREAM.write('hash,path,byteLength,stringLength,demoted\n')
for (const [hash, { blob, demoted }] of GLOBAL_BLOBS) {
const { hash: blobHash, byteLength, stringLength } = blob
if (blobHash !== hash) {
throw new Error(`hash mismatch: ${hash} !== ${blobHash}`)
}
const path = blobHash.slice(0, 2) + '/' + blobHash.slice(2)
const byteLengthStr = byteLength === null ? '' : byteLength
const stringLengthStr = stringLength === null ? '' : stringLength
OUTPUT_STREAM.write(
`${hash},${path},${byteLengthStr},${stringLengthStr},${demoted}\n`
)
}
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exitCode = 1
})
.finally(() => {
client.close().catch(err => console.error('Error closing MongoDB:', err))
})


@@ -0,0 +1,51 @@
// @ts-check
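// Migrates backedUpBlobs records whose _id is still a string to ObjectId keys:
// the blobs array is merged into the ObjectId-keyed record with $addToSet and
// the old string-keyed record is deleted. Dry run by default; pass --commit to
// apply the changes.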
import { backedUpBlobs } from '../lib/mongodb.js'
import { mongoId } from '../lib/assert.js'
import { ObjectId } from 'mongodb'
import commandLineArgs from 'command-line-args'
const STATS = {
total: 0,
replaced: 0,
skipped: 0,
}
const config = commandLineArgs([
{ name: 'commit', type: Boolean, defaultValue: false },
])
async function processRecord(record) {
STATS.total++
try {
mongoId(record._id)
const newId = new ObjectId(record._id)
if (config.commit) {
await backedUpBlobs.updateOne(
{ _id: newId },
{
$addToSet: { blobs: { $each: record.blobs } },
},
{ upsert: true }
)
await backedUpBlobs.deleteOne({ _id: record._id })
}
STATS.replaced++
} catch (error) {
console.log(error)
STATS.skipped++
}
}
const cursor = backedUpBlobs
.find({ _id: { $type: 'string' } })
.project({ _id: 1, blobs: 1 })
while (await cursor.hasNext()) {
const record = await cursor.next()
await processRecord(record)
}
console.log(
`${!config.commit ? 'DRY RUN' : ''} ${STATS.total} records ${STATS.replaced} replaced, ${STATS.skipped} skipped`
)
process.exit()


@@ -0,0 +1,3 @@
UPDATE blobs
SET global = TRUE
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);


@@ -0,0 +1,16 @@
CREATE TABLE global_blobs (
hash_bytes bytea NOT NULL,
byte_length integer NOT NULL,
string_length integer,
global boolean,
CONSTRAINT global_blobs_pkey PRIMARY KEY (hash_bytes),
CONSTRAINT global_blobs_byte_length_non_negative
CHECK (byte_length >= 0),
CONSTRAINT global_blobs_string_length_non_negative
CHECK (string_length IS NULL OR string_length >= 0)
);
INSERT INTO global_blobs (hash_bytes, byte_length, string_length, global)
SELECT hash_bytes, byte_length, string_length, true
FROM blobs
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);


@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO old_blobs;
ALTER TABLE global_blobs RENAME TO blobs;
ALTER TABLE old_blobs
RENAME CONSTRAINT blobs_pkey TO old_blobs_pkey;
ALTER TABLE old_blobs
RENAME CONSTRAINT blobs_byte_length_non_negative
TO old_blobs_byte_length_non_negative;
ALTER TABLE old_blobs
RENAME CONSTRAINT blobs_string_length_non_negative
TO old_blobs_string_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT global_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
RENAME CONSTRAINT global_blobs_byte_length_non_negative
TO blobs_byte_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT global_blobs_string_length_non_negative
TO blobs_string_length_non_negative;
COMMIT;


@@ -0,0 +1,9 @@
Scripts in this directory were used when we cleaned up the global blobs table,
ensuring that it only contained global blobs. The scripts are meant to be run in this order:
* `01-create-blob-hashes-table.sql`
* `02-set-global-flag.sql`
* `03-create-global-blobs-table.sql`
* `04-swap-global-blob-tables.sql`
The `rollback.sql` can be run to reverse the effect of `04-swap-global-blob-tables.sql`.


@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO global_blobs;
ALTER TABLE old_blobs RENAME TO blobs;
ALTER TABLE global_blobs
RENAME CONSTRAINT blobs_pkey TO global_blobs_pkey;
ALTER TABLE global_blobs
RENAME CONSTRAINT blobs_byte_length_non_negative
TO global_blobs_byte_length_non_negative;
ALTER TABLE global_blobs
RENAME CONSTRAINT blobs_string_length_non_negative
TO global_blobs_string_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT old_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
RENAME CONSTRAINT old_blobs_byte_length_non_negative
TO blobs_byte_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT old_blobs_string_length_non_negative
TO blobs_string_length_non_negative;
COMMIT;


@@ -0,0 +1,379 @@
const fsPromises = require('node:fs/promises')
const { ObjectId } = require('mongodb')
const BPromise = require('bluebird')
const logger = require('@overleaf/logger')
const Settings = require('@overleaf/settings')
const rclient = require('@overleaf/redis-wrapper').createClient(
Settings.redis.documentupdater
)
const mongodb = require('../lib/mongodb')
const { chunkStore } = require('..')
const Events = require('node:events')
// Silence warning.
Events.setMaxListeners(20)
const BATCH_SIZE = 1000
const OPTIONS = {
concurrency: parseInt(process.env.DOC_VERSION_RECOVERY_CONCURRENCY, 10) || 20,
force: process.env.DOC_VERSION_RECOVERY_FORCE === 'true',
'skip-history-failures':
process.env.DOC_VERSION_RECOVERY_SKIP_HISTORY_FAILURES === 'true',
'resyncs-needed-file': process.env.DOC_VERSION_RECOVERY_RESYNCS_NEEDED_FILE,
}
const db = {
deletedProjects: mongodb.db.collection('deletedProjects'),
docs: mongodb.db.collection('docs'),
migrations: mongodb.db.collection('migrations'),
projects: mongodb.db.collection('projects'),
}
const BAD_MIGRATION_NAME =
'20231219081700_move_doc_versions_from_docops_to_docs'
const RECOVERY_FILES_502 = [
'/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log',
'/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log.done',
]
let loggingChain = Promise.resolve()
const projectIdsThatNeedResyncing = []
const unflushedDocIds = new Set()
async function flushLogQueue() {
const logPath = OPTIONS['resyncs-needed-file']
loggingChain = loggingChain.then(async () => {
const batch = projectIdsThatNeedResyncing.splice(0)
if (batch.length === 0) return
try {
await fsPromises.appendFile(logPath, batch.join('\n') + '\n')
} catch (err) {
projectIdsThatNeedResyncing.push(...batch)
logger.err({ err, logPath, batch }, 'Failed to write to log file')
}
})
await loggingChain
}
async function recordProjectNeedsResync(projectId) {
if (OPTIONS['resyncs-needed-file']) {
projectIdsThatNeedResyncing.push(projectId)
await flushLogQueue()
} else {
console.log(`Project ${projectId} needs a hard resync.`)
}
}
async function main() {
const recovery502Ran = await did502RecoveryRun()
await getUnflushedDocIds()
const badMigration = await db.migrations.findOne({ name: BAD_MIGRATION_NAME })
if (unflushedDocIds.size > 0 && !recovery502Ran && badMigration != null) {
// Tell customers that they need to flush
console.log(`
--------------------------------------------------------------------
Detected unflushed changes while recovering doc versions.
Please go back to version 5.0.1 and follow the recovery procedure
for flushing document updates:
https://github.com/overleaf/overleaf/wiki/Doc-version-recovery
--------------------------------------------------------------------`)
process.exit(1)
}
if (OPTIONS.force || recovery502Ran || badMigration != null) {
console.warn('Need to recover doc versions. This will take a while.')
await runRecovery()
await db.migrations.deleteOne({ name: BAD_MIGRATION_NAME })
await delete502RecoveryFiles()
}
console.log('Done.')
}
async function did502RecoveryRun() {
for (const file of RECOVERY_FILES_502) {
try {
await fsPromises.stat(file)
return true
} catch (err) {
// file doesn't exist. continue
}
}
return false
}
async function delete502RecoveryFiles() {
for (const file of RECOVERY_FILES_502) {
try {
await fsPromises.rename(file, file.replace('.log', '-5.0.2.log'))
} catch (err) {
// file doesn't exist. continue
}
}
}
async function runRecovery() {
let batch = []
const summary = {
ignored: 0,
skipped: 0,
deletedUpdatedMongo: 0,
deletedUpdatedRedis: 0,
deletedUpdatedBoth: 0,
deletedIgnored: 0,
updatedMongo: 0,
updatedRedis: 0,
updatedBoth: 0,
}
const processBatchAndLogProgress = async () => {
try {
await BPromise.map(batch, project => processProject(project, summary), {
concurrency: OPTIONS.concurrency,
})
} finally {
console.log(`${summary.updatedRedis} projects updated in Redis`)
console.log(`${summary.updatedMongo} projects updated in Mongo`)
console.log(
`${summary.updatedBoth} projects updated in both Mongo and Redis`
)
console.log(`${summary.ignored} projects had good versions`)
console.log(
`${summary.deletedUpdatedMongo} deleted projects updated in Mongo`
)
console.log(
`${summary.deletedUpdatedRedis} deleted projects updated in Redis`
)
console.log(
`${summary.deletedUpdatedBoth} deleted projects updated in both Mongo and Redis`
)
console.log(
`${summary.deletedIgnored} deleted projects had good versions`
)
console.log(`${summary.skipped} projects skipped`)
}
batch = []
}
await printDBStats()
await initResyncsNeededFile()
for await (const project of getProjects()) {
batch.push(project)
if (batch.length >= BATCH_SIZE) {
await processBatchAndLogProgress()
}
}
for await (const deletedProject of getDeletedProjects()) {
const project = deletedProject.project
project.isDeleted = true
batch.push(project)
if (batch.length >= BATCH_SIZE) {
await processBatchAndLogProgress()
}
}
if (batch.length > 0) {
await processBatchAndLogProgress()
}
await backfillMissingVersions()
}
async function getUnflushedDocIds() {
const batchSize = 1000
let cursor = '0'
do {
const [newCursor, keys] = await rclient.scan(
cursor,
'MATCH',
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: '*' }),
'COUNT',
batchSize
)
for (const key of keys) {
unflushedDocIds.add(key.slice('DocVersion:'.length))
}
cursor = newCursor
} while (cursor !== '0')
}
async function printDBStats() {
const projects = await db.projects.estimatedDocumentCount()
const deletedProjects = await db.deletedProjects.countDocuments()
const docs = await db.docs.estimatedDocumentCount()
console.log(
`Need to check ${projects} projects and up to ${deletedProjects} deleted projects with a total of ${docs} docs.`
)
}
async function initResyncsNeededFile() {
const logPath = OPTIONS['resyncs-needed-file']
if (logPath) {
await fsPromises.writeFile(logPath, '')
await fsPromises.rm(`${logPath}.done`, { force: true })
}
}
function getProjects() {
return db.projects.find({}, { projection: { _id: 1, overleaf: 1 } })
}
function getDeletedProjects() {
return db.deletedProjects.find(
{ 'project.overleaf.history.id': { $exists: true } },
{ projection: { 'project._id': 1, 'project.overleaf': 1 } }
)
}
async function processProject(project, summary) {
const projectId = project._id.toString()
let updatedMongo = false
let updatedRedis = false
try {
const historyDocVersions = await getHistoryDocVersions(project)
for (const { docId, version } of historyDocVersions) {
const update = await fixDocVersion(docId, version)
if (update != null) {
if (update.in === 'mongo') {
updatedMongo = true
} else if (update.in === 'redis') {
updatedRedis = true
}
}
}
if (project.isDeleted) {
if (updatedMongo && updatedRedis) {
summary.deletedUpdatedBoth += 1
} else if (updatedMongo) {
summary.deletedUpdatedMongo += 1
} else if (updatedRedis) {
summary.deletedUpdatedRedis += 1
} else {
summary.deletedIgnored += 1
}
} else {
await recordProjectNeedsResync(projectId)
if (updatedMongo && updatedRedis) {
summary.updatedBoth += 1
} else if (updatedMongo) {
summary.updatedMongo += 1
} else if (updatedRedis) {
summary.updatedRedis += 1
} else {
summary.ignored += 1
}
}
} catch (err) {
logger.error({ err, projectId }, 'Failed to process project')
if (OPTIONS['skip-history-failures']) {
summary.skipped += 1
} else {
throw err
}
}
}
async function getHistoryDocVersions(project) {
const historyId = project.overleaf.history.id
const chunk = await chunkStore.loadLatest(historyId)
if (chunk == null) {
return []
}
const snapshot = chunk.getSnapshot()
const changes = chunk.getChanges()
snapshot.applyAll(changes)
const v2DocVersions = snapshot.getV2DocVersions()
if (v2DocVersions == null) {
return []
}
return Object.entries(v2DocVersions.data).map(([docId, versionInfo]) => ({
docId,
version: versionInfo.v,
}))
}
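// Bring the stored doc version ahead of the version recovered from history:
// if the doc has unflushed changes in Redis and its Redis version is not
// already ahead, bump it there to historyVersion + 1; otherwise bump the Mongo
// version when it is missing or <= historyVersion. Returns where the fix was
// applied, or null if no update was needed.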
async function fixDocVersion(docId, historyVersion) {
const redisVersion = await getRedisDocVersion(docId)
if (redisVersion != null && historyVersion >= redisVersion) {
await setRedisDocVersion(docId, historyVersion + 1)
return {
in: 'redis',
previousVersion: redisVersion,
newVersion: historyVersion + 1,
}
} else {
const docBeforeUpdate = await db.docs.findOneAndUpdate(
{
_id: new ObjectId(docId),
$or: [
{ version: { $lte: historyVersion } },
{ version: { $exists: false } },
],
},
{ $set: { version: historyVersion + 1 } },
{ projection: { _id: 1, version: 1 } }
)
if (docBeforeUpdate != null) {
return {
in: 'mongo',
previousVersion: docBeforeUpdate.version,
newVersion: historyVersion + 1,
}
} else {
return null
}
}
}
async function getRedisDocVersion(docId) {
if (!unflushedDocIds.has(docId)) {
return null
}
const result = await rclient.get(
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId })
)
if (result == null) {
return null
}
return parseInt(result, 10)
}
async function setRedisDocVersion(docId, version) {
const multi = rclient.multi()
multi.set(
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId }),
version
)
multi.set(`UnflushedTime:{${docId}}`, Date.now(), 'NX')
await multi.exec()
}
/**
* Set all remaining versions to 0
*/
async function backfillMissingVersions() {
console.log('Defaulting version to 0 for remaining docs.')
await db.docs.updateMany(
{ version: { $exists: false } },
{ $set: { version: 0 } }
)
}
main()
.finally(async () => {
console.log('Flushing log queue.')
await flushLogQueue()
})
.then(() => {
process.exit(0)
})
.catch(err => {
console.error(err)
process.exit(1)
})


@@ -0,0 +1,255 @@
/**
* Try to recover a zip of the latest version of a project using only data in
* GCS, where this data may have been (recently) hard deleted (i.e. may exist
* wholly or in part as non-current versions). This should be able to
* retrieve the latest content of a project up to 180 days after it was
* deleted.
*
* Usage:
* node recover_zip.js [--verbose] <HISTORY_ID> <HISTORY_ID> ...
*
* Output:
* Signed URL(s) for the uploaded zip files. Note that these are valid for
* only 24h, to match the lifecycle rule on the zip bucket.
*/
const fs = require('node:fs')
const os = require('node:os')
const path = require('node:path')
const util = require('node:util')
// Something is registering 11 listeners, over the limit of 10, which generates
// a lot of warning noise.
require('node:events').EventEmitter.defaultMaxListeners = 11
const config = require('config')
// We depend on this via object-persistor.
// eslint-disable-next-line import/no-extraneous-dependencies
const { Storage } = require('@google-cloud/storage')
const isValidUtf8 = require('utf-8-validate')
const core = require('overleaf-editor-core')
const projectKey = require('../lib/project_key')
const streams = require('../lib/streams')
const ProjectArchive = require('../lib/project_archive')
const {
values: { verbose: VERBOSE },
positionals: HISTORY_IDS,
} = util.parseArgs({
options: {
verbose: {
type: 'boolean',
default: false,
},
},
allowPositionals: true,
})
if (HISTORY_IDS.length === 0) {
console.error('no history IDs; see usage')
process.exit(1)
}
async function listDeletedChunks(historyId) {
const bucketName = config.get('chunkStore.bucket')
const storage = new Storage()
const [files] = await storage.bucket(bucketName).getFiles({
prefix: projectKey.format(historyId),
versions: true,
})
return files
}
async function findLatestChunk(historyId) {
const files = await listDeletedChunks(historyId)
if (files.length === 0) return null
files.sort((a, b) => {
if (a.name < b.name) return -1
if (a.name > b.name) return 1
return 0
})
return files[files.length - 1]
}
async function downloadLatestChunk(tmp, historyId) {
const latestChunkFile = await findLatestChunk(historyId)
if (!latestChunkFile) throw new Error('no chunk found to recover')
const destination = path.join(tmp, 'latest.json')
await latestChunkFile.download({ destination })
return destination
}
async function loadHistory(historyPathname) {
const data = await fs.promises.readFile(historyPathname)
const rawHistory = JSON.parse(data)
return core.History.fromRaw(rawHistory)
}
async function loadChunk(historyPathname, blobStore) {
const history = await loadHistory(historyPathname)
const blobHashes = new Set()
history.findBlobHashes(blobHashes)
await blobStore.fetchBlobs(blobHashes)
await history.loadFiles('lazy', blobStore)
return new core.Chunk(history, 0)
}
// TODO: it would be nice to export / expose this from BlobStore;
// currently this is a copy of the method there.
async function getStringLengthOfFile(byteLength, pathname) {
// We have to read the file into memory to get its UTF-8 length, so don't
// bother for files that are too large for us to edit anyway.
if (byteLength > core.Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
return null
}
// We need to check if the file contains nonBmp or null characters
let data = await fs.promises.readFile(pathname)
if (!isValidUtf8(data)) return null
data = data.toString()
if (data.length > core.TextOperation.MAX_STRING_LENGTH) return null
if (core.util.containsNonBmpChars(data)) return null
if (data.indexOf('\x00') !== -1) return null
return data.length
}
class RecoveryBlobStore {
constructor(historyId, tmp) {
this.historyId = historyId
this.tmp = tmp
this.blobs = new Map()
}
async fetchBlobs(blobHashes) {
for await (const blobHash of blobHashes) {
await this.fetchBlob(blobHash)
}
}
async fetchBlob(hash) {
if (this.blobs.has(hash)) return
if (VERBOSE) console.log('fetching blob', hash)
const bucketName = config.get('blobStore.projectBucket')
const storage = new Storage()
const [files] = await storage.bucket(bucketName).getFiles({
prefix: this.makeProjectBlobKey(hash),
versions: true,
})
const destination = this.getBlobPathname(hash)
if (files.length === 0) {
await this.fetchGlobalBlob(hash, destination)
} else if (files.length === 1) {
await files[0].download({ destination })
} else {
throw new Error('Multiple versions of blob ' + hash)
}
this.blobs.set(hash, await this.makeBlob(hash, destination))
}
async fetchGlobalBlob(hash, destination) {
const bucketName = config.get('blobStore.globalBucket')
const storage = new Storage()
const file = storage.bucket(bucketName).file(this.makeGlobalBlobKey(hash))
await file.download({ destination })
}
async makeBlob(hash, pathname) {
const stat = await fs.promises.stat(pathname)
const byteLength = stat.size
const stringLength = await getStringLengthOfFile(byteLength, pathname)
return new core.Blob(hash, byteLength, stringLength)
}
async getString(hash) {
const stream = await this.getStream(hash)
const buffer = await streams.readStreamToBuffer(stream)
return buffer.toString()
}
async getStream(hash) {
return fs.createReadStream(this.getBlobPathname(hash))
}
async getBlob(hash) {
return this.blobs.get(hash)
}
getBlobPathname(hash) {
return path.join(this.tmp, hash)
}
makeGlobalBlobKey(hash) {
return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
}
makeProjectBlobKey(hash) {
return `${projectKey.format(this.historyId)}/${hash.slice(
0,
2
)}/${hash.slice(2)}`
}
}
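// Key layouts produced by the helpers above (illustrative, for a hypothetical
// hash "aabbccdd..."):
//   global blob key:  aa/bb/ccdd...
//   project blob key: <projectKey.format(historyId)>/aa/bbccdd...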
async function uploadZip(historyId, zipPathname) {
const bucketName = config.get('zipStore.bucket')
const deadline = 24 * 3600 * 1000 // lifecycle limit on the zips bucket
const storage = new Storage()
const destination = `${historyId}-recovered.zip`
await storage.bucket(bucketName).upload(zipPathname, { destination })
const signedUrls = await storage
.bucket(bucketName)
.file(destination)
.getSignedUrl({
version: 'v4',
action: 'read',
expires: Date.now() + deadline,
})
return signedUrls[0]
}
async function restoreProject(historyId) {
const tmp = await fs.promises.mkdtemp(
path.join(os.tmpdir(), historyId.toString())
)
if (VERBOSE) console.log('recovering', historyId, 'in', tmp)
const latestJsonPathname = await downloadLatestChunk(tmp, historyId)
const blobStore = new RecoveryBlobStore(historyId, tmp)
const chunk = await loadChunk(latestJsonPathname, blobStore)
const snapshot = chunk.getSnapshot()
for (const change of chunk.getChanges()) {
change.applyTo(snapshot)
}
if (VERBOSE) console.log('zipping', historyId)
const zipPathname = path.join(tmp, `${historyId}.zip`)
const zipTimeoutMs = 60 * 1000
const archive = new ProjectArchive(snapshot, zipTimeoutMs)
await archive.writeZip(blobStore, zipPathname)
if (VERBOSE) console.log('uploading', historyId)
return await uploadZip(historyId, zipPathname)
}
async function main() {
for (const historyId of HISTORY_IDS) {
const signedUrl = await restoreProject(historyId)
console.log(signedUrl)
}
}
main().catch(console.error)

View File

@@ -0,0 +1,36 @@
import redis from '@overleaf/redis-wrapper'
import config from 'config'
// Get allowed Redis dbs from config
const redisConfig = config.get('redis')
const allowedDbs = Object.keys(redisConfig)
// Get the Redis db name from the command line argument
const db = process.argv[2]
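// e.g. `node redis.mjs history` (assuming a db named "history" exists in the config)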
// Validate redis db
if (!allowedDbs.includes(db)) {
if (db) {
console.error('Invalid redis db:', db)
}
console.error(`Usage: node redis.mjs [${allowedDbs.join('|')}]`)
process.exit(1)
}
// Get redis options based on command line argument
const redisOptions = config.get(`redis.${db}`)
console.log('Using redis db:', db)
console.log('REDIS CONFIG', {
...redisOptions,
password: '*'.repeat(redisOptions.password?.length),
})
const rclient = redis.createClient(redisOptions)
try {
await rclient.healthCheck()
console.log('REDIS HEALTHCHECK SUCCEEDED')
} catch (error) {
console.error('REDIS HEALTHCHECK FAILED', error)
} finally {
await rclient.quit()
}

View File

@@ -0,0 +1,104 @@
// @ts-check
import { readFileSync } from 'node:fs'
import commandLineArgs from 'command-line-args'
import { client } from '../lib/mongodb.js'
import {
getBackedUpBlobHashes,
unsetBackedUpBlobHashes,
} from '../lib/backup_store/index.js'
let gracefulShutdownInitiated = false
// Parse command line arguments
const args = commandLineArgs([
{ name: 'input', type: String, alias: 'i', defaultOption: true },
  { name: 'commit', type: Boolean, defaultValue: false },
])
if (!args.input) {
console.error(
'Usage: node remove_backed_up_blobs.mjs --input <csv-file> [--commit]'
)
process.exit(1)
}
if (!args.commit) {
console.log('Running in dry-run mode. Use --commit to apply changes.')
}
// Signal handling
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
console.warn('Graceful shutdown initiated')
gracefulShutdownInitiated = true
}
// Process CSV and remove blobs
async function main() {
const projectBlobs = new Map()
const lines = readFileSync(args.input, 'utf8').split('\n')
const SHA1_HEX_REGEX = /^[a-f0-9]{40}$/
// Skip header
for (const line of lines.slice(1)) {
if (!line.trim() || gracefulShutdownInitiated) break
const [projectId, path] = line.split(',')
const pathParts = path.split('/')
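    // The path appears to be a makeProjectKey() result, i.e.
    // <three-segment history key>/<hash[0:2]>/<hash[2:]>, so re-joining the
    // last two segments recovers the full 40-character blob hash.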
const hash = pathParts[3] + pathParts[4]
if (!SHA1_HEX_REGEX.test(hash)) {
console.warn(`Invalid SHA1 hash for project ${projectId}: ${hash}`)
continue
}
if (!projectBlobs.has(projectId)) {
projectBlobs.set(projectId, new Set())
}
projectBlobs.get(projectId).add(hash)
}
// Process each project
for (const [projectId, hashes] of projectBlobs) {
if (gracefulShutdownInitiated) break
if (!args.commit) {
console.log(
`DRY-RUN: would remove ${hashes.size} blobs from project ${projectId}`
)
continue
}
try {
const originalHashes = await getBackedUpBlobHashes(projectId)
if (originalHashes.size === 0) {
continue
}
const result = await unsetBackedUpBlobHashes(
projectId,
Array.from(hashes)
)
if (result) {
console.log(
`Project ${projectId}: want to remove ${hashes.size}, removed ${originalHashes.size - result.blobs.length}, ${result.blobs.length} remaining`
)
}
} catch (err) {
console.error(`Error updating project ${projectId}:`, err)
}
}
}
// Run the script
main()
.catch(err => {
console.error('Fatal error:', err)
process.exitCode = 1
})
.finally(() => {
client
.close()
.catch(err => console.error('Error closing MongoDB connection:', err))
})

View File

@@ -0,0 +1,221 @@
// @ts-check
/**
* This script is used to remove blobs that have been backed up under the project ID
* instead of the history ID (where those are different).
*
* This script reads a CSV file with the following format:
* ```
* project_id,hash
* <mongo ID>,<hash>
* ```
*
* The header row is optional. All rows will be checked for conformance to the format.
*/
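// Example row (hypothetical values):
//   507f191e810c19729de860ea,0123456789abcdef0123456789abcdef01234567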
import commandLineArgs from 'command-line-args'
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
import { makeProjectKey } from '../lib/blob_store/index.js'
import fs from 'node:fs'
import assert from '../lib/assert.js'
import { client } from '../lib/mongodb.js'
import { verifyBlobs } from '../lib/backupVerifier.mjs'
import { setTimeout } from 'node:timers/promises'
import { getHistoryId } from '../lib/backup_store/index.js'
const argsSchema = [
{
name: 'input',
type: String,
},
{
name: 'commit',
type: Boolean,
},
{
name: 'header',
type: Boolean,
},
{
name: 'force',
type: Boolean,
},
{
name: 'verbose',
type: Boolean,
},
]
const args = commandLineArgs(argsSchema)
async function gracefulClose(code = 0) {
await client.close()
process.exit(code)
}
/**
*
* @param {(value: unknown) => void} fn
* @param {unknown} value
* @return {boolean}
*/
function not(fn, value) {
try {
fn(value)
return false
} catch {
return true
}
}
/**
*
* @param {string} row
* @return {{projectId: string, hash: string}}
*/
function parseCSVRow(row) {
const [projectId, hash] = row.split(',')
assert.mongoId(projectId, `invalid projectId ${projectId}`)
assert.blobHash(hash, `invalid hash ${hash}`)
return { projectId, hash }
}
/**
*
* @param {string} path
* @param {boolean} hasHeader
* @return {AsyncGenerator<{projectId: string, hash: string}, void, *>}
*/
async function* readCSV(path, hasHeader) {
let seenHeader = !hasHeader
let fh
try {
fh = await fs.promises.open(path, 'r')
} catch (error) {
console.error(`Could not open file: ${error}`)
return await gracefulClose(1)
}
for await (const line of fh.readLines()) {
if (!seenHeader) {
const [first, second] = line.split(',')
const noDataInHeader =
not(assert.mongoId, first) && not(assert.blobHash, second)
if (!noDataInHeader) {
console.error('Data found in header row')
return await gracefulClose(1)
}
seenHeader = true
continue
}
try {
yield parseCSVRow(line)
} catch (error) {
console.error(error instanceof Error ? error.message : error)
console.info(`Skipping invalid row: ${line}`)
}
}
}
function usage() {
console.info(
'Usage: remove_blobs_from_backup.mjs --input <path> [--commit] [--header] [--force] [--verbose]'
)
}
if (!args.input) {
console.error('--input was missing')
usage()
await gracefulClose(1)
}
/**
*
* @param {string} projectId
* @param {string} hash
* @return {Promise<void>}
*/
async function deleteBlob(projectId, hash) {
const path = makeProjectKey(projectId, hash)
if (args.commit) {
await backupPersistor.deleteObject(projectBlobsBucket, path)
} else {
console.log(`DELETE: ${path}`)
}
}
/**
*
* @param {string} projectId
* @param {string} hash
* @return {Promise<void>}
*/
async function canDeleteBlob(projectId, hash) {
let historyId
try {
historyId = await getHistoryId(projectId)
} catch (error) {
if (args.verbose) {
console.error(error)
}
throw new Error(`No history ID found for project ${projectId}, skipping`)
}
if (historyId === projectId) {
throw new Error(
`Project ID and history ID are the same for ${projectId} - use --force to delete anyway`
)
}
// TODO: fix assert.postgresId to handle integers better and then stop coercing to string below
assert.postgresId(
`${historyId}`,
`History ID ${historyId} does not appear to be for a postgres project`
)
try {
await verifyBlobs(`${historyId}`, [hash])
} catch (error) {
if (args.verbose) {
console.error(error)
}
throw new Error(
`Blob ${hash} is not backed up for project ${projectId} - use --force to delete anyway`
)
}
}
if (!args.commit) {
console.log('DRY RUN: provide --commit to perform operations')
}
if (args.force) {
console.log(
'WARNING: --force is enabled, blobs will be deleted regardless of backup status'
)
await setTimeout(5_000)
}
let deleted = 0
let errors = 0
for await (const { projectId, hash } of readCSV(args.input, args.header)) {
if (!args.force) {
try {
await canDeleteBlob(projectId, hash)
} catch (error) {
console.error(error instanceof Error ? error.message : error)
continue
}
}
try {
await deleteBlob(projectId, hash)
deleted++
} catch (error) {
errors++
console.error(error)
}
}
console.log(`Deleted: ${deleted}`)
console.log(`Errors: ${errors}`)
await gracefulClose()

View File

@@ -0,0 +1,254 @@
import commandLineArgs from 'command-line-args'
import {
loadAtVersion,
getChunkMetadataForVersion,
getProjectChunksFromVersion,
} from '../lib/chunk_store/index.js'
import { client } from '../lib/mongodb.js'
import knex from '../lib/knex.js'
import redis from '../lib/redis.js'
import {
loadGlobalBlobs,
BlobStore,
makeProjectKey,
} from '../lib/blob_store/index.js'
import { TextDecoder } from 'node:util'
import {
backupPersistor,
chunksBucket,
projectBlobsBucket,
} from '../lib/backupPersistor.mjs'
import fs from 'node:fs'
import { pipeline } from 'node:stream/promises'
import os from 'node:os'
import path from 'node:path'
import { createHash } from 'node:crypto'
import projectKey from '../lib/project_key.js'
import { createGunzip } from 'node:zlib'
import { text } from 'node:stream/consumers'
const optionDefinitions = [
{ name: 'historyId', alias: 'p', type: String },
{ name: 'version', alias: 'v', type: Number },
{ name: 'blob', alias: 'b', type: String },
{ name: 'remote', alias: 'r', type: Boolean },
{ name: 'keep', alias: 'k', type: Boolean },
]
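// Typical invocations (illustrative):
//   node <this-script> -p <historyId>                 list chunk records
//   node <this-script> -p <historyId> -v <version>    dump the chunk containing <version>
//   node <this-script> -p <historyId> -b <blobHash>   dump a blob (-r reads the remote backup, -k keeps the temp file)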
function makeChunkKey(projectId, startVersion) {
return path.join(projectKey.format(projectId), projectKey.pad(startVersion))
}
async function listChunks(historyId) {
for await (const chunkRecord of getProjectChunksFromVersion(historyId, 0)) {
console.log('Chunk record:', chunkRecord)
}
}
async function fetchChunkLocal(historyId, version) {
const chunkRecord = await getChunkMetadataForVersion(historyId, version)
const chunk = await loadAtVersion(historyId, version)
return { key: version, chunk, metadata: chunkRecord, source: 'local storage' }
}
async function fetchChunkRemote(historyId, version) {
const chunkRecord = await getChunkMetadataForVersion(historyId, version)
const startVersion = chunkRecord.startVersion
const key = makeChunkKey(historyId, startVersion)
const backupPersistorForProject = await backupPersistor.forProject(
chunksBucket,
key
)
const backupChunkStream = await backupPersistorForProject.getObjectStream(
chunksBucket,
key
)
const backupStr = await text(backupChunkStream.pipe(createGunzip()))
return {
key,
chunk: JSON.parse(backupStr),
metadata: chunkRecord,
source: 'remote backup',
}
}
async function displayChunk(historyId, version, options) {
const { key, chunk, metadata, source } = await (options.remote
? fetchChunkRemote(historyId, version)
: fetchChunkLocal(historyId, version))
console.log('Source:', source)
console.log('Chunk record', metadata)
console.log('Key', key)
// console.log('Number of changes', chunk.getChanges().length)
console.log(JSON.stringify(chunk))
}
async function fetchBlobRemote(historyId, blobHash) {
const backupPersistorForProject = await backupPersistor.forProject(
projectBlobsBucket,
makeProjectKey(historyId, '')
)
const blobKey = makeProjectKey(historyId, blobHash)
return {
stream: await backupPersistorForProject.getObjectStream(
projectBlobsBucket,
blobKey,
{ autoGunzip: true }
),
metadata: { hash: blobHash },
source: 'remote backup',
}
}
async function fetchBlobLocal(historyId, blobHash) {
const blobStore = new BlobStore(historyId)
const blob = await blobStore.getBlob(blobHash)
if (!blob) throw new Error(`Blob ${blobHash} not found`)
return {
stream: await blobStore.getStream(blobHash),
metadata: blob,
source: 'local storage',
}
}
async function displayBlobContent(filepath, metadata, source, blobHash) {
console.log('Source:', source)
console.log('Blob metadata:', metadata)
// Compute git hash using streaming
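  // (a git blob object hash is sha1 of the header "blob <byte length>\0"
  // followed by the raw file content, as constructed below)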
const stat = fs.statSync(filepath)
const header = `blob ${stat.size}\0`
const hash = createHash('sha1')
hash.update(header)
const hashStream = fs.createReadStream(filepath)
for await (const chunk of hashStream) {
hash.update(chunk)
}
const gitHash = hash.digest('hex')
// Check content type and display preview
const fd = fs.openSync(filepath, 'r')
try {
const headBuf = Buffer.alloc(16)
const tailBuf = Buffer.alloc(16)
try {
      // Stream the file through TextDecoder to check for valid UTF-8
const textStream = fs.createReadStream(filepath)
const decoder = new TextDecoder('utf-8', { fatal: true })
for await (const chunk of textStream) {
decoder.decode(chunk, { stream: true })
}
decoder.decode()
// If we get here, it's valid UTF-8
if (stat.size <= 1024) {
        console.log('Content (text):', fs.readFileSync(filepath, 'utf8'))
} else {
console.log('Content (text, truncated):')
console.log(` Length: ${stat.size} bytes`)
fs.readSync(fd, headBuf, 0, 16, 0)
fs.readSync(fd, tailBuf, 0, 16, stat.size - 16)
console.log(
' Content:',
headBuf.toString('utf8') +
' ...(truncated)... ' +
tailBuf.toString('utf8')
)
}
} catch (e) {
// Binary content - show head and tail
console.log('Content (binary):')
console.log(` Length: ${stat.size} bytes`)
if (stat.size <= 32) {
// Small file - read it all
const buf = Buffer.alloc(stat.size)
fs.readSync(fd, buf, 0, stat.size, 0)
const hexBytes = buf.toString('hex').match(/../g).join(' ')
console.log(' Bytes:', hexBytes)
} else {
// Read tail for large files
fs.readSync(fd, headBuf, 0, 16, 0)
fs.readSync(fd, tailBuf, 0, 16, stat.size - 16)
const headHex = headBuf.toString('hex').match(/../g).join(' ')
const tailHex = tailBuf.toString('hex').match(/../g).join(' ')
console.log(' Bytes:', headHex + ' ... ' + tailHex)
}
console.log(' Git-style SHA1:', gitHash)
if (gitHash !== blobHash) {
        console.log(' Warning: Git hash differs from blob hash!')
console.log(' Blob hash:', blobHash)
}
}
} finally {
fs.closeSync(fd)
}
}
async function withTempDir(prefix, fn, options = {}) {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), prefix))
try {
return await Promise.resolve(fn(tmpDir))
} finally {
if (!options.keep) {
fs.rmSync(tmpDir, { recursive: true, force: true })
} else {
console.log('Keeping temporary file:', path.join(tmpDir, 'blob'))
}
}
}
async function displayBlob(historyId, blobHash, options) {
try {
const { stream, metadata, source } = await (options.remote
? fetchBlobRemote(historyId, blobHash)
: fetchBlobLocal(historyId, blobHash))
await withTempDir(
'blob-show-',
async tmpDir => {
const tmpPath = path.join(tmpDir, 'blob')
await pipeline(stream, fs.createWriteStream(tmpPath))
await displayBlobContent(tmpPath, metadata, source, blobHash)
},
{ keep: options.keep }
)
} catch (err) {
if (err.code === 'NoSuchKey') {
throw new Error(`Blob ${blobHash} not found in backup`)
}
throw err
}
}
async function main() {
const { historyId, version, blob, remote, keep } =
commandLineArgs(optionDefinitions)
if (!historyId) {
console.error('Error: --historyId is required.')
process.exit(1)
}
await loadGlobalBlobs()
if (version != null) {
await displayChunk(historyId, version, { remote })
} else if (blob != null) {
await displayBlob(historyId, blob, { remote, keep })
} else {
await listChunks(historyId)
}
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exit(1)
})
.finally(() => {
knex.destroy().catch(err => console.error('Error closing Postgres:', err))
client.close().catch(err => console.error('Error closing MongoDB:', err))
redis
.disconnect()
.catch(err => console.error('Error disconnecting Redis:', err))
})

View File

@@ -0,0 +1,153 @@
// @ts-check
import { ObjectId } from 'mongodb'
import knex from '../lib/knex.js'
import {
batchedUpdate,
objectIdFromInput,
READ_PREFERENCE_SECONDARY,
} from '@overleaf/mongo-utils/batchedUpdate.js'
import {
GLOBAL_BLOBS,
loadGlobalBlobs,
makeProjectKey,
} from '../lib/blob_store/index.js'
import {
backedUpBlobs as backedUpBlobsCollection,
db,
client,
} from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import commandLineArgs from 'command-line-args'
import fs from 'node:fs'
const projectsCollection = db.collection('projects')
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
function parseArgs() {
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
const args = commandLineArgs([
{
name: 'BATCH_RANGE_START',
type: String,
defaultValue: PUBLIC_LAUNCH_DATE.toISOString(),
},
{
name: 'BATCH_RANGE_END',
type: String,
defaultValue: new Date().toISOString(),
},
{
name: 'output',
type: String,
alias: 'o',
},
])
const BATCH_RANGE_START = objectIdFromInput(
args['BATCH_RANGE_START']
).toString()
const BATCH_RANGE_END = objectIdFromInput(args['BATCH_RANGE_END']).toString()
if (!args['output']) {
throw new Error('missing --output')
}
const OUTPUT_STREAM = fs.createWriteStream(args['output'])
return {
BATCH_RANGE_START,
BATCH_RANGE_END,
OUTPUT_STREAM,
}
}
const { BATCH_RANGE_START, BATCH_RANGE_END, OUTPUT_STREAM } = parseArgs()
// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
}
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated, draining queue')
}
async function processBatch(batch) {
if (gracefulShutdownInitiated) {
throw new Error('graceful shutdown: aborting batch processing')
}
const N = batch.length
const firstId = batch[0]._id
const lastId = batch[N - 1]._id
const projectCursor = await projectsCollection.find(
{ _id: { $gte: firstId, $lte: lastId } },
{
projection: { _id: 1, 'overleaf.history.id': 1, lastUpdated: 1 },
readPreference: READ_PREFERENCE_SECONDARY,
}
)
const projectMap = new Map()
for await (const project of projectCursor) {
projectMap.set(project._id.toString(), project)
}
for (const project of batch) {
const projectId = project._id.toString()
const projectRecord = projectMap.get(projectId)
if (!projectRecord) {
console.error(`project not found: ${projectId}`)
continue
}
if (!projectRecord.overleaf?.history?.id) {
console.error(`project missing history: ${projectId}`)
continue
}
const historyId = projectRecord.overleaf.history.id.toString()
const prefix = `${projectId},${projectRecord.lastUpdated.toISOString()},`
const hashes = project.blobs.map(blob => blob.toString('hex'))
const projectBlobHashes = hashes.filter(hash => !GLOBAL_BLOBS.has(hash))
if (projectBlobHashes.length < hashes.length) {
console.warn(
`project ${projectId} has ${hashes.length - projectBlobHashes.length} global blobs`
)
}
const rows = projectBlobHashes.map(
hash => prefix + makeProjectKey(historyId, hash) + '\n'
)
OUTPUT_STREAM.write(rows.join(''))
}
}
async function main() {
await loadGlobalBlobs()
OUTPUT_STREAM.write('projectId,lastUpdated,path\n')
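  // Each data row is (illustrative):
  //   <projectId>,<lastUpdated ISO timestamp>,<projectKey.format(historyId)>/<hash[0:2]>/<hash[2:]>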
await batchedUpdate(
backedUpBlobsCollection,
{},
processBatch,
{},
{},
{ BATCH_RANGE_START, BATCH_RANGE_END }
)
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exitCode = 1
})
.finally(() => {
knex.destroy().catch(err => {
console.error('Error closing Postgres connection:', err)
})
client.close().catch(err => console.error('Error closing MongoDB:', err))
redis.disconnect().catch(err => {
console.error('Error disconnecting Redis:', err)
})
})

View File

@@ -0,0 +1,21 @@
import logger from '@overleaf/logger'
import commandLineArgs from 'command-line-args'
import { verifyBlobs } from '../lib/backupVerifier.mjs'
const { historyId, hashes } = commandLineArgs([
{ name: 'historyId', type: String },
{ name: 'hashes', type: String, multiple: true, defaultOption: true },
])
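// Expected invocation (illustrative): --historyId <id> plus one or more blob
// hashes as positional arguments (hashes is the default option).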
if (!historyId) {
  throw new Error('missing --historyId flag')
}
if (!hashes || hashes.length === 0) {
  throw new Error('missing --hashes flag')
}
try {
await verifyBlobs(historyId, hashes)
console.log('OK')
process.exit(0)
} catch (err) {
logger.err({ err }, 'failed to verify blob')
process.exit(1)
}

View File

@@ -0,0 +1,177 @@
import fs from 'node:fs'
import { makeProjectKey } from '../lib/blob_store/index.js'
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import commandLineArgs from 'command-line-args'
import OError from '@overleaf/o-error'
import assert from '../lib/assert.js'
import { client, projects } from '../lib/mongodb.js'
import { ObjectId } from 'mongodb'
import { setTimeout } from 'node:timers/promises'
const { input, verbose } = commandLineArgs([
{ name: 'input', type: String },
{ name: 'verbose', type: Boolean, defaultValue: false },
])
function parseCSVRow(row) {
const [path] = row.split(',')
const pathSegments = path.split('/')
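  // projectKey.format() appears to store the zero-padded history id reversed
  // across the first three path segments; reversing their concatenation
  // recovers the numeric history id.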
const historyId = `${pathSegments[0]}${pathSegments[1]}${pathSegments[2]}`
.split('')
.reverse()
.join('')
return { historyId, path, hash: `${pathSegments[3]}${pathSegments[4]}` }
}
async function* readCSV(path) {
let fh
try {
fh = await fs.promises.open(path, 'r')
} catch (error) {
console.error(`Could not open file: ${error}`)
throw error
}
for await (const line of fh.readLines()) {
try {
const row = parseCSVRow(line)
yield row
} catch (error) {
console.error(error instanceof Error ? error.message : error)
console.log(`Skipping invalid row: ${line}`)
}
}
}
class MissingDEKError extends OError {}
class InvalidHistoryIdError extends OError {}
class MissingProjectError extends OError {}
class MissingBlobError extends OError {}
async function getProjectPersistor(historyId) {
try {
return await backupPersistor.forProjectRO(
projectBlobsBucket,
makeProjectKey(historyId, '')
)
} catch (err) {
if (err instanceof NotFoundError) {
throw new MissingDEKError('dek does not exist', { historyId }, err)
}
throw err
}
}
async function checkBlobExists(path, historyId) {
const persistor = await getProjectPersistor(historyId)
return await persistor.getObjectSize(projectBlobsBucket, path)
}
let total = 0
const errors = {
invalidProjectId: 0,
notBackedUpProjectId: 0,
missingBlob: 0,
notInMongo: 0,
unknown: 0,
}
const notInMongoProjectIds = new Set()
const notBackedUpProjectIds = new Set()
let stopping = false
process.on('SIGTERM', () => {
console.log('SIGTERM received')
stopping = true
})
process.on('SIGINT', () => {
console.log('SIGINT received')
stopping = true
})
/**
*
* @param {string} historyId
* @param {string} path
* @param {string} hash
* @return {Promise<void>}
*/
async function checkPath(historyId, path, hash) {
try {
assert.mongoId(historyId)
} catch (error) {
    throw new InvalidHistoryIdError('invalid history id', { historyId })
}
if (notInMongoProjectIds.has(historyId)) {
throw new MissingProjectError('project not in mongo', { historyId })
}
if (notBackedUpProjectIds.has(historyId)) {
throw new MissingDEKError('project not backed up', { historyId })
}
const project = await projects.findOne({ _id: new ObjectId(historyId) })
if (!project) {
notInMongoProjectIds.add(historyId)
throw new MissingProjectError('project not in mongo', { historyId })
}
try {
await checkBlobExists(path, historyId)
} catch (error) {
if (error instanceof NotFoundError) {
throw new MissingBlobError('missing blob', { historyId, hash })
}
if (error instanceof MissingDEKError) {
notBackedUpProjectIds.add(historyId)
}
throw error
}
}
for await (const line of readCSV(input)) {
if (stopping) break
total++
if (total % 10_000 === 0) {
console.log(`checked ${total}`)
}
const { historyId, path, hash } = line
try {
await checkPath(historyId, path, hash)
if (verbose) {
console.log(`✓ Project ${historyId} has ${hash} backed up`)
}
} catch (error) {
if (error instanceof InvalidHistoryIdError) {
errors.invalidProjectId++
console.warn(`invalid historyId ${historyId}`)
continue
} else if (error instanceof MissingProjectError) {
errors.notInMongo++
console.warn(`✗ project ${historyId} not in mongo`)
continue
} else if (error instanceof MissingDEKError) {
errors.notBackedUpProjectId++
console.error(`✗ Project DEK ${historyId} not found`)
continue
} else if (error instanceof MissingBlobError) {
errors.missingBlob++
console.error(`✗ missing blob ${hash} from project ${historyId}`)
continue
}
errors.unknown++
console.error(error)
}
}
console.log(`total checked: ${total}`)
console.log(`invalid project id: ${errors.invalidProjectId}`)
console.log(`not found in mongo: ${errors.notInMongo}`)
console.log(`missing blob: ${errors.missingBlob}`)
console.log(`project not backed up: ${errors.notBackedUpProjectId}`)
console.log(`unknown errors: ${errors.unknown}`)
await client.close()
await setTimeout(100)
process.exit()

View File

@@ -0,0 +1,35 @@
import commandLineArgs from 'command-line-args'
import { verifyProjectWithErrorContext } from '../lib/backupVerifier.mjs'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import { setTimeout } from 'node:timers/promises'
import { loadGlobalBlobs } from '../lib/blob_store/index.js'
const { historyId } = commandLineArgs([{ name: 'historyId', type: String }])
async function gracefulShutdown(code = process.exitCode) {
await knex.destroy()
await client.close()
await redis.disconnect()
await setTimeout(1_000)
process.exit(code)
}
if (!historyId) {
console.error('missing --historyId')
process.exitCode = 1
await gracefulShutdown()
}
await loadGlobalBlobs()
try {
await verifyProjectWithErrorContext(historyId)
console.log('OK')
} catch (error) {
console.error('error verifying', error)
process.exitCode = 1
} finally {
await gracefulShutdown()
}

View File

@@ -0,0 +1,217 @@
// @ts-check
import commandLineArgs from 'command-line-args'
import {
setWriteMetrics,
verifyProjectsCreatedInDateRange,
verifyRandomProjectSample,
verifyProjectsUpdatedInDateRange,
} from '../../backupVerifier/ProjectVerifier.mjs'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import { setTimeout } from 'node:timers/promises'
import logger from '@overleaf/logger'
import { loadGlobalBlobs } from '../lib/blob_store/index.js'
import { getDatesBeforeRPO } from '../../backupVerifier/utils.mjs'
import { EventEmitter } from 'node:events'
import { mongodb } from '../index.js'
import redis from '../lib/redis.js'
logger.logger.level('fatal')
const usageMessage = [
  'Usage: node verify_sampled_projects.mjs [--startDate <start>] [--endDate <end>] [--nProjects <n>] [--verbose] [--usage] [--writeMetrics] [--concurrency <n>] [--strategy <range|random|recent>]',
  'strategy: defaults to "range"; startDate and endDate are required for the "range" strategy',
].join('\n')
/**
* Gracefully shutdown the process
* @param code
* @return {Promise<void>}
*/
async function gracefulShutdown(code = process.exitCode) {
await knex.destroy()
await client.close()
await redis.disconnect()
await setTimeout(1_000)
process.exit(code)
}
const STATS = {
verifiable: 0,
unverifiable: 0,
}
/**
* @typedef {Object} CLIOptions
* @property {(signal: EventEmitter) => Promise<VerificationJobStatus>} projectVerifier
* @property {boolean} verbose
*/
/**
* @typedef {import('../../backupVerifier/types.d.ts').VerificationJobStatus} VerificationJobStatus
*/
/**
*
* @return {CLIOptions}
*/
function getOptions() {
const {
startDate,
endDate,
concurrency,
writeMetrics,
verbose,
nProjects,
strategy,
usage,
} = commandLineArgs([
{ name: 'startDate', type: String },
{ name: 'endDate', type: String },
{ name: 'concurrency', type: Number, defaultValue: 1 },
{ name: 'verbose', type: Boolean, defaultValue: false },
{ name: 'nProjects', type: Number, defaultValue: 10 },
{ name: 'usage', type: Boolean, defaultValue: false },
{ name: 'writeMetrics', type: Boolean, defaultValue: false },
{ name: 'strategy', type: String, defaultValue: 'range' },
])
if (usage) {
console.log(usageMessage)
process.exit(0)
}
if (!['range', 'random', 'recent'].includes(strategy)) {
throw new Error(`Invalid strategy: ${strategy}`)
}
setWriteMetrics(writeMetrics)
switch (strategy) {
case 'random':
console.log('Verifying random projects')
return {
verbose,
projectVerifier: signal => verifyRandomProjectSample(nProjects, signal),
}
case 'recent':
return {
verbose,
projectVerifier: async signal => {
const { startDate, endDate } = getDatesBeforeRPO(3 * 3600)
return await verifyProjectsUpdatedInDateRange(
startDate,
endDate,
nProjects,
signal
)
},
}
case 'range':
default: {
if (!startDate || !endDate) {
throw new Error(usageMessage)
}
const start = Date.parse(startDate)
const end = Date.parse(endDate)
if (Number.isNaN(start)) {
throw new Error(`Invalid start date: ${startDate}`)
}
if (Number.isNaN(end)) {
throw new Error(`Invalid end date: ${endDate}`)
}
if (verbose) {
console.log(`Verifying from ${startDate} to ${endDate}`)
console.log(`Concurrency: ${concurrency}`)
}
STATS.ranges = 0
return {
projectVerifier: signal =>
verifyProjectsCreatedInDateRange({
startDate: new Date(start),
endDate: new Date(end),
projectsPerRange: nProjects,
concurrency,
signal,
}),
verbose,
}
}
}
}
/**
* @type {CLIOptions}
*/
let options
try {
options = getOptions()
} catch (error) {
console.error(error)
process.exitCode = 1
await gracefulShutdown(1)
process.exit() // just here so the type checker knows that the process will exit
}
const { projectVerifier, verbose } = options
if (verbose) {
logger.logger.level('debug')
}
/**
*
* @param {Array<string>} array
* @param {string} matchString
* @return {*}
*/
function sumStringInstances(array, matchString) {
return array.reduce((total, string) => {
return string === matchString ? total + 1 : total
}, 0)
}
/**
*
* @param {VerificationJobStatus} stats
*/
function displayStats(stats) {
console.log(`Verified projects: ${stats.verified}`)
console.log(`Total projects sampled: ${stats.total}`)
if (stats.errorTypes.length > 0) {
console.log('Errors:')
for (const error of new Set(stats.errorTypes)) {
console.log(`${error}: ${sumStringInstances(stats.errorTypes, error)}`)
}
}
}
const shutdownEmitter = new EventEmitter()
shutdownEmitter.on('shutdown', async () => {
await gracefulShutdown()
})
process.on('SIGTERM', () => {
shutdownEmitter.emit('shutdown')
})
process.on('SIGINT', () => {
shutdownEmitter.emit('shutdown')
})
await loadGlobalBlobs()
try {
const stats = await projectVerifier(shutdownEmitter)
displayStats(stats)
console.log(`completed`)
} catch (error) {
console.error(error)
console.log('completed with errors')
process.exitCode = 1
} finally {
console.log('shutting down')
await gracefulShutdown()
}