// @ts-check
import logger from '@overleaf/logger'
import commandLineArgs from 'command-line-args'
import { Chunk, History, Snapshot } from 'overleaf-editor-core'
import {
getProjectChunks,
loadLatestRaw,
create,
} from '../lib/chunk_store/index.js'
import { client } from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import knex from '../lib/knex.js'
import { historyStore } from '../lib/history_store.js'
import pLimit from 'p-limit'
import {
GLOBAL_BLOBS,
loadGlobalBlobs,
makeProjectKey,
BlobStore,
} from '../lib/blob_store/index.js'
import {
listPendingBackups,
getBackupStatus,
setBackupVersion,
updateCurrentMetadataIfNotSet,
updatePendingChangeTimestamp,
getBackedUpBlobHashes,
unsetBackedUpBlobHashes,
} from '../lib/backup_store/index.js'
import { backupBlob, downloadBlobToDir } from '../lib/backupBlob.mjs'
import {
backupPersistor,
chunksBucket,
projectBlobsBucket,
} from '../lib/backupPersistor.mjs'
import { backupGenerator } from '../lib/backupGenerator.mjs'
import { promises as fs, createWriteStream } from 'node:fs'
import os from 'node:os'
import path from 'node:path'
import projectKey from '../lib/project_key.js'
import Crypto from 'node:crypto'
import Stream from 'node:stream'
import { EventEmitter } from 'node:events'
import {
objectIdFromInput,
batchedUpdate,
READ_PREFERENCE_SECONDARY,
} from '@overleaf/mongo-utils/batchedUpdate.js'
import { createGunzip } from 'node:zlib'
import { text } from 'node:stream/consumers'
import { fromStream as blobHashFromStream } from '../lib/blob_hash.js'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
// Create a singleton promise that loads global blobs once
let globalBlobsPromise = null
function ensureGlobalBlobsLoaded() {
if (!globalBlobsPromise) {
globalBlobsPromise = loadGlobalBlobs()
}
return globalBlobsPromise
}
EventEmitter.defaultMaxListeners = 20
logger.initialize('history-v1-backup')
// Settings shared between command-line and module usage
let DRY_RUN = false
let RETRY_LIMIT = 3
const RETRY_DELAY = 1000
let CONCURRENCY = 4
let BATCH_CONCURRENCY = 1
let BLOB_LIMITER = pLimit(CONCURRENCY)
let USE_SECONDARY = false
/**
* Configure backup settings
* @param {Object} options Backup configuration options
*/
export function configureBackup(options = {}) {
DRY_RUN = options.dryRun || false
RETRY_LIMIT = options.retries || 3
CONCURRENCY = options.concurrency || 1
BATCH_CONCURRENCY = options.batchConcurrency || 1
BLOB_LIMITER = pLimit(CONCURRENCY)
USE_SECONDARY = options.useSecondary || false
}
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
logger.info({}, 'graceful shutdown initiated, draining queue')
}
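/**
* Run an async function, retrying on failure up to a fixed number of attempts.
* @param {Function} fn async function to call
* @param {number} times maximum number of attempts
* @param {number} delayMs delay between attempts in milliseconds
* @returns {Promise<any>} the result of the first successful call
*/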
async function retry(fn, times, delayMs) {
let attempts = times
while (attempts > 0) {
try {
const result = await fn()
return result
} catch (err) {
attempts--
if (attempts === 0) throw err
await new Promise(resolve => setTimeout(resolve, delayMs))
}
}
}
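/**
* Wrap an async function so that each invocation is retried with the given
* retry count and delay.
*/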
function wrapWithRetry(fn, retries, delayMs) {
return async (...args) => {
const result = await retry(() => fn(...args), retries, delayMs)
return result
}
}
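// Note: these wrappers capture RETRY_LIMIT at module load time, so retry
// settings changed later via configureBackup() or handleOptions() do not
// apply to them.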
const downloadWithRetry = wrapWithRetry(
downloadBlobToDir,
RETRY_LIMIT,
RETRY_DELAY
)
// FIXME: this creates a new backupPersistor for each blob
// so there is no caching of the DEK
const backupWithRetry = wrapWithRetry(backupBlob, RETRY_LIMIT, RETRY_DELAY)
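/**
* Filter out blobs that are already backed up for this project or that are
* non-demoted global blobs, returning only the blobs that still need backup.
* @param {string} projectId
* @param {Array} blobs blobs referenced by the chunk being backed up
* @returns {Promise<Array>} blobs that need to be backed up
*/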
async function findNewBlobs(projectId, blobs) {
const newBlobs = []
const existingBackedUpBlobHashes = await getBackedUpBlobHashes(projectId)
for (const blob of blobs) {
const hash = blob.getHash()
if (existingBackedUpBlobHashes.has(hash)) {
logger.debug({ projectId, hash }, 'Blob is already backed up, skipping')
continue
}
const globalBlob = GLOBAL_BLOBS.get(hash)
if (globalBlob && !globalBlob.demoted) {
logger.debug(
{ projectId, hash },
'Blob is a global blob and not demoted, skipping'
)
continue
}
newBlobs.push(blob)
}
return newBlobs
}
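/**
* Remove the hashes of the given blobs from the project's backed-up blob
* record. Called after the chunk referencing them has been backed up.
*/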
async function cleanBackedUpBlobs(projectId, blobs) {
const hashes = blobs.map(blob => blob.getHash())
if (DRY_RUN) {
console.log(
'Would remove blobs',
hashes.join(' '),
'from project',
projectId
)
return
}
await unsetBackedUpBlobHashes(projectId, hashes)
}
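/**
* Download a single blob to the temporary directory and upload it to the
* project's backup, unless this is a dry run.
*/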
async function backupSingleBlob(projectId, historyId, blob, tmpDir, persistor) {
if (DRY_RUN) {
console.log(
'Would back up blob',
JSON.stringify(blob),
'in history',
historyId,
'for project',
projectId
)
return
}
logger.debug({ blob, historyId }, 'backing up blob')
const blobPath = await downloadWithRetry(historyId, blob, tmpDir)
await backupWithRetry(historyId, blob, blobPath, persistor)
}
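/**
* Back up a list of blobs concurrently via the limiter, using a temporary
* directory that is always removed afterwards. Rejects if any blob fails.
*/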
async function backupBlobs(projectId, historyId, blobs, limiter, persistor) {
let tmpDir
try {
tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'blob-backup-'))
const blobBackupOperations = blobs.map(blob =>
limiter(backupSingleBlob, projectId, historyId, blob, tmpDir, persistor)
)
// Reject if any blob backup fails
await Promise.all(blobBackupOperations)
} finally {
if (tmpDir) {
await fs.rm(tmpDir, { recursive: true, force: true })
}
}
}
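/**
* Upload the original compressed chunk to the chunks bucket, keyed by its
* start version and accompanied by an md5 checksum of the gzipped content.
*/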
async function backupChunk(
projectId,
historyId,
chunkBackupPersistorForProject,
chunkToBackup,
chunkRecord,
chunkBuffer
) {
if (DRY_RUN) {
console.log(
'Would back up chunk',
JSON.stringify(chunkRecord),
'in history',
historyId,
'for project',
projectId,
'key',
makeChunkKey(historyId, chunkToBackup.startVersion)
)
return
}
const key = makeChunkKey(historyId, chunkToBackup.startVersion)
logger.debug({ chunkRecord, historyId, projectId, key }, 'backing up chunk')
const md5 = Crypto.createHash('md5').update(chunkBuffer)
await chunkBackupPersistorForProject.sendStream(
chunksBucket,
key,
Stream.Readable.from([chunkBuffer]),
{
contentType: 'application/json',
contentEncoding: 'gzip',
contentLength: chunkBuffer.byteLength,
sourceMd5: md5.digest('hex'),
}
)
}
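/**
* Record the chunk's end version as the new backup version in mongo, together
* with the time at which this backup run started.
*/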
async function updateBackupStatus(
projectId,
lastBackedUpVersion,
chunkRecord,
startOfBackupTime
) {
if (DRY_RUN) {
console.log(
'Would set backup version to',
chunkRecord.endVersion,
'with lastBackedUpTimestamp',
startOfBackupTime
)
return
}
logger.debug(
{ projectId, chunkRecord, startOfBackupTime },
'setting backupVersion and lastBackedUpTimestamp'
)
await setBackupVersion(
projectId,
lastBackedUpVersion,
chunkRecord.endVersion,
startOfBackupTime
)
}
// Define command-line options
const optionDefinitions = [
{
name: 'projectId',
alias: 'p',
type: String,
description: 'The ID of the project to back up',
defaultOption: true,
},
{
name: 'help',
alias: 'h',
type: Boolean,
description: 'Display this usage guide.',
},
{
name: 'status',
alias: 's',
type: Boolean,
description: 'Display project status.',
},
{
name: 'list',
alias: 'l',
type: Boolean,
description: 'List projects that need to be backed up',
},
{
name: 'dry-run',
alias: 'n',
type: Boolean,
description: 'Perform a dry run without making any changes.',
},
{
name: 'retries',
alias: 'r',
type: Number,
description: 'Number of retries, default is 3.',
},
{
name: 'concurrency',
alias: 'c',
type: Number,
description: 'Number of concurrent blob downloads (default: 1)',
},
{
name: 'batch-concurrency',
alias: 'b',
type: Number,
description: 'Number of concurrent project operations (default: 1)',
},
{
name: 'pending',
alias: 'P',
type: Boolean,
description: 'Back up all pending projects.',
},
{
name: 'interval',
alias: 'i',
type: Number,
description: 'Time interval in seconds for pending backups (default: 3600)',
defaultValue: 3600,
},
{
name: 'fix',
type: Number,
description: 'Fix up to the given number of projects without chunks',
},
{
name: 'init',
alias: 'I',
type: Boolean,
description: 'Initialize backups for all projects.',
},
{ name: 'output', alias: 'o', type: String, description: 'Output file' },
{
name: 'start-date',
type: String,
description: 'Start date for initialization (ISO format)',
},
{
name: 'end-date',
type: String,
description: 'End date for initialization (ISO format)',
},
{
name: 'use-secondary',
type: Boolean,
description: 'Use secondary read preference for backup status',
},
{
name: 'compare',
alias: 'C',
type: Boolean,
description:
'Compare backup with original chunks. With --start-date and --end-date compares all projects in range.',
},
]
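/**
* Parse and validate command-line options and update the module settings.
* Example invocations (the script path is illustrative):
*   node backup.mjs <projectId>           back up a single project
*   node backup.mjs --pending             back up all pending projects
*   node backup.mjs --compare --start-date 2025-01-01 --end-date 2025-02-01
*/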
function handleOptions() {
const options = commandLineArgs(optionDefinitions)
if (options.help) {
console.log('Usage:')
optionDefinitions.forEach(option => {
const aliasText = option.alias ? `, -${option.alias}` : ''
console.log(` --${option.name}${aliasText}: ${option.description}`)
})
process.exit(0)
}
const projectIdRequired =
!options.list &&
!options.pending &&
!options.init &&
!(options.fix >= 0) &&
!(options.compare && options['start-date'] && options['end-date'])
if (projectIdRequired && !options.projectId) {
console.error('Error: projectId is required')
process.exit(1)
}
if (options.pending && options.projectId) {
console.error('Error: --pending cannot be specified with projectId')
process.exit(1)
}
if (options.pending && (options.list || options.status)) {
console.error('Error: --pending is exclusive with --list and --status')
process.exit(1)
}
if (options.init && options.pending) {
console.error('Error: --init cannot be specified with --pending')
process.exit(1)
}
if (
(options['start-date'] || options['end-date']) &&
!options.init &&
!options.compare
) {
console.error(
'Error: date options can only be used with --init or --compare'
)
process.exit(1)
}
if (options['use-secondary']) {
USE_SECONDARY = true
}
if (
options.compare &&
!options.projectId &&
!(options['start-date'] && options['end-date'])
) {
console.error(
'Error: --compare requires either projectId or both --start-date and --end-date'
)
process.exit(1)
}
DRY_RUN = options['dry-run'] || false
RETRY_LIMIT = options.retries || 3
CONCURRENCY = options.concurrency || 1
BATCH_CONCURRENCY = options['batch-concurrency'] || 1
BLOB_LIMITER = pLimit(CONCURRENCY)
return options
}
async function displayBackupStatus(projectId) {
const result = await analyseBackupStatus(projectId)
console.log('Backup status:', JSON.stringify(result))
}
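/**
* Load the project's backup status and the latest chunk metadata and work out
* whether the backup is up to date and from which version it needs to resume.
*/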
async function analyseBackupStatus(projectId) {
const { backupStatus, historyId, currentEndVersion, currentEndTimestamp } =
await getBackupStatus(projectId)
// TODO: when we have confidence that the latestChunkMetadata always matches
// the values from the backupStatus we can skip loading it here
const latestChunkMetadata = await loadLatestRaw(historyId, {
readOnly: Boolean(USE_SECONDARY),
})
if (
currentEndVersion &&
currentEndVersion !== latestChunkMetadata.endVersion
) {
// compare the current end version with the latest chunk metadata to check
// that the updates to the project collection are reliable. Some mismatches
// are expected due to the time window between getBackupStatus and
// loadLatestRaw while the project is being actively edited.
logger.warn(
{
projectId,
historyId,
currentEndVersion,
currentEndTimestamp,
latestChunkMetadata,
},
'currentEndVersion does not match latest chunk metadata'
)
}
if (DRY_RUN) {
console.log('Project:', projectId)
console.log('History ID:', historyId)
console.log('Latest Chunk Metadata:', JSON.stringify(latestChunkMetadata))
console.log('Current end version:', currentEndVersion)
console.log('Current end timestamp:', currentEndTimestamp)
console.log('Backup status:', backupStatus ?? 'none')
}
if (!backupStatus) {
if (DRY_RUN) {
console.log('No backup status found - doing full backup')
}
}
const lastBackedUpVersion = backupStatus?.lastBackedUpVersion
const endVersion = latestChunkMetadata.endVersion
if (endVersion >= 0 && endVersion === lastBackedUpVersion) {
if (DRY_RUN) {
console.log(
'Project is up to date, last backed up at version',
lastBackedUpVersion
)
}
} else if (endVersion < lastBackedUpVersion) {
throw new Error('backup is ahead of project')
} else {
if (DRY_RUN) {
console.log(
'Project needs to be backed up from',
lastBackedUpVersion,
'to',
endVersion
)
}
}
return {
historyId,
lastBackedUpVersion,
currentVersion: latestChunkMetadata.endVersion || 0,
upToDate: endVersion >= 0 && lastBackedUpVersion === endVersion,
pendingChangeAt: backupStatus?.pendingChangeAt,
currentEndVersion,
currentEndTimestamp,
latestChunkMetadata,
}
}
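/**
* Print the id, backup status and history status of each project with a
* pending backup.
*/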
async function displayPendingBackups(options) {
const intervalMs = options.interval * 1000
for await (const project of listPendingBackups(intervalMs)) {
console.log(
'Project:',
project._id.toHexString(),
'backup status:',
JSON.stringify(project.overleaf.backup),
'history status:',
JSON.stringify(project.overleaf.history, [
'currentEndVersion',
'currentEndTimestamp',
])
)
}
}
function makeChunkKey(projectId, startVersion) {
return path.join(projectKey.format(projectId), projectKey.pad(startVersion))
}
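/**
* Back up a single project: back up any new blobs referenced by each chunk,
* then the chunk itself, updating the backup status in mongo after each chunk
* and finally clearing or advancing the pending change timestamp.
* @param {string} projectId
* @param {Object} options
*/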
export async function backupProject(projectId, options) {
if (gracefulShutdownInitiated) {
return
}
await ensureGlobalBlobsLoaded()
// FIXME: flush the project first!
// Let's assume the flush happens externally and triggers this backup
const backupStartTime = new Date()
// find the last backed up version
const {
historyId,
lastBackedUpVersion,
currentVersion,
upToDate,
pendingChangeAt,
currentEndVersion,
latestChunkMetadata,
} = await analyseBackupStatus(projectId)
if (upToDate) {
logger.debug(
{
projectId,
historyId,
lastBackedUpVersion,
currentVersion,
pendingChangeAt,
},
'backup is up to date'
)
if (
currentEndVersion === undefined &&
latestChunkMetadata.endVersion >= 0
) {
if (DRY_RUN) {
console.log('Would update current metadata to', latestChunkMetadata)
} else {
await updateCurrentMetadataIfNotSet(projectId, latestChunkMetadata)
}
}
// clear the pending changes timestamp if the backup is complete
if (pendingChangeAt) {
if (DRY_RUN) {
console.log(
'Would update or clear pending changes timestamp',
backupStartTime
)
} else {
await updatePendingChangeTimestamp(projectId, backupStartTime)
}
}
return
}
logger.debug(
{
projectId,
historyId,
lastBackedUpVersion,
currentVersion,
pendingChangeAt,
},
'backing up project'
)
// this persistor works for both the chunks and blobs buckets,
// because they use the same DEK
const backupPersistorForProject = await backupPersistor.forProject(
chunksBucket,
makeProjectKey(historyId, '')
)
let previousBackedUpVersion = lastBackedUpVersion
const backupVersions = [previousBackedUpVersion]
for await (const {
blobsToBackup,
chunkToBackup,
chunkRecord,
chunkBuffer,
} of backupGenerator(historyId, lastBackedUpVersion)) {
// backup the blobs first
// this can be done in parallel but must fail if any blob cannot be backed up
// if the blob already exists in the backup then that is allowed
const newBlobs = await findNewBlobs(projectId, blobsToBackup)
await backupBlobs(
projectId,
historyId,
newBlobs,
BLOB_LIMITER,
backupPersistorForProject
)
// then backup the original compressed chunk using the startVersion as the key
await backupChunk(
projectId,
historyId,
backupPersistorForProject,
chunkToBackup,
chunkRecord,
chunkBuffer
)
// persist the backup status in mongo for the current chunk
try {
await updateBackupStatus(
projectId,
previousBackedUpVersion,
chunkRecord,
backupStartTime
)
} catch (err) {
logger.error(
{ projectId, chunkRecord, err, backupVersions },
'error updating backup status'
)
throw err
}
previousBackedUpVersion = chunkRecord.endVersion
backupVersions.push(previousBackedUpVersion)
await cleanBackedUpBlobs(projectId, blobsToBackup)
}
// update the current end version and timestamp if they are not set
if (currentEndVersion === undefined && latestChunkMetadata.endVersion >= 0) {
if (DRY_RUN) {
console.log('Would update current metadata to', latestChunkMetadata)
} else {
await updateCurrentMetadataIfNotSet(projectId, latestChunkMetadata)
}
}
// clear the pending changes timestamp if the backup is complete, otherwise set it to the time
// when the backup started (to pick up the new changes on the next backup)
if (DRY_RUN) {
console.log(
'Would update or clear pending changes timestamp',
backupStartTime
)
} else {
await updatePendingChangeTimestamp(projectId, backupStartTime)
}
}
function convertToISODate(dateStr) {
// Expecting YYYY-MM-DD format
if (!/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) {
throw new Error('Date must be in YYYY-MM-DD format')
}
return new Date(dateStr + 'T00:00:00.000Z').toISOString()
}
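/**
* Create an initial empty chunk for projects that have a history id but no
* chunks, so they can be backed up. The number of projects processed is
* limited by the --fix option.
*/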
export async function fixProjectsWithoutChunks(options) {
const limit = options.fix || 1
const query = {
'overleaf.history.id': { $exists: true },
'overleaf.backup.lastBackedUpVersion': { $in: [null] },
}
const cursor = client
.db()
.collection('projects')
.find(query, {
projection: { _id: 1, 'overleaf.history.id': 1 },
readPreference: READ_PREFERENCE_SECONDARY,
})
.limit(limit)
for await (const project of cursor) {
const historyId = project.overleaf.history.id.toString()
const chunks = await getProjectChunks(historyId)
if (chunks.length > 0) {
continue
}
if (DRY_RUN) {
console.log(
'Would create new chunk for Project ID:',
project._id.toHexString(),
'History ID:',
historyId,
'Chunks:',
chunks
)
} else {
console.log(
'Creating new chunk for Project ID:',
project._id.toHexString(),
'History ID:',
historyId,
'Chunks:',
chunks
)
const snapshot = new Snapshot()
const history = new History(snapshot, [])
const chunk = new Chunk(history, 0)
await create(historyId, chunk)
const newChunks = await getProjectChunks(historyId)
console.log('New chunk:', newChunks)
}
}
}
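/**
* Back up all projects without a lastBackedUpVersion, optionally restricted to
* an _id range derived from --start-date/--end-date. With --output, only write
* the matching project ids to a file instead of backing them up.
*/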
export async function initializeProjects(options) {
await ensureGlobalBlobsLoaded()
let totalErrors = 0
let totalProjects = 0
const query = {
'overleaf.backup.lastBackedUpVersion': { $in: [null] },
}
if (options['start-date'] && options['end-date']) {
query._id = {
$gte: objectIdFromInput(convertToISODate(options['start-date'])),
$lt: objectIdFromInput(convertToISODate(options['end-date'])),
}
}
const cursor = client
.db()
.collection('projects')
.find(query, {
projection: { _id: 1 },
readPreference: READ_PREFERENCE_SECONDARY,
})
if (options.output) {
console.log("Writing project IDs to file: '" + options.output + "'")
const output = createWriteStream(options.output)
for await (const project of cursor) {
output.write(project._id.toHexString() + '\n')
totalProjects++
}
output.end()
console.log('Wrote ' + totalProjects + ' project IDs to file')
return
}
for await (const project of cursor) {
if (gracefulShutdownInitiated) {
console.warn('graceful shutdown: stopping project initialization')
break
}
totalProjects++
const projectId = project._id.toHexString()
try {
await backupProject(projectId, options)
} catch (err) {
totalErrors++
logger.error({ projectId, err }, 'error backing up project')
}
}
return { errors: totalErrors, projects: totalProjects }
}
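/**
* Back up every project returned by listPendingBackups for the configured
* interval, stopping early on graceful shutdown.
*/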
async function backupPendingProjects(options) {
const intervalMs = options.interval * 1000
for await (const project of listPendingBackups(intervalMs)) {
if (gracefulShutdownInitiated) {
console.warn('graceful shutdown: stopping pending project backups')
break
}
const projectId = project._id.toHexString()
console.log(`Backing up pending project with ID: ${projectId}`)
await backupProject(projectId, options)
}
}
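/**
* Compares blobs against the backup by re-hashing the backed-up content,
* caching computed hashes so each backed-up blob is only downloaded once.
*/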
class BlobComparator {
constructor(backupPersistorForProject) {
this.cache = new Map()
this.backupPersistorForProject = backupPersistorForProject
}
async compareBlob(historyId, blob) {
let computedHash = this.cache.get(blob.hash)
const fromCache = !!computedHash
if (!computedHash) {
const blobKey = makeProjectKey(historyId, blob.hash)
const backupBlobStream =
await this.backupPersistorForProject.getObjectStream(
projectBlobsBucket,
blobKey,
{ autoGunzip: true }
)
computedHash = await blobHashFromStream(blob.byteLength, backupBlobStream)
this.cache.set(blob.hash, computedHash)
}
const matches = computedHash === blob.hash
return {
matches,
computedHash,
fromCache,
}
}
}
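/**
* Compare every chunk and blob of a project with its backup, print a summary,
* and throw if any mismatch or missing object was found.
*/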
async function compareBackups(projectId, options) {
console.log(`Comparing backups for project ${projectId}`)
const { historyId } = await getBackupStatus(projectId)
const chunks = await getProjectChunks(historyId)
const blobStore = new BlobStore(historyId)
const backupPersistorForProject = await backupPersistor.forProject(
chunksBucket,
makeProjectKey(historyId, '')
)
let totalChunkMatches = 0
let totalChunkMismatches = 0
let totalChunksNotFound = 0
let totalBlobMatches = 0
let totalBlobMismatches = 0
let totalBlobsNotFound = 0
const errors = []
const blobComparator = new BlobComparator(backupPersistorForProject)
for (const chunk of chunks) {
try {
// Compare chunk content
const originalChunk = await historyStore.loadRaw(historyId, chunk.id)
const key = makeChunkKey(historyId, chunk.startVersion)
try {
const backupChunkStream =
await backupPersistorForProject.getObjectStream(chunksBucket, key)
const backupStr = await text(backupChunkStream.pipe(createGunzip()))
const originalStr = JSON.stringify(originalChunk)
const backupChunk = JSON.parse(backupStr)
const backupStartVersion = chunk.startVersion
const backupEndVersion = chunk.startVersion + backupChunk.changes.length
if (originalStr === backupStr) {
console.log(
`✓ Chunk ${chunk.id} (v${chunk.startVersion}-v${chunk.endVersion}) matches`
)
totalChunkMatches++
} else if (originalStr === JSON.stringify(backupChunk)) {
console.log(
`✓ Chunk ${chunk.id} (v${chunk.startVersion}-v${chunk.endVersion}) matches (after normalisation)`
)
totalChunkMatches++
} else if (backupEndVersion < chunk.endVersion) {
console.log(
`✗ Chunk ${chunk.id} is ahead of backup (v${chunk.startVersion}-v${chunk.endVersion} vs v${backupStartVersion}-v${backupEndVersion})`
)
totalChunkMismatches++
errors.push({ chunkId: chunk.id, error: 'Chunk ahead of backup' })
} else {
console.log(
`✗ Chunk ${chunk.id} (v${chunk.startVersion}-v${chunk.endVersion}) MISMATCH`
)
totalChunkMismatches++
errors.push({ chunkId: chunk.id, error: 'Chunk mismatch' })
}
} catch (err) {
if (err instanceof NotFoundError) {
console.log(`✗ Chunk ${chunk.id} not found in backup`, err.cause)
totalChunksNotFound++
errors.push({ chunkId: chunk.id, error: `Chunk not found` })
} else {
throw err
}
}
const history = History.fromRaw(originalChunk)
// Compare blobs in chunk
const blobHashes = new Set()
history.findBlobHashes(blobHashes)
const blobs = await blobStore.getBlobs(Array.from(blobHashes))
for (const blob of blobs) {
if (GLOBAL_BLOBS.has(blob.hash)) {
const globalBlob = GLOBAL_BLOBS.get(blob.hash)
console.log(
` ✓ Blob ${blob.hash} is a global blob`,
globalBlob.demoted ? '(demoted)' : ''
)
continue
}
try {
const { matches, computedHash, fromCache } =
await blobComparator.compareBlob(historyId, blob)
if (matches) {
console.log(
` ✓ Blob ${blob.hash} hash matches (${blob.byteLength} bytes)` +
(fromCache ? ' (from cache)' : '')
)
totalBlobMatches++
} else {
console.log(
` ✗ Blob ${blob.hash} hash mismatch (original: ${blob.hash}, backup: ${computedHash}) (${blob.byteLength} bytes, ${blob.stringLength} string length)` +
(fromCache ? ' (from cache)' : '')
)
totalBlobMismatches++
errors.push({
chunkId: chunk.id,
error: `Blob ${blob.hash} hash mismatch`,
})
}
} catch (err) {
if (err instanceof NotFoundError) {
console.log(` ✗ Blob ${blob.hash} not found in backup`, err.cause)
totalBlobsNotFound++
errors.push({
chunkId: chunk.id,
error: `Blob ${blob.hash} not found`,
})
} else {
throw err
}
}
}
} catch (err) {
console.error(`Error comparing chunk ${chunk.id}:`, err)
errors.push({ chunkId: chunk.id, error: err })
}
}
// Print summary
console.log('\nComparison Summary:')
console.log('==================')
console.log(`Total chunks: ${chunks.length}`)
console.log(`Chunk matches: ${totalChunkMatches}`)
console.log(`Chunk mismatches: ${totalChunkMismatches}`)
console.log(`Chunks not found: ${totalChunksNotFound}`)
console.log(`Blob matches: ${totalBlobMatches}`)
console.log(`Blob mismatches: ${totalBlobMismatches}`)
console.log(`Blobs not found: ${totalBlobsNotFound}`)
console.log(`Errors: ${errors.length}`)
if (errors.length > 0) {
console.log('\nErrors:')
errors.forEach(({ chunkId, error }) => {
console.log(` Chunk ${chunkId}: ${error}`)
})
throw new Error('Backup comparison FAILED')
} else {
console.log('Backup comparison successful')
}
}
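/**
* Run compareBackups over all backed-up projects in the given date range,
* processing mongo batches with limited concurrency.
*/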
async function compareAllProjects(options) {
const limiter = pLimit(BATCH_CONCURRENCY)
let totalErrors = 0
let totalProjects = 0
async function processBatch(batch) {
if (gracefulShutdownInitiated) {
throw new Error('graceful shutdown')
}
const batchOperations = batch.map(project =>
limiter(async () => {
const projectId = project._id.toHexString()
totalProjects++
try {
console.log(`\nComparing project ${projectId} (${totalProjects})`)
await compareBackups(projectId, options)
} catch (err) {
totalErrors++
console.error(`Failed to compare project ${projectId}:`, err)
}
})
)
await Promise.allSettled(batchOperations)
}
const query = {
'overleaf.history.id': { $exists: true },
'overleaf.backup.lastBackedUpVersion': { $exists: true },
}
await batchedUpdate(
client.db().collection('projects'),
query,
processBatch,
{
_id: 1,
'overleaf.history': 1,
'overleaf.backup': 1,
},
{ readPreference: 'secondary' },
{
BATCH_RANGE_START: convertToISODate(options['start-date']),
BATCH_RANGE_END: convertToISODate(options['end-date']),
}
)
console.log('\nComparison Summary:')
console.log('==================')
console.log(`Total projects processed: ${totalProjects}`)
console.log(`Projects with errors: ${totalErrors}`)
if (totalErrors > 0) {
throw new Error('Some project comparisons failed')
}
}
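/**
* Command-line entry point: parse the options and dispatch to the requested
* operation.
*/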
async function main() {
const options = handleOptions()
await ensureGlobalBlobsLoaded()
const projectId = options.projectId
if (options.status) {
await displayBackupStatus(projectId)
} else if (options.list) {
await displayPendingBackups(options)
} else if (options.fix !== undefined) {
await fixProjectsWithoutChunks(options)
} else if (options.pending) {
await backupPendingProjects(options)
} else if (options.init) {
await initializeProjects(options)
} else if (options.compare) {
if (options['start-date'] && options['end-date']) {
await compareAllProjects(options)
} else {
await compareBackups(projectId, options)
}
} else {
await backupProject(projectId, options)
}
}
// Only run command-line interface when script is run directly
if (import.meta.url === `file://${process.argv[1]}`) {
main()
.then(() => {
console.log(
gracefulShutdownInitiated ? 'Exited - graceful shutdown' : 'Completed'
)
})
.catch(err => {
console.error('Error backing up project:', err)
process.exit(1)
})
.finally(() => {
knex
.destroy()
.then(() => {
console.log('Postgres connection closed')
})
.catch(err => {
console.error('Error closing Postgres connection:', err)
})
client
.close()
.then(() => {
console.log('MongoDB connection closed')
})
.catch(err => {
console.error('Error closing MongoDB connection:', err)
})
redis
.disconnect()
.then(() => {
console.log('Redis connection closed')
})
.catch(err => {
console.error('Error closing Redis connection:', err)
})
})
}