// @ts-check

import logger from '@overleaf/logger'
import commandLineArgs from 'command-line-args'
import { Chunk, History, Snapshot } from 'overleaf-editor-core'
import {
  getProjectChunks,
  loadLatestRaw,
  create,
} from '../lib/chunk_store/index.js'
import { client } from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import knex from '../lib/knex.js'
import { historyStore } from '../lib/history_store.js'
import pLimit from 'p-limit'
import {
  GLOBAL_BLOBS,
  loadGlobalBlobs,
  makeProjectKey,
  BlobStore,
} from '../lib/blob_store/index.js'
import {
  listPendingBackups,
  getBackupStatus,
  setBackupVersion,
  updateCurrentMetadataIfNotSet,
  updatePendingChangeTimestamp,
  getBackedUpBlobHashes,
  unsetBackedUpBlobHashes,
} from '../lib/backup_store/index.js'
import { backupBlob, downloadBlobToDir } from '../lib/backupBlob.mjs'
import {
  backupPersistor,
  chunksBucket,
  projectBlobsBucket,
} from '../lib/backupPersistor.mjs'
import { backupGenerator } from '../lib/backupGenerator.mjs'
import { promises as fs, createWriteStream } from 'node:fs'
import os from 'node:os'
import path from 'node:path'
import projectKey from '../lib/project_key.js'
import Crypto from 'node:crypto'
import Stream from 'node:stream'
import { EventEmitter } from 'node:events'
import {
  objectIdFromInput,
  batchedUpdate,
  READ_PREFERENCE_SECONDARY,
} from '@overleaf/mongo-utils/batchedUpdate.js'
import { createGunzip } from 'node:zlib'
import { text } from 'node:stream/consumers'
import { fromStream as blobHashFromStream } from '../lib/blob_hash.js'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'

// Create a singleton promise that loads global blobs once
let globalBlobsPromise = null
function ensureGlobalBlobsLoaded() {
  if (!globalBlobsPromise) {
    globalBlobsPromise = loadGlobalBlobs()
  }
  return globalBlobsPromise
}

EventEmitter.defaultMaxListeners = 20

logger.initialize('history-v1-backup')

// Settings shared between command-line and module usage
let DRY_RUN = false
let RETRY_LIMIT = 3
const RETRY_DELAY = 1000
let CONCURRENCY = 4
let BATCH_CONCURRENCY = 1
let BLOB_LIMITER = pLimit(CONCURRENCY)
let USE_SECONDARY = false

/**
 * Configure backup settings
 * @param {Object} options Backup configuration options
 */
export function configureBackup(options = {}) {
  DRY_RUN = options.dryRun || false
  RETRY_LIMIT = options.retries || 3
  CONCURRENCY = options.concurrency || 1
  BATCH_CONCURRENCY = options.batchConcurrency || 1
  BLOB_LIMITER = pLimit(CONCURRENCY)
  USE_SECONDARY = options.useSecondary || false
}

let gracefulShutdownInitiated = false

process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)

function handleSignal() {
  gracefulShutdownInitiated = true
  logger.info({}, 'graceful shutdown initiated, draining queue')
}

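/**
 * Run an async function, retrying on failure up to `times` attempts with a
 * fixed delay between attempts. The last error is rethrown once the attempts
 * are exhausted.
 */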
async function retry(fn, times, delayMs) {
  let attempts = times
  while (attempts > 0) {
    try {
      const result = await fn()
      return result
    } catch (err) {
      attempts--
      if (attempts === 0) throw err
      await new Promise(resolve => setTimeout(resolve, delayMs))
    }
  }
}

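/**
 * Wrap an async function so that every call is retried with the given
 * retry count and delay.
 */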
function wrapWithRetry(fn, retries, delayMs) {
  return async (...args) => {
    const result = await retry(() => fn(...args), retries, delayMs)
    return result
  }
}

const downloadWithRetry = wrapWithRetry(
  downloadBlobToDir,
  RETRY_LIMIT,
  RETRY_DELAY
)
// FIXME: this creates a new backupPersistor for each blob
// so there is no caching of the DEK
const backupWithRetry = wrapWithRetry(backupBlob, RETRY_LIMIT, RETRY_DELAY)

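/**
 * Filter a list of blobs down to those that still need to be backed up,
 * skipping blobs already recorded as backed up for the project and
 * non-demoted global blobs.
 */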
async function findNewBlobs(projectId, blobs) {
  const newBlobs = []
  const existingBackedUpBlobHashes = await getBackedUpBlobHashes(projectId)
  for (const blob of blobs) {
    const hash = blob.getHash()
    if (existingBackedUpBlobHashes.has(blob.getHash())) {
      logger.debug({ projectId, hash }, 'Blob is already backed up, skipping')
      continue
    }
    const globalBlob = GLOBAL_BLOBS.get(hash)
    if (globalBlob && !globalBlob.demoted) {
      logger.debug(
        { projectId, hash },
        'Blob is a global blob and not demoted, skipping'
      )
      continue
    }
    newBlobs.push(blob)
  }
  return newBlobs
}

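/**
 * Remove the given blobs' hashes from the project's backed-up blob hash set;
 * in dry-run mode only logs what would be removed.
 */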
async function cleanBackedUpBlobs(projectId, blobs) {
  const hashes = blobs.map(blob => blob.getHash())
  if (DRY_RUN) {
    console.log(
      'Would remove blobs',
      hashes.join(' '),
      'from project',
      projectId
    )
    return
  }
  await unsetBackedUpBlobHashes(projectId, hashes)
}

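/**
 * Back up a single blob: download it from the history store into tmpDir,
 * then upload it via the backup persistor. In dry-run mode only logs the
 * action.
 */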
async function backupSingleBlob(projectId, historyId, blob, tmpDir, persistor) {
  if (DRY_RUN) {
    console.log(
      'Would back up blob',
      JSON.stringify(blob),
      'in history',
      historyId,
      'for project',
      projectId
    )
    return
  }
  logger.debug({ blob, historyId }, 'backing up blob')
  const blobPath = await downloadWithRetry(historyId, blob, tmpDir)
  await backupWithRetry(historyId, blob, blobPath, persistor)
}

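/**
 * Back up a list of blobs concurrently (bounded by the limiter), staging
 * downloads in a temporary directory that is removed afterwards. Rejects if
 * any single blob backup fails.
 */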
async function backupBlobs(projectId, historyId, blobs, limiter, persistor) {
  let tmpDir
  try {
    tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'blob-backup-'))

    const blobBackupOperations = blobs.map(blob =>
      limiter(backupSingleBlob, projectId, historyId, blob, tmpDir, persistor)
    )

    // Reject if any blob backup fails
    await Promise.all(blobBackupOperations)
  } finally {
    if (tmpDir) {
      await fs.rm(tmpDir, { recursive: true, force: true })
    }
  }
}

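/**
 * Upload the gzip-compressed chunk buffer to the chunks bucket, keyed by the
 * chunk's start version, with an MD5 checksum for integrity. In dry-run mode
 * only logs the action.
 */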
async function backupChunk(
  projectId,
  historyId,
  chunkBackupPersistorForProject,
  chunkToBackup,
  chunkRecord,
  chunkBuffer
) {
  if (DRY_RUN) {
    console.log(
      'Would back up chunk',
      JSON.stringify(chunkRecord),
      'in history',
      historyId,
      'for project',
      projectId,
      'key',
      makeChunkKey(historyId, chunkToBackup.startVersion)
    )
    return
  }
  const key = makeChunkKey(historyId, chunkToBackup.startVersion)
  logger.debug({ chunkRecord, historyId, projectId, key }, 'backing up chunk')
  const md5 = Crypto.createHash('md5').update(chunkBuffer)
  await chunkBackupPersistorForProject.sendStream(
    chunksBucket,
    makeChunkKey(historyId, chunkToBackup.startVersion),
    Stream.Readable.from([chunkBuffer]),
    {
      contentType: 'application/json',
      contentEncoding: 'gzip',
      contentLength: chunkBuffer.byteLength,
      sourceMd5: md5.digest('hex'),
    }
  )
}

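/**
 * Record the new backup version and timestamp for the project in mongo.
 * In dry-run mode only logs the values that would be set.
 */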
async function updateBackupStatus(
  projectId,
  lastBackedUpVersion,
  chunkRecord,
  startOfBackupTime
) {
  if (DRY_RUN) {
    console.log(
      'Would set backup version to',
      chunkRecord.endVersion,
      'with lastBackedUpTimestamp',
      startOfBackupTime
    )
    return
  }
  logger.debug(
    { projectId, chunkRecord, startOfBackupTime },
    'setting backupVersion and lastBackedUpTimestamp'
  )
  await setBackupVersion(
    projectId,
    lastBackedUpVersion,
    chunkRecord.endVersion,
    startOfBackupTime
  )
}

// Define command-line options
const optionDefinitions = [
  {
    name: 'projectId',
    alias: 'p',
    type: String,
    description: 'The ID of the project to backup',
    defaultOption: true,
  },
  {
    name: 'help',
    alias: 'h',
    type: Boolean,
    description: 'Display this usage guide.',
  },
  {
    name: 'status',
    alias: 's',
    type: Boolean,
    description: 'Display project status.',
  },
  {
    name: 'list',
    alias: 'l',
    type: Boolean,
    description: 'List projects that need to be backed up',
  },
  {
    name: 'dry-run',
    alias: 'n',
    type: Boolean,
    description: 'Perform a dry run without making any changes.',
  },
  {
    name: 'retries',
    alias: 'r',
    type: Number,
    description: 'Number of retries, default is 3.',
  },
  {
    name: 'concurrency',
    alias: 'c',
    type: Number,
    description: 'Number of concurrent blob downloads (default: 1)',
  },
  {
    name: 'batch-concurrency',
    alias: 'b',
    type: Number,
    description: 'Number of concurrent project operations (default: 1)',
  },
  {
    name: 'pending',
    alias: 'P',
    type: Boolean,
    description: 'Backup all pending projects.',
  },
  {
    name: 'interval',
    alias: 'i',
    type: Number,
    description: 'Time interval in seconds for pending backups (default: 3600)',
    defaultValue: 3600,
  },
  {
    name: 'fix',
    type: Number,
    description: 'Fix projects without chunks',
  },
  {
    name: 'init',
    alias: 'I',
    type: Boolean,
    description: 'Initialize backups for all projects.',
  },
  { name: 'output', alias: 'o', type: String, description: 'Output file' },
  {
    name: 'start-date',
    type: String,
    description: 'Start date for initialization (ISO format)',
  },
  {
    name: 'end-date',
    type: String,
    description: 'End date for initialization (ISO format)',
  },
  {
    name: 'use-secondary',
    type: Boolean,
    description: 'Use secondary read preference for backup status',
  },
  {
    name: 'compare',
    alias: 'C',
    type: Boolean,
    description:
      'Compare backup with original chunks. With --start-date and --end-date compares all projects in range.',
  },
]

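/**
 * Parse command-line arguments, validate mutually exclusive options, and
 * apply them to the module-level settings. Exits the process on invalid
 * usage or when --help is given.
 */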
function handleOptions() {
  const options = commandLineArgs(optionDefinitions)

  if (options.help) {
    console.log('Usage:')
    optionDefinitions.forEach(option => {
      console.log(` --${option.name}, -${option.alias}: ${option.description}`)
    })
    process.exit(0)
  }

  const projectIdRequired =
    !options.list &&
    !options.pending &&
    !options.init &&
    !(options.fix >= 0) &&
    !(options.compare && options['start-date'] && options['end-date'])

  if (projectIdRequired && !options.projectId) {
    console.error('Error: projectId is required')
    process.exit(1)
  }

  if (options.pending && options.projectId) {
    console.error('Error: --pending cannot be specified with projectId')
    process.exit(1)
  }

  if (options.pending && (options.list || options.status)) {
    console.error('Error: --pending is exclusive with --list and --status')
    process.exit(1)
  }

  if (options.init && options.pending) {
    console.error('Error: --init cannot be specified with --pending')
    process.exit(1)
  }

  if (
    (options['start-date'] || options['end-date']) &&
    !options.init &&
    !options.compare
  ) {
    console.error(
      'Error: date options can only be used with --init or --compare'
    )
    process.exit(1)
  }

  if (options['use-secondary']) {
    USE_SECONDARY = true
  }

  if (
    options.compare &&
    !options.projectId &&
    !(options['start-date'] && options['end-date'])
  ) {
    console.error(
      'Error: --compare requires either projectId or both --start-date and --end-date'
    )
    process.exit(1)
  }

  DRY_RUN = options['dry-run'] || false
  RETRY_LIMIT = options.retries || 3
  CONCURRENCY = options.concurrency || 1
  BATCH_CONCURRENCY = options['batch-concurrency'] || 1
  BLOB_LIMITER = pLimit(CONCURRENCY)
  return options
}

async function displayBackupStatus(projectId) {
  const result = await analyseBackupStatus(projectId)
  console.log('Backup status:', JSON.stringify(result))
}

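/**
 * Load the project's backup status from mongo together with the latest chunk
 * metadata from the chunk store, warn if the two disagree, and return a
 * summary including whether the backup is up to date.
 */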
async function analyseBackupStatus(projectId) {
  const { backupStatus, historyId, currentEndVersion, currentEndTimestamp } =
    await getBackupStatus(projectId)
  // TODO: when we have confidence that the latestChunkMetadata always matches
  // the values from the backupStatus we can skip loading it here
  const latestChunkMetadata = await loadLatestRaw(historyId, {
    readOnly: Boolean(USE_SECONDARY),
  })
  if (
    currentEndVersion &&
    currentEndVersion !== latestChunkMetadata.endVersion
  ) {
    // compare the current end version with the latest chunk metadata to check that
    // the updates to the project collection are reliable
    // expect some failures due to the time window between getBackupStatus and
    // loadLatestRaw where the project is being actively edited.
    logger.warn(
      {
        projectId,
        historyId,
        currentEndVersion,
        currentEndTimestamp,
        latestChunkMetadata,
      },
      'currentEndVersion does not match latest chunk metadata'
    )
  }

  if (DRY_RUN) {
    console.log('Project:', projectId)
    console.log('History ID:', historyId)
    console.log('Latest Chunk Metadata:', JSON.stringify(latestChunkMetadata))
    console.log('Current end version:', currentEndVersion)
    console.log('Current end timestamp:', currentEndTimestamp)
    console.log('Backup status:', backupStatus ?? 'none')
  }
  if (!backupStatus) {
    if (DRY_RUN) {
      console.log('No backup status found - doing full backup')
    }
  }
  const lastBackedUpVersion = backupStatus?.lastBackedUpVersion
  const endVersion = latestChunkMetadata.endVersion
  if (endVersion >= 0 && endVersion === lastBackedUpVersion) {
    if (DRY_RUN) {
      console.log(
        'Project is up to date, last backed up at version',
        lastBackedUpVersion
      )
    }
  } else if (endVersion < lastBackedUpVersion) {
    throw new Error('backup is ahead of project')
  } else {
    if (DRY_RUN) {
      console.log(
        'Project needs to be backed up from',
        lastBackedUpVersion,
        'to',
        endVersion
      )
    }
  }

  return {
    historyId,
    lastBackedUpVersion,
    currentVersion: latestChunkMetadata.endVersion || 0,
    upToDate: endVersion >= 0 && lastBackedUpVersion === endVersion,
    pendingChangeAt: backupStatus?.pendingChangeAt,
    currentEndVersion,
    currentEndTimestamp,
    latestChunkMetadata,
  }
}

async function displayPendingBackups(options) {
  const intervalMs = options.interval * 1000
  for await (const project of listPendingBackups(intervalMs)) {
    console.log(
      'Project:',
      project._id.toHexString(),
      'backup status:',
      JSON.stringify(project.overleaf.backup),
      'history status:',
      JSON.stringify(project.overleaf.history, [
        'currentEndVersion',
        'currentEndTimestamp',
      ])
    )
  }
}

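/**
 * Build the object key for a backed-up chunk from the project key and the
 * zero-padded start version.
 */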
function makeChunkKey(projectId, startVersion) {
  return path.join(projectKey.format(projectId), projectKey.pad(startVersion))
}

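/**
 * Back up a single project: find the last backed-up version, then back up
 * any new blobs and chunks, updating the backup status and pending-change
 * timestamp as it goes. Does nothing if a graceful shutdown has been
 * requested or the backup is already up to date.
 */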
export async function backupProject(projectId, options) {
  if (gracefulShutdownInitiated) {
    return
  }
  await ensureGlobalBlobsLoaded()
  // FIXME: flush the project first!
  // Let's assume the flush happens externally and triggers this backup
  const backupStartTime = new Date()
  // find the last backed up version
  const {
    historyId,
    lastBackedUpVersion,
    currentVersion,
    upToDate,
    pendingChangeAt,
    currentEndVersion,
    latestChunkMetadata,
  } = await analyseBackupStatus(projectId)

  if (upToDate) {
    logger.debug(
      {
        projectId,
        historyId,
        lastBackedUpVersion,
        currentVersion,
        pendingChangeAt,
      },
      'backup is up to date'
    )

    if (
      currentEndVersion === undefined &&
      latestChunkMetadata.endVersion >= 0
    ) {
      if (DRY_RUN) {
        console.log('Would update current metadata to', latestChunkMetadata)
      } else {
        await updateCurrentMetadataIfNotSet(projectId, latestChunkMetadata)
      }
    }

    // clear the pending changes timestamp if the backup is complete
    if (pendingChangeAt) {
      if (DRY_RUN) {
        console.log(
          'Would update or clear pending changes timestamp',
          backupStartTime
        )
      } else {
        await updatePendingChangeTimestamp(projectId, backupStartTime)
      }
    }
    return
  }

  logger.debug(
    {
      projectId,
      historyId,
      lastBackedUpVersion,
      currentVersion,
      pendingChangeAt,
    },
    'backing up project'
  )

  // this persistor works for both the chunks and blobs buckets,
  // because they use the same DEK
  const backupPersistorForProject = await backupPersistor.forProject(
    chunksBucket,
    makeProjectKey(historyId, '')
  )

  let previousBackedUpVersion = lastBackedUpVersion
  const backupVersions = [previousBackedUpVersion]

  for await (const {
    blobsToBackup,
    chunkToBackup,
    chunkRecord,
    chunkBuffer,
  } of backupGenerator(historyId, lastBackedUpVersion)) {
    // backup the blobs first
    // this can be done in parallel but must fail if any blob cannot be backed up
    // if the blob already exists in the backup then that is allowed
    const newBlobs = await findNewBlobs(projectId, blobsToBackup)

    await backupBlobs(
      projectId,
      historyId,
      newBlobs,
      BLOB_LIMITER,
      backupPersistorForProject
    )

    // then backup the original compressed chunk using the startVersion as the key
    await backupChunk(
      projectId,
      historyId,
      backupPersistorForProject,
      chunkToBackup,
      chunkRecord,
      chunkBuffer
    )

    // persist the backup status in mongo for the current chunk
    try {
      await updateBackupStatus(
        projectId,
        previousBackedUpVersion,
        chunkRecord,
        backupStartTime
      )
    } catch (err) {
      logger.error(
        { projectId, chunkRecord, err, backupVersions },
        'error updating backup status'
      )
      throw err
    }

    previousBackedUpVersion = chunkRecord.endVersion
    backupVersions.push(previousBackedUpVersion)

    await cleanBackedUpBlobs(projectId, blobsToBackup)
  }

  // update the current end version and timestamp if they are not set
  if (currentEndVersion === undefined && latestChunkMetadata.endVersion >= 0) {
    if (DRY_RUN) {
      console.log('Would update current metadata to', latestChunkMetadata)
    } else {
      await updateCurrentMetadataIfNotSet(projectId, latestChunkMetadata)
    }
  }

  // clear the pending changes timestamp if the backup is complete, otherwise set it to the time
  // when the backup started (to pick up the new changes on the next backup)
  if (DRY_RUN) {
    console.log(
      'Would update or clear pending changes timestamp',
      backupStartTime
    )
  } else {
    await updatePendingChangeTimestamp(projectId, backupStartTime)
  }
}

function convertToISODate(dateStr) {
  // Expecting YYYY-MM-DD format
  if (!/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) {
    throw new Error('Date must be in YYYY-MM-DD format')
  }
  return new Date(dateStr + 'T00:00:00.000Z').toISOString()
}

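/**
 * Find projects that have a history ID but no lastBackedUpVersion and no
 * chunks in the chunk store, and create an initial empty chunk for each
 * (up to the limit given by --fix). In dry-run mode only logs the projects
 * that would be fixed.
 */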
export async function fixProjectsWithoutChunks(options) {
  const limit = options.fix || 1
  const query = {
    'overleaf.history.id': { $exists: true },
    'overleaf.backup.lastBackedUpVersion': { $in: [null] },
  }
  const cursor = client
    .db()
    .collection('projects')
    .find(query, {
      projection: { _id: 1, 'overleaf.history.id': 1 },
      readPreference: READ_PREFERENCE_SECONDARY,
    })
    .limit(limit)
  for await (const project of cursor) {
    const historyId = project.overleaf.history.id.toString()
    const chunks = await getProjectChunks(historyId)
    if (chunks.length > 0) {
      continue
    }
    if (DRY_RUN) {
      console.log(
        'Would create new chunk for Project ID:',
        project._id.toHexString(),
        'History ID:',
        historyId,
        'Chunks:',
        chunks
      )
    } else {
      console.log(
        'Creating new chunk for Project ID:',
        project._id.toHexString(),
        'History ID:',
        historyId,
        'Chunks:',
        chunks
      )
      const snapshot = new Snapshot()
      const history = new History(snapshot, [])
      const chunk = new Chunk(history, 0)
      await create(historyId, chunk)
      const newChunks = await getProjectChunks(historyId)
      console.log('New chunk:', newChunks)
    }
  }
}

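/**
 * Back up all projects that have no lastBackedUpVersion yet, optionally
 * restricted to an _id range derived from --start-date/--end-date. With
 * --output, only writes the matching project IDs to a file instead of
 * backing them up. Returns counts of projects processed and errors.
 */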
export async function initializeProjects(options) {
  await ensureGlobalBlobsLoaded()
  let totalErrors = 0
  let totalProjects = 0

  const query = {
    'overleaf.backup.lastBackedUpVersion': { $in: [null] },
  }

  if (options['start-date'] && options['end-date']) {
    query._id = {
      $gte: objectIdFromInput(convertToISODate(options['start-date'])),
      $lt: objectIdFromInput(convertToISODate(options['end-date'])),
    }
  }

  const cursor = client
    .db()
    .collection('projects')
    .find(query, {
      projection: { _id: 1 },
      readPreference: READ_PREFERENCE_SECONDARY,
    })

  if (options.output) {
    console.log("Writing project IDs to file: '" + options.output + "'")
    const output = createWriteStream(options.output)
    for await (const project of cursor) {
      output.write(project._id.toHexString() + '\n')
      totalProjects++
    }
    output.end()
    console.log('Wrote ' + totalProjects + ' project IDs to file')
    return
  }

  for await (const project of cursor) {
    if (gracefulShutdownInitiated) {
      console.warn('graceful shutdown: stopping project initialization')
      break
    }
    totalProjects++
    const projectId = project._id.toHexString()
    try {
      await backupProject(projectId, options)
    } catch (err) {
      totalErrors++
      logger.error({ projectId, err }, 'error backing up project')
    }
  }

  return { errors: totalErrors, projects: totalProjects }
}

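/**
 * Back up every project reported by listPendingBackups for the configured
 * interval, stopping early if a graceful shutdown has been requested.
 */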
async function backupPendingProjects(options) {
  const intervalMs = options.interval * 1000
  for await (const project of listPendingBackups(intervalMs)) {
    if (gracefulShutdownInitiated) {
      console.warn('graceful shutdown: stopping pending project backups')
      break
    }
    const projectId = project._id.toHexString()
    console.log(`Backing up pending project with ID: ${projectId}`)
    await backupProject(projectId, options)
  }
}

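/**
 * Compares project blobs against their backed-up copies by recomputing the
 * blob hash from the backup stream, caching results per hash so repeated
 * blobs are only downloaded once.
 */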
class BlobComparator {
  constructor(backupPersistorForProject) {
    this.cache = new Map()
    this.backupPersistorForProject = backupPersistorForProject
  }

  async compareBlob(historyId, blob) {
    let computedHash = this.cache.get(blob.hash)
    const fromCache = !!computedHash

    if (!computedHash) {
      const blobKey = makeProjectKey(historyId, blob.hash)
      const backupBlobStream =
        await this.backupPersistorForProject.getObjectStream(
          projectBlobsBucket,
          blobKey,
          { autoGunzip: true }
        )
      computedHash = await blobHashFromStream(blob.byteLength, backupBlobStream)
      this.cache.set(blob.hash, computedHash)
    }

    const matches = computedHash === blob.hash
    return {
      matches,
      computedHash,
      fromCache,
    }
  }
}

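/**
 * Compare a single project's backup with the original: check each chunk's
 * backed-up content against the chunk store and verify the hash of every
 * non-global blob referenced by those chunks. Prints a summary and throws
 * if any mismatch or missing object is found.
 */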
async function compareBackups(projectId, options) {
  console.log(`Comparing backups for project ${projectId}`)
  const { historyId } = await getBackupStatus(projectId)
  const chunks = await getProjectChunks(historyId)
  const blobStore = new BlobStore(historyId)
  const backupPersistorForProject = await backupPersistor.forProject(
    chunksBucket,
    makeProjectKey(historyId, '')
  )

  let totalChunkMatches = 0
  let totalChunkMismatches = 0
  let totalChunksNotFound = 0
  let totalBlobMatches = 0
  let totalBlobMismatches = 0
  let totalBlobsNotFound = 0
  const errors = []
  const blobComparator = new BlobComparator(backupPersistorForProject)

  for (const chunk of chunks) {
    try {
      // Compare chunk content
      const originalChunk = await historyStore.loadRaw(historyId, chunk.id)
      const key = makeChunkKey(historyId, chunk.startVersion)
      try {
        const backupChunkStream =
          await backupPersistorForProject.getObjectStream(chunksBucket, key)
        const backupStr = await text(backupChunkStream.pipe(createGunzip()))
        const originalStr = JSON.stringify(originalChunk)
        const backupChunk = JSON.parse(backupStr)
        const backupStartVersion = chunk.startVersion
        const backupEndVersion = chunk.startVersion + backupChunk.changes.length

        if (originalStr === backupStr) {
          console.log(
            `✓ Chunk ${chunk.id} (v${chunk.startVersion}-v${chunk.endVersion}) matches`
          )
          totalChunkMatches++
        } else if (originalStr === JSON.stringify(JSON.parse(backupStr))) {
          console.log(
            `✓ Chunk ${chunk.id} (v${chunk.startVersion}-v${chunk.endVersion}) matches (after normalisation)`
          )
          totalChunkMatches++
        } else if (backupEndVersion < chunk.endVersion) {
          console.log(
            `✗ Chunk ${chunk.id} is ahead of backup (v${chunk.startVersion}-v${chunk.endVersion} vs v${backupStartVersion}-v${backupEndVersion})`
          )
          totalChunkMismatches++
          errors.push({ chunkId: chunk.id, error: 'Chunk ahead of backup' })
        } else {
          console.log(
            `✗ Chunk ${chunk.id} (v${chunk.startVersion}-v${chunk.endVersion}) MISMATCH`
          )
          totalChunkMismatches++
          errors.push({ chunkId: chunk.id, error: 'Chunk mismatch' })
        }
      } catch (err) {
        if (err instanceof NotFoundError) {
          console.log(`✗ Chunk ${chunk.id} not found in backup`, err.cause)
          totalChunksNotFound++
          errors.push({ chunkId: chunk.id, error: `Chunk not found` })
        } else {
          throw err
        }
      }

      const history = History.fromRaw(originalChunk)

      // Compare blobs in chunk
      const blobHashes = new Set()
      history.findBlobHashes(blobHashes)
      const blobs = await blobStore.getBlobs(Array.from(blobHashes))
      for (const blob of blobs) {
        if (GLOBAL_BLOBS.has(blob.hash)) {
          const globalBlob = GLOBAL_BLOBS.get(blob.hash)
          console.log(
            ` ✓ Blob ${blob.hash} is a global blob`,
            globalBlob.demoted ? '(demoted)' : ''
          )
          continue
        }
        try {
          const { matches, computedHash, fromCache } =
            await blobComparator.compareBlob(historyId, blob)

          if (matches) {
            console.log(
              ` ✓ Blob ${blob.hash} hash matches (${blob.byteLength} bytes)` +
                (fromCache ? ' (from cache)' : '')
            )
            totalBlobMatches++
          } else {
            console.log(
              ` ✗ Blob ${blob.hash} hash mismatch (original: ${blob.hash}, backup: ${computedHash}) (${blob.byteLength} bytes, ${blob.stringLength} string length)` +
                (fromCache ? ' (from cache)' : '')
            )
            totalBlobMismatches++
            errors.push({
              chunkId: chunk.id,
              error: `Blob ${blob.hash} hash mismatch`,
            })
          }
        } catch (err) {
          if (err instanceof NotFoundError) {
            console.log(` ✗ Blob ${blob.hash} not found in backup`, err.cause)
            totalBlobsNotFound++
            errors.push({
              chunkId: chunk.id,
              error: `Blob ${blob.hash} not found`,
            })
          } else {
            throw err
          }
        }
      }
    } catch (err) {
      console.error(`Error comparing chunk ${chunk.id}:`, err)
      errors.push({ chunkId: chunk.id, error: err })
    }
  }

  // Print summary
  console.log('\nComparison Summary:')
  console.log('==================')
  console.log(`Total chunks: ${chunks.length}`)
  console.log(`Chunk matches: ${totalChunkMatches}`)
  console.log(`Chunk mismatches: ${totalChunkMismatches}`)
  console.log(`Chunk not found: ${totalChunksNotFound}`)
  console.log(`Blob matches: ${totalBlobMatches}`)
  console.log(`Blob mismatches: ${totalBlobMismatches}`)
  console.log(`Blob not found: ${totalBlobsNotFound}`)
  console.log(`Errors: ${errors.length}`)

  if (errors.length > 0) {
    console.log('\nErrors:')
    errors.forEach(({ chunkId, error }) => {
      console.log(` Chunk ${chunkId}: ${error}`)
    })
    throw new Error('Backup comparison FAILED')
  } else {
    console.log('Backup comparison successful')
  }
}

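/**
 * Compare backups for every project in the --start-date/--end-date range,
 * processing batches with limited concurrency and reporting how many
 * projects failed comparison.
 */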
async function compareAllProjects(options) {
  const limiter = pLimit(BATCH_CONCURRENCY)
  let totalErrors = 0
  let totalProjects = 0

  async function processBatch(batch) {
    if (gracefulShutdownInitiated) {
      throw new Error('graceful shutdown')
    }
    const batchOperations = batch.map(project =>
      limiter(async () => {
        const projectId = project._id.toHexString()
        totalProjects++
        try {
          console.log(`\nComparing project ${projectId} (${totalProjects})`)
          await compareBackups(projectId, options)
        } catch (err) {
          totalErrors++
          console.error(`Failed to compare project ${projectId}:`, err)
        }
      })
    )
    await Promise.allSettled(batchOperations)
  }

  const query = {
    'overleaf.history.id': { $exists: true },
    'overleaf.backup.lastBackedUpVersion': { $exists: true },
  }

  await batchedUpdate(
    client.db().collection('projects'),
    query,
    processBatch,
    {
      _id: 1,
      'overleaf.history': 1,
      'overleaf.backup': 1,
    },
    { readPreference: 'secondary' },
    {
      BATCH_RANGE_START: convertToISODate(options['start-date']),
      BATCH_RANGE_END: convertToISODate(options['end-date']),
    }
  )

  console.log('\nComparison Summary:')
  console.log('==================')
  console.log(`Total projects processed: ${totalProjects}`)
  console.log(`Projects with errors: ${totalErrors}`)

  if (totalErrors > 0) {
    throw new Error('Some project comparisons failed')
  }
}

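/**
 * Command-line entry point: parse options and dispatch to the requested
 * mode (status, list, fix, pending, init, compare or single-project backup).
 */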
async function main() {
  const options = handleOptions()
  await ensureGlobalBlobsLoaded()
  const projectId = options.projectId
  if (options.status) {
    await displayBackupStatus(projectId)
  } else if (options.list) {
    await displayPendingBackups(options)
  } else if (options.fix !== undefined) {
    await fixProjectsWithoutChunks(options)
  } else if (options.pending) {
    await backupPendingProjects(options)
  } else if (options.init) {
    await initializeProjects(options)
  } else if (options.compare) {
    if (options['start-date'] && options['end-date']) {
      await compareAllProjects(options)
    } else {
      await compareBackups(projectId, options)
    }
  } else {
    await backupProject(projectId, options)
  }
}

// Only run command-line interface when script is run directly
if (import.meta.url === `file://${process.argv[1]}`) {
  main()
    .then(() => {
      console.log(
        gracefulShutdownInitiated ? 'Exited - graceful shutdown' : 'Completed'
      )
    })
    .catch(err => {
      console.error('Error backing up project:', err)
      process.exit(1)
    })
    .finally(() => {
      knex
        .destroy()
        .then(() => {
          console.log('Postgres connection closed')
        })
        .catch(err => {
          console.error('Error closing Postgres connection:', err)
        })
      client
        .close()
        .then(() => {
          console.log('MongoDB connection closed')
        })
        .catch(err => {
          console.error('Error closing MongoDB connection:', err)
        })
      redis
        .disconnect()
        .then(() => {
          console.log('Redis connection closed')
        })
        .catch(err => {
          console.error('Error closing Redis connection:', err)
        })
    })
}