first commit
services/history-v1/storage/scripts/back_fill_file_hash.mjs (new file, 1476 lines; diff suppressed because it is too large)
@@ -0,0 +1,647 @@
|
||||
// @ts-check
|
||||
import Events from 'node:events'
|
||||
import fs from 'node:fs'
|
||||
import Stream from 'node:stream'
|
||||
import { ObjectId } from 'mongodb'
|
||||
import logger from '@overleaf/logger'
|
||||
import OError from '@overleaf/o-error'
|
||||
import { Blob } from 'overleaf-editor-core'
|
||||
import {
|
||||
BlobStore,
|
||||
getStringLengthOfFile,
|
||||
GLOBAL_BLOBS,
|
||||
makeBlobForFile,
|
||||
} from '../lib/blob_store/index.js'
|
||||
import { db } from '../lib/mongodb.js'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import readline from 'node:readline'
|
||||
import { _blobIsBackedUp, backupBlob } from '../lib/backupBlob.mjs'
|
||||
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
|
||||
import filestorePersistor from '../lib/persistor.js'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
|
||||
// Silence warning.
|
||||
Events.setMaxListeners(20)
|
||||
|
||||
// Enable caching for ObjectId.toString()
|
||||
ObjectId.cacheHexString = true
|
||||
|
||||
/**
|
||||
* @typedef {import("mongodb").Collection} Collection
|
||||
* @typedef {import("mongodb").Collection<Project>} ProjectsCollection
|
||||
* @typedef {import("mongodb").Collection<{project: Project}>} DeletedProjectsCollection
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} FileRef
|
||||
* @property {ObjectId} _id
|
||||
* @property {string} hash
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} Folder
|
||||
* @property {Array<Folder>} folders
|
||||
* @property {Array<FileRef>} fileRefs
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} Project
|
||||
* @property {ObjectId} _id
|
||||
* @property {Array<Folder>} rootFolder
|
||||
* @property {{history: {id: (number|string)}}} overleaf
|
||||
*/
|
||||
|
||||
/**
|
||||
* @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, FIX_MISSING_HASH: boolean, LOGS: string}}
|
||||
*/
|
||||
function parseArgs() {
|
||||
const args = commandLineArgs([
|
||||
{ name: 'fixNotFound', type: String, defaultValue: 'true' },
|
||||
{ name: 'fixDeletePermission', type: String, defaultValue: 'true' },
|
||||
{ name: 'fixHashMismatch', type: String, defaultValue: 'true' },
|
||||
{ name: 'fixMissingHash', type: String, defaultValue: 'true' },
|
||||
{ name: 'logs', type: String, defaultValue: '' },
|
||||
])
|
||||
/**
|
||||
* commandLineArgs cannot handle --foo=false, so go the long way
|
||||
* @param {string} name
|
||||
* @return {boolean}
|
||||
*/
|
||||
function boolVal(name) {
|
||||
const v = args[name]
|
||||
if (['true', 'false'].includes(v)) return v === 'true'
|
||||
throw new Error(`expected "true" or "false" for boolean option ${name}`)
|
||||
}
|
||||
return {
|
||||
FIX_HASH_MISMATCH: boolVal('fixNotFound'),
|
||||
FIX_DELETE_PERMISSION: boolVal('fixDeletePermission'),
|
||||
FIX_NOT_FOUND: boolVal('fixHashMismatch'),
|
||||
FIX_MISSING_HASH: boolVal('fixMissingHash'),
|
||||
LOGS: args.logs,
|
||||
}
|
||||
}
|
||||
|
||||
const {
|
||||
FIX_HASH_MISMATCH,
|
||||
FIX_DELETE_PERMISSION,
|
||||
FIX_NOT_FOUND,
|
||||
FIX_MISSING_HASH,
|
||||
LOGS,
|
||||
} = parseArgs()
|
||||
if (!LOGS) {
|
||||
throw new Error('--logs parameter missing')
|
||||
}
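// Illustrative invocation (the log path below is a placeholder; each --fix*
// flag defaults to "true" and accepts an explicit "true" or "false"):
//   node <this script> --logs=/path/to/errors.log --fixMissingHash=false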
|
||||
const BUFFER_DIR = fs.mkdtempSync(
|
||||
process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
|
||||
)
|
||||
const USER_FILES_BUCKET_NAME = process.env.USER_FILES_BUCKET_NAME || ''
|
||||
if (!USER_FILES_BUCKET_NAME) {
|
||||
throw new Error('env var USER_FILES_BUCKET_NAME is missing')
|
||||
}
|
||||
// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode
|
||||
const STREAM_HIGH_WATER_MARK = parseInt(
|
||||
process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(),
|
||||
10
|
||||
)
|
||||
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
|
||||
|
||||
/** @type {ProjectsCollection} */
|
||||
const projectsCollection = db.collection('projects')
|
||||
/** @type {DeletedProjectsCollection} */
|
||||
const deletedProjectsCollection = db.collection('deletedProjects')
|
||||
|
||||
let gracefulShutdownInitiated = false
|
||||
|
||||
process.on('SIGINT', handleSignal)
|
||||
process.on('SIGTERM', handleSignal)
|
||||
|
||||
function handleSignal() {
|
||||
gracefulShutdownInitiated = true
|
||||
console.warn('graceful shutdown initiated, draining queue')
|
||||
}
|
||||
|
||||
class FileDeletedError extends OError {}
|
||||
|
||||
/** @type {Map<string,{project: Project, projectSoftDeleted: boolean}>} */
|
||||
const PROJECT_CACHE = new Map()
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @return {Promise<{project: Project, projectSoftDeleted: boolean}>}
|
||||
*/
|
||||
async function getProject(projectId) {
|
||||
const cached = PROJECT_CACHE.get(projectId)
|
||||
if (cached) return cached
|
||||
|
||||
let projectSoftDeleted
|
||||
let project = await projectsCollection.findOne({
|
||||
_id: new ObjectId(projectId),
|
||||
})
|
||||
if (project) {
|
||||
projectSoftDeleted = false
|
||||
} else {
|
||||
const softDeleted = await deletedProjectsCollection.findOne({
|
||||
'deleterData.deletedProjectId': new ObjectId(projectId),
|
||||
project: { $exists: true },
|
||||
})
|
||||
if (!softDeleted) {
|
||||
throw new OError('project hard-deleted')
|
||||
}
|
||||
project = softDeleted.project
|
||||
projectSoftDeleted = true
|
||||
}
|
||||
PROJECT_CACHE.set(projectId, { projectSoftDeleted, project })
|
||||
return { projectSoftDeleted, project }
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Folder} folder
|
||||
* @param {string} fileId
|
||||
* @return {{path: string, fileRef: FileRef, folder: Folder}|null}
|
||||
*/
|
||||
function getFileTreePath(folder, fileId) {
|
||||
if (!folder) return null
|
||||
let idx = 0
|
||||
if (Array.isArray(folder.fileRefs)) {
|
||||
for (const fileRef of folder.fileRefs) {
|
||||
if (fileRef?._id.toString() === fileId) {
|
||||
return {
|
||||
fileRef,
|
||||
path: `.fileRefs.${idx}`,
|
||||
folder,
|
||||
}
|
||||
}
|
||||
idx++
|
||||
}
|
||||
}
|
||||
idx = 0
|
||||
if (Array.isArray(folder.folders)) {
|
||||
for (const child of folder.folders) {
|
||||
const match = getFileTreePath(child, fileId)
|
||||
if (match) {
|
||||
return {
|
||||
fileRef: match.fileRef,
|
||||
folder: match.folder,
|
||||
path: `.folders.${idx}${match.path}`,
|
||||
}
|
||||
}
|
||||
idx++
|
||||
}
|
||||
}
|
||||
return null
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @return {Promise<{fileRef: FileRef, folder: Folder, fullPath: string, query: Object, projectSoftDeleted: boolean}>}
|
||||
*/
|
||||
async function findFile(projectId, fileId) {
|
||||
const { projectSoftDeleted, project } = await getProject(projectId)
|
||||
const match = getFileTreePath(project.rootFolder[0], fileId)
|
||||
if (!match) {
|
||||
throw new FileDeletedError('file not found in file-tree', {
|
||||
projectSoftDeleted,
|
||||
})
|
||||
}
|
||||
const { path, fileRef, folder } = match
|
||||
let fullPath
|
||||
let query
|
||||
if (projectSoftDeleted) {
|
||||
fullPath = `project.rootFolder.0${path}`
|
||||
query = {
|
||||
'deleterData.deletedProjectId': new ObjectId(projectId),
|
||||
[`${fullPath}._id`]: new ObjectId(fileId),
|
||||
}
|
||||
} else {
|
||||
fullPath = `rootFolder.0${path}`
|
||||
query = {
|
||||
_id: new ObjectId(projectId),
|
||||
[`${fullPath}._id`]: new ObjectId(fileId),
|
||||
}
|
||||
}
|
||||
return {
|
||||
projectSoftDeleted,
|
||||
query,
|
||||
fullPath,
|
||||
fileRef,
|
||||
folder,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} line
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function fixNotFound(line) {
|
||||
const { projectId, fileId, bucketName } = JSON.parse(line)
|
||||
if (bucketName !== USER_FILES_BUCKET_NAME) {
|
||||
throw new OError('not found case for another bucket')
|
||||
}
|
||||
|
||||
const { projectSoftDeleted, query, fullPath, fileRef, folder } =
|
||||
await findFile(projectId, fileId)
|
||||
logger.info({ projectId, fileId, fileRef }, 'removing fileRef')
|
||||
// Copied from _removeElementFromMongoArray (https://github.com/overleaf/internal/blob/11e09528c153de6b7766d18c3c90d94962190371/services/web/app/src/Features/Project/ProjectEntityMongoUpdateHandler.js)
|
||||
const nonArrayPath = fullPath.slice(0, fullPath.lastIndexOf('.'))
|
||||
let result
|
||||
if (projectSoftDeleted) {
|
||||
result = await deletedProjectsCollection.updateOne(query, {
|
||||
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
|
||||
$inc: { 'project.version': 1 },
|
||||
})
|
||||
} else {
|
||||
result = await projectsCollection.updateOne(query, {
|
||||
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
|
||||
$inc: { version: 1 },
|
||||
})
|
||||
}
|
||||
if (result.matchedCount !== 1) {
|
||||
throw new OError('file-tree write did not match', { result })
|
||||
}
|
||||
// Update the cache. The mongo-path of the next file will be off otherwise.
|
||||
folder.fileRefs = folder.fileRefs.filter(f => !f._id.equals(fileId))
|
||||
return true
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} hash
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function setHashInMongo(projectId, fileId, hash) {
|
||||
const { projectSoftDeleted, query, fullPath, fileRef } = await findFile(
|
||||
projectId,
|
||||
fileId
|
||||
)
|
||||
if (fileRef.hash === hash) return
|
||||
logger.info({ projectId, fileId, fileRef, hash }, 'setting fileRef hash')
|
||||
let result
|
||||
if (projectSoftDeleted) {
|
||||
result = await deletedProjectsCollection.updateOne(query, {
|
||||
$set: { [`${fullPath}.hash`]: hash },
|
||||
$inc: { 'project.version': 1 },
|
||||
})
|
||||
} else {
|
||||
result = await projectsCollection.updateOne(query, {
|
||||
$set: { [`${fullPath}.hash`]: hash },
|
||||
$inc: { version: 1 },
|
||||
})
|
||||
}
|
||||
if (result.matchedCount !== 1) {
|
||||
throw new OError('file-tree write did not match', { result })
|
||||
}
|
||||
fileRef.hash = hash // Update cache for completeness.
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} historyId
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function importRestoredFilestoreFile(projectId, fileId, historyId) {
|
||||
const filestoreKey = `${projectId}/${fileId}`
|
||||
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
|
||||
try {
|
||||
let s
|
||||
try {
|
||||
s = await filestorePersistor.getObjectStream(
|
||||
USER_FILES_BUCKET_NAME,
|
||||
filestoreKey
|
||||
)
|
||||
} catch (err) {
|
||||
if (err instanceof NotFoundError) {
|
||||
throw new OError('missing blob, need to restore filestore file', {
|
||||
filestoreKey,
|
||||
})
|
||||
}
|
||||
throw err
|
||||
}
|
||||
await Stream.promises.pipeline(
|
||||
s,
|
||||
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
|
||||
)
|
||||
const blobStore = new BlobStore(historyId)
|
||||
const blob = await blobStore.putFile(path)
|
||||
await backupBlob(historyId, blob, path)
|
||||
await setHashInMongo(projectId, fileId, blob.getHash())
|
||||
} finally {
|
||||
await fs.promises.rm(path, { force: true })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} path
|
||||
* @return {Promise<Blob>}
|
||||
*/
|
||||
async function bufferFilestoreFileToDisk(projectId, fileId, path) {
|
||||
const filestoreKey = `${projectId}/${fileId}`
|
||||
try {
|
||||
await Stream.promises.pipeline(
|
||||
await filestorePersistor.getObjectStream(
|
||||
USER_FILES_BUCKET_NAME,
|
||||
filestoreKey
|
||||
),
|
||||
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
|
||||
)
|
||||
const blob = await makeBlobForFile(path)
|
||||
blob.setStringLength(
|
||||
await getStringLengthOfFile(blob.getByteLength(), path)
|
||||
)
|
||||
return blob
|
||||
} catch (err) {
|
||||
if (err instanceof NotFoundError) {
|
||||
throw new OError('missing blob, need to restore filestore file', {
|
||||
filestoreKey,
|
||||
})
|
||||
}
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @return {Promise<string>}
|
||||
*/
|
||||
async function computeFilestoreFileHash(projectId, fileId) {
|
||||
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
|
||||
try {
|
||||
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
|
||||
return blob.getHash()
|
||||
} finally {
|
||||
await fs.promises.rm(path, { force: true })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function uploadFilestoreFile(projectId, fileId) {
|
||||
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
|
||||
try {
|
||||
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
|
||||
const hash = blob.getHash()
|
||||
try {
|
||||
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
|
||||
} catch (err) {
|
||||
if (!(err instanceof Blob.NotFoundError)) throw err
|
||||
|
||||
const { project } = await getProject(projectId)
|
||||
const historyId = project.overleaf.history.id.toString()
|
||||
const blobStore = new BlobStore(historyId)
|
||||
await blobStore.putBlob(path, blob)
|
||||
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
|
||||
}
|
||||
} finally {
|
||||
await fs.promises.rm(path, { force: true })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} line
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function fixHashMismatch(line) {
|
||||
const {
|
||||
projectId,
|
||||
fileId,
|
||||
hash: computedHash,
|
||||
entry: {
|
||||
hash: fileTreeHash,
|
||||
ctx: { historyId },
|
||||
},
|
||||
} = JSON.parse(line)
|
||||
const blobStore = new BlobStore(historyId)
|
||||
if (await blobStore.getBlob(fileTreeHash)) {
|
||||
throw new OError('found blob with computed filestore object hash')
|
||||
}
|
||||
if (!(await blobStore.getBlob(computedHash))) {
|
||||
await importRestoredFilestoreFile(projectId, fileId, historyId)
|
||||
return true
|
||||
}
|
||||
return await ensureBlobExistsForFileAndUploadToAWS(
|
||||
projectId,
|
||||
fileId,
|
||||
computedHash
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} hash
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function hashAlreadyUpdatedInFileTree(projectId, fileId, hash) {
|
||||
const { fileRef } = await findFile(projectId, fileId)
|
||||
return fileRef.hash === hash
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} hash
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function needsBackingUpToAWS(projectId, hash) {
|
||||
if (GLOBAL_BLOBS.has(hash)) return false
|
||||
return !(await _blobIsBackedUp(projectId, hash))
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} hash
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) {
|
||||
const { project } = await getProject(projectId)
|
||||
const historyId = project.overleaf.history.id.toString()
|
||||
const blobStore = new BlobStore(historyId)
|
||||
if (
|
||||
(await hashAlreadyUpdatedInFileTree(projectId, fileId, hash)) &&
|
||||
(await blobStore.getBlob(hash)) &&
|
||||
!(await needsBackingUpToAWS(projectId, hash))
|
||||
) {
|
||||
return false // already processed
|
||||
}
|
||||
|
||||
const stream = await blobStore.getStream(hash)
|
||||
const path = `${BUFFER_DIR}/${historyId}_${hash}`
|
||||
try {
|
||||
await Stream.promises.pipeline(
|
||||
stream,
|
||||
fs.createWriteStream(path, {
|
||||
highWaterMark: STREAM_HIGH_WATER_MARK,
|
||||
})
|
||||
)
|
||||
|
||||
const writtenBlob = await makeBlobForFile(path)
|
||||
writtenBlob.setStringLength(
|
||||
await getStringLengthOfFile(writtenBlob.getByteLength(), path)
|
||||
)
|
||||
if (writtenBlob.getHash() !== hash) {
|
||||
// Double check download, better safe than sorry.
|
||||
throw new OError('blob corrupted', { writtenBlob })
|
||||
}
|
||||
|
||||
let blob = await blobStore.getBlob(hash)
|
||||
if (!blob) {
|
||||
// Calling blobStore.putBlob would result in the same error again.
|
||||
// HACK: Skip upload to GCS and finalize putBlob operation directly.
|
||||
await blobStore.backend.insertBlob(historyId, writtenBlob)
|
||||
}
|
||||
await backupBlob(historyId, writtenBlob, path)
|
||||
} finally {
|
||||
await fs.promises.rm(path, { force: true })
|
||||
}
|
||||
await setHashInMongo(projectId, fileId, hash)
|
||||
return true
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} line
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function fixDeletePermission(line) {
|
||||
let { projectId, fileId, hash } = JSON.parse(line)
|
||||
if (!hash) hash = await computeFilestoreFileHash(projectId, fileId)
|
||||
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} line
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function fixMissingHash(line) {
|
||||
let { projectId, _id: fileId } = JSON.parse(line)
|
||||
const {
|
||||
fileRef: { hash },
|
||||
} = await findFile(projectId, fileId)
|
||||
if (hash) {
|
||||
// processed, double check
|
||||
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
|
||||
}
|
||||
await uploadFilestoreFile(projectId, fileId)
|
||||
return true
|
||||
}
|
||||
|
||||
const CASES = {
|
||||
'not found': {
|
||||
match: 'NotFoundError',
|
||||
flag: FIX_NOT_FOUND,
|
||||
action: fixNotFound,
|
||||
},
|
||||
'hash mismatch': {
|
||||
match: 'OError: hash mismatch',
|
||||
flag: FIX_HASH_MISMATCH,
|
||||
action: fixHashMismatch,
|
||||
},
|
||||
'delete permission': {
|
||||
match: 'storage.objects.delete',
|
||||
flag: FIX_DELETE_PERMISSION,
|
||||
action: fixDeletePermission,
|
||||
},
|
||||
'missing file hash': {
|
||||
match: '"bad file hash"',
|
||||
flag: FIX_MISSING_HASH,
|
||||
action: fixMissingHash,
|
||||
},
|
||||
}
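// Note: each fix action above expects a JSON log line containing at least the
// fields it destructures (see the functions above): fixNotFound reads
// { projectId, fileId, bucketName }, fixHashMismatch reads
// { projectId, fileId, hash, entry: { hash, ctx: { historyId } } },
// fixDeletePermission reads { projectId, fileId, hash }, and fixMissingHash
// reads { projectId, _id }. The surrounding log format is not assumed here.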
|
||||
|
||||
const STATS = {
|
||||
processedLines: 0,
|
||||
success: 0,
|
||||
alreadyProcessed: 0,
|
||||
fileDeleted: 0,
|
||||
skipped: 0,
|
||||
failed: 0,
|
||||
unmatched: 0,
|
||||
}
|
||||
function logStats() {
|
||||
console.log(
|
||||
JSON.stringify({
|
||||
time: new Date(),
|
||||
gracefulShutdownInitiated,
|
||||
...STATS,
|
||||
})
|
||||
)
|
||||
}
|
||||
setInterval(logStats, 10_000)
|
||||
|
||||
async function processLog() {
|
||||
const rl = readline.createInterface({
|
||||
input: fs.createReadStream(LOGS),
|
||||
})
|
||||
nextLine: for await (const line of rl) {
|
||||
if (gracefulShutdownInitiated) break
|
||||
STATS.processedLines++
|
||||
if (
|
||||
!(
|
||||
line.includes('"failed to process file"') ||
|
||||
// Process missing hashes as flagged by find_malformed_filetrees.mjs
|
||||
line.includes('"bad file-tree path"')
|
||||
)
|
||||
) {
|
||||
continue
|
||||
}
|
||||
|
||||
for (const [name, { match, flag, action }] of Object.entries(CASES)) {
|
||||
if (!line.includes(match)) continue
|
||||
if (flag) {
|
||||
try {
|
||||
if (await action(line)) {
|
||||
STATS.success++
|
||||
} else {
|
||||
STATS.alreadyProcessed++
|
||||
}
|
||||
} catch (err) {
|
||||
if (err instanceof FileDeletedError) {
|
||||
STATS.fileDeleted++
|
||||
logger.info({ err, line }, 'file deleted, skipping')
|
||||
} else {
|
||||
STATS.failed++
|
||||
logger.error({ err, line }, `failed to fix ${name}`)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
STATS.skipped++
|
||||
}
|
||||
continue nextLine
|
||||
}
|
||||
STATS.unmatched++
|
||||
logger.warn({ line }, 'unknown fatal error')
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
try {
|
||||
await processLog()
|
||||
} finally {
|
||||
logStats()
|
||||
try {
|
||||
await fs.promises.rm(BUFFER_DIR, { recursive: true, force: true })
|
||||
} catch (err) {
|
||||
console.error(`Cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
|
||||
}
|
||||
}
|
||||
const { skipped, failed, unmatched } = STATS
|
||||
await setTimeout(SLEEP_BEFORE_EXIT)
|
||||
if (failed > 0) {
|
||||
process.exit(Math.min(failed, 99))
|
||||
} else if (unmatched > 0) {
|
||||
process.exit(100)
|
||||
} else if (skipped > 0) {
|
||||
process.exit(101)
|
||||
} else {
|
||||
process.exit(0)
|
||||
}
|
||||
}
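// Exit codes (see main() above): 0 = no failures, unmatched lines, or skips;
// 1-99 = number of failed fixes (capped at 99); 100 = at least one unmatched
// fatal error; 101 = lines skipped because the matching --fix* flag was disabled.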
|
||||
|
||||
await main()
|
||||
services/history-v1/storage/scripts/backup.mjs (new file, 1104 lines; diff suppressed because it is too large)

services/history-v1/storage/scripts/backup_blob.mjs (new file, 173 lines)
@@ -0,0 +1,173 @@
|
||||
// @ts-check
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import { backupBlob, downloadBlobToDir } from '../lib/backupBlob.mjs'
|
||||
import withTmpDir from '../../api/controllers/with_tmp_dir.js'
|
||||
import {
|
||||
BlobStore,
|
||||
GLOBAL_BLOBS,
|
||||
loadGlobalBlobs,
|
||||
} from '../lib/blob_store/index.js'
|
||||
import assert from '../lib/assert.js'
|
||||
import knex from '../lib/knex.js'
|
||||
import { client } from '../lib/mongodb.js'
|
||||
import redis from '../lib/redis.js'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
import fs from 'node:fs'
|
||||
|
||||
await loadGlobalBlobs()
|
||||
|
||||
/**
|
||||
* Gracefully shut down the process
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function gracefulShutdown() {
|
||||
console.log('Gracefully shutting down')
|
||||
await knex.destroy()
|
||||
await client.close()
|
||||
await redis.disconnect()
|
||||
await setTimeout(100)
|
||||
process.exit()
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} row
|
||||
* @return {BackupBlobJob}
|
||||
*/
|
||||
function parseCSVRow(row) {
|
||||
const [historyId, hash] = row.split(',')
|
||||
validateBackedUpBlobJob({ historyId, hash })
|
||||
return { historyId, hash }
|
||||
}
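// Each input row is expected to be "historyId,hash"; for example
// (hypothetical values):
//   507f1f77bcf86cd799439011,da39a3ee5e6b4b0d3255bfef95601890afd80709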
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {BackupBlobJob} job
|
||||
*/
|
||||
function validateBackedUpBlobJob(job) {
|
||||
assert.projectId(job.historyId)
|
||||
assert.blobHash(job.hash)
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} path
|
||||
* @return {Promise<Array<BackupBlobJob>>}
|
||||
*/
|
||||
async function readCSV(path) {
|
||||
let fh
|
||||
/** @type {Array<BackupBlobJob>} */
|
||||
const rows = []
|
||||
try {
|
||||
fh = await fs.promises.open(path, 'r')
|
||||
} catch (error) {
|
||||
console.error(`Could not open file: ${error}`)
|
||||
throw error
|
||||
}
|
||||
for await (const line of fh.readLines()) {
|
||||
try {
|
||||
const row = parseCSVRow(line)
|
||||
if (GLOBAL_BLOBS.has(row.hash)) {
|
||||
console.log(`Skipping global blob: ${line}`)
|
||||
continue
|
||||
}
|
||||
rows.push(row)
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : error)
|
||||
console.log(`Skipping invalid row: ${line}`)
|
||||
}
|
||||
}
|
||||
return rows
|
||||
}
|
||||
|
||||
/**
|
||||
* @typedef {Object} BackupBlobJob
|
||||
* @property {string} hash
|
||||
* @property {string} historyId
|
||||
*/
|
||||
|
||||
/**
|
||||
* @param {Object} options
|
||||
* @property {string} [options.historyId]
|
||||
* @property {string} [options.hash]
|
||||
* @property {string} [options.input]
|
||||
* @return {Promise<Array<BackupBlobJob>>}
|
||||
*/
|
||||
async function initialiseJobs({ historyId, hash, input }) {
|
||||
if (input) {
|
||||
return await readCSV(input)
|
||||
}
|
||||
|
||||
if (!historyId) {
|
||||
console.error('historyId is required')
|
||||
process.exitCode = 1
|
||||
await gracefulShutdown()
|
||||
}
|
||||
|
||||
if (!hash) {
|
||||
console.error('hash is required')
|
||||
process.exitCode = 1
|
||||
await gracefulShutdown()
|
||||
}
|
||||
|
||||
validateBackedUpBlobJob({ historyId, hash })
|
||||
|
||||
if (GLOBAL_BLOBS.has(hash)) {
|
||||
console.error(`Blob ${hash} is a global blob; not backing up`)
|
||||
process.exitCode = 1
|
||||
await gracefulShutdown()
|
||||
}
|
||||
return [{ hash, historyId }]
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} historyId
|
||||
* @param {string} hash
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
export async function downloadAndBackupBlob(historyId, hash) {
|
||||
const blobStore = new BlobStore(historyId)
|
||||
const blob = await blobStore.getBlob(hash)
|
||||
if (!blob) {
|
||||
throw new Error(`Blob ${hash} could not be loaded`)
|
||||
}
|
||||
await withTmpDir(`blob-${hash}`, async tmpDir => {
|
||||
const filePath = await downloadBlobToDir(historyId, blob, tmpDir)
|
||||
console.log(`Downloaded blob ${hash} to ${filePath}`)
|
||||
await backupBlob(historyId, blob, filePath)
|
||||
console.log('Backed up blob')
|
||||
})
|
||||
}
|
||||
|
||||
let jobs
|
||||
|
||||
const options = commandLineArgs([
|
||||
{ name: 'historyId', type: String },
|
||||
{ name: 'hash', type: String },
|
||||
{ name: 'input', type: String },
|
||||
])
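// Illustrative invocations (IDs and paths are placeholders):
//   node storage/scripts/backup_blob.mjs --historyId 507f1f77bcf86cd799439011 --hash <blob hash>
//   node storage/scripts/backup_blob.mjs --input blobs-to-backup.csv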
|
||||
|
||||
try {
|
||||
jobs = await initialiseJobs(options)
|
||||
} catch (error) {
|
||||
console.error(error)
|
||||
await gracefulShutdown()
|
||||
}
|
||||
|
||||
if (!Array.isArray(jobs)) {
|
||||
// This is mostly to satisfy typescript
|
||||
process.exitCode = 1
|
||||
await gracefulShutdown()
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
for (const { historyId, hash } of jobs) {
|
||||
try {
|
||||
await downloadAndBackupBlob(historyId, hash)
|
||||
} catch (error) {
|
||||
console.error(error)
|
||||
process.exitCode = 1
|
||||
}
|
||||
}
|
||||
await gracefulShutdown()
|
||||
services/history-v1/storage/scripts/backup_sample.mjs (new file, 153 lines)
@@ -0,0 +1,153 @@
|
||||
// @ts-check
|
||||
import { ObjectId } from 'mongodb'
|
||||
import { READ_PREFERENCE_SECONDARY } from '@overleaf/mongo-utils/batchedUpdate.js'
|
||||
import { db, client } from '../lib/mongodb.js'
|
||||
|
||||
const projectsCollection = db.collection('projects')
|
||||
|
||||
// Enable caching for ObjectId.toString()
|
||||
ObjectId.cacheHexString = true
|
||||
|
||||
// Configuration
|
||||
const SAMPLE_SIZE_PER_ITERATION = process.argv[2]
|
||||
? parseInt(process.argv[2], 10)
|
||||
: 10000
|
||||
const TARGET_ERROR_PERCENTAGE = process.argv[3]
|
||||
? parseFloat(process.argv[3])
|
||||
: 5.0
|
||||
|
||||
let gracefulShutdownInitiated = false
|
||||
|
||||
process.on('SIGINT', handleSignal)
|
||||
process.on('SIGTERM', handleSignal)
|
||||
|
||||
function handleSignal() {
|
||||
gracefulShutdownInitiated = true
|
||||
console.warn('graceful shutdown initiated')
|
||||
}
|
||||
|
||||
async function takeSample(sampleSize) {
|
||||
const results = await projectsCollection
|
||||
.aggregate(
|
||||
[
|
||||
{ $sample: { size: sampleSize } },
|
||||
{
|
||||
$match: { 'overleaf.backup.lastBackedUpVersion': { $exists: true } },
|
||||
},
|
||||
{
|
||||
$count: 'total',
|
||||
},
|
||||
],
|
||||
{ readPreference: READ_PREFERENCE_SECONDARY }
|
||||
)
|
||||
.toArray()
|
||||
|
||||
const count = results[0]?.total || 0
|
||||
return { totalSampled: sampleSize, backedUp: count }
|
||||
}
|
||||
|
||||
function calculateStatistics(
|
||||
cumulativeSampled,
|
||||
cumulativeBackedUp,
|
||||
totalPopulation
|
||||
) {
|
||||
const proportion = Math.max(1, cumulativeBackedUp) / cumulativeSampled
|
||||
|
||||
// Standard error with finite population correction
|
||||
const fpc = Math.sqrt(
|
||||
(totalPopulation - cumulativeSampled) / (totalPopulation - 1)
|
||||
)
|
||||
const stdError =
|
||||
Math.sqrt((proportion * (1 - proportion)) / cumulativeSampled) * fpc
|
||||
|
||||
// 95% confidence interval is approximately ±1.96 standard errors
|
||||
const marginOfError = 1.96 * stdError
|
||||
|
||||
return {
|
||||
proportion,
|
||||
percentage: (proportion * 100).toFixed(2),
|
||||
marginOfError,
|
||||
errorPercentage: (marginOfError * 100).toFixed(2),
|
||||
lowerBound: ((proportion - marginOfError) * 100).toFixed(2),
|
||||
upperBound: ((proportion + marginOfError) * 100).toFixed(2),
|
||||
sampleSize: cumulativeSampled,
|
||||
populationSize: totalPopulation,
|
||||
}
|
||||
}
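// Equivalent formulas (n = cumulativeSampled, N = totalPopulation, p = proportion):
//   stdError      = sqrt(p * (1 - p) / n) * sqrt((N - n) / (N - 1))
//   marginOfError = 1.96 * stdError  // half-width of the ~95% confidence interval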
|
||||
|
||||
async function main() {
|
||||
console.log('Date:', new Date().toISOString())
|
||||
const totalCount = await projectsCollection.estimatedDocumentCount({
|
||||
readPreference: READ_PREFERENCE_SECONDARY,
|
||||
})
|
||||
console.log(
|
||||
`Total projects in collection (estimated): ${totalCount.toLocaleString()}`
|
||||
)
|
||||
console.log(`Target margin of error: ${TARGET_ERROR_PERCENTAGE}%`)
|
||||
|
||||
let cumulativeSampled = 0
|
||||
let cumulativeBackedUp = 0
|
||||
let currentError = Infinity
|
||||
let iteration = 0
|
||||
|
||||
console.log('Iteration | Total Sampled | % Backed Up | Margin of Error')
|
||||
console.log('----------|---------------|-------------|----------------')
|
||||
|
||||
while (currentError > TARGET_ERROR_PERCENTAGE) {
|
||||
if (gracefulShutdownInitiated) {
|
||||
console.log('Graceful shutdown initiated. Exiting sampling loop.')
|
||||
break
|
||||
}
|
||||
|
||||
iteration++
|
||||
const { totalSampled, backedUp } = await takeSample(
|
||||
SAMPLE_SIZE_PER_ITERATION
|
||||
)
|
||||
cumulativeSampled += totalSampled
|
||||
cumulativeBackedUp += backedUp
|
||||
|
||||
const stats = calculateStatistics(
|
||||
cumulativeSampled,
|
||||
cumulativeBackedUp,
|
||||
totalCount
|
||||
)
|
||||
currentError = parseFloat(stats.errorPercentage)
|
||||
|
||||
console.log(
|
||||
`${iteration.toString().padStart(9)} | ` +
|
||||
`${cumulativeSampled.toString().padStart(13)} | ` +
|
||||
`${stats.percentage.padStart(10)}% | ` +
|
||||
`\u00B1${stats.errorPercentage}%`
|
||||
)
|
||||
|
||||
// Small delay between iterations
|
||||
await new Promise(resolve => setTimeout(resolve, 100))
|
||||
}
|
||||
|
||||
const finalStats = calculateStatistics(
|
||||
cumulativeSampled,
|
||||
cumulativeBackedUp,
|
||||
totalCount
|
||||
)
|
||||
|
||||
console.log(
|
||||
`Projects sampled: ${cumulativeSampled.toLocaleString()} out of ${totalCount.toLocaleString()}`
|
||||
)
|
||||
console.log(
|
||||
`Estimated percentage with lastBackedUpVersion: ${finalStats.percentage}%`
|
||||
)
|
||||
console.log(
|
||||
`95% Confidence Interval: ${finalStats.lowerBound}% - ${finalStats.upperBound}%`
|
||||
)
|
||||
console.log(`Final Margin of Error: \u00B1${finalStats.errorPercentage}%`)
|
||||
}
|
||||
|
||||
main()
|
||||
.then(() => console.log('Done.'))
|
||||
.catch(err => {
|
||||
console.error('Error:', err)
|
||||
process.exitCode = 1
|
||||
})
|
||||
.finally(() => {
|
||||
client.close().catch(err => console.error('Error closing MongoDB:', err))
|
||||
})
|
||||
services/history-v1/storage/scripts/backup_scheduler.mjs (new file, 429 lines)
@@ -0,0 +1,429 @@
|
||||
import Queue from 'bull'
|
||||
import config from 'config'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import logger from '@overleaf/logger'
|
||||
import {
|
||||
listPendingBackups,
|
||||
listUninitializedBackups,
|
||||
getBackupStatus,
|
||||
} from '../lib/backup_store/index.js'
|
||||
|
||||
logger.initialize('backup-queue')
|
||||
|
||||
// Use the same redis config as backup_worker
|
||||
const redisOptions = config.get('redis.queue')
|
||||
|
||||
// Create a Bull queue named 'backup'
|
||||
const backupQueue = new Queue('backup', {
|
||||
redis: redisOptions,
|
||||
defaultJobOptions: {
|
||||
removeOnComplete: true,
|
||||
removeOnFail: true,
|
||||
},
|
||||
})
|
||||
|
||||
// Define command-line options
|
||||
const optionDefinitions = [
|
||||
{ name: 'clean', type: Boolean },
|
||||
{ name: 'status', type: Boolean },
|
||||
{
|
||||
name: 'add',
|
||||
type: String,
|
||||
multiple: true,
|
||||
description: 'Project IDs or date range in YYYY-MM-DD:YYYY-MM-DD format',
|
||||
},
|
||||
{ name: 'monitor', type: Boolean },
|
||||
{
|
||||
name: 'queue-pending',
|
||||
type: Number,
|
||||
description:
|
||||
'Find projects with pending changes older than N seconds and add them to the queue',
|
||||
},
|
||||
{
|
||||
name: 'show-pending',
|
||||
type: Number,
|
||||
description:
|
||||
'Show count of pending projects older than N seconds without adding to queue',
|
||||
},
|
||||
{
|
||||
name: 'limit',
|
||||
type: Number,
|
||||
description: 'Limit the number of jobs to be added',
|
||||
},
|
||||
{
|
||||
name: 'interval',
|
||||
type: Number,
|
||||
description: 'Time in seconds to spread jobs over (default: 300)',
|
||||
defaultValue: 300,
|
||||
},
|
||||
{
|
||||
name: 'backoff-delay',
|
||||
type: Number,
|
||||
description:
|
||||
'Backoff delay in milliseconds for failed jobs (default: 1000)',
|
||||
defaultValue: 1000,
|
||||
},
|
||||
{
|
||||
name: 'attempts',
|
||||
type: Number,
|
||||
description: 'Number of retry attempts for failed jobs (default: 3)',
|
||||
defaultValue: 3,
|
||||
},
|
||||
{
|
||||
name: 'warn-threshold',
|
||||
type: Number,
|
||||
description: 'Warn about any project exceeding this pending age',
|
||||
defaultValue: 2 * 3600, // 2 hours
|
||||
},
|
||||
{
|
||||
name: 'verbose',
|
||||
alias: 'v',
|
||||
type: Boolean,
|
||||
description: 'Show detailed information when used with --show-pending',
|
||||
},
|
||||
]
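// Illustrative invocations (run from services/history-v1; values are examples):
//   node storage/scripts/backup_scheduler.mjs --status
//   node storage/scripts/backup_scheduler.mjs --add 2025-01-01:2025-01-31
//   node storage/scripts/backup_scheduler.mjs --queue-pending 3600 --limit 1000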
|
||||
|
||||
// Parse command line arguments
|
||||
const options = commandLineArgs(optionDefinitions)
|
||||
const WARN_THRESHOLD = options['warn-threshold']
|
||||
|
||||
// Helper to validate date format
|
||||
function isValidDateFormat(dateStr) {
|
||||
return /^\d{4}-\d{2}-\d{2}$/.test(dateStr)
|
||||
}
|
||||
|
||||
// Helper to validate the pending time parameter
|
||||
function validatePendingTime(option, value) {
|
||||
if (typeof value !== 'number' || value <= 0) {
|
||||
console.error(
|
||||
`Error: --${option} requires a positive numeric TIME argument in seconds`
|
||||
)
|
||||
console.error(`Example: --${option} 3600`)
|
||||
process.exit(1)
|
||||
}
|
||||
return value
|
||||
}
|
||||
|
||||
// Helper to format the pending time display
|
||||
function formatPendingTime(timestamp) {
|
||||
const now = new Date()
|
||||
const diffMs = now - timestamp
|
||||
const seconds = Math.floor(diffMs / 1000)
|
||||
return `${timestamp.toISOString()} (${seconds} seconds ago)`
|
||||
}
|
||||
|
||||
// Helper to add a job to the queue, checking for duplicates
|
||||
async function addJobWithCheck(queue, data, options) {
|
||||
const jobId = options.jobId
|
||||
|
||||
// Check if the job already exists
|
||||
const existingJob = await queue.getJob(jobId)
|
||||
|
||||
if (existingJob) {
|
||||
return { job: existingJob, added: false }
|
||||
} else {
|
||||
const job = await queue.add(data, options)
|
||||
return { job, added: true }
|
||||
}
|
||||
}
|
||||
|
||||
// Setup queue event listeners
|
||||
function setupMonitoring() {
|
||||
console.log('Starting queue monitoring. Press Ctrl+C to exit.')
|
||||
|
||||
backupQueue.on('global:error', error => {
|
||||
logger.info({ error }, 'Queue error')
|
||||
})
|
||||
|
||||
backupQueue.on('global:waiting', jobId => {
|
||||
logger.info({ jobId }, 'job is waiting')
|
||||
})
|
||||
|
||||
backupQueue.on('global:active', jobId => {
|
||||
logger.info({ jobId }, 'job is now active')
|
||||
})
|
||||
|
||||
backupQueue.on('global:stalled', jobId => {
|
||||
logger.info({ jobId }, 'job has stalled')
|
||||
})
|
||||
|
||||
backupQueue.on('global:progress', (jobId, progress) => {
|
||||
logger.info({ jobId, progress }, 'job progress')
|
||||
})
|
||||
|
||||
backupQueue.on('global:completed', (jobId, result) => {
|
||||
logger.info({ jobId, result }, 'job completed')
|
||||
})
|
||||
|
||||
backupQueue.on('global:failed', (jobId, err) => {
|
||||
logger.info({ jobId, err }, 'job failed')
|
||||
})
|
||||
|
||||
backupQueue.on('global:paused', () => {
|
||||
logger.info({}, 'Queue paused')
|
||||
})
|
||||
|
||||
backupQueue.on('global:resumed', () => {
|
||||
logger.info({}, 'Queue resumed')
|
||||
})
|
||||
|
||||
backupQueue.on('global:cleaned', (jobs, type) => {
|
||||
logger.info({ jobsCount: jobs.length, type }, 'Jobs cleaned')
|
||||
})
|
||||
|
||||
backupQueue.on('global:drained', () => {
|
||||
logger.info({}, 'Queue drained')
|
||||
})
|
||||
|
||||
backupQueue.on('global:removed', jobId => {
|
||||
logger.info({ jobId }, 'Job removed')
|
||||
})
|
||||
}
|
||||
|
||||
async function addDateRangeJob(input) {
|
||||
const [startDate, endDate] = input.split(':')
|
||||
if (!isValidDateFormat(startDate) || !isValidDateFormat(endDate)) {
|
||||
console.error(
|
||||
`Invalid date format for "${input}". Use YYYY-MM-DD:YYYY-MM-DD`
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
const jobId = `backup-${startDate}-to-${endDate}`
|
||||
const { job, added } = await addJobWithCheck(
|
||||
backupQueue,
|
||||
{ startDate, endDate },
|
||||
{ jobId }
|
||||
)
|
||||
|
||||
console.log(
|
||||
`${added ? 'Added' : 'Already exists'}: date range backup job: ${startDate} to ${endDate}, job ID: ${job.id}`
|
||||
)
|
||||
}
|
||||
|
||||
// Helper to list pending and uninitialized backups
|
||||
// This function combines the two cursors into a single generator
|
||||
// to yield projects from both lists
|
||||
async function* pendingCursor(timeIntervalMs, limit) {
|
||||
for await (const project of listPendingBackups(timeIntervalMs, limit)) {
|
||||
yield project
|
||||
}
|
||||
for await (const project of listUninitializedBackups(timeIntervalMs, limit)) {
|
||||
yield project
|
||||
}
|
||||
}
|
||||
|
||||
// Process pending projects with changes older than the specified seconds
|
||||
async function processPendingProjects(
|
||||
age,
|
||||
showOnly,
|
||||
limit,
|
||||
verbose,
|
||||
jobInterval,
|
||||
jobOpts = {}
|
||||
) {
|
||||
const timeIntervalMs = age * 1000
|
||||
console.log(
|
||||
`Finding projects with pending changes older than ${age} seconds${showOnly ? ' (count only)' : ''}`
|
||||
)
|
||||
|
||||
let count = 0
|
||||
let addedCount = 0
|
||||
let existingCount = 0
|
||||
// Pass the limit directly to the MongoDB query for better performance
|
||||
const changeTimes = []
|
||||
for await (const project of pendingCursor(timeIntervalMs, limit)) {
|
||||
const projectId = project._id.toHexString()
|
||||
const pendingAt =
|
||||
project.overleaf?.backup?.pendingChangeAt || project._id.getTimestamp()
|
||||
if (pendingAt) {
|
||||
changeTimes.push(pendingAt)
|
||||
const pendingAge = Math.floor((Date.now() - pendingAt.getTime()) / 1000)
|
||||
if (pendingAge > WARN_THRESHOLD) {
|
||||
try {
|
||||
const backupStatus = await getBackupStatus(projectId)
|
||||
logger.warn(
|
||||
{
|
||||
projectId,
|
||||
pendingAt,
|
||||
pendingAge,
|
||||
backupStatus,
|
||||
warnThreshold: WARN_THRESHOLD,
|
||||
},
|
||||
`pending change exceeds rpo warning threshold`
|
||||
)
|
||||
} catch (err) {
|
||||
logger.error(
|
||||
{ projectId, pendingAt, pendingAge },
|
||||
'Error getting backup status'
|
||||
)
|
||||
throw err
|
||||
}
|
||||
}
|
||||
}
|
||||
if (showOnly && verbose) {
|
||||
console.log(
|
||||
`Project: ${projectId} (pending since: ${formatPendingTime(pendingAt)})`
|
||||
)
|
||||
} else if (!showOnly) {
|
||||
const delay = Math.floor(Math.random() * jobInterval * 1000) // add random delay to avoid all jobs running simultaneously
|
||||
const { job, added } = await addJobWithCheck(
|
||||
backupQueue,
|
||||
{ projectId, pendingChangeAt: pendingAt.getTime() },
|
||||
{ ...jobOpts, delay, jobId: projectId }
|
||||
)
|
||||
|
||||
if (added) {
|
||||
if (verbose) {
|
||||
console.log(
|
||||
`Added job for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
|
||||
)
|
||||
}
|
||||
addedCount++
|
||||
} else {
|
||||
if (verbose) {
|
||||
console.log(
|
||||
`Job already exists for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
|
||||
)
|
||||
}
|
||||
existingCount++
|
||||
}
|
||||
}
|
||||
|
||||
count++
|
||||
if (count % 1000 === 0) {
|
||||
console.log(
|
||||
`Processed ${count} projects`,
|
||||
showOnly ? '' : `(${addedCount} added, ${existingCount} existing)`
|
||||
)
|
||||
}
|
||||
}
|
||||
// Set oldestChange to undefined if there are no changes
|
||||
const oldestChange =
|
||||
changeTimes.length > 0
|
||||
? changeTimes.reduce((min, time) => (time < min ? time : min))
|
||||
: undefined
|
||||
|
||||
if (showOnly) {
|
||||
console.log(
|
||||
`Found ${count} projects with pending changes (not added to queue)`
|
||||
)
|
||||
} else {
|
||||
console.log(`Found ${count} projects with pending changes:`)
|
||||
console.log(` ${addedCount} jobs added to queue`)
|
||||
console.log(` ${existingCount} jobs already existed in queue`)
|
||||
if (oldestChange) {
|
||||
console.log(` Oldest pending change: ${formatPendingTime(oldestChange)}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Main execution block
|
||||
async function run() {
|
||||
const optionCount = [
|
||||
options.clean,
|
||||
options.status,
|
||||
options.add,
|
||||
options.monitor,
|
||||
options['queue-pending'] !== undefined,
|
||||
options['show-pending'] !== undefined,
|
||||
].filter(Boolean).length
|
||||
if (optionCount > 1) {
|
||||
console.error('Only one option can be specified')
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
if (options.clean) {
|
||||
const beforeCounts = await backupQueue.getJobCounts()
|
||||
console.log('Current queue state:', JSON.stringify(beforeCounts))
|
||||
console.log('Cleaning completed and failed jobs...')
|
||||
await backupQueue.clean(1, 'completed')
|
||||
await backupQueue.clean(1, 'failed')
|
||||
const afterCounts = await backupQueue.getJobCounts()
|
||||
console.log('Current queue state:', JSON.stringify(afterCounts))
|
||||
console.log('Queue cleaned successfully')
|
||||
} else if (options.status) {
|
||||
const counts = await backupQueue.getJobCounts()
|
||||
console.log('Current queue state:', JSON.stringify(counts))
|
||||
} else if (options.add) {
|
||||
const inputs = Array.isArray(options.add) ? options.add : [options.add]
|
||||
for (const input of inputs) {
|
||||
if (input.includes(':')) {
|
||||
// Handle date range format
|
||||
await addDateRangeJob(input)
|
||||
} else {
|
||||
// Handle project ID format
|
||||
const { job, added } = await addJobWithCheck(
|
||||
backupQueue,
|
||||
{ projectId: input },
|
||||
{ jobId: input }
|
||||
)
|
||||
console.log(
|
||||
`${added ? 'Added' : 'Already exists'}: job for project: ${input}, job ID: ${job.id}`
|
||||
)
|
||||
}
|
||||
}
|
||||
} else if (options.monitor) {
|
||||
setupMonitoring()
|
||||
} else if (options['queue-pending'] !== undefined) {
|
||||
const age = validatePendingTime('queue-pending', options['queue-pending'])
|
||||
await processPendingProjects(
|
||||
age,
|
||||
false,
|
||||
options.limit,
|
||||
options.verbose,
|
||||
options.interval,
|
||||
{
|
||||
attempts: options.attempts,
|
||||
backoff: {
|
||||
type: 'exponential',
|
||||
delay: options['backoff-delay'],
|
||||
},
|
||||
}
|
||||
)
|
||||
} else if (options['show-pending'] !== undefined) {
|
||||
const age = validatePendingTime('show-pending', options['show-pending'])
|
||||
await processPendingProjects(age, true, options.limit, options.verbose)
|
||||
} else {
|
||||
console.log('Usage:')
|
||||
console.log(' --clean Clean up completed and failed jobs')
|
||||
console.log(' --status Show current job counts')
|
||||
console.log(' --add [projectId] Add a job for the specified projectId')
|
||||
console.log(
|
||||
' --add [YYYY-MM-DD:YYYY-MM-DD] Add a job for the specified date range'
|
||||
)
|
||||
console.log(' --monitor Monitor queue events')
|
||||
console.log(
|
||||
' --queue-pending TIME Find projects with changes older than TIME seconds and add them to the queue'
|
||||
)
|
||||
console.log(
|
||||
' --show-pending TIME Show count of pending projects older than TIME seconds'
|
||||
)
|
||||
console.log(' --limit N Limit the number of jobs to be added')
|
||||
console.log(
|
||||
' --interval TIME Time interval in seconds to spread jobs over'
|
||||
)
|
||||
console.log(
|
||||
' --backoff-delay TIME Backoff delay in milliseconds for failed jobs (default: 1000)'
|
||||
)
|
||||
console.log(
|
||||
' --attempts N Number of retry attempts for failed jobs (default: 3)'
|
||||
)
|
||||
console.log(
|
||||
' --verbose, -v Show detailed information when used with --show-pending'
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// Run and handle errors
|
||||
run()
|
||||
.catch(err => {
|
||||
console.error('Error:', err)
|
||||
process.exit(1)
|
||||
})
|
||||
.then(result => {
|
||||
// Only exit if not in monitor mode
|
||||
if (!options.monitor) {
|
||||
process.exit(0)
|
||||
}
|
||||
})
|
||||
services/history-v1/storage/scripts/backup_worker.mjs (new file, 144 lines)
@@ -0,0 +1,144 @@
|
||||
import Queue from 'bull'
|
||||
import logger from '@overleaf/logger'
|
||||
import config from 'config'
|
||||
import metrics from '@overleaf/metrics'
|
||||
import {
|
||||
backupProject,
|
||||
initializeProjects,
|
||||
configureBackup,
|
||||
} from './backup.mjs'
|
||||
|
||||
const CONCURRENCY = 15
|
||||
const WARN_THRESHOLD = 2 * 60 * 60 * 1000 // warn if projects are older than this
|
||||
const redisOptions = config.get('redis.queue')
|
||||
const JOB_TIME_BUCKETS = [10, 100, 500, 1000, 5000, 10000, 30000, 60000] // milliseconds
|
||||
const LAG_TIME_BUCKETS_HRS = [
|
||||
0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.75, 2, 3, 4, 5, 6,
|
||||
] // hours
|
||||
|
||||
// Configure backup settings to match worker concurrency
|
||||
configureBackup({ concurrency: 50, useSecondary: true })
|
||||
|
||||
// Create a Bull queue named 'backup'
|
||||
const backupQueue = new Queue('backup', {
|
||||
redis: redisOptions,
|
||||
settings: {
|
||||
lockDuration: 15 * 60 * 1000, // 15 minutes
|
||||
lockRenewTime: 60 * 1000, // 1 minute
|
||||
maxStalledCount: 0, // mark stalled jobs as failed
|
||||
},
|
||||
})
|
||||
|
||||
// Log queue events
|
||||
backupQueue.on('active', job => {
|
||||
logger.debug({ job }, 'job is now active')
|
||||
})
|
||||
|
||||
backupQueue.on('completed', (job, result) => {
|
||||
metrics.inc('backup_worker_job', 1, { status: 'completed' })
|
||||
logger.debug({ job, result }, 'job completed')
|
||||
})
|
||||
|
||||
backupQueue.on('failed', (job, err) => {
|
||||
metrics.inc('backup_worker_job', 1, { status: 'failed' })
|
||||
logger.error({ job, err }, 'job failed')
|
||||
})
|
||||
|
||||
backupQueue.on('waiting', jobId => {
|
||||
logger.debug({ jobId }, 'job is waiting')
|
||||
})
|
||||
|
||||
backupQueue.on('error', error => {
|
||||
logger.error({ error }, 'queue error')
|
||||
})
|
||||
|
||||
backupQueue.on('stalled', job => {
|
||||
logger.error({ job }, 'job has stalled')
|
||||
})
|
||||
|
||||
backupQueue.on('lock-extension-failed', (job, err) => {
|
||||
logger.error({ job, err }, 'lock extension failed')
|
||||
})
|
||||
|
||||
backupQueue.on('paused', () => {
|
||||
logger.info('queue paused')
|
||||
})
|
||||
|
||||
backupQueue.on('resumed', () => {
|
||||
logger.info('queue resumed')
|
||||
})
|
||||
|
||||
// Process jobs
|
||||
backupQueue.process(CONCURRENCY, async job => {
|
||||
const { projectId, startDate, endDate } = job.data
|
||||
|
||||
if (projectId) {
|
||||
return await runBackup(projectId, job.data, job)
|
||||
} else if (startDate && endDate) {
|
||||
return await runInit(startDate, endDate)
|
||||
} else {
|
||||
throw new Error('invalid job data')
|
||||
}
|
||||
})
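// Job payloads therefore take one of two shapes (matching what
// backup_scheduler.mjs enqueues):
//   { projectId, pendingChangeAt } - back up a single project
//   { startDate, endDate }         - initialize projects for a date range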
|
||||
|
||||
async function runBackup(projectId, data, job) {
|
||||
const { pendingChangeAt } = data
|
||||
// record the time it takes to run the backup job
|
||||
const timer = new metrics.Timer(
|
||||
'backup_worker_job_duration',
|
||||
1,
|
||||
{},
|
||||
JOB_TIME_BUCKETS
|
||||
)
|
||||
const pendingAge = Date.now() - pendingChangeAt
|
||||
if (pendingAge > WARN_THRESHOLD) {
|
||||
logger.warn(
|
||||
{ projectId, pendingAge, job },
|
||||
'project has been pending for a long time'
|
||||
)
|
||||
}
|
||||
try {
|
||||
logger.debug({ projectId }, 'processing backup for project')
|
||||
await backupProject(projectId, {})
|
||||
metrics.inc('backup_worker_project', 1, {
|
||||
status: 'success',
|
||||
})
|
||||
timer.done()
|
||||
// record the replication lag (time from change to backup)
|
||||
if (pendingChangeAt) {
|
||||
metrics.histogram(
|
||||
'backup_worker_replication_lag_in_hours',
|
||||
(Date.now() - pendingChangeAt) / (3600 * 1000),
|
||||
LAG_TIME_BUCKETS_HRS
|
||||
)
|
||||
}
|
||||
return `backup completed ${projectId}`
|
||||
} catch (err) {
|
||||
metrics.inc('backup_worker_project', 1, { status: 'failed' })
|
||||
logger.error({ projectId, err }, 'backup failed')
|
||||
throw err // Re-throw to mark job as failed
|
||||
}
|
||||
}
|
||||
|
||||
async function runInit(startDate, endDate) {
|
||||
try {
|
||||
logger.info({ startDate, endDate }, 'initializing projects')
|
||||
await initializeProjects({ 'start-date': startDate, 'end-date': endDate })
|
||||
return `initialization completed ${startDate} - ${endDate}`
|
||||
} catch (err) {
|
||||
logger.error({ startDate, endDate, err }, 'initialization failed')
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
export async function drainQueue() {
|
||||
logger.info({ queue: backupQueue.name }, 'pausing queue')
|
||||
await backupQueue.pause(true) // pause this worker and wait for jobs to finish
|
||||
logger.info({ queue: backupQueue.name }, 'closing queue')
|
||||
await backupQueue.close()
|
||||
}
|
||||
|
||||
export async function healthCheck() {
|
||||
const count = await backupQueue.count()
|
||||
metrics.gauge('backup_worker_queue_length', count)
|
||||
}
|
||||
services/history-v1/storage/scripts/export_global_blobs.mjs (new file, 69 lines)
@@ -0,0 +1,69 @@
|
||||
/**
|
||||
* A script to export the global blobs from mongo to a CSV file.
|
||||
*
|
||||
* node storage/scripts/export_global_blobs.mjs --output global_blobs.csv
|
||||
*
|
||||
* The output CSV has the following format:
|
||||
*
|
||||
* hash,path,byteLength,stringLength,demoted
|
||||
*
|
||||
* hash: the hash of the blob
|
||||
* path: the path of the blob in the blob store
|
||||
* byteLength: the byte length of the blob, or empty if unknown
|
||||
* stringLength: the string length of the blob, or empty if unknown
|
||||
* demoted: true if the blob has been demoted to a reference, false otherwise
|
||||
*/
|
||||
|
||||
// @ts-check
|
||||
import { ObjectId } from 'mongodb'
|
||||
import { GLOBAL_BLOBS, loadGlobalBlobs } from '../lib/blob_store/index.js'
|
||||
import { client } from '../lib/mongodb.js'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import fs from 'node:fs'
|
||||
|
||||
// Enable caching for ObjectId.toString()
|
||||
ObjectId.cacheHexString = true
|
||||
|
||||
function parseArgs() {
|
||||
const args = commandLineArgs([
|
||||
{
|
||||
name: 'output',
|
||||
type: String,
|
||||
alias: 'o',
|
||||
},
|
||||
])
|
||||
const OUTPUT_STREAM = fs.createWriteStream(args['output'], { flags: 'wx' })
|
||||
|
||||
return {
|
||||
OUTPUT_STREAM,
|
||||
}
|
||||
}
|
||||
|
||||
const { OUTPUT_STREAM } = parseArgs()
|
||||
|
||||
async function main() {
|
||||
await loadGlobalBlobs()
|
||||
OUTPUT_STREAM.write('hash,path,byteLength,stringLength,demoted\n')
|
||||
for (const [hash, { blob, demoted }] of GLOBAL_BLOBS) {
|
||||
const { hash: blobHash, byteLength, stringLength } = blob
|
||||
if (blobHash !== hash) {
|
||||
throw new Error(`hash mismatch: ${hash} !== ${blobHash}`)
|
||||
}
|
||||
const path = blobHash.slice(0, 2) + '/' + blobHash.slice(2)
|
||||
const byteLengthStr = byteLength === null ? '' : byteLength
|
||||
const stringLengthStr = stringLength === null ? '' : stringLength
|
||||
OUTPUT_STREAM.write(
|
||||
`${hash},${path},${byteLengthStr},${stringLengthStr},${demoted}\n`
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
main()
|
||||
.then(() => console.log('Done.'))
|
||||
.catch(err => {
|
||||
console.error('Error:', err)
|
||||
process.exitCode = 1
|
||||
})
|
||||
.finally(() => {
|
||||
client.close().catch(err => console.error('Error closing MongoDB:', err))
|
||||
})
|
||||
@@ -0,0 +1,51 @@
|
||||
// @ts-check
|
||||
import { backedUpBlobs } from '../lib/mongodb.js'
|
||||
import { mongoId } from '../lib/assert.js'
|
||||
import { ObjectId } from 'mongodb'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
|
||||
const STATS = {
|
||||
total: 0,
|
||||
replaced: 0,
|
||||
skipped: 0,
|
||||
}
|
||||
|
||||
const config = commandLineArgs([
|
||||
{ name: 'commit', type: Boolean, defaultValue: false },
|
||||
])
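// Without --commit this runs as a dry run: records are counted but nothing is
// written or deleted. Illustrative invocation:
//   node <this script> --commit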
|
||||
|
||||
async function processRecord(record) {
|
||||
STATS.total++
|
||||
try {
|
||||
mongoId(record._id)
|
||||
const newId = new ObjectId(record._id)
|
||||
if (config.commit) {
|
||||
await backedUpBlobs.updateOne(
|
||||
{ _id: newId },
|
||||
{
|
||||
$addToSet: { blobs: { $each: record.blobs } },
|
||||
},
|
||||
{ upsert: true }
|
||||
)
|
||||
await backedUpBlobs.deleteOne({ _id: record._id })
|
||||
}
|
||||
STATS.replaced++
|
||||
} catch (error) {
|
||||
console.log(error)
|
||||
STATS.skipped++
|
||||
}
|
||||
}
|
||||
|
||||
const cursor = backedUpBlobs
|
||||
.find({ _id: { $type: 'string' } })
|
||||
.project({ _id: 1, blobs: 1 })
|
||||
|
||||
while (await cursor.hasNext()) {
|
||||
const record = await cursor.next()
|
||||
await processRecord(record)
|
||||
}
|
||||
|
||||
console.log(
|
||||
`${!config.commit ? 'DRY RUN' : ''} ${STATS.total} records ${STATS.replaced} replaced, ${STATS.skipped} skipped`
|
||||
)
|
||||
process.exit()
|
||||
File diff suppressed because it is too large
@@ -0,0 +1,3 @@
UPDATE blobs
SET global = TRUE
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);
@@ -0,0 +1,16 @@
CREATE TABLE global_blobs (
    hash_bytes bytea NOT NULL,
    byte_length integer NOT NULL,
    string_length integer,
    global boolean,
    CONSTRAINT global_blobs_pkey PRIMARY KEY (hash_bytes),
    CONSTRAINT global_blobs_byte_length_non_negative
        CHECK (byte_length >= 0),
    CONSTRAINT global_blobs_string_length_non_negative
        CHECK (string_length IS NULL OR string_length >= 0)
);

INSERT INTO global_blobs (hash_bytes, byte_length, string_length, global)
SELECT hash_bytes, byte_length, string_length, true
FROM blobs
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);
@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO old_blobs;
ALTER TABLE global_blobs RENAME TO blobs;

ALTER TABLE old_blobs
  RENAME CONSTRAINT blobs_pkey TO old_blobs_pkey;
ALTER TABLE old_blobs
  RENAME CONSTRAINT blobs_byte_length_non_negative
  TO old_blobs_byte_length_non_negative;
ALTER TABLE old_blobs
  RENAME CONSTRAINT blobs_string_length_non_negative
  TO old_blobs_string_length_non_negative;

ALTER TABLE blobs
  RENAME CONSTRAINT global_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
  RENAME CONSTRAINT global_blobs_byte_length_non_negative
  TO blobs_byte_length_non_negative;
ALTER TABLE blobs
  RENAME CONSTRAINT global_blobs_string_length_non_negative
  TO blobs_string_length_non_negative;
COMMIT;
@@ -0,0 +1,9 @@
Scripts in this directory were used when we cleaned up the global blobs table,
ensuring that it only contained global blobs. The scripts are meant to be run in this order:

* `01-create-blob-hashes-table.sql`
* `02-set-global-flag.sql`
* `03-create-global-blobs-table.sql`
* `04-swap-global-blob-tables.sql`

The `rollback.sql` script can be run to reverse the effect of `04-swap-global-blob-tables.sql`.
@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO global_blobs;
ALTER TABLE old_blobs RENAME TO blobs;

ALTER TABLE global_blobs
  RENAME CONSTRAINT blobs_pkey TO global_blobs_pkey;
ALTER TABLE global_blobs
  RENAME CONSTRAINT blobs_byte_length_non_negative
  TO global_blobs_byte_length_non_negative;
ALTER TABLE global_blobs
  RENAME CONSTRAINT blobs_string_length_non_negative
  TO global_blobs_string_length_non_negative;

ALTER TABLE blobs
  RENAME CONSTRAINT old_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
  RENAME CONSTRAINT old_blobs_byte_length_non_negative
  TO blobs_byte_length_non_negative;
ALTER TABLE blobs
  RENAME CONSTRAINT old_blobs_string_length_non_negative
  TO blobs_string_length_non_negative;
COMMIT;
379
services/history-v1/storage/scripts/recover_doc_versions.js
Normal file
379
services/history-v1/storage/scripts/recover_doc_versions.js
Normal file
@@ -0,0 +1,379 @@
|
||||
const fsPromises = require('node:fs/promises')
|
||||
const { ObjectId } = require('mongodb')
|
||||
const BPromise = require('bluebird')
|
||||
const logger = require('@overleaf/logger')
|
||||
const Settings = require('@overleaf/settings')
|
||||
const rclient = require('@overleaf/redis-wrapper').createClient(
|
||||
Settings.redis.documentupdater
|
||||
)
|
||||
const mongodb = require('../lib/mongodb')
|
||||
const { chunkStore } = require('..')
|
||||
const Events = require('node:events')
|
||||
|
||||
// Silence warning.
|
||||
Events.setMaxListeners(20)
|
||||
|
||||
const BATCH_SIZE = 1000
|
||||
const OPTIONS = {
|
||||
concurrency: parseInt(process.env.DOC_VERSION_RECOVERY_CONCURRENCY, 10) || 20,
|
||||
force: process.env.DOC_VERSION_RECOVERY_FORCE === 'true',
|
||||
'skip-history-failures':
|
||||
process.env.DOC_VERSION_RECOVERY_SKIP_HISTORY_FAILURES === 'true',
|
||||
'resyncs-needed-file': process.env.DOC_VERSION_RECOVERY_RESYNCS_NEEDED_FILE,
|
||||
}
|
||||
|
||||
const db = {
|
||||
deletedProjects: mongodb.db.collection('deletedProjects'),
|
||||
docs: mongodb.db.collection('docs'),
|
||||
migrations: mongodb.db.collection('migrations'),
|
||||
projects: mongodb.db.collection('projects'),
|
||||
}
|
||||
|
||||
const BAD_MIGRATION_NAME =
|
||||
'20231219081700_move_doc_versions_from_docops_to_docs'
|
||||
|
||||
const RECOVERY_FILES_502 = [
|
||||
'/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log',
|
||||
'/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log.done',
|
||||
]
|
||||
|
||||
let loggingChain = Promise.resolve()
|
||||
const projectIdsThatNeedResyncing = []
|
||||
const unflushedDocIds = new Set()
|
||||
|
||||
async function flushLogQueue() {
|
||||
const logPath = OPTIONS['resyncs-needed-file']
|
||||
loggingChain = loggingChain.then(async () => {
|
||||
const batch = projectIdsThatNeedResyncing.splice(0)
|
||||
if (batch.length === 0) return
|
||||
try {
|
||||
await fsPromises.appendFile(logPath, batch.join('\n') + '\n')
|
||||
} catch (err) {
|
||||
projectIdsThatNeedResyncing.push(...batch)
|
||||
logger.err({ err, logPath, batch }, 'Failed to write to log file')
|
||||
}
|
||||
})
|
||||
await loggingChain
|
||||
}
|
||||
async function recordProjectNeedsResync(projectId) {
|
||||
if (OPTIONS['resyncs-needed-file']) {
|
||||
projectIdsThatNeedResyncing.push(projectId)
|
||||
await flushLogQueue()
|
||||
} else {
|
||||
console.log(`Project ${projectId} needs a hard resync.`)
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const recovery502Ran = await did502RecoveryRun()
|
||||
await getUnflushedDocIds()
|
||||
const badMigration = await db.migrations.findOne({ name: BAD_MIGRATION_NAME })
|
||||
|
||||
if (unflushedDocIds.size > 0 && !recovery502Ran && badMigration != null) {
|
||||
// Tell customers that they need to flush
|
||||
console.log(`
|
||||
--------------------------------------------------------------------
|
||||
Detected unflushed changes while recovering doc versions.
|
||||
Please go back to version 5.0.1 and follow the recovery procedure
|
||||
for flushing document updates:
|
||||
|
||||
https://github.com/overleaf/overleaf/wiki/Doc-version-recovery
|
||||
--------------------------------------------------------------------`)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
if (OPTIONS.force || recovery502Ran || badMigration != null) {
|
||||
console.warn('Need to recover doc versions. This will take a while.')
|
||||
await runRecovery()
|
||||
await db.migrations.deleteOne({ name: BAD_MIGRATION_NAME })
|
||||
await delete502RecoveryFiles()
|
||||
}
|
||||
|
||||
console.log('Done.')
|
||||
}
|
||||
|
||||
async function did502RecoveryRun() {
|
||||
for (const file of RECOVERY_FILES_502) {
|
||||
try {
|
||||
await fsPromises.stat(file)
|
||||
return true
|
||||
} catch (err) {
|
||||
// file doesn't exist. continue
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
async function delete502RecoveryFiles() {
|
||||
for (const file of RECOVERY_FILES_502) {
|
||||
try {
|
||||
await fsPromises.rename(file, file.replace('.log', '-5.0.2.log'))
|
||||
} catch (err) {
|
||||
// file doesn't exist. continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function runRecovery() {
|
||||
let batch = []
|
||||
const summary = {
|
||||
ignored: 0,
|
||||
skipped: 0,
|
||||
deletedUpdatedMongo: 0,
|
||||
deletedUpdatedRedis: 0,
|
||||
deletedUpdatedBoth: 0,
|
||||
deletedIgnored: 0,
|
||||
updatedMongo: 0,
|
||||
updatedRedis: 0,
|
||||
updatedBoth: 0,
|
||||
}
|
||||
const processBatchAndLogProgress = async () => {
|
||||
try {
|
||||
await BPromise.map(batch, project => processProject(project, summary), {
|
||||
concurrency: OPTIONS.concurrency,
|
||||
})
|
||||
} finally {
|
||||
console.log(`${summary.updatedRedis} projects updated in Redis`)
|
||||
console.log(`${summary.updatedMongo} projects updated in Mongo`)
|
||||
console.log(
|
||||
`${summary.updatedBoth} projects updated in both Mongo and Redis`
|
||||
)
|
||||
console.log(`${summary.ignored} projects had good versions`)
|
||||
console.log(
|
||||
`${summary.deletedUpdatedMongo} deleted projects updated in Mongo`
|
||||
)
|
||||
console.log(
|
||||
`${summary.deletedUpdatedRedis} deleted projects updated in Redis`
|
||||
)
|
||||
console.log(
|
||||
`${summary.deletedUpdatedBoth} deleted projects updated in both Mongo and Redis`
|
||||
)
|
||||
console.log(
|
||||
`${summary.deletedIgnored} deleted projects had good versions`
|
||||
)
|
||||
console.log(`${summary.skipped} projects skipped`)
|
||||
}
|
||||
batch = []
|
||||
}
|
||||
|
||||
await printDBStats()
|
||||
await initResyncsNeededFile()
|
||||
for await (const project of getProjects()) {
|
||||
batch.push(project)
|
||||
if (batch.length >= BATCH_SIZE) {
|
||||
await processBatchAndLogProgress()
|
||||
}
|
||||
}
|
||||
|
||||
for await (const deletedProject of getDeletedProjects()) {
|
||||
const project = deletedProject.project
|
||||
project.isDeleted = true
|
||||
batch.push(project)
|
||||
if (batch.length >= BATCH_SIZE) {
|
||||
await processBatchAndLogProgress()
|
||||
}
|
||||
}
|
||||
|
||||
if (batch.length > 0) {
|
||||
await processBatchAndLogProgress()
|
||||
}
|
||||
|
||||
await backfillMissingVersions()
|
||||
}
|
||||
|
||||
async function getUnflushedDocIds() {
|
||||
const batchSize = 1000
|
||||
let cursor = '0'
|
||||
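// Iterate the doc-version keys with SCAN; Redis hands back a new cursor on
// each call and returns '0' once the whole keyspace has been covered.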
do {
|
||||
const [newCursor, keys] = await rclient.scan(
|
||||
cursor,
|
||||
'MATCH',
|
||||
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: '*' }),
|
||||
'COUNT',
|
||||
batchSize
|
||||
)
|
||||
for (const key of keys) {
|
||||
unflushedDocIds.add(key.slice('DocVersion:'.length))
|
||||
}
|
||||
cursor = newCursor
|
||||
} while (cursor !== '0')
|
||||
}
|
||||
|
||||
async function printDBStats() {
|
||||
const projects = await db.projects.estimatedDocumentCount()
|
||||
const deletedProjects = await db.deletedProjects.countDocuments()
|
||||
const docs = await db.docs.estimatedDocumentCount()
|
||||
console.log(
|
||||
`Need to check ${projects} projects and up to ${deletedProjects} deleted projects with a total of ${docs} docs.`
|
||||
)
|
||||
}
|
||||
|
||||
async function initResyncsNeededFile() {
|
||||
const logPath = OPTIONS['resyncs-needed-file']
|
||||
if (logPath) {
|
||||
await fsPromises.writeFile(logPath, '')
|
||||
await fsPromises.rm(`${logPath}.done`, { force: true })
|
||||
}
|
||||
}
|
||||
|
||||
function getProjects() {
|
||||
return db.projects.find({}, { projection: { _id: 1, overleaf: 1 } })
|
||||
}
|
||||
|
||||
function getDeletedProjects() {
|
||||
return db.deletedProjects.find(
|
||||
{ 'project.overleaf.history.id': { $exists: true } },
|
||||
{ projection: { 'project._id': 1, 'project.overleaf': 1 } }
|
||||
)
|
||||
}
|
||||
|
||||
async function processProject(project, summary) {
|
||||
const projectId = project._id.toString()
|
||||
let updatedMongo = false
|
||||
let updatedRedis = false
|
||||
try {
|
||||
const historyDocVersions = await getHistoryDocVersions(project)
|
||||
|
||||
for (const { docId, version } of historyDocVersions) {
|
||||
const update = await fixDocVersion(docId, version)
|
||||
if (update != null) {
|
||||
if (update.in === 'mongo') {
|
||||
updatedMongo = true
|
||||
} else if (update.in === 'redis') {
|
||||
updatedRedis = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (project.isDeleted) {
|
||||
if (updatedMongo && updatedRedis) {
|
||||
summary.deletedUpdatedBoth += 1
|
||||
} else if (updatedMongo) {
|
||||
summary.deletedUpdatedMongo += 1
|
||||
} else if (updatedRedis) {
|
||||
summary.deletedUpdatedRedis += 1
|
||||
} else {
|
||||
summary.deletedIgnored += 1
|
||||
}
|
||||
} else {
|
||||
await recordProjectNeedsResync(projectId)
|
||||
if (updatedMongo && updatedRedis) {
|
||||
summary.updatedBoth += 1
|
||||
} else if (updatedMongo) {
|
||||
summary.updatedMongo += 1
|
||||
} else if (updatedRedis) {
|
||||
summary.updatedRedis += 1
|
||||
} else {
|
||||
summary.ignored += 1
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
logger.error({ err, projectId }, 'Failed to process project')
|
||||
if (OPTIONS['skip-history-failures']) {
|
||||
summary.skipped += 1
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function getHistoryDocVersions(project) {
|
||||
const historyId = project.overleaf.history.id
|
||||
const chunk = await chunkStore.loadLatest(historyId)
|
||||
if (chunk == null) {
|
||||
return []
|
||||
}
|
||||
|
||||
const snapshot = chunk.getSnapshot()
|
||||
const changes = chunk.getChanges()
|
||||
snapshot.applyAll(changes)
|
||||
const v2DocVersions = snapshot.getV2DocVersions()
|
||||
if (v2DocVersions == null) {
|
||||
return []
|
||||
}
|
||||
return Object.entries(v2DocVersions.data).map(([docId, versionInfo]) => ({
|
||||
docId,
|
||||
version: versionInfo.v,
|
||||
}))
|
||||
}
|
||||
|
||||
async function fixDocVersion(docId, historyVersion) {
|
||||
const redisVersion = await getRedisDocVersion(docId)
|
||||
if (redisVersion != null && historyVersion >= redisVersion) {
|
||||
await setRedisDocVersion(docId, historyVersion + 1)
|
||||
return {
|
||||
in: 'redis',
|
||||
previousVersion: redisVersion,
|
||||
newVersion: historyVersion + 1,
|
||||
}
|
||||
} else {
|
||||
const docBeforeUpdate = await db.docs.findOneAndUpdate(
|
||||
{
|
||||
_id: new ObjectId(docId),
|
||||
$or: [
|
||||
{ version: { $lte: historyVersion } },
|
||||
{ version: { $exists: false } },
|
||||
],
|
||||
},
|
||||
{ $set: { version: historyVersion + 1 } },
|
||||
{ projection: { _id: 1, version: 1 } }
|
||||
)
|
||||
|
||||
if (docBeforeUpdate != null) {
|
||||
return {
|
||||
in: 'mongo',
|
||||
previousVersion: docBeforeUpdate.version,
|
||||
newVersion: historyVersion + 1,
|
||||
}
|
||||
} else {
|
||||
return null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function getRedisDocVersion(docId) {
|
||||
if (!unflushedDocIds.has(docId)) {
|
||||
return null
|
||||
}
|
||||
const result = await rclient.get(
|
||||
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId })
|
||||
)
|
||||
if (result == null) {
|
||||
return null
|
||||
}
|
||||
return parseInt(result, 10)
|
||||
}
|
||||
|
||||
async function setRedisDocVersion(docId, version) {
|
||||
const multi = rclient.multi()
|
||||
multi.set(
|
||||
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId }),
|
||||
version
|
||||
)
|
||||
multi.set(`UnflushedTime:{${docId}}`, Date.now(), 'NX')
|
||||
await multi.exec()
|
||||
}
|
||||
|
||||
/**
|
||||
* Set all remaining versions to 0
|
||||
*/
|
||||
async function backfillMissingVersions() {
|
||||
console.log('Defaulting version to 0 for remaining docs.')
|
||||
await db.docs.updateMany(
|
||||
{ version: { $exists: false } },
|
||||
{ $set: { version: 0 } }
|
||||
)
|
||||
}
|
||||
|
||||
main()
|
||||
.finally(async () => {
|
||||
console.log('Flushing log queue.')
|
||||
await flushLogQueue()
|
||||
})
|
||||
.then(() => {
|
||||
process.exit(0)
|
||||
})
|
||||
.catch(err => {
|
||||
console.error(err)
|
||||
process.exit(1)
|
||||
})
|
||||
255
services/history-v1/storage/scripts/recover_zip.js
Normal file
255
services/history-v1/storage/scripts/recover_zip.js
Normal file
@@ -0,0 +1,255 @@
|
||||
/**
|
||||
* Try to recover a zip of the latest version of a project using only data in
|
||||
* GCS, where this data may have been (recently) hard deleted (i.e. may exist
|
||||
* wholly or in part as non-current versions). This should be able to
|
||||
* retrieve the latest content of a project up to 180 days after it was
|
||||
* deleted.
|
||||
*
|
||||
* Usage:
|
||||
* node recover_zip.js [--verbose] <HISTORY_ID> <HISTORY_ID> ...
|
||||
*
|
||||
* Output:
|
||||
* Signed URL(s) for the uploaded zip files. Note that these are valid for
|
||||
* only 24h, to match the lifecycle rule on the zip bucket.
|
||||
*/
|
||||
|
||||
const fs = require('node:fs')
|
||||
const os = require('node:os')
|
||||
const path = require('node:path')
|
||||
const util = require('node:util')
|
||||
|
||||
// Something is registering 11 listeners, over the limit of 10, which generates
|
||||
// a lot of warning noise.
|
||||
require('node:events').EventEmitter.defaultMaxListeners = 11
|
||||
|
||||
const config = require('config')
|
||||
// We depend on this via object-persistor.
|
||||
// eslint-disable-next-line import/no-extraneous-dependencies
|
||||
const { Storage } = require('@google-cloud/storage')
|
||||
const isValidUtf8 = require('utf-8-validate')
|
||||
|
||||
const core = require('overleaf-editor-core')
|
||||
const projectKey = require('../lib/project_key')
|
||||
const streams = require('../lib/streams')
|
||||
const ProjectArchive = require('../lib/project_archive')
|
||||
|
||||
const {
|
||||
values: { verbose: VERBOSE },
|
||||
positionals: HISTORY_IDS,
|
||||
} = util.parseArgs({
|
||||
options: {
|
||||
verbose: {
|
||||
type: 'boolean',
|
||||
default: false,
|
||||
},
|
||||
},
|
||||
allowPositionals: true,
|
||||
})
|
||||
|
||||
if (HISTORY_IDS.length === 0) {
|
||||
console.error('no history IDs; see usage')
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
async function listDeletedChunks(historyId) {
|
||||
const bucketName = config.get('chunkStore.bucket')
|
||||
const storage = new Storage()
|
||||
const [files] = await storage.bucket(bucketName).getFiles({
|
||||
prefix: projectKey.format(historyId),
|
||||
versions: true,
|
||||
})
|
||||
return files
|
||||
}
|
||||
|
||||
async function findLatestChunk(historyId) {
|
||||
const files = await listDeletedChunks(historyId)
|
||||
if (files.length === 0) return null
|
||||
files.sort((a, b) => {
|
||||
if (a.name < b.name) return -1
|
||||
if (a.name > b.name) return 1
|
||||
return 0
|
||||
})
|
||||
return files[files.length - 1]
|
||||
}
|
||||
|
||||
async function downloadLatestChunk(tmp, historyId) {
|
||||
const latestChunkFile = await findLatestChunk(historyId)
|
||||
if (!latestChunkFile) throw new Error('no chunk found to recover')
|
||||
|
||||
const destination = path.join(tmp, 'latest.json')
|
||||
await latestChunkFile.download({ destination })
|
||||
return destination
|
||||
}
|
||||
|
||||
async function loadHistory(historyPathname) {
|
||||
const data = await fs.promises.readFile(historyPathname)
|
||||
const rawHistory = JSON.parse(data)
|
||||
return core.History.fromRaw(rawHistory)
|
||||
}
|
||||
|
||||
async function loadChunk(historyPathname, blobStore) {
|
||||
const history = await loadHistory(historyPathname)
|
||||
|
||||
const blobHashes = new Set()
|
||||
history.findBlobHashes(blobHashes)
|
||||
|
||||
await blobStore.fetchBlobs(blobHashes)
|
||||
await history.loadFiles('lazy', blobStore)
|
||||
|
||||
return new core.Chunk(history, 0)
|
||||
}
|
||||
|
||||
// TODO: it would be nice to export / expose this from BlobStore;
|
||||
// currently this is a copy of the method there.
|
||||
async function getStringLengthOfFile(byteLength, pathname) {
|
||||
// We have to read the file into memory to get its UTF-8 length, so don't
|
||||
// bother for files that are too large for us to edit anyway.
|
||||
if (byteLength > core.Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
|
||||
return null
|
||||
}
|
||||
|
||||
// We need to check if the file contains nonBmp or null characters
|
||||
let data = await fs.promises.readFile(pathname)
|
||||
if (!isValidUtf8(data)) return null
|
||||
data = data.toString()
|
||||
if (data.length > core.TextOperation.MAX_STRING_LENGTH) return null
|
||||
if (core.util.containsNonBmpChars(data)) return null
|
||||
if (data.indexOf('\x00') !== -1) return null
|
||||
return data.length
|
||||
}
|
||||
|
||||
class RecoveryBlobStore {
|
||||
constructor(historyId, tmp) {
|
||||
this.historyId = historyId
|
||||
this.tmp = tmp
|
||||
this.blobs = new Map()
|
||||
}
|
||||
|
||||
async fetchBlobs(blobHashes) {
|
||||
for await (const blobHash of blobHashes) {
|
||||
await this.fetchBlob(blobHash)
|
||||
}
|
||||
}
|
||||
|
||||
async fetchBlob(hash) {
|
||||
if (this.blobs.has(hash)) return
|
||||
|
||||
if (VERBOSE) console.log('fetching blob', hash)
|
||||
|
||||
const bucketName = config.get('blobStore.projectBucket')
|
||||
const storage = new Storage()
|
||||
const [files] = await storage.bucket(bucketName).getFiles({
|
||||
prefix: this.makeProjectBlobKey(hash),
|
||||
versions: true,
|
||||
})
|
||||
|
||||
const destination = this.getBlobPathname(hash)
|
||||
|
||||
if (files.length === 0) {
|
||||
await this.fetchGlobalBlob(hash, destination)
|
||||
} else if (files.length === 1) {
|
||||
await files[0].download({ destination })
|
||||
} else {
|
||||
throw new Error('Multiple versions of blob ' + hash)
|
||||
}
|
||||
|
||||
this.blobs.set(hash, await this.makeBlob(hash, destination))
|
||||
}
|
||||
|
||||
async fetchGlobalBlob(hash, destination) {
|
||||
const bucketName = config.get('blobStore.globalBucket')
|
||||
const storage = new Storage()
|
||||
const file = storage.bucket(bucketName).file(this.makeGlobalBlobKey(hash))
|
||||
await file.download({ destination })
|
||||
}
|
||||
|
||||
async makeBlob(hash, pathname) {
|
||||
const stat = await fs.promises.stat(pathname)
|
||||
const byteLength = stat.size
|
||||
const stringLength = await getStringLengthOfFile(byteLength, pathname)
|
||||
return new core.Blob(hash, byteLength, stringLength)
|
||||
}
|
||||
|
||||
async getString(hash) {
|
||||
const stream = await this.getStream(hash)
|
||||
const buffer = await streams.readStreamToBuffer(stream)
|
||||
return buffer.toString()
|
||||
}
|
||||
|
||||
async getStream(hash) {
|
||||
return fs.createReadStream(this.getBlobPathname(hash))
|
||||
}
|
||||
|
||||
async getBlob(hash) {
|
||||
return this.blobs.get(hash)
|
||||
}
|
||||
|
||||
getBlobPathname(hash) {
|
||||
return path.join(this.tmp, hash)
|
||||
}
|
||||
|
||||
makeGlobalBlobKey(hash) {
|
||||
return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
|
||||
}
|
||||
|
||||
makeProjectBlobKey(hash) {
|
||||
return `${projectKey.format(this.historyId)}/${hash.slice(
|
||||
0,
|
||||
2
|
||||
)}/${hash.slice(2)}`
|
||||
}
|
||||
}
|
||||
|
||||
async function uploadZip(historyId, zipPathname) {
|
||||
const bucketName = config.get('zipStore.bucket')
|
||||
const deadline = 24 * 3600 * 1000 // lifecycle limit on the zips bucket
|
||||
const storage = new Storage()
|
||||
const destination = `${historyId}-recovered.zip`
|
||||
await storage.bucket(bucketName).upload(zipPathname, { destination })
|
||||
|
||||
const signedUrls = await storage
|
||||
.bucket(bucketName)
|
||||
.file(destination)
|
||||
.getSignedUrl({
|
||||
version: 'v4',
|
||||
action: 'read',
|
||||
expires: Date.now() + deadline,
|
||||
})
|
||||
|
||||
return signedUrls[0]
|
||||
}
|
||||
|
||||
async function restoreProject(historyId) {
|
||||
const tmp = await fs.promises.mkdtemp(
|
||||
path.join(os.tmpdir(), historyId.toString())
|
||||
)
|
||||
if (VERBOSE) console.log('recovering', historyId, 'in', tmp)
|
||||
|
||||
const latestJsonPathname = await downloadLatestChunk(tmp, historyId)
|
||||
const blobStore = new RecoveryBlobStore(historyId, tmp)
|
||||
const chunk = await loadChunk(latestJsonPathname, blobStore)
|
||||
|
||||
const snapshot = chunk.getSnapshot()
|
||||
for (const change of chunk.getChanges()) {
|
||||
change.applyTo(snapshot)
|
||||
}
|
||||
|
||||
if (VERBOSE) console.log('zipping', historyId)
|
||||
|
||||
const zipPathname = path.join(tmp, `${historyId}.zip`)
|
||||
const zipTimeoutMs = 60 * 1000
|
||||
const archive = new ProjectArchive(snapshot, zipTimeoutMs)
|
||||
await archive.writeZip(blobStore, zipPathname)
|
||||
|
||||
if (VERBOSE) console.log('uploading', historyId)
|
||||
|
||||
return await uploadZip(historyId, zipPathname)
|
||||
}
|
||||
|
||||
async function main() {
|
||||
for (const historyId of HISTORY_IDS) {
|
||||
const signedUrl = await restoreProject(historyId)
|
||||
console.log(signedUrl)
|
||||
}
|
||||
}
|
||||
main().catch(console.error)
|
||||
36
services/history-v1/storage/scripts/redis.mjs
Normal file
36
services/history-v1/storage/scripts/redis.mjs
Normal file
@@ -0,0 +1,36 @@
import redis from '@overleaf/redis-wrapper'
import config from 'config'

// Get allowed Redis dbs from config
const redisConfig = config.get('redis')
const allowedDbs = Object.keys(redisConfig)

// Get the Redis db from the command line argument
const db = process.argv[2]

// Validate redis db
if (!allowedDbs.includes(db)) {
  if (db) {
    console.error('Invalid redis db:', db)
  }
  console.error(`Usage: node redis.mjs [${allowedDbs.join('|')}]`)
  process.exit(1)
}

// Get redis options based on command line argument
const redisOptions = config.get(`redis.${db}`)
console.log('Using redis db:', db)
console.log('REDIS CONFIG', {
  ...redisOptions,
  password: '*'.repeat(redisOptions.password?.length),
})
const rclient = redis.createClient(redisOptions)

try {
  await rclient.healthCheck()
  console.log('REDIS HEALTHCHECK SUCCEEDED')
} catch (error) {
  console.error('REDIS HEALTHCHECK FAILED', error)
} finally {
  await rclient.quit()
}
104
services/history-v1/storage/scripts/remove_backed_up_blobs.mjs
Normal file
104
services/history-v1/storage/scripts/remove_backed_up_blobs.mjs
Normal file
@@ -0,0 +1,104 @@
|
||||
// @ts-check
|
||||
import { readFileSync } from 'node:fs'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import { client } from '../lib/mongodb.js'
|
||||
import {
|
||||
getBackedUpBlobHashes,
|
||||
unsetBackedUpBlobHashes,
|
||||
} from '../lib/backup_store/index.js'
|
||||
|
||||
let gracefulShutdownInitiated = false
|
||||
|
||||
// Parse command line arguments
|
||||
const args = commandLineArgs([
|
||||
{ name: 'input', type: String, alias: 'i', defaultOption: true },
|
||||
{ name: 'commit', type: Boolean, defaultValue: false },
|
||||
])
|
||||
|
||||
if (!args.input) {
|
||||
console.error(
|
||||
'Usage: node remove_backed_up_blobs.mjs --input <csv-file> [--commit]'
|
||||
)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
if (!args.commit) {
|
||||
console.log('Running in dry-run mode. Use --commit to apply changes.')
|
||||
}
|
||||
|
||||
// Signal handling
|
||||
process.on('SIGINT', handleSignal)
|
||||
process.on('SIGTERM', handleSignal)
|
||||
|
||||
function handleSignal() {
|
||||
console.warn('Graceful shutdown initiated')
|
||||
gracefulShutdownInitiated = true
|
||||
}
|
||||
|
||||
// Process CSV and remove blobs
|
||||
async function main() {
|
||||
const projectBlobs = new Map()
|
||||
const lines = readFileSync(args.input, 'utf8').split('\n')
|
||||
const SHA1_HEX_REGEX = /^[a-f0-9]{40}$/
|
||||
|
||||
// Skip header
|
||||
for (const line of lines.slice(1)) {
|
||||
if (!line.trim() || gracefulShutdownInitiated) break
|
||||
|
||||
const [projectId, path] = line.split(',')
|
||||
const pathParts = path.split('/')
|
||||
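// The path column is expected to be a project blob key, i.e.
// <prefix>/<prefix>/<prefix>/<hash[0:2]>/<hash[2:]>, so segments 3 and 4
// recombine into the full 40-character SHA-1 (validated below).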
const hash = pathParts[3] + pathParts[4]
|
||||
|
||||
if (!SHA1_HEX_REGEX.test(hash)) {
|
||||
console.warn(`Invalid SHA1 hash for project ${projectId}: ${hash}`)
|
||||
continue
|
||||
}
|
||||
|
||||
if (!projectBlobs.has(projectId)) {
|
||||
projectBlobs.set(projectId, new Set())
|
||||
}
|
||||
projectBlobs.get(projectId).add(hash)
|
||||
}
|
||||
|
||||
// Process each project
|
||||
for (const [projectId, hashes] of projectBlobs) {
|
||||
if (gracefulShutdownInitiated) break
|
||||
|
||||
if (!args.commit) {
|
||||
console.log(
|
||||
`DRY-RUN: would remove ${hashes.size} blobs from project ${projectId}`
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
try {
|
||||
const originalHashes = await getBackedUpBlobHashes(projectId)
|
||||
if (originalHashes.size === 0) {
|
||||
continue
|
||||
}
|
||||
const result = await unsetBackedUpBlobHashes(
|
||||
projectId,
|
||||
Array.from(hashes)
|
||||
)
|
||||
if (result) {
|
||||
console.log(
|
||||
`Project ${projectId}: want to remove ${hashes.size}, removed ${originalHashes.size - result.blobs.length}, ${result.blobs.length} remaining`
|
||||
)
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(`Error updating project ${projectId}:`, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Run the script
|
||||
main()
|
||||
.catch(err => {
|
||||
console.error('Fatal error:', err)
|
||||
process.exitCode = 1
|
||||
})
|
||||
.finally(() => {
|
||||
client
|
||||
.close()
|
||||
.catch(err => console.error('Error closing MongoDB connection:', err))
|
||||
})
|
||||
@@ -0,0 +1,221 @@
|
||||
// @ts-check
|
||||
|
||||
/**
|
||||
* This script is used to remove blobs that have been backed up under the project ID
|
||||
* instead of the history ID (where those are different).
|
||||
*
|
||||
* This script reads a CSV file with the following format:
|
||||
* ```
|
||||
* project_id,hash
|
||||
* <mongo ID>,<hash>
|
||||
* ```
|
||||
*
|
||||
* The header row is optional. All rows will be checked for conformance to the format.
|
||||
*/
|
||||
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
|
||||
import { makeProjectKey } from '../lib/blob_store/index.js'
|
||||
import fs from 'node:fs'
|
||||
import assert from '../lib/assert.js'
|
||||
import { client } from '../lib/mongodb.js'
|
||||
import { verifyBlobs } from '../lib/backupVerifier.mjs'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
import { getHistoryId } from '../lib/backup_store/index.js'
|
||||
|
||||
const argsSchema = [
|
||||
{
|
||||
name: 'input',
|
||||
type: String,
|
||||
},
|
||||
{
|
||||
name: 'commit',
|
||||
type: Boolean,
|
||||
},
|
||||
{
|
||||
name: 'header',
|
||||
type: Boolean,
|
||||
},
|
||||
{
|
||||
name: 'force',
|
||||
type: Boolean,
|
||||
},
|
||||
{
|
||||
name: 'verbose',
|
||||
type: Boolean,
|
||||
},
|
||||
]
|
||||
|
||||
const args = commandLineArgs(argsSchema)
|
||||
|
||||
async function gracefulClose(code = 0) {
|
||||
await client.close()
|
||||
process.exit(code)
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {(value: unknown) => void} fn
|
||||
* @param {unknown} value
|
||||
* @return {boolean}
|
||||
*/
|
||||
function not(fn, value) {
|
||||
try {
|
||||
fn(value)
|
||||
return false
|
||||
} catch {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} row
|
||||
* @return {{projectId: string, hash: string}}
|
||||
*/
|
||||
function parseCSVRow(row) {
|
||||
const [projectId, hash] = row.split(',')
|
||||
assert.mongoId(projectId, `invalid projectId ${projectId}`)
|
||||
assert.blobHash(hash, `invalid hash ${hash}`)
|
||||
return { projectId, hash }
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} path
|
||||
* @param {boolean} hasHeader
|
||||
* @return {AsyncGenerator<{projectId: string, hash: string}, void, *>}
|
||||
*/
|
||||
async function* readCSV(path, hasHeader) {
|
||||
let seenHeader = !hasHeader
|
||||
let fh
|
||||
try {
|
||||
fh = await fs.promises.open(path, 'r')
|
||||
} catch (error) {
|
||||
console.error(`Could not open file: ${error}`)
|
||||
return await gracefulClose(1)
|
||||
}
|
||||
for await (const line of fh.readLines()) {
|
||||
if (!seenHeader) {
|
||||
const [first, second] = line.split(',')
|
||||
const noDataInHeader =
|
||||
not(assert.mongoId, first) && not(assert.blobHash, second)
|
||||
if (!noDataInHeader) {
|
||||
console.error('Data found in header row')
|
||||
return await gracefulClose(1)
|
||||
}
|
||||
seenHeader = true
|
||||
continue
|
||||
}
|
||||
try {
|
||||
yield parseCSVRow(line)
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : error)
|
||||
console.info(`Skipping invalid row: ${line}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function usage() {
|
||||
console.info(
|
||||
'Usage: remove_blobs_from_backup.mjs --input <path> [--commit] [--header] [--force] [--verbose]'
|
||||
)
|
||||
}
|
||||
|
||||
if (!args.input) {
|
||||
console.error('--input was missing')
|
||||
usage()
|
||||
await gracefulClose(1)
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} hash
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function deleteBlob(projectId, hash) {
|
||||
const path = makeProjectKey(projectId, hash)
|
||||
if (args.commit) {
|
||||
await backupPersistor.deleteObject(projectBlobsBucket, path)
|
||||
} else {
|
||||
console.log(`DELETE: ${path}`)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} hash
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function canDeleteBlob(projectId, hash) {
|
||||
let historyId
|
||||
try {
|
||||
historyId = await getHistoryId(projectId)
|
||||
} catch (error) {
|
||||
if (args.verbose) {
|
||||
console.error(error)
|
||||
}
|
||||
throw new Error(`No history ID found for project ${projectId}, skipping`)
|
||||
}
|
||||
if (historyId === projectId) {
|
||||
throw new Error(
|
||||
`Project ID and history ID are the same for ${projectId} - use --force to delete anyway`
|
||||
)
|
||||
}
|
||||
|
||||
// TODO: fix assert.postgresId to handle integers better and then stop coercing to string below
|
||||
assert.postgresId(
|
||||
`${historyId}`,
|
||||
`History ID ${historyId} does not appear to be for a postgres project`
|
||||
)
|
||||
|
||||
try {
|
||||
await verifyBlobs(`${historyId}`, [hash])
|
||||
} catch (error) {
|
||||
if (args.verbose) {
|
||||
console.error(error)
|
||||
}
|
||||
throw new Error(
|
||||
`Blob ${hash} is not backed up for project ${projectId} - use --force to delete anyway`
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
if (!args.commit) {
|
||||
console.log('DRY RUN: provide --commit to perform operations')
|
||||
}
|
||||
|
||||
if (args.force) {
|
||||
console.log(
|
||||
'WARNING: --force is enabled, blobs will be deleted regardless of backup status'
|
||||
)
|
||||
await setTimeout(5_000)
|
||||
}
|
||||
|
||||
let deleted = 0
|
||||
let errors = 0
|
||||
|
||||
for await (const { projectId, hash } of readCSV(args.input, args.header)) {
|
||||
if (!args.force) {
|
||||
try {
|
||||
await canDeleteBlob(projectId, hash)
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : error)
|
||||
continue
|
||||
}
|
||||
}
|
||||
try {
|
||||
await deleteBlob(projectId, hash)
|
||||
deleted++
|
||||
} catch (error) {
|
||||
errors++
|
||||
console.error(error)
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`Deleted: ${deleted}`)
|
||||
console.log(`Errors: ${errors}`)
|
||||
|
||||
await gracefulClose()
|
||||
254
services/history-v1/storage/scripts/show.mjs
Normal file
254
services/history-v1/storage/scripts/show.mjs
Normal file
@@ -0,0 +1,254 @@
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import {
|
||||
loadAtVersion,
|
||||
getChunkMetadataForVersion,
|
||||
getProjectChunksFromVersion,
|
||||
} from '../lib/chunk_store/index.js'
|
||||
import { client } from '../lib/mongodb.js'
|
||||
import knex from '../lib/knex.js'
|
||||
import redis from '../lib/redis.js'
|
||||
import {
|
||||
loadGlobalBlobs,
|
||||
BlobStore,
|
||||
makeProjectKey,
|
||||
} from '../lib/blob_store/index.js'
|
||||
import { TextDecoder } from 'node:util'
|
||||
import {
|
||||
backupPersistor,
|
||||
chunksBucket,
|
||||
projectBlobsBucket,
|
||||
} from '../lib/backupPersistor.mjs'
|
||||
import fs from 'node:fs'
|
||||
import { pipeline } from 'node:stream/promises'
|
||||
import os from 'node:os'
|
||||
import path from 'node:path'
|
||||
import { createHash } from 'node:crypto'
|
||||
import projectKey from '../lib/project_key.js'
|
||||
import { createGunzip } from 'node:zlib'
|
||||
import { text } from 'node:stream/consumers'
|
||||
|
||||
const optionDefinitions = [
|
||||
{ name: 'historyId', alias: 'p', type: String },
|
||||
{ name: 'version', alias: 'v', type: Number },
|
||||
{ name: 'blob', alias: 'b', type: String },
|
||||
{ name: 'remote', alias: 'r', type: Boolean },
|
||||
{ name: 'keep', alias: 'k', type: Boolean },
|
||||
]
|
||||
|
||||
function makeChunkKey(projectId, startVersion) {
|
||||
return path.join(projectKey.format(projectId), projectKey.pad(startVersion))
|
||||
}
|
||||
|
||||
async function listChunks(historyId) {
|
||||
for await (const chunkRecord of getProjectChunksFromVersion(historyId, 0)) {
|
||||
console.log('Chunk record:', chunkRecord)
|
||||
}
|
||||
}
|
||||
|
||||
async function fetchChunkLocal(historyId, version) {
|
||||
const chunkRecord = await getChunkMetadataForVersion(historyId, version)
|
||||
const chunk = await loadAtVersion(historyId, version)
|
||||
return { key: version, chunk, metadata: chunkRecord, source: 'local storage' }
|
||||
}
|
||||
|
||||
async function fetchChunkRemote(historyId, version) {
|
||||
const chunkRecord = await getChunkMetadataForVersion(historyId, version)
|
||||
const startVersion = chunkRecord.startVersion
|
||||
const key = makeChunkKey(historyId, startVersion)
|
||||
const backupPersistorForProject = await backupPersistor.forProject(
|
||||
chunksBucket,
|
||||
key
|
||||
)
|
||||
const backupChunkStream = await backupPersistorForProject.getObjectStream(
|
||||
chunksBucket,
|
||||
key
|
||||
)
|
||||
const backupStr = await text(backupChunkStream.pipe(createGunzip()))
|
||||
return {
|
||||
key,
|
||||
chunk: JSON.parse(backupStr),
|
||||
metadata: chunkRecord,
|
||||
source: 'remote backup',
|
||||
}
|
||||
}
|
||||
|
||||
async function displayChunk(historyId, version, options) {
|
||||
const { key, chunk, metadata, source } = await (options.remote
|
||||
? fetchChunkRemote(historyId, version)
|
||||
: fetchChunkLocal(historyId, version))
|
||||
console.log('Source:', source)
|
||||
console.log('Chunk record', metadata)
|
||||
console.log('Key', key)
|
||||
// console.log('Number of changes', chunk.getChanges().length)
|
||||
console.log(JSON.stringify(chunk))
|
||||
}
|
||||
|
||||
async function fetchBlobRemote(historyId, blobHash) {
|
||||
const backupPersistorForProject = await backupPersistor.forProject(
|
||||
projectBlobsBucket,
|
||||
makeProjectKey(historyId, '')
|
||||
)
|
||||
const blobKey = makeProjectKey(historyId, blobHash)
|
||||
return {
|
||||
stream: await backupPersistorForProject.getObjectStream(
|
||||
projectBlobsBucket,
|
||||
blobKey,
|
||||
{ autoGunzip: true }
|
||||
),
|
||||
metadata: { hash: blobHash },
|
||||
source: 'remote backup',
|
||||
}
|
||||
}
|
||||
|
||||
async function fetchBlobLocal(historyId, blobHash) {
|
||||
const blobStore = new BlobStore(historyId)
|
||||
const blob = await blobStore.getBlob(blobHash)
|
||||
if (!blob) throw new Error(`Blob ${blobHash} not found`)
|
||||
return {
|
||||
stream: await blobStore.getStream(blobHash),
|
||||
metadata: blob,
|
||||
source: 'local storage',
|
||||
}
|
||||
}
|
||||
|
||||
async function displayBlobContent(filepath, metadata, source, blobHash) {
|
||||
console.log('Source:', source)
|
||||
console.log('Blob metadata:', metadata)
|
||||
|
||||
// Compute git hash using streaming
|
||||
const stat = fs.statSync(filepath)
|
||||
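// Git hashes a blob as sha1("blob <size>\0" + content), so hashing the
// header followed by the file contents reproduces `git hash-object`.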
const header = `blob ${stat.size}\0`
|
||||
const hash = createHash('sha1')
|
||||
hash.update(header)
|
||||
|
||||
const hashStream = fs.createReadStream(filepath)
|
||||
for await (const chunk of hashStream) {
|
||||
hash.update(chunk)
|
||||
}
|
||||
const gitHash = hash.digest('hex')
|
||||
|
||||
// Check content type and display preview
|
||||
const fd = fs.openSync(filepath, 'r')
|
||||
try {
|
||||
const headBuf = Buffer.alloc(16)
|
||||
const tailBuf = Buffer.alloc(16)
|
||||
|
||||
try {
|
||||
// Stream through TextDecoderStream to check for valid UTF-8
|
||||
const textStream = fs.createReadStream(filepath)
|
||||
const decoder = new TextDecoder('utf-8', { fatal: true })
|
||||
for await (const chunk of textStream) {
|
||||
decoder.decode(chunk, { stream: true })
|
||||
}
|
||||
decoder.decode()
|
||||
// If we get here, it's valid UTF-8
|
||||
if (stat.size <= 1024) {
|
||||
console.log('Content (text):', fs.readFileSync(filepath, 'utf8'))
|
||||
} else {
|
||||
console.log('Content (text, truncated):')
|
||||
console.log(` Length: ${stat.size} bytes`)
|
||||
fs.readSync(fd, headBuf, 0, 16, 0)
|
||||
fs.readSync(fd, tailBuf, 0, 16, stat.size - 16)
|
||||
console.log(
|
||||
' Content:',
|
||||
headBuf.toString('utf8') +
|
||||
' ...(truncated)... ' +
|
||||
tailBuf.toString('utf8')
|
||||
)
|
||||
}
|
||||
} catch (e) {
|
||||
// Binary content - show head and tail
|
||||
console.log('Content (binary):')
|
||||
console.log(` Length: ${stat.size} bytes`)
|
||||
|
||||
if (stat.size <= 32) {
|
||||
// Small file - read it all
|
||||
const buf = Buffer.alloc(stat.size)
|
||||
fs.readSync(fd, buf, 0, stat.size, 0)
|
||||
const hexBytes = buf.toString('hex').match(/../g).join(' ')
|
||||
console.log(' Bytes:', hexBytes)
|
||||
} else {
|
||||
// Read tail for large files
|
||||
fs.readSync(fd, headBuf, 0, 16, 0)
|
||||
fs.readSync(fd, tailBuf, 0, 16, stat.size - 16)
|
||||
const headHex = headBuf.toString('hex').match(/../g).join(' ')
|
||||
const tailHex = tailBuf.toString('hex').match(/../g).join(' ')
|
||||
console.log(' Bytes:', headHex + ' ... ' + tailHex)
|
||||
}
|
||||
console.log(' Git-style SHA1:', gitHash)
|
||||
if (gitHash !== blobHash) {
|
||||
console.log(' Warning: Git hash differs from blob hash!')
|
||||
console.log(' Blob hash:', blobHash)
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
fs.closeSync(fd)
|
||||
}
|
||||
}
|
||||
|
||||
async function withTempDir(prefix, fn, options = {}) {
|
||||
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), prefix))
|
||||
try {
|
||||
return await Promise.resolve(fn(tmpDir))
|
||||
} finally {
|
||||
if (!options.keep) {
|
||||
fs.rmSync(tmpDir, { recursive: true, force: true })
|
||||
} else {
|
||||
console.log('Keeping temporary file:', path.join(tmpDir, 'blob'))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function displayBlob(historyId, blobHash, options) {
|
||||
try {
|
||||
const { stream, metadata, source } = await (options.remote
|
||||
? fetchBlobRemote(historyId, blobHash)
|
||||
: fetchBlobLocal(historyId, blobHash))
|
||||
|
||||
await withTempDir(
|
||||
'blob-show-',
|
||||
async tmpDir => {
|
||||
const tmpPath = path.join(tmpDir, 'blob')
|
||||
await pipeline(stream, fs.createWriteStream(tmpPath))
|
||||
await displayBlobContent(tmpPath, metadata, source, blobHash)
|
||||
},
|
||||
{ keep: options.keep }
|
||||
)
|
||||
} catch (err) {
|
||||
if (err.code === 'NoSuchKey') {
|
||||
throw new Error(`Blob ${blobHash} not found in backup`)
|
||||
}
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const { historyId, version, blob, remote, keep } =
|
||||
commandLineArgs(optionDefinitions)
|
||||
if (!historyId) {
|
||||
console.error('Error: --historyId is required.')
|
||||
process.exit(1)
|
||||
}
|
||||
await loadGlobalBlobs()
|
||||
if (version != null) {
|
||||
await displayChunk(historyId, version, { remote })
|
||||
} else if (blob != null) {
|
||||
await displayBlob(historyId, blob, { remote, keep })
|
||||
} else {
|
||||
await listChunks(historyId)
|
||||
}
|
||||
}
|
||||
|
||||
main()
|
||||
.then(() => console.log('Done.'))
|
||||
.catch(err => {
|
||||
console.error('Error:', err)
|
||||
process.exit(1)
|
||||
})
|
||||
.finally(() => {
|
||||
knex.destroy().catch(err => console.error('Error closing Postgres:', err))
|
||||
client.close().catch(err => console.error('Error closing MongoDB:', err))
|
||||
redis
|
||||
.disconnect()
|
||||
.catch(err => console.error('Error disconnecting Redis:', err))
|
||||
})
|
||||
153
services/history-v1/storage/scripts/verify_backed_up_blobs.mjs
Normal file
153
services/history-v1/storage/scripts/verify_backed_up_blobs.mjs
Normal file
@@ -0,0 +1,153 @@
|
||||
// @ts-check
|
||||
import { ObjectId } from 'mongodb'
|
||||
import knex from '../lib/knex.js'
|
||||
import {
|
||||
batchedUpdate,
|
||||
objectIdFromInput,
|
||||
READ_PREFERENCE_SECONDARY,
|
||||
} from '@overleaf/mongo-utils/batchedUpdate.js'
|
||||
import {
|
||||
GLOBAL_BLOBS,
|
||||
loadGlobalBlobs,
|
||||
makeProjectKey,
|
||||
} from '../lib/blob_store/index.js'
|
||||
import {
|
||||
backedUpBlobs as backedUpBlobsCollection,
|
||||
db,
|
||||
client,
|
||||
} from '../lib/mongodb.js'
|
||||
import redis from '../lib/redis.js'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import fs from 'node:fs'
|
||||
|
||||
const projectsCollection = db.collection('projects')
|
||||
|
||||
// Enable caching for ObjectId.toString()
|
||||
ObjectId.cacheHexString = true
|
||||
|
||||
function parseArgs() {
|
||||
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
|
||||
const args = commandLineArgs([
|
||||
{
|
||||
name: 'BATCH_RANGE_START',
|
||||
type: String,
|
||||
defaultValue: PUBLIC_LAUNCH_DATE.toISOString(),
|
||||
},
|
||||
{
|
||||
name: 'BATCH_RANGE_END',
|
||||
type: String,
|
||||
defaultValue: new Date().toISOString(),
|
||||
},
|
||||
{
|
||||
name: 'output',
|
||||
type: String,
|
||||
alias: 'o',
|
||||
},
|
||||
])
|
||||
const BATCH_RANGE_START = objectIdFromInput(
|
||||
args['BATCH_RANGE_START']
|
||||
).toString()
|
||||
const BATCH_RANGE_END = objectIdFromInput(args['BATCH_RANGE_END']).toString()
|
||||
if (!args['output']) {
|
||||
throw new Error('missing --output')
|
||||
}
|
||||
const OUTPUT_STREAM = fs.createWriteStream(args['output'])
|
||||
|
||||
return {
|
||||
BATCH_RANGE_START,
|
||||
BATCH_RANGE_END,
|
||||
OUTPUT_STREAM,
|
||||
}
|
||||
}
|
||||
|
||||
const { BATCH_RANGE_START, BATCH_RANGE_END, OUTPUT_STREAM } = parseArgs()
|
||||
|
||||
// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
|
||||
if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
|
||||
throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
|
||||
}
|
||||
|
||||
let gracefulShutdownInitiated = false
|
||||
|
||||
process.on('SIGINT', handleSignal)
|
||||
process.on('SIGTERM', handleSignal)
|
||||
|
||||
function handleSignal() {
|
||||
gracefulShutdownInitiated = true
|
||||
console.warn('graceful shutdown initiated, draining queue')
|
||||
}
|
||||
|
||||
async function processBatch(batch) {
|
||||
if (gracefulShutdownInitiated) {
|
||||
throw new Error('graceful shutdown: aborting batch processing')
|
||||
}
|
||||
|
||||
const N = batch.length
|
||||
const firstId = batch[0]._id
|
||||
const lastId = batch[N - 1]._id
|
||||
const projectCursor = await projectsCollection.find(
|
||||
{ _id: { $gte: firstId, $lte: lastId } },
|
||||
{
|
||||
projection: { _id: 1, 'overleaf.history.id': 1, lastUpdated: 1 },
|
||||
readPreference: READ_PREFERENCE_SECONDARY,
|
||||
}
|
||||
)
|
||||
const projectMap = new Map()
|
||||
for await (const project of projectCursor) {
|
||||
projectMap.set(project._id.toString(), project)
|
||||
}
|
||||
for (const project of batch) {
|
||||
const projectId = project._id.toString()
|
||||
const projectRecord = projectMap.get(projectId)
|
||||
if (!projectRecord) {
|
||||
console.error(`project not found: ${projectId}`)
|
||||
continue
|
||||
}
|
||||
if (!projectRecord.overleaf?.history?.id) {
|
||||
console.error(`project missing history: ${projectId}`)
|
||||
continue
|
||||
}
|
||||
const historyId = projectRecord.overleaf.history.id.toString()
|
||||
const prefix = `${projectId},${projectRecord.lastUpdated.toISOString()},`
|
||||
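// Each entry in project.blobs is stored as a binary value; convert it to
// its hex string form so it can be compared against the global blob set.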
const hashes = project.blobs.map(blob => blob.toString('hex'))
|
||||
const projectBlobHashes = hashes.filter(hash => !GLOBAL_BLOBS.has(hash))
|
||||
if (projectBlobHashes.length < hashes.length) {
|
||||
console.warn(
|
||||
`project ${projectId} has ${hashes.length - projectBlobHashes.length} global blobs`
|
||||
)
|
||||
}
|
||||
const rows = projectBlobHashes.map(
|
||||
hash => prefix + makeProjectKey(historyId, hash) + '\n'
|
||||
)
|
||||
OUTPUT_STREAM.write(rows.join(''))
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
await loadGlobalBlobs()
|
||||
OUTPUT_STREAM.write('projectId,lastUpdated,path\n')
|
||||
await batchedUpdate(
|
||||
backedUpBlobsCollection,
|
||||
{},
|
||||
processBatch,
|
||||
{},
|
||||
{},
|
||||
{ BATCH_RANGE_START, BATCH_RANGE_END }
|
||||
)
|
||||
}
|
||||
|
||||
main()
|
||||
.then(() => console.log('Done.'))
|
||||
.catch(err => {
|
||||
console.error('Error:', err)
|
||||
process.exitCode = 1
|
||||
})
|
||||
.finally(() => {
|
||||
knex.destroy().catch(err => {
|
||||
console.error('Error closing Postgres connection:', err)
|
||||
})
|
||||
client.close().catch(err => console.error('Error closing MongoDB:', err))
|
||||
redis.disconnect().catch(err => {
|
||||
console.error('Error disconnecting Redis:', err)
|
||||
})
|
||||
})
|
||||
21
services/history-v1/storage/scripts/verify_backup_blob.mjs
Normal file
21
services/history-v1/storage/scripts/verify_backup_blob.mjs
Normal file
@@ -0,0 +1,21 @@
import logger from '@overleaf/logger'
import commandLineArgs from 'command-line-args'
import { verifyBlobs } from '../lib/backupVerifier.mjs'

const { historyId, hashes } = commandLineArgs([
  { name: 'historyId', type: String },
  { name: 'hashes', type: String, multiple: true, defaultOption: true },
])

if (hashes.length === 0) {
  throw new Error('missing --hashes flag')
}

try {
  await verifyBlobs(historyId, hashes)
  console.log('OK')
  process.exit(0)
} catch (err) {
  logger.err({ err }, 'failed to verify blob')
  process.exit(1)
}
@@ -0,0 +1,177 @@
|
||||
import fs from 'node:fs'
|
||||
import { makeProjectKey } from '../lib/blob_store/index.js'
|
||||
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
|
||||
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import OError from '@overleaf/o-error'
|
||||
import assert from '../lib/assert.js'
|
||||
import { client, projects } from '../lib/mongodb.js'
|
||||
import { ObjectId } from 'mongodb'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
|
||||
const { input, verbose } = commandLineArgs([
|
||||
{ name: 'input', type: String },
|
||||
{ name: 'verbose', type: Boolean, defaultValue: false },
|
||||
])
|
||||
|
||||
function parseCSVRow(row) {
|
||||
const [path] = row.split(',')
|
||||
const pathSegments = path.split('/')
|
||||
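// The backup key prefix spreads the history id, character-reversed, across
// the first three path segments; join and reverse them to recover the id.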
const historyId = `${pathSegments[0]}${pathSegments[1]}${pathSegments[2]}`
|
||||
.split('')
|
||||
.reverse()
|
||||
.join('')
|
||||
|
||||
return { historyId, path, hash: `${pathSegments[3]}${pathSegments[4]}` }
|
||||
}
|
||||
|
||||
async function* readCSV(path) {
|
||||
let fh
|
||||
try {
|
||||
fh = await fs.promises.open(path, 'r')
|
||||
} catch (error) {
|
||||
console.error(`Could not open file: ${error}`)
|
||||
throw error
|
||||
}
|
||||
for await (const line of fh.readLines()) {
|
||||
try {
|
||||
const row = parseCSVRow(line)
|
||||
yield row
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : error)
|
||||
console.log(`Skipping invalid row: ${line}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class MissingDEKError extends OError {}
|
||||
class InvalidHistoryIdError extends OError {}
|
||||
class MissingProjectError extends OError {}
|
||||
class MissingBlobError extends OError {}
|
||||
|
||||
async function getProjectPersistor(historyId) {
|
||||
try {
|
||||
return await backupPersistor.forProjectRO(
|
||||
projectBlobsBucket,
|
||||
makeProjectKey(historyId, '')
|
||||
)
|
||||
} catch (err) {
|
||||
if (err instanceof NotFoundError) {
|
||||
throw new MissingDEKError('dek does not exist', { historyId }, err)
|
||||
}
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
async function checkBlobExists(path, historyId) {
|
||||
const persistor = await getProjectPersistor(historyId)
|
||||
return await persistor.getObjectSize(projectBlobsBucket, path)
|
||||
}
|
||||
|
||||
let total = 0
|
||||
const errors = {
|
||||
invalidProjectId: 0,
|
||||
notBackedUpProjectId: 0,
|
||||
missingBlob: 0,
|
||||
notInMongo: 0,
|
||||
unknown: 0,
|
||||
}
|
||||
|
||||
const notInMongoProjectIds = new Set()
|
||||
const notBackedUpProjectIds = new Set()
|
||||
|
||||
let stopping = false
|
||||
|
||||
process.on('SIGTERM', () => {
|
||||
console.log('SIGTERM received')
|
||||
stopping = true
|
||||
})
|
||||
|
||||
process.on('SIGINT', () => {
|
||||
console.log('SIGINT received')
|
||||
stopping = true
|
||||
})
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} historyId
|
||||
* @param {string} path
|
||||
* @param {string} hash
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function checkPath(historyId, path, hash) {
|
||||
try {
|
||||
assert.mongoId(historyId)
|
||||
} catch (error) {
|
||||
throw new InvalidHistoryIdError('invalid history id', { historyId })
|
||||
}
|
||||
if (notInMongoProjectIds.has(historyId)) {
|
||||
throw new MissingProjectError('project not in mongo', { historyId })
|
||||
}
|
||||
if (notBackedUpProjectIds.has(historyId)) {
|
||||
throw new MissingDEKError('project not backed up', { historyId })
|
||||
}
|
||||
|
||||
const project = await projects.findOne({ _id: new ObjectId(historyId) })
|
||||
if (!project) {
|
||||
notInMongoProjectIds.add(historyId)
|
||||
throw new MissingProjectError('project not in mongo', { historyId })
|
||||
}
|
||||
try {
|
||||
await checkBlobExists(path, historyId)
|
||||
} catch (error) {
|
||||
if (error instanceof NotFoundError) {
|
||||
throw new MissingBlobError('missing blob', { historyId, hash })
|
||||
}
|
||||
if (error instanceof MissingDEKError) {
|
||||
notBackedUpProjectIds.add(historyId)
|
||||
}
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
for await (const line of readCSV(input)) {
|
||||
if (stopping) break
|
||||
total++
|
||||
if (total % 10_000 === 0) {
|
||||
console.log(`checked ${total}`)
|
||||
}
|
||||
const { historyId, path, hash } = line
|
||||
try {
|
||||
await checkPath(historyId, path, hash)
|
||||
if (verbose) {
|
||||
console.log(`✓ Project ${historyId} has ${hash} backed up`)
|
||||
}
|
||||
} catch (error) {
|
||||
if (error instanceof InvalidHistoryIdError) {
|
||||
errors.invalidProjectId++
|
||||
console.warn(`invalid historyId ${historyId}`)
|
||||
continue
|
||||
} else if (error instanceof MissingProjectError) {
|
||||
errors.notInMongo++
|
||||
console.warn(`✗ project ${historyId} not in mongo`)
|
||||
continue
|
||||
} else if (error instanceof MissingDEKError) {
|
||||
errors.notBackedUpProjectId++
|
||||
console.error(`✗ Project DEK ${historyId} not found`)
|
||||
continue
|
||||
} else if (error instanceof MissingBlobError) {
|
||||
errors.missingBlob++
|
||||
console.error(`✗ missing blob ${hash} from project ${historyId}`)
|
||||
continue
|
||||
}
|
||||
errors.unknown++
|
||||
console.error(error)
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`total checked: ${total}`)
|
||||
console.log(`invalid project id: ${errors.invalidProjectId}`)
|
||||
console.log(`not found in mongo: ${errors.notInMongo}`)
|
||||
console.log(`missing blob: ${errors.missingBlob}`)
|
||||
console.log(`project not backed up: ${errors.notBackedUpProjectId}`)
|
||||
console.log(`unknown errors: ${errors.unknown}`)
|
||||
|
||||
await client.close()
|
||||
await setTimeout(100)
|
||||
process.exit()
|
||||
35
services/history-v1/storage/scripts/verify_project.mjs
Normal file
35
services/history-v1/storage/scripts/verify_project.mjs
Normal file
@@ -0,0 +1,35 @@
import commandLineArgs from 'command-line-args'
import { verifyProjectWithErrorContext } from '../lib/backupVerifier.mjs'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import { setTimeout } from 'node:timers/promises'
import { loadGlobalBlobs } from '../lib/blob_store/index.js'

const { historyId } = commandLineArgs([{ name: 'historyId', type: String }])

async function gracefulShutdown(code = process.exitCode) {
  await knex.destroy()
  await client.close()
  await redis.disconnect()
  await setTimeout(1_000)
  process.exit(code)
}

if (!historyId) {
  console.error('missing --historyId')
  process.exitCode = 1
  await gracefulShutdown()
}

await loadGlobalBlobs()

try {
  await verifyProjectWithErrorContext(historyId)
  console.log('OK')
} catch (error) {
  console.error('error verifying', error)
  process.exitCode = 1
} finally {
  await gracefulShutdown()
}
217
services/history-v1/storage/scripts/verify_sampled_projects.mjs
Normal file
217
services/history-v1/storage/scripts/verify_sampled_projects.mjs
Normal file
@@ -0,0 +1,217 @@
|
||||
// @ts-check
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import {
|
||||
setWriteMetrics,
|
||||
verifyProjectsCreatedInDateRange,
|
||||
verifyRandomProjectSample,
|
||||
verifyProjectsUpdatedInDateRange,
|
||||
} from '../../backupVerifier/ProjectVerifier.mjs'
|
||||
import knex from '../lib/knex.js'
|
||||
import { client } from '../lib/mongodb.js'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
import logger from '@overleaf/logger'
|
||||
import { loadGlobalBlobs } from '../lib/blob_store/index.js'
|
||||
import { getDatesBeforeRPO } from '../../backupVerifier/utils.mjs'
|
||||
import { EventEmitter } from 'node:events'
|
||||
import { mongodb } from '../index.js'
|
||||
import redis from '../lib/redis.js'
|
||||
|
||||
logger.logger.level('fatal')
|
||||
|
||||
const usageMessage = [
|
||||
'Usage: node verify_sampled_projects.mjs [--startDate <start>] [--endDate <end>] [--nProjects <n>] [--verbose] [--usage] [--writeMetrics] [--concurrency <n>] [--strategy <range|random>]',
|
||||
'strategy: defaults to "range"; startDate and endDate are required for "range" strategy',
|
||||
].join('\n')
/**
 * Gracefully shutdown the process
 * @param code
 * @return {Promise<void>}
 */
async function gracefulShutdown(code = process.exitCode) {
  await knex.destroy()
  await client.close()
  await redis.disconnect()
  await setTimeout(1_000)
  process.exit(code)
}

const STATS = {
  verifiable: 0,
  unverifiable: 0,
}

/**
 * @typedef {Object} CLIOptions
 * @property {(signal: EventEmitter) => Promise<VerificationJobStatus>} projectVerifier
 * @property {boolean} verbose
 */

/**
 * @typedef {import('../../backupVerifier/types.d.ts').VerificationJobStatus} VerificationJobStatus
 */

/**
 *
 * @return {CLIOptions}
 */
function getOptions() {
  const {
    startDate,
    endDate,
    concurrency,
    writeMetrics,
    verbose,
    nProjects,
    strategy,
    usage,
  } = commandLineArgs([
    { name: 'startDate', type: String },
    { name: 'endDate', type: String },
    { name: 'concurrency', type: Number, defaultValue: 1 },
    { name: 'verbose', type: Boolean, defaultValue: false },
    { name: 'nProjects', type: Number, defaultValue: 10 },
    { name: 'usage', type: Boolean, defaultValue: false },
    { name: 'writeMetrics', type: Boolean, defaultValue: false },
    { name: 'strategy', type: String, defaultValue: 'range' },
  ])

  if (usage) {
    console.log(usageMessage)
    process.exit(0)
  }

  if (!['range', 'random', 'recent'].includes(strategy)) {
    throw new Error(`Invalid strategy: ${strategy}`)
  }

  setWriteMetrics(writeMetrics)
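  // Pick a verifier based on --strategy: 'random' samples nProjects at random,
  // 'recent' checks projects updated in the window returned by getDatesBeforeRPO,
  // and 'range' (the default) checks projects created between --startDate and --endDate.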
  switch (strategy) {
    case 'random':
      console.log('Verifying random projects')
      return {
        verbose,
        projectVerifier: signal => verifyRandomProjectSample(nProjects, signal),
      }
    case 'recent':
      return {
        verbose,
        projectVerifier: async signal => {
          const { startDate, endDate } = getDatesBeforeRPO(3 * 3600)
          return await verifyProjectsUpdatedInDateRange(
            startDate,
            endDate,
            nProjects,
            signal
          )
        },
      }
    case 'range':
    default: {
      if (!startDate || !endDate) {
        throw new Error(usageMessage)
      }
      const start = Date.parse(startDate)
      const end = Date.parse(endDate)
      if (Number.isNaN(start)) {
        throw new Error(`Invalid start date: ${startDate}`)
      }

      if (Number.isNaN(end)) {
        throw new Error(`Invalid end date: ${endDate}`)
      }
      if (verbose) {
        console.log(`Verifying from ${startDate} to ${endDate}`)
        console.log(`Concurrency: ${concurrency}`)
      }
      STATS.ranges = 0
      return {
        projectVerifier: signal =>
          verifyProjectsCreatedInDateRange({
            startDate: new Date(start),
            endDate: new Date(end),
            projectsPerRange: nProjects,
            concurrency,
            signal,
          }),
        verbose,
      }
    }
  }
}

/**
 * @type {CLIOptions}
 */
let options
try {
  options = getOptions()
} catch (error) {
  console.error(error)
  process.exitCode = 1
  await gracefulShutdown(1)
  process.exit() // just here so the type checker knows that the process will exit
}

const { projectVerifier, verbose } = options

if (verbose) {
  logger.logger.level('debug')
}
/**
|
||||
*
|
||||
* @param {Array<string>} array
|
||||
* @param {string} matchString
|
||||
* @return {*}
|
||||
*/
|
||||
function sumStringInstances(array, matchString) {
|
||||
return array.reduce((total, string) => {
|
||||
return string === matchString ? total + 1 : total
|
||||
}, 0)
|
||||
}

/**
 * Print a summary of a verification run.
 * @param {VerificationJobStatus} stats
 */
function displayStats(stats) {
  console.log(`Verified projects: ${stats.verified}`)
  console.log(`Total projects sampled: ${stats.total}`)
  if (stats.errorTypes.length > 0) {
    console.log('Errors:')
    for (const error of new Set(stats.errorTypes)) {
      console.log(`${error}: ${sumStringInstances(stats.errorTypes, error)}`)
    }
  }
}
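// SIGINT and SIGTERM are routed through shutdownEmitter, which is also handed
// to the project verifier as its signal, so an interrupted run still exits
// through gracefulShutdown().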
const shutdownEmitter = new EventEmitter()

shutdownEmitter.on('shutdown', async () => {
  await gracefulShutdown()
})

process.on('SIGTERM', () => {
  shutdownEmitter.emit('shutdown')
})

process.on('SIGINT', () => {
  shutdownEmitter.emit('shutdown')
})

await loadGlobalBlobs()

try {
  const stats = await projectVerifier(shutdownEmitter)
  displayStats(stats)
  console.log(`completed`)
} catch (error) {
  console.error(error)
  console.log('completed with errors')
  process.exitCode = 1
} finally {
  console.log('shutting down')
  await gracefulShutdown()
}