first commit
25  services/history-v1/storage/index.js  Normal file
@@ -0,0 +1,25 @@
exports.BatchBlobStore = require('./lib/batch_blob_store')
exports.blobHash = require('./lib/blob_hash')
exports.HashCheckBlobStore = require('./lib/hash_check_blob_store')
exports.chunkBuffer = require('./lib/chunk_buffer')
exports.chunkStore = require('./lib/chunk_store')
exports.historyStore = require('./lib/history_store').historyStore
exports.knex = require('./lib/knex')
exports.mongodb = require('./lib/mongodb')
exports.redis = require('./lib/redis')
exports.persistChanges = require('./lib/persist_changes')
exports.persistor = require('./lib/persistor')
exports.ProjectArchive = require('./lib/project_archive')
exports.streams = require('./lib/streams')
exports.temp = require('./lib/temp')
exports.zipStore = require('./lib/zip_store')

const { BlobStore, loadGlobalBlobs } = require('./lib/blob_store')
exports.BlobStore = BlobStore
exports.loadGlobalBlobs = loadGlobalBlobs

const { InvalidChangeError } = require('./lib/errors')
exports.InvalidChangeError = InvalidChangeError

const { ChunkVersionConflictError } = require('./lib/chunk_store/errors')
exports.ChunkVersionConflictError = ChunkVersionConflictError
76  services/history-v1/storage/lib/assert.js  Normal file
@@ -0,0 +1,76 @@
'use strict'

const OError = require('@overleaf/o-error')

const check = require('check-types')
const { Blob } = require('overleaf-editor-core')

const assert = check.assert

const MONGO_ID_REGEXP = /^[0-9a-f]{24}$/
const POSTGRES_ID_REGEXP = /^[1-9][0-9]{0,9}$/
const MONGO_OR_POSTGRES_ID_REGEXP = /^([0-9a-f]{24}|[1-9][0-9]{0,9})$/

function transaction(transaction, message) {
  assert.function(transaction, message)
}

function blobHash(arg, message) {
  try {
    assert.match(arg, Blob.HEX_HASH_RX, message)
  } catch (error) {
    throw OError.tag(error, message, { arg })
  }
}

/**
 * A project id is a string that contains either an integer (for projects stored in Postgres) or 24
 * hex digits (for projects stored in Mongo)
 */
function projectId(arg, message) {
  try {
    assert.match(arg, MONGO_OR_POSTGRES_ID_REGEXP, message)
  } catch (error) {
    throw OError.tag(error, message, { arg })
  }
}

/**
 * A chunk id is a string that contains either an integer (for projects stored in Postgres) or 24
 * hex digits (for projects stored in Mongo)
 */
function chunkId(arg, message) {
  try {
    assert.match(arg, MONGO_OR_POSTGRES_ID_REGEXP, message)
  } catch (error) {
    throw OError.tag(error, message, { arg })
  }
}

function mongoId(arg, message) {
  try {
    assert.match(arg, MONGO_ID_REGEXP, message)
  } catch (error) {
    throw OError.tag(error, message, { arg })
  }
}

function postgresId(arg, message) {
  try {
    assert.match(arg, POSTGRES_ID_REGEXP, message)
  } catch (error) {
    throw OError.tag(error, message, { arg })
  }
}

module.exports = {
  ...assert,
  transaction,
  blobHash,
  projectId,
  chunkId,
  mongoId,
  postgresId,
  MONGO_ID_REGEXP,
  POSTGRES_ID_REGEXP,
}
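Usage sketch (illustrative, not part of this commit): validating ids and hashes before touching a backend, using the helpers exported above.

const assert = require('./assert')

function loadBlobRecord(projectId, hash) {
  assert.projectId(projectId, 'bad projectId') // accepts a postgres or mongo id
  assert.blobHash(hash, 'bad hash') // must match Blob.HEX_HASH_RX
  // ...continue with validated inputs
}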
251  services/history-v1/storage/lib/backupBlob.mjs  Normal file
@@ -0,0 +1,251 @@
// @ts-check
import { backupPersistor, projectBlobsBucket } from './backupPersistor.mjs'
import { GLOBAL_BLOBS, makeProjectKey, BlobStore } from './blob_store/index.js'
import Stream from 'node:stream'
import fs from 'node:fs'
import Crypto from 'node:crypto'
import assert from './assert.js'
import { backedUpBlobs, projects } from './mongodb.js'
import { Binary, ObjectId } from 'mongodb'
import logger from '@overleaf/logger/logging-manager.js'
import { AlreadyWrittenError } from '@overleaf/object-persistor/src/Errors.js'
import metrics from '@overleaf/metrics'
import zLib from 'node:zlib'
import Path from 'node:path'

const HIGHWATER_MARK = 1024 * 1024

/**
 * @typedef {import("overleaf-editor-core").Blob} Blob
 */

/**
 * @typedef {import("@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor").CachedPerProjectEncryptedS3Persistor} CachedPerProjectEncryptedS3Persistor
 */

/**
 * Increment a metric to record the outcome of a backup operation.
 *
 * @param {"success"|"failure"|"skipped"} status
 * @param {"global"|"already_backed_up"|"none"} reason
 */
function recordBackupConclusion(status, reason = 'none') {
  metrics.inc('blob_backed_up', 1, { status, reason })
}

/**
 * Downloads a blob to a specified directory
 *
 * @param {string} historyId - The history ID of the project the blob belongs to
 * @param {Blob} blob - The blob to download
 * @param {string} tmpDir - The directory path where the blob will be downloaded
 * @returns {Promise<string>} The full path where the blob was downloaded
 */
export async function downloadBlobToDir(historyId, blob, tmpDir) {
  const blobStore = new BlobStore(historyId)
  const blobHash = blob.getHash()
  const src = await blobStore.getStream(blobHash)
  const filePath = Path.join(tmpDir, `${historyId}-${blobHash}`)
  try {
    const dst = fs.createWriteStream(filePath, {
      highWaterMark: HIGHWATER_MARK,
      flags: 'wx',
    })
    await Stream.promises.pipeline(src, dst)
    return filePath
  } catch (error) {
    try {
      await fs.promises.unlink(filePath)
    } catch {}
    throw error
  }
}

/**
 * Performs the actual upload of the blob to the backup storage.
 *
 * @param {string} historyId - The history ID of the project the blob belongs to
 * @param {Blob} blob - The blob being uploaded
 * @param {string} path - The path to the file to upload (should have been stored on disk already)
 * @param {CachedPerProjectEncryptedS3Persistor} persistor - The persistor to use for the upload
 * @return {Promise<void>}
 */
export async function uploadBlobToBackup(historyId, blob, path, persistor) {
  const md5 = Crypto.createHash('md5')
  const filePathCompressed = path + '.gz'
  let backupSource
  let contentEncoding
  let size
  try {
    if (blob.getStringLength()) {
      backupSource = filePathCompressed
      contentEncoding = 'gzip'
      size = 0
      await Stream.promises.pipeline(
        fs.createReadStream(path, { highWaterMark: HIGHWATER_MARK }),
        zLib.createGzip(),
        async function* (source) {
          for await (const chunk of source) {
            size += chunk.byteLength
            md5.update(chunk)
            yield chunk
          }
        },
        fs.createWriteStream(filePathCompressed, {
          highWaterMark: HIGHWATER_MARK,
        })
      )
    } else {
      backupSource = path
      size = blob.getByteLength()
      await Stream.promises.pipeline(
        fs.createReadStream(path, { highWaterMark: HIGHWATER_MARK }),
        md5
      )
    }
    const key = makeProjectKey(historyId, blob.getHash())
    await persistor.sendStream(
      projectBlobsBucket,
      key,
      fs.createReadStream(backupSource, { highWaterMark: HIGHWATER_MARK }),
      {
        contentEncoding,
        contentType: 'application/octet-stream',
        contentLength: size,
        sourceMd5: md5.digest('hex'),
        ifNoneMatch: '*',
      }
    )
  } finally {
    if (backupSource === filePathCompressed) {
      try {
        await fs.promises.rm(filePathCompressed, { force: true })
      } catch {}
    }
  }
}

/**
 * Converts a legacy (postgres) historyId to a mongo projectId
 *
 * @param {string} historyId
 * @return {Promise<string>}
 * @private
 */
async function _convertLegacyHistoryIdToProjectId(historyId) {
  const project = await projects.findOne(
    { 'overleaf.history.id': parseInt(historyId) },
    { projection: { _id: 1 } }
  )

  if (!project?._id) {
    throw new Error('Did not find project for history id')
  }

  return project?._id?.toString()
}

/**
 * Records that a blob was backed up for a project.
 *
 * @param {string} projectId - projectId for a project (mongo format)
 * @param {string} hash
 * @return {Promise<void>}
 */
export async function storeBlobBackup(projectId, hash) {
  await backedUpBlobs.updateOne(
    { _id: new ObjectId(projectId) },
    { $addToSet: { blobs: new Binary(Buffer.from(hash, 'hex')) } },
    { upsert: true }
  )
}

/**
 * Determine whether a specific blob has been backed up in this project.
 *
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<*>}
 * @private
 */
export async function _blobIsBackedUp(projectId, hash) {
  const blobs = await backedUpBlobs.findOne(
    {
      _id: new ObjectId(projectId),
      blobs: new Binary(Buffer.from(hash, 'hex')),
    },
    { projection: { _id: 1 } }
  )
  return blobs?._id
}

/**
 * Back up a blob to the global storage and record that it was backed up.
 *
 * @param {string} historyId - history ID for a project (can be postgres format or mongo format)
 * @param {Blob} blob - The blob that is being backed up
 * @param {string} tmpPath - The path to a temporary file storing the contents of the blob.
 * @param {CachedPerProjectEncryptedS3Persistor} [persistor] - The persistor to use (optional)
 * @return {Promise<void>}
 */
export async function backupBlob(historyId, blob, tmpPath, persistor) {
  const hash = blob.getHash()

  let projectId = historyId
  if (assert.POSTGRES_ID_REGEXP.test(historyId)) {
    projectId = await _convertLegacyHistoryIdToProjectId(historyId)
  }

  const globalBlob = GLOBAL_BLOBS.get(hash)

  if (globalBlob && !globalBlob.demoted) {
    recordBackupConclusion('skipped', 'global')
    logger.debug({ projectId, hash }, 'Blob is global - skipping backup')
    return
  }

  try {
    if (await _blobIsBackedUp(projectId, hash)) {
      recordBackupConclusion('skipped', 'already_backed_up')
      logger.debug(
        { projectId, hash },
        'Blob already backed up - skipping backup'
      )
      return
    }
  } catch (error) {
    logger.warn({ error }, 'Failed to check if blob is backed up')
    // We'll try anyway - we'll catch the error if it was backed up
  }
  // If we weren't passed a persistor for this project, create one.
  // This will fetch the key from AWS, so it's preferable to use
  // the same persistor for all blobs in a project where possible.
  if (!persistor) {
    logger.debug(
      { historyId, hash },
      'warning: persistor not passed to backupBlob'
    )
  }
  persistor ??= await backupPersistor.forProject(
    projectBlobsBucket,
    makeProjectKey(historyId, '')
  )
  try {
    logger.debug({ projectId, hash }, 'Starting blob backup')
    await uploadBlobToBackup(historyId, blob, tmpPath, persistor)
    await storeBlobBackup(projectId, hash)
    recordBackupConclusion('success')
  } catch (error) {
    if (error instanceof AlreadyWrittenError) {
      logger.debug({ error, projectId, hash }, 'Blob already backed up')
      // record that we backed it up already
      await storeBlobBackup(projectId, hash)
      recordBackupConclusion('failure', 'already_backed_up')
      return
    }
    // eventually queue this for retry - for now this will be fixed by running the script
    recordBackupConclusion('failure')
    logger.warn({ error, projectId, hash }, 'Failed to upload blob to backup')
  } finally {
    logger.debug({ projectId, hash }, 'Ended blob backup')
  }
}
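Usage sketch (illustrative, not part of this commit): staging a blob on disk and then backing it up; the scratch directory is an assumption of the example.

import { downloadBlobToDir, backupBlob } from './backupBlob.mjs'

async function backupOneBlob(historyId, blob, tmpDir) {
  // write the blob to tmpDir, then upload it and record the backup
  const tmpPath = await downloadBlobToDir(historyId, blob, tmpDir)
  // passing no persistor lets backupBlob create one for this project on demand
  await backupBlob(historyId, blob, tmpPath)
}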
93  services/history-v1/storage/lib/backupDeletion.mjs  Normal file
@@ -0,0 +1,93 @@
// @ts-check
import { callbackify } from 'util'
import { ObjectId } from 'mongodb'
import config from 'config'
import OError from '@overleaf/o-error'
import { db } from './mongodb.js'
import projectKey from './project_key.js'
import chunkStore from '../lib/chunk_store/index.js'
import {
  backupPersistor,
  chunksBucket,
  projectBlobsBucket,
} from './backupPersistor.mjs'

const MS_PER_DAY = 24 * 60 * 60 * 1000
const EXPIRE_PROJECTS_AFTER_MS =
  parseInt(config.get('minSoftDeletionPeriodDays'), 10) * MS_PER_DAY
const deletedProjectsCollection = db.collection('deletedProjects')

/**
 * @param {string} historyId
 * @return {Promise<boolean>}
 */
async function projectHasLatestChunk(historyId) {
  const chunk = await chunkStore.getBackend(historyId).getLatestChunk(historyId)
  return chunk != null
}

export class NotReadyToDelete extends OError {}

/**
 * @param {string} projectId
 * @return {Promise<void>}
 */
async function deleteProjectBackup(projectId) {
  const deletedProject = await deletedProjectsCollection.findOne(
    { 'deleterData.deletedProjectId': new ObjectId(projectId) },
    {
      projection: {
        'deleterData.deletedProjectOverleafHistoryId': 1,
        'deleterData.deletedAt': 1,
      },
    }
  )
  if (!deletedProject) {
    throw new NotReadyToDelete('refusing to delete non-deleted project')
  }
  const expiresAt =
    deletedProject.deleterData.deletedAt.getTime() + EXPIRE_PROJECTS_AFTER_MS
  if (expiresAt > Date.now()) {
    throw new NotReadyToDelete('refusing to delete non-expired project')
  }

  const historyId =
    deletedProject.deleterData.deletedProjectOverleafHistoryId?.toString()
  if (!historyId) {
    throw new NotReadyToDelete(
      'refusing to delete project with unknown historyId'
    )
  }

  if (await projectHasLatestChunk(historyId)) {
    throw new NotReadyToDelete(
      'refusing to delete project with remaining chunks'
    )
  }

  const prefix = projectKey.format(historyId) + '/'
  await backupPersistor.deleteDirectory(chunksBucket, prefix)
  await backupPersistor.deleteDirectory(projectBlobsBucket, prefix)
}

export async function healthCheck() {
  const HEALTH_CHECK_PROJECTS = JSON.parse(config.get('healthCheckProjects'))
  if (HEALTH_CHECK_PROJECTS.length !== 2) {
    throw new Error('expected 2 healthCheckProjects')
  }
  if (!HEALTH_CHECK_PROJECTS.some(id => id.length === 24)) {
    throw new Error('expected mongo id in healthCheckProjects')
  }
  if (!HEALTH_CHECK_PROJECTS.some(id => id.length < 24)) {
    throw new Error('expected postgres id in healthCheckProjects')
  }

  for (const historyId of HEALTH_CHECK_PROJECTS) {
    if (!(await projectHasLatestChunk(historyId))) {
      throw new Error(`project has no history: ${historyId}`)
    }
  }
}

export const healthCheckCb = callbackify(healthCheck)
export const deleteProjectBackupCb = callbackify(deleteProjectBackup)
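Usage sketch (illustrative, not part of this commit): requesting deletion through the callback-style export and treating the NotReadyToDelete guard as a retryable condition.

import { deleteProjectBackupCb, NotReadyToDelete } from './backupDeletion.mjs'

function requestBackupDeletion(projectId, done) {
  deleteProjectBackupCb(projectId, err => {
    if (err instanceof NotReadyToDelete) return done() // not yet expired, or chunks remain; retry later
    done(err)
  })
}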
152  services/history-v1/storage/lib/backupGenerator.mjs  Normal file
@@ -0,0 +1,152 @@
/**
 * Provides a generator function to back up project chunks and blobs.
 */

import chunkStore from './chunk_store/index.js'

import {
  GLOBAL_BLOBS, // NOTE: must call loadGlobalBlobs() before using this
  BlobStore,
} from './blob_store/index.js'

import assert from './assert.js'

async function lookBehindForSeenBlobs(
  projectId,
  chunk,
  lastBackedUpVersion,
  seenBlobs
) {
  if (chunk.startVersion === 0) {
    return // this is the first chunk, no need to check for blobs in the previous chunk
  }
  if (chunk.startVersion > 0 && lastBackedUpVersion > chunk.startVersion) {
    return // the snapshot in this chunk has already been backed up
  }
  if (
    chunk.startVersion > 0 &&
    lastBackedUpVersion === chunk.startVersion // same as previousChunk.endVersion
  ) {
    // the snapshot in this chunk has not been backed up
    // so we find the set of backed up blobs from the previous chunk
    const previousChunk = await chunkStore.loadAtVersion(
      projectId,
      lastBackedUpVersion
    )
    const previousChunkHistory = previousChunk.getHistory()
    previousChunkHistory.findBlobHashes(seenBlobs)
  }
}

/**
 * Records blob hashes that have been previously seen in a chunk's history.
 *
 * @param {Object} chunk - The chunk containing history data
 * @param {number} currentBackedUpVersion - The version number that has been backed up
 * @param {Set<string>} seenBlobs - Set to collect previously seen blob hashes
 * @returns {void}
 */
function recordPreviouslySeenBlobs(chunk, currentBackedUpVersion, seenBlobs) {
  // We need to look at the chunk and decide how far we have backed up.
  // If we have not backed up this chunk at all, we need to back up the blobs
  // in the snapshot. Otherwise we need to back up the blobs in the changes
  // that have occurred since the last backup.
  const history = chunk.getHistory()
  const startVersion = chunk.getStartVersion()
  if (currentBackedUpVersion === 0) {
    // If we have only backed up version 0 (i.e. the first change)
    // then that includes the initial snapshot, so we consider
    // the blobs of the initial snapshot as seen. If the project
    // has not been backed up at all then currentBackedUpVersion
    // will be undefined.
    history.snapshot.findBlobHashes(seenBlobs)
  } else if (currentBackedUpVersion > startVersion) {
    history.snapshot.findBlobHashes(seenBlobs)
    for (let i = 0; i < currentBackedUpVersion - startVersion; i++) {
      history.changes[i].findBlobHashes(seenBlobs)
    }
  }
}

/**
 * Collects new blob objects that need to be backed up from a given chunk.
 *
 * @param {Object} chunk - The chunk object containing history data
 * @param {Object} blobStore - Storage interface for retrieving blobs
 * @param {Set<string>} seenBlobs - Set of blob hashes that have already been processed
 * @returns {Promise<Object[]>} Array of blob objects that need to be backed up
 * @throws {Error} If blob retrieval fails
 */
async function collectNewBlobsForBackup(chunk, blobStore, seenBlobs) {
  /** @type {Set<string>} */
  const blobHashes = new Set()
  const history = chunk.getHistory()
  // Get all the blobs in this chunk, then exclude the seenBlobs and global blobs
  history.findBlobHashes(blobHashes)
  const blobsToBackup = await blobStore.getBlobs(
    [...blobHashes].filter(
      hash =>
        hash &&
        !seenBlobs.has(hash) &&
        (!GLOBAL_BLOBS.has(hash) || GLOBAL_BLOBS.get(hash).demoted)
    )
  )
  return blobsToBackup
}

/**
 * Asynchronously generates backups for a project based on provided versions.
 * @param {string} projectId - The ID of the project's history to back up.
 * @param {number} lastBackedUpVersion - The last version that was successfully backed up.
 * @yields {AsyncGenerator<{ chunkRecord: object, chunkToBackup: object, chunkBuffer: Buffer, blobsToBackup: object[] }>}
 *   Yields chunk records and corresponding data needed for backups.
 */
export async function* backupGenerator(projectId, lastBackedUpVersion) {
  assert.projectId(projectId, 'bad projectId')
  assert.maybe.integer(lastBackedUpVersion, 'bad lastBackedUpVersion')

  const blobStore = new BlobStore(projectId)

  /** @type {Set<string>} */
  const seenBlobs = new Set() // records the blobs that are already backed up

  const firstPendingVersion =
    lastBackedUpVersion >= 0 ? lastBackedUpVersion + 1 : 0
  let isStartingChunk = true
  let currentBackedUpVersion = lastBackedUpVersion
  const chunkRecordIterator = chunkStore.getProjectChunksFromVersion(
    projectId,
    firstPendingVersion
  )

  for await (const chunkRecord of chunkRecordIterator) {
    const { chunk, chunkBuffer } = await chunkStore.loadByChunkRecord(
      projectId,
      chunkRecord
    )

    if (isStartingChunk) {
      await lookBehindForSeenBlobs(
        projectId,
        chunkRecord,
        lastBackedUpVersion,
        seenBlobs
      )
      isStartingChunk = false
    }

    recordPreviouslySeenBlobs(chunk, currentBackedUpVersion, seenBlobs)

    const blobsToBackup = await collectNewBlobsForBackup(
      chunk,
      blobStore,
      seenBlobs
    )

    yield { chunkRecord, chunkToBackup: chunk, chunkBuffer, blobsToBackup }

    // After we generate a backup of this chunk, mark the backed up blobs as seen
    blobsToBackup.forEach(blob => seenBlobs.add(blob.getHash()))
    currentBackedUpVersion = chunkRecord.endVersion
  }
}
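Usage sketch (illustrative, not part of this commit): draining the generator to see what work is pending. It assumes loadGlobalBlobs() has already populated GLOBAL_BLOBS, as the import comment above requires.

import { backupGenerator } from './backupGenerator.mjs'

async function summarisePendingBackup(projectId, lastBackedUpVersion) {
  for await (const { chunkRecord, blobsToBackup } of backupGenerator(
    projectId,
    lastBackedUpVersion
  )) {
    console.log(chunkRecord.endVersion, blobsToBackup.length)
  }
}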
121  services/history-v1/storage/lib/backupPersistor.mjs  Normal file
@@ -0,0 +1,121 @@
// @ts-check
import fs from 'node:fs'
import Path from 'node:path'
import _ from 'lodash'
import config from 'config'
import { SecretManagerServiceClient } from '@google-cloud/secret-manager'
import OError from '@overleaf/o-error'
import {
  PerProjectEncryptedS3Persistor,
  RootKeyEncryptionKey,
} from '@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor.js'
import { HistoryStore } from './history_store.js'

const persistorConfig = _.cloneDeep(config.get('backupPersistor'))
const { chunksBucket, deksBucket, globalBlobsBucket, projectBlobsBucket } =
  config.get('backupStore')

export { chunksBucket, globalBlobsBucket, projectBlobsBucket }

function convertKey(key, convertFn) {
  if (_.has(persistorConfig, key)) {
    _.update(persistorConfig, key, convertFn)
  }
}

convertKey('s3SSEC.httpOptions.timeout', s => parseInt(s, 10))
convertKey('s3SSEC.maxRetries', s => parseInt(s, 10))
convertKey('s3SSEC.pathStyle', s => s === 'true')
// array of CA, either inlined or on disk
convertKey('s3SSEC.ca', s =>
  JSON.parse(s).map(ca => (ca.startsWith('/') ? fs.readFileSync(ca) : ca))
)

/** @type {() => Promise<string>} */
let getRawRootKeyEncryptionKeys

if ((process.env.NODE_ENV || 'production') === 'production') {
  ;[persistorConfig.s3SSEC.key, persistorConfig.s3SSEC.secret] = (
    await loadFromSecretsManager(
      process.env.BACKUP_AWS_CREDENTIALS || '',
      'BACKUP_AWS_CREDENTIALS'
    )
  ).split(':')
  getRawRootKeyEncryptionKeys = () =>
    loadFromSecretsManager(
      persistorConfig.keyEncryptionKeys,
      'BACKUP_KEY_ENCRYPTION_KEYS'
    )
} else {
  getRawRootKeyEncryptionKeys = () => persistorConfig.keyEncryptionKeys
}

export const DELETION_ONLY = persistorConfig.keyEncryptionKeys === 'none'
if (DELETION_ONLY) {
  // For Backup-deleter; should not encrypt or read data; deleting does not need key.
  getRawRootKeyEncryptionKeys = () => new Promise(_resolve => {})
}

const PROJECT_FOLDER_REGEX =
  /^\d{3}\/\d{3}\/\d{3,}\/|[0-9a-f]{3}\/[0-9a-f]{3}\/[0-9a-f]{18}\/$/

/**
 * @param {string} bucketName
 * @param {string} path
 * @return {string}
 */
export function pathToProjectFolder(bucketName, path) {
  switch (bucketName) {
    case deksBucket:
    case chunksBucket:
    case projectBlobsBucket:
      const projectFolder = Path.join(...path.split('/').slice(0, 3)) + '/'
      if (!PROJECT_FOLDER_REGEX.test(projectFolder)) {
        throw new OError('invalid project folder', { bucketName, path })
      }
      return projectFolder
    default:
      throw new Error(`${bucketName} does not store per-project files`)
  }
}

/**
 * @param {string} name
 * @param {string} label
 * @return {Promise<string>}
 */
async function loadFromSecretsManager(name, label) {
  const client = new SecretManagerServiceClient()
  const [version] = await client.accessSecretVersion({ name })
  if (!version.payload?.data) throw new Error(`empty secret: ${label}`)
  return version.payload.data.toString()
}

async function getRootKeyEncryptionKeys() {
  return JSON.parse(await getRawRootKeyEncryptionKeys()).map(
    ({ key, salt }) => {
      return new RootKeyEncryptionKey(
        Buffer.from(key, 'base64'),
        Buffer.from(salt, 'base64')
      )
    }
  )
}

export const backupPersistor = new PerProjectEncryptedS3Persistor({
  ...persistorConfig.s3SSEC,
  disableMultiPartUpload: true,
  dataEncryptionKeyBucketName: deksBucket,
  pathToProjectFolder,
  getRootKeyEncryptionKeys,
  storageClass: {
    [deksBucket]: 'STANDARD',
    [chunksBucket]: persistorConfig.tieringStorageClass,
    [projectBlobsBucket]: persistorConfig.tieringStorageClass,
  },
})

export const backupHistoryStore = new HistoryStore(
  backupPersistor,
  chunksBucket
)
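Usage sketch (illustrative, not part of this commit; the object key below is made up): the first three path segments identify the per-project folder used for DEK lookup.

import { chunksBucket, pathToProjectFolder } from './backupPersistor.mjs'

// '123/456/789/000000012' -> '123/456/789/'
const folder = pathToProjectFolder(chunksBucket, '123/456/789/000000012')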
216  services/history-v1/storage/lib/backupVerifier.mjs  Normal file
@@ -0,0 +1,216 @@
// @ts-check
import OError from '@overleaf/o-error'
import chunkStore from '../lib/chunk_store/index.js'
import {
  backupPersistor,
  chunksBucket,
  projectBlobsBucket,
} from './backupPersistor.mjs'
import { Blob, Chunk, History } from 'overleaf-editor-core'
import { BlobStore, GLOBAL_BLOBS, makeProjectKey } from './blob_store/index.js'
import blobHash from './blob_hash.js'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import logger from '@overleaf/logger'
import path from 'node:path'
import projectKey from './project_key.js'
import streams from './streams.js'
import objectPersistor from '@overleaf/object-persistor'
import { getEndDateForRPO } from '../../backupVerifier/utils.mjs'

/**
 * @typedef {import("@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor.js").CachedPerProjectEncryptedS3Persistor} CachedPerProjectEncryptedS3Persistor
 */

/**
 * @param {string} historyId
 * @param {string} hash
 */
export async function verifyBlob(historyId, hash) {
  return await verifyBlobs(historyId, [hash])
}

/**
 * @param {string} historyId
 * @return {Promise<CachedPerProjectEncryptedS3Persistor>}
 */
async function getProjectPersistor(historyId) {
  try {
    return await backupPersistor.forProjectRO(
      projectBlobsBucket,
      makeProjectKey(historyId, '')
    )
  } catch (err) {
    if (err instanceof NotFoundError) {
      throw new BackupCorruptedError('dek does not exist', {}, err)
    }
    throw err
  }
}

/**
 * @param {string} historyId
 * @param {Array<string>} hashes
 * @param {CachedPerProjectEncryptedS3Persistor} [projectCache]
 */
export async function verifyBlobs(historyId, hashes, projectCache) {
  if (hashes.length === 0) throw new Error('bug: empty hashes')

  if (!projectCache) {
    projectCache = await getProjectPersistor(historyId)
  }
  const blobStore = new BlobStore(historyId)
  for (const hash of hashes) {
    const path = makeProjectKey(historyId, hash)
    const blob = await blobStore.getBlob(hash)
    if (!blob) throw new Blob.NotFoundError(hash)
    let stream
    try {
      stream = await projectCache.getObjectStream(projectBlobsBucket, path, {
        autoGunzip: true,
      })
    } catch (err) {
      if (err instanceof NotFoundError) {
        throw new BackupCorruptedMissingBlobError('missing blob', {
          path,
          hash,
        })
      }
      throw err
    }
    const backupHash = await blobHash.fromStream(blob.getByteLength(), stream)
    if (backupHash !== hash) {
      throw new BackupCorruptedInvalidBlobError(
        'hash mismatch for backed up blob',
        {
          path,
          hash,
          backupHash,
        }
      )
    }
  }
}

/**
 * @param {string} historyId
 * @param {Date} [endTimestamp]
 */
export async function verifyProjectWithErrorContext(
  historyId,
  endTimestamp = getEndDateForRPO()
) {
  try {
    await verifyProject(historyId, endTimestamp)
  } catch (err) {
    // @ts-ignore err is Error instance
    throw OError.tag(err, 'verifyProject', { historyId, endTimestamp })
  }
}

/**
 * @param {string} historyId
 * @param {number} startVersion
 * @param {CachedPerProjectEncryptedS3Persistor} backupPersistorForProject
 * @return {Promise<any>}
 */
async function loadChunk(historyId, startVersion, backupPersistorForProject) {
  const key = path.join(
    projectKey.format(historyId),
    projectKey.pad(startVersion)
  )
  try {
    const buf = await streams.gunzipStreamToBuffer(
      await backupPersistorForProject.getObjectStream(chunksBucket, key)
    )
    return JSON.parse(buf.toString('utf-8'))
  } catch (err) {
    if (err instanceof objectPersistor.Errors.NotFoundError) {
      throw new Chunk.NotPersistedError(historyId)
    }
    if (err instanceof Error) {
      throw OError.tag(err, 'Failed to load chunk', { historyId, startVersion })
    }
    throw err
  }
}

/**
 * @param {string} historyId
 * @param {Date} endTimestamp
 */
export async function verifyProject(historyId, endTimestamp) {
  const backend = chunkStore.getBackend(historyId)
  const [first, last] = await Promise.all([
    backend.getFirstChunkBeforeTimestamp(historyId, endTimestamp),
    backend.getLastActiveChunkBeforeTimestamp(historyId, endTimestamp),
  ])

  const chunksRecordsToVerify = [
    {
      chunkId: first.id,
      chunkLabel: 'first',
      startVersion: first.startVersion,
    },
  ]
  if (first.startVersion !== last.startVersion) {
    chunksRecordsToVerify.push({
      chunkId: last.id,
      chunkLabel: 'last before RPO',
      startVersion: last.startVersion,
    })
  }

  const projectCache = await getProjectPersistor(historyId)

  const chunks = await Promise.all(
    chunksRecordsToVerify.map(async chunk => {
      try {
        return History.fromRaw(
          await loadChunk(historyId, chunk.startVersion, projectCache)
        )
      } catch (err) {
        if (err instanceof Chunk.NotPersistedError) {
          throw new BackupRPOViolationChunkNotBackedUpError(
            'BackupRPOViolation: chunk not backed up',
            chunk
          )
        }
        throw err
      }
    })
  )
  const seenBlobs = new Set()
  const blobsToVerify = []
  for (const chunk of chunks) {
    /** @type {Set<string>} */
    const chunkBlobs = new Set()
    chunk.findBlobHashes(chunkBlobs)
    let hasAddedBlobFromThisChunk = false
    for (const blobHash of chunkBlobs) {
      if (seenBlobs.has(blobHash)) continue // old blob
      if (GLOBAL_BLOBS.has(blobHash)) continue // global blob
      seenBlobs.add(blobHash)
      if (!hasAddedBlobFromThisChunk) {
        blobsToVerify.push(blobHash)
        hasAddedBlobFromThisChunk = true
      }
    }
  }
  if (blobsToVerify.length === 0) {
    logger.debug(
      {
        historyId,
        chunksRecordsToVerify: chunksRecordsToVerify.map(c => c.chunkId),
      },
      'chunks contain no blobs to verify'
    )
    return
  }
  await verifyBlobs(historyId, blobsToVerify, projectCache)
}

export class BackupCorruptedError extends OError {}
export class BackupRPOViolationError extends OError {}
export class BackupCorruptedMissingBlobError extends BackupCorruptedError {}
export class BackupCorruptedInvalidBlobError extends BackupCorruptedError {}
export class BackupRPOViolationChunkNotBackedUpError extends OError {}
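Usage sketch (illustrative, not part of this commit): verifying a project's backup and classifying the failure with the error classes exported above.

import {
  verifyProjectWithErrorContext,
  BackupCorruptedError,
  BackupRPOViolationChunkNotBackedUpError,
} from './backupVerifier.mjs'

async function verifyOrReport(historyId) {
  try {
    await verifyProjectWithErrorContext(historyId)
  } catch (err) {
    if (err instanceof BackupCorruptedError) {
      // missing DEK, missing blob, or hash mismatch in the backup bucket
    } else if (err instanceof BackupRPOViolationChunkNotBackedUpError) {
      // a chunk that should have been backed up by now is absent
    } else {
      throw err
    }
  }
}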
212  services/history-v1/storage/lib/backup_store/index.js  Normal file
@@ -0,0 +1,212 @@
const { Binary, ObjectId } = require('mongodb')
const { projects, backedUpBlobs } = require('../mongodb')
const OError = require('@overleaf/o-error')

// List projects with pending backups older than the specified interval
function listPendingBackups(timeIntervalMs = 0, limit = null) {
  const cutoffTime = new Date(Date.now() - timeIntervalMs)
  const options = {
    projection: { 'overleaf.backup.pendingChangeAt': 1 },
    sort: { 'overleaf.backup.pendingChangeAt': 1 },
  }

  // Apply limit if provided
  if (limit) {
    options.limit = limit
  }

  const cursor = projects.find(
    {
      'overleaf.backup.pendingChangeAt': {
        $exists: true,
        $lt: cutoffTime,
      },
    },
    options
  )
  return cursor
}

// List projects that have never been backed up and are older than the specified interval
function listUninitializedBackups(timeIntervalMs = 0, limit = null) {
  const cutoffTimeInSeconds = (Date.now() - timeIntervalMs) / 1000
  const options = {
    projection: { _id: 1 },
    sort: { _id: 1 },
  }
  // Apply limit if provided
  if (limit) {
    options.limit = limit
  }
  const cursor = projects.find(
    {
      'overleaf.backup.lastBackedUpVersion': null,
      _id: {
        $lt: ObjectId.createFromTime(cutoffTimeInSeconds),
      },
    },
    options
  )
  return cursor
}

// Retrieve the history ID for a given project without giving direct access to the
// projects collection.

async function getHistoryId(projectId) {
  const project = await projects.findOne(
    { _id: new ObjectId(projectId) },
    {
      projection: {
        'overleaf.history.id': 1,
      },
    }
  )
  if (!project) {
    throw new Error('Project not found')
  }
  return project.overleaf.history.id
}

async function getBackupStatus(projectId) {
  const project = await projects.findOne(
    { _id: new ObjectId(projectId) },
    {
      projection: {
        'overleaf.history': 1,
        'overleaf.backup': 1,
      },
    }
  )
  if (!project) {
    throw new Error('Project not found')
  }
  return {
    backupStatus: project.overleaf.backup,
    historyId: `${project.overleaf.history.id}`,
    currentEndVersion: project.overleaf.history.currentEndVersion,
    currentEndTimestamp: project.overleaf.history.currentEndTimestamp,
  }
}

async function setBackupVersion(
  projectId,
  previousBackedUpVersion,
  currentBackedUpVersion,
  currentBackedUpAt
) {
  // FIXME: include a check to handle race conditions
  // to make sure only one process updates the version numbers
  const result = await projects.updateOne(
    {
      _id: new ObjectId(projectId),
      'overleaf.backup.lastBackedUpVersion': previousBackedUpVersion,
    },
    {
      $set: {
        'overleaf.backup.lastBackedUpVersion': currentBackedUpVersion,
        'overleaf.backup.lastBackedUpAt': currentBackedUpAt,
      },
    }
  )
  if (result.matchedCount === 0 || result.modifiedCount === 0) {
    throw new OError('Failed to update backup version', {
      previousBackedUpVersion,
      currentBackedUpVersion,
      currentBackedUpAt,
      result,
    })
  }
}

async function updateCurrentMetadataIfNotSet(projectId, latestChunkMetadata) {
  await projects.updateOne(
    {
      _id: new ObjectId(projectId),
      'overleaf.history.currentEndVersion': { $exists: false },
      'overleaf.history.currentEndTimestamp': { $exists: false },
    },
    {
      $set: {
        'overleaf.history.currentEndVersion': latestChunkMetadata.endVersion,
        'overleaf.history.currentEndTimestamp':
          latestChunkMetadata.endTimestamp,
      },
    }
  )
}

/**
 * Updates the pending change timestamp for a project's backup status
 * @param {string} projectId - The ID of the project to update
 * @param {Date} backupStartTime - The timestamp to set for pending changes
 * @returns {Promise<void>}
 *
 * If the project's last backed up version matches the current end version,
 * the pending change timestamp is removed. Otherwise, it's set to the provided
 * backup start time.
 */
async function updatePendingChangeTimestamp(projectId, backupStartTime) {
  await projects.updateOne({ _id: new ObjectId(projectId) }, [
    {
      $set: {
        'overleaf.backup.pendingChangeAt': {
          $cond: {
            if: {
              $eq: [
                '$overleaf.backup.lastBackedUpVersion',
                '$overleaf.history.currentEndVersion',
              ],
            },
            then: '$$REMOVE',
            else: backupStartTime,
          },
        },
      },
    },
  ])
}

async function getBackedUpBlobHashes(projectId) {
  const result = await backedUpBlobs.findOne(
    { _id: new ObjectId(projectId) },
    { projection: { blobs: 1 } }
  )
  if (!result) {
    return new Set()
  }
  const hashes = result.blobs.map(b => b.buffer.toString('hex'))
  return new Set(hashes)
}

async function unsetBackedUpBlobHashes(projectId, hashes) {
  const binaryHashes = hashes.map(h => new Binary(Buffer.from(h, 'hex')))
  const result = await backedUpBlobs.findOneAndUpdate(
    { _id: new ObjectId(projectId) },
    {
      $pullAll: {
        blobs: binaryHashes,
      },
    },
    { returnDocument: 'after' }
  )
  if (result && result.blobs.length === 0) {
    await backedUpBlobs.deleteOne({
      _id: new ObjectId(projectId),
      blobs: { $size: 0 },
    })
  }
  return result
}

module.exports = {
  getHistoryId,
  getBackupStatus,
  setBackupVersion,
  updateCurrentMetadataIfNotSet,
  updatePendingChangeTimestamp,
  listPendingBackups,
  listUninitializedBackups,
  getBackedUpBlobHashes,
  unsetBackedUpBlobHashes,
}
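Usage sketch (illustrative, not part of this commit): reading the current backup status and advancing the backed-up version after a successful run.

const { getBackupStatus, setBackupVersion } = require('./backup_store')

async function markBackedUp(projectId, newVersion) {
  const { backupStatus } = await getBackupStatus(projectId)
  await setBackupVersion(
    projectId,
    backupStatus?.lastBackedUpVersion,
    newVersion,
    new Date()
  )
}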
40  services/history-v1/storage/lib/batch_blob_store.js  Normal file
@@ -0,0 +1,40 @@
'use strict'

const BPromise = require('bluebird')

/**
 * @constructor
 * @param {BlobStore} blobStore
 * @classdesc
 * Wrapper for BlobStore that pre-fetches blob metadata to avoid making one
 * database call per blob lookup.
 */
function BatchBlobStore(blobStore) {
  this.blobStore = blobStore
  this.blobs = new Map()
}

/**
 * Pre-fetch metadata for the given blob hashes.
 *
 * @param {Array.<string>} hashes
 * @return {Promise}
 */
BatchBlobStore.prototype.preload = function batchBlobStorePreload(hashes) {
  return BPromise.each(this.blobStore.getBlobs(hashes), blob => {
    this.blobs.set(blob.getHash(), blob)
  })
}

/**
 * @see BlobStore#getBlob
 */
BatchBlobStore.prototype.getBlob = BPromise.method(
  function batchBlobStoreGetBlob(hash) {
    const blob = this.blobs.get(hash)
    if (blob) return blob
    return this.blobStore.getBlob(hash)
  }
)

module.exports = BatchBlobStore
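Usage sketch (illustrative, not part of this commit): preloading metadata once, then resolving many lookups from the in-memory map.

const BatchBlobStore = require('./batch_blob_store')

async function lookupMany(blobStore, hashes) {
  const batch = new BatchBlobStore(blobStore)
  await batch.preload(hashes) // one batched metadata fetch
  return Promise.all(hashes.map(hash => batch.getBlob(hash))) // served from the cache where possible
}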
80  services/history-v1/storage/lib/blob_hash.js  Normal file
@@ -0,0 +1,80 @@
/** @module */
'use strict'

const BPromise = require('bluebird')
const fs = BPromise.promisifyAll(require('node:fs'))
const crypto = require('node:crypto')
const { pipeline } = require('node:stream')
const assert = require('./assert')

function getGitBlobHeader(byteLength) {
  return 'blob ' + byteLength + '\x00'
}

function getBlobHash(byteLength) {
  const hash = crypto.createHash('sha1')
  hash.setEncoding('hex')
  hash.update(getGitBlobHeader(byteLength))
  return hash
}

/**
 * Compute the git blob hash for a blob from a readable stream of its content.
 *
 * @function
 * @param {number} byteLength
 * @param {stream.Readable} stream
 * @return {Promise.<string>} hexadecimal SHA-1 hash
 */
exports.fromStream = BPromise.method(
  function blobHashFromStream(byteLength, stream) {
    assert.integer(byteLength, 'blobHash: bad byteLength')
    assert.object(stream, 'blobHash: bad stream')

    const hash = getBlobHash(byteLength)
    return new BPromise(function (resolve, reject) {
      pipeline(stream, hash, function (err) {
        if (err) {
          reject(err)
        } else {
          hash.end()
          resolve(hash.read())
        }
      })
    })
  }
)

/**
 * Compute the git blob hash for a blob with the given string content.
 *
 * @param {string} string
 * @return {string} hexadecimal SHA-1 hash
 */
exports.fromString = function blobHashFromString(string) {
  assert.string(string, 'blobHash: bad string')
  const hash = getBlobHash(Buffer.byteLength(string))
  hash.update(string, 'utf8')
  hash.end()
  return hash.read()
}

/**
 * Compute the git blob hash for the content of a file
 *
 * @param {string} pathname
 * @return {Promise.<string>} hexadecimal SHA-1 hash
 */
exports.fromFile = function blobHashFromFile(pathname) {
  assert.string(pathname, 'blobHash: bad pathname')

  function getByteLengthOfFile() {
    return fs.statAsync(pathname).then(stat => stat.size)
  }

  const fromStream = this.fromStream
  return getByteLengthOfFile(pathname).then(function (byteLength) {
    const stream = fs.createReadStream(pathname)
    return fromStream(byteLength, stream)
  })
}
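Usage sketch (illustrative, not part of this commit): these hashes match `git hash-object` on the same content; for example, the empty string hashes to e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.

const blobHash = require('./blob_hash')

const emptyHash = blobHash.fromString('') // 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'

async function hashFile(pathname) {
  return await blobHash.fromFile(pathname) // stats the file, then hashes it as a stream
}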
433
services/history-v1/storage/lib/blob_store/index.js
Normal file
433
services/history-v1/storage/lib/blob_store/index.js
Normal file
@@ -0,0 +1,433 @@
|
||||
'use strict'
|
||||
|
||||
const config = require('config')
|
||||
const fs = require('node:fs')
|
||||
const isValidUtf8 = require('utf-8-validate')
|
||||
const { ReadableString } = require('@overleaf/stream-utils')
|
||||
|
||||
const core = require('overleaf-editor-core')
|
||||
const objectPersistor = require('@overleaf/object-persistor')
|
||||
const OError = require('@overleaf/o-error')
|
||||
const Blob = core.Blob
|
||||
const TextOperation = core.TextOperation
|
||||
const containsNonBmpChars = core.util.containsNonBmpChars
|
||||
|
||||
const assert = require('../assert')
|
||||
const blobHash = require('../blob_hash')
|
||||
const mongodb = require('../mongodb')
|
||||
const persistor = require('../persistor')
|
||||
const projectKey = require('../project_key')
|
||||
const streams = require('../streams')
|
||||
const postgresBackend = require('./postgres')
|
||||
const mongoBackend = require('./mongo')
|
||||
const logger = require('@overleaf/logger')
|
||||
|
||||
/** @import { Readable } from 'stream' */
|
||||
|
||||
const GLOBAL_BLOBS = new Map()
|
||||
|
||||
function makeGlobalKey(hash) {
|
||||
return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
|
||||
}
|
||||
|
||||
function makeProjectKey(projectId, hash) {
|
||||
return `${projectKey.format(projectId)}/${hash.slice(0, 2)}/${hash.slice(2)}`
|
||||
}
|
||||
|
||||
async function uploadBlob(projectId, blob, stream, opts = {}) {
|
||||
const bucket = config.get('blobStore.projectBucket')
|
||||
const key = makeProjectKey(projectId, blob.getHash())
|
||||
logger.debug({ projectId, blob }, 'uploadBlob started')
|
||||
try {
|
||||
await persistor.sendStream(bucket, key, stream, {
|
||||
contentType: 'application/octet-stream',
|
||||
...opts,
|
||||
})
|
||||
} finally {
|
||||
logger.debug({ projectId, blob }, 'uploadBlob finished')
|
||||
}
|
||||
}
|
||||
|
||||
function getBlobLocation(projectId, hash) {
|
||||
if (GLOBAL_BLOBS.has(hash)) {
|
||||
return {
|
||||
bucket: config.get('blobStore.globalBucket'),
|
||||
key: makeGlobalKey(hash),
|
||||
}
|
||||
} else {
|
||||
return {
|
||||
bucket: config.get('blobStore.projectBucket'),
|
||||
key: makeProjectKey(projectId, hash),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the appropriate backend for the given project id
|
||||
*
|
||||
* Numeric ids use the Postgres backend.
|
||||
* Strings of 24 characters use the Mongo backend.
|
||||
*/
|
||||
function getBackend(projectId) {
|
||||
if (assert.POSTGRES_ID_REGEXP.test(projectId)) {
|
||||
return postgresBackend
|
||||
} else if (assert.MONGO_ID_REGEXP.test(projectId)) {
|
||||
return mongoBackend
|
||||
} else {
|
||||
throw new OError('bad project id', { projectId })
|
||||
}
|
||||
}
|
||||
|
||||
async function makeBlobForFile(pathname) {
|
||||
const { size: byteLength } = await fs.promises.stat(pathname)
|
||||
const hash = await blobHash.fromStream(
|
||||
byteLength,
|
||||
fs.createReadStream(pathname)
|
||||
)
|
||||
return new Blob(hash, byteLength)
|
||||
}
|
||||
|
||||
async function getStringLengthOfFile(byteLength, pathname) {
|
||||
// We have to read the file into memory to get its UTF-8 length, so don't
|
||||
// bother for files that are too large for us to edit anyway.
|
||||
if (byteLength > Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
|
||||
return null
|
||||
}
|
||||
|
||||
// We need to check if the file contains nonBmp or null characters
|
||||
let data = await fs.promises.readFile(pathname)
|
||||
if (!isValidUtf8(data)) return null
|
||||
data = data.toString()
|
||||
if (data.length > TextOperation.MAX_STRING_LENGTH) return null
|
||||
if (containsNonBmpChars(data)) return null
|
||||
if (data.indexOf('\x00') !== -1) return null
|
||||
return data.length
|
||||
}
|
||||
|
||||
async function deleteBlobsInBucket(projectId) {
|
||||
const bucket = config.get('blobStore.projectBucket')
|
||||
const prefix = `${projectKey.format(projectId)}/`
|
||||
logger.debug({ projectId }, 'deleteBlobsInBucket started')
|
||||
try {
|
||||
await persistor.deleteDirectory(bucket, prefix)
|
||||
} finally {
|
||||
logger.debug({ projectId }, 'deleteBlobsInBucket finished')
|
||||
}
|
||||
}
|
||||
|
||||
async function loadGlobalBlobs() {
|
||||
const blobs = await mongodb.globalBlobs.find()
|
||||
for await (const blob of blobs) {
|
||||
GLOBAL_BLOBS.set(blob._id, {
|
||||
blob: new Blob(blob._id, blob.byteLength, blob.stringLength),
|
||||
demoted: Boolean(blob.demoted),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return metadata for all blobs in the given project
|
||||
* @param {Array<string|number>} projectIds
|
||||
* @return {Promise<{nBlobs:number, blobs:Map<string,Array<core.Blob>>}>}
|
||||
*/
|
||||
async function getProjectBlobsBatch(projectIds) {
|
||||
const mongoProjects = []
|
||||
const postgresProjects = []
|
||||
for (const projectId of projectIds) {
|
||||
if (typeof projectId === 'number') {
|
||||
postgresProjects.push(projectId)
|
||||
} else {
|
||||
mongoProjects.push(projectId)
|
||||
}
|
||||
}
|
||||
const [
|
||||
{ nBlobs: nBlobsPostgres, blobs: blobsPostgres },
|
||||
{ nBlobs: nBlobsMongo, blobs: blobsMongo },
|
||||
] = await Promise.all([
|
||||
postgresBackend.getProjectBlobsBatch(postgresProjects),
|
||||
mongoBackend.getProjectBlobsBatch(mongoProjects),
|
||||
])
|
||||
for (const [id, blobs] of blobsPostgres.entries()) {
|
||||
blobsMongo.set(id.toString(), blobs)
|
||||
}
|
||||
return { nBlobs: nBlobsPostgres + nBlobsMongo, blobs: blobsMongo }
|
||||
}
|
||||
|
||||
/**
|
||||
* @classdesc
|
||||
* Fetch and store the content of files using content-addressable hashing. The
|
||||
* blob store manages both content and metadata (byte and UTF-8 length) for
|
||||
* blobs.
|
||||
*/
|
||||
class BlobStore {
|
||||
/**
|
||||
* @constructor
|
||||
* @param {string} projectId the project for which we'd like to find blobs
|
||||
*/
|
||||
constructor(projectId) {
|
||||
assert.projectId(projectId)
|
||||
this.projectId = projectId
|
||||
this.backend = getBackend(this.projectId)
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up the initial data structure for a given project
|
||||
*/
|
||||
async initialize() {
|
||||
await this.backend.initialize(this.projectId)
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a blob, if one does not already exist, with the given UTF-8 encoded
|
||||
* string content.
|
||||
*
|
||||
* @param {string} string
|
||||
* @return {Promise.<core.Blob>}
|
||||
*/
|
||||
async putString(string) {
|
||||
assert.string(string, 'bad string')
|
||||
const hash = blobHash.fromString(string)
|
||||
|
||||
const existingBlob = await this._findBlobBeforeInsert(hash)
|
||||
if (existingBlob != null) {
|
||||
return existingBlob
|
||||
}
|
||||
const newBlob = new Blob(hash, Buffer.byteLength(string), string.length)
|
||||
// Note: the ReadableString is to work around a bug in the AWS SDK: it won't
|
||||
// allow Body to be blank.
|
||||
await uploadBlob(this.projectId, newBlob, new ReadableString(string))
|
||||
await this.backend.insertBlob(this.projectId, newBlob)
|
||||
return newBlob
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a blob, if one does not already exist, with the given file (usually a
|
||||
* temporary file).
|
||||
*
|
||||
* @param {string} pathname
|
||||
* @return {Promise<core.Blob>}
|
||||
*/
|
||||
async putFile(pathname) {
|
||||
assert.string(pathname, 'bad pathname')
|
||||
const newBlob = await makeBlobForFile(pathname)
|
||||
const existingBlob = await this._findBlobBeforeInsert(newBlob.getHash())
|
||||
if (existingBlob != null) {
|
||||
return existingBlob
|
||||
}
|
||||
const stringLength = await getStringLengthOfFile(
|
||||
newBlob.getByteLength(),
|
||||
pathname
|
||||
)
|
||||
newBlob.setStringLength(stringLength)
|
||||
await this.putBlob(pathname, newBlob)
|
||||
return newBlob
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a new blob, the stringLength must have been added already. It should
|
||||
* have been checked that the blob does not exist yet. Consider using
|
||||
* {@link putFile} instead of this lower-level method.
|
||||
*
|
||||
* @param {string} pathname
|
||||
* @param {core.Blob} finializedBlob
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async putBlob(pathname, finializedBlob) {
|
||||
await uploadBlob(
|
||||
this.projectId,
|
||||
finializedBlob,
|
||||
fs.createReadStream(pathname)
|
||||
)
|
||||
await this.backend.insertBlob(this.projectId, finializedBlob)
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores an object as a JSON string in a blob.
|
||||
*
|
||||
* @param {object} obj
|
||||
* @returns {Promise.<core.Blob>}
|
||||
*/
|
||||
async putObject(obj) {
|
||||
assert.object(obj, 'bad object')
|
||||
const string = JSON.stringify(obj)
|
||||
return await this.putString(string)
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Fetch a blob's content by its hash as a UTF-8 encoded string.
|
||||
*
|
||||
* @param {string} hash hexadecimal SHA-1 hash
|
||||
* @return {Promise.<string>} promise for the content of the file
|
||||
*/
|
||||
async getString(hash) {
|
||||
assert.blobHash(hash, 'bad hash')
|
||||
|
||||
const projectId = this.projectId
|
||||
logger.debug({ projectId, hash }, 'getString started')
|
||||
try {
|
||||
const stream = await this.getStream(hash)
|
||||
const buffer = await streams.readStreamToBuffer(stream)
|
||||
return buffer.toString()
|
||||
} finally {
|
||||
logger.debug({ projectId, hash }, 'getString finished')
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch a JSON encoded blob by its hash and deserialize it.
|
||||
*
|
||||
* @template [T=unknown]
|
||||
* @param {string} hash hexadecimal SHA-1 hash
|
||||
* @return {Promise.<T>} promise for the content of the file
|
||||
*/
|
||||
async getObject(hash) {
|
||||
assert.blobHash(hash, 'bad hash')
|
||||
const projectId = this.projectId
|
||||
logger.debug({ projectId, hash }, 'getObject started')
|
||||
try {
|
||||
const jsonString = await this.getString(hash)
|
||||
const object = JSON.parse(jsonString)
|
||||
return object
|
||||
} catch (error) {
|
||||
// Maybe this is blob is gzipped. Try to gunzip it.
|
||||
      // TODO: Remove once we've ensured this is not reached
      const stream = await this.getStream(hash)
      const buffer = await streams.gunzipStreamToBuffer(stream)
      const object = JSON.parse(buffer.toString())
      logger.warn('getObject: Gzipped object in BlobStore')
      return object
    } finally {
      logger.debug({ projectId, hash }, 'getObject finished')
    }
  }

  /**
   * Fetch a blob by its hash as a stream.
   *
   * Note that, according to the AWS SDK docs, this does not retry after initial
   * failure, so the caller must be prepared to retry on errors, if appropriate.
   *
   * @param {string} hash hexadecimal SHA-1 hash
   * @param {Object} opts
   * @return {Promise.<Readable>} a stream to read the file
   */
  async getStream(hash, opts = {}) {
    assert.blobHash(hash, 'bad hash')

    const { bucket, key } = getBlobLocation(this.projectId, hash)
    try {
      const stream = await persistor.getObjectStream(bucket, key, opts)
      return stream
    } catch (err) {
      if (err instanceof objectPersistor.Errors.NotFoundError) {
        throw new Blob.NotFoundError(hash)
      }
      throw err
    }
  }

  /**
   * Read a blob metadata record by hexadecimal hash.
   *
   * @param {string} hash hexadecimal SHA-1 hash
   * @return {Promise<core.Blob | null>}
   */
  async getBlob(hash) {
    assert.blobHash(hash, 'bad hash')
    const globalBlob = GLOBAL_BLOBS.get(hash)
    if (globalBlob != null) {
      return globalBlob.blob
    }
    const blob = await this.backend.findBlob(this.projectId, hash)
    return blob
  }

  async getBlobs(hashes) {
    assert.array(hashes, 'bad hashes')
    const nonGlobalHashes = []
    const blobs = []
    for (const hash of hashes) {
      const globalBlob = GLOBAL_BLOBS.get(hash)
      if (globalBlob != null) {
        blobs.push(globalBlob.blob)
      } else {
        nonGlobalHashes.push(hash)
      }
    }
    if (nonGlobalHashes.length === 0) {
      return blobs // to avoid unnecessary database lookup
    }
    const projectBlobs = await this.backend.findBlobs(
      this.projectId,
      nonGlobalHashes
    )
    blobs.push(...projectBlobs)
    return blobs
  }

  /**
   * Retrieve all blobs associated with the project.
   * @returns {Promise<core.Blob[]>} A promise that resolves to an array of blobs.
   */
  async getProjectBlobs() {
    const projectBlobs = await this.backend.getProjectBlobs(this.projectId)
    return projectBlobs
  }

  /**
   * Delete all blobs that belong to the project.
   */
  async deleteBlobs() {
    await Promise.all([
      this.backend.deleteBlobs(this.projectId),
      deleteBlobsInBucket(this.projectId),
    ])
  }

  async _findBlobBeforeInsert(hash) {
    const globalBlob = GLOBAL_BLOBS.get(hash)
    if (globalBlob != null && !globalBlob.demoted) {
      return globalBlob.blob
    }
    const blob = await this.backend.findBlob(this.projectId, hash)
    return blob
  }

  /**
   * Copy an existing sourceBlob in this project to a target project.
   * @param {Blob} sourceBlob
   * @param {string} targetProjectId
   * @return {Promise<void>}
   */
  async copyBlob(sourceBlob, targetProjectId) {
    assert.instance(sourceBlob, Blob, 'bad sourceBlob')
    assert.projectId(targetProjectId, 'bad targetProjectId')
    const hash = sourceBlob.getHash()
    const sourceProjectId = this.projectId
    const { bucket, key: sourceKey } = getBlobLocation(sourceProjectId, hash)
    const destKey = makeProjectKey(targetProjectId, hash)
    const targetBackend = getBackend(targetProjectId)
    logger.debug({ sourceProjectId, targetProjectId, hash }, 'copyBlob started')
    try {
      await persistor.copyObject(bucket, sourceKey, destKey)
      await targetBackend.insertBlob(targetProjectId, sourceBlob)
    } finally {
      logger.debug(
        { sourceProjectId, targetProjectId, hash },
        'copyBlob finished'
      )
    }
  }
}

module.exports = {
  BlobStore,
  getProjectBlobsBatch,
  loadGlobalBlobs,
  makeProjectKey,
  makeBlobForFile,
  getStringLengthOfFile,
  GLOBAL_BLOBS,
}

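The BlobStore methods above are easiest to read end to end with a small usage sketch. This is an illustration only, not part of the commit: the require path and project id are assumptions, and loadGlobalBlobs is assumed to have been called once at startup (it is exported from the storage index in this commit).

// Illustration only: round-trip a JSON object through the blob store.
const { BlobStore, loadGlobalBlobs } = require('./storage') // assumed path

async function demo(projectId) {
  await loadGlobalBlobs() // populate GLOBAL_BLOBS before lookups
  const blobStore = new BlobStore(projectId)

  const blob = await blobStore.putObject({ hello: 'world' })
  const roundTripped = await blobStore.getObject(blob.getHash())

  // Metadata lookups consult the global blob table first, then the backend.
  const metadata = await blobStore.getBlob(blob.getHash())
  return { roundTripped, metadata }
}
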
437
services/history-v1/storage/lib/blob_store/mongo.js
Normal file
@@ -0,0 +1,437 @@
// @ts-check
/**
 * Mongo backend for the blob store.
 *
 * Blobs are stored in the projectHistoryBlobs collection. Each project has a
 * document in that collection. That document has a "blobs" subdocument whose
 * fields are buckets of blobs. The key of a bucket is the first three hex
 * digits of the blob hash. The value of the bucket is an array of blobs that
 * match the key.
 *
 * Buckets have a maximum capacity of 8 blobs. When that capacity is exceeded,
 * blobs are stored in a secondary collection: the projectHistoryShardedBlobs
 * collection. This collection shards blobs between 16 documents per project.
 * The shard key is the first hex digit of the hash. The documents are also
 * organized in buckets, but the bucket key is made of hex digits 2, 3 and 4.
 */

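To make the bucketing scheme concrete, here is an illustration (not part of the committed file). The hash is made up, and the helpers referenced are the getBucket and getShardedBucket functions defined further down in this file.

// Illustration only: where a made-up hash lands under this scheme.
const exampleHash = 'abc123def4567890abcdef1234567890abcdef12'
getBucket(exampleHash)        // => 'blobs.abc' (primary collection, first 3 hex digits)
getShardedBucket(exampleHash) // => ['a', 'blobs.bc1'] (shard = 1st digit, bucket = digits 2-4)
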
const { Blob } = require('overleaf-editor-core')
const { ObjectId, Binary, MongoError, ReadPreference } = require('mongodb')
const assert = require('../assert')
const mongodb = require('../mongodb')

const MAX_BLOBS_IN_BUCKET = 8
const DUPLICATE_KEY_ERROR_CODE = 11000

/**
 * @typedef {import('mongodb').ReadPreferenceLike} ReadPreferenceLike
 */

/**
 * Set up the data structures for a given project.
 * @param {string} projectId
 */
async function initialize(projectId) {
  assert.mongoId(projectId, 'bad projectId')
  try {
    await mongodb.blobs.insertOne({
      _id: new ObjectId(projectId),
      blobs: {},
    })
  } catch (err) {
    if (err instanceof MongoError && err.code === DUPLICATE_KEY_ERROR_CODE) {
      return // ignore already initialized case
    }
    throw err
  }
}

/**
 * Return blob metadata for the given project and hash.
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<Blob | null>}
 */
async function findBlob(projectId, hash) {
  assert.mongoId(projectId, 'bad projectId')
  assert.blobHash(hash, 'bad hash')

  const bucket = getBucket(hash)
  const result = await mongodb.blobs.findOne(
    { _id: new ObjectId(projectId) },
    { projection: { _id: 0, bucket: `$${bucket}` } }
  )

  if (result?.bucket == null) {
    return null
  }

  const record = result.bucket.find(blob => blob.h.toString('hex') === hash)
  if (record == null) {
    if (result.bucket.length >= MAX_BLOBS_IN_BUCKET) {
      return await findBlobSharded(projectId, hash)
    } else {
      return null
    }
  }
  return recordToBlob(record)
}

/**
 * Search in the sharded collection for blob metadata
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<Blob | null>}
 */
async function findBlobSharded(projectId, hash) {
  const [shard, bucket] = getShardedBucket(hash)
  const id = makeShardedId(projectId, shard)
  const result = await mongodb.shardedBlobs.findOne(
    { _id: id },
    { projection: { _id: 0, blobs: `$${bucket}` } }
  )
  if (result?.blobs == null) {
    return null
  }
  const record = result.blobs.find(blob => blob.h.toString('hex') === hash)
  if (!record) return null
  return recordToBlob(record)
}

/**
 * Read multiple blob metadata records by hexadecimal hashes.
 * @param {string} projectId
 * @param {Array<string>} hashes
 * @return {Promise<Array<Blob>>}
 */
async function findBlobs(projectId, hashes) {
  assert.mongoId(projectId, 'bad projectId')
  assert.array(hashes, 'bad hashes: not array')
  hashes.forEach(function (hash) {
    assert.blobHash(hash, 'bad hash')
  })

  // Build a set of unique buckets
  const buckets = new Set(hashes.map(getBucket))

  // Get buckets from Mongo
  const projection = { _id: 0 }
  for (const bucket of buckets) {
    projection[bucket] = 1
  }
  const result = await mongodb.blobs.findOne(
    { _id: new ObjectId(projectId) },
    { projection }
  )

  if (result?.blobs == null) {
    return []
  }

  // Build blobs from the query results
  const hashSet = new Set(hashes)
  const blobs = []
  for (const bucket of Object.values(result.blobs)) {
    for (const record of bucket) {
      const hash = record.h.toString('hex')
      if (hashSet.has(hash)) {
        blobs.push(recordToBlob(record))
        hashSet.delete(hash)
      }
    }
  }

  // If we haven't found all the blobs, look in the sharded collection
  if (hashSet.size > 0) {
    const shardedBlobs = await findBlobsSharded(projectId, hashSet)
    blobs.push(...shardedBlobs)
  }

  return blobs
}

/**
 * Search in the sharded collection for blob metadata.
 * @param {string} projectId
 * @param {Set<string>} hashSet
 * @return {Promise<Array<Blob>>}
 */
async function findBlobsSharded(projectId, hashSet) {
  // Build a map of buckets by shard key
  const bucketsByShard = new Map()
  for (const hash of hashSet) {
    const [shard, bucket] = getShardedBucket(hash)
    let buckets = bucketsByShard.get(shard)
    if (buckets == null) {
      buckets = new Set()
      bucketsByShard.set(shard, buckets)
    }
    buckets.add(bucket)
  }

  // Make parallel requests to the shards that might contain the hashes we want
  const requests = []
  for (const [shard, buckets] of bucketsByShard.entries()) {
    const id = makeShardedId(projectId, shard)
    const projection = { _id: 0 }
    for (const bucket of buckets) {
      projection[bucket] = 1
    }
    const request = mongodb.shardedBlobs.findOne({ _id: id }, { projection })
    requests.push(request)
  }
  const results = await Promise.all(requests)

  // Build blobs from the query results
  const blobs = []
  for (const result of results) {
    if (result?.blobs == null) {
      continue
    }

    for (const bucket of Object.values(result.blobs)) {
      for (const record of bucket) {
        const hash = record.h.toString('hex')
        if (hashSet.has(hash)) {
          blobs.push(recordToBlob(record))
        }
      }
    }
  }
  return blobs
}

/**
 * Return metadata for all blobs in the given project
 */
async function getProjectBlobs(projectId) {
  assert.mongoId(projectId, 'bad projectId')

  const result = await mongodb.blobs.findOne(
    { _id: new ObjectId(projectId) },
    { projection: { _id: 0 } }
  )

  if (!result) {
    return []
  }

  // Build blobs from the query results
  const blobs = []
  for (const bucket of Object.values(result.blobs)) {
    for (const record of bucket) {
      blobs.push(recordToBlob(record))
    }
  }

  // Look for all possible sharded blobs
  const minShardedId = makeShardedId(projectId, '0')
  const maxShardedId = makeShardedId(projectId, 'f')
  // @ts-ignore We are using a custom _id here.
  const shardedRecords = mongodb.shardedBlobs.find(
    {
      _id: { $gte: minShardedId, $lte: maxShardedId },
    },
    { projection: { _id: 0 } }
  )

  for await (const shardedRecord of shardedRecords) {
    if (shardedRecord.blobs == null) {
      continue
    }
    for (const bucket of Object.values(shardedRecord.blobs)) {
      for (const record of bucket) {
        blobs.push(recordToBlob(record))
      }
    }
  }

  return blobs
}

/**
 * Return metadata for all blobs in the given projects
 * @param {Array<string>} projectIds
 * @return {Promise<{ nBlobs: number, blobs: Map<string, Array<Blob>> }>}
 */
async function getProjectBlobsBatch(projectIds) {
  for (const project of projectIds) {
    assert.mongoId(project, 'bad projectId')
  }
  let nBlobs = 0
  const blobs = new Map()
  if (projectIds.length === 0) return { nBlobs, blobs }

  // blobs
  {
    const cursor = await mongodb.blobs.find(
      { _id: { $in: projectIds.map(projectId => new ObjectId(projectId)) } },
      { readPreference: ReadPreference.secondaryPreferred }
    )
    for await (const record of cursor) {
      const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
      blobs.set(record._id.toString(), projectBlobs)
      nBlobs += projectBlobs.length
    }
  }

  // sharded blobs
  {
    // @ts-ignore We are using a custom _id here.
    const cursor = await mongodb.shardedBlobs.find(
      {
        _id: {
          $gte: makeShardedId(projectIds[0], '0'),
          $lte: makeShardedId(projectIds[projectIds.length - 1], 'f'),
        },
      },
      { readPreference: ReadPreference.secondaryPreferred }
    )
    for await (const record of cursor) {
      const recordIdHex = record._id.toString('hex')
      const recordProjectId = recordIdHex.slice(0, 24)
      const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
      const found = blobs.get(recordProjectId)
      if (found) {
        found.push(...projectBlobs)
      } else {
        blobs.set(recordProjectId, projectBlobs)
      }
      nBlobs += projectBlobs.length
    }
  }
  return { nBlobs, blobs }
}

/**
 * Add a blob's metadata to the blobs collection after it has been uploaded.
 * @param {string} projectId
 * @param {Blob} blob
 */
async function insertBlob(projectId, blob) {
  assert.mongoId(projectId, 'bad projectId')
  const hash = blob.getHash()
  const bucket = getBucket(hash)
  const record = blobToRecord(blob)
  const result = await mongodb.blobs.updateOne(
    {
      _id: new ObjectId(projectId),
      $expr: {
        $lt: [{ $size: { $ifNull: [`$${bucket}`, []] } }, MAX_BLOBS_IN_BUCKET],
      },
    },
    {
      $addToSet: { [bucket]: record },
    }
  )

  if (result.matchedCount === 0) {
    await insertRecordSharded(projectId, hash, record)
  }
}

/**
 * Add a blob's metadata to the sharded blobs collection.
 * @param {string} projectId
 * @param {string} hash
 * @param {Record} record
 * @return {Promise<void>}
 */
async function insertRecordSharded(projectId, hash, record) {
  const [shard, bucket] = getShardedBucket(hash)
  const id = makeShardedId(projectId, shard)
  await mongodb.shardedBlobs.updateOne(
    { _id: id },
    { $addToSet: { [bucket]: record } },
    { upsert: true }
  )
}

/**
 * Delete all blobs for a given project.
 * @param {string} projectId
 */
async function deleteBlobs(projectId) {
  assert.mongoId(projectId, 'bad projectId')
  await mongodb.blobs.deleteOne({ _id: new ObjectId(projectId) })
  const minShardedId = makeShardedId(projectId, '0')
  const maxShardedId = makeShardedId(projectId, 'f')
  await mongodb.shardedBlobs.deleteMany({
    // @ts-ignore We are using a custom _id here.
    _id: { $gte: minShardedId, $lte: maxShardedId },
  })
}

/**
 * Return the Mongo path to the bucket for the given hash.
 * @param {string} hash
 * @return {string}
 */
function getBucket(hash) {
  return `blobs.${hash.slice(0, 3)}`
}

/**
 * Return the shard key and Mongo path to the bucket for the given hash in the
 * sharded collection.
 * @param {string} hash
 * @return {[string, string]}
 */
function getShardedBucket(hash) {
  const shard = hash.slice(0, 1)
  const bucket = `blobs.${hash.slice(1, 4)}`
  return [shard, bucket]
}

/**
 * Create an _id key for the sharded collection.
 * @param {string} projectId
 * @param {string} shard
 * @return {Binary}
 */
function makeShardedId(projectId, shard) {
  return new Binary(Buffer.from(`${projectId}0${shard}`, 'hex'))
}

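A note on the sharded _id layout, shown with made-up values (illustration only, not part of the committed file): the shard digit is appended to a fixed-length hex id, so a range query from shard '0' to shard 'f' of the same project matches all 16 of that project's sharded documents and nothing else, which is what getProjectBlobs and deleteBlobs rely on.

// Illustration only, with a made-up 24-hex-character project id.
const exampleProjectId = '64a7d0f0b1c2d3e4f5a6b7c8'
makeShardedId(exampleProjectId, '0') // Binary from hex '64a7d0f0b1c2d3e4f5a6b7c800' (13 bytes)
makeShardedId(exampleProjectId, 'f') // Binary from hex '64a7d0f0b1c2d3e4f5a6b7c80f'
// { _id: { $gte: <shard '0'>, $lte: <shard 'f'> } } covers every shard of this project.
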
/**
 * @typedef {Object} Record
 * @property {Binary} h
 * @property {number} b
 * @property {number} [s]
 */

/**
 * Return the Mongo record for the given blob.
 * @param {Blob} blob
 * @return {Record}
 */
function blobToRecord(blob) {
  const hash = blob.getHash()
  const byteLength = blob.getByteLength()
  const stringLength = blob.getStringLength()
  return {
    h: new Binary(Buffer.from(hash, 'hex')),
    b: byteLength,
    s: stringLength,
  }
}

/**
 * Create a blob from the given Mongo record.
 * @param {Record} record
 * @return {Blob}
 */
function recordToBlob(record) {
  return new Blob(record.h.toString('hex'), record.b, record.s)
}

module.exports = {
  initialize,
  findBlob,
  findBlobs,
  getProjectBlobs,
  getProjectBlobsBatch,
  insertBlob,
  deleteBlobs,
}

161
services/history-v1/storage/lib/blob_store/postgres.js
Normal file
@@ -0,0 +1,161 @@
const { Blob } = require('overleaf-editor-core')
const assert = require('../assert')
const knex = require('../knex')

/**
 * Set up the initial data structures for a project
 */
async function initialize(projectId) {
  // Nothing to do for Postgres
}

/**
 * Return blob metadata for the given project and hash
 */
async function findBlob(projectId, hash) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)
  assert.blobHash(hash, 'bad hash')

  const binaryHash = hashToBuffer(hash)
  const record = await knex('project_blobs')
    .select('hash_bytes', 'byte_length', 'string_length')
    .where({
      project_id: projectId,
      hash_bytes: binaryHash,
    })
    .first()
  return recordToBlob(record)
}

/**
 * Read multiple blob metadata records by hexadecimal hashes.
 *
 * @param {Array.<string>} hashes hexadecimal SHA-1 hashes
 * @return {Promise.<Array.<Blob?>>} no guarantee on order
 */
async function findBlobs(projectId, hashes) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)
  assert.array(hashes, 'bad hashes: not array')
  hashes.forEach(function (hash) {
    assert.blobHash(hash, 'bad hash')
  })

  const binaryHashes = hashes.map(hashToBuffer)

  const records = await knex('project_blobs')
    .select('hash_bytes', 'byte_length', 'string_length')
    .where('project_id', projectId)
    .whereIn('hash_bytes', binaryHashes)

  const blobs = records.map(recordToBlob)
  return blobs
}

/**
 * Return metadata for all blobs in the given project
 */
async function getProjectBlobs(projectId) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)

  const records = await knex('project_blobs')
    .select('hash_bytes', 'byte_length', 'string_length')
    .where({
      project_id: projectId,
    })

  const blobs = records.map(recordToBlob)
  return blobs
}

/**
 * Return metadata for all blobs in the given projects
 * @param {Array<number>} projectIds
 * @return {Promise<{ nBlobs: number, blobs: Map<number, Array<Blob>> }>}
 */
async function getProjectBlobsBatch(projectIds) {
  for (const projectId of projectIds) {
    assert.integer(projectId, 'bad projectId')
  }
  let nBlobs = 0
  const blobs = new Map()
  if (projectIds.length === 0) return { nBlobs, blobs }

  const cursor = knex('project_blobs')
    .select('project_id', 'hash_bytes', 'byte_length', 'string_length')
    .whereIn('project_id', projectIds)
    .stream()
  for await (const record of cursor) {
    const found = blobs.get(record.project_id)
    if (found) {
      found.push(recordToBlob(record))
    } else {
      blobs.set(record.project_id, [recordToBlob(record)])
    }
    nBlobs++
  }
  return { nBlobs, blobs }
}

/**
 * Add a blob's metadata to the blobs table after it has been uploaded.
 */
async function insertBlob(projectId, blob) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)

  await knex('project_blobs')
    .insert(blobToRecord(projectId, blob))
    .onConflict(['project_id', 'hash_bytes'])
    .ignore()
}

/**
 * Deletes all blobs for a given project
 */
async function deleteBlobs(projectId) {
  assert.postgresId(projectId, 'bad projectId')
  projectId = parseInt(projectId, 10)

  await knex('project_blobs').where('project_id', projectId).delete()
}

function blobToRecord(projectId, blob) {
  return {
    project_id: projectId,
    hash_bytes: hashToBuffer(blob.hash),
    byte_length: blob.getByteLength(),
    string_length: blob.getStringLength(),
  }
}

function recordToBlob(record) {
  if (!record) return
  return new Blob(
    hashFromBuffer(record.hash_bytes),
    record.byte_length,
    record.string_length
  )
}

function hashToBuffer(hash) {
  if (!hash) return
  return Buffer.from(hash, 'hex')
}

function hashFromBuffer(buffer) {
  if (!buffer) return
  return buffer.toString('hex')
}

module.exports = {
  initialize,
  findBlob,
  findBlobs,
  getProjectBlobs,
  getProjectBlobsBatch,
  insertBlob,
  deleteBlobs,
}

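A hedged usage sketch of the Postgres backend above (illustration only; the require path is an assumption). Project ids arrive as decimal strings and are converted to integers inside each function.

// Illustration only: insert then look up a blob's metadata for project '123'.
const postgresBlobBackend = require('./storage/lib/blob_store/postgres') // assumed path

async function demo(blob) {
  await postgresBlobBackend.insertBlob('123', blob) // inserts a project_blobs row, ignoring duplicates
  const found = await postgresBlobBackend.findBlob('123', blob.getHash())
  return found // a Blob, or undefined if no row matched
}
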
40
services/history-v1/storage/lib/chunk_buffer/index.js
Normal file
@@ -0,0 +1,40 @@
'use strict'

/**
 * @module storage/lib/chunk_buffer
 */

const chunkStore = require('../chunk_store')
const redisBackend = require('../chunk_store/redis')
const metrics = require('@overleaf/metrics')
/**
 * Load the latest Chunk stored for a project, including blob metadata.
 *
 * @param {string} projectId
 * @return {Promise.<Chunk>}
 */
async function loadLatest(projectId) {
  const cachedChunk = await redisBackend.getCurrentChunk(projectId)
  const chunkRecord = await chunkStore.loadLatestRaw(projectId)
  const cachedChunkIsValid = redisBackend.checkCacheValidityWithMetadata(
    cachedChunk,
    chunkRecord
  )
  if (cachedChunkIsValid) {
    metrics.inc('chunk_buffer.loadLatest', 1, {
      status: 'cache-hit',
    })
    return cachedChunk
  } else {
    metrics.inc('chunk_buffer.loadLatest', 1, {
      status: 'cache-miss',
    })
    const chunk = await chunkStore.loadLatest(projectId)
    await redisBackend.setCurrentChunk(projectId, chunk)
    return chunk
  }
}

module.exports = {
  loadLatest,
}

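For orientation, a brief usage sketch of the chunk buffer (illustration only; the require path is an assumption). Callers simply ask for the latest chunk and let the buffer decide between the Redis cache and the chunk store.

const chunkBuffer = require('./storage/lib/chunk_buffer') // assumed path

async function currentEndVersion(projectId) {
  const chunk = await chunkBuffer.loadLatest(projectId) // cache hit or miss is transparent
  return chunk.getEndVersion()
}
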
7
services/history-v1/storage/lib/chunk_store/errors.js
Normal file
@@ -0,0 +1,7 @@
const OError = require('@overleaf/o-error')

class ChunkVersionConflictError extends OError {}

module.exports = {
  ChunkVersionConflictError,
}

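A sketch of how a caller might react to this error (illustration only; whether retrying is appropriate depends on the caller and is an assumption here, not something this commit defines).

// Hypothetical caller-side retry when another writer updated the chunk first.
const { ChunkVersionConflictError } = require('./storage/lib/chunk_store/errors') // assumed path

async function withConflictRetry(fn, attempts = 3) {
  for (let i = 0; i < attempts; i++) {
    try {
      return await fn()
    } catch (err) {
      if (!(err instanceof ChunkVersionConflictError) || i === attempts - 1) throw err
      // Reload the latest chunk state here before trying again.
    }
  }
}
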
447
services/history-v1/storage/lib/chunk_store/index.js
Normal file
@@ -0,0 +1,447 @@
// @ts-check

'use strict'

/**
 * Manage {@link Chunk} and {@link History} storage.
 *
 * For storage, chunks are immutable. If we want to update a project with new
 * changes, we create a new chunk record and History object and delete the old
 * ones. If we compact a project's history, we similarly destroy the old chunk
 * (or chunks) and replace them with a new one. This is helpful when using S3,
 * because it guarantees only eventual consistency for updates but provides
 * stronger consistency guarantees for object creation.
 *
 * When a chunk record in the database is removed, we save its ID for later
 * in the `old_chunks` table, rather than deleting it immediately. This lets us
 * use batch deletion to reduce the number of delete requests to S3.
 *
 * The chunk store also caches data about which blobs are referenced by each
 * chunk, which allows us to find unused blobs without loading all of the data
 * for all projects from S3. Whenever we create a chunk, we also insert records
 * into the `chunk_blobs` table, to help with this bookkeeping.
 */

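The lifecycle described above is easiest to see as a usage sketch (illustration only; the require path, the null project id, and the newChunk argument are assumptions): a project gets an initial chunk, and later writes replace the latest chunk with an extended one while the replaced chunk id is parked for batched deletion.

const chunkStore = require('./storage/lib/chunk_store') // assumed path

async function demo(snapshot, newChunk) {
  // Create the initial chunk (starting at version 0); a null id lets the
  // Postgres backend generate one.
  const projectId = await chunkStore.initializeProject(null, snapshot)

  // Replace the latest chunk with an extended one. The replaced chunk is
  // recorded as an old chunk and removed later by deleteOldChunks.
  const latest = await chunkStore.loadLatest(projectId)
  await chunkStore.update(projectId, latest.getEndVersion(), newChunk, new Date())
  return projectId
}
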
const config = require('config')
const OError = require('@overleaf/o-error')
const { Chunk, History, Snapshot } = require('overleaf-editor-core')

const assert = require('../assert')
const BatchBlobStore = require('../batch_blob_store')
const { BlobStore } = require('../blob_store')
const { historyStore } = require('../history_store')
const mongoBackend = require('./mongo')
const postgresBackend = require('./postgres')
const { ChunkVersionConflictError } = require('./errors')

const DEFAULT_DELETE_BATCH_SIZE = parseInt(config.get('maxDeleteKeys'), 10)
const DEFAULT_DELETE_TIMEOUT_SECS = 3000 // 50 minutes
const DEFAULT_DELETE_MIN_AGE_SECS = 86400 // 1 day

/**
 * Create the initial chunk for a project.
 */
async function initializeProject(projectId, snapshot) {
  if (projectId != null) {
    assert.projectId(projectId, 'bad projectId')
  } else {
    projectId = await postgresBackend.generateProjectId()
  }

  if (snapshot != null) {
    assert.instance(snapshot, Snapshot, 'bad snapshot')
  } else {
    snapshot = new Snapshot()
  }

  const blobStore = new BlobStore(projectId)
  await blobStore.initialize()

  const backend = getBackend(projectId)
  const chunkRecord = await backend.getLatestChunk(projectId)
  if (chunkRecord != null) {
    throw new AlreadyInitialized(projectId)
  }

  const history = new History(snapshot, [])
  const chunk = new Chunk(history, 0)
  await create(projectId, chunk)
  return projectId
}

/**
 * Load the blobs referenced in the given history
 */
async function lazyLoadHistoryFiles(history, batchBlobStore) {
  const blobHashes = new Set()
  history.findBlobHashes(blobHashes)

  await batchBlobStore.preload(Array.from(blobHashes))
  await history.loadFiles('lazy', batchBlobStore)
}

/**
 * Load the latest Chunk stored for a project, including blob metadata.
 *
 * @param {string} projectId
 * @param {Object} [opts]
 * @param {boolean} [opts.readOnly]
 * @return {Promise<{id: string, startVersion: number, endVersion: number, endTimestamp: Date}>}
 */
async function loadLatestRaw(projectId, opts) {
  assert.projectId(projectId, 'bad projectId')

  const backend = getBackend(projectId)
  const chunkRecord = await backend.getLatestChunk(projectId, opts)
  if (chunkRecord == null) {
    throw new Chunk.NotFoundError(projectId)
  }
  return chunkRecord
}

/**
 * Load the latest Chunk stored for a project, including blob metadata.
 *
 * @param {string} projectId
 * @return {Promise.<Chunk>}
 */
async function loadLatest(projectId) {
  const chunkRecord = await loadLatestRaw(projectId)
  const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
  const history = History.fromRaw(rawHistory)
  const blobStore = new BlobStore(projectId)
  const batchBlobStore = new BatchBlobStore(blobStore)
  await lazyLoadHistoryFiles(history, batchBlobStore)
  return new Chunk(history, chunkRecord.startVersion)
}

/**
 * Load the chunk that contains the given version, including blob metadata.
 */
async function loadAtVersion(projectId, version) {
  assert.projectId(projectId, 'bad projectId')
  assert.integer(version, 'bad version')

  const backend = getBackend(projectId)
  const blobStore = new BlobStore(projectId)
  const batchBlobStore = new BatchBlobStore(blobStore)

  const chunkRecord = await backend.getChunkForVersion(projectId, version)
  const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
  const history = History.fromRaw(rawHistory)
  await lazyLoadHistoryFiles(history, batchBlobStore)
  return new Chunk(history, chunkRecord.endVersion - history.countChanges())
}

/**
 * Load the chunk that contains the version that was current at the given
 * timestamp, including blob metadata.
 */
async function loadAtTimestamp(projectId, timestamp) {
  assert.projectId(projectId, 'bad projectId')
  assert.date(timestamp, 'bad timestamp')

  const backend = getBackend(projectId)
  const blobStore = new BlobStore(projectId)
  const batchBlobStore = new BatchBlobStore(blobStore)

  const chunkRecord = await backend.getChunkForTimestamp(projectId, timestamp)
  const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
  const history = History.fromRaw(rawHistory)
  await lazyLoadHistoryFiles(history, batchBlobStore)
  return new Chunk(history, chunkRecord.endVersion - history.countChanges())
}

/**
 * Store the chunk and insert corresponding records in the database.
 *
 * @param {string} projectId
 * @param {Chunk} chunk
 * @param {Date} [earliestChangeTimestamp]
 */
async function create(projectId, chunk, earliestChangeTimestamp) {
  assert.projectId(projectId, 'bad projectId')
  assert.instance(chunk, Chunk, 'bad chunk')
  assert.maybe.date(earliestChangeTimestamp, 'bad timestamp')

  const backend = getBackend(projectId)
  const chunkStart = chunk.getStartVersion()
  const chunkId = await uploadChunk(projectId, chunk)

  const opts = {}
  if (chunkStart > 0) {
    opts.oldChunkId = await getChunkIdForVersion(projectId, chunkStart - 1)
  }
  if (earliestChangeTimestamp != null) {
    opts.earliestChangeTimestamp = earliestChangeTimestamp
  }

  await backend.confirmCreate(projectId, chunk, chunkId, opts)
}

/**
 * Upload the given chunk to object storage.
 *
 * This is used by the create and update methods.
 */
async function uploadChunk(projectId, chunk) {
  const backend = getBackend(projectId)
  const blobStore = new BlobStore(projectId)

  const historyStoreConcurrency = parseInt(
    config.get('chunkStore.historyStoreConcurrency'),
    10
  )

  const rawHistory = await chunk
    .getHistory()
    .store(blobStore, historyStoreConcurrency)
  const chunkId = await backend.insertPendingChunk(projectId, chunk)
  await historyStore.storeRaw(projectId, chunkId, rawHistory)
  return chunkId
}

/**
 * Extend the project's history by replacing the latest chunk with a new
 * chunk.
 *
 * @param {string} projectId
 * @param {number} oldEndVersion
 * @param {Chunk} newChunk
 * @param {Date} [earliestChangeTimestamp]
 * @return {Promise}
 */
async function update(
  projectId,
  oldEndVersion,
  newChunk,
  earliestChangeTimestamp
) {
  assert.projectId(projectId, 'bad projectId')
  assert.integer(oldEndVersion, 'bad oldEndVersion')
  assert.instance(newChunk, Chunk, 'bad newChunk')
  assert.maybe.date(earliestChangeTimestamp, 'bad timestamp')

  const backend = getBackend(projectId)
  const oldChunkId = await getChunkIdForVersion(projectId, oldEndVersion)
  const newChunkId = await uploadChunk(projectId, newChunk)

  const opts = {}
  if (earliestChangeTimestamp != null) {
    opts.earliestChangeTimestamp = earliestChangeTimestamp
  }

  await backend.confirmUpdate(projectId, oldChunkId, newChunk, newChunkId, opts)
}

/**
 * Find the chunk ID for a given version of a project.
 *
 * @param {string} projectId
 * @param {number} version
 * @return {Promise.<string>}
 */
async function getChunkIdForVersion(projectId, version) {
  const backend = getBackend(projectId)
  const chunkRecord = await backend.getChunkForVersion(projectId, version)
  return chunkRecord.id
}

/**
 * Find the chunk metadata for a given version of a project.
 *
 * @param {string} projectId
 * @param {number} version
 * @return {Promise.<{id: string|number, startVersion: number, endVersion: number}>}
 */
async function getChunkMetadataForVersion(projectId, version) {
  const backend = getBackend(projectId)
  const chunkRecord = await backend.getChunkForVersion(projectId, version)
  return chunkRecord
}

/**
 * Get all of a project's chunk ids
 */
async function getProjectChunkIds(projectId) {
  const backend = getBackend(projectId)
  const chunkIds = await backend.getProjectChunkIds(projectId)
  return chunkIds
}

/**
 * Get all of a project's chunks directly
 */
async function getProjectChunks(projectId) {
  const backend = getBackend(projectId)
  const chunkIds = await backend.getProjectChunks(projectId)
  return chunkIds
}

/**
 * Load the chunk for a given chunk record, including blob metadata.
 */
async function loadByChunkRecord(projectId, chunkRecord) {
  const blobStore = new BlobStore(projectId)
  const batchBlobStore = new BatchBlobStore(blobStore)
  const { raw: rawHistory, buffer: chunkBuffer } =
    await historyStore.loadRawWithBuffer(projectId, chunkRecord.id)
  const history = History.fromRaw(rawHistory)
  await lazyLoadHistoryFiles(history, batchBlobStore)
  return {
    chunk: new Chunk(history, chunkRecord.endVersion - history.countChanges()),
    chunkBuffer,
  }
}

/**
 * Asynchronously retrieves project chunks starting from a specific version.
 *
 * This generator function yields chunk records for a given project starting from the specified version (inclusive).
 * It continues to fetch and yield subsequent chunk records until the end version of the latest chunk metadata is reached.
 * If you want to fetch all the chunks *after* a version V, call this function with V+1.
 *
 * @param {string} projectId - The ID of the project.
 * @param {number} version - The starting version to retrieve chunks from.
 * @returns {AsyncGenerator<Object, void, undefined>} An async generator that yields chunk records.
 */
async function* getProjectChunksFromVersion(projectId, version) {
  const backend = getBackend(projectId)
  const latestChunkMetadata = await loadLatestRaw(projectId)
  if (!latestChunkMetadata || version > latestChunkMetadata.endVersion) {
    return
  }
  let chunkRecord = await backend.getChunkForVersion(projectId, version)
  while (chunkRecord != null) {
    yield chunkRecord
    if (chunkRecord.endVersion >= latestChunkMetadata.endVersion) {
      break
    } else {
      chunkRecord = await backend.getChunkForVersion(
        projectId,
        chunkRecord.endVersion + 1
      )
    }
  }
}

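A brief usage sketch for the generator above (illustration only; the require path and helper name are assumptions):

const chunkStore = require('./storage/lib/chunk_store') // assumed path

// Collect every chunk record strictly after `version`, i.e. starting at version + 1.
async function listChunksAfter(projectId, version) {
  const records = []
  for await (const rec of chunkStore.getProjectChunksFromVersion(projectId, version + 1)) {
    records.push({ id: rec.id, start: rec.startVersion, end: rec.endVersion })
  }
  return records
}
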
/**
 * Delete the given chunk from the database.
 *
 * This doesn't delete the chunk from object storage yet. The old chunks
 * collection will do that.
 */
async function destroy(projectId, chunkId) {
  const backend = getBackend(projectId)
  await backend.deleteChunk(projectId, chunkId)
}

/**
 * Delete all of a project's chunks from the database.
 */
async function deleteProjectChunks(projectId) {
  const backend = getBackend(projectId)
  await backend.deleteProjectChunks(projectId)
}

/**
 * Delete a given number of old chunks from both the database
 * and from object storage.
 *
 * @param {object} options
 * @param {number} [options.batchSize] - number of chunks to delete in each
 *   batch
 * @param {number} [options.maxBatches] - maximum number of batches to process
 * @param {number} [options.minAgeSecs] - minimum age of chunks to delete
 * @param {number} [options.timeout] - maximum time to spend deleting chunks
 *
 * @return {Promise<number>} number of chunks deleted
 */
async function deleteOldChunks(options = {}) {
  const batchSize = options.batchSize ?? DEFAULT_DELETE_BATCH_SIZE
  const maxBatches = options.maxBatches ?? Number.MAX_SAFE_INTEGER
  const minAgeSecs = options.minAgeSecs ?? DEFAULT_DELETE_MIN_AGE_SECS
  const timeout = options.timeout ?? DEFAULT_DELETE_TIMEOUT_SECS
  assert.greater(batchSize, 0)
  assert.greater(timeout, 0)
  assert.greater(maxBatches, 0)
  assert.greaterOrEqual(minAgeSecs, 0)

  const timeoutAfter = Date.now() + timeout * 1000
  let deletedChunksTotal = 0
  for (const backend of [postgresBackend, mongoBackend]) {
    for (let i = 0; i < maxBatches; i++) {
      if (Date.now() > timeoutAfter) {
        break
      }
      const deletedChunks = await deleteOldChunksBatch(
        backend,
        batchSize,
        minAgeSecs
      )
      deletedChunksTotal += deletedChunks.length
      if (deletedChunks.length !== batchSize) {
        // Last batch was incomplete. There probably are no old chunks left
        break
      }
    }
  }
  return deletedChunksTotal
}

async function deleteOldChunksBatch(backend, count, minAgeSecs) {
  assert.greater(count, 0, 'bad count')
  assert.greaterOrEqual(minAgeSecs, 0, 'bad minAgeSecs')

  const oldChunks = await backend.getOldChunksBatch(count, minAgeSecs)
  if (oldChunks.length === 0) {
    return []
  }
  await historyStore.deleteChunks(oldChunks)
  await backend.deleteOldChunks(oldChunks.map(chunk => chunk.chunkId))
  return oldChunks
}

/**
 * Returns the appropriate backend for the given project id
 *
 * Numeric ids use the Postgres backend.
 * Strings of 24 characters use the Mongo backend.
 */
function getBackend(projectId) {
  if (assert.POSTGRES_ID_REGEXP.test(projectId)) {
    return postgresBackend
  } else if (assert.MONGO_ID_REGEXP.test(projectId)) {
    return mongoBackend
  } else {
    throw new OError('bad project id', { projectId })
  }
}

class AlreadyInitialized extends OError {
  constructor(projectId) {
    super('Project is already initialized', { projectId })
  }
}

module.exports = {
  getBackend,
  initializeProject,
  loadLatest,
  loadLatestRaw,
  loadAtVersion,
  loadAtTimestamp,
  loadByChunkRecord,
  create,
  update,
  destroy,
  getChunkIdForVersion,
  getChunkMetadataForVersion,
  getProjectChunkIds,
  getProjectChunks,
  getProjectChunksFromVersion,
  deleteProjectChunks,
  deleteOldChunks,
  AlreadyInitialized,
  ChunkVersionConflictError,
}

526
services/history-v1/storage/lib/chunk_store/mongo.js
Normal file
@@ -0,0 +1,526 @@
// @ts-check

const { ObjectId, ReadPreference, MongoError } = require('mongodb')
const { Chunk } = require('overleaf-editor-core')
const OError = require('@overleaf/o-error')
const assert = require('../assert')
const mongodb = require('../mongodb')
const { ChunkVersionConflictError } = require('./errors')

const DUPLICATE_KEY_ERROR_CODE = 11000

/**
 * @import { ClientSession } from 'mongodb'
 */

/**
 * Get the latest chunk's metadata from the database
 * @param {string} projectId
 * @param {Object} [opts]
 * @param {boolean} [opts.readOnly]
 */
async function getLatestChunk(projectId, opts = {}) {
  assert.mongoId(projectId, 'bad projectId')
  const { readOnly = false } = opts

  const record = await mongodb.chunks.findOne(
    {
      projectId: new ObjectId(projectId),
      state: { $in: ['active', 'closed'] },
    },
    {
      sort: { startVersion: -1 },
      readPreference: readOnly
        ? ReadPreference.secondaryPreferred
        : ReadPreference.primary,
    }
  )
  if (record == null) {
    return null
  }
  return chunkFromRecord(record)
}

/**
 * Get the metadata for the chunk that contains the given version.
 */
async function getChunkForVersion(projectId, version) {
  assert.mongoId(projectId, 'bad projectId')
  assert.integer(version, 'bad version')

  const record = await mongodb.chunks.findOne(
    {
      projectId: new ObjectId(projectId),
      state: { $in: ['active', 'closed'] },
      startVersion: { $lte: version },
      endVersion: { $gte: version },
    },
    { sort: { startVersion: 1 } }
  )
  if (record == null) {
    throw new Chunk.VersionNotFoundError(projectId, version)
  }
  return chunkFromRecord(record)
}

/**
 * Get the metadata for the project's first chunk (the one that contains
 * version 0) whose end timestamp is at or before the given timestamp.
 */
async function getFirstChunkBeforeTimestamp(projectId, timestamp) {
  assert.mongoId(projectId, 'bad projectId')
  assert.date(timestamp, 'bad timestamp')

  const recordActive = await getChunkForVersion(projectId, 0)
  if (recordActive && recordActive.endTimestamp <= timestamp) {
    return recordActive
  }

  // fallback to deleted chunk
  const recordDeleted = await mongodb.chunks.findOne(
    {
      projectId: new ObjectId(projectId),
      state: 'deleted',
      startVersion: 0,
      updatedAt: { $lte: timestamp }, // indexed for state=deleted
      endTimestamp: { $lte: timestamp },
    },
    { sort: { updatedAt: -1 } }
  )
  if (recordDeleted) {
    return chunkFromRecord(recordDeleted)
  }
  throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}

/**
 * Get the metadata for the chunk that contains the version that was current at
 * the given timestamp.
 */
async function getChunkForTimestamp(projectId, timestamp) {
  assert.mongoId(projectId, 'bad projectId')
  assert.date(timestamp, 'bad timestamp')

  const record = await mongodb.chunks.findOne(
    {
      projectId: new ObjectId(projectId),
      state: { $in: ['active', 'closed'] },
      endTimestamp: { $gte: timestamp },
    },
    // We use the index on the startVersion for sorting records. This assumes
    // that timestamps go up with each version.
    { sort: { startVersion: 1 } }
  )

  if (record == null) {
    // Couldn't find a chunk that had modifications after the given timestamp.
    // Fetch the latest chunk instead.
    const chunk = await getLatestChunk(projectId)
    if (chunk == null) {
      throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
    }
    return chunk
  }

  return chunkFromRecord(record)
}

/**
 * Get the metadata for the chunk that contains the version that was current before
 * the given timestamp.
 */
async function getLastActiveChunkBeforeTimestamp(projectId, timestamp) {
  assert.mongoId(projectId, 'bad projectId')
  assert.date(timestamp, 'bad timestamp')

  const record = await mongodb.chunks.findOne(
    {
      projectId: new ObjectId(projectId),
      state: { $in: ['active', 'closed'] },
      $or: [
        {
          endTimestamp: {
            $lte: timestamp,
          },
        },
        {
          endTimestamp: null,
        },
      ],
    },
    // We use the index on the startVersion for sorting records. This assumes
    // that timestamps go up with each version.
    { sort: { startVersion: -1 } }
  )
  if (record == null) {
    throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
  }
  return chunkFromRecord(record)
}

/**
 * Get all of a project's chunk ids
 */
async function getProjectChunkIds(projectId) {
  assert.mongoId(projectId, 'bad projectId')

  const cursor = mongodb.chunks.find(
    {
      projectId: new ObjectId(projectId),
      state: { $in: ['active', 'closed'] },
    },
    { projection: { _id: 1 } }
  )
  return await cursor.map(record => record._id).toArray()
}

/**
 * Get all of a project's chunks directly
 */
async function getProjectChunks(projectId) {
  assert.mongoId(projectId, 'bad projectId')

  const cursor = mongodb.chunks
    .find(
      {
        projectId: new ObjectId(projectId),
        state: { $in: ['active', 'closed'] },
      },
      { projection: { state: 0 } }
    )
    .sort({ startVersion: 1 })
  return await cursor.map(chunkFromRecord).toArray()
}

/**
 * Insert a pending chunk before sending it to object storage.
 */
async function insertPendingChunk(projectId, chunk) {
  assert.mongoId(projectId, 'bad projectId')
  assert.instance(chunk, Chunk, 'bad chunk')

  const chunkId = new ObjectId()
  await mongodb.chunks.insertOne({
    _id: chunkId,
    projectId: new ObjectId(projectId),
    startVersion: chunk.getStartVersion(),
    endVersion: chunk.getEndVersion(),
    endTimestamp: chunk.getEndTimestamp(),
    state: 'pending',
    updatedAt: new Date(),
  })
  return chunkId.toString()
}

/**
 * Record that a new chunk was created.
 *
 * @param {string} projectId
 * @param {Chunk} chunk
 * @param {string} chunkId
 * @param {object} opts
 * @param {Date} [opts.earliestChangeTimestamp]
 * @param {string} [opts.oldChunkId]
 */
async function confirmCreate(projectId, chunk, chunkId, opts = {}) {
  assert.mongoId(projectId, 'bad projectId')
  assert.instance(chunk, Chunk, 'bad newChunk')
  assert.mongoId(chunkId, 'bad newChunkId')

  await mongodb.client.withSession(async session => {
    await session.withTransaction(async () => {
      if (opts.oldChunkId != null) {
        await closeChunk(projectId, opts.oldChunkId, { session })
      }

      await activateChunk(projectId, chunkId, { session })

      await updateProjectRecord(
        projectId,
        chunk,
        opts.earliestChangeTimestamp,
        { session }
      )
    })
  })
}

/**
 * Write the metadata to the project record
 */
async function updateProjectRecord(
  projectId,
  chunk,
  earliestChangeTimestamp,
  mongoOpts = {}
) {
  // record the end version against the project
  await mongodb.projects.updateOne(
    {
      'overleaf.history.id': projectId, // string for Object ids, number for postgres ids
    },
    {
      // always store the latest end version and timestamp for the chunk
      $max: {
        'overleaf.history.currentEndVersion': chunk.getEndVersion(),
        'overleaf.history.currentEndTimestamp': chunk.getEndTimestamp(),
        'overleaf.history.updatedAt': new Date(),
      },
      // store the first pending change timestamp for the chunk, this will
      // be cleared every time a backup is completed.
      $min: {
        'overleaf.backup.pendingChangeAt':
          earliestChangeTimestamp || chunk.getEndTimestamp() || new Date(),
      },
    },
    mongoOpts
  )
}

/**
 * Record that a chunk was replaced by a new one.
 *
 * @param {string} projectId
 * @param {string} oldChunkId
 * @param {Chunk} newChunk
 * @param {string} newChunkId
 * @param {object} [opts]
 * @param {Date} [opts.earliestChangeTimestamp]
 */
async function confirmUpdate(
  projectId,
  oldChunkId,
  newChunk,
  newChunkId,
  opts = {}
) {
  assert.mongoId(projectId, 'bad projectId')
  assert.mongoId(oldChunkId, 'bad oldChunkId')
  assert.instance(newChunk, Chunk, 'bad newChunk')
  assert.mongoId(newChunkId, 'bad newChunkId')

  await mongodb.client.withSession(async session => {
    await session.withTransaction(async () => {
      await deleteActiveChunk(projectId, oldChunkId, { session })

      await activateChunk(projectId, newChunkId, { session })

      await updateProjectRecord(
        projectId,
        newChunk,
        opts.earliestChangeTimestamp,
        { session }
      )
    })
  })
}

/**
 * Activate a pending chunk
 *
 * @param {string} projectId
 * @param {string} chunkId
 * @param {object} [opts]
 * @param {ClientSession} [opts.session]
 */
async function activateChunk(projectId, chunkId, opts = {}) {
  assert.mongoId(projectId, 'bad projectId')
  assert.mongoId(chunkId, 'bad chunkId')

  let result
  try {
    result = await mongodb.chunks.updateOne(
      {
        _id: new ObjectId(chunkId),
        projectId: new ObjectId(projectId),
        state: 'pending',
      },
      { $set: { state: 'active', updatedAt: new Date() } },
      opts
    )
  } catch (err) {
    if (err instanceof MongoError && err.code === DUPLICATE_KEY_ERROR_CODE) {
      throw new ChunkVersionConflictError('chunk start version is not unique', {
        projectId,
        chunkId,
      })
    } else {
      throw err
    }
  }
  if (result.matchedCount === 0) {
    throw new OError('pending chunk not found', { projectId, chunkId })
  }
}

/**
 * Close a chunk
 *
 * A closed chunk is one that can't be extended anymore.
 *
 * @param {string} projectId
 * @param {string} chunkId
 * @param {object} [opts]
 * @param {ClientSession} [opts.session]
 */
async function closeChunk(projectId, chunkId, opts = {}) {
  const result = await mongodb.chunks.updateOne(
    {
      _id: new ObjectId(chunkId),
      projectId: new ObjectId(projectId),
      state: 'active',
    },
    { $set: { state: 'closed' } },
    opts
  )

  if (result.matchedCount === 0) {
    throw new ChunkVersionConflictError('unable to close chunk', {
      projectId,
      chunkId,
    })
  }
}

/**
 * Delete an active chunk
 *
 * This is used to delete chunks that are in the process of being extended. It
 * will refuse to delete chunks that are already closed and can therefore not be
 * extended.
 *
 * @param {string} projectId
 * @param {string} chunkId
 * @param {object} [opts]
 * @param {ClientSession} [opts.session]
 */
async function deleteActiveChunk(projectId, chunkId, opts = {}) {
  const updateResult = await mongodb.chunks.updateOne(
    {
      _id: new ObjectId(chunkId),
      projectId: new ObjectId(projectId),
      state: 'active',
    },
    { $set: { state: 'deleted', updatedAt: new Date() } },
    opts
  )

  if (updateResult.matchedCount === 0) {
    throw new ChunkVersionConflictError('unable to delete active chunk', {
      projectId,
      chunkId,
    })
  }
}

/**
 * Delete a chunk.
 *
 * @param {string} projectId
 * @param {string} chunkId
 * @return {Promise}
 */
async function deleteChunk(projectId, chunkId, mongoOpts = {}) {
  assert.mongoId(projectId, 'bad projectId')
  assert.mongoId(chunkId, 'bad chunkId')

  await mongodb.chunks.updateOne(
    { _id: new ObjectId(chunkId), projectId: new ObjectId(projectId) },
    { $set: { state: 'deleted', updatedAt: new Date() } },
    mongoOpts
  )
}

/**
 * Delete all of a project's chunks
 */
async function deleteProjectChunks(projectId) {
  assert.mongoId(projectId, 'bad projectId')

  await mongodb.chunks.updateMany(
    {
      projectId: new ObjectId(projectId),
      state: { $in: ['active', 'closed'] },
    },
    { $set: { state: 'deleted', updatedAt: new Date() } }
  )
}

/**
 * Get a batch of old chunks for deletion
 */
async function getOldChunksBatch(count, minAgeSecs) {
  const maxUpdatedAt = new Date(Date.now() - minAgeSecs * 1000)
  const batch = []

  // We need to fetch one state at a time to take advantage of the partial
  // indexes on the chunks collection.
  //
  // Mongo 6.0 allows partial indexes that use the $in operator. When we reach
  // that Mongo version, we can create a partial index on both the deleted and
  // pending states and simplify this logic a bit.
  for (const state of ['deleted', 'pending']) {
    if (count === 0) {
      // There's no more space in the batch
      break
    }

    const cursor = mongodb.chunks
      .find(
        { state, updatedAt: { $lt: maxUpdatedAt } },
        {
          limit: count,
          projection: { _id: 1, projectId: 1 },
        }
      )
      .map(record => ({
        chunkId: record._id.toString(),
        projectId: record.projectId.toString(),
      }))

    for await (const record of cursor) {
      batch.push(record)
      count -= 1
    }
  }
  return batch
}

/**
 * Delete a batch of old chunks from the database
 */
async function deleteOldChunks(chunkIds) {
  await mongodb.chunks.deleteMany({
    _id: { $in: chunkIds.map(id => new ObjectId(id)) },
    state: { $in: ['deleted', 'pending'] },
  })
}

/**
 * Build a chunk metadata object from the database record
 */
function chunkFromRecord(record) {
  return {
    id: record._id.toString(),
    startVersion: record.startVersion,
    endVersion: record.endVersion,
    endTimestamp: record.endTimestamp,
  }
}

module.exports = {
  getLatestChunk,
  getFirstChunkBeforeTimestamp,
  getLastActiveChunkBeforeTimestamp,
  getChunkForVersion,
  getChunkForTimestamp,
  getProjectChunkIds,
  getProjectChunks,
  insertPendingChunk,
  confirmCreate,
  confirmUpdate,
  updateProjectRecord,
  deleteChunk,
  deleteProjectChunks,
  getOldChunksBatch,
  deleteOldChunks,
}

487
services/history-v1/storage/lib/chunk_store/postgres.js
Normal file
487
services/history-v1/storage/lib/chunk_store/postgres.js
Normal file
@@ -0,0 +1,487 @@
|
||||
// @ts-check
|
||||
|
||||
const { Chunk } = require('overleaf-editor-core')
|
||||
const assert = require('../assert')
|
||||
const knex = require('../knex')
|
||||
const knexReadOnly = require('../knex_read_only')
|
||||
const { ChunkVersionConflictError } = require('./errors')
|
||||
const { updateProjectRecord } = require('./mongo')
|
||||
|
||||
const DUPLICATE_KEY_ERROR_CODE = '23505'
|
||||
|
||||
/**
|
||||
* @import { Knex } from 'knex'
|
||||
*/
|
||||
|
||||
/**
|
||||
* Get the latest chunk's metadata from the database
|
||||
* @param {string} projectId
|
||||
* @param {Object} [opts]
|
||||
* @param {boolean} [opts.readOnly]
|
||||
*/
|
||||
async function getLatestChunk(projectId, opts = {}) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
const { readOnly = false } = opts
|
||||
|
||||
const record = await (readOnly ? knexReadOnly : knex)('chunks')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.orderBy('end_version', 'desc')
|
||||
.first()
|
||||
if (record == null) {
|
||||
return null
|
||||
}
|
||||
return chunkFromRecord(record)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the metadata for the chunk that contains the given version.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {number} version
|
||||
*/
|
||||
async function getChunkForVersion(projectId, version) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
const record = await knex('chunks')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.where('end_version', '>=', version)
|
||||
.orderBy('end_version')
|
||||
.first()
|
||||
if (!record) {
|
||||
throw new Chunk.VersionNotFoundError(projectId, version)
|
||||
}
|
||||
return chunkFromRecord(record)
|
||||
}
|
||||
|
||||
/**
 * Get the metadata for a project's first chunk (the one that starts at
 * version 0), provided it ended at or before the given timestamp. Falls back
 * to deleted chunks when the active first chunk is too recent.
 *
 * @param {string} projectId
 * @param {Date} timestamp
 */
async function getFirstChunkBeforeTimestamp(projectId, timestamp) {
|
||||
assert.date(timestamp, 'bad timestamp')
|
||||
|
||||
const recordActive = await getChunkForVersion(projectId, 0)
|
||||
|
||||
// projectId must be valid if getChunkForVersion did not throw
|
||||
if (recordActive && recordActive.endTimestamp <= timestamp) {
|
||||
return recordActive
|
||||
}
|
||||
|
||||
// fallback to deleted chunk
|
||||
const recordDeleted = await knex('old_chunks')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.where('start_version', '=', 0)
|
||||
.where('end_timestamp', '<=', timestamp)
|
||||
.orderBy('end_version', 'desc')
|
||||
.first()
|
||||
if (recordDeleted) {
|
||||
return chunkFromRecord(recordDeleted)
|
||||
}
|
||||
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the metadata for the chunk that contains the version that was current at
|
||||
* the given timestamp.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {Date} timestamp
|
||||
*/
|
||||
async function getLastActiveChunkBeforeTimestamp(projectId, timestamp) {
|
||||
assert.date(timestamp, 'bad timestamp')
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
const query = knex('chunks')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.where(function () {
|
||||
this.where('end_timestamp', '<=', timestamp).orWhere(
|
||||
'end_timestamp',
|
||||
null
|
||||
)
|
||||
})
|
||||
.orderBy('end_version', 'desc', 'last')
|
||||
|
||||
const record = await query.first()
|
||||
|
||||
if (!record) {
|
||||
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
|
||||
}
|
||||
return chunkFromRecord(record)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the metadata for the chunk that contains the version that was current at
|
||||
* the given timestamp.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {Date} timestamp
|
||||
*/
|
||||
async function getChunkForTimestamp(projectId, timestamp) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
// This query will find the latest chunk after the timestamp (query orders
|
||||
// in reverse chronological order), OR the latest chunk
|
||||
// This accounts for the case where the timestamp is ahead of the chunk's
|
||||
// timestamp and therefore will not return any results
|
||||
const whereAfterEndTimestampOrLatestChunk = knex.raw(
|
||||
'end_timestamp >= ? ' +
|
||||
'OR id = ( ' +
|
||||
'SELECT id FROM chunks ' +
|
||||
'WHERE doc_id = ? ' +
|
||||
'ORDER BY end_version desc LIMIT 1' +
|
||||
')',
|
||||
[timestamp, parseInt(projectId, 10)]
|
||||
)
|
||||
|
||||
const record = await knex('chunks')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.where(whereAfterEndTimestampOrLatestChunk)
|
||||
.orderBy('end_version')
|
||||
.first()
|
||||
if (!record) {
|
||||
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
|
||||
}
|
||||
return chunkFromRecord(record)
|
||||
}
|
||||
|
||||
/**
|
||||
* Build a chunk metadata object from the database record
|
||||
*/
|
||||
function chunkFromRecord(record) {
|
||||
return {
|
||||
id: record.id.toString(),
|
||||
startVersion: record.start_version,
|
||||
endVersion: record.end_version,
|
||||
endTimestamp: record.end_timestamp,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all of a project's chunk ids
|
||||
*
|
||||
* @param {string} projectId
|
||||
*/
|
||||
async function getProjectChunkIds(projectId) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
const records = await knex('chunks')
|
||||
.select('id')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
return records.map(record => record.id)
|
||||
}
|
||||
|
||||
/**
|
||||
 * Get all of a project's chunks directly
|
||||
*
|
||||
* @param {string} projectId
|
||||
*/
|
||||
async function getProjectChunks(projectId) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
const records = await knex('chunks')
|
||||
.select()
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.orderBy('end_version')
|
||||
return records.map(chunkFromRecord)
|
||||
}
|
||||
|
||||
/**
|
||||
* Insert a pending chunk before sending it to object storage.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {Chunk} chunk
|
||||
*/
|
||||
async function insertPendingChunk(projectId, chunk) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
const result = await knex.first(
|
||||
knex.raw("nextval('chunks_id_seq'::regclass)::integer as chunkid")
|
||||
)
|
||||
const chunkId = result.chunkid
|
||||
await knex('pending_chunks').insert({
|
||||
id: chunkId,
|
||||
doc_id: parseInt(projectId, 10),
|
||||
end_version: chunk.getEndVersion(),
|
||||
start_version: chunk.getStartVersion(),
|
||||
end_timestamp: chunk.getEndTimestamp(),
|
||||
})
|
||||
return chunkId.toString()
|
||||
}
|
||||
|
||||
/**
|
||||
* Record that a new chunk was created.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {Chunk} chunk
|
||||
* @param {string} chunkId
|
||||
* @param {object} opts
|
||||
* @param {Date} [opts.earliestChangeTimestamp]
|
||||
* @param {string} [opts.oldChunkId]
|
||||
*/
|
||||
async function confirmCreate(projectId, chunk, chunkId, opts = {}) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
await knex.transaction(async tx => {
|
||||
if (opts.oldChunkId != null) {
|
||||
await _assertChunkIsNotClosed(tx, projectId, opts.oldChunkId)
|
||||
await _closeChunk(tx, projectId, opts.oldChunkId)
|
||||
}
|
||||
await Promise.all([
|
||||
_deletePendingChunk(tx, projectId, chunkId),
|
||||
_insertChunk(tx, projectId, chunk, chunkId),
|
||||
])
|
||||
await updateProjectRecord(
|
||||
// The history id in Mongo is an integer for Postgres projects
|
||||
parseInt(projectId, 10),
|
||||
chunk,
|
||||
opts.earliestChangeTimestamp
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Record that a chunk was replaced by a new one.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} oldChunkId
|
||||
* @param {Chunk} newChunk
|
||||
* @param {string} newChunkId
|
||||
*/
|
||||
async function confirmUpdate(
|
||||
projectId,
|
||||
oldChunkId,
|
||||
newChunk,
|
||||
newChunkId,
|
||||
opts = {}
|
||||
) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
await knex.transaction(async tx => {
|
||||
await _assertChunkIsNotClosed(tx, projectId, oldChunkId)
|
||||
await _deleteChunks(tx, { doc_id: projectId, id: oldChunkId })
|
||||
await Promise.all([
|
||||
_deletePendingChunk(tx, projectId, newChunkId),
|
||||
_insertChunk(tx, projectId, newChunk, newChunkId),
|
||||
])
|
||||
await updateProjectRecord(
|
||||
// The history id in Mongo is an integer for Postgres projects
|
||||
parseInt(projectId, 10),
|
||||
newChunk,
|
||||
opts.earliestChangeTimestamp
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a pending chunk
|
||||
*
|
||||
* @param {Knex} tx
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
*/
|
||||
async function _deletePendingChunk(tx, projectId, chunkId) {
|
||||
await tx('pending_chunks')
|
||||
.where({
|
||||
doc_id: parseInt(projectId, 10),
|
||||
id: parseInt(chunkId, 10),
|
||||
})
|
||||
.del()
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds an active chunk
|
||||
*
|
||||
* @param {Knex} tx
|
||||
* @param {string} projectId
|
||||
* @param {Chunk} chunk
|
||||
* @param {string} chunkId
|
||||
*/
|
||||
async function _insertChunk(tx, projectId, chunk, chunkId) {
|
||||
const startVersion = chunk.getStartVersion()
|
||||
const endVersion = chunk.getEndVersion()
|
||||
try {
|
||||
await tx('chunks').insert({
|
||||
id: parseInt(chunkId, 10),
|
||||
doc_id: parseInt(projectId, 10),
|
||||
start_version: startVersion,
|
||||
end_version: endVersion,
|
||||
end_timestamp: chunk.getEndTimestamp(),
|
||||
})
|
||||
} catch (err) {
|
||||
if (
|
||||
err instanceof Error &&
|
||||
'code' in err &&
|
||||
err.code === DUPLICATE_KEY_ERROR_CODE
|
||||
) {
|
||||
throw new ChunkVersionConflictError(
|
||||
'chunk start or end version is not unique',
|
||||
{ projectId, chunkId, startVersion, endVersion }
|
||||
)
|
||||
}
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that a chunk is not closed
|
||||
*
|
||||
* This is used to synchronize chunk creations and extensions.
|
||||
*
|
||||
* @param {Knex} tx
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
*/
|
||||
async function _assertChunkIsNotClosed(tx, projectId, chunkId) {
|
||||
const record = await tx('chunks')
|
||||
.forUpdate()
|
||||
.select('closed')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.where('id', parseInt(chunkId, 10))
|
||||
.first()
|
||||
if (!record) {
|
||||
throw new ChunkVersionConflictError('unable to close chunk: not found', {
|
||||
projectId,
|
||||
chunkId,
|
||||
})
|
||||
}
|
||||
if (record.closed) {
|
||||
throw new ChunkVersionConflictError(
|
||||
'unable to close chunk: already closed',
|
||||
{
|
||||
projectId,
|
||||
chunkId,
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Close a chunk
|
||||
*
|
||||
* A closed chunk can no longer be extended.
|
||||
*
|
||||
* @param {Knex} tx
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
*/
|
||||
async function _closeChunk(tx, projectId, chunkId) {
|
||||
await tx('chunks')
|
||||
.update({ closed: true })
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.where('id', parseInt(chunkId, 10))
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a chunk.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
*/
|
||||
async function deleteChunk(projectId, chunkId) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
assert.integer(chunkId, 'bad chunkId')
|
||||
|
||||
await _deleteChunks(knex, {
|
||||
doc_id: parseInt(projectId, 10),
|
||||
id: parseInt(chunkId, 10),
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all of a project's chunks
|
||||
*
|
||||
* @param {string} projectId
|
||||
*/
|
||||
async function deleteProjectChunks(projectId) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
await knex.transaction(async tx => {
|
||||
    await _deleteChunks(tx, { doc_id: parseInt(projectId, 10) })
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete many chunks
|
||||
*
|
||||
* @param {Knex} tx
|
||||
* @param {any} whereClause
|
||||
*/
|
||||
async function _deleteChunks(tx, whereClause) {
|
||||
const rows = await tx('chunks').where(whereClause).del().returning('*')
|
||||
if (rows.length === 0) {
|
||||
return
|
||||
}
|
||||
|
||||
const oldChunks = rows.map(row => ({
|
||||
doc_id: row.doc_id,
|
||||
chunk_id: row.id,
|
||||
start_version: row.start_version,
|
||||
end_version: row.end_version,
|
||||
end_timestamp: row.end_timestamp,
|
||||
deleted_at: tx.fn.now(),
|
||||
}))
|
||||
await tx('old_chunks').insert(oldChunks)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a batch of old chunks for deletion
|
||||
*
|
||||
* @param {number} count
|
||||
* @param {number} minAgeSecs
|
||||
*/
|
||||
async function getOldChunksBatch(count, minAgeSecs) {
|
||||
const maxDeletedAt = new Date(Date.now() - minAgeSecs * 1000)
|
||||
const records = await knex('old_chunks')
|
||||
.whereNull('deleted_at')
|
||||
.orWhere('deleted_at', '<', maxDeletedAt)
|
||||
.orderBy('chunk_id')
|
||||
.limit(count)
|
||||
return records.map(oldChunk => ({
|
||||
projectId: oldChunk.doc_id.toString(),
|
||||
chunkId: oldChunk.chunk_id.toString(),
|
||||
}))
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a batch of old chunks from the database
|
||||
*
|
||||
* @param {string[]} chunkIds
|
||||
*/
|
||||
async function deleteOldChunks(chunkIds) {
|
||||
await knex('old_chunks')
|
||||
.whereIn(
|
||||
'chunk_id',
|
||||
chunkIds.map(id => parseInt(id, 10))
|
||||
)
|
||||
.del()
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a new project id
|
||||
*/
|
||||
async function generateProjectId() {
|
||||
const record = await knex.first(
|
||||
knex.raw("nextval('docs_id_seq'::regclass)::integer as doc_id")
|
||||
)
|
||||
return record.doc_id.toString()
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
getLatestChunk,
|
||||
getFirstChunkBeforeTimestamp,
|
||||
getLastActiveChunkBeforeTimestamp,
|
||||
getChunkForVersion,
|
||||
getChunkForTimestamp,
|
||||
getProjectChunkIds,
|
||||
getProjectChunks,
|
||||
insertPendingChunk,
|
||||
confirmCreate,
|
||||
confirmUpdate,
|
||||
deleteChunk,
|
||||
deleteProjectChunks,
|
||||
getOldChunksBatch,
|
||||
deleteOldChunks,
|
||||
generateProjectId,
|
||||
}
|
||||
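A minimal usage sketch of this Postgres chunk-store backend follows. The require path assumes the snippet lives in storage/lib, and the project id '42' is a made-up Postgres id used only for illustration.

const chunkStorePostgres = require('./chunk_store/postgres')

async function printLatestChunk() {
  // '42' is a placeholder project id for illustration only.
  const latest = await chunkStorePostgres.getLatestChunk('42')
  if (latest == null) {
    console.log('project has no chunks yet')
    return
  }
  console.log(latest.id, latest.startVersion, latest.endVersion)
}

printLatestChunk().catch(console.error)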
254
services/history-v1/storage/lib/chunk_store/redis.js
Normal file
254
services/history-v1/storage/lib/chunk_store/redis.js
Normal file
@@ -0,0 +1,254 @@
|
||||
const metrics = require('@overleaf/metrics')
|
||||
const logger = require('@overleaf/logger')
|
||||
const redis = require('../redis')
|
||||
const rclient = redis.rclientHistory //
|
||||
const { Snapshot, Change, History, Chunk } = require('overleaf-editor-core')
|
||||
|
||||
const TEMPORARY_CACHE_LIFETIME = 300 // 5 minutes
|
||||
|
||||
const keySchema = {
|
||||
snapshot({ projectId }) {
|
||||
return `snapshot:{${projectId}}`
|
||||
},
|
||||
startVersion({ projectId }) {
|
||||
return `snapshot-version:{${projectId}}`
|
||||
},
|
||||
changes({ projectId }) {
|
||||
return `changes:{${projectId}}`
|
||||
},
|
||||
}
|
||||
|
||||
rclient.defineCommand('get_current_chunk', {
|
||||
numberOfKeys: 3,
|
||||
lua: `
|
||||
local startVersionValue = redis.call('GET', KEYS[2])
|
||||
if not startVersionValue then
|
||||
return nil -- this is a cache-miss
|
||||
end
|
||||
local snapshotValue = redis.call('GET', KEYS[1])
|
||||
local changesValues = redis.call('LRANGE', KEYS[3], 0, -1)
|
||||
return {snapshotValue, startVersionValue, changesValues}
|
||||
`,
|
||||
})
|
||||
|
||||
/**
|
||||
* Retrieves the current chunk of project history from Redis storage
|
||||
* @param {string} projectId - The unique identifier of the project
|
||||
* @returns {Promise<Chunk|null>} A Promise that resolves to a Chunk object containing project history,
|
||||
* or null if retrieval fails
|
||||
* @throws {Error} If Redis operations fail
|
||||
*/
|
||||
async function getCurrentChunk(projectId) {
|
||||
try {
|
||||
const result = await rclient.get_current_chunk(
|
||||
keySchema.snapshot({ projectId }),
|
||||
keySchema.startVersion({ projectId }),
|
||||
keySchema.changes({ projectId })
|
||||
)
|
||||
if (!result) {
|
||||
return null // cache-miss
|
||||
}
|
||||
const snapshot = Snapshot.fromRaw(JSON.parse(result[0]))
|
||||
const startVersion = JSON.parse(result[1])
|
||||
const changes = result[2].map(c => Change.fromRaw(JSON.parse(c)))
|
||||
const history = new History(snapshot, changes)
|
||||
const chunk = new Chunk(history, startVersion)
|
||||
metrics.inc('chunk_store.redis.get_current_chunk', 1, { status: 'success' })
|
||||
return chunk
|
||||
} catch (err) {
|
||||
logger.error({ err, projectId }, 'error getting current chunk from redis')
|
||||
metrics.inc('chunk_store.redis.get_current_chunk', 1, { status: 'error' })
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
rclient.defineCommand('get_current_chunk_metadata', {
|
||||
numberOfKeys: 2,
|
||||
lua: `
|
||||
local startVersionValue = redis.call('GET', KEYS[1])
|
||||
local changesCount = redis.call('LLEN', KEYS[2])
|
||||
return {startVersionValue, changesCount}
|
||||
`,
|
||||
})
|
||||
|
||||
/**
|
||||
* Retrieves the current chunk metadata for a given project from Redis
|
||||
* @param {string} projectId - The ID of the project to get metadata for
|
||||
* @returns {Promise<Object|null>} Object containing startVersion and changesCount if found, null on error or cache miss
|
||||
* @property {number} startVersion - The starting version information
|
||||
* @property {number} changesCount - The number of changes in the chunk
|
||||
*/
|
||||
async function getCurrentChunkMetadata(projectId) {
|
||||
try {
|
||||
const result = await rclient.get_current_chunk_metadata(
|
||||
keySchema.startVersion({ projectId }),
|
||||
keySchema.changes({ projectId })
|
||||
)
|
||||
if (!result) {
|
||||
return null // cache-miss
|
||||
}
|
||||
const startVersion = JSON.parse(result[0])
|
||||
const changesCount = parseInt(result[1], 10)
|
||||
return { startVersion, changesCount }
|
||||
} catch (err) {
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
rclient.defineCommand('set_current_chunk', {
|
||||
numberOfKeys: 3,
|
||||
lua: `
|
||||
local snapshotValue = ARGV[1]
|
||||
local startVersionValue = ARGV[2]
|
||||
redis.call('SETEX', KEYS[1], ${TEMPORARY_CACHE_LIFETIME}, snapshotValue)
|
||||
redis.call('SETEX', KEYS[2], ${TEMPORARY_CACHE_LIFETIME}, startVersionValue)
|
||||
redis.call('DEL', KEYS[3]) -- clear the old changes list
|
||||
if #ARGV >= 3 then
|
||||
redis.call('RPUSH', KEYS[3], unpack(ARGV, 3))
|
||||
redis.call('EXPIRE', KEYS[3], ${TEMPORARY_CACHE_LIFETIME})
|
||||
end
|
||||
`,
|
||||
})
|
||||
|
||||
/**
|
||||
* Stores the current chunk of project history in Redis
|
||||
* @param {string} projectId - The ID of the project
|
||||
* @param {Chunk} chunk - The chunk object containing history data
|
||||
* @returns {Promise<*>} Returns the result of the Redis operation, or null if an error occurs
|
||||
* @throws {Error} May throw Redis-related errors which are caught internally
|
||||
*/
|
||||
async function setCurrentChunk(projectId, chunk) {
|
||||
try {
|
||||
const snapshotKey = keySchema.snapshot({ projectId })
|
||||
const startVersionKey = keySchema.startVersion({ projectId })
|
||||
const changesKey = keySchema.changes({ projectId })
|
||||
|
||||
const snapshot = chunk.history.snapshot
|
||||
const startVersion = chunk.startVersion
|
||||
const changes = chunk.history.changes
|
||||
|
||||
await rclient.set_current_chunk(
|
||||
snapshotKey,
|
||||
startVersionKey,
|
||||
changesKey,
|
||||
JSON.stringify(snapshot.toRaw()),
|
||||
startVersion,
|
||||
...changes.map(c => JSON.stringify(c.toRaw()))
|
||||
)
|
||||
metrics.inc('chunk_store.redis.set_current_chunk', 1, { status: 'success' })
|
||||
} catch (err) {
|
||||
logger.error(
|
||||
{ err, projectId, chunk },
|
||||
      'error setting current chunk in redis'
|
||||
)
|
||||
metrics.inc('chunk_store.redis.set_current_chunk', 1, { status: 'error' })
|
||||
return null // while testing we will suppress any errors
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether a cached chunk's version metadata matches the current chunk's metadata
|
||||
* @param {Chunk} cachedChunk - The chunk retrieved from cache
|
||||
* @param {Chunk} currentChunk - The current chunk to compare against
|
||||
* @returns {boolean} - Returns true if the chunks have matching start and end versions, false otherwise
|
||||
*/
|
||||
function checkCacheValidity(cachedChunk, currentChunk) {
|
||||
return Boolean(
|
||||
cachedChunk &&
|
||||
cachedChunk.getStartVersion() === currentChunk.getStartVersion() &&
|
||||
cachedChunk.getEndVersion() === currentChunk.getEndVersion()
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates if a cached chunk matches the current chunk metadata by comparing versions
|
||||
* @param {Object} cachedChunk - The cached chunk object to validate
|
||||
* @param {Object} currentChunkMetadata - The current chunk metadata to compare against
|
||||
* @param {number} currentChunkMetadata.startVersion - The starting version number
|
||||
* @param {number} currentChunkMetadata.endVersion - The ending version number
|
||||
* @returns {boolean} - True if the cached chunk is valid, false otherwise
|
||||
*/
|
||||
function checkCacheValidityWithMetadata(cachedChunk, currentChunkMetadata) {
|
||||
return Boolean(
|
||||
cachedChunk &&
|
||||
cachedChunk.getStartVersion() === currentChunkMetadata.startVersion &&
|
||||
cachedChunk.getEndVersion() === currentChunkMetadata.endVersion
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares two chunks for equality using stringified JSON comparison
|
||||
* @param {string} projectId - The ID of the project
|
||||
* @param {Chunk} cachedChunk - The cached chunk to compare
|
||||
* @param {Chunk} currentChunk - The current chunk to compare against
|
||||
* @returns {boolean} - Returns false if either chunk is null/undefined, otherwise returns the comparison result
|
||||
*/
|
||||
function compareChunks(projectId, cachedChunk, currentChunk) {
|
||||
if (!cachedChunk || !currentChunk) {
|
||||
return false
|
||||
}
|
||||
const identical = JSON.stringify(cachedChunk) === JSON.stringify(currentChunk)
|
||||
if (!identical) {
|
||||
try {
|
||||
logger.error(
|
||||
{
|
||||
projectId,
|
||||
cachedChunkStartVersion: cachedChunk.getStartVersion(),
|
||||
cachedChunkEndVersion: cachedChunk.getEndVersion(),
|
||||
currentChunkStartVersion: currentChunk.getStartVersion(),
|
||||
currentChunkEndVersion: currentChunk.getEndVersion(),
|
||||
},
|
||||
'chunk cache mismatch'
|
||||
)
|
||||
} catch (err) {
|
||||
// ignore errors while logging
|
||||
}
|
||||
}
|
||||
metrics.inc('chunk_store.redis.compare_chunks', 1, {
|
||||
status: identical ? 'success' : 'fail',
|
||||
})
|
||||
return identical
|
||||
}
|
||||
|
||||
// Define Lua script for atomic cache clearing
|
||||
rclient.defineCommand('clear_chunk_cache', {
|
||||
numberOfKeys: 3,
|
||||
lua: `
|
||||
-- Delete all keys related to a project's chunk cache atomically
|
||||
redis.call('DEL', KEYS[1]) -- snapshot key
|
||||
redis.call('DEL', KEYS[2]) -- startVersion key
|
||||
redis.call('DEL', KEYS[3]) -- changes key
|
||||
return 1
|
||||
`,
|
||||
})
|
||||
|
||||
/**
|
||||
* Clears all cache entries for a project's chunk data
|
||||
* @param {string} projectId - The ID of the project whose cache should be cleared
|
||||
* @returns {Promise<boolean>} A promise that resolves to true if successful, false on error
|
||||
*/
|
||||
async function clearCache(projectId) {
|
||||
try {
|
||||
const snapshotKey = keySchema.snapshot({ projectId })
|
||||
const startVersionKey = keySchema.startVersion({ projectId })
|
||||
const changesKey = keySchema.changes({ projectId })
|
||||
|
||||
await rclient.clear_chunk_cache(snapshotKey, startVersionKey, changesKey)
|
||||
metrics.inc('chunk_store.redis.clear_cache', 1, { status: 'success' })
|
||||
return true
|
||||
} catch (err) {
|
||||
logger.error({ err, projectId }, 'error clearing chunk cache from redis')
|
||||
metrics.inc('chunk_store.redis.clear_cache', 1, { status: 'error' })
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
getCurrentChunk,
|
||||
setCurrentChunk,
|
||||
getCurrentChunkMetadata,
|
||||
checkCacheValidity,
|
||||
checkCacheValidityWithMetadata,
|
||||
compareChunks,
|
||||
clearCache,
|
||||
}
|
||||
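A sketch of how the cache helpers above might be combined. loadChunkFromPrimaryStore is a stand-in for however the caller fetches the authoritative chunk (for example via the Postgres or Mongo backends earlier in this commit); it is not part of this module.

const redisCache = require('./chunk_store/redis')

// Returns the cached chunk when it matches the authoritative one, refreshing
// the cache otherwise.
async function getChunkWithCache(projectId, loadChunkFromPrimaryStore) {
  const cached = await redisCache.getCurrentChunk(projectId)
  const current = await loadChunkFromPrimaryStore(projectId)
  if (redisCache.checkCacheValidity(cached, current)) {
    return cached
  }
  await redisCache.setCurrentChunk(projectId, current)
  return current
}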
18
services/history-v1/storage/lib/content_hash.js
Normal file
18
services/history-v1/storage/lib/content_hash.js
Normal file
@@ -0,0 +1,18 @@
// @ts-check

const { createHash } = require('node:crypto')

/**
 * Compute a SHA-1 hash of the content
 *
 * This is used to validate incoming updates.
 *
 * @param {string} content
 */
function getContentHash(content) {
  const hash = createHash('sha-1')
  hash.update(content)
  return hash.digest('hex')
}

module.exports = { getContentHash }
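For reference, a quick check of the helper above; the expected output is the well-known SHA-1 of the empty string.

const { getContentHash } = require('./content_hash')

// Should print da39a3ee5e6b4b0d3255bfef95601890afd80709, the SHA-1 of ''.
console.log(getContentHash(''))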
5
services/history-v1/storage/lib/errors.js
Normal file
5
services/history-v1/storage/lib/errors.js
Normal file
@@ -0,0 +1,5 @@
const OError = require('@overleaf/o-error')

class InvalidChangeError extends OError {}

module.exports = { InvalidChangeError }
30
services/history-v1/storage/lib/hash_check_blob_store.js
Normal file
30
services/history-v1/storage/lib/hash_check_blob_store.js
Normal file
@@ -0,0 +1,30 @@
const Blob = require('overleaf-editor-core').Blob
const blobHash = require('./blob_hash')
const BPromise = require('bluebird')

// We want to simulate applying all of the operations so we can return the
// resulting hashes to the caller for them to check. To do this, we need to be
// able to take the lazy files in the final snapshot, fetch their content, and
// compute the new content hashes. We don't, however, need to actually store
// that content; we just need to get the hash.
function HashCheckBlobStore(realBlobStore) {
  this.realBlobStore = realBlobStore
}

HashCheckBlobStore.prototype.getString = BPromise.method(
  function hashCheckBlobStoreGetString(hash) {
    return this.realBlobStore.getString(hash)
  }
)

HashCheckBlobStore.prototype.putString = BPromise.method(
  function hashCheckBlobStorePutString(string) {
    return new Blob(
      blobHash.fromString(string),
      Buffer.byteLength(string),
      string.length
    )
  }
)

module.exports = HashCheckBlobStore
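A usage sketch of the wrapper above, assuming a real BlobStore for some project; the project id '123' is a placeholder and the getHash() accessor on Blob is assumed from overleaf-editor-core.

const { BlobStore } = require('./blob_store')
const HashCheckBlobStore = require('./hash_check_blob_store')

async function previewHash(content) {
  // '123' is a placeholder project id; nothing is actually written because
  // putString only computes the Blob metadata.
  const hashCheckStore = new HashCheckBlobStore(new BlobStore('123'))
  const blob = await hashCheckStore.putString(content)
  return blob.getHash()
}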
202
services/history-v1/storage/lib/history_store.js
Normal file
202
services/history-v1/storage/lib/history_store.js
Normal file
@@ -0,0 +1,202 @@
|
||||
// @ts-check
|
||||
'use strict'
|
||||
|
||||
const core = require('overleaf-editor-core')
|
||||
|
||||
const config = require('config')
|
||||
const path = require('node:path')
|
||||
const Stream = require('node:stream')
|
||||
const { promisify } = require('node:util')
|
||||
const zlib = require('node:zlib')
|
||||
|
||||
const OError = require('@overleaf/o-error')
|
||||
const objectPersistor = require('@overleaf/object-persistor')
|
||||
const logger = require('@overleaf/logger')
|
||||
|
||||
const assert = require('./assert')
|
||||
const persistor = require('./persistor')
|
||||
const projectKey = require('./project_key')
|
||||
const streams = require('./streams')
|
||||
|
||||
const Chunk = core.Chunk
|
||||
|
||||
const gzip = promisify(zlib.gzip)
|
||||
const gunzip = promisify(zlib.gunzip)
|
||||
|
||||
class LoadError extends OError {
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @param {any} cause
|
||||
*/
|
||||
constructor(projectId, chunkId, cause) {
|
||||
super(
|
||||
'HistoryStore: failed to load chunk history',
|
||||
{ projectId, chunkId },
|
||||
cause
|
||||
)
|
||||
this.projectId = projectId
|
||||
this.chunkId = chunkId
|
||||
}
|
||||
}
|
||||
|
||||
class StoreError extends OError {
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @param {any} cause
|
||||
*/
|
||||
constructor(projectId, chunkId, cause) {
|
||||
super(
|
||||
'HistoryStore: failed to store chunk history',
|
||||
{ projectId, chunkId },
|
||||
cause
|
||||
)
|
||||
this.projectId = projectId
|
||||
this.chunkId = chunkId
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @return {string}
|
||||
*/
|
||||
function getKey(projectId, chunkId) {
|
||||
return path.join(projectKey.format(projectId), projectKey.pad(chunkId))
|
||||
}
|
||||
|
||||
/**
|
||||
 * Store and retrieve raw {@link History} objects from bucket. Mainly used via the
|
||||
* {@link ChunkStore}.
|
||||
*
|
||||
* Histories are stored as gzipped JSON blobs, keyed on the project ID and the
|
||||
* ID of the Chunk that owns the history. The project ID is currently redundant,
|
||||
* but I think it might help in future if we have to shard on project ID, and
|
||||
* it gives us some chance of reconstructing histories even if there is a
|
||||
* problem with the chunk metadata in the database.
|
||||
*
|
||||
* @class
|
||||
*/
|
||||
class HistoryStore {
|
||||
#persistor
|
||||
#bucket
|
||||
constructor(persistor, bucket) {
|
||||
this.#persistor = persistor
|
||||
this.#bucket = bucket
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the raw object for a History.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @return {Promise<import('overleaf-editor-core/lib/types').RawHistory>}
|
||||
*/
|
||||
async loadRaw(projectId, chunkId) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.chunkId(chunkId, 'bad chunkId')
|
||||
|
||||
const key = getKey(projectId, chunkId)
|
||||
|
||||
logger.debug({ projectId, chunkId }, 'loadRaw started')
|
||||
try {
|
||||
const buf = await streams.gunzipStreamToBuffer(
|
||||
await this.#persistor.getObjectStream(this.#bucket, key)
|
||||
)
|
||||
return JSON.parse(buf.toString('utf-8'))
|
||||
} catch (err) {
|
||||
if (err instanceof objectPersistor.Errors.NotFoundError) {
|
||||
throw new Chunk.NotPersistedError(projectId)
|
||||
}
|
||||
throw new LoadError(projectId, chunkId, err)
|
||||
} finally {
|
||||
logger.debug({ projectId, chunkId }, 'loadRaw finished')
|
||||
}
|
||||
}
|
||||
|
||||
async loadRawWithBuffer(projectId, chunkId) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.chunkId(chunkId, 'bad chunkId')
|
||||
|
||||
const key = getKey(projectId, chunkId)
|
||||
|
||||
logger.debug({ projectId, chunkId }, 'loadBuffer started')
|
||||
try {
|
||||
const buf = await streams.readStreamToBuffer(
|
||||
await this.#persistor.getObjectStream(this.#bucket, key)
|
||||
)
|
||||
const unzipped = await gunzip(buf)
|
||||
return {
|
||||
buffer: buf,
|
||||
raw: JSON.parse(unzipped.toString('utf-8')),
|
||||
}
|
||||
} catch (err) {
|
||||
if (err instanceof objectPersistor.Errors.NotFoundError) {
|
||||
throw new Chunk.NotPersistedError(projectId)
|
||||
}
|
||||
throw new LoadError(projectId, chunkId, err)
|
||||
} finally {
|
||||
logger.debug({ projectId, chunkId }, 'loadBuffer finished')
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compress and store a {@link History}.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @param {import('overleaf-editor-core/lib/types').RawHistory} rawHistory
|
||||
*/
|
||||
async storeRaw(projectId, chunkId, rawHistory) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.chunkId(chunkId, 'bad chunkId')
|
||||
assert.object(rawHistory, 'bad rawHistory')
|
||||
|
||||
const key = getKey(projectId, chunkId)
|
||||
|
||||
logger.debug({ projectId, chunkId }, 'storeRaw started')
|
||||
|
||||
const buf = await gzip(JSON.stringify(rawHistory))
|
||||
try {
|
||||
await this.#persistor.sendStream(
|
||||
this.#bucket,
|
||||
key,
|
||||
Stream.Readable.from([buf]),
|
||||
{
|
||||
contentType: 'application/json',
|
||||
contentEncoding: 'gzip',
|
||||
contentLength: buf.byteLength,
|
||||
}
|
||||
)
|
||||
} catch (err) {
|
||||
throw new StoreError(projectId, chunkId, err)
|
||||
} finally {
|
||||
logger.debug({ projectId, chunkId }, 'storeRaw finished')
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete multiple chunks from bucket. Expects an Array of objects with
|
||||
* projectId and chunkId properties
|
||||
* @param {Array<{projectId: string,chunkId:string}>} chunks
|
||||
*/
|
||||
async deleteChunks(chunks) {
|
||||
logger.debug({ chunks }, 'deleteChunks started')
|
||||
try {
|
||||
await Promise.all(
|
||||
chunks.map(chunk => {
|
||||
const key = getKey(chunk.projectId, chunk.chunkId)
|
||||
return this.#persistor.deleteObject(this.#bucket, key)
|
||||
})
|
||||
)
|
||||
} finally {
|
||||
logger.debug({ chunks }, 'deleteChunks finished')
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
HistoryStore,
|
||||
historyStore: new HistoryStore(persistor, config.get('chunkStore.bucket')),
|
||||
}
|
||||
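A sketch of round-tripping a raw history through the store above. The project and chunk ids are placeholders (a real caller gets them from the chunk store), and the raw history is deliberately minimal rather than a full RawHistory.

const { historyStore } = require('./history_store')

async function roundTripExample() {
  // Placeholder ids for illustration only.
  const projectId = '123'
  const chunkId = '456'
  await historyStore.storeRaw(projectId, chunkId, {
    snapshot: { files: {} },
    changes: [],
  })
  const raw = await historyStore.loadRaw(projectId, chunkId)
  console.log(raw.changes.length) // 0
}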
8
services/history-v1/storage/lib/knex.js
Normal file
8
services/history-v1/storage/lib/knex.js
Normal file
@@ -0,0 +1,8 @@
// @ts-check

'use strict'

const env = process.env.NODE_ENV || 'development'

const knexfile = require('../../knexfile')
module.exports = require('knex').default(knexfile[env])
19
services/history-v1/storage/lib/knex_read_only.js
Normal file
19
services/history-v1/storage/lib/knex_read_only.js
Normal file
@@ -0,0 +1,19 @@
'use strict'

const config = require('config')
const knexfile = require('../../knexfile')

const env = process.env.NODE_ENV || 'development'

if (config.databaseUrlReadOnly) {
  module.exports = require('knex')({
    ...knexfile[env],
    pool: {
      ...knexfile[env].pool,
      min: 0,
    },
    connection: config.databaseUrlReadOnly,
  })
} else {
  module.exports = require('./knex')
}
30
services/history-v1/storage/lib/mongodb.js
Normal file
30
services/history-v1/storage/lib/mongodb.js
Normal file
@@ -0,0 +1,30 @@
const Metrics = require('@overleaf/metrics')

const config = require('config')
const { MongoClient } = require('mongodb')

const client = new MongoClient(config.mongo.uri)
const db = client.db()

const chunks = db.collection('projectHistoryChunks')
const blobs = db.collection('projectHistoryBlobs')
const globalBlobs = db.collection('projectHistoryGlobalBlobs')
const shardedBlobs = db.collection('projectHistoryShardedBlobs')
const projects = db.collection('projects')
// Temporary collection for tracking progress of backed up old blobs (without a hash).
// The initial sync process will be able to skip over these.
// Schema: _id: projectId, blobs: [Binary]
const backedUpBlobs = db.collection('projectHistoryBackedUpBlobs')

Metrics.mongodb.monitor(client)

module.exports = {
  client,
  db,
  chunks,
  blobs,
  globalBlobs,
  projects,
  shardedBlobs,
  backedUpBlobs,
}
261
services/history-v1/storage/lib/persist_changes.js
Normal file
261
services/history-v1/storage/lib/persist_changes.js
Normal file
@@ -0,0 +1,261 @@
|
||||
// @ts-check
|
||||
|
||||
'use strict'
|
||||
|
||||
const _ = require('lodash')
|
||||
const logger = require('@overleaf/logger')
|
||||
|
||||
const core = require('overleaf-editor-core')
|
||||
const Chunk = core.Chunk
|
||||
const History = core.History
|
||||
|
||||
const assert = require('./assert')
|
||||
const chunkStore = require('./chunk_store')
|
||||
const { BlobStore } = require('./blob_store')
|
||||
const { InvalidChangeError } = require('./errors')
|
||||
const { getContentHash } = require('./content_hash')
|
||||
|
||||
function countChangeBytes(change) {
|
||||
// Note: This is not quite accurate, because the raw change may contain raw
|
||||
// file info (or conceivably even content) that will not be included in the
|
||||
// actual stored object.
|
||||
return Buffer.byteLength(JSON.stringify(change.toRaw()))
|
||||
}
|
||||
|
||||
function totalChangeBytes(changes) {
|
||||
return changes.length ? _(changes).map(countChangeBytes).sum() : 0
|
||||
}
|
||||
|
||||
// provide a simple timer function
|
||||
function Timer() {
|
||||
this.t0 = process.hrtime()
|
||||
}
|
||||
Timer.prototype.elapsed = function () {
|
||||
const dt = process.hrtime(this.t0)
|
||||
const timeInMilliseconds = (dt[0] + dt[1] * 1e-9) * 1e3
|
||||
return timeInMilliseconds
|
||||
}
|
||||
|
||||
/**
|
||||
* Break the given set of changes into zero or more Chunks according to the
|
||||
* provided limits and store them.
|
||||
*
|
||||
* Some other possible improvements:
|
||||
* 1. This does a lot more JSON serialization than it has to. We may know the
|
||||
* JSON for the changes before we call this function, so we could in that
|
||||
* case get the byte size of each change without doing any work. Even if we
|
||||
* don't know it initially, we could save some computation by caching this
|
||||
* info rather than recomputing it many times. TBD whether it is worthwhile.
|
||||
* 2. We don't necessarily have to fetch the latest chunk in order to determine
|
||||
* that it is full. We could store this in the chunk metadata record. It may
|
||||
* be worth distinguishing between a Chunk and its metadata record. The
|
||||
* endVersion may be better suited to the metadata record.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {core.Change[]} allChanges
|
||||
* @param {Object} limits
|
||||
* @param {number} clientEndVersion
|
||||
* @return {Promise.<Object?>}
|
||||
*/
|
||||
async function persistChanges(projectId, allChanges, limits, clientEndVersion) {
|
||||
assert.projectId(projectId)
|
||||
assert.array(allChanges)
|
||||
assert.maybe.object(limits)
|
||||
assert.integer(clientEndVersion)
|
||||
|
||||
const blobStore = new BlobStore(projectId)
|
||||
|
||||
const earliestChangeTimestamp =
|
||||
allChanges.length > 0 ? allChanges[0].getTimestamp() : null
|
||||
|
||||
let currentChunk
|
||||
|
||||
/**
|
||||
* currentSnapshot tracks the latest change that we're applying; we use it to
|
||||
* check that the changes we are persisting are valid.
|
||||
*
|
||||
* @type {core.Snapshot}
|
||||
*/
|
||||
let currentSnapshot
|
||||
|
||||
let originalEndVersion
|
||||
let changesToPersist
|
||||
|
||||
limits = limits || {}
|
||||
_.defaults(limits, {
|
||||
changeBucketMinutes: 60,
|
||||
maxChanges: 2500,
|
||||
maxChangeBytes: 5 * 1024 * 1024,
|
||||
maxChunkChanges: 2000,
|
||||
maxChunkChangeBytes: 5 * 1024 * 1024,
|
||||
maxChunkChangeTime: 5000, // warn if total time for changes in a chunk takes longer than this
|
||||
})
|
||||
|
||||
function checkElapsedTime(timer) {
|
||||
const timeTaken = timer.elapsed()
|
||||
if (timeTaken > limits.maxChunkChangeTime) {
|
||||
console.log('warning: slow chunk', projectId, timeTaken)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add changes to a chunk until the chunk is full
|
||||
*
|
||||
* The chunk is full if it reaches a certain number of changes or a certain
|
||||
* size in bytes
|
||||
*
|
||||
* @param {core.Chunk} chunk
|
||||
* @param {core.Change[]} changes
|
||||
*/
|
||||
async function fillChunk(chunk, changes) {
|
||||
let totalBytes = totalChangeBytes(chunk.getChanges())
|
||||
let changesPushed = false
|
||||
while (changes.length > 0) {
|
||||
if (chunk.getChanges().length >= limits.maxChunkChanges) {
|
||||
break
|
||||
}
|
||||
|
||||
const change = changes[0]
|
||||
const changeBytes = countChangeBytes(change)
|
||||
|
||||
if (totalBytes + changeBytes > limits.maxChunkChangeBytes) {
|
||||
break
|
||||
}
|
||||
|
||||
for (const operation of change.iterativelyApplyTo(currentSnapshot, {
|
||||
strict: true,
|
||||
})) {
|
||||
await validateContentHash(operation)
|
||||
}
|
||||
|
||||
chunk.pushChanges([change])
|
||||
changes.shift()
|
||||
totalBytes += changeBytes
|
||||
changesPushed = true
|
||||
}
|
||||
return changesPushed
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that the operation is valid and can be incorporated to the history.
|
||||
*
|
||||
* For now, this checks content hashes when they are provided.
|
||||
*
|
||||
* @param {core.Operation} operation
|
||||
*/
|
||||
async function validateContentHash(operation) {
|
||||
if (operation instanceof core.EditFileOperation) {
|
||||
const editOperation = operation.getOperation()
|
||||
if (
|
||||
editOperation instanceof core.TextOperation &&
|
||||
editOperation.contentHash != null
|
||||
) {
|
||||
const path = operation.getPathname()
|
||||
const file = currentSnapshot.getFile(path)
|
||||
if (file == null) {
|
||||
throw new InvalidChangeError('file not found for hash validation', {
|
||||
projectId,
|
||||
path,
|
||||
})
|
||||
}
|
||||
await file.load('eager', blobStore)
|
||||
const content = file.getContent({ filterTrackedDeletes: true })
|
||||
const expectedHash = editOperation.contentHash
|
||||
const actualHash = content != null ? getContentHash(content) : null
|
||||
logger.debug({ expectedHash, actualHash }, 'validating content hash')
|
||||
if (actualHash !== expectedHash) {
|
||||
throw new InvalidChangeError('content hash mismatch', {
|
||||
projectId,
|
||||
path,
|
||||
expectedHash,
|
||||
actualHash,
|
||||
})
|
||||
}
|
||||
|
||||
// Remove the content hash from the change before storing it in the chunk.
|
||||
// It was only useful for validation.
|
||||
editOperation.contentHash = null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function extendLastChunkIfPossible() {
|
||||
const latestChunk = await chunkStore.loadLatest(projectId)
|
||||
|
||||
currentChunk = latestChunk
|
||||
originalEndVersion = latestChunk.getEndVersion()
|
||||
if (originalEndVersion !== clientEndVersion) {
|
||||
throw new Chunk.ConflictingEndVersion(
|
||||
clientEndVersion,
|
||||
originalEndVersion
|
||||
)
|
||||
}
|
||||
|
||||
currentSnapshot = latestChunk.getSnapshot().clone()
|
||||
const timer = new Timer()
|
||||
currentSnapshot.applyAll(latestChunk.getChanges())
|
||||
|
||||
const changesPushed = await fillChunk(currentChunk, changesToPersist)
|
||||
if (!changesPushed) {
|
||||
return
|
||||
}
|
||||
|
||||
checkElapsedTime(timer)
|
||||
|
||||
await chunkStore.update(
|
||||
projectId,
|
||||
originalEndVersion,
|
||||
currentChunk,
|
||||
earliestChangeTimestamp
|
||||
)
|
||||
}
|
||||
|
||||
async function createNewChunksAsNeeded() {
|
||||
while (changesToPersist.length > 0) {
|
||||
const endVersion = currentChunk.getEndVersion()
|
||||
const history = new History(currentSnapshot.clone(), [])
|
||||
const chunk = new Chunk(history, endVersion)
|
||||
const timer = new Timer()
|
||||
|
||||
const changesPushed = await fillChunk(chunk, changesToPersist)
|
||||
if (changesPushed) {
|
||||
checkElapsedTime(timer)
|
||||
currentChunk = chunk
|
||||
await chunkStore.create(projectId, chunk, earliestChangeTimestamp)
|
||||
} else {
|
||||
throw new Error('failed to fill empty chunk')
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function isOlderThanMinChangeTimestamp(change) {
|
||||
return change.getTimestamp().getTime() < limits.minChangeTimestamp
|
||||
}
|
||||
|
||||
function isOlderThanMaxChangeTimestamp(change) {
|
||||
return change.getTimestamp().getTime() < limits.maxChangeTimestamp
|
||||
}
|
||||
|
||||
const oldChanges = _.filter(allChanges, isOlderThanMinChangeTimestamp)
|
||||
const anyTooOld = _.some(oldChanges, isOlderThanMaxChangeTimestamp)
|
||||
const tooManyChanges = oldChanges.length > limits.maxChanges
|
||||
const tooManyBytes = totalChangeBytes(oldChanges) > limits.maxChangeBytes
|
||||
|
||||
if (anyTooOld || tooManyChanges || tooManyBytes) {
|
||||
changesToPersist = oldChanges
|
||||
const numberOfChangesToPersist = oldChanges.length
|
||||
|
||||
await extendLastChunkIfPossible()
|
||||
await createNewChunksAsNeeded()
|
||||
|
||||
return {
|
||||
numberOfChangesPersisted: numberOfChangesToPersist,
|
||||
originalEndVersion,
|
||||
currentChunk,
|
||||
}
|
||||
} else {
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = persistChanges
|
||||
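A sketch of how persistChanges might be called, assuming the caller already holds decoded Change objects and an up-to-date end version. The limits shown are the caller's own policy values, not defaults from this commit.

const persistChanges = require('./persist_changes')

async function flushChanges(projectId, changes, clientEndVersion) {
  const limits = {
    // Persist anything older than one minute; force persistence once any
    // change is more than an hour old. Both thresholds are illustrative.
    minChangeTimestamp: Date.now() - 60 * 1000,
    maxChangeTimestamp: Date.now() - 60 * 60 * 1000,
  }
  const result = await persistChanges(
    projectId,
    changes,
    limits,
    clientEndVersion
  )
  if (result == null) {
    return 0 // nothing old enough to persist yet
  }
  return result.numberOfChangesPersisted
}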
27
services/history-v1/storage/lib/persistor.js
Normal file
27
services/history-v1/storage/lib/persistor.js
Normal file
@@ -0,0 +1,27 @@
const _ = require('lodash')
const config = require('config')
const metrics = require('@overleaf/metrics')
const objectPersistor = require('@overleaf/object-persistor')

const persistorConfig = _.cloneDeep(config.get('persistor'))

function convertKey(key, convertFn) {
  if (_.has(persistorConfig, key)) {
    _.update(persistorConfig, key, convertFn)
  }
}

convertKey('s3.signedUrlExpiryInMs', s => parseInt(s, 10))
convertKey('s3.httpOptions.timeout', s => parseInt(s, 10))
convertKey('s3.maxRetries', s => parseInt(s, 10))
convertKey('s3.pathStyle', s => s === 'true')
convertKey('gcs.unlockBeforeDelete', s => s === 'true')
convertKey('gcs.unsignedUrls', s => s === 'true')
convertKey('gcs.signedUrlExpiryInMs', s => parseInt(s, 10))
convertKey('gcs.deleteConcurrency', s => parseInt(s, 10))
convertKey('gcs.retryOptions.maxRetries', s => parseInt(s, 10))
convertKey('fallback.buckets', s => JSON.parse(s || '{}'))

persistorConfig.Metrics = metrics

module.exports = objectPersistor(persistorConfig)
140
services/history-v1/storage/lib/project_archive.js
Normal file
140
services/history-v1/storage/lib/project_archive.js
Normal file
@@ -0,0 +1,140 @@
|
||||
// @ts-check
|
||||
'use strict'
|
||||
|
||||
/**
|
||||
* @import { Snapshot } from 'overleaf-editor-core'
|
||||
* @import { BlobStore } from '../../storage/lib/blob_store/index'
|
||||
*/
|
||||
|
||||
const Archive = require('archiver')
|
||||
const BPromise = require('bluebird')
|
||||
const fs = require('node:fs')
|
||||
const { pipeline } = require('node:stream')
|
||||
|
||||
const core = require('overleaf-editor-core')
|
||||
|
||||
const Snapshot = core.Snapshot
|
||||
const OError = require('@overleaf/o-error')
|
||||
|
||||
const assert = require('./assert')
|
||||
|
||||
// The maximum safe concurrency appears to be 1.
|
||||
// https://github.com/overleaf/issues/issues/1909
|
||||
const FETCH_CONCURRENCY = 1 // number of files to fetch at once
|
||||
const DEFAULT_ZIP_TIMEOUT = 25000 // ms
|
||||
|
||||
class DownloadError extends OError {
|
||||
constructor(hash) {
|
||||
super(`ProjectArchive: blob download failed: ${hash}`, { hash })
|
||||
}
|
||||
}
|
||||
|
||||
class ArchiveTimeout extends OError {
|
||||
constructor() {
|
||||
super('ProjectArchive timed out')
|
||||
}
|
||||
}
|
||||
|
||||
class MissingfileError extends OError {
|
||||
constructor() {
|
||||
super('ProjectArchive: attempting to look up a file that does not exist')
|
||||
}
|
||||
}
|
||||
|
||||
class ProjectArchive {
|
||||
static ArchiveTimeout = ArchiveTimeout
|
||||
static MissingfileError = MissingfileError
|
||||
static DownloadError = DownloadError
|
||||
|
||||
/**
|
||||
* @constructor
|
||||
* @param {Snapshot} snapshot
|
||||
* @param {number} [timeout] in ms
|
||||
* @classdesc
|
||||
* Writes the project snapshot to a zip file.
|
||||
*/
|
||||
constructor(snapshot, timeout) {
|
||||
assert.instance(snapshot, Snapshot)
|
||||
this.snapshot = snapshot
|
||||
this.timeout = timeout || DEFAULT_ZIP_TIMEOUT
|
||||
}
|
||||
|
||||
/**
|
||||
* Write zip archive to the given file path.
|
||||
*
|
||||
* @param {BlobStore} blobStore
|
||||
* @param {string} zipFilePath
|
||||
*/
|
||||
writeZip(blobStore, zipFilePath) {
|
||||
const snapshot = this.snapshot
|
||||
const timeout = this.timeout
|
||||
|
||||
const startTime = process.hrtime()
|
||||
const archive = new Archive('zip')
|
||||
|
||||
// Convert elapsed seconds and nanoseconds to milliseconds.
|
||||
function findElapsedMilliseconds() {
|
||||
const elapsed = process.hrtime(startTime)
|
||||
return elapsed[0] * 1e3 + elapsed[1] * 1e-6
|
||||
}
|
||||
|
||||
function addFileToArchive(pathname) {
|
||||
if (findElapsedMilliseconds() > timeout) {
|
||||
throw new ProjectArchive.ArchiveTimeout()
|
||||
}
|
||||
|
||||
const file = snapshot.getFile(pathname)
|
||||
if (!file) {
|
||||
throw new ProjectArchive.MissingfileError()
|
||||
}
|
||||
return file.load('eager', blobStore).then(function () {
|
||||
const content = file.getContent({ filterTrackedDeletes: true })
|
||||
if (content === null) {
|
||||
return streamFileToArchive(pathname, file).catch(function (err) {
|
||||
throw new ProjectArchive.DownloadError(file.getHash()).withCause(
|
||||
err
|
||||
)
|
||||
})
|
||||
} else {
|
||||
archive.append(content, { name: pathname })
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
function streamFileToArchive(pathname, file) {
|
||||
return new BPromise(function (resolve, reject) {
|
||||
blobStore
|
||||
.getStream(file.getHash())
|
||||
.then(stream => {
|
||||
stream.on('error', reject)
|
||||
stream.on('end', resolve)
|
||||
archive.append(stream, { name: pathname })
|
||||
})
|
||||
.catch(reject)
|
||||
})
|
||||
}
|
||||
|
||||
const addFilesToArchiveAndFinalize = BPromise.map(
|
||||
snapshot.getFilePathnames(),
|
||||
addFileToArchive,
|
||||
{ concurrency: FETCH_CONCURRENCY }
|
||||
).then(function () {
|
||||
archive.finalize()
|
||||
})
|
||||
|
||||
const streamArchiveToFile = new BPromise(function (resolve, reject) {
|
||||
const stream = fs.createWriteStream(zipFilePath)
|
||||
pipeline(archive, stream, function (err) {
|
||||
if (err) {
|
||||
reject(err)
|
||||
} else {
|
||||
resolve()
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
return BPromise.join(streamArchiveToFile, addFilesToArchiveAndFinalize)
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = ProjectArchive
|
||||
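A sketch of producing a zip from a snapshot with the class above. The snapshot and project id come from elsewhere; the 30-second timeout and /tmp output path are arbitrary illustrative choices.

const ProjectArchive = require('./project_archive')
const { BlobStore } = require('./blob_store')

function writeSnapshotZip(projectId, snapshot) {
  // 30s timeout and the /tmp path are illustrative, not defaults.
  const archive = new ProjectArchive(snapshot, 30000)
  const blobStore = new BlobStore(projectId)
  return archive.writeZip(blobStore, '/tmp/project.zip')
}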
24
services/history-v1/storage/lib/project_key.js
Normal file
24
services/history-v1/storage/lib/project_key.js
Normal file
@@ -0,0 +1,24 @@
// Keep in sync with services/web/app/src/Features/History/project_key.js
const _ = require('lodash')
const path = require('node:path')

//
// The advice in http://docs.aws.amazon.com/AmazonS3/latest/dev/
// request-rate-perf-considerations.html is to avoid sequential key prefixes,
// so we reverse the project ID part of the key as they suggest.
//
function format(projectId) {
  const prefix = naiveReverse(pad(projectId))
  return path.join(prefix.slice(0, 3), prefix.slice(3, 6), prefix.slice(6))
}

function pad(number) {
  return _.padStart(number, 9, '0')
}

function naiveReverse(string) {
  return string.split('').reverse().join('')
}

exports.format = format
exports.pad = pad
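For example, worked by hand from the functions above: a project id is zero-padded to nine digits, reversed, and split into three prefix segments.

const projectKey = require('./project_key')

// pad('1234') -> '000001234', reversed -> '432100000'
console.log(projectKey.format('1234')) // 432/100/000
console.log(projectKey.format('123456789')) // 987/654/321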
19
services/history-v1/storage/lib/redis.js
Normal file
19
services/history-v1/storage/lib/redis.js
Normal file
@@ -0,0 +1,19 @@
const config = require('config')
const redis = require('@overleaf/redis-wrapper')

const historyRedisOptions = config.get('redis.history')
const rclientHistory = redis.createClient(historyRedisOptions)

const lockRedisOptions = config.get('redis.history')
const rclientLock = redis.createClient(lockRedisOptions)

async function disconnect() {
  await Promise.all([rclientHistory.disconnect(), rclientLock.disconnect()])
}

module.exports = {
  rclientHistory,
  rclientLock,
  redis,
  disconnect,
}
40
services/history-v1/storage/lib/streams.js
Normal file
40
services/history-v1/storage/lib/streams.js
Normal file
@@ -0,0 +1,40 @@
// @ts-check
/**
 * Promises are promises and streams are streams, and ne'er the twain shall
 * meet.
 * @module
 */
'use strict'

const Stream = require('node:stream')
const zlib = require('node:zlib')
const { WritableBuffer } = require('@overleaf/stream-utils')

/**
 * Create a promise for the result of reading a stream to a buffer.
 *
 * @param {Stream.Readable} readStream
 * @return {Promise<Buffer>}
 */
async function readStreamToBuffer(readStream) {
  const bufferStream = new WritableBuffer()
  await Stream.promises.pipeline(readStream, bufferStream)
  return bufferStream.contents()
}

exports.readStreamToBuffer = readStreamToBuffer

/**
 * Create a promise for the result of un-gzipping a stream to a buffer.
 *
 * @param {NodeJS.ReadableStream} readStream
 * @return {Promise<Buffer>}
 */
async function gunzipStreamToBuffer(readStream) {
  const gunzip = zlib.createGunzip()
  const bufferStream = new WritableBuffer()
  await Stream.promises.pipeline(readStream, gunzip, bufferStream)
  return bufferStream.contents()
}

exports.gunzipStreamToBuffer = gunzipStreamToBuffer
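A small usage sketch of the helpers above, gzipping a string in memory purely for illustration.

const zlib = require('node:zlib')
const Stream = require('node:stream')
const { gunzipStreamToBuffer } = require('./streams')

async function demo() {
  const gzipped = zlib.gzipSync('hello history')
  const readStream = Stream.Readable.from([gzipped])
  const buf = await gunzipStreamToBuffer(readStream)
  console.log(buf.toString('utf-8')) // hello history
}

demo().catch(console.error)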
25
services/history-v1/storage/lib/temp.js
Normal file
25
services/history-v1/storage/lib/temp.js
Normal file
@@ -0,0 +1,25 @@
/*
 * Taken from renderer/app/helpers/temp.js with minor cosmetic changes.
 * Promisify the temp package. The temp package provides a 'track' feature
 * that automatically cleans up temp files at process exit, but that is not
 * very useful. They also provide a method to trigger cleanup, but that is not
 * safe for concurrent use. So, we use a disposer to unlink the file.
 */

const BPromise = require('bluebird')
const fs = BPromise.promisifyAll(require('node:fs'))
const temp = BPromise.promisifyAll(require('temp'))

exports.open = function (affixes) {
  return temp.openAsync(affixes).disposer(function (fileInfo) {
    fs.closeAsync(fileInfo.fd)
      .then(() => {
        return fs.unlinkAsync(fileInfo.path)
      })
      .catch(function (err) {
        if (err.code !== 'ENOENT') {
          throw err
        }
      })
  })
}
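The disposer above is meant to be consumed with Bluebird's using(), roughly as below; the 'zip' affix mirrors zip_store.js, and withTempFile is a hypothetical helper.

const BPromise = require('bluebird')
const fs = require('node:fs')
const temp = require('./temp')

function withTempFile(work) {
  return BPromise.using(temp.open('zip'), tempFileInfo => {
    // The file exists for the duration of `work` and is unlinked afterwards.
    return work(tempFileInfo.path)
  })
}

// Example: write something into the temp file.
withTempFile(path => fs.promises.writeFile(path, 'scratch data'))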
134
services/history-v1/storage/lib/zip_store.js
Normal file
134
services/history-v1/storage/lib/zip_store.js
Normal file
@@ -0,0 +1,134 @@
|
||||
'use strict'

const BPromise = require('bluebird')
const config = require('config')
const fs = require('node:fs')
const path = require('node:path')

const OError = require('@overleaf/o-error')
const objectPersistor = require('@overleaf/object-persistor')

const assert = require('./assert')
const { BlobStore } = require('./blob_store')
const persistor = require('./persistor')
const ProjectArchive = require('./project_archive')
const projectKey = require('./project_key')
const temp = require('./temp')

const BUCKET = config.get('zipStore.bucket')

function getZipKey(projectId, version) {
  return path.join(
    projectKey.format(projectId),
    version.toString(),
    'project.zip'
  )
}

/**
 * Store a zip of a given version of a project in bucket.
 *
 * @class
 */
class ZipStore {
  /**
   * Generate a signed link to access the zip file.
   *
   * @param {number | string} projectId
   * @param {number} version
   * @return {string}
   */
  async getSignedUrl(projectId, version) {
    assert.projectId(projectId, 'bad projectId')
    assert.integer(version, 'bad version')

    const key = getZipKey(projectId, version)
    return await persistor.getRedirectUrl(BUCKET, key)
  }

  /**
   * Generate a zip of the given snapshot.
   *
   * @param {number | string} projectId
   * @param {number} version
   * @param {Snapshot} snapshot
   */
  async storeZip(projectId, version, snapshot) {
    assert.projectId(projectId, 'bad projectId')
    assert.integer(version, 'bad version')
    assert.object(snapshot, 'bad snapshot')

    const zipKey = getZipKey(projectId, version)

    if (await isZipPresent()) return

    await BPromise.using(temp.open('zip'), async tempFileInfo => {
      await zipSnapshot(tempFileInfo.path, snapshot)
      await uploadZip(tempFileInfo.path)
    })

    // If the file is already there, we don't need to build the zip again. If we
    // just HEAD the file, there's a race condition, because the zip files
    // automatically expire. So, we try to copy the file from itself to itself,
    // and if it fails, we know the file didn't exist. If it succeeds, this has
    // the effect of re-extending its lifetime.
    async function isZipPresent() {
      try {
        await persistor.copyObject(BUCKET, zipKey, zipKey)
        return true
      } catch (error) {
        if (!(error instanceof objectPersistor.Errors.NotFoundError)) {
          console.error(
            'storeZip: isZipPresent: unexpected error (except in dev): %s',
            error
          )
        }
        return false
      }
    }

    async function zipSnapshot(tempPathname, snapshot) {
      const blobStore = new BlobStore(projectId)
      const zipTimeoutMs = parseInt(config.get('zipStore.zipTimeoutMs'), 10)
      const archive = new ProjectArchive(snapshot, zipTimeoutMs)
      try {
        await archive.writeZip(blobStore, tempPathname)
      } catch (err) {
        throw new ZipStore.CreationError(projectId, version).withCause(err)
      }
    }

    async function uploadZip(tempPathname) {
      const stream = fs.createReadStream(tempPathname)
      try {
        await persistor.sendStream(BUCKET, zipKey, stream, {
          contentType: 'application/zip',
        })
      } catch (err) {
        throw new ZipStore.UploadError(projectId, version).withCause(err)
      }
    }
  }
}

class CreationError extends OError {
  constructor(projectId, version) {
    super(`Zip creation failed for ${projectId} version ${version}`, {
      projectId,
      version,
    })
  }
}
ZipStore.CreationError = CreationError

class UploadError extends OError {
  constructor(projectId, version) {
    super(`Zip upload failed for ${projectId} version ${version}`, {
      projectId,
      version,
    })
  }
}
ZipStore.UploadError = UploadError

module.exports = new ZipStore()
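For orientation, a minimal sketch (not part of this commit) of how a caller might drive this module, given a Snapshot and version obtained elsewhere, for example from the chunk store; the wrapper function and its arguments are illustrative assumptions, only `storeZip` and `getSignedUrl` are taken from the file above.

// Hedged usage sketch only; nothing here is prescribed by this diff.
const zipStore = require('./zip_store')

async function publishProjectZip(projectId, version, snapshot) {
  // storeZip returns early if a zip for this version is already in the bucket.
  await zipStore.storeZip(projectId, version, snapshot)
  // getSignedUrl resolves to a time-limited redirect URL for the stored zip.
  return await zipStore.getSignedUrl(projectId, version)
}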
1476
services/history-v1/storage/scripts/back_fill_file_hash.mjs
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,647 @@
// @ts-check
import Events from 'node:events'
import fs from 'node:fs'
import Stream from 'node:stream'
import { ObjectId } from 'mongodb'
import logger from '@overleaf/logger'
import OError from '@overleaf/o-error'
import { Blob } from 'overleaf-editor-core'
import {
  BlobStore,
  getStringLengthOfFile,
  GLOBAL_BLOBS,
  makeBlobForFile,
} from '../lib/blob_store/index.js'
import { db } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import readline from 'node:readline'
import { _blobIsBackedUp, backupBlob } from '../lib/backupBlob.mjs'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import filestorePersistor from '../lib/persistor.js'
import { setTimeout } from 'node:timers/promises'

// Silence warning.
Events.setMaxListeners(20)

// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true

/**
 * @typedef {import("mongodb").Collection} Collection
 * @typedef {import("mongodb").Collection<Project>} ProjectsCollection
 * @typedef {import("mongodb").Collection<{project: Project}>} DeletedProjectsCollection
 */

/**
 * @typedef {Object} FileRef
 * @property {ObjectId} _id
 * @property {string} hash
 */

/**
 * @typedef {Object} Folder
 * @property {Array<Folder>} folders
 * @property {Array<FileRef>} fileRefs
 */

/**
 * @typedef {Object} Project
 * @property {ObjectId} _id
 * @property {Array<Folder>} rootFolder
 * @property {{history: {id: (number|string)}}} overleaf
 */

/**
 * @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, FIX_MISSING_HASH: boolean, LOGS: string}}
 */
function parseArgs() {
  const args = commandLineArgs([
    { name: 'fixNotFound', type: String, defaultValue: 'true' },
    { name: 'fixDeletePermission', type: String, defaultValue: 'true' },
    { name: 'fixHashMismatch', type: String, defaultValue: 'true' },
    { name: 'fixMissingHash', type: String, defaultValue: 'true' },
    { name: 'logs', type: String, defaultValue: '' },
  ])
  /**
   * commandLineArgs cannot handle --foo=false, so go the long way
   * @param {string} name
   * @return {boolean}
   */
  function boolVal(name) {
    const v = args[name]
    if (['true', 'false'].includes(v)) return v === 'true'
    throw new Error(`expected "true" or "false" for boolean option ${name}`)
  }
  return {
    FIX_HASH_MISMATCH: boolVal('fixHashMismatch'),
    FIX_DELETE_PERMISSION: boolVal('fixDeletePermission'),
    FIX_NOT_FOUND: boolVal('fixNotFound'),
    FIX_MISSING_HASH: boolVal('fixMissingHash'),
    LOGS: args.logs,
  }
}
|
||||
const {
|
||||
FIX_HASH_MISMATCH,
|
||||
FIX_DELETE_PERMISSION,
|
||||
FIX_NOT_FOUND,
|
||||
FIX_MISSING_HASH,
|
||||
LOGS,
|
||||
} = parseArgs()
|
||||
if (!LOGS) {
|
||||
throw new Error('--logs parameter missing')
|
||||
}
|
||||
const BUFFER_DIR = fs.mkdtempSync(
|
||||
process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
|
||||
)
|
||||
const USER_FILES_BUCKET_NAME = process.env.USER_FILES_BUCKET_NAME || ''
|
||||
if (!USER_FILES_BUCKET_NAME) {
|
||||
throw new Error('env var USER_FILES_BUCKET_NAME is missing')
|
||||
}
|
||||
// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode
|
||||
const STREAM_HIGH_WATER_MARK = parseInt(
|
||||
process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(),
|
||||
10
|
||||
)
|
||||
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
|
||||
|
||||
/** @type {ProjectsCollection} */
|
||||
const projectsCollection = db.collection('projects')
|
||||
/** @type {DeletedProjectsCollection} */
|
||||
const deletedProjectsCollection = db.collection('deletedProjects')
|
||||
|
||||
let gracefulShutdownInitiated = false
|
||||
|
||||
process.on('SIGINT', handleSignal)
|
||||
process.on('SIGTERM', handleSignal)
|
||||
|
||||
function handleSignal() {
|
||||
gracefulShutdownInitiated = true
|
||||
console.warn('graceful shutdown initiated, draining queue')
|
||||
}
|
||||
|
||||
class FileDeletedError extends OError {}
|
||||
|
||||
/** @type {Map<string,{project: Project, projectSoftDeleted: boolean}>} */
|
||||
const PROJECT_CACHE = new Map()
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @return {Promise<{project: Project, projectSoftDeleted: boolean}>}
|
||||
*/
|
||||
async function getProject(projectId) {
|
||||
const cached = PROJECT_CACHE.get(projectId)
|
||||
if (cached) return cached
|
||||
|
||||
let projectSoftDeleted
|
||||
let project = await projectsCollection.findOne({
|
||||
_id: new ObjectId(projectId),
|
||||
})
|
||||
if (project) {
|
||||
projectSoftDeleted = false
|
||||
} else {
|
||||
const softDeleted = await deletedProjectsCollection.findOne({
|
||||
'deleterData.deletedProjectId': new ObjectId(projectId),
|
||||
project: { $exists: true },
|
||||
})
|
||||
if (!softDeleted) {
|
||||
throw new OError('project hard-deleted')
|
||||
}
|
||||
project = softDeleted.project
|
||||
projectSoftDeleted = true
|
||||
}
|
||||
PROJECT_CACHE.set(projectId, { projectSoftDeleted, project })
|
||||
return { projectSoftDeleted, project }
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Folder} folder
|
||||
* @param {string} fileId
|
||||
* @return {{path: string, fileRef: FileRef, folder: Folder}|null}
|
||||
*/
|
||||
function getFileTreePath(folder, fileId) {
|
||||
if (!folder) return null
|
||||
let idx = 0
|
||||
if (Array.isArray(folder.fileRefs)) {
|
||||
for (const fileRef of folder.fileRefs) {
|
||||
if (fileRef?._id.toString() === fileId) {
|
||||
return {
|
||||
fileRef,
|
||||
path: `.fileRefs.${idx}`,
|
||||
folder,
|
||||
}
|
||||
}
|
||||
idx++
|
||||
}
|
||||
}
|
||||
idx = 0
|
||||
if (Array.isArray(folder.folders)) {
|
||||
for (const child of folder.folders) {
|
||||
const match = getFileTreePath(child, fileId)
|
||||
if (match) {
|
||||
return {
|
||||
fileRef: match.fileRef,
|
||||
folder: match.folder,
|
||||
path: `.folders.${idx}${match.path}`,
|
||||
}
|
||||
}
|
||||
idx++
|
||||
}
|
||||
}
|
||||
return null
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @return {Promise<{fileRef: FileRef, folder: Folder, fullPath: string, query: Object, projectSoftDeleted: boolean}>}
|
||||
*/
|
||||
async function findFile(projectId, fileId) {
|
||||
const { projectSoftDeleted, project } = await getProject(projectId)
|
||||
const match = getFileTreePath(project.rootFolder[0], fileId)
|
||||
if (!match) {
|
||||
throw new FileDeletedError('file not found in file-tree', {
|
||||
projectSoftDeleted,
|
||||
})
|
||||
}
|
||||
const { path, fileRef, folder } = match
|
||||
let fullPath
|
||||
let query
|
||||
if (projectSoftDeleted) {
|
||||
fullPath = `project.rootFolder.0${path}`
|
||||
query = {
|
||||
'deleterData.deletedProjectId': new ObjectId(projectId),
|
||||
[`${fullPath}._id`]: new ObjectId(fileId),
|
||||
}
|
||||
} else {
|
||||
fullPath = `rootFolder.0${path}`
|
||||
query = {
|
||||
_id: new ObjectId(projectId),
|
||||
[`${fullPath}._id`]: new ObjectId(fileId),
|
||||
}
|
||||
}
|
||||
return {
|
||||
projectSoftDeleted,
|
||||
query,
|
||||
fullPath,
|
||||
fileRef,
|
||||
folder,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} line
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function fixNotFound(line) {
|
||||
const { projectId, fileId, bucketName } = JSON.parse(line)
|
||||
if (bucketName !== USER_FILES_BUCKET_NAME) {
|
||||
throw new OError('not found case for another bucket')
|
||||
}
|
||||
|
||||
const { projectSoftDeleted, query, fullPath, fileRef, folder } =
|
||||
await findFile(projectId, fileId)
|
||||
logger.info({ projectId, fileId, fileRef }, 'removing fileRef')
|
||||
// Copied from _removeElementFromMongoArray (https://github.com/overleaf/internal/blob/11e09528c153de6b7766d18c3c90d94962190371/services/web/app/src/Features/Project/ProjectEntityMongoUpdateHandler.js)
|
||||
const nonArrayPath = fullPath.slice(0, fullPath.lastIndexOf('.'))
|
||||
let result
|
||||
if (projectSoftDeleted) {
|
||||
result = await deletedProjectsCollection.updateOne(query, {
|
||||
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
|
||||
$inc: { 'project.version': 1 },
|
||||
})
|
||||
} else {
|
||||
result = await projectsCollection.updateOne(query, {
|
||||
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
|
||||
$inc: { version: 1 },
|
||||
})
|
||||
}
|
||||
if (result.matchedCount !== 1) {
|
||||
throw new OError('file-tree write did not match', { result })
|
||||
}
|
||||
// Update the cache. The mongo-path of the next file will be off otherwise.
|
||||
folder.fileRefs = folder.fileRefs.filter(f => !f._id.equals(fileId))
|
||||
return true
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} hash
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function setHashInMongo(projectId, fileId, hash) {
|
||||
const { projectSoftDeleted, query, fullPath, fileRef } = await findFile(
|
||||
projectId,
|
||||
fileId
|
||||
)
|
||||
if (fileRef.hash === hash) return
|
||||
logger.info({ projectId, fileId, fileRef, hash }, 'setting fileRef hash')
|
||||
let result
|
||||
if (projectSoftDeleted) {
|
||||
result = await deletedProjectsCollection.updateOne(query, {
|
||||
$set: { [`${fullPath}.hash`]: hash },
|
||||
$inc: { 'project.version': 1 },
|
||||
})
|
||||
} else {
|
||||
result = await projectsCollection.updateOne(query, {
|
||||
$set: { [`${fullPath}.hash`]: hash },
|
||||
$inc: { version: 1 },
|
||||
})
|
||||
}
|
||||
if (result.matchedCount !== 1) {
|
||||
throw new OError('file-tree write did not match', { result })
|
||||
}
|
||||
fileRef.hash = hash // Update cache for completeness.
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} historyId
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function importRestoredFilestoreFile(projectId, fileId, historyId) {
|
||||
const filestoreKey = `${projectId}/${fileId}`
|
||||
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
|
||||
try {
|
||||
let s
|
||||
try {
|
||||
s = await filestorePersistor.getObjectStream(
|
||||
USER_FILES_BUCKET_NAME,
|
||||
filestoreKey
|
||||
)
|
||||
} catch (err) {
|
||||
if (err instanceof NotFoundError) {
|
||||
throw new OError('missing blob, need to restore filestore file', {
|
||||
filestoreKey,
|
||||
})
|
||||
}
|
||||
throw err
|
||||
}
|
||||
await Stream.promises.pipeline(
|
||||
s,
|
||||
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
|
||||
)
|
||||
const blobStore = new BlobStore(historyId)
|
||||
const blob = await blobStore.putFile(path)
|
||||
await backupBlob(historyId, blob, path)
|
||||
await setHashInMongo(projectId, fileId, blob.getHash())
|
||||
} finally {
|
||||
await fs.promises.rm(path, { force: true })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} path
|
||||
* @return {Promise<Blob>}
|
||||
*/
|
||||
async function bufferFilestoreFileToDisk(projectId, fileId, path) {
|
||||
const filestoreKey = `${projectId}/${fileId}`
|
||||
try {
|
||||
await Stream.promises.pipeline(
|
||||
await filestorePersistor.getObjectStream(
|
||||
USER_FILES_BUCKET_NAME,
|
||||
filestoreKey
|
||||
),
|
||||
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
|
||||
)
|
||||
const blob = await makeBlobForFile(path)
|
||||
blob.setStringLength(
|
||||
await getStringLengthOfFile(blob.getByteLength(), path)
|
||||
)
|
||||
return blob
|
||||
} catch (err) {
|
||||
if (err instanceof NotFoundError) {
|
||||
throw new OError('missing blob, need to restore filestore file', {
|
||||
filestoreKey,
|
||||
})
|
||||
}
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @return {Promise<string>}
|
||||
*/
|
||||
async function computeFilestoreFileHash(projectId, fileId) {
|
||||
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
|
||||
try {
|
||||
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
|
||||
return blob.getHash()
|
||||
} finally {
|
||||
await fs.promises.rm(path, { force: true })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function uploadFilestoreFile(projectId, fileId) {
|
||||
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
|
||||
try {
|
||||
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
|
||||
const hash = blob.getHash()
|
||||
try {
|
||||
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
|
||||
} catch (err) {
|
||||
if (!(err instanceof Blob.NotFoundError)) throw err
|
||||
|
||||
const { project } = await getProject(projectId)
|
||||
const historyId = project.overleaf.history.id.toString()
|
||||
const blobStore = new BlobStore(historyId)
|
||||
await blobStore.putBlob(path, blob)
|
||||
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
|
||||
}
|
||||
} finally {
|
||||
await fs.promises.rm(path, { force: true })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} line
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function fixHashMismatch(line) {
|
||||
const {
|
||||
projectId,
|
||||
fileId,
|
||||
hash: computedHash,
|
||||
entry: {
|
||||
hash: fileTreeHash,
|
||||
ctx: { historyId },
|
||||
},
|
||||
} = JSON.parse(line)
|
||||
const blobStore = new BlobStore(historyId)
|
||||
if (await blobStore.getBlob(fileTreeHash)) {
|
||||
throw new OError('found blob with computed filestore object hash')
|
||||
}
|
||||
if (!(await blobStore.getBlob(computedHash))) {
|
||||
await importRestoredFilestoreFile(projectId, fileId, historyId)
|
||||
return true
|
||||
}
|
||||
return await ensureBlobExistsForFileAndUploadToAWS(
|
||||
projectId,
|
||||
fileId,
|
||||
computedHash
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} hash
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function hashAlreadyUpdatedInFileTree(projectId, fileId, hash) {
|
||||
const { fileRef } = await findFile(projectId, fileId)
|
||||
return fileRef.hash === hash
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} hash
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function needsBackingUpToAWS(projectId, hash) {
|
||||
if (GLOBAL_BLOBS.has(hash)) return false
|
||||
return !(await _blobIsBackedUp(projectId, hash))
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} hash
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) {
|
||||
const { project } = await getProject(projectId)
|
||||
const historyId = project.overleaf.history.id.toString()
|
||||
const blobStore = new BlobStore(historyId)
|
||||
if (
|
||||
(await hashAlreadyUpdatedInFileTree(projectId, fileId, hash)) &&
|
||||
(await blobStore.getBlob(hash)) &&
|
||||
!(await needsBackingUpToAWS(projectId, hash))
|
||||
) {
|
||||
return false // already processed
|
||||
}
|
||||
|
||||
const stream = await blobStore.getStream(hash)
|
||||
const path = `${BUFFER_DIR}/${historyId}_${hash}`
|
||||
try {
|
||||
await Stream.promises.pipeline(
|
||||
stream,
|
||||
fs.createWriteStream(path, {
|
||||
highWaterMark: STREAM_HIGH_WATER_MARK,
|
||||
})
|
||||
)
|
||||
|
||||
const writtenBlob = await makeBlobForFile(path)
|
||||
writtenBlob.setStringLength(
|
||||
await getStringLengthOfFile(writtenBlob.getByteLength(), path)
|
||||
)
|
||||
if (writtenBlob.getHash() !== hash) {
|
||||
// Double check download, better safe than sorry.
|
||||
throw new OError('blob corrupted', { writtenBlob })
|
||||
}
|
||||
|
||||
let blob = await blobStore.getBlob(hash)
|
||||
if (!blob) {
|
||||
// Calling blobStore.putBlob would result in the same error again.
|
||||
// HACK: Skip upload to GCS and finalize putBlob operation directly.
|
||||
await blobStore.backend.insertBlob(historyId, writtenBlob)
|
||||
}
|
||||
await backupBlob(historyId, writtenBlob, path)
|
||||
} finally {
|
||||
await fs.promises.rm(path, { force: true })
|
||||
}
|
||||
await setHashInMongo(projectId, fileId, hash)
|
||||
return true
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} line
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function fixDeletePermission(line) {
|
||||
let { projectId, fileId, hash } = JSON.parse(line)
|
||||
if (!hash) hash = await computeFilestoreFileHash(projectId, fileId)
|
||||
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} line
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function fixMissingHash(line) {
|
||||
let { projectId, _id: fileId } = JSON.parse(line)
|
||||
const {
|
||||
fileRef: { hash },
|
||||
} = await findFile(projectId, fileId)
|
||||
if (hash) {
|
||||
// processed, double check
|
||||
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
|
||||
}
|
||||
await uploadFilestoreFile(projectId, fileId)
|
||||
return true
|
||||
}
|
||||
|
||||
const CASES = {
|
||||
'not found': {
|
||||
match: 'NotFoundError',
|
||||
flag: FIX_NOT_FOUND,
|
||||
action: fixNotFound,
|
||||
},
|
||||
'hash mismatch': {
|
||||
match: 'OError: hash mismatch',
|
||||
flag: FIX_HASH_MISMATCH,
|
||||
action: fixHashMismatch,
|
||||
},
|
||||
'delete permission': {
|
||||
match: 'storage.objects.delete',
|
||||
flag: FIX_DELETE_PERMISSION,
|
||||
action: fixDeletePermission,
|
||||
},
|
||||
'missing file hash': {
|
||||
match: '"bad file hash"',
|
||||
flag: FIX_MISSING_HASH,
|
||||
action: fixMissingHash,
|
||||
},
|
||||
}
|
||||
|
||||
const STATS = {
|
||||
processedLines: 0,
|
||||
success: 0,
|
||||
alreadyProcessed: 0,
|
||||
fileDeleted: 0,
|
||||
skipped: 0,
|
||||
failed: 0,
|
||||
unmatched: 0,
|
||||
}
|
||||
function logStats() {
|
||||
console.log(
|
||||
JSON.stringify({
|
||||
time: new Date(),
|
||||
gracefulShutdownInitiated,
|
||||
...STATS,
|
||||
})
|
||||
)
|
||||
}
|
||||
setInterval(logStats, 10_000)
|
||||
|
||||
async function processLog() {
|
||||
const rl = readline.createInterface({
|
||||
input: fs.createReadStream(LOGS),
|
||||
})
|
||||
nextLine: for await (const line of rl) {
|
||||
if (gracefulShutdownInitiated) break
|
||||
STATS.processedLines++
|
||||
if (
|
||||
!(
|
||||
line.includes('"failed to process file"') ||
|
||||
// Process missing hashes as flagged by find_malformed_filetrees.mjs
|
||||
line.includes('"bad file-tree path"')
|
||||
)
|
||||
) {
|
||||
continue
|
||||
}
|
||||
|
||||
for (const [name, { match, flag, action }] of Object.entries(CASES)) {
|
||||
if (!line.includes(match)) continue
|
||||
if (flag) {
|
||||
try {
|
||||
if (await action(line)) {
|
||||
STATS.success++
|
||||
} else {
|
||||
STATS.alreadyProcessed++
|
||||
}
|
||||
} catch (err) {
|
||||
if (err instanceof FileDeletedError) {
|
||||
STATS.fileDeleted++
|
||||
logger.info({ err, line }, 'file deleted, skipping')
|
||||
} else {
|
||||
STATS.failed++
|
||||
logger.error({ err, line }, `failed to fix ${name}`)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
STATS.skipped++
|
||||
}
|
||||
continue nextLine
|
||||
}
|
||||
STATS.unmatched++
|
||||
logger.warn({ line }, 'unknown fatal error')
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
try {
|
||||
await processLog()
|
||||
} finally {
|
||||
logStats()
|
||||
try {
|
||||
await fs.promises.rm(BUFFER_DIR, { recursive: true, force: true })
|
||||
} catch (err) {
|
||||
console.error(`Cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
|
||||
}
|
||||
}
|
||||
const { skipped, failed, unmatched } = STATS
|
||||
await setTimeout(SLEEP_BEFORE_EXIT)
|
||||
if (failed > 0) {
|
||||
process.exit(Math.min(failed, 99))
|
||||
} else if (unmatched > 0) {
|
||||
process.exit(100)
|
||||
} else if (skipped > 0) {
|
||||
process.exit(101)
|
||||
} else {
|
||||
process.exit(0)
|
||||
}
|
||||
}
|
||||
|
||||
await main()
|
||||
1104
services/history-v1/storage/scripts/backup.mjs
Normal file
File diff suppressed because it is too large
173
services/history-v1/storage/scripts/backup_blob.mjs
Normal file
@@ -0,0 +1,173 @@
|
||||
// @ts-check
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import { backupBlob, downloadBlobToDir } from '../lib/backupBlob.mjs'
|
||||
import withTmpDir from '../../api/controllers/with_tmp_dir.js'
|
||||
import {
|
||||
BlobStore,
|
||||
GLOBAL_BLOBS,
|
||||
loadGlobalBlobs,
|
||||
} from '../lib/blob_store/index.js'
|
||||
import assert from '../lib/assert.js'
|
||||
import knex from '../lib/knex.js'
|
||||
import { client } from '../lib/mongodb.js'
|
||||
import redis from '../lib/redis.js'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
import fs from 'node:fs'
|
||||
|
||||
await loadGlobalBlobs()
|
||||
|
||||
/**
|
||||
* Gracefully shutdown the process
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function gracefulShutdown() {
|
||||
console.log('Gracefully shutting down')
|
||||
await knex.destroy()
|
||||
await client.close()
|
||||
await redis.disconnect()
|
||||
await setTimeout(100)
|
||||
process.exit()
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} row
|
||||
* @return {BackupBlobJob}
|
||||
*/
|
||||
function parseCSVRow(row) {
|
||||
const [historyId, hash] = row.split(',')
|
||||
validateBackedUpBlobJob({ historyId, hash })
|
||||
return { historyId, hash }
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {BackupBlobJob} job
|
||||
*/
|
||||
function validateBackedUpBlobJob(job) {
|
||||
assert.projectId(job.historyId)
|
||||
assert.blobHash(job.hash)
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} path
|
||||
* @return {Promise<Array<BackupBlobJob>>}
|
||||
*/
|
||||
async function readCSV(path) {
|
||||
let fh
|
||||
/** @type {Array<BackupBlobJob>} */
|
||||
const rows = []
|
||||
try {
|
||||
fh = await fs.promises.open(path, 'r')
|
||||
} catch (error) {
|
||||
console.error(`Could not open file: ${error}`)
|
||||
throw error
|
||||
}
|
||||
for await (const line of fh.readLines()) {
|
||||
try {
|
||||
const row = parseCSVRow(line)
|
||||
if (GLOBAL_BLOBS.has(row.hash)) {
|
||||
console.log(`Skipping global blob: ${line}`)
|
||||
continue
|
||||
}
|
||||
rows.push(row)
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : error)
|
||||
console.log(`Skipping invalid row: ${line}`)
|
||||
}
|
||||
}
|
||||
return rows
|
||||
}
|
||||
|
||||
/**
|
||||
* @typedef {Object} BackupBlobJob
|
||||
* @property {string} hash
|
||||
* @property {string} historyId
|
||||
*/
|
||||
|
||||
/**
|
||||
* @param {Object} options
|
||||
* @property {string} [options.historyId]
|
||||
* @property {string} [options.hash]
|
||||
* @property {string} [options.input]
|
||||
* @return {Promise<Array<BackupBlobJob>>}
|
||||
*/
|
||||
async function initialiseJobs({ historyId, hash, input }) {
|
||||
if (input) {
|
||||
return await readCSV(input)
|
||||
}
|
||||
|
||||
if (!historyId) {
|
||||
console.error('historyId is required')
|
||||
process.exitCode = 1
|
||||
await gracefulShutdown()
|
||||
}
|
||||
|
||||
if (!hash) {
|
||||
console.error('hash is required')
|
||||
process.exitCode = 1
|
||||
await gracefulShutdown()
|
||||
}
|
||||
|
||||
validateBackedUpBlobJob({ historyId, hash })
|
||||
|
||||
if (GLOBAL_BLOBS.has(hash)) {
|
||||
console.error(`Blob ${hash} is a global blob; not backing up`)
|
||||
process.exitCode = 1
|
||||
await gracefulShutdown()
|
||||
}
|
||||
return [{ hash, historyId }]
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} historyId
|
||||
* @param {string} hash
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
export async function downloadAndBackupBlob(historyId, hash) {
|
||||
const blobStore = new BlobStore(historyId)
|
||||
const blob = await blobStore.getBlob(hash)
|
||||
if (!blob) {
|
||||
throw new Error(`Blob ${hash} could not be loaded`)
|
||||
}
|
||||
await withTmpDir(`blob-${hash}`, async tmpDir => {
|
||||
const filePath = await downloadBlobToDir(historyId, blob, tmpDir)
|
||||
console.log(`Downloaded blob ${hash} to ${filePath}`)
|
||||
await backupBlob(historyId, blob, filePath)
|
||||
console.log('Backed up blob')
|
||||
})
|
||||
}
|
||||
|
||||
let jobs
|
||||
|
||||
const options = commandLineArgs([
|
||||
{ name: 'historyId', type: String },
|
||||
{ name: 'hash', type: String },
|
||||
{ name: 'input', type: String },
|
||||
])
|
||||
|
||||
try {
|
||||
jobs = await initialiseJobs(options)
|
||||
} catch (error) {
|
||||
console.error(error)
|
||||
await gracefulShutdown()
|
||||
}
|
||||
|
||||
if (!Array.isArray(jobs)) {
|
||||
// This is mostly to satisfy typescript
|
||||
process.exitCode = 1
|
||||
await gracefulShutdown()
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
for (const { historyId, hash } of jobs) {
|
||||
try {
|
||||
await downloadAndBackupBlob(historyId, hash)
|
||||
} catch (error) {
|
||||
console.error(error)
|
||||
process.exitCode = 1
|
||||
}
|
||||
}
|
||||
await gracefulShutdown()
|
||||
153
services/history-v1/storage/scripts/backup_sample.mjs
Normal file
@@ -0,0 +1,153 @@
|
||||
// @ts-check
|
||||
import { ObjectId } from 'mongodb'
|
||||
import { READ_PREFERENCE_SECONDARY } from '@overleaf/mongo-utils/batchedUpdate.js'
|
||||
import { db, client } from '../lib/mongodb.js'
|
||||
|
||||
const projectsCollection = db.collection('projects')
|
||||
|
||||
// Enable caching for ObjectId.toString()
|
||||
ObjectId.cacheHexString = true
|
||||
|
||||
// Configuration
|
||||
const SAMPLE_SIZE_PER_ITERATION = process.argv[2]
|
||||
? parseInt(process.argv[2], 10)
|
||||
: 10000
|
||||
const TARGET_ERROR_PERCENTAGE = process.argv[3]
|
||||
? parseFloat(process.argv[3])
|
||||
: 5.0
|
||||
|
||||
let gracefulShutdownInitiated = false
|
||||
|
||||
process.on('SIGINT', handleSignal)
|
||||
process.on('SIGTERM', handleSignal)
|
||||
|
||||
function handleSignal() {
|
||||
gracefulShutdownInitiated = true
|
||||
console.warn('graceful shutdown initiated')
|
||||
}
|
||||
|
||||
async function takeSample(sampleSize) {
|
||||
const results = await projectsCollection
|
||||
.aggregate(
|
||||
[
|
||||
{ $sample: { size: sampleSize } },
|
||||
{
|
||||
$match: { 'overleaf.backup.lastBackedUpVersion': { $exists: true } },
|
||||
},
|
||||
{
|
||||
$count: 'total',
|
||||
},
|
||||
],
|
||||
{ readPreference: READ_PREFERENCE_SECONDARY }
|
||||
)
|
||||
.toArray()
|
||||
|
||||
const count = results[0]?.total || 0
|
||||
return { totalSampled: sampleSize, backedUp: count }
|
||||
}
|
||||
|
||||
function calculateStatistics(
|
||||
cumulativeSampled,
|
||||
cumulativeBackedUp,
|
||||
totalPopulation
|
||||
) {
|
||||
const proportion = Math.max(1, cumulativeBackedUp) / cumulativeSampled
|
||||
|
||||
// Standard error with finite population correction
|
||||
const fpc = Math.sqrt(
|
||||
(totalPopulation - cumulativeSampled) / (totalPopulation - 1)
|
||||
)
|
||||
const stdError =
|
||||
Math.sqrt((proportion * (1 - proportion)) / cumulativeSampled) * fpc
|
||||
|
||||
// 95% confidence interval is approximately ±1.96 standard errors
|
||||
const marginOfError = 1.96 * stdError
|
||||
|
||||
return {
|
||||
proportion,
|
||||
percentage: (proportion * 100).toFixed(2),
|
||||
marginOfError,
|
||||
errorPercentage: (marginOfError * 100).toFixed(2),
|
||||
lowerBound: ((proportion - marginOfError) * 100).toFixed(2),
|
||||
upperBound: ((proportion + marginOfError) * 100).toFixed(2),
|
||||
sampleSize: cumulativeSampled,
|
||||
populationSize: totalPopulation,
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
console.log('Date:', new Date().toISOString())
|
||||
const totalCount = await projectsCollection.estimatedDocumentCount({
|
||||
readPreference: READ_PREFERENCE_SECONDARY,
|
||||
})
|
||||
console.log(
|
||||
`Total projects in collection (estimated): ${totalCount.toLocaleString()}`
|
||||
)
|
||||
console.log(`Target margin of error: ${TARGET_ERROR_PERCENTAGE}%`)
|
||||
|
||||
let cumulativeSampled = 0
|
||||
let cumulativeBackedUp = 0
|
||||
let currentError = Infinity
|
||||
let iteration = 0
|
||||
|
||||
console.log('Iteration | Total Sampled | % Backed Up | Margin of Error')
|
||||
console.log('----------|---------------|-------------|----------------')
|
||||
|
||||
while (currentError > TARGET_ERROR_PERCENTAGE) {
|
||||
if (gracefulShutdownInitiated) {
|
||||
console.log('Graceful shutdown initiated. Exiting sampling loop.')
|
||||
break
|
||||
}
|
||||
|
||||
iteration++
|
||||
const { totalSampled, backedUp } = await takeSample(
|
||||
SAMPLE_SIZE_PER_ITERATION
|
||||
)
|
||||
cumulativeSampled += totalSampled
|
||||
cumulativeBackedUp += backedUp
|
||||
|
||||
const stats = calculateStatistics(
|
||||
cumulativeSampled,
|
||||
cumulativeBackedUp,
|
||||
totalCount
|
||||
)
|
||||
currentError = parseFloat(stats.errorPercentage)
|
||||
|
||||
console.log(
|
||||
`${iteration.toString().padStart(9)} | ` +
|
||||
`${cumulativeSampled.toString().padStart(13)} | ` +
|
||||
`${stats.percentage.padStart(10)}% | ` +
|
||||
`\u00B1${stats.errorPercentage}%`
|
||||
)
|
||||
|
||||
// Small delay between iterations
|
||||
await new Promise(resolve => setTimeout(resolve, 100))
|
||||
}
|
||||
|
||||
const finalStats = calculateStatistics(
|
||||
cumulativeSampled,
|
||||
cumulativeBackedUp,
|
||||
totalCount
|
||||
)
|
||||
|
||||
console.log(
|
||||
`Projects sampled: ${cumulativeSampled.toLocaleString()} out of ${totalCount.toLocaleString()}`
|
||||
)
|
||||
console.log(
|
||||
`Estimated percentage with lastBackedUpVersion: ${finalStats.percentage}%`
|
||||
)
|
||||
console.log(
|
||||
`95% Confidence Interval: ${finalStats.lowerBound}% - ${finalStats.upperBound}%`
|
||||
)
|
||||
console.log(`Final Margin of Error: \u00B1${finalStats.errorPercentage}%`)
|
||||
}
|
||||
|
||||
main()
|
||||
.then(() => console.log('Done.'))
|
||||
.catch(err => {
|
||||
console.error('Error:', err)
|
||||
process.exitCode = 1
|
||||
})
|
||||
.finally(() => {
|
||||
client.close().catch(err => console.error('Error closing MongoDB:', err))
|
||||
})
|
||||
429
services/history-v1/storage/scripts/backup_scheduler.mjs
Normal file
@@ -0,0 +1,429 @@
|
||||
import Queue from 'bull'
|
||||
import config from 'config'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import logger from '@overleaf/logger'
|
||||
import {
|
||||
listPendingBackups,
|
||||
listUninitializedBackups,
|
||||
getBackupStatus,
|
||||
} from '../lib/backup_store/index.js'
|
||||
|
||||
logger.initialize('backup-queue')
|
||||
|
||||
// Use the same redis config as backup_worker
|
||||
const redisOptions = config.get('redis.queue')
|
||||
|
||||
// Create a Bull queue named 'backup'
|
||||
const backupQueue = new Queue('backup', {
|
||||
redis: redisOptions,
|
||||
defaultJobOptions: {
|
||||
removeOnComplete: true,
|
||||
removeOnFail: true,
|
||||
},
|
||||
})
|
||||
|
||||
// Define command-line options
|
||||
const optionDefinitions = [
|
||||
{ name: 'clean', type: Boolean },
|
||||
{ name: 'status', type: Boolean },
|
||||
{
|
||||
name: 'add',
|
||||
type: String,
|
||||
multiple: true,
|
||||
description: 'Project IDs or date range in YYYY-MM-DD:YYYY-MM-DD format',
|
||||
},
|
||||
{ name: 'monitor', type: Boolean },
|
||||
{
|
||||
name: 'queue-pending',
|
||||
type: Number,
|
||||
description:
|
||||
'Find projects with pending changes older than N seconds and add them to the queue',
|
||||
},
|
||||
{
|
||||
name: 'show-pending',
|
||||
type: Number,
|
||||
description:
|
||||
'Show count of pending projects older than N seconds without adding to queue',
|
||||
},
|
||||
{
|
||||
name: 'limit',
|
||||
type: Number,
|
||||
description: 'Limit the number of jobs to be added',
|
||||
},
|
||||
{
|
||||
name: 'interval',
|
||||
type: Number,
|
||||
description: 'Time in seconds to spread jobs over (default: 300)',
|
||||
defaultValue: 300,
|
||||
},
|
||||
{
|
||||
name: 'backoff-delay',
|
||||
type: Number,
|
||||
description:
|
||||
'Backoff delay in milliseconds for failed jobs (default: 1000)',
|
||||
defaultValue: 1000,
|
||||
},
|
||||
{
|
||||
name: 'attempts',
|
||||
type: Number,
|
||||
description: 'Number of retry attempts for failed jobs (default: 3)',
|
||||
defaultValue: 3,
|
||||
},
|
||||
{
|
||||
name: 'warn-threshold',
|
||||
type: Number,
|
||||
description: 'Warn about any project exceeding this pending age',
|
||||
defaultValue: 2 * 3600, // 2 hours
|
||||
},
|
||||
{
|
||||
name: 'verbose',
|
||||
alias: 'v',
|
||||
type: Boolean,
|
||||
description: 'Show detailed information when used with --show-pending',
|
||||
},
|
||||
]
|
||||
|
||||
// Parse command line arguments
|
||||
const options = commandLineArgs(optionDefinitions)
|
||||
const WARN_THRESHOLD = options['warn-threshold']
|
||||
|
||||
// Helper to validate date format
|
||||
function isValidDateFormat(dateStr) {
|
||||
return /^\d{4}-\d{2}-\d{2}$/.test(dateStr)
|
||||
}
|
||||
|
||||
// Helper to validate the pending time parameter
|
||||
function validatePendingTime(option, value) {
|
||||
if (typeof value !== 'number' || value <= 0) {
|
||||
console.error(
|
||||
`Error: --${option} requires a positive numeric TIME argument in seconds`
|
||||
)
|
||||
console.error(`Example: --${option} 3600`)
|
||||
process.exit(1)
|
||||
}
|
||||
return value
|
||||
}
|
||||
|
||||
// Helper to format the pending time display
|
||||
function formatPendingTime(timestamp) {
|
||||
const now = new Date()
|
||||
const diffMs = now - timestamp
|
||||
const seconds = Math.floor(diffMs / 1000)
|
||||
return `${timestamp.toISOString()} (${seconds} seconds ago)`
|
||||
}
|
||||
|
||||
// Helper to add a job to the queue, checking for duplicates
|
||||
async function addJobWithCheck(queue, data, options) {
|
||||
const jobId = options.jobId
|
||||
|
||||
// Check if the job already exists
|
||||
const existingJob = await queue.getJob(jobId)
|
||||
|
||||
if (existingJob) {
|
||||
return { job: existingJob, added: false }
|
||||
} else {
|
||||
const job = await queue.add(data, options)
|
||||
return { job, added: true }
|
||||
}
|
||||
}
|
||||
|
||||
// Setup queue event listeners
|
||||
function setupMonitoring() {
|
||||
console.log('Starting queue monitoring. Press Ctrl+C to exit.')
|
||||
|
||||
backupQueue.on('global:error', error => {
|
||||
logger.info({ error }, 'Queue error')
|
||||
})
|
||||
|
||||
backupQueue.on('global:waiting', jobId => {
|
||||
logger.info({ jobId }, 'job is waiting')
|
||||
})
|
||||
|
||||
backupQueue.on('global:active', jobId => {
|
||||
logger.info({ jobId }, 'job is now active')
|
||||
})
|
||||
|
||||
backupQueue.on('global:stalled', jobId => {
|
||||
logger.info({ jobId }, 'job has stalled')
|
||||
})
|
||||
|
||||
backupQueue.on('global:progress', (jobId, progress) => {
|
||||
logger.info({ jobId, progress }, 'job progress')
|
||||
})
|
||||
|
||||
backupQueue.on('global:completed', (jobId, result) => {
|
||||
logger.info({ jobId, result }, 'job completed')
|
||||
})
|
||||
|
||||
backupQueue.on('global:failed', (jobId, err) => {
|
||||
logger.info({ jobId, err }, 'job failed')
|
||||
})
|
||||
|
||||
backupQueue.on('global:paused', () => {
|
||||
logger.info({}, 'Queue paused')
|
||||
})
|
||||
|
||||
backupQueue.on('global:resumed', () => {
|
||||
logger.info({}, 'Queue resumed')
|
||||
})
|
||||
|
||||
backupQueue.on('global:cleaned', (jobs, type) => {
|
||||
logger.info({ jobsCount: jobs.length, type }, 'Jobs cleaned')
|
||||
})
|
||||
|
||||
backupQueue.on('global:drained', () => {
|
||||
logger.info({}, 'Queue drained')
|
||||
})
|
||||
|
||||
backupQueue.on('global:removed', jobId => {
|
||||
logger.info({ jobId }, 'Job removed')
|
||||
})
|
||||
}
|
||||
|
||||
async function addDateRangeJob(input) {
|
||||
const [startDate, endDate] = input.split(':')
|
||||
if (!isValidDateFormat(startDate) || !isValidDateFormat(endDate)) {
|
||||
console.error(
|
||||
`Invalid date format for "${input}". Use YYYY-MM-DD:YYYY-MM-DD`
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
const jobId = `backup-${startDate}-to-${endDate}`
|
||||
const { job, added } = await addJobWithCheck(
|
||||
backupQueue,
|
||||
{ startDate, endDate },
|
||||
{ jobId }
|
||||
)
|
||||
|
||||
console.log(
|
||||
`${added ? 'Added' : 'Already exists'}: date range backup job: ${startDate} to ${endDate}, job ID: ${job.id}`
|
||||
)
|
||||
}
|
||||
|
||||
// Helper to list pending and uninitialized backups
|
||||
// This function combines the two cursors into a single generator
|
||||
// to yield projects from both lists
|
||||
async function* pendingCursor(timeIntervalMs, limit) {
|
||||
for await (const project of listPendingBackups(timeIntervalMs, limit)) {
|
||||
yield project
|
||||
}
|
||||
for await (const project of listUninitializedBackups(timeIntervalMs, limit)) {
|
||||
yield project
|
||||
}
|
||||
}
|
||||
|
||||
// Process pending projects with changes older than the specified seconds
|
||||
async function processPendingProjects(
|
||||
age,
|
||||
showOnly,
|
||||
limit,
|
||||
verbose,
|
||||
jobInterval,
|
||||
jobOpts = {}
|
||||
) {
|
||||
const timeIntervalMs = age * 1000
|
||||
console.log(
|
||||
`Finding projects with pending changes older than ${age} seconds${showOnly ? ' (count only)' : ''}`
|
||||
)
|
||||
|
||||
let count = 0
|
||||
let addedCount = 0
|
||||
let existingCount = 0
|
||||
// Pass the limit directly to MongoDB query for better performance
|
||||
const changeTimes = []
|
||||
for await (const project of pendingCursor(timeIntervalMs, limit)) {
|
||||
const projectId = project._id.toHexString()
|
||||
const pendingAt =
|
||||
project.overleaf?.backup?.pendingChangeAt || project._id.getTimestamp()
|
||||
if (pendingAt) {
|
||||
changeTimes.push(pendingAt)
|
||||
const pendingAge = Math.floor((Date.now() - pendingAt.getTime()) / 1000)
|
||||
if (pendingAge > WARN_THRESHOLD) {
|
||||
try {
|
||||
const backupStatus = await getBackupStatus(projectId)
|
||||
logger.warn(
|
||||
{
|
||||
projectId,
|
||||
pendingAt,
|
||||
pendingAge,
|
||||
backupStatus,
|
||||
warnThreshold: WARN_THRESHOLD,
|
||||
},
|
||||
`pending change exceeds rpo warning threshold`
|
||||
)
|
||||
} catch (err) {
|
||||
logger.error(
|
||||
{ projectId, pendingAt, pendingAge },
|
||||
'Error getting backup status'
|
||||
)
|
||||
throw err
|
||||
}
|
||||
}
|
||||
}
|
||||
if (showOnly && verbose) {
|
||||
console.log(
|
||||
`Project: ${projectId} (pending since: ${formatPendingTime(pendingAt)})`
|
||||
)
|
||||
} else if (!showOnly) {
|
||||
const delay = Math.floor(Math.random() * jobInterval * 1000) // add random delay to avoid all jobs running simultaneously
|
||||
const { job, added } = await addJobWithCheck(
|
||||
backupQueue,
|
||||
{ projectId, pendingChangeAt: pendingAt.getTime() },
|
||||
{ ...jobOpts, delay, jobId: projectId }
|
||||
)
|
||||
|
||||
if (added) {
|
||||
if (verbose) {
|
||||
console.log(
|
||||
`Added job for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
|
||||
)
|
||||
}
|
||||
addedCount++
|
||||
} else {
|
||||
if (verbose) {
|
||||
console.log(
|
||||
`Job already exists for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
|
||||
)
|
||||
}
|
||||
existingCount++
|
||||
}
|
||||
}
|
||||
|
||||
count++
|
||||
if (count % 1000 === 0) {
|
||||
console.log(
|
||||
`Processed ${count} projects`,
|
||||
showOnly ? '' : `(${addedCount} added, ${existingCount} existing)`
|
||||
)
|
||||
}
|
||||
}
|
||||
// Set oldestChange to undefined if there are no changes
|
||||
const oldestChange =
|
||||
changeTimes.length > 0
|
||||
? changeTimes.reduce((min, time) => (time < min ? time : min))
|
||||
: undefined
|
||||
|
||||
if (showOnly) {
|
||||
console.log(
|
||||
`Found ${count} projects with pending changes (not added to queue)`
|
||||
)
|
||||
} else {
|
||||
console.log(`Found ${count} projects with pending changes:`)
|
||||
console.log(` ${addedCount} jobs added to queue`)
|
||||
console.log(` ${existingCount} jobs already existed in queue`)
|
||||
if (oldestChange) {
|
||||
console.log(` Oldest pending change: ${formatPendingTime(oldestChange)}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Main execution block
|
||||
async function run() {
|
||||
const optionCount = [
|
||||
options.clean,
|
||||
options.status,
|
||||
options.add,
|
||||
options.monitor,
|
||||
options['queue-pending'] !== undefined,
|
||||
options['show-pending'] !== undefined,
|
||||
].filter(Boolean).length
|
||||
if (optionCount > 1) {
|
||||
console.error('Only one option can be specified')
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
if (options.clean) {
|
||||
const beforeCounts = await backupQueue.getJobCounts()
|
||||
console.log('Current queue state:', JSON.stringify(beforeCounts))
|
||||
console.log('Cleaning completed and failed jobs...')
|
||||
await backupQueue.clean(1, 'completed')
|
||||
await backupQueue.clean(1, 'failed')
|
||||
const afterCounts = await backupQueue.getJobCounts()
|
||||
console.log('Current queue state:', JSON.stringify(afterCounts))
|
||||
console.log('Queue cleaned successfully')
|
||||
} else if (options.status) {
|
||||
const counts = await backupQueue.getJobCounts()
|
||||
console.log('Current queue state:', JSON.stringify(counts))
|
||||
} else if (options.add) {
|
||||
const inputs = Array.isArray(options.add) ? options.add : [options.add]
|
||||
for (const input of inputs) {
|
||||
if (input.includes(':')) {
|
||||
// Handle date range format
|
||||
await addDateRangeJob(input)
|
||||
} else {
|
||||
// Handle project ID format
|
||||
const { job, added } = await addJobWithCheck(
|
||||
backupQueue,
|
||||
{ projectId: input },
|
||||
{ jobId: input }
|
||||
)
|
||||
console.log(
|
||||
`${added ? 'Added' : 'Already exists'}: job for project: ${input}, job ID: ${job.id}`
|
||||
)
|
||||
}
|
||||
}
|
||||
} else if (options.monitor) {
|
||||
setupMonitoring()
|
||||
} else if (options['queue-pending'] !== undefined) {
|
||||
const age = validatePendingTime('queue-pending', options['queue-pending'])
|
||||
await processPendingProjects(
|
||||
age,
|
||||
false,
|
||||
options.limit,
|
||||
options.verbose,
|
||||
options.interval,
|
||||
{
|
||||
attempts: options.attempts,
|
||||
backoff: {
|
||||
type: 'exponential',
|
||||
delay: options['backoff-delay'],
|
||||
},
|
||||
}
|
||||
)
|
||||
} else if (options['show-pending'] !== undefined) {
|
||||
const age = validatePendingTime('show-pending', options['show-pending'])
|
||||
await processPendingProjects(age, true, options.limit, options.verbose)
|
||||
} else {
|
||||
console.log('Usage:')
|
||||
console.log(' --clean Clean up completed and failed jobs')
|
||||
console.log(' --status Show current job counts')
|
||||
console.log(' --add [projectId] Add a job for the specified projectId')
|
||||
console.log(
|
||||
' --add [YYYY-MM-DD:YYYY-MM-DD] Add a job for the specified date range'
|
||||
)
|
||||
console.log(' --monitor Monitor queue events')
|
||||
console.log(
|
||||
' --queue-pending TIME Find projects with changes older than TIME seconds and add them to the queue'
|
||||
)
|
||||
console.log(
|
||||
' --show-pending TIME Show count of pending projects older than TIME seconds'
|
||||
)
|
||||
console.log(' --limit N Limit the number of jobs to be added')
|
||||
console.log(
|
||||
' --interval TIME Time interval in seconds to spread jobs over'
|
||||
)
|
||||
console.log(
|
||||
' --backoff-delay TIME Backoff delay in milliseconds for failed jobs (default: 1000)'
|
||||
)
|
||||
console.log(
|
||||
' --attempts N Number of retry attempts for failed jobs (default: 3)'
|
||||
)
|
||||
console.log(
|
||||
' --verbose, -v Show detailed information when used with --show-pending'
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// Run and handle errors
|
||||
run()
|
||||
.catch(err => {
|
||||
console.error('Error:', err)
|
||||
process.exit(1)
|
||||
})
|
||||
.then(result => {
|
||||
// Only exit if not in monitor mode
|
||||
if (!options.monitor) {
|
||||
process.exit(0)
|
||||
}
|
||||
})
|
||||
144
services/history-v1/storage/scripts/backup_worker.mjs
Normal file
@@ -0,0 +1,144 @@
|
||||
import Queue from 'bull'
|
||||
import logger from '@overleaf/logger'
|
||||
import config from 'config'
|
||||
import metrics from '@overleaf/metrics'
|
||||
import {
|
||||
backupProject,
|
||||
initializeProjects,
|
||||
configureBackup,
|
||||
} from './backup.mjs'
|
||||
|
||||
const CONCURRENCY = 15
|
||||
const WARN_THRESHOLD = 2 * 60 * 60 * 1000 // warn if projects are older than this
|
||||
const redisOptions = config.get('redis.queue')
|
||||
const JOB_TIME_BUCKETS = [10, 100, 500, 1000, 5000, 10000, 30000, 60000] // milliseconds
|
||||
const LAG_TIME_BUCKETS_HRS = [
|
||||
0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.75, 2, 3, 4, 5, 6,
|
||||
] // hours
|
||||
|
||||
// Configure backup settings to match worker concurrency
|
||||
configureBackup({ concurrency: 50, useSecondary: true })
|
||||
|
||||
// Create a Bull queue named 'backup'
|
||||
const backupQueue = new Queue('backup', {
|
||||
redis: redisOptions,
|
||||
settings: {
|
||||
lockDuration: 15 * 60 * 1000, // 15 minutes
|
||||
lockRenewTime: 60 * 1000, // 1 minute
|
||||
maxStalledCount: 0, // mark stalled jobs as failed
|
||||
},
|
||||
})
|
||||
|
||||
// Log queue events
|
||||
backupQueue.on('active', job => {
|
||||
logger.debug({ job }, 'job is now active')
|
||||
})
|
||||
|
||||
backupQueue.on('completed', (job, result) => {
|
||||
metrics.inc('backup_worker_job', 1, { status: 'completed' })
|
||||
logger.debug({ job, result }, 'job completed')
|
||||
})
|
||||
|
||||
backupQueue.on('failed', (job, err) => {
|
||||
metrics.inc('backup_worker_job', 1, { status: 'failed' })
|
||||
logger.error({ job, err }, 'job failed')
|
||||
})
|
||||
|
||||
backupQueue.on('waiting', jobId => {
|
||||
logger.debug({ jobId }, 'job is waiting')
|
||||
})
|
||||
|
||||
backupQueue.on('error', error => {
|
||||
logger.error({ error }, 'queue error')
|
||||
})
|
||||
|
||||
backupQueue.on('stalled', job => {
|
||||
logger.error({ job }, 'job has stalled')
|
||||
})
|
||||
|
||||
backupQueue.on('lock-extension-failed', (job, err) => {
|
||||
logger.error({ job, err }, 'lock extension failed')
|
||||
})
|
||||
|
||||
backupQueue.on('paused', () => {
|
||||
logger.info('queue paused')
|
||||
})
|
||||
|
||||
backupQueue.on('resumed', () => {
|
||||
logger.info('queue resumed')
|
||||
})
|
||||
|
||||
// Process jobs
|
||||
backupQueue.process(CONCURRENCY, async job => {
|
||||
const { projectId, startDate, endDate } = job.data
|
||||
|
||||
if (projectId) {
|
||||
return await runBackup(projectId, job.data, job)
|
||||
} else if (startDate && endDate) {
|
||||
return await runInit(startDate, endDate)
|
||||
} else {
|
||||
throw new Error('invalid job data')
|
||||
}
|
||||
})
|
||||
|
||||
async function runBackup(projectId, data, job) {
|
||||
const { pendingChangeAt } = data
|
||||
// record the time it takes to run the backup job
|
||||
const timer = new metrics.Timer(
|
||||
'backup_worker_job_duration',
|
||||
1,
|
||||
{},
|
||||
JOB_TIME_BUCKETS
|
||||
)
|
||||
const pendingAge = Date.now() - pendingChangeAt
|
||||
if (pendingAge > WARN_THRESHOLD) {
|
||||
logger.warn(
|
||||
{ projectId, pendingAge, job },
|
||||
'project has been pending for a long time'
|
||||
)
|
||||
}
|
||||
try {
|
||||
logger.debug({ projectId }, 'processing backup for project')
|
||||
await backupProject(projectId, {})
|
||||
metrics.inc('backup_worker_project', 1, {
|
||||
status: 'success',
|
||||
})
|
||||
timer.done()
|
||||
// record the replication lag (time from change to backup)
|
||||
if (pendingChangeAt) {
|
||||
metrics.histogram(
|
||||
'backup_worker_replication_lag_in_hours',
|
||||
(Date.now() - pendingChangeAt) / (3600 * 1000),
|
||||
LAG_TIME_BUCKETS_HRS
|
||||
)
|
||||
}
|
||||
return `backup completed ${projectId}`
|
||||
} catch (err) {
|
||||
metrics.inc('backup_worker_project', 1, { status: 'failed' })
|
||||
logger.error({ projectId, err }, 'backup failed')
|
||||
throw err // Re-throw to mark job as failed
|
||||
}
|
||||
}
|
||||
|
||||
async function runInit(startDate, endDate) {
|
||||
try {
|
||||
logger.info({ startDate, endDate }, 'initializing projects')
|
||||
await initializeProjects({ 'start-date': startDate, 'end-date': endDate })
|
||||
return `initialization completed ${startDate} - ${endDate}`
|
||||
} catch (err) {
|
||||
logger.error({ startDate, endDate, err }, 'initialization failed')
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
export async function drainQueue() {
|
||||
logger.info({ queue: backupQueue.name }, 'pausing queue')
|
||||
await backupQueue.pause(true) // pause this worker and wait for jobs to finish
|
||||
logger.info({ queue: backupQueue.name }, 'closing queue')
|
||||
await backupQueue.close()
|
||||
}
|
||||
|
||||
export async function healthCheck() {
|
||||
const count = await backupQueue.count()
|
||||
metrics.gauge('backup_worker_queue_length', count)
|
||||
}
|
||||
69
services/history-v1/storage/scripts/export_global_blobs.mjs
Normal file
@@ -0,0 +1,69 @@
/**
 * A script to export the global blobs from mongo to a CSV file.
 *
 * node storage/scripts/export_global_blobs.mjs --output global_blobs.csv
 *
 * The output CSV has the following format:
 *
 * hash,path,byteLength,stringLength,demoted
 *
 * hash: the hash of the blob
 * path: the path of the blob in the blob store
 * byteLength: the byte length of the blob, or empty if unknown
 * stringLength: the string length of the blob, or empty if unknown
 * demoted: true if the blob has been demoted to a reference, false otherwise
 */

// @ts-check
import { ObjectId } from 'mongodb'
import { GLOBAL_BLOBS, loadGlobalBlobs } from '../lib/blob_store/index.js'
import { client } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import fs from 'node:fs'

// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true

function parseArgs() {
  const args = commandLineArgs([
    {
      name: 'output',
      type: String,
      alias: 'o',
    },
  ])
  const OUTPUT_STREAM = fs.createWriteStream(args['output'], { flags: 'wx' })

  return {
    OUTPUT_STREAM,
  }
}

const { OUTPUT_STREAM } = parseArgs()

async function main() {
  await loadGlobalBlobs()
  OUTPUT_STREAM.write('hash,path,byteLength,stringLength,demoted\n')
  for (const [hash, { blob, demoted }] of GLOBAL_BLOBS) {
    const { hash: blobHash, byteLength, stringLength } = blob
    if (blobHash !== hash) {
      throw new Error(`hash mismatch: ${hash} !== ${blobHash}`)
    }
    const path = blobHash.slice(0, 2) + '/' + blobHash.slice(2)
    const byteLengthStr = byteLength === null ? '' : byteLength
    const stringLengthStr = stringLength === null ? '' : stringLength
    OUTPUT_STREAM.write(
      `${hash},${path},${byteLengthStr},${stringLengthStr},${demoted}\n`
    )
  }
}

main()
  .then(() => console.log('Done.'))
  .catch(err => {
    console.error('Error:', err)
    process.exitCode = 1
  })
  .finally(() => {
    client.close().catch(err => console.error('Error closing MongoDB:', err))
  })
@@ -0,0 +1,51 @@
// @ts-check
import { backedUpBlobs } from '../lib/mongodb.js'
import { mongoId } from '../lib/assert.js'
import { ObjectId } from 'mongodb'
import commandLineArgs from 'command-line-args'

const STATS = {
  total: 0,
  replaced: 0,
  skipped: 0,
}

const config = commandLineArgs([
  { name: 'commit', type: Boolean, defaultValue: false },
])

async function processRecord(record) {
  STATS.total++
  try {
    mongoId(record._id)
    const newId = new ObjectId(record._id)
    if (config.commit) {
      await backedUpBlobs.updateOne(
        { _id: newId },
        {
          $addToSet: { blobs: { $each: record.blobs } },
        },
        { upsert: true }
      )
      await backedUpBlobs.deleteOne({ _id: record._id })
    }
    STATS.replaced++
  } catch (error) {
    console.log(error)
    STATS.skipped++
  }
}

const cursor = backedUpBlobs
  .find({ _id: { $type: 'string' } })
  .project({ _id: 1, blobs: 1 })

while (await cursor.hasNext()) {
  const record = await cursor.next()
  await processRecord(record)
}

console.log(
  `${!config.commit ? 'DRY RUN' : ''} ${STATS.total} records ${STATS.replaced} replaced, ${STATS.skipped} skipped`
)
process.exit()
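For context, an illustrative sketch of the record shape this migration rewrites; the values below are invented for illustration and are not data from this commit.

// Illustrative only: a backedUpBlobs record keyed by a string id is merged into
// one keyed by an ObjectId, combining the blobs arrays via $addToSet.
// before: { _id: '507f191e810c19729de860ea', blobs: [/* blob hashes */] }
// after:  { _id: ObjectId('507f191e810c19729de860ea'), blobs: [/* merged blob hashes */] }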
File diff suppressed because it is too large
@@ -0,0 +1,3 @@
UPDATE blobs
SET global = TRUE
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);
@@ -0,0 +1,16 @@
CREATE TABLE global_blobs (
  hash_bytes bytea NOT NULL,
  byte_length integer NOT NULL,
  string_length integer,
  global boolean,
  CONSTRAINT global_blobs_pkey PRIMARY KEY (hash_bytes),
  CONSTRAINT global_blobs_byte_length_non_negative
    CHECK (byte_length >= 0),
  CONSTRAINT global_blobs_string_length_non_negative
    CHECK (string_length IS NULL OR string_length >= 0)
);

INSERT INTO global_blobs (hash_bytes, byte_length, string_length, global)
SELECT hash_bytes, byte_length, string_length, true
FROM blobs
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);
@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO old_blobs;
ALTER TABLE global_blobs RENAME TO blobs;

ALTER TABLE old_blobs
  RENAME CONSTRAINT blobs_pkey TO old_blobs_pkey;
ALTER TABLE old_blobs
  RENAME CONSTRAINT blobs_byte_length_non_negative
  TO old_blobs_byte_length_non_negative;
ALTER TABLE old_blobs
  RENAME CONSTRAINT blobs_string_length_non_negative
  TO old_blobs_string_length_non_negative;

ALTER TABLE blobs
  RENAME CONSTRAINT global_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
  RENAME CONSTRAINT global_blobs_byte_length_non_negative
  TO blobs_byte_length_non_negative;
ALTER TABLE blobs
  RENAME CONSTRAINT global_blobs_string_length_non_negative
  TO blobs_string_length_non_negative;
COMMIT;
@@ -0,0 +1,9 @@
Scripts in this directory were used when we cleaned up the global blobs table,
ensuring that it only contained global blobs. The scripts are meant to be run in this order:

* `01-create-blob-hashes-table.sql`
* `02-set-global-flag.sql`
* `03-create-global-blobs-table.sql`
* `04-swap-global-blob-tables.sql`

The `rollback.sql` script can be run to reverse the effect of `04-swap-global-blob-tables.sql`.
@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO global_blobs;
ALTER TABLE old_blobs RENAME TO blobs;

ALTER TABLE global_blobs
  RENAME CONSTRAINT blobs_pkey TO global_blobs_pkey;
ALTER TABLE global_blobs
  RENAME CONSTRAINT blobs_byte_length_non_negative
  TO global_blobs_byte_length_non_negative;
ALTER TABLE global_blobs
  RENAME CONSTRAINT blobs_string_length_non_negative
  TO global_blobs_string_length_non_negative;

ALTER TABLE blobs
  RENAME CONSTRAINT old_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
  RENAME CONSTRAINT old_blobs_byte_length_non_negative
  TO blobs_byte_length_non_negative;
ALTER TABLE blobs
  RENAME CONSTRAINT old_blobs_string_length_non_negative
  TO blobs_string_length_non_negative;
COMMIT;
379
services/history-v1/storage/scripts/recover_doc_versions.js
Normal file
379
services/history-v1/storage/scripts/recover_doc_versions.js
Normal file
@@ -0,0 +1,379 @@
const fsPromises = require('node:fs/promises')
const { ObjectId } = require('mongodb')
const BPromise = require('bluebird')
const logger = require('@overleaf/logger')
const Settings = require('@overleaf/settings')
const rclient = require('@overleaf/redis-wrapper').createClient(
  Settings.redis.documentupdater
)
const mongodb = require('../lib/mongodb')
const { chunkStore } = require('..')
const Events = require('node:events')

// Silence warning.
Events.setMaxListeners(20)

const BATCH_SIZE = 1000
const OPTIONS = {
  concurrency: parseInt(process.env.DOC_VERSION_RECOVERY_CONCURRENCY, 10) || 20,
  force: process.env.DOC_VERSION_RECOVERY_FORCE === 'true',
  'skip-history-failures':
    process.env.DOC_VERSION_RECOVERY_SKIP_HISTORY_FAILURES === 'true',
  'resyncs-needed-file': process.env.DOC_VERSION_RECOVERY_RESYNCS_NEEDED_FILE,
}

const db = {
  deletedProjects: mongodb.db.collection('deletedProjects'),
  docs: mongodb.db.collection('docs'),
  migrations: mongodb.db.collection('migrations'),
  projects: mongodb.db.collection('projects'),
}

const BAD_MIGRATION_NAME =
  '20231219081700_move_doc_versions_from_docops_to_docs'

const RECOVERY_FILES_502 = [
  '/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log',
  '/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log.done',
]

let loggingChain = Promise.resolve()
const projectIdsThatNeedResyncing = []
const unflushedDocIds = new Set()

async function flushLogQueue() {
  const logPath = OPTIONS['resyncs-needed-file']
  loggingChain = loggingChain.then(async () => {
    const batch = projectIdsThatNeedResyncing.splice(0)
    if (batch.length === 0) return
    try {
      await fsPromises.appendFile(logPath, batch.join('\n') + '\n')
    } catch (err) {
      projectIdsThatNeedResyncing.push(...batch)
      logger.err({ err, logPath, batch }, 'Failed to write to log file')
    }
  })
  await loggingChain
}

async function recordProjectNeedsResync(projectId) {
  if (OPTIONS['resyncs-needed-file']) {
    projectIdsThatNeedResyncing.push(projectId)
    await flushLogQueue()
  } else {
    console.log(`Project ${projectId} needs a hard resync.`)
  }
}

async function main() {
  const recovery502Ran = await did502RecoveryRun()
  await getUnflushedDocIds()
  const badMigration = await db.migrations.findOne({ name: BAD_MIGRATION_NAME })

  if (unflushedDocIds.size > 0 && !recovery502Ran && badMigration != null) {
    // Tell customers that they need to flush
    console.log(`
--------------------------------------------------------------------
Detected unflushed changes while recovering doc versions.
Please go back to version 5.0.1 and follow the recovery procedure
for flushing document updates:

https://github.com/overleaf/overleaf/wiki/Doc-version-recovery
--------------------------------------------------------------------`)
    process.exit(1)
  }

  if (OPTIONS.force || recovery502Ran || badMigration != null) {
    console.warn('Need to recover doc versions. This will take a while.')
    await runRecovery()
    await db.migrations.deleteOne({ name: BAD_MIGRATION_NAME })
    await delete502RecoveryFiles()
  }

  console.log('Done.')
}

async function did502RecoveryRun() {
  for (const file of RECOVERY_FILES_502) {
    try {
      await fsPromises.stat(file)
      return true
    } catch (err) {
      // file doesn't exist. continue
    }
  }
  return false
}

async function delete502RecoveryFiles() {
  for (const file of RECOVERY_FILES_502) {
    try {
      await fsPromises.rename(file, file.replace('.log', '-5.0.2.log'))
    } catch (err) {
      // file doesn't exist. continue
    }
  }
}

async function runRecovery() {
  let batch = []
  const summary = {
    ignored: 0,
    skipped: 0,
    deletedUpdatedMongo: 0,
    deletedUpdatedRedis: 0,
    deletedUpdatedBoth: 0,
    deletedIgnored: 0,
    updatedMongo: 0,
    updatedRedis: 0,
    updatedBoth: 0,
  }
  const processBatchAndLogProgress = async () => {
    try {
      await BPromise.map(batch, project => processProject(project, summary), {
        concurrency: OPTIONS.concurrency,
      })
    } finally {
      console.log(`${summary.updatedRedis} projects updated in Redis`)
      console.log(`${summary.updatedMongo} projects updated in Mongo`)
      console.log(
        `${summary.updatedBoth} projects updated in both Mongo and Redis`
      )
      console.log(`${summary.ignored} projects had good versions`)
      console.log(
        `${summary.deletedUpdatedMongo} deleted projects updated in Mongo`
      )
      console.log(
        `${summary.deletedUpdatedRedis} deleted projects updated in Redis`
      )
      console.log(
        `${summary.deletedUpdatedBoth} deleted projects updated in both Mongo and Redis`
      )
      console.log(
        `${summary.deletedIgnored} deleted projects had good versions`
      )
      console.log(`${summary.skipped} projects skipped`)
    }
    batch = []
  }

  await printDBStats()
  await initResyncsNeededFile()
  for await (const project of getProjects()) {
    batch.push(project)
    if (batch.length >= BATCH_SIZE) {
      await processBatchAndLogProgress()
    }
  }

  for await (const deletedProject of getDeletedProjects()) {
    const project = deletedProject.project
    project.isDeleted = true
    batch.push(project)
    if (batch.length >= BATCH_SIZE) {
      await processBatchAndLogProgress()
    }
  }

  if (batch.length > 0) {
    await processBatchAndLogProgress()
  }

  await backfillMissingVersions()
}

async function getUnflushedDocIds() {
  const batchSize = 1000
  let cursor = '0'
  do {
    const [newCursor, keys] = await rclient.scan(
      cursor,
      'MATCH',
      Settings.redis.documentupdater.key_schema.docVersion({ doc_id: '*' }),
      'COUNT',
      batchSize
    )
    for (const key of keys) {
      unflushedDocIds.add(key.slice('DocVersion:'.length))
    }
    cursor = newCursor
  } while (cursor !== '0')
}

async function printDBStats() {
  const projects = await db.projects.estimatedDocumentCount()
  const deletedProjects = await db.deletedProjects.countDocuments()
  const docs = await db.docs.estimatedDocumentCount()
  console.log(
    `Need to check ${projects} projects and up-to ${deletedProjects} deleted projects with a total of ${docs} docs.`
  )
}

async function initResyncsNeededFile() {
  const logPath = OPTIONS['resyncs-needed-file']
  if (logPath) {
    await fsPromises.writeFile(logPath, '')
    await fsPromises.rm(`${logPath}.done`, { force: true })
  }
}

function getProjects() {
  return db.projects.find({}, { projection: { _id: 1, overleaf: 1 } })
}

function getDeletedProjects() {
  return db.deletedProjects.find(
    { 'project.overleaf.history.id': { $exists: true } },
    { projection: { 'project._id': 1, 'project.overleaf': 1 } }
  )
}

async function processProject(project, summary) {
  const projectId = project._id.toString()
  let updatedMongo = false
  let updatedRedis = false
  try {
    const historyDocVersions = await getHistoryDocVersions(project)

    for (const { docId, version } of historyDocVersions) {
      const update = await fixDocVersion(docId, version)
      if (update != null) {
        if (update.in === 'mongo') {
          updatedMongo = true
        } else if (update.in === 'redis') {
          updatedRedis = true
        }
      }
    }

    if (project.isDeleted) {
      if (updatedMongo && updatedRedis) {
        summary.deletedUpdatedBoth += 1
      } else if (updatedMongo) {
        summary.deletedUpdatedMongo += 1
      } else if (updatedRedis) {
        summary.deletedUpdatedRedis += 1
      } else {
        summary.deletedIgnored += 1
      }
    } else {
      await recordProjectNeedsResync(projectId)
      if (updatedMongo && updatedRedis) {
        summary.updatedBoth += 1
      } else if (updatedMongo) {
        summary.updatedMongo += 1
      } else if (updatedRedis) {
        summary.updatedRedis += 1
      } else {
        summary.ignored += 1
      }
    }
  } catch (err) {
    logger.error({ err, projectId }, 'Failed to process project')
    if (OPTIONS['skip-history-failures']) {
      summary.skipped += 1
    } else {
      throw err
    }
  }
}

async function getHistoryDocVersions(project) {
  const historyId = project.overleaf.history.id
  const chunk = await chunkStore.loadLatest(historyId)
  if (chunk == null) {
    return []
  }

  const snapshot = chunk.getSnapshot()
  const changes = chunk.getChanges()
  snapshot.applyAll(changes)
  const v2DocVersions = snapshot.getV2DocVersions()
  if (v2DocVersions == null) {
    return []
  }
  return Object.entries(v2DocVersions.data).map(([docId, versionInfo]) => ({
    docId,
    version: versionInfo.v,
  }))
}

async function fixDocVersion(docId, historyVersion) {
  const redisVersion = await getRedisDocVersion(docId)
  if (redisVersion != null && historyVersion >= redisVersion) {
    await setRedisDocVersion(docId, historyVersion + 1)
    return {
      in: 'redis',
      previousVersion: redisVersion,
      newVersion: historyVersion + 1,
    }
  } else {
    const docBeforeUpdate = await db.docs.findOneAndUpdate(
      {
        _id: new ObjectId(docId),
        $or: [
          { version: { $lte: historyVersion } },
          { version: { $exists: false } },
        ],
      },
      { $set: { version: historyVersion + 1 } },
      { projection: { _id: 1, version: 1 } }
    )

    if (docBeforeUpdate != null) {
      return {
        in: 'mongo',
        previousVersion: docBeforeUpdate.version,
        newVersion: historyVersion + 1,
      }
    } else {
      return null
    }
  }
}

async function getRedisDocVersion(docId) {
  if (!unflushedDocIds.has(docId)) {
    return null
  }
  const result = await rclient.get(
    Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId })
  )
  if (result == null) {
    return null
  }
  return parseInt(result, 10)
}

async function setRedisDocVersion(docId, version) {
  const multi = rclient.multi()
  multi.set(
    Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId }),
    version
  )
  multi.set(`UnflushedTime:{${docId}}`, Date.now(), 'NX')
  await multi.exec()
}

/**
 * Set all remaining versions to 0
 */
async function backfillMissingVersions() {
  console.log('Defaulting version to 0 for remaining docs.')
  await db.docs.updateMany(
    { version: { $exists: false } },
    { $set: { version: 0 } }
  )
}

main()
  .finally(async () => {
    console.log('Flushing log queue.')
    await flushLogQueue()
  })
  .then(() => {
    process.exit(0)
  })
  .catch(err => {
    console.error(err)
    process.exit(1)
  })
255
services/history-v1/storage/scripts/recover_zip.js
Normal file
255
services/history-v1/storage/scripts/recover_zip.js
Normal file
@@ -0,0 +1,255 @@
/**
 * Try to recover a zip of the latest version of a project using only data in
 * GCS, where this data may have been (recently) hard deleted (i.e. may exist
 * wholly or in part as non-current versions). This should be able to
 * retrieve the latest content of a project up to 180 days after it was
 * deleted.
 *
 * Usage:
 * node recover_zip.js [--verbose] <HISTORY_ID> <HISTORY_ID> ...
 *
 * Output:
 * Signed URL(s) for the uploaded zip files. Note that these are valid for
 * only 24h, to match the lifecycle rule on the zip bucket.
 */

const fs = require('node:fs')
const os = require('node:os')
const path = require('node:path')
const util = require('node:util')

// Something is registering 11 listeners, over the limit of 10, which generates
// a lot of warning noise.
require('node:events').EventEmitter.defaultMaxListeners = 11

const config = require('config')
// We depend on this via object-persistor.
// eslint-disable-next-line import/no-extraneous-dependencies
const { Storage } = require('@google-cloud/storage')
const isValidUtf8 = require('utf-8-validate')

const core = require('overleaf-editor-core')
const projectKey = require('../lib/project_key')
const streams = require('../lib/streams')
const ProjectArchive = require('../lib/project_archive')

const {
  values: { verbose: VERBOSE },
  positionals: HISTORY_IDS,
} = util.parseArgs({
  options: {
    verbose: {
      type: 'boolean',
      default: false,
    },
  },
  allowPositionals: true,
})

if (HISTORY_IDS.length === 0) {
  console.error('no history IDs; see usage')
  process.exit(1)
}

async function listDeletedChunks(historyId) {
  const bucketName = config.get('chunkStore.bucket')
  const storage = new Storage()
  const [files] = await storage.bucket(bucketName).getFiles({
    prefix: projectKey.format(historyId),
    versions: true,
  })
  return files
}

async function findLatestChunk(historyId) {
  const files = await listDeletedChunks(historyId)
  if (files.length === 0) return null
  files.sort((a, b) => {
    if (a.name < b.name) return -1
    if (a.name > b.name) return 1
    return 0
  })
  return files[files.length - 1]
}

async function downloadLatestChunk(tmp, historyId) {
  const latestChunkFile = await findLatestChunk(historyId)
  if (!latestChunkFile) throw new Error('no chunk found to recover')

  const destination = path.join(tmp, 'latest.json')
  await latestChunkFile.download({ destination })
  return destination
}

async function loadHistory(historyPathname) {
  const data = await fs.promises.readFile(historyPathname)
  const rawHistory = JSON.parse(data)
  return core.History.fromRaw(rawHistory)
}

async function loadChunk(historyPathname, blobStore) {
  const history = await loadHistory(historyPathname)

  const blobHashes = new Set()
  history.findBlobHashes(blobHashes)

  await blobStore.fetchBlobs(blobHashes)
  await history.loadFiles('lazy', blobStore)

  return new core.Chunk(history, 0)
}

// TODO: it would be nice to export / expose this from BlobStore;
// currently this is a copy of the method there.
async function getStringLengthOfFile(byteLength, pathname) {
  // We have to read the file into memory to get its UTF-8 length, so don't
  // bother for files that are too large for us to edit anyway.
  if (byteLength > core.Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
    return null
  }

  // We need to check if the file contains nonBmp or null characters
  let data = await fs.promises.readFile(pathname)
  if (!isValidUtf8(data)) return null
  data = data.toString()
  if (data.length > core.TextOperation.MAX_STRING_LENGTH) return null
  if (core.util.containsNonBmpChars(data)) return null
  if (data.indexOf('\x00') !== -1) return null
  return data.length
}

class RecoveryBlobStore {
  constructor(historyId, tmp) {
    this.historyId = historyId
    this.tmp = tmp
    this.blobs = new Map()
  }

  async fetchBlobs(blobHashes) {
    for await (const blobHash of blobHashes) {
      await this.fetchBlob(blobHash)
    }
  }

  async fetchBlob(hash) {
    if (this.blobs.has(hash)) return

    if (VERBOSE) console.log('fetching blob', hash)

    const bucketName = config.get('blobStore.projectBucket')
    const storage = new Storage()
    const [files] = await storage.bucket(bucketName).getFiles({
      prefix: this.makeProjectBlobKey(hash),
      versions: true,
    })

    const destination = this.getBlobPathname(hash)

    if (files.length === 0) {
      await this.fetchGlobalBlob(hash, destination)
    } else if (files.length === 1) {
      await files[0].download({ destination })
    } else {
      throw new Error('Multiple versions of blob ' + hash)
    }

    this.blobs.set(hash, await this.makeBlob(hash, destination))
  }

  async fetchGlobalBlob(hash, destination) {
    const bucketName = config.get('blobStore.globalBucket')
    const storage = new Storage()
    const file = storage.bucket(bucketName).file(this.makeGlobalBlobKey(hash))
    await file.download({ destination })
  }

  async makeBlob(hash, pathname) {
    const stat = await fs.promises.stat(pathname)
    const byteLength = stat.size
    const stringLength = await getStringLengthOfFile(byteLength, pathname)
    return new core.Blob(hash, byteLength, stringLength)
  }

  async getString(hash) {
    const stream = await this.getStream(hash)
    const buffer = await streams.readStreamToBuffer(stream)
    return buffer.toString()
  }

  async getStream(hash) {
    return fs.createReadStream(this.getBlobPathname(hash))
  }

  async getBlob(hash) {
    return this.blobs.get(hash)
  }

  getBlobPathname(hash) {
    return path.join(this.tmp, hash)
  }

  makeGlobalBlobKey(hash) {
    return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
  }

  makeProjectBlobKey(hash) {
    return `${projectKey.format(this.historyId)}/${hash.slice(
      0,
      2
    )}/${hash.slice(2)}`
  }
}

async function uploadZip(historyId, zipPathname) {
  const bucketName = config.get('zipStore.bucket')
  const deadline = 24 * 3600 * 1000 // lifecycle limit on the zips bucket
  const storage = new Storage()
  const destination = `${historyId}-recovered.zip`
  await storage.bucket(bucketName).upload(zipPathname, { destination })

  const signedUrls = await storage
    .bucket(bucketName)
    .file(destination)
    .getSignedUrl({
      version: 'v4',
      action: 'read',
      expires: Date.now() + deadline,
    })

  return signedUrls[0]
}

async function restoreProject(historyId) {
  const tmp = await fs.promises.mkdtemp(
    path.join(os.tmpdir(), historyId.toString())
  )
  if (VERBOSE) console.log('recovering', historyId, 'in', tmp)

  const latestJsonPathname = await downloadLatestChunk(tmp, historyId)
  const blobStore = new RecoveryBlobStore(historyId, tmp)
  const chunk = await loadChunk(latestJsonPathname, blobStore)

  const snapshot = chunk.getSnapshot()
  for (const change of chunk.getChanges()) {
    change.applyTo(snapshot)
  }

  if (VERBOSE) console.log('zipping', historyId)

  const zipPathname = path.join(tmp, `${historyId}.zip`)
  const zipTimeoutMs = 60 * 1000
  const archive = new ProjectArchive(snapshot, zipTimeoutMs)
  await archive.writeZip(blobStore, zipPathname)

  if (VERBOSE) console.log('uploading', historyId)

  return await uploadZip(historyId, zipPathname)
}

async function main() {
  for (const historyId of HISTORY_IDS) {
    const signedUrl = await restoreProject(historyId)
    console.log(signedUrl)
  }
}

main().catch(console.error)
36
services/history-v1/storage/scripts/redis.mjs
Normal file
36
services/history-v1/storage/scripts/redis.mjs
Normal file
@@ -0,0 +1,36 @@
import redis from '@overleaf/redis-wrapper'
import config from 'config'

// Get allowed Redis dbs from config
const redisConfig = config.get('redis')
const allowedDbs = Object.keys(redisConfig)

// Get the Redis db from the command line argument
const db = process.argv[2]

// Validate redis db
if (!allowedDbs.includes(db)) {
  if (db) {
    console.error('Invalid redis db:', db)
  }
  console.error(`Usage: node redis.mjs [${allowedDbs.join('|')}]`)
  process.exit(1)
}

// Get redis options based on command line argument
const redisOptions = config.get(`redis.${db}`)
console.log('Using redis db:', db)
console.log('REDIS CONFIG', {
  ...redisOptions,
  password: '*'.repeat(redisOptions.password?.length),
})
const rclient = redis.createClient(redisOptions)

try {
  await rclient.healthCheck()
  console.log('REDIS HEALTHCHECK SUCCEEDED')
} catch (error) {
  console.error('REDIS HEALTHCHECK FAILED', error)
} finally {
  await rclient.quit()
}
104
services/history-v1/storage/scripts/remove_backed_up_blobs.mjs
Normal file
104
services/history-v1/storage/scripts/remove_backed_up_blobs.mjs
Normal file
@@ -0,0 +1,104 @@
// @ts-check
import { readFileSync } from 'node:fs'
import commandLineArgs from 'command-line-args'
import { client } from '../lib/mongodb.js'
import {
  getBackedUpBlobHashes,
  unsetBackedUpBlobHashes,
} from '../lib/backup_store/index.js'

let gracefulShutdownInitiated = false

// Parse command line arguments
const args = commandLineArgs([
  { name: 'input', type: String, alias: 'i', defaultOption: true },
  { name: 'commit', type: Boolean, default: false },
])

if (!args.input) {
  console.error(
    'Usage: node remove_backed_up_blobs.mjs --input <csv-file> [--commit]'
  )
  process.exit(1)
}

if (!args.commit) {
  console.log('Running in dry-run mode. Use --commit to apply changes.')
}

// Signal handling
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)

function handleSignal() {
  console.warn('Graceful shutdown initiated')
  gracefulShutdownInitiated = true
}

// Process CSV and remove blobs
async function main() {
  const projectBlobs = new Map()
  const lines = readFileSync(args.input, 'utf8').split('\n')
  const SHA1_HEX_REGEX = /^[a-f0-9]{40}$/

  // Skip header
  for (const line of lines.slice(1)) {
    if (!line.trim() || gracefulShutdownInitiated) break

    const [projectId, path] = line.split(',')
    const pathParts = path.split('/')
    const hash = pathParts[3] + pathParts[4]

    if (!SHA1_HEX_REGEX.test(hash)) {
      console.warn(`Invalid SHA1 hash for project ${projectId}: ${hash}`)
      continue
    }

    if (!projectBlobs.has(projectId)) {
      projectBlobs.set(projectId, new Set())
    }
    projectBlobs.get(projectId).add(hash)
  }

  // Process each project
  for (const [projectId, hashes] of projectBlobs) {
    if (gracefulShutdownInitiated) break

    if (!args.commit) {
      console.log(
        `DRY-RUN: would remove ${hashes.size} blobs from project ${projectId}`
      )
      continue
    }

    try {
      const originalHashes = await getBackedUpBlobHashes(projectId)
      if (originalHashes.size === 0) {
        continue
      }
      const result = await unsetBackedUpBlobHashes(
        projectId,
        Array.from(hashes)
      )
      if (result) {
        console.log(
          `Project ${projectId}: want to remove ${hashes.size}, removed ${originalHashes.size - result.blobs.length}, ${result.blobs.length} remaining`
        )
      }
    } catch (err) {
      console.error(`Error updating project ${projectId}:`, err)
    }
  }
}

// Run the script
main()
  .catch(err => {
    console.error('Fatal error:', err)
    process.exitCode = 1
  })
  .finally(() => {
    client
      .close()
      .catch(err => console.error('Error closing MongoDB connection:', err))
  })
@@ -0,0 +1,221 @@
// @ts-check

/**
 * This script is used to remove blobs that have been backed up under the project ID
 * instead of the history ID (where those are different).
 *
 * This script reads a CSV file with the following format:
 * ```
 * project_id,hash
 * <mongo ID>,<hash>
 * ```
 *
 * The header row is optional. All rows will be checked for conformance to the format.
 */

import commandLineArgs from 'command-line-args'
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
import { makeProjectKey } from '../lib/blob_store/index.js'
import fs from 'node:fs'
import assert from '../lib/assert.js'
import { client } from '../lib/mongodb.js'
import { verifyBlobs } from '../lib/backupVerifier.mjs'
import { setTimeout } from 'node:timers/promises'
import { getHistoryId } from '../lib/backup_store/index.js'

const argsSchema = [
  {
    name: 'input',
    type: String,
  },
  {
    name: 'commit',
    type: Boolean,
  },
  {
    name: 'header',
    type: Boolean,
  },
  {
    name: 'force',
    type: Boolean,
  },
  {
    name: 'verbose',
    type: Boolean,
  },
]

const args = commandLineArgs(argsSchema)

async function gracefulClose(code = 0) {
  await client.close()
  process.exit(code)
}

/**
 *
 * @param {(value: unknown) => void} fn
 * @param {unknown} value
 * @return {boolean}
 */
function not(fn, value) {
  try {
    fn(value)
    return false
  } catch {
    return true
  }
}

/**
 *
 * @param {string} row
 * @return {{projectId: string, hash: string}}
 */
function parseCSVRow(row) {
  const [projectId, hash] = row.split(',')
  assert.mongoId(projectId, `invalid projectId ${projectId}`)
  assert.blobHash(hash, `invalid hash ${hash}`)
  return { projectId, hash }
}

/**
 *
 * @param {string} path
 * @param {boolean} hasHeader
 * @return {AsyncGenerator<{projectId: string, hash: string}, void, *>}
 */
async function* readCSV(path, hasHeader) {
  let seenHeader = !hasHeader
  let fh
  try {
    fh = await fs.promises.open(path, 'r')
  } catch (error) {
    console.error(`Could not open file: ${error}`)
    return await gracefulClose(1)
  }
  for await (const line of fh.readLines()) {
    if (!seenHeader) {
      const [first, second] = line.split(',')
      const noDataInHeader =
        not(assert.mongoId, first) && not(assert.blobHash, second)
      if (!noDataInHeader) {
        console.error('Data found in header row')
        return await gracefulClose(1)
      }
      seenHeader = true
      continue
    }
    try {
      yield parseCSVRow(line)
    } catch (error) {
      console.error(error instanceof Error ? error.message : error)
      console.info(`Skipping invalid row: ${line}`)
    }
  }
}

function usage() {
  console.info(
    'Usage: remove_blobs_from_backup.mjs --input <path> [--commit] [--header] [--force] [--verbose]'
  )
}

if (!args.input) {
  console.error('--input was missing')
  usage()
  await gracefulClose(1)
}

/**
 *
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<void>}
 */
async function deleteBlob(projectId, hash) {
  const path = makeProjectKey(projectId, hash)
  if (args.commit) {
    await backupPersistor.deleteObject(projectBlobsBucket, path)
  } else {
    console.log(`DELETE: ${path}`)
  }
}

/**
 *
 * @param {string} projectId
 * @param {string} hash
 * @return {Promise<void>}
 */
async function canDeleteBlob(projectId, hash) {
  let historyId
  try {
    historyId = await getHistoryId(projectId)
  } catch (error) {
    if (args.verbose) {
      console.error(error)
    }
    throw new Error(`No history ID found for project ${projectId}, skipping`)
  }
  if (historyId === projectId) {
    throw new Error(
      `Project ID and history ID are the same for ${projectId} - use --force to delete anyway`
    )
  }

  // TODO: fix assert.postgresId to handle integers better and then stop coercing to string below
  assert.postgresId(
    `${historyId}`,
    `History ID ${historyId} does not appear to be for a postgres project`
  )

  try {
    await verifyBlobs(`${historyId}`, [hash])
  } catch (error) {
    if (args.verbose) {
      console.error(error)
    }
    throw new Error(
      `Blob ${hash} is not backed up for project ${projectId} - use --force to delete anyway`
    )
  }
}

if (!args.commit) {
  console.log('DRY RUN: provide --commit to perform operations')
}

if (args.force) {
  console.log(
    'WARNING: --force is enabled, blobs will be deleted regardless of backup status'
  )
  await setTimeout(5_000)
}

let deleted = 0
let errors = 0

for await (const { projectId, hash } of readCSV(args.input, args.header)) {
  if (!args.force) {
    try {
      await canDeleteBlob(projectId, hash)
    } catch (error) {
      console.error(error instanceof Error ? error.message : error)
      continue
    }
  }
  try {
    await deleteBlob(projectId, hash)
    deleted++
  } catch (error) {
    errors++
    console.error(error)
  }
}

console.log(`Deleted: ${deleted}`)
console.log(`Errors: ${errors}`)

await gracefulClose()
254
services/history-v1/storage/scripts/show.mjs
Normal file
254
services/history-v1/storage/scripts/show.mjs
Normal file
@@ -0,0 +1,254 @@
import commandLineArgs from 'command-line-args'
import {
  loadAtVersion,
  getChunkMetadataForVersion,
  getProjectChunksFromVersion,
} from '../lib/chunk_store/index.js'
import { client } from '../lib/mongodb.js'
import knex from '../lib/knex.js'
import redis from '../lib/redis.js'
import {
  loadGlobalBlobs,
  BlobStore,
  makeProjectKey,
} from '../lib/blob_store/index.js'
import { TextDecoder } from 'node:util'
import {
  backupPersistor,
  chunksBucket,
  projectBlobsBucket,
} from '../lib/backupPersistor.mjs'
import fs from 'node:fs'
import { pipeline } from 'node:stream/promises'
import os from 'node:os'
import path from 'node:path'
import { createHash } from 'node:crypto'
import projectKey from '../lib/project_key.js'
import { createGunzip } from 'node:zlib'
import { text } from 'node:stream/consumers'

const optionDefinitions = [
  { name: 'historyId', alias: 'p', type: String },
  { name: 'version', alias: 'v', type: Number },
  { name: 'blob', alias: 'b', type: String },
  { name: 'remote', alias: 'r', type: Boolean },
  { name: 'keep', alias: 'k', type: Boolean },
]

function makeChunkKey(projectId, startVersion) {
  return path.join(projectKey.format(projectId), projectKey.pad(startVersion))
}

async function listChunks(historyId) {
  for await (const chunkRecord of getProjectChunksFromVersion(historyId, 0)) {
    console.log('Chunk record:', chunkRecord)
  }
}

async function fetchChunkLocal(historyId, version) {
  const chunkRecord = await getChunkMetadataForVersion(historyId, version)
  const chunk = await loadAtVersion(historyId, version)
  return { key: version, chunk, metadata: chunkRecord, source: 'local storage' }
}

async function fetchChunkRemote(historyId, version) {
  const chunkRecord = await getChunkMetadataForVersion(historyId, version)
  const startVersion = chunkRecord.startVersion
  const key = makeChunkKey(historyId, startVersion)
  const backupPersistorForProject = await backupPersistor.forProject(
    chunksBucket,
    key
  )
  const backupChunkStream = await backupPersistorForProject.getObjectStream(
    chunksBucket,
    key
  )
  const backupStr = await text(backupChunkStream.pipe(createGunzip()))
  return {
    key,
    chunk: JSON.parse(backupStr),
    metadata: chunkRecord,
    source: 'remote backup',
  }
}

async function displayChunk(historyId, version, options) {
  const { key, chunk, metadata, source } = await (options.remote
    ? fetchChunkRemote(historyId, version)
    : fetchChunkLocal(historyId, version))
  console.log('Source:', source)
  console.log('Chunk record', metadata)
  console.log('Key', key)
  // console.log('Number of changes', chunk.getChanges().length)
  console.log(JSON.stringify(chunk))
}

async function fetchBlobRemote(historyId, blobHash) {
  const backupPersistorForProject = await backupPersistor.forProject(
    projectBlobsBucket,
    makeProjectKey(historyId, '')
  )
  const blobKey = makeProjectKey(historyId, blobHash)
  return {
    stream: await backupPersistorForProject.getObjectStream(
      projectBlobsBucket,
      blobKey,
      { autoGunzip: true }
    ),
    metadata: { hash: blobHash },
    source: 'remote backup',
  }
}

async function fetchBlobLocal(historyId, blobHash) {
  const blobStore = new BlobStore(historyId)
  const blob = await blobStore.getBlob(blobHash)
  if (!blob) throw new Error(`Blob ${blobHash} not found`)
  return {
    stream: await blobStore.getStream(blobHash),
    metadata: blob,
    source: 'local storage',
  }
}

async function displayBlobContent(filepath, metadata, source, blobHash) {
  console.log('Source:', source)
  console.log('Blob metadata:', metadata)

  // Compute git hash using streaming
  const stat = fs.statSync(filepath)
  const header = `blob ${stat.size}\0`
  const hash = createHash('sha1')
  hash.update(header)

  const hashStream = fs.createReadStream(filepath)
  for await (const chunk of hashStream) {
    hash.update(chunk)
  }
  const gitHash = hash.digest('hex')

  // Check content type and display preview
  const fd = fs.openSync(filepath, 'r')
  try {
    const headBuf = Buffer.alloc(16)
    const tailBuf = Buffer.alloc(16)

    try {
      // Stream through TextDecoder to check for valid UTF-8
      const textStream = fs.createReadStream(filepath)
      const decoder = new TextDecoder('utf-8', { fatal: true })
      for await (const chunk of textStream) {
        decoder.decode(chunk, { stream: true })
      }
      decoder.decode()
      // If we get here, it's valid UTF-8
      if (stat.size <= 1024) {
        console.log('Content (text):', fs.readFileSync(filepath, 'utf8'))
      } else {
        console.log('Content (text, truncated):')
        console.log(` Length: ${stat.size} bytes`)
        fs.readSync(fd, headBuf, 0, 16, 0)
        fs.readSync(fd, tailBuf, 0, 16, stat.size - 16)
        console.log(
          ' Content:',
          headBuf.toString('utf8') +
            ' ...(truncated)... ' +
            tailBuf.toString('utf8')
        )
      }
    } catch (e) {
      // Binary content - show head and tail
      console.log('Content (binary):')
      console.log(` Length: ${stat.size} bytes`)

      if (stat.size <= 32) {
        // Small file - read it all
        const buf = Buffer.alloc(stat.size)
        fs.readSync(fd, buf, 0, stat.size, 0)
        const hexBytes = buf.toString('hex').match(/../g).join(' ')
        console.log(' Bytes:', hexBytes)
      } else {
        // Read tail for large files
        fs.readSync(fd, headBuf, 0, 16, 0)
        fs.readSync(fd, tailBuf, 0, 16, stat.size - 16)
        const headHex = headBuf.toString('hex').match(/../g).join(' ')
        const tailHex = tailBuf.toString('hex').match(/../g).join(' ')
        console.log(' Bytes:', headHex + ' ... ' + tailHex)
      }
      console.log(' Git-style SHA1:', gitHash)
      if (gitHash !== blobHash) {
        console.log(' Warning: Git hash differs from blob hash!\x1b[0m')
        console.log(' Blob hash:', blobHash)
      }
    }
  } finally {
    fs.closeSync(fd)
  }
}

async function withTempDir(prefix, fn, options = {}) {
  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), prefix))
  try {
    return await Promise.resolve(fn(tmpDir))
  } finally {
    if (!options.keep) {
      fs.rmSync(tmpDir, { recursive: true, force: true })
    } else {
      console.log('Keeping temporary file:', path.join(tmpDir, 'blob'))
    }
  }
}

async function displayBlob(historyId, blobHash, options) {
  try {
    const { stream, metadata, source } = await (options.remote
      ? fetchBlobRemote(historyId, blobHash)
      : fetchBlobLocal(historyId, blobHash))

    await withTempDir(
      'blob-show-',
      async tmpDir => {
        const tmpPath = path.join(tmpDir, 'blob')
        await pipeline(stream, fs.createWriteStream(tmpPath))
        await displayBlobContent(tmpPath, metadata, source, blobHash)
      },
      { keep: options.keep }
    )
  } catch (err) {
    if (err.code === 'NoSuchKey') {
      throw new Error(`Blob ${blobHash} not found in backup`)
    }
    throw err
  }
}

async function main() {
  const { historyId, version, blob, remote, keep } =
    commandLineArgs(optionDefinitions)
  if (!historyId) {
    console.error('Error: --historyId is required.')
    process.exit(1)
  }
  await loadGlobalBlobs()
  if (version != null) {
    await displayChunk(historyId, version, { remote })
  } else if (blob != null) {
    await displayBlob(historyId, blob, { remote, keep })
  } else {
    await listChunks(historyId)
  }
}

main()
  .then(() => console.log('Done.'))
  .catch(err => {
    console.error('Error:', err)
    process.exit(1)
  })
  .finally(() => {
    knex.destroy().catch(err => console.error('Error closing Postgres:', err))
    client.close().catch(err => console.error('Error closing MongoDB:', err))
    redis
      .disconnect()
      .catch(err => console.error('Error disconnecting Redis:', err))
  })
153
services/history-v1/storage/scripts/verify_backed_up_blobs.mjs
Normal file
153
services/history-v1/storage/scripts/verify_backed_up_blobs.mjs
Normal file
@@ -0,0 +1,153 @@
// @ts-check
import { ObjectId } from 'mongodb'
import knex from '../lib/knex.js'
import {
  batchedUpdate,
  objectIdFromInput,
  READ_PREFERENCE_SECONDARY,
} from '@overleaf/mongo-utils/batchedUpdate.js'
import {
  GLOBAL_BLOBS,
  loadGlobalBlobs,
  makeProjectKey,
} from '../lib/blob_store/index.js'
import {
  backedUpBlobs as backedUpBlobsCollection,
  db,
  client,
} from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import commandLineArgs from 'command-line-args'
import fs from 'node:fs'

const projectsCollection = db.collection('projects')

// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true

function parseArgs() {
  const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
  const args = commandLineArgs([
    {
      name: 'BATCH_RANGE_START',
      type: String,
      defaultValue: PUBLIC_LAUNCH_DATE.toISOString(),
    },
    {
      name: 'BATCH_RANGE_END',
      type: String,
      defaultValue: new Date().toISOString(),
    },
    {
      name: 'output',
      type: String,
      alias: 'o',
    },
  ])
  const BATCH_RANGE_START = objectIdFromInput(
    args['BATCH_RANGE_START']
  ).toString()
  const BATCH_RANGE_END = objectIdFromInput(args['BATCH_RANGE_END']).toString()
  if (!args['output']) {
    throw new Error('missing --output')
  }
  const OUTPUT_STREAM = fs.createWriteStream(args['output'])

  return {
    BATCH_RANGE_START,
    BATCH_RANGE_END,
    OUTPUT_STREAM,
  }
}

const { BATCH_RANGE_START, BATCH_RANGE_END, OUTPUT_STREAM } = parseArgs()

// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
  throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
}

let gracefulShutdownInitiated = false

process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)

function handleSignal() {
  gracefulShutdownInitiated = true
  console.warn('graceful shutdown initiated, draining queue')
}

async function processBatch(batch) {
  if (gracefulShutdownInitiated) {
    throw new Error('graceful shutdown: aborting batch processing')
  }

  const N = batch.length
  const firstId = batch[0]._id
  const lastId = batch[N - 1]._id
  const projectCursor = await projectsCollection.find(
    { _id: { $gte: firstId, $lte: lastId } },
    {
      projection: { _id: 1, 'overleaf.history.id': 1, lastUpdated: 1 },
      readPreference: READ_PREFERENCE_SECONDARY,
    }
  )
  const projectMap = new Map()
  for await (const project of projectCursor) {
    projectMap.set(project._id.toString(), project)
  }
  for (const project of batch) {
    const projectId = project._id.toString()
    const projectRecord = projectMap.get(projectId)
    if (!projectRecord) {
      console.error(`project not found: ${projectId}`)
      continue
    }
    if (!projectRecord.overleaf?.history?.id) {
      console.error(`project missing history: ${projectId}`)
      continue
    }
    const historyId = projectRecord.overleaf.history.id.toString()
    const prefix = `${projectId},${projectRecord.lastUpdated.toISOString()},`
    const hashes = project.blobs.map(blob => blob.toString('hex'))
    const projectBlobHashes = hashes.filter(hash => !GLOBAL_BLOBS.has(hash))
    if (projectBlobHashes.length < hashes.length) {
      console.warn(
        `project ${projectId} has ${hashes.length - projectBlobHashes.length} global blobs`
      )
    }
    const rows = projectBlobHashes.map(
      hash => prefix + makeProjectKey(historyId, hash) + '\n'
    )
    OUTPUT_STREAM.write(rows.join(''))
  }
}

async function main() {
  await loadGlobalBlobs()
  OUTPUT_STREAM.write('projectId,lastUpdated,path\n')
  await batchedUpdate(
    backedUpBlobsCollection,
    {},
    processBatch,
    {},
    {},
    { BATCH_RANGE_START, BATCH_RANGE_END }
  )
}

main()
  .then(() => console.log('Done.'))
  .catch(err => {
    console.error('Error:', err)
    process.exitCode = 1
  })
  .finally(() => {
    knex.destroy().catch(err => {
      console.error('Error closing Postgres connection:', err)
    })
    client.close().catch(err => console.error('Error closing MongoDB:', err))
    redis.disconnect().catch(err => {
      console.error('Error disconnecting Redis:', err)
    })
  })
21
services/history-v1/storage/scripts/verify_backup_blob.mjs
Normal file
21
services/history-v1/storage/scripts/verify_backup_blob.mjs
Normal file
@@ -0,0 +1,21 @@
import logger from '@overleaf/logger'
import commandLineArgs from 'command-line-args'
import { verifyBlobs } from '../lib/backupVerifier.mjs'

const { historyId, hashes } = commandLineArgs([
  { name: 'historyId', type: String },
  { name: 'hashes', type: String, multiple: true, defaultOption: true },
])

if (hashes.length === 0) {
  throw new Error('missing --hashes flag')
}

try {
  await verifyBlobs(historyId, hashes)
  console.log('OK')
  process.exit(0)
} catch (err) {
  logger.err({ err }, 'failed to verify blob')
  process.exit(1)
}
@@ -0,0 +1,177 @@
import fs from 'node:fs'
import { makeProjectKey } from '../lib/blob_store/index.js'
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import commandLineArgs from 'command-line-args'
import OError from '@overleaf/o-error'
import assert from '../lib/assert.js'
import { client, projects } from '../lib/mongodb.js'
import { ObjectId } from 'mongodb'
import { setTimeout } from 'node:timers/promises'

const { input, verbose } = commandLineArgs([
  { name: 'input', type: String },
  { name: 'verbose', type: Boolean, defaultValue: false },
])

function parseCSVRow(row) {
  const [path] = row.split(',')
  const pathSegments = path.split('/')
  const historyId = `${pathSegments[0]}${pathSegments[1]}${pathSegments[2]}`
    .split('')
    .reverse()
    .join('')

  return { historyId, path, hash: `${pathSegments[3]}${pathSegments[4]}` }
}

async function* readCSV(path) {
  let fh
  try {
    fh = await fs.promises.open(path, 'r')
  } catch (error) {
    console.error(`Could not open file: ${error}`)
    throw error
  }
  for await (const line of fh.readLines()) {
    try {
      const row = parseCSVRow(line)
      yield row
    } catch (error) {
      console.error(error instanceof Error ? error.message : error)
      console.log(`Skipping invalid row: ${line}`)
    }
  }
}

class MissingDEKError extends OError {}
class InvalidHistoryIdError extends OError {}
class MissingProjectError extends OError {}
class MissingBlobError extends OError {}

async function getProjectPersistor(historyId) {
  try {
    return await backupPersistor.forProjectRO(
      projectBlobsBucket,
      makeProjectKey(historyId, '')
    )
  } catch (err) {
    if (err instanceof NotFoundError) {
      throw new MissingDEKError('dek does not exist', { historyId }, err)
    }
    throw err
  }
}

async function checkBlobExists(path, historyId) {
  const persistor = await getProjectPersistor(historyId)
  return await persistor.getObjectSize(projectBlobsBucket, path)
}

let total = 0
const errors = {
  invalidProjectId: 0,
  notBackedUpProjectId: 0,
  missingBlob: 0,
  notInMongo: 0,
  unknown: 0,
}

const notInMongoProjectIds = new Set()
const notBackedUpProjectIds = new Set()

let stopping = false

process.on('SIGTERM', () => {
  console.log('SIGTERM received')
  stopping = true
})

process.on('SIGINT', () => {
  console.log('SIGINT received')
  stopping = true
})

/**
 *
 * @param {string} historyId
 * @param {string} path
 * @param {string} hash
 * @return {Promise<void>}
 */
async function checkPath(historyId, path, hash) {
  try {
    assert.mongoId(historyId)
  } catch (error) {
    throw new InvalidHistoryIdError('invalid history id', { historyId })
  }
  if (notInMongoProjectIds.has(historyId)) {
    throw new MissingProjectError('project not in mongo', { historyId })
  }
  if (notBackedUpProjectIds.has(historyId)) {
    throw new MissingDEKError('project not backed up', { historyId })
  }

  const project = await projects.findOne({ _id: new ObjectId(historyId) })
  if (!project) {
    notInMongoProjectIds.add(historyId)
    throw new MissingProjectError('project not in mongo', { historyId })
  }
  try {
    await checkBlobExists(path, historyId)
  } catch (error) {
    if (error instanceof NotFoundError) {
      throw new MissingBlobError('missing blob', { historyId, hash })
    }
    if (error instanceof MissingDEKError) {
      notBackedUpProjectIds.add(historyId)
    }
    throw error
  }
}

for await (const line of readCSV(input)) {
  if (stopping) break
  total++
  if (total % 10_000 === 0) {
    console.log(`checked ${total}`)
  }
  const { historyId, path, hash } = line
  try {
    await checkPath(historyId, path, hash)
    if (verbose) {
      console.log(`✓ Project ${historyId} has ${hash} backed up`)
    }
  } catch (error) {
    if (error instanceof InvalidHistoryIdError) {
      errors.invalidProjectId++
      console.warn(`invalid historyId ${historyId}`)
      continue
    } else if (error instanceof MissingProjectError) {
      errors.notInMongo++
      console.warn(`✗ project ${historyId} not in mongo`)
      continue
    } else if (error instanceof MissingDEKError) {
      errors.notBackedUpProjectId++
      console.error(`✗ Project DEK ${historyId} not found`)
      continue
    } else if (error instanceof MissingBlobError) {
      errors.missingBlob++
      console.error(`✗ missing blob ${hash} from project ${historyId}`)
      continue
    }
    errors.unknown++
    console.error(error)
  }
}

console.log(`total checked: ${total}`)
console.log(`invalid project id: ${errors.invalidProjectId}`)
console.log(`not found in mongo: ${errors.notInMongo}`)
console.log(`missing blob: ${errors.missingBlob}`)
console.log(`project not backed up: ${errors.notBackedUpProjectId}`)
console.log(`unknown errors: ${errors.unknown}`)

await client.close()
await setTimeout(100)
process.exit()
35
services/history-v1/storage/scripts/verify_project.mjs
Normal file
35
services/history-v1/storage/scripts/verify_project.mjs
Normal file
@@ -0,0 +1,35 @@
import commandLineArgs from 'command-line-args'
import { verifyProjectWithErrorContext } from '../lib/backupVerifier.mjs'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import { setTimeout } from 'node:timers/promises'
import { loadGlobalBlobs } from '../lib/blob_store/index.js'

const { historyId } = commandLineArgs([{ name: 'historyId', type: String }])

async function gracefulShutdown(code = process.exitCode) {
  await knex.destroy()
  await client.close()
  await redis.disconnect()
  await setTimeout(1_000)
  process.exit(code)
}

if (!historyId) {
  console.error('missing --historyId')
  process.exitCode = 1
  await gracefulShutdown()
}

await loadGlobalBlobs()

try {
  await verifyProjectWithErrorContext(historyId)
  console.log('OK')
} catch (error) {
  console.error('error verifying', error)
  process.exitCode = 1
} finally {
  await gracefulShutdown()
}
217
services/history-v1/storage/scripts/verify_sampled_projects.mjs
Normal file
217
services/history-v1/storage/scripts/verify_sampled_projects.mjs
Normal file
@@ -0,0 +1,217 @@
// @ts-check
import commandLineArgs from 'command-line-args'
import {
  setWriteMetrics,
  verifyProjectsCreatedInDateRange,
  verifyRandomProjectSample,
  verifyProjectsUpdatedInDateRange,
} from '../../backupVerifier/ProjectVerifier.mjs'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import { setTimeout } from 'node:timers/promises'
import logger from '@overleaf/logger'
import { loadGlobalBlobs } from '../lib/blob_store/index.js'
import { getDatesBeforeRPO } from '../../backupVerifier/utils.mjs'
import { EventEmitter } from 'node:events'
import { mongodb } from '../index.js'
import redis from '../lib/redis.js'

logger.logger.level('fatal')

const usageMessage = [
  'Usage: node verify_sampled_projects.mjs [--startDate <start>] [--endDate <end>] [--nProjects <n>] [--verbose] [--usage] [--writeMetrics] [--concurrency <n>] [--strategy <range|random>]',
  'strategy: defaults to "range"; startDate and endDate are required for "range" strategy',
].join('\n')

/**
 * Gracefully shutdown the process
 * @param code
 * @return {Promise<void>}
 */
async function gracefulShutdown(code = process.exitCode) {
  await knex.destroy()
  await client.close()
  await redis.disconnect()
  await setTimeout(1_000)
  process.exit(code)
}

const STATS = {
  verifiable: 0,
  unverifiable: 0,
}

/**
 * @typedef {Object} CLIOptions
 * @property {(signal: EventEmitter) => Promise<VerificationJobStatus>} projectVerifier
 * @property {boolean} verbose
 */

/**
 * @typedef {import('../../backupVerifier/types.d.ts').VerificationJobStatus} VerificationJobStatus
 */

/**
 *
 * @return {CLIOptions}
 */
function getOptions() {
  const {
    startDate,
    endDate,
    concurrency,
    writeMetrics,
    verbose,
    nProjects,
    strategy,
    usage,
  } = commandLineArgs([
    { name: 'startDate', type: String },
    { name: 'endDate', type: String },
    { name: 'concurrency', type: Number, defaultValue: 1 },
    { name: 'verbose', type: Boolean, defaultValue: false },
    { name: 'nProjects', type: Number, defaultValue: 10 },
    { name: 'usage', type: Boolean, defaultValue: false },
    { name: 'writeMetrics', type: Boolean, defaultValue: false },
    { name: 'strategy', type: String, defaultValue: 'range' },
  ])

  if (usage) {
    console.log(usageMessage)
    process.exit(0)
  }

  if (!['range', 'random', 'recent'].includes(strategy)) {
    throw new Error(`Invalid strategy: ${strategy}`)
  }

  setWriteMetrics(writeMetrics)

  switch (strategy) {
    case 'random':
      console.log('Verifying random projects')
      return {
        verbose,
        projectVerifier: signal => verifyRandomProjectSample(nProjects, signal),
      }
    case 'recent':
      return {
        verbose,
        projectVerifier: async signal => {
          const { startDate, endDate } = getDatesBeforeRPO(3 * 3600)
          return await verifyProjectsUpdatedInDateRange(
            startDate,
            endDate,
            nProjects,
            signal
          )
        },
      }
    case 'range':
    default: {
      if (!startDate || !endDate) {
        throw new Error(usageMessage)
      }
      const start = Date.parse(startDate)
      const end = Date.parse(endDate)
      if (Number.isNaN(start)) {
        throw new Error(`Invalid start date: ${startDate}`)
      }

      if (Number.isNaN(end)) {
        throw new Error(`Invalid end date: ${endDate}`)
      }
      if (verbose) {
        console.log(`Verifying from ${startDate} to ${endDate}`)
        console.log(`Concurrency: ${concurrency}`)
      }
      STATS.ranges = 0
      return {
        projectVerifier: signal =>
          verifyProjectsCreatedInDateRange({
            startDate: new Date(start),
            endDate: new Date(end),
            projectsPerRange: nProjects,
            concurrency,
            signal,
          }),
        verbose,
      }
    }
  }
}

/**
 * @type {CLIOptions}
 */
let options
try {
  options = getOptions()
} catch (error) {
  console.error(error)
  process.exitCode = 1
  await gracefulShutdown(1)
  process.exit() // just here so the type checker knows that the process will exit
}

const { projectVerifier, verbose } = options

if (verbose) {
  logger.logger.level('debug')
}

/**
 *
 * @param {Array<string>} array
 * @param {string} matchString
 * @return {*}
 */
function sumStringInstances(array, matchString) {
  return array.reduce((total, string) => {
    return string === matchString ? total + 1 : total
  }, 0)
}

/**
 *
 * @param {VerificationJobStatus} stats
 */
function displayStats(stats) {
  console.log(`Verified projects: ${stats.verified}`)
  console.log(`Total projects sampled: ${stats.total}`)
  if (stats.errorTypes.length > 0) {
    console.log('Errors:')
    for (const error of new Set(stats.errorTypes)) {
      console.log(`${error}: ${sumStringInstances(stats.errorTypes, error)}`)
    }
  }
}

const shutdownEmitter = new EventEmitter()

shutdownEmitter.on('shutdown', async () => {
  await gracefulShutdown()
})

process.on('SIGTERM', () => {
  shutdownEmitter.emit('shutdown')
})

process.on('SIGINT', () => {
  shutdownEmitter.emit('shutdown')
})

await loadGlobalBlobs()

try {
  const stats = await projectVerifier(shutdownEmitter)
  displayStats(stats)
  console.log(`completed`)
} catch (error) {
  console.error(error)
  console.log('completed with errors')
  process.exitCode = 1
} finally {
  console.log('shutting down')
  await gracefulShutdown()
}
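For illustration, the error tally in displayStats above boils down to counting repeated strings with a reduce. A self-contained sketch with made-up error type names (the names are placeholders, not real error classes from this codebase):

// Same reduce-based counter as sumStringInstances() above.
function sumStringInstances(array, matchString) {
  return array.reduce((total, string) => {
    return string === matchString ? total + 1 : total
  }, 0)
}

// Placeholder error type names, just to show the tallying.
const errorTypes = ['SampleErrorA', 'SampleErrorB', 'SampleErrorB']
for (const error of new Set(errorTypes)) {
  console.log(`${error}: ${sumStringInstances(errorTypes, error)}`)
}
// SampleErrorA: 1
// SampleErrorB: 2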
109
services/history-v1/storage/tasks/backfill_start_version.js
Normal file
109
services/history-v1/storage/tasks/backfill_start_version.js
Normal file
@@ -0,0 +1,109 @@
const commandLineArgs = require('command-line-args')
const BPromise = require('bluebird')
const timersPromises = require('node:timers/promises')

const { knex, historyStore } = require('..')

const MAX_POSTGRES_INTEGER = 2147483647
const DEFAULT_BATCH_SIZE = 1000
const DEFAULT_CONCURRENCY = 1
const MAX_RETRIES = 10
const RETRY_DELAY_MS = 5000

async function main() {
  const options = parseOptions()
  let batchStart = options.minId
  while (batchStart <= options.maxId) {
    const chunks = await getChunks(batchStart, options.maxId, options.batchSize)
    if (chunks.length === 0) {
      // No results. We're done.
      break
    }
    const batchEnd = chunks[chunks.length - 1].id
    await processBatch(chunks, options)
    console.log(`Processed chunks ${batchStart} to ${batchEnd}`)
    batchStart = batchEnd + 1
  }
}

function parseOptions() {
  const args = commandLineArgs([
    { name: 'min-id', type: Number, defaultValue: 1 },
    {
      name: 'max-id',
      type: Number,
      defaultValue: MAX_POSTGRES_INTEGER,
    },
    { name: 'batch-size', type: Number, defaultValue: DEFAULT_BATCH_SIZE },
    { name: 'concurrency', type: Number, defaultValue: DEFAULT_CONCURRENCY },
  ])
  return {
    minId: args['min-id'],
    maxId: args['max-id'],
    batchSize: args['batch-size'],
    concurrency: args.concurrency,
  }
}

async function getChunks(minId, maxId, batchSize) {
  const chunks = await knex('chunks')
    .where('id', '>=', minId)
    .andWhere('id', '<=', maxId)
    .orderBy('id')
    .limit(batchSize)
  return chunks
}

async function processBatch(chunks, options) {
  let retries = 0
  while (true) {
    const results = await BPromise.map(chunks, processChunk, {
      concurrency: options.concurrency,
    })
    const failedChunks = results
      .filter(result => !result.success)
      .map(result => result.chunk)
    if (failedChunks.length === 0) {
      // All chunks processed. Carry on.
      break
    }

    // Some projects failed. Retry.
    retries += 1
    if (retries > MAX_RETRIES) {
      console.log('Too many retries processing chunks. Giving up.')
      process.exit(1)
    }
    console.log(
      `Retrying chunks: ${failedChunks.map(chunk => chunk.id).join(', ')}`
    )
    await timersPromises.setTimeout(RETRY_DELAY_MS)
    chunks = failedChunks
  }
}

async function processChunk(chunk) {
  try {
    const rawHistory = await historyStore.loadRaw(
      chunk.doc_id.toString(),
      chunk.id
    )
    const startVersion = chunk.end_version - rawHistory.changes.length
    await knex('chunks')
      .where('id', chunk.id)
      .update({ start_version: startVersion })
    return { chunk, success: true }
  } catch (err) {
    console.error(`Failed to process chunk ${chunk.id}:`, err.stack)
    return { chunk, success: false }
  }
}

main()
  .then(() => {
    process.exit()
  })
  .catch(err => {
    console.error(err)
    process.exit(1)
  })
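The backfill above derives each chunk's start version from data already in the history store: a chunk that ends at end_version and contains changes.length changes must start at end_version minus that count. A minimal sketch of the arithmetic in processChunk (the chunk values are illustrative, not taken from a real database row):

// Hypothetical example values; the real script reads these from Postgres and GCS.
const chunk = { id: 42, end_version: 150 }
const rawHistory = { changes: new Array(30) } // 30 changes in this chunk

// Same arithmetic as processChunk() above.
const startVersion = chunk.end_version - rawHistory.changes.length
console.log(startVersion) // 120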
107
services/history-v1/storage/tasks/compress_changes.js
Normal file
107
services/history-v1/storage/tasks/compress_changes.js
Normal file
@@ -0,0 +1,107 @@
/**
 * Compress changes for projects that have too many text operations.
 *
 * Usage:
 *
 *   node tasks/compress_changes.js CSV_FILE
 *
 * where CSV_FILE contains a list of project ids in the first column
 */

const fs = require('node:fs')
const BPromise = require('bluebird')
const { History } = require('overleaf-editor-core')
const { historyStore, chunkStore } = require('..')

const CONCURRENCY = 10

async function main() {
  const filename = process.argv[2]
  const projectIds = await readCsv(filename)
  const chunks = []
  for (const projectId of projectIds) {
    const chunkIds = await chunkStore.getProjectChunkIds(projectId)
    chunks.push(...chunkIds.map(id => ({ id, projectId })))
  }
  let totalCompressed = 0
  await BPromise.map(
    chunks,
    async chunk => {
      try {
        const history = await getHistory(chunk)
        const numCompressed = compressChanges(history)
        if (numCompressed > 0) {
          await storeHistory(chunk, history)
          console.log(
            `Compressed project ${chunk.projectId}, chunk ${chunk.id}`
          )
        }
        totalCompressed += numCompressed
      } catch (err) {
        console.log(err)
      }
    },
    { concurrency: CONCURRENCY }
  )
  console.log('CHANGES:', totalCompressed)
}

async function readCsv(filename) {
  const csv = await fs.promises.readFile(filename, 'utf-8')
  const lines = csv.trim().split('\n')
  const projectIds = lines.map(line => line.split(',')[0])
  return projectIds
}

async function getHistory(chunk) {
  const rawHistory = await historyStore.loadRaw(chunk.projectId, chunk.id)
  const history = History.fromRaw(rawHistory)
  return history
}

async function storeHistory(chunk, history) {
  const rawHistory = history.toRaw()
  await historyStore.storeRaw(chunk.projectId, chunk.id, rawHistory)
}

function compressChanges(history) {
  let numCompressed = 0
  for (const change of history.getChanges()) {
    const newOperations = compressOperations(change.operations)
    if (newOperations.length !== change.operations.length) {
      numCompressed++
    }
    change.setOperations(newOperations)
  }
  return numCompressed
}

function compressOperations(operations) {
  if (!operations.length) return []

  const newOperations = []
  let currentOperation = operations[0]
  for (let operationId = 1; operationId < operations.length; operationId++) {
    const nextOperation = operations[operationId]
    if (currentOperation.canBeComposedWith(nextOperation)) {
      currentOperation = currentOperation.compose(nextOperation)
    } else {
      // currentOperation and nextOperation cannot be composed. Push the
      // currentOperation and start over with nextOperation.
      newOperations.push(currentOperation)
      currentOperation = nextOperation
    }
  }
  newOperations.push(currentOperation)

  return newOperations
}

main()
  .then(() => {
    process.exit()
  })
  .catch(err => {
    console.error(err)
    process.exit(1)
  })
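For illustration, here is a standalone sketch of the composition loop used in compressOperations, with hypothetical operation objects whose canBeComposedWith/compose methods stand in for the real overleaf-editor-core operations:

// Hypothetical stand-in for an edit operation; real operations come from
// overleaf-editor-core and have their own composition rules.
function makeAppendOp(text) {
  return {
    text,
    canBeComposedWith(other) {
      return typeof other.text === 'string'
    },
    compose(other) {
      return makeAppendOp(this.text + other.text)
    },
  }
}

// Same shape as compressOperations() above: fold each operation into the
// previous one whenever composition is allowed.
function compressOperations(operations) {
  if (!operations.length) return []
  const newOperations = []
  let currentOperation = operations[0]
  for (let i = 1; i < operations.length; i++) {
    const nextOperation = operations[i]
    if (currentOperation.canBeComposedWith(nextOperation)) {
      currentOperation = currentOperation.compose(nextOperation)
    } else {
      newOperations.push(currentOperation)
      currentOperation = nextOperation
    }
  }
  newOperations.push(currentOperation)
  return newOperations
}

const ops = [makeAppendOp('a'), makeAppendOp('b'), makeAppendOp('c')]
console.log(compressOperations(ops).length) // 1: all three compose into one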
294
services/history-v1/storage/tasks/copy_project_blobs.js
Executable file
294
services/history-v1/storage/tasks/copy_project_blobs.js
Executable file
@@ -0,0 +1,294 @@
#!/usr/bin/env node

const { promisify } = require('node:util')
const BPromise = require('bluebird')
const commandLineArgs = require('command-line-args')
const config = require('config')
const fs = require('node:fs')
const readline = require('node:readline')
const { History } = require('overleaf-editor-core')
const { knex, historyStore, persistor } = require('..')
const projectKey = require('../lib/project_key')

const MAX_POSTGRES_INTEGER = 2147483647
const DEFAULT_BATCH_SIZE = 1000
const MAX_RETRIES = 10
const RETRY_DELAY_MS = 5000

// Obtain a preconfigured GCS client through a non-documented property of
// object-persistor. Sorry about that. We need the GCS client because we use
// operations that are not implemented in object-persistor.
const gcsClient = persistor.storage
const globalBucket = gcsClient.bucket(config.get('blobStore.globalBucket'))
const projectBucket = gcsClient.bucket(config.get('blobStore.projectBucket'))
const delay = promisify(setTimeout)

async function main() {
  const options = commandLineArgs([
    { name: 'global-blobs', type: String },
    { name: 'min-project-id', type: Number, defaultValue: 1 },
    {
      name: 'max-project-id',
      type: Number,
      defaultValue: MAX_POSTGRES_INTEGER,
    },
    { name: 'batch-size', type: Number, defaultValue: DEFAULT_BATCH_SIZE },
    { name: 'concurrency', type: Number, defaultValue: 1 },
  ])
  if (!options['global-blobs']) {
    console.error(
      'You must specify a global blobs file with the --global-blobs option'
    )
    process.exit(1)
  }
  const globalBlobs = await readGlobalBlobs(options['global-blobs'])
  const minProjectId = options['min-project-id']
  const maxProjectId = options['max-project-id']
  const batchSize = options['batch-size']
  const concurrency = options.concurrency
  console.log(`Keeping ${globalBlobs.size} global blobs`)
  await run({ globalBlobs, minProjectId, maxProjectId, batchSize, concurrency })
  console.log('Done.')
}

async function readGlobalBlobs(filename) {
  const stream = fs.createReadStream(filename)
  const reader = readline.createInterface({
    input: stream,
    crlfDelay: Infinity,
  })
  const blobs = new Set()
  for await (const line of reader) {
    blobs.add(line.trim())
  }
  return blobs
}

async function run(options) {
  const { globalBlobs, minProjectId, maxProjectId, batchSize, concurrency } =
    options
  let batchStart = minProjectId
  while (batchStart <= maxProjectId) {
    let projectIds = await getProjectIds(batchStart, maxProjectId, batchSize)
    if (projectIds.length === 0) {
      break
    }
    const batchEnd = projectIds[projectIds.length - 1]
    console.log(`Processing projects ${batchStart} to ${batchEnd}`)
    const chunkIdsByProject = await getChunkIdsByProject(projectIds)

    let retries = 0
    while (true) {
      const results = await BPromise.map(
        projectIds,
        async projectId =>
          await processProject(
            projectId,
            chunkIdsByProject.get(projectId),
            globalBlobs
          ),
        { concurrency }
      )
      const failedProjectIds = results
        .filter(result => !result.success)
        .map(result => result.projectId)
      if (failedProjectIds.length === 0) {
        // All projects were copied successfully. Carry on.
        break
      }

      // Some projects failed. Retry.
      retries += 1
      if (retries > MAX_RETRIES) {
        console.log(
          `Too many retries processing projects ${batchStart} to ${batchEnd}. Giving up.`
        )
        process.exit(1)
      }
      console.log(`Retrying projects: ${failedProjectIds.join(', ')}`)
      await delay(RETRY_DELAY_MS)
      projectIds = failedProjectIds
    }

    // Set up next batch
    batchStart = batchEnd + 1
  }
}

async function getProjectIds(minProjectId, maxProjectId, batchSize) {
  const projectIds = await knex('chunks')
    .distinct('doc_id')
    .where('doc_id', '>=', minProjectId)
    .andWhere('doc_id', '<=', maxProjectId)
    .orderBy('doc_id')
    .limit(batchSize)
    .pluck('doc_id')
  return projectIds
}

async function getChunkIdsByProject(projectIds) {
  const chunks = await knex('chunks')
    .select('id', { projectId: 'doc_id' })
    .where('doc_id', 'in', projectIds)
  const chunkIdsByProject = new Map()
  for (const projectId of projectIds) {
    chunkIdsByProject.set(projectId, [])
  }
  for (const chunk of chunks) {
    chunkIdsByProject.get(chunk.projectId).push(chunk.id)
  }
  return chunkIdsByProject
}

async function processProject(projectId, chunkIds, globalBlobs) {
  try {
    const blobHashes = await getBlobHashes(projectId, chunkIds)
    const projectBlobHashes = blobHashes.filter(hash => !globalBlobs.has(hash))
    const gcsSizesByHash = new Map()
    for (const blobHash of projectBlobHashes) {
      const blobSize = await copyBlobInGcs(projectId, blobHash)
      if (blobSize != null) {
        gcsSizesByHash.set(blobHash, blobSize)
      }
    }
    const dbSizesByHash = await copyBlobsInDatabase(
      projectId,
      projectBlobHashes
    )
    compareBlobSizes(gcsSizesByHash, dbSizesByHash)
    return { projectId, success: true }
  } catch (err) {
    console.error(`Failed to process project ${projectId}:`, err.stack)
    return { projectId, success: false }
  }
}

function compareBlobSizes(gcsSizesByHash, dbSizesByHash) {
  // Throw an error if the database doesn't report as many blobs as GCS
  if (dbSizesByHash.size !== gcsSizesByHash.size) {
    throw new Error(
      `the database reported ${dbSizesByHash.size} blobs copied, but GCS reported ${gcsSizesByHash.size} blobs copied`
    )
  }

  const mismatches = []
  for (const [hash, dbSize] of dbSizesByHash.entries()) {
    if (gcsSizesByHash.get(hash) !== dbSize) {
      mismatches.push(hash)
    }
  }
  if (mismatches.length > 0) {
    throw new Error(`blob size mismatch for hashes: ${mismatches.join(', ')}`)
  }
}

async function getHistory(projectId, chunkId) {
  const rawHistory = await historyStore.loadRaw(projectId, chunkId)
  const history = History.fromRaw(rawHistory)
  return history
}

async function getBlobHashes(projectId, chunkIds) {
  const blobHashes = new Set()
  for (const chunkId of chunkIds) {
    const history = await getHistory(projectId, chunkId)
    history.findBlobHashes(blobHashes)
  }
  return Array.from(blobHashes)
}

async function copyBlobInGcs(projectId, blobHash) {
  const globalBlobKey = [
    blobHash.slice(0, 2),
    blobHash.slice(2, 4),
    blobHash.slice(4),
  ].join('/')
  const projectBlobKey = [
    projectKey.format(projectId),
    blobHash.slice(0, 2),
    blobHash.slice(2),
  ].join('/')
  const globalBlobObject = globalBucket.file(globalBlobKey)
  const projectBlobObject = projectBucket.file(projectBlobKey)

  // Check if the project blob exists
  let projectBlobMetadata = null
  try {
    ;[projectBlobMetadata] = await projectBlobObject.getMetadata()
  } catch (err) {
    if (err.code !== 404) {
      throw err
    }
  }

  // Check that the blob exists
  let globalBlobMetadata = null
  try {
    ;[globalBlobMetadata] = await globalBlobObject.getMetadata()
  } catch (err) {
    if (err.code !== 404) {
      throw err
    }
  }

  if (projectBlobMetadata) {
    // Project blob already exists. Compare the metadata if the global blob
    // also exists and return early.
    if (
      globalBlobMetadata != null &&
      (globalBlobMetadata.size !== projectBlobMetadata.size ||
        globalBlobMetadata.md5Hash !== projectBlobMetadata.md5Hash)
    ) {
      throw new Error(
        `Project blob ${blobHash} in project ${projectId} doesn't match global blob`
      )
    }
    return null
  }

  await globalBlobObject.copy(projectBlobObject)

  // Paranoid check that the copy went well. The getMetadata() method returns
  // an array, with the metadata in first position.
  ;[projectBlobMetadata] = await projectBlobObject.getMetadata()
  if (
    globalBlobMetadata.size !== projectBlobMetadata.size ||
    globalBlobMetadata.md5Hash !== projectBlobMetadata.md5Hash
  ) {
    throw new Error(`Failed to copy blob ${blobHash} to project ${projectId})`)
  }

  return parseInt(projectBlobMetadata.size, 10)
}

async function copyBlobsInDatabase(projectId, blobHashes) {
  const blobSizesByHash = new Map()
  if (blobHashes.length === 0) {
    return blobSizesByHash
  }
  const binaryBlobHashes = blobHashes.map(hash => Buffer.from(hash, 'hex'))
  const result = await knex.raw(
    `INSERT INTO project_blobs (
       project_id, hash_bytes, byte_length, string_length
     )
     SELECT ?, hash_bytes, byte_length, string_length
     FROM blobs
     WHERE hash_bytes IN (${binaryBlobHashes.map(_ => '?').join(',')})
     ON CONFLICT (project_id, hash_bytes) DO NOTHING
     RETURNING hash_bytes, byte_length`,
    [projectId, ...binaryBlobHashes]
  )
  for (const row of result.rows) {
    blobSizesByHash.set(row.hash_bytes.toString('hex'), row.byte_length)
  }
  return blobSizesByHash
}

main()
  .then(() => {
    process.exit()
  })
  .catch(err => {
    console.error(err)
    process.exit(1)
  })
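For reference, copyBlobInGcs above splits the blob hash differently for the two buckets: the global bucket key is first-two/next-two/remainder of the hex hash, while the project bucket key is prefixed by projectKey.format(projectId) (not reproduced here) followed by first-two/remainder. A small sketch with a made-up hash; '<project-prefix>' stands in for whatever projectKey.format returns:

// Made-up blob hash, for illustration only.
const blobHash = 'f572d396fae9206628714fb2ce00f72e94f2258f'

// Global bucket key layout, same slicing as copyBlobInGcs() above.
const globalBlobKey = [
  blobHash.slice(0, 2),
  blobHash.slice(2, 4),
  blobHash.slice(4),
].join('/')
console.log(globalBlobKey)
// f5/72/d396fae9206628714fb2ce00f72e94f2258f

// Project bucket key layout; the real prefix comes from projectKey.format().
const projectBlobKey = [
  '<project-prefix>',
  blobHash.slice(0, 2),
  blobHash.slice(2),
].join('/')
console.log(projectBlobKey)
// <project-prefix>/f5/72d396fae9206628714fb2ce00f72e94f2258f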
36
services/history-v1/storage/tasks/delete_old_chunks.js
Normal file
36
services/history-v1/storage/tasks/delete_old_chunks.js
Normal file
@@ -0,0 +1,36 @@
#!/usr/bin/env node

'use strict'

const commandLineArgs = require('command-line-args')
const { chunkStore } = require('../')

async function deleteOldChunks(options) {
  const deletedChunksTotal = await chunkStore.deleteOldChunks(options)
  console.log(`Deleted ${deletedChunksTotal} old chunks`)
}

exports.deleteOldChunks = deleteOldChunks

if (require.main === module) {
  const options = commandLineArgs([
    { name: 'batch-size', type: Number },
    { name: 'max-batches', type: Number },
    { name: 'min-age', type: Number },
    { name: 'timeout', type: Number },
    { name: 'verbose', type: Boolean, alias: 'v', defaultValue: false },
  ])
  deleteOldChunks({
    batchSize: options['batch-size'],
    maxBatches: options['max-batches'],
    timeout: options.timeout,
    minAgeSecs: options['min-age'],
  })
    .then(() => {
      process.exit()
    })
    .catch(err => {
      console.error(err)
      process.exit(1)
    })
}
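Because the module exports deleteOldChunks in addition to acting as a CLI, it can also be called programmatically. A hedged usage sketch from a sibling script: the option names match the object built above, but the values (and their exact semantics) are illustrative; they are interpreted by chunkStore.deleteOldChunks, not shown here.

const { deleteOldChunks } = require('./delete_old_chunks')

deleteOldChunks({
  batchSize: 1000, // illustrative value
  maxBatches: 10, // illustrative value
  timeout: 60000, // illustrative value
  minAgeSecs: 3600, // illustrative value
})
  .then(() => console.log('done'))
  .catch(err => console.error(err))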
156
services/history-v1/storage/tasks/fix_duplicate_versions.js
Executable file
156
services/history-v1/storage/tasks/fix_duplicate_versions.js
Executable file
@@ -0,0 +1,156 @@
#!/usr/bin/env node

'use strict'

const commandLineArgs = require('command-line-args')
const { chunkStore } = require('..')

main()
  .then(() => {
    process.exit(0)
  })
  .catch(err => {
    console.error(err)
    process.exit(1)
  })

async function main() {
  const opts = commandLineArgs([
    { name: 'project-ids', type: String, multiple: true, defaultOption: true },
    { name: 'save', type: Boolean, defaultValue: false },
    { name: 'help', type: Boolean, defaultValue: false },
  ])
  if (opts.help || opts['project-ids'] == null) {
    console.log('Usage: fix_duplicate_versions [--save] PROJECT_ID...')
    process.exit()
  }
  for (const projectId of opts['project-ids']) {
    await processProject(projectId, opts.save)
  }
  if (!opts.save) {
    console.log('\nThis was a dry run. Re-run with --save to persist changes.')
  }
}

async function processProject(projectId, save) {
  console.log(`Project ${projectId}:`)
  const chunk = await chunkStore.loadLatest(projectId)
  let numChanges = 0
  numChanges += removeDuplicateProjectVersions(chunk)
  numChanges += removeDuplicateDocVersions(chunk)
  console.log(` ${numChanges > 0 ? numChanges : 'no'} changes`)
  if (save && numChanges > 0) {
    await replaceChunk(projectId, chunk)
  }
}

function removeDuplicateProjectVersions(chunk) {
  let numChanges = 0
  let lastVersion = null
  const { snapshot, changes } = chunk.history
  if (snapshot.projectVersion != null) {
    lastVersion = snapshot.projectVersion
  }
  for (const change of changes) {
    if (change.projectVersion == null) {
      // Not a project structure change. Ignore.
      continue
    }
    if (
      lastVersion != null &&
      !areProjectVersionsIncreasing(lastVersion, change.projectVersion)
    ) {
      // Duplicate. Remove all ops
      console.log(
        ` Removing out-of-order project structure change: ${change.projectVersion} <= ${lastVersion}`
      )
      change.setOperations([])
      delete change.projectVersion
      numChanges++
    } else {
      lastVersion = change.projectVersion
    }
  }

  return numChanges
}

function removeDuplicateDocVersions(chunk) {
  let numChanges = 0
  const lastVersions = new Map()
  const { snapshot, changes } = chunk.history
  if (snapshot.v2DocVersions != null) {
    for (const { pathname, v } of Object.values(snapshot.v2DocVersions.data)) {
      lastVersions.set(pathname, v)
    }
  }
  for (const change of changes) {
    if (change.v2DocVersions == null) {
      continue
    }

    // Collect all docs that have problematic versions
    const badPaths = []
    const badDocIds = []
    for (const [docId, { pathname, v }] of Object.entries(
      change.v2DocVersions.data
    )) {
      const lastVersion = lastVersions.get(docId)
      if (lastVersion != null && v <= lastVersion) {
        // Duplicate. Remove ops related to that doc
        console.log(
          ` Removing out-of-order change for doc ${docId} (${pathname}): ${v} <= ${lastVersion}`
        )
        badPaths.push(pathname)
        badDocIds.push(docId)
        numChanges++
      } else {
        lastVersions.set(docId, v)
      }
    }

    // Remove bad operations
    if (badPaths.length > 0) {
      change.setOperations(
        change.operations.filter(
          op => op.pathname == null || !badPaths.includes(op.pathname)
        )
      )
    }

    // Remove bad v2 doc versions
    for (const docId of badDocIds) {
      delete change.v2DocVersions.data[docId]
    }
  }

  return numChanges
}

function areProjectVersionsIncreasing(v1Str, v2Str) {
  const v1 = parseProjectVersion(v1Str)
  const v2 = parseProjectVersion(v2Str)
  return v2.major > v1.major || (v2.major === v1.major && v2.minor > v1.minor)
}

function parseProjectVersion(version) {
  const [major, minor] = version.split('.').map(x => parseInt(x, 10))
  if (isNaN(major) || isNaN(minor)) {
    throw new Error(`Invalid project version: ${version}`)
  }
  return { major, minor }
}

async function replaceChunk(projectId, chunk) {
  const endVersion = chunk.getEndVersion()
  const oldChunkId = await chunkStore.getChunkIdForVersion(
    projectId,
    endVersion
  )
  console.log(` Replacing chunk ${oldChunkId}`)
  // The chunks table has a unique constraint on doc_id and end_version. Because
  // we're replacing a chunk with the same end version, we need to destroy the
  // chunk first.
  await chunkStore.destroy(projectId, oldChunkId)
  await chunkStore.create(projectId, chunk)
}
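As a quick illustration of the comparison used above: project versions are compared numerically on their major and minor parts, so "2.10" counts as newer than "2.9" even though a plain string comparison would order them the other way. A self-contained sketch mirroring the two helpers from the script:

// Mirrors parseProjectVersion/areProjectVersionsIncreasing above.
function parseProjectVersion(version) {
  const [major, minor] = version.split('.').map(x => parseInt(x, 10))
  if (isNaN(major) || isNaN(minor)) {
    throw new Error(`Invalid project version: ${version}`)
  }
  return { major, minor }
}

function areProjectVersionsIncreasing(v1Str, v2Str) {
  const v1 = parseProjectVersion(v1Str)
  const v2 = parseProjectVersion(v2Str)
  return v2.major > v1.major || (v2.major === v1.major && v2.minor > v1.minor)
}

console.log(areProjectVersionsIncreasing('2.9', '2.10')) // true (numeric compare)
console.log('2.9' < '2.10') // false (string compare would get this wrong)
console.log(areProjectVersionsIncreasing('2.10', '2.10')) // false: equal is not increasing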
1
services/history-v1/storage/tasks/index.js
Normal file
1
services/history-v1/storage/tasks/index.js
Normal file
@@ -0,0 +1 @@
exports.deleteOldChunks = require('./delete_old_chunks').deleteOldChunks