first commit

commit ff9c54d5e4
2025-04-24 13:11:28 +08:00
5960 changed files with 834111 additions and 0 deletions

services/history-v1/.gitignore

@@ -0,0 +1,3 @@
# managed by monorepo$ bin/update_build_scripts
.npmrc


@@ -0,0 +1,3 @@
{
"require": "test/setup.js"
}


@@ -0,0 +1 @@
20.18.2


@@ -0,0 +1,32 @@
# This file was auto-generated, do not edit it directly.
# Instead run bin/update_build_scripts from
# https://github.com/overleaf/internal/
FROM node:20.18.2 AS base
WORKDIR /overleaf/services/history-v1
COPY services/history-v1/install_deps.sh /overleaf/services/history-v1/
RUN chmod 0755 ./install_deps.sh && ./install_deps.sh
# Google Cloud Storage needs a writable $HOME/.config for resumable uploads
# (see https://googleapis.dev/nodejs/storage/latest/File.html#createWriteStream)
RUN mkdir /home/node/.config && chown node:node /home/node/.config
# fs persistor needs a writable folder as a target for the mounted volume
RUN mkdir /buckets && chown node:node /buckets
FROM base AS app
COPY package.json package-lock.json /overleaf/
COPY services/history-v1/package.json /overleaf/services/history-v1/
COPY libraries/ /overleaf/libraries/
COPY patches/ /overleaf/patches/
RUN cd /overleaf && npm ci --quiet
COPY services/history-v1/ /overleaf/services/history-v1/
FROM app
USER node
CMD ["node", "--expose-gc", "app.js"]


@@ -0,0 +1,156 @@
# This file was auto-generated, do not edit it directly.
# Instead run bin/update_build_scripts from
# https://github.com/overleaf/internal/
BUILD_NUMBER ?= local
BRANCH_NAME ?= $(shell git rev-parse --abbrev-ref HEAD)
PROJECT_NAME = history-v1
BUILD_DIR_NAME = $(shell pwd | xargs basename | tr -cd '[a-zA-Z0-9_.\-]')
DOCKER_COMPOSE_FLAGS ?= -f docker-compose.yml
DOCKER_COMPOSE := BUILD_NUMBER=$(BUILD_NUMBER) \
BRANCH_NAME=$(BRANCH_NAME) \
PROJECT_NAME=$(PROJECT_NAME) \
MOCHA_GREP=${MOCHA_GREP} \
docker compose ${DOCKER_COMPOSE_FLAGS}
COMPOSE_PROJECT_NAME_TEST_ACCEPTANCE ?= test_acceptance_$(BUILD_DIR_NAME)
DOCKER_COMPOSE_TEST_ACCEPTANCE = \
COMPOSE_PROJECT_NAME=$(COMPOSE_PROJECT_NAME_TEST_ACCEPTANCE) $(DOCKER_COMPOSE)
COMPOSE_PROJECT_NAME_TEST_UNIT ?= test_unit_$(BUILD_DIR_NAME)
DOCKER_COMPOSE_TEST_UNIT = \
COMPOSE_PROJECT_NAME=$(COMPOSE_PROJECT_NAME_TEST_UNIT) $(DOCKER_COMPOSE)
clean:
-docker rmi ci/$(PROJECT_NAME):$(BRANCH_NAME)-$(BUILD_NUMBER)
-docker rmi us-east1-docker.pkg.dev/overleaf-ops/ol-docker/$(PROJECT_NAME):$(BRANCH_NAME)-$(BUILD_NUMBER)
-$(DOCKER_COMPOSE_TEST_UNIT) down --rmi local
-$(DOCKER_COMPOSE_TEST_ACCEPTANCE) down --rmi local
HERE=$(shell pwd)
MONOREPO=$(shell cd ../../ && pwd)
# Run the linting commands in the scope of the monorepo.
# Eslint and prettier (plus some configs) are on the root.
RUN_LINTING = docker run --rm -v $(MONOREPO):$(MONOREPO) -w $(HERE) node:20.18.2 npm run --silent
RUN_LINTING_CI = docker run --rm --volume $(MONOREPO)/.editorconfig:/overleaf/.editorconfig --volume $(MONOREPO)/.eslintignore:/overleaf/.eslintignore --volume $(MONOREPO)/.eslintrc:/overleaf/.eslintrc --volume $(MONOREPO)/.prettierignore:/overleaf/.prettierignore --volume $(MONOREPO)/.prettierrc:/overleaf/.prettierrc --volume $(MONOREPO)/tsconfig.backend.json:/overleaf/tsconfig.backend.json ci/$(PROJECT_NAME):$(BRANCH_NAME)-$(BUILD_NUMBER) npm run --silent
# Same but from the top of the monorepo
RUN_LINTING_MONOREPO = docker run --rm -v $(MONOREPO):$(MONOREPO) -w $(MONOREPO) node:20.18.2 npm run --silent
SHELLCHECK_OPTS = \
--shell=bash \
--external-sources
SHELLCHECK_COLOR := $(if $(CI),--color=never,--color)
SHELLCHECK_FILES := { git ls-files "*.sh" -z; git grep -Plz "\A\#\!.*bash"; } | sort -zu
shellcheck:
@$(SHELLCHECK_FILES) | xargs -0 -r docker run --rm -v $(HERE):/mnt -w /mnt \
koalaman/shellcheck:stable $(SHELLCHECK_OPTS) $(SHELLCHECK_COLOR)
shellcheck_fix:
@$(SHELLCHECK_FILES) | while IFS= read -r -d '' file; do \
diff=$$(docker run --rm -v $(HERE):/mnt -w /mnt koalaman/shellcheck:stable $(SHELLCHECK_OPTS) --format=diff "$$file" 2>/dev/null); \
if [ -n "$$diff" ] && ! echo "$$diff" | patch -p1 >/dev/null 2>&1; then echo "\033[31m$$file\033[0m"; \
elif [ -n "$$diff" ]; then echo "$$file"; \
else echo "\033[2m$$file\033[0m"; fi \
done
format:
$(RUN_LINTING) format
format_ci:
$(RUN_LINTING_CI) format
format_fix:
$(RUN_LINTING) format:fix
lint:
$(RUN_LINTING) lint
lint_ci:
$(RUN_LINTING_CI) lint
lint_fix:
$(RUN_LINTING) lint:fix
typecheck:
$(RUN_LINTING) types:check
typecheck_ci:
$(RUN_LINTING_CI) types:check
test: format lint typecheck shellcheck test_unit test_acceptance
test_unit:
ifneq (,$(wildcard test/unit))
$(DOCKER_COMPOSE_TEST_UNIT) run --rm test_unit
$(MAKE) test_unit_clean
endif
test_clean: test_unit_clean
test_unit_clean:
ifneq (,$(wildcard test/unit))
$(DOCKER_COMPOSE_TEST_UNIT) down -v -t 0
endif
test_acceptance: test_acceptance_clean test_acceptance_pre_run test_acceptance_run
$(MAKE) test_acceptance_clean
test_acceptance_debug: test_acceptance_clean test_acceptance_pre_run test_acceptance_run_debug
$(MAKE) test_acceptance_clean
test_acceptance_run:
ifneq (,$(wildcard test/acceptance))
$(DOCKER_COMPOSE_TEST_ACCEPTANCE) run --rm test_acceptance
endif
test_acceptance_run_debug:
ifneq (,$(wildcard test/acceptance))
$(DOCKER_COMPOSE_TEST_ACCEPTANCE) run -p 127.0.0.9:19999:19999 --rm test_acceptance npm run test:acceptance -- --inspect=0.0.0.0:19999 --inspect-brk
endif
test_clean: test_acceptance_clean
test_acceptance_clean:
$(DOCKER_COMPOSE_TEST_ACCEPTANCE) down -v -t 0
test_acceptance_pre_run:
ifneq (,$(wildcard test/acceptance/js/scripts/pre-run))
$(DOCKER_COMPOSE_TEST_ACCEPTANCE) run --rm test_acceptance test/acceptance/js/scripts/pre-run
endif
benchmarks:
$(DOCKER_COMPOSE_TEST_ACCEPTANCE) run --rm test_acceptance npm run benchmarks
build:
docker build \
--pull \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--tag ci/$(PROJECT_NAME):$(BRANCH_NAME)-$(BUILD_NUMBER) \
--tag us-east1-docker.pkg.dev/overleaf-ops/ol-docker/$(PROJECT_NAME):$(BRANCH_NAME)-$(BUILD_NUMBER) \
--tag us-east1-docker.pkg.dev/overleaf-ops/ol-docker/$(PROJECT_NAME):$(BRANCH_NAME) \
--cache-from us-east1-docker.pkg.dev/overleaf-ops/ol-docker/$(PROJECT_NAME):$(BRANCH_NAME) \
--cache-from us-east1-docker.pkg.dev/overleaf-ops/ol-docker/$(PROJECT_NAME):main \
--file Dockerfile \
../..
tar:
$(DOCKER_COMPOSE) up tar
publish:
docker push $(DOCKER_REPO)/$(PROJECT_NAME):$(BRANCH_NAME)-$(BUILD_NUMBER)
.PHONY: clean \
format format_fix \
lint lint_fix \
build_types typecheck \
lint_ci format_ci typecheck_ci \
shellcheck shellcheck_fix \
test test_clean test_unit test_unit_clean \
test_acceptance test_acceptance_debug test_acceptance_pre_run \
test_acceptance_run test_acceptance_run_debug test_acceptance_clean \
benchmarks \
build tar publish \


@@ -0,0 +1,51 @@
## Database migrations
The history service uses knex to manage PostgreSQL migrations.
To create a new migration, run:
```
npx knex migrate:make migration_name
```
To apply migrations, run:
```
npx knex migrate:latest
```
For more information, consult the [knex migrations
guide](https://knexjs.org/guide/migrations.html#migration-cli).
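
A freshly generated migration file exports `up` and `down` functions. A minimal sketch (the table and column names here are illustrative only, not part of this service's schema):

```js
// migrations/20250424000000_add_example_table.js (illustrative filename)
exports.up = async function (knex) {
  await knex.schema.createTable('example', function (table) {
    table.increments('id').primary()
    table.timestamp('created_at').defaultTo(knex.fn.now())
  })
}

exports.down = async function (knex) {
  await knex.schema.dropTableIfExists('example')
}
```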
## Global blobs
Global blobs are blobs that are shared between projects. The list of global
blobs is stored in the projectHistoryGlobalBlobs Mongo collection and is read
when the service starts. Changing the list of global blobs needs to be done
carefully.
### Adding a blob to the global blobs list
If we identify a blob that appears in many projects, we might want to move that
blob to the global blobs list.
1. Add a record for the blob to the projectHistoryGlobalBlobs collection.
2. Restart the history service.
3. Delete any corresponding project blobs.
### Removing a blob from the global blobs list
Removing a blob from the global blobs list is trickier. As soon as the global
blob is made unavailable, every project that needs the blob will have to get
its own copy. To avoid disruptions, follow these steps:
1. In the projectHistoryGlobalBlobs collection, set the `demoted` property to
`true` on the global blob to remove. This will make the history system
write new instances of this blob to project blobs, but still read from the
global blob.
2. Restart the history service.
3. Copy the blob to all projects that need it.
4. Remove the blob from the projectHistoryGlobalBlobs collection.
5. Restart the history service.
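
For step 1, a hedged `mongosh` sketch (the exact document shape in `projectHistoryGlobalBlobs` is an assumption here; adjust the filter to however the blob is keyed):

```js
// mongosh sketch; the hash field name is an assumption
db.projectHistoryGlobalBlobs.updateOne(
  { hash: '<blob sha-1 hash>' },
  { $set: { demoted: true } }
)
```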


@@ -0,0 +1,149 @@
'use strict'
const basicAuth = require('basic-auth')
const config = require('config')
const HTTPStatus = require('http-status')
const jwt = require('jsonwebtoken')
const tsscmp = require('tsscmp')
function setupBasicHttpAuthForSwaggerDocs(app) {
app.use('/docs', function (req, res, next) {
if (hasValidBasicAuthCredentials(req)) {
return next()
}
res.header('WWW-Authenticate', 'Basic realm="Application"')
res.status(HTTPStatus.UNAUTHORIZED).end()
})
}
exports.setupBasicHttpAuthForSwaggerDocs = setupBasicHttpAuthForSwaggerDocs
function hasValidBasicAuthCredentials(req) {
const credentials = basicAuth(req)
if (!credentials) return false
// No security in the name, so just use straight comparison.
if (credentials.name !== 'staging') return false
const password = config.get('basicHttpAuth.password')
if (password && tsscmp(credentials.pass, password)) return true
// Support an old password so we can change the password without downtime.
if (config.has('basicHttpAuth.oldPassword')) {
const oldPassword = config.get('basicHttpAuth.oldPassword')
if (oldPassword && tsscmp(credentials.pass, oldPassword)) return true
}
return false
}
function setupSSL(app) {
const httpsOnly = config.get('httpsOnly') === 'true'
if (!httpsOnly) {
return
}
app.enable('trust proxy')
app.use(function (req, res, next) {
if (req.protocol === 'https') {
next()
return
}
if (req.method === 'GET' || req.method === 'HEAD') {
res.redirect('https://' + req.headers.host + req.url)
} else {
res
.status(HTTPStatus.FORBIDDEN)
.send('Please use HTTPS when submitting data to this server.')
}
})
}
exports.setupSSL = setupSSL
function handleJWTAuth(req, authOrSecDef, scopesOrApiKey, next) {
// as a temporary solution, to make the OT demo still work
// this handler will also check for basic authorization
if (hasValidBasicAuthCredentials(req)) {
return next()
}
let token, err
if (authOrSecDef.name === 'token') {
token = req.query.token
} else if (
req.headers.authorization &&
req.headers.authorization.split(' ')[0] === 'Bearer'
) {
token = req.headers.authorization.split(' ')[1]
}
if (!token) {
err = new Error('jwt missing')
err.statusCode = HTTPStatus.UNAUTHORIZED
err.headers = { 'WWW-Authenticate': 'Bearer' }
return next(err)
}
let decoded
try {
decoded = decodeJWT(token)
} catch (error) {
if (
error instanceof jwt.JsonWebTokenError ||
error instanceof jwt.TokenExpiredError
) {
err = new Error(error.message)
err.statusCode = HTTPStatus.UNAUTHORIZED
err.headers = { 'WWW-Authenticate': 'Bearer error="invalid_token"' }
return next(err)
}
throw error
}
if (decoded.project_id.toString() !== req.swagger.params.project_id.value) {
err = new Error('Wrong project_id')
err.statusCode = HTTPStatus.FORBIDDEN
return next(err)
}
next()
}
exports.hasValidBasicAuthCredentials = hasValidBasicAuthCredentials
/**
* Verify and decode the given JSON Web Token
*/
function decodeJWT(token) {
const key = config.get('jwtAuth.key')
const algorithm = config.get('jwtAuth.algorithm')
try {
return jwt.verify(token, key, { algorithms: [algorithm] })
} catch (err) {
// Support an old key so we can change the key without downtime.
if (config.has('jwtAuth.oldKey')) {
const oldKey = config.get('jwtAuth.oldKey')
return jwt.verify(token, oldKey, { algorithms: [algorithm] })
} else {
throw err
}
}
}
function handleBasicAuth(req, authOrSecDef, scopesOrApiKey, next) {
if (hasValidBasicAuthCredentials(req)) {
return next()
}
const error = new Error()
error.statusCode = HTTPStatus.UNAUTHORIZED
error.headers = { 'WWW-Authenticate': 'Basic realm="Application"' }
return next(error)
}
function getSwaggerHandlers() {
const handlers = {}
if (!config.has('jwtAuth.key') || !config.has('basicHttpAuth.password')) {
throw new Error('missing authentication env vars')
}
handlers.jwt = handleJWTAuth
handlers.basic = handleBasicAuth
handlers.token = handleJWTAuth
return handlers
}
exports.getSwaggerHandlers = getSwaggerHandlers
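
For reference, a token that handleJWTAuth above will accept can be minted with the same jsonwebtoken library. A minimal sketch, assuming the configured jwtAuth.key and jwtAuth.algorithm are available and that project_id is the only claim the handler checks:

const jwt = require('jsonwebtoken')
const config = require('config')

// Sketch: sign a token that decodeJWT() will verify for the given project.
function signProjectToken(projectId) {
  return jwt.sign({ project_id: projectId }, config.get('jwtAuth.key'), {
    algorithm: config.get('jwtAuth.algorithm'),
    expiresIn: '1h', // expiry chosen for illustration
  })
}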


@@ -0,0 +1,10 @@
/**
* Turn an async function into an Express middleware
*/
function expressify(fn) {
  return (req, res, next) => {
    fn(req, res, next).catch(next)
  }
}
module.exports = expressify
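
A minimal usage sketch: wrap an async route handler so that rejections are forwarded to the Express error middleware (the route and handler below are illustrative):

const express = require('express')
const expressify = require('./expressify')

const app = express()

// Any rejection inside the async handler is passed to next()
app.get(
  '/example',
  expressify(async (req, res) => {
    const payload = await Promise.resolve({ ok: true }) // placeholder async work
    res.json(payload)
  })
)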


@@ -0,0 +1,23 @@
const logger = require('@overleaf/logger')
const expressify = require('./expressify')
const { mongodb } = require('../../storage')
async function status(req, res) {
  try {
    await mongodb.db.command({ ping: 1 })
  } catch (err) {
    logger.warn({ err }, 'Lost connection with MongoDB')
    res.status(500).send('Lost connection with MongoDB')
    return
  }
  res.send('history-v1 is up')
}

function healthCheck(req, res) {
  res.send('OK')
}

module.exports = {
  status: expressify(status),
  healthCheck,
}


@@ -0,0 +1,141 @@
// @ts-check
'use strict'
const { expressify } = require('@overleaf/promise-utils')
const HTTPStatus = require('http-status')
const core = require('overleaf-editor-core')
const Change = core.Change
const Chunk = core.Chunk
const File = core.File
const FileMap = core.FileMap
const Snapshot = core.Snapshot
const TextOperation = core.TextOperation
const logger = require('@overleaf/logger')
const storage = require('../../storage')
const BatchBlobStore = storage.BatchBlobStore
const BlobStore = storage.BlobStore
const chunkStore = storage.chunkStore
const HashCheckBlobStore = storage.HashCheckBlobStore
const persistChanges = storage.persistChanges
const InvalidChangeError = storage.InvalidChangeError
const render = require('./render')
async function importSnapshot(req, res) {
const projectId = req.swagger.params.project_id.value
const rawSnapshot = req.swagger.params.snapshot.value
let snapshot
try {
snapshot = Snapshot.fromRaw(rawSnapshot)
} catch (err) {
return render.unprocessableEntity(res)
}
let historyId
try {
historyId = await chunkStore.initializeProject(projectId, snapshot)
} catch (err) {
if (err instanceof chunkStore.AlreadyInitialized) {
return render.conflict(res)
} else {
throw err
}
}
res.status(HTTPStatus.OK).json({ projectId: historyId })
}
async function importChanges(req, res, next) {
const projectId = req.swagger.params.project_id.value
const rawChanges = req.swagger.params.changes.value
const endVersion = req.swagger.params.end_version.value
const returnSnapshot = req.swagger.params.return_snapshot.value || 'none'
let changes
try {
changes = rawChanges.map(Change.fromRaw)
} catch (err) {
logger.warn({ err, projectId }, 'failed to parse changes')
return render.unprocessableEntity(res)
}
// Set limits to force us to persist all of the changes.
const farFuture = new Date()
farFuture.setTime(farFuture.getTime() + 7 * 24 * 3600 * 1000)
const limits = {
maxChanges: 0,
minChangeTimestamp: farFuture,
maxChangeTimestamp: farFuture,
}
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
const hashCheckBlobStore = new HashCheckBlobStore(blobStore)
async function loadFiles() {
const blobHashes = new Set()
for (const change of changes) {
// This populates the set blobHashes with blobs referred to in the change
change.findBlobHashes(blobHashes)
}
await batchBlobStore.preload(Array.from(blobHashes))
for (const change of changes) {
await change.loadFiles('lazy', batchBlobStore)
}
}
async function buildResultSnapshot(resultChunk) {
const chunk = resultChunk || (await chunkStore.loadLatest(projectId))
const snapshot = chunk.getSnapshot()
snapshot.applyAll(chunk.getChanges())
const rawSnapshot = await snapshot.store(hashCheckBlobStore)
return rawSnapshot
}
await loadFiles()
let result
try {
result = await persistChanges(projectId, changes, limits, endVersion)
} catch (err) {
if (
err instanceof Chunk.ConflictingEndVersion ||
err instanceof TextOperation.UnprocessableError ||
err instanceof File.NotEditableError ||
err instanceof FileMap.PathnameError ||
err instanceof Snapshot.EditMissingFileError ||
err instanceof chunkStore.ChunkVersionConflictError ||
err instanceof InvalidChangeError
) {
// If we failed to apply operations, that's probably because they were
// invalid.
logger.warn({ err, projectId, endVersion }, 'changes rejected by history')
return render.unprocessableEntity(res)
} else if (err instanceof Chunk.NotFoundError) {
logger.warn({ err, projectId }, 'chunk not found')
return render.notFound(res)
} else {
throw err
}
}
if (returnSnapshot === 'none') {
res.status(HTTPStatus.CREATED).json({})
} else {
const rawSnapshot = await buildResultSnapshot(result && result.currentChunk)
res.status(HTTPStatus.CREATED).json(rawSnapshot)
}
}
exports.importSnapshot = expressify(importSnapshot)
exports.importChanges = expressify(importChanges)


@@ -0,0 +1,388 @@
'use strict'
const _ = require('lodash')
const Path = require('node:path')
const Stream = require('node:stream')
const HTTPStatus = require('http-status')
const fs = require('node:fs')
const { promisify } = require('node:util')
const config = require('config')
const OError = require('@overleaf/o-error')
const logger = require('@overleaf/logger')
const { Chunk, ChunkResponse, Blob } = require('overleaf-editor-core')
const {
BlobStore,
blobHash,
chunkStore,
HashCheckBlobStore,
ProjectArchive,
zipStore,
chunkBuffer,
} = require('../../storage')
const render = require('./render')
const expressify = require('./expressify')
const withTmpDir = require('./with_tmp_dir')
const StreamSizeLimit = require('./stream_size_limit')
const pipeline = promisify(Stream.pipeline)
async function initializeProject(req, res, next) {
let projectId = req.swagger.params.body.value.projectId
try {
projectId = await chunkStore.initializeProject(projectId)
res.status(HTTPStatus.OK).json({ projectId })
} catch (err) {
if (err instanceof chunkStore.AlreadyInitialized) {
render.conflict(res)
} else {
throw err
}
}
}
async function getLatestContent(req, res, next) {
const projectId = req.swagger.params.project_id.value
const blobStore = new BlobStore(projectId)
const chunk = await chunkBuffer.loadLatest(projectId)
const snapshot = chunk.getSnapshot()
snapshot.applyAll(chunk.getChanges())
await snapshot.loadFiles('eager', blobStore)
res.json(snapshot.toRaw())
}
async function getContentAtVersion(req, res, next) {
const projectId = req.swagger.params.project_id.value
const version = req.swagger.params.version.value
const blobStore = new BlobStore(projectId)
const snapshot = await getSnapshotAtVersion(projectId, version)
await snapshot.loadFiles('eager', blobStore)
res.json(snapshot.toRaw())
}
async function getLatestHashedContent(req, res, next) {
const projectId = req.swagger.params.project_id.value
const blobStore = new HashCheckBlobStore(new BlobStore(projectId))
const chunk = await chunkBuffer.loadLatest(projectId)
const snapshot = chunk.getSnapshot()
snapshot.applyAll(chunk.getChanges())
await snapshot.loadFiles('eager', blobStore)
const rawSnapshot = await snapshot.store(blobStore)
res.json(rawSnapshot)
}
async function getLatestHistory(req, res, next) {
const projectId = req.swagger.params.project_id.value
try {
const chunk = await chunkBuffer.loadLatest(projectId)
const chunkResponse = new ChunkResponse(chunk)
res.json(chunkResponse.toRaw())
} catch (err) {
if (err instanceof Chunk.NotFoundError) {
render.notFound(res)
} else {
throw err
}
}
}
async function getLatestHistoryRaw(req, res, next) {
const projectId = req.swagger.params.project_id.value
const readOnly = req.swagger.params.readOnly.value
try {
const { startVersion, endVersion, endTimestamp } =
await chunkStore.loadLatestRaw(projectId, { readOnly })
res.json({
startVersion,
endVersion,
endTimestamp,
})
} catch (err) {
if (err instanceof Chunk.NotFoundError) {
render.notFound(res)
} else {
throw err
}
}
}
async function getHistory(req, res, next) {
const projectId = req.swagger.params.project_id.value
const version = req.swagger.params.version.value
try {
const chunk = await chunkStore.loadAtVersion(projectId, version)
const chunkResponse = new ChunkResponse(chunk)
res.json(chunkResponse.toRaw())
} catch (err) {
if (err instanceof Chunk.NotFoundError) {
render.notFound(res)
} else {
throw err
}
}
}
async function getHistoryBefore(req, res, next) {
const projectId = req.swagger.params.project_id.value
const timestamp = req.swagger.params.timestamp.value
try {
const chunk = await chunkStore.loadAtTimestamp(projectId, timestamp)
const chunkResponse = new ChunkResponse(chunk)
res.json(chunkResponse.toRaw())
} catch (err) {
if (err instanceof Chunk.NotFoundError) {
render.notFound(res)
} else {
throw err
}
}
}
/**
* Get all changes since the beginning of history or since a given version
*/
async function getChanges(req, res, next) {
const projectId = req.swagger.params.project_id.value
const since = req.swagger.params.since.value ?? 0
if (since < 0) {
// Negative values would cause an infinite loop
return res.status(400).json({
error: `Version out of bounds: ${since}`,
})
}
const changes = []
let chunk = await chunkBuffer.loadLatest(projectId)
if (since > chunk.getEndVersion()) {
return res.status(400).json({
error: `Version out of bounds: ${since}`,
})
}
// Fetch all chunks that come after the chunk that contains the start version
while (chunk.getStartVersion() > since) {
const changesInChunk = chunk.getChanges()
changes.unshift(...changesInChunk)
chunk = await chunkStore.loadAtVersion(projectId, chunk.getStartVersion())
}
// Extract the relevant changes from the chunk that contains the start version
const changesInChunk = chunk
.getChanges()
.slice(since - chunk.getStartVersion())
changes.unshift(...changesInChunk)
res.json(changes.map(change => change.toRaw()))
}
async function getZip(req, res, next) {
const projectId = req.swagger.params.project_id.value
const version = req.swagger.params.version.value
const blobStore = new BlobStore(projectId)
let snapshot
try {
snapshot = await getSnapshotAtVersion(projectId, version)
} catch (err) {
if (err instanceof Chunk.NotFoundError) {
return render.notFound(res)
} else {
throw err
}
}
await withTmpDir('get-zip-', async tmpDir => {
const tmpFilename = Path.join(tmpDir, 'project.zip')
const archive = new ProjectArchive(snapshot)
await archive.writeZip(blobStore, tmpFilename)
res.set('Content-Type', 'application/octet-stream')
res.set('Content-Disposition', 'attachment; filename=project.zip')
const stream = fs.createReadStream(tmpFilename)
await pipeline(stream, res)
})
}
async function createZip(req, res, next) {
const projectId = req.swagger.params.project_id.value
const version = req.swagger.params.version.value
try {
const snapshot = await getSnapshotAtVersion(projectId, version)
const zipUrl = await zipStore.getSignedUrl(projectId, version)
// Do not await this; run it in the background.
zipStore.storeZip(projectId, version, snapshot).catch(err => {
logger.error({ err, projectId, version }, 'createZip: storeZip failed')
})
res.status(HTTPStatus.OK).json({ zipUrl })
} catch (error) {
if (error instanceof Chunk.NotFoundError) {
render.notFound(res)
} else {
next(error)
}
}
}
async function deleteProject(req, res, next) {
const projectId = req.swagger.params.project_id.value
const blobStore = new BlobStore(projectId)
await Promise.all([
chunkStore.deleteProjectChunks(projectId),
blobStore.deleteBlobs(),
])
res.status(HTTPStatus.NO_CONTENT).send()
}
async function createProjectBlob(req, res, next) {
const projectId = req.swagger.params.project_id.value
const expectedHash = req.swagger.params.hash.value
const maxUploadSize = parseInt(config.get('maxFileUploadSize'), 10)
await withTmpDir('blob-', async tmpDir => {
const tmpPath = Path.join(tmpDir, 'content')
const sizeLimit = new StreamSizeLimit(maxUploadSize)
await pipeline(req, sizeLimit, fs.createWriteStream(tmpPath))
if (sizeLimit.sizeLimitExceeded) {
return render.requestEntityTooLarge(res)
}
const hash = await blobHash.fromFile(tmpPath)
if (hash !== expectedHash) {
logger.debug({ hash, expectedHash }, 'Hash mismatch')
return render.conflict(res, 'File hash mismatch')
}
const blobStore = new BlobStore(projectId)
const newBlob = await blobStore.putFile(tmpPath)
try {
const { backupBlob } = await import('../../storage/lib/backupBlob.mjs')
await backupBlob(projectId, newBlob, tmpPath)
} catch (error) {
logger.warn({ error, projectId, hash }, 'Failed to backup blob')
}
res.status(HTTPStatus.CREATED).end()
})
}
async function headProjectBlob(req, res) {
const projectId = req.swagger.params.project_id.value
const hash = req.swagger.params.hash.value
const blobStore = new BlobStore(projectId)
const blob = await blobStore.getBlob(hash)
if (blob) {
res.set('Content-Length', blob.getByteLength())
res.status(200).end()
} else {
res.status(404).end()
}
}
// Support simple, singular ranges starting from zero only, up-to 2MB = 2_000_000, 7 digits
const RANGE_HEADER = /^bytes=0-(\d{1,7})$/
/**
* @param {string} header
* @return {{}|{start: number, end: number}}
* @private
*/
function _getRangeOpts(header) {
if (!header) return {}
const match = header.match(RANGE_HEADER)
if (match) {
const end = parseInt(match[1], 10)
return { start: 0, end }
}
return {}
}
async function getProjectBlob(req, res, next) {
const projectId = req.swagger.params.project_id.value
const hash = req.swagger.params.hash.value
const opts = _getRangeOpts(req.swagger.params.range.value || '')
const blobStore = new BlobStore(projectId)
logger.debug({ projectId, hash }, 'getProjectBlob started')
try {
let stream
try {
stream = await blobStore.getStream(hash, opts)
} catch (err) {
if (err instanceof Blob.NotFoundError) {
logger.warn({ projectId, hash }, 'Blob not found')
return res.status(404).end()
} else {
throw err
}
}
res.set('Content-Type', 'application/octet-stream')
try {
await pipeline(stream, res)
} catch (err) {
if (err?.code === 'ERR_STREAM_PREMATURE_CLOSE') {
res.end()
} else {
throw OError.tag(err, 'error transferring stream', { projectId, hash })
}
}
} finally {
logger.debug({ projectId, hash }, 'getProjectBlob finished')
}
}
async function copyProjectBlob(req, res, next) {
const sourceProjectId = req.swagger.params.copyFrom.value
const targetProjectId = req.swagger.params.project_id.value
const blobHash = req.swagger.params.hash.value
// Check that blob exists in source project
const sourceBlobStore = new BlobStore(sourceProjectId)
const targetBlobStore = new BlobStore(targetProjectId)
const [sourceBlob, targetBlob] = await Promise.all([
sourceBlobStore.getBlob(blobHash),
targetBlobStore.getBlob(blobHash),
])
if (!sourceBlob) {
return render.notFound(res)
}
// Exit early if the blob exists in the target project.
// This will also catch global blobs, which always exist.
if (targetBlob) {
return res.status(HTTPStatus.NO_CONTENT).end()
}
// Otherwise, copy blob from source project to target project
await sourceBlobStore.copyBlob(sourceBlob, targetProjectId)
res.status(HTTPStatus.CREATED).end()
}
async function getSnapshotAtVersion(projectId, version) {
const chunk = await chunkStore.loadAtVersion(projectId, version)
const snapshot = chunk.getSnapshot()
const changes = _.dropRight(
chunk.getChanges(),
chunk.getEndVersion() - version
)
snapshot.applyAll(changes)
return snapshot
}
module.exports = {
initializeProject: expressify(initializeProject),
getLatestContent: expressify(getLatestContent),
getContentAtVersion: expressify(getContentAtVersion),
getLatestHashedContent: expressify(getLatestHashedContent),
getLatestPersistedHistory: expressify(getLatestHistory),
getLatestHistory: expressify(getLatestHistory),
getLatestHistoryRaw: expressify(getLatestHistoryRaw),
getHistory: expressify(getHistory),
getHistoryBefore: expressify(getHistoryBefore),
getChanges: expressify(getChanges),
getZip: expressify(getZip),
createZip: expressify(createZip),
deleteProject: expressify(deleteProject),
createProjectBlob: expressify(createProjectBlob),
getProjectBlob: expressify(getProjectBlob),
headProjectBlob: expressify(headProjectBlob),
copyProjectBlob: expressify(copyProjectBlob),
}


@@ -0,0 +1,17 @@
'use strict'
const HTTPStatus = require('http-status')
function makeErrorRenderer(status) {
  return (res, message) => {
    res.status(status).json({ message: message || HTTPStatus[status] })
  }
}

module.exports = {
  badRequest: makeErrorRenderer(HTTPStatus.BAD_REQUEST),
  notFound: makeErrorRenderer(HTTPStatus.NOT_FOUND),
  unprocessableEntity: makeErrorRenderer(HTTPStatus.UNPROCESSABLE_ENTITY),
  conflict: makeErrorRenderer(HTTPStatus.CONFLICT),
  requestEntityTooLarge: makeErrorRenderer(HTTPStatus.REQUEST_ENTITY_TOO_LARGE),
}


@@ -0,0 +1,26 @@
const stream = require('node:stream')
/**
* Transform stream that stops passing bytes through after some threshold has
* been reached.
*/
class StreamSizeLimit extends stream.Transform {
  constructor(maxSize) {
    super()
    this.maxSize = maxSize
    this.accumulatedSize = 0
    this.sizeLimitExceeded = false
  }

  _transform(chunk, encoding, cb) {
    this.accumulatedSize += chunk.length
    if (this.accumulatedSize > this.maxSize) {
      this.sizeLimitExceeded = true
    } else {
      this.push(chunk)
    }
    cb()
  }
}
module.exports = StreamSizeLimit
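
A minimal sketch of the intended use (mirroring createProjectBlob in the projects controller): pipe an incoming stream through the limiter into a file, then check the flag. The helper name and paths are illustrative:

const fs = require('node:fs')
const { pipeline } = require('node:stream/promises')
const StreamSizeLimit = require('./stream_size_limit')

// Returns true if the whole stream fit within maxSize.
async function writeWithLimit(sourceStream, tmpPath, maxSize) {
  const sizeLimit = new StreamSizeLimit(maxSize)
  await pipeline(sourceStream, sizeLimit, fs.createWriteStream(tmpPath))
  return !sizeLimit.sizeLimitExceeded
}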


@@ -0,0 +1,27 @@
const fs = require('node:fs')
const fsExtra = require('fs-extra')
const logger = require('@overleaf/logger')
const os = require('node:os')
const path = require('node:path')
/**
* Create a temporary directory before executing a function and cleaning up
* after.
*
* @param {string} prefix - prefix for the temporary directory name
* @param {Function} fn - async function to call
*/
async function withTmpDir(prefix, fn) {
  const tmpDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), prefix))
  try {
    await fn(tmpDir)
  } finally {
    fsExtra.remove(tmpDir).catch(err => {
      if (err.code !== 'ENOENT') {
        logger.error({ err }, 'failed to delete temporary file')
      }
    })
  }
}
module.exports = withTmpDir
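
A short usage sketch: the callback receives the temporary directory path, and the directory is removed once the callback settles (the file name below is illustrative):

const fs = require('node:fs')
const path = require('node:path')
const withTmpDir = require('./with_tmp_dir')

async function demo() {
  await withTmpDir('example-', async tmpDir => {
    const scratchFile = path.join(tmpDir, 'scratch.txt')
    await fs.promises.writeFile(scratchFile, 'temporary data')
    // tmpDir is cleaned up after this callback resolves or rejects
  })
}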


@@ -0,0 +1,269 @@
'use strict'
const _ = require('lodash')
const paths = _.reduce(
[require('./projects').paths, require('./project_import').paths],
_.extend
)
const securityDefinitions = require('./security_definitions')
module.exports = {
swagger: '2.0',
info: {
title: 'Overleaf Editor API',
description: 'API for the Overleaf editor.',
version: '1.0',
},
produces: ['application/json'],
basePath: '/api',
paths,
securityDefinitions,
security: [
{
jwt: [],
},
],
definitions: {
Project: {
properties: {
projectId: {
type: 'string',
},
},
required: ['projectId'],
},
File: {
properties: {
hash: {
type: 'string',
},
byteLength: {
type: 'integer',
},
stringLength: {
type: 'integer',
},
},
},
Label: {
properties: {
authorId: {
type: 'integer',
},
text: {
type: 'string',
},
timestamp: {
type: 'string',
},
version: {
type: 'integer',
},
},
},
Chunk: {
properties: {
history: {
$ref: '#/definitions/History',
},
startVersion: {
type: 'number',
},
},
},
ChunkResponse: {
properties: {
chunk: {
$ref: '#/definitions/Chunk',
},
authors: {
type: 'array',
items: {
$ref: '#/definitions/Author',
},
},
},
},
ChunkResponseRaw: {
properties: {
startVersion: {
type: 'number',
},
endVersion: {
type: 'number',
},
endTimestamp: {
type: 'string',
},
},
},
History: {
properties: {
snapshot: {
$ref: '#/definitions/Snapshot',
},
changes: {
type: 'array',
items: {
$ref: '#/definitions/Change',
},
},
},
},
Snapshot: {
properties: {
files: {
type: 'object',
additionalProperties: {
$ref: '#/definitions/File',
},
},
},
required: ['files'],
},
Change: {
properties: {
timestamp: {
type: 'string',
},
operations: {
type: 'array',
items: {
$ref: '#/definitions/Operation',
},
},
authors: {
type: 'array',
items: {
type: ['integer', 'null'],
},
},
v2Authors: {
type: 'array',
items: {
type: ['string', 'null'],
},
},
projectVersion: {
type: 'string',
},
v2DocVersions: {
type: 'object',
additionalProperties: {
$ref: '#/definitions/V2DocVersions',
},
},
},
required: ['timestamp', 'operations'],
},
V2DocVersions: {
properties: {
pathname: {
type: 'string',
},
v: {
type: 'integer',
},
},
},
ChangeRequest: {
properties: {
baseVersion: {
type: 'integer',
},
untransformable: {
type: 'boolean',
},
operations: {
type: 'array',
items: {
$ref: '#/definitions/Operation',
},
},
authors: {
type: 'array',
items: {
type: ['integer', 'null'],
},
},
},
required: ['baseVersion', 'operations'],
},
ChangeNote: {
properties: {
baseVersion: {
type: 'integer',
},
change: {
$ref: '#/definitions/Change',
},
},
required: ['baseVersion'],
},
Operation: {
properties: {
pathname: {
type: 'string',
},
newPathname: {
type: 'string',
},
blob: {
$ref: '#/definitions/Blob',
},
textOperation: {
type: 'array',
items: {},
},
file: {
$ref: '#/definitions/File',
},
},
},
Error: {
properties: {
message: {
type: 'string',
},
},
required: ['message'],
},
Blob: {
properties: {
hash: {
type: 'string',
},
},
required: ['hash'],
},
Author: {
properties: {
id: {
type: 'integer',
},
email: {
type: 'string',
},
name: {
type: 'string',
},
},
required: ['id', 'email', 'name'],
},
SyncState: {
properties: {
synced: {
type: 'boolean',
},
},
},
ZipInfo: {
properties: {
zipUrl: {
type: 'string',
},
},
required: ['zipUrl'],
},
},
}


@@ -0,0 +1,147 @@
'use strict'
const importSnapshot = {
'x-swagger-router-controller': 'project_import',
operationId: 'importSnapshot',
tags: ['ProjectImport'],
description: 'Import a snapshot from the current rails app.',
consumes: ['application/json'],
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'snapshot',
in: 'body',
description: 'Snapshot to import.',
required: true,
schema: {
$ref: '#/definitions/Snapshot',
},
},
],
responses: {
200: {
description: 'Imported',
},
409: {
description: 'Conflict: project already initialized',
},
404: {
description: 'No such project exists',
},
},
security: [
{
basic: [],
},
],
}
const importChanges = {
'x-swagger-router-controller': 'project_import',
operationId: 'importChanges',
tags: ['ProjectImport'],
description: 'Import changes for a project from the current rails app.',
consumes: ['application/json'],
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'end_version',
description: 'end_version of latest persisted chunk',
in: 'query',
required: true,
type: 'number',
},
{
name: 'return_snapshot',
description:
'optionally, return a snapshot with the latest hashed content',
in: 'query',
required: false,
type: 'string',
enum: ['hashed', 'none'],
},
{
name: 'changes',
in: 'body',
description: 'changes to be imported',
required: true,
schema: {
type: 'array',
items: {
$ref: '#/definitions/Change',
},
},
},
],
responses: {
201: {
description: 'Created',
schema: {
$ref: '#/definitions/Snapshot',
},
},
},
security: [
{
basic: [],
},
],
}
const getChanges = {
'x-swagger-router-controller': 'projects',
operationId: 'getChanges',
tags: ['Project'],
description: 'Get changes applied to a project',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'since',
in: 'query',
description: 'start version',
required: false,
type: 'number',
},
],
responses: {
200: {
description: 'Success',
schema: {
type: 'array',
items: {
$ref: '#/definitions/Change',
},
},
},
},
security: [
{
basic: [],
},
],
}
exports.paths = {
'/projects/{project_id}/import': { post: importSnapshot },
'/projects/{project_id}/legacy_import': { post: importSnapshot },
'/projects/{project_id}/changes': { get: getChanges, post: importChanges },
'/projects/{project_id}/legacy_changes': { post: importChanges },
}


@@ -0,0 +1,588 @@
'use strict'
const Blob = require('overleaf-editor-core').Blob
exports.paths = {
'/projects': {
post: {
'x-swagger-router-controller': 'projects',
operationId: 'initializeProject',
tags: ['Project'],
description: 'Initialize project.',
consumes: ['application/json'],
parameters: [
{
name: 'body',
in: 'body',
schema: {
type: 'object',
properties: {
projectId: { type: 'string' },
},
},
},
],
responses: {
200: {
description: 'Initialized',
schema: {
$ref: '#/definitions/Project',
},
},
},
security: [
{
basic: [],
},
],
},
},
'/projects/{project_id}': {
delete: {
'x-swagger-router-controller': 'projects',
operationId: 'deleteProject',
tags: ['Project'],
description: "Delete a project's history",
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
],
responses: {
204: {
description: 'Success',
},
},
security: [
{
basic: [],
},
],
},
},
'/projects/{project_id}/blobs/{hash}': {
get: {
'x-swagger-router-controller': 'projects',
operationId: 'getProjectBlob',
tags: ['Project'],
description: 'Fetch blob content by its project id and hash.',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'hash',
in: 'path',
description: 'Hexadecimal SHA-1 hash',
required: true,
type: 'string',
pattern: Blob.HEX_HASH_RX_STRING,
},
{
name: 'range',
in: 'header',
description: 'HTTP Range header',
required: false,
type: 'string',
},
],
produces: ['application/octet-stream'],
responses: {
200: {
description: 'Success',
schema: {
type: 'file',
},
},
404: {
description: 'Not Found',
schema: {
$ref: '#/definitions/Error',
},
},
},
security: [{ jwt: [] }, { token: [] }],
},
head: {
'x-swagger-router-controller': 'projects',
operationId: 'headProjectBlob',
tags: ['Project'],
description: 'Fetch blob content-length by its project id and hash.',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'hash',
in: 'path',
description: 'Hexadecimal SHA-1 hash',
required: true,
type: 'string',
pattern: Blob.HEX_HASH_RX_STRING,
},
],
produces: ['application/octet-stream'],
responses: {
200: {
description: 'Success',
schema: {
type: 'file',
},
},
404: {
description: 'Not Found',
schema: {
$ref: '#/definitions/Error',
},
},
},
security: [{ jwt: [] }, { token: [] }],
},
put: {
'x-swagger-router-controller': 'projects',
operationId: 'createProjectBlob',
tags: ['Project'],
description:
'Create blob to be used in a file addition operation when importing a' +
' snapshot or changes',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'hash',
in: 'path',
description: 'Hexadecimal SHA-1 hash',
required: true,
type: 'string',
pattern: Blob.HEX_HASH_RX_STRING,
},
],
responses: {
201: {
description: 'Created',
},
},
},
post: {
'x-swagger-router-controller': 'projects',
operationId: 'copyProjectBlob',
tags: ['Project'],
description:
'Copies a blob from a source project to a target project when duplicating a project',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'target project id',
required: true,
type: 'string',
},
{
name: 'hash',
in: 'path',
description: 'Hexadecimal SHA-1 hash',
required: true,
type: 'string',
pattern: Blob.HEX_HASH_RX_STRING,
},
{
name: 'copyFrom',
in: 'query',
description: 'source project id',
required: true,
type: 'string',
},
],
responses: {
201: {
description: 'Created',
},
},
},
},
'/projects/{project_id}/latest/content': {
get: {
'x-swagger-router-controller': 'projects',
operationId: 'getLatestContent',
tags: ['Project'],
description:
'Get full content of the latest version. Text file ' +
'content is included, but binary files are just linked by hash.',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
],
responses: {
200: {
description: 'Success',
schema: {
$ref: '#/definitions/Snapshot',
},
},
404: {
description: 'Not Found',
schema: {
$ref: '#/definitions/Error',
},
},
},
},
},
'/projects/{project_id}/latest/hashed_content': {
get: {
'x-swagger-router-controller': 'projects',
operationId: 'getLatestHashedContent',
tags: ['Project'],
description:
'Get a snapshot of a project at the latest version ' +
'with the hashes for the contents of each file',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
],
responses: {
200: {
description: 'Success',
schema: {
$ref: '#/definitions/Snapshot',
},
},
404: {
description: 'Not Found',
schema: {
$ref: '#/definitions/Error',
},
},
},
security: [
{
basic: [],
},
],
},
},
'/projects/{project_id}/latest/history': {
get: {
'x-swagger-router-controller': 'projects',
operationId: 'getLatestHistory',
tags: ['Project'],
description:
'Get the latest sequence of changes.' +
' TODO probably want a configurable depth.',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
],
responses: {
200: {
description: 'Success',
schema: {
$ref: '#/definitions/ChunkResponse',
},
},
404: {
description: 'Not Found',
schema: {
$ref: '#/definitions/Error',
},
},
},
},
},
'/projects/{project_id}/latest/history/raw': {
get: {
'x-swagger-router-controller': 'projects',
operationId: 'getLatestHistoryRaw',
tags: ['Project'],
description: 'Get the metadata of latest sequence of changes.',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'readOnly',
in: 'query',
description: 'use read only database connection',
required: false,
type: 'boolean',
},
],
responses: {
200: {
description: 'Success',
schema: {
$ref: '#/definitions/ChunkResponseRaw',
},
},
404: {
description: 'Not Found',
schema: {
$ref: '#/definitions/Error',
},
},
},
},
},
'/projects/{project_id}/latest/persistedHistory': {
get: {
'x-swagger-router-controller': 'projects',
operationId: 'getLatestPersistedHistory',
tags: ['Project'],
description: 'Get the latest sequence of changes.',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
],
responses: {
200: {
description: 'Success',
schema: {
$ref: '#/definitions/ChunkResponse',
},
},
404: {
description: 'Not Found',
schema: {
$ref: '#/definitions/Error',
},
},
},
},
},
'/projects/{project_id}/versions/{version}/history': {
get: {
'x-swagger-router-controller': 'projects',
operationId: 'getHistory',
tags: ['Project'],
description:
'Get the sequence of changes that includes the given version.',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'version',
in: 'path',
description: 'numeric version',
required: true,
type: 'number',
},
],
responses: {
200: {
description: 'Success',
schema: {
$ref: '#/definitions/ChunkResponse',
},
},
404: {
description: 'Not Found',
schema: {
$ref: '#/definitions/Error',
},
},
},
},
},
'/projects/{project_id}/versions/{version}/content': {
get: {
'x-swagger-router-controller': 'projects',
operationId: 'getContentAtVersion',
tags: ['Project'],
description: 'Get full content at the given version',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'version',
in: 'path',
description: 'numeric version',
required: true,
type: 'number',
},
],
responses: {
200: {
description: 'Success',
schema: {
$ref: '#/definitions/Snapshot',
},
},
404: {
description: 'Not Found',
schema: {
$ref: '#/definitions/Error',
},
},
},
},
},
'/projects/{project_id}/timestamp/{timestamp}/history': {
get: {
'x-swagger-router-controller': 'projects',
operationId: 'getHistoryBefore',
tags: ['Project'],
description:
'Get the sequence of changes before the given timestamp.',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'timestamp',
in: 'path',
description: 'timestamp',
required: true,
type: 'string',
format: 'date-time',
},
],
responses: {
200: {
description: 'Success',
schema: {
$ref: '#/definitions/ChunkResponse',
},
},
404: {
description: 'Not Found',
schema: {
$ref: '#/definitions/Error',
},
},
},
},
},
'/projects/{project_id}/version/{version}/zip': {
get: {
'x-swagger-router-controller': 'projects',
operationId: 'getZip',
tags: ['Project'],
description: 'Download zip with project content',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'version',
in: 'path',
description: 'numeric version',
required: true,
type: 'number',
},
],
produces: ['application/octet-stream'],
responses: {
200: {
description: 'success',
},
404: {
description: 'not found',
},
},
security: [
{
token: [],
},
],
},
post: {
'x-swagger-router-controller': 'projects',
operationId: 'createZip',
tags: ['Project'],
description:
'Create a zip file with project content. Returns a link to be polled.',
parameters: [
{
name: 'project_id',
in: 'path',
description: 'project id',
required: true,
type: 'string',
},
{
name: 'version',
in: 'path',
description: 'numeric version',
required: true,
type: 'number',
},
],
responses: {
200: {
description: 'success',
schema: {
$ref: '#/definitions/ZipInfo',
},
},
404: {
description: 'not found',
},
},
security: [
{
basic: [],
},
],
},
},
}


@@ -0,0 +1,17 @@
'use strict'
module.exports = {
  jwt: {
    type: 'apiKey',
    in: 'header',
    name: 'authorization',
  },
  basic: {
    type: 'basic',
  },
  token: {
    type: 'apiKey',
    in: 'query',
    name: 'token',
  },
}

services/history-v1/app.js

@@ -0,0 +1,172 @@
'use strict'
/* eslint-disable no-console */
// Metrics must be initialized before importing anything else
require('@overleaf/metrics/initialize')
const config = require('config')
const Events = require('node:events')
const BPromise = require('bluebird')
const express = require('express')
const helmet = require('helmet')
const HTTPStatus = require('http-status')
const logger = require('@overleaf/logger')
const Metrics = require('@overleaf/metrics')
const bodyParser = require('body-parser')
const swaggerTools = require('swagger-tools')
const swaggerDoc = require('./api/swagger')
const security = require('./api/app/security')
const healthChecks = require('./api/controllers/health_checks')
const { mongodb, loadGlobalBlobs } = require('./storage')
const path = require('node:path')
Events.setMaxListeners(20)
const app = express()
module.exports = app
logger.initialize('history-v1')
Metrics.open_sockets.monitor()
Metrics.injectMetricsRoute(app)
app.use(Metrics.http.monitor(logger))
Metrics.leaked_sockets.monitor(logger)
// We may have fairly large JSON bodies when receiving large Changes. Clients
// may have to handle 413 status codes and try creating files instead of sending
// text content in changes.
app.use(bodyParser.json({ limit: '6MB' }))
app.use(
bodyParser.urlencoded({
extended: false,
})
)
security.setupSSL(app)
security.setupBasicHttpAuthForSwaggerDocs(app)
const HTTP_REQUEST_TIMEOUT = parseInt(config.get('httpRequestTimeout'), 10)
app.use(function (req, res, next) {
res.setTimeout(HTTP_REQUEST_TIMEOUT)
next()
})
app.get('/', function (req, res) {
res.send('')
})
app.get('/status', healthChecks.status)
app.get('/health_check', healthChecks.healthCheck)
function setupSwagger() {
return new BPromise(function (resolve) {
swaggerTools.initializeMiddleware(swaggerDoc, function (middleware) {
app.use(middleware.swaggerMetadata())
app.use(middleware.swaggerSecurity(security.getSwaggerHandlers()))
app.use(middleware.swaggerValidator())
app.use(
middleware.swaggerRouter({
controllers: path.join(__dirname, 'api/controllers'),
useStubs: app.get('env') === 'development',
})
)
app.use(middleware.swaggerUi())
resolve()
})
})
}
function setupErrorHandling() {
app.use(function (req, res, next) {
const err = new Error('Not Found')
err.status = HTTPStatus.NOT_FOUND
return next(err)
})
// Handle Swagger errors.
app.use(function (err, req, res, next) {
const projectId = req.swagger?.params?.project_id?.value
if (res.headersSent) {
return next(err)
}
if (err.code === 'SCHEMA_VALIDATION_FAILED') {
logger.error({ err, projectId }, err.message)
return res.status(HTTPStatus.UNPROCESSABLE_ENTITY).json(err.results)
}
if (err.code === 'INVALID_TYPE' || err.code === 'PATTERN') {
logger.error({ err, projectId }, err.message)
return res.status(HTTPStatus.UNPROCESSABLE_ENTITY).json({
message: 'invalid type: ' + err.paramName,
})
}
if (err.code === 'ENUM_MISMATCH') {
return res.status(HTTPStatus.UNPROCESSABLE_ENTITY).json({
message: 'invalid enum value: ' + err.paramName,
})
}
if (err.code === 'REQUIRED') {
return res.status(HTTPStatus.UNPROCESSABLE_ENTITY).json({
message: err.message,
})
}
next(err)
})
app.use(function (err, req, res, next) {
const projectId = req.swagger?.params?.project_id?.value
logger.error({ err, projectId }, err.message)
if (res.headersSent) {
return next(err)
}
// Handle errors that specify a statusCode. Some come from our code. Some
// bubble up from AWS SDK, but they sometimes have the statusCode set to
// 200, notably some InternalErrors and TimeoutErrors, so we have to guard
// against that. We also check `status`, but `statusCode` is preferred.
const statusCode = err.statusCode || err.status
if (statusCode && statusCode >= 400 && statusCode < 600) {
res.status(statusCode)
} else {
res.status(HTTPStatus.INTERNAL_SERVER_ERROR)
}
const sendErrorToClient = app.get('env') === 'development'
res.json({
message: err.message,
error: sendErrorToClient ? err : {},
})
})
}
app.setup = async function appSetup() {
await mongodb.client.connect()
logger.info('Connected to MongoDB')
await loadGlobalBlobs()
logger.info('Global blobs loaded')
app.use(helmet())
await setupSwagger()
setupErrorHandling()
}
async function startApp() {
await app.setup()
const port = parseInt(process.env.PORT, 10) || 3100
app.listen(port, err => {
if (err) {
console.error(err)
process.exit(1)
}
Metrics.event_loop.monitor(logger)
Metrics.memory.monitor(logger)
})
}
// Run this if we're called directly
if (!module.parent) {
startApp().catch(err => {
console.error(err)
process.exit(1)
})
}


@@ -0,0 +1,81 @@
// @ts-check
// Metrics must be initialized before importing anything else
import '@overleaf/metrics/initialize.js'
import http from 'node:http'
import { fileURLToPath } from 'node:url'
import { promisify } from 'node:util'
import express from 'express'
import logger from '@overleaf/logger'
import Metrics from '@overleaf/metrics'
import { hasValidBasicAuthCredentials } from './api/app/security.js'
import {
deleteProjectBackupCb,
healthCheck,
healthCheckCb,
NotReadyToDelete,
} from './storage/lib/backupDeletion.mjs'
import { mongodb } from './storage/index.js'
const app = express()
logger.initialize('history-v1-backup-deletion')
Metrics.open_sockets.monitor()
Metrics.injectMetricsRoute(app)
app.use(Metrics.http.monitor(logger))
Metrics.leaked_sockets.monitor(logger)
Metrics.event_loop.monitor(logger)
Metrics.memory.monitor(logger)
function basicAuth(req, res, next) {
if (hasValidBasicAuthCredentials(req)) return next()
res.setHeader('WWW-Authenticate', 'Basic realm="Application"')
res.sendStatus(401)
}
app.delete('/project/:projectId/backup', basicAuth, (req, res, next) => {
deleteProjectBackupCb(req.params.projectId, err => {
if (err) {
return next(err)
}
res.sendStatus(204)
})
})
app.get('/status', (req, res) => {
res.send('history-v1-backup-deletion is up')
})
app.get('/health_check', (req, res, next) => {
healthCheckCb(err => {
if (err) return next(err)
res.sendStatus(200)
})
})
app.use((err, req, res, next) => {
req.logger.addFields({ err })
if (err instanceof NotReadyToDelete) {
req.logger.setLevel('warn')
return res.status(422).send(err.message)
}
req.logger.setLevel('error')
next(err)
})
/**
* @param {number} port
* @return {Promise<http.Server>}
*/
export async function startApp(port) {
await mongodb.client.connect()
await healthCheck()
const server = http.createServer(app)
await promisify(server.listen.bind(server, port))()
return server
}
// Run this if we're called directly
if (process.argv[1] === fileURLToPath(import.meta.url)) {
const PORT = parseInt(process.env.PORT || '3101', 10)
await startApp(PORT)
}


@@ -0,0 +1,117 @@
// @ts-check
// Metrics must be initialized before importing anything else
import '@overleaf/metrics/initialize.js'
import http from 'node:http'
import { fileURLToPath } from 'node:url'
import { promisify } from 'node:util'
import { setTimeout } from 'node:timers/promises'
import express from 'express'
import logger from '@overleaf/logger'
import Metrics from '@overleaf/metrics'
import { healthCheck } from './backupVerifier/healthCheck.mjs'
import {
BackupCorruptedError,
verifyBlob,
} from './storage/lib/backupVerifier.mjs'
import { mongodb } from './storage/index.js'
import { expressify } from '@overleaf/promise-utils'
import { Blob } from 'overleaf-editor-core'
import { loadGlobalBlobs } from './storage/lib/blob_store/index.js'
import { EventEmitter } from 'node:events'
import {
loopRandomProjects,
setWriteMetrics,
} from './backupVerifier/ProjectVerifier.mjs'
const app = express()
logger.initialize('history-v1-backup-verifier')
Metrics.open_sockets.monitor()
Metrics.injectMetricsRoute(app)
app.use(Metrics.http.monitor(logger))
Metrics.leaked_sockets.monitor(logger)
Metrics.event_loop.monitor(logger)
Metrics.memory.monitor(logger)
app.get(
'/history/:historyId/blob/:hash/verify',
expressify(async (req, res) => {
const { historyId, hash } = req.params
try {
await verifyBlob(historyId, hash)
res.sendStatus(200)
} catch (err) {
logger.warn({ err, historyId, hash }, 'manual verify blob failed')
if (err instanceof Blob.NotFoundError) {
res.status(404).send(err.message)
} else if (err instanceof BackupCorruptedError) {
res.status(422).send(err.message)
} else {
throw err
}
}
})
)
app.get('/status', (req, res) => {
res.send('history-v1-backup-verifier is up')
})
app.get(
'/health_check',
expressify(async (req, res) => {
await healthCheck()
res.sendStatus(200)
})
)
app.use((err, req, res, next) => {
req.logger.addFields({ err })
req.logger.setLevel('error')
next(err)
})
const shutdownEmitter = new EventEmitter()
shutdownEmitter.once('shutdown', async code => {
logger.info({ code }, 'shutting down')
await mongodb.client.close()
await setTimeout(100)
process.exit(code)
})
process.on('SIGTERM', () => {
shutdownEmitter.emit('shutdown', 0)
})
process.on('SIGINT', () => {
shutdownEmitter.emit('shutdown', 0)
})
/**
* @param {number} port
* @param {boolean} enableVerificationLoop
* @return {Promise<http.Server>}
*/
export async function startApp(port, enableVerificationLoop = true) {
await mongodb.client.connect()
await loadGlobalBlobs()
await healthCheck()
const server = http.createServer(app)
await promisify(server.listen.bind(server, port))()
enableVerificationLoop && loopRandomProjects(shutdownEmitter)
return server
}
setWriteMetrics(true)
// Run this if we're called directly
if (process.argv[1] === fileURLToPath(import.meta.url)) {
const PORT = parseInt(process.env.PORT || '3102', 10)
try {
await startApp(PORT)
} catch (error) {
shutdownEmitter.emit('shutdown', 1)
logger.error({ error }, 'error starting app')
}
}


@@ -0,0 +1,70 @@
// @ts-check
// Metrics must be initialized before importing anything else
import '@overleaf/metrics/initialize.js'
import http from 'node:http'
import { fileURLToPath } from 'node:url'
import { promisify } from 'node:util'
import express from 'express'
import logger from '@overleaf/logger'
import Metrics from '@overleaf/metrics'
import { expressify } from '@overleaf/promise-utils'
import { drainQueue, healthCheck } from './storage/scripts/backup_worker.mjs'
const app = express()
logger.initialize('history-v1-backup-worker')
Metrics.open_sockets.monitor()
Metrics.injectMetricsRoute(app)
app.use(Metrics.http.monitor(logger))
Metrics.leaked_sockets.monitor(logger)
Metrics.event_loop.monitor(logger)
Metrics.memory.monitor(logger)
app.get('/status', (req, res) => {
res.send('history-v1-backup-worker is up')
})
app.get(
'/health_check',
expressify(async (req, res) => {
await healthCheck()
res.sendStatus(200)
})
)
app.use((err, req, res, next) => {
req.logger.addFields({ err })
req.logger.setLevel('error')
next(err)
})
async function triggerGracefulShutdown(server, signal) {
logger.info({ signal }, 'graceful shutdown: started shutdown sequence')
await drainQueue()
server.close(function () {
logger.info({ signal }, 'graceful shutdown: closed server')
setTimeout(() => {
process.exit(0)
}, 1000)
})
}
/**
* @param {number} port
* @return {Promise<http.Server>}
*/
export async function startApp(port) {
await healthCheck()
const server = http.createServer(app)
await promisify(server.listen.bind(server, port))()
const signals = ['SIGINT', 'SIGTERM']
signals.forEach(signal => {
process.on(signal, () => triggerGracefulShutdown(server, signal))
})
return server
}
// Run this if we're called directly
if (process.argv[1] === fileURLToPath(import.meta.url)) {
const PORT = parseInt(process.env.PORT || '3103', 10)
await startApp(PORT)
}


@@ -0,0 +1,33 @@
import Metrics from '@overleaf/metrics'
import { objectIdFromDate } from './utils.mjs'
import { db } from '../storage/lib/mongodb.js'
const projectsCollection = db.collection('projects')
/**
*
* @param {Date} beforeTime
* @return {Promise<void>}
*/
export async function measurePendingChangesBeforeTime(beforeTime) {
const pendingChangeCount = await projectsCollection.countDocuments({
'overleaf.backup.pendingChangeAt': {
$lt: beforeTime,
},
})
Metrics.gauge('backup_verification_pending_changes', pendingChangeCount)
}
/**
*
* @param {Date} graceTime
* @return {Promise<void>}
*/
export async function measureNeverBackedUpProjects(graceTime) {
const neverBackedUpCount = await projectsCollection.countDocuments({
'overleaf.backup.lastBackedUpVersion': null,
_id: { $lt: objectIdFromDate(graceTime) },
})
Metrics.gauge('backup_verification_never_backed_up', neverBackedUpCount)
}
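For illustration (import paths assumed), both gauges can be refreshed together from a periodic job using the RPO helpers defined in utils.mjs, mirroring how the health check later in this commit calls them with a factor of two:

import {
  measurePendingChangesBeforeTime,
  measureNeverBackedUpProjects,
} from './ProjectMetrics.mjs'
import { getEndDateForRPO } from './utils.mjs'

// Gauge projects with changes pending for more than twice the RPO, and projects
// created more than twice the RPO ago that were never backed up at all.
await measurePendingChangesBeforeTime(getEndDateForRPO(2))
await measureNeverBackedUpProjects(getEndDateForRPO(2))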

View File

@@ -0,0 +1,79 @@
// @ts-check
import { objectIdFromDate } from './utils.mjs'
import { db } from '../storage/lib/mongodb.js'
import config from 'config'
const projectsCollection = db.collection('projects')
const HAS_PROJECTS_WITHOUT_HISTORY =
config.get('hasProjectsWithoutHistory') === 'true'
/**
* @param {Date} start
* @param {Date} end
* @param {number} N
* @yields {string}
*/
export async function* getProjectsCreatedInDateRangeCursor(start, end, N) {
yield* getSampleProjectsCursor(N, [
{
$match: {
_id: {
$gt: objectIdFromDate(start),
$lte: objectIdFromDate(end),
},
},
},
])
}
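/**
 * @param {Date} start
 * @param {Date} end
 * @param {number} N
 * @yields {string}
 */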
export async function* getProjectsUpdatedInDateRangeCursor(start, end, N) {
yield* getSampleProjectsCursor(N, [
{
$match: {
'overleaf.history.updatedAt': {
$gt: start,
$lte: end,
},
},
},
])
}
/**
* @typedef {import('mongodb').Document} Document
*/
/**
*
* @generator
* @param {number} N
* @param {Array<Document>} preSampleAggregationStages
* @yields {string}
*/
export async function* getSampleProjectsCursor(
N,
preSampleAggregationStages = []
) {
const cursor = projectsCollection.aggregate([
...preSampleAggregationStages,
{ $sample: { size: N } },
{ $project: { 'overleaf.history.id': 1 } },
])
let validProjects = 0
let hasInvalidProject = false
for await (const project of cursor) {
if (HAS_PROJECTS_WITHOUT_HISTORY && !project.overleaf?.history?.id) {
hasInvalidProject = true
continue
}
validProjects++
yield project.overleaf.history.id.toString()
}
if (validProjects === 0 && hasInvalidProject) {
yield* getSampleProjectsCursor(N, preSampleAggregationStages)
}
}
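A consumption sketch (import path assumed): each cursor is an async generator yielding history ids as strings, so a caller simply drains it with for await:

import { getProjectsCreatedInDateRangeCursor } from './ProjectSampler.mjs'

const start = new Date('2025-01-01')
const end = new Date('2025-02-01')
// Sample up to 10 projects created in January 2025 and print their history ids.
for await (const historyId of getProjectsCreatedInDateRangeCursor(start, end, 10)) {
  console.log(historyId)
}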

View File

@@ -0,0 +1,320 @@
// @ts-check
import { verifyProjectWithErrorContext } from '../storage/lib/backupVerifier.mjs'
import { promiseMapSettledWithLimit } from '@overleaf/promise-utils'
import logger from '@overleaf/logger'
import metrics from '@overleaf/metrics'
import {
getSampleProjectsCursor,
getProjectsCreatedInDateRangeCursor,
getProjectsUpdatedInDateRangeCursor,
} from './ProjectSampler.mjs'
import OError from '@overleaf/o-error'
import { setTimeout } from 'node:timers/promises'
const MS_PER_30_DAYS = 30 * 24 * 60 * 60 * 1000
const failureCounter = new metrics.prom.Counter({
name: 'backup_project_verification_failed',
help: 'Number of projects that failed verification',
labelNames: ['name'],
})
const successCounter = new metrics.prom.Counter({
name: 'backup_project_verification_succeeded',
help: 'Number of projects that succeeded verification',
})
let WRITE_METRICS = false
/**
* @typedef {import('node:events').EventEmitter} EventEmitter
*/
/**
* Allows writing metrics to be enabled or disabled.
* @param {Boolean} writeMetrics
*/
export function setWriteMetrics(writeMetrics) {
WRITE_METRICS = writeMetrics
}
/**
*
* @param {Error|unknown} error
* @param {string} historyId
*/
function handleVerificationError(error, historyId) {
const name = error instanceof Error ? error.name : 'UnknownError'
logger.error({ historyId, error, name }, 'error verifying project backup')
WRITE_METRICS && failureCounter.inc({ name })
return name
}
/**
*
* @param {Date} startDate
* @param {Date} endDate
* @param {number} interval
* @returns {Array<VerificationJobSpecification>}
*/
function splitJobs(startDate, endDate, interval) {
/** @type {Array<VerificationJobSpecification>} */
const jobs = []
while (startDate < endDate) {
const nextStart = new Date(
Math.min(startDate.getTime() + interval, endDate.getTime())
)
jobs.push({ startDate, endDate: nextStart })
startDate = nextStart
}
return jobs
}
/**
*
* @param {AsyncGenerator<string>} historyIdCursor
* @param {EventEmitter} [eventEmitter]
* @param {number} [delay] - Allows a delay between each verification
* @return {Promise<{verified: number, total: number, errorTypes: *[], hasFailure: boolean}>}
*/
async function verifyProjectsFromCursor(
historyIdCursor,
eventEmitter,
delay = 0
) {
const errorTypes = []
let verified = 0
let total = 0
let receivedShutdownSignal = false
if (eventEmitter) {
eventEmitter.once('shutdown', () => {
receivedShutdownSignal = true
})
}
for await (const historyId of historyIdCursor) {
if (receivedShutdownSignal) {
break
}
total++
try {
await verifyProjectWithErrorContext(historyId)
logger.debug({ historyId }, 'verified project backup successfully')
WRITE_METRICS && successCounter.inc()
verified++
} catch (error) {
const errorType = handleVerificationError(error, historyId)
errorTypes.push(errorType)
}
if (delay > 0) {
await setTimeout(delay)
}
}
return {
verified,
total,
errorTypes,
hasFailure: errorTypes.length > 0,
}
}
/**
*
* @param {number} nProjectsToSample
* @param {EventEmitter} [signal]
* @param {number} [delay]
* @return {Promise<VerificationJobStatus>}
*/
export async function verifyRandomProjectSample(
nProjectsToSample,
signal,
delay = 0
) {
const historyIds = await getSampleProjectsCursor(nProjectsToSample)
return await verifyProjectsFromCursor(historyIds, signal, delay)
}
/**
 * Samples projects created in the specified date range and verifies them.
*
* @param {Date} startDate
* @param {Date} endDate
* @param {number} projectsPerRange
* @param {EventEmitter} [signal]
* @return {Promise<VerificationJobStatus>}
*/
async function verifyRange(startDate, endDate, projectsPerRange, signal) {
logger.info({ startDate, endDate }, 'verifying range')
const results = await verifyProjectsFromCursor(
getProjectsCreatedInDateRangeCursor(startDate, endDate, projectsPerRange),
signal
)
if (results.total === 0) {
logger.debug(
{ start: startDate, end: endDate },
'No projects found in range'
)
}
const jobStatus = {
...results,
startDate,
endDate,
}
logger.debug(
{ ...jobStatus, errorTypes: Array.from(new Set(jobStatus.errorTypes)) },
'Verified range'
)
return jobStatus
}
/**
* @typedef {Object} VerificationJobSpecification
* @property {Date} startDate
* @property {Date} endDate
*/
/**
* @typedef {import('./types.d.ts').VerificationJobStatus} VerificationJobStatus
*/
/**
* @typedef {Object} VerifyDateRangeOptions
* @property {Date} startDate
* @property {Date} endDate
* @property {number} [interval]
* @property {number} [projectsPerRange]
* @property {number} [concurrency]
* @property {EventEmitter} [signal]
*/
/**
*
* @param {VerifyDateRangeOptions} options
* @return {Promise<VerificationJobStatus>}
*/
export async function verifyProjectsCreatedInDateRange({
concurrency = 0,
projectsPerRange = 10,
startDate,
endDate,
interval = MS_PER_30_DAYS,
signal,
}) {
const jobs = splitJobs(startDate, endDate, interval)
if (jobs.length === 0) {
throw new OError('Time range could not be split into jobs', {
start: startDate,
end: endDate,
interval,
})
}
const settlements = await promiseMapSettledWithLimit(
concurrency,
jobs,
({ startDate, endDate }) =>
verifyRange(startDate, endDate, projectsPerRange, signal)
)
return settlements.reduce(
/**
*
* @param {VerificationJobStatus} acc
* @param settlement
* @return {VerificationJobStatus}
*/
(acc, settlement) => {
if (settlement.status !== 'rejected') {
if (settlement.value.hasFailure) {
acc.hasFailure = true
}
acc.total += settlement.value.total
acc.verified += settlement.value.verified
acc.errorTypes = acc.errorTypes.concat(settlement.value.errorTypes)
} else {
logger.error({ ...settlement.reason }, 'Error processing range')
}
return acc
},
/** @type {VerificationJobStatus} */
{
startDate,
endDate,
verified: 0,
total: 0,
hasFailure: false,
errorTypes: [],
}
)
}
/**
 * Verifies that the backups of projects that have recently gone out of RPO have been updated.
*
* @param {Date} startDate
* @param {Date} endDate
* @param {number} nProjects
* @param {EventEmitter} [signal]
* @return {Promise<VerificationJobStatus>}
*/
export async function verifyProjectsUpdatedInDateRange(
startDate,
endDate,
nProjects,
signal
) {
logger.debug(
{ startDate, endDate, nProjects },
'Sampling projects updated in date range'
)
const results = await verifyProjectsFromCursor(
getProjectsUpdatedInDateRangeCursor(startDate, endDate, nProjects),
signal
)
if (results.total === 0) {
logger.debug(
{ start: startDate, end: endDate },
'No projects updated recently'
)
}
const jobStatus = {
...results,
startDate,
endDate,
}
logger.debug(
{ ...jobStatus, errorTypes: Array.from(new Set(jobStatus.errorTypes)) },
'Verified recently updated projects'
)
return jobStatus
}
/**
*
* @param {EventEmitter} signal
* @return {void}
*/
export function loopRandomProjects(signal) {
let shutdown = false
signal.on('shutdown', function () {
shutdown = true
})
async function loop() {
do {
try {
const result = await verifyRandomProjectSample(100, signal, 2_000)
logger.debug({ result }, 'verified random project sample')
} catch (error) {
logger.error({ error }, 'error verifying random project sample')
}
// eslint-disable-next-line no-unmodified-loop-condition
} while (!shutdown)
}
loop()
}
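As an illustrative one-off run (import path and option values assumed), a date range can be verified by splitting it into 30-day jobs of 10 sampled projects each:

import { verifyProjectsCreatedInDateRange } from './backupVerifier.mjs' // path assumed

const status = await verifyProjectsCreatedInDateRange({
  startDate: new Date('2025-01-01'),
  endDate: new Date('2025-03-01'),
  projectsPerRange: 10,
  concurrency: 1,
})
console.log(
  `${status.verified}/${status.total} verified`,
  'error types:',
  status.errorTypes
)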

View File

@@ -0,0 +1,32 @@
import config from 'config'
import { verifyProjectWithErrorContext } from '../storage/lib/backupVerifier.mjs'
import {
measureNeverBackedUpProjects,
measurePendingChangesBeforeTime,
} from './ProjectMetrics.mjs'
import { getEndDateForRPO, RPO } from './utils.mjs'
/** @type {Array<string>} */
const HEALTH_CHECK_PROJECTS = JSON.parse(config.get('healthCheckProjects'))
export async function healthCheck() {
if (!Array.isArray(HEALTH_CHECK_PROJECTS)) {
throw new Error('expected healthCheckProjects to be an array')
}
if (HEALTH_CHECK_PROJECTS.length !== 2) {
throw new Error('expected 2 healthCheckProjects')
}
if (!HEALTH_CHECK_PROJECTS.some(id => id.length === 24)) {
throw new Error('expected mongo id in healthCheckProjects')
}
if (!HEALTH_CHECK_PROJECTS.some(id => id.length < 24)) {
throw new Error('expected postgres id in healthCheckProjects')
}
for (const historyId of HEALTH_CHECK_PROJECTS) {
await verifyProjectWithErrorContext(historyId)
}
await measurePendingChangesBeforeTime(getEndDateForRPO(2))
await measureNeverBackedUpProjects(getEndDateForRPO(2))
}
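For reference, the test configuration later in this commit sets healthCheckProjects to a JSON-encoded pair that satisfies these checks: one short Postgres-style id and one 24-character Mongo-style id.

// From the test config later in this commit:
//   "healthCheckProjects": "[\"42\",\"000000000000000000000042\"]"
// JSON.parse yields ['42', '000000000000000000000042']: a Postgres id plus a Mongo id.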

View File

@@ -0,0 +1,8 @@
export type VerificationJobStatus = {
verified: number
total: number
startDate?: Date
endDate?: Date
hasFailure: boolean
errorTypes: Array<string>
}

View File

@@ -0,0 +1,35 @@
import { ObjectId } from 'mongodb'
import config from 'config'
export const RPO = parseInt(config.get('backupRPOInMS'), 10)
/**
* @param {Date} time
* @return {ObjectId}
*/
export function objectIdFromDate(time) {
return ObjectId.createFromTime(time.getTime() / 1000)
}
/**
* @param {number} [factor] - Multiply RPO by this factor, default is 1
* @return {Date}
*/
export function getEndDateForRPO(factor = 1) {
return new Date(Date.now() - RPO * factor)
}
/**
 * Creates a startDate, endDate pair covering a period of time that ends at the RPO horizon
*
* @param {number} offset - How many seconds we should check
* @return {{endDate: Date, startDate: Date}}
*/
export function getDatesBeforeRPO(offset) {
const now = new Date()
const endDate = new Date(now.getTime() - RPO)
return {
endDate,
startDate: new Date(endDate.getTime() - offset * 1000),
}
}
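A worked example (values assumed): with backupRPOInMS set to 3600000 — one hour, as in the default configuration later in this commit — and the clock at 12:00:00Z, getDatesBeforeRPO(300) returns the five-minute window that ends at the RPO horizon.

import { getDatesBeforeRPO, getEndDateForRPO, objectIdFromDate } from './utils.mjs'

// With RPO = 1 hour and now = 12:00:00Z:
//   endDate   = 11:00:00Z (one RPO ago)
//   startDate = 10:55:00Z (300 seconds earlier)
const { startDate, endDate } = getDatesBeforeRPO(300)

// ObjectIds encode a creation timestamp, so a date can become an _id boundary:
const boundary = objectIdFromDate(getEndDateForRPO())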

View File

@@ -0,0 +1,82 @@
const crypto = require('node:crypto')
const benny = require('benny')
const { Blob } = require('overleaf-editor-core')
const mongoBackend = require('../storage/lib/blob_store/mongo')
const postgresBackend = require('../storage/lib/blob_store/postgres')
const cleanup = require('../test/acceptance/js/storage/support/cleanup')
const MONGO_PROJECT_ID = '637386deb4ce3c62acd3848e'
const POSTGRES_PROJECT_ID = '123'
async function run() {
for (const blobCount of [1, 10, 100, 1000, 10000, 100000, 500000]) {
await cleanup.everything()
const blobs = createBlobs(blobCount)
await insertBlobs(blobs)
const randomHashes = getRandomHashes(blobs, 100)
await benny.suite(
`Read a blob in a project with ${blobCount} blobs`,
benny.add('Mongo backend', async () => {
await mongoBackend.findBlob(MONGO_PROJECT_ID, randomHashes[0])
}),
benny.add('Postgres backend', async () => {
await postgresBackend.findBlob(POSTGRES_PROJECT_ID, randomHashes[0])
}),
benny.cycle(),
benny.complete()
)
await benny.suite(
`Read 100 blobs in a project with ${blobCount} blobs`,
benny.add('Mongo backend', async () => {
await mongoBackend.findBlobs(MONGO_PROJECT_ID, randomHashes)
}),
benny.add('Postgres backend', async () => {
await postgresBackend.findBlobs(POSTGRES_PROJECT_ID, randomHashes)
}),
benny.cycle(),
benny.complete()
)
await benny.suite(
`Insert a blob in a project with ${blobCount} blobs`,
benny.add('Mongo backend', async () => {
const [newBlob] = createBlobs(1)
await mongoBackend.insertBlob(MONGO_PROJECT_ID, newBlob)
}),
benny.add('Postgres backend', async () => {
const [newBlob] = createBlobs(1)
await postgresBackend.insertBlob(POSTGRES_PROJECT_ID, newBlob)
}),
benny.cycle(),
benny.complete()
)
}
}
function createBlobs(blobCount) {
const blobs = []
for (let i = 0; i < blobCount; i++) {
const hash = crypto.randomBytes(20).toString('hex')
blobs.push(new Blob(hash, 42, 42))
}
return blobs
}
async function insertBlobs(blobs) {
for (const blob of blobs) {
await Promise.all([
mongoBackend.insertBlob(MONGO_PROJECT_ID, blob),
postgresBackend.insertBlob(POSTGRES_PROJECT_ID, blob),
])
}
}
function getRandomHashes(blobs, count) {
const hashes = []
for (let i = 0; i < count; i++) {
const index = Math.floor(Math.random() * blobs.length)
hashes.push(blobs[index].getHash())
}
return hashes
}
module.exports = run

View File

@@ -0,0 +1,17 @@
const testSetup = require('../test/setup')
const blobStoreSuite = require('./blob_store')
async function main() {
await testSetup.setupPostgresDatabase()
await testSetup.createGcsBuckets()
await blobStoreSuite()
}
main()
.then(() => {
process.exit(0)
})
.catch(err => {
console.error(err)
process.exit(1)
})

View File

@@ -0,0 +1,10 @@
history-v1
--dependencies=postgres,gcs,mongo,redis,s3
--docker-repos=us-east1-docker.pkg.dev/overleaf-ops/ol-docker
--env-add=
--env-pass-through=
--esmock-loader=False
--node-version=20.18.2
--public-repo=False
--script-version=4.7.0
--tsconfig-extra-includes=backup-deletion-app.mjs,backup-verifier-app.mjs,backup-worker-app.mjs,api/**/*,migrations/**/*,storage/**/*

View File

@@ -0,0 +1,104 @@
{
"databaseUrl": "HISTORY_CONNECTION_STRING",
"databaseUrlReadOnly": "HISTORY_FOLLOWER_CONNECTION_STRING",
"herokuDatabaseUrl": "DATABASE_URL",
"databasePoolMin": "DATABASE_POOL_MIN",
"databasePoolMax": "DATABASE_POOL_MAX",
"persistor": {
"backend": "PERSISTOR_BACKEND",
"s3": {
"key": "AWS_ACCESS_KEY_ID",
"secret": "AWS_SECRET_ACCESS_KEY",
"endpoint": "AWS_S3_ENDPOINT",
"pathStyle": "AWS_S3_PATH_STYLE",
"maxRetries": "S3_MAX_RETRIES",
"httpOptions": {
"timeout": "S3_TIMEOUT"
}
},
"gcs": {
"deletedBucketSuffix": "GCS_DELETED_BUCKET_SUFFIX",
"unlockBeforeDelete": "GCS_UNLOCK_BEFORE_DELETE",
"endpoint": {
"apiEndpoint": "GCS_API_ENDPOINT",
"projectId": "GCS_PROJECT_ID"
},
"retryOptions": {
"maxRetries": "GCS_MAX_RETRIES",
"idempotencyStrategy": "GCS_IDEMPOTENCY_STRATEGY"
}
},
"fallback": {
"backend": "PERSISTOR_FALLBACK_BACKEND",
"buckets": "PERSISTOR_BUCKET_MAPPING"
}
},
"backupPersistor": {
"keyEncryptionKeys": "BACKUP_KEY_ENCRYPTION_KEYS",
"s3SSEC": {
"key": "AWS_ACCESS_KEY_ID",
"secret": "AWS_SECRET_ACCESS_KEY",
"endpoint": "AWS_S3_ENDPOINT",
"pathStyle": "AWS_S3_PATH_STYLE",
"maxRetries": "BACKUP_S3_MAX_RETRIES",
"httpOptions": {
"timeout": "BACKUP_S3_TIMEOUT"
}
}
},
"blobStore": {
"globalBucket": "OVERLEAF_EDITOR_BLOBS_BUCKET",
"projectBucket": "OVERLEAF_EDITOR_PROJECT_BLOBS_BUCKET"
},
"chunkStore": {
"historyStoreConcurrency": "HISTORY_STORE_CONCURRENCY",
"bucket": "OVERLEAF_EDITOR_CHUNKS_BUCKET"
},
"zipStore": {
"bucket": "OVERLEAF_EDITOR_ZIPS_BUCKET",
"zipTimeoutMs": "ZIP_STORE_ZIP_TIMEOUT_MS"
},
"backupStore": {
"chunksBucket":"BACKUP_OVERLEAF_EDITOR_CHUNKS_BUCKET",
"deksBucket":"BACKUP_OVERLEAF_EDITOR_DEKS_BUCKET",
"globalBlobsBucket":"BACKUP_OVERLEAF_EDITOR_GLOBAL_BLOBS_BUCKET",
"projectBlobsBucket":"BACKUP_OVERLEAF_EDITOR_PROJECT_BLOBS_BUCKET"
},
"healthCheckBlobs": "HEALTH_CHECK_BLOBS",
"healthCheckProjects": "HEALTH_CHECK_PROJECTS",
"backupRPOInMS": "BACKUP_RPO_IN_MS",
"minSoftDeletionPeriodDays": "MIN_SOFT_DELETION_PERIOD_DAYS",
"mongo": {
"uri": "MONGO_CONNECTION_STRING"
},
"basicHttpAuth": {
"password": "STAGING_PASSWORD",
"oldPassword": "BASIC_HTTP_AUTH_OLD_PASSWORD"
},
"jwtAuth": {
"key": "OT_JWT_AUTH_KEY",
"oldKey": "OT_JWT_AUTH_OLD_KEY",
"algorithm": "OT_JWT_AUTH_ALG"
},
"clusterWorkers": "CLUSTER_WORKERS",
"maxFileUploadSize": "MAX_FILE_UPLOAD_SIZE",
"httpsOnly": "HTTPS_ONLY",
"httpRequestTimeout": "HTTP_REQUEST_TIMEOUT",
"redis": {
"queue": {
"host": "QUEUES_REDIS_HOST",
"password": "QUEUES_REDIS_PASSWORD",
"port": "QUEUES_REDIS_PORT"
},
"history": {
"host": "HISTORY_REDIS_HOST",
"password": "HISTORY_REDIS_PASSWORD",
"port": "HISTORY_REDIS_PORT"
},
"lock": {
"host": "REDIS_HOST",
"password": "REDIS_PASSWORD",
"port": "REDIS_PORT"
}
}
}

View File

@@ -0,0 +1,43 @@
{
"persistor": {
"backend": "s3",
"s3": {
"signedUrlExpiryInMs": "1800000",
"maxRetries": "1",
"httpOptions": {
"timeout": "8000"
}
},
"gcs": {
"signedUrlExpiryInMs": "1800000",
"deleteConcurrency": "50"
}
},
"backupPersistor": {
"backend": "s3SSEC",
"s3SSEC": {
"maxRetries": "1",
"pathStyle": false,
"httpOptions": {
"timeout": "120000"
}
}
},
"backupRPOInMS": "3600000",
"chunkStore": {
"historyStoreConcurrency": "4"
},
"zipStore": {
"zipTimeoutMs": "360000"
},
"hasProjectsWithoutHistory": false,
"minSoftDeletionPeriodDays": "90",
"maxDeleteKeys": "1000",
"useDeleteObjects": "true",
"clusterWorkers": "1",
"maxFileUploadSize": "52428800",
"databasePoolMin": "2",
"databasePoolMax": "10",
"httpsOnly": "false",
"httpRequestTimeout": "300000"
}

View File

@@ -0,0 +1,49 @@
{
"databaseUrl": "postgres://postgres:postgres@postgres/write_latex_dev",
"persistor": {
"s3": {
"endpoint": "http://s3:8080",
"pathStyle": "true"
},
"gcs": {
"unsignedUrls": "true",
"endpoint": {
"apiEndpoint": "http://fake-gcs:9090",
"projectId": "fake"
}
}
},
"blobStore": {
"globalBucket": "overleaf-development-blobs",
"projectBucket": "overleaf-development-project-blobs"
},
"chunkStore": {
"bucket": "overleaf-development-chunks"
},
"zipStore": {
"bucket": "overleaf-development-zips"
},
"backupStore": {
"chunksBucket":"overleaf-development-history-chunks",
"deksBucket":"overleaf-development-history-deks",
"globalBlobsBucket":"overleaf-development-history-global-blobs",
"projectBlobsBucket":"overleaf-development-history-project-blobs"
},
"backupPersistor": {
"keyEncryptionKeys": "[{\"key\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\",\"salt\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\"}]",
"s3SSEC": {
"ca": "[\"/certs/public.crt\"]"
}
},
"useDeleteObjects": "false",
"mongo": {
"uri": "mongodb://mongo:27017/sharelatex"
},
"basicHttpAuth": {
"password": "password"
},
"jwtAuth": {
"key": "secureKey",
"algorithm": "HS256"
}
}

View File

@@ -0,0 +1,5 @@
{
"backupPersistor": {
"tieringStorageClass": "INTELLIGENT_TIERING"
}
}

View File

@@ -0,0 +1,53 @@
{
"databaseUrl": "postgres://overleaf:overleaf@postgres/overleaf-history-v1-test",
"databaseUrlReadOnly": "postgres://read_only:password@postgres/overleaf-history-v1-test",
"persistor": {
"backend": "gcs",
"gcs": {
"unsignedUrls": "true",
"endpoint": {
"apiEndpoint": "http://gcs:9090",
"projectId": "fake"
}
}
},
"blobStore": {
"globalBucket": "overleaf-test-blobs",
"projectBucket": "overleaf-test-project-blobs"
},
"chunkStore": {
"bucket": "overleaf-test-chunks"
},
"zipStore": {
"bucket": "overleaf-test-zips"
},
"backupStore": {
"chunksBucket":"overleaf-test-history-chunks",
"deksBucket":"overleaf-test-history-deks",
"globalBlobsBucket":"overleaf-test-history-global-blobs",
"projectBlobsBucket":"overleaf-test-history-project-blobs"
},
"backupPersistor": {
"keyEncryptionKeys": "[{\"key\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\",\"salt\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\"}]",
"s3SSEC": {
"ca": "[\"/certs/public.crt\"]"
},
"tieringStorageClass": "REDUCED_REDUNDANCY"
},
"healthCheckBlobs": "[\"42/f70d7bba4ae1f07682e0358bd7a2068094fc023b\",\"000000000000000000000042/98d5521fe746bc2d11761edab5d0829bee286009\"]",
"healthCheckProjects": "[\"42\",\"000000000000000000000042\"]",
"backupRPOInMS": "360000",
"maxDeleteKeys": "3",
"useDeleteObjects": "false",
"mongo": {
"uri": "mongodb://mongo:27017/sharelatex"
},
"basicHttpAuth": {
"password": "test"
},
"jwtAuth": {
"key": "testtest",
"algorithm": "HS256"
},
"maxFileUploadSize": "524288"
}

View File

@@ -0,0 +1,237 @@
# This file was auto-generated, do not edit it directly.
# Instead run bin/update_build_scripts from
# https://github.com/overleaf/internal/
version: "2.3"
services:
test_unit:
image: ci/$PROJECT_NAME:$BRANCH_NAME-$BUILD_NUMBER
user: node
command: npm run test:unit:_run
environment:
NODE_ENV: test
NODE_OPTIONS: "--unhandled-rejections=strict"
test_acceptance:
build: .
image: ci/$PROJECT_NAME:$BRANCH_NAME-$BUILD_NUMBER
environment:
ELASTIC_SEARCH_DSN: es:9200
REDIS_HOST: redis
QUEUES_REDIS_HOST: redis
HISTORY_REDIS_HOST: redis
ANALYTICS_QUEUES_REDIS_HOST: redis
MONGO_HOST: mongo
POSTGRES_HOST: postgres
AWS_S3_ENDPOINT: https://minio:9000
AWS_S3_PATH_STYLE: 'true'
AWS_ACCESS_KEY_ID: OVERLEAF_HISTORY_S3_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY: OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY
MINIO_ROOT_USER: MINIO_ROOT_USER
MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD
GCS_API_ENDPOINT: http://gcs:9090
GCS_PROJECT_ID: fake
STORAGE_EMULATOR_HOST: http://gcs:9090/storage/v1
MOCHA_GREP: ${MOCHA_GREP}
NODE_ENV: test
NODE_OPTIONS: "--unhandled-rejections=strict"
volumes:
- ./test/acceptance/certs:/certs
depends_on:
mongo:
condition: service_started
redis:
condition: service_healthy
postgres:
condition: service_healthy
certs:
condition: service_completed_successfully
minio:
condition: service_started
minio_setup:
condition: service_completed_successfully
gcs:
condition: service_healthy
user: node
command: npm run test:acceptance
tar:
build: .
image: ci/$PROJECT_NAME:$BRANCH_NAME-$BUILD_NUMBER
volumes:
- ./:/tmp/build/
command: tar -czf /tmp/build/build.tar.gz --exclude=build.tar.gz --exclude-vcs .
user: root
redis:
image: redis
healthcheck:
test: ping="$$(redis-cli ping)" && [ "$$ping" = 'PONG' ]
interval: 1s
retries: 20
mongo:
image: mongo:6.0.13
command: --replSet overleaf
volumes:
- ../../bin/shared/mongodb-init-replica-set.js:/docker-entrypoint-initdb.d/mongodb-init-replica-set.js
environment:
MONGO_INITDB_DATABASE: sharelatex
extra_hosts:
# Required when using the automatic database setup for initializing the
# replica set. This override is not needed when running the setup after
# starting up mongo.
- mongo:127.0.0.1
postgres:
image: postgres:10
environment:
POSTGRES_USER: overleaf
POSTGRES_PASSWORD: overleaf
POSTGRES_DB: overleaf-history-v1-test
volumes:
- ./test/acceptance/pg-init/:/docker-entrypoint-initdb.d/
healthcheck:
test: pg_isready --quiet
interval: 1s
retries: 20
certs:
image: node:20.18.2
volumes:
- ./test/acceptance/certs:/certs
working_dir: /certs
entrypoint: sh
command:
- '-cex'
- |
if [ ! -f ./certgen ]; then
wget -O ./certgen "https://github.com/minio/certgen/releases/download/v1.3.0/certgen-linux-$(dpkg --print-architecture)"
chmod +x ./certgen
fi
if [ ! -f private.key ] || [ ! -f public.crt ]; then
./certgen -host minio
fi
minio:
image: minio/minio:RELEASE.2024-10-13T13-34-11Z
command: server /data
volumes:
- ./test/acceptance/certs:/root/.minio/certs
environment:
MINIO_ROOT_USER: MINIO_ROOT_USER
MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD
depends_on:
certs:
condition: service_completed_successfully
minio_setup:
depends_on:
certs:
condition: service_completed_successfully
minio:
condition: service_started
image: minio/mc:RELEASE.2024-10-08T09-37-26Z
volumes:
- ./test/acceptance/certs:/root/.mc/certs/CAs
entrypoint: sh
command:
- '-cex'
- |
sleep 1
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|| sleep 3 && \
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|| sleep 3 && \
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|| sleep 3 && \
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD
mc mb --ignore-existing s3/overleaf-test-history-chunks
mc mb --ignore-existing s3/overleaf-test-history-deks
mc mb --ignore-existing s3/overleaf-test-history-global-blobs
mc mb --ignore-existing s3/overleaf-test-history-project-blobs
mc admin user add s3 \
OVERLEAF_HISTORY_S3_ACCESS_KEY_ID \
OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY
echo '
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:ListBucket"
],
"Resource": "arn:aws:s3:::overleaf-test-history-chunks"
},
{
"Effect": "Allow",
"Action": [
"s3:PutObject",
"s3:GetObject",
"s3:DeleteObject"
],
"Resource": "arn:aws:s3:::overleaf-test-history-chunks/*"
},
{
"Effect": "Allow",
"Action": [
"s3:ListBucket"
],
"Resource": "arn:aws:s3:::overleaf-test-history-deks"
},
{
"Effect": "Allow",
"Action": [
"s3:PutObject",
"s3:GetObject",
"s3:DeleteObject"
],
"Resource": "arn:aws:s3:::overleaf-test-history-deks/*"
},
{
"Effect": "Allow",
"Action": [
"s3:ListBucket"
],
"Resource": "arn:aws:s3:::overleaf-test-history-global-blobs"
},
{
"Effect": "Allow",
"Action": [
"s3:PutObject",
"s3:GetObject",
"s3:DeleteObject"
],
"Resource": "arn:aws:s3:::overleaf-test-history-global-blobs/*"
},
{
"Effect": "Allow",
"Action": [
"s3:ListBucket"
],
"Resource": "arn:aws:s3:::overleaf-test-history-project-blobs"
},
{
"Effect": "Allow",
"Action": [
"s3:PutObject",
"s3:GetObject",
"s3:DeleteObject"
],
"Resource": "arn:aws:s3:::overleaf-test-history-project-blobs/*"
}
]
}' > policy-history.json
mc admin policy create s3 overleaf-history policy-history.json
mc admin policy attach s3 overleaf-history \
--user=OVERLEAF_HISTORY_S3_ACCESS_KEY_ID
gcs:
image: fsouza/fake-gcs-server:1.45.2
command: ["--port=9090", "--scheme=http"]
healthcheck:
test: wget --quiet --output-document=/dev/null http://localhost:9090/storage/v1/b
interval: 1s
retries: 20

View File

@@ -0,0 +1,246 @@
# This file was auto-generated, do not edit it directly.
# Instead run bin/update_build_scripts from
# https://github.com/overleaf/internal/
version: "2.3"
services:
test_unit:
build:
context: ../..
dockerfile: services/history-v1/Dockerfile
target: base
volumes:
- .:/overleaf/services/history-v1
- ../../node_modules:/overleaf/node_modules
- ../../libraries:/overleaf/libraries
working_dir: /overleaf/services/history-v1
environment:
MOCHA_GREP: ${MOCHA_GREP}
LOG_LEVEL: ${LOG_LEVEL:-}
NODE_ENV: test
NODE_OPTIONS: "--unhandled-rejections=strict"
command: npm run --silent test:unit
user: node
test_acceptance:
build:
context: ../..
dockerfile: services/history-v1/Dockerfile
target: base
volumes:
- .:/overleaf/services/history-v1
- ../../node_modules:/overleaf/node_modules
- ../../libraries:/overleaf/libraries
- ./test/acceptance/certs:/certs
working_dir: /overleaf/services/history-v1
environment:
ELASTIC_SEARCH_DSN: es:9200
REDIS_HOST: redis
HISTORY_REDIS_HOST: redis
QUEUES_REDIS_HOST: redis
ANALYTICS_QUEUES_REDIS_HOST: redis
MONGO_HOST: mongo
POSTGRES_HOST: postgres
AWS_S3_ENDPOINT: https://minio:9000
AWS_S3_PATH_STYLE: 'true'
AWS_ACCESS_KEY_ID: OVERLEAF_HISTORY_S3_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY: OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY
MINIO_ROOT_USER: MINIO_ROOT_USER
MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD
GCS_API_ENDPOINT: http://gcs:9090
GCS_PROJECT_ID: fake
STORAGE_EMULATOR_HOST: http://gcs:9090/storage/v1
MOCHA_GREP: ${MOCHA_GREP}
LOG_LEVEL: ${LOG_LEVEL:-}
NODE_ENV: test
NODE_OPTIONS: "--unhandled-rejections=strict"
user: node
depends_on:
mongo:
condition: service_started
redis:
condition: service_healthy
postgres:
condition: service_healthy
certs:
condition: service_completed_successfully
minio:
condition: service_started
minio_setup:
condition: service_completed_successfully
gcs:
condition: service_healthy
command: npm run --silent test:acceptance
redis:
image: redis
healthcheck:
test: ping=$$(redis-cli ping) && [ "$$ping" = 'PONG' ]
interval: 1s
retries: 20
mongo:
image: mongo:6.0.13
command: --replSet overleaf
volumes:
- ../../bin/shared/mongodb-init-replica-set.js:/docker-entrypoint-initdb.d/mongodb-init-replica-set.js
environment:
MONGO_INITDB_DATABASE: sharelatex
extra_hosts:
# Required when using the automatic database setup for initializing the
# replica set. This override is not needed when running the setup after
# starting up mongo.
- mongo:127.0.0.1
postgres:
image: postgres:10
environment:
POSTGRES_USER: overleaf
POSTGRES_PASSWORD: overleaf
POSTGRES_DB: overleaf-history-v1-test
volumes:
- ./test/acceptance/pg-init/:/docker-entrypoint-initdb.d/
healthcheck:
test: pg_isready --host=localhost --quiet
interval: 1s
retries: 20
certs:
image: node:20.18.2
volumes:
- ./test/acceptance/certs:/certs
working_dir: /certs
entrypoint: sh
command:
- '-cex'
- |
if [ ! -f ./certgen ]; then
wget -O ./certgen "https://github.com/minio/certgen/releases/download/v1.3.0/certgen-linux-$(dpkg --print-architecture)"
chmod +x ./certgen
fi
if [ ! -f private.key ] || [ ! -f public.crt ]; then
./certgen -host minio
fi
minio:
image: minio/minio:RELEASE.2024-10-13T13-34-11Z
command: server /data
volumes:
- ./test/acceptance/certs:/root/.minio/certs
environment:
MINIO_ROOT_USER: MINIO_ROOT_USER
MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD
depends_on:
certs:
condition: service_completed_successfully
minio_setup:
depends_on:
certs:
condition: service_completed_successfully
minio:
condition: service_started
image: minio/mc:RELEASE.2024-10-08T09-37-26Z
volumes:
- ./test/acceptance/certs:/root/.mc/certs/CAs
entrypoint: sh
command:
- '-cex'
- |
sleep 1
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|| sleep 3 && \
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|| sleep 3 && \
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|| sleep 3 && \
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD
mc mb --ignore-existing s3/overleaf-test-history-chunks
mc mb --ignore-existing s3/overleaf-test-history-deks
mc mb --ignore-existing s3/overleaf-test-history-global-blobs
mc mb --ignore-existing s3/overleaf-test-history-project-blobs
mc admin user add s3 \
OVERLEAF_HISTORY_S3_ACCESS_KEY_ID \
OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY
echo '
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:ListBucket"
],
"Resource": "arn:aws:s3:::overleaf-test-history-chunks"
},
{
"Effect": "Allow",
"Action": [
"s3:PutObject",
"s3:GetObject",
"s3:DeleteObject"
],
"Resource": "arn:aws:s3:::overleaf-test-history-chunks/*"
},
{
"Effect": "Allow",
"Action": [
"s3:ListBucket"
],
"Resource": "arn:aws:s3:::overleaf-test-history-deks"
},
{
"Effect": "Allow",
"Action": [
"s3:PutObject",
"s3:GetObject",
"s3:DeleteObject"
],
"Resource": "arn:aws:s3:::overleaf-test-history-deks/*"
},
{
"Effect": "Allow",
"Action": [
"s3:ListBucket"
],
"Resource": "arn:aws:s3:::overleaf-test-history-global-blobs"
},
{
"Effect": "Allow",
"Action": [
"s3:PutObject",
"s3:GetObject",
"s3:DeleteObject"
],
"Resource": "arn:aws:s3:::overleaf-test-history-global-blobs/*"
},
{
"Effect": "Allow",
"Action": [
"s3:ListBucket"
],
"Resource": "arn:aws:s3:::overleaf-test-history-project-blobs"
},
{
"Effect": "Allow",
"Action": [
"s3:PutObject",
"s3:GetObject",
"s3:DeleteObject"
],
"Resource": "arn:aws:s3:::overleaf-test-history-project-blobs/*"
}
]
}' > policy-history.json
mc admin policy create s3 overleaf-history policy-history.json
mc admin policy attach s3 overleaf-history \
--user=OVERLEAF_HISTORY_S3_ACCESS_KEY_ID
gcs:
image: fsouza/fake-gcs-server:1.45.2
command: ["--port=9090", "--scheme=http"]
healthcheck:
test: wget --quiet --output-document=/dev/null http://localhost:9090/storage/v1/b
interval: 1s
retries: 20

View File

@@ -0,0 +1,9 @@
#!/bin/sh
set -ex
apt-get update
apt-get install jq parallel --yes
rm -rf /var/lib/apt/lists/*

View File

@@ -0,0 +1,19 @@
const config = require('config')
const baseConfig = {
client: 'postgresql',
connection: config.herokuDatabaseUrl || config.databaseUrl,
pool: {
min: parseInt(config.databasePoolMin, 10),
max: parseInt(config.databasePoolMax, 10),
},
migrations: {
tableName: 'knex_migrations',
},
}
module.exports = {
development: baseConfig,
production: baseConfig,
test: baseConfig,
}

View File

@@ -0,0 +1,80 @@
/**
* This is the initial migration, meant to replicate the current state of the
* history database. If tables already exist, this migration is a noop.
*/
exports.up = async function (knex) {
await knex.raw(`
CREATE TABLE IF NOT EXISTS chunks (
id SERIAL,
doc_id integer NOT NULL,
end_version integer NOT NULL,
end_timestamp timestamp without time zone,
CONSTRAINT chunks_version_non_negative CHECK (end_version >= 0)
)
`)
await knex.raw(`
CREATE UNIQUE INDEX IF NOT EXISTS index_chunks_on_doc_id_and_end_version
ON chunks (doc_id, end_version)
`)
await knex.raw(`
CREATE TABLE IF NOT EXISTS old_chunks (
chunk_id integer NOT NULL PRIMARY KEY,
doc_id integer NOT NULL,
end_version integer,
end_timestamp timestamp without time zone,
deleted_at timestamp without time zone
)
`)
await knex.raw(`
CREATE INDEX IF NOT EXISTS index_old_chunks_on_doc_id_and_end_version
ON old_chunks (doc_id, end_version)
`)
await knex.raw(`
CREATE TABLE IF NOT EXISTS pending_chunks (
id SERIAL,
doc_id integer NOT NULL,
end_version integer NOT NULL,
end_timestamp timestamp without time zone,
CONSTRAINT chunks_version_non_negative CHECK (end_version >= 0)
)
`)
await knex.raw(`
CREATE INDEX IF NOT EXISTS index_pending_chunks_on_doc_id_and_id
ON pending_chunks (doc_id, id)
`)
await knex.raw(`
CREATE TABLE IF NOT EXISTS blobs (
hash_bytes bytea NOT NULL PRIMARY KEY,
byte_length integer NOT NULL,
string_length integer,
global boolean,
CONSTRAINT blobs_byte_length_non_negative CHECK (byte_length >= 0),
CONSTRAINT blobs_string_length_non_negative
CHECK (string_length IS NULL OR string_length >= 0)
)
`)
await knex.raw(`
CREATE TABLE IF NOT EXISTS project_blobs (
project_id integer NOT NULL,
hash_bytes bytea NOT NULL,
byte_length integer NOT NULL,
string_length integer,
PRIMARY KEY (project_id, hash_bytes),
CONSTRAINT project_blobs_byte_length_non_negative
CHECK (byte_length >= 0),
CONSTRAINT project_blobs_string_length_non_negative
CHECK (string_length IS NULL OR string_length >= 0)
)
`)
await knex.raw(`CREATE SEQUENCE IF NOT EXISTS docs_id_seq`)
}
exports.down = async function (knex) {
// Don't do anything on the down migration
}

View File

@@ -0,0 +1,23 @@
exports.up = async function (knex) {
await knex.raw(`
ALTER TABLE chunks ADD COLUMN start_version integer
`)
await knex.raw(`
ALTER TABLE pending_chunks ADD COLUMN start_version integer
`)
await knex.raw(`
ALTER TABLE old_chunks ADD COLUMN start_version integer
`)
}
exports.down = async function (knex) {
await knex.raw(`
ALTER TABLE chunks DROP COLUMN start_version
`)
await knex.raw(`
ALTER TABLE pending_chunks DROP COLUMN start_version
`)
await knex.raw(`
ALTER TABLE old_chunks DROP COLUMN start_version
`)
}

View File

@@ -0,0 +1,41 @@
exports.config = {
// CREATE INDEX CONCURRENTLY can't be run inside a transaction
// If this migration fails in the middle, indexes and constraints will have
// to be cleaned up manually.
transaction: false,
}
exports.up = async function (knex) {
await knex.raw(`
ALTER TABLE chunks
ADD CONSTRAINT chunks_start_version_non_negative
CHECK (start_version IS NOT NULL AND start_version >= 0)
NOT VALID
`)
await knex.raw(`
ALTER TABLE chunks
VALIDATE CONSTRAINT chunks_start_version_non_negative
`)
await knex.raw(`
CREATE UNIQUE INDEX CONCURRENTLY index_chunks_on_doc_id_and_start_version
ON chunks (doc_id, start_version)
`)
await knex.raw(`
ALTER TABLE chunks
ADD UNIQUE USING INDEX index_chunks_on_doc_id_and_start_version
`)
}
exports.down = async function (knex) {
await knex.raw(`
ALTER TABLE chunks
DROP CONSTRAINT IF EXISTS index_chunks_on_doc_id_and_start_version
`)
await knex.raw(`
DROP INDEX IF EXISTS index_chunks_on_doc_id_and_start_version
`)
await knex.raw(`
ALTER TABLE chunks
DROP CONSTRAINT IF EXISTS chunks_start_version_non_negative
`)
}

View File

@@ -0,0 +1,7 @@
exports.up = async function (knex) {
await knex.raw(`DROP TABLE IF EXISTS blobs`)
}
exports.down = function (knex) {
// Not reversible
}

View File

@@ -0,0 +1,27 @@
// @ts-check
/**
* @import { Knex } from "knex"
*/
/**
* @param { Knex } knex
* @returns { Promise<void> }
*/
exports.up = async function (knex) {
await knex.raw(`
ALTER TABLE chunks
ADD COLUMN closed BOOLEAN NOT NULL DEFAULT FALSE
`)
}
/**
* @param { Knex } knex
* @returns { Promise<void> }
*/
exports.down = async function (knex) {
await knex.raw(`
ALTER TABLE chunks
DROP COLUMN closed
`)
}

View File

@@ -0,0 +1,76 @@
{
"name": "overleaf-editor",
"version": "1.0.0",
"description": "Overleaf Editor.",
"author": "",
"license": "Proprietary",
"private": true,
"dependencies": {
"@google-cloud/secret-manager": "^5.6.0",
"@overleaf/logger": "*",
"@overleaf/metrics": "*",
"@overleaf/mongo-utils": "*",
"@overleaf/o-error": "*",
"@overleaf/object-persistor": "*",
"@overleaf/promise-utils": "*",
"@overleaf/redis-wrapper": "*",
"@overleaf/settings": "*",
"@overleaf/stream-utils": "^0.1.0",
"archiver": "^5.3.0",
"basic-auth": "^2.0.1",
"bluebird": "^3.7.2",
"body-parser": "^1.20.3",
"bull": "^4.16.5",
"bunyan": "^1.8.12",
"check-types": "^11.1.2",
"command-line-args": "^3.0.3",
"config": "^1.19.0",
"express": "^4.21.2",
"fs-extra": "^9.0.1",
"generic-pool": "^2.1.1",
"helmet": "^3.22.0",
"http-status": "^1.4.2",
"jsonwebtoken": "^9.0.0",
"knex": "^2.4.0",
"lodash": "^4.17.19",
"mongodb": "6.12.0",
"overleaf-editor-core": "*",
"p-limit": "^6.2.0",
"pg": "^8.7.1",
"pg-query-stream": "^4.2.4",
"swagger-tools": "^0.10.4",
"temp": "^0.8.3",
"throng": "^4.0.0",
"tsscmp": "^1.0.6",
"utf-8-validate": "^5.0.4"
},
"devDependencies": {
"benny": "^3.7.1",
"chai": "^4.3.6",
"chai-as-promised": "^7.1.1",
"chai-exclude": "^2.1.1",
"mocha": "^11.1.0",
"node-fetch": "^2.7.0",
"sinon": "^9.0.2",
"swagger-client": "^3.10.0",
"typescript": "^5.0.4",
"yauzl": "^2.9.1"
},
"scripts": {
"start": "node app.js",
"lint": "eslint --max-warnings 0 --format unix .",
"lint:fix": "eslint --fix .",
"format": "prettier --list-different $PWD/'**/*.*js'",
"format:fix": "prettier --write $PWD/'**/*.*js'",
"test:unit": "npm run test:unit:_run -- --grep=$MOCHA_GREP",
"test:acceptance": "npm run test:acceptance:_run -- --grep=$MOCHA_GREP",
"test:unit:_run": "mocha --recursive --reporter spec $@ test/unit/js",
"test:acceptance:_run": "mocha --recursive --reporter spec --timeout 15000 --exit $@ test/acceptance/js",
"nodemon": "node --watch app.js",
"migrate": "knex migrate:latest",
"delete_old_chunks": "node storage/tasks/delete_old_chunks.js",
"fix_duplicate_versions": "node storage/tasks/fix_duplicate_versions.js",
"benchmarks": "node benchmarks/index.js",
"types:check": "tsc --noEmit"
}
}

View File

@@ -0,0 +1,25 @@
exports.BatchBlobStore = require('./lib/batch_blob_store')
exports.blobHash = require('./lib/blob_hash')
exports.HashCheckBlobStore = require('./lib/hash_check_blob_store')
exports.chunkBuffer = require('./lib/chunk_buffer')
exports.chunkStore = require('./lib/chunk_store')
exports.historyStore = require('./lib/history_store').historyStore
exports.knex = require('./lib/knex')
exports.mongodb = require('./lib/mongodb')
exports.redis = require('./lib/redis')
exports.persistChanges = require('./lib/persist_changes')
exports.persistor = require('./lib/persistor')
exports.ProjectArchive = require('./lib/project_archive')
exports.streams = require('./lib/streams')
exports.temp = require('./lib/temp')
exports.zipStore = require('./lib/zip_store')
const { BlobStore, loadGlobalBlobs } = require('./lib/blob_store')
exports.BlobStore = BlobStore
exports.loadGlobalBlobs = loadGlobalBlobs
const { InvalidChangeError } = require('./lib/errors')
exports.InvalidChangeError = InvalidChangeError
const { ChunkVersionConflictError } = require('./lib/chunk_store/errors')
exports.ChunkVersionConflictError = ChunkVersionConflictError
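Callers pull these in through the package entry point; a minimal sketch (require path and project id assumed):

const { BlobStore, loadGlobalBlobs } = require('../storage') // path assumed

async function example() {
  await loadGlobalBlobs() // must run before relying on the global blob list
  const blobStore = new BlobStore('000000000000000000000042') // example history id
  return blobStore
}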

View File

@@ -0,0 +1,76 @@
'use strict'
const OError = require('@overleaf/o-error')
const check = require('check-types')
const { Blob } = require('overleaf-editor-core')
const assert = check.assert
const MONGO_ID_REGEXP = /^[0-9a-f]{24}$/
const POSTGRES_ID_REGEXP = /^[1-9][0-9]{0,9}$/
const MONGO_OR_POSTGRES_ID_REGEXP = /^([0-9a-f]{24}|[1-9][0-9]{0,9})$/
function transaction(transaction, message) {
assert.function(transaction, message)
}
function blobHash(arg, message) {
try {
assert.match(arg, Blob.HEX_HASH_RX, message)
} catch (error) {
throw OError.tag(error, message, { arg })
}
}
/**
* A project id is a string that contains either an integer (for projects stored in Postgres) or 24
* hex digits (for projects stored in Mongo)
*/
function projectId(arg, message) {
try {
assert.match(arg, MONGO_OR_POSTGRES_ID_REGEXP, message)
} catch (error) {
throw OError.tag(error, message, { arg })
}
}
/**
* A chunk id is a string that contains either an integer (for projects stored in Postgres) or 24
* hex digits (for projects stored in Mongo)
*/
function chunkId(arg, message) {
try {
assert.match(arg, MONGO_OR_POSTGRES_ID_REGEXP, message)
} catch (error) {
throw OError.tag(error, message, { arg })
}
}
function mongoId(arg, message) {
try {
assert.match(arg, MONGO_ID_REGEXP, message)
} catch (error) {
throw OError.tag(error, message, { arg })
}
}
function postgresId(arg, message) {
try {
assert.match(arg, POSTGRES_ID_REGEXP, message)
} catch (error) {
throw OError.tag(error, message, { arg })
}
}
module.exports = {
...assert,
transaction,
blobHash,
projectId,
chunkId,
mongoId,
postgresId,
MONGO_ID_REGEXP,
POSTGRES_ID_REGEXP,
}
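A quick sketch of the id helpers (require path assumed): projectId accepts either id flavour, while mongoId and postgresId are strict.

const assert = require('./assert') // path assumed

assert.projectId('42', 'bad project id') // ok: Postgres-style integer id
assert.projectId('000000000000000000000042', 'bad project id') // ok: 24 hex digits (Mongo)
assert.mongoId('42', 'expected a mongo id') // throws: not 24 hex digits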

View File

@@ -0,0 +1,251 @@
// @ts-check
import { backupPersistor, projectBlobsBucket } from './backupPersistor.mjs'
import { GLOBAL_BLOBS, makeProjectKey, BlobStore } from './blob_store/index.js'
import Stream from 'node:stream'
import fs from 'node:fs'
import Crypto from 'node:crypto'
import assert from './assert.js'
import { backedUpBlobs, projects } from './mongodb.js'
import { Binary, ObjectId } from 'mongodb'
import logger from '@overleaf/logger/logging-manager.js'
import { AlreadyWrittenError } from '@overleaf/object-persistor/src/Errors.js'
import metrics from '@overleaf/metrics'
import zLib from 'node:zlib'
import Path from 'node:path'
const HIGHWATER_MARK = 1024 * 1024
/**
* @typedef {import("overleaf-editor-core").Blob} Blob
*/
/**
* @typedef {import("@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor").CachedPerProjectEncryptedS3Persistor} CachedPerProjectEncryptedS3Persistor
*/
/**
* Increment a metric to record the outcome of a backup operation.
*
* @param {"success"|"failure"|"skipped"} status
* @param {"global"|"already_backed_up"|"none"} reason
*/
function recordBackupConclusion(status, reason = 'none') {
metrics.inc('blob_backed_up', 1, { status, reason })
}
/**
* Downloads a blob to a specified directory
*
* @param {string} historyId - The history ID of the project the blob belongs to
* @param {Blob} blob - The blob to download
* @param {string} tmpDir - The directory path where the blob will be downloaded
* @returns {Promise<string>} The full path where the blob was downloaded
*/
export async function downloadBlobToDir(historyId, blob, tmpDir) {
const blobStore = new BlobStore(historyId)
const blobHash = blob.getHash()
const src = await blobStore.getStream(blobHash)
const filePath = Path.join(tmpDir, `${historyId}-${blobHash}`)
try {
const dst = fs.createWriteStream(filePath, {
highWaterMark: HIGHWATER_MARK,
flags: 'wx',
})
await Stream.promises.pipeline(src, dst)
return filePath
} catch (error) {
try {
await fs.promises.unlink(filePath)
} catch {}
throw error
}
}
/**
* Performs the actual upload of the blob to the backup storage.
*
* @param {string} historyId - The history ID of the project the blob belongs to
* @param {Blob} blob - The blob being uploaded
* @param {string} path - The path to the file to upload (should have been stored on disk already)
* @return {Promise<void>}
*/
export async function uploadBlobToBackup(historyId, blob, path, persistor) {
const md5 = Crypto.createHash('md5')
const filePathCompressed = path + '.gz'
let backupSource
let contentEncoding
let size
try {
if (blob.getStringLength()) {
backupSource = filePathCompressed
contentEncoding = 'gzip'
size = 0
await Stream.promises.pipeline(
fs.createReadStream(path, { highWaterMark: HIGHWATER_MARK }),
zLib.createGzip(),
async function* (source) {
for await (const chunk of source) {
size += chunk.byteLength
md5.update(chunk)
yield chunk
}
},
fs.createWriteStream(filePathCompressed, {
highWaterMark: HIGHWATER_MARK,
})
)
} else {
backupSource = path
size = blob.getByteLength()
await Stream.promises.pipeline(
fs.createReadStream(path, { highWaterMark: HIGHWATER_MARK }),
md5
)
}
const key = makeProjectKey(historyId, blob.getHash())
await persistor.sendStream(
projectBlobsBucket,
key,
fs.createReadStream(backupSource, { highWaterMark: HIGHWATER_MARK }),
{
contentEncoding,
contentType: 'application/octet-stream',
contentLength: size,
sourceMd5: md5.digest('hex'),
ifNoneMatch: '*',
}
)
} finally {
if (backupSource === filePathCompressed) {
try {
await fs.promises.rm(filePathCompressed, { force: true })
} catch {}
}
}
}
/**
* Converts a legacy (postgres) historyId to a mongo projectId
*
* @param {string} historyId
* @return {Promise<string>}
* @private
*/
async function _convertLegacyHistoryIdToProjectId(historyId) {
const project = await projects.findOne(
{ 'overleaf.history.id': parseInt(historyId) },
{ projection: { _id: 1 } }
)
if (!project?._id) {
throw new Error('Did not find project for history id')
}
return project?._id?.toString()
}
/**
* Records that a blob was backed up for a project.
*
* @param {string} projectId - projectId for a project (mongo format)
* @param {string} hash
* @return {Promise<void>}
*/
export async function storeBlobBackup(projectId, hash) {
await backedUpBlobs.updateOne(
{ _id: new ObjectId(projectId) },
{ $addToSet: { blobs: new Binary(Buffer.from(hash, 'hex')) } },
{ upsert: true }
)
}
/**
* Determine whether a specific blob has been backed up in this project.
*
* @param {string} projectId
* @param {string} hash
* @return {Promise<*>}
* @private
*/
export async function _blobIsBackedUp(projectId, hash) {
const blobs = await backedUpBlobs.findOne(
{
_id: new ObjectId(projectId),
blobs: new Binary(Buffer.from(hash, 'hex')),
},
{ projection: { _id: 1 } }
)
return blobs?._id
}
/**
* Back up a blob to the global storage and record that it was backed up.
*
* @param {string} historyId - history ID for a project (can be postgres format or mongo format)
* @param {Blob} blob - The blob that is being backed up
* @param {string} tmpPath - The path to a temporary file storing the contents of the blob.
* @param {CachedPerProjectEncryptedS3Persistor} [persistor] - The persistor to use (optional)
* @return {Promise<void>}
*/
export async function backupBlob(historyId, blob, tmpPath, persistor) {
const hash = blob.getHash()
let projectId = historyId
if (assert.POSTGRES_ID_REGEXP.test(historyId)) {
projectId = await _convertLegacyHistoryIdToProjectId(historyId)
}
const globalBlob = GLOBAL_BLOBS.get(hash)
if (globalBlob && !globalBlob.demoted) {
recordBackupConclusion('skipped', 'global')
logger.debug({ projectId, hash }, 'Blob is global - skipping backup')
return
}
try {
if (await _blobIsBackedUp(projectId, hash)) {
recordBackupConclusion('skipped', 'already_backed_up')
logger.debug(
{ projectId, hash },
'Blob already backed up - skipping backup'
)
return
}
} catch (error) {
logger.warn({ error }, 'Failed to check if blob is backed up')
// We'll try anyway - we'll catch the error if it was backed up
}
// If we weren't passed a persistor for this project, create one.
// This will fetch the key from AWS, so it's preferable to use
// the same persistor for all blobs in a project where possible.
if (!persistor) {
logger.debug(
{ historyId, hash },
'warning: persistor not passed to backupBlob'
)
}
persistor ??= await backupPersistor.forProject(
projectBlobsBucket,
makeProjectKey(historyId, '')
)
try {
logger.debug({ projectId, hash }, 'Starting blob backup')
await uploadBlobToBackup(historyId, blob, tmpPath, persistor)
await storeBlobBackup(projectId, hash)
recordBackupConclusion('success')
} catch (error) {
if (error instanceof AlreadyWrittenError) {
logger.debug({ error, projectId, hash }, 'Blob already backed up')
// record that we backed it up already
await storeBlobBackup(projectId, hash)
recordBackupConclusion('failure', 'already_backed_up')
return
}
// eventually queue this for retry - for now this will be fixed by running the script
recordBackupConclusion('failure')
logger.warn({ error, projectId, hash }, 'Failed to upload blob to backup')
} finally {
logger.debug({ projectId, hash }, 'Ended blob backup')
}
}
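A usage sketch (import paths, the hash, and the temp directory are assumptions): back up one blob with a per-project persistor, which is preferable when several blobs from the same project are processed because the data encryption key is fetched only once.

import { BlobStore, makeProjectKey } from './blob_store/index.js'
import { backupPersistor, projectBlobsBucket } from './backupPersistor.mjs'
import { backupBlob, downloadBlobToDir } from './backupBlob.mjs' // file name assumed

const historyId = '000000000000000000000042' // example project
const hash = 'f70d7bba4ae1f07682e0358bd7a2068094fc023b' // example blob hash
const blobStore = new BlobStore(historyId)
const [blob] = await blobStore.getBlobs([hash])

// Reuse one persistor (and one DEK fetch) for all blobs in the project.
const persistor = await backupPersistor.forProject(
  projectBlobsBucket,
  makeProjectKey(historyId, '')
)
const tmpPath = await downloadBlobToDir(historyId, blob, '/tmp') // tmpDir assumed
await backupBlob(historyId, blob, tmpPath, persistor)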

View File

@@ -0,0 +1,93 @@
// @ts-check
import { callbackify } from 'util'
import { ObjectId } from 'mongodb'
import config from 'config'
import OError from '@overleaf/o-error'
import { db } from './mongodb.js'
import projectKey from './project_key.js'
import chunkStore from '../lib/chunk_store/index.js'
import {
backupPersistor,
chunksBucket,
projectBlobsBucket,
} from './backupPersistor.mjs'
const MS_PER_DAY = 24 * 60 * 60 * 1000
const EXPIRE_PROJECTS_AFTER_MS =
parseInt(config.get('minSoftDeletionPeriodDays'), 10) * MS_PER_DAY
const deletedProjectsCollection = db.collection('deletedProjects')
/**
* @param {string} historyId
* @return {Promise<boolean>}
*/
async function projectHasLatestChunk(historyId) {
const chunk = await chunkStore.getBackend(historyId).getLatestChunk(historyId)
return chunk != null
}
export class NotReadyToDelete extends OError {}
/**
* @param {string} projectId
* @return {Promise<void>}
*/
async function deleteProjectBackup(projectId) {
const deletedProject = await deletedProjectsCollection.findOne(
{ 'deleterData.deletedProjectId': new ObjectId(projectId) },
{
projection: {
'deleterData.deletedProjectOverleafHistoryId': 1,
'deleterData.deletedAt': 1,
},
}
)
if (!deletedProject) {
throw new NotReadyToDelete('refusing to delete non-deleted project')
}
const expiresAt =
deletedProject.deleterData.deletedAt.getTime() + EXPIRE_PROJECTS_AFTER_MS
if (expiresAt > Date.now()) {
throw new NotReadyToDelete('refusing to delete non-expired project')
}
const historyId =
deletedProject.deleterData.deletedProjectOverleafHistoryId?.toString()
if (!historyId) {
throw new NotReadyToDelete(
'refusing to delete project with unknown historyId'
)
}
if (await projectHasLatestChunk(historyId)) {
throw new NotReadyToDelete(
'refusing to delete project with remaining chunks'
)
}
const prefix = projectKey.format(historyId) + '/'
await backupPersistor.deleteDirectory(chunksBucket, prefix)
await backupPersistor.deleteDirectory(projectBlobsBucket, prefix)
}
export async function healthCheck() {
const HEALTH_CHECK_PROJECTS = JSON.parse(config.get('healthCheckProjects'))
if (HEALTH_CHECK_PROJECTS.length !== 2) {
throw new Error('expected 2 healthCheckProjects')
}
if (!HEALTH_CHECK_PROJECTS.some(id => id.length === 24)) {
throw new Error('expected mongo id in healthCheckProjects')
}
if (!HEALTH_CHECK_PROJECTS.some(id => id.length < 24)) {
throw new Error('expected postgres id in healthCheckProjects')
}
for (const historyId of HEALTH_CHECK_PROJECTS) {
if (!(await projectHasLatestChunk(historyId))) {
throw new Error(`project has no history: ${historyId}`)
}
}
}
export const healthCheckCb = callbackify(healthCheck)
export const deleteProjectBackupCb = callbackify(deleteProjectBackup)

View File

@@ -0,0 +1,152 @@
/**
* Provides a generator function to back up project chunks and blobs.
*/
import chunkStore from './chunk_store/index.js'
import {
GLOBAL_BLOBS, // NOTE: must call loadGlobalBlobs() before using this
BlobStore,
} from './blob_store/index.js'
import assert from './assert.js'
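/**
 * Seeds seenBlobs with the blob hashes of the previous chunk when the current
 * chunk's snapshot has not been backed up yet, so those blobs are not uploaded again.
 *
 * @param {string} projectId
 * @param {Object} chunk - The chunk record (with startVersion) being backed up
 * @param {number} lastBackedUpVersion
 * @param {Set<string>} seenBlobs - Set to collect previously seen blob hashes
 */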
async function lookBehindForSeenBlobs(
projectId,
chunk,
lastBackedUpVersion,
seenBlobs
) {
if (chunk.startVersion === 0) {
return // this is the first chunk, no need to check for blobs in the previous chunk
}
if (chunk.startVersion > 0 && lastBackedUpVersion > chunk.startVersion) {
return // the snapshot in this chunk has already been backed up
}
if (
chunk.startVersion > 0 &&
lastBackedUpVersion === chunk.startVersion // same as previousChunk.endVersion
) {
// the snapshot in this chunk has not been backed up
// so we find the set of backed up blobs from the previous chunk
const previousChunk = await chunkStore.loadAtVersion(
projectId,
lastBackedUpVersion
)
const previousChunkHistory = previousChunk.getHistory()
previousChunkHistory.findBlobHashes(seenBlobs)
}
}
/**
* Records blob hashes that have been previously seen in a chunk's history.
*
* @param {Object} chunk - The chunk containing history data
* @param {number} currentBackedUpVersion - The version number that has been backed up
* @param {Set<string>} seenBlobs - Set to collect previously seen blob hashes
* @returns {void}
*/
function recordPreviouslySeenBlobs(chunk, currentBackedUpVersion, seenBlobs) {
// We need to look at the chunk and decide how far we have backed up.
// If we have not backed up this chunk at all, we need to backup the blobs
// in the snapshot. Otherwise we need to backup the blobs in the changes
// that have occurred since the last backup.
const history = chunk.getHistory()
const startVersion = chunk.getStartVersion()
if (currentBackedUpVersion === 0) {
// If we have only backed up version 0 (i.e. the first change)
// then that includes the initial snapshot, so we consider
// the blobs of the initial snapshot as seen. If the project
// has not been backed up at all then currentBackedUpVersion
// will be undefined.
history.snapshot.findBlobHashes(seenBlobs)
} else if (currentBackedUpVersion > startVersion) {
history.snapshot.findBlobHashes(seenBlobs)
for (let i = 0; i < currentBackedUpVersion - startVersion; i++) {
history.changes[i].findBlobHashes(seenBlobs)
}
}
}
/**
* Collects new blob objects that need to be backed up from a given chunk.
*
* @param {Object} chunk - The chunk object containing history data
* @param {Object} blobStore - Storage interface for retrieving blobs
* @param {Set<string>} seenBlobs - Set of blob hashes that have already been processed
* @returns {Promise<Object[]>} Array of blob objects that need to be backed up
* @throws {Error} If blob retrieval fails
*/
async function collectNewBlobsForBackup(chunk, blobStore, seenBlobs) {
/** @type {Set<string>} */
const blobHashes = new Set()
const history = chunk.getHistory()
// Get all the blobs in this chunk, then exclude the seenBlobs and global blobs
history.findBlobHashes(blobHashes)
const blobsToBackup = await blobStore.getBlobs(
[...blobHashes].filter(
hash =>
hash &&
!seenBlobs.has(hash) &&
(!GLOBAL_BLOBS.has(hash) || GLOBAL_BLOBS.get(hash).demoted)
)
)
return blobsToBackup
}
/**
* Asynchronously generates backups for a project based on provided versions.
* @param {string} projectId - The ID of the project's history to back up.
* @param {number} lastBackedUpVersion - The last version that was successfully backed up.
* @yields {AsyncGenerator<{ chunkRecord: object, chunkToBackup: object, chunkBuffer: Buffer, blobsToBackup: object[] }>}
* Yields chunk records and corresponding data needed for backups.
*/
export async function* backupGenerator(projectId, lastBackedUpVersion) {
assert.projectId(projectId, 'bad projectId')
assert.maybe.integer(lastBackedUpVersion, 'bad lastBackedUpVersion')
const blobStore = new BlobStore(projectId)
/** @type {Set<string>} */
const seenBlobs = new Set() // records the blobs that are already backed up
const firstPendingVersion =
lastBackedUpVersion >= 0 ? lastBackedUpVersion + 1 : 0
let isStartingChunk = true
let currentBackedUpVersion = lastBackedUpVersion
const chunkRecordIterator = chunkStore.getProjectChunksFromVersion(
projectId,
firstPendingVersion
)
for await (const chunkRecord of chunkRecordIterator) {
const { chunk, chunkBuffer } = await chunkStore.loadByChunkRecord(
projectId,
chunkRecord
)
if (isStartingChunk) {
await lookBehindForSeenBlobs(
projectId,
chunkRecord,
lastBackedUpVersion,
seenBlobs
)
isStartingChunk = false
}
recordPreviouslySeenBlobs(chunk, currentBackedUpVersion, seenBlobs)
const blobsToBackup = await collectNewBlobsForBackup(
chunk,
blobStore,
seenBlobs
)
yield { chunkRecord, chunkToBackup: chunk, chunkBuffer, blobsToBackup }
// After we generate a backup of this chunk, mark the backed up blobs as seen
blobsToBackup.forEach(blob => seenBlobs.add(blob.getHash()))
currentBackedUpVersion = chunkRecord.endVersion
}
}
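// Illustrative consumption of the generator (persistBlob and persistChunk are
// hypothetical callbacks standing in for the real backup writers):
//
//   for await (const {
//     chunkRecord,
//     chunkBuffer,
//     blobsToBackup,
//   } of backupGenerator(projectId, lastBackedUpVersion)) {
//     for (const blob of blobsToBackup) await persistBlob(blob)
//     await persistChunk(chunkRecord, chunkBuffer)
//   }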

View File

@@ -0,0 +1,121 @@
// @ts-check
import fs from 'node:fs'
import Path from 'node:path'
import _ from 'lodash'
import config from 'config'
import { SecretManagerServiceClient } from '@google-cloud/secret-manager'
import OError from '@overleaf/o-error'
import {
PerProjectEncryptedS3Persistor,
RootKeyEncryptionKey,
} from '@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor.js'
import { HistoryStore } from './history_store.js'
const persistorConfig = _.cloneDeep(config.get('backupPersistor'))
const { chunksBucket, deksBucket, globalBlobsBucket, projectBlobsBucket } =
config.get('backupStore')
export { chunksBucket, globalBlobsBucket, projectBlobsBucket }
function convertKey(key, convertFn) {
if (_.has(persistorConfig, key)) {
_.update(persistorConfig, key, convertFn)
}
}
convertKey('s3SSEC.httpOptions.timeout', s => parseInt(s, 10))
convertKey('s3SSEC.maxRetries', s => parseInt(s, 10))
convertKey('s3SSEC.pathStyle', s => s === 'true')
// array of CA, either inlined or on disk
convertKey('s3SSEC.ca', s =>
JSON.parse(s).map(ca => (ca.startsWith('/') ? fs.readFileSync(ca) : ca))
)
/** @type {() => Promise<string>} */
let getRawRootKeyEncryptionKeys
if ((process.env.NODE_ENV || 'production') === 'production') {
;[persistorConfig.s3SSEC.key, persistorConfig.s3SSEC.secret] = (
await loadFromSecretsManager(
process.env.BACKUP_AWS_CREDENTIALS || '',
'BACKUP_AWS_CREDENTIALS'
)
).split(':')
getRawRootKeyEncryptionKeys = () =>
loadFromSecretsManager(
persistorConfig.keyEncryptionKeys,
'BACKUP_KEY_ENCRYPTION_KEYS'
)
} else {
getRawRootKeyEncryptionKeys = () => persistorConfig.keyEncryptionKeys
}
export const DELETION_ONLY = persistorConfig.keyEncryptionKeys === 'none'
if (DELETION_ONLY) {
  // For the backup-deleter process: it must not encrypt or read data, and
  // deleting does not need a key.
getRawRootKeyEncryptionKeys = () => new Promise(_resolve => {})
}
const PROJECT_FOLDER_REGEX =
/^\d{3}\/\d{3}\/\d{3,}\/|[0-9a-f]{3}\/[0-9a-f]{3}\/[0-9a-f]{18}\/$/
/**
* @param {string} bucketName
* @param {string} path
* @return {string}
*/
export function pathToProjectFolder(bucketName, path) {
  switch (bucketName) {
    case deksBucket:
    case chunksBucket:
    case projectBlobsBucket: {
      const projectFolder = Path.join(...path.split('/').slice(0, 3)) + '/'
      if (!PROJECT_FOLDER_REGEX.test(projectFolder)) {
        throw new OError('invalid project folder', { bucketName, path })
      }
      return projectFolder
    }
    default:
      throw new Error(`${bucketName} does not store per-project files`)
  }
}
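// Illustrative mappings (the paths are made up): for the chunks bucket,
// pathToProjectFolder(chunksBucket, '123/456/789/000000012') returns
// '123/456/789/', while a Mongo-style key such as
// 'abc/def/0123456789abcdef01/aa/bb...' maps to 'abc/def/0123456789abcdef01/'.
// Folders that match neither pattern fail the PROJECT_FOLDER_REGEX check and
// throw.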
/**
* @param {string} name
* @param {string} label
* @return {Promise<string>}
*/
async function loadFromSecretsManager(name, label) {
const client = new SecretManagerServiceClient()
const [version] = await client.accessSecretVersion({ name })
if (!version.payload?.data) throw new Error(`empty secret: ${label}`)
return version.payload.data.toString()
}
async function getRootKeyEncryptionKeys() {
return JSON.parse(await getRawRootKeyEncryptionKeys()).map(
({ key, salt }) => {
return new RootKeyEncryptionKey(
Buffer.from(key, 'base64'),
Buffer.from(salt, 'base64')
)
}
)
}
export const backupPersistor = new PerProjectEncryptedS3Persistor({
...persistorConfig.s3SSEC,
disableMultiPartUpload: true,
dataEncryptionKeyBucketName: deksBucket,
pathToProjectFolder,
getRootKeyEncryptionKeys,
storageClass: {
[deksBucket]: 'STANDARD',
[chunksBucket]: persistorConfig.tieringStorageClass,
[projectBlobsBucket]: persistorConfig.tieringStorageClass,
},
})
export const backupHistoryStore = new HistoryStore(
backupPersistor,
chunksBucket
)

View File

@@ -0,0 +1,216 @@
// @ts-check
import OError from '@overleaf/o-error'
import chunkStore from '../lib/chunk_store/index.js'
import {
backupPersistor,
chunksBucket,
projectBlobsBucket,
} from './backupPersistor.mjs'
import { Blob, Chunk, History } from 'overleaf-editor-core'
import { BlobStore, GLOBAL_BLOBS, makeProjectKey } from './blob_store/index.js'
import blobHash from './blob_hash.js'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import logger from '@overleaf/logger'
import path from 'node:path'
import projectKey from './project_key.js'
import streams from './streams.js'
import objectPersistor from '@overleaf/object-persistor'
import { getEndDateForRPO } from '../../backupVerifier/utils.mjs'
/**
* @typedef {import("@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor.js").CachedPerProjectEncryptedS3Persistor} CachedPerProjectEncryptedS3Persistor
*/
/**
* @param {string} historyId
* @param {string} hash
*/
export async function verifyBlob(historyId, hash) {
return await verifyBlobs(historyId, [hash])
}
/**
*
* @param {string} historyId
* @return {Promise<CachedPerProjectEncryptedS3Persistor>}
*/
async function getProjectPersistor(historyId) {
try {
return await backupPersistor.forProjectRO(
projectBlobsBucket,
makeProjectKey(historyId, '')
)
} catch (err) {
if (err instanceof NotFoundError) {
throw new BackupCorruptedError('dek does not exist', {}, err)
}
throw err
}
}
/**
* @param {string} historyId
* @param {Array<string>} hashes
* @param {CachedPerProjectEncryptedS3Persistor} [projectCache]
*/
export async function verifyBlobs(historyId, hashes, projectCache) {
if (hashes.length === 0) throw new Error('bug: empty hashes')
if (!projectCache) {
projectCache = await getProjectPersistor(historyId)
}
const blobStore = new BlobStore(historyId)
for (const hash of hashes) {
const path = makeProjectKey(historyId, hash)
const blob = await blobStore.getBlob(hash)
if (!blob) throw new Blob.NotFoundError(hash)
let stream
try {
stream = await projectCache.getObjectStream(projectBlobsBucket, path, {
autoGunzip: true,
})
} catch (err) {
if (err instanceof NotFoundError) {
throw new BackupCorruptedMissingBlobError('missing blob', {
path,
hash,
})
}
throw err
}
const backupHash = await blobHash.fromStream(blob.getByteLength(), stream)
if (backupHash !== hash) {
throw new BackupCorruptedInvalidBlobError(
'hash mismatch for backed up blob',
{
path,
hash,
backupHash,
}
)
}
}
}
/**
* @param {string} historyId
* @param {Date} [endTimestamp]
*/
export async function verifyProjectWithErrorContext(
historyId,
endTimestamp = getEndDateForRPO()
) {
try {
await verifyProject(historyId, endTimestamp)
} catch (err) {
// @ts-ignore err is Error instance
throw OError.tag(err, 'verifyProject', { historyId, endTimestamp })
}
}
/**
*
* @param {string} historyId
* @param {number} startVersion
* @param {CachedPerProjectEncryptedS3Persistor} backupPersistorForProject
* @return {Promise<any>}
*/
async function loadChunk(historyId, startVersion, backupPersistorForProject) {
const key = path.join(
projectKey.format(historyId),
projectKey.pad(startVersion)
)
try {
const buf = await streams.gunzipStreamToBuffer(
await backupPersistorForProject.getObjectStream(chunksBucket, key)
)
return JSON.parse(buf.toString('utf-8'))
} catch (err) {
if (err instanceof objectPersistor.Errors.NotFoundError) {
throw new Chunk.NotPersistedError(historyId)
}
if (err instanceof Error) {
throw OError.tag(err, 'Failed to load chunk', { historyId, startVersion })
}
throw err
}
}
/**
* @param {string} historyId
* @param {Date} endTimestamp
*/
export async function verifyProject(historyId, endTimestamp) {
const backend = chunkStore.getBackend(historyId)
const [first, last] = await Promise.all([
backend.getFirstChunkBeforeTimestamp(historyId, endTimestamp),
backend.getLastActiveChunkBeforeTimestamp(historyId, endTimestamp),
])
  const chunksRecordsToVerify = [
    {
      chunkId: first.id,
      chunkLabel: 'first',
      startVersion: first.startVersion,
    },
  ]
  if (first.startVersion !== last.startVersion) {
    chunksRecordsToVerify.push({
      chunkId: last.id,
      chunkLabel: 'last before RPO',
      startVersion: last.startVersion,
    })
  }
const projectCache = await getProjectPersistor(historyId)
const chunks = await Promise.all(
chunksRecordsToVerify.map(async chunk => {
try {
return History.fromRaw(
await loadChunk(historyId, chunk.startVersion, projectCache)
)
} catch (err) {
if (err instanceof Chunk.NotPersistedError) {
throw new BackupRPOViolationChunkNotBackedUpError(
'BackupRPOviolation: chunk not backed up',
chunk
)
}
throw err
}
})
)
const seenBlobs = new Set()
const blobsToVerify = []
for (const chunk of chunks) {
/** @type {Set<string>} */
const chunkBlobs = new Set()
chunk.findBlobHashes(chunkBlobs)
let hasAddedBlobFromThisChunk = false
for (const blobHash of chunkBlobs) {
if (seenBlobs.has(blobHash)) continue // old blob
if (GLOBAL_BLOBS.has(blobHash)) continue // global blob
seenBlobs.add(blobHash)
if (!hasAddedBlobFromThisChunk) {
blobsToVerify.push(blobHash)
hasAddedBlobFromThisChunk = true
}
}
}
if (blobsToVerify.length === 0) {
logger.debug(
{
historyId,
chunksRecordsToVerify: chunksRecordsToVerify.map(c => c.chunkId),
},
'chunks contain no blobs to verify'
)
return
}
await verifyBlobs(historyId, blobsToVerify, projectCache)
}
export class BackupCorruptedError extends OError {}
export class BackupRPOViolationError extends OError {}
export class BackupCorruptedMissingBlobError extends BackupCorruptedError {}
export class BackupCorruptedInvalidBlobError extends BackupCorruptedError {}
export class BackupRPOViolationChunkNotBackedUpError extends OError {}

View File

@@ -0,0 +1,212 @@
const { Binary, ObjectId } = require('mongodb')
const { projects, backedUpBlobs } = require('../mongodb')
const OError = require('@overleaf/o-error')
// List projects with pending backups older than the specified interval
function listPendingBackups(timeIntervalMs = 0, limit = null) {
const cutoffTime = new Date(Date.now() - timeIntervalMs)
const options = {
projection: { 'overleaf.backup.pendingChangeAt': 1 },
sort: { 'overleaf.backup.pendingChangeAt': 1 },
}
// Apply limit if provided
if (limit) {
options.limit = limit
}
const cursor = projects.find(
{
'overleaf.backup.pendingChangeAt': {
$exists: true,
$lt: cutoffTime,
},
},
options
)
return cursor
}
// List projects that have never been backed up and are older than the specified interval
function listUninitializedBackups(timeIntervalMs = 0, limit = null) {
  const cutoffTimeInSeconds = Math.floor((Date.now() - timeIntervalMs) / 1000)
const options = {
projection: { _id: 1 },
sort: { _id: 1 },
}
// Apply limit if provided
if (limit) {
options.limit = limit
}
const cursor = projects.find(
{
'overleaf.backup.lastBackedUpVersion': null,
_id: {
$lt: ObjectId.createFromTime(cutoffTimeInSeconds),
},
},
options
)
return cursor
}
// Retrieve the history ID for a given project without giving direct access to the
// projects collection.
async function getHistoryId(projectId) {
const project = await projects.findOne(
{ _id: new ObjectId(projectId) },
{
projection: {
'overleaf.history.id': 1,
},
}
)
if (!project) {
throw new Error('Project not found')
}
return project.overleaf.history.id
}
async function getBackupStatus(projectId) {
const project = await projects.findOne(
{ _id: new ObjectId(projectId) },
{
projection: {
'overleaf.history': 1,
'overleaf.backup': 1,
},
}
)
if (!project) {
throw new Error('Project not found')
}
return {
backupStatus: project.overleaf.backup,
historyId: `${project.overleaf.history.id}`,
currentEndVersion: project.overleaf.history.currentEndVersion,
currentEndTimestamp: project.overleaf.history.currentEndTimestamp,
}
}
async function setBackupVersion(
projectId,
previousBackedUpVersion,
currentBackedUpVersion,
currentBackedUpAt
) {
// FIXME: include a check to handle race conditions
// to make sure only one process updates the version numbers
const result = await projects.updateOne(
{
_id: new ObjectId(projectId),
'overleaf.backup.lastBackedUpVersion': previousBackedUpVersion,
},
{
$set: {
'overleaf.backup.lastBackedUpVersion': currentBackedUpVersion,
'overleaf.backup.lastBackedUpAt': currentBackedUpAt,
},
}
)
if (result.matchedCount === 0 || result.modifiedCount === 0) {
throw new OError('Failed to update backup version', {
previousBackedUpVersion,
currentBackedUpVersion,
currentBackedUpAt,
result,
})
}
}
async function updateCurrentMetadataIfNotSet(projectId, latestChunkMetadata) {
await projects.updateOne(
{
_id: new ObjectId(projectId),
'overleaf.history.currentEndVersion': { $exists: false },
'overleaf.history.currentEndTimestamp': { $exists: false },
},
{
$set: {
'overleaf.history.currentEndVersion': latestChunkMetadata.endVersion,
'overleaf.history.currentEndTimestamp':
latestChunkMetadata.endTimestamp,
},
}
)
}
/**
* Updates the pending change timestamp for a project's backup status
* @param {string} projectId - The ID of the project to update
* @param {Date} backupStartTime - The timestamp to set for pending changes
* @returns {Promise<void>}
*
* If the project's last backed up version matches the current end version,
* the pending change timestamp is removed. Otherwise, it's set to the provided
* backup start time.
*/
async function updatePendingChangeTimestamp(projectId, backupStartTime) {
await projects.updateOne({ _id: new ObjectId(projectId) }, [
{
$set: {
'overleaf.backup.pendingChangeAt': {
$cond: {
if: {
$eq: [
'$overleaf.backup.lastBackedUpVersion',
'$overleaf.history.currentEndVersion',
],
},
then: '$$REMOVE',
else: backupStartTime,
},
},
},
},
])
}
async function getBackedUpBlobHashes(projectId) {
const result = await backedUpBlobs.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { blobs: 1 } }
)
if (!result) {
return new Set()
}
const hashes = result.blobs.map(b => b.buffer.toString('hex'))
return new Set(hashes)
}
async function unsetBackedUpBlobHashes(projectId, hashes) {
const binaryHashes = hashes.map(h => new Binary(Buffer.from(h, 'hex')))
const result = await backedUpBlobs.findOneAndUpdate(
{ _id: new ObjectId(projectId) },
{
$pullAll: {
blobs: binaryHashes,
},
},
{ returnDocument: 'after' }
)
if (result && result.blobs.length === 0) {
await backedUpBlobs.deleteOne({
_id: new ObjectId(projectId),
blobs: { $size: 0 },
})
}
return result
}
module.exports = {
getHistoryId,
getBackupStatus,
setBackupVersion,
updateCurrentMetadataIfNotSet,
updatePendingChangeTimestamp,
listPendingBackups,
listUninitializedBackups,
getBackedUpBlobHashes,
unsetBackedUpBlobHashes,
}

View File

@@ -0,0 +1,40 @@
'use strict'
const BPromise = require('bluebird')
/**
* @constructor
* @param {BlobStore} blobStore
* @classdesc
* Wrapper for BlobStore that pre-fetches blob metadata to avoid making one
* database call per blob lookup.
*/
function BatchBlobStore(blobStore) {
this.blobStore = blobStore
this.blobs = new Map()
}
/**
* Pre-fetch metadata for the given blob hashes.
*
* @param {Array.<string>} hashes
* @return {Promise}
*/
BatchBlobStore.prototype.preload = function batchBlobStorePreload(hashes) {
return BPromise.each(this.blobStore.getBlobs(hashes), blob => {
this.blobs.set(blob.getHash(), blob)
})
}
/**
* @see BlobStore#getBlob
*/
BatchBlobStore.prototype.getBlob = BPromise.method(
function batchBlobStoreGetBlob(hash) {
const blob = this.blobs.get(hash)
if (blob) return blob
return this.blobStore.getBlob(hash)
}
)
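// Illustrative usage (projectId and the hashes are placeholders; the wrapped
// BlobStore is constructed elsewhere):
//
//   const batchBlobStore = new BatchBlobStore(new BlobStore(projectId))
//   await batchBlobStore.preload([hashA, hashB])
//   const blobA = await batchBlobStore.getBlob(hashA) // served from the cache
//   const blobC = await batchBlobStore.getBlob(hashC) // falls back to the BlobStore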
module.exports = BatchBlobStore

View File

@@ -0,0 +1,80 @@
/** @module */
'use strict'
const BPromise = require('bluebird')
const fs = BPromise.promisifyAll(require('node:fs'))
const crypto = require('node:crypto')
const { pipeline } = require('node:stream')
const assert = require('./assert')
function getGitBlobHeader(byteLength) {
return 'blob ' + byteLength + '\x00'
}
function getBlobHash(byteLength) {
const hash = crypto.createHash('sha1')
hash.setEncoding('hex')
hash.update(getGitBlobHeader(byteLength))
return hash
}
/**
* Compute the git blob hash for a blob from a readable stream of its content.
*
* @function
* @param {number} byteLength
* @param {stream.Readable} stream
* @return {Promise.<string>} hexadecimal SHA-1 hash
*/
exports.fromStream = BPromise.method(
function blobHashFromStream(byteLength, stream) {
assert.integer(byteLength, 'blobHash: bad byteLength')
assert.object(stream, 'blobHash: bad stream')
const hash = getBlobHash(byteLength)
return new BPromise(function (resolve, reject) {
pipeline(stream, hash, function (err) {
if (err) {
reject(err)
} else {
hash.end()
resolve(hash.read())
}
})
})
}
)
/**
* Compute the git blob hash for a blob with the given string content.
*
* @param {string} string
* @return {string} hexadecimal SHA-1 hash
*/
exports.fromString = function blobHashFromString(string) {
assert.string(string, 'blobHash: bad string')
const hash = getBlobHash(Buffer.byteLength(string))
hash.update(string, 'utf8')
hash.end()
return hash.read()
}
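// For example, fromString('') hashes only the header 'blob 0\x00' and returns
// 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391', the same value produced by
// `git hash-object` on an empty file.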
/**
* Compute the git blob hash for the content of a file
*
 * @param {string} pathname
* @return {string} hexadecimal SHA-1 hash
*/
exports.fromFile = function blobHashFromFile(pathname) {
assert.string(pathname, 'blobHash: bad pathname')
function getByteLengthOfFile() {
return fs.statAsync(pathname).then(stat => stat.size)
}
const fromStream = this.fromStream
return getByteLengthOfFile(pathname).then(function (byteLength) {
const stream = fs.createReadStream(pathname)
return fromStream(byteLength, stream)
})
}

View File

@@ -0,0 +1,433 @@
'use strict'
const config = require('config')
const fs = require('node:fs')
const isValidUtf8 = require('utf-8-validate')
const { ReadableString } = require('@overleaf/stream-utils')
const core = require('overleaf-editor-core')
const objectPersistor = require('@overleaf/object-persistor')
const OError = require('@overleaf/o-error')
const Blob = core.Blob
const TextOperation = core.TextOperation
const containsNonBmpChars = core.util.containsNonBmpChars
const assert = require('../assert')
const blobHash = require('../blob_hash')
const mongodb = require('../mongodb')
const persistor = require('../persistor')
const projectKey = require('../project_key')
const streams = require('../streams')
const postgresBackend = require('./postgres')
const mongoBackend = require('./mongo')
const logger = require('@overleaf/logger')
/** @import { Readable } from 'stream' */
const GLOBAL_BLOBS = new Map()
function makeGlobalKey(hash) {
return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
}
function makeProjectKey(projectId, hash) {
return `${projectKey.format(projectId)}/${hash.slice(0, 2)}/${hash.slice(2)}`
}
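// Illustrative key layout (hash is a placeholder): makeGlobalKey('aabbcc...')
// yields 'aa/bb/cc...', while makeProjectKey(projectId, 'aabbcc...') yields
// '<project prefix>/aa/bbcc...', where the project prefix comes from
// projectKey.format(), defined elsewhere.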
async function uploadBlob(projectId, blob, stream, opts = {}) {
const bucket = config.get('blobStore.projectBucket')
const key = makeProjectKey(projectId, blob.getHash())
logger.debug({ projectId, blob }, 'uploadBlob started')
try {
await persistor.sendStream(bucket, key, stream, {
contentType: 'application/octet-stream',
...opts,
})
} finally {
logger.debug({ projectId, blob }, 'uploadBlob finished')
}
}
function getBlobLocation(projectId, hash) {
if (GLOBAL_BLOBS.has(hash)) {
return {
bucket: config.get('blobStore.globalBucket'),
key: makeGlobalKey(hash),
}
} else {
return {
bucket: config.get('blobStore.projectBucket'),
key: makeProjectKey(projectId, hash),
}
}
}
/**
* Returns the appropriate backend for the given project id
*
* Numeric ids use the Postgres backend.
* Strings of 24 characters use the Mongo backend.
*/
function getBackend(projectId) {
if (assert.POSTGRES_ID_REGEXP.test(projectId)) {
return postgresBackend
} else if (assert.MONGO_ID_REGEXP.test(projectId)) {
return mongoBackend
} else {
throw new OError('bad project id', { projectId })
}
}
async function makeBlobForFile(pathname) {
const { size: byteLength } = await fs.promises.stat(pathname)
const hash = await blobHash.fromStream(
byteLength,
fs.createReadStream(pathname)
)
return new Blob(hash, byteLength)
}
async function getStringLengthOfFile(byteLength, pathname) {
// We have to read the file into memory to get its UTF-8 length, so don't
// bother for files that are too large for us to edit anyway.
if (byteLength > Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
return null
}
// We need to check if the file contains nonBmp or null characters
let data = await fs.promises.readFile(pathname)
if (!isValidUtf8(data)) return null
data = data.toString()
if (data.length > TextOperation.MAX_STRING_LENGTH) return null
if (containsNonBmpChars(data)) return null
if (data.indexOf('\x00') !== -1) return null
return data.length
}
async function deleteBlobsInBucket(projectId) {
const bucket = config.get('blobStore.projectBucket')
const prefix = `${projectKey.format(projectId)}/`
logger.debug({ projectId }, 'deleteBlobsInBucket started')
try {
await persistor.deleteDirectory(bucket, prefix)
} finally {
logger.debug({ projectId }, 'deleteBlobsInBucket finished')
}
}
async function loadGlobalBlobs() {
const blobs = await mongodb.globalBlobs.find()
for await (const blob of blobs) {
GLOBAL_BLOBS.set(blob._id, {
blob: new Blob(blob._id, blob.byteLength, blob.stringLength),
demoted: Boolean(blob.demoted),
})
}
}
/**
* Return metadata for all blobs in the given project
* @param {Array<string|number>} projectIds
* @return {Promise<{nBlobs:number, blobs:Map<string,Array<core.Blob>>}>}
*/
async function getProjectBlobsBatch(projectIds) {
const mongoProjects = []
const postgresProjects = []
for (const projectId of projectIds) {
if (typeof projectId === 'number') {
postgresProjects.push(projectId)
} else {
mongoProjects.push(projectId)
}
}
const [
{ nBlobs: nBlobsPostgres, blobs: blobsPostgres },
{ nBlobs: nBlobsMongo, blobs: blobsMongo },
] = await Promise.all([
postgresBackend.getProjectBlobsBatch(postgresProjects),
mongoBackend.getProjectBlobsBatch(mongoProjects),
])
for (const [id, blobs] of blobsPostgres.entries()) {
blobsMongo.set(id.toString(), blobs)
}
return { nBlobs: nBlobsPostgres + nBlobsMongo, blobs: blobsMongo }
}
/**
* @classdesc
* Fetch and store the content of files using content-addressable hashing. The
* blob store manages both content and metadata (byte and UTF-8 length) for
* blobs.
*/
class BlobStore {
/**
* @constructor
* @param {string} projectId the project for which we'd like to find blobs
*/
constructor(projectId) {
assert.projectId(projectId)
this.projectId = projectId
this.backend = getBackend(this.projectId)
}
/**
* Set up the initial data structure for a given project
*/
async initialize() {
await this.backend.initialize(this.projectId)
}
/**
* Write a blob, if one does not already exist, with the given UTF-8 encoded
* string content.
*
* @param {string} string
* @return {Promise.<core.Blob>}
*/
async putString(string) {
assert.string(string, 'bad string')
const hash = blobHash.fromString(string)
const existingBlob = await this._findBlobBeforeInsert(hash)
if (existingBlob != null) {
return existingBlob
}
const newBlob = new Blob(hash, Buffer.byteLength(string), string.length)
// Note: the ReadableString is to work around a bug in the AWS SDK: it won't
// allow Body to be blank.
await uploadBlob(this.projectId, newBlob, new ReadableString(string))
await this.backend.insertBlob(this.projectId, newBlob)
return newBlob
}
/**
* Write a blob, if one does not already exist, with the given file (usually a
* temporary file).
*
* @param {string} pathname
* @return {Promise<core.Blob>}
*/
async putFile(pathname) {
assert.string(pathname, 'bad pathname')
const newBlob = await makeBlobForFile(pathname)
const existingBlob = await this._findBlobBeforeInsert(newBlob.getHash())
if (existingBlob != null) {
return existingBlob
}
const stringLength = await getStringLengthOfFile(
newBlob.getByteLength(),
pathname
)
newBlob.setStringLength(stringLength)
await this.putBlob(pathname, newBlob)
return newBlob
}
/**
   * Write a new blob. The stringLength must already have been set, and the
   * caller should have checked that the blob does not exist yet. Consider
   * using {@link putFile} instead of this lower-level method.
   *
   * @param {string} pathname
   * @param {core.Blob} finalizedBlob
   * @return {Promise<void>}
   */
  async putBlob(pathname, finalizedBlob) {
    await uploadBlob(
      this.projectId,
      finalizedBlob,
      fs.createReadStream(pathname)
    )
    await this.backend.insertBlob(this.projectId, finalizedBlob)
}
/**
* Stores an object as a JSON string in a blob.
*
* @param {object} obj
* @returns {Promise.<core.Blob>}
*/
async putObject(obj) {
assert.object(obj, 'bad object')
const string = JSON.stringify(obj)
return await this.putString(string)
}
/**
*
* Fetch a blob's content by its hash as a UTF-8 encoded string.
*
* @param {string} hash hexadecimal SHA-1 hash
* @return {Promise.<string>} promise for the content of the file
*/
async getString(hash) {
assert.blobHash(hash, 'bad hash')
const projectId = this.projectId
logger.debug({ projectId, hash }, 'getString started')
try {
const stream = await this.getStream(hash)
const buffer = await streams.readStreamToBuffer(stream)
return buffer.toString()
} finally {
logger.debug({ projectId, hash }, 'getString finished')
}
}
/**
* Fetch a JSON encoded blob by its hash and deserialize it.
*
* @template [T=unknown]
* @param {string} hash hexadecimal SHA-1 hash
* @return {Promise.<T>} promise for the content of the file
*/
async getObject(hash) {
assert.blobHash(hash, 'bad hash')
const projectId = this.projectId
logger.debug({ projectId, hash }, 'getObject started')
try {
const jsonString = await this.getString(hash)
const object = JSON.parse(jsonString)
return object
} catch (error) {
      // Maybe this blob is gzipped. Try to gunzip it.
// TODO: Remove once we've ensured this is not reached
const stream = await this.getStream(hash)
const buffer = await streams.gunzipStreamToBuffer(stream)
const object = JSON.parse(buffer.toString())
logger.warn('getObject: Gzipped object in BlobStore')
return object
} finally {
logger.debug({ projectId, hash }, 'getObject finished')
}
}
/**
* Fetch a blob by its hash as a stream.
*
* Note that, according to the AWS SDK docs, this does not retry after initial
* failure, so the caller must be prepared to retry on errors, if appropriate.
*
* @param {string} hash hexadecimal SHA-1 hash
* @param {Object} opts
* @return {Promise.<Readable>} a stream to read the file
*/
async getStream(hash, opts = {}) {
assert.blobHash(hash, 'bad hash')
const { bucket, key } = getBlobLocation(this.projectId, hash)
try {
const stream = await persistor.getObjectStream(bucket, key, opts)
return stream
} catch (err) {
if (err instanceof objectPersistor.Errors.NotFoundError) {
throw new Blob.NotFoundError(hash)
}
throw err
}
}
/**
* Read a blob metadata record by hexadecimal hash.
*
* @param {string} hash hexadecimal SHA-1 hash
* @return {Promise<core.Blob | null>}
*/
async getBlob(hash) {
assert.blobHash(hash, 'bad hash')
const globalBlob = GLOBAL_BLOBS.get(hash)
if (globalBlob != null) {
return globalBlob.blob
}
const blob = await this.backend.findBlob(this.projectId, hash)
return blob
}
async getBlobs(hashes) {
assert.array(hashes, 'bad hashes')
const nonGlobalHashes = []
const blobs = []
for (const hash of hashes) {
const globalBlob = GLOBAL_BLOBS.get(hash)
if (globalBlob != null) {
blobs.push(globalBlob.blob)
} else {
nonGlobalHashes.push(hash)
}
}
if (nonGlobalHashes.length === 0) {
return blobs // to avoid unnecessary database lookup
}
const projectBlobs = await this.backend.findBlobs(
this.projectId,
nonGlobalHashes
)
blobs.push(...projectBlobs)
return blobs
}
/**
* Retrieve all blobs associated with the project.
* @returns {Promise<core.Blob[]>} A promise that resolves to an array of blobs.
*/
async getProjectBlobs() {
const projectBlobs = await this.backend.getProjectBlobs(this.projectId)
return projectBlobs
}
/**
* Delete all blobs that belong to the project.
*/
async deleteBlobs() {
await Promise.all([
this.backend.deleteBlobs(this.projectId),
deleteBlobsInBucket(this.projectId),
])
}
async _findBlobBeforeInsert(hash) {
const globalBlob = GLOBAL_BLOBS.get(hash)
if (globalBlob != null && !globalBlob.demoted) {
return globalBlob.blob
}
const blob = await this.backend.findBlob(this.projectId, hash)
return blob
}
/**
* Copy an existing sourceBlob in this project to a target project.
* @param {Blob} sourceBlob
* @param {string} targetProjectId
* @return {Promise<void>}
*/
async copyBlob(sourceBlob, targetProjectId) {
assert.instance(sourceBlob, Blob, 'bad sourceBlob')
assert.projectId(targetProjectId, 'bad targetProjectId')
const hash = sourceBlob.getHash()
const sourceProjectId = this.projectId
const { bucket, key: sourceKey } = getBlobLocation(sourceProjectId, hash)
const destKey = makeProjectKey(targetProjectId, hash)
const targetBackend = getBackend(targetProjectId)
logger.debug({ sourceProjectId, targetProjectId, hash }, 'copyBlob started')
try {
await persistor.copyObject(bucket, sourceKey, destKey)
await targetBackend.insertBlob(targetProjectId, sourceBlob)
} finally {
logger.debug(
{ sourceProjectId, targetProjectId, hash },
'copyBlob finished'
)
}
}
}
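// Illustrative round trip (projectId is a placeholder):
//
//   const blobStore = new BlobStore(projectId)
//   await blobStore.initialize()
//   const blob = await blobStore.putString('hello world')
//   const text = await blobStore.getString(blob.getHash()) // 'hello world'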
module.exports = {
BlobStore,
getProjectBlobsBatch,
loadGlobalBlobs,
makeProjectKey,
makeBlobForFile,
getStringLengthOfFile,
GLOBAL_BLOBS,
}

View File

@@ -0,0 +1,437 @@
// @ts-check
/**
* Mongo backend for the blob store.
*
* Blobs are stored in the projectHistoryBlobs collection. Each project has a
* document in that collection. That document has a "blobs" subdocument whose
* fields are buckets of blobs. The key of a bucket is the first three hex
* digits of the blob hash. The value of the bucket is an array of blobs that
* match the key.
*
* Buckets have a maximum capacity of 8 blobs. When that capacity is exceeded,
* blobs are stored in a secondary collection: the projectHistoryShardedBlobs
* collection. This collection shards blobs between 16 documents per project.
* The shard key is the first hex digit of the hash. The documents are also
* organized in buckets, but the bucket key is made of hex digits 2, 3 and 4.
*/
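// Illustrative layout (hash and project id are placeholders): a blob whose
// hash starts with 'abc1' lives in bucket 'blobs.abc' of the project's
// projectHistoryBlobs document. Once that bucket holds 8 blobs, further
// records for the same bucket go to projectHistoryShardedBlobs under shard
// 'a', i.e. _id = Binary(<24-char project id hex> + '0a'), with bucket path
// 'blobs.bc1'.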
const { Blob } = require('overleaf-editor-core')
const { ObjectId, Binary, MongoError, ReadPreference } = require('mongodb')
const assert = require('../assert')
const mongodb = require('../mongodb')
const MAX_BLOBS_IN_BUCKET = 8
const DUPLICATE_KEY_ERROR_CODE = 11000
/**
* @typedef {import('mongodb').ReadPreferenceLike} ReadPreferenceLike
*/
/**
* Set up the data structures for a given project.
* @param {string} projectId
*/
async function initialize(projectId) {
assert.mongoId(projectId, 'bad projectId')
try {
await mongodb.blobs.insertOne({
_id: new ObjectId(projectId),
blobs: {},
})
} catch (err) {
if (err instanceof MongoError && err.code === DUPLICATE_KEY_ERROR_CODE) {
return // ignore already initialized case
}
throw err
}
}
/**
* Return blob metadata for the given project and hash.
* @param {string} projectId
* @param {string} hash
* @return {Promise<Blob | null>}
*/
async function findBlob(projectId, hash) {
assert.mongoId(projectId, 'bad projectId')
assert.blobHash(hash, 'bad hash')
const bucket = getBucket(hash)
const result = await mongodb.blobs.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { _id: 0, bucket: `$${bucket}` } }
)
if (result?.bucket == null) {
return null
}
const record = result.bucket.find(blob => blob.h.toString('hex') === hash)
if (record == null) {
if (result.bucket.length >= MAX_BLOBS_IN_BUCKET) {
return await findBlobSharded(projectId, hash)
} else {
return null
}
}
return recordToBlob(record)
}
/**
* Search in the sharded collection for blob metadata
* @param {string} projectId
* @param {string} hash
* @return {Promise<Blob | null>}
*/
async function findBlobSharded(projectId, hash) {
const [shard, bucket] = getShardedBucket(hash)
const id = makeShardedId(projectId, shard)
const result = await mongodb.shardedBlobs.findOne(
{ _id: id },
{ projection: { _id: 0, blobs: `$${bucket}` } }
)
if (result?.blobs == null) {
return null
}
const record = result.blobs.find(blob => blob.h.toString('hex') === hash)
if (!record) return null
return recordToBlob(record)
}
/**
* Read multiple blob metadata records by hexadecimal hashes.
* @param {string} projectId
* @param {Array<string>} hashes
* @return {Promise<Array<Blob>>}
*/
async function findBlobs(projectId, hashes) {
assert.mongoId(projectId, 'bad projectId')
assert.array(hashes, 'bad hashes: not array')
hashes.forEach(function (hash) {
assert.blobHash(hash, 'bad hash')
})
// Build a set of unique buckets
const buckets = new Set(hashes.map(getBucket))
// Get buckets from Mongo
const projection = { _id: 0 }
for (const bucket of buckets) {
projection[bucket] = 1
}
const result = await mongodb.blobs.findOne(
{ _id: new ObjectId(projectId) },
{ projection }
)
if (result?.blobs == null) {
return []
}
// Build blobs from the query results
const hashSet = new Set(hashes)
const blobs = []
for (const bucket of Object.values(result.blobs)) {
for (const record of bucket) {
const hash = record.h.toString('hex')
if (hashSet.has(hash)) {
blobs.push(recordToBlob(record))
hashSet.delete(hash)
}
}
}
// If we haven't found all the blobs, look in the sharded collection
if (hashSet.size > 0) {
const shardedBlobs = await findBlobsSharded(projectId, hashSet)
blobs.push(...shardedBlobs)
}
return blobs
}
/**
* Search in the sharded collection for blob metadata.
* @param {string} projectId
* @param {Set<string>} hashSet
* @return {Promise<Array<Blob>>}
*/
async function findBlobsSharded(projectId, hashSet) {
// Build a map of buckets by shard key
const bucketsByShard = new Map()
for (const hash of hashSet) {
const [shard, bucket] = getShardedBucket(hash)
let buckets = bucketsByShard.get(shard)
if (buckets == null) {
buckets = new Set()
bucketsByShard.set(shard, buckets)
}
buckets.add(bucket)
}
// Make parallel requests to the shards that might contain the hashes we want
const requests = []
for (const [shard, buckets] of bucketsByShard.entries()) {
const id = makeShardedId(projectId, shard)
const projection = { _id: 0 }
for (const bucket of buckets) {
projection[bucket] = 1
}
const request = mongodb.shardedBlobs.findOne({ _id: id }, { projection })
requests.push(request)
}
const results = await Promise.all(requests)
// Build blobs from the query results
const blobs = []
for (const result of results) {
if (result?.blobs == null) {
continue
}
for (const bucket of Object.values(result.blobs)) {
for (const record of bucket) {
const hash = record.h.toString('hex')
if (hashSet.has(hash)) {
blobs.push(recordToBlob(record))
}
}
}
}
return blobs
}
/**
* Return metadata for all blobs in the given project
*/
async function getProjectBlobs(projectId) {
assert.mongoId(projectId, 'bad projectId')
const result = await mongodb.blobs.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { _id: 0 } }
)
if (!result) {
return []
}
// Build blobs from the query results
const blobs = []
for (const bucket of Object.values(result.blobs)) {
for (const record of bucket) {
blobs.push(recordToBlob(record))
}
}
// Look for all possible sharded blobs
const minShardedId = makeShardedId(projectId, '0')
const maxShardedId = makeShardedId(projectId, 'f')
// @ts-ignore We are using a custom _id here.
const shardedRecords = mongodb.shardedBlobs.find(
{
_id: { $gte: minShardedId, $lte: maxShardedId },
},
{ projection: { _id: 0 } }
)
for await (const shardedRecord of shardedRecords) {
if (shardedRecord.blobs == null) {
continue
}
for (const bucket of Object.values(shardedRecord.blobs)) {
for (const record of bucket) {
blobs.push(recordToBlob(record))
}
}
}
return blobs
}
/**
* Return metadata for all blobs in the given project
* @param {Array<string>} projectIds
* @return {Promise<{ nBlobs: number, blobs: Map<string, Array<Blob>> }>}
*/
async function getProjectBlobsBatch(projectIds) {
for (const project of projectIds) {
assert.mongoId(project, 'bad projectId')
}
let nBlobs = 0
const blobs = new Map()
if (projectIds.length === 0) return { nBlobs, blobs }
// blobs
{
const cursor = await mongodb.blobs.find(
{ _id: { $in: projectIds.map(projectId => new ObjectId(projectId)) } },
{ readPreference: ReadPreference.secondaryPreferred }
)
for await (const record of cursor) {
const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
blobs.set(record._id.toString(), projectBlobs)
nBlobs += projectBlobs.length
}
}
// sharded blobs
{
// @ts-ignore We are using a custom _id here.
const cursor = await mongodb.shardedBlobs.find(
{
_id: {
$gte: makeShardedId(projectIds[0], '0'),
$lte: makeShardedId(projectIds[projectIds.length - 1], 'f'),
},
},
{ readPreference: ReadPreference.secondaryPreferred }
)
for await (const record of cursor) {
const recordIdHex = record._id.toString('hex')
const recordProjectId = recordIdHex.slice(0, 24)
const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
const found = blobs.get(recordProjectId)
if (found) {
found.push(...projectBlobs)
} else {
blobs.set(recordProjectId, projectBlobs)
}
nBlobs += projectBlobs.length
}
}
return { nBlobs, blobs }
}
/**
* Add a blob's metadata to the blobs collection after it has been uploaded.
* @param {string} projectId
* @param {Blob} blob
*/
async function insertBlob(projectId, blob) {
assert.mongoId(projectId, 'bad projectId')
const hash = blob.getHash()
const bucket = getBucket(hash)
const record = blobToRecord(blob)
const result = await mongodb.blobs.updateOne(
{
_id: new ObjectId(projectId),
$expr: {
$lt: [{ $size: { $ifNull: [`$${bucket}`, []] } }, MAX_BLOBS_IN_BUCKET],
},
},
{
$addToSet: { [bucket]: record },
}
)
if (result.matchedCount === 0) {
await insertRecordSharded(projectId, hash, record)
}
}
/**
* Add a blob's metadata to the sharded blobs collection.
* @param {string} projectId
* @param {string} hash
* @param {Record} record
* @return {Promise<void>}
*/
async function insertRecordSharded(projectId, hash, record) {
const [shard, bucket] = getShardedBucket(hash)
const id = makeShardedId(projectId, shard)
await mongodb.shardedBlobs.updateOne(
{ _id: id },
{ $addToSet: { [bucket]: record } },
{ upsert: true }
)
}
/**
* Delete all blobs for a given project.
* @param {string} projectId
*/
async function deleteBlobs(projectId) {
assert.mongoId(projectId, 'bad projectId')
await mongodb.blobs.deleteOne({ _id: new ObjectId(projectId) })
const minShardedId = makeShardedId(projectId, '0')
const maxShardedId = makeShardedId(projectId, 'f')
await mongodb.shardedBlobs.deleteMany({
// @ts-ignore We are using a custom _id here.
_id: { $gte: minShardedId, $lte: maxShardedId },
})
}
/**
* Return the Mongo path to the bucket for the given hash.
* @param {string} hash
* @return {string}
*/
function getBucket(hash) {
return `blobs.${hash.slice(0, 3)}`
}
/**
* Return the shard key and Mongo path to the bucket for the given hash in the
* sharded collection.
* @param {string} hash
* @return {[string, string]}
*/
function getShardedBucket(hash) {
const shard = hash.slice(0, 1)
const bucket = `blobs.${hash.slice(1, 4)}`
return [shard, bucket]
}
/**
* Create an _id key for the sharded collection.
* @param {string} projectId
* @param {string} shard
* @return {Binary}
*/
function makeShardedId(projectId, shard) {
return new Binary(Buffer.from(`${projectId}0${shard}`, 'hex'))
}
/**
* @typedef {Object} Record
* @property {Binary} h
* @property {number} b
* @property {number} [s]
*/
/**
* Return the Mongo record for the given blob.
* @param {Blob} blob
* @return {Record}
*/
function blobToRecord(blob) {
const hash = blob.getHash()
const byteLength = blob.getByteLength()
const stringLength = blob.getStringLength()
return {
h: new Binary(Buffer.from(hash, 'hex')),
b: byteLength,
s: stringLength,
}
}
/**
* Create a blob from the given Mongo record.
* @param {Record} record
* @return {Blob}
*/
function recordToBlob(record) {
return new Blob(record.h.toString('hex'), record.b, record.s)
}
module.exports = {
initialize,
findBlob,
findBlobs,
getProjectBlobs,
getProjectBlobsBatch,
insertBlob,
deleteBlobs,
}

View File

@@ -0,0 +1,161 @@
const { Blob } = require('overleaf-editor-core')
const assert = require('../assert')
const knex = require('../knex')
/**
* Set up the initial data structures for a project
*/
async function initialize(projectId) {
// Nothing to do for Postgres
}
/**
* Return blob metadata for the given project and hash
*/
async function findBlob(projectId, hash) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
assert.blobHash(hash, 'bad hash')
const binaryHash = hashToBuffer(hash)
const record = await knex('project_blobs')
.select('hash_bytes', 'byte_length', 'string_length')
.where({
project_id: projectId,
hash_bytes: binaryHash,
})
.first()
return recordToBlob(record)
}
/**
* Read multiple blob metadata records by hexadecimal hashes.
*
* @param {Array.<string>} hashes hexadecimal SHA-1 hashes
* @return {Promise.<Array.<Blob?>>} no guarantee on order
*/
async function findBlobs(projectId, hashes) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
assert.array(hashes, 'bad hashes: not array')
hashes.forEach(function (hash) {
assert.blobHash(hash, 'bad hash')
})
const binaryHashes = hashes.map(hashToBuffer)
const records = await knex('project_blobs')
.select('hash_bytes', 'byte_length', 'string_length')
.where('project_id', projectId)
.whereIn('hash_bytes', binaryHashes)
const blobs = records.map(recordToBlob)
return blobs
}
/**
* Return metadata for all blobs in the given project
*/
async function getProjectBlobs(projectId) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
const records = await knex('project_blobs')
.select('hash_bytes', 'byte_length', 'string_length')
.where({
project_id: projectId,
})
const blobs = records.map(recordToBlob)
return blobs
}
/**
* Return metadata for all blobs in the given project
* @param {Array<number>} projectIds
* @return {Promise<{ nBlobs: number, blobs: Map<number, Array<Blob>> }>}
*/
async function getProjectBlobsBatch(projectIds) {
for (const projectId of projectIds) {
assert.integer(projectId, 'bad projectId')
}
let nBlobs = 0
const blobs = new Map()
if (projectIds.length === 0) return { nBlobs, blobs }
const cursor = knex('project_blobs')
.select('project_id', 'hash_bytes', 'byte_length', 'string_length')
.whereIn('project_id', projectIds)
.stream()
for await (const record of cursor) {
const found = blobs.get(record.project_id)
if (found) {
found.push(recordToBlob(record))
} else {
blobs.set(record.project_id, [recordToBlob(record)])
}
nBlobs++
}
return { nBlobs, blobs }
}
/**
* Add a blob's metadata to the blobs table after it has been uploaded.
*/
async function insertBlob(projectId, blob) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
await knex('project_blobs')
.insert(blobToRecord(projectId, blob))
.onConflict(['project_id', 'hash_bytes'])
.ignore()
}
/**
* Deletes all blobs for a given project
*/
async function deleteBlobs(projectId) {
assert.postgresId(projectId, 'bad projectId')
projectId = parseInt(projectId, 10)
await knex('project_blobs').where('project_id', projectId).delete()
}
function blobToRecord(projectId, blob) {
return {
project_id: projectId,
hash_bytes: hashToBuffer(blob.hash),
byte_length: blob.getByteLength(),
string_length: blob.getStringLength(),
}
}
function recordToBlob(record) {
if (!record) return
return new Blob(
hashFromBuffer(record.hash_bytes),
record.byte_length,
record.string_length
)
}
function hashToBuffer(hash) {
if (!hash) return
return Buffer.from(hash, 'hex')
}
function hashFromBuffer(buffer) {
if (!buffer) return
return buffer.toString('hex')
}
module.exports = {
initialize,
findBlob,
findBlobs,
getProjectBlobs,
getProjectBlobsBatch,
insertBlob,
deleteBlobs,
}

View File

@@ -0,0 +1,40 @@
'use strict'
/**
* @module storage/lib/chunk_buffer
*/
const chunkStore = require('../chunk_store')
const redisBackend = require('../chunk_store/redis')
const metrics = require('@overleaf/metrics')
/**
* Load the latest Chunk stored for a project, including blob metadata.
*
* @param {string} projectId
* @return {Promise.<Chunk>}
*/
async function loadLatest(projectId) {
const cachedChunk = await redisBackend.getCurrentChunk(projectId)
const chunkRecord = await chunkStore.loadLatestRaw(projectId)
const cachedChunkIsValid = redisBackend.checkCacheValidityWithMetadata(
cachedChunk,
chunkRecord
)
if (cachedChunkIsValid) {
metrics.inc('chunk_buffer.loadLatest', 1, {
status: 'cache-hit',
})
return cachedChunk
} else {
metrics.inc('chunk_buffer.loadLatest', 1, {
status: 'cache-miss',
})
const chunk = await chunkStore.loadLatest(projectId)
await redisBackend.setCurrentChunk(projectId, chunk)
return chunk
}
}
module.exports = {
loadLatest,
}

View File

@@ -0,0 +1,7 @@
const OError = require('@overleaf/o-error')
class ChunkVersionConflictError extends OError {}
module.exports = {
ChunkVersionConflictError,
}

View File

@@ -0,0 +1,447 @@
// @ts-check
'use strict'
/**
* Manage {@link Chunk} and {@link History} storage.
*
* For storage, chunks are immutable. If we want to update a project with new
* changes, we create a new chunk record and History object and delete the old
* ones. If we compact a project's history, we similarly destroy the old chunk
* (or chunks) and replace them with a new one. This is helpful when using S3,
* because it guarantees only eventual consistency for updates but provides
* stronger consistency guarantees for object creation.
*
* When a chunk record in the database is removed, we save its ID for later
* in the `old_chunks` table, rather than deleting it immediately. This lets us
* use batch deletion to reduce the number of delete requests to S3.
*
* The chunk store also caches data about which blobs are referenced by each
* chunk, which allows us to find unused blobs without loading all of the data
* for all projects from S3. Whenever we create a chunk, we also insert records
* into the `chunk_blobs` table, to help with this bookkeeping.
*/
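// Illustrative lifecycle, assuming this module is required as `chunkStore`
// (oldEndVersion and the construction of newChunk are placeholders):
//
//   const projectId = await chunkStore.initializeProject() // stores chunk 0
//   const chunk = await chunkStore.loadLatest(projectId)
//   // ...append changes to build newChunk...
//   await chunkStore.update(projectId, oldEndVersion, newChunk)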
const config = require('config')
const OError = require('@overleaf/o-error')
const { Chunk, History, Snapshot } = require('overleaf-editor-core')
const assert = require('../assert')
const BatchBlobStore = require('../batch_blob_store')
const { BlobStore } = require('../blob_store')
const { historyStore } = require('../history_store')
const mongoBackend = require('./mongo')
const postgresBackend = require('./postgres')
const { ChunkVersionConflictError } = require('./errors')
const DEFAULT_DELETE_BATCH_SIZE = parseInt(config.get('maxDeleteKeys'), 10)
const DEFAULT_DELETE_TIMEOUT_SECS = 3000 // 50 minutes
const DEFAULT_DELETE_MIN_AGE_SECS = 86400 // 1 day
/**
* Create the initial chunk for a project.
*/
async function initializeProject(projectId, snapshot) {
if (projectId != null) {
assert.projectId(projectId, 'bad projectId')
} else {
projectId = await postgresBackend.generateProjectId()
}
if (snapshot != null) {
assert.instance(snapshot, Snapshot, 'bad snapshot')
} else {
snapshot = new Snapshot()
}
const blobStore = new BlobStore(projectId)
await blobStore.initialize()
const backend = getBackend(projectId)
const chunkRecord = await backend.getLatestChunk(projectId)
if (chunkRecord != null) {
throw new AlreadyInitialized(projectId)
}
const history = new History(snapshot, [])
const chunk = new Chunk(history, 0)
await create(projectId, chunk)
return projectId
}
/**
* Load the blobs referenced in the given history
*/
async function lazyLoadHistoryFiles(history, batchBlobStore) {
const blobHashes = new Set()
history.findBlobHashes(blobHashes)
await batchBlobStore.preload(Array.from(blobHashes))
await history.loadFiles('lazy', batchBlobStore)
}
/**
* Load the latest Chunk stored for a project, including blob metadata.
*
* @param {string} projectId
* @param {Object} [opts]
* @param {boolean} [opts.readOnly]
* @return {Promise<{id: string, startVersion: number, endVersion: number, endTimestamp: Date}>}
*/
async function loadLatestRaw(projectId, opts) {
assert.projectId(projectId, 'bad projectId')
const backend = getBackend(projectId)
const chunkRecord = await backend.getLatestChunk(projectId, opts)
if (chunkRecord == null) {
throw new Chunk.NotFoundError(projectId)
}
return chunkRecord
}
/**
* Load the latest Chunk stored for a project, including blob metadata.
*
* @param {string} projectId
* @return {Promise.<Chunk>}
*/
async function loadLatest(projectId) {
const chunkRecord = await loadLatestRaw(projectId)
const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
const history = History.fromRaw(rawHistory)
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
await lazyLoadHistoryFiles(history, batchBlobStore)
return new Chunk(history, chunkRecord.startVersion)
}
/**
 * Load the chunk that contains the given version, including blob metadata.
*/
async function loadAtVersion(projectId, version) {
assert.projectId(projectId, 'bad projectId')
assert.integer(version, 'bad version')
const backend = getBackend(projectId)
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
const chunkRecord = await backend.getChunkForVersion(projectId, version)
const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
const history = History.fromRaw(rawHistory)
await lazyLoadHistoryFiles(history, batchBlobStore)
return new Chunk(history, chunkRecord.endVersion - history.countChanges())
}
/**
* Load the chunk that contains the version that was current at the given
* timestamp, including blob metadata.
*/
async function loadAtTimestamp(projectId, timestamp) {
assert.projectId(projectId, 'bad projectId')
assert.date(timestamp, 'bad timestamp')
const backend = getBackend(projectId)
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
const chunkRecord = await backend.getChunkForTimestamp(projectId, timestamp)
const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
const history = History.fromRaw(rawHistory)
await lazyLoadHistoryFiles(history, batchBlobStore)
return new Chunk(history, chunkRecord.endVersion - history.countChanges())
}
/**
* Store the chunk and insert corresponding records in the database.
*
* @param {string} projectId
* @param {Chunk} chunk
* @param {Date} [earliestChangeTimestamp]
*/
async function create(projectId, chunk, earliestChangeTimestamp) {
assert.projectId(projectId, 'bad projectId')
assert.instance(chunk, Chunk, 'bad chunk')
assert.maybe.date(earliestChangeTimestamp, 'bad timestamp')
const backend = getBackend(projectId)
const chunkStart = chunk.getStartVersion()
const chunkId = await uploadChunk(projectId, chunk)
const opts = {}
if (chunkStart > 0) {
opts.oldChunkId = await getChunkIdForVersion(projectId, chunkStart - 1)
}
if (earliestChangeTimestamp != null) {
opts.earliestChangeTimestamp = earliestChangeTimestamp
}
await backend.confirmCreate(projectId, chunk, chunkId, opts)
}
/**
* Upload the given chunk to object storage.
*
* This is used by the create and update methods.
*/
async function uploadChunk(projectId, chunk) {
const backend = getBackend(projectId)
const blobStore = new BlobStore(projectId)
const historyStoreConcurrency = parseInt(
config.get('chunkStore.historyStoreConcurrency'),
10
)
const rawHistory = await chunk
.getHistory()
.store(blobStore, historyStoreConcurrency)
const chunkId = await backend.insertPendingChunk(projectId, chunk)
await historyStore.storeRaw(projectId, chunkId, rawHistory)
return chunkId
}
/**
* Extend the project's history by replacing the latest chunk with a new
* chunk.
*
* @param {string} projectId
* @param {number} oldEndVersion
* @param {Chunk} newChunk
* @param {Date} [earliestChangeTimestamp]
* @return {Promise}
*/
async function update(
projectId,
oldEndVersion,
newChunk,
earliestChangeTimestamp
) {
assert.projectId(projectId, 'bad projectId')
assert.integer(oldEndVersion, 'bad oldEndVersion')
assert.instance(newChunk, Chunk, 'bad newChunk')
assert.maybe.date(earliestChangeTimestamp, 'bad timestamp')
const backend = getBackend(projectId)
const oldChunkId = await getChunkIdForVersion(projectId, oldEndVersion)
const newChunkId = await uploadChunk(projectId, newChunk)
const opts = {}
if (earliestChangeTimestamp != null) {
opts.earliestChangeTimestamp = earliestChangeTimestamp
}
await backend.confirmUpdate(projectId, oldChunkId, newChunk, newChunkId, opts)
}
/**
* Find the chunk ID for a given version of a project.
*
* @param {string} projectId
* @param {number} version
* @return {Promise.<string>}
*/
async function getChunkIdForVersion(projectId, version) {
const backend = getBackend(projectId)
const chunkRecord = await backend.getChunkForVersion(projectId, version)
return chunkRecord.id
}
/**
* Find the chunk metadata for a given version of a project.
*
* @param {string} projectId
* @param {number} version
* @return {Promise.<{id: string|number, startVersion: number, endVersion: number}>}
*/
async function getChunkMetadataForVersion(projectId, version) {
const backend = getBackend(projectId)
const chunkRecord = await backend.getChunkForVersion(projectId, version)
return chunkRecord
}
/**
* Get all of a project's chunk ids
*/
async function getProjectChunkIds(projectId) {
const backend = getBackend(projectId)
const chunkIds = await backend.getProjectChunkIds(projectId)
return chunkIds
}
/**
 * Get all of a project's chunks directly
 */
async function getProjectChunks(projectId) {
  const backend = getBackend(projectId)
  const chunks = await backend.getProjectChunks(projectId)
  return chunks
}
/**
* Load the chunk for a given chunk record, including blob metadata.
*/
async function loadByChunkRecord(projectId, chunkRecord) {
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
const { raw: rawHistory, buffer: chunkBuffer } =
await historyStore.loadRawWithBuffer(projectId, chunkRecord.id)
const history = History.fromRaw(rawHistory)
await lazyLoadHistoryFiles(history, batchBlobStore)
return {
chunk: new Chunk(history, chunkRecord.endVersion - history.countChanges()),
chunkBuffer,
}
}
/**
* Asynchronously retrieves project chunks starting from a specific version.
*
* This generator function yields chunk records for a given project starting from the specified version (inclusive).
* It continues to fetch and yield subsequent chunk records until the end version of the latest chunk metadata is reached.
* If you want to fetch all the chunks *after* a version V, call this function with V+1.
*
* @param {string} projectId - The ID of the project.
* @param {number} version - The starting version to retrieve chunks from.
* @returns {AsyncGenerator<Object, void, undefined>} An async generator that yields chunk records.
*/
async function* getProjectChunksFromVersion(projectId, version) {
const backend = getBackend(projectId)
const latestChunkMetadata = await loadLatestRaw(projectId)
if (!latestChunkMetadata || version > latestChunkMetadata.endVersion) {
return
}
let chunkRecord = await backend.getChunkForVersion(projectId, version)
while (chunkRecord != null) {
yield chunkRecord
if (chunkRecord.endVersion >= latestChunkMetadata.endVersion) {
break
} else {
chunkRecord = await backend.getChunkForVersion(
projectId,
chunkRecord.endVersion + 1
)
}
}
}
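// Illustrative iteration (projectId and lastBackedUpVersion are placeholders):
// to process every chunk after version lastBackedUpVersion, start at + 1:
//
//   for await (const chunkRecord of getProjectChunksFromVersion(
//     projectId,
//     lastBackedUpVersion + 1
//   )) {
//     // chunkRecord carries { id, startVersion, endVersion, endTimestamp }
//   }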
/**
* Delete the given chunk from the database.
*
* This doesn't delete the chunk from object storage yet. The old chunks
* collection will do that.
*/
async function destroy(projectId, chunkId) {
const backend = getBackend(projectId)
await backend.deleteChunk(projectId, chunkId)
}
/**
* Delete all of a project's chunks from the database.
*/
async function deleteProjectChunks(projectId) {
const backend = getBackend(projectId)
await backend.deleteProjectChunks(projectId)
}
/**
* Delete a given number of old chunks from both the database
* and from object storage.
*
* @param {object} options
* @param {number} [options.batchSize] - number of chunks to delete in each
* batch
* @param {number} [options.maxBatches] - maximum number of batches to process
* @param {number} [options.minAgeSecs] - minimum age of chunks to delete
* @param {number} [options.timeout] - maximum time to spend deleting chunks
*
* @return {Promise<number>} number of chunks deleted
*/
async function deleteOldChunks(options = {}) {
const batchSize = options.batchSize ?? DEFAULT_DELETE_BATCH_SIZE
const maxBatches = options.maxBatches ?? Number.MAX_SAFE_INTEGER
const minAgeSecs = options.minAgeSecs ?? DEFAULT_DELETE_MIN_AGE_SECS
const timeout = options.timeout ?? DEFAULT_DELETE_TIMEOUT_SECS
assert.greater(batchSize, 0)
assert.greater(timeout, 0)
assert.greater(maxBatches, 0)
assert.greaterOrEqual(minAgeSecs, 0)
const timeoutAfter = Date.now() + timeout * 1000
let deletedChunksTotal = 0
for (const backend of [postgresBackend, mongoBackend]) {
for (let i = 0; i < maxBatches; i++) {
if (Date.now() > timeoutAfter) {
break
}
const deletedChunks = await deleteOldChunksBatch(
backend,
batchSize,
minAgeSecs
)
deletedChunksTotal += deletedChunks.length
if (deletedChunks.length !== batchSize) {
// Last batch was incomplete. There probably are no old chunks left
break
}
}
}
return deletedChunksTotal
}
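// Illustrative call (values are arbitrary): delete old chunks in batches of
// the configured size, at most 5 batches, only touching chunks older than a
// day, and stop after roughly 60 seconds:
//
//   await deleteOldChunks({ maxBatches: 5, minAgeSecs: 86400, timeout: 60 })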
async function deleteOldChunksBatch(backend, count, minAgeSecs) {
assert.greater(count, 0, 'bad count')
assert.greaterOrEqual(minAgeSecs, 0, 'bad minAgeSecs')
const oldChunks = await backend.getOldChunksBatch(count, minAgeSecs)
if (oldChunks.length === 0) {
return []
}
await historyStore.deleteChunks(oldChunks)
await backend.deleteOldChunks(oldChunks.map(chunk => chunk.chunkId))
return oldChunks
}
/**
* Returns the appropriate backend for the given project id
*
* Numeric ids use the Postgres backend.
* Strings of 24 characters use the Mongo backend.
*/
function getBackend(projectId) {
if (assert.POSTGRES_ID_REGEXP.test(projectId)) {
return postgresBackend
} else if (assert.MONGO_ID_REGEXP.test(projectId)) {
return mongoBackend
} else {
throw new OError('bad project id', { projectId })
}
}
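// Example (illustrative; the exact id formats are defined by
// assert.POSTGRES_ID_REGEXP and assert.MONGO_ID_REGEXP): a numeric id resolves
// to the Postgres backend, a 24-character hex id resolves to the Mongo
// backend, anything else throws.
//
//   getBackend('123')                      // => postgresBackend
//   getBackend('507f1f77bcf86cd799439011') // => mongoBackend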
class AlreadyInitialized extends OError {
constructor(projectId) {
super('Project is already initialized', { projectId })
}
}
module.exports = {
getBackend,
initializeProject,
loadLatest,
loadLatestRaw,
loadAtVersion,
loadAtTimestamp,
loadByChunkRecord,
create,
update,
destroy,
getChunkIdForVersion,
getChunkMetadataForVersion,
getProjectChunkIds,
getProjectChunks,
getProjectChunksFromVersion,
deleteProjectChunks,
deleteOldChunks,
AlreadyInitialized,
ChunkVersionConflictError,
}

View File

@@ -0,0 +1,526 @@
// @ts-check
const { ObjectId, ReadPreference, MongoError } = require('mongodb')
const { Chunk } = require('overleaf-editor-core')
const OError = require('@overleaf/o-error')
const assert = require('../assert')
const mongodb = require('../mongodb')
const { ChunkVersionConflictError } = require('./errors')
const DUPLICATE_KEY_ERROR_CODE = 11000
/**
* @import { ClientSession } from 'mongodb'
*/
/**
* Get the latest chunk's metadata from the database
* @param {string} projectId
* @param {Object} [opts]
* @param {boolean} [opts.readOnly]
*/
async function getLatestChunk(projectId, opts = {}) {
assert.mongoId(projectId, 'bad projectId')
const { readOnly = false } = opts
const record = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
},
{
sort: { startVersion: -1 },
readPreference: readOnly
? ReadPreference.secondaryPreferred
: ReadPreference.primary,
}
)
if (record == null) {
return null
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the chunk that contains the given version.
*/
async function getChunkForVersion(projectId, version) {
assert.mongoId(projectId, 'bad projectId')
assert.integer(version, 'bad version')
const record = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
startVersion: { $lte: version },
endVersion: { $gte: version },
},
{ sort: { startVersion: 1 } }
)
if (record == null) {
throw new Chunk.VersionNotFoundError(projectId, version)
}
return chunkFromRecord(record)
}
/**
 * Get the metadata for the project's first chunk (start version 0) whose end
 * timestamp is at or before the given timestamp, falling back to deleted chunks.
*/
async function getFirstChunkBeforeTimestamp(projectId, timestamp) {
assert.mongoId(projectId, 'bad projectId')
assert.date(timestamp, 'bad timestamp')
const recordActive = await getChunkForVersion(projectId, 0)
if (recordActive && recordActive.endTimestamp <= timestamp) {
return recordActive
}
// fallback to deleted chunk
const recordDeleted = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: 'deleted',
startVersion: 0,
updatedAt: { $lte: timestamp }, // indexed for state=deleted
endTimestamp: { $lte: timestamp },
},
{ sort: { updatedAt: -1 } }
)
if (recordDeleted) {
return chunkFromRecord(recordDeleted)
}
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
/**
* Get the metadata for the chunk that contains the version that was current at
* the given timestamp.
*/
async function getChunkForTimestamp(projectId, timestamp) {
assert.mongoId(projectId, 'bad projectId')
assert.date(timestamp, 'bad timestamp')
const record = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
endTimestamp: { $gte: timestamp },
},
// We use the index on the startVersion for sorting records. This assumes
// that timestamps go up with each version.
{ sort: { startVersion: 1 } }
)
if (record == null) {
// Couldn't find a chunk that had modifications after the given timestamp.
// Fetch the latest chunk instead.
const chunk = await getLatestChunk(projectId)
if (chunk == null) {
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
return chunk
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the chunk that contains the version that was current before
* the given timestamp.
*/
async function getLastActiveChunkBeforeTimestamp(projectId, timestamp) {
assert.mongoId(projectId, 'bad projectId')
assert.date(timestamp, 'bad timestamp')
const record = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
$or: [
{
endTimestamp: {
$lte: timestamp,
},
},
{
endTimestamp: null,
},
],
},
// We use the index on the startVersion for sorting records. This assumes
// that timestamps go up with each version.
{ sort: { startVersion: -1 } }
)
if (record == null) {
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
return chunkFromRecord(record)
}
/**
* Get all of a project's chunk ids
*/
async function getProjectChunkIds(projectId) {
assert.mongoId(projectId, 'bad projectId')
const cursor = mongodb.chunks.find(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
},
{ projection: { _id: 1 } }
)
return await cursor.map(record => record._id).toArray()
}
/**
 * Get all of a project's chunks directly
*/
async function getProjectChunks(projectId) {
assert.mongoId(projectId, 'bad projectId')
const cursor = mongodb.chunks
.find(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
},
{ projection: { state: 0 } }
)
.sort({ startVersion: 1 })
return await cursor.map(chunkFromRecord).toArray()
}
/**
* Insert a pending chunk before sending it to object storage.
*/
async function insertPendingChunk(projectId, chunk) {
assert.mongoId(projectId, 'bad projectId')
assert.instance(chunk, Chunk, 'bad chunk')
const chunkId = new ObjectId()
await mongodb.chunks.insertOne({
_id: chunkId,
projectId: new ObjectId(projectId),
startVersion: chunk.getStartVersion(),
endVersion: chunk.getEndVersion(),
endTimestamp: chunk.getEndTimestamp(),
state: 'pending',
updatedAt: new Date(),
})
return chunkId.toString()
}
/**
* Record that a new chunk was created.
*
* @param {string} projectId
* @param {Chunk} chunk
* @param {string} chunkId
* @param {object} opts
* @param {Date} [opts.earliestChangeTimestamp]
* @param {string} [opts.oldChunkId]
*/
async function confirmCreate(projectId, chunk, chunkId, opts = {}) {
assert.mongoId(projectId, 'bad projectId')
assert.instance(chunk, Chunk, 'bad newChunk')
assert.mongoId(chunkId, 'bad newChunkId')
await mongodb.client.withSession(async session => {
await session.withTransaction(async () => {
if (opts.oldChunkId != null) {
await closeChunk(projectId, opts.oldChunkId, { session })
}
await activateChunk(projectId, chunkId, { session })
await updateProjectRecord(
projectId,
chunk,
opts.earliestChangeTimestamp,
{ session }
)
})
})
}
/**
* Write the metadata to the project record
*/
async function updateProjectRecord(
projectId,
chunk,
earliestChangeTimestamp,
mongoOpts = {}
) {
// record the end version against the project
await mongodb.projects.updateOne(
{
'overleaf.history.id': projectId, // string for Object ids, number for postgres ids
},
{
// always store the latest end version and timestamp for the chunk
$max: {
'overleaf.history.currentEndVersion': chunk.getEndVersion(),
'overleaf.history.currentEndTimestamp': chunk.getEndTimestamp(),
'overleaf.history.updatedAt': new Date(),
},
// store the first pending change timestamp for the chunk, this will
// be cleared every time a backup is completed.
$min: {
'overleaf.backup.pendingChangeAt':
earliestChangeTimestamp || chunk.getEndTimestamp() || new Date(),
},
},
mongoOpts
)
}
/**
* Record that a chunk was replaced by a new one.
*
* @param {string} projectId
* @param {string} oldChunkId
* @param {Chunk} newChunk
* @param {string} newChunkId
* @param {object} [opts]
* @param {Date} [opts.earliestChangeTimestamp]
*/
async function confirmUpdate(
projectId,
oldChunkId,
newChunk,
newChunkId,
opts = {}
) {
assert.mongoId(projectId, 'bad projectId')
assert.mongoId(oldChunkId, 'bad oldChunkId')
assert.instance(newChunk, Chunk, 'bad newChunk')
assert.mongoId(newChunkId, 'bad newChunkId')
await mongodb.client.withSession(async session => {
await session.withTransaction(async () => {
await deleteActiveChunk(projectId, oldChunkId, { session })
await activateChunk(projectId, newChunkId, { session })
await updateProjectRecord(
projectId,
newChunk,
opts.earliestChangeTimestamp,
{ session }
)
})
})
}
/**
* Activate a pending chunk
*
* @param {string} projectId
* @param {string} chunkId
* @param {object} [opts]
* @param {ClientSession} [opts.session]
*/
async function activateChunk(projectId, chunkId, opts = {}) {
assert.mongoId(projectId, 'bad projectId')
assert.mongoId(chunkId, 'bad chunkId')
let result
try {
result = await mongodb.chunks.updateOne(
{
_id: new ObjectId(chunkId),
projectId: new ObjectId(projectId),
state: 'pending',
},
{ $set: { state: 'active', updatedAt: new Date() } },
opts
)
} catch (err) {
if (err instanceof MongoError && err.code === DUPLICATE_KEY_ERROR_CODE) {
throw new ChunkVersionConflictError('chunk start version is not unique', {
projectId,
chunkId,
})
} else {
throw err
}
}
if (result.matchedCount === 0) {
throw new OError('pending chunk not found', { projectId, chunkId })
}
}
/**
* Close a chunk
*
* A closed chunk is one that can't be extended anymore.
*
* @param {string} projectId
* @param {string} chunkId
* @param {object} [opts]
* @param {ClientSession} [opts.session]
*/
async function closeChunk(projectId, chunkId, opts = {}) {
const result = await mongodb.chunks.updateOne(
{
_id: new ObjectId(chunkId),
projectId: new ObjectId(projectId),
state: 'active',
},
{ $set: { state: 'closed' } },
opts
)
if (result.matchedCount === 0) {
throw new ChunkVersionConflictError('unable to close chunk', {
projectId,
chunkId,
})
}
}
/**
* Delete an active chunk
*
* This is used to delete chunks that are in the process of being extended. It
* will refuse to delete chunks that are already closed and can therefore not be
* extended.
*
* @param {string} projectId
* @param {string} chunkId
* @param {object} [opts]
* @param {ClientSession} [opts.session]
*/
async function deleteActiveChunk(projectId, chunkId, opts = {}) {
const updateResult = await mongodb.chunks.updateOne(
{
_id: new ObjectId(chunkId),
projectId: new ObjectId(projectId),
state: 'active',
},
{ $set: { state: 'deleted', updatedAt: new Date() } },
opts
)
if (updateResult.matchedCount === 0) {
throw new ChunkVersionConflictError('unable to delete active chunk', {
projectId,
chunkId,
})
}
}
/**
* Delete a chunk.
*
* @param {string} projectId
* @param {string} chunkId
* @return {Promise}
*/
async function deleteChunk(projectId, chunkId, mongoOpts = {}) {
assert.mongoId(projectId, 'bad projectId')
assert.mongoId(chunkId, 'bad chunkId')
await mongodb.chunks.updateOne(
{ _id: new ObjectId(chunkId), projectId: new ObjectId(projectId) },
{ $set: { state: 'deleted', updatedAt: new Date() } },
mongoOpts
)
}
/**
* Delete all of a project's chunks
*/
async function deleteProjectChunks(projectId) {
assert.mongoId(projectId, 'bad projectId')
await mongodb.chunks.updateMany(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
},
{ $set: { state: 'deleted', updatedAt: new Date() } }
)
}
/**
* Get a batch of old chunks for deletion
*/
async function getOldChunksBatch(count, minAgeSecs) {
const maxUpdatedAt = new Date(Date.now() - minAgeSecs * 1000)
const batch = []
// We need to fetch one state at a time to take advantage of the partial
// indexes on the chunks collection.
//
// Mongo 6.0 allows partial indexes that use the $in operator. When we reach
// that Mongo version, we can create a partial index on both the deleted and
// pending states and simplify this logic a bit.
for (const state of ['deleted', 'pending']) {
if (count === 0) {
// There's no more space in the batch
break
}
const cursor = mongodb.chunks
.find(
{ state, updatedAt: { $lt: maxUpdatedAt } },
{
limit: count,
projection: { _id: 1, projectId: 1 },
}
)
.map(record => ({
chunkId: record._id.toString(),
projectId: record.projectId.toString(),
}))
for await (const record of cursor) {
batch.push(record)
count -= 1
}
}
return batch
}
/**
* Delete a batch of old chunks from the database
*/
async function deleteOldChunks(chunkIds) {
await mongodb.chunks.deleteMany({
_id: { $in: chunkIds.map(id => new ObjectId(id)) },
state: { $in: ['deleted', 'pending'] },
})
}
/**
* Build a chunk metadata object from the database record
*/
function chunkFromRecord(record) {
return {
id: record._id.toString(),
startVersion: record.startVersion,
endVersion: record.endVersion,
endTimestamp: record.endTimestamp,
}
}
module.exports = {
getLatestChunk,
getFirstChunkBeforeTimestamp,
getLastActiveChunkBeforeTimestamp,
getChunkForVersion,
getChunkForTimestamp,
getProjectChunkIds,
getProjectChunks,
insertPendingChunk,
confirmCreate,
confirmUpdate,
updateProjectRecord,
deleteChunk,
deleteProjectChunks,
getOldChunksBatch,
deleteOldChunks,
}
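// Example (illustrative sketch of the usual write protocol, assuming the
// chunk's history is uploaded to object storage between the two calls):
//
//   const chunkId = await insertPendingChunk(projectId, chunk)
//   // ... upload the chunk's history to the history store here ...
//   await confirmCreate(projectId, chunk, chunkId, { oldChunkId })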

View File

@@ -0,0 +1,487 @@
// @ts-check
const { Chunk } = require('overleaf-editor-core')
const assert = require('../assert')
const knex = require('../knex')
const knexReadOnly = require('../knex_read_only')
const { ChunkVersionConflictError } = require('./errors')
const { updateProjectRecord } = require('./mongo')
const DUPLICATE_KEY_ERROR_CODE = '23505'
/**
* @import { Knex } from 'knex'
*/
/**
* Get the latest chunk's metadata from the database
* @param {string} projectId
* @param {Object} [opts]
* @param {boolean} [opts.readOnly]
*/
async function getLatestChunk(projectId, opts = {}) {
assert.postgresId(projectId, 'bad projectId')
const { readOnly = false } = opts
const record = await (readOnly ? knexReadOnly : knex)('chunks')
.where('doc_id', parseInt(projectId, 10))
.orderBy('end_version', 'desc')
.first()
if (record == null) {
return null
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the chunk that contains the given version.
*
* @param {string} projectId
* @param {number} version
*/
async function getChunkForVersion(projectId, version) {
assert.postgresId(projectId, 'bad projectId')
const record = await knex('chunks')
.where('doc_id', parseInt(projectId, 10))
.where('end_version', '>=', version)
.orderBy('end_version')
.first()
if (!record) {
throw new Chunk.VersionNotFoundError(projectId, version)
}
return chunkFromRecord(record)
}
/**
 * Get the metadata for the project's first chunk (start version 0) whose end
 * timestamp is at or before the given timestamp.
*
* @param {string} projectId
* @param {Date} timestamp
*/
async function getFirstChunkBeforeTimestamp(projectId, timestamp) {
assert.date(timestamp, 'bad timestamp')
const recordActive = await getChunkForVersion(projectId, 0)
// projectId must be valid if getChunkForVersion did not throw
if (recordActive && recordActive.endTimestamp <= timestamp) {
return recordActive
}
// fallback to deleted chunk
const recordDeleted = await knex('old_chunks')
.where('doc_id', parseInt(projectId, 10))
.where('start_version', '=', 0)
.where('end_timestamp', '<=', timestamp)
.orderBy('end_version', 'desc')
.first()
if (recordDeleted) {
return chunkFromRecord(recordDeleted)
}
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
/**
 * Get the metadata for the chunk that contains the version that was current
 * before the given timestamp.
*
* @param {string} projectId
* @param {Date} timestamp
*/
async function getLastActiveChunkBeforeTimestamp(projectId, timestamp) {
assert.date(timestamp, 'bad timestamp')
assert.postgresId(projectId, 'bad projectId')
const query = knex('chunks')
.where('doc_id', parseInt(projectId, 10))
.where(function () {
this.where('end_timestamp', '<=', timestamp).orWhere(
'end_timestamp',
null
)
})
.orderBy('end_version', 'desc', 'last')
const record = await query.first()
if (!record) {
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the chunk that contains the version that was current at
* the given timestamp.
*
* @param {string} projectId
* @param {Date} timestamp
*/
async function getChunkForTimestamp(projectId, timestamp) {
assert.postgresId(projectId, 'bad projectId')
  // This query finds the earliest chunk whose end timestamp is at or after the
  // given timestamp, OR falls back to the latest chunk for the project. The
  // fallback covers the case where the timestamp is ahead of every chunk's end
  // timestamp, which on its own would return no results.
const whereAfterEndTimestampOrLatestChunk = knex.raw(
'end_timestamp >= ? ' +
'OR id = ( ' +
'SELECT id FROM chunks ' +
'WHERE doc_id = ? ' +
'ORDER BY end_version desc LIMIT 1' +
')',
[timestamp, parseInt(projectId, 10)]
)
const record = await knex('chunks')
.where('doc_id', parseInt(projectId, 10))
.where(whereAfterEndTimestampOrLatestChunk)
.orderBy('end_version')
.first()
if (!record) {
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
return chunkFromRecord(record)
}
/**
* Build a chunk metadata object from the database record
*/
function chunkFromRecord(record) {
return {
id: record.id.toString(),
startVersion: record.start_version,
endVersion: record.end_version,
endTimestamp: record.end_timestamp,
}
}
/**
* Get all of a project's chunk ids
*
* @param {string} projectId
*/
async function getProjectChunkIds(projectId) {
assert.postgresId(projectId, 'bad projectId')
const records = await knex('chunks')
.select('id')
.where('doc_id', parseInt(projectId, 10))
return records.map(record => record.id)
}
/**
 * Get all of a project's chunks directly
*
* @param {string} projectId
*/
async function getProjectChunks(projectId) {
assert.postgresId(projectId, 'bad projectId')
const records = await knex('chunks')
.select()
.where('doc_id', parseInt(projectId, 10))
.orderBy('end_version')
return records.map(chunkFromRecord)
}
/**
* Insert a pending chunk before sending it to object storage.
*
* @param {string} projectId
* @param {Chunk} chunk
*/
async function insertPendingChunk(projectId, chunk) {
assert.postgresId(projectId, 'bad projectId')
const result = await knex.first(
knex.raw("nextval('chunks_id_seq'::regclass)::integer as chunkid")
)
const chunkId = result.chunkid
await knex('pending_chunks').insert({
id: chunkId,
doc_id: parseInt(projectId, 10),
end_version: chunk.getEndVersion(),
start_version: chunk.getStartVersion(),
end_timestamp: chunk.getEndTimestamp(),
})
return chunkId.toString()
}
/**
* Record that a new chunk was created.
*
* @param {string} projectId
* @param {Chunk} chunk
* @param {string} chunkId
* @param {object} opts
* @param {Date} [opts.earliestChangeTimestamp]
* @param {string} [opts.oldChunkId]
*/
async function confirmCreate(projectId, chunk, chunkId, opts = {}) {
assert.postgresId(projectId, 'bad projectId')
await knex.transaction(async tx => {
if (opts.oldChunkId != null) {
await _assertChunkIsNotClosed(tx, projectId, opts.oldChunkId)
await _closeChunk(tx, projectId, opts.oldChunkId)
}
await Promise.all([
_deletePendingChunk(tx, projectId, chunkId),
_insertChunk(tx, projectId, chunk, chunkId),
])
await updateProjectRecord(
// The history id in Mongo is an integer for Postgres projects
parseInt(projectId, 10),
chunk,
opts.earliestChangeTimestamp
)
})
}
/**
* Record that a chunk was replaced by a new one.
*
* @param {string} projectId
* @param {string} oldChunkId
* @param {Chunk} newChunk
* @param {string} newChunkId
*/
async function confirmUpdate(
projectId,
oldChunkId,
newChunk,
newChunkId,
opts = {}
) {
assert.postgresId(projectId, 'bad projectId')
await knex.transaction(async tx => {
await _assertChunkIsNotClosed(tx, projectId, oldChunkId)
await _deleteChunks(tx, { doc_id: projectId, id: oldChunkId })
await Promise.all([
_deletePendingChunk(tx, projectId, newChunkId),
_insertChunk(tx, projectId, newChunk, newChunkId),
])
await updateProjectRecord(
// The history id in Mongo is an integer for Postgres projects
parseInt(projectId, 10),
newChunk,
opts.earliestChangeTimestamp
)
})
}
/**
* Delete a pending chunk
*
* @param {Knex} tx
* @param {string} projectId
* @param {string} chunkId
*/
async function _deletePendingChunk(tx, projectId, chunkId) {
await tx('pending_chunks')
.where({
doc_id: parseInt(projectId, 10),
id: parseInt(chunkId, 10),
})
.del()
}
/**
* Adds an active chunk
*
* @param {Knex} tx
* @param {string} projectId
* @param {Chunk} chunk
* @param {string} chunkId
*/
async function _insertChunk(tx, projectId, chunk, chunkId) {
const startVersion = chunk.getStartVersion()
const endVersion = chunk.getEndVersion()
try {
await tx('chunks').insert({
id: parseInt(chunkId, 10),
doc_id: parseInt(projectId, 10),
start_version: startVersion,
end_version: endVersion,
end_timestamp: chunk.getEndTimestamp(),
})
} catch (err) {
if (
err instanceof Error &&
'code' in err &&
err.code === DUPLICATE_KEY_ERROR_CODE
) {
throw new ChunkVersionConflictError(
'chunk start or end version is not unique',
{ projectId, chunkId, startVersion, endVersion }
)
}
throw err
}
}
/**
* Check that a chunk is not closed
*
* This is used to synchronize chunk creations and extensions.
*
* @param {Knex} tx
* @param {string} projectId
* @param {string} chunkId
*/
async function _assertChunkIsNotClosed(tx, projectId, chunkId) {
const record = await tx('chunks')
.forUpdate()
.select('closed')
.where('doc_id', parseInt(projectId, 10))
.where('id', parseInt(chunkId, 10))
.first()
if (!record) {
throw new ChunkVersionConflictError('unable to close chunk: not found', {
projectId,
chunkId,
})
}
if (record.closed) {
throw new ChunkVersionConflictError(
'unable to close chunk: already closed',
{
projectId,
chunkId,
}
)
}
}
/**
* Close a chunk
*
* A closed chunk can no longer be extended.
*
* @param {Knex} tx
* @param {string} projectId
* @param {string} chunkId
*/
async function _closeChunk(tx, projectId, chunkId) {
await tx('chunks')
.update({ closed: true })
.where('doc_id', parseInt(projectId, 10))
.where('id', parseInt(chunkId, 10))
}
/**
* Delete a chunk.
*
* @param {string} projectId
* @param {string} chunkId
*/
async function deleteChunk(projectId, chunkId) {
assert.postgresId(projectId, 'bad projectId')
assert.integer(chunkId, 'bad chunkId')
await _deleteChunks(knex, {
doc_id: parseInt(projectId, 10),
id: parseInt(chunkId, 10),
})
}
/**
* Delete all of a project's chunks
*
* @param {string} projectId
*/
async function deleteProjectChunks(projectId) {
assert.postgresId(projectId, 'bad projectId')
await knex.transaction(async tx => {
await _deleteChunks(knex, { doc_id: parseInt(projectId, 10) })
})
}
/**
* Delete many chunks
*
* @param {Knex} tx
* @param {any} whereClause
*/
async function _deleteChunks(tx, whereClause) {
const rows = await tx('chunks').where(whereClause).del().returning('*')
if (rows.length === 0) {
return
}
const oldChunks = rows.map(row => ({
doc_id: row.doc_id,
chunk_id: row.id,
start_version: row.start_version,
end_version: row.end_version,
end_timestamp: row.end_timestamp,
deleted_at: tx.fn.now(),
}))
await tx('old_chunks').insert(oldChunks)
}
/**
* Get a batch of old chunks for deletion
*
* @param {number} count
* @param {number} minAgeSecs
*/
async function getOldChunksBatch(count, minAgeSecs) {
const maxDeletedAt = new Date(Date.now() - minAgeSecs * 1000)
const records = await knex('old_chunks')
.whereNull('deleted_at')
.orWhere('deleted_at', '<', maxDeletedAt)
.orderBy('chunk_id')
.limit(count)
return records.map(oldChunk => ({
projectId: oldChunk.doc_id.toString(),
chunkId: oldChunk.chunk_id.toString(),
}))
}
/**
* Delete a batch of old chunks from the database
*
* @param {string[]} chunkIds
*/
async function deleteOldChunks(chunkIds) {
await knex('old_chunks')
.whereIn(
'chunk_id',
chunkIds.map(id => parseInt(id, 10))
)
.del()
}
/**
* Generate a new project id
*/
async function generateProjectId() {
const record = await knex.first(
knex.raw("nextval('docs_id_seq'::regclass)::integer as doc_id")
)
return record.doc_id.toString()
}
module.exports = {
getLatestChunk,
getFirstChunkBeforeTimestamp,
getLastActiveChunkBeforeTimestamp,
getChunkForVersion,
getChunkForTimestamp,
getProjectChunkIds,
getProjectChunks,
insertPendingChunk,
confirmCreate,
confirmUpdate,
deleteChunk,
deleteProjectChunks,
getOldChunksBatch,
deleteOldChunks,
generateProjectId,
}

View File

@@ -0,0 +1,254 @@
const metrics = require('@overleaf/metrics')
const logger = require('@overleaf/logger')
const redis = require('../redis')
const rclient = redis.rclientHistory //
const { Snapshot, Change, History, Chunk } = require('overleaf-editor-core')
const TEMPORARY_CACHE_LIFETIME = 300 // 5 minutes
const keySchema = {
snapshot({ projectId }) {
return `snapshot:{${projectId}}`
},
startVersion({ projectId }) {
return `snapshot-version:{${projectId}}`
},
changes({ projectId }) {
return `changes:{${projectId}}`
},
}
rclient.defineCommand('get_current_chunk', {
numberOfKeys: 3,
lua: `
local startVersionValue = redis.call('GET', KEYS[2])
if not startVersionValue then
return nil -- this is a cache-miss
end
local snapshotValue = redis.call('GET', KEYS[1])
local changesValues = redis.call('LRANGE', KEYS[3], 0, -1)
return {snapshotValue, startVersionValue, changesValues}
`,
})
/**
* Retrieves the current chunk of project history from Redis storage
* @param {string} projectId - The unique identifier of the project
* @returns {Promise<Chunk|null>} A Promise that resolves to a Chunk object containing project history,
* or null if retrieval fails
* @throws {Error} If Redis operations fail
*/
async function getCurrentChunk(projectId) {
try {
const result = await rclient.get_current_chunk(
keySchema.snapshot({ projectId }),
keySchema.startVersion({ projectId }),
keySchema.changes({ projectId })
)
if (!result) {
return null // cache-miss
}
const snapshot = Snapshot.fromRaw(JSON.parse(result[0]))
const startVersion = JSON.parse(result[1])
const changes = result[2].map(c => Change.fromRaw(JSON.parse(c)))
const history = new History(snapshot, changes)
const chunk = new Chunk(history, startVersion)
metrics.inc('chunk_store.redis.get_current_chunk', 1, { status: 'success' })
return chunk
} catch (err) {
logger.error({ err, projectId }, 'error getting current chunk from redis')
metrics.inc('chunk_store.redis.get_current_chunk', 1, { status: 'error' })
return null
}
}
rclient.defineCommand('get_current_chunk_metadata', {
numberOfKeys: 2,
lua: `
local startVersionValue = redis.call('GET', KEYS[1])
local changesCount = redis.call('LLEN', KEYS[2])
return {startVersionValue, changesCount}
`,
})
/**
* Retrieves the current chunk metadata for a given project from Redis
* @param {string} projectId - The ID of the project to get metadata for
* @returns {Promise<Object|null>} Object containing startVersion and changesCount if found, null on error or cache miss
* @property {number} startVersion - The starting version information
* @property {number} changesCount - The number of changes in the chunk
*/
async function getCurrentChunkMetadata(projectId) {
try {
const result = await rclient.get_current_chunk_metadata(
keySchema.startVersion({ projectId }),
keySchema.changes({ projectId })
)
if (!result) {
return null // cache-miss
}
const startVersion = JSON.parse(result[0])
const changesCount = parseInt(result[1], 10)
return { startVersion, changesCount }
} catch (err) {
return null
}
}
rclient.defineCommand('set_current_chunk', {
numberOfKeys: 3,
lua: `
local snapshotValue = ARGV[1]
local startVersionValue = ARGV[2]
redis.call('SETEX', KEYS[1], ${TEMPORARY_CACHE_LIFETIME}, snapshotValue)
redis.call('SETEX', KEYS[2], ${TEMPORARY_CACHE_LIFETIME}, startVersionValue)
redis.call('DEL', KEYS[3]) -- clear the old changes list
if #ARGV >= 3 then
redis.call('RPUSH', KEYS[3], unpack(ARGV, 3))
redis.call('EXPIRE', KEYS[3], ${TEMPORARY_CACHE_LIFETIME})
end
`,
})
/**
* Stores the current chunk of project history in Redis
* @param {string} projectId - The ID of the project
* @param {Chunk} chunk - The chunk object containing history data
* @returns {Promise<*>} Returns the result of the Redis operation, or null if an error occurs
* @throws {Error} May throw Redis-related errors which are caught internally
*/
async function setCurrentChunk(projectId, chunk) {
try {
const snapshotKey = keySchema.snapshot({ projectId })
const startVersionKey = keySchema.startVersion({ projectId })
const changesKey = keySchema.changes({ projectId })
const snapshot = chunk.history.snapshot
const startVersion = chunk.startVersion
const changes = chunk.history.changes
await rclient.set_current_chunk(
snapshotKey,
startVersionKey,
changesKey,
JSON.stringify(snapshot.toRaw()),
startVersion,
...changes.map(c => JSON.stringify(c.toRaw()))
)
metrics.inc('chunk_store.redis.set_current_chunk', 1, { status: 'success' })
} catch (err) {
logger.error(
{ err, projectId, chunk },
      'error setting current chunk in redis'
)
metrics.inc('chunk_store.redis.set_current_chunk', 1, { status: 'error' })
return null // while testing we will suppress any errors
}
}
/**
* Checks whether a cached chunk's version metadata matches the current chunk's metadata
* @param {Chunk} cachedChunk - The chunk retrieved from cache
* @param {Chunk} currentChunk - The current chunk to compare against
* @returns {boolean} - Returns true if the chunks have matching start and end versions, false otherwise
*/
function checkCacheValidity(cachedChunk, currentChunk) {
return Boolean(
cachedChunk &&
cachedChunk.getStartVersion() === currentChunk.getStartVersion() &&
cachedChunk.getEndVersion() === currentChunk.getEndVersion()
)
}
/**
* Validates if a cached chunk matches the current chunk metadata by comparing versions
* @param {Object} cachedChunk - The cached chunk object to validate
* @param {Object} currentChunkMetadata - The current chunk metadata to compare against
* @param {number} currentChunkMetadata.startVersion - The starting version number
* @param {number} currentChunkMetadata.endVersion - The ending version number
* @returns {boolean} - True if the cached chunk is valid, false otherwise
*/
function checkCacheValidityWithMetadata(cachedChunk, currentChunkMetadata) {
return Boolean(
cachedChunk &&
cachedChunk.getStartVersion() === currentChunkMetadata.startVersion &&
cachedChunk.getEndVersion() === currentChunkMetadata.endVersion
)
}
/**
* Compares two chunks for equality using stringified JSON comparison
* @param {string} projectId - The ID of the project
* @param {Chunk} cachedChunk - The cached chunk to compare
* @param {Chunk} currentChunk - The current chunk to compare against
* @returns {boolean} - Returns false if either chunk is null/undefined, otherwise returns the comparison result
*/
function compareChunks(projectId, cachedChunk, currentChunk) {
if (!cachedChunk || !currentChunk) {
return false
}
const identical = JSON.stringify(cachedChunk) === JSON.stringify(currentChunk)
if (!identical) {
try {
logger.error(
{
projectId,
cachedChunkStartVersion: cachedChunk.getStartVersion(),
cachedChunkEndVersion: cachedChunk.getEndVersion(),
currentChunkStartVersion: currentChunk.getStartVersion(),
currentChunkEndVersion: currentChunk.getEndVersion(),
},
'chunk cache mismatch'
)
} catch (err) {
// ignore errors while logging
}
}
metrics.inc('chunk_store.redis.compare_chunks', 1, {
status: identical ? 'success' : 'fail',
})
return identical
}
// Define Lua script for atomic cache clearing
rclient.defineCommand('clear_chunk_cache', {
numberOfKeys: 3,
lua: `
-- Delete all keys related to a project's chunk cache atomically
redis.call('DEL', KEYS[1]) -- snapshot key
redis.call('DEL', KEYS[2]) -- startVersion key
redis.call('DEL', KEYS[3]) -- changes key
return 1
`,
})
/**
* Clears all cache entries for a project's chunk data
* @param {string} projectId - The ID of the project whose cache should be cleared
* @returns {Promise<boolean>} A promise that resolves to true if successful, false on error
*/
async function clearCache(projectId) {
try {
const snapshotKey = keySchema.snapshot({ projectId })
const startVersionKey = keySchema.startVersion({ projectId })
const changesKey = keySchema.changes({ projectId })
await rclient.clear_chunk_cache(snapshotKey, startVersionKey, changesKey)
metrics.inc('chunk_store.redis.clear_cache', 1, { status: 'success' })
return true
} catch (err) {
logger.error({ err, projectId }, 'error clearing chunk cache from redis')
metrics.inc('chunk_store.redis.clear_cache', 1, { status: 'error' })
return false
}
}
module.exports = {
getCurrentChunk,
setCurrentChunk,
getCurrentChunkMetadata,
checkCacheValidity,
checkCacheValidityWithMetadata,
compareChunks,
clearCache,
}
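// Example (illustrative sketch; `chunk` is assumed to be a Chunk loaded from
// the chunk store): populate the cache, read it back, and check that the
// cached copy still matches before using it.
//
//   await setCurrentChunk(projectId, chunk)
//   const cached = await getCurrentChunk(projectId)
//   if (checkCacheValidity(cached, chunk)) {
//     // safe to serve the cached chunk
//   }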

View File

@@ -0,0 +1,18 @@
// @ts-check
const { createHash } = require('node:crypto')
/**
* Compute a SHA-1 hash of the content
*
* This is used to validate incoming updates.
*
* @param {string} content
*/
function getContentHash(content) {
const hash = createHash('sha-1')
hash.update(content)
return hash.digest('hex')
}
module.exports = { getContentHash }
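// Example (illustrative): the result is the hex-encoded SHA-1 digest of the
// content.
//
//   getContentHash('hello world')
//   // => '2aae6c35c94fcfb415dbe95f408b9ce91ee846ed'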

View File

@@ -0,0 +1,5 @@
const OError = require('@overleaf/o-error')
class InvalidChangeError extends OError {}
module.exports = { InvalidChangeError }

View File

@@ -0,0 +1,30 @@
const Blob = require('overleaf-editor-core').Blob
const blobHash = require('./blob_hash')
const BPromise = require('bluebird')
// We want to simulate applying all of the operations so we can return the
// resulting hashes to the caller for them to check. To do this, we need to be
// able to take the lazy files in the final snapshot, fetch their content, and
// compute the new content hashes. We don't, however, need to actually store
// that content; we just need to get the hash.
function HashCheckBlobStore(realBlobStore) {
this.realBlobStore = realBlobStore
}
HashCheckBlobStore.prototype.getString = BPromise.method(
function hashCheckBlobStoreGetString(hash) {
return this.realBlobStore.getString(hash)
}
)
HashCheckBlobStore.prototype.putString = BPromise.method(
function hashCheckBlobStorePutString(string) {
return new Blob(
blobHash.fromString(string),
Buffer.byteLength(string),
string.length
)
}
)
module.exports = HashCheckBlobStore
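// Example (illustrative sketch; BlobStore and projectId are assumed to come
// from ./blob_store): wrap a real blob store so that putString only computes
// the would-be blob hash without persisting any content.
//
//   const hashCheckBlobStore = new HashCheckBlobStore(new BlobStore(projectId))
//   const blob = await hashCheckBlobStore.putString('new file content')
//   // blob.getHash() is returned to the caller for verification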

View File

@@ -0,0 +1,202 @@
// @ts-check
'use strict'
const core = require('overleaf-editor-core')
const config = require('config')
const path = require('node:path')
const Stream = require('node:stream')
const { promisify } = require('node:util')
const zlib = require('node:zlib')
const OError = require('@overleaf/o-error')
const objectPersistor = require('@overleaf/object-persistor')
const logger = require('@overleaf/logger')
const assert = require('./assert')
const persistor = require('./persistor')
const projectKey = require('./project_key')
const streams = require('./streams')
const Chunk = core.Chunk
const gzip = promisify(zlib.gzip)
const gunzip = promisify(zlib.gunzip)
class LoadError extends OError {
/**
* @param {string} projectId
* @param {string} chunkId
* @param {any} cause
*/
constructor(projectId, chunkId, cause) {
super(
'HistoryStore: failed to load chunk history',
{ projectId, chunkId },
cause
)
this.projectId = projectId
this.chunkId = chunkId
}
}
class StoreError extends OError {
/**
* @param {string} projectId
* @param {string} chunkId
* @param {any} cause
*/
constructor(projectId, chunkId, cause) {
super(
'HistoryStore: failed to store chunk history',
{ projectId, chunkId },
cause
)
this.projectId = projectId
this.chunkId = chunkId
}
}
/**
* @param {string} projectId
* @param {string} chunkId
* @return {string}
*/
function getKey(projectId, chunkId) {
return path.join(projectKey.format(projectId), projectKey.pad(chunkId))
}
/**
 * Store and retrieve raw {@link History} objects from the bucket. Mainly used via the
* {@link ChunkStore}.
*
* Histories are stored as gzipped JSON blobs, keyed on the project ID and the
* ID of the Chunk that owns the history. The project ID is currently redundant,
* but I think it might help in future if we have to shard on project ID, and
* it gives us some chance of reconstructing histories even if there is a
* problem with the chunk metadata in the database.
*
* @class
*/
class HistoryStore {
#persistor
#bucket
constructor(persistor, bucket) {
this.#persistor = persistor
this.#bucket = bucket
}
/**
* Load the raw object for a History.
*
* @param {string} projectId
* @param {string} chunkId
* @return {Promise<import('overleaf-editor-core/lib/types').RawHistory>}
*/
async loadRaw(projectId, chunkId) {
assert.projectId(projectId, 'bad projectId')
assert.chunkId(chunkId, 'bad chunkId')
const key = getKey(projectId, chunkId)
logger.debug({ projectId, chunkId }, 'loadRaw started')
try {
const buf = await streams.gunzipStreamToBuffer(
await this.#persistor.getObjectStream(this.#bucket, key)
)
return JSON.parse(buf.toString('utf-8'))
} catch (err) {
if (err instanceof objectPersistor.Errors.NotFoundError) {
throw new Chunk.NotPersistedError(projectId)
}
throw new LoadError(projectId, chunkId, err)
} finally {
logger.debug({ projectId, chunkId }, 'loadRaw finished')
}
}
async loadRawWithBuffer(projectId, chunkId) {
assert.projectId(projectId, 'bad projectId')
assert.chunkId(chunkId, 'bad chunkId')
const key = getKey(projectId, chunkId)
logger.debug({ projectId, chunkId }, 'loadBuffer started')
try {
const buf = await streams.readStreamToBuffer(
await this.#persistor.getObjectStream(this.#bucket, key)
)
const unzipped = await gunzip(buf)
return {
buffer: buf,
raw: JSON.parse(unzipped.toString('utf-8')),
}
} catch (err) {
if (err instanceof objectPersistor.Errors.NotFoundError) {
throw new Chunk.NotPersistedError(projectId)
}
throw new LoadError(projectId, chunkId, err)
} finally {
logger.debug({ projectId, chunkId }, 'loadBuffer finished')
}
}
/**
* Compress and store a {@link History}.
*
* @param {string} projectId
* @param {string} chunkId
* @param {import('overleaf-editor-core/lib/types').RawHistory} rawHistory
*/
async storeRaw(projectId, chunkId, rawHistory) {
assert.projectId(projectId, 'bad projectId')
assert.chunkId(chunkId, 'bad chunkId')
assert.object(rawHistory, 'bad rawHistory')
const key = getKey(projectId, chunkId)
logger.debug({ projectId, chunkId }, 'storeRaw started')
const buf = await gzip(JSON.stringify(rawHistory))
try {
await this.#persistor.sendStream(
this.#bucket,
key,
Stream.Readable.from([buf]),
{
contentType: 'application/json',
contentEncoding: 'gzip',
contentLength: buf.byteLength,
}
)
} catch (err) {
throw new StoreError(projectId, chunkId, err)
} finally {
logger.debug({ projectId, chunkId }, 'storeRaw finished')
}
}
/**
* Delete multiple chunks from bucket. Expects an Array of objects with
* projectId and chunkId properties
* @param {Array<{projectId: string,chunkId:string}>} chunks
*/
async deleteChunks(chunks) {
logger.debug({ chunks }, 'deleteChunks started')
try {
await Promise.all(
chunks.map(chunk => {
const key = getKey(chunk.projectId, chunk.chunkId)
return this.#persistor.deleteObject(this.#bucket, key)
})
)
} finally {
logger.debug({ chunks }, 'deleteChunks finished')
}
}
}
module.exports = {
HistoryStore,
historyStore: new HistoryStore(persistor, config.get('chunkStore.bucket')),
}
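// Example (illustrative sketch, assuming valid project and chunk ids and a
// History object from overleaf-editor-core): store a raw history and load it
// back via the default `historyStore` instance.
//
//   await historyStore.storeRaw(projectId, chunkId, history.toRaw())
//   const rawHistory = await historyStore.loadRaw(projectId, chunkId)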

View File

@@ -0,0 +1,8 @@
// @ts-check
'use strict'
const env = process.env.NODE_ENV || 'development'
const knexfile = require('../../knexfile')
module.exports = require('knex').default(knexfile[env])

View File

@@ -0,0 +1,19 @@
'use strict'
const config = require('config')
const knexfile = require('../../knexfile')
const env = process.env.NODE_ENV || 'development'
if (config.databaseUrlReadOnly) {
module.exports = require('knex')({
...knexfile[env],
pool: {
...knexfile[env].pool,
min: 0,
},
connection: config.databaseUrlReadOnly,
})
} else {
module.exports = require('./knex')
}

View File

@@ -0,0 +1,30 @@
const Metrics = require('@overleaf/metrics')
const config = require('config')
const { MongoClient } = require('mongodb')
const client = new MongoClient(config.mongo.uri)
const db = client.db()
const chunks = db.collection('projectHistoryChunks')
const blobs = db.collection('projectHistoryBlobs')
const globalBlobs = db.collection('projectHistoryGlobalBlobs')
const shardedBlobs = db.collection('projectHistoryShardedBlobs')
const projects = db.collection('projects')
// Temporary collection for tracking progress of backed up old blobs (without a hash).
// The initial sync process will be able to skip over these.
// Schema: _id: projectId, blobs: [Binary]
const backedUpBlobs = db.collection('projectHistoryBackedUpBlobs')
Metrics.mongodb.monitor(client)
module.exports = {
client,
db,
chunks,
blobs,
globalBlobs,
projects,
shardedBlobs,
backedUpBlobs,
}

View File

@@ -0,0 +1,261 @@
// @ts-check
'use strict'
const _ = require('lodash')
const logger = require('@overleaf/logger')
const core = require('overleaf-editor-core')
const Chunk = core.Chunk
const History = core.History
const assert = require('./assert')
const chunkStore = require('./chunk_store')
const { BlobStore } = require('./blob_store')
const { InvalidChangeError } = require('./errors')
const { getContentHash } = require('./content_hash')
function countChangeBytes(change) {
// Note: This is not quite accurate, because the raw change may contain raw
// file info (or conceivably even content) that will not be included in the
// actual stored object.
return Buffer.byteLength(JSON.stringify(change.toRaw()))
}
function totalChangeBytes(changes) {
return changes.length ? _(changes).map(countChangeBytes).sum() : 0
}
// provide a simple timer function
function Timer() {
this.t0 = process.hrtime()
}
Timer.prototype.elapsed = function () {
const dt = process.hrtime(this.t0)
const timeInMilliseconds = (dt[0] + dt[1] * 1e-9) * 1e3
return timeInMilliseconds
}
/**
* Break the given set of changes into zero or more Chunks according to the
* provided limits and store them.
*
* Some other possible improvements:
* 1. This does a lot more JSON serialization than it has to. We may know the
* JSON for the changes before we call this function, so we could in that
* case get the byte size of each change without doing any work. Even if we
* don't know it initially, we could save some computation by caching this
* info rather than recomputing it many times. TBD whether it is worthwhile.
* 2. We don't necessarily have to fetch the latest chunk in order to determine
* that it is full. We could store this in the chunk metadata record. It may
* be worth distinguishing between a Chunk and its metadata record. The
* endVersion may be better suited to the metadata record.
*
* @param {string} projectId
* @param {core.Change[]} allChanges
* @param {Object} limits
* @param {number} clientEndVersion
* @return {Promise.<Object?>}
*/
async function persistChanges(projectId, allChanges, limits, clientEndVersion) {
assert.projectId(projectId)
assert.array(allChanges)
assert.maybe.object(limits)
assert.integer(clientEndVersion)
const blobStore = new BlobStore(projectId)
const earliestChangeTimestamp =
allChanges.length > 0 ? allChanges[0].getTimestamp() : null
let currentChunk
/**
* currentSnapshot tracks the latest change that we're applying; we use it to
* check that the changes we are persisting are valid.
*
* @type {core.Snapshot}
*/
let currentSnapshot
let originalEndVersion
let changesToPersist
limits = limits || {}
_.defaults(limits, {
changeBucketMinutes: 60,
maxChanges: 2500,
maxChangeBytes: 5 * 1024 * 1024,
maxChunkChanges: 2000,
maxChunkChangeBytes: 5 * 1024 * 1024,
maxChunkChangeTime: 5000, // warn if total time for changes in a chunk takes longer than this
})
function checkElapsedTime(timer) {
const timeTaken = timer.elapsed()
if (timeTaken > limits.maxChunkChangeTime) {
console.log('warning: slow chunk', projectId, timeTaken)
}
}
/**
* Add changes to a chunk until the chunk is full
*
* The chunk is full if it reaches a certain number of changes or a certain
* size in bytes
*
* @param {core.Chunk} chunk
* @param {core.Change[]} changes
*/
async function fillChunk(chunk, changes) {
let totalBytes = totalChangeBytes(chunk.getChanges())
let changesPushed = false
while (changes.length > 0) {
if (chunk.getChanges().length >= limits.maxChunkChanges) {
break
}
const change = changes[0]
const changeBytes = countChangeBytes(change)
if (totalBytes + changeBytes > limits.maxChunkChangeBytes) {
break
}
for (const operation of change.iterativelyApplyTo(currentSnapshot, {
strict: true,
})) {
await validateContentHash(operation)
}
chunk.pushChanges([change])
changes.shift()
totalBytes += changeBytes
changesPushed = true
}
return changesPushed
}
/**
* Check that the operation is valid and can be incorporated to the history.
*
* For now, this checks content hashes when they are provided.
*
* @param {core.Operation} operation
*/
async function validateContentHash(operation) {
if (operation instanceof core.EditFileOperation) {
const editOperation = operation.getOperation()
if (
editOperation instanceof core.TextOperation &&
editOperation.contentHash != null
) {
const path = operation.getPathname()
const file = currentSnapshot.getFile(path)
if (file == null) {
throw new InvalidChangeError('file not found for hash validation', {
projectId,
path,
})
}
await file.load('eager', blobStore)
const content = file.getContent({ filterTrackedDeletes: true })
const expectedHash = editOperation.contentHash
const actualHash = content != null ? getContentHash(content) : null
logger.debug({ expectedHash, actualHash }, 'validating content hash')
if (actualHash !== expectedHash) {
throw new InvalidChangeError('content hash mismatch', {
projectId,
path,
expectedHash,
actualHash,
})
}
// Remove the content hash from the change before storing it in the chunk.
// It was only useful for validation.
editOperation.contentHash = null
}
}
}
async function extendLastChunkIfPossible() {
const latestChunk = await chunkStore.loadLatest(projectId)
currentChunk = latestChunk
originalEndVersion = latestChunk.getEndVersion()
if (originalEndVersion !== clientEndVersion) {
throw new Chunk.ConflictingEndVersion(
clientEndVersion,
originalEndVersion
)
}
currentSnapshot = latestChunk.getSnapshot().clone()
const timer = new Timer()
currentSnapshot.applyAll(latestChunk.getChanges())
const changesPushed = await fillChunk(currentChunk, changesToPersist)
if (!changesPushed) {
return
}
checkElapsedTime(timer)
await chunkStore.update(
projectId,
originalEndVersion,
currentChunk,
earliestChangeTimestamp
)
}
async function createNewChunksAsNeeded() {
while (changesToPersist.length > 0) {
const endVersion = currentChunk.getEndVersion()
const history = new History(currentSnapshot.clone(), [])
const chunk = new Chunk(history, endVersion)
const timer = new Timer()
const changesPushed = await fillChunk(chunk, changesToPersist)
if (changesPushed) {
checkElapsedTime(timer)
currentChunk = chunk
await chunkStore.create(projectId, chunk, earliestChangeTimestamp)
} else {
throw new Error('failed to fill empty chunk')
}
}
}
function isOlderThanMinChangeTimestamp(change) {
return change.getTimestamp().getTime() < limits.minChangeTimestamp
}
function isOlderThanMaxChangeTimestamp(change) {
return change.getTimestamp().getTime() < limits.maxChangeTimestamp
}
const oldChanges = _.filter(allChanges, isOlderThanMinChangeTimestamp)
const anyTooOld = _.some(oldChanges, isOlderThanMaxChangeTimestamp)
const tooManyChanges = oldChanges.length > limits.maxChanges
const tooManyBytes = totalChangeBytes(oldChanges) > limits.maxChangeBytes
if (anyTooOld || tooManyChanges || tooManyBytes) {
changesToPersist = oldChanges
const numberOfChangesToPersist = oldChanges.length
await extendLastChunkIfPossible()
await createNewChunksAsNeeded()
return {
numberOfChangesPersisted: numberOfChangesToPersist,
originalEndVersion,
currentChunk,
}
} else {
return null
}
}
module.exports = persistChanges
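// Example (illustrative sketch, with made-up limits): persist the changes that
// are more than a minute old once any change is more than five minutes old (or
// the defaults for maxChanges/maxChangeBytes are exceeded); otherwise the
// function returns null and the changes stay buffered.
//
//   const result = await persistChanges(projectId, changes, {
//     minChangeTimestamp: Date.now() - 60 * 1000,
//     maxChangeTimestamp: Date.now() - 5 * 60 * 1000,
//   }, clientEndVersion)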

View File

@@ -0,0 +1,27 @@
const _ = require('lodash')
const config = require('config')
const metrics = require('@overleaf/metrics')
const objectPersistor = require('@overleaf/object-persistor')
const persistorConfig = _.cloneDeep(config.get('persistor'))
function convertKey(key, convertFn) {
if (_.has(persistorConfig, key)) {
_.update(persistorConfig, key, convertFn)
}
}
convertKey('s3.signedUrlExpiryInMs', s => parseInt(s, 10))
convertKey('s3.httpOptions.timeout', s => parseInt(s, 10))
convertKey('s3.maxRetries', s => parseInt(s, 10))
convertKey('s3.pathStyle', s => s === 'true')
convertKey('gcs.unlockBeforeDelete', s => s === 'true')
convertKey('gcs.unsignedUrls', s => s === 'true')
convertKey('gcs.signedUrlExpiryInMs', s => parseInt(s, 10))
convertKey('gcs.deleteConcurrency', s => parseInt(s, 10))
convertKey('gcs.retryOptions.maxRetries', s => parseInt(s, 10))
convertKey('fallback.buckets', s => JSON.parse(s || '{}'))
persistorConfig.Metrics = metrics
module.exports = objectPersistor(persistorConfig)

View File

@@ -0,0 +1,140 @@
// @ts-check
'use strict'
/**
* @import { Snapshot } from 'overleaf-editor-core'
* @import { BlobStore } from '../../storage/lib/blob_store/index'
*/
const Archive = require('archiver')
const BPromise = require('bluebird')
const fs = require('node:fs')
const { pipeline } = require('node:stream')
const core = require('overleaf-editor-core')
const Snapshot = core.Snapshot
const OError = require('@overleaf/o-error')
const assert = require('./assert')
// The maximum safe concurrency appears to be 1.
// https://github.com/overleaf/issues/issues/1909
const FETCH_CONCURRENCY = 1 // number of files to fetch at once
const DEFAULT_ZIP_TIMEOUT = 25000 // ms
class DownloadError extends OError {
constructor(hash) {
super(`ProjectArchive: blob download failed: ${hash}`, { hash })
}
}
class ArchiveTimeout extends OError {
constructor() {
super('ProjectArchive timed out')
}
}
class MissingfileError extends OError {
constructor() {
super('ProjectArchive: attempting to look up a file that does not exist')
}
}
class ProjectArchive {
static ArchiveTimeout = ArchiveTimeout
static MissingfileError = MissingfileError
static DownloadError = DownloadError
/**
* @constructor
* @param {Snapshot} snapshot
* @param {number} [timeout] in ms
* @classdesc
* Writes the project snapshot to a zip file.
*/
constructor(snapshot, timeout) {
assert.instance(snapshot, Snapshot)
this.snapshot = snapshot
this.timeout = timeout || DEFAULT_ZIP_TIMEOUT
}
/**
* Write zip archive to the given file path.
*
* @param {BlobStore} blobStore
* @param {string} zipFilePath
*/
writeZip(blobStore, zipFilePath) {
const snapshot = this.snapshot
const timeout = this.timeout
const startTime = process.hrtime()
const archive = new Archive('zip')
// Convert elapsed seconds and nanoseconds to milliseconds.
function findElapsedMilliseconds() {
const elapsed = process.hrtime(startTime)
return elapsed[0] * 1e3 + elapsed[1] * 1e-6
}
function addFileToArchive(pathname) {
if (findElapsedMilliseconds() > timeout) {
throw new ProjectArchive.ArchiveTimeout()
}
const file = snapshot.getFile(pathname)
if (!file) {
throw new ProjectArchive.MissingfileError()
}
return file.load('eager', blobStore).then(function () {
const content = file.getContent({ filterTrackedDeletes: true })
if (content === null) {
return streamFileToArchive(pathname, file).catch(function (err) {
throw new ProjectArchive.DownloadError(file.getHash()).withCause(
err
)
})
} else {
archive.append(content, { name: pathname })
}
})
}
function streamFileToArchive(pathname, file) {
return new BPromise(function (resolve, reject) {
blobStore
.getStream(file.getHash())
.then(stream => {
stream.on('error', reject)
stream.on('end', resolve)
archive.append(stream, { name: pathname })
})
.catch(reject)
})
}
const addFilesToArchiveAndFinalize = BPromise.map(
snapshot.getFilePathnames(),
addFileToArchive,
{ concurrency: FETCH_CONCURRENCY }
).then(function () {
archive.finalize()
})
const streamArchiveToFile = new BPromise(function (resolve, reject) {
const stream = fs.createWriteStream(zipFilePath)
pipeline(archive, stream, function (err) {
if (err) {
reject(err)
} else {
resolve()
}
})
})
return BPromise.join(streamArchiveToFile, addFilesToArchiveAndFinalize)
}
}
module.exports = ProjectArchive
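// Example (illustrative sketch; BlobStore is assumed to come from
// ./blob_store): write a snapshot to a local zip file with a 30s timeout.
//
//   const archive = new ProjectArchive(snapshot, 30000)
//   await archive.writeZip(new BlobStore(projectId), '/tmp/project.zip')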

View File

@@ -0,0 +1,24 @@
// Keep in sync with services/web/app/src/Features/History/project_key.js
const _ = require('lodash')
const path = require('node:path')
//
// The advice in http://docs.aws.amazon.com/AmazonS3/latest/dev/
// request-rate-perf-considerations.html is to avoid sequential key prefixes,
// so we reverse the project ID part of the key as they suggest.
//
function format(projectId) {
const prefix = naiveReverse(pad(projectId))
return path.join(prefix.slice(0, 3), prefix.slice(3, 6), prefix.slice(6))
}
function pad(number) {
return _.padStart(number, 9, '0')
}
function naiveReverse(string) {
return string.split('').reverse().join('')
}
exports.format = format
exports.pad = pad
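// Example (illustrative): a short id is zero-padded to 9 characters, reversed,
// and split into three path segments.
//
//   format('1234') // => '432/100/000' (pad -> '000001234', reverse -> '432100000')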

View File

@@ -0,0 +1,19 @@
const config = require('config')
const redis = require('@overleaf/redis-wrapper')
const historyRedisOptions = config.get('redis.history')
const rclientHistory = redis.createClient(historyRedisOptions)
const lockRedisOptions = config.get('redis.history')
const rclientLock = redis.createClient(lockRedisOptions)
async function disconnect() {
await Promise.all([rclientHistory.disconnect(), rclientLock.disconnect()])
}
module.exports = {
rclientHistory,
rclientLock,
redis,
disconnect,
}

View File

@@ -0,0 +1,40 @@
// @ts-check
/**
* Promises are promises and streams are streams, and ne'er the twain shall
* meet.
* @module
*/
'use strict'
const Stream = require('node:stream')
const zlib = require('node:zlib')
const { WritableBuffer } = require('@overleaf/stream-utils')
/**
* Create a promise for the result of reading a stream to a buffer.
*
* @param {Stream.Readable} readStream
* @return {Promise<Buffer>}
*/
async function readStreamToBuffer(readStream) {
const bufferStream = new WritableBuffer()
await Stream.promises.pipeline(readStream, bufferStream)
return bufferStream.contents()
}
exports.readStreamToBuffer = readStreamToBuffer
/**
* Create a promise for the result of un-gzipping a stream to a buffer.
*
* @param {NodeJS.ReadableStream} readStream
* @return {Promise<Buffer>}
*/
async function gunzipStreamToBuffer(readStream) {
const gunzip = zlib.createGunzip()
const bufferStream = new WritableBuffer()
await Stream.promises.pipeline(readStream, gunzip, bufferStream)
return bufferStream.contents()
}
exports.gunzipStreamToBuffer = gunzipStreamToBuffer
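// Example (illustrative sketch; the persistor, bucket, and key are assumed to
// point at a gzipped JSON object): read and decompress an object stream in one
// step.
//
//   const stream = await persistor.getObjectStream(bucket, key)
//   const buf = await gunzipStreamToBuffer(stream)
//   const obj = JSON.parse(buf.toString('utf-8'))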

View File

@@ -0,0 +1,25 @@
/*
* Taken from renderer/app/helpers/temp.js with minor cosmetic changes.
* Promisify the temp package. The temp package provides a 'track' feature
* that automatically cleans up temp files at process exit, but that is not
* very useful. They also provide a method to trigger cleanup, but that is not
* safe for concurrent use. So, we use a disposer to unlink the file.
*/
const BPromise = require('bluebird')
const fs = BPromise.promisifyAll(require('node:fs'))
const temp = BPromise.promisifyAll(require('temp'))
exports.open = function (affixes) {
return temp.openAsync(affixes).disposer(function (fileInfo) {
fs.closeAsync(fileInfo.fd)
.then(() => {
return fs.unlinkAsync(fileInfo.path)
})
.catch(function (err) {
if (err.code !== 'ENOENT') {
throw err
}
})
})
}
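// Example (illustrative sketch): the disposer closes and unlinks the temp file
// when the `using` block finishes, even if it throws.
//
//   await BPromise.using(exports.open('zip'), async tempFileInfo => {
//     // write to tempFileInfo.path here
//   })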

View File

@@ -0,0 +1,134 @@
'use strict'
const BPromise = require('bluebird')
const config = require('config')
const fs = require('node:fs')
const path = require('node:path')
const OError = require('@overleaf/o-error')
const objectPersistor = require('@overleaf/object-persistor')
const assert = require('./assert')
const { BlobStore } = require('./blob_store')
const persistor = require('./persistor')
const ProjectArchive = require('./project_archive')
const projectKey = require('./project_key')
const temp = require('./temp')
const BUCKET = config.get('zipStore.bucket')
function getZipKey(projectId, version) {
return path.join(
projectKey.format(projectId),
version.toString(),
'project.zip'
)
}
/**
* Store a zip of a given version of a project in bucket.
*
* @class
*/
class ZipStore {
/**
* Generate signed link to access the zip file.
*
* @param {number | string} projectId
* @param {number} version
* @return {string}
*/
async getSignedUrl(projectId, version) {
assert.projectId(projectId, 'bad projectId')
assert.integer(version, 'bad version')
const key = getZipKey(projectId, version)
return await persistor.getRedirectUrl(BUCKET, key)
}
/**
* Generate a zip of the given snapshot.
*
* @param {number | string} projectId
* @param {number} version
* @param {Snapshot} snapshot
*/
async storeZip(projectId, version, snapshot) {
assert.projectId(projectId, 'bad projectId')
assert.integer(version, 'bad version')
assert.object(snapshot, 'bad snapshot')
const zipKey = getZipKey(projectId, version)
if (await isZipPresent()) return
await BPromise.using(temp.open('zip'), async tempFileInfo => {
await zipSnapshot(tempFileInfo.path, snapshot)
await uploadZip(tempFileInfo.path)
})
// If the file is already there, we don't need to build the zip again. If we
// just HEAD the file, there's a race condition, because the zip files
// automatically expire. So, we try to copy the file from itself to itself,
// and if it fails, we know the file didn't exist. If it succeeds, this has
// the effect of re-extending its lifetime.
async function isZipPresent() {
try {
await persistor.copyObject(BUCKET, zipKey, zipKey)
return true
} catch (error) {
if (!(error instanceof objectPersistor.Errors.NotFoundError)) {
console.error(
'storeZip: isZipPresent: unexpected error (except in dev): %s',
error
)
}
return false
}
}
async function zipSnapshot(tempPathname, snapshot) {
const blobStore = new BlobStore(projectId)
const zipTimeoutMs = parseInt(config.get('zipStore.zipTimeoutMs'), 10)
const archive = new ProjectArchive(snapshot, zipTimeoutMs)
try {
await archive.writeZip(blobStore, tempPathname)
} catch (err) {
throw new ZipStore.CreationError(projectId, version).withCause(err)
}
}
async function uploadZip(tempPathname) {
const stream = fs.createReadStream(tempPathname)
try {
await persistor.sendStream(BUCKET, zipKey, stream, {
contentType: 'application/zip',
})
} catch (err) {
throw new ZipStore.UploadError(projectId, version).withCause(err)
}
}
}
}
class CreationError extends OError {
constructor(projectId, version) {
super(`Zip creation failed for ${projectId} version ${version}`, {
projectId,
version,
})
}
}
ZipStore.CreationError = CreationError
class UploadError extends OError {
constructor(projectId, version) {
super(`Zip upload failed for ${projectId} version ${version}`, {
projectId,
version,
})
}
}
ZipStore.UploadError = UploadError
module.exports = new ZipStore()
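// Illustrative usage sketch (not part of the original module, never invoked):
// build or refresh the zip for a version, then hand out a signed URL. The
// caller and its arguments are assumptions for the example only.
// eslint-disable-next-line no-unused-vars
async function exampleZipStoreUsage(projectId, version, snapshot) {
  await module.exports.storeZip(projectId, version, snapshot)
  return await module.exports.getSignedUrl(projectId, version)
}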

File diff suppressed because it is too large

View File

@@ -0,0 +1,647 @@
// @ts-check
import Events from 'node:events'
import fs from 'node:fs'
import Stream from 'node:stream'
import { ObjectId } from 'mongodb'
import logger from '@overleaf/logger'
import OError from '@overleaf/o-error'
import { Blob } from 'overleaf-editor-core'
import {
BlobStore,
getStringLengthOfFile,
GLOBAL_BLOBS,
makeBlobForFile,
} from '../lib/blob_store/index.js'
import { db } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import readline from 'node:readline'
import { _blobIsBackedUp, backupBlob } from '../lib/backupBlob.mjs'
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import filestorePersistor from '../lib/persistor.js'
import { setTimeout } from 'node:timers/promises'
// Silence warning.
Events.setMaxListeners(20)
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
/**
* @typedef {import("mongodb").Collection} Collection
* @typedef {import("mongodb").Collection<Project>} ProjectsCollection
* @typedef {import("mongodb").Collection<{project: Project}>} DeletedProjectsCollection
*/
/**
* @typedef {Object} FileRef
* @property {ObjectId} _id
* @property {string} hash
*/
/**
* @typedef {Object} Folder
* @property {Array<Folder>} folders
* @property {Array<FileRef>} fileRefs
*/
/**
* @typedef {Object} Project
* @property {ObjectId} _id
* @property {Array<Folder>} rootFolder
* @property {{history: {id: (number|string)}}} overleaf
*/
/**
* @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, FIX_MISSING_HASH: boolean, LOGS: string}}
*/
function parseArgs() {
const args = commandLineArgs([
{ name: 'fixNotFound', type: String, defaultValue: 'true' },
{ name: 'fixDeletePermission', type: String, defaultValue: 'true' },
{ name: 'fixHashMismatch', type: String, defaultValue: 'true' },
{ name: 'fixMissingHash', type: String, defaultValue: 'true' },
{ name: 'logs', type: String, defaultValue: '' },
])
/**
* commandLineArgs cannot handle --foo=false, so go the long way
* @param {string} name
* @return {boolean}
*/
function boolVal(name) {
const v = args[name]
if (['true', 'false'].includes(v)) return v === 'true'
throw new Error(`expected "true" or "false" for boolean option ${name}`)
}
return {
FIX_NOT_FOUND: boolVal('fixNotFound'),
FIX_DELETE_PERMISSION: boolVal('fixDeletePermission'),
FIX_HASH_MISMATCH: boolVal('fixHashMismatch'),
FIX_MISSING_HASH: boolVal('fixMissingHash'),
LOGS: args.logs,
}
}
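// Illustrative invocation (the script path is an assumption; the flags are the
// options defined in parseArgs above, and --logs points at the file read by
// processLog below):
//
//   node storage/scripts/back_fill_file_hash_fix_up.mjs \
//     --logs=/var/log/file-hash-errors.log --fixMissingHash=false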
const {
FIX_HASH_MISMATCH,
FIX_DELETE_PERMISSION,
FIX_NOT_FOUND,
FIX_MISSING_HASH,
LOGS,
} = parseArgs()
if (!LOGS) {
throw new Error('--logs parameter missing')
}
const BUFFER_DIR = fs.mkdtempSync(
process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
)
const USER_FILES_BUCKET_NAME = process.env.USER_FILES_BUCKET_NAME || ''
if (!USER_FILES_BUCKET_NAME) {
throw new Error('env var USER_FILES_BUCKET_NAME is missing')
}
// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode
const STREAM_HIGH_WATER_MARK = parseInt(
process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(),
10
)
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
/** @type {ProjectsCollection} */
const projectsCollection = db.collection('projects')
/** @type {DeletedProjectsCollection} */
const deletedProjectsCollection = db.collection('deletedProjects')
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated, draining queue')
}
class FileDeletedError extends OError {}
/** @type {Map<string,{project: Project, projectSoftDeleted: boolean}>} */
const PROJECT_CACHE = new Map()
/**
* @param {string} projectId
* @return {Promise<{project: Project, projectSoftDeleted: boolean}>}
*/
async function getProject(projectId) {
const cached = PROJECT_CACHE.get(projectId)
if (cached) return cached
let projectSoftDeleted
let project = await projectsCollection.findOne({
_id: new ObjectId(projectId),
})
if (project) {
projectSoftDeleted = false
} else {
const softDeleted = await deletedProjectsCollection.findOne({
'deleterData.deletedProjectId': new ObjectId(projectId),
project: { $exists: true },
})
if (!softDeleted) {
throw new OError('project hard-deleted')
}
project = softDeleted.project
projectSoftDeleted = true
}
PROJECT_CACHE.set(projectId, { projectSoftDeleted, project })
return { projectSoftDeleted, project }
}
/**
* @param {Folder} folder
* @param {string} fileId
* @return {{path: string, fileRef: FileRef, folder: Folder}|null}
*/
function getFileTreePath(folder, fileId) {
if (!folder) return null
let idx = 0
if (Array.isArray(folder.fileRefs)) {
for (const fileRef of folder.fileRefs) {
if (fileRef?._id.toString() === fileId) {
return {
fileRef,
path: `.fileRefs.${idx}`,
folder,
}
}
idx++
}
}
idx = 0
if (Array.isArray(folder.folders)) {
for (const child of folder.folders) {
const match = getFileTreePath(child, fileId)
if (match) {
return {
fileRef: match.fileRef,
folder: match.folder,
path: `.folders.${idx}${match.path}`,
}
}
idx++
}
}
return null
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<{fileRef: FileRef, folder: Folder, fullPath: string, query: Object, projectSoftDeleted: boolean}>}
*/
async function findFile(projectId, fileId) {
const { projectSoftDeleted, project } = await getProject(projectId)
const match = getFileTreePath(project.rootFolder[0], fileId)
if (!match) {
throw new FileDeletedError('file not found in file-tree', {
projectSoftDeleted,
})
}
const { path, fileRef, folder } = match
let fullPath
let query
if (projectSoftDeleted) {
fullPath = `project.rootFolder.0${path}`
query = {
'deleterData.deletedProjectId': new ObjectId(projectId),
[`${fullPath}._id`]: new ObjectId(fileId),
}
} else {
fullPath = `rootFolder.0${path}`
query = {
_id: new ObjectId(projectId),
[`${fullPath}._id`]: new ObjectId(fileId),
}
}
return {
projectSoftDeleted,
query,
fullPath,
fileRef,
folder,
}
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixNotFound(line) {
const { projectId, fileId, bucketName } = JSON.parse(line)
if (bucketName !== USER_FILES_BUCKET_NAME) {
throw new OError('not found case for another bucket')
}
const { projectSoftDeleted, query, fullPath, fileRef, folder } =
await findFile(projectId, fileId)
logger.info({ projectId, fileId, fileRef }, 'removing fileRef')
// Copied from _removeElementFromMongoArray (https://github.com/overleaf/internal/blob/11e09528c153de6b7766d18c3c90d94962190371/services/web/app/src/Features/Project/ProjectEntityMongoUpdateHandler.js)
const nonArrayPath = fullPath.slice(0, fullPath.lastIndexOf('.'))
let result
if (projectSoftDeleted) {
result = await deletedProjectsCollection.updateOne(query, {
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
$inc: { 'project.version': 1 },
})
} else {
result = await projectsCollection.updateOne(query, {
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
$inc: { version: 1 },
})
}
if (result.matchedCount !== 1) {
throw new OError('file-tree write did not match', { result })
}
// Update the cache. The mongo-path of the next file will be off otherwise.
folder.fileRefs = folder.fileRefs.filter(f => !f._id.equals(fileId))
return true
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<void>}
*/
async function setHashInMongo(projectId, fileId, hash) {
const { projectSoftDeleted, query, fullPath, fileRef } = await findFile(
projectId,
fileId
)
if (fileRef.hash === hash) return
logger.info({ projectId, fileId, fileRef, hash }, 'setting fileRef hash')
let result
if (projectSoftDeleted) {
result = await deletedProjectsCollection.updateOne(query, {
$set: { [`${fullPath}.hash`]: hash },
$inc: { 'project.version': 1 },
})
} else {
result = await projectsCollection.updateOne(query, {
$set: { [`${fullPath}.hash`]: hash },
$inc: { version: 1 },
})
}
if (result.matchedCount !== 1) {
throw new OError('file-tree write did not match', { result })
}
fileRef.hash = hash // Update cache for completeness.
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} historyId
* @return {Promise<void>}
*/
async function importRestoredFilestoreFile(projectId, fileId, historyId) {
const filestoreKey = `${projectId}/${fileId}`
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
let s
try {
s = await filestorePersistor.getObjectStream(
USER_FILES_BUCKET_NAME,
filestoreKey
)
} catch (err) {
if (err instanceof NotFoundError) {
throw new OError('missing blob, need to restore filestore file', {
filestoreKey,
})
}
throw err
}
await Stream.promises.pipeline(
s,
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
)
const blobStore = new BlobStore(historyId)
const blob = await blobStore.putFile(path)
await backupBlob(historyId, blob, path)
await setHashInMongo(projectId, fileId, blob.getHash())
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} path
* @return {Promise<Blob>}
*/
async function bufferFilestoreFileToDisk(projectId, fileId, path) {
const filestoreKey = `${projectId}/${fileId}`
try {
await Stream.promises.pipeline(
await filestorePersistor.getObjectStream(
USER_FILES_BUCKET_NAME,
filestoreKey
),
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
)
const blob = await makeBlobForFile(path)
blob.setStringLength(
await getStringLengthOfFile(blob.getByteLength(), path)
)
return blob
} catch (err) {
if (err instanceof NotFoundError) {
throw new OError('missing blob, need to restore filestore file', {
filestoreKey,
})
}
throw err
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<string>}
*/
async function computeFilestoreFileHash(projectId, fileId) {
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
return blob.getHash()
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} projectId
* @param {string} fileId
* @return {Promise<void>}
*/
async function uploadFilestoreFile(projectId, fileId) {
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
try {
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
const hash = blob.getHash()
try {
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
} catch (err) {
if (!(err instanceof Blob.NotFoundError)) throw err
const { project } = await getProject(projectId)
const historyId = project.overleaf.history.id.toString()
const blobStore = new BlobStore(historyId)
await blobStore.putBlob(path, blob)
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
} finally {
await fs.promises.rm(path, { force: true })
}
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixHashMismatch(line) {
const {
projectId,
fileId,
hash: computedHash,
entry: {
hash: fileTreeHash,
ctx: { historyId },
},
} = JSON.parse(line)
const blobStore = new BlobStore(historyId)
if (await blobStore.getBlob(fileTreeHash)) {
throw new OError('found blob matching the file-tree hash')
}
if (!(await blobStore.getBlob(computedHash))) {
await importRestoredFilestoreFile(projectId, fileId, historyId)
return true
}
return await ensureBlobExistsForFileAndUploadToAWS(
projectId,
fileId,
computedHash
)
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function hashAlreadyUpdatedInFileTree(projectId, fileId, hash) {
const { fileRef } = await findFile(projectId, fileId)
return fileRef.hash === hash
}
/**
* @param {string} projectId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function needsBackingUpToAWS(projectId, hash) {
if (GLOBAL_BLOBS.has(hash)) return false
return !(await _blobIsBackedUp(projectId, hash))
}
/**
* @param {string} projectId
* @param {string} fileId
* @param {string} hash
* @return {Promise<boolean>}
*/
async function ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) {
const { project } = await getProject(projectId)
const historyId = project.overleaf.history.id.toString()
const blobStore = new BlobStore(historyId)
if (
(await hashAlreadyUpdatedInFileTree(projectId, fileId, hash)) &&
(await blobStore.getBlob(hash)) &&
!(await needsBackingUpToAWS(projectId, hash))
) {
return false // already processed
}
const stream = await blobStore.getStream(hash)
const path = `${BUFFER_DIR}/${historyId}_${hash}`
try {
await Stream.promises.pipeline(
stream,
fs.createWriteStream(path, {
highWaterMark: STREAM_HIGH_WATER_MARK,
})
)
const writtenBlob = await makeBlobForFile(path)
writtenBlob.setStringLength(
await getStringLengthOfFile(writtenBlob.getByteLength(), path)
)
if (writtenBlob.getHash() !== hash) {
// Double check download, better safe than sorry.
throw new OError('blob corrupted', { writtenBlob })
}
let blob = await blobStore.getBlob(hash)
if (!blob) {
// Calling blobStore.putBlob would result in the same error again.
// HACK: Skip upload to GCS and finalize putBlob operation directly.
await blobStore.backend.insertBlob(historyId, writtenBlob)
}
await backupBlob(historyId, writtenBlob, path)
} finally {
await fs.promises.rm(path, { force: true })
}
await setHashInMongo(projectId, fileId, hash)
return true
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixDeletePermission(line) {
let { projectId, fileId, hash } = JSON.parse(line)
if (!hash) hash = await computeFilestoreFileHash(projectId, fileId)
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
/**
* @param {string} line
* @return {Promise<boolean>}
*/
async function fixMissingHash(line) {
let { projectId, _id: fileId } = JSON.parse(line)
const {
fileRef: { hash },
} = await findFile(projectId, fileId)
if (hash) {
// processed, double check
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
}
await uploadFilestoreFile(projectId, fileId)
return true
}
const CASES = {
'not found': {
match: 'NotFoundError',
flag: FIX_NOT_FOUND,
action: fixNotFound,
},
'hash mismatch': {
match: 'OError: hash mismatch',
flag: FIX_HASH_MISMATCH,
action: fixHashMismatch,
},
'delete permission': {
match: 'storage.objects.delete',
flag: FIX_DELETE_PERMISSION,
action: fixDeletePermission,
},
'missing file hash': {
match: '"bad file hash"',
flag: FIX_MISSING_HASH,
action: fixMissingHash,
},
}
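// Illustrative shape of a log line this script can act on (the field names
// follow the JSON.parse destructuring in the handlers above; the values and
// the exact error text are invented for the example):
//
//   {"msg":"failed to process file","err":"NotFoundError: no such object",
//    "projectId":"...","fileId":"...","bucketName":"...","hash":"...",
//    "entry":{"hash":"...","ctx":{"historyId":"..."}}}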
const STATS = {
processedLines: 0,
success: 0,
alreadyProcessed: 0,
fileDeleted: 0,
skipped: 0,
failed: 0,
unmatched: 0,
}
function logStats() {
console.log(
JSON.stringify({
time: new Date(),
gracefulShutdownInitiated,
...STATS,
})
)
}
setInterval(logStats, 10_000)
async function processLog() {
const rl = readline.createInterface({
input: fs.createReadStream(LOGS),
})
nextLine: for await (const line of rl) {
if (gracefulShutdownInitiated) break
STATS.processedLines++
if (
!(
line.includes('"failed to process file"') ||
// Process missing hashes as flagged by find_malformed_filetrees.mjs
line.includes('"bad file-tree path"')
)
) {
continue
}
for (const [name, { match, flag, action }] of Object.entries(CASES)) {
if (!line.includes(match)) continue
if (flag) {
try {
if (await action(line)) {
STATS.success++
} else {
STATS.alreadyProcessed++
}
} catch (err) {
if (err instanceof FileDeletedError) {
STATS.fileDeleted++
logger.info({ err, line }, 'file deleted, skipping')
} else {
STATS.failed++
logger.error({ err, line }, `failed to fix ${name}`)
}
}
} else {
STATS.skipped++
}
continue nextLine
}
STATS.unmatched++
logger.warn({ line }, 'unknown fatal error')
}
}
async function main() {
try {
await processLog()
} finally {
logStats()
try {
await fs.promises.rm(BUFFER_DIR, { recursive: true, force: true })
} catch (err) {
console.error(`Cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
}
}
const { skipped, failed, unmatched } = STATS
await setTimeout(SLEEP_BEFORE_EXIT)
if (failed > 0) {
process.exit(Math.min(failed, 99))
} else if (unmatched > 0) {
process.exit(100)
} else if (skipped > 0) {
process.exit(101)
} else {
process.exit(0)
}
}
await main()

File diff suppressed because it is too large

View File

@@ -0,0 +1,173 @@
// @ts-check
import commandLineArgs from 'command-line-args'
import { backupBlob, downloadBlobToDir } from '../lib/backupBlob.mjs'
import withTmpDir from '../../api/controllers/with_tmp_dir.js'
import {
BlobStore,
GLOBAL_BLOBS,
loadGlobalBlobs,
} from '../lib/blob_store/index.js'
import assert from '../lib/assert.js'
import knex from '../lib/knex.js'
import { client } from '../lib/mongodb.js'
import redis from '../lib/redis.js'
import { setTimeout } from 'node:timers/promises'
import fs from 'node:fs'
await loadGlobalBlobs()
/**
* Gracefully shutdown the process
* @return {Promise<void>}
*/
async function gracefulShutdown() {
console.log('Gracefully shutting down')
await knex.destroy()
await client.close()
await redis.disconnect()
await setTimeout(100)
process.exit()
}
/**
*
* @param {string} row
* @return {BackupBlobJob}
*/
function parseCSVRow(row) {
const [historyId, hash] = row.split(',')
validateBackedUpBlobJob({ historyId, hash })
return { historyId, hash }
}
/**
*
* @param {BackupBlobJob} job
*/
function validateBackedUpBlobJob(job) {
assert.projectId(job.historyId)
assert.blobHash(job.hash)
}
/**
*
* @param {string} path
* @return {Promise<Array<BackupBlobJob>>}
*/
async function readCSV(path) {
let fh
/** @type {Array<BackupBlobJob>} */
const rows = []
try {
fh = await fs.promises.open(path, 'r')
} catch (error) {
console.error(`Could not open file: ${error}`)
throw error
}
for await (const line of fh.readLines()) {
try {
const row = parseCSVRow(line)
if (GLOBAL_BLOBS.has(row.hash)) {
console.log(`Skipping global blob: ${line}`)
continue
}
rows.push(row)
} catch (error) {
console.error(error instanceof Error ? error.message : error)
console.log(`Skipping invalid row: ${line}`)
}
}
return rows
}
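// Illustrative --input file format (one "historyId,hash" pair per line, as
// split by parseCSVRow above; the ids and hashes are invented):
//
//   5f0c9d8e2a1b3c4d5e6f7a8b,0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33
//   1234567890abcdef12345678,da39a3ee5e6b4b0d3255bfef95601890afd80709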
/**
* @typedef {Object} BackupBlobJob
* @property {string} hash
* @property {string} historyId
*/
/**
* @param {Object} options
* @param {string} [options.historyId]
* @param {string} [options.hash]
* @param {string} [options.input]
* @return {Promise<Array<BackupBlobJob>>}
*/
async function initialiseJobs({ historyId, hash, input }) {
if (input) {
return await readCSV(input)
}
if (!historyId) {
console.error('historyId is required')
process.exitCode = 1
await gracefulShutdown()
}
if (!hash) {
console.error('hash is required')
process.exitCode = 1
await gracefulShutdown()
}
validateBackedUpBlobJob({ historyId, hash })
if (GLOBAL_BLOBS.has(hash)) {
console.error(`Blob ${hash} is a global blob; not backing up`)
process.exitCode = 1
await gracefulShutdown()
}
return [{ hash, historyId }]
}
/**
*
* @param {string} historyId
* @param {string} hash
* @return {Promise<void>}
*/
export async function downloadAndBackupBlob(historyId, hash) {
const blobStore = new BlobStore(historyId)
const blob = await blobStore.getBlob(hash)
if (!blob) {
throw new Error(`Blob ${hash} could not be loaded`)
}
await withTmpDir(`blob-${hash}`, async tmpDir => {
const filePath = await downloadBlobToDir(historyId, blob, tmpDir)
console.log(`Downloaded blob ${hash} to ${filePath}`)
await backupBlob(historyId, blob, filePath)
console.log('Backed up blob')
})
}
let jobs
const options = commandLineArgs([
{ name: 'historyId', type: String },
{ name: 'hash', type: String },
{ name: 'input', type: String },
])
try {
jobs = await initialiseJobs(options)
} catch (error) {
console.error(error)
await gracefulShutdown()
}
if (!Array.isArray(jobs)) {
// This is mostly to satisfy typescript
process.exitCode = 1
await gracefulShutdown()
process.exit(1)
}
for (const { historyId, hash } of jobs) {
try {
await downloadAndBackupBlob(historyId, hash)
} catch (error) {
console.error(error)
process.exitCode = 1
}
}
await gracefulShutdown()

View File

@@ -0,0 +1,153 @@
// @ts-check
import { ObjectId } from 'mongodb'
import { READ_PREFERENCE_SECONDARY } from '@overleaf/mongo-utils/batchedUpdate.js'
import { db, client } from '../lib/mongodb.js'
const projectsCollection = db.collection('projects')
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
// Configuration
const SAMPLE_SIZE_PER_ITERATION = process.argv[2]
? parseInt(process.argv[2], 10)
: 10000
const TARGET_ERROR_PERCENTAGE = process.argv[3]
? parseFloat(process.argv[3])
: 5.0
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated')
}
async function takeSample(sampleSize) {
const results = await projectsCollection
.aggregate(
[
{ $sample: { size: sampleSize } },
{
$match: { 'overleaf.backup.lastBackedUpVersion': { $exists: true } },
},
{
$count: 'total',
},
],
{ readPreference: READ_PREFERENCE_SECONDARY }
)
.toArray()
const count = results[0]?.total || 0
return { totalSampled: sampleSize, backedUp: count }
}
function calculateStatistics(
cumulativeSampled,
cumulativeBackedUp,
totalPopulation
) {
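// Note: Math.max(1, cumulativeBackedUp) keeps the proportion non-zero before
// any backed-up project has been sampled, so the margin of error cannot
// collapse to zero on an all-negative sample.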
const proportion = Math.max(1, cumulativeBackedUp) / cumulativeSampled
// Standard error with finite population correction
const fpc = Math.sqrt(
(totalPopulation - cumulativeSampled) / (totalPopulation - 1)
)
const stdError =
Math.sqrt((proportion * (1 - proportion)) / cumulativeSampled) * fpc
// 95% confidence interval is approximately ±1.96 standard errors
const marginOfError = 1.96 * stdError
return {
proportion,
percentage: (proportion * 100).toFixed(2),
marginOfError,
errorPercentage: (marginOfError * 100).toFixed(2),
lowerBound: ((proportion - marginOfError) * 100).toFixed(2),
upperBound: ((proportion + marginOfError) * 100).toFixed(2),
sampleSize: cumulativeSampled,
populationSize: totalPopulation,
}
}
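// Worked sketch of the estimate above (numbers invented for illustration):
// with N = 1,000,000 projects, n = 20,000 sampled and 19,000 backed up,
// p = 0.95, fpc = sqrt((1000000 - 20000) / 999999) ≈ 0.99,
// stdError = sqrt(0.95 * 0.05 / 20000) * 0.99 ≈ 0.00153, and
// marginOfError = 1.96 * 0.00153 ≈ 0.003, i.e. roughly ±0.3%.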
async function main() {
console.log('Date:', new Date().toISOString())
const totalCount = await projectsCollection.estimatedDocumentCount({
readPreference: READ_PREFERENCE_SECONDARY,
})
console.log(
`Total projects in collection (estimated): ${totalCount.toLocaleString()}`
)
console.log(`Target margin of error: ${TARGET_ERROR_PERCENTAGE}%`)
let cumulativeSampled = 0
let cumulativeBackedUp = 0
let currentError = Infinity
let iteration = 0
console.log('Iteration | Total Sampled | % Backed Up | Margin of Error')
console.log('----------|---------------|-------------|----------------')
while (currentError > TARGET_ERROR_PERCENTAGE) {
if (gracefulShutdownInitiated) {
console.log('Graceful shutdown initiated. Exiting sampling loop.')
break
}
iteration++
const { totalSampled, backedUp } = await takeSample(
SAMPLE_SIZE_PER_ITERATION
)
cumulativeSampled += totalSampled
cumulativeBackedUp += backedUp
const stats = calculateStatistics(
cumulativeSampled,
cumulativeBackedUp,
totalCount
)
currentError = parseFloat(stats.errorPercentage)
console.log(
`${iteration.toString().padStart(9)} | ` +
`${cumulativeSampled.toString().padStart(13)} | ` +
`${stats.percentage.padStart(10)}% | ` +
`\u00B1${stats.errorPercentage}%`
)
// Small delay between iterations
await new Promise(resolve => setTimeout(resolve, 100))
}
const finalStats = calculateStatistics(
cumulativeSampled,
cumulativeBackedUp,
totalCount
)
console.log(
`Projects sampled: ${cumulativeSampled.toLocaleString()} out of ${totalCount.toLocaleString()}`
)
console.log(
`Estimated percentage with lastBackedUpVersion: ${finalStats.percentage}%`
)
console.log(
`95% Confidence Interval: ${finalStats.lowerBound}% - ${finalStats.upperBound}%`
)
console.log(`Final Margin of Error: \u00B1${finalStats.errorPercentage}%`)
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exitCode = 1
})
.finally(() => {
client.close().catch(err => console.error('Error closing MongoDB:', err))
})

View File

@@ -0,0 +1,429 @@
import Queue from 'bull'
import config from 'config'
import commandLineArgs from 'command-line-args'
import logger from '@overleaf/logger'
import {
listPendingBackups,
listUninitializedBackups,
getBackupStatus,
} from '../lib/backup_store/index.js'
logger.initialize('backup-queue')
// Use the same redis config as backup_worker
const redisOptions = config.get('redis.queue')
// Create a Bull queue named 'backup'
const backupQueue = new Queue('backup', {
redis: redisOptions,
defaultJobOptions: {
removeOnComplete: true,
removeOnFail: true,
},
})
// Define command-line options
const optionDefinitions = [
{ name: 'clean', type: Boolean },
{ name: 'status', type: Boolean },
{
name: 'add',
type: String,
multiple: true,
description: 'Project IDs or date range in YYYY-MM-DD:YYYY-MM-DD format',
},
{ name: 'monitor', type: Boolean },
{
name: 'queue-pending',
type: Number,
description:
'Find projects with pending changes older than N seconds and add them to the queue',
},
{
name: 'show-pending',
type: Number,
description:
'Show count of pending projects older than N seconds without adding to queue',
},
{
name: 'limit',
type: Number,
description: 'Limit the number of jobs to be added',
},
{
name: 'interval',
type: Number,
description: 'Time in seconds to spread jobs over (default: 300)',
defaultValue: 300,
},
{
name: 'backoff-delay',
type: Number,
description:
'Backoff delay in milliseconds for failed jobs (default: 1000)',
defaultValue: 1000,
},
{
name: 'attempts',
type: Number,
description: 'Number of retry attempts for failed jobs (default: 3)',
defaultValue: 3,
},
{
name: 'warn-threshold',
type: Number,
description: 'Warn about any project exceeding this pending age',
defaultValue: 2 * 3600, // 2 hours
},
{
name: 'verbose',
alias: 'v',
type: Boolean,
description: 'Show detailed information when used with --show-pending',
},
]
// Parse command line arguments
const options = commandLineArgs(optionDefinitions)
const WARN_THRESHOLD = options['warn-threshold']
// Helper to validate date format
function isValidDateFormat(dateStr) {
return /^\d{4}-\d{2}-\d{2}$/.test(dateStr)
}
// Helper to validate the pending time parameter
function validatePendingTime(option, value) {
if (typeof value !== 'number' || value <= 0) {
console.error(
`Error: --${option} requires a positive numeric TIME argument in seconds`
)
console.error(`Example: --${option} 3600`)
process.exit(1)
}
return value
}
// Helper to format the pending time display
function formatPendingTime(timestamp) {
const now = new Date()
const diffMs = now - timestamp
const seconds = Math.floor(diffMs / 1000)
return `${timestamp.toISOString()} (${seconds} seconds ago)`
}
// Helper to add a job to the queue, checking for duplicates
async function addJobWithCheck(queue, data, options) {
const jobId = options.jobId
// Check if the job already exists
const existingJob = await queue.getJob(jobId)
if (existingJob) {
return { job: existingJob, added: false }
} else {
const job = await queue.add(data, options)
return { job, added: true }
}
}
// Setup queue event listeners
function setupMonitoring() {
console.log('Starting queue monitoring. Press Ctrl+C to exit.')
backupQueue.on('global:error', error => {
logger.info({ error }, 'Queue error')
})
backupQueue.on('global:waiting', jobId => {
logger.info({ jobId }, 'job is waiting')
})
backupQueue.on('global:active', jobId => {
logger.info({ jobId }, 'job is now active')
})
backupQueue.on('global:stalled', jobId => {
logger.info({ jobId }, 'job has stalled')
})
backupQueue.on('global:progress', (jobId, progress) => {
logger.info({ jobId, progress }, 'job progress')
})
backupQueue.on('global:completed', (jobId, result) => {
logger.info({ jobId, result }, 'job completed')
})
backupQueue.on('global:failed', (jobId, err) => {
logger.info({ jobId, err }, 'job failed')
})
backupQueue.on('global:paused', () => {
logger.info({}, 'Queue paused')
})
backupQueue.on('global:resumed', () => {
logger.info({}, 'Queue resumed')
})
backupQueue.on('global:cleaned', (jobs, type) => {
logger.info({ jobsCount: jobs.length, type }, 'Jobs cleaned')
})
backupQueue.on('global:drained', () => {
logger.info({}, 'Queue drained')
})
backupQueue.on('global:removed', jobId => {
logger.info({ jobId }, 'Job removed')
})
}
async function addDateRangeJob(input) {
const [startDate, endDate] = input.split(':')
if (!isValidDateFormat(startDate) || !isValidDateFormat(endDate)) {
console.error(
`Invalid date format for "${input}". Use YYYY-MM-DD:YYYY-MM-DD`
)
return
}
const jobId = `backup-${startDate}-to-${endDate}`
const { job, added } = await addJobWithCheck(
backupQueue,
{ startDate, endDate },
{ jobId }
)
console.log(
`${added ? 'Added' : 'Already exists'}: date range backup job: ${startDate} to ${endDate}, job ID: ${job.id}`
)
}
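// Illustrative invocations (the script path is an assumption; the flags are
// the ones defined in optionDefinitions above):
//
//   node storage/scripts/backup_queue.mjs --status
//   node storage/scripts/backup_queue.mjs --add 2024-01-01:2024-01-31
//   node storage/scripts/backup_queue.mjs --show-pending 3600 --verbose
//   node storage/scripts/backup_queue.mjs --queue-pending 3600 --limit 1000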
// Helper to list pending and uninitialized backups
// This function combines the two cursors into a single generator
// to yield projects from both lists
async function* pendingCursor(timeIntervalMs, limit) {
for await (const project of listPendingBackups(timeIntervalMs, limit)) {
yield project
}
for await (const project of listUninitializedBackups(timeIntervalMs, limit)) {
yield project
}
}
// Process pending projects with changes older than the specified seconds
async function processPendingProjects(
age,
showOnly,
limit,
verbose,
jobInterval,
jobOpts = {}
) {
const timeIntervalMs = age * 1000
console.log(
`Finding projects with pending changes older than ${age} seconds${showOnly ? ' (count only)' : ''}`
)
let count = 0
let addedCount = 0
let existingCount = 0
// Pass the limit directly to MongoDB query for better performance
const changeTimes = []
for await (const project of pendingCursor(timeIntervalMs, limit)) {
const projectId = project._id.toHexString()
const pendingAt =
project.overleaf?.backup?.pendingChangeAt || project._id.getTimestamp()
if (pendingAt) {
changeTimes.push(pendingAt)
const pendingAge = Math.floor((Date.now() - pendingAt.getTime()) / 1000)
if (pendingAge > WARN_THRESHOLD) {
try {
const backupStatus = await getBackupStatus(projectId)
logger.warn(
{
projectId,
pendingAt,
pendingAge,
backupStatus,
warnThreshold: WARN_THRESHOLD,
},
`pending change exceeds rpo warning threshold`
)
} catch (err) {
logger.error(
{ projectId, pendingAt, pendingAge },
'Error getting backup status'
)
throw err
}
}
}
if (showOnly && verbose) {
console.log(
`Project: ${projectId} (pending since: ${formatPendingTime(pendingAt)})`
)
} else if (!showOnly) {
const delay = Math.floor(Math.random() * jobInterval * 1000) // add random delay to avoid all jobs running simultaneously
const { job, added } = await addJobWithCheck(
backupQueue,
{ projectId, pendingChangeAt: pendingAt.getTime() },
{ ...jobOpts, delay, jobId: projectId }
)
if (added) {
if (verbose) {
console.log(
`Added job for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
)
}
addedCount++
} else {
if (verbose) {
console.log(
`Job already exists for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
)
}
existingCount++
}
}
count++
if (count % 1000 === 0) {
console.log(
`Processed ${count} projects`,
showOnly ? '' : `(${addedCount} added, ${existingCount} existing)`
)
}
}
// Set oldestChange to undefined if there are no changes
const oldestChange =
changeTimes.length > 0
? changeTimes.reduce((min, time) => (time < min ? time : min))
: undefined
if (showOnly) {
console.log(
`Found ${count} projects with pending changes (not added to queue)`
)
} else {
console.log(`Found ${count} projects with pending changes:`)
console.log(` ${addedCount} jobs added to queue`)
console.log(` ${existingCount} jobs already existed in queue`)
if (oldestChange) {
console.log(` Oldest pending change: ${formatPendingTime(oldestChange)}`)
}
}
}
// Main execution block
async function run() {
const optionCount = [
options.clean,
options.status,
options.add,
options.monitor,
options['queue-pending'] !== undefined,
options['show-pending'] !== undefined,
].filter(Boolean).length
if (optionCount > 1) {
console.error('Only one option can be specified')
process.exit(1)
}
if (options.clean) {
const beforeCounts = await backupQueue.getJobCounts()
console.log('Current queue state:', JSON.stringify(beforeCounts))
console.log('Cleaning completed and failed jobs...')
await backupQueue.clean(1, 'completed')
await backupQueue.clean(1, 'failed')
const afterCounts = await backupQueue.getJobCounts()
console.log('Current queue state:', JSON.stringify(afterCounts))
console.log('Queue cleaned successfully')
} else if (options.status) {
const counts = await backupQueue.getJobCounts()
console.log('Current queue state:', JSON.stringify(counts))
} else if (options.add) {
const inputs = Array.isArray(options.add) ? options.add : [options.add]
for (const input of inputs) {
if (input.includes(':')) {
// Handle date range format
await addDateRangeJob(input)
} else {
// Handle project ID format
const { job, added } = await addJobWithCheck(
backupQueue,
{ projectId: input },
{ jobId: input }
)
console.log(
`${added ? 'Added' : 'Already exists'}: job for project: ${input}, job ID: ${job.id}`
)
}
}
} else if (options.monitor) {
setupMonitoring()
} else if (options['queue-pending'] !== undefined) {
const age = validatePendingTime('queue-pending', options['queue-pending'])
await processPendingProjects(
age,
false,
options.limit,
options.verbose,
options.interval,
{
attempts: options.attempts,
backoff: {
type: 'exponential',
delay: options['backoff-delay'],
},
}
)
} else if (options['show-pending'] !== undefined) {
const age = validatePendingTime('show-pending', options['show-pending'])
await processPendingProjects(age, true, options.limit, options.verbose)
} else {
console.log('Usage:')
console.log(' --clean Clean up completed and failed jobs')
console.log(' --status Show current job counts')
console.log(' --add [projectId] Add a job for the specified projectId')
console.log(
' --add [YYYY-MM-DD:YYYY-MM-DD] Add a job for the specified date range'
)
console.log(' --monitor Monitor queue events')
console.log(
' --queue-pending TIME Find projects with changes older than TIME seconds and add them to the queue'
)
console.log(
' --show-pending TIME Show count of pending projects older than TIME seconds'
)
console.log(' --limit N Limit the number of jobs to be added')
console.log(
' --interval TIME Time interval in seconds to spread jobs over'
)
console.log(
' --backoff-delay TIME Backoff delay in milliseconds for failed jobs (default: 1000)'
)
console.log(
' --attempts N Number of retry attempts for failed jobs (default: 3)'
)
console.log(
' --verbose, -v Show detailed information when used with --show-pending'
)
}
}
// Run and handle errors
run()
.catch(err => {
console.error('Error:', err)
process.exit(1)
})
.then(result => {
// Only exit if not in monitor mode
if (!options.monitor) {
process.exit(0)
}
})

View File

@@ -0,0 +1,144 @@
import Queue from 'bull'
import logger from '@overleaf/logger'
import config from 'config'
import metrics from '@overleaf/metrics'
import {
backupProject,
initializeProjects,
configureBackup,
} from './backup.mjs'
const CONCURRENCY = 15
const WARN_THRESHOLD = 2 * 60 * 60 * 1000 // warn if projects are older than this
const redisOptions = config.get('redis.queue')
const JOB_TIME_BUCKETS = [10, 100, 500, 1000, 5000, 10000, 30000, 60000] // milliseconds
const LAG_TIME_BUCKETS_HRS = [
0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.75, 2, 3, 4, 5, 6,
] // hours
// Configure backup settings to match worker concurrency
configureBackup({ concurrency: 50, useSecondary: true })
// Create a Bull queue named 'backup'
const backupQueue = new Queue('backup', {
redis: redisOptions,
settings: {
lockDuration: 15 * 60 * 1000, // 15 minutes
lockRenewTime: 60 * 1000, // 1 minute
maxStalledCount: 0, // mark stalled jobs as failed
},
})
// Log queue events
backupQueue.on('active', job => {
logger.debug({ job }, 'job is now active')
})
backupQueue.on('completed', (job, result) => {
metrics.inc('backup_worker_job', 1, { status: 'completed' })
logger.debug({ job, result }, 'job completed')
})
backupQueue.on('failed', (job, err) => {
metrics.inc('backup_worker_job', 1, { status: 'failed' })
logger.error({ job, err }, 'job failed')
})
backupQueue.on('waiting', jobId => {
logger.debug({ jobId }, 'job is waiting')
})
backupQueue.on('error', error => {
logger.error({ error }, 'queue error')
})
backupQueue.on('stalled', job => {
logger.error({ job }, 'job has stalled')
})
backupQueue.on('lock-extension-failed', (job, err) => {
logger.error({ job, err }, 'lock extension failed')
})
backupQueue.on('paused', () => {
logger.info('queue paused')
})
backupQueue.on('resumed', () => {
logger.info('queue resumed')
})
// Process jobs
backupQueue.process(CONCURRENCY, async job => {
const { projectId, startDate, endDate } = job.data
if (projectId) {
return await runBackup(projectId, job.data, job)
} else if (startDate && endDate) {
return await runInit(startDate, endDate)
} else {
throw new Error('invalid job data')
}
})
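// Illustrative job payloads (the shapes follow the destructuring above; the
// values are invented):
//
//   { projectId: '5f0c9d8e2a1b3c4d5e6f7a8b', pendingChangeAt: 1735689600000 }
//   { startDate: '2024-01-01', endDate: '2024-01-31' }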
async function runBackup(projectId, data, job) {
const { pendingChangeAt } = data
// record the time it takes to run the backup job
const timer = new metrics.Timer(
'backup_worker_job_duration',
1,
{},
JOB_TIME_BUCKETS
)
const pendingAge = Date.now() - pendingChangeAt
if (pendingAge > WARN_THRESHOLD) {
logger.warn(
{ projectId, pendingAge, job },
'project has been pending for a long time'
)
}
try {
logger.debug({ projectId }, 'processing backup for project')
await backupProject(projectId, {})
metrics.inc('backup_worker_project', 1, {
status: 'success',
})
timer.done()
// record the replication lag (time from change to backup)
if (pendingChangeAt) {
metrics.histogram(
'backup_worker_replication_lag_in_hours',
(Date.now() - pendingChangeAt) / (3600 * 1000),
LAG_TIME_BUCKETS_HRS
)
}
return `backup completed ${projectId}`
} catch (err) {
metrics.inc('backup_worker_project', 1, { status: 'failed' })
logger.error({ projectId, err }, 'backup failed')
throw err // Re-throw to mark job as failed
}
}
async function runInit(startDate, endDate) {
try {
logger.info({ startDate, endDate }, 'initializing projects')
await initializeProjects({ 'start-date': startDate, 'end-date': endDate })
return `initialization completed ${startDate} - ${endDate}`
} catch (err) {
logger.error({ startDate, endDate, err }, 'initialization failed')
throw err
}
}
export async function drainQueue() {
logger.info({ queue: backupQueue.name }, 'pausing queue')
await backupQueue.pause(true) // pause this worker and wait for jobs to finish
logger.info({ queue: backupQueue.name }, 'closing queue')
await backupQueue.close()
}
export async function healthCheck() {
const count = await backupQueue.count()
metrics.gauge('backup_worker_queue_length', count)
}

View File

@@ -0,0 +1,69 @@
/**
* A script to export the global blobs from mongo to a CSV file.
*
* node storage/scripts/export_global_blobs.mjs --output global_blobs.csv
*
* The output CSV has the following format:
*
* hash,path,byteLength,stringLength,demoted
*
* hash: the hash of the blob
* path: the path of the blob in the blob store
* byteLength: the byte length of the blob, or empty if unknown
* stringLength: the string length of the blob, or empty if unknown
* demoted: true if the blob has been demoted to a reference, false otherwise
*/
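// Illustrative output rows (the hashes are invented; an empty field means the
// corresponding length is unknown):
//
//   hash,path,byteLength,stringLength,demoted
//   0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33,0b/eec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33,42,42,false
//   da39a3ee5e6b4b0d3255bfef95601890afd80709,da/39a3ee5e6b4b0d3255bfef95601890afd80709,120,,true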
// @ts-check
import { ObjectId } from 'mongodb'
import { GLOBAL_BLOBS, loadGlobalBlobs } from '../lib/blob_store/index.js'
import { client } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import fs from 'node:fs'
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
function parseArgs() {
const args = commandLineArgs([
{
name: 'output',
type: String,
alias: 'o',
},
])
const OUTPUT_STREAM = fs.createWriteStream(args['output'], { flags: 'wx' })
return {
OUTPUT_STREAM,
}
}
const { OUTPUT_STREAM } = parseArgs()
async function main() {
await loadGlobalBlobs()
OUTPUT_STREAM.write('hash,path,byteLength,stringLength,demoted\n')
for (const [hash, { blob, demoted }] of GLOBAL_BLOBS) {
const { hash: blobHash, byteLength, stringLength } = blob
if (blobHash !== hash) {
throw new Error(`hash mismatch: ${hash} !== ${blobHash}`)
}
const path = blobHash.slice(0, 2) + '/' + blobHash.slice(2)
const byteLengthStr = byteLength === null ? '' : byteLength
const stringLengthStr = stringLength === null ? '' : stringLength
OUTPUT_STREAM.write(
`${hash},${path},${byteLengthStr},${stringLengthStr},${demoted}\n`
)
}
}
main()
.then(() => console.log('Done.'))
.catch(err => {
console.error('Error:', err)
process.exitCode = 1
})
.finally(() => {
client.close().catch(err => console.error('Error closing MongoDB:', err))
})

View File

@@ -0,0 +1,51 @@
// @ts-check
import { backedUpBlobs } from '../lib/mongodb.js'
import { mongoId } from '../lib/assert.js'
import { ObjectId } from 'mongodb'
import commandLineArgs from 'command-line-args'
const STATS = {
total: 0,
replaced: 0,
skipped: 0,
}
const config = commandLineArgs([
{ name: 'commit', type: Boolean, defaultValue: false },
])
async function processRecord(record) {
STATS.total++
try {
mongoId(record._id)
const newId = new ObjectId(record._id)
if (config.commit) {
await backedUpBlobs.updateOne(
{ _id: newId },
{
$addToSet: { blobs: { $each: record.blobs } },
},
{ upsert: true }
)
await backedUpBlobs.deleteOne({ _id: record._id })
}
STATS.replaced++
} catch (error) {
console.log(error)
STATS.skipped++
}
}
const cursor = backedUpBlobs
.find({ _id: { $type: 'string' } })
.project({ _id: 1, blobs: 1 })
while (await cursor.hasNext()) {
const record = await cursor.next()
await processRecord(record)
}
console.log(
`${!config.commit ? 'DRY RUN: ' : ''}${STATS.total} records, ${STATS.replaced} replaced, ${STATS.skipped} skipped`
)
process.exit()

View File

@@ -0,0 +1,3 @@
UPDATE blobs
SET global = TRUE
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);

View File

@@ -0,0 +1,16 @@
CREATE TABLE global_blobs (
hash_bytes bytea NOT NULL,
byte_length integer NOT NULL,
string_length integer,
global boolean,
CONSTRAINT global_blobs_pkey PRIMARY KEY (hash_bytes),
CONSTRAINT global_blobs_byte_length_non_negative
CHECK (byte_length >= 0),
CONSTRAINT global_blobs_string_length_non_negative
CHECK (string_length IS NULL OR string_length >= 0)
);
INSERT INTO global_blobs (hash_bytes, byte_length, string_length, global)
SELECT hash_bytes, byte_length, string_length, true
FROM blobs
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);

View File

@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO old_blobs;
ALTER TABLE global_blobs RENAME TO blobs;
ALTER TABLE old_blobs
RENAME CONSTRAINT blobs_pkey TO old_blobs_pkey;
ALTER TABLE old_blobs
RENAME CONSTRAINT blobs_byte_length_non_negative
TO old_blobs_byte_length_non_negative;
ALTER TABLE old_blobs
RENAME CONSTRAINT blobs_string_length_non_negative
TO old_blobs_string_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT global_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
RENAME CONSTRAINT global_blobs_byte_length_non_negative
TO blobs_byte_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT global_blobs_string_length_non_negative
TO blobs_string_length_non_negative;
COMMIT;

View File

@@ -0,0 +1,9 @@
Scripts in this directory were used when we cleaned up the global blobs table,
ensuring that it only contained global blobs. The scripts are meant to be run in this order:
* `01-create-blob-hashes-table.sql`
* `02-set-global-flag.sql`
* `03-create-global-blobs-table.sql`
* `04-swap-global-blob-tables.sql`
The `rollback.sql` script can be run to reverse the effect of `04-swap-global-blob-tables.sql`.
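For example, assuming a `psql` session connected to the history database, the scripts can be applied in order with `psql "$DATABASE_URL" -f 01-create-blob-hashes-table.sql` and so on; the connection details here are an assumption, not part of the original notes.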

View File

@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO global_blobs;
ALTER TABLE old_blobs RENAME TO blobs;
ALTER TABLE global_blobs
RENAME CONSTRAINT blobs_pkey TO global_blobs_pkey;
ALTER TABLE global_blobs
RENAME CONSTRAINT blobs_byte_length_non_negative
TO global_blobs_byte_length_non_negative;
ALTER TABLE global_blobs
RENAME CONSTRAINT blobs_string_length_non_negative
TO global_blobs_string_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT old_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
RENAME CONSTRAINT old_blobs_byte_length_non_negative
TO blobs_byte_length_non_negative;
ALTER TABLE blobs
RENAME CONSTRAINT old_blobs_string_length_non_negative
TO blobs_string_length_non_negative;
COMMIT;

View File

@@ -0,0 +1,379 @@
const fsPromises = require('node:fs/promises')
const { ObjectId } = require('mongodb')
const BPromise = require('bluebird')
const logger = require('@overleaf/logger')
const Settings = require('@overleaf/settings')
const rclient = require('@overleaf/redis-wrapper').createClient(
Settings.redis.documentupdater
)
const mongodb = require('../lib/mongodb')
const { chunkStore } = require('..')
const Events = require('node:events')
// Silence warning.
Events.setMaxListeners(20)
const BATCH_SIZE = 1000
const OPTIONS = {
concurrency: parseInt(process.env.DOC_VERSION_RECOVERY_CONCURRENCY, 10) || 20,
force: process.env.DOC_VERSION_RECOVERY_FORCE === 'true',
'skip-history-failures':
process.env.DOC_VERSION_RECOVERY_SKIP_HISTORY_FAILURES === 'true',
'resyncs-needed-file': process.env.DOC_VERSION_RECOVERY_RESYNCS_NEEDED_FILE,
}
const db = {
deletedProjects: mongodb.db.collection('deletedProjects'),
docs: mongodb.db.collection('docs'),
migrations: mongodb.db.collection('migrations'),
projects: mongodb.db.collection('projects'),
}
const BAD_MIGRATION_NAME =
'20231219081700_move_doc_versions_from_docops_to_docs'
const RECOVERY_FILES_502 = [
'/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log',
'/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log.done',
]
let loggingChain = Promise.resolve()
const projectIdsThatNeedResyncing = []
const unflushedDocIds = new Set()
async function flushLogQueue() {
const logPath = OPTIONS['resyncs-needed-file']
loggingChain = loggingChain.then(async () => {
const batch = projectIdsThatNeedResyncing.splice(0)
if (batch.length === 0) return
try {
await fsPromises.appendFile(logPath, batch.join('\n') + '\n')
} catch (err) {
projectIdsThatNeedResyncing.push(...batch)
logger.err({ err, logPath, batch }, 'Failed to write to log file')
}
})
await loggingChain
}
async function recordProjectNeedsResync(projectId) {
if (OPTIONS['resyncs-needed-file']) {
projectIdsThatNeedResyncing.push(projectId)
await flushLogQueue()
} else {
console.log(`Project ${projectId} needs a hard resync.`)
}
}
async function main() {
const recovery502Ran = await did502RecoveryRun()
await getUnflushedDocIds()
const badMigration = await db.migrations.findOne({ name: BAD_MIGRATION_NAME })
if (unflushedDocIds.size > 0 && !recovery502Ran && badMigration != null) {
// Tell customers that they need to flush
console.log(`
--------------------------------------------------------------------
Detected unflushed changes while recovering doc versions.
Please go back to version 5.0.1 and follow the recovery procedure
for flushing document updates:
https://github.com/overleaf/overleaf/wiki/Doc-version-recovery
--------------------------------------------------------------------`)
process.exit(1)
}
if (OPTIONS.force || recovery502Ran || badMigration != null) {
console.warn('Need to recover doc versions. This will take a while.')
await runRecovery()
await db.migrations.deleteOne({ name: BAD_MIGRATION_NAME })
await delete502RecoveryFiles()
}
console.log('Done.')
}
async function did502RecoveryRun() {
for (const file of RECOVERY_FILES_502) {
try {
await fsPromises.stat(file)
return true
} catch (err) {
// file doesn't exist. continue
}
}
return false
}
async function delete502RecoveryFiles() {
for (const file of RECOVERY_FILES_502) {
try {
await fsPromises.rename(file, file.replace('.log', '-5.0.2.log'))
} catch (err) {
// file doesn't exist. continue
}
}
}
async function runRecovery() {
let batch = []
const summary = {
ignored: 0,
skipped: 0,
deletedUpdatedMongo: 0,
deletedUpdatedRedis: 0,
deletedUpdatedBoth: 0,
deletedIgnored: 0,
updatedMongo: 0,
updatedRedis: 0,
updatedBoth: 0,
}
const processBatchAndLogProgress = async () => {
try {
await BPromise.map(batch, project => processProject(project, summary), {
concurrency: OPTIONS.concurrency,
})
} finally {
console.log(`${summary.updatedRedis} projects updated in Redis`)
console.log(`${summary.updatedMongo} projects updated in Mongo`)
console.log(
`${summary.updatedBoth} projects updated in both Mongo and Redis`
)
console.log(`${summary.ignored} projects had good versions`)
console.log(
`${summary.deletedUpdatedMongo} deleted projects updated in Mongo`
)
console.log(
`${summary.deletedUpdatedRedis} deleted projects updated in Redis`
)
console.log(
`${summary.deletedUpdatedBoth} deleted projects updated in both Mongo and Redis`
)
console.log(
`${summary.deletedIgnored} deleted projects had good versions`
)
console.log(`${summary.skipped} projects skipped`)
}
batch = []
}
await printDBStats()
await initResyncsNeededFile()
for await (const project of getProjects()) {
batch.push(project)
if (batch.length >= BATCH_SIZE) {
await processBatchAndLogProgress()
}
}
for await (const deletedProject of getDeletedProjects()) {
const project = deletedProject.project
project.isDeleted = true
batch.push(project)
if (batch.length >= BATCH_SIZE) {
await processBatchAndLogProgress()
}
}
if (batch.length > 0) {
await processBatchAndLogProgress()
}
await backfillMissingVersions()
}
async function getUnflushedDocIds() {
const batchSize = 1000
let cursor = '0'
do {
const [newCursor, keys] = await rclient.scan(
cursor,
'MATCH',
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: '*' }),
'COUNT',
batchSize
)
for (const key of keys) {
unflushedDocIds.add(key.slice('DocVersion:'.length))
}
cursor = newCursor
} while (cursor !== '0')
}
async function printDBStats() {
const projects = await db.projects.estimatedDocumentCount()
const deletedProjects = await db.deletedProjects.countDocuments()
const docs = await db.docs.estimatedDocumentCount()
console.log(
`Need to check ${projects} projects and up to ${deletedProjects} deleted projects with a total of ${docs} docs.`
)
}
async function initResyncsNeededFile() {
const logPath = OPTIONS['resyncs-needed-file']
if (logPath) {
await fsPromises.writeFile(logPath, '')
await fsPromises.rm(`${logPath}.done`, { force: true })
}
}
function getProjects() {
return db.projects.find({}, { projection: { _id: 1, overleaf: 1 } })
}
function getDeletedProjects() {
return db.deletedProjects.find(
{ 'project.overleaf.history.id': { $exists: true } },
{ projection: { 'project._id': 1, 'project.overleaf': 1 } }
)
}
async function processProject(project, summary) {
const projectId = project._id.toString()
let updatedMongo = false
let updatedRedis = false
try {
const historyDocVersions = await getHistoryDocVersions(project)
for (const { docId, version } of historyDocVersions) {
const update = await fixDocVersion(docId, version)
if (update != null) {
if (update.in === 'mongo') {
updatedMongo = true
} else if (update.in === 'redis') {
updatedRedis = true
}
}
}
if (project.isDeleted) {
if (updatedMongo && updatedRedis) {
summary.deletedUpdatedBoth += 1
} else if (updatedMongo) {
summary.deletedUpdatedMongo += 1
} else if (updatedRedis) {
summary.deletedUpdatedRedis += 1
} else {
summary.deletedIgnored += 1
}
} else {
await recordProjectNeedsResync(projectId)
if (updatedMongo && updatedRedis) {
summary.updatedBoth += 1
} else if (updatedMongo) {
summary.updatedMongo += 1
} else if (updatedRedis) {
summary.updatedRedis += 1
} else {
summary.ignored += 1
}
}
} catch (err) {
logger.error({ err, projectId }, 'Failed to process project')
if (OPTIONS['skip-history-failures']) {
summary.skipped += 1
} else {
throw err
}
}
}
async function getHistoryDocVersions(project) {
const historyId = project.overleaf.history.id
const chunk = await chunkStore.loadLatest(historyId)
if (chunk == null) {
return []
}
const snapshot = chunk.getSnapshot()
const changes = chunk.getChanges()
snapshot.applyAll(changes)
const v2DocVersions = snapshot.getV2DocVersions()
if (v2DocVersions == null) {
return []
}
return Object.entries(v2DocVersions.data).map(([docId, versionInfo]) => ({
docId,
version: versionInfo.v,
}))
}
async function fixDocVersion(docId, historyVersion) {
const redisVersion = await getRedisDocVersion(docId)
if (redisVersion != null && historyVersion >= redisVersion) {
await setRedisDocVersion(docId, historyVersion + 1)
return {
in: 'redis',
previousVersion: redisVersion,
newVersion: historyVersion + 1,
}
} else {
const docBeforeUpdate = await db.docs.findOneAndUpdate(
{
_id: new ObjectId(docId),
$or: [
{ version: { $lte: historyVersion } },
{ version: { $exists: false } },
],
},
{ $set: { version: historyVersion + 1 } },
{ projection: { _id: 1, version: 1 } }
)
if (docBeforeUpdate != null) {
return {
in: 'mongo',
previousVersion: docBeforeUpdate.version,
newVersion: historyVersion + 1,
}
} else {
return null
}
}
}
async function getRedisDocVersion(docId) {
if (!unflushedDocIds.has(docId)) {
return null
}
const result = await rclient.get(
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId })
)
if (result == null) {
return null
}
return parseInt(result, 10)
}
async function setRedisDocVersion(docId, version) {
const multi = rclient.multi()
multi.set(
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId }),
version
)
multi.set(`UnflushedTime:{${docId}}`, Date.now(), 'NX')
await multi.exec()
}
/**
* Set all remaining versions to 0
*/
async function backfillMissingVersions() {
console.log('Defaulting version to 0 for remaining docs.')
await db.docs.updateMany(
{ version: { $exists: false } },
{ $set: { version: 0 } }
)
}
main()
.finally(async () => {
console.log('Flushing log queue.')
await flushLogQueue()
})
.then(() => {
process.exit(0)
})
.catch(err => {
console.error(err)
process.exit(1)
})

View File

@@ -0,0 +1,255 @@
/**
* Try to recover a zip of the latest version of a project using only data in
* GCS, where this data may have been (recently) hard deleted (i.e. may exist
* wholly or in part as non-current versions). This should be able to
* retrieve the latest content of a project up to 180 days after it was
* deleted.
*
* Usage:
* node recover_zip.js [--verbose] <HISTORY_ID> <HISTORY_ID> ...
*
* Output:
* Signed URL(s) for the uploaded zip files. Note that these are valid for
* only 24h, to match the lifecycle rule on the zip bucket.
*/
const fs = require('node:fs')
const os = require('node:os')
const path = require('node:path')
const util = require('node:util')
// Something is registering 11 listeners, over the limit of 10, which generates
// a lot of warning noise.
require('node:events').EventEmitter.defaultMaxListeners = 11
const config = require('config')
// We depend on this via object-persistor.
// eslint-disable-next-line import/no-extraneous-dependencies
const { Storage } = require('@google-cloud/storage')
const isValidUtf8 = require('utf-8-validate')
const core = require('overleaf-editor-core')
const projectKey = require('../lib/project_key')
const streams = require('../lib/streams')
const ProjectArchive = require('../lib/project_archive')
const {
values: { verbose: VERBOSE },
positionals: HISTORY_IDS,
} = util.parseArgs({
options: {
verbose: {
type: 'boolean',
default: false,
},
},
allowPositionals: true,
})
if (HISTORY_IDS.length === 0) {
console.error('no history IDs; see usage')
process.exit(1)
}
async function listDeletedChunks(historyId) {
const bucketName = config.get('chunkStore.bucket')
const storage = new Storage()
const [files] = await storage.bucket(bucketName).getFiles({
prefix: projectKey.format(historyId),
versions: true,
})
return files
}
async function findLatestChunk(historyId) {
const files = await listDeletedChunks(historyId)
if (files.length === 0) return null
files.sort((a, b) => {
if (a.name < b.name) return -1
if (a.name > b.name) return 1
return 0
})
return files[files.length - 1]
}
async function downloadLatestChunk(tmp, historyId) {
const latestChunkFile = await findLatestChunk(historyId)
if (!latestChunkFile) throw new Error('no chunk found to recover')
const destination = path.join(tmp, 'latest.json')
await latestChunkFile.download({ destination })
return destination
}
async function loadHistory(historyPathname) {
const data = await fs.promises.readFile(historyPathname)
const rawHistory = JSON.parse(data)
return core.History.fromRaw(rawHistory)
}
async function loadChunk(historyPathname, blobStore) {
const history = await loadHistory(historyPathname)
const blobHashes = new Set()
history.findBlobHashes(blobHashes)
await blobStore.fetchBlobs(blobHashes)
await history.loadFiles('lazy', blobStore)
return new core.Chunk(history, 0)
}
// TODO: it would be nice to export / expose this from BlobStore;
// currently this is a copy of the method there.
async function getStringLengthOfFile(byteLength, pathname) {
// We have to read the file into memory to get its UTF-8 length, so don't
// bother for files that are too large for us to edit anyway.
if (byteLength > core.Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
return null
}
// We need to check if the file contains non-BMP or null characters
let data = await fs.promises.readFile(pathname)
if (!isValidUtf8(data)) return null
data = data.toString()
if (data.length > core.TextOperation.MAX_STRING_LENGTH) return null
if (core.util.containsNonBmpChars(data)) return null
if (data.indexOf('\x00') !== -1) return null
return data.length
}
class RecoveryBlobStore {
constructor(historyId, tmp) {
this.historyId = historyId
this.tmp = tmp
this.blobs = new Map()
}
async fetchBlobs(blobHashes) {
for await (const blobHash of blobHashes) {
await this.fetchBlob(blobHash)
}
}
async fetchBlob(hash) {
if (this.blobs.has(hash)) return
if (VERBOSE) console.log('fetching blob', hash)
const bucketName = config.get('blobStore.projectBucket')
const storage = new Storage()
const [files] = await storage.bucket(bucketName).getFiles({
prefix: this.makeProjectBlobKey(hash),
versions: true,
})
const destination = this.getBlobPathname(hash)
if (files.length === 0) {
await this.fetchGlobalBlob(hash, destination)
} else if (files.length === 1) {
await files[0].download({ destination })
} else {
throw new Error('Multiple versions of blob ' + hash)
}
this.blobs.set(hash, await this.makeBlob(hash, destination))
}
async fetchGlobalBlob(hash, destination) {
const bucketName = config.get('blobStore.globalBucket')
const storage = new Storage()
const file = storage.bucket(bucketName).file(this.makeGlobalBlobKey(hash))
await file.download({ destination })
}
async makeBlob(hash, pathname) {
const stat = await fs.promises.stat(pathname)
const byteLength = stat.size
const stringLength = await getStringLengthOfFile(byteLength, pathname)
return new core.Blob(hash, byteLength, stringLength)
}
async getString(hash) {
const stream = await this.getStream(hash)
const buffer = await streams.readStreamToBuffer(stream)
return buffer.toString()
}
async getStream(hash) {
return fs.createReadStream(this.getBlobPathname(hash))
}
async getBlob(hash) {
return this.blobs.get(hash)
}
getBlobPathname(hash) {
return path.join(this.tmp, hash)
}
makeGlobalBlobKey(hash) {
return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
}
makeProjectBlobKey(hash) {
return `${projectKey.format(this.historyId)}/${hash.slice(
0,
2
)}/${hash.slice(2)}`
}
}
async function uploadZip(historyId, zipPathname) {
const bucketName = config.get('zipStore.bucket')
const deadline = 24 * 3600 * 1000 // lifecycle limit on the zips bucket
const storage = new Storage()
const destination = `${historyId}-recovered.zip`
await storage.bucket(bucketName).upload(zipPathname, { destination })
const signedUrls = await storage
.bucket(bucketName)
.file(destination)
.getSignedUrl({
version: 'v4',
action: 'read',
expires: Date.now() + deadline,
})
return signedUrls[0]
}
async function restoreProject(historyId) {
const tmp = await fs.promises.mkdtemp(
path.join(os.tmpdir(), historyId.toString())
)
if (VERBOSE) console.log('recovering', historyId, 'in', tmp)
const latestJsonPathname = await downloadLatestChunk(tmp, historyId)
const blobStore = new RecoveryBlobStore(historyId, tmp)
const chunk = await loadChunk(latestJsonPathname, blobStore)
const snapshot = chunk.getSnapshot()
for (const change of chunk.getChanges()) {
change.applyTo(snapshot)
}
if (VERBOSE) console.log('zipping', historyId)
const zipPathname = path.join(tmp, `${historyId}.zip`)
const zipTimeoutMs = 60 * 1000
const archive = new ProjectArchive(snapshot, zipTimeoutMs)
await archive.writeZip(blobStore, zipPathname)
if (VERBOSE) console.log('uploading', historyId)
return await uploadZip(historyId, zipPathname)
}
async function main() {
for (const historyId of HISTORY_IDS) {
const signedUrl = await restoreProject(historyId)
console.log(signedUrl)
}
}
main().catch(console.error)

View File

@@ -0,0 +1,36 @@
import redis from '@overleaf/redis-wrapper'
import config from 'config'
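// Health-check helper for the Redis instances configured for this service.
// Usage: node redis.mjs <db>
// where <db> is one of the keys under the "redis" section of the config
// (the exact names depend on your config files).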
// Get allowed Redis dbs from config
const redisConfig = config.get('redis')
const allowedDbs = Object.keys(redisConfig)
// Get the Redis db name from the command line argument
const db = process.argv[2]
// Validate redis db
if (!allowedDbs.includes(db)) {
if (db) {
console.error('Invalid redis db:', db)
}
console.error(`Usage: node redis.mjs [${allowedDbs.join('|')}]`)
process.exit(1)
}
// Get redis options based on command line argument
const redisOptions = config.get(`redis.${db}`)
console.log('Using redis db:', db)
console.log('REDIS CONFIG', {
...redisOptions,
password: '*'.repeat(redisOptions.password?.length),
})
const rclient = redis.createClient(redisOptions)
try {
await rclient.healthCheck()
console.log('REDIS HEALTHCHECK SUCCEEDED')
} catch (error) {
console.error('REDIS HEALTHCHECK FAILED', error)
} finally {
await rclient.quit()
}

View File

@@ -0,0 +1,104 @@
// @ts-check
import { readFileSync } from 'node:fs'
import commandLineArgs from 'command-line-args'
import { client } from '../lib/mongodb.js'
import {
getBackedUpBlobHashes,
unsetBackedUpBlobHashes,
} from '../lib/backup_store/index.js'
let gracefulShutdownInitiated = false
// Parse command line arguments
const args = commandLineArgs([
{ name: 'input', type: String, alias: 'i', defaultOption: true },
{ name: 'commit', type: Boolean, defaultValue: false },
])
if (!args.input) {
console.error(
'Usage: node remove_backed_up_blobs.mjs --input <csv-file> [--commit]'
)
process.exit(1)
}
if (!args.commit) {
console.log('Running in dry-run mode. Use --commit to apply changes.')
}
// Signal handling
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
console.warn('Graceful shutdown initiated')
gracefulShutdownInitiated = true
}
// Process CSV and remove blobs
async function main() {
const projectBlobs = new Map()
const lines = readFileSync(args.input, 'utf8').split('\n')
const SHA1_HEX_REGEX = /^[a-f0-9]{40}$/
// Skip header
for (const line of lines.slice(1)) {
if (!line.trim() || gracefulShutdownInitiated) break
const [projectId, path] = line.split(',')
const pathParts = path.split('/')
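// Backed-up blob keys are assumed to end with /<first 2 hash chars>/<remaining
// 38 chars>, so the full SHA-1 is reassembled from the 4th and 5th segments.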
const hash = pathParts[3] + pathParts[4]
if (!SHA1_HEX_REGEX.test(hash)) {
console.warn(`Invalid SHA1 hash for project ${projectId}: ${hash}`)
continue
}
if (!projectBlobs.has(projectId)) {
projectBlobs.set(projectId, new Set())
}
projectBlobs.get(projectId).add(hash)
}
// Process each project
for (const [projectId, hashes] of projectBlobs) {
if (gracefulShutdownInitiated) break
if (!args.commit) {
console.log(
`DRY-RUN: would remove ${hashes.size} blobs from project ${projectId}`
)
continue
}
try {
const originalHashes = await getBackedUpBlobHashes(projectId)
if (originalHashes.size === 0) {
continue
}
const result = await unsetBackedUpBlobHashes(
projectId,
Array.from(hashes)
)
if (result) {
console.log(
`Project ${projectId}: want to remove ${hashes.size}, removed ${originalHashes.size - result.blobs.length}, ${result.blobs.length} remaining`
)
}
} catch (err) {
console.error(`Error updating project ${projectId}:`, err)
}
}
}
// Run the script
main()
.catch(err => {
console.error('Fatal error:', err)
process.exitCode = 1
})
.finally(() => {
client
.close()
.catch(err => console.error('Error closing MongoDB connection:', err))
})

View File

@@ -0,0 +1,221 @@
// @ts-check
/**
* This script is used to remove blobs that have been backed up under the project ID
* instead of the history ID (where those are different).
*
* This script reads a CSV file with the following format:
* ```
* project_id,hash
* <mongo ID>,<hash>
* ```
*
* The header row is optional. All rows will be checked for conformance to the format.
*/
import commandLineArgs from 'command-line-args'
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
import { makeProjectKey } from '../lib/blob_store/index.js'
import fs from 'node:fs'
import assert from '../lib/assert.js'
import { client } from '../lib/mongodb.js'
import { verifyBlobs } from '../lib/backupVerifier.mjs'
import { setTimeout } from 'node:timers/promises'
import { getHistoryId } from '../lib/backup_store/index.js'
const argsSchema = [
{
name: 'input',
type: String,
},
{
name: 'commit',
type: Boolean,
},
{
name: 'header',
type: Boolean,
},
{
name: 'force',
type: Boolean,
},
{
name: 'verbose',
type: Boolean,
},
]
const args = commandLineArgs(argsSchema)
async function gracefulClose(code = 0) {
await client.close()
process.exit(code)
}
/**
*
* @param {(value: unknown) => void} fn
* @param {unknown} value
* @return {boolean}
*/
function not(fn, value) {
try {
fn(value)
return false
} catch {
return true
}
}
/**
*
* @param {string} row
* @return {{projectId: string, hash: string}}
*/
function parseCSVRow(row) {
const [projectId, hash] = row.split(',')
assert.mongoId(projectId, `invalid projectId ${projectId}`)
assert.blobHash(hash, `invalid hash ${hash}`)
return { projectId, hash }
}
/**
*
* @param {string} path
* @param {boolean} hasHeader
* @return {AsyncGenerator<{projectId: string, hash: string}, void, *>}
*/
async function* readCSV(path, hasHeader) {
let seenHeader = !hasHeader
let fh
try {
fh = await fs.promises.open(path, 'r')
} catch (error) {
console.error(`Could not open file: ${error}`)
return await gracefulClose(1)
}
for await (const line of fh.readLines()) {
if (!seenHeader) {
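// --header was passed: the first line must not itself look like data (a mongo
// ID plus a blob hash); abort if it does rather than silently skip a real row.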
const [first, second] = line.split(',')
const noDataInHeader =
not(assert.mongoId, first) && not(assert.blobHash, second)
if (!noDataInHeader) {
console.error('Data found in header row')
return await gracefulClose(1)
}
seenHeader = true
continue
}
try {
yield parseCSVRow(line)
} catch (error) {
console.error(error instanceof Error ? error.message : error)
console.info(`Skipping invalid row: ${line}`)
}
}
}
function usage() {
console.info(
'Usage: remove_blobs_from_backup.mjs --input <path> [--commit] [--header] [--force] [--verbose]'
)
}
if (!args.input) {
console.error('--input was missing')
usage()
await gracefulClose(1)
}
/**
*
* @param {string} projectId
* @param {string} hash
* @return {Promise<void>}
*/
async function deleteBlob(projectId, hash) {
const path = makeProjectKey(projectId, hash)
if (args.commit) {
await backupPersistor.deleteObject(projectBlobsBucket, path)
} else {
console.log(`DELETE: ${path}`)
}
}
/**
*
* @param {string} projectId
* @param {string} hash
* @return {Promise<void>}
*/
async function canDeleteBlob(projectId, hash) {
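// A blob is only considered safe to delete from the project-ID location when
// the project has a distinct (postgres) history ID and the blob verifies as
// backed up under that history ID; --force bypasses these checks.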
let historyId
try {
historyId = await getHistoryId(projectId)
} catch (error) {
if (args.verbose) {
console.error(error)
}
throw new Error(`No history ID found for project ${projectId}, skipping`)
}
if (historyId === projectId) {
throw new Error(
`Project ID and history ID are the same for ${projectId} - use --force to delete anyway`
)
}
// TODO: fix assert.postgresId to handle integers better and then stop coercing to string below
assert.postgresId(
`${historyId}`,
`History ID ${historyId} does not appear to be for a postgres project`
)
try {
await verifyBlobs(`${historyId}`, [hash])
} catch (error) {
if (args.verbose) {
console.error(error)
}
throw new Error(
`Blob ${hash} is not backed up for project ${projectId} - use --force to delete anyway`
)
}
}
if (!args.commit) {
console.log('DRY RUN: provide --commit to perform operations')
}
if (args.force) {
console.log(
'WARNING: --force is enabled, blobs will be deleted regardless of backup status'
)
await setTimeout(5_000)
}
let deleted = 0
let errors = 0
for await (const { projectId, hash } of readCSV(args.input, args.header)) {
if (!args.force) {
try {
await canDeleteBlob(projectId, hash)
} catch (error) {
console.error(error instanceof Error ? error.message : error)
continue
}
}
try {
await deleteBlob(projectId, hash)
deleted++
} catch (error) {
errors++
console.error(error)
}
}
console.log(`Deleted: ${deleted}`)
console.log(`Errors: ${errors}`)
await gracefulClose()

Some files were not shown because too many files have changed in this diff.