first commit
This commit is contained in:
3
services/history-v1/.gitignore
vendored
Normal file
3
services/history-v1/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
|
||||
# managed by monorepo$ bin/update_build_scripts
|
||||
.npmrc
|
3
services/history-v1/.mocharc.json
Normal file
3
services/history-v1/.mocharc.json
Normal file
@@ -0,0 +1,3 @@
|
||||
{
|
||||
"require": "test/setup.js"
|
||||
}
|
1
services/history-v1/.nvmrc
Normal file
1
services/history-v1/.nvmrc
Normal file
@@ -0,0 +1 @@
|
||||
20.18.2
|
32
services/history-v1/Dockerfile
Normal file
32
services/history-v1/Dockerfile
Normal file
@@ -0,0 +1,32 @@
|
||||
# This file was auto-generated, do not edit it directly.
|
||||
# Instead run bin/update_build_scripts from
|
||||
# https://github.com/overleaf/internal/
|
||||
|
||||
FROM node:20.18.2 AS base
|
||||
|
||||
WORKDIR /overleaf/services/history-v1
|
||||
COPY services/history-v1/install_deps.sh /overleaf/services/history-v1/
|
||||
RUN chmod 0755 ./install_deps.sh && ./install_deps.sh
|
||||
|
||||
# Google Cloud Storage needs a writable $HOME/.config for resumable uploads
|
||||
# (see https://googleapis.dev/nodejs/storage/latest/File.html#createWriteStream)
|
||||
RUN mkdir /home/node/.config && chown node:node /home/node/.config
|
||||
|
||||
# fs persistor needs a writable folder as a target for the mounted volume
|
||||
RUN mkdir /buckets && chown node:node /buckets
|
||||
|
||||
FROM base AS app
|
||||
|
||||
COPY package.json package-lock.json /overleaf/
|
||||
COPY services/history-v1/package.json /overleaf/services/history-v1/
|
||||
COPY libraries/ /overleaf/libraries/
|
||||
COPY patches/ /overleaf/patches/
|
||||
|
||||
RUN cd /overleaf && npm ci --quiet
|
||||
|
||||
COPY services/history-v1/ /overleaf/services/history-v1/
|
||||
|
||||
FROM app
|
||||
USER node
|
||||
|
||||
CMD ["node", "--expose-gc", "app.js"]
|
156
services/history-v1/Makefile
Normal file
156
services/history-v1/Makefile
Normal file
@@ -0,0 +1,156 @@
|
||||
# This file was auto-generated, do not edit it directly.
|
||||
# Instead run bin/update_build_scripts from
|
||||
# https://github.com/overleaf/internal/
|
||||
|
||||
BUILD_NUMBER ?= local
|
||||
BRANCH_NAME ?= $(shell git rev-parse --abbrev-ref HEAD)
|
||||
PROJECT_NAME = history-v1
|
||||
BUILD_DIR_NAME = $(shell pwd | xargs basename | tr -cd '[a-zA-Z0-9_.\-]')
|
||||
|
||||
DOCKER_COMPOSE_FLAGS ?= -f docker-compose.yml
|
||||
DOCKER_COMPOSE := BUILD_NUMBER=$(BUILD_NUMBER) \
|
||||
BRANCH_NAME=$(BRANCH_NAME) \
|
||||
PROJECT_NAME=$(PROJECT_NAME) \
|
||||
MOCHA_GREP=${MOCHA_GREP} \
|
||||
docker compose ${DOCKER_COMPOSE_FLAGS}
|
||||
|
||||
COMPOSE_PROJECT_NAME_TEST_ACCEPTANCE ?= test_acceptance_$(BUILD_DIR_NAME)
|
||||
DOCKER_COMPOSE_TEST_ACCEPTANCE = \
|
||||
COMPOSE_PROJECT_NAME=$(COMPOSE_PROJECT_NAME_TEST_ACCEPTANCE) $(DOCKER_COMPOSE)
|
||||
|
||||
COMPOSE_PROJECT_NAME_TEST_UNIT ?= test_unit_$(BUILD_DIR_NAME)
|
||||
DOCKER_COMPOSE_TEST_UNIT = \
|
||||
COMPOSE_PROJECT_NAME=$(COMPOSE_PROJECT_NAME_TEST_UNIT) $(DOCKER_COMPOSE)
|
||||
|
||||
clean:
|
||||
-docker rmi ci/$(PROJECT_NAME):$(BRANCH_NAME)-$(BUILD_NUMBER)
|
||||
-docker rmi us-east1-docker.pkg.dev/overleaf-ops/ol-docker/$(PROJECT_NAME):$(BRANCH_NAME)-$(BUILD_NUMBER)
|
||||
-$(DOCKER_COMPOSE_TEST_UNIT) down --rmi local
|
||||
-$(DOCKER_COMPOSE_TEST_ACCEPTANCE) down --rmi local
|
||||
|
||||
HERE=$(shell pwd)
|
||||
MONOREPO=$(shell cd ../../ && pwd)
|
||||
# Run the linting commands in the scope of the monorepo.
|
||||
# Eslint and prettier (plus some configs) are on the root.
|
||||
RUN_LINTING = docker run --rm -v $(MONOREPO):$(MONOREPO) -w $(HERE) node:20.18.2 npm run --silent
|
||||
|
||||
RUN_LINTING_CI = docker run --rm --volume $(MONOREPO)/.editorconfig:/overleaf/.editorconfig --volume $(MONOREPO)/.eslintignore:/overleaf/.eslintignore --volume $(MONOREPO)/.eslintrc:/overleaf/.eslintrc --volume $(MONOREPO)/.prettierignore:/overleaf/.prettierignore --volume $(MONOREPO)/.prettierrc:/overleaf/.prettierrc --volume $(MONOREPO)/tsconfig.backend.json:/overleaf/tsconfig.backend.json ci/$(PROJECT_NAME):$(BRANCH_NAME)-$(BUILD_NUMBER) npm run --silent
|
||||
|
||||
# Same but from the top of the monorepo
|
||||
RUN_LINTING_MONOREPO = docker run --rm -v $(MONOREPO):$(MONOREPO) -w $(MONOREPO) node:20.18.2 npm run --silent
|
||||
|
||||
SHELLCHECK_OPTS = \
|
||||
--shell=bash \
|
||||
--external-sources
|
||||
SHELLCHECK_COLOR := $(if $(CI),--color=never,--color)
|
||||
SHELLCHECK_FILES := { git ls-files "*.sh" -z; git grep -Plz "\A\#\!.*bash"; } | sort -zu
|
||||
|
||||
shellcheck:
|
||||
@$(SHELLCHECK_FILES) | xargs -0 -r docker run --rm -v $(HERE):/mnt -w /mnt \
|
||||
koalaman/shellcheck:stable $(SHELLCHECK_OPTS) $(SHELLCHECK_COLOR)
|
||||
|
||||
shellcheck_fix:
|
||||
@$(SHELLCHECK_FILES) | while IFS= read -r -d '' file; do \
|
||||
diff=$$(docker run --rm -v $(HERE):/mnt -w /mnt koalaman/shellcheck:stable $(SHELLCHECK_OPTS) --format=diff "$$file" 2>/dev/null); \
|
||||
if [ -n "$$diff" ] && ! echo "$$diff" | patch -p1 >/dev/null 2>&1; then echo "\033[31m$$file\033[0m"; \
|
||||
elif [ -n "$$diff" ]; then echo "$$file"; \
|
||||
else echo "\033[2m$$file\033[0m"; fi \
|
||||
done
|
||||
|
||||
format:
|
||||
$(RUN_LINTING) format
|
||||
|
||||
format_ci:
|
||||
$(RUN_LINTING_CI) format
|
||||
|
||||
format_fix:
|
||||
$(RUN_LINTING) format:fix
|
||||
|
||||
lint:
|
||||
$(RUN_LINTING) lint
|
||||
|
||||
lint_ci:
|
||||
$(RUN_LINTING_CI) lint
|
||||
|
||||
lint_fix:
|
||||
$(RUN_LINTING) lint:fix
|
||||
|
||||
typecheck:
|
||||
$(RUN_LINTING) types:check
|
||||
|
||||
typecheck_ci:
|
||||
$(RUN_LINTING_CI) types:check
|
||||
|
||||
test: format lint typecheck shellcheck test_unit test_acceptance
|
||||
|
||||
test_unit:
|
||||
ifneq (,$(wildcard test/unit))
|
||||
$(DOCKER_COMPOSE_TEST_UNIT) run --rm test_unit
|
||||
$(MAKE) test_unit_clean
|
||||
endif
|
||||
|
||||
test_clean: test_unit_clean
|
||||
test_unit_clean:
|
||||
ifneq (,$(wildcard test/unit))
|
||||
$(DOCKER_COMPOSE_TEST_UNIT) down -v -t 0
|
||||
endif
|
||||
|
||||
test_acceptance: test_acceptance_clean test_acceptance_pre_run test_acceptance_run
|
||||
$(MAKE) test_acceptance_clean
|
||||
|
||||
test_acceptance_debug: test_acceptance_clean test_acceptance_pre_run test_acceptance_run_debug
|
||||
$(MAKE) test_acceptance_clean
|
||||
|
||||
test_acceptance_run:
|
||||
ifneq (,$(wildcard test/acceptance))
|
||||
$(DOCKER_COMPOSE_TEST_ACCEPTANCE) run --rm test_acceptance
|
||||
endif
|
||||
|
||||
test_acceptance_run_debug:
|
||||
ifneq (,$(wildcard test/acceptance))
|
||||
$(DOCKER_COMPOSE_TEST_ACCEPTANCE) run -p 127.0.0.9:19999:19999 --rm test_acceptance npm run test:acceptance -- --inspect=0.0.0.0:19999 --inspect-brk
|
||||
endif
|
||||
|
||||
test_clean: test_acceptance_clean
|
||||
test_acceptance_clean:
|
||||
$(DOCKER_COMPOSE_TEST_ACCEPTANCE) down -v -t 0
|
||||
|
||||
test_acceptance_pre_run:
|
||||
ifneq (,$(wildcard test/acceptance/js/scripts/pre-run))
|
||||
$(DOCKER_COMPOSE_TEST_ACCEPTANCE) run --rm test_acceptance test/acceptance/js/scripts/pre-run
|
||||
endif
|
||||
|
||||
benchmarks:
|
||||
$(DOCKER_COMPOSE_TEST_ACCEPTANCE) run --rm test_acceptance npm run benchmarks
|
||||
|
||||
build:
|
||||
docker build \
|
||||
--pull \
|
||||
--build-arg BUILDKIT_INLINE_CACHE=1 \
|
||||
--tag ci/$(PROJECT_NAME):$(BRANCH_NAME)-$(BUILD_NUMBER) \
|
||||
--tag us-east1-docker.pkg.dev/overleaf-ops/ol-docker/$(PROJECT_NAME):$(BRANCH_NAME)-$(BUILD_NUMBER) \
|
||||
--tag us-east1-docker.pkg.dev/overleaf-ops/ol-docker/$(PROJECT_NAME):$(BRANCH_NAME) \
|
||||
--cache-from us-east1-docker.pkg.dev/overleaf-ops/ol-docker/$(PROJECT_NAME):$(BRANCH_NAME) \
|
||||
--cache-from us-east1-docker.pkg.dev/overleaf-ops/ol-docker/$(PROJECT_NAME):main \
|
||||
--file Dockerfile \
|
||||
../..
|
||||
|
||||
tar:
|
||||
$(DOCKER_COMPOSE) up tar
|
||||
|
||||
publish:
|
||||
|
||||
docker push $(DOCKER_REPO)/$(PROJECT_NAME):$(BRANCH_NAME)-$(BUILD_NUMBER)
|
||||
|
||||
|
||||
.PHONY: clean \
|
||||
format format_fix \
|
||||
lint lint_fix \
|
||||
build_types typecheck \
|
||||
lint_ci format_ci typecheck_ci \
|
||||
shellcheck shellcheck_fix \
|
||||
test test_clean test_unit test_unit_clean \
|
||||
test_acceptance test_acceptance_debug test_acceptance_pre_run \
|
||||
test_acceptance_run test_acceptance_run_debug test_acceptance_clean \
|
||||
benchmarks \
|
||||
build tar publish \
|
51
services/history-v1/README.md
Normal file
51
services/history-v1/README.md
Normal file
@@ -0,0 +1,51 @@
|
||||
## Database migrations
|
||||
|
||||
The history service uses knex to manage PostgreSQL migrations.
|
||||
|
||||
To create a new migrations, run:
|
||||
```
|
||||
npx knex migrate:make migration_name
|
||||
```
|
||||
|
||||
To apply migrations, run:
|
||||
```
|
||||
npx knex migrate:latest
|
||||
```
|
||||
|
||||
For more information, consult the [knex migrations
|
||||
guide](https://knexjs.org/guide/migrations.html#migration-cli).
|
||||
|
||||
## Global blobs
|
||||
|
||||
Global blobs are blobs that are shared between projects. The list of global
|
||||
blobs is stored in the projectHistoryGlobalBlobs Mongo collection and is read
|
||||
when the service starts. Changing the list of global blobs needs to be done
|
||||
carefully.
|
||||
|
||||
### Adding a blob to the global blobs list
|
||||
|
||||
If we identify a blob that appears in many projects, we might want to move that
|
||||
blob to the global blobs list.
|
||||
|
||||
1. Add a record for the blob to the projectHistoryGlobalBlobs collection.
|
||||
2. Restart the history service.
|
||||
3. Delete any corresponding project blobs.
|
||||
|
||||
### Removing a blob from the global blobs list
|
||||
|
||||
Removing a blob from the global blobs list is trickier. As soon as the global
|
||||
blob is made unavailable, every project that needs the blob will have to get
|
||||
its own copy. To avoid disruptions, follow these steps:
|
||||
|
||||
1. In the projectHistoryGlobalBlobs collection, set the `demoted` property to
|
||||
`false` on the global blob to remove. This will make the history system
|
||||
write new instances of this blob to project blobs, but still read from the
|
||||
global blob.
|
||||
|
||||
2. Restart the history service.
|
||||
|
||||
3. Copy the blob to all projects that need it.
|
||||
|
||||
4. Remove the blob from the projectHistoryGlobalBlobs collection.
|
||||
|
||||
5. Restart the history service.
|
149
services/history-v1/api/app/security.js
Normal file
149
services/history-v1/api/app/security.js
Normal file
@@ -0,0 +1,149 @@
|
||||
'use strict'
|
||||
|
||||
const basicAuth = require('basic-auth')
|
||||
const config = require('config')
|
||||
const HTTPStatus = require('http-status')
|
||||
const jwt = require('jsonwebtoken')
|
||||
const tsscmp = require('tsscmp')
|
||||
|
||||
function setupBasicHttpAuthForSwaggerDocs(app) {
|
||||
app.use('/docs', function (req, res, next) {
|
||||
if (hasValidBasicAuthCredentials(req)) {
|
||||
return next()
|
||||
}
|
||||
|
||||
res.header('WWW-Authenticate', 'Basic realm="Application"')
|
||||
res.status(HTTPStatus.UNAUTHORIZED).end()
|
||||
})
|
||||
}
|
||||
|
||||
exports.setupBasicHttpAuthForSwaggerDocs = setupBasicHttpAuthForSwaggerDocs
|
||||
|
||||
function hasValidBasicAuthCredentials(req) {
|
||||
const credentials = basicAuth(req)
|
||||
if (!credentials) return false
|
||||
|
||||
// No security in the name, so just use straight comparison.
|
||||
if (credentials.name !== 'staging') return false
|
||||
|
||||
const password = config.get('basicHttpAuth.password')
|
||||
if (password && tsscmp(credentials.pass, password)) return true
|
||||
|
||||
// Support an old password so we can change the password without downtime.
|
||||
if (config.has('basicHttpAuth.oldPassword')) {
|
||||
const oldPassword = config.get('basicHttpAuth.oldPassword')
|
||||
if (oldPassword && tsscmp(credentials.pass, oldPassword)) return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
function setupSSL(app) {
|
||||
const httpsOnly = config.get('httpsOnly') === 'true'
|
||||
if (!httpsOnly) {
|
||||
return
|
||||
}
|
||||
app.enable('trust proxy')
|
||||
app.use(function (req, res, next) {
|
||||
if (req.protocol === 'https') {
|
||||
next()
|
||||
return
|
||||
}
|
||||
if (req.method === 'GET' || req.method === 'HEAD') {
|
||||
res.redirect('https://' + req.headers.host + req.url)
|
||||
} else {
|
||||
res
|
||||
.status(HTTPStatus.FORBIDDEN)
|
||||
.send('Please use HTTPS when submitting data to this server.')
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
exports.setupSSL = setupSSL
|
||||
|
||||
function handleJWTAuth(req, authOrSecDef, scopesOrApiKey, next) {
|
||||
// as a temporary solution, to make the OT demo still work
|
||||
// this handler will also check for basic authorization
|
||||
if (hasValidBasicAuthCredentials(req)) {
|
||||
return next()
|
||||
}
|
||||
let token, err
|
||||
if (authOrSecDef.name === 'token') {
|
||||
token = req.query.token
|
||||
} else if (
|
||||
req.headers.authorization &&
|
||||
req.headers.authorization.split(' ')[0] === 'Bearer'
|
||||
) {
|
||||
token = req.headers.authorization.split(' ')[1]
|
||||
}
|
||||
if (!token) {
|
||||
err = new Error('jwt missing')
|
||||
err.statusCode = HTTPStatus.UNAUTHORIZED
|
||||
err.headers = { 'WWW-Authenticate': 'Bearer' }
|
||||
return next(err)
|
||||
}
|
||||
let decoded
|
||||
try {
|
||||
decoded = decodeJWT(token)
|
||||
} catch (error) {
|
||||
if (
|
||||
error instanceof jwt.JsonWebTokenError ||
|
||||
error instanceof jwt.TokenExpiredError
|
||||
) {
|
||||
err = new Error(error.message)
|
||||
err.statusCode = HTTPStatus.UNAUTHORIZED
|
||||
err.headers = { 'WWW-Authenticate': 'Bearer error="invalid_token"' }
|
||||
return next(err)
|
||||
}
|
||||
throw error
|
||||
}
|
||||
if (decoded.project_id.toString() !== req.swagger.params.project_id.value) {
|
||||
err = new Error('Wrong project_id')
|
||||
err.statusCode = HTTPStatus.FORBIDDEN
|
||||
return next(err)
|
||||
}
|
||||
next()
|
||||
}
|
||||
|
||||
exports.hasValidBasicAuthCredentials = hasValidBasicAuthCredentials
|
||||
|
||||
/**
|
||||
* Verify and decode the given JSON Web Token
|
||||
*/
|
||||
function decodeJWT(token) {
|
||||
const key = config.get('jwtAuth.key')
|
||||
const algorithm = config.get('jwtAuth.algorithm')
|
||||
try {
|
||||
return jwt.verify(token, key, { algorithms: [algorithm] })
|
||||
} catch (err) {
|
||||
// Support an old key so we can change the key without downtime.
|
||||
if (config.has('jwtAuth.oldKey')) {
|
||||
const oldKey = config.get('jwtAuth.oldKey')
|
||||
return jwt.verify(token, oldKey, { algorithms: [algorithm] })
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
}
|
||||
function handleBasicAuth(req, authOrSecDef, scopesOrApiKey, next) {
|
||||
if (hasValidBasicAuthCredentials(req)) {
|
||||
return next()
|
||||
}
|
||||
const error = new Error()
|
||||
error.statusCode = HTTPStatus.UNAUTHORIZED
|
||||
error.headers = { 'WWW-Authenticate': 'Basic realm="Application"' }
|
||||
return next(error)
|
||||
}
|
||||
|
||||
function getSwaggerHandlers() {
|
||||
const handlers = {}
|
||||
if (!config.has('jwtAuth.key') || !config.has('basicHttpAuth.password')) {
|
||||
throw new Error('missing authentication env vars')
|
||||
}
|
||||
handlers.jwt = handleJWTAuth
|
||||
handlers.basic = handleBasicAuth
|
||||
handlers.token = handleJWTAuth
|
||||
return handlers
|
||||
}
|
||||
|
||||
exports.getSwaggerHandlers = getSwaggerHandlers
|
10
services/history-v1/api/controllers/expressify.js
Normal file
10
services/history-v1/api/controllers/expressify.js
Normal file
@@ -0,0 +1,10 @@
|
||||
/**
|
||||
* Turn an async function into an Express middleware
|
||||
*/
|
||||
function expressify(fn) {
|
||||
return (req, res, next) => {
|
||||
fn(req, res, next).catch(next)
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = expressify
|
23
services/history-v1/api/controllers/health_checks.js
Normal file
23
services/history-v1/api/controllers/health_checks.js
Normal file
@@ -0,0 +1,23 @@
|
||||
const logger = require('@overleaf/logger')
|
||||
const expressify = require('./expressify')
|
||||
const { mongodb } = require('../../storage')
|
||||
|
||||
async function status(req, res) {
|
||||
try {
|
||||
await mongodb.db.command({ ping: 1 })
|
||||
} catch (err) {
|
||||
logger.warn({ err }, 'Lost connection with MongoDB')
|
||||
res.status(500).send('Lost connection with MongoDB')
|
||||
return
|
||||
}
|
||||
res.send('history-v1 is up')
|
||||
}
|
||||
|
||||
function healthCheck(req, res) {
|
||||
res.send('OK')
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
status: expressify(status),
|
||||
healthCheck,
|
||||
}
|
141
services/history-v1/api/controllers/project_import.js
Normal file
141
services/history-v1/api/controllers/project_import.js
Normal file
@@ -0,0 +1,141 @@
|
||||
// @ts-check
|
||||
|
||||
'use strict'
|
||||
|
||||
const { expressify } = require('@overleaf/promise-utils')
|
||||
|
||||
const HTTPStatus = require('http-status')
|
||||
|
||||
const core = require('overleaf-editor-core')
|
||||
const Change = core.Change
|
||||
const Chunk = core.Chunk
|
||||
const File = core.File
|
||||
const FileMap = core.FileMap
|
||||
const Snapshot = core.Snapshot
|
||||
const TextOperation = core.TextOperation
|
||||
|
||||
const logger = require('@overleaf/logger')
|
||||
|
||||
const storage = require('../../storage')
|
||||
const BatchBlobStore = storage.BatchBlobStore
|
||||
const BlobStore = storage.BlobStore
|
||||
const chunkStore = storage.chunkStore
|
||||
const HashCheckBlobStore = storage.HashCheckBlobStore
|
||||
const persistChanges = storage.persistChanges
|
||||
const InvalidChangeError = storage.InvalidChangeError
|
||||
|
||||
const render = require('./render')
|
||||
|
||||
async function importSnapshot(req, res) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const rawSnapshot = req.swagger.params.snapshot.value
|
||||
|
||||
let snapshot
|
||||
|
||||
try {
|
||||
snapshot = Snapshot.fromRaw(rawSnapshot)
|
||||
} catch (err) {
|
||||
return render.unprocessableEntity(res)
|
||||
}
|
||||
|
||||
let historyId
|
||||
try {
|
||||
historyId = await chunkStore.initializeProject(projectId, snapshot)
|
||||
} catch (err) {
|
||||
if (err instanceof chunkStore.AlreadyInitialized) {
|
||||
return render.conflict(res)
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
res.status(HTTPStatus.OK).json({ projectId: historyId })
|
||||
}
|
||||
|
||||
async function importChanges(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const rawChanges = req.swagger.params.changes.value
|
||||
const endVersion = req.swagger.params.end_version.value
|
||||
const returnSnapshot = req.swagger.params.return_snapshot.value || 'none'
|
||||
|
||||
let changes
|
||||
|
||||
try {
|
||||
changes = rawChanges.map(Change.fromRaw)
|
||||
} catch (err) {
|
||||
logger.warn({ err, projectId }, 'failed to parse changes')
|
||||
return render.unprocessableEntity(res)
|
||||
}
|
||||
|
||||
// Set limits to force us to persist all of the changes.
|
||||
const farFuture = new Date()
|
||||
farFuture.setTime(farFuture.getTime() + 7 * 24 * 3600 * 1000)
|
||||
const limits = {
|
||||
maxChanges: 0,
|
||||
minChangeTimestamp: farFuture,
|
||||
maxChangeTimestamp: farFuture,
|
||||
}
|
||||
|
||||
const blobStore = new BlobStore(projectId)
|
||||
const batchBlobStore = new BatchBlobStore(blobStore)
|
||||
const hashCheckBlobStore = new HashCheckBlobStore(blobStore)
|
||||
|
||||
async function loadFiles() {
|
||||
const blobHashes = new Set()
|
||||
for (const change of changes) {
|
||||
// This populates the set blobHashes with blobs referred to in the change
|
||||
change.findBlobHashes(blobHashes)
|
||||
}
|
||||
|
||||
await batchBlobStore.preload(Array.from(blobHashes))
|
||||
|
||||
for (const change of changes) {
|
||||
await change.loadFiles('lazy', batchBlobStore)
|
||||
}
|
||||
}
|
||||
|
||||
async function buildResultSnapshot(resultChunk) {
|
||||
const chunk = resultChunk || (await chunkStore.loadLatest(projectId))
|
||||
const snapshot = chunk.getSnapshot()
|
||||
snapshot.applyAll(chunk.getChanges())
|
||||
const rawSnapshot = await snapshot.store(hashCheckBlobStore)
|
||||
return rawSnapshot
|
||||
}
|
||||
|
||||
await loadFiles()
|
||||
|
||||
let result
|
||||
try {
|
||||
result = await persistChanges(projectId, changes, limits, endVersion)
|
||||
} catch (err) {
|
||||
if (
|
||||
err instanceof Chunk.ConflictingEndVersion ||
|
||||
err instanceof TextOperation.UnprocessableError ||
|
||||
err instanceof File.NotEditableError ||
|
||||
err instanceof FileMap.PathnameError ||
|
||||
err instanceof Snapshot.EditMissingFileError ||
|
||||
err instanceof chunkStore.ChunkVersionConflictError ||
|
||||
err instanceof InvalidChangeError
|
||||
) {
|
||||
// If we failed to apply operations, that's probably because they were
|
||||
// invalid.
|
||||
logger.warn({ err, projectId, endVersion }, 'changes rejected by history')
|
||||
return render.unprocessableEntity(res)
|
||||
} else if (err instanceof Chunk.NotFoundError) {
|
||||
logger.warn({ err, projectId }, 'chunk not found')
|
||||
return render.notFound(res)
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
if (returnSnapshot === 'none') {
|
||||
res.status(HTTPStatus.CREATED).json({})
|
||||
} else {
|
||||
const rawSnapshot = await buildResultSnapshot(result && result.currentChunk)
|
||||
res.status(HTTPStatus.CREATED).json(rawSnapshot)
|
||||
}
|
||||
}
|
||||
|
||||
exports.importSnapshot = expressify(importSnapshot)
|
||||
exports.importChanges = expressify(importChanges)
|
388
services/history-v1/api/controllers/projects.js
Normal file
388
services/history-v1/api/controllers/projects.js
Normal file
@@ -0,0 +1,388 @@
|
||||
'use strict'
|
||||
|
||||
const _ = require('lodash')
|
||||
const Path = require('node:path')
|
||||
const Stream = require('node:stream')
|
||||
const HTTPStatus = require('http-status')
|
||||
const fs = require('node:fs')
|
||||
const { promisify } = require('node:util')
|
||||
const config = require('config')
|
||||
const OError = require('@overleaf/o-error')
|
||||
|
||||
const logger = require('@overleaf/logger')
|
||||
const { Chunk, ChunkResponse, Blob } = require('overleaf-editor-core')
|
||||
const {
|
||||
BlobStore,
|
||||
blobHash,
|
||||
chunkStore,
|
||||
HashCheckBlobStore,
|
||||
ProjectArchive,
|
||||
zipStore,
|
||||
chunkBuffer,
|
||||
} = require('../../storage')
|
||||
|
||||
const render = require('./render')
|
||||
const expressify = require('./expressify')
|
||||
const withTmpDir = require('./with_tmp_dir')
|
||||
const StreamSizeLimit = require('./stream_size_limit')
|
||||
|
||||
const pipeline = promisify(Stream.pipeline)
|
||||
|
||||
async function initializeProject(req, res, next) {
|
||||
let projectId = req.swagger.params.body.value.projectId
|
||||
try {
|
||||
projectId = await chunkStore.initializeProject(projectId)
|
||||
res.status(HTTPStatus.OK).json({ projectId })
|
||||
} catch (err) {
|
||||
if (err instanceof chunkStore.AlreadyInitialized) {
|
||||
render.conflict(res)
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function getLatestContent(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const blobStore = new BlobStore(projectId)
|
||||
const chunk = await chunkBuffer.loadLatest(projectId)
|
||||
const snapshot = chunk.getSnapshot()
|
||||
snapshot.applyAll(chunk.getChanges())
|
||||
await snapshot.loadFiles('eager', blobStore)
|
||||
res.json(snapshot.toRaw())
|
||||
}
|
||||
|
||||
async function getContentAtVersion(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const version = req.swagger.params.version.value
|
||||
const blobStore = new BlobStore(projectId)
|
||||
const snapshot = await getSnapshotAtVersion(projectId, version)
|
||||
await snapshot.loadFiles('eager', blobStore)
|
||||
res.json(snapshot.toRaw())
|
||||
}
|
||||
|
||||
async function getLatestHashedContent(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const blobStore = new HashCheckBlobStore(new BlobStore(projectId))
|
||||
const chunk = await chunkBuffer.loadLatest(projectId)
|
||||
const snapshot = chunk.getSnapshot()
|
||||
snapshot.applyAll(chunk.getChanges())
|
||||
await snapshot.loadFiles('eager', blobStore)
|
||||
const rawSnapshot = await snapshot.store(blobStore)
|
||||
res.json(rawSnapshot)
|
||||
}
|
||||
|
||||
async function getLatestHistory(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
try {
|
||||
const chunk = await chunkBuffer.loadLatest(projectId)
|
||||
const chunkResponse = new ChunkResponse(chunk)
|
||||
res.json(chunkResponse.toRaw())
|
||||
} catch (err) {
|
||||
if (err instanceof Chunk.NotFoundError) {
|
||||
render.notFound(res)
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function getLatestHistoryRaw(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const readOnly = req.swagger.params.readOnly.value
|
||||
try {
|
||||
const { startVersion, endVersion, endTimestamp } =
|
||||
await chunkStore.loadLatestRaw(projectId, { readOnly })
|
||||
res.json({
|
||||
startVersion,
|
||||
endVersion,
|
||||
endTimestamp,
|
||||
})
|
||||
} catch (err) {
|
||||
if (err instanceof Chunk.NotFoundError) {
|
||||
render.notFound(res)
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function getHistory(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const version = req.swagger.params.version.value
|
||||
try {
|
||||
const chunk = await chunkStore.loadAtVersion(projectId, version)
|
||||
const chunkResponse = new ChunkResponse(chunk)
|
||||
res.json(chunkResponse.toRaw())
|
||||
} catch (err) {
|
||||
if (err instanceof Chunk.NotFoundError) {
|
||||
render.notFound(res)
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function getHistoryBefore(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const timestamp = req.swagger.params.timestamp.value
|
||||
try {
|
||||
const chunk = await chunkStore.loadAtTimestamp(projectId, timestamp)
|
||||
const chunkResponse = new ChunkResponse(chunk)
|
||||
res.json(chunkResponse.toRaw())
|
||||
} catch (err) {
|
||||
if (err instanceof Chunk.NotFoundError) {
|
||||
render.notFound(res)
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all changes since the beginning of history or since a given version
|
||||
*/
|
||||
async function getChanges(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const since = req.swagger.params.since.value ?? 0
|
||||
|
||||
if (since < 0) {
|
||||
// Negative values would cause an infinite loop
|
||||
return res.status(400).json({
|
||||
error: `Version out of bounds: ${since}`,
|
||||
})
|
||||
}
|
||||
|
||||
const changes = []
|
||||
let chunk = await chunkBuffer.loadLatest(projectId)
|
||||
|
||||
if (since > chunk.getEndVersion()) {
|
||||
return res.status(400).json({
|
||||
error: `Version out of bounds: ${since}`,
|
||||
})
|
||||
}
|
||||
|
||||
// Fetch all chunks that come after the chunk that contains the start version
|
||||
while (chunk.getStartVersion() > since) {
|
||||
const changesInChunk = chunk.getChanges()
|
||||
changes.unshift(...changesInChunk)
|
||||
chunk = await chunkStore.loadAtVersion(projectId, chunk.getStartVersion())
|
||||
}
|
||||
|
||||
// Extract the relevant changes from the chunk that contains the start version
|
||||
const changesInChunk = chunk
|
||||
.getChanges()
|
||||
.slice(since - chunk.getStartVersion())
|
||||
changes.unshift(...changesInChunk)
|
||||
|
||||
res.json(changes.map(change => change.toRaw()))
|
||||
}
|
||||
|
||||
async function getZip(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const version = req.swagger.params.version.value
|
||||
const blobStore = new BlobStore(projectId)
|
||||
|
||||
let snapshot
|
||||
try {
|
||||
snapshot = await getSnapshotAtVersion(projectId, version)
|
||||
} catch (err) {
|
||||
if (err instanceof Chunk.NotFoundError) {
|
||||
return render.notFound(res)
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
await withTmpDir('get-zip-', async tmpDir => {
|
||||
const tmpFilename = Path.join(tmpDir, 'project.zip')
|
||||
const archive = new ProjectArchive(snapshot)
|
||||
await archive.writeZip(blobStore, tmpFilename)
|
||||
res.set('Content-Type', 'application/octet-stream')
|
||||
res.set('Content-Disposition', 'attachment; filename=project.zip')
|
||||
const stream = fs.createReadStream(tmpFilename)
|
||||
await pipeline(stream, res)
|
||||
})
|
||||
}
|
||||
|
||||
async function createZip(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const version = req.swagger.params.version.value
|
||||
try {
|
||||
const snapshot = await getSnapshotAtVersion(projectId, version)
|
||||
const zipUrl = await zipStore.getSignedUrl(projectId, version)
|
||||
// Do not await this; run it in the background.
|
||||
zipStore.storeZip(projectId, version, snapshot).catch(err => {
|
||||
logger.error({ err, projectId, version }, 'createZip: storeZip failed')
|
||||
})
|
||||
res.status(HTTPStatus.OK).json({ zipUrl })
|
||||
} catch (error) {
|
||||
if (error instanceof Chunk.NotFoundError) {
|
||||
render.notFound(res)
|
||||
} else {
|
||||
next(error)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function deleteProject(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const blobStore = new BlobStore(projectId)
|
||||
await Promise.all([
|
||||
chunkStore.deleteProjectChunks(projectId),
|
||||
blobStore.deleteBlobs(),
|
||||
])
|
||||
res.status(HTTPStatus.NO_CONTENT).send()
|
||||
}
|
||||
|
||||
async function createProjectBlob(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const expectedHash = req.swagger.params.hash.value
|
||||
const maxUploadSize = parseInt(config.get('maxFileUploadSize'), 10)
|
||||
|
||||
await withTmpDir('blob-', async tmpDir => {
|
||||
const tmpPath = Path.join(tmpDir, 'content')
|
||||
const sizeLimit = new StreamSizeLimit(maxUploadSize)
|
||||
await pipeline(req, sizeLimit, fs.createWriteStream(tmpPath))
|
||||
if (sizeLimit.sizeLimitExceeded) {
|
||||
return render.requestEntityTooLarge(res)
|
||||
}
|
||||
const hash = await blobHash.fromFile(tmpPath)
|
||||
if (hash !== expectedHash) {
|
||||
logger.debug({ hash, expectedHash }, 'Hash mismatch')
|
||||
return render.conflict(res, 'File hash mismatch')
|
||||
}
|
||||
|
||||
const blobStore = new BlobStore(projectId)
|
||||
const newBlob = await blobStore.putFile(tmpPath)
|
||||
|
||||
try {
|
||||
const { backupBlob } = await import('../../storage/lib/backupBlob.mjs')
|
||||
await backupBlob(projectId, newBlob, tmpPath)
|
||||
} catch (error) {
|
||||
logger.warn({ error, projectId, hash }, 'Failed to backup blob')
|
||||
}
|
||||
res.status(HTTPStatus.CREATED).end()
|
||||
})
|
||||
}
|
||||
|
||||
async function headProjectBlob(req, res) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const hash = req.swagger.params.hash.value
|
||||
|
||||
const blobStore = new BlobStore(projectId)
|
||||
const blob = await blobStore.getBlob(hash)
|
||||
if (blob) {
|
||||
res.set('Content-Length', blob.getByteLength())
|
||||
res.status(200).end()
|
||||
} else {
|
||||
res.status(404).end()
|
||||
}
|
||||
}
|
||||
|
||||
// Support simple, singular ranges starting from zero only, up-to 2MB = 2_000_000, 7 digits
|
||||
const RANGE_HEADER = /^bytes=0-(\d{1,7})$/
|
||||
|
||||
/**
|
||||
* @param {string} header
|
||||
* @return {{}|{start: number, end: number}}
|
||||
* @private
|
||||
*/
|
||||
function _getRangeOpts(header) {
|
||||
if (!header) return {}
|
||||
const match = header.match(RANGE_HEADER)
|
||||
if (match) {
|
||||
const end = parseInt(match[1], 10)
|
||||
return { start: 0, end }
|
||||
}
|
||||
return {}
|
||||
}
|
||||
|
||||
async function getProjectBlob(req, res, next) {
|
||||
const projectId = req.swagger.params.project_id.value
|
||||
const hash = req.swagger.params.hash.value
|
||||
const opts = _getRangeOpts(req.swagger.params.range.value || '')
|
||||
|
||||
const blobStore = new BlobStore(projectId)
|
||||
logger.debug({ projectId, hash }, 'getProjectBlob started')
|
||||
try {
|
||||
let stream
|
||||
try {
|
||||
stream = await blobStore.getStream(hash, opts)
|
||||
} catch (err) {
|
||||
if (err instanceof Blob.NotFoundError) {
|
||||
logger.warn({ projectId, hash }, 'Blob not found')
|
||||
return res.status(404).end()
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
res.set('Content-Type', 'application/octet-stream')
|
||||
try {
|
||||
await pipeline(stream, res)
|
||||
} catch (err) {
|
||||
if (err?.code === 'ERR_STREAM_PREMATURE_CLOSE') {
|
||||
res.end()
|
||||
} else {
|
||||
throw OError.tag(err, 'error transferring stream', { projectId, hash })
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
logger.debug({ projectId, hash }, 'getProjectBlob finished')
|
||||
}
|
||||
}
|
||||
|
||||
async function copyProjectBlob(req, res, next) {
|
||||
const sourceProjectId = req.swagger.params.copyFrom.value
|
||||
const targetProjectId = req.swagger.params.project_id.value
|
||||
const blobHash = req.swagger.params.hash.value
|
||||
// Check that blob exists in source project
|
||||
const sourceBlobStore = new BlobStore(sourceProjectId)
|
||||
const targetBlobStore = new BlobStore(targetProjectId)
|
||||
const [sourceBlob, targetBlob] = await Promise.all([
|
||||
sourceBlobStore.getBlob(blobHash),
|
||||
targetBlobStore.getBlob(blobHash),
|
||||
])
|
||||
if (!sourceBlob) {
|
||||
return render.notFound(res)
|
||||
}
|
||||
// Exit early if the blob exists in the target project.
|
||||
// This will also catch global blobs, which always exist.
|
||||
if (targetBlob) {
|
||||
return res.status(HTTPStatus.NO_CONTENT).end()
|
||||
}
|
||||
// Otherwise, copy blob from source project to target project
|
||||
await sourceBlobStore.copyBlob(sourceBlob, targetProjectId)
|
||||
res.status(HTTPStatus.CREATED).end()
|
||||
}
|
||||
|
||||
async function getSnapshotAtVersion(projectId, version) {
|
||||
const chunk = await chunkStore.loadAtVersion(projectId, version)
|
||||
const snapshot = chunk.getSnapshot()
|
||||
const changes = _.dropRight(
|
||||
chunk.getChanges(),
|
||||
chunk.getEndVersion() - version
|
||||
)
|
||||
snapshot.applyAll(changes)
|
||||
return snapshot
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
initializeProject: expressify(initializeProject),
|
||||
getLatestContent: expressify(getLatestContent),
|
||||
getContentAtVersion: expressify(getContentAtVersion),
|
||||
getLatestHashedContent: expressify(getLatestHashedContent),
|
||||
getLatestPersistedHistory: expressify(getLatestHistory),
|
||||
getLatestHistory: expressify(getLatestHistory),
|
||||
getLatestHistoryRaw: expressify(getLatestHistoryRaw),
|
||||
getHistory: expressify(getHistory),
|
||||
getHistoryBefore: expressify(getHistoryBefore),
|
||||
getChanges: expressify(getChanges),
|
||||
getZip: expressify(getZip),
|
||||
createZip: expressify(createZip),
|
||||
deleteProject: expressify(deleteProject),
|
||||
createProjectBlob: expressify(createProjectBlob),
|
||||
getProjectBlob: expressify(getProjectBlob),
|
||||
headProjectBlob: expressify(headProjectBlob),
|
||||
copyProjectBlob: expressify(copyProjectBlob),
|
||||
}
|
17
services/history-v1/api/controllers/render.js
Normal file
17
services/history-v1/api/controllers/render.js
Normal file
@@ -0,0 +1,17 @@
|
||||
'use strict'
|
||||
|
||||
const HTTPStatus = require('http-status')
|
||||
|
||||
function makeErrorRenderer(status) {
|
||||
return (res, message) => {
|
||||
res.status(status).json({ message: message || HTTPStatus[status] })
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
badRequest: makeErrorRenderer(HTTPStatus.BAD_REQUEST),
|
||||
notFound: makeErrorRenderer(HTTPStatus.NOT_FOUND),
|
||||
unprocessableEntity: makeErrorRenderer(HTTPStatus.UNPROCESSABLE_ENTITY),
|
||||
conflict: makeErrorRenderer(HTTPStatus.CONFLICT),
|
||||
requestEntityTooLarge: makeErrorRenderer(HTTPStatus.REQUEST_ENTITY_TOO_LARGE),
|
||||
}
|
26
services/history-v1/api/controllers/stream_size_limit.js
Normal file
26
services/history-v1/api/controllers/stream_size_limit.js
Normal file
@@ -0,0 +1,26 @@
|
||||
const stream = require('node:stream')
|
||||
|
||||
/**
|
||||
* Transform stream that stops passing bytes through after some threshold has
|
||||
* been reached.
|
||||
*/
|
||||
class StreamSizeLimit extends stream.Transform {
|
||||
constructor(maxSize) {
|
||||
super()
|
||||
this.maxSize = maxSize
|
||||
this.accumulatedSize = 0
|
||||
this.sizeLimitExceeded = false
|
||||
}
|
||||
|
||||
_transform(chunk, encoding, cb) {
|
||||
this.accumulatedSize += chunk.length
|
||||
if (this.accumulatedSize > this.maxSize) {
|
||||
this.sizeLimitExceeded = true
|
||||
} else {
|
||||
this.push(chunk)
|
||||
}
|
||||
cb()
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = StreamSizeLimit
|
27
services/history-v1/api/controllers/with_tmp_dir.js
Normal file
27
services/history-v1/api/controllers/with_tmp_dir.js
Normal file
@@ -0,0 +1,27 @@
|
||||
const fs = require('node:fs')
|
||||
const fsExtra = require('fs-extra')
|
||||
const logger = require('@overleaf/logger')
|
||||
const os = require('node:os')
|
||||
const path = require('node:path')
|
||||
|
||||
/**
|
||||
* Create a temporary directory before executing a function and cleaning up
|
||||
* after.
|
||||
*
|
||||
* @param {string} prefix - prefix for the temporary directory name
|
||||
* @param {Function} fn - async function to call
|
||||
*/
|
||||
async function withTmpDir(prefix, fn) {
|
||||
const tmpDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), prefix))
|
||||
try {
|
||||
await fn(tmpDir)
|
||||
} finally {
|
||||
fsExtra.remove(tmpDir).catch(err => {
|
||||
if (err.code !== 'ENOENT') {
|
||||
logger.error({ err }, 'failed to delete temporary file')
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = withTmpDir
|
269
services/history-v1/api/swagger/index.js
Normal file
269
services/history-v1/api/swagger/index.js
Normal file
@@ -0,0 +1,269 @@
|
||||
'use strict'
|
||||
|
||||
const _ = require('lodash')
|
||||
const paths = _.reduce(
|
||||
[require('./projects').paths, require('./project_import').paths],
|
||||
_.extend
|
||||
)
|
||||
|
||||
const securityDefinitions = require('./security_definitions')
|
||||
module.exports = {
|
||||
swagger: '2.0',
|
||||
info: {
|
||||
title: 'Overleaf Editor API',
|
||||
description: 'API for the Overleaf editor.',
|
||||
version: '1.0',
|
||||
},
|
||||
produces: ['application/json'],
|
||||
basePath: '/api',
|
||||
paths,
|
||||
securityDefinitions,
|
||||
security: [
|
||||
{
|
||||
jwt: [],
|
||||
},
|
||||
],
|
||||
definitions: {
|
||||
Project: {
|
||||
properties: {
|
||||
projectId: {
|
||||
type: 'string',
|
||||
},
|
||||
},
|
||||
required: ['projectId'],
|
||||
},
|
||||
File: {
|
||||
properties: {
|
||||
hash: {
|
||||
type: 'string',
|
||||
},
|
||||
byteLength: {
|
||||
type: 'integer',
|
||||
},
|
||||
stringLength: {
|
||||
type: 'integer',
|
||||
},
|
||||
},
|
||||
},
|
||||
Label: {
|
||||
properties: {
|
||||
authorId: {
|
||||
type: 'integer',
|
||||
},
|
||||
text: {
|
||||
type: 'string',
|
||||
},
|
||||
timestamp: {
|
||||
type: 'string',
|
||||
},
|
||||
version: {
|
||||
type: 'integer',
|
||||
},
|
||||
},
|
||||
},
|
||||
Chunk: {
|
||||
properties: {
|
||||
history: {
|
||||
$ref: '#/definitions/History',
|
||||
},
|
||||
startVersion: {
|
||||
type: 'number',
|
||||
},
|
||||
},
|
||||
},
|
||||
ChunkResponse: {
|
||||
properties: {
|
||||
chunk: {
|
||||
$ref: '#/definitions/Chunk',
|
||||
},
|
||||
authors: {
|
||||
type: 'array',
|
||||
items: {
|
||||
$ref: '#/definitions/Author',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
ChunkResponseRaw: {
|
||||
properties: {
|
||||
startVersion: {
|
||||
type: 'number',
|
||||
},
|
||||
endVersion: {
|
||||
type: 'number',
|
||||
},
|
||||
endTimestamp: {
|
||||
type: 'string',
|
||||
},
|
||||
},
|
||||
},
|
||||
History: {
|
||||
properties: {
|
||||
snapshot: {
|
||||
$ref: '#/definitions/Snapshot',
|
||||
},
|
||||
changes: {
|
||||
type: 'array',
|
||||
items: {
|
||||
$ref: '#/definitions/Change',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Snapshot: {
|
||||
properties: {
|
||||
files: {
|
||||
type: 'object',
|
||||
additionalProperties: {
|
||||
$ref: '#/definitions/File',
|
||||
},
|
||||
},
|
||||
},
|
||||
required: ['files'],
|
||||
},
|
||||
Change: {
|
||||
properties: {
|
||||
timestamp: {
|
||||
type: 'string',
|
||||
},
|
||||
operations: {
|
||||
type: 'array',
|
||||
items: {
|
||||
$ref: '#/definitions/Operation',
|
||||
},
|
||||
},
|
||||
authors: {
|
||||
type: 'array',
|
||||
items: {
|
||||
type: ['integer', 'null'],
|
||||
},
|
||||
},
|
||||
v2Authors: {
|
||||
type: 'array',
|
||||
items: {
|
||||
type: ['string', 'null'],
|
||||
},
|
||||
},
|
||||
projectVersion: {
|
||||
type: 'string',
|
||||
},
|
||||
v2DocVersions: {
|
||||
type: 'object',
|
||||
additionalProperties: {
|
||||
$ref: '#/definitions/V2DocVersions',
|
||||
},
|
||||
},
|
||||
},
|
||||
required: ['timestamp', 'operations'],
|
||||
},
|
||||
V2DocVersions: {
|
||||
properties: {
|
||||
pathname: {
|
||||
type: 'string',
|
||||
},
|
||||
v: {
|
||||
type: 'integer',
|
||||
},
|
||||
},
|
||||
},
|
||||
ChangeRequest: {
|
||||
properties: {
|
||||
baseVersion: {
|
||||
type: 'integer',
|
||||
},
|
||||
untransformable: {
|
||||
type: 'boolean',
|
||||
},
|
||||
operations: {
|
||||
type: 'array',
|
||||
items: {
|
||||
$ref: '#/definitions/Operation',
|
||||
},
|
||||
},
|
||||
authors: {
|
||||
type: 'array',
|
||||
items: {
|
||||
type: ['integer', 'null'],
|
||||
},
|
||||
},
|
||||
},
|
||||
required: ['baseVersion', 'operations'],
|
||||
},
|
||||
ChangeNote: {
|
||||
properties: {
|
||||
baseVersion: {
|
||||
type: 'integer',
|
||||
},
|
||||
change: {
|
||||
$ref: '#/definitions/Change',
|
||||
},
|
||||
},
|
||||
required: ['baseVersion'],
|
||||
},
|
||||
Operation: {
|
||||
properties: {
|
||||
pathname: {
|
||||
type: 'string',
|
||||
},
|
||||
newPathname: {
|
||||
type: 'string',
|
||||
},
|
||||
blob: {
|
||||
$ref: '#/definitions/Blob',
|
||||
},
|
||||
textOperation: {
|
||||
type: 'array',
|
||||
items: {},
|
||||
},
|
||||
file: {
|
||||
$ref: '#/definitions/File',
|
||||
},
|
||||
},
|
||||
},
|
||||
Error: {
|
||||
properties: {
|
||||
message: {
|
||||
type: 'string',
|
||||
},
|
||||
},
|
||||
required: ['message'],
|
||||
},
|
||||
Blob: {
|
||||
properties: {
|
||||
hash: {
|
||||
type: 'string',
|
||||
},
|
||||
},
|
||||
required: ['hash'],
|
||||
},
|
||||
Author: {
|
||||
properties: {
|
||||
id: {
|
||||
type: 'integer',
|
||||
},
|
||||
email: {
|
||||
type: 'string',
|
||||
},
|
||||
name: {
|
||||
type: 'string',
|
||||
},
|
||||
},
|
||||
required: ['id', 'email', 'name'],
|
||||
},
|
||||
SyncState: {
|
||||
properties: {
|
||||
synced: {
|
||||
type: 'boolean',
|
||||
},
|
||||
},
|
||||
},
|
||||
ZipInfo: {
|
||||
properties: {
|
||||
zipUrl: {
|
||||
type: 'string',
|
||||
},
|
||||
},
|
||||
required: ['zipUrl'],
|
||||
},
|
||||
},
|
||||
}
|
147
services/history-v1/api/swagger/project_import.js
Normal file
147
services/history-v1/api/swagger/project_import.js
Normal file
@@ -0,0 +1,147 @@
|
||||
'use strict'
|
||||
|
||||
const importSnapshot = {
|
||||
'x-swagger-router-controller': 'project_import',
|
||||
operationId: 'importSnapshot',
|
||||
tags: ['ProjectImport'],
|
||||
description: 'Import a snapshot from the current rails app.',
|
||||
consumes: ['application/json'],
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'snapshot',
|
||||
in: 'body',
|
||||
description: 'Snapshot to import.',
|
||||
required: true,
|
||||
schema: {
|
||||
$ref: '#/definitions/Snapshot',
|
||||
},
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Imported',
|
||||
},
|
||||
409: {
|
||||
description: 'Conflict: project already initialized',
|
||||
},
|
||||
404: {
|
||||
description: 'No such project exists',
|
||||
},
|
||||
},
|
||||
security: [
|
||||
{
|
||||
basic: [],
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
const importChanges = {
|
||||
'x-swagger-router-controller': 'project_import',
|
||||
operationId: 'importChanges',
|
||||
tags: ['ProjectImport'],
|
||||
description: 'Import changes for a project from the current rails app.',
|
||||
consumes: ['application/json'],
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'end_version',
|
||||
description: 'end_version of latest persisted chunk',
|
||||
in: 'query',
|
||||
required: true,
|
||||
type: 'number',
|
||||
},
|
||||
{
|
||||
name: 'return_snapshot',
|
||||
description:
|
||||
'optionally, return a snapshot with the latest hashed content',
|
||||
in: 'query',
|
||||
required: false,
|
||||
type: 'string',
|
||||
enum: ['hashed', 'none'],
|
||||
},
|
||||
{
|
||||
name: 'changes',
|
||||
in: 'body',
|
||||
description: 'changes to be imported',
|
||||
required: true,
|
||||
schema: {
|
||||
type: 'array',
|
||||
items: {
|
||||
$ref: '#/definitions/Change',
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
201: {
|
||||
description: 'Created',
|
||||
schema: {
|
||||
$ref: '#/definitions/Snapshot',
|
||||
},
|
||||
},
|
||||
},
|
||||
security: [
|
||||
{
|
||||
basic: [],
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
const getChanges = {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'getChanges',
|
||||
tags: ['Project'],
|
||||
description: 'Get changes applied to a project',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'since',
|
||||
in: 'query',
|
||||
description: 'start version',
|
||||
required: false,
|
||||
type: 'number',
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Success',
|
||||
schema: {
|
||||
type: 'array',
|
||||
items: {
|
||||
$ref: '#/definitions/Change',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
security: [
|
||||
{
|
||||
basic: [],
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
exports.paths = {
|
||||
'/projects/{project_id}/import': { post: importSnapshot },
|
||||
'/projects/{project_id}/legacy_import': { post: importSnapshot },
|
||||
'/projects/{project_id}/changes': { get: getChanges, post: importChanges },
|
||||
'/projects/{project_id}/legacy_changes': { post: importChanges },
|
||||
}
|
588
services/history-v1/api/swagger/projects.js
Normal file
588
services/history-v1/api/swagger/projects.js
Normal file
@@ -0,0 +1,588 @@
|
||||
'use strict'
|
||||
|
||||
const Blob = require('overleaf-editor-core').Blob
|
||||
|
||||
exports.paths = {
|
||||
'/projects': {
|
||||
post: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'initializeProject',
|
||||
tags: ['Project'],
|
||||
description: 'Initialize project.',
|
||||
consumes: ['application/json'],
|
||||
parameters: [
|
||||
{
|
||||
name: 'body',
|
||||
in: 'body',
|
||||
schema: {
|
||||
type: 'object',
|
||||
properties: {
|
||||
projectId: { type: 'string' },
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Initialized',
|
||||
schema: {
|
||||
$ref: '#/definitions/Project',
|
||||
},
|
||||
},
|
||||
},
|
||||
security: [
|
||||
{
|
||||
basic: [],
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
'/projects/{project_id}': {
|
||||
delete: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'deleteProject',
|
||||
tags: ['Project'],
|
||||
description: "Delete a project's history",
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
204: {
|
||||
description: 'Success',
|
||||
},
|
||||
},
|
||||
security: [
|
||||
{
|
||||
basic: [],
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
'/projects/{project_id}/blobs/{hash}': {
|
||||
get: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'getProjectBlob',
|
||||
tags: ['Project'],
|
||||
description: 'Fetch blob content by its project id and hash.',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'hash',
|
||||
in: 'path',
|
||||
description: 'Hexadecimal SHA-1 hash',
|
||||
required: true,
|
||||
type: 'string',
|
||||
pattern: Blob.HEX_HASH_RX_STRING,
|
||||
},
|
||||
{
|
||||
name: 'range',
|
||||
in: 'header',
|
||||
description: 'HTTP Range header',
|
||||
required: false,
|
||||
type: 'string',
|
||||
},
|
||||
],
|
||||
produces: ['application/octet-stream'],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Success',
|
||||
schema: {
|
||||
type: 'file',
|
||||
},
|
||||
},
|
||||
404: {
|
||||
description: 'Not Found',
|
||||
schema: {
|
||||
$ref: '#/definitions/Error',
|
||||
},
|
||||
},
|
||||
},
|
||||
security: [{ jwt: [] }, { token: [] }],
|
||||
},
|
||||
head: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'headProjectBlob',
|
||||
tags: ['Project'],
|
||||
description: 'Fetch blob content-length by its project id and hash.',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'hash',
|
||||
in: 'path',
|
||||
description: 'Hexadecimal SHA-1 hash',
|
||||
required: true,
|
||||
type: 'string',
|
||||
pattern: Blob.HEX_HASH_RX_STRING,
|
||||
},
|
||||
],
|
||||
produces: ['application/octet-stream'],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Success',
|
||||
schema: {
|
||||
type: 'file',
|
||||
},
|
||||
},
|
||||
404: {
|
||||
description: 'Not Found',
|
||||
schema: {
|
||||
$ref: '#/definitions/Error',
|
||||
},
|
||||
},
|
||||
},
|
||||
security: [{ jwt: [] }, { token: [] }],
|
||||
},
|
||||
put: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'createProjectBlob',
|
||||
tags: ['Project'],
|
||||
description:
|
||||
'Create blob to be used in a file addition operation when importing a' +
|
||||
' snapshot or changes',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'hash',
|
||||
in: 'path',
|
||||
description: 'Hexadecimal SHA-1 hash',
|
||||
required: true,
|
||||
type: 'string',
|
||||
pattern: Blob.HEX_HASH_RX_STRING,
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
201: {
|
||||
description: 'Created',
|
||||
},
|
||||
},
|
||||
},
|
||||
post: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'copyProjectBlob',
|
||||
tags: ['Project'],
|
||||
description:
|
||||
'Copies a blob from a source project to a target project when duplicating a project',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'target project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'hash',
|
||||
in: 'path',
|
||||
description: 'Hexadecimal SHA-1 hash',
|
||||
required: true,
|
||||
type: 'string',
|
||||
pattern: Blob.HEX_HASH_RX_STRING,
|
||||
},
|
||||
{
|
||||
name: 'copyFrom',
|
||||
in: 'query',
|
||||
description: 'source project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
201: {
|
||||
description: 'Created',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
'/projects/{project_id}/latest/content': {
|
||||
get: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'getLatestContent',
|
||||
tags: ['Project'],
|
||||
description:
|
||||
'Get full content of the latest version. Text file ' +
|
||||
'content is included, but binary files are just linked by hash.',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Success',
|
||||
schema: {
|
||||
$ref: '#/definitions/Snapshot',
|
||||
},
|
||||
},
|
||||
404: {
|
||||
description: 'Not Found',
|
||||
schema: {
|
||||
$ref: '#/definitions/Error',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
'/projects/{project_id}/latest/hashed_content': {
|
||||
get: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'getLatestHashedContent',
|
||||
tags: ['Project'],
|
||||
description:
|
||||
'Get a snapshot of a project at the latest version ' +
|
||||
'with the hashes for the contents each file',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Success',
|
||||
schema: {
|
||||
$ref: '#/definitions/Snapshot',
|
||||
},
|
||||
},
|
||||
404: {
|
||||
description: 'Not Found',
|
||||
schema: {
|
||||
$ref: '#/definitions/Error',
|
||||
},
|
||||
},
|
||||
},
|
||||
security: [
|
||||
{
|
||||
basic: [],
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
'/projects/{project_id}/latest/history': {
|
||||
get: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'getLatestHistory',
|
||||
tags: ['Project'],
|
||||
description:
|
||||
'Get the latest sequence of changes.' +
|
||||
' TODO probably want a configurable depth.',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Success',
|
||||
schema: {
|
||||
$ref: '#/definitions/ChunkResponse',
|
||||
},
|
||||
},
|
||||
404: {
|
||||
description: 'Not Found',
|
||||
schema: {
|
||||
$ref: '#/definitions/Error',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
'/projects/{project_id}/latest/history/raw': {
|
||||
get: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'getLatestHistoryRaw',
|
||||
tags: ['Project'],
|
||||
description: 'Get the metadata of latest sequence of changes.',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'readOnly',
|
||||
in: 'query',
|
||||
description: 'use read only database connection',
|
||||
required: false,
|
||||
type: 'boolean',
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Success',
|
||||
schema: {
|
||||
$ref: '#/definitions/ChunkResponseRaw',
|
||||
},
|
||||
},
|
||||
404: {
|
||||
description: 'Not Found',
|
||||
schema: {
|
||||
$ref: '#/definitions/Error',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
'/projects/{project_id}/latest/persistedHistory': {
|
||||
get: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'getLatestPersistedHistory',
|
||||
tags: ['Project'],
|
||||
description: 'Get the latest sequence of changes.',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Success',
|
||||
schema: {
|
||||
$ref: '#/definitions/ChunkResponse',
|
||||
},
|
||||
},
|
||||
404: {
|
||||
description: 'Not Found',
|
||||
schema: {
|
||||
$ref: '#/definitions/Error',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
'/projects/{project_id}/versions/{version}/history': {
|
||||
get: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'getHistory',
|
||||
tags: ['Project'],
|
||||
description:
|
||||
'Get the sequence of changes that includes the given version.',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'version',
|
||||
in: 'path',
|
||||
description: 'numeric version',
|
||||
required: true,
|
||||
type: 'number',
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Success',
|
||||
schema: {
|
||||
$ref: '#/definitions/ChunkResponse',
|
||||
},
|
||||
},
|
||||
404: {
|
||||
description: 'Not Found',
|
||||
schema: {
|
||||
$ref: '#/definitions/Error',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
'/projects/{project_id}/versions/{version}/content': {
|
||||
get: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'getContentAtVersion',
|
||||
tags: ['Project'],
|
||||
description: 'Get full content at the given version',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'version',
|
||||
in: 'path',
|
||||
description: 'numeric version',
|
||||
required: true,
|
||||
type: 'number',
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Success',
|
||||
schema: {
|
||||
$ref: '#/definitions/Snapshot',
|
||||
},
|
||||
},
|
||||
404: {
|
||||
description: 'Not Found',
|
||||
schema: {
|
||||
$ref: '#/definitions/Error',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
'/projects/{project_id}/timestamp/{timestamp}/history': {
|
||||
get: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'getHistoryBefore',
|
||||
tags: ['Project'],
|
||||
description:
|
||||
'Get the sequence of changes ' + 'before the given timestamp',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'timestamp',
|
||||
in: 'path',
|
||||
description: 'timestamp',
|
||||
required: true,
|
||||
type: 'string',
|
||||
format: 'date-time',
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'Success',
|
||||
schema: {
|
||||
$ref: '#/definitions/ChunkResponse',
|
||||
},
|
||||
},
|
||||
404: {
|
||||
description: 'Not Found',
|
||||
schema: {
|
||||
$ref: '#/definitions/Error',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
'/projects/{project_id}/version/{version}/zip': {
|
||||
get: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'getZip',
|
||||
tags: ['Project'],
|
||||
description: 'Download zip with project content',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'version',
|
||||
in: 'path',
|
||||
description: 'numeric version',
|
||||
required: true,
|
||||
type: 'number',
|
||||
},
|
||||
],
|
||||
produces: ['application/octet-stream'],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'success',
|
||||
},
|
||||
404: {
|
||||
description: 'not found',
|
||||
},
|
||||
},
|
||||
security: [
|
||||
{
|
||||
token: [],
|
||||
},
|
||||
],
|
||||
},
|
||||
post: {
|
||||
'x-swagger-router-controller': 'projects',
|
||||
operationId: 'createZip',
|
||||
tags: ['Project'],
|
||||
description:
|
||||
'Create a zip file with project content. Returns a link to be polled.',
|
||||
parameters: [
|
||||
{
|
||||
name: 'project_id',
|
||||
in: 'path',
|
||||
description: 'project id',
|
||||
required: true,
|
||||
type: 'string',
|
||||
},
|
||||
{
|
||||
name: 'version',
|
||||
in: 'path',
|
||||
description: 'numeric version',
|
||||
required: true,
|
||||
type: 'number',
|
||||
},
|
||||
],
|
||||
responses: {
|
||||
200: {
|
||||
description: 'success',
|
||||
schema: {
|
||||
$ref: '#/definitions/ZipInfo',
|
||||
},
|
||||
},
|
||||
404: {
|
||||
description: 'not found',
|
||||
},
|
||||
},
|
||||
security: [
|
||||
{
|
||||
basic: [],
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
}
|
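The createZip operation above returns a ZipInfo payload with a link to poll rather than streaming the archive directly. A minimal client-side sketch (Node 20 global fetch), assuming a hypothetical base URL, placeholder basic-auth credentials and an illustrative `zipUrl` field on ZipInfo, none of which are defined in this commit:

// Hedged sketch: create a zip at a version, then poll the returned link.
const BASE_URL = 'http://localhost:3100' // assumed default port from app.js
const AUTH = 'Basic ' + Buffer.from('staging:' + process.env.STAGING_PASSWORD).toString('base64')

async function fetchProjectZip(projectId, version) {
  const created = await fetch(`${BASE_URL}/projects/${projectId}/version/${version}/zip`, {
    method: 'POST',
    headers: { Authorization: AUTH },
  })
  if (!created.ok) throw new Error(`createZip failed with status ${created.status}`)
  const zipInfo = await created.json()
  const zipUrl = zipInfo.zipUrl // assumption: field name is illustrative only
  for (let attempt = 0; attempt < 30; attempt++) {
    const poll = await fetch(zipUrl)
    if (poll.ok) return Buffer.from(await poll.arrayBuffer())
    await new Promise(resolve => setTimeout(resolve, 2000)) // wait before polling again
  }
  throw new Error('zip was not ready in time')
}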
17
services/history-v1/api/swagger/security_definitions.js
Normal file
@@ -0,0 +1,17 @@
|
||||
'use strict'
|
||||
|
||||
module.exports = {
|
||||
jwt: {
|
||||
type: 'apiKey',
|
||||
in: 'header',
|
||||
name: 'authorization',
|
||||
},
|
||||
basic: {
|
||||
type: 'basic',
|
||||
},
|
||||
token: {
|
||||
type: 'apiKey',
|
||||
in: 'query',
|
||||
name: 'token',
|
||||
},
|
||||
}
|
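These definitions are enforced by the handlers returned from security.getSwaggerHandlers() in app.js; the real implementation lives in api/app/security and is not part of this excerpt. A rough sketch of what a matching swagger-tools handler map could look like, with placeholder credential checks (the ZIP_DOWNLOAD_TOKEN variable is hypothetical):

// Hedged sketch only: swagger-tools security handlers receive
// (request, securityDefinition, scopesOrApiKey, callback).
const basicAuth = require('basic-auth') // assumption: a basic-auth parser is available

module.exports = {
  basic(req, authOrSecDef, scopesOrApiKey, callback) {
    const credentials = basicAuth(req)
    const ok = Boolean(credentials) && credentials.pass === process.env.STAGING_PASSWORD
    callback(ok ? null : new Error('unauthorized'))
  },
  token(req, authOrSecDef, apiKey, callback) {
    // For apiKey definitions the extracted key is passed as the third argument.
    callback(apiKey === process.env.ZIP_DOWNLOAD_TOKEN ? null : new Error('unauthorized'))
  },
}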
172
services/history-v1/app.js
Normal file
@@ -0,0 +1,172 @@
|
||||
'use strict'
|
||||
|
||||
/* eslint-disable no-console */
|
||||
|
||||
// Metrics must be initialized before importing anything else
|
||||
require('@overleaf/metrics/initialize')
|
||||
|
||||
const config = require('config')
|
||||
const Events = require('node:events')
|
||||
const BPromise = require('bluebird')
|
||||
const express = require('express')
|
||||
const helmet = require('helmet')
|
||||
const HTTPStatus = require('http-status')
|
||||
const logger = require('@overleaf/logger')
|
||||
const Metrics = require('@overleaf/metrics')
|
||||
const bodyParser = require('body-parser')
|
||||
const swaggerTools = require('swagger-tools')
|
||||
const swaggerDoc = require('./api/swagger')
|
||||
const security = require('./api/app/security')
|
||||
const healthChecks = require('./api/controllers/health_checks')
|
||||
const { mongodb, loadGlobalBlobs } = require('./storage')
|
||||
const path = require('node:path')
|
||||
|
||||
Events.setMaxListeners(20)
|
||||
const app = express()
|
||||
module.exports = app
|
||||
|
||||
logger.initialize('history-v1')
|
||||
Metrics.open_sockets.monitor()
|
||||
Metrics.injectMetricsRoute(app)
|
||||
app.use(Metrics.http.monitor(logger))
|
||||
Metrics.leaked_sockets.monitor(logger)
|
||||
|
||||
// We may have fairly large JSON bodies when receiving large Changes. Clients
|
||||
// may have to handle 413 status codes and try creating files instead of sending
|
||||
// text content in changes.
|
||||
app.use(bodyParser.json({ limit: '6MB' }))
|
||||
app.use(
|
||||
bodyParser.urlencoded({
|
||||
extended: false,
|
||||
})
|
||||
)
|
||||
|
||||
security.setupSSL(app)
|
||||
security.setupBasicHttpAuthForSwaggerDocs(app)
|
||||
|
||||
const HTTP_REQUEST_TIMEOUT = parseInt(config.get('httpRequestTimeout'), 10)
|
||||
app.use(function (req, res, next) {
|
||||
res.setTimeout(HTTP_REQUEST_TIMEOUT)
|
||||
next()
|
||||
})
|
||||
|
||||
app.get('/', function (req, res) {
|
||||
res.send('')
|
||||
})
|
||||
|
||||
app.get('/status', healthChecks.status)
|
||||
app.get('/health_check', healthChecks.healthCheck)
|
||||
|
||||
function setupSwagger() {
|
||||
return new BPromise(function (resolve) {
|
||||
swaggerTools.initializeMiddleware(swaggerDoc, function (middleware) {
|
||||
app.use(middleware.swaggerMetadata())
|
||||
app.use(middleware.swaggerSecurity(security.getSwaggerHandlers()))
|
||||
app.use(middleware.swaggerValidator())
|
||||
app.use(
|
||||
middleware.swaggerRouter({
|
||||
controllers: path.join(__dirname, 'api/controllers'),
|
||||
useStubs: app.get('env') === 'development',
|
||||
})
|
||||
)
|
||||
app.use(middleware.swaggerUi())
|
||||
resolve()
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
function setupErrorHandling() {
|
||||
app.use(function (req, res, next) {
|
||||
const err = new Error('Not Found')
|
||||
err.status = HTTPStatus.NOT_FOUND
|
||||
return next(err)
|
||||
})
|
||||
|
||||
// Handle Swagger errors.
|
||||
app.use(function (err, req, res, next) {
|
||||
const projectId = req.swagger?.params?.project_id?.value
|
||||
if (res.headersSent) {
|
||||
return next(err)
|
||||
}
|
||||
|
||||
if (err.code === 'SCHEMA_VALIDATION_FAILED') {
|
||||
logger.error({ err, projectId }, err.message)
|
||||
return res.status(HTTPStatus.UNPROCESSABLE_ENTITY).json(err.results)
|
||||
}
|
||||
if (err.code === 'INVALID_TYPE' || err.code === 'PATTERN') {
|
||||
logger.error({ err, projectId }, err.message)
|
||||
return res.status(HTTPStatus.UNPROCESSABLE_ENTITY).json({
|
||||
message: 'invalid type: ' + err.paramName,
|
||||
})
|
||||
}
|
||||
if (err.code === 'ENUM_MISMATCH') {
|
||||
return res.status(HTTPStatus.UNPROCESSABLE_ENTITY).json({
|
||||
message: 'invalid enum value: ' + err.paramName,
|
||||
})
|
||||
}
|
||||
if (err.code === 'REQUIRED') {
|
||||
return res.status(HTTPStatus.UNPROCESSABLE_ENTITY).json({
|
||||
message: err.message,
|
||||
})
|
||||
}
|
||||
next(err)
|
||||
})
|
||||
|
||||
app.use(function (err, req, res, next) {
|
||||
const projectId = req.swagger?.params?.project_id?.value
|
||||
logger.error({ err, projectId }, err.message)
|
||||
|
||||
if (res.headersSent) {
|
||||
return next(err)
|
||||
}
|
||||
|
||||
// Handle errors that specify a statusCode. Some come from our code. Some
|
||||
// bubble up from AWS SDK, but they sometimes have the statusCode set to
|
||||
// 200, notably some InternalErrors and TimeoutErrors, so we have to guard
|
||||
// against that. We also check `status`, but `statusCode` is preferred.
|
||||
const statusCode = err.statusCode || err.status
|
||||
if (statusCode && statusCode >= 400 && statusCode < 600) {
|
||||
res.status(statusCode)
|
||||
} else {
|
||||
res.status(HTTPStatus.INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
|
||||
const sendErrorToClient = app.get('env') === 'development'
|
||||
res.json({
|
||||
message: err.message,
|
||||
error: sendErrorToClient ? err : {},
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
app.setup = async function appSetup() {
|
||||
await mongodb.client.connect()
|
||||
logger.info('Connected to MongoDB')
|
||||
await loadGlobalBlobs()
|
||||
logger.info('Global blobs loaded')
|
||||
app.use(helmet())
|
||||
await setupSwagger()
|
||||
setupErrorHandling()
|
||||
}
|
||||
|
||||
async function startApp() {
|
||||
await app.setup()
|
||||
|
||||
const port = parseInt(process.env.PORT, 10) || 3100
|
||||
app.listen(port, err => {
|
||||
if (err) {
|
||||
console.error(err)
|
||||
process.exit(1)
|
||||
}
|
||||
Metrics.event_loop.monitor(logger)
|
||||
Metrics.memory.monitor(logger)
|
||||
})
|
||||
}
|
||||
|
||||
// Run this if we're called directly
|
||||
if (!module.parent) {
|
||||
startApp().catch(err => {
|
||||
console.error(err)
|
||||
process.exit(1)
|
||||
})
|
||||
}
|
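As a smoke test, the exported app can be booted on an ephemeral port and probed via /status. This is a hedged sketch using only Node built-ins; it assumes MongoDB and the configured buckets are reachable, since app.setup() connects to them:

// Hedged sketch: boot the app and hit /status.
const http = require('node:http')
const app = require('./app')

async function smokeTest() {
  await app.setup()
  const server = app.listen(0) // 0 = pick an ephemeral port
  const { port } = server.address()
  const statusCode = await new Promise((resolve, reject) => {
    http
      .get(`http://127.0.0.1:${port}/status`, res => {
        res.resume() // drain the body so the socket is released
        resolve(res.statusCode)
      })
      .on('error', reject)
  })
  console.log('GET /status ->', statusCode)
  server.close()
}

smokeTest().catch(err => {
  console.error(err)
  process.exit(1)
})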
81
services/history-v1/backup-deletion-app.mjs
Normal file
@@ -0,0 +1,81 @@
|
||||
// @ts-check
|
||||
// Metrics must be initialized before importing anything else
|
||||
import '@overleaf/metrics/initialize.js'
|
||||
import http from 'node:http'
|
||||
import { fileURLToPath } from 'node:url'
|
||||
import { promisify } from 'node:util'
|
||||
import express from 'express'
|
||||
import logger from '@overleaf/logger'
|
||||
import Metrics from '@overleaf/metrics'
|
||||
import { hasValidBasicAuthCredentials } from './api/app/security.js'
|
||||
import {
|
||||
deleteProjectBackupCb,
|
||||
healthCheck,
|
||||
healthCheckCb,
|
||||
NotReadyToDelete,
|
||||
} from './storage/lib/backupDeletion.mjs'
|
||||
import { mongodb } from './storage/index.js'
|
||||
|
||||
const app = express()
|
||||
|
||||
logger.initialize('history-v1-backup-deletion')
|
||||
Metrics.open_sockets.monitor()
|
||||
Metrics.injectMetricsRoute(app)
|
||||
app.use(Metrics.http.monitor(logger))
|
||||
Metrics.leaked_sockets.monitor(logger)
|
||||
Metrics.event_loop.monitor(logger)
|
||||
Metrics.memory.monitor(logger)
|
||||
|
||||
function basicAuth(req, res, next) {
|
||||
if (hasValidBasicAuthCredentials(req)) return next()
|
||||
res.setHeader('WWW-Authenticate', 'Basic realm="Application"')
|
||||
res.sendStatus(401)
|
||||
}
|
||||
|
||||
app.delete('/project/:projectId/backup', basicAuth, (req, res, next) => {
|
||||
deleteProjectBackupCb(req.params.projectId, err => {
|
||||
if (err) {
|
||||
return next(err)
|
||||
}
|
||||
res.sendStatus(204)
|
||||
})
|
||||
})
|
||||
|
||||
app.get('/status', (req, res) => {
|
||||
res.send('history-v1-backup-deletion is up')
|
||||
})
|
||||
|
||||
app.get('/health_check', (req, res, next) => {
|
||||
healthCheckCb(err => {
|
||||
if (err) return next(err)
|
||||
res.sendStatus(200)
|
||||
})
|
||||
})
|
||||
|
||||
app.use((err, req, res, next) => {
|
||||
req.logger.addFields({ err })
|
||||
if (err instanceof NotReadyToDelete) {
|
||||
req.logger.setLevel('warn')
|
||||
return res.status(422).send(err.message)
|
||||
}
|
||||
req.logger.setLevel('error')
|
||||
next(err)
|
||||
})
|
||||
|
||||
/**
|
||||
* @param {number} port
|
||||
* @return {Promise<http.Server>}
|
||||
*/
|
||||
export async function startApp(port) {
|
||||
await mongodb.client.connect()
|
||||
await healthCheck()
|
||||
const server = http.createServer(app)
|
||||
await promisify(server.listen.bind(server, port))()
|
||||
return server
|
||||
}
|
||||
|
||||
// Run this if we're called directly
|
||||
if (process.argv[1] === fileURLToPath(import.meta.url)) {
|
||||
const PORT = parseInt(process.env.PORT || '3101', 10)
|
||||
await startApp(PORT)
|
||||
}
|
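Usage sketch for the deletion endpoint above (Node 20 global fetch); the host, port, project id and credentials are placeholders:

// Hedged sketch: request deletion of a project backup over basic auth.
const auth = 'Basic ' + Buffer.from('staging:' + process.env.STAGING_PASSWORD).toString('base64')

const response = await fetch('http://localhost:3101/project/507f1f77bcf86cd799439011/backup', {
  method: 'DELETE',
  headers: { Authorization: auth },
})
// Expected statuses: 204 deleted, 401 bad credentials, 422 NotReadyToDelete.
console.log('delete backup ->', response.status)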
117
services/history-v1/backup-verifier-app.mjs
Normal file
@@ -0,0 +1,117 @@
|
||||
// @ts-check
|
||||
// Metrics must be initialized before importing anything else
|
||||
import '@overleaf/metrics/initialize.js'
|
||||
import http from 'node:http'
|
||||
import { fileURLToPath } from 'node:url'
|
||||
import { promisify } from 'node:util'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
import express from 'express'
|
||||
import logger from '@overleaf/logger'
|
||||
import Metrics from '@overleaf/metrics'
|
||||
import { healthCheck } from './backupVerifier/healthCheck.mjs'
|
||||
import {
|
||||
BackupCorruptedError,
|
||||
verifyBlob,
|
||||
} from './storage/lib/backupVerifier.mjs'
|
||||
import { mongodb } from './storage/index.js'
|
||||
import { expressify } from '@overleaf/promise-utils'
|
||||
import { Blob } from 'overleaf-editor-core'
|
||||
import { loadGlobalBlobs } from './storage/lib/blob_store/index.js'
|
||||
import { EventEmitter } from 'node:events'
|
||||
import {
|
||||
loopRandomProjects,
|
||||
setWriteMetrics,
|
||||
} from './backupVerifier/ProjectVerifier.mjs'
|
||||
|
||||
const app = express()
|
||||
|
||||
logger.initialize('history-v1-backup-verifier')
|
||||
Metrics.open_sockets.monitor()
|
||||
Metrics.injectMetricsRoute(app)
|
||||
app.use(Metrics.http.monitor(logger))
|
||||
Metrics.leaked_sockets.monitor(logger)
|
||||
Metrics.event_loop.monitor(logger)
|
||||
Metrics.memory.monitor(logger)
|
||||
|
||||
app.get(
|
||||
'/history/:historyId/blob/:hash/verify',
|
||||
expressify(async (req, res) => {
|
||||
const { historyId, hash } = req.params
|
||||
try {
|
||||
await verifyBlob(historyId, hash)
|
||||
res.sendStatus(200)
|
||||
} catch (err) {
|
||||
logger.warn({ err, historyId, hash }, 'manual verify blob failed')
|
||||
if (err instanceof Blob.NotFoundError) {
|
||||
res.status(404).send(err.message)
|
||||
} else if (err instanceof BackupCorruptedError) {
|
||||
res.status(422).send(err.message)
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
})
|
||||
)
|
||||
|
||||
app.get('/status', (req, res) => {
|
||||
res.send('history-v1-backup-verifier is up')
|
||||
})
|
||||
|
||||
app.get(
|
||||
'/health_check',
|
||||
expressify(async (req, res) => {
|
||||
await healthCheck()
|
||||
res.sendStatus(200)
|
||||
})
|
||||
)
|
||||
|
||||
app.use((err, req, res, next) => {
|
||||
req.logger.addFields({ err })
|
||||
req.logger.setLevel('error')
|
||||
next(err)
|
||||
})
|
||||
|
||||
const shutdownEmitter = new EventEmitter()
|
||||
|
||||
shutdownEmitter.once('shutdown', async code => {
|
||||
logger.info({ code }, 'shutting down')
|
||||
await mongodb.client.close()
|
||||
await setTimeout(100)
|
||||
process.exit(code)
|
||||
})
|
||||
|
||||
process.on('SIGTERM', () => {
|
||||
shutdownEmitter.emit('shutdown', 0)
|
||||
})
|
||||
|
||||
process.on('SIGINT', () => {
|
||||
shutdownEmitter.emit('shutdown', 0)
|
||||
})
|
||||
|
||||
/**
|
||||
* @param {number} port
|
||||
* @param {boolean} enableVerificationLoop
|
||||
* @return {Promise<http.Server>}
|
||||
*/
|
||||
export async function startApp(port, enableVerificationLoop = true) {
|
||||
await mongodb.client.connect()
|
||||
await loadGlobalBlobs()
|
||||
await healthCheck()
|
||||
const server = http.createServer(app)
|
||||
await promisify(server.listen.bind(server, port))()
|
||||
enableVerificationLoop && loopRandomProjects(shutdownEmitter)
|
||||
return server
|
||||
}
|
||||
|
||||
setWriteMetrics(true)
|
||||
|
||||
// Run this if we're called directly
|
||||
if (process.argv[1] === fileURLToPath(import.meta.url)) {
|
||||
const PORT = parseInt(process.env.PORT || '3102', 10)
|
||||
try {
|
||||
await startApp(PORT)
|
||||
} catch (error) {
|
||||
shutdownEmitter.emit('shutdown', 1)
|
||||
logger.error({ error }, 'error starting app')
|
||||
}
|
||||
}
|
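A request sketch for the manual blob verification route; the history id and hash below are the health-check fixtures from config/test.json and the port is the default used above:

// Hedged sketch: verify one blob and map the status codes handled above.
const res = await fetch(
  'http://localhost:3102/history/000000000000000000000042/blob/98d5521fe746bc2d11761edab5d0829bee286009/verify'
)
if (res.status === 200) console.log('blob backup verified')
else if (res.status === 404) console.log('blob not found:', await res.text())
else if (res.status === 422) console.log('backup corrupted:', await res.text())
else console.log('unexpected status', res.status)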
70
services/history-v1/backup-worker-app.mjs
Normal file
@@ -0,0 +1,70 @@
|
||||
// @ts-check
|
||||
// Metrics must be initialized before importing anything else
|
||||
import '@overleaf/metrics/initialize.js'
|
||||
import http from 'node:http'
|
||||
import { fileURLToPath } from 'node:url'
|
||||
import { promisify } from 'node:util'
|
||||
import express from 'express'
|
||||
import logger from '@overleaf/logger'
|
||||
import Metrics from '@overleaf/metrics'
|
||||
import { expressify } from '@overleaf/promise-utils'
|
||||
import { drainQueue, healthCheck } from './storage/scripts/backup_worker.mjs'
|
||||
const app = express()
|
||||
|
||||
logger.initialize('history-v1-backup-worker')
|
||||
Metrics.open_sockets.monitor()
|
||||
Metrics.injectMetricsRoute(app)
|
||||
app.use(Metrics.http.monitor(logger))
|
||||
Metrics.leaked_sockets.monitor(logger)
|
||||
Metrics.event_loop.monitor(logger)
|
||||
Metrics.memory.monitor(logger)
|
||||
|
||||
app.get('/status', (req, res) => {
|
||||
res.send('history-v1-backup-worker is up')
|
||||
})
|
||||
|
||||
app.get(
|
||||
'/health_check',
|
||||
expressify(async (req, res) => {
|
||||
await healthCheck()
|
||||
res.sendStatus(200)
|
||||
})
|
||||
)
|
||||
|
||||
app.use((err, req, res, next) => {
|
||||
req.logger.addFields({ err })
|
||||
req.logger.setLevel('error')
|
||||
next(err)
|
||||
})
|
||||
|
||||
async function triggerGracefulShutdown(server, signal) {
|
||||
logger.info({ signal }, 'graceful shutdown: started shutdown sequence')
|
||||
await drainQueue()
|
||||
server.close(function () {
|
||||
logger.info({ signal }, 'graceful shutdown: closed server')
|
||||
setTimeout(() => {
|
||||
process.exit(0)
|
||||
}, 1000)
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {number} port
|
||||
* @return {Promise<http.Server>}
|
||||
*/
|
||||
export async function startApp(port) {
|
||||
await healthCheck()
|
||||
const server = http.createServer(app)
|
||||
await promisify(server.listen.bind(server, port))()
|
||||
const signals = ['SIGINT', 'SIGTERM']
|
||||
signals.forEach(signal => {
|
||||
process.on(signal, () => triggerGracefulShutdown(server, signal))
|
||||
})
|
||||
return server
|
||||
}
|
||||
|
||||
// Run this if we're called directly
|
||||
if (process.argv[1] === fileURLToPath(import.meta.url)) {
|
||||
const PORT = parseInt(process.env.PORT || '3103', 10)
|
||||
await startApp(PORT)
|
||||
}
|
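The worker can also be started programmatically, for example from a test harness; a minimal sketch using the exported startApp:

// Hedged sketch: start the worker on an ephemeral port. startApp registers
// SIGINT/SIGTERM handlers that drain the queue before exiting.
import { startApp } from './backup-worker-app.mjs'

const server = await startApp(0) // 0 = let the OS choose a port
console.log('backup worker listening on port', server.address().port)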
33
services/history-v1/backupVerifier/ProjectMetrics.mjs
Normal file
@@ -0,0 +1,33 @@
|
||||
import Metrics from '@overleaf/metrics'
|
||||
import { objectIdFromDate } from './utils.mjs'
|
||||
import { db } from '../storage/lib/mongodb.js'
|
||||
|
||||
const projectsCollection = db.collection('projects')
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {Date} beforeTime
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
export async function measurePendingChangesBeforeTime(beforeTime) {
|
||||
const pendingChangeCount = await projectsCollection.countDocuments({
|
||||
'overleaf.backup.pendingChangeAt': {
|
||||
$lt: beforeTime,
|
||||
},
|
||||
})
|
||||
|
||||
Metrics.gauge('backup_verification_pending_changes', pendingChangeCount)
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {Date} graceTime
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
export async function measureNeverBackedUpProjects(graceTime) {
|
||||
const neverBackedUpCount = await projectsCollection.countDocuments({
|
||||
'overleaf.backup.lastBackedUpVersion': null,
|
||||
_id: { $lt: objectIdFromDate(graceTime) },
|
||||
})
|
||||
Metrics.gauge('backup_verification_never_backed_up', neverBackedUpCount)
|
||||
}
|
79
services/history-v1/backupVerifier/ProjectSampler.mjs
Normal file
@@ -0,0 +1,79 @@
|
||||
// @ts-check
|
||||
import { objectIdFromDate } from './utils.mjs'
|
||||
import { db } from '../storage/lib/mongodb.js'
|
||||
import config from 'config'
|
||||
|
||||
const projectsCollection = db.collection('projects')
|
||||
|
||||
const HAS_PROJECTS_WITHOUT_HISTORY =
|
||||
config.get('hasProjectsWithoutHistory') === 'true'
|
||||
|
||||
/**
|
||||
* @param {Date} start
|
||||
* @param {Date} end
|
||||
* @param {number} N
|
||||
* @yields {string}
|
||||
*/
|
||||
export async function* getProjectsCreatedInDateRangeCursor(start, end, N) {
|
||||
yield* getSampleProjectsCursor(N, [
|
||||
{
|
||||
$match: {
|
||||
_id: {
|
||||
$gt: objectIdFromDate(start),
|
||||
$lte: objectIdFromDate(end),
|
||||
},
|
||||
},
|
||||
},
|
||||
])
|
||||
}
|
||||
|
||||
export async function* getProjectsUpdatedInDateRangeCursor(start, end, N) {
|
||||
yield* getSampleProjectsCursor(N, [
|
||||
{
|
||||
$match: {
|
||||
'overleaf.history.updatedAt': {
|
||||
$gt: start,
|
||||
$lte: end,
|
||||
},
|
||||
},
|
||||
},
|
||||
])
|
||||
}
|
||||
|
||||
/**
|
||||
* @typedef {import('mongodb').Document} Document
|
||||
*/
|
||||
|
||||
/**
|
||||
*
|
||||
* @generator
|
||||
* @param {number} N
|
||||
* @param {Array<Document>} preSampleAggregationStages
|
||||
* @yields {string}
|
||||
*/
|
||||
export async function* getSampleProjectsCursor(
|
||||
N,
|
||||
preSampleAggregationStages = []
|
||||
) {
|
||||
const cursor = projectsCollection.aggregate([
|
||||
...preSampleAggregationStages,
|
||||
{ $sample: { size: N } },
|
||||
{ $project: { 'overleaf.history.id': 1 } },
|
||||
])
|
||||
|
||||
let validProjects = 0
|
||||
let hasInvalidProject = false
|
||||
|
||||
for await (const project of cursor) {
|
||||
if (HAS_PROJECTS_WITHOUT_HISTORY && !project.overleaf?.history?.id) {
|
||||
hasInvalidProject = true
|
||||
continue
|
||||
}
|
||||
validProjects++
|
||||
yield project.overleaf.history.id.toString()
|
||||
}
|
||||
|
||||
if (validProjects === 0 && hasInvalidProject) {
|
||||
yield* getSampleProjectsCursor(N, preSampleAggregationStages)
|
||||
}
|
||||
}
|
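The samplers are async generators that yield history ids as strings, so callers consume them with for await; a short sketch with placeholder dates:

// Hedged sketch: sample up to 10 projects created in January 2024.
import { getProjectsCreatedInDateRangeCursor } from './ProjectSampler.mjs'

const start = new Date('2024-01-01T00:00:00Z')
const end = new Date('2024-02-01T00:00:00Z')
for await (const historyId of getProjectsCreatedInDateRangeCursor(start, end, 10)) {
  console.log('sampled history id:', historyId)
}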
320
services/history-v1/backupVerifier/ProjectVerifier.mjs
Normal file
@@ -0,0 +1,320 @@
|
||||
// @ts-check
|
||||
import { verifyProjectWithErrorContext } from '../storage/lib/backupVerifier.mjs'
|
||||
import { promiseMapSettledWithLimit } from '@overleaf/promise-utils'
|
||||
import logger from '@overleaf/logger'
|
||||
import metrics from '@overleaf/metrics'
|
||||
import {
|
||||
getSampleProjectsCursor,
|
||||
getProjectsCreatedInDateRangeCursor,
|
||||
getProjectsUpdatedInDateRangeCursor,
|
||||
} from './ProjectSampler.mjs'
|
||||
import OError from '@overleaf/o-error'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
|
||||
const MS_PER_30_DAYS = 30 * 24 * 60 * 60 * 1000
|
||||
|
||||
const failureCounter = new metrics.prom.Counter({
|
||||
name: 'backup_project_verification_failed',
|
||||
help: 'Number of projects that failed verification',
|
||||
labelNames: ['name'],
|
||||
})
|
||||
|
||||
const successCounter = new metrics.prom.Counter({
|
||||
name: 'backup_project_verification_succeeded',
|
||||
help: 'Number of projects that succeeded verification',
|
||||
})
|
||||
|
||||
let WRITE_METRICS = false
|
||||
|
||||
/**
|
||||
* @typedef {import('node:events').EventEmitter} EventEmitter
|
||||
*/
|
||||
|
||||
/**
|
||||
* Allows writing metrics to be enabled or disabled.
|
||||
* @param {Boolean} writeMetrics
|
||||
*/
|
||||
export function setWriteMetrics(writeMetrics) {
|
||||
WRITE_METRICS = writeMetrics
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {Error|unknown} error
|
||||
* @param {string} historyId
|
||||
*/
|
||||
function handleVerificationError(error, historyId) {
|
||||
const name = error instanceof Error ? error.name : 'UnknownError'
|
||||
logger.error({ historyId, error, name }, 'error verifying project backup')
|
||||
|
||||
WRITE_METRICS && failureCounter.inc({ name })
|
||||
|
||||
return name
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {Date} startDate
|
||||
* @param {Date} endDate
|
||||
* @param {number} interval
|
||||
* @returns {Array<VerificationJobSpecification>}
|
||||
*/
|
||||
function splitJobs(startDate, endDate, interval) {
|
||||
/** @type {Array<VerificationJobSpecification>} */
|
||||
const jobs = []
|
||||
while (startDate < endDate) {
|
||||
const nextStart = new Date(
|
||||
Math.min(startDate.getTime() + interval, endDate.getTime())
|
||||
)
|
||||
jobs.push({ startDate, endDate: nextStart })
|
||||
startDate = nextStart
|
||||
}
|
||||
return jobs
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {AsyncGenerator<string>} historyIdCursor
|
||||
* @param {EventEmitter} [eventEmitter]
|
||||
* @param {number} [delay] - Allows a delay between each verification
|
||||
* @return {Promise<{verified: number, total: number, errorTypes: *[], hasFailure: boolean}>}
|
||||
*/
|
||||
async function verifyProjectsFromCursor(
|
||||
historyIdCursor,
|
||||
eventEmitter,
|
||||
delay = 0
|
||||
) {
|
||||
const errorTypes = []
|
||||
let verified = 0
|
||||
let total = 0
|
||||
let receivedShutdownSignal = false
|
||||
if (eventEmitter) {
|
||||
eventEmitter.once('shutdown', () => {
|
||||
receivedShutdownSignal = true
|
||||
})
|
||||
}
|
||||
for await (const historyId of historyIdCursor) {
|
||||
if (receivedShutdownSignal) {
|
||||
break
|
||||
}
|
||||
total++
|
||||
try {
|
||||
await verifyProjectWithErrorContext(historyId)
|
||||
logger.debug({ historyId }, 'verified project backup successfully')
|
||||
WRITE_METRICS && successCounter.inc()
|
||||
verified++
|
||||
} catch (error) {
|
||||
const errorType = handleVerificationError(error, historyId)
|
||||
errorTypes.push(errorType)
|
||||
}
|
||||
if (delay > 0) {
|
||||
await setTimeout(delay)
|
||||
}
|
||||
}
|
||||
return {
|
||||
verified,
|
||||
total,
|
||||
errorTypes,
|
||||
hasFailure: errorTypes.length > 0,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {number} nProjectsToSample
|
||||
* @param {EventEmitter} [signal]
|
||||
* @param {number} [delay]
|
||||
* @return {Promise<VerificationJobStatus>}
|
||||
*/
|
||||
export async function verifyRandomProjectSample(
|
||||
nProjectsToSample,
|
||||
signal,
|
||||
delay = 0
|
||||
) {
|
||||
const historyIds = await getSampleProjectsCursor(nProjectsToSample)
|
||||
return await verifyProjectsFromCursor(historyIds, signal, delay)
|
||||
}
|
||||
|
||||
/**
|
||||
* Samples projects with history IDs between the specified dates and verifies them.
|
||||
*
|
||||
* @param {Date} startDate
|
||||
* @param {Date} endDate
|
||||
* @param {number} projectsPerRange
|
||||
* @param {EventEmitter} [signal]
|
||||
* @return {Promise<VerificationJobStatus>}
|
||||
*/
|
||||
async function verifyRange(startDate, endDate, projectsPerRange, signal) {
|
||||
logger.info({ startDate, endDate }, 'verifying range')
|
||||
|
||||
const results = await verifyProjectsFromCursor(
|
||||
getProjectsCreatedInDateRangeCursor(startDate, endDate, projectsPerRange),
|
||||
signal
|
||||
)
|
||||
|
||||
if (results.total === 0) {
|
||||
logger.debug(
|
||||
{ start: startDate, end: endDate },
|
||||
'No projects found in range'
|
||||
)
|
||||
}
|
||||
|
||||
const jobStatus = {
|
||||
...results,
|
||||
startDate,
|
||||
endDate,
|
||||
}
|
||||
|
||||
logger.debug(
|
||||
{ ...jobStatus, errorTypes: Array.from(new Set(jobStatus.errorTypes)) },
|
||||
'Verified range'
|
||||
)
|
||||
return jobStatus
|
||||
}
|
||||
|
||||
/**
|
||||
* @typedef {Object} VerificationJobSpecification
|
||||
* @property {Date} startDate
|
||||
* @property {Date} endDate
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {import('./types.d.ts').VerificationJobStatus} VerificationJobStatus
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} VerifyDateRangeOptions
|
||||
* @property {Date} startDate
|
||||
* @property {Date} endDate
|
||||
* @property {number} [interval]
|
||||
* @property {number} [projectsPerRange]
|
||||
* @property {number} [concurrency]
|
||||
* @property {EventEmitter} [signal]
|
||||
*/
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {VerifyDateRangeOptions} options
|
||||
* @return {Promise<VerificationJobStatus>}
|
||||
*/
|
||||
export async function verifyProjectsCreatedInDateRange({
|
||||
concurrency = 0,
|
||||
projectsPerRange = 10,
|
||||
startDate,
|
||||
endDate,
|
||||
interval = MS_PER_30_DAYS,
|
||||
signal,
|
||||
}) {
|
||||
const jobs = splitJobs(startDate, endDate, interval)
|
||||
if (jobs.length === 0) {
|
||||
throw new OError('Time range could not be split into jobs', {
|
||||
start: startDate,
|
||||
end: endDate,
|
||||
interval,
|
||||
})
|
||||
}
|
||||
const settlements = await promiseMapSettledWithLimit(
|
||||
concurrency,
|
||||
jobs,
|
||||
({ startDate, endDate }) =>
|
||||
verifyRange(startDate, endDate, projectsPerRange, signal)
|
||||
)
|
||||
return settlements.reduce(
|
||||
/**
|
||||
*
|
||||
* @param {VerificationJobStatus} acc
|
||||
* @param settlement
|
||||
* @return {VerificationJobStatus}
|
||||
*/
|
||||
(acc, settlement) => {
|
||||
if (settlement.status !== 'rejected') {
|
||||
if (settlement.value.hasFailure) {
|
||||
acc.hasFailure = true
|
||||
}
|
||||
acc.total += settlement.value.total
|
||||
acc.verified += settlement.value.verified
|
||||
acc.errorTypes = acc.errorTypes.concat(settlement.value.errorTypes)
|
||||
} else {
|
||||
logger.error({ ...settlement.reason }, 'Error processing range')
|
||||
}
|
||||
return acc
|
||||
},
|
||||
/** @type {VerificationJobStatus} */
|
||||
{
|
||||
startDate,
|
||||
endDate,
|
||||
verified: 0,
|
||||
total: 0,
|
||||
hasFailure: false,
|
||||
errorTypes: [],
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that projects that have recently gone out of RPO have been updated.
|
||||
*
|
||||
* @param {Date} startDate
|
||||
* @param {Date} endDate
|
||||
* @param {number} nProjects
|
||||
* @param {EventEmitter} [signal]
|
||||
* @return {Promise<VerificationJobStatus>}
|
||||
*/
|
||||
export async function verifyProjectsUpdatedInDateRange(
|
||||
startDate,
|
||||
endDate,
|
||||
nProjects,
|
||||
signal
|
||||
) {
|
||||
logger.debug(
|
||||
{ startDate, endDate, nProjects },
|
||||
'Sampling projects updated in date range'
|
||||
)
|
||||
const results = await verifyProjectsFromCursor(
|
||||
getProjectsUpdatedInDateRangeCursor(startDate, endDate, nProjects),
|
||||
signal
|
||||
)
|
||||
|
||||
if (results.total === 0) {
|
||||
logger.debug(
|
||||
{ start: startDate, end: endDate },
|
||||
'No projects updated recently'
|
||||
)
|
||||
}
|
||||
|
||||
const jobStatus = {
|
||||
...results,
|
||||
startDate,
|
||||
endDate,
|
||||
}
|
||||
|
||||
logger.debug(
|
||||
{ ...jobStatus, errorTypes: Array.from(new Set(jobStatus.errorTypes)) },
|
||||
'Verified recently updated projects'
|
||||
)
|
||||
return jobStatus
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {EventEmitter} signal
|
||||
* @return {void}
|
||||
*/
|
||||
export function loopRandomProjects(signal) {
|
||||
let shutdown = false
|
||||
signal.on('shutdown', function () {
|
||||
shutdown = true
|
||||
})
|
||||
async function loop() {
|
||||
do {
|
||||
try {
|
||||
const result = await verifyRandomProjectSample(100, signal, 2_000)
|
||||
logger.debug({ result }, 'verified random project sample')
|
||||
} catch (error) {
|
||||
logger.error({ error }, 'error verifying random project sample')
|
||||
}
|
||||
// eslint-disable-next-line no-unmodified-loop-condition
|
||||
} while (!shutdown)
|
||||
}
|
||||
loop()
|
||||
}
|
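A usage sketch for verifyProjectsCreatedInDateRange: the month below is split into week-long jobs, two of which run concurrently, and the EventEmitter lets a caller request an early stop via a 'shutdown' event (all values are placeholders):

// Hedged sketch: verify projects created in January 2024 in weekly batches.
import { EventEmitter } from 'node:events'
import { verifyProjectsCreatedInDateRange } from './ProjectVerifier.mjs'

const signal = new EventEmitter()
const status = await verifyProjectsCreatedInDateRange({
  startDate: new Date('2024-01-01T00:00:00Z'),
  endDate: new Date('2024-02-01T00:00:00Z'),
  interval: 7 * 24 * 60 * 60 * 1000, // one week per job instead of the 30-day default
  projectsPerRange: 10,
  concurrency: 2,
  signal,
})
console.log(`${status.verified}/${status.total} verified`, Array.from(new Set(status.errorTypes)))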
32
services/history-v1/backupVerifier/healthCheck.mjs
Normal file
@@ -0,0 +1,32 @@
|
||||
import config from 'config'
|
||||
import { verifyProjectWithErrorContext } from '../storage/lib/backupVerifier.mjs'
|
||||
import {
|
||||
measureNeverBackedUpProjects,
|
||||
measurePendingChangesBeforeTime,
|
||||
} from './ProjectMetrics.mjs'
|
||||
import { getEndDateForRPO, RPO } from './utils.mjs'
|
||||
|
||||
/** @type {Array<string>} */
|
||||
const HEALTH_CHECK_PROJECTS = JSON.parse(config.get('healthCheckProjects'))
|
||||
|
||||
export async function healthCheck() {
|
||||
if (!Array.isArray(HEALTH_CHECK_PROJECTS)) {
|
||||
throw new Error('expected healthCheckProjects to be an array')
|
||||
}
|
||||
if (HEALTH_CHECK_PROJECTS.length !== 2) {
|
||||
throw new Error('expected 2 healthCheckProjects')
|
||||
}
|
||||
if (!HEALTH_CHECK_PROJECTS.some(id => id.length === 24)) {
|
||||
throw new Error('expected mongo id in healthCheckProjects')
|
||||
}
|
||||
if (!HEALTH_CHECK_PROJECTS.some(id => id.length < 24)) {
|
||||
throw new Error('expected postgres id in healthCheckProjects')
|
||||
}
|
||||
|
||||
for (const historyId of HEALTH_CHECK_PROJECTS) {
|
||||
await verifyProjectWithErrorContext(historyId)
|
||||
}
|
||||
|
||||
await measurePendingChangesBeforeTime(getEndDateForRPO(2))
|
||||
await measureNeverBackedUpProjects(getEndDateForRPO(2))
|
||||
}
|
8
services/history-v1/backupVerifier/types.d.ts
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
export type VerificationJobStatus = {
|
||||
verified: number
|
||||
total: number
|
||||
startDate?: Date
|
||||
endDate?: Date
|
||||
hasFailure: boolean
|
||||
errorTypes: Array<string>
|
||||
}
|
35
services/history-v1/backupVerifier/utils.mjs
Normal file
@@ -0,0 +1,35 @@
|
||||
import { ObjectId } from 'mongodb'
|
||||
import config from 'config'
|
||||
|
||||
export const RPO = parseInt(config.get('backupRPOInMS'), 10)
|
||||
|
||||
/**
|
||||
* @param {Date} time
|
||||
* @return {ObjectId}
|
||||
*/
|
||||
export function objectIdFromDate(time) {
|
||||
return ObjectId.createFromTime(time.getTime() / 1000)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {number} [factor] - Multiply RPO by this factor, default is 1
|
||||
* @return {Date}
|
||||
*/
|
||||
export function getEndDateForRPO(factor = 1) {
|
||||
return new Date(Date.now() - RPO * factor)
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a startDate, endDate pair that covers a period of time before the RPO horizon
|
||||
*
|
||||
* @param {number} offset - How many seconds we should check
|
||||
* @return {{endDate: Date, startDate: Date}}
|
||||
*/
|
||||
export function getDatesBeforeRPO(offset) {
|
||||
const now = new Date()
|
||||
const endDate = new Date(now.getTime() - RPO)
|
||||
return {
|
||||
endDate,
|
||||
startDate: new Date(endDate.getTime() - offset * 1000),
|
||||
}
|
||||
}
|
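A short sketch showing how these helpers combine: getDatesBeforeRPO builds a window that ends at the RPO horizon, and objectIdFromDate turns its bounds into ObjectId cutoffs for _id range queries:

// Hedged sketch: a 60-second window ending RPO milliseconds ago.
import { getDatesBeforeRPO, objectIdFromDate } from './utils.mjs'

const { startDate, endDate } = getDatesBeforeRPO(60)
console.log('window:', startDate.toISOString(), '->', endDate.toISOString())
console.log('ObjectId cutoff:', objectIdFromDate(endDate).toHexString())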
82
services/history-v1/benchmarks/blob_store.js
Normal file
@@ -0,0 +1,82 @@
|
||||
const crypto = require('node:crypto')
|
||||
const benny = require('benny')
|
||||
const { Blob } = require('overleaf-editor-core')
|
||||
const mongoBackend = require('../storage/lib/blob_store/mongo')
|
||||
const postgresBackend = require('../storage/lib/blob_store/postgres')
|
||||
const cleanup = require('../test/acceptance/js/storage/support/cleanup')
|
||||
|
||||
const MONGO_PROJECT_ID = '637386deb4ce3c62acd3848e'
|
||||
const POSTGRES_PROJECT_ID = '123'
|
||||
|
||||
async function run() {
|
||||
for (const blobCount of [1, 10, 100, 1000, 10000, 100000, 500000]) {
|
||||
await cleanup.everything()
|
||||
const blobs = createBlobs(blobCount)
|
||||
await insertBlobs(blobs)
|
||||
const randomHashes = getRandomHashes(blobs, 100)
|
||||
await benny.suite(
|
||||
`Read a blob in a project with ${blobCount} blobs`,
|
||||
benny.add('Mongo backend', async () => {
|
||||
await mongoBackend.findBlob(MONGO_PROJECT_ID, randomHashes[0])
|
||||
}),
|
||||
benny.add('Postgres backend', async () => {
|
||||
await postgresBackend.findBlob(POSTGRES_PROJECT_ID, randomHashes[0])
|
||||
}),
|
||||
benny.cycle(),
|
||||
benny.complete()
|
||||
)
|
||||
await benny.suite(
|
||||
`Read 100 blobs in a project with ${blobCount} blobs`,
|
||||
benny.add('Mongo backend', async () => {
|
||||
await mongoBackend.findBlobs(MONGO_PROJECT_ID, randomHashes)
|
||||
}),
|
||||
benny.add('Postgres backend', async () => {
|
||||
await postgresBackend.findBlobs(POSTGRES_PROJECT_ID, randomHashes)
|
||||
}),
|
||||
benny.cycle(),
|
||||
benny.complete()
|
||||
)
|
||||
await benny.suite(
|
||||
`Insert a blob in a project with ${blobCount} blobs`,
|
||||
benny.add('Mongo backend', async () => {
|
||||
const [newBlob] = createBlobs(1)
|
||||
await mongoBackend.insertBlob(MONGO_PROJECT_ID, newBlob)
|
||||
}),
|
||||
benny.add('Postgres backend', async () => {
|
||||
const [newBlob] = createBlobs(1)
|
||||
await postgresBackend.insertBlob(POSTGRES_PROJECT_ID, newBlob)
|
||||
}),
|
||||
benny.cycle(),
|
||||
benny.complete()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
function createBlobs(blobCount) {
|
||||
const blobs = []
|
||||
for (let i = 0; i < blobCount; i++) {
|
||||
const hash = crypto.randomBytes(20).toString('hex')
|
||||
blobs.push(new Blob(hash, 42, 42))
|
||||
}
|
||||
return blobs
|
||||
}
|
||||
|
||||
async function insertBlobs(blobs) {
|
||||
for (const blob of blobs) {
|
||||
await Promise.all([
|
||||
mongoBackend.insertBlob(MONGO_PROJECT_ID, blob),
|
||||
postgresBackend.insertBlob(POSTGRES_PROJECT_ID, blob),
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
function getRandomHashes(blobs, count) {
|
||||
const hashes = []
|
||||
for (let i = 0; i < count; i++) {
|
||||
const index = Math.floor(Math.random() * blobs.length)
|
||||
hashes.push(blobs[index].getHash())
|
||||
}
|
||||
return hashes
|
||||
}
|
||||
|
||||
module.exports = run
|
17
services/history-v1/benchmarks/index.js
Normal file
@@ -0,0 +1,17 @@
|
||||
const testSetup = require('../test/setup')
|
||||
const blobStoreSuite = require('./blob_store')
|
||||
|
||||
async function main() {
|
||||
await testSetup.setupPostgresDatabase()
|
||||
await testSetup.createGcsBuckets()
|
||||
await blobStoreSuite()
|
||||
}
|
||||
|
||||
main()
|
||||
.then(() => {
|
||||
process.exit(0)
|
||||
})
|
||||
.catch(err => {
|
||||
console.error(err)
|
||||
process.exit(1)
|
||||
})
|
10
services/history-v1/buildscript.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
history-v1
|
||||
--dependencies=postgres,gcs,mongo,redis,s3
|
||||
--docker-repos=us-east1-docker.pkg.dev/overleaf-ops/ol-docker
|
||||
--env-add=
|
||||
--env-pass-through=
|
||||
--esmock-loader=False
|
||||
--node-version=20.18.2
|
||||
--public-repo=False
|
||||
--script-version=4.7.0
|
||||
--tsconfig-extra-includes=backup-deletion-app.mjs,backup-verifier-app.mjs,backup-worker-app.mjs,api/**/*,migrations/**/*,storage/**/*
|
104
services/history-v1/config/custom-environment-variables.json
Normal file
@@ -0,0 +1,104 @@
|
||||
{
|
||||
"databaseUrl": "HISTORY_CONNECTION_STRING",
|
||||
"databaseUrlReadOnly": "HISTORY_FOLLOWER_CONNECTION_STRING",
|
||||
"herokuDatabaseUrl": "DATABASE_URL",
|
||||
"databasePoolMin": "DATABASE_POOL_MIN",
|
||||
"databasePoolMax": "DATABASE_POOL_MAX",
|
||||
"persistor": {
|
||||
"backend": "PERSISTOR_BACKEND",
|
||||
"s3": {
|
||||
"key": "AWS_ACCESS_KEY_ID",
|
||||
"secret": "AWS_SECRET_ACCESS_KEY",
|
||||
"endpoint": "AWS_S3_ENDPOINT",
|
||||
"pathStyle": "AWS_S3_PATH_STYLE",
|
||||
"maxRetries": "S3_MAX_RETRIES",
|
||||
"httpOptions": {
|
||||
"timeout": "S3_TIMEOUT"
|
||||
}
|
||||
},
|
||||
"gcs": {
|
||||
"deletedBucketSuffix": "GCS_DELETED_BUCKET_SUFFIX",
|
||||
"unlockBeforeDelete": "GCS_UNLOCK_BEFORE_DELETE",
|
||||
"endpoint": {
|
||||
"apiEndpoint": "GCS_API_ENDPOINT",
|
||||
"projectId": "GCS_PROJECT_ID"
|
||||
},
|
||||
"retryOptions": {
|
||||
"maxRetries": "GCS_MAX_RETRIES",
|
||||
"idempotencyStrategy": "GCS_IDEMPOTENCY_STRATEGY"
|
||||
}
|
||||
},
|
||||
"fallback": {
|
||||
"backend": "PERSISTOR_FALLBACK_BACKEND",
|
||||
"buckets": "PERSISTOR_BUCKET_MAPPING"
|
||||
}
|
||||
},
|
||||
"backupPersistor": {
|
||||
"keyEncryptionKeys": "BACKUP_KEY_ENCRYPTION_KEYS",
|
||||
"s3SSEC": {
|
||||
"key": "AWS_ACCESS_KEY_ID",
|
||||
"secret": "AWS_SECRET_ACCESS_KEY",
|
||||
"endpoint": "AWS_S3_ENDPOINT",
|
||||
"pathStyle": "AWS_S3_PATH_STYLE",
|
||||
"maxRetries": "BACKUP_S3_MAX_RETRIES",
|
||||
"httpOptions": {
|
||||
"timeout": "BACKUP_S3_TIMEOUT"
|
||||
}
|
||||
}
|
||||
},
|
||||
"blobStore": {
|
||||
"globalBucket": "OVERLEAF_EDITOR_BLOBS_BUCKET",
|
||||
"projectBucket": "OVERLEAF_EDITOR_PROJECT_BLOBS_BUCKET"
|
||||
},
|
||||
"chunkStore": {
|
||||
"historyStoreConcurrency": "HISTORY_STORE_CONCURRENCY",
|
||||
"bucket": "OVERLEAF_EDITOR_CHUNKS_BUCKET"
|
||||
},
|
||||
"zipStore": {
|
||||
"bucket": "OVERLEAF_EDITOR_ZIPS_BUCKET",
|
||||
"zipTimeoutMs": "ZIP_STORE_ZIP_TIMEOUT_MS"
|
||||
},
|
||||
"backupStore": {
|
||||
"chunksBucket":"BACKUP_OVERLEAF_EDITOR_CHUNKS_BUCKET",
|
||||
"deksBucket":"BACKUP_OVERLEAF_EDITOR_DEKS_BUCKET",
|
||||
"globalBlobsBucket":"BACKUP_OVERLEAF_EDITOR_GLOBAL_BLOBS_BUCKET",
|
||||
"projectBlobsBucket":"BACKUP_OVERLEAF_EDITOR_PROJECT_BLOBS_BUCKET"
|
||||
},
|
||||
"healthCheckBlobs": "HEALTH_CHECK_BLOBS",
|
||||
"healthCheckProjects": "HEALTH_CHECK_PROJECTS",
|
||||
"backupRPOInMS": "BACKUP_RPO_IN_MS",
|
||||
"minSoftDeletionPeriodDays": "MIN_SOFT_DELETION_PERIOD_DAYS",
|
||||
"mongo": {
|
||||
"uri": "MONGO_CONNECTION_STRING"
|
||||
},
|
||||
"basicHttpAuth": {
|
||||
"password": "STAGING_PASSWORD",
|
||||
"oldPassword": "BASIC_HTTP_AUTH_OLD_PASSWORD"
|
||||
},
|
||||
"jwtAuth": {
|
||||
"key": "OT_JWT_AUTH_KEY",
|
||||
"oldKey": "OT_JWT_AUTH_OLD_KEY",
|
||||
"algorithm": "OT_JWT_AUTH_ALG"
|
||||
},
|
||||
"clusterWorkers": "CLUSTER_WORKERS",
|
||||
"maxFileUploadSize": "MAX_FILE_UPLOAD_SIZE",
|
||||
"httpsOnly": "HTTPS_ONLY",
|
||||
"httpRequestTimeout": "HTTP_REQUEST_TIMEOUT",
|
||||
"redis": {
|
||||
"queue": {
|
||||
"host": "QUEUES_REDIS_HOST",
|
||||
"password": "QUEUES_REDIS_PASSWORD",
|
||||
"port": "QUEUES_REDIS_PORT"
|
||||
},
|
||||
"history": {
|
||||
"host": "HISTORY_REDIS_HOST",
|
||||
"password": "HISTORY_REDIS_PASSWORD",
|
||||
"port": "HISTORY_REDIS_PORT"
|
||||
},
|
||||
"lock": {
|
||||
"host": "REDIS_HOST",
|
||||
"password": "REDIS_PASSWORD",
|
||||
"port": "REDIS_PORT"
|
||||
}
|
||||
}
|
||||
}
|
43
services/history-v1/config/default.json
Normal file
@@ -0,0 +1,43 @@
|
||||
{
|
||||
"persistor": {
|
||||
"backend": "s3",
|
||||
"s3": {
|
||||
"signedUrlExpiryInMs": "1800000",
|
||||
"maxRetries": "1",
|
||||
"httpOptions": {
|
||||
"timeout": "8000"
|
||||
}
|
||||
},
|
||||
"gcs": {
|
||||
"signedUrlExpiryInMs": "1800000",
|
||||
"deleteConcurrency": "50"
|
||||
}
|
||||
},
|
||||
"backupPersistor": {
|
||||
"backend": "s3SSEC",
|
||||
"s3SSEC": {
|
||||
"maxRetries": "1",
|
||||
"pathStyle": false,
|
||||
"httpOptions": {
|
||||
"timeout": "120000"
|
||||
}
|
||||
}
|
||||
},
|
||||
"backupRPOInMS": "3600000",
|
||||
"chunkStore": {
|
||||
"historyStoreConcurrency": "4"
|
||||
},
|
||||
"zipStore": {
|
||||
"zipTimeoutMs": "360000"
|
||||
},
|
||||
"hasProjectsWithoutHistory": false,
|
||||
"minSoftDeletionPeriodDays": "90",
|
||||
"maxDeleteKeys": "1000",
|
||||
"useDeleteObjects": "true",
|
||||
"clusterWorkers": "1",
|
||||
"maxFileUploadSize": "52428800",
|
||||
"databasePoolMin": "2",
|
||||
"databasePoolMax": "10",
|
||||
"httpsOnly": "false",
|
||||
"httpRequestTimeout": "300000"
|
||||
}
|
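These defaults are read through the node-config package, which layers default.json, the NODE_ENV-specific file and custom-environment-variables.json; values are stored as strings, so callers such as app.js parse them explicitly. A small sketch of how a value resolves (the override behaviour noted in the comments reflects node-config's documented merge order, not anything defined in this commit):

// Hedged sketch: reading merged configuration values.
const config = require('config')

const requestTimeout = parseInt(config.get('httpRequestTimeout'), 10) // "300000" -> 300000
const persistorBackend = config.get('persistor.backend') // "s3" unless PERSISTOR_BACKEND overrides it
console.log({ requestTimeout, persistorBackend })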
49
services/history-v1/config/development.json
Normal file
@@ -0,0 +1,49 @@
|
||||
{
|
||||
"databaseUrl": "postgres://postgres:postgres@postgres/write_latex_dev",
|
||||
"persistor": {
|
||||
"s3": {
|
||||
"endpoint": "http://s3:8080",
|
||||
"pathStyle": "true"
|
||||
},
|
||||
"gcs": {
|
||||
"unsignedUrls": "true",
|
||||
"endpoint": {
|
||||
"apiEndpoint": "http://fake-gcs:9090",
|
||||
"projectId": "fake"
|
||||
}
|
||||
}
|
||||
},
|
||||
"blobStore": {
|
||||
"globalBucket": "overleaf-development-blobs",
|
||||
"projectBucket": "overleaf-development-project-blobs"
|
||||
},
|
||||
"chunkStore": {
|
||||
"bucket": "overleaf-development-chunks"
|
||||
},
|
||||
"zipStore": {
|
||||
"bucket": "overleaf-development-zips"
|
||||
},
|
||||
"backupStore": {
|
||||
"chunksBucket":"overleaf-development-history-chunks",
|
||||
"deksBucket":"overleaf-development-history-deks",
|
||||
"globalBlobsBucket":"overleaf-development-history-global-blobs",
|
||||
"projectBlobsBucket":"overleaf-development-history-project-blobs"
|
||||
},
|
||||
"backupPersistor": {
|
||||
"keyEncryptionKeys": "[{\"key\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\",\"salt\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\"}]",
|
||||
"s3SSEC": {
|
||||
"ca": "[\"/certs/public.crt\"]"
|
||||
}
|
||||
},
|
||||
"useDeleteObjects": "false",
|
||||
"mongo": {
|
||||
"uri": "mongodb://mongo:27017/sharelatex"
|
||||
},
|
||||
"basicHttpAuth": {
|
||||
"password": "password"
|
||||
},
|
||||
"jwtAuth": {
|
||||
"key": "secureKey",
|
||||
"algorithm": "HS256"
|
||||
}
|
||||
}
|
5
services/history-v1/config/production.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"backupPersistor": {
|
||||
"tieringStorageClass": "INTELLIGENT_TIERING"
|
||||
}
|
||||
}
|
53
services/history-v1/config/test.json
Normal file
@@ -0,0 +1,53 @@
|
||||
{
|
||||
"databaseUrl": "postgres://overleaf:overleaf@postgres/overleaf-history-v1-test",
|
||||
"databaseUrlReadOnly": "postgres://read_only:password@postgres/overleaf-history-v1-test",
|
||||
"persistor": {
|
||||
"backend": "gcs",
|
||||
"gcs": {
|
||||
"unsignedUrls": "true",
|
||||
"endpoint": {
|
||||
"apiEndpoint": "http://gcs:9090",
|
||||
"projectId": "fake"
|
||||
}
|
||||
}
|
||||
},
|
||||
"blobStore": {
|
||||
"globalBucket": "overleaf-test-blobs",
|
||||
"projectBucket": "overleaf-test-project-blobs"
|
||||
},
|
||||
"chunkStore": {
|
||||
"bucket": "overleaf-test-chunks"
|
||||
},
|
||||
"zipStore": {
|
||||
"bucket": "overleaf-test-zips"
|
||||
},
|
||||
"backupStore": {
|
||||
"chunksBucket":"overleaf-test-history-chunks",
|
||||
"deksBucket":"overleaf-test-history-deks",
|
||||
"globalBlobsBucket":"overleaf-test-history-global-blobs",
|
||||
"projectBlobsBucket":"overleaf-test-history-project-blobs"
|
||||
},
|
||||
"backupPersistor": {
|
||||
"keyEncryptionKeys": "[{\"key\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\",\"salt\":\"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\"}]",
|
||||
"s3SSEC": {
|
||||
"ca": "[\"/certs/public.crt\"]"
|
||||
},
|
||||
"tieringStorageClass": "REDUCED_REDUNDANCY"
|
||||
},
|
||||
"healthCheckBlobs": "[\"42/f70d7bba4ae1f07682e0358bd7a2068094fc023b\",\"000000000000000000000042/98d5521fe746bc2d11761edab5d0829bee286009\"]",
|
||||
"healthCheckProjects": "[\"42\",\"000000000000000000000042\"]",
|
||||
"backupRPOInMS": "360000",
|
||||
"maxDeleteKeys": "3",
|
||||
"useDeleteObjects": "false",
|
||||
"mongo": {
|
||||
"uri": "mongodb://mongo:27017/sharelatex"
|
||||
},
|
||||
"basicHttpAuth": {
|
||||
"password": "test"
|
||||
},
|
||||
"jwtAuth": {
|
||||
"key": "testtest",
|
||||
"algorithm": "HS256"
|
||||
},
|
||||
"maxFileUploadSize": "524288"
|
||||
}
|
237
services/history-v1/docker-compose.ci.yml
Normal file
@@ -0,0 +1,237 @@
|
||||
# This file was auto-generated, do not edit it directly.
|
||||
# Instead run bin/update_build_scripts from
|
||||
# https://github.com/overleaf/internal/
|
||||
|
||||
version: "2.3"
|
||||
|
||||
services:
|
||||
test_unit:
|
||||
image: ci/$PROJECT_NAME:$BRANCH_NAME-$BUILD_NUMBER
|
||||
user: node
|
||||
command: npm run test:unit:_run
|
||||
environment:
|
||||
NODE_ENV: test
|
||||
NODE_OPTIONS: "--unhandled-rejections=strict"
|
||||
|
||||
|
||||
test_acceptance:
|
||||
build: .
|
||||
image: ci/$PROJECT_NAME:$BRANCH_NAME-$BUILD_NUMBER
|
||||
environment:
|
||||
ELASTIC_SEARCH_DSN: es:9200
|
||||
REDIS_HOST: redis
|
||||
QUEUES_REDIS_HOST: redis
|
||||
HISTORY_REDIS_HOST: redis
|
||||
ANALYTICS_QUEUES_REDIS_HOST: redis
|
||||
MONGO_HOST: mongo
|
||||
POSTGRES_HOST: postgres
|
||||
AWS_S3_ENDPOINT: https://minio:9000
|
||||
AWS_S3_PATH_STYLE: 'true'
|
||||
AWS_ACCESS_KEY_ID: OVERLEAF_HISTORY_S3_ACCESS_KEY_ID
|
||||
AWS_SECRET_ACCESS_KEY: OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY
|
||||
MINIO_ROOT_USER: MINIO_ROOT_USER
|
||||
MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD
|
||||
GCS_API_ENDPOINT: http://gcs:9090
|
||||
GCS_PROJECT_ID: fake
|
||||
STORAGE_EMULATOR_HOST: http://gcs:9090/storage/v1
|
||||
MOCHA_GREP: ${MOCHA_GREP}
|
||||
NODE_ENV: test
|
||||
NODE_OPTIONS: "--unhandled-rejections=strict"
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/certs
|
||||
depends_on:
|
||||
mongo:
|
||||
condition: service_started
|
||||
redis:
|
||||
condition: service_healthy
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
certs:
|
||||
condition: service_completed_successfully
|
||||
minio:
|
||||
condition: service_started
|
||||
minio_setup:
|
||||
condition: service_completed_successfully
|
||||
gcs:
|
||||
condition: service_healthy
|
||||
user: node
|
||||
command: npm run test:acceptance
|
||||
|
||||
|
||||
tar:
|
||||
build: .
|
||||
image: ci/$PROJECT_NAME:$BRANCH_NAME-$BUILD_NUMBER
|
||||
volumes:
|
||||
- ./:/tmp/build/
|
||||
command: tar -czf /tmp/build/build.tar.gz --exclude=build.tar.gz --exclude-vcs .
|
||||
user: root
|
||||
redis:
|
||||
image: redis
|
||||
healthcheck:
|
||||
test: ping="$$(redis-cli ping)" && [ "$$ping" = 'PONG' ]
|
||||
interval: 1s
|
||||
retries: 20
|
||||
|
||||
mongo:
|
||||
image: mongo:6.0.13
|
||||
command: --replSet overleaf
|
||||
volumes:
|
||||
- ../../bin/shared/mongodb-init-replica-set.js:/docker-entrypoint-initdb.d/mongodb-init-replica-set.js
|
||||
environment:
|
||||
MONGO_INITDB_DATABASE: sharelatex
|
||||
extra_hosts:
|
||||
# Required when using the automatic database setup for initializing the
|
||||
# replica set. This override is not needed when running the setup after
|
||||
# starting up mongo.
|
||||
- mongo:127.0.0.1
|
||||
postgres:
|
||||
image: postgres:10
|
||||
environment:
|
||||
POSTGRES_USER: overleaf
|
||||
POSTGRES_PASSWORD: overleaf
|
||||
POSTGRES_DB: overleaf-history-v1-test
|
||||
volumes:
|
||||
- ./test/acceptance/pg-init/:/docker-entrypoint-initdb.d/
|
||||
healthcheck:
|
||||
test: pg_isready --quiet
|
||||
interval: 1s
|
||||
retries: 20
|
||||
|
||||
certs:
|
||||
image: node:20.18.2
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/certs
|
||||
working_dir: /certs
|
||||
entrypoint: sh
|
||||
command:
|
||||
- '-cex'
|
||||
- |
|
||||
if [ ! -f ./certgen ]; then
|
||||
wget -O ./certgen "https://github.com/minio/certgen/releases/download/v1.3.0/certgen-linux-$(dpkg --print-architecture)"
|
||||
chmod +x ./certgen
|
||||
fi
|
||||
if [ ! -f private.key ] || [ ! -f public.crt ]; then
|
||||
./certgen -host minio
|
||||
fi
|
||||
|
||||
minio:
|
||||
image: minio/minio:RELEASE.2024-10-13T13-34-11Z
|
||||
command: server /data
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/root/.minio/certs
|
||||
environment:
|
||||
MINIO_ROOT_USER: MINIO_ROOT_USER
|
||||
MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD
|
||||
depends_on:
|
||||
certs:
|
||||
condition: service_completed_successfully
|
||||
|
||||
minio_setup:
|
||||
depends_on:
|
||||
certs:
|
||||
condition: service_completed_successfully
|
||||
minio:
|
||||
condition: service_started
|
||||
image: minio/mc:RELEASE.2024-10-08T09-37-26Z
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/root/.mc/certs/CAs
|
||||
entrypoint: sh
|
||||
command:
|
||||
- '-cex'
|
||||
- |
|
||||
sleep 1
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|
||||
|| sleep 3 && \
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|
||||
|| sleep 3 && \
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|
||||
|| sleep 3 && \
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD
|
||||
mc mb --ignore-existing s3/overleaf-test-history-chunks
|
||||
mc mb --ignore-existing s3/overleaf-test-history-deks
|
||||
mc mb --ignore-existing s3/overleaf-test-history-global-blobs
|
||||
mc mb --ignore-existing s3/overleaf-test-history-project-blobs
|
||||
mc admin user add s3 \
|
||||
OVERLEAF_HISTORY_S3_ACCESS_KEY_ID \
|
||||
OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY
|
||||
echo '
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-chunks"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-chunks/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-deks"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-deks/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-global-blobs"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-global-blobs/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-project-blobs"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-project-blobs/*"
|
||||
}
|
||||
]
|
||||
}' > policy-history.json
|
||||
|
||||
mc admin policy create s3 overleaf-history policy-history.json
|
||||
mc admin policy attach s3 overleaf-history \
|
||||
--user=OVERLEAF_HISTORY_S3_ACCESS_KEY_ID
|
||||
gcs:
|
||||
image: fsouza/fake-gcs-server:1.45.2
|
||||
command: ["--port=9090", "--scheme=http"]
|
||||
healthcheck:
|
||||
test: wget --quiet --output-document=/dev/null http://localhost:9090/storage/v1/b
|
||||
interval: 1s
|
||||
retries: 20
|
246
services/history-v1/docker-compose.yml
Normal file
@@ -0,0 +1,246 @@
|
||||
# This file was auto-generated, do not edit it directly.
|
||||
# Instead run bin/update_build_scripts from
|
||||
# https://github.com/overleaf/internal/
|
||||
|
||||
version: "2.3"
|
||||
|
||||
services:
|
||||
test_unit:
|
||||
build:
|
||||
context: ../..
|
||||
dockerfile: services/history-v1/Dockerfile
|
||||
target: base
|
||||
volumes:
|
||||
- .:/overleaf/services/history-v1
|
||||
- ../../node_modules:/overleaf/node_modules
|
||||
- ../../libraries:/overleaf/libraries
|
||||
working_dir: /overleaf/services/history-v1
|
||||
environment:
|
||||
MOCHA_GREP: ${MOCHA_GREP}
|
||||
LOG_LEVEL: ${LOG_LEVEL:-}
|
||||
NODE_ENV: test
|
||||
NODE_OPTIONS: "--unhandled-rejections=strict"
|
||||
command: npm run --silent test:unit
|
||||
user: node
|
||||
|
||||
test_acceptance:
|
||||
build:
|
||||
context: ../..
|
||||
dockerfile: services/history-v1/Dockerfile
|
||||
target: base
|
||||
volumes:
|
||||
- .:/overleaf/services/history-v1
|
||||
- ../../node_modules:/overleaf/node_modules
|
||||
- ../../libraries:/overleaf/libraries
|
||||
- ./test/acceptance/certs:/certs
|
||||
working_dir: /overleaf/services/history-v1
|
||||
environment:
|
||||
ELASTIC_SEARCH_DSN: es:9200
|
||||
REDIS_HOST: redis
|
||||
HISTORY_REDIS_HOST: redis
|
||||
QUEUES_REDIS_HOST: redis
|
||||
ANALYTICS_QUEUES_REDIS_HOST: redis
|
||||
MONGO_HOST: mongo
|
||||
POSTGRES_HOST: postgres
|
||||
AWS_S3_ENDPOINT: https://minio:9000
|
||||
AWS_S3_PATH_STYLE: 'true'
|
||||
AWS_ACCESS_KEY_ID: OVERLEAF_HISTORY_S3_ACCESS_KEY_ID
|
||||
AWS_SECRET_ACCESS_KEY: OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY
|
||||
MINIO_ROOT_USER: MINIO_ROOT_USER
|
||||
MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD
|
||||
GCS_API_ENDPOINT: http://gcs:9090
|
||||
GCS_PROJECT_ID: fake
|
||||
STORAGE_EMULATOR_HOST: http://gcs:9090/storage/v1
|
||||
MOCHA_GREP: ${MOCHA_GREP}
|
||||
LOG_LEVEL: ${LOG_LEVEL:-}
|
||||
NODE_ENV: test
|
||||
NODE_OPTIONS: "--unhandled-rejections=strict"
|
||||
user: node
|
||||
depends_on:
|
||||
mongo:
|
||||
condition: service_started
|
||||
redis:
|
||||
condition: service_healthy
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
certs:
|
||||
condition: service_completed_successfully
|
||||
minio:
|
||||
condition: service_started
|
||||
minio_setup:
|
||||
condition: service_completed_successfully
|
||||
gcs:
|
||||
condition: service_healthy
|
||||
command: npm run --silent test:acceptance
|
||||
|
||||
redis:
|
||||
image: redis
|
||||
healthcheck:
|
||||
test: ping=$$(redis-cli ping) && [ "$$ping" = 'PONG' ]
|
||||
interval: 1s
|
||||
retries: 20
|
||||
|
||||
mongo:
|
||||
image: mongo:6.0.13
|
||||
command: --replSet overleaf
|
||||
volumes:
|
||||
- ../../bin/shared/mongodb-init-replica-set.js:/docker-entrypoint-initdb.d/mongodb-init-replica-set.js
|
||||
environment:
|
||||
MONGO_INITDB_DATABASE: sharelatex
|
||||
extra_hosts:
|
||||
# Required when using the automatic database setup for initializing the
|
||||
# replica set. This override is not needed when running the setup after
|
||||
# starting up mongo.
|
||||
- mongo:127.0.0.1
|
||||
|
||||
postgres:
|
||||
image: postgres:10
|
||||
environment:
|
||||
POSTGRES_USER: overleaf
|
||||
POSTGRES_PASSWORD: overleaf
|
||||
POSTGRES_DB: overleaf-history-v1-test
|
||||
volumes:
|
||||
- ./test/acceptance/pg-init/:/docker-entrypoint-initdb.d/
|
||||
healthcheck:
|
||||
test: pg_isready --host=localhost --quiet
|
||||
interval: 1s
|
||||
retries: 20
|
||||
|
||||
certs:
|
||||
image: node:20.18.2
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/certs
|
||||
working_dir: /certs
|
||||
entrypoint: sh
|
||||
command:
|
||||
- '-cex'
|
||||
- |
|
||||
if [ ! -f ./certgen ]; then
|
||||
wget -O ./certgen "https://github.com/minio/certgen/releases/download/v1.3.0/certgen-linux-$(dpkg --print-architecture)"
|
||||
chmod +x ./certgen
|
||||
fi
|
||||
if [ ! -f private.key ] || [ ! -f public.crt ]; then
|
||||
./certgen -host minio
|
||||
fi
|
||||
|
||||
minio:
|
||||
image: minio/minio:RELEASE.2024-10-13T13-34-11Z
|
||||
command: server /data
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/root/.minio/certs
|
||||
environment:
|
||||
MINIO_ROOT_USER: MINIO_ROOT_USER
|
||||
MINIO_ROOT_PASSWORD: MINIO_ROOT_PASSWORD
|
||||
depends_on:
|
||||
certs:
|
||||
condition: service_completed_successfully
|
||||
|
||||
minio_setup:
|
||||
depends_on:
|
||||
certs:
|
||||
condition: service_completed_successfully
|
||||
minio:
|
||||
condition: service_started
|
||||
image: minio/mc:RELEASE.2024-10-08T09-37-26Z
|
||||
volumes:
|
||||
- ./test/acceptance/certs:/root/.mc/certs/CAs
|
||||
entrypoint: sh
|
||||
command:
|
||||
- '-cex'
|
||||
- |
|
||||
sleep 1
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|
||||
|| sleep 3 && \
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|
||||
|| sleep 3 && \
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD \
|
||||
|| sleep 3 && \
|
||||
mc alias set s3 https://minio:9000 MINIO_ROOT_USER MINIO_ROOT_PASSWORD
|
||||
mc mb --ignore-existing s3/overleaf-test-history-chunks
|
||||
mc mb --ignore-existing s3/overleaf-test-history-deks
|
||||
mc mb --ignore-existing s3/overleaf-test-history-global-blobs
|
||||
mc mb --ignore-existing s3/overleaf-test-history-project-blobs
|
||||
mc admin user add s3 \
|
||||
OVERLEAF_HISTORY_S3_ACCESS_KEY_ID \
|
||||
OVERLEAF_HISTORY_S3_SECRET_ACCESS_KEY
|
||||
echo '
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-chunks"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-chunks/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-deks"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-deks/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-global-blobs"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-global-blobs/*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-project-blobs"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:PutObject",
|
||||
"s3:GetObject",
|
||||
"s3:DeleteObject"
|
||||
],
|
||||
"Resource": "arn:aws:s3:::overleaf-test-history-project-blobs/*"
|
||||
}
|
||||
]
|
||||
}' > policy-history.json
|
||||
|
||||
mc admin policy create s3 overleaf-history policy-history.json
|
||||
mc admin policy attach s3 overleaf-history \
|
||||
--user=OVERLEAF_HISTORY_S3_ACCESS_KEY_ID
|
||||
gcs:
|
||||
image: fsouza/fake-gcs-server:1.45.2
|
||||
command: ["--port=9090", "--scheme=http"]
|
||||
healthcheck:
|
||||
test: wget --quiet --output-document=/dev/null http://localhost:9090/storage/v1/b
|
||||
interval: 1s
|
||||
retries: 20
|
9
services/history-v1/install_deps.sh
Executable file
@@ -0,0 +1,9 @@
#!/bin/sh

set -ex

apt-get update

apt-get install jq parallel --yes

rm -rf /var/lib/apt/lists/*
19
services/history-v1/knexfile.js
Normal file
@@ -0,0 +1,19 @@
const config = require('config')

const baseConfig = {
  client: 'postgresql',
  connection: config.herokuDatabaseUrl || config.databaseUrl,
  pool: {
    min: parseInt(config.databasePoolMin, 10),
    max: parseInt(config.databasePoolMax, 10),
  },
  migrations: {
    tableName: 'knex_migrations',
  },
}

module.exports = {
  development: baseConfig,
  production: baseConfig,
  test: baseConfig,
}
|
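To illustrate how this knexfile is typically consumed, here is a hedged sketch (not part of the commit; the `chunks` table name comes from the migrations below, everything else is an assumption):

// sketch: build a knex instance from the environment-specific config above
const knexConfig = require('./knexfile')
const knex = require('knex')(knexConfig[process.env.NODE_ENV || 'development'])

async function main() {
  await knex.migrate.latest() // applies pending files from migrations/
  const [row] = await knex('chunks').count('id as count')
  console.log('chunks in history database:', row.count)
  await knex.destroy()
}

main().catch(err => {
  console.error(err)
  process.exit(1)
})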
80
services/history-v1/migrations/20220228163642_initial.js
Normal file
@@ -0,0 +1,80 @@
|
||||
/**
|
||||
* This is the initial migration, meant to replicate the current state of the
|
||||
* history database. If tables already exist, this migration is a noop.
|
||||
*/
|
||||
|
||||
exports.up = async function (knex) {
|
||||
await knex.raw(`
|
||||
CREATE TABLE IF NOT EXISTS chunks (
|
||||
id SERIAL,
|
||||
doc_id integer NOT NULL,
|
||||
end_version integer NOT NULL,
|
||||
end_timestamp timestamp without time zone,
|
||||
CONSTRAINT chunks_version_non_negative CHECK (end_version >= 0)
|
||||
)
|
||||
`)
|
||||
await knex.raw(`
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS index_chunks_on_doc_id_and_end_version
|
||||
ON chunks (doc_id, end_version)
|
||||
`)
|
||||
|
||||
await knex.raw(`
|
||||
CREATE TABLE IF NOT EXISTS old_chunks (
|
||||
chunk_id integer NOT NULL PRIMARY KEY,
|
||||
doc_id integer NOT NULL,
|
||||
end_version integer,
|
||||
end_timestamp timestamp without time zone,
|
||||
deleted_at timestamp without time zone
|
||||
)
|
||||
`)
|
||||
await knex.raw(`
|
||||
CREATE INDEX IF NOT EXISTS index_old_chunks_on_doc_id_and_end_version
|
||||
ON old_chunks (doc_id, end_version)
|
||||
`)
|
||||
|
||||
await knex.raw(`
|
||||
CREATE TABLE IF NOT EXISTS pending_chunks (
|
||||
id SERIAL,
|
||||
doc_id integer NOT NULL,
|
||||
end_version integer NOT NULL,
|
||||
end_timestamp timestamp without time zone,
|
||||
CONSTRAINT chunks_version_non_negative CHECK (end_version >= 0)
|
||||
)
|
||||
`)
|
||||
await knex.raw(`
|
||||
CREATE INDEX IF NOT EXISTS index_pending_chunks_on_doc_id_and_id
|
||||
ON pending_chunks (doc_id, id)
|
||||
`)
|
||||
|
||||
await knex.raw(`
|
||||
CREATE TABLE IF NOT EXISTS blobs (
|
||||
hash_bytes bytea NOT NULL PRIMARY KEY,
|
||||
byte_length integer NOT NULL,
|
||||
string_length integer,
|
||||
global boolean,
|
||||
CONSTRAINT blobs_byte_length_non_negative CHECK (byte_length >= 0),
|
||||
CONSTRAINT blobs_string_length_non_negative
|
||||
CHECK (string_length IS NULL OR string_length >= 0)
|
||||
)
|
||||
`)
|
||||
|
||||
await knex.raw(`
|
||||
CREATE TABLE IF NOT EXISTS project_blobs (
|
||||
project_id integer NOT NULL,
|
||||
hash_bytes bytea NOT NULL,
|
||||
byte_length integer NOT NULL,
|
||||
string_length integer,
|
||||
PRIMARY KEY (project_id, hash_bytes),
|
||||
CONSTRAINT project_blobs_byte_length_non_negative
|
||||
CHECK (byte_length >= 0),
|
||||
CONSTRAINT project_blobs_string_length_non_negative
|
||||
CHECK (string_length IS NULL OR string_length >= 0)
|
||||
)
|
||||
`)
|
||||
|
||||
await knex.raw(`CREATE SEQUENCE IF NOT EXISTS docs_id_seq`)
|
||||
}
|
||||
|
||||
exports.down = async function (knex) {
|
||||
// Don't do anything on the down migration
|
||||
}
|
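As a hedged illustration of what the (doc_id, end_version) unique index above is for, a caller might look up the newest chunk of a document like this (the helper is hypothetical, not part of the commit):

// illustrative only: fetch the most recent chunk row for a document,
// served by index_chunks_on_doc_id_and_end_version created above
async function getLatestChunkRow(knex, docId) {
  return await knex('chunks')
    .where({ doc_id: docId })
    .orderBy('end_version', 'desc')
    .first()
}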
@@ -0,0 +1,23 @@
exports.up = async function (knex) {
  await knex.raw(`
    ALTER TABLE chunks ADD COLUMN start_version integer
  `)
  await knex.raw(`
    ALTER TABLE pending_chunks ADD COLUMN start_version integer
  `)
  await knex.raw(`
    ALTER TABLE old_chunks ADD COLUMN start_version integer
  `)
}

exports.down = async function (knex) {
  await knex.raw(`
    ALTER TABLE chunks DROP COLUMN start_version
  `)
  await knex.raw(`
    ALTER TABLE pending_chunks DROP COLUMN start_version
  `)
  await knex.raw(`
    ALTER TABLE old_chunks DROP COLUMN start_version
  `)
}
@@ -0,0 +1,41 @@
exports.config = {
  // CREATE INDEX CONCURRENTLY can't be run inside a transaction
  // If this migration fails in the middle, indexes and constraints will have
  // to be cleaned up manually.
  transaction: false,
}

exports.up = async function (knex) {
  await knex.raw(`
    ALTER TABLE chunks
    ADD CONSTRAINT chunks_start_version_non_negative
    CHECK (start_version IS NOT NULL AND start_version >= 0)
    NOT VALID
  `)
  await knex.raw(`
    ALTER TABLE chunks
    VALIDATE CONSTRAINT chunks_start_version_non_negative
  `)
  await knex.raw(`
    CREATE UNIQUE INDEX CONCURRENTLY index_chunks_on_doc_id_and_start_version
    ON chunks (doc_id, start_version)
  `)
  await knex.raw(`
    ALTER TABLE chunks
    ADD UNIQUE USING INDEX index_chunks_on_doc_id_and_start_version
  `)
}

exports.down = async function (knex) {
  await knex.raw(`
    ALTER TABLE chunks
    DROP CONSTRAINT IF EXISTS index_chunks_on_doc_id_and_start_version
  `)
  await knex.raw(`
    DROP INDEX IF EXISTS index_chunks_on_doc_id_and_start_version
  `)
  await knex.raw(`
    ALTER TABLE chunks
    DROP CONSTRAINT IF EXISTS chunks_start_version_non_negative
  `)
}
@@ -0,0 +1,7 @@
exports.up = async function (knex) {
  await knex.raw(`DROP TABLE IF EXISTS blobs`)
}

exports.down = function (knex) {
  // Not reversible
}
@@ -0,0 +1,27 @@
// @ts-check

/**
 * @import { Knex } from "knex"
 */

/**
 * @param { Knex } knex
 * @returns { Promise<void> }
 */
exports.up = async function (knex) {
  await knex.raw(`
    ALTER TABLE chunks
    ADD COLUMN closed BOOLEAN NOT NULL DEFAULT FALSE
  `)
}

/**
 * @param { Knex } knex
 * @returns { Promise<void> }
 */
exports.down = async function (knex) {
  await knex.raw(`
    ALTER TABLE chunks
    DROP COLUMN closed
  `)
}
76
services/history-v1/package.json
Normal file
@@ -0,0 +1,76 @@
|
||||
{
|
||||
"name": "overleaf-editor",
|
||||
"version": "1.0.0",
|
||||
"description": "Overleaf Editor.",
|
||||
"author": "",
|
||||
"license": "Proprietary",
|
||||
"private": true,
|
||||
"dependencies": {
|
||||
"@google-cloud/secret-manager": "^5.6.0",
|
||||
"@overleaf/logger": "*",
|
||||
"@overleaf/metrics": "*",
|
||||
"@overleaf/mongo-utils": "*",
|
||||
"@overleaf/o-error": "*",
|
||||
"@overleaf/object-persistor": "*",
|
||||
"@overleaf/promise-utils": "*",
|
||||
"@overleaf/redis-wrapper": "*",
|
||||
"@overleaf/settings": "*",
|
||||
"@overleaf/stream-utils": "^0.1.0",
|
||||
"archiver": "^5.3.0",
|
||||
"basic-auth": "^2.0.1",
|
||||
"bluebird": "^3.7.2",
|
||||
"body-parser": "^1.20.3",
|
||||
"bull": "^4.16.5",
|
||||
"bunyan": "^1.8.12",
|
||||
"check-types": "^11.1.2",
|
||||
"command-line-args": "^3.0.3",
|
||||
"config": "^1.19.0",
|
||||
"express": "^4.21.2",
|
||||
"fs-extra": "^9.0.1",
|
||||
"generic-pool": "^2.1.1",
|
||||
"helmet": "^3.22.0",
|
||||
"http-status": "^1.4.2",
|
||||
"jsonwebtoken": "^9.0.0",
|
||||
"knex": "^2.4.0",
|
||||
"lodash": "^4.17.19",
|
||||
"mongodb": "6.12.0",
|
||||
"overleaf-editor-core": "*",
|
||||
"p-limit": "^6.2.0",
|
||||
"pg": "^8.7.1",
|
||||
"pg-query-stream": "^4.2.4",
|
||||
"swagger-tools": "^0.10.4",
|
||||
"temp": "^0.8.3",
|
||||
"throng": "^4.0.0",
|
||||
"tsscmp": "^1.0.6",
|
||||
"utf-8-validate": "^5.0.4"
|
||||
},
|
||||
"devDependencies": {
|
||||
"benny": "^3.7.1",
|
||||
"chai": "^4.3.6",
|
||||
"chai-as-promised": "^7.1.1",
|
||||
"chai-exclude": "^2.1.1",
|
||||
"mocha": "^11.1.0",
|
||||
"node-fetch": "^2.7.0",
|
||||
"sinon": "^9.0.2",
|
||||
"swagger-client": "^3.10.0",
|
||||
"typescript": "^5.0.4",
|
||||
"yauzl": "^2.9.1"
|
||||
},
|
||||
"scripts": {
|
||||
"start": "node app.js",
|
||||
"lint": "eslint --max-warnings 0 --format unix .",
|
||||
"lint:fix": "eslint --fix .",
|
||||
"format": "prettier --list-different $PWD/'**/*.*js'",
|
||||
"format:fix": "prettier --write $PWD/'**/*.*js'",
|
||||
"test:unit": "npm run test:unit:_run -- --grep=$MOCHA_GREP",
|
||||
"test:acceptance": "npm run test:acceptance:_run -- --grep=$MOCHA_GREP",
|
||||
"test:unit:_run": "mocha --recursive --reporter spec $@ test/unit/js",
|
||||
"test:acceptance:_run": "mocha --recursive --reporter spec --timeout 15000 --exit $@ test/acceptance/js",
|
||||
"nodemon": "node --watch app.js",
|
||||
"migrate": "knex migrate:latest",
|
||||
"delete_old_chunks": "node storage/tasks/delete_old_chunks.js",
|
||||
"fix_duplicate_versions": "node storage/tasks/fix_duplicate_versions.js",
|
||||
"benchmarks": "node benchmarks/index.js",
|
||||
"types:check": "tsc --noEmit"
|
||||
}
|
||||
}
|
25
services/history-v1/storage/index.js
Normal file
@@ -0,0 +1,25 @@
exports.BatchBlobStore = require('./lib/batch_blob_store')
exports.blobHash = require('./lib/blob_hash')
exports.HashCheckBlobStore = require('./lib/hash_check_blob_store')
exports.chunkBuffer = require('./lib/chunk_buffer')
exports.chunkStore = require('./lib/chunk_store')
exports.historyStore = require('./lib/history_store').historyStore
exports.knex = require('./lib/knex')
exports.mongodb = require('./lib/mongodb')
exports.redis = require('./lib/redis')
exports.persistChanges = require('./lib/persist_changes')
exports.persistor = require('./lib/persistor')
exports.ProjectArchive = require('./lib/project_archive')
exports.streams = require('./lib/streams')
exports.temp = require('./lib/temp')
exports.zipStore = require('./lib/zip_store')

const { BlobStore, loadGlobalBlobs } = require('./lib/blob_store')
exports.BlobStore = BlobStore
exports.loadGlobalBlobs = loadGlobalBlobs

const { InvalidChangeError } = require('./lib/errors')
exports.InvalidChangeError = InvalidChangeError

const { ChunkVersionConflictError } = require('./lib/chunk_store/errors')
exports.ChunkVersionConflictError = ChunkVersionConflictError
|
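A small usage sketch of this entry point (the project id and hash are placeholders; getBlob and getByteLength are the methods used elsewhere in this commit):

// sketch: look up blob metadata through the public storage interface
const storage = require('./storage') // assumed require path from the service root

async function printBlobSize(projectId, hash) {
  await storage.loadGlobalBlobs()
  const blobStore = new storage.BlobStore(projectId)
  const blob = await blobStore.getBlob(hash)
  if (blob) console.log(hash, blob.getByteLength(), 'bytes')
}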
76
services/history-v1/storage/lib/assert.js
Normal file
@@ -0,0 +1,76 @@
|
||||
'use strict'
|
||||
|
||||
const OError = require('@overleaf/o-error')
|
||||
|
||||
const check = require('check-types')
|
||||
const { Blob } = require('overleaf-editor-core')
|
||||
|
||||
const assert = check.assert
|
||||
|
||||
const MONGO_ID_REGEXP = /^[0-9a-f]{24}$/
|
||||
const POSTGRES_ID_REGEXP = /^[1-9][0-9]{0,9}$/
|
||||
const MONGO_OR_POSTGRES_ID_REGEXP = /^([0-9a-f]{24}|[1-9][0-9]{0,9})$/
|
||||
|
||||
function transaction(transaction, message) {
|
||||
assert.function(transaction, message)
|
||||
}
|
||||
|
||||
function blobHash(arg, message) {
|
||||
try {
|
||||
assert.match(arg, Blob.HEX_HASH_RX, message)
|
||||
} catch (error) {
|
||||
throw OError.tag(error, message, { arg })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A project id is a string that contains either an integer (for projects stored in Postgres) or 24
|
||||
* hex digits (for projects stored in Mongo)
|
||||
*/
|
||||
function projectId(arg, message) {
|
||||
try {
|
||||
assert.match(arg, MONGO_OR_POSTGRES_ID_REGEXP, message)
|
||||
} catch (error) {
|
||||
throw OError.tag(error, message, { arg })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A chunk id is a string that contains either an integer (for projects stored in Postgres) or 24
|
||||
* hex digits (for projects stored in Mongo)
|
||||
*/
|
||||
function chunkId(arg, message) {
|
||||
try {
|
||||
assert.match(arg, MONGO_OR_POSTGRES_ID_REGEXP, message)
|
||||
} catch (error) {
|
||||
throw OError.tag(error, message, { arg })
|
||||
}
|
||||
}
|
||||
|
||||
function mongoId(arg, message) {
|
||||
try {
|
||||
assert.match(arg, MONGO_ID_REGEXP, message)
|
||||
} catch (error) {
|
||||
throw OError.tag(error, message, { arg })
|
||||
}
|
||||
}
|
||||
|
||||
function postgresId(arg, message) {
|
||||
try {
|
||||
assert.match(arg, POSTGRES_ID_REGEXP, message)
|
||||
} catch (error) {
|
||||
throw OError.tag(error, message, { arg })
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
...assert,
|
||||
transaction,
|
||||
blobHash,
|
||||
projectId,
|
||||
chunkId,
|
||||
mongoId,
|
||||
postgresId,
|
||||
MONGO_ID_REGEXP,
|
||||
POSTGRES_ID_REGEXP,
|
||||
}
|
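A brief sketch of how these assertion helpers behave (the ids below are examples):

const assert = require('./assert')

// both id formats satisfy projectId
assert.projectId('507f1f77bcf86cd799439011', 'bad projectId') // mongo-style, 24 hex chars
assert.projectId('42', 'bad projectId') // postgres-style integer id

// anything else throws an OError tagged with { arg }
try {
  assert.projectId('not-an-id', 'bad projectId')
} catch (err) {
  console.error(err.message) // 'bad projectId'
}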
251
services/history-v1/storage/lib/backupBlob.mjs
Normal file
@@ -0,0 +1,251 @@
|
||||
// @ts-check
|
||||
import { backupPersistor, projectBlobsBucket } from './backupPersistor.mjs'
|
||||
import { GLOBAL_BLOBS, makeProjectKey, BlobStore } from './blob_store/index.js'
|
||||
import Stream from 'node:stream'
|
||||
import fs from 'node:fs'
|
||||
import Crypto from 'node:crypto'
|
||||
import assert from './assert.js'
|
||||
import { backedUpBlobs, projects } from './mongodb.js'
|
||||
import { Binary, ObjectId } from 'mongodb'
|
||||
import logger from '@overleaf/logger/logging-manager.js'
|
||||
import { AlreadyWrittenError } from '@overleaf/object-persistor/src/Errors.js'
|
||||
import metrics from '@overleaf/metrics'
|
||||
import zLib from 'node:zlib'
|
||||
import Path from 'node:path'
|
||||
|
||||
const HIGHWATER_MARK = 1024 * 1024
|
||||
|
||||
/**
|
||||
* @typedef {import("overleaf-editor-core").Blob} Blob
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {import("@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor").CachedPerProjectEncryptedS3Persistor} CachedPerProjectEncryptedS3Persistor
|
||||
*/
|
||||
|
||||
/**
|
||||
* Increment a metric to record the outcome of a backup operation.
|
||||
*
|
||||
* @param {"success"|"failure"|"skipped"} status
|
||||
* @param {"global"|"already_backed_up"|"none"} reason
|
||||
*/
|
||||
function recordBackupConclusion(status, reason = 'none') {
|
||||
metrics.inc('blob_backed_up', 1, { status, reason })
|
||||
}
|
||||
|
||||
/**
|
||||
* Downloads a blob to a specified directory
|
||||
*
|
||||
* @param {string} historyId - The history ID of the project the blob belongs to
|
||||
* @param {Blob} blob - The blob to download
|
||||
* @param {string} tmpDir - The directory path where the blob will be downloaded
|
||||
* @returns {Promise<string>} The full path where the blob was downloaded
|
||||
*/
|
||||
export async function downloadBlobToDir(historyId, blob, tmpDir) {
|
||||
const blobStore = new BlobStore(historyId)
|
||||
const blobHash = blob.getHash()
|
||||
const src = await blobStore.getStream(blobHash)
|
||||
const filePath = Path.join(tmpDir, `${historyId}-${blobHash}`)
|
||||
try {
|
||||
const dst = fs.createWriteStream(filePath, {
|
||||
highWaterMark: HIGHWATER_MARK,
|
||||
flags: 'wx',
|
||||
})
|
||||
await Stream.promises.pipeline(src, dst)
|
||||
return filePath
|
||||
} catch (error) {
|
||||
try {
|
||||
await fs.promises.unlink(filePath)
|
||||
} catch {}
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs the actual upload of the blob to the backup storage.
|
||||
*
|
||||
* @param {string} historyId - The history ID of the project the blob belongs to
|
||||
* @param {Blob} blob - The blob being uploaded
|
||||
* @param {string} path - The path to the file to upload (should have been stored on disk already)
|
||||
* @param {CachedPerProjectEncryptedS3Persistor} persistor - The persistor used to upload the blob
* @return {Promise<void>}
|
||||
*/
|
||||
export async function uploadBlobToBackup(historyId, blob, path, persistor) {
|
||||
const md5 = Crypto.createHash('md5')
|
||||
const filePathCompressed = path + '.gz'
|
||||
let backupSource
|
||||
let contentEncoding
|
||||
let size
|
||||
try {
|
||||
if (blob.getStringLength()) {
|
||||
backupSource = filePathCompressed
|
||||
contentEncoding = 'gzip'
|
||||
size = 0
|
||||
await Stream.promises.pipeline(
|
||||
fs.createReadStream(path, { highWaterMark: HIGHWATER_MARK }),
|
||||
zLib.createGzip(),
|
||||
async function* (source) {
|
||||
for await (const chunk of source) {
|
||||
size += chunk.byteLength
|
||||
md5.update(chunk)
|
||||
yield chunk
|
||||
}
|
||||
},
|
||||
fs.createWriteStream(filePathCompressed, {
|
||||
highWaterMark: HIGHWATER_MARK,
|
||||
})
|
||||
)
|
||||
} else {
|
||||
backupSource = path
|
||||
size = blob.getByteLength()
|
||||
await Stream.promises.pipeline(
|
||||
fs.createReadStream(path, { highWaterMark: HIGHWATER_MARK }),
|
||||
md5
|
||||
)
|
||||
}
|
||||
const key = makeProjectKey(historyId, blob.getHash())
|
||||
await persistor.sendStream(
|
||||
projectBlobsBucket,
|
||||
key,
|
||||
fs.createReadStream(backupSource, { highWaterMark: HIGHWATER_MARK }),
|
||||
{
|
||||
contentEncoding,
|
||||
contentType: 'application/octet-stream',
|
||||
contentLength: size,
|
||||
sourceMd5: md5.digest('hex'),
|
||||
ifNoneMatch: '*',
|
||||
}
|
||||
)
|
||||
} finally {
|
||||
if (backupSource === filePathCompressed) {
|
||||
try {
|
||||
await fs.promises.rm(filePathCompressed, { force: true })
|
||||
} catch {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a legacy (postgres) historyId to a mongo projectId
|
||||
*
|
||||
* @param {string} historyId
|
||||
* @return {Promise<string>}
|
||||
* @private
|
||||
*/
|
||||
async function _convertLegacyHistoryIdToProjectId(historyId) {
|
||||
const project = await projects.findOne(
|
||||
{ 'overleaf.history.id': parseInt(historyId) },
|
||||
{ projection: { _id: 1 } }
|
||||
)
|
||||
|
||||
if (!project?._id) {
|
||||
throw new Error('Did not find project for history id')
|
||||
}
|
||||
|
||||
return project?._id?.toString()
|
||||
}
|
||||
|
||||
/**
|
||||
* Records that a blob was backed up for a project.
|
||||
*
|
||||
* @param {string} projectId - projectId for a project (mongo format)
|
||||
* @param {string} hash
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
export async function storeBlobBackup(projectId, hash) {
|
||||
await backedUpBlobs.updateOne(
|
||||
{ _id: new ObjectId(projectId) },
|
||||
{ $addToSet: { blobs: new Binary(Buffer.from(hash, 'hex')) } },
|
||||
{ upsert: true }
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine whether a specific blob has been backed up in this project.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} hash
|
||||
* @return {Promise<*>}
|
||||
* @private
|
||||
*/
|
||||
export async function _blobIsBackedUp(projectId, hash) {
|
||||
const blobs = await backedUpBlobs.findOne(
|
||||
{
|
||||
_id: new ObjectId(projectId),
|
||||
blobs: new Binary(Buffer.from(hash, 'hex')),
|
||||
},
|
||||
{ projection: { _id: 1 } }
|
||||
)
|
||||
return blobs?._id
|
||||
}
|
||||
|
||||
/**
|
||||
* Back up a blob to the global storage and record that it was backed up.
|
||||
*
|
||||
* @param {string} historyId - history ID for a project (can be postgres format or mongo format)
|
||||
* @param {Blob} blob - The blob that is being backed up
|
||||
* @param {string} tmpPath - The path to a temporary file storing the contents of the blob.
|
||||
* @param {CachedPerProjectEncryptedS3Persistor} [persistor] - The persistor to use (optional)
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
export async function backupBlob(historyId, blob, tmpPath, persistor) {
|
||||
const hash = blob.getHash()
|
||||
|
||||
let projectId = historyId
|
||||
if (assert.POSTGRES_ID_REGEXP.test(historyId)) {
|
||||
projectId = await _convertLegacyHistoryIdToProjectId(historyId)
|
||||
}
|
||||
|
||||
const globalBlob = GLOBAL_BLOBS.get(hash)
|
||||
|
||||
if (globalBlob && !globalBlob.demoted) {
|
||||
recordBackupConclusion('skipped', 'global')
|
||||
logger.debug({ projectId, hash }, 'Blob is global - skipping backup')
|
||||
return
|
||||
}
|
||||
|
||||
try {
|
||||
if (await _blobIsBackedUp(projectId, hash)) {
|
||||
recordBackupConclusion('skipped', 'already_backed_up')
|
||||
logger.debug(
|
||||
{ projectId, hash },
|
||||
'Blob already backed up - skipping backup'
|
||||
)
|
||||
return
|
||||
}
|
||||
} catch (error) {
|
||||
logger.warn({ error }, 'Failed to check if blob is backed up')
|
||||
// We'll try anyway - we'll catch the error if it was backed up
|
||||
}
|
||||
// If we weren't passed a persistor for this project, create one.
|
||||
// This will fetch the key from AWS, so it's preferable to use
|
||||
// the same persistor for all blobs in a project where possible.
|
||||
if (!persistor) {
|
||||
logger.debug(
|
||||
{ historyId, hash },
|
||||
'warning: persistor not passed to backupBlob'
|
||||
)
|
||||
}
|
||||
persistor ??= await backupPersistor.forProject(
|
||||
projectBlobsBucket,
|
||||
makeProjectKey(historyId, '')
|
||||
)
|
||||
try {
|
||||
logger.debug({ projectId, hash }, 'Starting blob backup')
|
||||
await uploadBlobToBackup(historyId, blob, tmpPath, persistor)
|
||||
await storeBlobBackup(projectId, hash)
|
||||
recordBackupConclusion('success')
|
||||
} catch (error) {
|
||||
if (error instanceof AlreadyWrittenError) {
|
||||
logger.debug({ error, projectId, hash }, 'Blob already backed up')
|
||||
// record that we backed it up already
|
||||
await storeBlobBackup(projectId, hash)
|
||||
recordBackupConclusion('failure', 'already_backed_up')
|
||||
return
|
||||
}
|
||||
// eventually queue this for retry - for now this will be fixed by running the script
|
||||
recordBackupConclusion('failure')
|
||||
logger.warn({ error, projectId, hash }, 'Failed to upload blob to backup')
|
||||
} finally {
|
||||
logger.debug({ projectId, hash }, 'Ended blob backup')
|
||||
}
|
||||
}
|
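A hedged end-to-end sketch tying downloadBlobToDir and backupBlob together (historyId, hash and tmpDir are placeholders supplied by the caller):

// sketch: download one blob to disk, then back it up
import { BlobStore } from './blob_store/index.js'
import { downloadBlobToDir, backupBlob } from './backupBlob.mjs'

export async function backupSingleBlob(historyId, hash, tmpDir) {
  const blobStore = new BlobStore(historyId)
  const blob = await blobStore.getBlob(hash)
  if (!blob) return // nothing to back up
  const tmpPath = await downloadBlobToDir(historyId, blob, tmpDir)
  // no persistor passed: backupBlob logs a debug warning and creates one per call
  await backupBlob(historyId, blob, tmpPath)
}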
93
services/history-v1/storage/lib/backupDeletion.mjs
Normal file
@@ -0,0 +1,93 @@
|
||||
// @ts-check
|
||||
import { callbackify } from 'util'
|
||||
import { ObjectId } from 'mongodb'
|
||||
import config from 'config'
|
||||
import OError from '@overleaf/o-error'
|
||||
import { db } from './mongodb.js'
|
||||
import projectKey from './project_key.js'
|
||||
import chunkStore from '../lib/chunk_store/index.js'
|
||||
import {
|
||||
backupPersistor,
|
||||
chunksBucket,
|
||||
projectBlobsBucket,
|
||||
} from './backupPersistor.mjs'
|
||||
|
||||
const MS_PER_DAY = 24 * 60 * 60 * 1000
|
||||
const EXPIRE_PROJECTS_AFTER_MS =
|
||||
parseInt(config.get('minSoftDeletionPeriodDays'), 10) * MS_PER_DAY
|
||||
const deletedProjectsCollection = db.collection('deletedProjects')
|
||||
|
||||
/**
|
||||
* @param {string} historyId
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function projectHasLatestChunk(historyId) {
|
||||
const chunk = await chunkStore.getBackend(historyId).getLatestChunk(historyId)
|
||||
return chunk != null
|
||||
}
|
||||
|
||||
export class NotReadyToDelete extends OError {}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function deleteProjectBackup(projectId) {
|
||||
const deletedProject = await deletedProjectsCollection.findOne(
|
||||
{ 'deleterData.deletedProjectId': new ObjectId(projectId) },
|
||||
{
|
||||
projection: {
|
||||
'deleterData.deletedProjectOverleafHistoryId': 1,
|
||||
'deleterData.deletedAt': 1,
|
||||
},
|
||||
}
|
||||
)
|
||||
if (!deletedProject) {
|
||||
throw new NotReadyToDelete('refusing to delete non-deleted project')
|
||||
}
|
||||
const expiresAt =
|
||||
deletedProject.deleterData.deletedAt.getTime() + EXPIRE_PROJECTS_AFTER_MS
|
||||
if (expiresAt > Date.now()) {
|
||||
throw new NotReadyToDelete('refusing to delete non-expired project')
|
||||
}
|
||||
|
||||
const historyId =
|
||||
deletedProject.deleterData.deletedProjectOverleafHistoryId?.toString()
|
||||
if (!historyId) {
|
||||
throw new NotReadyToDelete(
|
||||
'refusing to delete project with unknown historyId'
|
||||
)
|
||||
}
|
||||
|
||||
if (await projectHasLatestChunk(historyId)) {
|
||||
throw new NotReadyToDelete(
|
||||
'refusing to delete project with remaining chunks'
|
||||
)
|
||||
}
|
||||
|
||||
const prefix = projectKey.format(historyId) + '/'
|
||||
await backupPersistor.deleteDirectory(chunksBucket, prefix)
|
||||
await backupPersistor.deleteDirectory(projectBlobsBucket, prefix)
|
||||
}
|
||||
|
||||
export async function healthCheck() {
|
||||
const HEALTH_CHECK_PROJECTS = JSON.parse(config.get('healthCheckProjects'))
|
||||
if (HEALTH_CHECK_PROJECTS.length !== 2) {
|
||||
throw new Error('expected 2 healthCheckProjects')
|
||||
}
|
||||
if (!HEALTH_CHECK_PROJECTS.some(id => id.length === 24)) {
|
||||
throw new Error('expected mongo id in healthCheckProjects')
|
||||
}
|
||||
if (!HEALTH_CHECK_PROJECTS.some(id => id.length < 24)) {
|
||||
throw new Error('expected postgres id in healthCheckProjects')
|
||||
}
|
||||
|
||||
for (const historyId of HEALTH_CHECK_PROJECTS) {
|
||||
if (!(await projectHasLatestChunk(historyId))) {
|
||||
throw new Error(`project has no history: ${historyId}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export const healthCheckCb = callbackify(healthCheck)
|
||||
export const deleteProjectBackupCb = callbackify(deleteProjectBackup)
|
152
services/history-v1/storage/lib/backupGenerator.mjs
Normal file
152
services/history-v1/storage/lib/backupGenerator.mjs
Normal file
@@ -0,0 +1,152 @@
|
||||
/**
|
||||
* Provides a generator function to back up project chunks and blobs.
|
||||
*/
|
||||
|
||||
import chunkStore from './chunk_store/index.js'
|
||||
|
||||
import {
|
||||
GLOBAL_BLOBS, // NOTE: must call loadGlobalBlobs() before using this
|
||||
BlobStore,
|
||||
} from './blob_store/index.js'
|
||||
|
||||
import assert from './assert.js'
|
||||
|
||||
async function lookBehindForSeenBlobs(
|
||||
projectId,
|
||||
chunk,
|
||||
lastBackedUpVersion,
|
||||
seenBlobs
|
||||
) {
|
||||
if (chunk.startVersion === 0) {
|
||||
return // this is the first chunk, no need to check for blobs in the previous chunk
|
||||
}
|
||||
if (chunk.startVersion > 0 && lastBackedUpVersion > chunk.startVersion) {
|
||||
return // the snapshot in this chunk has already been backed up
|
||||
}
|
||||
if (
|
||||
chunk.startVersion > 0 &&
|
||||
lastBackedUpVersion === chunk.startVersion // same as previousChunk.endVersion
|
||||
) {
|
||||
// the snapshot in this chunk has not been backed up
|
||||
// so we find the set of backed up blobs from the previous chunk
|
||||
const previousChunk = await chunkStore.loadAtVersion(
|
||||
projectId,
|
||||
lastBackedUpVersion
|
||||
)
|
||||
const previousChunkHistory = previousChunk.getHistory()
|
||||
previousChunkHistory.findBlobHashes(seenBlobs)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Records blob hashes that have been previously seen in a chunk's history.
|
||||
*
|
||||
* @param {Object} chunk - The chunk containing history data
|
||||
* @param {number} currentBackedUpVersion - The version number that has been backed up
|
||||
* @param {Set<string>} seenBlobs - Set to collect previously seen blob hashes
|
||||
* @returns {void}
|
||||
*/
|
||||
function recordPreviouslySeenBlobs(chunk, currentBackedUpVersion, seenBlobs) {
|
||||
// We need to look at the chunk and decide how far we have backed up.
|
||||
// If we have not backed up this chunk at all, we need to backup the blobs
|
||||
// in the snapshot. Otherwise we need to backup the blobs in the changes
|
||||
// that have occurred since the last backup.
|
||||
const history = chunk.getHistory()
|
||||
const startVersion = chunk.getStartVersion()
|
||||
if (currentBackedUpVersion === 0) {
|
||||
// If we have only backed up version 0 (i.e. the first change)
|
||||
// then that includes the initial snapshot, so we consider
|
||||
// the blobs of the initial snapshot as seen. If the project
|
||||
// has not been backed up at all then currentBackedUpVersion
|
||||
// will be undefined.
|
||||
history.snapshot.findBlobHashes(seenBlobs)
|
||||
} else if (currentBackedUpVersion > startVersion) {
|
||||
history.snapshot.findBlobHashes(seenBlobs)
|
||||
for (let i = 0; i < currentBackedUpVersion - startVersion; i++) {
|
||||
history.changes[i].findBlobHashes(seenBlobs)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Collects new blob objects that need to be backed up from a given chunk.
|
||||
*
|
||||
* @param {Object} chunk - The chunk object containing history data
|
||||
* @param {Object} blobStore - Storage interface for retrieving blobs
|
||||
* @param {Set<string>} seenBlobs - Set of blob hashes that have already been processed
|
||||
* @returns {Promise<Object[]>} Array of blob objects that need to be backed up
|
||||
* @throws {Error} If blob retrieval fails
|
||||
*/
|
||||
async function collectNewBlobsForBackup(chunk, blobStore, seenBlobs) {
|
||||
/** @type {Set<string>} */
|
||||
const blobHashes = new Set()
|
||||
const history = chunk.getHistory()
|
||||
// Get all the blobs in this chunk, then exclude the seenBlobs and global blobs
|
||||
history.findBlobHashes(blobHashes)
|
||||
const blobsToBackup = await blobStore.getBlobs(
|
||||
[...blobHashes].filter(
|
||||
hash =>
|
||||
hash &&
|
||||
!seenBlobs.has(hash) &&
|
||||
(!GLOBAL_BLOBS.has(hash) || GLOBAL_BLOBS.get(hash).demoted)
|
||||
)
|
||||
)
|
||||
return blobsToBackup
|
||||
}
|
||||
|
||||
/**
|
||||
* Asynchronously generates backups for a project based on provided versions.
|
||||
* @param {string} projectId - The ID of the project's history to back up.
|
||||
* @param {number} lastBackedUpVersion - The last version that was successfully backed up.
|
||||
* @yields {AsyncGenerator<{ chunkRecord: object, chunkToBackup: object, chunkBuffer: Buffer, blobsToBackup: object[] }>}
|
||||
* Yields chunk records and corresponding data needed for backups.
|
||||
*/
|
||||
export async function* backupGenerator(projectId, lastBackedUpVersion) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.maybe.integer(lastBackedUpVersion, 'bad lastBackedUpVersion')
|
||||
|
||||
const blobStore = new BlobStore(projectId)
|
||||
|
||||
/** @type {Set<string>} */
|
||||
const seenBlobs = new Set() // records the blobs that are already backed up
|
||||
|
||||
const firstPendingVersion =
|
||||
lastBackedUpVersion >= 0 ? lastBackedUpVersion + 1 : 0
|
||||
let isStartingChunk = true
|
||||
let currentBackedUpVersion = lastBackedUpVersion
|
||||
const chunkRecordIterator = chunkStore.getProjectChunksFromVersion(
|
||||
projectId,
|
||||
firstPendingVersion
|
||||
)
|
||||
|
||||
for await (const chunkRecord of chunkRecordIterator) {
|
||||
const { chunk, chunkBuffer } = await chunkStore.loadByChunkRecord(
|
||||
projectId,
|
||||
chunkRecord
|
||||
)
|
||||
|
||||
if (isStartingChunk) {
|
||||
await lookBehindForSeenBlobs(
|
||||
projectId,
|
||||
chunkRecord,
|
||||
lastBackedUpVersion,
|
||||
seenBlobs
|
||||
)
|
||||
isStartingChunk = false
|
||||
}
|
||||
|
||||
recordPreviouslySeenBlobs(chunk, currentBackedUpVersion, seenBlobs)
|
||||
|
||||
const blobsToBackup = await collectNewBlobsForBackup(
|
||||
chunk,
|
||||
blobStore,
|
||||
seenBlobs
|
||||
)
|
||||
|
||||
yield { chunkRecord, chunkToBackup: chunk, chunkBuffer, blobsToBackup }
|
||||
|
||||
// After we generate a backup of this chunk, mark the backed up blobs as seen
|
||||
blobsToBackup.forEach(blob => seenBlobs.add(blob.getHash()))
|
||||
currentBackedUpVersion = chunkRecord.endVersion
|
||||
}
|
||||
}
|
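A consumption sketch for the generator above (the logging is illustrative; a real caller would upload the chunk and blobs instead):

// sketch: walk the pending backup work for a project
import { backupGenerator } from './backupGenerator.mjs'

export async function describePendingBackup(projectId, lastBackedUpVersion) {
  const generator = backupGenerator(projectId, lastBackedUpVersion)
  for await (const { chunkRecord, chunkBuffer, blobsToBackup } of generator) {
    console.log(
      `chunk ending at v${chunkRecord.endVersion}:`,
      blobsToBackup.length,
      'new blob(s),',
      chunkBuffer.length,
      'bytes of chunk data'
    )
  }
}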
121
services/history-v1/storage/lib/backupPersistor.mjs
Normal file
@@ -0,0 +1,121 @@
|
||||
// @ts-check
|
||||
import fs from 'node:fs'
|
||||
import Path from 'node:path'
|
||||
import _ from 'lodash'
|
||||
import config from 'config'
|
||||
import { SecretManagerServiceClient } from '@google-cloud/secret-manager'
|
||||
import OError from '@overleaf/o-error'
|
||||
import {
|
||||
PerProjectEncryptedS3Persistor,
|
||||
RootKeyEncryptionKey,
|
||||
} from '@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor.js'
|
||||
import { HistoryStore } from './history_store.js'
|
||||
|
||||
const persistorConfig = _.cloneDeep(config.get('backupPersistor'))
|
||||
const { chunksBucket, deksBucket, globalBlobsBucket, projectBlobsBucket } =
|
||||
config.get('backupStore')
|
||||
|
||||
export { chunksBucket, globalBlobsBucket, projectBlobsBucket }
|
||||
|
||||
function convertKey(key, convertFn) {
|
||||
if (_.has(persistorConfig, key)) {
|
||||
_.update(persistorConfig, key, convertFn)
|
||||
}
|
||||
}
|
||||
|
||||
convertKey('s3SSEC.httpOptions.timeout', s => parseInt(s, 10))
|
||||
convertKey('s3SSEC.maxRetries', s => parseInt(s, 10))
|
||||
convertKey('s3SSEC.pathStyle', s => s === 'true')
|
||||
// array of CA, either inlined or on disk
|
||||
convertKey('s3SSEC.ca', s =>
|
||||
JSON.parse(s).map(ca => (ca.startsWith('/') ? fs.readFileSync(ca) : ca))
|
||||
)
|
||||
|
||||
/** @type {() => Promise<string>} */
|
||||
let getRawRootKeyEncryptionKeys
|
||||
|
||||
if ((process.env.NODE_ENV || 'production') === 'production') {
|
||||
;[persistorConfig.s3SSEC.key, persistorConfig.s3SSEC.secret] = (
|
||||
await loadFromSecretsManager(
|
||||
process.env.BACKUP_AWS_CREDENTIALS || '',
|
||||
'BACKUP_AWS_CREDENTIALS'
|
||||
)
|
||||
).split(':')
|
||||
getRawRootKeyEncryptionKeys = () =>
|
||||
loadFromSecretsManager(
|
||||
persistorConfig.keyEncryptionKeys,
|
||||
'BACKUP_KEY_ENCRYPTION_KEYS'
|
||||
)
|
||||
} else {
|
||||
getRawRootKeyEncryptionKeys = () => persistorConfig.keyEncryptionKeys
|
||||
}
|
||||
|
||||
export const DELETION_ONLY = persistorConfig.keyEncryptionKeys === 'none'
|
||||
if (DELETION_ONLY) {
|
||||
// For Backup-deleter; should not encrypt or read data; deleting does not need key.
|
||||
getRawRootKeyEncryptionKeys = () => new Promise(_resolve => {})
|
||||
}
|
||||
|
||||
const PROJECT_FOLDER_REGEX =
|
||||
/^\d{3}\/\d{3}\/\d{3,}\/|[0-9a-f]{3}\/[0-9a-f]{3}\/[0-9a-f]{18}\/$/
|
||||
|
||||
/**
|
||||
* @param {string} bucketName
|
||||
* @param {string} path
|
||||
* @return {string}
|
||||
*/
|
||||
export function pathToProjectFolder(bucketName, path) {
|
||||
switch (bucketName) {
|
||||
case deksBucket:
|
||||
case chunksBucket:
|
||||
case projectBlobsBucket:
|
||||
const projectFolder = Path.join(...path.split('/').slice(0, 3)) + '/'
|
||||
if (!PROJECT_FOLDER_REGEX.test(projectFolder)) {
|
||||
throw new OError('invalid project folder', { bucketName, path })
|
||||
}
|
||||
return projectFolder
|
||||
default:
|
||||
throw new Error(`${bucketName} does not store per-project files`)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} name
|
||||
* @param {string} label
|
||||
* @return {Promise<string>}
|
||||
*/
|
||||
async function loadFromSecretsManager(name, label) {
|
||||
const client = new SecretManagerServiceClient()
|
||||
const [version] = await client.accessSecretVersion({ name })
|
||||
if (!version.payload?.data) throw new Error(`empty secret: ${label}`)
|
||||
return version.payload.data.toString()
|
||||
}
|
||||
|
||||
async function getRootKeyEncryptionKeys() {
|
||||
return JSON.parse(await getRawRootKeyEncryptionKeys()).map(
|
||||
({ key, salt }) => {
|
||||
return new RootKeyEncryptionKey(
|
||||
Buffer.from(key, 'base64'),
|
||||
Buffer.from(salt, 'base64')
|
||||
)
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
export const backupPersistor = new PerProjectEncryptedS3Persistor({
|
||||
...persistorConfig.s3SSEC,
|
||||
disableMultiPartUpload: true,
|
||||
dataEncryptionKeyBucketName: deksBucket,
|
||||
pathToProjectFolder,
|
||||
getRootKeyEncryptionKeys,
|
||||
storageClass: {
|
||||
[deksBucket]: 'STANDARD',
|
||||
[chunksBucket]: persistorConfig.tieringStorageClass,
|
||||
[projectBlobsBucket]: persistorConfig.tieringStorageClass,
|
||||
},
|
||||
})
|
||||
|
||||
export const backupHistoryStore = new HistoryStore(
|
||||
backupPersistor,
|
||||
chunksBucket
|
||||
)
|
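An illustrative call to pathToProjectFolder (the key below is made up to satisfy the numeric layout; real keys come from project_key.format):

// sketch: map an object key to the per-project folder the persistor uses
// to locate that project's data encryption key
import { pathToProjectFolder, chunksBucket } from './backupPersistor.mjs'

const folder = pathToProjectFolder(
  chunksBucket,
  '123/456/789012345678901234/000000042' // hypothetical chunk key
)
console.log(folder) // '123/456/789012345678901234/'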
216
services/history-v1/storage/lib/backupVerifier.mjs
Normal file
@@ -0,0 +1,216 @@
|
||||
// @ts-check
|
||||
import OError from '@overleaf/o-error'
|
||||
import chunkStore from '../lib/chunk_store/index.js'
|
||||
import {
|
||||
backupPersistor,
|
||||
chunksBucket,
|
||||
projectBlobsBucket,
|
||||
} from './backupPersistor.mjs'
|
||||
import { Blob, Chunk, History } from 'overleaf-editor-core'
|
||||
import { BlobStore, GLOBAL_BLOBS, makeProjectKey } from './blob_store/index.js'
|
||||
import blobHash from './blob_hash.js'
|
||||
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
|
||||
import logger from '@overleaf/logger'
|
||||
import path from 'node:path'
|
||||
import projectKey from './project_key.js'
|
||||
import streams from './streams.js'
|
||||
import objectPersistor from '@overleaf/object-persistor'
|
||||
import { getEndDateForRPO } from '../../backupVerifier/utils.mjs'
|
||||
|
||||
/**
|
||||
* @typedef {import("@overleaf/object-persistor/src/PerProjectEncryptedS3Persistor.js").CachedPerProjectEncryptedS3Persistor} CachedPerProjectEncryptedS3Persistor
|
||||
*/
|
||||
|
||||
/**
|
||||
* @param {string} historyId
|
||||
* @param {string} hash
|
||||
*/
|
||||
export async function verifyBlob(historyId, hash) {
|
||||
return await verifyBlobs(historyId, [hash])
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} historyId
|
||||
* @return {Promise<CachedPerProjectEncryptedS3Persistor>}
|
||||
*/
|
||||
async function getProjectPersistor(historyId) {
|
||||
try {
|
||||
return await backupPersistor.forProjectRO(
|
||||
projectBlobsBucket,
|
||||
makeProjectKey(historyId, '')
|
||||
)
|
||||
} catch (err) {
|
||||
if (err instanceof NotFoundError) {
|
||||
throw new BackupCorruptedError('dek does not exist', {}, err)
|
||||
}
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} historyId
|
||||
* @param {Array<string>} hashes
|
||||
* @param {CachedPerProjectEncryptedS3Persistor} [projectCache]
|
||||
*/
|
||||
export async function verifyBlobs(historyId, hashes, projectCache) {
|
||||
if (hashes.length === 0) throw new Error('bug: empty hashes')
|
||||
|
||||
if (!projectCache) {
|
||||
projectCache = await getProjectPersistor(historyId)
|
||||
}
|
||||
const blobStore = new BlobStore(historyId)
|
||||
for (const hash of hashes) {
|
||||
const path = makeProjectKey(historyId, hash)
|
||||
const blob = await blobStore.getBlob(hash)
|
||||
if (!blob) throw new Blob.NotFoundError(hash)
|
||||
let stream
|
||||
try {
|
||||
stream = await projectCache.getObjectStream(projectBlobsBucket, path, {
|
||||
autoGunzip: true,
|
||||
})
|
||||
} catch (err) {
|
||||
if (err instanceof NotFoundError) {
|
||||
throw new BackupCorruptedMissingBlobError('missing blob', {
|
||||
path,
|
||||
hash,
|
||||
})
|
||||
}
|
||||
throw err
|
||||
}
|
||||
const backupHash = await blobHash.fromStream(blob.getByteLength(), stream)
|
||||
if (backupHash !== hash) {
|
||||
throw new BackupCorruptedInvalidBlobError(
|
||||
'hash mismatch for backed up blob',
|
||||
{
|
||||
path,
|
||||
hash,
|
||||
backupHash,
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} historyId
|
||||
* @param {Date} [endTimestamp]
|
||||
*/
|
||||
export async function verifyProjectWithErrorContext(
|
||||
historyId,
|
||||
endTimestamp = getEndDateForRPO()
|
||||
) {
|
||||
try {
|
||||
await verifyProject(historyId, endTimestamp)
|
||||
} catch (err) {
|
||||
// @ts-ignore err is Error instance
|
||||
throw OError.tag(err, 'verifyProject', { historyId, endTimestamp })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} historyId
|
||||
* @param {number} startVersion
|
||||
* @param {CachedPerProjectEncryptedS3Persistor} backupPersistorForProject
|
||||
* @return {Promise<any>}
|
||||
*/
|
||||
async function loadChunk(historyId, startVersion, backupPersistorForProject) {
|
||||
const key = path.join(
|
||||
projectKey.format(historyId),
|
||||
projectKey.pad(startVersion)
|
||||
)
|
||||
try {
|
||||
const buf = await streams.gunzipStreamToBuffer(
|
||||
await backupPersistorForProject.getObjectStream(chunksBucket, key)
|
||||
)
|
||||
return JSON.parse(buf.toString('utf-8'))
|
||||
} catch (err) {
|
||||
if (err instanceof objectPersistor.Errors.NotFoundError) {
|
||||
throw new Chunk.NotPersistedError(historyId)
|
||||
}
|
||||
if (err instanceof Error) {
|
||||
throw OError.tag(err, 'Failed to load chunk', { historyId, startVersion })
|
||||
}
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} historyId
|
||||
* @param {Date} endTimestamp
|
||||
*/
|
||||
export async function verifyProject(historyId, endTimestamp) {
|
||||
const backend = chunkStore.getBackend(historyId)
|
||||
const [first, last] = await Promise.all([
|
||||
backend.getFirstChunkBeforeTimestamp(historyId, endTimestamp),
|
||||
backend.getLastActiveChunkBeforeTimestamp(historyId, endTimestamp),
|
||||
])
|
||||
|
||||
  // loadChunk() below reads chunk.startVersion, so carry it on each record
  const chunksRecordsToVerify = [
    {
      chunkId: first.id,
      startVersion: first.startVersion,
      chunkLabel: 'first',
    },
  ]
  if (first.startVersion !== last.startVersion) {
    chunksRecordsToVerify.push({
      chunkId: last.id,
      startVersion: last.startVersion,
      chunkLabel: 'last before RPO',
    })
  }
|
||||
|
||||
const projectCache = await getProjectPersistor(historyId)
|
||||
|
||||
const chunks = await Promise.all(
|
||||
chunksRecordsToVerify.map(async chunk => {
|
||||
try {
|
||||
return History.fromRaw(
|
||||
await loadChunk(historyId, chunk.startVersion, projectCache)
|
||||
)
|
||||
} catch (err) {
|
||||
if (err instanceof Chunk.NotPersistedError) {
|
||||
throw new BackupRPOViolationChunkNotBackedUpError(
|
||||
'Backup RPO violation: chunk not backed up',
|
||||
chunk
|
||||
)
|
||||
}
|
||||
throw err
|
||||
}
|
||||
})
|
||||
)
|
||||
const seenBlobs = new Set()
|
||||
const blobsToVerify = []
|
||||
for (const chunk of chunks) {
|
||||
/** @type {Set<string>} */
|
||||
const chunkBlobs = new Set()
|
||||
chunk.findBlobHashes(chunkBlobs)
|
||||
let hasAddedBlobFromThisChunk = false
|
||||
for (const blobHash of chunkBlobs) {
|
||||
if (seenBlobs.has(blobHash)) continue // old blob
|
||||
if (GLOBAL_BLOBS.has(blobHash)) continue // global blob
|
||||
seenBlobs.add(blobHash)
|
||||
if (!hasAddedBlobFromThisChunk) {
|
||||
blobsToVerify.push(blobHash)
|
||||
hasAddedBlobFromThisChunk = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if (blobsToVerify.length === 0) {
|
||||
logger.debug(
|
||||
{
|
||||
historyId,
|
||||
chunksRecordsToVerify: chunksRecordsToVerify.map(c => c.chunkId),
|
||||
},
|
||||
'chunks contain no blobs to verify'
|
||||
)
|
||||
return
|
||||
}
|
||||
await verifyBlobs(historyId, blobsToVerify, projectCache)
|
||||
}
|
||||
|
||||
export class BackupCorruptedError extends OError {}
|
||||
export class BackupRPOViolationError extends OError {}
|
||||
export class BackupCorruptedMissingBlobError extends BackupCorruptedError {}
|
||||
export class BackupCorruptedInvalidBlobError extends BackupCorruptedError {}
|
||||
export class BackupRPOViolationChunkNotBackedUpError extends OError {}
|
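A sketch of how a caller might classify verification failures with the error classes exported above (historyId is a placeholder):

// sketch: verify one project's backup and report the failure mode
import {
  verifyProjectWithErrorContext,
  BackupCorruptedError,
  BackupRPOViolationChunkNotBackedUpError,
} from './backupVerifier.mjs'

export async function checkProjectBackup(historyId) {
  try {
    await verifyProjectWithErrorContext(historyId)
    return 'ok'
  } catch (err) {
    if (err instanceof BackupCorruptedError) return 'corrupted'
    if (err instanceof BackupRPOViolationChunkNotBackedUpError) return 'rpo violation'
    throw err
  }
}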
212
services/history-v1/storage/lib/backup_store/index.js
Normal file
@@ -0,0 +1,212 @@
|
||||
const { Binary, ObjectId } = require('mongodb')
|
||||
const { projects, backedUpBlobs } = require('../mongodb')
|
||||
const OError = require('@overleaf/o-error')
|
||||
|
||||
// List projects with pending backups older than the specified interval
|
||||
function listPendingBackups(timeIntervalMs = 0, limit = null) {
|
||||
const cutoffTime = new Date(Date.now() - timeIntervalMs)
|
||||
const options = {
|
||||
projection: { 'overleaf.backup.pendingChangeAt': 1 },
|
||||
sort: { 'overleaf.backup.pendingChangeAt': 1 },
|
||||
}
|
||||
|
||||
// Apply limit if provided
|
||||
if (limit) {
|
||||
options.limit = limit
|
||||
}
|
||||
|
||||
const cursor = projects.find(
|
||||
{
|
||||
'overleaf.backup.pendingChangeAt': {
|
||||
$exists: true,
|
||||
$lt: cutoffTime,
|
||||
},
|
||||
},
|
||||
options
|
||||
)
|
||||
return cursor
|
||||
}
|
||||
|
||||
// List projects that have never been backed up and are older than the specified interval
|
||||
function listUninitializedBackups(timeIntervalMs = 0, limit = null) {
|
||||
const cutoffTimeInSeconds = (Date.now() - timeIntervalMs) / 1000
|
||||
const options = {
|
||||
projection: { _id: 1 },
|
||||
sort: { _id: 1 },
|
||||
}
|
||||
// Apply limit if provided
|
||||
if (limit) {
|
||||
options.limit = limit
|
||||
}
|
||||
const cursor = projects.find(
|
||||
{
|
||||
'overleaf.backup.lastBackedUpVersion': null,
|
||||
_id: {
|
||||
$lt: ObjectId.createFromTime(cutoffTimeInSeconds),
|
||||
},
|
||||
},
|
||||
options
|
||||
)
|
||||
return cursor
|
||||
}
|
||||
|
||||
// Retrieve the history ID for a given project without giving direct access to the
|
||||
// projects collection.
|
||||
|
||||
async function getHistoryId(projectId) {
|
||||
const project = await projects.findOne(
|
||||
{ _id: new ObjectId(projectId) },
|
||||
{
|
||||
projection: {
|
||||
'overleaf.history.id': 1,
|
||||
},
|
||||
}
|
||||
)
|
||||
if (!project) {
|
||||
throw new Error('Project not found')
|
||||
}
|
||||
return project.overleaf.history.id
|
||||
}
|
||||
|
||||
async function getBackupStatus(projectId) {
|
||||
const project = await projects.findOne(
|
||||
{ _id: new ObjectId(projectId) },
|
||||
{
|
||||
projection: {
|
||||
'overleaf.history': 1,
|
||||
'overleaf.backup': 1,
|
||||
},
|
||||
}
|
||||
)
|
||||
if (!project) {
|
||||
throw new Error('Project not found')
|
||||
}
|
||||
return {
|
||||
backupStatus: project.overleaf.backup,
|
||||
historyId: `${project.overleaf.history.id}`,
|
||||
currentEndVersion: project.overleaf.history.currentEndVersion,
|
||||
currentEndTimestamp: project.overleaf.history.currentEndTimestamp,
|
||||
}
|
||||
}
|
||||
|
||||
async function setBackupVersion(
|
||||
projectId,
|
||||
previousBackedUpVersion,
|
||||
currentBackedUpVersion,
|
||||
currentBackedUpAt
|
||||
) {
|
||||
// FIXME: include a check to handle race conditions
|
||||
// to make sure only one process updates the version numbers
|
||||
const result = await projects.updateOne(
|
||||
{
|
||||
_id: new ObjectId(projectId),
|
||||
'overleaf.backup.lastBackedUpVersion': previousBackedUpVersion,
|
||||
},
|
||||
{
|
||||
$set: {
|
||||
'overleaf.backup.lastBackedUpVersion': currentBackedUpVersion,
|
||||
'overleaf.backup.lastBackedUpAt': currentBackedUpAt,
|
||||
},
|
||||
}
|
||||
)
|
||||
if (result.matchedCount === 0 || result.modifiedCount === 0) {
|
||||
throw new OError('Failed to update backup version', {
|
||||
previousBackedUpVersion,
|
||||
currentBackedUpVersion,
|
||||
currentBackedUpAt,
|
||||
result,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
async function updateCurrentMetadataIfNotSet(projectId, latestChunkMetadata) {
|
||||
await projects.updateOne(
|
||||
{
|
||||
_id: new ObjectId(projectId),
|
||||
'overleaf.history.currentEndVersion': { $exists: false },
|
||||
'overleaf.history.currentEndTimestamp': { $exists: false },
|
||||
},
|
||||
{
|
||||
$set: {
|
||||
'overleaf.history.currentEndVersion': latestChunkMetadata.endVersion,
|
||||
'overleaf.history.currentEndTimestamp':
|
||||
latestChunkMetadata.endTimestamp,
|
||||
},
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the pending change timestamp for a project's backup status
|
||||
* @param {string} projectId - The ID of the project to update
|
||||
* @param {Date} backupStartTime - The timestamp to set for pending changes
|
||||
* @returns {Promise<void>}
|
||||
*
|
||||
* If the project's last backed up version matches the current end version,
|
||||
* the pending change timestamp is removed. Otherwise, it's set to the provided
|
||||
* backup start time.
|
||||
*/
|
||||
async function updatePendingChangeTimestamp(projectId, backupStartTime) {
|
||||
await projects.updateOne({ _id: new ObjectId(projectId) }, [
|
||||
{
|
||||
$set: {
|
||||
'overleaf.backup.pendingChangeAt': {
|
||||
$cond: {
|
||||
if: {
|
||||
$eq: [
|
||||
'$overleaf.backup.lastBackedUpVersion',
|
||||
'$overleaf.history.currentEndVersion',
|
||||
],
|
||||
},
|
||||
then: '$$REMOVE',
|
||||
else: backupStartTime,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
])
|
||||
}
|
||||
|
||||
async function getBackedUpBlobHashes(projectId) {
|
||||
const result = await backedUpBlobs.findOne(
|
||||
{ _id: new ObjectId(projectId) },
|
||||
{ projection: { blobs: 1 } }
|
||||
)
|
||||
if (!result) {
|
||||
return new Set()
|
||||
}
|
||||
const hashes = result.blobs.map(b => b.buffer.toString('hex'))
|
||||
return new Set(hashes)
|
||||
}
|
||||
|
||||
async function unsetBackedUpBlobHashes(projectId, hashes) {
|
||||
const binaryHashes = hashes.map(h => new Binary(Buffer.from(h, 'hex')))
|
||||
const result = await backedUpBlobs.findOneAndUpdate(
|
||||
{ _id: new ObjectId(projectId) },
|
||||
{
|
||||
$pullAll: {
|
||||
blobs: binaryHashes,
|
||||
},
|
||||
},
|
||||
{ returnDocument: 'after' }
|
||||
)
|
||||
if (result && result.blobs.length === 0) {
|
||||
await backedUpBlobs.deleteOne({
|
||||
_id: new ObjectId(projectId),
|
||||
blobs: { $size: 0 },
|
||||
})
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
getHistoryId,
|
||||
getBackupStatus,
|
||||
setBackupVersion,
|
||||
updateCurrentMetadataIfNotSet,
|
||||
updatePendingChangeTimestamp,
|
||||
listPendingBackups,
|
||||
listUninitializedBackups,
|
||||
getBackedUpBlobHashes,
|
||||
unsetBackedUpBlobHashes,
|
||||
}
|
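A hedged sketch of the intended update flow for the helpers above (the concurrency caveat mirrors the FIXME in setBackupVersion):

// sketch: advance a project's backup pointer after a successful backup run
const { getBackupStatus, setBackupVersion } = require('./backup_store')

async function markBackupComplete(projectId) {
  const { backupStatus, currentEndVersion } = await getBackupStatus(projectId)
  const previousVersion = backupStatus?.lastBackedUpVersion ?? null
  // throws an OError if nothing matched, e.g. another process already
  // advanced lastBackedUpVersion past previousVersion
  await setBackupVersion(projectId, previousVersion, currentEndVersion, new Date())
}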
40
services/history-v1/storage/lib/batch_blob_store.js
Normal file
@@ -0,0 +1,40 @@
|
||||
'use strict'
|
||||
|
||||
const BPromise = require('bluebird')
|
||||
|
||||
/**
|
||||
* @constructor
|
||||
* @param {BlobStore} blobStore
|
||||
* @classdesc
|
||||
* Wrapper for BlobStore that pre-fetches blob metadata to avoid making one
|
||||
* database call per blob lookup.
|
||||
*/
|
||||
function BatchBlobStore(blobStore) {
|
||||
this.blobStore = blobStore
|
||||
this.blobs = new Map()
|
||||
}
|
||||
|
||||
/**
|
||||
* Pre-fetch metadata for the given blob hashes.
|
||||
*
|
||||
* @param {Array.<string>} hashes
|
||||
* @return {Promise}
|
||||
*/
|
||||
BatchBlobStore.prototype.preload = function batchBlobStorePreload(hashes) {
|
||||
return BPromise.each(this.blobStore.getBlobs(hashes), blob => {
|
||||
this.blobs.set(blob.getHash(), blob)
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* @see BlobStore#getBlob
|
||||
*/
|
||||
BatchBlobStore.prototype.getBlob = BPromise.method(
|
||||
function batchBlobStoreGetBlob(hash) {
|
||||
const blob = this.blobs.get(hash)
|
||||
if (blob) return blob
|
||||
return this.blobStore.getBlob(hash)
|
||||
}
|
||||
)
|
||||
|
||||
module.exports = BatchBlobStore
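Illustrative usage of the wrapper above (a sketch, not part of the commit; the require paths and IDs are assumptions): preload fetches metadata for a set of hashes in one backend query, and later getBlob calls are served from the in-memory map.

// Sketch only: batch-load blob metadata before reading individual blobs.
const { BlobStore } = require('./blob_store') // path assumed relative to storage/lib
const BatchBlobStore = require('./batch_blob_store')

async function preloadExample(projectId, hashes) {
  const batchBlobStore = new BatchBlobStore(new BlobStore(projectId))
  await batchBlobStore.preload(hashes) // one findBlobs query for all hashes
  // Preloaded hashes resolve from the cache without another database call.
  return await batchBlobStore.getBlob(hashes[0])
}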
|
80
services/history-v1/storage/lib/blob_hash.js
Normal file
@@ -0,0 +1,80 @@
|
||||
/** @module */
|
||||
'use strict'
|
||||
|
||||
const BPromise = require('bluebird')
|
||||
const fs = BPromise.promisifyAll(require('node:fs'))
|
||||
const crypto = require('node:crypto')
|
||||
const { pipeline } = require('node:stream')
|
||||
const assert = require('./assert')
|
||||
|
||||
function getGitBlobHeader(byteLength) {
|
||||
return 'blob ' + byteLength + '\x00'
|
||||
}
|
||||
|
||||
function getBlobHash(byteLength) {
|
||||
const hash = crypto.createHash('sha1')
|
||||
hash.setEncoding('hex')
|
||||
hash.update(getGitBlobHeader(byteLength))
|
||||
return hash
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the git blob hash for a blob from a readable stream of its content.
|
||||
*
|
||||
* @function
|
||||
* @param {number} byteLength
|
||||
* @param {stream.Readable} stream
|
||||
* @return {Promise.<string>} hexadecimal SHA-1 hash
|
||||
*/
|
||||
exports.fromStream = BPromise.method(
|
||||
function blobHashFromStream(byteLength, stream) {
|
||||
assert.integer(byteLength, 'blobHash: bad byteLength')
|
||||
assert.object(stream, 'blobHash: bad stream')
|
||||
|
||||
const hash = getBlobHash(byteLength)
|
||||
return new BPromise(function (resolve, reject) {
|
||||
pipeline(stream, hash, function (err) {
|
||||
if (err) {
|
||||
reject(err)
|
||||
} else {
|
||||
hash.end()
|
||||
resolve(hash.read())
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
)
|
||||
|
||||
/**
|
||||
* Compute the git blob hash for a blob with the given string content.
|
||||
*
|
||||
* @param {string} string
|
||||
* @return {string} hexadecimal SHA-1 hash
|
||||
*/
|
||||
exports.fromString = function blobHashFromString(string) {
|
||||
assert.string(string, 'blobHash: bad string')
|
||||
const hash = getBlobHash(Buffer.byteLength(string))
|
||||
hash.update(string, 'utf8')
|
||||
hash.end()
|
||||
return hash.read()
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the git blob hash for the content of a file
|
||||
*
|
||||
 * @param {string} pathname
|
||||
 * @return {Promise.<string>} hexadecimal SHA-1 hash
|
||||
*/
|
||||
exports.fromFile = function blobHashFromFile(pathname) {
|
||||
assert.string(pathname, 'blobHash: bad pathname')
|
||||
|
||||
function getByteLengthOfFile() {
|
||||
return fs.statAsync(pathname).then(stat => stat.size)
|
||||
}
|
||||
|
||||
const fromStream = this.fromStream
|
||||
return getByteLengthOfFile().then(function (byteLength) {
|
||||
const stream = fs.createReadStream(pathname)
|
||||
return fromStream(byteLength, stream)
|
||||
})
|
||||
}
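For illustration (a sketch, not part of the diff): fromString produces the same hash that git computes for the same content, for example the well-known hash of the empty blob.

// Sketch only: hashing a string the way `git hash-object` would.
const blobHash = require('./blob_hash') // path assumed relative to storage/lib

console.log(blobHash.fromString(''))
// => 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391' (git's empty-blob hash)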
|
433
services/history-v1/storage/lib/blob_store/index.js
Normal file
@@ -0,0 +1,433 @@
|
||||
'use strict'
|
||||
|
||||
const config = require('config')
|
||||
const fs = require('node:fs')
|
||||
const isValidUtf8 = require('utf-8-validate')
|
||||
const { ReadableString } = require('@overleaf/stream-utils')
|
||||
|
||||
const core = require('overleaf-editor-core')
|
||||
const objectPersistor = require('@overleaf/object-persistor')
|
||||
const OError = require('@overleaf/o-error')
|
||||
const Blob = core.Blob
|
||||
const TextOperation = core.TextOperation
|
||||
const containsNonBmpChars = core.util.containsNonBmpChars
|
||||
|
||||
const assert = require('../assert')
|
||||
const blobHash = require('../blob_hash')
|
||||
const mongodb = require('../mongodb')
|
||||
const persistor = require('../persistor')
|
||||
const projectKey = require('../project_key')
|
||||
const streams = require('../streams')
|
||||
const postgresBackend = require('./postgres')
|
||||
const mongoBackend = require('./mongo')
|
||||
const logger = require('@overleaf/logger')
|
||||
|
||||
/** @import { Readable } from 'stream' */
|
||||
|
||||
const GLOBAL_BLOBS = new Map()
|
||||
|
||||
function makeGlobalKey(hash) {
|
||||
return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
|
||||
}
|
||||
|
||||
function makeProjectKey(projectId, hash) {
|
||||
return `${projectKey.format(projectId)}/${hash.slice(0, 2)}/${hash.slice(2)}`
|
||||
}
|
||||
|
||||
async function uploadBlob(projectId, blob, stream, opts = {}) {
|
||||
const bucket = config.get('blobStore.projectBucket')
|
||||
const key = makeProjectKey(projectId, blob.getHash())
|
||||
logger.debug({ projectId, blob }, 'uploadBlob started')
|
||||
try {
|
||||
await persistor.sendStream(bucket, key, stream, {
|
||||
contentType: 'application/octet-stream',
|
||||
...opts,
|
||||
})
|
||||
} finally {
|
||||
logger.debug({ projectId, blob }, 'uploadBlob finished')
|
||||
}
|
||||
}
|
||||
|
||||
function getBlobLocation(projectId, hash) {
|
||||
if (GLOBAL_BLOBS.has(hash)) {
|
||||
return {
|
||||
bucket: config.get('blobStore.globalBucket'),
|
||||
key: makeGlobalKey(hash),
|
||||
}
|
||||
} else {
|
||||
return {
|
||||
bucket: config.get('blobStore.projectBucket'),
|
||||
key: makeProjectKey(projectId, hash),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the appropriate backend for the given project id
|
||||
*
|
||||
* Numeric ids use the Postgres backend.
|
||||
* Strings of 24 characters use the Mongo backend.
|
||||
*/
|
||||
function getBackend(projectId) {
|
||||
if (assert.POSTGRES_ID_REGEXP.test(projectId)) {
|
||||
return postgresBackend
|
||||
} else if (assert.MONGO_ID_REGEXP.test(projectId)) {
|
||||
return mongoBackend
|
||||
} else {
|
||||
throw new OError('bad project id', { projectId })
|
||||
}
|
||||
}
|
||||
|
||||
async function makeBlobForFile(pathname) {
|
||||
const { size: byteLength } = await fs.promises.stat(pathname)
|
||||
const hash = await blobHash.fromStream(
|
||||
byteLength,
|
||||
fs.createReadStream(pathname)
|
||||
)
|
||||
return new Blob(hash, byteLength)
|
||||
}
|
||||
|
||||
async function getStringLengthOfFile(byteLength, pathname) {
|
||||
// We have to read the file into memory to get its UTF-8 length, so don't
|
||||
// bother for files that are too large for us to edit anyway.
|
||||
if (byteLength > Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
|
||||
return null
|
||||
}
|
||||
|
||||
// We need to check if the file contains nonBmp or null characters
|
||||
let data = await fs.promises.readFile(pathname)
|
||||
if (!isValidUtf8(data)) return null
|
||||
data = data.toString()
|
||||
if (data.length > TextOperation.MAX_STRING_LENGTH) return null
|
||||
if (containsNonBmpChars(data)) return null
|
||||
if (data.indexOf('\x00') !== -1) return null
|
||||
return data.length
|
||||
}
|
||||
|
||||
async function deleteBlobsInBucket(projectId) {
|
||||
const bucket = config.get('blobStore.projectBucket')
|
||||
const prefix = `${projectKey.format(projectId)}/`
|
||||
logger.debug({ projectId }, 'deleteBlobsInBucket started')
|
||||
try {
|
||||
await persistor.deleteDirectory(bucket, prefix)
|
||||
} finally {
|
||||
logger.debug({ projectId }, 'deleteBlobsInBucket finished')
|
||||
}
|
||||
}
|
||||
|
||||
async function loadGlobalBlobs() {
|
||||
const blobs = await mongodb.globalBlobs.find()
|
||||
for await (const blob of blobs) {
|
||||
GLOBAL_BLOBS.set(blob._id, {
|
||||
blob: new Blob(blob._id, blob.byteLength, blob.stringLength),
|
||||
demoted: Boolean(blob.demoted),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
 * Return metadata for all blobs in the given projects
|
||||
* @param {Array<string|number>} projectIds
|
||||
* @return {Promise<{nBlobs:number, blobs:Map<string,Array<core.Blob>>}>}
|
||||
*/
|
||||
async function getProjectBlobsBatch(projectIds) {
|
||||
const mongoProjects = []
|
||||
const postgresProjects = []
|
||||
for (const projectId of projectIds) {
|
||||
if (typeof projectId === 'number') {
|
||||
postgresProjects.push(projectId)
|
||||
} else {
|
||||
mongoProjects.push(projectId)
|
||||
}
|
||||
}
|
||||
const [
|
||||
{ nBlobs: nBlobsPostgres, blobs: blobsPostgres },
|
||||
{ nBlobs: nBlobsMongo, blobs: blobsMongo },
|
||||
] = await Promise.all([
|
||||
postgresBackend.getProjectBlobsBatch(postgresProjects),
|
||||
mongoBackend.getProjectBlobsBatch(mongoProjects),
|
||||
])
|
||||
for (const [id, blobs] of blobsPostgres.entries()) {
|
||||
blobsMongo.set(id.toString(), blobs)
|
||||
}
|
||||
return { nBlobs: nBlobsPostgres + nBlobsMongo, blobs: blobsMongo }
|
||||
}
|
||||
|
||||
/**
|
||||
* @classdesc
|
||||
* Fetch and store the content of files using content-addressable hashing. The
|
||||
* blob store manages both content and metadata (byte and UTF-8 length) for
|
||||
* blobs.
|
||||
*/
|
||||
class BlobStore {
|
||||
/**
|
||||
* @constructor
|
||||
* @param {string} projectId the project for which we'd like to find blobs
|
||||
*/
|
||||
constructor(projectId) {
|
||||
assert.projectId(projectId)
|
||||
this.projectId = projectId
|
||||
this.backend = getBackend(this.projectId)
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up the initial data structure for a given project
|
||||
*/
|
||||
async initialize() {
|
||||
await this.backend.initialize(this.projectId)
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a blob, if one does not already exist, with the given UTF-8 encoded
|
||||
* string content.
|
||||
*
|
||||
* @param {string} string
|
||||
* @return {Promise.<core.Blob>}
|
||||
*/
|
||||
async putString(string) {
|
||||
assert.string(string, 'bad string')
|
||||
const hash = blobHash.fromString(string)
|
||||
|
||||
const existingBlob = await this._findBlobBeforeInsert(hash)
|
||||
if (existingBlob != null) {
|
||||
return existingBlob
|
||||
}
|
||||
const newBlob = new Blob(hash, Buffer.byteLength(string), string.length)
|
||||
// Note: the ReadableString is to work around a bug in the AWS SDK: it won't
|
||||
// allow Body to be blank.
|
||||
await uploadBlob(this.projectId, newBlob, new ReadableString(string))
|
||||
await this.backend.insertBlob(this.projectId, newBlob)
|
||||
return newBlob
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a blob, if one does not already exist, with the given file (usually a
|
||||
* temporary file).
|
||||
*
|
||||
* @param {string} pathname
|
||||
* @return {Promise<core.Blob>}
|
||||
*/
|
||||
async putFile(pathname) {
|
||||
assert.string(pathname, 'bad pathname')
|
||||
const newBlob = await makeBlobForFile(pathname)
|
||||
const existingBlob = await this._findBlobBeforeInsert(newBlob.getHash())
|
||||
if (existingBlob != null) {
|
||||
return existingBlob
|
||||
}
|
||||
const stringLength = await getStringLengthOfFile(
|
||||
newBlob.getByteLength(),
|
||||
pathname
|
||||
)
|
||||
newBlob.setStringLength(stringLength)
|
||||
await this.putBlob(pathname, newBlob)
|
||||
return newBlob
|
||||
}
|
||||
|
||||
/**
|
||||
   * Write a new blob. The stringLength must have been set already, and it should
|
||||
* have been checked that the blob does not exist yet. Consider using
|
||||
* {@link putFile} instead of this lower-level method.
|
||||
*
|
||||
* @param {string} pathname
|
||||
   * @param {core.Blob} finalizedBlob
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
  async putBlob(pathname, finalizedBlob) {
|
||||
await uploadBlob(
|
||||
this.projectId,
|
||||
      finalizedBlob,
|
||||
fs.createReadStream(pathname)
|
||||
)
|
||||
    await this.backend.insertBlob(this.projectId, finalizedBlob)
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores an object as a JSON string in a blob.
|
||||
*
|
||||
* @param {object} obj
|
||||
* @returns {Promise.<core.Blob>}
|
||||
*/
|
||||
async putObject(obj) {
|
||||
assert.object(obj, 'bad object')
|
||||
const string = JSON.stringify(obj)
|
||||
return await this.putString(string)
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Fetch a blob's content by its hash as a UTF-8 encoded string.
|
||||
*
|
||||
* @param {string} hash hexadecimal SHA-1 hash
|
||||
* @return {Promise.<string>} promise for the content of the file
|
||||
*/
|
||||
async getString(hash) {
|
||||
assert.blobHash(hash, 'bad hash')
|
||||
|
||||
const projectId = this.projectId
|
||||
logger.debug({ projectId, hash }, 'getString started')
|
||||
try {
|
||||
const stream = await this.getStream(hash)
|
||||
const buffer = await streams.readStreamToBuffer(stream)
|
||||
return buffer.toString()
|
||||
} finally {
|
||||
logger.debug({ projectId, hash }, 'getString finished')
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch a JSON encoded blob by its hash and deserialize it.
|
||||
*
|
||||
* @template [T=unknown]
|
||||
* @param {string} hash hexadecimal SHA-1 hash
|
||||
* @return {Promise.<T>} promise for the content of the file
|
||||
*/
|
||||
async getObject(hash) {
|
||||
assert.blobHash(hash, 'bad hash')
|
||||
const projectId = this.projectId
|
||||
logger.debug({ projectId, hash }, 'getObject started')
|
||||
try {
|
||||
const jsonString = await this.getString(hash)
|
||||
const object = JSON.parse(jsonString)
|
||||
return object
|
||||
} catch (error) {
|
||||
      // Maybe this blob is gzipped. Try to gunzip it.
|
||||
// TODO: Remove once we've ensured this is not reached
|
||||
const stream = await this.getStream(hash)
|
||||
const buffer = await streams.gunzipStreamToBuffer(stream)
|
||||
const object = JSON.parse(buffer.toString())
|
||||
logger.warn('getObject: Gzipped object in BlobStore')
|
||||
return object
|
||||
} finally {
|
||||
logger.debug({ projectId, hash }, 'getObject finished')
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch a blob by its hash as a stream.
|
||||
*
|
||||
* Note that, according to the AWS SDK docs, this does not retry after initial
|
||||
* failure, so the caller must be prepared to retry on errors, if appropriate.
|
||||
*
|
||||
* @param {string} hash hexadecimal SHA-1 hash
|
||||
* @param {Object} opts
|
||||
* @return {Promise.<Readable>} a stream to read the file
|
||||
*/
|
||||
async getStream(hash, opts = {}) {
|
||||
assert.blobHash(hash, 'bad hash')
|
||||
|
||||
const { bucket, key } = getBlobLocation(this.projectId, hash)
|
||||
try {
|
||||
const stream = await persistor.getObjectStream(bucket, key, opts)
|
||||
return stream
|
||||
} catch (err) {
|
||||
if (err instanceof objectPersistor.Errors.NotFoundError) {
|
||||
throw new Blob.NotFoundError(hash)
|
||||
}
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a blob metadata record by hexadecimal hash.
|
||||
*
|
||||
* @param {string} hash hexadecimal SHA-1 hash
|
||||
* @return {Promise<core.Blob | null>}
|
||||
*/
|
||||
async getBlob(hash) {
|
||||
assert.blobHash(hash, 'bad hash')
|
||||
const globalBlob = GLOBAL_BLOBS.get(hash)
|
||||
if (globalBlob != null) {
|
||||
return globalBlob.blob
|
||||
}
|
||||
const blob = await this.backend.findBlob(this.projectId, hash)
|
||||
return blob
|
||||
}
|
||||
|
||||
async getBlobs(hashes) {
|
||||
assert.array(hashes, 'bad hashes')
|
||||
const nonGlobalHashes = []
|
||||
const blobs = []
|
||||
for (const hash of hashes) {
|
||||
const globalBlob = GLOBAL_BLOBS.get(hash)
|
||||
if (globalBlob != null) {
|
||||
blobs.push(globalBlob.blob)
|
||||
} else {
|
||||
nonGlobalHashes.push(hash)
|
||||
}
|
||||
}
|
||||
if (nonGlobalHashes.length === 0) {
|
||||
return blobs // to avoid unnecessary database lookup
|
||||
}
|
||||
const projectBlobs = await this.backend.findBlobs(
|
||||
this.projectId,
|
||||
nonGlobalHashes
|
||||
)
|
||||
blobs.push(...projectBlobs)
|
||||
return blobs
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve all blobs associated with the project.
|
||||
* @returns {Promise<core.Blob[]>} A promise that resolves to an array of blobs.
|
||||
*/
|
||||
|
||||
async getProjectBlobs() {
|
||||
const projectBlobs = await this.backend.getProjectBlobs(this.projectId)
|
||||
return projectBlobs
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all blobs that belong to the project.
|
||||
*/
|
||||
async deleteBlobs() {
|
||||
await Promise.all([
|
||||
this.backend.deleteBlobs(this.projectId),
|
||||
deleteBlobsInBucket(this.projectId),
|
||||
])
|
||||
}
|
||||
|
||||
async _findBlobBeforeInsert(hash) {
|
||||
const globalBlob = GLOBAL_BLOBS.get(hash)
|
||||
if (globalBlob != null && !globalBlob.demoted) {
|
||||
return globalBlob.blob
|
||||
}
|
||||
const blob = await this.backend.findBlob(this.projectId, hash)
|
||||
return blob
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy an existing sourceBlob in this project to a target project.
|
||||
* @param {Blob} sourceBlob
|
||||
* @param {string} targetProjectId
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async copyBlob(sourceBlob, targetProjectId) {
|
||||
assert.instance(sourceBlob, Blob, 'bad sourceBlob')
|
||||
assert.projectId(targetProjectId, 'bad targetProjectId')
|
||||
const hash = sourceBlob.getHash()
|
||||
const sourceProjectId = this.projectId
|
||||
const { bucket, key: sourceKey } = getBlobLocation(sourceProjectId, hash)
|
||||
const destKey = makeProjectKey(targetProjectId, hash)
|
||||
const targetBackend = getBackend(targetProjectId)
|
||||
logger.debug({ sourceProjectId, targetProjectId, hash }, 'copyBlob started')
|
||||
try {
|
||||
await persistor.copyObject(bucket, sourceKey, destKey)
|
||||
await targetBackend.insertBlob(targetProjectId, sourceBlob)
|
||||
} finally {
|
||||
logger.debug(
|
||||
{ sourceProjectId, targetProjectId, hash },
|
||||
'copyBlob finished'
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
BlobStore,
|
||||
getProjectBlobsBatch,
|
||||
loadGlobalBlobs,
|
||||
makeProjectKey,
|
||||
makeBlobForFile,
|
||||
getStringLengthOfFile,
|
||||
GLOBAL_BLOBS,
|
||||
}
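A minimal usage sketch of the BlobStore class above (assumptions: global blobs are loaded at startup and projectId refers to an already initialized project; the require path is illustrative):

// Sketch only: store a string blob and read it back by hash.
const { BlobStore, loadGlobalBlobs } = require('./blob_store') // path assumed

async function roundTrip(projectId) {
  await loadGlobalBlobs() // populate GLOBAL_BLOBS before first use
  const blobStore = new BlobStore(projectId)
  const blob = await blobStore.putString('Hello history-v1')
  return await blobStore.getString(blob.getHash()) // 'Hello history-v1'
}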
|
437
services/history-v1/storage/lib/blob_store/mongo.js
Normal file
@@ -0,0 +1,437 @@
|
||||
// @ts-check
|
||||
/**
|
||||
* Mongo backend for the blob store.
|
||||
*
|
||||
* Blobs are stored in the projectHistoryBlobs collection. Each project has a
|
||||
* document in that collection. That document has a "blobs" subdocument whose
|
||||
* fields are buckets of blobs. The key of a bucket is the first three hex
|
||||
* digits of the blob hash. The value of the bucket is an array of blobs that
|
||||
* match the key.
|
||||
*
|
||||
* Buckets have a maximum capacity of 8 blobs. When that capacity is exceeded,
|
||||
* blobs are stored in a secondary collection: the projectHistoryShardedBlobs
|
||||
* collection. This collection shards blobs between 16 documents per project.
|
||||
* The shard key is the first hex digit of the hash. The documents are also
|
||||
* organized in buckets, but the bucket key is made of hex digits 2, 3 and 4.
|
||||
*/
|
||||
|
||||
const { Blob } = require('overleaf-editor-core')
|
||||
const { ObjectId, Binary, MongoError, ReadPreference } = require('mongodb')
|
||||
const assert = require('../assert')
|
||||
const mongodb = require('../mongodb')
|
||||
|
||||
const MAX_BLOBS_IN_BUCKET = 8
|
||||
const DUPLICATE_KEY_ERROR_CODE = 11000
|
||||
|
||||
/**
|
||||
* @typedef {import('mongodb').ReadPreferenceLike} ReadPreferenceLike
|
||||
*/
|
||||
|
||||
/**
|
||||
* Set up the data structures for a given project.
|
||||
* @param {string} projectId
|
||||
*/
|
||||
async function initialize(projectId) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
try {
|
||||
await mongodb.blobs.insertOne({
|
||||
_id: new ObjectId(projectId),
|
||||
blobs: {},
|
||||
})
|
||||
} catch (err) {
|
||||
if (err instanceof MongoError && err.code === DUPLICATE_KEY_ERROR_CODE) {
|
||||
return // ignore already initialized case
|
||||
}
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return blob metadata for the given project and hash.
|
||||
* @param {string} projectId
|
||||
* @param {string} hash
|
||||
* @return {Promise<Blob | null>}
|
||||
*/
|
||||
async function findBlob(projectId, hash) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
assert.blobHash(hash, 'bad hash')
|
||||
|
||||
const bucket = getBucket(hash)
|
||||
const result = await mongodb.blobs.findOne(
|
||||
{ _id: new ObjectId(projectId) },
|
||||
{ projection: { _id: 0, bucket: `$${bucket}` } }
|
||||
)
|
||||
|
||||
if (result?.bucket == null) {
|
||||
return null
|
||||
}
|
||||
|
||||
const record = result.bucket.find(blob => blob.h.toString('hex') === hash)
|
||||
if (record == null) {
|
||||
if (result.bucket.length >= MAX_BLOBS_IN_BUCKET) {
|
||||
return await findBlobSharded(projectId, hash)
|
||||
} else {
|
||||
return null
|
||||
}
|
||||
}
|
||||
return recordToBlob(record)
|
||||
}
|
||||
|
||||
/**
|
||||
* Search in the sharded collection for blob metadata
|
||||
* @param {string} projectId
|
||||
* @param {string} hash
|
||||
* @return {Promise<Blob | null>}
|
||||
*/
|
||||
async function findBlobSharded(projectId, hash) {
|
||||
const [shard, bucket] = getShardedBucket(hash)
|
||||
const id = makeShardedId(projectId, shard)
|
||||
const result = await mongodb.shardedBlobs.findOne(
|
||||
{ _id: id },
|
||||
{ projection: { _id: 0, blobs: `$${bucket}` } }
|
||||
)
|
||||
if (result?.blobs == null) {
|
||||
return null
|
||||
}
|
||||
const record = result.blobs.find(blob => blob.h.toString('hex') === hash)
|
||||
if (!record) return null
|
||||
return recordToBlob(record)
|
||||
}
|
||||
|
||||
/**
|
||||
* Read multiple blob metadata records by hexadecimal hashes.
|
||||
* @param {string} projectId
|
||||
* @param {Array<string>} hashes
|
||||
* @return {Promise<Array<Blob>>}
|
||||
*/
|
||||
async function findBlobs(projectId, hashes) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
assert.array(hashes, 'bad hashes: not array')
|
||||
hashes.forEach(function (hash) {
|
||||
assert.blobHash(hash, 'bad hash')
|
||||
})
|
||||
|
||||
// Build a set of unique buckets
|
||||
const buckets = new Set(hashes.map(getBucket))
|
||||
|
||||
// Get buckets from Mongo
|
||||
const projection = { _id: 0 }
|
||||
for (const bucket of buckets) {
|
||||
projection[bucket] = 1
|
||||
}
|
||||
const result = await mongodb.blobs.findOne(
|
||||
{ _id: new ObjectId(projectId) },
|
||||
{ projection }
|
||||
)
|
||||
|
||||
if (result?.blobs == null) {
|
||||
return []
|
||||
}
|
||||
|
||||
// Build blobs from the query results
|
||||
const hashSet = new Set(hashes)
|
||||
const blobs = []
|
||||
for (const bucket of Object.values(result.blobs)) {
|
||||
for (const record of bucket) {
|
||||
const hash = record.h.toString('hex')
|
||||
if (hashSet.has(hash)) {
|
||||
blobs.push(recordToBlob(record))
|
||||
hashSet.delete(hash)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we haven't found all the blobs, look in the sharded collection
|
||||
if (hashSet.size > 0) {
|
||||
const shardedBlobs = await findBlobsSharded(projectId, hashSet)
|
||||
blobs.push(...shardedBlobs)
|
||||
}
|
||||
|
||||
return blobs
|
||||
}
|
||||
|
||||
/**
|
||||
* Search in the sharded collection for blob metadata.
|
||||
* @param {string} projectId
|
||||
* @param {Set<string>} hashSet
|
||||
* @return {Promise<Array<Blob>>}
|
||||
*/
|
||||
async function findBlobsSharded(projectId, hashSet) {
|
||||
// Build a map of buckets by shard key
|
||||
const bucketsByShard = new Map()
|
||||
for (const hash of hashSet) {
|
||||
const [shard, bucket] = getShardedBucket(hash)
|
||||
let buckets = bucketsByShard.get(shard)
|
||||
if (buckets == null) {
|
||||
buckets = new Set()
|
||||
bucketsByShard.set(shard, buckets)
|
||||
}
|
||||
buckets.add(bucket)
|
||||
}
|
||||
|
||||
// Make parallel requests to the shards that might contain the hashes we want
|
||||
const requests = []
|
||||
for (const [shard, buckets] of bucketsByShard.entries()) {
|
||||
const id = makeShardedId(projectId, shard)
|
||||
const projection = { _id: 0 }
|
||||
for (const bucket of buckets) {
|
||||
projection[bucket] = 1
|
||||
}
|
||||
const request = mongodb.shardedBlobs.findOne({ _id: id }, { projection })
|
||||
requests.push(request)
|
||||
}
|
||||
const results = await Promise.all(requests)
|
||||
|
||||
// Build blobs from the query results
|
||||
const blobs = []
|
||||
for (const result of results) {
|
||||
if (result?.blobs == null) {
|
||||
continue
|
||||
}
|
||||
|
||||
for (const bucket of Object.values(result.blobs)) {
|
||||
for (const record of bucket) {
|
||||
const hash = record.h.toString('hex')
|
||||
if (hashSet.has(hash)) {
|
||||
blobs.push(recordToBlob(record))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return blobs
|
||||
}
|
||||
|
||||
/**
|
||||
* Return metadata for all blobs in the given project
|
||||
*/
|
||||
async function getProjectBlobs(projectId) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
|
||||
const result = await mongodb.blobs.findOne(
|
||||
{ _id: new ObjectId(projectId) },
|
||||
{ projection: { _id: 0 } }
|
||||
)
|
||||
|
||||
if (!result) {
|
||||
return []
|
||||
}
|
||||
|
||||
// Build blobs from the query results
|
||||
const blobs = []
|
||||
for (const bucket of Object.values(result.blobs)) {
|
||||
for (const record of bucket) {
|
||||
blobs.push(recordToBlob(record))
|
||||
}
|
||||
}
|
||||
|
||||
// Look for all possible sharded blobs
|
||||
|
||||
const minShardedId = makeShardedId(projectId, '0')
|
||||
const maxShardedId = makeShardedId(projectId, 'f')
|
||||
// @ts-ignore We are using a custom _id here.
|
||||
const shardedRecords = mongodb.shardedBlobs.find(
|
||||
{
|
||||
_id: { $gte: minShardedId, $lte: maxShardedId },
|
||||
},
|
||||
{ projection: { _id: 0 } }
|
||||
)
|
||||
|
||||
for await (const shardedRecord of shardedRecords) {
|
||||
if (shardedRecord.blobs == null) {
|
||||
continue
|
||||
}
|
||||
for (const bucket of Object.values(shardedRecord.blobs)) {
|
||||
for (const record of bucket) {
|
||||
blobs.push(recordToBlob(record))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return blobs
|
||||
}
|
||||
|
||||
/**
|
||||
 * Return metadata for all blobs in the given projects
|
||||
* @param {Array<string>} projectIds
|
||||
* @return {Promise<{ nBlobs: number, blobs: Map<string, Array<Blob>> }>}
|
||||
*/
|
||||
async function getProjectBlobsBatch(projectIds) {
|
||||
for (const project of projectIds) {
|
||||
assert.mongoId(project, 'bad projectId')
|
||||
}
|
||||
let nBlobs = 0
|
||||
const blobs = new Map()
|
||||
if (projectIds.length === 0) return { nBlobs, blobs }
|
||||
|
||||
// blobs
|
||||
{
|
||||
const cursor = await mongodb.blobs.find(
|
||||
{ _id: { $in: projectIds.map(projectId => new ObjectId(projectId)) } },
|
||||
{ readPreference: ReadPreference.secondaryPreferred }
|
||||
)
|
||||
for await (const record of cursor) {
|
||||
const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
|
||||
blobs.set(record._id.toString(), projectBlobs)
|
||||
nBlobs += projectBlobs.length
|
||||
}
|
||||
}
|
||||
|
||||
// sharded blobs
|
||||
{
|
||||
// @ts-ignore We are using a custom _id here.
|
||||
const cursor = await mongodb.shardedBlobs.find(
|
||||
{
|
||||
_id: {
|
||||
$gte: makeShardedId(projectIds[0], '0'),
|
||||
$lte: makeShardedId(projectIds[projectIds.length - 1], 'f'),
|
||||
},
|
||||
},
|
||||
{ readPreference: ReadPreference.secondaryPreferred }
|
||||
)
|
||||
for await (const record of cursor) {
|
||||
const recordIdHex = record._id.toString('hex')
|
||||
const recordProjectId = recordIdHex.slice(0, 24)
|
||||
const projectBlobs = Object.values(record.blobs).flat().map(recordToBlob)
|
||||
const found = blobs.get(recordProjectId)
|
||||
if (found) {
|
||||
found.push(...projectBlobs)
|
||||
} else {
|
||||
blobs.set(recordProjectId, projectBlobs)
|
||||
}
|
||||
nBlobs += projectBlobs.length
|
||||
}
|
||||
}
|
||||
return { nBlobs, blobs }
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a blob's metadata to the blobs collection after it has been uploaded.
|
||||
* @param {string} projectId
|
||||
* @param {Blob} blob
|
||||
*/
|
||||
async function insertBlob(projectId, blob) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
const hash = blob.getHash()
|
||||
const bucket = getBucket(hash)
|
||||
const record = blobToRecord(blob)
|
||||
const result = await mongodb.blobs.updateOne(
|
||||
{
|
||||
_id: new ObjectId(projectId),
|
||||
$expr: {
|
||||
$lt: [{ $size: { $ifNull: [`$${bucket}`, []] } }, MAX_BLOBS_IN_BUCKET],
|
||||
},
|
||||
},
|
||||
{
|
||||
$addToSet: { [bucket]: record },
|
||||
}
|
||||
)
|
||||
|
||||
if (result.matchedCount === 0) {
|
||||
await insertRecordSharded(projectId, hash, record)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a blob's metadata to the sharded blobs collection.
|
||||
* @param {string} projectId
|
||||
* @param {string} hash
|
||||
* @param {Record} record
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function insertRecordSharded(projectId, hash, record) {
|
||||
const [shard, bucket] = getShardedBucket(hash)
|
||||
const id = makeShardedId(projectId, shard)
|
||||
await mongodb.shardedBlobs.updateOne(
|
||||
{ _id: id },
|
||||
{ $addToSet: { [bucket]: record } },
|
||||
{ upsert: true }
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all blobs for a given project.
|
||||
* @param {string} projectId
|
||||
*/
|
||||
async function deleteBlobs(projectId) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
await mongodb.blobs.deleteOne({ _id: new ObjectId(projectId) })
|
||||
const minShardedId = makeShardedId(projectId, '0')
|
||||
const maxShardedId = makeShardedId(projectId, 'f')
|
||||
await mongodb.shardedBlobs.deleteMany({
|
||||
// @ts-ignore We are using a custom _id here.
|
||||
_id: { $gte: minShardedId, $lte: maxShardedId },
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the Mongo path to the bucket for the given hash.
|
||||
* @param {string} hash
|
||||
* @return {string}
|
||||
*/
|
||||
function getBucket(hash) {
|
||||
return `blobs.${hash.slice(0, 3)}`
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the shard key and Mongo path to the bucket for the given hash in the
|
||||
* sharded collection.
|
||||
* @param {string} hash
|
||||
* @return {[string, string]}
|
||||
*/
|
||||
function getShardedBucket(hash) {
|
||||
const shard = hash.slice(0, 1)
|
||||
const bucket = `blobs.${hash.slice(1, 4)}`
|
||||
return [shard, bucket]
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an _id key for the sharded collection.
|
||||
* @param {string} projectId
|
||||
* @param {string} shard
|
||||
* @return {Binary}
|
||||
*/
|
||||
function makeShardedId(projectId, shard) {
|
||||
return new Binary(Buffer.from(`${projectId}0${shard}`, 'hex'))
|
||||
}
|
||||
|
||||
/**
|
||||
* @typedef {Object} Record
|
||||
* @property {Binary} h
|
||||
* @property {number} b
|
||||
* @property {number} [s]
|
||||
*/
|
||||
|
||||
/**
|
||||
* Return the Mongo record for the given blob.
|
||||
* @param {Blob} blob
|
||||
* @return {Record}
|
||||
*/
|
||||
function blobToRecord(blob) {
|
||||
const hash = blob.getHash()
|
||||
const byteLength = blob.getByteLength()
|
||||
const stringLength = blob.getStringLength()
|
||||
return {
|
||||
h: new Binary(Buffer.from(hash, 'hex')),
|
||||
b: byteLength,
|
||||
s: stringLength,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a blob from the given Mongo record.
|
||||
* @param {Record} record
|
||||
* @return {Blob}
|
||||
*/
|
||||
function recordToBlob(record) {
|
||||
return new Blob(record.h.toString('hex'), record.b, record.s)
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
initialize,
|
||||
findBlob,
|
||||
findBlobs,
|
||||
getProjectBlobs,
|
||||
getProjectBlobsBatch,
|
||||
insertBlob,
|
||||
deleteBlobs,
|
||||
}
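To make the bucket layout described at the top of this file concrete, here is a small sketch of the key derivation (it mirrors the private getBucket, getShardedBucket and makeShardedId helpers above; the hash is a made-up example):

// Sketch only: how a blob hash maps to its bucket and sharded document.
const hash = 'a1b2c3d4e5f60718293a4b5c6d7e8f9012345678' // hypothetical SHA-1
const bucket = `blobs.${hash.slice(0, 3)}`         // primary bucket: 'blobs.a1b'
const shard = hash.slice(0, 1)                     // shard key: 'a'
const shardedBucket = `blobs.${hash.slice(1, 4)}`  // sharded bucket: 'blobs.1b2'
// The sharded document _id is Binary(hex bytes of projectId + '0' + shard),
// which gives 16 sharded documents per project.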
|
161
services/history-v1/storage/lib/blob_store/postgres.js
Normal file
@@ -0,0 +1,161 @@
|
||||
const { Blob } = require('overleaf-editor-core')
|
||||
const assert = require('../assert')
|
||||
const knex = require('../knex')
|
||||
|
||||
/**
|
||||
* Set up the initial data structures for a project
|
||||
*/
|
||||
async function initialize(projectId) {
|
||||
// Nothing to do for Postgres
|
||||
}
|
||||
|
||||
/**
|
||||
* Return blob metadata for the given project and hash
|
||||
*/
|
||||
async function findBlob(projectId, hash) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
projectId = parseInt(projectId, 10)
|
||||
assert.blobHash(hash, 'bad hash')
|
||||
|
||||
const binaryHash = hashToBuffer(hash)
|
||||
const record = await knex('project_blobs')
|
||||
.select('hash_bytes', 'byte_length', 'string_length')
|
||||
.where({
|
||||
project_id: projectId,
|
||||
hash_bytes: binaryHash,
|
||||
})
|
||||
.first()
|
||||
return recordToBlob(record)
|
||||
}
|
||||
|
||||
/**
|
||||
* Read multiple blob metadata records by hexadecimal hashes.
|
||||
*
|
||||
* @param {Array.<string>} hashes hexadecimal SHA-1 hashes
|
||||
* @return {Promise.<Array.<Blob?>>} no guarantee on order
|
||||
*/
|
||||
async function findBlobs(projectId, hashes) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
projectId = parseInt(projectId, 10)
|
||||
assert.array(hashes, 'bad hashes: not array')
|
||||
hashes.forEach(function (hash) {
|
||||
assert.blobHash(hash, 'bad hash')
|
||||
})
|
||||
|
||||
const binaryHashes = hashes.map(hashToBuffer)
|
||||
|
||||
const records = await knex('project_blobs')
|
||||
.select('hash_bytes', 'byte_length', 'string_length')
|
||||
.where('project_id', projectId)
|
||||
.whereIn('hash_bytes', binaryHashes)
|
||||
|
||||
const blobs = records.map(recordToBlob)
|
||||
return blobs
|
||||
}
|
||||
|
||||
/**
|
||||
* Return metadata for all blobs in the given project
|
||||
*/
|
||||
async function getProjectBlobs(projectId) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
projectId = parseInt(projectId, 10)
|
||||
|
||||
const records = await knex('project_blobs')
|
||||
.select('hash_bytes', 'byte_length', 'string_length')
|
||||
.where({
|
||||
project_id: projectId,
|
||||
})
|
||||
|
||||
const blobs = records.map(recordToBlob)
|
||||
return blobs
|
||||
}
|
||||
|
||||
/**
|
||||
 * Return metadata for all blobs in the given projects
|
||||
* @param {Array<number>} projectIds
|
||||
* @return {Promise<{ nBlobs: number, blobs: Map<number, Array<Blob>> }>}
|
||||
*/
|
||||
async function getProjectBlobsBatch(projectIds) {
|
||||
for (const projectId of projectIds) {
|
||||
assert.integer(projectId, 'bad projectId')
|
||||
}
|
||||
let nBlobs = 0
|
||||
const blobs = new Map()
|
||||
if (projectIds.length === 0) return { nBlobs, blobs }
|
||||
|
||||
const cursor = knex('project_blobs')
|
||||
.select('project_id', 'hash_bytes', 'byte_length', 'string_length')
|
||||
.whereIn('project_id', projectIds)
|
||||
.stream()
|
||||
for await (const record of cursor) {
|
||||
const found = blobs.get(record.project_id)
|
||||
if (found) {
|
||||
found.push(recordToBlob(record))
|
||||
} else {
|
||||
blobs.set(record.project_id, [recordToBlob(record)])
|
||||
}
|
||||
nBlobs++
|
||||
}
|
||||
return { nBlobs, blobs }
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a blob's metadata to the blobs table after it has been uploaded.
|
||||
*/
|
||||
async function insertBlob(projectId, blob) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
projectId = parseInt(projectId, 10)
|
||||
|
||||
await knex('project_blobs')
|
||||
.insert(blobToRecord(projectId, blob))
|
||||
.onConflict(['project_id', 'hash_bytes'])
|
||||
.ignore()
|
||||
}
|
||||
|
||||
/**
|
||||
* Deletes all blobs for a given project
|
||||
*/
|
||||
async function deleteBlobs(projectId) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
projectId = parseInt(projectId, 10)
|
||||
|
||||
await knex('project_blobs').where('project_id', projectId).delete()
|
||||
}
|
||||
|
||||
function blobToRecord(projectId, blob) {
|
||||
return {
|
||||
project_id: projectId,
|
||||
hash_bytes: hashToBuffer(blob.hash),
|
||||
byte_length: blob.getByteLength(),
|
||||
string_length: blob.getStringLength(),
|
||||
}
|
||||
}
|
||||
|
||||
function recordToBlob(record) {
|
||||
if (!record) return
|
||||
return new Blob(
|
||||
hashFromBuffer(record.hash_bytes),
|
||||
record.byte_length,
|
||||
record.string_length
|
||||
)
|
||||
}
|
||||
|
||||
function hashToBuffer(hash) {
|
||||
if (!hash) return
|
||||
return Buffer.from(hash, 'hex')
|
||||
}
|
||||
|
||||
function hashFromBuffer(buffer) {
|
||||
if (!buffer) return
|
||||
return buffer.toString('hex')
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
initialize,
|
||||
findBlob,
|
||||
findBlobs,
|
||||
getProjectBlobs,
|
||||
getProjectBlobsBatch,
|
||||
insertBlob,
|
||||
deleteBlobs,
|
||||
}
|
40
services/history-v1/storage/lib/chunk_buffer/index.js
Normal file
@@ -0,0 +1,40 @@
|
||||
'use strict'
|
||||
|
||||
/**
|
||||
* @module storage/lib/chunk_buffer
|
||||
*/
|
||||
|
||||
const chunkStore = require('../chunk_store')
|
||||
const redisBackend = require('../chunk_store/redis')
|
||||
const metrics = require('@overleaf/metrics')
|
||||
/**
|
||||
* Load the latest Chunk stored for a project, including blob metadata.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @return {Promise.<Chunk>}
|
||||
*/
|
||||
async function loadLatest(projectId) {
|
||||
const cachedChunk = await redisBackend.getCurrentChunk(projectId)
|
||||
const chunkRecord = await chunkStore.loadLatestRaw(projectId)
|
||||
const cachedChunkIsValid = redisBackend.checkCacheValidityWithMetadata(
|
||||
cachedChunk,
|
||||
chunkRecord
|
||||
)
|
||||
if (cachedChunkIsValid) {
|
||||
metrics.inc('chunk_buffer.loadLatest', 1, {
|
||||
status: 'cache-hit',
|
||||
})
|
||||
return cachedChunk
|
||||
} else {
|
||||
metrics.inc('chunk_buffer.loadLatest', 1, {
|
||||
status: 'cache-miss',
|
||||
})
|
||||
const chunk = await chunkStore.loadLatest(projectId)
|
||||
await redisBackend.setCurrentChunk(projectId, chunk)
|
||||
return chunk
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
loadLatest,
|
||||
}
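Illustrative usage of the chunk buffer (a sketch; the require path is an assumption): callers read the latest chunk through the buffer and never talk to Redis directly.

// Sketch only: read-through cache for a project's latest chunk.
const chunkBuffer = require('./chunk_buffer') // path assumed relative to storage/lib

async function latestStartVersion(projectId) {
  // Cache hit: served from Redis. Cache miss: loaded from the chunk store
  // and written back to Redis for the next reader.
  const chunk = await chunkBuffer.loadLatest(projectId)
  return chunk.getStartVersion()
}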
|
7
services/history-v1/storage/lib/chunk_store/errors.js
Normal file
@@ -0,0 +1,7 @@
|
||||
const OError = require('@overleaf/o-error')
|
||||
|
||||
class ChunkVersionConflictError extends OError {}
|
||||
|
||||
module.exports = {
|
||||
ChunkVersionConflictError,
|
||||
}
|
447
services/history-v1/storage/lib/chunk_store/index.js
Normal file
@@ -0,0 +1,447 @@
|
||||
// @ts-check
|
||||
|
||||
'use strict'
|
||||
|
||||
/**
|
||||
* Manage {@link Chunk} and {@link History} storage.
|
||||
*
|
||||
* For storage, chunks are immutable. If we want to update a project with new
|
||||
* changes, we create a new chunk record and History object and delete the old
|
||||
* ones. If we compact a project's history, we similarly destroy the old chunk
|
||||
* (or chunks) and replace them with a new one. This is helpful when using S3,
|
||||
* because it guarantees only eventual consistency for updates but provides
|
||||
* stronger consistency guarantees for object creation.
|
||||
*
|
||||
* When a chunk record in the database is removed, we save its ID for later
|
||||
* in the `old_chunks` table, rather than deleting it immediately. This lets us
|
||||
* use batch deletion to reduce the number of delete requests to S3.
|
||||
*
|
||||
* The chunk store also caches data about which blobs are referenced by each
|
||||
* chunk, which allows us to find unused blobs without loading all of the data
|
||||
* for all projects from S3. Whenever we create a chunk, we also insert records
|
||||
* into the `chunk_blobs` table, to help with this bookkeeping.
|
||||
*/
|
||||
|
||||
const config = require('config')
|
||||
const OError = require('@overleaf/o-error')
|
||||
const { Chunk, History, Snapshot } = require('overleaf-editor-core')
|
||||
|
||||
const assert = require('../assert')
|
||||
const BatchBlobStore = require('../batch_blob_store')
|
||||
const { BlobStore } = require('../blob_store')
|
||||
const { historyStore } = require('../history_store')
|
||||
const mongoBackend = require('./mongo')
|
||||
const postgresBackend = require('./postgres')
|
||||
const { ChunkVersionConflictError } = require('./errors')
|
||||
|
||||
const DEFAULT_DELETE_BATCH_SIZE = parseInt(config.get('maxDeleteKeys'), 10)
|
||||
const DEFAULT_DELETE_TIMEOUT_SECS = 3000 // 50 minutes
|
||||
const DEFAULT_DELETE_MIN_AGE_SECS = 86400 // 1 day
|
||||
|
||||
/**
|
||||
* Create the initial chunk for a project.
|
||||
*/
|
||||
async function initializeProject(projectId, snapshot) {
|
||||
if (projectId != null) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
} else {
|
||||
projectId = await postgresBackend.generateProjectId()
|
||||
}
|
||||
|
||||
if (snapshot != null) {
|
||||
assert.instance(snapshot, Snapshot, 'bad snapshot')
|
||||
} else {
|
||||
snapshot = new Snapshot()
|
||||
}
|
||||
|
||||
const blobStore = new BlobStore(projectId)
|
||||
await blobStore.initialize()
|
||||
|
||||
const backend = getBackend(projectId)
|
||||
const chunkRecord = await backend.getLatestChunk(projectId)
|
||||
if (chunkRecord != null) {
|
||||
throw new AlreadyInitialized(projectId)
|
||||
}
|
||||
|
||||
const history = new History(snapshot, [])
|
||||
const chunk = new Chunk(history, 0)
|
||||
await create(projectId, chunk)
|
||||
return projectId
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the blobs referenced in the given history
|
||||
*/
|
||||
async function lazyLoadHistoryFiles(history, batchBlobStore) {
|
||||
const blobHashes = new Set()
|
||||
history.findBlobHashes(blobHashes)
|
||||
|
||||
await batchBlobStore.preload(Array.from(blobHashes))
|
||||
await history.loadFiles('lazy', batchBlobStore)
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the latest Chunk stored for a project, including blob metadata.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {Object} [opts]
|
||||
* @param {boolean} [opts.readOnly]
|
||||
* @return {Promise<{id: string, startVersion: number, endVersion: number, endTimestamp: Date}>}
|
||||
*/
|
||||
async function loadLatestRaw(projectId, opts) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
|
||||
const backend = getBackend(projectId)
|
||||
const chunkRecord = await backend.getLatestChunk(projectId, opts)
|
||||
if (chunkRecord == null) {
|
||||
throw new Chunk.NotFoundError(projectId)
|
||||
}
|
||||
return chunkRecord
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the latest Chunk stored for a project, including blob metadata.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @return {Promise.<Chunk>}
|
||||
*/
|
||||
async function loadLatest(projectId) {
|
||||
const chunkRecord = await loadLatestRaw(projectId)
|
||||
const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
|
||||
const history = History.fromRaw(rawHistory)
|
||||
const blobStore = new BlobStore(projectId)
|
||||
const batchBlobStore = new BatchBlobStore(blobStore)
|
||||
await lazyLoadHistoryFiles(history, batchBlobStore)
|
||||
return new Chunk(history, chunkRecord.startVersion)
|
||||
}
|
||||
|
||||
/**
|
||||
 * Load the chunk that contains the given version, including blob metadata.
|
||||
*/
|
||||
async function loadAtVersion(projectId, version) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.integer(version, 'bad version')
|
||||
|
||||
const backend = getBackend(projectId)
|
||||
const blobStore = new BlobStore(projectId)
|
||||
const batchBlobStore = new BatchBlobStore(blobStore)
|
||||
|
||||
const chunkRecord = await backend.getChunkForVersion(projectId, version)
|
||||
const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
|
||||
const history = History.fromRaw(rawHistory)
|
||||
await lazyLoadHistoryFiles(history, batchBlobStore)
|
||||
return new Chunk(history, chunkRecord.endVersion - history.countChanges())
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the chunk that contains the version that was current at the given
|
||||
* timestamp, including blob metadata.
|
||||
*/
|
||||
async function loadAtTimestamp(projectId, timestamp) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.date(timestamp, 'bad timestamp')
|
||||
|
||||
const backend = getBackend(projectId)
|
||||
const blobStore = new BlobStore(projectId)
|
||||
const batchBlobStore = new BatchBlobStore(blobStore)
|
||||
|
||||
const chunkRecord = await backend.getChunkForTimestamp(projectId, timestamp)
|
||||
const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
|
||||
const history = History.fromRaw(rawHistory)
|
||||
await lazyLoadHistoryFiles(history, batchBlobStore)
|
||||
return new Chunk(history, chunkRecord.endVersion - history.countChanges())
|
||||
}
|
||||
|
||||
/**
|
||||
* Store the chunk and insert corresponding records in the database.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {Chunk} chunk
|
||||
* @param {Date} [earliestChangeTimestamp]
|
||||
*/
|
||||
async function create(projectId, chunk, earliestChangeTimestamp) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.instance(chunk, Chunk, 'bad chunk')
|
||||
assert.maybe.date(earliestChangeTimestamp, 'bad timestamp')
|
||||
|
||||
const backend = getBackend(projectId)
|
||||
const chunkStart = chunk.getStartVersion()
|
||||
const chunkId = await uploadChunk(projectId, chunk)
|
||||
|
||||
const opts = {}
|
||||
if (chunkStart > 0) {
|
||||
opts.oldChunkId = await getChunkIdForVersion(projectId, chunkStart - 1)
|
||||
}
|
||||
if (earliestChangeTimestamp != null) {
|
||||
opts.earliestChangeTimestamp = earliestChangeTimestamp
|
||||
}
|
||||
|
||||
await backend.confirmCreate(projectId, chunk, chunkId, opts)
|
||||
}
|
||||
|
||||
/**
|
||||
* Upload the given chunk to object storage.
|
||||
*
|
||||
* This is used by the create and update methods.
|
||||
*/
|
||||
async function uploadChunk(projectId, chunk) {
|
||||
const backend = getBackend(projectId)
|
||||
const blobStore = new BlobStore(projectId)
|
||||
|
||||
const historyStoreConcurrency = parseInt(
|
||||
config.get('chunkStore.historyStoreConcurrency'),
|
||||
10
|
||||
)
|
||||
|
||||
const rawHistory = await chunk
|
||||
.getHistory()
|
||||
.store(blobStore, historyStoreConcurrency)
|
||||
const chunkId = await backend.insertPendingChunk(projectId, chunk)
|
||||
await historyStore.storeRaw(projectId, chunkId, rawHistory)
|
||||
return chunkId
|
||||
}
|
||||
|
||||
/**
|
||||
* Extend the project's history by replacing the latest chunk with a new
|
||||
* chunk.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {number} oldEndVersion
|
||||
* @param {Chunk} newChunk
|
||||
* @param {Date} [earliestChangeTimestamp]
|
||||
* @return {Promise}
|
||||
*/
|
||||
async function update(
|
||||
projectId,
|
||||
oldEndVersion,
|
||||
newChunk,
|
||||
earliestChangeTimestamp
|
||||
) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.integer(oldEndVersion, 'bad oldEndVersion')
|
||||
assert.instance(newChunk, Chunk, 'bad newChunk')
|
||||
assert.maybe.date(earliestChangeTimestamp, 'bad timestamp')
|
||||
|
||||
const backend = getBackend(projectId)
|
||||
const oldChunkId = await getChunkIdForVersion(projectId, oldEndVersion)
|
||||
const newChunkId = await uploadChunk(projectId, newChunk)
|
||||
|
||||
const opts = {}
|
||||
if (earliestChangeTimestamp != null) {
|
||||
opts.earliestChangeTimestamp = earliestChangeTimestamp
|
||||
}
|
||||
|
||||
await backend.confirmUpdate(projectId, oldChunkId, newChunk, newChunkId, opts)
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the chunk ID for a given version of a project.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {number} version
|
||||
* @return {Promise.<string>}
|
||||
*/
|
||||
async function getChunkIdForVersion(projectId, version) {
|
||||
const backend = getBackend(projectId)
|
||||
const chunkRecord = await backend.getChunkForVersion(projectId, version)
|
||||
return chunkRecord.id
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the chunk metadata for a given version of a project.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {number} version
|
||||
* @return {Promise.<{id: string|number, startVersion: number, endVersion: number}>}
|
||||
*/
|
||||
async function getChunkMetadataForVersion(projectId, version) {
|
||||
const backend = getBackend(projectId)
|
||||
const chunkRecord = await backend.getChunkForVersion(projectId, version)
|
||||
return chunkRecord
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all of a project's chunk ids
|
||||
*/
|
||||
async function getProjectChunkIds(projectId) {
|
||||
const backend = getBackend(projectId)
|
||||
const chunkIds = await backend.getProjectChunkIds(projectId)
|
||||
return chunkIds
|
||||
}
|
||||
|
||||
/**
|
||||
 * Get all of a project's chunks directly
|
||||
*/
|
||||
async function getProjectChunks(projectId) {
|
||||
const backend = getBackend(projectId)
|
||||
  const chunks = await backend.getProjectChunks(projectId)
|
||||
  return chunks
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the chunk for a given chunk record, including blob metadata.
|
||||
*/
|
||||
async function loadByChunkRecord(projectId, chunkRecord) {
|
||||
const blobStore = new BlobStore(projectId)
|
||||
const batchBlobStore = new BatchBlobStore(blobStore)
|
||||
const { raw: rawHistory, buffer: chunkBuffer } =
|
||||
await historyStore.loadRawWithBuffer(projectId, chunkRecord.id)
|
||||
const history = History.fromRaw(rawHistory)
|
||||
await lazyLoadHistoryFiles(history, batchBlobStore)
|
||||
return {
|
||||
chunk: new Chunk(history, chunkRecord.endVersion - history.countChanges()),
|
||||
chunkBuffer,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Asynchronously retrieves project chunks starting from a specific version.
|
||||
*
|
||||
* This generator function yields chunk records for a given project starting from the specified version (inclusive).
|
||||
* It continues to fetch and yield subsequent chunk records until the end version of the latest chunk metadata is reached.
|
||||
* If you want to fetch all the chunks *after* a version V, call this function with V+1.
|
||||
*
|
||||
* @param {string} projectId - The ID of the project.
|
||||
* @param {number} version - The starting version to retrieve chunks from.
|
||||
* @returns {AsyncGenerator<Object, void, undefined>} An async generator that yields chunk records.
|
||||
*/
|
||||
async function* getProjectChunksFromVersion(projectId, version) {
|
||||
const backend = getBackend(projectId)
|
||||
const latestChunkMetadata = await loadLatestRaw(projectId)
|
||||
if (!latestChunkMetadata || version > latestChunkMetadata.endVersion) {
|
||||
return
|
||||
}
|
||||
let chunkRecord = await backend.getChunkForVersion(projectId, version)
|
||||
while (chunkRecord != null) {
|
||||
yield chunkRecord
|
||||
if (chunkRecord.endVersion >= latestChunkMetadata.endVersion) {
|
||||
break
|
||||
} else {
|
||||
chunkRecord = await backend.getChunkForVersion(
|
||||
projectId,
|
||||
chunkRecord.endVersion + 1
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete the given chunk from the database.
|
||||
*
|
||||
* This doesn't delete the chunk from object storage yet. The old chunks
|
||||
* collection will do that.
|
||||
*/
|
||||
async function destroy(projectId, chunkId) {
|
||||
const backend = getBackend(projectId)
|
||||
await backend.deleteChunk(projectId, chunkId)
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all of a project's chunks from the database.
|
||||
*/
|
||||
async function deleteProjectChunks(projectId) {
|
||||
const backend = getBackend(projectId)
|
||||
await backend.deleteProjectChunks(projectId)
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a given number of old chunks from both the database
|
||||
* and from object storage.
|
||||
*
|
||||
* @param {object} options
|
||||
* @param {number} [options.batchSize] - number of chunks to delete in each
|
||||
* batch
|
||||
* @param {number} [options.maxBatches] - maximum number of batches to process
|
||||
* @param {number} [options.minAgeSecs] - minimum age of chunks to delete
|
||||
* @param {number} [options.timeout] - maximum time to spend deleting chunks
|
||||
*
|
||||
* @return {Promise<number>} number of chunks deleted
|
||||
*/
|
||||
async function deleteOldChunks(options = {}) {
|
||||
const batchSize = options.batchSize ?? DEFAULT_DELETE_BATCH_SIZE
|
||||
const maxBatches = options.maxBatches ?? Number.MAX_SAFE_INTEGER
|
||||
const minAgeSecs = options.minAgeSecs ?? DEFAULT_DELETE_MIN_AGE_SECS
|
||||
const timeout = options.timeout ?? DEFAULT_DELETE_TIMEOUT_SECS
|
||||
assert.greater(batchSize, 0)
|
||||
assert.greater(timeout, 0)
|
||||
assert.greater(maxBatches, 0)
|
||||
assert.greaterOrEqual(minAgeSecs, 0)
|
||||
|
||||
const timeoutAfter = Date.now() + timeout * 1000
|
||||
let deletedChunksTotal = 0
|
||||
for (const backend of [postgresBackend, mongoBackend]) {
|
||||
for (let i = 0; i < maxBatches; i++) {
|
||||
if (Date.now() > timeoutAfter) {
|
||||
break
|
||||
}
|
||||
const deletedChunks = await deleteOldChunksBatch(
|
||||
backend,
|
||||
batchSize,
|
||||
minAgeSecs
|
||||
)
|
||||
deletedChunksTotal += deletedChunks.length
|
||||
if (deletedChunks.length !== batchSize) {
|
||||
// Last batch was incomplete. There probably are no old chunks left
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return deletedChunksTotal
|
||||
}
|
||||
|
||||
async function deleteOldChunksBatch(backend, count, minAgeSecs) {
|
||||
assert.greater(count, 0, 'bad count')
|
||||
assert.greaterOrEqual(minAgeSecs, 0, 'bad minAgeSecs')
|
||||
|
||||
const oldChunks = await backend.getOldChunksBatch(count, minAgeSecs)
|
||||
if (oldChunks.length === 0) {
|
||||
return []
|
||||
}
|
||||
await historyStore.deleteChunks(oldChunks)
|
||||
await backend.deleteOldChunks(oldChunks.map(chunk => chunk.chunkId))
|
||||
return oldChunks
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the appropriate backend for the given project id
|
||||
*
|
||||
* Numeric ids use the Postgres backend.
|
||||
* Strings of 24 characters use the Mongo backend.
|
||||
*/
|
||||
function getBackend(projectId) {
|
||||
if (assert.POSTGRES_ID_REGEXP.test(projectId)) {
|
||||
return postgresBackend
|
||||
} else if (assert.MONGO_ID_REGEXP.test(projectId)) {
|
||||
return mongoBackend
|
||||
} else {
|
||||
throw new OError('bad project id', { projectId })
|
||||
}
|
||||
}
|
||||
|
||||
class AlreadyInitialized extends OError {
|
||||
constructor(projectId) {
|
||||
super('Project is already initialized', { projectId })
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
getBackend,
|
||||
initializeProject,
|
||||
loadLatest,
|
||||
loadLatestRaw,
|
||||
loadAtVersion,
|
||||
loadAtTimestamp,
|
||||
loadByChunkRecord,
|
||||
create,
|
||||
update,
|
||||
destroy,
|
||||
getChunkIdForVersion,
|
||||
getChunkMetadataForVersion,
|
||||
getProjectChunkIds,
|
||||
getProjectChunks,
|
||||
getProjectChunksFromVersion,
|
||||
deleteProjectChunks,
|
||||
deleteOldChunks,
|
||||
AlreadyInitialized,
|
||||
ChunkVersionConflictError,
|
||||
}
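A usage sketch for the chunk store (assumptions: snapshot and newChunk are built by the caller from overleaf-editor-core objects, and passing null as the project id asks the Postgres backend to generate one):

// Sketch only: initialise a project's history and extend it with a new chunk.
const chunkStore = require('./chunk_store') // path assumed relative to storage/lib

async function extendHistory(snapshot, newChunk) {
  const projectId = await chunkStore.initializeProject(null, snapshot)
  const latest = await chunkStore.loadLatestRaw(projectId)
  // Replace the latest chunk with a longer one; update() resolves the old
  // chunk id from oldEndVersion and confirms the swap via the backend.
  await chunkStore.update(projectId, latest.endVersion, newChunk, new Date())
  return projectId
}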
|
526
services/history-v1/storage/lib/chunk_store/mongo.js
Normal file
@@ -0,0 +1,526 @@
|
||||
// @ts-check
|
||||
|
||||
const { ObjectId, ReadPreference, MongoError } = require('mongodb')
|
||||
const { Chunk } = require('overleaf-editor-core')
|
||||
const OError = require('@overleaf/o-error')
|
||||
const assert = require('../assert')
|
||||
const mongodb = require('../mongodb')
|
||||
const { ChunkVersionConflictError } = require('./errors')
|
||||
|
||||
const DUPLICATE_KEY_ERROR_CODE = 11000
|
||||
|
||||
/**
|
||||
* @import { ClientSession } from 'mongodb'
|
||||
*/
|
||||
|
||||
/**
|
||||
* Get the latest chunk's metadata from the database
|
||||
* @param {string} projectId
|
||||
* @param {Object} [opts]
|
||||
* @param {boolean} [opts.readOnly]
|
||||
*/
|
||||
async function getLatestChunk(projectId, opts = {}) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
const { readOnly = false } = opts
|
||||
|
||||
const record = await mongodb.chunks.findOne(
|
||||
{
|
||||
projectId: new ObjectId(projectId),
|
||||
state: { $in: ['active', 'closed'] },
|
||||
},
|
||||
{
|
||||
sort: { startVersion: -1 },
|
||||
readPreference: readOnly
|
||||
? ReadPreference.secondaryPreferred
|
||||
: ReadPreference.primary,
|
||||
}
|
||||
)
|
||||
if (record == null) {
|
||||
return null
|
||||
}
|
||||
return chunkFromRecord(record)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the metadata for the chunk that contains the given version.
|
||||
*/
|
||||
async function getChunkForVersion(projectId, version) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
assert.integer(version, 'bad version')
|
||||
|
||||
const record = await mongodb.chunks.findOne(
|
||||
{
|
||||
projectId: new ObjectId(projectId),
|
||||
state: { $in: ['active', 'closed'] },
|
||||
startVersion: { $lte: version },
|
||||
endVersion: { $gte: version },
|
||||
},
|
||||
{ sort: { startVersion: 1 } }
|
||||
)
|
||||
if (record == null) {
|
||||
throw new Chunk.VersionNotFoundError(projectId, version)
|
||||
}
|
||||
return chunkFromRecord(record)
|
||||
}
|
||||
|
||||
/**
|
||||
 * Get the metadata for the project's first chunk (start version 0) that ended at or before the given timestamp.
|
||||
*/
|
||||
async function getFirstChunkBeforeTimestamp(projectId, timestamp) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
assert.date(timestamp, 'bad timestamp')
|
||||
|
||||
const recordActive = await getChunkForVersion(projectId, 0)
|
||||
if (recordActive && recordActive.endTimestamp <= timestamp) {
|
||||
return recordActive
|
||||
}
|
||||
|
||||
// fallback to deleted chunk
|
||||
const recordDeleted = await mongodb.chunks.findOne(
|
||||
{
|
||||
projectId: new ObjectId(projectId),
|
||||
state: 'deleted',
|
||||
startVersion: 0,
|
||||
updatedAt: { $lte: timestamp }, // indexed for state=deleted
|
||||
endTimestamp: { $lte: timestamp },
|
||||
},
|
||||
{ sort: { updatedAt: -1 } }
|
||||
)
|
||||
if (recordDeleted) {
|
||||
return chunkFromRecord(recordDeleted)
|
||||
}
|
||||
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the metadata for the chunk that contains the version that was current at
|
||||
* the given timestamp.
|
||||
*/
|
||||
async function getChunkForTimestamp(projectId, timestamp) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
assert.date(timestamp, 'bad timestamp')
|
||||
|
||||
const record = await mongodb.chunks.findOne(
|
||||
{
|
||||
projectId: new ObjectId(projectId),
|
||||
state: { $in: ['active', 'closed'] },
|
||||
endTimestamp: { $gte: timestamp },
|
||||
},
|
||||
// We use the index on the startVersion for sorting records. This assumes
|
||||
// that timestamps go up with each version.
|
||||
{ sort: { startVersion: 1 } }
|
||||
)
|
||||
|
||||
if (record == null) {
|
||||
// Couldn't find a chunk that had modifications after the given timestamp.
|
||||
// Fetch the latest chunk instead.
|
||||
const chunk = await getLatestChunk(projectId)
|
||||
if (chunk == null) {
|
||||
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
|
||||
}
|
||||
return chunk
|
||||
}
|
||||
|
||||
return chunkFromRecord(record)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the metadata for the chunk that contains the version that was current before
|
||||
* the given timestamp.
|
||||
*/
|
||||
async function getLastActiveChunkBeforeTimestamp(projectId, timestamp) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
assert.date(timestamp, 'bad timestamp')
|
||||
|
||||
const record = await mongodb.chunks.findOne(
|
||||
{
|
||||
projectId: new ObjectId(projectId),
|
||||
state: { $in: ['active', 'closed'] },
|
||||
$or: [
|
||||
{
|
||||
endTimestamp: {
|
||||
$lte: timestamp,
|
||||
},
|
||||
},
|
||||
{
|
||||
endTimestamp: null,
|
||||
},
|
||||
],
|
||||
},
|
||||
// We use the index on the startVersion for sorting records. This assumes
|
||||
// that timestamps go up with each version.
|
||||
{ sort: { startVersion: -1 } }
|
||||
)
|
||||
if (record == null) {
|
||||
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
|
||||
}
|
||||
return chunkFromRecord(record)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all of a project's chunk ids
|
||||
*/
|
||||
async function getProjectChunkIds(projectId) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
|
||||
const cursor = mongodb.chunks.find(
|
||||
{
|
||||
projectId: new ObjectId(projectId),
|
||||
state: { $in: ['active', 'closed'] },
|
||||
},
|
||||
{ projection: { _id: 1 } }
|
||||
)
|
||||
return await cursor.map(record => record._id).toArray()
|
||||
}
|
||||
|
||||
/**
|
||||
 * Get all of a project's chunks directly
|
||||
*/
|
||||
async function getProjectChunks(projectId) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
|
||||
const cursor = mongodb.chunks
|
||||
.find(
|
||||
{
|
||||
projectId: new ObjectId(projectId),
|
||||
state: { $in: ['active', 'closed'] },
|
||||
},
|
||||
{ projection: { state: 0 } }
|
||||
)
|
||||
.sort({ startVersion: 1 })
|
||||
return await cursor.map(chunkFromRecord).toArray()
|
||||
}
|
||||
|
||||
/**
|
||||
* Insert a pending chunk before sending it to object storage.
|
||||
*/
|
||||
async function insertPendingChunk(projectId, chunk) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
assert.instance(chunk, Chunk, 'bad chunk')
|
||||
|
||||
const chunkId = new ObjectId()
|
||||
await mongodb.chunks.insertOne({
|
||||
_id: chunkId,
|
||||
projectId: new ObjectId(projectId),
|
||||
startVersion: chunk.getStartVersion(),
|
||||
endVersion: chunk.getEndVersion(),
|
||||
endTimestamp: chunk.getEndTimestamp(),
|
||||
state: 'pending',
|
||||
updatedAt: new Date(),
|
||||
})
|
||||
return chunkId.toString()
|
||||
}
|
||||
|
||||
/**
|
||||
* Record that a new chunk was created.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {Chunk} chunk
|
||||
* @param {string} chunkId
|
||||
* @param {object} opts
|
||||
* @param {Date} [opts.earliestChangeTimestamp]
|
||||
* @param {string} [opts.oldChunkId]
|
||||
*/
|
||||
async function confirmCreate(projectId, chunk, chunkId, opts = {}) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
assert.instance(chunk, Chunk, 'bad newChunk')
|
||||
assert.mongoId(chunkId, 'bad newChunkId')
|
||||
|
||||
await mongodb.client.withSession(async session => {
|
||||
await session.withTransaction(async () => {
|
||||
if (opts.oldChunkId != null) {
|
||||
await closeChunk(projectId, opts.oldChunkId, { session })
|
||||
}
|
||||
|
||||
await activateChunk(projectId, chunkId, { session })
|
||||
|
||||
await updateProjectRecord(
|
||||
projectId,
|
||||
chunk,
|
||||
opts.earliestChangeTimestamp,
|
||||
{ session }
|
||||
)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the metadata to the project record
|
||||
*/
|
||||
async function updateProjectRecord(
|
||||
projectId,
|
||||
chunk,
|
||||
earliestChangeTimestamp,
|
||||
mongoOpts = {}
|
||||
) {
|
||||
// record the end version against the project
|
||||
await mongodb.projects.updateOne(
|
||||
{
|
||||
'overleaf.history.id': projectId, // string for Object ids, number for postgres ids
|
||||
},
|
||||
{
|
||||
// always store the latest end version and timestamp for the chunk
|
||||
$max: {
|
||||
'overleaf.history.currentEndVersion': chunk.getEndVersion(),
|
||||
'overleaf.history.currentEndTimestamp': chunk.getEndTimestamp(),
|
||||
'overleaf.history.updatedAt': new Date(),
|
||||
},
|
||||
// store the first pending change timestamp for the chunk, this will
|
||||
// be cleared every time a backup is completed.
|
||||
$min: {
|
||||
'overleaf.backup.pendingChangeAt':
|
||||
earliestChangeTimestamp || chunk.getEndTimestamp() || new Date(),
|
||||
},
|
||||
},
|
||||
mongoOpts
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Record that a chunk was replaced by a new one.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} oldChunkId
|
||||
* @param {Chunk} newChunk
|
||||
* @param {string} newChunkId
|
||||
* @param {object} [opts]
|
||||
* @param {Date} [opts.earliestChangeTimestamp]
|
||||
*/
|
||||
async function confirmUpdate(
|
||||
projectId,
|
||||
oldChunkId,
|
||||
newChunk,
|
||||
newChunkId,
|
||||
opts = {}
|
||||
) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
assert.mongoId(oldChunkId, 'bad oldChunkId')
|
||||
assert.instance(newChunk, Chunk, 'bad newChunk')
|
||||
assert.mongoId(newChunkId, 'bad newChunkId')
|
||||
|
||||
await mongodb.client.withSession(async session => {
|
||||
await session.withTransaction(async () => {
|
||||
await deleteActiveChunk(projectId, oldChunkId, { session })
|
||||
|
||||
await activateChunk(projectId, newChunkId, { session })
|
||||
|
||||
await updateProjectRecord(
|
||||
projectId,
|
||||
newChunk,
|
||||
opts.earliestChangeTimestamp,
|
||||
{ session }
|
||||
)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Activate a pending chunk
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @param {object} [opts]
|
||||
* @param {ClientSession} [opts.session]
|
||||
*/
|
||||
async function activateChunk(projectId, chunkId, opts = {}) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
assert.mongoId(chunkId, 'bad chunkId')
|
||||
|
||||
let result
|
||||
try {
|
||||
result = await mongodb.chunks.updateOne(
|
||||
{
|
||||
_id: new ObjectId(chunkId),
|
||||
projectId: new ObjectId(projectId),
|
||||
state: 'pending',
|
||||
},
|
||||
{ $set: { state: 'active', updatedAt: new Date() } },
|
||||
opts
|
||||
)
|
||||
} catch (err) {
|
||||
if (err instanceof MongoError && err.code === DUPLICATE_KEY_ERROR_CODE) {
|
||||
throw new ChunkVersionConflictError('chunk start version is not unique', {
|
||||
projectId,
|
||||
chunkId,
|
||||
})
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
if (result.matchedCount === 0) {
|
||||
throw new OError('pending chunk not found', { projectId, chunkId })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Close a chunk
|
||||
*
|
||||
* A closed chunk is one that can't be extended anymore.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @param {object} [opts]
|
||||
* @param {ClientSession} [opts.session]
|
||||
*/
|
||||
async function closeChunk(projectId, chunkId, opts = {}) {
|
||||
const result = await mongodb.chunks.updateOne(
|
||||
{
|
||||
_id: new ObjectId(chunkId),
|
||||
projectId: new ObjectId(projectId),
|
||||
state: 'active',
|
||||
},
|
||||
{ $set: { state: 'closed' } },
|
||||
opts
|
||||
)
|
||||
|
||||
if (result.matchedCount === 0) {
|
||||
throw new ChunkVersionConflictError('unable to close chunk', {
|
||||
projectId,
|
||||
chunkId,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete an active chunk
|
||||
*
|
||||
* This is used to delete chunks that are in the process of being extended. It
|
||||
* will refuse to delete chunks that are already closed and can therefore not be
|
||||
* extended.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @param {object} [opts]
|
||||
* @param {ClientSession} [opts.session]
|
||||
*/
|
||||
async function deleteActiveChunk(projectId, chunkId, opts = {}) {
|
||||
const updateResult = await mongodb.chunks.updateOne(
|
||||
{
|
||||
_id: new ObjectId(chunkId),
|
||||
projectId: new ObjectId(projectId),
|
||||
state: 'active',
|
||||
},
|
||||
{ $set: { state: 'deleted', updatedAt: new Date() } },
|
||||
opts
|
||||
)
|
||||
|
||||
if (updateResult.matchedCount === 0) {
|
||||
throw new ChunkVersionConflictError('unable to delete active chunk', {
|
||||
projectId,
|
||||
chunkId,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a chunk.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @return {Promise}
|
||||
*/
|
||||
async function deleteChunk(projectId, chunkId, mongoOpts = {}) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
assert.mongoId(chunkId, 'bad chunkId')
|
||||
|
||||
await mongodb.chunks.updateOne(
|
||||
{ _id: new ObjectId(chunkId), projectId: new ObjectId(projectId) },
|
||||
{ $set: { state: 'deleted', updatedAt: new Date() } },
|
||||
mongoOpts
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all of a project's chunks
|
||||
*/
|
||||
async function deleteProjectChunks(projectId) {
|
||||
assert.mongoId(projectId, 'bad projectId')
|
||||
|
||||
await mongodb.chunks.updateMany(
|
||||
{
|
||||
projectId: new ObjectId(projectId),
|
||||
state: { $in: ['active', 'closed'] },
|
||||
},
|
||||
{ $set: { state: 'deleted', updatedAt: new Date() } }
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a batch of old chunks for deletion
|
||||
*/
|
||||
async function getOldChunksBatch(count, minAgeSecs) {
|
||||
const maxUpdatedAt = new Date(Date.now() - minAgeSecs * 1000)
|
||||
const batch = []
|
||||
|
||||
// We need to fetch one state at a time to take advantage of the partial
|
||||
// indexes on the chunks collection.
|
||||
//
|
||||
// Mongo 6.0 allows partial indexes that use the $in operator. When we reach
|
||||
// that Mongo version, we can create a partial index on both the deleted and
|
||||
// pending states and simplify this logic a bit.
|
||||
for (const state of ['deleted', 'pending']) {
|
||||
if (count === 0) {
|
||||
// There's no more space in the batch
|
||||
break
|
||||
}
|
||||
|
||||
const cursor = mongodb.chunks
|
||||
.find(
|
||||
{ state, updatedAt: { $lt: maxUpdatedAt } },
|
||||
{
|
||||
limit: count,
|
||||
projection: { _id: 1, projectId: 1 },
|
||||
}
|
||||
)
|
||||
.map(record => ({
|
||||
chunkId: record._id.toString(),
|
||||
projectId: record.projectId.toString(),
|
||||
}))
|
||||
|
||||
for await (const record of cursor) {
|
||||
batch.push(record)
|
||||
count -= 1
|
||||
}
|
||||
}
|
||||
return batch
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a batch of old chunks from the database
|
||||
*/
|
||||
async function deleteOldChunks(chunkIds) {
|
||||
await mongodb.chunks.deleteMany({
|
||||
_id: { $in: chunkIds.map(id => new ObjectId(id)) },
|
||||
state: { $in: ['deleted', 'pending'] },
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Build a chunk metadata object from the database record
|
||||
*/
|
||||
function chunkFromRecord(record) {
|
||||
return {
|
||||
id: record._id.toString(),
|
||||
startVersion: record.startVersion,
|
||||
endVersion: record.endVersion,
|
||||
endTimestamp: record.endTimestamp,
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
getLatestChunk,
|
||||
getFirstChunkBeforeTimestamp,
|
||||
getLastActiveChunkBeforeTimestamp,
|
||||
getChunkForVersion,
|
||||
getChunkForTimestamp,
|
||||
getProjectChunkIds,
|
||||
getProjectChunks,
|
||||
insertPendingChunk,
|
||||
confirmCreate,
|
||||
confirmUpdate,
|
||||
updateProjectRecord,
|
||||
deleteChunk,
|
||||
deleteProjectChunks,
|
||||
getOldChunksBatch,
|
||||
deleteOldChunks,
|
||||
}
|
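A hedged sketch of the pending/active lifecycle implemented above; in the service this is driven by the chunk store index, and `chunk` is assumed to be an overleaf-editor-core Chunk whose history has already been uploaded to object storage.

// Illustration only: create a project's first chunk through the Mongo backend.
const mongoBackend = require('./chunk_store/mongo')

async function createFirstChunk(projectId, chunk) {
  // Reserve an id and record the chunk as 'pending'.
  const chunkId = await mongoBackend.insertPendingChunk(projectId, chunk)

  // ...the gzipped history is written to the bucket here (see history_store.js)...

  // Flip the record to 'active' and bump the project metadata in one transaction.
  await mongoBackend.confirmCreate(projectId, chunk, chunkId, {
    earliestChangeTimestamp: chunk.getEndTimestamp(),
  })
  return chunkId
}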
487
services/history-v1/storage/lib/chunk_store/postgres.js
Normal file
@@ -0,0 +1,487 @@
|
||||
// @ts-check
|
||||
|
||||
const { Chunk } = require('overleaf-editor-core')
|
||||
const assert = require('../assert')
|
||||
const knex = require('../knex')
|
||||
const knexReadOnly = require('../knex_read_only')
|
||||
const { ChunkVersionConflictError } = require('./errors')
|
||||
const { updateProjectRecord } = require('./mongo')
|
||||
|
||||
const DUPLICATE_KEY_ERROR_CODE = '23505'
|
||||
|
||||
/**
|
||||
* @import { Knex } from 'knex'
|
||||
*/
|
||||
|
||||
/**
|
||||
* Get the latest chunk's metadata from the database
|
||||
* @param {string} projectId
|
||||
* @param {Object} [opts]
|
||||
* @param {boolean} [opts.readOnly]
|
||||
*/
|
||||
async function getLatestChunk(projectId, opts = {}) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
const { readOnly = false } = opts
|
||||
|
||||
const record = await (readOnly ? knexReadOnly : knex)('chunks')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.orderBy('end_version', 'desc')
|
||||
.first()
|
||||
if (record == null) {
|
||||
return null
|
||||
}
|
||||
return chunkFromRecord(record)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the metadata for the chunk that contains the given version.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {number} version
|
||||
*/
|
||||
async function getChunkForVersion(projectId, version) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
const record = await knex('chunks')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.where('end_version', '>=', version)
|
||||
.orderBy('end_version')
|
||||
.first()
|
||||
if (!record) {
|
||||
throw new Chunk.VersionNotFoundError(projectId, version)
|
||||
}
|
||||
return chunkFromRecord(record)
|
||||
}
|
||||
|
||||
/**
|
||||
 * Get the metadata for the project's first chunk (start version 0) that ended at or before the given timestamp.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {Date} timestamp
|
||||
*/
|
||||
async function getFirstChunkBeforeTimestamp(projectId, timestamp) {
|
||||
assert.date(timestamp, 'bad timestamp')
|
||||
|
||||
const recordActive = await getChunkForVersion(projectId, 0)
|
||||
|
||||
// projectId must be valid if getChunkForVersion did not throw
|
||||
if (recordActive && recordActive.endTimestamp <= timestamp) {
|
||||
return recordActive
|
||||
}
|
||||
|
||||
// fallback to deleted chunk
|
||||
const recordDeleted = await knex('old_chunks')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.where('start_version', '=', 0)
|
||||
.where('end_timestamp', '<=', timestamp)
|
||||
.orderBy('end_version', 'desc')
|
||||
.first()
|
||||
if (recordDeleted) {
|
||||
return chunkFromRecord(recordDeleted)
|
||||
}
|
||||
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
|
||||
}
|
||||
|
||||
/**
|
||||
 * Get the metadata for the chunk that contains the version that was current before
|
||||
* the given timestamp.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {Date} timestamp
|
||||
*/
|
||||
async function getLastActiveChunkBeforeTimestamp(projectId, timestamp) {
|
||||
assert.date(timestamp, 'bad timestamp')
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
const query = knex('chunks')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.where(function () {
|
||||
this.where('end_timestamp', '<=', timestamp).orWhere(
|
||||
'end_timestamp',
|
||||
null
|
||||
)
|
||||
})
|
||||
.orderBy('end_version', 'desc', 'last')
|
||||
|
||||
const record = await query.first()
|
||||
|
||||
if (!record) {
|
||||
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
|
||||
}
|
||||
return chunkFromRecord(record)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the metadata for the chunk that contains the version that was current at
|
||||
* the given timestamp.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {Date} timestamp
|
||||
*/
|
||||
async function getChunkForTimestamp(projectId, timestamp) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
// This query will find the latest chunk after the timestamp (query orders
|
||||
// in reverse chronological order), OR the latest chunk
|
||||
// This accounts for the case where the timestamp is ahead of the chunk's
|
||||
// timestamp and therefore will not return any results
|
||||
const whereAfterEndTimestampOrLatestChunk = knex.raw(
|
||||
'end_timestamp >= ? ' +
|
||||
'OR id = ( ' +
|
||||
'SELECT id FROM chunks ' +
|
||||
'WHERE doc_id = ? ' +
|
||||
'ORDER BY end_version desc LIMIT 1' +
|
||||
')',
|
||||
[timestamp, parseInt(projectId, 10)]
|
||||
)
|
||||
|
||||
const record = await knex('chunks')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.where(whereAfterEndTimestampOrLatestChunk)
|
||||
.orderBy('end_version')
|
||||
.first()
|
||||
if (!record) {
|
||||
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
|
||||
}
|
||||
return chunkFromRecord(record)
|
||||
}
|
||||
|
||||
/**
|
||||
* Build a chunk metadata object from the database record
|
||||
*/
|
||||
function chunkFromRecord(record) {
|
||||
return {
|
||||
id: record.id.toString(),
|
||||
startVersion: record.start_version,
|
||||
endVersion: record.end_version,
|
||||
endTimestamp: record.end_timestamp,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all of a project's chunk ids
|
||||
*
|
||||
* @param {string} projectId
|
||||
*/
|
||||
async function getProjectChunkIds(projectId) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
const records = await knex('chunks')
|
||||
.select('id')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
return records.map(record => record.id)
|
||||
}
|
||||
|
||||
/**
|
||||
 * Get all of a project's chunks directly
|
||||
*
|
||||
* @param {string} projectId
|
||||
*/
|
||||
async function getProjectChunks(projectId) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
const records = await knex('chunks')
|
||||
.select()
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.orderBy('end_version')
|
||||
return records.map(chunkFromRecord)
|
||||
}
|
||||
|
||||
/**
|
||||
* Insert a pending chunk before sending it to object storage.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {Chunk} chunk
|
||||
*/
|
||||
async function insertPendingChunk(projectId, chunk) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
const result = await knex.first(
|
||||
knex.raw("nextval('chunks_id_seq'::regclass)::integer as chunkid")
|
||||
)
|
||||
const chunkId = result.chunkid
|
||||
await knex('pending_chunks').insert({
|
||||
id: chunkId,
|
||||
doc_id: parseInt(projectId, 10),
|
||||
end_version: chunk.getEndVersion(),
|
||||
start_version: chunk.getStartVersion(),
|
||||
end_timestamp: chunk.getEndTimestamp(),
|
||||
})
|
||||
return chunkId.toString()
|
||||
}
|
||||
|
||||
/**
|
||||
* Record that a new chunk was created.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {Chunk} chunk
|
||||
* @param {string} chunkId
|
||||
* @param {object} opts
|
||||
* @param {Date} [opts.earliestChangeTimestamp]
|
||||
* @param {string} [opts.oldChunkId]
|
||||
*/
|
||||
async function confirmCreate(projectId, chunk, chunkId, opts = {}) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
await knex.transaction(async tx => {
|
||||
if (opts.oldChunkId != null) {
|
||||
await _assertChunkIsNotClosed(tx, projectId, opts.oldChunkId)
|
||||
await _closeChunk(tx, projectId, opts.oldChunkId)
|
||||
}
|
||||
await Promise.all([
|
||||
_deletePendingChunk(tx, projectId, chunkId),
|
||||
_insertChunk(tx, projectId, chunk, chunkId),
|
||||
])
|
||||
await updateProjectRecord(
|
||||
// The history id in Mongo is an integer for Postgres projects
|
||||
parseInt(projectId, 10),
|
||||
chunk,
|
||||
opts.earliestChangeTimestamp
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Record that a chunk was replaced by a new one.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} oldChunkId
|
||||
* @param {Chunk} newChunk
|
||||
* @param {string} newChunkId
|
||||
*/
|
||||
async function confirmUpdate(
|
||||
projectId,
|
||||
oldChunkId,
|
||||
newChunk,
|
||||
newChunkId,
|
||||
opts = {}
|
||||
) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
await knex.transaction(async tx => {
|
||||
await _assertChunkIsNotClosed(tx, projectId, oldChunkId)
|
||||
await _deleteChunks(tx, { doc_id: projectId, id: oldChunkId })
|
||||
await Promise.all([
|
||||
_deletePendingChunk(tx, projectId, newChunkId),
|
||||
_insertChunk(tx, projectId, newChunk, newChunkId),
|
||||
])
|
||||
await updateProjectRecord(
|
||||
// The history id in Mongo is an integer for Postgres projects
|
||||
parseInt(projectId, 10),
|
||||
newChunk,
|
||||
opts.earliestChangeTimestamp
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a pending chunk
|
||||
*
|
||||
* @param {Knex} tx
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
*/
|
||||
async function _deletePendingChunk(tx, projectId, chunkId) {
|
||||
await tx('pending_chunks')
|
||||
.where({
|
||||
doc_id: parseInt(projectId, 10),
|
||||
id: parseInt(chunkId, 10),
|
||||
})
|
||||
.del()
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds an active chunk
|
||||
*
|
||||
* @param {Knex} tx
|
||||
* @param {string} projectId
|
||||
* @param {Chunk} chunk
|
||||
* @param {string} chunkId
|
||||
*/
|
||||
async function _insertChunk(tx, projectId, chunk, chunkId) {
|
||||
const startVersion = chunk.getStartVersion()
|
||||
const endVersion = chunk.getEndVersion()
|
||||
try {
|
||||
await tx('chunks').insert({
|
||||
id: parseInt(chunkId, 10),
|
||||
doc_id: parseInt(projectId, 10),
|
||||
start_version: startVersion,
|
||||
end_version: endVersion,
|
||||
end_timestamp: chunk.getEndTimestamp(),
|
||||
})
|
||||
} catch (err) {
|
||||
if (
|
||||
err instanceof Error &&
|
||||
'code' in err &&
|
||||
err.code === DUPLICATE_KEY_ERROR_CODE
|
||||
) {
|
||||
throw new ChunkVersionConflictError(
|
||||
'chunk start or end version is not unique',
|
||||
{ projectId, chunkId, startVersion, endVersion }
|
||||
)
|
||||
}
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that a chunk is not closed
|
||||
*
|
||||
* This is used to synchronize chunk creations and extensions.
|
||||
*
|
||||
* @param {Knex} tx
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
*/
|
||||
async function _assertChunkIsNotClosed(tx, projectId, chunkId) {
|
||||
const record = await tx('chunks')
|
||||
.forUpdate()
|
||||
.select('closed')
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.where('id', parseInt(chunkId, 10))
|
||||
.first()
|
||||
if (!record) {
|
||||
throw new ChunkVersionConflictError('unable to close chunk: not found', {
|
||||
projectId,
|
||||
chunkId,
|
||||
})
|
||||
}
|
||||
if (record.closed) {
|
||||
throw new ChunkVersionConflictError(
|
||||
'unable to close chunk: already closed',
|
||||
{
|
||||
projectId,
|
||||
chunkId,
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Close a chunk
|
||||
*
|
||||
* A closed chunk can no longer be extended.
|
||||
*
|
||||
* @param {Knex} tx
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
*/
|
||||
async function _closeChunk(tx, projectId, chunkId) {
|
||||
await tx('chunks')
|
||||
.update({ closed: true })
|
||||
.where('doc_id', parseInt(projectId, 10))
|
||||
.where('id', parseInt(chunkId, 10))
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a chunk.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
*/
|
||||
async function deleteChunk(projectId, chunkId) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
assert.integer(chunkId, 'bad chunkId')
|
||||
|
||||
await _deleteChunks(knex, {
|
||||
doc_id: parseInt(projectId, 10),
|
||||
id: parseInt(chunkId, 10),
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all of a project's chunks
|
||||
*
|
||||
* @param {string} projectId
|
||||
*/
|
||||
async function deleteProjectChunks(projectId) {
|
||||
assert.postgresId(projectId, 'bad projectId')
|
||||
|
||||
await knex.transaction(async tx => {
|
||||
await _deleteChunks(tx, { doc_id: parseInt(projectId, 10) })
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete many chunks
|
||||
*
|
||||
* @param {Knex} tx
|
||||
* @param {any} whereClause
|
||||
*/
|
||||
async function _deleteChunks(tx, whereClause) {
|
||||
const rows = await tx('chunks').where(whereClause).del().returning('*')
|
||||
if (rows.length === 0) {
|
||||
return
|
||||
}
|
||||
|
||||
const oldChunks = rows.map(row => ({
|
||||
doc_id: row.doc_id,
|
||||
chunk_id: row.id,
|
||||
start_version: row.start_version,
|
||||
end_version: row.end_version,
|
||||
end_timestamp: row.end_timestamp,
|
||||
deleted_at: tx.fn.now(),
|
||||
}))
|
||||
await tx('old_chunks').insert(oldChunks)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a batch of old chunks for deletion
|
||||
*
|
||||
* @param {number} count
|
||||
* @param {number} minAgeSecs
|
||||
*/
|
||||
async function getOldChunksBatch(count, minAgeSecs) {
|
||||
const maxDeletedAt = new Date(Date.now() - minAgeSecs * 1000)
|
||||
const records = await knex('old_chunks')
|
||||
.whereNull('deleted_at')
|
||||
.orWhere('deleted_at', '<', maxDeletedAt)
|
||||
.orderBy('chunk_id')
|
||||
.limit(count)
|
||||
return records.map(oldChunk => ({
|
||||
projectId: oldChunk.doc_id.toString(),
|
||||
chunkId: oldChunk.chunk_id.toString(),
|
||||
}))
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a batch of old chunks from the database
|
||||
*
|
||||
* @param {string[]} chunkIds
|
||||
*/
|
||||
async function deleteOldChunks(chunkIds) {
|
||||
await knex('old_chunks')
|
||||
.whereIn(
|
||||
'chunk_id',
|
||||
chunkIds.map(id => parseInt(id, 10))
|
||||
)
|
||||
.del()
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a new project id
|
||||
*/
|
||||
async function generateProjectId() {
|
||||
const record = await knex.first(
|
||||
knex.raw("nextval('docs_id_seq'::regclass)::integer as doc_id")
|
||||
)
|
||||
return record.doc_id.toString()
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
getLatestChunk,
|
||||
getFirstChunkBeforeTimestamp,
|
||||
getLastActiveChunkBeforeTimestamp,
|
||||
getChunkForVersion,
|
||||
getChunkForTimestamp,
|
||||
getProjectChunkIds,
|
||||
getProjectChunks,
|
||||
insertPendingChunk,
|
||||
confirmCreate,
|
||||
confirmUpdate,
|
||||
deleteChunk,
|
||||
deleteProjectChunks,
|
||||
getOldChunksBatch,
|
||||
deleteOldChunks,
|
||||
generateProjectId,
|
||||
}
|
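A hedged sketch of the old-chunk cleanup loop these helpers support; the batch size and age are illustrative, and in the service the chunk blobs are removed from object storage before the rows are dropped.

// Illustration only: drain expired rows from old_chunks in small batches.
const postgresBackend = require('./chunk_store/postgres')

async function drainOldChunks(batchSize = 100, minAgeSecs = 3600) {
  while (true) {
    const batch = await postgresBackend.getOldChunksBatch(batchSize, minAgeSecs)
    if (batch.length === 0) break
    await postgresBackend.deleteOldChunks(batch.map(chunk => chunk.chunkId))
  }
}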
254
services/history-v1/storage/lib/chunk_store/redis.js
Normal file
@@ -0,0 +1,254 @@
|
||||
const metrics = require('@overleaf/metrics')
|
||||
const logger = require('@overleaf/logger')
|
||||
const redis = require('../redis')
|
||||
const rclient = redis.rclientHistory
|
||||
const { Snapshot, Change, History, Chunk } = require('overleaf-editor-core')
|
||||
|
||||
const TEMPORARY_CACHE_LIFETIME = 300 // 5 minutes
|
||||
|
||||
const keySchema = {
|
||||
snapshot({ projectId }) {
|
||||
return `snapshot:{${projectId}}`
|
||||
},
|
||||
startVersion({ projectId }) {
|
||||
return `snapshot-version:{${projectId}}`
|
||||
},
|
||||
changes({ projectId }) {
|
||||
return `changes:{${projectId}}`
|
||||
},
|
||||
}
|
||||
|
||||
rclient.defineCommand('get_current_chunk', {
|
||||
numberOfKeys: 3,
|
||||
lua: `
|
||||
local startVersionValue = redis.call('GET', KEYS[2])
|
||||
if not startVersionValue then
|
||||
return nil -- this is a cache-miss
|
||||
end
|
||||
local snapshotValue = redis.call('GET', KEYS[1])
|
||||
local changesValues = redis.call('LRANGE', KEYS[3], 0, -1)
|
||||
return {snapshotValue, startVersionValue, changesValues}
|
||||
`,
|
||||
})
|
||||
|
||||
/**
|
||||
* Retrieves the current chunk of project history from Redis storage
|
||||
* @param {string} projectId - The unique identifier of the project
|
||||
* @returns {Promise<Chunk|null>} A Promise that resolves to a Chunk object containing project history,
|
||||
* or null if retrieval fails
|
||||
* @throws {Error} If Redis operations fail
|
||||
*/
|
||||
async function getCurrentChunk(projectId) {
|
||||
try {
|
||||
const result = await rclient.get_current_chunk(
|
||||
keySchema.snapshot({ projectId }),
|
||||
keySchema.startVersion({ projectId }),
|
||||
keySchema.changes({ projectId })
|
||||
)
|
||||
if (!result) {
|
||||
return null // cache-miss
|
||||
}
|
||||
const snapshot = Snapshot.fromRaw(JSON.parse(result[0]))
|
||||
const startVersion = JSON.parse(result[1])
|
||||
const changes = result[2].map(c => Change.fromRaw(JSON.parse(c)))
|
||||
const history = new History(snapshot, changes)
|
||||
const chunk = new Chunk(history, startVersion)
|
||||
metrics.inc('chunk_store.redis.get_current_chunk', 1, { status: 'success' })
|
||||
return chunk
|
||||
} catch (err) {
|
||||
logger.error({ err, projectId }, 'error getting current chunk from redis')
|
||||
metrics.inc('chunk_store.redis.get_current_chunk', 1, { status: 'error' })
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
rclient.defineCommand('get_current_chunk_metadata', {
|
||||
numberOfKeys: 2,
|
||||
lua: `
|
||||
local startVersionValue = redis.call('GET', KEYS[1])
|
||||
local changesCount = redis.call('LLEN', KEYS[2])
|
||||
return {startVersionValue, changesCount}
|
||||
`,
|
||||
})
|
||||
|
||||
/**
|
||||
* Retrieves the current chunk metadata for a given project from Redis
|
||||
* @param {string} projectId - The ID of the project to get metadata for
|
||||
* @returns {Promise<Object|null>} Object containing startVersion and changesCount if found, null on error or cache miss
|
||||
* @property {number} startVersion - The starting version information
|
||||
* @property {number} changesCount - The number of changes in the chunk
|
||||
*/
|
||||
async function getCurrentChunkMetadata(projectId) {
|
||||
try {
|
||||
const result = await rclient.get_current_chunk_metadata(
|
||||
keySchema.startVersion({ projectId }),
|
||||
keySchema.changes({ projectId })
|
||||
)
|
||||
if (!result) {
|
||||
return null // cache-miss
|
||||
}
|
||||
const startVersion = JSON.parse(result[0])
|
||||
const changesCount = parseInt(result[1], 10)
|
||||
return { startVersion, changesCount }
|
||||
} catch (err) {
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
rclient.defineCommand('set_current_chunk', {
|
||||
numberOfKeys: 3,
|
||||
lua: `
|
||||
local snapshotValue = ARGV[1]
|
||||
local startVersionValue = ARGV[2]
|
||||
redis.call('SETEX', KEYS[1], ${TEMPORARY_CACHE_LIFETIME}, snapshotValue)
|
||||
redis.call('SETEX', KEYS[2], ${TEMPORARY_CACHE_LIFETIME}, startVersionValue)
|
||||
redis.call('DEL', KEYS[3]) -- clear the old changes list
|
||||
if #ARGV >= 3 then
|
||||
redis.call('RPUSH', KEYS[3], unpack(ARGV, 3))
|
||||
redis.call('EXPIRE', KEYS[3], ${TEMPORARY_CACHE_LIFETIME})
|
||||
end
|
||||
`,
|
||||
})
|
||||
|
||||
/**
|
||||
* Stores the current chunk of project history in Redis
|
||||
* @param {string} projectId - The ID of the project
|
||||
* @param {Chunk} chunk - The chunk object containing history data
|
||||
* @returns {Promise<*>} Returns the result of the Redis operation, or null if an error occurs
|
||||
* @throws {Error} May throw Redis-related errors which are caught internally
|
||||
*/
|
||||
async function setCurrentChunk(projectId, chunk) {
|
||||
try {
|
||||
const snapshotKey = keySchema.snapshot({ projectId })
|
||||
const startVersionKey = keySchema.startVersion({ projectId })
|
||||
const changesKey = keySchema.changes({ projectId })
|
||||
|
||||
const snapshot = chunk.history.snapshot
|
||||
const startVersion = chunk.startVersion
|
||||
const changes = chunk.history.changes
|
||||
|
||||
await rclient.set_current_chunk(
|
||||
snapshotKey,
|
||||
startVersionKey,
|
||||
changesKey,
|
||||
JSON.stringify(snapshot.toRaw()),
|
||||
startVersion,
|
||||
...changes.map(c => JSON.stringify(c.toRaw()))
|
||||
)
|
||||
metrics.inc('chunk_store.redis.set_current_chunk', 1, { status: 'success' })
|
||||
} catch (err) {
|
||||
logger.error(
|
||||
{ err, projectId, chunk },
|
||||
'error setting current chunk in redis'
|
||||
)
|
||||
metrics.inc('chunk_store.redis.set_current_chunk', 1, { status: 'error' })
|
||||
return null // while testing we will suppress any errors
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether a cached chunk's version metadata matches the current chunk's metadata
|
||||
* @param {Chunk} cachedChunk - The chunk retrieved from cache
|
||||
* @param {Chunk} currentChunk - The current chunk to compare against
|
||||
* @returns {boolean} - Returns true if the chunks have matching start and end versions, false otherwise
|
||||
*/
|
||||
function checkCacheValidity(cachedChunk, currentChunk) {
|
||||
return Boolean(
|
||||
cachedChunk &&
|
||||
cachedChunk.getStartVersion() === currentChunk.getStartVersion() &&
|
||||
cachedChunk.getEndVersion() === currentChunk.getEndVersion()
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates if a cached chunk matches the current chunk metadata by comparing versions
|
||||
* @param {Object} cachedChunk - The cached chunk object to validate
|
||||
* @param {Object} currentChunkMetadata - The current chunk metadata to compare against
|
||||
* @param {number} currentChunkMetadata.startVersion - The starting version number
|
||||
* @param {number} currentChunkMetadata.endVersion - The ending version number
|
||||
* @returns {boolean} - True if the cached chunk is valid, false otherwise
|
||||
*/
|
||||
function checkCacheValidityWithMetadata(cachedChunk, currentChunkMetadata) {
|
||||
return Boolean(
|
||||
cachedChunk &&
|
||||
cachedChunk.getStartVersion() === currentChunkMetadata.startVersion &&
|
||||
cachedChunk.getEndVersion() === currentChunkMetadata.endVersion
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares two chunks for equality using stringified JSON comparison
|
||||
* @param {string} projectId - The ID of the project
|
||||
* @param {Chunk} cachedChunk - The cached chunk to compare
|
||||
* @param {Chunk} currentChunk - The current chunk to compare against
|
||||
* @returns {boolean} - Returns false if either chunk is null/undefined, otherwise returns the comparison result
|
||||
*/
|
||||
function compareChunks(projectId, cachedChunk, currentChunk) {
|
||||
if (!cachedChunk || !currentChunk) {
|
||||
return false
|
||||
}
|
||||
const identical = JSON.stringify(cachedChunk) === JSON.stringify(currentChunk)
|
||||
if (!identical) {
|
||||
try {
|
||||
logger.error(
|
||||
{
|
||||
projectId,
|
||||
cachedChunkStartVersion: cachedChunk.getStartVersion(),
|
||||
cachedChunkEndVersion: cachedChunk.getEndVersion(),
|
||||
currentChunkStartVersion: currentChunk.getStartVersion(),
|
||||
currentChunkEndVersion: currentChunk.getEndVersion(),
|
||||
},
|
||||
'chunk cache mismatch'
|
||||
)
|
||||
} catch (err) {
|
||||
// ignore errors while logging
|
||||
}
|
||||
}
|
||||
metrics.inc('chunk_store.redis.compare_chunks', 1, {
|
||||
status: identical ? 'success' : 'fail',
|
||||
})
|
||||
return identical
|
||||
}
|
||||
|
||||
// Define Lua script for atomic cache clearing
|
||||
rclient.defineCommand('clear_chunk_cache', {
|
||||
numberOfKeys: 3,
|
||||
lua: `
|
||||
-- Delete all keys related to a project's chunk cache atomically
|
||||
redis.call('DEL', KEYS[1]) -- snapshot key
|
||||
redis.call('DEL', KEYS[2]) -- startVersion key
|
||||
redis.call('DEL', KEYS[3]) -- changes key
|
||||
return 1
|
||||
`,
|
||||
})
|
||||
|
||||
/**
|
||||
* Clears all cache entries for a project's chunk data
|
||||
* @param {string} projectId - The ID of the project whose cache should be cleared
|
||||
* @returns {Promise<boolean>} A promise that resolves to true if successful, false on error
|
||||
*/
|
||||
async function clearCache(projectId) {
|
||||
try {
|
||||
const snapshotKey = keySchema.snapshot({ projectId })
|
||||
const startVersionKey = keySchema.startVersion({ projectId })
|
||||
const changesKey = keySchema.changes({ projectId })
|
||||
|
||||
await rclient.clear_chunk_cache(snapshotKey, startVersionKey, changesKey)
|
||||
metrics.inc('chunk_store.redis.clear_cache', 1, { status: 'success' })
|
||||
return true
|
||||
} catch (err) {
|
||||
logger.error({ err, projectId }, 'error clearing chunk cache from redis')
|
||||
metrics.inc('chunk_store.redis.clear_cache', 1, { status: 'error' })
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
getCurrentChunk,
|
||||
setCurrentChunk,
|
||||
getCurrentChunkMetadata,
|
||||
checkCacheValidity,
|
||||
checkCacheValidityWithMetadata,
|
||||
compareChunks,
|
||||
clearCache,
|
||||
}
|
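A hedged sketch of how the cache above can be used read-through; `loadFromChunkStore` is a placeholder for the real chunk store lookup.

// Illustration only: serve the cached chunk when it matches the stored one.
const redisCache = require('./chunk_store/redis')

async function loadChunkWithCache(projectId, loadFromChunkStore) {
  const cached = await redisCache.getCurrentChunk(projectId)
  const current = await loadFromChunkStore(projectId) // placeholder lookup
  if (redisCache.checkCacheValidity(cached, current)) {
    return cached
  }
  // Miss or stale entry: refresh the 5-minute cache and return the fresh chunk.
  await redisCache.setCurrentChunk(projectId, current)
  return current
}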
18
services/history-v1/storage/lib/content_hash.js
Normal file
@@ -0,0 +1,18 @@
|
||||
// @ts-check
|
||||
|
||||
const { createHash } = require('node:crypto')
|
||||
|
||||
/**
|
||||
* Compute a SHA-1 hash of the content
|
||||
*
|
||||
* This is used to validate incoming updates.
|
||||
*
|
||||
* @param {string} content
|
||||
*/
|
||||
function getContentHash(content) {
|
||||
const hash = createHash('sha-1')
|
||||
hash.update(content)
|
||||
return hash.digest('hex')
|
||||
}
|
||||
|
||||
module.exports = { getContentHash }
|
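A minimal sketch of how the hash is used to validate an incoming update; the expected hash is assumed to arrive with the client's operation.

// Illustration only: compare a document's content against a declared hash.
const { getContentHash } = require('./content_hash')

function hashMatches(content, expectedHash) {
  return getContentHash(content) === expectedHash
}

console.log(hashMatches('hello world', getContentHash('hello world'))) // true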
5
services/history-v1/storage/lib/errors.js
Normal file
@@ -0,0 +1,5 @@
|
||||
const OError = require('@overleaf/o-error')
|
||||
|
||||
class InvalidChangeError extends OError {}
|
||||
|
||||
module.exports = { InvalidChangeError }
|
30
services/history-v1/storage/lib/hash_check_blob_store.js
Normal file
@@ -0,0 +1,30 @@
|
||||
const Blob = require('overleaf-editor-core').Blob
|
||||
const blobHash = require('./blob_hash')
|
||||
const BPromise = require('bluebird')
|
||||
|
||||
// We want to simulate applying all of the operations so we can return the
|
||||
// resulting hashes to the caller for them to check. To do this, we need to be
|
||||
// able to take the lazy files in the final snapshot, fetch their content, and
|
||||
// compute the new content hashes. We don't, however, need to actually store
|
||||
// that content; we just need to get the hash.
|
||||
function HashCheckBlobStore(realBlobStore) {
|
||||
this.realBlobStore = realBlobStore
|
||||
}
|
||||
|
||||
HashCheckBlobStore.prototype.getString = BPromise.method(
|
||||
function hashCheckBlobStoreGetString(hash) {
|
||||
return this.realBlobStore.getString(hash)
|
||||
}
|
||||
)
|
||||
|
||||
HashCheckBlobStore.prototype.putString = BPromise.method(
|
||||
function hashCheckBlobStorePutString(string) {
|
||||
return new Blob(
|
||||
blobHash.fromString(string),
|
||||
Buffer.byteLength(string),
|
||||
string.length
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
module.exports = HashCheckBlobStore
|
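A hedged sketch of wrapping the real blob store, assuming `./blob_store` exports `BlobStore` as it does elsewhere in this commit; `putString` here only reports the hash the content would get, nothing is written.

// Illustration only: preview the blob hash of a string without storing it.
const { BlobStore } = require('./blob_store')
const HashCheckBlobStore = require('./hash_check_blob_store')

async function previewHash(projectId, content) {
  const hashCheckStore = new HashCheckBlobStore(new BlobStore(projectId))
  const blob = await hashCheckStore.putString(content)
  return blob.getHash() // hex hash of the would-be blob
}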
202
services/history-v1/storage/lib/history_store.js
Normal file
@@ -0,0 +1,202 @@
|
||||
// @ts-check
|
||||
'use strict'
|
||||
|
||||
const core = require('overleaf-editor-core')
|
||||
|
||||
const config = require('config')
|
||||
const path = require('node:path')
|
||||
const Stream = require('node:stream')
|
||||
const { promisify } = require('node:util')
|
||||
const zlib = require('node:zlib')
|
||||
|
||||
const OError = require('@overleaf/o-error')
|
||||
const objectPersistor = require('@overleaf/object-persistor')
|
||||
const logger = require('@overleaf/logger')
|
||||
|
||||
const assert = require('./assert')
|
||||
const persistor = require('./persistor')
|
||||
const projectKey = require('./project_key')
|
||||
const streams = require('./streams')
|
||||
|
||||
const Chunk = core.Chunk
|
||||
|
||||
const gzip = promisify(zlib.gzip)
|
||||
const gunzip = promisify(zlib.gunzip)
|
||||
|
||||
class LoadError extends OError {
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @param {any} cause
|
||||
*/
|
||||
constructor(projectId, chunkId, cause) {
|
||||
super(
|
||||
'HistoryStore: failed to load chunk history',
|
||||
{ projectId, chunkId },
|
||||
cause
|
||||
)
|
||||
this.projectId = projectId
|
||||
this.chunkId = chunkId
|
||||
}
|
||||
}
|
||||
|
||||
class StoreError extends OError {
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @param {any} cause
|
||||
*/
|
||||
constructor(projectId, chunkId, cause) {
|
||||
super(
|
||||
'HistoryStore: failed to store chunk history',
|
||||
{ projectId, chunkId },
|
||||
cause
|
||||
)
|
||||
this.projectId = projectId
|
||||
this.chunkId = chunkId
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @return {string}
|
||||
*/
|
||||
function getKey(projectId, chunkId) {
|
||||
return path.join(projectKey.format(projectId), projectKey.pad(chunkId))
|
||||
}
|
||||
|
||||
/**
|
||||
 * Store and retrieve raw {@link History} objects from the bucket. Mainly used via the
|
||||
* {@link ChunkStore}.
|
||||
*
|
||||
* Histories are stored as gzipped JSON blobs, keyed on the project ID and the
|
||||
* ID of the Chunk that owns the history. The project ID is currently redundant,
|
||||
* but I think it might help in future if we have to shard on project ID, and
|
||||
* it gives us some chance of reconstructing histories even if there is a
|
||||
* problem with the chunk metadata in the database.
|
||||
*
|
||||
* @class
|
||||
*/
|
||||
class HistoryStore {
|
||||
#persistor
|
||||
#bucket
|
||||
constructor(persistor, bucket) {
|
||||
this.#persistor = persistor
|
||||
this.#bucket = bucket
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the raw object for a History.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @return {Promise<import('overleaf-editor-core/lib/types').RawHistory>}
|
||||
*/
|
||||
async loadRaw(projectId, chunkId) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.chunkId(chunkId, 'bad chunkId')
|
||||
|
||||
const key = getKey(projectId, chunkId)
|
||||
|
||||
logger.debug({ projectId, chunkId }, 'loadRaw started')
|
||||
try {
|
||||
const buf = await streams.gunzipStreamToBuffer(
|
||||
await this.#persistor.getObjectStream(this.#bucket, key)
|
||||
)
|
||||
return JSON.parse(buf.toString('utf-8'))
|
||||
} catch (err) {
|
||||
if (err instanceof objectPersistor.Errors.NotFoundError) {
|
||||
throw new Chunk.NotPersistedError(projectId)
|
||||
}
|
||||
throw new LoadError(projectId, chunkId, err)
|
||||
} finally {
|
||||
logger.debug({ projectId, chunkId }, 'loadRaw finished')
|
||||
}
|
||||
}
|
||||
|
||||
async loadRawWithBuffer(projectId, chunkId) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.chunkId(chunkId, 'bad chunkId')
|
||||
|
||||
const key = getKey(projectId, chunkId)
|
||||
|
||||
logger.debug({ projectId, chunkId }, 'loadBuffer started')
|
||||
try {
|
||||
const buf = await streams.readStreamToBuffer(
|
||||
await this.#persistor.getObjectStream(this.#bucket, key)
|
||||
)
|
||||
const unzipped = await gunzip(buf)
|
||||
return {
|
||||
buffer: buf,
|
||||
raw: JSON.parse(unzipped.toString('utf-8')),
|
||||
}
|
||||
} catch (err) {
|
||||
if (err instanceof objectPersistor.Errors.NotFoundError) {
|
||||
throw new Chunk.NotPersistedError(projectId)
|
||||
}
|
||||
throw new LoadError(projectId, chunkId, err)
|
||||
} finally {
|
||||
logger.debug({ projectId, chunkId }, 'loadBuffer finished')
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compress and store a {@link History}.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} chunkId
|
||||
* @param {import('overleaf-editor-core/lib/types').RawHistory} rawHistory
|
||||
*/
|
||||
async storeRaw(projectId, chunkId, rawHistory) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.chunkId(chunkId, 'bad chunkId')
|
||||
assert.object(rawHistory, 'bad rawHistory')
|
||||
|
||||
const key = getKey(projectId, chunkId)
|
||||
|
||||
logger.debug({ projectId, chunkId }, 'storeRaw started')
|
||||
|
||||
const buf = await gzip(JSON.stringify(rawHistory))
|
||||
try {
|
||||
await this.#persistor.sendStream(
|
||||
this.#bucket,
|
||||
key,
|
||||
Stream.Readable.from([buf]),
|
||||
{
|
||||
contentType: 'application/json',
|
||||
contentEncoding: 'gzip',
|
||||
contentLength: buf.byteLength,
|
||||
}
|
||||
)
|
||||
} catch (err) {
|
||||
throw new StoreError(projectId, chunkId, err)
|
||||
} finally {
|
||||
logger.debug({ projectId, chunkId }, 'storeRaw finished')
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
 * Delete multiple chunks from the bucket. Expects an array of objects with
|
||||
* projectId and chunkId properties
|
||||
* @param {Array<{projectId: string,chunkId:string}>} chunks
|
||||
*/
|
||||
async deleteChunks(chunks) {
|
||||
logger.debug({ chunks }, 'deleteChunks started')
|
||||
try {
|
||||
await Promise.all(
|
||||
chunks.map(chunk => {
|
||||
const key = getKey(chunk.projectId, chunk.chunkId)
|
||||
return this.#persistor.deleteObject(this.#bucket, key)
|
||||
})
|
||||
)
|
||||
} finally {
|
||||
logger.debug({ chunks }, 'deleteChunks finished')
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
HistoryStore,
|
||||
historyStore: new HistoryStore(persistor, config.get('chunkStore.bucket')),
|
||||
}
|
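A hedged sketch of a round trip through the store above; the chunk ids are placeholders.

// Illustration only: copy a chunk's raw history to another key in the bucket.
const { historyStore } = require('./history_store')

async function copyChunkHistory(projectId, fromChunkId, toChunkId) {
  // Throws Chunk.NotPersistedError if the source object is missing.
  const rawHistory = await historyStore.loadRaw(projectId, fromChunkId)
  await historyStore.storeRaw(projectId, toChunkId, rawHistory)
}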
8
services/history-v1/storage/lib/knex.js
Normal file
@@ -0,0 +1,8 @@
|
||||
// @ts-check
|
||||
|
||||
'use strict'
|
||||
|
||||
const env = process.env.NODE_ENV || 'development'
|
||||
|
||||
const knexfile = require('../../knexfile')
|
||||
module.exports = require('knex').default(knexfile[env])
|
19
services/history-v1/storage/lib/knex_read_only.js
Normal file
@@ -0,0 +1,19 @@
|
||||
'use strict'
|
||||
|
||||
const config = require('config')
|
||||
const knexfile = require('../../knexfile')
|
||||
|
||||
const env = process.env.NODE_ENV || 'development'
|
||||
|
||||
if (config.databaseUrlReadOnly) {
|
||||
module.exports = require('knex')({
|
||||
...knexfile[env],
|
||||
pool: {
|
||||
...knexfile[env].pool,
|
||||
min: 0,
|
||||
},
|
||||
connection: config.databaseUrlReadOnly,
|
||||
})
|
||||
} else {
|
||||
module.exports = require('./knex')
|
||||
}
|
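A hedged sketch of routing a read query through the read-only connection; when `databaseUrlReadOnly` is unset both handles share the same pool.

// Illustration only: count a Postgres project's chunks on the follower.
const knexReadOnly = require('./knex_read_only')

async function countChunks(projectId) {
  const [row] = await knexReadOnly('chunks')
    .count('id as count')
    .where('doc_id', parseInt(projectId, 10))
  return parseInt(row.count, 10)
}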
30
services/history-v1/storage/lib/mongodb.js
Normal file
@@ -0,0 +1,30 @@
|
||||
const Metrics = require('@overleaf/metrics')
|
||||
|
||||
const config = require('config')
|
||||
const { MongoClient } = require('mongodb')
|
||||
|
||||
const client = new MongoClient(config.mongo.uri)
|
||||
const db = client.db()
|
||||
|
||||
const chunks = db.collection('projectHistoryChunks')
|
||||
const blobs = db.collection('projectHistoryBlobs')
|
||||
const globalBlobs = db.collection('projectHistoryGlobalBlobs')
|
||||
const shardedBlobs = db.collection('projectHistoryShardedBlobs')
|
||||
const projects = db.collection('projects')
|
||||
// Temporary collection for tracking progress of backed up old blobs (without a hash).
|
||||
// The initial sync process will be able to skip over these.
|
||||
// Schema: _id: projectId, blobs: [Binary]
|
||||
const backedUpBlobs = db.collection('projectHistoryBackedUpBlobs')
|
||||
|
||||
Metrics.mongodb.monitor(client)
|
||||
|
||||
module.exports = {
|
||||
client,
|
||||
db,
|
||||
chunks,
|
||||
blobs,
|
||||
globalBlobs,
|
||||
projects,
|
||||
shardedBlobs,
|
||||
backedUpBlobs,
|
||||
}
|
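A minimal sketch of querying one of the collection handles above directly; the project id is illustrative.

// Illustration only: count a project's live chunk records.
const { ObjectId } = require('mongodb')
const { chunks } = require('./mongodb')

async function countLiveChunks(projectId) {
  return await chunks.countDocuments({
    projectId: new ObjectId(projectId),
    state: { $in: ['active', 'closed'] },
  })
}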
261
services/history-v1/storage/lib/persist_changes.js
Normal file
@@ -0,0 +1,261 @@
|
||||
// @ts-check
|
||||
|
||||
'use strict'
|
||||
|
||||
const _ = require('lodash')
|
||||
const logger = require('@overleaf/logger')
|
||||
|
||||
const core = require('overleaf-editor-core')
|
||||
const Chunk = core.Chunk
|
||||
const History = core.History
|
||||
|
||||
const assert = require('./assert')
|
||||
const chunkStore = require('./chunk_store')
|
||||
const { BlobStore } = require('./blob_store')
|
||||
const { InvalidChangeError } = require('./errors')
|
||||
const { getContentHash } = require('./content_hash')
|
||||
|
||||
function countChangeBytes(change) {
|
||||
// Note: This is not quite accurate, because the raw change may contain raw
|
||||
// file info (or conceivably even content) that will not be included in the
|
||||
// actual stored object.
|
||||
return Buffer.byteLength(JSON.stringify(change.toRaw()))
|
||||
}
|
||||
|
||||
function totalChangeBytes(changes) {
|
||||
return changes.length ? _(changes).map(countChangeBytes).sum() : 0
|
||||
}
|
||||
|
||||
// provide a simple timer function
|
||||
function Timer() {
|
||||
this.t0 = process.hrtime()
|
||||
}
|
||||
Timer.prototype.elapsed = function () {
|
||||
const dt = process.hrtime(this.t0)
|
||||
const timeInMilliseconds = (dt[0] + dt[1] * 1e-9) * 1e3
|
||||
return timeInMilliseconds
|
||||
}
|
||||
|
||||
/**
|
||||
* Break the given set of changes into zero or more Chunks according to the
|
||||
* provided limits and store them.
|
||||
*
|
||||
* Some other possible improvements:
|
||||
* 1. This does a lot more JSON serialization than it has to. We may know the
|
||||
* JSON for the changes before we call this function, so we could in that
|
||||
* case get the byte size of each change without doing any work. Even if we
|
||||
* don't know it initially, we could save some computation by caching this
|
||||
* info rather than recomputing it many times. TBD whether it is worthwhile.
|
||||
* 2. We don't necessarily have to fetch the latest chunk in order to determine
|
||||
* that it is full. We could store this in the chunk metadata record. It may
|
||||
* be worth distinguishing between a Chunk and its metadata record. The
|
||||
* endVersion may be better suited to the metadata record.
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {core.Change[]} allChanges
|
||||
* @param {Object} limits
|
||||
* @param {number} clientEndVersion
|
||||
* @return {Promise.<Object?>}
|
||||
*/
|
||||
async function persistChanges(projectId, allChanges, limits, clientEndVersion) {
|
||||
assert.projectId(projectId)
|
||||
assert.array(allChanges)
|
||||
assert.maybe.object(limits)
|
||||
assert.integer(clientEndVersion)
|
||||
|
||||
const blobStore = new BlobStore(projectId)
|
||||
|
||||
const earliestChangeTimestamp =
|
||||
allChanges.length > 0 ? allChanges[0].getTimestamp() : null
|
||||
|
||||
let currentChunk
|
||||
|
||||
/**
|
||||
* currentSnapshot tracks the latest change that we're applying; we use it to
|
||||
* check that the changes we are persisting are valid.
|
||||
*
|
||||
* @type {core.Snapshot}
|
||||
*/
|
||||
let currentSnapshot
|
||||
|
||||
let originalEndVersion
|
||||
let changesToPersist
|
||||
|
||||
limits = limits || {}
|
||||
_.defaults(limits, {
|
||||
changeBucketMinutes: 60,
|
||||
maxChanges: 2500,
|
||||
maxChangeBytes: 5 * 1024 * 1024,
|
||||
maxChunkChanges: 2000,
|
||||
maxChunkChangeBytes: 5 * 1024 * 1024,
|
||||
maxChunkChangeTime: 5000, // warn if total time for changes in a chunk takes longer than this
|
||||
})
|
||||
|
||||
function checkElapsedTime(timer) {
|
||||
const timeTaken = timer.elapsed()
|
||||
if (timeTaken > limits.maxChunkChangeTime) {
|
||||
console.log('warning: slow chunk', projectId, timeTaken)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add changes to a chunk until the chunk is full
|
||||
*
|
||||
* The chunk is full if it reaches a certain number of changes or a certain
|
||||
* size in bytes
|
||||
*
|
||||
* @param {core.Chunk} chunk
|
||||
* @param {core.Change[]} changes
|
||||
*/
|
||||
async function fillChunk(chunk, changes) {
|
||||
let totalBytes = totalChangeBytes(chunk.getChanges())
|
||||
let changesPushed = false
|
||||
while (changes.length > 0) {
|
||||
if (chunk.getChanges().length >= limits.maxChunkChanges) {
|
||||
break
|
||||
}
|
||||
|
||||
const change = changes[0]
|
||||
const changeBytes = countChangeBytes(change)
|
||||
|
||||
if (totalBytes + changeBytes > limits.maxChunkChangeBytes) {
|
||||
break
|
||||
}
|
||||
|
||||
for (const operation of change.iterativelyApplyTo(currentSnapshot, {
|
||||
strict: true,
|
||||
})) {
|
||||
await validateContentHash(operation)
|
||||
}
|
||||
|
||||
chunk.pushChanges([change])
|
||||
changes.shift()
|
||||
totalBytes += changeBytes
|
||||
changesPushed = true
|
||||
}
|
||||
return changesPushed
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that the operation is valid and can be incorporated to the history.
|
||||
*
|
||||
* For now, this checks content hashes when they are provided.
|
||||
*
|
||||
* @param {core.Operation} operation
|
||||
*/
|
||||
async function validateContentHash(operation) {
|
||||
if (operation instanceof core.EditFileOperation) {
|
||||
const editOperation = operation.getOperation()
|
||||
if (
|
||||
editOperation instanceof core.TextOperation &&
|
||||
editOperation.contentHash != null
|
||||
) {
|
||||
const path = operation.getPathname()
|
||||
const file = currentSnapshot.getFile(path)
|
||||
if (file == null) {
|
||||
throw new InvalidChangeError('file not found for hash validation', {
|
||||
projectId,
|
||||
path,
|
||||
})
|
||||
}
|
||||
await file.load('eager', blobStore)
|
||||
const content = file.getContent({ filterTrackedDeletes: true })
|
||||
const expectedHash = editOperation.contentHash
|
||||
const actualHash = content != null ? getContentHash(content) : null
|
||||
logger.debug({ expectedHash, actualHash }, 'validating content hash')
|
||||
if (actualHash !== expectedHash) {
|
||||
throw new InvalidChangeError('content hash mismatch', {
|
||||
projectId,
|
||||
path,
|
||||
expectedHash,
|
||||
actualHash,
|
||||
})
|
||||
}
|
||||
|
||||
// Remove the content hash from the change before storing it in the chunk.
|
||||
// It was only useful for validation.
|
||||
editOperation.contentHash = null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function extendLastChunkIfPossible() {
|
||||
const latestChunk = await chunkStore.loadLatest(projectId)
|
||||
|
||||
currentChunk = latestChunk
|
||||
originalEndVersion = latestChunk.getEndVersion()
|
||||
if (originalEndVersion !== clientEndVersion) {
|
||||
throw new Chunk.ConflictingEndVersion(
|
||||
clientEndVersion,
|
||||
originalEndVersion
|
||||
)
|
||||
}
|
||||
|
||||
currentSnapshot = latestChunk.getSnapshot().clone()
|
||||
const timer = new Timer()
|
||||
currentSnapshot.applyAll(latestChunk.getChanges())
|
||||
|
||||
const changesPushed = await fillChunk(currentChunk, changesToPersist)
|
||||
if (!changesPushed) {
|
||||
return
|
||||
}
|
||||
|
||||
checkElapsedTime(timer)
|
||||
|
||||
await chunkStore.update(
|
||||
projectId,
|
||||
originalEndVersion,
|
||||
currentChunk,
|
||||
earliestChangeTimestamp
|
||||
)
|
||||
}
|
||||
|
||||
async function createNewChunksAsNeeded() {
|
||||
while (changesToPersist.length > 0) {
|
||||
const endVersion = currentChunk.getEndVersion()
|
||||
const history = new History(currentSnapshot.clone(), [])
|
||||
const chunk = new Chunk(history, endVersion)
|
||||
const timer = new Timer()
|
||||
|
||||
const changesPushed = await fillChunk(chunk, changesToPersist)
|
||||
if (changesPushed) {
|
||||
checkElapsedTime(timer)
|
||||
currentChunk = chunk
|
||||
await chunkStore.create(projectId, chunk, earliestChangeTimestamp)
|
||||
} else {
|
||||
throw new Error('failed to fill empty chunk')
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function isOlderThanMinChangeTimestamp(change) {
|
||||
return change.getTimestamp().getTime() < limits.minChangeTimestamp
|
||||
}
|
||||
|
||||
function isOlderThanMaxChangeTimestamp(change) {
|
||||
return change.getTimestamp().getTime() < limits.maxChangeTimestamp
|
||||
}
|
||||
|
||||
const oldChanges = _.filter(allChanges, isOlderThanMinChangeTimestamp)
|
||||
const anyTooOld = _.some(oldChanges, isOlderThanMaxChangeTimestamp)
|
||||
const tooManyChanges = oldChanges.length > limits.maxChanges
|
||||
const tooManyBytes = totalChangeBytes(oldChanges) > limits.maxChangeBytes
|
||||
|
||||
if (anyTooOld || tooManyChanges || tooManyBytes) {
|
||||
changesToPersist = oldChanges
|
||||
const numberOfChangesToPersist = oldChanges.length
|
||||
|
||||
await extendLastChunkIfPossible()
|
||||
await createNewChunksAsNeeded()
|
||||
|
||||
return {
|
||||
numberOfChangesPersisted: numberOfChangesToPersist,
|
||||
originalEndVersion,
|
||||
currentChunk,
|
||||
}
|
||||
} else {
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = persistChanges
|
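The tail of persist_changes.js above shows the flush decision (persist once any pending change is older than maxChangeTimestamp, or the old changes exceed maxChanges or maxChangeBytes) but not the exported entry point, which sits in the earlier part of the file. A minimal caller-side sketch, assuming a signature of persistChanges(projectId, allChanges, limits, clientEndVersion) with timestamps given as epoch milliseconds:

const persistChanges = require('./persist_changes')

async function flushOldChanges(projectId, changes, clientEndVersion) {
  const now = Date.now()
  const limits = {
    minChangeTimestamp: now - 60 * 1000, // only consider changes older than one minute
    maxChangeTimestamp: now - 5 * 60 * 1000, // force a flush once anything is five minutes old
    // the remaining limits fall back to the defaults applied by _.defaults above
  }
  const result = await persistChanges(projectId, changes, limits, clientEndVersion)
  if (result) {
    console.log('persisted', result.numberOfChangesPersisted, 'changes')
  } else {
    console.log('nothing old enough to persist yet') // persistChanges returned null
  }
}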
27
services/history-v1/storage/lib/persistor.js
Normal file
27
services/history-v1/storage/lib/persistor.js
Normal file
@@ -0,0 +1,27 @@
|
||||
const _ = require('lodash')
|
||||
const config = require('config')
|
||||
const metrics = require('@overleaf/metrics')
|
||||
const objectPersistor = require('@overleaf/object-persistor')
|
||||
|
||||
const persistorConfig = _.cloneDeep(config.get('persistor'))
|
||||
|
||||
function convertKey(key, convertFn) {
|
||||
if (_.has(persistorConfig, key)) {
|
||||
_.update(persistorConfig, key, convertFn)
|
||||
}
|
||||
}
|
||||
|
||||
convertKey('s3.signedUrlExpiryInMs', s => parseInt(s, 10))
|
||||
convertKey('s3.httpOptions.timeout', s => parseInt(s, 10))
|
||||
convertKey('s3.maxRetries', s => parseInt(s, 10))
|
||||
convertKey('s3.pathStyle', s => s === 'true')
|
||||
convertKey('gcs.unlockBeforeDelete', s => s === 'true')
|
||||
convertKey('gcs.unsignedUrls', s => s === 'true')
|
||||
convertKey('gcs.signedUrlExpiryInMs', s => parseInt(s, 10))
|
||||
convertKey('gcs.deleteConcurrency', s => parseInt(s, 10))
|
||||
convertKey('gcs.retryOptions.maxRetries', s => parseInt(s, 10))
|
||||
convertKey('fallback.buckets', s => JSON.parse(s || '{}'))
|
||||
|
||||
persistorConfig.Metrics = metrics
|
||||
|
||||
module.exports = objectPersistor(persistorConfig)
|
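persistor.js only coerces string-valued settings (anything sourced from environment variables arrives as a string) into the types @overleaf/object-persistor expects. A small self-contained illustration of the convertKey pattern, using a made-up config object rather than the real config.get('persistor') output:

const _ = require('lodash')

const example = { backend: 's3', s3: { maxRetries: '3', pathStyle: 'true' } }

function convertKey(obj, key, convertFn) {
  if (_.has(obj, key)) _.update(obj, key, convertFn)
}

convertKey(example, 's3.maxRetries', s => parseInt(s, 10))
convertKey(example, 's3.pathStyle', s => s === 'true')
console.log(example) // { backend: 's3', s3: { maxRetries: 3, pathStyle: true } }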
140
services/history-v1/storage/lib/project_archive.js
Normal file
140
services/history-v1/storage/lib/project_archive.js
Normal file
@@ -0,0 +1,140 @@
|
||||
// @ts-check
|
||||
'use strict'
|
||||
|
||||
/**
|
||||
* @import { Snapshot } from 'overleaf-editor-core'
|
||||
* @import { BlobStore } from '../../storage/lib/blob_store/index'
|
||||
*/
|
||||
|
||||
const Archive = require('archiver')
|
||||
const BPromise = require('bluebird')
|
||||
const fs = require('node:fs')
|
||||
const { pipeline } = require('node:stream')
|
||||
|
||||
const core = require('overleaf-editor-core')
|
||||
|
||||
const Snapshot = core.Snapshot
|
||||
const OError = require('@overleaf/o-error')
|
||||
|
||||
const assert = require('./assert')
|
||||
|
||||
// The maximum safe concurrency appears to be 1.
|
||||
// https://github.com/overleaf/issues/issues/1909
|
||||
const FETCH_CONCURRENCY = 1 // number of files to fetch at once
|
||||
const DEFAULT_ZIP_TIMEOUT = 25000 // ms
|
||||
|
||||
class DownloadError extends OError {
|
||||
constructor(hash) {
|
||||
super(`ProjectArchive: blob download failed: ${hash}`, { hash })
|
||||
}
|
||||
}
|
||||
|
||||
class ArchiveTimeout extends OError {
|
||||
constructor() {
|
||||
super('ProjectArchive timed out')
|
||||
}
|
||||
}
|
||||
|
||||
class MissingfileError extends OError {
|
||||
constructor() {
|
||||
super('ProjectArchive: attempting to look up a file that does not exist')
|
||||
}
|
||||
}
|
||||
|
||||
class ProjectArchive {
|
||||
static ArchiveTimeout = ArchiveTimeout
|
||||
static MissingfileError = MissingfileError
|
||||
static DownloadError = DownloadError
|
||||
|
||||
/**
|
||||
* @constructor
|
||||
* @param {Snapshot} snapshot
|
||||
* @param {number} [timeout] in ms
|
||||
* @classdesc
|
||||
* Writes the project snapshot to a zip file.
|
||||
*/
|
||||
constructor(snapshot, timeout) {
|
||||
assert.instance(snapshot, Snapshot)
|
||||
this.snapshot = snapshot
|
||||
this.timeout = timeout || DEFAULT_ZIP_TIMEOUT
|
||||
}
|
||||
|
||||
/**
|
||||
* Write zip archive to the given file path.
|
||||
*
|
||||
* @param {BlobStore} blobStore
|
||||
* @param {string} zipFilePath
|
||||
*/
|
||||
writeZip(blobStore, zipFilePath) {
|
||||
const snapshot = this.snapshot
|
||||
const timeout = this.timeout
|
||||
|
||||
const startTime = process.hrtime()
|
||||
const archive = new Archive('zip')
|
||||
|
||||
// Convert elapsed seconds and nanoseconds to milliseconds.
|
||||
function findElapsedMilliseconds() {
|
||||
const elapsed = process.hrtime(startTime)
|
||||
return elapsed[0] * 1e3 + elapsed[1] * 1e-6
|
||||
}
|
||||
|
||||
function addFileToArchive(pathname) {
|
||||
if (findElapsedMilliseconds() > timeout) {
|
||||
throw new ProjectArchive.ArchiveTimeout()
|
||||
}
|
||||
|
||||
const file = snapshot.getFile(pathname)
|
||||
if (!file) {
|
||||
throw new ProjectArchive.MissingfileError()
|
||||
}
|
||||
return file.load('eager', blobStore).then(function () {
|
||||
const content = file.getContent({ filterTrackedDeletes: true })
|
||||
if (content === null) {
|
||||
return streamFileToArchive(pathname, file).catch(function (err) {
|
||||
throw new ProjectArchive.DownloadError(file.getHash()).withCause(
|
||||
err
|
||||
)
|
||||
})
|
||||
} else {
|
||||
archive.append(content, { name: pathname })
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
function streamFileToArchive(pathname, file) {
|
||||
return new BPromise(function (resolve, reject) {
|
||||
blobStore
|
||||
.getStream(file.getHash())
|
||||
.then(stream => {
|
||||
stream.on('error', reject)
|
||||
stream.on('end', resolve)
|
||||
archive.append(stream, { name: pathname })
|
||||
})
|
||||
.catch(reject)
|
||||
})
|
||||
}
|
||||
|
||||
const addFilesToArchiveAndFinalize = BPromise.map(
|
||||
snapshot.getFilePathnames(),
|
||||
addFileToArchive,
|
||||
{ concurrency: FETCH_CONCURRENCY }
|
||||
).then(function () {
|
||||
archive.finalize()
|
||||
})
|
||||
|
||||
const streamArchiveToFile = new BPromise(function (resolve, reject) {
|
||||
const stream = fs.createWriteStream(zipFilePath)
|
||||
pipeline(archive, stream, function (err) {
|
||||
if (err) {
|
||||
reject(err)
|
||||
} else {
|
||||
resolve()
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
return BPromise.join(streamArchiveToFile, addFilesToArchiveAndFinalize)
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = ProjectArchive
|
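A usage sketch for ProjectArchive. It assumes a BlobStore can be constructed from a project id, mirroring how zip_store.js below instantiates it; the real constructor lives in blob_store/index.js, which is not part of this diff:

const ProjectArchive = require('./project_archive')
const { BlobStore } = require('./blob_store')

async function exportSnapshotToZip(projectId, snapshot, zipFilePath) {
  const blobStore = new BlobStore(projectId) // assumed constructor argument
  const archive = new ProjectArchive(snapshot, 60 * 1000) // 60s instead of the 25s default timeout
  await archive.writeZip(blobStore, zipFilePath)
}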
24
services/history-v1/storage/lib/project_key.js
Normal file
24
services/history-v1/storage/lib/project_key.js
Normal file
@@ -0,0 +1,24 @@
|
||||
// Keep in sync with services/web/app/src/Features/History/project_key.js
|
||||
const _ = require('lodash')
|
||||
const path = require('node:path')
|
||||
|
||||
//
|
||||
// The advice in http://docs.aws.amazon.com/AmazonS3/latest/dev/
|
||||
// request-rate-perf-considerations.html is to avoid sequential key prefixes,
|
||||
// so we reverse the project ID part of the key as they suggest.
|
||||
//
|
||||
function format(projectId) {
|
||||
const prefix = naiveReverse(pad(projectId))
|
||||
return path.join(prefix.slice(0, 3), prefix.slice(3, 6), prefix.slice(6))
|
||||
}
|
||||
|
||||
function pad(number) {
|
||||
return _.padStart(number, 9, '0')
|
||||
}
|
||||
|
||||
function naiveReverse(string) {
|
||||
return string.split('').reverse().join('')
|
||||
}
|
||||
|
||||
exports.format = format
|
||||
exports.pad = pad
|
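A worked example of the resulting key layout: the id is zero-padded to nine characters, reversed so that sequential ids spread across prefixes, and split into 3/3/3 path segments.

const projectKey = require('./project_key')

projectKey.pad('1234') // => '000001234'
projectKey.format('1234') // => '432/100/000' (reverse of '000001234', split into thirds)
projectKey.format('123456789') // => '987/654/321'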
19
services/history-v1/storage/lib/redis.js
Normal file
19
services/history-v1/storage/lib/redis.js
Normal file
@@ -0,0 +1,19 @@
|
||||
const config = require('config')
|
||||
const redis = require('@overleaf/redis-wrapper')
|
||||
|
||||
const historyRedisOptions = config.get('redis.history')
|
||||
const rclientHistory = redis.createClient(historyRedisOptions)
|
||||
|
||||
const lockRedisOptions = config.get('redis.history')
|
||||
const rclientLock = redis.createClient(lockRedisOptions)
|
||||
|
||||
async function disconnect() {
|
||||
await Promise.all([rclientHistory.disconnect(), rclientLock.disconnect()])
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
rclientHistory,
|
||||
rclientLock,
|
||||
redis,
|
||||
disconnect,
|
||||
}
|
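Note that both clients above are currently created from the same redis.history config block. A usage sketch, assuming the wrapper exposes the usual ioredis-style commands:

const { rclientHistory, disconnect } = require('./redis')

async function example() {
  await rclientHistory.set('history:example-key', 'value')
  console.log(await rclientHistory.get('history:example-key'))
  await disconnect() // call during shutdown so the process can exit cleanly
}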
40
services/history-v1/storage/lib/streams.js
Normal file
40
services/history-v1/storage/lib/streams.js
Normal file
@@ -0,0 +1,40 @@
|
||||
// @ts-check
|
||||
/**
|
||||
* Promises are promises and streams are streams, and ne'er the twain shall
|
||||
* meet.
|
||||
* @module
|
||||
*/
|
||||
'use strict'
|
||||
|
||||
const Stream = require('node:stream')
|
||||
const zlib = require('node:zlib')
|
||||
const { WritableBuffer } = require('@overleaf/stream-utils')
|
||||
|
||||
/**
|
||||
* Create a promise for the result of reading a stream to a buffer.
|
||||
*
|
||||
* @param {Stream.Readable} readStream
|
||||
* @return {Promise<Buffer>}
|
||||
*/
|
||||
async function readStreamToBuffer(readStream) {
|
||||
const bufferStream = new WritableBuffer()
|
||||
await Stream.promises.pipeline(readStream, bufferStream)
|
||||
return bufferStream.contents()
|
||||
}
|
||||
|
||||
exports.readStreamToBuffer = readStreamToBuffer
|
||||
|
||||
/**
|
||||
* Create a promise for the result of un-gzipping a stream to a buffer.
|
||||
*
|
||||
* @param {NodeJS.ReadableStream} readStream
|
||||
* @return {Promise<Buffer>}
|
||||
*/
|
||||
async function gunzipStreamToBuffer(readStream) {
|
||||
const gunzip = zlib.createGunzip()
|
||||
const bufferStream = new WritableBuffer()
|
||||
await Stream.promises.pipeline(readStream, gunzip, bufferStream)
|
||||
return bufferStream.contents()
|
||||
}
|
||||
|
||||
exports.gunzipStreamToBuffer = gunzipStreamToBuffer
|
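A short usage sketch for the two helpers, round-tripping a file through gzip to show they agree:

const fs = require('node:fs')
const zlib = require('node:zlib')
const streams = require('./streams')

async function example(path) {
  const plain = await streams.readStreamToBuffer(fs.createReadStream(path))
  const unzipped = await streams.gunzipStreamToBuffer(
    fs.createReadStream(path).pipe(zlib.createGzip()) // gzip on the fly, then gunzip back
  )
  console.log(plain.equals(unzipped)) // true
}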
25
services/history-v1/storage/lib/temp.js
Normal file
25
services/history-v1/storage/lib/temp.js
Normal file
@@ -0,0 +1,25 @@
|
||||
/*
|
||||
* Taken from renderer/app/helpers/temp.js with minor cosmetic changes.
|
||||
* Promisify the temp package. The temp package provides a 'track' feature
|
||||
* that automatically cleans up temp files at process exit, but that is not
|
||||
* very useful. They also provide a method to trigger cleanup, but that is not
|
||||
* safe for concurrent use. So, we use a disposer to unlink the file.
|
||||
*/
|
||||
|
||||
const BPromise = require('bluebird')
|
||||
const fs = BPromise.promisifyAll(require('node:fs'))
|
||||
const temp = BPromise.promisifyAll(require('temp'))
|
||||
|
||||
exports.open = function (affixes) {
|
||||
return temp.openAsync(affixes).disposer(function (fileInfo) {
|
||||
fs.closeAsync(fileInfo.fd)
|
||||
.then(() => {
|
||||
return fs.unlinkAsync(fileInfo.path)
|
||||
})
|
||||
.catch(function (err) {
|
||||
if (err.code !== 'ENOENT') {
|
||||
throw err
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
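A generic usage sketch for the promisified helper; zip_store.js below shows the same pattern with a real workload. The object-style affixes argument here is illustrative, since temp.open also accepts a plain string affix as zip_store.js does:

const BPromise = require('bluebird')
const temp = require('./temp')

function withTempFile(work) {
  // the disposer returned by temp.open closes and unlinks the file once work settles
  return BPromise.using(temp.open({ suffix: '.tmp' }), fileInfo =>
    work(fileInfo.path, fileInfo.fd)
  )
}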
134
services/history-v1/storage/lib/zip_store.js
Normal file
134
services/history-v1/storage/lib/zip_store.js
Normal file
@@ -0,0 +1,134 @@
|
||||
'use strict'
|
||||
|
||||
const BPromise = require('bluebird')
|
||||
const config = require('config')
|
||||
const fs = require('node:fs')
|
||||
const path = require('node:path')
|
||||
|
||||
const OError = require('@overleaf/o-error')
|
||||
const objectPersistor = require('@overleaf/object-persistor')
|
||||
|
||||
const assert = require('./assert')
|
||||
const { BlobStore } = require('./blob_store')
|
||||
const persistor = require('./persistor')
|
||||
const ProjectArchive = require('./project_archive')
|
||||
const projectKey = require('./project_key')
|
||||
const temp = require('./temp')
|
||||
|
||||
const BUCKET = config.get('zipStore.bucket')
|
||||
|
||||
function getZipKey(projectId, version) {
|
||||
return path.join(
|
||||
projectKey.format(projectId),
|
||||
version.toString(),
|
||||
'project.zip'
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Store a zip of a given version of a project in bucket.
|
||||
*
|
||||
* @class
|
||||
*/
|
||||
class ZipStore {
|
||||
/**
|
||||
* Generate signed link to access the zip file.
|
||||
*
|
||||
* @param {number | string} projectId
|
||||
* @param {number} version
|
||||
* @return {Promise<string>}
|
||||
*/
|
||||
async getSignedUrl(projectId, version) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.integer(version, 'bad version')
|
||||
|
||||
const key = getZipKey(projectId, version)
|
||||
return await persistor.getRedirectUrl(BUCKET, key)
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a zip of the given snapshot.
|
||||
*
|
||||
* @param {number | string} projectId
|
||||
* @param {number} version
|
||||
* @param {Snapshot} snapshot
|
||||
*/
|
||||
async storeZip(projectId, version, snapshot) {
|
||||
assert.projectId(projectId, 'bad projectId')
|
||||
assert.integer(version, 'bad version')
|
||||
assert.object(snapshot, 'bad snapshot')
|
||||
|
||||
const zipKey = getZipKey(projectId, version)
|
||||
|
||||
if (await isZipPresent()) return
|
||||
|
||||
await BPromise.using(temp.open('zip'), async tempFileInfo => {
|
||||
await zipSnapshot(tempFileInfo.path, snapshot)
|
||||
await uploadZip(tempFileInfo.path)
|
||||
})
|
||||
|
||||
// If the file is already there, we don't need to build the zip again. If we
|
||||
// just HEAD the file, there's a race condition, because the zip files
|
||||
// automatically expire. So, we try to copy the file from itself to itself,
|
||||
// and if it fails, we know the file didn't exist. If it succeeds, this has
|
||||
// the effect of re-extending its lifetime.
|
||||
async function isZipPresent() {
|
||||
try {
|
||||
await persistor.copyObject(BUCKET, zipKey, zipKey)
|
||||
return true
|
||||
} catch (error) {
|
||||
if (!(error instanceof objectPersistor.Errors.NotFoundError)) {
|
||||
console.error(
|
||||
'storeZip: isZipPresent: unexpected error (except in dev): %s',
|
||||
error
|
||||
)
|
||||
}
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
async function zipSnapshot(tempPathname, snapshot) {
|
||||
const blobStore = new BlobStore(projectId)
|
||||
const zipTimeoutMs = parseInt(config.get('zipStore.zipTimeoutMs'), 10)
|
||||
const archive = new ProjectArchive(snapshot, zipTimeoutMs)
|
||||
try {
|
||||
await archive.writeZip(blobStore, tempPathname)
|
||||
} catch (err) {
|
||||
throw new ZipStore.CreationError(projectId, version).withCause(err)
|
||||
}
|
||||
}
|
||||
|
||||
async function uploadZip(tempPathname) {
|
||||
const stream = fs.createReadStream(tempPathname)
|
||||
try {
|
||||
await persistor.sendStream(BUCKET, zipKey, stream, {
|
||||
contentType: 'application/zip',
|
||||
})
|
||||
} catch (err) {
|
||||
throw new ZipStore.UploadError(projectId, version).withCause(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class CreationError extends OError {
|
||||
constructor(projectId, version) {
|
||||
super(`Zip creation failed for ${projectId} version ${version}`, {
|
||||
projectId,
|
||||
version,
|
||||
})
|
||||
}
|
||||
}
|
||||
ZipStore.CreationError = CreationError
|
||||
|
||||
class UploadError extends OError {
|
||||
constructor(projectId, version) {
|
||||
super(`Zip upload failed for ${projectId} version ${version}`, {
|
||||
projectId,
|
||||
version,
|
||||
})
|
||||
}
|
||||
}
|
||||
ZipStore.UploadError = UploadError
|
||||
|
||||
module.exports = new ZipStore()
|
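A caller-side sketch for the zip store, assuming the snapshot has already been loaded elsewhere and that zipStore.bucket and zipStore.zipTimeoutMs are present in the service config:

const zipStore = require('./zip_store')

async function buildAndShareZip(projectId, version, snapshot) {
  await zipStore.storeZip(projectId, version, snapshot) // no-op if the zip is already in the bucket
  return await zipStore.getSignedUrl(projectId, version) // time-limited link to project.zip
}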
1476
services/history-v1/storage/scripts/back_fill_file_hash.mjs
Normal file
1476
services/history-v1/storage/scripts/back_fill_file_hash.mjs
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,647 @@
|
||||
// @ts-check
|
||||
import Events from 'node:events'
|
||||
import fs from 'node:fs'
|
||||
import Stream from 'node:stream'
|
||||
import { ObjectId } from 'mongodb'
|
||||
import logger from '@overleaf/logger'
|
||||
import OError from '@overleaf/o-error'
|
||||
import { Blob } from 'overleaf-editor-core'
|
||||
import {
|
||||
BlobStore,
|
||||
getStringLengthOfFile,
|
||||
GLOBAL_BLOBS,
|
||||
makeBlobForFile,
|
||||
} from '../lib/blob_store/index.js'
|
||||
import { db } from '../lib/mongodb.js'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import readline from 'node:readline'
|
||||
import { _blobIsBackedUp, backupBlob } from '../lib/backupBlob.mjs'
|
||||
import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
|
||||
import filestorePersistor from '../lib/persistor.js'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
|
||||
// Silence warning.
|
||||
Events.setMaxListeners(20)
|
||||
|
||||
// Enable caching for ObjectId.toString()
|
||||
ObjectId.cacheHexString = true
|
||||
|
||||
/**
|
||||
* @typedef {import("mongodb").Collection} Collection
|
||||
* @typedef {import("mongodb").Collection<Project>} ProjectsCollection
|
||||
* @typedef {import("mongodb").Collection<{project: Project}>} DeletedProjectsCollection
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} FileRef
|
||||
* @property {ObjectId} _id
|
||||
* @property {string} hash
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} Folder
|
||||
* @property {Array<Folder>} folders
|
||||
* @property {Array<FileRef>} fileRefs
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} Project
|
||||
* @property {ObjectId} _id
|
||||
* @property {Array<Folder>} rootFolder
|
||||
* @property {{history: {id: (number|string)}}} overleaf
|
||||
*/
|
||||
|
||||
/**
|
||||
* @return {{FIX_NOT_FOUND: boolean, FIX_HASH_MISMATCH: boolean, FIX_DELETE_PERMISSION: boolean, FIX_MISSING_HASH: boolean, LOGS: string}}
|
||||
*/
|
||||
function parseArgs() {
|
||||
const args = commandLineArgs([
|
||||
{ name: 'fixNotFound', type: String, defaultValue: 'true' },
|
||||
{ name: 'fixDeletePermission', type: String, defaultValue: 'true' },
|
||||
{ name: 'fixHashMismatch', type: String, defaultValue: 'true' },
|
||||
{ name: 'fixMissingHash', type: String, defaultValue: 'true' },
|
||||
{ name: 'logs', type: String, defaultValue: '' },
|
||||
])
|
||||
/**
|
||||
* commandLineArgs cannot handle --foo=false, so go the long way
|
||||
* @param {string} name
|
||||
* @return {boolean}
|
||||
*/
|
||||
function boolVal(name) {
|
||||
const v = args[name]
|
||||
if (['true', 'false'].includes(v)) return v === 'true'
|
||||
throw new Error(`expected "true" or "false" for boolean option ${name}`)
|
||||
}
|
||||
return {
|
||||
FIX_NOT_FOUND: boolVal('fixNotFound'),
|
||||
FIX_DELETE_PERMISSION: boolVal('fixDeletePermission'),
|
||||
FIX_HASH_MISMATCH: boolVal('fixHashMismatch'),
|
||||
FIX_MISSING_HASH: boolVal('fixMissingHash'),
|
||||
LOGS: args.logs,
|
||||
}
|
||||
}
|
||||
|
||||
const {
|
||||
FIX_HASH_MISMATCH,
|
||||
FIX_DELETE_PERMISSION,
|
||||
FIX_NOT_FOUND,
|
||||
FIX_MISSING_HASH,
|
||||
LOGS,
|
||||
} = parseArgs()
|
||||
if (!LOGS) {
|
||||
throw new Error('--logs parameter missing')
|
||||
}
|
||||
const BUFFER_DIR = fs.mkdtempSync(
|
||||
process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
|
||||
)
|
||||
const USER_FILES_BUCKET_NAME = process.env.USER_FILES_BUCKET_NAME || ''
|
||||
if (!USER_FILES_BUCKET_NAME) {
|
||||
throw new Error('env var USER_FILES_BUCKET_NAME is missing')
|
||||
}
|
||||
// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode
|
||||
const STREAM_HIGH_WATER_MARK = parseInt(
|
||||
process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(),
|
||||
10
|
||||
)
|
||||
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
|
||||
|
||||
/** @type {ProjectsCollection} */
|
||||
const projectsCollection = db.collection('projects')
|
||||
/** @type {DeletedProjectsCollection} */
|
||||
const deletedProjectsCollection = db.collection('deletedProjects')
|
||||
|
||||
let gracefulShutdownInitiated = false
|
||||
|
||||
process.on('SIGINT', handleSignal)
|
||||
process.on('SIGTERM', handleSignal)
|
||||
|
||||
function handleSignal() {
|
||||
gracefulShutdownInitiated = true
|
||||
console.warn('graceful shutdown initiated, draining queue')
|
||||
}
|
||||
|
||||
class FileDeletedError extends OError {}
|
||||
|
||||
/** @type {Map<string,{project: Project, projectSoftDeleted: boolean}>} */
|
||||
const PROJECT_CACHE = new Map()
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @return {Promise<{project: Project, projectSoftDeleted: boolean}>}
|
||||
*/
|
||||
async function getProject(projectId) {
|
||||
const cached = PROJECT_CACHE.get(projectId)
|
||||
if (cached) return cached
|
||||
|
||||
let projectSoftDeleted
|
||||
let project = await projectsCollection.findOne({
|
||||
_id: new ObjectId(projectId),
|
||||
})
|
||||
if (project) {
|
||||
projectSoftDeleted = false
|
||||
} else {
|
||||
const softDeleted = await deletedProjectsCollection.findOne({
|
||||
'deleterData.deletedProjectId': new ObjectId(projectId),
|
||||
project: { $exists: true },
|
||||
})
|
||||
if (!softDeleted) {
|
||||
throw new OError('project hard-deleted')
|
||||
}
|
||||
project = softDeleted.project
|
||||
projectSoftDeleted = true
|
||||
}
|
||||
PROJECT_CACHE.set(projectId, { projectSoftDeleted, project })
|
||||
return { projectSoftDeleted, project }
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Folder} folder
|
||||
* @param {string} fileId
|
||||
* @return {{path: string, fileRef: FileRef, folder: Folder}|null}
|
||||
*/
|
||||
function getFileTreePath(folder, fileId) {
|
||||
if (!folder) return null
|
||||
let idx = 0
|
||||
if (Array.isArray(folder.fileRefs)) {
|
||||
for (const fileRef of folder.fileRefs) {
|
||||
if (fileRef?._id.toString() === fileId) {
|
||||
return {
|
||||
fileRef,
|
||||
path: `.fileRefs.${idx}`,
|
||||
folder,
|
||||
}
|
||||
}
|
||||
idx++
|
||||
}
|
||||
}
|
||||
idx = 0
|
||||
if (Array.isArray(folder.folders)) {
|
||||
for (const child of folder.folders) {
|
||||
const match = getFileTreePath(child, fileId)
|
||||
if (match) {
|
||||
return {
|
||||
fileRef: match.fileRef,
|
||||
folder: match.folder,
|
||||
path: `.folders.${idx}${match.path}`,
|
||||
}
|
||||
}
|
||||
idx++
|
||||
}
|
||||
}
|
||||
return null
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @return {Promise<{fileRef: FileRef, folder: Folder, fullPath: string, query: Object, projectSoftDeleted: boolean}>}
|
||||
*/
|
||||
async function findFile(projectId, fileId) {
|
||||
const { projectSoftDeleted, project } = await getProject(projectId)
|
||||
const match = getFileTreePath(project.rootFolder[0], fileId)
|
||||
if (!match) {
|
||||
throw new FileDeletedError('file not found in file-tree', {
|
||||
projectSoftDeleted,
|
||||
})
|
||||
}
|
||||
const { path, fileRef, folder } = match
|
||||
let fullPath
|
||||
let query
|
||||
if (projectSoftDeleted) {
|
||||
fullPath = `project.rootFolder.0${path}`
|
||||
query = {
|
||||
'deleterData.deletedProjectId': new ObjectId(projectId),
|
||||
[`${fullPath}._id`]: new ObjectId(fileId),
|
||||
}
|
||||
} else {
|
||||
fullPath = `rootFolder.0${path}`
|
||||
query = {
|
||||
_id: new ObjectId(projectId),
|
||||
[`${fullPath}._id`]: new ObjectId(fileId),
|
||||
}
|
||||
}
|
||||
return {
|
||||
projectSoftDeleted,
|
||||
query,
|
||||
fullPath,
|
||||
fileRef,
|
||||
folder,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} line
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function fixNotFound(line) {
|
||||
const { projectId, fileId, bucketName } = JSON.parse(line)
|
||||
if (bucketName !== USER_FILES_BUCKET_NAME) {
|
||||
throw new OError('not found case for another bucket')
|
||||
}
|
||||
|
||||
const { projectSoftDeleted, query, fullPath, fileRef, folder } =
|
||||
await findFile(projectId, fileId)
|
||||
logger.info({ projectId, fileId, fileRef }, 'removing fileRef')
|
||||
// Copied from _removeElementFromMongoArray (https://github.com/overleaf/internal/blob/11e09528c153de6b7766d18c3c90d94962190371/services/web/app/src/Features/Project/ProjectEntityMongoUpdateHandler.js)
|
||||
const nonArrayPath = fullPath.slice(0, fullPath.lastIndexOf('.'))
|
||||
let result
|
||||
if (projectSoftDeleted) {
|
||||
result = await deletedProjectsCollection.updateOne(query, {
|
||||
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
|
||||
$inc: { 'project.version': 1 },
|
||||
})
|
||||
} else {
|
||||
result = await projectsCollection.updateOne(query, {
|
||||
$pull: { [nonArrayPath]: { _id: new ObjectId(fileId) } },
|
||||
$inc: { version: 1 },
|
||||
})
|
||||
}
|
||||
if (result.matchedCount !== 1) {
|
||||
throw new OError('file-tree write did not match', { result })
|
||||
}
|
||||
// Update the cache. The mongo-path of the next file will be off otherwise.
|
||||
folder.fileRefs = folder.fileRefs.filter(f => !f._id.equals(fileId))
|
||||
return true
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} hash
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function setHashInMongo(projectId, fileId, hash) {
|
||||
const { projectSoftDeleted, query, fullPath, fileRef } = await findFile(
|
||||
projectId,
|
||||
fileId
|
||||
)
|
||||
if (fileRef.hash === hash) return
|
||||
logger.info({ projectId, fileId, fileRef, hash }, 'setting fileRef hash')
|
||||
let result
|
||||
if (projectSoftDeleted) {
|
||||
result = await deletedProjectsCollection.updateOne(query, {
|
||||
$set: { [`${fullPath}.hash`]: hash },
|
||||
$inc: { 'project.version': 1 },
|
||||
})
|
||||
} else {
|
||||
result = await projectsCollection.updateOne(query, {
|
||||
$set: { [`${fullPath}.hash`]: hash },
|
||||
$inc: { version: 1 },
|
||||
})
|
||||
}
|
||||
if (result.matchedCount !== 1) {
|
||||
throw new OError('file-tree write did not match', { result })
|
||||
}
|
||||
fileRef.hash = hash // Update cache for completeness.
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} historyId
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function importRestoredFilestoreFile(projectId, fileId, historyId) {
|
||||
const filestoreKey = `${projectId}/${fileId}`
|
||||
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
|
||||
try {
|
||||
let s
|
||||
try {
|
||||
s = await filestorePersistor.getObjectStream(
|
||||
USER_FILES_BUCKET_NAME,
|
||||
filestoreKey
|
||||
)
|
||||
} catch (err) {
|
||||
if (err instanceof NotFoundError) {
|
||||
throw new OError('missing blob, need to restore filestore file', {
|
||||
filestoreKey,
|
||||
})
|
||||
}
|
||||
throw err
|
||||
}
|
||||
await Stream.promises.pipeline(
|
||||
s,
|
||||
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
|
||||
)
|
||||
const blobStore = new BlobStore(historyId)
|
||||
const blob = await blobStore.putFile(path)
|
||||
await backupBlob(historyId, blob, path)
|
||||
await setHashInMongo(projectId, fileId, blob.getHash())
|
||||
} finally {
|
||||
await fs.promises.rm(path, { force: true })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} path
|
||||
* @return {Promise<Blob>}
|
||||
*/
|
||||
async function bufferFilestoreFileToDisk(projectId, fileId, path) {
|
||||
const filestoreKey = `${projectId}/${fileId}`
|
||||
try {
|
||||
await Stream.promises.pipeline(
|
||||
await filestorePersistor.getObjectStream(
|
||||
USER_FILES_BUCKET_NAME,
|
||||
filestoreKey
|
||||
),
|
||||
fs.createWriteStream(path, { highWaterMark: STREAM_HIGH_WATER_MARK })
|
||||
)
|
||||
const blob = await makeBlobForFile(path)
|
||||
blob.setStringLength(
|
||||
await getStringLengthOfFile(blob.getByteLength(), path)
|
||||
)
|
||||
return blob
|
||||
} catch (err) {
|
||||
if (err instanceof NotFoundError) {
|
||||
throw new OError('missing blob, need to restore filestore file', {
|
||||
filestoreKey,
|
||||
})
|
||||
}
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @return {Promise<string>}
|
||||
*/
|
||||
async function computeFilestoreFileHash(projectId, fileId) {
|
||||
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
|
||||
try {
|
||||
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
|
||||
return blob.getHash()
|
||||
} finally {
|
||||
await fs.promises.rm(path, { force: true })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function uploadFilestoreFile(projectId, fileId) {
|
||||
const path = `${BUFFER_DIR}/${projectId}_${fileId}`
|
||||
try {
|
||||
const blob = await bufferFilestoreFileToDisk(projectId, fileId, path)
|
||||
const hash = blob.getHash()
|
||||
try {
|
||||
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
|
||||
} catch (err) {
|
||||
if (!(err instanceof Blob.NotFoundError)) throw err
|
||||
|
||||
const { project } = await getProject(projectId)
|
||||
const historyId = project.overleaf.history.id.toString()
|
||||
const blobStore = new BlobStore(historyId)
|
||||
await blobStore.putBlob(path, blob)
|
||||
await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
|
||||
}
|
||||
} finally {
|
||||
await fs.promises.rm(path, { force: true })
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} line
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function fixHashMismatch(line) {
|
||||
const {
|
||||
projectId,
|
||||
fileId,
|
||||
hash: computedHash,
|
||||
entry: {
|
||||
hash: fileTreeHash,
|
||||
ctx: { historyId },
|
||||
},
|
||||
} = JSON.parse(line)
|
||||
const blobStore = new BlobStore(historyId)
|
||||
if (await blobStore.getBlob(fileTreeHash)) {
|
||||
throw new OError('found blob with computed filestore object hash')
|
||||
}
|
||||
if (!(await blobStore.getBlob(computedHash))) {
|
||||
await importRestoredFilestoreFile(projectId, fileId, historyId)
|
||||
return true
|
||||
}
|
||||
return await ensureBlobExistsForFileAndUploadToAWS(
|
||||
projectId,
|
||||
fileId,
|
||||
computedHash
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} hash
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function hashAlreadyUpdatedInFileTree(projectId, fileId, hash) {
|
||||
const { fileRef } = await findFile(projectId, fileId)
|
||||
return fileRef.hash === hash
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} hash
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function needsBackingUpToAWS(projectId, hash) {
|
||||
if (GLOBAL_BLOBS.has(hash)) return false
|
||||
return !(await _blobIsBackedUp(projectId, hash))
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} projectId
|
||||
* @param {string} fileId
|
||||
* @param {string} hash
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash) {
|
||||
const { project } = await getProject(projectId)
|
||||
const historyId = project.overleaf.history.id.toString()
|
||||
const blobStore = new BlobStore(historyId)
|
||||
if (
|
||||
(await hashAlreadyUpdatedInFileTree(projectId, fileId, hash)) &&
|
||||
(await blobStore.getBlob(hash)) &&
|
||||
!(await needsBackingUpToAWS(projectId, hash))
|
||||
) {
|
||||
return false // already processed
|
||||
}
|
||||
|
||||
const stream = await blobStore.getStream(hash)
|
||||
const path = `${BUFFER_DIR}/${historyId}_${hash}`
|
||||
try {
|
||||
await Stream.promises.pipeline(
|
||||
stream,
|
||||
fs.createWriteStream(path, {
|
||||
highWaterMark: STREAM_HIGH_WATER_MARK,
|
||||
})
|
||||
)
|
||||
|
||||
const writtenBlob = await makeBlobForFile(path)
|
||||
writtenBlob.setStringLength(
|
||||
await getStringLengthOfFile(writtenBlob.getByteLength(), path)
|
||||
)
|
||||
if (writtenBlob.getHash() !== hash) {
|
||||
// Double check download, better safe than sorry.
|
||||
throw new OError('blob corrupted', { writtenBlob })
|
||||
}
|
||||
|
||||
let blob = await blobStore.getBlob(hash)
|
||||
if (!blob) {
|
||||
// Calling blobStore.putBlob would result in the same error again.
|
||||
// HACK: Skip upload to GCS and finalize putBlob operation directly.
|
||||
await blobStore.backend.insertBlob(historyId, writtenBlob)
|
||||
}
|
||||
await backupBlob(historyId, writtenBlob, path)
|
||||
} finally {
|
||||
await fs.promises.rm(path, { force: true })
|
||||
}
|
||||
await setHashInMongo(projectId, fileId, hash)
|
||||
return true
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} line
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function fixDeletePermission(line) {
|
||||
let { projectId, fileId, hash } = JSON.parse(line)
|
||||
if (!hash) hash = await computeFilestoreFileHash(projectId, fileId)
|
||||
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} line
|
||||
* @return {Promise<boolean>}
|
||||
*/
|
||||
async function fixMissingHash(line) {
|
||||
let { projectId, _id: fileId } = JSON.parse(line)
|
||||
const {
|
||||
fileRef: { hash },
|
||||
} = await findFile(projectId, fileId)
|
||||
if (hash) {
|
||||
// processed, double check
|
||||
return await ensureBlobExistsForFileAndUploadToAWS(projectId, fileId, hash)
|
||||
}
|
||||
await uploadFilestoreFile(projectId, fileId)
|
||||
return true
|
||||
}
|
||||
|
||||
const CASES = {
|
||||
'not found': {
|
||||
match: 'NotFoundError',
|
||||
flag: FIX_NOT_FOUND,
|
||||
action: fixNotFound,
|
||||
},
|
||||
'hash mismatch': {
|
||||
match: 'OError: hash mismatch',
|
||||
flag: FIX_HASH_MISMATCH,
|
||||
action: fixHashMismatch,
|
||||
},
|
||||
'delete permission': {
|
||||
match: 'storage.objects.delete',
|
||||
flag: FIX_DELETE_PERMISSION,
|
||||
action: fixDeletePermission,
|
||||
},
|
||||
'missing file hash': {
|
||||
match: '"bad file hash"',
|
||||
flag: FIX_MISSING_HASH,
|
||||
action: fixMissingHash,
|
||||
},
|
||||
}
|
||||
|
||||
const STATS = {
|
||||
processedLines: 0,
|
||||
success: 0,
|
||||
alreadyProcessed: 0,
|
||||
fileDeleted: 0,
|
||||
skipped: 0,
|
||||
failed: 0,
|
||||
unmatched: 0,
|
||||
}
|
||||
function logStats() {
|
||||
console.log(
|
||||
JSON.stringify({
|
||||
time: new Date(),
|
||||
gracefulShutdownInitiated,
|
||||
...STATS,
|
||||
})
|
||||
)
|
||||
}
|
||||
setInterval(logStats, 10_000)
|
||||
|
||||
async function processLog() {
|
||||
const rl = readline.createInterface({
|
||||
input: fs.createReadStream(LOGS),
|
||||
})
|
||||
nextLine: for await (const line of rl) {
|
||||
if (gracefulShutdownInitiated) break
|
||||
STATS.processedLines++
|
||||
if (
|
||||
!(
|
||||
line.includes('"failed to process file"') ||
|
||||
// Process missing hashes as flagged by find_malformed_filetrees.mjs
|
||||
line.includes('"bad file-tree path"')
|
||||
)
|
||||
) {
|
||||
continue
|
||||
}
|
||||
|
||||
for (const [name, { match, flag, action }] of Object.entries(CASES)) {
|
||||
if (!line.includes(match)) continue
|
||||
if (flag) {
|
||||
try {
|
||||
if (await action(line)) {
|
||||
STATS.success++
|
||||
} else {
|
||||
STATS.alreadyProcessed++
|
||||
}
|
||||
} catch (err) {
|
||||
if (err instanceof FileDeletedError) {
|
||||
STATS.fileDeleted++
|
||||
logger.info({ err, line }, 'file deleted, skipping')
|
||||
} else {
|
||||
STATS.failed++
|
||||
logger.error({ err, line }, `failed to fix ${name}`)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
STATS.skipped++
|
||||
}
|
||||
continue nextLine
|
||||
}
|
||||
STATS.unmatched++
|
||||
logger.warn({ line }, 'unknown fatal error')
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
try {
|
||||
await processLog()
|
||||
} finally {
|
||||
logStats()
|
||||
try {
|
||||
await fs.promises.rm(BUFFER_DIR, { recursive: true, force: true })
|
||||
} catch (err) {
|
||||
console.error(`Cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
|
||||
}
|
||||
}
|
||||
const { skipped, failed, unmatched } = STATS
|
||||
await setTimeout(SLEEP_BEFORE_EXIT)
|
||||
if (failed > 0) {
|
||||
process.exit(Math.min(failed, 99))
|
||||
} else if (unmatched > 0) {
|
||||
process.exit(100)
|
||||
} else if (skipped > 0) {
|
||||
process.exit(101)
|
||||
} else {
|
||||
process.exit(0)
|
||||
}
|
||||
}
|
||||
|
||||
await main()
|
1104
services/history-v1/storage/scripts/backup.mjs
Normal file
1104
services/history-v1/storage/scripts/backup.mjs
Normal file
File diff suppressed because it is too large
173
services/history-v1/storage/scripts/backup_blob.mjs
Normal file
173
services/history-v1/storage/scripts/backup_blob.mjs
Normal file
@@ -0,0 +1,173 @@
|
||||
// @ts-check
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import { backupBlob, downloadBlobToDir } from '../lib/backupBlob.mjs'
|
||||
import withTmpDir from '../../api/controllers/with_tmp_dir.js'
|
||||
import {
|
||||
BlobStore,
|
||||
GLOBAL_BLOBS,
|
||||
loadGlobalBlobs,
|
||||
} from '../lib/blob_store/index.js'
|
||||
import assert from '../lib/assert.js'
|
||||
import knex from '../lib/knex.js'
|
||||
import { client } from '../lib/mongodb.js'
|
||||
import redis from '../lib/redis.js'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
import fs from 'node:fs'
|
||||
|
||||
await loadGlobalBlobs()
|
||||
|
||||
/**
|
||||
* Gracefully shutdown the process
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function gracefulShutdown() {
|
||||
console.log('Gracefully shutting down')
|
||||
await knex.destroy()
|
||||
await client.close()
|
||||
await redis.disconnect()
|
||||
await setTimeout(100)
|
||||
process.exit()
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} row
|
||||
* @return {BackupBlobJob}
|
||||
*/
|
||||
function parseCSVRow(row) {
|
||||
const [historyId, hash] = row.split(',')
|
||||
validateBackedUpBlobJob({ historyId, hash })
|
||||
return { historyId, hash }
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {BackupBlobJob} job
|
||||
*/
|
||||
function validateBackedUpBlobJob(job) {
|
||||
assert.projectId(job.historyId)
|
||||
assert.blobHash(job.hash)
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} path
|
||||
* @return {Promise<Array<BackupBlobJob>>}
|
||||
*/
|
||||
async function readCSV(path) {
|
||||
let fh
|
||||
/** @type {Array<BackupBlobJob>} */
|
||||
const rows = []
|
||||
try {
|
||||
fh = await fs.promises.open(path, 'r')
|
||||
} catch (error) {
|
||||
console.error(`Could not open file: ${error}`)
|
||||
throw error
|
||||
}
|
||||
for await (const line of fh.readLines()) {
|
||||
try {
|
||||
const row = parseCSVRow(line)
|
||||
if (GLOBAL_BLOBS.has(row.hash)) {
|
||||
console.log(`Skipping global blob: ${line}`)
|
||||
continue
|
||||
}
|
||||
rows.push(row)
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : error)
|
||||
console.log(`Skipping invalid row: ${line}`)
|
||||
}
|
||||
}
|
||||
return rows
|
||||
}
|
||||
|
||||
/**
|
||||
* @typedef {Object} BackupBlobJob
|
||||
* @property {string} hash
|
||||
* @property {string} historyId
|
||||
*/
|
||||
|
||||
/**
|
||||
* @param {Object} options
|
||||
* @property {string} [options.historyId]
|
||||
* @property {string} [options.hash]
|
||||
* @property {string} [options.input]
|
||||
* @return {Promise<Array<BackupBlobJob>>}
|
||||
*/
|
||||
async function initialiseJobs({ historyId, hash, input }) {
|
||||
if (input) {
|
||||
return await readCSV(input)
|
||||
}
|
||||
|
||||
if (!historyId) {
|
||||
console.error('historyId is required')
|
||||
process.exitCode = 1
|
||||
await gracefulShutdown()
|
||||
}
|
||||
|
||||
if (!hash) {
|
||||
console.error('hash is required')
|
||||
process.exitCode = 1
|
||||
await gracefulShutdown()
|
||||
}
|
||||
|
||||
validateBackedUpBlobJob({ historyId, hash })
|
||||
|
||||
if (GLOBAL_BLOBS.has(hash)) {
|
||||
console.error(`Blob ${hash} is a global blob; not backing up`)
|
||||
process.exitCode = 1
|
||||
await gracefulShutdown()
|
||||
}
|
||||
return [{ hash, historyId }]
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} historyId
|
||||
* @param {string} hash
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
export async function downloadAndBackupBlob(historyId, hash) {
|
||||
const blobStore = new BlobStore(historyId)
|
||||
const blob = await blobStore.getBlob(hash)
|
||||
if (!blob) {
|
||||
throw new Error(`Blob ${hash} could not be loaded`)
|
||||
}
|
||||
await withTmpDir(`blob-${hash}`, async tmpDir => {
|
||||
const filePath = await downloadBlobToDir(historyId, blob, tmpDir)
|
||||
console.log(`Downloaded blob ${hash} to ${filePath}`)
|
||||
await backupBlob(historyId, blob, filePath)
|
||||
console.log('Backed up blob')
|
||||
})
|
||||
}
|
||||
|
||||
let jobs
|
||||
|
||||
const options = commandLineArgs([
|
||||
{ name: 'historyId', type: String },
|
||||
{ name: 'hash', type: String },
|
||||
{ name: 'input', type: String },
|
||||
])
|
||||
|
||||
try {
|
||||
jobs = await initialiseJobs(options)
|
||||
} catch (error) {
|
||||
console.error(error)
|
||||
await gracefulShutdown()
|
||||
}
|
||||
|
||||
if (!Array.isArray(jobs)) {
|
||||
// This is mostly to satisfy typescript
|
||||
process.exitCode = 1
|
||||
await gracefulShutdown()
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
for (const { historyId, hash } of jobs) {
|
||||
try {
|
||||
await downloadAndBackupBlob(historyId, hash)
|
||||
} catch (error) {
|
||||
console.error(error)
|
||||
process.exitCode = 1
|
||||
}
|
||||
}
|
||||
await gracefulShutdown()
|
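backup_blob.mjs also exports downloadAndBackupBlob for reuse from other scripts. A sketch of calling it directly; note that importing the module executes its top-level CLI handling (option parsing and gracefulShutdown), so in practice the export is most useful from code that shares the same process lifecycle. The ids below are placeholders:

import { downloadAndBackupBlob } from './backup_blob.mjs'

// placeholder ids, for illustration only
const historyId = '1234567890abcdef12345678'
const hash = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' // 40-character blob hash
await downloadAndBackupBlob(historyId, hash)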
153
services/history-v1/storage/scripts/backup_sample.mjs
Normal file
153
services/history-v1/storage/scripts/backup_sample.mjs
Normal file
@@ -0,0 +1,153 @@
|
||||
// @ts-check
|
||||
import { ObjectId } from 'mongodb'
|
||||
import { READ_PREFERENCE_SECONDARY } from '@overleaf/mongo-utils/batchedUpdate.js'
|
||||
import { db, client } from '../lib/mongodb.js'
|
||||
|
||||
const projectsCollection = db.collection('projects')
|
||||
|
||||
// Enable caching for ObjectId.toString()
|
||||
ObjectId.cacheHexString = true
|
||||
|
||||
// Configuration
|
||||
const SAMPLE_SIZE_PER_ITERATION = process.argv[2]
|
||||
? parseInt(process.argv[2], 10)
|
||||
: 10000
|
||||
const TARGET_ERROR_PERCENTAGE = process.argv[3]
|
||||
? parseFloat(process.argv[3])
|
||||
: 5.0
|
||||
|
||||
let gracefulShutdownInitiated = false
|
||||
|
||||
process.on('SIGINT', handleSignal)
|
||||
process.on('SIGTERM', handleSignal)
|
||||
|
||||
function handleSignal() {
|
||||
gracefulShutdownInitiated = true
|
||||
console.warn('graceful shutdown initiated')
|
||||
}
|
||||
|
||||
async function takeSample(sampleSize) {
|
||||
const results = await projectsCollection
|
||||
.aggregate(
|
||||
[
|
||||
{ $sample: { size: sampleSize } },
|
||||
{
|
||||
$match: { 'overleaf.backup.lastBackedUpVersion': { $exists: true } },
|
||||
},
|
||||
{
|
||||
$count: 'total',
|
||||
},
|
||||
],
|
||||
{ readPreference: READ_PREFERENCE_SECONDARY }
|
||||
)
|
||||
.toArray()
|
||||
|
||||
const count = results[0]?.total || 0
|
||||
return { totalSampled: sampleSize, backedUp: count }
|
||||
}
|
||||
|
||||
function calculateStatistics(
|
||||
cumulativeSampled,
|
||||
cumulativeBackedUp,
|
||||
totalPopulation
|
||||
) {
|
||||
const proportion = Math.max(1, cumulativeBackedUp) / cumulativeSampled
|
||||
|
||||
// Standard error with finite population correction
|
||||
const fpc = Math.sqrt(
|
||||
(totalPopulation - cumulativeSampled) / (totalPopulation - 1)
|
||||
)
|
||||
const stdError =
|
||||
Math.sqrt((proportion * (1 - proportion)) / cumulativeSampled) * fpc
|
||||
|
||||
// 95% confidence interval is approximately ±1.96 standard errors
|
||||
const marginOfError = 1.96 * stdError
|
||||
|
||||
return {
|
||||
proportion,
|
||||
percentage: (proportion * 100).toFixed(2),
|
||||
marginOfError,
|
||||
errorPercentage: (marginOfError * 100).toFixed(2),
|
||||
lowerBound: ((proportion - marginOfError) * 100).toFixed(2),
|
||||
upperBound: ((proportion + marginOfError) * 100).toFixed(2),
|
||||
sampleSize: cumulativeSampled,
|
||||
populationSize: totalPopulation,
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
console.log('Date:', new Date().toISOString())
|
||||
const totalCount = await projectsCollection.estimatedDocumentCount({
|
||||
readPreference: READ_PREFERENCE_SECONDARY,
|
||||
})
|
||||
console.log(
|
||||
`Total projects in collection (estimated): ${totalCount.toLocaleString()}`
|
||||
)
|
||||
console.log(`Target margin of error: ${TARGET_ERROR_PERCENTAGE}%`)
|
||||
|
||||
let cumulativeSampled = 0
|
||||
let cumulativeBackedUp = 0
|
||||
let currentError = Infinity
|
||||
let iteration = 0
|
||||
|
||||
console.log('Iteration | Total Sampled | % Backed Up | Margin of Error')
|
||||
console.log('----------|---------------|-------------|----------------')
|
||||
|
||||
while (currentError > TARGET_ERROR_PERCENTAGE) {
|
||||
if (gracefulShutdownInitiated) {
|
||||
console.log('Graceful shutdown initiated. Exiting sampling loop.')
|
||||
break
|
||||
}
|
||||
|
||||
iteration++
|
||||
const { totalSampled, backedUp } = await takeSample(
|
||||
SAMPLE_SIZE_PER_ITERATION
|
||||
)
|
||||
cumulativeSampled += totalSampled
|
||||
cumulativeBackedUp += backedUp
|
||||
|
||||
const stats = calculateStatistics(
|
||||
cumulativeSampled,
|
||||
cumulativeBackedUp,
|
||||
totalCount
|
||||
)
|
||||
currentError = parseFloat(stats.errorPercentage)
|
||||
|
||||
console.log(
|
||||
`${iteration.toString().padStart(9)} | ` +
|
||||
`${cumulativeSampled.toString().padStart(13)} | ` +
|
||||
`${stats.percentage.padStart(10)}% | ` +
|
||||
`\u00B1${stats.errorPercentage}%`
|
||||
)
|
||||
|
||||
// Small delay between iterations
|
||||
await new Promise(resolve => setTimeout(resolve, 100))
|
||||
}
|
||||
|
||||
const finalStats = calculateStatistics(
|
||||
cumulativeSampled,
|
||||
cumulativeBackedUp,
|
||||
totalCount
|
||||
)
|
||||
|
||||
console.log(
|
||||
`Projects sampled: ${cumulativeSampled.toLocaleString()} out of ${totalCount.toLocaleString()}`
|
||||
)
|
||||
console.log(
|
||||
`Estimated percentage with lastBackedUpVersion: ${finalStats.percentage}%`
|
||||
)
|
||||
console.log(
|
||||
`95% Confidence Interval: ${finalStats.lowerBound}% - ${finalStats.upperBound}%`
|
||||
)
|
||||
console.log(`Final Margin of Error: \u00B1${finalStats.errorPercentage}%`)
|
||||
}
|
||||
|
||||
main()
|
||||
.then(() => console.log('Done.'))
|
||||
.catch(err => {
|
||||
console.error('Error:', err)
|
||||
process.exitCode = 1
|
||||
})
|
||||
.finally(() => {
|
||||
client.close().catch(err => console.error('Error closing MongoDB:', err))
|
||||
})
|
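For reference, the stopping rule in calculateStatistics is the usual normal-approximation margin of error with a finite population correction. A standalone sketch with made-up numbers shows the scale involved:

// 10,000 projects sampled out of 10,000,000, with 9,000 of the sample backed up
const n = 10000
const N = 10000000
const p = 9000 / n // 0.9
const fpc = Math.sqrt((N - n) / (N - 1)) // ~0.9995, close to 1 for small sampling fractions
const stdError = Math.sqrt((p * (1 - p)) / n) * fpc // ~0.0030
const marginOfError = 1.96 * stdError // ~0.0059, i.e. roughly ±0.59 percentage points
console.log((marginOfError * 100).toFixed(2)) // '0.59'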
429
services/history-v1/storage/scripts/backup_scheduler.mjs
Normal file
429
services/history-v1/storage/scripts/backup_scheduler.mjs
Normal file
@@ -0,0 +1,429 @@
|
||||
import Queue from 'bull'
|
||||
import config from 'config'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import logger from '@overleaf/logger'
|
||||
import {
|
||||
listPendingBackups,
|
||||
listUninitializedBackups,
|
||||
getBackupStatus,
|
||||
} from '../lib/backup_store/index.js'
|
||||
|
||||
logger.initialize('backup-queue')
|
||||
|
||||
// Use the same redis config as backup_worker
|
||||
const redisOptions = config.get('redis.queue')
|
||||
|
||||
// Create a Bull queue named 'backup'
|
||||
const backupQueue = new Queue('backup', {
|
||||
redis: redisOptions,
|
||||
defaultJobOptions: {
|
||||
removeOnComplete: true,
|
||||
removeOnFail: true,
|
||||
},
|
||||
})
|
||||
|
||||
// Define command-line options
|
||||
const optionDefinitions = [
|
||||
{ name: 'clean', type: Boolean },
|
||||
{ name: 'status', type: Boolean },
|
||||
{
|
||||
name: 'add',
|
||||
type: String,
|
||||
multiple: true,
|
||||
description: 'Project IDs or date range in YYYY-MM-DD:YYYY-MM-DD format',
|
||||
},
|
||||
{ name: 'monitor', type: Boolean },
|
||||
{
|
||||
name: 'queue-pending',
|
||||
type: Number,
|
||||
description:
|
||||
'Find projects with pending changes older than N seconds and add them to the queue',
|
||||
},
|
||||
{
|
||||
name: 'show-pending',
|
||||
type: Number,
|
||||
description:
|
||||
'Show count of pending projects older than N seconds without adding to queue',
|
||||
},
|
||||
{
|
||||
name: 'limit',
|
||||
type: Number,
|
||||
description: 'Limit the number of jobs to be added',
|
||||
},
|
||||
{
|
||||
name: 'interval',
|
||||
type: Number,
|
||||
description: 'Time in seconds to spread jobs over (default: 300)',
|
||||
defaultValue: 300,
|
||||
},
|
||||
{
|
||||
name: 'backoff-delay',
|
||||
type: Number,
|
||||
description:
|
||||
'Backoff delay in milliseconds for failed jobs (default: 1000)',
|
||||
defaultValue: 1000,
|
||||
},
|
||||
{
|
||||
name: 'attempts',
|
||||
type: Number,
|
||||
description: 'Number of retry attempts for failed jobs (default: 3)',
|
||||
defaultValue: 3,
|
||||
},
|
||||
{
|
||||
name: 'warn-threshold',
|
||||
type: Number,
|
||||
description: 'Warn about any project exceeding this pending age',
|
||||
defaultValue: 2 * 3600, // 2 hours
|
||||
},
|
||||
{
|
||||
name: 'verbose',
|
||||
alias: 'v',
|
||||
type: Boolean,
|
||||
description: 'Show detailed information when used with --show-pending',
|
||||
},
|
||||
]
|
||||
|
||||
// Parse command line arguments
|
||||
const options = commandLineArgs(optionDefinitions)
|
||||
const WARN_THRESHOLD = options['warn-threshold']
|
||||
|
||||
// Helper to validate date format
|
||||
function isValidDateFormat(dateStr) {
|
||||
return /^\d{4}-\d{2}-\d{2}$/.test(dateStr)
|
||||
}
|
||||
|
||||
// Helper to validate the pending time parameter
|
||||
function validatePendingTime(option, value) {
|
||||
if (typeof value !== 'number' || value <= 0) {
|
||||
console.error(
|
||||
`Error: --${option} requires a positive numeric TIME argument in seconds`
|
||||
)
|
||||
console.error(`Example: --${option} 3600`)
|
||||
process.exit(1)
|
||||
}
|
||||
return value
|
||||
}
|
||||
|
||||
// Helper to format the pending time display
|
||||
function formatPendingTime(timestamp) {
|
||||
const now = new Date()
|
||||
const diffMs = now - timestamp
|
||||
const seconds = Math.floor(diffMs / 1000)
|
||||
return `${timestamp.toISOString()} (${seconds} seconds ago)`
|
||||
}
|
||||
|
||||
// Helper to add a job to the queue, checking for duplicates
|
||||
async function addJobWithCheck(queue, data, options) {
|
||||
const jobId = options.jobId
|
||||
|
||||
// Check if the job already exists
|
||||
const existingJob = await queue.getJob(jobId)
|
||||
|
||||
if (existingJob) {
|
||||
return { job: existingJob, added: false }
|
||||
} else {
|
||||
const job = await queue.add(data, options)
|
||||
return { job, added: true }
|
||||
}
|
||||
}
|
||||
|
||||
// Setup queue event listeners
|
||||
function setupMonitoring() {
|
||||
console.log('Starting queue monitoring. Press Ctrl+C to exit.')
|
||||
|
||||
backupQueue.on('global:error', error => {
|
||||
logger.info({ error }, 'Queue error')
|
||||
})
|
||||
|
||||
backupQueue.on('global:waiting', jobId => {
|
||||
logger.info({ jobId }, 'job is waiting')
|
||||
})
|
||||
|
||||
backupQueue.on('global:active', jobId => {
|
||||
logger.info({ jobId }, 'job is now active')
|
||||
})
|
||||
|
||||
backupQueue.on('global:stalled', jobId => {
|
||||
logger.info({ jobId }, 'job has stalled')
|
||||
})
|
||||
|
||||
backupQueue.on('global:progress', (jobId, progress) => {
|
||||
logger.info({ jobId, progress }, 'job progress')
|
||||
})
|
||||
|
||||
backupQueue.on('global:completed', (jobId, result) => {
|
||||
logger.info({ jobId, result }, 'job completed')
|
||||
})
|
||||
|
||||
backupQueue.on('global:failed', (jobId, err) => {
|
||||
logger.info({ jobId, err }, 'job failed')
|
||||
})
|
||||
|
||||
backupQueue.on('global:paused', () => {
|
||||
logger.info({}, 'Queue paused')
|
||||
})
|
||||
|
||||
backupQueue.on('global:resumed', () => {
|
||||
logger.info({}, 'Queue resumed')
|
||||
})
|
||||
|
||||
backupQueue.on('global:cleaned', (jobs, type) => {
|
||||
logger.info({ jobsCount: jobs.length, type }, 'Jobs cleaned')
|
||||
})
|
||||
|
||||
backupQueue.on('global:drained', () => {
|
||||
logger.info({}, 'Queue drained')
|
||||
})
|
||||
|
||||
backupQueue.on('global:removed', jobId => {
|
||||
logger.info({ jobId }, 'Job removed')
|
||||
})
|
||||
}
|
||||
|
||||
async function addDateRangeJob(input) {
|
||||
const [startDate, endDate] = input.split(':')
|
||||
if (!isValidDateFormat(startDate) || !isValidDateFormat(endDate)) {
|
||||
console.error(
|
||||
`Invalid date format for "${input}". Use YYYY-MM-DD:YYYY-MM-DD`
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
const jobId = `backup-${startDate}-to-${endDate}`
|
||||
const { job, added } = await addJobWithCheck(
|
||||
backupQueue,
|
||||
{ startDate, endDate },
|
||||
{ jobId }
|
||||
)
|
||||
|
||||
console.log(
|
||||
`${added ? 'Added' : 'Already exists'}: date range backup job: ${startDate} to ${endDate}, job ID: ${job.id}`
|
||||
)
|
||||
}
|
||||
|
||||
// Helper to list pending and uninitialized backups
|
||||
// This function combines the two cursors into a single generator
|
||||
// to yield projects from both lists
|
||||
async function* pendingCursor(timeIntervalMs, limit) {
|
||||
for await (const project of listPendingBackups(timeIntervalMs, limit)) {
|
||||
yield project
|
||||
}
|
||||
for await (const project of listUninitializedBackups(timeIntervalMs, limit)) {
|
||||
yield project
|
||||
}
|
||||
}
|
||||
|
||||
// Process pending projects with changes older than the specified seconds
|
||||
async function processPendingProjects(
|
||||
age,
|
||||
showOnly,
|
||||
limit,
|
||||
verbose,
|
||||
jobInterval,
|
||||
jobOpts = {}
|
||||
) {
|
||||
const timeIntervalMs = age * 1000
|
||||
console.log(
|
||||
`Finding projects with pending changes older than ${age} seconds${showOnly ? ' (count only)' : ''}`
|
||||
)
|
||||
|
||||
let count = 0
|
||||
let addedCount = 0
|
||||
let existingCount = 0
|
||||
// Pass the limit directly to MongoDB query for better performance
|
||||
const changeTimes = []
|
||||
for await (const project of pendingCursor(timeIntervalMs, limit)) {
|
||||
const projectId = project._id.toHexString()
|
||||
const pendingAt =
|
||||
project.overleaf?.backup?.pendingChangeAt || project._id.getTimestamp()
|
||||
if (pendingAt) {
|
||||
changeTimes.push(pendingAt)
|
||||
const pendingAge = Math.floor((Date.now() - pendingAt.getTime()) / 1000)
|
||||
if (pendingAge > WARN_THRESHOLD) {
|
||||
try {
|
||||
const backupStatus = await getBackupStatus(projectId)
|
||||
logger.warn(
|
||||
{
|
||||
projectId,
|
||||
pendingAt,
|
||||
pendingAge,
|
||||
backupStatus,
|
||||
warnThreshold: WARN_THRESHOLD,
|
||||
},
|
||||
`pending change exceeds rpo warning threshold`
|
||||
)
|
||||
} catch (err) {
|
||||
logger.error(
|
||||
{ projectId, pendingAt, pendingAge },
|
||||
'Error getting backup status'
|
||||
)
|
||||
throw err
|
||||
}
|
||||
}
|
||||
}
|
||||
if (showOnly && verbose) {
|
||||
console.log(
|
||||
`Project: ${projectId} (pending since: ${formatPendingTime(pendingAt)})`
|
||||
)
|
||||
} else if (!showOnly) {
|
||||
const delay = Math.floor(Math.random() * jobInterval * 1000) // add random delay to avoid all jobs running simultaneously
|
||||
const { job, added } = await addJobWithCheck(
|
||||
backupQueue,
|
||||
{ projectId, pendingChangeAt: pendingAt.getTime() },
|
||||
{ ...jobOpts, delay, jobId: projectId }
|
||||
)
|
||||
|
||||
if (added) {
|
||||
if (verbose) {
|
||||
console.log(
|
||||
`Added job for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
|
||||
)
|
||||
}
|
||||
addedCount++
|
||||
} else {
|
||||
if (verbose) {
|
||||
console.log(
|
||||
`Job already exists for project: ${projectId}, job ID: ${job.id} (pending since: ${formatPendingTime(pendingAt)})`
|
||||
)
|
||||
}
|
||||
existingCount++
|
||||
}
|
||||
}
|
||||
|
||||
count++
|
||||
if (count % 1000 === 0) {
|
||||
console.log(
|
||||
`Processed ${count} projects`,
|
||||
showOnly ? '' : `(${addedCount} added, ${existingCount} existing)`
|
||||
)
|
||||
}
|
||||
}
|
||||
// Set oldestChange to undefined if there are no changes
|
||||
const oldestChange =
|
||||
changeTimes.length > 0
|
||||
? changeTimes.reduce((min, time) => (time < min ? time : min))
|
||||
: undefined
|
||||
|
||||
if (showOnly) {
|
||||
console.log(
|
||||
`Found ${count} projects with pending changes (not added to queue)`
|
||||
)
|
||||
} else {
|
||||
console.log(`Found ${count} projects with pending changes:`)
|
||||
console.log(` ${addedCount} jobs added to queue`)
|
||||
console.log(` ${existingCount} jobs already existed in queue`)
|
||||
if (oldestChange) {
|
||||
console.log(` Oldest pending change: ${formatPendingTime(oldestChange)}`)
|
||||
}
|
||||
}
|
||||
}
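The `delay` computed above spreads queue insertions uniformly over the `--interval` window so a large backlog does not start all at once. A small illustration with made-up numbers:

// Illustration only: with --interval 300, each job gets a random delay in
// [0, 300_000) ms, so a backlog of 3000 jobs becomes runnable at roughly
// 10 jobs per second instead of all at the same instant.
const exampleInterval = 300 // seconds (hypothetical value)
const exampleDelays = Array.from({ length: 5 }, () =>
  Math.floor(Math.random() * exampleInterval * 1000)
)
console.log(exampleDelays) // five delays spread across the five-minute window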
|
||||
|
||||
// Main execution block
|
||||
async function run() {
|
||||
const optionCount = [
|
||||
options.clean,
|
||||
options.status,
|
||||
options.add,
|
||||
options.monitor,
|
||||
options['queue-pending'] !== undefined,
|
||||
options['show-pending'] !== undefined,
|
||||
].filter(Boolean).length
|
||||
if (optionCount > 1) {
|
||||
console.error('Only one option can be specified')
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
if (options.clean) {
|
||||
const beforeCounts = await backupQueue.getJobCounts()
|
||||
console.log('Current queue state:', JSON.stringify(beforeCounts))
|
||||
console.log('Cleaning completed and failed jobs...')
|
||||
await backupQueue.clean(1, 'completed')
|
||||
await backupQueue.clean(1, 'failed')
|
||||
const afterCounts = await backupQueue.getJobCounts()
|
||||
console.log('Current queue state:', JSON.stringify(afterCounts))
|
||||
console.log('Queue cleaned successfully')
|
||||
} else if (options.status) {
|
||||
const counts = await backupQueue.getJobCounts()
|
||||
console.log('Current queue state:', JSON.stringify(counts))
|
||||
} else if (options.add) {
|
||||
const inputs = Array.isArray(options.add) ? options.add : [options.add]
|
||||
for (const input of inputs) {
|
||||
if (input.includes(':')) {
|
||||
// Handle date range format
|
||||
await addDateRangeJob(input)
|
||||
} else {
|
||||
// Handle project ID format
|
||||
const { job, added } = await addJobWithCheck(
|
||||
backupQueue,
|
||||
{ projectId: input },
|
||||
{ jobId: input }
|
||||
)
|
||||
console.log(
|
||||
`${added ? 'Added' : 'Already exists'}: job for project: ${input}, job ID: ${job.id}`
|
||||
)
|
||||
}
|
||||
}
|
||||
} else if (options.monitor) {
|
||||
setupMonitoring()
|
||||
} else if (options['queue-pending'] !== undefined) {
|
||||
const age = validatePendingTime('queue-pending', options['queue-pending'])
|
||||
await processPendingProjects(
|
||||
age,
|
||||
false,
|
||||
options.limit,
|
||||
options.verbose,
|
||||
options.interval,
|
||||
{
|
||||
attempts: options.attempts,
|
||||
backoff: {
|
||||
type: 'exponential',
|
||||
delay: options['backoff-delay'],
|
||||
},
|
||||
}
|
||||
)
|
||||
} else if (options['show-pending'] !== undefined) {
|
||||
const age = validatePendingTime('show-pending', options['show-pending'])
|
||||
await processPendingProjects(age, true, options.limit, options.verbose)
|
||||
} else {
|
||||
console.log('Usage:')
|
||||
console.log(' --clean Clean up completed and failed jobs')
|
||||
console.log(' --status Show current job counts')
|
||||
console.log(' --add [projectId] Add a job for the specified projectId')
|
||||
console.log(
|
||||
' --add [YYYY-MM-DD:YYYY-MM-DD] Add a job for the specified date range'
|
||||
)
|
||||
console.log(' --monitor Monitor queue events')
|
||||
console.log(
|
||||
' --queue-pending TIME Find projects with changes older than TIME seconds and add them to the queue'
|
||||
)
|
||||
console.log(
|
||||
' --show-pending TIME Show count of pending projects older than TIME seconds'
|
||||
)
|
||||
console.log(' --limit N Limit the number of jobs to be added')
|
||||
console.log(
|
||||
' --interval TIME Time interval in seconds to spread jobs over'
|
||||
)
|
||||
console.log(
|
||||
' --backoff-delay TIME Backoff delay in milliseconds for failed jobs (default: 1000)'
|
||||
)
|
||||
console.log(
|
||||
' --attempts N Number of retry attempts for failed jobs (default: 3)'
|
||||
)
|
||||
console.log(
|
||||
' --verbose, -v Show detailed information when used with --show-pending'
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// Run and handle errors
|
||||
run()
|
||||
.catch(err => {
|
||||
console.error('Error:', err)
|
||||
process.exit(1)
|
||||
})
|
||||
.then(result => {
|
||||
// Only exit if not in monitor mode
|
||||
if (!options.monitor) {
|
||||
process.exit(0)
|
||||
}
|
||||
})
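For reference, typical invocations of this queue-management script look like the following; the script path is the one given in this commit's file header, and the option values are only examples:

// node <path-to-this-script> --status
// node <path-to-this-script> --show-pending 3600 --verbose
// node <path-to-this-script> --queue-pending 3600 --limit 1000 --interval 300
// node <path-to-this-script> --add 2025-01-01:2025-01-31
// node <path-to-this-script> --monitor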
|
144
services/history-v1/storage/scripts/backup_worker.mjs
Normal file
144
services/history-v1/storage/scripts/backup_worker.mjs
Normal file
@@ -0,0 +1,144 @@
import Queue from 'bull'
import logger from '@overleaf/logger'
import config from 'config'
import metrics from '@overleaf/metrics'
import {
  backupProject,
  initializeProjects,
  configureBackup,
} from './backup.mjs'

const CONCURRENCY = 15
const WARN_THRESHOLD = 2 * 60 * 60 * 1000 // warn if projects are older than this
const redisOptions = config.get('redis.queue')
const JOB_TIME_BUCKETS = [10, 100, 500, 1000, 5000, 10000, 30000, 60000] // milliseconds
const LAG_TIME_BUCKETS_HRS = [
  0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.75, 2, 3, 4, 5, 6,
] // hours

// Configure backup settings to match worker concurrency
configureBackup({ concurrency: 50, useSecondary: true })

// Create a Bull queue named 'backup'
const backupQueue = new Queue('backup', {
  redis: redisOptions,
  settings: {
    lockDuration: 15 * 60 * 1000, // 15 minutes
    lockRenewTime: 60 * 1000, // 1 minute
    maxStalledCount: 0, // mark stalled jobs as failed
  },
})

// Log queue events
backupQueue.on('active', job => {
  logger.debug({ job }, 'job is now active')
})

backupQueue.on('completed', (job, result) => {
  metrics.inc('backup_worker_job', 1, { status: 'completed' })
  logger.debug({ job, result }, 'job completed')
})

backupQueue.on('failed', (job, err) => {
  metrics.inc('backup_worker_job', 1, { status: 'failed' })
  logger.error({ job, err }, 'job failed')
})

backupQueue.on('waiting', jobId => {
  logger.debug({ jobId }, 'job is waiting')
})

backupQueue.on('error', error => {
  logger.error({ error }, 'queue error')
})

backupQueue.on('stalled', job => {
  logger.error({ job }, 'job has stalled')
})

backupQueue.on('lock-extension-failed', (job, err) => {
  logger.error({ job, err }, 'lock extension failed')
})

backupQueue.on('paused', () => {
  logger.info('queue paused')
})

backupQueue.on('resumed', () => {
  logger.info('queue resumed')
})

// Process jobs
backupQueue.process(CONCURRENCY, async job => {
  const { projectId, startDate, endDate } = job.data

  if (projectId) {
    return await runBackup(projectId, job.data, job)
  } else if (startDate && endDate) {
    return await runInit(startDate, endDate)
  } else {
    throw new Error('invalid job data')
  }
})

async function runBackup(projectId, data, job) {
  const { pendingChangeAt } = data
  // record the time it takes to run the backup job
  const timer = new metrics.Timer(
    'backup_worker_job_duration',
    1,
    {},
    JOB_TIME_BUCKETS
  )
  const pendingAge = Date.now() - pendingChangeAt
  if (pendingAge > WARN_THRESHOLD) {
    logger.warn(
      { projectId, pendingAge, job },
      'project has been pending for a long time'
    )
  }
  try {
    logger.debug({ projectId }, 'processing backup for project')
    await backupProject(projectId, {})
    metrics.inc('backup_worker_project', 1, {
      status: 'success',
    })
    timer.done()
    // record the replication lag (time from change to backup)
    if (pendingChangeAt) {
      metrics.histogram(
        'backup_worker_replication_lag_in_hours',
        (Date.now() - pendingChangeAt) / (3600 * 1000),
        LAG_TIME_BUCKETS_HRS
      )
    }
    return `backup completed ${projectId}`
  } catch (err) {
    metrics.inc('backup_worker_project', 1, { status: 'failed' })
    logger.error({ projectId, err }, 'backup failed')
    throw err // Re-throw to mark job as failed
  }
}

async function runInit(startDate, endDate) {
  try {
    logger.info({ startDate, endDate }, 'initializing projects')
    await initializeProjects({ 'start-date': startDate, 'end-date': endDate })
    return `initialization completed ${startDate} - ${endDate}`
  } catch (err) {
    logger.error({ startDate, endDate, err }, 'initialization failed')
    throw err
  }
}

export async function drainQueue() {
  logger.info({ queue: backupQueue.name }, 'pausing queue')
  await backupQueue.pause(true) // pause this worker and wait for jobs to finish
  logger.info({ queue: backupQueue.name }, 'closing queue')
  await backupQueue.close()
}

export async function healthCheck() {
  const count = await backupQueue.count()
  metrics.gauge('backup_worker_queue_length', count)
}
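backup_worker.mjs exports drainQueue() and healthCheck() but does not wire them up itself. A minimal sketch of how a host process might use these exports, assuming it owns signal handling and a periodic health check; the interval and the import path are assumptions, not part of this commit:

import { drainQueue, healthCheck } from './backup_worker.mjs'

// Publish the queue-length gauge once a minute.
const healthCheckTimer = setInterval(() => {
  healthCheck().catch(err => console.error('health check failed', err))
}, 60 * 1000)

// On SIGTERM, stop accepting jobs, wait for in-flight jobs, then exit.
process.on('SIGTERM', () => {
  clearInterval(healthCheckTimer)
  drainQueue()
    .then(() => process.exit(0))
    .catch(err => {
      console.error('error draining queue', err)
      process.exit(1)
    })
})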
69
services/history-v1/storage/scripts/export_global_blobs.mjs
Normal file
69
services/history-v1/storage/scripts/export_global_blobs.mjs
Normal file
@@ -0,0 +1,69 @@
/**
 * A script to export the global blobs from mongo to a CSV file.
 *
 * node storage/scripts/export_global_blobs.mjs --output global_blobs.csv
 *
 * The output CSV has the following format:
 *
 * hash,path,byteLength,stringLength,demoted
 *
 * hash: the hash of the blob
 * path: the path of the blob in the blob store
 * byteLength: the byte length of the blob, or empty if unknown
 * stringLength: the string length of the blob, or empty if unknown
 * demoted: true if the blob has been demoted to a reference, false otherwise
 */

// @ts-check
import { ObjectId } from 'mongodb'
import { GLOBAL_BLOBS, loadGlobalBlobs } from '../lib/blob_store/index.js'
import { client } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import fs from 'node:fs'

// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true

function parseArgs() {
  const args = commandLineArgs([
    {
      name: 'output',
      type: String,
      alias: 'o',
    },
  ])
  const OUTPUT_STREAM = fs.createWriteStream(args['output'], { flags: 'wx' })

  return {
    OUTPUT_STREAM,
  }
}

const { OUTPUT_STREAM } = parseArgs()

async function main() {
  await loadGlobalBlobs()
  OUTPUT_STREAM.write('hash,path,byteLength,stringLength,demoted\n')
  for (const [hash, { blob, demoted }] of GLOBAL_BLOBS) {
    const { hash: blobHash, byteLength, stringLength } = blob
    if (blobHash !== hash) {
      throw new Error(`hash mismatch: ${hash} !== ${blobHash}`)
    }
    const path = blobHash.slice(0, 2) + '/' + blobHash.slice(2)
    const byteLengthStr = byteLength === null ? '' : byteLength
    const stringLengthStr = stringLength === null ? '' : stringLength
    OUTPUT_STREAM.write(
      `${hash},${path},${byteLengthStr},${stringLengthStr},${demoted}\n`
    )
  }
}

main()
  .then(() => console.log('Done.'))
  .catch(err => {
    console.error('Error:', err)
    process.exitCode = 1
  })
  .finally(() => {
    client.close().catch(err => console.error('Error closing MongoDB:', err))
  })
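For illustration only, a row of the generated CSV and how it maps back onto the blob fields; the hash and lengths below are made up:

// hash,path,byteLength,stringLength,demoted
const exampleRow =
  'aab2c1e7d1e5f3b0c9d8e7f6a5b4c3d2e1f0a9b8,aa/b2c1e7d1e5f3b0c9d8e7f6a5b4c3d2e1f0a9b8,1234,1234,false'
const [hash, path, byteLength, stringLength, demoted] = exampleRow.split(',')
// byteLength and stringLength are empty strings when the lengths are unknown.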
@@ -0,0 +1,51 @@
// @ts-check
import { backedUpBlobs } from '../lib/mongodb.js'
import { mongoId } from '../lib/assert.js'
import { ObjectId } from 'mongodb'
import commandLineArgs from 'command-line-args'

const STATS = {
  total: 0,
  replaced: 0,
  skipped: 0,
}

const config = commandLineArgs([
  { name: 'commit', type: Boolean, defaultValue: false },
])

async function processRecord(record) {
  STATS.total++
  try {
    mongoId(record._id)
    const newId = new ObjectId(record._id)
    if (config.commit) {
      await backedUpBlobs.updateOne(
        { _id: newId },
        {
          $addToSet: { blobs: { $each: record.blobs } },
        },
        { upsert: true }
      )
      await backedUpBlobs.deleteOne({ _id: record._id })
    }
    STATS.replaced++
  } catch (error) {
    console.log(error)
    STATS.skipped++
  }
}

const cursor = backedUpBlobs
  .find({ _id: { $type: 'string' } })
  .project({ _id: 1, blobs: 1 })

while (await cursor.hasNext()) {
  const record = await cursor.next()
  await processRecord(record)
}

console.log(
  `${!config.commit ? 'DRY RUN' : ''} ${STATS.total} records ${STATS.replaced} replaced, ${STATS.skipped} skipped`
)
process.exit()
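A quick way to gauge how many legacy records this script would rewrite is to count the string-keyed ids up front. This check is not part of this commit; it only uses the same collection helpers the scripts in this directory already import:

// Count backedUpBlobs records still keyed by a string _id (dry-run sizing aid).
import { backedUpBlobs, client } from '../lib/mongodb.js'

const remaining = await backedUpBlobs.countDocuments({ _id: { $type: 'string' } })
console.log(`${remaining} records still keyed by string _id`)
await client.close()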
File diff suppressed because it is too large
@@ -0,0 +1,3 @@
UPDATE blobs
SET global = TRUE
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);
@@ -0,0 +1,16 @@
CREATE TABLE global_blobs (
    hash_bytes bytea NOT NULL,
    byte_length integer NOT NULL,
    string_length integer,
    global boolean,
    CONSTRAINT global_blobs_pkey PRIMARY KEY (hash_bytes),
    CONSTRAINT global_blobs_byte_length_non_negative
        CHECK (byte_length >= 0),
    CONSTRAINT global_blobs_string_length_non_negative
        CHECK (string_length IS NULL OR string_length >= 0)
);

INSERT INTO global_blobs (hash_bytes, byte_length, string_length, global)
SELECT hash_bytes, byte_length, string_length, true
FROM blobs
WHERE hash_bytes IN (SELECT hash_bytes FROM global_blob_hashes);
@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO old_blobs;
ALTER TABLE global_blobs RENAME TO blobs;

ALTER TABLE old_blobs
    RENAME CONSTRAINT blobs_pkey TO old_blobs_pkey;
ALTER TABLE old_blobs
    RENAME CONSTRAINT blobs_byte_length_non_negative
    TO old_blobs_byte_length_non_negative;
ALTER TABLE old_blobs
    RENAME CONSTRAINT blobs_string_length_non_negative
    TO old_blobs_string_length_non_negative;

ALTER TABLE blobs
    RENAME CONSTRAINT global_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
    RENAME CONSTRAINT global_blobs_byte_length_non_negative
    TO blobs_byte_length_non_negative;
ALTER TABLE blobs
    RENAME CONSTRAINT global_blobs_string_length_non_negative
    TO blobs_string_length_non_negative;
COMMIT;
@@ -0,0 +1,9 @@
Scripts in this directory were used when we cleaned up the global blobs table,
ensuring that it only contained global blobs. The scripts are meant to be run in this order:

* `01-create-blob-hashes-table.sql`
* `02-set-global-flag.sql`
* `03-create-global-blobs-table.sql`
* `04-swap-global-blob-tables.sql`

The `rollback.sql` script can be run to reverse the effect of `04-swap-global-blob-tables.sql`.
@@ -0,0 +1,22 @@
BEGIN;
ALTER TABLE blobs RENAME TO global_blobs;
ALTER TABLE old_blobs RENAME TO blobs;

ALTER TABLE global_blobs
    RENAME CONSTRAINT blobs_pkey TO global_blobs_pkey;
ALTER TABLE global_blobs
    RENAME CONSTRAINT blobs_byte_length_non_negative
    TO global_blobs_byte_length_non_negative;
ALTER TABLE global_blobs
    RENAME CONSTRAINT blobs_string_length_non_negative
    TO global_blobs_string_length_non_negative;

ALTER TABLE blobs
    RENAME CONSTRAINT old_blobs_pkey TO blobs_pkey;
ALTER TABLE blobs
    RENAME CONSTRAINT old_blobs_byte_length_non_negative
    TO blobs_byte_length_non_negative;
ALTER TABLE blobs
    RENAME CONSTRAINT old_blobs_string_length_non_negative
    TO blobs_string_length_non_negative;
COMMIT;
379
services/history-v1/storage/scripts/recover_doc_versions.js
Normal file
379
services/history-v1/storage/scripts/recover_doc_versions.js
Normal file
@@ -0,0 +1,379 @@
|
||||
const fsPromises = require('node:fs/promises')
|
||||
const { ObjectId } = require('mongodb')
|
||||
const BPromise = require('bluebird')
|
||||
const logger = require('@overleaf/logger')
|
||||
const Settings = require('@overleaf/settings')
|
||||
const rclient = require('@overleaf/redis-wrapper').createClient(
|
||||
Settings.redis.documentupdater
|
||||
)
|
||||
const mongodb = require('../lib/mongodb')
|
||||
const { chunkStore } = require('..')
|
||||
const Events = require('node:events')
|
||||
|
||||
// Silence warning.
|
||||
Events.setMaxListeners(20)
|
||||
|
||||
const BATCH_SIZE = 1000
|
||||
const OPTIONS = {
|
||||
concurrency: parseInt(process.env.DOC_VERSION_RECOVERY_CONCURRENCY, 10) || 20,
|
||||
force: process.env.DOC_VERSION_RECOVERY_FORCE === 'true',
|
||||
'skip-history-failures':
|
||||
process.env.DOC_VERSION_RECOVERY_SKIP_HISTORY_FAILURES === 'true',
|
||||
'resyncs-needed-file': process.env.DOC_VERSION_RECOVERY_RESYNCS_NEEDED_FILE,
|
||||
}
|
||||
|
||||
const db = {
|
||||
deletedProjects: mongodb.db.collection('deletedProjects'),
|
||||
docs: mongodb.db.collection('docs'),
|
||||
migrations: mongodb.db.collection('migrations'),
|
||||
projects: mongodb.db.collection('projects'),
|
||||
}
|
||||
|
||||
const BAD_MIGRATION_NAME =
|
||||
'20231219081700_move_doc_versions_from_docops_to_docs'
|
||||
|
||||
const RECOVERY_FILES_502 = [
|
||||
'/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log',
|
||||
'/var/lib/overleaf/data/history/doc-version-recovery-resyncs.log.done',
|
||||
]
|
||||
|
||||
let loggingChain = Promise.resolve()
|
||||
const projectIdsThatNeedResyncing = []
|
||||
const unflushedDocIds = new Set()
|
||||
|
||||
async function flushLogQueue() {
|
||||
const logPath = OPTIONS['resyncs-needed-file']
|
||||
loggingChain = loggingChain.then(async () => {
|
||||
const batch = projectIdsThatNeedResyncing.splice(0)
|
||||
if (batch.length === 0) return
|
||||
try {
|
||||
await fsPromises.appendFile(logPath, batch.join('\n') + '\n')
|
||||
} catch (err) {
|
||||
projectIdsThatNeedResyncing.push(...batch)
|
||||
logger.err({ err, logPath, batch }, 'Failed to write to log file')
|
||||
}
|
||||
})
|
||||
await loggingChain
|
||||
}
|
||||
async function recordProjectNeedsResync(projectId) {
|
||||
if (OPTIONS['resyncs-needed-file']) {
|
||||
projectIdsThatNeedResyncing.push(projectId)
|
||||
await flushLogQueue()
|
||||
} else {
|
||||
console.log(`Project ${projectId} needs a hard resync.`)
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const recovery502Ran = await did502RecoveryRun()
|
||||
await getUnflushedDocIds()
|
||||
const badMigration = await db.migrations.findOne({ name: BAD_MIGRATION_NAME })
|
||||
|
||||
if (unflushedDocIds.size > 0 && !recovery502Ran && badMigration != null) {
|
||||
// Tell customers that they need to flush
|
||||
console.log(`
|
||||
--------------------------------------------------------------------
|
||||
Detected unflushed changes while recovering doc versions.
|
||||
Please go back to version 5.0.1 and follow the recovery procedure
|
||||
for flushing document updates:
|
||||
|
||||
https://github.com/overleaf/overleaf/wiki/Doc-version-recovery
|
||||
--------------------------------------------------------------------`)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
if (OPTIONS.force || recovery502Ran || badMigration != null) {
|
||||
console.warn('Need to recover doc versions. This will take a while.')
|
||||
await runRecovery()
|
||||
await db.migrations.deleteOne({ name: BAD_MIGRATION_NAME })
|
||||
await delete502RecoveryFiles()
|
||||
}
|
||||
|
||||
console.log('Done.')
|
||||
}
|
||||
|
||||
async function did502RecoveryRun() {
|
||||
for (const file of RECOVERY_FILES_502) {
|
||||
try {
|
||||
await fsPromises.stat(file)
|
||||
return true
|
||||
} catch (err) {
|
||||
// file doesn't exist. continue
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
async function delete502RecoveryFiles() {
|
||||
for (const file of RECOVERY_FILES_502) {
|
||||
try {
|
||||
await fsPromises.rename(file, file.replace('.log', '-5.0.2.log'))
|
||||
} catch (err) {
|
||||
// file doesn't exist. continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function runRecovery() {
|
||||
let batch = []
|
||||
const summary = {
|
||||
ignored: 0,
|
||||
skipped: 0,
|
||||
deletedUpdatedMongo: 0,
|
||||
deletedUpdatedRedis: 0,
|
||||
deletedUpdatedBoth: 0,
|
||||
deletedIgnored: 0,
|
||||
updatedMongo: 0,
|
||||
updatedRedis: 0,
|
||||
updatedBoth: 0,
|
||||
}
|
||||
const processBatchAndLogProgress = async () => {
|
||||
try {
|
||||
await BPromise.map(batch, project => processProject(project, summary), {
|
||||
concurrency: OPTIONS.concurrency,
|
||||
})
|
||||
} finally {
|
||||
console.log(`${summary.updatedRedis} projects updated in Redis`)
|
||||
console.log(`${summary.updatedMongo} projects updated in Mongo`)
|
||||
console.log(
|
||||
`${summary.updatedBoth} projects updated in both Mongo and Redis`
|
||||
)
|
||||
console.log(`${summary.ignored} projects had good versions`)
|
||||
console.log(
|
||||
`${summary.deletedUpdatedMongo} deleted projects updated in Mongo`
|
||||
)
|
||||
console.log(
|
||||
`${summary.deletedUpdatedRedis} deleted projects updated in Redis`
|
||||
)
|
||||
console.log(
|
||||
`${summary.deletedUpdatedBoth} deleted projects updated in both Mongo and Redis`
|
||||
)
|
||||
console.log(
|
||||
`${summary.deletedIgnored} deleted projects had good versions`
|
||||
)
|
||||
console.log(`${summary.skipped} projects skipped`)
|
||||
}
|
||||
batch = []
|
||||
}
|
||||
|
||||
await printDBStats()
|
||||
await initResyncsNeededFile()
|
||||
for await (const project of getProjects()) {
|
||||
batch.push(project)
|
||||
if (batch.length >= BATCH_SIZE) {
|
||||
await processBatchAndLogProgress()
|
||||
}
|
||||
}
|
||||
|
||||
for await (const deletedProject of getDeletedProjects()) {
|
||||
const project = deletedProject.project
|
||||
project.isDeleted = true
|
||||
batch.push(project)
|
||||
if (batch.length >= BATCH_SIZE) {
|
||||
await processBatchAndLogProgress()
|
||||
}
|
||||
}
|
||||
|
||||
if (batch.length > 0) {
|
||||
await processBatchAndLogProgress()
|
||||
}
|
||||
|
||||
await backfillMissingVersions()
|
||||
}
|
||||
|
||||
async function getUnflushedDocIds() {
|
||||
const batchSize = 1000
|
||||
let cursor = '0'
|
||||
do {
|
||||
const [newCursor, keys] = await rclient.scan(
|
||||
cursor,
|
||||
'MATCH',
|
||||
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: '*' }),
|
||||
'COUNT',
|
||||
batchSize
|
||||
)
|
||||
for (const key of keys) {
|
||||
unflushedDocIds.add(key.slice('DocVersion:'.length))
|
||||
}
|
||||
cursor = newCursor
|
||||
} while (cursor !== '0')
|
||||
}
|
||||
|
||||
async function printDBStats() {
|
||||
const projects = await db.projects.estimatedDocumentCount()
|
||||
const deletedProjects = await db.deletedProjects.countDocuments()
|
||||
const docs = await db.docs.estimatedDocumentCount()
|
||||
console.log(
|
||||
`Need to check ${projects} projects and up to ${deletedProjects} deleted projects with a total of ${docs} docs.`
|
||||
)
|
||||
}
|
||||
|
||||
async function initResyncsNeededFile() {
|
||||
const logPath = OPTIONS['resyncs-needed-file']
|
||||
if (logPath) {
|
||||
await fsPromises.writeFile(logPath, '')
|
||||
await fsPromises.rm(`${logPath}.done`, { force: true })
|
||||
}
|
||||
}
|
||||
|
||||
function getProjects() {
|
||||
return db.projects.find({}, { projection: { _id: 1, overleaf: 1 } })
|
||||
}
|
||||
|
||||
function getDeletedProjects() {
|
||||
return db.deletedProjects.find(
|
||||
{ 'project.overleaf.history.id': { $exists: true } },
|
||||
{ projection: { 'project._id': 1, 'project.overleaf': 1 } }
|
||||
)
|
||||
}
|
||||
|
||||
async function processProject(project, summary) {
|
||||
const projectId = project._id.toString()
|
||||
let updatedMongo = false
|
||||
let updatedRedis = false
|
||||
try {
|
||||
const historyDocVersions = await getHistoryDocVersions(project)
|
||||
|
||||
for (const { docId, version } of historyDocVersions) {
|
||||
const update = await fixDocVersion(docId, version)
|
||||
if (update != null) {
|
||||
if (update.in === 'mongo') {
|
||||
updatedMongo = true
|
||||
} else if (update.in === 'redis') {
|
||||
updatedRedis = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (project.isDeleted) {
|
||||
if (updatedMongo && updatedRedis) {
|
||||
summary.deletedUpdatedBoth += 1
|
||||
} else if (updatedMongo) {
|
||||
summary.deletedUpdatedMongo += 1
|
||||
} else if (updatedRedis) {
|
||||
summary.deletedUpdatedRedis += 1
|
||||
} else {
|
||||
summary.deletedIgnored += 1
|
||||
}
|
||||
} else {
|
||||
await recordProjectNeedsResync(projectId)
|
||||
if (updatedMongo && updatedRedis) {
|
||||
summary.updatedBoth += 1
|
||||
} else if (updatedMongo) {
|
||||
summary.updatedMongo += 1
|
||||
} else if (updatedRedis) {
|
||||
summary.updatedRedis += 1
|
||||
} else {
|
||||
summary.ignored += 1
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
logger.error({ err, projectId }, 'Failed to process project')
|
||||
if (OPTIONS['skip-history-failures']) {
|
||||
summary.skipped += 1
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function getHistoryDocVersions(project) {
|
||||
const historyId = project.overleaf.history.id
|
||||
const chunk = await chunkStore.loadLatest(historyId)
|
||||
if (chunk == null) {
|
||||
return []
|
||||
}
|
||||
|
||||
const snapshot = chunk.getSnapshot()
|
||||
const changes = chunk.getChanges()
|
||||
snapshot.applyAll(changes)
|
||||
const v2DocVersions = snapshot.getV2DocVersions()
|
||||
if (v2DocVersions == null) {
|
||||
return []
|
||||
}
|
||||
return Object.entries(v2DocVersions.data).map(([docId, versionInfo]) => ({
|
||||
docId,
|
||||
version: versionInfo.v,
|
||||
}))
|
||||
}
|
||||
|
||||
async function fixDocVersion(docId, historyVersion) {
|
||||
const redisVersion = await getRedisDocVersion(docId)
|
||||
if (redisVersion != null && historyVersion >= redisVersion) {
|
||||
await setRedisDocVersion(docId, historyVersion + 1)
|
||||
return {
|
||||
in: 'redis',
|
||||
previousVersion: redisVersion,
|
||||
newVersion: historyVersion + 1,
|
||||
}
|
||||
} else {
|
||||
const docBeforeUpdate = await db.docs.findOneAndUpdate(
|
||||
{
|
||||
_id: new ObjectId(docId),
|
||||
$or: [
|
||||
{ version: { $lte: historyVersion } },
|
||||
{ version: { $exists: false } },
|
||||
],
|
||||
},
|
||||
{ $set: { version: historyVersion + 1 } },
|
||||
{ projection: { _id: 1, version: 1 } }
|
||||
)
|
||||
|
||||
if (docBeforeUpdate != null) {
|
||||
return {
|
||||
in: 'mongo',
|
||||
previousVersion: docBeforeUpdate.version,
|
||||
newVersion: historyVersion + 1,
|
||||
}
|
||||
} else {
|
||||
return null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function getRedisDocVersion(docId) {
|
||||
if (!unflushedDocIds.has(docId)) {
|
||||
return null
|
||||
}
|
||||
const result = await rclient.get(
|
||||
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId })
|
||||
)
|
||||
if (result == null) {
|
||||
return null
|
||||
}
|
||||
return parseInt(result, 10)
|
||||
}
|
||||
|
||||
async function setRedisDocVersion(docId, version) {
|
||||
const multi = rclient.multi()
|
||||
multi.set(
|
||||
Settings.redis.documentupdater.key_schema.docVersion({ doc_id: docId }),
|
||||
version
|
||||
)
|
||||
multi.set(`UnflushedTime:{${docId}}`, Date.now(), 'NX')
|
||||
await multi.exec()
|
||||
}
|
||||
|
||||
/**
|
||||
* Set all remaining versions to 0
|
||||
*/
|
||||
async function backfillMissingVersions() {
|
||||
console.log('Defaulting version to 0 for remaining docs.')
|
||||
await db.docs.updateMany(
|
||||
{ version: { $exists: false } },
|
||||
{ $set: { version: 0 } }
|
||||
)
|
||||
}
|
||||
|
||||
main()
|
||||
.finally(async () => {
|
||||
console.log('Flushing log queue.')
|
||||
await flushLogQueue()
|
||||
})
|
||||
.then(() => {
|
||||
process.exit(0)
|
||||
})
|
||||
.catch(err => {
|
||||
console.error(err)
|
||||
process.exit(1)
|
||||
})
|
255
services/history-v1/storage/scripts/recover_zip.js
Normal file
255
services/history-v1/storage/scripts/recover_zip.js
Normal file
@@ -0,0 +1,255 @@
|
||||
/**
|
||||
* Try to recover a zip of the latest version of a project using only data in
|
||||
* GCS, where this data may have been (recently) hard deleted (i.e. may exist
|
||||
* wholly or in part as non-current versions). This should be able to
|
||||
* retrieve the latest content of a project up to 180 days after it was
|
||||
* deleted.
|
||||
*
|
||||
* Usage:
|
||||
* node recover_zip.js [--verbose] <HISTORY_ID> <HISTORY_ID> ...
|
||||
*
|
||||
* Output:
|
||||
* Signed URL(s) for the uploaded zip files. Note that these are valid for
|
||||
* only 24h, to match the lifecycle rule on the zip bucket.
|
||||
*/
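To make the usage block above concrete: with hypothetical history IDs, an invocation and the shape util.parseArgs produces further down would be:

// node recover_zip.js --verbose 123456 789012
// util.parseArgs below then yields:
//   { values: { verbose: true }, positionals: ['123456', '789012'] }
// and the script prints one signed URL per recovered history ID.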
|
||||
|
||||
const fs = require('node:fs')
|
||||
const os = require('node:os')
|
||||
const path = require('node:path')
|
||||
const util = require('node:util')
|
||||
|
||||
// Something is registering 11 listeners, over the limit of 10, which generates
|
||||
// a lot of warning noise.
|
||||
require('node:events').EventEmitter.defaultMaxListeners = 11
|
||||
|
||||
const config = require('config')
|
||||
// We depend on this via object-persistor.
|
||||
// eslint-disable-next-line import/no-extraneous-dependencies
|
||||
const { Storage } = require('@google-cloud/storage')
|
||||
const isValidUtf8 = require('utf-8-validate')
|
||||
|
||||
const core = require('overleaf-editor-core')
|
||||
const projectKey = require('../lib/project_key')
|
||||
const streams = require('../lib/streams')
|
||||
const ProjectArchive = require('../lib/project_archive')
|
||||
|
||||
const {
|
||||
values: { verbose: VERBOSE },
|
||||
positionals: HISTORY_IDS,
|
||||
} = util.parseArgs({
|
||||
options: {
|
||||
verbose: {
|
||||
type: 'boolean',
|
||||
default: false,
|
||||
},
|
||||
},
|
||||
allowPositionals: true,
|
||||
})
|
||||
|
||||
if (HISTORY_IDS.length === 0) {
|
||||
console.error('no history IDs; see usage')
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
async function listDeletedChunks(historyId) {
|
||||
const bucketName = config.get('chunkStore.bucket')
|
||||
const storage = new Storage()
|
||||
const [files] = await storage.bucket(bucketName).getFiles({
|
||||
prefix: projectKey.format(historyId),
|
||||
versions: true,
|
||||
})
|
||||
return files
|
||||
}
|
||||
|
||||
async function findLatestChunk(historyId) {
|
||||
const files = await listDeletedChunks(historyId)
|
||||
if (files.length === 0) return null
|
||||
files.sort((a, b) => {
|
||||
if (a.name < b.name) return -1
|
||||
if (a.name > b.name) return 1
|
||||
return 0
|
||||
})
|
||||
return files[files.length - 1]
|
||||
}
|
||||
|
||||
async function downloadLatestChunk(tmp, historyId) {
|
||||
const latestChunkFile = await findLatestChunk(historyId)
|
||||
if (!latestChunkFile) throw new Error('no chunk found to recover')
|
||||
|
||||
const destination = path.join(tmp, 'latest.json')
|
||||
await latestChunkFile.download({ destination })
|
||||
return destination
|
||||
}
|
||||
|
||||
async function loadHistory(historyPathname) {
|
||||
const data = await fs.promises.readFile(historyPathname)
|
||||
const rawHistory = JSON.parse(data)
|
||||
return core.History.fromRaw(rawHistory)
|
||||
}
|
||||
|
||||
async function loadChunk(historyPathname, blobStore) {
|
||||
const history = await loadHistory(historyPathname)
|
||||
|
||||
const blobHashes = new Set()
|
||||
history.findBlobHashes(blobHashes)
|
||||
|
||||
await blobStore.fetchBlobs(blobHashes)
|
||||
await history.loadFiles('lazy', blobStore)
|
||||
|
||||
return new core.Chunk(history, 0)
|
||||
}
|
||||
|
||||
// TODO: it would be nice to export / expose this from BlobStore;
|
||||
// currently this is a copy of the method there.
|
||||
async function getStringLengthOfFile(byteLength, pathname) {
|
||||
// We have to read the file into memory to get its UTF-8 length, so don't
|
||||
// bother for files that are too large for us to edit anyway.
|
||||
if (byteLength > core.Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
|
||||
return null
|
||||
}
|
||||
|
||||
// We need to check if the file contains nonBmp or null characters
|
||||
let data = await fs.promises.readFile(pathname)
|
||||
if (!isValidUtf8(data)) return null
|
||||
data = data.toString()
|
||||
if (data.length > core.TextOperation.MAX_STRING_LENGTH) return null
|
||||
if (core.util.containsNonBmpChars(data)) return null
|
||||
if (data.indexOf('\x00') !== -1) return null
|
||||
return data.length
|
||||
}
|
||||
|
||||
class RecoveryBlobStore {
|
||||
constructor(historyId, tmp) {
|
||||
this.historyId = historyId
|
||||
this.tmp = tmp
|
||||
this.blobs = new Map()
|
||||
}
|
||||
|
||||
async fetchBlobs(blobHashes) {
|
||||
for await (const blobHash of blobHashes) {
|
||||
await this.fetchBlob(blobHash)
|
||||
}
|
||||
}
|
||||
|
||||
async fetchBlob(hash) {
|
||||
if (this.blobs.has(hash)) return
|
||||
|
||||
if (VERBOSE) console.log('fetching blob', hash)
|
||||
|
||||
const bucketName = config.get('blobStore.projectBucket')
|
||||
const storage = new Storage()
|
||||
const [files] = await storage.bucket(bucketName).getFiles({
|
||||
prefix: this.makeProjectBlobKey(hash),
|
||||
versions: true,
|
||||
})
|
||||
|
||||
const destination = this.getBlobPathname(hash)
|
||||
|
||||
if (files.length === 0) {
|
||||
await this.fetchGlobalBlob(hash, destination)
|
||||
} else if (files.length === 1) {
|
||||
await files[0].download({ destination })
|
||||
} else {
|
||||
throw new Error('Multiple versions of blob ' + hash)
|
||||
}
|
||||
|
||||
this.blobs.set(hash, await this.makeBlob(hash, destination))
|
||||
}
|
||||
|
||||
async fetchGlobalBlob(hash, destination) {
|
||||
const bucketName = config.get('blobStore.globalBucket')
|
||||
const storage = new Storage()
|
||||
const file = storage.bucket(bucketName).file(this.makeGlobalBlobKey(hash))
|
||||
await file.download({ destination })
|
||||
}
|
||||
|
||||
async makeBlob(hash, pathname) {
|
||||
const stat = await fs.promises.stat(pathname)
|
||||
const byteLength = stat.size
|
||||
const stringLength = await getStringLengthOfFile(byteLength, pathname)
|
||||
return new core.Blob(hash, byteLength, stringLength)
|
||||
}
|
||||
|
||||
async getString(hash) {
|
||||
const stream = await this.getStream(hash)
|
||||
const buffer = await streams.readStreamToBuffer(stream)
|
||||
return buffer.toString()
|
||||
}
|
||||
|
||||
async getStream(hash) {
|
||||
return fs.createReadStream(this.getBlobPathname(hash))
|
||||
}
|
||||
|
||||
async getBlob(hash) {
|
||||
return this.blobs.get(hash)
|
||||
}
|
||||
|
||||
getBlobPathname(hash) {
|
||||
return path.join(this.tmp, hash)
|
||||
}
|
||||
|
||||
makeGlobalBlobKey(hash) {
|
||||
return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
|
||||
}
|
||||
|
||||
makeProjectBlobKey(hash) {
|
||||
return `${projectKey.format(this.historyId)}/${hash.slice(
|
||||
0,
|
||||
2
|
||||
)}/${hash.slice(2)}`
|
||||
}
|
||||
}
|
||||
|
||||
async function uploadZip(historyId, zipPathname) {
|
||||
const bucketName = config.get('zipStore.bucket')
|
||||
const deadline = 24 * 3600 * 1000 // lifecycle limit on the zips bucket
|
||||
const storage = new Storage()
|
||||
const destination = `${historyId}-recovered.zip`
|
||||
await storage.bucket(bucketName).upload(zipPathname, { destination })
|
||||
|
||||
const signedUrls = await storage
|
||||
.bucket(bucketName)
|
||||
.file(destination)
|
||||
.getSignedUrl({
|
||||
version: 'v4',
|
||||
action: 'read',
|
||||
expires: Date.now() + deadline,
|
||||
})
|
||||
|
||||
return signedUrls[0]
|
||||
}
|
||||
|
||||
async function restoreProject(historyId) {
|
||||
const tmp = await fs.promises.mkdtemp(
|
||||
path.join(os.tmpdir(), historyId.toString())
|
||||
)
|
||||
if (VERBOSE) console.log('recovering', historyId, 'in', tmp)
|
||||
|
||||
const latestJsonPathname = await downloadLatestChunk(tmp, historyId)
|
||||
const blobStore = new RecoveryBlobStore(historyId, tmp)
|
||||
const chunk = await loadChunk(latestJsonPathname, blobStore)
|
||||
|
||||
const snapshot = chunk.getSnapshot()
|
||||
for (const change of chunk.getChanges()) {
|
||||
change.applyTo(snapshot)
|
||||
}
|
||||
|
||||
if (VERBOSE) console.log('zipping', historyId)
|
||||
|
||||
const zipPathname = path.join(tmp, `${historyId}.zip`)
|
||||
const zipTimeoutMs = 60 * 1000
|
||||
const archive = new ProjectArchive(snapshot, zipTimeoutMs)
|
||||
await archive.writeZip(blobStore, zipPathname)
|
||||
|
||||
if (VERBOSE) console.log('uploading', historyId)
|
||||
|
||||
return await uploadZip(historyId, zipPathname)
|
||||
}
|
||||
|
||||
async function main() {
|
||||
for (const historyId of HISTORY_IDS) {
|
||||
const signedUrl = await restoreProject(historyId)
|
||||
console.log(signedUrl)
|
||||
}
|
||||
}
|
||||
main().catch(console.error)
|
36
services/history-v1/storage/scripts/redis.mjs
Normal file
36
services/history-v1/storage/scripts/redis.mjs
Normal file
@@ -0,0 +1,36 @@
import redis from '@overleaf/redis-wrapper'
import config from 'config'

// Get allowed Redis dbs from config
const redisConfig = config.get('redis')
const allowedDbs = Object.keys(redisConfig)

// Get the Redis db from the command line argument
const db = process.argv[2]

// Validate the Redis db
if (!allowedDbs.includes(db)) {
  if (db) {
    console.error('Invalid redis db:', db)
  }
  console.error(`Usage: node redis.mjs [${allowedDbs.join('|')}]`)
  process.exit(1)
}

// Get redis options based on the command line argument
const redisOptions = config.get(`redis.${db}`)
console.log('Using redis db:', db)
console.log('REDIS CONFIG', {
  ...redisOptions,
  password: '*'.repeat(redisOptions.password?.length),
})
const rclient = redis.createClient(redisOptions)

try {
  await rclient.healthCheck()
  console.log('REDIS HEALTHCHECK SUCCEEDED')
} catch (error) {
  console.error('REDIS HEALTHCHECK FAILED', error)
} finally {
  await rclient.quit()
}
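An example run, assuming the configuration defines a `queue` Redis instance (the same key backup_worker.mjs reads via config.get('redis.queue')):

// node storage/scripts/redis.mjs queue
// -> prints the connection options with the password masked, then either
//    "REDIS HEALTHCHECK SUCCEEDED" or "REDIS HEALTHCHECK FAILED" plus the error.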
104
services/history-v1/storage/scripts/remove_backed_up_blobs.mjs
Normal file
104
services/history-v1/storage/scripts/remove_backed_up_blobs.mjs
Normal file
@@ -0,0 +1,104 @@
|
||||
// @ts-check
|
||||
import { readFileSync } from 'node:fs'
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import { client } from '../lib/mongodb.js'
|
||||
import {
|
||||
getBackedUpBlobHashes,
|
||||
unsetBackedUpBlobHashes,
|
||||
} from '../lib/backup_store/index.js'
|
||||
|
||||
let gracefulShutdownInitiated = false
|
||||
|
||||
// Parse command line arguments
|
||||
const args = commandLineArgs([
|
||||
{ name: 'input', type: String, alias: 'i', defaultOption: true },
|
||||
{ name: 'commit', type: Boolean, defaultValue: false },
|
||||
])
|
||||
|
||||
if (!args.input) {
|
||||
console.error(
|
||||
'Usage: node remove_backed_up_blobs.mjs --input <csv-file> [--commit]'
|
||||
)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
if (!args.commit) {
|
||||
console.log('Running in dry-run mode. Use --commit to apply changes.')
|
||||
}
|
||||
|
||||
// Signal handling
|
||||
process.on('SIGINT', handleSignal)
|
||||
process.on('SIGTERM', handleSignal)
|
||||
|
||||
function handleSignal() {
|
||||
console.warn('Graceful shutdown initiated')
|
||||
gracefulShutdownInitiated = true
|
||||
}
|
||||
|
||||
// Process CSV and remove blobs
|
||||
async function main() {
|
||||
const projectBlobs = new Map()
|
||||
const lines = readFileSync(args.input, 'utf8').split('\n')
|
||||
const SHA1_HEX_REGEX = /^[a-f0-9]{40}$/
|
||||
|
||||
// Skip header
|
||||
for (const line of lines.slice(1)) {
|
||||
if (!line.trim() || gracefulShutdownInitiated) break
|
||||
|
||||
const [projectId, path] = line.split(',')
|
||||
const pathParts = path.split('/')
|
||||
const hash = pathParts[3] + pathParts[4]
|
||||
|
||||
if (!SHA1_HEX_REGEX.test(hash)) {
|
||||
console.warn(`Invalid SHA1 hash for project ${projectId}: ${hash}`)
|
||||
continue
|
||||
}
|
||||
|
||||
if (!projectBlobs.has(projectId)) {
|
||||
projectBlobs.set(projectId, new Set())
|
||||
}
|
||||
projectBlobs.get(projectId).add(hash)
|
||||
}
|
||||
|
||||
// Process each project
|
||||
for (const [projectId, hashes] of projectBlobs) {
|
||||
if (gracefulShutdownInitiated) break
|
||||
|
||||
if (!args.commit) {
|
||||
console.log(
|
||||
`DRY-RUN: would remove ${hashes.size} blobs from project ${projectId}`
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
try {
|
||||
const originalHashes = await getBackedUpBlobHashes(projectId)
|
||||
if (originalHashes.size === 0) {
|
||||
continue
|
||||
}
|
||||
const result = await unsetBackedUpBlobHashes(
|
||||
projectId,
|
||||
Array.from(hashes)
|
||||
)
|
||||
if (result) {
|
||||
console.log(
|
||||
`Project ${projectId}: want to remove ${hashes.size}, removed ${originalHashes.size - result.blobs.length}, ${result.blobs.length} remaining`
|
||||
)
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(`Error updating project ${projectId}:`, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Run the script
|
||||
main()
|
||||
.catch(err => {
|
||||
console.error('Fatal error:', err)
|
||||
process.exitCode = 1
|
||||
})
|
||||
.finally(() => {
|
||||
client
|
||||
.close()
|
||||
.catch(err => console.error('Error closing MongoDB connection:', err))
|
||||
})
|
@@ -0,0 +1,221 @@
|
||||
// @ts-check
|
||||
|
||||
/**
|
||||
* This script is used to remove blobs that have been backed up under the project ID
|
||||
* instead of the history ID (where those are different).
|
||||
*
|
||||
* This script reads a CSV file with the following format:
|
||||
* ```
|
||||
* project_id,hash
|
||||
* <mongo ID>,<hash>
|
||||
* ```
|
||||
*
|
||||
* The header row is optional. All rows will be checked for conformance to the format.
|
||||
*/
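For illustration, an input file matching the format described above could look like this; the IDs and hashes are made up, and each row is validated with assert.mongoId and assert.blobHash before any deletion is attempted:

const exampleInput = [
  'project_id,hash', // optional header row, enabled with --header
  '507f1f77bcf86cd799439011,aab2c1e7d1e5f3b0c9d8e7f6a5b4c3d2e1f0a9b8',
  '507f1f77bcf86cd799439012,ffab12cd34ef56ab78cd90ef12ab34cd56ef78ab',
].join('\n')
// Passed to the script via --input <path>; without --commit it only prints
// the DELETE operations it would perform.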
|
||||
|
||||
import commandLineArgs from 'command-line-args'
|
||||
import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
|
||||
import { makeProjectKey } from '../lib/blob_store/index.js'
|
||||
import fs from 'node:fs'
|
||||
import assert from '../lib/assert.js'
|
||||
import { client } from '../lib/mongodb.js'
|
||||
import { verifyBlobs } from '../lib/backupVerifier.mjs'
|
||||
import { setTimeout } from 'node:timers/promises'
|
||||
import { getHistoryId } from '../lib/backup_store/index.js'
|
||||
|
||||
const argsSchema = [
|
||||
{
|
||||
name: 'input',
|
||||
type: String,
|
||||
},
|
||||
{
|
||||
name: 'commit',
|
||||
type: Boolean,
|
||||
},
|
||||
{
|
||||
name: 'header',
|
||||
type: Boolean,
|
||||
},
|
||||
{
|
||||
name: 'force',
|
||||
type: Boolean,
|
||||
},
|
||||
{
|
||||
name: 'verbose',
|
||||
type: Boolean,
|
||||
},
|
||||
]
|
||||
|
||||
const args = commandLineArgs(argsSchema)
|
||||
|
||||
async function gracefulClose(code = 0) {
|
||||
await client.close()
|
||||
process.exit(code)
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {(value: unknown) => void} fn
|
||||
* @param {unknown} value
|
||||
* @return {boolean}
|
||||
*/
|
||||
function not(fn, value) {
|
||||
try {
|
||||
fn(value)
|
||||
return false
|
||||
} catch {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} row
|
||||
* @return {{projectId: string, hash: string}}
|
||||
*/
|
||||
function parseCSVRow(row) {
|
||||
const [projectId, hash] = row.split(',')
|
||||
assert.mongoId(projectId, `invalid projectId ${projectId}`)
|
||||
assert.blobHash(hash, `invalid hash ${hash}`)
|
||||
return { projectId, hash }
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} path
|
||||
* @param {boolean} hasHeader
|
||||
* @return {AsyncGenerator<{projectId: string, hash: string}, void, *>}
|
||||
*/
|
||||
async function* readCSV(path, hasHeader) {
|
||||
let seenHeader = !hasHeader
|
||||
let fh
|
||||
try {
|
||||
fh = await fs.promises.open(path, 'r')
|
||||
} catch (error) {
|
||||
console.error(`Could not open file: ${error}`)
|
||||
return await gracefulClose(1)
|
||||
}
|
||||
for await (const line of fh.readLines()) {
|
||||
if (!seenHeader) {
|
||||
const [first, second] = line.split(',')
|
||||
const noDataInHeader =
|
||||
not(assert.mongoId, first) && not(assert.blobHash, second)
|
||||
if (!noDataInHeader) {
|
||||
console.error('Data found in header row')
|
||||
return await gracefulClose(1)
|
||||
}
|
||||
seenHeader = true
|
||||
continue
|
||||
}
|
||||
try {
|
||||
yield parseCSVRow(line)
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : error)
|
||||
console.info(`Skipping invalid row: ${line}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function usage() {
|
||||
console.info(
|
||||
'Usage: remove_blobs_from_backup.mjs --input <path> [--commit] [--header] [--force] [--verbose]'
|
||||
)
|
||||
}
|
||||
|
||||
if (!args.input) {
|
||||
console.error('--input was missing')
|
||||
usage()
|
||||
await gracefulClose(1)
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} hash
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function deleteBlob(projectId, hash) {
|
||||
const path = makeProjectKey(projectId, hash)
|
||||
if (args.commit) {
|
||||
await backupPersistor.deleteObject(projectBlobsBucket, path)
|
||||
} else {
|
||||
console.log(`DELETE: ${path}`)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {string} projectId
|
||||
* @param {string} hash
|
||||
* @return {Promise<void>}
|
||||
*/
|
||||
async function canDeleteBlob(projectId, hash) {
|
||||
let historyId
|
||||
try {
|
||||
historyId = await getHistoryId(projectId)
|
||||
} catch (error) {
|
||||
if (args.verbose) {
|
||||
console.error(error)
|
||||
}
|
||||
throw new Error(`No history ID found for project ${projectId}, skipping`)
|
||||
}
|
||||
if (historyId === projectId) {
|
||||
throw new Error(
|
||||
`Project ID and history ID are the same for ${projectId} - use --force to delete anyway`
|
||||
)
|
||||
}
|
||||
|
||||
// TODO: fix assert.postgresId to handle integers better and then stop coercing to string below
|
||||
assert.postgresId(
|
||||
`${historyId}`,
|
||||
`History ID ${historyId} does not appear to be for a postgres project`
|
||||
)
|
||||
|
||||
try {
|
||||
await verifyBlobs(`${historyId}`, [hash])
|
||||
} catch (error) {
|
||||
if (args.verbose) {
|
||||
console.error(error)
|
||||
}
|
||||
throw new Error(
|
||||
`Blob ${hash} is not backed up for project ${projectId} - use --force to delete anyway`
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
if (!args.commit) {
|
||||
console.log('DRY RUN: provide --commit to perform operations')
|
||||
}
|
||||
|
||||
if (args.force) {
|
||||
console.log(
|
||||
'WARNING: --force is enabled, blobs will be deleted regardless of backup status'
|
||||
)
|
||||
await setTimeout(5_000)
|
||||
}
|
||||
|
||||
let deleted = 0
|
||||
let errors = 0
|
||||
|
||||
for await (const { projectId, hash } of readCSV(args.input, args.header)) {
|
||||
if (!args.force) {
|
||||
try {
|
||||
await canDeleteBlob(projectId, hash)
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : error)
|
||||
continue
|
||||
}
|
||||
}
|
||||
try {
|
||||
await deleteBlob(projectId, hash)
|
||||
deleted++
|
||||
} catch (error) {
|
||||
errors++
|
||||
console.error(error)
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`Deleted: ${deleted}`)
|
||||
console.log(`Errors: ${errors}`)
|
||||
|
||||
await gracefulClose()
|