first commit

2025-04-24 13:11:28 +08:00
commit ff9c54d5e4
5960 changed files with 834111 additions and 0 deletions

View File

@@ -0,0 +1,21 @@
/* eslint-env mongo */
// add a TTL index to expire entries for completed resyncs in the
// projectHistorySyncState collection. The entries should only be expired if
// resyncProjectStructure is false and resyncDocContents is a zero-length array.
const now = Date.now()
const inTheFuture = now + 24 * 3600 * 1000
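// With expireAfterSeconds set to 0, the TTL monitor removes each document as
// soon as the wall clock passes the date stored in its expiresAt field.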
db.projectHistorySyncState.createIndex(
{ expiresAt: 1 },
{ expireAfterSeconds: 0, background: true }
)
db.projectHistorySyncState.updateMany(
{
resyncProjectStructure: false,
resyncDocContents: [],
expiresAt: { $exists: false },
},
{ $set: { expiresAt: new Date(inTheFuture) } }
)

View File

@@ -0,0 +1,328 @@
// @ts-check
import Events from 'node:events'
import { setTimeout } from 'node:timers/promises'
import readline from 'node:readline'
import fs from 'node:fs'
import minimist from 'minimist'
import { ObjectId } from 'mongodb'
import { batchedUpdate } from '@overleaf/mongo-utils/batchedUpdate.js'
import logger from '@overleaf/logger'
import Metrics from '@overleaf/metrics'
import OError from '@overleaf/o-error'
import { promiseMapWithLimit } from '@overleaf/promise-utils'
import { db, mongoClient } from '../app/js/mongodb.js'
import * as HistoryStoreManager from '../app/js/HistoryStoreManager.js'
import * as RedisManager from '../app/js/RedisManager.js'
import * as SyncManager from '../app/js/SyncManager.js'
import * as UpdatesProcessor from '../app/js/UpdatesProcessor.js'
import { NeedFullProjectStructureResyncError } from '../app/js/Errors.js'
import * as ErrorRecorder from '../app/js/ErrorRecorder.js'
// Silence warning.
Events.setMaxListeners(20)
// Enable caching for ObjectId.toString()
ObjectId.cacheHexString = true
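// (each ObjectId then memoizes its hex string, trading a little memory for
// faster toString() calls in the per-project loops below)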
const READ_CONCURRENCY = parseInt(process.env.READ_CONCURRENCY || '100', 10)
const WRITE_CONCURRENCY = parseInt(process.env.WRITE_CONCURRENCY || '10', 10)
const FLUSH_RETRIES = parseInt(process.env.FLUSH_RETRIES || '20', 10)
// Relevant dates:
// - 2024-12-19, start of event-hold removal in filestore bucket -> objects older than 24h are (soft-)deleted.
// - 2024-12-23, copy operation skipped in filestore when cloning project -> objects not created on clone.
// - 2025-01-24, no more filestore reads allowed in project-history -> no more empty files in history for 404s
const FILESTORE_SOFT_DELETE_START = new Date('2024-12-19T00:00:00Z')
const FILESTORE_READ_OFF = new Date('2025-01-24T15:00:00Z')
const argv = minimist(process.argv.slice(2), {
string: ['logs', 'log-latency'],
})
const LOG_LATENCY = argv['log-latency'] === 'true'
let gracefulShutdownInitiated = false
process.on('SIGINT', handleSignal)
process.on('SIGTERM', handleSignal)
function handleSignal() {
gracefulShutdownInitiated = true
console.warn('graceful shutdown initiated, draining queue')
}
const STATS = {
processedLines: 0,
success: 0,
changed: 0,
failure: 0,
skipped: 0,
checkFailure: 0,
}
function logStats() {
console.log(
JSON.stringify({
time: new Date(),
gracefulShutdownInitiated,
...STATS,
})
)
}
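// emit a JSON progress line every 10s so long runs can be monitored from logs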
const logInterval = setInterval(logStats, 10_000)
/**
* @typedef {Object} FileRef
* @property {ObjectId} _id
* @property {any} linkedFileData
*/
/**
* @typedef {Object} Folder
* @property {Array<Folder>} folders
* @property {Array<FileRef>} fileRefs
*/
/**
* @typedef {Object} Project
* @property {ObjectId} _id
* @property {Date} lastUpdated
* @property {Array<Folder>} rootFolder
* @property {{history: {id: (number|string)}}} overleaf
*/
/**
* @param {Folder} folder
* @return {boolean}
*/
function checkFileTreeNeedsResync(folder) {
if (!folder) return false
if (Array.isArray(folder.fileRefs)) {
for (const fileRef of folder.fileRefs) {
if (fileRef.linkedFileData) return true
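// ObjectIds embed their creation time; files added after the soft-delete
// cutoff may reference filestore objects that have since been removed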
if (fileRef._id.getTimestamp() > FILESTORE_SOFT_DELETE_START) return true
}
}
if (Array.isArray(folder.folders)) {
for (const child of folder.folders) {
if (checkFileTreeNeedsResync(child)) return true
}
}
return false
}
/**
* @param {string} projectId
* @param {string} historyId
* @return {Promise<Date>}
*/
async function getLastEndTimestamp(projectId, historyId) {
const raw = await HistoryStoreManager.promises.getMostRecentVersionRaw(
projectId,
historyId,
{ readOnly: true }
)
if (!raw) throw new Error('bug: history not initialized')
return raw.endTimestamp
}
/** @type {Record<string, (project: Project) => Promise<boolean>>} */
const conditions = {
// cheap: in-memory mongo lookup
'updated after filestore soft-delete': async function (project) {
return project.lastUpdated > FILESTORE_SOFT_DELETE_START
},
// cheap: in-memory mongo lookup
'file-tree requires re-sync': async function (project) {
return checkFileTreeNeedsResync(project.rootFolder?.[0])
},
// moderate: GET from Redis
'has pending operations': async function (project) {
const n = await RedisManager.promises.countUnprocessedUpdates(
project._id.toString()
)
return n > 0
},
// expensive: GET from Mongo/Postgres via history-v1 HTTP API call
'has been flushed after filestore soft-delete': async function (project) {
// Resyncs started after soft-deleting can trigger 404s and result in empty files.
const endTimestamp = await getLastEndTimestamp(
project._id.toString(),
project.overleaf.history.id.toString()
)
return endTimestamp > FILESTORE_SOFT_DELETE_START
},
}
/**
* @param {Project} project
* @return {Promise<{projectId: string, historyId: string} | null>}
*/
async function checkProject(project) {
if (gracefulShutdownInitiated) return null
if (project._id.getTimestamp() > FILESTORE_READ_OFF) {
STATS.skipped++ // Project created after all bugs were fixed.
return null
}
const projectId = project._id.toString()
const historyId = project.overleaf.history.id.toString()
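// conditions are evaluated in declaration order, so the cheap in-memory
// checks short-circuit before the expensive Redis and HTTP lookups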
for (const [condition, check] of Object.entries(conditions)) {
try {
if (await check(project)) return { projectId, historyId }
} catch (err) {
logger.err({ projectId, condition, err }, 'failed to check project')
STATS.checkFailure++
return null
}
}
STATS.skipped++
return null
}
/**
* @param {string} projectId
* @param {string} historyId
* @return {Promise<void>}
*/
async function processProject(projectId, historyId) {
if (gracefulShutdownInitiated) return
const t0 = performance.now()
try {
await tryProcessProject(projectId, historyId)
const latency = performance.now() - t0
if (LOG_LATENCY) {
logger.info({ projectId, historyId, latency }, 'processed project')
}
STATS.success++
} catch (err) {
logger.err({ err, projectId, historyId }, 'failed to process project')
STATS.failure++
}
}
/**
* @param {string} projectId
* @return {Promise<void>}
*/
async function flushWithRetries(projectId) {
for (let attempt = 0; attempt < FLUSH_RETRIES; attempt++) {
try {
await UpdatesProcessor.promises.processUpdatesForProject(projectId)
return
} catch (err) {
logger.warn(
{ projectId, err, attempt },
'failed to flush updates, trying again'
)
if (gracefulShutdownInitiated) throw err
}
}
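// one final attempt outside the retry loop, so the error can be wrapped with
// context before bubbling up to the per-project handler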
try {
await UpdatesProcessor.promises.processUpdatesForProject(projectId)
} catch (err) {
// @ts-ignore err is Error
throw new OError('failed to flush updates', {}, err)
}
}
/**
* @param {string} projectId
* @param {string} historyId
* @return {Promise<void>}
*/
async function tryProcessProject(projectId, historyId) {
await flushWithRetries(projectId)
const start = new Date()
let needsFullSync = false
try {
await UpdatesProcessor.promises.startResyncAndProcessUpdatesUnderLock(
projectId,
{ resyncProjectStructureOnly: true }
)
} catch (err) {
if (err instanceof NeedFullProjectStructureResyncError) {
needsFullSync = true
} else {
throw err
}
}
if (needsFullSync) {
logger.warn(
{ projectId, historyId },
'structure only resync not sufficient, doing full soft resync'
)
await SyncManager.promises.startResync(projectId, {})
await UpdatesProcessor.promises.processUpdatesForProject(projectId)
STATS.changed++
} else {
const after = await getLastEndTimestamp(projectId, historyId)
if (after > start) {
STATS.changed++
}
}
// Prevent db.projectHistorySyncState from growing with every project we
// resynced. MongoDB collections cannot shrink on their own, so on success,
// purge the db entry created by this script right away.
await SyncManager.promises.clearResyncStateIfAllAfter(projectId, start)
}
async function processBatch(projects) {
const projectIds = (
await promiseMapWithLimit(READ_CONCURRENCY, projects, checkProject)
).filter(id => !!id)
await promiseMapWithLimit(WRITE_CONCURRENCY, projectIds, ids =>
processProject(ids.projectId, ids.historyId)
)
if (gracefulShutdownInitiated) throw new Error('graceful shutdown triggered')
}
async function processProjectsFromLog() {
const rl = readline.createInterface({
input: fs.createReadStream(argv.logs),
})
for await (const line of rl) {
if (gracefulShutdownInitiated) break
STATS.processedLines++
if (!line.startsWith('{')) continue
const { projectId, historyId, msg } = JSON.parse(line)
if (msg !== 'failed to process project') continue
await processProject(projectId, historyId) // does try/catch with logging
}
}
async function main() {
if (argv.logs) {
await processProjectsFromLog()
return
}
await batchedUpdate(db.projects, {}, processBatch, {
_id: 1,
lastUpdated: 1,
'overleaf.history': 1,
rootFolder: 1,
})
}
try {
try {
await main()
} finally {
clearInterval(logInterval)
logStats()
Metrics.close()
await mongoClient.close()
// TODO(das7pad): graceful shutdown for redis. Refactor process.exit when done.
}
console.log('Done.')
await setTimeout(1_000)
if (STATS.failure) {
process.exit(Math.min(STATS.failure, 99))
} else {
process.exit(0)
}
} catch (err) {
logger.err({ err }, 'fatal error')
await setTimeout(1_000)
process.exit(100)
}

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env node
// Clear timestamps which don't have any corresponding history ops
// usage: scripts/flush_all.js <limit>
import logger from '@overleaf/logger'
import * as RedisManager from '../app/js/RedisManager.js'
const argv = process.argv.slice(2)
const limit = parseInt(argv[0], 10) || null
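// a missing or non-numeric argument leaves limit as null, i.e. no cap on the scan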
// find all dangling timestamps and clear them
async function main() {
logger.info(
{ limit },
'running redis scan for project timestamps, this may take a while'
)
const projectIdsWithFirstOpTimestamps =
await RedisManager.promises.getProjectIdsWithFirstOpTimestamps(limit)
const totalTimestamps = projectIdsWithFirstOpTimestamps.length
logger.info(
{ totalTimestamps },
'scan completed, now clearing dangling timestamps'
)
let clearedTimestamps = 0
let processed = 0
for (const projectId of projectIdsWithFirstOpTimestamps) {
const result =
await RedisManager.promises.clearDanglingFirstOpTimestamp(projectId)
processed++
clearedTimestamps += result
if (processed % 1000 === 0) {
logger.info(
{ processed, totalTimestamps, clearedTimestamps },
'clearing timestamps'
)
}
}
logger.info({ processed, totalTimestamps, clearedTimestamps }, 'completed')
process.exit(0)
}
main()

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env node
import async from 'async'
import logger from '@overleaf/logger'
import Settings from '@overleaf/settings'
import redis from '@overleaf/redis-wrapper'
import { db, ObjectId } from '../app/js/mongodb.js'
logger.logger.level('fatal')
const rclient = redis.createClient(Settings.redis.project_history)
const Keys = Settings.redis.project_history.key_schema
const argv = process.argv.slice(2)
const limit = parseInt(argv[0], 10) || null
const force = argv[1] === 'force' || false
let delay = 0
function checkAndClear(project, callback) {
const projectId = project.project_id
function checkDeleted(cb) {
db.projects.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { _id: 1 } },
(err, result) => {
if (err) {
cb(err)
} else if (!result) {
// project not found, but we still need to look at deletedProjects
cb()
} else {
console.log(`Project ${projectId} found in projects`)
cb(new Error('error: project still exists'))
}
}
)
}
function checkRecoverable(cb) {
db.deletedProjects.findOne(
{
// this condition makes use of the index
'deleterData.deletedProjectId': new ObjectId(projectId),
// this condition checks if the deleted project has expired
'project._id': new ObjectId(projectId),
},
{ projection: { _id: 1 } },
(err, result) => {
if (err) {
cb(err)
} else if (!result) {
console.log(
`project ${projectId} has been deleted - safe to clear queue`
)
cb()
} else {
console.log(`Project ${projectId} found in deletedProjects`)
cb(new Error('error: project still exists'))
}
}
)
}
function clearRedisQueue(cb) {
const key = Keys.projectHistoryOps({ project_id: projectId })
delay++
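// stagger the expiry by one second per key so redis spreads the deletions out
// instead of dropping every queue at the same moment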
if (force) {
console.log('setting redis key', key, 'to expire in', delay, 'seconds')
// use expire to allow redis to delete the key in the background
rclient.expire(key, delay, err => {
cb(err)
})
} else {
console.log(
'dry run, would set key',
key,
'to expire in',
delay,
'seconds'
)
cb()
}
}
function clearMongoEntry(cb) {
if (force) {
console.log('deleting key in mongo projectHistoryFailures', projectId)
db.projectHistoryFailures.deleteOne({ project_id: projectId }, cb)
} else {
console.log('would delete failure record for', projectId, 'from mongo')
cb()
}
}
// do the checks and deletions
async.waterfall(
[checkDeleted, checkRecoverable, clearRedisQueue, clearMongoEntry],
err => {
if (!err || err.message === 'error: project still exists') {
callback()
} else {
console.log('error:', err)
callback(err)
}
}
)
}
// find all the broken projects from the failure records
async function main() {
const results = await db.projectHistoryFailures.find({}).toArray()
processFailures(results)
}
main().catch(error => {
console.error(error)
process.exit(1)
})
function processFailures(results) {
if (argv.length === 0) {
console.log(`
Usage: node clear_deleted.js [QUEUES] [FORCE]
where
QUEUES is the number of queues to process
FORCE is the string "force" when we're ready to delete the queues. Without it, this script does a dry-run
`)
}
console.log('number of stuck projects', results.length)
// now check if the project is truly deleted in mongo
async.eachSeries(results.slice(0, limit), checkAndClear, err => {
console.log('DONE', err)
process.exit()
})
}

View File

@@ -0,0 +1,175 @@
#!/usr/bin/env node
// To run in dev:
//
// docker compose run --rm project-history scripts/clear_deleted.js
//
// In production:
//
// docker run --rm $(docker ps -lq) scripts/clear_deleted.js
import async from 'async'
import logger from '@overleaf/logger'
import Settings from '@overleaf/settings'
import redis from '@overleaf/redis-wrapper'
import { db, ObjectId } from '../app/js/mongodb.js'
logger.logger.level('fatal')
const rclient = redis.createClient(Settings.redis.project_history)
const Keys = Settings.redis.project_history.key_schema
const argv = process.argv.slice(2)
const limit = parseInt(argv[0], 10) || null
const force = argv[1] === 'force' || false
let projectNotFoundErrors = 0
let projectImportedFromV1Errors = 0
const projectsNotFound = []
const projectsImportedFromV1 = []
let projectWithHistoryIdErrors = 0
const projectsWithHistoryId = []
function checkAndClear(project, callback) {
const projectId = project.project_id
console.log('checking project', projectId)
function checkDeleted(cb) {
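// a queue is only safe to clear when the project was never imported from v1,
// its live history id is gone, and a deleted_id records the old history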
db.projects.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { overleaf: true } },
(err, result) => {
console.log(
'1. looking in mongo projects collection: err',
err,
'result',
JSON.stringify(result)
)
if (err) {
return cb(err)
}
if (!result) {
return cb(new Error('project not found in mongo'))
}
if (
result &&
result.overleaf &&
!result.overleaf.id &&
result.overleaf.history &&
!result.overleaf.history.id &&
result.overleaf.history.deleted_id
) {
console.log(
' - project is not imported from v1 and has a deleted_id - ok to clear'
)
return cb()
} else if (result && result.overleaf && result.overleaf.id) {
console.log(' - project is imported from v1')
return cb(
new Error('project is imported from v1 - will not clear it')
)
} else if (
result &&
result.overleaf &&
result.overleaf.history &&
result.overleaf.history.id
) {
console.log(' - project has a history id')
return cb(new Error('project has a history id - will not clear it'))
} else {
console.log(' - project state not recognised')
return cb(new Error('project state not recognised'))
}
}
)
}
function clearRedisQueue(cb) {
const key = Keys.projectHistoryOps({ project_id: projectId })
if (force) {
console.log('deleting redis key', key)
rclient.del(key, err => {
cb(err)
})
} else {
console.log('dry run, would delete key', key)
cb()
}
}
function clearMongoEntry(cb) {
if (force) {
console.log('deleting key in mongo projectHistoryFailures', projectId)
db.projectHistoryFailures.deleteOne(
{ project_id: projectId },
(err, result) => {
console.log('got result from remove', err, result)
cb(err)
}
)
} else {
console.log('would delete failure record for', projectId, 'from mongo')
cb()
}
}
// do the checks and deletions
async.waterfall([checkDeleted, clearRedisQueue, clearMongoEntry], err => {
if (!err) {
if (force) {
return setTimeout(callback, 100)
} // include a 100ms delay
return callback()
} else if (err.message === 'project not found in mongo') {
projectNotFoundErrors++
projectsNotFound.push(projectId)
return callback()
} else if (err.message === 'project has a history id - will not clear it') {
projectWithHistoryIdErrors++
projectsWithHistoryId.push(projectId)
return callback()
} else if (
err.message === 'project is imported from v1 - will not clear it'
) {
projectImportedFromV1Errors++
projectsImportedFromV1.push(projectId)
return callback()
} else {
console.log('error:', err)
return callback(err)
}
})
}
// find all the broken projects from the failure records
async function main() {
const results = await db.projectHistoryFailures
.find({ error: /history store a non-success status code: 422/ })
.toArray()
console.log('number of queues with history store 422 =', results.length)
// now check if the project is truly deleted in mongo
async.eachSeries(results.slice(0, limit), checkAndClear, err => {
console.log('Final error status', err)
console.log(
'Project not found errors',
projectNotFoundErrors,
projectsNotFound
)
console.log(
'Project with history id errors',
projectWithHistoryIdErrors,
projectsWithHistoryId
)
console.log(
'Project imported from V1 errors',
projectImportedFromV1Errors,
projectsImportedFromV1
)
process.exit()
})
}
main().catch(error => {
console.error(error)
process.exit(1)
})

View File

@@ -0,0 +1,204 @@
#!/usr/bin/env node
// To run in dev:
//
// docker compose run --rm project-history scripts/clear_deleted.js
//
// In production:
//
// docker run --rm $(docker ps -lq) scripts/clear_deleted.js
import async from 'async'
import logger from '@overleaf/logger'
import request from 'request'
import Settings from '@overleaf/settings'
import redis from '@overleaf/redis-wrapper'
import { db, ObjectId } from '../app/js/mongodb.js'
logger.logger.level('fatal')
const rclient = redis.createClient(Settings.redis.project_history)
const Keys = Settings.redis.project_history.key_schema
const argv = process.argv.slice(2)
const limit = parseInt(argv[0], 10) || null
const force = argv[1] === 'force' || false
let projectNotFoundErrors = 0
let projectImportedFromV1Errors = 0
const projectsNotFound = []
const projectsImportedFromV1 = []
function checkAndClear(project, callback) {
const projectId = project.project_id
console.log('checking project', projectId)
// These can probably also be reset and their overleaf.history.id unset
// (unless they are v1 projects).
function checkNotV1Project(cb) {
db.projects.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { overleaf: true } },
(err, result) => {
console.log(
'1. looking in mongo projects collection: err',
err,
'result',
JSON.stringify(result)
)
if (err) {
return cb(err)
}
if (!result) {
return cb(new Error('project not found in mongo'))
}
if (result && result.overleaf && !result.overleaf.id) {
console.log(' - project is not imported from v1 - ok to clear')
cb()
} else {
cb(new Error('project is imported from v1 - will not clear it'))
}
}
)
}
function clearProjectHistoryInMongo(cb) {
if (force) {
console.log('2. deleting overleaf.history.id in mongo project', projectId)
// Accessing mongo projects collection directly - BE CAREFUL!
db.projects.updateOne(
{ _id: new ObjectId(projectId) },
{ $unset: { 'overleaf.history.id': '' } },
(err, result) => {
console.log(' - got result from remove', err, result)
if (err) {
return cb(err)
}
if (
result &&
(result.modifiedCount === 1 || result.modifiedCount === 0)
) {
return cb()
} else {
return cb(
new Error('error: problem trying to unset overleaf.history.id')
)
}
}
)
} else {
console.log(
'2. would delete overleaf.history.id for',
projectId,
'from mongo'
)
cb()
}
}
function clearDocUpdaterCache(cb) {
const url = Settings.apis.documentupdater.url + '/project/' + projectId
if (force) {
console.log('3. making request to clear docupdater', url)
request.delete(url, (err, response, body) => {
console.log(
' - result of request',
err,
response && response.statusCode,
body
)
cb(err)
})
} else {
console.log('3. dry run, would request DELETE on url', url)
cb()
}
}
function clearRedisQueue(cb) {
const key = Keys.projectHistoryOps({ project_id: projectId })
if (force) {
console.log('4. deleting redis queue key', key)
rclient.del(key, err => {
cb(err)
})
} else {
console.log('4. dry run, would delete redis key', key)
cb()
}
}
function clearMongoEntry(cb) {
if (force) {
console.log('5. deleting key in mongo projectHistoryFailures', projectId)
db.projectHistoryFailures.deleteOne(
{ project_id: projectId },
(err, result) => {
console.log(' - got result from remove', err, result)
cb(err)
}
)
} else {
console.log('5. would delete failure record for', projectId, 'from mongo')
cb()
}
}
// do the checks and deletions
async.waterfall(
[
checkNotV1Project,
clearProjectHistoryInMongo,
clearDocUpdaterCache,
clearRedisQueue,
clearMongoEntry,
],
err => {
if (!err) {
return setTimeout(callback, 1000) // include a 1 second delay
} else if (err.message === 'project not found in mongo') {
projectNotFoundErrors++
projectsNotFound.push(projectId)
return callback()
} else if (
err.message === 'project is imported from v1 - will not clear it'
) {
projectImportedFromV1Errors++
projectsImportedFromV1.push(projectId)
return callback()
} else {
console.log('error:', err)
return callback(err)
}
}
)
}
// find all the broken projects from the failure records
async function main() {
const results = await db.projectHistoryFailures
.find({ error: 'Error: bad response from filestore: 404' })
.toArray()
console.log('number of queues with filestore 404 =', results.length)
// now check if each project is safe to clear
async.eachSeries(results.slice(0, limit), checkAndClear, err => {
console.log('Final error status', err)
console.log(
'Project not found errors',
projectNotFoundErrors,
projectsNotFound
)
console.log(
'Project imported from V1 errors',
projectImportedFromV1Errors,
projectsImportedFromV1
)
process.exit()
})
}
main().catch(error => {
console.error(error)
process.exit(1)
})

View File

@@ -0,0 +1,260 @@
#!/usr/bin/env node
// To run in dev:
//
// docker compose run --rm project-history scripts/clear_deleted.js
//
// In production:
//
// docker run --rm $(docker ps -lq) scripts/clear_deleted.js
import async from 'async'
import logger from '@overleaf/logger'
import request from 'request'
import Settings from '@overleaf/settings'
import redis from '@overleaf/redis-wrapper'
import { db, ObjectId } from '../app/js/mongodb.js'
logger.logger.level('fatal')
const rclient = redis.createClient(Settings.redis.project_history)
const Keys = Settings.redis.project_history.key_schema
const argv = process.argv.slice(2)
const limit = parseInt(argv[0], 10) || null
const force = argv[1] === 'force' || false
let projectNotFoundErrors = 0
let projectImportedFromV1Errors = 0
const projectsNotFound = []
const projectsImportedFromV1 = []
let projectHasV2HistoryErrors = 0
const projectsV2HistoryInUse = []
function checkAndClear(project, callback) {
const projectId = project.project_id
console.log('checking project', projectId)
// These can probably also be reset and their overleaf.history.id unset
// (unless they are v1 projects).
function checkNotV1Project(cb) {
db.projects.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { overleaf: true } },
(err, result) => {
console.log(
'1. looking in mongo projects collection: err',
err,
'result',
JSON.stringify(result)
)
if (err) {
return cb(err)
}
if (!result) {
return cb(new Error('project not found in mongo'))
}
const isV1Project = result && result.overleaf && result.overleaf.id
const hasHistoryId =
result &&
result.overleaf &&
result.overleaf.history &&
result.overleaf.history.id
const hasV2HistoryInUse =
result &&
result.overleaf &&
result.overleaf.history &&
result.overleaf.history.display
const hasExistingDeletedHistory =
result &&
result.overleaf &&
result.overleaf.history &&
result.overleaf.history.deleted_id
if (
hasHistoryId &&
!(isV1Project || hasV2HistoryInUse || hasExistingDeletedHistory)
) {
console.log(
' - project is not imported from v1 and v2 history is not in use - ok to clear'
)
return cb()
} else if (hasHistoryId && hasExistingDeletedHistory) {
console.log(' - project already has deleted_id')
return cb(
new Error('project already has deleted_id - will not clear it')
)
} else if (hasHistoryId && isV1Project) {
console.log(' - project is imported from v1')
return cb(
new Error('project is imported from v1 - will not clear it')
)
} else if (hasHistoryId && hasV2HistoryInUse) {
console.log(' - project is displaying v2 history')
return cb(
new Error('project is displaying v2 history - will not clear it')
)
} else {
console.log(' - project state not recognised')
return cb(new Error('project state not recognised'))
}
}
)
}
function clearProjectHistoryInMongo(cb) {
if (force) {
console.log('2. renaming overleaf.history.id to deleted_id in mongo project', projectId)
// Accessing mongo projects collection directly - BE CAREFUL!
db.projects.updateOne(
{ _id: new ObjectId(projectId) },
{ $rename: { 'overleaf.history.id': 'overleaf.history.deleted_id' } },
(err, result) => {
console.log(' - got result from remove', err, result)
if (err) {
return cb(err)
}
if (
result &&
(result.modifiedCount === 1 || result.modifiedCount === 0)
) {
return cb()
} else {
return cb(
new Error('error: problem trying to rename overleaf.history.id')
)
}
}
)
} else {
console.log(
'2. would delete overleaf.history.id for',
projectId,
'from mongo'
)
cb()
}
}
function clearDocUpdaterCache(cb) {
const url = Settings.apis.documentupdater.url + '/project/' + projectId
if (force) {
console.log('3. making request to clear docupdater', url)
request.delete(url, (err, response, body) => {
console.log(
' - result of request',
err,
response && response.statusCode,
body
)
cb(err)
})
} else {
console.log('3. dry run, would request DELETE on url', url)
cb()
}
}
function clearRedisQueue(cb) {
const key = Keys.projectHistoryOps({ project_id: projectId })
if (force) {
console.log('4. deleting redis queue key', key)
rclient.del(key, err => {
cb(err)
})
} else {
console.log('4. dry run, would delete redis key', key)
cb()
}
}
function clearMongoEntry(cb) {
if (force) {
console.log('5. deleting key in mongo projectHistoryFailures', projectId)
db.projectHistoryFailures.deleteOne(
{ project_id: projectId },
(err, result) => {
console.log(' - got result from remove', err, result)
cb(err)
}
)
} else {
console.log('5. would delete failure record for', projectId, 'from mongo')
cb()
}
}
// do the checks and deletions
async.waterfall(
[
checkNotV1Project,
clearProjectHistoryInMongo,
clearDocUpdaterCache,
clearRedisQueue,
clearMongoEntry,
],
err => {
if (!err) {
return setTimeout(callback, 100) // include a delay
} else if (err.message === 'project not found in mongo') {
projectNotFoundErrors++
projectsNotFound.push(projectId)
return callback()
} else if (
err.message === 'project is imported from v1 - will not clear it'
) {
projectImportedFromV1Errors++
projectsImportedFromV1.push(projectId)
return callback()
} else if (
err.message === 'project is displaying v2 history - will not clear it'
) {
projectHasV2HistoryErrors++
projectsV2HistoryInUse.push(projectId)
return callback()
} else {
console.log('error:', err)
return callback(err)
}
}
)
}
// find all the broken projects from the failure records
async function main() {
const results = await db.projectHistoryFailures
.find({
error:
'OpsOutOfOrderError: project structure version out of order on incoming updates',
})
.toArray()
console.log(
'number of queues with project structure version out of order on incoming updates =',
results.length
)
// now clear the projects
async.eachSeries(results.slice(0, limit), checkAndClear, err => {
console.log('Final error status', err)
console.log(
'Project not found errors',
projectNotFoundErrors,
projectsNotFound
)
console.log(
'Project imported from V1 errors',
projectImportedFromV1Errors,
projectsImportedFromV1
)
console.log(
'Project has V2 history in use',
projectHasV2HistoryErrors,
projectsV2HistoryInUse
)
process.exit()
})
}
main().catch(error => {
console.error(error)
process.exit(1)
})

View File

@@ -0,0 +1,74 @@
#!/usr/bin/env node
/**
* This script takes a dump file, obtained via the /project/:project_id/dump
* endpoint, and feeds it to the update translator to show how updates are
* transformed into changes sent to v1 history.
*/
import fs from 'node:fs'
import * as UpdateTranslator from '../app/js/UpdateTranslator.js'
import * as SyncManager from '../app/js/SyncManager.js'
import * as HistoryStoreManager from '../app/js/HistoryStoreManager.js'
const { filename } = parseArgs()
const { projectId, updates, chunk } = parseDumpFile(filename)
function expandResyncProjectStructure(chunk, update) {
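// stub out the history store lookup so expandSyncUpdates reads the chunk
// from the dump file instead of calling history-v1 over HTTP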
HistoryStoreManager._mocks.getMostRecentChunk = function (
projectId,
projectHistoryId,
callback
) {
callback(null, chunk)
}
SyncManager.expandSyncUpdates(
projectId,
99999, // dummy history id
chunk,
[update],
cb => cb(), // extend lock
(err, result) => {
console.log('err', err, 'result', JSON.stringify(result, null, 2))
process.exit()
}
)
}
function expandUpdates(updates) {
const wrappedUpdates = updates.map(update => ({ update }))
let changes
try {
changes = UpdateTranslator.convertToChanges(projectId, wrappedUpdates)
} catch (err) {
error(err)
}
console.log(JSON.stringify(changes, null, 2))
}
if (updates[0].resyncProjectStructure) {
expandResyncProjectStructure(chunk, updates[0])
} else {
expandUpdates(updates)
}
function parseArgs() {
const args = process.argv.slice(2)
if (args.length !== 1) {
console.log('Usage: debug_translate_updates.js DUMP_FILE')
process.exit(1)
}
const filename = args[0]
return { filename }
}
function parseDumpFile(filename) {
const json = fs.readFileSync(filename)
const { project_id: projectId, updates, chunk } = JSON.parse(json)
return { projectId, updates, chunk }
}
function error(err) {
console.error(err)
process.exit(1)
}

View File

@@ -0,0 +1,93 @@
#!/usr/bin/env node
// To run in dev:
//
// docker compose run --rm project-history scripts/flush_all.js <limit> <parallelism>
//
// In production:
//
// docker run --rm $(docker ps -lq) scripts/flush_all.js <limit> <parallelism>
import _ from 'lodash'
import async from 'async'
import logger from '@overleaf/logger'
import * as RedisManager from '../app/js/RedisManager.js'
import * as UpdatesProcessor from '../app/js/UpdatesProcessor.js'
logger.logger.level('fatal')
const argv = process.argv.slice(2)
const limit = parseInt(argv[0], 10) || null
const parallelism = Math.min(parseInt(argv[1], 10) || 1, 10)
// flush all outstanding changes
RedisManager.getProjectIdsWithHistoryOps(limit, flushProjects)
function flushProjects(error, projectIds) {
if (error) {
throw error
}
let ts = new Date()
console.log(
'found projects',
JSON.stringify({ project_ids: projectIds.length, limit, ts })
)
projectIds = _.shuffle(projectIds) // randomise to avoid hitting same projects each time
if (limit > 0) {
projectIds = projectIds.slice(0, limit)
}
let succeededProjects = 0
let failedProjects = 0
let attempts = 0
async.eachLimit(
projectIds,
parallelism,
function (projectId, cb) {
attempts++
UpdatesProcessor.processUpdatesForProject(
projectId,
function (err, queueSize) {
const progress = attempts + '/' + projectIds.length
ts = new Date()
if (err) {
failedProjects++
console.log(
'failed',
progress,
JSON.stringify({
projectId,
queueSize,
ts,
err: err.toString(),
})
)
} else {
succeededProjects++
console.log(
'succeeded',
progress,
JSON.stringify({
projectId,
queueSize,
ts,
})
)
}
return cb()
}
)
},
function () {
console.log(
'total',
JSON.stringify({
succeededProjects,
failedProjects,
})
)
process.exit(0)
}
)
}

View File

@@ -0,0 +1,191 @@
#!/usr/bin/env node
import Settings from '@overleaf/settings'
import minimist from 'minimist'
import logger from '@overleaf/logger'
import PQueue from 'p-queue'
import * as RedisManager from '../app/js/RedisManager.js'
import * as ErrorRecorder from '../app/js/ErrorRecorder.js'
logger.logger.level('fatal')
function usage() {
console.log(`
Usage: flush_old.js [options]
Options:
-b, --batch-size <size> Number of projects to process in each batch (default: 100)
-a, --max-age <seconds> Maximum age of projects to keep (default: 3600)
-i, --interval <seconds> Interval to spread the processing over (default: 300)
-c, --concurrency <number> Number of concurrent jobs (default: 10)
-u, --buffer <seconds> Buffer time in seconds to reserve at end (default: 15)
-n, --dry-run Show what would be done without making changes
-h, --help Show this help message
Examples:
# Flush projects older than 24 hours with 5 concurrent jobs
flush_old.js --batch-size 100 --max-age 86400 -c 5
# Dry run to see what would be flushed
flush_old.js --max-age 3600 --dry-run
`)
process.exit(0)
}
const argv = minimist(process.argv.slice(2), {
boolean: ['dry-run', 'help'],
alias: {
b: 'batch-size',
a: 'max-age',
i: 'interval',
c: 'concurrency',
n: 'dry-run',
u: 'buffer',
h: 'help',
},
default: {
'batch-size': 100,
'max-age': 3600,
interval: 300,
concurrency: 10,
'dry-run': false,
buffer: 15,
help: false,
},
})
if (argv.help || process.argv.length === 2) {
usage()
}
const batchSize = parseInt(argv['batch-size'], 10)
const maxAge = argv['max-age'] ? parseInt(argv['max-age'], 10) : null
const interval = parseInt(argv.interval, 10) || 300
const concurrency = parseInt(argv.concurrency, 10) || 10
const bufferTime = parseInt(argv.buffer, 10) || 15
const dryRun = argv['dry-run']
/**
* Generator function that yields batches of items from an array
* @param {Array} array - The array to batch
* @param {number} size - The size of each batch
* @yields {Array} A batch of items
*/
function* getBatches(array, size) {
for (let i = 0; i < array.length; i += size) {
yield array.slice(i, i + size)
}
}
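// e.g. getBatches([1, 2, 3, 4, 5], 2) yields [1, 2], then [3, 4], then [5]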
let flushCount = 0
async function flushProject({ projectId, timestamp }) {
const url = `${Settings.apis.project_history.url}/project/${projectId}/flush`
if (dryRun) {
console.log(`[DRY RUN] would flush project ${projectId}`)
return
}
const response = await fetch(url, {
method: 'POST',
})
flushCount++
if (flushCount % 100 === 0) {
console.log('flushed', flushCount, 'projects, up to', timestamp)
}
if (!response.ok) {
throw new Error(`failed to flush project ${projectId}`)
}
}
const SCRIPT_START_TIME = Date.now() // current time in milliseconds from start of script
function olderThan(maxAge, timestamp) {
const age = (SCRIPT_START_TIME - timestamp) / 1000
return age > maxAge
}
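// e.g. with --max-age 3600, a first-op timestamp recorded two hours before
// the script started counts as old and the project gets flushed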
async function main() {
const projectIds = await RedisManager.promises.getProjectIdsWithHistoryOps()
const failedProjects = await ErrorRecorder.promises.getFailedProjects()
const failedProjectIds = new Set(failedProjects.map(p => p.project_id))
const projectIdsToProcess = projectIds.filter(p => !failedProjectIds.has(p))
console.log('number of projects with history ops', projectIds.length)
console.log(
'number of failed projects to exclude',
projectIds.length - projectIdsToProcess.length
)
const collectedProjects = []
let nullCount = 0
// iterate over the project ids in batches, doing a redis MGET on each batch to retrieve the first op timestamps
for (const batch of getBatches(projectIdsToProcess, batchSize)) {
const timestamps = await RedisManager.promises.getFirstOpTimestamps(batch)
const newProjects = batch
.map((projectId, idx) => {
return { projectId, timestamp: timestamps[idx] }
})
.filter(({ timestamp }) => {
if (!timestamp) {
nullCount++
}
return timestamp ? olderThan(maxAge, timestamp) : true
})
collectedProjects.push(...newProjects)
}
// sort the collected projects by ascending timestamp
collectedProjects.sort((a, b) => a.timestamp - b.timestamp)
console.log('number of projects to flush', collectedProjects.length)
console.log('number with null timestamps', nullCount)
const elapsedTime = Math.floor((Date.now() - SCRIPT_START_TIME) / 1000)
console.log('elapsed time', elapsedTime, 'seconds, buffer time', bufferTime)
const remainingTime = Math.max(interval - elapsedTime - bufferTime, 0)
console.log('remaining time', remainingTime, 'seconds')
const jobsPerSecond = Math.max(
Math.ceil(collectedProjects.length / Math.max(remainingTime, 60)),
1
)
console.log('interval', interval, 'seconds')
console.log('jobs per second', jobsPerSecond)
console.log('concurrency', concurrency)
const queue = new PQueue({
concurrency,
interval: 1000,
intervalCap: jobsPerSecond,
})
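// p-queue runs at most `concurrency` jobs at once and releases at most
// `intervalCap` new jobs per 1000ms interval, spreading the flushes evenly
// over the remaining window instead of firing them all at once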
const taskFns = collectedProjects.map(project => {
return async () => {
try {
await flushProject(project)
return { status: 'fulfilled', value: project }
} catch (error) {
return { status: 'rejected', reason: error, project }
}
}
})
const results = await queue.addAll(taskFns)
console.log(
'finished after',
Math.floor((Date.now() - SCRIPT_START_TIME) / 1000),
'seconds'
)
// count the number of successful and failed flushes
const success = results.filter(r => r.status === 'fulfilled').length
const failed = results.filter(r => r.status === 'rejected').length
console.log('completed', { success, failed })
}
main()
.then(() => {
process.exit(0)
})
.catch(err => {
console.error(err)
process.exit(1)
})

View File

@@ -0,0 +1,233 @@
#!/usr/bin/env node
// To run in dev:
//
// docker compose run --rm project-history scripts/clear_deleted.js
//
// In production:
//
// docker run --rm $(docker ps -lq) scripts/clear_deleted.js
import async from 'async'
import Settings from '@overleaf/settings'
import redis from '@overleaf/redis-wrapper'
import { db, ObjectId } from '../app/js/mongodb.js'
import * as SyncManager from '../app/js/SyncManager.js'
import * as UpdatesProcessor from '../app/js/UpdatesProcessor.js'
const rclient = redis.createClient(Settings.redis.project_history)
const Keys = Settings.redis.project_history.key_schema
const argv = process.argv.slice(2)
const limit = parseInt(argv[0], 10) || null
const force = argv[1] === 'force' || false
let projectNotFoundErrors = 0
let projectImportedFromV1Errors = 0
const projectsNotFound = []
const projectsImportedFromV1 = []
let projectNoHistoryIdErrors = 0
let projectsFailedErrors = 0
const projectsFailed = []
let projectsBrokenSyncErrors = 0
const projectsBrokenSync = []
function checkAndClear(project, callback) {
const projectId = project.project_id
console.log('checking project', projectId)
// These can probably also be reset and their overleaf.history.id unset
// (unless they are v1 projects).
function checkNotV1Project(cb) {
db.projects.findOne(
{ _id: new ObjectId(projectId) },
{ projection: { overleaf: true } },
(err, result) => {
console.log(
'1. looking in mongo projects collection: err',
err,
'result',
JSON.stringify(result)
)
if (err) {
return cb(err)
}
if (!result) {
return cb(new Error('project not found in mongo'))
}
if (result && result.overleaf && !result.overleaf.id) {
if (result.overleaf.history && result.overleaf.history.id) {
console.log(
' - project is not imported from v1 and has a history id - ok to resync'
)
return cb()
} else {
console.log(
' - project is not imported from v1 but does not have a history id'
)
return cb(new Error('no history id'))
}
} else {
cb(new Error('project is imported from v1 - will not resync it'))
}
}
)
}
function startResync(cb) {
if (force) {
console.log('2. starting resync for', projectId)
SyncManager.startHardResync(projectId, err => {
if (err) {
console.log('ERR', JSON.stringify(err.message))
return cb(err)
}
setTimeout(cb, 3000) // include a delay to allow the request to be processed
})
} else {
console.log('2. dry run, would start resync for', projectId)
cb()
}
}
function forceFlush(cb) {
if (force) {
console.log('3. forcing a flush for', projectId)
UpdatesProcessor.processUpdatesForProject(projectId, err => {
console.log('err', err)
return cb(err)
})
} else {
console.log('3. dry run, would force a flush for', projectId)
cb()
}
}
function watchRedisQueue(cb) {
const key = Keys.projectHistoryOps({ project_id: projectId })
function checkQueueEmpty(_callback) {
rclient.llen(key, (err, result) => {
console.log('LLEN', projectId, err, result)
if (err) {
_callback(err)
}
if (result === 0) {
_callback()
} else {
_callback(new Error('queue not empty'))
}
})
}
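// poll the queue length for up to 30 seconds; the flush in the previous step
// should drain it to zero once project-history has processed the resync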
if (force) {
console.log('4. checking redis queue key', key)
async.retry({ times: 30, interval: 1000 }, checkQueueEmpty, err => {
cb(err)
})
} else {
console.log('4. dry run, would check redis key', key)
cb()
}
}
function checkMongoFailureEntry(cb) {
if (force) {
console.log('5. checking key in mongo projectHistoryFailures', projectId)
db.projectHistoryFailures.findOne(
{ project_id: projectId },
{ projection: { _id: 1 } },
(err, result) => {
console.log('got result', err, result)
if (err) {
return cb(err)
}
if (result) {
return cb(new Error('failure record still exists'))
}
return cb()
}
)
} else {
console.log('5. would check failure record for', projectId, 'in mongo')
cb()
}
}
// do the checks and deletions
async.waterfall(
[
checkNotV1Project,
startResync,
forceFlush,
watchRedisQueue,
checkMongoFailureEntry,
],
err => {
if (!err) {
return setTimeout(callback, 1000) // include a 1 second delay
} else if (err.message === 'project not found in mongo') {
projectNotFoundErrors++
projectsNotFound.push(projectId)
return callback()
} else if (err.message === 'no history id') {
projectNoHistoryIdErrors++
return callback()
} else if (
err.message === 'project is imported from v1 - will not resync it'
) {
projectImportedFromV1Errors++
projectsImportedFromV1.push(projectId)
return callback()
} else if (
err.message === 'history store a non-success status code: 422'
) {
projectsFailedErrors++
projectsFailed.push(projectId)
return callback()
} else if (err.message === 'sync ongoing') {
projectsBrokenSyncErrors++
projectsBrokenSync.push(projectId)
return callback()
} else {
console.log('error:', err)
return callback()
}
}
)
}
async function main() {
const results = await db.projectHistoryFailures.find().toArray()
console.log('number of failed project queues =', results.length)
// now try to resync each project
async.eachSeries(results.slice(0, limit), checkAndClear, err => {
console.log('Final error status', err)
console.log(
'Project flush failed again errors',
projectsFailedErrors,
projectsFailed
)
console.log(
'Project flush ongoing errors',
projectsBrokenSyncErrors,
projectsBrokenSync
)
console.log(
'Project not found errors',
projectNotFoundErrors,
projectsNotFound
)
console.log('Project without history_id errors', projectNoHistoryIdErrors)
console.log(
'Project imported from V1 errors',
projectImportedFromV1Errors,
projectsImportedFromV1
)
process.exit()
})
}
main().catch(error => {
console.error(error)
process.exit(1)
})