first commit

2025-04-24 13:11:28 +08:00
commit ff9c54d5e4
5960 changed files with 834111 additions and 0 deletions


@@ -0,0 +1,425 @@
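// Checks that the docs held in redis are in sync with mongo, optionally
// fixing version mismatches and partially deleted doc metadata, and
// optionally writing the content of out-of-sync docs to FOLDER.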
const fs = require('node:fs')
const Path = require('node:path')
const _ = require('lodash')
const logger = require('@overleaf/logger')
const OError = require('@overleaf/o-error')
const Errors = require('../app/js/Errors')
const LockManager = require('../app/js/LockManager')
const PersistenceManager = require('../app/js/PersistenceManager')
const ProjectFlusher = require('../app/js/ProjectFlusher')
const ProjectManager = require('../app/js/ProjectManager')
const RedisManager = require('../app/js/RedisManager')
const Settings = require('@overleaf/settings')
const request = require('requestretry').defaults({
maxAttempts: 2,
retryDelay: 10,
})
const AUTO_FIX_VERSION_MISMATCH =
process.env.AUTO_FIX_VERSION_MISMATCH === 'true'
const AUTO_FIX_PARTIALLY_DELETED_DOC_METADATA =
process.env.AUTO_FIX_PARTIALLY_DELETED_DOC_METADATA === 'true'
const SCRIPT_LOG_LEVEL = process.env.SCRIPT_LOG_LEVEL || 'warn'
const FLUSH_IN_SYNC_PROJECTS = process.env.FLUSH_IN_SYNC_PROJECTS === 'true'
const FOLDER =
process.env.FOLDER || '/tmp/overleaf-check-redis-mongo-sync-state'
const LIMIT = parseInt(process.env.LIMIT || '1000', 10)
const RETRIES = parseInt(process.env.RETRIES || '5', 10)
const WRITE_CONTENT = process.env.WRITE_CONTENT === 'true'
process.env.LOG_LEVEL = SCRIPT_LOG_LEVEL
logger.initialize('check-redis-mongo-sync-state')
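// Lua script that atomically sets KEYS[1] to ARGV[2] only while it still
// holds the expected value ARGV[1], so a concurrent version change aborts
// the overwrite.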
const COMPARE_AND_SET =
'if redis.call("get", KEYS[1]) == ARGV[1] then return redis.call("set", KEYS[1], ARGV[2]) else return 0 end'
/**
* @typedef {Object} Doc
* @property {number} version
* @property {Array<string>} lines
* @property {string} pathname
* @property {Object} ranges
* @property {boolean} [partiallyDeleted]
*/
class TryAgainError extends Error {}
/**
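 * Overwrite the doc version in redis with the version from mongo, using
 * the compare-and-set script under the doc lock so that a concurrent
 * update aborts the overwrite (the caller retries via TryAgainError).
 *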
* @param {string} docId
* @param {Doc} redisDoc
* @param {Doc} mongoDoc
* @return {Promise<void>}
*/
async function updateDocVersionInRedis(docId, redisDoc, mongoDoc) {
const lockValue = await LockManager.promises.getLock(docId)
try {
const key = Settings.redis.documentupdater.key_schema.docVersion({
doc_id: docId,
})
const numberOfKeys = 1
const ok = await RedisManager.rclient.eval(
COMPARE_AND_SET,
numberOfKeys,
key,
redisDoc.version,
mongoDoc.version
)
if (!ok) {
throw new TryAgainError(
'document has been updated, aborting overwrite. Try again.'
)
}
} finally {
await LockManager.promises.releaseLock(docId, lockValue)
}
}
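// Patch the docstore record of a partially deleted doc, marking it as
// deleted and restoring its name from the pathname held in redis.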
async function fixPartiallyDeletedDocMetadata(projectId, docId, pathname) {
await new Promise((resolve, reject) => {
request(
{
method: 'PATCH',
url: `http://${process.env.DOCSTORE_HOST || '127.0.0.1'}:3016/project/${projectId}/doc/${docId}`,
timeout: 60 * 1000,
json: {
name: Path.basename(pathname),
deleted: true,
deletedAt: new Date(),
},
},
(err, res, body) => {
if (err) return reject(err)
const { statusCode } = res
if (statusCode !== 204) {
return reject(
new OError('patch request to docstore failed', {
statusCode,
body,
})
)
}
resolve()
}
)
})
}
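// Fetch a doc from mongo via the regular persistence layer, falling back
// to peeking at docstore directly when it reports the doc as not found,
// and recovering the original name of deleted docs where possible.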
async function getDocFromMongo(projectId, docId) {
try {
return await PersistenceManager.promises.getDoc(projectId, docId)
} catch (err) {
if (!(err instanceof Errors.NotFoundError)) {
throw err
}
}
const docstoreDoc = await new Promise((resolve, reject) => {
request(
{
url: `http://${process.env.DOCSTORE_HOST || '127.0.0.1'}:3016/project/${projectId}/doc/${docId}/peek`,
timeout: 60 * 1000,
json: true,
},
(err, res, body) => {
if (err) return reject(err)
const { statusCode } = res
if (statusCode !== 200) {
return reject(
new OError('fallback request to docstore failed', {
statusCode,
body,
})
)
}
resolve(body)
}
)
})
const deletedDocName = await new Promise((resolve, reject) => {
request(
{
url: `http://${process.env.DOCSTORE_HOST || '127.0.0.1'}:3016/project/${projectId}/doc-deleted`,
timeout: 60 * 1000,
json: true,
},
(err, res, body) => {
if (err) return reject(err)
const { statusCode } = res
if (statusCode !== 200) {
return reject(
new OError('list deleted docs request to docstore failed', {
statusCode,
body,
})
)
}
resolve(body.find(doc => doc._id === docId)?.name)
}
)
})
if (docstoreDoc.deleted && deletedDocName) {
return {
...docstoreDoc,
pathname: deletedDocName,
}
}
return {
...docstoreDoc,
pathname: `/partially-deleted-doc-with-unknown-name-and-id-${docId}.txt`,
partiallyDeleted: true,
}
}
/**
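 * Compare the redis and mongo copies of a single doc, optionally fixing
 * version mismatches and partially deleted metadata. Returns true when
 * the doc is out of sync and needs attention.
 *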
* @param {string} projectId
* @param {string} docId
* @return {Promise<boolean>}
*/
async function processDoc(projectId, docId) {
const redisDoc = /** @type Doc */ await RedisManager.promises.getDoc(
projectId,
docId
)
const mongoDoc = /** @type Doc */ await getDocFromMongo(projectId, docId)
if (mongoDoc.partiallyDeleted) {
if (AUTO_FIX_PARTIALLY_DELETED_DOC_METADATA) {
console.log(
`Found partially deleted doc ${docId} in project ${projectId}: fixing metadata`
)
await fixPartiallyDeletedDocMetadata(projectId, docId, redisDoc.pathname)
} else {
console.log(
`Found partially deleted doc ${docId} in project ${projectId}: use AUTO_FIX_PARTIALLY_DELETED_DOC_METADATA=true to fix metadata`
)
}
}
if (mongoDoc.version < redisDoc.version) {
// mongo is behind, we can flush to mongo when all docs are processed.
return false
}
mongoDoc.snapshot = mongoDoc.lines.join('\n')
redisDoc.snapshot = redisDoc.lines.join('\n')
if (!mongoDoc.ranges) mongoDoc.ranges = {}
if (!redisDoc.ranges) redisDoc.ranges = {}
const sameLines = mongoDoc.snapshot === redisDoc.snapshot
const sameRanges = _.isEqual(mongoDoc.ranges, redisDoc.ranges)
if (sameLines && sameRanges) {
if (mongoDoc.version > redisDoc.version) {
// mongo is ahead, technically out of sync, but practically the content is identical
if (AUTO_FIX_VERSION_MISMATCH) {
console.log(
`Fixing out of sync doc version for doc ${docId} in project ${projectId}: mongo=${mongoDoc.version} > redis=${redisDoc.version}`
)
await updateDocVersionInRedis(docId, redisDoc, mongoDoc)
return false
} else {
console.error(
`Detected out of sync redis and mongo version for doc ${docId} in project ${projectId}, auto-fixable via AUTO_FIX_VERSION_MISMATCH=true`
)
return true
}
} else {
// same lines, same ranges, same version
return false
}
}
const dir = Path.join(FOLDER, projectId, docId)
console.error(
`Detected out of sync redis and mongo content for doc ${docId} in project ${projectId}`
)
if (!WRITE_CONTENT) return true
console.log(`pathname: ${mongoDoc.pathname}`)
if (mongoDoc.pathname !== redisDoc.pathname) {
console.log(`pathname redis: ${redisDoc.pathname}`)
}
console.log(`mongo version: ${mongoDoc.version}`)
console.log(`redis version: ${redisDoc.version}`)
await fs.promises.mkdir(dir, { recursive: true })
if (sameLines) {
console.log('mongo lines match redis lines')
} else {
console.log(
`mongo lines and redis lines out of sync, writing content into ${dir}`
)
await fs.promises.writeFile(
Path.join(dir, 'mongo-snapshot.txt'),
mongoDoc.snapshot
)
await fs.promises.writeFile(
Path.join(dir, 'redis-snapshot.txt'),
redisDoc.snapshot
)
}
if (sameRanges) {
console.log('mongo ranges match redis ranges')
} else {
console.log(
`mongo ranges and redis ranges out of sync, writing content into ${dir}`
)
await fs.promises.writeFile(
Path.join(dir, 'mongo-ranges.json'),
JSON.stringify(mongoDoc.ranges)
)
await fs.promises.writeFile(
Path.join(dir, 'redis-ranges.json'),
JSON.stringify(redisDoc.ranges)
)
}
console.log('---')
return true
}
/**
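 * Check every doc of the given project that is loaded in redis, retrying
 * each doc up to RETRIES times. Returns the number of out-of-sync docs.
 *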
* @param {string} projectId
* @return {Promise<number>}
*/
async function processProject(projectId) {
const docIds = await RedisManager.promises.getDocIdsInProject(projectId)
let outOfSync = 0
for (const docId of docIds) {
let lastErr
for (let i = 0; i <= RETRIES; i++) {
try {
if (await processDoc(projectId, docId)) {
outOfSync++
}
        // clear the error from a previous failed attempt so that a
        // successful retry is not reported as a failure
        lastErr = undefined
        break
} catch (err) {
lastErr = err
}
}
if (lastErr) {
throw OError.tag(lastErr, 'process doc', { docId })
}
}
if (outOfSync === 0 && FLUSH_IN_SYNC_PROJECTS) {
try {
await ProjectManager.promises.flushAndDeleteProjectWithLocks(
projectId,
{}
)
} catch (err) {
throw OError.tag(err, 'flush project with only in-sync docs')
}
}
return outOfSync
}
/**
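 * Run one scan pass over the projects currently in redis (using a dry-run
 * flush to enumerate them), skipping projects already processed.
 *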
* @param {Set<string>} processed
* @param {Set<string>} outOfSync
* @return {Promise<{perIterationOutOfSync: number, done: boolean}>}
*/
async function scanOnce(processed, outOfSync) {
const projectIds = await ProjectFlusher.promises.flushAllProjects({
limit: LIMIT,
dryRun: true,
})
let perIterationOutOfSync = 0
for (const projectId of projectIds) {
if (processed.has(projectId)) continue
processed.add(projectId)
let perProjectOutOfSync = 0
try {
perProjectOutOfSync = await processProject(projectId)
} catch (err) {
throw OError.tag(err, 'process project', { projectId })
}
perIterationOutOfSync += perProjectOutOfSync
if (perProjectOutOfSync > 0) {
outOfSync.add(projectId)
}
}
return { perIterationOutOfSync, done: projectIds.length < LIMIT }
}
/**
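 * Returns the process exit code: 0 if all docs are in sync, 1 if any doc
 * is out of sync, 2 if the scan stopped making progress, 101 if FOLDER
 * already contains entries from a previous run.
 *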
* @return {Promise<number>}
*/
async function main() {
if (!WRITE_CONTENT) {
console.warn()
console.warn(
` Use WRITE_CONTENT=true to write the content of out of sync docs to FOLDER=${FOLDER}`
)
console.warn()
} else {
console.log(
`Writing content for projects with out of sync docs into FOLDER=${FOLDER}`
)
await fs.promises.mkdir(FOLDER, { recursive: true })
const existing = await fs.promises.readdir(FOLDER)
if (existing.length > 0) {
console.warn()
console.warn(
` Found existing entries in FOLDER=${FOLDER}. Please delete or move these before running the script again.`
)
console.warn()
return 101
}
}
if (LIMIT < 100) {
console.warn()
console.warn(
` Using small LIMIT=${LIMIT}, this can take a while to SCAN in a large redis database.`
)
console.warn()
}
const processed = new Set()
const outOfSyncProjects = new Set()
let totalOutOfSyncDocs = 0
while (true) {
const before = processed.size
const { perIterationOutOfSync, done } = await scanOnce(
processed,
outOfSyncProjects
)
totalOutOfSyncDocs += perIterationOutOfSync
console.log(`Processed ${processed.size} projects`)
console.log(
`Found ${
outOfSyncProjects.size
} projects with ${totalOutOfSyncDocs} out of sync docs: ${JSON.stringify(
Array.from(outOfSyncProjects)
)}`
)
if (done) {
console.log('Finished iterating all projects in redis')
break
}
if (processed.size === before) {
console.error(
`Found too many un-flushed projects (LIMIT=${LIMIT}). Please fix the reported projects first, then try again.`
)
if (!FLUSH_IN_SYNC_PROJECTS) {
console.error(
'Use FLUSH_IN_SYNC_PROJECTS=true to flush projects that have been checked.'
)
}
return 2
}
}
return totalOutOfSyncDocs > 0 ? 1 : 0
}
main()
.then(code => {
process.exit(code)
})
.catch(error => {
console.error(OError.getFullStack(error))
console.error(OError.getFullInfo(error))
process.exit(1)
})


@@ -0,0 +1,65 @@
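// Sets a TTL on all docOps keys in redis (scanning every cluster node) so
// that stale per-doc operation lists eventually expire.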
const Settings = require('@overleaf/settings')
const rclient = require('@overleaf/redis-wrapper').createClient(
Settings.redis.documentupdater
)
const keys = Settings.redis.documentupdater.key_schema
const async = require('async')
const RedisManager = require('../app/js/RedisManager')
const getKeysFromNode = function (node, pattern, callback) {
let cursor = 0 // redis iterator
const keySet = {} // use hash to avoid duplicate results
// scan over all keys looking for pattern
const doIteration = () =>
node.scan(cursor, 'MATCH', pattern, 'COUNT', 1000, function (error, reply) {
if (error) {
return callback(error)
}
      // keep the scan results in a local variable so we do not clobber
      // the key_schema helper held in the outer `keys` constant
      let batch
      ;[cursor, batch] = reply
      console.log('SCAN', batch.length)
      for (const key of batch) {
keySet[key] = true
}
if (cursor === '0') {
// note redis returns string result not numeric
return callback(null, Object.keys(keySet))
} else {
return doIteration()
}
})
return doIteration()
}
const getKeys = function (pattern, callback) {
const nodes = (typeof rclient.nodes === 'function'
? rclient.nodes('master')
: undefined) || [rclient]
console.log('GOT NODES', nodes.length)
const doKeyLookupForNode = (node, cb) => getKeysFromNode(node, pattern, cb)
return async.concatSeries(nodes, doKeyLookupForNode, callback)
}
const expireDocOps = callback =>
  getKeys(keys.docOps({ doc_id: '*' }), (error, docOpsKeys) => {
    if (error) return callback(error)
    async.mapSeries(
      docOpsKeys,
function (key, cb) {
console.log(`EXPIRE ${key} ${RedisManager.DOC_OPS_TTL}`)
return rclient.expire(key, RedisManager.DOC_OPS_TTL, cb)
},
callback
)
})
setTimeout(
() =>
// Give redis a chance to connect
expireDocOps(function (error) {
if (error) {
throw error
}
return process.exit()
}),
1000
)


@@ -0,0 +1,79 @@
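// Finds docs in redis that have no pathname and flushes and deletes them;
// docs that also lack a project id are only reported, not touched.
// DRY_RUN=false disables the default dry-run mode.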
const Settings = require('@overleaf/settings')
const logger = require('@overleaf/logger')
const rclient = require('@overleaf/redis-wrapper').createClient(
Settings.redis.documentupdater
)
const keys = Settings.redis.documentupdater.key_schema
const ProjectFlusher = require('../app/js/ProjectFlusher')
const DocumentManager = require('../app/js/DocumentManager')
const util = require('node:util')
const flushAndDeleteDocWithLock = util.promisify(
DocumentManager.flushAndDeleteDocWithLock
)
async function flushAndDeleteDocs(dockeys, options) {
const docIds = ProjectFlusher._extractIds(dockeys)
for (const docId of docIds) {
const pathname = await rclient.get(keys.pathname({ doc_id: docId }))
if (!pathname) {
const projectId = await rclient.get(keys.projectKey({ doc_id: docId }))
if (!projectId) {
// await deleteDanglingDoc(projectId, docId, pathname, options)
logger.info(
{ projectId, docId, pathname },
'skipping doc with empty pathname and project id'
)
} else {
await flushAndDeleteDoc(projectId, docId, pathname, options)
}
}
}
}
async function flushAndDeleteDoc(projectId, docId, pathname, options) {
if (options.dryRun) {
logger.info(
{ projectId, docId, pathname },
'dry run mode - would flush doc with empty pathname'
)
return
}
logger.info(
{ projectId, docId, pathname },
'flushing doc with empty pathname'
)
try {
await flushAndDeleteDocWithLock(projectId, docId, {})
} catch (err) {
logger.error(
{ projectId, docId, pathname, err },
'error flushing and deleting doc without pathname'
)
}
}
async function cleanUpDocs(options) {
logger.info({ options }, 'cleaning up docs without pathnames')
let cursor = 0
do {
const [newCursor, doclinesKeys] = await rclient.scan(
cursor,
'MATCH',
keys.docLines({ doc_id: '*' }),
'COUNT',
options.limit
)
await flushAndDeleteDocs(doclinesKeys, options)
cursor = newCursor
} while (cursor !== '0')
}
cleanUpDocs({ limit: 1000, dryRun: process.env.DRY_RUN !== 'false' })
.then(result => {
rclient.quit()
console.log('DONE')
})
.catch(function (error) {
console.error(error)
process.exit(1)
})


@@ -0,0 +1,87 @@
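// Finds docs in redis that are missing their project id key, restores the
// project id from mongo, then flushes and deletes the doc from redis.
// DRY_RUN=false disables the default dry-run mode.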
const Settings = require('@overleaf/settings')
const logger = require('@overleaf/logger')
const rclient = require('@overleaf/redis-wrapper').createClient(
Settings.redis.documentupdater
)
const keys = Settings.redis.documentupdater.key_schema
const ProjectFlusher = require('../app/js/ProjectFlusher')
const DocumentManager = require('../app/js/DocumentManager')
const { mongoClient, db, ObjectId } = require('../app/js/mongodb')
const util = require('node:util')
const flushAndDeleteDocWithLock = util.promisify(
DocumentManager.flushAndDeleteDocWithLock
)
async function fixDocsWithMissingProjectIds(dockeys, options) {
const docIds = ProjectFlusher._extractIds(dockeys)
for (const docId of docIds) {
const projectId = await rclient.get(keys.projectKey({ doc_id: docId }))
logger.debug({ docId, projectId }, 'checking doc')
if (!projectId) {
try {
await insertMissingProjectId(docId, options)
} catch (err) {
logger.error({ docId, err }, 'error fixing doc without project id')
}
}
}
}
async function insertMissingProjectId(docId, options) {
  const doc = await db.docs.findOne({ _id: new ObjectId(docId) })
if (!doc) {
logger.warn({ docId }, 'doc not found in mongo')
return
}
if (!doc.project_id) {
logger.error({ docId }, 'doc does not have project id in mongo')
return
}
logger.debug({ docId, doc }, 'found doc')
const projectIdFromMongo = doc.project_id.toString()
if (options.dryRun) {
logger.info(
{ projectIdFromMongo, docId },
'dry run mode - would insert project id in redis'
)
return
}
// set the project id for this doc
await rclient.set(keys.projectKey({ doc_id: docId }), projectIdFromMongo)
logger.debug({ docId, projectIdFromMongo }, 'inserted project id in redis')
if (projectIdFromMongo) {
await flushAndDeleteDocWithLock(projectIdFromMongo, docId, {})
logger.info(
{ docId, projectIdFromMongo },
'fixed doc with empty project id'
)
}
return projectIdFromMongo
}
async function findAndProcessDocs(options) {
  logger.info({ options }, 'fixing docs with missing project id')
let cursor = 0
do {
const [newCursor, doclinesKeys] = await rclient.scan(
cursor,
'MATCH',
keys.docLines({ doc_id: '*' }),
'COUNT',
options.limit
)
await fixDocsWithMissingProjectIds(doclinesKeys, options)
cursor = newCursor
} while (cursor !== '0')
}
findAndProcessDocs({ limit: 1000, dryRun: process.env.DRY_RUN !== 'false' })
.then(result => {
rclient.quit()
mongoClient.close()
console.log('DONE')
})
.catch(function (error) {
console.error(error)
process.exit(1)
})


@@ -0,0 +1,54 @@
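// Flushes all projects from redis to mongo, with configurable limit and
// concurrency. Run with --help for usage.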
const ProjectFlusher = require('../app/js/ProjectFlusher')
const minimist = require('minimist')
async function main() {
const argv = minimist(process.argv.slice(2), {
default: {
limit: 100000,
concurrency: 5,
'dry-run': false,
},
boolean: ['dry-run', 'help'],
alias: { h: 'help', n: 'dry-run', j: 'concurrency' },
})
if (argv.help) {
console.log(`
Usage: node scripts/flush_all.js [options]
Options:
--limit Number of projects to flush (default: 100000)
--concurrency, -j Number of concurrent flush operations (default: 5)
    --dry-run, -n      Perform a dry run without making any changes (default: false)
--help, -h Show this help message
`)
process.exit(0)
}
const options = {
limit: argv.limit,
concurrency: argv.concurrency,
dryRun: argv['dry-run'],
}
console.log('Flushing all projects with options:', options)
return await new Promise((resolve, reject) => {
ProjectFlusher.flushAllProjects(options, err => {
if (err) {
reject(err)
} else {
resolve()
}
})
})
}
main()
.then(() => {
console.log('Done flushing all projects')
process.exit(0)
})
.catch(error => {
console.error('There was an error flushing all projects', { error })
process.exit(1)
})


@@ -0,0 +1,161 @@
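// Removes docs from redis whose doc record no longer exists in mongo, but
// only when the owning project has been deleted as well; docs still
// referenced by a live project are reported and skipped.
// DRY_RUN=false disables the default dry-run mode.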
const Settings = require('@overleaf/settings')
const logger = require('@overleaf/logger')
const rclient = require('@overleaf/redis-wrapper').createClient(
Settings.redis.documentupdater
)
const keys = Settings.redis.documentupdater.key_schema
const ProjectFlusher = require('../app/js/ProjectFlusher')
const RedisManager = require('../app/js/RedisManager')
const { mongoClient, db, ObjectId } = require('../app/js/mongodb')
const util = require('node:util')
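// RedisManager.getDoc calls back with multiple values; collect them into a
// single array so the promisified wrapper can return them all.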
const getDoc = util.promisify((projectId, docId, cb) =>
RedisManager.getDoc(projectId, docId, (err, ...args) => cb(err, args))
)
const removeDocFromMemory = util.promisify(RedisManager.removeDocFromMemory)
const summary = { totalDocs: 0, deletedDocs: 0, skippedDocs: 0 }
async function removeDeletedDocs(dockeys, options) {
const docIds = ProjectFlusher._extractIds(dockeys)
for (const docId of docIds) {
summary.totalDocs++
    const docCount = await db.docs.countDocuments({ _id: new ObjectId(docId) })
if (!docCount) {
try {
await removeDeletedDoc(docId, options)
} catch (err) {
logger.error({ docId, err }, 'error removing deleted doc')
}
}
}
}
async function removeDeletedDoc(docId, options) {
const projectId = await rclient.get(keys.projectKey({ doc_id: docId }))
const [
docLines,
version,
ranges,
pathname,
projectHistoryId,
unflushedTime,
lastUpdatedAt,
lastUpdatedBy,
] = await getDoc(projectId, docId)
const project = await db.projects.findOne({ _id: new ObjectId(projectId) })
let status
if (project) {
const projectJSON = JSON.stringify(project.rootFolder)
const containsDoc = projectJSON.indexOf(docId) !== -1
if (containsDoc) {
logger.warn(
{
projectId,
docId,
docLinesBytes: docLines && docLines.length,
version,
rangesBytes: ranges && ranges.length,
pathname,
projectHistoryId,
unflushedTime,
lastUpdatedAt,
lastUpdatedBy,
},
'refusing to delete doc, project contains docId'
)
summary.skippedDocs++
return
} else {
logger.warn(
{
projectId,
docId,
docLinesBytes: docLines && docLines.length,
version,
rangesBytes: ranges && ranges.length,
pathname,
projectHistoryId,
unflushedTime,
lastUpdatedAt,
lastUpdatedBy,
},
'refusing to delete doc, project still exists'
)
summary.skippedDocs++
return
}
} else {
status = 'projectDeleted'
}
summary.deletedDocs++
if (options.dryRun) {
logger.info(
{
projectId,
docId,
docLinesBytes: docLines && docLines.length,
version,
rangesBytes: ranges && ranges.length,
pathname,
projectHistoryId,
unflushedTime,
lastUpdatedAt,
lastUpdatedBy,
status,
summary,
},
'dry run mode - would remove doc from redis'
)
return
}
  await removeDocFromMemory(projectId, docId)
logger.info(
{
projectId,
docId,
docLinesBytes: docLines && docLines.length,
version,
rangesBytes: ranges && ranges.length,
pathname,
projectHistoryId,
unflushedTime,
lastUpdatedAt,
lastUpdatedBy,
status,
summary,
},
'removed doc from redis'
)
}
async function findAndProcessDocs(options) {
logger.info({ options }, 'removing deleted docs')
let cursor = 0
do {
const [newCursor, doclinesKeys] = await rclient.scan(
cursor,
'MATCH',
keys.docLines({ doc_id: '*' }),
'COUNT',
options.limit
)
await removeDeletedDocs(doclinesKeys, options)
cursor = newCursor
} while (cursor !== '0')
}
findAndProcessDocs({ limit: 1000, dryRun: process.env.DRY_RUN !== 'false' })
.then(result => {
rclient.quit()
mongoClient.close()
console.log('DONE')
process.exit(0)
})
.catch(function (error) {
console.error(error)
process.exit(1)
})