first commit

commit ff9c54d5e4
2025-04-24 13:11:28 +08:00
5960 changed files with 834111 additions and 0 deletions

View File

@@ -0,0 +1,276 @@
const crypto = require('node:crypto')
const fs = require('node:fs')
const Path = require('node:path')
const { pipeline } = require('node:stream/promises')
const { createGzip, createGunzip } = require('node:zlib')
const tarFs = require('tar-fs')
const _ = require('lodash')
const {
fetchNothing,
fetchStream,
RequestFailedError,
} = require('@overleaf/fetch-utils')
const logger = require('@overleaf/logger')
const Metrics = require('@overleaf/metrics')
const Settings = require('@overleaf/settings')
const { CACHE_SUBDIR } = require('./OutputCacheManager')
const { isExtraneousFile } = require('./ResourceWriter')
const TIMING_BUCKETS = [
0, 10, 100, 1000, 2000, 5000, 10000, 15000, 20000, 30000,
]
const MAX_ENTRIES_IN_OUTPUT_TAR = 100
/**
* @param {string} projectId
* @param {string} userId
* @param {string} buildId
* @param {string} editorId
* @param {Array<{path: string}>} outputFiles
* @param {string} compileGroup
* @param {Record<string, any>} options
*/
function notifyCLSICacheAboutBuild({
projectId,
userId,
buildId,
editorId,
outputFiles,
compileGroup,
options,
}) {
if (!Settings.apis.clsiCache.enabled) return
/**
* @param {Array<{path: string}>} files
*/
const enqueue = files => {
Metrics.count('clsi_cache_enqueue_files', files.length)
fetchNothing(`${Settings.apis.clsiCache.url}/enqueue`, {
method: 'POST',
json: {
projectId,
userId,
buildId,
editorId,
files,
downloadHost: Settings.apis.clsi.downloadHost,
clsiServerId: Settings.apis.clsi.clsiServerId,
compileGroup,
options,
},
signal: AbortSignal.timeout(15_000),
}).catch(err => {
logger.warn(
{ err, projectId, userId, buildId },
'enqueue for clsi cache failed'
)
})
}
// PDF preview
enqueue(
outputFiles
.filter(
f =>
f.path === 'output.pdf' ||
f.path === 'output.log' ||
f.path === 'output.synctex.gz' ||
f.path.endsWith('.blg')
)
.map(f => {
if (f.path === 'output.pdf') {
return _.pick(f, 'path', 'size', 'contentId', 'ranges')
}
return _.pick(f, 'path')
})
)
// Compile Cache
buildTarball({ projectId, userId, buildId, outputFiles })
.then(() => {
enqueue([{ path: 'output.tar.gz' }])
})
.catch(err => {
logger.warn(
{ err, projectId, userId, buildId },
'build output.tar.gz for clsi cache failed'
)
})
}
/**
* @param {string} projectId
* @param {string} userId
* @param {string} buildId
* @param {Array<{path: string}>} outputFiles
* @return {Promise<void>}
*/
async function buildTarball({ projectId, userId, buildId, outputFiles }) {
const timer = new Metrics.Timer('clsi_cache_build', 1, {}, TIMING_BUCKETS)
const outputDir = Path.join(
Settings.path.outputDir,
userId ? `${projectId}-${userId}` : projectId,
CACHE_SUBDIR,
buildId
)
const files = outputFiles.filter(f => !isExtraneousFile(f.path))
if (files.length > MAX_ENTRIES_IN_OUTPUT_TAR) {
Metrics.inc('clsi_cache_build_too_many_entries')
throw new Error('too many output files for output.tar.gz')
}
Metrics.count('clsi_cache_build_files', files.length)
const path = Path.join(outputDir, 'output.tar.gz')
try {
await pipeline(
tarFs.pack(outputDir, { entries: files.map(f => f.path) }),
createGzip(),
fs.createWriteStream(path)
)
} catch (err) {
try {
await fs.promises.unlink(path)
} catch (e) {}
throw err
} finally {
timer.done()
}
}
/**
* @param {string} projectId
* @param {string} userId
* @param {string} editorId
* @param {string} buildId
* @param {string} outputDir
* @return {Promise<boolean>}
*/
async function downloadOutputDotSynctexFromCompileCache(
projectId,
userId,
editorId,
buildId,
outputDir
) {
if (!Settings.apis.clsiCache.enabled) return false
const timer = new Metrics.Timer(
'clsi_cache_download',
1,
{ method: 'synctex' },
TIMING_BUCKETS
)
let stream
try {
stream = await fetchStream(
`${Settings.apis.clsiCache.url}/project/${projectId}/${
userId ? `user/${userId}/` : ''
}build/${editorId}-${buildId}/search/output/output.synctex.gz`,
{
method: 'GET',
signal: AbortSignal.timeout(10_000),
}
)
} catch (err) {
if (err instanceof RequestFailedError && err.response.status === 404) {
timer.done({ status: 'not-found' })
return false
}
timer.done({ status: 'error' })
throw err
}
await fs.promises.mkdir(outputDir, { recursive: true })
const dst = Path.join(outputDir, 'output.synctex.gz')
const tmp = dst + crypto.randomUUID()
try {
await pipeline(stream, fs.createWriteStream(tmp))
await fs.promises.rename(tmp, dst)
} catch (err) {
try {
await fs.promises.unlink(tmp)
} catch {}
throw err
}
timer.done({ status: 'success' })
return true
}
/**
* @param {string} projectId
* @param {string} userId
* @param {string} compileDir
* @return {Promise<boolean>}
*/
async function downloadLatestCompileCache(projectId, userId, compileDir) {
if (!Settings.apis.clsiCache.enabled) return false
const url = `${Settings.apis.clsiCache.url}/project/${projectId}/${
userId ? `user/${userId}/` : ''
}latest/output/output.tar.gz`
const timer = new Metrics.Timer(
'clsi_cache_download',
1,
{ method: 'tar' },
TIMING_BUCKETS
)
let stream
try {
stream = await fetchStream(url, {
method: 'GET',
signal: AbortSignal.timeout(10_000),
})
} catch (err) {
if (err instanceof RequestFailedError && err.response.status === 404) {
timer.done({ status: 'not-found' })
return false
}
timer.done({ status: 'error' })
throw err
}
let n = 0
let abort = false
await pipeline(
stream,
createGunzip(),
tarFs.extract(compileDir, {
// use ignore hook for counting entries (files+folders) and validation.
// Include folders as they incur mkdir calls.
ignore(_, header) {
if (abort) return true // log once
n++
if (n > MAX_ENTRIES_IN_OUTPUT_TAR) {
abort = true
logger.warn(
{
url,
compileDir,
},
'too many entries in tar-ball from clsi-cache'
)
} else if (header.type !== 'file' && header.type !== 'directory') {
abort = true
logger.warn(
{
url,
compileDir,
entryType: header.type,
},
'unexpected entry in tar-ball from clsi-cache'
)
}
return abort
},
})
)
Metrics.count('clsi_cache_download_entries', n)
timer.done({ status: 'success' })
return !abort
}
module.exports = {
notifyCLSICacheAboutBuild,
downloadLatestCompileCache,
downloadOutputDotSynctexFromCompileCache,
}
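
A minimal usage sketch for the handler above, assuming Settings.apis.clsiCache.enabled is set; every id, size and option value is a hypothetical placeholder, and the call shape simply mirrors the caller in CompileController:

const { notifyCLSICacheAboutBuild } = require('./CLSICacheHandler')

notifyCLSICacheAboutBuild({
  projectId: '507f1f77bcf86cd799439011', // hypothetical 24-char hex project id
  userId: '507f191e810c19729de860ea', // hypothetical user id
  buildId: '185b5b0e7a1-9d3c2f4b8e6a1c05', // hypothetical build id
  editorId: 'f2c5d8a0-1b2c-4d3e-9f00-abcdef012345', // hypothetical editor id
  outputFiles: [
    { path: 'output.pdf', size: 123456, build: '185b5b0e7a1-9d3c2f4b8e6a1c05' },
    { path: 'output.log', build: '185b5b0e7a1-9d3c2f4b8e6a1c05' },
  ],
  compileGroup: 'standard',
  options: { compiler: 'pdflatex', draft: false },
})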

View File

@@ -0,0 +1,20 @@
// TODO: This file was created by bulk-decaffeinate.
// Sanity-check the conversion and remove this comment.
/*
* decaffeinate suggestions:
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let commandRunnerPath
const Settings = require('@overleaf/settings')
const logger = require('@overleaf/logger')
if (Settings.clsi?.dockerRunner === true) {
commandRunnerPath = './DockerRunner'
} else {
commandRunnerPath = './LocalCommandRunner'
}
logger.debug({ commandRunnerPath }, 'selecting command runner for clsi')
const CommandRunner = require(commandRunnerPath)
module.exports = CommandRunner
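
Whichever runner is selected, CompileManager calls it through the promises wrapper with the same argument order; a minimal sketch of such a call, with a hypothetical compile name, directory and command ($COMPILE_DIR is substituted by the runner, and passing undefined for the image falls back to the configured default under the Docker runner):

const CommandRunner = require('./CommandRunner')

async function wordcountExample() {
  const { stdout } = await CommandRunner.promises.run(
    'projectId-userId', // hypothetical compile name
    ['texcount', '-nocol', '-inc', '$COMPILE_DIR/main.tex'],
    '/overleaf/services/clsi/compiles/projectId-userId', // hypothetical compile dir
    undefined, // image: use the runner's default
    60 * 1000, // timeout in ms
    {}, // extra environment variables
    'wordcount' // compile group
  )
  console.log(stdout)
}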

View File

@@ -0,0 +1,276 @@
const Path = require('node:path')
const RequestParser = require('./RequestParser')
const CompileManager = require('./CompileManager')
const Settings = require('@overleaf/settings')
const Metrics = require('./Metrics')
const ProjectPersistenceManager = require('./ProjectPersistenceManager')
const logger = require('@overleaf/logger')
const Errors = require('./Errors')
const { notifyCLSICacheAboutBuild } = require('./CLSICacheHandler')
let lastSuccessfulCompileTimestamp = 0
function timeSinceLastSuccessfulCompile() {
return Date.now() - lastSuccessfulCompileTimestamp
}
function compile(req, res, next) {
const timer = new Metrics.Timer('compile-request')
RequestParser.parse(req.body, function (error, request) {
if (error) {
return next(error)
}
timer.opts = request.metricsOpts
request.project_id = req.params.project_id
if (req.params.user_id != null) {
request.user_id = req.params.user_id
}
ProjectPersistenceManager.markProjectAsJustAccessed(
request.project_id,
function (error) {
if (error) {
return next(error)
}
const stats = {}
const timings = {}
CompileManager.doCompileWithLock(
request,
stats,
timings,
(error, result) => {
let { buildId, outputFiles } = result || {}
let code, status
if (outputFiles == null) {
outputFiles = []
}
if (error instanceof Errors.AlreadyCompilingError) {
code = 423 // Http 423 Locked
status = 'compile-in-progress'
} else if (error instanceof Errors.FilesOutOfSyncError) {
code = 409 // Http 409 Conflict
status = 'retry'
logger.warn(
{
projectId: request.project_id,
userId: request.user_id,
},
'files out of sync, please retry'
)
} else if (
error?.code === 'EPIPE' ||
error instanceof Errors.TooManyCompileRequestsError
) {
// docker returns EPIPE when shutting down
code = 503 // send 503 Unavailable response
status = 'unavailable'
} else if (error?.terminated) {
status = 'terminated'
} else if (error?.validate) {
status = `validation-${error.validate}`
} else if (error?.timedout) {
status = 'timedout'
logger.debug(
{ err: error, projectId: request.project_id },
'timeout running compile'
)
} else if (error) {
status = 'error'
code = 500
logger.error(
{ err: error, projectId: request.project_id },
'error running compile'
)
} else {
if (
outputFiles.some(
file => file.path === 'output.pdf' && file.size > 0
)
) {
status = 'success'
lastSuccessfulCompileTimestamp = Date.now()
} else if (request.stopOnFirstError) {
status = 'stopped-on-first-error'
} else {
status = 'failure'
logger.warn(
{ projectId: request.project_id, outputFiles },
'project failed to compile successfully, no output.pdf generated'
)
}
// log an error if any core files are found
if (outputFiles.some(file => file.path === 'core')) {
logger.error(
{ projectId: request.project_id, req, outputFiles },
'core file found in output'
)
}
}
if (error) {
outputFiles = error.outputFiles || []
buildId = error.buildId
}
if (
status === 'success' &&
request.editorId &&
request.populateClsiCache
) {
notifyCLSICacheAboutBuild({
projectId: request.project_id,
userId: request.user_id,
buildId: outputFiles[0].build,
editorId: request.editorId,
outputFiles,
compileGroup: request.compileGroup,
options: {
compiler: request.compiler,
draft: request.draft,
imageName: request.imageName
? Path.basename(request.imageName)
: undefined,
rootResourcePath: request.rootResourcePath,
stopOnFirstError: request.stopOnFirstError,
},
})
}
timer.done()
res.status(code || 200).send({
compile: {
status,
error: error?.message || error,
stats,
timings,
buildId,
outputUrlPrefix: Settings.apis.clsi.outputUrlPrefix,
outputFiles: outputFiles.map(file => ({
url:
`${Settings.apis.clsi.url}/project/${request.project_id}` +
(request.user_id != null
? `/user/${request.user_id}`
: '') +
`/build/${file.build}/output/${file.path}`,
...file,
})),
},
})
}
)
}
)
})
}
function stopCompile(req, res, next) {
const { project_id: projectId, user_id: userId } = req.params
CompileManager.stopCompile(projectId, userId, function (error) {
if (error) {
return next(error)
}
res.sendStatus(204)
})
}
function clearCache(req, res, next) {
ProjectPersistenceManager.clearProject(
req.params.project_id,
req.params.user_id,
function (error) {
if (error) {
return next(error)
}
// No content
res.sendStatus(204)
}
)
}
function syncFromCode(req, res, next) {
const { file, editorId, buildId, compileFromClsiCache } = req.query
const line = parseInt(req.query.line, 10)
const column = parseInt(req.query.column, 10)
const { imageName } = req.query
const projectId = req.params.project_id
const userId = req.params.user_id
CompileManager.syncFromCode(
projectId,
userId,
file,
line,
column,
{ imageName, editorId, buildId, compileFromClsiCache },
function (error, pdfPositions) {
if (error) {
return next(error)
}
res.json({
pdf: pdfPositions,
})
}
)
}
function syncFromPdf(req, res, next) {
const page = parseInt(req.query.page, 10)
const h = parseFloat(req.query.h)
const v = parseFloat(req.query.v)
const { imageName, editorId, buildId, compileFromClsiCache } = req.query
const projectId = req.params.project_id
const userId = req.params.user_id
CompileManager.syncFromPdf(
projectId,
userId,
page,
h,
v,
{ imageName, editorId, buildId, compileFromClsiCache },
function (error, codePositions) {
if (error) {
return next(error)
}
res.json({
code: codePositions,
})
}
)
}
function wordcount(req, res, next) {
const file = req.query.file || 'main.tex'
const projectId = req.params.project_id
const userId = req.params.user_id
const { image } = req.query
logger.debug({ image, file, projectId }, 'word count request')
CompileManager.wordcount(
projectId,
userId,
file,
image,
function (error, result) {
if (error) {
return next(error)
}
res.json({
texcount: result,
})
}
)
}
function status(req, res, next) {
res.send('OK')
}
module.exports = {
compile,
stopCompile,
clearCache,
syncFromCode,
syncFromPdf,
wordcount,
status,
timeSinceLastSuccessfulCompile,
}
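
The handlers above expect project_id (and optionally user_id) as route params plus a JSON body for compile; a minimal Express wiring sketch, where the route paths and the body-size limit are illustrative assumptions rather than the ones registered in app.js:

const express = require('express')
const CompileController = require('./CompileController')

const app = express()
app.use(express.json({ limit: '50mb' })) // body limit is an assumption

// Hypothetical routes; the real app.js may register different paths.
app.post('/project/:project_id/compile', CompileController.compile)
app.post('/project/:project_id/user/:user_id/compile', CompileController.compile)
app.post('/project/:project_id/compile/stop', CompileController.stopCompile)
app.delete('/project/:project_id', CompileController.clearCache)
app.get('/project/:project_id/sync/code', CompileController.syncFromCode)
app.get('/project/:project_id/sync/pdf', CompileController.syncFromPdf)
app.get('/project/:project_id/wordcount', CompileController.wordcount)
app.get('/status', CompileController.status)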

View File

@@ -0,0 +1,701 @@
const fsPromises = require('node:fs/promises')
const os = require('node:os')
const Path = require('node:path')
const { callbackify } = require('node:util')
const Settings = require('@overleaf/settings')
const logger = require('@overleaf/logger')
const OError = require('@overleaf/o-error')
const ResourceWriter = require('./ResourceWriter')
const LatexRunner = require('./LatexRunner')
const OutputFileFinder = require('./OutputFileFinder')
const OutputCacheManager = require('./OutputCacheManager')
const Metrics = require('./Metrics')
const DraftModeManager = require('./DraftModeManager')
const TikzManager = require('./TikzManager')
const LockManager = require('./LockManager')
const Errors = require('./Errors')
const CommandRunner = require('./CommandRunner')
const { emitPdfStats } = require('./ContentCacheMetrics')
const SynctexOutputParser = require('./SynctexOutputParser')
const {
downloadLatestCompileCache,
downloadOutputDotSynctexFromCompileCache,
} = require('./CLSICacheHandler')
const COMPILE_TIME_BUCKETS = [
// NOTE: These buckets are locked in per metric name.
// If you want to change them, you will need to rename metrics.
0, 1, 2, 3, 4, 6, 8, 11, 15, 22, 31, 43, 61, 86, 121, 170, 240,
].map(seconds => seconds * 1000)
function getCompileName(projectId, userId) {
if (userId != null) {
return `${projectId}-${userId}`
} else {
return projectId
}
}
function getCompileDir(projectId, userId) {
return Path.join(Settings.path.compilesDir, getCompileName(projectId, userId))
}
function getOutputDir(projectId, userId) {
return Path.join(Settings.path.outputDir, getCompileName(projectId, userId))
}
async function doCompileWithLock(request, stats, timings) {
const compileDir = getCompileDir(request.project_id, request.user_id)
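// Note: fs.promises.mkdir with { recursive: true } resolves with the path of the
// first directory it actually created (or undefined if nothing was created), so
// the comparison below is true only when the compile dir did not exist yet.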
request.isInitialCompile =
(await fsPromises.mkdir(compileDir, { recursive: true })) === compileDir
// prevent simultaneous compiles
const lock = LockManager.acquire(compileDir)
try {
return await doCompile(request, stats, timings)
} finally {
lock.release()
}
}
async function doCompile(request, stats, timings) {
const { project_id: projectId, user_id: userId } = request
const compileDir = getCompileDir(request.project_id, request.user_id)
const timerE2E = new Metrics.Timer(
'compile-e2e-v2',
1,
request.metricsOpts,
COMPILE_TIME_BUCKETS
)
if (request.isInitialCompile) {
stats.isInitialCompile = 1
request.metricsOpts.compile = 'initial'
if (request.compileFromClsiCache) {
try {
if (await downloadLatestCompileCache(projectId, userId, compileDir)) {
stats.restoredClsiCache = 1
request.metricsOpts.compile = 'from-clsi-cache'
}
} catch (err) {
logger.warn(
{ err, projectId, userId },
'failed to populate compile dir from cache'
)
}
}
} else {
request.metricsOpts.compile = 'recompile'
}
const writeToDiskTimer = new Metrics.Timer(
'write-to-disk',
1,
request.metricsOpts
)
logger.debug(
{ projectId: request.project_id, userId: request.user_id },
'syncing resources to disk'
)
let resourceList
try {
// NOTE: resourceList is insecure, it should only be used to exclude files from the output list
resourceList = await ResourceWriter.promises.syncResourcesToDisk(
request,
compileDir
)
} catch (error) {
if (error instanceof Errors.FilesOutOfSyncError) {
OError.tag(error, 'files out of sync, please retry', {
projectId: request.project_id,
userId: request.user_id,
})
} else {
OError.tag(error, 'error writing resources to disk', {
projectId: request.project_id,
userId: request.user_id,
})
}
throw error
}
logger.debug(
{
projectId: request.project_id,
userId: request.user_id,
timeTaken: Date.now() - writeToDiskTimer.start,
},
'written files to disk'
)
timings.sync = writeToDiskTimer.done()
// set up environment variables for chktex
const env = {
OVERLEAF_PROJECT_ID: request.project_id,
}
if (Settings.texliveOpenoutAny && Settings.texliveOpenoutAny !== '') {
// override default texlive openout_any environment variable
env.openout_any = Settings.texliveOpenoutAny
}
if (Settings.texliveMaxPrintLine && Settings.texliveMaxPrintLine !== '') {
// override default texlive max_print_line environment variable
env.max_print_line = Settings.texliveMaxPrintLine
}
// only run chktex on LaTeX files (not knitr .Rtex files or any others)
const isLaTeXFile = request.rootResourcePath?.match(/\.tex$/i)
if (request.check != null && isLaTeXFile) {
env.CHKTEX_OPTIONS = '-nall -e9 -e10 -w15 -w16'
env.CHKTEX_ULIMIT_OPTIONS = '-t 5 -v 64000'
if (request.check === 'error') {
env.CHKTEX_EXIT_ON_ERROR = 1
}
if (request.check === 'validate') {
env.CHKTEX_VALIDATE = 1
}
}
// apply a series of file modifications/creations for draft mode and tikz
if (request.draft) {
await DraftModeManager.promises.injectDraftMode(
Path.join(compileDir, request.rootResourcePath)
)
}
const needsMainFile = await TikzManager.promises.checkMainFile(
compileDir,
request.rootResourcePath,
resourceList
)
if (needsMainFile) {
await TikzManager.promises.injectOutputFile(
compileDir,
request.rootResourcePath
)
}
const compileTimer = new Metrics.Timer('run-compile', 1, request.metricsOpts)
// find the image tag to log it as a metric, e.g. 2015.1 (convert . to - for graphite)
let tag = 'default'
if (request.imageName != null) {
const match = request.imageName.match(/:(.*)/)
if (match != null) {
tag = match[1].replace(/\./g, '-')
}
}
// exclude smoke test
if (!request.project_id.match(/^[0-9a-f]{24}$/)) {
tag = 'other'
}
Metrics.inc('compiles', 1, request.metricsOpts)
Metrics.inc(`compiles-with-image.${tag}`, 1, request.metricsOpts)
const compileName = getCompileName(request.project_id, request.user_id)
try {
await LatexRunner.promises.runLatex(compileName, {
directory: compileDir,
mainFile: request.rootResourcePath,
compiler: request.compiler,
timeout: request.timeout,
image: request.imageName,
flags: request.flags,
environment: env,
compileGroup: request.compileGroup,
stopOnFirstError: request.stopOnFirstError,
stats,
timings,
})
// We use errors to return the validation state. It would be nice to use a
// more appropriate mechanism.
if (request.check === 'validate') {
const validationError = new Error('validation')
validationError.validate = 'pass'
throw validationError
}
} catch (originalError) {
let error = originalError
// request was for validation only
if (request.check === 'validate' && !error.validate) {
error = new Error('validation')
error.validate = originalError.code ? 'fail' : 'pass'
}
// request was for compile, and failed on validation
if (request.check === 'error' && originalError.message === 'exited') {
error = new Error('compilation')
error.validate = 'fail'
}
// record timeout errors as a separate counter, success is recorded later
if (error.timedout) {
Metrics.inc('compiles-timeout', 1, request.metricsOpts)
}
const { outputFiles, allEntries, buildId } = await _saveOutputFiles({
request,
compileDir,
resourceList,
stats,
timings,
})
error.outputFiles = outputFiles // return output files so user can check logs
error.buildId = buildId
// Clear project if this compile was abruptly terminated
if (error.terminated || error.timedout) {
await clearProjectWithListing(
request.project_id,
request.user_id,
allEntries
)
}
throw error
}
// compile completed normally
Metrics.inc('compiles-succeeded', 1, request.metricsOpts)
for (const metricKey in stats) {
const metricValue = stats[metricKey]
Metrics.count(metricKey, metricValue, 1, request.metricsOpts)
}
for (const metricKey in timings) {
const metricValue = timings[metricKey]
Metrics.timing(metricKey, metricValue, 1, request.metricsOpts)
}
const loadavg = typeof os.loadavg === 'function' ? os.loadavg() : undefined
if (loadavg != null) {
Metrics.gauge('load-avg', loadavg[0])
}
const ts = compileTimer.done()
logger.debug(
{
projectId: request.project_id,
userId: request.user_id,
timeTaken: ts,
stats,
timings,
loadavg,
},
'done compile'
)
if (stats['latex-runs'] > 0) {
Metrics.histogram(
'avg-compile-per-pass-v2',
ts / stats['latex-runs'],
COMPILE_TIME_BUCKETS,
request.metricsOpts
)
Metrics.timing(
'avg-compile-per-pass-v2',
ts / stats['latex-runs'],
1,
request.metricsOpts
)
}
if (stats['latex-runs'] > 0 && timings['cpu-time'] > 0) {
Metrics.timing(
'run-compile-cpu-time-per-pass',
timings['cpu-time'] / stats['latex-runs'],
1,
request.metricsOpts
)
}
// Emit compile time.
timings.compile = ts
const { outputFiles, buildId } = await _saveOutputFiles({
request,
compileDir,
resourceList,
stats,
timings,
})
// Emit e2e compile time.
timings.compileE2E = timerE2E.done()
Metrics.timing('compile-e2e-v2', timings.compileE2E, 1, request.metricsOpts)
if (stats['pdf-size']) {
emitPdfStats(stats, timings, request)
}
return { outputFiles, buildId }
}
async function _saveOutputFiles({
request,
compileDir,
resourceList,
stats,
timings,
}) {
const timer = new Metrics.Timer(
'process-output-files',
1,
request.metricsOpts
)
const outputDir = getOutputDir(request.project_id, request.user_id)
const { outputFiles: rawOutputFiles, allEntries } =
await OutputFileFinder.promises.findOutputFiles(resourceList, compileDir)
const { buildId, outputFiles } =
await OutputCacheManager.promises.saveOutputFiles(
{ request, stats, timings },
rawOutputFiles,
compileDir,
outputDir
)
timings.output = timer.done()
return { outputFiles, allEntries, buildId }
}
async function stopCompile(projectId, userId) {
const compileName = getCompileName(projectId, userId)
await LatexRunner.promises.killLatex(compileName)
}
async function clearProject(projectId, userId) {
const compileDir = getCompileDir(projectId, userId)
await fsPromises.rm(compileDir, { force: true, recursive: true })
}
async function clearProjectWithListing(projectId, userId, allEntries) {
const compileDir = getCompileDir(projectId, userId)
const exists = await _checkDirectory(compileDir)
if (!exists) {
// skip removal if no directory present
return
}
for (const pathInProject of allEntries) {
const path = Path.join(compileDir, pathInProject)
if (path.endsWith('/')) {
await fsPromises.rmdir(path)
} else {
await fsPromises.unlink(path)
}
}
await fsPromises.rmdir(compileDir)
}
async function _findAllDirs() {
const root = Settings.path.compilesDir
const files = await fsPromises.readdir(root)
const allDirs = files.map(file => Path.join(root, file))
return allDirs
}
async function clearExpiredProjects(maxCacheAgeMs) {
const now = Date.now()
const dirs = await _findAllDirs()
for (const dir of dirs) {
let stats
try {
stats = await fsPromises.stat(dir)
} catch (err) {
// ignore errors checking directory
continue
}
const age = now - stats.mtime
const hasExpired = age > maxCacheAgeMs
if (hasExpired) {
await fsPromises.rm(dir, { force: true, recursive: true })
}
}
}
async function _checkDirectory(compileDir) {
let stats
try {
stats = await fsPromises.lstat(compileDir)
} catch (err) {
if (err.code === 'ENOENT') {
// directory does not exist
return false
}
OError.tag(err, 'error on stat of project directory for removal', {
dir: compileDir,
})
throw err
}
if (!stats.isDirectory()) {
throw new OError('project directory is not directory', {
dir: compileDir,
stats,
})
}
return true
}
async function syncFromCode(projectId, userId, filename, line, column, opts) {
// If LaTeX was run in a virtual environment, the file path that synctex expects
// might not match the file path on the host. The .synctex.gz file, however, will be accessed
// wherever it is on the host.
const compileName = getCompileName(projectId, userId)
const baseDir = Settings.path.synctexBaseDir(compileName)
const inputFilePath = Path.join(baseDir, filename)
const outputFilePath = Path.join(baseDir, 'output.pdf')
const command = [
'synctex',
'view',
'-i',
`${line}:${column}:${inputFilePath}`,
'-o',
outputFilePath,
]
const stdout = await _runSynctex(projectId, userId, command, opts)
logger.debug(
{ projectId, userId, filename, line, column, command, stdout },
'synctex code output'
)
return SynctexOutputParser.parseViewOutput(stdout)
}
async function syncFromPdf(projectId, userId, page, h, v, opts) {
const compileName = getCompileName(projectId, userId)
const baseDir = Settings.path.synctexBaseDir(compileName)
const outputFilePath = `${baseDir}/output.pdf`
const command = [
'synctex',
'edit',
'-o',
`${page}:${h}:${v}:${outputFilePath}`,
]
const stdout = await _runSynctex(projectId, userId, command, opts)
logger.debug({ projectId, userId, page, h, v, stdout }, 'synctex pdf output')
return SynctexOutputParser.parseEditOutput(stdout, baseDir)
}
async function _checkFileExists(dir, filename) {
try {
await fsPromises.stat(dir)
} catch (error) {
if (error.code === 'ENOENT') {
throw new Errors.NotFoundError('no output directory')
}
throw error
}
const file = Path.join(dir, filename)
let stats
try {
stats = await fsPromises.stat(file)
} catch (error) {
if (error.code === 'ENOENT') {
throw new Errors.NotFoundError('no output file')
}
throw error
}
if (!stats.isFile()) {
throw new Error('not a file')
}
}
async function _runSynctex(projectId, userId, command, opts) {
const { imageName, editorId, buildId, compileFromClsiCache } = opts
if (imageName && !_isImageNameAllowed(imageName)) {
throw new Errors.InvalidParameter('invalid image')
}
if (editorId && !/^[a-f0-9-]+$/.test(editorId)) {
throw new Errors.InvalidParameter('invalid editorId')
}
if (buildId && !OutputCacheManager.BUILD_REGEX.test(buildId)) {
throw new Errors.InvalidParameter('invalid buildId')
}
const outputDir = getOutputDir(projectId, userId)
const runInOutputDir = buildId && CommandRunner.canRunSyncTeXInOutputDir()
const directory = runInOutputDir
? Path.join(outputDir, OutputCacheManager.CACHE_SUBDIR, buildId)
: getCompileDir(projectId, userId)
const timeout = 60 * 1000 // increased to allow for large projects
const compileName = getCompileName(projectId, userId)
const compileGroup = runInOutputDir ? 'synctex-output' : 'synctex'
const defaultImageName =
Settings.clsi && Settings.clsi.docker && Settings.clsi.docker.image
// eslint-disable-next-line @typescript-eslint/return-await
return await OutputCacheManager.promises.queueDirOperation(
outputDir,
/**
* @return {Promise<string>}
*/
async () => {
try {
await _checkFileExists(directory, 'output.synctex.gz')
} catch (err) {
if (
err instanceof Errors.NotFoundError &&
compileFromClsiCache &&
editorId &&
buildId
) {
try {
await downloadOutputDotSynctexFromCompileCache(
projectId,
userId,
editorId,
buildId,
directory
)
} catch (err) {
logger.warn(
{ err, projectId, userId, editorId, buildId },
'failed to download output.synctex.gz from clsi-cache'
)
}
await _checkFileExists(directory, 'output.synctex.gz')
} else {
throw err
}
}
try {
const output = await CommandRunner.promises.run(
compileName,
command,
directory,
imageName || defaultImageName,
timeout,
{},
compileGroup
)
return output.stdout
} catch (error) {
throw OError.tag(error, 'error running synctex', {
command,
projectId,
userId,
})
}
}
)
}
async function wordcount(projectId, userId, filename, image) {
logger.debug({ projectId, userId, filename, image }, 'running wordcount')
const filePath = `$COMPILE_DIR/${filename}`
const command = ['texcount', '-nocol', '-inc', filePath]
const compileDir = getCompileDir(projectId, userId)
const timeout = 60 * 1000
const compileName = getCompileName(projectId, userId)
const compileGroup = 'wordcount'
if (image && !_isImageNameAllowed(image)) {
throw new Errors.InvalidParameter('invalid image')
}
try {
await fsPromises.mkdir(compileDir, { recursive: true })
} catch (err) {
throw OError.tag(err, 'error ensuring dir for wordcount', {
projectId,
userId,
filename,
})
}
try {
const { stdout } = await CommandRunner.promises.run(
compileName,
command,
compileDir,
image,
timeout,
{},
compileGroup
)
const results = _parseWordcountFromOutput(stdout)
logger.debug(
{ projectId, userId, wordcount: results },
'word count results'
)
return results
} catch (err) {
throw OError.tag(err, 'error reading word count output', {
command,
compileDir,
projectId,
userId,
})
}
}
function _parseWordcountFromOutput(output) {
const results = {
encode: '',
textWords: 0,
headWords: 0,
outside: 0,
headers: 0,
elements: 0,
mathInline: 0,
mathDisplay: 0,
errors: 0,
messages: '',
}
for (const line of output.split('\n')) {
const [data, info] = line.split(':')
if (data.indexOf('Encoding') > -1) {
results.encode = info.trim()
}
if (data.indexOf('in text') > -1) {
results.textWords = parseInt(info, 10)
}
if (data.indexOf('in head') > -1) {
results.headWords = parseInt(info, 10)
}
if (data.indexOf('outside') > -1) {
results.outside = parseInt(info, 10)
}
if (data.indexOf('of head') > -1) {
results.headers = parseInt(info, 10)
}
if (data.indexOf('Number of floats/tables/figures') > -1) {
results.elements = parseInt(info, 10)
}
if (data.indexOf('Number of math inlines') > -1) {
results.mathInline = parseInt(info, 10)
}
if (data.indexOf('Number of math displayed') > -1) {
results.mathDisplay = parseInt(info, 10)
}
if (data === '(errors') {
// errors reported as (errors:123)
results.errors = parseInt(info, 10)
}
if (line.indexOf('!!! ') > -1) {
// errors logged as !!! message !!!
results.messages += line + '\n'
}
}
return results
}
function _isImageNameAllowed(imageName) {
const ALLOWED_IMAGES =
Settings.clsi && Settings.clsi.docker && Settings.clsi.docker.allowedImages
return !ALLOWED_IMAGES || ALLOWED_IMAGES.includes(imageName)
}
module.exports = {
doCompileWithLock: callbackify(doCompileWithLock),
stopCompile: callbackify(stopCompile),
clearProject: callbackify(clearProject),
clearExpiredProjects: callbackify(clearExpiredProjects),
syncFromCode: callbackify(syncFromCode),
syncFromPdf: callbackify(syncFromPdf),
wordcount: callbackify(wordcount),
promises: {
doCompileWithLock,
stopCompile,
clearProject,
clearExpiredProjects,
syncFromCode,
syncFromPdf,
wordcount,
},
}
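
For reference, _parseWordcountFromOutput keys off substrings of texcount's summary lines; the sample below is shaped to match those substrings (it is not verbatim texcount output) and the expected parse result is derived from the parser above:

// Hypothetical texcount-style output, labels chosen to match the parser's matchers.
const sampleOutput = [
  'Encoding: ascii',
  'Words in text: 781',
  'Words in headers: 12',
  'Words outside text (captions, etc.): 34',
  'Number of headers: 5',
  'Number of floats/tables/figures: 2',
  'Number of math inlines: 18',
  'Number of math displayed: 4',
  '(errors:1)',
  '!!! File not found: missing.tex !!!',
].join('\n')
// Feeding sampleOutput through the parser would yield roughly:
// { encode: 'ascii', textWords: 781, headWords: 12, outside: 34, headers: 5,
//   elements: 2, mathInline: 18, mathDisplay: 4, errors: 1,
//   messages: '!!! File not found: missing.tex !!!\n' }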

View File

@@ -0,0 +1,441 @@
/**
* ContentCacheManager - maintains a cache of stream hashes from a PDF file
*/
const { callbackify } = require('node:util')
const fs = require('node:fs')
const crypto = require('node:crypto')
const Path = require('node:path')
const Settings = require('@overleaf/settings')
const OError = require('@overleaf/o-error')
const pLimit = require('p-limit')
const { parseXrefTable } = require('./XrefParser')
const {
QueueLimitReachedError,
TimedOutError,
NoXrefTableError,
} = require('./Errors')
const workerpool = require('workerpool')
const Metrics = require('@overleaf/metrics')
/**
* @type {import('workerpool').WorkerPool}
*/
let WORKER_POOL
// NOTE: Check for main thread to avoid recursive start of pool.
if (Settings.pdfCachingEnableWorkerPool && workerpool.isMainThread) {
WORKER_POOL = workerpool.pool(Path.join(__dirname, 'ContentCacheWorker.js'), {
// Cap number of worker threads.
maxWorkers: Settings.pdfCachingWorkerPoolSize,
// Warmup workers.
minWorkers: Settings.pdfCachingWorkerPoolSize,
// Limit queue back-log
maxQueueSize: Settings.pdfCachingWorkerPoolBackLogLimit,
})
setInterval(() => {
const {
totalWorkers,
busyWorkers,
idleWorkers,
pendingTasks,
activeTasks,
} = WORKER_POOL.stats()
Metrics.gauge('pdf_caching_total_workers', totalWorkers)
Metrics.gauge('pdf_caching_busy_workers', busyWorkers)
Metrics.gauge('pdf_caching_idle_workers', idleWorkers)
Metrics.gauge('pdf_caching_pending_tasks', pendingTasks)
Metrics.gauge('pdf_caching_active_tasks', activeTasks)
}, 15 * 1000)
}
/**
*
* @param {String} contentDir path to directory where content hash files are cached
* @param {String} filePath the pdf file to scan for streams
* @param {number} pdfSize the pdf size
* @param {number} pdfCachingMinChunkSize per request threshold
* @param {number} compileTime
*/
async function update({
contentDir,
filePath,
pdfSize,
pdfCachingMinChunkSize,
compileTime,
}) {
if (pdfSize < pdfCachingMinChunkSize) {
return {
contentRanges: [],
newContentRanges: [],
reclaimedSpace: 0,
startXRefTable: undefined,
}
}
if (Settings.pdfCachingEnableWorkerPool) {
return await updateOtherEventLoop({
contentDir,
filePath,
pdfSize,
pdfCachingMinChunkSize,
compileTime,
})
} else {
return await updateSameEventLoop({
contentDir,
filePath,
pdfSize,
pdfCachingMinChunkSize,
compileTime,
})
}
}
/**
*
* @param {String} contentDir path to directory where content hash files are cached
* @param {String} filePath the pdf file to scan for streams
* @param {number} pdfSize the pdf size
* @param {number} pdfCachingMinChunkSize per request threshold
* @param {number} compileTime
*/
async function updateOtherEventLoop({
contentDir,
filePath,
pdfSize,
pdfCachingMinChunkSize,
compileTime,
}) {
const workerLatencyInMs = 100
// Prefer getting the timeout error from the worker vs timing out the worker.
const timeout = getMaxOverhead(compileTime) + workerLatencyInMs
try {
return await WORKER_POOL.exec('updateSameEventLoop', [
{
contentDir,
filePath,
pdfSize,
pdfCachingMinChunkSize,
compileTime,
},
]).timeout(timeout)
} catch (e) {
if (e instanceof workerpool.Promise.TimeoutError) {
throw new TimedOutError('context-lost-in-worker', { timeout })
}
if (e.message?.includes?.('Max queue size of ')) {
throw new QueueLimitReachedError()
}
if (e.message?.includes?.('xref')) {
throw new NoXrefTableError(e.message)
}
throw e
}
}
/**
*
* @param {String} contentDir path to directory where content hash files are cached
* @param {String} filePath the pdf file to scan for streams
* @param {number} pdfSize the pdf size
* @param {number} pdfCachingMinChunkSize per request threshold
* @param {number} compileTime
*/
async function updateSameEventLoop({
contentDir,
filePath,
pdfSize,
pdfCachingMinChunkSize,
compileTime,
}) {
const checkDeadline = getDeadlineChecker(compileTime)
// keep track of hashes and expire old ones when they reach a generation > N.
const tracker = await HashFileTracker.from(contentDir)
tracker.updateAge()
checkDeadline('after init HashFileTracker')
const [reclaimedSpace, overheadDeleteStaleHashes] =
await tracker.deleteStaleHashes(5)
checkDeadline('after delete stale hashes')
const { xRefEntries, startXRefTable } = await parseXrefTable(
filePath,
pdfSize
)
xRefEntries.sort((a, b) => {
return a.offset - b.offset
})
xRefEntries.forEach((obj, idx) => {
obj.idx = idx
})
checkDeadline('after parsing')
const uncompressedObjects = []
for (const object of xRefEntries) {
if (!object.uncompressed) {
continue
}
const nextObject = xRefEntries[object.idx + 1]
if (!nextObject) {
// Ignore this possible edge case.
// The last object should be part of the xRef table.
continue
} else {
object.endOffset = nextObject.offset
}
const size = object.endOffset - object.offset
object.size = size
if (size < pdfCachingMinChunkSize) {
continue
}
uncompressedObjects.push({ object, idx: uncompressedObjects.length })
}
checkDeadline('after finding uncompressed')
let timedOutErr = null
const contentRanges = []
const newContentRanges = []
const handle = await fs.promises.open(filePath)
try {
for (const { object, idx } of uncompressedObjects) {
let buffer = Buffer.alloc(object.size, 0)
const { bytesRead } = await handle.read(
buffer,
0,
object.size,
object.offset
)
checkDeadline('after read ' + idx)
if (bytesRead !== object.size) {
throw new OError('could not read full chunk', {
object,
bytesRead,
})
}
const idxObj = buffer.indexOf('obj')
if (idxObj > 100) {
throw new OError('objectId is too large', {
object,
idxObj,
})
}
const objectIdRaw = buffer.subarray(0, idxObj)
buffer = buffer.subarray(objectIdRaw.byteLength)
const hash = pdfStreamHash(buffer)
checkDeadline('after hash ' + idx)
const range = {
objectId: objectIdRaw.toString(),
start: object.offset + objectIdRaw.byteLength,
end: object.endOffset,
hash,
}
if (tracker.has(range.hash)) {
// Optimization: Skip writing of already seen hashes.
tracker.track(range)
contentRanges.push(range)
continue
}
await writePdfStream(contentDir, hash, buffer)
tracker.track(range)
contentRanges.push(range)
newContentRanges.push(range)
checkDeadline('after write ' + idx)
}
} catch (err) {
if (err instanceof TimedOutError) {
// Let the frontend use ranges that were processed so far.
timedOutErr = err
} else {
throw err
}
} finally {
await handle.close()
// Flush from both success and failure code path. This allows the next
// cycle to complete faster as it can use the already written ranges.
await tracker.flush()
}
return {
contentRanges,
newContentRanges,
reclaimedSpace,
startXRefTable,
overheadDeleteStaleHashes,
timedOutErr,
}
}
function getStatePath(contentDir) {
return Path.join(contentDir, '.state.v0.json')
}
class HashFileTracker {
constructor(contentDir, { hashAge = [], hashSize = [] }) {
this.contentDir = contentDir
this.hashAge = new Map(hashAge)
this.hashSize = new Map(hashSize)
}
static async from(contentDir) {
const statePath = getStatePath(contentDir)
let state = {}
try {
const blob = await fs.promises.readFile(statePath)
state = JSON.parse(blob)
} catch (e) {}
return new HashFileTracker(contentDir, state)
}
has(hash) {
return this.hashAge.has(hash)
}
track(range) {
if (!this.hashSize.has(range.hash)) {
this.hashSize.set(range.hash, range.end - range.start)
}
this.hashAge.set(range.hash, 0)
}
updateAge() {
for (const [hash, age] of this.hashAge) {
this.hashAge.set(hash, age + 1)
}
return this
}
findStale(maxAge) {
const stale = []
for (const [hash, age] of this.hashAge) {
if (age > maxAge) {
stale.push(hash)
}
}
return stale
}
async flush() {
const statePath = getStatePath(this.contentDir)
const blob = JSON.stringify({
hashAge: Array.from(this.hashAge.entries()),
hashSize: Array.from(this.hashSize.entries()),
})
const atomicWrite = statePath + '~'
try {
await fs.promises.writeFile(atomicWrite, blob)
} catch (err) {
try {
await fs.promises.unlink(atomicWrite)
} catch (e) {}
throw err
}
try {
await fs.promises.rename(atomicWrite, statePath)
} catch (err) {
try {
await fs.promises.unlink(atomicWrite)
} catch (e) {}
throw err
}
}
async deleteStaleHashes(n) {
const t0 = Date.now()
// delete any hash file older than N generations
const hashes = this.findStale(n)
let reclaimedSpace = 0
if (hashes.length === 0) {
return [reclaimedSpace, Date.now() - t0]
}
await promiseMapWithLimit(10, hashes, async hash => {
try {
await fs.promises.unlink(Path.join(this.contentDir, hash))
} catch (err) {
if (err?.code === 'ENOENT') {
// Ignore already deleted entries. The previous cleanup cycle may have
// been killed halfway through the deletion process, or before we
// flushed the state to disk.
} else {
throw err
}
}
this.hashAge.delete(hash)
reclaimedSpace += this.hashSize.get(hash)
this.hashSize.delete(hash)
})
return [reclaimedSpace, Date.now() - t0]
}
}
function pdfStreamHash(buffer) {
const hash = crypto.createHash('sha256')
hash.update(buffer)
return hash.digest('hex')
}
async function writePdfStream(dir, hash, buffer) {
const filename = Path.join(dir, hash)
const atomicWriteFilename = filename + '~'
try {
await fs.promises.writeFile(atomicWriteFilename, buffer)
await fs.promises.rename(atomicWriteFilename, filename)
} catch (err) {
try {
await fs.promises.unlink(atomicWriteFilename)
} catch (_) {}
throw err
}
}
function getMaxOverhead(compileTime) {
return Math.min(
// Adding 10s to a 40s compile time is OK.
// Adding 1s to a 3s compile time is OK.
Math.max(compileTime / 4, 1000),
// Adding 30s to a 120s compile time is not OK, limit to 10s.
Settings.pdfCachingMaxProcessingTime
)
}
function getDeadlineChecker(compileTime) {
const timeout = getMaxOverhead(compileTime)
const deadline = Date.now() + timeout
let lastStage = { stage: 'start', now: Date.now() }
let completedStages = 0
return function (stage) {
const now = Date.now()
if (now > deadline) {
throw new TimedOutError(stage, {
timeout,
completedStages,
lastStage: lastStage.stage,
diffToLastStage: now - lastStage.now,
})
}
completedStages++
lastStage = { stage, now }
}
}
function promiseMapWithLimit(concurrency, array, fn) {
const limit = pLimit(concurrency)
return Promise.all(array.map(x => limit(() => fn(x))))
}
module.exports = {
HASH_REGEX: /^[0-9a-f]{64}$/,
update: callbackify(update),
promises: {
update,
updateSameEventLoop,
},
}
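
A minimal sketch of driving one caching pass over a freshly compiled PDF; the directories, sizes and compile time are hypothetical, and pdfCachingMinChunkSize would normally come from settings:

const { promises } = require('./ContentCacheManager')

async function pdfCachingExample() {
  const { contentRanges, newContentRanges, reclaimedSpace, startXRefTable } =
    await promises.update({
      contentDir: '/overleaf/services/clsi/output/projectId-userId/content/contentId', // hypothetical
      filePath: '/overleaf/services/clsi/output/projectId-userId/generated-files/buildId/output.pdf', // hypothetical
      pdfSize: 2 * 1024 * 1024, // bytes
      pdfCachingMinChunkSize: 1024, // per-range threshold, normally from Settings
      compileTime: 8000, // ms, used to budget the hashing deadline
    })
  console.log({
    ranges: contentRanges.length,
    newRanges: newContentRanges.length,
    reclaimedSpace,
    startXRefTable,
  })
}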

View File

@@ -0,0 +1,146 @@
const logger = require('@overleaf/logger')
const Metrics = require('./Metrics')
const os = require('node:os')
let CACHED_LOAD = {
expires: -1,
load: [0, 0, 0],
}
function getSystemLoad() {
if (CACHED_LOAD.expires < Date.now()) {
CACHED_LOAD = {
expires: Date.now() + 10 * 1000,
load: os.loadavg(),
}
}
return CACHED_LOAD.load
}
const ONE_MB = 1024 * 1024
function emitPdfStats(stats, timings, request) {
if (timings['compute-pdf-caching']) {
emitPdfCachingStats(stats, timings, request)
} else {
// How much bandwidth will the pdf incur when downloaded in full?
Metrics.summary('pdf-bandwidth', stats['pdf-size'], request.metricsOpts)
}
}
function emitPdfCachingStats(stats, timings, request) {
if (!stats['pdf-size']) return // double check
if (stats['pdf-caching-timed-out']) {
Metrics.inc('pdf-caching-timed-out', 1, request.metricsOpts)
}
if (timings['pdf-caching-overhead-delete-stale-hashes'] !== undefined) {
Metrics.summary(
'pdf-caching-overhead-delete-stale-hashes',
timings['pdf-caching-overhead-delete-stale-hashes'],
request.metricsOpts
)
}
// How much extra time did we spend in PDF.js?
Metrics.timing(
'compute-pdf-caching',
timings['compute-pdf-caching'],
1,
request.metricsOpts
)
// How large is the overhead of hashing up-front?
const fraction =
timings.compileE2E - timings['compute-pdf-caching'] !== 0
? timings.compileE2E /
(timings.compileE2E - timings['compute-pdf-caching'])
: 1
if (fraction > 1.5 && timings.compileE2E > 10 * 1000) {
logger.warn(
{
stats,
timings,
load: getSystemLoad(),
},
'slow pdf caching'
)
}
Metrics.summary(
'overhead-compute-pdf-ranges',
fraction * 100 - 100,
request.metricsOpts
)
// How does the hashing scale to pdf size in MB?
Metrics.timing(
'compute-pdf-caching-relative-to-pdf-size',
timings['compute-pdf-caching'] / (stats['pdf-size'] / ONE_MB),
1,
request.metricsOpts
)
if (stats['pdf-caching-total-ranges-size']) {
// How does the hashing scale to total ranges size in MB?
Metrics.timing(
'compute-pdf-caching-relative-to-total-ranges-size',
timings['compute-pdf-caching'] /
(stats['pdf-caching-total-ranges-size'] / ONE_MB),
1,
request.metricsOpts
)
// How fast is the hashing per range on average?
Metrics.timing(
'compute-pdf-caching-relative-to-ranges-count',
timings['compute-pdf-caching'] / stats['pdf-caching-n-ranges'],
1,
request.metricsOpts
)
// How many ranges are new?
Metrics.summary(
'new-pdf-ranges-relative-to-total-ranges',
(stats['pdf-caching-n-new-ranges'] / stats['pdf-caching-n-ranges']) * 100,
request.metricsOpts
)
}
// How much content is cacheable?
Metrics.summary(
'cacheable-ranges-to-pdf-size',
(stats['pdf-caching-total-ranges-size'] / stats['pdf-size']) * 100,
request.metricsOpts
)
const sizeWhenDownloadedInFull =
// All of the pdf
stats['pdf-size'] -
// These ranges are potentially cached.
stats['pdf-caching-total-ranges-size'] +
// These ranges are not cached.
stats['pdf-caching-new-ranges-size']
// How much bandwidth can we save when downloading the pdf in full?
Metrics.summary(
'pdf-bandwidth-savings',
100 - (sizeWhenDownloadedInFull / stats['pdf-size']) * 100,
request.metricsOpts
)
// How much bandwidth will the pdf incur when downloaded in full?
Metrics.summary(
'pdf-bandwidth',
sizeWhenDownloadedInFull,
request.metricsOpts
)
// How much space do the ranges use?
// This will accumulate the ranges size over time, skipping already written ranges.
Metrics.summary(
'pdf-ranges-disk-size',
stats['pdf-caching-new-ranges-size'] - stats['pdf-caching-reclaimed-space'],
request.metricsOpts
)
}
module.exports = {
emitPdfStats,
}
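
A small worked example of the overhead fraction computed above, with hypothetical timings in milliseconds:

const timings = { compileE2E: 12000, 'compute-pdf-caching': 2000 }
const fraction = timings.compileE2E / (timings.compileE2E - timings['compute-pdf-caching'])
// fraction === 1.2, i.e. hashing added 20% on top of the compile itself; only when
// fraction exceeds 1.5 and compileE2E exceeds 10s would the module log 'slow pdf caching'.
console.log(`overhead: ${(fraction * 100 - 100).toFixed(1)}%`) // "overhead: 20.0%"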

View File

@@ -0,0 +1,4 @@
const workerpool = require('workerpool')
const ContentCacheManager = require('./ContentCacheManager')
workerpool.worker(ContentCacheManager.promises)

View File

@@ -0,0 +1,24 @@
const Path = require('node:path')
const send = require('send')
const Settings = require('@overleaf/settings')
const OutputCacheManager = require('./OutputCacheManager')
const ONE_DAY_S = 24 * 60 * 60
const ONE_DAY_MS = ONE_DAY_S * 1000
function getPdfRange(req, res, next) {
const { projectId, userId, contentId, hash } = req.params
const perUserDir = userId ? `${projectId}-${userId}` : projectId
const path = Path.join(
Settings.path.outputDir,
perUserDir,
OutputCacheManager.CONTENT_SUBDIR,
contentId,
hash
)
res.setHeader('cache-control', `public, max-age=${ONE_DAY_S}`)
res.setHeader('expires', new Date(Date.now() + ONE_DAY_MS).toUTCString())
send(req, path).pipe(res)
}
module.exports = { getPdfRange }
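
getPdfRange serves a single hash-named range blob with one-day caching headers; a wiring sketch with hypothetical route shapes (the param names must match those read above, but the paths are assumptions):

const express = require('express')
const ContentController = require('./ContentController')

const app = express()
app.get('/project/:projectId/content/:contentId/:hash', ContentController.getPdfRange)
app.get('/project/:projectId/user/:userId/content/:contentId/:hash', ContentController.getPdfRange)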

View File

@@ -0,0 +1,38 @@
/* eslint-disable
no-unused-vars,
*/
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
let ContentTypeMapper
const Path = require('node:path')
// here we coerce html, css and js to text/plain,
// otherwise choose correct mime type based on file extension,
// falling back to octet-stream
module.exports = ContentTypeMapper = {
map(path) {
switch (Path.extname(path)) {
case '.txt':
case '.html':
case '.js':
case '.css':
case '.svg':
return 'text/plain'
case '.csv':
return 'text/csv'
case '.pdf':
return 'application/pdf'
case '.png':
return 'image/png'
case '.jpg':
case '.jpeg':
return 'image/jpeg'
case '.tiff':
return 'image/tiff'
case '.gif':
return 'image/gif'
default:
return 'application/octet-stream'
}
},
}
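
A few hypothetical lookups showing the coercion described in the comment above:

const ContentTypeMapper = require('./ContentTypeMapper')

console.log(ContentTypeMapper.map('output.pdf')) // 'application/pdf'
console.log(ContentTypeMapper.map('main.js')) // 'text/plain' (scripts coerced to plain text)
console.log(ContentTypeMapper.map('data.bin')) // 'application/octet-stream' (unknown extension)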

View File

@@ -0,0 +1,110 @@
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
/*
* decaffeinate suggestions:
* DS101: Remove unnecessary use of Array.from
* DS102: Remove unnecessary code created because of implicit returns
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let LockManager
const logger = require('@overleaf/logger')
const LockState = {} // locks for docker container operations, by container name
module.exports = LockManager = {
MAX_LOCK_HOLD_TIME: 15000, // how long we can keep a lock
MAX_LOCK_WAIT_TIME: 10000, // how long we wait for a lock
LOCK_TEST_INTERVAL: 1000, // retry time
tryLock(key, callback) {
let lockValue
if (callback == null) {
callback = function () {}
}
const existingLock = LockState[key]
if (existingLock != null) {
// the lock is already taken, check how old it is
const lockAge = Date.now() - existingLock.created
if (lockAge < LockManager.MAX_LOCK_HOLD_TIME) {
return callback(null, false) // we didn't get the lock, bail out
} else {
logger.error(
{ key, lock: existingLock, age: lockAge },
'taking old lock by force'
)
}
}
// take the lock
LockState[key] = lockValue = { created: Date.now() }
return callback(null, true, lockValue)
},
getLock(key, callback) {
let attempt
if (callback == null) {
callback = function () {}
}
const startTime = Date.now()
return (attempt = () =>
LockManager.tryLock(key, function (error, gotLock, lockValue) {
if (error != null) {
return callback(error)
}
if (gotLock) {
return callback(null, lockValue)
} else if (Date.now() - startTime > LockManager.MAX_LOCK_WAIT_TIME) {
const e = new Error('Lock timeout')
e.key = key
return callback(e)
} else {
return setTimeout(attempt, LockManager.LOCK_TEST_INTERVAL)
}
}))()
},
releaseLock(key, lockValue, callback) {
if (callback == null) {
callback = function () {}
}
const existingLock = LockState[key]
if (existingLock === lockValue) {
// lockValue is an object, so we can test by reference
delete LockState[key] // our lock, so we can free it
return callback()
} else if (existingLock != null) {
// lock exists but doesn't match ours
logger.error(
{ key, lock: existingLock },
'tried to release lock taken by force'
)
return callback()
} else {
logger.error(
{ key, lock: existingLock },
'tried to release lock that has gone'
)
return callback()
}
},
runWithLock(key, runner, callback) {
if (callback == null) {
callback = function () {}
}
return LockManager.getLock(key, function (error, lockValue) {
if (error != null) {
return callback(error)
}
return runner((error1, ...args) =>
LockManager.releaseLock(key, lockValue, function (error2) {
error = error1 || error2
if (error != null) {
return callback(error)
}
return callback(null, ...Array.from(args))
})
)
})
},
}
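
A minimal sketch of the runner pattern runWithLock expects: the runner receives a release callback and must call it exactly once, forwarding any error and results to the final callback; the lock key and the unit of work are hypothetical:

const LockManager = require('./DockerLockManager')

// Hypothetical unit of work guarded by the lock.
function doContainerWork(cb) {
  setTimeout(() => cb(null, 'done'), 10)
}

LockManager.runWithLock(
  'project-507f1f77bcf86cd799439011-abcdef', // hypothetical container name as lock key
  releaseLock => doContainerWork((err, output) => releaseLock(err, output)),
  (err, output) => {
    if (err) {
      console.error('container work failed', err)
    } else {
      console.log('container work finished', output)
    }
  }
)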

View File

@@ -0,0 +1,597 @@
const { promisify } = require('node:util')
const Settings = require('@overleaf/settings')
const logger = require('@overleaf/logger')
const Docker = require('dockerode')
const dockerode = new Docker()
const crypto = require('node:crypto')
const async = require('async')
const LockManager = require('./DockerLockManager')
const Path = require('node:path')
const _ = require('lodash')
const ONE_HOUR_IN_MS = 60 * 60 * 1000
logger.debug('using docker runner')
let containerMonitorTimeout
let containerMonitorInterval
const DockerRunner = {
run(
projectId,
command,
directory,
image,
timeout,
environment,
compileGroup,
callback
) {
command = command.map(arg =>
arg.toString().replace('$COMPILE_DIR', '/compile')
)
if (image == null) {
image = Settings.clsi.docker.image
}
if (
Settings.clsi.docker.allowedImages &&
!Settings.clsi.docker.allowedImages.includes(image)
) {
return callback(new Error('image not allowed'))
}
if (Settings.texliveImageNameOveride != null) {
const img = image.split('/')
image = `${Settings.texliveImageNameOveride}/${img[2]}`
}
if (compileGroup === 'synctex-output') {
// In: directory = '/overleaf/services/clsi/output/projectId-userId/generated-files/buildId'
// directory.split('/').slice(-3) === 'projectId-userId/generated-files/buildId'
// sandboxedCompilesHostDirOutput = '/host/output'
// Out: directory = '/host/output/projectId-userId/generated-files/buildId'
directory = Path.join(
Settings.path.sandboxedCompilesHostDirOutput,
...directory.split('/').slice(-3)
)
} else {
// In: directory = '/overleaf/services/clsi/compiles/projectId-userId'
// Path.basename(directory) === 'projectId-userId'
// sandboxedCompilesHostDirCompiles = '/host/compiles'
// Out: directory = '/host/compiles/projectId-userId'
directory = Path.join(
Settings.path.sandboxedCompilesHostDirCompiles,
Path.basename(directory)
)
}
const volumes = { [directory]: '/compile' }
if (
compileGroup === 'synctex' ||
compileGroup === 'synctex-output' ||
compileGroup === 'wordcount'
) {
volumes[directory] += ':ro'
}
const options = DockerRunner._getContainerOptions(
command,
image,
volumes,
timeout,
environment,
compileGroup
)
const fingerprint = DockerRunner._fingerprintContainer(options)
const name = `project-${projectId}-${fingerprint}`
options.name = name
// logOptions = _.clone(options)
// logOptions?.HostConfig?.SecurityOpt = "secomp used, removed in logging"
logger.debug({ projectId }, 'running docker container')
DockerRunner._runAndWaitForContainer(
options,
volumes,
timeout,
(error, output) => {
if (error && error.statusCode === 500) {
logger.debug(
{ err: error, projectId },
'error running container so destroying and retrying'
)
DockerRunner.destroyContainer(name, null, true, error => {
if (error != null) {
return callback(error)
}
DockerRunner._runAndWaitForContainer(
options,
volumes,
timeout,
callback
)
})
} else {
callback(error, output)
}
}
)
// pass back the container name to allow it to be killed
return name
},
kill(containerId, callback) {
logger.debug({ containerId }, 'sending kill signal to container')
const container = dockerode.getContainer(containerId)
container.kill(error => {
if (
error != null &&
error.message != null &&
error.message.match(/Cannot kill container .* is not running/)
) {
logger.warn(
{ err: error, containerId },
'container not running, continuing'
)
error = null
}
if (error != null) {
logger.error({ err: error, containerId }, 'error killing container')
callback(error)
} else {
callback()
}
})
},
_runAndWaitForContainer(options, volumes, timeout, _callback) {
const callback = _.once(_callback)
const { name } = options
let streamEnded = false
let containerReturned = false
let output = {}
function callbackIfFinished() {
if (streamEnded && containerReturned) {
callback(null, output)
}
}
function attachStreamHandler(error, _output) {
if (error != null) {
return callback(error)
}
output = _output
streamEnded = true
callbackIfFinished()
}
DockerRunner.startContainer(
options,
volumes,
attachStreamHandler,
(error, containerId) => {
if (error != null) {
return callback(error)
}
DockerRunner.waitForContainer(name, timeout, (error, exitCode) => {
if (error != null) {
return callback(error)
}
if (exitCode === 137) {
// exit status from kill -9
const err = new Error('terminated')
err.terminated = true
return callback(err)
}
if (exitCode === 1) {
// exit status from chktex
const err = new Error('exited')
err.code = exitCode
return callback(err)
}
containerReturned = true
if (options != null && options.HostConfig != null) {
options.HostConfig.SecurityOpt = null
}
logger.debug({ exitCode, options }, 'docker container has exited')
callbackIfFinished()
})
}
)
},
_getContainerOptions(
command,
image,
volumes,
timeout,
environment,
compileGroup
) {
const timeoutInSeconds = timeout / 1000
const dockerVolumes = {}
for (const hostVol in volumes) {
const dockerVol = volumes[hostVol]
dockerVolumes[dockerVol] = {}
if (volumes[hostVol].slice(-3).indexOf(':r') === -1) {
volumes[hostVol] = `${dockerVol}:rw`
}
}
// merge settings and environment parameter
const env = {}
for (const src of [Settings.clsi.docker.env, environment || {}]) {
for (const key in src) {
const value = src[key]
env[key] = value
}
}
// set the path based on the image year
const match = image.match(/:([0-9]+)\.[0-9]+/)
const year = match ? match[1] : '2014'
env.PATH = `/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/texlive/${year}/bin/x86_64-linux/`
const options = {
Cmd: command,
Image: image,
Volumes: dockerVolumes,
WorkingDir: '/compile',
NetworkDisabled: true,
Memory: 1024 * 1024 * 1024 * 1024, // 1 TiB (effectively unlimited)
User: Settings.clsi.docker.user,
Env: Object.entries(env).map(([key, value]) => `${key}=${value}`),
HostConfig: {
Binds: Object.entries(volumes).map(
([hostVol, dockerVol]) => `${hostVol}:${dockerVol}`
),
LogConfig: { Type: 'none', Config: {} },
Ulimits: [
{
Name: 'cpu',
Soft: timeoutInSeconds + 5,
Hard: timeoutInSeconds + 10,
},
],
CapDrop: 'ALL',
SecurityOpt: ['no-new-privileges'],
},
}
if (Settings.clsi.docker.seccomp_profile != null) {
options.HostConfig.SecurityOpt.push(
`seccomp=${Settings.clsi.docker.seccomp_profile}`
)
}
if (Settings.clsi.docker.apparmor_profile != null) {
options.HostConfig.SecurityOpt.push(
`apparmor=${Settings.clsi.docker.apparmor_profile}`
)
}
if (Settings.clsi.docker.runtime) {
options.HostConfig.Runtime = Settings.clsi.docker.runtime
}
if (Settings.clsi.docker.Readonly) {
options.HostConfig.ReadonlyRootfs = true
options.HostConfig.Tmpfs = { '/tmp': 'rw,noexec,nosuid,size=65536k' }
options.Volumes['/home/tex'] = {}
}
// Allow per-compile group overriding of individual settings
if (
Settings.clsi.docker.compileGroupConfig &&
Settings.clsi.docker.compileGroupConfig[compileGroup]
) {
const override = Settings.clsi.docker.compileGroupConfig[compileGroup]
for (const key in override) {
_.set(options, key, override[key])
}
}
return options
},
_fingerprintContainer(containerOptions) {
// Yay, Hashing!
const json = JSON.stringify(containerOptions)
return crypto.createHash('md5').update(json).digest('hex')
},
startContainer(options, volumes, attachStreamHandler, callback) {
LockManager.runWithLock(
options.name,
releaseLock =>
DockerRunner._startContainer(
options,
volumes,
attachStreamHandler,
releaseLock
),
callback
)
},
// Check that volumes exist and are directories
_startContainer(options, volumes, attachStreamHandler, callback) {
callback = _.once(callback)
const { name } = options
logger.debug({ containerName: name }, 'starting container')
const container = dockerode.getContainer(name)
function createAndStartContainer() {
dockerode.createContainer(options, (error, container) => {
if (error != null) {
return callback(error)
}
startExistingContainer()
})
}
function startExistingContainer() {
DockerRunner.attachToContainer(
options.name,
attachStreamHandler,
error => {
if (error != null) {
return callback(error)
}
container.start(error => {
if (error != null && error.statusCode !== 304) {
callback(error)
} else {
// already running
callback()
}
})
}
)
}
container.inspect((error, stats) => {
if (error != null && error.statusCode === 404) {
createAndStartContainer()
} else if (error != null) {
logger.err(
{ containerName: name, error },
'unable to inspect container to start'
)
callback(error)
} else {
startExistingContainer()
}
})
},
attachToContainer(containerId, attachStreamHandler, attachStartCallback) {
const container = dockerode.getContainer(containerId)
container.attach({ stdout: 1, stderr: 1, stream: 1 }, (error, stream) => {
if (error != null) {
logger.error(
{ err: error, containerId },
'error attaching to container'
)
return attachStartCallback(error)
} else {
attachStartCallback()
}
logger.debug({ containerId }, 'attached to container')
const MAX_OUTPUT = 1024 * 1024 * 2 // limit output to 2MB
function createStringOutputStream(name) {
return {
data: '',
overflowed: false,
write(data) {
if (this.overflowed) {
return
}
if (this.data.length < MAX_OUTPUT) {
this.data += data
} else {
logger.info(
{
containerId,
length: this.data.length,
maxLen: MAX_OUTPUT,
},
`${name} exceeds max size`
)
this.data += `(...truncated at ${MAX_OUTPUT} chars...)`
this.overflowed = true
}
},
          // kill container if too much output
          // e.g. dockerode.getContainer(containerId).kill(() => {})
}
}
const stdout = createStringOutputStream('stdout')
const stderr = createStringOutputStream('stderr')
container.modem.demuxStream(stream, stdout, stderr)
stream.on('error', err =>
logger.error(
{ err, containerId },
'error reading from container stream'
)
)
stream.on('end', () =>
attachStreamHandler(null, { stdout: stdout.data, stderr: stderr.data })
)
})
},
waitForContainer(containerId, timeout, _callback) {
const callback = _.once(_callback)
const container = dockerode.getContainer(containerId)
let timedOut = false
const timeoutId = setTimeout(() => {
timedOut = true
logger.debug({ containerId }, 'timeout reached, killing container')
      container.kill(err => {
        if (err) {
          logger.warn({ err, containerId }, 'failed to kill container')
        }
      })
}, timeout)
logger.debug({ containerId }, 'waiting for docker container')
container.wait((error, res) => {
if (error != null) {
clearTimeout(timeoutId)
logger.warn({ err: error, containerId }, 'error waiting for container')
return callback(error)
}
if (timedOut) {
logger.debug({ containerId }, 'docker container timed out')
error = new Error('container timed out')
error.timedout = true
callback(error)
} else {
clearTimeout(timeoutId)
logger.debug(
{ containerId, exitCode: res.StatusCode },
'docker container returned'
)
callback(null, res.StatusCode)
}
})
},
destroyContainer(containerName, containerId, shouldForce, callback) {
// We want the containerName for the lock and, ideally, the
// containerId to delete. There is a bug in the docker.io module
// where if you delete by name and there is an error, it throws an
// async exception, but if you delete by id it just does a normal
// error callback. We fall back to deleting by name if no id is
// supplied.
LockManager.runWithLock(
containerName,
releaseLock =>
DockerRunner._destroyContainer(
containerId || containerName,
shouldForce,
releaseLock
),
callback
)
},
_destroyContainer(containerId, shouldForce, callback) {
logger.debug({ containerId }, 'destroying docker container')
const container = dockerode.getContainer(containerId)
container.remove({ force: shouldForce === true, v: true }, error => {
if (error != null && error.statusCode === 404) {
logger.warn(
{ err: error, containerId },
'container not found, continuing'
)
error = null
}
if (error != null) {
logger.error({ err: error, containerId }, 'error destroying container')
} else {
logger.debug({ containerId }, 'destroyed container')
}
callback(error)
})
},
// handle expiry of docker containers
MAX_CONTAINER_AGE: Settings.clsi.docker.maxContainerAge || ONE_HOUR_IN_MS,
examineOldContainer(container, callback) {
const name = container.Name || (container.Names && container.Names[0])
const created = container.Created * 1000 // creation time is returned in seconds
const now = Date.now()
const age = now - created
const maxAge = DockerRunner.MAX_CONTAINER_AGE
const ttl = maxAge - age
logger.debug(
{ containerName: name, created, now, age, maxAge, ttl },
'checking whether to destroy container'
)
return { name, id: container.Id, ttl }
},
destroyOldContainers(callback) {
dockerode.listContainers({ all: true }, (error, containers) => {
if (error != null) {
return callback(error)
}
const jobs = []
for (const container of containers) {
const { name, id, ttl } = DockerRunner.examineOldContainer(container)
if (name.slice(0, 9) === '/project-' && ttl <= 0) {
// strip the / prefix
// the LockManager uses the plain container name
const plainName = name.slice(1)
jobs.push(cb =>
DockerRunner.destroyContainer(plainName, id, false, () => cb())
)
}
}
// Ignore errors because some containers get stuck but
// will be destroyed next time
async.series(jobs, callback)
})
},
startContainerMonitor() {
logger.debug(
{ maxAge: DockerRunner.MAX_CONTAINER_AGE },
'starting container expiry'
)
// guarantee only one monitor is running
DockerRunner.stopContainerMonitor()
// randomise the start time
const randomDelay = Math.floor(Math.random() * 5 * 60 * 1000)
containerMonitorTimeout = setTimeout(() => {
containerMonitorInterval = setInterval(
() =>
DockerRunner.destroyOldContainers(err => {
if (err) {
logger.error({ err }, 'failed to destroy old containers')
}
}),
ONE_HOUR_IN_MS
)
}, randomDelay)
},
stopContainerMonitor() {
if (containerMonitorTimeout) {
clearTimeout(containerMonitorTimeout)
containerMonitorTimeout = undefined
}
if (containerMonitorInterval) {
clearInterval(containerMonitorInterval)
containerMonitorInterval = undefined
}
},
canRunSyncTeXInOutputDir() {
return Boolean(Settings.path.sandboxedCompilesHostDirOutput)
},
}
DockerRunner.startContainerMonitor()
module.exports = DockerRunner
module.exports.promises = {
run: promisify(DockerRunner.run),
kill: promisify(DockerRunner.kill),
}
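
A minimal usage sketch of the promisified API above (not part of the module). The relative require path, image tag and compile group are assumptions for illustration; the $COMPILE_DIR placeholder is the same one substituted by the runner for the bind-mounted compile directory.

const DockerRunner = require('./DockerRunner') // assumed path

async function compileOnce(projectId, compileDir) {
  const command = ['latexmk', '-pdf', '-outdir=$COMPILE_DIR', '$COMPILE_DIR/main.tex']
  const image = 'texlive/texlive:2023.1' // assumed image tag; must be in allowedImages if that is configured
  const timeout = 60 * 1000 // milliseconds; also drives the CPU ulimits above
  // Resolves with the output captured from the container (stdout/stderr).
  return DockerRunner.promises.run(
    projectId,
    command,
    compileDir, // bind-mounted as /compile inside the container
    image,
    timeout,
    {}, // extra environment variables
    'standard' // compileGroup, used for the per-group overrides above
  )
}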

View File

@@ -0,0 +1,24 @@
const fsPromises = require('node:fs/promises')
const { callbackify } = require('node:util')
const logger = require('@overleaf/logger')
async function injectDraftMode(filename) {
const content = await fsPromises.readFile(filename, { encoding: 'utf8' })
const modifiedContent =
'\\PassOptionsToPackage{draft}{graphicx}\\PassOptionsToPackage{draft}{graphics}' +
content
logger.debug(
{
      content: content.slice(0, 1024), // \documentclass is normally very near the top
modifiedContent: modifiedContent.slice(0, 1024),
filename,
},
'injected draft class'
)
await fsPromises.writeFile(filename, modifiedContent, { encoding: 'utf8' })
}
module.exports = {
injectDraftMode: callbackify(injectDraftMode),
promises: { injectDraftMode },
}
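
A small usage sketch (the require path and file location are assumptions): after injectDraftMode runs, the main file starts with the two \PassOptionsToPackage commands, so graphicx/graphics render placeholder boxes during draft compiles.

const DraftModeManager = require('./DraftModeManager') // assumed path

async function enableDraftCompile(compileDir) {
  // Before: main.tex starts with e.g. '\documentclass{article}...'
  await DraftModeManager.promises.injectDraftMode(`${compileDir}/main.tex`)
  // After: the same file starts with
  // '\PassOptionsToPackage{draft}{graphicx}\PassOptionsToPackage{draft}{graphics}\documentclass{article}...'
}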

View File

@@ -0,0 +1,49 @@
/* eslint-disable
no-proto,
no-unused-vars,
*/
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
const OError = require('@overleaf/o-error')
let Errors
function NotFoundError(message) {
const error = new Error(message)
error.name = 'NotFoundError'
error.__proto__ = NotFoundError.prototype
return error
}
NotFoundError.prototype.__proto__ = Error.prototype
function FilesOutOfSyncError(message) {
const error = new Error(message)
error.name = 'FilesOutOfSyncError'
error.__proto__ = FilesOutOfSyncError.prototype
return error
}
FilesOutOfSyncError.prototype.__proto__ = Error.prototype
function AlreadyCompilingError(message) {
const error = new Error(message)
error.name = 'AlreadyCompilingError'
error.__proto__ = AlreadyCompilingError.prototype
return error
}
AlreadyCompilingError.prototype.__proto__ = Error.prototype
class QueueLimitReachedError extends OError {}
class TimedOutError extends OError {}
class NoXrefTableError extends OError {}
class TooManyCompileRequestsError extends OError {}
class InvalidParameter extends OError {}
module.exports = Errors = {
QueueLimitReachedError,
TimedOutError,
NotFoundError,
FilesOutOfSyncError,
AlreadyCompilingError,
NoXrefTableError,
TooManyCompileRequestsError,
InvalidParameter,
}
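
A sketch of how a caller might branch on these error types. The HTTP status mapping is an assumption for illustration; the only mapping suggested elsewhere in this service is FilesOutOfSyncError as a 409 Conflict.

const Errors = require('./Errors') // assumed path

function statusForCompileError(error) {
  if (error instanceof Errors.AlreadyCompilingError) return 423 // compile already in progress
  if (error instanceof Errors.TooManyCompileRequestsError) return 503 // concurrency limit reached
  if (error instanceof Errors.FilesOutOfSyncError) return 409 // incremental sync state mismatch
  if (error instanceof Errors.NotFoundError) return 404
  return 500
}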

View File

@@ -0,0 +1,203 @@
const Path = require('node:path')
const { promisify } = require('node:util')
const Settings = require('@overleaf/settings')
const logger = require('@overleaf/logger')
const CommandRunner = require('./CommandRunner')
const fs = require('node:fs')
const ProcessTable = {} // table of currently running jobs (pids or docker container names)
const TIME_V_METRICS = Object.entries({
'cpu-percent': /Percent of CPU this job got: (\d+)/m,
'cpu-time': /User time.*: (\d+.\d+)/m,
'sys-time': /System time.*: (\d+.\d+)/m,
})
const COMPILER_FLAGS = {
latex: '-pdfdvi',
lualatex: '-lualatex',
pdflatex: '-pdf',
xelatex: '-xelatex',
}
function runLatex(projectId, options, callback) {
const {
directory,
mainFile,
image,
environment,
flags,
compileGroup,
stopOnFirstError,
stats,
timings,
} = options
const compiler = options.compiler || 'pdflatex'
const timeout = options.timeout || 60000 // milliseconds
logger.debug(
{
directory,
compiler,
timeout,
mainFile,
environment,
flags,
compileGroup,
stopOnFirstError,
},
'starting compile'
)
let command
try {
command = _buildLatexCommand(mainFile, {
compiler,
stopOnFirstError,
flags,
})
} catch (err) {
return callback(err)
}
const id = `${projectId}` // record running project under this id
ProcessTable[id] = CommandRunner.run(
projectId,
command,
directory,
image,
timeout,
environment,
compileGroup,
function (error, output) {
delete ProcessTable[id]
if (error) {
return callback(error)
}
const runs =
output?.stderr?.match(/^Run number \d+ of .*latex/gm)?.length || 0
const failed = output?.stdout?.match(/^Latexmk: Errors/m) != null ? 1 : 0
// counters from latexmk output
stats['latexmk-errors'] = failed
stats['latex-runs'] = runs
stats['latex-runs-with-errors'] = failed ? runs : 0
stats[`latex-runs-${runs}`] = 1
stats[`latex-runs-with-errors-${runs}`] = failed ? 1 : 0
// timing information from /usr/bin/time
const stderr = (output && output.stderr) || ''
if (stderr.includes('Command being timed:')) {
// Add metrics for runs with `$ time -v ...`
for (const [timing, matcher] of TIME_V_METRICS) {
const match = stderr.match(matcher)
if (match) {
timings[timing] = parseFloat(match[1])
}
}
}
// record output files
_writeLogOutput(projectId, directory, output, () => {
callback(error, output)
})
}
)
}
function _writeLogOutput(projectId, directory, output, callback) {
if (!output) {
return callback()
}
// internal method for writing non-empty log files
function _writeFile(file, content, cb) {
if (content && content.length > 0) {
fs.unlink(file, () => {
fs.writeFile(file, content, { flag: 'wx' }, err => {
if (err) {
// don't fail on error
logger.error({ err, projectId, file }, 'error writing log file')
}
cb()
})
})
} else {
cb()
}
}
// write stdout and stderr, ignoring errors
_writeFile(Path.join(directory, 'output.stdout'), output.stdout, () => {
_writeFile(Path.join(directory, 'output.stderr'), output.stderr, () => {
callback()
})
})
}
function killLatex(projectId, callback) {
const id = `${projectId}`
logger.debug({ id }, 'killing running compile')
if (ProcessTable[id] == null) {
logger.warn({ id }, 'no such project to kill')
callback(null)
} else {
CommandRunner.kill(ProcessTable[id], callback)
}
}
function _buildLatexCommand(mainFile, opts = {}) {
const command = []
if (Settings.clsi?.strace) {
command.push('strace', '-o', 'strace', '-ff')
}
if (Settings.clsi?.latexmkCommandPrefix) {
command.push(...Settings.clsi.latexmkCommandPrefix)
}
// Basic command and flags
command.push(
'latexmk',
'-cd',
'-jobname=output',
'-auxdir=$COMPILE_DIR',
'-outdir=$COMPILE_DIR',
'-synctex=1',
'-interaction=batchmode'
)
// Stop on first error option
if (opts.stopOnFirstError) {
command.push('-halt-on-error')
} else {
// Run all passes despite errors
command.push('-f')
}
// Extra flags
if (opts.flags) {
command.push(...opts.flags)
}
// TeX Engine selection
const compilerFlag = COMPILER_FLAGS[opts.compiler]
if (compilerFlag) {
command.push(compilerFlag)
} else {
throw new Error(`unknown compiler: ${opts.compiler}`)
}
// We want to run latexmk on the tex file which we will automatically
// generate from the Rtex/Rmd/md file.
mainFile = mainFile.replace(/\.(Rtex|md|Rmd|Rnw)$/, '.tex')
command.push(Path.join('$COMPILE_DIR', mainFile))
return command
}
module.exports = {
runLatex,
killLatex,
promises: {
runLatex: promisify(runLatex),
killLatex: promisify(killLatex),
},
}
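
With the defaults above (pdflatex, no extra flags, stopOnFirstError off, and no strace or latexmk command prefix configured), the generated command is latexmk -cd -jobname=output -auxdir=$COMPILE_DIR -outdir=$COMPILE_DIR -synctex=1 -interaction=batchmode -f -pdf $COMPILE_DIR/main.tex. A minimal sketch of driving the promise API follows; the require path and image tag are assumptions.

const LatexRunner = require('./LatexRunner') // assumed path

async function compileProject(projectId, compileDir) {
  const stats = {} // populated with latexmk-errors, latex-runs, ...
  const timings = {} // populated with cpu-time/sys-time when /usr/bin/time -v output is present
  const output = await LatexRunner.promises.runLatex(projectId, {
    directory: compileDir,
    mainFile: 'main.tex',
    compiler: 'pdflatex',
    image: 'texlive/texlive:2023.1', // assumed image tag
    timeout: 120 * 1000, // milliseconds
    environment: {},
    flags: [],
    compileGroup: 'standard',
    stopOnFirstError: false,
    stats,
    timings,
  })
  return { output, stats, timings }
}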

View File

@@ -0,0 +1,111 @@
/* eslint-disable
no-return-assign,
no-unused-vars,
*/
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
/*
* decaffeinate suggestions:
* DS101: Remove unnecessary use of Array.from
* DS102: Remove unnecessary code created because of implicit returns
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let CommandRunner
const { spawn } = require('node:child_process')
const { promisify } = require('node:util')
const _ = require('lodash')
const logger = require('@overleaf/logger')
logger.debug('using standard command runner')
module.exports = CommandRunner = {
run(
projectId,
command,
directory,
image,
timeout,
environment,
compileGroup,
callback
) {
let key, value
callback = _.once(callback)
command = Array.from(command).map(arg =>
arg.toString().replace('$COMPILE_DIR', directory)
)
logger.debug({ projectId, command, directory }, 'running command')
logger.warn('timeouts and sandboxing are not enabled with CommandRunner')
// merge environment settings
const env = {}
for (key in process.env) {
value = process.env[key]
env[key] = value
}
for (key in environment) {
value = environment[key]
env[key] = value
}
    // run command as a detached process so it has its own process group (which can be killed if needed)
    const proc = spawn(command[0], command.slice(1), {
      cwd: directory,
      env,
      detached: true, // own process group, so kill() below can signal the whole group via -pid
      stdio: ['pipe', 'pipe', 'ignore'],
    })
let stdout = ''
proc.stdout.setEncoding('utf8').on('data', data => (stdout += data))
proc.on('error', function (err) {
logger.err(
{ err, projectId, command, directory },
'error running command'
)
return callback(err)
})
proc.on('close', function (code, signal) {
let err
logger.debug({ code, signal, projectId }, 'command exited')
if (signal === 'SIGTERM') {
// signal from kill method below
err = new Error('terminated')
err.terminated = true
return callback(err)
} else if (code === 1) {
// exit status from chktex
err = new Error('exited')
err.code = code
return callback(err)
} else {
return callback(null, { stdout })
}
})
return proc.pid
}, // return process id to allow job to be killed if necessary
kill(pid, callback) {
if (callback == null) {
callback = function () {}
}
try {
process.kill(-pid) // kill all processes in group
} catch (err) {
return callback(err)
}
return callback()
},
canRunSyncTeXInOutputDir() {
return true
},
}
module.exports.promises = {
run: promisify(CommandRunner.run),
kill: promisify(CommandRunner.kill),
}

View File

@@ -0,0 +1,66 @@
const logger = require('@overleaf/logger')
const Errors = require('./Errors')
const RequestParser = require('./RequestParser')
const Metrics = require('@overleaf/metrics')
const Settings = require('@overleaf/settings')
// The lock timeout should be higher than the maximum end-to-end compile time.
// Here, we use the maximum compile timeout plus 2 minutes.
const LOCK_TIMEOUT_MS = RequestParser.MAX_TIMEOUT * 1000 + 120000
const LOCKS = new Map()
function acquire(key) {
const currentLock = LOCKS.get(key)
if (currentLock != null) {
if (currentLock.isExpired()) {
logger.warn({ key }, 'Compile lock expired')
currentLock.release()
} else {
throw new Errors.AlreadyCompilingError('compile in progress')
}
}
checkConcurrencyLimit()
const lock = new Lock(key)
LOCKS.set(key, lock)
return lock
}
function checkConcurrencyLimit() {
Metrics.gauge('concurrent_compile_requests', LOCKS.size)
if (LOCKS.size <= Settings.compileConcurrencyLimit) {
return
}
Metrics.inc('exceeded-compilier-concurrency-limit')
throw new Errors.TooManyCompileRequestsError(
'too many concurrent compile requests'
)
}
class Lock {
constructor(key) {
this.key = key
this.expiresAt = Date.now() + LOCK_TIMEOUT_MS
}
isExpired() {
return Date.now() >= this.expiresAt
}
release() {
const lockWasActive = LOCKS.delete(this.key)
if (!lockWasActive) {
logger.error({ key: this.key }, 'Lock was released twice')
}
if (this.isExpired()) {
Metrics.inc('compile_lock_expired_before_release')
}
}
}
module.exports = { acquire }
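
A sketch of assumed caller behaviour: acquire the per-project lock synchronously, run the compile, and always release, turning the two concurrency errors into distinct results. The require path and return values are assumptions.

const LockManager = require('./CompileLockManager') // assumed path
const Errors = require('./Errors')

async function withCompileLock(projectId, doCompile) {
  let lock
  try {
    lock = LockManager.acquire(projectId)
  } catch (err) {
    if (err instanceof Errors.AlreadyCompilingError) return { status: 'compile-in-progress' }
    if (err instanceof Errors.TooManyCompileRequestsError) return { status: 'unavailable' }
    throw err
  }
  try {
    return await doCompile()
  } finally {
    lock.release()
  }
}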

View File

@@ -0,0 +1,3 @@
// TODO: This file was created by bulk-decaffeinate.
// Sanity-check the conversion and remove this comment.
module.exports = require('@overleaf/metrics')

View File

@@ -0,0 +1,688 @@
let OutputCacheManager
const { callbackify, promisify } = require('node:util')
const async = require('async')
const fs = require('node:fs')
const Path = require('node:path')
const logger = require('@overleaf/logger')
const _ = require('lodash')
const Settings = require('@overleaf/settings')
const crypto = require('node:crypto')
const Metrics = require('./Metrics')
const OutputFileOptimiser = require('./OutputFileOptimiser')
const ContentCacheManager = require('./ContentCacheManager')
const {
QueueLimitReachedError,
TimedOutError,
NoXrefTableError,
} = require('./Errors')
const OLDEST_BUILD_DIR = new Map()
const PENDING_PROJECT_ACTIONS = new Map()
function init() {
doInit().catch(err => {
logger.fatal({ err }, 'low level error setting up cleanup of output dir')
// consider shutting down?
})
}
async function doInit() {
await fillCache()
const oldestTimestamp = await runBulkCleanup()
scheduleBulkCleanup(oldestTimestamp)
}
function scheduleBulkCleanup(oldestTimestamp) {
const delay =
Math.max(OutputCacheManager.CACHE_AGE + oldestTimestamp - Date.now(), 0) +
60 * 1000
setTimeout(async function () {
const oldestTimestamp = await runBulkCleanup()
scheduleBulkCleanup(oldestTimestamp)
}, delay)
}
async function fillCache() {
const handle = await fs.promises.opendir(Settings.path.outputDir)
try {
for await (const { name: projectIdAndUserId } of handle) {
OLDEST_BUILD_DIR.set(
Path.join(Settings.path.outputDir, projectIdAndUserId),
      // Queue them for cleanup at a random point within the next CACHE_AGE window.
Date.now() - Math.random() * OutputCacheManager.CACHE_AGE
)
}
} finally {
try {
await handle.close()
} catch (e) {}
}
}
async function runBulkCleanup() {
const cleanupThreshold = Date.now() - OutputCacheManager.CACHE_AGE
let oldestTimestamp = Date.now()
for (const [dir, timeStamp] of OLDEST_BUILD_DIR.entries()) {
if (timeStamp < cleanupThreshold) {
await cleanupDirectory(dir, { limit: OutputCacheManager.CACHE_LIMIT })
} else if (timeStamp < oldestTimestamp) {
oldestTimestamp = timeStamp
}
}
return oldestTimestamp
}
async function cleanupDirectory(dir, options) {
return await queueDirOperation(dir, async () => {
try {
await OutputCacheManager.promises.expireOutputFiles(dir, options)
} catch (err) {
logger.err({ dir, err }, 'cleanup of output directory failed')
}
})
}
/**
* @template T
*
* @param {string} dir
* @param {() => Promise<T>} fn
* @return {Promise<T>}
*/
async function queueDirOperation(dir, fn) {
const pending = PENDING_PROJECT_ACTIONS.get(dir) || Promise.resolve()
const p = pending.then(fn, fn).finally(() => {
if (PENDING_PROJECT_ACTIONS.get(dir) === p) {
PENDING_PROJECT_ACTIONS.delete(dir)
}
})
PENDING_PROJECT_ACTIONS.set(dir, p)
return p
}
module.exports = OutputCacheManager = {
CONTENT_SUBDIR: 'content',
CACHE_SUBDIR: 'generated-files',
ARCHIVE_SUBDIR: 'archived-logs',
// build id is HEXDATE-HEXRANDOM from Date.now() and RandomBytes
BUILD_REGEX: /^[0-9a-f]+-[0-9a-f]+$/,
CONTENT_REGEX: /^[0-9a-f]+-[0-9a-f]+$/,
CACHE_LIMIT: 2, // maximum number of cache directories
CACHE_AGE: 90 * 60 * 1000, // up to 90 minutes old
init,
queueDirOperation: callbackify(queueDirOperation),
path(buildId, file) {
    // used by the static server: given a build id, return the path under the generated-files cache subdir
if (buildId.match(OutputCacheManager.BUILD_REGEX)) {
return Path.join(OutputCacheManager.CACHE_SUBDIR, buildId, file)
} else {
// for invalid build id, return top level
return file
}
},
generateBuildId(callback) {
// generate a secure build id from Date.now() and 8 random bytes in hex
crypto.randomBytes(8, function (err, buf) {
if (err) {
return callback(err)
}
const random = buf.toString('hex')
const date = Date.now().toString(16)
callback(err, `${date}-${random}`)
})
},
saveOutputFiles(
{ request, stats, timings },
outputFiles,
compileDir,
outputDir,
callback
) {
const getBuildId = cb => {
if (request.buildId) return cb(null, request.buildId)
OutputCacheManager.generateBuildId(cb)
}
getBuildId(function (err, buildId) {
if (err) {
return callback(err)
}
if (!OLDEST_BUILD_DIR.has(outputDir)) {
// Register for cleanup
OLDEST_BUILD_DIR.set(outputDir, Date.now())
}
OutputCacheManager.queueDirOperation(
outputDir,
() =>
OutputCacheManager.promises.saveOutputFilesInBuildDir(
outputFiles,
compileDir,
outputDir,
buildId
),
function (err, result) {
if (err) {
return callback(err)
}
OutputCacheManager.collectOutputPdfSize(
result,
outputDir,
stats,
(err, outputFiles) => {
if (err) return callback(err, { outputFiles, buildId })
const enablePdfCaching = request.enablePdfCaching
const enablePdfCachingDark =
Settings.enablePdfCachingDark && !request.enablePdfCaching
if (
!Settings.enablePdfCaching ||
(!enablePdfCaching && !enablePdfCachingDark)
) {
return callback(null, { outputFiles, buildId })
}
OutputCacheManager.saveStreamsInContentDir(
{ request, stats, timings, enablePdfCachingDark },
outputFiles,
compileDir,
outputDir,
(err, status) => {
Metrics.inc('pdf-caching-status', 1, {
status,
...request.metricsOpts,
})
if (err) {
logger.warn(
{ err, outputDir, stats, timings },
'pdf caching failed'
)
return callback(null, { outputFiles, buildId })
}
callback(err, { outputFiles, buildId })
}
)
}
)
}
)
})
},
saveOutputFilesInBuildDir(
outputFiles,
compileDir,
outputDir,
buildId,
callback
) {
    // make an outputDir/CACHE_SUBDIR/buildId directory and
    // copy all the output files into this new cache subdirectory
const cacheDir = Path.join(
outputDir,
OutputCacheManager.CACHE_SUBDIR,
buildId
)
// Is it a per-user compile? check if compile directory is PROJECTID-USERID
const perUser = Path.basename(compileDir).match(
/^[0-9a-f]{24}-[0-9a-f]{24}$/
)
// Archive logs in background
if (Settings.clsi?.archive_logs || Settings.clsi?.strace) {
OutputCacheManager.archiveLogs(
outputFiles,
compileDir,
outputDir,
buildId,
function (err) {
if (err) {
            return logger.warn({ err }, 'error archiving log files')
}
}
)
}
// make the new cache directory
fs.mkdir(cacheDir, { recursive: true }, function (err) {
if (err) {
logger.error(
{ err, directory: cacheDir },
'error creating cache directory'
)
callback(err)
} else {
// copy all the output files into the new cache directory
const results = []
const dirCache = new Set()
dirCache.add(cacheDir)
async.mapSeries(
outputFiles,
function (file, cb) {
// don't send dot files as output, express doesn't serve them
if (OutputCacheManager._fileIsHidden(file.path)) {
logger.debug(
{ compileDir, path: file.path },
'ignoring dotfile in output'
)
return cb()
}
// copy other files into cache directory if valid
const src = Path.join(compileDir, file.path)
const dst = Path.join(cacheDir, file.path)
OutputCacheManager._checkIfShouldCopy(
src,
function (err, shouldCopy) {
if (err) {
return cb(err)
}
if (!shouldCopy) {
return cb()
}
OutputCacheManager._copyFile(src, dst, dirCache, err => {
if (err) {
return cb(err)
}
file.build = buildId
results.push(file)
cb()
})
}
)
},
function (err) {
if (err) {
callback(err)
// clean up the directory we just created
fs.rm(cacheDir, { force: true, recursive: true }, function (err) {
if (err) {
return logger.error(
{ err, dir: cacheDir },
'error removing cache dir after failure'
)
}
})
} else {
// pass back the list of new files in the cache
callback(null, results)
// let file expiry run in the background, expire all previous files if per-user
cleanupDirectory(outputDir, {
keep: buildId,
limit: perUser ? 1 : null,
}).catch(() => {})
}
}
)
}
})
},
collectOutputPdfSize(outputFiles, outputDir, stats, callback) {
const outputFile = outputFiles.find(x => x.path === 'output.pdf')
if (!outputFile) return callback(null, outputFiles)
const outputFilePath = Path.join(
outputDir,
OutputCacheManager.path(outputFile.build, outputFile.path)
)
fs.stat(outputFilePath, (err, stat) => {
if (err) return callback(err, outputFiles)
outputFile.size = stat.size
stats['pdf-size'] = outputFile.size
callback(null, outputFiles)
})
},
saveStreamsInContentDir(
{ request, stats, timings, enablePdfCachingDark },
outputFiles,
compileDir,
outputDir,
callback
) {
const cacheRoot = Path.join(outputDir, OutputCacheManager.CONTENT_SUBDIR)
// check if content dir exists
OutputCacheManager.ensureContentDir(cacheRoot, function (err, contentDir) {
if (err) return callback(err, 'content-dir-unavailable')
const outputFile = outputFiles.find(x => x.path === 'output.pdf')
if (outputFile) {
// possibly we should copy the file from the build dir here
const outputFilePath = Path.join(
outputDir,
OutputCacheManager.path(outputFile.build, outputFile.path)
)
const pdfSize = outputFile.size
const timer = new Metrics.Timer(
'compute-pdf-ranges',
1,
request.metricsOpts
)
ContentCacheManager.update(
{
contentDir,
filePath: outputFilePath,
pdfSize,
pdfCachingMinChunkSize: request.pdfCachingMinChunkSize,
compileTime: timings.compile,
},
function (err, result) {
if (err && err instanceof NoXrefTableError) {
return callback(null, err.message)
}
if (err && err instanceof QueueLimitReachedError) {
logger.warn({ err, outputDir }, 'pdf caching queue limit reached')
stats['pdf-caching-queue-limit-reached'] = 1
return callback(null, 'queue-limit')
}
if (err && err instanceof TimedOutError) {
logger.warn(
{ err, outputDir, stats, timings },
'pdf caching timed out'
)
stats['pdf-caching-timed-out'] = 1
return callback(null, 'timed-out')
}
if (err) return callback(err, 'failed')
const {
contentRanges,
newContentRanges,
reclaimedSpace,
overheadDeleteStaleHashes,
timedOutErr,
startXRefTable,
} = result
let status = 'success'
if (timedOutErr) {
// Soft failure: let the frontend use partial set of ranges.
logger.warn(
{
err: timedOutErr,
overheadDeleteStaleHashes,
outputDir,
stats,
timings,
},
'pdf caching timed out - soft failure'
)
stats['pdf-caching-timed-out'] = 1
status = 'timed-out-soft-failure'
}
if (enablePdfCachingDark) {
// In dark mode we are doing the computation only and do not emit
// any ranges to the frontend.
} else {
outputFile.contentId = Path.basename(contentDir)
outputFile.ranges = contentRanges
outputFile.startXRefTable = startXRefTable
}
timings['compute-pdf-caching'] = timer.done()
stats['pdf-caching-n-ranges'] = contentRanges.length
stats['pdf-caching-total-ranges-size'] = contentRanges.reduce(
(sum, next) => sum + (next.end - next.start),
0
)
stats['pdf-caching-n-new-ranges'] = newContentRanges.length
stats['pdf-caching-new-ranges-size'] = newContentRanges.reduce(
(sum, next) => sum + (next.end - next.start),
0
)
stats['pdf-caching-reclaimed-space'] = reclaimedSpace
timings['pdf-caching-overhead-delete-stale-hashes'] =
overheadDeleteStaleHashes
callback(null, status)
}
)
} else {
callback(null, 'missing-pdf')
}
})
},
ensureContentDir(contentRoot, callback) {
fs.mkdir(contentRoot, { recursive: true }, function (err) {
if (err) {
return callback(err)
}
fs.readdir(contentRoot, function (err, results) {
if (err) return callback(err)
const dirs = results.sort()
const contentId = dirs.find(dir =>
OutputCacheManager.BUILD_REGEX.test(dir)
)
if (contentId) {
callback(null, Path.join(contentRoot, contentId))
} else {
// make a content directory
OutputCacheManager.generateBuildId(function (err, contentId) {
if (err) {
return callback(err)
}
const contentDir = Path.join(contentRoot, contentId)
fs.mkdir(contentDir, { recursive: true }, function (err) {
if (err) {
return callback(err)
}
callback(null, contentDir)
})
})
}
})
})
},
archiveLogs(outputFiles, compileDir, outputDir, buildId, callback) {
const archiveDir = Path.join(
outputDir,
OutputCacheManager.ARCHIVE_SUBDIR,
buildId
)
logger.debug({ dir: archiveDir }, 'archiving log files for project')
fs.mkdir(archiveDir, { recursive: true }, function (err) {
if (err) {
return callback(err)
}
const dirCache = new Set()
dirCache.add(archiveDir)
async.mapSeries(
outputFiles,
function (file, cb) {
const src = Path.join(compileDir, file.path)
const dst = Path.join(archiveDir, file.path)
OutputCacheManager._checkIfShouldArchive(
src,
function (err, shouldArchive) {
if (err) {
return cb(err)
}
if (!shouldArchive) {
return cb()
}
OutputCacheManager._copyFile(src, dst, dirCache, cb)
}
)
},
callback
)
})
},
expireOutputFiles(outputDir, options, callback) {
    // look in the output dir for build dirs and delete any over the limit or older than the cache age
const cleanupAll = cb => {
fs.rm(outputDir, { force: true, recursive: true }, err => {
if (err) {
return cb(err)
}
// Drop reference after successful cleanup of the output dir.
OLDEST_BUILD_DIR.delete(outputDir)
cb(null)
})
}
const cacheRoot = Path.join(outputDir, OutputCacheManager.CACHE_SUBDIR)
fs.readdir(cacheRoot, function (err, results) {
if (err) {
if (err.code === 'ENOENT') {
// cache directory is empty
return cleanupAll(callback)
}
logger.error({ err, projectId: cacheRoot }, 'error clearing cache')
return callback(err)
}
const dirs = results.sort().reverse()
const currentTime = Date.now()
let oldestDirTimeToKeep = 0
const isExpired = function (dir, index) {
if (options?.keep === dir) {
// This is the directory we just created for the compile request.
oldestDirTimeToKeep = currentTime
return false
}
// remove any directories over the requested (non-null) limit
if (options?.limit != null && index > options.limit) {
return true
}
// remove any directories over the hard limit
if (index > OutputCacheManager.CACHE_LIMIT) {
return true
}
// we can get the build time from the first part of the directory name DDDD-RRRR
// DDDD is date and RRRR is random bytes
const dirTime = parseInt(dir.split('-')[0], 16)
const age = currentTime - dirTime
const expired = age > OutputCacheManager.CACHE_AGE
if (expired) {
return true
}
oldestDirTimeToKeep = dirTime
return false
}
const toRemove = _.filter(dirs, isExpired)
if (toRemove.length === dirs.length) {
// No builds left after cleanup.
return cleanupAll(callback)
}
const removeDir = (dir, cb) =>
fs.rm(
Path.join(cacheRoot, dir),
{ force: true, recursive: true },
function (err, result) {
logger.debug({ cache: cacheRoot, dir }, 'removed expired cache dir')
if (err) {
logger.error({ err, dir }, 'cache remove error')
}
cb(err, result)
}
)
async.eachSeries(
toRemove,
(dir, cb) => removeDir(dir, cb),
err => {
if (err) {
// On error: keep the timestamp in the past.
// The next iteration of the cleanup loop will retry the deletion.
return callback(err)
}
// On success: push the timestamp into the future.
OLDEST_BUILD_DIR.set(outputDir, oldestDirTimeToKeep)
callback(null)
}
)
})
},
_fileIsHidden(path) {
return path?.match(/^\.|\/\./) != null
},
_ensureParentExists(dst, dirCache, callback) {
let parent = Path.dirname(dst)
if (dirCache.has(parent)) {
callback()
} else {
fs.mkdir(parent, { recursive: true }, err => {
if (err) return callback(err)
while (!dirCache.has(parent)) {
dirCache.add(parent)
parent = Path.dirname(parent)
}
callback()
})
}
},
_copyFile(src, dst, dirCache, callback) {
OutputCacheManager._ensureParentExists(dst, dirCache, err => {
if (err) {
logger.warn(
{ err, dst },
'creating parent directory in output cache failed'
)
return callback(err, false)
}
// copy output file into the cache
fs.copyFile(src, dst, function (err) {
if (err?.code === 'ENOENT') {
logger.warn(
{ err, file: src },
'file has disappeared when copying to build cache'
)
callback(err, false)
} else if (err) {
logger.error({ err, src, dst }, 'copy error for file in cache')
callback(err)
} else {
if (Settings.clsi?.optimiseInDocker) {
// don't run any optimisations on the pdf when they are done
// in the docker container
callback()
} else {
// call the optimiser for the file too
OutputFileOptimiser.optimiseFile(src, dst, callback)
}
}
})
})
},
_checkIfShouldCopy(src, callback) {
callback(null, !Path.basename(src).match(/^strace/))
},
_checkIfShouldArchive(src, callback) {
if (Path.basename(src).match(/^strace/)) {
return callback(null, true)
}
const basename = Path.basename(src)
if (
Settings.clsi?.archive_logs &&
['output.log', 'output.blg'].includes(basename)
) {
return callback(null, true)
}
callback(null, false)
},
}
OutputCacheManager.promises = {
expireOutputFiles: promisify(OutputCacheManager.expireOutputFiles),
saveOutputFiles: promisify(OutputCacheManager.saveOutputFiles),
saveOutputFilesInBuildDir: promisify(
OutputCacheManager.saveOutputFilesInBuildDir
),
queueDirOperation,
}
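
A small sketch of the cache layout: build ids come from generateBuildId, and path() maps them into the generated-files subdirectory of the project's output dir. The example id and require path are illustrative assumptions.

const OutputCacheManager = require('./OutputCacheManager') // assumed path

OutputCacheManager.generateBuildId((err, buildId) => {
  if (err) throw err
  // buildId is '<hex ms timestamp>-<16 random hex chars>', e.g. '18f3a2b4c5d-0a1b2c3d4e5f6789'
  const cachedPath = OutputCacheManager.path(buildId, 'output.pdf')
  // cachedPath === 'generated-files/<buildId>/output.pdf', relative to the project's output directory
  console.log(cachedPath)
})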

View File

@@ -0,0 +1,23 @@
const OutputFileArchiveManager = require('./OutputFileArchiveManager')
const { expressify } = require('@overleaf/promise-utils')
const { pipeline } = require('node:stream/promises')
async function createOutputZip(req, res) {
const {
project_id: projectId,
user_id: userId,
build_id: buildId,
} = req.params
const archive = await OutputFileArchiveManager.archiveFilesForBuild(
projectId,
userId,
buildId
)
res.attachment('output.zip')
res.setHeader('X-Content-Type-Options', 'nosniff')
await pipeline(archive, res)
}
module.exports = { createOutputZip: expressify(createOutputZip) }
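
A sketch of how this handler might be mounted on an Express app; the route shape is an assumption, chosen only to provide the project_id, user_id and build_id params the handler reads.

const express = require('express')
const OutputController = require('./OutputController') // assumed path

const app = express()
app.get(
  '/project/:project_id/user/:user_id/build/:build_id/output/output.zip', // assumed route
  OutputController.createOutputZip
)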

View File

@@ -0,0 +1,113 @@
const archiver = require('archiver')
const OutputCacheManager = require('./OutputCacheManager')
const OutputFileFinder = require('./OutputFileFinder')
const Settings = require('@overleaf/settings')
const { open } = require('node:fs/promises')
const { NotFoundError } = require('./Errors')
const logger = require('@overleaf/logger')
// NOTE: Updating this list requires a corresponding change in
// * services/web/frontend/js/features/pdf-preview/util/file-list.ts
const ignoreFiles = ['output.fls', 'output.fdb_latexmk']
function getContentDir(projectId, userId) {
let subDir
if (userId != null) {
subDir = `${projectId}-${userId}`
} else {
subDir = projectId
}
return `${Settings.path.outputDir}/${subDir}/`
}
module.exports = {
async archiveFilesForBuild(projectId, userId, build) {
logger.debug({ projectId, userId, build }, 'Will create zip file')
const contentDir = getContentDir(projectId, userId)
const outputFiles = await this._getAllOutputFiles(
contentDir,
projectId,
userId,
build
)
const archive = archiver('zip')
archive.on('error', err => {
logger.warn(
{ err, projectId, userId, build },
'error emitted when creating output files archive'
)
})
archive.on('warning', err => {
logger.warn(
{ err, projectId, userId, build },
'warning emitted when creating output files archive'
)
})
const missingFiles = []
for (const { path } of outputFiles) {
let fileHandle
try {
fileHandle = await open(
`${contentDir}${OutputCacheManager.path(build, path)}`
)
} catch (error) {
logger.warn(
{ path, error, projectId, userId, build },
'error opening file to add to output files archive'
)
missingFiles.push(path)
continue
}
const fileStream = fileHandle.createReadStream()
archive.append(fileStream, { name: path })
}
if (missingFiles.length > 0) {
archive.append(missingFiles.join('\n'), {
name: 'missing_files.txt',
})
}
archive.finalize().catch(error => {
logger.error(
{ error, projectId, userId, build },
'error finalizing output files archive'
)
})
return archive
},
async _getAllOutputFiles(contentDir, projectId, userId, build) {
try {
const { outputFiles } = await OutputFileFinder.promises.findOutputFiles(
[],
`${contentDir}${OutputCacheManager.path(build, '.')}`
)
return outputFiles.filter(
// Ignore the pdf, clsi-cache tar-ball and also ignore the files ignored by the frontend.
({ path }) =>
path !== 'output.pdf' &&
path !== 'output.tar.gz' &&
!ignoreFiles.includes(path)
)
} catch (error) {
if (
error.code === 'ENOENT' ||
error.code === 'ENOTDIR' ||
error.code === 'EACCES'
) {
throw new NotFoundError('Output files not found')
}
throw error
}
},
}

View File

@@ -0,0 +1,53 @@
const Path = require('node:path')
const fs = require('node:fs')
const { callbackifyMultiResult } = require('@overleaf/promise-utils')
async function walkFolder(compileDir, d, files, allEntries) {
const dirents = await fs.promises.readdir(Path.join(compileDir, d), {
withFileTypes: true,
})
for (const dirent of dirents) {
const p = Path.join(d, dirent.name)
if (dirent.isDirectory()) {
await walkFolder(compileDir, p, files, allEntries)
allEntries.push(p + '/')
} else if (dirent.isFile()) {
files.push(p)
allEntries.push(p)
} else {
allEntries.push(p)
}
}
}
async function findOutputFiles(resources, directory) {
const files = []
const allEntries = []
await walkFolder(directory, '', files, allEntries)
const incomingResources = new Set(resources.map(resource => resource.path))
const outputFiles = []
for (const path of files) {
if (incomingResources.has(path)) continue
if (path === '.project-sync-state') continue
outputFiles.push({
path,
type: Path.extname(path).replace(/^\./, '') || undefined,
})
}
return {
outputFiles,
allEntries,
}
}
module.exports = {
findOutputFiles: callbackifyMultiResult(findOutputFiles, [
'outputFiles',
'allEntries',
]),
promises: {
findOutputFiles,
},
}
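
A usage sketch: list everything the compile left behind that was not one of the incoming resources. The require path and example values are assumptions.

const OutputFileFinder = require('./OutputFileFinder') // assumed path

async function listOutputs(compileDir, resources) {
  // resources e.g. [{ path: 'main.tex' }, { path: 'refs.bib' }]
  const { outputFiles, allEntries } =
    await OutputFileFinder.promises.findOutputFiles(resources, compileDir)
  // outputFiles e.g. [{ path: 'output.pdf', type: 'pdf' }, { path: 'output.log', type: 'log' }]
  // allEntries additionally includes directories (with a trailing slash) and the input files
  return { outputFiles, allEntries }
}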

View File

@@ -0,0 +1,100 @@
/* eslint-disable
no-return-assign,
no-undef,
no-unused-vars,
n/no-deprecated-api,
*/
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
/*
* decaffeinate suggestions:
* DS102: Remove unnecessary code created because of implicit returns
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let OutputFileOptimiser
const fs = require('node:fs')
const Path = require('node:path')
const { spawn } = require('node:child_process')
const logger = require('@overleaf/logger')
const Metrics = require('./Metrics')
const _ = require('lodash')
module.exports = OutputFileOptimiser = {
optimiseFile(src, dst, callback) {
// check output file (src) and see if we can optimise it, storing
// the result in the build directory (dst)
if (callback == null) {
callback = function () {}
}
if (src.match(/\/output\.pdf$/)) {
return OutputFileOptimiser.checkIfPDFIsOptimised(
src,
function (err, isOptimised) {
if (err != null || isOptimised) {
return callback(null)
}
return OutputFileOptimiser.optimisePDF(src, dst, callback)
}
)
} else {
return callback(null)
}
},
checkIfPDFIsOptimised(file, callback) {
const SIZE = 16 * 1024 // check the header of the pdf
const result = Buffer.alloc(SIZE) // fills with zeroes by default
return fs.open(file, 'r', function (err, fd) {
if (err != null) {
return callback(err)
}
return fs.read(fd, result, 0, SIZE, 0, (errRead, bytesRead, buffer) =>
fs.close(fd, function (errClose) {
if (errRead != null) {
return callback(errRead)
}
          if (errClose != null) {
return callback(errClose)
}
const isOptimised =
buffer.toString('ascii').indexOf('/Linearized 1') >= 0
return callback(null, isOptimised)
})
)
})
},
optimisePDF(src, dst, callback) {
if (callback == null) {
callback = function () {}
}
const tmpOutput = dst + '.opt'
const args = ['--linearize', '--newline-before-endstream', src, tmpOutput]
logger.debug({ args }, 'running qpdf command')
const timer = new Metrics.Timer('qpdf')
const proc = spawn('qpdf', args, { stdio: 'ignore' })
callback = _.once(callback) // avoid double call back for error and close event
proc.on('error', function (err) {
logger.warn({ err, args }, 'qpdf failed')
return callback(null)
}) // ignore the error
return proc.on('close', function (code) {
timer.done()
if (code !== 0) {
logger.warn({ code, args }, 'qpdf returned error')
return callback(null) // ignore the error
}
return fs.rename(tmpOutput, dst, function (err) {
if (err != null) {
logger.warn(
{ tmpOutput, dst },
'failed to rename output of qpdf command'
)
}
return callback(null)
})
})
}, // ignore the error
}

View File

@@ -0,0 +1,247 @@
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
/*
* decaffeinate suggestions:
* DS101: Remove unnecessary use of Array.from
* DS102: Remove unnecessary code created because of implicit returns
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let ProjectPersistenceManager
const UrlCache = require('./UrlCache')
const CompileManager = require('./CompileManager')
const async = require('async')
const logger = require('@overleaf/logger')
const oneDay = 24 * 60 * 60 * 1000
const Metrics = require('@overleaf/metrics')
const Settings = require('@overleaf/settings')
const { callbackify } = require('node:util')
const Path = require('node:path')
const fs = require('node:fs')
// projectId -> timestamp mapping.
const LAST_ACCESS = new Map()
async function collectDiskStats() {
const paths = [
Settings.path.compilesDir,
Settings.path.outputDir,
Settings.path.clsiCacheDir,
]
const diskStats = {}
for (const path of paths) {
try {
const { blocks, bavail, bsize } = await fs.promises.statfs(path)
const stats = {
// Warning: these values will be wrong by a factor in Docker-for-Mac.
// See https://github.com/docker/for-mac/issues/2136
total: blocks * bsize, // Total size of the file system in bytes
available: bavail * bsize, // Free space available to unprivileged users.
}
const diskAvailablePercent = (stats.available / stats.total) * 100
Metrics.gauge('disk_available_percent', diskAvailablePercent, 1, {
path,
})
const lowDisk = diskAvailablePercent < 10
diskStats[path] = { stats, lowDisk }
} catch (err) {
logger.err({ err, path }, 'error getting disk usage')
}
}
return diskStats
}
async function refreshExpiryTimeout() {
for (const [path, { stats, lowDisk }] of Object.entries(
await collectDiskStats()
)) {
const lowerExpiry = ProjectPersistenceManager.EXPIRY_TIMEOUT * 0.9
if (lowDisk && Settings.project_cache_length_ms / 2 < lowerExpiry) {
logger.warn(
{
path,
stats,
newExpiryTimeoutInDays: (lowerExpiry / oneDay).toFixed(2),
},
'disk running low on space, modifying EXPIRY_TIMEOUT'
)
ProjectPersistenceManager.EXPIRY_TIMEOUT = lowerExpiry
break
}
}
}
module.exports = ProjectPersistenceManager = {
EXPIRY_TIMEOUT: Settings.project_cache_length_ms || oneDay * 2.5,
promises: {
refreshExpiryTimeout,
},
refreshExpiryTimeout: callbackify(refreshExpiryTimeout),
init() {
fs.readdir(Settings.path.compilesDir, (err, dirs) => {
if (err) {
logger.warn({ err }, 'cannot get project listing')
dirs = []
}
async.eachLimit(
dirs,
10,
(projectAndUserId, cb) => {
const compileDir = Path.join(
Settings.path.compilesDir,
projectAndUserId
)
const projectId = projectAndUserId.slice(0, 24)
fs.stat(compileDir, (err, stats) => {
if (err) {
// Schedule for immediate cleanup
LAST_ACCESS.set(projectId, 0)
} else {
// Cleanup eventually.
LAST_ACCESS.set(projectId, stats.mtime.getTime())
}
cb()
})
},
() => {
setInterval(
() => {
ProjectPersistenceManager.refreshExpiryTimeout(() => {
ProjectPersistenceManager.clearExpiredProjects(err => {
if (err) {
logger.error({ err }, 'clearing expired projects failed')
}
})
})
},
10 * 60 * 1000
)
}
)
})
// Collect disk stats frequently to have them ready the next time /metrics is scraped (60s +- jitter).
setInterval(() => {
collectDiskStats().catch(err => {
logger.err({ err }, 'low level error collecting disk stats')
})
}, 50_000)
},
markProjectAsJustAccessed(projectId, callback) {
LAST_ACCESS.set(projectId, Date.now())
callback()
},
clearExpiredProjects(callback) {
if (callback == null) {
callback = function () {}
}
return ProjectPersistenceManager._findExpiredProjectIds(
function (error, projectIds) {
if (error != null) {
return callback(error)
}
logger.debug({ projectIds }, 'clearing expired projects')
const jobs = Array.from(projectIds || []).map(projectId =>
(
projectId => callback =>
ProjectPersistenceManager.clearProjectFromCache(
projectId,
{ reason: 'expired' },
function (err) {
if (err != null) {
logger.error({ err, projectId }, 'error clearing project')
}
return callback()
}
)
)(projectId)
)
return async.series(jobs, function (error) {
if (error != null) {
return callback(error)
}
return CompileManager.clearExpiredProjects(
ProjectPersistenceManager.EXPIRY_TIMEOUT,
error => callback(error)
)
})
}
)
}, // ignore any errors from deleting directories
clearProject(projectId, userId, callback) {
if (callback == null) {
callback = function () {}
}
logger.debug({ projectId, userId }, 'clearing project for user')
return CompileManager.clearProject(projectId, userId, function (error) {
if (error != null) {
return callback(error)
}
return ProjectPersistenceManager.clearProjectFromCache(
projectId,
{ reason: 'cleared' },
function (error) {
if (error != null) {
return callback(error)
}
return callback()
}
)
})
},
clearProjectFromCache(projectId, options, callback) {
if (callback == null) {
callback = function () {}
}
logger.debug({ projectId }, 'clearing project from cache')
return UrlCache.clearProject(projectId, options, function (error) {
if (error != null) {
logger.err({ error, projectId }, 'error clearing project from cache')
return callback(error)
}
return ProjectPersistenceManager._clearProjectFromDatabase(
projectId,
function (error) {
if (error != null) {
logger.err(
{ error, projectId },
'error clearing project from database'
)
}
return callback(error)
}
)
})
},
_clearProjectFromDatabase(projectId, callback) {
LAST_ACCESS.delete(projectId)
callback()
},
_findExpiredProjectIds(callback) {
const expiredFrom = Date.now() - ProjectPersistenceManager.EXPIRY_TIMEOUT
const expiredProjectsIds = []
for (const [projectId, lastAccess] of LAST_ACCESS.entries()) {
if (lastAccess < expiredFrom) {
expiredProjectsIds.push(projectId)
}
}
// ^ may be a fairly busy loop, continue detached.
setTimeout(() => callback(null, expiredProjectsIds), 0)
},
}
logger.debug(
{ EXPIRY_TIMEOUT: ProjectPersistenceManager.EXPIRY_TIMEOUT },
'project assets kept timeout'
)

View File

@@ -0,0 +1,250 @@
const settings = require('@overleaf/settings')
const OutputCacheManager = require('./OutputCacheManager')
const VALID_COMPILERS = ['pdflatex', 'latex', 'xelatex', 'lualatex']
const MAX_TIMEOUT = 600
const EDITOR_ID_REGEX = /^[a-f0-9-]{36}$/ // UUID
function parse(body, callback) {
const response = {}
if (body.compile == null) {
return callback(
new Error('top level object should have a compile attribute')
)
}
const { compile } = body
if (!compile.options) {
compile.options = {}
}
try {
response.metricsOpts = {
path: _parseAttribute('metricsPath', compile.options.metricsPath, {
default: '',
type: 'string',
}),
method: _parseAttribute('metricsMethod', compile.options.metricsMethod, {
default: '',
type: 'string',
}),
// Will be populated later. Must always be populated for prom library.
compile: 'initial',
}
response.compiler = _parseAttribute('compiler', compile.options.compiler, {
validValues: VALID_COMPILERS,
default: 'pdflatex',
type: 'string',
})
response.compileFromClsiCache = _parseAttribute(
'compileFromClsiCache',
compile.options.compileFromClsiCache,
{ default: false, type: 'boolean' }
)
response.populateClsiCache = _parseAttribute(
'populateClsiCache',
compile.options.populateClsiCache,
{ default: false, type: 'boolean' }
)
response.enablePdfCaching = _parseAttribute(
'enablePdfCaching',
compile.options.enablePdfCaching,
{
default: false,
type: 'boolean',
}
)
response.pdfCachingMinChunkSize = _parseAttribute(
'pdfCachingMinChunkSize',
compile.options.pdfCachingMinChunkSize,
{
default: settings.pdfCachingMinChunkSize,
type: 'number',
}
)
response.timeout = _parseAttribute('timeout', compile.options.timeout, {
default: MAX_TIMEOUT,
type: 'number',
})
response.imageName = _parseAttribute(
'imageName',
compile.options.imageName,
{
type: 'string',
validValues:
settings.clsi &&
settings.clsi.docker &&
settings.clsi.docker.allowedImages,
}
)
response.draft = _parseAttribute('draft', compile.options.draft, {
default: false,
type: 'boolean',
})
response.stopOnFirstError = _parseAttribute(
'stopOnFirstError',
compile.options.stopOnFirstError,
{
default: false,
type: 'boolean',
}
)
response.check = _parseAttribute('check', compile.options.check, {
type: 'string',
})
response.flags = _parseAttribute('flags', compile.options.flags, {
default: [],
type: 'object',
})
if (settings.allowedCompileGroups) {
response.compileGroup = _parseAttribute(
'compileGroup',
compile.options.compileGroup,
{
validValues: settings.allowedCompileGroups,
default: '',
type: 'string',
}
)
}
// The syncType specifies whether the request contains all
// resources (full) or only those resources to be updated
// in-place (incremental).
response.syncType = _parseAttribute('syncType', compile.options.syncType, {
validValues: ['full', 'incremental'],
type: 'string',
})
// The syncState is an identifier passed in with the request
// which has the property that it changes when any resource is
// added, deleted, moved or renamed.
//
// on syncType full the syncState identifier is passed in and
// stored
//
// on syncType incremental the syncState identifier must match
// the stored value
response.syncState = _parseAttribute(
'syncState',
compile.options.syncState,
{ type: 'string' }
)
if (response.timeout > MAX_TIMEOUT) {
response.timeout = MAX_TIMEOUT
}
response.timeout = response.timeout * 1000 // milliseconds
response.resources = (compile.resources || []).map(resource =>
_parseResource(resource)
)
const rootResourcePath = _parseAttribute(
'rootResourcePath',
compile.rootResourcePath,
{
default: 'main.tex',
type: 'string',
}
)
response.rootResourcePath = _checkPath(rootResourcePath)
response.editorId = _parseAttribute('editorId', compile.options.editorId, {
type: 'string',
regex: EDITOR_ID_REGEX,
})
response.buildId = _parseAttribute('buildId', compile.options.buildId, {
type: 'string',
regex: OutputCacheManager.BUILD_REGEX,
})
  } catch (error) {
    return callback(error)
  }
callback(null, response)
}
function _parseResource(resource) {
let modified
if (resource.path == null || typeof resource.path !== 'string') {
throw new Error('all resources should have a path attribute')
}
if (resource.modified != null) {
modified = new Date(resource.modified)
if (isNaN(modified.getTime())) {
throw new Error(
`resource modified date could not be understood: ${resource.modified}`
)
}
}
if (resource.url == null && resource.content == null) {
throw new Error(
'all resources should have either a url or content attribute'
)
}
if (resource.content != null && typeof resource.content !== 'string') {
throw new Error('content attribute should be a string')
}
if (resource.url != null && typeof resource.url !== 'string') {
throw new Error('url attribute should be a string')
}
if (resource.fallbackURL && typeof resource.fallbackURL !== 'string') {
throw new Error('fallbackURL attribute should be a string')
}
return {
path: resource.path,
modified,
url: resource.url,
fallbackURL: resource.fallbackURL,
content: resource.content,
}
}
function _parseAttribute(name, attribute, options) {
if (attribute != null) {
if (options.validValues != null) {
if (options.validValues.indexOf(attribute) === -1) {
throw new Error(
`${name} attribute should be one of: ${options.validValues.join(
', '
)}`
)
}
}
if (options.type != null) {
// eslint-disable-next-line valid-typeof
if (typeof attribute !== options.type) {
throw new Error(`${name} attribute should be a ${options.type}`)
}
}
if (options.type === 'string' && options.regex instanceof RegExp) {
if (!options.regex.test(attribute)) {
throw new Error(
`${name} attribute does not match regex ${options.regex}`
)
}
}
} else {
if (options.default != null) {
return options.default
}
}
return attribute
}
function _checkPath(path) {
// check that the request does not use a relative path
for (const dir of Array.from(path.split('/'))) {
if (dir === '..') {
throw new Error('relative path in root resource')
}
}
return path
}
module.exports = { parse, MAX_TIMEOUT }
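
A sketch of a minimal request body accepted by parse(); the file names, URL and values are assumptions. Unset options fall back to the defaults above, and the timeout is capped at MAX_TIMEOUT and converted to milliseconds.

const RequestParser = require('./RequestParser') // assumed path

const body = {
  compile: {
    options: {
      compiler: 'pdflatex',
      timeout: 180, // seconds in the request, milliseconds after parsing
    },
    rootResourcePath: 'main.tex',
    resources: [
      {
        path: 'main.tex',
        content: '\\documentclass{article}\\begin{document}Hello\\end{document}',
      },
      {
        path: 'logo.png',
        url: 'http://filestore/project/abc/file/def', // assumed URL
        modified: 1700000000000,
      },
    ],
  },
}

RequestParser.parse(body, (err, request) => {
  if (err) throw err
  // request.compiler === 'pdflatex', request.timeout === 180000,
  // request.rootResourcePath === 'main.tex', request.resources is the normalised list above
})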

View File

@@ -0,0 +1,116 @@
const Path = require('node:path')
const fs = require('node:fs')
const logger = require('@overleaf/logger')
const Errors = require('./Errors')
const SafeReader = require('./SafeReader')
module.exports = {
// The sync state is an identifier which must match for an
// incremental update to be allowed.
//
// The initial value is passed in and stored on a full
  // compile, along with the list of resources.
//
// Subsequent incremental compiles must come with the same value - if
// not they will be rejected with a 409 Conflict response. The
// previous list of resources is returned.
//
// An incremental compile can only update existing files with new
// content. The sync state identifier must change if any docs or
// files are moved, added, deleted or renamed.
SYNC_STATE_FILE: '.project-sync-state',
SYNC_STATE_MAX_SIZE: 128 * 1024,
saveProjectState(state, resources, basePath, callback) {
const stateFile = Path.join(basePath, this.SYNC_STATE_FILE)
if (state == null) {
// remove the file if no state passed in
logger.debug({ state, basePath }, 'clearing sync state')
fs.unlink(stateFile, function (err) {
if (err && err.code !== 'ENOENT') {
return callback(err)
} else {
return callback()
}
})
} else {
logger.debug({ state, basePath }, 'writing sync state')
const resourceList = resources.map(resource => resource.path)
fs.writeFile(
stateFile,
[...resourceList, `stateHash:${state}`].join('\n'),
callback
)
}
},
checkProjectStateMatches(state, basePath, callback) {
const stateFile = Path.join(basePath, this.SYNC_STATE_FILE)
const size = this.SYNC_STATE_MAX_SIZE
SafeReader.readFile(
stateFile,
size,
'utf8',
function (err, result, bytesRead) {
if (err) {
return callback(err)
}
if (bytesRead === size) {
logger.error(
{ file: stateFile, size, bytesRead },
'project state file truncated'
)
}
const array = result ? result.toString().split('\n') : []
const adjustedLength = Math.max(array.length, 1)
const resourceList = array.slice(0, adjustedLength - 1)
const oldState = array[adjustedLength - 1]
const newState = `stateHash:${state}`
logger.debug(
{ state, oldState, basePath, stateMatches: newState === oldState },
'checking sync state'
)
if (newState !== oldState) {
return callback(
new Errors.FilesOutOfSyncError(
'invalid state for incremental update'
)
)
} else {
const resources = resourceList.map(path => ({ path }))
callback(null, resources)
}
}
)
},
checkResourceFiles(resources, allFiles, basePath, callback) {
// check the paths are all relative to current directory
const containsRelativePath = resource => {
const dirs = resource.path.split('/')
return dirs.indexOf('..') !== -1
}
if (resources.some(containsRelativePath)) {
return callback(new Error('relative path in resource file list'))
}
// check if any of the input files are not present in list of files
const seenFiles = new Set(allFiles)
const missingFiles = resources
.map(resource => resource.path)
.filter(path => !seenFiles.has(path))
if (missingFiles.length > 0) {
logger.err(
{ missingFiles, basePath, allFiles, resources },
'missing input files for project'
)
return callback(
new Errors.FilesOutOfSyncError(
'resource files missing in incremental update'
)
)
} else {
callback()
}
},
}
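
For illustration (values assumed), saving state 'abc123' for resources main.tex and refs.bib writes a .project-sync-state file of the form:

  main.tex
  refs.bib
  stateHash:abc123

checkProjectStateMatches later splits the file on newlines, compares the trailing stateHash line with the incoming state, and calls back with [{ path: 'main.tex' }, { path: 'refs.bib' }] when they match.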

View File

@@ -0,0 +1,384 @@
/* eslint-disable
no-return-assign,
no-unused-vars,
no-useless-escape,
*/
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
/*
* decaffeinate suggestions:
* DS101: Remove unnecessary use of Array.from
* DS102: Remove unnecessary code created because of implicit returns
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let ResourceWriter
const { promisify } = require('node:util')
const UrlCache = require('./UrlCache')
const Path = require('node:path')
const fs = require('node:fs')
const async = require('async')
const OutputFileFinder = require('./OutputFileFinder')
const ResourceStateManager = require('./ResourceStateManager')
const Metrics = require('./Metrics')
const logger = require('@overleaf/logger')
const settings = require('@overleaf/settings')
const parallelFileDownloads = settings.parallelFileDownloads || 1
module.exports = ResourceWriter = {
syncResourcesToDisk(request, basePath, callback) {
if (callback == null) {
callback = function () {}
}
if (request.syncType === 'incremental') {
logger.debug(
{ projectId: request.project_id, userId: request.user_id },
'incremental sync'
)
return ResourceStateManager.checkProjectStateMatches(
request.syncState,
basePath,
function (error, resourceList) {
if (error != null) {
return callback(error)
}
return ResourceWriter._removeExtraneousFiles(
request,
resourceList,
basePath,
function (error, outputFiles, allFiles) {
if (error != null) {
return callback(error)
}
return ResourceStateManager.checkResourceFiles(
resourceList,
allFiles,
basePath,
function (error) {
if (error != null) {
return callback(error)
}
return ResourceWriter.saveIncrementalResourcesToDisk(
request.project_id,
request.resources,
basePath,
function (error) {
if (error != null) {
return callback(error)
}
return callback(null, resourceList)
}
)
}
)
}
)
}
)
}
logger.debug(
{ projectId: request.project_id, userId: request.user_id },
'full sync'
)
UrlCache.createProjectDir(request.project_id, error => {
if (error != null) {
return callback(error)
}
ResourceWriter.saveAllResourcesToDisk(
request,
basePath,
function (error) {
if (error != null) {
return callback(error)
}
return ResourceStateManager.saveProjectState(
request.syncState,
request.resources,
basePath,
function (error) {
if (error != null) {
return callback(error)
}
return callback(null, request.resources)
}
)
}
)
})
},
saveIncrementalResourcesToDisk(projectId, resources, basePath, callback) {
if (callback == null) {
callback = function () {}
}
return ResourceWriter._createDirectory(basePath, error => {
if (error != null) {
return callback(error)
}
      const jobs = Array.from(resources).map(
        resource => callback =>
          ResourceWriter._writeResourceToDisk(
            projectId,
            resource,
            basePath,
            callback
          )
      )
return async.parallelLimit(jobs, parallelFileDownloads, callback)
})
},
saveAllResourcesToDisk(request, basePath, callback) {
if (callback == null) {
callback = function () {}
}
return ResourceWriter._createDirectory(basePath, error => {
if (error != null) {
return callback(error)
}
const { project_id: projectId, resources } = request
ResourceWriter._removeExtraneousFiles(
request,
resources,
basePath,
error => {
if (error != null) {
return callback(error)
}
          const jobs = Array.from(resources).map(
            resource => callback =>
              ResourceWriter._writeResourceToDisk(
                projectId,
                resource,
                basePath,
                callback
              )
          )
return async.parallelLimit(jobs, parallelFileDownloads, callback)
}
)
})
},
_createDirectory(basePath, callback) {
if (callback == null) {
callback = function () {}
}
return fs.mkdir(basePath, function (err) {
if (err != null) {
if (err.code === 'EEXIST') {
return callback()
} else {
logger.debug({ err, dir: basePath }, 'error creating directory')
return callback(err)
}
} else {
return callback()
}
})
},
_removeExtraneousFiles(request, resources, basePath, _callback) {
if (_callback == null) {
_callback = function () {}
}
const timer = new Metrics.Timer(
'unlink-output-files',
1,
request.metricsOpts
)
const callback = function (error, ...result) {
timer.done()
return _callback(error, ...Array.from(result))
}
return OutputFileFinder.findOutputFiles(
resources,
basePath,
(error, outputFiles, allFiles) => {
if (error != null) {
return callback(error)
}
const jobs = []
for (const { path } of outputFiles || []) {
const shouldDelete = ResourceWriter.isExtraneousFile(path)
if (shouldDelete) {
jobs.push(callback =>
ResourceWriter._deleteFileIfNotDirectory(
Path.join(basePath, path),
callback
)
)
}
}
return async.series(jobs, function (error) {
if (error != null) {
return callback(error)
}
return callback(null, outputFiles, allFiles)
})
}
)
},
isExtraneousFile(path) {
let shouldDelete = true
if (
path.match(/^output\./) ||
path.match(/\.aux$/) ||
path.match(/^cache\//)
) {
      // output.* files, .aux files and the knitr cache directory
shouldDelete = false
}
if (path.match(/^output-.*/)) {
// Tikz cached figures (default case)
shouldDelete = false
}
if (path.match(/\.(pdf|dpth|md5)$/)) {
// Tikz cached figures (by extension)
shouldDelete = false
}
if (
path.match(/\.(pygtex|pygstyle)$/) ||
path.match(/(^|\/)_minted-[^\/]+\//)
) {
// minted files/directory
shouldDelete = false
}
if (path.match(/\.md\.tex$/) || path.match(/(^|\/)_markdown_[^\/]+\//)) {
// markdown files/directory
shouldDelete = false
}
if (path.match(/-eps-converted-to\.pdf$/)) {
// Epstopdf generated files
shouldDelete = false
}
if (
path === 'output.tar.gz' ||
path === 'output.synctex.gz' ||
path === 'output.pdfxref' ||
path === 'output.pdf' ||
path === 'output.dvi' ||
path === 'output.log' ||
path === 'output.xdv' ||
path === 'output.stdout' ||
path === 'output.stderr'
) {
shouldDelete = true
}
if (path === 'output.tex') {
// created by TikzManager if present in output files
shouldDelete = true
}
return shouldDelete
},
_deleteFileIfNotDirectory(path, callback) {
if (callback == null) {
callback = function () {}
}
return fs.stat(path, function (error, stat) {
if (error != null && error.code === 'ENOENT') {
return callback()
} else if (error != null) {
logger.err(
{ err: error, path },
'error stating file in deleteFileIfNotDirectory'
)
return callback(error)
} else if (stat.isFile()) {
return fs.unlink(path, function (error) {
if (error != null) {
logger.err(
{ err: error, path },
'error removing file in deleteFileIfNotDirectory'
)
return callback(error)
} else {
return callback()
}
})
} else {
return callback()
}
})
},
_writeResourceToDisk(projectId, resource, basePath, callback) {
if (callback == null) {
callback = function () {}
}
return ResourceWriter.checkPath(
basePath,
resource.path,
function (error, path) {
if (error != null) {
return callback(error)
}
return fs.mkdir(
Path.dirname(path),
{ recursive: true },
function (error) {
if (error != null) {
return callback(error)
}
// TODO: Don't overwrite file if it hasn't been modified
if (resource.url != null) {
return UrlCache.downloadUrlToFile(
projectId,
resource.url,
resource.fallbackURL,
path,
resource.modified,
function (err) {
if (err != null) {
logger.err(
{
err,
projectId,
path,
resourceUrl: resource.url,
modified: resource.modified,
},
'error downloading file for resources'
)
Metrics.inc('download-failed')
}
return callback()
}
) // try and continue compiling even if http resource can not be downloaded at this time
} else {
fs.writeFile(path, resource.content, callback)
}
}
)
}
)
},
checkPath(basePath, resourcePath, callback) {
const path = Path.normalize(Path.join(basePath, resourcePath))
if (path.slice(0, basePath.length + 1) !== basePath + '/') {
return callback(new Error('resource path is outside root directory'))
} else {
return callback(null, path)
}
},
}
module.exports.promises = {
syncResourcesToDisk: promisify(ResourceWriter.syncResourcesToDisk),
saveIncrementalResourcesToDisk: promisify(
ResourceWriter.saveIncrementalResourcesToDisk
),
saveAllResourcesToDisk: promisify(ResourceWriter.saveAllResourcesToDisk),
checkPath: promisify(ResourceWriter.checkPath),
}
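// Editorial usage sketch, not part of the module: checkPath keeps every resource
// inside the compile directory, and isExtraneousFile decides which generated files
// are cleaned up between compiles. The directory and file names below are made up.
ResourceWriter.checkPath('/compile/project', '../etc/passwd', err => {
  console.log(err && err.message) // resource path is outside root directory
})
console.log(ResourceWriter.isExtraneousFile('output.pdf')) // true: regenerated on every compile
console.log(ResourceWriter.isExtraneousFile('figure0-eps-converted-to.pdf')) // false: cached between compiles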

View File

@@ -0,0 +1,62 @@
/* eslint-disable
no-unused-vars,
n/no-deprecated-api,
*/
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
/*
* decaffeinate suggestions:
* DS101: Remove unnecessary use of Array.from
* DS102: Remove unnecessary code created because of implicit returns
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let SafeReader
const fs = require('node:fs')
const logger = require('@overleaf/logger')
module.exports = SafeReader = {
// safely read up to size bytes from a file and return result as a
// string
readFile(file, size, encoding, callback) {
if (callback == null) {
callback = function () {}
}
return fs.open(file, 'r', function (err, fd) {
if (err != null && err.code === 'ENOENT') {
return callback()
}
if (err != null) {
return callback(err)
}
      // always close the file descriptor before invoking the callback
const callbackWithClose = (err, ...result) =>
fs.close(fd, function (err1) {
if (err != null) {
return callback(err)
}
if (err1 != null) {
return callback(err1)
}
return callback(null, ...Array.from(result))
})
const buff = Buffer.alloc(size) // fills with zeroes by default
return fs.read(
fd,
buff,
0,
buff.length,
0,
function (err, bytesRead, buffer) {
if (err != null) {
return callbackWithClose(err)
}
const result = buffer.toString(encoding, 0, bytesRead)
return callbackWithClose(null, result, bytesRead)
}
)
})
},
}
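// Editorial usage sketch, not part of the module: read at most 1 KiB of a file
// that may not exist. The callback receives no arguments when the file is missing,
// otherwise the decoded content and the number of bytes read. The path is made up.
SafeReader.readFile('/tmp/example-sync-state', 1024, 'utf8', (err, content, bytesRead) => {
  if (err) return console.error(err)
  console.log({ bytesRead, content })
})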

View File

@@ -0,0 +1,89 @@
/* eslint-disable
no-cond-assign,
no-unused-vars,
n/no-deprecated-api,
*/
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
/*
* decaffeinate suggestions:
* DS101: Remove unnecessary use of Array.from
* DS102: Remove unnecessary code created because of implicit returns
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let ForbidSymlinks
const Path = require('node:path')
const fs = require('node:fs')
const Settings = require('@overleaf/settings')
const logger = require('@overleaf/logger')
module.exports = ForbidSymlinks = function (staticFn, root, options) {
const expressStatic = staticFn(root, options)
const basePath = Path.resolve(root)
return function (req, res, next) {
let file, projectId, result
const path = req.url
// check that the path is of the form /project_id_or_name/path/to/file.log
if ((result = path.match(/^\/([a-zA-Z0-9_-]+)\/(.*)$/s))) {
projectId = result[1]
file = result[2]
if (path !== `/${projectId}/${file}`) {
logger.warn({ path }, 'unrecognized file request')
return res.sendStatus(404)
}
} else {
logger.warn({ path }, 'unrecognized file request')
return res.sendStatus(404)
}
// check that the file does not use a relative path
for (const dir of Array.from(file.split('/'))) {
if (dir === '..') {
logger.warn({ path }, 'attempt to use a relative path')
return res.sendStatus(404)
}
}
// check that the requested path is normalized
const requestedFsPath = `${basePath}/${projectId}/${file}`
if (requestedFsPath !== Path.normalize(requestedFsPath)) {
logger.error(
{ path: requestedFsPath },
'requestedFsPath is not normalized'
)
return res.sendStatus(404)
}
// check that the requested path is not a symlink
return fs.realpath(requestedFsPath, function (err, realFsPath) {
if (err != null) {
if (err.code === 'ENOENT') {
return res.sendStatus(404)
} else {
logger.error(
{
err,
requestedFsPath,
realFsPath,
path: req.params[0],
projectId: req.params.project_id,
},
'error checking file access'
)
return res.sendStatus(500)
}
} else if (requestedFsPath !== realFsPath) {
logger.warn(
{
requestedFsPath,
realFsPath,
path: req.params[0],
projectId: req.params.project_id,
},
'trying to access a different file (symlink), aborting'
)
return res.sendStatus(404)
} else {
return expressStatic(req, res, next)
}
})
}
}
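// Editorial usage sketch, not part of the module: the wrapper stands in for a plain
// express.static mount so that requests resolving through symlinks are rejected
// with a 404. The express app, mount point and output directory are assumptions.
const express = require('express')
const exampleApp = express()
exampleApp.use('/output', ForbidSymlinks(express.static, '/var/lib/compiles/output'))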

View File

@@ -0,0 +1,113 @@
const Path = require('node:path')
/**
* Parse output from the `synctex view` command
*/
function parseViewOutput(output) {
return _parseOutput(output, (record, label, value) => {
switch (label) {
case 'Page':
_setIntProp(record, 'page', value)
break
case 'h':
_setFloatProp(record, 'h', value)
break
case 'v':
_setFloatProp(record, 'v', value)
break
case 'W':
_setFloatProp(record, 'width', value)
break
case 'H':
_setFloatProp(record, 'height', value)
break
}
})
}
/**
* Parse output from the `synctex edit` command
*/
function parseEditOutput(output, baseDir) {
return _parseOutput(output, (record, label, value) => {
switch (label) {
case 'Input':
if (Path.isAbsolute(value)) {
record.file = Path.relative(baseDir, value)
} else {
record.file = value
}
break
case 'Line':
_setIntProp(record, 'line', value)
break
case 'Column':
_setIntProp(record, 'column', value)
break
}
})
}
/**
* Generic parser for synctex output
*
* Parses the output into records. Each line is split into a label and a value,
* which are then sent to `processLine` for further processing.
*/
function _parseOutput(output, processLine) {
const lines = output.split('\n')
let currentRecord = null
const records = []
for (const line of lines) {
const [label, value] = _splitLine(line)
// A line that starts with 'Output:' indicates a new record
if (label === 'Output') {
// Start new record
currentRecord = {}
records.push(currentRecord)
continue
}
// Ignore the line if we're not in a record yet
if (currentRecord == null) {
continue
}
// Process the line
processLine(currentRecord, label, value)
}
return records
}
/**
* Split a line in label and value components.
*
* The components are separated by a colon. Note that this is slightly
* different from `line.split(':', 2)`. This version puts the entirety of the
* line after the colon in the value component, even if there are more colons
* on the line.
*/
function _splitLine(line) {
const splitIndex = line.indexOf(':')
if (splitIndex === -1) {
return ['', line]
}
return [line.slice(0, splitIndex).trim(), line.slice(splitIndex + 1).trim()]
}
function _setIntProp(record, prop, value) {
const intValue = parseInt(value, 10)
if (!isNaN(intValue)) {
record[prop] = intValue
}
}
function _setFloatProp(record, prop, value) {
const floatValue = parseFloat(value)
if (!isNaN(floatValue)) {
record[prop] = floatValue
}
}
module.exports = { parseViewOutput, parseEditOutput }
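// Editorial usage sketch, not part of the module: the parsers accept text shaped
// like the synctex CLI output described above; paths and numbers are made up.
const exampleViewOutput = [
  'SyncTeX result begin',
  'Output:/compile/output.pdf',
  'Page:1',
  'h:133.77',
  'v:170.84',
  'W:343.71',
  'H:8.95',
  'SyncTeX result end',
].join('\n')
console.log(parseViewOutput(exampleViewOutput))
// [ { page: 1, h: 133.77, v: 170.84, width: 343.71, height: 8.95 } ]
const exampleEditOutput = [
  'Output:/compile/output.pdf',
  'Input:/compile/chapters/intro.tex',
  'Line:42',
  'Column:-1',
].join('\n')
console.log(parseEditOutput(exampleEditOutput, '/compile'))
// [ { file: 'chapters/intro.tex', line: 42, column: -1 } ]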

View File

@@ -0,0 +1,109 @@
/* eslint-disable
no-unused-vars,
*/
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
/*
* decaffeinate suggestions:
* DS101: Remove unnecessary use of Array.from
* DS102: Remove unnecessary code created because of implicit returns
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let TikzManager
const fs = require('node:fs')
const Path = require('node:path')
const { promisify } = require('node:util')
const ResourceWriter = require('./ResourceWriter')
const SafeReader = require('./SafeReader')
const logger = require('@overleaf/logger')
// for \tikzexternalize or pstool to work the main file needs to match the
// jobname. Since we set the -jobname to output, we have to create a
// copy of the main file as 'output.tex'.
module.exports = TikzManager = {
checkMainFile(compileDir, mainFile, resources, callback) {
// if there's already an output.tex file, we don't want to touch it
if (callback == null) {
callback = function () {}
}
for (const resource of Array.from(resources)) {
if (resource.path === 'output.tex') {
logger.debug(
{ compileDir, mainFile },
'output.tex already in resources'
)
return callback(null, false)
}
}
// if there's no output.tex, see if we are using tikz/pgf or pstool in the main file
return ResourceWriter.checkPath(
compileDir,
mainFile,
function (error, path) {
if (error != null) {
return callback(error)
}
return SafeReader.readFile(
path,
65536,
'utf8',
function (error, content) {
if (error != null) {
return callback(error)
}
const usesTikzExternalize =
(content != null
? content.indexOf('\\tikzexternalize')
: undefined) >= 0
const usesPsTool =
(content != null ? content.indexOf('{pstool}') : undefined) >= 0
logger.debug(
{ compileDir, mainFile, usesTikzExternalize, usesPsTool },
'checked for packages needing main file as output.tex'
)
const needsMainFile = usesTikzExternalize || usesPsTool
return callback(null, needsMainFile)
}
)
}
)
},
injectOutputFile(compileDir, mainFile, callback) {
if (callback == null) {
callback = function () {}
}
return ResourceWriter.checkPath(
compileDir,
mainFile,
function (error, path) {
if (error != null) {
return callback(error)
}
return fs.readFile(path, 'utf8', function (error, content) {
if (error != null) {
return callback(error)
}
logger.debug(
{ compileDir, mainFile },
'copied file to output.tex as project uses packages which require it'
)
// use wx flag to ensure that output file does not already exist
return fs.writeFile(
Path.join(compileDir, 'output.tex'),
content,
{ flag: 'wx' },
callback
)
})
}
)
},
}
module.exports.promises = {
checkMainFile: promisify(TikzManager.checkMainFile),
injectOutputFile: promisify(TikzManager.injectOutputFile),
}
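// Editorial usage sketch, not part of the module: typical flow while preparing a
// compile directory. The directory, main file and resource list are made up.
TikzManager.checkMainFile(
  '/compile/project',
  'main.tex',
  [{ path: 'main.tex' }, { path: 'figures/plot.tex' }],
  (err, needsOutputFile) => {
    if (err) return console.error(err)
    if (!needsOutputFile) return
    // main.tex uses \tikzexternalize or pstool, so mirror it to output.tex
    TikzManager.injectOutputFile('/compile/project', 'main.tex', err => {
      if (err) console.error(err)
    })
  }
)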

View File

@@ -0,0 +1,132 @@
/* eslint-disable
no-return-assign,
*/
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
/*
* decaffeinate suggestions:
* DS101: Remove unnecessary use of Array.from
* DS102: Remove unnecessary code created because of implicit returns
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
const UrlFetcher = require('./UrlFetcher')
const Settings = require('@overleaf/settings')
const fs = require('node:fs')
const Path = require('node:path')
const { callbackify } = require('node:util')
const Metrics = require('./Metrics')
const PENDING_DOWNLOADS = new Map()
function getProjectDir(projectId) {
return Path.join(Settings.path.clsiCacheDir, projectId)
}
function getCachePath(projectId, url, lastModified) {
// The url is a filestore URL.
// It is sufficient to look at the path and mtime for uniqueness.
const mtime = (lastModified && lastModified.getTime()) || 0
const key = new URL(url).pathname.replace(/\//g, '-') + '-' + mtime
return Path.join(getProjectDir(projectId), key)
}
async function clearProject(projectId, options) {
const timer = new Metrics.Timer('url_cache', {
status: options?.reason || 'unknown',
path: 'delete',
})
await fs.promises.rm(getProjectDir(projectId), {
force: true,
recursive: true,
})
timer.done()
}
async function createProjectDir(projectId) {
await fs.promises.mkdir(getProjectDir(projectId), { recursive: true })
}
async function downloadUrlToFile(
projectId,
url,
fallbackURL,
destPath,
lastModified
) {
const cachePath = getCachePath(projectId, url, lastModified)
try {
const timer = new Metrics.Timer('url_cache', {
status: 'cache-hit',
path: 'copy',
})
try {
await fs.promises.copyFile(cachePath, destPath)
} catch (err) {
if (err.code === 'ENOENT' && fallbackURL) {
const fallbackPath = getCachePath(projectId, fallbackURL, lastModified)
await fs.promises.copyFile(fallbackPath, destPath)
} else {
throw err
}
}
// the metric is only updated if the file is present in the cache
timer.done()
return
} catch (e) {
if (e.code !== 'ENOENT') {
throw e
}
}
// time the download
{
const timer = new Metrics.Timer('url_cache', {
status: 'cache-miss',
path: 'download',
})
try {
await download(url, fallbackURL, cachePath)
} finally {
timer.done()
}
}
// time the file copy
{
const timer = new Metrics.Timer('url_cache', {
status: 'cache-miss',
path: 'copy',
})
await fs.promises.copyFile(cachePath, destPath)
timer.done()
}
}
async function download(url, fallbackURL, cachePath) {
let pending = PENDING_DOWNLOADS.get(cachePath)
if (pending) {
return pending
}
pending = UrlFetcher.promises.pipeUrlToFileWithRetry(
url,
fallbackURL,
cachePath
)
PENDING_DOWNLOADS.set(cachePath, pending)
try {
await pending
} finally {
PENDING_DOWNLOADS.delete(cachePath)
}
}
module.exports = {
clearProject: callbackify(clearProject),
createProjectDir: callbackify(createProjectDir),
downloadUrlToFile: callbackify(downloadUrlToFile),
promises: {
clearProject,
createProjectDir,
downloadUrlToFile,
},
}
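// Editorial usage sketch, not part of the module: the cache key depends only on the
// URL path and the last-modified time, so a re-upload with a new mtime misses the
// cache while a repeated compile hits it. The project id and URL are made up; the
// directory prefix comes from Settings.path.clsiCacheDir.
const exampleCachePath = getCachePath(
  'project-id-123',
  'https://filestore.example.com/project/project-id-123/file/file-id-456',
  new Date('2025-01-01T00:00:00Z')
)
console.log(exampleCachePath)
// <clsiCacheDir>/project-id-123/-project-project-id-123-file-file-id-456-1735689600000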

View File

@@ -0,0 +1,122 @@
const fs = require('node:fs')
const logger = require('@overleaf/logger')
const Settings = require('@overleaf/settings')
const {
CustomHttpAgent,
CustomHttpsAgent,
fetchStream,
RequestFailedError,
} = require('@overleaf/fetch-utils')
const { URL } = require('node:url')
const { pipeline } = require('node:stream/promises')
const Metrics = require('./Metrics')
const MAX_CONNECT_TIME = 1000
const httpAgent = new CustomHttpAgent({ connectTimeout: MAX_CONNECT_TIME })
const httpsAgent = new CustomHttpsAgent({ connectTimeout: MAX_CONNECT_TIME })
async function pipeUrlToFileWithRetry(url, fallbackURL, filePath) {
let remainingAttempts = 3
let lastErr
while (remainingAttempts-- > 0) {
const timer = new Metrics.Timer('url_fetcher', {
      path: lastErr ? 'retry' : 'fetch',
})
try {
await pipeUrlToFile(url, fallbackURL, filePath)
timer.done({ status: 'success' })
return
} catch (err) {
timer.done({ status: 'error' })
logger.warn(
{ err, url, filePath, remainingAttempts },
'error downloading url'
)
lastErr = err
}
}
throw lastErr
}
async function pipeUrlToFile(url, fallbackURL, filePath) {
const u = new URL(url)
if (
Settings.filestoreDomainOveride &&
u.host !== Settings.apis.clsiPerf.host
) {
url = `${Settings.filestoreDomainOveride}${u.pathname}${u.search}`
}
if (fallbackURL) {
const u2 = new URL(fallbackURL)
if (
Settings.filestoreDomainOveride &&
u2.host !== Settings.apis.clsiPerf.host
) {
fallbackURL = `${Settings.filestoreDomainOveride}${u2.pathname}${u2.search}`
}
}
let stream
try {
stream = await fetchStream(url, {
signal: AbortSignal.timeout(60 * 1000),
// provide a function to get the agent for each request
// as there may be multiple requests with different protocols
// due to redirects.
agent: _url => (_url.protocol === 'https:' ? httpsAgent : httpAgent),
})
} catch (err) {
if (
fallbackURL &&
err instanceof RequestFailedError &&
err.response.status === 404
) {
stream = await fetchStream(fallbackURL, {
signal: AbortSignal.timeout(60 * 1000),
// provide a function to get the agent for each request
// as there may be multiple requests with different protocols
// due to redirects.
agent: _url => (_url.protocol === 'https:' ? httpsAgent : httpAgent),
})
url = fallbackURL
} else {
throw err
}
}
const source = inferSource(url)
Metrics.inc('url_source', 1, { path: source })
const atomicWrite = filePath + '~'
try {
const output = fs.createWriteStream(atomicWrite)
await pipeline(stream, output)
await fs.promises.rename(atomicWrite, filePath)
Metrics.count('UrlFetcher.downloaded_bytes', output.bytesWritten, {
path: source,
})
} catch (err) {
try {
await fs.promises.unlink(atomicWrite)
} catch (e) {}
throw err
}
}
const BUCKET_REGEX = /\/bucket\/([^/]+)\/key\//
function inferSource(url) {
if (url.includes(Settings.apis.clsiPerf.host)) {
return 'clsi-perf'
} else if (url.includes('/project/') && url.includes('/file/')) {
return 'user-files'
} else if (url.includes('/key/')) {
const match = url.match(BUCKET_REGEX)
if (match) return match[1]
}
return 'unknown'
}
module.exports.promises = {
pipeUrlToFileWithRetry,
}
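// Editorial usage sketch, not part of the module: fetch a file with up to three
// attempts, trying the fallback URL when the primary returns a 404. The URLs and
// destination path are made up.
pipeUrlToFileWithRetry(
  'https://filestore.example.com/project/p1/file/f1',
  'https://history.example.com/project/p1/blob/abc123',
  '/tmp/f1'
)
  .then(() => console.log('downloaded'))
  .catch(err => console.error('download failed', err))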

View File

@@ -0,0 +1,67 @@
const { NoXrefTableError } = require('./Errors')
const fs = require('node:fs')
const { O_RDONLY, O_NOFOLLOW } = fs.constants
const MAX_XREF_FILE_SIZE = 1024 * 1024
/** Parse qpdf --show-xref output to get a table of xref entries
*
* @param {string} filePath
* @param {number} pdfFileSize
 * @returns {Promise<{xRefEntries: Array<{offset: number, uncompressed?: boolean}>}>}
*/
async function parseXrefTable(filePath, pdfFileSize) {
try {
// the xref table will be written to output.pdfxref when available
const xRefFilePath = filePath + 'xref'
// check the size of the file (as it is untrusted)
const stats = await fs.promises.stat(xRefFilePath)
if (!stats.isFile()) {
throw new NoXrefTableError('xref file invalid type')
}
if (stats.size === 0) {
throw new NoXrefTableError('xref file empty')
}
if (stats.size > MAX_XREF_FILE_SIZE) {
throw new NoXrefTableError('xref file too large')
}
const content = await fs.promises.readFile(xRefFilePath, {
encoding: 'ascii',
flag: O_RDONLY | O_NOFOLLOW,
})
// the qpdf xref table output looks like this:
//
// 3/0: uncompressed; offset = 194159
//
// we only need the uncompressed objects
const matches = content.matchAll(
      // cap every matched number at nine digits (< 10^9) for safety
// ignore the generation id in "id/gen"
// in a linearized pdf all objects must have generation number 0
/^\d{1,9}\/\d{1,9}: uncompressed; offset = (\d{1,9})$/gm
)
// include a zero-index object for backwards compatibility with
// our existing xref table parsing code
const xRefEntries = [{ offset: 0 }]
// extract all the xref table entries
for (const match of matches) {
const offset = parseInt(match[1], 10)
xRefEntries.push({ offset, uncompressed: true })
}
if (xRefEntries.length === 1) {
throw new NoXrefTableError('xref file has no objects')
}
return { xRefEntries }
} catch (err) {
if (err instanceof NoXrefTableError) {
throw err
} else if (err.code) {
throw new NoXrefTableError(`xref file error ${err.code}`)
} else {
throw new NoXrefTableError('xref file parse error')
}
}
}
module.exports = {
parseXrefTable,
}
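// Editorial usage sketch, not part of the module: given a compile directory where
// `qpdf --show-xref output.pdf` has been written to output.pdfxref, collect the
// offsets of the uncompressed objects. The path and size argument are made up.
parseXrefTable('/compile/project/output.pdf', 1024 * 1024)
  .then(({ xRefEntries }) => console.log(xRefEntries))
  .catch(err => console.error(err.message))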