first commit
services/history-v1/backupVerifier/ProjectMetrics.mjs (new file, 33 lines)
@@ -0,0 +1,33 @@
import Metrics from '@overleaf/metrics'
import { objectIdFromDate } from './utils.mjs'
import { db } from '../storage/lib/mongodb.js'

const projectsCollection = db.collection('projects')

/**
 * Records a gauge of projects whose backup changes were still pending before `beforeTime`.
 *
 * @param {Date} beforeTime
 * @return {Promise<void>}
 */
export async function measurePendingChangesBeforeTime(beforeTime) {
  const pendingChangeCount = await projectsCollection.countDocuments({
    'overleaf.backup.pendingChangeAt': {
      $lt: beforeTime,
    },
  })

  Metrics.gauge('backup_verification_pending_changes', pendingChangeCount)
}

/**
 * Records a gauge of projects created before `graceTime` that have never been backed up.
 *
 * @param {Date} graceTime
 * @return {Promise<void>}
 */
export async function measureNeverBackedUpProjects(graceTime) {
  const neverBackedUpCount = await projectsCollection.countDocuments({
    'overleaf.backup.lastBackedUpVersion': null,
    _id: { $lt: objectIdFromDate(graceTime) },
  })
  Metrics.gauge('backup_verification_never_backed_up', neverBackedUpCount)
}
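A minimal sketch of how these gauges might be refreshed by a caller. The scheduling interval and the RPO-derived cut-off are assumptions for illustration, not part of this commit:

// Hypothetical caller: refresh both backup gauges every five minutes,
// measuring everything older than twice the configured RPO.
import { getEndDateForRPO } from './utils.mjs'
import {
  measurePendingChangesBeforeTime,
  measureNeverBackedUpProjects,
} from './ProjectMetrics.mjs'

async function refreshBackupGauges() {
  const cutoff = getEndDateForRPO(2)
  await measurePendingChangesBeforeTime(cutoff)
  await measureNeverBackedUpProjects(cutoff)
}

setInterval(() => {
  refreshBackupGauges().catch(err => console.error('gauge refresh failed', err))
}, 5 * 60 * 1000)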
services/history-v1/backupVerifier/ProjectSampler.mjs (new file, 79 lines)
@@ -0,0 +1,79 @@
// @ts-check
import { objectIdFromDate } from './utils.mjs'
import { db } from '../storage/lib/mongodb.js'
import config from 'config'

const projectsCollection = db.collection('projects')

const HAS_PROJECTS_WITHOUT_HISTORY =
  config.get('hasProjectsWithoutHistory') === 'true'

/**
 * @param {Date} start
 * @param {Date} end
 * @param {number} N
 * @yields {string}
 */
export async function* getProjectsCreatedInDateRangeCursor(start, end, N) {
  yield* getSampleProjectsCursor(N, [
    {
      $match: {
        _id: {
          $gt: objectIdFromDate(start),
          $lte: objectIdFromDate(end),
        },
      },
    },
  ])
}

export async function* getProjectsUpdatedInDateRangeCursor(start, end, N) {
  yield* getSampleProjectsCursor(N, [
    {
      $match: {
        'overleaf.history.updatedAt': {
          $gt: start,
          $lte: end,
        },
      },
    },
  ])
}

/**
 * @typedef {import('mongodb').Document} Document
 */

/**
 * Samples up to N projects and yields each project's history id as a string.
 *
 * @generator
 * @param {number} N
 * @param {Array<Document>} preSampleAggregationStages
 * @yields {string}
 */
export async function* getSampleProjectsCursor(
  N,
  preSampleAggregationStages = []
) {
  const cursor = projectsCollection.aggregate([
    ...preSampleAggregationStages,
    { $sample: { size: N } },
    { $project: { 'overleaf.history.id': 1 } },
  ])

  let validProjects = 0
  let hasInvalidProject = false

  for await (const project of cursor) {
    if (HAS_PROJECTS_WITHOUT_HISTORY && !project.overleaf?.history?.id) {
      hasInvalidProject = true
      continue
    }
    validProjects++
    yield project.overleaf.history.id.toString()
  }

  if (validProjects === 0 && hasInvalidProject) {
    yield* getSampleProjectsCursor(N, preSampleAggregationStages)
  }
}
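As a usage sketch (assuming the shared Mongo connection is configured), the sampler generators are consumed with for-await. The dates and sample size below are illustrative:

import { getProjectsCreatedInDateRangeCursor } from './ProjectSampler.mjs'

const start = new Date('2025-01-01T00:00:00Z')
const end = new Date('2025-02-01T00:00:00Z')

// Print the history ids of up to 50 randomly sampled projects created in January 2025.
for await (const historyId of getProjectsCreatedInDateRangeCursor(start, end, 50)) {
  console.log(historyId)
}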
services/history-v1/backupVerifier/ProjectVerifier.mjs (new file, 320 lines)
@@ -0,0 +1,320 @@
// @ts-check
import { verifyProjectWithErrorContext } from '../storage/lib/backupVerifier.mjs'
import { promiseMapSettledWithLimit } from '@overleaf/promise-utils'
import logger from '@overleaf/logger'
import metrics from '@overleaf/metrics'
import {
  getSampleProjectsCursor,
  getProjectsCreatedInDateRangeCursor,
  getProjectsUpdatedInDateRangeCursor,
} from './ProjectSampler.mjs'
import OError from '@overleaf/o-error'
import { setTimeout } from 'node:timers/promises'

const MS_PER_30_DAYS = 30 * 24 * 60 * 60 * 1000

const failureCounter = new metrics.prom.Counter({
  name: 'backup_project_verification_failed',
  help: 'Number of projects that failed verification',
  labelNames: ['name'],
})

const successCounter = new metrics.prom.Counter({
  name: 'backup_project_verification_succeeded',
  help: 'Number of projects that succeeded verification',
})

let WRITE_METRICS = false

/**
 * @typedef {import('node:events').EventEmitter} EventEmitter
 */

/**
 * Allows writing metrics to be enabled or disabled.
 * @param {Boolean} writeMetrics
 */
export function setWriteMetrics(writeMetrics) {
  WRITE_METRICS = writeMetrics
}

/**
 * Logs a verification failure, increments the failure counter and returns the error name.
 *
 * @param {Error|unknown} error
 * @param {string} historyId
 */
function handleVerificationError(error, historyId) {
  const name = error instanceof Error ? error.name : 'UnknownError'
  logger.error({ historyId, error, name }, 'error verifying project backup')

  WRITE_METRICS && failureCounter.inc({ name })

  return name
}

/**
 * Splits a date range into consecutive jobs of at most `interval` milliseconds each.
 *
 * @param {Date} startDate
 * @param {Date} endDate
 * @param {number} interval
 * @returns {Array<VerificationJobSpecification>}
 */
function splitJobs(startDate, endDate, interval) {
  /** @type {Array<VerificationJobSpecification>} */
  const jobs = []
  while (startDate < endDate) {
    const nextStart = new Date(
      Math.min(startDate.getTime() + interval, endDate.getTime())
    )
    jobs.push({ startDate, endDate: nextStart })
    startDate = nextStart
  }
  return jobs
}

/**
 * Verifies every project yielded by the cursor, stopping early on a 'shutdown' event.
 *
 * @param {AsyncGenerator<string>} historyIdCursor
 * @param {EventEmitter} [eventEmitter]
 * @param {number} [delay] - Allows a delay between each verification
 * @return {Promise<{verified: number, total: number, errorTypes: *[], hasFailure: boolean}>}
 */
async function verifyProjectsFromCursor(
  historyIdCursor,
  eventEmitter,
  delay = 0
) {
  const errorTypes = []
  let verified = 0
  let total = 0
  let receivedShutdownSignal = false
  if (eventEmitter) {
    eventEmitter.once('shutdown', () => {
      receivedShutdownSignal = true
    })
  }
  for await (const historyId of historyIdCursor) {
    if (receivedShutdownSignal) {
      break
    }
    total++
    try {
      await verifyProjectWithErrorContext(historyId)
      logger.debug({ historyId }, 'verified project backup successfully')
      WRITE_METRICS && successCounter.inc()
      verified++
    } catch (error) {
      const errorType = handleVerificationError(error, historyId)
      errorTypes.push(errorType)
    }
    if (delay > 0) {
      await setTimeout(delay)
    }
  }
  return {
    verified,
    total,
    errorTypes,
    hasFailure: errorTypes.length > 0,
  }
}

/**
 * Verifies a random sample of projects.
 *
 * @param {number} nProjectsToSample
 * @param {EventEmitter} [signal]
 * @param {number} [delay]
 * @return {Promise<VerificationJobStatus>}
 */
export async function verifyRandomProjectSample(
  nProjectsToSample,
  signal,
  delay = 0
) {
  const historyIds = await getSampleProjectsCursor(nProjectsToSample)
  return await verifyProjectsFromCursor(historyIds, signal, delay)
}

/**
 * Samples projects with history IDs between the specified dates and verifies them.
 *
 * @param {Date} startDate
 * @param {Date} endDate
 * @param {number} projectsPerRange
 * @param {EventEmitter} [signal]
 * @return {Promise<VerificationJobStatus>}
 */
async function verifyRange(startDate, endDate, projectsPerRange, signal) {
  logger.info({ startDate, endDate }, 'verifying range')

  const results = await verifyProjectsFromCursor(
    getProjectsCreatedInDateRangeCursor(startDate, endDate, projectsPerRange),
    signal
  )

  if (results.total === 0) {
    logger.debug(
      { start: startDate, end: endDate },
      'No projects found in range'
    )
  }

  const jobStatus = {
    ...results,
    startDate,
    endDate,
  }

  logger.debug(
    { ...jobStatus, errorTypes: Array.from(new Set(jobStatus.errorTypes)) },
    'Verified range'
  )
  return jobStatus
}

/**
 * @typedef {Object} VerificationJobSpecification
 * @property {Date} startDate
 * @property {Date} endDate
 */

/**
 * @typedef {import('./types.d.ts').VerificationJobStatus} VerificationJobStatus
 */

/**
 * @typedef {Object} VerifyDateRangeOptions
 * @property {Date} startDate
 * @property {Date} endDate
 * @property {number} [interval]
 * @property {number} [projectsPerRange]
 * @property {number} [concurrency]
 * @property {EventEmitter} [signal]
 */

/**
 * Splits the date range into intervals and verifies a sample of projects created in each one.
 *
 * @param {VerifyDateRangeOptions} options
 * @return {Promise<VerificationJobStatus>}
 */
export async function verifyProjectsCreatedInDateRange({
  concurrency = 0,
  projectsPerRange = 10,
  startDate,
  endDate,
  interval = MS_PER_30_DAYS,
  signal,
}) {
  const jobs = splitJobs(startDate, endDate, interval)
  if (jobs.length === 0) {
    throw new OError('Time range could not be split into jobs', {
      start: startDate,
      end: endDate,
      interval,
    })
  }
  const settlements = await promiseMapSettledWithLimit(
    concurrency,
    jobs,
    ({ startDate, endDate }) =>
      verifyRange(startDate, endDate, projectsPerRange, signal)
  )
  return settlements.reduce(
    /**
     * Accumulates the per-range results into a single job status.
     *
     * @param {VerificationJobStatus} acc
     * @param settlement
     * @return {VerificationJobStatus}
     */
    (acc, settlement) => {
      if (settlement.status !== 'rejected') {
        if (settlement.value.hasFailure) {
          acc.hasFailure = true
        }
        acc.total += settlement.value.total
        acc.verified += settlement.value.verified
        acc.errorTypes = acc.errorTypes.concat(settlement.value.errorTypes)
      } else {
        logger.error({ ...settlement.reason }, 'Error processing range')
      }
      return acc
    },
    /** @type {VerificationJobStatus} */
    {
      startDate,
      endDate,
      verified: 0,
      total: 0,
      hasFailure: false,
      errorTypes: [],
    }
  )
}

/**
 * Verifies that projects that have recently gone out of RPO have been updated.
 *
 * @param {Date} startDate
 * @param {Date} endDate
 * @param {number} nProjects
 * @param {EventEmitter} [signal]
 * @return {Promise<VerificationJobStatus>}
 */
export async function verifyProjectsUpdatedInDateRange(
  startDate,
  endDate,
  nProjects,
  signal
) {
  logger.debug(
    { startDate, endDate, nProjects },
    'Sampling projects updated in date range'
  )
  const results = await verifyProjectsFromCursor(
    getProjectsUpdatedInDateRangeCursor(startDate, endDate, nProjects),
    signal
  )

  if (results.total === 0) {
    logger.debug(
      { start: startDate, end: endDate },
      'No projects updated recently'
    )
  }

  const jobStatus = {
    ...results,
    startDate,
    endDate,
  }

  logger.debug(
    { ...jobStatus, errorTypes: Array.from(new Set(jobStatus.errorTypes)) },
    'Verified recently updated projects'
  )
  return jobStatus
}

/**
 * Continuously verifies random samples of projects until a 'shutdown' event is received.
 *
 * @param {EventEmitter} signal
 * @return {void}
 */
export function loopRandomProjects(signal) {
  let shutdown = false
  signal.on('shutdown', function () {
    shutdown = true
  })
  async function loop() {
    do {
      try {
        const result = await verifyRandomProjectSample(100, signal, 2_000)
        logger.debug({ result }, 'verified random project sample')
      } catch (error) {
        logger.error({ error }, 'error verifying random project sample')
      }
      // eslint-disable-next-line no-unmodified-loop-condition
    } while (!shutdown)
  }
  loop()
}
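A hedged sketch of how these entry points might be driven from a worker process, using an EventEmitter as the shutdown signal. The SIGTERM wiring and the one-year window are assumptions about the calling process, not part of this file:

import { EventEmitter } from 'node:events'
import {
  setWriteMetrics,
  verifyProjectsCreatedInDateRange,
  loopRandomProjects,
} from './ProjectVerifier.mjs'

const shutdownSignal = new EventEmitter()
process.on('SIGTERM', () => shutdownSignal.emit('shutdown'))

setWriteMetrics(true)

// Continuously sample and verify random projects in the background...
loopRandomProjects(shutdownSignal)

// ...and verify projects created over the last year, 10 per 30-day slice.
const status = await verifyProjectsCreatedInDateRange({
  startDate: new Date(Date.now() - 365 * 24 * 60 * 60 * 1000),
  endDate: new Date(),
  concurrency: 2,
  signal: shutdownSignal,
})
console.log(status)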
services/history-v1/backupVerifier/healthCheck.mjs (new file, 32 lines)
@@ -0,0 +1,32 @@
import config from 'config'
import { verifyProjectWithErrorContext } from '../storage/lib/backupVerifier.mjs'
import {
  measureNeverBackedUpProjects,
  measurePendingChangesBeforeTime,
} from './ProjectMetrics.mjs'
import { getEndDateForRPO, RPO } from './utils.mjs'

/** @type {Array<string>} */
const HEALTH_CHECK_PROJECTS = JSON.parse(config.get('healthCheckProjects'))

export async function healthCheck() {
  if (!Array.isArray(HEALTH_CHECK_PROJECTS)) {
    throw new Error('expected healthCheckProjects to be an array')
  }
  if (HEALTH_CHECK_PROJECTS.length !== 2) {
    throw new Error('expected 2 healthCheckProjects')
  }
  if (!HEALTH_CHECK_PROJECTS.some(id => id.length === 24)) {
    throw new Error('expected mongo id in healthCheckProjects')
  }
  if (!HEALTH_CHECK_PROJECTS.some(id => id.length < 24)) {
    throw new Error('expected postgres id in healthCheckProjects')
  }

  for (const historyId of HEALTH_CHECK_PROJECTS) {
    await verifyProjectWithErrorContext(historyId)
  }

  await measurePendingChangesBeforeTime(getEndDateForRPO(2))
  await measureNeverBackedUpProjects(getEndDateForRPO(2))
}
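The validation above expects the healthCheckProjects setting to parse to exactly two history ids: one 24-character Mongo ObjectId and one shorter Postgres id. A minimal illustration of a value that would pass these checks (the ids are placeholders, not real projects):

// The config value is a JSON string that is parsed at module load time.
const exampleHealthCheckProjects = '["507f1f77bcf86cd799439011", "42"]'
// JSON.parse(exampleHealthCheckProjects) yields one Mongo id and one Postgres id.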
services/history-v1/backupVerifier/types.d.ts (new file, vendored, 8 lines)
@@ -0,0 +1,8 @@
export type VerificationJobStatus = {
  verified: number
  total: number
  startDate?: Date
  endDate?: Date
  hasFailure: boolean
  errorTypes: Array<string>
}
services/history-v1/backupVerifier/utils.mjs (new file, 35 lines)
@@ -0,0 +1,35 @@
import { ObjectId } from 'mongodb'
import config from 'config'

export const RPO = parseInt(config.get('backupRPOInMS'), 10)

/**
 * @param {Date} time
 * @return {ObjectId}
 */
export function objectIdFromDate(time) {
  return ObjectId.createFromTime(time.getTime() / 1000)
}

/**
 * @param {number} [factor] - Multiply RPO by this factor, default is 1
 * @return {Date}
 */
export function getEndDateForRPO(factor = 1) {
  return new Date(Date.now() - RPO * factor)
}

/**
 * Creates a startDate, endDate pair that checks a period of time before the RPO horizon
 *
 * @param {number} offset - How many seconds we should check
 * @return {{endDate: Date, startDate: Date}}
 */
export function getDatesBeforeRPO(offset) {
  const now = new Date()
  const endDate = new Date(now.getTime() - RPO)
  return {
    endDate,
    startDate: new Date(endDate.getTime() - offset * 1000),
  }
}
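A worked example of the RPO window helper, assuming backupRPOInMS is set to 3600000 (a one-hour RPO); the numbers are illustrative only:

import { getDatesBeforeRPO } from './utils.mjs'

// With RPO = 3_600_000 ms and offset = 1_800 s:
//   endDate   = now - 1 hour
//   startDate = endDate - 30 minutes
const { startDate, endDate } = getDatesBeforeRPO(1800)
console.log(startDate.toISOString(), endDate.toISOString())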