first commit

2025-04-24 13:11:28 +08:00
commit ff9c54d5e4
5960 changed files with 834111 additions and 0 deletions


@@ -0,0 +1,7 @@
const OError = require('@overleaf/o-error')
class ChunkVersionConflictError extends OError {}
module.exports = {
ChunkVersionConflictError,
}


@@ -0,0 +1,447 @@
// @ts-check
'use strict'
/**
* Manage {@link Chunk} and {@link History} storage.
*
* For storage, chunks are immutable. If we want to update a project with new
* changes, we create a new chunk record and History object and delete the old
* ones. If we compact a project's history, we similarly destroy the old chunk
* (or chunks) and replace them with a new one. This is helpful when using S3,
* because it guarantees only eventual consistency for updates but provides
* stronger consistency guarantees for object creation.
*
* When a chunk record in the database is removed, we save its ID for later
* in the `old_chunks` table, rather than deleting it immediately. This lets us
* use batch deletion to reduce the number of delete requests to S3.
*
* The chunk store also caches data about which blobs are referenced by each
* chunk, which allows us to find unused blobs without loading all of the data
* for all projects from S3. Whenever we create a chunk, we also insert records
* into the `chunk_blobs` table, to help with this bookkeeping.
*/
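// An illustrative lifecycle, pieced together from the exports below (a
// sketch, not part of the API):
//
//   const projectId = await initializeProject() // creates chunk 0
//   const chunk = await loadLatest(projectId)
//   // ...build a new chunk that extends the history, then...
//   await update(projectId, chunk.getEndVersion(), newChunk)
//   // compaction works the same way: upload a replacement chunk, then
//   // confirm it while the old record is retired to `old_chunks`.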
const config = require('config')
const OError = require('@overleaf/o-error')
const { Chunk, History, Snapshot } = require('overleaf-editor-core')
const assert = require('../assert')
const BatchBlobStore = require('../batch_blob_store')
const { BlobStore } = require('../blob_store')
const { historyStore } = require('../history_store')
const mongoBackend = require('./mongo')
const postgresBackend = require('./postgres')
const { ChunkVersionConflictError } = require('./errors')
const DEFAULT_DELETE_BATCH_SIZE = parseInt(config.get('maxDeleteKeys'), 10)
const DEFAULT_DELETE_TIMEOUT_SECS = 3000 // 50 minutes
const DEFAULT_DELETE_MIN_AGE_SECS = 86400 // 1 day
/**
* Create the initial chunk for a project.
*/
async function initializeProject(projectId, snapshot) {
if (projectId != null) {
assert.projectId(projectId, 'bad projectId')
} else {
projectId = await postgresBackend.generateProjectId()
}
if (snapshot != null) {
assert.instance(snapshot, Snapshot, 'bad snapshot')
} else {
snapshot = new Snapshot()
}
const blobStore = new BlobStore(projectId)
await blobStore.initialize()
const backend = getBackend(projectId)
const chunkRecord = await backend.getLatestChunk(projectId)
if (chunkRecord != null) {
throw new AlreadyInitialized(projectId)
}
const history = new History(snapshot, [])
const chunk = new Chunk(history, 0)
await create(projectId, chunk)
return projectId
}
/**
* Load the blobs referenced in the given history
*/
async function lazyLoadHistoryFiles(history, batchBlobStore) {
const blobHashes = new Set()
history.findBlobHashes(blobHashes)
await batchBlobStore.preload(Array.from(blobHashes))
await history.loadFiles('lazy', batchBlobStore)
}
/**
* Load the latest Chunk stored for a project, including blob metadata.
*
* @param {string} projectId
* @param {Object} [opts]
* @param {boolean} [opts.readOnly]
* @return {Promise<{id: string, startVersion: number, endVersion: number, endTimestamp: Date}>}
*/
async function loadLatestRaw(projectId, opts) {
assert.projectId(projectId, 'bad projectId')
const backend = getBackend(projectId)
const chunkRecord = await backend.getLatestChunk(projectId, opts)
if (chunkRecord == null) {
throw new Chunk.NotFoundError(projectId)
}
return chunkRecord
}
/**
* Load the latest Chunk stored for a project, including blob metadata.
*
* @param {string} projectId
* @return {Promise.<Chunk>}
*/
async function loadLatest(projectId) {
const chunkRecord = await loadLatestRaw(projectId)
const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
const history = History.fromRaw(rawHistory)
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
await lazyLoadHistoryFiles(history, batchBlobStore)
return new Chunk(history, chunkRecord.startVersion)
}
/**
* Load the chunk that contains the given version, including blob metadata.
*/
async function loadAtVersion(projectId, version) {
assert.projectId(projectId, 'bad projectId')
assert.integer(version, 'bad version')
const backend = getBackend(projectId)
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
const chunkRecord = await backend.getChunkForVersion(projectId, version)
const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
const history = History.fromRaw(rawHistory)
await lazyLoadHistoryFiles(history, batchBlobStore)
return new Chunk(history, chunkRecord.endVersion - history.countChanges())
}
/**
* Load the chunk that contains the version that was current at the given
* timestamp, including blob metadata.
*/
async function loadAtTimestamp(projectId, timestamp) {
assert.projectId(projectId, 'bad projectId')
assert.date(timestamp, 'bad timestamp')
const backend = getBackend(projectId)
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
const chunkRecord = await backend.getChunkForTimestamp(projectId, timestamp)
const rawHistory = await historyStore.loadRaw(projectId, chunkRecord.id)
const history = History.fromRaw(rawHistory)
await lazyLoadHistoryFiles(history, batchBlobStore)
return new Chunk(history, chunkRecord.endVersion - history.countChanges())
}
/**
* Store the chunk and insert corresponding records in the database.
*
* @param {string} projectId
* @param {Chunk} chunk
* @param {Date} [earliestChangeTimestamp]
*/
async function create(projectId, chunk, earliestChangeTimestamp) {
assert.projectId(projectId, 'bad projectId')
assert.instance(chunk, Chunk, 'bad chunk')
assert.maybe.date(earliestChangeTimestamp, 'bad timestamp')
const backend = getBackend(projectId)
const chunkStart = chunk.getStartVersion()
const chunkId = await uploadChunk(projectId, chunk)
const opts = {}
if (chunkStart > 0) {
opts.oldChunkId = await getChunkIdForVersion(projectId, chunkStart - 1)
}
if (earliestChangeTimestamp != null) {
opts.earliestChangeTimestamp = earliestChangeTimestamp
}
await backend.confirmCreate(projectId, chunk, chunkId, opts)
}
/**
* Upload the given chunk to object storage.
*
* This is used by the create and update methods.
*/
async function uploadChunk(projectId, chunk) {
const backend = getBackend(projectId)
const blobStore = new BlobStore(projectId)
const historyStoreConcurrency = parseInt(
config.get('chunkStore.historyStoreConcurrency'),
10
)
const rawHistory = await chunk
.getHistory()
.store(blobStore, historyStoreConcurrency)
const chunkId = await backend.insertPendingChunk(projectId, chunk)
await historyStore.storeRaw(projectId, chunkId, rawHistory)
return chunkId
}
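// Note on write ordering, as implemented above: the chunk's blobs are stored
// first, a 'pending' database row then reserves the chunk id, and the raw
// history is uploaded under that id. The chunk only becomes visible once the
// caller confirms it via confirmCreate/confirmUpdate; in the Mongo backend,
// rows left in 'pending' by a crash are later swept up by deleteOldChunks.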
/**
* Extend the project's history by replacing the latest chunk with a new
* chunk.
*
* @param {string} projectId
* @param {number} oldEndVersion
* @param {Chunk} newChunk
* @param {Date} [earliestChangeTimestamp]
* @return {Promise}
*/
async function update(
projectId,
oldEndVersion,
newChunk,
earliestChangeTimestamp
) {
assert.projectId(projectId, 'bad projectId')
assert.integer(oldEndVersion, 'bad oldEndVersion')
assert.instance(newChunk, Chunk, 'bad newChunk')
assert.maybe.date(earliestChangeTimestamp, 'bad timestamp')
const backend = getBackend(projectId)
const oldChunkId = await getChunkIdForVersion(projectId, oldEndVersion)
const newChunkId = await uploadChunk(projectId, newChunk)
const opts = {}
if (earliestChangeTimestamp != null) {
opts.earliestChangeTimestamp = earliestChangeTimestamp
}
await backend.confirmUpdate(projectId, oldChunkId, newChunk, newChunkId, opts)
}
/**
* Find the chunk ID for a given version of a project.
*
* @param {string} projectId
* @param {number} version
* @return {Promise.<string>}
*/
async function getChunkIdForVersion(projectId, version) {
const backend = getBackend(projectId)
const chunkRecord = await backend.getChunkForVersion(projectId, version)
return chunkRecord.id
}
/**
* Find the chunk metadata for a given version of a project.
*
* @param {string} projectId
* @param {number} version
* @return {Promise.<{id: string|number, startVersion: number, endVersion: number}>}
*/
async function getChunkMetadataForVersion(projectId, version) {
const backend = getBackend(projectId)
const chunkRecord = await backend.getChunkForVersion(projectId, version)
return chunkRecord
}
/**
* Get all of a project's chunk ids
*/
async function getProjectChunkIds(projectId) {
const backend = getBackend(projectId)
const chunkIds = await backend.getProjectChunkIds(projectId)
return chunkIds
}
/**
* Get all of a project's chunks directly
*/
async function getProjectChunks(projectId) {
const backend = getBackend(projectId)
const chunkIds = await backend.getProjectChunks(projectId)
return chunkIds
}
/**
* Load the chunk for a given chunk record, including blob metadata.
*/
async function loadByChunkRecord(projectId, chunkRecord) {
const blobStore = new BlobStore(projectId)
const batchBlobStore = new BatchBlobStore(blobStore)
const { raw: rawHistory, buffer: chunkBuffer } =
await historyStore.loadRawWithBuffer(projectId, chunkRecord.id)
const history = History.fromRaw(rawHistory)
await lazyLoadHistoryFiles(history, batchBlobStore)
return {
chunk: new Chunk(history, chunkRecord.endVersion - history.countChanges()),
chunkBuffer,
}
}
/**
* Asynchronously retrieves project chunks starting from a specific version.
*
* This generator function yields chunk records for a given project starting from the specified version (inclusive).
* It continues to fetch and yield subsequent chunk records until the end version of the latest chunk metadata is reached.
* If you want to fetch all the chunks *after* a version V, call this function with V+1.
*
* @param {string} projectId - The ID of the project.
* @param {number} version - The starting version to retrieve chunks from.
* @returns {AsyncGenerator<Object, void, undefined>} An async generator that yields chunk records.
*/
async function* getProjectChunksFromVersion(projectId, version) {
const backend = getBackend(projectId)
const latestChunkMetadata = await loadLatestRaw(projectId)
if (!latestChunkMetadata || version > latestChunkMetadata.endVersion) {
return
}
let chunkRecord = await backend.getChunkForVersion(projectId, version)
while (chunkRecord != null) {
yield chunkRecord
if (chunkRecord.endVersion >= latestChunkMetadata.endVersion) {
break
} else {
chunkRecord = await backend.getChunkForVersion(
projectId,
chunkRecord.endVersion + 1
)
}
}
}
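// Example (illustrative): iterate over every chunk after version v.
//
//   for await (const record of getProjectChunksFromVersion(projectId, v + 1)) {
//     console.log(record.startVersion, record.endVersion)
//   }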
/**
* Delete the given chunk from the database.
*
* This doesn't delete the chunk from object storage yet. The old chunks
* collection will do that.
*/
async function destroy(projectId, chunkId) {
const backend = getBackend(projectId)
await backend.deleteChunk(projectId, chunkId)
}
/**
* Delete all of a project's chunks from the database.
*/
async function deleteProjectChunks(projectId) {
const backend = getBackend(projectId)
await backend.deleteProjectChunks(projectId)
}
/**
* Delete a given number of old chunks from both the database
* and from object storage.
*
* @param {object} options
* @param {number} [options.batchSize] - number of chunks to delete in each
* batch
* @param {number} [options.maxBatches] - maximum number of batches to process
* @param {number} [options.minAgeSecs] - minimum age of chunks to delete
* @param {number} [options.timeout] - maximum time to spend deleting chunks
*
* @return {Promise<number>} number of chunks deleted
*/
async function deleteOldChunks(options = {}) {
const batchSize = options.batchSize ?? DEFAULT_DELETE_BATCH_SIZE
const maxBatches = options.maxBatches ?? Number.MAX_SAFE_INTEGER
const minAgeSecs = options.minAgeSecs ?? DEFAULT_DELETE_MIN_AGE_SECS
const timeout = options.timeout ?? DEFAULT_DELETE_TIMEOUT_SECS
assert.greater(batchSize, 0)
assert.greater(timeout, 0)
assert.greater(maxBatches, 0)
assert.greaterOrEqual(minAgeSecs, 0)
const timeoutAfter = Date.now() + timeout * 1000
let deletedChunksTotal = 0
for (const backend of [postgresBackend, mongoBackend]) {
for (let i = 0; i < maxBatches; i++) {
if (Date.now() > timeoutAfter) {
break
}
const deletedChunks = await deleteOldChunksBatch(
backend,
batchSize,
minAgeSecs
)
deletedChunksTotal += deletedChunks.length
if (deletedChunks.length !== batchSize) {
// Last batch was incomplete. There probably are no old chunks left
break
}
}
}
return deletedChunksTotal
}
async function deleteOldChunksBatch(backend, count, minAgeSecs) {
assert.greater(count, 0, 'bad count')
assert.greaterOrEqual(minAgeSecs, 0, 'bad minAgeSecs')
const oldChunks = await backend.getOldChunksBatch(count, minAgeSecs)
if (oldChunks.length === 0) {
return []
}
await historyStore.deleteChunks(oldChunks)
await backend.deleteOldChunks(oldChunks.map(chunk => chunk.chunkId))
return oldChunks
}
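// The batch deletes chunks from object storage before removing the database
// records, presumably so that a failure between the two steps leaves the
// records in place for a retry rather than orphaning objects in storage.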
/**
* Returns the appropriate backend for the given project id
*
* Numeric ids use the Postgres backend.
* Strings of 24 characters use the Mongo backend.
*/
function getBackend(projectId) {
if (assert.POSTGRES_ID_REGEXP.test(projectId)) {
return postgresBackend
} else if (assert.MONGO_ID_REGEXP.test(projectId)) {
return mongoBackend
} else {
throw new OError('bad project id', { projectId })
}
}
class AlreadyInitialized extends OError {
constructor(projectId) {
super('Project is already initialized', { projectId })
}
}
module.exports = {
getBackend,
initializeProject,
loadLatest,
loadLatestRaw,
loadAtVersion,
loadAtTimestamp,
loadByChunkRecord,
create,
update,
destroy,
getChunkIdForVersion,
getChunkMetadataForVersion,
getProjectChunkIds,
getProjectChunks,
getProjectChunksFromVersion,
deleteProjectChunks,
deleteOldChunks,
AlreadyInitialized,
ChunkVersionConflictError,
}


@@ -0,0 +1,526 @@
// @ts-check
const { ObjectId, ReadPreference, MongoError } = require('mongodb')
const { Chunk } = require('overleaf-editor-core')
const OError = require('@overleaf/o-error')
const assert = require('../assert')
const mongodb = require('../mongodb')
const { ChunkVersionConflictError } = require('./errors')
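// MongoDB reports duplicate key violations (E11000) with this numeric code.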
const DUPLICATE_KEY_ERROR_CODE = 11000
/**
* @import { ClientSession } from 'mongodb'
*/
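// Chunk lifecycle, as implied by the queries below: 'pending' (uploaded but
// not yet confirmed) -> 'active' (the chunk currently being extended) ->
// 'closed' (no longer extendable but still readable) or 'deleted' (awaiting
// batch removal from object storage).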
/**
* Get the latest chunk's metadata from the database
* @param {string} projectId
* @param {Object} [opts]
* @param {boolean} [opts.readOnly]
*/
async function getLatestChunk(projectId, opts = {}) {
assert.mongoId(projectId, 'bad projectId')
const { readOnly = false } = opts
const record = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
},
{
sort: { startVersion: -1 },
readPreference: readOnly
? ReadPreference.secondaryPreferred
: ReadPreference.primary,
}
)
if (record == null) {
return null
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the chunk that contains the given version.
*/
async function getChunkForVersion(projectId, version) {
assert.mongoId(projectId, 'bad projectId')
assert.integer(version, 'bad version')
const record = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
startVersion: { $lte: version },
endVersion: { $gte: version },
},
{ sort: { startVersion: 1 } }
)
if (record == null) {
throw new Chunk.VersionNotFoundError(projectId, version)
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the project's first chunk (start version 0) whose
* end timestamp is at or before the given timestamp.
*/
async function getFirstChunkBeforeTimestamp(projectId, timestamp) {
assert.mongoId(projectId, 'bad projectId')
assert.date(timestamp, 'bad timestamp')
const recordActive = await getChunkForVersion(projectId, 0)
if (recordActive && recordActive.endTimestamp <= timestamp) {
return recordActive
}
// fallback to deleted chunk
const recordDeleted = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: 'deleted',
startVersion: 0,
updatedAt: { $lte: timestamp }, // indexed for state=deleted
endTimestamp: { $lte: timestamp },
},
{ sort: { updatedAt: -1 } }
)
if (recordDeleted) {
return chunkFromRecord(recordDeleted)
}
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
/**
* Get the metadata for the chunk that contains the version that was current at
* the given timestamp.
*/
async function getChunkForTimestamp(projectId, timestamp) {
assert.mongoId(projectId, 'bad projectId')
assert.date(timestamp, 'bad timestamp')
const record = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
endTimestamp: { $gte: timestamp },
},
// We use the index on the startVersion for sorting records. This assumes
// that timestamps go up with each version.
{ sort: { startVersion: 1 } }
)
if (record == null) {
// Couldn't find a chunk that had modifications after the given timestamp.
// Fetch the latest chunk instead.
const chunk = await getLatestChunk(projectId)
if (chunk == null) {
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
return chunk
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the chunk that contains the version that was current before
* the given timestamp.
*/
async function getLastActiveChunkBeforeTimestamp(projectId, timestamp) {
assert.mongoId(projectId, 'bad projectId')
assert.date(timestamp, 'bad timestamp')
const record = await mongodb.chunks.findOne(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
$or: [
{
endTimestamp: {
$lte: timestamp,
},
},
{
endTimestamp: null,
},
],
},
// We use the index on the startVersion for sorting records. This assumes
// that timestamps go up with each version.
{ sort: { startVersion: -1 } }
)
if (record == null) {
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
return chunkFromRecord(record)
}
/**
* Get all of a project's chunk ids
*/
async function getProjectChunkIds(projectId) {
assert.mongoId(projectId, 'bad projectId')
const cursor = mongodb.chunks.find(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
},
{ projection: { _id: 1 } }
)
return await cursor.map(record => record._id).toArray()
}
/**
* Get all of a project's chunks directly
*/
async function getProjectChunks(projectId) {
assert.mongoId(projectId, 'bad projectId')
const cursor = mongodb.chunks
.find(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
},
{ projection: { state: 0 } }
)
.sort({ startVersion: 1 })
return await cursor.map(chunkFromRecord).toArray()
}
/**
* Insert a pending chunk before sending it to object storage.
*/
async function insertPendingChunk(projectId, chunk) {
assert.mongoId(projectId, 'bad projectId')
assert.instance(chunk, Chunk, 'bad chunk')
const chunkId = new ObjectId()
await mongodb.chunks.insertOne({
_id: chunkId,
projectId: new ObjectId(projectId),
startVersion: chunk.getStartVersion(),
endVersion: chunk.getEndVersion(),
endTimestamp: chunk.getEndTimestamp(),
state: 'pending',
updatedAt: new Date(),
})
return chunkId.toString()
}
/**
* Record that a new chunk was created.
*
* @param {string} projectId
* @param {Chunk} chunk
* @param {string} chunkId
* @param {object} opts
* @param {Date} [opts.earliestChangeTimestamp]
* @param {string} [opts.oldChunkId]
*/
async function confirmCreate(projectId, chunk, chunkId, opts = {}) {
assert.mongoId(projectId, 'bad projectId')
assert.instance(chunk, Chunk, 'bad chunk')
assert.mongoId(chunkId, 'bad chunkId')
await mongodb.client.withSession(async session => {
await session.withTransaction(async () => {
if (opts.oldChunkId != null) {
await closeChunk(projectId, opts.oldChunkId, { session })
}
await activateChunk(projectId, chunkId, { session })
await updateProjectRecord(
projectId,
chunk,
opts.earliestChangeTimestamp,
{ session }
)
})
})
}
/**
* Write the metadata to the project record
*/
async function updateProjectRecord(
projectId,
chunk,
earliestChangeTimestamp,
mongoOpts = {}
) {
// record the end version against the project
await mongodb.projects.updateOne(
{
'overleaf.history.id': projectId, // string for Object ids, number for postgres ids
},
{
// always store the latest end version and timestamp for the chunk
$max: {
'overleaf.history.currentEndVersion': chunk.getEndVersion(),
'overleaf.history.currentEndTimestamp': chunk.getEndTimestamp(),
'overleaf.history.updatedAt': new Date(),
},
// store the first pending change timestamp for the chunk, this will
// be cleared every time a backup is completed.
$min: {
'overleaf.backup.pendingChangeAt':
earliestChangeTimestamp || chunk.getEndTimestamp() || new Date(),
},
},
mongoOpts
)
}
/**
* Record that a chunk was replaced by a new one.
*
* @param {string} projectId
* @param {string} oldChunkId
* @param {Chunk} newChunk
* @param {string} newChunkId
* @param {object} [opts]
* @param {Date} [opts.earliestChangeTimestamp]
*/
async function confirmUpdate(
projectId,
oldChunkId,
newChunk,
newChunkId,
opts = {}
) {
assert.mongoId(projectId, 'bad projectId')
assert.mongoId(oldChunkId, 'bad oldChunkId')
assert.instance(newChunk, Chunk, 'bad newChunk')
assert.mongoId(newChunkId, 'bad newChunkId')
await mongodb.client.withSession(async session => {
await session.withTransaction(async () => {
await deleteActiveChunk(projectId, oldChunkId, { session })
await activateChunk(projectId, newChunkId, { session })
await updateProjectRecord(
projectId,
newChunk,
opts.earliestChangeTimestamp,
{ session }
)
})
})
}
/**
* Activate a pending chunk
*
* @param {string} projectId
* @param {string} chunkId
* @param {object} [opts]
* @param {ClientSession} [opts.session]
*/
async function activateChunk(projectId, chunkId, opts = {}) {
assert.mongoId(projectId, 'bad projectId')
assert.mongoId(chunkId, 'bad chunkId')
let result
try {
result = await mongodb.chunks.updateOne(
{
_id: new ObjectId(chunkId),
projectId: new ObjectId(projectId),
state: 'pending',
},
{ $set: { state: 'active', updatedAt: new Date() } },
opts
)
} catch (err) {
if (err instanceof MongoError && err.code === DUPLICATE_KEY_ERROR_CODE) {
throw new ChunkVersionConflictError('chunk start version is not unique', {
projectId,
chunkId,
})
} else {
throw err
}
}
if (result.matchedCount === 0) {
throw new OError('pending chunk not found', { projectId, chunkId })
}
}
/**
* Close a chunk
*
* A closed chunk is one that can't be extended anymore.
*
* @param {string} projectId
* @param {string} chunkId
* @param {object} [opts]
* @param {ClientSession} [opts.session]
*/
async function closeChunk(projectId, chunkId, opts = {}) {
const result = await mongodb.chunks.updateOne(
{
_id: new ObjectId(chunkId),
projectId: new ObjectId(projectId),
state: 'active',
},
{ $set: { state: 'closed' } },
opts
)
if (result.matchedCount === 0) {
throw new ChunkVersionConflictError('unable to close chunk', {
projectId,
chunkId,
})
}
}
/**
* Delete an active chunk
*
* This is used to delete chunks that are in the process of being extended. It
* will refuse to delete chunks that are already closed and can therefore not be
* extended.
*
* @param {string} projectId
* @param {string} chunkId
* @param {object} [opts]
* @param {ClientSession} [opts.session]
*/
async function deleteActiveChunk(projectId, chunkId, opts = {}) {
const updateResult = await mongodb.chunks.updateOne(
{
_id: new ObjectId(chunkId),
projectId: new ObjectId(projectId),
state: 'active',
},
{ $set: { state: 'deleted', updatedAt: new Date() } },
opts
)
if (updateResult.matchedCount === 0) {
throw new ChunkVersionConflictError('unable to delete active chunk', {
projectId,
chunkId,
})
}
}
/**
* Delete a chunk.
*
* @param {string} projectId
* @param {string} chunkId
* @return {Promise}
*/
async function deleteChunk(projectId, chunkId, mongoOpts = {}) {
assert.mongoId(projectId, 'bad projectId')
assert.mongoId(chunkId, 'bad chunkId')
await mongodb.chunks.updateOne(
{ _id: new ObjectId(chunkId), projectId: new ObjectId(projectId) },
{ $set: { state: 'deleted', updatedAt: new Date() } },
mongoOpts
)
}
/**
* Delete all of a project's chunks
*/
async function deleteProjectChunks(projectId) {
assert.mongoId(projectId, 'bad projectId')
await mongodb.chunks.updateMany(
{
projectId: new ObjectId(projectId),
state: { $in: ['active', 'closed'] },
},
{ $set: { state: 'deleted', updatedAt: new Date() } }
)
}
/**
* Get a batch of old chunks for deletion
*/
async function getOldChunksBatch(count, minAgeSecs) {
const maxUpdatedAt = new Date(Date.now() - minAgeSecs * 1000)
const batch = []
// We need to fetch one state at a time to take advantage of the partial
// indexes on the chunks collection.
//
// Mongo 6.0 allows partial indexes that use the $in operator. When we reach
// that Mongo version, we can create a partial index on both the deleted and
// pending states and simplify this logic a bit.
for (const state of ['deleted', 'pending']) {
if (count === 0) {
// There's no more space in the batch
break
}
const cursor = mongodb.chunks
.find(
{ state, updatedAt: { $lt: maxUpdatedAt } },
{
limit: count,
projection: { _id: 1, projectId: 1 },
}
)
.map(record => ({
chunkId: record._id.toString(),
projectId: record.projectId.toString(),
}))
for await (const record of cursor) {
batch.push(record)
count -= 1
}
}
return batch
}
/**
* Delete a batch of old chunks from the database
*/
async function deleteOldChunks(chunkIds) {
await mongodb.chunks.deleteMany({
_id: { $in: chunkIds.map(id => new ObjectId(id)) },
state: { $in: ['deleted', 'pending'] },
})
}
/**
* Build a chunk metadata object from the database record
*/
function chunkFromRecord(record) {
return {
id: record._id.toString(),
startVersion: record.startVersion,
endVersion: record.endVersion,
endTimestamp: record.endTimestamp,
}
}
module.exports = {
getLatestChunk,
getFirstChunkBeforeTimestamp,
getLastActiveChunkBeforeTimestamp,
getChunkForVersion,
getChunkForTimestamp,
getProjectChunkIds,
getProjectChunks,
insertPendingChunk,
confirmCreate,
confirmUpdate,
updateProjectRecord,
deleteChunk,
deleteProjectChunks,
getOldChunksBatch,
deleteOldChunks,
}


@@ -0,0 +1,487 @@
// @ts-check
const { Chunk } = require('overleaf-editor-core')
const assert = require('../assert')
const knex = require('../knex')
const knexReadOnly = require('../knex_read_only')
const { ChunkVersionConflictError } = require('./errors')
const { updateProjectRecord } = require('./mongo')
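// Postgres SQLSTATE 23505: unique_violation.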
const DUPLICATE_KEY_ERROR_CODE = '23505'
/**
* @import { Knex } from 'knex'
*/
/**
* Get the latest chunk's metadata from the database
* @param {string} projectId
* @param {Object} [opts]
* @param {boolean} [opts.readOnly]
*/
async function getLatestChunk(projectId, opts = {}) {
assert.postgresId(projectId, 'bad projectId')
const { readOnly = false } = opts
const record = await (readOnly ? knexReadOnly : knex)('chunks')
.where('doc_id', parseInt(projectId, 10))
.orderBy('end_version', 'desc')
.first()
if (record == null) {
return null
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the chunk that contains the given version.
*
* @param {string} projectId
* @param {number} version
*/
async function getChunkForVersion(projectId, version) {
assert.postgresId(projectId, 'bad projectId')
const record = await knex('chunks')
.where('doc_id', parseInt(projectId, 10))
.where('end_version', '>=', version)
.orderBy('end_version')
.first()
if (!record) {
throw new Chunk.VersionNotFoundError(projectId, version)
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the project's first chunk (start version 0) whose
* end timestamp is at or before the given timestamp.
*
* @param {string} projectId
* @param {Date} timestamp
*/
async function getFirstChunkBeforeTimestamp(projectId, timestamp) {
assert.date(timestamp, 'bad timestamp')
const recordActive = await getChunkForVersion(projectId, 0)
// projectId must be valid if getChunkForVersion did not throw
if (recordActive && recordActive.endTimestamp <= timestamp) {
return recordActive
}
// fallback to deleted chunk
const recordDeleted = await knex('old_chunks')
.where('doc_id', parseInt(projectId, 10))
.where('start_version', '=', 0)
.where('end_timestamp', '<=', timestamp)
.orderBy('end_version', 'desc')
.first()
if (recordDeleted) {
return chunkFromRecord(recordDeleted)
}
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
/**
* Get the metadata for the chunk that contains the version that was current
* before the given timestamp.
*
* @param {string} projectId
* @param {Date} timestamp
*/
async function getLastActiveChunkBeforeTimestamp(projectId, timestamp) {
assert.date(timestamp, 'bad timestamp')
assert.postgresId(projectId, 'bad projectId')
const query = knex('chunks')
.where('doc_id', parseInt(projectId, 10))
.where(function () {
this.where('end_timestamp', '<=', timestamp).orWhere(
'end_timestamp',
null
)
})
.orderBy('end_version', 'desc', 'last')
const record = await query.first()
if (!record) {
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
return chunkFromRecord(record)
}
/**
* Get the metadata for the chunk that contains the version that was current at
* the given timestamp.
*
* @param {string} projectId
* @param {Date} timestamp
*/
async function getChunkForTimestamp(projectId, timestamp) {
assert.postgresId(projectId, 'bad projectId')
// This query finds the first chunk whose end_timestamp is at or after the
// given timestamp, OR the latest chunk overall. The second case covers
// timestamps later than the last chunk's end_timestamp, where the first
// condition matches nothing.
const whereAfterEndTimestampOrLatestChunk = knex.raw(
'end_timestamp >= ? ' +
'OR id = ( ' +
'SELECT id FROM chunks ' +
'WHERE doc_id = ? ' +
'ORDER BY end_version desc LIMIT 1' +
')',
[timestamp, parseInt(projectId, 10)]
)
const record = await knex('chunks')
.where('doc_id', parseInt(projectId, 10))
.where(whereAfterEndTimestampOrLatestChunk)
.orderBy('end_version')
.first()
if (!record) {
throw new Chunk.BeforeTimestampNotFoundError(projectId, timestamp)
}
return chunkFromRecord(record)
}
/**
* Build a chunk metadata object from the database record
*/
function chunkFromRecord(record) {
return {
id: record.id.toString(),
startVersion: record.start_version,
endVersion: record.end_version,
endTimestamp: record.end_timestamp,
}
}
/**
* Get all of a project's chunk ids
*
* @param {string} projectId
*/
async function getProjectChunkIds(projectId) {
assert.postgresId(projectId, 'bad projectId')
const records = await knex('chunks')
.select('id')
.where('doc_id', parseInt(projectId, 10))
return records.map(record => record.id)
}
/**
* Get all of a project's chunks directly
*
* @param {string} projectId
*/
async function getProjectChunks(projectId) {
assert.postgresId(projectId, 'bad projectId')
const records = await knex('chunks')
.select()
.where('doc_id', parseInt(projectId, 10))
.orderBy('end_version')
return records.map(chunkFromRecord)
}
/**
* Insert a pending chunk before sending it to object storage.
*
* @param {string} projectId
* @param {Chunk} chunk
*/
async function insertPendingChunk(projectId, chunk) {
assert.postgresId(projectId, 'bad projectId')
const result = await knex.first(
knex.raw("nextval('chunks_id_seq'::regclass)::integer as chunkid")
)
const chunkId = result.chunkid
await knex('pending_chunks').insert({
id: chunkId,
doc_id: parseInt(projectId, 10),
end_version: chunk.getEndVersion(),
start_version: chunk.getStartVersion(),
end_timestamp: chunk.getEndTimestamp(),
})
return chunkId.toString()
}
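// The id is drawn from chunks_id_seq even though the row goes into
// pending_chunks, so the same id can later be inserted into the chunks table
// by _insertChunk without a conflict.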
/**
* Record that a new chunk was created.
*
* @param {string} projectId
* @param {Chunk} chunk
* @param {string} chunkId
* @param {object} opts
* @param {Date} [opts.earliestChangeTimestamp]
* @param {string} [opts.oldChunkId]
*/
async function confirmCreate(projectId, chunk, chunkId, opts = {}) {
assert.postgresId(projectId, 'bad projectId')
await knex.transaction(async tx => {
if (opts.oldChunkId != null) {
await _assertChunkIsNotClosed(tx, projectId, opts.oldChunkId)
await _closeChunk(tx, projectId, opts.oldChunkId)
}
await Promise.all([
_deletePendingChunk(tx, projectId, chunkId),
_insertChunk(tx, projectId, chunk, chunkId),
])
await updateProjectRecord(
// The history id in Mongo is an integer for Postgres projects
parseInt(projectId, 10),
chunk,
opts.earliestChangeTimestamp
)
})
}
/**
* Record that a chunk was replaced by a new one.
*
* @param {string} projectId
* @param {string} oldChunkId
* @param {Chunk} newChunk
* @param {string} newChunkId
* @param {object} [opts]
* @param {Date} [opts.earliestChangeTimestamp]
*/
async function confirmUpdate(
projectId,
oldChunkId,
newChunk,
newChunkId,
opts = {}
) {
assert.postgresId(projectId, 'bad projectId')
await knex.transaction(async tx => {
await _assertChunkIsNotClosed(tx, projectId, oldChunkId)
await _deleteChunks(tx, { doc_id: parseInt(projectId, 10), id: parseInt(oldChunkId, 10) })
await Promise.all([
_deletePendingChunk(tx, projectId, newChunkId),
_insertChunk(tx, projectId, newChunk, newChunkId),
])
await updateProjectRecord(
// The history id in Mongo is an integer for Postgres projects
parseInt(projectId, 10),
newChunk,
opts.earliestChangeTimestamp
)
})
}
/**
* Delete a pending chunk
*
* @param {Knex} tx
* @param {string} projectId
* @param {string} chunkId
*/
async function _deletePendingChunk(tx, projectId, chunkId) {
await tx('pending_chunks')
.where({
doc_id: parseInt(projectId, 10),
id: parseInt(chunkId, 10),
})
.del()
}
/**
* Adds an active chunk
*
* @param {Knex} tx
* @param {string} projectId
* @param {Chunk} chunk
* @param {string} chunkId
*/
async function _insertChunk(tx, projectId, chunk, chunkId) {
const startVersion = chunk.getStartVersion()
const endVersion = chunk.getEndVersion()
try {
await tx('chunks').insert({
id: parseInt(chunkId, 10),
doc_id: parseInt(projectId, 10),
start_version: startVersion,
end_version: endVersion,
end_timestamp: chunk.getEndTimestamp(),
})
} catch (err) {
if (
err instanceof Error &&
'code' in err &&
err.code === DUPLICATE_KEY_ERROR_CODE
) {
throw new ChunkVersionConflictError(
'chunk start or end version is not unique',
{ projectId, chunkId, startVersion, endVersion }
)
}
throw err
}
}
/**
* Check that a chunk is not closed
*
* This is used to synchronize chunk creations and extensions.
*
* @param {Knex} tx
* @param {string} projectId
* @param {string} chunkId
*/
async function _assertChunkIsNotClosed(tx, projectId, chunkId) {
const record = await tx('chunks')
.forUpdate()
.select('closed')
.where('doc_id', parseInt(projectId, 10))
.where('id', parseInt(chunkId, 10))
.first()
if (!record) {
throw new ChunkVersionConflictError('unable to close chunk: not found', {
projectId,
chunkId,
})
}
if (record.closed) {
throw new ChunkVersionConflictError(
'unable to close chunk: already closed',
{
projectId,
chunkId,
}
)
}
}
/**
* Close a chunk
*
* A closed chunk can no longer be extended.
*
* @param {Knex} tx
* @param {string} projectId
* @param {string} chunkId
*/
async function _closeChunk(tx, projectId, chunkId) {
await tx('chunks')
.update({ closed: true })
.where('doc_id', parseInt(projectId, 10))
.where('id', parseInt(chunkId, 10))
}
/**
* Delete a chunk.
*
* @param {string} projectId
* @param {string} chunkId
*/
async function deleteChunk(projectId, chunkId) {
assert.postgresId(projectId, 'bad projectId')
assert.integer(chunkId, 'bad chunkId')
await _deleteChunks(knex, {
doc_id: parseInt(projectId, 10),
id: parseInt(chunkId, 10),
})
}
/**
* Delete all of a project's chunks
*
* @param {string} projectId
*/
async function deleteProjectChunks(projectId) {
assert.postgresId(projectId, 'bad projectId')
await knex.transaction(async tx => {
await _deleteChunks(tx, { doc_id: parseInt(projectId, 10) })
})
}
/**
* Delete many chunks
*
* @param {Knex} tx
* @param {any} whereClause
*/
async function _deleteChunks(tx, whereClause) {
const rows = await tx('chunks').where(whereClause).del().returning('*')
if (rows.length === 0) {
return
}
const oldChunks = rows.map(row => ({
doc_id: row.doc_id,
chunk_id: row.id,
start_version: row.start_version,
end_version: row.end_version,
end_timestamp: row.end_timestamp,
deleted_at: tx.fn.now(),
}))
await tx('old_chunks').insert(oldChunks)
}
/**
* Get a batch of old chunks for deletion
*
* @param {number} count
* @param {number} minAgeSecs
*/
async function getOldChunksBatch(count, minAgeSecs) {
const maxDeletedAt = new Date(Date.now() - minAgeSecs * 1000)
const records = await knex('old_chunks')
.whereNull('deleted_at')
.orWhere('deleted_at', '<', maxDeletedAt)
.orderBy('chunk_id')
.limit(count)
return records.map(oldChunk => ({
projectId: oldChunk.doc_id.toString(),
chunkId: oldChunk.chunk_id.toString(),
}))
}
/**
* Delete a batch of old chunks from the database
*
* @param {string[]} chunkIds
*/
async function deleteOldChunks(chunkIds) {
await knex('old_chunks')
.whereIn(
'chunk_id',
chunkIds.map(id => parseInt(id, 10))
)
.del()
}
/**
* Generate a new project id
*/
async function generateProjectId() {
const record = await knex.first(
knex.raw("nextval('docs_id_seq'::regclass)::integer as doc_id")
)
return record.doc_id.toString()
}
module.exports = {
getLatestChunk,
getFirstChunkBeforeTimestamp,
getLastActiveChunkBeforeTimestamp,
getChunkForVersion,
getChunkForTimestamp,
getProjectChunkIds,
getProjectChunks,
insertPendingChunk,
confirmCreate,
confirmUpdate,
deleteChunk,
deleteProjectChunks,
getOldChunksBatch,
deleteOldChunks,
generateProjectId,
}


@@ -0,0 +1,254 @@
const metrics = require('@overleaf/metrics')
const logger = require('@overleaf/logger')
const redis = require('../redis')
const rclient = redis.rclientHistory
const { Snapshot, Change, History, Chunk } = require('overleaf-editor-core')
const TEMPORARY_CACHE_LIFETIME = 300 // 5 minutes
const keySchema = {
snapshot({ projectId }) {
return `snapshot:{${projectId}}`
},
startVersion({ projectId }) {
return `snapshot-version:{${projectId}}`
},
changes({ projectId }) {
return `changes:{${projectId}}`
},
}
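// The braces in `{${projectId}}` are Redis Cluster hash tags: they pin all
// three keys for a project to the same slot, which the multi-key Lua scripts
// defined below require when running against a cluster.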
rclient.defineCommand('get_current_chunk', {
numberOfKeys: 3,
lua: `
local startVersionValue = redis.call('GET', KEYS[2])
if not startVersionValue then
return nil -- this is a cache-miss
end
local snapshotValue = redis.call('GET', KEYS[1])
local changesValues = redis.call('LRANGE', KEYS[3], 0, -1)
return {snapshotValue, startVersionValue, changesValues}
`,
})
/**
* Retrieves the current chunk of project history from Redis storage
* @param {string} projectId - The unique identifier of the project
* @returns {Promise<Chunk|null>} A Promise that resolves to a Chunk object containing project history,
* or null if retrieval fails
* @throws {Error} If Redis operations fail
*/
async function getCurrentChunk(projectId) {
try {
const result = await rclient.get_current_chunk(
keySchema.snapshot({ projectId }),
keySchema.startVersion({ projectId }),
keySchema.changes({ projectId })
)
if (!result) {
return null // cache-miss
}
const snapshot = Snapshot.fromRaw(JSON.parse(result[0]))
const startVersion = JSON.parse(result[1])
const changes = result[2].map(c => Change.fromRaw(JSON.parse(c)))
const history = new History(snapshot, changes)
const chunk = new Chunk(history, startVersion)
metrics.inc('chunk_store.redis.get_current_chunk', 1, { status: 'success' })
return chunk
} catch (err) {
logger.error({ err, projectId }, 'error getting current chunk from redis')
metrics.inc('chunk_store.redis.get_current_chunk', 1, { status: 'error' })
return null
}
}
rclient.defineCommand('get_current_chunk_metadata', {
numberOfKeys: 2,
lua: `
local startVersionValue = redis.call('GET', KEYS[1])
local changesCount = redis.call('LLEN', KEYS[2])
return {startVersionValue, changesCount}
`,
})
/**
* Retrieves the current chunk metadata for a given project from Redis
* @param {string} projectId - The ID of the project to get metadata for
* @returns {Promise<Object|null>} Object containing startVersion and changesCount if found, null on error or cache miss
* @property {number} startVersion - The starting version information
* @property {number} changesCount - The number of changes in the chunk
*/
async function getCurrentChunkMetadata(projectId) {
try {
const result = await rclient.get_current_chunk_metadata(
keySchema.startVersion({ projectId }),
keySchema.changes({ projectId })
)
if (!result) {
return null // cache-miss
}
const startVersion = JSON.parse(result[0])
const changesCount = parseInt(result[1], 10)
return { startVersion, changesCount }
} catch (err) {
return null
}
}
rclient.defineCommand('set_current_chunk', {
numberOfKeys: 3,
lua: `
local snapshotValue = ARGV[1]
local startVersionValue = ARGV[2]
redis.call('SETEX', KEYS[1], ${TEMPORARY_CACHE_LIFETIME}, snapshotValue)
redis.call('SETEX', KEYS[2], ${TEMPORARY_CACHE_LIFETIME}, startVersionValue)
redis.call('DEL', KEYS[3]) -- clear the old changes list
if #ARGV >= 3 then
redis.call('RPUSH', KEYS[3], unpack(ARGV, 3))
redis.call('EXPIRE', KEYS[3], ${TEMPORARY_CACHE_LIFETIME})
end
`,
})
/**
* Stores the current chunk of project history in Redis
* @param {string} projectId - The ID of the project
* @param {Chunk} chunk - The chunk object containing history data
* @returns {Promise<*>} Returns the result of the Redis operation, or null if an error occurs
* @throws {Error} May throw Redis-related errors which are caught internally
*/
async function setCurrentChunk(projectId, chunk) {
try {
const snapshotKey = keySchema.snapshot({ projectId })
const startVersionKey = keySchema.startVersion({ projectId })
const changesKey = keySchema.changes({ projectId })
const snapshot = chunk.history.snapshot
const startVersion = chunk.startVersion
const changes = chunk.history.changes
await rclient.set_current_chunk(
snapshotKey,
startVersionKey,
changesKey,
JSON.stringify(snapshot.toRaw()),
startVersion,
...changes.map(c => JSON.stringify(c.toRaw()))
)
metrics.inc('chunk_store.redis.set_current_chunk', 1, { status: 'success' })
} catch (err) {
logger.error(
{ err, projectId, chunk },
'error setting current chunk in redis'
)
metrics.inc('chunk_store.redis.set_current_chunk', 1, { status: 'error' })
return null // while testing we will suppress any errors
}
}
/**
* Checks whether a cached chunk's version metadata matches the current chunk's metadata
* @param {Chunk} cachedChunk - The chunk retrieved from cache
* @param {Chunk} currentChunk - The current chunk to compare against
* @returns {boolean} - Returns true if the chunks have matching start and end versions, false otherwise
*/
function checkCacheValidity(cachedChunk, currentChunk) {
return Boolean(
cachedChunk &&
cachedChunk.getStartVersion() === currentChunk.getStartVersion() &&
cachedChunk.getEndVersion() === currentChunk.getEndVersion()
)
}
/**
* Validates if a cached chunk matches the current chunk metadata by comparing versions
* @param {Object} cachedChunk - The cached chunk object to validate
* @param {Object} currentChunkMetadata - The current chunk metadata to compare against
* @param {number} currentChunkMetadata.startVersion - The starting version number
* @param {number} currentChunkMetadata.endVersion - The ending version number
* @returns {boolean} - True if the cached chunk is valid, false otherwise
*/
function checkCacheValidityWithMetadata(cachedChunk, currentChunkMetadata) {
return Boolean(
cachedChunk &&
cachedChunk.getStartVersion() === currentChunkMetadata.startVersion &&
cachedChunk.getEndVersion() === currentChunkMetadata.endVersion
)
}
/**
* Compares two chunks for equality using stringified JSON comparison
* @param {string} projectId - The ID of the project
* @param {Chunk} cachedChunk - The cached chunk to compare
* @param {Chunk} currentChunk - The current chunk to compare against
* @returns {boolean} - Returns false if either chunk is null/undefined, otherwise returns the comparison result
*/
function compareChunks(projectId, cachedChunk, currentChunk) {
if (!cachedChunk || !currentChunk) {
return false
}
const identical = JSON.stringify(cachedChunk) === JSON.stringify(currentChunk)
if (!identical) {
try {
logger.error(
{
projectId,
cachedChunkStartVersion: cachedChunk.getStartVersion(),
cachedChunkEndVersion: cachedChunk.getEndVersion(),
currentChunkStartVersion: currentChunk.getStartVersion(),
currentChunkEndVersion: currentChunk.getEndVersion(),
},
'chunk cache mismatch'
)
} catch (err) {
// ignore errors while logging
}
}
metrics.inc('chunk_store.redis.compare_chunks', 1, {
status: identical ? 'success' : 'fail',
})
return identical
}
// Define Lua script for atomic cache clearing
rclient.defineCommand('clear_chunk_cache', {
numberOfKeys: 3,
lua: `
-- Delete all keys related to a project's chunk cache atomically
redis.call('DEL', KEYS[1]) -- snapshot key
redis.call('DEL', KEYS[2]) -- startVersion key
redis.call('DEL', KEYS[3]) -- changes key
return 1
`,
})
/**
* Clears all cache entries for a project's chunk data
* @param {string} projectId - The ID of the project whose cache should be cleared
* @returns {Promise<boolean>} A promise that resolves to true if successful, false on error
*/
async function clearCache(projectId) {
try {
const snapshotKey = keySchema.snapshot({ projectId })
const startVersionKey = keySchema.startVersion({ projectId })
const changesKey = keySchema.changes({ projectId })
await rclient.clear_chunk_cache(snapshotKey, startVersionKey, changesKey)
metrics.inc('chunk_store.redis.clear_cache', 1, { status: 'success' })
return true
} catch (err) {
logger.error({ err, projectId }, 'error clearing chunk cache from redis')
metrics.inc('chunk_store.redis.clear_cache', 1, { status: 'error' })
return false
}
}
module.exports = {
getCurrentChunk,
setCurrentChunk,
getCurrentChunkMetadata,
checkCacheValidity,
checkCacheValidityWithMetadata,
compareChunks,
clearCache,
}
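// Illustrative read-through usage (a sketch; `chunkStore` here stands for a
// hypothetical caller that loads the authoritative chunk, e.g. the chunk
// store's loadLatest):
//
//   async function loadWithCache(projectId) {
//     const cached = await getCurrentChunk(projectId)
//     const current = await chunkStore.loadLatest(projectId)
//     if (checkCacheValidity(cached, current)) {
//       return cached
//     }
//     await setCurrentChunk(projectId, current)
//     return current
//   }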