Merge pull request #27246 from overleaf/jpa-hotfix-5-5-3

[server-pro] add hotfix 5.5.3

GitOrigin-RevId: 6bd266afb8f5ba622224b6095204ee6801c05a44
This commit is contained in:
Brian Gough
2025-07-29 14:46:33 +01:00
committed by Copybot
parent 7f8a423104
commit ae180fba46
14 changed files with 6674 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
#!/bin/bash
# Download GitHub pull requests as patch files, one file per PR.
#
# Usage: import_pr_patch.sh PR [PR...]
#
# For every PR number given, the output of `gh pr diff --patch` is written to
# "pr_<PR>.patch" in the current directory. The inline node filter blanks out
# each span that starts at "From " and ends with the rest of the next
# "N file(s) changed, ..." line, so that only the diffs themselves remain.

# Stop at the first failure. pipefail is required here: without it the exit
# status of the pipeline is the one of the node filter, so a failing
# `gh pr diff` would go unnoticed and leave a patch file without any diff.
set -euo pipefail

for PR in "$@"; do
  gh pr diff "$PR" --patch \
    | node -e 'const blob = require("fs").readFileSync("/dev/stdin", "utf-8"); console.log(blob.replace(/From [\s\S]+?\d+ files? changed,.+/g, ""))' \
    > "pr_$PR.patch"
done

View File

@@ -0,0 +1,25 @@
FROM sharelatex/sharelatex:5.5.2

# Stage 1 patches were prepared with the following steps:
# ../../bin/import_pr_patch.sh 27147 27173 27230 27240 27249 27257 27273 27397
# Remove CE tests
# Remove tests
# Remove cloudbuild changes
# Remove SaaS changes
# Fixup package.json and toolbar-items.tsx
# Fix cron paths
COPY *.patch .

# Apply every *.patch file in glob order, audit, rebuild and prune.
# - The leading `true` is a no-op that lets every real step start with `&&`.
# - The patch loop runs under `bash -e` and calls `patch` as a plain command.
#   When `patch` is the left-hand side of an `&&` list, a failure does not
#   trigger errexit: the loop would carry on with the next file and the build
#   would only fail if the very last patch was the one that did not apply.
# - `npm audit --audit-level=high` exits non-zero when it reports findings of
#   high severity or above, which stops the build.
RUN --mount=type=cache,target=/root/.cache \
    --mount=type=cache,target=/root/.npm \
    --mount=type=cache,target=/overleaf/services/web/node_modules/.cache,id=server-ce-webpack-cache \
    --mount=type=tmpfs,target=/tmp true \
    && bash -ec 'for p in *.patch; do echo "=== Applying $p ==="; patch -p1 < "$p"; rm "$p"; done' \
    && npm audit --audit-level=high \
    && node genScript compile | bash \
    && npm prune --omit=dev \
    && apt remove -y linux-libc-dev

# Stage 2 patch was prepared with the following steps:
# ../../bin/import_pr_patch.sh 27476
# Remove tests
# Remove SaaS changes
# The ".patch-stage-2" suffix does not match the `COPY *.patch` glob above, so
# this file is only copied and applied by the two instructions below.
COPY pr_27476.patch-stage-2 .
RUN patch -p1 < pr_27476.patch-stage-2 && rm pr_27476.patch-stage-2

View File

@@ -0,0 +1,54 @@
# Procedure for generating sec-npm.patch.
# The npm dependency changes are made inside a container built from the
# Dockerfile in the current directory. The exported `overleaf` directory is
# snapshotted before and after (without node_modules) and the two snapshots
# are compared with `diff -u -r`.
# NOTE(review): `docker run -t -i` below opens an interactive shell, so these
# lines are presumably meant to be run step by step (the later `docker exec`
# needs the container to still be running, e.g. from a second terminal) --
# confirm before running this file as a script.
# Get the base container running
docker build -t base .
CONTAINER_NAME=new
# Start the container
docker run -t -i --entrypoint /bin/bash --name $CONTAINER_NAME base
# Clean any existing directories
rm -rf /tmp/{a,b}
# Take snapshot of initial container
# (only the `overleaf` member of the export is extracted; --strip-components=1
# drops that leading path element so the files land directly in /tmp/a)
mkdir /tmp/a ; docker export $CONTAINER_NAME | tar --exclude node_modules -x -C /tmp/a --strip-components=1 overleaf
# In the container, run the following commands
# The heredoc delimiter is quoted, so nothing in it is expanded by the local
# shell. The commands install the `json` CLI, set multer/form-data overrides in
# package.json, uninstall a number of workspace packages, re-install pinned
# versions of passport-saml, multer, swagger-tools and request, and finish with
# `npm install` followed by `npm audit --audit-level=high`.
docker exec -i $CONTAINER_NAME /bin/bash <<'EOF'
npm install -g json
json -I -f package.json -c 'this.overrides["swagger-tools"].multer="2.0.2"'
json -I -f package.json -c 'this.overrides["request@2.88.2"]["form-data"]="2.5.5"'
json -I -f package.json -c 'this.overrides["superagent@7.1.6"] ??= {}'
json -I -f package.json -c 'this.overrides["superagent@7.1.6"]["form-data"]="4.0.4"'
json -I -f package.json -c 'this.overrides["superagent@3.8.3"] ??= {}'
json -I -f package.json -c 'this.overrides["superagent@3.8.3"]["form-data"]="2.5.5"'
npm uninstall -w libraries/metrics @google-cloud/opentelemetry-cloud-trace-exporter @google-cloud/profiler
npm uninstall -w libraries/logger @google-cloud/logging-bunyan
npm uninstall -w services/web @slack/webhook contentful @contentful/rich-text-types @contentful/rich-text-html-renderer
npm uninstall -w services/history-v1 @google-cloud/secret-manager
npm uninstall -w services/web "@node-saml/passport-saml"
npm install -w services/web "@node-saml/passport-saml@^5.1.0"
npm uninstall -w services/web multer
npm install -w services/web "multer@2.0.2"
npm uninstall -w services/history-v1 swagger-tools
npm install -w services/history-v1 swagger-tools@0.10.4
npm uninstall -w services/clsi request
npm install -w services/clsi request@2.88.2
npm install
npm audit --audit-level=high
EOF
# Take snapshot of final container
mkdir /tmp/b ; docker export $CONTAINER_NAME | tar --exclude node_modules -x -C /tmp/b --strip-components=1 overleaf
# Find the diff excluding node modules directories
# The "sec-" prefix ensures it applies after the pr_* patches: the Dockerfile
# applies *.patch files in glob order, and "pr_" sorts before "sec-".
(cd /tmp ; diff -u -x 'node_modules' -r a/ b/) > sec-npm.patch
# In the docker file we also need to remove linux-libc-dev
apt remove -y linux-libc-dev

View File

@@ -0,0 +1,27 @@
commit 43d0476e489cdf8e2e7261eb419810140d252a6d
Author: Andrew Rumble <andrew.rumble@overleaf.com>
Date: Fri Jul 25 12:18:26 2025 +0100
Add patch for multer 2.0.2
Co-authored-by: Ersun Warncke <ersun.warncke@overleaf.com>
diff --git a/patches/multer+2.0.2.patch b/patches/multer+2.0.2.patch
new file mode 100644
index 00000000000..f9959effe15
--- /dev/null
+++ b/patches/multer+2.0.2.patch
@@ -0,0 +1,13 @@
+diff --git a/node_modules/multer/lib/make-middleware.js b/node_modules/multer/lib/make-middleware.js
+index 260dcb4..895b4b2 100644
+--- a/node_modules/multer/lib/make-middleware.js
++++ b/node_modules/multer/lib/make-middleware.js
+@@ -113,7 +113,7 @@ function makeMiddleware (setup) {
+ if (fieldname == null) return abortWithCode('MISSING_FIELD_NAME')
+
+ // don't attach to the files object, if there is no file
+- if (!filename) return fileStream.resume()
++ if (!filename) filename = 'undefined'
+
+ // Work around bug in Busboy (https://github.com/mscdex/busboy/issues/6)
+ if (limits && Object.prototype.hasOwnProperty.call(limits, 'fieldNameSize')) {

View File

@@ -0,0 +1,351 @@
diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
index ba3e0d43598e..feb4612ddc23 100644
--- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs
+++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
@@ -33,7 +33,6 @@ import {
makeProjectKey,
} from '../lib/blob_store/index.js'
import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js'
-import filestorePersistor from '../lib/persistor.js'
import commandLineArgs from 'command-line-args'
import readline from 'node:readline'
@@ -179,6 +178,37 @@ const STREAM_HIGH_WATER_MARK = parseInt(
const LOGGING_INTERVAL = parseInt(process.env.LOGGING_INTERVAL || '60000', 10)
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
+// Filestore endpoint location, the port is always hardcoded
+const FILESTORE_HOST = process.env.FILESTORE_HOST || '127.0.0.1'
+const FILESTORE_PORT = process.env.FILESTORE_PORT || '3009'
+
+async function fetchFromFilestore(projectId, fileId) {
+ const url = `http://${FILESTORE_HOST}:${FILESTORE_PORT}/project/${projectId}/file/${fileId}`
+ const response = await fetch(url)
+ if (!response.ok) {
+ if (response.status === 404) {
+ throw new NotFoundError('file not found in filestore', {
+ status: response.status,
+ })
+ }
+ const body = await response.text()
+ throw new OError('fetchFromFilestore failed', {
+ projectId,
+ fileId,
+ status: response.status,
+ body,
+ })
+ }
+ if (!response.body) {
+ throw new OError('fetchFromFilestore response has no body', {
+ projectId,
+ fileId,
+ status: response.status,
+ })
+ }
+ return response.body
+}
+
const projectsCollection = db.collection('projects')
/** @type {ProjectsCollection} */
const typedProjectsCollection = db.collection('projects')
@@ -348,8 +378,7 @@ async function processFile(entry, filePath) {
} catch (err) {
if (gracefulShutdownInitiated) throw err
if (err instanceof NotFoundError) {
- const { bucketName } = OError.getFullInfo(err)
- if (bucketName === USER_FILES_BUCKET_NAME && !RETRY_FILESTORE_404) {
+ if (!RETRY_FILESTORE_404) {
throw err // disable retries for not found in filestore bucket case
}
}
@@ -416,10 +445,8 @@ async function processFileOnce(entry, filePath) {
}
STATS.readFromGCSCount++
- const src = await filestorePersistor.getObjectStream(
- USER_FILES_BUCKET_NAME,
- `${projectId}/${fileId}`
- )
+ // make a fetch request to filestore itself
+ const src = await fetchFromFilestore(projectId, fileId)
const dst = fs.createWriteStream(filePath, {
highWaterMark: STREAM_HIGH_WATER_MARK,
})
@@ -1327,14 +1354,21 @@ async function processDeletedProjects() {
}
async function main() {
+ console.log('Starting project file backup...')
await loadGlobalBlobs()
+ console.log('Loaded global blobs:', GLOBAL_BLOBS.size)
if (PROJECT_IDS_FROM) {
+ console.log(
+ `Processing projects from file: ${PROJECT_IDS_FROM}, this may take a while...`
+ )
await processProjectsFromFile()
} else {
if (PROCESS_NON_DELETED_PROJECTS) {
+ console.log('Processing non-deleted projects...')
await processNonDeletedProjects()
}
if (PROCESS_DELETED_PROJECTS) {
+ console.log('Processing deleted projects...')
await processDeletedProjects()
}
}
diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
index fd39369a7189..4e697b8bec2c 100644
--- a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
+++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
@@ -15,7 +15,6 @@ import { execFile } from 'node:child_process'
import chai, { expect } from 'chai'
import chaiExclude from 'chai-exclude'
import config from 'config'
-import ObjectPersistor from '@overleaf/object-persistor'
import { WritableBuffer } from '@overleaf/stream-utils'
import {
backupPersistor,
@@ -27,6 +26,9 @@ import {
makeProjectKey,
} from '../../../../storage/lib/blob_store/index.js'
+import express from 'express'
+import bodyParser from 'body-parser'
+
chai.use(chaiExclude)
const TIMEOUT = 20 * 1_000
@@ -36,15 +38,60 @@ const { tieringStorageClass } = config.get('backupPersistor')
const projectsCollection = db.collection('projects')
const deletedProjectsCollection = db.collection('deletedProjects')
-const FILESTORE_PERSISTOR = ObjectPersistor({
- backend: 'gcs',
- gcs: {
- endpoint: {
- apiEndpoint: process.env.GCS_API_ENDPOINT,
- projectId: process.env.GCS_PROJECT_ID,
- },
- },
-})
+class MockFilestore {
+ constructor() {
+ this.host = process.env.FILESTORE_HOST || '127.0.0.1'
+ this.port = process.env.FILESTORE_PORT || 3009
+ // create a server listening on this.host and this.port
+ this.files = {}
+
+ this.app = express()
+ this.app.use(bodyParser.json())
+ this.app.use(bodyParser.urlencoded({ extended: true }))
+
+ this.app.get('/project/:projectId/file/:fileId', (req, res) => {
+ const { projectId, fileId } = req.params
+ const content = this.files[projectId]?.[fileId]
+ if (!content) return res.status(404).end()
+ res.status(200).end(content)
+ })
+ }
+
+ start() {
+ // reset stored files
+ this.files = {}
+ // start the server
+ if (this.serverPromise) {
+ return this.serverPromise
+ } else {
+ this.serverPromise = new Promise((resolve, reject) => {
+ this.server = this.app.listen(this.port, this.host, err => {
+ if (err) return reject(err)
+ resolve()
+ })
+ })
+ return this.serverPromise
+ }
+ }
+
+ addFile(projectId, fileId, fileContent) {
+ if (!this.files[projectId]) {
+ this.files[projectId] = {}
+ }
+ this.files[projectId][fileId] = fileContent
+ }
+
+ deleteObject(projectId, fileId) {
+ if (this.files[projectId]) {
+ delete this.files[projectId][fileId]
+ if (Object.keys(this.files[projectId]).length === 0) {
+ delete this.files[projectId]
+ }
+ }
+ }
+}
+
+const mockFilestore = new MockFilestore()
/**
* @param {ObjectId} objectId
@@ -472,67 +519,36 @@ describe('back_fill_file_hash script', function () {
}
async function populateFilestore() {
- await FILESTORE_PERSISTOR.sendStream(
- USER_FILES_BUCKET_NAME,
- `${projectId0}/${fileId0}`,
- Stream.Readable.from([fileId0.toString()])
- )
- await FILESTORE_PERSISTOR.sendStream(
- USER_FILES_BUCKET_NAME,
- `${projectId0}/${fileId6}`,
- Stream.Readable.from([fileId6.toString()])
- )
- await FILESTORE_PERSISTOR.sendStream(
- USER_FILES_BUCKET_NAME,
- `${projectId0}/${fileId7}`,
- Stream.Readable.from([contentFile7])
- )
- await FILESTORE_PERSISTOR.sendStream(
- USER_FILES_BUCKET_NAME,
- `${projectId1}/${fileId1}`,
- Stream.Readable.from([fileId1.toString()])
- )
- await FILESTORE_PERSISTOR.sendStream(
- USER_FILES_BUCKET_NAME,
- `${projectId2}/${fileId2}`,
- Stream.Readable.from([fileId2.toString()])
- )
- await FILESTORE_PERSISTOR.sendStream(
- USER_FILES_BUCKET_NAME,
- `${projectId3}/${fileId3}`,
- Stream.Readable.from([fileId3.toString()])
- )
- await FILESTORE_PERSISTOR.sendStream(
- USER_FILES_BUCKET_NAME,
- `${projectId3}/${fileId10}`,
+ await mockFilestore.addFile(projectId0, fileId0, fileId0.toString())
+ await mockFilestore.addFile(projectId0, fileId6, fileId6.toString())
+ await mockFilestore.addFile(projectId0, fileId7, contentFile7)
+ await mockFilestore.addFile(projectId1, fileId1, fileId1.toString())
+ await mockFilestore.addFile(projectId2, fileId2, fileId2.toString())
+ await mockFilestore.addFile(projectId3, fileId3, fileId3.toString())
+ await mockFilestore.addFile(
+ projectId3,
+ fileId10,
// fileId10 is dupe of fileId3
- Stream.Readable.from([fileId3.toString()])
+ fileId3.toString()
)
- await FILESTORE_PERSISTOR.sendStream(
- USER_FILES_BUCKET_NAME,
- `${projectId3}/${fileId11}`,
+ await mockFilestore.addFile(
+ projectId3,
+ fileId11,
// fileId11 is dupe of fileId3
- Stream.Readable.from([fileId3.toString()])
- )
- await FILESTORE_PERSISTOR.sendStream(
- USER_FILES_BUCKET_NAME,
- `${projectIdDeleted0}/${fileId4}`,
- Stream.Readable.from([fileId4.toString()])
+ fileId3.toString()
)
- await FILESTORE_PERSISTOR.sendStream(
- USER_FILES_BUCKET_NAME,
- `${projectIdDeleted1}/${fileId5}`,
- Stream.Readable.from([fileId5.toString()])
- )
- await FILESTORE_PERSISTOR.sendStream(
- USER_FILES_BUCKET_NAME,
- `${projectIdBadFileTree3}/${fileId9}`,
- Stream.Readable.from([fileId9.toString()])
+ await mockFilestore.addFile(projectIdDeleted0, fileId4, fileId4.toString())
+ await mockFilestore.addFile(projectIdDeleted1, fileId5, fileId5.toString())
+ await mockFilestore.addFile(
+ projectIdBadFileTree3,
+ fileId9,
+ fileId9.toString()
)
}
async function prepareEnvironment() {
await cleanup.everything()
+ await mockFilestore.start()
await populateMongo()
await populateHistoryV1()
await populateFilestore()
@@ -1117,10 +1133,7 @@ describe('back_fill_file_hash script', function () {
beforeEach('prepare environment', prepareEnvironment)
it('should gracefully handle fatal errors', async function () {
- await FILESTORE_PERSISTOR.deleteObject(
- USER_FILES_BUCKET_NAME,
- `${projectId0}/${fileId0}`
- )
+ mockFilestore.deleteObject(projectId0, fileId0)
const t0 = Date.now()
const { stats, result } = await tryRunScript([], {
RETRIES: '10',
@@ -1148,17 +1161,10 @@ describe('back_fill_file_hash script', function () {
})
it('should retry on error', async function () {
- await FILESTORE_PERSISTOR.deleteObject(
- USER_FILES_BUCKET_NAME,
- `${projectId0}/${fileId0}`
- )
+ mockFilestore.deleteObject(projectId0, fileId0)
const restoreFileAfter5s = async () => {
await setTimeout(5_000)
- await FILESTORE_PERSISTOR.sendStream(
- USER_FILES_BUCKET_NAME,
- `${projectId0}/${fileId0}`,
- Stream.Readable.from([fileId0.toString()])
- )
+ mockFilestore.addFile(projectId0, fileId0, fileId0.toString())
}
// use Promise.allSettled to ensure the above sendStream call finishes before this test completes
const [
diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
index feb4612ddc23..5a590e347a94 100644
--- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs
+++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
@@ -178,7 +178,7 @@ const STREAM_HIGH_WATER_MARK = parseInt(
const LOGGING_INTERVAL = parseInt(process.env.LOGGING_INTERVAL || '60000', 10)
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
-// Filestore endpoint location, the port is always hardcoded
+// Filestore endpoint location
const FILESTORE_HOST = process.env.FILESTORE_HOST || '127.0.0.1'
const FILESTORE_PORT = process.env.FILESTORE_PORT || '3009'
diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
index 4e697b8bec2c..8f861d393451 100644
--- a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
+++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
@@ -27,7 +27,6 @@ import {
} from '../../../../storage/lib/blob_store/index.js'
import express from 'express'
-import bodyParser from 'body-parser'
chai.use(chaiExclude)
const TIMEOUT = 20 * 1_000
@@ -46,8 +45,6 @@ class MockFilestore {
this.files = {}
this.app = express()
- this.app.use(bodyParser.json())
- this.app.use(bodyParser.urlencoded({ extended: true }))
this.app.get('/project/:projectId/file/:fileId', (req, res) => {
const { projectId, fileId } = req.params

View File

@@ -0,0 +1,961 @@
diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
index 5a590e347a9..3be1c8a5407 100644
--- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs
+++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
@@ -1,28 +1,20 @@
// @ts-check
-import Crypto from 'node:crypto'
import Events from 'node:events'
import fs from 'node:fs'
import Path from 'node:path'
import { performance } from 'node:perf_hooks'
import Stream from 'node:stream'
-import zLib from 'node:zlib'
import { setTimeout } from 'node:timers/promises'
-import { Binary, ObjectId } from 'mongodb'
+import { ObjectId } from 'mongodb'
import pLimit from 'p-limit'
import logger from '@overleaf/logger'
import {
batchedUpdate,
objectIdFromInput,
renderObjectId,
- READ_PREFERENCE_SECONDARY,
} from '@overleaf/mongo-utils/batchedUpdate.js'
import OError from '@overleaf/o-error'
-import {
- AlreadyWrittenError,
- NoKEKMatchedError,
- NotFoundError,
-} from '@overleaf/object-persistor/src/Errors.js'
-import { backupPersistor, projectBlobsBucket } from '../lib/backupPersistor.mjs'
+import { NotFoundError } from '@overleaf/object-persistor/src/Errors.js'
import {
BlobStore,
GLOBAL_BLOBS,
@@ -30,9 +22,8 @@ import {
getProjectBlobsBatch,
getStringLengthOfFile,
makeBlobForFile,
- makeProjectKey,
} from '../lib/blob_store/index.js'
-import { backedUpBlobs as backedUpBlobsCollection, db } from '../lib/mongodb.js'
+import { db } from '../lib/mongodb.js'
import commandLineArgs from 'command-line-args'
import readline from 'node:readline'
@@ -88,7 +79,7 @@ ObjectId.cacheHexString = true
*/
/**
- * @return {{PROJECT_IDS_FROM: string, PROCESS_HASHED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, PROCESS_BLOBS: boolean, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, COLLECT_BACKED_UP_BLOBS: boolean}}
+ * @return {{PROJECT_IDS_FROM: string, PROCESS_HASHED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, PROCESS_BLOBS: boolean, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean}}
*/
function parseArgs() {
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
@@ -98,7 +89,6 @@ function parseArgs() {
{ name: 'processHashedFiles', type: String, defaultValue: 'false' },
{ name: 'processBlobs', type: String, defaultValue: 'true' },
{ name: 'projectIdsFrom', type: String, defaultValue: '' },
- { name: 'collectBackedUpBlobs', type: String, defaultValue: 'true' },
{
name: 'BATCH_RANGE_START',
type: String,
@@ -130,7 +120,6 @@ function parseArgs() {
PROCESS_DELETED_PROJECTS: boolVal('processDeletedProjects'),
PROCESS_BLOBS: boolVal('processBlobs'),
PROCESS_HASHED_FILES: boolVal('processHashedFiles'),
- COLLECT_BACKED_UP_BLOBS: boolVal('collectBackedUpBlobs'),
BATCH_RANGE_START,
BATCH_RANGE_END,
LOGGING_IDENTIFIER: args['LOGGING_IDENTIFIER'] || BATCH_RANGE_START,
@@ -143,7 +132,6 @@ const {
PROCESS_DELETED_PROJECTS,
PROCESS_BLOBS,
PROCESS_HASHED_FILES,
- COLLECT_BACKED_UP_BLOBS,
BATCH_RANGE_START,
BATCH_RANGE_END,
LOGGING_IDENTIFIER,
@@ -232,7 +220,6 @@ async function processConcurrently(array, fn) {
const STATS = {
projects: 0,
blobs: 0,
- backedUpBlobs: 0,
filesWithHash: 0,
filesWithoutHash: 0,
filesDuplicated: 0,
@@ -246,14 +233,8 @@ const STATS = {
projectHardDeleted: 0,
fileHardDeleted: 0,
mongoUpdates: 0,
- deduplicatedWriteToAWSLocalCount: 0,
- deduplicatedWriteToAWSLocalEgress: 0,
- deduplicatedWriteToAWSRemoteCount: 0,
- deduplicatedWriteToAWSRemoteEgress: 0,
readFromGCSCount: 0,
readFromGCSIngress: 0,
- writeToAWSCount: 0,
- writeToAWSEgress: 0,
writeToGCSCount: 0,
writeToGCSEgress: 0,
}
@@ -275,7 +256,7 @@ function toMiBPerSecond(v, ms) {
/**
* @param {any} stats
* @param {number} ms
- * @return {{writeToAWSThroughputMiBPerSecond: number, readFromGCSThroughputMiBPerSecond: number}}
+ * @return {{readFromGCSThroughputMiBPerSecond: number}}
*/
function bandwidthStats(stats, ms) {
return {
@@ -283,10 +264,6 @@ function bandwidthStats(stats, ms) {
stats.readFromGCSIngress,
ms
),
- writeToAWSThroughputMiBPerSecond: toMiBPerSecond(
- stats.writeToAWSEgress,
- ms
- ),
}
}
@@ -382,9 +359,6 @@ async function processFile(entry, filePath) {
throw err // disable retries for not found in filestore bucket case
}
}
- if (err instanceof NoKEKMatchedError) {
- throw err // disable retries when upload to S3 will fail again
- }
STATS.filesRetries++
const {
ctx: { projectId },
@@ -417,32 +391,8 @@ async function processFileOnce(entry, filePath) {
if (entry.blob) {
const { blob } = entry
const hash = blob.getHash()
- if (entry.ctx.hasBackedUpBlob(hash)) {
- STATS.deduplicatedWriteToAWSLocalCount++
- STATS.deduplicatedWriteToAWSLocalEgress += estimateBlobSize(blob)
- return hash
- }
- entry.ctx.recordPendingBlob(hash)
- STATS.readFromGCSCount++
- const src = await blobStore.getStream(hash)
- const dst = fs.createWriteStream(filePath, {
- highWaterMark: STREAM_HIGH_WATER_MARK,
- })
- try {
- await Stream.promises.pipeline(src, dst)
- } finally {
- STATS.readFromGCSIngress += dst.bytesWritten
- }
- await uploadBlobToAWS(entry, blob, filePath)
return hash
}
- if (entry.hash && entry.ctx.hasBackedUpBlob(entry.hash)) {
- STATS.deduplicatedWriteToAWSLocalCount++
- const blob = entry.ctx.getCachedHistoryBlob(entry.hash)
- // blob might not exist on re-run with --PROCESS_BLOBS=false
- if (blob) STATS.deduplicatedWriteToAWSLocalEgress += estimateBlobSize(blob)
- return entry.hash
- }
STATS.readFromGCSCount++
// make a fetch request to filestore itself
@@ -469,16 +419,14 @@ async function processFileOnce(entry, filePath) {
STATS.globalBlobsEgress += estimateBlobSize(blob)
return hash
}
- if (entry.ctx.hasBackedUpBlob(hash)) {
- STATS.deduplicatedWriteToAWSLocalCount++
- STATS.deduplicatedWriteToAWSLocalEgress += estimateBlobSize(blob)
+ if (entry.ctx.hasCompletedBlob(hash)) {
return hash
}
entry.ctx.recordPendingBlob(hash)
try {
await uploadBlobToGCS(blobStore, entry, blob, hash, filePath)
- await uploadBlobToAWS(entry, blob, filePath)
+ entry.ctx.recordCompletedBlob(hash) // mark upload as completed
} catch (err) {
entry.ctx.recordFailedBlob(hash)
throw err
@@ -515,76 +463,6 @@ async function uploadBlobToGCS(blobStore, entry, blob, hash, filePath) {
const GZ_SUFFIX = '.gz'
-/**
- * @param {QueueEntry} entry
- * @param {Blob} blob
- * @param {string} filePath
- * @return {Promise<void>}
- */
-async function uploadBlobToAWS(entry, blob, filePath) {
- const { historyId } = entry.ctx
- let backupSource
- let contentEncoding
- const md5 = Crypto.createHash('md5')
- let size
- if (blob.getStringLength()) {
- const filePathCompressed = filePath + GZ_SUFFIX
- backupSource = filePathCompressed
- contentEncoding = 'gzip'
- size = 0
- await Stream.promises.pipeline(
- fs.createReadStream(filePath, { highWaterMark: STREAM_HIGH_WATER_MARK }),
- zLib.createGzip(),
- async function* (source) {
- for await (const chunk of source) {
- size += chunk.byteLength
- md5.update(chunk)
- yield chunk
- }
- },
- fs.createWriteStream(filePathCompressed, {
- highWaterMark: STREAM_HIGH_WATER_MARK,
- })
- )
- } else {
- backupSource = filePath
- size = blob.getByteLength()
- await Stream.promises.pipeline(
- fs.createReadStream(filePath, { highWaterMark: STREAM_HIGH_WATER_MARK }),
- md5
- )
- }
- const backendKeyPath = makeProjectKey(historyId, blob.getHash())
- const persistor = await entry.ctx.getCachedPersistor(backendKeyPath)
- try {
- STATS.writeToAWSCount++
- await persistor.sendStream(
- projectBlobsBucket,
- backendKeyPath,
- fs.createReadStream(backupSource, {
- highWaterMark: STREAM_HIGH_WATER_MARK,
- }),
- {
- contentEncoding,
- contentType: 'application/octet-stream',
- contentLength: size,
- sourceMd5: md5.digest('hex'),
- ifNoneMatch: '*', // de-duplicate write (we pay for the request, but avoid egress)
- }
- )
- STATS.writeToAWSEgress += size
- } catch (err) {
- if (err instanceof AlreadyWrittenError) {
- STATS.deduplicatedWriteToAWSRemoteCount++
- STATS.deduplicatedWriteToAWSRemoteEgress += size
- } else {
- STATS.writeToAWSEgress += size
- throw err
- }
- }
- entry.ctx.recordBackedUpBlob(blob.getHash())
-}
-
/**
* @param {Array<QueueEntry>} files
* @return {Promise<void>}
@@ -670,23 +548,18 @@ async function queueNextBatch(batch, prefix = 'rootFolder.0') {
* @return {Promise<void>}
*/
async function processBatch(batch, prefix = 'rootFolder.0') {
- const [{ nBlobs, blobs }, { nBackedUpBlobs, backedUpBlobs }] =
- await Promise.all([collectProjectBlobs(batch), collectBackedUpBlobs(batch)])
- const files = Array.from(findFileInBatch(batch, prefix, blobs, backedUpBlobs))
+ const { nBlobs, blobs } = await collectProjectBlobs(batch)
+ const files = Array.from(findFileInBatch(batch, prefix, blobs))
STATS.projects += batch.length
STATS.blobs += nBlobs
- STATS.backedUpBlobs += nBackedUpBlobs
// GC
batch.length = 0
blobs.clear()
- backedUpBlobs.clear()
// The files are currently ordered by project-id.
// Order them by file-id ASC then blobs ASC to
// - process files before blobs
- // - avoid head-of-line blocking from many project-files waiting on the generation of the projects DEK (round trip to AWS)
- // - bonus: increase chance of de-duplicating write to AWS
files.sort(
/**
* @param {QueueEntry} a
@@ -903,23 +776,15 @@ function* findFiles(ctx, folder, path, isInputLoop = false) {
* @param {Array<Project>} projects
* @param {string} prefix
* @param {Map<string,Array<Blob>>} blobs
- * @param {Map<string,Array<string>>} backedUpBlobs
* @return Generator<QueueEntry>
*/
-function* findFileInBatch(projects, prefix, blobs, backedUpBlobs) {
+function* findFileInBatch(projects, prefix, blobs) {
for (const project of projects) {
const projectIdS = project._id.toString()
const historyIdS = project.overleaf.history.id.toString()
const projectBlobs = blobs.get(historyIdS) || []
- const projectBackedUpBlobs = new Set(backedUpBlobs.get(projectIdS) || [])
- const ctx = new ProjectContext(
- project._id,
- historyIdS,
- projectBlobs,
- projectBackedUpBlobs
- )
+ const ctx = new ProjectContext(project._id, historyIdS, projectBlobs)
for (const blob of projectBlobs) {
- if (projectBackedUpBlobs.has(blob.getHash())) continue
ctx.remainingQueueEntries++
yield {
ctx,
@@ -951,42 +816,11 @@ async function collectProjectBlobs(batch) {
return await getProjectBlobsBatch(batch.map(p => p.overleaf.history.id))
}
-/**
- * @param {Array<Project>} projects
- * @return {Promise<{nBackedUpBlobs:number,backedUpBlobs:Map<string,Array<string>>}>}
- */
-async function collectBackedUpBlobs(projects) {
- let nBackedUpBlobs = 0
- const backedUpBlobs = new Map()
- if (!COLLECT_BACKED_UP_BLOBS) return { nBackedUpBlobs, backedUpBlobs }
-
- const cursor = backedUpBlobsCollection.find(
- { _id: { $in: projects.map(p => p._id) } },
- {
- readPreference: READ_PREFERENCE_SECONDARY,
- sort: { _id: 1 },
- }
- )
- for await (const record of cursor) {
- const blobs = record.blobs.map(b => b.toString('hex'))
- backedUpBlobs.set(record._id.toString(), blobs)
- nBackedUpBlobs += blobs.length
- }
- return { nBackedUpBlobs, backedUpBlobs }
-}
-
-const BATCH_HASH_WRITES = 1_000
const BATCH_FILE_UPDATES = 100
const MONGO_PATH_SKIP_WRITE_HASH_TO_FILE_TREE = 'skip-write-to-file-tree'
class ProjectContext {
- /** @type {Promise<CachedPerProjectEncryptedS3Persistor> | null} */
- #cachedPersistorPromise = null
-
- /** @type {Set<string>} */
- #backedUpBlobs
-
/** @type {Map<string, Blob>} */
#historyBlobs
@@ -1000,12 +834,10 @@ class ProjectContext {
* @param {ObjectId} projectId
* @param {string} historyId
* @param {Array<Blob>} blobs
- * @param {Set<string>} backedUpBlobs
*/
- constructor(projectId, historyId, blobs, backedUpBlobs) {
+ constructor(projectId, historyId, blobs) {
this.projectId = projectId
this.historyId = historyId
- this.#backedUpBlobs = backedUpBlobs
this.#historyBlobs = new Map(blobs.map(b => [b.getHash(), b]))
}
@@ -1034,75 +866,17 @@ class ProjectContext {
return false
}
- /**
- * @param {string} key
- * @return {Promise<CachedPerProjectEncryptedS3Persistor>}
- */
- getCachedPersistor(key) {
- if (!this.#cachedPersistorPromise) {
- // Fetch DEK once, but only if needed -- upon the first use
- this.#cachedPersistorPromise = this.#getCachedPersistorWithRetries(key)
- }
- return this.#cachedPersistorPromise
- }
-
- /**
- * @param {string} key
- * @return {Promise<CachedPerProjectEncryptedS3Persistor>}
- */
- async #getCachedPersistorWithRetries(key) {
- // Optimization: Skip GET on DEK in case no blobs are marked as backed up yet.
- let tryGenerateDEKFirst = this.#backedUpBlobs.size === 0
- for (let attempt = 0; attempt < RETRIES; attempt++) {
- try {
- if (tryGenerateDEKFirst) {
- try {
- return await backupPersistor.generateDataEncryptionKey(
- projectBlobsBucket,
- key
- )
- } catch (err) {
- if (err instanceof AlreadyWrittenError) {
- tryGenerateDEKFirst = false
- // fall back to GET below
- } else {
- throw err
- }
- }
- }
- return await backupPersistor.forProject(projectBlobsBucket, key)
- } catch (err) {
- if (gracefulShutdownInitiated) throw err
- if (err instanceof NoKEKMatchedError) {
- throw err
- } else {
- logger.warn(
- { err, projectId: this.projectId, attempt },
- 'failed to get DEK, trying again'
- )
- const jitter = Math.random() * RETRY_DELAY_MS
- await setTimeout(RETRY_DELAY_MS + jitter)
- }
- }
- }
- return await backupPersistor.forProject(projectBlobsBucket, key)
- }
-
async flushMongoQueuesIfNeeded() {
if (this.remainingQueueEntries === 0) {
await this.flushMongoQueues()
}
- if (this.#completedBlobs.size > BATCH_HASH_WRITES) {
- await this.#storeBackedUpBlobs()
- }
if (this.#pendingFileWrites.length > BATCH_FILE_UPDATES) {
await this.#storeFileHashes()
}
}
async flushMongoQueues() {
- await this.#storeBackedUpBlobs()
await this.#storeFileHashes()
}
@@ -1111,20 +885,6 @@ class ProjectContext {
/** @type {Set<string>} */
#completedBlobs = new Set()
- async #storeBackedUpBlobs() {
- if (this.#completedBlobs.size === 0) return
- const blobs = Array.from(this.#completedBlobs).map(
- hash => new Binary(Buffer.from(hash, 'hex'))
- )
- this.#completedBlobs.clear()
- STATS.mongoUpdates++
- await backedUpBlobsCollection.updateOne(
- { _id: this.projectId },
- { $addToSet: { blobs: { $each: blobs } } },
- { upsert: true }
- )
- }
-
/**
* @param {string} hash
*/
@@ -1142,8 +902,7 @@ class ProjectContext {
/**
* @param {string} hash
*/
- recordBackedUpBlob(hash) {
- this.#backedUpBlobs.add(hash)
+ recordCompletedBlob(hash) {
this.#completedBlobs.add(hash)
this.#pendingBlobs.delete(hash)
}
@@ -1152,12 +911,8 @@ class ProjectContext {
* @param {string} hash
* @return {boolean}
*/
- hasBackedUpBlob(hash) {
- return (
- this.#pendingBlobs.has(hash) ||
- this.#completedBlobs.has(hash) ||
- this.#backedUpBlobs.has(hash)
- )
+ hasCompletedBlob(hash) {
+ return this.#pendingBlobs.has(hash) || this.#completedBlobs.has(hash)
}
/** @type {Array<QueueEntry>} */
diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
index 8f861d39345..62b0b1de25f 100644
--- a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
+++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
@@ -4,23 +4,17 @@ import Stream from 'node:stream'
import { setTimeout } from 'node:timers/promises'
import { promisify } from 'node:util'
import { ObjectId, Binary } from 'mongodb'
-import {
- db,
- backedUpBlobs,
- globalBlobs,
-} from '../../../../storage/lib/mongodb.js'
+import { db, globalBlobs } from '../../../../storage/lib/mongodb.js'
import cleanup from './support/cleanup.js'
import testProjects from '../api/support/test_projects.js'
import { execFile } from 'node:child_process'
import chai, { expect } from 'chai'
import chaiExclude from 'chai-exclude'
-import config from 'config'
import { WritableBuffer } from '@overleaf/stream-utils'
import {
backupPersistor,
projectBlobsBucket,
} from '../../../../storage/lib/backupPersistor.mjs'
-import projectKey from '../../../../storage/lib/project_key.js'
import {
BlobStore,
makeProjectKey,
@@ -31,9 +25,6 @@ import express from 'express'
chai.use(chaiExclude)
const TIMEOUT = 20 * 1_000
-const { deksBucket } = config.get('backupStore')
-const { tieringStorageClass } = config.get('backupPersistor')
-
const projectsCollection = db.collection('projects')
const deletedProjectsCollection = db.collection('deletedProjects')
@@ -117,17 +108,6 @@ function binaryForGitBlobHash(gitBlobHash) {
return new Binary(Buffer.from(gitBlobHash, 'hex'))
}
-async function listS3Bucket(bucket, wantStorageClass) {
- const client = backupPersistor._getClientForBucket(bucket)
- const response = await client.listObjectsV2({ Bucket: bucket }).promise()
-
- for (const object of response.Contents || []) {
- expect(object).to.have.property('StorageClass', wantStorageClass)
- }
-
- return (response.Contents || []).map(item => item.Key || '')
-}
-
function objectIdFromTime(timestamp) {
return ObjectId.createFromTime(new Date(timestamp).getTime() / 1000)
}
@@ -591,11 +571,7 @@ describe('back_fill_file_hash script', function () {
expect((await fs.promises.readdir('/tmp')).join(';')).to.not.match(
/back_fill_file_hash/
)
- const extraStatsKeys = [
- 'eventLoop',
- 'readFromGCSThroughputMiBPerSecond',
- 'writeToAWSThroughputMiBPerSecond',
- ]
+ const extraStatsKeys = ['eventLoop', 'readFromGCSThroughputMiBPerSecond']
const stats = JSON.parse(
result.stderr
.split('\n')
@@ -610,7 +586,6 @@ describe('back_fill_file_hash script', function () {
delete stats.time
if (shouldHaveWritten) {
expect(stats.readFromGCSThroughputMiBPerSecond).to.be.greaterThan(0)
- expect(stats.writeToAWSThroughputMiBPerSecond).to.be.greaterThan(0)
}
for (const key of extraStatsKeys) {
delete stats[key]
@@ -856,109 +831,6 @@ describe('back_fill_file_hash script', function () {
},
},
])
- expect(
- (await backedUpBlobs.find({}, { sort: { _id: 1 } }).toArray()).map(
- entry => {
- // blobs are pushed unordered into mongo. Sort the list for consistency.
- entry.blobs.sort()
- return entry
- }
- )
- ).to.deep.equal([
- {
- _id: projectId0,
- blobs: [
- binaryForGitBlobHash(gitBlobHash(fileId0)),
- binaryForGitBlobHash(hashFile7),
- binaryForGitBlobHash(hashTextBlob0),
- ].sort(),
- },
- {
- _id: projectId1,
- blobs: [
- binaryForGitBlobHash(gitBlobHash(fileId1)),
- binaryForGitBlobHash(hashTextBlob1),
- ].sort(),
- },
- {
- _id: projectId2,
- blobs: [binaryForGitBlobHash(hashTextBlob2)]
- .concat(
- processHashedFiles
- ? [binaryForGitBlobHash(gitBlobHash(fileId2))]
- : []
- )
- .sort(),
- },
- {
- _id: projectIdDeleted0,
- blobs: [binaryForGitBlobHash(gitBlobHash(fileId4))].sort(),
- },
- {
- _id: projectId3,
- blobs: [binaryForGitBlobHash(gitBlobHash(fileId3))].sort(),
- },
- ...(processHashedFiles
- ? [
- {
- _id: projectIdDeleted1,
- blobs: [binaryForGitBlobHash(gitBlobHash(fileId5))].sort(),
- },
- ]
- : []),
- {
- _id: projectIdBadFileTree0,
- blobs: [binaryForGitBlobHash(hashTextBlob3)].sort(),
- },
- {
- _id: projectIdBadFileTree3,
- blobs: [binaryForGitBlobHash(gitBlobHash(fileId9))].sort(),
- },
- ])
- })
- it('should have backed up all the files', async function () {
- expect(tieringStorageClass).to.exist
- const blobs = await listS3Bucket(projectBlobsBucket, tieringStorageClass)
- expect(blobs.sort()).to.deep.equal(
- Array.from(
- new Set(
- writtenBlobs
- .map(({ historyId, fileId, hash }) =>
- makeProjectKey(historyId, hash || gitBlobHash(fileId))
- )
- .sort()
- )
- )
- )
- for (let { historyId, fileId, hash, content } of writtenBlobs) {
- hash = hash || gitBlobHash(fileId.toString())
- const s = await backupPersistor.getObjectStream(
- projectBlobsBucket,
- makeProjectKey(historyId, hash),
- { autoGunzip: true }
- )
- const buf = new WritableBuffer()
- await Stream.promises.pipeline(s, buf)
- expect(gitBlobHashBuffer(buf.getContents())).to.equal(hash)
- if (content) {
- expect(buf.getContents()).to.deep.equal(content)
- } else {
- const id = buf.getContents().toString('utf-8')
- expect(id).to.equal(fileId.toString())
- // double check we are not comparing 'undefined' or '[object Object]' above
- expect(id).to.match(/^[a-f0-9]{24}$/)
- }
- }
- const deks = await listS3Bucket(deksBucket, 'STANDARD')
- expect(deks.sort()).to.deep.equal(
- Array.from(
- new Set(
- writtenBlobs.map(
- ({ historyId }) => projectKey.format(historyId) + '/dek'
- )
- )
- ).sort()
- )
})
it('should have written the back filled files to history v1', async function () {
for (const { historyId, hash, fileId, content } of writtenBlobs) {
@@ -991,14 +863,13 @@ describe('back_fill_file_hash script', function () {
// We still need to iterate over all the projects and blobs.
projects: 10,
blobs: 10,
- backedUpBlobs: 10,
+
badFileTrees: 4,
}
if (processHashedFiles) {
stats = sumStats(stats, {
...STATS_ALL_ZERO,
blobs: 2,
- backedUpBlobs: 2,
})
}
expect(rerun.stats).deep.equal(stats)
@@ -1024,7 +895,6 @@ describe('back_fill_file_hash script', function () {
const STATS_ALL_ZERO = {
projects: 0,
blobs: 0,
- backedUpBlobs: 0,
filesWithHash: 0,
filesWithoutHash: 0,
filesDuplicated: 0,
@@ -1038,21 +908,14 @@ describe('back_fill_file_hash script', function () {
fileHardDeleted: 0,
badFileTrees: 0,
mongoUpdates: 0,
- deduplicatedWriteToAWSLocalCount: 0,
- deduplicatedWriteToAWSLocalEgress: 0,
- deduplicatedWriteToAWSRemoteCount: 0,
- deduplicatedWriteToAWSRemoteEgress: 0,
readFromGCSCount: 0,
readFromGCSIngress: 0,
- writeToAWSCount: 0,
- writeToAWSEgress: 0,
writeToGCSCount: 0,
writeToGCSEgress: 0,
}
const STATS_UP_TO_PROJECT1 = {
projects: 2,
blobs: 2,
- backedUpBlobs: 0,
filesWithHash: 0,
filesWithoutHash: 5,
filesDuplicated: 1,
@@ -1065,22 +928,15 @@ describe('back_fill_file_hash script', function () {
projectHardDeleted: 0,
fileHardDeleted: 0,
badFileTrees: 0,
- mongoUpdates: 4,
- deduplicatedWriteToAWSLocalCount: 0,
- deduplicatedWriteToAWSLocalEgress: 0,
- deduplicatedWriteToAWSRemoteCount: 0,
- deduplicatedWriteToAWSRemoteEgress: 0,
- readFromGCSCount: 6,
- readFromGCSIngress: 4000086,
- writeToAWSCount: 5,
- writeToAWSEgress: 4026,
+ mongoUpdates: 2, // 4-2 blobs written to backedUpBlobs collection
+ readFromGCSCount: 4,
+ readFromGCSIngress: 4000072,
writeToGCSCount: 3,
writeToGCSEgress: 4000048,
}
const STATS_UP_FROM_PROJECT1_ONWARD = {
projects: 8,
blobs: 2,
- backedUpBlobs: 0,
filesWithHash: 0,
filesWithoutHash: 4,
filesDuplicated: 0,
@@ -1093,26 +949,18 @@ describe('back_fill_file_hash script', function () {
projectHardDeleted: 0,
fileHardDeleted: 0,
badFileTrees: 4,
- mongoUpdates: 8,
- deduplicatedWriteToAWSLocalCount: 1,
- deduplicatedWriteToAWSLocalEgress: 30,
- deduplicatedWriteToAWSRemoteCount: 0,
- deduplicatedWriteToAWSRemoteEgress: 0,
- readFromGCSCount: 6,
- readFromGCSIngress: 110,
- writeToAWSCount: 5,
- writeToAWSEgress: 143,
+ mongoUpdates: 3, // previously 5 blobs written to backedUpBlobs collection
+ readFromGCSCount: 4,
+ readFromGCSIngress: 96,
writeToGCSCount: 3,
writeToGCSEgress: 72,
}
const STATS_FILES_HASHED_EXTRA = {
...STATS_ALL_ZERO,
filesWithHash: 2,
- mongoUpdates: 2,
+ mongoUpdates: 0, // previously 2 blobs written to backedUpBlobs collection
readFromGCSCount: 2,
readFromGCSIngress: 48,
- writeToAWSCount: 2,
- writeToAWSEgress: 60,
writeToGCSCount: 2,
writeToGCSEgress: 48,
}
@@ -1144,8 +992,6 @@ describe('back_fill_file_hash script', function () {
...STATS_ALL_ZERO,
filesFailed: 1,
readFromGCSIngress: -24,
- writeToAWSCount: -1,
- writeToAWSEgress: -28,
writeToGCSCount: -1,
writeToGCSEgress: -24,
})
@@ -1269,13 +1115,14 @@ describe('back_fill_file_hash script', function () {
before('run script with hashed files', async function () {
output2 = await runScript(['--processHashedFiles=true'], {})
})
- it('should print stats', function () {
+ it('should print stats for the first run without hashed files', function () {
expect(output1.stats).deep.equal(STATS_ALL)
+ })
+ it('should print stats for the hashed files run', function () {
expect(output2.stats).deep.equal({
...STATS_FILES_HASHED_EXTRA,
projects: 10,
blobs: 10,
- backedUpBlobs: 10,
badFileTrees: 4,
})
})
@@ -1322,9 +1169,7 @@ describe('back_fill_file_hash script', function () {
...STATS_FILES_HASHED_EXTRA,
readFromGCSCount: 3,
readFromGCSIngress: 72,
- deduplicatedWriteToAWSLocalCount: 1,
- deduplicatedWriteToAWSLocalEgress: 30,
- mongoUpdates: 1,
+ mongoUpdates: 0,
filesWithHash: 3,
})
)
@@ -1354,48 +1199,6 @@ describe('back_fill_file_hash script', function () {
expect(output.stats).deep.equal(
sumStats(STATS_ALL, {
...STATS_ALL_ZERO,
- // one remote deduplicate
- deduplicatedWriteToAWSRemoteCount: 1,
- deduplicatedWriteToAWSRemoteEgress: 28,
- writeToAWSEgress: -28, // subtract skipped egress
- })
- )
- })
- commonAssertions()
- })
-
- describe('with something in the bucket and marked as processed', function () {
- before('prepare environment', prepareEnvironment)
- before('create a file in s3', async function () {
- await backupPersistor.sendStream(
- projectBlobsBucket,
- makeProjectKey(historyId0, hashTextBlob0),
- Stream.Readable.from([contentTextBlob0]),
- { contentLength: contentTextBlob0.byteLength }
- )
- await backedUpBlobs.insertMany([
- {
- _id: projectId0,
- blobs: [binaryForGitBlobHash(hashTextBlob0)],
- },
- ])
- })
- let output
- before('run script', async function () {
- output = await runScript([], {
- CONCURRENCY: '1',
- })
- })
-
- it('should print stats', function () {
- expect(output.stats).deep.equal(
- sumStats(STATS_ALL, {
- ...STATS_ALL_ZERO,
- backedUpBlobs: 1,
- writeToAWSCount: -1,
- writeToAWSEgress: -27,
- readFromGCSCount: -1,
- readFromGCSIngress: -7,
})
)
})
@@ -1418,8 +1221,10 @@ describe('back_fill_file_hash script', function () {
})
})
- it('should print stats', function () {
+ it('should print stats for part 0', function () {
expect(outputPart0.stats).to.deep.equal(STATS_UP_TO_PROJECT1)
+ })
+ it('should print stats for part 1', function () {
expect(outputPart1.stats).to.deep.equal(STATS_UP_FROM_PROJECT1_ONWARD)
})
commonAssertions()
diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
index 3be1c8a5407..c9ed13c6cb4 100644
--- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs
+++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
@@ -388,12 +388,6 @@ async function processFileOnce(entry, filePath) {
fileId,
} = entry
const blobStore = new BlobStore(historyId)
- if (entry.blob) {
- const { blob } = entry
- const hash = blob.getHash()
- return hash
- }
-
STATS.readFromGCSCount++
// make a fetch request to filestore itself
const src = await fetchFromFilestore(projectId, fileId)
@@ -784,16 +778,6 @@ function* findFileInBatch(projects, prefix, blobs) {
const historyIdS = project.overleaf.history.id.toString()
const projectBlobs = blobs.get(historyIdS) || []
const ctx = new ProjectContext(project._id, historyIdS, projectBlobs)
- for (const blob of projectBlobs) {
- ctx.remainingQueueEntries++
- yield {
- ctx,
- cacheKey: blob.getHash(),
- path: MONGO_PATH_SKIP_WRITE_HASH_TO_FILE_TREE,
- blob,
- hash: blob.getHash(),
- }
- }
try {
yield* findFiles(ctx, project.rootFolder?.[0], prefix, true)
} catch (err) {
diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
index c9ed13c6cb4..f24ce4a6605 100644
--- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs
+++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
@@ -387,6 +387,13 @@ async function processFileOnce(entry, filePath) {
ctx: { projectId, historyId },
fileId,
} = entry
+ if (entry.hash && entry.ctx.hasCompletedBlob(entry.hash)) {
+ // We can enter this case for two identical files in the same project,
+ // one with hash, the other without. When the one without hash gets
+ // processed first, we can skip downloading the other one we already
+ // know the hash of.
+ return entry.hash
+ }
const blobStore = new BlobStore(historyId)
STATS.readFromGCSCount++
// make a fetch request to filestore itself
diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
index f24ce4a6605..0ccadaf5a95 100644
--- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs
+++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
@@ -559,8 +559,9 @@ async function processBatch(batch, prefix = 'rootFolder.0') {
blobs.clear()
// The files are currently ordered by project-id.
- // Order them by file-id ASC then blobs ASC to
- // - process files before blobs
+ // Order them by file-id ASC then hash ASC to
+ // increase the hit rate on the "already processed
+ // hash for project" checks.
files.sort(
/**
* @param {QueueEntry} a

View File

@@ -0,0 +1,191 @@
diff --git a/services/web/app.mjs b/services/web/app.mjs
index b7c723da3d77..3f54cc36a8c3 100644
--- a/services/web/app.mjs
+++ b/services/web/app.mjs
@@ -56,14 +56,8 @@ if (Settings.catchErrors) {
// Create ./data/dumpFolder if needed
FileWriter.ensureDumpFolderExists()
-if (
- !Features.hasFeature('project-history-blobs') &&
- !Features.hasFeature('filestore')
-) {
- throw new Error(
- 'invalid config: must enable either project-history-blobs (Settings.enableProjectHistoryBlobs=true) or enable filestore (Settings.disableFilestore=false)'
- )
-}
+// Validate combination of feature flags.
+Features.validateSettings()
// handle SIGTERM for graceful shutdown in kubernetes
process.on('SIGTERM', function (signal) {
diff --git a/services/web/app/src/Features/History/HistoryURLHelper.js b/services/web/app/src/Features/History/HistoryURLHelper.js
index 8b8d8cbdd730..acb43ced68e0 100644
--- a/services/web/app/src/Features/History/HistoryURLHelper.js
+++ b/services/web/app/src/Features/History/HistoryURLHelper.js
@@ -8,7 +8,7 @@ function projectHistoryURLWithFilestoreFallback(
) {
const filestoreURL = `${Settings.apis.filestore.url}/project/${projectId}/file/${fileRef._id}?from=${origin}`
// TODO: When this file is converted to ES modules we will be able to use Features.hasFeature('project-history-blobs'). Currently we can't stub the feature return value in tests.
- if (fileRef.hash && Settings.enableProjectHistoryBlobs) {
+ if (fileRef.hash && Settings.filestoreMigrationLevel >= 1) {
return {
url: `${Settings.apis.project_history.url}/project/${historyId}/blob/${fileRef.hash}`,
fallbackURL: filestoreURL,
diff --git a/services/web/app/src/infrastructure/Features.js b/services/web/app/src/infrastructure/Features.js
index aaf51103b9b8..89c8e6b841d0 100644
--- a/services/web/app/src/infrastructure/Features.js
+++ b/services/web/app/src/infrastructure/Features.js
@@ -19,8 +19,7 @@ const trackChangesModuleAvailable =
* @property {boolean | undefined} enableGithubSync
* @property {boolean | undefined} enableGitBridge
* @property {boolean | undefined} enableHomepage
- * @property {boolean | undefined} enableProjectHistoryBlobs
- * @property {boolean | undefined} disableFilestore
+ * @property {number} filestoreMigrationLevel
* @property {boolean | undefined} enableSaml
* @property {boolean | undefined} ldap
* @property {boolean | undefined} oauth
@@ -29,7 +28,39 @@ const trackChangesModuleAvailable =
* @property {boolean | undefined} saml
*/
+/**
+ * @return {{'project-history-blobs': boolean, filestore: boolean}}
+ */
+function getFilestoreMigrationOptions() {
+ switch (Settings.filestoreMigrationLevel) {
+ case 0:
+ return {
+ 'project-history-blobs': false,
+ filestore: true,
+ }
+ case 1:
+ return {
+ 'project-history-blobs': true,
+ filestore: true,
+ }
+
+ case 2:
+ return {
+ 'project-history-blobs': true,
+ filestore: false,
+ }
+ default:
+ throw new Error(
+ `invalid OVERLEAF_FILESTORE_MIGRATION_LEVEL=${Settings.filestoreMigrationLevel}, expected 0, 1 or 2`
+ )
+ }
+}
+
const Features = {
+ validateSettings() {
+ getFilestoreMigrationOptions() // throws for invalid settings
+ },
+
/**
* @returns {boolean}
*/
@@ -89,9 +120,9 @@ const Features = {
Settings.enabledLinkedFileTypes.includes('url')
)
case 'project-history-blobs':
- return Boolean(Settings.enableProjectHistoryBlobs)
+ return getFilestoreMigrationOptions()['project-history-blobs']
case 'filestore':
- return Boolean(Settings.disableFilestore) === false
+ return getFilestoreMigrationOptions().filestore
case 'support':
return supportModuleAvailable
case 'symbol-palette':
diff --git a/services/web/config/settings.defaults.js b/services/web/config/settings.defaults.js
index bd0730d5d00c..4df63ebd7c6c 100644
--- a/services/web/config/settings.defaults.js
+++ b/services/web/config/settings.defaults.js
@@ -440,6 +440,9 @@ module.exports = {
','
),
+ filestoreMigrationLevel:
+ parseInt(process.env.OVERLEAF_FILESTORE_MIGRATION_LEVEL, 10) || 0,
+
// i18n
// ------
//
diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
index 0ccadaf5a955..2e12328e5c49 100644
--- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs
+++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
@@ -150,10 +150,6 @@ const CONCURRENT_BATCHES = parseInt(process.env.CONCURRENT_BATCHES || '2', 10)
const RETRIES = parseInt(process.env.RETRIES || '10', 10)
const RETRY_DELAY_MS = parseInt(process.env.RETRY_DELAY_MS || '100', 10)
-const USER_FILES_BUCKET_NAME = process.env.USER_FILES_BUCKET_NAME || ''
-if (!USER_FILES_BUCKET_NAME) {
- throw new Error('env var USER_FILES_BUCKET_NAME is missing')
-}
const RETRY_FILESTORE_404 = process.env.RETRY_FILESTORE_404 === 'true'
const BUFFER_DIR = fs.mkdtempSync(
process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
diff --git a/services/web/app/src/infrastructure/Features.js b/services/web/app/src/infrastructure/Features.js
index 89c8e6b841d0..6147e70e0faf 100644
--- a/services/web/app/src/infrastructure/Features.js
+++ b/services/web/app/src/infrastructure/Features.js
@@ -28,37 +28,13 @@ const trackChangesModuleAvailable =
* @property {boolean | undefined} saml
*/
-/**
- * @return {{'project-history-blobs': boolean, filestore: boolean}}
- */
-function getFilestoreMigrationOptions() {
- switch (Settings.filestoreMigrationLevel) {
- case 0:
- return {
- 'project-history-blobs': false,
- filestore: true,
- }
- case 1:
- return {
- 'project-history-blobs': true,
- filestore: true,
- }
-
- case 2:
- return {
- 'project-history-blobs': true,
- filestore: false,
- }
- default:
+const Features = {
+ validateSettings() {
+ if (![0, 1, 2].includes(Settings.filestoreMigrationLevel)) {
throw new Error(
`invalid OVERLEAF_FILESTORE_MIGRATION_LEVEL=${Settings.filestoreMigrationLevel}, expected 0, 1 or 2`
)
- }
-}
-
-const Features = {
- validateSettings() {
- getFilestoreMigrationOptions() // throws for invalid settings
+ }
},
/**
@@ -120,9 +96,9 @@ const Features = {
Settings.enabledLinkedFileTypes.includes('url')
)
case 'project-history-blobs':
- return getFilestoreMigrationOptions()['project-history-blobs']
+ return Settings.filestoreMigrationLevel > 0
case 'filestore':
- return getFilestoreMigrationOptions().filestore
+ return Settings.filestoreMigrationLevel < 2
case 'support':
return supportModuleAvailable
case 'symbol-palette':

View File

@@ -0,0 +1,84 @@
diff --git a/cron/deactivate-projects.sh b/cron/deactivate-projects.sh
index fab0fbfbf667..a391f99a5bd8 100755
--- a/cron/deactivate-projects.sh
+++ b/cron/deactivate-projects.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-set -eux
+set -eu
echo "-------------------------"
echo "Deactivating old projects"
diff --git a/cron/delete-projects.sh b/cron/delete-projects.sh
index e1ea5ac5e621..7cd45771716a 100755
--- a/cron/delete-projects.sh
+++ b/cron/delete-projects.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-set -eux
+set -eu
echo "-------------------------"
echo "Expiring deleted projects"
diff --git a/cron/delete-users.sh b/cron/delete-users.sh
index fe97bffeea0b..30872ac55657 100755
--- a/cron/delete-users.sh
+++ b/cron/delete-users.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-set -eux
+set -eu
echo "----------------------"
echo "Expiring deleted users"
diff --git a/cron/project-history-flush-all.sh b/cron/project-history-flush-all.sh
index d8bbb184aa37..8fe9eea5fc55 100755
--- a/cron/project-history-flush-all.sh
+++ b/cron/project-history-flush-all.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-set -eux
+set -eu
echo "---------------------------------"
echo "Flush all project-history changes"
diff --git a/cron/project-history-periodic-flush.sh b/cron/project-history-periodic-flush.sh
index 76feae410e26..1b8efff6cc7c 100755
--- a/cron/project-history-periodic-flush.sh
+++ b/cron/project-history-periodic-flush.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-set -eux
+set -eu
echo "--------------------------"
echo "Flush project-history queue"
diff --git a/cron/project-history-retry-hard.sh b/cron/project-history-retry-hard.sh
index 651a6615f22d..df9b4703a58e 100755
--- a/cron/project-history-retry-hard.sh
+++ b/cron/project-history-retry-hard.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-set -eux
+set -eu
echo "-----------------------------------"
echo "Retry project-history errors (hard)"
diff --git a/cron/project-history-retry-soft.sh b/cron/project-history-retry-soft.sh
index 70c597021b28..cbb6e714cae7 100755
--- a/cron/project-history-retry-soft.sh
+++ b/cron/project-history-retry-soft.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-set -eux
+set -eu
echo "-----------------------------------"
echo "Retry project-history errors (soft)"

View File

@@ -0,0 +1,76 @@
diff --git a/package-lock.json b/package-lock.json
index 2b3a5868a20..d9d8285618d 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -35581,6 +35581,7 @@
"resolved": "https://registry.npmjs.org/request/-/request-2.88.2.tgz",
"integrity": "sha512-MsvtOrfG9ZcrOwAW+Qi+F6HbD0CWXEh9ou77uOb7FM2WPhwT7smM833PzanhJLsgXjN89Ir6V2PczXNnMpwKhw==",
"deprecated": "request has been deprecated, see https://github.com/request/request/issues/3142",
+ "license": "Apache-2.0",
"dependencies": {
"aws-sign2": "~0.7.0",
"aws4": "^1.8.0",
@@ -35638,15 +35639,15 @@
}
},
"node_modules/request/node_modules/tough-cookie": {
- "version": "2.5.0",
- "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.5.0.tgz",
- "integrity": "sha512-nlLsUzgm1kfLXSXfRZMc1KLAugd4hqJHDTvc2hDIwS3mZAfMEuMbc03SujMF+GEcpaX/qboeycw6iO8JwVv2+g==",
+ "version": "5.1.2",
+ "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-5.1.2.tgz",
+ "integrity": "sha512-FVDYdxtnj0G6Qm/DhNPSb8Ju59ULcup3tuJxkFb5K8Bv2pUXILbf0xZWU8PX8Ov19OXljbUyveOFwRMwkXzO+A==",
+ "license": "BSD-3-Clause",
"dependencies": {
- "psl": "^1.1.28",
- "punycode": "^2.1.1"
+ "tldts": "^6.1.32"
},
"engines": {
- "node": ">=0.8"
+ "node": ">=16"
}
},
"node_modules/requestretry": {
@@ -39612,6 +39613,24 @@
"tlds": "bin.js"
}
},
+ "node_modules/tldts": {
+ "version": "6.1.86",
+ "resolved": "https://registry.npmjs.org/tldts/-/tldts-6.1.86.tgz",
+ "integrity": "sha512-WMi/OQ2axVTf/ykqCQgXiIct+mSQDFdH2fkwhPwgEwvJ1kSzZRiinb0zF2Xb8u4+OqPChmyI6MEu4EezNJz+FQ==",
+ "license": "MIT",
+ "dependencies": {
+ "tldts-core": "^6.1.86"
+ },
+ "bin": {
+ "tldts": "bin/cli.js"
+ }
+ },
+ "node_modules/tldts-core": {
+ "version": "6.1.86",
+ "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-6.1.86.tgz",
+ "integrity": "sha512-Je6p7pkk+KMzMv2XXKmAE3McmolOQFdxkKw0R8EYNr7sELW46JqnNeTX8ybPiQgvg1ymCoF8LXs5fzFaZvJPTA==",
+ "license": "MIT"
+ },
"node_modules/tmp": {
"version": "0.2.3",
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.3.tgz",
diff --git a/package.json b/package.json
index 388b750c3d2..44fffc4664a 100644
--- a/package.json
+++ b/package.json
@@ -33,6 +33,9 @@
"multer": "2.0.1",
"path-to-regexp": "3.3.0",
"qs": "6.13.0"
+ },
+ "request@2.88.2": {
+ "tough-cookie": "5.1.2"
}
},
"scripts": {

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,82 @@
diff --git a/services/web/frontend/js/features/review-panel-new/components/review-tooltip-menu.tsx b/services/web/frontend/js/features/review-panel-new/components/review-tooltip-menu.tsx
index f26542ebe909..fb6b68460bdc 100644
--- a/services/web/frontend/js/features/review-panel-new/components/review-tooltip-menu.tsx
+++ b/services/web/frontend/js/features/review-panel-new/components/review-tooltip-menu.tsx
@@ -18,7 +18,6 @@ import {
reviewTooltipStateField,
} from '@/features/source-editor/extensions/review-tooltip'
import { EditorView, getTooltip } from '@codemirror/view'
-import useViewerPermissions from '@/shared/hooks/use-viewer-permissions'
import usePreviousValue from '@/shared/hooks/use-previous-value'
import { useLayoutContext } from '@/shared/context/layout-context'
import { useReviewPanelViewActionsContext } from '../context/review-panel-view-context'
@@ -35,6 +34,7 @@ import { useEditorPropertiesContext } from '@/features/ide-react/context/editor-
import classNames from 'classnames'
import useEventListener from '@/shared/hooks/use-event-listener'
import useReviewPanelLayout from '../hooks/use-review-panel-layout'
+import { usePermissionsContext } from '@/features/ide-react/context/permissions-context'
const EDIT_MODE_SWITCH_WIDGET_HEIGHT = 40
const CM_LINE_RIGHT_PADDING = 8
@@ -43,7 +43,7 @@ const TOOLTIP_SHOW_DELAY = 120
const ReviewTooltipMenu: FC = () => {
const state = useCodeMirrorStateContext()
const view = useCodeMirrorViewContext()
- const isViewer = useViewerPermissions()
+ const permissions = usePermissionsContext()
const [show, setShow] = useState(true)
const { setView } = useReviewPanelViewActionsContext()
const { openReviewPanel } = useReviewPanelLayout()
@@ -58,7 +58,7 @@ const ReviewTooltipMenu: FC = () => {
const addComment = useCallback(() => {
const { main } = view.state.selection
- if (main.empty) {
+ if (main.empty || !permissions.comment) {
return
}
@@ -74,11 +74,11 @@ const ReviewTooltipMenu: FC = () => {
view.dispatch({ effects })
setShow(false)
- }, [openReviewPanel, setView, setShow, view])
+ }, [view, permissions.comment, openReviewPanel, setView])
useEventListener('add-new-review-comment', addComment)
- if (isViewer || !show || !tooltipState) {
+ if (!permissions.comment || !show || !tooltipState) {
return null
}
diff --git a/services/web/frontend/js/features/source-editor/components/toolbar/toolbar-items.tsx b/services/web/frontend/js/features/source-editor/components/toolbar/toolbar-items.tsx
index 3404976d4462..1811ccc99950 100644
--- a/services/web/frontend/js/features/source-editor/components/toolbar/toolbar-items.tsx
+++ b/services/web/frontend/js/features/source-editor/components/toolbar/toolbar-items.tsx
@@ -16,5 +16,6 @@ import { isSplitTestEnabled } from '@/utils/splitTestUtils'
import { isMac } from '@/shared/utils/os'
import { useProjectContext } from '@/shared/context/project-context'
+import { usePermissionsContext } from '@/features/ide-react/context/permissions-context'
export const ToolbarItems: FC<{
state: EditorState
@@ -35,6 +36,7 @@ export const ToolbarItems: FC<{
useEditorPropertiesContext()
const { writefullInstance } = useEditorContext()
const { features } = useProjectContext()
+ const permissions = usePermissionsContext()
const isActive = withinFormattingCommand(state)
const symbolPaletteAvailable = getMeta('ol-symbolPaletteAvailable')
@@ -131,7 +133,7 @@ export const ToolbarItems: FC<{
command={commands.wrapInHref}
icon="add_link"
/>
- {features.trackChangesVisible && (
+ {features.trackChangesVisible && permissions.comment && (
<ToolbarButton
id="toolbar-add-comment"
label={t('add_comment')}

View File

@@ -0,0 +1,673 @@
diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
index c0fdda35d8f..09212d426e3 100644
--- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs
+++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
@@ -83,7 +83,7 @@ ObjectId.cacheHexString = true
*/
function parseArgs() {
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
- const DEFAULT_OUTPUT_FILE = `file-migration-${new Date()
+ const DEFAULT_OUTPUT_FILE = `/var/log/overleaf/file-migration-${new Date()
.toISOString()
.replace(/[:.]/g, '_')}.log`
@@ -208,7 +208,7 @@ is equivalent to
PROCESS_HASHED_FILES: !args['skip-hashed-files'],
PROCESS_BLOBS: !args['skip-existing-blobs'],
DRY_RUN: args['dry-run'],
- OUTPUT_FILE: args.output,
+ OUTPUT_FILE: args.report ? '-' : args.output,
BATCH_RANGE_START,
BATCH_RANGE_END,
LOGGING_IDENTIFIER: args['logging-id'] || BATCH_RANGE_START,
@@ -256,6 +256,9 @@ const LOGGING_INTERVAL = parseInt(process.env.LOGGING_INTERVAL || '60000', 10)
const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
// Log output to a file
+if (OUTPUT_FILE !== '-') {
+ console.warn(`Writing logs into ${OUTPUT_FILE}`)
+}
logger.initialize('file-migration', {
streams: [
{
diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
index f6f4a6fb76d..c661ae9bc3f 100644
--- a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
+++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
@@ -501,6 +501,7 @@ describe('back_fill_file_hash script', function () {
timeout: TIMEOUT - 500,
env: {
...process.env,
+ AWS_SDK_JS_SUPPRESS_MAINTENANCE_MODE_MESSAGE: '1',
USER_FILES_BUCKET_NAME,
SLEEP_BEFORE_EXIT: '1',
...env,
@@ -516,6 +517,7 @@ describe('back_fill_file_hash script', function () {
}
result = { stdout, stderr, status: code }
}
+ // Ensure no tmp folder is left behind.
expect((await fs.promises.readdir('/tmp')).join(';')).to.not.match(
/back_fill_file_hash/
)
diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
index 09212d426e3..de4fca51db4 100644
--- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs
+++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
@@ -1373,7 +1373,18 @@ async function main() {
console.warn('Done.')
}
+async function cleanupBufferDir() {
+ try {
+ // Perform non-recursive removal of the BUFFER_DIR. Individual files
+ // should get removed in parallel as part of batch processing.
+ await fs.promises.rmdir(BUFFER_DIR)
+ } catch (err) {
+ console.error(`cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
+ }
+}
+
if (DISPLAY_REPORT) {
+ await cleanupBufferDir()
console.warn('Displaying report...')
await displayReport()
process.exit(0)
@@ -1384,13 +1395,7 @@ try {
await main()
} finally {
printStats(true)
- try {
- // Perform non-recursive removal of the BUFFER_DIR. Individual files
- // should get removed in parallel as part of batch processing.
- await fs.promises.rmdir(BUFFER_DIR)
- } catch (err) {
- console.error(`cleanup of BUFFER_DIR=${BUFFER_DIR} failed`, err)
- }
+ await cleanupBufferDir()
}
let code = 0
diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
index de4fca51db4..e9a7721944c 100644
--- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs
+++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
@@ -316,6 +316,7 @@ async function getStatsForCollection(
projectsWithAllHashes: 0,
fileCount: 0,
fileWithHashCount: 0,
+ fileMissingInHistoryCount: 0,
}
// Pick a random sample of projects and estimate the number of files without hashes
const result = await collection
@@ -342,25 +343,43 @@ async function getStatsForCollection(
const filesWithoutHash = fileTree.match(/\{"_id":"[0-9a-f]{24}"\}/g) || []
// count the number of files with a hash, these are uniquely identified
// by the number of "hash" strings due to the filtering
- const filesWithHash = fileTree.match(/"hash"/g) || []
+ const filesWithHash = fileTree.match(/"hash":"[0-9a-f]{40}"/g) || []
stats.fileCount += filesWithoutHash.length + filesWithHash.length
stats.fileWithHashCount += filesWithHash.length
stats.projectCount++
stats.projectsWithAllHashes += filesWithoutHash.length === 0 ? 1 : 0
+ const projectId = project._id.toString()
+ const { blobs: perProjectBlobs } = await getProjectBlobsBatch([projectId])
+ const blobs = new Set(
+ (perProjectBlobs.get(projectId) || []).map(b => b.getHash())
+ )
+ const uniqueHashes = new Set(filesWithHash.map(m => m.slice(8, 48)))
+ for (const hash of uniqueHashes) {
+ if (blobs.has(hash) || GLOBAL_BLOBS.has(hash)) continue
+ stats.fileMissingInHistoryCount++
+ }
}
console.log(`Sampled stats for ${name}:`)
const fractionSampled = stats.projectCount / collectionCount
- const percentageSampled = (fractionSampled * 100).toFixed(1)
+ const percentageSampled = (fractionSampled * 100).toFixed(0)
const fractionConverted = stats.projectsWithAllHashes / stats.projectCount
- const percentageConverted = (fractionConverted * 100).toFixed(1)
+ const percentageConverted = (fractionConverted * 100).toFixed(0)
+ const fractionMissing = stats.fileMissingInHistoryCount / stats.fileCount
+ const percentageMissing = (fractionMissing * 100).toFixed(0)
console.log(
- `- Sampled ${name}: ${stats.projectCount} (${percentageSampled}%)`
+ `- Sampled ${name}: ${stats.projectCount} (${percentageSampled}% of all ${name})`
)
console.log(
`- Sampled ${name} with all hashes present: ${stats.projectsWithAllHashes}`
)
console.log(
- `- Percentage of ${name} converted: ${percentageConverted}% (estimated)`
+ `- Percentage of ${name} that need back-filling hashes: ${percentageConverted}% (estimated)`
+ )
+ console.log(
+ `- Sampled ${name} have ${stats.fileCount} files that need to be checked against the full project history system.`
+ )
+ console.log(
+ `- Sampled ${name} have ${stats.fileMissingInHistoryCount} files that need to be uploaded to the full project history system (estimating ${percentageMissing}% of all files).`
)
}
@@ -369,13 +388,15 @@ async function getStatsForCollection(
* including counts and estimated progress based on a sample.
*/
async function displayReport() {
- const projectsCountResult = await projectsCollection.countDocuments()
+ const projectsCountResult = await projectsCollection.estimatedDocumentCount()
const deletedProjectsCountResult =
- await deletedProjectsCollection.countDocuments()
+ await deletedProjectsCollection.estimatedDocumentCount()
const sampleSize = 1000
console.log('Current status:')
- console.log(`- Projects: ${projectsCountResult}`)
- console.log(`- Deleted projects: ${deletedProjectsCountResult}`)
+ console.log(`- Total number of projects: ${projectsCountResult}`)
+ console.log(
+ `- Total number of deleted projects: ${deletedProjectsCountResult}`
+ )
console.log(`Sampling ${sampleSize} projects to estimate progress...`)
await getStatsForCollection(
sampleSize,
diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
index c661ae9bc3f..7248e74cb3f 100644
--- a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
+++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
@@ -481,21 +481,14 @@ describe('back_fill_file_hash script', function () {
/**
* @param {Array<string>} args
* @param {Record<string, string>} env
- * @param {boolean} shouldHaveWritten
- * @return {Promise<{result, stats: any}>}
+ * @return {Promise<{result: { stdout: string, stderr: string, status: number }, stats: any}>}
*/
- async function tryRunScript(args = [], env = {}, shouldHaveWritten) {
+ async function rawRunScript(args = [], env = {}) {
let result
try {
result = await promisify(execFile)(
process.argv0,
- [
- 'storage/scripts/back_fill_file_hash.mjs',
- '--output=-',
- '--projects',
- '--deleted-projects',
- ...args,
- ],
+ ['storage/scripts/back_fill_file_hash.mjs', ...args],
{
encoding: 'utf-8',
timeout: TIMEOUT - 500,
@@ -521,6 +514,20 @@ describe('back_fill_file_hash script', function () {
expect((await fs.promises.readdir('/tmp')).join(';')).to.not.match(
/back_fill_file_hash/
)
+ return result
+ }
+
+ /**
+ * @param {Array<string>} args
+ * @param {Record<string, string>} env
+ * @param {boolean} shouldHaveWritten
+ * @return {Promise<{result, stats: any}>}
+ */
+ async function tryRunScript(args = [], env = {}, shouldHaveWritten) {
+ const result = await rawRunScript(
+ ['--output=-', '--projects', '--deleted-projects', ...args],
+ env
+ )
const extraStatsKeys = ['eventLoop', 'readFromGCSThroughputMiBPerSecond']
const stats = JSON.parse(
result.stderr
@@ -1078,6 +1085,35 @@ describe('back_fill_file_hash script', function () {
})
commonAssertions(true)
})
+ describe('report mode', function () {
+ let output
+ before('prepare environment', prepareEnvironment)
+ before('run script', async function () {
+ output = await rawRunScript(['--report'], {})
+ })
+ it('should print the report', () => {
+ expect(output.status).to.equal(0)
+ console.log(output.stdout)
+ expect(output.stdout).to.equal(`\
+Current status:
+- Total number of projects: 10
+- Total number of deleted projects: 5
+Sampling 1000 projects to estimate progress...
+Sampled stats for projects:
+- Sampled projects: 9 (90% of all projects)
+- Sampled projects with all hashes present: 5
+- Percentage of projects that need back-filling hashes: 56% (estimated)
+- Sampled projects have 11 files that need to be checked against the full project history system.
+- Sampled projects have 3 files that need to be uploaded to the full project history system (estimating 27% of all files).
+Sampled stats for deleted projects:
+- Sampled deleted projects: 4 (80% of all deleted projects)
+- Sampled deleted projects with all hashes present: 3
+- Percentage of deleted projects that need back-filling hashes: 75% (estimated)
+- Sampled deleted projects have 2 files that need to be checked against the full project history system.
+- Sampled deleted projects have 1 files that need to be uploaded to the full project history system (estimating 50% of all files).
+`)
+ })
+ })
describe('full run in dry-run mode', function () {
let output
diff --git a/services/history-v1/storage/scripts/back_fill_file_hash.mjs b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
index e9a7721944c..9c2a9818680 100644
--- a/services/history-v1/storage/scripts/back_fill_file_hash.mjs
+++ b/services/history-v1/storage/scripts/back_fill_file_hash.mjs
@@ -79,7 +79,7 @@ ObjectId.cacheHexString = true
*/
/**
- * @return {{PROJECT_IDS_FROM: string, PROCESS_HASHED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, PROCESS_BLOBS: boolean, DRY_RUN: boolean, OUTPUT_FILE: string, DISPLAY_REPORT: boolean}}
+ * @return {{PROJECT_IDS_FROM: string, PROCESS_HASHED_FILES: boolean, LOGGING_IDENTIFIER: string, BATCH_RANGE_START: string, BATCH_RANGE_END: string, PROCESS_NON_DELETED_PROJECTS: boolean, PROCESS_DELETED_PROJECTS: boolean, PROCESS_BLOBS: boolean, DRY_RUN: boolean, OUTPUT_FILE: string, DISPLAY_REPORT: boolean, CONCURRENCY: number, CONCURRENT_BATCHES: number, RETRIES: number, RETRY_DELAY_MS: number, RETRY_FILESTORE_404: boolean, BUFFER_DIR_PREFIX: string, STREAM_HIGH_WATER_MARK: number, LOGGING_INTERVAL: number, SLEEP_BEFORE_EXIT: number }}
*/
function parseArgs() {
const PUBLIC_LAUNCH_DATE = new Date('2012-01-01T00:00:00Z')
@@ -95,6 +95,12 @@ function parseArgs() {
{ name: 'skip-hashed-files', type: Boolean },
{ name: 'skip-existing-blobs', type: Boolean },
{ name: 'from-file', type: String, defaultValue: '' },
+ { name: 'concurrency', type: Number, defaultValue: 10 },
+ { name: 'concurrent-batches', type: Number, defaultValue: 1 },
+ { name: 'stream-high-water-mark', type: Number, defaultValue: 1024 * 1024 },
+ { name: 'retries', type: Number, defaultValue: 10 },
+ { name: 'retry-delay-ms', type: Number, defaultValue: 100 },
+ { name: 'retry-filestore-404', type: Boolean },
{ name: 'dry-run', alias: 'n', type: Boolean },
{
name: 'output',
@@ -114,6 +120,13 @@ function parseArgs() {
defaultValue: new Date().toISOString(),
},
{ name: 'logging-id', type: String, defaultValue: '' },
+ { name: 'logging-interval-ms', type: Number, defaultValue: 60_000 },
+ {
+ name: 'buffer-dir-prefix',
+ type: String,
+ defaultValue: '/tmp/back_fill_file_hash-',
+ },
+ { name: 'sleep-before-exit-ms', type: Number, defaultValue: 1_000 },
])
// If no arguments are provided, display a usage message
@@ -143,6 +156,8 @@ Logging options:
(default: file-migration-<timestamp>.log)
--logging-id <id> Identifier for logging
(default: BATCH_RANGE_START)
+ --logging-interval-ms <ms> Interval for logging progress stats
+ (default: 60000, 1min)
Batch range options:
--BATCH_RANGE_START <date> Start date for processing
@@ -150,10 +165,30 @@ Batch range options:
--BATCH_RANGE_END <date> End date for processing
(default: ${args.BATCH_RANGE_END})
+Concurrency:
+ --concurrency <n> Number of files to process concurrently
+ (default: 10)
+ --concurrent-batches <n> Number of project batches to process concurrently
+ (default: 1)
+ --stream-high-water-mark <n> In-Memory buffering threshold
+ (default: 1MiB)
+
+Retries:
+ --retries <n> Number of times to retry processing a file
+ (default: 10)
+ --retry-delay-ms <ms> How long to wait before processing a file again
+ (default: 100, 100ms)
+ --retry-filestore-404 Retry downloading a file when receiving a 404
+ (default: false)
+
Other options:
--report Display a report of the current status
--dry-run, -n Perform a dry run without making changes
--help, -h Show this help message
+ --buffer-dir-prefix <p> Folder/prefix for buffering files on disk
+ (default: ${args['buffer-dir-prefix']})
+ --sleep-before-exit-ms <n> Defer exiting from the script
+ (default: 1000, 1s)
Typical usage:
@@ -212,8 +247,17 @@ is equivalent to
BATCH_RANGE_START,
BATCH_RANGE_END,
LOGGING_IDENTIFIER: args['logging-id'] || BATCH_RANGE_START,
+ LOGGING_INTERVAL: args['logging-interval-ms'],
PROJECT_IDS_FROM: args['from-file'],
DISPLAY_REPORT: args.report,
+ CONCURRENCY: args.concurrency,
+ CONCURRENT_BATCHES: args['concurrent-batches'],
+ STREAM_HIGH_WATER_MARK: args['stream-high-water-mark'],
+ RETRIES: args.retries,
+ RETRY_DELAY_MS: args['retry-delay-ms'],
+ RETRY_FILESTORE_404: args['retry-filestore-404'],
+ BUFFER_DIR_PREFIX: args['buffer-dir-prefix'],
+ SLEEP_BEFORE_EXIT: args['sleep-before-exit-ms'],
}
}
@@ -229,6 +273,15 @@ const {
LOGGING_IDENTIFIER,
PROJECT_IDS_FROM,
DISPLAY_REPORT,
+ CONCURRENCY,
+ CONCURRENT_BATCHES,
+ RETRIES,
+ RETRY_DELAY_MS,
+ RETRY_FILESTORE_404,
+ BUFFER_DIR_PREFIX,
+ STREAM_HIGH_WATER_MARK,
+ LOGGING_INTERVAL,
+ SLEEP_BEFORE_EXIT,
} = parseArgs()
// We need to handle the start and end differently as ids of deleted projects are created at time of deletion.
@@ -236,24 +289,7 @@ if (process.env.BATCH_RANGE_START || process.env.BATCH_RANGE_END) {
throw new Error('use --BATCH_RANGE_START and --BATCH_RANGE_END')
}
-// Concurrency for downloading from GCS and updating hashes in mongo
-const CONCURRENCY = parseInt(process.env.CONCURRENCY || '100', 10)
-const CONCURRENT_BATCHES = parseInt(process.env.CONCURRENT_BATCHES || '2', 10)
-// Retries for processing a given file
-const RETRIES = parseInt(process.env.RETRIES || '10', 10)
-const RETRY_DELAY_MS = parseInt(process.env.RETRY_DELAY_MS || '100', 10)
-
-const RETRY_FILESTORE_404 = process.env.RETRY_FILESTORE_404 === 'true'
-const BUFFER_DIR = fs.mkdtempSync(
- process.env.BUFFER_DIR_PREFIX || '/tmp/back_fill_file_hash-'
-)
-// https://nodejs.org/api/stream.html#streamgetdefaulthighwatermarkobjectmode
-const STREAM_HIGH_WATER_MARK = parseInt(
- process.env.STREAM_HIGH_WATER_MARK || (64 * 1024).toString(),
- 10
-)
-const LOGGING_INTERVAL = parseInt(process.env.LOGGING_INTERVAL || '60000', 10)
-const SLEEP_BEFORE_EXIT = parseInt(process.env.SLEEP_BEFORE_EXIT || '1000', 10)
+const BUFFER_DIR = fs.mkdtempSync(BUFFER_DIR_PREFIX)
// Log output to a file
if (OUTPUT_FILE !== '-') {
@@ -416,7 +452,7 @@ async function displayReport() {
)
}
-// Filestore endpoint location
+// Filestore endpoint location (configured by /etc/overleaf/env.sh)
const FILESTORE_HOST = process.env.FILESTORE_HOST || '127.0.0.1'
const FILESTORE_PORT = process.env.FILESTORE_PORT || '3009'
diff --git a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
index 7248e74cb3f..601cea13b6a 100644
--- a/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
+++ b/services/history-v1/test/acceptance/js/storage/back_fill_file_hash.test.mjs
@@ -61,9 +61,8 @@ function objectIdFromTime(timestamp) {
const PRINT_IDS_AND_HASHES_FOR_DEBUGGING = false
-describe('back_fill_file_hash script', function () {
+describe.only('back_fill_file_hash script', function () {
this.timeout(TIMEOUT)
- const USER_FILES_BUCKET_NAME = 'fake-user-files-gcs'
const projectId0 = objectIdFromTime('2017-01-01T00:00:00Z')
const projectId1 = objectIdFromTime('2017-01-01T00:01:00Z')
@@ -480,24 +479,24 @@ describe('back_fill_file_hash script', function () {
/**
* @param {Array<string>} args
- * @param {Record<string, string>} env
* @return {Promise<{result: { stdout: string, stderr: string, status: number }, stats: any}>}
*/
- async function rawRunScript(args = [], env = {}) {
+ async function rawRunScript(args = []) {
let result
try {
result = await promisify(execFile)(
process.argv0,
- ['storage/scripts/back_fill_file_hash.mjs', ...args],
+ [
+ 'storage/scripts/back_fill_file_hash.mjs',
+ '--sleep-before-exit-ms=1',
+ ...args,
+ ],
{
encoding: 'utf-8',
timeout: TIMEOUT - 500,
env: {
...process.env,
AWS_SDK_JS_SUPPRESS_MAINTENANCE_MODE_MESSAGE: '1',
- USER_FILES_BUCKET_NAME,
- SLEEP_BEFORE_EXIT: '1',
- ...env,
LOG_LEVEL: 'warn', // Override LOG_LEVEL of acceptance tests
},
}
@@ -519,15 +518,16 @@ describe('back_fill_file_hash script', function () {
/**
* @param {Array<string>} args
- * @param {Record<string, string>} env
* @param {boolean} shouldHaveWritten
* @return {Promise<{result, stats: any}>}
*/
- async function tryRunScript(args = [], env = {}, shouldHaveWritten) {
- const result = await rawRunScript(
- ['--output=-', '--projects', '--deleted-projects', ...args],
- env
- )
+ async function tryRunScript(args = [], shouldHaveWritten) {
+ const result = await rawRunScript([
+ '--output=-',
+ '--projects',
+ '--deleted-projects',
+ ...args,
+ ])
const extraStatsKeys = ['eventLoop', 'readFromGCSThroughputMiBPerSecond']
const stats = JSON.parse(
result.stderr
@@ -558,12 +558,11 @@ describe('back_fill_file_hash script', function () {
/**
* @param {Array<string>} args
- * @param {Record<string, string>} env
* @param {boolean} shouldHaveWritten
* @return {Promise<{result, stats: any}>}
*/
- async function runScript(args = [], env = {}, shouldHaveWritten = true) {
- const { stats, result } = await tryRunScript(args, env, shouldHaveWritten)
+ async function runScript(args = [], shouldHaveWritten = true) {
+ const { stats, result } = await tryRunScript(args, shouldHaveWritten)
if (result.status !== 0) {
console.log(result)
expect(result).to.have.property('status', 0)
@@ -812,7 +811,6 @@ describe('back_fill_file_hash script', function () {
it('should process nothing on re-run', async function () {
const rerun = await runScript(
!processHashedFiles ? ['--skip-hashed-files'] : [],
- {},
false
)
let stats = {
@@ -937,10 +935,11 @@ describe('back_fill_file_hash script', function () {
it('should gracefully handle fatal errors', async function () {
mockFilestore.deleteObject(projectId0, fileId0)
const t0 = Date.now()
- const { stats, result } = await tryRunScript(['--skip-hashed-files'], {
- RETRIES: '10',
- RETRY_DELAY_MS: '1000',
- })
+ const { stats, result } = await tryRunScript([
+ '--skip-hashed-files',
+ '--retries=10',
+ '--retry-delay-ms=1000',
+ ])
const t1 = Date.now()
expectNotFoundError(result, 'failed to process file')
expect(result.status).to.equal(1)
@@ -972,11 +971,12 @@ describe('back_fill_file_hash script', function () {
value: { stats, result },
},
] = await Promise.allSettled([
- tryRunScript(['--skip-hashed-files'], {
- RETRY_DELAY_MS: '100',
- RETRIES: '60',
- RETRY_FILESTORE_404: 'true', // 404s are the easiest to simulate in tests
- }),
+ tryRunScript([
+ '--skip-hashed-files',
+ '--retries=60',
+ '--retry-delay-ms=1000',
+ '--retry-filestore-404',
+ ]),
restoreFileAfter5s(),
])
expectNotFoundError(result, 'failed to process file, trying again')
@@ -998,9 +998,7 @@ describe('back_fill_file_hash script', function () {
let output
before('prepare environment', prepareEnvironment)
before('run script', async function () {
- output = await runScript(['--skip-hashed-files'], {
- CONCURRENCY: '1',
- })
+ output = await runScript(['--skip-hashed-files', '--concurrency=1'])
})
/**
@@ -1067,10 +1065,10 @@ describe('back_fill_file_hash script', function () {
let output1, output2
before('prepare environment', prepareEnvironment)
before('run script without hashed files', async function () {
- output1 = await runScript(['--skip-hashed-files'], {})
+ output1 = await runScript(['--skip-hashed-files'])
})
before('run script with hashed files', async function () {
- output2 = await runScript([], {})
+ output2 = await runScript([])
})
it('should print stats for the first run without hashed files', function () {
expect(output1.stats).deep.equal(STATS_ALL)
@@ -1089,7 +1087,7 @@ describe('back_fill_file_hash script', function () {
let output
before('prepare environment', prepareEnvironment)
before('run script', async function () {
- output = await rawRunScript(['--report'], {})
+ output = await rawRunScript(['--report'])
})
it('should print the report', () => {
expect(output.status).to.equal(0)
@@ -1127,13 +1125,7 @@ Sampled stats for deleted projects:
.toArray()
})
before('run script', async function () {
- output = await runScript(
- ['--dry-run'],
- {
- CONCURRENCY: '1',
- },
- false
- )
+ output = await runScript(['--dry-run', '--concurrency=1'], false)
})
it('should print stats for dry-run mode', function () {
@@ -1174,9 +1166,7 @@ Sampled stats for deleted projects:
let output
before('prepare environment', prepareEnvironment)
before('run script', async function () {
- output = await runScript(['--skip-hashed-files'], {
- CONCURRENCY: '10',
- })
+ output = await runScript(['--skip-hashed-files', '--concurrency=10'])
})
it('should print stats', function () {
expect(output.stats).deep.equal(STATS_ALL)
@@ -1184,13 +1174,14 @@ Sampled stats for deleted projects:
commonAssertions()
})
- describe('full run STREAM_HIGH_WATER_MARK=1MB', function () {
+ describe('full run STREAM_HIGH_WATER_MARK=64kiB', function () {
let output
before('prepare environment', prepareEnvironment)
before('run script', async function () {
- output = await runScript(['--skip-hashed-files'], {
- STREAM_HIGH_WATER_MARK: (1024 * 1024).toString(),
- })
+ output = await runScript([
+ '--skip-hashed-files',
+ `--stream-high-water-mark=${64 * 1024}`,
+ ])
})
it('should print stats', function () {
expect(output.stats).deep.equal(STATS_ALL)
@@ -1202,7 +1193,7 @@ Sampled stats for deleted projects:
let output
before('prepare environment', prepareEnvironment)
before('run script', async function () {
- output = await runScript([], {})
+ output = await runScript([])
})
it('should print stats', function () {
expect(output.stats).deep.equal(
@@ -1231,9 +1222,7 @@ Sampled stats for deleted projects:
})
let output
before('run script', async function () {
- output = await runScript(['--skip-hashed-files'], {
- CONCURRENCY: '1',
- })
+ output = await runScript(['--skip-hashed-files', '--concurrency=1'])
})
it('should print stats', function () {
@@ -1252,20 +1241,18 @@ Sampled stats for deleted projects:
let outputPart0, outputPart1
before('prepare environment', prepareEnvironment)
before('run script on part 0', async function () {
- outputPart0 = await runScript(
- ['--skip-hashed-files', `--BATCH_RANGE_END=${edge}`],
- {
- CONCURRENCY: '1',
- }
- )
+ outputPart0 = await runScript([
+ '--skip-hashed-files',
+ `--BATCH_RANGE_END=${edge}`,
+ '--concurrency=1',
+ ])
})
before('run script on part 1', async function () {
- outputPart1 = await runScript(
- ['--skip-hashed-files', `--BATCH_RANGE_START=${edge}`],
- {
- CONCURRENCY: '1',
- }
- )
+ outputPart1 = await runScript([
+ '--skip-hashed-files',
+ `--BATCH_RANGE_START=${edge}`,
+ '--concurrency=1',
+ ])
})
it('should print stats for part 0', function () {

View File

@@ -0,0 +1,165 @@
diff --git a/services/web/app/src/Features/Collaborators/OwnershipTransferHandler.js b/services/web/app/src/Features/Collaborators/OwnershipTransferHandler.js
index e22818ebb880..81ec5ccb0aa5 100644
--- a/services/web/app/src/Features/Collaborators/OwnershipTransferHandler.js
+++ b/services/web/app/src/Features/Collaborators/OwnershipTransferHandler.js
@@ -9,9 +9,75 @@ const PrivilegeLevels = require('../Authorization/PrivilegeLevels')
const TpdsProjectFlusher = require('../ThirdPartyDataStore/TpdsProjectFlusher')
const ProjectAuditLogHandler = require('../Project/ProjectAuditLogHandler')
const AnalyticsManager = require('../Analytics/AnalyticsManager')
+const OError = require('@overleaf/o-error')
+const TagsHandler = require('../Tags/TagsHandler')
+const { promiseMapWithLimit } = require('@overleaf/promise-utils')
module.exports = {
- promises: { transferOwnership },
+ promises: {
+ transferOwnership,
+ transferAllProjectsToUser,
+ },
+}
+
+const TAG_COLOR_BLUE = '#434AF0'
+
+/**
+ * @param {string} fromUserId
+ * @param {string} toUserId
+ * @param {string} ipAddress
+ * @return {Promise<{projectCount: number, newTagName: string}>}
+ */
+async function transferAllProjectsToUser({ fromUserId, toUserId, ipAddress }) {
+ // - Verify that both users exist
+ const fromUser = await UserGetter.promises.getUser(fromUserId, {
+ _id: 1,
+ email: 1,
+ })
+ const toUser = await UserGetter.promises.getUser(toUserId, { _id: 1 })
+ if (!fromUser) throw new OError('missing source user', { fromUserId })
+ if (!toUser) throw new OError('missing destination user', { toUserId })
+ if (fromUser._id.equals(toUser._id))
+ throw new OError('rejecting transfer between identical users', {
+ fromUserId,
+ toUserId,
+ })
+ logger.debug(
+ { fromUserId, toUserId },
+ 'started bulk transfer of all projects from one user to another'
+ )
+ // - Get all owned projects for fromUserId
+ const projects = await Project.find({ owner_ref: fromUserId }, { _id: 1 })
+
+ // - Create new tag on toUserId
+ const newTag = await TagsHandler.promises.createTag(
+ toUserId,
+ `transferred-from-${fromUser.email}`,
+ TAG_COLOR_BLUE,
+ { truncate: true }
+ )
+
+ // - Add tag to projects (can happen before ownership is transferred)
+ await TagsHandler.promises.addProjectsToTag(
+ toUserId,
+ newTag._id,
+ projects.map(p => p._id)
+ )
+
+ // - Transfer all projects
+ await promiseMapWithLimit(5, projects, async project => {
+ await transferOwnership(project._id, toUserId, {
+ allowTransferToNonCollaborators: true,
+ skipEmails: true,
+ ipAddress,
+ })
+ })
+
+ logger.debug(
+ { fromUserId, toUserId },
+ 'finished bulk transfer of all projects from one user to another'
+ )
+ return { projectCount: projects.length, newTagName: newTag.name }
}
async function transferOwnership(projectId, newOwnerId, options = {}) {
@@ -74,8 +140,8 @@ async function transferOwnership(projectId, newOwnerId, options = {}) {
await TpdsProjectFlusher.promises.flushProjectToTpds(projectId)
// Send confirmation emails
- const previousOwner = await UserGetter.promises.getUser(previousOwnerId)
if (!skipEmails) {
+ const previousOwner = await UserGetter.promises.getUser(previousOwnerId)
await _sendEmails(project, previousOwner, newOwner)
}
}
diff --git a/services/web/modules/server-ce-scripts/scripts/transfer-all-projects-to-user.mjs b/services/web/modules/server-ce-scripts/scripts/transfer-all-projects-to-user.mjs
new file mode 100644
index 000000000000..6ff1215de53b
--- /dev/null
+++ b/services/web/modules/server-ce-scripts/scripts/transfer-all-projects-to-user.mjs
@@ -0,0 +1,46 @@
+import { ObjectId } from 'mongodb'
+import minimist from 'minimist'
+import OwnershipTransferHandler from '../../../app/src/Features/Collaborators/OwnershipTransferHandler.js'
+import UserGetter from '../../../app/src/Features/User/UserGetter.js'
+import EmailHelper from '../../../app/src/Features/Helpers/EmailHelper.js'
+
+const args = minimist(process.argv.slice(2), {
+ string: ['from-user', 'to-user'],
+})
+
+/**
+ * @param {string} flag
+ * @return {Promise<string>}
+ */
+async function resolveUser(flag) {
+ const raw = args[flag]
+ if (!raw) throw new Error(`missing parameter --${flag}`)
+ if (ObjectId.isValid(raw)) return raw
+ const email = EmailHelper.parseEmail(raw)
+ if (!email) throw new Error(`invalid email --${flag}=${raw}`)
+ const user = await UserGetter.promises.getUser({ email: email }, { _id: 1 })
+ if (!user)
+ throw new Error(`user with email --${flag}=${email} does not exist`)
+ return user._id.toString()
+}
+
+async function main() {
+ const fromUserId = await resolveUser('from-user')
+ const toUserId = await resolveUser('to-user')
+ await OwnershipTransferHandler.promises.transferAllProjectsToUser({
+ fromUserId,
+ toUserId,
+ ipAddress: '0.0.0.0',
+ })
+}
+
+main()
+ .then(() => {
+ console.error('Done.')
+ process.exit(0)
+ })
+ .catch(err => {
+ console.error('---')
+ console.error(err)
+ process.exit(1)
+ })
diff --git a/services/web/modules/server-ce-scripts/scripts/transfer-all-projects-to-user.mjs b/services/web/modules/server-ce-scripts/scripts/transfer-all-projects-to-user.mjs
index 6ff1215de53b..8c5951334403 100644
--- a/services/web/modules/server-ce-scripts/scripts/transfer-all-projects-to-user.mjs
+++ b/services/web/modules/server-ce-scripts/scripts/transfer-all-projects-to-user.mjs
@@ -1,4 +1,4 @@
-import { ObjectId } from 'mongodb'
+import { ObjectId } from '../../../app/src/infrastructure/mongodb.js'
import minimist from 'minimist'
import OwnershipTransferHandler from '../../../app/src/Features/Collaborators/OwnershipTransferHandler.js'
import UserGetter from '../../../app/src/Features/User/UserGetter.js'
@@ -18,7 +18,7 @@ async function resolveUser(flag) {
if (ObjectId.isValid(raw)) return raw
const email = EmailHelper.parseEmail(raw)
if (!email) throw new Error(`invalid email --${flag}=${raw}`)
- const user = await UserGetter.promises.getUser({ email: email }, { _id: 1 })
+ const user = await UserGetter.promises.getUser({ email }, { _id: 1 })
if (!user)
throw new Error(`user with email --${flag}=${email} does not exist`)
return user._id.toString()

File diff suppressed because it is too large Load Diff