From 9ac982cda73ebf19d22f2ef4045612549a955e1f Mon Sep 17 00:00:00 2001 From: Christopher Hoskin Date: Tue, 21 Feb 2023 10:19:41 +0000 Subject: [PATCH] Merge pull request #11860 from overleaf/jpa-cleanup-history-analytics [history-v1] delete count_blob_references script and related infra GitOrigin-RevId: f1fb41600486d2b22fa47b922b8895c0ae8e9288 --- server-ce/Dockerfile | 1 - .../config/custom-environment-variables.json | 3 - services/history-v1/cloud-formation.json | 572 ------------------ .../config/custom-environment-variables.json | 3 - services/history-v1/config/development.json | 3 - services/history-v1/config/test.json | 3 - .../storage/tasks/count_blob_references.js | 246 -------- 7 files changed, 831 deletions(-) delete mode 100644 services/history-v1/cloud-formation.json delete mode 100755 services/history-v1/storage/tasks/count_blob_references.js diff --git a/server-ce/Dockerfile b/server-ce/Dockerfile index 58e671a463..6b7c27e640 100644 --- a/server-ce/Dockerfile +++ b/server-ce/Dockerfile @@ -95,7 +95,6 @@ ENV SHARELATEX_HISTORY_BLOBS_BUCKET "/var/lib/sharelatex/data/history/overleaf-b ENV SHARELATEX_HISTORY_PROJECT_BLOBS_BUCKET "/var/lib/sharelatex/data/history/overleaf-project-blobs" ENV SHARELATEX_HISTORY_CHUNKS_BUCKET "/var/lib/sharelatex/data/history/overleaf-chunks" ENV SHARELATEX_HISTORY_ZIPS_BUCKET "/var/lib/sharelatex/data/history/overleaf-zips" -ENV SHARELATEX_HISTORY_ANALYTICS_BUCKET "/var/lib/sharelatex/data/history/overleaf-analytics" # Phusion Image timeouts before sending SIGKILL to processes # ---------------------------------------------------------- diff --git a/server-ce/config/custom-environment-variables.json b/server-ce/config/custom-environment-variables.json index 6a8c97c924..1f05548ecb 100644 --- a/server-ce/config/custom-environment-variables.json +++ b/server-ce/config/custom-environment-variables.json @@ -29,9 +29,6 @@ "bucket": "SHARELATEX_HISTORY_ZIPS_BUCKET", "zipTimeoutMs": "ZIP_STORE_ZIP_TIMEOUT_MS" }, - "analytics": { - "bucket": "SHARELATEX_HISTORY_ANALYTICS_BUCKET" - }, "mongo": { "uri": "MONGO_CONNECTION_STRING" }, diff --git a/services/history-v1/cloud-formation.json b/services/history-v1/cloud-formation.json deleted file mode 100644 index 9088756d24..0000000000 --- a/services/history-v1/cloud-formation.json +++ /dev/null @@ -1,572 +0,0 @@ -{ - "AWSTemplateFormatVersion": "2010-09-09", - "Metadata": { - "AWS::CloudFormation::Designer": { - "ee78c12d-0d1e-4ca0-8fa9-ba02f49d071c": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 30, - "y": 60 - }, - "z": 0, - "embeds": [] - }, - "a52902b8-f027-45a8-9151-3e56ced5fb42": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 30, - "y": 140 - }, - "z": 0, - "embeds": [] - }, - "674a64fc-3703-4222-91b9-4878490489e2": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 250, - "y": 100 - }, - "z": 0, - "embeds": [], - "isassociatedwith": [ - "5c314e8e-535b-4b09-8bb7-c089794a3829" - ] - }, - "5c314e8e-535b-4b09-8bb7-c089794a3829": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 250, - "y": 210 - }, - "z": 0, - "embeds": [] - }, - "3da9a376-afc1-4b37-add1-9cf0df20b0a0": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 360, - "y": 100 - }, - "z": 0, - "embeds": [] - }, - "7fd11cc7-5574-44f3-99df-877b6f0f2a74": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 130, - "y": 60 - }, - "z": 0, - "embeds": [], - "isassociatedwith": [ - "ee78c12d-0d1e-4ca0-8fa9-ba02f49d071c" - ] - }, - "1d8a8e19-2661-44d4-99c0-4a2c88c8557d": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 130, - "y": 140 - }, - "z": 0, - "embeds": [], - "isassociatedwith": [ - "a52902b8-f027-45a8-9151-3e56ced5fb42" - ] - }, - "e29c9a81-85ad-4511-ab1e-018fe50f1573": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 30, - "y": 220 - }, - "z": 0, - "embeds": [] - }, - "1388662c-85e1-4f6e-9b80-0f1888a6e07d": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 130, - "y": 220 - }, - "z": 0, - "embeds": [], - "isassociatedwith": [ - "e29c9a81-85ad-4511-ab1e-018fe50f1573" - ] - }, - "236600ec-46ca-4770-8d7c-61532a6d8c27": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 30, - "y": 300 - }, - "z": 0, - "embeds": [] - }, - "454a6298-2f35-48d7-8cd5-3152d78a585b": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 130, - "y": 300 - }, - "z": 0, - "embeds": [] - } - } - }, - "Resources": { - "Blobs": { - "Type": "AWS::S3::Bucket", - "Properties": { - "BucketName": { - "Fn::Join": [ - "-", - [ - { - "Ref": "OverleafEditorBucketPrefix" - }, - "blobs" - ] - ] - }, - "VersioningConfiguration": { - "Status": "Enabled" - }, - "LifecycleConfiguration": { - "Rules": [ - { - "NoncurrentVersionExpirationInDays": 90, - "Status": "Enabled" - } - ] - }, - "BucketEncryption": { - "ServerSideEncryptionConfiguration": [ - { - "ServerSideEncryptionByDefault": { - "SSEAlgorithm": "AES256" - } - } - ] - } - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "ee78c12d-0d1e-4ca0-8fa9-ba02f49d071c" - } - } - }, - "Chunks": { - "Type": "AWS::S3::Bucket", - "Properties": { - "BucketName": { - "Fn::Join": [ - "-", - [ - { - "Ref": "OverleafEditorBucketPrefix" - }, - "chunks" - ] - ] - }, - "VersioningConfiguration": { - "Status": "Enabled" - }, - "LifecycleConfiguration": { - "Rules": [ - { - "NoncurrentVersionExpirationInDays": 80, - "Status": "Enabled" - } - ] - }, - "BucketEncryption": { - "ServerSideEncryptionConfiguration": [ - { - "ServerSideEncryptionByDefault": { - "SSEAlgorithm": "AES256" - } - } - ] - } - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "a52902b8-f027-45a8-9151-3e56ced5fb42" - } - } - }, - "APIUser": { - "Type": "AWS::IAM::User", - "Properties": { - "Groups": [ - { - "Ref": "APIGroup" - } - ] - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "674a64fc-3703-4222-91b9-4878490489e2" - } - } - }, - "APIGroup": { - "Type": "AWS::IAM::Group", - "Properties": {}, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "5c314e8e-535b-4b09-8bb7-c089794a3829" - } - } - }, - "APIUserAccessKey": { - "Type": "AWS::IAM::AccessKey", - "Properties": { - "UserName": { - "Ref": "APIUser" - } - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "3da9a376-afc1-4b37-add1-9cf0df20b0a0" - } - } - }, - "BlobsPolicy": { - "Type": "AWS::S3::BucketPolicy", - "Properties": { - "Bucket": { - "Ref": "Blobs" - }, - "PolicyDocument": { - "Id": "BlobsPolicy", - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "BlobsPolicyAPIUser", - "Action": [ - "s3:GetObject", - "s3:PutObject" - ], - "Effect": "Allow", - "Resource": { - "Fn::Join": [ - "", - [ - "arn:aws:s3:::", - { - "Ref": "Blobs" - }, - "/*" - ] - ] - }, - "Principal": { - "AWS": { - "Fn::GetAtt": [ - "APIUser", - "Arn" - ] - } - } - } - ] - } - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "7fd11cc7-5574-44f3-99df-877b6f0f2a74" - } - } - }, - "ChunksPolicy": { - "Type": "AWS::S3::BucketPolicy", - "Properties": { - "Bucket": { - "Ref": "Chunks" - }, - "PolicyDocument": { - "Id": "ChunksPolicy", - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "ChunksPolicyAPIUser", - "Action": [ - "s3:GetObject", - "s3:PutObject", - "s3:DeleteObject" - ], - "Effect": "Allow", - "Resource": { - "Fn::Join": [ - "", - [ - "arn:aws:s3:::", - { - "Ref": "Chunks" - }, - "/*" - ] - ] - }, - "Principal": { - "AWS": { - "Fn::GetAtt": [ - "APIUser", - "Arn" - ] - } - } - } - ] - } - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "1d8a8e19-2661-44d4-99c0-4a2c88c8557d" - } - } - }, - "Zips": { - "Type": "AWS::S3::Bucket", - "Properties": { - "BucketName": { - "Fn::Join": [ - "-", - [ - { - "Ref": "OverleafEditorBucketPrefix" - }, - "zips" - ] - ] - }, - "BucketEncryption": { - "ServerSideEncryptionConfiguration": [ - { - "ServerSideEncryptionByDefault": { - "SSEAlgorithm": "AES256" - } - } - ] - }, - "LifecycleConfiguration": { - "Rules": [ - { - "ExpirationInDays": 1, - "Status": "Enabled" - } - ] - }, - "PublicAccessBlockConfiguration": { - "BlockPublicAcls": true, - "BlockPublicPolicy": true, - "IgnorePublicAcls": true, - "RestrictPublicBuckets": true - } - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "e29c9a81-85ad-4511-ab1e-018fe50f1573" - } - } - }, - "ZipsPolicy": { - "Type": "AWS::S3::BucketPolicy", - "Properties": { - "Bucket": { - "Ref": "Zips" - }, - "PolicyDocument": { - "Id": "ZipsPolicy", - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "ZipsPolicyAPIUser", - "Action": [ - "s3:GetObject", - "s3:PutObject" - ], - "Effect": "Allow", - "Resource": { - "Fn::Join": [ - "", - [ - "arn:aws:s3:::", - { - "Ref": "Zips" - }, - "/*" - ] - ] - }, - "Principal": { - "AWS": { - "Fn::GetAtt": [ - "APIUser", - "Arn" - ] - } - } - } - ] - } - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "1388662c-85e1-4f6e-9b80-0f1888a6e07d" - } - } - }, - "Analytics": { - "Type": "AWS::S3::Bucket", - "Properties": { - "BucketName": { - "Fn::Join": [ - "-", - [ - { - "Ref": "OverleafEditorBucketPrefix" - }, - "analytics" - ] - ] - } - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "236600ec-46ca-4770-8d7c-61532a6d8c27" - } - } - }, - "AnalyticsPolicy": { - "Type": "AWS::S3::BucketPolicy", - "Properties": { - "Bucket": { - "Ref": "Analytics" - }, - "PolicyDocument": { - "Id": "AnalyticsPolicy", - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "AnalyticsPolicyAPIUser", - "Action": [ - "s3:GetObject", - "s3:PutObject", - "s3:DeleteObject" - ], - "Effect": "Allow", - "Resource": { - "Fn::Join": [ - "", - [ - "arn:aws:s3:::", - { - "Ref": "Analytics" - }, - "/*" - ] - ] - }, - "Principal": { - "AWS": { - "Fn::GetAtt": [ - "APIUser", - "Arn" - ] - } - } - }, - { - "Sid": "AnalyticsPolicyAPIUserBucketPerms", - "Action": "s3:ListBucket", - "Effect": "Allow", - "Resource": { - "Fn::Join": [ - "", - [ - "arn:aws:s3:::", - { - "Ref": "Analytics" - } - ] - ] - }, - "Principal": { - "AWS": { - "Fn::GetAtt": [ - "APIUser", - "Arn" - ] - } - } - } - ] - } - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "454a6298-2f35-48d7-8cd5-3152d78a585b" - } - } - } - }, - "Parameters": { - "OverleafEditorBucketPrefix": { - "Description": "Prefix for the S3 bucket names (e.g. production-overleaf-editor or staging-overleaf-editor)", - "Type": "String" - } - }, - "Outputs": { - "APIUserAccessKey": { - "Value": { - "Ref": "APIUserAccessKey" - } - }, - "APIUserSecretKey": { - "Value": { - "Fn::GetAtt": [ - "APIUserAccessKey", - "SecretAccessKey" - ] - } - } - } -} diff --git a/services/history-v1/config/custom-environment-variables.json b/services/history-v1/config/custom-environment-variables.json index e4725ab0cb..58b9576be1 100644 --- a/services/history-v1/config/custom-environment-variables.json +++ b/services/history-v1/config/custom-environment-variables.json @@ -39,9 +39,6 @@ "bucket": "OVERLEAF_EDITOR_ZIPS_BUCKET", "zipTimeoutMs": "ZIP_STORE_ZIP_TIMEOUT_MS" }, - "analytics": { - "bucket": "OVERLEAF_EDITOR_ANALYTICS_BUCKET" - }, "mongo": { "uri": "MONGO_CONNECTION_STRING" }, diff --git a/services/history-v1/config/development.json b/services/history-v1/config/development.json index cdf3fca1a7..327b25d805 100644 --- a/services/history-v1/config/development.json +++ b/services/history-v1/config/development.json @@ -24,9 +24,6 @@ "zipStore": { "bucket": "overleaf-development-zips" }, - "analytics": { - "bucket": "overleaf-development-analytics" - }, "useDeleteObjects": "false", "mongo": { "uri": "mongodb://mongo:27017/sharelatex" diff --git a/services/history-v1/config/test.json b/services/history-v1/config/test.json index b7f203a35b..e658877e19 100644 --- a/services/history-v1/config/test.json +++ b/services/history-v1/config/test.json @@ -21,9 +21,6 @@ "zipStore": { "bucket": "overleaf-test-zips" }, - "analytics": { - "bucket": "overleaf-test-analytics" - }, "maxDeleteKeys": "3", "useDeleteObjects": "false", "mongo": { diff --git a/services/history-v1/storage/tasks/count_blob_references.js b/services/history-v1/storage/tasks/count_blob_references.js deleted file mode 100755 index fce1579e96..0000000000 --- a/services/history-v1/storage/tasks/count_blob_references.js +++ /dev/null @@ -1,246 +0,0 @@ -#!/usr/bin/env node - -'use strict' - -/** - * This script fetches all history chunks from active projects (as listed in the - * active_doc_ids table) and counts how many times each blob is referenced. The - * reference count is stored in the blobs.estimated_reference_count column. - */ - -const Path = require('path') -const BPromise = require('bluebird') -const commandLineArgs = require('command-line-args') -const config = require('config') -const stringToStream = require('string-to-stream') - -const { History, EditFileOperation } = require('overleaf-editor-core') -const { knex, historyStore, persistor } = require('..') - -const DEFAULT_BATCH_SIZE = 100 -const DEFAULT_TIMEOUT = 23 * 60 * 60 // 23 hours -const MAX_POSTGRES_INTEGER = 2147483647 -const TEXT_OPERATION_COUNT_THRESHOLD = 500 -const BUCKET = config.get('analytics.bucket') -const BLOB_REFERENCE_COUNTS_PREFIX = 'blob-reference-counts/batches/' -const TEXT_OPERATION_COUNTS_PREFIX = 'text-operation-counts/' - -async function main() { - const programName = Path.basename(process.argv[1]) - const options = commandLineArgs([ - { name: 'restart', type: Boolean }, - { name: 'continue', type: Boolean }, - { name: 'batch-size', type: Number, defaultValue: DEFAULT_BATCH_SIZE }, - { name: 'timeout', type: Number, defaultValue: DEFAULT_TIMEOUT }, - { name: 'concurrency', type: Number, defaultValue: 1 }, - { name: 'min-doc-id', type: Number, defaultValue: 1 }, - { name: 'max-doc-id', type: Number, defaultValue: MAX_POSTGRES_INTEGER }, - ]) - const minDocId = options['min-doc-id'] - const maxDocId = options['max-doc-id'] - const runOptions = { - batchSize: options['batch-size'], - timeout: options.timeout, - concurrency: options.concurrency, - } - const inProgress = await isRunInProgress() - if (inProgress && !options.restart && !options.continue) { - console.log(`\ -A blob reference count is already under way. - -To resume this run, use: ${programName} --continue -To start a new run, use: ${programName} --restart`) - return - } - if (!inProgress || options.restart) { - await initialize() - } - const nextDocId = await getNextDocId(minDocId, maxDocId) - await run(nextDocId, maxDocId, runOptions) -} - -async function isRunInProgress() { - const record = await knex('blob_reference_count_batches').first() - return record != null -} - -async function getNextDocId(minDocId, maxDocId) { - const { lastDocId } = await knex('blob_reference_count_batches') - .where('end_doc_id', '<=', maxDocId) - .max({ lastDocId: 'end_doc_id' }) - .first() - if (lastDocId == null) { - return minDocId - } else { - return Math.max(minDocId, lastDocId + 1) - } -} - -async function initialize() { - await persistor.deleteDirectory(BUCKET, BLOB_REFERENCE_COUNTS_PREFIX) - await persistor.deleteDirectory(BUCKET, TEXT_OPERATION_COUNTS_PREFIX) - await knex('blob_reference_count_batches').truncate() -} - -async function run(startDocId, maxDocId, options) { - const { timeout, batchSize, concurrency } = options - const maxRunningTime = Date.now() + timeout * 1000 - let batchStart = startDocId - while (true) { - if (Date.now() > maxRunningTime) { - console.log('Timeout exceeded. Exiting early.') - break - } - const docIds = await getDocIds(batchStart, maxDocId, batchSize) - if (docIds.length === 0) { - console.log('No more projects to process. Bye!') - break - } - const batchEnd = docIds[docIds.length - 1] - console.log(`Processing doc ids ${batchStart} to ${batchEnd}...`) - const chunks = await getChunks(docIds) - const blobReferenceCounter = new BlobReferenceCounter() - const textOperationCounter = new TextOperationCounter() - await BPromise.map( - chunks, - async chunk => { - const history = await getHistory(chunk) - blobReferenceCounter.processHistory(history, chunk.projectId) - textOperationCounter.processHistory(history, chunk.projectId) - }, - { concurrency } - ) - await storeBlobReferenceCounts(batchStart, blobReferenceCounter.getCounts()) - await storeTextOperationCounts(batchStart, textOperationCounter.getCounts()) - await recordBatch(batchStart, batchEnd) - batchStart = batchEnd + 1 - } -} - -async function getDocIds(minDocId, maxDocId, batchSize) { - const docIds = await knex('active_doc_ids') - .select('doc_id') - .where('doc_id', '>=', minDocId) - .andWhere('doc_id', '<=', maxDocId) - .orderBy('doc_id') - .limit(batchSize) - .pluck('doc_id') - return docIds -} - -async function getChunks(docIds) { - const chunks = await knex('chunks') - .select('id', { projectId: 'doc_id' }) - .where('doc_id', 'in', docIds) - return chunks -} - -async function recordBatch(batchStart, batchEnd) { - await knex('blob_reference_count_batches').insert({ - start_doc_id: batchStart, - end_doc_id: batchEnd, - }) -} - -async function getHistory(chunk) { - const rawHistory = await historyStore.loadRaw(chunk.projectId, chunk.id) - const history = History.fromRaw(rawHistory) - return history -} - -async function storeBlobReferenceCounts(startDocId, counts) { - const key = `${BLOB_REFERENCE_COUNTS_PREFIX}${startDocId}.csv` - const csv = makeCsvFromMap(counts) - const stream = stringToStream(csv) - persistor.sendStream(BUCKET, key, stream) -} - -async function storeTextOperationCounts(startDocId, counts) { - const key = `${TEXT_OPERATION_COUNTS_PREFIX}${startDocId}.csv` - const csv = makeCsvFromMap(counts) - const stream = stringToStream(csv) - await persistor.sendStream(BUCKET, key, stream) -} - -function makeCsvFromMap(map) { - const entries = Array.from(map.entries()) - entries.sort((a, b) => { - if (a[0] < b[0]) { - return -1 - } - if (a[0] > b[0]) { - return 1 - } - return 0 - }) - return entries.map(entry => entry.join(',')).join('\n') -} - -function incrementMapEntry(map, key) { - const currentCount = map.get(key) || 0 - map.set(key, currentCount + 1) -} - -class BlobReferenceCounter { - constructor() { - this.blobHashesByProjectId = new Map() - } - - processHistory(history, projectId) { - let blobHashes = this.blobHashesByProjectId.get(projectId) - if (blobHashes == null) { - blobHashes = new Set() - this.blobHashesByProjectId.set(projectId, blobHashes) - } - history.findBlobHashes(blobHashes) - } - - getCounts() { - const countsByHash = new Map() - for (const blobHashes of this.blobHashesByProjectId.values()) { - for (const hash of blobHashes) { - incrementMapEntry(countsByHash, hash) - } - } - return countsByHash - } -} - -class TextOperationCounter { - constructor() { - this.countsByProjectId = new Map() - } - - processHistory(history, projectId) { - for (const change of history.getChanges()) { - let textOperationCount = 0 - for (const operation of change.getOperations()) { - if (operation instanceof EditFileOperation) { - textOperationCount++ - } - } - if (textOperationCount >= TEXT_OPERATION_COUNT_THRESHOLD) { - this.countsByProjectId.set( - projectId, - Math.max( - this.countsByProjectId.get(projectId) || 0, - textOperationCount - ) - ) - } - } - } - - getCounts() { - return this.countsByProjectId - } -} - -main() - .then(() => { - process.exit() - }) - .catch(err => { - console.error(err) - process.exit(1) - })