From 2d9db134a2289cf777f7a84f23ff3f8903274d1e Mon Sep 17 00:00:00 2001
From: John Lees-Miller <jdleesmiller@gmail.com>
Date: Wed, 30 Aug 2023 21:11:30 +0100
Subject: [PATCH] Merge pull request #14544 from
 overleaf/jlm-add-history-restore-script

Add project recovery script

GitOrigin-RevId: b98fb2988245d2b67f8fc0711742294e895b1f07
---
 .../history-v1/storage/scripts/recover_zip.js | 255 ++++++++++++++++++
 1 file changed, 255 insertions(+)
 create mode 100644 services/history-v1/storage/scripts/recover_zip.js

diff --git a/services/history-v1/storage/scripts/recover_zip.js b/services/history-v1/storage/scripts/recover_zip.js
new file mode 100644
index 0000000000..9f5e4adaee
--- /dev/null
+++ b/services/history-v1/storage/scripts/recover_zip.js
@@ -0,0 +1,255 @@
+/**
+ * Try to recover a zip of the latest version of a project using only data in
+ * GCS, where this data may have been (recently) hard deleted (i.e. may exist
+ * wholely or in part as non-current versions). This should be able to
+ * retrieve the latest content of a project up to 180 days after it was
+ * deleted.
+ *
+ * Usage:
+ * node recover_zip.js [--verbose] <HISTORY_ID> <HISTORY_ID> ...
+ *
+ * Output:
+ * Signed URL(s) for the uploaded zip files. Note that these are valid for
+ * only 24h, to match the lifecycle rule on the zip bucket.
+ */
+
+const fs = require('fs')
+const os = require('os')
+const path = require('path')
+const util = require('util')
+
+// Something is registering 11 listeners, over the limit of 10, which generates
+// a lot of warning noise.
+require('events').EventEmitter.defaultMaxListeners = 11
+
+const config = require('config')
+// We depend on this via object-persistor.
+// eslint-disable-next-line import/no-extraneous-dependencies
+const { Storage } = require('@google-cloud/storage')
+const isValidUtf8 = require('utf-8-validate')
+
+const core = require('overleaf-editor-core')
+const projectKey = require('../lib/project_key')
+const streams = require('../lib/streams')
+const ProjectArchive = require('../lib/project_archive')
+
+const {
+  values: { verbose: VERBOSE },
+  positionals: HISTORY_IDS,
+} = util.parseArgs({
+  options: {
+    verbose: {
+      type: 'boolean',
+      default: false,
+    },
+  },
+  allowPositionals: true,
+})
+
+if (HISTORY_IDS.length === 0) {
+  console.error('no history IDs; see usage')
+  process.exit(1)
+}
+
+async function listDeletedChunks(historyId) {
+  const bucketName = config.get('chunkStore.bucket')
+  const storage = new Storage()
+  const [files] = await storage.bucket(bucketName).getFiles({
+    prefix: projectKey.format(historyId),
+    versions: true,
+  })
+  return files
+}
+
+async function findLatestChunk(historyId) {
+  const files = await listDeletedChunks(historyId)
+  if (files.length === 0) return null
+  files.sort((a, b) => {
+    if (a.name < b.name) return -1
+    if (a.name > b.name) return 1
+    return 0
+  })
+  return files[files.length - 1]
+}
+
+async function downloadLatestChunk(tmp, historyId) {
+  const latestChunkFile = await findLatestChunk(historyId)
+  if (!latestChunkFile) throw new Error('no chunk found to recover')
+
+  const destination = path.join(tmp, 'latest.json')
+  await latestChunkFile.download({ destination })
+  return destination
+}
+
+async function loadHistory(historyPathname) {
+  const data = await fs.promises.readFile(historyPathname)
+  const rawHistory = JSON.parse(data)
+  return core.History.fromRaw(rawHistory)
+}
+
+async function loadChunk(historyPathname, blobStore) {
+  const history = await loadHistory(historyPathname)
+
+  const blobHashes = new Set()
+  history.findBlobHashes(blobHashes)
+
+  await blobStore.fetchBlobs(blobHashes)
+  await history.loadFiles('lazy', blobStore)
+
+  return new core.Chunk(history, 0)
+}
+
+// TODO: it would be nice to export / expose this from BlobStore;
+// currently this is a copy of the method there.
+async function getStringLengthOfFile(byteLength, pathname) {
+  // We have to read the file into memory to get its UTF-8 length, so don't
+  // bother for files that are too large for us to edit anyway.
+  if (byteLength > core.Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
+    return null
+  }
+
+  // We need to check if the file contains nonBmp or null characters
+  let data = await fs.promises.readFile(pathname)
+  if (!isValidUtf8(data)) return null
+  data = data.toString()
+  if (data.length > core.TextOperation.MAX_STRING_LENGTH) return null
+  if (core.util.containsNonBmpChars(data)) return null
+  if (data.indexOf('\x00') !== -1) return null
+  return data.length
+}
+
+class RecoveryBlobStore {
+  constructor(historyId, tmp) {
+    this.historyId = historyId
+    this.tmp = tmp
+    this.blobs = new Map()
+  }
+
+  async fetchBlobs(blobHashes) {
+    for await (const blobHash of blobHashes) {
+      await this.fetchBlob(blobHash)
+    }
+  }
+
+  async fetchBlob(hash) {
+    if (this.blobs.has(hash)) return
+
+    if (VERBOSE) console.log('fetching blob', hash)
+
+    const bucketName = config.get('blobStore.projectBucket')
+    const storage = new Storage()
+    const [files] = await storage.bucket(bucketName).getFiles({
+      prefix: this.makeProjectBlobKey(hash),
+      versions: true,
+    })
+
+    const destination = this.getBlobPathname(hash)
+
+    if (files.length === 0) {
+      await this.fetchGlobalBlob(hash, destination)
+    } else if (files.length === 1) {
+      await files[0].download({ destination })
+    } else {
+      throw new Error('Multiple versions of blob ' + hash)
+    }
+
+    this.blobs.set(hash, await this.makeBlob(hash, destination))
+  }
+
+  async fetchGlobalBlob(hash, destination) {
+    const bucketName = config.get('blobStore.globalBucket')
+    const storage = new Storage()
+    const file = storage.bucket(bucketName).file(this.makeGlobalBlobKey(hash))
+    await file.download({ destination })
+  }
+
+  async makeBlob(hash, pathname) {
+    const stat = await fs.promises.stat(pathname)
+    const byteLength = stat.size
+    const stringLength = await getStringLengthOfFile(byteLength, pathname)
+    return new core.Blob(hash, byteLength, stringLength)
+  }
+
+  async getString(hash) {
+    const stream = await this.getStream(hash)
+    const buffer = await streams.readStreamToBuffer(stream)
+    return buffer.toString()
+  }
+
+  async getStream(hash) {
+    return fs.createReadStream(this.getBlobPathname(hash))
+  }
+
+  async getBlob(hash) {
+    return this.blobs.get(hash)
+  }
+
+  getBlobPathname(hash) {
+    return path.join(this.tmp, hash)
+  }
+
+  makeGlobalBlobKey(hash) {
+    return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
+  }
+
+  makeProjectBlobKey(hash) {
+    return `${projectKey.format(this.historyId)}/${hash.slice(
+      0,
+      2
+    )}/${hash.slice(2)}`
+  }
+}
+
+async function uploadZip(historyId, zipPathname) {
+  const bucketName = config.get('zipStore.bucket')
+  const deadline = 24 * 3600 * 1000 // lifecycle limit on the zips bucket
+  const storage = new Storage()
+  const destination = `${historyId}-recovered.zip`
+  await storage.bucket(bucketName).upload(zipPathname, { destination })
+
+  const signedUrls = await storage
+    .bucket(bucketName)
+    .file(destination)
+    .getSignedUrl({
+      version: 'v4',
+      action: 'read',
+      expires: Date.now() + deadline,
+    })
+
+  return signedUrls[0]
+}
+
+async function restoreProject(historyId) {
+  const tmp = await fs.promises.mkdtemp(
+    path.join(os.tmpdir(), historyId.toString())
+  )
+  if (VERBOSE) console.log('recovering', historyId, 'in', tmp)
+
+  const latestJsonPathname = await downloadLatestChunk(tmp, historyId)
+  const blobStore = new RecoveryBlobStore(historyId, tmp)
+  const chunk = await loadChunk(latestJsonPathname, blobStore)
+
+  const snapshot = chunk.getSnapshot()
+  for (const change of chunk.getChanges()) {
+    change.applyTo(snapshot)
+  }
+
+  if (VERBOSE) console.log('zipping', historyId)
+
+  const zipPathname = path.join(tmp, `${historyId}.zip`)
+  const zipTimeoutMs = 60 * 1000
+  const archive = new ProjectArchive(snapshot, zipTimeoutMs)
+  await archive.writeZip(blobStore, zipPathname)
+
+  if (VERBOSE) console.log('uploading', historyId)
+
+  return await uploadZip(historyId, zipPathname)
+}
+
+async function main() {
+  for (const historyId of HISTORY_IDS) {
+    const signedUrl = await restoreProject(historyId)
+    console.log(signedUrl)
+  }
+}
+main().catch(console.error)