Merge pull request #28655 from overleaf/em-restore-avoid-doc-download

File restore: avoid downloading docs unnecessarily

GitOrigin-RevId: bf5faab7510b118041aaf848f9acb3eb864b5cc4
This commit is contained in:
Eric Mc Sween
2025-09-26 11:09:09 -04:00
committed by Copybot
parent 9ba3014f96
commit c150f2fdad
7 changed files with 165 additions and 164 deletions

View File

@@ -3,6 +3,7 @@ import Path from 'node:path'
import FileWriter from '../../infrastructure/FileWriter.js'
import Metrics from '../../infrastructure/Metrics.js'
import FileSystemImportManager from '../Uploads/FileSystemImportManager.js'
import FileTypeManager from '../Uploads/FileTypeManager.js'
import EditorController from '../Editor/EditorController.js'
import Errors from '../Errors/Errors.js'
import moment from 'moment'
@@ -114,11 +115,6 @@ const RestoreManager = {
throw new OError('project does not have ranges support', { projectId })
}
const fsPath = await RestoreManager._writeFileVersionToDisk(
projectId,
version,
pathname
)
const basename = Path.basename(pathname)
let dirname = Path.dirname(pathname)
if (dirname === '.') {
@@ -142,18 +138,13 @@ const RestoreManager = {
throw new OError('file not found in snapshot', { pathname })
}
const importInfo = await FileSystemImportManager.promises.importFile(
fsPath,
pathname
)
let hadDeletedRootFile = false
if (file) {
if (file.type !== 'doc' && file.type !== 'file') {
throw new OError('unexpected file type', { type: file.type })
}
logger.debug(
{ projectId, fileId: file.element._id, type: importInfo.type },
{ projectId, fileId: file.element._id },
'deleting entity before reverting it'
)
await EditorController.promises.deleteEntity(
@@ -177,15 +168,22 @@ const RestoreManager = {
// Look for metadata indicating a linked file.
const fileMetadata = snapshotFile.getMetadata()
const isFileMetadata = fileMetadata && 'provider' in fileMetadata
const isLinkedFile = fileMetadata && 'provider' in fileMetadata
logger.debug({ fileMetadata }, 'metadata from history')
if (
isLinkedFile ||
!snapshotFile.isEditable() ||
importInfo.type === 'file' ||
isFileMetadata
!FileTypeManager.isEditable(snapshotFile.getContent(), {
filename: pathname,
})
) {
const fsPath = await RestoreManager._writeFileVersionToDisk(
projectId,
version,
pathname
)
const newFile = await EditorController.promises.upsertFile(
projectId,
parentFolderId,
@@ -320,7 +318,7 @@ const RestoreManager = {
endTimer({ type: 'doc' })
return {
_id,
type: importInfo.type,
type: 'doc',
}
},

View File

@@ -31,7 +31,7 @@ async function newUpdate(
throw new Errors.TooManyRequestsError('project on cooldown')
}
const shouldIgnore = await FileTypeManager.promises.shouldIgnore(path)
const shouldIgnore = FileTypeManager.shouldIgnore(path)
if (shouldIgnore) {
return null
}
@@ -175,7 +175,7 @@ async function createFolder(userId, projectId, projectName, path) {
throw new Errors.TooManyRequestsError('project on cooldown')
}
const shouldIgnore = await FileTypeManager.promises.shouldIgnore(path)
const shouldIgnore = FileTypeManager.shouldIgnore(path)
if (shouldIgnore) {
return null
}

View File

@@ -96,7 +96,7 @@ async function addFolderContents(
}
const entries = (await fs.promises.readdir(folderPath)) || []
for (const entry of entries) {
if (await FileTypeManager.promises.shouldIgnore(entry)) {
if (FileTypeManager.shouldIgnore(entry)) {
continue
}
await addEntity(
@@ -227,7 +227,7 @@ async function* _walkDir(dirPath) {
const entries = await fs.promises.readdir(dirPath)
for (const entry of entries) {
const entryPath = Path.join(dirPath, entry)
if (await FileTypeManager.promises.shouldIgnore(entryPath)) {
if (FileTypeManager.shouldIgnore(entryPath)) {
continue
}

View File

@@ -1,96 +1,101 @@
const fs = require('fs')
// @ts-check
const fs = require('fs/promises')
const Path = require('path')
const { callbackify } = require('util')
const isUtf8 = require('utf-8-validate')
const { promisifyAll } = require('@overleaf/promise-utils')
const Settings = require('@overleaf/settings')
const Minimatch = require('minimatch').Minimatch
const fileIgnoreMatcher = new Minimatch(Settings.fileIgnorePattern, {
nocase: true, // make the whole path matching case-insensitive
// (previously we were only matching the extension case-insensitively but it seems safer to match the whole path)
dot: true, // allows matching on paths containing a dot e.g. /.git/foo/bar.txt
const FILE_IGNORE_MATCHER = new Minimatch(Settings.fileIgnorePattern, {
// make the whole path matching case-insensitive (previously we were only
// matching the extension case-insensitively but it seems safer to match the
// whole path)
nocase: true,
// allows matching on paths containing a dot e.g. /.git/foo/bar.txt
dot: true,
})
const FileTypeManager = {
TEXT_EXTENSIONS: new Set(Settings.textExtensions.map(ext => `.${ext}`)),
EDITABLE_FILENAMES: Settings.editableFilenames,
const TEXT_EXTENSIONS = new Set(Settings.textExtensions.map(ext => `.${ext}`))
const EDITABLE_FILENAMES = Settings.editableFilenames
MAX_TEXT_FILE_SIZE: 3 * Settings.max_doc_length, // allow 3 bytes for every character
// allow 3 bytes for every character
const MAX_TEXT_FILE_SIZE = 3 * Settings.max_doc_length
isDirectory(path, callback) {
fs.stat(path, (error, stats) => {
if (error != null) {
return callback(error)
}
callback(null, stats.isDirectory())
})
},
async function isDirectory(path) {
const stats = await fs.stat(path)
return stats.isDirectory()
}
// returns charset as understood by fs.readFile,
getType(name, fsPath, existingFileType, callback) {
if (!name) {
return callback(
new Error(
'[FileTypeManager] getType requires a non-null "name" parameter'
)
)
}
if (!fsPath) {
return callback(
new Error(
'[FileTypeManager] getType requires a non-null "fsPath" parameter'
)
)
}
const basename = Path.basename(name)
if (existingFileType !== 'doc' && !_isTextFilename(basename)) {
return callback(null, { binary: true })
}
/**
* Determine whether a string can be stored as an editable doc
*
* @param {string} content
* @param {object} [opts]
* @param {string} [opts.filename] - if a filename is given, the algorithm also
* checks whether the filename matches the list of editable filenames
*/
function isEditable(content, opts = {}) {
if (opts.filename != null && !_isTextFilename(opts.filename)) {
return false
}
fs.stat(fsPath, (err, stat) => {
if (err != null) {
return callback(err)
}
if (stat.size > FileTypeManager.MAX_TEXT_FILE_SIZE) {
return callback(null, { binary: true }) // Treat large text file as binary
}
if (content.length >= Settings.max_doc_length) {
return false
}
fs.readFile(fsPath, (err, bytes) => {
if (err != null) {
return callback(err)
}
const encoding = _detectEncoding(bytes)
const text = bytes.toString(encoding)
if (text.length >= Settings.max_doc_length) {
return callback(null, { binary: true }) // Treat large text file as binary
}
// For compatibility with the history service, only accept valid utf8 with no
// nulls or non-BMP characters as text, eveything else is binary.
if (text.includes('\x00')) {
return callback(null, { binary: true })
}
if (/[\uD800-\uDFFF]/.test(text)) {
// non-BMP characters (high and low surrogate characters)
return callback(null, { binary: true })
}
callback(null, { binary: false, encoding })
})
})
},
// For compatibility with the history service, only accept valid utf8 with no
// nulls or non-BMP characters as text, eveything else is binary.
if (content.includes('\x00')) {
return false
}
// non-BMP characters (high and low surrogate characters)
if (/[\uD800-\uDFFF]/.test(content)) {
return false
}
return true
}
// FIXME: we can convert this to a synchronous function if we want to
shouldIgnore(path, callback) {
// use minimatch file matching to check if the path should be ignored
const ignore = fileIgnoreMatcher.match(path)
callback(null, ignore)
},
/**
* Determine whether a file can be stored as an editable doc
*
* @param {string} name - target filename
* @param {string} fsPath - path of the file on the filesystem
* @param {'file' | 'doc' | null} existingFileType - current type of the file at
* the target location
*/
async function getType(name, fsPath, existingFileType) {
if (existingFileType !== 'doc' && !_isTextFilename(name)) {
return { binary: true }
}
const stat = await fs.stat(fsPath)
if (stat.size > MAX_TEXT_FILE_SIZE) {
return { binary: true }
}
const bytes = await fs.readFile(fsPath)
const encoding = _detectEncoding(bytes)
const text = bytes.toString(encoding)
if (isEditable(text)) {
return { binary: false, encoding }
} else {
return { binary: true }
}
}
function shouldIgnore(path) {
// use minimatch file matching to check if the path should be ignored
return FILE_IGNORE_MATCHER.match(path)
}
function _isTextFilename(filename) {
const basename = Path.basename(filename)
const extension = Path.extname(filename).toLowerCase()
return (
FileTypeManager.TEXT_EXTENSIONS.has(extension) ||
FileTypeManager.EDITABLE_FILENAMES.includes(filename.toLowerCase())
TEXT_EXTENSIONS.has(extension) ||
EDITABLE_FILENAMES.includes(basename.toLowerCase())
)
}
@@ -105,7 +110,13 @@ function _detectEncoding(bytes) {
return 'latin1'
}
module.exports = FileTypeManager
module.exports.promises = promisifyAll(FileTypeManager, {
without: ['getStrictTypeFromContent'],
})
module.exports = {
shouldIgnore,
isEditable,
getType: callbackify(getType),
isDirectory: callbackify(isDirectory),
promises: {
getType,
isDirectory,
},
}

View File

@@ -487,9 +487,6 @@ describe('RestoreManager', function () {
metadata: { ts: '2024-01-01T00:00:00.000Z', user_id: 'user-2' },
},
]
ctx.FileSystemImportManager.promises.importFile = sinon
.stub()
.resolves({ type: 'doc', lines: ['foo', 'bar', 'baz'] })
ctx.getDocUpdaterCompatibleRanges.returns({
changes: ctx.tracked_changes,
comments: ctx.comments,
@@ -933,9 +930,6 @@ describe('RestoreManager', function () {
describe('when reverting a linked file', function () {
beforeEach(async function (ctx) {
ctx.pathname = 'foo.png'
ctx.FileSystemImportManager.promises.importFile = sinon
.stub()
.resolves({ type: 'file' })
ctx.result = await ctx.RestoreManager.promises.revertFile(
ctx.user_id,
ctx.project_id,
@@ -979,9 +973,6 @@ describe('RestoreManager', function () {
describe('when reverting a linked document with provider', function () {
beforeEach(async function (ctx) {
ctx.pathname = 'linkedFile.bib'
ctx.FileSystemImportManager.promises.importFile = sinon
.stub()
.resolves({ type: 'doc', lines: ['foo', 'bar', 'baz'] })
ctx.result = await ctx.RestoreManager.promises.revertFile(
ctx.user_id,
ctx.project_id,
@@ -1025,9 +1016,6 @@ describe('RestoreManager', function () {
describe('when reverting a linked document with { main: true }', function () {
beforeEach(async function (ctx) {
ctx.pathname = 'withMainTrue.tex'
ctx.FileSystemImportManager.promises.importFile = sinon
.stub()
.resolves({ type: 'doc', lines: ['foo', 'bar', 'baz'] })
ctx.result = await ctx.RestoreManager.promises.revertFile(
ctx.user_id,
ctx.project_id,
@@ -1065,9 +1053,6 @@ describe('RestoreManager', function () {
describe('when reverting a binary file', function () {
beforeEach(async function (ctx) {
ctx.pathname = 'foo.png'
ctx.FileSystemImportManager.promises.importFile = sinon
.stub()
.resolves({ type: 'file' })
ctx.EditorController.promises.upsertFile = sinon
.stub()
.resolves({ _id: 'mock-file-id', type: 'file' })

View File

@@ -45,9 +45,7 @@ describe('TpdsUpdateHandler', function () {
},
}
ctx.FileTypeManager = {
promises: {
shouldIgnore: sinon.stub().resolves(false),
},
shouldIgnore: sinon.stub().returns(false),
}
ctx.Modules = {
promises: {
@@ -225,7 +223,7 @@ describe('TpdsUpdateHandler', function () {
describe('update to a file that should be ignored', async function () {
setupMatchingProjects(['active1'])
beforeEach(function (ctx) {
ctx.FileTypeManager.promises.shouldIgnore.resolves(true)
ctx.FileTypeManager.shouldIgnore.returns(true)
})
receiveUpdate()
expectProjectNotCreated()

View File

@@ -3,58 +3,59 @@ const { expect } = require('chai')
const SandboxedModule = require('sandboxed-module')
const isUtf8 = require('utf-8-validate')
const Settings = require('@overleaf/settings')
const modulePath = '../../../../app/src/Features/Uploads/FileTypeManager.js'
const MODULE_PATH = '../../../../app/src/Features/Uploads/FileTypeManager.js'
describe('FileTypeManager', function () {
const fileContents = 'Ich bin eine kleine Teekanne, kurz und kräftig.'
beforeEach(function () {
this.isUtf8 = sinon.spy(isUtf8)
this.stats = {
isDirectory: sinon.stub().returns(false),
size: 100,
}
const fileContents = 'Ich bin eine kleine Teekanne, kurz und kräftig.'
this.fs = {
stat: sinon.stub().yields(null, this.stats),
stat: sinon.stub().resolves(this.stats),
readFile: sinon.stub(),
}
this.fs.readFile
.withArgs('utf8.tex')
.yields(null, Buffer.from(fileContents, 'utf-8'))
.resolves(Buffer.from(fileContents, 'utf-8'))
this.fs.readFile
.withArgs('utf16.tex')
.yields(null, Buffer.from(`\uFEFF${fileContents}`, 'utf-16le'))
.resolves(Buffer.from(`\uFEFF${fileContents}`, 'utf-16le'))
this.fs.readFile
.withArgs('latin1.tex')
.yields(null, Buffer.from(fileContents, 'latin1'))
.resolves(Buffer.from(fileContents, 'latin1'))
this.fs.readFile
.withArgs('latin1-null.tex')
.yields(null, Buffer.from(`${fileContents}\x00${fileContents}`, 'utf-8'))
.resolves(Buffer.from(`${fileContents}\x00${fileContents}`, 'utf-8'))
this.fs.readFile
.withArgs('utf8-null.tex')
.yields(null, Buffer.from(`${fileContents}\x00${fileContents}`, 'utf-8'))
.resolves(Buffer.from(`${fileContents}\x00${fileContents}`, 'utf-8'))
this.fs.readFile
.withArgs('utf8-non-bmp.tex')
.yields(null, Buffer.from(`${fileContents}😈`))
.resolves(Buffer.from(`${fileContents}😈`))
this.fs.readFile
.withArgs('utf8-control-chars.tex')
.yields(null, Buffer.from(`${fileContents}\x0c${fileContents}`))
.resolves(Buffer.from(`${fileContents}\x0c${fileContents}`))
this.fs.readFile
.withArgs('text-short.tex')
.yields(null, Buffer.from('a'.repeat(0.5 * 1024 * 1024), 'utf-8'))
.resolves(Buffer.from('a'.repeat(0.5 * 1024 * 1024), 'utf-8'))
this.fs.readFile
.withArgs('text-smaller.tex')
.yields(null, Buffer.from('a'.repeat(2 * 1024 * 1024 - 1), 'utf-8'))
.resolves(Buffer.from('a'.repeat(2 * 1024 * 1024 - 1), 'utf-8'))
this.fs.readFile
.withArgs('text-exact.tex')
.yields(null, Buffer.from('a'.repeat(2 * 1024 * 1024), 'utf-8'))
.resolves(Buffer.from('a'.repeat(2 * 1024 * 1024), 'utf-8'))
this.fs.readFile
.withArgs('text-long.tex')
.yields(null, Buffer.from('a'.repeat(3 * 1024 * 1024), 'utf-8'))
this.callback = sinon.stub()
this.DocumentHelper = { getEncodingFromTexContent: sinon.stub() }
this.FileTypeManager = SandboxedModule.require(modulePath, {
.resolves(Buffer.from('a'.repeat(3 * 1024 * 1024), 'utf-8'))
this.FileTypeManager = SandboxedModule.require(MODULE_PATH, {
requires: {
fs: this.fs,
'fs/promises': this.fs,
'utf-8-validate': this.isUtf8,
'@overleaf/settings': Settings,
},
@@ -88,6 +89,32 @@ describe('FileTypeManager', function () {
})
})
describe('isEditable', function () {
it('classifies simple UTF-8 as editable', function () {
expect(this.FileTypeManager.isEditable(fileContents)).to.be.true
})
it('classifies text with non-BMP characters as binary', function () {
expect(this.FileTypeManager.isEditable(`${fileContents}😈`)).to.be.false
})
it('classifies a .tex file as editable', function () {
expect(
this.FileTypeManager.isEditable(fileContents, {
filename: 'some/file.tex',
})
).to.be.true
})
it('classifies a .exe file as binary', function () {
expect(
this.FileTypeManager.isEditable(fileContents, {
filename: 'command.exe',
})
).to.be.false
})
})
describe('getType', function () {
describe('when the file extension is text', function () {
const TEXT_FILENAMES = [
@@ -335,76 +362,58 @@ describe('FileTypeManager', function () {
describe('shouldIgnore', function () {
it('should ignore tex auxiliary files', async function () {
const ignore =
await this.FileTypeManager.promises.shouldIgnore('file.aux')
const ignore = this.FileTypeManager.shouldIgnore('file.aux')
ignore.should.equal(true)
})
it('should ignore dotfiles', async function () {
const ignore =
await this.FileTypeManager.promises.shouldIgnore('path/.git')
const ignore = this.FileTypeManager.shouldIgnore('path/.git')
ignore.should.equal(true)
})
it('should ignore .git directories and contained files', async function () {
const ignore =
await this.FileTypeManager.promises.shouldIgnore('path/.git/info')
const ignore = await this.FileTypeManager.shouldIgnore('path/.git/info')
ignore.should.equal(true)
})
it('should not ignore .latexmkrc dotfile', async function () {
const ignore =
await this.FileTypeManager.promises.shouldIgnore('path/.latexmkrc')
const ignore = this.FileTypeManager.shouldIgnore('path/.latexmkrc')
ignore.should.equal(false)
})
it('should ignore __MACOSX', async function () {
const ignore =
await this.FileTypeManager.promises.shouldIgnore('path/__MACOSX')
const ignore = this.FileTypeManager.shouldIgnore('path/__MACOSX')
ignore.should.equal(true)
})
it('should ignore synctex files', async function () {
const ignore =
await this.FileTypeManager.promises.shouldIgnore('file.synctex')
const ignore = this.FileTypeManager.shouldIgnore('file.synctex')
ignore.should.equal(true)
})
it('should ignore synctex(busy) files', async function () {
const ignore =
await this.FileTypeManager.promises.shouldIgnore('file.synctex(busy)')
const ignore = this.FileTypeManager.shouldIgnore('file.synctex(busy)')
ignore.should.equal(true)
})
it('should not ignore .tex files', async function () {
const ignore =
await this.FileTypeManager.promises.shouldIgnore('file.tex')
const ignore = this.FileTypeManager.shouldIgnore('file.tex')
ignore.should.equal(false)
})
it('should ignore the case of the extension', async function () {
const ignore =
await this.FileTypeManager.promises.shouldIgnore('file.AUX')
const ignore = this.FileTypeManager.shouldIgnore('file.AUX')
ignore.should.equal(true)
})
it('should not ignore files with an ignored extension as full name', async function () {
const ignore = await this.FileTypeManager.promises.shouldIgnore('dvi')
const ignore = this.FileTypeManager.shouldIgnore('dvi')
ignore.should.equal(false)
})
it('should not ignore directories with an ignored extension as full name', async function () {
this.stats.isDirectory.returns(true)
const ignore = await this.FileTypeManager.promises.shouldIgnore('dvi')
const ignore = this.FileTypeManager.shouldIgnore('dvi')
ignore.should.equal(false)
})
})