diff --git a/services/web/app/src/Features/History/RestoreManager.mjs b/services/web/app/src/Features/History/RestoreManager.mjs index b50521245e..7153b13263 100644 --- a/services/web/app/src/Features/History/RestoreManager.mjs +++ b/services/web/app/src/Features/History/RestoreManager.mjs @@ -3,6 +3,7 @@ import Path from 'node:path' import FileWriter from '../../infrastructure/FileWriter.js' import Metrics from '../../infrastructure/Metrics.js' import FileSystemImportManager from '../Uploads/FileSystemImportManager.js' +import FileTypeManager from '../Uploads/FileTypeManager.js' import EditorController from '../Editor/EditorController.js' import Errors from '../Errors/Errors.js' import moment from 'moment' @@ -114,11 +115,6 @@ const RestoreManager = { throw new OError('project does not have ranges support', { projectId }) } - const fsPath = await RestoreManager._writeFileVersionToDisk( - projectId, - version, - pathname - ) const basename = Path.basename(pathname) let dirname = Path.dirname(pathname) if (dirname === '.') { @@ -142,18 +138,13 @@ const RestoreManager = { throw new OError('file not found in snapshot', { pathname }) } - const importInfo = await FileSystemImportManager.promises.importFile( - fsPath, - pathname - ) - let hadDeletedRootFile = false if (file) { if (file.type !== 'doc' && file.type !== 'file') { throw new OError('unexpected file type', { type: file.type }) } logger.debug( - { projectId, fileId: file.element._id, type: importInfo.type }, + { projectId, fileId: file.element._id }, 'deleting entity before reverting it' ) await EditorController.promises.deleteEntity( @@ -177,15 +168,22 @@ const RestoreManager = { // Look for metadata indicating a linked file. const fileMetadata = snapshotFile.getMetadata() - const isFileMetadata = fileMetadata && 'provider' in fileMetadata + const isLinkedFile = fileMetadata && 'provider' in fileMetadata logger.debug({ fileMetadata }, 'metadata from history') if ( + isLinkedFile || !snapshotFile.isEditable() || - importInfo.type === 'file' || - isFileMetadata + !FileTypeManager.isEditable(snapshotFile.getContent(), { + filename: pathname, + }) ) { + const fsPath = await RestoreManager._writeFileVersionToDisk( + projectId, + version, + pathname + ) const newFile = await EditorController.promises.upsertFile( projectId, parentFolderId, @@ -320,7 +318,7 @@ const RestoreManager = { endTimer({ type: 'doc' }) return { _id, - type: importInfo.type, + type: 'doc', } }, diff --git a/services/web/app/src/Features/ThirdPartyDataStore/TpdsUpdateHandler.mjs b/services/web/app/src/Features/ThirdPartyDataStore/TpdsUpdateHandler.mjs index be2fc8a6ac..3463a3ad2e 100644 --- a/services/web/app/src/Features/ThirdPartyDataStore/TpdsUpdateHandler.mjs +++ b/services/web/app/src/Features/ThirdPartyDataStore/TpdsUpdateHandler.mjs @@ -31,7 +31,7 @@ async function newUpdate( throw new Errors.TooManyRequestsError('project on cooldown') } - const shouldIgnore = await FileTypeManager.promises.shouldIgnore(path) + const shouldIgnore = FileTypeManager.shouldIgnore(path) if (shouldIgnore) { return null } @@ -175,7 +175,7 @@ async function createFolder(userId, projectId, projectName, path) { throw new Errors.TooManyRequestsError('project on cooldown') } - const shouldIgnore = await FileTypeManager.promises.shouldIgnore(path) + const shouldIgnore = FileTypeManager.shouldIgnore(path) if (shouldIgnore) { return null } diff --git a/services/web/app/src/Features/Uploads/FileSystemImportManager.js b/services/web/app/src/Features/Uploads/FileSystemImportManager.js index 22533af8fa..1eb593d420 100644 --- a/services/web/app/src/Features/Uploads/FileSystemImportManager.js +++ b/services/web/app/src/Features/Uploads/FileSystemImportManager.js @@ -96,7 +96,7 @@ async function addFolderContents( } const entries = (await fs.promises.readdir(folderPath)) || [] for (const entry of entries) { - if (await FileTypeManager.promises.shouldIgnore(entry)) { + if (FileTypeManager.shouldIgnore(entry)) { continue } await addEntity( @@ -227,7 +227,7 @@ async function* _walkDir(dirPath) { const entries = await fs.promises.readdir(dirPath) for (const entry of entries) { const entryPath = Path.join(dirPath, entry) - if (await FileTypeManager.promises.shouldIgnore(entryPath)) { + if (FileTypeManager.shouldIgnore(entryPath)) { continue } diff --git a/services/web/app/src/Features/Uploads/FileTypeManager.js b/services/web/app/src/Features/Uploads/FileTypeManager.js index 5dc7ebbb20..bd6f1d9e0d 100644 --- a/services/web/app/src/Features/Uploads/FileTypeManager.js +++ b/services/web/app/src/Features/Uploads/FileTypeManager.js @@ -1,96 +1,101 @@ -const fs = require('fs') +// @ts-check + +const fs = require('fs/promises') const Path = require('path') +const { callbackify } = require('util') const isUtf8 = require('utf-8-validate') -const { promisifyAll } = require('@overleaf/promise-utils') const Settings = require('@overleaf/settings') const Minimatch = require('minimatch').Minimatch -const fileIgnoreMatcher = new Minimatch(Settings.fileIgnorePattern, { - nocase: true, // make the whole path matching case-insensitive - // (previously we were only matching the extension case-insensitively but it seems safer to match the whole path) - dot: true, // allows matching on paths containing a dot e.g. /.git/foo/bar.txt +const FILE_IGNORE_MATCHER = new Minimatch(Settings.fileIgnorePattern, { + // make the whole path matching case-insensitive (previously we were only + // matching the extension case-insensitively but it seems safer to match the + // whole path) + nocase: true, + // allows matching on paths containing a dot e.g. /.git/foo/bar.txt + dot: true, }) -const FileTypeManager = { - TEXT_EXTENSIONS: new Set(Settings.textExtensions.map(ext => `.${ext}`)), - EDITABLE_FILENAMES: Settings.editableFilenames, +const TEXT_EXTENSIONS = new Set(Settings.textExtensions.map(ext => `.${ext}`)) +const EDITABLE_FILENAMES = Settings.editableFilenames - MAX_TEXT_FILE_SIZE: 3 * Settings.max_doc_length, // allow 3 bytes for every character +// allow 3 bytes for every character +const MAX_TEXT_FILE_SIZE = 3 * Settings.max_doc_length - isDirectory(path, callback) { - fs.stat(path, (error, stats) => { - if (error != null) { - return callback(error) - } - callback(null, stats.isDirectory()) - }) - }, +async function isDirectory(path) { + const stats = await fs.stat(path) + return stats.isDirectory() +} - // returns charset as understood by fs.readFile, - getType(name, fsPath, existingFileType, callback) { - if (!name) { - return callback( - new Error( - '[FileTypeManager] getType requires a non-null "name" parameter' - ) - ) - } - if (!fsPath) { - return callback( - new Error( - '[FileTypeManager] getType requires a non-null "fsPath" parameter' - ) - ) - } - const basename = Path.basename(name) - if (existingFileType !== 'doc' && !_isTextFilename(basename)) { - return callback(null, { binary: true }) - } +/** + * Determine whether a string can be stored as an editable doc + * + * @param {string} content + * @param {object} [opts] + * @param {string} [opts.filename] - if a filename is given, the algorithm also + * checks whether the filename matches the list of editable filenames + */ +function isEditable(content, opts = {}) { + if (opts.filename != null && !_isTextFilename(opts.filename)) { + return false + } - fs.stat(fsPath, (err, stat) => { - if (err != null) { - return callback(err) - } - if (stat.size > FileTypeManager.MAX_TEXT_FILE_SIZE) { - return callback(null, { binary: true }) // Treat large text file as binary - } + if (content.length >= Settings.max_doc_length) { + return false + } - fs.readFile(fsPath, (err, bytes) => { - if (err != null) { - return callback(err) - } - const encoding = _detectEncoding(bytes) - const text = bytes.toString(encoding) - if (text.length >= Settings.max_doc_length) { - return callback(null, { binary: true }) // Treat large text file as binary - } - // For compatibility with the history service, only accept valid utf8 with no - // nulls or non-BMP characters as text, eveything else is binary. - if (text.includes('\x00')) { - return callback(null, { binary: true }) - } - if (/[\uD800-\uDFFF]/.test(text)) { - // non-BMP characters (high and low surrogate characters) - return callback(null, { binary: true }) - } - callback(null, { binary: false, encoding }) - }) - }) - }, + // For compatibility with the history service, only accept valid utf8 with no + // nulls or non-BMP characters as text, eveything else is binary. + if (content.includes('\x00')) { + return false + } + // non-BMP characters (high and low surrogate characters) + if (/[\uD800-\uDFFF]/.test(content)) { + return false + } + return true +} - // FIXME: we can convert this to a synchronous function if we want to - shouldIgnore(path, callback) { - // use minimatch file matching to check if the path should be ignored - const ignore = fileIgnoreMatcher.match(path) - callback(null, ignore) - }, +/** + * Determine whether a file can be stored as an editable doc + * + * @param {string} name - target filename + * @param {string} fsPath - path of the file on the filesystem + * @param {'file' | 'doc' | null} existingFileType - current type of the file at + * the target location + */ +async function getType(name, fsPath, existingFileType) { + if (existingFileType !== 'doc' && !_isTextFilename(name)) { + return { binary: true } + } + + const stat = await fs.stat(fsPath) + if (stat.size > MAX_TEXT_FILE_SIZE) { + return { binary: true } + } + + const bytes = await fs.readFile(fsPath) + const encoding = _detectEncoding(bytes) + const text = bytes.toString(encoding) + + if (isEditable(text)) { + return { binary: false, encoding } + } else { + return { binary: true } + } +} + +function shouldIgnore(path) { + // use minimatch file matching to check if the path should be ignored + return FILE_IGNORE_MATCHER.match(path) } function _isTextFilename(filename) { + const basename = Path.basename(filename) const extension = Path.extname(filename).toLowerCase() return ( - FileTypeManager.TEXT_EXTENSIONS.has(extension) || - FileTypeManager.EDITABLE_FILENAMES.includes(filename.toLowerCase()) + TEXT_EXTENSIONS.has(extension) || + EDITABLE_FILENAMES.includes(basename.toLowerCase()) ) } @@ -105,7 +110,13 @@ function _detectEncoding(bytes) { return 'latin1' } -module.exports = FileTypeManager -module.exports.promises = promisifyAll(FileTypeManager, { - without: ['getStrictTypeFromContent'], -}) +module.exports = { + shouldIgnore, + isEditable, + getType: callbackify(getType), + isDirectory: callbackify(isDirectory), + promises: { + getType, + isDirectory, + }, +} diff --git a/services/web/test/unit/src/History/RestoreManager.test.mjs b/services/web/test/unit/src/History/RestoreManager.test.mjs index 003731eb78..43b9057e5c 100644 --- a/services/web/test/unit/src/History/RestoreManager.test.mjs +++ b/services/web/test/unit/src/History/RestoreManager.test.mjs @@ -487,9 +487,6 @@ describe('RestoreManager', function () { metadata: { ts: '2024-01-01T00:00:00.000Z', user_id: 'user-2' }, }, ] - ctx.FileSystemImportManager.promises.importFile = sinon - .stub() - .resolves({ type: 'doc', lines: ['foo', 'bar', 'baz'] }) ctx.getDocUpdaterCompatibleRanges.returns({ changes: ctx.tracked_changes, comments: ctx.comments, @@ -933,9 +930,6 @@ describe('RestoreManager', function () { describe('when reverting a linked file', function () { beforeEach(async function (ctx) { ctx.pathname = 'foo.png' - ctx.FileSystemImportManager.promises.importFile = sinon - .stub() - .resolves({ type: 'file' }) ctx.result = await ctx.RestoreManager.promises.revertFile( ctx.user_id, ctx.project_id, @@ -979,9 +973,6 @@ describe('RestoreManager', function () { describe('when reverting a linked document with provider', function () { beforeEach(async function (ctx) { ctx.pathname = 'linkedFile.bib' - ctx.FileSystemImportManager.promises.importFile = sinon - .stub() - .resolves({ type: 'doc', lines: ['foo', 'bar', 'baz'] }) ctx.result = await ctx.RestoreManager.promises.revertFile( ctx.user_id, ctx.project_id, @@ -1025,9 +1016,6 @@ describe('RestoreManager', function () { describe('when reverting a linked document with { main: true }', function () { beforeEach(async function (ctx) { ctx.pathname = 'withMainTrue.tex' - ctx.FileSystemImportManager.promises.importFile = sinon - .stub() - .resolves({ type: 'doc', lines: ['foo', 'bar', 'baz'] }) ctx.result = await ctx.RestoreManager.promises.revertFile( ctx.user_id, ctx.project_id, @@ -1065,9 +1053,6 @@ describe('RestoreManager', function () { describe('when reverting a binary file', function () { beforeEach(async function (ctx) { ctx.pathname = 'foo.png' - ctx.FileSystemImportManager.promises.importFile = sinon - .stub() - .resolves({ type: 'file' }) ctx.EditorController.promises.upsertFile = sinon .stub() .resolves({ _id: 'mock-file-id', type: 'file' }) diff --git a/services/web/test/unit/src/ThirdPartyDataStore/TpdsUpdateHandler.test.mjs b/services/web/test/unit/src/ThirdPartyDataStore/TpdsUpdateHandler.test.mjs index 863a4ce2f4..af771f75f4 100644 --- a/services/web/test/unit/src/ThirdPartyDataStore/TpdsUpdateHandler.test.mjs +++ b/services/web/test/unit/src/ThirdPartyDataStore/TpdsUpdateHandler.test.mjs @@ -45,9 +45,7 @@ describe('TpdsUpdateHandler', function () { }, } ctx.FileTypeManager = { - promises: { - shouldIgnore: sinon.stub().resolves(false), - }, + shouldIgnore: sinon.stub().returns(false), } ctx.Modules = { promises: { @@ -225,7 +223,7 @@ describe('TpdsUpdateHandler', function () { describe('update to a file that should be ignored', async function () { setupMatchingProjects(['active1']) beforeEach(function (ctx) { - ctx.FileTypeManager.promises.shouldIgnore.resolves(true) + ctx.FileTypeManager.shouldIgnore.returns(true) }) receiveUpdate() expectProjectNotCreated() diff --git a/services/web/test/unit/src/Uploads/FileTypeManagerTests.js b/services/web/test/unit/src/Uploads/FileTypeManagerTests.js index 032f11ed57..97f0eff11e 100644 --- a/services/web/test/unit/src/Uploads/FileTypeManagerTests.js +++ b/services/web/test/unit/src/Uploads/FileTypeManagerTests.js @@ -3,58 +3,59 @@ const { expect } = require('chai') const SandboxedModule = require('sandboxed-module') const isUtf8 = require('utf-8-validate') const Settings = require('@overleaf/settings') -const modulePath = '../../../../app/src/Features/Uploads/FileTypeManager.js' + +const MODULE_PATH = '../../../../app/src/Features/Uploads/FileTypeManager.js' describe('FileTypeManager', function () { + const fileContents = 'Ich bin eine kleine Teekanne, kurz und kräftig.' + beforeEach(function () { this.isUtf8 = sinon.spy(isUtf8) this.stats = { isDirectory: sinon.stub().returns(false), size: 100, } - const fileContents = 'Ich bin eine kleine Teekanne, kurz und kräftig.' this.fs = { - stat: sinon.stub().yields(null, this.stats), + stat: sinon.stub().resolves(this.stats), readFile: sinon.stub(), } this.fs.readFile .withArgs('utf8.tex') - .yields(null, Buffer.from(fileContents, 'utf-8')) + .resolves(Buffer.from(fileContents, 'utf-8')) this.fs.readFile .withArgs('utf16.tex') - .yields(null, Buffer.from(`\uFEFF${fileContents}`, 'utf-16le')) + .resolves(Buffer.from(`\uFEFF${fileContents}`, 'utf-16le')) this.fs.readFile .withArgs('latin1.tex') - .yields(null, Buffer.from(fileContents, 'latin1')) + .resolves(Buffer.from(fileContents, 'latin1')) this.fs.readFile .withArgs('latin1-null.tex') - .yields(null, Buffer.from(`${fileContents}\x00${fileContents}`, 'utf-8')) + .resolves(Buffer.from(`${fileContents}\x00${fileContents}`, 'utf-8')) this.fs.readFile .withArgs('utf8-null.tex') - .yields(null, Buffer.from(`${fileContents}\x00${fileContents}`, 'utf-8')) + .resolves(Buffer.from(`${fileContents}\x00${fileContents}`, 'utf-8')) this.fs.readFile .withArgs('utf8-non-bmp.tex') - .yields(null, Buffer.from(`${fileContents}😈`)) + .resolves(Buffer.from(`${fileContents}😈`)) this.fs.readFile .withArgs('utf8-control-chars.tex') - .yields(null, Buffer.from(`${fileContents}\x0c${fileContents}`)) + .resolves(Buffer.from(`${fileContents}\x0c${fileContents}`)) this.fs.readFile .withArgs('text-short.tex') - .yields(null, Buffer.from('a'.repeat(0.5 * 1024 * 1024), 'utf-8')) + .resolves(Buffer.from('a'.repeat(0.5 * 1024 * 1024), 'utf-8')) this.fs.readFile .withArgs('text-smaller.tex') - .yields(null, Buffer.from('a'.repeat(2 * 1024 * 1024 - 1), 'utf-8')) + .resolves(Buffer.from('a'.repeat(2 * 1024 * 1024 - 1), 'utf-8')) this.fs.readFile .withArgs('text-exact.tex') - .yields(null, Buffer.from('a'.repeat(2 * 1024 * 1024), 'utf-8')) + .resolves(Buffer.from('a'.repeat(2 * 1024 * 1024), 'utf-8')) this.fs.readFile .withArgs('text-long.tex') - .yields(null, Buffer.from('a'.repeat(3 * 1024 * 1024), 'utf-8')) - this.callback = sinon.stub() - this.DocumentHelper = { getEncodingFromTexContent: sinon.stub() } - this.FileTypeManager = SandboxedModule.require(modulePath, { + .resolves(Buffer.from('a'.repeat(3 * 1024 * 1024), 'utf-8')) + + this.FileTypeManager = SandboxedModule.require(MODULE_PATH, { requires: { - fs: this.fs, + 'fs/promises': this.fs, 'utf-8-validate': this.isUtf8, '@overleaf/settings': Settings, }, @@ -88,6 +89,32 @@ describe('FileTypeManager', function () { }) }) + describe('isEditable', function () { + it('classifies simple UTF-8 as editable', function () { + expect(this.FileTypeManager.isEditable(fileContents)).to.be.true + }) + + it('classifies text with non-BMP characters as binary', function () { + expect(this.FileTypeManager.isEditable(`${fileContents}😈`)).to.be.false + }) + + it('classifies a .tex file as editable', function () { + expect( + this.FileTypeManager.isEditable(fileContents, { + filename: 'some/file.tex', + }) + ).to.be.true + }) + + it('classifies a .exe file as binary', function () { + expect( + this.FileTypeManager.isEditable(fileContents, { + filename: 'command.exe', + }) + ).to.be.false + }) + }) + describe('getType', function () { describe('when the file extension is text', function () { const TEXT_FILENAMES = [ @@ -335,76 +362,58 @@ describe('FileTypeManager', function () { describe('shouldIgnore', function () { it('should ignore tex auxiliary files', async function () { - const ignore = - await this.FileTypeManager.promises.shouldIgnore('file.aux') + const ignore = this.FileTypeManager.shouldIgnore('file.aux') ignore.should.equal(true) }) it('should ignore dotfiles', async function () { - const ignore = - await this.FileTypeManager.promises.shouldIgnore('path/.git') - + const ignore = this.FileTypeManager.shouldIgnore('path/.git') ignore.should.equal(true) }) it('should ignore .git directories and contained files', async function () { - const ignore = - await this.FileTypeManager.promises.shouldIgnore('path/.git/info') - + const ignore = await this.FileTypeManager.shouldIgnore('path/.git/info') ignore.should.equal(true) }) it('should not ignore .latexmkrc dotfile', async function () { - const ignore = - await this.FileTypeManager.promises.shouldIgnore('path/.latexmkrc') - + const ignore = this.FileTypeManager.shouldIgnore('path/.latexmkrc') ignore.should.equal(false) }) it('should ignore __MACOSX', async function () { - const ignore = - await this.FileTypeManager.promises.shouldIgnore('path/__MACOSX') - + const ignore = this.FileTypeManager.shouldIgnore('path/__MACOSX') ignore.should.equal(true) }) it('should ignore synctex files', async function () { - const ignore = - await this.FileTypeManager.promises.shouldIgnore('file.synctex') - + const ignore = this.FileTypeManager.shouldIgnore('file.synctex') ignore.should.equal(true) }) it('should ignore synctex(busy) files', async function () { - const ignore = - await this.FileTypeManager.promises.shouldIgnore('file.synctex(busy)') - + const ignore = this.FileTypeManager.shouldIgnore('file.synctex(busy)') ignore.should.equal(true) }) it('should not ignore .tex files', async function () { - const ignore = - await this.FileTypeManager.promises.shouldIgnore('file.tex') - + const ignore = this.FileTypeManager.shouldIgnore('file.tex') ignore.should.equal(false) }) it('should ignore the case of the extension', async function () { - const ignore = - await this.FileTypeManager.promises.shouldIgnore('file.AUX') - + const ignore = this.FileTypeManager.shouldIgnore('file.AUX') ignore.should.equal(true) }) it('should not ignore files with an ignored extension as full name', async function () { - const ignore = await this.FileTypeManager.promises.shouldIgnore('dvi') + const ignore = this.FileTypeManager.shouldIgnore('dvi') ignore.should.equal(false) }) it('should not ignore directories with an ignored extension as full name', async function () { this.stats.isDirectory.returns(true) - const ignore = await this.FileTypeManager.promises.shouldIgnore('dvi') - + const ignore = this.FileTypeManager.shouldIgnore('dvi') ignore.should.equal(false) }) })