Laurent Cozic 2024-01-08 10:48:51 +00:00
parent 4e8863d81f
commit 20b1c2e7cb
5 changed files with 90 additions and 125 deletions

View File

@ -6,16 +6,16 @@ const os = require('os');
const { filename } = require('./path-utils'); const { filename } = require('./path-utils');
import { setupDatabaseAndSynchronizer, switchClient, expectNotThrow, supportDir, expectThrow } from './testing/test-utils'; import { setupDatabaseAndSynchronizer, switchClient, expectNotThrow, supportDir, expectThrow } from './testing/test-utils';
const { enexXmlToMd } = require('./import-enex-md-gen.js'); const { enexXmlToMd } = require('./import-enex-md-gen.js');
import importEnex from './import-enex'; import importEnex, { ImportOptions } from './import-enex';
import Note from './models/Note'; import Note from './models/Note';
import Tag from './models/Tag'; import Tag from './models/Tag';
import Resource from './models/Resource'; import Resource from './models/Resource';
const enexSampleBaseDir = `${supportDir}/../enex_to_md`; const enexSampleBaseDir = `${supportDir}/../enex_to_md`;
const importEnexFile = async (filename: string) => { const importEnexFile = async (filename: string, options: ImportOptions = null) => {
const filePath = `${enexSampleBaseDir}/${filename}`; const filePath = `${enexSampleBaseDir}/${filename}`;
await importEnex('', filePath); await importEnex('', filePath, options);
}; };
const readExpectedFile = async (filename: string) => { const readExpectedFile = async (filename: string) => {
@ -221,7 +221,7 @@ describe('import-enex-md-gen', () => {
}); });
it('should resolve note links', async () => { it('should resolve note links', async () => {
await importEnexFile('linked_notes.enex'); await importEnexFile('linked_notes.enex', { batchSize: 1 });
const notes: NoteEntity[] = await Note.all(); const notes: NoteEntity[] = await Note.all();
const note1 = notes.find(n => n.title === 'Note 1'); const note1 = notes.find(n => n.title === 'Note 1');

View File

@ -58,7 +58,6 @@ interface ParserState {
spanAttributes: string[]; spanAttributes: string[];
tags: ParserStateTag[]; tags: ParserStateTag[];
currentCode?: string; currentCode?: string;
evernoteLinkTitles: Record<string, string>;
} }
@ -608,7 +607,6 @@ function enexXmlToMdArray(stream: any, resources: ResourceEntity[], tasks: Extra
anchorAttributes: [], anchorAttributes: [],
spanAttributes: [], spanAttributes: [],
tags: [], tags: [],
evernoteLinkTitles: {},
}; };
const options = {}; const options = {};

View File

@ -1,10 +1,8 @@
import uuid from './uuid'; import uuid from './uuid';
import BaseModel from './BaseModel';
import Note from './models/Note'; import Note from './models/Note';
import Tag from './models/Tag'; import Tag from './models/Tag';
import Resource from './models/Resource'; import Resource from './models/Resource';
import Setting from './models/Setting'; import Setting from './models/Setting';
import time from './time';
import shim from './shim'; import shim from './shim';
import { NoteEntity, ResourceEntity } from './services/database/types'; import { NoteEntity, ResourceEntity } from './services/database/types';
import { enexXmlToMd } from './import-enex-md-gen'; import { enexXmlToMd } from './import-enex-md-gen';
@ -15,7 +13,6 @@ import { extractUrls as extractUrlsFromMarkdown } from '@joplin/utils/markdown';
const moment = require('moment'); const moment = require('moment');
const { wrapError } = require('./errorUtils'); const { wrapError } = require('./errorUtils');
const { enexXmlToHtml } = require('./import-enex-html-gen.js'); const { enexXmlToHtml } = require('./import-enex-html-gen.js');
const Levenshtein = require('levenshtein');
const md5 = require('md5'); const md5 = require('md5');
const { Base64Decode } = require('base64-stream'); const { Base64Decode } = require('base64-stream');
const md5File = require('md5-file'); const md5File = require('md5-file');
@ -96,38 +93,6 @@ function removeUndefinedProperties(note: NoteEntity) {
return output; return output;
} }
// Returns the Levenshtein distance between `s1` and `s2`, expressed relative
// to the length of `s1`. When either string is empty, 1 is returned.
function levenshteinPercent(s1: string, s2: string) {
	const comparison = new Levenshtein(s1, s2);
	const eitherIsEmpty = s1.length === 0 || s2.length === 0;
	return eitherIsEmpty ? 1 : Math.abs(comparison.distance / s1.length);
}
// Looks up an existing, non-conflict note that matches the given extracted
// note.
//
// - If the note was created more than 360 days ago, a match requires the same
//   creation time AND the same title, and there must be exactly one such note.
// - Otherwise, candidates are the notes with the same creation time. A single
//   candidate is returned directly. With several candidates, the one whose
//   title is closest to the note title (as measured by `levenshteinPercent`)
//   is returned, provided its score is below 0.2.
//
// Returns `null` when no match is found.
async function fuzzyMatch(note: ExtractedNote) {
	const isOldNote = note.created_time < time.unixMs() - 1000 * 60 * 60 * 24 * 360;

	if (isOldNote) {
		const exactMatches = await Note.modelSelectAll('SELECT * FROM notes WHERE is_conflict = 0 AND created_time = ? AND title = ?', [note.created_time, note.title]);
		return exactMatches.length === 1 ? exactMatches[0] : null;
	}

	const candidates = await Note.modelSelectAll('SELECT * FROM notes WHERE is_conflict = 0 AND created_time = ?', [note.created_time]);
	if (candidates.length === 0) return null;
	if (candidates.length === 1) return candidates[0];

	// Keep the candidate with the lowest score; only scores strictly below the
	// initial value of 1 are considered.
	let bestScore = 1;
	let bestCandidate = null;
	for (const candidate of candidates) {
		const score = levenshteinPercent(note.title, candidate.title);
		if (score < bestScore) {
			bestScore = score;
			bestCandidate = candidate;
		}
	}

	return bestCandidate && bestScore < 0.2 ? bestCandidate : null;
}
interface ExtractedResource { interface ExtractedResource {
hasData?: boolean; hasData?: boolean;
id?: string; id?: string;
@ -155,6 +120,14 @@ interface ExtractedNote extends NoteEntity {
bodyXml?: string; bodyXml?: string;
} }
/**
 * A note that has been parsed and saved to Joplin.
 *
 * The whole `ExtractedNote` is not kept in memory because it contains
 * resource data, etc. Only what is needed to restore the note links is kept.
 */
interface SavedNote {
	/** ID of the saved note. */
	id: string;
	/** Body of the saved note. */
	body: string;
}
// At this point we have the resource as it's been parsed from the XML, but // At this point we have the resource as it's been parsed from the XML, but
// additional processing needs to be done to get the final resource file, its // additional processing needs to be done to get the final resource file, its
// size, MD5, etc. // size, MD5, etc.
@ -245,26 +218,19 @@ async function saveNoteTags(note: ExtractedNote) {
return notesTagged; return notesTagged;
} }
interface ImportOptions { export interface ImportOptions {
fuzzyMatching?: boolean;
// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
onProgress?: Function; onProgress?: Function;
// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
onError?: Function; onError?: Function;
outputFormat?: string; outputFormat?: string;
batchSize?: number;
} }
async function saveNoteToStorage(note: ExtractedNote, importOptions: ImportOptions) { async function saveNoteToStorage(note: ExtractedNote) {
importOptions = { fuzzyMatching: false, ...importOptions };
note = Note.filter(note as any); note = Note.filter(note as any);
const existingNote = importOptions.fuzzyMatching ? await fuzzyMatch(note) : null;
const result = { const result = {
noteCreated: false,
noteUpdated: false,
noteSkipped: false,
resourcesCreated: 0, resourcesCreated: 0,
notesTagged: 0, notesTagged: 0,
}; };
@ -275,28 +241,10 @@ async function saveNoteToStorage(note: ExtractedNote, importOptions: ImportOptio
const notesTagged = await saveNoteTags(note); const notesTagged = await saveNoteTags(note);
result.notesTagged += notesTagged; result.notesTagged += notesTagged;
if (existingNote) {
const diff = BaseModel.diffObjects(existingNote, note);
delete diff.tags;
delete diff.resources;
delete diff.id;
if (!Object.getOwnPropertyNames(diff).length) {
result.noteSkipped = true;
return result;
}
diff.id = existingNote.id;
diff.type_ = existingNote.type_;
await Note.save(diff, { autoTimestamp: false });
result.noteUpdated = true;
} else {
await Note.save(note, { await Note.save(note, {
isNew: true, isNew: true,
autoTimestamp: false, autoTimestamp: false,
}); });
result.noteCreated = true;
}
return result; return result;
} }
@ -345,12 +293,47 @@ const preProcessFile = async (filePath: string): Promise<string> => {
// return newFilePath; // return newFilePath;
}; };
export default async function importEnex(parentFolderId: string, filePath: string, importOptions: ImportOptions = null) {
if (!importOptions) importOptions = {};
if (!('fuzzyMatching' in importOptions)) importOptions.fuzzyMatching = false;
if (!('onProgress' in importOptions)) importOptions.onProgress = function() {};
if (!('onError' in importOptions)) importOptions.onError = function() {};
/**
 * Converts the Evernote note links found in the saved notes to Joplin note
 * links.
 *
 * The URLs are extracted from each note body (as HTML when
 * `importOptions.outputFormat` is `'html'`, as Markdown otherwise). A link is
 * rewritten to `:/<noteId>` only when its title maps to exactly one note ID
 * in `noteTitlesToIds`. If we don't find a matching note, or if there are
 * multiple matching notes, the Evernote link is left as is. Notes whose body
 * was changed are saved again with `updated_time` set to the current time.
 *
 * @param notes Notes to process. `body` is updated in place when a link is rewritten.
 * @param noteTitlesToIds Map from a note title to the IDs of the notes with that title.
 * @param importOptions Import options; only `outputFormat` is read here.
 */
const restoreNoteLinks = async (notes: SavedNote[], noteTitlesToIds: Record<string, string[]>, importOptions: ImportOptions) => {
	const isHtml = importOptions.outputFormat === 'html';

	for (const note of notes) {
		const links = isHtml ?
			extractUrlsFromHtml(note.body) :
			extractUrlsFromMarkdown(note.body);

		let noteChanged = false;

		for (const link of links) {
			// `noteTitlesToIds` is a plain object, so only own properties can
			// be trusted. Without this check a link titled "constructor"
			// would resolve to `Object` (whose `length` is 1) through the
			// prototype chain, and the link would become ":/undefined".
			if (!Object.prototype.hasOwnProperty.call(noteTitlesToIds, link.title)) continue;

			const matchingNoteIds = noteTitlesToIds[link.title];
			if (!Array.isArray(matchingNoteIds) || matchingNoteIds.length !== 1) continue;

			note.body = note.body.replace(link.url, `:/${matchingNoteIds[0]}`);
			noteChanged = true;
		}

		if (!noteChanged) continue;

		// `autoTimestamp` is disabled and `updated_time` is set explicitly.
		await Note.save({
			id: note.id,
			body: note.body,
			updated_time: Date.now(),
		}, {
			autoTimestamp: false,
		});
	}
};
/**
 * Result of `parseNotes`: the data needed afterwards to restore the note
 * links.
 */
interface ParseNotesResult {
	/** The notes that have been saved (ID and body only). */
	savedNotes: SavedNote[];
	/** Map from a note title to the IDs of the saved notes with that title. */
	noteTitlesToIds: Record<string, string[]>;
}
const parseNotes = async (parentFolderId: string, filePath: string, importOptions: ImportOptions = null): Promise<ParseNotesResult> => {
// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
function handleSaxStreamEvent(fn: Function) { function handleSaxStreamEvent(fn: Function) {
return function(...args: any[]) { return function(...args: any[]) {
@ -397,6 +380,9 @@ export default async function importEnex(parentFolderId: string, filePath: strin
let noteResourceRecognition: NoteResourceRecognition = null; let noteResourceRecognition: NoteResourceRecognition = null;
const notes: ExtractedNote[] = []; const notes: ExtractedNote[] = [];
let processingNotes = false; let processingNotes = false;
const savedNotes: SavedNote[] = [];
const createdNoteIds: string[] = [];
const noteTitlesToIds: Record<string, string[]> = {};
const createErrorWithNoteTitle = (fnThis: any, error: any) => { const createErrorWithNoteTitle = (fnThis: any, error: any) => {
const line = []; const line = [];
@ -437,15 +423,6 @@ export default async function importEnex(parentFolderId: string, filePath: strin
processingNotes = true; processingNotes = true;
stream.pause(); stream.pause();
// Set the note ID so that we can create a title-to-id map, which
// will be needed to recreate the note links below.
const noteTitleToId: Record<string, string[]> = {};
for (const note of notes) {
if (!noteTitleToId[note.title]) noteTitleToId[note.title] = [];
note.id = uuid.create();
noteTitleToId[note.title].push(note.id);
}
while (notes.length) { while (notes.length) {
const note = notes.shift(); const note = notes.shift();
@ -467,32 +444,16 @@ export default async function importEnex(parentFolderId: string, filePath: strin
// Convert the ENEX body to either Markdown or HTML // Convert the ENEX body to either Markdown or HTML
// -------------------------------------------------------- // --------------------------------------------------------
let body: string = importOptions.outputFormat === 'html' ? const body: string = importOptions.outputFormat === 'html' ?
await enexXmlToHtml(note.bodyXml, note.resources) : await enexXmlToHtml(note.bodyXml, note.resources) :
await enexXmlToMd(note.bodyXml, note.resources, note.tasks); await enexXmlToMd(note.bodyXml, note.resources, note.tasks);
delete note.bodyXml; delete note.bodyXml;
// --------------------------------------------------------
// Convert the Evernote note links to Joplin note links. If
// we don't find a matching note, or if there are multiple
// matching notes, we leave the Evernote links as is.
// --------------------------------------------------------
const links = importOptions.outputFormat === 'html' ?
extractUrlsFromHtml(body) :
extractUrlsFromMarkdown(body);
for (const link of links) {
const matchingNoteIds = noteTitleToId[link.title];
if (matchingNoteIds && matchingNoteIds.length === 1) {
body = body.replace(link.url, `:/${matchingNoteIds[0]}`);
}
}
// -------------------------------------------------------- // --------------------------------------------------------
// Finish setting up the note // Finish setting up the note
// -------------------------------------------------------- // --------------------------------------------------------
note.id = uuid.create();
note.markup_language = importOptions.outputFormat === 'html' ? note.markup_language = importOptions.outputFormat === 'html' ?
MarkupToHtml.MARKUP_LANGUAGE_HTML : MarkupToHtml.MARKUP_LANGUAGE_HTML :
MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN; MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN;
@ -511,15 +472,17 @@ export default async function importEnex(parentFolderId: string, filePath: strin
// that case // that case
if (!note.updated_time) note.updated_time = note.created_time; if (!note.updated_time) note.updated_time = note.created_time;
const result = await saveNoteToStorage(note, importOptions); const result = await saveNoteToStorage(note);
createdNoteIds.push(note.id);
if (!noteTitlesToIds[note.title]) noteTitlesToIds[note.title] = [];
noteTitlesToIds[note.title].push(note.id);
savedNotes.push({
id: note.id,
body: note.body,
});
if (result.noteUpdated) {
progressState.updated++;
} else if (result.noteCreated) {
progressState.created++; progressState.created++;
} else if (result.noteSkipped) {
progressState.skipped++;
}
progressState.resourcesCreated += result.resourcesCreated; progressState.resourcesCreated += result.resourcesCreated;
progressState.notesTagged += result.notesTagged; progressState.notesTagged += result.notesTagged;
importOptions.onProgress(progressState); importOptions.onProgress(progressState);
@ -648,7 +611,7 @@ export default async function importEnex(parentFolderId: string, filePath: strin
notes.push(note); notes.push(note);
if (notes.length >= 10) { if (notes.length >= importOptions.batchSize) {
// eslint-disable-next-line promise/prefer-await-to-then -- Old code before rule was applied // eslint-disable-next-line promise/prefer-await-to-then -- Old code before rule was applied
processNotes().catch(error => { processNotes().catch(error => {
importOptions.onError(createErrorWithNoteTitle(this, error)); importOptions.onError(createErrorWithNoteTitle(this, error));
@ -718,12 +681,25 @@ export default async function importEnex(parentFolderId: string, filePath: strin
if (allDone) { if (allDone) {
shim.clearTimeout(iid); shim.clearTimeout(iid);
if (needToDeleteFileToProcess) void shim.fsDriver().remove(fileToProcess); if (needToDeleteFileToProcess) void shim.fsDriver().remove(fileToProcess);
resolve(null); resolve({
savedNotes,
noteTitlesToIds,
});
} }
}); });
}, 500); }, 1000);
})); }));
stream.pipe(saxStream); stream.pipe(saxStream);
}); });
};
/**
 * Imports the notes of an ENEX file, then restores the links between the
 * imported notes.
 *
 * The options are normalised into a new object, so the object passed by the
 * caller is not modified. An option that is missing, or present but not
 * usable, falls back to its default:
 *
 * - `onProgress` and `onError` default to a no-op function.
 * - `batchSize` defaults to 10.
 *
 * @param parentFolderId Passed as is to `parseNotes`.
 * @param filePath Passed as is to `parseNotes`.
 * @param importOptions Optional import options.
 */
export default async function importEnex(parentFolderId: string, filePath: string, importOptions: ImportOptions = null) {
	const input: ImportOptions = importOptions ? importOptions : {};

	// Checking with `'key' in importOptions` is not enough: a key explicitly
	// set to `undefined` would pass, then `onProgress()` / `onError()` would
	// throw, and comparing the number of pending notes with an undefined
	// `batchSize` would never be true.
	const hasValidBatchSize = typeof input.batchSize === 'number' && !Number.isNaN(input.batchSize);

	const options: ImportOptions = {
		...input,
		onProgress: typeof input.onProgress === 'function' ? input.onProgress : function() {},
		onError: typeof input.onError === 'function' ? input.onError : function() {},
		batchSize: hasValidBatchSize ? input.batchSize : 10,
	};

	const result = await parseNotes(parentFolderId, filePath, options);
	await restoreNoteLinks(result.savedNotes, result.noteTitlesToIds, options);
}

View File

@ -66,7 +66,6 @@
"image-type": "3.1.0", "image-type": "3.1.0",
"immer": "7.0.15", "immer": "7.0.15",
"js-yaml": "4.1.0", "js-yaml": "4.1.0",
"levenshtein": "1.0.5",
"markdown-it": "13.0.2", "markdown-it": "13.0.2",
"md5": "2.3.0", "md5": "2.3.0",
"md5-file": "5.0.0", "md5-file": "5.0.0",

View File

@ -6871,7 +6871,6 @@ __metadata:
immer: 7.0.15 immer: 7.0.15
jest: 29.7.0 jest: 29.7.0
js-yaml: 4.1.0 js-yaml: 4.1.0
levenshtein: 1.0.5
markdown-it: 13.0.2 markdown-it: 13.0.2
md5: 2.3.0 md5: 2.3.0
md5-file: 5.0.0 md5-file: 5.0.0
@ -27688,13 +27687,6 @@ __metadata:
languageName: node languageName: node
linkType: hard linkType: hard
"levenshtein@npm:1.0.5":
version: 1.0.5
resolution: "levenshtein@npm:1.0.5"
checksum: d5ceca3bfc4804ad50515291841d968eea5f1f740310c21b5ae6cb6d5514ee68b9c00405059f36934611d3258967bad6d306dcf299f446c7cdd25bdda2c4720c
languageName: node
linkType: hard
"levn@npm:^0.4.1": "levn@npm:^0.4.1":
version: 0.4.1 version: 0.4.1
resolution: "levn@npm:0.4.1" resolution: "levn@npm:0.4.1"