Laurent Cozic 2024-01-08 10:48:51 +00:00
parent 4e8863d81f
commit 20b1c2e7cb
5 changed files with 90 additions and 125 deletions

View File

@ -6,16 +6,16 @@ const os = require('os');
const { filename } = require('./path-utils');
import { setupDatabaseAndSynchronizer, switchClient, expectNotThrow, supportDir, expectThrow } from './testing/test-utils';
const { enexXmlToMd } = require('./import-enex-md-gen.js');
import importEnex from './import-enex';
import importEnex, { ImportOptions } from './import-enex';
import Note from './models/Note';
import Tag from './models/Tag';
import Resource from './models/Resource';
const enexSampleBaseDir = `${supportDir}/../enex_to_md`;
const importEnexFile = async (filename: string) => {
const importEnexFile = async (filename: string, options: ImportOptions = null) => {
const filePath = `${enexSampleBaseDir}/${filename}`;
await importEnex('', filePath);
await importEnex('', filePath, options);
};
const readExpectedFile = async (filename: string) => {
@ -221,7 +221,7 @@ describe('import-enex-md-gen', () => {
});
it('should resolve note links', async () => {
await importEnexFile('linked_notes.enex');
await importEnexFile('linked_notes.enex', { batchSize: 1 });
const notes: NoteEntity[] = await Note.all();
const note1 = notes.find(n => n.title === 'Note 1');

View File

@ -58,7 +58,6 @@ interface ParserState {
spanAttributes: string[];
tags: ParserStateTag[];
currentCode?: string;
evernoteLinkTitles: Record<string, string>;
}
@ -608,7 +607,6 @@ function enexXmlToMdArray(stream: any, resources: ResourceEntity[], tasks: Extra
anchorAttributes: [],
spanAttributes: [],
tags: [],
evernoteLinkTitles: {},
};
const options = {};

View File

@ -1,10 +1,8 @@
import uuid from './uuid';
import BaseModel from './BaseModel';
import Note from './models/Note';
import Tag from './models/Tag';
import Resource from './models/Resource';
import Setting from './models/Setting';
import time from './time';
import shim from './shim';
import { NoteEntity, ResourceEntity } from './services/database/types';
import { enexXmlToMd } from './import-enex-md-gen';
@ -15,7 +13,6 @@ import { extractUrls as extractUrlsFromMarkdown } from '@joplin/utils/markdown';
const moment = require('moment');
const { wrapError } = require('./errorUtils');
const { enexXmlToHtml } = require('./import-enex-html-gen.js');
const Levenshtein = require('levenshtein');
const md5 = require('md5');
const { Base64Decode } = require('base64-stream');
const md5File = require('md5-file');
@ -96,38 +93,6 @@ function removeUndefinedProperties(note: NoteEntity) {
return output;
}
// Returns the Levenshtein distance between `s1` and `s2` expressed as a
// fraction of the length of `s1`. 0 means that the strings are identical.
//
// When either string is empty, 1 is returned. The result is normalised by the
// length of `s1` only, so the function is not symmetric: swapping the
// arguments may give a different value.
function levenshteinPercent(s1: string, s2: string) {
	// Handle the trivial case first so that the distance is not computed only
	// to be discarded by the early return.
	if (!s1.length || !s2.length) return 1;

	const l = new Levenshtein(s1, s2);
	return Math.abs(l.distance / s1.length);
}
// Looks for an existing, non-conflict note that appears to be the same note as
// `note`. Returns that note, or `null` when nothing matches or when the match
// is ambiguous.
//
// - If `note` was created more than 360 days ago, exactly one existing note
//   must have the same creation time and the same title.
// - Otherwise candidates are selected by creation time only. A single
//   candidate is accepted as is. With several candidates, the one whose title
//   is closest to the title of `note` is returned, provided that its
//   `levenshteinPercent` score is below 0.2.
async function fuzzyMatch(note: ExtractedNote) {
	const oldNoteLimit = time.unixMs() - 1000 * 60 * 60 * 24 * 360;

	if (note.created_time < oldNoteLimit) {
		const exactMatches = await Note.modelSelectAll('SELECT * FROM notes WHERE is_conflict = 0 AND created_time = ? AND title = ?', [note.created_time, note.title]);
		return exactMatches.length === 1 ? exactMatches[0] : null;
	}

	const candidates = await Note.modelSelectAll('SELECT * FROM notes WHERE is_conflict = 0 AND created_time = ?', [note.created_time]);

	switch (candidates.length) {
	case 0:
		return null;
	case 1:
		return candidates[0];
	}

	// Several notes share the creation time: keep the one with the lowest
	// title distance. A candidate must score strictly below 1 to be kept.
	let bestScore = 1;
	let bestCandidate = null;

	for (const candidate of candidates) {
		const score = levenshteinPercent(note.title, candidate.title);
		if (score < bestScore) {
			bestScore = score;
			bestCandidate = candidate;
		}
	}

	return bestCandidate && bestScore < 0.2 ? bestCandidate : null;
}
interface ExtractedResource {
hasData?: boolean;
id?: string;
@ -155,6 +120,14 @@ interface ExtractedNote extends NoteEntity {
bodyXml?: string;
}
// These are the notes that have been parsed and saved to Joplin. We don't keep
// the whole `ExtractedNote` in memory because it contains resource data, etc.
// We only keep what is needed to restore the note links.
interface SavedNote {
// ID under which the note was saved
id: string;
// Body of the saved note. This is the text in which the note links are
// looked up and replaced.
body: string;
}
// At this point we have the resource as it's been parsed from the XML, but
// additional processing needs to be done to get the final resource file, its
// size, MD5, etc.
@ -245,26 +218,19 @@ async function saveNoteTags(note: ExtractedNote) {
return notesTagged;
}
interface ImportOptions {
fuzzyMatching?: boolean;
export interface ImportOptions {
// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
onProgress?: Function;
// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
onError?: Function;
outputFormat?: string;
batchSize?: number;
}
async function saveNoteToStorage(note: ExtractedNote, importOptions: ImportOptions) {
importOptions = { fuzzyMatching: false, ...importOptions };
async function saveNoteToStorage(note: ExtractedNote) {
note = Note.filter(note as any);
const existingNote = importOptions.fuzzyMatching ? await fuzzyMatch(note) : null;
const result = {
noteCreated: false,
noteUpdated: false,
noteSkipped: false,
resourcesCreated: 0,
notesTagged: 0,
};
@ -275,28 +241,10 @@ async function saveNoteToStorage(note: ExtractedNote, importOptions: ImportOptio
const notesTagged = await saveNoteTags(note);
result.notesTagged += notesTagged;
if (existingNote) {
const diff = BaseModel.diffObjects(existingNote, note);
delete diff.tags;
delete diff.resources;
delete diff.id;
if (!Object.getOwnPropertyNames(diff).length) {
result.noteSkipped = true;
return result;
}
diff.id = existingNote.id;
diff.type_ = existingNote.type_;
await Note.save(diff, { autoTimestamp: false });
result.noteUpdated = true;
} else {
await Note.save(note, {
isNew: true,
autoTimestamp: false,
});
result.noteCreated = true;
}
return result;
}
@ -345,12 +293,47 @@ const preProcessFile = async (filePath: string): Promise<string> => {
// return newFilePath;
};
export default async function importEnex(parentFolderId: string, filePath: string, importOptions: ImportOptions = null) {
if (!importOptions) importOptions = {};
if (!('fuzzyMatching' in importOptions)) importOptions.fuzzyMatching = false;
if (!('onProgress' in importOptions)) importOptions.onProgress = function() {};
if (!('onError' in importOptions)) importOptions.onError = function() {};
// --------------------------------------------------------
// Convert the Evernote note links to Joplin note links. If
// we don't find a matching note, or if there are multiple
// matching notes, we leave the Evernote links as is.
//
// - `notes`: the saved notes to process. The `body` of a note is updated in
//   place when one of its links is converted, and the note is then saved
//   with a new `updated_time`.
// - `noteTitlesToIds`: maps a note title to the IDs of all the notes that
//   have this title.
// - `importOptions`: only `outputFormat` is read, to know whether the note
//   bodies must be parsed as HTML or as Markdown.
// --------------------------------------------------------
const restoreNoteLinks = async (notes: SavedNote[], noteTitlesToIds: Record<string, string[]>, importOptions: ImportOptions) => {
	for (const note of notes) {
		const links = importOptions.outputFormat === 'html' ?
			extractUrlsFromHtml(note.body) :
			extractUrlsFromMarkdown(note.body);

		let noteChanged = false;

		for (const link of links) {
			// The map is a plain object and the link titles come from the
			// imported document, so only own properties must be considered.
			// Otherwise a link titled "constructor" or "hasOwnProperty" would
			// resolve to a function inherited from `Object.prototype` whose
			// `length` is 1, and the link would be rewritten to ":/undefined".
			const matchingNoteIds = Object.prototype.hasOwnProperty.call(noteTitlesToIds, link.title) ?
				noteTitlesToIds[link.title] :
				null;

			if (!Array.isArray(matchingNoteIds) || matchingNoteIds.length !== 1) continue;

			// With a string pattern, `replace` only changes the first
			// occurrence of the URL that is still present in the body.
			note.body = note.body.replace(link.url, `:/${matchingNoteIds[0]}`);
			noteChanged = true;
		}

		if (noteChanged) {
			await Note.save({
				id: note.id,
				body: note.body,
				updated_time: Date.now(),
			}, {
				// `updated_time` is set explicitly above
				autoTimestamp: false,
			});
		}
	}
};
// What `parseNotes` resolves with once the notes have been processed
interface ParseNotesResult {
// The notes that have been saved, reduced to their ID and body
savedNotes: SavedNote[];
// Maps a note title to the IDs of all the saved notes that have this title
noteTitlesToIds: Record<string, string[]>;
}
const parseNotes = async (parentFolderId: string, filePath: string, importOptions: ImportOptions = null): Promise<ParseNotesResult> => {
// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
function handleSaxStreamEvent(fn: Function) {
return function(...args: any[]) {
@ -397,6 +380,9 @@ export default async function importEnex(parentFolderId: string, filePath: strin
let noteResourceRecognition: NoteResourceRecognition = null;
const notes: ExtractedNote[] = [];
let processingNotes = false;
const savedNotes: SavedNote[] = [];
const createdNoteIds: string[] = [];
const noteTitlesToIds: Record<string, string[]> = {};
const createErrorWithNoteTitle = (fnThis: any, error: any) => {
const line = [];
@ -437,15 +423,6 @@ export default async function importEnex(parentFolderId: string, filePath: strin
processingNotes = true;
stream.pause();
// Set the note ID so that we can create a title-to-id map, which
// will be needed to recreate the note links below.
const noteTitleToId: Record<string, string[]> = {};
for (const note of notes) {
if (!noteTitleToId[note.title]) noteTitleToId[note.title] = [];
note.id = uuid.create();
noteTitleToId[note.title].push(note.id);
}
while (notes.length) {
const note = notes.shift();
@ -467,32 +444,16 @@ export default async function importEnex(parentFolderId: string, filePath: strin
// Convert the ENEX body to either Markdown or HTML
// --------------------------------------------------------
let body: string = importOptions.outputFormat === 'html' ?
const body: string = importOptions.outputFormat === 'html' ?
await enexXmlToHtml(note.bodyXml, note.resources) :
await enexXmlToMd(note.bodyXml, note.resources, note.tasks);
delete note.bodyXml;
// --------------------------------------------------------
// Convert the Evernote note links to Joplin note links. If
// we don't find a matching note, or if there are multiple
// matching notes, we leave the Evernote links as is.
// --------------------------------------------------------
const links = importOptions.outputFormat === 'html' ?
extractUrlsFromHtml(body) :
extractUrlsFromMarkdown(body);
for (const link of links) {
const matchingNoteIds = noteTitleToId[link.title];
if (matchingNoteIds && matchingNoteIds.length === 1) {
body = body.replace(link.url, `:/${matchingNoteIds[0]}`);
}
}
// --------------------------------------------------------
// Finish setting up the note
// --------------------------------------------------------
note.id = uuid.create();
note.markup_language = importOptions.outputFormat === 'html' ?
MarkupToHtml.MARKUP_LANGUAGE_HTML :
MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN;
@ -511,15 +472,17 @@ export default async function importEnex(parentFolderId: string, filePath: strin
// that case
if (!note.updated_time) note.updated_time = note.created_time;
const result = await saveNoteToStorage(note, importOptions);
const result = await saveNoteToStorage(note);
createdNoteIds.push(note.id);
if (!noteTitlesToIds[note.title]) noteTitlesToIds[note.title] = [];
noteTitlesToIds[note.title].push(note.id);
savedNotes.push({
id: note.id,
body: note.body,
});
if (result.noteUpdated) {
progressState.updated++;
} else if (result.noteCreated) {
progressState.created++;
} else if (result.noteSkipped) {
progressState.skipped++;
}
progressState.resourcesCreated += result.resourcesCreated;
progressState.notesTagged += result.notesTagged;
importOptions.onProgress(progressState);
@ -648,7 +611,7 @@ export default async function importEnex(parentFolderId: string, filePath: strin
notes.push(note);
if (notes.length >= 10) {
if (notes.length >= importOptions.batchSize) {
// eslint-disable-next-line promise/prefer-await-to-then -- Old code before rule was applied
processNotes().catch(error => {
importOptions.onError(createErrorWithNoteTitle(this, error));
@ -718,12 +681,25 @@ export default async function importEnex(parentFolderId: string, filePath: strin
if (allDone) {
shim.clearTimeout(iid);
if (needToDeleteFileToProcess) void shim.fsDriver().remove(fileToProcess);
resolve(null);
resolve({
savedNotes,
noteTitlesToIds,
});
}
});
}, 500);
}, 1000);
}));
stream.pipe(saxStream);
});
};
// Entry point of the importer. Fills in the default import options, lets
// `parseNotes` process `filePath`, then calls `restoreNoteLinks` with the
// notes and the title-to-ID map that `parseNotes` returned.
//
// - `parentFolderId` and `filePath` are passed as is to `parseNotes`.
// - `importOptions` may be omitted. Defaults: `onProgress` and `onError` are
//   no-op functions and `batchSize` is 10.
export default async function importEnex(parentFolderId: string, filePath: string, importOptions: ImportOptions = null) {
	// Work on a copy so that the defaults are not written to the object that
	// belongs to the caller. Spreading `null` gives an empty object.
	const options: ImportOptions = { ...importOptions };

	// Check the values rather than the presence of the keys, so that an option
	// explicitly set to `undefined` or `null` gets its default too.
	if (!options.onProgress) options.onProgress = function() {};
	if (!options.onError) options.onError = function() {};
	if (options.batchSize === undefined || options.batchSize === null) options.batchSize = 10;

	const result = await parseNotes(parentFolderId, filePath, options);
	await restoreNoteLinks(result.savedNotes, result.noteTitlesToIds, options);
}

View File

@ -66,7 +66,6 @@
"image-type": "3.1.0",
"immer": "7.0.15",
"js-yaml": "4.1.0",
"levenshtein": "1.0.5",
"markdown-it": "13.0.2",
"md5": "2.3.0",
"md5-file": "5.0.0",

View File

@ -6871,7 +6871,6 @@ __metadata:
immer: 7.0.15
jest: 29.7.0
js-yaml: 4.1.0
levenshtein: 1.0.5
markdown-it: 13.0.2
md5: 2.3.0
md5-file: 5.0.0
@ -27688,13 +27687,6 @@ __metadata:
languageName: node
linkType: hard
"levenshtein@npm:1.0.5":
version: 1.0.5
resolution: "levenshtein@npm:1.0.5"
checksum: d5ceca3bfc4804ad50515291841d968eea5f1f740310c21b5ae6cb6d5514ee68b9c00405059f36934611d3258967bad6d306dcf299f446c7cdd25bdda2c4720c
languageName: node
linkType: hard
"levn@npm:^0.4.1":
version: 0.4.1
resolution: "levn@npm:0.4.1"