From 143f0b4bc52a30a26b07e55546d950736145f71d Mon Sep 17 00:00:00 2001 From: Laurent Cozic Date: Tue, 29 Dec 2020 15:58:20 +0000 Subject: [PATCH] Desktop, Cli: Improve support for SVG images when importing ENEX files --- .eslintignore | 3 + .gitignore | 3 + packages/app-cli/tests/enex_to_md/svg.html | 1 + packages/app-cli/tests/enex_to_md/svg.md | 1 + ...t-enex-md-gen.js => import-enex-md-gen.ts} | 213 +++++++++++------- packages/renderer/MdToHtml/setupLinkify.js | 4 + 6 files changed, 141 insertions(+), 84 deletions(-) create mode 100644 packages/app-cli/tests/enex_to_md/svg.html create mode 100644 packages/app-cli/tests/enex_to_md/svg.md rename packages/lib/{import-enex-md-gen.js => import-enex-md-gen.ts} (88%) diff --git a/.eslintignore b/.eslintignore index 2265e18644..ff7dce2561 100644 --- a/.eslintignore +++ b/.eslintignore @@ -922,6 +922,9 @@ packages/lib/fs-driver-base.js.map packages/lib/fs-driver-node.d.ts packages/lib/fs-driver-node.js packages/lib/fs-driver-node.js.map +packages/lib/import-enex-md-gen.d.ts +packages/lib/import-enex-md-gen.js +packages/lib/import-enex-md-gen.js.map packages/lib/locale.d.ts packages/lib/locale.js packages/lib/locale.js.map diff --git a/.gitignore b/.gitignore index ef31bd220d..136054ac97 100644 --- a/.gitignore +++ b/.gitignore @@ -911,6 +911,9 @@ packages/lib/fs-driver-base.js.map packages/lib/fs-driver-node.d.ts packages/lib/fs-driver-node.js packages/lib/fs-driver-node.js.map +packages/lib/import-enex-md-gen.d.ts +packages/lib/import-enex-md-gen.js +packages/lib/import-enex-md-gen.js.map packages/lib/locale.d.ts packages/lib/locale.js packages/lib/locale.js.map diff --git a/packages/app-cli/tests/enex_to_md/svg.html b/packages/app-cli/tests/enex_to_md/svg.html new file mode 100644 index 0000000000..9b3a71fd07 --- /dev/null +++ b/packages/app-cli/tests/enex_to_md/svg.html @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/packages/app-cli/tests/enex_to_md/svg.md b/packages/app-cli/tests/enex_to_md/svg.md new file mode 100644 index 0000000000..205146fcbf --- /dev/null +++ b/packages/app-cli/tests/enex_to_md/svg.md @@ -0,0 +1 @@ +![](data:image/svg+xml,%3csvg%20xmlns='http://www.w3.org/2000/svg'%20xmlns:xlink='http://www.w3.org/1999/xlink'%20x='0px'%20y='0px'%20width='16px'%20height='16px'%20viewBox='0%200%2024%2024'%20data-evernote-id='0'%20class='js-evernote-checked'%3e%3cg%20transform='translate%280%2c%200%29'%20data-evernote-id='18'%20class='js-evernote-checked'%3e%3cpolygon%20fill='none'%20stroke='%23343434'%20stroke-width='2'%20stroke-linecap='square'%20stroke-miterlimit='10'%20points='12%2c2.6%2015%2c9%2021.4%2c9%2016.7%2c13.9%2018.6%2c21.4%2012%2c17.6%205.4%2c21.4%207.3%2c13.9%202.6%2c9%209%2c9%20'%20stroke-linejoin='miter'%20data-evernote-id='19'%20class='js-evernote-checked'%3e%3c/polygon%3e%3c/g%3e%3c/svg%3e) \ No newline at end of file diff --git a/packages/lib/import-enex-md-gen.js b/packages/lib/import-enex-md-gen.ts similarity index 88% rename from packages/lib/import-enex-md-gen.js rename to packages/lib/import-enex-md-gen.ts index 723430de1b..796e7e21e9 100644 --- a/packages/lib/import-enex-md-gen.js +++ b/packages/lib/import-enex-md-gen.ts @@ -1,3 +1,5 @@ +import markdownUtils from './markdownUtils'; +import { ResourceEntity } from './services/database/types'; const stringPadding = require('string-padding'); const stringToStream = require('string-to-stream'); const resourceUtils = require('./resourceUtils.js'); @@ -8,7 +10,51 @@ const NEWLINE = '[[NEWLINE]]'; const NEWLINE_MERGED = '[[MERGED]]'; const SPACE = '[[SPACE]]'; -function processMdArrayNewLines(md) { +enum SectionType { + Text = 'text', + Tr = 'tr', + Td = 'td', + Table = 'table', + Caption = 'caption', + Hidden = 'hidden', + Code = 'code', +} + +interface Section { + type: SectionType; + parent: Section; + lines: any[]; + isHeader?: boolean; +} + +interface ParserStateTag { + name: string; + visible: boolean; +} + +interface ParserStateList { + tag: string; + counter: number; + startedText: boolean; +} + +interface ParserState { + inCode: boolean[]; + inPre: boolean; + inQuote: boolean; + lists: ParserStateList[]; + anchorAttributes: any[]; + spanAttributes: string[]; + tags: ParserStateTag[]; + currentCode?: string; +} + +interface EnexXmlToMdArrayResult { + content: Section; + resources: ResourceEntity[]; +} + +function processMdArrayNewLines(md: string[]): string { while (md.length && md[0] == BLOCK_OPEN) { md.shift(); } @@ -102,7 +148,7 @@ function processMdArrayNewLines(md) { if (!output.trim().length) return ''; // To simplify the result, we only allow up to one empty line between blocks of text - const mergeMultipleNewLines = function(lines) { + const mergeMultipleNewLines = function(lines: string[]) { const output = []; let newlineCount = 0; for (let i = 0; i < lines.length; i++) { @@ -159,23 +205,23 @@ function processMdArrayNewLines(md) { // differently than if there's a newlines between them. So the function below parses the almost final MD and add new lines depending // on various rules. -const isHeading = function(line) { +const isHeading = function(line: string) { return !!line.match(/^#+\s/); }; -const isListItem = function(line) { +const isListItem = function(line: string) { return line && line.trim().indexOf('- ') === 0; }; -const isCodeLine = function(line) { +const isCodeLine = function(line: string) { return line && line.indexOf('\t') === 0; }; -const isTableLine = function(line) { +const isTableLine = function(line: string) { return line.indexOf('| ') === 0; }; -const isPlainParagraph = function(line) { +const isPlainParagraph = function(line: string) { // Note: if a line is no longer than 80 characters, we don't consider it's a paragraph, which // means no newlines will be added before or after. This is to handle text that has been // written with "hard" new lines. @@ -189,7 +235,7 @@ const isPlainParagraph = function(line) { return true; }; -function formatMdLayout(lines) { +function formatMdLayout(lines: string[]) { let previous = ''; const newLines = []; for (let i = 0; i < lines.length; i++) { @@ -235,13 +281,13 @@ function formatMdLayout(lines) { return newLines; } -function isWhiteSpace(c) { +function isWhiteSpace(c: string): boolean { return c == '\n' || c == '\r' || c == '\v' || c == '\f' || c == '\t' || c == ' '; } // Like QString::simpified(), except that it preserves non-breaking spaces (which // Evernote uses for identation, etc.) -function simplifyString(s) { +function simplifyString(s: string): string { let output = ''; let previousWhite = false; for (let i = 0; i < s.length; i++) { @@ -261,7 +307,7 @@ function simplifyString(s) { return output; } -function collapseWhiteSpaceAndAppend(lines, state, text) { +function collapseWhiteSpaceAndAppend(lines: string[], state: any, text: string) { if (state.inCode.length) { lines.push(text); } else { @@ -296,7 +342,7 @@ function collapseWhiteSpaceAndAppend(lines, state, text) { return lines; } -function tagAttributeToMdText(attr) { +function tagAttributeToMdText(attr: string): string { // HTML attributes may contain newlines so remove them. // https://github.com/laurent22/joplin/issues/1583 if (!attr) return ''; @@ -305,7 +351,7 @@ function tagAttributeToMdText(attr) { return attr; } -function addResourceTag(lines, resource, alt = '') { +function addResourceTag(lines: string[], resource: ResourceEntity, alt: string = ''): string[] { // Note: refactor to use Resource.markdownTag if (!alt) alt = resource.title; @@ -326,50 +372,50 @@ function addResourceTag(lines, resource, alt = '') { return lines; } -function isBlockTag(n) { +function isBlockTag(n: string) { return ['div', 'p', 'dl', 'dd', 'dt', 'center', 'address'].indexOf(n) >= 0; } -function isStrongTag(n) { +function isStrongTag(n: string) { return n == 'strong' || n == 'b' || n == 'big'; } -function isStrikeTag(n) { +function isStrikeTag(n: string) { return n == 'strike' || n == 's' || n == 'del'; } -function isEmTag(n) { +function isEmTag(n: string) { return n == 'em' || n == 'i' || n == 'u'; } -function isAnchor(n) { +function isAnchor(n: string) { return n == 'a'; } -function isIgnoredEndTag(n) { +function isIgnoredEndTag(n: string) { return ['en-note', 'en-todo', 'body', 'html', 'font', 'br', 'hr', 'tbody', 'sup', 'img', 'abbr', 'cite', 'thead', 'small', 'tt', 'sub', 'colgroup', 'col', 'ins', 'caption', 'var', 'map', 'area'].indexOf(n) >= 0; } -function isListTag(n) { +function isListTag(n: string) { return n == 'ol' || n == 'ul'; } // Elements that don't require any special treatment beside adding a newline character -function isNewLineOnlyEndTag(n) { +function isNewLineOnlyEndTag(n: string) { return ['div', 'p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'dl', 'dd', 'dt', 'center', 'address'].indexOf(n) >= 0; } -function isInlineCodeTag(n) { +function isInlineCodeTag(n: string) { return ['samp', 'kbd'].indexOf(n) >= 0; } -function isNewLineBlock(s) { +function isNewLineBlock(s: string) { return s == BLOCK_OPEN || s == BLOCK_CLOSE; } -function attributeToLowerCase(node) { +function attributeToLowerCase(node: any) { if (!node.attributes) return {}; - const output = {}; + const output: any = {}; for (const n in node.attributes) { if (!node.attributes.hasOwnProperty(n)) continue; output[n.toLowerCase()] = node.attributes[n]; @@ -377,13 +423,13 @@ function attributeToLowerCase(node) { return output; } -function isInvisibleBlock(attributes) { +function isInvisibleBlock(attributes: any) { const style = attributes.style; if (!style) return false; return !!style.match(/display:[\s\S]*none/); } -function isSpanWithStyle(attributes) { +function isSpanWithStyle(attributes: any) { if (attributes != undefined) { if ('style' in attributes) { return true; @@ -391,9 +437,10 @@ function isSpanWithStyle(attributes) { return false; } } + return false; } -function isSpanStyleBold(attributes) { +function isSpanStyleBold(attributes: any) { const style = attributes.style; if (!style) return false; @@ -406,13 +453,13 @@ function isSpanStyleBold(attributes) { } } -function isSpanStyleItalic(attributes) { +function isSpanStyleItalic(attributes: any) { let style = attributes.style; style = style.replace(/\s+/g, ''); return (style.toLowerCase().includes('font-style:italic;')); } -function displaySaxWarning(context, message) { +function displaySaxWarning(context: any, message: string) { const line = []; const parser = context ? context._parser : null; if (parser) { @@ -422,31 +469,29 @@ function displaySaxWarning(context, message) { console.warn(line.join(': ')); } -// eslint-disable-next-line no-unused-vars, @typescript-eslint/no-unused-vars -function removeSectionParent(section) { - if (typeof section === 'string') return section; +// function removeSectionParent(section:Section | string) { +// if (typeof section === 'string') return section; - section = { ...section }; - delete section.parent; +// section = { ...section }; +// delete section.parent; - section.lines = section.lines.slice(); +// section.lines = section.lines.slice(); - for (let i = 0; i < section.lines.length; i++) { - section.lines[i] = removeSectionParent(section.lines[i]); - } +// for (let i = 0; i < section.lines.length; i++) { +// section.lines[i] = removeSectionParent(section.lines[i]); +// } - return section; -} +// return section; +// } -// eslint-disable-next-line no-unused-vars, @typescript-eslint/no-unused-vars -function printSection(section) { - console.info(JSON.stringify(removeSectionParent(section), null, 4)); -} +// function printSection(section:Section) { +// console.info(JSON.stringify(removeSectionParent(section), null, 4)); +// } -function enexXmlToMdArray(stream, resources) { +function enexXmlToMdArray(stream: any, resources: ResourceEntity[]): Promise { const remainingResources = resources.slice(); - const removeRemainingResource = id => { + const removeRemainingResource = (id: string) => { for (let i = 0; i < remainingResources.length; i++) { const r = remainingResources[i]; if (r.id === id) { @@ -456,7 +501,7 @@ function enexXmlToMdArray(stream, resources) { }; return new Promise((resolve) => { - const state = { + const state: ParserState = { inCode: [], inPre: false, inQuote: false, @@ -470,17 +515,17 @@ function enexXmlToMdArray(stream, resources) { const strict = false; const saxStream = require('@joplin/fork-sax').createStream(strict, options); - let section = { - type: 'text', + let section: Section = { + type: SectionType.Text, lines: [], parent: null, }; - saxStream.on('error', function(e) { + saxStream.on('error', function(e: any) { console.warn(e); }); - const unwrapInnerText = text => { + const unwrapInnerText = (text: string) => { const lines = text.split('\n'); let output = ''; @@ -504,14 +549,14 @@ function enexXmlToMdArray(stream, resources) { return output; }; - saxStream.on('text', function(text) { + saxStream.on('text', function(text: string) { if (['table', 'tr', 'tbody'].indexOf(section.type) >= 0) return; text = !state.inPre ? unwrapInnerText(text) : text; section.lines = collapseWhiteSpaceAndAppend(section.lines, state, text); }); - saxStream.on('opentag', function(node) { + saxStream.on('opentag', function(node: any) { const nodeAttributes = attributeToLowerCase(node); const n = node.name.toLowerCase(); const isVisible = !isInvisibleBlock(nodeAttributes); @@ -542,8 +587,8 @@ function enexXmlToMdArray(stream, resources) { if (n == 'en-note') { // Start of note } else if (n == 'table') { - const newSection = { - type: 'table', + const newSection: Section = { + type: SectionType.Table, lines: [], parent: section, }; @@ -568,8 +613,8 @@ function enexXmlToMdArray(stream, resources) { // return; } - const newSection = { - type: 'tr', + const newSection: Section = { + type: SectionType.Tr, lines: [], parent: section, isHeader: false, @@ -585,8 +630,8 @@ function enexXmlToMdArray(stream, resources) { if (n == 'th') section.isHeader = true; - const newSection = { - type: 'td', + const newSection: Section = { + type: SectionType.Td, lines: [], parent: section, }; @@ -599,8 +644,8 @@ function enexXmlToMdArray(stream, resources) { // return; } - const newSection = { - type: 'caption', + const newSection: Section = { + type: SectionType.Caption, lines: [], parent: section, }; @@ -608,8 +653,8 @@ function enexXmlToMdArray(stream, resources) { section.lines.push(newSection); section = newSection; } else if (isInvisibleBlock(nodeAttributes)) { - const newSection = { - type: 'hidden', + const newSection: Section = { + type: SectionType.Hidden, lines: [], parent: section, }; @@ -650,7 +695,7 @@ function enexXmlToMdArray(stream, resources) { // Many (most?) img tags don't have no source associated, especially when they were imported from HTML let s = '!['; if (nodeAttributes.alt) s += tagAttributeToMdText(nodeAttributes.alt); - s += `](${nodeAttributes.src})`; + s += `](${markdownUtils.escapeLinkUrl(nodeAttributes.src)})`; section.lines.push(s); } } else if (isAnchor(n)) { @@ -694,8 +739,8 @@ function enexXmlToMdArray(stream, resources) { state.inCode.push(true); state.currentCode = ''; - const newSection = { - type: 'code', + const newSection: Section = { + type: SectionType.Code, lines: [], parent: section, }; @@ -802,7 +847,7 @@ function enexXmlToMdArray(stream, resources) { } }); - saxStream.on('closetag', function(n) { + saxStream.on('closetag', function(n: string) { n = n ? n.toLowerCase() : n; const poppedTag = state.tags.pop(); @@ -940,7 +985,7 @@ function enexXmlToMdArray(stream, resources) { // [ Sign in ](https://example.com) // to: // [Sign in](https://example.com) - const trimTextStartAndEndSpaces = function(lines) { + const trimTextStartAndEndSpaces = function(lines: string[]) { let firstBracketIndex = 0; let foundFirstNonWhite = false; for (let i = lines.length - 1; i >= 0; i--) { @@ -999,14 +1044,14 @@ function enexXmlToMdArray(stream, resources) { resolve({ content: section, resources: remainingResources, - }); + } as EnexXmlToMdArrayResult); }); stream.pipe(saxStream); }); } -function tableHasSubTables(table) { +function tableHasSubTables(table: Section) { for (let trIndex = 0; trIndex < table.lines.length; trIndex++) { const tr = table.lines[trIndex]; if (!tr || !tr.lines) continue; @@ -1029,7 +1074,7 @@ function tableHasSubTables(table) { // via Web Clipper. So to handle this, we render all the outer tables as regular text (as if replacing all the , and
// elements by
) and only the inner ones, those that don't contain any other tables, are rendered as actual tables. This is generally // the required behaviour since the outer tables are usually for layout and the inner ones are the content. -function drawTable(table) { +function drawTable(table: Section) { // | First Header | Second Header | // | ------------- | ------------- | // | Content Cell | Content Cell | @@ -1061,7 +1106,7 @@ function drawTable(table) { if (flatRender) { line.push(BLOCK_OPEN); - let currentCells = []; + let currentCells: any[] = []; const renderCurrentCells = () => { if (!currentCells.length) return; @@ -1092,7 +1137,7 @@ function drawTable(table) { // A cell in a Markdown table cannot have actual new lines so replace // them with
, which are supported by the markdown renderers. - let cellText = processMdArrayNewLines(td.lines, true); + let cellText = processMdArrayNewLines(td.lines); let lines = cellText.split('\n'); lines = postProcessMarkdown(lines); cellText = lines.join('\n').replace(/\n+/g, '
'); @@ -1142,19 +1187,19 @@ function drawTable(table) { lines.push(BLOCK_CLOSE); if (caption) { - const captionLines = renderLines(caption.lines); + const captionLines: any[] = renderLines(caption.lines); lines = lines.concat(captionLines); } return flatRender ? lines : lines.join(`<<<<:D>>>>${NEWLINE}<<<<:D>>>>`).split('<<<<:D>>>>'); } -function postProcessMarkdown(lines) { +function postProcessMarkdown(lines: string[]) { // After importing HTML, the resulting Markdown often has empty lines at the beginning and end due to // block start/end or elements that were ignored, etc. If these white spaces were intended it's not really // possible to detect it, so simply trim them all so that the result is more deterministic and can be // easily unit tested. - const trimEmptyLines = function(lines) { + const trimEmptyLines = function(lines: string[]) { while (lines.length) { if (!lines[0].trim()) { lines.splice(0, 1); @@ -1174,7 +1219,7 @@ function postProcessMarkdown(lines) { return lines; }; - function cleanUpSpaces(lines) { + function cleanUpSpaces(lines: string[]) { const output = []; for (let i = 0; i < lines.length; i++) { @@ -1203,7 +1248,7 @@ function postProcessMarkdown(lines) { // A "line" can be some Markdown text, or it can be a section, like a table, // etc. so this function returns an array of strings. -function renderLine(line) { +function renderLine(line: any) { if (typeof line === 'object' && line.type === 'table') { // A table const table = line; @@ -1227,8 +1272,8 @@ function renderLine(line) { } } -function renderLines(lines) { - let mdLines = []; +function renderLines(lines: any[]) { + let mdLines: string[] = []; for (let i = 0; i < lines.length; i++) { const renderedLines = renderLine(lines[i]); mdLines = mdLines.concat(renderedLines); @@ -1236,9 +1281,9 @@ function renderLines(lines) { return mdLines; } -async function enexXmlToMd(xmlString, resources, options = {}) { +async function enexXmlToMd(xmlString: string, resources: ResourceEntity[]) { const stream = stringToStream(xmlString); - const result = await enexXmlToMdArray(stream, resources, options); + const result = await enexXmlToMdArray(stream, resources); let mdLines = renderLines(result.content.lines); @@ -1258,4 +1303,4 @@ async function enexXmlToMd(xmlString, resources, options = {}) { return output.join('\n'); } -module.exports = { enexXmlToMd, processMdArrayNewLines, NEWLINE, addResourceTag }; +export { enexXmlToMd, processMdArrayNewLines, NEWLINE, addResourceTag }; diff --git a/packages/renderer/MdToHtml/setupLinkify.js b/packages/renderer/MdToHtml/setupLinkify.js index 405f31a2db..93272b8e1d 100644 --- a/packages/renderer/MdToHtml/setupLinkify.js +++ b/packages/renderer/MdToHtml/setupLinkify.js @@ -24,6 +24,10 @@ module.exports = function(markdownIt) { // url should be normalized at this point, and existing entities are decoded const str = url.trim().toLowerCase(); + if (str.indexOf('data:image/svg+xml,') === 0) { + return true; + } + return BAD_PROTO_RE.test(str) ? (GOOD_DATA_RE.test(str) ? true : false) : true; };