506 lines
15 KiB
JavaScript
506 lines
15 KiB
JavaScript
/**
|
|
* Markdown Converter Library
|
|
*
|
|
* Core conversion logic for transforming HTML to Markdown.
|
|
* This library is used by both:
|
|
* - docs-v2 build scripts (html-to-markdown.js)
|
|
* - docs-tooling Lambda@Edge function
|
|
*
|
|
* Exports reusable functions for HTML→Markdown conversion
|
|
*/
|
|
|
|
const TurndownService = require('turndown');
|
|
const { JSDOM } = require('jsdom');
|
|
const path = require('path');
|
|
const fs = require('fs');
|
|
const yaml = require('js-yaml');
|
|
|
|
// Verbose-logging switch read by extractArticleContent; when true it prints
// "[DEBUG]" diagnostics for every extraction.
// NOTE(review): currently hard-coded on - confirm this is intended outside local debugging.
const DEBUG = true;

// Parsed contents of products.yml; stays null until ensureProductDataInitialized() fills it
let productsData = null;
|
|
|
|
/**
 * Load product data from data/products.yml into the module-level cache.
 *
 * The lookup is performed at most once: a missing file, an empty or scalar
 * YAML document, and a read/parse failure are all cached as an empty object,
 * so later calls return immediately instead of probing the filesystem again.
 *
 * @returns {Promise<void>} Resolves once `productsData` holds an object
 */
async function ensureProductDataInitialized() {
  if (productsData) {
    return;
  }

  // Path to products.yml from this file (scripts/lib/markdown-converter.js)
  const productsPath = path.join(__dirname, '../../data/products.yml');

  try {
    if (fs.existsSync(productsPath)) {
      const fileContents = fs.readFileSync(productsPath, 'utf8');
      const parsed = yaml.load(fileContents);
      // yaml.load yields undefined/null/scalars for documents that are not mappings
      productsData = parsed !== null && typeof parsed === 'object' ? parsed : {};
    } else {
      // Remember the miss so every conversion does not re-check the filesystem
      productsData = {};
    }
  } catch (err) {
    console.warn('Failed to load products.yml:', err.message);
    productsData = {}; // fallback to empty object
  }
}
|
|
|
|
/**
 * Get product info from URL path
 *
 * Walks the cached product map in definition order and returns the first
 * product whose `url_path` (trailing slash removed) is a prefix of `urlPath`.
 * Entries without a usable string `url_path` - including YAML keys with no
 * value, which load as null - are skipped rather than throwing.
 *
 * @param {string} urlPath - URL path of the page being converted
 * @returns {{key: string, name: *, version: *, description: *}|null}
 *   Summary of the matching product, or null when product data is not loaded,
 *   `urlPath` is not a string, or nothing matches
 */
function getProductFromPath(urlPath) {
  if (!productsData || typeof urlPath !== 'string') {
    return null;
  }

  // Match URL patterns to products
  // Based on patterns from product-mappings.ts
  for (const [key, product] of Object.entries(productsData)) {
    const productPath = product?.url_path;
    if (typeof productPath !== 'string' || productPath === '') continue;

    const pathPattern = productPath.replace(/\/$/, ''); // remove trailing slash
    // NOTE(review): plain prefix match, so "/a/v2" also matches "/a/v2x/..." - confirm intended
    if (urlPath.startsWith(pathPattern)) {
      return {
        key,
        name: product.name,
        version: product.version,
        description: product.description,
      };
    }
  }

  return null;
}
|
|
|
|
/**
 * Resolve the product a documentation URL path belongs to.
 *
 * Public alias that forwards to getProductFromPath.
 *
 * @param {string} urlPath - URL path of the page
 * @returns {Object|null} Whatever getProductFromPath returns for the path
 */
function detectProduct(urlPath) {
  const product = getProductFromPath(urlPath);
  return product;
}
|
|
|
|
/**
 * Configure Turndown for InfluxData documentation
 *
 * Builds a TurndownService with ATX headings, fenced code blocks and "-"
 * bullets, then registers custom rules for code blocks, list items, tables
 * and callout blockquotes, and strips non-content elements.
 *
 * @returns {TurndownService} Configured converter instance
 */
function createTurndownService() {
  const turndownService = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    fence: '```',
    emDelimiter: '*',
    strongDelimiter: '**',
    // Note: linkStyle is deliberately left unset. `linkStyle: 'inline'` broke
    // link conversion with Turndown 7.2.2; Turndown documents only 'inlined'
    // (its default) and 'referenced' as values for this option.
    bulletListMarker: '-',
  });

  // Flatten a table cell to single-line text that cannot terminate the cell early
  const tableCellText = (cell) =>
    cell.textContent.trim().replace(/\n/g, ' ').replace(/\|/g, '\\|');

  // Preserve code block language identifiers
  turndownService.addRule('fencedCodeBlock', {
    filter(node, options) {
      return (
        options.codeBlockStyle === 'fenced' &&
        node.nodeName === 'PRE' &&
        node.firstChild &&
        node.firstChild.nodeName === 'CODE'
      );
    },
    replacement(content, node, options) {
      const code = node.firstChild;

      // Prefer the "language-xxx" class even when other classes are present;
      // otherwise fall back to the first class name (or no language at all)
      const classNames = (code.className || '').split(/\s+/).filter(Boolean);
      const languageClass = classNames.find((name) =>
        name.startsWith('language-')
      );
      const language = languageClass
        ? languageClass.slice('language-'.length)
        : classNames[0] || '';

      // Drop one trailing newline so no blank line precedes the closing fence
      const text = code.textContent.replace(/\n$/, '');

      // Use a fence longer than any same-character fence inside the code,
      // otherwise the embedded fence would close the block early
      let fence = options.fence;
      for (const line of text.split('\n')) {
        const run = line.match(/^ {0,3}(`{3,}|~{3,})/);
        if (run && run[1][0] === fence[0] && run[1].length >= fence.length) {
          fence = fence[0].repeat(run[1].length + 1);
        }
      }

      return `\n\n${fence}${language}\n${text}\n${fence}\n\n`;
    },
  });

  // Improve list item handling - ensure proper spacing
  turndownService.addRule('listItems', {
    filter: 'li',
    replacement(content, node, options) {
      const body = content
        .replace(/^\n+/, '') // Remove leading newlines
        .replace(/\n+$/, '\n') // Single trailing newline
        .replace(/\n/gm, '\n    '); // Indent continuation lines to the content column

      // Dash + 3 spaces for unordered lists, so content starts at column 4
      // and lines up with the 4-space continuation indent above
      let prefix = `${options.bulletListMarker}   `;
      const parent = node.parentNode;

      if (parent.nodeName === 'OL') {
        // Honor <ol start="n"> when numbering items
        const start = parent.getAttribute('start');
        const index = Array.prototype.indexOf.call(parent.children, node);
        prefix = `${start ? Number(start) + index : index + 1}.  `;
      }

      return (
        prefix + body + (node.nextSibling && !/\n$/.test(body) ? '\n' : '')
      );
    },
  });

  // Convert HTML tables to Markdown tables
  turndownService.addRule('tables', {
    filter: 'table',
    replacement(content, node) {
      // Get all rows from tbody and thead
      const theadRows = Array.from(node.querySelectorAll('thead tr'));
      const tbodyRows = Array.from(node.querySelectorAll('tbody tr'));

      // If no thead/tbody, fall back to all tr elements
      const allRows =
        theadRows.length || tbodyRows.length
          ? [...theadRows, ...tbodyRows]
          : Array.from(node.querySelectorAll('tr'));

      if (allRows.length === 0) return '';

      // The first row supplies the headers; the rest are data rows
      const [headerRow, ...bodyRows] = allRows;
      const headers = Array.from(headerRow.querySelectorAll('th, td')).map(
        (cell) => tableCellText(cell)
      );

      // Build separator row
      const separator = headers.map(() => '---').join(' | ');

      const dataRows = bodyRows
        .map((row) => {
          const cells = Array.from(row.querySelectorAll('td, th')).map(
            (cell) => tableCellText(cell)
          );
          return `| ${cells.join(' | ')} |`;
        })
        .join('\n');

      return `\n| ${headers.join(' | ')} |\n| ${separator} |\n${dataRows}\n\n`;
    },
  });

  // Handle GitHub-style callouts (notes, warnings, etc.)
  const calloutTypes = ['note', 'warning', 'important', 'tip', 'caution'];
  const calloutLabels = {
    note: 'Note',
    warning: 'Warning',
    caution: 'Caution',
    important: 'Important',
    tip: 'Tip',
  };

  turndownService.addRule('githubCallouts', {
    filter(node) {
      return (
        node.nodeName === 'BLOCKQUOTE' &&
        node.classList &&
        calloutTypes.some((type) => node.classList.contains(type))
      );
    },
    replacement(content, node) {
      // The first callout class on the element decides the label
      const type = Array.from(node.classList).find((c) =>
        calloutTypes.includes(c)
      );
      const label = calloutLabels[type] || 'Note';

      return `\n> [!${label}]\n> ${content.trim().replace(/\n/g, '\n> ')}\n\n`;
    },
  });

  // Remove navigation, footer, and other non-content elements.
  // Turndown string filters are compared against tag names only.
  turndownService.remove([
    'nav',
    'header',
    'footer',
    'script',
    'style',
    'noscript',
    'iframe',
  ]);

  // Class/id based removals need a function filter; a CSS selector string
  // passed to remove() would never match any element.
  const removedClasses = [
    'format-selector', // Format selector buttons (Copy page, etc.)
    'page-feedback', // Page feedback form
  ];
  turndownService.remove((node) => {
    const classes = node.classList;
    const hasRemovedClass =
      Boolean(classes) && removedClasses.some((name) => classes.contains(name));
    return hasRemovedClass || node.id === 'page-feedback'; // id: feedback modal
  });

  return turndownService;
}
|
|
|
|
/**
 * Pull the main article markup and page metadata out of a rendered HTML page.
 *
 * The page is parsed with JSDOM, feedback widgets and horizontal rules are
 * stripped from the `article.article--content` element, and the remaining
 * inner HTML is returned with the page title and description. The JSDOM
 * window is always closed before returning.
 *
 * @param {string} htmlContent - Raw HTML content
 * @param {string} contextInfo - Context info for log messages (file path or URL)
 * @returns {Object|null} `{ title, description, content }`, or null when the
 *   page has no `article.article--content` element
 */
function extractArticleContent(htmlContent, contextInfo = '') {
  const dom = new JSDOM(htmlContent);
  const { document } = dom.window;

  // Trimmed text of the first element matching the selector (undefined if absent)
  const textOf = (selector) =>
    document.querySelector(selector)?.textContent?.trim();
  // `content` attribute of the first matching <meta> element (undefined if absent)
  const metaContent = (selector) =>
    document.querySelector(selector)?.getAttribute('content');

  try {
    const article = document.querySelector('article.article--content');

    if (DEBUG) {
      console.log(`[DEBUG] Looking for article in ${contextInfo}`);
      console.log(`[DEBUG] HTML length: ${htmlContent.length}`);
      console.log(`[DEBUG] Article found: ${!!article}`);
    }

    if (!article) {
      if (DEBUG) {
        // Report which half of the selector failed to match
        const anyArticle = document.querySelector('article');
        const articleContent = document.querySelector('.article--content');
        console.log(`[DEBUG] Any article element: ${!!anyArticle}`);
        console.log(`[DEBUG] .article--content element: ${!!articleContent}`);
      }

      console.warn(
        ` ⚠️ No article content found in ${contextInfo}. This is typically not a problem and represents an aliased path.`
      );
      return null;
    }

    // Strip page chrome from the article before it is converted
    const unwantedSelectors = [
      '.format-selector', // format selector buttons
      '.page-feedback', // page feedback form
      '#page-feedback', // feedback modal
      '.feedback-widget', // any feedback widgets
      '.helpful', // "Was this page helpful?" section
      '.feedback.block', // footer feedback/support section
      'hr', // horizontal rules (often used as separators before footer)
    ];
    for (const selector of unwantedSelectors) {
      for (const el of article.querySelectorAll(selector)) {
        el.remove();
      }
    }

    // Metadata: first <h1>, then <title>, then a fixed placeholder
    const title = textOf('h1') || textOf('title') || 'Untitled';
    const description =
      metaContent('meta[name="description"]') ||
      metaContent('meta[property="og:description"]') ||
      '';

    // Read the markup while the window is still open
    return { title, description, content: article.innerHTML };
  } finally {
    // Release the JSDOM window to prevent memory leaks
    dom.window.close();
  }
}
|
|
|
|
/**
 * Generate frontmatter for markdown file (single page)
 *
 * Emits `title`, optional `description`, `url`, and - when the URL maps to a
 * known product - `product` and optional `version`. Values that would not be
 * valid as plain YAML scalars (for example titles containing ": " or " #")
 * are written as double-quoted strings; all other values are written as-is.
 *
 * @param {Object} metadata - Object with title, description
 * @param {string} urlPath - URL path for the page
 * @returns {string} YAML frontmatter as string
 */
function generateFrontmatter(metadata, urlPath) {
  // Render a value as a YAML scalar, quoting only when the plain form would
  // be syntactically invalid or ambiguous. JSON string syntax is a valid
  // YAML double-quoted scalar. Type-like words (true, 1.0) are left as-is.
  const yamlScalar = (value) => {
    const text = String(value);
    const needsQuotes =
      text === '' ||
      /^[\s\-?:,[\]{}#&*!|>'"%@`]/.test(text) || // leading indicator or whitespace
      /\s$/.test(text) || // trailing whitespace would be dropped
      /:\s|:$|\s#/.test(text) || // key separator or comment marker inside
      /[\n\r\t]/.test(text); // control whitespace cannot appear in a plain scalar
    return needsQuotes ? JSON.stringify(text) : text;
  };

  const product = detectProduct(urlPath);
  const frontmatter = ['---'];

  frontmatter.push(`title: ${yamlScalar(metadata.title)}`);
  if (metadata.description) {
    frontmatter.push(`description: ${yamlScalar(metadata.description)}`);
  }
  frontmatter.push(`url: ${yamlScalar(urlPath)}`);

  if (product) {
    frontmatter.push(`product: ${yamlScalar(product.name)}`);
    if (product.version) {
      frontmatter.push(`version: ${yamlScalar(product.version)}`);
    }
  }

  frontmatter.push('---');
  return frontmatter.join('\n');
}
|
|
|
|
/**
 * Generate enhanced frontmatter for section aggregation
 *
 * In addition to the single-page fields this emits `type: section`, the
 * number of child pages, a rough token estimate for the aggregated content,
 * and a `child_pages` list. Values that are not valid plain YAML scalars are
 * written as double-quoted strings.
 *
 * @param {Object} metadata - Object with title, description and optional content
 * @param {string} urlPath - URL path for the section
 * @param {Array} childPages - Array of child page objects with url, title and
 *   optional content; a missing or non-array value is treated as no children
 * @returns {string} YAML frontmatter as string
 */
function generateSectionFrontmatter(metadata, urlPath, childPages) {
  // Render a value as a YAML scalar, quoting only when the plain form would
  // be syntactically invalid or ambiguous. JSON string syntax is a valid
  // YAML double-quoted scalar. Type-like words (true, 1.0) are left as-is.
  const yamlScalar = (value) => {
    const text = String(value);
    const needsQuotes =
      text === '' ||
      /^[\s\-?:,[\]{}#&*!|>'"%@`]/.test(text) || // leading indicator or whitespace
      /\s$/.test(text) || // trailing whitespace would be dropped
      /:\s|:$|\s#/.test(text) || // key separator or comment marker inside
      /[\n\r\t]/.test(text); // control whitespace cannot appear in a plain scalar
    return needsQuotes ? JSON.stringify(text) : text;
  };

  const pages = Array.isArray(childPages) ? childPages : [];
  const product = detectProduct(urlPath);
  const frontmatter = ['---'];

  frontmatter.push(`title: ${yamlScalar(metadata.title)}`);
  if (metadata.description) {
    frontmatter.push(`description: ${yamlScalar(metadata.description)}`);
  }
  frontmatter.push(`url: ${yamlScalar(urlPath)}`);
  frontmatter.push(`type: section`);
  frontmatter.push(`pages: ${pages.length}`);

  // Add token estimate (rough: 4 chars per token)
  const contentLength = metadata.content?.length || 0;
  const childContentLength = pages.reduce(
    (sum, child) => sum + (child.content?.length || 0),
    0
  );
  const estimatedTokens = Math.ceil((contentLength + childContentLength) / 4);
  frontmatter.push(`estimated_tokens: ${estimatedTokens}`);

  if (product) {
    frontmatter.push(`product: ${yamlScalar(product.name)}`);
    if (product.version) {
      frontmatter.push(`version: ${yamlScalar(product.version)}`);
    }
  }

  // List child pages as a block sequence of mappings; `title` must be
  // indented deeper than the dash to belong to the same entry as `url`
  if (pages.length > 0) {
    frontmatter.push(`child_pages:`);
    pages.forEach((child) => {
      frontmatter.push(`  - url: ${yamlScalar(child.url)}`);
      frontmatter.push(`    title: ${yamlScalar(child.title)}`);
    });
  }

  frontmatter.push('---');
  return frontmatter.join('\n');
}
|
|
|
|
/**
 * Turn one rendered HTML page into a Markdown document with frontmatter.
 *
 * @param {string} htmlContent - Raw HTML content
 * @param {string} urlPath - URL path for the page (used in the frontmatter
 *   and as context in log messages)
 * @returns {Promise<string|null>} Frontmatter plus Markdown body, or null
 *   when no article content could be extracted from the page
 */
async function convertToMarkdown(htmlContent, urlPath) {
  await ensureProductDataInitialized();

  const converter = createTurndownService();
  const page = extractArticleContent(htmlContent, urlPath);
  if (!page) {
    return null;
  }

  const rawMarkdown = converter.turndown(page.content);

  // Collapse runs of blank lines, then drop doubled and trailing "* * *" rules
  const compacted = rawMarkdown.replace(/\n{3,}/g, '\n\n');
  const withoutDoubledRules = compacted.replace(
    /\* \* \*\s*\n\s*\* \* \*/g,
    ''
  );
  const body = withoutDoubledRules.replace(/\* \* \*\s*$/g, '').trim();

  const header = generateFrontmatter(page, urlPath);
  return `${header}\n\n${body}\n`;
}
|
|
|
|
/**
 * Convert section HTML with child pages to aggregated Markdown
 *
 * The section index page is converted first; each child page that yields
 * article content is appended under an h2 heading carrying its title, with
 * the page's own leading h1 removed. Parts are separated by "---" rules.
 *
 * @param {string} sectionHtml - HTML content of the section index page
 * @param {string} sectionUrlPath - URL path for the section
 * @param {Array} childHtmls - Array of objects with {html, url} for each
 *   child page; a missing value is treated as no child pages
 * @returns {Promise<string|null>} Aggregated markdown content, or null when
 *   the section page itself has no article content
 */
async function convertSectionToMarkdown(
  sectionHtml,
  sectionUrlPath,
  childHtmls
) {
  await ensureProductDataInitialized();

  const turndownService = createTurndownService();

  // Convert HTML to Markdown, then collapse runs of blank lines and drop
  // doubled or trailing "* * *" separator artifacts
  const toCleanMarkdown = (html) =>
    turndownService
      .turndown(html)
      .replace(/\n{3,}/g, '\n\n')
      .replace(/\* \* \*\s*\n\s*\* \* \*/g, '')
      .replace(/\* \* \*\s*$/g, '')
      .trim();

  // Extract section metadata and content
  const sectionMetadata = extractArticleContent(sectionHtml, sectionUrlPath);
  if (!sectionMetadata) {
    return null;
  }

  const sectionMarkdown = toCleanMarkdown(sectionMetadata.content);

  // Process child pages
  const childContents = [];
  const childPageInfo = [];

  for (const { html, url } of childHtmls ?? []) {
    const childMetadata = extractArticleContent(html, url);
    if (!childMetadata) {
      continue; // pages without article content are left out of the aggregate
    }

    // Remove the first h1 heading (page title) to avoid redundancy since it
    // is re-added as an h2 below. The heading may be the whole document, so
    // it is allowed to end at end-of-input as well as at a newline.
    const childMarkdown = toCleanMarkdown(childMetadata.content).replace(
      /^#\s+.+?(?:\n+|$)/,
      ''
    );

    // Add child page title as heading; skip the blank gap for empty bodies
    childContents.push(
      childMarkdown
        ? `## ${childMetadata.title}\n\n${childMarkdown}`
        : `## ${childMetadata.title}`
    );

    // Track child page info for frontmatter
    childPageInfo.push({
      url,
      title: childMetadata.title,
      content: childMarkdown,
    });
  }

  // Generate section frontmatter with child page info
  const frontmatter = generateSectionFrontmatter(
    { ...sectionMetadata, content: sectionMarkdown },
    sectionUrlPath,
    childPageInfo
  );

  // Combine section content with child pages
  const allContent = [sectionMarkdown, ...childContents].join('\n\n---\n\n');

  return `${frontmatter}\n\n${allContent}\n`;
}
|
|
|
|
// Export all functions for CommonJS
|
|
module.exports = {
|
|
detectProduct,
|
|
createTurndownService,
|
|
extractArticleContent,
|
|
generateFrontmatter,
|
|
generateSectionFrontmatter,
|
|
convertToMarkdown,
|
|
convertSectionToMarkdown,
|
|
};
|