/** * Markdown Converter Library * * Core conversion logic for transforming HTML to Markdown. * This library is used by both: * - docs-v2 build scripts (html-to-markdown.js) * - docs-tooling Lambda@Edge function * * Exports reusable functions for HTML→Markdown conversion */ const TurndownService = require('turndown'); const { JSDOM } = require('jsdom'); const path = require('path'); const fs = require('fs'); const yaml = require('js-yaml'); // Debug mode - set to true to enable verbose logging const DEBUG = true; // Product data cache let productsData = null; /** * Initialize product data from YAML file */ async function ensureProductDataInitialized() { if (productsData) { return; } try { // Path to products.yml from this file (scripts/lib/markdown-converter.js) const productsPath = path.join(__dirname, '../../data/products.yml'); if (fs.existsSync(productsPath)) { const fileContents = fs.readFileSync(productsPath, 'utf8'); productsData = yaml.load(fileContents); } } catch (err) { console.warn('Failed to load products.yml:', err.message); productsData = {}; // fallback to empty object } } /** * Get product info from URL path */ function getProductFromPath(urlPath) { if (!productsData) { return null; } // Match URL patterns to products // Based on patterns from product-mappings.ts for (const [key, product] of Object.entries(productsData)) { if (!product.url_path) continue; const pathPattern = product.url_path.replace(/\/$/, ''); // remove trailing slash if (urlPath.startsWith(pathPattern)) { return { key, name: product.name, version: product.version, description: product.description, }; } } return null; } /** * Detect product context from URL path */ function detectProduct(urlPath) { return getProductFromPath(urlPath); } /** * Configure Turndown for InfluxData documentation */ function createTurndownService() { const turndownService = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced', fence: '```', emDelimiter: '*', strongDelimiter: '**', // Note: linkStyle: 'inline' breaks link conversion in Turndown 7.2.2 // Using default 'referenced' style which works correctly bulletListMarker: '-', }); // Preserve code block language identifiers turndownService.addRule('fencedCodeBlock', { filter: function (node, options) { return ( options.codeBlockStyle === 'fenced' && node.nodeName === 'PRE' && node.firstChild && node.firstChild.nodeName === 'CODE' ); }, replacement: function (content, node, options) { const code = node.firstChild; const language = code.className.replace(/^language-/, '') || ''; const fence = options.fence; return `\n\n${fence}${language}\n${code.textContent}\n${fence}\n\n`; }, }); // Improve list item handling - ensure proper spacing turndownService.addRule('listItems', { filter: 'li', replacement: function (content, node, options) { content = content .replace(/^\n+/, '') // Remove leading newlines .replace(/\n+$/, '\n') // Single trailing newline .replace(/\n/gm, '\n '); // Indent nested content let prefix = options.bulletListMarker + ' '; // Dash + 3 spaces for unordered lists const parent = node.parentNode; if (parent.nodeName === 'OL') { const start = parent.getAttribute('start'); const index = Array.prototype.indexOf.call(parent.children, node); prefix = (start ? Number(start) + index : index + 1) + '. '; } return ( prefix + content + (node.nextSibling && !/\n$/.test(content) ? '\n' : '') ); }, }); // Convert HTML tables to Markdown tables turndownService.addRule('tables', { filter: 'table', replacement: function (content, node) { // Get all rows from tbody and thead const theadRows = Array.from(node.querySelectorAll('thead tr')); const tbodyRows = Array.from(node.querySelectorAll('tbody tr')); // If no thead/tbody, fall back to all tr elements const allRows = theadRows.length || tbodyRows.length ? [...theadRows, ...tbodyRows] : Array.from(node.querySelectorAll('tr')); if (allRows.length === 0) return ''; // Extract headers from first row const headerRow = allRows[0]; const headers = Array.from(headerRow.querySelectorAll('th, td')).map( (cell) => cell.textContent.trim() ); // Build separator row const separator = headers.map(() => '---').join(' | '); // Extract data rows (skip first row which is the header) const dataRows = allRows .slice(1) .map((row) => { const cells = Array.from(row.querySelectorAll('td, th')).map((cell) => cell.textContent.trim().replace(/\n/g, ' ') ); return '| ' + cells.join(' | ') + ' |'; }) .join('\n'); return ( '\n| ' + headers.join(' | ') + ' |\n| ' + separator + ' |\n' + dataRows + '\n\n' ); }, }); // Handle GitHub-style callouts (notes, warnings, etc.) turndownService.addRule('githubCallouts', { filter: function (node) { return ( node.nodeName === 'BLOCKQUOTE' && node.classList && (node.classList.contains('note') || node.classList.contains('warning') || node.classList.contains('important') || node.classList.contains('tip') || node.classList.contains('caution')) ); }, replacement: function (content, node) { const type = Array.from(node.classList).find((c) => ['note', 'warning', 'important', 'tip', 'caution'].includes(c) ); const emoji = { note: 'Note', warning: 'Warning', caution: 'Caution', important: 'Important', tip: 'Tip', }[type] || 'Note'; return `\n> [!${emoji}]\n> ${content.trim().replace(/\n/g, '\n> ')}\n\n`; }, }); // Remove navigation, footer, and other non-content elements turndownService.remove([ 'nav', 'header', 'footer', 'script', 'style', 'noscript', 'iframe', '.format-selector', // Remove format selector buttons (Copy page, etc.) '.page-feedback', // Remove page feedback form '#page-feedback', // Remove feedback modal ]); return turndownService; } /** * Extract article content from HTML * @param {string} htmlContent - Raw HTML content * @param {string} contextInfo - Context info for error messages (file path or URL) * @returns {Object|null} Object with title, description, content or null if not found */ function extractArticleContent(htmlContent, contextInfo = '') { const dom = new JSDOM(htmlContent); const document = dom.window.document; try { // Find the main article content const article = document.querySelector('article.article--content'); // Debug logging if (DEBUG) { console.log(`[DEBUG] Looking for article in ${contextInfo}`); console.log(`[DEBUG] HTML length: ${htmlContent.length}`); console.log(`[DEBUG] Article found: ${!!article}`); } if (!article) { // Try alternative selectors to debug if (DEBUG) { const anyArticle = document.querySelector('article'); const articleContent = document.querySelector('.article--content'); console.log(`[DEBUG] Any article element: ${!!anyArticle}`); console.log(`[DEBUG] .article--content element: ${!!articleContent}`); } console.warn( ` ⚠️ No article content found in ${contextInfo}. This is typically not a problem and represents an aliased path.` ); return null; } // Remove unwanted elements from article before conversion const elementsToRemove = [ '.format-selector', // Remove format selector buttons '.page-feedback', // Remove page feedback form '#page-feedback', // Remove feedback modal '.feedback-widget', // Remove any feedback widgets '.helpful', // Remove "Was this page helpful?" section '.feedback.block', // Remove footer feedback/support section 'hr', // Remove horizontal rules (often used as separators before footer) ]; elementsToRemove.forEach((selector) => { const elements = article.querySelectorAll(selector); elements.forEach((el) => el.remove()); }); // Extract metadata const title = document.querySelector('h1')?.textContent?.trim() || document.querySelector('title')?.textContent?.trim() || 'Untitled'; const description = document .querySelector('meta[name="description"]') ?.getAttribute('content') || document .querySelector('meta[property="og:description"]') ?.getAttribute('content') || ''; // Get the content before closing the DOM const content = article.innerHTML; return { title, description, content, }; } finally { // Clean up JSDOM to prevent memory leaks dom.window.close(); } } /** * Generate frontmatter for markdown file (single page) * @param {Object} metadata - Object with title, description * @param {string} urlPath - URL path for the page * @returns {string} YAML frontmatter as string */ function generateFrontmatter(metadata, urlPath) { const product = detectProduct(urlPath); const frontmatter = ['---']; frontmatter.push(`title: ${metadata.title}`); if (metadata.description) { frontmatter.push(`description: ${metadata.description}`); } frontmatter.push(`url: ${urlPath}`); if (product) { frontmatter.push(`product: ${product.name}`); if (product.version) { frontmatter.push(`version: ${product.version}`); } } frontmatter.push('---'); return frontmatter.join('\n'); } /** * Generate enhanced frontmatter for section aggregation * @param {Object} metadata - Object with title, description * @param {string} urlPath - URL path for the section * @param {Array} childPages - Array of child page objects with url and title * @returns {string} YAML frontmatter as string */ function generateSectionFrontmatter(metadata, urlPath, childPages) { const product = detectProduct(urlPath); const frontmatter = ['---']; frontmatter.push(`title: ${metadata.title}`); if (metadata.description) { frontmatter.push(`description: ${metadata.description}`); } frontmatter.push(`url: ${urlPath}`); frontmatter.push(`type: section`); frontmatter.push(`pages: ${childPages.length}`); // Add token estimate (rough: 4 chars per token) const contentLength = metadata.content?.length || 0; const childContentLength = childPages.reduce( (sum, child) => sum + (child.content?.length || 0), 0 ); const totalLength = contentLength + childContentLength; const estimatedTokens = Math.ceil(totalLength / 4); frontmatter.push(`estimated_tokens: ${estimatedTokens}`); if (product) { frontmatter.push(`product: ${product.name}`); if (product.version) { frontmatter.push(`version: ${product.version}`); } } // List child pages if (childPages.length > 0) { frontmatter.push(`child_pages:`); childPages.forEach((child) => { frontmatter.push(` - url: ${child.url}`); frontmatter.push(` title: ${child.title}`); }); } frontmatter.push('---'); return frontmatter.join('\n'); } /** * Convert HTML content to Markdown (single page) * @param {string} htmlContent - Raw HTML content * @param {string} urlPath - URL path for the page (for frontmatter) * @returns {Promise} Markdown content with frontmatter or null if conversion fails */ async function convertToMarkdown(htmlContent, urlPath) { await ensureProductDataInitialized(); const turndownService = createTurndownService(); const metadata = extractArticleContent(htmlContent, urlPath); if (!metadata) { return null; } // Convert HTML to markdown let markdown = turndownService.turndown(metadata.content); // Clean up excessive newlines and separator artifacts markdown = markdown .replace(/\n{3,}/g, '\n\n') .replace(/\* \* \*\s*\n\s*\* \* \*/g, '') .replace(/\* \* \*\s*$/g, '') .trim(); // Generate frontmatter const frontmatter = generateFrontmatter(metadata, urlPath); return `${frontmatter}\n\n${markdown}\n`; } /** * Convert section HTML with child pages to aggregated Markdown * @param {string} sectionHtml - HTML content of the section index page * @param {string} sectionUrlPath - URL path for the section * @param {Array} childHtmls - Array of objects with {html, url} for each child page * @returns {Promise} Aggregated markdown content or null if conversion fails */ async function convertSectionToMarkdown( sectionHtml, sectionUrlPath, childHtmls ) { await ensureProductDataInitialized(); const turndownService = createTurndownService(); // Extract section metadata and content const sectionMetadata = extractArticleContent(sectionHtml, sectionUrlPath); if (!sectionMetadata) { return null; } // Convert section content to markdown let sectionMarkdown = turndownService.turndown(sectionMetadata.content); sectionMarkdown = sectionMarkdown .replace(/\n{3,}/g, '\n\n') .replace(/\* \* \*\s*\n\s*\* \* \*/g, '') .replace(/\* \* \*\s*$/g, '') .trim(); // Process child pages const childContents = []; const childPageInfo = []; for (const { html, url } of childHtmls) { const childMetadata = extractArticleContent(html, url); if (childMetadata) { let childMarkdown = turndownService.turndown(childMetadata.content); childMarkdown = childMarkdown .replace(/\n{3,}/g, '\n\n') .replace(/\* \* \*\s*\n\s*\* \* \*/g, '') .replace(/\* \* \*\s*$/g, '') .trim(); // Remove the first h1 heading (page title) to avoid redundancy // since we're adding it as an h2 heading childMarkdown = childMarkdown.replace(/^#\s+.+?\n+/, ''); // Add child page title as heading childContents.push(`## ${childMetadata.title}\n\n${childMarkdown}`); // Track child page info for frontmatter childPageInfo.push({ url: url, title: childMetadata.title, content: childMarkdown, }); } } // Generate section frontmatter with child page info const frontmatter = generateSectionFrontmatter( { ...sectionMetadata, content: sectionMarkdown }, sectionUrlPath, childPageInfo ); // Combine section content with child pages const allContent = [sectionMarkdown, ...childContents].join('\n\n---\n\n'); return `${frontmatter}\n\n${allContent}\n`; } // Export all functions for CommonJS module.exports = { detectProduct, createTurndownService, extractArticleContent, generateFrontmatter, generateSectionFrontmatter, convertToMarkdown, convertSectionToMarkdown, };