#!/usr/bin/env node
/**
 * Build LLM-friendly Markdown from Hugo-generated HTML
 *
 * This script generates static .md files at build time for optimal performance.
 * Two-phase approach:
 *   1. Convert HTML → individual page markdown (memory-bounded parallelism)
 *   2. Combine pages → section bundles (fast string concatenation)
 */

import { glob } from 'glob';
import fs from 'fs/promises';
import { readFileSync } from 'fs';
import path from 'path';
import { fileURLToPath, pathToFileURL } from 'url';
import { dirname } from 'path';
import { createRequire } from 'module';
import yaml from 'js-yaml';
import pLimit from 'p-limit';

// Get __dirname equivalent in ESM
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Create require function for CommonJS modules
const require = createRequire(import.meta.url);
const { convertToMarkdown } = require('./lib/markdown-converter.cjs');

// ============================================================================
// CONFIGURATION
// ============================================================================

/**
 * Minimum file size threshold for processing HTML files.
 * Files smaller than this are assumed to be Hugo alias redirects and skipped.
 *
 * Hugo alias redirects are typically 300-400 bytes (simple meta refresh pages).
 * Content pages are typically 30KB-100KB+.
 *
 * Set to 0 to disable redirect detection (process all files).
 *
 * @default 1024 (1KB) - Safe threshold with large margin
 */
const MIN_HTML_SIZE_BYTES = 1024;

/**
 * Approximate character-to-token ratio for estimation.
 * Used to estimate token count from markdown content length.
 *
 * NOTE(review): not referenced in this file — presumably consumed by the
 * markdown converter or kept for future use; verify before removing.
 *
 * @default 4 - Rough heuristic (4 characters ā‰ˆ 1 token)
 */
const CHARS_PER_TOKEN = 4;

// ============================================================================
// PHASE 1: HTML → MARKDOWN CONVERSION
// ============================================================================

/**
 * Phase 1: Convert all HTML files to individual page markdown.
 * Uses memory-bounded parallelism to avoid OOM in CI.
 *
 * Writes an `index.md` next to each qualifying `index.html` under `public/`.
 *
 * @returns {Promise<{converted: number, skipped: number, errors: Array<{file: string, error: string}>}>}
 */
async function buildPageMarkdown() {
  console.log('šŸ“„ Converting HTML to Markdown (individual pages)...\n');
  const startTime = Date.now();

  // Find all HTML files
  const htmlFiles = await glob('public/**/index.html', {
    ignore: ['**/node_modules/**', '**/api-docs/**'],
  });
  console.log(`Found ${htmlFiles.length} HTML files\n`);

  // Memory-bounded concurrency
  // CircleCI medium (2GB RAM): 10 workers safe
  // Local development (16GB RAM): 20 workers faster
  const CONCURRENCY = process.env.CI ? 10 : 20;
  const limit = pLimit(CONCURRENCY);

  let converted = 0;
  let skipped = 0;
  const errors = [];

  // Map all files to limited-concurrency tasks
  const tasks = htmlFiles.map((htmlPath) =>
    limit(async () => {
      try {
        // Check file size before reading (skip Hugo alias redirects)
        if (MIN_HTML_SIZE_BYTES > 0) {
          const stats = await fs.stat(htmlPath);
          if (stats.size < MIN_HTML_SIZE_BYTES) {
            skipped++;
            return; // Skip redirect page
          }
        }

        // Read HTML
        const html = await fs.readFile(htmlPath, 'utf-8');

        // Derive URL path for frontmatter
        const urlPath = htmlPath
          .replace(/^public/, '')
          .replace(/\/index\.html$/, '/');

        // Convert to markdown (JSDOM + Turndown processing)
        const markdown = await convertToMarkdown(html, urlPath);
        if (!markdown) {
          skipped++;
          return;
        }

        // Write .md file next to .html
        const mdPath = htmlPath.replace(/index\.html$/, 'index.md');
        await fs.writeFile(mdPath, markdown, 'utf-8');
        converted++;

        // Progress logging (single-threaded event loop, so counter is safe)
        if (converted % 100 === 0) {
          const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
          const rate = ((converted / (Date.now() - startTime)) * 1000).toFixed(
            0
          );
          const memUsed = (
            process.memoryUsage().heapUsed / 1024 / 1024
          ).toFixed(0);
          console.log(
            `  āœ“ ${converted}/${htmlFiles.length} (${rate}/sec, ${elapsed}s elapsed, ${memUsed}MB memory)`
          );
        }
      } catch (error) {
        errors.push({ file: htmlPath, error: error.message });
        console.error(`  āœ— ${htmlPath}: ${error.message}`);
      }
    })
  );

  // Execute all tasks (p-limit ensures only CONCURRENCY run simultaneously)
  await Promise.all(tasks);

  const duration = ((Date.now() - startTime) / 1000).toFixed(1);
  const rate = ((converted / (Date.now() - startTime)) * 1000).toFixed(0);

  console.log(`\nāœ… Converted ${converted} files (${rate}/sec)`);
  if (MIN_HTML_SIZE_BYTES > 0) {
    console.log(
      `ā­ļø  Skipped ${skipped} files (Hugo alias redirects < ${MIN_HTML_SIZE_BYTES} bytes)`
    );
  } else {
    console.log(`ā­ļø  Skipped ${skipped} files (no article content)`);
  }
  console.log(`ā±ļø  Phase 1 time: ${duration}s`);
  if (errors.length > 0) {
    console.log(`āš ļø  ${errors.length} errors occurred`);
  }
  console.log('');

  return { converted, skipped, errors };
}

/**
 * Phase 2: Build section bundles by combining individual markdown files.
 * Fast string concatenation with minimal memory usage.
 *
 * Writes an `index.section.md` next to each section's `index.md`.
 *
 * @returns {Promise<{built: number, errors: Array<{section: string, error: string}>}>}
 */
async function buildSectionBundles() {
  console.log('šŸ“¦ Building section bundles...\n');
  const startTime = Date.now();

  // Find all sections (directories with index.md + child index.md files)
  const sections = await findSections();
  console.log(`Found ${sections.length} sections\n`);

  let built = 0;
  const errors = [];

  // High concurrency OK - just string operations, minimal memory
  const limit = pLimit(50);

  const tasks = sections.map((section) =>
    limit(async () => {
      try {
        // Read parent markdown
        const parentMd = await fs.readFile(section.mdPath, 'utf-8');

        // Read all child markdowns
        const childMds = await Promise.all(
          section.children.map(async (child) => ({
            markdown: await fs.readFile(child.mdPath, 'utf-8'),
            url: child.url,
            title: child.title,
          }))
        );

        // Combine markdown files (string manipulation only)
        const combined = combineMarkdown(parentMd, childMds, section.url);

        // Write section bundle
        const sectionMdPath = section.mdPath.replace(
          /index\.md$/,
          'index.section.md'
        );
        await fs.writeFile(sectionMdPath, combined, 'utf-8');
        built++;

        if (built % 50 === 0) {
          console.log(`  āœ“ Built ${built}/${sections.length} sections`);
        }
      } catch (error) {
        errors.push({ section: section.url, error: error.message });
        console.error(`  āœ— ${section.url}: ${error.message}`);
      }
    })
  );

  await Promise.all(tasks);

  const duration = ((Date.now() - startTime) / 1000).toFixed(1);

  console.log(`\nāœ… Built ${built} section bundles`);
  console.log(`ā±ļø  Phase 2 time: ${duration}s`);
  if (errors.length > 0) {
    console.log(`āš ļø  ${errors.length} errors occurred`);
  }
  console.log('');

  return { built, errors };
}

/**
 * Find all sections (parent pages with child pages).
 *
 * A section is a directory containing an `index.md` whose immediate
 * subdirectories also contain `index.md` files.
 *
 * @returns {Promise<Array<{mdPath: string, url: string, children: Array<{mdPath: string, url: string, title: string}>}>>}
 */
async function findSections() {
  const allMdFiles = await glob('public/**/index.md');
  const sections = [];

  for (const mdPath of allMdFiles) {
    const dir = path.dirname(mdPath);

    // Find child directories with index.md.
    // Use posix join: glob patterns require forward slashes, and path.join
    // would produce backslashes on Windows and silently match nothing.
    const childMdFiles = await glob(path.posix.join(dir, '*/index.md'));
    if (childMdFiles.length === 0) continue; // Not a section

    sections.push({
      mdPath: mdPath,
      url: dir.replace(/^public/, '') + '/',
      children: childMdFiles.map((childMdPath) => ({
        mdPath: childMdPath,
        url: path.dirname(childMdPath).replace(/^public/, '') + '/',
        title: extractTitleFromMd(childMdPath),
      })),
    });
  }

  return sections;
}

/**
 * Extract title from markdown file (quick regex, no full YAML parsing).
 *
 * Isolates the leading `---` frontmatter block first, then looks for a
 * `title:` key anchored at the start of a line — the previous single-regex
 * approach also matched keys merely *ending* in "title:" (e.g. `subtitle:`)
 * and could match "title:" in the body text.
 *
 * @param {string} mdPath - Path to the markdown file
 * @returns {string} The title, or 'Untitled' when missing/unreadable
 */
function extractTitleFromMd(mdPath) {
  try {
    const content = readFileSync(mdPath, 'utf-8');
    const fmMatch = content.match(/^---\n([\s\S]*?)\n---/);
    if (!fmMatch) return 'Untitled';
    const titleMatch = fmMatch[1].match(/^title:[ \t]*(.+?)[ \t]*$/m);
    return titleMatch ? titleMatch[1].trim() : 'Untitled';
  } catch {
    return 'Untitled';
  }
}

/**
 * Combine parent and child markdown into a section bundle.
 *
 * @param {string} parentMd - Parent page markdown (with frontmatter)
 * @param {Array<{markdown: string, url: string, title: string}>} childMds - Child pages
 * @param {string} sectionUrl - Fallback URL when parent frontmatter lacks one
 * @returns {string} Combined markdown document with section frontmatter
 */
function combineMarkdown(parentMd, childMds, sectionUrl) {
  // Parse parent frontmatter + content
  const parent = parseMarkdown(parentMd);

  // Parse child frontmatter + content
  const children = childMds.map(({ markdown, url, title }) => {
    const child = parseMarkdown(markdown);

    // Remove h1 heading (will be added as h2 to avoid duplicate)
    const contentWithoutH1 = child.content.replace(/^#\s+.+?\n+/, '');

    return {
      title: child.frontmatter.title || title,
      url: child.frontmatter.url || url, // Use full URL from frontmatter
      content: `## ${child.frontmatter.title || title}\n\n${contentWithoutH1}`,
      tokens: child.frontmatter.estimated_tokens || 0,
    };
  });

  // Calculate total tokens
  const totalTokens =
    (parent.frontmatter.estimated_tokens || 0) +
    children.reduce((sum, c) => sum + c.tokens, 0);

  // Sanitize description (remove newlines, truncate to reasonable length)
  let description = parent.frontmatter.description || '';
  description = description
    .replace(/\s+/g, ' ') // Replace all whitespace (including newlines) with single space
    .trim()
    .substring(0, 500); // Truncate to 500 characters max

  // Build section frontmatter object (will be serialized to YAML)
  const frontmatterObj = {
    title: parent.frontmatter.title,
    description: description,
    url: parent.frontmatter.url || sectionUrl, // Use full URL from parent frontmatter
    product: parent.frontmatter.product || '',
    type: 'section',
    pages: children.length + 1,
    estimated_tokens: totalTokens,
    child_pages: children.map((c) => ({
      url: c.url,
      title: c.title,
    })),
  };

  // Serialize to YAML (handles special characters properly)
  const sectionFrontmatter =
    '---\n' +
    yaml
      .dump(frontmatterObj, {
        lineWidth: -1, // Disable line wrapping
        noRefs: true, // Disable anchors/aliases
      })
      .trim() +
    '\n---';

  // Combine all content
  const allContent = [parent.content, ...children.map((c) => c.content)].join(
    '\n\n---\n\n'
  );

  return `${sectionFrontmatter}\n\n${allContent}\n`;
}

/**
 * Parse markdown into frontmatter + content.
 *
 * Expects a leading `---\n...\n---\n\n` frontmatter block; anything else
 * (or unparseable YAML) falls back to `{frontmatter: {}, content: markdown}`.
 *
 * @param {string} markdown - Full markdown document
 * @returns {{frontmatter: object, content: string}}
 */
function parseMarkdown(markdown) {
  const match = markdown.match(/^---\n([\s\S]+?)\n---\n\n([\s\S]+)$/);
  if (!match) {
    return { frontmatter: {}, content: markdown };
  }

  try {
    const frontmatter = yaml.load(match[1]);
    const content = match[2];
    return { frontmatter, content };
  } catch (error) {
    console.warn('Failed to parse frontmatter:', error.message);
    return { frontmatter: {}, content: markdown };
  }
}

// ============================================================================
// COMMAND-LINE ARGUMENT PARSING
// ============================================================================

/**
 * Parse command-line arguments.
 *
 * Supports `-e <env>` / `--env <env>` to select the Hugo environment.
 *
 * @returns {{environment: string|null}}
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const options = {
    environment: null,
  };

  for (let i = 0; i < args.length; i++) {
    if ((args[i] === '-e' || args[i] === '--env') && args[i + 1]) {
      options.environment = args[++i];
    }
  }

  return options;
}

/**
 * Main execution.
 *
 * @param {{environment: string|null}} [cliOptions] - Parsed CLI options
 */
async function main(cliOptions = { environment: null }) {
  console.log('šŸš€ Building LLM-friendly Markdown\n');

  // Show environment if specified
  if (cliOptions.environment) {
    console.log(`šŸŒ Environment: ${cliOptions.environment}\n`);
  }

  console.log('════════════════════════════════\n');

  const overallStart = Date.now();

  // Phase 1: Generate individual page markdown
  const pageResults = await buildPageMarkdown();

  // Phase 2: Build section bundles
  const sectionResults = await buildSectionBundles();

  // Summary
  const totalDuration = ((Date.now() - overallStart) / 1000).toFixed(1);
  const totalFiles = pageResults.converted + sectionResults.built;

  console.log('════════════════════════════════\n');
  console.log('šŸ“Š Summary:');
  console.log(`   Pages: ${pageResults.converted}`);
  console.log(`   Sections: ${sectionResults.built}`);
  console.log(`   Total: ${totalFiles} markdown files`);
  console.log(`   Skipped: ${pageResults.skipped} (no article content)`);

  const totalErrors = pageResults.errors.length + sectionResults.errors.length;
  if (totalErrors > 0) {
    console.log(`   Errors: ${totalErrors}`);
  }

  console.log(`   Time: ${totalDuration}s\n`);

  // Exit with error code if there were errors
  if (totalErrors > 0) {
    process.exit(1);
  }
}

// Run only when invoked directly (`node build-llms-md.js ...`).
// Previously main() ran unconditionally at module level, so importing the
// exported functions for testing kicked off a full build and mutated
// process.env.HUGO_ENV as a side effect.
const isDirectRun =
  process.argv[1] !== undefined &&
  import.meta.url === pathToFileURL(process.argv[1]).href;

if (isDirectRun) {
  // Parse arguments and set environment
  const cliOptions = parseArgs();
  if (cliOptions.environment) {
    process.env.HUGO_ENV = cliOptions.environment;
  }

  main(cliOptions).catch((error) => {
    console.error('Fatal error:', error);
    process.exit(1);
  });
}

// Export functions for testing
export {
  buildPageMarkdown,
  buildSectionBundles,
  findSections,
  combineMarkdown,
  parseMarkdown,
};