#!/usr/bin/env node
/**
* Build LLM-friendly Markdown from Hugo-generated HTML
*
* This script generates static .md files at build time for optimal performance.
* Two-phase approach:
* 1. Convert HTML → individual page markdown (memory-bounded parallelism)
* 2. Combine pages → section bundles (fast string concatenation)
*
*/
import { glob } from 'glob';
import fs from 'fs/promises';
import { readFileSync } from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { dirname } from 'path';
import { createRequire } from 'module';
import yaml from 'js-yaml';
import pLimit from 'p-limit';
// Get __dirname equivalent in ESM
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Create require function for CommonJS modules
const require = createRequire(import.meta.url);
const { convertToMarkdown } = require('./lib/markdown-converter.cjs');
// ============================================================================
// CONFIGURATION
// ============================================================================
/**
 * Minimum file size threshold for processing HTML files.
 * Files smaller than this are assumed to be Hugo alias redirects and skipped.
 *
 * Hugo alias redirects are typically 300-400 bytes (simple meta refresh pages).
 * Content pages are typically 30KB-100KB+.
 *
 * Set to 0 to disable redirect detection (process all files).
 *
 * @default 1024 (1KB) - Safe threshold with large margin
 */
const MIN_HTML_SIZE_BYTES = 1024;
/**
 * Approximate character-to-token ratio for estimation.
 * Used to estimate token count from markdown content length.
 *
 * NOTE(review): this constant is not referenced anywhere in this file —
 * presumably the token estimation lives in lib/markdown-converter.cjs or
 * this is left over from a refactor; verify before relying on it.
 *
 * @default 4 - Rough heuristic (4 characters ≈ 1 token)
 */
const CHARS_PER_TOKEN = 4;
// ============================================================================
// PHASE 1: HTML → MARKDOWN CONVERSION
// ============================================================================
/**
 * Phase 1: Convert every Hugo-generated HTML page into a sibling Markdown file.
 *
 * Runs with memory-bounded concurrency (p-limit) so that JSDOM/Turndown
 * conversion does not exhaust RAM on small CI runners. Files below
 * MIN_HTML_SIZE_BYTES are treated as Hugo alias redirects and skipped.
 *
 * @returns {Promise<{converted: number, skipped: number, errors: Array<{file: string, error: string}>}>}
 *   Counts of written/skipped pages plus per-file error records.
 */
async function buildPageMarkdown() {
  console.log('📄 Converting HTML to Markdown (individual pages)...\n');
  const startTime = Date.now();

  // Every rendered page, excluding generated API docs and node_modules.
  const htmlFiles = await glob('public/**/index.html', {
    ignore: ['**/node_modules/**', '**/api-docs/**'],
  });
  console.log(`Found ${htmlFiles.length} HTML files\n`);

  // Memory-bounded concurrency:
  // - CircleCI medium (2GB RAM): 10 workers is safe
  // - Local development (16GB RAM): 20 workers is faster
  const limit = pLimit(process.env.CI ? 10 : 20);

  let converted = 0;
  let skipped = 0;
  const errors = [];

  // Convert a single HTML page; updates the shared counters above.
  const processFile = async (htmlPath) => {
    try {
      // Cheap size check before reading: alias redirects are ~300-400
      // bytes, real content pages are 30KB+.
      if (MIN_HTML_SIZE_BYTES > 0) {
        const { size } = await fs.stat(htmlPath);
        if (size < MIN_HTML_SIZE_BYTES) {
          skipped++;
          return;
        }
      }

      const html = await fs.readFile(htmlPath, 'utf-8');

      // Site-relative URL recorded in the generated page's frontmatter.
      const urlPath = htmlPath
        .replace(/^public/, '')
        .replace(/\/index\.html$/, '/');

      // JSDOM + Turndown processing (the expensive step).
      const markdown = await convertToMarkdown(html, urlPath);
      if (!markdown) {
        skipped++;
        return;
      }

      // Write the .md file next to its source .html.
      const mdPath = htmlPath.replace(/index\.html$/, 'index.md');
      await fs.writeFile(mdPath, markdown, 'utf-8');
      converted++;

      // Periodic progress report with throughput and heap usage.
      if (converted % 100 === 0) {
        const elapsedMs = Date.now() - startTime;
        const elapsed = (elapsedMs / 1000).toFixed(1);
        const rate = ((converted / elapsedMs) * 1000).toFixed(0);
        const memUsed = (process.memoryUsage().heapUsed / 1024 / 1024).toFixed(
          0
        );
        console.log(
          `${converted}/${htmlFiles.length} (${rate}/sec, ${elapsed}s elapsed, ${memUsed}MB memory)`
        );
      }
    } catch (error) {
      errors.push({ file: htmlPath, error: error.message });
      console.error(`${htmlPath}: ${error.message}`);
    }
  };

  // p-limit guarantees at most the configured number run simultaneously.
  await Promise.all(
    htmlFiles.map((htmlPath) => limit(() => processFile(htmlPath)))
  );

  const totalMs = Date.now() - startTime;
  const duration = (totalMs / 1000).toFixed(1);
  const rate = ((converted / totalMs) * 1000).toFixed(0);
  console.log(`\n✅ Converted ${converted} files (${rate}/sec)`);
  if (MIN_HTML_SIZE_BYTES > 0) {
    console.log(
      `⏭️ Skipped ${skipped} files (Hugo alias redirects < ${MIN_HTML_SIZE_BYTES} bytes)`
    );
  } else {
    console.log(`⏭️ Skipped ${skipped} files (no article content)`);
  }
  console.log(`⏱️ Phase 1 time: ${duration}s`);
  if (errors.length > 0) {
    console.log(`⚠️ ${errors.length} errors occurred`);
  }
  console.log('');
  return { converted, skipped, errors };
}
/**
 * Phase 2: Concatenate each section's parent page and child pages into a
 * single `index.section.md` bundle next to the parent's index.md.
 *
 * Pure string manipulation over files already written by Phase 1, so a
 * high concurrency limit is safe.
 *
 * @returns {Promise<{built: number, errors: Array<{section: string, error: string}>}>}
 *   Count of bundles written plus per-section error records.
 */
async function buildSectionBundles() {
  console.log('📦 Building section bundles...\n');
  const startTime = Date.now();

  // Sections = parent pages that have child pages with their own index.md.
  const sections = await findSections();
  console.log(`Found ${sections.length} sections\n`);

  let built = 0;
  const errors = [];

  // String operations only — minimal memory, so 50 workers is fine.
  const limit = pLimit(50);

  // Build one section bundle; updates the shared counters above.
  const bundleSection = async (section) => {
    try {
      const parentMd = await fs.readFile(section.mdPath, 'utf-8');

      // Load every child page's markdown in parallel.
      const childMds = await Promise.all(
        section.children.map(async ({ mdPath, url, title }) => ({
          markdown: await fs.readFile(mdPath, 'utf-8'),
          url,
          title,
        }))
      );

      // Merge frontmatter + content (string manipulation only).
      const combined = combineMarkdown(parentMd, childMds, section.url);

      // The bundle lives beside the parent's index.md.
      const sectionMdPath = section.mdPath.replace(
        /index\.md$/,
        'index.section.md'
      );
      await fs.writeFile(sectionMdPath, combined, 'utf-8');
      built++;

      if (built % 50 === 0) {
        console.log(` ✓ Built ${built}/${sections.length} sections`);
      }
    } catch (error) {
      errors.push({ section: section.url, error: error.message });
      console.error(`${section.url}: ${error.message}`);
    }
  };

  await Promise.all(sections.map((section) => limit(() => bundleSection(section))));

  const duration = ((Date.now() - startTime) / 1000).toFixed(1);
  console.log(`\n✅ Built ${built} section bundles`);
  console.log(`⏱️ Phase 2 time: ${duration}s`);
  if (errors.length > 0) {
    console.log(`⚠️ ${errors.length} errors occurred`);
  }
  console.log('');
  return { built, errors };
}
/**
 * Find all "sections": pages whose directory contains child directories
 * that also have an index.md.
 *
 * Fixes over the previous version:
 * - Child-directory globs now run in parallel instead of one serial
 *   `await glob` per page (O(n) round trips).
 * - The child pattern is always built with forward slashes; glob patterns
 *   must never contain `\`, so `path.join` broke pattern matching on
 *   Windows.
 *
 * @returns {Promise<Array<{mdPath: string, url: string, children: Array<{mdPath: string, url: string, title: string}>}>>}
 *   One entry per section, in the same order glob returned the parents.
 */
async function findSections() {
  const allMdFiles = await glob('public/**/index.md');

  // Probe every page for children concurrently; non-sections map to null.
  const candidates = await Promise.all(
    allMdFiles.map(async (mdPath) => {
      const dir = path.dirname(mdPath);
      // glob requires forward slashes in patterns on every platform.
      const pattern = `${dir.split(path.sep).join('/')}/*/index.md`;
      const childMdFiles = await glob(pattern);
      if (childMdFiles.length === 0) return null; // leaf page, not a section
      return {
        mdPath: mdPath,
        url: dir.replace(/^public/, '') + '/',
        children: childMdFiles.map((childMdPath) => ({
          mdPath: childMdPath,
          url: path.dirname(childMdPath).replace(/^public/, '') + '/',
          title: extractTitleFromMd(childMdPath),
        })),
      };
    })
  );

  return candidates.filter((section) => section !== null);
}
/**
 * Extract the frontmatter `title` from a markdown file (quick regex, no
 * full YAML parsing).
 *
 * Fix: the previous regex (`/^---[\s\S]+?title:\s*(.+?)$/m`) matched the
 * first occurrence of the substring "title:" anywhere after the opening
 * `---` — including inside keys like `subtitle:` or in body text. The
 * frontmatter block is now isolated first, and `title:` must start a line.
 *
 * @param {string} mdPath - Path to a markdown file with YAML frontmatter
 * @returns {string} The title, or 'Untitled' if the file is unreadable,
 *   has no frontmatter, or has no title key.
 */
function extractTitleFromMd(mdPath) {
  try {
    const content = readFileSync(mdPath, 'utf-8');
    // Isolate the frontmatter between the opening and closing `---`.
    const fm = content.match(/^---\n([\s\S]*?)\n---/);
    if (!fm) return 'Untitled';
    // `title:` must be at the start of a line (rejects e.g. `subtitle:`).
    const titleMatch = fm[1].match(/^title:\s*(.+?)\s*$/m);
    return titleMatch ? titleMatch[1].trim() : 'Untitled';
  } catch {
    // Unreadable file (missing, permissions, ...) — fall back silently.
    return 'Untitled';
  }
}
/**
 * Merge a parent page and its child pages into one section-bundle markdown
 * document with combined YAML frontmatter.
 *
 * Child h1 headings are stripped and re-emitted as h2 so the bundle has a
 * single top-level heading; pages are separated by `---` thematic breaks.
 *
 * @param {string} parentMd - Parent page markdown (with frontmatter)
 * @param {Array<{markdown: string, url: string, title: string}>} childMds
 *   Child page markdown plus fallback url/title from findSections
 * @param {string} sectionUrl - Fallback URL if parent frontmatter has none
 * @returns {string} Complete section bundle (frontmatter + combined body)
 */
function combineMarkdown(parentMd, childMds, sectionUrl) {
  const parent = parseMarkdown(parentMd);

  // Normalize each child: demote its h1 to an h2, prefer frontmatter
  // metadata over the fallbacks supplied by findSections.
  const children = childMds.map(({ markdown, url, title }) => {
    const page = parseMarkdown(markdown);
    const heading = page.frontmatter.title || title;
    // Drop the leading h1 (re-added as h2 to avoid a duplicate heading).
    const body = page.content.replace(/^#\s+.+?\n+/, '');
    return {
      title: heading,
      url: page.frontmatter.url || url, // prefer the full URL from frontmatter
      content: `## ${heading}\n\n${body}`,
      tokens: page.frontmatter.estimated_tokens || 0,
    };
  });

  // Total token estimate = parent + sum of children.
  const childTokens = children.reduce((sum, c) => sum + c.tokens, 0);
  const totalTokens = (parent.frontmatter.estimated_tokens || 0) + childTokens;

  // Single-line, length-capped description for the bundle frontmatter.
  const description = (parent.frontmatter.description || '')
    .replace(/\s+/g, ' ') // collapse all whitespace (incl. newlines)
    .trim()
    .substring(0, 500); // cap at 500 characters

  // Section frontmatter (serialized to YAML below).
  const frontmatterObj = {
    title: parent.frontmatter.title,
    description: description,
    url: parent.frontmatter.url || sectionUrl, // prefer parent frontmatter URL
    product: parent.frontmatter.product || '',
    type: 'section',
    pages: children.length + 1,
    estimated_tokens: totalTokens,
    child_pages: children.map((c) => ({
      url: c.url,
      title: c.title,
    })),
  };

  // YAML serialization handles special characters properly.
  const yamlBody = yaml
    .dump(frontmatterObj, {
      lineWidth: -1, // disable line wrapping
      noRefs: true, // disable anchors/aliases
    })
    .trim();
  const sectionFrontmatter = `---\n${yamlBody}\n---`;

  // Parent body first, then each child, separated by thematic breaks.
  const separator = '\n\n---\n\n';
  const allContent = [parent.content, ...children.map((c) => c.content)].join(
    separator
  );

  return `${sectionFrontmatter}\n\n${allContent}\n`;
}
/**
 * Split a markdown document into YAML frontmatter and body content.
 *
 * Fix: the previous regex required exactly one blank line after the
 * closing `---` and a non-empty body (`([\s\S]+)$`), so a page with an
 * empty body — or a single newline after the delimiter — failed to match
 * and its raw frontmatter leaked into `content`. Now any run of newlines
 * after the delimiter is accepted and the body may be empty.
 *
 * @param {string} markdown - Full markdown document
 * @returns {{frontmatter: Object, content: string}} Parsed pieces; when
 *   frontmatter is missing or its YAML is malformed, `frontmatter` is `{}`
 *   and `content` is the original input.
 */
function parseMarkdown(markdown) {
  const match = markdown.match(/^---\n([\s\S]+?)\n---\n+([\s\S]*)$/);
  if (!match) {
    return { frontmatter: {}, content: markdown };
  }
  try {
    return { frontmatter: yaml.load(match[1]), content: match[2] };
  } catch (error) {
    // Malformed YAML: warn and treat the whole document as content.
    console.warn('Failed to parse frontmatter:', error.message);
    return { frontmatter: {}, content: markdown };
  }
}
// ============================================================================
// COMMAND-LINE ARGUMENT PARSING
// ============================================================================
/**
 * Parse command-line arguments.
 *
 * Recognizes `-e <env>` / `--env <env>`; the last occurrence wins. A flag
 * with no following value is ignored.
 *
 * @returns {{environment: string|null}} Parsed options
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const options = { environment: null };
  let index = 0;
  while (index < args.length) {
    const flag = args[index];
    const value = args[index + 1];
    if ((flag === '-e' || flag === '--env') && value) {
      options.environment = value;
      index += 2; // consume flag + value
    } else {
      index += 1;
    }
  }
  return options;
}
// Parse arguments and set environment.
// Runs at module load so HUGO_ENV is set before either build phase starts.
// NOTE(review): this executes on import as well as direct invocation —
// verify that importing this module for tests should also set HUGO_ENV.
const cliOptions = parseArgs();
if (cliOptions.environment) {
  // Propagate -e/--env to any Hugo-aware tooling via the environment.
  process.env.HUGO_ENV = cliOptions.environment;
}
/**
 * Main entry point: run both build phases and print a combined summary.
 * Exits the process with code 1 if either phase recorded errors.
 */
async function main() {
  console.log('🚀 Building LLM-friendly Markdown\n');
  if (cliOptions.environment) {
    // Echo the environment selected via -e/--env.
    console.log(`🌍 Environment: ${cliOptions.environment}\n`);
  }
  console.log('════════════════════════════════\n');

  const overallStart = Date.now();

  // Phase 2 consumes the per-page .md files Phase 1 writes, so these
  // must stay sequential.
  const { converted, skipped, errors: pageErrors } = await buildPageMarkdown();
  const { built, errors: sectionErrors } = await buildSectionBundles();

  const totalDuration = ((Date.now() - overallStart) / 1000).toFixed(1);
  const totalErrors = pageErrors.length + sectionErrors.length;

  console.log('════════════════════════════════\n');
  console.log('📊 Summary:');
  console.log(` Pages: ${converted}`);
  console.log(` Sections: ${built}`);
  console.log(` Total: ${converted + built} markdown files`);
  console.log(` Skipped: ${skipped} (no article content)`);
  if (totalErrors > 0) {
    console.log(` Errors: ${totalErrors}`);
  }
  console.log(` Time: ${totalDuration}s\n`);

  // Non-zero exit so CI fails when anything failed to build.
  if (totalErrors > 0) {
    process.exit(1);
  }
}
// Run if called directly
main().catch((error) => {
console.error('Fatal error:', error);
process.exit(1);
});
// Export functions for testing
export {
buildPageMarkdown,
buildSectionBundles,
findSections,
combineMarkdown,
parseMarkdown,
};