#!/usr/bin/env node
/**
* Build LLM-friendly Markdown from Hugo-generated HTML
*
* This script generates static .md files at build time for optimal performance.
* Two-phase approach:
* 1. Convert HTML → individual page markdown (memory-bounded parallelism)
* 2. Combine pages → section bundles (fast string concatenation)
*
*/
import { glob } from 'glob';
import fs from 'fs/promises';
import { readFileSync } from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { dirname } from 'path';
import { createRequire } from 'module';
import yaml from 'js-yaml';
import pLimit from 'p-limit';
// Get __dirname equivalent in ESM
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Create require function for CommonJS modules
const require = createRequire(import.meta.url);
const { convertToMarkdown } = require('./lib/markdown-converter.cjs');
// ============================================================================
// CONFIGURATION
// ============================================================================
/**
 * Minimum file size threshold for processing HTML files.
 * Files smaller than this are assumed to be Hugo alias redirects and skipped.
 *
 * Hugo alias redirects are typically 300-400 bytes (simple meta refresh pages).
 * Content pages are typically 30KB-100KB+.
 *
 * Set to 0 to disable redirect detection (process all files).
 *
 * @default 1024 (1KB) - Safe threshold with large margin
 */
const MIN_HTML_SIZE_BYTES = 1024;
/**
 * Approximate character-to-token ratio for estimation.
 * Used to estimate token count from markdown content length.
 *
 * NOTE(review): this constant is not referenced anywhere in this file —
 * presumably the token estimation lives in lib/markdown-converter.cjs or
 * this is left over from a refactor; verify before relying on it.
 *
 * @default 4 - Rough heuristic (4 characters ≈ 1 token)
 */
const CHARS_PER_TOKEN = 4;
// ============================================================================
// PHASE 1: HTML → MARKDOWN CONVERSION
// ============================================================================
/**
 * Phase 1: Convert every Hugo-generated HTML page into a sibling Markdown file.
 *
 * Runs with memory-bounded concurrency (p-limit) so that JSDOM/Turndown
 * conversion does not exhaust RAM on small CI runners. Files below
 * MIN_HTML_SIZE_BYTES are treated as Hugo alias redirects and skipped.
 *
 * @returns {Promise<{converted: number, skipped: number, errors: Array<{file: string, error: string}>}>}
 *   Counts of written/skipped pages plus per-file error records.
 */
async function buildPageMarkdown() {
  console.log('📄 Converting HTML to Markdown (individual pages)...\n');
  const startTime = Date.now();

  // Every rendered page, excluding generated API docs and node_modules.
  const htmlFiles = await glob('public/**/index.html', {
    ignore: ['**/node_modules/**', '**/api-docs/**'],
  });
  console.log(`Found ${htmlFiles.length} HTML files\n`);

  // Memory-bounded concurrency:
  // - CircleCI medium (2GB RAM): 10 workers is safe
  // - Local development (16GB RAM): 20 workers is faster
  const limit = pLimit(process.env.CI ? 10 : 20);

  let converted = 0;
  let skipped = 0;
  const errors = [];

  // Convert a single HTML page; updates the shared counters above.
  const processFile = async (htmlPath) => {
    try {
      // Cheap size check before reading: alias redirects are ~300-400
      // bytes, real content pages are 30KB+.
      if (MIN_HTML_SIZE_BYTES > 0) {
        const { size } = await fs.stat(htmlPath);
        if (size < MIN_HTML_SIZE_BYTES) {
          skipped++;
          return;
        }
      }

      const html = await fs.readFile(htmlPath, 'utf-8');

      // Site-relative URL recorded in the generated page's frontmatter.
      const urlPath = htmlPath
        .replace(/^public/, '')
        .replace(/\/index\.html$/, '/');

      // JSDOM + Turndown processing (the expensive step).
      const markdown = await convertToMarkdown(html, urlPath);
      if (!markdown) {
        skipped++;
        return;
      }

      // Write the .md file next to its source .html.
      const mdPath = htmlPath.replace(/index\.html$/, 'index.md');
      await fs.writeFile(mdPath, markdown, 'utf-8');
      converted++;

      // Periodic progress report with throughput and heap usage.
      if (converted % 100 === 0) {
        const elapsedMs = Date.now() - startTime;
        const elapsed = (elapsedMs / 1000).toFixed(1);
        const rate = ((converted / elapsedMs) * 1000).toFixed(0);
        const memUsed = (process.memoryUsage().heapUsed / 1024 / 1024).toFixed(
          0
        );
        console.log(
          `${converted}/${htmlFiles.length} (${rate}/sec, ${elapsed}s elapsed, ${memUsed}MB memory)`
        );
      }
    } catch (error) {
      errors.push({ file: htmlPath, error: error.message });
      console.error(`${htmlPath}: ${error.message}`);
    }
  };

  // p-limit guarantees at most the configured number run simultaneously.
  await Promise.all(
    htmlFiles.map((htmlPath) => limit(() => processFile(htmlPath)))
  );

  const totalMs = Date.now() - startTime;
  const duration = (totalMs / 1000).toFixed(1);
  const rate = ((converted / totalMs) * 1000).toFixed(0);
  console.log(`\n✅ Converted ${converted} files (${rate}/sec)`);
  if (MIN_HTML_SIZE_BYTES > 0) {
    console.log(
      `⏭️ Skipped ${skipped} files (Hugo alias redirects < ${MIN_HTML_SIZE_BYTES} bytes)`
    );
  } else {
    console.log(`⏭️ Skipped ${skipped} files (no article content)`);
  }
  console.log(`⏱️ Phase 1 time: ${duration}s`);
  if (errors.length > 0) {
    console.log(`⚠️ ${errors.length} errors occurred`);
  }
  console.log('');
  return { converted, skipped, errors };
}
/**
 * Phase 2: Concatenate each section's parent page and child pages into a
 * single `index.section.md` bundle next to the parent's index.md.
 *
 * Pure string manipulation over files already written by Phase 1, so a
 * high concurrency limit is safe.
 *
 * @returns {Promise<{built: number, errors: Array<{section: string, error: string}>}>}
 *   Count of bundles written plus per-section error records.
 */
async function buildSectionBundles() {
  console.log('📦 Building section bundles...\n');
  const startTime = Date.now();

  // Sections = parent pages that have child pages with their own index.md.
  const sections = await findSections();
  console.log(`Found ${sections.length} sections\n`);

  let built = 0;
  const errors = [];

  // String operations only — minimal memory, so 50 workers is fine.
  const limit = pLimit(50);

  // Build one section bundle; updates the shared counters above.
  const bundleSection = async (section) => {
    try {
      const parentMd = await fs.readFile(section.mdPath, 'utf-8');

      // Load every child page's markdown in parallel.
      const childMds = await Promise.all(
        section.children.map(async ({ mdPath, url, title }) => ({
          markdown: await fs.readFile(mdPath, 'utf-8'),
          url,
          title,
        }))
      );

      // Merge frontmatter + content (string manipulation only).
      const combined = combineMarkdown(parentMd, childMds, section.url);

      // The bundle lives beside the parent's index.md.
      const sectionMdPath = section.mdPath.replace(
        /index\.md$/,
        'index.section.md'
      );
      await fs.writeFile(sectionMdPath, combined, 'utf-8');
      built++;

      if (built % 50 === 0) {
        console.log(` ✓ Built ${built}/${sections.length} sections`);
      }
    } catch (error) {
      errors.push({ section: section.url, error: error.message });
      console.error(`${section.url}: ${error.message}`);
    }
  };

  await Promise.all(sections.map((section) => limit(() => bundleSection(section))));

  const duration = ((Date.now() - startTime) / 1000).toFixed(1);
  console.log(`\n✅ Built ${built} section bundles`);
  console.log(`⏱️ Phase 2 time: ${duration}s`);
  if (errors.length > 0) {
    console.log(`⚠️ ${errors.length} errors occurred`);
  }
  console.log('');
  return { built, errors };
}
/**
 * Find all "sections": pages whose directory contains child directories
 * that also have an index.md.
 *
 * Fixes over the previous version:
 * - Child-directory globs now run in parallel instead of one serial
 *   `await glob` per page (O(n) round trips).
 * - The child pattern is always built with forward slashes; glob patterns
 *   must never contain `\`, so `path.join` broke pattern matching on
 *   Windows.
 *
 * @returns {Promise<Array<{mdPath: string, url: string, children: Array<{mdPath: string, url: string, title: string}>}>>}
 *   One entry per section, in the same order glob returned the parents.
 */
async function findSections() {
  const allMdFiles = await glob('public/**/index.md');

  // Probe every page for children concurrently; non-sections map to null.
  const candidates = await Promise.all(
    allMdFiles.map(async (mdPath) => {
      const dir = path.dirname(mdPath);
      // glob requires forward slashes in patterns on every platform.
      const pattern = `${dir.split(path.sep).join('/')}/*/index.md`;
      const childMdFiles = await glob(pattern);
      if (childMdFiles.length === 0) return null; // leaf page, not a section
      return {
        mdPath: mdPath,
        url: dir.replace(/^public/, '') + '/',
        children: childMdFiles.map((childMdPath) => ({
          mdPath: childMdPath,
          url: path.dirname(childMdPath).replace(/^public/, '') + '/',
          title: extractTitleFromMd(childMdPath),
        })),
      };
    })
  );

  return candidates.filter((section) => section !== null);
}
/**
 * Extract the frontmatter `title` from a markdown file (quick regex, no
 * full YAML parsing).
 *
 * Fix: the previous regex (`/^---[\s\S]+?title:\s*(.+?)$/m`) matched the
 * first occurrence of the substring "title:" anywhere after the opening
 * `---` — including inside keys like `subtitle:` or in body text. The
 * frontmatter block is now isolated first, and `title:` must start a line.
 *
 * @param {string} mdPath - Path to a markdown file with YAML frontmatter
 * @returns {string} The title, or 'Untitled' if the file is unreadable,
 *   has no frontmatter, or has no title key.
 */
function extractTitleFromMd(mdPath) {
  try {
    const content = readFileSync(mdPath, 'utf-8');
    // Isolate the frontmatter between the opening and closing `---`.
    const fm = content.match(/^---\n([\s\S]*?)\n---/);
    if (!fm) return 'Untitled';
    // `title:` must be at the start of a line (rejects e.g. `subtitle:`).
    const titleMatch = fm[1].match(/^title:\s*(.+?)\s*$/m);
    return titleMatch ? titleMatch[1].trim() : 'Untitled';
  } catch {
    // Unreadable file (missing, permissions, ...) — fall back silently.
    return 'Untitled';
  }
}
/**
 * Merge a parent page and its child pages into one section-bundle markdown
 * document with combined YAML frontmatter.
 *
 * Child h1 headings are stripped and re-emitted as h2 so the bundle has a
 * single top-level heading; pages are separated by `---` thematic breaks.
 *
 * @param {string} parentMd - Parent page markdown (with frontmatter)
 * @param {Array<{markdown: string, url: string, title: string}>} childMds
 *   Child page markdown plus fallback url/title from findSections
 * @param {string} sectionUrl - Fallback URL if parent frontmatter has none
 * @returns {string} Complete section bundle (frontmatter + combined body)
 */
function combineMarkdown(parentMd, childMds, sectionUrl) {
  const parent = parseMarkdown(parentMd);

  // Normalize each child: demote its h1 to an h2, prefer frontmatter
  // metadata over the fallbacks supplied by findSections.
  const children = childMds.map(({ markdown, url, title }) => {
    const page = parseMarkdown(markdown);
    const heading = page.frontmatter.title || title;
    // Drop the leading h1 (re-added as h2 to avoid a duplicate heading).
    const body = page.content.replace(/^#\s+.+?\n+/, '');
    return {
      title: heading,
      url: page.frontmatter.url || url, // prefer the full URL from frontmatter
      content: `## ${heading}\n\n${body}`,
      tokens: page.frontmatter.estimated_tokens || 0,
    };
  });

  // Total token estimate = parent + sum of children.
  const childTokens = children.reduce((sum, c) => sum + c.tokens, 0);
  const totalTokens = (parent.frontmatter.estimated_tokens || 0) + childTokens;

  // Single-line, length-capped description for the bundle frontmatter.
  const description = (parent.frontmatter.description || '')
    .replace(/\s+/g, ' ') // collapse all whitespace (incl. newlines)
    .trim()
    .substring(0, 500); // cap at 500 characters

  // Section frontmatter (serialized to YAML below).
  const frontmatterObj = {
    title: parent.frontmatter.title,
    description: description,
    url: parent.frontmatter.url || sectionUrl, // prefer parent frontmatter URL
    product: parent.frontmatter.product || '',
    type: 'section',
    pages: children.length + 1,
    estimated_tokens: totalTokens,
    child_pages: children.map((c) => ({
      url: c.url,
      title: c.title,
    })),
  };

  // YAML serialization handles special characters properly.
  const yamlBody = yaml
    .dump(frontmatterObj, {
      lineWidth: -1, // disable line wrapping
      noRefs: true, // disable anchors/aliases
    })
    .trim();
  const sectionFrontmatter = `---\n${yamlBody}\n---`;

  // Parent body first, then each child, separated by thematic breaks.
  const separator = '\n\n---\n\n';
  const allContent = [parent.content, ...children.map((c) => c.content)].join(
    separator
  );

  return `${sectionFrontmatter}\n\n${allContent}\n`;
}
/**
 * Split a markdown document into YAML frontmatter and body content.
 *
 * Fix: the previous regex required exactly one blank line after the
 * closing `---` and a non-empty body (`([\s\S]+)$`), so a page with an
 * empty body — or a single newline after the delimiter — failed to match
 * and its raw frontmatter leaked into `content`. Now any run of newlines
 * after the delimiter is accepted and the body may be empty.
 *
 * @param {string} markdown - Full markdown document
 * @returns {{frontmatter: Object, content: string}} Parsed pieces; when
 *   frontmatter is missing or its YAML is malformed, `frontmatter` is `{}`
 *   and `content` is the original input.
 */
function parseMarkdown(markdown) {
  const match = markdown.match(/^---\n([\s\S]+?)\n---\n+([\s\S]*)$/);
  if (!match) {
    return { frontmatter: {}, content: markdown };
  }
  try {
    return { frontmatter: yaml.load(match[1]), content: match[2] };
  } catch (error) {
    // Malformed YAML: warn and treat the whole document as content.
    console.warn('Failed to parse frontmatter:', error.message);
    return { frontmatter: {}, content: markdown };
  }
}
// ============================================================================
// COMMAND-LINE ARGUMENT PARSING
// ============================================================================
/**
 * Parse command-line arguments.
 *
 * Recognizes `-e <env>` / `--env <env>`; the last occurrence wins. A flag
 * with no following value is ignored.
 *
 * @returns {{environment: string|null}} Parsed options
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const options = { environment: null };
  let index = 0;
  while (index < args.length) {
    const flag = args[index];
    const value = args[index + 1];
    if ((flag === '-e' || flag === '--env') && value) {
      options.environment = value;
      index += 2; // consume flag + value
    } else {
      index += 1;
    }
  }
  return options;
}
// Parse arguments and set environment.
// Runs at module load so HUGO_ENV is set before either build phase starts.
// NOTE(review): this executes on import as well as direct invocation —
// verify that importing this module for tests should also set HUGO_ENV.
const cliOptions = parseArgs();
if (cliOptions.environment) {
  // Propagate -e/--env to any Hugo-aware tooling via the environment.
  process.env.HUGO_ENV = cliOptions.environment;
}
/**
 * Main entry point: run both build phases and print a combined summary.
 * Exits the process with code 1 if either phase recorded errors.
 */
async function main() {
  console.log('🚀 Building LLM-friendly Markdown\n');
  if (cliOptions.environment) {
    // Echo the environment selected via -e/--env.
    console.log(`🌍 Environment: ${cliOptions.environment}\n`);
  }
  console.log('════════════════════════════════\n');

  const overallStart = Date.now();

  // Phase 2 consumes the per-page .md files Phase 1 writes, so these
  // must stay sequential.
  const { converted, skipped, errors: pageErrors } = await buildPageMarkdown();
  const { built, errors: sectionErrors } = await buildSectionBundles();

  const totalDuration = ((Date.now() - overallStart) / 1000).toFixed(1);
  const totalErrors = pageErrors.length + sectionErrors.length;

  console.log('════════════════════════════════\n');
  console.log('📊 Summary:');
  console.log(` Pages: ${converted}`);
  console.log(` Sections: ${built}`);
  console.log(` Total: ${converted + built} markdown files`);
  console.log(` Skipped: ${skipped} (no article content)`);
  if (totalErrors > 0) {
    console.log(` Errors: ${totalErrors}`);
  }
  console.log(` Time: ${totalDuration}s\n`);

  // Non-zero exit so CI fails when anything failed to build.
  if (totalErrors > 0) {
    process.exit(1);
  }
}
// Run if called directly
main().catch((error) => {
console.error('Fatal error:', error);
process.exit(1);
});
// Export functions for testing
export {
buildPageMarkdown,
buildSectionBundles,
findSections,
combineMarkdown,
parseMarkdown,
};