462 lines
12 KiB
JavaScript
462 lines
12 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
/**
|
|
* HTML to Markdown Converter CLI for InfluxData Documentation
|
|
*
|
|
* Generates LLM-friendly Markdown from Hugo-generated HTML documentation.
|
|
* This script is the local CLI companion to the Lambda@Edge function that serves
|
|
* Markdown on-demand at docs.influxdata.com.
|
|
*
|
|
* ## Architecture
|
|
*
|
|
* The core conversion logic lives in ./lib/markdown-converter.cjs, which is shared
|
|
* between this CLI tool and the Lambda@Edge function in deploy/llm-markdown/.
|
|
* This ensures local builds and production Lambda use identical conversion logic.
|
|
*
|
|
* ## Prerequisites
|
|
*
|
|
* Before running this script, you must:
|
|
*
|
|
* 1. Install dependencies:
|
|
* ```bash
|
|
* yarn install
|
|
* ```
|
|
*
|
|
* 2. Compile TypeScript (for product mappings):
|
|
* ```bash
|
|
* yarn build:ts
|
|
* ```
|
|
*
|
|
* 3. Build the Hugo site:
|
|
* ```bash
|
|
* npx hugo --quiet
|
|
* ```
|
|
*
|
|
* ## Usage
|
|
*
|
|
* Basic usage:
|
|
* ```bash
|
|
* node scripts/html-to-markdown.js [options]
|
|
* ```
|
|
*
|
|
* ## Options
|
|
*
|
|
* --path <path> Process specific content path relative to public/ directory
|
|
* Example: influxdb3/core/get-started
|
|
*
|
|
* --limit <n> Limit number of files to process (useful for testing)
|
|
* Example: --limit 10
|
|
*
|
|
* -e, --env <env> Set environment (development, staging, production)
|
|
* Controls base URL in frontmatter (matches Hugo's -e flag)
|
|
* Example: -e staging
|
|
*
|
|
* --verbose Enable detailed logging showing each file processed
|
|
*
|
|
* ## Examples
|
|
*
|
|
* Generate Markdown for all documentation:
|
|
* ```bash
|
|
* node scripts/html-to-markdown.js
|
|
* ```
|
|
*
|
|
* Generate Markdown for InfluxDB 3 Core documentation:
|
|
* ```bash
|
|
* node scripts/html-to-markdown.js --path influxdb3/core
|
|
* ```
|
|
*
|
|
* Generate Markdown for a specific section (testing):
|
|
* ```bash
|
|
* node scripts/html-to-markdown.js --path influxdb3/core/get-started --limit 10
|
|
* ```
|
|
*
|
|
* Generate with verbose output:
|
|
* ```bash
|
|
* node scripts/html-to-markdown.js --path influxdb3/core --limit 5 --verbose
|
|
* ```
|
|
*
|
|
* Generate Markdown with staging URLs:
|
|
* ```bash
|
|
* node scripts/html-to-markdown.js --path influxdb3/core -e staging
|
|
* ```
|
|
*
|
|
* ## Output Files
|
|
*
|
|
* This script generates two types of Markdown files:
|
|
*
|
|
* 1. **Single page**: `index.md`
|
|
* - Mirrors the HTML page structure
|
|
* - Contains YAML frontmatter with title, description, URL, product info
|
|
* - Located alongside the source `index.html`
|
|
*
|
|
* 2. **Section aggregation**: `index.section.md`
|
|
* - Combines parent page + all child pages in one file
|
|
* - Optimized for LLM context windows
|
|
* - Only generated for pages that have child pages
|
|
* - Enhanced frontmatter includes child page list and token estimate
|
|
*
|
|
* ## Frontmatter Structure
|
|
*
|
|
* Single page frontmatter:
|
|
* ```yaml
|
|
* ---
|
|
* title: Page Title
|
|
* description: Page description from meta tags
|
|
* url: /influxdb3/core/path/to/page/
|
|
* product: InfluxDB 3 Core
|
|
* version: core
|
|
* ---
|
|
* ```
|
|
*
|
|
* Section aggregation frontmatter includes additional fields:
|
|
* ```yaml
|
|
* ---
|
|
* title: Section Title
|
|
* description: Section description
|
|
* url: /influxdb3/core/section/
|
|
* type: section
|
|
* pages: 5
|
|
* estimated_tokens: 12500
|
|
* product: InfluxDB 3 Core
|
|
* version: core
|
|
* child_pages:
|
|
* - url: /influxdb3/core/section/page1/
|
|
* title: Page 1 Title
|
|
* - url: /influxdb3/core/section/page2/
|
|
* title: Page 2 Title
|
|
* ---
|
|
* ```
|
|
*
|
|
* ## Testing Generated Markdown
|
|
*
|
|
* Use Cypress to validate generated Markdown:
|
|
* ```bash
|
|
* node cypress/support/run-e2e-specs.js \
|
|
* --spec "cypress/e2e/content/markdown-content-validation.cy.js"
|
|
* ```
|
|
*
|
|
* ## Common Issues
|
|
*
|
|
* **Error: Directory not found**
|
|
* - Solution: Run `npx hugo --quiet` first to generate HTML files
|
|
*
|
|
* **No article content found warnings**
|
|
* - This is normal for alias/redirect pages
|
|
* - The script skips these pages automatically
|
|
*
|
|
* **Memory issues with large builds**
|
|
* - Use `--path` to process specific sections
|
|
* - Use `--limit` for testing with small batches
|
|
* - Script includes periodic garbage collection hints (only active when Node runs with `--expose-gc`)
|
|
*
|
|
* ## Related Files
|
|
*
|
|
* - Core logic: `scripts/lib/markdown-converter.cjs`
|
|
* - Lambda handler: `deploy/llm-markdown/lambda-edge/markdown-generator/index.js`
|
|
* - Product detection: `dist/utils/product-mappings.js` (compiled from TypeScript)
|
|
* - Cypress tests: `cypress/e2e/content/markdown-content-validation.cy.js`
|
|
*/
|
|
|
|
import fs from 'fs';
|
|
import path from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
import {
|
|
convertToMarkdown,
|
|
convertSectionToMarkdown,
|
|
} from './lib/markdown-converter.cjs';
|
|
|
|
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Parse CLI arguments into the options object used by the rest of the script.
 *
 * Recognized flags: `--path <path>`, `--limit <n>`, `-e`/`--env <env>` and
 * `--verbose`. Unrecognized arguments, and value flags that are not followed
 * by a non-empty value, are ignored.
 *
 * @param {string[]} argv - Arguments after the node binary and script path.
 * @returns {{publicDir: string, limit: (number|null), verbose: boolean,
 *   specificPath: (string|null), environment: (string|null)}} Parsed options.
 * @throws {Error} If the value given to `--limit` is not a positive integer.
 */
function parseCliArgs(argv) {
  const parsed = {
    publicDir: path.join(__dirname, '..', 'public'),
    limit: null,
    verbose: false,
    specificPath: null,
    environment: null,
  };

  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    const hasValue = Boolean(argv[i + 1]);

    if (flag === '--path' && hasValue) {
      parsed.specificPath = argv[++i];
    } else if (flag === '--limit' && hasValue) {
      const raw = argv[++i];
      // A bare parseInt would accept "abc" (NaN) or "0"; both are falsy, and
      // a falsy limit is treated downstream as "no limit at all".
      const limit = /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : Number.NaN;
      if (!(limit > 0)) {
        throw new Error(`--limit expects a positive integer, got "${raw}"`);
      }
      parsed.limit = limit;
    } else if ((flag === '-e' || flag === '--env') && hasValue) {
      parsed.environment = argv[++i];
    } else if (flag === '--verbose') {
      parsed.verbose = true;
    }
  }

  return parsed;
}

/**
 * Parse CLI arguments, or print the problem and exit with status 1.
 *
 * @param {string[]} argv - Arguments after the node binary and script path.
 * @returns {object} The parsed options (see `parseCliArgs`).
 */
function parseCliArgsOrExit(argv) {
  try {
    return parseCliArgs(argv);
  } catch (error) {
    console.error(`❌ Error: ${error.message}`);
    process.exit(1);
  }
}

// Parse command line arguments
const args = process.argv.slice(2);
const options = parseCliArgsOrExit(args);
|
|
|
|
// Mirror Hugo's -e flag: when an environment was requested on the command
// line, publish it through the HUGO_ENV environment variable.
// NOTE(review): static imports are evaluated before this statement runs, so a
// module that reads HUGO_ENV at load time would not see this value - confirm
// that the converter reads it lazily.
const { environment: selectedEnvironment } = options;
if (selectedEnvironment) {
  process.env.HUGO_ENV = selectedEnvironment;
  console.log(`🌍 Environment set to: ${selectedEnvironment}`);
}
|
|
|
|
/**
 * Check if a directory is a section, i.e. has at least one child directory
 * that contains an `index.html` file.
 *
 * @param {string} dirPath - Directory to inspect.
 * @returns {boolean} True when a child directory with `index.html` exists;
 *   false otherwise, including when `dirPath` cannot be listed.
 */
function isSection(dirPath) {
  let entries;
  try {
    entries = fs.readdirSync(dirPath);
  } catch {
    return false;
  }

  return entries.some((entry) => {
    const entryPath = path.join(dirPath, entry);
    try {
      return (
        fs.statSync(entryPath).isDirectory() &&
        fs.existsSync(path.join(entryPath, 'index.html'))
      );
    } catch {
      // A single entry that cannot be stat'ed (broken symlink, removed during
      // the scan) is not a child page; keep checking the remaining entries
      // instead of reporting the whole directory as "not a section".
      return false;
    }
  });
}
|
|
|
|
/**
 * Find all child page HTML files in a section.
 *
 * A child page is `<sectionPath>/<name>/index.html` for each direct child
 * directory that contains an `index.html`.
 *
 * @param {string} sectionPath - Directory of the section page.
 * @returns {string[]} Paths of the child `index.html` files, sorted by child
 *   directory name. Empty when the section directory cannot be listed.
 */
function findChildPages(sectionPath) {
  let entries;
  try {
    entries = fs.readdirSync(sectionPath);
  } catch (error) {
    console.error(
      `Error finding child pages in ${sectionPath}:`,
      error.message
    );
    return [];
  }

  const childPages = [];

  // readdir order is not specified; sort so the list of children is the same
  // on every machine and every run.
  for (const entry of [...entries].sort()) {
    const entryPath = path.join(sectionPath, entry);
    const childIndexPath = path.join(entryPath, 'index.html');

    try {
      if (fs.statSync(entryPath).isDirectory() && fs.existsSync(childIndexPath)) {
        childPages.push(childIndexPath);
      }
    } catch (error) {
      // One unreadable entry (for example a broken symlink) must not discard
      // the children that were found; skip just that entry.
      console.warn(` ⚠️ Skipping unreadable entry ${entryPath}:`, error.message);
    }
  }

  return childPages;
}
|
|
|
|
/**
 * Convert a single HTML file to Markdown using the shared library and write
 * the result to `index.md` next to the source file.
 *
 * @param {string} htmlFilePath - Path to an `index.html` under the public
 *   directory (`options.publicDir`).
 * @returns {Promise<string|null>} Path of the written Markdown file, or null
 *   when the converter produced no Markdown or an error occurred (errors are
 *   logged, not thrown).
 */
async function convertHtmlFileToMarkdown(htmlFilePath) {
  try {
    const htmlContent = fs.readFileSync(htmlFilePath, 'utf-8');

    // Derive URL path from file path. Normalize separators and add the
    // leading slash BEFORE stripping "index.html": otherwise the root page
    // ("index.html") and Windows paths ("a\index.html") never match the
    // pattern and keep "index.html" in their URL.
    const relativePath = path.relative(options.publicDir, htmlFilePath);
    const urlPath = `/${relativePath.replace(/\\/g, '/')}`.replace(
      /\/index\.html$/,
      '/'
    );

    // Use shared conversion function
    const markdown = await convertToMarkdown(htmlContent, urlPath);
    if (!markdown) {
      return null;
    }

    // Write to index.md in same directory
    const markdownFilePath = htmlFilePath.replace(/index\.html$/, 'index.md');
    fs.writeFileSync(markdownFilePath, markdown, 'utf-8');

    if (options.verbose) {
      console.log(` ✓ Converted: ${relativePath}`);
    }

    return markdownFilePath;
  } catch (error) {
    console.error(` ✗ Error converting ${htmlFilePath}:`, error.message);
    return null;
  }
}
|
|
|
|
/**
 * Aggregate a section page and its child pages into one Markdown document
 * using the shared library.
 *
 * @param {string} sectionHtmlPath - Path to the section's `index.html` under
 *   the public directory (`options.publicDir`).
 * @returns {Promise<*>} Whatever `convertSectionToMarkdown` returns for the
 *   section, or null when an error occurred (errors are logged, not thrown).
 */
async function aggregateSectionMarkdown(sectionHtmlPath) {
  // Map a file under the public directory to its URL path. Separators are
  // normalized and the leading slash is added BEFORE stripping "index.html",
  // so the root page becomes "/" and Windows paths are handled too.
  const toUrlPath = (htmlPath) =>
    `/${path.relative(options.publicDir, htmlPath).replace(/\\/g, '/')}`.replace(
      /\/index\.html$/,
      '/'
    );

  try {
    const sectionDir = path.dirname(sectionHtmlPath);

    // Read section HTML
    const sectionHtml = fs.readFileSync(sectionHtmlPath, 'utf-8');
    const sectionUrlPath = toUrlPath(sectionHtmlPath);

    // Find and read child pages
    const childPaths = findChildPages(sectionDir);
    const childHtmls = [];

    for (const childPath of childPaths) {
      try {
        const childHtml = fs.readFileSync(childPath, 'utf-8');
        childHtmls.push({ html: childHtml, url: toUrlPath(childPath) });
      } catch (error) {
        // Best effort: an unreadable child is left out of the aggregate.
        if (options.verbose) {
          console.warn(
            ` ⚠️ Could not read child page: ${childPath}`,
            error.message
          );
        }
      }
    }

    // Use shared conversion function
    const markdown = await convertSectionToMarkdown(
      sectionHtml,
      sectionUrlPath,
      childHtmls
    );

    return markdown;
  } catch (error) {
    console.error(
      `Error aggregating section ${sectionHtmlPath}:`,
      error.message
    );
    return null;
  }
}
|
|
|
|
/**
 * Find all `index.html` files under a directory, recursively.
 *
 * @param {string} dir - Directory to scan.
 * @param {string[]} [fileList=[]] - Accumulator that found paths are pushed
 *   onto; the same array is returned.
 * @returns {string[]} `fileList`, extended with every `index.html` found.
 * @throws {Error} If `dir` or one of its subdirectories cannot be listed.
 */
function findHtmlFiles(dir, fileList = []) {
  for (const file of fs.readdirSync(dir)) {
    const filePath = path.join(dir, file);

    let stat;
    try {
      stat = fs.statSync(filePath);
    } catch (error) {
      // A broken symlink or an entry removed during the scan should not
      // abort the whole run; report it and move on.
      console.warn(` ⚠️ Skipping unreadable entry ${filePath}:`, error.message);
      continue;
    }

    if (stat.isDirectory()) {
      findHtmlFiles(filePath, fileList);
    } else if (file === 'index.html') {
      fileList.push(filePath);
    }
  }

  return fileList;
}
|
|
|
|
/**
 * CLI entry point.
 *
 * Scans the public directory (or the `--path` subdirectory) for `index.html`
 * files, converts each one to `index.md`, writes an `index.section.md`
 * aggregate for every converted page whose directory is a section, and
 * prints a summary. Exits with status 1 when the start directory is missing.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Starting HTML to Markdown conversion...\n');

  const { publicDir, specificPath, limit, verbose } = options;

  const startDir = specificPath ? path.join(publicDir, specificPath) : publicDir;

  if (!fs.existsSync(startDir)) {
    console.error(`❌ Error: Directory not found: ${startDir}`);
    console.error(' Run "npx hugo --quiet" first to generate HTML files.');
    process.exit(1);
  }

  console.log(`📂 Scanning: ${path.relative(process.cwd(), startDir)}`);

  // Shallow paths first, so root index.html files are processed first.
  const depthOf = (filePath) => filePath.split(path.sep).length;
  const htmlFiles = findHtmlFiles(startDir);
  htmlFiles.sort((left, right) => depthOf(left) - depthOf(right));

  const totalFiles = limit
    ? Math.min(htmlFiles.length, limit)
    : htmlFiles.length;

  console.log(`📄 Found ${htmlFiles.length} HTML files`);
  if (limit) {
    console.log(`🎯 Processing first ${totalFiles} files (--limit ${limit})`);
  }
  console.log('');

  const counts = { converted: 0, skipped: 0, sections: 0 };
  const filesToProcess = htmlFiles.slice(0, totalFiles);

  for (const [i, htmlFile] of filesToProcess.entries()) {
    // True on every 100th file except the very first one.
    const atCheckpoint = i > 0 && i % 100 === 0;

    if (!verbose && atCheckpoint) {
      console.log(` Progress: ${i}/${totalFiles} files...`);
    }

    // Regular index.md for this page.
    const result = await convertHtmlFileToMarkdown(htmlFile);
    if (result) {
      counts.converted += 1;
    } else {
      counts.skipped += 1;
    }

    // Aggregated index.section.md, only for converted pages that have
    // child pages.
    if (result && isSection(path.dirname(htmlFile))) {
      try {
        const sectionMarkdown = await aggregateSectionMarkdown(htmlFile);
        if (sectionMarkdown) {
          const sectionFilePath = htmlFile.replace(
            /index\.html$/,
            'index.section.md'
          );
          fs.writeFileSync(sectionFilePath, sectionMarkdown, 'utf-8');
          counts.sections += 1;

          if (verbose) {
            const shownPath = path.relative(publicDir, sectionFilePath);
            console.log(` ✓ Generated section: ${shownPath}`);
          }
        }
      } catch (error) {
        console.error(
          ` ✗ Error generating section for ${htmlFile}:`,
          error.message
        );
      }
    }

    // Garbage collection hint; global.gc is only defined when it has been
    // exposed to the script.
    if (atCheckpoint && global.gc) {
      global.gc();
    }
  }

  console.log('\n✅ Conversion complete!');
  console.log(` Converted: ${counts.converted} files`);
  console.log(` Sections: ${counts.sections} aggregated files`);
  console.log(` Skipped: ${counts.skipped} files`);
  console.log(` Total: ${totalFiles} files processed`);
}
|
|
|
|
// Run main function. The returned promise is handled explicitly so that an
// unexpected failure is reported and the process ends with a non-zero status;
// exitCode (rather than exit()) lets pending output flush first.
main().catch((error) => {
  console.error('❌ Fatal error:', error);
  process.exitCode = 1;
});
|