From 4cb455b1ae80bd2cfe71ab05eab266a5f6765063 Mon Sep 17 00:00:00 2001 From: Jason Stirnaman Date: Mon, 1 Dec 2025 19:45:42 -0500 Subject: [PATCH] feat(ci): add incremental builds and shared content-utils (#6582) - Incremental Markdown build for PRs, full build for production - Shared content-utils library for: - Mapping shared content to consuming pages (Markdown generation, Cypress) - Listing changed content pages (committed, uncommitted, staged) - Extracting source frontmatter (docs edit) - Fix CSS parsing warnings with JSDOM VirtualConsole - Remove unused imports and variables --- .circleci/config.yml | 9 +- DOCS-DEPLOYING.md | 22 +++ cypress/support/map-files-to-urls.js | 50 +----- scripts/build-llm-markdown.js | 86 +++++++-- scripts/docs-edit.js | 21 +-- scripts/lib/content-utils.js | 251 +++++++++++++++++++++++++++ scripts/lib/markdown-converter.cjs | 20 ++- 7 files changed, 374 insertions(+), 85 deletions(-) create mode 100644 scripts/lib/content-utils.js diff --git a/.circleci/config.yml b/.circleci/config.yml index 86c287d27..22fda0c22 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -44,7 +44,14 @@ jobs: command: yarn hugo --environment production --logLevel info --gc --destination workspace/public - run: name: Generate LLM-friendly Markdown - command: yarn build:md --public-dir workspace/public + command: | + if [ "$CIRCLE_BRANCH" = "master" ]; then + # Full build for production deployments + yarn build:md --public-dir workspace/public + else + # Incremental build for PRs - only process changed files + yarn build:md --public-dir workspace/public --only-changed --base-branch origin/master + fi - persist_to_workspace: root: workspace paths: diff --git a/DOCS-DEPLOYING.md b/DOCS-DEPLOYING.md index 385e8fbe0..7f3825dfb 100644 --- a/DOCS-DEPLOYING.md +++ b/DOCS-DEPLOYING.md @@ -161,6 +161,28 @@ const CONCURRENCY = process.env.CI ? 
10 : 20; - **Memory**: \~300MB peak (safe for 2GB CircleCI) - **Rate**: \~23 files/second with memory-bounded parallelism +## Making Deployment Changes + +### During Initial Implementation + +If making changes that affect `yarn build` commands or `.circleci/config.yml`: + +1. **Read the surrounding context** in the CI file +2. **Notice** flags, such as `--destination workspace/public` on the Hugo build +3. **Ask**: "Does the build script need to know about environment details--for example, do paths differ between production and staging?" + +### Recommended Prompt for Future Similar Work + +> "This script will run in CI. Let me read the CI configuration to understand the build environment and directory structure before finalizing the implementation." + +## Summary of Recommendations + +| Strategy | Implementation | Effort | +| ---------------------------------- | ---------------------------------- | ------ | +| Read CI config before implementing | Process/habit change | Low | +| Test on feature branch first | Push and watch CI before merging | Low | +| Add CI validation step | Add file count check in config.yml | Low | + ## Testing and Validation ### Local Testing diff --git a/cypress/support/map-files-to-urls.js b/cypress/support/map-files-to-urls.js index c02787651..ecf4ea246 100644 --- a/cypress/support/map-files-to-urls.js +++ b/cypress/support/map-files-to-urls.js @@ -2,60 +2,22 @@ import process from 'process'; import fs from 'fs'; -import { execSync } from 'child_process'; import matter from 'gray-matter'; import { filePathToUrl } from '../../.github/scripts/utils/url-transformer.js'; +import { + findPagesReferencingSharedContent, + categorizeContentFiles, +} from '../../scripts/lib/content-utils.js'; // Get file paths from command line arguments const filePaths = process.argv.slice(2).filter((arg) => !arg.startsWith('--')); // Parse options -const debugMode = process.argv.includes('--debug'); // deprecated, no longer used const jsonMode = 
process.argv.includes('--json'); // Separate shared content files and regular content files -const sharedContentFiles = filePaths.filter( - (file) => - file.startsWith('content/shared/') && - (file.endsWith('.md') || file.endsWith('.html')) -); - -const regularContentFiles = filePaths.filter( - (file) => - file.startsWith('content/') && - !file.startsWith('content/shared/') && - (file.endsWith('.md') || file.endsWith('.html')) -); - -// Find pages that reference shared content files in their frontmatter -function findPagesReferencingSharedContent(sharedFilePath) { - try { - // Remove the leading "content/" to match how it would appear in frontmatter - const relativePath = sharedFilePath.replace(/^content\//, ''); - - // Use grep to find files that reference this shared content in frontmatter - // Look for source: pattern in YAML frontmatter - const grepCmd = `grep -l "source: .*${relativePath}" --include="*.md" --include="*.html" -r content/`; - - // Execute grep command and parse results - const result = execSync(grepCmd, { encoding: 'utf8' }).trim(); - - if (!result) { - return []; - } - - return result.split('\n').filter(Boolean); - } catch (error) { - // grep returns non-zero exit code when no matches are found - if (error.status === 1) { - return []; - } - console.error( - `Error finding references to ${sharedFilePath}: ${error.message}` - ); - return []; - } -} +const { shared: sharedContentFiles, regular: regularContentFiles } = + categorizeContentFiles(filePaths); /** * Extract source from frontmatter or use the file path as source diff --git a/scripts/build-llm-markdown.js b/scripts/build-llm-markdown.js index d1be2f1a5..1925f56bb 100644 --- a/scripts/build-llm-markdown.js +++ b/scripts/build-llm-markdown.js @@ -13,16 +13,10 @@ import { glob } from 'glob'; import fs from 'fs/promises'; import { readFileSync } from 'fs'; import path from 'path'; -import { fileURLToPath } from 'url'; -import { dirname } from 'path'; import { createRequire } from 'module'; 
import yaml from 'js-yaml'; import pLimit from 'p-limit'; -// Get __dirname equivalent in ESM -const __filename = fileURLToPath(import.meta.url); -const __dirname = dirname(__filename); - // Create require function for CommonJS modules const require = createRequire(import.meta.url); const { convertToMarkdown } = require('./lib/markdown-converter.cjs'); @@ -44,13 +38,14 @@ const { convertToMarkdown } = require('./lib/markdown-converter.cjs'); */ const MIN_HTML_SIZE_BYTES = 1024; -/** - * Approximate character-to-token ratio for estimation. - * Used to estimate token count from markdown content length. - * - * @default 4 - Rough heuristic (4 characters ≈ 1 token) - */ -const CHARS_PER_TOKEN = 4; +// ============================================================================ +// INCREMENTAL BUILD HELPERS +// ============================================================================ + +import { + getChangedContentFiles, + mapContentToPublic, +} from './lib/content-utils.js'; // ============================================================================ // PHASE 1: HTML → MARKDOWN CONVERSION @@ -60,17 +55,48 @@ const CHARS_PER_TOKEN = 4; * Phase 1: Convert all HTML files to individual page markdown * Uses memory-bounded parallelism to avoid OOM in CI * @param {string} publicDir - Directory containing Hugo build output + * @param {Object} options - Build options + * @param {boolean} options.onlyChanged - Only process files changed since base branch + * @param {string} options.baseBranch - Base branch for comparison (default: 'origin/master') */ -async function buildPageMarkdown(publicDir = 'public') { +async function buildPageMarkdown(publicDir = 'public', options = {}) { + const { onlyChanged = false, baseBranch = 'origin/master' } = options; + console.log('📄 Converting HTML to Markdown (individual pages)...\n'); const startTime = Date.now(); // Find all HTML files - const htmlFiles = await glob(`${publicDir}/**/index.html`, { + let htmlFiles = await 
glob(`${publicDir}/**/index.html`, { ignore: ['**/node_modules/**', '**/api-docs/**'], }); - console.log(`Found ${htmlFiles.length} HTML files\n`); + const totalFiles = htmlFiles.length; + console.log(`Found ${totalFiles} HTML files\n`); + + // Filter to only changed files if requested + if (onlyChanged) { + const changedContentFiles = getChangedContentFiles(baseBranch); + + if (changedContentFiles.length > 0) { + const changedHtmlSet = mapContentToPublic(changedContentFiles, publicDir); + const filteredFiles = htmlFiles.filter((f) => changedHtmlSet.has(f)); + + console.log( + `🔄 Incremental build: ${filteredFiles.length}/${totalFiles} files changed since ${baseBranch}\n` + ); + + if (filteredFiles.length === 0) { + console.log(' No matching HTML files found, skipping Phase 1\n'); + return { converted: 0, skipped: 0, errors: [] }; + } + + htmlFiles = filteredFiles; + } else { + console.log( + ' ⚠️ No changed content files detected, processing all files\n' + ); + } + } // Memory-bounded concurrency // CircleCI medium (2GB RAM): 10 workers safe @@ -381,6 +407,8 @@ function parseArgs() { const options = { environment: null, publicDir: 'public', + onlyChanged: false, + baseBranch: 'origin/master', }; for (let i = 0; i < args.length; i++) { @@ -388,6 +416,10 @@ function parseArgs() { options.environment = args[++i]; } else if (args[i] === '--public-dir' && args[i + 1]) { options.publicDir = args[++i]; + } else if (args[i] === '--only-changed') { + options.onlyChanged = true; + } else if (args[i] === '--base-branch' && args[i + 1]) { + options.baseBranch = args[++i]; } } @@ -412,14 +444,25 @@ async function main() { } // Show public directory - console.log(`📁 Public directory: ${cliOptions.publicDir}\n`); + console.log(`📁 Public directory: ${cliOptions.publicDir}`); + // Show build mode + if (cliOptions.onlyChanged) { + console.log(`🔄 Mode: Incremental (comparing to ${cliOptions.baseBranch})`); + } else { + console.log('📦 Mode: Full build'); + } + + console.log(''); 
console.log('════════════════════════════════\n'); const overallStart = Date.now(); // Phase 1: Generate individual page markdown - const pageResults = await buildPageMarkdown(cliOptions.publicDir); + const pageResults = await buildPageMarkdown(cliOptions.publicDir, { + onlyChanged: cliOptions.onlyChanged, + baseBranch: cliOptions.baseBranch, + }); // Phase 2: Build section bundles const sectionResults = await buildSectionBundles(cliOptions.publicDir); @@ -462,3 +505,10 @@ export { combineMarkdown, parseMarkdown, }; + +// Re-export content utilities +export { + getChangedContentFiles, + mapContentToPublic, + findPagesReferencingSharedContent, +} from './lib/content-utils.js'; diff --git a/scripts/docs-edit.js b/scripts/docs-edit.js index ec85e73e9..3829cb1a3 100755 --- a/scripts/docs-edit.js +++ b/scripts/docs-edit.js @@ -14,9 +14,10 @@ import { parseArgs } from 'node:util'; import process from 'node:process'; import { join, dirname } from 'path'; import { fileURLToPath } from 'url'; -import { existsSync, readFileSync } from 'fs'; +import { existsSync } from 'fs'; import { spawn } from 'child_process'; import { parseDocumentationURL, urlToFilePaths } from './lib/url-parser.js'; +import { getSourceFromFrontmatter } from './lib/content-utils.js'; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); @@ -128,24 +129,12 @@ function findFiles(url) { /** * Check if file uses shared content + * @param {string} filePath - Relative path from repo root + * @returns {string|null} Path to shared source file or null */ function checkSharedContent(filePath) { const fullPath = join(REPO_ROOT, filePath); - - if (!existsSync(fullPath)) { - return null; - } - - const content = readFileSync(fullPath, 'utf8'); - - // Check for source: frontmatter - const sourceMatch = content.match(/^source:\s*(.+)$/m); - if (sourceMatch) { - const sourcePath = sourceMatch[1].trim(); - return `content${sourcePath}`; - } - - return null; + return 
/**
 * Content Utilities Library
 *
 * Shared utilities for working with content files, including:
 * - Detecting changed files via git
 * - Resolving shared content dependencies
 * - Mapping content paths to public/URL paths
 * - Extracting source frontmatter
 *
 * Used by:
 * - scripts/build-llm-markdown.js (incremental builds)
 * - scripts/docs-edit.js (opening shared source files)
 * - cypress/support/map-files-to-urls.js (test file mapping)
 */

import { execFileSync, execSync } from 'child_process';
import { existsSync, readFileSync } from 'fs';

/**
 * Find pages that reference a shared content file via source: frontmatter.
 *
 * Runs `grep -l -r` over the relative `content/` directory, so results depend
 * on the current working directory and on a `grep` that supports `-r` and
 * `--include`. grep is invoked without a shell and the path is matched
 * literally, so characters such as `.`, `$` or quotes in a file name cannot
 * widen the match or be interpreted as shell syntax.
 *
 * @param {string} sharedFilePath - Path to shared file (e.g., 'content/shared/sql-reference/_index.md')
 * @returns {string[]} Array of content file paths that reference this shared file;
 *   empty when nothing matches or when grep cannot be run
 */
export function findPagesReferencingSharedContent(sharedFilePath) {
  try {
    // Remove leading "content/" to match frontmatter format (source: /shared/...)
    const relativePath = sharedFilePath.replace(/^content\//, '');

    // grep uses basic regular expressions by default: escape the characters
    // that are special there so the path is matched character-for-character.
    const literalPath = relativePath.replace(/[.[*^$\\]/g, '\\$&');
    const pattern = `source: .*${literalPath}`;

    // Include both .md and .html files for compatibility
    const result = execFileSync(
      'grep',
      [
        '-l',
        '-r',
        '--include=*.md',
        '--include=*.html',
        '-e',
        pattern,
        'content/',
      ],
      {
        encoding: 'utf-8',
        stdio: ['pipe', 'pipe', 'pipe'],
      }
    ).trim();

    return result ? result.split('\n').filter(Boolean) : [];
  } catch (err) {
    // grep returns exit code 1 when no matches found
    if (err.status === 1) {
      return [];
    }
    console.warn(
      `  ⚠️  Error finding references to ${sharedFilePath}: ${err.message}`
    );
    return [];
  }
}

/**
 * Expand a list of changed files to include pages that reference changed shared content.
 *
 * Files under `content/shared/` are replaced by the pages that reference them;
 * all other entries are passed through. The result is de-duplicated.
 *
 * @param {string[]} changedFiles - Array of changed file paths
 * @param {Object} options - Options
 * @param {boolean} options.verbose - Log details about shared content resolution
 * @returns {string[]} Expanded array including pages referencing changed shared content
 */
export function expandSharedContentChanges(changedFiles, options = {}) {
  const { verbose = false } = options;
  const isShared = (file) => file.startsWith('content/shared/');

  const sharedFiles = changedFiles.filter(isShared);

  // Start with the files that are not shared content
  const allAffectedFiles = new Set(changedFiles.filter((f) => !isShared(f)));

  if (sharedFiles.length === 0) {
    return Array.from(allAffectedFiles);
  }

  if (verbose) {
    console.log(
      `  📎 Found ${sharedFiles.length} shared content changes, finding referencing pages...`
    );
  }

  // For each changed shared file, add every page that references it
  for (const sharedFile of sharedFiles) {
    const referencingPages = findPagesReferencingSharedContent(sharedFile);
    if (referencingPages.length === 0) continue;

    if (verbose) {
      console.log(`    ${sharedFile} → ${referencingPages.length} pages`);
    }
    for (const page of referencingPages) {
      allAffectedFiles.add(page);
    }
  }

  return Array.from(allAffectedFiles);
}
/**
 * Get list of content files that changed compared to base branch.
 *
 * Collects, restricted to `content/`:
 * - committed changes between the merge base of `baseBranch` and HEAD
 *   (`git diff <baseBranch>...HEAD`)
 * - working tree changes relative to HEAD (`git diff HEAD`)
 * - staged changes (`git diff --cached`)
 *
 * Each git call is best-effort: a failing call contributes no files. With
 * `verbose` enabled the failure is reported, so a missing base ref is not
 * mistaken for "nothing changed". Shared content changes are then expanded to
 * the pages that reference them.
 *
 * @param {string} baseBranch - Branch to compare against (e.g., 'origin/master')
 * @param {Object} options - Options
 * @param {boolean} options.verbose - Log details about change detection
 * @returns {string[]} Array of changed content file paths (empty if detection fails)
 */
export function getChangedContentFiles(baseBranch, options = {}) {
  const { verbose = true } = options;

  // baseBranch is interpolated into a shell command, so only accept characters
  // that are plain ref/revision syntax and reject option-like values.
  const isSafeRef =
    typeof baseBranch === 'string' &&
    /^[\w./~^@-]+$/.test(baseBranch) &&
    !baseBranch.startsWith('-');

  // Run one `git diff --name-only` variant and return the listed paths.
  const listDiff = (revisionArgs, description) => {
    try {
      const output = execSync(
        `git diff --name-only ${revisionArgs} -- content/`,
        {
          encoding: 'utf-8',
          stdio: ['pipe', 'pipe', 'pipe'],
        }
      );
      return output.trim().split('\n').filter(Boolean);
    } catch (err) {
      // e.g. the base branch does not exist locally, or HEAD cannot be resolved
      if (verbose) {
        console.warn(`  ⚠️  Could not list ${description}: ${err.message}`);
      }
      return [];
    }
  };

  try {
    const allChangedFiles = new Set();

    if (isSafeRef) {
      listDiff(`${baseBranch}...HEAD`, `changes since ${baseBranch}`).forEach(
        (f) => allChangedFiles.add(f)
      );
    } else if (verbose) {
      console.warn(
        `  ⚠️  Ignoring invalid base branch: ${String(baseBranch)}`
      );
    }

    listDiff('HEAD', 'uncommitted changes').forEach((f) =>
      allChangedFiles.add(f)
    );
    listDiff('--cached', 'staged changes').forEach((f) =>
      allChangedFiles.add(f)
    );

    // Expand to include pages referencing changed shared content
    return expandSharedContentChanges(Array.from(allChangedFiles), {
      verbose,
    });
  } catch (err) {
    console.warn(`  ⚠️  Could not detect changed files: ${err.message}`);
    return []; // Fall back to full build
  }
}
/**
 * Map content file paths to their corresponding public HTML paths.
 *
 * Only `.md` files are mapped; other entries are ignored. `_index.md` files
 * map to the `index.html` of their own directory, including the top-level
 * `content/_index.md`, which maps to `<publicDir>/index.html`.
 *
 * @param {string[]} contentFiles - Array of content file paths (e.g., 'content/influxdb3/core/page.md')
 * @param {string} publicDir - Public directory (e.g., 'public' or 'workspace/public')
 * @returns {Set<string>} Set of public HTML file paths
 */
export function mapContentToPublic(contentFiles, publicDir) {
  const htmlPaths = new Set();

  for (const file of contentFiles) {
    // Only process markdown files
    if (!file.endsWith('.md')) continue;

    const urlPath = file
      .replace(/^content\//, '')
      .replace(/\.md$/, '')
      // Section pages: drop the _index segment. (^|\/) also covers the
      // root-level content/_index.md, which has no leading slash left.
      .replace(/(^|\/)_index$/, '');

    // Collapse duplicate slashes (trailing slash on publicDir, empty urlPath)
    htmlPaths.add(`${publicDir}/${urlPath}/index.html`.replace(/\/+/g, '/'));
  }

  return htmlPaths;
}

/**
 * Separate content files into shared and regular categories.
 *
 * Only `.md` and `.html` files under `content/` are kept; files under
 * `content/shared/` go to `shared`, the rest to `regular`. Input order is
 * preserved within each list.
 *
 * @param {string[]} files - Array of file paths
 * @returns {{shared: string[], regular: string[]}} Categorized files
 */
export function categorizeContentFiles(files) {
  const shared = [];
  const regular = [];

  for (const file of files) {
    const isPage = file.endsWith('.md') || file.endsWith('.html');
    if (!isPage || !file.startsWith('content/')) continue;

    if (file.startsWith('content/shared/')) {
      shared.push(file);
    } else {
      regular.push(file);
    }
  }

  return { shared, regular };
}

/**
 * Extract the source path from a file's frontmatter.
 * Used to find the shared content file that a page includes.
 *
 * When the file starts with a `---` delimited frontmatter block, only that
 * block is searched, so a `source:` line in the page body is not picked up.
 * Files without such a block are searched as a whole. Surrounding quotes on
 * the value are removed, and a value starting with `/` is prefixed with
 * `content`.
 *
 * @param {string} filePath - Path to the content file
 * @returns {string|null} The source path (e.g., 'content/shared/sql-reference/_index.md'),
 *   or null if the file is missing, unreadable, or has no source value
 */
export function getSourceFromFrontmatter(filePath) {
  if (!existsSync(filePath)) {
    return null;
  }

  try {
    const content = readFileSync(filePath, 'utf8');

    // Quick regex checks (avoids full YAML parsing)
    const frontmatter = content.match(
      /^\uFEFF?---[ \t]*\r?\n([\s\S]*?)\r?\n---[ \t]*(?:\r?\n|$)/
    );
    const searchText = frontmatter ? frontmatter[1] : content;

    const sourceMatch = searchText.match(/^source:\s*(.+)$/m);
    if (!sourceMatch) {
      return null;
    }

    // YAML allows the value to be quoted: source: "/shared/..."
    const sourcePath = sourceMatch[1].trim().replace(/^(['"])(.*)\1$/, '$2');
    if (!sourcePath) {
      return null;
    }

    // Normalize to content/ prefix format
    return sourcePath.startsWith('/') ? `content${sourcePath}` : sourcePath;
  } catch {
    return null;
  }
}
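NOTE(review): 'usable' tells jsdom to LOAD usable subresources (stylesheets, scripts, frames); jsdom loads none by default, so omit this option if the intent is to avoid loading external resources + resources: 'usable', + }); const document = dom.window.document; try {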