feat(ci): add incremental builds and shared content-utils (#6582)

- Incremental Markdown build for PRs, full build for production
- Shared content-utils library for:
  - Mapping shared content to consuming pages (Markdown generation, Cypress)
  - Listing changed content pages (committed, uncommitted, staged)
  - Extracting source frontmatter (docs edit)
- Fix CSS parsing warnings with JSDOM VirtualConsole
- Remove unused imports and variables
2025-12-01 19:45:42 -05:00 · 2025-12-01 19:45:42 -05:00 · 4cb455b1ae
parent b95e608d07
commit 4cb455b1ae
7 changed files with 374 additions and 85 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -44,7 +44,14 @@ jobs:
          command: yarn hugo --environment production --logLevel info --gc --destination workspace/public
      - run:
          name: Generate LLM-friendly Markdown
-          command: yarn build:md --public-dir workspace/public
+          command: |
+            if [ "$CIRCLE_BRANCH" = "master" ]; then
+              # Full build for production deployments
+              yarn build:md --public-dir workspace/public
+            else
+              # Incremental build for PRs - only process changed files
+              yarn build:md --public-dir workspace/public --only-changed --base-branch origin/master
+            fi
      - persist_to_workspace:
          root: workspace
          paths:
--- a/DOCS-DEPLOYING.md
+++ b/DOCS-DEPLOYING.md
@ -161,6 +161,28 @@ const CONCURRENCY = process.env.CI ? 10 : 20;
 - **Memory**: \~300MB peak (safe for 2GB CircleCI)
 - **Rate**: \~23 files/second with memory-bounded parallelism

+## Making Deployment Changes
+
+### During Initial Implementation
+
+If making changes that affect `yarn build` commands or `.circleci/config.yml`:
+
+1. **Read the surrounding context** in the CI file
+2. **Notice** flags, such as `--destination workspace/public` on the Hugo build
+3. **Ask**: "Does the build script need to know about environment details--for example, do paths differ between production and staging?"
+
+### Recommended Prompt for Future Similar Work
+
+> "This script will run in CI. Let me read the CI configuration to understand the build environment and directory structure before finalizing the implementation."
+
+## Summary of Recommendations
+
+| Strategy                           | Implementation                     | Effort |
+| ---------------------------------- | ---------------------------------- | ------ |
+| Read CI config before implementing | Process/habit change               | Low    |
+| Test on feature branch first       | Push and watch CI before merging   | Low    |
+| Add CI validation step             | Add file count check in config.yml | Low    |
+
 ## Testing and Validation

 ### Local Testing
--- a/cypress/support/map-files-to-urls.js
+++ b/cypress/support/map-files-to-urls.js
@ -2,60 +2,22 @@

 import process from 'process';
 import fs from 'fs';
-import { execSync } from 'child_process';
 import matter from 'gray-matter';
 import { filePathToUrl } from '../../.github/scripts/utils/url-transformer.js';
+import {
+  findPagesReferencingSharedContent,
+  categorizeContentFiles,
+} from '../../scripts/lib/content-utils.js';

 // Get file paths from command line arguments
 const filePaths = process.argv.slice(2).filter((arg) => !arg.startsWith('--'));

 // Parse options
-const debugMode = process.argv.includes('--debug'); // deprecated, no longer used
 const jsonMode = process.argv.includes('--json');

 // Separate shared content files and regular content files
-const sharedContentFiles = filePaths.filter(
-  (file) =>
-    file.startsWith('content/shared/') &&
-    (file.endsWith('.md') || file.endsWith('.html'))
-);
-
-const regularContentFiles = filePaths.filter(
-  (file) =>
-    file.startsWith('content/') &&
-    !file.startsWith('content/shared/') &&
-    (file.endsWith('.md') || file.endsWith('.html'))
-);
-
-// Find pages that reference shared content files in their frontmatter
-function findPagesReferencingSharedContent(sharedFilePath) {
-  try {
-    // Remove the leading "content/" to match how it would appear in frontmatter
-    const relativePath = sharedFilePath.replace(/^content\//, '');
-
-    // Use grep to find files that reference this shared content in frontmatter
-    // Look for source: <path> pattern in YAML frontmatter
-    const grepCmd = `grep -l "source: .*${relativePath}" --include="*.md" --include="*.html" -r content/`;
-
-    // Execute grep command and parse results
-    const result = execSync(grepCmd, { encoding: 'utf8' }).trim();
-
-    if (!result) {
-      return [];
-    }
-
-    return result.split('\n').filter(Boolean);
-  } catch (error) {
-    // grep returns non-zero exit code when no matches are found
-    if (error.status === 1) {
-      return [];
-    }
-    console.error(
-      `Error finding references to ${sharedFilePath}: ${error.message}`
-    );
-    return [];
-  }
-}
+const { shared: sharedContentFiles, regular: regularContentFiles } =
+  categorizeContentFiles(filePaths);

 /**
 * Extract source from frontmatter or use the file path as source
--- a/scripts/build-llm-markdown.js
+++ b/scripts/build-llm-markdown.js
@ -13,16 +13,10 @@ import { glob } from 'glob';
 import fs from 'fs/promises';
 import { readFileSync } from 'fs';
 import path from 'path';
-import { fileURLToPath } from 'url';
-import { dirname } from 'path';
 import { createRequire } from 'module';
 import yaml from 'js-yaml';
 import pLimit from 'p-limit';

-// Get __dirname equivalent in ESM
-const __filename = fileURLToPath(import.meta.url);
-const __dirname = dirname(__filename);
-
 // Create require function for CommonJS modules
 const require = createRequire(import.meta.url);
 const { convertToMarkdown } = require('./lib/markdown-converter.cjs');
@ -44,13 +38,14 @@ const { convertToMarkdown } = require('./lib/markdown-converter.cjs');
 */
 const MIN_HTML_SIZE_BYTES = 1024;

-/**
- * Approximate character-to-token ratio for estimation.
- * Used to estimate token count from markdown content length.
- *
- * @default 4 - Rough heuristic (4 characters ≈ 1 token)
- */
-const CHARS_PER_TOKEN = 4;
+// ============================================================================
+// INCREMENTAL BUILD HELPERS
+// ============================================================================
+
+import {
+  getChangedContentFiles,
+  mapContentToPublic,
+} from './lib/content-utils.js';

 // ============================================================================
 // PHASE 1: HTML → MARKDOWN CONVERSION
@ -60,17 +55,48 @@ const CHARS_PER_TOKEN = 4;
 * Phase 1: Convert all HTML files to individual page markdown
 * Uses memory-bounded parallelism to avoid OOM in CI
 * @param {string} publicDir - Directory containing Hugo build output
+ * @param {Object} options - Build options
+ * @param {boolean} options.onlyChanged - Only process files changed since base branch
+ * @param {string} options.baseBranch - Base branch for comparison (default: 'origin/master')
 */
-async function buildPageMarkdown(publicDir = 'public') {
+async function buildPageMarkdown(publicDir = 'public', options = {}) {
+  const { onlyChanged = false, baseBranch = 'origin/master' } = options;
+
  console.log('📄 Converting HTML to Markdown (individual pages)...\n');
  const startTime = Date.now();

  // Find all HTML files
-  const htmlFiles = await glob(`${publicDir}/**/index.html`, {
+  let htmlFiles = await glob(`${publicDir}/**/index.html`, {
    ignore: ['**/node_modules/**', '**/api-docs/**'],
  });

-  console.log(`Found ${htmlFiles.length} HTML files\n`);
+  const totalFiles = htmlFiles.length;
+  console.log(`Found ${totalFiles} HTML files\n`);
+
+  // Filter to only changed files if requested
+  if (onlyChanged) {
+    const changedContentFiles = getChangedContentFiles(baseBranch);
+
+    if (changedContentFiles.length > 0) {
+      const changedHtmlSet = mapContentToPublic(changedContentFiles, publicDir);
+      const filteredFiles = htmlFiles.filter((f) => changedHtmlSet.has(f));
+
+      console.log(
+        `🔄 Incremental build: ${filteredFiles.length}/${totalFiles} files changed since ${baseBranch}\n`
+      );
+
+      if (filteredFiles.length === 0) {
+        console.log('  No matching HTML files found, skipping Phase 1\n');
+        return { converted: 0, skipped: 0, errors: [] };
+      }
+
+      htmlFiles = filteredFiles;
+    } else {
+      console.log(
+        '  ⚠️  No changed content files detected, processing all files\n'
+      );
+    }
+  }

  // Memory-bounded concurrency
  // CircleCI medium (2GB RAM): 10 workers safe
@ -381,6 +407,8 @@ function parseArgs() {
  const options = {
    environment: null,
    publicDir: 'public',
+    onlyChanged: false,
+    baseBranch: 'origin/master',
  };

  for (let i = 0; i < args.length; i++) {
@ -388,6 +416,10 @@ function parseArgs() {
      options.environment = args[++i];
    } else if (args[i] === '--public-dir' && args[i + 1]) {
      options.publicDir = args[++i];
+    } else if (args[i] === '--only-changed') {
+      options.onlyChanged = true;
+    } else if (args[i] === '--base-branch' && args[i + 1]) {
+      options.baseBranch = args[++i];
    }
  }

@ -412,14 +444,25 @@ async function main() {
  }

  // Show public directory
-  console.log(`📁 Public directory: ${cliOptions.publicDir}\n`);
+  console.log(`📁 Public directory: ${cliOptions.publicDir}`);

+  // Show build mode
+  if (cliOptions.onlyChanged) {
+    console.log(`🔄 Mode: Incremental (comparing to ${cliOptions.baseBranch})`);
+  } else {
+    console.log('📦 Mode: Full build');
+  }
+
+  console.log('');
  console.log('════════════════════════════════\n');

  const overallStart = Date.now();

  // Phase 1: Generate individual page markdown
-  const pageResults = await buildPageMarkdown(cliOptions.publicDir);
+  const pageResults = await buildPageMarkdown(cliOptions.publicDir, {
+    onlyChanged: cliOptions.onlyChanged,
+    baseBranch: cliOptions.baseBranch,
+  });

  // Phase 2: Build section bundles
  const sectionResults = await buildSectionBundles(cliOptions.publicDir);
@ -462,3 +505,10 @@ export {
  combineMarkdown,
  parseMarkdown,
 };
+
+// Re-export content utilities
+export {
+  getChangedContentFiles,
+  mapContentToPublic,
+  findPagesReferencingSharedContent,
+} from './lib/content-utils.js';
--- a/scripts/docs-edit.js
+++ b/scripts/docs-edit.js
@ -14,9 +14,10 @@ import { parseArgs } from 'node:util';
 import process from 'node:process';
 import { join, dirname } from 'path';
 import { fileURLToPath } from 'url';
-import { existsSync, readFileSync } from 'fs';
+import { existsSync } from 'fs';
 import { spawn } from 'child_process';
 import { parseDocumentationURL, urlToFilePaths } from './lib/url-parser.js';
+import { getSourceFromFrontmatter } from './lib/content-utils.js';

 const __filename = fileURLToPath(import.meta.url);
 const __dirname = dirname(__filename);
@ -128,24 +129,12 @@ function findFiles(url) {

 /**
 * Check if file uses shared content
+ * @param {string} filePath - Relative path from repo root
+ * @returns {string|null} Path to shared source file or null
 */
 function checkSharedContent(filePath) {
  const fullPath = join(REPO_ROOT, filePath);
-
-  if (!existsSync(fullPath)) {
-    return null;
-  }
-
-  const content = readFileSync(fullPath, 'utf8');
-
-  // Check for source: frontmatter
-  const sourceMatch = content.match(/^source:\s*(.+)$/m);
-  if (sourceMatch) {
-    const sourcePath = sourceMatch[1].trim();
-    return `content${sourcePath}`;
-  }
-
-  return null;
+  return getSourceFromFrontmatter(fullPath);
 }

 /**
--- a/scripts/lib/content-utils.js
+++ b/scripts/lib/content-utils.js
@ -0,0 +1,251 @@
+/**
+ * Content Utilities Library
+ *
+ * Shared utilities for working with content files, including:
+ * - Detecting changed files via git
+ * - Resolving shared content dependencies
+ * - Mapping content paths to public/URL paths
+ * - Extracting source frontmatter
+ *
+ * Used by:
+ * - scripts/build-llm-markdown.js (incremental builds)
+ * - scripts/docs-edit.js (opening shared source files)
+ * - cypress/support/map-files-to-urls.js (test file mapping)
+ */
+
+import { execFileSync, execSync } from 'child_process';
+import { existsSync, readFileSync } from 'fs';
+
+/**
+ * Find pages that pull in a shared content file through `source:` frontmatter.
+ *
+ * Runs a recursive `grep` over `content/` (relative to the current working
+ * directory) for `.md` and `.html` files containing `source: ...<shared path>`.
+ * grep is started without a shell and the path is escaped for grep's basic
+ * regular-expression syntax, so characters such as `.`, `$`, quotes or
+ * backticks in the path are matched literally and cannot alter the command.
+ *
+ * @param {string} sharedFilePath - Path to shared file (e.g., 'content/shared/sql-reference/_index.md')
+ * @returns {string[]} Content file paths that reference the shared file;
+ *   empty when nothing matches or when grep fails
+ */
+export function findPagesReferencingSharedContent(sharedFilePath) {
+  try {
+    // Frontmatter stores the path without the leading "content/" (source: /shared/...)
+    const relativePath = sharedFilePath.replace(/^content\//, '');
+
+    // Escape only what is special in a basic regular expression; escaping
+    // other characters (e.g. "+") would give them a meaning in GNU grep
+    const literalPath = relativePath.replace(/[.*[^$\\]/g, '\\$&');
+
+    const result = execFileSync(
+      'grep',
+      [
+        '-r',
+        '-l',
+        '--include=*.md',
+        '--include=*.html',
+        '-e',
+        `source: .*${literalPath}`,
+        'content/',
+      ],
+      {
+        encoding: 'utf-8',
+        stdio: ['pipe', 'pipe', 'pipe'],
+      }
+    ).trim();
+
+    return result ? result.split('\n').filter(Boolean) : [];
+  } catch (err) {
+    // grep exits with status 1 when no file matches
+    if (err.status === 1) {
+      return [];
+    }
+    console.warn(
+      `  ⚠️  Error finding references to ${sharedFilePath}: ${err.message}`
+    );
+    return [];
+  }
+}
+
+/**
+ * Widen a list of changed files so that a change to shared content also
+ * counts as a change to every page that includes it.
+ *
+ * Files outside `content/shared/` are kept as they are. Each file under
+ * `content/shared/` is replaced by the pages that reference it; the shared
+ * file itself is not part of the result.
+ * @param {string[]} changedFiles - Array of changed file paths
+ * @param {Object} options - Options
+ * @param {boolean} options.verbose - Log details about shared content resolution
+ * @returns {string[]} De-duplicated paths: regular files, then referencing pages
+ */
+export function expandSharedContentChanges(changedFiles, options = {}) {
+  const { verbose = false } = options;
+
+  const affectedPages = new Set();
+  const sharedChanges = [];
+
+  // Single pass: regular files are affected directly, shared files are queued
+  for (const changedFile of changedFiles) {
+    if (changedFile.startsWith('content/shared/')) {
+      sharedChanges.push(changedFile);
+    } else {
+      affectedPages.add(changedFile);
+    }
+  }
+
+  // Nothing shared changed, so there is nothing to look up
+  if (sharedChanges.length === 0) {
+    return [...affectedPages];
+  }
+
+  if (verbose) {
+    console.log(
+      `  📎 Found ${sharedChanges.length} shared content changes, finding referencing pages...`
+    );
+  }
+
+  for (const sharedFile of sharedChanges) {
+    const consumers = findPagesReferencingSharedContent(sharedFile);
+    if (consumers.length === 0) {
+      continue;
+    }
+    if (verbose) {
+      console.log(`     ${sharedFile} → ${consumers.length} pages`);
+    }
+    for (const page of consumers) {
+      affectedPages.add(page);
+    }
+  }
+
+  return [...affectedPages];
+}
+
+/**
+ * Run `git diff --name-only <diffArgs> -- content/` and return the listed paths.
+ * Git is started without a shell, so the arguments are passed through verbatim.
+ * @param {string[]} diffArgs - Extra `git diff` arguments (revision range, `--cached`, ...)
+ * @returns {string[]} Changed paths under content/
+ * @throws {Error} When git exits with a non-zero status or cannot be started
+ */
+function listChangedContentPaths(diffArgs) {
+  const output = execFileSync(
+    'git',
+    ['diff', '--name-only', ...diffArgs, '--', 'content/'],
+    {
+      encoding: 'utf-8',
+      stdio: ['pipe', 'pipe', 'pipe'],
+    }
+  );
+  return output.trim().split('\n').filter(Boolean);
+}
+
+/**
+ * Get list of content files that changed compared to base branch
+ * Includes both committed changes and uncommitted working tree changes
+ * Expands shared content changes to include all pages that reference them
+ *
+ * Three comparisons are merged: `<baseBranch>...HEAD` (committed), `HEAD`
+ * (working tree) and `--cached` (index). Each one is attempted on its own, so
+ * a failing comparison only removes its own contribution.
+ * @param {string} baseBranch - Branch to compare against (e.g., 'origin/master')
+ * @param {Object} options - Options
+ * @param {boolean} options.verbose - Log details about change detection,
+ *   including comparisons that could not be run (default: true)
+ * @returns {string[]} Array of changed content file paths; empty when nothing
+ *   changed or detection failed
+ */
+export function getChangedContentFiles(baseBranch, options = {}) {
+  const { verbose = true } = options;
+
+  try {
+    const comparisons = [
+      {
+        label: `committed changes since ${baseBranch}`,
+        diffArgs: [`${baseBranch}...HEAD`],
+      },
+      { label: 'uncommitted changes', diffArgs: ['HEAD'] },
+      { label: 'staged changes', diffArgs: ['--cached'] },
+    ];
+
+    const allChangedFiles = new Set();
+
+    for (const { label, diffArgs } of comparisons) {
+      try {
+        for (const file of listChangedContentPaths(diffArgs)) {
+          allChangedFiles.add(file);
+        }
+      } catch (err) {
+        // Best effort: keep going (e.g. the base branch may not exist
+        // locally), but say so instead of silently returning fewer files
+        if (verbose) {
+          console.warn(`  ⚠️  Could not list ${label}: ${err.message}`);
+        }
+      }
+    }
+
+    // Expand to include pages referencing changed shared content
+    return expandSharedContentChanges(Array.from(allChangedFiles), {
+      verbose,
+    });
+  } catch (err) {
+    console.warn(`  ⚠️  Could not detect changed files: ${err.message}`);
+    return []; // Fall back to full build
+  }
+}
+
+/**
+ * Map content file paths to their corresponding public HTML paths
+ *
+ * `content/<path>.md` maps to `<publicDir>/<path>/index.html`. Bundle index
+ * files (`_index.md` for sections, `index.md` for leaf bundles) map to the
+ * `index.html` of their own directory, including the site root
+ * (`content/_index.md` maps to `<publicDir>/index.html`). Files that do not
+ * end in `.md` are ignored.
+ * @param {string[]} contentFiles - Array of content file paths (e.g., 'content/influxdb3/core/page.md')
+ * @param {string} publicDir - Public directory (e.g., 'public' or 'workspace/public')
+ * @returns {Set<string>} Set of public HTML file paths
+ */
+export function mapContentToPublic(contentFiles, publicDir) {
+  const htmlPaths = new Set();
+
+  for (const file of contentFiles) {
+    // Only process markdown files
+    if (!file.endsWith('.md')) continue;
+
+    // Remove content/ prefix and .md extension
+    let urlPath = file.replace(/^content\//, '').replace(/\.md$/, '');
+
+    // Bundle index files render at their directory's URL. "(^|\/)" also
+    // covers the root index, which has no slash in front of it once the
+    // content/ prefix is gone.
+    urlPath = urlPath.replace(/(^|\/)_?index$/, '');
+
+    // Build public HTML path; collapse the duplicate slashes left behind by
+    // an empty urlPath or a trailing slash on publicDir
+    const htmlPath = `${publicDir}/${urlPath}/index.html`.replace(/\/+/g, '/');
+    htmlPaths.add(htmlPath);
+  }
+
+  return htmlPaths;
+}
+
+/**
+ * Split file paths into shared content and regular content pages.
+ *
+ * Only `.md` and `.html` files under `content/` are kept: those under
+ * `content/shared/` go to `shared`, the rest to `regular`. Every other path
+ * is dropped. Input order is preserved within each list.
+ * @param {string[]} files - Array of file paths
+ * @returns {{shared: string[], regular: string[]}} Categorized files
+ */
+export function categorizeContentFiles(files) {
+  const shared = [];
+  const regular = [];
+
+  for (const file of files) {
+    const isPage = file.endsWith('.md') || file.endsWith('.html');
+    if (!isPage || !file.startsWith('content/')) {
+      continue;
+    }
+
+    if (file.startsWith('content/shared/')) {
+      shared.push(file);
+    } else {
+      regular.push(file);
+    }
+  }
+
+  return { shared, regular };
+}
+
+/**
+ * Extract the source path from a file's frontmatter
+ * Used to find the shared content file that a page includes
+ *
+ * Only the leading `---` delimited frontmatter block is searched, so a
+ * `source:` line in the page body (for example inside a YAML code sample) is
+ * not mistaken for frontmatter. Surrounding quotes on the value are removed.
+ * @param {string} filePath - Path to the content file
+ * @returns {string|null} The source path (e.g., 'content/shared/sql-reference/_index.md'),
+ *   or null when the file is missing, unreadable, has no frontmatter block,
+ *   or has no non-empty `source:` key
+ */
+export function getSourceFromFrontmatter(filePath) {
+  if (!existsSync(filePath)) {
+    return null;
+  }
+
+  try {
+    const content = readFileSync(filePath, 'utf8');
+
+    // Isolate the frontmatter block (optional BOM, LF or CRLF line endings)
+    const frontmatter = content.match(
+      /^\uFEFF?---[ \t]*\r?\n([\s\S]*?)\r?\n---[ \t]*(?:\r?\n|$)/
+    );
+    if (!frontmatter) {
+      return null;
+    }
+
+    // Quick regex check for source: (avoids full YAML parsing). [ \t]* rather
+    // than \s* keeps an empty "source:" from capturing the following line.
+    const sourceMatch = frontmatter[1].match(/^source:[ \t]*(.+)$/m);
+    if (!sourceMatch) {
+      return null;
+    }
+
+    // Drop surrounding whitespace and a matching pair of YAML quotes
+    const sourcePath = sourceMatch[1].trim().replace(/^(['"])(.*)\1$/, '$2');
+    if (!sourcePath) {
+      return null;
+    }
+
+    // Normalize to content/ prefix format
+    if (sourcePath.startsWith('/')) {
+      return `content${sourcePath}`;
+    }
+    return sourcePath;
+  } catch {
+    return null;
+  }
+}
--- a/scripts/lib/markdown-converter.cjs
+++ b/scripts/lib/markdown-converter.cjs
@ -10,9 +10,7 @@
 */

 const TurndownService = require('turndown');
-const { JSDOM } = require('jsdom');
-const path = require('path');
-const fs = require('fs');
+const { JSDOM, VirtualConsole } = require('jsdom');
 const yaml = require('js-yaml');

 // Try to load Rust converter (10x faster), fall back to JavaScript
@ -167,7 +165,7 @@ function createTurndownService() {
        node.firstChild.nodeName === 'CODE'
      );
    },
-    replacement: function (content, node, options) {
+    replacement: function (_content, node, options) {
      const code = node.firstChild;
      const language = code.className.replace(/^language-/, '') || '';
      const fence = options.fence;
@ -204,7 +202,7 @@ function createTurndownService() {
  // Convert HTML tables to Markdown tables
  turndownService.addRule('tables', {
    filter: 'table',
-    replacement: function (content, node) {
+    replacement: function (_content, node) {
      // Get all rows from tbody and thead
      const theadRows = Array.from(node.querySelectorAll('thead tr'));
      const tbodyRows = Array.from(node.querySelectorAll('tbody tr'));
@ -303,7 +301,17 @@ function createTurndownService() {
 * @returns {Object|null} Object with title, description, content or null if not found
 */
 function extractArticleContent(htmlContent, contextInfo = '') {
-  const dom = new JSDOM(htmlContent);
+  // Create a virtual console to suppress CSS parsing errors
+  // JSDOM attempts to parse stylesheets which is unnecessary for markdown conversion
+  const virtualConsole = new VirtualConsole();
+  // Optionally forward errors to console for debugging (commented out to suppress)
+  // virtualConsole.sendTo(console, { omitJSDOMErrors: true });
+
+  const dom = new JSDOM(htmlContent, {
+    virtualConsole,
+    // NOTE(review): `resources: 'usable'` enables JSDOM subresource loading (e.g. linked stylesheets); omit the option to keep the default of loading none
+    resources: 'usable',
+  });
  const document = dom.window.document;

  try {