feat(ci): add incremental builds and shared content-utils (#6582)

- Incremental Markdown build for PRs, full build for production
- Shared content-utils library for:
  - Mapping shared content to consuming pages (Markdown generation, Cypress)
  - Listing changed content pages (committed, uncommitted, staged)
  - Extracting source frontmatter (docs edit)
- Fix CSS parsing warnings with JSDOM VirtualConsole
- Remove unused imports and variables
fix-prefer-database-over-namespace^2
Jason Stirnaman 2025-12-01 19:45:42 -05:00 committed by GitHub
parent b95e608d07
commit 4cb455b1ae
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 374 additions and 85 deletions

View File

@ -44,7 +44,14 @@ jobs:
command: yarn hugo --environment production --logLevel info --gc --destination workspace/public
- run:
name: Generate LLM-friendly Markdown
command: yarn build:md --public-dir workspace/public
command: |
if [ "$CIRCLE_BRANCH" = "master" ]; then
# Full build for production deployments
yarn build:md --public-dir workspace/public
else
# Incremental build for PRs - only process changed files
yarn build:md --public-dir workspace/public --only-changed --base-branch origin/master
fi
- persist_to_workspace:
root: workspace
paths:

View File

@ -161,6 +161,28 @@ const CONCURRENCY = process.env.CI ? 10 : 20;
- **Memory**: \~300MB peak (safe for 2GB CircleCI)
- **Rate**: \~23 files/second with memory-bounded parallelism
## Making Deployment Changes
### During Initial Implementation
If making changes that affect `yarn build` commands or `.circleci/config.yml`:
1. **Read the surrounding context** in the CI file
2. **Notice** flags, such as `--destination workspace/public` on the Hugo build
3. **Ask**: "Does the build script need to know about environment details--for example, do paths differ between production and staging?"
### Recommended Prompt for Future Similar Work
> "This script will run in CI. Let me read the CI configuration to understand the build environment and directory structure before finalizing the implementation."
## Summary of Recommendations
| Strategy | Implementation | Effort |
| ---------------------------------- | ---------------------------------- | ------ |
| Read CI config before implementing | Process/habit change | Low |
| Test on feature branch first | Push and watch CI before merging | Low |
| Add CI validation step | Add file count check in config.yml | Low |
## Testing and Validation
### Local Testing

View File

@ -2,60 +2,22 @@
import process from 'process';
import fs from 'fs';
import { execSync } from 'child_process';
import matter from 'gray-matter';
import { filePathToUrl } from '../../.github/scripts/utils/url-transformer.js';
import {
findPagesReferencingSharedContent,
categorizeContentFiles,
} from '../../scripts/lib/content-utils.js';
// Get file paths from command line arguments
const filePaths = process.argv.slice(2).filter((arg) => !arg.startsWith('--'));
// Parse options
const debugMode = process.argv.includes('--debug'); // deprecated, no longer used
const jsonMode = process.argv.includes('--json');
// Separate shared content files and regular content files
const sharedContentFiles = filePaths.filter(
(file) =>
file.startsWith('content/shared/') &&
(file.endsWith('.md') || file.endsWith('.html'))
);
const regularContentFiles = filePaths.filter(
(file) =>
file.startsWith('content/') &&
!file.startsWith('content/shared/') &&
(file.endsWith('.md') || file.endsWith('.html'))
);
/**
 * Find pages that reference a shared content file in their frontmatter.
 *
 * Runs a recursive `grep -l` over `content/` (relative to the current working
 * directory), limited to `*.md` and `*.html` files, looking for a
 * `source: ...<path>` line.
 *
 * NOTE(review): a function with this same name is also imported from
 * scripts/lib/content-utils.js in this listing; presumably this local copy is
 * the superseded one - confirm before relying on it.
 *
 * @param {string} sharedFilePath - Shared file path; a leading "content/" is stripped before searching
 * @returns {string[]} Paths printed by grep, or [] when nothing matches or grep fails
 */
function findPagesReferencingSharedContent(sharedFilePath) {
try {
// Remove the leading "content/" to match how it would appear in frontmatter
const relativePath = sharedFilePath.replace(/^content\//, '');
// Use grep to find files that reference this shared content in frontmatter
// Look for source: <path> pattern in YAML frontmatter
// NOTE(review): relativePath is interpolated unescaped into both the shell
// command and the grep pattern, so regex/shell metacharacters in the path are
// interpreted rather than matched literally.
const grepCmd = `grep -l "source: .*${relativePath}" --include="*.md" --include="*.html" -r content/`;
// Execute grep command and parse results
const result = execSync(grepCmd, { encoding: 'utf8' }).trim();
if (!result) {
return [];
}
// One matching file path per output line
return result.split('\n').filter(Boolean);
} catch (error) {
// grep returns non-zero exit code when no matches are found
if (error.status === 1) {
return [];
}
// Any other failure is logged and treated as "no references"
console.error(
`Error finding references to ${sharedFilePath}: ${error.message}`
);
return [];
}
}
const { shared: sharedContentFiles, regular: regularContentFiles } =
categorizeContentFiles(filePaths);
/**
* Extract source from frontmatter or use the file path as source

View File

@ -13,16 +13,10 @@ import { glob } from 'glob';
import fs from 'fs/promises';
import { readFileSync } from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { dirname } from 'path';
import { createRequire } from 'module';
import yaml from 'js-yaml';
import pLimit from 'p-limit';
// Get __dirname equivalent in ESM
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Create require function for CommonJS modules
const require = createRequire(import.meta.url);
const { convertToMarkdown } = require('./lib/markdown-converter.cjs');
@ -44,13 +38,14 @@ const { convertToMarkdown } = require('./lib/markdown-converter.cjs');
*/
const MIN_HTML_SIZE_BYTES = 1024;
/**
* Approximate character-to-token ratio for estimation.
* Used to estimate token count from markdown content length.
*
* @default 4 - Rough heuristic (4 characters ≈ 1 token)
*/
const CHARS_PER_TOKEN = 4;
// ============================================================================
// INCREMENTAL BUILD HELPERS
// ============================================================================
import {
getChangedContentFiles,
mapContentToPublic,
} from './lib/content-utils.js';
// ============================================================================
// PHASE 1: HTML → MARKDOWN CONVERSION
@ -60,17 +55,48 @@ const CHARS_PER_TOKEN = 4;
* Phase 1: Convert all HTML files to individual page markdown
* Uses memory-bounded parallelism to avoid OOM in CI
* @param {string} publicDir - Directory containing Hugo build output
* @param {Object} options - Build options
* @param {boolean} options.onlyChanged - Only process files changed since base branch
* @param {string} options.baseBranch - Base branch for comparison (default: 'origin/master')
*/
async function buildPageMarkdown(publicDir = 'public') {
async function buildPageMarkdown(publicDir = 'public', options = {}) {
const { onlyChanged = false, baseBranch = 'origin/master' } = options;
console.log('📄 Converting HTML to Markdown (individual pages)...\n');
const startTime = Date.now();
// Find all HTML files
const htmlFiles = await glob(`${publicDir}/**/index.html`, {
let htmlFiles = await glob(`${publicDir}/**/index.html`, {
ignore: ['**/node_modules/**', '**/api-docs/**'],
});
console.log(`Found ${htmlFiles.length} HTML files\n`);
const totalFiles = htmlFiles.length;
console.log(`Found ${totalFiles} HTML files\n`);
// Filter to only changed files if requested
if (onlyChanged) {
const changedContentFiles = getChangedContentFiles(baseBranch);
if (changedContentFiles.length > 0) {
const changedHtmlSet = mapContentToPublic(changedContentFiles, publicDir);
const filteredFiles = htmlFiles.filter((f) => changedHtmlSet.has(f));
console.log(
`🔄 Incremental build: ${filteredFiles.length}/${totalFiles} files changed since ${baseBranch}\n`
);
if (filteredFiles.length === 0) {
console.log(' No matching HTML files found, skipping Phase 1\n');
return { converted: 0, skipped: 0, errors: [] };
}
htmlFiles = filteredFiles;
} else {
console.log(
' ⚠️ No changed content files detected, processing all files\n'
);
}
}
// Memory-bounded concurrency
// CircleCI medium (2GB RAM): 10 workers safe
@ -381,6 +407,8 @@ function parseArgs() {
const options = {
environment: null,
publicDir: 'public',
onlyChanged: false,
baseBranch: 'origin/master',
};
for (let i = 0; i < args.length; i++) {
@ -388,6 +416,10 @@ function parseArgs() {
options.environment = args[++i];
} else if (args[i] === '--public-dir' && args[i + 1]) {
options.publicDir = args[++i];
} else if (args[i] === '--only-changed') {
options.onlyChanged = true;
} else if (args[i] === '--base-branch' && args[i + 1]) {
options.baseBranch = args[++i];
}
}
@ -412,14 +444,25 @@ async function main() {
}
// Show public directory
console.log(`📁 Public directory: ${cliOptions.publicDir}\n`);
console.log(`📁 Public directory: ${cliOptions.publicDir}`);
// Show build mode
if (cliOptions.onlyChanged) {
console.log(`🔄 Mode: Incremental (comparing to ${cliOptions.baseBranch})`);
} else {
console.log('📦 Mode: Full build');
}
console.log('');
console.log('════════════════════════════════\n');
const overallStart = Date.now();
// Phase 1: Generate individual page markdown
const pageResults = await buildPageMarkdown(cliOptions.publicDir);
const pageResults = await buildPageMarkdown(cliOptions.publicDir, {
onlyChanged: cliOptions.onlyChanged,
baseBranch: cliOptions.baseBranch,
});
// Phase 2: Build section bundles
const sectionResults = await buildSectionBundles(cliOptions.publicDir);
@ -462,3 +505,10 @@ export {
combineMarkdown,
parseMarkdown,
};
// Re-export content utilities
export {
getChangedContentFiles,
mapContentToPublic,
findPagesReferencingSharedContent,
} from './lib/content-utils.js';

View File

@ -14,9 +14,10 @@ import { parseArgs } from 'node:util';
import process from 'node:process';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { existsSync, readFileSync } from 'fs';
import { existsSync } from 'fs';
import { spawn } from 'child_process';
import { parseDocumentationURL, urlToFilePaths } from './lib/url-parser.js';
import { getSourceFromFrontmatter } from './lib/content-utils.js';
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
@ -128,24 +129,12 @@ function findFiles(url) {
/**
 * Check if file uses shared content
 *
 * Reads the file under REPO_ROOT and looks for the first line that starts with
 * `source:`; when found, the value is returned with a `content` prefix.
 *
 * NOTE(review): as listed, the trailing call to getSourceFromFrontmatter() sits
 * after an unconditional `return null` and is unreachable - this looks like the
 * old inline implementation and its replacement shown together; confirm which
 * body is intended.
 *
 * @param {string} filePath - Relative path from repo root
 * @returns {string|null} Path to shared source file or null
 */
function checkSharedContent(filePath) {
const fullPath = join(REPO_ROOT, filePath);
// A missing file cannot reference shared content
if (!existsSync(fullPath)) {
return null;
}
const content = readFileSync(fullPath, 'utf8');
// Check for source: frontmatter
// (the multiline regex matches a `source:` line anywhere in the file)
const sourceMatch = content.match(/^source:\s*(.+)$/m);
if (sourceMatch) {
const sourcePath = sourceMatch[1].trim();
// The prefix is added unconditionally, whatever the value starts with
return `content${sourcePath}`;
}
return null;
// Unreachable as written (see NOTE above)
return getSourceFromFrontmatter(fullPath);
}
/**

View File

@ -0,0 +1,251 @@
/**
* Content Utilities Library
*
* Shared utilities for working with content files, including:
* - Detecting changed files via git
* - Resolving shared content dependencies
* - Mapping content paths to public/URL paths
* - Extracting source frontmatter
*
* Used by:
* - scripts/build-llm-markdown.js (incremental builds)
* - scripts/docs-edit.js (opening shared source files)
* - cypress/support/map-files-to-urls.js (test file mapping)
*/
import { execFileSync, execSync } from 'child_process';
import { existsSync, readFileSync } from 'fs';
/**
 * Find pages that reference a shared content file via source: frontmatter.
 *
 * Runs a recursive `grep -l` over `content/` (relative to the current working
 * directory), limited to `*.md` and `*.html` files. grep is invoked directly
 * with an argument array (no shell), and the path is escaped so it is matched
 * literally rather than as a regular expression.
 *
 * @param {string} sharedFilePath - Path to shared file (e.g., 'content/shared/sql-reference/_index.md')
 * @returns {string[]} Array of content file paths that reference this shared file;
 *   empty when nothing matches or when grep cannot be run (a warning is logged)
 */
export function findPagesReferencingSharedContent(sharedFilePath) {
  try {
    // Remove leading "content/" to match frontmatter format (source: /shared/...)
    const relativePath = sharedFilePath.replace(/^content\//, '');

    // grep uses basic regular expressions by default: escape the BRE
    // metacharacters (\ . * [ ^ $) so "a.md" cannot match "axmd".
    const literalPath = relativePath.replace(/[\\.*[^$]/g, '\\$&');

    // No shell is involved, so quotes, `$(...)` or backticks in the path are
    // passed to grep as plain text instead of being interpreted.
    const output = execFileSync(
      'grep',
      [
        '-l',
        '-e',
        `source: .*${literalPath}`,
        '--include=*.md',
        '--include=*.html',
        '-r',
        'content/',
      ],
      {
        encoding: 'utf-8',
        stdio: ['pipe', 'pipe', 'pipe'],
      }
    ).trim();

    return output ? output.split('\n').filter(Boolean) : [];
  } catch (err) {
    // grep returns exit code 1 when no matches found
    if (err.status === 1) {
      return [];
    }
    console.warn(
      ` ⚠️ Error finding references to ${sharedFilePath}: ${err.message}`
    );
    return [];
  }
}
/**
 * Expand a list of changed files to include pages that reference changed shared content.
 *
 * Files under `content/shared/` are replaced by the pages that reference them
 * (as reported by findPagesReferencingSharedContent); the shared files
 * themselves are not part of the result. All other files are kept, with
 * duplicates removed and first-seen order preserved.
 *
 * @param {string[]} changedFiles - Array of changed file paths
 * @param {Object} options - Options
 * @param {boolean} options.verbose - Log details about shared content resolution
 * @returns {string[]} Expanded array including pages referencing changed shared content
 */
export function expandSharedContentChanges(changedFiles, options = {}) {
  const { verbose = false } = options;
  const isShared = (file) => file.startsWith('content/shared/');

  const sharedFiles = changedFiles.filter(isShared);
  // Start with regular files; the Set removes duplicates
  const allAffectedFiles = new Set(
    changedFiles.filter((file) => !isShared(file))
  );

  if (sharedFiles.length === 0) {
    return Array.from(allAffectedFiles);
  }

  if (verbose) {
    console.log(
      ` 📎 Found ${sharedFiles.length} shared content changes, finding referencing pages...`
    );
  }

  // For each changed shared file, add every page that references it
  for (const sharedFile of sharedFiles) {
    const referencingPages = findPagesReferencingSharedContent(sharedFile);
    if (referencingPages.length === 0) {
      continue;
    }
    if (verbose) {
      // Separator keeps the file name and the count from running together
      console.log(` ${sharedFile} → ${referencingPages.length} pages`);
    }
    for (const page of referencingPages) {
      allAffectedFiles.add(page);
    }
  }

  return Array.from(allAffectedFiles);
}
/**
 * Run one `git diff --name-only` variant restricted to `content/`.
 * A failing git call yields an empty list; the reason is logged when verbose.
 *
 * @param {string} label - Human-readable name of the change set, used in warnings
 * @param {string[]} diffArgs - Revision/mode arguments placed before `-- content/`
 * @param {boolean} verbose - Whether to warn when git fails
 * @returns {string[]} Paths printed by git, one per output line
 */
function listGitChangedPaths(label, diffArgs, verbose) {
  try {
    // Argument array, no shell: branch names are never interpreted as shell syntax
    const output = execFileSync(
      'git',
      ['diff', '--name-only', ...diffArgs, '--', 'content/'],
      {
        encoding: 'utf-8',
        stdio: ['pipe', 'pipe', 'pipe'],
      }
    );
    return output.trim().split('\n').filter(Boolean);
  } catch (err) {
    // Best effort: e.g. the base branch may not exist locally
    if (verbose) {
      const reason = String(err.stderr || err.message || '').trim();
      console.warn(` ⚠️ Could not list ${label} changes: ${reason}`);
    }
    return [];
  }
}

/**
 * Get list of content files that changed compared to base branch.
 *
 * Combines three git queries, each limited to `content/`:
 * committed changes (`<baseBranch>...HEAD`), working-tree changes against
 * `HEAD`, and staged changes (`--cached`). Each query is best effort: if it
 * fails, its result is treated as empty. The combined, de-duplicated list is
 * then passed through expandSharedContentChanges.
 *
 * @param {string} baseBranch - Branch to compare against (e.g., 'origin/master')
 * @param {Object} options - Options
 * @param {boolean} options.verbose - Log details about change detection,
 *   including git commands that failed (default: true)
 * @returns {string[]} Array of changed content file paths; empty when nothing
 *   could be detected
 */
export function getChangedContentFiles(baseBranch, options = {}) {
  const { verbose = true } = options;

  try {
    // Set insertion order keeps committed paths first, then uncommitted, then staged
    const allChangedFiles = new Set([
      ...listGitChangedPaths('committed', [`${baseBranch}...HEAD`], verbose),
      ...listGitChangedPaths('uncommitted', ['HEAD'], verbose),
      ...listGitChangedPaths('staged', ['--cached'], verbose),
    ]);

    // Expand to include pages referencing changed shared content
    return expandSharedContentChanges(Array.from(allChangedFiles), { verbose });
  } catch (err) {
    console.warn(` ⚠️ Could not detect changed files: ${err.message}`);
    return []; // Fall back to full build
  }
}
/**
 * Map content file paths to their corresponding public HTML paths.
 *
 * Non-Markdown entries are ignored. `_index.md` (section pages) and `index.md`
 * (leaf bundles) map to the `index.html` of their own directory, including the
 * site root (`content/_index.md` → `<publicDir>/index.html`).
 *
 * @param {string[]} contentFiles - Array of content file paths (e.g., 'content/influxdb3/core/page.md')
 * @param {string} publicDir - Public directory (e.g., 'public' or 'workspace/public')
 * @returns {Set<string>} Set of public HTML file paths
 */
export function mapContentToPublic(contentFiles, publicDir) {
  const htmlPaths = new Set();

  for (const file of contentFiles) {
    // Only process markdown files
    if (!file.endsWith('.md')) continue;

    const urlPath = file
      // Remove content/ prefix and .md extension
      .replace(/^content\//, '')
      .replace(/\.md$/, '')
      // Drop a trailing index marker; `(^|\/)` also covers the root-level
      // file, which has no directory segment in front of it
      .replace(/(^|\/)_?index$/, '');

    // Build public HTML path, collapsing duplicate slashes (e.g. empty urlPath)
    htmlPaths.add(`${publicDir}/${urlPath}/index.html`.replace(/\/+/g, '/'));
  }

  return htmlPaths;
}
/**
 * Split content file paths into shared and regular groups.
 *
 * Only `.md` and `.html` paths under `content/` are considered; anything else
 * is dropped. Paths under `content/shared/` go to `shared`, the rest to
 * `regular`. Input order is preserved within each group.
 *
 * @param {string[]} files - Array of file paths
 * @returns {{shared: string[], regular: string[]}} Categorized files
 */
export function categorizeContentFiles(files) {
  return files.reduce(
    (groups, candidate) => {
      const hasPageExtension =
        candidate.endsWith('.md') || candidate.endsWith('.html');
      if (!hasPageExtension || !candidate.startsWith('content/')) {
        return groups;
      }

      const bucket = candidate.startsWith('content/shared/')
        ? groups.shared
        : groups.regular;
      bucket.push(candidate);
      return groups;
    },
    { shared: [], regular: [] }
  );
}
/**
 * Extract the source path from a file's frontmatter.
 * Used to find the shared content file that a page includes.
 *
 * Only the leading `---` delimited frontmatter block is inspected, so a
 * `source:` line in the page body (for example inside a code sample) is
 * ignored. Surrounding quotes and a trailing ` # comment` are removed from the
 * value. Values starting with `/` are prefixed with `content`; other values
 * are returned as written.
 *
 * @param {string} filePath - Path to the content file
 * @returns {string|null} The source path (e.g., 'content/shared/sql-reference/_index.md'),
 *   or null when the file is missing, unreadable, has no frontmatter, or has no
 *   non-empty `source:` entry
 */
export function getSourceFromFrontmatter(filePath) {
  if (!existsSync(filePath)) {
    return null;
  }

  try {
    const content = readFileSync(filePath, 'utf8');

    // Isolate the frontmatter block (optional BOM, LF or CRLF line endings)
    const frontmatter = content.match(
      /^\uFEFF?---[ \t]*\r?\n([\s\S]*?)\r?\n---[ \t]*(?:\r?\n|$)/
    );
    if (!frontmatter) {
      return null;
    }

    // Quick regex check for source: (avoids full YAML parsing).
    // `[ \t]*` rather than `\s*` so an empty `source:` cannot capture the next line.
    const sourceMatch = frontmatter[1].match(/^source:[ \t]*(.+)$/m);
    if (!sourceMatch) {
      return null;
    }

    let sourcePath = sourceMatch[1].replace(/\s+#.*$/, '').trim();
    const quoted = sourcePath.match(/^(["'])(.*)\1$/);
    if (quoted) {
      sourcePath = quoted[2].trim();
    }
    if (!sourcePath) {
      return null;
    }

    // Normalize to content/ prefix format
    return sourcePath.startsWith('/') ? `content${sourcePath}` : sourcePath;
  } catch {
    // Unreadable file: treated the same as "no source"
    return null;
  }
}

View File

@ -10,9 +10,7 @@
*/
const TurndownService = require('turndown');
const { JSDOM } = require('jsdom');
const path = require('path');
const fs = require('fs');
const { JSDOM, VirtualConsole } = require('jsdom');
const yaml = require('js-yaml');
// Try to load Rust converter (10x faster), fall back to JavaScript
@ -167,7 +165,7 @@ function createTurndownService() {
node.firstChild.nodeName === 'CODE'
);
},
replacement: function (content, node, options) {
replacement: function (_content, node, options) {
const code = node.firstChild;
const language = code.className.replace(/^language-/, '') || '';
const fence = options.fence;
@ -204,7 +202,7 @@ function createTurndownService() {
// Convert HTML tables to Markdown tables
turndownService.addRule('tables', {
filter: 'table',
replacement: function (content, node) {
replacement: function (_content, node) {
// Get all rows from tbody and thead
const theadRows = Array.from(node.querySelectorAll('thead tr'));
const tbodyRows = Array.from(node.querySelectorAll('tbody tr'));
@ -303,7 +301,17 @@ function createTurndownService() {
* @returns {Object|null} Object with title, description, content or null if not found
*/
function extractArticleContent(htmlContent, contextInfo = '') {
const dom = new JSDOM(htmlContent);
// Create a virtual console to suppress CSS parsing errors
// JSDOM attempts to parse stylesheets which is unnecessary for markdown conversion
const virtualConsole = new VirtualConsole();
// Optionally forward errors to console for debugging (commented out to suppress)
// virtualConsole.sendTo(console, { omitJSDOMErrors: true });
const dom = new JSDOM(htmlContent, {
virtualConsole,
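// NOTE(review): in jsdom, `resources: 'usable'` ENABLES loading of subresources
// (stylesheets, scripts, frames); omit the option (the default) to skip loading them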
resources: 'usable',
});
const document = dom.window.document;
try {