#!/usr/bin/env node /** * Link Extractor for Documentation Files * Extracts all links from markdown and HTML files with metadata for caching and incremental validation */ import fs from 'fs'; import crypto from 'crypto'; import matter from 'gray-matter'; import path from 'path'; import process from 'process'; import { fileURLToPath } from 'url'; /** * Extract links from markdown content * @param {string} content - File content * @param {string} filePath - Path to the file * @returns {Array} Array of link objects with metadata */ function extractMarkdownLinks(content, filePath) { const links = []; const lines = content.split('\n'); // Track reference-style link definitions const referenceLinks = new Map(); // First pass: collect reference definitions content.replace(/^\s*\[([^\]]+)\]:\s*(.+)$/gm, (match, ref, url) => { referenceLinks.set(ref.toLowerCase(), url.trim()); return match; }); // Process each line for links lines.forEach((line, lineIndex) => { const lineNumber = lineIndex + 1; // Standard markdown links let match; const standardLinkRegex = /\[([^\]]*)\]\(([^)]+)\)/g; while ((match = standardLinkRegex.exec(line)) !== null) { const linkText = match[1]; const url = match[2]; const columnStart = match.index; links.push({ url: url.trim(), text: linkText, type: 'markdown', line: lineNumber, column: columnStart, context: line.trim(), hash: generateLinkHash(url.trim(), filePath, lineNumber), }); } // Reference-style links const refLinkRegex = /\[([^\]]*)\]\[([^\]]*)\]/g; while ((match = refLinkRegex.exec(line)) !== null) { const linkText = match[1]; const refKey = (match[2] || linkText).toLowerCase(); const url = referenceLinks.get(refKey); if (url) { const columnStart = match.index; links.push({ url: url, text: linkText, type: 'markdown-reference', line: lineNumber, column: columnStart, context: line.trim(), reference: refKey, hash: generateLinkHash(url, filePath, lineNumber), }); } } // Autolinks const autolinkRegex = /<(https?:\/\/[^>]+)>/g; while ((match = autolinkRegex.exec(line)) !== null) { const url = match[1]; const columnStart = match.index; links.push({ url: url, text: url, type: 'autolink', line: lineNumber, column: columnStart, context: line.trim(), hash: generateLinkHash(url, filePath, lineNumber), }); } // Bare URLs (basic detection, avoid false positives) // Regex to match bare URLs in text // - (?:^|[\s\n]): Match the start of the line or any whitespace character // - (https?:\/\/): Match the protocol (http or https) followed by :// // - [^\s\)\]\}]+: Match the rest of the URL, stopping at spaces or closing characters like ), ], or } const bareUrlRegex = /(?^|[\s\n])(?https?:\/\/[^\s\)\]\}]+)/g; while ((match = bareUrlRegex.exec(line)) !== null) { const url = match.groups.url; const columnStart = match.index + match[0].length - url.length; // Skip if this URL is already captured in a proper markdown link const alreadyCaptured = links.some( (link) => link.line === lineNumber && Math.abs(link.column - columnStart) < 10 && link.url === url ); if (!alreadyCaptured) { links.push({ url: url, text: url, type: 'bare-url', line: lineNumber, column: columnStart, context: line.trim(), hash: generateLinkHash(url, filePath, lineNumber), }); } } }); return links; } /** * Extract links from HTML content * @param {string} content - File content * @param {string} filePath - Path to the file * @returns {Array} Array of link objects with metadata */ function extractHtmlLinks(content, filePath) { const links = []; const lines = content.split('\n'); lines.forEach((line, lineIndex) => { const lineNumber = lineIndex + 1; let match; const htmlLinkRegex = /]*href\s*=\s*["']([^"']+)["'][^>]*>/gi; while ((match = htmlLinkRegex.exec(line)) !== null) { const url = match[1]; const columnStart = match.index; // Extract link text if possible const fullMatch = match[0]; const textMatch = fullMatch.match(/>([^<]*) ({ ...link, ...categorizeLinkType(link.url), filePath, })); // Calculate statistics const stats = { totalLinks: enhancedLinks.length, externalLinks: enhancedLinks.filter((l) => l.category === 'external') .length, internalLinks: enhancedLinks.filter((l) => l.category.startsWith('internal') ).length, fragmentLinks: enhancedLinks.filter((l) => l.category === 'fragment') .length, linksNeedingValidation: enhancedLinks.filter((l) => l.needsValidation) .length, }; return { filePath, fileHash, extension, frontmatter, links: enhancedLinks, stats, extractedAt: new Date().toISOString(), }; } catch (error) { console.error(`Error extracting links from ${filePath}: ${error.message}`); return null; } } /** * Main function for CLI usage */ function main() { const args = process.argv.slice(2); if (args.length === 0) { console.error('Usage: node link-extractor.js [file2] [...]'); console.error(' node link-extractor.js --help'); process.exit(1); } if (args[0] === '--help') { console.log(` Link Extractor for Documentation Files Usage: node link-extractor.js [file2] [...] Extract links from files node link-extractor.js --help Show this help Options: --json Output results as JSON --stats-only Show only statistics --filter TYPE Filter links by category (external, internal-absolute, internal-relative, fragment) Examples: node link-extractor.js content/influxdb3/core/install.md node link-extractor.js --json content/**/*.md node link-extractor.js --stats-only --filter external content/influxdb3/**/*.md `); process.exit(0); } const jsonOutput = args.includes('--json'); const statsOnly = args.includes('--stats-only'); const filterType = args.includes('--filter') ? args[args.indexOf('--filter') + 1] : null; const files = args.filter( (arg) => !arg.startsWith('--') && arg !== filterType ); const results = []; for (const filePath of files) { const result = extractLinksFromFile(filePath); if (result) { // Apply filter if specified if (filterType) { result.links = result.links.filter( (link) => link.category === filterType ); // Recalculate stats after filtering result.stats = { totalLinks: result.links.length, externalLinks: result.links.filter((l) => l.category === 'external') .length, internalLinks: result.links.filter((l) => l.category.startsWith('internal') ).length, fragmentLinks: result.links.filter((l) => l.category === 'fragment') .length, linksNeedingValidation: result.links.filter((l) => l.needsValidation) .length, }; } results.push(result); } } if (jsonOutput) { console.log(JSON.stringify(results, null, 2)); } else if (statsOnly) { console.log('\nLink Extraction Statistics:'); console.log('=========================='); let totalFiles = 0; let totalLinks = 0; let totalExternal = 0; let totalInternal = 0; let totalFragment = 0; let totalNeedingValidation = 0; results.forEach((result) => { totalFiles++; totalLinks += result.stats.totalLinks; totalExternal += result.stats.externalLinks; totalInternal += result.stats.internalLinks; totalFragment += result.stats.fragmentLinks; totalNeedingValidation += result.stats.linksNeedingValidation; console.log( `${result.filePath}: ${result.stats.totalLinks} links (${result.stats.linksNeedingValidation} need validation)` ); }); console.log('\nSummary:'); console.log(` Total files: ${totalFiles}`); console.log(` Total links: ${totalLinks}`); console.log(` External links: ${totalExternal}`); console.log(` Internal links: ${totalInternal}`); console.log(` Fragment links: ${totalFragment}`); console.log(` Links needing validation: ${totalNeedingValidation}`); } else { results.forEach((result) => { console.log(`\nFile: ${result.filePath}`); console.log(`Hash: ${result.fileHash}`); console.log(`Links found: ${result.stats.totalLinks}`); console.log( `Links needing validation: ${result.stats.linksNeedingValidation}` ); if (result.links.length > 0) { console.log('\nLinks:'); result.links.forEach((link, index) => { console.log(` ${index + 1}. [${link.category}] ${link.url}`); console.log(` Line ${link.line}, Column ${link.column}`); console.log(` Text: "${link.text}"`); console.log(` Hash: ${link.hash}`); if (link.reference) { console.log(` Reference: ${link.reference}`); } console.log(''); }); } }); } } // Export functions for use as a module export { extractLinksFromFile, extractMarkdownLinks, extractHtmlLinks, generateFileHash, generateLinkHash, categorizeLinkType, }; // Run main function if called directly if (fileURLToPath(import.meta.url) === process.argv[1]) { main(); }