docs-v2/.github/scripts/link-extractor.cjs

477 lines
13 KiB
JavaScript

#!/usr/bin/env node
/**
* Link Extractor for Documentation Files
* Extracts all links from markdown and HTML files with metadata for caching and incremental validation
*/
const fs = require('fs');
const crypto = require('crypto');
const matter = require('gray-matter');
const path = require('path');
const process = require('process');
/**
* Extract links from markdown content
* @param {string} content - File content
* @param {string} filePath - Path to the file
* @returns {Array} Array of link objects with metadata
*/
function extractMarkdownLinks(content, filePath) {
const links = [];
const lines = content.split('\n');
// Track reference-style link definitions
const referenceLinks = new Map();
// First pass: collect reference definitions
content.replace(/^\s*\[([^\]]+)\]:\s*(.+)$/gm, (match, ref, url) => {
referenceLinks.set(ref.toLowerCase(), url.trim());
return match;
});
// Process each line for links
lines.forEach((line, lineIndex) => {
const lineNumber = lineIndex + 1;
// Standard markdown links
let match;
const standardLinkRegex = /\[([^\]]*)\]\(([^)]+)\)/g;
while ((match = standardLinkRegex.exec(line)) !== null) {
const linkText = match[1];
const url = match[2];
const columnStart = match.index;
links.push({
url: url.trim(),
text: linkText,
type: 'markdown',
line: lineNumber,
column: columnStart,
context: line.trim(),
hash: generateLinkHash(url.trim(), filePath, lineNumber),
});
}
// Reference-style links
const refLinkRegex = /\[([^\]]*)\]\[([^\]]*)\]/g;
while ((match = refLinkRegex.exec(line)) !== null) {
const linkText = match[1];
const refKey = (match[2] || linkText).toLowerCase();
const url = referenceLinks.get(refKey);
if (url) {
const columnStart = match.index;
links.push({
url: url,
text: linkText,
type: 'markdown-reference',
line: lineNumber,
column: columnStart,
context: line.trim(),
reference: refKey,
hash: generateLinkHash(url, filePath, lineNumber),
});
}
}
// Autolinks
const autolinkRegex = /<(https?:\/\/[^>]+)>/g;
while ((match = autolinkRegex.exec(line)) !== null) {
const url = match[1];
const columnStart = match.index;
links.push({
url: url,
text: url,
type: 'autolink',
line: lineNumber,
column: columnStart,
context: line.trim(),
hash: generateLinkHash(url, filePath, lineNumber),
});
}
// Bare URLs (basic detection, avoid false positives)
// Regex to match bare URLs in text
// - (?:^|[\s\n]): Match the start of the line or any whitespace character
// - (https?:\/\/): Match the protocol (http or https) followed by ://
// - [^\s\)\]\}]+: Match the rest of the URL, stopping at spaces or closing characters like ), ], or }
const bareUrlRegex = /(?<start>^|[\s\n])(?<url>https?:\/\/[^\s\)\]\}]+)/g;
while ((match = bareUrlRegex.exec(line)) !== null) {
const url = match.groups.url;
const columnStart = match.index + match[0].length - url.length;
// Skip if this URL is already captured in a proper markdown link
const alreadyCaptured = links.some(
(link) =>
link.line === lineNumber &&
Math.abs(link.column - columnStart) < 10 &&
link.url === url
);
if (!alreadyCaptured) {
links.push({
url: url,
text: url,
type: 'bare-url',
line: lineNumber,
column: columnStart,
context: line.trim(),
hash: generateLinkHash(url, filePath, lineNumber),
});
}
}
});
return links;
}
/**
* Extract links from HTML content
* @param {string} content - File content
* @param {string} filePath - Path to the file
* @returns {Array} Array of link objects with metadata
*/
function extractHtmlLinks(content, filePath) {
const links = [];
const lines = content.split('\n');
lines.forEach((line, lineIndex) => {
const lineNumber = lineIndex + 1;
let match;
const htmlLinkRegex = /<a\s+[^>]*href\s*=\s*["']([^"']+)["'][^>]*>/gi;
while ((match = htmlLinkRegex.exec(line)) !== null) {
const url = match[1];
const columnStart = match.index;
// Extract link text if possible
const fullMatch = match[0];
const textMatch = fullMatch.match(/>([^<]*)</);
const linkText = textMatch ? textMatch[1].trim() : url;
links.push({
url: url,
text: linkText,
type: 'html',
line: lineNumber,
column: columnStart,
context: line.trim(),
hash: generateLinkHash(url, filePath, lineNumber),
});
}
});
return links;
}
/**
* Generate a unique hash for a link
* @param {string} url - The URL
* @param {string} filePath - File path
* @param {number} line - Line number
* @returns {string} Hash string
*/
function generateLinkHash(url, filePath, line) {
const data = `${filePath}:${line}:${url.trim()}`;
return crypto
.createHash('sha256')
.update(data)
.digest('hex')
.substring(0, 16);
}
/**
* Generate a hash for file content
* @param {string} content - File content
* @returns {string} Hash string
*/
function generateFileHash(content) {
return crypto
.createHash('sha256')
.update(content)
.digest('hex')
.substring(0, 16);
}
/**
* Categorize link types for validation
* @param {string} url - The URL to categorize
* @returns {Object} Link category information
*/
function categorizeLinkType(url) {
const trimmedUrl = url.trim();
// External links
if (trimmedUrl.startsWith('http://') || trimmedUrl.startsWith('https://')) {
return {
category: 'external',
protocol: trimmedUrl.startsWith('https://') ? 'https' : 'http',
needsValidation: true,
};
}
// Internal absolute links
if (trimmedUrl.startsWith('/')) {
return {
category: 'internal-absolute',
needsValidation: true,
};
}
// Relative links
if (
trimmedUrl.startsWith('./') ||
trimmedUrl.startsWith('../') ||
(!trimmedUrl.startsWith('#') && !trimmedUrl.includes('://'))
) {
return {
category: 'internal-relative',
needsValidation: true,
};
}
// Fragment/anchor links
if (trimmedUrl.startsWith('#')) {
return {
category: 'fragment',
needsValidation: true, // May need validation for internal page anchors
};
}
// Special protocols (mailto, tel, etc.)
if (trimmedUrl.includes('://') && !trimmedUrl.startsWith('http')) {
return {
category: 'special-protocol',
needsValidation: false,
};
}
return {
category: 'unknown',
needsValidation: true,
};
}
/**
* Extract all links from a file
* @param {string} filePath - Path to the file
* @returns {Object} File analysis with links and metadata
*/
function extractLinksFromFile(filePath) {
try {
if (!fs.existsSync(filePath)) {
throw new Error(`File not found: ${filePath}`);
}
const content = fs.readFileSync(filePath, 'utf8');
const fileHash = generateFileHash(content);
const extension = path.extname(filePath).toLowerCase();
let links = [];
let frontmatter = {};
let bodyContent = content;
// Parse frontmatter for .md files
if (extension === '.md') {
try {
const parsed = matter(content);
frontmatter = parsed.data || {};
bodyContent = parsed.content;
} catch (err) {
console.warn(
`Warning: Could not parse frontmatter in ${filePath}: ${err.message}`
);
}
// Extract links from markdown content
links = extractMarkdownLinks(bodyContent, filePath);
} else if (extension === '.html') {
// Extract links from HTML content
links = extractHtmlLinks(content, filePath);
} else {
console.warn(`Warning: Unsupported file type for ${filePath}`);
return null;
}
// Categorize and enhance links
const enhancedLinks = links.map((link) => ({
...link,
...categorizeLinkType(link.url),
filePath,
}));
// Calculate statistics
const stats = {
totalLinks: enhancedLinks.length,
externalLinks: enhancedLinks.filter((l) => l.category === 'external')
.length,
internalLinks: enhancedLinks.filter((l) =>
l.category.startsWith('internal')
).length,
fragmentLinks: enhancedLinks.filter((l) => l.category === 'fragment')
.length,
linksNeedingValidation: enhancedLinks.filter((l) => l.needsValidation)
.length,
};
return {
filePath,
fileHash,
extension,
frontmatter,
links: enhancedLinks,
stats,
extractedAt: new Date().toISOString(),
};
} catch (error) {
console.error(`Error extracting links from ${filePath}: ${error.message}`);
return null;
}
}
/**
* Main function for CLI usage
*/
function main() {
const args = process.argv.slice(2);
if (args.length === 0) {
console.error('Usage: node link-extractor.cjs <file1> [file2] [...]');
console.error(' node link-extractor.cjs --help');
process.exit(1);
}
if (args[0] === '--help') {
console.log(`
Link Extractor for Documentation Files
Usage:
node link-extractor.cjs <file1> [file2] [...] Extract links from files
node link-extractor.cjs --help Show this help
Options:
--json Output results as JSON
--stats-only Show only statistics
--filter TYPE Filter links by category (external, internal-absolute, internal-relative, fragment)
Examples:
node link-extractor.cjs content/influxdb3/core/install.md
node link-extractor.cjs --json content/**/*.md
node link-extractor.cjs --stats-only --filter external content/influxdb3/**/*.md
`);
process.exit(0);
}
const jsonOutput = args.includes('--json');
const statsOnly = args.includes('--stats-only');
const filterType = args.includes('--filter')
? args[args.indexOf('--filter') + 1]
: null;
const files = args.filter(
(arg) => !arg.startsWith('--') && arg !== filterType
);
const results = [];
for (const filePath of files) {
const result = extractLinksFromFile(filePath);
if (result) {
// Apply filter if specified
if (filterType) {
result.links = result.links.filter(
(link) => link.category === filterType
);
// Recalculate stats after filtering
result.stats = {
totalLinks: result.links.length,
externalLinks: result.links.filter((l) => l.category === 'external')
.length,
internalLinks: result.links.filter((l) =>
l.category.startsWith('internal')
).length,
fragmentLinks: result.links.filter((l) => l.category === 'fragment')
.length,
linksNeedingValidation: result.links.filter((l) => l.needsValidation)
.length,
};
}
results.push(result);
}
}
if (jsonOutput) {
console.log(JSON.stringify(results, null, 2));
} else if (statsOnly) {
console.log('\nLink Extraction Statistics:');
console.log('==========================');
let totalFiles = 0;
let totalLinks = 0;
let totalExternal = 0;
let totalInternal = 0;
let totalFragment = 0;
let totalNeedingValidation = 0;
results.forEach((result) => {
totalFiles++;
totalLinks += result.stats.totalLinks;
totalExternal += result.stats.externalLinks;
totalInternal += result.stats.internalLinks;
totalFragment += result.stats.fragmentLinks;
totalNeedingValidation += result.stats.linksNeedingValidation;
console.log(
`${result.filePath}: ${result.stats.totalLinks} links (${result.stats.linksNeedingValidation} need validation)`
);
});
console.log('\nSummary:');
console.log(` Total files: ${totalFiles}`);
console.log(` Total links: ${totalLinks}`);
console.log(` External links: ${totalExternal}`);
console.log(` Internal links: ${totalInternal}`);
console.log(` Fragment links: ${totalFragment}`);
console.log(` Links needing validation: ${totalNeedingValidation}`);
} else {
results.forEach((result) => {
console.log(`\nFile: ${result.filePath}`);
console.log(`Hash: ${result.fileHash}`);
console.log(`Links found: ${result.stats.totalLinks}`);
console.log(
`Links needing validation: ${result.stats.linksNeedingValidation}`
);
if (result.links.length > 0) {
console.log('\nLinks:');
result.links.forEach((link, index) => {
console.log(` ${index + 1}. [${link.category}] ${link.url}`);
console.log(` Line ${link.line}, Column ${link.column}`);
console.log(` Text: "${link.text}"`);
console.log(` Hash: ${link.hash}`);
if (link.reference) {
console.log(` Reference: ${link.reference}`);
}
console.log('');
});
}
});
}
}
// Export functions for use as a module
module.exports = {
extractLinksFromFile,
extractMarkdownLinks,
extractHtmlLinks,
generateFileHash,
generateLinkHash,
categorizeLinkType,
};
// Run main function if called directly
if (require.main === module) {
main();
}