477 lines
13 KiB
JavaScript
477 lines
13 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
/**
|
|
* Link Extractor for Documentation Files
|
|
* Extracts all links from markdown and HTML files with metadata for caching and incremental validation
|
|
*/
|
|
|
|
const fs = require('fs');
|
|
const crypto = require('crypto');
|
|
const matter = require('gray-matter');
|
|
const path = require('path');
|
|
const process = require('process');
|
|
|
|
/**
 * Extract links from markdown content.
 *
 * Scans the content line by line and records inline links (`[text](url)`),
 * reference-style links (`[text][ref]`), autolinks (`<https://...>`) and bare
 * `http(s)://` URLs. Destinations are reduced to the URL itself: an optional
 * link title and enclosing angle brackets are dropped, and sentence
 * punctuation that trails a bare URL is not treated as part of the URL.
 *
 * @param {string} content - File content
 * @param {string} filePath - Path to the file (passed to generateLinkHash)
 * @returns {Array} Array of link objects with metadata: `url`, `text`, `type`,
 *   1-based `line`, 0-based `column`, trimmed `context` line, `hash`, and
 *   `reference` (lower-cased key) for reference-style links
 */
function extractMarkdownLinks(content, filePath) {
  // Reduce a raw destination such as `<url> "Title"` to just `url`.
  // The title is only removed when the remainder is a single quoted or
  // parenthesized chunk, so destinations that legitimately contain spaces
  // (for example template shortcodes) are left untouched.
  const cleanDestination = (rawDestination) => {
    let destination = rawDestination.trim();
    const withTitle = /^(\S+)\s+(?:"[^"]*"|'[^']*'|\([^()]*\))$/.exec(
      destination
    );
    if (withTitle) {
      destination = withTitle[1];
    }
    const inBrackets = /^<([^<>]+)>$/.exec(destination);
    return inBrackets ? inBrackets[1].trim() : destination;
  };

  const links = [];

  // First pass: collect reference definitions (`[ref]: url "optional title"`).
  // Keys are stored lower-cased; a later definition overrides an earlier one.
  const referenceLinks = new Map();
  const definitionRegex = /^\s*\[([^\]]+)\]:\s*(.+)$/gm;
  let definition;
  while ((definition = definitionRegex.exec(content)) !== null) {
    referenceLinks.set(
      definition[1].toLowerCase(),
      cleanDestination(definition[2])
    );
  }

  // Second pass: scan every line for the four supported link forms
  content.split('\n').forEach((line, lineIndex) => {
    const lineNumber = lineIndex + 1;
    const context = line.trim();

    // Single place that builds a link record so all types share one shape
    const addLink = (type, url, text, column, extra = {}) => {
      links.push({
        url,
        text,
        type,
        line: lineNumber,
        column,
        context,
        ...extra,
        hash: generateLinkHash(url, filePath, lineNumber),
      });
    };

    let match;

    // Standard markdown links: [text](url) or [text](url "title")
    const standardLinkRegex = /\[([^\]]*)\]\(([^)]+)\)/g;
    while ((match = standardLinkRegex.exec(line)) !== null) {
      addLink('markdown', cleanDestination(match[2]), match[1], match.index);
    }

    // Reference-style links: [text][ref]; an empty ref falls back to the text
    const refLinkRegex = /\[([^\]]*)\]\[([^\]]*)\]/g;
    while ((match = refLinkRegex.exec(line)) !== null) {
      const linkText = match[1];
      const refKey = (match[2] || linkText).toLowerCase();
      const url = referenceLinks.get(refKey);

      // References without a matching definition are skipped
      if (url) {
        addLink('markdown-reference', url, linkText, match.index, {
          reference: refKey,
        });
      }
    }

    // Autolinks: <https://example.com>
    const autolinkRegex = /<(https?:\/\/[^>]+)>/g;
    while ((match = autolinkRegex.exec(line)) !== null) {
      addLink('autolink', match[1], match[1], match.index);
    }

    // Bare URLs (basic detection, avoid false positives)
    // - (?<start>^|[\s\n]): start of the line or a whitespace character
    // - (?<url>https?:\/\/[^\s\)\]\}]+): protocol plus everything up to
    //   whitespace or a closing ), ] or }
    const bareUrlRegex = /(?<start>^|[\s\n])(?<url>https?:\/\/[^\s\)\]\}]+)/g;
    while ((match = bareUrlRegex.exec(line)) !== null) {
      const rawUrl = match.groups.url;
      const columnStart = match.index + match[0].length - rawUrl.length;

      // Sentence punctuation directly after a URL belongs to the prose, not
      // to the URL. Keep the raw match if stripping would leave only the
      // protocol (e.g. a literal "https://...").
      const stripped = rawUrl.replace(/[?!.,:*_~]+$/, '');
      const url = /^https?:\/\/$/.test(stripped) ? rawUrl : stripped;

      // Skip if this URL is already captured in a proper markdown link
      const alreadyCaptured = links.some(
        (link) =>
          link.line === lineNumber &&
          Math.abs(link.column - columnStart) < 10 &&
          link.url === url
      );

      if (!alreadyCaptured) {
        addLink('bare-url', url, url, columnStart);
      }
    }
  });

  return links;
}
|
|
|
|
/**
 * Extract links from HTML content.
 *
 * Finds `<a ... href="...">` opening tags line by line. The link text is the
 * tag-stripped content between the opening tag and the next `</a>` on the
 * same line; when there is no closing tag on that line, or the content is
 * empty (for example an image-only link), the URL is used as the text.
 *
 * @param {string} content - File content
 * @param {string} filePath - Path to the file (passed to generateLinkHash)
 * @returns {Array} Array of link objects with metadata: `url`, `text`, `type`
 *   ('html'), 1-based `line`, 0-based `column`, trimmed `context` and `hash`
 */
function extractHtmlLinks(content, filePath) {
  const links = [];

  content.split('\n').forEach((line, lineIndex) => {
    const lineNumber = lineIndex + 1;

    // The backreference makes the closing quote match the opening one, so a
    // value such as href="javascript:go('x')" is captured in full.
    const htmlLinkRegex =
      /<a\s+[^>]*href\s*=\s*(["'])((?:(?!\1).)+)\1[^>]*>/gi;

    let match;
    while ((match = htmlLinkRegex.exec(line)) !== null) {
      const openingTag = match[0];
      const url = match[2];

      // The regex only covers the opening tag, so the link text has to be
      // read from what follows it. Stop at the first </a> and refuse to run
      // past another <a ...> so one anchor never borrows the next one's text.
      const afterTag = line.slice(match.index + openingTag.length);
      const inner = /^((?:(?!<a[\s>]).)*?)<\/a\s*>/i.exec(afterTag);
      const innerText = inner ? inner[1].replace(/<[^>]*>/g, '').trim() : '';

      links.push({
        url,
        text: innerText || url,
        type: 'html',
        line: lineNumber,
        column: match.index,
        context: line.trim(),
        hash: generateLinkHash(url, filePath, lineNumber),
      });
    }
  });

  return links;
}
|
|
|
|
/**
 * Build a short, deterministic identifier for one link occurrence.
 *
 * The identifier is the first 16 hex characters of the SHA-256 digest of
 * `<filePath>:<line>:<trimmed url>`, so the same URL on the same line of the
 * same file always yields the same value.
 *
 * @param {string} url - The URL (surrounding whitespace is ignored)
 * @param {string} filePath - File path
 * @param {number} line - Line number
 * @returns {string} 16-character lowercase hex string
 */
function generateLinkHash(url, filePath, line) {
  const normalizedUrl = url.trim();
  const hasher = crypto.createHash('sha256');
  hasher.update(`${filePath}:${line}:${normalizedUrl}`);

  const fullDigest = hasher.digest('hex');
  return fullDigest.slice(0, 16);
}
|
|
|
|
/**
 * Fingerprint file content.
 *
 * Returns the first 16 hex characters of the SHA-256 digest of the content.
 *
 * @param {string} content - File content
 * @returns {string} 16-character lowercase hex string
 */
function generateFileHash(content) {
  const hasher = crypto.createHash('sha256');
  hasher.update(content);

  const fullDigest = hasher.digest('hex');
  return fullDigest.slice(0, 16);
}
|
|
|
|
/**
 * Categorize link types for validation.
 *
 * Categories, checked in this order:
 * - `external`: starts with `http://` or `https://` (case-insensitive);
 *   also carries `protocol` ('http' or 'https')
 * - `internal-absolute`: starts with `/`
 * - `fragment`: starts with `#`
 * - `special-protocol`: starts with any other URI scheme, with or without
 *   `//` (mailto:, tel:, ftp://, ...); not validated
 * - `internal-relative`: everything else (`./x`, `../x`, `x/y.md`, ...)
 *
 * @param {string} url - The URL to categorize (surrounding whitespace ignored)
 * @returns {Object} Link category information: `category`, `needsValidation`,
 *   and `protocol` for external links
 */
function categorizeLinkType(url) {
  const trimmedUrl = url.trim();

  // External links
  const webScheme = /^(https?):\/\//i.exec(trimmedUrl);
  if (webScheme) {
    return {
      category: 'external',
      protocol: webScheme[1].toLowerCase(),
      needsValidation: true,
    };
  }

  // Internal absolute links
  if (trimmedUrl.startsWith('/')) {
    return {
      category: 'internal-absolute',
      needsValidation: true,
    };
  }

  // Fragment/anchor links
  if (trimmedUrl.startsWith('#')) {
    return {
      category: 'fragment',
      needsValidation: true, // May need validation for internal page anchors
    };
  }

  // Special protocols (mailto, tel, etc.). Only a leading scheme counts, so a
  // relative link with "://" somewhere in its query string stays relative.
  // At least two characters are required so a drive letter ("C:") is not
  // mistaken for a scheme.
  if (/^[a-z][a-z0-9+.-]+:/i.test(trimmedUrl)) {
    return {
      category: 'special-protocol',
      needsValidation: false,
    };
  }

  // Relative links
  return {
    category: 'internal-relative',
    needsValidation: true,
  };
}
|
|
|
|
/**
 * Extract all links from a file.
 *
 * Supports `.md` (front matter is parsed with gray-matter and excluded from
 * link extraction) and `.html`. Line numbers reported for markdown links refer
 * to the file as stored on disk: the stripped front matter is replaced by the
 * same number of blank lines before extraction.
 *
 * Errors are not thrown to the caller: a missing or unreadable file is
 * reported with console.error and an unsupported extension with console.warn.
 *
 * @param {string} filePath - Path to the file
 * @returns {Object|null} File analysis with links and metadata (`filePath`,
 *   `fileHash`, `extension`, `frontmatter`, `links`, `stats`, `extractedAt`),
 *   or null when the file cannot be processed
 */
function extractLinksFromFile(filePath) {
  try {
    if (!fs.existsSync(filePath)) {
      throw new Error(`File not found: ${filePath}`);
    }

    const content = fs.readFileSync(filePath, 'utf8');
    const fileHash = generateFileHash(content);
    const extension = path.extname(filePath).toLowerCase();

    let links = [];
    let frontmatter = {};

    if (extension === '.md') {
      let bodyContent = content;

      // Parse frontmatter for .md files; on failure fall back to the raw file
      try {
        const parsed = matter(content);
        frontmatter = parsed.data || {};
        bodyContent = parsed.content;

        // Keep link line numbers aligned with the file on disk. The body is
        // expected to be the tail of the original text; when it is, pad it
        // with one blank line per removed line. If that assumption does not
        // hold the body is used as returned (lines relative to the body).
        if (typeof bodyContent === 'string' && content.endsWith(bodyContent)) {
          const removed = content.slice(0, content.length - bodyContent.length);
          const removedLineCount = removed.split('\n').length - 1;
          bodyContent = '\n'.repeat(removedLineCount) + bodyContent;
        }
      } catch (err) {
        console.warn(
          `Warning: Could not parse frontmatter in ${filePath}: ${err.message}`
        );
      }

      // Extract links from markdown content
      links = extractMarkdownLinks(bodyContent, filePath);
    } else if (extension === '.html') {
      // Extract links from HTML content
      links = extractHtmlLinks(content, filePath);
    } else {
      console.warn(`Warning: Unsupported file type for ${filePath}`);
      return null;
    }

    // Categorize and enhance links
    const enhancedLinks = links.map((link) => ({
      ...link,
      ...categorizeLinkType(link.url),
      filePath,
    }));

    // Calculate statistics
    const countWhere = (predicate) => enhancedLinks.filter(predicate).length;
    const stats = {
      totalLinks: enhancedLinks.length,
      externalLinks: countWhere((l) => l.category === 'external'),
      internalLinks: countWhere((l) => l.category.startsWith('internal')),
      fragmentLinks: countWhere((l) => l.category === 'fragment'),
      linksNeedingValidation: countWhere((l) => l.needsValidation),
    };

    return {
      filePath,
      fileHash,
      extension,
      frontmatter,
      links: enhancedLinks,
      stats,
      extractedAt: new Date().toISOString(),
    };
  } catch (error) {
    console.error(`Error extracting links from ${filePath}: ${error.message}`);
    return null;
  }
}
|
|
|
|
/**
 * Main function for CLI usage.
 *
 * Parses the command line, extracts links from every file argument and prints
 * the result as JSON (`--json`), as per-file and total statistics
 * (`--stats-only`) or as a human-readable listing (default). `--filter TYPE`
 * keeps only links of one category and recomputes the statistics.
 *
 * Exits with status 1 when no arguments are given or when `--filter` has no
 * value, and with status 0 after printing `--help`. Files that cannot be
 * processed are skipped.
 */
function main() {
  const args = process.argv.slice(2);

  if (args.length === 0) {
    console.error('Usage: node link-extractor.cjs <file1> [file2] [...]');
    console.error(' node link-extractor.cjs --help');
    process.exit(1);
  }

  // Honour --help wherever it appears, not only as the first argument
  if (args.includes('--help')) {
    console.log(`
Link Extractor for Documentation Files

Usage:
node link-extractor.cjs <file1> [file2] [...] Extract links from files
node link-extractor.cjs --help Show this help

Options:
--json Output results as JSON
--stats-only Show only statistics
--filter TYPE Filter links by category (external, internal-absolute, internal-relative, fragment)

Examples:
node link-extractor.cjs content/influxdb3/core/install.md
node link-extractor.cjs --json content/**/*.md
node link-extractor.cjs --stats-only --filter external content/influxdb3/**/*.md
`);
    process.exit(0);
  }

  const jsonOutput = args.includes('--json');
  const statsOnly = args.includes('--stats-only');

  // --filter consumes the next argument. Reject a missing value or another
  // flag in that position instead of silently filtering by it.
  const filterIndex = args.indexOf('--filter');
  let filterType = null;
  if (filterIndex !== -1) {
    const candidate = args[filterIndex + 1];
    if (candidate === undefined || candidate.startsWith('--')) {
      console.error('Error: --filter requires a link category (see --help)');
      process.exit(1);
    }
    filterType = candidate;
  }

  // Everything that is neither a flag nor the --filter value is a file path.
  // The value is excluded by position so a file with the same name survives.
  const files = args.filter(
    (arg, index) =>
      !arg.startsWith('--') &&
      !(filterIndex !== -1 && index === filterIndex + 1)
  );

  // Same statistics shape as extractLinksFromFile produces
  const summarize = (links) => ({
    totalLinks: links.length,
    externalLinks: links.filter((l) => l.category === 'external').length,
    internalLinks: links.filter((l) => l.category.startsWith('internal'))
      .length,
    fragmentLinks: links.filter((l) => l.category === 'fragment').length,
    linksNeedingValidation: links.filter((l) => l.needsValidation).length,
  });

  const results = [];
  for (const filePath of files) {
    const result = extractLinksFromFile(filePath);
    if (!result) {
      continue;
    }

    // Apply filter if specified and recalculate stats after filtering
    if (filterType) {
      result.links = result.links.filter(
        (link) => link.category === filterType
      );
      result.stats = summarize(result.links);
    }

    results.push(result);
  }

  if (jsonOutput) {
    console.log(JSON.stringify(results, null, 2));
    return;
  }

  if (statsOnly) {
    console.log('\nLink Extraction Statistics:');
    console.log('==========================');

    let totalFiles = 0;
    let totalLinks = 0;
    let totalExternal = 0;
    let totalInternal = 0;
    let totalFragment = 0;
    let totalNeedingValidation = 0;

    for (const result of results) {
      totalFiles++;
      totalLinks += result.stats.totalLinks;
      totalExternal += result.stats.externalLinks;
      totalInternal += result.stats.internalLinks;
      totalFragment += result.stats.fragmentLinks;
      totalNeedingValidation += result.stats.linksNeedingValidation;

      console.log(
        `${result.filePath}: ${result.stats.totalLinks} links (${result.stats.linksNeedingValidation} need validation)`
      );
    }

    console.log('\nSummary:');
    console.log(` Total files: ${totalFiles}`);
    console.log(` Total links: ${totalLinks}`);
    console.log(` External links: ${totalExternal}`);
    console.log(` Internal links: ${totalInternal}`);
    console.log(` Fragment links: ${totalFragment}`);
    console.log(` Links needing validation: ${totalNeedingValidation}`);
    return;
  }

  for (const result of results) {
    console.log(`\nFile: ${result.filePath}`);
    console.log(`Hash: ${result.fileHash}`);
    console.log(`Links found: ${result.stats.totalLinks}`);
    console.log(
      `Links needing validation: ${result.stats.linksNeedingValidation}`
    );

    if (result.links.length > 0) {
      console.log('\nLinks:');
      result.links.forEach((link, index) => {
        console.log(` ${index + 1}. [${link.category}] ${link.url}`);
        console.log(` Line ${link.line}, Column ${link.column}`);
        console.log(` Text: "${link.text}"`);
        console.log(` Hash: ${link.hash}`);
        if (link.reference) {
          console.log(` Reference: ${link.reference}`);
        }
        console.log('');
      });
    }
  }
}
|
|
|
|
// Export functions for use as a module
|
|
module.exports = {
|
|
extractLinksFromFile,
|
|
extractMarkdownLinks,
|
|
extractHtmlLinks,
|
|
generateFileHash,
|
|
generateLinkHash,
|
|
categorizeLinkType,
|
|
};
|
|
|
|
// Run main function if called directly
|
|
if (require.main === module) {
|
|
main();
|
|
} |