docs-v2/scripts/lib/markdown-converter.cjs

/**
 * Markdown Converter Library
 *
 * Core conversion logic for transforming HTML to Markdown.
 * This library is used by both:
 * - docs-v2 build scripts (html-to-markdown.js)
 * - docs-tooling Lambda@Edge function
 *
 * Exports reusable functions for HTML→Markdown conversion
 */

const TurndownService = require('turndown');
const { JSDOM } = require('jsdom');
const path = require('path');
const fs = require('fs');
const yaml = require('js-yaml');

// Try to load Rust converter (10x faster), fall back to JavaScript.
// `rustConverter` holds the loaded module (or null) and `USE_RUST` records
// whether the load succeeded; the conversion entry points check both.
let rustConverter = null;
let USE_RUST = false;
try {
  rustConverter = require('../rust-markdown-converter');
  USE_RUST = true;
  console.log('✓ Rust markdown converter loaded');
} catch (err) {
  // Reset both flags together so they can never disagree.
  rustConverter = null;
  USE_RUST = false;
  console.log('ℹ Using JavaScript converter (Rust not available)');
  // A missing module is the expected fallback case. Any other failure (for
  // example a native binary that cannot be loaded) is surfaced so the slower
  // JavaScript path is not taken silently.
  if (err && err.code !== 'MODULE_NOT_FOUND') {
    console.warn(`  Rust converter failed to load: ${err.message}`);
  }
}

// Built-in product mappings (fallback since ESM module can't be required from CommonJS).
// Each pair is [URL fragment, product key]. Entry order is significant:
// getProductFromPath() iterates the entries in insertion order.
const URL_PATTERN_MAP = Object.fromEntries([
  ['/influxdb3/core/', 'influxdb3_core'],
  ['/influxdb3/enterprise/', 'influxdb3_enterprise'],
  ['/influxdb3/cloud-dedicated/', 'influxdb3_cloud_dedicated'],
  ['/influxdb3/cloud-serverless/', 'influxdb3_cloud_serverless'],
  ['/influxdb3/clustered/', 'influxdb3_clustered'],
  ['/influxdb3/explorer/', 'influxdb3_explorer'],
  ['/influxdb/cloud/', 'influxdb_cloud'],
  // No trailing slash on the next two fragments.
  // NOTE(review): presumably so versioned paths such as /influxdb/v2.7/ still match - confirm.
  ['/influxdb/v2', 'influxdb_v2'],
  ['/influxdb/v1', 'influxdb_v1'],
  ['/enterprise_influxdb/', 'enterprise_influxdb'],
  ['/telegraf/', 'telegraf'],
  ['/chronograf/', 'chronograf'],
  ['/kapacitor/', 'kapacitor'],
  ['/flux/', 'flux'],
]);

// Product key -> { name, version } used when writing frontmatter.
// Rows are [product key, display name, version label].
const PRODUCT_NAME_MAP = Object.fromEntries(
  [
    ['influxdb3_core', 'InfluxDB 3 Core', 'core'],
    ['influxdb3_enterprise', 'InfluxDB 3 Enterprise', 'enterprise'],
    ['influxdb3_cloud_dedicated', 'InfluxDB Cloud Dedicated', 'cloud-dedicated'],
    ['influxdb3_cloud_serverless', 'InfluxDB Cloud Serverless', 'cloud-serverless'],
    ['influxdb3_clustered', 'InfluxDB Clustered', 'clustered'],
    ['influxdb3_explorer', 'InfluxDB 3 Explorer', 'explorer'],
    ['influxdb_cloud', 'InfluxDB Cloud (TSM)', 'cloud'],
    ['influxdb_v2', 'InfluxDB OSS v2', 'v2'],
    ['influxdb_v1', 'InfluxDB OSS v1', 'v1'],
    ['enterprise_influxdb', 'InfluxDB Enterprise v1', 'v1'],
    ['telegraf', 'Telegraf', 'v1'],
    ['chronograf', 'Chronograf', 'v1'],
    ['kapacitor', 'Kapacitor', 'v1'],
    ['flux', 'Flux', 'v0'],
  ].map(([key, name, version]) => [key, { name, version }])
);

// Note: the ESM product-mappings module can't be required from CommonJS,
// so the built-in mappings above are used instead. Nothing in the visible
// code assigns this binding, so the productMappings branch in
// ensureProductDataInitialized() is currently never taken.
let productMappings = null;

// Debug mode - set to true to enable verbose logging
const DEBUG = false;

// Initialization flag for product data (despite the name, not a data cache):
// ensureProductDataInitialized() sets it to `true` once it has run.
let productsData = null;

/**
 * Resolve the base URL used to build absolute page links.
 *
 * Precedence (first match wins):
 * 1. `BASE_URL`, when set to a non-empty value
 * 2. `http://localhost:1313`, when `HUGO_ENV` or `NODE_ENV` is `development`
 * 3. `STAGING_URL` (or the default staging host), when `HUGO_ENV` or
 *    `DEPLOY_ENV` is `staging`
 * 4. The production docs host
 *
 * @returns {string} Base URL for the current environment
 */
function detectBaseUrl() {
  const { BASE_URL, HUGO_ENV, NODE_ENV, DEPLOY_ENV, STAGING_URL } = process.env;

  // An explicit override always wins.
  if (BASE_URL) {
    return BASE_URL;
  }

  const isDevelopment = [HUGO_ENV, NODE_ENV].includes('development');
  if (isDevelopment) {
    return 'http://localhost:1313';
  }

  const isStaging = [HUGO_ENV, DEPLOY_ENV].includes('staging');
  if (isStaging) {
    return STAGING_URL || 'https://test2.docs.influxdata.com';
  }

  return 'https://docs.influxdata.com';
}

/**
 * Run product-data initialization at most once.
 *
 * When a `productMappings` module exposing `initializeProductData()` is
 * present, that method is awaited; a failure is logged as a warning and not
 * rethrown. In every case the `productsData` flag ends up `true`, so later
 * calls return immediately and a failed initialization is never retried.
 *
 * @returns {Promise<void>}
 */
async function ensureProductDataInitialized() {
  if (productsData) {
    return;
  }

  const canInitialize = Boolean(
    productMappings && productMappings.initializeProductData
  );

  if (canInitialize) {
    try {
      await productMappings.initializeProductData();
    } catch (err) {
      console.warn('Failed to initialize product-mappings:', err.message);
    }
  }

  // Mark as initialized on success, on failure, and in fallback mode alike.
  productsData = true;
}

/**
 * Get product info from URL path
 *
 * Looks for every fragment in URL_PATTERN_MAP inside `urlPath` and returns the
 * PRODUCT_NAME_MAP entry for the fragment that occurs EARLIEST in the path.
 * The leading segment identifies the product, so a fragment that merely
 * appears deeper in the path must not win: for example
 * `/flux/v0/stdlib/influxdata/influxdb/v1/` is a Flux page even though it
 * also contains `/influxdb/v1`. Fragments found at the same position keep
 * map order.
 *
 * @param {string} urlPath - URL path (or longer string containing it)
 * @returns {Object|null} The `{ name, version }` entry from PRODUCT_NAME_MAP,
 *   or null when `urlPath` is not a non-empty string or nothing matches
 */
function getProductFromPath(urlPath) {
  if (typeof urlPath !== 'string' || urlPath === '') {
    return null;
  }

  let bestIndex = Infinity;
  let bestProduct = null;

  for (const [pattern, productKey] of Object.entries(URL_PATTERN_MAP)) {
    const index = urlPath.indexOf(pattern);
    // Skip misses, and matches that are not strictly earlier than the best so far.
    if (index === -1 || index >= bestIndex) {
      continue;
    }

    // A pattern whose key has no product entry is ignored.
    const productInfo = PRODUCT_NAME_MAP[productKey];
    if (productInfo) {
      bestIndex = index;
      bestProduct = productInfo;
    }
  }

  return bestProduct;
}

/**
 * Detect which product a URL path belongs to.
 *
 * Exported wrapper that delegates to getProductFromPath().
 *
 * @param {string} urlPath - URL path of the page
 * @returns {Object|null} The value returned by getProductFromPath(): a
 *   `{ name, version }` product entry, or null when no product matches
 */
function detectProduct(urlPath) {
  const product = getProductFromPath(urlPath);
  return product;
}

/**
 * Configure Turndown for InfluxData documentation
 *
 * Creates a TurndownService (ATX headings, fenced code blocks, `-` bullets)
 * and registers custom rules for fenced code blocks, list items, tables and
 * callout blockquotes, plus a removal filter for non-content elements.
 *
 * @returns {TurndownService} Configured Turndown instance
 */
function createTurndownService() {
  const turndownService = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    fence: '```',
    emDelimiter: '*',
    strongDelimiter: '**',
    // Note: linkStyle is intentionally left unset. Turndown documents the
    // valid values as 'inlined' (the default) and 'referenced'; the value
    // 'inline' matches neither link rule, which is why it broke link
    // conversion (observed with Turndown 7.2.2).
    bulletListMarker: '-',
  });

  // Preserve code block language identifiers
  turndownService.addRule('fencedCodeBlock', {
    filter: function (node, options) {
      return (
        options.codeBlockStyle === 'fenced' &&
        node.nodeName === 'PRE' &&
        node.firstChild &&
        node.firstChild.nodeName === 'CODE'
      );
    },
    replacement: function (content, node, options) {
      const code = node.firstChild;
      // Use the token after "language-" even when other classes are present
      // (e.g. "hljs language-js"). Without such a class, fall back to the
      // first class name, which matches the previous single-class behavior.
      const classNames = (code.className || '').split(/\s+/).filter(Boolean);
      const languageClass = classNames.find((name) =>
        name.startsWith('language-')
      );
      const language = languageClass
        ? languageClass.slice('language-'.length)
        : classNames[0] || '';
      const fence = options.fence;
      return `\n\n${fence}${language}\n${code.textContent}\n${fence}\n\n`;
    },
  });

  // Improve list item handling - ensure proper spacing
  turndownService.addRule('listItems', {
    filter: 'li',
    replacement: function (content, node, options) {
      content = content
        .replace(/^\n+/, '') // Remove leading newlines
        .replace(/\n+$/, '\n') // Single trailing newline
        .replace(/\n/gm, '\n    '); // Indent nested content

      let prefix = options.bulletListMarker + '   '; // Dash + 3 spaces for unordered lists
      const parent = node.parentNode;

      if (parent.nodeName === 'OL') {
        // Honor <ol start="N">; otherwise number from 1 by position.
        const start = parent.getAttribute('start');
        const index = Array.prototype.indexOf.call(parent.children, node);
        prefix = (start ? Number(start) + index : index + 1) + '. ';
      }

      return (
        prefix +
        content +
        (node.nextSibling && !/\n$/.test(content) ? '\n' : '')
      );
    },
  });

  // Convert HTML tables to Markdown tables
  turndownService.addRule('tables', {
    filter: 'table',
    replacement: function (content, node) {
      // Flatten a cell to a single line and escape pipes so cell text (for
      // example Flux's `|>` operator) cannot split the row into extra columns.
      const cellText = (cell) =>
        cell.textContent.trim().replace(/\n/g, ' ').replace(/\|/g, '\\|');

      // Get all rows from tbody and thead
      const theadRows = Array.from(node.querySelectorAll('thead tr'));
      const tbodyRows = Array.from(node.querySelectorAll('tbody tr'));

      // If no thead/tbody, fall back to all tr elements
      const allRows =
        theadRows.length || tbodyRows.length
          ? [...theadRows, ...tbodyRows]
          : Array.from(node.querySelectorAll('tr'));

      if (allRows.length === 0) return '';

      // Extract headers from first row
      const headerRow = allRows[0];
      const headers = Array.from(headerRow.querySelectorAll('th, td')).map(
        cellText
      );

      // Build separator row
      const separator = headers.map(() => '---').join(' | ');

      // Extract data rows (skip first row which is the header)
      const dataRows = allRows
        .slice(1)
        .map((row) => {
          const cells = Array.from(row.querySelectorAll('td, th')).map(cellText);
          return '| ' + cells.join(' | ') + ' |';
        })
        .join('\n');

      return (
        '\n| ' +
        headers.join(' | ') +
        ' |\n| ' +
        separator +
        ' |\n' +
        dataRows +
        '\n\n'
      );
    },
  });

  // Handle GitHub-style callouts (notes, warnings, etc.)
  const calloutLabels = {
    note: 'Note',
    warning: 'Warning',
    caution: 'Caution',
    important: 'Important',
    tip: 'Tip',
  };
  const calloutTypes = ['note', 'warning', 'important', 'tip', 'caution'];

  turndownService.addRule('githubCallouts', {
    filter: function (node) {
      return (
        node.nodeName === 'BLOCKQUOTE' &&
        node.classList &&
        calloutTypes.some((type) => node.classList.contains(type))
      );
    },
    replacement: function (content, node) {
      // First class on the element that names a known callout type.
      const type = Array.from(node.classList).find((c) =>
        calloutTypes.includes(c)
      );
      const label = calloutLabels[type] || 'Note';

      return `\n> [!${label}]\n> ${content.trim().replace(/\n/g, '\n> ')}\n\n`;
    },
  });

  // Remove navigation, footer, and other non-content elements.
  // Turndown string/array filters only compare tag names, so the class and id
  // based entries need a function filter to have any effect.
  const removedTags = new Set([
    'nav',
    'header',
    'footer',
    'script',
    'style',
    'noscript',
    'iframe',
  ]);
  const removedClasses = [
    'format-selector', // Format selector buttons (Copy page, etc.)
    'page-feedback', // Page feedback form
  ];
  const removedIds = new Set([
    'page-feedback', // Feedback modal
  ]);

  turndownService.remove(function (node) {
    if (removedTags.has(node.nodeName.toLowerCase())) {
      return true;
    }
    if (node.id && removedIds.has(node.id)) {
      return true;
    }
    return (
      Boolean(node.classList) &&
      removedClasses.some((name) => node.classList.contains(name))
    );
  });

  return turndownService;
}

/**
 * Pull the main article and its metadata out of a rendered HTML page.
 *
 * Parses the page with JSDOM, locates `article.article--content`, strips
 * feedback widgets, format selectors and horizontal rules from it, and reads
 * the title (first `h1`, then `<title>`, then "Untitled") and description
 * (`meta[name="description"]`, then `og:description`, then ""). The JSDOM
 * window is always closed before returning.
 *
 * @param {string} htmlContent - Raw HTML content
 * @param {string} contextInfo - Context info for log messages (file path or URL)
 * @returns {Object|null} `{ title, description, content }` where `content` is
 *   the article's inner HTML, or null when the page has no article element
 */
function extractArticleContent(htmlContent, contextInfo = '') {
  const dom = new JSDOM(htmlContent);
  const { document } = dom.window;

  // Small readers for the metadata lookups below.
  const textOf = (selector) =>
    document.querySelector(selector)?.textContent?.trim();
  const contentAttrOf = (selector) =>
    document.querySelector(selector)?.getAttribute('content');

  try {
    const article = document.querySelector('article.article--content');

    if (DEBUG) {
      console.log(`[DEBUG] Looking for article in ${contextInfo}`);
      console.log(`[DEBUG] HTML length: ${htmlContent.length}`);
      console.log(`[DEBUG] Article found: ${!!article}`);
    }

    if (!article) {
      if (DEBUG) {
        // Probe looser selectors to show which half of the selector failed.
        const anyArticle = document.querySelector('article');
        const articleContent = document.querySelector('.article--content');
        console.log(`[DEBUG] Any article element: ${!!anyArticle}`);
        console.log(`[DEBUG] .article--content element: ${!!articleContent}`);
      }

      console.warn(
        `  ⚠️  No article content found in ${contextInfo}. This is typically not a problem and represents an aliased path.`
      );
      return null;
    }

    // Strip page chrome from the article before it is converted.
    const unwantedSelectors = [
      '.format-selector', // format selector buttons
      '.page-feedback', // page feedback form
      '#page-feedback', // feedback modal
      '.feedback-widget', // any feedback widgets
      '.helpful', // "Was this page helpful?" section
      '.feedback.block', // footer feedback/support section
      'hr', // horizontal rules (often used as separators before footer)
    ];
    for (const selector of unwantedSelectors) {
      for (const element of article.querySelectorAll(selector)) {
        element.remove();
      }
    }

    const title = textOf('h1') || textOf('title') || 'Untitled';
    const description =
      contentAttrOf('meta[name="description"]') ||
      contentAttrOf('meta[property="og:description"]') ||
      '';

    // Read the HTML now: the window is closed in the finally block.
    return {
      title,
      description,
      content: article.innerHTML,
    };
  } finally {
    // Clean up JSDOM to prevent memory leaks
    dom.window.close();
  }
}

/**
 * Build the YAML frontmatter block for a single page.
 *
 * Fields, in order: `title`, `description` (whitespace collapsed, at most 500
 * characters), `url`, `estimated_tokens` (content length / 4, rounded up),
 * then `product` and `version` when detectProduct() recognizes the path.
 *
 * @param {Object} metadata - Page metadata; reads `title`, `description` and `content`
 * @param {string} urlPath - URL path for the page
 * @param {string} baseUrl - Base URL prepended to `urlPath` (one trailing slash is dropped)
 * @returns {string} YAML frontmatter wrapped in `---` delimiters
 */
function generateFrontmatter(metadata, urlPath, baseUrl = '') {
  const product = detectProduct(urlPath);

  // Collapse every whitespace run (newlines included) and cap the length.
  const description = (metadata.description || '')
    .replace(/\s+/g, ' ')
    .trim()
    .substring(0, 500);

  // Rough estimate: 4 characters per token.
  const estimatedTokens = Math.ceil((metadata.content?.length || 0) / 4);

  const fullUrl = baseUrl ? `${baseUrl.replace(/\/$/, '')}${urlPath}` : urlPath;

  // Key order here is the order the fields appear in the YAML output.
  const frontmatterObj = {
    title: metadata.title,
    description,
    url: fullUrl,
    estimated_tokens: estimatedTokens,
    ...(product && { product: product.name }),
    ...(product && product.version && { version: product.version }),
  };

  // js-yaml takes care of quoting special characters.
  const yamlBody = yaml
    .dump(frontmatterObj, {
      lineWidth: -1, // Disable line wrapping
      noRefs: true, // Disable anchors/aliases
    })
    .trim();

  return `---\n${yamlBody}\n---`;
}

/**
 * Build the YAML frontmatter block for an aggregated section document.
 *
 * Fields, in order: `title`, `description` (whitespace collapsed, at most 500
 * characters), `url`, `type: section`, `pages` (child count),
 * `estimated_tokens` (section plus child content length / 4, rounded up),
 * `product` and `version` when detectProduct() recognizes the path, and
 * `child_pages` (url + title per child) when there is at least one child.
 *
 * @param {Object} metadata - Section metadata; reads `title`, `description` and `content`
 * @param {string} urlPath - URL path for the section
 * @param {Array} childPages - Child page objects; reads `url`, `title` and `content`
 * @param {string} baseUrl - Base URL prepended to every URL (one trailing slash is dropped)
 * @returns {string} YAML frontmatter wrapped in `---` delimiters
 */
function generateSectionFrontmatter(metadata, urlPath, childPages, baseUrl = '') {
  const product = detectProduct(urlPath);

  // Collapse every whitespace run (newlines included) and cap the length.
  const description = (metadata.description || '')
    .replace(/\s+/g, ' ')
    .trim()
    .substring(0, 500);

  // Rough estimate: 4 characters per token over the section and its children.
  let totalLength = metadata.content?.length || 0;
  for (const child of childPages) {
    totalLength += child.content?.length || 0;
  }
  const estimatedTokens = Math.ceil(totalLength / 4);

  const normalizedBaseUrl = baseUrl ? baseUrl.replace(/\/$/, '') : '';
  const fullUrl = baseUrl ? `${normalizedBaseUrl}${urlPath}` : urlPath;

  // Key order here is the order the fields appear in the YAML output.
  const frontmatterObj = {
    title: metadata.title,
    description,
    url: fullUrl,
    type: 'section',
    pages: childPages.length,
    estimated_tokens: estimatedTokens,
    ...(product && { product: product.name }),
    ...(product && product.version && { version: product.version }),
  };

  // List child pages with full URLs
  if (childPages.length > 0) {
    const toAbsolute = (url) =>
      normalizedBaseUrl ? `${normalizedBaseUrl}${url}` : url;
    frontmatterObj.child_pages = childPages.map(({ url, title }) => ({
      url: toAbsolute(url),
      title,
    }));
  }

  // js-yaml takes care of quoting special characters.
  const yamlBody = yaml
    .dump(frontmatterObj, {
      lineWidth: -1, // Disable line wrapping
      noRefs: true, // Disable anchors/aliases
    })
    .trim();

  return `---\n${yamlBody}\n---`;
}

/**
 * Convert HTML content to Markdown (single page)
 *
 * Tries the Rust converter first when it is loaded. If that call throws or
 * its promise rejects, a warning is logged and the JavaScript implementation
 * (JSDOM extraction + Turndown) is used instead.
 *
 * @param {string} htmlContent - Raw HTML content
 * @param {string} urlPath - URL path for the page (for frontmatter)
 * @returns {Promise<string|null>} Markdown content with frontmatter, or null
 *   when the JavaScript path finds no article content in the page
 */
async function convertToMarkdown(htmlContent, urlPath) {
  await ensureProductDataInitialized();

  // Detect base URL for the environment
  const baseUrl = detectBaseUrl();
  if (DEBUG) {
    console.log(`[DEBUG] Base URL detected: ${baseUrl} (NODE_ENV=${process.env.NODE_ENV}, HUGO_ENV=${process.env.HUGO_ENV}, BASE_URL=${process.env.BASE_URL})`);
  }

  // Use Rust converter if available (10× faster)
  if (USE_RUST && rustConverter) {
    try {
      // `await` inside the try block: without it a rejected promise from the
      // native binding would escape this catch and skip the fallback.
      return await rustConverter.convertToMarkdown(htmlContent, urlPath, baseUrl);
    } catch (err) {
      console.warn(`Rust conversion failed for ${urlPath}, falling back to JavaScript:`, err.message);
      // Fall through to JavaScript implementation
    }
  }

  // JavaScript fallback implementation
  const metadata = extractArticleContent(htmlContent, urlPath);
  if (!metadata) {
    return null;
  }

  // Convert HTML to markdown
  const turndownService = createTurndownService();
  let markdown = turndownService.turndown(metadata.content);

  // Clean up excessive newlines and separator artifacts
  markdown = markdown
    .replace(/\n{3,}/g, '\n\n')
    .replace(/\* \* \*\s*\n\s*\* \* \*/g, '')
    .replace(/\* \* \*\s*$/g, '')
    .trim();

  // Generate frontmatter with full URL. Pass the converted markdown as
  // `content` so the token estimate describes the emitted document rather
  // than the HTML source (this matches convertSectionToMarkdown).
  const frontmatter = generateFrontmatter(
    { ...metadata, content: markdown },
    urlPath,
    baseUrl
  );

  return `${frontmatter}\n\n${markdown}\n`;
}

/**
 * Convert section HTML with child pages to aggregated Markdown
 *
 * Tries the Rust converter first when it is loaded. If that call throws or
 * its promise rejects, a warning is logged and the JavaScript implementation
 * is used instead. The JavaScript path converts the section page, then each
 * child page whose article content can be extracted, and joins them with
 * `---` separators under a single section frontmatter block.
 *
 * @param {string} sectionHtml - HTML content of the section index page
 * @param {string} sectionUrlPath - URL path for the section
 * @param {Array} [childHtmls=[]] - Array of objects with {html, url} for each child page
 * @returns {Promise<string|null>} Aggregated markdown content, or null when
 *   the JavaScript path finds no article content in the section page
 */
async function convertSectionToMarkdown(
  sectionHtml,
  sectionUrlPath,
  childHtmls = []
) {
  await ensureProductDataInitialized();

  // Detect base URL for the environment
  const baseUrl = detectBaseUrl();

  // Use Rust converter if available (10× faster)
  if (USE_RUST && rustConverter) {
    try {
      // `await` inside the try block: without it a rejected promise from the
      // native binding would escape this catch and skip the fallback.
      return await rustConverter.convertSectionToMarkdown(sectionHtml, sectionUrlPath, childHtmls, baseUrl);
    } catch (err) {
      console.warn(`Rust section conversion failed for ${sectionUrlPath}, falling back to JavaScript:`, err.message);
      // Fall through to JavaScript implementation
    }
  }

  // JavaScript fallback implementation
  // Extract section metadata and content
  const sectionMetadata = extractArticleContent(sectionHtml, sectionUrlPath);
  if (!sectionMetadata) {
    return null;
  }

  const turndownService = createTurndownService();

  // Convert HTML to markdown, then clean up excessive newlines and
  // separator artifacts.
  const toCleanMarkdown = (html) =>
    turndownService
      .turndown(html)
      .replace(/\n{3,}/g, '\n\n')
      .replace(/\* \* \*\s*\n\s*\* \* \*/g, '')
      .replace(/\* \* \*\s*$/g, '')
      .trim();

  const sectionMarkdown = toCleanMarkdown(sectionMetadata.content);

  // Process child pages
  const childContents = [];
  const childPageInfo = [];

  for (const { html, url } of childHtmls) {
    const childMetadata = extractArticleContent(html, url);
    if (!childMetadata) {
      // Pages without article content are skipped.
      continue;
    }

    // Remove the first h1 heading (page title) to avoid redundancy since it
    // is re-added below as an h2. The `|$` alternative also strips the
    // heading when it is the whole page (no trailing newline after trim).
    const childMarkdown = toCleanMarkdown(childMetadata.content).replace(
      /^#\s+.+?(?:\n+|$)/,
      ''
    );

    // Add child page title as heading
    childContents.push(`## ${childMetadata.title}\n\n${childMarkdown}`);

    // Track child page info for frontmatter
    childPageInfo.push({
      url,
      title: childMetadata.title,
      content: childMarkdown,
    });
  }

  // Generate section frontmatter with child page info and full URLs
  const frontmatter = generateSectionFrontmatter(
    { ...sectionMetadata, content: sectionMarkdown },
    sectionUrlPath,
    childPageInfo,
    baseUrl
  );

  // Combine section content with child pages
  const allContent = [sectionMarkdown, ...childContents].join('\n\n---\n\n');

  return `${frontmatter}\n\n${allContent}\n`;
}

// Public CommonJS API of this module.
// The helpers detectBaseUrl, ensureProductDataInitialized and
// getProductFromPath are used internally and are not exported.
module.exports = {
  detectProduct,
  createTurndownService,
  extractArticleContent,
  generateFrontmatter,
  generateSectionFrontmatter,
  convertToMarkdown,
  convertSectionToMarkdown,
};