Link checker for doc site.

Please see the in-file comment for details.
2020-07-15 17:08:23 +08:00 · 2020-07-15 17:08:23 +08:00 · d13f959fa1
parent 896034cbc2
commit d13f959fa1
1 changed files with 425 additions and 0 deletions
--- a/scripts/linkchecker.py
+++ b/scripts/linkchecker.py
@ -0,0 +1,425 @@
+#!/usr/bin/env python3
+#
+# This is a link checker for the Kubernetes documentation website.
+# - We cover the following cases for the language you provide via `-l`, which
+#   defaults to 'en'.
+# - If the language specified is not English (`en`), we check if you are
+#   actually using the localized links. For example, if you specify `zh` as
+#   the language, and for link target `/docs/foo/bar`, we check if the English
+#   version exists AND if the Chinese version exists as well. A checking record
+#   is produced if the link can use the localized version.
+#
+# Usage: linkchecker.py -h
+#
+# Cases handled:
+#
+# - [foo](#bar)                         : ignored currently
+# + [foo](http://bar)                   : insecure links to external site
+# + [foo](https://k8s.io/website/...)   : hardcoded site domain name
+#
+# + [foo](/<lang>/docs/bar/...)  : where <lang> is not 'en'
+#   + /<lang>/docs/bar           : contains shortcode, so ignore, or
+#   + /<lang>/docs/bar           : is an image link (ignore currently), or
+#   + /<lang>/docs/bar           : points to shared (non-localized) page, or
+#   + /<lang>/docs/bar.md        : exists for current lang, or
+#   + /<lang>/docs/bar/_index.md : exists for current lang, or
+#   + /<lang>/docs/bar/          : is a redirect entry, or
+#   + /<lang>/docs/bar           : is something we don't understand, then ERR
+#
+# + [foo](/docs/bar/...)
+#   + /docs/bar                : contains shortcode, so ignore, or
+#   + /docs/bar                : is an image link (ignore currently), or
+#   + /docs/bar                : points to a shared (non-localized) page, or
+#   + /docs/bar.md             : exists for current lang, or
+#   + /docs/bar/_index.md      : exists for current lang, or
+#   + /docs/bar                : is a redirect entry, or
+#   + /docs/bar                : is something we don't understand
+#
+
+import argparse
+import glob
+import os
+import re
+import sys
+
+# These are the bad links that don't hurt, though they are good to fix
+BAD_LINK_TYPES = {
+    "B01": {
+        "reason": "Using bad protocol",
+        "level": "WARNING",
+    },
+    "B02": {
+        "reason": "Link target is a redirect entry",
+        "level": "WARNING",
+    },
+    "B03": {
+        "reason": "Intra-site links should use relative path",
+        "level": "WARNING",
+    },
+}
+
+# Constants for colored printing (terminal escape sequences)
+C_RED = "\033[31m"
+C_GREEN = "\033[32m"
+C_YELLOW = "\033[33m"
+C_GRAY = "\033[90m"
+C_CYAN = "\033[36m"
+C_END = "\033[0m"
+
+# Command line arguments shared across functions
+ARGS = None
+# Root directory of the website repository; assigned in main()
+ROOT = None
+# Global result dictionary keyed by page examined
+RESULT = {}
+# Cached redirect entries
+REDIRECTS = {}
+
+
+def new_record(level, message, target):
+    """Create new checking record.
+
+    :param level: Record severity level, one of 'INFO', 'WARNING' and 'ERROR'
+    :param message: Error message string
+    :param target: The link target in question
+    :returns: A string representation of the checking result, which may
+              contain terminal color codes, or None if the record is
+              suppressed (INFO records are suppressed unless verbose).
+    """
+    # INFO records are only reported in verbose mode
+    if level == "INFO" and not ARGS.verbose:
+        return None
+
+    if ARGS.no_color:
+        return target + ": " + message
+
+    # Any level other than INFO or WARNING is rendered as an error
+    if level == "INFO":
+        color = C_GREEN
+    elif level == "WARNING":
+        color = C_YELLOW
+    else:
+        color = C_RED
+
+    return C_GRAY + target + C_END + ": " + color + message + C_END
+
+
+def dump_result():
+    """Print all collected checking records to stdout, grouped by file.
+
+    Each entry of the global RESULT produces a "File: <path>" header (colored
+    unless ARGS.no_color is set) followed by its records, indented by four
+    spaces.
+    """
+    indent = " " * 4
+    for page, records in RESULT.items():
+        header = "File: " + os.path.normpath(page)
+        if not ARGS.no_color:
+            header = C_CYAN + header + C_END
+        print(header)
+        for record in records:
+            print(indent + record)
+
+
+def strip_comments(content):
+    """Strip HTML comments (``<!-- ... -->``) from file content.
+
+    Many localized content pages contain original English content in comments.
+    These comments have to be stripped out before analyzing the links.
+    Doing this using regular expression is difficult. Even the grep tool is
+    not suitable for this use case.
+
+    NOTE: One output item is produced per input line, so line numbers are
+    preserved. This can be useful in future if we want to print out the line
+    numbers for bad links.
+
+    :param content: An iterable of text lines
+    :returns: A list of strings, one per input line, with the commented-out
+              text removed. A line that lies entirely inside a comment
+              becomes an empty string.
+    """
+    result = []
+    in_comment = False
+    for line in content:
+        kept = []
+        pos = 0
+        # Scan the line left to right so that several comments on a single
+        # line, or a comment closing and another one opening, are all handled.
+        while True:
+            if in_comment:
+                end = line.find("-->", pos)
+                if end < 0:
+                    # the rest of the line is still inside the comment
+                    break
+                pos = end + 3  # len("-->")
+                in_comment = False
+            else:
+                start = line.find("<!--", pos)
+                if start < 0:
+                    kept.append(line[pos:])
+                    break
+                kept.append(line[pos:start])
+                pos = start + 4  # len("<!--")
+                in_comment = True
+        result.append("".join(kept))
+
+    return result
+
+
+def normalize_filename(name, ftype="markdown"):
+    """Turn a link target into the name of the regular file it may refer to.
+
+    A single trailing slash is dropped, then '.md' is appended when ftype is
+    "markdown" and '.html' for any other ftype value.
+    """
+    stem = name[:-1] if name.endswith("/") else name
+    suffix = ".md" if ftype == "markdown" else ".html"
+    return stem + suffix
+
+
+def check_file_exists(base, path, ftype="markdown"):
+    """Tell whether a link target resolves to a file below 'base'.
+
+    NOTE: The part of 'path' in front of any '#' is appended to 'base'.
+    Suppose the resulting path string is 'foo/bar': the target exists if
+    'foo/bar.md' ('foo/bar.html' for a non-markdown ftype) is a file, or if
+    'foo/bar' is a directory holding '_index.md', '_index.html' or 'index.md'.
+
+    :param base: The base directory to begin with
+    :param path: The link target which is a relative path string
+    :param ftype: "markdown" to look for a '.md' file, anything else to look
+                  for a '.html' file
+    :returns: A boolean indicating whether the target file exists.
+    """
+    # NOTE: anchor is ignored, can be a todo item
+    link_path = path.split("#")[0]
+
+    if os.path.isfile(base + normalize_filename(link_path, ftype=ftype)):
+        return True
+
+    folder = base + link_path
+    if not os.path.isdir(folder):
+        return False
+
+    # /docs/contribute/style/hugo-shortcodes/ has a plain index.md
+    index_names = ("/_index.md", "/_index.html", "/index.md")
+    return any(os.path.isfile(folder + name) for name in index_names)
+
+
+def get_redirect(path):
+    """Check if the path exists in the redirect database.
+
+    The anchor part of 'path' (everything from the first '#') is dropped and
+    a trailing '/' is added if missing before the lookup. Chains of redirects
+    are followed until an entry has no further redirect.
+
+    NOTE: We do NOT check if the redirect target is there or not. We do an
+    **exact** matching for redirection entries.
+
+    :param path: The link target to look up
+    :returns: The final redirect target if any, or None if not found.
+    """
+    # NOTE: anchor is ignored, can be a future todo
+    target = path.split("#")[0]
+    if not target.endswith("/"):
+        target += "/"
+
+    # Remember every entry visited so that a redirect loop
+    # (e.g. /a/ -> /b/ -> /a/) terminates instead of spinning forever.
+    seen = {target}
+    last_target = REDIRECTS.get(target)
+    while last_target:
+        seen.add(last_target)
+        next_target = REDIRECTS.get(last_target)
+        if next_target is None or next_target in seen:
+            break
+        last_target = next_target
+
+    return last_target
+
+
+def check_target(page, anchor, target):
+    """Check a link from anchor to target on provided page.
+
+    :param page: Currently not used. Passed here in case we want to check the
+                 in-page links in the future.
+    :param anchor: Anchor string from the content page. This is provided to
+                help handle cases where target is empty.
+    :param target: The link target string to check
+    :returns: A checking record (string) if there is something to report, or
+              None if the target is fine or the record is suppressed.
+    """
+    target = target.strip()
+    # B01: bad protocol
+    if target.startswith("http://"):
+        return new_record("WARNING", "Use HTTPS rather than HTTP", target)
+
+    # full link
+    if target.startswith("https://"):
+        # B03: self link, should revise to relative path
+        if target.startswith(("https://k8s.io/docs",
+                              "https://kubernetes.io/docs")):
+            return new_record("ERROR", "Should use relative paths", target)
+        # external link, skip
+        return new_record("INFO", "External link, skipped", target)
+
+    # in-page link
+    # TODO: check if the target anchor does exist
+    if target.startswith("#"):
+        return new_record("INFO", "In-page link, skipped", target)
+
+    # Link has shortcode; it may sit anywhere, including at the very start
+    if "{{" in target:
+        return new_record("INFO", "Link has shortcode, skipped", target)
+
+    # TODO: check links to examples
+    if target.startswith("/examples/"):
+        return new_record("WARNING", "Examples link, skipped", target)
+
+    # it is an embedded image
+    # TODO: an image might get translated as well
+    if target.endswith((".png", ".svg")):
+        return new_record("INFO", "Link to image, skipped", target)
+
+    localized_prefix = "/" + ARGS.lang + "/docs/"
+
+    # neither an English nor a localized docs page
+    if not target.startswith(("/docs/", localized_prefix)):
+        msg = "Link may be wrong for the anchor [%s]" % anchor
+        return new_record("WARNING", msg, target)
+
+    # target is shared reference (kubectl or kubernetes-api)?
+    if ("/docs/reference/generated/kubectl/" in target or
+            "/docs/reference/generated/kubernetes-api/" in target):
+        if check_file_exists(ROOT + "/static", target, "html"):
+            return None
+        return new_record("ERROR", "Missing shared reference", target)
+
+    # target is a markdown (.md) or a "<dir>/_index.md"?
+    if target.startswith("/docs/"):
+        base = os.path.join(ROOT, "content", "en")
+    else:
+        # localized target, the language is already part of the path
+        base = os.path.join(ROOT, "content")
+
+    if check_file_exists(base, target):
+        # We don't do additional checks for English site even if it has
+        # links to a non-English page
+        if ARGS.lang == "en":
+            return None
+
+        # If we are already checking localized link, fine
+        if target.startswith(localized_prefix):
+            return None
+
+        # additional check for localization even if English target exists
+        base = os.path.join(ROOT, "content", ARGS.lang)
+        if not check_file_exists(base, target):
+            # Still to be translated
+            return None
+        msg = ("Localized page detected, please prepend '/%s' to the target"
+               % ARGS.lang)
+        return new_record("ERROR", msg, target)
+
+    # target might be a redirect entry
+    real_target = get_redirect(target)
+    if real_target:
+        msg = ("Link using redirect records, should use %s instead" %
+               real_target)
+        return new_record("WARNING", msg, target)
+    return new_record("ERROR", "Missing link for [%s]" % anchor, target)
+
+
+def validate_links(page):
+    """Find and validate links on a content page.
+
+    The page is read, HTML comments are stripped, and every markdown link of
+    the form ``[text](target)`` is passed to check_target(). The resulting
+    checking records, if any, are stored in the global variable RESULT under
+    the page's path. A page that cannot be read is reported on stdout and
+    skipped.
+
+    :param page: Path of the content page to examine
+    """
+    try:
+        # Read as UTF-8 explicitly so that decoding does not depend on the
+        # platform's default locale encoding.
+        with open(page, "r", encoding="utf-8") as f:
+            data = f.readlines()
+    except (OSError, UnicodeError) as ex:
+        print("[Error] failed in reading markdown file: " + str(ex))
+        return
+
+    # Lines still carry their line terminator; drop it before joining so that
+    # the joined text has exactly one line per line of the original file.
+    lines = (line.rstrip("\r\n") for line in strip_comments(data))
+    content = "\n".join(lines)
+
+    # Single results: searches for pattern: []()
+    link_pattern = r"\[([`/\w\s\n]*)\]\(([^\)]*)\)"
+    regex = re.compile(link_pattern)
+
+    records = []
+    for text, link in regex.findall(content):
+        record = check_target(page, text, link)
+        if record:
+            records.append(record)
+    if records:
+        RESULT[page] = records
+
+
+def parse_arguments():
+    """Argument parser.
+
+    Parses the command line found in sys.argv.
+
+    :returns: The parsed arguments, with attributes 'lang', 'verbose',
+              'filter' and 'no_color'.
+    """
+    parser = argparse.ArgumentParser(description="Links checker for docs.")
+    parser.add_argument("-l", dest="lang", default="en", metavar="<LANG>",
+                        help=("two letter language code, e.g. 'zh'. "
+                              "(default='en')"))
+    parser.add_argument("-v", dest="verbose", action="store_true",
+                        help="switch on verbose level")
+    # The help text must quote the same pattern as the 'default' value
+    parser.add_argument("-f", dest="filter", default="/docs/**/*.md",
+                        metavar="<FILTER>",
+                        help=("File pattern to scan, e.g. '/docs/foo.md'. "
+                              "(default='/docs/**/*.md')"))
+    parser.add_argument("-n", "--no-color", action="store_true",
+                        help="Suppress colored printing.")
+
+    return parser.parse_args()
+
+
+def main():
+    """The main entry of the program.
+
+    Parses the command line, loads the redirect entries from
+    '<ROOT>/static/_redirects', validates every page matching the filter
+    under '<ROOT>/content/<lang>' and prints the collected records.
+
+    :returns: 0 on completion, or 1 if the redirects file cannot be read, so
+              that ``sys.exit(main())`` reports the failure to the caller.
+    """
+    global ARGS, ROOT
+
+    ARGS = parse_arguments()
+    print("Language: " + ARGS.lang)
+    ROOT = os.path.join(os.path.dirname(__file__), '..')
+    lang_dir = os.path.join(ROOT, 'content', ARGS.lang)
+
+    # read redirects data
+    redirects_fn = os.path.join(ROOT, "static", "_redirects")
+    try:
+        with open(redirects_fn, "r", encoding="utf-8") as f:
+            data = f.readlines()
+    except (OSError, UnicodeError) as ex:
+        print("[Error] failed in reading redirects file: " + str(ex))
+        return 1
+
+    for item in data:
+        parts = item.split()
+        # Skip lines that do not have both a source and a target
+        if len(parts) < 2:
+            continue
+        entry = parts[0]
+        # There are some entries not ended with "/"; keys always end with it
+        if not entry.endswith("/"):
+            entry += "/"
+        REDIRECTS[entry] = parts[1]
+
+    for page in glob.glob(lang_dir + ARGS.filter, recursive=True):
+        validate_links(page)
+
+    dump_result()
+
+    # Done
+    print("Completed link validation.")
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())