From d13f959fa182f4dafd7df7639087b736c1e91787 Mon Sep 17 00:00:00 2001 From: Qiming Teng Date: Wed, 15 Jul 2020 17:08:23 +0800 Subject: [PATCH] Link checker for doc site. Please see the in-file comment for details. --- scripts/linkchecker.py | 425 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 425 insertions(+) create mode 100755 scripts/linkchecker.py diff --git a/scripts/linkchecker.py b/scripts/linkchecker.py new file mode 100755 index 0000000000..71f40ac22b --- /dev/null +++ b/scripts/linkchecker.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python3 +# +# This a link checker for Kubernetes documentation website. +# - We cover the following cases for the language you provide via `-l`, which +# defaults to 'en'. +# - If the language specified is not English (`en`), we check if you are +# actually using the localized links. For example, if you specify `zh` as +# the language, and for link target `/docs/foo/bar`, we check if the English +# version exists AND if the Chinese version exists as well. A checking record +# is produced if the link can use the localized version. +# +# Usage: linkchecker.py -h +# +# Cases handled: +# +# - [foo](#bar) : ignored currently +# + [foo](http://bar) : insecure links to external site +# + [foo](https://k8s.io/website/...) : hardcoded site domain name +# +# + [foo](//docs/bar/...) : where is not 'en' +# + //docs/bar : contains shortcode, so ignore, or +# + //docs/bar : is a image link (ignore currently), or +# + //docs/bar : points to shared (non-localized) page, or +# + //docs/bar.md : exists for current lang, or +# + //docs/bar/_index.md : exists for current lang, or +# + //docs/bar/ : is a redirect entry, or +# + //docs/bar : is something we don't understand, then ERR +# +# + [foo](/docs/bar/...) +# + /docs/bar : contains shortcode, so ignore, or +# + /docs/bar : is a image link (ignore currently), or +# + /docs/bar : points to a shared (non-localized) page, or +# + /docs/bar.md : exists for current lang, or +# + /docs/bar/_index.md : exists for current lang, or +# + /docs/bar : is a redirect entry, or +# + /docs/bar : is something we don't understand +# + +import argparse +import glob +import os +import re +import sys + +# These are the bad links that doesn't hurt, though good to fix +BAD_LINK_TYPES = { + "B01": { + "reason": "Using bad protocol", + "level": "WARNING", + }, + "B02": { + "reason": "Link target is a redirect entry", + "level": "WARNING", + }, + "B03": { + "reason": "Intra-site linkes should use relative path", + "level": "WARNING", + }, +} + +# Constants for colored printing +C_RED = "\033[31m" +C_GREEN = "\033[32m" +C_YELLOW = "\033[33m" +C_GRAY = "\033[90m" +C_CYAN = "\033[36m" +C_END = "\033[0m" + +# Command line arguments shared across functions +ARGS = None +# Global result dictionary keyed by page examined +RESULT = {} +# Cached redirect entries +REDIRECTS = {} + + +def new_record(level, message, target): + """Create new checking record. + + :param level: Record severity level, one of 'INFO', 'WARNING' and 'ERROR' + :param message: Error message string + :param target: The link target in question + :returns: A string representation the checking result, may contain ASCII + coded terminal colors, or None if the record is suppressed. + """ + global ARGS + + # Skip info when verbose + if ARGS.verbose == False and level == "INFO": + return None + + result = None + if ARGS.no_color: + result = target + ": " + message + else: + target = C_GRAY + target + C_END + if level == "INFO": + result = target + ": " + C_GREEN + message + C_END + elif level == "WARNING": + result = target + ": " + C_YELLOW+ message + C_END + else: # default to error + result = target + ": " + C_RED + message + C_END + + return result + + +def dump_result(): + """Dump result to stdout.""" + global RESULT, ARGS + + for path, path_output in RESULT.items(): + norm_path = os.path.normpath(path) + if ARGS.no_color: + print("File: " + norm_path) + else: + print(C_CYAN + "File: " + norm_path + C_END) + for p in path_output: + print(" "*4 + p) + return + + +def strip_comments(content): + """Manual striping of comments from file content. + + Many localized content pages contain original English content in comments. + These comments have to be stripped out before analyzing the links. + Doing this using regular expression is difficult. Even the grep tool is + not suitable for this use case. + + NOTE: We strived to preserve line numbers when producing the resulted + text. This can be useful in future if we want to print out the line + numbers for bad links. + """ + result = [] + in_comment = False + for line in content: + idx1 = line.find("") + if not in_comment: + # only care if new comment started + if idx1 < 0: + result.append(line) + continue + + # single line comment + if idx2 > 0: + result.append(line[:idx1] + line[idx2+4:]) + continue + result.append(line[:idx1]) + in_comment = True + continue + + # already in comment block + if idx2 < 0: # ignore whole line + result.append("") + continue + result.append(line[idx2+4:]) + in_comment = False + + return result + + +def normalize_filename(name, ftype="markdown"): + """Guess the filename based on a link target. + + This function only deals with regular files. + """ + if name.endswith("/"): + name = name[:-1] + if ftype == "markdown": + name += ".md" + else: + name += ".html" + return name + + +def check_file_exists(base, path, ftype="markdown"): + """Check if the target file exists. + + NOTE: We build a normalized path using 'base' and 'path' values. Suppose + the resulted path string is 'foo/bar', we check if 'foo/bar.md' exists, + AND we check if 'foo/bar/_index.md' exists. + + :param base: The base directory to begin with + :param path: The link target which is a relative path string + :returns: A boolean indicating whether the target file exists. + """ + # NOTE: anchor is ignored, can be a todo item + parts = path.split("#") + + fn = normalize_filename(parts[0], ftype=ftype) + target = base + fn + + if os.path.isfile(target): + return True + + dir_name = base + parts[0] + if os.path.isdir(dir_name): + if os.path.isfile(dir_name + "/_index.md"): + return True + if os.path.isfile(dir_name + "/_index.html"): + return True + # /docs/contribute/style/hugo-shortcodes/ has this + if os.path.isfile(dir_name + "/index.md"): + return True + return False + + +def get_redirect(path): + """Check if the path exists in the redirect database. + + NOTE: We do NOT check if the redirect target is there or not. We do an + **exact** matching for redirection entries. + :returns: The redirect target if any, or None if not found. + """ + global REDIRECTS + + def _check_redirect(t): + for key, value in REDIRECTS.items(): + if key == t: # EXACT MATCH + return value + return None + + # NOTE: anchor is ignored, can be a future todo + parts = path.split("#") + target = parts[0] + if not target.endswith("/"): + target += "/" + + new_target = _check_redirect(target) + last_target = new_target + while new_target: + new_target = _check_redirect(new_target) + if new_target is None: + break + last_target = new_target + + return last_target + + +def check_target(page, anchor, target): + """Check a link from anchor to target on provided page. + + :param page: Currently not used. Passed here in case we want to check the + in-page links in the future. + :param anchor: Anchor string from the content page. This is provided to + help handle cases where target is empty. + :param target: The link target string to check + :returns: A checking record (string) if errors found, or None if we can + find the target link. + """ + target = target.strip() + # B01: bad protocol + if target.startswith("http://"): + return new_record("WARNING", "Use HTTPS rather than HTTP", target) + + # full link + if target.startswith("https://"): + # B03: self link, should revise to relative path + if (target.startswith("https://k8s.io/docs") or + target.startswith("https://kubernetes.io/docs")): + return new_record("ERROR", "Should use relative paths", target) + # external link, skip + return new_record("INFO", "External link, skipped", target) + + # in-page link + # TODO: check if the target anchor does exists + if target.startswith("#"): + return new_record("INFO", "In-page link, skipped", target) + + # Link has shortcode + if target.find("{{") > 0: + return new_record("INFO", "Link has shortcode, skipped", target) + + # TODO: check links to examples + if target.startswith("/examples/"): + return new_record("WARNING", "Examples link, skipped", target) + + # it is an embedded image + # TODO: an image might get translated as well + if target.endswith(".png") or target.endswith(".svg"): + return new_record("INFO", "Link to image, skipped", target) + + # link to English or localized page + if (target.startswith("/docs/") or + target.startswith("/" + ARGS.lang + "/docs/")): + + # target is shared reference (kubectl or kubernetes-api? + if (target.find("/docs/reference/generated/kubectl/") >= 0 or + target.find("/docs/reference/generated/kubernetes-api/") >= 0): + if check_file_exists(ROOT + "/static", target, "html"): + return None + return new_record("ERROR", "Missing shared reference", target) + + # target is a markdown (.md) or a "/_index.md"? + if target.startswith("/docs/"): + base = os.path.join(ROOT, "content", "en") + else: + # localized target + base = os.path.join(ROOT, "content") + ok = check_file_exists(base, target) + if ok: + # We do't do additional checks for English site even if it has + # links to a non-English page + if ARGS.lang == "en": + return None + + # If we are already checking localized link, fine + if target.startswith("/" + ARGS.lang + "/docs/"): + return None + + # additional check for localization even if English target exists + base = os.path.join(ROOT, "content", ARGS.lang) + found = check_file_exists(base, target) + if not found: + # Still to be translated + return None + msg = ("Localized page detected, please append '/%s' to the target" + % ARGS.lang) + return new_record("ERROR", "Link not using localized page", target) + + # taget might be a redirect entry + real_target = get_redirect(target) + if real_target: + msg = ("Link using redirect records, should use %s instead" % + real_target) + return new_record("WARNING", msg, target) + return new_record("ERROR", "Missing link for [%s]" % anchor, target) + + msg = "Link may be wrong for the anchor [%s]" % anchor + return new_record("WARNING", msg, target) + + +def validate_links(page): + """Find and validate links on a content page. + + The checking records are consolidated into the global variable RESULT. + """ + try: + with open(page, "r") as f: + data = f.readlines() + except Exception as ex: + print("[Error] failed in reading markdown file: " + str(ex)) + return + + content = "\n".join(strip_comments(data)) + + # Single results: searches for pattern: []() + link_pattern = r"\[([`/\w\s\n]*)\]\(([^\)]*)\)" + regex = re.compile(link_pattern) + + matches = regex.findall(content) + records = [] + for m in matches: + r = check_target(page, m[0], m[1]) + if r: + records.append(r) + if len(records): + RESULT[page] = records + + +def parse_arguments(): + """Argument parser. + + Result is returned and saved into global variable ARGS. + """ + parser = argparse.ArgumentParser(description="Links checker for docs.") + parser.add_argument("-l", dest="lang", default="en", metavar="", + help=("two letter language code, e.g. 'zh'. " + "(default='en')")) + parser.add_argument("-v", dest="verbose", action="store_true", + help="switch on verbose level") + parser.add_argument("-f", dest="filter", default="/docs/**/*.md", + metavar="", + help=("File pattern to scan, e.g. '/docs/foo.md'. " + "(default='/docs/foo/*.md')")) + parser.add_argument("-n", "--no-color", action="store_true", + help="Suppress colored printing.") + + return parser.parse_args() + + +def main(): + """The main entry of the program.""" + global ARGS, ROOT, REDIRECTS + + ARGS = parse_arguments() + print("Language: " + ARGS.lang) + ROOT = os.path.join(os.path.dirname(__file__), '..') + content_dir = os.path.join(ROOT, 'content') + lang_dir = os.path.join(content_dir, ARGS.lang) + + # read redirects data + redirects_fn = os.path.join(ROOT, "static", "_redirects") + try: + with open(redirects_fn, "r") as f: + data = f.readlines() + for item in data: + parts = item.split() + # There are entries without 301 specified + if len(parts) < 2: + continue + entry = parts[0] + # There are some entries not ended with "/" + if entry.endswith("/"): + REDIRECTS[entry] = parts[1] + else: + REDIRECTS[entry + "/"] = parts[1] + + except Exception as ex: + print("[Error] failed in reading redirects file: " + str(ex)) + return + + folders = [f for f in glob.glob(lang_dir + ARGS.filter, recursive=True)] + for page in folders: + validate_links(page) + + dump_result() + + # Done + print("Completed link validation.") + + +if __name__ == '__main__': + sys.exit(main())