#!/usr/bin/env python3
#
# This is a link checker for the Kubernetes documentation website.
# - We cover the following cases for the language you provide via `-l`, which
#   defaults to 'en'.
# - If the language specified is not English (`en`), we check if you are
#   actually using the localized links. For example, if you specify `zh` as
#   the language, and for link target `/docs/foo/bar`, we check if the English
#   version exists AND if the Chinese version exists as well. A checking record
#   is produced if the link can use the localized version.
#
# Usage: linkchecker.py -h
#
# Cases handled:
#
# - [foo](#bar) : ignored currently
# + [foo](http://bar) : insecure links to external site
# + [foo](https://k8s.io/website/...) : hardcoded site domain name
#
# + [foo](/<lang>/docs/bar/...) : where <lang> is not 'en'
#   + /<lang>/docs/bar : contains shortcode, so ignore, or
#   + /<lang>/docs/bar : is an image link (ignore currently), or
#   + /<lang>/docs/bar : points to shared (non-localized) page, or
#   + /<lang>/docs/bar.md : exists for current lang, or
#   + /<lang>/docs/bar/_index.md : exists for current lang, or
#   + /<lang>/docs/bar/ : is a redirect entry, or
#   + /<lang>/docs/bar : is something we don't understand, then ERR
#
# + [foo](/docs/bar/...)
#   + /docs/bar : contains shortcode, so ignore, or
#   + /docs/bar : is an image link (ignore currently), or
#   + /docs/bar : points to a shared (non-localized) page, or
#   + /docs/bar.md : exists for current lang, or
#   + /docs/bar/_index.md : exists for current lang, or
#   + /docs/bar : is a redirect entry, or
#   + /docs/bar : is something we don't understand
#

import argparse
import glob
import os
import re
import sys

# These are the bad links that don't hurt, though they are good to fix.
# NOTE: this table is not consulted by the checks below.
BAD_LINK_TYPES = {
    "B01": {
        "reason": "Using bad protocol",
        "level": "WARNING",
    },
    "B02": {
        "reason": "Link target is a redirect entry",
        "level": "WARNING",
    },
    "B03": {
        "reason": "Intra-site linkes should use relative path",
        "level": "WARNING",
    },
}

# Constants for colored printing
C_RED = "\033[31m"
C_GREEN = "\033[32m"
C_YELLOW = "\033[33m"
C_GRAY = "\033[90m"
C_CYAN = "\033[36m"
C_END = "\033[0m"

# Delimiters of the HTML comments removed by strip_comments()
COMMENT_OPEN = "<!--"
COMMENT_CLOSE = "-->"

# Markdown inline link: [anchor](target). Compiled once, used for every page.
LINK_REGEX = re.compile(r"\[([`/\w\s\n]*)\]\(([^\)]*)\)")

# Command line arguments shared across functions (set by main())
ARGS = None

# Root directory of the website repository (set by main())
ROOT = None

# Global result dictionary keyed by page examined
RESULT = {}

# Cached redirect entries; keys always end with "/"
REDIRECTS = {}


def new_record(level, message, target):
    """Create new checking record.

    :param level: Record severity level, one of 'INFO', 'WARNING' and 'ERROR'
    :param message: Error message string
    :param target: The link target in question
    :returns: A string representation the checking result, may contain ASCII
              coded terminal colors, or None if the record is suppressed.
    """
    # INFO records are only reported in verbose mode
    if level == "INFO" and not ARGS.verbose:
        return None

    if ARGS.no_color:
        return target + ": " + message

    if level == "INFO":
        color = C_GREEN
    elif level == "WARNING":
        color = C_YELLOW
    else:
        # default to error
        color = C_RED
    return C_GRAY + target + C_END + ": " + color + message + C_END


def dump_result():
    """Dump result to stdout."""
    for path, path_output in RESULT.items():
        norm_path = os.path.normpath(path)
        if ARGS.no_color:
            print("File: " + norm_path)
        else:
            print(C_CYAN + "File: " + norm_path + C_END)
        for p in path_output:
            print(" " * 4 + p)


def strip_comments(content):
    """Strip HTML comments (``<!-- ... -->``) from file content.

    Many localized content pages contain original English content in comments.
    These comments have to be stripped out before analyzing the links.
    Doing this using regular expression is difficult. Even the grep tool is
    not suitable for this use case.

    Comments may span several lines, and a single line may hold several
    comments. A comment that is opened but not closed on a line swallows the
    rest of that line, including its line terminator.

    NOTE: We preserve line numbers when producing the resulting text: exactly
    one output item is produced per input line. This can be useful in future
    if we want to print out the line numbers for bad links.

    :param content: An iterable of lines
    :returns: A list of lines with the commented parts removed.
    """
    result = []
    in_comment = False
    for line in content:
        kept = []
        pos = 0
        while True:
            if in_comment:
                end = line.find(COMMENT_CLOSE, pos)
                if end < 0:
                    # comment continues on the next line
                    break
                pos = end + len(COMMENT_CLOSE)
                in_comment = False
            else:
                start = line.find(COMMENT_OPEN, pos)
                if start < 0:
                    kept.append(line[pos:])
                    break
                kept.append(line[pos:start])
                pos = start + len(COMMENT_OPEN)
                in_comment = True
        result.append("".join(kept))

    return result


def normalize_filename(name, ftype="markdown"):
    """Guess the filename based on a link target.

    This function only deals with regular files: one trailing "/" is dropped
    and the extension for the given file type is appended.

    :param name: The link target, without anchor
    :param ftype: "markdown" for a ".md" file, anything else for ".html"
    :returns: The guessed file name.
    """
    if name.endswith("/"):
        name = name[:-1]
    return name + (".md" if ftype == "markdown" else ".html")


def check_file_exists(base, path, ftype="markdown"):
    """Check if the target file exists.

    NOTE: We build a normalized path using 'base' and 'path' values. Suppose
    the resulted path string is 'foo/bar', we check if 'foo/bar.md' exists,
    AND we check if 'foo/bar/' is a directory holding an index page.

    :param base: The base directory to begin with
    :param path: The link target which is a relative path string
    :param ftype: File type passed on to normalize_filename()
    :returns: A boolean indicating whether the target file exists.
    """
    # NOTE: anchor is ignored, can be a todo item
    bare_path = path.split("#")[0]

    # 'path' is expected to start with "/", hence plain concatenation
    if os.path.isfile(base + normalize_filename(bare_path, ftype=ftype)):
        return True

    dir_name = base + bare_path
    if os.path.isdir(dir_name):
        # "index.md" is used by e.g. /docs/contribute/style/hugo-shortcodes/;
        # "index.html" is accepted as a directory index as well
        for index in ("_index.md", "_index.html", "index.md", "index.html"):
            if os.path.isfile(dir_name + "/" + index):
                return True

    return False


def _redirect_key(path):
    """Turn a link or redirect target into a REDIRECTS lookup key."""
    # NOTE: anchor is ignored, can be a future todo
    key = path.split("#")[0]
    if not key.endswith("/"):
        key += "/"
    return key


def get_redirect(path):
    """Check if the path exists in the redirect database.

    Redirect chains are followed to their end. Every hop is normalized the
    same way the REDIRECTS keys are (anchor dropped, trailing "/" enforced),
    and the walk stops as soon as an entry is visited twice so that a cycle in
    the database cannot hang the checker.

    NOTE: We do NOT check if the redirect target is there or not.
    We do an **exact** matching for redirection entries.

    :param path: The link target to look up
    :returns: The redirect target if any, or None if not found.
    """
    last_target = None
    key = _redirect_key(path)
    seen = set()
    while key not in seen:
        seen.add(key)
        new_target = REDIRECTS.get(key)  # EXACT MATCH
        if not new_target:
            break
        last_target = new_target
        key = _redirect_key(new_target)

    return last_target


def check_target(page, anchor, target):
    """Check a link from anchor to target on provided page.

    :param page: Currently not used. Passed here in case we want to check the
                 in-page links in the future.
    :param anchor: Anchor string from the content page. This is provided to
                   help handle cases where target is empty.
    :param target: The link target string to check
    :returns: A checking record (string) if errors found, or None if we can
              find the target link.
    """
    target = target.strip()

    # B01: bad protocol
    if target.startswith("http://"):
        return new_record("WARNING", "Use HTTPS rather than HTTP", target)

    # full link
    if target.startswith("https://"):
        # B03: self link, should revise to relative path
        if target.startswith(("https://k8s.io/docs",
                              "https://kubernetes.io/docs")):
            return new_record("ERROR", "Should use relative paths", target)
        # external link, skip
        return new_record("INFO", "External link, skipped", target)

    # in-page link
    # TODO: check if the target anchor does exists
    if target.startswith("#"):
        return new_record("INFO", "In-page link, skipped", target)

    # Link has shortcode; the shortcode may well be the very first thing
    if "{{" in target:
        return new_record("INFO", "Link has shortcode, skipped", target)

    # TODO: check links to examples
    if target.startswith("/examples/"):
        return new_record("WARNING", "Examples link, skipped", target)

    # it is an embedded image
    # TODO: an image might get translated as well
    if target.endswith((".png", ".svg")):
        return new_record("INFO", "Link to image, skipped", target)

    localized_prefix = "/" + ARGS.lang + "/docs/"

    # link to English or localized page
    if target.startswith(("/docs/", localized_prefix)):

        # target is shared reference (kubectl or kubernetes-api)?
        if ("/docs/reference/generated/kubectl/" in target or
                "/docs/reference/generated/kubernetes-api/" in target):
            if check_file_exists(ROOT + "/static", target, "html"):
                return None
            return new_record("ERROR", "Missing shared reference", target)

        # target is a markdown (.md) or a "/_index.md"?
        if target.startswith("/docs/"):
            base = os.path.join(ROOT, "content", "en")
        else:
            # localized target, the language is part of the target itself
            base = os.path.join(ROOT, "content")

        if check_file_exists(base, target):
            # We don't do additional checks for English site even if it has
            # links to a non-English page
            if ARGS.lang == "en":
                return None

            # If we are already checking localized link, fine
            if target.startswith(localized_prefix):
                return None

            # additional check for localization even if English target exists
            base = os.path.join(ROOT, "content", ARGS.lang)
            if not check_file_exists(base, target):
                # Still to be translated
                return None
            msg = ("Localized page detected, please prepend '/%s' to the "
                   "target" % ARGS.lang)
            return new_record("ERROR", msg, target)

        # target might be a redirect entry
        real_target = get_redirect(target)
        if real_target:
            msg = ("Link using redirect records, should use %s instead"
                   % real_target)
            return new_record("WARNING", msg, target)
        return new_record("ERROR", "Missing link for [%s]" % anchor, target)

    msg = "Link may be wrong for the anchor [%s]" % anchor
    return new_record("WARNING", msg, target)


def validate_links(page):
    """Find and validate links on a content page.

    The checking records are consolidated into the global variable RESULT.
    A page that cannot be read is reported on stdout and skipped.

    :param page: Path of the markdown file to examine
    """
    try:
        # content pages are UTF-8 whatever the locale of the machine is
        with open(page, "r", encoding="utf-8") as f:
            data = f.readlines()
    except (OSError, UnicodeError) as ex:
        print("[Error] failed in reading markdown file: " + str(ex))
        return

    content = "\n".join(strip_comments(data))

    # Single results: searches for pattern: [<anchor>](<target>)
    records = []
    for anchor, target in LINK_REGEX.findall(content):
        r = check_target(page, anchor, target)
        if r:
            records.append(r)

    if records:
        RESULT[page] = records


def parse_arguments():
    """Argument parser.

    :returns: The parsed arguments; main() saves them into the global
              variable ARGS.
    """
    parser = argparse.ArgumentParser(description="Links checker for docs.")
    parser.add_argument("-l", dest="lang", default="en", metavar="<LANG>",
                        help=("two letter language code, e.g. 'zh'. "
                              "(default='en')"))
    parser.add_argument("-v", dest="verbose", action="store_true",
                        help="switch on verbose level")
    parser.add_argument("-f", dest="filter", default="/docs/**/*.md",
                        metavar="<FILTER>",
                        help=("File pattern to scan, e.g. '/docs/foo.md'. "
                              "(default='/docs/**/*.md')"))
    parser.add_argument("-n", "--no-color", action="store_true",
                        help="Suppress colored printing.")
    return parser.parse_args()


def main():
    """The main entry of the program.

    :returns: The process exit status: 0 when the scan completed, 1 when the
              redirects file could not be read.
    """
    global ARGS, ROOT

    ARGS = parse_arguments()
    print("Language: " + ARGS.lang)

    ROOT = os.path.join(os.path.dirname(__file__), "..")
    content_dir = os.path.join(ROOT, "content")
    lang_dir = os.path.join(content_dir, ARGS.lang)

    # read redirects data
    redirects_fn = os.path.join(ROOT, "static", "_redirects")
    try:
        with open(redirects_fn, "r", encoding="utf-8") as f:
            data = f.readlines()
    except (OSError, UnicodeError) as ex:
        print("[Error] failed in reading redirects file: " + str(ex))
        return 1

    for item in data:
        parts = item.split()
        # Skip blank lines, lone entries and comment lines.
        # There are entries without 301 specified, so two fields are enough.
        if len(parts) < 2 or parts[0].startswith("#"):
            continue
        entry = parts[0]
        # There are some entries not ended with "/"
        if not entry.endswith("/"):
            entry += "/"
        REDIRECTS[entry] = parts[1]

    # ARGS.filter is expected to start with "/", hence plain concatenation;
    # sorted so that the report is stable between runs
    for page in sorted(glob.glob(lang_dir + ARGS.filter, recursive=True)):
        validate_links(page)

    dump_result()

    # Done
    print("Completed link validation.")
    return 0


if __name__ == '__main__':
    sys.exit(main())