From 95257a2eddd1a9733dd622e3d97c28c2d0738f59 Mon Sep 17 00:00:00 2001 From: Qiming Teng Date: Sat, 23 Apr 2022 16:56:15 +0800 Subject: [PATCH] Improve the linkchecker script The linkchecker script is not working the same way as the `scripts/lsync.sh`. - The path must start with '/docs'. This is not implied in any way. - The language can be deduced if the user provides a full path to a markdown file, e.g. `content/en/docs/concepts/security/controlling-access.md`. - The path parameter could use a positional argument for ease of use. This PR improves the user experience for the tool. --- scripts/linkchecker.py | 110 +++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 47 deletions(-) diff --git a/scripts/linkchecker.py b/scripts/linkchecker.py index 5bc63e1d7f..6dfd39b81e 100755 --- a/scripts/linkchecker.py +++ b/scripts/linkchecker.py @@ -1,13 +1,12 @@ #!/usr/bin/env python3 # # This a link checker for Kubernetes documentation website. -# - We cover the following cases for the language you provide via `-l`, which -# defaults to 'en'. -# - If the language specified is not English (`en`), we check if you are -# actually using the localized links. For example, if you specify `zh` as -# the language, and for link target `/docs/foo/bar`, we check if the English -# version exists AND if the Chinese version exists as well. A checking record -# is produced if the link can use the localized version. +# +# If the language to check is not English (`en`), we check if you are actually +# using the localized links. For example, if you are checking +# `content/zh/docs/foo/bar`, we check if the English version exists AND if the +# Chinese version exists as well. A checking record is produced if the link +# can use the localized version. 
# # Usage: linkchecker.py -h # @@ -64,12 +63,16 @@ BAD_LINK_TYPES = { C_RED = "\033[31m" C_GREEN = "\033[32m" C_YELLOW = "\033[33m" -C_GRAY = "\033[90m" +C_GRAY = "\033[90m" C_CYAN = "\033[36m" C_END = "\033[0m" # Command line arguments shared across functions ARGS = None +# Command line parser +PARSER = None +# Language as parsed from the file path +LANG = None # Global result dictionary keyed by page examined RESULT = {} # Cached redirect entries @@ -77,6 +80,7 @@ REDIRECTS = {} # Cached anchors in target pages ANCHORS = {} + def new_record(level, message, target): """Create new checking record. @@ -89,7 +93,7 @@ def new_record(level, message, target): global ARGS # Skip info when verbose - if ARGS.verbose == False and level == "INFO": + if ARGS.verbose is False and level == "INFO": return None result = None @@ -98,9 +102,9 @@ def new_record(level, message, target): else: target = C_GRAY + target + C_END if level == "INFO": - result = target + ": " + C_GREEN + message + C_END + result = target + ": " + C_GREEN + message + C_END elif level == "WARNING": - result = target + ": " + C_YELLOW+ message + C_END + result = target + ": " + C_YELLOW + message + C_END else: # default to error result = target + ": " + C_RED + message + C_END @@ -286,7 +290,7 @@ def check_target(page, anchor, target): # link to English or localized page if (target.startswith("/docs/") or - target.startswith("/" + ARGS.lang + "/docs/")): + target.startswith("/" + LANG + "/docs/")): # target is shared reference (kubectl or kubernetes-api? 
if (target.find("/docs/reference/generated/kubectl/") >= 0 or @@ -305,22 +309,22 @@ def check_target(page, anchor, target): if ok: # We do't do additional checks for English site even if it has # links to a non-English page - if ARGS.lang == "en": + if LANG == "en": return None # If we are already checking localized link, fine - if target.startswith("/" + ARGS.lang + "/docs/"): + if target.startswith("/" + LANG + "/docs/"): return None # additional check for localization even if English target exists - base = os.path.join(ROOT, "content", ARGS.lang) + base = os.path.join(ROOT, "content", LANG) found = check_file_exists(base, target) if not found: # Still to be translated return None msg = ("Localized page detected, please append '/%s' to the target" - % ARGS.lang) - return new_record("ERROR", "Link not using localized page", target) + % LANG) + return new_record("ERROR", msg, target) # taget might be a redirect entry real_target = get_redirect(target) @@ -333,15 +337,16 @@ def check_target(page, anchor, target): msg = "Link may be wrong for the anchor [%s]" % anchor return new_record("WARNING", msg, target) -def check_anchor(target_page, anchor): + +def check_anchor(target, anchor): """Check if an anchor is defined in the target page - :param target_page: The target page to check + :param target: The target page to check :param anchor: Anchor string to find in the target page """ - if target_page not in ANCHORS: + if target not in ANCHORS: try: - with open(target_page, "r") as f: + with open(target, "r") as f: data = f.readlines() except Exception as ex: print("[Error] failed in reading markdown file: " + str(ex)) @@ -351,8 +356,9 @@ def check_anchor(target_page, anchor): regex1 = re.compile(anchor_pattern1) anchor_pattern2 = r"{#(.*?)}" regex2 = re.compile(anchor_pattern2) - ANCHORS[target_page] = regex1.findall(content) + regex2.findall(content) - return anchor in ANCHORS[target_page] + ANCHORS[target] = regex1.findall(content) + regex2.findall(content) + return 
anchor in ANCHORS[target] + def check_apiref_target(target, anchor): """Check a link to an API reference page. @@ -360,7 +366,8 @@ def check_apiref_target(target, anchor): :param target: The link target string to check :param anchor: Anchor string from the content page """ - base = os.path.join(ROOT, "content", "en", "docs", "reference", "kubernetes-api") + base = os.path.join(ROOT, "content", "en", "docs", "reference", + "kubernetes-api") ok = check_file_exists(base + "/", target) if not ok: return new_record("ERROR", "API reference page not found", target) @@ -370,7 +377,9 @@ def check_apiref_target(target, anchor): target_page = os.path.join(base, target)+".md" if not check_anchor(target_page, anchor): - return new_record("ERROR", "Anchor not found in API reference page", target+"#"+anchor) + return new_record("ERROR", "Anchor not found in API reference page", + target+"#"+anchor) + def validate_links(page): """Find and validate links on a content page. @@ -398,8 +407,8 @@ def validate_links(page): records.append(r) # searches for pattern: {{< api-reference page="" anchor="" - apiref_pattern = r"{{ *< *api-reference page=\"([^\"]*?)\" *anchor=\"(.*?)\"" - regex = re.compile(apiref_pattern) + apiref_re = r"{{ *< *api-reference page=\"([^\"]*?)\" *anchor=\"(.*?)\"" + regex = re.compile(apiref_re) matches = regex.findall(content) for m in matches: @@ -408,8 +417,8 @@ def validate_links(page): records.append(r) # searches for pattern: {{< api-reference page="" - apiref_pattern = r"{{ *< *api-reference page=\"([^\"]*?)\"" - regex = re.compile(apiref_pattern) + apiref_re = r"{{ *< *api-reference page=\"([^\"]*?)\"" + regex = re.compile(apiref_re) matches = regex.findall(content) for m in matches: @@ -426,31 +435,38 @@ def parse_arguments(): Result is returned and saved into global variable ARGS. 
""" - parser = argparse.ArgumentParser(description="Links checker for docs.") - parser.add_argument("-l", dest="lang", default="en", metavar="", - help=("two letter language code, e.g. 'zh'. " - "(default='en')")) - parser.add_argument("-v", dest="verbose", action="store_true", - help="switch on verbose level") - parser.add_argument("-f", dest="filter", default="/docs/**/*.md", - metavar="", - help=("File pattern to scan, e.g. '/docs/foo.md'. " - "(default='/docs/**/*.md')")) - parser.add_argument("-n", "--no-color", action="store_true", - help="Suppress colored printing.") + global PARSER - return parser.parse_args() + PARSER = argparse.ArgumentParser(description="Links checker for docs.") + PARSER.add_argument("-v", dest="verbose", action="store_true", + help="switch on verbose level") + PARSER.add_argument("-n", "--no-color", action="store_true", + help="Suppress colored printing.") + PARSER.add_argument("-f", dest="filter", default="content/en/docs/**/*.md", + metavar="", + help=("File pattern to scan. 
" + "(default='content/en/docs/**/*.md')")) + + return PARSER.parse_args() def main(): """The main entry of the program.""" - global ARGS, ROOT, REDIRECTS + global ARGS, ROOT, REDIRECTS, PARSER, LANG ARGS = parse_arguments() - print("Language: " + ARGS.lang) ROOT = os.path.join(os.path.dirname(__file__), '..') - content_dir = os.path.join(ROOT, 'content') - lang_dir = os.path.join(content_dir, ARGS.lang) + + print(ARGS.filter) + parts = ARGS.filter.split("/", 2) + if len(parts) != 3 or parts[0] != "content": + print("ERROR:\nPlease specify file pattern in the format " + "'content//', for example:\n" + "'content/zh/docs/concepts/**/*.md'\n") + PARSER.print_help() + sys.exit(-1) + + LANG = parts[1] # read redirects data redirects_fn = os.path.join(ROOT, "static", "_redirects") @@ -473,7 +489,7 @@ def main(): print("[Error] failed in reading redirects file: " + str(ex)) return - folders = [f for f in glob.glob(lang_dir + ARGS.filter, recursive=True)] + folders = [f for f in glob.glob(ARGS.filter, recursive=True)] for page in folders: validate_links(page)