From 95257a2eddd1a9733dd622e3d97c28c2d0738f59 Mon Sep 17 00:00:00 2001
From: Qiming Teng <tengqm@outlook.com>
Date: Sat, 23 Apr 2022 16:56:15 +0800
Subject: [PATCH] Improve the linkchecker script

The linkchecker script does not work the same way as `scripts/lsync.sh`:

- The path must start with '/docs'. This is not implied in any way.
- The language can be deduced if the user provides a full path to a markdown
  file, e.g. `content/en/docs/concepts/security/controlling-access.md`.
- The path parameter could use a positional argument for ease of use.

This PR improves the user experience for the tool.
---
 scripts/linkchecker.py | 110 +++++++++++++++++++++++------------------
 1 file changed, 63 insertions(+), 47 deletions(-)

diff --git a/scripts/linkchecker.py b/scripts/linkchecker.py
index 5bc63e1d7f..6dfd39b81e 100755
--- a/scripts/linkchecker.py
+++ b/scripts/linkchecker.py
@@ -1,13 +1,12 @@
 #!/usr/bin/env python3
 #
 # This a link checker for Kubernetes documentation website.
-# - We cover the following cases for the language you provide via `-l`, which
-#   defaults to 'en'.
-# - If the language specified is not English (`en`), we check if you are
-#   actually using the localized links. For example, if you specify `zh` as
-#   the language, and for link target `/docs/foo/bar`, we check if the English
-#   version exists AND if the Chinese version exists as well. A checking record
-#   is produced if the link can use the localized version.
+#
+# If the language to check is not English (`en`), we check if you are actually
+# using the localized links. For example, if you are checking
+# `content/zh/docs/foo/bar`, we check if the English version exists AND if the
+# Chinese version exists as well.  A checking record is produced if the link
+# can use the localized version.
 #
 # Usage: linkchecker.py -h
 #
@@ -64,12 +63,16 @@ BAD_LINK_TYPES = {
 C_RED = "\033[31m"
 C_GREEN = "\033[32m"
 C_YELLOW = "\033[33m"
-C_GRAY  = "\033[90m"
+C_GRAY = "\033[90m"
 C_CYAN = "\033[36m"
 C_END = "\033[0m"
 
 # Command line arguments shared across functions
 ARGS = None
+# Command line parser
+PARSER = None
+# Language as parsed from the file path
+LANG = None
 # Global result dictionary keyed by page examined
 RESULT = {}
 # Cached redirect entries
@@ -77,6 +80,7 @@ REDIRECTS = {}
 # Cached anchors in target pages
 ANCHORS = {}
 
+
 def new_record(level, message, target):
     """Create new checking record.
 
@@ -89,7 +93,7 @@ def new_record(level, message, target):
     global ARGS
 
     # Skip info when verbose
-    if ARGS.verbose == False and level == "INFO":
+    if ARGS.verbose is False and level == "INFO":
         return None
 
     result = None
@@ -98,9 +102,9 @@ def new_record(level, message, target):
     else:
         target = C_GRAY + target + C_END
         if level == "INFO":
-            result =  target + ": " + C_GREEN  + message + C_END 
+            result = target + ": " + C_GREEN + message + C_END
         elif level == "WARNING":
-            result = target + ": " + C_YELLOW+ message + C_END
+            result = target + ": " + C_YELLOW + message + C_END
         else:  # default to error
             result = target + ": " + C_RED + message + C_END
 
@@ -286,7 +290,7 @@ def check_target(page, anchor, target):
 
     # link to English or localized page
     if (target.startswith("/docs/") or
-            target.startswith("/" + ARGS.lang + "/docs/")):
+            target.startswith("/" + LANG + "/docs/")):
 
         # target is shared reference (kubectl or kubernetes-api?
         if (target.find("/docs/reference/generated/kubectl/") >= 0 or
@@ -305,22 +309,22 @@ def check_target(page, anchor, target):
         if ok:
             # We do't do additional checks for English site even if it has
             # links to a non-English page
-            if ARGS.lang == "en":
+            if LANG == "en":
                 return None
 
             # If we are already checking localized link, fine
-            if target.startswith("/" + ARGS.lang + "/docs/"):
+            if target.startswith("/" + LANG + "/docs/"):
                 return None
 
             # additional check for localization even if English target exists
-            base = os.path.join(ROOT, "content", ARGS.lang)
+            base = os.path.join(ROOT, "content", LANG)
             found = check_file_exists(base, target)
             if not found:
                 # Still to be translated
                 return None
             msg = ("Localized page detected, please append '/%s' to the target"
-                   % ARGS.lang)
-            return new_record("ERROR", "Link not using localized page", target)
+                   % LANG)
+            return new_record("ERROR", msg, target)
 
         # taget might be a redirect entry
         real_target = get_redirect(target)
@@ -333,15 +337,16 @@ def check_target(page, anchor, target):
     msg = "Link may be wrong for the anchor [%s]" % anchor
     return new_record("WARNING", msg, target)
 
-def check_anchor(target_page, anchor):
+
+def check_anchor(target, anchor):
     """Check if an anchor is defined in the target page
 
-    :param target_page: The target page to check
+    :param target: The target page to check
     :param anchor: Anchor string to find in the target page
     """
-    if target_page not in ANCHORS:
+    if target not in ANCHORS:
         try:
-            with open(target_page, "r") as f:
+            with open(target, "r") as f:
                 data = f.readlines()
         except Exception as ex:
             print("[Error] failed in reading markdown file: " + str(ex))
@@ -351,8 +356,9 @@ def check_anchor(target_page, anchor):
         regex1 = re.compile(anchor_pattern1)
         anchor_pattern2 = r"{#(.*?)}"
         regex2 = re.compile(anchor_pattern2)
-        ANCHORS[target_page] = regex1.findall(content) + regex2.findall(content)
-    return anchor in ANCHORS[target_page]
+        ANCHORS[target] = regex1.findall(content) + regex2.findall(content)
+    return anchor in ANCHORS[target]
+
 
 def check_apiref_target(target, anchor):
     """Check a link to an API reference page.
@@ -360,7 +366,8 @@ def check_apiref_target(target, anchor):
     :param target: The link target string to check
     :param anchor: Anchor string from the content page
     """
-    base = os.path.join(ROOT, "content", "en", "docs", "reference", "kubernetes-api")
+    base = os.path.join(ROOT, "content", "en", "docs", "reference",
+                        "kubernetes-api")
     ok = check_file_exists(base + "/", target)
     if not ok:
         return new_record("ERROR", "API reference page not found", target)
@@ -370,7 +377,9 @@ def check_apiref_target(target, anchor):
 
     target_page = os.path.join(base, target)+".md"
     if not check_anchor(target_page, anchor):
-        return new_record("ERROR", "Anchor not found in API reference page", target+"#"+anchor)
+        return new_record("ERROR", "Anchor not found in API reference page",
+                          target+"#"+anchor)
+
 
 def validate_links(page):
     """Find and validate links on a content page.
@@ -398,8 +407,8 @@ def validate_links(page):
             records.append(r)
 
     # searches for pattern: {{< api-reference page="" anchor=""
-    apiref_pattern = r"{{ *< *api-reference page=\"([^\"]*?)\" *anchor=\"(.*?)\""
-    regex = re.compile(apiref_pattern)
+    apiref_re = r"{{ *< *api-reference page=\"([^\"]*?)\" *anchor=\"(.*?)\""
+    regex = re.compile(apiref_re)
 
     matches = regex.findall(content)
     for m in matches:
@@ -408,8 +417,8 @@ def validate_links(page):
             records.append(r)
 
     # searches for pattern: {{< api-reference page=""
-    apiref_pattern = r"{{ *< *api-reference page=\"([^\"]*?)\""
-    regex = re.compile(apiref_pattern)
+    apiref_re = r"{{ *< *api-reference page=\"([^\"]*?)\""
+    regex = re.compile(apiref_re)
 
     matches = regex.findall(content)
     for m in matches:
@@ -426,31 +435,38 @@ def parse_arguments():
 
     Result is returned and saved into global variable ARGS.
     """
-    parser = argparse.ArgumentParser(description="Links checker for docs.")
-    parser.add_argument("-l", dest="lang", default="en", metavar="<LANG>",
-                        help=("two letter language code, e.g. 'zh'. "
-                              "(default='en')"))
-    parser.add_argument("-v", dest="verbose", action="store_true",
-                        help="switch on verbose level")
-    parser.add_argument("-f", dest="filter", default="/docs/**/*.md",
-                        metavar="<FILTER>",
-                        help=("File pattern to scan, e.g. '/docs/foo.md'. "
-                              "(default='/docs/**/*.md')"))
-    parser.add_argument("-n", "--no-color", action="store_true",
-                        help="Suppress colored printing.")
+    global PARSER
 
-    return parser.parse_args()
+    PARSER = argparse.ArgumentParser(description="Links checker for docs.")
+    PARSER.add_argument("-v", dest="verbose", action="store_true",
+                        help="switch on verbose level")
+    PARSER.add_argument("-n", "--no-color", action="store_true",
+                        help="Suppress colored printing.")
+    PARSER.add_argument("-f", dest="filter", default="content/en/docs/**/*.md",
+                        metavar="<FILTER>",
+                        help=("File pattern to scan. "
+                              "(default='content/en/docs/**/*.md')"))
+
+    return PARSER.parse_args()
 
 
 def main():
     """The main entry of the program."""
-    global ARGS, ROOT, REDIRECTS
+    global ARGS, ROOT, REDIRECTS, PARSER, LANG
 
     ARGS = parse_arguments()
-    print("Language: " + ARGS.lang)
     ROOT = os.path.join(os.path.dirname(__file__), '..')
-    content_dir = os.path.join(ROOT, 'content')
-    lang_dir = os.path.join(content_dir, ARGS.lang)
+
+    print(ARGS.filter)
+    parts = ARGS.filter.split("/", 2)
+    if len(parts) != 3 or parts[0] != "content":
+        print("ERROR:\nPlease specify file pattern in the format "
+              "'content/<lang>/<path-pattern>', for example:\n"
+              "'content/zh/docs/concepts/**/*.md'\n")
+        PARSER.print_help()
+        sys.exit(-1)
+
+    LANG = parts[1]
 
     # read redirects data
     redirects_fn = os.path.join(ROOT, "static", "_redirects")
@@ -473,7 +489,7 @@ def main():
         print("[Error] failed in reading redirects file: " + str(ex))
         return
 
-    folders = [f for f in glob.glob(lang_dir + ARGS.filter, recursive=True)]
+    folders = [f for f in glob.glob(ARGS.filter, recursive=True)]
     for page in folders:
         validate_links(page)