Improve the linkchecker script

The linkchecker script does not work the same way as `scripts/lsync.sh`.

- The path must start with '/docs'. This is not implied in any way.
- The language can be deduced if the user provides a full path to a markdown
  file, e.g. `content/en/docs/concepts/security/controlling-access.md`.
- The path parameter could use a positional argument for ease of use.

This PR improves the user experience for the tool.
pull/33134/head
Qiming Teng 2022-04-23 16:56:15 +08:00
parent 285affba7e
commit 95257a2edd
1 changed files with 63 additions and 47 deletions

View File

@ -1,13 +1,12 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# #
# This is a link checker for Kubernetes documentation website. # This is a link checker for Kubernetes documentation website.
# - We cover the following cases for the language you provide via `-l`, which #
# defaults to 'en'. # If the language to check is not English (`en`), we check if you are actually
# - If the language specified is not English (`en`), we check if you are # using the localized links. For example, if you are checking
# actually using the localized links. For example, if you specify `zh` as # `content/zh/docs/foo/bar`, we check if the English version exists AND if the
# the language, and for link target `/docs/foo/bar`, we check if the English # Chinese version exists as well. A checking record is produced if the link
# version exists AND if the Chinese version exists as well. A checking record # can use the localized version.
# is produced if the link can use the localized version.
# #
# Usage: linkchecker.py -h # Usage: linkchecker.py -h
# #
@ -64,12 +63,16 @@ BAD_LINK_TYPES = {
C_RED = "\033[31m" C_RED = "\033[31m"
C_GREEN = "\033[32m" C_GREEN = "\033[32m"
C_YELLOW = "\033[33m" C_YELLOW = "\033[33m"
C_GRAY = "\033[90m" C_GRAY = "\033[90m"
C_CYAN = "\033[36m" C_CYAN = "\033[36m"
C_END = "\033[0m" C_END = "\033[0m"
# Command line arguments shared across functions # Command line arguments shared across functions
ARGS = None ARGS = None
# Command line parser
PARSER = None
# Language as parsed from the file path
LANG = None
# Global result dictionary keyed by page examined # Global result dictionary keyed by page examined
RESULT = {} RESULT = {}
# Cached redirect entries # Cached redirect entries
@ -77,6 +80,7 @@ REDIRECTS = {}
# Cached anchors in target pages # Cached anchors in target pages
ANCHORS = {} ANCHORS = {}
def new_record(level, message, target): def new_record(level, message, target):
"""Create new checking record. """Create new checking record.
@ -89,7 +93,7 @@ def new_record(level, message, target):
global ARGS global ARGS
# Skip info when verbose # Skip info when verbose
if ARGS.verbose == False and level == "INFO": if ARGS.verbose is False and level == "INFO":
return None return None
result = None result = None
@ -98,9 +102,9 @@ def new_record(level, message, target):
else: else:
target = C_GRAY + target + C_END target = C_GRAY + target + C_END
if level == "INFO": if level == "INFO":
result = target + ": " + C_GREEN + message + C_END result = target + ": " + C_GREEN + message + C_END
elif level == "WARNING": elif level == "WARNING":
result = target + ": " + C_YELLOW+ message + C_END result = target + ": " + C_YELLOW + message + C_END
else: # default to error else: # default to error
result = target + ": " + C_RED + message + C_END result = target + ": " + C_RED + message + C_END
@ -286,7 +290,7 @@ def check_target(page, anchor, target):
# link to English or localized page # link to English or localized page
if (target.startswith("/docs/") or if (target.startswith("/docs/") or
target.startswith("/" + ARGS.lang + "/docs/")): target.startswith("/" + LANG + "/docs/")):
# target is shared reference (kubectl or kubernetes-api? # target is shared reference (kubectl or kubernetes-api?
if (target.find("/docs/reference/generated/kubectl/") >= 0 or if (target.find("/docs/reference/generated/kubectl/") >= 0 or
@ -305,22 +309,22 @@ def check_target(page, anchor, target):
if ok: if ok:
# We don't do additional checks for English site even if it has # We don't do additional checks for English site even if it has
# links to a non-English page # links to a non-English page
if ARGS.lang == "en": if LANG == "en":
return None return None
# If we are already checking localized link, fine # If we are already checking localized link, fine
if target.startswith("/" + ARGS.lang + "/docs/"): if target.startswith("/" + LANG + "/docs/"):
return None return None
# additional check for localization even if English target exists # additional check for localization even if English target exists
base = os.path.join(ROOT, "content", ARGS.lang) base = os.path.join(ROOT, "content", LANG)
found = check_file_exists(base, target) found = check_file_exists(base, target)
if not found: if not found:
# Still to be translated # Still to be translated
return None return None
msg = ("Localized page detected, please append '/%s' to the target" msg = ("Localized page detected, please append '/%s' to the target"
% ARGS.lang) % LANG)
return new_record("ERROR", "Link not using localized page", target) return new_record("ERROR", msg, target)
# target might be a redirect entry # target might be a redirect entry
real_target = get_redirect(target) real_target = get_redirect(target)
@ -333,15 +337,16 @@ def check_target(page, anchor, target):
msg = "Link may be wrong for the anchor [%s]" % anchor msg = "Link may be wrong for the anchor [%s]" % anchor
return new_record("WARNING", msg, target) return new_record("WARNING", msg, target)
def check_anchor(target_page, anchor):
def check_anchor(target, anchor):
"""Check if an anchor is defined in the target page """Check if an anchor is defined in the target page
:param target_page: The target page to check :param target: The target page to check
:param anchor: Anchor string to find in the target page :param anchor: Anchor string to find in the target page
""" """
if target_page not in ANCHORS: if target not in ANCHORS:
try: try:
with open(target_page, "r") as f: with open(target, "r") as f:
data = f.readlines() data = f.readlines()
except Exception as ex: except Exception as ex:
print("[Error] failed in reading markdown file: " + str(ex)) print("[Error] failed in reading markdown file: " + str(ex))
@ -351,8 +356,9 @@ def check_anchor(target_page, anchor):
regex1 = re.compile(anchor_pattern1) regex1 = re.compile(anchor_pattern1)
anchor_pattern2 = r"{#(.*?)}" anchor_pattern2 = r"{#(.*?)}"
regex2 = re.compile(anchor_pattern2) regex2 = re.compile(anchor_pattern2)
ANCHORS[target_page] = regex1.findall(content) + regex2.findall(content) ANCHORS[target] = regex1.findall(content) + regex2.findall(content)
return anchor in ANCHORS[target_page] return anchor in ANCHORS[target]
def check_apiref_target(target, anchor): def check_apiref_target(target, anchor):
"""Check a link to an API reference page. """Check a link to an API reference page.
@ -360,7 +366,8 @@ def check_apiref_target(target, anchor):
:param target: The link target string to check :param target: The link target string to check
:param anchor: Anchor string from the content page :param anchor: Anchor string from the content page
""" """
base = os.path.join(ROOT, "content", "en", "docs", "reference", "kubernetes-api") base = os.path.join(ROOT, "content", "en", "docs", "reference",
"kubernetes-api")
ok = check_file_exists(base + "/", target) ok = check_file_exists(base + "/", target)
if not ok: if not ok:
return new_record("ERROR", "API reference page not found", target) return new_record("ERROR", "API reference page not found", target)
@ -370,7 +377,9 @@ def check_apiref_target(target, anchor):
target_page = os.path.join(base, target)+".md" target_page = os.path.join(base, target)+".md"
if not check_anchor(target_page, anchor): if not check_anchor(target_page, anchor):
return new_record("ERROR", "Anchor not found in API reference page", target+"#"+anchor) return new_record("ERROR", "Anchor not found in API reference page",
target+"#"+anchor)
def validate_links(page): def validate_links(page):
"""Find and validate links on a content page. """Find and validate links on a content page.
@ -398,8 +407,8 @@ def validate_links(page):
records.append(r) records.append(r)
# searches for pattern: {{< api-reference page="" anchor="" # searches for pattern: {{< api-reference page="" anchor=""
apiref_pattern = r"{{ *< *api-reference page=\"([^\"]*?)\" *anchor=\"(.*?)\"" apiref_re = r"{{ *< *api-reference page=\"([^\"]*?)\" *anchor=\"(.*?)\""
regex = re.compile(apiref_pattern) regex = re.compile(apiref_re)
matches = regex.findall(content) matches = regex.findall(content)
for m in matches: for m in matches:
@ -408,8 +417,8 @@ def validate_links(page):
records.append(r) records.append(r)
# searches for pattern: {{< api-reference page="" # searches for pattern: {{< api-reference page=""
apiref_pattern = r"{{ *< *api-reference page=\"([^\"]*?)\"" apiref_re = r"{{ *< *api-reference page=\"([^\"]*?)\""
regex = re.compile(apiref_pattern) regex = re.compile(apiref_re)
matches = regex.findall(content) matches = regex.findall(content)
for m in matches: for m in matches:
@ -426,31 +435,38 @@ def parse_arguments():
Result is returned and saved into global variable ARGS. Result is returned and saved into global variable ARGS.
""" """
parser = argparse.ArgumentParser(description="Links checker for docs.") global PARSER
parser.add_argument("-l", dest="lang", default="en", metavar="<LANG>",
help=("two letter language code, e.g. 'zh'. "
"(default='en')"))
parser.add_argument("-v", dest="verbose", action="store_true",
help="switch on verbose level")
parser.add_argument("-f", dest="filter", default="/docs/**/*.md",
metavar="<FILTER>",
help=("File pattern to scan, e.g. '/docs/foo.md'. "
"(default='/docs/**/*.md')"))
parser.add_argument("-n", "--no-color", action="store_true",
help="Suppress colored printing.")
return parser.parse_args() PARSER = argparse.ArgumentParser(description="Links checker for docs.")
PARSER.add_argument("-v", dest="verbose", action="store_true",
help="switch on verbose level")
PARSER.add_argument("-n", "--no-color", action="store_true",
help="Suppress colored printing.")
PARSER.add_argument("-f", dest="filter", default="content/en/docs/**/*.md",
metavar="<FILTER>",
help=("File pattern to scan. "
"(default='content/en/docs/**/*.md')"))
return PARSER.parse_args()
def main(): def main():
"""The main entry of the program.""" """The main entry of the program."""
global ARGS, ROOT, REDIRECTS global ARGS, ROOT, REDIRECTS, PARSER, LANG
ARGS = parse_arguments() ARGS = parse_arguments()
print("Language: " + ARGS.lang)
ROOT = os.path.join(os.path.dirname(__file__), '..') ROOT = os.path.join(os.path.dirname(__file__), '..')
content_dir = os.path.join(ROOT, 'content')
lang_dir = os.path.join(content_dir, ARGS.lang) print(ARGS.filter)
parts = ARGS.filter.split("/", 2)
if len(parts) != 3 or parts[0] != "content":
print("ERROR:\nPlease specify file pattern in the format "
"'content/<lang>/<path-pattern>', for example:\n"
"'content/zh/docs/concepts/**/*.md'\n")
PARSER.print_help()
sys.exit(-1)
LANG = parts[1]
# read redirects data # read redirects data
redirects_fn = os.path.join(ROOT, "static", "_redirects") redirects_fn = os.path.join(ROOT, "static", "_redirects")
@ -473,7 +489,7 @@ def main():
print("[Error] failed in reading redirects file: " + str(ex)) print("[Error] failed in reading redirects file: " + str(ex))
return return
folders = [f for f in glob.glob(lang_dir + ARGS.filter, recursive=True)] folders = [f for f in glob.glob(ARGS.filter, recursive=True)]
for page in folders: for page in folders:
validate_links(page) validate_links(page)