Add script for detecting bad characters.

Co-authored-by: Shu Muto <shu.mutow@nec.com>
pull/27315/head
s-kawamura-w664 2021-03-30 07:12:44 +00:00
parent 6d252624b2
commit 7fde0426df
2 changed files with 80 additions and 0 deletions

View File

@ -11,6 +11,7 @@
| `linkchecker.py` | This is a link checker for the Kubernetes documentation website. |
| `lsync.sh` | This script checks if the English version of a page has changed since a localized page has been committed. |
| `replace-capture.sh` | This script sets K8S_WEBSITE in your env to your docs website root or rely on this script to determine it automatically |
| `check-ctrlcode.py` | This script finds control codes (0x00-0x1f, except HT, LF, and CR) in text files. |
@ -152,3 +153,28 @@ The following command checks a subdirectory:
./scripts/lsync.sh content/zh/docs/concepts/
## check-ctrlcode.py
This script finds control codes (0x00-0x1f, except HT, LF, and CR) in text files.
These characters are displayed as illegal characters in a browser.
```
Usage: ./check-ctrlcode.py <dir> <ext>
<dir> Specify the directory to check.
<ext> Specify the extension to check.
For example, run the script as follows:
./check-ctrlcode.py ../content/en/ .md
The output has the following format:
"{0} <L{1}:{2}:{3}>: {4}"
{0} : The path of the file that contains a control code.
{1} : The line number where the control code was found.
{2} : The column number where the control code was found.
{3} : The control code that was found.
{4} : The text of the line that contains the control code.
```

54
scripts/check-ctrlcode.py Executable file
View File

@ -0,0 +1,54 @@
#!/usr/bin/env python3
import os
import sys
import re
def main():
    """Validate the command line and scan the requested directory tree.

    Expects exactly two command-line arguments: the directory to check and
    the file extension to look for. Prints a message and exits with status 1
    when the argument count is wrong or the directory does not exist;
    otherwise hands the absolute directory path and the extension to
    ``check_dir``.
    """
    argv = sys.argv
    if len(argv) != 3:
        print("Usage: ./check-ctrlcode.py <dir> <ext>")
        sys.exit(1)

    _, target, suffix = argv
    root = os.path.abspath(target)
    if not os.path.isdir(root):
        print("Directory not found.")
        sys.exit(1)

    check_dir(root, suffix)
def check_dir(path, ext):
    """Recursively scan *path* for files whose extension equals *ext*.

    Entries whose name starts with a dot are skipped. Sub-directories are
    descended into. Every other entry whose extension (as returned by
    ``os.path.splitext``, i.e. including the leading dot) equals *ext* is
    passed to ``check_ctrlcode``.
    """
    for name in os.listdir(path):
        if name.startswith("."):
            continue
        entry = os.path.join(path, name)
        if os.path.isdir(entry):
            check_dir(entry, ext)
        elif os.path.splitext(name)[1] == ext:
            check_ctrlcode(entry)
def check_ctrlcode(filepath):
    """Print every control code found in the UTF-8 text file *filepath*.

    The file is read line by line. For each character in the range
    0x00-0x1f other than 0x09 (HT), 0x0a (LF) and 0x0d (CR), one report is
    printed in the form::

        <filepath> <L<line>:<column>:<code>>: <line text>

    followed by a blank line. ``line`` and ``column`` are 1-based, ``code``
    is the UTF-8 encoded byte string of the character, and ``line text`` is
    the line with its newline removed.

    Every control code on a line is reported, not only the first one.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # 0x00-0x1f except 0x09 (HT), 0x0a (LF), 0x0d (CR). Compiled once per
    # file instead of once per line.
    pattern = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')
    with open(filepath, encoding='utf-8') as f:
        for lineno, text in enumerate(f, start=1):
            shown = text.replace('\n', '')
            # finditer (rather than search) so that several control codes on
            # the same line are all reported.
            for m in pattern.finditer(text):
                ctrl = m.group().encode("utf-8")
                # m.end() is start + 1 for a single-character match, which
                # is exactly the 1-based column of the offending character.
                print("{0} <L{1}:{2}:{3}>: {4}\n".format(
                    filepath, lineno, m.end(), ctrl, shown))
# Run the CLI only when executed as a script, so that importing this module
# does not parse sys.argv or call sys.exit as a side effect.
if __name__ == "__main__":
    main()