-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathchecks-checker
executable file
·261 lines (218 loc) · 9.24 KB
/
checks-checker
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#!/usr/bin/env python3
import argparse
import logging
import re
import sys
import types
from urllib.parse import urlparse, parse_qs
import glob
import os
import requests
import yaml
from bs4 import BeautifulSoup
# URL pattern taken from https://www.geeksforgeeks.org/python-check-url-string/
# Compiled once at import time; the outermost capture group is the whole URL.
_URL_PATTERN = re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")


def getUrlsFromText(text) -> set:
    """Returns URLs found in a text

    Parameters:
    text (string): The text to search for URLs. None or an empty string
                   yields an empty set.
    Returns:
    urls (set): The set of URLs that have been found in the text
    """
    # An empty YAML value is parsed as None; there is nothing to search then
    if not text:
        return set()
    # findall() returns one tuple per match with one entry per capture group;
    # the first entry is the outermost group, i.e. the complete URL
    return {groups[0] for groups in _URL_PATTERN.findall(text)}
def checkUrl(url: str, timeout: float = 30) -> tuple:
    """Checks an URL for validity

    Formal checks on the URL itself are done first. Afterwards the URL is
    fetched and the returned document is checked for the fragment and the
    canonical link.

    Parameters:
    url (string): The URL to check
    timeout (float): Seconds to wait for the server before giving up
    Returns:
    (errors, warnings): two lists of message strings
    """
    warnings = []
    errors = []
    parts = urlparse(url)

    # formal checks first
    if parts.query:
        query = parse_qs(parts.query)
        # parse_qs() only holds the keys present in the query string, so use
        # get() instead of indexing to avoid a KeyError when "hl" is absent
        if parts.hostname == "cloud.google.com" and query.get('hl'):
            errors.append(f"Don't request a specific document language (?hl={query['hl'][0]}) in {url}")

    if parts.hostname == "documentation.suse.com":
        # We should/must use the single-html version for SUSE documentation
        # Otherwise links might break if the document structure changes
        if "/single-html/" not in parts.path:
            errors.append(f"Use the single-html version of the document instead! {url}")
        if not parts.fragment:
            errors.append(f"no fragment used in {url}")

    # After the formal checks, try to actually access the URL
    try:
        response = requests.get(url, timeout=timeout)
    except requests.ConnectionError:
        errors.append(f"Connection failed for {url}")
        return (errors, warnings)
    except requests.RequestException as exc:
        # Covers timeouts, too many redirects and malformed URLs
        # (e.g. a URL without scheme), which would abort the whole run otherwise
        errors.append(f"Request failed for {url}: {exc}")
        return (errors, warnings)

    if not response.ok:
        errors.append(f"status {response.status_code} returned for {url}")
        return (errors, warnings)

    # If a URL gets redirected, we might want to
    # change the documentation link to the redirection target.
    if response.url != url:
        warnings.append(f"got redirected to {response.url} from {url}")

    # Parse the document
    soup = BeautifulSoup(response.text, 'html.parser')

    # If the URL has a fragment, look for it in the document
    if parts.fragment:
        anchor = soup.find(id=parts.fragment)
        if anchor is None:
            # Not an id, trying the name attribute. It has to be passed via
            # attrs, because the "name" keyword of find() is the tag name.
            anchor = soup.find(attrs={"name": parts.fragment})
        if anchor is None:
            errors.append(f"fragment \"{parts.fragment}\" not found in {url}")
            return (errors, warnings)

    # get the canonical link of the document
    canonical_links = soup.select('head link[rel*=canonical]')

    # canonical_links might be empty, and the link might lack a href.
    # has_attr() is needed here: "in" on a tag looks at its children.
    # Sometimes documents have multiple instances of canonical links;
    # only the first one is used.
    if canonical_links and canonical_links[0].has_attr('href'):
        canonical_url = canonical_links[0]['href']

        # for suse documentation, remove the index.html suffix (probably works for everyone else too)
        if parts.hostname == "documentation.suse.com":
            # Python 3.9 and newer: canonical_url.removesuffix('index.html')
            if canonical_url.endswith('index.html'):
                canonical_url = canonical_url[:-len('index.html')]

        # add the fragment to the canonical URL, if any
        if parts.fragment:
            canonical_url += "#" + parts.fragment

        # Complain if the URL used is not the canonical one;
        # this is an error for SUSE documentation and a warning otherwise
        if canonical_url != url:
            message = f"Not the canonical URL! Use {canonical_url} instead of {url}"
            if parts.hostname == "documentation.suse.com":
                errors.append(message)
            else:
                warnings.append(message)

    return (errors, warnings)
def checkGetDocuUrls(check: dict) -> int:
    """Prints the URLs found in the remediation text of a check

    Parameters:
    check (dict): The check to list the URLs of
    Returns:
    errorcount (int): always 0, listing URLs never counts as an error
    """
    try:
        remediation = check['remediation']
    except KeyError:
        logging.info(f"Check {check['id']} has no remediation section, skipping check")
        return 0

    found = getUrlsFromText(remediation)
    logging.debug(f"{len(found)} URLs found")
    # One output line per URL, prefixed with the id of the check
    for link in found:
        print(f"{check['id']}: {link}")
    return 0
def checkCheckUrls(check: dict) -> int:
    """Checks the URLs in the remediation text of a check

    Warnings and errors reported for a URL are written to the log.

    Parameters:
    check (dict): The check to check the URLs of
    Returns:
    errorcount (int): returns the number of errors encountered, if any
    """
    try:
        remediation = check['remediation']
    except KeyError:
        logging.info(f"Check {check['id']} has no remediation section, skipping check")
        return 0

    found = getUrlsFromText(remediation)
    logging.debug(f"{len(found)} URLs found")

    total = 0
    for link in found:
        problems, notices = checkUrl(link)
        for notice in notices:
            logging.warning(f"check {check['id']}: {notice}")
        for problem in problems:
            logging.error(f"check {check['id']}: {problem}")
        # Only errors count towards the result, warnings are just logged
        total += len(problems)
    return total
def checkCheck(check: dict) -> int:
    """Runs all available checks on a check

    At the moment only the URLs of the check are checked.

    Parameters:
    check (dict): The check to check
    Returns:
    errorcount (int): returns the number of errors encountered, if any
    """
    logging.info(f"checking check {check['id']}")
    return checkCheckUrls(check)
def getChecksfromCheckFile(filename) -> list:
    """parses a checks file and returns a list of the contained check(s) as dictionary

    Files that are missing, not decodable or not valid YAML are skipped
    with a debug message. Documents that are not a mapping with an "id"
    key are skipped as well.

    Parameters:
    filename (string): name of the checks file to parse
    Returns:
    checks (list): checks contained in the parsed file (list of dicts)
    """
    logging.debug(f"processing file {filename}")
    checks = []
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            # safe_load_all() parses lazily, so parse errors are raised
            # while iterating and have to be caught by this try block
            for check in yaml.safe_load_all(file):
                if isinstance(check, str):
                    logging.debug("File is not YAML, skipping")
                    continue
                if not isinstance(check, dict):
                    # e.g. an empty document (None), a list or a number
                    logging.debug("Document is not a mapping, skipping")
                    continue
                if 'id' not in check:
                    logging.debug("Check has no id property, skipping")
                    continue
                checks.append(check)
    except FileNotFoundError:
        logging.debug(f"file \"{filename}\" not found, skipping")
    except (yaml.YAMLError, UnicodeDecodeError):
        # YAMLError is the base class of all PyYAML errors;
        # UnicodeDecodeError is raised for binary files
        logging.debug(f"file \"{filename}\" not a valid YAML file, skipping")
    logging.debug(f"found {len(checks)} checks in file \"{filename}\"")
    return checks
def main():
    """Parses the command line and processes all checks of the given files

    Depending on the --doc-urls option, the URLs of each check are either
    printed or checked. The script always exits with status 0, or with 255
    if an invalid log level was given.
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Check Wanda checks')
    parser.add_argument(dest='filenames', metavar='filename', nargs='+', help='checks file')
    parser.add_argument('-l', '--log-level', dest='loglevel', default='WARNING', action='store', help='set log level')
    parser.add_argument('-m', '--match', dest='match', default='', action='store', help='only process files matching this glob')
    parser.add_argument('-d', '--doc-urls', dest='doc_urls', default=False, action='store_true', help='only print documentation URLs')
    args = parser.parse_args()

    # Set up logging
    logger = logging.getLogger()
    logFormatter = logging.Formatter(fmt='%(levelname)s: %(message)s')
    logHandler = logging.StreamHandler()
    logHandler.setFormatter(logFormatter)
    logger.addHandler(logHandler)
    try:
        # Level names are upper case, accept "debug" as well as "DEBUG"
        logger.setLevel(args.loglevel.upper())
    except ValueError:
        logging.critical("Invalid log level: {} (valid levels: DEBUG, INFO, WARNING, ERROR)".format(args.loglevel))
        sys.exit(255)

    # Expand the --match glob only once instead of once per candidate file
    if args.match != '':
        matchingFiles = set(glob.iglob(args.match, recursive=True))
    else:
        matchingFiles = None

    allErrors = 0

    # Check all files given as parameters
    for pattern in args.filenames:
        for filename in glob.iglob(pattern, recursive=True):
            if matchingFiles is not None and filename not in matchingFiles:
                logging.debug(f"file {filename} does not match glob {args.match}, skipping")
                continue
            if os.path.isdir(filename):
                logging.debug(f"file {filename} is a directory, skipping")
                continue
            try:
                checks = getChecksfromCheckFile(filename)
            except IsADirectoryError:
                # The path turned out to be a directory when it was opened
                logging.debug(f"file {filename} is a directory, skipping")
                continue
            for check in checks:
                if args.doc_urls:
                    allErrors += checkGetDocuUrls(check)
                else:
                    allErrors += checkCheck(check)

    logging.shutdown()

    # The exit status is always 0 for now, errors are only reported in the log.
    # To return 1 if any errors occurred, use sys.exit(int(allErrors > 0)).
    sys.exit(0)
# ---------------------------------
# Run main() only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()