Skip to content

Commit d5b4697

Browse files
authored
Merge pull request #373 from Webperf-se/issue-317
Issue 317
2 parents 0fd28c7 + cf6f251 commit d5b4697

File tree

1 file changed

+201
-94
lines changed

1 file changed

+201
-94
lines changed

tests/html_validator_w3c.py

+201-94
Original file line numberDiff line numberDiff line change
@@ -3,29 +3,33 @@
33
from datetime import datetime
44
import re
55
from models import Rating
6-
from tests.utils import get_config_or_default, get_friendly_url_name, get_translation, set_cache_file
6+
from tests.utils import get_config_or_default,\
7+
get_friendly_url_name,\
8+
get_translation,\
9+
set_cache_file
710
from tests.w3c_base import get_errors, identify_files
811
from tests.sitespeed_base import get_result
912

1013
# DEFAULTS
# Every setting below is read once, at import time, through get_config_or_default.
REQUEST_TIMEOUT = get_config_or_default('http_request_timeout')
USERAGENT = get_config_or_default('useragent')
REVIEW_SHOW_IMPROVEMENTS_ONLY = get_config_or_default('review_show_improvements_only')
SITESPEED_USE_DOCKER = get_config_or_default('sitespeed_use_docker')
SITESPEED_TIMEOUT = get_config_or_default('sitespeed_timeout')
USE_CACHE = get_config_or_default('cache_when_possible')
CACHE_TIME_DELTA = get_config_or_default('cache_time_delta')

# Validator messages that are ignored for every entry whose index is above 1
# (i.e. anything that is not the start webpage).
HTML_STRINGS = [
    'Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”',
    'Element “head” is missing a required instance of child element “title”.'
]
26+
27+
2028
def run_test(global_translation, lang_code, url):
2129
"""
2230
Only work on a domain-level. Returns tuple with decimal for grade and string with review
2331
"""
2432

25-
rating = Rating(global_translation, review_show_improvements_only)
26-
points = 0.0
27-
review = ''
28-
2933
local_translation = get_translation('html_validator_w3c', lang_code)
3034

3135
print(local_translation('TEXT_RUNNING_TEST'))
@@ -35,91 +39,19 @@ def run_test(global_translation, lang_code, url):
3539

3640
errors = []
3741

38-
# We don't need extra iterations for what we are using it for
39-
sitespeed_iterations = 1
40-
sitespeed_arg = '--shm-size=1g -b chrome --plugins.remove screenshot --plugins.remove html --plugins.remove metrics --browsertime.screenshot false --screenshot false --screenshotLCP false --browsertime.screenshotLCP false --chrome.cdp.performance false --browsertime.chrome.timeline false --videoParams.createFilmstrip false --visualMetrics false --visualMetricsPerceptual false --visualMetricsContentful false --browsertime.headless true --browsertime.chrome.includeResponseBodies all --utc true --browsertime.chrome.args ignore-certificate-errors -n {0}'.format(
41-
sitespeed_iterations)
42-
if 'nt' not in os.name:
43-
sitespeed_arg += ' --xvfb'
44-
45-
sitespeed_arg += ' --postScript chrome-cookies.cjs --postScript chrome-versions.cjs'
46-
47-
(_, filename) = get_result(
48-
url, sitespeed_use_docker, sitespeed_arg, sitespeed_timeout)
49-
50-
# 1. Visit page like a normal user
51-
data = identify_files(filename)
42+
data = get_data_for_url(url)
5243

44+
rating = Rating(global_translation, REVIEW_SHOW_IMPROVEMENTS_ONLY)
45+
points = 0.0
46+
review = ''
5347

48+
number_of_errors = 0
5449
for entry in data['htmls']:
55-
req_url = entry['url']
56-
name = get_friendly_url_name(global_translation, req_url, entry['index'])
57-
review_header = '- {0} '.format(name)
58-
html = entry['content']
59-
set_cache_file(req_url, html, True)
60-
61-
params = {'doc': req_url,
62-
'out': 'json',
63-
'level': 'error'}
64-
errors = get_errors('html', params)
65-
number_of_errors = len(errors)
66-
67-
68-
error_message_grouped_dict = {}
69-
if number_of_errors > 0:
70-
regex = r"(“[^”]+”)"
71-
for item in errors:
72-
error_message = item['message']
73-
74-
# Filter out CSS: entries that should not be here
75-
if error_message.startswith('CSS: '):
76-
number_of_errors -= 1
77-
continue
78-
79-
# Filter out start html document stuff if not start webpage
80-
if entry['index'] > 1:
81-
if 'Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”' in error_message:
82-
number_of_errors -= 1
83-
continue
84-
if 'Element “head” is missing a required instance of child element “title”.' in error_message:
85-
number_of_errors -= 1
86-
continue
87-
88-
error_message = re.sub(
89-
regex, "X", error_message, 0, re.MULTILINE)
90-
91-
if error_message_grouped_dict.get(error_message, False):
92-
error_message_grouped_dict[error_message] = error_message_grouped_dict[error_message] + 1
93-
else:
94-
error_message_grouped_dict[error_message] = 1
95-
96-
if len(error_message_grouped_dict) > 0:
97-
error_message_grouped_sorted = sorted(
98-
error_message_grouped_dict.items(), key=lambda x: x[1], reverse=True)
99-
100-
for item in error_message_grouped_sorted:
101-
102-
item_value = item[1]
103-
item_text = item[0]
104-
105-
review += local_translation('TEXT_REVIEW_ERRORS_ITEM').format(item_text, item_value)
106-
107-
number_of_error_types = len(error_message_grouped_dict)
108-
109-
result = calculate_rating(number_of_error_types, number_of_errors)
110-
111-
# if number_of_error_types > 0:
112-
error_types_rating = Rating(global_translation, review_show_improvements_only)
113-
error_types_rating.set_overall(result[0], review_header + local_translation('TEXT_REVIEW_RATING_GROUPED').format(
114-
number_of_error_types, 0.0))
115-
rating += error_types_rating
116-
117-
# if number_of_errors > 0:
118-
error_rating = Rating(global_translation, review_show_improvements_only)
119-
error_rating.set_overall(result[1], review_header + local_translation(
120-
'TEXT_REVIEW_RATING_ITEMS').format(number_of_errors, 0.0))
121-
rating += error_rating
50+
tmp_rating, tmp__errors = rate_entry(entry, global_translation, local_translation)
51+
rating += tmp_rating
52+
errors.extend(tmp__errors)
12253

54+
number_of_errors = len(errors)
12355

12456
points = rating.get_overall()
12557
rating.set_standards(points)
@@ -150,16 +82,191 @@ def run_test(global_translation, lang_code, url):
15082

15183
return (rating, errors)
15284

85+
def get_data_for_url(url):
    """
    Run Sitespeed.io once against `url` and return the files identified in its result.

    The Sitespeed.io argument string is assembled from fixed fragments
    (headless Chrome, response bodies included, certificate errors ignored,
    screenshots/visual metrics turned off). ` --xvfb` is added when `os.name`
    does not contain 'nt', and the two chrome post scripts are always appended.

    Parameters:
    url (str): The URL for which to retrieve data.

    Returns:
    The value returned by `identify_files` for the result file reported by
    `get_result` (callers read its 'htmls' key).
    """
    # We don't need extra iterations for what we are using it for
    iteration_count = 1

    # Fragments carry their own separating spaces, so they are joined with ''.
    arg_parts = [
        '--shm-size=1g -b chrome ',
        '--plugins.remove screenshot --plugins.remove html --plugins.remove metrics ',
        '--browsertime.screenshot false --screenshot false --screenshotLCP false ',
        '--browsertime.screenshotLCP false --chrome.cdp.performance false ',
        '--browsertime.chrome.timeline false --videoParams.createFilmstrip false ',
        '--visualMetrics false --visualMetricsPerceptual false ',
        '--visualMetricsContentful false --browsertime.headless true ',
        '--browsertime.chrome.includeResponseBodies all --utc true ',
        '--browsertime.chrome.args ignore-certificate-errors ',
        f'-n {iteration_count}',
    ]
    if 'nt' not in os.name:
        arg_parts.append(' --xvfb')
    arg_parts.append(' --postScript chrome-cookies.cjs --postScript chrome-versions.cjs')

    # 1. Visit page like a normal user
    _, result_file = get_result(
        url, SITESPEED_USE_DOCKER, ''.join(arg_parts), SITESPEED_TIMEOUT)
    return identify_files(result_file)
124+
125+
def rate_entry(entry, global_translation, local_translation):
    """
    Rates one HTML entry based on the number and types of validator errors.

    The entry content is handed to `set_cache_file` under the entry URL, the
    URL is validated through `get_errors('html', ...)`, and the resulting
    errors are grouped by message. Two sub-ratings (number of error types and
    number of errors) are computed with `calculate_rating` and added together.

    Parameters:
    entry (dict): Entry with at least the keys 'url', 'content' and 'index'.
    global_translation (function): A function for translating text globally.
    local_translation (function): A function for translating text locally.

    Returns:
    tuple: (Rating object for this entry, unfiltered list of errors returned
    by `get_errors`).
    """
    rating = Rating(global_translation, REVIEW_SHOW_IMPROVEMENTS_ONLY)

    req_url = entry['url']
    name = get_friendly_url_name(global_translation, req_url, entry['index'])
    review_header = f'- {name} '

    set_cache_file(req_url, entry['content'], True)

    errors = get_errors('html',
                        {
                            'doc': req_url,
                            'out': 'json',
                            'level': 'error'
                        })

    error_message_grouped_dict = {}
    if errors:
        error_message_grouped_dict = get_grouped_error_messages(
            entry,
            local_translation,
            errors,
            len(errors))

    number_of_error_types = len(error_message_grouped_dict)
    # Count only the errors that survived filtering. The helper receives the
    # count by value, so its own decrements never reach this function; the sum
    # of the grouped occurrences is the filtered total.
    number_of_errors = sum(error_message_grouped_dict.values())
    result = calculate_rating(number_of_error_types, number_of_errors)

    error_types_rating = Rating(global_translation, REVIEW_SHOW_IMPROVEMENTS_ONLY)
    error_types_rating.set_overall(
        result[0],
        review_header + local_translation('TEXT_REVIEW_RATING_GROUPED').format(
            number_of_error_types,
            0.0))
    rating += error_types_rating

    error_rating = Rating(global_translation, REVIEW_SHOW_IMPROVEMENTS_ONLY)
    error_rating.set_overall(
        result[1],
        review_header + local_translation('TEXT_REVIEW_RATING_ITEMS').format(
            number_of_errors,
            0.0))
    rating += error_rating
    return (rating, errors)
182+
183+
def get_grouped_error_messages(entry, local_translation, errors, number_of_errors):
    """
    Groups HTML error messages and counts their occurrences.

    Messages starting with 'CSS: ' are skipped. For entries whose index is
    above 1, messages containing any of HTML_STRINGS are skipped as well.
    In the remaining messages every “quoted” fragment is replaced by X so
    that messages differing only in the quoted value share one group.

    Parameters:
    entry (dict): Entry details; only entry['index'] is read here.
    local_translation (function): Not used; kept so existing callers keep working.
    errors (list): Error dicts, each with a 'message' key.
    number_of_errors (int): Not used; kept so existing callers keep working.
        Decrementing it here could never affect the caller's value.

    Returns:
    dict: Normalised error message -> number of occurrences, in order of
    first appearance.
    """
    error_message_grouped_dict = {}
    regex = r"(“[^”]+”)"
    for item in errors:
        error_message = item['message']

        # Filter out CSS: entries that should not be here
        if error_message.startswith('CSS: '):
            continue

        # Filter out start html document stuff if not start webpage
        if entry['index'] > 1 and any(
                html_str in error_message for html_str in HTML_STRINGS):
            continue

        # count/flags are passed by keyword; positional use is deprecated.
        error_message = re.sub(
            regex, "X", error_message, count=0, flags=re.MULTILINE)

        error_message_grouped_dict[error_message] = \
            error_message_grouped_dict.get(error_message, 0) + 1

    # The former review-text loop was removed: it appended to a local that was
    # never initialised (UnboundLocalError) and its result was never returned.
    return error_message_grouped_dict
245+
153246

154247
def calculate_rating(number_of_error_types, number_of_errors):
    """
    Calculates ratings based on the number of error types and errors.

    Both ratings start at 5.0 and decrease as the counts grow: every error
    type costs 0.2 and every error costs 0.1. Neither rating goes below 1.0.

    Parameters:
    number_of_error_types (int): The number of different types of errors.
    number_of_errors (int): The total number of errors.

    Returns:
    tuple: (rating based on the number of error types,
    rating based on the total number of errors).
    """
    rating_number_of_error_types = max(5.0 - (number_of_error_types / 5.0), 1.0)
    rating_number_of_errors = max(5.0 - ((number_of_errors / 2.0) / 5.0), 1.0)

    return (rating_number_of_error_types, rating_number_of_errors)

0 commit comments

Comments
 (0)