
Commit

Merge pull request #5801 from cmaf/static-checks-user-agents-url
static-checks: Try multiple user agents
fidencio authored Dec 7, 2023
2 parents ef6a9c7 + da945ba commit 46da907
Showing 1 changed file with 151 additions and 39 deletions.
190 changes: 151 additions & 39 deletions .ci/static-checks.sh
@@ -481,6 +481,41 @@ EOF
	done
}


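# Check a single URL by running curl and capturing the response headers.
#
# Arguments:
#
# $1 - URL to check (required).
# $2 - file in which to capture the curl output (required).
# $@ - any further arguments are passed to curl verbatim (optional).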
run_url_check_cmd()
{
	local url="${1:-}"
	[ -n "$url" ] || die "need URL"

	local out_file="${2:-}"
	[ -n "$out_file" ] || die "need output file"

	# Any remaining arguments are optional and are passed to curl
	# verbatim (as separate words, so quoting is preserved).
	shift 2

	local -a curl_extra_args=("$@")

	# Authenticate for github to increase the threshold for rate limiting
	if [[ "$url" =~ github\.com && -n "$GITHUB_USER" && -n "$GITHUB_TOKEN" ]]; then
		curl_extra_args+=("-u" "${GITHUB_USER}:${GITHUB_TOKEN}")
	fi

	# Some endpoints return 403 to HEAD but 200 for GET,
	# so perform a GET but only read headers.
	curl \
		"${curl_extra_args[@]}" \
		-sIL \
		-X GET \
		-c - \
		-H "Accept-Encoding: zstd, none, gzip, deflate" \
		--max-time "$url_check_timeout_secs" \
		--retry "$url_check_max_tries" \
		"$url" \
		&>"$out_file"
}
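
# Example usage (hypothetical, for illustration only; assumes the
# script's $url_check_timeout_secs and $url_check_max_tries globals
# are set):
#
#   out=$(mktemp)
#   run_url_check_cmd "https://github.com/kata-containers" "$out" -A "curl"
#   grep -E "^HTTP" "$out"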

check_url()
{
	local url="$1"
@@ -495,56 +530,133 @@ check_url()
	local invalid_file=$(printf "%s/%d" "$invalid_urls_dir" "$$")

	local ret

	local -a errors=()

	local -a user_agents=()

	# Test an unspecified UA (curl default)
	user_agents+=('')

	# Test an explicitly blank UA
	user_agents+=('""')

	# Single space
	user_agents+=(' ')

	# CLI HTTP tools
	user_agents+=('Wget')
	user_agents+=('curl')

	# Console-based browsers
	# Hopefully, these will always be supported for a11y.
	user_agents+=('Lynx')
	user_agents+=('Elinks')

	# Emacs' w3m browser
	user_agents+=('Emacs')

	# The full craziness
	user_agents+=('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36')

	local user_agent

	local success='false'
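
	# The agents are tried in the order they were added above:
	# simplest first, ending with the full browser string.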

	# Cycle through the user agents until we find one that works.
	#
	# Note that we also test an unspecified user agent
	# (no '-A <value>').
	for user_agent in "${user_agents[@]}"
	do
		# Pass the user agent as separate array elements ("-A" and
		# its value) so multi-word agent strings are not split.
		local -a curl_ua_args=()
		[ -n "$user_agent" ] && curl_ua_args=("-A" "$user_agent")

		{ run_url_check_cmd "$url" "$curl_out" "${curl_ua_args[@]}"; ret=$?; } || true

		# A transitory error, or the URL is incorrect,
		# but capture either way.
		if [ "$ret" -ne 0 ]; then
			echo "$url" >> "${invalid_file}"
			errors+=("Failed to check URL '$url' (user agent: '$user_agent', return code $ret)")

			# Give up
			break
		fi

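		# Note: with "-IL", curl prints one set of response headers
		# per redirect hop, so several "HTTP/..." status lines may
		# have been captured.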
		local http_statuses

		http_statuses=$(grep -E "^HTTP" "$curl_out" |
			awk '{print $2}' || true)

		if [ -z "$http_statuses" ]; then
			echo "$url" >> "${invalid_file}"
			errors+=("no HTTP status codes for URL '$url' (user agent: '$user_agent')")

			continue
		fi

		local status

		local -i fail_count=0

		# Check all HTTP status codes
		for status in $http_statuses
		do
			# Ignore the following ranges of status codes:
			#
			# - 1xx: Informational codes.
			# - 2xx: Success codes.
			# - 3xx: Redirection codes.
			# - 405: Specifically to handle some sites
			#   which get upset by "curl -L" when the
			#   redirection is not required.
			#
			# Anything else is considered an error.
			#
			# See https://en.wikipedia.org/wiki/List_of_HTTP_status_codes

			{ grep -qE "^(1[0-9][0-9]|2[0-9][0-9]|3[0-9][0-9]|405)" <<< "$status"; ret=$?; } || true

			[ "$ret" -eq 0 ] && continue

			if grep -q '^4' <<< "$status"; then
				case "$status" in
					# Awkward: these codes signify that the URL is
					# valid, but we can't check them:
					#
					# 401: Unauthorized (need to log in to the site).
					# 402: Payment Required.
					# 403: Forbidden (possibly the same as 401).
					#
					# If they did not exist, we'd get a 404, so since
					# they have been "semi validated" by the server
					# simply by returning these codes, we'll assume
					# they are valid.
					401|402|403) success='true' ;;
				esac

				# Client error: most likely the document isn't valid
				# or the server won't give us access, so there is no
				# point trying other user agents [*].
				# ---
				# [*] - we assume the server is not returning this
				# error code _based_ on the UA presented, which
				# theoretically it _could_.
				break 2
			else
				fail_count+=1

				echo "$url" >> "$invalid_file"
				errors+=("found HTTP error status codes for URL '$url' (status: '$status', user agent: '$user_agent')")
			fi
		done

		# Every status code for this user agent was acceptable.
		[ "$fail_count" -eq 0 ] && success='true' && break
	done

	[ "$success" = 'true' ] && return 0

	die "failed to check URL '$url': errors: '${errors[*]}'"
}
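
# Example of the status-code filtering used above (hypothetical, for
# illustration): 1xx/2xx/3xx and 405 pass, while other codes take the
# error path.
#
#   grep -qE "^(1[0-9][0-9]|2[0-9][0-9]|3[0-9][0-9]|405)" <<< "301" && echo ok
#   grep -qE "^(1[0-9][0-9]|2[0-9][0-9]|3[0-9][0-9]|405)" <<< "404" || echo error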

# Perform basic checks on documentation files
@@ -768,7 +880,7 @@ static_check_docs()
	then
		local files

		sort -u "$invalid_urls" | while read -r url
		do
			files=$(grep "^${url}" "$url_map" | awk '{print $2}' | sort -u)
			echo >&2 -e "ERROR: Invalid URL '$url' found in the following files:\n"
