Skip to content

Commit

Permalink
doFetch: Try a failed HEAD again
Browse files: browse the repository at this point in the history
  • Loading branch information
earthboundkid committed Feb 14, 2023
1 parent 45882b7 commit f9d209c
Showing 1 changed file with 43 additions and 35 deletions.
78 changes: 43 additions & 35 deletions linkcheck/linkcheck.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -225,45 +225,53 @@ func (c *crawler) doFetch(ctx context.Context, pageurl string) (links, ids []str
method = http.MethodHead
}
var doc html.Node
err = requests.
URL(pageurl).
Method(method).
Accept("text/html,application/xhtml+xml,application/xml,*/*").
UserAgent(c.userAgent).
Client(c.Client).
CheckStatus(http.StatusOK).
CheckContentType(
"text/html",
"application/xhtml+xml",
"text/xml",
"text/plain",
).
CheckPeek(512, func(b []byte) error {
if ct := http.DetectContentType(b); !strings.Contains(ct, "html") {
return fmt.Errorf("content-type is %s", ct)
}
return nil
}).
AddValidator(func(res *http.Response) error {
// If we've been 30X redirected, pageurl will not be response URL
pageurl = res.Request.URL.String()
return nil
}).
Handle(requests.ToHTML(&doc)).
Fetch(ctx)

if err != nil {
for {
err = requests.
URL(pageurl).
Method(method).
Accept("text/html,application/xhtml+xml,application/xml,*/*").
UserAgent(c.userAgent).
Client(c.Client).
CheckStatus(http.StatusOK).
CheckContentType(
"text/html",
"application/xhtml+xml",
"text/xml",
"text/plain",
).
CheckPeek(512, func(b []byte) error {
if ct := http.DetectContentType(b); !strings.Contains(ct, "html") {
return fmt.Errorf("content-type is %s", ct)
}
return nil
}).
AddValidator(func(res *http.Response) error {
// If we've been 30X redirected, pageurl will not be response URL
pageurl = res.Request.URL.String()
return nil
}).
Handle(requests.ToHTML(&doc)).
Fetch(ctx)
if method == http.MethodGet || err == nil {
break
}
method = http.MethodGet
}
switch {
case err == nil:
break
case
// report 401, 404, 410; ignore temporary status errors
if requests.HasStatusErr(err,
requests.HasStatusErr(err,
http.StatusUnauthorized,
http.StatusNotFound,
http.StatusGone) {
return nil, nil, err
}
http.StatusGone),
// Report DNS errors
if d := new(net.DNSError); errors.As(err, &d) {
return nil, nil, err
}
errors.As(err, new(*net.DNSError)):

return nil, nil, err

default:
// Ignore other errors
c.l.Debug("doFetch ignore", "url", pageurl, "err", err)
return nil, nil, nil
Expand Down

0 comments on commit f9d209c

Please sign in to comment.