
Commit b048d54

v0.8.3: Fix link checker bug for absolute links w/ anchors
1 parent eb7b2d1 commit b048d54

File tree

2 files changed: +131, -120 lines changed

dactyl/dactyl_link_checker.py (+130, -119)
```diff
@@ -81,129 +81,140 @@ def checkLinks(offline=False):
     num_links_checked = 0
     last_checkin = time()
     for dirpath, dirnames, filenames in os.walk(config["out_path"]):
-        if time() - last_checkin > CHECK_IN_INTERVAL:
-            ## Print output periodically so Jenkins/etc. don't kill the job
-            last_checkin = time()
-            print("... still working (dirpath: %s) ..." % dirpath)
-        if "template_path" in config and \
-                os.path.abspath(dirpath) == os.path.abspath(config["template_path"]):
-            # don't try to parse and linkcheck the templates
-            logger.warning("Skipping link checking for template path %s" % dirpath)
-            continue
-        for fname in filenames:
             if time() - last_checkin > CHECK_IN_INTERVAL:
-                last_checkin = time()
-                print("... still working (file: %s) ..." % fname)
-            fullPath = os.path.join(dirpath, fname)
-            if "/node_modules/" in fullPath or ".git" in fullPath:
-                logger.debug("skipping ignored dir: %s" % fullPath)
-                continue
-            if fullPath.endswith(".html"):
-                soup = getSoup(fullPath)
-                unparsed_links = check_for_unparsed_reference_links(soup)
-                if unparsed_links:
-                    logger.warning("Found %d unparsed Markdown reference links: %s" %
-                        (len(unparsed_links), "\n... ".join(unparsed_links)))
-                    [broken_links.append( (fullPath, u) ) for u in unparsed_links]
-                links = soup.find_all('a')
-                for link in links:
+            ## Print output periodically so Jenkins/etc. don't kill the job
+            last_checkin = time()
+            print("... still working (dirpath: %s) ..." % dirpath)
+        if "template_path" in config and \
+                os.path.abspath(dirpath) == os.path.abspath(config["template_path"]):
+            # don't try to parse and linkcheck the templates
+            logger.warning("Skipping link checking for template path %s" % dirpath)
+            continue
+        for fname in filenames:
             if time() - last_checkin > CHECK_IN_INTERVAL:
-                        last_checkin = time()
-                        print("... still working (link: %s) ..." % link)
-                    if "href" not in link.attrs:
-                        #probably an <a name> type anchor, skip
-                        continue
-
-                    endpoint = link['href']
-                    if not endpoint.strip():
-                        logger.warning("Empty link in %s" % fullPath)
-                        broken_links.append( (fullPath, endpoint) )
-                        num_links_checked += 1
-
-                    elif endpoint == "#":
-                        continue
-
-                    elif "mailto:" in endpoint:
-                        logger.info("Skipping email link in %s to %s"%(fullPath, endpoint))
-                        continue
-
-                    elif "://" in endpoint:
-                        if offline:
-                            logger.info("Offline - Skipping remote URL %s"%(endpoint))
-                            continue
-
-                        num_links_checked += 1
-                        check_remote_url(endpoint, fullPath, broken_links, externalCache)
-
-
-                    elif '#' in endpoint:
-                        if fname in config["ignore_anchors_in"]:
-                            logger.info("Ignoring anchor %s in dynamic page %s"%(endpoint,fname))
-                            continue
-                        logger.info("Testing local link %s from %s"%(endpoint, fullPath))
-                        num_links_checked += 1
-                        filename,anchor = endpoint.split("#",1)
-                        if filename == "":
-                            fullTargetPath = fullPath
-                        else:
-                            fullTargetPath = os.path.join(dirpath, filename)
-                        if not os.path.exists(fullTargetPath):
-                            logger.warning("Broken local link in %s to %s"%(fullPath, endpoint))
-                            broken_links.append( (fullPath, endpoint) )
-
-                        elif filename in config["ignore_anchors_in"]:
-                            #Some pages are populated dynamically, so BeatifulSoup wouldn't
-                            # be able to find anchors in them anyway
-                            logger.info("Skipping anchor link in %s to dynamic page %s" %
-                                (fullPath, endpoint))
-                            continue
-
-                        elif fullTargetPath != "../":
-                            num_links_checked += 1
-                            targetSoup = getSoup(fullTargetPath)
-                            if not targetSoup.find(id=anchor) and not targetSoup.find(
-                                "a",attrs={"name":anchor}):
-                                logger.warning("Broken anchor link in %s to %s"%(fullPath, endpoint))
-                                broken_links.append( (fullPath, endpoint) )
-                            else:
-                                logger.info("...anchor found.")
-                                continue
+                last_checkin = time()
+                print("... still working (file: %s) ..." % fname)

-                    elif endpoint[0] == '/':
-                        #can't really test links out of the local field
-                        logger.info("Skipping absolute link in %s to %s"%(fullPath, endpoint))
-                        continue
-
-                    else:
-                        num_links_checked += 1
-                        if not os.path.exists(os.path.join(dirpath, endpoint)):
-                            logger.warning("Broken local link in %s to %s"%(fullPath, endpoint))
-                            broken_links.append( (fullPath, endpoint) )
-
-                #Now check images
-                imgs = soup.find_all('img')
-                for img in imgs:
-                    num_links_checked += 1
-                    if "src" not in img.attrs or not img["src"].strip():
-                        logger.warning("Broken image with no src in %s" % fullPath)
-                        broken_links.append( (fullPath, img["src"]) )
-                        continue
-
-                    src = img["src"]
-                    if "://" in src:
-                        if offline:
-                            logger.info("Offline - Skipping remote image %s"%(endpoint))
+            fullPath = os.path.join(dirpath, fname)
+            if "/node_modules/" in fullPath or ".git" in fullPath:
+                logger.debug("skipping ignored dir: %s" % fullPath)
                 continue
-
-                        check_remote_url(src, fullPath, broken_links, externalCache, isImg=True)
-
-                    else:
-                        logger.info("Checking local image %s in %s" % (src, fullPath))
-                        if os.path.exists(os.path.join(dirpath, src)):
-                            logger.info("...success")
-                        else:
-                            logger.warning("Broken local image %s in %s" % (src, fullPath))
-                            broken_links.append( (fullPath, src) )
+            if fullPath.endswith(".html"):
+                soup = getSoup(fullPath)
+                unparsed_links = check_for_unparsed_reference_links(soup)
+                if unparsed_links:
+                    logger.warning("Found %d unparsed Markdown reference links: %s" %
+                        (len(unparsed_links), "\n... ".join(unparsed_links)))
+                    [broken_links.append( (fullPath, u) ) for u in unparsed_links]
+                links = soup.find_all('a')
+                for link in links:
+                    if time() - last_checkin > CHECK_IN_INTERVAL:
+                        last_checkin = time()
+                        print("... still working (link: %s) ..." % link)
+                    if "href" not in link.attrs:
+                        #probably an <a name> type anchor, skip
+                        continue
+
+                    endpoint = link['href']
+                    if not endpoint.strip():
+                        logger.warning("Empty link in %s" % fullPath)
+                        broken_links.append( (fullPath, endpoint) )
+                        num_links_checked += 1
+
+                    elif endpoint == "#":
+                        continue
+
+                    elif "mailto:" in endpoint:
+                        logger.warning("Skipping email link in %s to %s" %
+                            (fullPath, endpoint))
+                        continue
+
+                    elif endpoint[0] == '/':
+                        # Can't properly test absolute links without knowing where the
+                        # server root will be, so skip this
+                        logger.warning("Skipping absolute link in %s to %s" %
+                            (fullPath, endpoint))
+                        continue
+
+                    elif "://" in endpoint:
+                        if offline:
+                            logger.info("Offline - Skipping remote URL %s" % (endpoint))
+                            continue
+
+                        num_links_checked += 1
+                        check_remote_url(endpoint, fullPath, broken_links, externalCache)
+
+
+                    elif '#' in endpoint:
+                        if fname in config["ignore_anchors_in"]:
+                            logger.warning("Ignoring anchor %s in dynamic page %s" %
+                                (endpoint,fname))
+                            continue
+                        logger.info("Testing local link %s from %s" %
+                            (endpoint, fullPath))
+                        num_links_checked += 1
+                        filename,anchor = endpoint.split("#",1)
+                        if filename == "":
+                            fullTargetPath = fullPath
+                        else:
+                            fullTargetPath = os.path.join(dirpath, filename)
+                        if not os.path.exists(fullTargetPath):
+                            logger.warning("Broken local link in %s to %s" %
+                                (fullPath, endpoint))
+                            broken_links.append( (fullPath, endpoint) )
+
+                        elif filename in config["ignore_anchors_in"]:
+                            #Some pages are populated dynamically, so BeatifulSoup wouldn't
+                            # be able to find anchors in them anyway
+                            logger.info("Skipping anchor link in %s to ignored page %s" %
+                                (fullPath, endpoint))
+                            continue
+
+                        elif fullTargetPath != "../":
+                            num_links_checked += 1
+                            targetSoup = getSoup(fullTargetPath)
+                            if not targetSoup.find(id=anchor) and not targetSoup.find(
+                                "a",attrs={"name":anchor}):
+                                logger.warning("Broken anchor link in %s to %s" %
+                                    (fullPath, endpoint))
+                                broken_links.append( (fullPath, endpoint) )
+                            else:
+                                logger.info("...anchor found.")
+                                continue
+
+                    else:
+                        num_links_checked += 1
+                        if not os.path.exists(os.path.join(dirpath, endpoint)):
+                            logger.warning("Broken local link in %s to %s" %
+                                (fullPath, endpoint))
+                            broken_links.append( (fullPath, endpoint) )
+
+                #Now check images
+                imgs = soup.find_all('img')
+                for img in imgs:
+                    num_links_checked += 1
+                    if "src" not in img.attrs or not img["src"].strip():
+                        logger.warning("Broken image with no src in %s" % fullPath)
+                        broken_links.append( (fullPath, img["src"]) )
+                        continue
+
+                    src = img["src"]
+                    if "://" in src:
+                        if offline:
+                            logger.info("Offline - Skipping remote image %s"%(endpoint))
+                            continue
+
+                        check_remote_url(src, fullPath, broken_links, externalCache, isImg=True)
+
+                    else:
+                        logger.info("Checking local image %s in %s" %
+                            (src, fullPath))
+                        if os.path.exists(os.path.join(dirpath, src)):
+                            logger.info("...success")
+                        else:
+                            logger.warning("Broken local image %s in %s" %
+                                (src, fullPath))
+                            broken_links.append( (fullPath, src) )
     return broken_links, num_links_checked
```
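
The substantive fix is the order of the `elif` chain in `checkLinks()`: the absolute-link branch (`endpoint[0] == '/'`) now runs before the remote-URL (`"://"`) and anchor (`'#' in endpoint`) branches, and its message is logged as a warning instead of an info. Previously, an absolute link that contained an anchor, such as `/page.html#section`, matched the anchor branch first and was checked as a local file path, so it was reported broken even when the target existed. Below is a minimal sketch (not part of the commit; `classify_old` and `classify_new` are hypothetical helpers) that mirrors the two branch orders:

```python
# Minimal sketch of the link-classification order before and after the fix.
# classify_old/classify_new are hypothetical helpers, not code from dactyl.

def classify_old(endpoint):
    # Pre-0.8.3 order: the anchor test ran before the absolute-link test,
    # so "/page.html#section" was treated as a local anchor link and then
    # checked as a filesystem path, producing a false "broken link".
    if "://" in endpoint:
        return "remote URL"
    elif "#" in endpoint:
        return "local anchor link"        # wrongly catches /page.html#section
    elif endpoint[0] == "/":
        return "absolute link (skipped)"
    else:
        return "local file link"

def classify_new(endpoint):
    # 0.8.3 order: absolute links are skipped before the anchor test runs.
    if endpoint[0] == "/":
        return "absolute link (skipped)"
    elif "://" in endpoint:
        return "remote URL"
    elif "#" in endpoint:
        return "local anchor link"
    else:
        return "local file link"

print(classify_old("/page.html#section"))  # local anchor link -> false positive
print(classify_new("/page.html#section"))  # absolute link (skipped)
```

The same commit also promotes the mailto and ignored-anchor skip messages from `logger.info` to `logger.warning`, so skipped links are visible in default output.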

dactyl/version.py (+1, -1)

```diff
@@ -1 +1 @@
-__version__ = '0.8.2'
+__version__ = '0.8.3'
```
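
As a quick sanity check after upgrading, the version string can be read back at runtime; this assumes the `dactyl` package is importable, with the module path matching the file shown above:

```python
# Hypothetical check, assuming the dactyl package is installed/importable:
from dactyl.version import __version__

print(__version__)  # expect '0.8.3' after this commit
```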
