@@ -81,129 +81,140 @@ def checkLinks(offline=False):
     num_links_checked = 0
     last_checkin = time()
     for dirpath, dirnames, filenames in os.walk(config["out_path"]):
-      if time() - last_checkin > CHECK_IN_INTERVAL:
-        ## Print output periodically so Jenkins/etc. don't kill the job
-        last_checkin = time()
-        print("... still working (dirpath: %s) ..." % dirpath)
-      if "template_path" in config and \
-          os.path.abspath(dirpath) == os.path.abspath(config["template_path"]):
-        # don't try to parse and linkcheck the templates
-        logger.warning("Skipping link checking for template path %s" % dirpath)
-        continue
-      for fname in filenames:
         if time() - last_checkin > CHECK_IN_INTERVAL:
-          last_checkin = time()
-          print("... still working (file: %s) ..." % fname)
-        fullPath = os.path.join(dirpath, fname)
-        if "/node_modules/" in fullPath or ".git" in fullPath:
-          logger.debug("skipping ignored dir: %s" % fullPath)
-          continue
-        if fullPath.endswith(".html"):
-          soup = getSoup(fullPath)
-          unparsed_links = check_for_unparsed_reference_links(soup)
-          if unparsed_links:
-            logger.warning("Found %d unparsed Markdown reference links: %s" %
-              (len(unparsed_links), "\n... ".join(unparsed_links)))
-            [broken_links.append( (fullPath, u) ) for u in unparsed_links]
-          links = soup.find_all('a')
-          for link in links:
+            ## Print output periodically so Jenkins/etc. don't kill the job
+            last_checkin = time()
+            print("... still working (dirpath: %s) ..." % dirpath)
+        if "template_path" in config and \
+                os.path.abspath(dirpath) == os.path.abspath(config["template_path"]):
+            # don't try to parse and linkcheck the templates
+            logger.warning("Skipping link checking for template path %s" % dirpath)
+            continue
+        for fname in filenames:
             if time() - last_checkin > CHECK_IN_INTERVAL:
-              last_checkin = time()
-              print("... still working (link: %s) ..." % link)
-            if "href" not in link.attrs:
-              #probably an <a name> type anchor, skip
-              continue
-
-            endpoint = link['href']
-            if not endpoint.strip():
-              logger.warning("Empty link in %s" % fullPath)
-              broken_links.append( (fullPath, endpoint) )
-              num_links_checked += 1
-
-            elif endpoint == "#":
-              continue
-
-            elif "mailto:" in endpoint:
-              logger.info("Skipping email link in %s to %s" % (fullPath, endpoint))
-              continue
-
-            elif "://" in endpoint:
-              if offline:
-                logger.info("Offline - Skipping remote URL %s" % (endpoint))
-                continue
-
-              num_links_checked += 1
-              check_remote_url(endpoint, fullPath, broken_links, externalCache)
-
-
-            elif '#' in endpoint:
-              if fname in config["ignore_anchors_in"]:
-                logger.info("Ignoring anchor %s in dynamic page %s" % (endpoint,fname))
-                continue
-              logger.info("Testing local link %s from %s" % (endpoint, fullPath))
-              num_links_checked += 1
-              filename,anchor = endpoint.split("#",1)
-              if filename == "":
-                fullTargetPath = fullPath
-              else:
-                fullTargetPath = os.path.join(dirpath, filename)
-              if not os.path.exists(fullTargetPath):
-                logger.warning("Broken local link in %s to %s" % (fullPath, endpoint))
-                broken_links.append( (fullPath, endpoint) )
-
-              elif filename in config["ignore_anchors_in"]:
-                #Some pages are populated dynamically, so BeatifulSoup wouldn't
-                # be able to find anchors in them anyway
-                logger.info("Skipping anchor link in %s to dynamic page %s" %
-                  (fullPath, endpoint))
-                continue
-
-              elif fullTargetPath != "../":
-                num_links_checked += 1
-                targetSoup = getSoup(fullTargetPath)
-                if not targetSoup.find(id=anchor) and not targetSoup.find(
-                  "a",attrs={"name":anchor}):
-                  logger.warning("Broken anchor link in %s to %s" % (fullPath, endpoint))
-                  broken_links.append( (fullPath, endpoint) )
-                else:
-                  logger.info("...anchor found.")
-                  continue
+                last_checkin = time()
+                print("... still working (file: %s) ..." % fname)

-            elif endpoint[0] == '/':
-              #can't really test links out of the local field
-              logger.info("Skipping absolute link in %s to %s" % (fullPath, endpoint))
-              continue
-
-            else:
-              num_links_checked += 1
-              if not os.path.exists(os.path.join(dirpath, endpoint)):
-                logger.warning("Broken local link in %s to %s" % (fullPath, endpoint))
-                broken_links.append( (fullPath, endpoint) )
-
-          #Now check images
-          imgs = soup.find_all('img')
-          for img in imgs:
-            num_links_checked += 1
-            if "src" not in img.attrs or not img["src"].strip():
-              logger.warning("Broken image with no src in %s" % fullPath)
-              broken_links.append( (fullPath, img["src"]) )
-              continue
-
-            src = img["src"]
-            if "://" in src:
-              if offline:
-                logger.info("Offline - Skipping remote image %s" % (endpoint))
+            fullPath = os.path.join(dirpath, fname)
+            if "/node_modules/" in fullPath or ".git" in fullPath:
+                logger.debug("skipping ignored dir: %s" % fullPath)
                 continue
-
-              check_remote_url(src, fullPath, broken_links, externalCache, isImg=True)
-
-            else:
-              logger.info("Checking local image %s in %s" % (src, fullPath))
-              if os.path.exists(os.path.join(dirpath, src)):
-                logger.info("...success")
-              else:
-                logger.warning("Broken local image %s in %s" % (src, fullPath))
-                broken_links.append( (fullPath, src) )
+            if fullPath.endswith(".html"):
+                soup = getSoup(fullPath)
+                unparsed_links = check_for_unparsed_reference_links(soup)
+                if unparsed_links:
+                    logger.warning("Found %d unparsed Markdown reference links: %s" %
+                        (len(unparsed_links), "\n... ".join(unparsed_links)))
+                    [broken_links.append( (fullPath, u) ) for u in unparsed_links]
+                links = soup.find_all('a')
+                for link in links:
+                    if time() - last_checkin > CHECK_IN_INTERVAL:
+                        last_checkin = time()
+                        print("... still working (link: %s) ..." % link)
+                    if "href" not in link.attrs:
+                        #probably an <a name> type anchor, skip
+                        continue
+
+                    endpoint = link['href']
+                    if not endpoint.strip():
+                        logger.warning("Empty link in %s" % fullPath)
+                        broken_links.append( (fullPath, endpoint) )
+                        num_links_checked += 1
+
+                    elif endpoint == "#":
+                        continue
+
+                    elif "mailto:" in endpoint:
+                        logger.warning("Skipping email link in %s to %s" %
+                            (fullPath, endpoint))
+                        continue
+
+                    elif endpoint[0] == '/':
+                        # Can't properly test absolute links without knowing where the
+                        # server root will be, so skip this
+                        logger.warning("Skipping absolute link in %s to %s" %
+                            (fullPath, endpoint))
+                        continue
+
+                    elif "://" in endpoint:
+                        if offline:
+                            logger.info("Offline - Skipping remote URL %s" % (endpoint))
+                            continue
+
+                        num_links_checked += 1
+                        check_remote_url(endpoint, fullPath, broken_links, externalCache)
+
+
+                    elif '#' in endpoint:
+                        if fname in config["ignore_anchors_in"]:
+                            logger.warning("Ignoring anchor %s in dynamic page %s" %
+                                (endpoint,fname))
+                            continue
+                        logger.info("Testing local link %s from %s" %
+                            (endpoint, fullPath))
+                        num_links_checked += 1
+                        filename,anchor = endpoint.split("#",1)
+                        if filename == "":
+                            fullTargetPath = fullPath
+                        else:
+                            fullTargetPath = os.path.join(dirpath, filename)
+                        if not os.path.exists(fullTargetPath):
+                            logger.warning("Broken local link in %s to %s" %
+                                (fullPath, endpoint))
+                            broken_links.append( (fullPath, endpoint) )
+
+                        elif filename in config["ignore_anchors_in"]:
+                            #Some pages are populated dynamically, so BeatifulSoup wouldn't
+                            # be able to find anchors in them anyway
+                            logger.info("Skipping anchor link in %s to ignored page %s" %
+                                (fullPath, endpoint))
+                            continue
+
+                        elif fullTargetPath != "../":
+                            num_links_checked += 1
+                            targetSoup = getSoup(fullTargetPath)
+                            if not targetSoup.find(id=anchor) and not targetSoup.find(
+                                "a",attrs={"name":anchor}):
+                                logger.warning("Broken anchor link in %s to %s" %
+                                    (fullPath, endpoint))
+                                broken_links.append( (fullPath, endpoint) )
+                            else:
+                                logger.info("...anchor found.")
+                                continue
+
+                    else:
+                        num_links_checked += 1
+                        if not os.path.exists(os.path.join(dirpath, endpoint)):
+                            logger.warning("Broken local link in %s to %s" %
+                                (fullPath, endpoint))
+                            broken_links.append( (fullPath, endpoint) )
+
+                #Now check images
+                imgs = soup.find_all('img')
+                for img in imgs:
+                    num_links_checked += 1
+                    if "src" not in img.attrs or not img["src"].strip():
+                        logger.warning("Broken image with no src in %s" % fullPath)
+                        broken_links.append( (fullPath, img["src"]) )
+                        continue
+
+                    src = img["src"]
+                    if "://" in src:
+                        if offline:
+                            logger.info("Offline - Skipping remote image %s" % (endpoint))
+                            continue
+
+                        check_remote_url(src, fullPath, broken_links, externalCache, isImg=True)
+
+                    else:
+                        logger.info("Checking local image %s in %s" %
+                            (src, fullPath))
+                        if os.path.exists(os.path.join(dirpath, src)):
+                            logger.info("...success")
+                        else:
+                            logger.warning("Broken local image %s in %s" %
+                                (src, fullPath))
+                            broken_links.append( (fullPath, src) )
     return broken_links, num_links_checked
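The anchor test in this hunk treats an anchor as present if the target page has either an element with a matching id attribute or an old-style <a name="..."> tag. A minimal standalone sketch of that lookup with BeautifulSoup (the sample HTML and the has_anchor helper are made up for illustration; the real code builds its soup through getSoup):

from bs4 import BeautifulSoup

def has_anchor(soup, anchor):
    # Mirrors the check in the diff: a matching id on any element,
    # or an <a name="..."> tag, counts as a valid anchor target.
    return bool(soup.find(id=anchor) or soup.find("a", attrs={"name": anchor}))

html = '<h2 id="setup">Setup</h2><a name="legacy-anchor"></a>'
soup = BeautifulSoup(html, "html.parser")
print(has_anchor(soup, "setup"))          # True
print(has_anchor(soup, "legacy-anchor"))  # True
print(has_anchor(soup, "missing"))        # False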
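Both the link branch and the image branch hand remote URLs to check_remote_url(), which is defined elsewhere in this file and deduplicated through externalCache. That helper is not shown in this hunk, so the following is only a rough sketch of the general approach (a HEAD request with a GET fallback via the requests library); the signature matches the call sites above, but the body is an assumption, not the project's actual implementation:

import requests

def check_remote_url(endpoint, fullPath, broken_links, externalCache, isImg=False):
    """Hypothetical sketch: record endpoint as broken unless it answers below 400."""
    if endpoint in externalCache:
        return  # already verified while checking an earlier page
    try:
        r = requests.head(endpoint, timeout=10, allow_redirects=True)
        if r.status_code >= 400:
            # Some servers reject HEAD; retry with GET before calling it broken.
            r = requests.get(endpoint, timeout=10, allow_redirects=True)
        if r.status_code >= 400:
            broken_links.append((fullPath, endpoint))
        else:
            externalCache.append(endpoint)
    except requests.RequestException:
        broken_links.append((fullPath, endpoint))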