Skip to content

Commit 3183d73

Browse files
committed
fixed download web page images code
1 parent e8fe682 commit 3183d73

File tree

1 file changed

+2
-12
lines changed

1 file changed

+2
-12
lines changed

web-scraping/download-images/download_images.py

+2 -12
Original file line number | Diff line number | Diff line change
@@ -5,13 +5,6 @@
55
from urllib.parse import urljoin, urlparse
66

77

8-
def is_absolute(url):
9-
"""
10-
Determines whether a `url` is absolute.
11-
"""
12-
return bool(urlparse(url).netloc)
13-
14-
158
def is_valid(url):
169
"""
1710
Checks whether `url` is a valid URL.
@@ -28,14 +21,11 @@ def get_all_images(url):
2821
urls = []
2922
for img in tqdm(soup.find_all("img"), "Extracting images"):
3023
img_url = img.attrs.get("src")
31-
3224
if not img_url:
3325
# if img does not contain src attribute, just skip
3426
continue
35-
36-
if not is_absolute(img_url):
37-
# if img has relative URL, make it absolute by joining
38-
img_url = urljoin(url, img_url)
27+
# make the URL absolute by joining the page URL with the extracted src (urljoin leaves already-absolute URLs unchanged)
28+
img_url = urljoin(url, img_url)
3929
# remove URLs like '/hsts-pixel.gif?c=3.2.5'
4030
try:
4131
pos = img_url.index("?")

0 commit comments

Comments
 (0)