File tree: 1 file changed, +2 -12 lines changed
web-scraping/download-images
1 file changed, +2 -12 lines changed
Original file line number | Diff line number | Diff line change
5
5
from urllib .parse import urljoin , urlparse
6
6
7
7
8
- def is_absolute (url ):
9
- """
10
- Determines whether a `url` is absolute.
11
- """
12
- return bool (urlparse (url ).netloc )
13
-
14
-
15
8
def is_valid (url ):
16
9
"""
17
10
Checks whether `url` is a valid URL.
@@ -28,14 +21,11 @@ def get_all_images(url):
28
21
urls = []
29
22
for img in tqdm (soup .find_all ("img" ), "Extracting images" ):
30
23
img_url = img .attrs .get ("src" )
31
-
32
24
if not img_url :
33
25
# if img does not contain src attribute, just skip
34
26
continue
35
-
36
- if not is_absolute (img_url ):
37
- # if img has relative URL, make it absolute by joining
38
- img_url = urljoin (url , img_url )
27
+ # make the URL absolute by resolving the extracted `src` against the page URL (already-absolute URLs are left unchanged)
28
+ img_url = urljoin (url , img_url )
39
29
# remove URLs like '/hsts-pixel.gif?c=3.2.5'
40
30
try :
41
31
pos = img_url .index ("?" )
You can’t perform that action at this time.
0 commit comments