This repository has been archived by the owner on Mar 5, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
python-webpage-file-downloader.py
49 lines (39 loc) · 1.75 KB
/
python-webpage-file-downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python
import re
import sys
import urllib
#Your sample url
sampleUrl = "http://stackoverflow.com"
urlAddInfo = urllib.urlopen(sampleUrl)
data = urlAddInfo.read()
#Sample extensions we'll be looking for: pngs and pdfs
TARGET_EXTENSIONS = "(png|pdf)"
targetCompile = re.compile(TARGET_EXTENSIONS, re.UNICODE|re.MULTILINE)
#Let's get all the urls: match criteria{no spaces or " in a url}
urls = re.findall('(https?://[^\s"]+)', data, re.UNICODE|re.MULTILINE)
#We want these folks
extensionMatches = filter(lambda url: url and targetCompile.search(url), urls)
#The rest of the unmatched urls for which the scrapping can also be repeated.
nonExtMatches = filter(lambda url: url and not targetCompile.search(url), urls)
def fileDl(targetUrl):
#Function to handle downloading of files.
#Arg: url => a String
#Output: Boolean to signify if file has been written to memory
#Validation of the url assumed, for the sake of keeping the illustration short
urlAddInfo = urllib.urlopen(targetUrl)
data = urlAddInfo.read()
fileNameSearch = re.search("([^\/\s]+)$", targetUrl) #Text right before the last slash '/'
if not fileNameSearch:
sys.stderr.write("Could not extract a filename from url '%s'\n"%(targetUrl))
return False
fileName = fileNameSearch.groups(1)[0]
with open(fileName, "wb") as f:
f.write(data)
sys.stderr.write("Wrote %s to memory\n"%(fileName))
return True
#Let's now download the matched files
dlResults = map(lambda fUrl: fileDl(fUrl), extensionMatches)
successfulDls = filter(lambda s: s, dlResults)
sys.stderr.write("Downloaded %d files from %s\n"%(len(successfulDls), sampleUrl))
#You can organize the above code into a function to repeat the process for each of the
#other urls and in that way you can make a crawler.