-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimg.py
executable file
·102 lines (73 loc) · 1.87 KB
/
img.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
## Requires Python 3 and the third-party packages `requests` and `beautifulsoup4` (imported as `bs4`).
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
from urllib.request import urlopen
import shutil
import requests
import sys
import time
import os
def img_download(link, out_dir="images", timeout=30):
    """Download every ``<img>`` found on the page at *link*.

    The page is fetched and parsed with BeautifulSoup, and each image is
    saved into *out_dir* under a sequential numeric name (``1``, ``2``, ...)
    with no file extension.  The number only advances after a successful
    download, so the saved files are numbered without gaps.

    As a side effect, the raw bytes of the fetched page are written to a
    file named ``1`` in the current working directory.

    Args:
        link: URL of the page to scan for images.
        out_dir: Directory that receives the images; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If the page at *link* itself cannot be
            fetched.  Failures on individual images are reported on stdout
            and skipped.
        OSError: If *out_dir* or the page dump file cannot be written.
    """
    # Function-scope import: the file's import block does not provide urljoin.
    from urllib.parse import urljoin

    os.makedirs(out_dir, exist_ok=True)

    page = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Keep a raw copy of the fetched page next to the script.
    with open('1', 'wb') as dump:
        dump.write(page.content)

    image_links = [img.get("src") for img in soup.find_all('img')]
    print(len(image_links))

    counter = 1
    for src in image_links:
        if not src:
            # <img> tag without a usable src attribute: nothing to fetch.
            print(' An error occured. Continuing.')
            continue
        try:
            # Resolve relative and protocol-relative sources against the
            # final page URL (after redirects) so they are fetchable.
            response = requests.get(urljoin(page.url, src), timeout=timeout)
            # Do not save an HTTP error page as if it were an image.
            response.raise_for_status()
            with open(os.path.join(out_dir, str(counter)), 'wb') as out:
                # .content is the decoded body; the raw stream would still
                # be gzip/deflate-compressed when the server compresses.
                out.write(response.content)
        except (requests.RequestException, OSError):
            print(' An error occured. Continuing.')
        else:
            counter += 1
    print('Done.')
def pdf_download(link, out_dir="pdf_folder", timeout=30):
    """Download the target of every ``<a href>`` on the page at *link*.

    Each href is resolved against the page URL and saved into *out_dir*
    under the last path segment of the resolved URL.  Despite the function
    name, links are not filtered by file type; the only filter is the
    "ends with 'p'" skip described in the body.

    As a side effect, the raw bytes of the fetched page are written to a
    file named ``1`` in the current working directory.

    Args:
        link: URL of the page whose links should be downloaded.
        out_dir: Directory that receives the files; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If the page at *link* itself cannot be
            fetched.  Failures on individual links are reported on stdout
            and skipped.
        OSError: If *out_dir* or the page dump file cannot be written.
    """
    # Function-scope import: the file's import block does not provide these.
    from urllib.parse import urljoin, urlsplit

    os.makedirs(out_dir, exist_ok=True)

    page = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Keep a raw copy of the fetched page next to the script.
    with open('1', 'wb') as dump:
        dump.write(page.content)

    hrefs = [anchor.get("href") for anchor in soup.find_all('a')]
    for each in hrefs:
        print(each)
        if each is None:
            # <a> tag without an href attribute: nothing to fetch.
            print(' An error occured. Continuing.')
            continue

        # urljoin handles absolute, root-relative and parent-relative hrefs;
        # plain string concatenation only worked for bare file names.
        src = urljoin(page.url, each.strip())

        # NOTE(review): URLs ending in 'p' are skipped, as in the original
        # code.  The intent is not documented -- confirm before changing.
        if src.endswith('p'):
            continue

        # Name the file after the URL path only, so query strings and
        # fragments do not leak into the file name.
        filename = urlsplit(src).path.rsplit('/', 1)[-1]
        if not filename:
            # Directory-style link (path ends in '/'): no file name to use.
            print(' An error occured. Continuing.')
            continue

        print(src)
        print('Getting: ' + filename)
        try:
            response = requests.get(src, timeout=timeout)
            # Do not save an HTTP error page as if it were the document.
            response.raise_for_status()
            with open(os.path.join(out_dir, filename), 'wb') as out:
                # .content is the decoded body; the raw stream would still
                # be gzip/deflate-compressed when the server compresses.
                out.write(response.content)
        except (requests.RequestException, OSError):
            print(' An error occured. Continuing.')
    print('Done.')
# Script driver: each (page URL, downloader) pair is processed in order.
# Change the first URL to pick the page whose links pdf_download fetches,
# and the second URL to pick the page whose images img_download fetches.
for link, _download in (
    ("http://www.cs.cmu.edu/~epxing/Class/10701-10s/HW/", pdf_download),
    ("https://twitter.com/dishpatani", img_download),
):
    _download(link)