-
Notifications
You must be signed in to change notification settings - Fork 10
/
scrape.py
67 lines (53 loc) · 1.82 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from requests import get
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import json
import http.client
import PyPDF2
# Load the URL table from disk; the scraping loop below iterates over its
# values and treats each one as a URL.
# The context manager closes the handle as soon as the JSON has been parsed
# (the original left it open for the life of the process). JSON text is
# UTF-8 by specification, so the encoding is stated instead of relying on the
# platform default.
with open('./url_percent.json', encoding='utf-8') as url_percent:
    data = json.load(url_percent)
def tag_visible(element):
    """Tell whether a parsed text node should be kept as visible text.

    Args:
        element: a node exposing ``.parent.name`` (the script passes the
            text nodes returned by BeautifulSoup).

    Returns:
        False when the node's parent is named style, script, head, title,
        meta or '[document]', or when the node is a ``Comment``; True
        otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # Same evaluation order as before: the parent name is inspected first and
    # the Comment check only runs when the parent is not in the hidden list.
    is_hidden = (
        element.parent.name in hidden_parents
        or isinstance(element, Comment)
    )
    return not is_hidden
def text_from_html(body):
    """Return the visible text of an HTML document as one string.

    Args:
        body: HTML markup handed unchanged to BeautifulSoup's 'html.parser'
            backend (the script passes the bytes read from ``urlopen``).

    Returns:
        Every text node accepted by ``tag_visible``, stripped of surrounding
        whitespace and joined with single spaces. Nodes that strip to an
        empty string still contribute a separator, so the result may contain
        runs of spaces.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it selects the same set of text nodes.
    text_nodes = soup.find_all(string=True)
    return " ".join(node.strip() for node in text_nodes if tag_visible(node))
# Save the text of every URL in `data` to source<N>.txt in the working
# directory.
#
# `i` is the number of the next output file. It advances only after a file
# has been written, for PDFs as well as for web pages, so a URL that fails
# leaves no gap in the numbering and no output is overwritten by the next
# one. (Previously the counter was not advanced after a PDF, so the
# following successful scrape replaced the PDF's text file.)
i = 1
for url in data.values():
    out_path = 'source{}.txt'.format(i)
    try:
        if '.pdf' in url:
            print('I found a pdf')
            print('Downloading it ....')
            # The download reuses one scratch file, replaced on each PDF.
            urllib.request.urlretrieve(url, 'pdf_file.pdf')
            print('Converting pdf to text')
            with open('pdf_file.pdf', 'rb') as pdf_file:
                # NOTE(review): PdfFileReader/extractText are the legacy
                # PyPDF2 names (newer releases use PdfReader/extract_text);
                # kept as-is so the script still matches the PyPDF2 version
                # it was written for - confirm the installed version.
                pdf = PyPDF2.PdfFileReader(pdf_file)
                # Pages are read while the PDF handle is still open.
                text = ''.join(page.extractText() for page in pdf.pages)
            with open(out_path, 'w') as fd:
                fd.write(text)
            print('Conversion done')
        else:
            print('Scraping text from web')
            with urllib.request.urlopen(url) as response:
                html = response.read()
            # NOTE(review): the slice starts at index 1, so the first
            # character of the extracted text is dropped and at most 1998
            # characters are kept - confirm this is intended.
            text = text_from_html(html)[1:1999]
            # Extract before opening the output so a parsing failure does
            # not leave a truncated, empty file behind.
            with open(out_path, 'w') as f:
                # Collapse every run of whitespace to a single space.
                f.write(" ".join(text.strip().split()))
            print('scraping done')
        i = i + 1
    # The handlers cover the PDF download too, so one unreachable PDF no
    # longer aborts the whole run. HTTPError must be listed before its
    # parent class URLError.
    except urllib.error.HTTPError:
        print("scraping not allowed")
    except urllib.error.URLError:
        # URLError also covers non-certificate failures (e.g. an unreachable
        # host); the message is kept unchanged.
        print('Certificates verification failed')
    except http.client.IncompleteRead:
        print("Incomplete read scraping not allowed")