-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
95 lines (81 loc) · 3.47 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import json
import os
import time
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
class Scraper():
    """Collect Google Images result URLs for a list of search terms.

    Search terms are read from ``terms.txt`` and a headless Chrome session is
    started from ``./chromedriver``; both paths are resolved against the
    current working directory.  URLs are accumulated per term in
    ``self.metaData`` (term -> set of URLs) and written as JSON to
    ``track.txt`` by :meth:`writeData`.
    """

    # Image search URL template; ``{q}`` receives the URL-encoded term.
    SEARCH_URL = (
        "https://www.google.com/search"
        "?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    )

    def __init__(self):
        """Load the terms, start the browser and prepare empty result sets."""
        self.loadTerms()
        self.getBrowser()
        self.log = os.path.join(os.getcwd(), "track.txt")
        self.metaData = {t: set() for t in self.terms}
        print("Scraper initialized")

    def loadTerms(self):
        """Read the search terms, one per line, into ``self.terms``.

        Surrounding whitespace is stripped, blank lines are skipped and
        repeated terms are kept only once (first occurrence wins), so a
        trailing empty line never turns into an empty search.
        """
        path = os.path.join(os.getcwd(), "terms.txt")
        with open(path, encoding="utf-8") as f:
            stripped = (line.strip() for line in f.read().splitlines())
            # dict.fromkeys de-duplicates while preserving file order.
            self.terms = list(dict.fromkeys(t for t in stripped if t))

    def getBrowser(self):
        """Start a headless Chrome session and store it on ``self.browser``."""
        opts = Options()
        opts.headless = True
        # NOTE(review): `executable_path`, `opts.headless` and the
        # `find_element(s)_by_*` helpers used in this class belong to the
        # older Selenium API -- confirm the pinned Selenium version before
        # upgrading the dependency.
        self.browser = webdriver.Chrome(
            executable_path=os.path.join(os.getcwd(), "./chromedriver"),
            options=opts,
        )

    def close(self):
        """Quit the browser session if one was started; safe to call twice."""
        browser = getattr(self, "browser", None)
        if browser is not None:
            browser.quit()
            self.browser = None

    def writeData(self):
        """Write the collected URLs to ``self.log`` as a JSON object.

        Each term maps to a sorted list of its URLs so that the same data
        always produces the same file.
        """
        metaData = {k: sorted(v) for k, v in self.metaData.items()}
        with open(self.log, 'w', encoding="utf-8") as f:
            json.dump(metaData, f)

    @classmethod
    def _buildSearchUrl(cls, term):
        """Return the image-search URL for ``term`` with the query encoded."""
        # quote_plus turns spaces into '+' and escapes characters such as
        # '&' or '#' that would otherwise corrupt the query string.
        return cls.SEARCH_URL.format(q=quote_plus(term))

    def scrape(self, imgsPerTerm=400, maxStalledRounds=3):
        """Run the scrape for every term in ``self.terms``.

        This is the main scraping entry point.  For each term the search
        page is opened and URLs are collected into ``self.metaData[term]``
        until the set holds at least ``imgsPerTerm`` entries.

        Args:
            imgsPerTerm: number of URLs to collect per term before moving on.
            maxStalledRounds: give up on a term after this many consecutive
                rounds that add no new URL, instead of looping forever when
                the results run out.
        """
        for term in self.terms:
            print(f"current search term is '{term}'")
            self.browser.get(self._buildSearchUrl(term))
            self._scrapeTerm(term, imgsPerTerm, maxStalledRounds)
        # done, now store the image data to download images
        print("scraping complete")

    def _scrapeTerm(self, term, imgsPerTerm, maxStalledRounds):
        """Scroll, click and harvest until ``term`` has enough URLs or stalls."""
        br = self.browser
        found = self.metaData.setdefault(term, set())
        rstart = 0   # index of the first thumbnail that has not been clicked
        stalled = 0  # consecutive rounds that added nothing to `found`
        while len(found) < imgsPerTerm:
            before = len(found)
            # Scroll to the bottom of the page (presumably to make the page
            # load further results) and give it a moment to settle.
            br.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            res = br.find_elements_by_css_selector("img.Q4LuWd")
            print(f"found {len(res)} results")
            print(f"len(res): {len(res)}")
            print(f"rstart: {rstart}")
            for img in res[rstart:]:
                try:
                    img.click()
                    time.sleep(1)
                    # Record the page URL reached by clicking the thumbnail.
                    found.add(br.current_url)
                except Exception as e:
                    # Best effort: a thumbnail that cannot be clicked is
                    # reported and skipped rather than aborting the run.
                    print(e)
            # Advance past this batch even when clicks failed, so the same
            # thumbnails are not retried on every round.
            rstart = len(res)
            # Record the src of every image element matching img.n3VNCb.
            for im in br.find_elements_by_css_selector("img.n3VNCb"):
                src = im.get_attribute("src")
                if src and 'http' in src:
                    found.add(src)
            if len(found) >= imgsPerTerm:
                print("found enough for this term")
                break
            if len(found) == before:
                stalled += 1
                if stalled >= maxStalledRounds:
                    print(f"no new images for '{term}' after {stalled} rounds, giving up")
                    break
            else:
                stalled = 0
            diff = imgsPerTerm - len(found)
            print(f"fetching {diff} more images...")
            time.sleep(2)
            # find_elements (plural) returns an empty list instead of raising
            # when the ".mye4qd" element is not on the page.
            if br.find_elements_by_css_selector(".mye4qd"):
                br.execute_script("document.querySelector('.mye4qd').click();")
# Script entry point: scrape every term, then write the collected URLs to disk.
if __name__ == "__main__":
    scraper = Scraper()
    try:
        scraper.scrape()
        scraper.writeData()
    finally:
        # Always shut down the headless Chrome process, even when scraping
        # or writing fails, so no orphaned browser is left running.
        scraper.browser.quit()
    print("done")