
Commit 24f17b1

Remorax authored and bhaveshAn committed
Addresses #320 Add video search support for Yahoo (#446)
Removed unnecessary line from google scraper
Fixed Codacy and Travis errors
Fixed last Travis error
Removed unnecessary debug print statement
Squashed the commits
1 parent 8d319e7 commit 24f17b1

13 files changed: +51 -2 lines changed

app/scrapers/ask.py  (+1)

@@ -9,6 +9,7 @@ def __init__(self):
         self.url = 'http://ask.com/web'
         self.defaultStart = 1
         self.startKey = 'page'
+        self.name = 'ask'

     def next_start(self, current_start, prev_results):
         return current_start + 1

app/scrapers/baidu.py  (+1)

@@ -11,6 +11,7 @@ def __init__(self):
         self.defaultStart = 0
         self.queryKey = 'wd'
         self.startKey = 'pn'
+        self.name = 'baidu'

     def parse_response(self, soup):
         """ Parse the response and return set of urls

app/scrapers/bing.py  (+1)

@@ -10,6 +10,7 @@ def __init__(self):
         self.url = 'http://www.bing.com/search'
         self.defaultStart = 1
         self.startKey = 'first'
+        self.name = 'bing'

     def parse_response(self, soup):
         """ Parses the reponse and return set of urls

app/scrapers/dailymotion.py  (+1)

@@ -12,6 +12,7 @@ def __init__(self):
         self.queryKey = 'search'
         self.startKey = 'page'
         self.defaultStart = 1
+        self.name = 'dailymotion'

     def parse_response(self, soup):
         """ Parse the response and return set of urls

app/scrapers/duckduckgo.py  (+1)

@@ -10,6 +10,7 @@ def __init__(self):
         self.url = 'https://duckduckgo.com/html'
         self.defaultStart = 0
         self.startKey = 's'
+        self.name = 'duckduckgo'

     def parse_response(self, soup):
         """ Parse the response and return set of urls

app/scrapers/exalead.py  (+1)

@@ -10,6 +10,7 @@ def __init__(self):
         self.url = 'https://www.exalead.com/search/web/results/'
         self.defaultStart = 0
         self.startKey = 'start_index'
+        self.name = 'exalead'

     def parse_response(self, soup):
         """ Parse the response and return set of urls

app/scrapers/generalized.py  (+17 -2)

@@ -25,15 +25,24 @@ def get_page(self, query, startIndex=0, qtype=''):
         """ Fetch the google search results page
         Returns : Results Page
         """
+        url = self.url
+        if qtype == 'vid':
+            if self.name in ['yahoo']:
+                url = self.videoURL
+            else:
+                url = self.url
         payload = {self.queryKey: query, self.startKey: startIndex,
                    self.qtype: qtype}
-        response = requests.get(self.url, headers=self.headers, params=payload)
+        response = requests.get(url, headers=self.headers, params=payload)
         print(response.url)
         return response

     def parse_response(self, soup):
         raise NotImplementedError

+    def parse_video_response(self, soup):
+        raise NotImplementedError
+
     def next_start(self, current_start, prev_results):
         return current_start + len(prev_results)

@@ -48,7 +57,13 @@ def search(self, query, num_results, qtype=''):
         while (len(urls) < num_results):
             response = self.get_page(query, current_start, qtype)
             soup = BeautifulSoup(response.text, 'html.parser')
-            new_results = self.parse_response(soup)
+            if qtype == 'vid':
+                if self.name in ['yahoo']:
+                    new_results = self.parse_video_response(soup)
+                else:
+                    new_results = self.parse_response(soup)
+            else:
+                new_results = self.parse_response(soup)
             if new_results is None:
                 break
             urls.extend(new_results)
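
For context, a minimal sketch of how this dispatch is exercised from calling code (the import path below simply mirrors app/scrapers/yahoo.py, and the query string and result count are placeholders): passing qtype='vid' makes get_page() request self.videoURL and makes search() hand the soup to parse_video_response(), while any other qtype keeps the original web-search path.

    from app.scrapers.yahoo import Yahoo

    scraper = Yahoo()

    # qtype='vid' makes get_page() hit scraper.videoURL and search()
    # route parsing through parse_video_response().
    video_results = scraper.search('open source', 5, qtype='vid')

    # The default qtype keeps the original web-search behaviour.
    web_results = scraper.search('open source', 5)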

app/scrapers/google.py  (+1)

@@ -11,6 +11,7 @@ def __init__(self):
         self.defaultStart = 0
         self.startKey = 'start'
         self.qtype = 'tbm'
+        self.name = 'google'

     def next_start(self, current_start, prev_results):
         return current_start + len(prev_results)

app/scrapers/mojeek.py  (+1)

@@ -10,6 +10,7 @@ def __init__(self):
         self.url = 'https://www.mojeek.co.uk/search'
         self.defaultStart = 1
         self.startKey = 's'
+        self.name = 'mojeek'

     def parse_response(self, soup):
         """ Parse the response and return set of urls

app/scrapers/parsijoo.py  (+1)

@@ -10,6 +10,7 @@ def __init__(self):
         self.url = 'https://parsijoo.ir/web'
         self.defaultStart = 0
         self.startKey = 'co'
+        self.name = 'parsijoo'

     def parse_response(self, soup):
         """ Parse the response and return set of urls

app/scrapers/quora.py  (+1)

@@ -8,6 +8,7 @@ class Quora(Scraper):
     def __init__(self):
         Scraper.__init__(self)
         self.url = 'https://www.quora.com/search'
+        self.name = 'quora'

     def parse_response(self, soup):
         """ Parse the response and return set of urls

app/scrapers/yahoo.py  (+23)

@@ -13,8 +13,10 @@ class Yahoo(Scraper):
     def __init__(self):
         Scraper.__init__(self)
         self.url = 'https://search.yahoo.com/search'
+        self.videoURL = 'https://video.search.yahoo.com/search/video'
         self.defaultStart = 1
         self.startKey = 'b'
+        self.name = 'yahoo'

     def parse_response(self, soup):
         """ Parse response and returns the urls

@@ -38,3 +40,24 @@ def parse_response(self, soup):
         print('Yahoo parsed: ' + str(urls))

         return urls
+
+    def parse_video_response(self, soup):
+        """ Parse response and returns the urls
+
+        Returns: urls (list)
+                [[Tile1, url1], [Title2, url2], ...]
+        """
+        urls = []
+        for h in soup.findAll('li', attrs={'class': 'vr vres'}):
+            t = h.find('a', attrs={'class': 'ng'})
+            r = t.get('data-rurl')
+            titleDiv = t.find('div', attrs={'class': 'v-meta bx-bb'})
+            title = titleDiv.find('h3').getText()
+            urls.append({
+                'title': title,
+                'link': r
+            })
+
+        print('Yahoo parsed: ' + str(urls))
+
+        return urls
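
A quick way to see the new parser's output shape without hitting the network is to feed it a hand-written fragment shaped like the markup parse_video_response() looks for: an li with class "vr vres" whose anchor (class "ng") carries the target URL in data-rurl and wraps the title in an h3. The HTML below is an illustrative stand-in, not Yahoo's actual response, and the import path is assumed from the file layout.

    from bs4 import BeautifulSoup
    from app.scrapers.yahoo import Yahoo

    # Stand-in markup matching the selectors used above; the real page
    # served by video.search.yahoo.com will differ.
    html = '''
    <li class="vr vres">
      <a class="ng" data-rurl="https://example.com/video1">
        <div class="v-meta bx-bb"><h3>Sample video title</h3></div>
      </a>
    </li>
    '''

    soup = BeautifulSoup(html, 'html.parser')
    results = Yahoo().parse_video_response(soup)
    # results == [{'title': 'Sample video title', 'link': 'https://example.com/video1'}]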

app/scrapers/youtube.py  (+1)

@@ -9,6 +9,7 @@ def __init__(self):
         Scraper.__init__(self)
         self.url = 'https://www.youtube.com/results'
         self.queryKey = 'search_query'
+        self.name = 'youtube'

     def parse_response(self, soup):
         """ Parse the response and return list of urls
