Fixes #462 Add Video search support for Ask #463

Merged
5 commits merged on Jan 26, 2018
2 changes: 2 additions & 0 deletions app/scrapers/__init__.py
@@ -46,6 +46,8 @@ def feed_gen(query, engine, count=10, qtype=''):
        urls = scrapers[engine].video_search_without_count(query)
    elif engine in ('bing',) and qtype == 'isch':
        urls = scrapers[engine].image_search_without_count(query)
    elif engine in ('ask',) and qtype == 'vid':
        urls = scrapers[engine].video_search(query, count, qtype)
    else:
        urls = scrapers[engine].search(query, count, qtype)
    return urls
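
For reference, a minimal sketch of how the new branch is reached through feed_gen; the query string and count are illustrative, not part of the PR, and this hits the live Ask endpoint:

    from app.scrapers import feed_gen

    # With engine='ask' and qtype='vid', feed_gen now delegates to
    # Ask's video_search and returns at most `count` result dicts
    # with 'title', 'link' and 'desc' keys.
    results = feed_gen('python tutorial', 'ask', count=5, qtype='vid')
    for video in results:
        print(video['title'], video['link'])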
23 changes: 23 additions & 0 deletions app/scrapers/ask.py
@@ -7,6 +7,7 @@ class Ask(Scraper):
    def __init__(self):
        Scraper.__init__(self)
        self.url = 'http://ask.com/web'
        self.videoURL = 'https://www.ask.com/youtube'
        self.defaultStart = 1
        self.startKey = 'page'
        self.name = 'ask'
@@ -35,3 +36,25 @@ def parse_response(soup):
            urls.append({'title': title, 'link': url})
        print('Ask parsed: ' + str(urls))
        return urls

    @staticmethod
    def parse_video_response(soup):
        """ Parse the response and return the urls

        Returns: urls (list)
                [{'title': title1, 'link': link1, 'desc': desc1}, ...]
        """
        urls = []
        for div in soup.findAll('div', attrs={'class': 'v-info'}):
            title = div.div.find('a').getText()
            url = 'https' + div.div.a.get('href')
            desc = div.find('div', attrs={'class': 'desc'}).getText()
            urls.append({
                'title': title,
                'link': url,
                'desc': desc
            })

        print('Ask parsed: ' + str(urls))

        return urls
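
To show the parser's contract outside the unit test added below, a small sketch on a hand-written fragment; it assumes Ask's video results carry scheme-less hrefs (e.g. '://www.youtube.com/…'), which is what the 'https' + href concatenation above suggests:

    from bs4 import BeautifulSoup
    from app.scrapers.ask import Ask

    fragment = '''<div class="v-info">
        <div class="v-title"><a class="title"
            href="://www.youtube.com/watch?v=abc">Sample video</a></div>
        <div class="desc">Sample description</div>
    </div>'''
    soup = BeautifulSoup(fragment, 'html.parser')
    results = Ask().parse_video_response(soup)
    # results == [{'title': 'Sample video',
    #              'link': 'https://www.youtube.com/watch?v=abc',
    #              'desc': 'Sample description'}]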
27 changes: 27 additions & 0 deletions app/scrapers/generalized.py
@@ -29,6 +29,13 @@ def get_page(self, query, startIndex=0, qtype=''):
        if qtype == 'vid':
            if self.name in ['yahoo']:
                url = self.videoURL
            elif self.name in ['ask']:
                url = self.videoURL
                payload = {self.queryKey: query, self.startKey: startIndex}
                response = requests.get(
                    url, headers=self.headers, params=payload
                )
                return response
        else:
            url = self.url
            payload = {self.queryKey: query, self.startKey: startIndex,
@@ -85,6 +92,26 @@ def search_without_count(self, query):
        urls = self.parse_response(soup)
        return urls

    def video_search(self, query, num_results, qtype=''):
        urls = []
        current_start = self.defaultStart

        while (len(urls) < num_results):
            response = self.get_page(query, current_start, qtype)
            soup = BeautifulSoup(response.text, 'html.parser')
            if qtype == 'vid':
                if self.name in ['yahoo', 'ask']:
                    new_results = self.parse_video_response(soup)
                else:
                    new_results = self.parse_response(soup)
            else:
                new_results = self.parse_response(soup)
            if new_results is None:
                break
            urls.extend(new_results)
            current_start = self.next_start(current_start, new_results)
        return urls[: num_results]

    def video_search_without_count(self, query):
        """
        Search for the query and return set of urls
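
The same path can also be driven directly on the scraper, bypassing feed_gen; a rough sketch, with an illustrative query and count, and note it makes live requests to ask.com:

    from app.scrapers.ask import Ask

    ask = Ask()
    # video_search keeps requesting pages from self.videoURL
    # (https://www.ask.com/youtube) and parsing them with
    # parse_video_response until it has gathered at least 10 results,
    # then returns the first 10.
    videos = ask.video_search('open source', 10, qtype='vid')
    for v in videos:
        print(v['title'], '->', v['link'])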
18 changes: 18 additions & 0 deletions test/test_ask.py
@@ -56,3 +56,21 @@ def test_parse_response_without_desc():
        }
    ]
    assert resp == expected_resp


def test_parse_video_response():
    html_div = """<div class="v-info"><div class="v-title">
    <a class="title" href="mock_url">mock_title</a></div>
    <div class="desc">mock_desc</div>
    </div>"""
    stub_soup_div = BeautifulSoup(html_div, 'html.parser')
    resp = Ask().parse_video_response(stub_soup_div)
    url_video = 'https' + 'mock_url'
    expected_resp = [
        {
            'link': url_video,
            'title': u'mock_title',
            'desc': u'mock_desc'
        }
    ]
    assert resp == expected_resp
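
The new test is self-contained (it stubs the HTML fragment rather than hitting Ask), so it can be run in isolation, for example through pytest's programmatic entry point:

    import pytest

    # Equivalent to: pytest test/test_ask.py::test_parse_video_response
    pytest.main(['test/test_ask.py::test_parse_video_response'])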