Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes #464: Add image search support for Yahoo #465

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ install:
- pip install -r requirements.txt

before_script:
- flake8 . --count --max-complexity=15 --show-source --statistics
- flake8 . --count --max-complexity=16 --show-source --statistics
script:
- python -m app.server > /dev/null &
- pytest --cov=./
Expand Down
2 changes: 2 additions & 0 deletions app/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def feed_gen(query, engine, count=10, qtype=''):
engine = old_names.get(engine, engine)
if engine in ('quora', 'youtube'):
urls = scrapers[engine].search_without_count(query)
elif engine in ('yahoo',) and qtype == 'isch':
urls = scrapers[engine].image_search_without_count(query)
else:
urls = scrapers[engine].search(query, count, qtype)
return urls
14 changes: 14 additions & 0 deletions app/scrapers/generalized.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,17 @@ def search_without_count(self, query):
soup = BeautifulSoup(response.text, 'html.parser')
urls = self.parse_response(soup)
return urls

def image_search_without_count(self, query):
    """
    Run an image search for the query and return the parsed results.

    Only the scraper whose name is 'yahoo' is handled; for any other
    scraper an empty list is returned without issuing a request.
    Returns: list
    """
    # Guard clause: image search is wired up for Yahoo only.
    if self.name not in ['yahoo']:
        return []

    response = requests.get(
        self.imageURL,
        headers=self.headers,
        params={'p': query},
    )
    parsed_page = BeautifulSoup(response.text, 'html.parser')
    return self.parse_image_response(parsed_page)
19 changes: 19 additions & 0 deletions app/scrapers/yahoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def __init__(self):
Scraper.__init__(self)
self.url = 'https://search.yahoo.com/search'
self.videoURL = 'https://video.search.yahoo.com/search/video'
self.imageURL = 'https://images.search.yahoo.com/search/images'
self.defaultStart = 1
self.startKey = 'b'
self.name = 'yahoo'
Expand Down Expand Up @@ -63,3 +64,21 @@ def parse_video_response(soup):
print('Yahoo parsed: ' + str(urls))

return urls

@staticmethod
def parse_image_response(soup):
""" Parse response and returns the urls

Returns: urls (list)
[[url1], [url2], ...]
"""
urls = []
for li in soup.findAll('li', attrs={'class': 'ld'}):
url = 'https://images.search.yahoo.com' + li.a.get('href')
urls.append({
'link': url
})

print('Yahoo parsed: ' + str(urls))

return urls
3 changes: 2 additions & 1 deletion app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ def search(search_engine):
unicode # unicode is undefined in Python 3 so NameError is raised
for line in result:
line['link'] = line['link'].encode('utf-8')
line['title'] = line['title'].encode('utf-8')
if 'title' in line:
line['title'] = line['title'].encode('utf-8')
if 'desc' in line:
line['desc'] = line['desc'].encode('utf-8')
except NameError:
Expand Down