
Commit b840bee
Prevent app from crashing
rupav committed Feb 6, 2018
1 parent 8de3e50 commit b840bee
Showing 4 changed files with 44 additions and 17 deletions.
app/scrapers/__init__.py (6 changes: 3 additions & 3 deletions)
@@ -43,7 +43,7 @@ def feed_gen(query, engine, count=10, qtype=''):
                  'tyoutube': 'youtube'}
     engine = old_names.get(engine, engine)
     if engine in ('quora', 'youtube'):
-        urls = scrapers[engine].search_without_count(query)
+        urls, status_code = scrapers[engine].search_without_count(query)
     else:
-        urls = scrapers[engine].search(query, count, qtype)
-    return urls
+        urls, status_code = scrapers[engine].search(query, count, qtype)
+    return (urls, status_code)
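
Every scraper entry point now hands back a (urls, status_code) pair, so callers must unpack two values. A minimal usage sketch (the import path assumes this repo's layout; the query and engine name are made up):

    from app.scrapers import feed_gen

    urls, status_code = feed_gen('flask tutorial', 'quora')
    if urls is None:
        # The scraper hit a 400/404; the code is propagated instead of crashing.
        print('scrape failed with HTTP {}'.format(status_code))
    else:
        print('{} results, status {}'.format(len(urls), status_code))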
app/scrapers/generalized.py (25 changes: 19 additions & 6 deletions)
@@ -42,8 +42,12 @@ def get_page(self, query, startIndex=0, qtype=''):
         if self.name == 'mojeek' and qtype == 'news':
             payload['fmt'] = 'news'
         response = requests.get(url, headers=self.headers, params=payload)
+        status_code = response.status_code
+        print(status_code)
+        if(status_code == 400 or status_code == 404):
+            return (None, status_code)
         print(response.url)
-        return response
+        return (response, status_code)

     @staticmethod
     def parse_response(soup):
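
Reviewer note: only 400 and 404 are trapped here, so a 500 or 503 still flows into the parser, and the bare print(status_code) looks like a debug leftover. A broader guard could key off requests' own success flag (a sketch under those assumptions, not part of this commit):

    import requests

    def fetch_page(url, headers=None, params=None):
        # Hypothetical variant: response.ok is False for any status >= 400,
        # so this catches 500s as well as the 400/404 handled in the hunk above.
        response = requests.get(url, headers=headers, params=params)
        if not response.ok:
            return (None, response.status_code)
        return (response, response.status_code)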
@@ -64,16 +68,22 @@ def search(self, query, num_results, qtype=''):
         """
         urls = []
         current_start = self.defaultStart

         while (len(urls) < num_results):
-            response = self.get_page(query, current_start, qtype)
+            response, status_code = self.get_page(query, current_start, qtype)
+            if response is None:
+                if(len(urls) == 0):
+                    return (None, status_code)
+                else:
+                    print("Couldn't fetch more results.")
+                    return (urls, 200)
             soup = BeautifulSoup(response.text, 'html.parser')
             new_results = self.call_appropriate_parser(qtype, soup)
-            if new_results is None:
+            if new_results is None or len(new_results) == 0:
                 print("Couldn't fetch more results.")
                 break
             urls.extend(new_results)
             current_start = self.next_start(current_start, new_results)
-        return urls[: num_results]
+        return (urls[: num_results], 200)

     def call_appropriate_parser(self, qtype, soup):
         new_results = ''
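
The pagination loop now separates two failure modes: a failed first page propagates the scraper's status code, while a failure after some results returns the partial list with a 200. Condensed into a hypothetical helper mirroring the branch above:

    def resolve_outcome(urls, status_code, page_failed):
        # Hypothetical condensation of the new branching in search():
        #   first page failed        -> (None, status_code)
        #   later page failed        -> (partial urls, 200)
        #   enough results collected -> (urls, 200)
        if page_failed and not urls:
            return (None, status_code)
        return (urls, 200)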
@@ -95,6 +105,9 @@ def search_without_count(self, query):
         urls = []
         payload = {self.queryKey: query}
         response = requests.get(self.url, headers=self.headers, params=payload)
+        status_code = response.status_code
+        if(status_code == 400 or status_code == 404):
+            return (None, status_code)
         soup = BeautifulSoup(response.text, 'html.parser')
         urls = self.parse_response(soup)
-        return urls
+        return (urls, 200)
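
The quora/youtube path returns the same pair, and the 400/404 guard duplicates the one in get_page, so the two copies need to be kept in sync. A hypothetical direct call (assuming the scrapers registry in app/scrapers/__init__.py is importable; network access required):

    from app.scrapers import scrapers

    urls, status_code = scrapers['quora'].search_without_count('python')
    if urls is None:
        print('quora scraper returned HTTP {}'.format(status_code))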
app/scrapers/parsijoo.py (7 changes: 4 additions & 3 deletions)
@@ -88,7 +88,8 @@ def parse_news_response(soup):
             title = div.a.getText()
             link = unquote(div.a.get('href'))
             urls.append({'title': title, 'link': link})
-
-        print('Parsijoo parsed: ' + str(urls))
-
+        try:
+            print('Parsijoo parsed: ' + str(urls))
+        except Exception:
+            pass
         return urls
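
Presumably the debug print is wrapped because Parsijoo results carry Persian text, and printing it can raise UnicodeEncodeError on consoles that cannot encode it. A narrower guard would name that exception instead of swallowing everything (sketch with a made-up sample, not part of the commit):

    urls = [{'title': '\u062e\u0628\u0631', 'link': 'https://example.com'}]  # sample Persian title
    try:
        print('Parsijoo parsed: ' + str(urls))
    except UnicodeEncodeError:
        # Only the encoding failure is expected here; other errors should surface.
        pass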
app/server.py (23 changes: 18 additions & 5 deletions)
@@ -36,11 +36,24 @@ def store(url, links):
 def index():
     return render_template('index.html', engines_list=scrapers.keys())

+'''
+def bad_request(error):
+    message = {'Error': error[1], 'Status Code': error[0]}
+    response = dicttoxml(message) #if error[2] == 'xml' else message
+    print(response)
+    return Response(response, mimetype='text/xml')
+    #return jsonify(response)
+'''
+
+
 def bad_request(error):
     message = {'Error': error[1], 'Status Code': error[0]}
-    response = dicttoxml(message) if error[2] == 'xml' else json.dumps(message)
-    return make_response(response, error[0])
+    print(error[2])
+    if error[2] == 'xml':
+        print("Its XML!!!!!")
+        return Response(dicttoxml(message), mimetype='text/xml')
+    else:
+        return jsonify(message)


 @app.route('/api/v1/search/<search_engine>', methods=['GET'])
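
Reviewer note: the rewritten bad_request answers with HTTP 200 in both branches, because neither Response(...) nor jsonify(...) receives error[0] as a status, whereas the deleted make_response(response, error[0]) preserved it. A variant that keeps the status code (a sketch, assuming the dicttoxml package already used in this file):

    from dicttoxml import dicttoxml
    from flask import Response, jsonify

    def bad_request_keep_status(error):
        # Hypothetical variant: same body as the new bad_request, but the
        # HTTP status code from error[0] is preserved on the response.
        message = {'Error': error[1], 'Status Code': error[0]}
        if error[2] == 'xml':
            return Response(dicttoxml(message), mimetype='text/xml',
                            status=error[0])
        response = jsonify(message)
        response.status_code = error[0]
        return response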
@@ -54,7 +67,7 @@ def search(search_engine):

     engine = search_engine
     if engine not in scrapers:
-        error = [404, 'Incorrect search engine', engine]
+        error = [404, 'Incorrect search engine', qformat]
         return bad_request(error)

     query = request.args.get('query')
@@ -68,12 +81,12 @@
     if result:
         print("cache hit: {}".format(engine_and_query))
     else:
-        result = feed_gen(query, engine, count, qtype)
+        result, status_code = feed_gen(query, engine, count, qtype)
         if result:
             # store the result in the cache to speed up future searches
             store(engine_and_query, result)
         else:
-            error = [404, 'No response', engine_and_query]
+            error = [status_code, 'No response', qformat]
             return bad_request(error)

     try:
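
End to end, a scraper failure now reaches the client as an error document built by bad_request (error[2] carries qformat so the body matches the requested format) instead of an unhandled exception in the worker. A hypothetical request against a local instance (host, port, and the format parameter name are assumptions inferred from qformat above):

    import requests

    resp = requests.get('http://localhost:7001/api/v1/search/quora',
                        params={'query': 'python', 'format': 'json'})
    # A failed scrape now yields a JSON/XML error body rather than a 500 traceback.
    print(resp.status_code, resp.text)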
