From b840beed2a4f098dd52a1caeb25f572bbb6e781b Mon Sep 17 00:00:00 2001
From: rupav jain
Date: Tue, 6 Feb 2018 19:37:44 +0530
Subject: [PATCH] Prevent app from crashing

---
 app/scrapers/__init__.py    |  6 +++---
 app/scrapers/generalized.py | 22 +++++++++++++++++-----
 app/scrapers/parsijoo.py    |  7 ++++---
 app/server.py               | 12 +++++++-----
 4 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py
index 548fe1f8..209f2d5c 100644
--- a/app/scrapers/__init__.py
+++ b/app/scrapers/__init__.py
@@ -43,7 +43,7 @@ def feed_gen(query, engine, count=10, qtype=''):
                  'tyoutube': 'youtube'}
     engine = old_names.get(engine, engine)
     if engine in ('quora', 'youtube'):
-        urls = scrapers[engine].search_without_count(query)
+        urls, status_code = scrapers[engine].search_without_count(query)
     else:
-        urls = scrapers[engine].search(query, count, qtype)
-    return urls
+        urls, status_code = scrapers[engine].search(query, count, qtype)
+    return (urls, status_code)
diff --git a/app/scrapers/generalized.py b/app/scrapers/generalized.py
index 12ade912..9fe2e557 100644
--- a/app/scrapers/generalized.py
+++ b/app/scrapers/generalized.py
@@ -42,8 +42,11 @@ def get_page(self, query, startIndex=0, qtype=''):
         if self.name == 'mojeek' and qtype == 'news':
             payload['fmt'] = 'news'
         response = requests.get(url, headers=self.headers, params=payload)
+        status_code = response.status_code
+        if status_code in (400, 404):
+            return (None, status_code)
         print(response.url)
-        return response
+        return (response, status_code)
 
     @staticmethod
     def parse_response(soup):
@@ -64,16 +67,22 @@ def search(self, query, num_results, qtype=''):
         """
         urls = []
         current_start = self.defaultStart
 
         while (len(urls) < num_results):
-            response = self.get_page(query, current_start, qtype)
+            response, status_code = self.get_page(query, current_start, qtype)
+            if response is None:
+                if len(urls) == 0:
+                    return (None, status_code)
+                print("Couldn't fetch more results.")
+                return (urls, 200)
             soup = BeautifulSoup(response.text, 'html.parser')
             new_results = self.call_appropriate_parser(qtype, soup)
-            if new_results is None:
+            if new_results is None or len(new_results) == 0:
+                print("Couldn't fetch more results.")
                 break
             urls.extend(new_results)
             current_start = self.next_start(current_start, new_results)
-        return urls[: num_results]
+        return (urls[: num_results], 200)
 
     def call_appropriate_parser(self, qtype, soup):
         new_results = ''
@@ -95,6 +104,9 @@ def search_without_count(self, query):
         urls = []
         payload = {self.queryKey: query}
         response = requests.get(self.url, headers=self.headers, params=payload)
+        status_code = response.status_code
+        if status_code in (400, 404):
+            return (None, status_code)
         soup = BeautifulSoup(response.text, 'html.parser')
         urls = self.parse_response(soup)
-        return urls
+        return (urls, 200)
diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py
index 0edc14be..f687430d 100644
--- a/app/scrapers/parsijoo.py
+++ b/app/scrapers/parsijoo.py
@@ -88,7 +88,8 @@ def parse_news_response(soup):
             title = div.a.getText()
             link = unquote(div.a.get('href'))
             urls.append({'title': title, 'link': link})
-
-        print('Parsijoo parsed: ' + str(urls))
-
+        try:
+            print('Parsijoo parsed: ' + str(urls))
+        except UnicodeEncodeError:
+            pass
         return urls
diff --git a/app/server.py b/app/server.py
index e8ee34d9..f4cbe8e7 100644
--- a/app/server.py
+++ b/app/server.py
@@ -36,12 +36,14 @@ def store(url, links):
 def index():
     return render_template('index.html',
                            engines_list=scrapers.keys())
 
 
 def bad_request(error):
     message = {'Error': error[1], 'Status Code': error[0]}
-    response = dicttoxml(message) if error[2] == 'xml' else json.dumps(message)
-    return make_response(response, error[0])
+    if error[2] == 'xml':
+        return Response(dicttoxml(message), status=error[0],
+                        mimetype='text/xml')
+    return make_response(jsonify(message), error[0])
 
 
 @app.route('/api/v1/search/<search_engine>', methods=['GET'])
@@ -54,7 +56,7 @@ def search(search_engine):
 
     engine = search_engine
     if engine not in scrapers:
-        error = [404, 'Incorrect search engine', engine]
+        error = [404, 'Incorrect search engine', qformat]
         return bad_request(error)
 
     query = request.args.get('query')
@@ -68,12 +70,12 @@ def search(search_engine):
     if result:
         print("cache hit: {}".format(engine_and_query))
     else:
-        result = feed_gen(query, engine, count, qtype)
+        result, status_code = feed_gen(query, engine, count, qtype)
         if result:
             # store the result in the cache to speed up future searches
             store(engine_and_query, result)
         else:
-            error = [404, 'No response', engine_and_query]
+            error = [status_code, 'No response', qformat]
             return bad_request(error)
 
     try:
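
A quick way for reviewers to exercise the new (urls, status_code) contract
end to end -- a minimal sketch, assuming the patched app package is
importable; 'google' and the query string are placeholder values, not part
of this patch:

    from app.scrapers import feed_gen

    # feed_gen now returns (urls, status_code) instead of a bare list;
    # urls is None when the upstream engine answered 400/404.
    urls, status_code = feed_gen('hello world', 'google', count=5)
    if urls is None:
        # The server maps this case to bad_request() instead of crashing.
        print('engine error, HTTP {}'.format(status_code))
    else:
        for result in urls:
            print(result['title'], result['link'])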