diff --git a/.gitignore b/.gitignore
index 2c441397..c30be02f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,5 @@ app/static/bower_components/*
 *.swp
 Pipfile
 Pipfile.lock
+.vscode/*
+
diff --git a/README.md b/README.md
index 8fee6ec9..6c576aac 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ The API(s) provided by query-server are as follows:
 
 > *query* : query can be any string
 
-> *format* : [`json`, `xml`]
+> *format* : [`json`, `xml`, `csv`]
 
 A sample query : `/api/v1/search/bing?query=fossasia&format=xml&num=10`
diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py
index 0650cfe5..c17969c3 100644
--- a/app/scrapers/__init__.py
+++ b/app/scrapers/__init__.py
@@ -35,7 +35,7 @@ def small_test():
     assert isinstance(scrapers['google'].search('fossasia', 1), list)
 
 
-def feed_gen(query, engine, count=10):
+def feed_gen(query, engine, count=10, qtype=''):
     engine = engine.lower()
     # provide temporary backwards compatibility for old names
     old_names = {'ubaidu': 'baidu',
@@ -45,5 +45,5 @@
     if engine in ('quora', 'youtube'):
         urls = scrapers[engine].search_without_count(query)
     else:
-        urls = scrapers[engine].search(query, count)
+        urls = scrapers[engine].search(query, count, qtype)
     return urls
diff --git a/app/scrapers/generalized.py b/app/scrapers/generalized.py
index ecbb7c27..3ccebaf6 100644
--- a/app/scrapers/generalized.py
+++ b/app/scrapers/generalized.py
@@ -20,12 +20,14 @@ class Scraper:
     def __init__(self):
         pass
 
-    def get_page(self, query, startIndex=0):
+    def get_page(self, query, startIndex=0, qtype=''):
         """ Fetch the google search results page
         Returns : Results Page
         """
-        payload = {self.queryKey: query, self.startKey: startIndex}
+        payload = {self.queryKey: query, self.startKey: startIndex,
+                   self.qtype: qtype}
         response = requests.get(self.url, headers=self.headers, params=payload)
+        print(response.url)
         return response
 
     def parse_response(self, soup):
@@ -34,7 +36,7 @@
     def next_start(self, current_start, prev_results):
         return current_start + len(prev_results)
 
-    def search(self, query, num_results):
+    def search(self, query, num_results, qtype=''):
         """ Search for the query and return set of urls
         Returns: list
         """
@@ -43,7 +45,7 @@
         current_start = self.defaultStart
 
         while(len(urls) < num_results):
-            response = self.get_page(query, current_start)
+            response = self.get_page(query, current_start, qtype)
             soup = BeautifulSoup(response.text, 'html.parser')
             new_results = self.parse_response(soup)
             if new_results is None:
diff --git a/app/scrapers/google.py b/app/scrapers/google.py
index 3181de37..b8ebd79b 100644
--- a/app/scrapers/google.py
+++ b/app/scrapers/google.py
@@ -10,6 +10,7 @@ def __init__(self):
         self.url = 'https://www.google.com/search'
         self.defaultStart = 0
         self.startKey = 'start'
+        self.qtype = 'tbm'
 
     def next_start(self, current_start, prev_results):
         return current_start + len(prev_results)
diff --git a/app/server.py b/app/server.py
index 9ce35164..c0d9073a 100644
--- a/app/server.py
+++ b/app/server.py
@@ -44,6 +44,7 @@ def search(search_engine):
     try:
         count = int(request.args.get('num', 10))
         qformat = request.args.get('format', 'json').lower()
+        qtype = request.args.get('type', '')
         if qformat not in ('json', 'xml', 'csv'):
             abort(400, 'Not Found - undefined format')
 
@@ -63,7 +64,7 @@
     if result:
         print("cache hit: {}".format(engine_and_query))
     else:
-        result = feed_gen(query, engine, count)
+        result = feed_gen(query, engine, count, qtype)
         if result:
             # store the result in the cache to speed up future searches
             store(engine_and_query, result)
diff --git a/app/templates/index.html b/app/templates/index.html
index d4c4e557..fd045997 100644
--- a/app/templates/index.html
+++ b/app/templates/index.html
@@ -64,52 +64,70 @@
[index.html hunk body not recoverable from the source; the only surviving fragments are three copies of the page title "query-server"]
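Taken together, these changes thread a new optional `type` query parameter from the HTTP layer (`request.args.get('type', '')` in `server.py`) through `feed_gen` and `Scraper.search` into the outbound request payload, with each engine declaring which upstream parameter the value maps to; the Google scraper maps it to Google's `tbm` parameter. Below is a minimal standalone sketch of the request `get_page` would now issue for the Google scraper, assuming the base class uses `queryKey = 'q'` and `startKey = 'start'` (not shown in this diff), and using one of Google's published `tbm` codes (`isch` for images, `nws` for news, `vid` for videos):

```python
import requests

# Hypothetical reproduction of Scraper.get_page after this PR, specialized
# to the Google scraper (queryKey='q', startKey='start', qtype='tbm').
# The 'type' value from the API call is passed through verbatim as tbm.
url = 'https://www.google.com/search'
payload = {'q': 'fossasia', 'start': 0, 'tbm': 'nws'}  # 'nws' selects news results

response = requests.get(url, params=payload)
print(response.url)  # e.g. https://www.google.com/search?q=fossasia&start=0&tbm=nws
```

A matching call against the server would look like `/api/v1/search/google?query=fossasia&type=isch&format=json`. One caveat: in this diff only `GoogleScraper` defines `self.qtype`, so as written `get_page` would raise `AttributeError` for engines that never set the attribute; a default such as `self.qtype = ''` in `Scraper.__init__` would presumably be needed as a follow-up.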