From 70899f8ac5b7105870642c9c26df7a7cadbbb183 Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Sat, 20 Jan 2018 17:55:47 +0530 Subject: [PATCH] Addresses #320 and #321 Add Image/Video search support for Parsijoo --- .travis.yml | 2 +- app/scrapers/__init__.py | 4 +++ app/scrapers/generalized.py | 31 ++++++++++++++++++++++ app/scrapers/parsijoo.py | 39 ++++++++++++++++++++++++++++ app/server.py | 9 ++++--- package-lock.json | 51 +++++++++++++++++++++++++++++++++++++ 6 files changed, 132 insertions(+), 4 deletions(-) create mode 100644 package-lock.json diff --git a/.travis.yml b/.travis.yml index a2c45cc4..9541025a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ install: - pip install -r requirements.txt before_script: - - flake8 . --count --max-complexity=15 --show-source --statistics + - flake8 . --count --max-complexity=16 --show-source --statistics --max-line-length=100 script: - python -m app.server > /dev/null & - pytest --cov=./ diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py index fa6a727b..0e751407 100644 --- a/app/scrapers/__init__.py +++ b/app/scrapers/__init__.py @@ -42,6 +42,10 @@ def feed_gen(query, engine, count=10, qtype=''): engine = old_names.get(engine, engine) if engine in ('quora', 'youtube'): urls = scrapers[engine].search_without_count(query) + elif (engine in ['parsijoo']) and (qtype == 'isch'): + urls = scrapers[engine].image_search_without_count(query) + elif (engine in ['parsijoo']) and (qtype == 'vid'): + urls = scrapers[engine].video_search_without_count(query) else: urls = scrapers[engine].search(query, count, qtype) return urls diff --git a/app/scrapers/generalized.py b/app/scrapers/generalized.py index e87df5ff..590f52eb 100644 --- a/app/scrapers/generalized.py +++ b/app/scrapers/generalized.py @@ -81,3 +81,34 @@ def search_without_count(self, query): soup = BeautifulSoup(response.text, 'html.parser') urls = self.parse_response(soup) return urls + + def video_search_without_count(self, query): + """ + Search for the query and return set of urls + Returns: list + """ + urls = [] + if self.name in ['parsijoo']: + url = self.videoURL + payload = {self.queryKey: query} + response = requests.get(url, headers=self.headers, params=payload) + soup = BeautifulSoup(response.text, 'html.parser') + urls = self.parse_video_response(soup) + if urls == []: + return "No video with this Keyword" + else: + return urls + + def image_search_without_count(self, query): + """ + Search for the query and return set of urls + Returns: list + """ + urls = [] + if self.name in ['parsijoo']: + url = self.imageURL + payload = {self.queryKey: query} + response = requests.get(url, headers=self.headers, params=payload) + soup = BeautifulSoup(response.text, 'html.parser') + urls = self.parse_image_response(soup) + return urls diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py index a1134bd9..c462ea1d 100644 --- a/app/scrapers/parsijoo.py +++ b/app/scrapers/parsijoo.py @@ -8,6 +8,8 @@ class Parsijoo(Scraper): def __init__(self): Scraper.__init__(self) self.url = 'https://parsijoo.ir/web' + self.imageURL = 'https://image.parsijoo.ir/image' + self.videoURL = 'https://video.parsijoo.ir/video' self.defaultStart = 0 self.startKey = 'co' self.name = 'parsijoo' @@ -28,3 +30,40 @@ def parse_response(self, soup): print('Parsijoo parsed: ' + str(urls)) return urls + + def parse_video_response(self, soup): + """ Parse response and returns the urls + + Returns: urls (list) + [[Tile1, url1], [Title2, url2], ...] + """ + urls = [] + for a in soup.findAll('a', attrs={'class': 'over-page'}): + title = a.get('title') + url = self.videoURL + a.get('href') + urls.append({ + 'title': title, + 'link': url + }) + + print('Parsijoo parsed: ' + str(urls)) + + return urls + + def parse_image_response(self, soup): + """ Parse response and returns the urls + + Returns: urls (list) + [[url1], [url2], ...] + """ + urls = [] + for div in soup.findAll('div', attrs={'class': 'image-container overflow'}): + a = div.find('a') + url = 'https://image.parsijoo.ir' + a.get('href') + urls.append({ + 'link': url + }) + + print('Parsijoo parsed: ' + str(urls)) + + return urls diff --git a/app/server.py b/app/server.py index 104fc655..fee9a1cd 100644 --- a/app/server.py +++ b/app/server.py @@ -6,8 +6,10 @@ from dicttoxml import dicttoxml from flask import (Flask, Response, abort, jsonify, make_response, render_template, request) - -from app.scrapers import feed_gen, scrapers +try: + from app.scrapers import feed_gen, scrapers +except ImportError: + from scrapers import feed_gen, scrapers DISABLE_CACHE = True # Temporarily disable the MongoDB cache if DISABLE_CACHE: @@ -77,7 +79,8 @@ def search(search_engine): unicode # unicode is undefined in Python 3 so NameError is raised for line in result: line['link'] = line['link'].encode('utf-8') - line['title'] = line['title'].encode('utf-8') + if 'title' in line: + line['title'] = line['title'].encode('utf-8') if 'desc' in line: line['desc'] = line['desc'].encode('utf-8') except NameError: diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 00000000..beab31fb --- /dev/null +++ b/package-lock.json @@ -0,0 +1,51 @@ +{ + "name": "query-server", + "version": "0.1.0", + "lockfileVersion": 1, + "requires": true, + "dependencies": { + "bower": { + "version": "1.8.2", + "resolved": "https://registry.npmjs.org/bower/-/bower-1.8.2.tgz", + "integrity": "sha1-rfU1KcjUrwLvJPuNU0HBQZ0z4vc=" + }, + "ci-info": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-1.1.2.tgz", + "integrity": "sha512-uTGIPNx/nSpBdsF6xnseRXLLtfr9VLqkz8ZqHXr3Y7b6SftyRxBGjwMtJj1OhNbmlc1wZzLNAlAcvyIiE8a6ZA==", + "dev": true + }, + "husky": { + "version": "0.14.3", + "resolved": "https://registry.npmjs.org/husky/-/husky-0.14.3.tgz", + "integrity": "sha512-e21wivqHpstpoiWA/Yi8eFti8E+sQDSS53cpJsPptPs295QTOQR0ZwnHo2TXy1XOpZFD9rPOd3NpmqTK6uMLJA==", + "dev": true, + "requires": { + "is-ci": "1.1.0", + "normalize-path": "1.0.0", + "strip-indent": "2.0.0" + } + }, + "is-ci": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-ci/-/is-ci-1.1.0.tgz", + "integrity": "sha512-c7TnwxLePuqIlxHgr7xtxzycJPegNHFuIrBkwbf8hc58//+Op1CqFkyS+xnIMkwn9UsJIwc174BIjkyBmSpjKg==", + "dev": true, + "requires": { + "ci-info": "1.1.2" + } + }, + "normalize-path": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-1.0.0.tgz", + "integrity": "sha1-MtDkcvkf80VwHBWoMRAY07CpA3k=", + "dev": true + }, + "strip-indent": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/strip-indent/-/strip-indent-2.0.0.tgz", + "integrity": "sha1-XvjbKV0B5u1sv3qrlpmNeCJSe2g=", + "dev": true + } + } +}