From d16a1cd3f075725b3e249c64291e69404acd8e63 Mon Sep 17 00:00:00 2001 From: Eyitayo Ogunbiyi Date: Wed, 24 Jun 2020 15:27:24 +0100 Subject: [PATCH 01/15] Initial refactor of howdoi.py into separate plugin Co-authored-by: c.decal@campus.unimib.it --- howdoi/howdoi.py | 14 +- howdoi/plugins/BasePlugin.py | 301 +++++++++++++++++++++++++++++++++++ howdoi/plugins/__init__.py | 0 3 files changed, 310 insertions(+), 5 deletions(-) create mode 100644 howdoi/plugins/BasePlugin.py create mode 100644 howdoi/plugins/__init__.py diff --git a/howdoi/howdoi.py b/howdoi/howdoi.py index 235a52562..adf82d4bc 100755 --- a/howdoi/howdoi.py +++ b/howdoi/howdoi.py @@ -30,6 +30,8 @@ from requests.exceptions import ConnectionError from requests.exceptions import SSLError +from howdoi.plugins import BasePlugin + # Handle imports for Python 2 and 3 if sys.version < '3': import codecs @@ -365,8 +367,8 @@ def _get_answers(args): answer = ANSWER_HEADER.format(link, answer, STAR_HEADER) answer += '\n' answers.append({ - 'answer': answer, - 'link': link, + 'answer': answer, + 'link': link, 'position': current_position }) @@ -393,13 +395,13 @@ def _format_answers(res, args): return json.dumps(res) formatted_answers = [] - + for answer in res: next_ans = answer["answer"] if args["link"]: # if we only want links next_ans = answer["link"] formatted_answers.append(next_ans) - + return build_splitter().join(formatted_answers) @@ -443,7 +445,8 @@ def howdoi(raw_query): return _format_answers(res, args) try: - res = _get_answers(args) + plugin = BasePlugin.BasePlugin() + res = plugin.search() if not res: res = {"error": "Sorry, couldn\'t find any help with that topic\n"} cache.set(cache_key, res) @@ -469,6 +472,7 @@ def get_parser(): action='store_true') parser.add_argument('-e', '--engine', help='change search engine for this query only (google, bing, duckduckgo)', dest='search_engine', nargs="?", default='google') + parser.add_argument('--plugin', help='use the base plugin', type=str, default='stackoverflow') return parser diff --git a/howdoi/plugins/BasePlugin.py b/howdoi/plugins/BasePlugin.py new file mode 100644 index 000000000..de817f952 --- /dev/null +++ b/howdoi/plugins/BasePlugin.py @@ -0,0 +1,301 @@ +import os +import re +import sys + +import appdirs + +from cachelib import FileSystemCache, NullCache + +from pyquery import PyQuery as pq + +from pygments import highlight +from pygments.formatters.terminal import TerminalFormatter +from pygments.lexers import get_lexer_by_name, guess_lexer +from pygments.util import ClassNotFound + + +class BlockError(RuntimeError): + pass + + +# Handle imports for Python 2 and 3 +if sys.version < '3': + import codecs + from urllib import quote as url_quote + from urllib import getproxies + from urlparse import urlparse, parse_qs + + # Handling Unicode: http://stackoverflow.com/a/6633040/305414 + def u(x): + return codecs.unicode_escape_decode(x)[0] +else: + from urllib.request import getproxies + from urllib.parse import quote as url_quote, urlparse, parse_qs + + def u(x): + return x + +# rudimentary standardized 3-level log output + + +def _print_err(x): + print("[ERROR] " + x) + + +_print_ok = print # noqa: E305 + + +def _print_dbg(x): + print("[DEBUG] " + x) # noqa: E302 + + +# CACHE_EMPTY_VAL = "NULL" +CACHE_DIR = appdirs.user_cache_dir('howdoi') +CACHE_ENTRY_MAX = 128 + +if os.getenv('HOWDOI_DISABLE_CACHE'): + cache = NullCache() # works like an always empty cache +else: + cache = FileSystemCache(CACHE_DIR, CACHE_ENTRY_MAX, default_timeout=0) + +ANSWER_HEADER = u('{2} Answer from {0} {2}\n{1}') +STAR_HEADER = u('\u2605') +CACHE_EMPTY_VAL = "NULL" +NO_ANSWER_MSG = '< no answer given >' + +if os.getenv('HOWDOI_DISABLE_SSL'): # Set http instead of https + SCHEME = 'http://' + VERIFY_SSL_CERTIFICATE = False +else: + SCHEME = 'https://' + VERIFY_SSL_CERTIFICATE = True + +BLOCK_INDICATORS = ( + 'form id="captcha-form"', + 'This page appears when Google automatically detects requests coming from your computer ' + 'network which appear to be in violation of the Terms of Service' +) + +BLOCKED_QUESTION_FRAGMENTS = ( + 'webcache.googleusercontent.com', +) + +URL = os.getenv('HOWDOI_URL') or 'stackoverflow.com' + +SEARCH_URLS = { + 'bing': SCHEME + 'www.bing.com/search?q=site:{0}%20{1}&hl=en', + 'google': SCHEME + 'www.google.com/search?q=site:{0}%20{1}&hl=en', + 'duckduckgo': SCHEME + 'duckduckgo.com/?q=site:{0}%20{1}&t=hj&ia=web' +} + + +def _is_blocked(page): + for indicator in BLOCK_INDICATORS: + if page.find(indicator) != -1: + return True + return False + + +class BasePlugin(): + def search(self): + print("Hello search") + pass + + def _add_links_to_text(self, element): + hyperlinks = element.find('a') + + for hyperlink in hyperlinks: + pquery_object = pq(hyperlink) + href = hyperlink.attrib['href'] + copy = pquery_object.text() + if (copy == href): + replacement = copy + else: + replacement = "[{0}]({1})".format(copy, href) + pquery_object.replace_with(replacement) + + def get_link_at_pos(self, links, position): + if not links: + return False + if len(links) >= position: + link = links[position - 1] + else: + link = links[-1] + return link + + def get_text(self, element): + ''' return inner text in pyquery element ''' + self._add_links_to_text(element) + try: + return element.text(squash_space=False) + except TypeError: + return element.text() + + def _get_search_url(self, search_engine): + return SEARCH_URLS.get(search_engine, SEARCH_URLS['google']) + + def _get_links(self, query): + search_engine = os.getenv('HOWDOI_SEARCH_ENGINE', 'google') + search_url = self._get_search_url(search_engine) + + result = self._get_result(search_url.format(URL, url_quote(query))) + if _is_blocked(result): + _print_err('Unable to find an answer because the search engine temporarily blocked the request. ' + 'Please wait a few minutes or select a different search engine.') + raise BlockError("Temporary block by search engine") + + html = pq(result) + return self._extract_links(html, search_engine) + + def _extract_links_from_bing(self, html): + html.remove_namespaces() + return [a.attrib['href'] for a in html('.b_algo')('h2')('a')] + + def _extract_links_from_google(self, html): + return [a.attrib['href'] for a in html('.l')] or \ + [a.attrib['href'] for a in html('.r')('a')] + + def _extract_links_from_duckduckgo(self, html): + html.remove_namespaces() + links_anchors = html.find('a.result__a') + results = [] + for anchor in links_anchors: + link = anchor.attrib['href'] + url_obj = urlparse(link) + parsed_url = parse_qs(url_obj.query).get('uddg', '') + if parsed_url: + results.append(parsed_url[0]) + return results + + def _extract_links(self, html, search_engine): + if search_engine == 'bing': + return self._extract_links_from_bing(html) + if search_engine == 'duckduckgo': + return self._extract_links_from_duckduckgo(html) + return self._extract_links_from_google(html) + + def format_output(self, code, args): + if not args['color']: + return code + lexer = None + + # try to find a lexer using the StackOverflow tags + # or the query arguments + for keyword in args['query'].split() + args['tags']: + try: + lexer = get_lexer_by_name(keyword) + break + except ClassNotFound: + pass + + # no lexer found above, use the guesser + if not lexer: + try: + lexer = guess_lexer(code) + except ClassNotFound: + return code + + return highlight(code, + lexer, + TerminalFormatter(bg='dark')) + + def _is_question(self, link): + for fragment in BLOCKED_QUESTION_FRAGMENTS: + if fragment in link: + return False + return re.search(r'questions/\d+/', link) + + def _get_answer(self, args, links): + link = self.get_link_at_pos(links, args['pos']) + if not link: + return False + + cache_key = link + page = cache.get(link) + if not page: + page = self._get_result(link + '?answertab=votes') + cache.set(cache_key, page) + + html = pq(page) + + first_answer = html('.answer').eq(0) + + instructions = first_answer.find('pre') or first_answer.find('code') + args['tags'] = [t.text for t in html('.post-tag')] + + if not instructions and not args['all']: + text = self.get_text(first_answer.find('.post-text').eq(0)) + elif args['all']: + texts = [] + for html_tag in first_answer.items('.post-text > *'): + current_text = self.get_text(html_tag) + if current_text: + if html_tag[0].tag in ['pre', 'code']: + texts.append(self._format_output(current_text, args)) + else: + texts.append(current_text) + text = '\n'.join(texts) + else: + text = self._format_output(self.get_text(instructions.eq(0)), args) + if text is None: + text = NO_ANSWER_MSG + text = text.strip() + return text + + def _get_questions(self, links): + return [link for link in links if self._is_question(link)] + + def _get_links_with_cache(self, query): + cache_key = query + "-links" + res = cache.get(cache_key) + if res: + if res == CACHE_EMPTY_VAL: + res = False + return res + + links = self._get_links(query) + if not links: + cache.set(cache_key, CACHE_EMPTY_VAL) + + question_links = self._get_questions(links) + cache.set(cache_key, question_links or CACHE_EMPTY_VAL) + + return question_links + + def _get_answers(self, args): + """ + @args: command-line arguments + returns: array of answers and their respective metadata + False if unable to get answers + """ + question_links = self._get_links_with_cache(args['query']) + if not question_links: + return False + + answers = [] + initial_position = args['pos'] + multiple_answers = (args['num_answers'] > 1 or args['all']) + + for answer_number in range(args['num_answers']): + current_position = answer_number + initial_position + args['pos'] = current_position + link = self.get_link_at_pos(question_links, current_position) + answer = self._get_answer(args, question_links) + if not answer: + continue + if not args['link'] and not args['json_output'] and multiple_answers: + answer = ANSWER_HEADER.format(link, answer, STAR_HEADER) + answer += '\n' + answers.append({ + 'answer': answer, + 'link': link, + 'position': current_position + }) + + return answers + + def extract(self): + print("Hello extract") + pass + +# Make StackOverflow plugin diff --git a/howdoi/plugins/__init__.py b/howdoi/plugins/__init__.py new file mode 100644 index 000000000..e69de29bb From cf6f386810a9a581f8a500186b705fa60739f64c Mon Sep 17 00:00:00 2001 From: Cesare De Cal Date: Wed, 24 Jun 2020 17:17:07 +0200 Subject: [PATCH 02/15] Deleted ununsed functions from howdoi.py, imported get_proxies in BasePlugin --- howdoi/howdoi.py | 311 +---------------------------------- howdoi/plugins/BasePlugin.py | 27 ++- 2 files changed, 32 insertions(+), 306 deletions(-) diff --git a/howdoi/howdoi.py b/howdoi/howdoi.py index adf82d4bc..b23c1ee42 100755 --- a/howdoi/howdoi.py +++ b/howdoi/howdoi.py @@ -14,41 +14,21 @@ import argparse import os import appdirs -import re -from cachelib import FileSystemCache, NullCache import json import requests import sys from . import __version__ -from pygments import highlight -from pygments.lexers import guess_lexer, get_lexer_by_name -from pygments.formatters.terminal import TerminalFormatter -from pygments.util import ClassNotFound +from cachelib import FileSystemCache, NullCache -from pyquery import PyQuery as pq from requests.exceptions import ConnectionError from requests.exceptions import SSLError from howdoi.plugins import BasePlugin -# Handle imports for Python 2 and 3 -if sys.version < '3': - import codecs - from urllib import quote as url_quote - from urllib import getproxies - from urlparse import urlparse, parse_qs - - # Handling Unicode: http://stackoverflow.com/a/6633040/305414 - def u(x): - return codecs.unicode_escape_decode(x)[0] -else: - from urllib.request import getproxies - from urllib.parse import quote as url_quote, urlparse, parse_qs - - def u(x): - return x - +CACHE_EMPTY_VAL = "NULL" +CACHE_DIR = appdirs.user_cache_dir('howdoi') +CACHE_ENTRY_MAX = 128 # rudimentary standardized 3-level log output def _print_err(x): print("[ERROR] " + x) @@ -57,19 +37,8 @@ def _print_err(x): print("[ERROR] " + x) _print_ok = print # noqa: E305 def _print_dbg(x): print("[DEBUG] " + x) # noqa: E302 - -if os.getenv('HOWDOI_DISABLE_SSL'): # Set http instead of https - SCHEME = 'http://' - VERIFY_SSL_CERTIFICATE = False -else: - SCHEME = 'https://' - VERIFY_SSL_CERTIFICATE = True - - SUPPORTED_SEARCH_ENGINES = ('google', 'bing', 'duckduckgo') -URL = os.getenv('HOWDOI_URL') or 'stackoverflow.com' - USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0', 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0', @@ -77,44 +46,17 @@ def _print_dbg(x): print("[DEBUG] " + x) # noqa: E302 'Chrome/19.0.1084.46 Safari/536.5'), ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46' 'Safari/536.5'), ) -SEARCH_URLS = { - 'bing': SCHEME + 'www.bing.com/search?q=site:{0}%20{1}&hl=en', - 'google': SCHEME + 'www.google.com/search?q=site:{0}%20{1}&hl=en', - 'duckduckgo': SCHEME + 'duckduckgo.com/?q=site:{0}%20{1}&t=hj&ia=web' -} - -BLOCK_INDICATORS = ( - 'form id="captcha-form"', - 'This page appears when Google automatically detects requests coming from your computer ' - 'network which appear to be in violation of the Terms of Service' -) - -BLOCKED_QUESTION_FRAGMENTS = ( - 'webcache.googleusercontent.com', -) - -STAR_HEADER = u('\u2605') -ANSWER_HEADER = u('{2} Answer from {0} {2}\n{1}') -NO_ANSWER_MSG = '< no answer given >' - -CACHE_EMPTY_VAL = "NULL" -CACHE_DIR = appdirs.user_cache_dir('howdoi') -CACHE_ENTRY_MAX = 128 SUPPORTED_HELP_QUERIES = ['use howdoi', 'howdoi', 'run howdoi', 'do howdoi', 'howdoi howdoi', 'howdoi use howdoi'] +howdoi_session = requests.session() + if os.getenv('HOWDOI_DISABLE_CACHE'): cache = NullCache() # works like an always empty cache else: cache = FileSystemCache(CACHE_DIR, CACHE_ENTRY_MAX, default_timeout=0) -howdoi_session = requests.session() - - -class BlockError(RuntimeError): - pass - def _random_int(width): bres = os.urandom(width) @@ -130,251 +72,10 @@ def _random_choice(seq): return seq[_random_int(1) % len(seq)] -def get_proxies(): - proxies = getproxies() - filtered_proxies = {} - for key, value in proxies.items(): - if key.startswith('http'): - if not value.startswith('http'): - filtered_proxies[key] = 'http://%s' % value - else: - filtered_proxies[key] = value - return filtered_proxies - - -def _get_result(url): - try: - return howdoi_session.get(url, headers={'User-Agent': _random_choice(USER_AGENTS)}, - proxies=get_proxies(), - verify=VERIFY_SSL_CERTIFICATE).text - except requests.exceptions.SSLError as e: - _print_err('Encountered an SSL Error. Try using HTTP instead of ' - 'HTTPS by setting the environment variable "HOWDOI_DISABLE_SSL".\n') - raise e - - -def _add_links_to_text(element): - hyperlinks = element.find('a') - - for hyperlink in hyperlinks: - pquery_object = pq(hyperlink) - href = hyperlink.attrib['href'] - copy = pquery_object.text() - if (copy == href): - replacement = copy - else: - replacement = "[{0}]({1})".format(copy, href) - pquery_object.replace_with(replacement) - - -def get_text(element): - ''' return inner text in pyquery element ''' - _add_links_to_text(element) - try: - return element.text(squash_space=False) - except TypeError: - return element.text() - - -def _extract_links_from_bing(html): - html.remove_namespaces() - return [a.attrib['href'] for a in html('.b_algo')('h2')('a')] - - -def _extract_links_from_google(html): - return [a.attrib['href'] for a in html('.l')] or \ - [a.attrib['href'] for a in html('.r')('a')] - - -def _extract_links_from_duckduckgo(html): - html.remove_namespaces() - links_anchors = html.find('a.result__a') - results = [] - for anchor in links_anchors: - link = anchor.attrib['href'] - url_obj = urlparse(link) - parsed_url = parse_qs(url_obj.query).get('uddg', '') - if parsed_url: - results.append(parsed_url[0]) - return results - - -def _extract_links(html, search_engine): - if search_engine == 'bing': - return _extract_links_from_bing(html) - if search_engine == 'duckduckgo': - return _extract_links_from_duckduckgo(html) - return _extract_links_from_google(html) - - -def _get_search_url(search_engine): - return SEARCH_URLS.get(search_engine, SEARCH_URLS['google']) - - -def _is_blocked(page): - for indicator in BLOCK_INDICATORS: - if page.find(indicator) != -1: - return True - - return False - - -def _get_links(query): - search_engine = os.getenv('HOWDOI_SEARCH_ENGINE', 'google') - search_url = _get_search_url(search_engine) - - result = _get_result(search_url.format(URL, url_quote(query))) - if _is_blocked(result): - _print_err('Unable to find an answer because the search engine temporarily blocked the request. ' - 'Please wait a few minutes or select a different search engine.') - raise BlockError("Temporary block by search engine") - - html = pq(result) - return _extract_links(html, search_engine) - - -def get_link_at_pos(links, position): - if not links: - return False - - if len(links) >= position: - link = links[position - 1] - else: - link = links[-1] - return link - - -def _format_output(code, args): - if not args['color']: - return code - lexer = None - - # try to find a lexer using the StackOverflow tags - # or the query arguments - for keyword in args['query'].split() + args['tags']: - try: - lexer = get_lexer_by_name(keyword) - break - except ClassNotFound: - pass - - # no lexer found above, use the guesser - if not lexer: - try: - lexer = guess_lexer(code) - except ClassNotFound: - return code - - return highlight(code, - lexer, - TerminalFormatter(bg='dark')) - - -def _is_question(link): - for fragment in BLOCKED_QUESTION_FRAGMENTS: - if fragment in link: - return False - return re.search(r'questions/\d+/', link) - - -def _get_questions(links): - return [link for link in links if _is_question(link)] - - -def _get_answer(args, links): - link = get_link_at_pos(links, args['pos']) - if not link: - return False - - cache_key = link - page = cache.get(link) - if not page: - page = _get_result(link + '?answertab=votes') - cache.set(cache_key, page) - - html = pq(page) - - first_answer = html('.answer').eq(0) - - instructions = first_answer.find('pre') or first_answer.find('code') - args['tags'] = [t.text for t in html('.post-tag')] - - if not instructions and not args['all']: - text = get_text(first_answer.find('.post-text').eq(0)) - elif args['all']: - texts = [] - for html_tag in first_answer.items('.post-text > *'): - current_text = get_text(html_tag) - if current_text: - if html_tag[0].tag in ['pre', 'code']: - texts.append(_format_output(current_text, args)) - else: - texts.append(current_text) - text = '\n'.join(texts) - else: - text = _format_output(get_text(instructions.eq(0)), args) - if text is None: - text = NO_ANSWER_MSG - text = text.strip() - return text - - -def _get_links_with_cache(query): - cache_key = query + "-links" - res = cache.get(cache_key) - if res: - if res == CACHE_EMPTY_VAL: - res = False - return res - - links = _get_links(query) - if not links: - cache.set(cache_key, CACHE_EMPTY_VAL) - - question_links = _get_questions(links) - cache.set(cache_key, question_links or CACHE_EMPTY_VAL) - - return question_links - - def build_splitter(splitter_character='=', splitter_length=80): return '\n' + splitter_character * splitter_length + '\n\n' -def _get_answers(args): - """ - @args: command-line arguments - returns: array of answers and their respective metadata - False if unable to get answers - """ - - question_links = _get_links_with_cache(args['query']) - if not question_links: - return False - - answers = [] - initial_position = args['pos'] - multiple_answers = (args['num_answers'] > 1 or args['all']) - - for answer_number in range(args['num_answers']): - current_position = answer_number + initial_position - args['pos'] = current_position - link = get_link_at_pos(question_links, current_position) - answer = _get_answer(args, question_links) - if not answer: - continue - if not args['link'] and not args['json_output'] and multiple_answers: - answer = ANSWER_HEADER.format(link, answer, STAR_HEADER) - answer += '\n' - answers.append({ - 'answer': answer, - 'link': link, - 'position': current_position - }) - - return answers - - def _clear_cache(): global cache if not cache: diff --git a/howdoi/plugins/BasePlugin.py b/howdoi/plugins/BasePlugin.py index de817f952..c3e9ea1fe 100644 --- a/howdoi/plugins/BasePlugin.py +++ b/howdoi/plugins/BasePlugin.py @@ -49,7 +49,7 @@ def _print_dbg(x): print("[DEBUG] " + x) # noqa: E302 -# CACHE_EMPTY_VAL = "NULL" +CACHE_EMPTY_VAL = "NULL" CACHE_DIR = appdirs.user_cache_dir('howdoi') CACHE_ENTRY_MAX = 128 @@ -294,6 +294,31 @@ def _get_answers(self, args): return answers + + def get_proxies(): + proxies = getproxies() + filtered_proxies = {} + for key, value in proxies.items(): + if key.startswith('http'): + if not value.startswith('http'): + filtered_proxies[key] = 'http://%s' % value + else: + filtered_proxies[key] = value + return filtered_proxies + + + def _get_result(self, url): + pass + # try: + # return howdoi_session.get(url, headers={'User-Agent': _random_choice(USER_AGENTS)}, + # proxies=get_proxies(), + # verify=VERIFY_SSL_CERTIFICATE).text + # except requests.exceptions.SSLError as e: + # _print_err('Encountered an SSL Error. Try using HTTP instead of ' + # 'HTTPS by setting the environment variable "HOWDOI_DISABLE_SSL".\n') + # raise e + + def extract(self): print("Hello extract") pass From c920b9edb2e58c3ccc229073e70d644be604a55e Mon Sep 17 00:00:00 2001 From: Cesare De Cal Date: Wed, 24 Jun 2020 17:31:32 +0200 Subject: [PATCH 03/15] Created StackOverflowPlugin and moved StackOverflow specific methods from BasePlugin to it --- howdoi/plugins/BasePlugin.py | 82 +-------------------------- howdoi/plugins/StackOverflowPlugin.py | 82 +++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 81 deletions(-) create mode 100644 howdoi/plugins/StackOverflowPlugin.py diff --git a/howdoi/plugins/BasePlugin.py b/howdoi/plugins/BasePlugin.py index c3e9ea1fe..574c51e03 100644 --- a/howdoi/plugins/BasePlugin.py +++ b/howdoi/plugins/BasePlugin.py @@ -8,11 +8,6 @@ from pyquery import PyQuery as pq -from pygments import highlight -from pygments.formatters.terminal import TerminalFormatter -from pygments.lexers import get_lexer_by_name, guess_lexer -from pygments.util import ClassNotFound - class BlockError(RuntimeError): pass @@ -76,10 +71,6 @@ def _print_dbg(x): 'network which appear to be in violation of the Terms of Service' ) -BLOCKED_QUESTION_FRAGMENTS = ( - 'webcache.googleusercontent.com', -) - URL = os.getenv('HOWDOI_URL') or 'stackoverflow.com' SEARCH_URLS = { @@ -174,76 +165,7 @@ def _extract_links(self, html, search_engine): return self._extract_links_from_duckduckgo(html) return self._extract_links_from_google(html) - def format_output(self, code, args): - if not args['color']: - return code - lexer = None - - # try to find a lexer using the StackOverflow tags - # or the query arguments - for keyword in args['query'].split() + args['tags']: - try: - lexer = get_lexer_by_name(keyword) - break - except ClassNotFound: - pass - - # no lexer found above, use the guesser - if not lexer: - try: - lexer = guess_lexer(code) - except ClassNotFound: - return code - - return highlight(code, - lexer, - TerminalFormatter(bg='dark')) - - def _is_question(self, link): - for fragment in BLOCKED_QUESTION_FRAGMENTS: - if fragment in link: - return False - return re.search(r'questions/\d+/', link) - - def _get_answer(self, args, links): - link = self.get_link_at_pos(links, args['pos']) - if not link: - return False - - cache_key = link - page = cache.get(link) - if not page: - page = self._get_result(link + '?answertab=votes') - cache.set(cache_key, page) - - html = pq(page) - - first_answer = html('.answer').eq(0) - - instructions = first_answer.find('pre') or first_answer.find('code') - args['tags'] = [t.text for t in html('.post-tag')] - - if not instructions and not args['all']: - text = self.get_text(first_answer.find('.post-text').eq(0)) - elif args['all']: - texts = [] - for html_tag in first_answer.items('.post-text > *'): - current_text = self.get_text(html_tag) - if current_text: - if html_tag[0].tag in ['pre', 'code']: - texts.append(self._format_output(current_text, args)) - else: - texts.append(current_text) - text = '\n'.join(texts) - else: - text = self._format_output(self.get_text(instructions.eq(0)), args) - if text is None: - text = NO_ANSWER_MSG - text = text.strip() - return text - def _get_questions(self, links): - return [link for link in links if self._is_question(link)] def _get_links_with_cache(self, query): cache_key = query + "-links" @@ -321,6 +243,4 @@ def _get_result(self, url): def extract(self): print("Hello extract") - pass - -# Make StackOverflow plugin + pass \ No newline at end of file diff --git a/howdoi/plugins/StackOverflowPlugin.py b/howdoi/plugins/StackOverflowPlugin.py new file mode 100644 index 000000000..7c86a278d --- /dev/null +++ b/howdoi/plugins/StackOverflowPlugin.py @@ -0,0 +1,82 @@ +from howdoi.plugins import BasePlugin + +from pygments import highlight +from pygments.formatters.terminal import TerminalFormatter +from pygments.lexers import get_lexer_by_name, guess_lexer +from pygments.util import ClassNotFound + +BLOCKED_QUESTION_FRAGMENTS = ( + 'webcache.googleusercontent.com', +) + +class StackOverflowPlugin(BasePlugin.BasePlugin): + def format_output(self, code, args): + if not args['color']: + return code + lexer = None + + # try to find a lexer using the StackOverflow tags + # or the query arguments + for keyword in args['query'].split() + args['tags']: + try: + lexer = get_lexer_by_name(keyword) + break + except ClassNotFound: + pass + + # no lexer found above, use the guesser + if not lexer: + try: + lexer = guess_lexer(code) + except ClassNotFound: + return code + + return highlight(code, + lexer, + TerminalFormatter(bg='dark')) + + def _is_question(self, link): + for fragment in BLOCKED_QUESTION_FRAGMENTS: + if fragment in link: + return False + return re.search(r'questions/\d+/', link) + + def _get_answer(self, args, links): + link = self.get_link_at_pos(links, args['pos']) + if not link: + return False + + cache_key = link + page = cache.get(link) + if not page: + page = self._get_result(link + '?answertab=votes') + cache.set(cache_key, page) + + html = pq(page) + + first_answer = html('.answer').eq(0) + + instructions = first_answer.find('pre') or first_answer.find('code') + args['tags'] = [t.text for t in html('.post-tag')] + + if not instructions and not args['all']: + text = self.get_text(first_answer.find('.post-text').eq(0)) + elif args['all']: + texts = [] + for html_tag in first_answer.items('.post-text > *'): + current_text = self.get_text(html_tag) + if current_text: + if html_tag[0].tag in ['pre', 'code']: + texts.append(self._format_output(current_text, args)) + else: + texts.append(current_text) + text = '\n'.join(texts) + else: + text = self._format_output(self.get_text(instructions.eq(0)), args) + if text is None: + text = NO_ANSWER_MSG + text = text.strip() + return text + + def _get_questions(self, links): + return [link for link in links if self._is_question(link)] From 1574aa29a6bb71364a1b2bcd76d91c6b4a8006d9 Mon Sep 17 00:00:00 2001 From: Eyitayo Ogunbiyi Date: Wed, 24 Jun 2020 16:41:03 +0100 Subject: [PATCH 04/15] refactored plugin directory --- howdoi/plugins/__init__.py | 2 ++ howdoi/plugins/{BasePlugin.py => base.py} | 9 ++------- .../plugins/{StackOverflowPlugin.py => stackoverflow.py} | 0 3 files changed, 4 insertions(+), 7 deletions(-) rename howdoi/plugins/{BasePlugin.py => base.py} (99%) rename howdoi/plugins/{StackOverflowPlugin.py => stackoverflow.py} (100%) diff --git a/howdoi/plugins/__init__.py b/howdoi/plugins/__init__.py index e69de29bb..678d85c40 100644 --- a/howdoi/plugins/__init__.py +++ b/howdoi/plugins/__init__.py @@ -0,0 +1,2 @@ +from howdoi.plugins.base import BasePlugin +from howdoi.plugins.stackoverflow import StackOverflowPlugin diff --git a/howdoi/plugins/BasePlugin.py b/howdoi/plugins/base.py similarity index 99% rename from howdoi/plugins/BasePlugin.py rename to howdoi/plugins/base.py index 574c51e03..91301b3da 100644 --- a/howdoi/plugins/BasePlugin.py +++ b/howdoi/plugins/base.py @@ -165,8 +165,6 @@ def _extract_links(self, html, search_engine): return self._extract_links_from_duckduckgo(html) return self._extract_links_from_google(html) - - def _get_links_with_cache(self, query): cache_key = query + "-links" res = cache.get(cache_key) @@ -216,8 +214,7 @@ def _get_answers(self, args): return answers - - def get_proxies(): + def get_proxies(self): proxies = getproxies() filtered_proxies = {} for key, value in proxies.items(): @@ -228,7 +225,6 @@ def get_proxies(): filtered_proxies[key] = value return filtered_proxies - def _get_result(self, url): pass # try: @@ -240,7 +236,6 @@ def _get_result(self, url): # 'HTTPS by setting the environment variable "HOWDOI_DISABLE_SSL".\n') # raise e - def extract(self): print("Hello extract") - pass \ No newline at end of file + pass diff --git a/howdoi/plugins/StackOverflowPlugin.py b/howdoi/plugins/stackoverflow.py similarity index 100% rename from howdoi/plugins/StackOverflowPlugin.py rename to howdoi/plugins/stackoverflow.py From e2b57f2d08ee6d95783c3b9e4c1415c42fdc7117 Mon Sep 17 00:00:00 2001 From: Eyitayo Ogunbiyi Date: Wed, 24 Jun 2020 17:08:44 +0100 Subject: [PATCH 05/15] cleaning up the plugins to be modular --- howdoi/howdoi.py | 10 ++- howdoi/plugins/base.py | 82 +++---------------------- howdoi/plugins/stackoverflow.py | 105 ++++++++++++++++++++++++++++++-- 3 files changed, 118 insertions(+), 79 deletions(-) diff --git a/howdoi/howdoi.py b/howdoi/howdoi.py index b23c1ee42..32480014e 100755 --- a/howdoi/howdoi.py +++ b/howdoi/howdoi.py @@ -24,19 +24,22 @@ from requests.exceptions import ConnectionError from requests.exceptions import SSLError -from howdoi.plugins import BasePlugin +from howdoi.plugins import StackOverflowPlugin CACHE_EMPTY_VAL = "NULL" CACHE_DIR = appdirs.user_cache_dir('howdoi') CACHE_ENTRY_MAX = 128 # rudimentary standardized 3-level log output + + def _print_err(x): print("[ERROR] " + x) _print_ok = print # noqa: E305 def _print_dbg(x): print("[DEBUG] " + x) # noqa: E302 + SUPPORTED_SEARCH_ENGINES = ('google', 'bing', 'duckduckgo') USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0', @@ -146,8 +149,8 @@ def howdoi(raw_query): return _format_answers(res, args) try: - plugin = BasePlugin.BasePlugin() - res = plugin.search() + plugin = StackOverflowPlugin() + res = plugin.search(args) if not res: res = {"error": "Sorry, couldn\'t find any help with that topic\n"} cache.set(cache_key, res) @@ -216,4 +219,5 @@ def command_line_runner(): if __name__ == '__main__': + _clear_cache() command_line_runner() diff --git a/howdoi/plugins/base.py b/howdoi/plugins/base.py index 91301b3da..08886c027 100644 --- a/howdoi/plugins/base.py +++ b/howdoi/plugins/base.py @@ -80,17 +80,17 @@ def _print_dbg(x): } -def _is_blocked(page): - for indicator in BLOCK_INDICATORS: - if page.find(indicator) != -1: - return True - return False - - class BasePlugin(): - def search(self): - print("Hello search") - pass + def __init__(self, cache=None): + if cache is None: + cache = NullCache() + self.cache = cache + + def _is_blocked(self, page): + for indicator in BLOCK_INDICATORS: + if page.find(indicator) != -1: + return True + return False def _add_links_to_text(self, element): hyperlinks = element.find('a') @@ -125,19 +125,6 @@ def get_text(self, element): def _get_search_url(self, search_engine): return SEARCH_URLS.get(search_engine, SEARCH_URLS['google']) - def _get_links(self, query): - search_engine = os.getenv('HOWDOI_SEARCH_ENGINE', 'google') - search_url = self._get_search_url(search_engine) - - result = self._get_result(search_url.format(URL, url_quote(query))) - if _is_blocked(result): - _print_err('Unable to find an answer because the search engine temporarily blocked the request. ' - 'Please wait a few minutes or select a different search engine.') - raise BlockError("Temporary block by search engine") - - html = pq(result) - return self._extract_links(html, search_engine) - def _extract_links_from_bing(self, html): html.remove_namespaces() return [a.attrib['href'] for a in html('.b_algo')('h2')('a')] @@ -165,55 +152,6 @@ def _extract_links(self, html, search_engine): return self._extract_links_from_duckduckgo(html) return self._extract_links_from_google(html) - def _get_links_with_cache(self, query): - cache_key = query + "-links" - res = cache.get(cache_key) - if res: - if res == CACHE_EMPTY_VAL: - res = False - return res - - links = self._get_links(query) - if not links: - cache.set(cache_key, CACHE_EMPTY_VAL) - - question_links = self._get_questions(links) - cache.set(cache_key, question_links or CACHE_EMPTY_VAL) - - return question_links - - def _get_answers(self, args): - """ - @args: command-line arguments - returns: array of answers and their respective metadata - False if unable to get answers - """ - question_links = self._get_links_with_cache(args['query']) - if not question_links: - return False - - answers = [] - initial_position = args['pos'] - multiple_answers = (args['num_answers'] > 1 or args['all']) - - for answer_number in range(args['num_answers']): - current_position = answer_number + initial_position - args['pos'] = current_position - link = self.get_link_at_pos(question_links, current_position) - answer = self._get_answer(args, question_links) - if not answer: - continue - if not args['link'] and not args['json_output'] and multiple_answers: - answer = ANSWER_HEADER.format(link, answer, STAR_HEADER) - answer += '\n' - answers.append({ - 'answer': answer, - 'link': link, - 'position': current_position - }) - - return answers - def get_proxies(self): proxies = getproxies() filtered_proxies = {} diff --git a/howdoi/plugins/stackoverflow.py b/howdoi/plugins/stackoverflow.py index 7c86a278d..9f421c13f 100644 --- a/howdoi/plugins/stackoverflow.py +++ b/howdoi/plugins/stackoverflow.py @@ -1,15 +1,44 @@ from howdoi.plugins import BasePlugin +import re +import os +import sys +import appdirs + from pygments import highlight from pygments.formatters.terminal import TerminalFormatter from pygments.lexers import get_lexer_by_name, guess_lexer from pygments.util import ClassNotFound +if sys.version < '3': + import codecs + from urllib import quote as url_quote + from urllib import getproxies + from urlparse import urlparse, parse_qs + + # Handling Unicode: http://stackoverflow.com/a/6633040/305414 + def u(x): + return codecs.unicode_escape_decode(x)[0] +else: + from urllib.request import getproxies + from urllib.parse import quote as url_quote, urlparse, parse_qs + + def u(x): + return x + BLOCKED_QUESTION_FRAGMENTS = ( - 'webcache.googleusercontent.com', + 'webself.cache.googleusercontent.com', ) -class StackOverflowPlugin(BasePlugin.BasePlugin): +URL = os.getenv('HOWDOI_URL') or 'stackoverflow.com' + + +CACHE_EMPTY_VAL = "NULL" +CACHE_DIR = appdirs.user_cache_dir('howdoi') +CACHE_ENTRY_MAX = 128 + + +class StackOverflowPlugin(BasePlugin): def format_output(self, code, args): if not args['color']: return code @@ -35,22 +64,90 @@ def format_output(self, code, args): lexer, TerminalFormatter(bg='dark')) + def search(self, args): + return self._get_answers(args) + + def _get_answers(self, args): + """ + @args: command-line arguments + returns: array of answers and their respective metadata + False if unable to get answers + """ + question_links = self._get_links_with_cache(args['query']) + if not question_links: + return False + + answers = [] + initial_position = args['pos'] + multiple_answers = (args['num_answers'] > 1 or args['all']) + + for answer_number in range(args['num_answers']): + current_position = answer_number + initial_position + args['pos'] = current_position + link = self.get_link_at_pos(question_links, current_position) + answer = self._get_answer(args, question_links) + if not answer: + continue + if not args['link'] and not args['json_output'] and multiple_answers: + answer = ANSWER_HEADER.format(link, answer, STAR_HEADER) + answer += '\n' + answers.append({ + 'answer': answer, + 'link': link, + 'position': current_position + }) + + return answers + + def _get_links(self, query): + search_engine = os.getenv('HOWDOI_SEARCH_ENGINE', 'google') + search_url = self._get_search_url(search_engine) + + result = self._get_result(search_url.format(URL, url_quote(query))) + if self._is_blocked(result): + _print_err('Unable to find an answer because the search engine temporarily blocked the request. ' + 'Please wait a few minutes or select a different search engine.') + raise BlockError("Temporary block by search engine") + + html = pq(result) + return self._extract_links(html, search_engine) + + def _get_links_with_cache(self, query): + cache_key = query + "-links" + res = self.cache.get(cache_key) + if res: + if res == CACHE_EMPTY_VAL: + res = False + return res + + links = self._get_links(query) + if not links: + self.cache.set(cache_key, CACHE_EMPTY_VAL) + + question_links = self._get_questions(links) + self.cache.set(cache_key, question_links or CACHE_EMPTY_VAL) + + return question_links + def _is_question(self, link): for fragment in BLOCKED_QUESTION_FRAGMENTS: if fragment in link: return False return re.search(r'questions/\d+/', link) + def _get_result(self, url): + return [{'answer': 'scala> val x = "scala is awesome"\nx: java.lang.String = scala is awesome\n\nscala> x.reverse\nres1: String = emosewa si alacs\n', 'link': 'https://stackoverflow.com/questions/7700399/scala-reverse-string', 'position': 1}] + def _get_answer(self, args, links): link = self.get_link_at_pos(links, args['pos']) if not link: return False cache_key = link - page = cache.get(link) + page = self.cache.get(link) if not page: page = self._get_result(link + '?answertab=votes') - cache.set(cache_key, page) + self.cache.set(cache_key, page) html = pq(page) From 383b252b70497cf38449902b4eebdccf058a0aa9 Mon Sep 17 00:00:00 2001 From: Eyitayo Ogunbiyi Date: Wed, 24 Jun 2020 17:25:00 +0100 Subject: [PATCH 06/15] restore functionality to initial state --- howdoi/plugins/base.py | 13 ----- howdoi/plugins/stackoverflow.py | 95 +++++++++++++++++++++++++++++++-- 2 files changed, 90 insertions(+), 18 deletions(-) diff --git a/howdoi/plugins/base.py b/howdoi/plugins/base.py index 08886c027..958acc248 100644 --- a/howdoi/plugins/base.py +++ b/howdoi/plugins/base.py @@ -9,8 +9,6 @@ from pyquery import PyQuery as pq -class BlockError(RuntimeError): - pass # Handle imports for Python 2 and 3 @@ -163,17 +161,6 @@ def get_proxies(self): filtered_proxies[key] = value return filtered_proxies - def _get_result(self, url): - pass - # try: - # return howdoi_session.get(url, headers={'User-Agent': _random_choice(USER_AGENTS)}, - # proxies=get_proxies(), - # verify=VERIFY_SSL_CERTIFICATE).text - # except requests.exceptions.SSLError as e: - # _print_err('Encountered an SSL Error. Try using HTTP instead of ' - # 'HTTPS by setting the environment variable "HOWDOI_DISABLE_SSL".\n') - # raise e - def extract(self): print("Hello extract") pass diff --git a/howdoi/plugins/stackoverflow.py b/howdoi/plugins/stackoverflow.py index 9f421c13f..765a4ae54 100644 --- a/howdoi/plugins/stackoverflow.py +++ b/howdoi/plugins/stackoverflow.py @@ -1,14 +1,47 @@ -from howdoi.plugins import BasePlugin - -import re import os +import re import sys -import appdirs +import appdirs +import requests from pygments import highlight from pygments.formatters.terminal import TerminalFormatter from pygments.lexers import get_lexer_by_name, guess_lexer from pygments.util import ClassNotFound +from pyquery import PyQuery as pq + + +from howdoi.plugins import BasePlugin + +if os.getenv('HOWDOI_DISABLE_SSL'): # Set http instead of https + SCHEME = 'http://' + VERIFY_SSL_CERTIFICATE = False +else: + SCHEME = 'https://' + VERIFY_SSL_CERTIFICATE = True + +USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0', + 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0', + ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) ' + 'Chrome/19.0.1084.46 Safari/536.5'), + ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46' + 'Safari/536.5'), ) + + +def _random_int(width): + bres = os.urandom(width) + if sys.version < '3': + ires = int(bres.encode('hex'), 16) + else: + ires = int.from_bytes(bres, 'little') + + return ires + + +def _random_choice(seq): + return seq[_random_int(1) % len(seq)] + if sys.version < '3': import codecs @@ -36,6 +69,26 @@ def u(x): CACHE_EMPTY_VAL = "NULL" CACHE_DIR = appdirs.user_cache_dir('howdoi') CACHE_ENTRY_MAX = 128 +NO_ANSWER_MSG = '< no answer given >' +ANSWER_HEADER = u('{2} Answer from {0} {2}\n{1}') +STAR_HEADER = u('\u2605') + + +class BlockError(RuntimeError): + pass + +howdoi_session = requests.session() + + +def _print_err(x): + print("[ERROR] " + x) + + +_print_ok = print # noqa: E305 + + +def _print_dbg(x): + print("[DEBUG] " + x) # noqa: E302 class StackOverflowPlugin(BasePlugin): @@ -136,7 +189,14 @@ def _is_question(self, link): return re.search(r'questions/\d+/', link) def _get_result(self, url): - return [{'answer': 'scala> val x = "scala is awesome"\nx: java.lang.String = scala is awesome\n\nscala> x.reverse\nres1: String = emosewa si alacs\n', 'link': 'https://stackoverflow.com/questions/7700399/scala-reverse-string', 'position': 1}] + try: + return howdoi_session.get(url, headers={'User-Agent': _random_choice(USER_AGENTS)}, + proxies=self.get_proxies(), + verify=VERIFY_SSL_CERTIFICATE).text + except requests.exceptions.SSLError as e: + _print_err('Encountered an SSL Error. Try using HTTP instead of ' + 'HTTPS by setting the environment variable "HOWDOI_DISABLE_SSL".\n') + raise e def _get_answer(self, args, links): link = self.get_link_at_pos(links, args['pos']) @@ -175,5 +235,30 @@ def _get_answer(self, args, links): text = text.strip() return text + def _format_output(self, code, args): + if not args['color']: + return code + lexer = None + + # try to find a lexer using the StackOverflow tags + # or the query arguments + for keyword in args['query'].split() + args['tags']: + try: + lexer = get_lexer_by_name(keyword) + break + except ClassNotFound: + pass + + # no lexer found above, use the guesser + if not lexer: + try: + lexer = guess_lexer(code) + except ClassNotFound: + return code + + return highlight(code, + lexer, + TerminalFormatter(bg='dark')) + def _get_questions(self, links): return [link for link in links if self._is_question(link)] From e0e3164a664d08e4b676f7dfa012da4835bf3b9c Mon Sep 17 00:00:00 2001 From: Eyitayo Ogunbiyi Date: Wed, 24 Jun 2020 17:26:36 +0100 Subject: [PATCH 07/15] remove call to clear_cache in howdoi.py --- howdoi/howdoi.py | 1 - 1 file changed, 1 deletion(-) diff --git a/howdoi/howdoi.py b/howdoi/howdoi.py index 32480014e..49a344a33 100755 --- a/howdoi/howdoi.py +++ b/howdoi/howdoi.py @@ -219,5 +219,4 @@ def command_line_runner(): if __name__ == '__main__': - _clear_cache() command_line_runner() From 0cafae7bae0c347ac2974201fde5fa8cd0432565 Mon Sep 17 00:00:00 2001 From: Eyitayo Ogunbiyi Date: Wed, 24 Jun 2020 17:27:55 +0100 Subject: [PATCH 08/15] ensure to use cache initialized in outer scope --- howdoi/howdoi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/howdoi/howdoi.py b/howdoi/howdoi.py index 49a344a33..a28b12f61 100755 --- a/howdoi/howdoi.py +++ b/howdoi/howdoi.py @@ -149,7 +149,7 @@ def howdoi(raw_query): return _format_answers(res, args) try: - plugin = StackOverflowPlugin() + plugin = StackOverflowPlugin(cache=cache) res = plugin.search(args) if not res: res = {"error": "Sorry, couldn\'t find any help with that topic\n"} From 9121577e75f9f604f42d069311c2f961ff08d952 Mon Sep 17 00:00:00 2001 From: Cesare De Cal Date: Thu, 25 Jun 2020 11:01:56 +0200 Subject: [PATCH 09/15] Delete duplicate function format_output from StackOverflowPlugin --- howdoi/plugins/stackoverflow.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/howdoi/plugins/stackoverflow.py b/howdoi/plugins/stackoverflow.py index 765a4ae54..b6baf039f 100644 --- a/howdoi/plugins/stackoverflow.py +++ b/howdoi/plugins/stackoverflow.py @@ -92,31 +92,6 @@ def _print_dbg(x): class StackOverflowPlugin(BasePlugin): - def format_output(self, code, args): - if not args['color']: - return code - lexer = None - - # try to find a lexer using the StackOverflow tags - # or the query arguments - for keyword in args['query'].split() + args['tags']: - try: - lexer = get_lexer_by_name(keyword) - break - except ClassNotFound: - pass - - # no lexer found above, use the guesser - if not lexer: - try: - lexer = guess_lexer(code) - except ClassNotFound: - return code - - return highlight(code, - lexer, - TerminalFormatter(bg='dark')) - def search(self, args): return self._get_answers(args) From aaa3013a5dd2012280c7c6850e366d4de5d6ca42 Mon Sep 17 00:00:00 2001 From: Cesare De Cal Date: Thu, 25 Jun 2020 12:47:09 +0200 Subject: [PATCH 10/15] Created utils file and refactored StackOverflow to use less code --- .vscode/settings.json | 3 + howdoi/howdoi.py | 53 ++------- howdoi/plugins/base.py | 132 ++++++++++++++++----- howdoi/plugins/stackoverflow.py | 195 ++++++-------------------------- howdoi/utils.py | 40 +++++++ 5 files changed, 190 insertions(+), 233 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 howdoi/utils.py diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..0862d6c3d --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "/Users/cesaredecal/workspace/Environments/howdoi/bin/python" +} \ No newline at end of file diff --git a/howdoi/howdoi.py b/howdoi/howdoi.py index a28b12f61..b75f3bbf7 100755 --- a/howdoi/howdoi.py +++ b/howdoi/howdoi.py @@ -25,31 +25,16 @@ from requests.exceptions import SSLError from howdoi.plugins import StackOverflowPlugin +from howdoi.utils import _print_ok, _print_err + CACHE_EMPTY_VAL = "NULL" CACHE_DIR = appdirs.user_cache_dir('howdoi') CACHE_ENTRY_MAX = 128 -# rudimentary standardized 3-level log output - - -def _print_err(x): print("[ERROR] " + x) - - -_print_ok = print # noqa: E305 -def _print_dbg(x): print("[DEBUG] " + x) # noqa: E302 - SUPPORTED_SEARCH_ENGINES = ('google', 'bing', 'duckduckgo') -USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0', - 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0', - 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0', - ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) ' - 'Chrome/19.0.1084.46 Safari/536.5'), - ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46' - 'Safari/536.5'), ) - SUPPORTED_HELP_QUERIES = ['use howdoi', 'howdoi', 'run howdoi', 'do howdoi', 'howdoi howdoi', 'howdoi use howdoi'] @@ -61,24 +46,14 @@ def _print_dbg(x): print("[DEBUG] " + x) # noqa: E302 cache = FileSystemCache(CACHE_DIR, CACHE_ENTRY_MAX, default_timeout=0) -def _random_int(width): - bres = os.urandom(width) - if sys.version < '3': - ires = int(bres.encode('hex'), 16) - else: - ires = int.from_bytes(bres, 'little') - - return ires - - -def _random_choice(seq): - return seq[_random_int(1) % len(seq)] - - def build_splitter(splitter_character='=', splitter_length=80): return '\n' + splitter_character * splitter_length + '\n\n' +def _get_cache_key(args): + return str(args) + __version__ + + def _clear_cache(): global cache if not cache: @@ -87,10 +62,6 @@ def _clear_cache(): return cache.clear() -def _is_help_query(query: str): - return any([query.lower() == help_query for help_query in SUPPORTED_HELP_QUERIES]) - - def _format_answers(res, args): if "error" in res: return res["error"] @@ -109,6 +80,10 @@ def _format_answers(res, args): return build_splitter().join(formatted_answers) +def _is_help_query(query: str): + return any([query.lower() == help_query for help_query in SUPPORTED_HELP_QUERIES]) + + def _get_help_instructions(): instruction_splitter = build_splitter(' ', 60) query = 'print hello world in python' @@ -127,10 +102,6 @@ def _get_help_instructions(): return instruction_splitter.join(instructions) -def _get_cache_key(args): - return str(args) + __version__ - - def howdoi(raw_query): args = raw_query if type(raw_query) is str: # you can pass either a raw or a parsed query @@ -150,7 +121,7 @@ def howdoi(raw_query): try: plugin = StackOverflowPlugin(cache=cache) - res = plugin.search(args) + res = plugin.get_answers(args) if not res: res = {"error": "Sorry, couldn\'t find any help with that topic\n"} cache.set(cache_key, res) @@ -176,7 +147,7 @@ def get_parser(): action='store_true') parser.add_argument('-e', '--engine', help='change search engine for this query only (google, bing, duckduckgo)', dest='search_engine', nargs="?", default='google') - parser.add_argument('--plugin', help='use the base plugin', type=str, default='stackoverflow') + parser.add_argument('--plugin', help='query a specific plugin (default: stackoverflow)', type=str, default='stackoverflow') return parser diff --git a/howdoi/plugins/base.py b/howdoi/plugins/base.py index 958acc248..95e80d924 100644 --- a/howdoi/plugins/base.py +++ b/howdoi/plugins/base.py @@ -1,14 +1,13 @@ import os import re import sys - +import requests import appdirs from cachelib import FileSystemCache, NullCache from pyquery import PyQuery as pq - - +from howdoi.utils import _print_err, _random_choice # Handle imports for Python 2 and 3 @@ -28,21 +27,7 @@ def u(x): def u(x): return x -# rudimentary standardized 3-level log output - - -def _print_err(x): - print("[ERROR] " + x) - - -_print_ok = print # noqa: E305 - -def _print_dbg(x): - print("[DEBUG] " + x) # noqa: E302 - - -CACHE_EMPTY_VAL = "NULL" CACHE_DIR = appdirs.user_cache_dir('howdoi') CACHE_ENTRY_MAX = 128 @@ -54,7 +39,6 @@ def _print_dbg(x): ANSWER_HEADER = u('{2} Answer from {0} {2}\n{1}') STAR_HEADER = u('\u2605') CACHE_EMPTY_VAL = "NULL" -NO_ANSWER_MSG = '< no answer given >' if os.getenv('HOWDOI_DISABLE_SSL'): # Set http instead of https SCHEME = 'http://' @@ -77,6 +61,20 @@ def _print_dbg(x): 'duckduckgo': SCHEME + 'duckduckgo.com/?q=site:{0}%20{1}&t=hj&ia=web' } +USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0', + 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0', + ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) ' + 'Chrome/19.0.1084.46 Safari/536.5'), + ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46' + 'Safari/536.5'), ) + + +class BlockError(RuntimeError): + pass + +howdoi_session = requests.session() + class BasePlugin(): def __init__(self, cache=None): @@ -84,12 +82,51 @@ def __init__(self, cache=None): cache = NullCache() self.cache = cache + + def get_proxies(self): + proxies = getproxies() + filtered_proxies = {} + for key, value in proxies.items(): + if key.startswith('http'): + if not value.startswith('http'): + filtered_proxies[key] = 'http://%s' % value + else: + filtered_proxies[key] = value + return filtered_proxies + + + def _get_result(self, url): + try: + return howdoi_session.get(url, headers={'User-Agent': _random_choice(USER_AGENTS)}, + proxies=self.get_proxies(), + verify=VERIFY_SSL_CERTIFICATE).text + except requests.exceptions.SSLError as e: + _print_err('Encountered an SSL Error. Try using HTTP instead of ' + 'HTTPS by setting the environment variable "HOWDOI_DISABLE_SSL".\n') + raise e + + + def _get_links(self, query): + search_engine = os.getenv('HOWDOI_SEARCH_ENGINE', 'google') + search_url = self._get_search_url(search_engine) + + result = self._get_result(search_url.format(URL, url_quote(query))) + if self._is_blocked(result): + _print_err('Unable to find an answer because the search engine temporarily blocked the request. ' + 'Please wait a few minutes or select a different search engine.') + raise BlockError("Temporary block by search engine") + + html = pq(result) + return self._extract_links(html, search_engine) + + def _is_blocked(self, page): for indicator in BLOCK_INDICATORS: if page.find(indicator) != -1: return True return False + def _add_links_to_text(self, element): hyperlinks = element.find('a') @@ -103,6 +140,7 @@ def _add_links_to_text(self, element): replacement = "[{0}]({1})".format(copy, href) pquery_object.replace_with(replacement) + def get_link_at_pos(self, links, position): if not links: return False @@ -112,6 +150,7 @@ def get_link_at_pos(self, links, position): link = links[-1] return link + def get_text(self, element): ''' return inner text in pyquery element ''' self._add_links_to_text(element) @@ -120,17 +159,21 @@ def get_text(self, element): except TypeError: return element.text() + def _get_search_url(self, search_engine): return SEARCH_URLS.get(search_engine, SEARCH_URLS['google']) + def _extract_links_from_bing(self, html): html.remove_namespaces() return [a.attrib['href'] for a in html('.b_algo')('h2')('a')] + def _extract_links_from_google(self, html): return [a.attrib['href'] for a in html('.l')] or \ [a.attrib['href'] for a in html('.r')('a')] + def _extract_links_from_duckduckgo(self, html): html.remove_namespaces() links_anchors = html.find('a.result__a') @@ -143,6 +186,7 @@ def _extract_links_from_duckduckgo(self, html): results.append(parsed_url[0]) return results + def _extract_links(self, html, search_engine): if search_engine == 'bing': return self._extract_links_from_bing(html) @@ -150,17 +194,43 @@ def _extract_links(self, html, search_engine): return self._extract_links_from_duckduckgo(html) return self._extract_links_from_google(html) - def get_proxies(self): - proxies = getproxies() - filtered_proxies = {} - for key, value in proxies.items(): - if key.startswith('http'): - if not value.startswith('http'): - filtered_proxies[key] = 'http://%s' % value - else: - filtered_proxies[key] = value - return filtered_proxies - def extract(self): - print("Hello extract") - pass + def get_answer(self, args, links): + raise NotImplementedError + + + def _get_links_with_cache(self, query): + raise NotImplementedError + + + def get_answers(self, args): + """ + @args: command-line arguments + returns: array of answers and their respective metadata + False if unable to get answers + """ + question_links = self._get_links_with_cache(args['query']) + if not question_links: + return False + + answers = [] + initial_position = args['pos'] + multiple_answers = (args['num_answers'] > 1 or args['all']) + + for answer_number in range(args['num_answers']): + current_position = answer_number + initial_position + args['pos'] = current_position + link = self.get_link_at_pos(question_links, current_position) + answer = self.get_answer(args, question_links) + if not answer: + continue + if not args['link'] and not args['json_output'] and multiple_answers: + answer = ANSWER_HEADER.format(link, answer, STAR_HEADER) + answer += '\n' + answers.append({ + 'answer': answer, + 'link': link, + 'position': current_position + }) + + return answers diff --git a/howdoi/plugins/stackoverflow.py b/howdoi/plugins/stackoverflow.py index b6baf039f..5fd38521a 100644 --- a/howdoi/plugins/stackoverflow.py +++ b/howdoi/plugins/stackoverflow.py @@ -1,144 +1,61 @@ import os import re -import sys -import appdirs -import requests from pygments import highlight from pygments.formatters.terminal import TerminalFormatter from pygments.lexers import get_lexer_by_name, guess_lexer from pygments.util import ClassNotFound -from pyquery import PyQuery as pq - +from pyquery import PyQuery as pq from howdoi.plugins import BasePlugin -if os.getenv('HOWDOI_DISABLE_SSL'): # Set http instead of https - SCHEME = 'http://' - VERIFY_SSL_CERTIFICATE = False -else: - SCHEME = 'https://' - VERIFY_SSL_CERTIFICATE = True - -USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0', - 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0', - 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0', - ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) ' - 'Chrome/19.0.1084.46 Safari/536.5'), - ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46' - 'Safari/536.5'), ) - - -def _random_int(width): - bres = os.urandom(width) - if sys.version < '3': - ires = int(bres.encode('hex'), 16) - else: - ires = int.from_bytes(bres, 'little') - - return ires - - -def _random_choice(seq): - return seq[_random_int(1) % len(seq)] - - -if sys.version < '3': - import codecs - from urllib import quote as url_quote - from urllib import getproxies - from urlparse import urlparse, parse_qs +URL = os.getenv('HOWDOI_URL') or 'stackoverflow.com' - # Handling Unicode: http://stackoverflow.com/a/6633040/305414 - def u(x): - return codecs.unicode_escape_decode(x)[0] -else: - from urllib.request import getproxies - from urllib.parse import quote as url_quote, urlparse, parse_qs +CACHE_EMPTY_VAL = "NULL" - def u(x): - return x +NO_ANSWER_MSG = '< no answer given >' BLOCKED_QUESTION_FRAGMENTS = ( 'webself.cache.googleusercontent.com', ) -URL = os.getenv('HOWDOI_URL') or 'stackoverflow.com' - - -CACHE_EMPTY_VAL = "NULL" -CACHE_DIR = appdirs.user_cache_dir('howdoi') -CACHE_ENTRY_MAX = 128 -NO_ANSWER_MSG = '< no answer given >' -ANSWER_HEADER = u('{2} Answer from {0} {2}\n{1}') -STAR_HEADER = u('\u2605') - - -class BlockError(RuntimeError): - pass - -howdoi_session = requests.session() - +class StackOverflowPlugin(BasePlugin): + def _is_question(self, link): + for fragment in BLOCKED_QUESTION_FRAGMENTS: + if fragment in link: + return False + return re.search(r'questions/\d+/', link) -def _print_err(x): - print("[ERROR] " + x) + def _get_questions(self, links): + return [link for link in links if self._is_question(link)] -_print_ok = print # noqa: E305 + def _format_output(self, code, args): + if not args['color']: + return code + lexer = None -def _print_dbg(x): - print("[DEBUG] " + x) # noqa: E302 + # try to find a lexer using the StackOverflow tags + # or the query arguments + for keyword in args['query'].split() + args['tags']: + try: + lexer = get_lexer_by_name(keyword) + break + except ClassNotFound: + pass + # no lexer found above, use the guesser + if not lexer: + try: + lexer = guess_lexer(code) + except ClassNotFound: + return code -class StackOverflowPlugin(BasePlugin): - def search(self, args): - return self._get_answers(args) - - def _get_answers(self, args): - """ - @args: command-line arguments - returns: array of answers and their respective metadata - False if unable to get answers - """ - question_links = self._get_links_with_cache(args['query']) - if not question_links: - return False + return highlight(code, + lexer, + TerminalFormatter(bg='dark')) - answers = [] - initial_position = args['pos'] - multiple_answers = (args['num_answers'] > 1 or args['all']) - - for answer_number in range(args['num_answers']): - current_position = answer_number + initial_position - args['pos'] = current_position - link = self.get_link_at_pos(question_links, current_position) - answer = self._get_answer(args, question_links) - if not answer: - continue - if not args['link'] and not args['json_output'] and multiple_answers: - answer = ANSWER_HEADER.format(link, answer, STAR_HEADER) - answer += '\n' - answers.append({ - 'answer': answer, - 'link': link, - 'position': current_position - }) - - return answers - - def _get_links(self, query): - search_engine = os.getenv('HOWDOI_SEARCH_ENGINE', 'google') - search_url = self._get_search_url(search_engine) - - result = self._get_result(search_url.format(URL, url_quote(query))) - if self._is_blocked(result): - _print_err('Unable to find an answer because the search engine temporarily blocked the request. ' - 'Please wait a few minutes or select a different search engine.') - raise BlockError("Temporary block by search engine") - - html = pq(result) - return self._extract_links(html, search_engine) def _get_links_with_cache(self, query): cache_key = query + "-links" @@ -153,27 +70,11 @@ def _get_links_with_cache(self, query): self.cache.set(cache_key, CACHE_EMPTY_VAL) question_links = self._get_questions(links) - self.cache.set(cache_key, question_links or CACHE_EMPTY_VAL) return question_links - def _is_question(self, link): - for fragment in BLOCKED_QUESTION_FRAGMENTS: - if fragment in link: - return False - return re.search(r'questions/\d+/', link) - def _get_result(self, url): - try: - return howdoi_session.get(url, headers={'User-Agent': _random_choice(USER_AGENTS)}, - proxies=self.get_proxies(), - verify=VERIFY_SSL_CERTIFICATE).text - except requests.exceptions.SSLError as e: - _print_err('Encountered an SSL Error. Try using HTTP instead of ' - 'HTTPS by setting the environment variable "HOWDOI_DISABLE_SSL".\n') - raise e - - def _get_answer(self, args, links): + def get_answer(self, args, links): link = self.get_link_at_pos(links, args['pos']) if not link: return False @@ -209,31 +110,3 @@ def _get_answer(self, args, links): text = NO_ANSWER_MSG text = text.strip() return text - - def _format_output(self, code, args): - if not args['color']: - return code - lexer = None - - # try to find a lexer using the StackOverflow tags - # or the query arguments - for keyword in args['query'].split() + args['tags']: - try: - lexer = get_lexer_by_name(keyword) - break - except ClassNotFound: - pass - - # no lexer found above, use the guesser - if not lexer: - try: - lexer = guess_lexer(code) - except ClassNotFound: - return code - - return highlight(code, - lexer, - TerminalFormatter(bg='dark')) - - def _get_questions(self, links): - return [link for link in links if self._is_question(link)] diff --git a/howdoi/utils.py b/howdoi/utils.py new file mode 100644 index 000000000..2151dd13c --- /dev/null +++ b/howdoi/utils.py @@ -0,0 +1,40 @@ +import os +import sys + + +if sys.version < '3': + import codecs + # Handling Unicode: http://stackoverflow.com/a/6633040/305414 + def u(x): + return codecs.unicode_escape_decode(x)[0] +else: + def u(x): + return x + + +# rudimentary standardized 3-level log output + + +def _print_err(x): + print("[ERROR] " + x) + + +_print_ok = print # noqa: E305 + + +def _print_dbg(x): + print("[DEBUG] " + x) # noqa: E302 + + +def _random_int(width): + bres = os.urandom(width) + if sys.version < '3': + ires = int(bres.encode('hex'), 16) + else: + ires = int.from_bytes(bres, 'little') + + return ires + + +def _random_choice(seq): + return seq[_random_int(1) % len(seq)] From 40aedbf7d7cec5b09603be7a0a5ac9f898244808 Mon Sep 17 00:00:00 2001 From: Cesare De Cal Date: Thu, 25 Jun 2020 12:54:40 +0200 Subject: [PATCH 11/15] Delete settings.json --- .vscode/settings.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 0862d6c3d..000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "python.pythonPath": "/Users/cesaredecal/workspace/Environments/howdoi/bin/python" -} \ No newline at end of file From 4506c35dde21ba9f1eb6d34a1a1477d09d731ffb Mon Sep 17 00:00:00 2001 From: Eyitayo Ogunbiyi Date: Thu, 25 Jun 2020 14:39:08 +0100 Subject: [PATCH 12/15] clean up howdoi file and put constants in standalone file --- .gitignore | 4 +++- howdoi/constants.py | 12 ++++++++++++ howdoi/howdoi.py | 31 ++++++++++++------------------- 3 files changed, 27 insertions(+), 20 deletions(-) create mode 100644 howdoi/constants.py diff --git a/.gitignore b/.gitignore index 70d7a1e1a..f647f11ba 100644 --- a/.gitignore +++ b/.gitignore @@ -185,4 +185,6 @@ fabric.properties .ropeproject README.html .idea -HOW_TO_RELEASE.txt \ No newline at end of file +HOW_TO_RELEASE.txt + +.vscode \ No newline at end of file diff --git a/howdoi/constants.py b/howdoi/constants.py new file mode 100644 index 000000000..6724e93d0 --- /dev/null +++ b/howdoi/constants.py @@ -0,0 +1,12 @@ +import appdirs + +CACHE_EMPTY_VAL = "NULL" + +CACHE_DIR = appdirs.user_cache_dir('howdoi') + +CACHE_ENTRY_MAX = 128 + +SUPPORTED_SEARCH_ENGINES = ('google', 'bing', 'duckduckgo') + +SUPPORTED_HELP_QUERIES = ['use howdoi', 'howdoi', 'run howdoi', + 'do howdoi', 'howdoi howdoi', 'howdoi use howdoi'] diff --git a/howdoi/howdoi.py b/howdoi/howdoi.py index b75f3bbf7..fe364f1f5 100755 --- a/howdoi/howdoi.py +++ b/howdoi/howdoi.py @@ -9,34 +9,26 @@ ###################################################### from __future__ import print_function -import gc -gc.disable() # noqa: E402 + import argparse -import os -import appdirs +import gc import json -import requests +import os import sys -from . import __version__ +import requests from cachelib import FileSystemCache, NullCache +from requests.exceptions import ConnectionError, SSLError -from requests.exceptions import ConnectionError -from requests.exceptions import SSLError - +from howdoi.constants import (CACHE_DIR, CACHE_ENTRY_MAX, + SUPPORTED_HELP_QUERIES, SUPPORTED_SEARCH_ENGINES) from howdoi.plugins import StackOverflowPlugin -from howdoi.utils import _print_ok, _print_err - - -CACHE_EMPTY_VAL = "NULL" -CACHE_DIR = appdirs.user_cache_dir('howdoi') -CACHE_ENTRY_MAX = 128 +from howdoi.utils import _print_err, _print_ok +from . import __version__ -SUPPORTED_SEARCH_ENGINES = ('google', 'bing', 'duckduckgo') +gc.disable() # noqa: E402 -SUPPORTED_HELP_QUERIES = ['use howdoi', 'howdoi', 'run howdoi', - 'do howdoi', 'howdoi howdoi', 'howdoi use howdoi'] howdoi_session = requests.session() @@ -147,7 +139,8 @@ def get_parser(): action='store_true') parser.add_argument('-e', '--engine', help='change search engine for this query only (google, bing, duckduckgo)', dest='search_engine', nargs="?", default='google') - parser.add_argument('--plugin', help='query a specific plugin (default: stackoverflow)', type=str, default='stackoverflow') + parser.add_argument('--plugin', help='query a specific plugin (default: stackoverflow)', + type=str, default='stackoverflow') return parser From 619d3ea21993ee172bda22dbbf4a87ec594a56f9 Mon Sep 17 00:00:00 2001 From: Eyitayo Ogunbiyi Date: Thu, 25 Jun 2020 14:50:44 +0100 Subject: [PATCH 13/15] extracted constants to standalone file --- howdoi/constants.py | 39 ++++++++++++++++++++++ howdoi/plugins/base.py | 57 ++++----------------------------- howdoi/plugins/stackoverflow.py | 5 +-- 3 files changed, 46 insertions(+), 55 deletions(-) diff --git a/howdoi/constants.py b/howdoi/constants.py index 6724e93d0..f88b0f130 100644 --- a/howdoi/constants.py +++ b/howdoi/constants.py @@ -1,5 +1,19 @@ +import os + import appdirs + +def u(x): + return x + + +if os.getenv('HOWDOI_DISABLE_SSL'): # Set http instead of https + SCHEME = 'http://' + VERIFY_SSL_CERTIFICATE = False +else: + SCHEME = 'https://' + VERIFY_SSL_CERTIFICATE = True + CACHE_EMPTY_VAL = "NULL" CACHE_DIR = appdirs.user_cache_dir('howdoi') @@ -10,3 +24,28 @@ SUPPORTED_HELP_QUERIES = ['use howdoi', 'howdoi', 'run howdoi', 'do howdoi', 'howdoi howdoi', 'howdoi use howdoi'] + +ANSWER_HEADER = u('{2} Answer from {0} {2}\n{1}') + +STAR_HEADER = u('\u2605') + + +BLOCK_INDICATORS = ( + 'form id="captcha-form"', + 'This page appears when Google automatically detects requests coming from your computer ' + 'network which appear to be in violation of the Terms of Service' +) + +SEARCH_URLS = { + 'bing': SCHEME + 'www.bing.com/search?q=site:{0}%20{1}&hl=en', + 'google': SCHEME + 'www.google.com/search?q=site:{0}%20{1}&hl=en', + 'duckduckgo': SCHEME + 'duckduckgo.com/?q=site:{0}%20{1}&t=hj&ia=web' +} + +USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0', + 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0', + ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) ' + 'Chrome/19.0.1084.46 Safari/536.5'), + ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46' + 'Safari/536.5'), ) diff --git a/howdoi/plugins/base.py b/howdoi/plugins/base.py index 95e80d924..0fe828a44 100644 --- a/howdoi/plugins/base.py +++ b/howdoi/plugins/base.py @@ -1,13 +1,15 @@ import os -import re import sys import requests -import appdirs from cachelib import FileSystemCache, NullCache from pyquery import PyQuery as pq from howdoi.utils import _print_err, _random_choice +from howdoi.constants import ( + VERIFY_SSL_CERTIFICATE, BLOCK_INDICATORS, STAR_HEADER, + ANSWER_HEADER, CACHE_ENTRY_MAX, CACHE_DIR, USER_AGENTS, SEARCH_URLS +) # Handle imports for Python 2 and 3 @@ -28,51 +30,19 @@ def u(x): return x -CACHE_DIR = appdirs.user_cache_dir('howdoi') -CACHE_ENTRY_MAX = 128 - if os.getenv('HOWDOI_DISABLE_CACHE'): cache = NullCache() # works like an always empty cache else: cache = FileSystemCache(CACHE_DIR, CACHE_ENTRY_MAX, default_timeout=0) -ANSWER_HEADER = u('{2} Answer from {0} {2}\n{1}') -STAR_HEADER = u('\u2605') -CACHE_EMPTY_VAL = "NULL" - -if os.getenv('HOWDOI_DISABLE_SSL'): # Set http instead of https - SCHEME = 'http://' - VERIFY_SSL_CERTIFICATE = False -else: - SCHEME = 'https://' - VERIFY_SSL_CERTIFICATE = True - -BLOCK_INDICATORS = ( - 'form id="captcha-form"', - 'This page appears when Google automatically detects requests coming from your computer ' - 'network which appear to be in violation of the Terms of Service' -) URL = os.getenv('HOWDOI_URL') or 'stackoverflow.com' -SEARCH_URLS = { - 'bing': SCHEME + 'www.bing.com/search?q=site:{0}%20{1}&hl=en', - 'google': SCHEME + 'www.google.com/search?q=site:{0}%20{1}&hl=en', - 'duckduckgo': SCHEME + 'duckduckgo.com/?q=site:{0}%20{1}&t=hj&ia=web' -} - -USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0', - 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0', - 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0', - ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) ' - 'Chrome/19.0.1084.46 Safari/536.5'), - ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46' - 'Safari/536.5'), ) - class BlockError(RuntimeError): pass + howdoi_session = requests.session() @@ -82,7 +52,6 @@ def __init__(self, cache=None): cache = NullCache() self.cache = cache - def get_proxies(self): proxies = getproxies() filtered_proxies = {} @@ -94,7 +63,6 @@ def get_proxies(self): filtered_proxies[key] = value return filtered_proxies - def _get_result(self, url): try: return howdoi_session.get(url, headers={'User-Agent': _random_choice(USER_AGENTS)}, @@ -105,7 +73,6 @@ def _get_result(self, url): 'HTTPS by setting the environment variable "HOWDOI_DISABLE_SSL".\n') raise e - def _get_links(self, query): search_engine = os.getenv('HOWDOI_SEARCH_ENGINE', 'google') search_url = self._get_search_url(search_engine) @@ -119,14 +86,12 @@ def _get_links(self, query): html = pq(result) return self._extract_links(html, search_engine) - def _is_blocked(self, page): for indicator in BLOCK_INDICATORS: if page.find(indicator) != -1: return True return False - def _add_links_to_text(self, element): hyperlinks = element.find('a') @@ -140,7 +105,6 @@ def _add_links_to_text(self, element): replacement = "[{0}]({1})".format(copy, href) pquery_object.replace_with(replacement) - def get_link_at_pos(self, links, position): if not links: return False @@ -150,7 +114,6 @@ def get_link_at_pos(self, links, position): link = links[-1] return link - def get_text(self, element): ''' return inner text in pyquery element ''' self._add_links_to_text(element) @@ -159,21 +122,17 @@ def get_text(self, element): except TypeError: return element.text() - def _get_search_url(self, search_engine): return SEARCH_URLS.get(search_engine, SEARCH_URLS['google']) - def _extract_links_from_bing(self, html): html.remove_namespaces() return [a.attrib['href'] for a in html('.b_algo')('h2')('a')] - def _extract_links_from_google(self, html): return [a.attrib['href'] for a in html('.l')] or \ [a.attrib['href'] for a in html('.r')('a')] - def _extract_links_from_duckduckgo(self, html): html.remove_namespaces() links_anchors = html.find('a.result__a') @@ -186,7 +145,6 @@ def _extract_links_from_duckduckgo(self, html): results.append(parsed_url[0]) return results - def _extract_links(self, html, search_engine): if search_engine == 'bing': return self._extract_links_from_bing(html) @@ -194,15 +152,12 @@ def _extract_links(self, html, search_engine): return self._extract_links_from_duckduckgo(html) return self._extract_links_from_google(html) - def get_answer(self, args, links): - raise NotImplementedError - + raise NotImplementedError def _get_links_with_cache(self, query): raise NotImplementedError - def get_answers(self, args): """ @args: command-line arguments diff --git a/howdoi/plugins/stackoverflow.py b/howdoi/plugins/stackoverflow.py index 5fd38521a..cb11275f7 100644 --- a/howdoi/plugins/stackoverflow.py +++ b/howdoi/plugins/stackoverflow.py @@ -19,6 +19,7 @@ 'webself.cache.googleusercontent.com', ) + class StackOverflowPlugin(BasePlugin): def _is_question(self, link): for fragment in BLOCKED_QUESTION_FRAGMENTS: @@ -26,11 +27,9 @@ def _is_question(self, link): return False return re.search(r'questions/\d+/', link) - def _get_questions(self, links): return [link for link in links if self._is_question(link)] - def _format_output(self, code, args): if not args['color']: return code @@ -56,7 +55,6 @@ def _format_output(self, code, args): lexer, TerminalFormatter(bg='dark')) - def _get_links_with_cache(self, query): cache_key = query + "-links" res = self.cache.get(cache_key) @@ -73,7 +71,6 @@ def _get_links_with_cache(self, query): return question_links - def get_answer(self, args, links): link = self.get_link_at_pos(links, args['pos']) if not link: From cd9fd572b8850987e3c3b33a1565a70cd1a08bd8 Mon Sep 17 00:00:00 2001 From: Eyitayo Ogunbiyi Date: Thu, 25 Jun 2020 14:56:19 +0100 Subject: [PATCH 14/15] extracted unicode handling to constants --- howdoi/constants.py | 5 +---- howdoi/plugins/base.py | 8 -------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/howdoi/constants.py b/howdoi/constants.py index f88b0f130..cb4dc91a5 100644 --- a/howdoi/constants.py +++ b/howdoi/constants.py @@ -1,10 +1,7 @@ import os - import appdirs - -def u(x): - return x +from howdoi.utils import u if os.getenv('HOWDOI_DISABLE_SSL'): # Set http instead of https diff --git a/howdoi/plugins/base.py b/howdoi/plugins/base.py index 0fe828a44..4e224c747 100644 --- a/howdoi/plugins/base.py +++ b/howdoi/plugins/base.py @@ -14,21 +14,13 @@ # Handle imports for Python 2 and 3 if sys.version < '3': - import codecs from urllib import quote as url_quote from urllib import getproxies from urlparse import urlparse, parse_qs - - # Handling Unicode: http://stackoverflow.com/a/6633040/305414 - def u(x): - return codecs.unicode_escape_decode(x)[0] else: from urllib.request import getproxies from urllib.parse import quote as url_quote, urlparse, parse_qs - def u(x): - return x - if os.getenv('HOWDOI_DISABLE_CACHE'): cache = NullCache() # works like an always empty cache From 53d0b71934c67c1501b52993be00d7f09bf86df5 Mon Sep 17 00:00:00 2001 From: Eyitayo Ogunbiyi Date: Thu, 25 Jun 2020 14:56:55 +0100 Subject: [PATCH 15/15] applying linting on utils.py --- howdoi/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/howdoi/utils.py b/howdoi/utils.py index 2151dd13c..a60682430 100644 --- a/howdoi/utils.py +++ b/howdoi/utils.py @@ -5,6 +5,7 @@ if sys.version < '3': import codecs # Handling Unicode: http://stackoverflow.com/a/6633040/305414 + def u(x): return codecs.unicode_escape_decode(x)[0] else: