Commit

Properly encode characters used in URL for arxiv and semanticscholar backends
stijnh committed May 16, 2022
1 parent 65f3d87 commit ffa6a8e
Showing 2 changed files with 12 additions and 11 deletions.
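Background: both backends previously interpolated raw query strings and paper keys into URLs, so characters such as spaces, quotes and '&' could corrupt the request. A minimal sketch of what urlencode does differently (the example query is illustrative, not taken from the diff):

    from urllib.parse import urlencode

    params = dict(search_query='all:"graph processing" AND cat:cs.DC',
                  start=0, max_results=10)

    # f-string interpolation would leave the spaces, quotes and colons
    # as-is; urlencode percent-encodes every value:
    print(urlencode(params))
    # search_query=all%3A%22graph+processing%22+AND+cat%3Acs.DC&start=0&max_results=10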
17 changes: 9 additions & 8 deletions litstudy/sources/arxiv.py
@@ -2,6 +2,7 @@
 from typing import Optional, List
 import feedparser  # type: ignore
 from datetime import datetime
+from urllib.parse import urlencode
 import time
 
 
@@ -60,6 +61,8 @@ def category(self) -> Optional[List[str]]:
         '''returns arxiv category for article'''
         return self.entry.get('tags', None)[0].get('term', None)
 
+# Base api query url
+ARXIV_SEARCH_URL = 'http://export.arxiv.org/api/query'
 
 def search_arxiv(search_query,
                  start=0,
@@ -89,16 +92,14 @@ def search_arxiv(search_query,
 
     docs = list()
 
-    # Base api query url
-    base_url = 'http://export.arxiv.org/api/query?'
-
     print(f'Searching arXiv for {search_query}')
 
     for i in range(start, total_results, results_per_iteration):
-        query = (f'search_query={search_query}&start={i}&max_results='
-                 f'{results_per_iteration}')
+        query = urlencode(dict(
+            search_query=search_query,
+            start=i,
+            max_results=results_per_iteration
+        ))
 
-        url = base_url + query
+        url = f'{ARXIV_SEARCH_URL}?{query}'
         data = feedparser.parse(url)
 
         for entry in data.entries:
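Taken together, the arxiv.py changes build the request URL as in the standalone sketch below (build_search_url is a hypothetical helper for illustration; ARXIV_SEARCH_URL is copied from the diff above):

    from urllib.parse import urlencode

    ARXIV_SEARCH_URL = 'http://export.arxiv.org/api/query'

    def build_search_url(search_query, start=0, max_results=10):
        # Mirrors the new loop body: each parameter value is
        # percent-encoded before being joined onto the base URL.
        query = urlencode(dict(
            search_query=search_query,
            start=start,
            max_results=max_results,
        ))
        return f'{ARXIV_SEARCH_URL}?{query}'

    print(build_search_url('all:"distributed systems"'))
    # http://export.arxiv.org/api/query?search_query=all%3A%22distributed+systems%22&start=0&max_results=10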
6 changes: 3 additions & 3 deletions litstudy/sources/semanticscholar.py
@@ -1,6 +1,6 @@
 from time import sleep
 from typing import Tuple, Optional
-from urllib.parse import quote_plus
+from urllib.parse import urlencode
 import logging
 import requests
 import shelve
@@ -115,11 +115,11 @@ def request_results(query, offset, cache, timeout=DEFAULT_TIMEOUT):
 
 
 def request_paper(key, cache, timeout=DEFAULT_TIMEOUT):
-    cache_key = f'paper={key}'
+    cache_key = urlencode(dict(paper=key))
     if cache_key in cache:
         return cache[cache_key]
 
-    url = S2_PAPER_URL + quote_plus(key)
+    url = S2_PAPER_URL + cache_key
 
     try:
         sleep(timeout)
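The semanticscholar.py change reuses the encoded key=value pair both as the shelve cache key and as the URL suffix. A quick illustration (the DOI-style key is made up):

    from urllib.parse import urlencode

    # urlencode defaults to quote_plus, so the '/' in a DOI
    # is escaped as %2F instead of splitting the URL path:
    print(urlencode(dict(paper='10.1093/ajae/aaq063')))
    # paper=10.1093%2Fajae%2Faaq063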
