Commit

Properly encode characters used in URL for arxiv and semanticscholar backends
stijnh committed May 16, 2022
1 parent 65f3d87 commit ffa6a8e
Showing 2 changed files with 12 additions and 11 deletions.
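Background: both backends previously interpolated raw query strings and paper keys into URLs, so characters such as spaces, quotes and '&' could corrupt the request. A minimal sketch of what urlencode does differently (the example query is illustrative, not taken from the diff):

    from urllib.parse import urlencode

    params = dict(search_query='all:"graph processing" AND cat:cs.DC',
                  start=0, max_results=10)

    # f-string interpolation would leave the spaces, quotes and colons
    # as-is; urlencode percent-encodes every value:
    print(urlencode(params))
    # search_query=all%3A%22graph+processing%22+AND+cat%3Acs.DC&start=0&max_results=10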
17 changes: 9 additions & 8 deletions litstudy/sources/arxiv.py
@@ -2,6 +2,7 @@
 from typing import Optional, List
 import feedparser  # type: ignore
 from datetime import datetime
+from urllib.parse import urlencode
 import time
 
 
@@ -60,6 +61,8 @@ def category(self) -> Optional[List[str]]:
         '''returns arxiv category for article'''
         return self.entry.get('tags', None)[0].get('term', None)
 
+# Base api query url
+ARXIV_SEARCH_URL = 'http://export.arxiv.org/api/query'
 
 def search_arxiv(search_query,
                  start=0,
@@ -89,16 +92,14 @@ def search_arxiv(search_query,
 
     docs = list()
 
-    # Base api query url
-    base_url = 'http://export.arxiv.org/api/query?'
-
     print(f'Searching arXiv for {search_query}')
 
     for i in range(start, total_results, results_per_iteration):
-        query = (f'search_query={search_query}&start={i}&max_results='
-                 f'{results_per_iteration}')
+        query = urlencode(dict(
+            search_query=search_query,
+            start=i,
+            max_results=results_per_iteration
+        ))
 
-        url = base_url + query
+        url = f'{ARXIV_SEARCH_URL}?{query}'
         data = feedparser.parse(url)
 
         for entry in data.entries:
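Taken together, the arxiv.py changes build the request URL as in the standalone sketch below (build_search_url is a hypothetical helper for illustration; ARXIV_SEARCH_URL is copied from the diff above):

    from urllib.parse import urlencode

    ARXIV_SEARCH_URL = 'http://export.arxiv.org/api/query'

    def build_search_url(search_query, start=0, max_results=10):
        # Mirrors the new loop body: each parameter value is
        # percent-encoded before being joined onto the base URL.
        query = urlencode(dict(
            search_query=search_query,
            start=start,
            max_results=max_results,
        ))
        return f'{ARXIV_SEARCH_URL}?{query}'

    print(build_search_url('all:"distributed systems"'))
    # http://export.arxiv.org/api/query?search_query=all%3A%22distributed+systems%22&start=0&max_results=10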
6 changes: 3 additions & 3 deletions litstudy/sources/semanticscholar.py
@@ -1,6 +1,6 @@
 from time import sleep
 from typing import Tuple, Optional
-from urllib.parse import quote_plus
+from urllib.parse import urlencode
 import logging
 import requests
 import shelve
@@ -115,11 +115,11 @@ def request_results(query, offset, cache, timeout=DEFAULT_TIMEOUT):
 
 
 def request_paper(key, cache, timeout=DEFAULT_TIMEOUT):
-    cache_key = f'paper={key}'
+    cache_key = urlencode(dict(paper=key))
     if cache_key in cache:
         return cache[cache_key]
 
-    url = S2_PAPER_URL + quote_plus(key)
+    url = S2_PAPER_URL + cache_key
 
     try:
         sleep(timeout)
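The semanticscholar.py change reuses the encoded key=value pair both as the shelve cache key and as the URL suffix. A quick illustration (the DOI-style key is made up):

    from urllib.parse import urlencode

    # urlencode defaults to quote_plus, so the '/' in a DOI
    # is escaped as %2F instead of splitting the URL path:
    print(urlencode(dict(paper='10.1093/ajae/aaq063')))
    # paper=10.1093%2Fajae%2Faaq063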
