Skip to content

Commit

Permalink
feat(file_server): download pdf files via http instead of ssh
Browse files Browse the repository at this point in the history
- changed ssh_ parameters (host, user, password) to file_server_
- added functions to download pdf files and to list supp files dirs from http server
- updated tests
- updated README
  • Loading branch information
valearna committed Mar 11, 2022
1 parent 9186d06 commit bb81b4e
Show file tree
Hide file tree
Showing 10 changed files with 98 additions and 70 deletions.
18 changes: 11 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,24 @@ from wbtools.literature.corpus import CorpusManager
paper_id = "00050564"
cm = CorpusManager()
cm.load_from_wb_database(db_name="wb_dbname", db_user="wb_dbuser", db_password="wb_dbpasswd", db_host="wb_dbhost",
paper_ids=[paper_id], ssh_host="ssh_host", ssh_user="ssh_user", ssh_passwd="ssh_passwd")
paper_ids=[paper_id], file_server_host="file_server_base_url", file_server_user="username",
file_server_passwd="password")
sentences = cm.get_paper(paper_id).get_text_docs(split_sentences=True)
```

### Get the latest papers (up to 50) added to WormBase or modified in the last month
### Get the latest papers (up to 50) added to WormBase or modified in the last 30 days

```python
from wbtools.literature.corpus import CorpusManager
import datetime

one_month_ago = (datetime.datetime.now() - datetime.timedelta(days=30)).strftime("%m/%d/%Y")

cm = CorpusManager()
cm.load_from_wb_database(db_name="wb_dbname", db_user="wb_dbuser", db_password="wb_dbpasswd", db_host="wb_dbhost",
from_date=datetime.datetime.now(), max_num_papers=50, ssh_host="ssh_host", ssh_user="ssh_user",
ssh_passwd="ssh_passwd")
from_date=one_month_ago, max_num_papers=50,
file_server_host="file_server_base_url", file_server_user="username",
file_server_passwd="password")
paper_ids = [paper.paper_id for paper in cm.get_all_papers()]
```

Expand All @@ -43,8 +47,8 @@ import datetime

cm = CorpusManager()
cm.load_from_wb_database(db_name="wb_dbname", db_user="wb_dbuser", db_password="wb_dbpasswd", db_host="wb_dbhost",
from_date=datetime.datetime.now(), max_num_papers=50, must_be_autclass_flagged=True,
exclude_pap_types=['Review'], exclude_temp_pdf=True, ssh_host="ssh_host", ssh_user="ssh_user",
ssh_passwd="ssh_passwd")
max_num_papers=50, must_be_autclass_flagged=True, exclude_pap_types=['Review'],
exclude_temp_pdf=True, file_server_host="file_server_base_url",
file_server_user="username", file_server_passwd="password")
paper_ids = [paper.paper_id for paper in cm.get_all_papers()]
```
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="wbtools",
version="1.2.16",
version="1.3.0",
author="Valerio Arnaboldi",
author_email="[email protected]",
description="Interface to WormBase (www.wormbase.org) curation data, including literature management and NLP "
Expand Down
2 changes: 1 addition & 1 deletion tests/db/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def test_get_curated_variations(self):
curated_variations = self.db_manager.get_curated_variations(exclude_id_used_as_name=True)
allele_regex = ALL_VAR_REGEX.format(designations=self.db_manager.get_allele_designations())
for variation in curated_variations:
self.assertTrue(re.match(allele_regex, variation))
self.assertTrue(re.match(allele_regex, variation.lower()))

def test_entity_name_id_maps(self):
gene_name_id_map = self.db_manager.get_gene_name_id_map()
Expand Down
4 changes: 2 additions & 2 deletions tests/lib/entity_extraction/test_email_addresses.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ def test_get_email_addresses_from_paper(self):
cm.load_from_wb_database(db_name=config["wb_database"]["db_name"], db_user=config["wb_database"]["db_user"],
db_password=config["wb_database"]["db_password"],
db_host=config["wb_database"]["db_host"],
ssh_user=tazendra_config["ssh"]["ssh_user"],
ssh_passwd=tazendra_config["ssh"]["ssh_password"],
file_server_user=tazendra_config["file_server"]["user"],
file_server_passwd=tazendra_config["file_server"]["password"],
paper_ids=['00062455'])
email_addresses = get_email_addresses_from_text(cm.get_paper('00062455').get_text_docs(
include_supplemental=False, return_concatenated=True))
Expand Down
4 changes: 2 additions & 2 deletions tests/lib/test_text_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ def test_sectioning_cell_template(self):
cm.load_from_wb_database(db_name=config["wb_database"]["db_name"], db_user=config["wb_database"]["db_user"],
db_password=config["wb_database"]["db_password"],
db_host=config["wb_database"]["db_host"],
ssh_user=tazendra_config["ssh"]["ssh_user"],
ssh_passwd=tazendra_config["ssh"]["ssh_password"],
file_server_user=tazendra_config["file_server"]["user"],
file_server_passwd=tazendra_config["file_server"]["password"],
paper_ids=['00059375'])
fulltext = cm.get_paper('00059375').get_text_docs(remove_sections=[PaperSections.REFERENCES],
must_be_present=[PaperSections.METHOD, PaperSections.RESULTS])
Expand Down
16 changes: 8 additions & 8 deletions tests/literature/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,15 @@ def test_load_from_wb_database(self):
cm.load_from_wb_database(db_name=db_config["wb_database"]["db_name"], db_user=db_config["wb_database"]["db_user"],
db_password=db_config["wb_database"]["db_password"],
db_host=db_config["wb_database"]["db_host"],
ssh_user=tazendra_config["ssh"]["ssh_user"],
ssh_passwd=tazendra_config["ssh"]["ssh_password"], max_num_papers=2)
file_server_user=tazendra_config["file_server"]["user"],
file_server_passwd=tazendra_config["file_server"]["password"], max_num_papers=2)
self.assertTrue(cm.size() == 2)
cm.load_from_wb_database(db_name=db_config["wb_database"]["db_name"],
db_user=db_config["wb_database"]["db_user"],
db_password=db_config["wb_database"]["db_password"],
db_host=db_config["wb_database"]["db_host"],
ssh_user=tazendra_config["ssh"]["ssh_user"],
ssh_passwd=tazendra_config["ssh"]["ssh_password"], max_num_papers=2,
file_server_user=tazendra_config["file_server"]["user"],
file_server_passwd=tazendra_config["file_server"]["password"], max_num_papers=2,
exclude_temp_pdf=True)
self.assertFalse(any([paper.is_temp() for paper in cm.get_all_papers()]))

Expand All @@ -83,8 +83,8 @@ def test_load_supplemental(self):
db_user=db_config["wb_database"]["db_user"],
db_password=db_config["wb_database"]["db_password"],
db_host=db_config["wb_database"]["db_host"],
ssh_user=tazendra_config["ssh"]["ssh_user"],
ssh_passwd=tazendra_config["ssh"]["ssh_password"], paper_ids=["00062512"])
file_server_user=tazendra_config["file_server"]["user"],
file_server_passwd=tazendra_config["file_server"]["password"], paper_ids=["00062512"])
self.assertTrue(len(cm.get_paper("00062512").supplemental_docs) > 0)

@unittest.skipIf(not os.path.exists(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data",
Expand All @@ -97,8 +97,8 @@ def test_load_from_wb_database_afp(self):
db_user=db_config["wb_database"]["db_user"],
db_password=db_config["wb_database"]["db_password"],
db_host=db_config["wb_database"]["db_host"],
ssh_user=tazendra_config["ssh"]["ssh_user"],
ssh_passwd=tazendra_config["ssh"]["ssh_password"], max_num_papers=2,
file_server_user=tazendra_config["file_server"]["user"],
file_server_passwd=tazendra_config["file_server"]["password"], max_num_papers=2,
load_curation_info=True, load_afp_info=True,
exclude_temp_pdf=True, exclude_afp_processed=True, must_be_autclass_flagged=True)
self.assertFalse(any([paper.afp_processed for paper in cm.get_all_papers()]))
Expand Down
28 changes: 16 additions & 12 deletions tests/literature/test_paper.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,13 @@ def test_extract_all_email_addresses_from_text(self):
"local_config", "db.cfg")), "Test DB config file not present")
def test_pdf2txt_conversion(self):
config = read_db_config()
ssh_config = read_tazendra_config()
file_server_config = read_tazendra_config()
db_manager = WBPaperDBManager(
dbname=config["wb_database"]["db_name"], user=config["wb_database"]["db_user"],
password=config["wb_database"]["db_password"], host=config["wb_database"]["db_host"])
paper = WBPaper(paper_id="00003969", db_manager=db_manager, ssh_user=ssh_config["ssh"]["ssh_user"],
ssh_passwd=ssh_config["ssh"]["ssh_password"])
paper = WBPaper(paper_id="00003969", db_manager=db_manager,
file_server_user=file_server_config["file_server"]["user"],
file_server_passwd=file_server_config["file_server"]["password"])
paper.load_text_from_pdf_files_in_db()
fulltext = paper.get_text_docs()
self.assertGreater(len(fulltext), 0)
Expand All @@ -86,12 +87,13 @@ def test_pdf2txt_conversion(self):
"local_config", "db.cfg")), "Test DB config file not present")
def test_pdf_table_conversion(self):
config = read_db_config()
ssh_config = read_tazendra_config()
file_server_config = read_tazendra_config()
db_manager = WBPaperDBManager(
dbname=config["wb_database"]["db_name"], user=config["wb_database"]["db_user"],
password=config["wb_database"]["db_password"], host=config["wb_database"]["db_host"])
paper = WBPaper(paper_id="00059755", db_manager=db_manager, ssh_user=ssh_config["ssh"]["ssh_user"],
ssh_passwd=ssh_config["ssh"]["ssh_password"])
paper = WBPaper(paper_id="00059755", db_manager=db_manager,
file_server_user=file_server_config["file_server"]["user"],
file_server_passwd=file_server_config["file_server"]["password"])
paper.load_text_from_pdf_files_in_db()
fulltext = paper.get_text_docs()
self.assertTrue(fulltext)
Expand All @@ -100,12 +102,13 @@ def test_pdf_table_conversion(self):
"local_config", "db.cfg")), "Test DB config file not present")
def test_tokenize_sentences_with_tables(self):
config = read_db_config()
ssh_config = read_tazendra_config()
file_server_config = read_tazendra_config()
db_manager = WBPaperDBManager(
dbname=config["wb_database"]["db_name"], user=config["wb_database"]["db_user"],
password=config["wb_database"]["db_password"], host=config["wb_database"]["db_host"])
paper = WBPaper(paper_id="00003969", db_manager=db_manager, ssh_user=ssh_config["ssh"]["ssh_user"],
ssh_passwd=ssh_config["ssh"]["ssh_password"])
paper = WBPaper(paper_id="00003969", db_manager=db_manager,
file_server_user=file_server_config["file_server"]["user"],
file_server_passwd=file_server_config["file_server"]["password"])
paper.load_text_from_pdf_files_in_db()
sentences = paper.get_text_docs(split_sentences=True)
self.assertGreater(len(sentences), 0)
Expand All @@ -114,12 +117,13 @@ def test_tokenize_sentences_with_tables(self):
"local_config", "db.cfg")), "Test DB config file not present")
def test_two_cols_conversion(self):
config = read_db_config()
ssh_config = read_tazendra_config()
file_server_config = read_tazendra_config()
db_manager = WBPaperDBManager(
dbname=config["wb_database"]["db_name"], user=config["wb_database"]["db_user"],
password=config["wb_database"]["db_password"], host=config["wb_database"]["db_host"])
paper = WBPaper(paper_id="00055367", db_manager=db_manager, ssh_user=ssh_config["ssh"]["ssh_user"],
ssh_passwd=ssh_config["ssh"]["ssh_password"])
paper = WBPaper(paper_id="00055367", db_manager=db_manager,
file_server_user=file_server_config["file_server"]["user"],
file_server_passwd=file_server_config["file_server"]["password"])
paper.load_text_from_pdf_files_in_db()
fulltext = paper.get_text_docs()
self.assertTrue(fulltext)
25 changes: 25 additions & 0 deletions wbtools/lib/scraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import re
import ssl
import tempfile
import urllib.request
from typing import List

Expand Down Expand Up @@ -64,3 +65,27 @@ def get_curated_papers(datatype, tazendra_user, tazendra_password) -> List[str]:
if m:
curated_papers = curated_papers | set(m.group(1).split())
return list(curated_papers)


def get_supp_file_names_from_paper_dir(paper_sup_dir_url, user, password):
    """Return the file names linked from a paper's supplemental directory page.

    The page at ``paper_sup_dir_url`` is fetched with an HTTP Basic
    ``Authorization`` header and scanned for listing entries whose icon has
    ``alt="[ ]"``; the ``href`` of the link following each such icon is
    collected.

    Args:
        paper_sup_dir_url: URL of the directory listing page to read.
        user: user name sent in the Basic authentication header.
        password: password sent in the Basic authentication header.

    Returns:
        list: the unique ``href`` values found, sorted so the result is
        deterministic; an empty list when the page has no matching entry.
    """
    request = urllib.request.Request(paper_sup_dir_url)
    credentials = base64.b64encode(bytes('%s:%s' % (user, password), 'ascii'))
    request.add_header("Authorization", "Basic %s" % credentials.decode('utf-8'))
    with urllib.request.urlopen(request) as response:
        page = response.read().decode("utf8")
    # Raw string: "\[" is an invalid escape sequence in a normal string literal.
    # The pattern is not wrapped in ".*" so that every entry is captured even
    # when several of them share one line of the page.
    # NOTE(review): alt="[ ]" is presumably the generic-file icon of the
    # server's auto-generated index -- confirm against the real listing.
    link_regex = r'alt="\[ \]"></td><td><a href="([^"]+)">'
    return sorted(set(re.findall(link_regex, page)))


def download_pdf_file_from_url(url, user, password):
    """Download the resource at ``url`` into a named temporary file.

    The request carries an HTTP Basic ``Authorization`` header built from
    ``user`` and ``password``.

    Args:
        url: URL of the file to download.
        user: user name sent in the Basic authentication header.
        password: password sent in the Basic authentication header.

    Returns:
        The open ``tempfile.NamedTemporaryFile`` object holding the downloaded
        bytes, positioned at the start. The file is removed from disk when the
        object is closed, so the caller must keep it alive while the file is
        needed and close it afterwards.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for a failed request; the
        temporary file is closed (and thus deleted) before re-raising.
    """
    # Function-scope import: shutil is not among the module imports visible here.
    import shutil

    request = urllib.request.Request(url)
    credentials = base64.b64encode(bytes('%s:%s' % (user, password), 'ascii'))
    request.add_header("Authorization", "Basic %s" % credentials.decode('utf-8'))
    tmp_file = tempfile.NamedTemporaryFile()
    try:
        with urllib.request.urlopen(request) as response:
            # Write through the existing handle (reopening a NamedTemporaryFile
            # by name is not portable) and stream in chunks instead of holding
            # the whole body in memory.
            shutil.copyfileobj(response, tmp_file)
        # Make the data visible to readers that open the file by name, and
        # leave the handle at offset 0 as it was in the previous implementation.
        tmp_file.flush()
        tmp_file.seek(0)
    except Exception:
        # Do not leave an open handle / stray temp file behind on failure.
        tmp_file.close()
        raise
    return tmp_file
13 changes: 7 additions & 6 deletions wbtools/literature/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ def load_from_dir_with_txt_files(self, dir_path: str):
paper.add_file(dir_path=dir_path, filename=f, remote_file=False, pdf=False)

def load_from_wb_database(self, db_name: str, db_user: str, db_password: str, db_host: str,
ssh_host: str = 'tazendra.caltech.edu', ssh_user: str = None, ssh_passwd: str = None,
file_server_host: str = 'https://tazendra.caltech.edu/~acedb/daniel/',
file_server_user: str = None, file_server_passwd: str = None,
paper_ids: list = None,
from_date: str = None, load_pdf_files: bool = True, load_bib_info: bool = True,
load_curation_info: bool = True, load_afp_info: bool = False, max_num_papers: int = None,
Expand All @@ -74,9 +75,9 @@ def load_from_wb_database(self, db_name: str, db_user: str, db_password: str, db
db_user (str): database user
db_password (str): database password
db_host (str): database host
ssh_host (str): host where to fetch the files via ssh
ssh_user (str): ssh user to fetch pdf files
ssh_passwd (str): ssh password to fetch pdf files
file_server_host (str): host where to fetch the files via url
file_server_user (str): user required to log in to web form
file_server_passwd (str): password to fetch pdf files from web form
paper_ids (list): optional list of paper ids to be fetched
from_date (str): load papers added or modified from the specified date (only if paper_ids is not provided)
load_pdf_files (bool): load pdf files from the file server using the file server credentials
Expand Down Expand Up @@ -119,8 +120,8 @@ def load_from_wb_database(self, db_name: str, db_user: str, db_password: str, db
exclude_no_author_email else []

for paper_id in paper_ids:
paper = WBPaper(paper_id=paper_id, ssh_host=ssh_host, ssh_user=ssh_user,
ssh_passwd=ssh_passwd, db_manager=main_db_manager.paper)
paper = WBPaper(paper_id=paper_id, file_server_host=file_server_host, file_server_user=file_server_user,
file_server_passwd=file_server_passwd, db_manager=main_db_manager.paper)
if exclude_afp_processed and paper_id in afp_processed_ids:
logger.info("Skipping paper already processed by AFP")
continue
Expand Down
Loading

0 comments on commit bb81b4e

Please sign in to comment.