From bb81b4e4727d32c784ac839496e526f2ea33df0e Mon Sep 17 00:00:00 2001 From: Valerio Arnaboldi Date: Thu, 10 Mar 2022 21:13:21 -0800 Subject: [PATCH] feat(file_server): download pdf files via http instead of ssh - changed ssh_ parameters (host, user, password) to file_server_ - added functions to download pdf files and to list supp files dirs from http server - updated tests - updated README --- README.md | 18 +++--- setup.py | 2 +- tests/db/test_generic.py | 2 +- .../entity_extraction/test_email_addresses.py | 4 +- tests/lib/test_text_preprocessing.py | 4 +- tests/literature/test_corpus.py | 16 +++--- tests/literature/test_paper.py | 28 ++++++---- wbtools/lib/scraping.py | 25 +++++++++ wbtools/literature/corpus.py | 13 +++-- wbtools/literature/paper.py | 56 +++++++++---------- 10 files changed, 98 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index c8d7fa2..14bafb8 100644 --- a/README.md +++ b/README.md @@ -18,20 +18,24 @@ from wbtools.literature.corpus import CorpusManager paper_id = "00050564" cm = CorpusManager() cm.load_from_wb_database(db_name="wb_dbname", db_user="wb_dbuser", db_password="wb_dbpasswd", db_host="wb_dbhost", - paper_ids=[paper_id], ssh_host="ssh_host", ssh_user="ssh_user", ssh_passwd="ssh_passwd") + paper_ids=[paper_id], file_server_host="file_server_base_url", file_server_user="username", + file_server_passwd="password") sentences = cm.get_paper(paper_id).get_text_docs(split_sentences=True) ``` -### Get the latest papers (up to 50) added to WormBase or modified in the last month +### Get the latest papers (up to 50) added to WormBase or modified in the last 30 days ```python from wbtools.literature.corpus import CorpusManager import datetime +one_month_ago = (datetime.datetime.now() - datetime.timedelta(days=30)).strftime("%m/%d/%Y") + cm = CorpusManager() cm.load_from_wb_database(db_name="wb_dbname", db_user="wb_dbuser", db_password="wb_dbpasswd", db_host="wb_dbhost", - from_date=datetime.datetime.now(), 
max_num_papers=50, ssh_host="ssh_host", ssh_user="ssh_user", - ssh_passwd="ssh_passwd") + from_date=one_month_ago, max_num_papers=50, + file_server_host="file_server_base_url", file_server_user="username", + file_server_passwd="password") paper_ids = [paper.paper_id for paper in cm.get_all_papers()] ``` @@ -43,8 +47,8 @@ import datetime cm = CorpusManager() cm.load_from_wb_database(db_name="wb_dbname", db_user="wb_dbuser", db_password="wb_dbpasswd", db_host="wb_dbhost", - from_date=datetime.datetime.now(), max_num_papers=50, must_be_autclass_flagged=True, - exclude_pap_types=['Review'], exclude_temp_pdf=True, ssh_host="ssh_host", ssh_user="ssh_user", - ssh_passwd="ssh_passwd") + max_num_papers=50, must_be_autclass_flagged=True, exclude_pap_types=['Review'], + exclude_temp_pdf=True, file_server_host="file_server_base_url", + file_server_user="username", file_server_passwd="password") paper_ids = [paper.paper_id for paper in cm.get_all_papers()] ``` \ No newline at end of file diff --git a/setup.py b/setup.py index f0c2684..f444813 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="wbtools", - version="1.2.16", + version="1.3.0", author="Valerio Arnaboldi", author_email="valearna@caltech.edu", description="Interface to WormBase (www.wormbase.org) curation data, including literature management and NLP " diff --git a/tests/db/test_generic.py b/tests/db/test_generic.py index 0c0a50d..7dc18e5 100644 --- a/tests/db/test_generic.py +++ b/tests/db/test_generic.py @@ -24,7 +24,7 @@ def test_get_curated_variations(self): curated_variations = self.db_manager.get_curated_variations(exclude_id_used_as_name=True) allele_regex = ALL_VAR_REGEX.format(designations=self.db_manager.get_allele_designations()) for variation in curated_variations: - self.assertTrue(re.match(allele_regex, variation)) + self.assertTrue(re.match(allele_regex, variation.lower())) def test_entity_name_id_maps(self): gene_name_id_map = self.db_manager.get_gene_name_id_map() diff --git 
a/tests/lib/entity_extraction/test_email_addresses.py b/tests/lib/entity_extraction/test_email_addresses.py index 9dd3574..d621574 100644 --- a/tests/lib/entity_extraction/test_email_addresses.py +++ b/tests/lib/entity_extraction/test_email_addresses.py @@ -23,8 +23,8 @@ def test_get_email_addresses_from_paper(self): cm.load_from_wb_database(db_name=config["wb_database"]["db_name"], db_user=config["wb_database"]["db_user"], db_password=config["wb_database"]["db_password"], db_host=config["wb_database"]["db_host"], - ssh_user=tazendra_config["ssh"]["ssh_user"], - ssh_passwd=tazendra_config["ssh"]["ssh_password"], + file_server_user=tazendra_config["file_server"]["user"], + file_server_passwd=tazendra_config["file_server"]["password"], paper_ids=['00062455']) email_addresses = get_email_addresses_from_text(cm.get_paper('00062455').get_text_docs( include_supplemental=False, return_concatenated=True)) diff --git a/tests/lib/test_text_preprocessing.py b/tests/lib/test_text_preprocessing.py index 3ec6757..4656e65 100644 --- a/tests/lib/test_text_preprocessing.py +++ b/tests/lib/test_text_preprocessing.py @@ -30,8 +30,8 @@ def test_sectioning_cell_template(self): cm.load_from_wb_database(db_name=config["wb_database"]["db_name"], db_user=config["wb_database"]["db_user"], db_password=config["wb_database"]["db_password"], db_host=config["wb_database"]["db_host"], - ssh_user=tazendra_config["ssh"]["ssh_user"], - ssh_passwd=tazendra_config["ssh"]["ssh_password"], + file_server_user=tazendra_config["file_server"]["user"], + file_server_passwd=tazendra_config["file_server"]["password"], paper_ids=['00059375']) fulltext = cm.get_paper('00059375').get_text_docs(remove_sections=[PaperSections.REFERENCES], must_be_present=[PaperSections.METHOD, PaperSections.RESULTS]) diff --git a/tests/literature/test_corpus.py b/tests/literature/test_corpus.py index b9c990a..aa90345 100644 --- a/tests/literature/test_corpus.py +++ b/tests/literature/test_corpus.py @@ -63,15 +63,15 @@ def 
test_load_from_wb_database(self): cm.load_from_wb_database(db_name=db_config["wb_database"]["db_name"], db_user=db_config["wb_database"]["db_user"], db_password=db_config["wb_database"]["db_password"], db_host=db_config["wb_database"]["db_host"], - ssh_user=tazendra_config["ssh"]["ssh_user"], - ssh_passwd=tazendra_config["ssh"]["ssh_password"], max_num_papers=2) + file_server_user=tazendra_config["file_server"]["user"], + file_server_passwd=tazendra_config["file_server"]["password"], max_num_papers=2) self.assertTrue(cm.size() == 2) cm.load_from_wb_database(db_name=db_config["wb_database"]["db_name"], db_user=db_config["wb_database"]["db_user"], db_password=db_config["wb_database"]["db_password"], db_host=db_config["wb_database"]["db_host"], - ssh_user=tazendra_config["ssh"]["ssh_user"], - ssh_passwd=tazendra_config["ssh"]["ssh_password"], max_num_papers=2, + file_server_user=tazendra_config["file_server"]["user"], + file_server_passwd=tazendra_config["file_server"]["password"], max_num_papers=2, exclude_temp_pdf=True) self.assertFalse(any([paper.is_temp() for paper in cm.get_all_papers()])) @@ -83,8 +83,8 @@ def test_load_supplemental(self): db_user=db_config["wb_database"]["db_user"], db_password=db_config["wb_database"]["db_password"], db_host=db_config["wb_database"]["db_host"], - ssh_user=tazendra_config["ssh"]["ssh_user"], - ssh_passwd=tazendra_config["ssh"]["ssh_password"], paper_ids=["00062512"]) + file_server_user=tazendra_config["file_server"]["user"], + file_server_passwd=tazendra_config["file_server"]["password"], paper_ids=["00062512"]) self.assertTrue(len(cm.get_paper("00062512").supplemental_docs) > 0) @unittest.skipIf(not os.path.exists(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data", @@ -97,8 +97,8 @@ def test_load_from_wb_database_afp(self): db_user=db_config["wb_database"]["db_user"], db_password=db_config["wb_database"]["db_password"], db_host=db_config["wb_database"]["db_host"], - 
ssh_user=tazendra_config["ssh"]["ssh_user"], - ssh_passwd=tazendra_config["ssh"]["ssh_password"], max_num_papers=2, + file_server_user=tazendra_config["file_server"]["user"], + file_server_passwd=tazendra_config["file_server"]["password"], max_num_papers=2, load_curation_info=True, load_afp_info=True, exclude_temp_pdf=True, exclude_afp_processed=True, must_be_autclass_flagged=True) self.assertFalse(any([paper.afp_processed for paper in cm.get_all_papers()])) diff --git a/tests/literature/test_paper.py b/tests/literature/test_paper.py index 143c951..d220a55 100644 --- a/tests/literature/test_paper.py +++ b/tests/literature/test_paper.py @@ -71,12 +71,13 @@ def test_extract_all_email_addresses_from_text(self): "local_config", "db.cfg")), "Test DB config file not present") def test_pdf2txt_conversion(self): config = read_db_config() - ssh_config = read_tazendra_config() + file_server_config = read_tazendra_config() db_manager = WBPaperDBManager( dbname=config["wb_database"]["db_name"], user=config["wb_database"]["db_user"], password=config["wb_database"]["db_password"], host=config["wb_database"]["db_host"]) - paper = WBPaper(paper_id="00003969", db_manager=db_manager, ssh_user=ssh_config["ssh"]["ssh_user"], - ssh_passwd=ssh_config["ssh"]["ssh_password"]) + paper = WBPaper(paper_id="00003969", db_manager=db_manager, + file_server_user=file_server_config["file_server"]["user"], + file_server_passwd=file_server_config["file_server"]["password"]) paper.load_text_from_pdf_files_in_db() fulltext = paper.get_text_docs() self.assertGreater(len(fulltext), 0) @@ -86,12 +87,13 @@ def test_pdf2txt_conversion(self): "local_config", "db.cfg")), "Test DB config file not present") def test_pdf_table_conversion(self): config = read_db_config() - ssh_config = read_tazendra_config() + file_server_config = read_tazendra_config() db_manager = WBPaperDBManager( dbname=config["wb_database"]["db_name"], user=config["wb_database"]["db_user"], password=config["wb_database"]["db_password"], 
host=config["wb_database"]["db_host"]) - paper = WBPaper(paper_id="00059755", db_manager=db_manager, ssh_user=ssh_config["ssh"]["ssh_user"], - ssh_passwd=ssh_config["ssh"]["ssh_password"]) + paper = WBPaper(paper_id="00059755", db_manager=db_manager, + file_server_user=file_server_config["file_server"]["user"], + file_server_passwd=file_server_config["file_server"]["password"]) paper.load_text_from_pdf_files_in_db() fulltext = paper.get_text_docs() self.assertTrue(fulltext) @@ -100,12 +102,13 @@ def test_pdf_table_conversion(self): "local_config", "db.cfg")), "Test DB config file not present") def test_tokenize_sentences_with_tables(self): config = read_db_config() - ssh_config = read_tazendra_config() + file_server_config = read_tazendra_config() db_manager = WBPaperDBManager( dbname=config["wb_database"]["db_name"], user=config["wb_database"]["db_user"], password=config["wb_database"]["db_password"], host=config["wb_database"]["db_host"]) - paper = WBPaper(paper_id="00003969", db_manager=db_manager, ssh_user=ssh_config["ssh"]["ssh_user"], - ssh_passwd=ssh_config["ssh"]["ssh_password"]) + paper = WBPaper(paper_id="00003969", db_manager=db_manager, + file_server_user=file_server_config["file_server"]["user"], + file_server_passwd=file_server_config["file_server"]["password"]) paper.load_text_from_pdf_files_in_db() sentences = paper.get_text_docs(split_sentences=True) self.assertGreater(len(sentences), 0) @@ -114,12 +117,13 @@ def test_tokenize_sentences_with_tables(self): "local_config", "db.cfg")), "Test DB config file not present") def test_two_cols_conversion(self): config = read_db_config() - ssh_config = read_tazendra_config() + file_server_config = read_tazendra_config() db_manager = WBPaperDBManager( dbname=config["wb_database"]["db_name"], user=config["wb_database"]["db_user"], password=config["wb_database"]["db_password"], host=config["wb_database"]["db_host"]) - paper = WBPaper(paper_id="00055367", db_manager=db_manager, 
ssh_user=ssh_config["ssh"]["ssh_user"], - ssh_passwd=ssh_config["ssh"]["ssh_password"]) + paper = WBPaper(paper_id="00055367", db_manager=db_manager, + file_server_user=file_server_config["file_server"]["user"], + file_server_passwd=file_server_config["file_server"]["password"]) paper.load_text_from_pdf_files_in_db() fulltext = paper.get_text_docs() self.assertTrue(fulltext) diff --git a/wbtools/lib/scraping.py b/wbtools/lib/scraping.py index ceb5836..6379f12 100644 --- a/wbtools/lib/scraping.py +++ b/wbtools/lib/scraping.py @@ -2,6 +2,7 @@ import os import re import ssl +import tempfile import urllib.request from typing import List @@ -64,3 +65,27 @@ def get_curated_papers(datatype, tazendra_user, tazendra_password) -> List[str]: if m: curated_papers = curated_papers | set(m.group(1).split()) return list(curated_papers) + + +def get_supp_file_names_from_paper_dir(paper_sup_dir_url, user, password): + request = urllib.request.Request(paper_sup_dir_url) + base64string = base64.b64encode(bytes('%s:%s' % (user, password), 'ascii')) + request.add_header("Authorization", "Basic %s" % base64string.decode('utf-8')) + supp_files = set() + with urllib.request.urlopen(request) as response: + res = response.read().decode("utf8") + m = re.findall('.*alt="\[ \]">.*', res) + if m: + supp_files = set(m) + return list(supp_files) + + +def download_pdf_file_from_url(url, user, password): + tmp_file = tempfile.NamedTemporaryFile() + request = urllib.request.Request(url) + base64string = base64.b64encode(bytes('%s:%s' % (user, password), 'ascii')) + request.add_header("Authorization", "Basic %s" % base64string.decode('utf-8')) + with urllib.request.urlopen(request) as response: + with open(tmp_file.name, 'wb') as tmp_file_stream: + tmp_file_stream.write(response.read()) + return tmp_file diff --git a/wbtools/literature/corpus.py b/wbtools/literature/corpus.py index c088329..3b5d77c 100644 --- a/wbtools/literature/corpus.py +++ b/wbtools/literature/corpus.py @@ -58,7 +58,8 @@ def 
load_from_dir_with_txt_files(self, dir_path: str): paper.add_file(dir_path=dir_path, filename=f, remote_file=False, pdf=False) def load_from_wb_database(self, db_name: str, db_user: str, db_password: str, db_host: str, - ssh_host: str = 'tazendra.caltech.edu', ssh_user: str = None, ssh_passwd: str = None, + file_server_host: str = 'https://tazendra.caltech.edu/~acedb/daniel/', + file_server_user: str = None, file_server_passwd: str = None, paper_ids: list = None, from_date: str = None, load_pdf_files: bool = True, load_bib_info: bool = True, load_curation_info: bool = True, load_afp_info: bool = False, max_num_papers: int = None, @@ -74,9 +75,9 @@ def load_from_wb_database(self, db_name: str, db_user: str, db_password: str, db db_user (str): database user db_password (str): database password db_host (str): database host - ssh_host (str): host where to fetch the files via ssh - ssh_user (str): ssh user to fetch pdf files - ssh_passwd (str): ssh password to fetch pdf files + file_server_host (str): base url of the file server from which the files are fetched via http + file_server_user (str): user for http basic authentication on the file server + file_server_passwd (str): password for http basic authentication on the file server paper_ids (list): optional list of paper ids to be fetched from_date (str): load papers added or modified from the specified date (only if paper_ids is not provided) load_pdf_files (bool): load pdf files using ssh credentials @@ -119,8 +120,8 @@ def load_from_wb_database(self, db_name: str, db_user: str, db_password: str, db exclude_no_author_email else [] for paper_id in paper_ids: - paper = WBPaper(paper_id=paper_id, ssh_host=ssh_host, ssh_user=ssh_user, - ssh_passwd=ssh_passwd, db_manager=main_db_manager.paper) + paper = WBPaper(paper_id=paper_id, file_server_host=file_server_host, file_server_user=file_server_user, + file_server_passwd=file_server_passwd, db_manager=main_db_manager.paper) if exclude_afp_processed and paper_id in afp_processed_ids: logger.info("Skipping paper already processed by AFP") 
continue diff --git a/wbtools/literature/paper.py b/wbtools/literature/paper.py index c8f5467..aed7833 100644 --- a/wbtools/literature/paper.py +++ b/wbtools/literature/paper.py @@ -16,6 +16,7 @@ from wbtools.db.person import WBPersonDBManager from wbtools.lib.nlp.entity_extraction.email_addresses import get_email_addresses_from_text from wbtools.lib.nlp.text_preprocessing import preprocess, get_documents_from_text, PaperSections +from wbtools.lib.scraping import get_supp_file_names_from_paper_dir, download_pdf_file_from_url from wbtools.lib.timeout import timeout from wbtools.literature.person import WBAuthor @@ -26,10 +27,10 @@ class PaperFileReader(object): - def __init__(self, ssh_host: str = '', ssh_user: str = '', ssh_passwd: str = ''): - self.ssh_host = ssh_host - self.ssh_user = ssh_user - self.ssh_passwd = ssh_passwd + def __init__(self, host: str = "https://tazendra.caltech.edu/~acedb/daniel/", user: str = '', password: str = ''): + self.host = host + self.user = user + self.passwd = password @staticmethod @timeout(3600) @@ -43,21 +44,11 @@ def convert_pdf_to_txt(file_path): return "" def get_supplemental_file_names(self, supp_dir_path): - with Connection(self.ssh_host, self.ssh_user, - connect_kwargs={"password": self.ssh_passwd}) as c, c.sftp() as sftp: - try: - return [filename for filename in sftp.listdir(supp_dir_path) if filename.endswith(".pdf")] - except UnicodeDecodeError: - logger.error("Cannot read non-unicode chars in filenames due to bug in ssh library") - return [] + return get_supp_file_names_from_paper_dir(self.host.rstrip("/") + "/" + supp_dir_path, self.user, self.passwd) def download_paper_and_extract_txt(self, file_url, pdf: bool = False): try: - with Connection(self.ssh_host, self.ssh_user, connect_kwargs={"password": self.ssh_passwd}) as \ - c, c.sftp() as sftp, sftp.open(file_url) as file_stream: - tmp_file = tempfile.NamedTemporaryFile() - with open(tmp_file.name, 'wb') as tmp_file_stream: - 
tmp_file_stream.write(file_stream.read()) + tmp_file = download_pdf_file_from_url(file_url, self.user, self.passwd) if pdf: return self.convert_pdf_to_txt(tmp_file.name) else: @@ -66,9 +57,9 @@ def download_paper_and_extract_txt(self, file_url, pdf: bool = False): logger.warning("File not found: " + file_url) return "" - def get_text_from_file(self, dir_path, filename, remote_file: bool = False, pdf: bool = False): + def get_text_from_file(self, dir_path, supp_dir, filename, remote_file: bool = False, pdf: bool = False): if remote_file: - text = self.download_paper_and_extract_txt(dir_path + filename, pdf) + text = self.download_paper_and_extract_txt(self.host.rstrip("/") + "/" + supp_dir + filename, pdf) else: with open(os.path.join(dir_path, filename), 'r') as file: text = file.read() @@ -82,9 +73,9 @@ class WBPaper(object): def __init__(self, paper_id: str = '', main_text: str = '', ocr_text: str = '', temp_text: str = '', aut_text: str = '', html_text: str = '', proof_text: str = '', supplemental_docs: list = None, - ssh_host: str = 'tazendra.caltech.edu', ssh_user: str = '', ssh_passwd: str = '', title: str = '', - journal: str = '', - pub_date: str = '', authors: List[WBAuthor] = None, abstract: str = '', doi: str = '', pmid: str = '', + file_server_host: str = 'https://tazendra.caltech.edu/~acedb/daniel/', file_server_user: str = '', + file_server_passwd: str = '', title: str = '', journal: str = '', pub_date: str = '', + authors: List[WBAuthor] = None, abstract: str = '', doi: str = '', pmid: str = '', db_manager: WBPaperDBManager = None): self.paper_id = paper_id self.title = title @@ -102,7 +93,8 @@ def __init__(self, paper_id: str = '', main_text: str = '', ocr_text: str = '', self.proof_text = proof_text self.supplemental_docs = supplemental_docs if supplemental_docs else [] self.aut_class_values = defaultdict(str) - self.paper_file_reader = PaperFileReader(ssh_host=ssh_host, ssh_user=ssh_user, ssh_passwd=ssh_passwd) + self.paper_file_reader = 
PaperFileReader(host=file_server_host, user=file_server_user, + password=file_server_passwd) self.afp_final_submission = False self.afp_processed = False self.afp_partial_submission = False @@ -170,11 +162,13 @@ def add_file(self, dir_path, filename, remote_file: bool = False, pdf: bool = Fa pdf (bool): whether the file is in pdf format """ all_supp = False + sup_dir = "" if not self.paper_file_reader and remote_file: raise Exception("a paper reader must be provided to access remote files") if dir_path.endswith("supplemental/") and re.match(r'^[0-9]+$', filename): - filenames = self.paper_file_reader.get_supplemental_file_names(dir_path + filename) + filenames = self.paper_file_reader.get_supplemental_file_names(filename) dir_path = dir_path + filename + "/" + sup_dir = "/" + filename + "/" all_supp = True else: filenames = [filename] @@ -190,24 +184,24 @@ def add_file(self, dir_path, filename, remote_file: bool = False, pdf: bool = Fa author_year or "supplementary" in author_year or "Supplementary" in author_year or re.match(r'[_-][Ss][0-9]+', author_year))): self.supplemental_docs.append(self.paper_file_reader.get_text_from_file( - dir_path, filename, remote_file, pdf)) + dir_path, sup_dir, filename, remote_file, pdf)) return if not additional_options: - self.main_text = self.paper_file_reader.get_text_from_file(dir_path, filename, remote_file, pdf) + self.main_text = self.paper_file_reader.get_text_from_file(dir_path, sup_dir, filename, remote_file, pdf) elif "Supp" in additional_options or "supp" in additional_options or "Table" in additional_options or \ "table" in additional_options or "Movie" in additional_options or "movie" in additional_options: self.supplemental_docs.append(self.paper_file_reader.get_text_from_file( - dir_path, filename, remote_file, pdf)) + dir_path, sup_dir, filename, remote_file, pdf)) elif "ocr" in additional_options: - self.ocr_text = self.paper_file_reader.get_text_from_file(dir_path, filename, remote_file, pdf) + self.ocr_text = 
self.paper_file_reader.get_text_from_file(dir_path, sup_dir, filename, remote_file, pdf) elif "_proof" in additional_options: - self.proof_text = self.paper_file_reader.get_text_from_file(dir_path, filename, remote_file, pdf) + self.proof_text = self.paper_file_reader.get_text_from_file(dir_path, sup_dir, filename, remote_file, pdf) elif "temp" in additional_options: - self.temp_text = self.paper_file_reader.get_text_from_file(dir_path, filename, remote_file, pdf) + self.temp_text = self.paper_file_reader.get_text_from_file(dir_path, sup_dir, filename, remote_file, pdf) elif "aut" in additional_options: - self.aut_text = self.paper_file_reader.get_text_from_file(dir_path, filename, remote_file, pdf) + self.aut_text = self.paper_file_reader.get_text_from_file(dir_path, sup_dir, filename, remote_file, pdf) elif "html" in additional_options: - self.html_text = self.paper_file_reader.get_text_from_file(dir_path, filename, remote_file, pdf) + self.html_text = self.paper_file_reader.get_text_from_file(dir_path, sup_dir, filename, remote_file, pdf) else: logger.warning("No rule to read filename: " + filename)