From cba3367ead6165a961b1753d15e5967bfbfa0052 Mon Sep 17 00:00:00 2001 From: Joseph Hendrix <80018977+joeleehen@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:16:33 -0600 Subject: [PATCH 1/3] Cleanup (#45) * added callback for --list option * Update README.md --- README.md | 8 +++++--- pubscraper/main.py | 20 +++++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index e1e1003..d7757c5 100644 --- a/README.md +++ b/README.md @@ -161,9 +161,11 @@ Springer,Dan Stanzione,10.1007/s44290-024-00034-6,Discover Civil Engineering,Art - Python >=3.12 - Poetry (using [asdf-poetry](https://github.com/asdf-community/asdf-poetry) is recommended) - poetry-bumpversion plugin - ```console - > poetry self add poetry-bumpversion - ``` +```console +> git clone git@github.com:tacc/publication-scraper.git +> cd publication-scraper +> poetry self add poetry-bumpversion +``` Before developing, first install the script with developmend dependencies: ```console diff --git a/pubscraper/main.py b/pubscraper/main.py index 4af8eb3..579091f 100644 --- a/pubscraper/main.py +++ b/pubscraper/main.py @@ -2,7 +2,6 @@ import logging import time import os -import sys import tablib from openpyxl import load_workbook @@ -61,6 +60,17 @@ def set_log_file(ctx, param, value): return value +def list_configured_apis(ctx, param, value): + """ + Callback function for click that lists available APIs + """ + if value: + click.secho("Available endpoints:", underline=True) + for endpoint in APIS.keys(): + click.secho(f" {endpoint}", fg="blue") + ctx.exit() + + @click.command() @click.version_option(__version__) @click.option( @@ -112,6 +122,8 @@ def set_log_file(ctx, param, value): "list_apis", is_flag=True, default=False, + is_eager=True, + callback=list_configured_apis, help="Display APIs configured for search queries", ) @click.option( @@ -150,12 +162,6 @@ def main( if log_file: logger.debug(f"Writing logs to {log_file}") - if list_apis: - click.secho("Available endpoints:", underline=True) - for endpoint in APIS.keys(): - click.secho(f" {endpoint}", fg="blue") - exit(0) - logger.info(f"Querying the following APIs:\n{(", ").join(apis)}") try: authors_workbook = load_workbook(filename=input_file, read_only=True) From 6465bd7920397a4521d757f6a5c3fb03c8af99c8 Mon Sep 17 00:00:00 2001 From: Magret Date: Thu, 12 Dec 2024 15:24:23 -0600 Subject: [PATCH 2/3] Updated hard-coded vars to config.py --- pubscraper/APIClasses/CrossRef.py | 3 --- pubscraper/APIClasses/Elsevier.py | 3 --- pubscraper/APIClasses/IEEE.py | 3 --- pubscraper/APIClasses/MDPI.py | 3 --- pubscraper/APIClasses/PLOS.py | 3 --- pubscraper/APIClasses/PubMed.py | 3 --- pubscraper/APIClasses/Springer.py | 3 --- pubscraper/APIClasses/Wiley.py | 3 --- pubscraper/APIClasses/arXiv.py | 4 ---- pubscraper/config.py | 3 ++- pubscraper/main.py | 5 ++--- 11 files changed, 4 insertions(+), 32 deletions(-) diff --git a/pubscraper/APIClasses/CrossRef.py b/pubscraper/APIClasses/CrossRef.py index 7a0e18a..2f2b648 100644 --- a/pubscraper/APIClasses/CrossRef.py +++ b/pubscraper/APIClasses/CrossRef.py @@ -6,9 +6,6 @@ from pubscraper.APIClasses.Base import Base import pubscraper.config as config -LOG_FORMAT = config.LOGGER_FORMAT_STRING -LOG_LEVEL = config.LOGGER_LEVEL -logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) logger = logging.getLogger(__name__) # NOTE: we might want to limit results to works published after TACC was founded diff --git a/pubscraper/APIClasses/Elsevier.py b/pubscraper/APIClasses/Elsevier.py index 1797e88..2576ae5 100644 --- a/pubscraper/APIClasses/Elsevier.py +++ b/pubscraper/APIClasses/Elsevier.py @@ -9,9 +9,6 @@ from pubscraper.APIClasses.Base import Base import pubscraper.config as config -LOG_FORMAT = config.LOGGER_FORMAT_STRING -LOG_LEVEL = config.LOGGER_LEVEL -logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) logger = logging.getLogger(__name__) class Elsevier(Base): diff --git a/pubscraper/APIClasses/IEEE.py b/pubscraper/APIClasses/IEEE.py index ebae799..7ee8ab7 100644 --- a/pubscraper/APIClasses/IEEE.py +++ b/pubscraper/APIClasses/IEEE.py @@ -5,9 +5,6 @@ from pubscraper.APIClasses.Base import Base import pubscraper.config as config -LOG_FORMAT = config.LOGGER_FORMAT_STRING -LOG_LEVEL = config.LOGGER_LEVEL -logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) logger = logging.getLogger(__name__) diff --git a/pubscraper/APIClasses/MDPI.py b/pubscraper/APIClasses/MDPI.py index 9a97627..e344152 100644 --- a/pubscraper/APIClasses/MDPI.py +++ b/pubscraper/APIClasses/MDPI.py @@ -6,9 +6,6 @@ from pubscraper.APIClasses.Base import Base import pubscraper.config as config -LOG_FORMAT = config.LOGGER_FORMAT_STRING -LOG_LEVEL = config.LOGGER_LEVEL -logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) logger = logging.getLogger(__name__) diff --git a/pubscraper/APIClasses/PLOS.py b/pubscraper/APIClasses/PLOS.py index 1659cb0..ce989b1 100644 --- a/pubscraper/APIClasses/PLOS.py +++ b/pubscraper/APIClasses/PLOS.py @@ -6,9 +6,6 @@ from pubscraper.APIClasses.Base import Base import pubscraper.config as config -LOG_FORMAT = config.LOGGER_FORMAT_STRING -LOG_LEVEL = config.LOGGER_LEVEL -logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) logger = logging.getLogger(__name__) diff --git a/pubscraper/APIClasses/PubMed.py b/pubscraper/APIClasses/PubMed.py index 9f4f1d6..53d77d2 100644 --- a/pubscraper/APIClasses/PubMed.py +++ b/pubscraper/APIClasses/PubMed.py @@ -7,9 +7,6 @@ from pubscraper.APIClasses.Base import Base import pubscraper.config as config -LOG_FORMAT = config.LOGGER_FORMAT_STRING -LOG_LEVEL = config.LOGGER_LEVEL -logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) logger = logging.getLogger(__name__) diff --git a/pubscraper/APIClasses/Springer.py b/pubscraper/APIClasses/Springer.py index a749b34..02d296f 100644 --- a/pubscraper/APIClasses/Springer.py +++ b/pubscraper/APIClasses/Springer.py @@ -9,9 +9,6 @@ from pubscraper.APIClasses.Base import Base import pubscraper.config as config -LOG_FORMAT = config.LOGGER_FORMAT_STRING -LOG_LEVEL = config.LOGGER_LEVEL -logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) logger = logging.getLogger(__name__) diff --git a/pubscraper/APIClasses/Wiley.py b/pubscraper/APIClasses/Wiley.py index 58c2cb4..e36aba0 100644 --- a/pubscraper/APIClasses/Wiley.py +++ b/pubscraper/APIClasses/Wiley.py @@ -8,9 +8,6 @@ from pubscraper.APIClasses.Base import Base import pubscraper.config as config -LOG_FORMAT = config.LOGGER_FORMAT_STRING -LOG_LEVEL = config.LOGGER_LEVEL -logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) logger = logging.getLogger(__name__) diff --git a/pubscraper/APIClasses/arXiv.py b/pubscraper/APIClasses/arXiv.py index 9b310ad..1ec3d5e 100644 --- a/pubscraper/APIClasses/arXiv.py +++ b/pubscraper/APIClasses/arXiv.py @@ -7,12 +7,8 @@ from pubscraper.APIClasses.Base import Base import pubscraper.config as config -LOG_FORMAT = config.LOGGER_FORMAT_STRING -LOG_LEVEL = config.LOGGER_LEVEL -logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) logger = logging.getLogger(__name__) - class ArXiv(Base): def __init__(self): """ diff --git a/pubscraper/config.py b/pubscraper/config.py index 5608aab..277e157 100644 --- a/pubscraper/config.py +++ b/pubscraper/config.py @@ -14,4 +14,5 @@ PLOS_URL = "https://api.plos.org/search" CROSSREF_URL = "https://api.crossref.org/works" -API_LIST = ["Elsevier", "IEEE", "MDPI", "PubMed", "Springer", "Wiley", "arXiv", "PLOS", "CrossRef"] +WS_NAME = "Sheet1" +TIME_SLEEP = 0.4 \ No newline at end of file diff --git a/pubscraper/main.py b/pubscraper/main.py index 4af8eb3..ca7d25b 100644 --- a/pubscraper/main.py +++ b/pubscraper/main.py @@ -2,7 +2,6 @@ import logging import time import os -import sys import tablib from openpyxl import load_workbook @@ -159,7 +158,7 @@ def main( logger.info(f"Querying the following APIs:\n{(", ").join(apis)}") try: authors_workbook = load_workbook(filename=input_file, read_only=True) - worksheet = authors_workbook["Sheet1"] + worksheet = authors_workbook[config.WS_NAME] rows = worksheet.rows name_dict = {} @@ -205,7 +204,7 @@ def main( results.update({author: authors_pubs}) authors_and_pubs.append(results) - time.sleep(0.4) + time.sleep(config.TIME_SLEEP) """ Using TabLib to format data in specified format From 1e2a708e0cc026e7663b1d00b90eda8acae8098f Mon Sep 17 00:00:00 2001 From: Joseph Hendrix Date: Thu, 12 Dec 2024 15:45:47 -0600 Subject: [PATCH 3/3] bumped version --- pubscraper/version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pubscraper/version.py b/pubscraper/version.py index 3dc1f76..49e0fc1 100644 --- a/pubscraper/version.py +++ b/pubscraper/version.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.7.0" diff --git a/pyproject.toml b/pyproject.toml index 568d9be..9d4bda7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pubscraper" -version = "0.1.0" +version = "0.7.0" description = "A tool designed to pull publication information from various publishers associated with a given author name" authors = ["Magret Adekunle ", "Joseph Hendrix