diff --git a/litstudy/sources/csv.py b/litstudy/sources/csv.py index aa1678e..a4d18f3 100644 --- a/litstudy/sources/csv.py +++ b/litstudy/sources/csv.py @@ -116,7 +116,7 @@ def publication_date(self): for fmt in formats: try: - return datetime.strptime(text, fmt) + return datetime.datetime.strptime(text, fmt) except Exception: pass @@ -167,6 +167,7 @@ def load_csv( citation_field: str = None, date_field: str = None, source_field: str = None, + doi_field: str = None, filter=None, ) -> DocumentSet: """Load an abitrary CSV file and parse its contents as a ``DocumentSet`` @@ -190,6 +191,8 @@ def load_csv( :param abstract_field: Field name for ``abstract``. :param citation_field: Field name for ``citation_count``. :param date_field: Field name for ``publication_date`` or + :param source_field: Field name for ``source``. + :param doi_field: Field name for ``doi``. :param filter: Optional function applied to each loaded record. This function can be used to, for example, add or delete fields. @@ -309,7 +312,8 @@ def load_csv( "pubmed id", ], ), - doi=find_field( + doi=doi_field + or find_field( columns, [ "doi", diff --git a/tests/resources/retraction_watch.csv b/tests/resources/retraction_watch.csv new file mode 100644 index 0000000..ed0d2c3 --- /dev/null +++ b/tests/resources/retraction_watch.csv @@ -0,0 +1,2 @@ +Record ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,ArticleType,RetractionDate,RetractionDOI,RetractionPubMedID,OriginalPaperDate,OriginalPaperDOI,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes +4242,Reflections on Research Software,(B/T) Computer Science;(B/T) Data Science;(B/T) Technology;,"Netherlands fScience Center, Nieuw-Amsterdam, Netherlands",Journal of Prominent Things,Prominence Inc,Netherlands,Patrick Bos,,Fake Research Article;,7/31/2024 14:00,10.4242/2024/01,0,7/31/2024 13:59,10.4242/2024/00,0,Retraction,+Concerns/Issues About Reality;+Randomly Generated Content;,No,This is a made-up dummy entry. 
diff --git a/tests/test_sources_csv.py b/tests/test_sources_csv.py
index 1e54aae..b104c23 100644
--- a/tests/test_sources_csv.py
+++ b/tests/test_sources_csv.py
@@ -48,3 +48,39 @@ def test_load_scopus_csv():
 
     assert len(doc.authors) == 10
     assert doc.authors[0].name == "Phillips J.C."
+
+
+def test_load_retraction_watch_csv():
+    """Load a Retraction Watch export using explicit DOI and source columns."""
+    path = os.path.dirname(__file__) + "/resources/retraction_watch.csv"
+
+    # Retraction Watch has no plain date column, so derive one from
+    # ``OriginalPaperDate`` (presumably picked up as the publication date by
+    # load_csv's column detection; ``date_field`` is not passed explicitly).
+    def date_filter(d: dict) -> dict:
+        """Add an ISO-formatted ``date`` key derived from ``OriginalPaperDate``."""
+        import datetime
+
+        # The fixture stores timestamps as "7/31/2024 13:59". Let a malformed or
+        # missing value raise here so the test fails at the cause, not later.
+        parsed = datetime.datetime.strptime(d["OriginalPaperDate"], "%m/%d/%Y %H:%M")
+        d["date"] = parsed.date().isoformat()
+        return d
+
+    docs = load_csv(
+        path,
+        doi_field="OriginalPaperDOI",
+        source_field="Journal",
+        filter=date_filter,
+    )
+    doc = docs[0]
+
+    # NOTE(review): ``doi_field`` is passed but the resulting DOI is never
+    # asserted; add a check once the accessor to use is confirmed.
+    assert doc.title == "Reflections on Research Software"
+    assert doc.publication_source == "Journal of Prominent Things"
+    assert doc.language is None
+    assert doc.publication_year == 2024
+
+    assert len(doc.authors) == 1
+    assert doc.authors[0].name == "Patrick Bos"