Skip to content

Commit

Permalink
Merge pull request #97 from egpbos/custom_doi_field_csv
Browse files Browse the repository at this point in the history
Custom DOI field in `load_csv`
  • Loading branch information
stijnh authored Jul 31, 2024
2 parents fcb82e8 + b59a9ad commit f1a6079
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 2 deletions.
8 changes: 6 additions & 2 deletions litstudy/sources/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def publication_date(self):

for fmt in formats:
try:
return datetime.strptime(text, fmt)
return datetime.datetime.strptime(text, fmt)
except Exception:
pass

Expand Down Expand Up @@ -167,6 +167,7 @@ def load_csv(
citation_field: str = None,
date_field: str = None,
source_field: str = None,
doi_field: str = None,
filter=None,
) -> DocumentSet:
"""Load an arbitrary CSV file and parse its contents as a ``DocumentSet``
Expand All @@ -190,6 +191,8 @@ def load_csv(
:param abstract_field: Field name for ``abstract``.
:param citation_field: Field name for ``citation_count``.
:param date_field: Field name for ``publication_date`` or
:param source_field: Field name for ``source``.
:param doi_field: Field name for ``doi``.
:param filter: Optional function applied to each loaded record. This
function can be used to, for example, add or delete fields.
Expand Down Expand Up @@ -309,7 +312,8 @@ def load_csv(
"pubmed id",
],
),
doi=find_field(
doi=doi_field
or find_field(
columns,
[
"doi",
Expand Down
2 changes: 2 additions & 0 deletions tests/resources/retraction_watch.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Record ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,ArticleType,RetractionDate,RetractionDOI,RetractionPubMedID,OriginalPaperDate,OriginalPaperDOI,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes
4242,Reflections on Research Software,(B/T) Computer Science;(B/T) Data Science;(B/T) Technology;,"Netherlands fScience Center, Nieuw-Amsterdam, Netherlands",Journal of Prominent Things,Prominence Inc,Netherlands,Patrick Bos,,Fake Research Article;,7/31/2024 14:00,10.4242/2024/01,0,7/31/2024 13:59,10.4242/2024/00,0,Retraction,+Concerns/Issues About Reality;+Randomly Generated Content;,No,This is a made-up dummy entry.
24 changes: 24 additions & 0 deletions tests/test_sources_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,27 @@ def test_load_scopus_csv():

assert len(doc.authors) == 10
assert doc.authors[0].name == "Phillips J.C."

def test_load_retraction_watch_csv():
    """Load a Retraction Watch style CSV using caller-supplied field names.

    The fixture has two DOI columns (``RetractionDOI`` and
    ``OriginalPaperDOI``) and no column literally named ``doi``, so it
    exercises the ``doi_field`` argument of ``load_csv``. ``source_field``
    is set explicitly to the ``Journal`` column as well.
    """
    path = os.path.dirname(__file__) + "/resources/retraction_watch.csv"

    # let's also go out of our way to make the date field work:
    def date_filter(d: dict) -> dict:
        """Add an ISO-formatted ``date`` key derived from ``OriginalPaperDate``.

        Records whose ``OriginalPaperDate`` does not match the
        ``M/D/YYYY H:MM`` layout are returned unchanged.
        """
        import datetime

        try:
            parsed = datetime.datetime.strptime(
                d["OriginalPaperDate"], "%m/%d/%Y %H:%M"
            )
        except ValueError:
            # Best effort: leave records with an unparseable date as they are.
            return d

        # NOTE(review): presumably ``load_csv`` auto-detects a column named
        # "date"; the publication_year assertion below depends on that.
        d["date"] = parsed.date().isoformat()
        return d

    docs = load_csv(
        path,
        doi_field="OriginalPaperDOI",
        source_field="Journal",
        filter=date_filter,
    )
    doc = docs[0]

    assert doc.title == "Reflections on Research Software"
    assert doc.publication_source == "Journal of Prominent Things"
    assert doc.language is None
    assert doc.publication_year == 2024

    # The point of ``doi_field``: the DOI must come from ``OriginalPaperDOI``
    # (10.4242/2024/00), not from ``RetractionDOI`` (10.4242/2024/01).
    assert doc.id.doi == "10.4242/2024/00"

    assert len(doc.authors) == 1
    assert doc.authors[0].name == "Patrick Bos"

0 comments on commit f1a6079

Please sign in to comment.