Skip to content

Commit

Permalink
Add ContributorIds to XML output
Browse files Browse the repository at this point in the history
  • Loading branch information
nsoranzo committed Jan 13, 2024
1 parent 22569ed commit e70f83d
Show file tree
Hide file tree
Showing 7 changed files with 201 additions and 5 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install tox
run: pip install tox
- name: Run unit tests
run: tox -e test
- name: Install project
run: pip install .
- name: Test
Expand Down
2 changes: 2 additions & 0 deletions config.yaml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ rf_password: "secret"
email: "[email protected]"
# URL of the export of publications from NBIROS in XML format
nbiros_pub_export_xml_url: "https://example.org/path/to/pubs.xml"
# URL of the export of EI people data in CSV format
people_data_csv_url: "https://example.org/path/to/people.csv"
2 changes: 2 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[pytest]
pythonpath = .
106 changes: 103 additions & 3 deletions rfparser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
#!/usr/bin/env python3

import argparse
import csv
import itertools
import logging
import os
import re
import sys
from dataclasses import dataclass
from time import sleep
from typing import (
Any,
Expand All @@ -19,8 +22,10 @@
from requests import Session

from .util import (
is_same_person,
str_if_not_None,
strip_tags,
unique,
)

if sys.version_info >= (3, 9):
Expand Down Expand Up @@ -63,10 +68,19 @@
"green": "Green Open Access",
"hybrid": "Gold Open Access",
}
VALID_ORCID_ID = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")

log = logging.getLogger(__name__)


@dataclass
class Person:
    """A person record parsed from the people data CSV export."""

    # Username used as a ContributorId in the XML output
    username: str
    given_names: str
    family_names: str
    # Sanitised ORCID id URL ("https://orcid.org/XXXX-...") or None if not available
    orcid_id: Optional[str]


def RF_login(username: str, password: str) -> Session:
"""
Login to ResearchFish API and return a session storing the auth cookie.
Expand Down Expand Up @@ -185,7 +199,49 @@ def get_dois_from_old_xml(nbiros_pub_export_xml_url: Optional[str], pubs_with_do
pubs_with_doi[doi]["nbiros_entries"].append(pub_el)


def write_xml_output(pubs_with_doi: Dict[str, Dict[str, Any]], outfile: str) -> None:
def sanitise_orcid_id(orcid_id: Optional[str]) -> Optional[str]:
    """
    Normalise an ORCID id to the canonical "https://orcid.org/XXXX-XXXX-XXXX-XXXX" URL.

    :param orcid_id: a bare hyphenated id (e.g. "0000-0002-1825-0097") or a
        full ORCID URL; falsy values are accepted and yield None.
    :return: the canonical URL form, or None for falsy input.
    :raises ValueError: if the id is malformed.
    """
    if not orcid_id:
        return None
    # Remove initial part, if it's a URL
    number = orcid_id.split("/")[-1]
    # Normalise a lowercase "x" checksum character to uppercase.
    # (The original `number.replace("-", "-")` was a no-op; presumably a
    # normalisation step was intended here.)
    number = number.upper()
    # Raise instead of assert: asserts are stripped under `python -O`.
    if len(number) != 19 or not re.fullmatch(r"\d{4}-\d{4}-\d{4}-\d{3}[\dX]", number):
        raise ValueError(f"Malformed ORCID id {orcid_id}")
    return f"https://orcid.org/{number}"


def get_persons(people_data_csv_url: Optional[str]) -> List[Person]:
    """
    Download the people data CSV and build the list of Person records.

    Each CSV row is (username, given_names, family_names, orcid_id).  When
    several rows share the same given and family names, only the last such
    row is kept.  Returns an empty list when no URL is configured.
    """
    log.info("Started get_persons")
    if not people_data_csv_url:
        log.warning("people_data_csv_url option not specified")
        return []
    response = requests.get(people_data_csv_url)
    response.raise_for_status()
    persons = []
    for username, given_names, family_names, orcid_id in csv.reader(response.text.splitlines()):
        persons.append(
            Person(
                username=username,
                given_names=given_names,
                family_names=family_names,
                orcid_id=sanitise_orcid_id(orcid_id),
            )
        )
    # A person is a duplicate when a *later* row has the same given and
    # family names; drop the earlier occurrences, keeping the last one.
    indexes_to_drop = [
        i
        for i, person in enumerate(persons)
        if any(
            person.given_names == other.given_names and person.family_names == other.family_names
            for other in persons[i + 1 :]
        )
    ]
    # Delete from the end so earlier indexes stay valid.
    for i in reversed(indexes_to_drop):
        log.warning("Duplicated person %s will be eliminated", persons[i])
        del persons[i]
    log.info("Total persons: %s", len(persons))
    return persons


def write_xml_output(
pubs_with_doi: Dict[str, Dict[str, Any]],
outfile: str,
people_data_csv_url: Optional[str],
) -> None:
"""
Write the publications to an XML file for the EI website.
"""
Expand All @@ -209,7 +265,39 @@ def author_dict_to_contributor(author_dict: Dict[str, Any]) -> str:
raise Exception(f"Unrecognised author_dict format: {author_dict}")
return name

def author_dict_to_username(author_dict: Dict[str, Any]) -> Optional[str]:
    """
    Map a publication author dict to a known person's username, or None.

    Matching is attempted first on the ORCID id, then on family/given
    names.  ``persons`` comes from the enclosing scope.
    """
    # First try to match the ORCID id
    orcid_id = sanitise_orcid_id(author_dict.get("ORCID"))
    if orcid_id:
        usernames = [person.username for person in persons if person.orcid_id == orcid_id]
        if usernames:
            if len(usernames) > 1:
                log.warning("Multiple usernames for ORCID id %s", orcid_id)
            return usernames[0]
    # Try to match the family and given names
    family_names = author_dict.get("family")
    if family_names:
        given_names = author_dict.get("given", "")
        # When both the author and a person have an ORCID id, their ids
        # differ (the ORCID match above failed), so a name match would be
        # a false positive: skip those persons.
        usernames = [
            person.username
            for person in persons
            if not (orcid_id and person.orcid_id)
            and is_same_person(person.family_names, person.given_names, family_names, given_names)
        ]
        if usernames:
            if len(usernames) > 1:
                log.warning(
                    "Multiple usernames for family names '%s', given names '%s': %s",
                    family_names,
                    given_names,
                    usernames,
                )
            return usernames[0]
    # No need to try to match "name", which is only used for consortia
    return None

log.info("Started write_xml_output")
persons = get_persons(people_data_csv_url)
root_el = ElementTree.Element("publications")
for doi, pub in reversed(pubs_with_doi.items()):
if pub["metadata_ok"]:
Expand All @@ -231,6 +319,18 @@ def author_dict_to_contributor(author_dict: Dict[str, Any]) -> str:
ElementTree.SubElement(publication_el, "SeriesTitle").text = pub["series-title"]
ElementTree.SubElement(publication_el, "JournalVolume").text = pub["volume"]
ElementTree.SubElement(publication_el, "JournalPages").text = pub["pages"]
try:
contributor_ids_list = [author_dict_to_username(author_dict) for author_dict in pub["authors"]]
for nbiros_entry in pub.get("nbiros_entries", []):
ContributorIds_el = nbiros_entry.find("ContributorIds")
assert ContributorIds_el is not None
ContributorIds_text = ContributorIds_el.text or ""
contributor_ids_list.extend(c.strip() for c in ContributorIds_text.split(","))
contributor_ids = unique(filter(None, contributor_ids_list))
except Exception:
log.error("Error while generating ContributorIds for DOI %s", doi)
raise
ElementTree.SubElement(publication_el, "ContributorIds").text = ", ".join(contributor_ids)
ElementTree.SubElement(publication_el, "ContributorList").text = ", ".join(
author_dict_to_contributor(author_dict) for author_dict in pub["authors"]
)
Expand Down Expand Up @@ -277,7 +377,7 @@ def main() -> None:
config = {}
log.warning(f"Could not read configuration file {args.config}")

for env_var in ("RF_USERNAME", "RF_PASSWORD", "RFPARSER_EMAIL", "NBIROS_PUB_EXPORT_XML_URL"):
for env_var in ("RF_USERNAME", "RF_PASSWORD", "RFPARSER_EMAIL", "NBIROS_PUB_EXPORT_XML_URL", "PEOPLE_DATA_CSV_URL"):
if env_var in os.environ:
config_key = env_var.lower()
if config_key.startswith("rfparser_"):
Expand Down Expand Up @@ -412,7 +512,7 @@ def main() -> None:
log.error("Skipping publication '%s': %s", doi, e)

if args.xml:
write_xml_output(pubs_with_doi, args.xml)
write_xml_output(pubs_with_doi, args.xml, config.get("people_data_csv_url"))


if __name__ == "__main__":
Expand Down
52 changes: 52 additions & 0 deletions rfparser/util.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
import re
from html.parser import HTMLParser
from io import StringIO
from typing import (
Any,
Iterable,
List,
Optional,
TypeVar,
)

NAME_SPLITTER_PATTERN = re.compile(r"[\s-]+")


class MLStripper(HTMLParser):
"""
HTML parser that finds tags and strips markup.
"""

# Copied from https://stackoverflow.com/a/925630
def __init__(self) -> None:
super().__init__()
Expand Down Expand Up @@ -34,3 +44,45 @@ def str_if_not_None(s: Any) -> Optional[str]:
Cast a variable to str if it's not None.
"""
return str(s) if s is not None else None


T = TypeVar("T")


def unique(l_: Iterable[T]) -> List[T]:
    """
    Return a list with the unique elements of an iterable.

    Like set(), but the order of first appearance is preserved.
    Elements must be hashable.
    """
    seen = set()
    result: List[T] = []
    for item in l_:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result


def is_same_person(family_names1: str, given_names1: str, family_names2: str, given_names2: str) -> bool:
    """
    Check whether two persons' family and given names are similar enough to be
    considered the same person.

    Names are compared case-insensitively, token by token; tokens are split
    on whitespace and hyphens, with trailing dots stripped (e.g. "Jr.").
    A given-name token also matches the other name's initial.  If one name
    has fewer tokens than the other, only the common prefix is compared.
    Family names must be non-empty.
    """
    assert family_names1
    assert family_names2

    def _tokens(names: str) -> List[str]:
        # Drop empty tokens produced by leading/trailing/doubled separators;
        # previously they could raise IndexError (on name1[0]) or cause
        # spurious mismatches, e.g. for names with trailing whitespace.
        return [token for token in (part.rstrip(".").lower() for part in re.split(r"[\s-]+", names)) if token]

    for name1, name2 in zip(_tokens(family_names1), _tokens(family_names2)):
        if name1 != name2:
            return False
    # Treat whitespace-only given names like empty ones.
    given_names1 = given_names1.strip()
    given_names2 = given_names2.strip()
    if not given_names1 and not given_names2:
        return True
    if not given_names1 or not given_names2:
        return False
    for name1, name2 in zip(_tokens(given_names1), _tokens(given_names2)):
        if name1 == name2:
            continue
        # A single-letter token matches the other token's initial.
        if name1[0] == name2:
            continue
        if name1 == name2[0]:
            continue
        return False
    return True
33 changes: 33 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from rfparser.util import (
is_same_person,
unique,
)


def test_is_same_person():
    """Exercise name matching: hyphens, initials, trailing dots, case."""
    assert is_same_person("Doe", "John", "Doe", "John")
    assert not is_same_person("Doe", "John", "Doe", "Mary")
    # Removed an exact duplicate of the following assertion.
    assert is_same_person("Doe", "John-Paul", "Doe", "John Paul")
    assert is_same_person("Doe", "John-Paul", "Doe", "John P")
    assert is_same_person("Doe", "John-Paul", "Doe", "J P")
    assert is_same_person("Doe", "John-Paul", "Doe", "J-P")
    assert is_same_person("Doe", "John-Paul", "Doe", "J")
    assert is_same_person("Doe", "John-Paul", "Doe", "J.")
    assert not is_same_person("Doe", "John", "D", "J")
    assert is_same_person("Doe", "John Jr.", "Doe", "John Jr")
    assert is_same_person("Foo-Bar", "John", "Foo Bar", "John")
    assert not is_same_person("Foo-Bar", "John", "Foo Doe", "John")
    assert not is_same_person("Foo-Bar", "John", "Foo-Bar", "Mary")
    assert is_same_person("Foo-Bar", "John", "Foo", "John")
    assert is_same_person("Foo Bar", "John", "Foo", "John")
    assert is_same_person("McFoo", "John", "Mcfoo", "John")
    assert is_same_person("Doe", "John", "Doe", "john")
    assert is_same_person("Doe", "", "Doe", "")
    assert not is_same_person("Doe", "John", "Doe", "")


def test_unique():
    """unique() deduplicates while preserving first-appearance order."""
    assert unique([]) == []
    assert unique(["a", "b", "a", "c", "b"]) == ["a", "b", "c"]
    assert unique([3, 2, 2, 1, 3]) == [3, 2, 1]
7 changes: 5 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
[tox]
# envlist is the list of environments that are tested when `tox` is run without any option
# hyphens in an environment name are used to delimit factors
envlist = lint, mypy
skipsdist = True
envlist = lint, mypy,test

[testenv]
commands =
lint: ruff .
lint: flake8 .
mypy: mypy rfparser/
test: pytest
deps =
lint: ruff
lint: flake8
lint: flake8-bugbear
mypy: mypy
mypy: types-requests
mypy: types-PyYAML
test: pytest
skip_install =
lint: True

0 comments on commit e70f83d

Please sign in to comment.