Skip to content

Commit e70f83d

Browse files
committed
Add ContributorIds to XML output
1 parent 22569ed commit e70f83d

File tree

7 files changed

+201
-5
lines changed

7 files changed

+201
-5
lines changed

.github/workflows/test.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ jobs:
2626
- uses: actions/setup-python@v5
2727
with:
2828
python-version: ${{ matrix.python-version }}
29+
- name: Install tox
30+
run: pip install tox
31+
- name: Run unit tests
32+
run: tox -e test
2933
- name: Install project
3034
run: pip install .
3135
- name: Test

config.yaml.sample

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,5 @@ rf_password: "secret"
66
77
# URL of the export of publications from NBIROS in XML format
88
nbiros_pub_export_xml_url: "https://example.org/path/to/pubs.xml"
9+
# URL of the export of EI people data in CSV format
10+
people_data_csv_url: "https://example.org/path/to/people.csv"

pytest.ini

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[pytest]
2+
pythonpath = .

rfparser/__init__.py

Lines changed: 103 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
#!/usr/bin/env python3
22

33
import argparse
4+
import csv
45
import itertools
56
import logging
67
import os
8+
import re
79
import sys
10+
from dataclasses import dataclass
811
from time import sleep
912
from typing import (
1013
Any,
@@ -19,8 +22,10 @@
1922
from requests import Session
2023

2124
from .util import (
25+
is_same_person,
2226
str_if_not_None,
2327
strip_tags,
28+
unique,
2429
)
2530

2631
if sys.version_info >= (3, 9):
@@ -63,10 +68,19 @@
6368
"green": "Green Open Access",
6469
"hybrid": "Gold Open Access",
6570
}
71+
VALID_ORCID_ID = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")
6672

6773
log = logging.getLogger(__name__)
6874

6975

76+
@dataclass
class Person:
    """
    A person record loaded from the EI people data CSV.
    """

    # Website/EI username used as the contributor id
    username: str
    given_names: str
    family_names: str
    # Canonical https://orcid.org/... URL, or None if not known
    orcid_id: Optional[str]
82+
83+
7084
def RF_login(username: str, password: str) -> Session:
7185
"""
7286
Login to ResearchFish API and return a session storing the auth cookie.
@@ -185,7 +199,49 @@ def get_dois_from_old_xml(nbiros_pub_export_xml_url: Optional[str], pubs_with_do
185199
pubs_with_doi[doi]["nbiros_entries"].append(pub_el)
186200

187201

188-
def write_xml_output(pubs_with_doi: Dict[str, Dict[str, Any]], outfile: str) -> None:
202+
def sanitise_orcid_id(orcid_id: Optional[str]) -> Optional[str]:
    """
    Normalise an ORCID id to its canonical ``https://orcid.org/`` URL form.

    Accepts either a bare id (e.g. "0000-0002-1825-0097") or a full URL form.
    Returns None for a missing/empty id.
    Raises AssertionError if the id is malformed.
    """
    if not orcid_id:
        return None
    # Remove initial part, if it's a URL
    number = orcid_id.split("/")[-1]
    # Normalise Unicode dash variants (e.g. U+2010 hyphen, en/em dashes) to
    # the ASCII hyphen-minus expected by the ORCID format. The previous
    # `number.replace("-", "-")` replaced a hyphen with itself (a no-op).
    number = re.sub(r"[\u2010\u2011\u2012\u2013\u2014]", "-", number)
    assert len(number) == 19, f"Malformed ORCID id {orcid_id}"
    # 4 groups of 4 digits separated by hyphens; last character may be "X"
    # (the ISO 7064 checksum character).
    assert re.match(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$", number), f"Malformed ORCID id {orcid_id}"
    return f"https://orcid.org/{number}"
211+
212+
213+
def get_persons(people_data_csv_url: Optional[str]) -> List[Person]:
    """
    Download the EI people data CSV and return it as a list of Person.

    Returns an empty list when no URL is configured. A person whose given
    and family names both match a *later* entry is dropped (the last
    occurrence wins), since name-based matching would be ambiguous.
    """
    log.info("Started get_persons")
    if not people_data_csv_url:
        log.warning("people_data_csv_url option not specified")
        return []
    response = requests.get(people_data_csv_url)
    response.raise_for_status()
    rows = csv.reader(response.text.splitlines())
    persons = []
    for username, given_names, family_names, orcid_id in rows:
        persons.append(
            Person(
                username=username,
                given_names=given_names,
                family_names=family_names,
                orcid_id=sanitise_orcid_id(orcid_id),
            )
        )
    # Indexes of entries shadowed by a later entry with the same names.
    shadowed = [
        i
        for i, earlier in enumerate(persons)
        if any(
            earlier.given_names == later.given_names and earlier.family_names == later.family_names
            for later in persons[i + 1 :]
        )
    ]
    # Delete from the end so the remaining indexes stay valid.
    for i in reversed(shadowed):
        log.warning("Duplicated person %s will be eliminated", persons[i])
        del persons[i]
    log.info("Total persons: %s", len(persons))
    return persons
238+
239+
240+
def write_xml_output(
241+
pubs_with_doi: Dict[str, Dict[str, Any]],
242+
outfile: str,
243+
people_data_csv_url: Optional[str],
244+
) -> None:
189245
"""
190246
Write the publications to an XML file for the EI website.
191247
"""
@@ -209,7 +265,39 @@ def author_dict_to_contributor(author_dict: Dict[str, Any]) -> str:
209265
raise Exception(f"Unrecognised author_dict format: {author_dict}")
210266
return name
211267

268+
def author_dict_to_username(author_dict: Dict[str, Any]) -> Optional[str]:
    """
    Map a publication author dict to an EI username, or None if no match.

    Matching order: first by ORCID id, then by family/given names among
    persons not already distinguishable via ORCID id. Consortium "name"
    entries are never matched.
    """
    # First try to match the ORCID id
    orcid_id = sanitise_orcid_id(author_dict.get("ORCID"))
    if orcid_id:
        matches = [p.username for p in persons if p.orcid_id == orcid_id]
        if matches:
            if len(matches) > 1:
                log.warning("Multiple usernames for ORCID id %s", orcid_id)
            return matches[0]
    # Fall back to matching the family and given names
    family_names = author_dict.get("family")
    if family_names:
        given_names = author_dict.get("given", "")
        matches = [
            p.username
            for p in persons
            # Skip persons with an ORCID id when the author has one too: a
            # genuine match would already have been found above.
            if not (orcid_id and p.orcid_id)
            and is_same_person(p.family_names, p.given_names, family_names, given_names)
        ]
        if matches:
            if len(matches) > 1:
                log.warning(
                    "Multiple usernames for family names '%s', given names '%s': %s",
                    family_names,
                    given_names,
                    matches,
                )
            return matches[0]
    # No need to try to match "name", which is only used for consortia
    return None
298+
212299
log.info("Started write_xml_output")
300+
persons = get_persons(people_data_csv_url)
213301
root_el = ElementTree.Element("publications")
214302
for doi, pub in reversed(pubs_with_doi.items()):
215303
if pub["metadata_ok"]:
@@ -231,6 +319,18 @@ def author_dict_to_contributor(author_dict: Dict[str, Any]) -> str:
231319
ElementTree.SubElement(publication_el, "SeriesTitle").text = pub["series-title"]
232320
ElementTree.SubElement(publication_el, "JournalVolume").text = pub["volume"]
233321
ElementTree.SubElement(publication_el, "JournalPages").text = pub["pages"]
322+
try:
323+
contributor_ids_list = [author_dict_to_username(author_dict) for author_dict in pub["authors"]]
324+
for nbiros_entry in pub.get("nbiros_entries", []):
325+
ContributorIds_el = nbiros_entry.find("ContributorIds")
326+
assert ContributorIds_el is not None
327+
ContributorIds_text = ContributorIds_el.text or ""
328+
contributor_ids_list.extend(c.strip() for c in ContributorIds_text.split(","))
329+
contributor_ids = unique(filter(None, contributor_ids_list))
330+
except Exception:
331+
log.error("Error while generating ContributorIds for DOI %s", doi)
332+
raise
333+
ElementTree.SubElement(publication_el, "ContributorIds").text = ", ".join(contributor_ids)
234334
ElementTree.SubElement(publication_el, "ContributorList").text = ", ".join(
235335
author_dict_to_contributor(author_dict) for author_dict in pub["authors"]
236336
)
@@ -277,7 +377,7 @@ def main() -> None:
277377
config = {}
278378
log.warning(f"Could not read configuration file {args.config}")
279379

280-
for env_var in ("RF_USERNAME", "RF_PASSWORD", "RFPARSER_EMAIL", "NBIROS_PUB_EXPORT_XML_URL"):
380+
for env_var in ("RF_USERNAME", "RF_PASSWORD", "RFPARSER_EMAIL", "NBIROS_PUB_EXPORT_XML_URL", "PEOPLE_DATA_CSV_URL"):
281381
if env_var in os.environ:
282382
config_key = env_var.lower()
283383
if config_key.startswith("rfparser_"):
@@ -412,7 +512,7 @@ def main() -> None:
412512
log.error("Skipping publication '%s': %s", doi, e)
413513

414514
if args.xml:
415-
write_xml_output(pubs_with_doi, args.xml)
515+
write_xml_output(pubs_with_doi, args.xml, config.get("people_data_csv_url"))
416516

417517

418518
if __name__ == "__main__":

rfparser/util.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,22 @@
1+
import re
12
from html.parser import HTMLParser
23
from io import StringIO
34
from typing import (
45
Any,
6+
Iterable,
7+
List,
58
Optional,
9+
TypeVar,
610
)
711

12+
NAME_SPLITTER_PATTERN = re.compile(r"[\s-]+")
13+
814

915
class MLStripper(HTMLParser):
16+
"""
17+
HTML parser that finds tags and strips markup.
18+
"""
19+
1020
# Copied from https://stackoverflow.com/a/925630
1121
def __init__(self) -> None:
1222
super().__init__()
@@ -34,3 +44,45 @@ def str_if_not_None(s: Any) -> Optional[str]:
3444
Cast a variable to str if it's not None.
3545
"""
3646
return str(s) if s is not None else None
47+
48+
49+
T = TypeVar("T")


def unique(l_: Iterable[T]) -> List[T]:
    """
    Return the unique elements of an iterable as a list.

    Unlike set(), the first-seen order of the elements is preserved.
    Elements must be hashable.
    """
    result: List[T] = []
    seen = set()
    for item in l_:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result
60+
61+
62+
def is_same_person(family_names1: str, given_names1: str, family_names2: str, given_names2: str) -> bool:
    """
    Check whether two persons' family and given names are similar enough to
    be considered the same person.

    Names are split on whitespace and hyphens, lower-cased (with trailing
    dots removed) and compared piecewise; a given-name part may also match
    the other's initial. Comparison stops at the shorter name list, so e.g.
    "Foo-Bar" and "Foo" are considered the same family name.
    """
    assert family_names1
    assert family_names2

    def _parts(names: str) -> List[str]:
        # Split on whitespace/hyphen runs, drop trailing dots, normalise case.
        return [part.rstrip(".").lower() for part in re.split(r"[\s-]+", names)]

    for part1, part2 in zip(_parts(family_names1), _parts(family_names2)):
        if part1 != part2:
            return False
    # Family names agree; now decide on the given names.
    if not given_names1 and not given_names2:
        return True
    if not given_names1 or not given_names2:
        return False
    for part1, part2 in zip(_parts(given_names1), _parts(given_names2)):
        # Accept exact matches and initial-vs-full-name matches (either way).
        if part1 != part2 and part1[0] != part2 and part1 != part2[0]:
            return False
    return True

tests/test_utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from rfparser.util import (
2+
is_same_person,
3+
unique,
4+
)
5+
6+
7+
def test_is_same_person():
    # (family1, given1, family2, given2) tuples expected to match
    matching = [
        ("Doe", "John", "Doe", "John"),
        ("Doe", "John-Paul", "Doe", "John Paul"),
        ("Doe", "John-Paul", "Doe", "John P"),
        ("Doe", "John-Paul", "Doe", "J P"),
        ("Doe", "John-Paul", "Doe", "J-P"),
        ("Doe", "John-Paul", "Doe", "J"),
        ("Doe", "John-Paul", "Doe", "J."),
        ("Doe", "John Jr.", "Doe", "John Jr"),
        ("Foo-Bar", "John", "Foo Bar", "John"),
        ("Foo-Bar", "John", "Foo", "John"),
        ("Foo Bar", "John", "Foo", "John"),
        ("McFoo", "John", "Mcfoo", "John"),
        ("Doe", "John", "Doe", "john"),
        ("Doe", "", "Doe", ""),
    ]
    # Tuples expected NOT to match
    non_matching = [
        ("Doe", "John", "Doe", "Mary"),
        ("Doe", "John", "D", "J"),
        ("Foo-Bar", "John", "Foo Doe", "John"),
        ("Foo-Bar", "John", "Foo-Bar", "Mary"),
        ("Doe", "John", "Doe", ""),
    ]
    for args in matching:
        assert is_same_person(*args), args
    for args in non_matching:
        assert not is_same_person(*args), args
28+
29+
30+
def test_unique():
    # (input, expected) pairs: order of first occurrence must be preserved
    cases = [
        ([], []),
        (["a", "b", "a", "c", "b"], ["a", "b", "c"]),
        ([3, 2, 2, 1, 3], [3, 2, 1]),
    ]
    for data, expected in cases:
        assert unique(data) == expected, data

tox.ini

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,21 @@
11
[tox]
22
# envlist is the list of environments that are tested when `tox` is run without any option
33
# hyphens in an environment name are used to delimit factors
4-
envlist = lint, mypy
5-
skipsdist = True
4+
envlist = lint, mypy, test
65

76
[testenv]
87
commands =
98
lint: ruff .
109
lint: flake8 .
1110
mypy: mypy rfparser/
11+
test: pytest
1212
deps =
1313
lint: ruff
1414
lint: flake8
1515
lint: flake8-bugbear
1616
mypy: mypy
1717
mypy: types-requests
1818
mypy: types-PyYAML
19+
test: pytest
20+
skip_install =
21+
lint: True

0 commit comments

Comments
 (0)