1
1
#!/usr/bin/env python3
2
2
3
3
import argparse
4
+ import csv
4
5
import itertools
5
6
import logging
6
7
import os
8
+ import re
7
9
import sys
10
+ from dataclasses import dataclass
8
11
from time import sleep
9
12
from typing import (
10
13
Any ,
19
22
from requests import Session
20
23
21
24
from .util import (
25
+ is_same_person ,
22
26
str_if_not_None ,
23
27
strip_tags ,
28
+ unique ,
24
29
)
25
30
26
31
if sys .version_info >= (3 , 9 ):
63
68
"green" : "Green Open Access" ,
64
69
"hybrid" : "Gold Open Access" ,
65
70
}
71
# A bare ORCID id: four groups of four digits, where the final character may be
# the checksum letter "X" (e.g. "0000-0002-1825-0097"). Fully anchored, so a
# match also fixes the string length to 19 characters.
VALID_ORCID_ID = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")
66
72
67
73
log = logging .getLogger (__name__ )
68
74
69
75
76
@dataclass
class Person:
    """A member of staff, as parsed from the people-data CSV export."""

    # Institutional username; used to build the ContributorIds XML element.
    username: str
    given_names: str
    family_names: str
    # Canonical ORCID URL ("https://orcid.org/XXXX-...") as produced by
    # sanitise_orcid_id(), or None when the person has no known ORCID id.
    orcid_id: Optional[str]
82
+
83
+
70
84
def RF_login (username : str , password : str ) -> Session :
71
85
"""
72
86
Login to ResearchFish API and return a session storing the auth cookie.
@@ -185,7 +199,49 @@ def get_dois_from_old_xml(nbiros_pub_export_xml_url: Optional[str], pubs_with_do
185
199
pubs_with_doi [doi ]["nbiros_entries" ].append (pub_el )
186
200
187
201
188
- def write_xml_output (pubs_with_doi : Dict [str , Dict [str , Any ]], outfile : str ) -> None :
202
def sanitise_orcid_id(orcid_id: Optional[str]) -> Optional[str]:
    """
    Normalise an ORCID id (bare or URL form) to its canonical URL form.

    :param orcid_id: an ORCID id, either bare ("0000-0002-1825-0097") or as a
        URL ("https://orcid.org/0000-0002-1825-0097"); may be None or empty.
    :return: the canonical "https://orcid.org/..." URL, or None if no id was
        given.
    :raises AssertionError: if the id is malformed. NOTE: asserts are stripped
        under ``python -O``; kept here for backward compatibility of the
        exception type seen by callers.
    """
    if not orcid_id:
        return None
    # Keep only the final path component, in case the id was given as a URL.
    number = orcid_id.split("/")[-1]
    # NOTE(review): a previous `number.replace("-", "-")` was a no-op (it
    # replaced a hyphen with the same hyphen) and has been removed; if it was
    # meant to normalise Unicode dashes, re-add it with the correct source
    # character. The anchored pattern below also fixes the length to 19, so a
    # separate length assertion is redundant (the failure message is the same).
    assert VALID_ORCID_ID.match(number), f"Malformed ORCID id {orcid_id}"
    return f"https://orcid.org/{number}"
211
+
212
+
213
def get_persons(people_data_csv_url: Optional[str]) -> List[Person]:
    """
    Download the people-data CSV and return the parsed list of Person records.

    Each CSV row must have exactly four columns:
    (username, given_names, family_names, orcid_id).
    When several rows share the same given and family names, only the last
    occurrence is kept, since name-based matching of such persons would be
    ambiguous; each eliminated duplicate is logged.

    :param people_data_csv_url: URL of the CSV export; if falsy, an empty
        list is returned (with a warning).
    :raises requests.HTTPError: if the download fails.
    """
    log.info("Started get_persons")
    if not people_data_csv_url:
        log.warning("people_data_csv_url option not specified")
        return []
    # Bound the request so a stalled server cannot hang the whole run.
    r = requests.get(people_data_csv_url, timeout=60)
    r.raise_for_status()
    reader = csv.reader(r.text.splitlines())
    persons = [
        Person(
            username=username,
            given_names=given_names,
            family_names=family_names,
            orcid_id=sanitise_orcid_id(orcid_id),
        )
        for (username, given_names, family_names, orcid_id) in reader
    ]
    # Eliminate earlier duplicates of any (given_names, family_names) pair,
    # keeping the last occurrence. A single reverse pass with a seen-set is
    # O(n), replacing the previous O(n^2) pairwise scan; warnings are still
    # emitted in descending-index order, as before.
    seen_names = set()
    kept = []
    for person in reversed(persons):
        names = (person.given_names, person.family_names)
        if names in seen_names:
            log.warning("Duplicated person %s will be eliminated", person)
            continue
        seen_names.add(names)
        kept.append(person)
    persons = kept[::-1]
    log.info("Total persons: %s", len(persons))
    return persons
238
+
239
+
240
+ def write_xml_output (
241
+ pubs_with_doi : Dict [str , Dict [str , Any ]],
242
+ outfile : str ,
243
+ people_data_csv_url : Optional [str ],
244
+ ) -> None :
189
245
"""
190
246
Write the publications to an XML file for the EI website.
191
247
"""
@@ -209,7 +265,39 @@ def author_dict_to_contributor(author_dict: Dict[str, Any]) -> str:
209
265
raise Exception (f"Unrecognised author_dict format: { author_dict } " )
210
266
return name
211
267
268
+ def author_dict_to_username (author_dict : Dict [str , Any ]) -> Optional [str ]:
269
+ # First try to match the ORCID id
270
+ orcid_id = sanitise_orcid_id (author_dict .get ("ORCID" ))
271
+ if orcid_id :
272
+ usernames = [person .username for person in persons if person .orcid_id == orcid_id ]
273
+ if usernames :
274
+ if len (usernames ) > 1 :
275
+ log .warning ("Multiple usernames for ORCID id %s" , orcid_id )
276
+ return usernames [0 ]
277
+ # Try to match the family and given names
278
+ family_names = author_dict .get ("family" )
279
+ if family_names :
280
+ given_names = author_dict .get ("given" , "" )
281
+ usernames = [
282
+ person .username
283
+ for person in persons
284
+ if not (orcid_id and person .orcid_id )
285
+ and is_same_person (person .family_names , person .given_names , family_names , given_names )
286
+ ]
287
+ if usernames :
288
+ if len (usernames ) > 1 :
289
+ log .warning (
290
+ "Multiple usernames for family names '%s', given names '%s': %s" ,
291
+ family_names ,
292
+ given_names ,
293
+ usernames ,
294
+ )
295
+ return usernames [0 ]
296
+ # No need to try to match "name", which is only used for consortia
297
+ return None
298
+
212
299
log .info ("Started write_xml_output" )
300
+ persons = get_persons (people_data_csv_url )
213
301
root_el = ElementTree .Element ("publications" )
214
302
for doi , pub in reversed (pubs_with_doi .items ()):
215
303
if pub ["metadata_ok" ]:
@@ -231,6 +319,18 @@ def author_dict_to_contributor(author_dict: Dict[str, Any]) -> str:
231
319
ElementTree .SubElement (publication_el , "SeriesTitle" ).text = pub ["series-title" ]
232
320
ElementTree .SubElement (publication_el , "JournalVolume" ).text = pub ["volume" ]
233
321
ElementTree .SubElement (publication_el , "JournalPages" ).text = pub ["pages" ]
322
+ try :
323
+ contributor_ids_list = [author_dict_to_username (author_dict ) for author_dict in pub ["authors" ]]
324
+ for nbiros_entry in pub .get ("nbiros_entries" , []):
325
+ ContributorIds_el = nbiros_entry .find ("ContributorIds" )
326
+ assert ContributorIds_el is not None
327
+ ContributorIds_text = ContributorIds_el .text or ""
328
+ contributor_ids_list .extend (c .strip () for c in ContributorIds_text .split ("," ))
329
+ contributor_ids = unique (filter (None , contributor_ids_list ))
330
+ except Exception :
331
+ log .error ("Error while generating ContributorIds for DOI %s" , doi )
332
+ raise
333
+ ElementTree .SubElement (publication_el , "ContributorIds" ).text = ", " .join (contributor_ids )
234
334
ElementTree .SubElement (publication_el , "ContributorList" ).text = ", " .join (
235
335
author_dict_to_contributor (author_dict ) for author_dict in pub ["authors" ]
236
336
)
@@ -277,7 +377,7 @@ def main() -> None:
277
377
config = {}
278
378
log .warning (f"Could not read configuration file { args .config } " )
279
379
280
- for env_var in ("RF_USERNAME" , "RF_PASSWORD" , "RFPARSER_EMAIL" , "NBIROS_PUB_EXPORT_XML_URL" ):
380
+ for env_var in ("RF_USERNAME" , "RF_PASSWORD" , "RFPARSER_EMAIL" , "NBIROS_PUB_EXPORT_XML_URL" , "PEOPLE_DATA_CSV_URL" ):
281
381
if env_var in os .environ :
282
382
config_key = env_var .lower ()
283
383
if config_key .startswith ("rfparser_" ):
@@ -412,7 +512,7 @@ def main() -> None:
412
512
log .error ("Skipping publication '%s': %s" , doi , e )
413
513
414
514
if args .xml :
415
- write_xml_output (pubs_with_doi , args .xml )
515
+ write_xml_output (pubs_with_doi , args .xml , config . get ( "people_data_csv_url" ) )
416
516
417
517
418
518
if __name__ == "__main__" :
0 commit comments