From 3295eb0b930a89f3077abe813983c9881570216e Mon Sep 17 00:00:00 2001 From: Leo-gail Date: Tue, 13 Feb 2024 14:45:34 +0100 Subject: [PATCH] feat(biblio-ref): now work on references without dois --- services/biblio-ref/Dockerfile | 2 +- services/biblio-ref/v1/validate.py | 160 +++++++++++++++++++++++++++-- 2 files changed, 151 insertions(+), 11 deletions(-) diff --git a/services/biblio-ref/Dockerfile b/services/biblio-ref/Dockerfile index 9f4cec48..77fda25f 100644 --- a/services/biblio-ref/Dockerfile +++ b/services/biblio-ref/Dockerfile @@ -19,7 +19,7 @@ FROM cnrsinist/ezs-python-server:py3.9-no16-1.0.8 USER root # Install all python dependencies -RUN pip install pandas==2.1.4 requests_ratelimiter==0.4.2 +RUN pip install pandas==2.1.4 requests_ratelimiter==0.4.2 thefuzz==0.22.1 USER daemon WORKDIR /app/public diff --git a/services/biblio-ref/v1/validate.py b/services/biblio-ref/v1/validate.py index 528fdfb5..045175d3 100755 --- a/services/biblio-ref/v1/validate.py +++ b/services/biblio-ref/v1/validate.py @@ -5,6 +5,9 @@ import sys import json import pandas as pd +import unicodedata +from thefuzz import fuzz + mail_adress = "leo.gaillard@cnrs.fr" session = LimiterSession(per_second=5) @@ -14,9 +17,50 @@ retracted_doi = dumps_pps["DOI"].tolist() +# normalize text +def remove_accents(text): + """ + Remove accents from the input text and return the text with no accents. + + Parameters: + text (str): The input text with accents. + + Returns: + str: The input text with accents removed. + """ + if text == "" or type(text)!= str: + return "" + normalized_text = unicodedata.normalize("NFD", text) + text_with_no_accent = re.sub("[\u0300-\u036f]", "", normalized_text) + return text_with_no_accent + +def uniformize(text): + """ + Function to uniformize the given text by removing accents, punctuation, and converting to lowercase. + + Args: + text (str): a string input text to be uniformized + + Returns: + str: a string with uniformized text + """ + text = remove_accents(text) #if text is not a string, it's return "" + + # remove punctuation except " ' " + text = ''.join(char if char.isalpha() else ' ' for char in text) + + return ' '.join(text.lower().split()) + +# DOI funtions def find_doi(text): """ - return the first doi found in a text (input) + Function to find a DOI (Digital Object Identifier) in the given text. + + Args: + text: the input text in which to search for the DOI + + Returns + str: the found DOI, or an empty string if not found """ doi_regex = r"\b10.\d{4,}\/[^\s]+\b" doi = re.search(doi_regex, text) @@ -31,10 +75,14 @@ def find_doi(text): def verify_doi(doi, mail=mail_adress): """ - Check with crossref API if DOI is correct. - Do not use this function without function "find_doi". - - Returns HTTP code + Verify a Digital Object Identifier (DOI) by making a GET request to the Crossref API. + + Args: + doi (str): The DOI to be verified. + mail (str): The email address to be included in the API request. Defaults to the value of mail_address. + + Returns: + int: The HTTP status code of the API response, or 503 if an exception occurs. """ url = f"https://api.crossref.org/works/{doi}/agency?mailto={mail}" @@ -46,6 +94,97 @@ def verify_doi(doi, mail=mail_adress): return 503 # if there is an unexpected error from crossref +# Functions for ref_biblio +def get_title_authors_doi(message): + """ + Get the title, first author's given name, first author's family name, and DOI from the input message. + + Args: + message (dict): The input message containing information about the publication. + + Returns: + dict: A dictionary containing the title, first author's given name, first author's family name, and DOI. + """ + title = message['title'][0] if 'title' in message else "" + doi = message['DOI'] if 'DOI' in message else "" + try: + first_author_name = message['author'][0]['family'] + except: + first_author_name = "" + try: + first_author_given = message['author'][0]['given'] + except: + first_author_given = "" + return {'title': title, 'first_author_given': first_author_given, 'first_author_name': first_author_name, 'doi': doi} + +def match_title(title, ref_biblio): + """ + Match the title of the publication with the title of the biblio reference. + + Args: + title (str): The title of the publication. + ref_biblio (str): The biblio reference. + + Returns: + bool: True if the title of the publication matches the title of the biblio reference, False otherwise. + """ + title = uniformize(title) + ref_biblio = uniformize(ref_biblio) + + distance = fuzz.partial_ratio(title, ref_biblio) + + #thereshold here + return distance > 90 + +def compare_pubinfo_refbiblio(item,ref_biblio): + """ + Compare informations of one of the crossref publis with the biblio + + Args: + item (json): title, authors name and doi from a crossref publi + ref_biblio (str): the whole biblio reference + + Returns: + tuple (bool, str): True if it's match and whith the doi + """ + # Check first author + if item['first_author_name'] not in ref_biblio: + return False, "" + if not match_title(item['title'], ref_biblio): + return False, "" + return True, item['doi'] + +def verify_biblio(ref_biblio, mail=mail_adress): + """ + check with crossref api if a biblio ref is correct. + + Args : + ref_biblio :a biblio ref + mail : a mail adress + + Returns : + a confidence score about the existence + doi of the biblio ref + """ + url = f'https://api.crossref.org/works?query.bibliographic="{ref_biblio}"&mailto={mail}&rows=5' + try: + response = session.get(url) + data = response.json() + items = data["message"]["items"] #to check + for item in items: + item_info = get_title_authors_doi(item) + # If no authors name in Crossref, return "not_found" + if item_info['first_author_name'] == "" or item_info['title']=="": + continue + # compare pub_info with ref_biblio + match_item, doi = compare_pubinfo_refbiblio(item_info,ref_biblio) + if match_item: + return "found",doi + + return "not_found","" + except Exception: + return "error_service","" + + for line in sys.stdin: data = json.loads(line) ref_biblio = data["value"] @@ -69,7 +208,9 @@ def verify_doi(doi, mail=mail_adress): sys.stdout.write("\n") elif crossref_status_code==404: # If request return code 404 - data["value"] = {"doi":"","status": "not_found"} + status,doi = verify_biblio(ref_biblio) + data["value"] = {"doi":doi, "status": status} + json.dump(data, sys.stdout) sys.stdout.write("\n") @@ -80,9 +221,8 @@ def verify_doi(doi, mail=mail_adress): else: - # C'est dans cette partie que l'on traitera la partie 2 du WS - # data["value"] = future_function_to_check(ref_biblio) - - data["value"] = {"doi":"","status": "not_found"} + status,doi = verify_biblio(ref_biblio) + data["value"] = {"doi":doi, "status": status} + json.dump(data, sys.stdout) sys.stdout.write("\n")