diff --git a/bia-ingest/bia_ingest/biostudies/v4/study.py b/bia-ingest/bia_ingest/biostudies/v4/study.py index 46889185..3ea82585 100644 --- a/bia-ingest/bia_ingest/biostudies/v4/study.py +++ b/bia-ingest/bia_ingest/biostudies/v4/study.py @@ -2,6 +2,8 @@ from pydantic import ValidationError import re from typing import List, Any, Dict, Optional +from email_validator import validate_email, EmailNotValidError + from bia_ingest.cli_logging import ( IngestionResult, log_failed_model_creation, @@ -296,6 +298,7 @@ def get_contributor( author_sections = find_sections_recursive(submission.section, ["author"]) contributor_dicts = [] + email_warnings = [] for section in author_sections: attributes = sanitise_affiliation_attribute( @@ -315,20 +318,22 @@ def get_contributor( model_dict["affiliation"] = [ model_dict["affiliation"], ] - if model_dict["contact_email"] == "UNKNOWN": - model_dict["contact_email"] = None - elif model_dict["contact_email"]: - model_dict["contact_email"] = model_dict["contact_email"].strip("<>") + + sanitised_email, email_warnings = sanitise_contributor_email( + model_dict["contact_email"], email_warnings + ) + model_dict["contact_email"] = sanitised_email contributor_dicts.append(model_dict) + get_unique_email_warnings(email_warnings, result_summary, submission.accno) + contributors = dicts_to_api_models( contributor_dicts, semantic_models.Contributor, result_summary[submission.accno] ) return contributors - def sanitise_affiliation_attribute( attribute_list: List[Attribute], affiliation_dict: dict, @@ -353,3 +358,31 @@ def sanitise_affiliation_attribute( else: sanitised_attribute_list.append(attribute) return sanitised_attribute_list + +def sanitise_contributor_email( + email: str | None, + email_warnings: list +): + if email is not None: + try: + email_info = validate_email(email, check_deliverability=False) + email = email_info.normalized + except EmailNotValidError as e: + email_warnings.append(str(e)) + email = None + + return [email, email_warnings] + +def get_unique_email_warnings( + email_warnings: list, + result_summary: dict[str, IngestionResult], + accno: str +): + if email_warnings != []: + unique_warnings = list(set(email_warnings)) + for warning in unique_warnings: + result_summary[accno].__setattr__( + "Warning", + "Skipped invalid author email: " + str(warning) + "\n" + ) + logger.warning(f"Skipped invalid author email: {warning}") diff --git a/bia-ingest/bia_ingest/cli_logging.py b/bia-ingest/bia_ingest/cli_logging.py index b3df87ea..2bb22a3b 100644 --- a/bia-ingest/bia_ingest/cli_logging.py +++ b/bia-ingest/bia_ingest/cli_logging.py @@ -71,6 +71,7 @@ class IngestionResult(CLIResult): ImageCorrelationMethod_ValidationErrorCount: int = Field(default=0) Uncaught_Exception: str = Field(default="") + Warning: str = Field(default="") def tabulate_ingestion_errors( @@ -78,7 +79,7 @@ def tabulate_ingestion_errors( ) -> Table: table = Table() - headers = ["Accession ID", "Processing Mode", "Status", "Error: Count;"] + headers = ["Accession ID", "Processing Mode", "Status", "Error: Count", "Warnings"] if include_object_count: for field in IngestionResult.model_fields: @@ -118,20 +119,36 @@ def tabulate_ingestion_errors( if result.Uncaught_Exception: error_message += f"Uncaught exception: {result.Uncaught_Exception}" - - if error_message == "": + + warning_message = "" + if result.Warning: + warning_message = result.Warning + + if (error_message == "") & (warning_message == ""): status = Text("Success") status.stylize("green") - else: + elif (error_message == "") & (warning_message != ""): + status = Text("Success with warnings") + status.stylize("yellow") + warning_message = Text(warning_message) + warning_message.stylize("yellow") + elif (error_message != "") & (warning_message != ""): + status = Text("Failures with warnings") + status.stylize("red") + warning_message = Text(warning_message) + warning_message.stylize("yellow") + error_message = Text(error_message) + error_message.stylize("red") + elif (error_message != "") & (warning_message == ""): status = Text("Failures") status.stylize("red") error_message = Text(error_message) error_message.stylize("red") - row_info = [accession_id_key, result.ProcessingVersion, status, error_message] + row_info = [accession_id_key, result.ProcessingVersion, status, error_message, warning_message] if include_object_count: - for header in headers[4:]: + for header in headers[5:]: row_info.append(str(result_dict[header + "_CreationCount"])) table.add_row(*row_info)