Skip to content

Commit

Permalink
Email validator (#267)
Browse files Browse the repository at this point in the history
* Added email validator and warning reporting

* Repeated email warnings no longer recorded

* Warnings more intelligible, and in yellow
  • Loading branch information
Chr1st0p43rR authored Dec 11, 2024
1 parent b09b258 commit 089ea7b
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 11 deletions.
43 changes: 38 additions & 5 deletions bia-ingest/bia_ingest/biostudies/v4/study.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from pydantic import ValidationError
import re
from typing import List, Any, Dict, Optional
from email_validator import validate_email, EmailNotValidError

from bia_ingest.cli_logging import (
IngestionResult,
log_failed_model_creation,
Expand Down Expand Up @@ -296,6 +298,7 @@ def get_contributor(
author_sections = find_sections_recursive(submission.section, ["author"])

contributor_dicts = []
email_warnings = []
for section in author_sections:

attributes = sanitise_affiliation_attribute(
Expand All @@ -315,20 +318,22 @@ def get_contributor(
model_dict["affiliation"] = [
model_dict["affiliation"],
]
if model_dict["contact_email"] == "UNKNOWN":
model_dict["contact_email"] = None
elif model_dict["contact_email"]:
model_dict["contact_email"] = model_dict["contact_email"].strip("<>")

sanitised_email, email_warnings = sanitise_contributor_email(
model_dict["contact_email"], email_warnings
)
model_dict["contact_email"] = sanitised_email

contributor_dicts.append(model_dict)

get_unique_email_warnings(email_warnings, result_summary, submission.accno)

contributors = dicts_to_api_models(
contributor_dicts, semantic_models.Contributor, result_summary[submission.accno]
)

return contributors


def sanitise_affiliation_attribute(
attribute_list: List[Attribute],
affiliation_dict: dict,
Expand All @@ -353,3 +358,31 @@ def sanitise_affiliation_attribute(
else:
sanitised_attribute_list.append(attribute)
return sanitised_attribute_list

def sanitise_contributor_email(
email: str | None,
email_warnings: list
):
if email is not None:
try:
email_info = validate_email(email, check_deliverability=False)
email = email_info.normalized
except EmailNotValidError as e:
email_warnings.append(str(e))
email = None

return [email, email_warnings]

def get_unique_email_warnings(
    email_warnings: list,
    result_summary: dict[str, IngestionResult],
    accno: str
):
    """Record every distinct invalid-email warning for a submission and log it.

    Args:
        email_warnings: Validation messages collected while processing the
            submission's authors; may contain duplicates.
        result_summary: Mapping of accession ID to its ingestion result. The
            entry for ``accno`` has its ``Warning`` text extended in place.
        accno: Accession ID of the submission the warnings belong to.

    Returns:
        None. When ``email_warnings`` is empty, ``result_summary`` is not
        accessed at all.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # reported order is deterministic (set() ordering is not).
    unique_warnings = list(dict.fromkeys(email_warnings))
    if not unique_warnings:
        return

    result = result_summary[accno]
    for warning in unique_warnings:
        # Append rather than assign: assigning inside the loop kept only the
        # last warning and discarded anything already recorded.
        result.Warning += "Skipped invalid author email: " + str(warning) + "\n"
        logger.warning(f"Skipped invalid author email: {warning}")
29 changes: 23 additions & 6 deletions bia-ingest/bia_ingest/cli_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,15 @@ class IngestionResult(CLIResult):
ImageCorrelationMethod_ValidationErrorCount: int = Field(default=0)

Uncaught_Exception: str = Field(default="")
Warning: str = Field(default="")


def tabulate_ingestion_errors(
dict_of_results: dict[str, IngestionResult], include_object_count=False
) -> Table:

table = Table()
headers = ["Accession ID", "Processing Mode", "Status", "Error: Count;"]
headers = ["Accession ID", "Processing Mode", "Status", "Error: Count", "Warnings"]

if include_object_count:
for field in IngestionResult.model_fields:
Expand Down Expand Up @@ -118,20 +119,36 @@ def tabulate_ingestion_errors(

if result.Uncaught_Exception:
error_message += f"Uncaught exception: {result.Uncaught_Exception}"

if error_message == "":

warning_message = ""
if result.Warning:
warning_message = result.Warning

if (error_message == "") & (warning_message == ""):
status = Text("Success")
status.stylize("green")
else:
elif (error_message == "") & (warning_message != ""):
status = Text("Success with warnings")
status.stylize("yellow")
warning_message = Text(warning_message)
warning_message.stylize("yellow")
elif (error_message != "") & (warning_message != ""):
status = Text("Failures with warnings")
status.stylize("red")
warning_message = Text(warning_message)
warning_message.stylize("yellow")
error_message = Text(error_message)
error_message.stylize("red")
elif (error_message != "") & (warning_message == ""):
status = Text("Failures")
status.stylize("red")
error_message = Text(error_message)
error_message.stylize("red")

row_info = [accession_id_key, result.ProcessingVersion, status, error_message]
row_info = [accession_id_key, result.ProcessingVersion, status, error_message, warning_message]

if include_object_count:
for header in headers[4:]:
for header in headers[5:]:
row_info.append(str(result_dict[header + "_CreationCount"]))

table.add_row(*row_info)
Expand Down

0 comments on commit 089ea7b

Please sign in to comment.