diff --git a/bia-ingest-shared-models/README.md b/bia-ingest-shared-models/README.md new file mode 100644 index 00000000..5f323917 --- /dev/null +++ b/bia-ingest-shared-models/README.md @@ -0,0 +1,5 @@ +## Usage +Once you've installed the project using poetry, assuming you are in this directory: +```sh +$ poetry run biaingest ingest S-BIAD325 +``` diff --git a/bia-ingest-shared-models/bia_ingest_sm/biostudies.py b/bia-ingest-shared-models/bia_ingest_sm/biostudies.py new file mode 100644 index 00000000..5e61f911 --- /dev/null +++ b/bia-ingest-shared-models/bia_ingest_sm/biostudies.py @@ -0,0 +1,349 @@ +import json +import logging +import pathlib +import datetime +from typing import List, Union, Dict, Optional, Any +from copy import deepcopy + +import requests +from pydantic import BaseModel + + +logger = logging.getLogger(__name__) + + +STUDY_URL_TEMPLATE = "https://www.ebi.ac.uk/biostudies/api/v1/studies/{accession}" +FLIST_URI_TEMPLATE = ( + "https://www.ebi.ac.uk/biostudies/files/{accession_id}/{flist_fname}" +) +FILE_URI_TEMPLATE = "https://www.ebi.ac.uk/biostudies/files/{accession_id}/{relpath}" + + +class AttributeDetail(BaseModel): + name: str + value: str + + +class Attribute(BaseModel): + name: str + value: Optional[str] + reference: bool = False + nmqual: List[AttributeDetail] = [] + valqual: List[AttributeDetail] = [] + + def as_tsv(self) -> str: + if self.reference: + tsv_rep = f"<{self.name}>\t{self.value}\n" + else: + tsv_rep = f"{self.name}\t{self.value}\n" + + return tsv_rep + + +# File List + + +class File(BaseModel): + path: pathlib.Path + size: int + type: str + attributes: List[Attribute] = [] + + +class Link(BaseModel): + url: str + attributes: List[Attribute] = [] + + def as_tsv(self) -> str: + tsv_rep = "\n" + tsv_rep += f"Link\t{self.url}\n" + tsv_rep += "".join([attr.as_tsv() for attr in self.attributes]) + + return tsv_rep + + +class Section(BaseModel): + type: str + accno: Optional[str] = "" + attributes: List[Attribute] = [] + 
subsections: List[Union["Section", List["Section"]]] = [] + links: List[Link] = [] + files: List[Union[File, List[File]]] = [] + + def as_tsv(self, parent_accno: Optional[str] = None) -> str: + tsv_rep = "\n" + + accno_str = self.accno if self.accno else "" + if parent_accno: + tsv_rep += f"{self.type}\t{accno_str}\t{parent_accno}" + else: + if self.accno: + tsv_rep += f"{self.type}\t{accno_str}" + else: + tsv_rep += f"{self.type}" + + tsv_rep += "\n" + + tsv_rep += "".join([attr.as_tsv() for attr in self.attributes]) + tsv_rep += "".join([link.as_tsv() for link in self.links]) + tsv_rep += "".join([section.as_tsv(self.accno) for section in self.subsections]) + + return tsv_rep + + +class Submission(BaseModel): + accno: Optional[str] + section: Section + attributes: List[Attribute] + + def as_tsv(self) -> str: + tsv_rep = f"Submission" + if self.accno: + tsv_rep += f"\t{self.accno}" + tsv_rep += "\n" + + tsv_rep += "".join([attr.as_tsv() for attr in self.attributes]) + tsv_rep += self.section.as_tsv() + + return tsv_rep + + +# API search classes + + +class StudyResult(BaseModel): + accession: str + title: str + author: str + links: int + files: int + release_date: datetime.date + views: int + isPublic: bool + + +class QueryResult(BaseModel): + page: int + pageSize: int + totalHits: int + isTotalHitsExact: bool + sortBy: str + sortOrder: str + hits: List[StudyResult] + + +# API functions + + +def load_submission(accession_id: str) -> Submission: + + url = STUDY_URL_TEMPLATE.format(accession=accession_id) + logger.info(f"Fetching submission from {url}") + headers = { + "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" + } + r = requests.get(url, headers=headers) + + assert r.status_code == 200 + + submission = Submission.parse_raw(r.content) + + return submission + + +def attributes_to_dict( + attributes: List[Attribute], +) -> Dict[str, Optional[str | List[str]]]: + + attr_dict = {} + for attr in 
attributes: + if attr.name in attr_dict: + if type(attr_dict[attr.name]) is list: + attr_dict[attr.name].append(attr.value) + else: + attr_dict[attr.name] = [ + attr_dict[attr.name], + ] + attr_dict[attr.name].append(attr.value) + else: + attr_dict[attr.name] = attr.value + return attr_dict + + +def find_file_lists_in_section( + section: Section, flists: List[Dict[str, Union[str, None, List[str]]]] +) -> List[Dict[str, Union[str, None, List[str]]]]: + """Find all of the File Lists in a Section, recursively descending through + the subsections. + + Return a list of dictionaries. + """ + + attr_dict = attributes_to_dict(section.attributes) + + if "File List" in attr_dict: + flists.append(attr_dict) + # Get details of any associations in this subsection + attr_dict["associations"] = [] + for subsection in section.subsections: + if subsection.type == "Associations": + attr_dict["associations"].append( + attributes_to_dict(subsection.attributes) + ) + + for subsection in section.subsections: + subsection_type = type(subsection) + if subsection_type == Section: + find_file_lists_in_section(subsection, flists) + else: + logger.warning( + f"Not processing subsection as type is {subsection_type}, not 'Section'. 
Contents={subsection}" + ) + + return flists + + +def find_file_lists_in_submission( + submission: Submission, +) -> List[Dict[str, Union[str, None, List[str]]]]: + + return find_file_lists_in_section(submission.section, []) + + +# KB 14/06/2024 commented out as I need to replace parse_raw_as with +# TypeAdapter for pydantic >=2 +# def flist_from_flist_fname( +# accession_id: str, flist_fname: str, extra_attribute: Union[List[str], str] = None +# ) -> List[File]: +# +# flist_url = FLIST_URI_TEMPLATE.format( +# accession_id=accession_id, flist_fname=flist_fname +# ) +# +# r = requests.get(flist_url) +# logger.info(f"Fetching file list from {flist_url}") +# assert r.status_code == 200 +# +# # fl = parse_raw_as(List[File], r.content) +# # KB 18/08/2023 - Hack to fix error due to null values in attributes +# # Remove attribute entries with {"value": "null"} +# dict_content = json.loads(r.content) +# dict_filtered_content = filter_filelist_content(dict_content) +# filtered_content = bytes(json.dumps(dict_filtered_content), "utf-8") +# fl = parse_raw_as(List[File], filtered_content) +# +# if extra_attribute: +# if type(extra_attribute) is not list: +# extra_attribute = [ +# extra_attribute, +# ] +# for file in fl: +# file.attributes.extend(extra_attribute) +# +# return fl + + +def file_uri( + accession_id: str, file: File, file_uri_template: Optional[str] = FILE_URI_TEMPLATE +) -> str: + """For a given accession and file object, return the HTTP URI where we can expect + to be able to access that file.""" + + return file_uri_template.format(accession_id=accession_id, relpath=file.path) + + +def get_file_uri_template_for_accession(accession_id: str) -> str: + """Given an accession identifier, use the BioStudies API to generate a + template which can be populated with the value of relpath to produce + the URI for a given file.""" + + request_uri = f"https://www.ebi.ac.uk/biostudies/api/v1/studies/{accession_id}/info" + r = requests.get(request_uri) + raw_obj = 
json.loads(r.content) + # Strip the initial ftp from the ftp link, replace by http and add /Files + accession_base_uri = "https" + raw_obj["ftpLink"][3:] + "/Files" + + file_uri_template = accession_base_uri + "/{relpath}" + + return file_uri_template + + +def find_files_in_submission_file_lists(submission: Submission) -> List[File]: + + file_list_dicts = find_file_lists_in_submission(submission) + file_lists = [] + for file_list_dict in file_list_dicts: + fname = file_list_dict["File List"] + extra_attribute = [] + if "Title" in file_list_dict: + extra_attribute.append( + Attribute(name="Title", value=file_list_dict["Title"]) + ) + if "associations" in file_list_dict: + extra_attribute.append( + Attribute( + name="associations", value=f"{file_list_dict['associations']}" + ) + ) + file_list = flist_from_flist_fname(submission.accno, fname, extra_attribute) + file_lists.append(file_list) + + return sum(file_lists, []) + + +def find_files_in_submission(submission: Submission) -> List[File]: + """Find all of the files in a submission, both attached directly to + the submission and as file lists.""" + + all_files = find_files_in_submission_file_lists(submission) + + def descend_and_find_files(section, files_list=[]): + + section_type = type(section) + if section_type == Section: + for file in section.files: + if isinstance(file, List): + files_list += file + else: + files_list.append(file) + + for subsection in section.subsections: + descend_and_find_files(subsection, files_list) + else: + logger.warning( + f"Not processing subsection as type is {section_type}, not 'Section'. 
Contents={section}" + ) + + descend_and_find_files(submission.section, all_files) + + return all_files + + +def get_with_case_insensitive_key(dictionary: Dict[str, Any], key: str) -> Any: + keys = [k.lower() for k in dictionary.keys()] + temp_key = key.lower() + if temp_key in keys: + key_index = keys.index(temp_key) + temp_key = list(dictionary.keys())[key_index] + return dictionary[temp_key] + else: + raise KeyError(f"{key} not in {dictionary.keys()}") + + +def filter_filelist_content(dictionary: Dict[str, Any]) -> Dict[str, Any]: + """Remove attributes in filelist with null or empty values + + """ + dict_copy = deepcopy(dictionary) + for d in dict_copy: + if "attributes" in d: + d["attributes"] = [ + i + for i in filter( + lambda x: x != {"value": "null"} and x != {}, d["attributes"] + ) + ] + if len(d["attributes"]) == 0: + d.pop("attributes") + + return dict_copy diff --git a/bia-ingest-shared-models/bia_ingest_sm/cli.py b/bia-ingest-shared-models/bia_ingest_sm/cli.py new file mode 100644 index 00000000..39a75360 --- /dev/null +++ b/bia-ingest-shared-models/bia_ingest_sm/cli.py @@ -0,0 +1,23 @@ +import typer +from typing import Optional +from typing_extensions import Annotated +from bia_ingest_sm.biostudies import load_submission +from bia_ingest_sm.conversion import get_study + +app = typer.Typer() + + +@app.command(help="Ingest from biostudies and echo json of bia_data_model.Study") +def ingest(accession_id: Annotated[str, typer.Argument()],) -> None: + submission = load_submission(accession_id) + study = get_study(submission) + typer.echo(study.model_dump_json(indent=2)) + + +@app.callback() +def main() -> None: + return + + +if __name__ == "__main__": + app() diff --git a/bia-ingest-shared-models/bia_ingest_sm/conversion.py b/bia-ingest-shared-models/bia_ingest_sm/conversion.py new file mode 100644 index 00000000..0b8d9f83 --- /dev/null +++ b/bia-ingest-shared-models/bia_ingest_sm/conversion.py @@ -0,0 +1,304 @@ +import re +import hashlib +import uuid 
+from typing import List, Any, Dict, Optional, Tuple +from .biostudies import Submission, attributes_to_dict, Section, Attribute +from src.bia_models import bia_data_model, semantic_models + + +def get_study(submission: Submission) -> bia_data_model.Study: + """Return an API study model populated from the submission + + """ + + submission_attributes = attributes_to_dict(submission.attributes) + contributors = get_contributor(submission) + grants = get_grant(submission) + + study_attributes = attributes_to_dict(submission.section.attributes) + + study_title = study_title_from_submission(submission) + if "Title" in study_attributes: + study_attributes.pop("Title") + + licence = get_licence(study_attributes) + if "License" in study_attributes: + study_attributes.pop("License") + + study_dict = { + "accession_id": submission.accno, + # TODO: Do more robust search for title - sometimes it is in + # actual submission - see old ingest code + "title": study_title, + "description": study_attributes.pop("Description", None), + "release_date": submission_attributes.pop("ReleaseDate"), + "licence": licence, + "acknowledgement": study_attributes.pop("Acknowledgements", None), + "funding_statement": study_attributes.pop("Funding statement", None), + "keyword": study_attributes.pop("Keywords", []), + "author": [c.model_dump() for c in contributors], + "grant": [g.model_dump() for g in grants], + "attribute": study_attributes, + "experimental_imaging_component": [], + "annotation_component": [], + } + study_uuid = dict_to_uuid(study_dict, ["accession_id",]) + study_dict["uuid"] = study_uuid + study = bia_data_model.Study.model_validate(study_dict) + + return study + + +def study_title_from_submission(submission: Submission) -> str: + + submission_attr_dict = attributes_to_dict(submission.attributes) + study_section_attr_dict = attributes_to_dict(submission.section.attributes) + + study_title = submission_attr_dict.get("Title", None) + if not study_title: + study_title = 
study_section_attr_dict.get("Title", "Unknown") + + return study_title + + +def get_licence(study_attributes: Dict[str, Any]) -> semantic_models.LicenceType: + """Return enum version of licence of study + + """ + licence = re.sub(r"\s", "_", study_attributes.get("License", "CC0")) + return semantic_models.LicenceType(licence) + + +def get_external_reference( + submission: Submission, +) -> List[semantic_models.ExternalReference]: + """Map biostudies.Submission.Link to semantic_models.ExternalReference + + """ + sections = find_sections_recursive(submission.section, ["links",]) + + key_mapping = [ + ("link", "url", None), + ("link_type", "Type", None), + ("description", "Description", None), + ] + + return_list = [] + for section in sections: + attr_dict = attributes_to_dict(section.attributes) + model_dict = {k: attr_dict.get(v, default) for k, v, default in key_mapping} + return_list.append( + semantic_models.External_reference.model_validate(model_dict) + ) + return return_list + + +# TODO: Put comments and docstring +def get_grant(submission: Submission) -> List[semantic_models.Grant]: + funding_body_dict = get_funding_body(submission) + key_mapping = [ + ("id", "grant_id", None), + ] + grant_dict = get_generic_section_as_dict( + submission, ["Funding",], semantic_models.Grant, key_mapping + ) + + grant_list = [] + for k, v in grant_dict.items(): + if k in funding_body_dict: + v.funder.append(funding_body_dict[k]) + grant_list.append(v) + return grant_list + + +# TODO: Put comments and docstring +def get_funding_body(submission: Submission) -> semantic_models.FundingBody: + + key_mapping = [ + ("display_name", "Agency", None,), + ] + funding_body = get_generic_section_as_dict( + submission, ["Funding",], semantic_models.FundingBody, key_mapping + ) + return funding_body + + +# TODO: Put comments and docstring +def get_generic_section_as_list( + submission: Submission, + section_name: List[str], + mapped_object: [Any], + key_mapping: List[Tuple[str, str, [str | 
None | List]]], + mapped_attrs_dict: Optional[Dict[str, Any]] = None, +) -> List[Any]: + """Map biostudies.Submission objects to either semantic_models or bia_data_model equivalent + + """ + sections = find_sections_recursive(submission.section, section_name) + + return_list = [] + for section in sections: + if mapped_attrs_dict is None: + attr_dict = attributes_to_dict(section.attributes) + else: + attr_dict = mattributes_to_dict(section.attributes, mapped_attrs_dict) + model_dict = {k: attr_dict.get(v, default) for k, v, default in key_mapping} + return_list.append(mapped_object.model_validate(model_dict)) + return return_list + + +# TODO: Put comments and docstring +def get_generic_section_as_dict( + submission: Submission, + section_name: List[str], + mapped_object: [Any], + key_mapping: List[Tuple[str, str, [str | None | List]]], +) -> Dict[str, Any]: + """Map biostudies.Submission objects to dict containing either semantic_models or bia_data_model equivalent + + """ + sections = find_sections_recursive(submission.section, section_name) + + return_dict = {} + for section in sections: + attr_dict = attributes_to_dict(section.attributes) + model_dict = {k: attr_dict.get(v, default) for k, v, default in key_mapping} + return_dict[section.accno] = mapped_object.model_validate(model_dict) + return return_dict + + +def get_affiliation(submission: Submission) -> Dict[str, semantic_models.Affiliation]: + """Maps biostudies.Submission.Organisation sections to semantic_models.Affiliations + + """ + + organisation_sections = find_sections_recursive( + submission.section, ["organisation", "organization"], [] + ) + + key_mapping = [ + ("display_name", "Name", None), + ("rorid", "RORID", None), + # TODO: Address does not exist in current biostudies.Organisation + ("address", "Address", None), + # TODO: does not exist in current biostudies.Organisation + ("website", "Website", None), + ] + + affiliation_dict = {} + for section in organisation_sections: + attr_dict = 
attributes_to_dict(section.attributes) + + model_dict = {k: attr_dict.get(v, default) for k, v, default in key_mapping} + affiliation_dict[section.accno] = semantic_models.Affiliation.model_validate( + model_dict + ) + + return affiliation_dict + + +def get_publication(submission: Submission) -> List[semantic_models.Publication]: + publication_sections = find_sections_recursive( + submission.section, ["publication",], [] + ) + key_mapping = [ + ("doi", "DOI", None), + ("pubmed_id", "Pubmed ID", None), + ("author", "Authors", None), + ("release_date", "Year", None), + ("title", "Title", None), + ] + publications = [] + for section in publication_sections: + attr_dict = attributes_to_dict(section.attributes) + + model_dict = {k: attr_dict.get(v, default) for k, v, default in key_mapping} + publications.append(semantic_models.Publication.model_validate(model_dict)) + + return publications + + +def get_contributor(submission: Submission) -> List[semantic_models.Contributor]: + """ Map authors in submission to semantic_model.Contributors + + """ + affiliation_dict = get_affiliation(submission) + key_mapping = [ + ("display_name", "Name", None), + ("contact_email", "E-mail", "not@supplied.com"), + ("role", "Role", None), + ("orcid", "ORCID", None), + ("affiliation", "affiliation", []), + ] + author_sections = find_sections_recursive(submission.section, ["author",], []) + contributors = [] + for section in author_sections: + attr_dict = mattributes_to_dict(section.attributes, affiliation_dict) + model_dict = {k: attr_dict.get(v, default) for k, v, default in key_mapping} + # TODO: Find out if authors can have more than one organisation -> + # what are the implications for mattributes_to_dict? 
+ if model_dict["affiliation"] is None: + model_dict["affiliation"] = [] + elif type(model_dict["affiliation"]) is not list: + model_dict["affiliation"] = [ + model_dict["affiliation"], + ] + contributors.append(semantic_models.Contributor.model_validate(model_dict)) + + return contributors + + +def find_sections_recursive( + section: Section, search_types: List[str], results: Optional[List[Section]] = [] +) -> List[Section]: + """Find all sections of search_types within tree, starting at given section + + """ + + search_types_lower = [s.lower() for s in search_types] + if section.type.lower() in search_types_lower: + results.append(section) + + # Each thing in section.subsections is either Section or List[Section] + # First, let's make sure we ensure they're all lists: + nested = [ + [item] if not isinstance(item, list) else item for item in section.subsections + ] + # Then we can flatten this list of lists: + flattened = sum(nested, []) + + for section in flattened: + find_sections_recursive(section, search_types, results) + + return results + + +# TODO check type of reference_dict. Possibly Dict[str, str], but need to +# verify. This also determines type returned by function +def mattributes_to_dict( + attributes: List[Attribute], reference_dict: Dict[str, Any] +) -> Dict[str, Any]: + """Return attributes as dictionary dereferencing attribute references + + Return the list of attributes supplied as a dictionary. Any attributes + whose values are references are 'dereferenced' using the reference_dict + """ + + def value_or_dereference(attr): + if attr.reference: + return reference_dict[attr.value] + else: + return attr.value + + return {attr.name: value_or_dereference(attr) for attr in attributes} + + +# TODO: Need to use a canonical version for this function e.g. 
from API +def dict_to_uuid(my_dict: Dict[str, Any], attributes_to_consider: List[str]) -> str: + """Create uuid from specific keys in a dictionary + + """ + + seed = "".join([f"{my_dict[attr]}" for attr in attributes_to_consider]) + hexdigest = hashlib.md5(seed.encode("utf-8")).hexdigest() + return str(uuid.UUID(version=4, hex=hexdigest)) diff --git a/bia-ingest-shared-models/pyproject.toml b/bia-ingest-shared-models/pyproject.toml new file mode 100644 index 00000000..f32187d5 --- /dev/null +++ b/bia-ingest-shared-models/pyproject.toml @@ -0,0 +1,27 @@ +[tool.poetry] +name = "bia-ingest-sm" +version = "0.1.0" +description = "BIA ingest using shared models" +authors = ["Kola Babalola "] +license = "Apache Software License 2.0" +readme = "README.md" +packages = [{include = "bia_ingest_sm"}] + +[tool.poetry.dependencies] +python = "^3.10" +requests = "^2.31.0" +pytest = "^7.0" +bia-shared-datamodels = { path = "../bia-shared-datamodels", develop = true } +typer = "^0.12.3" +typing-extensions = "^4.12.2" + +[tool.poetry.scripts] +biaingest = "bia_ingest_sm.cli:app" + + +[tool.poetry.group.dev.dependencies] +ipython = "^8.22.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/bia-ingest-shared-models/test/__init__.py b/bia-ingest-shared-models/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bia-ingest-shared-models/test/conftest.py b/bia-ingest-shared-models/test/conftest.py new file mode 100644 index 00000000..db2180d9 --- /dev/null +++ b/bia-ingest-shared-models/test/conftest.py @@ -0,0 +1,20 @@ +from pathlib import Path +import json +import pytest +from bia_ingest_sm.biostudies import Submission + + +@pytest.fixture +def base_path() -> Path: + """Return full path to test directory + + """ + return Path(__file__).parent + + +@pytest.fixture +def test_submission(base_path: Path) -> Submission: + submission_path = base_path / "data" / "S-BIADTEST.json" + json_data = 
json.loads(submission_path.read_text()) + submission = Submission.model_validate(json_data) + return submission diff --git a/bia-ingest-shared-models/test/data/S-BIADTEST.json b/bia-ingest-shared-models/test/data/S-BIADTEST.json new file mode 100644 index 00000000..bea2a603 --- /dev/null +++ b/bia-ingest-shared-models/test/data/S-BIADTEST.json @@ -0,0 +1,297 @@ +{ + "accno" : "S-BIADTEST", + "attributes" : [ { + "name" : "Template", + "value" : "BioImages.v4" + }, { + "name" : "ReleaseDate", + "value" : "2024-02-13" + }, { + "name" : "AttachTo", + "value" : "BioImages" + } ], + "section" : { + "accno" : "section0", + "type" : "Study", + "attributes" : [ { + "name" : "Title", + "value" : "A test submission with title greater than 25 characters" + }, { + "name" : "Description", + "value" : "A test submission to allow testing without retrieving from bia server" + }, { + "name" : "Keywords", + "value" : "Test keyword1" + }, { + "name" : "Keywords", + "value" : "Test keyword2" + }, { + "name" : "Keywords", + "value" : "Test keyword3" + }, { + "name" : "Acknowledgements", + "value" : "We thank you" + }, { + "name" : "License", + "value" : "CC0", + "valqual" : [ { + "name" : "URL", + "value" : "https://creativecommons.org/publicdomain/zero/1.0/legalcode" + } ] + }, { + "name" : "Funding statement", + "value" : "This work was funded by the EBI" + } ], + "links" : [ { + "url" : "https://www.test.link1.com/", + "attributes" : [ { + "name" : "Description", + "value" : "Test link 1." 
+ } ] + }, { + "url" : "ERP116793", + "attributes" : [ { + "name" : "Description", + "value" : "Test ENA link" + }, { + "name" : "Type", + "value" : "ENA" + }] + } ], + "subsections" : [ { + "type" : "author", + "accno" : "author1", + "attributes" : [ { + "name" : "Name", + "value" : "Test Author1" + }, { + "name" : "E-mail", + "value" : "test_author1@ebi.ac.uk" + }, { + "name" : "Role", + "value" : "corresponding author" + }, { + "name" : "ORCID", + "value" : "0000-0000-0000-0000" + }, { + "name" : "affiliation", + "value" : "o1", + "reference" : true + } ] + },{ + "type" : "Author", + "accno" : "author2", + "attributes" : [ { + "name" : "Name", + "value" : "Test Author2" + }, { + "name" : "E-mail", + "value" : "test_author2@ebi.ac.uk" + }, { + "name" : "Role", + "value" : "first author" + }, { + "name" : "ORCID", + "value" : "1111-1111-1111-1111" + }, { + "name" : "affiliation", + "value" : "o2", + "reference" : true + } ] + }, { + "accno" : "o1", + "type" : "organisation", + "attributes" : [ { + "name" : "Name", + "value" : "Test College 1" + } ] + }, { + "accno" : "o2", + "type" : "organisation", + "attributes" : [ { + "name" : "Name", + "value" : "Test College 2" + } ] + }, { + "type" : "Publication", + "attributes" : [ { + "name" : "Pubmed ID", + "value" : "38381674" + }, { + "name" : "Title", + "value" : "Test publication 1" + }, { + "name" : "Authors", + "value" : "Test Author11, Test Author12." 
+ }, { + "name" : "Year", + "value" : "2024" + } ] + }, { + "type" : "Publication", + "attributes" : [ { + "name" : "Pubmed ID", + "value" : "38106175" + }, { + "name" : "Title", + "value" : "Test publication 2" + }, { + "name" : "Authors", + "value" : "Test Author21, Test Author22" + }, { + "name" : "DOI", + "value" : "10.1101/2023.12.07.570699" + },{ + "name" : "Year", + "value" : "2023" + }] + }, { + "type" : "Funding", + "accno" : "funding1", + "attributes" : [ { + "name" : "Agency", + "value" : "Test funding body1" + }, { + "name" : "grant_id", + "value" : "TESTFUNDS1" + } ] + }, { + "type" : "Funding", + "accno" : "funding2", + "attributes" : [ { + "name" : "Agency", + "value" : "Test funding body2" + }, { + "name" : "grant_id", + "value" : "TESTFUNDS2" + } ] + }, { + "accno" : "Biosample-1", + "type" : "Biosample", + "attributes" : [ { + "name" : "Title", + "value" : "Test Biosample " + }, { + "name" : "Organism", + "value" : "Homo sapiens (human)" + }, { + "name" : "Description", + "value" : "Test description (\"with some escaped chars\") " + }, { + "name" : "Biological entity", + "value" : "Test biological entity" + }, { + "name" : "Experimental variable", + "value" : "Test experimental entity" + }, { + "name" : "Extrinsic variable", + "value" : "Test extrinsic variable" + }, { + "name" : "Intrinsic variable", + "value" : "Test intrinsic variable\nwith escaped character" + } ] + }, { + "accno" : "Specimen-2", + "type" : "Specimen", + "attributes" : [ { + "name" : "Title", + "value" : "Test specimen" + }, { + "name" : "Sample preparation protocol", + "value" : "Test sample preparation protocol" + }, { + "name" : "Growth protocol", + "value" : "Test growth protocol" + } ] + }, { + "accno" : "Image acquisition-3", + "type" : "Image acquisition", + "attributes" : [ { + "name" : "Title", + "value" : "Test Primary Screen Image Acquisition" + }, { + "name" : "Imaging instrument", + "value" : "Test imaging instrument" + }, { + "name" : "Image acquisition 
parameters", + "value" : "Test image acquisition parameters" + }, { + "name" : "Imaging method", + "value" : "confocal microscopy" + } ] + }, { + "accno" : "Image acquisition-7", + "type" : "Image acquisition", + "attributes" : [ { + "name" : "Title", + "value" : "Test Secondary Screen Image Acquisition" + }, { + "name" : "Imaging instrument", + "value" : "Test imaging instrument 2" + }, { + "name" : "Image acquisition parameters", + "value" : "Test image acquisition parameters 2" + }, { + "name" : "Imaging method", + "value" : "fluorescence microscopy" + } ] + }, { + "accno" : "Image analysis-5", + "type" : "Image analysis", + "attributes" : [ { + "name" : "Title", + "value" : "Test image analysis" + }, { + "name" : "Image analysis overview", + "value" : "Test image analysis overview" + } ] + }, { + "accno" : "Study Component-4", + "type" : "Study Component", + "attributes" : [ { + "name" : "Name", + "value" : "Primary and secondary secretome screens" + }, { + "name" : "Description", + "value" : "Images and analyses from primary and secondary secretome screens" + }, { + "name" : "File List", + "value" : "File_List_secretome_both_final.json" + } ], + "subsections" : [ { + "type" : "Associations", + "accno" : "association0", + "attributes" : [ { + "name" : "Biosample", + "value" : "Test Biosample " + }, { + "name" : "Specimen", + "value" : "Test specimen" + }, { + "name" : "Image acquisition", + "value" : "Test Primary Screen Image Acquisition" + }, { + "name" : "Image analysis", + "value" : "Test image analysis" + } ] + }, { + "type" : "Associations", + "accno" : "association1", + "attributes" : [ { + "name" : "Biosample", + "value" : "Test Biosample " + }, { + "name" : "Specimen", + "value" : "Test specimen" + }, { + "name" : "Image acquisition", + "value" : "Test Secondary Screen Image Acquisition" + }, { + "name" : "Image analysis", + "value" : "Test image analysis" + } ] + } ] + } ] + }, + "type" : "submission" +} diff --git 
a/bia-ingest-shared-models/test/test_shared_models.py b/bia-ingest-shared-models/test/test_shared_models.py new file mode 100644 index 00000000..94432c68 --- /dev/null +++ b/bia-ingest-shared-models/test/test_shared_models.py @@ -0,0 +1,27 @@ +from typing import Dict +import pytest +from . import utils +from .utils import bia_data_model, semantic_models +from bia_ingest_sm import conversion + + +@pytest.mark.parametrize( + ("expected_model_func", "model_creation_func",), + ( + (utils.get_test_affiliation, conversion.get_affiliation,), + (utils.get_test_contributor, conversion.get_contributor,), + (utils.get_test_grant, conversion.get_grant,), + (utils.get_test_study, conversion.get_study,), + # Not testing as we need to deal with links that are not proper + # urls + # (utils.get_test_external_reference, conversion.get_external_reference,), + # Do not test semantic_models.Publication yet. Need to resolve + # issues around some fields being mandatory or optional + # (utils.get_test_publication, conversion.get_publication,), + # (bia_data_model.Study, conversion.get_study_from_submission,), + ), +) +def test_create_models(expected_model_func, model_creation_func, test_submission): + expected = expected_model_func() + created = model_creation_func(test_submission) + assert expected == created diff --git a/bia-ingest-shared-models/test/utils.py b/bia-ingest-shared-models/test/utils.py new file mode 100644 index 00000000..3c4a6686 --- /dev/null +++ b/bia-ingest-shared-models/test/utils.py @@ -0,0 +1,477 @@ +"""Utility functions to create models + + This module attempts to create models starting from the outer nodes (leaves) of the + model dependency graph + +""" + +from typing import Dict, List +from src.bia_models import bia_data_model, semantic_models +from bia_ingest_sm.conversion import dict_to_uuid +from uuid import uuid4 + +template_taxon = semantic_models.Taxon.model_validate( + { + "common_name": "Test Common Name", + "scientific_name": "Test Scientific Name", + 
"ncbi_id": "Test_NCBI_ID", + } +) + + +def get_template_channel() -> semantic_models.Channel: + return semantic_models.Channel.model_validate( + { + "colormap_start": 0.0, + "colormap_end": 1.0, + "scale_factor": 1.0, + "label": "Template label", + } + ) + + +def get_template_rendered_view() -> semantic_models.RenderedView: + return semantic_models.RenderedView.model_validate( + { + "z": "Template z position", + "t": "Template t position", + "channel_information": [ + get_template_channel(), + ], + } + ) + + +def get_template_specimen_preparation_protocol() -> ( + bia_data_model.SpecimenPrepartionProtocol +): + specimen_preparation_protocol = ( + bia_data_model.SpecimenPrepartionProtocol.model_validate( + { + "uuid": uuid4(), + "title_id": "Test specimen preparation protocol", + "method_description": "Test description", + "signal_contrast_mechanism_description": "Test description", + "growth_protocol_description": "Test description", + "channel_content_description": "Test description", + "channel_biological_entity": "Test Entity", + } + ) + ) + return specimen_preparation_protocol + + +def get_template_biosample() -> bia_data_model.BioSample: + biosample = bia_data_model.BioSample.model_validate( + { + "uuid": uuid4(), + "title_id": "Template BioSample", + "organism_classification": [ + template_taxon.model_dump(), + ], + "description": "Test biosample description", + "experimental_variable_description": [ + "Description of experimental variable", + ], + "extrinsic_variable_description": [ + "Description of external treatment", + ], + "intrinsic_variable_description": [ + "Description of internal treatment", + ], + } + ) + return biosample + + +# Depends on: +# bia_data_model.BioSample +# bia_data_model.SpecimenPreparationProtocol +def get_template_specimen() -> bia_data_model.Specimen: + specimen = bia_data_model.Specimen.model_validate( + { + "preparation_method": [ + get_template_specimen_preparation_protocol().uuid, + ], + "sample_of": [ + 
get_template_biosample().uuid, + ], + } + ) + return specimen + + +# Depends on ExperimentalImagingDataset (circular) +def get_template_annotation_method() -> bia_data_model.AnnotationMethod: + annotation_method = bia_data_model.AnnotationMethod.model_validate( + { + "uuid": uuid4(), + "title_id": "Template annotation method", + "source_dataset": [], # ExperimentalImagingDataset.uuid or url + "method_description": "Template annotation method description", + "annotation_criteria": "Template annotation criteria", + "annotation_coverage": "Template annotation coverage", + "method_type": semantic_models.AnnotationType.class_labels, + } + ) + return annotation_method + + +# Depends on: +# bia_data_model.ExperimentalImagingDataset (circular dependency) +# bia_data_model.ImageAcquisition +# bia_data_model.ImageRepresentation +# bia_data_model.Specimen +def get_template_experimentally_captured_image() -> ( + bia_data_model.ExperimentallyCapturedImage +): + return bia_data_model.ExperimentallyCapturedImage.model_validate( + { + "uuid": uuid4(), + "acquisition_process": [get_template_image_acquisition().uuid], + "representation": [ + get_template_image_representation().uuid, + ], + "submission_dataset": get_template_experimental_imaging_dataset().uuid, + "subject": get_template_specimen(), + "attribute": {}, + } + ) + + +# Depends on: +# bia_data_model.ImageAnnotationDataset (circular dependency) +# bia_data_model.AnnotationMethod +# bia_data_model.ImageRepresentation +def get_template_derived_image() -> bia_data_model.DerivedImage: + derived_image = bia_data_model.DerivedImage.model_validate( + { + "uuid": uuid4(), + "source_image": [ + get_template_image_representation().uuid, + ], + "submission_dataset": get_template_image_annotation_dataset().uuid, + "creation_process": get_template_annotation_method().uuid, + "representation": [ + get_template_image_representation().uuid, + ], + "transformation_description": "Template transformation description", + 
"spatial_information": "Template spatial information", + "attribute": {}, + } + ) + return derived_image + + +# Depends on: +# bia_data_model.DerivedImage +# bia_data_model.FileReference (this is a circular dependence!) +# bia_data_model.Study +# bia_data_model.AnnotationFileReference (this is a circular dependence!) +# bia_data_model.AnnotationMethod +# +# TODO: Verify that in practice, the Datasets are created then the +# FileReference instances are added. So here we have empty lists +# for the dataset +def get_template_image_annotation_dataset() -> bia_data_model.ImageAnnotationDataset: + image_annotation_dataset = bia_data_model.ImageAnnotationDataset.model_validate( + { + "uuid": uuid4(), + "title_id": "Template image annotation dataset", + "image": [ + get_template_image_representation().uuid, + ], + "file": [], # This should be a list of FileReference UUIDs ... + "annotation_file": [], # This should be a list of AnnotationFileReference UUIDs ... + "submitted_in_study": get_template_study().uuid, + "annotation_method": get_template_annotation_method().uuid, + "file_reference_count": 0, + "image_count": 0, + "example_image_uri": ["https://dummy.url.org"], + } + ) + return image_annotation_dataset + + +def get_template_image_acquisition() -> bia_data_model.ImageAcquisition: + image_acquisition = bia_data_model.ImageAcquisition.model_validate( + { + "uuid": uuid4(), + "title_id": "Template image acquisition", + "method_description": "Template method description", + "imaging_instrument_description": "Template imaging instrument", + "image_acquisition_parameters": "Template image acquisition parameters", + "fbbi_id": [ + "Test FBBI ID", + ], + } + ) + return image_acquisition + + +def get_template_image_analysis_method() -> semantic_models.ImageAnalysisMethod: + return semantic_models.ImageAnalysisMethod.model_validate( + { + "method_description": "Template Analysis method", + "features_analysed": "Template features analysed", + } + ) + + +def 
get_template_image_correlation_method() -> semantic_models.ImageCorrelationMethod: + return semantic_models.ImageCorrelationMethod.model_validate( + { + "method_description": "Template Analysis method", + "fiducials_used": "Template fiducials used", + "transformation_matrix": "Template transformation matrix", + } + ) + + +# Depends on: +# bia_data_model.ExperimentallyCapturedImage +# bia_data_model.FileReference (this is a circular dependence!) +# bia_data_model.Study +# bia_data_model.SpecimenPreparationProtocol +# bia_data_model.ImageAcquisition +# bia_data_model.BioSample +# +# TODO: Verify that in practice, the Datasets are created then the +# FileReference instances are added. So here we have empty lists +# for the dataset +def get_template_experimental_imaging_dataset() -> ( + bia_data_model.ExperimentalImagingDataset +): + experimental_imaging_dataset = bia_data_model.ExperimentalImagingDataset.model_validate( + { + "uuid": uuid4(), + "title_id": "Template experimental image dataset", + "image": [], # This should be a list of Experimentally captured image UUIDs + "file": [], # This should be a list of FileReference UUIDs ... 
+ "submitted_in_study": get_template_study().uuid, + "specimen_preparation_method": [ + get_template_specimen_preparation_protocol().uuid, + ], + "acquisition_method": [ + get_template_image_acquisition().uuid, + ], + "biological_entity": [ + get_template_biosample().uuid, + ], + "analysis_method": [ + get_template_image_analysis_method().model_dump(), + ], + "correlation_method": [ + get_template_image_correlation_method().model_dump(), + ], + "file_reference_count": 0, + "image_count": 0, + "example_image_uri": ["https://dummy.url.org"], + } + ) + return experimental_imaging_dataset + + +# Depends on: +# bia_data_model.ImageAnnotationDataset (circular) +# bia_data_model.ExperimentalImagingDataset (circular) +def get_template_annotation_file_reference() -> bia_data_model.AnnotationFileReference: + return bia_data_model.AnnotationFileReference.model_validate( + { + "uuid": uuid4(), + "file_name": "Dummy file name", + "format": "Dummy format", + "size_in_bytes": 10, + "uri": "https://dummy.uri.co", + "attribute": {}, + "submission_dataset": get_template_image_annotation_dataset().uuid, + "source_image": [ + get_template_image_representation().uuid, + ], + "transformation_description": "Template transformation description", + "spatial_information": "Template spatial information", + "creation_process": get_template_annotation_method().uuid, + } + ) + + +# Depends on: +# bia_data_model.ImageAnnotationDataset (circular) +# bia_data_model.ExperimentalImagingDataset (circular) +def get_template_file_reference() -> bia_data_model.FileReference: + file_reference = bia_data_model.FileReference.model_validate( + { + "uuid": uuid4(), + "file_name": "Dummy file name", + "format": "Dummy format", + "size_in_bytes": 10, + "uri": "https://dummy.uri.co", + "attribute": {}, + "submission_dataset": get_template_experimental_imaging_dataset().uuid, + } + ) + return file_reference + + +# Depends on: +# bia_data_model.FileReference ( +def get_template_image_representation() -> 
bia_data_model.ImageRepresentation: + return bia_data_model.ImageRepresentation.model_validate( + { + "uuid": uuid4(), + "original_file_reference": [ + get_template_file_reference().uuid, + ], + "image_format": "Template image format", + "file_uri": [ + "https://dummy.uri.org", + ], + "total_size_in_bytes": 0, + "physical_size_x": 1, + "physical_size_y": 1, + "physical_size_z": 1, + "size_x": 1, + "size_y": 1, + "size_z": 1, + "size_c": 1, + "size_t": 1, + "image_viewer_setting": [ + get_template_rendered_view().model_dump(), + ], + "attribute": {}, + } + ) + + +def get_test_affiliation() -> Dict[str, semantic_models.Affiliation]: + affiliation1 = semantic_models.Affiliation.model_validate( + { + "display_name": "Test College 1", + "rorid": None, + "address": None, + "website": None, + } + ) + affiliation2 = semantic_models.Affiliation.model_validate( + { + "display_name": "Test College 2", + "rorid": None, + "address": None, + "website": None, + } + ) + return { "o1": affiliation1, "o2": affiliation2, } + + +def get_test_contributor() -> Dict[str, semantic_models.Contributor]: + affiliations = get_test_affiliation() + contributor1 = semantic_models.Contributor.model_validate( + { + "display_name": "Test Author1", + "contact_email": "test_author1@ebi.ac.uk", + "role": "corresponding author", + "affiliation": [ + affiliations["o1"], + ], + "rorid": None, + "address": None, + "website": None, + "orcid": "0000-0000-0000-0000", + } + ) + contributor2 = semantic_models.Contributor.model_validate( + { + "display_name": "Test Author2", + "contact_email": "test_author2@ebi.ac.uk", + "role": "first author", + "affiliation": [ + affiliations["o2"], + ], + "rorid": None, + "address": None, + "website": None, + "orcid": "1111-1111-1111-1111", + } + ) + + return [contributor1, contributor2,] + +def get_test_publication() -> List[semantic_models.Publication]: + publication1 = semantic_models.Publication.model_validate({ + "pubmed_id": "38381674", + "title": "Test publication 1", 
+ # TODO: No release date -> ST only collects Year + "release_date": "2024", + # TODO: Author is a string here. + "author": "Test Author11, Test Author12.", + }) + publication2 = semantic_models.Publication.model_validate({ + "pubmed_id": "38106175", + "doi": "10.1101/2023.12.07.570699", + "title": "Test publication 2", + # TODO: Author is a string here. + "author": "Test Author21, Test Author22", + "release_date": "2023", + }) + return [publication1, publication2,] + +def get_test_external_reference() -> List[semantic_models.ExternalReference]: + link1 = semantic_models.ExternalReference.model_validate({ + "link": "https://www.test.link1.com/", + "description": "Test link 1.", + }) + link2 = semantic_models.ExternalReference.model_validate({ + "link": "ERP116793", + "description": "Test ENA link", + "Type": "ENA", + }) + return [link1, link2,] + + +def get_test_grant() -> List[semantic_models.Grant]: + funding_body1 = semantic_models.FundingBody.model_validate({ + "display_name": "Test funding body1", + }) + funding_body2 = semantic_models.FundingBody.model_validate({ + "display_name": "Test funding body2", + }) + + grant1 = semantic_models.Grant.model_validate({ + "id": "TESTFUNDS1", + "funder": [funding_body1,], + }) + grant2 = semantic_models.Grant.model_validate({ + "id": "TESTFUNDS2", + "funder": [funding_body2,], + }) + return [grant1, grant2,] + +def get_test_study() -> bia_data_model.Study: + contributor = get_test_contributor() + grant = get_test_grant() + study_dict = { + "accession_id": "S-BIADTEST", + "title": "A test submission with title greater than 25 characters", + "description": "A test submission to allow testing without retrieving from bia server", + "release_date": "2024-02-13", + "licence": semantic_models.LicenceType.CC0, + "acknowledgement": "We thank you", + "funding_statement": "This work was funded by the EBI", + "attribute": { + + }, + "related_publication": [], + "author": [ c.model_dump() for c in contributor ], + "keyword": [ + "Test 
keyword1", + "Test keyword2", + "Test keyword3", + ], + "grant": [ g.model_dump() for g in grant ], + "experimental_imaging_component": [], + "annotation_component": [], + } + study_uuid = dict_to_uuid(study_dict, ["accession_id", ]) + study_dict["uuid"] = study_uuid + study = bia_data_model.Study.model_validate(study_dict) + return study diff --git a/bia-shared-datamodels/Readme.md b/bia-shared-datamodels/README.md similarity index 100% rename from bia-shared-datamodels/Readme.md rename to bia-shared-datamodels/README.md diff --git a/bia-shared-datamodels/pyproject.toml b/bia-shared-datamodels/pyproject.toml index 02fb9ea1..8b2e4591 100644 --- a/bia-shared-datamodels/pyproject.toml +++ b/bia-shared-datamodels/pyproject.toml @@ -5,6 +5,7 @@ description = "Schemas and models for working with the BioImage Archive's (Metad authors = ["Francois Sherwood "] license = "Apache Software License 2.0" readme = "README.md" +packages = [{include = "src/bia_models"}] [tool.poetry.dependencies] python = "^3.10" diff --git a/bia-shared-datamodels/src/bia_models/bia_data_model.py b/bia-shared-datamodels/src/bia_models/bia_data_model.py index 50567787..ff8104e4 100644 --- a/bia-shared-datamodels/src/bia_models/bia_data_model.py +++ b/bia-shared-datamodels/src/bia_models/bia_data_model.py @@ -1,6 +1,6 @@ from __future__ import annotations -import semantic_models +from . 
import semantic_models from pydantic import BaseModel, Field, AnyUrl from typing import List, Optional, Union from uuid import UUID diff --git a/bia-shared-datamodels/src/bia_models/semantic_models.py b/bia-shared-datamodels/src/bia_models/semantic_models.py index 2cfa27ec..b7432ae0 100644 --- a/bia-shared-datamodels/src/bia_models/semantic_models.py +++ b/bia-shared-datamodels/src/bia_models/semantic_models.py @@ -144,7 +144,14 @@ class Publication(DocumentMixin): pubmed_id: Optional[str] = Field( None, description="""Identifier for journal articles/abstracts in PubMed""" ) - doi: str = Field(description="""Digital Object Identifier (DOI)""") + doi: Optional[str] = Field(None, description="""Digital Object Identifier (DOI)""") + # TODO: Discuss making changes below to allow Publications created by + # submission tool to be ingested. See https://app.clickup.com/t/8694zc48g + #doi: Optional[str] = Field(None, description="""Digital Object Identifier (DOI)""") + ## Override DocumentMixin.release_date as biostudies.Submission.Publication only has year of publication + #release_date: Optional[str] = Field(None, description="""Release date associated with publication. Not necessarily a well formatted date string""") + ## Override DocumentMixin.Authors as biostudies.Submission.Publication imports just a string with author names + #author: Optional[str] = Field(None, description="""Names of author(s)""") class ExternalReference(BaseModel):