Skip to content

Commit

Permalink
Fix some issues, add CLI for ingest, format with black
Browse files Browse the repository at this point in the history
  • Loading branch information
kbab committed Jul 3, 2024
1 parent b1a3225 commit 024b5cb
Show file tree
Hide file tree
Showing 7 changed files with 124 additions and 39 deletions.
14 changes: 9 additions & 5 deletions bia-ingest-shared-models/bia_ingest_sm/biostudies.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,15 +151,19 @@ def load_submission(accession_id: str) -> Submission:
return submission


def attributes_to_dict(attributes: List[Attribute]) -> Dict[str, Optional[str|List[str]]]:
def attributes_to_dict(
attributes: List[Attribute],
) -> Dict[str, Optional[str | List[str]]]:

attr_dict = {}
for attr in attributes:
if attr.name in attr_dict:
if type(attr_dict[attr.name]) is list:
attr_dict[attr.name].append(attr.value)
else:
attr_dict[attr.name] = [attr_dict[attr.name],]
attr_dict[attr.name] = [
attr_dict[attr.name],
]
attr_dict[attr.name].append(attr.value)
else:
attr_dict[attr.name] = attr.value
Expand Down Expand Up @@ -206,11 +210,11 @@ def find_file_lists_in_submission(
return find_file_lists_in_section(submission.section, [])


# KB 14/06/2024 commented out as I need to replace parse_raw_as with
# KB 14/06/2024 commented out as I need to replace parse_raw_as with
# TypeAdapter for pydantic >=2
#def flist_from_flist_fname(
# def flist_from_flist_fname(
# accession_id: str, flist_fname: str, extra_attribute: Union[List[str], str] = None
#) -> List[File]:
# ) -> List[File]:
#
# flist_url = FLIST_URI_TEMPLATE.format(
# accession_id=accession_id, flist_fname=flist_fname
Expand Down
23 changes: 23 additions & 0 deletions bia-ingest-shared-models/bia_ingest_sm/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import typer
from typing import Optional
from typing_extensions import Annotated
from bia_ingest_sm.biostudies import load_submission
from bia_ingest_sm.conversion import get_study

app = typer.Typer()


@app.command(help="Ingest from biostudies and echo json of bia_data_model.Study")
def ingest(accession_id: Annotated[str, typer.Argument()]) -> None:
    """Load a submission, convert it to a Study and echo the Study as JSON.

    The explicit ``help=`` text on the decorator is what the CLI shows; this
    docstring is for readers of the code.

    Args:
        accession_id: Accession ID handed to ``load_submission``.
    """
    # Fetch and convert in one step, then serialise with a 2-space indent.
    study = get_study(load_submission(accession_id))
    study_json = study.model_dump_json(indent=2)
    typer.echo(study_json)


@app.callback()
def main() -> None:
    # Intentionally a no-op: the callback exists only to be registered on the
    # app and performs no work of its own.
    # NOTE(review): presumably registered so that `ingest` stays an explicit
    # subcommand even though it is the only command - confirm against the
    # Typer docs. A `#` comment is used instead of a docstring because Typer
    # may surface a callback docstring as CLI help text.
    return None


if __name__ == "__main__":
app()
105 changes: 80 additions & 25 deletions bia-ingest-shared-models/bia_ingest_sm/conversion.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,38 @@
import re
import hashlib
import uuid
from typing import List, Any, Dict, Optional, Tuple
from .biostudies import Submission, attributes_to_dict, Section, Attribute
from src.bia_models import bia_data_model, semantic_models


def get_study(submission: Submission) -> bia_data_model.Study:
"""Return an API study model populated from the submission
"""

submission_attributes = attributes_to_dict(submission.attributes)
contributors = get_contributor(submission)
grants = get_grant(submission)

study_attributes = attributes_to_dict(submission.section.attributes)

study_title = study_title_from_submission(submission)
if "Title" in study_attributes:
study_attributes.pop("Title")

licence = get_licence(study_attributes)
if "License" in study_attributes:
study_attributes.pop("License")

study_dict = {
"accession_id": submission.accno,
# TODO: Do more robust search for title - sometimes it is in
# actual submission - see old ingest code
"title": study_attributes.pop("Title", None),
"title": study_title,
"description": study_attributes.pop("Description", None),
"release_date": submission_attributes.pop("ReleaseDate"),
"licence": get_licence(study_attributes),
"licence": licence,
"acknowledgement": study_attributes.pop("Acknowledgements", None),
"funding_statement": study_attributes.pop("Funding statement", None),
"keyword": study_attributes.pop("Keywords", []),
Expand All @@ -31,28 +41,41 @@ def get_study(submission: Submission) -> bia_data_model.Study:
"attribute": study_attributes,
"experimental_imaging_component": [],
"annotation_component": [],

}
study_uuid = dict_to_uuid(study_dict, ["accession_id", "title", "release_date",])
study_uuid = dict_to_uuid(study_dict, ["accession_id",])
study_dict["uuid"] = study_uuid
study = bia_data_model.Study.model_validate(study_dict)

return study

def get_licence(submission_attributes: Dict[str, Any]) -> semantic_models.LicenceType:

def study_title_from_submission(submission: Submission) -> str:
    """Return the study title, preferring the submission-level attribute.

    The "Title" attribute of the submission itself is used when it is present
    and non-empty. Otherwise the "Title" attribute of the submission's
    top-level section is used. If that is also missing or empty, the literal
    string "Unknown" is returned.

    Args:
        submission: Submission whose attributes are searched for a title.

    Returns:
        The title found, or "Unknown" when no non-empty title exists.
    """
    submission_attr_dict = attributes_to_dict(submission.attributes)
    study_section_attr_dict = attributes_to_dict(submission.section.attributes)

    study_title = submission_attr_dict.get("Title")
    if not study_title:
        # Use `or` rather than a `.get` default: a "Title" key that is present
        # but holds None or "" must also fall back to "Unknown", otherwise a
        # non-string value would leak out despite the `-> str` annotation.
        study_title = study_section_attr_dict.get("Title") or "Unknown"

    return study_title


def get_licence(study_attributes: Dict[str, Any]) -> semantic_models.LicenceType:
"""Return enum version of licence of study
"""
licence = submission_attributes.pop("License").replace(" ", "_")
licence = re.sub(r"\s", "_", study_attributes.get("License", "CC0"))
return semantic_models.LicenceType(licence)

def get_external_reference(submission: Submission) -> List[semantic_models.ExternalReference]:

def get_external_reference(
submission: Submission,
) -> List[semantic_models.ExternalReference]:
"""Map biostudies.Submission.Link to semantic_models.ExternalReference
"""
sections = find_sections_recursive(
submission.section, ["links",]
)
sections = find_sections_recursive(submission.section, ["links",])

key_mapping = [
("link", "url", None),
Expand All @@ -64,36 +87,50 @@ def get_external_reference(submission: Submission) -> List[semantic_models.Exter
for section in sections:
attr_dict = attributes_to_dict(section.attributes)
model_dict = {k: attr_dict.get(v, default) for k, v, default in key_mapping}
return_list.append(semantic_models.External_reference.model_validate(model_dict))
return_list.append(
semantic_models.External_reference.model_validate(model_dict)
)
return return_list


# TODO: Add comments and a docstring
def get_grant(submission: Submission) -> List[semantic_models.Grant]:
funding_body_dict = get_funding_body(submission)
key_mapping = [
("id", "grant_id", None),
]
grant_dict = get_generic_section_as_dict(submission, ["Funding",], semantic_models.Grant, key_mapping)
grant_dict = get_generic_section_as_dict(
submission, ["Funding",], semantic_models.Grant, key_mapping
)

grant_list = []
for k, v in grant_dict.items():
if k in funding_body_dict:
v.funder.append(funding_body_dict[k])
grant_list.append(v)
return grant_list



# TODO: Add comments and a docstring
def get_funding_body(submission: Submission) -> semantic_models.FundingBody:

key_mapping = [
("display_name", "Agency", None,),
]
funding_body = get_generic_section_as_dict(submission, ["Funding",], semantic_models.FundingBody, key_mapping)
funding_body = get_generic_section_as_dict(
submission, ["Funding",], semantic_models.FundingBody, key_mapping
)
return funding_body


# TODO: Add comments and a docstring
def get_generic_section_as_list(submission: Submission, section_name: List[str], mapped_object: [Any], key_mapping: List[Tuple[str, str, [str|None|List]]], mapped_attrs_dict: Optional[Dict[str, Any]] = None) -> List[Any]:
def get_generic_section_as_list(
submission: Submission,
section_name: List[str],
mapped_object: [Any],
key_mapping: List[Tuple[str, str, [str | None | List]]],
mapped_attrs_dict: Optional[Dict[str, Any]] = None,
) -> List[Any]:
"""Map biostudies.Submission objects to either semantic_models or bia_data_model equivalent
"""
Expand All @@ -109,8 +146,14 @@ def get_generic_section_as_list(submission: Submission, section_name: List[str],
return_list.append(mapped_object.model_validate(model_dict))
return return_list


# TODO: Add comments and a docstring
def get_generic_section_as_dict(submission: Submission, section_name: List[str], mapped_object: [Any], key_mapping: List[Tuple[str, str, [str|None|List]]]) -> Dict[str,Any]:
def get_generic_section_as_dict(
submission: Submission,
section_name: List[str],
mapped_object: [Any],
key_mapping: List[Tuple[str, str, [str | None | List]]],
) -> Dict[str, Any]:
"""Map biostudies.Submission objects to dict containing either semantic_models or bia_data_model equivalent
"""
Expand All @@ -123,6 +166,7 @@ def get_generic_section_as_dict(submission: Submission, section_name: List[str],
return_dict[section.accno] = mapped_object.model_validate(model_dict)
return return_dict


def get_affiliation(submission: Submission) -> Dict[str, semantic_models.Affiliation]:
"""Maps biostudies.Submission.Organisation sections to semantic_models.Affiliations
Expand All @@ -146,13 +190,17 @@ def get_affiliation(submission: Submission) -> Dict[str, semantic_models.Affilia
attr_dict = attributes_to_dict(section.attributes)

model_dict = {k: attr_dict.get(v, default) for k, v, default in key_mapping}
affiliation_dict[section.accno] = semantic_models.Affiliation.model_validate(model_dict)
affiliation_dict[section.accno] = semantic_models.Affiliation.model_validate(
model_dict
)

return affiliation_dict


def get_publication(submission: Submission) -> List[semantic_models.Publication]:
publication_sections = find_sections_recursive(submission.section, ["publication",], [])
publication_sections = find_sections_recursive(
submission.section, ["publication",], []
)
key_mapping = [
("doi", "DOI", None),
("pubmed_id", "Pubmed ID", None),
Expand All @@ -169,11 +217,12 @@ def get_publication(submission: Submission) -> List[semantic_models.Publication]

return publications


def get_contributor(submission: Submission) -> List[semantic_models.Contributor]:
""" Map authors in submission to semantic_model.Contributors
"""
affiliation_dict = get_affiliation(submission)
affiliation_dict = get_affiliation(submission)
key_mapping = [
("display_name", "Name", None),
("contact_email", "E-mail", "[email protected]"),
Expand All @@ -191,11 +240,13 @@ def get_contributor(submission: Submission) -> List[semantic_models.Contributor]
if model_dict["affiliation"] is None:
model_dict["affiliation"] = []
elif type(model_dict["affiliation"]) is not list:
model_dict["affiliation"] = [model_dict["affiliation"],]
model_dict["affiliation"] = [
model_dict["affiliation"],
]
contributors.append(semantic_models.Contributor.model_validate(model_dict))

return contributors


def find_sections_recursive(
section: Section, search_types: List[str], results: Optional[List[Section]] = []
Expand All @@ -204,7 +255,7 @@ def find_sections_recursive(
"""

search_types_lower = [ s.lower() for s in search_types ]
search_types_lower = [s.lower() for s in search_types]
if section.type.lower() in search_types_lower:
results.append(section)

Expand All @@ -221,14 +272,18 @@ def find_sections_recursive(

return results


# TODO check type of reference_dict. Possibly Dict[str, str], but need to
# verify. This also determines type returned by function
def mattributes_to_dict(attributes: List[Attribute], reference_dict: Dict[str, Any]) -> Dict[str, Any]:
def mattributes_to_dict(
attributes: List[Attribute], reference_dict: Dict[str, Any]
) -> Dict[str, Any]:
"""Return attributes as dictionary dereferencing attribute references
Return the list of attributes supplied as a dictionary. Any attributes
whose values are references are 'dereferenced' using the reference_dict
"""

def value_or_dereference(attr):
if attr.reference:
return reference_dict[attr.value]
Expand Down
6 changes: 5 additions & 1 deletion bia-ingest-shared-models/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,13 @@ packages = [{include = "bia_ingest_sm"}]
[tool.poetry.dependencies]
python = "^3.10"
requests = "^2.31.0"
#bia-shared-datamodels = { path = "../bia-shared-datamodels", develop = true }
pytest = "^7.0"
bia-shared-datamodels = { path = "../bia-shared-datamodels", develop = true }
typer = "^0.12.3"
typing-extensions = "^4.12.2"

[tool.poetry.scripts]
biaingest = "bia_ingest_sm.cli:app"


[tool.poetry.group.dev.dependencies]
Expand Down
3 changes: 2 additions & 1 deletion bia-ingest-shared-models/test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@
import pytest
from bia_ingest_sm.biostudies import Submission


@pytest.fixture
def base_path() -> Path:
    """Provide the directory that contains this conftest module."""
    this_file = Path(__file__)
    return this_file.parent


@pytest.fixture
def test_submission(base_path: Path) -> Submission:
    """Provide a Submission built from the bundled ``S-BIADTEST.json`` file.

    Args:
        base_path: Directory fixture under which ``data/S-BIADTEST.json`` is
            looked up.

    Returns:
        The Submission obtained by validating the parsed JSON document.
    """
    submission_path = base_path / "data" / "S-BIADTEST.json"
    # Decode explicitly as UTF-8: without an encoding argument read_text()
    # uses the locale default, which can mis-decode non-ASCII characters in
    # the JSON document on platforms whose default is not UTF-8.
    json_data = json.loads(submission_path.read_text(encoding="utf-8"))
    return Submission.model_validate(json_data)

10 changes: 4 additions & 6 deletions bia-ingest-shared-models/test/test_shared_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,21 @@
from .utils import bia_data_model, semantic_models
from bia_ingest_sm import conversion


@pytest.mark.parametrize(
("expected_model_func", "model_creation_func",),
(
(utils.get_test_affiliation, conversion.get_affiliation,),
(utils.get_test_contributor, conversion.get_contributor,),
(utils.get_test_grant, conversion.get_grant,),
(utils.get_test_study, conversion.get_study,),
# Not testing as we need to deal with links that are not proper
# urls
#(utils.get_test_external_reference, conversion.get_external_reference,),
# (utils.get_test_external_reference, conversion.get_external_reference,),
# Do not test semantic_models.Publication yet. Need to resolve
# issues around some fields being mandatory or optional
#(utils.get_test_publication, conversion.get_publication,),
#(bia_data_model.Study, conversion.get_study_from_submission,),
# (utils.get_test_publication, conversion.get_publication,),
# (bia_data_model.Study, conversion.get_study_from_submission,),
),
)
def test_create_models(expected_model_func, model_creation_func, test_submission):
Expand Down
2 changes: 1 addition & 1 deletion bia-ingest-shared-models/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ def get_test_study() -> bia_data_model.Study:
"experimental_imaging_component": [],
"annotation_component": [],
}
study_uuid = dict_to_uuid(study_dict, ["accession_id", "title", "release_date",])
study_uuid = dict_to_uuid(study_dict, ["accession_id", ])
study_dict["uuid"] = study_uuid
study = bia_data_model.Study.model_validate(study_dict)
return study

0 comments on commit 024b5cb

Please sign in to comment.