Skip to content

Commit

Permalink
Include Ensembl Tark API for reference retrieval (#21)
Browse files Browse the repository at this point in the history
* Retrieve transcript info with version number using Tark

* Update retrieval order

* Add the Tark API URL to the configuration

* Update CLI

* Implement sequence and annotation retrieval for each model type

* Cleaning

* Add tests

---------

Authored-by: Xiaoyun Liu <[email protected]>
  • Loading branch information
XLIU-hub authored Apr 30, 2024
1 parent d9bbe31 commit b623aa5
Show file tree
Hide file tree
Showing 29 changed files with 1,838 additions and 252 deletions.
2 changes: 1 addition & 1 deletion mutalyzer_retriever/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def _parse_args(args):
parser.add_argument("--id", help="the reference id")

parser.add_argument(
"-s", "--source", help="retrieval source", choices=["ncbi", "ensembl", "lrg"]
"-s", "--source", help="retrieval source", choices=["ncbi", "ensembl", "ensembl_tark", "ensembl_rest", "lrg"]
)

parser.add_argument(
Expand Down
1 change: 1 addition & 0 deletions mutalyzer_retriever/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"MAX_FILE_SIZE": 10 * 1048576,
"ENSEMBL_API": "https://rest.ensembl.org",
"ENSEMBL_API_GRCH37": "https://grch37.rest.ensembl.org",
"ENSEMBL_TARK_API":"https://tark.ensembl.org/api",
}


Expand Down
4 changes: 3 additions & 1 deletion mutalyzer_retriever/parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .parsers import fasta, gff3, lrg
from .parsers import fasta, gff3, json_ensembl, lrg


def _get_reference_type(content):
Expand All @@ -19,6 +19,8 @@ def parse(reference_content, reference_type=None, reference_source=None):
model = gff3.parse(reference_content, reference_source)
elif reference_type == "fasta":
model = fasta.parse(reference_content)
elif reference_type == "json":
model = json_ensembl.parse(reference_content)
else:
return None

Expand Down
159 changes: 159 additions & 0 deletions mutalyzer_retriever/parsers/json_ensembl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import requests
from ..util import make_location, f_e


def _feature(raw_dict):
    """Build the minimal internal feature (id + location) from a Tark entry.

    Only the stable id and the location are copied over. The start
    coordinate is shifted down by one because Tark locations are 1-based
    while the internal model is 0-based; the end coordinate is passed on
    unchanged. The strand is optional in the input (``None`` when absent).
    """
    identifier = raw_dict["stable_id"]
    zero_based_start = raw_dict["loc_start"] - 1
    end = raw_dict["loc_end"]
    strand = raw_dict.get("loc_strand")
    return {
        "id": identifier,
        "location": make_location(zero_based_start, end, strand),
    }


def _annotations(ref_id, location, features):
    """Wrap a location and its features into a top-level "record" entry.

    The arguments are stored as given (no copies are made) under the keys
    ``id``, ``location`` and ``features``; ``type`` is always ``"record"``.
    """
    record = {"id": ref_id, "type": "record"}
    record["location"] = location
    record["features"] = features
    return record


def _exons(tark_exons):
    """Turn the exon entries of a Tark response into internal exon features.

    Each entry is converted with ``_feature`` and tagged with type "exon";
    the input order is preserved.
    """
    return [{**_feature(entry), "type": "exon"} for entry in tark_exons]


def _translation(tark_translations):
    """Convert the translations of one Tark transcript into CDS features.

    - ``None`` or an empty list in the input (non-coding RNA): return an
      empty list;
    - one value for coding RNA in input: return a list of one item;
    - rarely, multiple values for coding RNA with different versions:
      return a list of multiple items.

    Each returned item is the ``_feature`` conversion of the input entry
    with its type set to "CDS"; the input order is preserved.
    """
    # A null "translations" field must behave like an empty list instead of
    # failing on iteration.
    if not tark_translations:
        return []
    return [{**_feature(entry), "type": "CDS"} for entry in tark_translations]


def _transcript(tark_transcript, exon_features, translation_feature):
    """Build the internal transcript feature list from a Tark transcript.

    The result is a one-item list. The transcript type is the Tark
    biotype, except that "protein_coding" is mapped to the internal type
    "mRNA". Exon features come first in the sub-features, followed by the
    translation features.
    """
    feature = _feature(tark_transcript)
    biotype = tark_transcript["biotype"]
    feature["type"] = "mRNA" if biotype == "protein_coding" else biotype
    feature["qualifiers"] = dict(
        assembly_name=tark_transcript["assembly"],
        version=str(tark_transcript["stable_id_version"]),
        tag="basic",
    )
    feature["features"] = exon_features + translation_feature
    return [feature]


def _gene(tark_gene, gene_feature):
    """Build the internal gene feature list from a Tark gene entry.

    The result is a one-item list whose single gene carries the assembly,
    version (as a string) and name as qualifiers, and ``gene_feature`` as
    its sub-features.
    """
    feature = _feature(tark_gene)
    feature["type"] = "gene"
    feature["qualifiers"] = dict(
        assembly_name=tark_gene["assembly"],
        version=str(tark_gene["stable_id_version"]),
        name=tark_gene["name"],
    )
    feature["features"] = gene_feature
    return [feature]


def _seq_from_rest(assembly, chr_idx, strand, loc_start, loc_end, timeout=1):
    """Retrieve the sequence of a genomic region from the Ensembl REST API.

    Args:
        assembly: Assembly name; only "GRCh38" and "GRCh37" are supported.
        chr_idx: Region name placed in the request path.
        strand: Strand value appended to the region specification.
        loc_start: Region start, sent to the API unchanged.
        loc_end: Region end, sent to the API unchanged.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The response body as text.

    Raises:
        NameError: If the assembly is not supported, or if the response
            status is not OK.
    """
    if assembly == "GRCh38":
        server = "https://rest.ensembl.org"
    elif assembly == "GRCh37":
        server = "https://grch37.rest.ensembl.org"
    else:
        # The message must be an f-string, otherwise the placeholder is
        # emitted literally instead of the offending assembly name.
        raise NameError(f"Unsupported assembly {assembly}")
    ext = f"/sequence/region/human/{chr_idx}:{loc_start}..{loc_end}:{strand}?"
    r = requests.get(
        server + ext, headers={"Content-Type": "text/plain"}, timeout=timeout
    )
    if not r.ok:
        # Keep NameError (callers catch it) but say what actually failed.
        raise NameError(
            f"Ensembl REST sequence request failed with status {r.status_code}"
        )
    return r.text


def _sequence(tark_result):
    """Build the sequence part of the model for one Tark result.

    The sequence itself is fetched through ``_seq_from_rest`` using the
    assembly, region, strand, start and end of the result. The description
    is "<stable_id>.<version>" followed by a space and a colon-separated
    "chromosome:<assembly>:<region>:<start>:<end>:<strand>" string.
    """
    seq = _seq_from_rest(
        tark_result["assembly"],
        tark_result["loc_region"],
        tark_result["loc_strand"],
        tark_result["loc_start"],
        tark_result["loc_end"],
    )
    versioned_id = "{}.{}".format(
        tark_result["stable_id"], tark_result["stable_id_version"]
    )
    coordinates = [
        "chromosome",
        tark_result["assembly"],
        *(
            str(tark_result[key])
            for key in ("loc_region", "loc_start", "loc_end", "loc_strand")
        ),
    ]
    return {
        "seq": seq,
        "description": versioned_id + " " + ":".join(coordinates),
    }


def parse(tark_results):
    """Convert a Tark json response into the retriever model.

    - the last entry of the "results" list is the one that gets converted;
    - among its genes, the entry that sorts last by
      (stable_id_version, has a name) is used, so for equal versions a gene
      with a "name" wins over one without.

    Returns a dictionary with "annotations" and "sequence" keys.

    Raises NameError when the response holds no results.
    """
    records = tark_results.get("results")
    if not records:
        raise NameError(f_e("ensembl tark", e=None, extra="returns no results"))
    latest = records[-1]

    exons = _exons(latest["exons"])
    coding_regions = _translation(latest["translations"])
    transcripts = _transcript(latest, exons, coding_regions)

    def _gene_rank(candidate):
        # Higher version first; for the same version prefer a named gene.
        return candidate["stable_id_version"], 0 if candidate["name"] is None else 1

    chosen_gene = sorted(latest["genes"], key=_gene_rank)[-1]
    genes = _gene(chosen_gene, gene_feature=transcripts)

    region = latest["loc_region"]
    record_location = make_location(latest["loc_start"] - 1, latest["loc_end"])
    annotations = _annotations(region, record_location, genes)

    # The sequence is fetched last, once the annotations were built.
    return {"annotations": annotations, "sequence": _sequence(latest)}
35 changes: 22 additions & 13 deletions mutalyzer_retriever/retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class NoReferenceError(Exception):
def __init__(self, status, uncertain_sources):
self.uncertain_sources = uncertain_sources
message = ""
if uncertain_sources is not []:
if uncertain_sources != []:
message = f"\n\nUncertain sources: {', '.join(uncertain_sources)}\n"

for source in status.keys():
Expand Down Expand Up @@ -50,12 +50,14 @@ def _raise_error(status):
and isinstance(status[source]["errors"][0], NameError)
):
uncertain_sources.append(source)
if uncertain_sources is []:
if uncertain_sources == []:
raise NoReferenceRetrieved
raise NoReferenceError(status, uncertain_sources)


def _fetch_unknown_source(reference_id, reference_type, size_off=True, timeout=1):
def _fetch_unknown_source(
reference_id, reference_type, reference_source, size_off=True, timeout=1
):

status = {"lrg": {"errors": []}, "ncbi": {"errors": []}, "ensembl": {"errors": []}}

Expand All @@ -69,9 +71,7 @@ def _fetch_unknown_source(reference_id, reference_type, size_off=True, timeout=1
return reference_content, "lrg", "lrg"
else:
status["lrg"]["errors"].append(
ValueError(
"Lrg fetch does not support '{}' reference type.".format(reference_type)
)
ValueError(f"Lrg fetch does not support '{reference_type}' reference type.")
)

# NCBI
Expand All @@ -89,7 +89,7 @@ def _fetch_unknown_source(reference_id, reference_type, size_off=True, timeout=1
# Ensembl
try:
reference_content, reference_type = ensembl.fetch(
reference_id, reference_type, timeout
reference_id, reference_type, reference_source, timeout
)
except (NameError, ConnectionError, ValueError) as e:
status["ensembl"]["errors"].append(e)
Expand Down Expand Up @@ -122,21 +122,20 @@ def retrieve_raw(

if reference_source is None:
reference_content, reference_type, reference_source = _fetch_unknown_source(
reference_id, reference_type, size_off, timeout
reference_id, reference_type, reference_source, size_off, timeout
)
elif reference_source == "ncbi":
reference_content, reference_type = ncbi.fetch(
reference_id, reference_type, timeout
)
elif reference_source == "ensembl":
elif reference_source in ["ensembl", "ensembl_tark", "ensembl_rest"]:
reference_content, reference_type = ensembl.fetch(
reference_id, reference_type, timeout
reference_id, reference_type, reference_source, timeout
)
elif reference_source == "lrg":
reference_content = lrg.fetch_lrg(reference_id, timeout=timeout)
if reference_content:
reference_type = "lrg"

return reference_content, reference_type, reference_source


Expand Down Expand Up @@ -167,9 +166,9 @@ def retrieve_model(
model = parser.parse(reference_content, reference_type, reference_source)
if model_type == "all":
return model
elif model_type == "sequence":
if model_type == "sequence":
return model["sequence"]
elif model_type == "annotations":
if model_type == "annotations":
return model["annotations"]
elif reference_type == "gff3":
if model_type == "all":
Expand All @@ -195,6 +194,16 @@ def retrieve_model(
"sequence": parser.parse(reference_content, "fasta"),
}

elif reference_type == "json":
if "ensembl" in reference_source:
json_model = parser.parse(reference_content, "json")
if model_type == "all":
return json_model
elif model_type == "annotations":
return json_model["annotations"]
elif model_type == "sequence":
return json_model["sequence"]["seq"]


def retrieve_model_from_file(paths=[], is_lrg=False):
"""
Expand Down
Loading

0 comments on commit b623aa5

Please sign in to comment.