Include Ensembl Tark API for reference retrieval (#21)

* Retrieve transcript info with version number using Tark
* Update retrieval order
* Add Tark API in configuration
* Update CLI
* Implement retrieve sequence and annotation for model type
* Cleaning
* Add tests

---------

Authored-by: Xiaoyun Liu <[email protected]>
mutalyzer · Apr 30, 2024 · b623aa5 · b623aa5
1 parent d9bbe31
commit b623aa5
Show file tree

Hide file tree

Showing 29 changed files with 1,838 additions and 252 deletions.
diff --git a/mutalyzer_retriever/cli.py b/mutalyzer_retriever/cli.py
@@ -27,7 +27,7 @@ def _parse_args(args):
     parser.add_argument("--id", help="the reference id")
 
     parser.add_argument(
-        "-s", "--source", help="retrieval source", choices=["ncbi", "ensembl", "lrg"]
+        "-s", "--source", help="retrieval source", choices=["ncbi", "ensembl", "ensembl_tark", "ensembl_rest", "lrg"]
     )
 
     parser.add_argument(

diff --git a/mutalyzer_retriever/configuration.py b/mutalyzer_retriever/configuration.py
@@ -10,6 +10,7 @@
     "MAX_FILE_SIZE": 10 * 1048576,
     "ENSEMBL_API": "https://rest.ensembl.org",
     "ENSEMBL_API_GRCH37": "https://grch37.rest.ensembl.org",
+    "ENSEMBL_TARK_API":"https://tark.ensembl.org/api",
 }
 
 

diff --git a/mutalyzer_retriever/parser.py b/mutalyzer_retriever/parser.py
@@ -1,4 +1,4 @@
-from .parsers import fasta, gff3, lrg
+from .parsers import fasta, gff3, json_ensembl, lrg
 
 
 def _get_reference_type(content):
@@ -19,6 +19,8 @@ def parse(reference_content, reference_type=None, reference_source=None):
         model = gff3.parse(reference_content, reference_source)
     elif reference_type == "fasta":
         model = fasta.parse(reference_content)
+    elif reference_type == "json":
+        model = json_ensembl.parse(reference_content)
     else:
         return None
 

diff --git a/mutalyzer_retriever/parsers/json_ensembl.py b/mutalyzer_retriever/parsers/json_ensembl.py
@@ -0,0 +1,159 @@
+import requests
+from ..util import make_location, f_e
+
+
+def _feature(raw_dict):
+    """Build the minimal internal feature (id and location) from a Tark entry.
+       - only the stable id and the location are taken over;
+       - Tark locations are 1-based while our model is 0-based, so the start
+         is shifted down by one and the end is passed on unchanged;
+       - the strand is optional and is looked up with ``dict.get``.
+    """
+    feature_id = raw_dict["stable_id"]
+    start = raw_dict["loc_start"] - 1
+    end = raw_dict["loc_end"]
+    strand = raw_dict.get("loc_strand")
+    return {"id": feature_id, "location": make_location(start, end, strand)}
+
+
+def _annotations(ref_id, location, features):
+    """Wrap the given features in the top-level "record" annotation entry.
+       - ref_id, location and features are stored as passed in, without copying.
+    """
+    record = {"id": ref_id, "type": "record"}
+    record["location"] = location
+    record["features"] = features
+    return record
+
+
+def _exons(tark_exons):
+    """Translate the exon entries of a Tark response into internal exon features.
+       - every entry is reduced to id and location and tagged with type "exon";
+       - the order of the input list is kept.
+    """
+    return [{**_feature(tark_exon), "type": "exon"} for tark_exon in tark_exons]
+
+
+def _translation(tark_translations):
+    """Convert translations per transcript from tark list into internal translation list.
+       - null (None) or an empty list for non-coding RNA in input, return an
+         empty list;
+       - one value for coding RNA in input, return a list of one item;
+       - rarely multiple values for coding RNA in input with different versions,
+         return a list of multiple items.
+    """
+    translations = []
+    # A null "translations" field arrives here as None; iterating over it
+    # directly would raise TypeError instead of yielding the promised
+    # empty list.
+    for tark_translation in tark_translations or []:
+        translation = _feature(tark_translation)
+        translation["type"] = "CDS"
+        translations.append(translation)
+    return translations
+
+
+def _transcript(tark_transcript, exon_features, translation_feature):
+    """Build the internal transcript feature, returned as a one-item list.
+       - the Tark biotype is used as type, except that "protein_coding" is
+         mapped to the internal RNA type "mRNA";
+       - exon and translation features are concatenated, exons first, into
+         the transcript's sub-features.
+    """
+    transcript = _feature(tark_transcript)
+    biotype = tark_transcript["biotype"]
+    transcript["type"] = "mRNA" if biotype == "protein_coding" else biotype
+    qualifiers = {
+        "assembly_name": tark_transcript["assembly"],
+        "version": str(tark_transcript["stable_id_version"]),
+        "tag": "basic",
+    }
+    transcript["qualifiers"] = qualifiers
+    transcript["features"] = exon_features + translation_feature
+    return [transcript]
+
+
+def _gene(tark_gene, gene_feature):
+    """Build the internal gene feature, returned as a one-item list.
+       - assembly, version (as a string) and name go into the qualifiers;
+       - gene_feature becomes the gene's sub-features as passed in.
+    """
+    gene = _feature(tark_gene)
+    qualifiers = {
+        "assembly_name": tark_gene["assembly"],
+        "version": str(tark_gene["stable_id_version"]),
+        "name": tark_gene["name"],
+    }
+    gene.update(type="gene", qualifiers=qualifiers, features=gene_feature)
+    return [gene]
+
+
+def _seq_from_rest(assembly, chr_idx, strand, loc_start, loc_end, timeout=1):
+    """Retrieve the sequence of a genomic region from the Ensembl REST API.
+       - assembly selects the server: "GRCh38" or "GRCh37";
+       - chr_idx, loc_start, loc_end and strand are placed verbatim in the
+         ``/sequence/region/human/`` request path;
+       - timeout is handed to ``requests.get``;
+       - returns the response body as text;
+       - raises NameError for an unsupported assembly or a non-OK response.
+    """
+    if assembly == "GRCh38":
+        server = "https://rest.ensembl.org"
+    elif assembly == "GRCh37":
+        server = "https://grch37.rest.ensembl.org"
+    else:
+        # The message has to be an f-string, otherwise the literal
+        # "{assembly}" placeholder is reported instead of the value.
+        raise NameError(f"Unsupported assembly {assembly}")
+    ext = f"/sequence/region/human/{chr_idx}:{loc_start}..{loc_end}:{strand}?"
+    r = requests.get(
+        server + ext, headers={"Content-Type": "text/plain"}, timeout=timeout
+    )
+    if not r.ok:
+        # Same exception type as before, now with enough context to diagnose.
+        raise NameError(
+            f"Ensembl REST sequence request for {assembly} region "
+            f"{chr_idx}:{loc_start}..{loc_end}:{strand} failed "
+            f"with status {r.status_code}"
+        )
+    return r.text
+
+
+def _sequence(tark_result):
+    """Build the internal sequence entry for a Tark result.
+       - "seq" is fetched from the Ensembl REST API for the result's region;
+       - "description" is "<stable_id>.<version>" followed by a space and
+         "chromosome:<assembly>:<region>:<start>:<end>:<strand>".
+    """
+    # The sequence is fetched first, before the description is assembled.
+    seq = _seq_from_rest(
+        tark_result["assembly"],
+        tark_result["loc_region"],
+        tark_result["loc_strand"],
+        tark_result["loc_start"],
+        tark_result["loc_end"],
+    )
+    versioned_id = f"{tark_result['stable_id']}.{tark_result['stable_id_version']}"
+    region = (
+        f"chromosome:{tark_result['assembly']}:{tark_result['loc_region']}"
+        f":{tark_result['loc_start']}:{tark_result['loc_end']}"
+        f":{tark_result['loc_strand']}"
+    )
+    return {"seq": seq, "description": f"{versioned_id} {region}"}
+
+
+def parse(tark_results):
+    """Convert the Tark json response into the retriever model json output.
+       - the last entry of "results" is used (presumably the latest version
+         when no specific one was requested; verify against the Tark ordering);
+       - for genes, the one with the highest version is taken, preferring an
+         entry with a "name" when versions are equal;
+       - raises NameError when the response holds no results.
+    """
+    results = tark_results.get("results")
+    if not results:
+        raise NameError(f_e("ensembl tark", e=None, extra="returns no results"))
+    tark_result = results[-1]
+
+    exons = _exons(tark_result["exons"])
+    translations = _translation(tark_result["translations"])
+    transcripts = _transcript(tark_result, exons, translations)
+
+    def _gene_rank(tark_gene):
+        # Rank by version first; a named gene wins a tie over an unnamed one.
+        has_name = 0 if tark_gene["name"] is None else 1
+        return (tark_gene["stable_id_version"], has_name)
+
+    best_gene = sorted(tark_result["genes"], key=_gene_rank)[-1]
+    gene_feature = _gene(best_gene, gene_feature=transcripts)
+
+    # Annotations are built before the sequence is fetched.
+    record_location = make_location(
+        tark_result["loc_start"] - 1, tark_result["loc_end"]
+    )
+    annotations = _annotations(
+        tark_result["loc_region"], record_location, gene_feature
+    )
+    sequence = _sequence(tark_result)
+    return {"annotations": annotations, "sequence": sequence}
diff --git a/mutalyzer_retriever/retriever.py b/mutalyzer_retriever/retriever.py
@@ -18,7 +18,7 @@ class NoReferenceError(Exception):
     def __init__(self, status, uncertain_sources):
         self.uncertain_sources = uncertain_sources
         message = ""
-        if uncertain_sources is not []:
+        if uncertain_sources != []:
             message = f"\n\nUncertain sources: {', '.join(uncertain_sources)}\n"
 
         for source in status.keys():
@@ -50,12 +50,14 @@ def _raise_error(status):
             and isinstance(status[source]["errors"][0], NameError)
         ):
             uncertain_sources.append(source)
-    if uncertain_sources is []:
+    if uncertain_sources == []:
         raise NoReferenceRetrieved
     raise NoReferenceError(status, uncertain_sources)
 
 
-def _fetch_unknown_source(reference_id, reference_type, size_off=True, timeout=1):
+def _fetch_unknown_source(
+    reference_id, reference_type, reference_source, size_off=True, timeout=1
+):
 
     status = {"lrg": {"errors": []}, "ncbi": {"errors": []}, "ensembl": {"errors": []}}
 
@@ -69,9 +71,7 @@ def _fetch_unknown_source(reference_id, reference_type, size_off=True, timeout=1
             return reference_content, "lrg", "lrg"
     else:
         status["lrg"]["errors"].append(
-            ValueError(
-                "Lrg fetch does not support '{}' reference type.".format(reference_type)
-            )
+            ValueError(f"Lrg fetch does not support '{reference_type}' reference type.")
         )
 
     # NCBI
@@ -89,7 +89,7 @@ def _fetch_unknown_source(reference_id, reference_type, size_off=True, timeout=1
     # Ensembl
     try:
         reference_content, reference_type = ensembl.fetch(
-            reference_id, reference_type, timeout
+            reference_id, reference_type, reference_source, timeout
         )
     except (NameError, ConnectionError, ValueError) as e:
         status["ensembl"]["errors"].append(e)
@@ -122,21 +122,20 @@ def retrieve_raw(
 
     if reference_source is None:
         reference_content, reference_type, reference_source = _fetch_unknown_source(
-            reference_id, reference_type, size_off, timeout
+            reference_id, reference_type, reference_source, size_off, timeout
         )
     elif reference_source == "ncbi":
         reference_content, reference_type = ncbi.fetch(
             reference_id, reference_type, timeout
         )
-    elif reference_source == "ensembl":
+    elif reference_source in ["ensembl", "ensembl_tark", "ensembl_rest"]:
         reference_content, reference_type = ensembl.fetch(
-            reference_id, reference_type, timeout
+            reference_id, reference_type, reference_source, timeout
         )
     elif reference_source == "lrg":
         reference_content = lrg.fetch_lrg(reference_id, timeout=timeout)
         if reference_content:
             reference_type = "lrg"
-
     return reference_content, reference_type, reference_source
 
 
@@ -167,9 +166,9 @@ def retrieve_model(
         model = parser.parse(reference_content, reference_type, reference_source)
         if model_type == "all":
             return model
-        elif model_type == "sequence":
+        if model_type == "sequence":
             return model["sequence"]
-        elif model_type == "annotations":
+        if model_type == "annotations":
             return model["annotations"]
     elif reference_type == "gff3":
         if model_type == "all":
@@ -195,6 +194,16 @@ def retrieve_model(
             "sequence": parser.parse(reference_content, "fasta"),
         }
 
+    elif reference_type == "json":
+        if "ensembl" in reference_source:
+            json_model = parser.parse(reference_content, "json")
+            if model_type == "all":
+                return json_model
+            elif model_type == "annotations":
+                return json_model["annotations"]
+            elif model_type == "sequence":
+                return json_model["sequence"]["seq"]
+
 
 def retrieve_model_from_file(paths=[], is_lrg=False):
     """