geneontology · sierra-moxon · Mar 15, 2024 · Mar 14, 2024 · Mar 14, 2024 · Mar 15, 2024
diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gopreprocess"
-version = "0.0.0"
+version = "0.0.0.post330.dev0+2158c39"
 description = "gopreprocess"
 authors = ["Sierra Moxon <[email protected]>"]
 license = "MIT"
@@ -18,11 +18,8 @@ pyproject-toml = "^0.0.10"
 [tool.poetry.group.dev.dependencies]
 coloredlogs = { version = "^15.0.1", optional = true }
 pytest = { version = ">=7.1.2", optional = true }
-sphinx = { version = ">=6.1.3", optional = true }
-sphinx-rtd-theme = { version = ">=1.0.0", optional = true }
-sphinx-autodoc-typehints = { version = ">=1.2.0", optional = true }
-sphinx-click = { version = ">=4.3.0", optional = true }
 tox = "^4.6.4"
+pytest-mock = "^3.12.0"
 
 [tool.poetry.scripts]
 convert_annotations = "src.gopreprocess.cli:convert_annotations"
@@ -36,16 +33,10 @@ convert_gpad = "src.gopreprocess.cli:convert_noctua_gpad_1_2_to_2_0_annotations"
 validate_merged_gafs = "src.gopreprocess.cli:validate_merged_gafs"
 
 [tool.poetry.extras]
-docs = [
-    "sphinx",
-    "sphinx-rtd-theme",
-    "sphinx-autodoc-typehints",
-    "sphinx-click",
-]
 tests = ["pytest"]
 
 [tool.poetry-dynamic-versioning]
-enable = true
+enable = false
 vcs = "git"
 style = "pep440"
 
@@ -55,7 +46,7 @@ requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"]
 build-backend = "poetry_dynamic_versioning.backend"
 
 [tool.black]
-line-length = 120
+line-length = 170
 target-version = ["py39", "py310"]
 
 [tool.ruff]
@@ -65,8 +56,9 @@ extend-ignore = [
     "D401", # `First line of docstring should be in imperative mood`
     "S101", # `Use of assert detected. It would be nice to keep this check for everything but the BDD tests.`
     "B008", # `Do not perform function calls in argument defaults.`
+    "F841", # `local variable 'x' is assigned to but never used`
 ]
-line-length = 120
+line-length = 170
 
 # Allow autofix for all enabled rules (when `--fix` is provided).
 fixable = ["ALL"]

diff --git a/src/config/download_config.yaml b/src/config/download_config.yaml
@@ -14,6 +14,8 @@ MGI_XREF:
   url: https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt
 GO:
   url: http://skyhook.berkeleybop.org/go-ontology-dev/ontology/go.json
+GO_RELEASE:
+  url: http://skyhook.berkeleybop.org/release/ontology/go.json
 GOA_taxon_10090:
   url: https://ftp.ebi.ac.uk/pub/databases/GO/goa/MOUSE/goa_mouse.gaf.gz
 GOA_taxon_10090_ISOFORM:

diff --git a/src/gopreprocess/cli.py b/src/gopreprocess/cli.py
@@ -100,9 +100,7 @@ def check_errors(errors: list) -> int:
 @click.option(
     "--namespaces",
     default=["RGD", "UniProtKB"],
-    help="List of providers in the source GAF that should be "
-    "used to retrieve source annotations for conversion. "
-    "e.g. [RGD, HGNC, UniProtKB]",
+    help="List of providers in the source GAF that should be " "used to retrieve source annotations for conversion. " "e.g. [RGD, HGNC, UniProtKB]",
 )
 @click.option(
     "--target_taxon",

diff --git a/src/gopreprocess/file_processors/gaf_processor.py b/src/gopreprocess/file_processors/gaf_processor.py
@@ -120,14 +120,9 @@ def parse_ortho_gaf(self):
                         )
                     ):
                         continue
-                    if self.source is None and (
-                        source_assoc.provided_by == self.taxon_to_provider[self.target_taxon]
-                        or source_assoc.provided_by == "GO_Central"
-                    ):
+                    if self.source is None and (source_assoc.provided_by == self.taxon_to_provider[self.target_taxon] or source_assoc.provided_by == "GO_Central"):
                         continue
-                    has_reference = any(
-                        reference.namespace == "PMID" for reference in source_assoc.evidence.has_supporting_reference
-                    )
+                    has_reference = any(reference.namespace == "PMID" for reference in source_assoc.evidence.has_supporting_reference)
                     if not has_reference:
                         counter = counter + 1
                     if str(source_assoc.object.id) in ["GO:0005515", "GO:0005488"]:
@@ -141,9 +136,7 @@ def parse_ortho_gaf(self):
                             # if it's in the mapped dictionary, then we can replace the UniProt identifier with the
                             # HGNC identifier, formatting that as a Curie with separate Namespace and ID fields.
                             mapped_id = self.uniprot_to_hgnc_map[str(source_assoc.subject.id)]
-                            source_assoc.subject.id = Curie(
-                                namespace=mapped_id.split(":")[0], identity=mapped_id.split(":")[1]
-                            )
+                            source_assoc.subject.id = Curie(namespace=mapped_id.split(":")[0], identity=mapped_id.split(":")[1])
                     self.convertible_annotations.append(source_assoc)
         return self.convertible_annotations
 
@@ -177,10 +170,7 @@ def parse_p2g_gaf(self):
                         )
                     ):
                         continue
-                    if self.source is None and (
-                        source_assoc.provided_by == self.taxon_to_provider[self.target_taxon]
-                        or source_assoc.provided_by == "GO_Central"
-                    ):
+                    if self.source is None and (source_assoc.provided_by == self.taxon_to_provider[self.target_taxon] or source_assoc.provided_by == "GO_Central"):
                         continue
                     if str(source_assoc.evidence.type) in experimental_evidence_codes:
                         continue  # no IBAs

diff --git a/src/gopreprocess/goa_annotation_creation_controller.py b/src/gopreprocess/goa_annotation_creation_controller.py
@@ -14,9 +14,7 @@
 from src.utils.download import download_file, download_with_retry
 
 
-def generate_annotation(
-    annotation: GoAssociation, xrefs: dict, isoform: bool, protein_xrefs: dict, parent_xrefs: dict
-) -> Union[GoAssociation, None]:
+def generate_annotation(annotation: GoAssociation, xrefs: dict, isoform: bool, protein_xrefs: dict, parent_xrefs: dict) -> Union[GoAssociation, None]:
     """
     Generate a new annotation based on the given protein 2 GO annotation.
 
@@ -89,9 +87,7 @@ def generate_annotation(
         return None
 
 
-def get_source_annotations(
-    isoform: bool, taxon: str
-) -> tuple[dict, Any, Any, Any, Any] | tuple[dict, dict, Any, None, None]:
+def get_source_annotations(isoform: bool, taxon: str) -> tuple[dict, Any, Any, Any, Any] | tuple[dict, dict, Any, None, None]:
     """
     Get the source annotations from the protein 2 GO GAF file.
 
@@ -117,9 +113,7 @@ def get_source_annotations(
 
     if isoform:
         protein_xrefs, parent_xrefs = gpi_processor.get_protein_xrefs()
-        p2go_isoform_file = download_file(
-            target_directory_name=f"GOA_{taxon}_ISOFORM", config_key=f"GOA_{taxon}_ISOFORM", gunzip=True
-        )
+        p2go_isoform_file = download_file(target_directory_name=f"GOA_{taxon}_ISOFORM", config_key=f"GOA_{taxon}_ISOFORM", gunzip=True)
         gp_isoform = GafProcessor(filepath=p2go_isoform_file, source="GOA")
         source_isoform_annotations = gp_isoform.parse_p2g_gaf()
         return xrefs, protein_xrefs, source_annotations, source_isoform_annotations, parent_xrefs
@@ -162,9 +156,7 @@ def convert_annotations(self, isoform: bool, taxon: str) -> None:
         :returns: None
         """
         # Gather source annotations and cross-references
-        xrefs, protein_xrefs, source_annotations, isoform_annotations, parent_xrefs = get_source_annotations(
-            isoform=isoform, taxon=taxon
-        )
+        xrefs, protein_xrefs, source_annotations, isoform_annotations, parent_xrefs = get_source_annotations(isoform=isoform, taxon=taxon)
 
         # Convert source annotations to target format
         converted_target_annotations = [

diff --git a/src/gopreprocess/ortho_annotation_creation_controller.py b/src/gopreprocess/ortho_annotation_creation_controller.py
@@ -32,9 +32,7 @@ def convert_curie_to_string(x):
     return x
 
 
-def dump_converted_annotations(
-    converted_target_annotations: List[List[str]], source_taxon: str, target_taxon: str
-) -> None:
+def dump_converted_annotations(converted_target_annotations: List[List[str]], source_taxon: str, target_taxon: str) -> None:
     """
     Dumps the converted annotations to a TSV file.
 
@@ -109,20 +107,14 @@ def dump_converted_annotations(
         key=taxon_to_provider[target_taxon],
         obj=df_final,
         sep="\t",
-        name=taxon_to_provider[target_taxon].lower()
-        + "-"
-        + taxon_to_provider[source_taxon].lower()
-        + "-ortho-temp.gaf",
+        name=taxon_to_provider[target_taxon].lower() + "-" + taxon_to_provider[source_taxon].lower() + "-ortho-temp.gaf",
         to_csv_kwargs={"index": False, "header": False},
     )
 
     # we need to add the #gaf-version: 2.2 header to the file
     filepath = pystow.join(
         key=taxon_to_provider[target_taxon],
-        name=taxon_to_provider[target_taxon].lower()
-        + "-"
-        + taxon_to_provider[source_taxon].lower()
-        + "-ortho-temp.gaf",
+        name=taxon_to_provider[target_taxon].lower() + "-" + taxon_to_provider[source_taxon].lower() + "-ortho-temp.gaf",
         ensure_exists=True,
     )
 
@@ -253,9 +245,7 @@ def convert_annotations(self) -> None:
                 for new_annotation in new_annotations:
                     converted_target_annotations.append(new_annotation.to_gaf_2_2_tsv())
 
-        dump_converted_annotations(
-            converted_target_annotations, source_taxon=self.source_taxon, target_taxon=self.target_taxon
-        )
+        dump_converted_annotations(converted_target_annotations, source_taxon=self.source_taxon, target_taxon=self.target_taxon)
 
     def generate_annotation(
         self,
@@ -291,11 +281,7 @@ def generate_annotation(
 
         if str(annotation.subject.id) in source_genes.keys():
             for gene in source_genes[str(annotation.subject.id)]:
-                if (
-                    gene in transformed_source_genes
-                    and len(transformed_source_genes[gene]) > 1
-                    and go_aspector.is_biological_process(str(annotation.object.id))
-                ):
+                if gene in transformed_source_genes and len(transformed_source_genes[gene]) > 1 and go_aspector.is_biological_process(str(annotation.object.id)):
                     output = (
                         "NON_1TO1_BP"
                         + str(annotation.subject.id)
@@ -316,12 +302,8 @@ def generate_annotation(
                         uniprot_curie = Curie(namespace=uniprot_id.split(":")[0], identity=uniprot_id.split(":")[1])
                         new_annotation.evidence.with_support_from = [ConjunctiveSet(elements=[uniprot_curie])]
                     else:
-                        new_annotation.evidence.with_support_from = [
-                            ConjunctiveSet(elements=[str(annotation.subject.id)])
-                        ]
-                    new_annotation.evidence.has_supporting_reference = [
-                        Curie(namespace="GO_REF", identity=self.ortho_reference)
-                    ]
+                        new_annotation.evidence.with_support_from = [ConjunctiveSet(elements=[str(annotation.subject.id)])]
+                    new_annotation.evidence.has_supporting_reference = [Curie(namespace="GO_REF", identity=self.ortho_reference)]
                     # if there is only one human ortholog of the mouse gene and the annotation is not a biological
                     # process, then we add it, else we skip it. inferred from sequence similarity
                     new_annotation.evidence.type = Curie(namespace="ECO", identity=iso_eco_code.split(":")[1])
@@ -348,21 +330,13 @@ def generate_annotation(
                     date_object = Date(year=year, month=month, day=day, time="")
                     new_annotation.date = date_object
 
-                    new_annotation.subject.fullname = target_genes[taxon_to_provider[self.target_taxon] + ":" + gene][
-                        "fullname"
-                    ]
-                    new_annotation.subject.label = target_genes[taxon_to_provider[self.target_taxon] + ":" + gene][
-                        "label"
-                    ]
+                    new_annotation.subject.fullname = target_genes[taxon_to_provider[self.target_taxon] + ":" + gene]["fullname"]
+                    new_annotation.subject.label = target_genes[taxon_to_provider[self.target_taxon] + ":" + gene]["label"]
 
                     # have to convert these to curies in order for the conversion to
                     # GAF 2.2 type to return anything other than
                     # default 'gene_product' -- in ontobio, when this is a list, we just take the first item.
-                    new_annotation.subject.type = [
-                        map_gp_type_label_to_curie(
-                            target_genes[taxon_to_provider[self.target_taxon] + ":" + gene].get("type")[0]
-                        )
-                    ]
+                    new_annotation.subject.type = [map_gp_type_label_to_curie(target_genes[taxon_to_provider[self.target_taxon] + ":" + gene].get("type")[0])]
                     annotations.append(new_annotation)
 
         return annotations
diff --git a/src/utils/differ.py b/src/utils/differ.py
@@ -21,8 +21,6 @@ def compare_files(file1, file2, output):
     :type file2: str
     :param output: The prefix that will be appended to all the output files/reports created by this script.
     :type output: str
-    :param group_by_columns: Name of the target/second file to compare
-    :type group_by_columns: List
 
     """
     pd.set_option("display.max_rows", 50000)
@@ -103,9 +101,7 @@ def compare_associations(assocs1, assocs2, output):
     assocs1_set = set(assoc1_list)
     assocs2_set = set(assoc2_list)
 
-    common_elements, elements_unique_to_set1, elements_unique_to_set2 = compare_association_sets(
-        assocs1_set, assocs2_set
-    )
+    common_elements, elements_unique_to_set1, elements_unique_to_set2 = compare_association_sets(assocs1_set, assocs2_set)
     common_file_path = output + "_common_elements.txt"
     unique_set1_file_path = output + "_" + "unique_to_set1.txt"
     unique_set2_file_path = output + "_" + "unique_to_set2.txt"
@@ -296,9 +292,7 @@ def read_gaf_csv(filename) -> pd:
     for eco_code in ecomapping.mappings():
         for ev in new_df["Evidence_code"]:
             if eco_code[2] == ev:
-                new_df["Evidence_code"] = new_df["Evidence_code"].replace(
-                    [eco_code[2]], ecomapping.ecoclass_to_coderef(eco_code[2])[0]
-                )
+                new_df["Evidence_code"] = new_df["Evidence_code"].replace([eco_code[2]], ecomapping.ecoclass_to_coderef(eco_code[2])[0])
     return new_df
 
 
@@ -315,29 +309,21 @@ def read_gpad_csv(filename, version) -> pd:
 
     """
     if version.startswith("1"):
-        data_frame = pd.read_csv(
-            filename, comment="!", header=None, na_filter=False, engine="python", delimiter="\t", names=gpad_1_2_format
-        ).fillna("")
-        df = data_frame.filter(
-            ["db", "subject", "qualifiers", "relation", "object", "evidence_code", "reference"], axis=1
-        )
+        data_frame = pd.read_csv(filename, comment="!", header=None, na_filter=False, engine="python", delimiter="\t", names=gpad_1_2_format).fillna("")
+        df = data_frame.filter(["db", "subject", "qualifiers", "relation", "object", "evidence_code", "reference"], axis=1)
         concat_column = df["db"] + ":" + df["subject"]
         df["concat_column"] = concat_column
         filtered_df = df.filter(["concat_column", "qualifiers", "relation", "object", "evidence_code", "reference"])
         filtered_df.rename(columns={"concat_column": "subject"}, inplace=True)
         new_df = filtered_df
     else:
-        data_frame = pd.read_csv(
-            filename, comment="!", sep="\t", header=None, na_filter=False, names=gpad_2_0_format
-        ).fillna("")
+        data_frame = pd.read_csv(filename, comment="!", sep="\t", header=None, na_filter=False, names=gpad_2_0_format).fillna("")
         new_df = data_frame.filter(["subject", "negation", "relation", "object", "evidence_code", "reference"], axis=1)
     ecomapping = ecomap.EcoMap()
     for eco_code in ecomapping.mappings():
         for ev in new_df["evidence_code"]:
             if eco_code[2] == ev:
-                new_df["evidence_code"] = new_df["evidence_code"].replace(
-                    [eco_code[2]], ecomapping.ecoclass_to_coderef(eco_code[2])[0]
-                )
+                new_df["evidence_code"] = new_df["evidence_code"].replace([eco_code[2]], ecomapping.ecoclass_to_coderef(eco_code[2])[0])
 
     # normalize ids
     config = assocparser.AssocParserConfig()

diff --git a/src/utils/download.py b/src/utils/download.py
@@ -24,12 +24,8 @@ def download_files(source_taxon: str, target_taxon: str) -> tuple[Path, Path, Pa
     :param: target_taxon (str): The target taxon to which the annotations will be converted via orthology.
     """
     ortho_path = pystow.ensure_gunzip("ALLIANCE", url=get_url("ALLIANCE_ORTHO"), autoclean=True)
-    source_gaf_path = pystow.ensure_gunzip(
-        taxon_to_provider[source_taxon], url=get_url(taxon_to_provider[source_taxon]), autoclean=True
-    )
-    target_gpi_path = pystow.ensure_gunzip(
-        taxon_to_provider[target_taxon], url=get_url(taxon_to_provider[target_taxon] + "_GPI"), autoclean=True
-    )
+    source_gaf_path = pystow.ensure_gunzip(taxon_to_provider[source_taxon], url=get_url(taxon_to_provider[source_taxon]), autoclean=True)
+    target_gpi_path = pystow.ensure_gunzip(taxon_to_provider[target_taxon], url=get_url(taxon_to_provider[target_taxon] + "_GPI"), autoclean=True)
     return ortho_path, source_gaf_path, target_gpi_path