Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mock test files #47

Merged
merged 11 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 0 additions & 43 deletions .github/workflows/deploy-docs.yml

This file was deleted.

2,353 changes: 1,058 additions & 1,295 deletions poetry.lock

Large diffs are not rendered by default.

20 changes: 6 additions & 14 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gopreprocess"
version = "0.0.0"
version = "0.0.0.post330.dev0+2158c39"
description = "gopreprocess"
authors = ["Sierra Moxon <[email protected]>"]
license = "MIT"
Expand All @@ -18,11 +18,8 @@ pyproject-toml = "^0.0.10"
[tool.poetry.group.dev.dependencies]
coloredlogs = { version = "^15.0.1", optional = true }
pytest = { version = ">=7.1.2", optional = true }
sphinx = { version = ">=6.1.3", optional = true }
sphinx-rtd-theme = { version = ">=1.0.0", optional = true }
sphinx-autodoc-typehints = { version = ">=1.2.0", optional = true }
sphinx-click = { version = ">=4.3.0", optional = true }
tox = "^4.6.4"
pytest-mock = "^3.12.0"

[tool.poetry.scripts]
convert_annotations = "src.gopreprocess.cli:convert_annotations"
Expand All @@ -36,16 +33,10 @@ convert_gpad = "src.gopreprocess.cli:convert_noctua_gpad_1_2_to_2_0_annotations"
validate_merged_gafs = "src.gopreprocess.cli:validate_merged_gafs"

[tool.poetry.extras]
docs = [
"sphinx",
"sphinx-rtd-theme",
"sphinx-autodoc-typehints",
"sphinx-click",
]
tests = ["pytest"]

[tool.poetry-dynamic-versioning]
enable = true
enable = false
vcs = "git"
style = "pep440"

Expand All @@ -55,7 +46,7 @@ requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"]
build-backend = "poetry_dynamic_versioning.backend"

[tool.black]
line-length = 120
line-length = 170
target-version = ["py39", "py310"]

[tool.ruff]
Expand All @@ -65,8 +56,9 @@ extend-ignore = [
"D401", # `First line of docstring should be in imperative mood`
"S101", # `Use of assert detected. It would be nice to keep this check for everything but the BDD tests.`
"B008", # `Do not perform function calls in argument defaults.`
"F841", # `local variable 'x' is assigned to but never used`
]
line-length = 120
line-length = 170

# Allow autofix for all enabled rules (when `--fix` is provided).
fixable = ["ALL"]
Expand Down
2 changes: 2 additions & 0 deletions src/config/download_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ MGI_XREF:
url: https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt
GO:
url: http://skyhook.berkeleybop.org/go-ontology-dev/ontology/go.json
GO_RELEASE:
url: http://skyhook.berkeleybop.org/release/ontology/go.json
GOA_taxon_10090:
url: https://ftp.ebi.ac.uk/pub/databases/GO/goa/MOUSE/goa_mouse.gaf.gz
GOA_taxon_10090_ISOFORM:
Expand Down
4 changes: 1 addition & 3 deletions src/gopreprocess/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,7 @@ def check_errors(errors: list) -> int:
@click.option(
"--namespaces",
default=["RGD", "UniProtKB"],
help="List of providers in the source GAF that should be "
"used to retrieve source annotations for conversion. "
"e.g. [RGD, HGNC, UniProtKB]",
help="List of providers in the source GAF that should be " "used to retrieve source annotations for conversion. " "e.g. [RGD, HGNC, UniProtKB]",
)
@click.option(
"--target_taxon",
Expand Down
18 changes: 4 additions & 14 deletions src/gopreprocess/file_processors/gaf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,9 @@ def parse_ortho_gaf(self):
)
):
continue
if self.source is None and (
source_assoc.provided_by == self.taxon_to_provider[self.target_taxon]
or source_assoc.provided_by == "GO_Central"
):
if self.source is None and (source_assoc.provided_by == self.taxon_to_provider[self.target_taxon] or source_assoc.provided_by == "GO_Central"):
continue
has_reference = any(
reference.namespace == "PMID" for reference in source_assoc.evidence.has_supporting_reference
)
has_reference = any(reference.namespace == "PMID" for reference in source_assoc.evidence.has_supporting_reference)
if not has_reference:
counter = counter + 1
if str(source_assoc.object.id) in ["GO:0005515", "GO:0005488"]:
Expand All @@ -141,9 +136,7 @@ def parse_ortho_gaf(self):
# if it's in the mapped dictionary, then we can replace the UniProt identifier with the
# HGNC identifier, formatting that as a Curie with separate Namespace and ID fields.
mapped_id = self.uniprot_to_hgnc_map[str(source_assoc.subject.id)]
source_assoc.subject.id = Curie(
namespace=mapped_id.split(":")[0], identity=mapped_id.split(":")[1]
)
source_assoc.subject.id = Curie(namespace=mapped_id.split(":")[0], identity=mapped_id.split(":")[1])
self.convertible_annotations.append(source_assoc)
return self.convertible_annotations

Expand Down Expand Up @@ -177,10 +170,7 @@ def parse_p2g_gaf(self):
)
):
continue
if self.source is None and (
source_assoc.provided_by == self.taxon_to_provider[self.target_taxon]
or source_assoc.provided_by == "GO_Central"
):
if self.source is None and (source_assoc.provided_by == self.taxon_to_provider[self.target_taxon] or source_assoc.provided_by == "GO_Central"):
continue
if str(source_assoc.evidence.type) in experimental_evidence_codes:
continue # no IBAs
Expand Down
16 changes: 4 additions & 12 deletions src/gopreprocess/goa_annotation_creation_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,7 @@
from src.utils.download import download_file, download_with_retry


def generate_annotation(
annotation: GoAssociation, xrefs: dict, isoform: bool, protein_xrefs: dict, parent_xrefs: dict
) -> Union[GoAssociation, None]:
def generate_annotation(annotation: GoAssociation, xrefs: dict, isoform: bool, protein_xrefs: dict, parent_xrefs: dict) -> Union[GoAssociation, None]:
"""
Generate a new annotation based on the given protein 2 GO annotation.

Expand Down Expand Up @@ -89,9 +87,7 @@ def generate_annotation(
return None


def get_source_annotations(
isoform: bool, taxon: str
) -> tuple[dict, Any, Any, Any, Any] | tuple[dict, dict, Any, None, None]:
def get_source_annotations(isoform: bool, taxon: str) -> tuple[dict, Any, Any, Any, Any] | tuple[dict, dict, Any, None, None]:
"""
Get the source annotations from the protein 2 GO GAF file.

Expand All @@ -117,9 +113,7 @@ def get_source_annotations(

if isoform:
protein_xrefs, parent_xrefs = gpi_processor.get_protein_xrefs()
p2go_isoform_file = download_file(
target_directory_name=f"GOA_{taxon}_ISOFORM", config_key=f"GOA_{taxon}_ISOFORM", gunzip=True
)
p2go_isoform_file = download_file(target_directory_name=f"GOA_{taxon}_ISOFORM", config_key=f"GOA_{taxon}_ISOFORM", gunzip=True)
gp_isoform = GafProcessor(filepath=p2go_isoform_file, source="GOA")
source_isoform_annotations = gp_isoform.parse_p2g_gaf()
return xrefs, protein_xrefs, source_annotations, source_isoform_annotations, parent_xrefs
Expand Down Expand Up @@ -162,9 +156,7 @@ def convert_annotations(self, isoform: bool, taxon: str) -> None:
:returns: None
"""
# Gather source annotations and cross-references
xrefs, protein_xrefs, source_annotations, isoform_annotations, parent_xrefs = get_source_annotations(
isoform=isoform, taxon=taxon
)
xrefs, protein_xrefs, source_annotations, isoform_annotations, parent_xrefs = get_source_annotations(isoform=isoform, taxon=taxon)

# Convert source annotations to target format
converted_target_annotations = [
Expand Down
46 changes: 10 additions & 36 deletions src/gopreprocess/ortho_annotation_creation_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@ def convert_curie_to_string(x):
return x


def dump_converted_annotations(
converted_target_annotations: List[List[str]], source_taxon: str, target_taxon: str
) -> None:
def dump_converted_annotations(converted_target_annotations: List[List[str]], source_taxon: str, target_taxon: str) -> None:
"""
Dumps the converted annotations to a TSV file.

Expand Down Expand Up @@ -109,20 +107,14 @@ def dump_converted_annotations(
key=taxon_to_provider[target_taxon],
obj=df_final,
sep="\t",
name=taxon_to_provider[target_taxon].lower()
+ "-"
+ taxon_to_provider[source_taxon].lower()
+ "-ortho-temp.gaf",
name=taxon_to_provider[target_taxon].lower() + "-" + taxon_to_provider[source_taxon].lower() + "-ortho-temp.gaf",
to_csv_kwargs={"index": False, "header": False},
)

# we need to add the #gaf-version: 2.2 header to the file
filepath = pystow.join(
key=taxon_to_provider[target_taxon],
name=taxon_to_provider[target_taxon].lower()
+ "-"
+ taxon_to_provider[source_taxon].lower()
+ "-ortho-temp.gaf",
name=taxon_to_provider[target_taxon].lower() + "-" + taxon_to_provider[source_taxon].lower() + "-ortho-temp.gaf",
ensure_exists=True,
)

Expand Down Expand Up @@ -253,9 +245,7 @@ def convert_annotations(self) -> None:
for new_annotation in new_annotations:
converted_target_annotations.append(new_annotation.to_gaf_2_2_tsv())

dump_converted_annotations(
converted_target_annotations, source_taxon=self.source_taxon, target_taxon=self.target_taxon
)
dump_converted_annotations(converted_target_annotations, source_taxon=self.source_taxon, target_taxon=self.target_taxon)

def generate_annotation(
self,
Expand Down Expand Up @@ -291,11 +281,7 @@ def generate_annotation(

if str(annotation.subject.id) in source_genes.keys():
for gene in source_genes[str(annotation.subject.id)]:
if (
gene in transformed_source_genes
and len(transformed_source_genes[gene]) > 1
and go_aspector.is_biological_process(str(annotation.object.id))
):
if gene in transformed_source_genes and len(transformed_source_genes[gene]) > 1 and go_aspector.is_biological_process(str(annotation.object.id)):
output = (
"NON_1TO1_BP"
+ str(annotation.subject.id)
Expand All @@ -316,12 +302,8 @@ def generate_annotation(
uniprot_curie = Curie(namespace=uniprot_id.split(":")[0], identity=uniprot_id.split(":")[1])
new_annotation.evidence.with_support_from = [ConjunctiveSet(elements=[uniprot_curie])]
else:
new_annotation.evidence.with_support_from = [
ConjunctiveSet(elements=[str(annotation.subject.id)])
]
new_annotation.evidence.has_supporting_reference = [
Curie(namespace="GO_REF", identity=self.ortho_reference)
]
new_annotation.evidence.with_support_from = [ConjunctiveSet(elements=[str(annotation.subject.id)])]
new_annotation.evidence.has_supporting_reference = [Curie(namespace="GO_REF", identity=self.ortho_reference)]
# if there is only one human ortholog of the mouse gene and the annotation is not a biological
# process, then we add it, else we skip it. inferred from sequence similarity
new_annotation.evidence.type = Curie(namespace="ECO", identity=iso_eco_code.split(":")[1])
Expand All @@ -348,21 +330,13 @@ def generate_annotation(
date_object = Date(year=year, month=month, day=day, time="")
new_annotation.date = date_object

new_annotation.subject.fullname = target_genes[taxon_to_provider[self.target_taxon] + ":" + gene][
"fullname"
]
new_annotation.subject.label = target_genes[taxon_to_provider[self.target_taxon] + ":" + gene][
"label"
]
new_annotation.subject.fullname = target_genes[taxon_to_provider[self.target_taxon] + ":" + gene]["fullname"]
new_annotation.subject.label = target_genes[taxon_to_provider[self.target_taxon] + ":" + gene]["label"]

# have to convert these to curies in order for the conversion to
# GAF 2.2 type to return anything other than
# default 'gene_product' -- in ontobio, when this is a list, we just take the first item.
new_annotation.subject.type = [
map_gp_type_label_to_curie(
target_genes[taxon_to_provider[self.target_taxon] + ":" + gene].get("type")[0]
)
]
new_annotation.subject.type = [map_gp_type_label_to_curie(target_genes[taxon_to_provider[self.target_taxon] + ":" + gene].get("type")[0])]
annotations.append(new_annotation)

return annotations
26 changes: 6 additions & 20 deletions src/utils/differ.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ def compare_files(file1, file2, output):
:type file2: str
:param output: The prefix that will be appended to all the output files/reports created by this script.
:type output: str
:param group_by_columns: Name of the target/second file to compare
:type group_by_columns: List

"""
pd.set_option("display.max_rows", 50000)
Expand Down Expand Up @@ -103,9 +101,7 @@ def compare_associations(assocs1, assocs2, output):
assocs1_set = set(assoc1_list)
assocs2_set = set(assoc2_list)

common_elements, elements_unique_to_set1, elements_unique_to_set2 = compare_association_sets(
assocs1_set, assocs2_set
)
common_elements, elements_unique_to_set1, elements_unique_to_set2 = compare_association_sets(assocs1_set, assocs2_set)
common_file_path = output + "_common_elements.txt"
unique_set1_file_path = output + "_" + "unique_to_set1.txt"
unique_set2_file_path = output + "_" + "unique_to_set2.txt"
Expand Down Expand Up @@ -296,9 +292,7 @@ def read_gaf_csv(filename) -> pd:
for eco_code in ecomapping.mappings():
for ev in new_df["Evidence_code"]:
if eco_code[2] == ev:
new_df["Evidence_code"] = new_df["Evidence_code"].replace(
[eco_code[2]], ecomapping.ecoclass_to_coderef(eco_code[2])[0]
)
new_df["Evidence_code"] = new_df["Evidence_code"].replace([eco_code[2]], ecomapping.ecoclass_to_coderef(eco_code[2])[0])
return new_df


Expand All @@ -315,29 +309,21 @@ def read_gpad_csv(filename, version) -> pd:

"""
if version.startswith("1"):
data_frame = pd.read_csv(
filename, comment="!", header=None, na_filter=False, engine="python", delimiter="\t", names=gpad_1_2_format
).fillna("")
df = data_frame.filter(
["db", "subject", "qualifiers", "relation", "object", "evidence_code", "reference"], axis=1
)
data_frame = pd.read_csv(filename, comment="!", header=None, na_filter=False, engine="python", delimiter="\t", names=gpad_1_2_format).fillna("")
df = data_frame.filter(["db", "subject", "qualifiers", "relation", "object", "evidence_code", "reference"], axis=1)
concat_column = df["db"] + ":" + df["subject"]
df["concat_column"] = concat_column
filtered_df = df.filter(["concat_column", "qualifiers", "relation", "object", "evidence_code", "reference"])
filtered_df.rename(columns={"concat_column": "subject"}, inplace=True)
new_df = filtered_df
else:
data_frame = pd.read_csv(
filename, comment="!", sep="\t", header=None, na_filter=False, names=gpad_2_0_format
).fillna("")
data_frame = pd.read_csv(filename, comment="!", sep="\t", header=None, na_filter=False, names=gpad_2_0_format).fillna("")
new_df = data_frame.filter(["subject", "negation", "relation", "object", "evidence_code", "reference"], axis=1)
ecomapping = ecomap.EcoMap()
for eco_code in ecomapping.mappings():
for ev in new_df["evidence_code"]:
if eco_code[2] == ev:
new_df["evidence_code"] = new_df["evidence_code"].replace(
[eco_code[2]], ecomapping.ecoclass_to_coderef(eco_code[2])[0]
)
new_df["evidence_code"] = new_df["evidence_code"].replace([eco_code[2]], ecomapping.ecoclass_to_coderef(eco_code[2])[0])

# normalize ids
config = assocparser.AssocParserConfig()
Expand Down
8 changes: 2 additions & 6 deletions src/utils/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,8 @@ def download_files(source_taxon: str, target_taxon: str) -> tuple[Path, Path, Pa
:param: target_taxon (str): The target taxon to which the annotations will be converted via orthology.
"""
ortho_path = pystow.ensure_gunzip("ALLIANCE", url=get_url("ALLIANCE_ORTHO"), autoclean=True)
source_gaf_path = pystow.ensure_gunzip(
taxon_to_provider[source_taxon], url=get_url(taxon_to_provider[source_taxon]), autoclean=True
)
target_gpi_path = pystow.ensure_gunzip(
taxon_to_provider[target_taxon], url=get_url(taxon_to_provider[target_taxon] + "_GPI"), autoclean=True
)
source_gaf_path = pystow.ensure_gunzip(taxon_to_provider[source_taxon], url=get_url(taxon_to_provider[source_taxon]), autoclean=True)
target_gpi_path = pystow.ensure_gunzip(taxon_to_provider[target_taxon], url=get_url(taxon_to_provider[target_taxon] + "_GPI"), autoclean=True)
return ortho_path, source_gaf_path, target_gpi_path


Expand Down
Loading
Loading