Skip to content

Commit

Permalink
Merge pull request #108 from Knowledge-Graph-Hub/uniprot_sourcename_i…
Browse files Browse the repository at this point in the history
…ssue

Uniprot sourcename fix
  • Loading branch information
hrshdhgd authored Feb 8, 2024
2 parents c8ad248 + 7f2712f commit 41dda3c
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 6 deletions.
1 change: 1 addition & 0 deletions kg_microbe/transform_utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@
ORGANISM_TO_ENZYME_EDGE = "biolink:expresses"
ENZYME_CATEGORY = "biolink:Enzyme"
CHEMICAL_TO_ENZYME_EDGE = "biolink:binds_to"
UNIPROT_GENOME_FEATURES = "uniprot_genome_features"
UNIPROT_BASE_URL = "https://rest.uniprot.org/uniprotkb/"
UNIPROT_FIELDS = ["organism_id", "id", "accession", "protein_name", "ec", "ft_binding"]
UNIPROT_KEYWORDS = ["Reference+proteome"]
Expand Down
23 changes: 21 additions & 2 deletions kg_microbe/transform_utils/uniprot/uniprot.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
ENZYME_CATEGORY,
NCBITAXON_PREFIX,
ORGANISM_TO_ENZYME_EDGE,
UNIPROT_GENOME_FEATURES,
UNIPROT_ORG_ID_COLUMN_NAME,
UNIPROT_PREFIX,
)
Expand Down Expand Up @@ -43,7 +44,7 @@ def __init__(self, input_dir: Optional[Path] = None, output_dir: Optional[Path]
"""
self.__enz_data = {}

source_name = "uniprot_genome_features"
source_name = UNIPROT_GENOME_FEATURES
super().__init__(source_name, input_dir, output_dir)

def run(self, data_file: Union[Optional[Path], Optional[str]] = None):
Expand All @@ -70,7 +71,6 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None):
input_dir, ncbi_organisms, self.source_name, node_writer, edge_writer
)


drop_duplicates(self.output_node_file)
drop_duplicates(self.output_edge_file)

Expand Down Expand Up @@ -152,6 +152,25 @@ def write_to_df(self, uniprot_values, edge_writer, node_writer):
else None
)

# Use the primary accession number because its ID does not change, unlike the Entry Name
if "Entry" in entry.keys():
self.__enz_data["id"] = entry["Entry"]

# example response with multiple protein names:
# {
# "Organism (ID)": "100",
# "Entry Name": "A0A4R1H4N5_ANCAQ",
# "Entry": "A0A4R1H4N5",
# "Protein names": "Ubiquinone biosynthesis O-methyltransferase
# (2-polyprenyl-6-hydroxyphenol methylase) (EC 2.1.1.222)
# (3-demethylubiquinone 3-O-methyltransferase) (EC 2.1.1.64)",
# "EC number": "2.1.1.222; 2.1.1.64",
# }
if "Protein names" in entry:
self.__enz_data["name"] = entry["Protein names"].split("(EC")[0]

organism_id = entry["Organism (ID)"] if "Organism (ID)" in entry.keys() else None

# Use the primary accession number because its ID does not change, unlike the Entry Name
if "Entry" in entry.keys():
self.__enz_data["id"] = entry["Entry"]
Expand Down
8 changes: 4 additions & 4 deletions merge.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,13 @@ merged_graph:
filename:
- data/transformed/traits/nodes.tsv
- data/transformed/traits/edges.tsv
uniprot:
uniprot_genome_features:
input:
name: "uniprot"
name: "uniprot_genome_features"
format: tsv
filename:
- data/transformed/uniprot/nodes.tsv
- data/transformed/uniprot/edges.tsv
- data/transformed/uniprot_genome_features/nodes.tsv
- data/transformed/uniprot_genome_features/edges.tsv
operations:
- name: kgx.graph_operations.summarize_graph.generate_graph_stats
args:
Expand Down

0 comments on commit 41dda3c

Please sign in to comment.