From cbab34cdd5248ea9c723252026a13691cc17a0ac Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Thu, 16 Mar 2023 19:05:48 -0600
Subject: [PATCH 01/29] Create README.md
---
metanetx_uniprot/README.md | 1 +
1 file changed, 1 insertion(+)
create mode 100644 metanetx_uniprot/README.md
diff --git a/metanetx_uniprot/README.md b/metanetx_uniprot/README.md
new file mode 100644
index 00000000..1165c3cc
--- /dev/null
+++ b/metanetx_uniprot/README.md
@@ -0,0 +1 @@
+Code is reused from Biochem4j: https://github.com/neilswainston/biochem4j/tree/master/sbcdb
From f875441805acf65b2e03f16fb79b15e306cb090a Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Thu, 16 Mar 2023 19:07:24 -0600
Subject: [PATCH 02/29] Add files via upload
---
metanetx_uniprot/build.py | 73 ++++++
metanetx_uniprot/chebi_utils.py | 39 +++
metanetx_uniprot/chemical_utils.py | 172 ++++++++++++
metanetx_uniprot/enzyme_utils.py | 65 +++++
metanetx_uniprot/index.py | 32 +++
metanetx_uniprot/init.cql | 35 +++
metanetx_uniprot/kegg_utils.py | 93 +++++++
metanetx_uniprot/mnxref_utils.py | 291 +++++++++++++++++++++
metanetx_uniprot/namespace_utils.py | 61 +++++
metanetx_uniprot/ncbi_taxonomy_utils.py | 93 +++++++
metanetx_uniprot/reaction_utils.py | 82 ++++++
metanetx_uniprot/rhea_utils.py | 63 +++++
metanetx_uniprot/seq_utils.py | 112 ++++++++
metanetx_uniprot/spectra_utils.py | 122 +++++++++
metanetx_uniprot/test/__init__.py | 9 +
metanetx_uniprot/test/test_enzyme_utils.py | 39 +++
metanetx_uniprot/test/test_mnxref_utils.py | 37 +++
metanetx_uniprot/utils.py | 73 ++++++
18 files changed, 1491 insertions(+)
create mode 100644 metanetx_uniprot/build.py
create mode 100644 metanetx_uniprot/chebi_utils.py
create mode 100644 metanetx_uniprot/chemical_utils.py
create mode 100644 metanetx_uniprot/enzyme_utils.py
create mode 100644 metanetx_uniprot/index.py
create mode 100644 metanetx_uniprot/init.cql
create mode 100644 metanetx_uniprot/kegg_utils.py
create mode 100644 metanetx_uniprot/mnxref_utils.py
create mode 100644 metanetx_uniprot/namespace_utils.py
create mode 100644 metanetx_uniprot/ncbi_taxonomy_utils.py
create mode 100644 metanetx_uniprot/reaction_utils.py
create mode 100644 metanetx_uniprot/rhea_utils.py
create mode 100644 metanetx_uniprot/seq_utils.py
create mode 100644 metanetx_uniprot/spectra_utils.py
create mode 100644 metanetx_uniprot/test/__init__.py
create mode 100644 metanetx_uniprot/test/test_enzyme_utils.py
create mode 100644 metanetx_uniprot/test/test_mnxref_utils.py
create mode 100644 metanetx_uniprot/utils.py
diff --git a/metanetx_uniprot/build.py b/metanetx_uniprot/build.py
new file mode 100644
index 00000000..c276e641
--- /dev/null
+++ b/metanetx_uniprot/build.py
@@ -0,0 +1,73 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+import multiprocessing
+import sys
+
+import chebi_utils, chemical_utils, mnxref_utils, \
+ ncbi_taxonomy_utils, reaction_utils, rhea_utils, spectra_utils, utils, seq_utils #, kegg_utils
+
+
+def build_csv(dest_dir, array_delimiter, num_threads):
+ '''Build database CSV files.'''
+ writer = utils.Writer(dest_dir)
+
+ # Get Organism data:
+ print('Parsing NCBI Taxonomy')
+ #ncbi_taxonomy_utils.load(writer, array_delimiter)
+
+ # Get Chemical and Reaction data.
+ # Write chemistry csv files:
+ chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter)
+ reac_man = reaction_utils.ReactionManager()
+
+
+ #print('Parsing MNXref')
+ mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer)
+ mnx_loader.load()
+
+ print('Parsing ChEBI')
+ #chebi_utils.load(chem_man, writer)
+
+ ####Using all memory (120+Gb) and eventually is killed
+ # Get Spectrum data:
+ #print('Parsing spectrum data')
+ #spectra_utils.load(writer, chem_man, array_delimiter=array_delimiter)
+
+ #chem_man.write_files(writer)
+
+ ####Not including KEGG for now
+ # Get Reaction / Enzyme / Organism data:
+ #print('Parsing KEGG')
+ #kegg_utils.load(reac_man, num_threads=num_threads)
+
+
+ print('Parsing Rhea')
+ #rhea_utils.load(reac_man, num_threads=num_threads)
+ #reac_man.write_files(writer)
+
+
+def main(args):
+ '''main method'''
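+    # Expected args (see README): args[0] = destination dir for the CSV
+    # output, args[1] = array delimiter, args[2] = thread count (an int,
+    # or 'True' to use every available core).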
+ num_threads = 0
+
+ if len(args) > 2:
+ try:
+ num_threads = int(args[2])
+ except ValueError:
+ if args[2] == 'True':
+ num_threads = multiprocessing.cpu_count()
+
+ print('Running build with ' + str(num_threads) + ' threads')
+
+ build_csv(args[0], args[1], num_threads)
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff --git a/metanetx_uniprot/chebi_utils.py b/metanetx_uniprot/chebi_utils.py
new file mode 100644
index 00000000..284a687d
--- /dev/null
+++ b/metanetx_uniprot/chebi_utils.py
@@ -0,0 +1,39 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+from libchebipy._chebi_entity import ChebiEntity
+
+
+def load(chem_manager, writer):
+ '''Loads ChEBI data from libChEBIpy.'''
+ chebi_ids = []
+ rels = []
+
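+    # CHEBI:24431 ('chemical entity') is the ChEBI ontology root; the graph
+    # is built by walking its incoming relations recursively in _add_node().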
+ _add_node('CHEBI:24431', chebi_ids, rels, chem_manager)
+
+ writer.write_rels(rels, 'Chemical', 'Chemical')
+
+
+def _add_node(chebi_id, chebi_ids, rels, chem_manager):
+ '''Constructs a node from libChEBI.'''
+ if chebi_id not in chebi_ids:
+ chebi_ids.append(chebi_id)
+
+ chem_id, entity = chem_manager.add_chemical({'chebi': chebi_id})
+
+ for incoming in entity.get_incomings():
+ target_id = incoming.get_target_chebi_id()
+
+ chebi_ent = ChebiEntity(target_id)
+
+ if chebi_ent.get_parent_id():
+ target_id = chebi_ent.get_parent_id()
+
+ _add_node(target_id, chebi_ids, rels, chem_manager)
+ rels.append([target_id, incoming.get_type(), chem_id])
diff --git a/metanetx_uniprot/chemical_utils.py b/metanetx_uniprot/chemical_utils.py
new file mode 100644
index 00000000..30567ed4
--- /dev/null
+++ b/metanetx_uniprot/chemical_utils.py
@@ -0,0 +1,172 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+import math
+import uuid
+
+from libchebipy._chebi_entity import ChebiEntity, ChebiException
+
+import namespace_utils as ns_utils
+from synbiochem.utils import chem_utils
+
+
+class ChemicalManager(object):
+ '''Class to implement a manager of Chemical data.'''
+
+ def __init__(self, array_delimiter):
+ '''Constructor.'''
+ self.__array_delimiter = array_delimiter
+ self.__nodes = {}
+ self.__chem_ids = {}
+
+ def write_files(self, writer):
+ '''Write neo4j import files.'''
+ return writer.write_nodes(self.__nodes.values(), 'Chemical')
+
+ def add_chemical(self, properties):
+ '''Adds a chemical to the collection of nodes, ensuring uniqueness.'''
+ chem_id, chebi_ent = self.__get_chem_id(properties)
+
+ if 'charge:float' in properties:
+ charge = properties.pop('charge:float')
+
+ if not math.isnan(charge):
+ properties['charge:float'] = int(charge)
+
+ if chem_id not in self.__nodes:
+ properties[':LABEL'] = 'Chemical'
+ properties['id:ID(Chemical)'] = chem_id
+ properties['source'] = 'chebi' if 'chebi' in properties else 'mnx'
+
+ _normalise_mass(properties)
+ self.__nodes[chem_id] = properties
+ else:
+ self.__nodes[chem_id].update(properties)
+
+ return chem_id, chebi_ent
+
+ def get_props(self, prop, default=None):
+        '''Gets a dict mapping every registered chemical id to a property.'''
+ return {key: self.__nodes[chem_id].get(prop, default)
+ for key, chem_id in self.__chem_ids.items()}
+
+ def get_prop(self, chem_id, prop, default=None):
+ '''Gets a property.'''
+ return self.__nodes[self.__chem_ids[chem_id]].get(prop, default)
+
+ def __get_chem_id(self, properties):
+ '''Manages chemical id mapping.'''
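+        # Resolution order: prefer a (parent) ChEBI id, then a previously
+        # registered InChI or MNX id, and finally fall back to a fresh UUID.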
+ chebi_id = properties.get('chebi', None)
+ chebi_ent = None
+
+ if chebi_id:
+ try:
+ chebi_id, chebi_ent = _get_chebi_data(chebi_id, properties,
+ self.__array_delimiter)
+ except ChebiException as exception:
+ properties.pop('chebi')
+ chebi_id = None
+ print(exception)
+ except ValueError as exception:
+ properties.pop('chebi')
+ chebi_id = None
+ print(exception)
+
+ mnx_id = properties.get('mnx', None)
+ inchi_id = properties.get('inchi', None)
+
+ if chebi_id:
+ self.__chem_ids[chebi_id] = chebi_id
+
+ if inchi_id:
+ self.__chem_ids[inchi_id] = chebi_id
+
+ if mnx_id:
+ self.__chem_ids[mnx_id] = chebi_id
+
+ return chebi_id, chebi_ent
+
+ if inchi_id:
+ chem_id = self.__chem_ids.get(inchi_id, None)
+
+ if chem_id:
+ return chem_id, None
+
+ if mnx_id:
+ chem_id = self.__chem_ids.get(mnx_id, None)
+
+ if chem_id:
+ return chem_id, None
+
+ if inchi_id:
+ self.__chem_ids[inchi_id] = mnx_id
+
+ self.__chem_ids[mnx_id] = mnx_id
+ return mnx_id, None
+
+        new_id = str(uuid.uuid4())
+
+        if inchi_id:
+            self.__chem_ids[inchi_id] = new_id
+
+ return new_id, None
+
+
+def _get_chebi_data(chebi_id, properties, array_delimiter):
+ '''Gets ChEBI data.'''
+ chebi_ent = ChebiEntity(str(chebi_id))
+
+ if chebi_ent.get_parent_id():
+ chebi_id = chebi_ent.get_parent_id()
+ else:
+ chebi_id = chebi_ent.get_id()
+
+ properties['chebi'] = chebi_id
+
+ formula = chebi_ent.get_formula()
+ charge = chebi_ent.get_charge()
+ inchi = chebi_ent.get_inchi()
+ smiles = chebi_ent.get_smiles()
+
+ if formula:
+ properties['formula'] = formula
+
+ if not math.isnan(charge):
+ properties['charge:float'] = charge
+
+ if inchi:
+ properties['inchi'] = inchi
+
+ if smiles:
+ properties['smiles'] = smiles
+
+ properties['name'] = chebi_ent.get_name()
+ properties['names:string[]'] = \
+ array_delimiter.join([name.get_name()
+ for name in chebi_ent.get_names()] +
+ [chebi_ent.get_name()])
+
+ for db_acc in chebi_ent.get_database_accessions():
+ namespace = ns_utils.resolve_namespace(
+ db_acc.get_type(), True)
+
+ if namespace is not None:
+ properties[namespace] = db_acc.get_accession_number()
+
+ return chebi_id, chebi_ent
+
+
+def _normalise_mass(properties):
+ '''Removes ambiguity in mass values by recalculating according to chemical
+ formula.'''
+ properties.pop('mass:float', None)
+
+ if 'formula' in properties and properties['formula'] is not None:
+ mono_mass = chem_utils.get_molecular_mass(properties['formula'])
+
+ if not math.isnan(mono_mass):
+ properties['monoisotopic_mass:float'] = mono_mass
diff --git a/metanetx_uniprot/enzyme_utils.py b/metanetx_uniprot/enzyme_utils.py
new file mode 100644
index 00000000..6f90b475
--- /dev/null
+++ b/metanetx_uniprot/enzyme_utils.py
@@ -0,0 +1,65 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+#from synbiochem.utils import seq_utils
+from seq_utils import get_uniprot_values
+
+
+class EnzymeManager(object):
+ '''Class to implement a manager of Enzyme data.'''
+
+ def __init__(self):
+ '''Constructor.'''
+ self.__nodes = {}
+ self.__org_enz_rels = []
+
+ def get_nodes(self):
+ '''Gets enzyme nodes.'''
+ return self.__nodes.values()
+
+ def get_org_enz_rels(self):
+ '''Gets organism-to-enzyme relationships.'''
+ return self.__org_enz_rels
+
+ def add_uniprot_data(self, enzyme_ids, source, num_threads=0):
+ '''Gets Uniprot data.'''
+
+ #fields = ['entry name', 'protein names', 'organism-id', 'ec']
+ fields = ['id', 'protein_name', 'organism_id', 'ec']
+ enzyme_ids = [enzyme_id for enzyme_id in enzyme_ids
+ if enzyme_id not in self.__nodes]
+ uniprot_values = get_uniprot_values(enzyme_ids, fields,
+ batch_size=128, # changed to 128 from 512
+ verbose=False, #Changed to False
+ num_threads=num_threads)
+
+ for uniprot_id, uniprot_value in uniprot_values.items():
+ enzyme_node = {':LABEL': 'Enzyme',
+ 'uniprot:ID(Enzyme)': uniprot_id}
+ self.__nodes[uniprot_id] = enzyme_node
+
+ organism_id = uniprot_value.pop('Organism (ID)') \
+ if 'Organism (ID)' in uniprot_value else None
+
+            if 'Entry Name' in uniprot_value:  # new REST API column header
+                enzyme_node['entry'] = uniprot_value['Entry Name']
+
+ if 'Protein names' in uniprot_value:
+ enzyme_node['names'] = uniprot_value['Protein names']
+
+            if enzyme_node.get('names'):
+ enzyme_node['name'] = enzyme_node['names'][0]
+
+ if 'EC number' in uniprot_value:
+ enzyme_node['ec-code'] = uniprot_value['EC number']
+
+ if organism_id:
+ self.__org_enz_rels.append([organism_id, 'expresses',
+ uniprot_id, {'source': source}])
diff --git a/metanetx_uniprot/index.py b/metanetx_uniprot/index.py
new file mode 100644
index 00000000..3adb2499
--- /dev/null
+++ b/metanetx_uniprot/index.py
@@ -0,0 +1,32 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+import os
+import subprocess
+import sys
+
+
+def index_db(db_loc):
+ '''Index database.'''
+ directory = os.path.dirname(os.path.realpath(__file__))
+ filename = os.path.join(directory, 'init.cql')
+
+    with open(filename, 'r') as init_file:
+ for line in init_file:
+ params = ['neo4j-shell', '-path', db_loc, '-c', line.strip()]
+ subprocess.call(params)
+
+
+def main(argv):
+ '''main method'''
+ index_db(argv[0])
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff --git a/metanetx_uniprot/init.cql b/metanetx_uniprot/init.cql
new file mode 100644
index 00000000..7e7216e9
--- /dev/null
+++ b/metanetx_uniprot/init.cql
@@ -0,0 +1,35 @@
+CREATE CONSTRAINT ON (n:Organism) ASSERT n.taxonomy IS UNIQUE;
+CREATE CONSTRAINT ON (n:Enzyme) ASSERT n.entry IS UNIQUE;
+CREATE CONSTRAINT ON (n:Enzyme) ASSERT n.uniprot IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.`bigg.reaction` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.id IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.`kegg.reaction` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.metacyc IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.mnx IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.reactome IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.rhea IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.seed IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`bigg.metabolite` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.cas IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chebi IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chemidplus IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chemspider IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.drugbank IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.hmdb IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.id IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.compound` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.drug` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.glycan` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.knapsack IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.lipidmaps IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.metacyc IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.mnx IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.molbase IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.pdb IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.pubmed IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.reactome IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.resid IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`seed.compound` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`umbbd.compound` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.unipathway IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`wikipedia.en` IS UNIQUE;
\ No newline at end of file
diff --git a/metanetx_uniprot/kegg_utils.py b/metanetx_uniprot/kegg_utils.py
new file mode 100644
index 00000000..95c8d65b
--- /dev/null
+++ b/metanetx_uniprot/kegg_utils.py
@@ -0,0 +1,93 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+from collections import defaultdict
+from urllib.error import URLError
+from urllib.request import urlopen
+
+from synbiochem.utils import thread_utils
+
+
+def load(reaction_manager, organisms=None, num_threads=0):
+ '''Loads KEGG data.'''
+
+ if organisms is None:
+ organisms = \
+            sorted([line.decode('utf-8').split()[1] for line in
+                    urlopen('http://rest.kegg.jp/list/organism')])
+
+ # EC to gene, gene to Uniprot:
+ ec_genes, gene_uniprots = _get_gene_data(organisms, num_threads)
+
+ data = defaultdict(list)
+
+ # KEGG Reaction to EC:
+ kegg_reac_ec = _parse_url('http://rest.kegg.jp/link/ec/reaction')
+
+ for kegg_reac, ec_terms in kegg_reac_ec.items():
+ for ec_term in ec_terms:
+ if ec_term in ec_genes:
+ for gene in ec_genes[ec_term]:
+ if gene in gene_uniprots:
+ uniprots = [val[3:] for val in gene_uniprots[gene]]
+ data[kegg_reac[3:]].extend(uniprots)
+
+ reaction_manager.add_react_to_enz(data, 'kegg.reaction', num_threads)
+
+
+def _get_gene_data(organisms, num_threads):
+ '''Gets gene data.'''
+ ec_genes = defaultdict(list)
+ gene_uniprots = defaultdict(list)
+
+ if num_threads:
+ thread_pool = thread_utils.ThreadPool(num_threads)
+
+ for org in organisms:
+ thread_pool.add_task(_parse_organism, org, ec_genes, gene_uniprots)
+
+ thread_pool.wait_completion()
+ else:
+ for org in organisms:
+ _parse_organism(org, ec_genes, gene_uniprots)
+
+ return ec_genes, gene_uniprots
+
+
+def _parse_organism(org, ec_genes, gene_uniprots):
+ '''Parse organism.'''
+    print('KEGG: loading ' + org)
+
+ for key, value in _parse_url('http://rest.kegg.jp/link/' + org.lower() +
+ '/enzyme').items():
+ ec_genes[key].extend(value)
+
+ for key, value in _parse_url('http://rest.kegg.jp/conv/uniprot/' +
+ org.lower()).items():
+ gene_uniprots[key].extend(value)
+
+
+def _parse_url(url, attempts=16):
+ '''Parses url to form key to list of values dictionary.'''
+ data = defaultdict(list)
+
+ for _ in range(attempts):
+ try:
+            for line in urlopen(url):
+                tokens = line.decode('utf-8').split()
+
+ if len(tokens) > 1:
+ data[tokens[0]].append(tokens[1])
+
+ return data
+        except URLError as err:
+ # Take no action, but try again...
+            print('\t'.join([url, str(err)]))
+
+ return data
diff --git a/metanetx_uniprot/mnxref_utils.py b/metanetx_uniprot/mnxref_utils.py
new file mode 100644
index 00000000..cbb67687
--- /dev/null
+++ b/metanetx_uniprot/mnxref_utils.py
@@ -0,0 +1,291 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+# pylint: disable=no-member
+# pylint: disable=too-few-public-methods
+# pylint: disable=too-many-locals
+from collections import Counter
+import csv
+import itertools
+import math
+import re
+import requests
+
+import numpy
+#from subliminal import balance
+
+import namespace_utils
+from synbiochem.utils import chem_utils
+
+
+_METANETX_URL = 'http://metanetx.org/cgi-bin/mnxget/mnxref/'
+
+
+class MnxRefReader(object):
+ '''Class to read MnxRef data from the chem_prop.tsv, the chem_xref.tsv and
+ reac_prop.tsv files.'''
+
+ def __init__(self, source=_METANETX_URL):
+ self.__source = source
+ self.__mnx_id_patt = re.compile(r'(MNX[MR])(\d+)')
+ self.__chem_data = {}
+ self.__reac_data = {}
+
+ def get_chem_data(self):
+ '''Gets chemical data.'''
+ if not self.__chem_data:
+ self.__read_chem_prop()
+ self.__read_xref('chem_xref.tsv', self.__chem_data, True)
+
+ return self.__chem_data
+
+ def get_reac_data(self):
+ '''Gets reaction data.'''
+ if not self.__reac_data:
+ self.__read_reac_prop()
+ self.__read_xref('reac_xref.tsv', self.__reac_data, False)
+
+ return self.__reac_data
+
+ def __read_chem_prop(self):
+ '''Read chemical properties and create Nodes.'''
+        chem_prop_keys = ['id', 'name', 'reference', 'formula', 'charge:float',
+                          'mass:float', 'inchi', 'inchikey', 'smiles']
+
+ for values in self.__read_data('chem_prop.tsv'):
+ if not values[0].startswith('#'):
+ values[0] = self.__parse_id(values[0])
+ values[2] = self.__parse_id(values[2])
+ props = dict(zip(chem_prop_keys, values))
+ props.pop('reference')
+ _convert_to_float(props, 'charge:float')
+ _convert_to_float(props, 'mass:float')
+ props = {key: value for key, value in props.items()
+ if value != ''}
+ self.__chem_data[values[0]] = props
+
+ def __read_xref(self, filename, data, chemical):
+ '''Read xrefs and update Nodes.'''
+ xref_keys = ['XREF', 'MNX_ID', 'Description']
+
+ for values in self.__read_data(filename):
+ if not values[0].startswith('#'):
+ xrefs = dict(zip(xref_keys[:len(values)], values))
+ evidence = 'none'
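+                # NOTE: evidence is hard-coded to 'none' (the xref_keys parsed
+                # above no longer include an evidence field), so the xref
+                # merging below is effectively disabled.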
+
+ if evidence == 'identity' or evidence == 'structural':
+ xrefs['MNX_ID'] = self.__parse_id(xrefs['MNX_ID'])
+ xref = xrefs['XREF'].split(':')
+
+ if xrefs['MNX_ID'] in data:
+ entry = data[xrefs['MNX_ID']]
+ self.__add_xref(xref, entry, chemical)
+
+ def __add_xref(self, xref, entry, chemical):
+ '''Adds an xref.'''
+ namespace = namespace_utils.resolve_namespace(xref[0],
+ chemical)
+
+ if namespace is not None:
+ xref[1] = self.__parse_id(xref[1])
+
+ entry[namespace] = xref[1] \
+ if namespace != 'chebi' \
+ else 'CHEBI:' + xref[1]
+
+ def __read_reac_prop(self):
+ '''Read reaction properties and create Nodes.'''
+ reac_prop_keys = ['id', 'equation', 'reference', 'ec', 'balance', 'transport']
+
+ for values in self.__read_data('reac_prop.tsv'):
+ if not values[0].startswith('#'):
+ values[0] = self.__parse_id(values[0])
+ values[2] = self.__parse_id(values[2])
+
+ props = dict(zip(reac_prop_keys, values))
+ props.pop('reference')
+
+ try:
+ participants = chem_utils.parse_equation(
+ props.pop('equation'))
+
+ for participant in participants:
+ participant[0] = self.__parse_id(participant[0])
+
+ if participant[0] not in self.__chem_data:
+ self.__add_chem(participant[0])
+
+ props['reac_defn'] = participants
+ self.__reac_data[values[0]] = props
+ except ValueError:
+ print('WARNING: Suspected polymerisation reaction: ' + \
+ values[0] + '\t' + str(props))
+
+ def __add_chem(self, chem_id):
+ '''Adds a chemical with given id.'''
+ props = {'id': chem_id}
+ self.__chem_data[chem_id] = props
+ return props
+
+ def __read_data(self, filename):
+        '''Downloads and reads tab-delimited files into lists of lists of
+ strings.'''
+ with requests.Session() as s:
+ download = s.get(self.__source + filename)
+
+ decoded_content = download.content.decode('utf-8')
+
+ cr = csv.reader(decoded_content.splitlines(), delimiter='\t')
+ my_list = list(cr)
+ return my_list
+
+
+ def __parse_id(self, item_id):
+ '''Parses mnx ids.'''
+ matches = self.__mnx_id_patt.findall(item_id)
+
+ for mat in matches:
+ return mat[0] + str(int(mat[1]))
+
+ return item_id
+
+
+class MnxRefLoader(object):
+ '''Loads MNXref data into neo4j format.'''
+
+ def __init__(self, chem_man, reac_man, writer):
+ self.__chem_man = chem_man
+ self.__reac_man = reac_man
+ self.__writer = writer
+
+ def load(self):
+ '''Loads MnxRef data from chem_prop.tsv, chem_xref.tsv,
+ reac_prop.tsv and reac_xref.tsv files.'''
+ reader = MnxRefReader()
+
+ for properties in reader.get_chem_data().values():
+ properties['mnx'] = properties.pop('id')
+ self.__chem_man.add_chemical(properties)
+
+ rels = self.__add_reac_nodes(reader.get_reac_data())
+
+ return [], [self.__writer.write_rels(rels, 'Reaction', 'Chemical')]
+
+ def __add_reac_nodes(self, reac_data):
+ '''Get reaction nodes from data.'''
+ reac_id_def = {}
+
+ for properties in reac_data.values():
+ reac_def = []
+ mnx_id = properties.pop('id')
+
+ # Remove equation and description (may be inconsistent with
+ # balanced reaction):
+ if 'description' in properties:
+ properties.pop('description')
+
+ for prt in properties.pop('reac_defn'):
+ chem_id, _ = self.__chem_man.add_chemical({'mnx': prt[0]})
+
+ reac_def.append([self.__chem_man.get_prop(prt[0], 'formula'),
+ self.__chem_man.get_prop(prt[0],
+ 'charge:float', 0),
+ prt[1],
+ chem_id])
+
+            # NOT BALANCING REACTIONS: the 'subliminal' balance library no longer appears to be available.
+ '''
+ if all([values[0] is not None for values in reac_def]):
+ balanced, _, balanced_def = balance.balance_reac(reac_def)
+ #properties['balance'] = balanced
+ else:
+ properties['balance'] = 'unknown'
+ balanced_def = reac_def
+ '''
+ properties['balance'] = 'unknown'
+ balanced_def = reac_def
+
+
+ reac_id = self.__reac_man.add_reaction('mnx', mnx_id,
+ properties)
+ reac_id_def[reac_id] = balanced_def
+
+ chem_id_mass = self.__chem_man.get_props('monoisotopic_mass:float',
+ float('NaN'))
+ cofactors = [chem_id
+ for chem_id, mass in chem_id_mass.items()
+ if mass > 0 and mass < 44] # Assume mass < CO2 = cofactor
+
+ cofactor_pairs = _calc_cofactors(reac_id_def.values(), cofactors)
+ rels = []
+
+ for reac_id, defn in reac_id_def.items():
+ reactants = [term[3] for term in defn if term[2] < 0]
+ products = [term[3] for term in defn if term[2] > 0]
+ reac_cofactors = []
+
+ # Set metabolites as cofactors:
+ for met in [term[3] for term in defn]:
+ if met in cofactors:
+ reac_cofactors.append(met)
+
+ # Set pairs as cofactors:
+ for pair in itertools.product(reactants, products):
+ if tuple(sorted(pair)) in cofactor_pairs:
+ reac_cofactors.extend(pair)
+
+ for term in defn:
+ rels.append([reac_id,
+ 'has_cofactor' if term[3] in reac_cofactors
+ else 'has_reactant',
+ term[3],
+ {'stoichiometry:float': term[2]}])
+
+ return rels
+
+
+def _calc_cofactors(reaction_defs, cofactors, cutoff=0.8):
+ '''Calculates cofactors.'''
+ pairs = Counter()
+
+ # Calculate all reactant / product pairs...
+ for reaction_def in reaction_defs:
+ reactants = [term[3] for term in reaction_def if term[2] < 0 and
+ term[3] not in cofactors]
+ products = [term[3] for term in reaction_def if term[2] > 0 and
+ term[3] not in cofactors]
+
+ pairs.update([tuple(sorted(pair))
+ for pair in itertools.product(reactants, products)])
+
+ return _filter(pairs, cutoff)
+
+
+def _filter(counter, cutoff):
+ '''Filter counter items according to cutoff.'''
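+    # The pair-occurrence histogram is assumed to follow a power law: fitting
+    # log(count-of-counts) against log(occurrences) gives slope m_val and
+    # intercept b_val, and pairs occurring more than exp(cutoff * -b/m) times
+    # are treated as ubiquitous cofactor pairs.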
+    # Count occurrences of pairs, then bin into a histogram...
+ hist_counter = Counter(counter.values())
+
+ # Fit straight-line to histogram log-log plot and filter...
+ x_val, y_val = zip(*list(hist_counter.items()))
+ m_val, b_val = numpy.polyfit(numpy.log(x_val), numpy.log(y_val), 1)
+
+ return [item[0] for item in counter.items()
+ if item[1] > math.exp(cutoff * -b_val / m_val)]
+
+
+def _convert_to_float(dictionary, key):
+ '''Converts a key value in a dictionary to a float.'''
+ if dictionary.get(key, None):
+ dictionary[key] = float(dictionary[key] if dictionary[key] != 'NA' else 'NaN')
+ else:
+ # Remove key:
+ dictionary.pop(key, None)
diff --git a/metanetx_uniprot/namespace_utils.py b/metanetx_uniprot/namespace_utils.py
new file mode 100644
index 00000000..bb6bd665
--- /dev/null
+++ b/metanetx_uniprot/namespace_utils.py
@@ -0,0 +1,61 @@
+'''
+synbiochem (c) University of Manchester 2015
+
+synbiochem is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+__CHEMICAL_NAMESPACE = {
+ # value (namespace) corresponds to identifiers.org:
+ 'bigg': 'bigg.metabolite',
+ 'CAS Registry Number': 'cas',
+ 'chebi': 'chebi',
+ 'ChemIDplus accession': 'chemidplus',
+ 'Chemspider accession': 'chemspider',
+ 'DrugBank accession': 'drugbank',
+ 'hmdb': 'hmdb',
+ 'HMDB accession': 'hmdb',
+ 'kegg': 'kegg.compound',
+ 'KEGG COMPOUND accession': 'kegg.compound',
+ 'KEGG DRUG accession': 'kegg.drug',
+ 'KEGG GLYCAN accession': 'kegg.glycan',
+ 'KNApSAcK accession': 'knapsack',
+ 'lipidmaps': 'lipidmaps',
+ 'LIPID MAPS instance accession': 'lipidmaps',
+ 'MolBase accession': 'molbase',
+ 'PDB accession': 'pdb',
+ 'PubMed citation': 'pubmed',
+ 'reactome': 'reactome',
+ 'RESID accession': 'resid',
+ 'seed': 'seed.compound',
+ 'umbbd': 'umbbd.compound',
+ 'UM-BBD compID': 'umbbd.compound',
+ 'upa': 'unipathway',
+ 'Wikipedia accession': 'wikipedia.en',
+
+ # Not in identifiers.org:
+ 'metacyc': 'metacyc',
+ 'MetaCyc accession': 'metacyc',
+ 'mnx': 'mnx'
+}
+
+__REACTION_NAMESPACE = {
+ # value (namespace) corresponds to identifiers.org:
+ 'bigg': 'bigg.reaction',
+ 'kegg': 'kegg.reaction',
+ 'reactome': 'reactome',
+ 'rhea': 'rhea',
+ 'seed': 'seed',
+
+ # Not in identifiers.org:
+ 'metacyc': 'metacyc',
+ 'mnx': 'mnx',
+}
+
+
+def resolve_namespace(name, chemical):
+ '''Maps name to distinct namespace from identifiers.org.'''
+ namespace = __CHEMICAL_NAMESPACE if chemical else __REACTION_NAMESPACE
+ return namespace[name] if name in namespace else None
diff --git a/metanetx_uniprot/ncbi_taxonomy_utils.py b/metanetx_uniprot/ncbi_taxonomy_utils.py
new file mode 100644
index 00000000..8b7bd1d4
--- /dev/null
+++ b/metanetx_uniprot/ncbi_taxonomy_utils.py
@@ -0,0 +1,93 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+import os
+import sys
+import tarfile
+import tempfile
+from urllib.request import urlretrieve
+
+
+__NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
+
+
+def load(writer, array_delimiter, source=__NCBITAXONOMY_URL):
+ '''Loads NCBI Taxonomy data.'''
+ nodes_filename, names_filename = _get_ncbi_taxonomy_files(source)
+ nodes, rels = _parse_nodes(nodes_filename, array_delimiter)
+ _parse_names(nodes, names_filename, array_delimiter)
+
+ writer.write_nodes(nodes.values(), 'Organism')
+ writer.write_rels(rels, 'Organism', 'Organism')
+
+
+def _get_ncbi_taxonomy_files(source):
+ '''Downloads and extracts NCBI Taxonomy files.'''
+ temp_dir = tempfile.gettempdir()
+ temp_gzipfile = tempfile.NamedTemporaryFile()
+ urlretrieve(source, temp_gzipfile.name)
+
+ temp_tarfile = tarfile.open(temp_gzipfile.name, 'r:gz')
+ temp_tarfile.extractall(temp_dir)
+
+ temp_gzipfile.close()
+ temp_tarfile.close()
+
+ return os.path.join(temp_dir, 'nodes.dmp'), \
+ os.path.join(temp_dir, 'names.dmp')
+
+
+def _parse_nodes(filename, array_delimiter):
+ '''Parses nodes file.'''
+ nodes = {}
+ rels = []
+
+ with open(filename, 'r') as textfile:
+ for line in textfile:
+ tokens = [x.strip() for x in line.split('|')]
+ tax_id = tokens[0]
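+            # nodes.dmp columns: tax_id | parent tax_id | rank | ...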
+
+ if tax_id != '1':
+ rels.append([tax_id, 'is_a', tokens[1]])
+
+ nodes[tax_id] = {'taxonomy:ID(Organism)': tax_id,
+ ':LABEL':
+ 'Organism' + array_delimiter + tokens[2]}
+
+ return nodes, rels
+
+
+def _parse_names(nodes, filename, array_delimiter):
+ '''Parses names file.'''
+
+ with open(filename, 'r') as textfile:
+ for line in textfile:
+ tokens = [x.strip() for x in line.split('|')]
+ node = nodes[tokens[0]]
+
+ if 'name' not in node:
+ node['name'] = tokens[1]
+ node['names:string[]'] = set([node['name']])
+ else:
+ node['names:string[]'].add(tokens[1])
+
+ for _, node in nodes.items():
+ if 'names:string[]' in node:
+ node['names:string[]'] = \
+ array_delimiter.join(node['names:string[]'])
+
+
+def main(argv):
+ '''main method'''
+ load(*argv)
+
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
diff --git a/metanetx_uniprot/reaction_utils.py b/metanetx_uniprot/reaction_utils.py
new file mode 100644
index 00000000..2a6d9394
--- /dev/null
+++ b/metanetx_uniprot/reaction_utils.py
@@ -0,0 +1,82 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+from enzyme_utils import EnzymeManager
+
+
+class ReactionManager(object):
+ '''Class to implement a manager of Reaction data.'''
+
+ def __init__(self):
+ '''Constructor.'''
+ self.__nodes = {}
+ self.__reac_ids = {}
+ self.__reac_enz_rels = []
+ self.__org_enz_rels = []
+ self.__enz_man = EnzymeManager()
+
+ def write_files(self, writer):
+ '''Write neo4j import files.'''
+ return ([writer.write_nodes(self.__nodes.values(),
+ 'Reaction'),
+ writer.write_nodes(self.__enz_man.get_nodes(),
+ 'Enzyme')],
+ [writer.write_rels(self.__reac_enz_rels,
+ 'Reaction', 'Enzyme'),
+ writer.write_rels(self.__enz_man.get_org_enz_rels(),
+ 'Organism', 'Enzyme')])
+
+ def add_reaction(self, source, reac_id, properties):
+ '''Adds a reaction to the collection of nodes, ensuring uniqueness.'''
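+        # Re-use an existing node id if this (source, reac_id) pair has been
+        # seen before, so the same reaction from different sources merges.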
+ reac_id = self.__reac_ids[source + reac_id] \
+ if source + reac_id in self.__reac_ids else reac_id
+
+ if reac_id not in self.__nodes:
+ properties[':LABEL'] = 'Reaction'
+ properties['id:ID(Reaction)'] = reac_id
+ properties['source'] = source
+ properties[source] = reac_id
+ self.__nodes[reac_id] = properties
+
+ if 'mnx' in properties:
+ self.__reac_ids['mnx' + properties['mnx']] = reac_id
+
+ if 'kegg.reaction' in properties:
+ self.__reac_ids[
+ 'kegg.reaction' + properties['kegg.reaction']] = reac_id
+
+ if 'rhea' in properties:
+ self.__reac_ids['rhea' + properties['rhea']] = reac_id
+ else:
+ self.__nodes[reac_id].update(properties)
+
+ return reac_id
+
+ def add_react_to_enz(self, data, source, num_threads=0):
+ '''Submit data to the graph.'''
+ # Create Reaction and Enzyme nodes:
+ enzyme_ids = self.__create_react_enz(data, source)
+
+ # Create Enzyme nodes:
+ self.__enz_man.add_uniprot_data(enzyme_ids, source, num_threads)
+
+ def __create_react_enz(self, data, source):
+ '''Creates Reaction and Enzyme nodes and their Relationships.'''
+ enzyme_ids = []
+
+ for reac_id, uniprot_ids in data.items():
+ reac_id = self.add_reaction(source, reac_id, {})
+
+ for uniprot_id in uniprot_ids:
+ enzyme_ids.append(uniprot_id)
+ self.__reac_enz_rels.append([reac_id, 'catalysed_by',
+ uniprot_id,
+ {'source': source}])
+
+ return list(set(enzyme_ids))
diff --git a/metanetx_uniprot/rhea_utils.py b/metanetx_uniprot/rhea_utils.py
new file mode 100644
index 00000000..3d926091
--- /dev/null
+++ b/metanetx_uniprot/rhea_utils.py
@@ -0,0 +1,63 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+import tempfile
+from urllib.request import urlretrieve
+
+
+__RHEA_URL = 'ftp://ftp.expasy.org/databases/rhea/tsv/rhea2uniprot%5Fsprot.tsv'
+
+
+def load(reaction_manager, source=__RHEA_URL, num_threads=0):
+ '''Loads Rhea data.'''
+ # Parse data:
+ temp_file = tempfile.NamedTemporaryFile()
+ urlretrieve(source, temp_file.name)
+ data = _parse(temp_file.name)
+ '''
+ ###For testing, uncomment the following code
+ data_small = dict()
+ for key in sorted(data)[:50]:
+ data_small[key] = data[key]
+ data.clear()
+ data.update(data_small)
+ '''
+    # NOTE: the source label is 'rhea' even though the enzyme annotations are fetched from UniProt.
+ reaction_manager.add_react_to_enz(data, 'rhea', num_threads)
+
+
+def _parse(filename):
+ '''Parses file.'''
+ data = {}
+
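+    # rhea2uniprot_sprot.tsv columns (assumed): RHEA_ID, DIRECTION, MASTER_ID,
+    # UNIPROT_ID; both the directional and master Rhea ids map to the enzyme.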
+ with open(filename, 'r') as textfile:
+ next(textfile)
+
+ for line in textfile:
+ tokens = line.split('\t')
+
+ if len(tokens) == 4:
+ uniprot_id = tokens[3].strip()
+
+ if not tokens[0] or not tokens[2]:
+ print(','.join(tokens))
+
+ _add(data, tokens[0], uniprot_id)
+ _add(data, tokens[2], uniprot_id)
+
+ return data
+
+
+def _add(data, rhea_id, uniprot_id):
+ '''Adds Rhea id and Uniprot id to data.'''
+ if rhea_id in data:
+ data[rhea_id].append(uniprot_id)
+ else:
+ data[rhea_id] = [uniprot_id]
diff --git a/metanetx_uniprot/seq_utils.py b/metanetx_uniprot/seq_utils.py
new file mode 100644
index 00000000..892b3a6c
--- /dev/null
+++ b/metanetx_uniprot/seq_utils.py
@@ -0,0 +1,112 @@
+'''
+synbiochem (c) University of Manchester 2015
+
+synbiochem is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+import re
+from urllib import parse
+
+import requests
+
+from synbiochem.utils import thread_utils
+
+def get_uniprot_values(uniprot_ids, fields, batch_size, verbose=False,
+ num_threads=0):
+ '''Gets dictionary of ids to values from Uniprot.'''
+ values = []
+
+ if num_threads:
+ thread_pool = thread_utils.ThreadPool(num_threads)
+
+ for i in range(0, len(uniprot_ids), batch_size):
+ thread_pool.add_task(_get_uniprot_batch, uniprot_ids, i,
+ batch_size, fields, values, verbose)
+
+ thread_pool.wait_completion()
+ else:
+ for i in range(0, len(uniprot_ids), batch_size):
+ _get_uniprot_batch(uniprot_ids, i, batch_size, fields, values,
+ verbose)
+
+ return {value['Entry']: value for value in values}
+
+
+def search_uniprot(query, fields, limit=128):
+ '''Gets dictionary of ids to values from Uniprot.'''
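+    # NOTE: this helper still targets the legacy uniprot.org query API;
+    # _get_uniprot_batch below uses the newer rest.uniprot.org endpoint.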
+ values = []
+
+ url = 'http://www.uniprot.org/uniprot/?query=' + parse.quote(query) + \
+ '&sort=score&limit=' + str(limit) + \
+ '&format=tab&columns=id,' + ','.join([parse.quote(field)
+ for field in fields])
+
+ _parse_uniprot_data(url, values)
+
+ return values
+
+
+def _get_uniprot_batch(uniprot_ids, i, batch_size, fields, values, verbose):
+ '''Get batch of Uniprot data.'''
+ if verbose:
+ print('seq_utils: getting Uniprot values ' + str(i) + ' - ' +
+ str(min(i + batch_size, len(uniprot_ids))) + ' / ' +
+ str(len(uniprot_ids)))
+
+    # To query on raw IDs rather than accessions, remove the 'accession:' prefix from each joined term below.
+ batch = uniprot_ids[i:min(i + batch_size, len(uniprot_ids))]
+ query = '%20OR%20'.join(['accession:' + uniprot_id for uniprot_id in batch])
+ url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \
+ '&format=tsv&fields=accession%2C' + '%2C'.join([parse.quote(field)
+ for field in fields])
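+    # Example of a resulting URL (taken from the accompanying test notebook):
+    # https://rest.uniprot.org/uniprotkb/search?query=accession:B4RBW1%20OR%20
+    # accession:A9BIS7&format=tsv&fields=accession%2Cid%2Cprotein_name%2Corganism_id%2Cec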
+
+ _parse_uniprot_data(url, values)
+
+
+def _parse_uniprot_data(url, values):
+ '''Parses Uniprot data.'''
+ headers = None
+
+ try:
+ resp = requests.get(url, allow_redirects=True)
+
+ for line in resp.iter_lines():
+ line = line.decode('utf-8')
+ tokens = line.strip().split('\t')
+
+ if headers is None:
+ headers = tokens
+        else:
+            row = dict(zip(headers, tokens))
+
+            if 'Protein names' in row:
+                regexp = re.compile(r'(?<=\()[^)]*(?=\))|^[^(][^()]*')
+                names = regexp.findall(row.pop('Protein names'))
+                row['Protein names'] = [nme.strip() for nme in names]
+
+            for key in row:
+                if key.startswith('Cross-reference'):
+                    row[key] = row[key].split(';')
+
+            values.append(row)
+ except Exception as err:
+ print(err)
\ No newline at end of file
diff --git a/metanetx_uniprot/spectra_utils.py b/metanetx_uniprot/spectra_utils.py
new file mode 100644
index 00000000..1efce1bb
--- /dev/null
+++ b/metanetx_uniprot/spectra_utils.py
@@ -0,0 +1,122 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+import os
+import tempfile
+import zipfile
+from urllib.request import urlretrieve
+
+import ijson
+
+
+__MONA_URL = 'http://mona.fiehnlab.ucdavis.edu/rest/downloads/retrieve/' + \
+ 'd2eb33f0-b22e-49a7-bc31-eb951f8347b2'
+
+__MONA_FILENAME = 'MoNA-export-All_Spectra.json'
+
+_NAME_MAP = {'kegg': 'kegg.compound',
+ 'molecular formula': 'formula',
+ 'total exact mass': 'monoisotopic_mass:float'}
+
+
+def load(writer, chem_manager,
+ array_delimiter='|', url=__MONA_URL, filename=__MONA_FILENAME):
+ '''Build Spectrum nodes and relationships.'''
+ nodes = []
+ rels = []
+
+ records = _parse(_get_file(url, filename), array_delimiter)
+
+ for record in records:
+ chem_id, _ = chem_manager.add_chemical(record['chemical'])
+ nodes.append(record['spectrum'])
+ rels.append([chem_id, 'has', record['spectrum']['id:ID(Spectrum)']])
+
+ return [writer.write_nodes(nodes, 'Spectrum')], \
+ [writer.write_rels(rels, 'Chemical', 'Spectrum')]
+
+
+def _parse(filename, array_delimiter):
+ '''Parses MoNA json file.'''
+ records = []
+ record = {'chemical': {'names:string[]': []},
+ 'spectrum': {':LABEL': 'Spectrum', 'tags:string[]': []}}
+ name = None
+
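+    # ijson.parse streams the large MoNA JSON export as (prefix, event, value)
+    # triples, so the whole file never has to be held in memory.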
+ for prefix, typ, value in ijson.parse(open(filename)):
+ if prefix == 'item' and typ == 'start_map':
+ record = {'chemical': {'names:string[]': []},
+ 'spectrum': {':LABEL': 'Spectrum',
+ 'tags:string[]': []}}
+ elif prefix == 'item.compound.item.inchi':
+ record['chemical']['inchi'] = value
+ elif prefix == 'item.compound.item.names.item.name':
+ if 'name' not in record['chemical']:
+ record['chemical']['name'] = value
+ record['chemical']['names:string[]'].append(value)
+ elif prefix == 'item.compound.item.metaData.item.name' or \
+ prefix == 'item.metaData.item.name':
+ name = _normalise_name(value.lower())
+ elif prefix == 'item.compound.item.metaData.item.value':
+ _parse_compound_metadata(name, value, record)
+ name = None
+ elif prefix == 'item.id':
+ record['spectrum']['id:ID(Spectrum)'] = value
+ elif prefix == 'item.metaData.item.value':
+ record['spectrum'][name] = value
+ name = None
+ elif prefix == 'item.spectrum':
+ values = [float(val) for term in value.split()
+ for val in term.split(':')]
+ record['spectrum']['m/z:float[]'] = \
+ array_delimiter.join(map(str, values[0::2]))
+ record['spectrum']['I:float[]'] = \
+ array_delimiter.join(map(str, values[1::2]))
+ elif prefix == 'item.tags.item.text':
+ record['spectrum']['tags:string[]'].append(value)
+ elif prefix == 'item' and typ == 'end_map':
+ records.append(record)
+
+ return records
+
+
+def _get_file(url, filename):
+ '''Gets file from url.'''
+ destination = os.path.join(os.path.expanduser('~'), 'MoNA')
+
+ if not os.path.exists(destination):
+ os.makedirs(destination)
+
+ filepath = os.path.join(destination, filename)
+
+ if not os.path.exists(filepath):
+ tmp_file = tempfile.NamedTemporaryFile(delete=False)
+ urlretrieve(url, tmp_file.name)
+ zfile = zipfile.ZipFile(tmp_file.name, 'r')
+ filepath = os.path.join(destination, zfile.namelist()[0])
+ zfile.extractall(destination)
+
+ return filepath
+
+
+def _parse_compound_metadata(name, value, record):
+ '''Parses compound metadata.'''
+ if name == 'chebi' and isinstance(value, str):
+ value = value.replace('CHEBI:', '').split()[0]
+
+ record['chemical'][_normalise_name(name)] = value
+
+
+def _normalise_name(name):
+ '''Normalises name in name:value pairs.'''
+ if name in _NAME_MAP:
+ return _NAME_MAP[name]
+
+ return name.replace(':', '_')
diff --git a/metanetx_uniprot/test/__init__.py b/metanetx_uniprot/test/__init__.py
new file mode 100644
index 00000000..e0aa1f5e
--- /dev/null
+++ b/metanetx_uniprot/test/__init__.py
@@ -0,0 +1,9 @@
+'''
+synbiochem (c) University of Manchester 2015
+
+synbiochem is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
diff --git a/metanetx_uniprot/test/test_enzyme_utils.py b/metanetx_uniprot/test/test_enzyme_utils.py
new file mode 100644
index 00000000..c0318f65
--- /dev/null
+++ b/metanetx_uniprot/test/test_enzyme_utils.py
@@ -0,0 +1,39 @@
+'''
+synbiochem (c) University of Manchester 2015
+
+synbiochem is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+# pylint: disable=too-many-public-methods
+import unittest
+
+from enzyme_utils import EnzymeManager
+
+
+class TestEnzymeManager(unittest.TestCase):
+ '''Test class for EnzymeManager.'''
+
+ def setUp(self):
+ unittest.TestCase.setUp(self)
+ self.__manager = EnzymeManager()
+
+ def test_add_uniprot_data(self):
+ '''Tests add_uniprot_data method.'''
+ enzyme_ids = ['P19367', 'Q2KNB7']
+
+ # Test unthreaded:
+ self.__manager.add_uniprot_data(enzyme_ids, source='test')
+ self.assertEqual(len(enzyme_ids), len(self.__manager.get_nodes()))
+
+ # Test threaded:
+ self.__manager.add_uniprot_data(enzyme_ids, source='test',
+ num_threads=24)
+ self.assertEqual(len(enzyme_ids), len(self.__manager.get_nodes()))
+
+
+if __name__ == "__main__":
+ # import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
diff --git a/metanetx_uniprot/test/test_mnxref_utils.py b/metanetx_uniprot/test/test_mnxref_utils.py
new file mode 100644
index 00000000..88a3da23
--- /dev/null
+++ b/metanetx_uniprot/test/test_mnxref_utils.py
@@ -0,0 +1,37 @@
+'''
+synbiochem (c) University of Manchester 2015
+
+synbiochem is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+# pylint: disable=too-many-public-methods
+import unittest
+
+from mnxref_utils import MnxRefReader
+
+
+class TestMnxRefReader(unittest.TestCase):
+ '''Test class for MnxRefReader.'''
+
+ def setUp(self):
+ unittest.TestCase.setUp(self)
+ reader = MnxRefReader()
+ self.__chem_data = reader.get_chem_data()
+ self.__reac_data = reader.get_reac_data()
+
+ def test_get_chem_data(self):
+ '''Tests get_chem_data method.'''
+        self.assertEqual(self.__chem_data['MNXM1354']['chebi'], 'CHEBI:58282')
+
+ def test_get_reac_data(self):
+ '''Tests get_chem_data method.'''
+ eqn = '1 MNXM1 + 1 MNXM6 + 1 MNXM97401 = 1 MNXM5 + 1 MNXM97393'
+        self.assertEqual(self.__reac_data['MNXR62989']['equation'], eqn)
+
+
+if __name__ == "__main__":
+ # import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
diff --git a/metanetx_uniprot/utils.py b/metanetx_uniprot/utils.py
new file mode 100644
index 00000000..67639e71
--- /dev/null
+++ b/metanetx_uniprot/utils.py
@@ -0,0 +1,73 @@
+'''
+synbiochem (c) University of Manchester 2016
+
+synbiochem is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+# pylint: disable=invalid-name
+# pylint: disable=too-many-arguments
+import os
+from shutil import rmtree
+
+import pandas as pd
+
+
+class Writer(object):
+ '''CSV file writer class for biochem4j files.'''
+
+ def __init__(self, dest_dir):
+ self.__nodes_dir = os.path.join(os.path.abspath(dest_dir), 'nodes')
+ self.__rels_dir = os.path.join(os.path.abspath(dest_dir), 'rels')
+
+ if os.path.exists(self.__nodes_dir):
+ rmtree(self.__nodes_dir)
+
+ os.makedirs(self.__nodes_dir)
+
+ if os.path.exists(self.__rels_dir):
+ rmtree(self.__rels_dir)
+
+ os.makedirs(self.__rels_dir)
+
+ def write_nodes(self, nodes, group, separator=';'):
+ '''Writes Nodes to csv file.'''
+ if not nodes:
+ return None
+
+ df = pd.DataFrame(nodes)
+ df.dropna(axis=1, how='all', inplace=True)
+
+ filename = os.path.join(self.__nodes_dir, group + '.csv')
+ df.to_csv(filename, index=False, encoding='utf-8', sep=separator)
+        print('just wrote:', filename)
+
+ return filename
+
+ def write_rels(self, rels, group_start, group_end, separator=';'):
+ '''Writes Relationships to csv file.'''
+ if not rels:
+ return None
+
+ columns = [':START_ID(' + group_start + ')',
+ ':TYPE',
+ ':END_ID(' + group_end + ')']
+
+ if len(rels[0]) > 3:
+ columns.append('PROPERTIES')
+
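+        # A trailing per-relationship dict (e.g. {'source': 'rhea'}) is split
+        # out of the PROPERTIES column into ordinary columns below.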
+ df = pd.DataFrame(rels, columns=columns)
+
+ if len(rels[0]) > 3:
+ props_df = pd.DataFrame(list(df['PROPERTIES']))
+ df.drop('PROPERTIES', axis=1, inplace=True)
+ df = df.join(props_df)
+
+ filename = os.path.join(self.__rels_dir,
+ group_start + '_' + group_end + '.csv')
+ df.to_csv(filename, index=False, encoding='utf-8', sep=separator)
+        print('just wrote:', filename)
+
+ return filename
From 2ca61d9a57af567a5c9e91243a2d9202e2039dc4 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Thu, 16 Mar 2023 19:11:46 -0600
Subject: [PATCH 03/29] Update README.md
---
metanetx_uniprot/README.md | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/metanetx_uniprot/README.md b/metanetx_uniprot/README.md
index 1165c3cc..31028033 100644
--- a/metanetx_uniprot/README.md
+++ b/metanetx_uniprot/README.md
@@ -1 +1,16 @@
+# MetaNetX and UniProt Content
+
Code is reused from Biochem4j: https://github.com/neilswainston/biochem4j/tree/master/sbcdb
+
+Access chemical, reaction, enzyme, and organism information from the following sources:
+- libchebipy
+- NCBITaxonomy
+- MetaNetX
+- Rhea
+- UniProt
+
+To run:
+
+```
+python build.py ~/biochem4j ',' 1
+```
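+
+The three positional arguments to `build.py` are: the destination directory
+for the generated `nodes/` and `rels/` CSV files, the delimiter used for
+array-valued properties, and the number of threads (an integer, or `True` to
+use all available cores).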
From 4f7e473d108915308ca65117b9acc69856d43570 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Thu, 16 Mar 2023 19:12:24 -0600
Subject: [PATCH 04/29] Update build.py
---
metanetx_uniprot/build.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/metanetx_uniprot/build.py b/metanetx_uniprot/build.py
index c276e641..4376ea1a 100644
--- a/metanetx_uniprot/build.py
+++ b/metanetx_uniprot/build.py
@@ -20,7 +20,7 @@ def build_csv(dest_dir, array_delimiter, num_threads):
 
     # Get Organism data:
     print('Parsing NCBI Taxonomy')
-    #ncbi_taxonomy_utils.load(writer, array_delimiter)
+    ncbi_taxonomy_utils.load(writer, array_delimiter)
 
     # Get Chemical and Reaction data.
     # Write chemistry csv files:
@@ -28,19 +28,19 @@ def build_csv(dest_dir, array_delimiter, num_threads):
     reac_man = reaction_utils.ReactionManager()
 
 
-    #print('Parsing MNXref')
+    print('Parsing MNXref')
     mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer)
     mnx_loader.load()
 
     print('Parsing ChEBI')
-    #chebi_utils.load(chem_man, writer)
+    chebi_utils.load(chem_man, writer)
 
     ####Using all memory (120+Gb) and eventually is killed
     # Get Spectrum data:
     #print('Parsing spectrum data')
     #spectra_utils.load(writer, chem_man, array_delimiter=array_delimiter)
 
-    #chem_man.write_files(writer)
+    chem_man.write_files(writer)
 
     ####Not including KEGG for now
     # Get Reaction / Enzyme / Organism data:
@@ -49,8 +49,8 @@ def build_csv(dest_dir, array_delimiter, num_threads):
 
 
     print('Parsing Rhea')
-    #rhea_utils.load(reac_man, num_threads=num_threads)
-    #reac_man.write_files(writer)
+    rhea_utils.load(reac_man, num_threads=num_threads)
+    reac_man.write_files(writer)
 
 
 def main(args):
From fe005b52ba3fa341be8fd6bd5d01bac2f0ae3865 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Thu, 16 Mar 2023 19:13:45 -0600
Subject: [PATCH 05/29] Add files via upload
---
notebooks/Uniprot_API_test.ipynb | 303 +++++++++++++++++++++++++++++++
1 file changed, 303 insertions(+)
create mode 100644 notebooks/Uniprot_API_test.ipynb
diff --git a/notebooks/Uniprot_API_test.ipynb b/notebooks/Uniprot_API_test.ipynb
new file mode 100644
index 00000000..c5eec384
--- /dev/null
+++ b/notebooks/Uniprot_API_test.ipynb
@@ -0,0 +1,303 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "id": "underlying-necessity",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "'''\n",
+ "class ReactionManager(object):\n",
+ " #Class to implement a manager of Reaction data.\n",
+ "\n",
+ " def __init__(self):\n",
+ " #Constructor.\n",
+ " self.__nodes = {}\n",
+ " self.__reac_ids = {}\n",
+ " self.__reac_enz_rels = []\n",
+ " self.__org_enz_rels = []\n",
+ " self.__enz_man = EnzymeManager()\n",
+ "'''\n",
+ "\n",
+ "\n",
+ "def add_uniprot_data(enzyme_ids, source, num_threads=0):\n",
+ " print(enzyme_ids)\n",
+ " '''Gets Uniprot data.'''\n",
+ "\n",
+ " #fields = ['entry name', 'protein names', 'organism-id', 'ec']\n",
+ " fields = ['id', 'protein_name', 'organism_id', 'ec']\n",
+ " #enzyme_ids = [enzyme_id for enzyme_id in enzyme_ids if enzyme_id not in self.__nodes]\n",
+ " uniprot_values = get_uniprot_values(enzyme_ids, fields,batch_size=128,verbose=False,num_threads=num_threads)\n",
+ "\n",
+ " print('add_uniprot_data function: added uniprot values: ',len(uniprot_values))\n",
+ "\n",
+ "\n",
+ "\n",
+ " for uniprot_id, uniprot_value in uniprot_values.items():\n",
+ " enzyme_node = {':LABEL': 'Enzyme',\n",
+ " 'uniprot:ID(Enzyme)': uniprot_id}\n",
+ " #self.__nodes[uniprot_id] = enzyme_node\n",
+ "\n",
+ " organism_id = uniprot_value.pop('Organism (ID)') \\\n",
+ " if 'Organism (ID)' in uniprot_value else None\n",
+ "\n",
+ " if 'Entry name' in uniprot_value:\n",
+ " enzyme_node['entry'] = uniprot_value['Entry name']\n",
+ "\n",
+ " if 'Protein names' in uniprot_value:\n",
+ " enzyme_node['names'] = uniprot_value['Protein names']\n",
+ "\n",
+ " if enzyme_node['names']:\n",
+ " enzyme_node['name'] = enzyme_node['names'][0]\n",
+ "\n",
+ " if 'EC number' in uniprot_value:\n",
+ " enzyme_node['ec-code'] = uniprot_value['EC number']\n",
+ "\n",
+ " #if organism_id:\n",
+ " #self.__org_enz_rels.append([organism_id, 'expresses',uniprot_id, {'source': source}])\n",
+ " \n",
+ "\n",
+ "def get_uniprot_values(uniprot_ids, fields, batch_size, verbose=False, num_threads=0):\n",
+ " values = []\n",
+ "\n",
+ " if num_threads:\n",
+ " thread_pool = thread_utils.ThreadPool(num_threads)\n",
+ "\n",
+ " for i in range(0, len(uniprot_ids), batch_size):\n",
+ " thread_pool.add_task(_get_uniprot_batch, uniprot_ids, i,batch_size, fields, values, verbose)\n",
+ "\n",
+ " thread_pool.wait_completion()\n",
+ " else:\n",
+ " for i in range(0, len(uniprot_ids), batch_size):\n",
+ " _get_uniprot_batch(uniprot_ids, i, batch_size, fields, values,verbose)\n",
+ "\n",
+ " return {value['Entry']: value for value in values}\n",
+ "\n",
+ "\n",
+ "\n",
+ "def _get_uniprot_batch(uniprot_ids, i, batch_size, fields, values, verbose):\n",
+ " '''Get batch of Uniprot data.'''\n",
+ " if verbose:\n",
+ " print('seq_utils: getting Uniprot values ' + str(i) + ' - ' +\n",
+ " str(min(i + batch_size, len(uniprot_ids))) + ' / ' +\n",
+ " str(len(uniprot_ids)))\n",
+ "\n",
+ " #If getting values in batch Remove 'accession:' + from start of join([HERE .....]) and accession: from query=HERE\n",
+ " batch = uniprot_ids[i:min(i + batch_size, len(uniprot_ids))]\n",
+ " query = '%20OR%20'.join(['accession:' + uniprot_id for uniprot_id in batch])\n",
+ " url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \\\n",
+ " '&format=tsv&fields=accession%2C' + '%2C'.join([parse.quote(field)\n",
+ " for field in fields])\n",
+ "\n",
+ " print(url)\n",
+ "\n",
+ " _parse_uniprot_data(url, values)\n",
+ " \n",
+ " \n",
+ "def _parse_uniprot_data(url, values):\n",
+ " '''Parses Uniprot data.'''\n",
+ " headers = None\n",
+ "\n",
+ " try:\n",
+ " resp = requests.get(url, allow_redirects=True)\n",
+ "\n",
+ " for line in resp.iter_lines():\n",
+ " line = line.decode('utf-8')\n",
+ " tokens = line.strip().split('\\t')\n",
+ "\n",
+ " if headers is None:\n",
+ " headers = tokens\n",
+ " else:\n",
+ " resp = dict(zip(headers, tokens))\n",
+ "\n",
+ " if 'Protein names' in resp:\n",
+ " regexp = re.compile(r'(?<=\\()[^)]*(?=\\))|^[^(][^()]*')\n",
+ " names = regexp.findall(resp.pop('Protein names'))\n",
+ " resp['Protein names'] = [nme.strip() for nme in names]\n",
+ "\n",
+ " for key in resp:\n",
+ " if key.startswith('Cross-reference'):\n",
+ " resp[key] = resp[key].split(';')\n",
+ " values.append(resp)\n",
+ " print('values from parse_uniprot_data: ',type(values))\n",
+ " return values\n",
+ " except Exception as err:\n",
+ " print(err)\n",
+ " \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "russian-dispatch",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['B4RBW1', 'A9BIS7', 'B5Z3E3']\n",
+ "https://rest.uniprot.org/uniprotkb/search?query=accession:B4RBW1%20OR%20accession:A9BIS7%20OR%20accession:B5Z3E3&format=tsv&fields=accession%2Cid%2Cprotein_name%2Corganism_id%2Cec\n",
+ "add_uniprot_data function: added uniprot values: 3\n"
+ ]
+ }
+ ],
+ "source": [
+ "### Query by protein ID\n",
+ "\n",
+ "\n",
+ "from urllib import parse\n",
+ "import requests\n",
+ "import re\n",
+ "\n",
+ "\n",
+ "num_threads = 1\n",
+ "source = 'rhea'\n",
+ "enzyme_ids = ['B4RBW1', 'A9BIS7', 'B5Z3E3']\n",
+ "\n",
+ "add_uniprot_data(enzyme_ids, source)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "id": "removable-gibraltar",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "#Download then work with it\n",
+ "\n",
+ "def add_uniprot_data_organism(organism_ids, source, num_threads=0):\n",
+ " print(organism_ids)\n",
+ " '''Gets Uniprot data.'''\n",
+ "\n",
+ " #fields = ['entry name', 'protein names', 'organism-id', 'ec']\n",
+ " fields = ['id', 'protein_name', 'organism_id', 'ec']\n",
+ " #enzyme_ids = [enzyme_id for enzyme_id in enzyme_ids if enzyme_id not in self.__nodes]\n",
+ " organism_values = get_uniprot_values_organism(organism_ids, fields,batch_size=128,verbose=False,num_threads=num_threads)\n",
+ "\n",
+ " print('add_uniprot_data function: added uniprot values: ',len(organism_values))\n",
+ "\n",
+ "\n",
+ "\n",
+ " for uniprot_id, uniprot_value in organism_values.items():\n",
+ " enzyme_node = {':LABEL': 'Enzyme',\n",
+ " 'uniprot:ID(Enzyme)': uniprot_id}\n",
+ " #self.__nodes[uniprot_id] = enzyme_node\n",
+ "\n",
+ " organism_id = uniprot_value.pop('Organism (ID)') \\\n",
+ " if 'Organism (ID)' in uniprot_value else None\n",
+ "\n",
+ " if 'Entry name' in uniprot_value:\n",
+ " enzyme_node['entry'] = uniprot_value['Entry name']\n",
+ "\n",
+ " if 'Protein names' in uniprot_value:\n",
+ " enzyme_node['names'] = uniprot_value['Protein names']\n",
+ "\n",
+ " if enzyme_node['names']:\n",
+ " enzyme_node['name'] = enzyme_node['names'][0]\n",
+ "\n",
+ " if 'EC number' in uniprot_value:\n",
+ " enzyme_node['ec-code'] = uniprot_value['EC number']\n",
+ "\n",
+ " #if organism_id:\n",
+ " #self.__org_enz_rels.append([organism_id, 'expresses',uniprot_id, {'source': source}])\n",
+ " \n",
+ " return organism_values\n",
+ "\n",
+ "def get_uniprot_values_organism(organism_ids, fields, batch_size, verbose=False, num_threads=0):\n",
+ " values = []\n",
+ "\n",
+ " for i in range(0, len(organism_ids), batch_size):\n",
+ " values = _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values,verbose)\n",
+ "\n",
+ " return {value['Organism (ID)']: value for value in values}\n",
+ "\n",
+ "\n",
+ "def _get_uniprot_batch_organism(uniprot_ids, i, batch_size, fields, values, verbose):\n",
+ " '''Get batch of Uniprot data.'''\n",
+ " if verbose:\n",
+ " print('seq_utils: getting Uniprot values ' + str(i) + ' - ' +\n",
+ " str(min(i + batch_size, len(uniprot_ids))) + ' / ' +\n",
+ " str(len(uniprot_ids)))\n",
+ "\n",
+    "    # Same batching pattern as the accession query above, but each term is prefixed with 'organism_id:' instead\n",
+ " batch = uniprot_ids[i:min(i + batch_size, len(uniprot_ids))]\n",
+ " query = '%20OR%20'.join(['organism_id:' + uniprot_id for uniprot_id in batch])\n",
+ " url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \\\n",
+ " '&format=tsv&fields=organism_id%2C' + '%2C'.join([parse.quote(field)\n",
+ " for field in fields])\n",
+ "\n",
+ " print('_get_uniprot_batch_organism url: ',url)\n",
+ "\n",
+ " values = _parse_uniprot_data(url, values)\n",
+ " return values\n",
+ " \n",
+ " \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "id": "removed-unemployment",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['226900', '296591']\n",
+ "_get_uniprot_batch_organism url: https://rest.uniprot.org/uniprotkb/search?query=organism_id:226900%20OR%20organism_id:296591&format=tsv&fields=organism_id%2Cid%2Cprotein_name%2Corganism_id%2Cec\n",
+    "values from parse_uniprot_data:  <class 'list'>\n",
+ "add_uniprot_data function: added uniprot values: 2\n",
+ "{'226900': {'Entry Name': 'GLMU_BACCR', 'EC number': '2.3.1.157; 2.7.7.23', 'Protein names': ['Bifunctional protein GlmU [Includes: UDP-N-acetylglucosamine pyrophosphorylase', 'EC 2.7.7.23', 'N-acetylglucosamine-1-phosphate uridyltransferase', 'EC 2.3.1.157']}, '296591': {'Entry Name': 'RLMN_POLSJ', 'EC number': '2.1.1.192', 'Protein names': ['Dual-specificity RNA methyltransferase RlmN', 'EC 2.1.1.192', '23S rRNA (adenine(2503', '2', '23S rRNA m2A2503 methyltransferase', 'Ribosomal RNA large subunit methyltransferase N', 'tRNA (adenine(37', '2', 'tRNA m2A37 methyltransferase']}}\n"
+ ]
+ }
+ ],
+ "source": [
+ "### Query by organism ID\n",
+ "\n",
+ "#query = 'https://rest.uniprot.org/uniprotkb/search?query=organism_id:226900'\n",
+ "\n",
+ "\n",
+ "source = 'rhea'\n",
+ "organism_ids = ['226900','296591']\n",
+ "\n",
+ "organism_values = add_uniprot_data_organism(organism_ids, source)\n",
+ "\n",
+ "print(organism_values)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "pleased-coaching",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
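
The cells above batch UniProt REST queries by ORing one 'accession:<id>' term per entry. A minimal standalone sketch of the same pattern (endpoint and field names as in the cells; the accessions are illustrative):

    from urllib import parse
    import requests

    def build_query_url(ids, prefix, fields):
        '''OR together one 'prefix:id' term per id and request TSV output.'''
        query = '%20OR%20'.join([prefix + ':' + i for i in ids])
        return ('https://rest.uniprot.org/uniprotkb/search?query=' + query +
                '&format=tsv&fields=' + '%2C'.join([parse.quote(f) for f in fields]))

    url = build_query_url(['B4RBW1', 'A9BIS7'], 'accession',
                          ['accession', 'id', 'protein_name', 'organism_id', 'ec'])
    resp = requests.get(url)  # TSV: one header row, then one row per entry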
From e3cf90a5251aef3950ddcdd63fc1a307040f3281 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Mon, 20 Mar 2023 16:42:09 -0600
Subject: [PATCH 06/29] Update ncbi_taxonomy_utils.py
Update to use ncbi_taxon input from kg-microbe
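
The conversion leans on kgx's transform(), called exactly as in the hunk below; it reads the OBO-JSON dump and emits <output>_nodes.tsv and <output>_edges.tsv. A sketch with placeholder paths:

    import os
    from kgx.cli.cli_utils import transform

    out = os.path.join('/tmp/biochem4j', 'ncbitaxon_transformed')
    transform(inputs=['ncbitaxon.json'], input_format='obojson',
              output=out, output_format='tsv')
    # produces /tmp/biochem4j/ncbitaxon_transformed_nodes.tsv and ..._edges.tsv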
---
metanetx_uniprot/ncbi_taxonomy_utils.py | 49 +++++++++++++++++++++++--
1 file changed, 46 insertions(+), 3 deletions(-)
diff --git a/metanetx_uniprot/ncbi_taxonomy_utils.py b/metanetx_uniprot/ncbi_taxonomy_utils.py
index 8b7bd1d4..5fbc603f 100644
--- a/metanetx_uniprot/ncbi_taxonomy_utils.py
+++ b/metanetx_uniprot/ncbi_taxonomy_utils.py
@@ -14,15 +14,23 @@
import urllib
from urllib.request import urlretrieve
+from kgx.cli.cli_utils import transform
+import pandas as pd
+
__NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
def load(writer, array_delimiter, source=__NCBITAXONOMY_URL):
'''Loads NCBI Taxonomy data.'''
- nodes_filename, names_filename = _get_ncbi_taxonomy_files(source)
- nodes, rels = _parse_nodes(nodes_filename, array_delimiter)
- _parse_names(nodes, names_filename, array_delimiter)
+ #nodes_filename, names_filename = _get_ncbi_taxonomy_files(source)
+ ####Update filepath accordingly
+ nodes_filename = '~/kg_microbe/kg-microbe/data/raw/ncbitaxon.json'
+ #nodes, rels = _parse_nodes(nodes_filename, array_delimiter)
+ print('parsing ncbi taxon json file')
+ kgx_nodes_json,kgx_edges_json = _parse_nodes_kgmicrobe(nodes_filename, array_delimiter)
+ nodes,rels = transform_kgx_output_format(kgx_nodes_json,kgx_edges_json)
+ #_parse_names(nodes, names_filename, array_delimiter)
writer.write_nodes(nodes.values(), 'Organism')
writer.write_rels(rels, 'Organism', 'Organism')
@@ -43,6 +51,40 @@ def _get_ncbi_taxonomy_files(source):
return os.path.join(temp_dir, 'nodes.dmp'), \
os.path.join(temp_dir, 'names.dmp')
+def _parse_nodes_kgmicrobe(filename, array_delimiter):
+ '''Parses nodes file.'''
+
+ ####Update filepath accordingly
+ output_dir = '~/biochem4j/'
+ name = 'ncbitaxon_transformed'
+
+ transform(inputs=[filename], input_format='obojson', output= os.path.join(output_dir, name), output_format='tsv')
+
+ return output_dir+name+'_nodes.tsv',output_dir+name+'_edges.tsv'
+
+def transform_kgx_output_format(transformed_nodes_tsv,transformed_edges_tsv):
+
+ labels = pd.read_csv(transformed_nodes_tsv, sep = '\t', usecols = ['id','name'])
+ triples_df = pd.read_csv(transformed_edges_tsv,sep = '\t', usecols = ['subject', 'object', 'predicate'])
+ triples_df.columns = triples_df.columns.str.lower()  # assign the result; .str.lower() alone is a no-op
+
+ nodes = {}
+ rels = []
+
+ for i in range(len(labels)):
+ tax_id = labels.iloc[i].loc['id']
+ nodes[tax_id] = {'taxonomy:ID(Organism)': tax_id,
+ ':LABEL':
+ 'Organism,unknown'}
+
+ for i in range(len(triples_df)):
+ s = triples_df.iloc[i].loc['subject']
+ p = triples_df.iloc[i].loc['predicate']
+ o = triples_df.iloc[i].loc['object']
+ rels.append([s, p, o])
+
+ return nodes,rels
+
def _parse_nodes(filename, array_delimiter):
'''Parses nodes file.'''
@@ -61,6 +103,7 @@ def _parse_nodes(filename, array_delimiter):
':LABEL':
'Organism' + array_delimiter + tokens[2]}
+ print(list(nodes.values())[0:5])
return nodes, rels
From 32b80d9d646d81481738dcbe62d6b2cad547fbf1 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Wed, 22 Mar 2023 16:08:06 -0600
Subject: [PATCH 07/29] Update build.py
Update to ingest all uniprot/rhea relationships based on kg-microbe microbes
- comment out all other code (for now)
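
After this change the pipeline reduces to the following (condensed from the hunk below; dest_dir and array_delimiter are placeholders):

    import utils, reaction_utils, ncbi_taxonomy_utils
    dest_dir, array_delimiter = 'out', '|'
    writer = utils.Writer(dest_dir)
    reac_man = reaction_utils.ReactionManager()  # created first, handed to the loader
    ncbi_taxonomy_utils.load(reac_man, writer, array_delimiter)
    # MNXref, ChEBI, spectra and chemistry CSV output stay commented out for now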
---
metanetx_uniprot/build.py | 22 +++++++++++++---------
1 file changed, 13 insertions(+), 9 deletions(-)
diff --git a/metanetx_uniprot/build.py b/metanetx_uniprot/build.py
index 4376ea1a..3fce3299 100644
--- a/metanetx_uniprot/build.py
+++ b/metanetx_uniprot/build.py
@@ -17,30 +17,34 @@
def build_csv(dest_dir, array_delimiter, num_threads):
'''Build database CSV files.'''
writer = utils.Writer(dest_dir)
+
+ reac_man = reaction_utils.ReactionManager()
# Get Organism data:
print('Parsing NCBI Taxonomy')
- ncbi_taxonomy_utils.load(writer, array_delimiter)
+ ncbi_taxonomy_utils.load(reac_man, writer, array_delimiter)
+
# Get Chemical and Reaction data.
# Write chemistry csv files:
- chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter)
- reac_man = reaction_utils.ReactionManager()
+ #chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter)
+ # May be duplicate line
+ #reac_man = reaction_utils.ReactionManager()
- print('Parsing MNXref')
- mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer)
- mnx_loader.load()
+ #print('Parsing MNXref')
+ #mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer)
+ #mnx_loader.load()
- print('Parsing ChEBI')
- chebi_utils.load(chem_man, writer)
+ #print('Parsing ChEBI')
+ #chebi_utils.load(chem_man, writer)
####Using all memory (120+Gb) and eventually is killed
# Get Spectrum data:
#print('Parsing spectrum data')
#spectra_utils.load(writer, chem_man, array_delimiter=array_delimiter)
- chem_man.write_files(writer)
+ #chem_man.write_files(writer)
####Not including KEGG for now
# Get Reaction / Enzyme / Organism data:
From 008e24e8cdc8449113d72fbcbd0632f241a5f930 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Wed, 22 Mar 2023 16:09:49 -0600
Subject: [PATCH 08/29] Update ncbi_taxonomy_utils.py
Update to ingest all uniprot/rhea relationships based on kg-microbe microbes
- Use kg-microbe NCBITaxon.json input rather than ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
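
The transformed TSVs are reshaped into the loader's in-memory format; from the hunks below, the shapes are roughly (taxon ids and predicate illustrative):

    nodes = {'562': {'taxonomy:ID(Organism)': '562',
                     ':LABEL': 'Organism,unknown'}}
    rels = [['NCBITaxon:562', 'biolink:subclass_of', 'NCBITaxon:561']]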
---
metanetx_uniprot/ncbi_taxonomy_utils.py | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/metanetx_uniprot/ncbi_taxonomy_utils.py b/metanetx_uniprot/ncbi_taxonomy_utils.py
index 5fbc603f..f5ffe788 100644
--- a/metanetx_uniprot/ncbi_taxonomy_utils.py
+++ b/metanetx_uniprot/ncbi_taxonomy_utils.py
@@ -21,20 +21,24 @@
__NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
-def load(writer, array_delimiter, source=__NCBITAXONOMY_URL):
+def load(reaction_manager, writer, array_delimiter, source=__NCBITAXONOMY_URL):
'''Loads NCBI Taxonomy data.'''
+ #Not used currently
#nodes_filename, names_filename = _get_ncbi_taxonomy_files(source)
- ####Update filepath accordingly
- nodes_filename = '~/kg_microbe/kg-microbe/data/raw/ncbitaxon.json'
#nodes, rels = _parse_nodes(nodes_filename, array_delimiter)
+ #_parse_names(nodes, names_filename, array_delimiter)
+ #######
+ nodes_filename = '/Users/brooksantangelo/Documents/HunterLab/Exploration/kg_microbe/kg-microbe/data/raw/ncbitaxon.json'
print('parsing ncbi taxon json file')
kgx_nodes_json,kgx_edges_json = _parse_nodes_kgmicrobe(nodes_filename, array_delimiter)
nodes,rels = transform_kgx_output_format(kgx_nodes_json,kgx_edges_json)
- #_parse_names(nodes, names_filename, array_delimiter)
writer.write_nodes(nodes.values(), 'Organism')
writer.write_rels(rels, 'Organism', 'Organism')
+ print('adding organism-enzyme relationships')
+ reaction_manager.add_org_to_enz(nodes, 'uniprot')
+
def _get_ncbi_taxonomy_files(source):
'''Downloads and extracts NCBI Taxonomy files.'''
@@ -54,8 +58,7 @@ def _get_ncbi_taxonomy_files(source):
def _parse_nodes_kgmicrobe(filename, array_delimiter):
'''Parses nodes file.'''
- ####Update filepath accordingly
- output_dir = '~/biochem4j/'
+ output_dir = '/Users/brooksantangelo/Documents/HunterLab/biochem4j/biochem4j/'
name = 'ncbitaxon_transformed'
transform(inputs=[filename], input_format='obojson', output= os.path.join(output_dir, name), output_format='tsv')
@@ -72,7 +75,7 @@ def transform_kgx_output_format(transformed_nodes_tsv,transformed_edges_tsv):
rels = []
for i in range(len(labels)):
- tax_id = labels.iloc[i].loc['id']
+ tax_id = labels.iloc[i].loc['id'].split('NCBITaxon:')[1]
nodes[tax_id] = {'taxonomy:ID(Organism)': tax_id,
':LABEL':
'Organism,unknown'}
@@ -127,6 +130,7 @@ def _parse_names(nodes, filename, array_delimiter):
array_delimiter.join(node['names:string[]'])
+
def main(argv):
'''main method'''
load(*argv)
From 89bd3325e1f67fd0b735a581a09466662a087f03 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Wed, 22 Mar 2023 16:11:02 -0600
Subject: [PATCH 09/29] Update reaction_utils.py
Update to ingest all uniprot/rhea relationships based on kg-microbe microbes
- output Enzyme_Reaction.tsv and Organism_Enzyme.tsv based on kg-microbe nodes --> UniProt enzymes --> Rhea reactions
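
Relationships are stored as [subject, predicate, object, properties] rows before being written out; a single Enzyme_Reaction row appended by __create_enz_react looks like (ids illustrative):

    enz_reac_rel = ['50004', 'catalysed_by', 'Q01911', {'source': 'rhea'}]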
---
metanetx_uniprot/reaction_utils.py | 46 +++++++++++++++++++++++++++++-
1 file changed, 45 insertions(+), 1 deletion(-)
diff --git a/metanetx_uniprot/reaction_utils.py b/metanetx_uniprot/reaction_utils.py
index 2a6d9394..058875eb 100644
--- a/metanetx_uniprot/reaction_utils.py
+++ b/metanetx_uniprot/reaction_utils.py
@@ -9,6 +9,8 @@
'''
from enzyme_utils import EnzymeManager
+from numpy import unique
+
class ReactionManager(object):
'''Class to implement a manager of Reaction data.'''
@@ -18,6 +20,7 @@ def __init__(self):
self.__nodes = {}
self.__reac_ids = {}
self.__reac_enz_rels = []
+ self.__enz_reac_rels = []
self.__org_enz_rels = []
self.__enz_man = EnzymeManager()
@@ -29,6 +32,8 @@ def write_files(self, writer):
'Enzyme')],
[writer.write_rels(self.__reac_enz_rels,
'Reaction', 'Enzyme'),
+ writer.write_rels(self.__enz_reac_rels,
+ 'Enzyme', 'Reaction'),
writer.write_rels(self.__enz_man.get_org_enz_rels(),
'Organism', 'Enzyme')])
@@ -64,7 +69,12 @@ def add_react_to_enz(self, data, source, num_threads=0):
enzyme_ids = self.__create_react_enz(data, source)
# Create Enzyme nodes:
- self.__enz_man.add_uniprot_data(enzyme_ids, source, num_threads)
+ self.__enz_man.add_uniprot_data(enzyme_ids, source, num_threads)
+
+ def add_react_to_enz_organism(self, data, source, num_threads=0):
+
+ #Create Reaction relationships
+ reaction_ids = self.__create_enz_react(data, source)
def __create_react_enz(self, data, source):
'''Creates Reaction and Enzyme nodes and their Relationships.'''
@@ -80,3 +90,37 @@ def __create_react_enz(self, data, source):
{'source': source}])
return list(set(enzyme_ids))
+
+ def __create_enz_react(self, data, source):
+ '''Creates Reaction and Enzyme nodes and their Relationships.'''
+ print('adding reaction to enzyme relationships')
+ reaction_ids = []
+ enzyme_ids = self.__enz_man.get_nodes()
+
+ for enz_id in enzyme_ids:
+ reac_ids = [key for key, value in data.items() if enz_id['entry'] in value]
+ reaction_ids = reaction_ids+reac_ids
+ for j in reac_ids:
+ self.__enz_reac_rels.append([j, 'catalysed_by',
+ enz_id['entry'],
+ {'source': source}])
+
+ return list(set(reaction_ids))
+
+ def add_org_to_enz(self, nodes, source, num_threads=0):
+ '''Submit data to the graph.'''
+ # Create Organism nodes:
+ organism_ids = self.__create_organism_ids(nodes, source)
+
+ ## For testing
+ #organism_ids = organism_ids[0:10]
+
+ # Create Organism and Enzyme nodes:
+ self.__enz_man.add_uniprot_data_organism(organism_ids, source, num_threads)
+
+ def __create_organism_ids(self, data, source):
+
+ ids = unique(list(data.keys()))
+
+ return ids
+
From f8fe4cbd44f9f4fd6752cea26c106bbc1c8db02a Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Wed, 22 Mar 2023 16:11:52 -0600
Subject: [PATCH 10/29] Update rhea_utils.py
Update to ingest all uniprot/rhea relationships based on kg-microbe microbes
- Get Rhea reactions based on enzymes expressed by the kg-microbe set of microbes from ncbitaxon.json
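
This assumes the parsed rhea2uniprot table maps each Rhea id to the UniProt entries annotated with it, which is the shape __create_enz_react scans (inferred; ids illustrative):

    data = {'50004': ['Q01911'], '42776': ['A8C927']}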
---
metanetx_uniprot/rhea_utils.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/metanetx_uniprot/rhea_utils.py b/metanetx_uniprot/rhea_utils.py
index 3d926091..1fa6e91c 100644
--- a/metanetx_uniprot/rhea_utils.py
+++ b/metanetx_uniprot/rhea_utils.py
@@ -30,7 +30,9 @@ def load(reaction_manager, source=__RHEA_URL, num_threads=0):
data.update(data_small)
'''
######Not sure why source is Rhea here, calls to UniProt
- reaction_manager.add_react_to_enz(data, 'rhea', num_threads)
+ #Remove, since this goes from rhea2uniprot to uniprot enzymes. use add_org_to_enz function in ncbi_taxonomy_utils instead
+ #reaction_manager.add_react_to_enz(data, 'rhea', num_threads)
+ reaction_manager.add_react_to_enz_organism(data, 'rhea', num_threads)
def _parse(filename):
From df7978266af3c0c1c0a0db099f49f2381830c1b6 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Wed, 22 Mar 2023 16:12:42 -0600
Subject: [PATCH 11/29] Update enzyme_utils.py
Update to ingest all uniprot/rhea relationships based on kg-microbe microbes
- query uniprot based on organism_ids, not enzyme_ids
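
The REST pattern is unchanged except that 'organism_id:' terms are ORed instead of accessions; one batch URL looks like (taxon ids illustrative):

    query = '%20OR%20'.join(['organism_id:' + tax for tax in ['226900', '296591']])
    url = ('https://rest.uniprot.org/uniprotkb/search?query=' + query +
           '&format=tsv&fields=organism_id%2Cid%2Cprotein_name%2Cec')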
---
metanetx_uniprot/enzyme_utils.py | 43 ++++++++++++++++++++++++++++++++
1 file changed, 43 insertions(+)
diff --git a/metanetx_uniprot/enzyme_utils.py b/metanetx_uniprot/enzyme_utils.py
index 6f90b475..707a8399 100644
--- a/metanetx_uniprot/enzyme_utils.py
+++ b/metanetx_uniprot/enzyme_utils.py
@@ -63,3 +63,46 @@ def add_uniprot_data(self, enzyme_ids, source, num_threads=0):
if organism_id:
self.__org_enz_rels.append([organism_id, 'expresses',
uniprot_id, {'source': source}])
+
+    # Called from ReactionManager.add_org_to_enz; returns the raw UniProt values
+ def add_uniprot_data_organism(self, organism_ids, source, num_threads=0):
+ '''Gets Uniprot data.'''
+
+ #fields = ['entry name', 'protein names', 'organism-id', 'ec']
+ fields = ['id', 'accession','protein_name', 'organism_id', 'ec']
+ #enzyme_ids = [enzyme_id for enzyme_id in enzyme_ids if enzyme_id not in self.__nodes]
+ print('querying uniprot for enzymes per organism')
+ uniprot_values = get_uniprot_values_organism(organism_ids, fields,
+ batch_size=128,
+ verbose=False,
+ num_threads=num_threads)
+
+ print('add_uniprot_data function: added uniprot values: ',len(uniprot_values))
+
+
+
+ print('adding uniprot data to graph')
+ for uniprot_id, uniprot_value in tqdm(uniprot_values.items()):
+ enzyme_node = {':LABEL': 'Enzyme',
+ 'uniprot:ID(Enzyme)': uniprot_id}
+ self.__nodes[uniprot_id] = enzyme_node
+
+ organism_id = uniprot_value.pop('Organism (ID)') \
+ if 'Organism (ID)' in uniprot_value else None
+
+ if 'Entry' in uniprot_value:
+ enzyme_node['entry'] = uniprot_value['Entry']
+
+ if 'Protein names' in uniprot_value:
+ enzyme_node['names'] = uniprot_value['Protein names']
+
+ if enzyme_node['names']:
+ enzyme_node['name'] = enzyme_node['names'][0]
+
+ if 'EC number' in uniprot_value:
+ enzyme_node['ec-code'] = uniprot_value['EC number']
+
+ if organism_id:
+ self.__org_enz_rels.append([organism_id, 'expresses',uniprot_value['Entry'], {'source': source}])
+
+ return uniprot_values
From f15bb82102f5aa8fc05fbc8fa5154f33b4a29af5 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Wed, 22 Mar 2023 16:13:22 -0600
Subject: [PATCH 12/29] Update seq_utils.py
Update to ingest all uniprot/rhea relationships based on kg-microbe microbes
- query uniprot based on organism_ids, not enzyme_ids
---
metanetx_uniprot/seq_utils.py | 34 +++++++++++++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)
diff --git a/metanetx_uniprot/seq_utils.py b/metanetx_uniprot/seq_utils.py
index 892b3a6c..1776f94a 100644
--- a/metanetx_uniprot/seq_utils.py
+++ b/metanetx_uniprot/seq_utils.py
@@ -29,6 +29,8 @@
import queue
import numpy as np
+from tqdm import tqdm
+import sys
def get_uniprot_values(uniprot_ids, fields, batch_size, verbose=False,
num_threads=0):
@@ -107,6 +109,36 @@ def _parse_uniprot_data(url, values):
if key.startswith('Cross-reference'):
resp[key] = resp[key].split(';')
+ if 'Error messages' in resp:
+ print(resp); sys.exit()
values.append(resp)
except Exception as err:
- print(err)
\ No newline at end of file
+ print(err)
+
+
+def get_uniprot_values_organism(organism_ids, fields, batch_size, verbose=False, num_threads=0):
+ values = []
+
+ for i in tqdm(range(0, len(organism_ids), batch_size)):
+ values = _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values,verbose)
+
+ return {value['Organism (ID)']: value for value in values}
+
+def _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values, verbose):
+ '''Get batch of Uniprot data.'''
+ if verbose:
+ print('seq_utils: getting Uniprot values ' + str(i) + ' - ' +
+ str(min(i + batch_size, len(organism_ids))) + ' / ' +
+ str(len(organism_ids)))
+
+    # Same batching pattern as the accession-based query, but each term is prefixed with 'organism_id:'
+ batch = organism_ids[i:min(i + batch_size, len(organism_ids))]
+ query = '%20OR%20'.join(['organism_id:' + organism_id for organism_id in batch])
+ url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \
+ '&format=tsv&fields=organism_id%2C' + '%2C'.join([parse.quote(field)
+ for field in fields])
+
+ #print('_get_uniprot_batch_organism url: ',url)
+
+ _parse_uniprot_data(url, values)
+ return values
From c183254fec387219c6b0436c71badaffd45b13c5 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Mon, 3 Apr 2023 08:46:00 -0600
Subject: [PATCH 13/29] Update build.py
Include enzyme to reaction, and reaction to chemical relationships
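
Rhea reaction ids now thread through to the MNXref loader, which filters reac_prop.tsv against them; condensed from the hunks below (runs in the metanetx_uniprot package context):

    reaction_ids = rhea_utils.load(reac_man, num_threads=num_threads)   # Rhea ids
    reac_man.write_files(writer)                                        # Enzyme_Reaction.tsv
    mnxref_utils.MnxRefLoader(chem_man, reac_man, writer, reaction_ids).load()  # Reaction_Chemical.tsv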
---
metanetx_uniprot/build.py | 34 +++++++++++++++++++---------------
1 file changed, 19 insertions(+), 15 deletions(-)
diff --git a/metanetx_uniprot/build.py b/metanetx_uniprot/build.py
index 3fce3299..0ac9524b 100644
--- a/metanetx_uniprot/build.py
+++ b/metanetx_uniprot/build.py
@@ -1,6 +1,7 @@
 '''
 SYNBIOCHEM-DB (c) University of Manchester 2015
+
SYNBIOCHEM-DB is licensed under the MIT License.
To view a copy of this license, visit .
@@ -10,31 +13,23 @@
import multiprocessing
import sys
-import chebi_utils, chemical_utils, mnxref_utils, \
- ncbi_taxonomy_utils, reaction_utils, rhea_utils, spectra_utils, utils, seq_utils #, kegg_utils
+import chebi_utils, chemical_utils, mnxref_utils, ncbi_taxonomy_utils, reaction_utils, rhea_utils, spectra_utils, utils, seq_utils #, kegg_utils
def build_csv(dest_dir, array_delimiter, num_threads):
'''Build database CSV files.'''
writer = utils.Writer(dest_dir)
-
reac_man = reaction_utils.ReactionManager()
-
+
# Get Organism data:
print('Parsing NCBI Taxonomy')
- ncbi_taxonomy_utils.load(reac_man, writer, array_delimiter)
+ ncbi_taxonomy_utils.load(reac_man, writer, array_delimiter) #--> writes Organism_Enzyme.tsv
-
# Get Chemical and Reaction data.
# Write chemistry csv files:
- #chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter)
- # May be duplicate line
- #reac_man = reaction_utils.ReactionManager()
+ chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter)
- #print('Parsing MNXref')
- #mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer)
- #mnx_loader.load()
#print('Parsing ChEBI')
#chebi_utils.load(chem_man, writer)
@@ -44,7 +39,6 @@ def build_csv(dest_dir, array_delimiter, num_threads):
#print('Parsing spectrum data')
#spectra_utils.load(writer, chem_man, array_delimiter=array_delimiter)
- #chem_man.write_files(writer)
####Not including KEGG for now
# Get Reaction / Enzyme / Organism data:
@@ -53,8 +47,16 @@ def build_csv(dest_dir, array_delimiter, num_threads):
print('Parsing Rhea')
- rhea_utils.load(reac_man, num_threads=num_threads)
- reac_man.write_files(writer)
+ ##Returns rhea reaction ids
+ reaction_ids = rhea_utils.load(reac_man, num_threads=num_threads)
+ reac_man.write_files(writer) #--> writes Enzyme_Reaction.tsv
+
+ #
+ print('Parsing MNXref')
+ mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer, reaction_ids)
+ mnx_loader.load() #--> writes Reaction_Chemical.tsv
+
+ #chem_man.write_files(writer)
def main(args):
@@ -73,5 +75,7 @@ def main(args):
build_csv(args[0], args[1], num_threads)
+
+
if __name__ == '__main__':
main(sys.argv[1:])
From 133222ed0e22192ed433dfa7180a9328c9869b8c Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Mon, 3 Apr 2023 08:46:51 -0600
Subject: [PATCH 14/29] Update mnxref_utils.py
Include enzyme to reaction, and reaction to chemical relationships. Also include testing dataset
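
An MNX reaction is kept only when its reference column points at a Rhea id seen in rhea2uniprot; the core test from __read_reac_prop, runnable in isolation (ids and equation illustrative):

    reaction_ids = {'50004'}                   # Rhea ids from rhea2uniprot
    values = ['MNXR145046', '1 MNXM1 = 1 MNXM1', 'rhea:50004']
    ref = values[2]
    if 'rhea' in ref.split(':')[0].lower() and ref.split(':')[1] in reaction_ids:
        print('keep', values[0])               # -> keep MNXR145046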
---
metanetx_uniprot/mnxref_utils.py | 56 ++++++++++++++++++++++++++------
1 file changed, 46 insertions(+), 10 deletions(-)
diff --git a/metanetx_uniprot/mnxref_utils.py b/metanetx_uniprot/mnxref_utils.py
index cbb67687..aa231e22 100644
--- a/metanetx_uniprot/mnxref_utils.py
+++ b/metanetx_uniprot/mnxref_utils.py
@@ -24,10 +24,12 @@
import namespace_utils
from synbiochem.utils import chem_utils
+import os
_METANETX_URL = 'http://metanetx.org/cgi-bin/mnxget/mnxref/'
-
+#For test, also update __read_data function
+#_METANETX_URL = os.getcwd()+'/TestingFiles/'
class MnxRefReader(object):
'''Class to read MnxRef data from the chem_prop.tsv, the chem_xref.tsv and
@@ -47,12 +49,15 @@ def get_chem_data(self):
return self.__chem_data
- def get_reac_data(self):
+ def get_reac_data(self,reaction_ids):
'''Gets reaction data.'''
if not self.__reac_data:
- self.__read_reac_prop()
+ mxn_reaction_ids = self.__read_reac_prop(reaction_ids)
self.__read_xref('reac_xref.tsv', self.__reac_data, False)
+ #Only include reaction data for reactions in reaction_ids
+ self.__reac_data = {key:val for key,val in self.__reac_data.items() if key in mxn_reaction_ids}
+
return self.__reac_data
def __read_chem_prop(self):
@@ -101,15 +106,24 @@ def __add_xref(self, xref, entry, chemical):
if namespace != 'chebi' \
else 'CHEBI:' + xref[1]
- def __read_reac_prop(self):
+ def __read_reac_prop(self,reaction_ids):
'''Read reaction properties and create Nodes.'''
reac_prop_keys = ['id', 'equation', 'reference', 'ec', 'balance', 'transport']
+ ##Relabel reaction ids using MNX ids rather than Rhea ids
+ mxn_reaction_ids = []
+
for values in self.__read_data('reac_prop.tsv'):
- if not values[0].startswith('#'):
+ if not values[0].startswith('#'):
+ if values[0] == 'EMPTY': continue
values[0] = self.__parse_id(values[0])
values[2] = self.__parse_id(values[2])
+ try:
+ if 'rhea' in values[2].split(':')[0].lower() and values[2].split(':')[1] in reaction_ids:
+ mxn_reaction_ids.append(values[0])
+ except IndexError: continue
+
props = dict(zip(reac_prop_keys, values))
props.pop('reference')
@@ -129,6 +143,8 @@ def __read_reac_prop(self):
print('WARNING: Suspected polymerisation reaction: ' + \
values[0] + '\t' + str(props))
+ return mxn_reaction_ids
+
def __add_chem(self, chem_id):
'''Adds a chemical with given id.'''
props = {'id': chem_id}
@@ -138,6 +154,7 @@ def __add_chem(self, chem_id):
def __read_data(self, filename):
'''Downloads and reads tab-limited files into lists of lists of
strings.'''
+
with requests.Session() as s:
download = s.get(self.__source + filename)
@@ -146,7 +163,21 @@ def __read_data(self, filename):
cr = csv.reader(decoded_content.splitlines(), delimiter='\t')
my_list = list(cr)
return my_list
-
+ '''
+ ###Reads downloaded file for offline testing
+ #cr = csv.reader((self.__source + filename).splitlines(), delimiter='\t')
+ import pandas as pd
+ cr = pd.read_csv(self.__source + filename, delimiter='\t', comment='#',header=None)
+ cr_d = []
+ for i in range(len(cr)):
+ l = []
+ for j in range(len(cr.columns)):
+ l.append(cr.iloc[i,j])
+ cr_d.append(l)
+
+ return cr_d
+ '''
+
def __parse_id(self, item_id):
'''Parses mnx ids.'''
@@ -161,21 +192,24 @@ def __parse_id(self, item_id):
class MnxRefLoader(object):
'''Loads MNXref data into neo4j format.'''
- def __init__(self, chem_man, reac_man, writer):
+ def __init__(self, chem_man, reac_man, writer,reaction_ids):
self.__chem_man = chem_man
self.__reac_man = reac_man
self.__writer = writer
+ self.__reactions = reaction_ids
def load(self):
'''Loads MnxRef data from chem_prop.tsv, chem_xref.tsv,
reac_prop.tsv and reac_xref.tsv files.'''
reader = MnxRefReader()
+ #First gets all chemical data from MxnRef (chem_xref and chem_prop) and adds to __chem_man
for properties in reader.get_chem_data().values():
properties['mnx'] = properties.pop('id')
self.__chem_man.add_chemical(properties)
- rels = self.__add_reac_nodes(reader.get_reac_data())
+ #Then gets reaction data from reac_xref and reac_prop and adds to __chem_man
+ rels = self.__add_reac_nodes(reader.get_reac_data(self.__reactions))
return [], [self.__writer.write_rels(rels, 'Reaction', 'Chemical')]
@@ -276,8 +310,10 @@ def _filter(counter, cutoff):
# Fit straight-line to histogram log-log plot and filter...
x_val, y_val = zip(*list(hist_counter.items()))
- m_val, b_val = numpy.polyfit(numpy.log(x_val), numpy.log(y_val), 1)
-
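+ # workaround for single-bin test histograms: fit to the first point only; numpy emits a RankWarning for this degenerate fit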
+ l_x_val = numpy.log(x_val)[0]
+ l_y_val = numpy.log(y_val)[0]
+ if l_x_val == 0.0: l_x_val += 0.01
+ m_val, b_val = numpy.polyfit([l_x_val], [l_y_val], 1)
return [item[0] for item in counter.items()
if item[1] > math.exp(cutoff * -b_val / m_val)]
From 1d5d048f86aefd3bf6bdbb7fad0406ba7464c2a2 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Mon, 3 Apr 2023 08:47:21 -0600
Subject: [PATCH 15/29] Update ncbi_taxonomy_utils.py
Include enzyme to reaction, and reaction to chemical relationships. Also include testing dataset
---
metanetx_uniprot/ncbi_taxonomy_utils.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/metanetx_uniprot/ncbi_taxonomy_utils.py b/metanetx_uniprot/ncbi_taxonomy_utils.py
index f5ffe788..e563331d 100644
--- a/metanetx_uniprot/ncbi_taxonomy_utils.py
+++ b/metanetx_uniprot/ncbi_taxonomy_utils.py
@@ -28,7 +28,8 @@ def load(reaction_manager, writer, array_delimiter, source=__NCBITAXONOMY_URL):
#nodes, rels = _parse_nodes(nodes_filename, array_delimiter)
#_parse_names(nodes, names_filename, array_delimiter)
#######
- nodes_filename = '/Users/brooksantangelo/Documents/HunterLab/Exploration/kg_microbe/kg-microbe/data/raw/ncbitaxon.json'
+ nodes_filename = os.getcwd()+'/Files/ncbitaxon.json'
+ #nodes_filename = os.getcwd()+'/TestingFiles/ncbitaxon.json'
print('parsing ncbi taxon json file')
kgx_nodes_json,kgx_edges_json = _parse_nodes_kgmicrobe(nodes_filename, array_delimiter)
nodes,rels = transform_kgx_output_format(kgx_nodes_json,kgx_edges_json)
@@ -64,7 +65,7 @@ def _parse_nodes_kgmicrobe(filename, array_delimiter):
transform(inputs=[filename], input_format='obojson', output= os.path.join(output_dir, name), output_format='tsv')
return output_dir+name+'_nodes.tsv',output_dir+name+'_edges.tsv'
-
+
def transform_kgx_output_format(transformed_nodes_tsv,transformed_edges_tsv):
labels = pd.read_csv(transformed_nodes_tsv, sep = '\t', usecols = ['id','name'])
@@ -106,7 +107,6 @@ def _parse_nodes(filename, array_delimiter):
':LABEL':
'Organism' + array_delimiter + tokens[2]}
- print(list(nodes.values())[0:5])
return nodes, rels
From 303a824c8991da51047b3f113f420fa7db5eb907 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Mon, 3 Apr 2023 08:47:50 -0600
Subject: [PATCH 16/29] Update rhea_utils.py
Include enzyme to reaction, and reaction to chemical relationships. Also include testing dataset
---
metanetx_uniprot/rhea_utils.py | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/metanetx_uniprot/rhea_utils.py b/metanetx_uniprot/rhea_utils.py
index 1fa6e91c..5c612d90 100644
--- a/metanetx_uniprot/rhea_utils.py
+++ b/metanetx_uniprot/rhea_utils.py
@@ -10,29 +10,28 @@
import tempfile
import urllib
from urllib.request import urlretrieve
+import os
__RHEA_URL = 'ftp://ftp.expasy.org/databases/rhea/tsv/rhea2uniprot%5Fsprot.tsv'
-
+#For test, also update load function
+#__RHEA_URL = os.getcwd()+'/TestingFiles/rhea2uniprot_sprot.txt'
def load(reaction_manager, source=__RHEA_URL, num_threads=0):
'''Loads Rhea data.'''
# Parse data:
+
temp_file = tempfile.NamedTemporaryFile()
urlretrieve(source, temp_file.name)
data = _parse(temp_file.name)
- '''
- ###For testing, uncomment the following code
- data_small = dict()
- for key in sorted(data)[:50]:
- data_small[key] = data[key]
- data.clear()
- data.update(data_small)
- '''
+ ##If using test data
+ #data = _parse(source)
######Not sure why source is Rhea here, calls to UniProt
#Remove, since this goes from rhea2uniprot to uniprot enzymes. use add_org_to_enz function in ncbi_taxonomy_utils instead
#reaction_manager.add_react_to_enz(data, 'rhea', num_threads)
- reaction_manager.add_react_to_enz_organism(data, 'rhea', num_threads)
+ reaction_ids = reaction_manager.add_react_to_enz_organism(data, 'rhea', num_threads)
+
+ return reaction_ids
def _parse(filename):
From 27f711187b83f839da79194f395d9953e4f58d3c Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Mon, 3 Apr 2023 08:48:23 -0600
Subject: [PATCH 17/29] Update reaction_utils.py
Include enzyme to reaction, and reaction to chemical relationships. Also include testing dataset
---
metanetx_uniprot/reaction_utils.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/metanetx_uniprot/reaction_utils.py b/metanetx_uniprot/reaction_utils.py
index 058875eb..f13c429c 100644
--- a/metanetx_uniprot/reaction_utils.py
+++ b/metanetx_uniprot/reaction_utils.py
@@ -32,8 +32,9 @@ def write_files(self, writer):
'Enzyme')],
[writer.write_rels(self.__reac_enz_rels,
'Reaction', 'Enzyme'),
+ #Gets reactions connected to all enzymes
writer.write_rels(self.__enz_reac_rels,
- 'Enzyme', 'Reaction'),
+ 'Reaction', 'Enzyme'),
writer.write_rels(self.__enz_man.get_org_enz_rels(),
'Organism', 'Enzyme')])
@@ -76,6 +77,8 @@ def add_react_to_enz_organism(self, data, source, num_threads=0):
#Create Reaction relationships
reaction_ids = self.__create_enz_react(data, source)
+ return reaction_ids
+
def __create_react_enz(self, data, source):
'''Creates Reaction and Enzyme nodes and their Relationships.'''
enzyme_ids = []
@@ -104,7 +107,6 @@ def __create_enz_react(self, data, source):
self.__enz_reac_rels.append([j, 'catalysed_by',
enz_id['entry'],
{'source': source}])
-
return list(set(reaction_ids))
def add_org_to_enz(self, nodes, source, num_threads=0):
From 009b1876dbcd986562bfedc05cd48a754d6a900b Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Mon, 3 Apr 2023 08:48:49 -0600
Subject: [PATCH 18/29] Update seq_utils.py
Include enzyme to reaction, and reaction to chemical relationships. Also include testing dataset
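
The size=500 added below matches UniProt's per-request cap; organisms with more than 500 entries are still truncated unless the 'next' Link header is followed, which this patch does not do. A sketch of checking for a further page:

    import requests
    url = ('https://rest.uniprot.org/uniprotkb/search'
           '?query=organism_id:226900&format=tsv&size=500&fields=accession')
    resp = requests.get(url)
    next_page = resp.links.get('next', {}).get('url')  # None when no more pages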
---
metanetx_uniprot/seq_utils.py | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/metanetx_uniprot/seq_utils.py b/metanetx_uniprot/seq_utils.py
index 1776f94a..43682c9a 100644
--- a/metanetx_uniprot/seq_utils.py
+++ b/metanetx_uniprot/seq_utils.py
@@ -122,7 +122,10 @@ def get_uniprot_values_organism(organism_ids, fields, batch_size, verbose=False,
for i in tqdm(range(0, len(organism_ids), batch_size)):
values = _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values,verbose)
- return {value['Organism (ID)']: value for value in values}
+ ##Issue: Only returns one enzyme per organism
+ #return {value['Organism (ID)']: value for value in values}
+ ##Returns list of dicts for each organism-id enzyme entry
+ return values
def _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values, verbose):
'''Get batch of Uniprot data.'''
@@ -135,10 +138,10 @@ def _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values, ver
batch = organism_ids[i:min(i + batch_size, len(organism_ids))]
query = '%20OR%20'.join(['organism_id:' + organism_id for organism_id in batch])
url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \
- '&format=tsv&fields=organism_id%2C' + '%2C'.join([parse.quote(field)
+ '&format=tsv&size=500&fields=organism_id%2C' + '%2C'.join([parse.quote(field)
+ # '&format=tsv&size=1&fields=organism_id%2C' + '%2C'.join([parse.quote(field)
for field in fields])
- #print('_get_uniprot_batch_organism url: ',url)
_parse_uniprot_data(url, values)
return values
From cc29ad241a668244cac2b31d25bd82a1d07cfe27 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Mon, 3 Apr 2023 08:49:17 -0600
Subject: [PATCH 19/29] Update enzyme_utils.py
Include enzyme to reaction, and reaction to chemical relationships. Also include testing dataset
---
metanetx_uniprot/enzyme_utils.py | 38 +++++++++++++++-----------------
1 file changed, 18 insertions(+), 20 deletions(-)
diff --git a/metanetx_uniprot/enzyme_utils.py b/metanetx_uniprot/enzyme_utils.py
index 707a8399..d6ea4969 100644
--- a/metanetx_uniprot/enzyme_utils.py
+++ b/metanetx_uniprot/enzyme_utils.py
@@ -68,41 +68,39 @@ def add_uniprot_data(self, enzyme_ids, source, num_threads=0):
def add_uniprot_data_organism(self, organism_ids, source, num_threads=0):
'''Gets Uniprot data.'''
- #fields = ['entry name', 'protein names', 'organism-id', 'ec']
fields = ['id', 'accession','protein_name', 'organism_id', 'ec']
- #enzyme_ids = [enzyme_id for enzyme_id in enzyme_ids if enzyme_id not in self.__nodes]
print('querying uniprot for enzymes per organism')
+ ##Uniprot returns list of dicts for each entry
uniprot_values = get_uniprot_values_organism(organism_ids, fields,
batch_size=128,
verbose=False,
num_threads=num_threads)
- print('add_uniprot_data function: added uniprot values: ',len(uniprot_values))
-
-
-
print('adding uniprot data to graph')
- for uniprot_id, uniprot_value in tqdm(uniprot_values.items()):
+
+ ##To return all organism-enzyme entries
+ for entry in tqdm(uniprot_values):
enzyme_node = {':LABEL': 'Enzyme',
- 'uniprot:ID(Enzyme)': uniprot_id}
- self.__nodes[uniprot_id] = enzyme_node
+ 'uniprot:ID(Enzyme)': entry['Entry']}
+ self.__nodes[entry['Entry']] = enzyme_node
- organism_id = uniprot_value.pop('Organism (ID)') \
- if 'Organism (ID)' in uniprot_value else None
+ organism_id = entry['Organism (ID)'] \
+ if 'Organism (ID)' in entry.keys() else None
- if 'Entry' in uniprot_value:
- enzyme_node['entry'] = uniprot_value['Entry']
+ if 'Entry' in entry.keys():
+ enzyme_node['entry'] = entry['Entry']
- if 'Protein names' in uniprot_value:
- enzyme_node['names'] = uniprot_value['Protein names']
+ if 'Protein names' in entry:
+ enzyme_node['names'] = entry['Protein names']
- if enzyme_node['names']:
- enzyme_node['name'] = enzyme_node['names'][0]
+ if 'names' in enzyme_node:
+ enzyme_node['name'] = enzyme_node['names'][0]
- if 'EC number' in uniprot_value:
- enzyme_node['ec-code'] = uniprot_value['EC number']
+ if 'EC number' in entry:
+ enzyme_node['ec-code'] = entry['EC number']
if organism_id:
- self.__org_enz_rels.append([organism_id, 'expresses',uniprot_value['Entry'], {'source': source}])
+ self.__org_enz_rels.append([organism_id, 'expresses',entry['Entry'], {'source': source}])
return uniprot_values
+
From a069bc0f469a479638f55122f221c5a1a797de7a Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Mon, 3 Apr 2023 08:50:27 -0600
Subject: [PATCH 20/29] Create rhea2uniprot_sprot.txt
Include enzyme to reaction, and reaction to chemical relationships. Also include testing dataset
---
metanetx_uniprot/TestingFiles/rhea2uniprot_sprot.txt | 6 ++++++
1 file changed, 6 insertions(+)
create mode 100644 metanetx_uniprot/TestingFiles/rhea2uniprot_sprot.txt
diff --git a/metanetx_uniprot/TestingFiles/rhea2uniprot_sprot.txt b/metanetx_uniprot/TestingFiles/rhea2uniprot_sprot.txt
new file mode 100644
index 00000000..05b819cc
--- /dev/null
+++ b/metanetx_uniprot/TestingFiles/rhea2uniprot_sprot.txt
@@ -0,0 +1,6 @@
+50004 UN 50004 Q01911
+61444 UN 61444 Q01911
+42776 UN 42776 A8C927
+18690 LR 18689 P0DTE9
+60624 UN 60624 P0DTE9
+60625 LR 60624 P0DTE9
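
_parse itself is not shown in this series; a plausible reading of the fixture above (tab-separated columns: Rhea id, direction, master id, UniProt entry) is:

    data = {}
    with open('metanetx_uniprot/TestingFiles/rhea2uniprot_sprot.txt') as fh:
        for line in fh:
            tokens = line.strip().split('\t')
            data.setdefault(tokens[0], []).append(tokens[3])
    # -> {'50004': ['Q01911'], '61444': ['Q01911'], ...}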
From c3c8d410a9b3ad2b3da5b83086f502e01f8a56ae Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Mon, 3 Apr 2023 08:50:48 -0600
Subject: [PATCH 21/29] Add files via upload
Include enzyme to reaction, and reaction to chemical relationships. Also include testing dataset
---
metanetx_uniprot/TestingFiles/chem_prop.tsv | 358 ++++++++++++++++++
metanetx_uniprot/TestingFiles/chem_xref.tsv | 362 ++++++++++++++++++
metanetx_uniprot/TestingFiles/ncbitaxon.json | 188 ++++++++++
metanetx_uniprot/TestingFiles/reac_prop.tsv | 359 ++++++++++++++++++
metanetx_uniprot/TestingFiles/reac_xref.tsv | 365 +++++++++++++++++++
5 files changed, 1632 insertions(+)
create mode 100644 metanetx_uniprot/TestingFiles/chem_prop.tsv
create mode 100644 metanetx_uniprot/TestingFiles/chem_xref.tsv
create mode 100644 metanetx_uniprot/TestingFiles/ncbitaxon.json
create mode 100644 metanetx_uniprot/TestingFiles/reac_prop.tsv
create mode 100644 metanetx_uniprot/TestingFiles/reac_xref.tsv
diff --git a/metanetx_uniprot/TestingFiles/chem_prop.tsv b/metanetx_uniprot/TestingFiles/chem_prop.tsv
new file mode 100644
index 00000000..d4f28677
--- /dev/null
+++ b/metanetx_uniprot/TestingFiles/chem_prop.tsv
@@ -0,0 +1,358 @@
+### MetaNetX/MNXref reconciliation ###
+#Based on the following resources:
+#
+#RESOURCE: MetaNetX/MNXref
+#VERSION: 4.4
+#DATE: 2022/03/16
+#URL: https://www.metanetx.org
+#LICENSE:
+# MetaNetX copyright 2011 SystemsX, SIB Swiss Institute of Bioinformatics
+# Except where otherwise noted, the data available from this site are
+# licensed under a Creative Commons Attribution 4.0 International License.
+# MNXref uses information on cellular compartments, reactions, and
+# metabolites that is sourced from a number of external resources. The
+# licensing agreements of those resources are specified in each of the
+# downloadable files listed below. For each compound, reaction and
+# cellular compartment in the MNXref namespace we indicate which external
+# resource provided the information used in MNXref. Compounds and
+# reactions in the MNXref namespace may be identical to, or differ from,
+# those in the external resource. In either case the data from MNXref may
+# be considered to be subject to the original licensing restrictions of
+# the external resource.
+# (https://www.metanetx.org/mnxdoc/mnxref.html)
+#
+#RESOURCE: BiGG
+#VERSION: 1.6.0, last updated: 2019/10/31 (downloaded on 2021/07/23)
+#URL: http://bigg.ucsd.edu
+#LICENSE:
+# Copyright 2015 The Regents of the University of California
+#
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of BiGG Models
+# for educational, research and non-profit purposes, without fee, and
+# without a written agreement is hereby granted, provided that the above
+# copyright notice, this paragraph and the following three paragraphs
+# appear in all copies.
+#
+# Those desiring to incorporate BiGG Models into commercial products or
+# use for commercial purposes should contact the Technology Transfer &
+# Intellectual Property Services, University of California, San Diego,
+# 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910, Ph: (858)
+# 534-5815, FAX: (858) 534-7345, e-mail: invent@ucsd.edu.
+#
+# In no event shall the University of California be liable to any party
+# for direct, indirect, special, incidental, or consequential damages,
+# including lost profits, arising out of the use of this bigg database,
+# even if the University of California has been advised of the possibility
+# of such damage.
+#
+# The BiGG Models provided herein is on an "as is" basis, and the
+# University of California has no obligation to provide maintenance,
+# support, updates, enhancements, or modifications. The University of
+# California makes no representations and extends no warranties of any
+# kind, either implied or express, including, but not limited to, the
+# implied warranties of merchantability or fitness for a particular
+# purpose, or that the use of the BiGG Models will not infringe any
+# patent, trademark or other rights.
+# (http://bigg.ucsd.edu/)
+#
+#RESOURCE: The Cell Component Ontology
+#VERSION: 25.0 (downloaded on 2021/06/03)
+#URL: https://bioinformatics.ai.sri.com/CCO/
+#LICENSE:
+# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome
+# databases.
+#
+# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive,
+# royalty-free license to use, modify and redistribute the Open Databases
+# (as such term is defined in Exhibit B) and LICENSEE's modified
+# versions thereof on a royalty-free basis, worldwide and for any purpose;
+# provided, in each case, that if LICENSEE modifies any Open Database (the
+# modified version being a "Modified Open Database"), then (i)
+# LICENSEE must provide a copy of the Modified Open Database to SRI (and
+# hereby grants to SRI a nonexclusive, royalty-free license to use,
+# modify, and redistribute the Modified Open Database worldwide and for
+# any purpose and to authorize others to do so); and (ii) any Modified
+# Open Databases, or websites from which such Modified Open Databases may
+# be obtained, must clearly and prominently:
+#
+# (a) identify the Open Databases from which they were derived:
+#
+# (b) include all applicable copyright notices and author lists from the
+# Open Databases from which they were derived; and
+#
+# (c) identify or summarize all modifications that were made.
+#
+# Any distribution of such Modified Open Databases without the required
+# notices is a violation of SRI's and its licensors' copyright and other
+# proprietary rights. All trademarks, service marks, and trade names are
+# proprietary to SRI and its licensors. The Open Databases, including any
+# files incorporated in or generated from the Open Databases and data
+# accompanying the Open Databases, are licensed to LICENSEE by SRI and its
+# licensors, and SRI and its licensors do not transfer title or any other
+# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open
+# Databases except as otherwise specified herein.
+#
+# 2.1.1 If SRI, in its sole discretion, determines that a Modified
+# Database is of sufficient quality and interest to the community to be
+# hosted on biocyc.org, then SRI may (if the Modified Database includes
+# significant curation over the original Open Database it is derived from,
+# or the last version of the Modified Database provided to SRI) provide to
+# LICENSEE a personal, one-year subscription to biocyc at no cost;
+# provided, however, that if LICENSEE edits the Modified Database via a
+# MySQL server operated by SRI or its contractors, such free one-year
+# subscription will be forfeited.
+# (https://biocyc.org/ptools-academic-license.shtml)
+#
+#RESOURCE: ChEBI
+#VERSION: 203 (downloaded on 2021/09/30)
+#URL: https://www.ebi.ac.uk/chebi/
+#LICENSE:
+# All data in the database is non-proprietary or is derived from a
+# non-proprietary source. It is thus freely accessible and available to
+# anyone. In addition, each data item is fully traceable and explicitly
+# referenced to the original source.
+# (https://www.ebi.ac.uk/chebi/aboutChebiForward.do)
+#
+#RESOURCE: enviPath
+#VERSION: (downloaded on 2021/11/24)
+#URL: https://envipath.org
+#LICENSE:
+# The core data sets of enviPath are licensed under the Creative Commons
+# Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)
+# license. This allows you to use them in a non-commercial context, for
+# example if you work at a University or for a public research institute.
+# You can even redistribute and modify the data using the same license. If
+# you want to use the data commercially, contact us, we offer commercial
+# license agreements.
+# We summarized how you can use the data on our license page.
+# (https://envipath.com/license/)
+#
+#RESOURCE: HMDB
+#VERSION: 4.0 (downloaded on 2021/06/18)
+#URL: https://hmdb.ca
+#LICENSE:
+# HMDB is offered to the public as a freely available resource. Use and
+# re-distribution of the data, in whole or in part, for commercial
+# purposes requires explicit permission of the authors and explicit
+# acknowledgment of the source material (HMDB) and the original
+# publication.
+# (https://hmdb.ca/about)
+#
+#RESOURCE: KEGG
+#VERSION: 98.0+/06-11, Jun 21 (downloaded on 2021/06/11)
+#URL: https://www.kegg.jp
+#LICENSE:
+# Academic users may freely use the KEGG website and may also freely link
+# to the KEGG website.
+# Non-academic users may use the KEGG website as end users for
+# non-commercial purposes, but any other use requires a license agreement.
+# Academic users who utilize KEGG for providing academic services are
+# requested to obtain a KEGG FTP subscription for organizational use,
+# which includes a proper license agreement.
+# Non-academic users and Academic users intending to use KEGG for
+# commercial purposes are requested to obtain a license agreement through
+# KEGG's exclusive licensing agent, Pathway Solutions.
+# (https://www.kegg.jp/kegg/legal.html)
+#
+#RESOURCE: LipidMaps
+#VERSION: 2021-05-28 (downloaded on 2021/06/11)
+#URL: https://www.lipidmaps.org
+#LICENSE:
+# The Lipidomics Gateway is provided on an "as is" basis, without warranty
+# or representation of any kind, express or implied. The content of the
+# Lipidomics Gateway website is protected by international copyright,
+# trademark and other laws. You may download articles and web pages from
+# this site for your personal, non-commercial use only, provided that you
+# keep intact all authorship, copyright and other proprietary notices. The
+# Featured Lipid can also be used for educational purposes, provided that
+# credit is given to the Lipidomics Gateway. If you use the Lipidomics
+# Gateway, you accept these terms. The LIPID MAPS Consortium reserves the
+# right to modify these terms at any time.
+# (https://www.lipidmaps.org/about/)
+#
+#RESOURCE: MetaCyc
+#VERSION: 25.0 (downloaded on 2021/06/03)
+#URL: https://metacyc.org
+#LICENSE:
+# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome
+# databases.
+#
+# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive,
+# royalty-free license to use, modify and redistribute the Open Databases
+# (as such term is defined in Exhibit B) and LICENSEE's modified
+# versions thereof on a royalty-free basis, worldwide and for any purpose;
+# provided, in each case, that if LICENSEE modifies any Open Database (the
+# modified version being a "Modified Open Database"), then (i)
+# LICENSEE must provide a copy of the Modified Open Database to SRI (and
+# hereby grants to SRI a nonexclusive, royalty-free license to use,
+# modify, and redistribute the Modified Open Database worldwide and for
+# any purpose and to authorize others to do so); and (ii) any Modified
+# Open Databases, or websites from which such Modified Open Databases may
+# be obtained, must clearly and prominently:
+#
+# (a) identify the Open Databases from which they were derived:
+#
+# (b) include all applicable copyright notices and author lists from the
+# Open Databases from which they were derived; and
+#
+# (c) identify or summarize all modifications that were made.
+#
+# Any distribution of such Modified Open Databases without the required
+# notices is a violation of SRI's and its licensors' copyright and other
+# proprietary rights. All trademarks, service marks, and trade names are
+# proprietary to SRI and its licensors. The Open Databases, including any
+# files incorporated in or generated from the Open Databases and data
+# accompanying the Open Databases, are licensed to LICENSEE by SRI and its
+# licensors, and SRI and its licensors do not transfer title or any other
+# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open
+# Databases except as otherwise specified herein.
+#
+# 2.1.1 If SRI, in its sole discretion, determines that a Modified
+# Database is of sufficient quality and interest to the community to be
+# hosted on biocyc.org, then SRI may (if the Modified Database includes
+# significant curation over the original Open Database it is derived from,
+# or the last version of the Modified Database provided to SRI) provide to
+# LICENSEE a personal, one-year subscription to biocyc at no cost;
+# provided, however, that if LICENSEE edits the Modified Database via a
+# MySQL server operated by SRI or its contractors, such free one-year
+# subscription will be forfeited.
+# (https://biocyc.org/ptools-academic-license.shtml)
+#
+#RESOURCE: Reactome
+#VERSION: 77 June 14, 2021 (downloaded on 2021/09/03)
+#URL: https://reactome.org
+#LICENSE:
+# Reactome is an open source and open access resource, available to anyone.
+# Usage of Reactome material is covered by two Creative Commons licenses:
+#
+# The terms of the Creative Commons Public Domain (CC0) License apply to all
+# Reactome annotation files, e.g. identifier mapping data, specialized data
+# files, and interaction data derived from Reactome.
+# (https://reactome.org/license/)
+#
+#RESOURCE: Rhea
+#VERSION: 119 (downloaded on 2021/11/03)
+#URL: https://www.rhea-db.org
+#LICENSE:
+# All data in Rhea is freely accessible and available for anyone to use under
+# the Creative Commons Attribution License.
+# (https://www.rhea-db.org/documentation)
+#
+#RESOURCE: SABIO-RK
+#VERSION: Software Update: 2021/05/11 -- Database Release: 2021/05/28 (downloaded on 2021/07/01)
+#URL: http://sabiork.h-its.org
+#LICENSE:
+# HITS, gGmbH HITS own the SABIO-RK database, its interfaces and its
+# associated documentation (all referred to in the following as
+# "Database"). You should carefully read the following terms and
+# conditions before using this Database. Your use of this Database
+# indicates your acceptance of this license agreement and all terms and
+# conditions.You are hereby granted a non-exclusive and non-transferable
+# license to use the Database according to the following terms and
+# conditions. This license is to use the Database for Non-Commercial
+# Purpose only. Non-Commercial Purpose means the use of the Database
+# solely for internal non-commercial research and academic purposes.
+# Non-Commercial Purpose excludes, without limitation, any use of the
+# Database, as part of, or in any way in connection with a product or
+# service which is sold, offered for sale, licensed, leased, loaned, or
+# rented. Permission to use this Database for Non-Commercial Purpose is
+# hereby granted without fee and subject to the following terms of this
+# license.
+#
+# Commercial Use
+# If you desire to use the Database for profit-making or commercial
+# purposes, you agree to negotiate in good faith a license with the HITS
+# prior to such profit-making or commercial use. The HITS shall have no
+# obligation to grant such license to you, and may grant exclusive or
+# non-exclusive licenses to others. You agree to notify the HITS of any
+# inquiries you have for commercial use of the Database and/or its
+# modifications. You may contact the following email to discuss commercial
+# use: sabiork at h-its.org
+#
+# Governing Law
+# This Agreement is governed by the law of the Federal Republic of
+# Germany. The application of the UN Convention on the Sale of Goods is
+# excluded.
+#
+# Disclaimer of Warranty
+# Because this Database is licensed free of charge, there is no warranty
+# for the data in it contained and the methods used for its querying. The
+# HITS makes no warranty or representation that the operation of the
+# Database in this compilation will be error-free, and the HITS is under
+# no obligation to provide any services, by way of maintenance, update, or
+# otherwise.
+#
+# THIS DATABASE AND THE ACCOMPANYING FILES ARE LICENSED "AS IS" AND
+# WITHOUT WARRANTIES AS TO PERFORMANCE OR MERCHANTABILITY OR ANY OTHER
+# WARRANTIES WHETHER EXPRESSED OR IMPLIED. NO WARRANTY OF FITNESS FOR A
+# PARTICULAR PURPOSE IS OFFERED. THE ENTIRE RISK AS TO THE QUALITY AND
+# PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
+# DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
+# CORRECTION.
+#
+# Limitation of Liability
+# IN NO EVENT WILL HITS, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+# REDISTRIBUTE THE DATABASE AS PERMITTED ABOVE, BE LIABLE TO YOU FOR
+# DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL
+# DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM
+# (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED
+# INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF
+# THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF VTIP AND HITS
+# OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+#
+# Reference to SABIO-RK Users will cite SABIO-RK in publications or
+# presentations, whenever the data used was extracted from the database.
+# Termination This agreement is effective until terminated. You may
+# terminate this agreement at any time by destroying all associated
+# material (e.g., documentation or web service clients) to the database in
+# your possession and by stopping any access to the database directly or
+# from software generated by you. This agreement will terminate
+# immediately without notice from and HITS if you fail to comply with any
+# of the terms and conditions of this license. This agreement will also
+# terminate immediately without notice from the HITS if it is found to
+# implement patented algorithms or contain copyrighted code not owned or
+# licensed the HITS for the purpose of its inclusion in the SABIO-RK
+# Database. This agreement cannot be terminated by any other mechanism or
+# for any other reason than those stated herein.
+#
+# Place of Court
+# The exclusive venue for all disputes arising from or in connection with
+# this Agreement is Mannheim, Germany (HRB 337446), when the Licensee is a
+# business person, a legal entity governed by public law, or a special
+# fund governed by public law, or does not have a general place of
+# jurisdiction within the Federal Republic of Germany. Address all
+# correspondence regarding this license to the electronic mail address:
+# sabiork at h-its.org. Any inquiries and comments regarding bugs, bug
+# fixes, enhancements, modifications or any other similar issues should be
+# directed to: sabiork at h-its.org
+#
+# Copyright 2007 by HITS, gGmbH. All rights reserved.
+# (http://sabiork.h-its.org/layouts/content/termscondition.gsp)
+#
+#RESOURCE: The SEED
+#VERSION: 2.6.1 (July 31, 2020) (downloaded on 2021/08/09)
+#URL: https://modelseed.org
+#LICENSE:
+# All tools and datasets that make up the SEED are in the public domain.
+# (https://modelseed.org)
+#
+#RESOURCE: SwissLipids
+#VERSION: (downloaded on 2021/07/29)
+#URL: https://www.swisslipids.org
+#LICENSE:
+# SwissLipids is licensed under a Creative Commons
+# Attribution-NonCommercial-NoDerivatives 4.0 International License.
+#
+# Commercial users and those who wish to use this work for commercial
+# purposes should contact the SIB technology transfer officer at:
+# marc.filliettaz@genebio.com
+# (https://www.swisslipids.org/#/downloads)
+#ID name reference formula charge mass InChI InChIKey SMILES
+MNXM738702 NADPH chebi:57783 C21H26N7O17P3 -4 741.06200 InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-17)28(8-26-12)21-16(44-46(33,34)35)14(30)11(43-21)6-41-48(38,39)45-47(36,37)40-5-10-13(29)15(31)20(42-10)27-3-1-2-9(4-27)18(23)32/h1,3-4,7-8,10-11,13-16,20-21,29-31H,2,5-6H2,(H2,23,32)(H,36,37)(H,38,39)(H2,22,24,25)(H2,33,34,35)/p-4/t10-,11-,13-,14-,15-,16-,20-,21-/m1/s1 InChIKey=ACFIXJIJDZMPPO-NNYOXOHSSA-J NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)([O-])[O-])[C@@H]3O)[C@@H](O)[C@H]2O)C=CC1
+MNXM97613 tetracycline chebi:77932 C22H24N2O8 0 444.15327 InChI=1S/C22H24N2O8/c1-21(31)8-5-4-6-11(25)12(8)16(26)13-9(21)7-10-15(24(2)3)17(27)14(20(23)30)19(29)22(10,32)18(13)28/h4-6,9-10,15,25,27-28,31-32H,7H2,1-3H3,(H2,23,30)/t9-,10-,15-,21+,22-/m0/s1 InChIKey=OFVLGDICTFRJMM-WESIUVDSSA-N C[NH+](C)[C@@H]1C([O-])=C(C(N)=O)C(=O)[C@@]2(O)C(O)=C3C(=O)c4c(O)cccc4[C@@](C)(O)[C@H]3C[C@@H]12
+MNXM162730 11a-hydroxytetracycline chebi:132727 C22H24N2O9 0 460.14818 InChI=1S/C22H24N2O9/c1-20(31)8-5-4-6-10(25)12(8)16(27)22(33)11(20)7-9-14(24(2)3)15(26)13(18(23)29)17(28)21(9,32)19(22)30/h4-6,9,11,14,25-26,31-33H,7H2,1-3H3,(H2,23,29)/t9-,11+,14-,20+,21+,22-/m0/s1 InChIKey=FWVRSACGGAUWNP-BWOONYPSSA-N C[NH+](C)[C@@H]1C([O-])=C(C(N)=O)C(=O)[C@@]2(O)C(=O)[C@@]3(O)C(=O)c4c(O)cccc4[C@@](C)(O)[C@H]3C[C@@H]12
+MNXM5 NADP(+) chebi:58349 C21H25N7O17P3 -3 740.05362 InChI=1S/C21H28N7O17P3/c22-17-12-19(25-7-24-17)28(8-26-12)21-16(44-46(33,34)35)14(30)11(43-21)6-41-48(38,39)45-47(36,37)40-5-10-13(29)15(31)20(42-10)27-3-1-2-9(4-27)18(23)32/h1-4,7-8,10-11,13-16,20-21,29-31H,5-6H2,(H7-,22,23,24,25,32,33,34,35,36,37,38,39)/p-3/t10-,11-,13-,14-,15-,16-,20-,21-/m1/s1 InChIKey=XJLXINKUBYWONI-NNYOXOHSSA-K NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)([O-])[O-])[C@@H]3O)[C@@H](O)[C@H]2O)c1
+MNXM737425 1,2-di-(9Z-octadecenoyl)-sn-glycero-3-phosphoethanolamine chebi:74986 C41H78NO8P 0 743.54651 InChI=1S/C41H78NO8P/c1-3-5-7-9-11-13-15-17-19-21-23-25-27-29-31-33-40(43)47-37-39(38-49-51(45,46)48-36-35-42)50-41(44)34-32-30-28-26-24-22-20-18-16-14-12-10-8-6-4-2/h17-20,39H,3-16,21-38,42H2,1-2H3,(H,45,46)/b19-17-,20-18-/t39-/m1/s1 InChIKey=MWRBNPKJOOWZPW-NYVOMTAGSA-N CCCCCCCC/C=C\CCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[NH3+])OC(=O)CCCCCCC/C=C\CCCCCCCC
+MNXM1107708 (9Z)-octadecenoate chebi:30823 C18H33O2 -1 281.24860 InChI=1S/C18H34O2/c1-2-3-4-5-6-7-8-9-10-11-12-13-14-15-16-17-18(19)20/h9-10H,2-8,11-17H2,1H3,(H,19,20)/p-1/b10-9- InChIKey=ZQPPMHVWECSIRJ-KTKRTIGZSA-M CCCCCCCC/C=C\CCCCCCCC(=O)[O-]
\ No newline at end of file
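The chem_prop.tsv hunk above ends with data rows laid out per the trailing
"#ID name reference formula charge mass InChI InChIKey SMILES" header, one
tab-separated record per line. A minimal sketch of reading such a file,
assuming only that header and license lines start with "#"; the helper name
parse_chem_prop and the column keys are illustrative, not code from this
patch:

CHEM_PROP_COLS = ['id', 'name', 'reference', 'formula', 'charge', 'mass',
                  'inchi', 'inchikey', 'smiles']


def parse_chem_prop(path):
    '''Yield one dict per chemical row, skipping '#' comment lines.'''
    with open(path, encoding='utf-8') as fle:
        for line in fle:
            if line.startswith('#') or not line.strip():
                continue
            yield dict(zip(CHEM_PROP_COLS, line.rstrip('\n').split('\t')))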
diff --git a/metanetx_uniprot/TestingFiles/chem_xref.tsv b/metanetx_uniprot/TestingFiles/chem_xref.tsv
new file mode 100644
index 00000000..9ce7e27d
--- /dev/null
+++ b/metanetx_uniprot/TestingFiles/chem_xref.tsv
@@ -0,0 +1,362 @@
+### MetaNetX/MNXref reconciliation ###
+#Based on the following resources:
+#
+#RESOURCE: MetaNetX/MNXref
+#VERSION: 4.4
+#DATE: 2022/03/16
+#URL: https://www.metanetx.org
+#LICENSE:
+# MetaNetX copyright 2011 SystemsX, SIB Swiss Institute of Bioinformatics
+# Except where otherwise noted, the data available from this site are
+# licensed under a Creative Commons Attribution 4.0 International License.
+# MNXref uses information on cellular compartments, reactions, and
+# metabolites that is sourced from a number of external resources. The
+# licensing agreements of those resources are specified in each of the
+# downloadable files listed below. For each compound, reaction and
+# cellular compartment in the MNXref namespace we indicate which external
+# resource provided the information used in MNXref. Compounds and
+# reactions in the MNXref namespace may be identical to, or differ from,
+# those in the external resource. In either case the data from MNXref may
+# be considered to be subject to the original licensing restrictions of
+# the external resource.
+# (https://www.metanetx.org/mnxdoc/mnxref.html)
+#
+#RESOURCE: BiGG
+#VERSION: 1.6.0, last updated: 2019/10/31 (downloaded on 2021/07/23)
+#URL: http://bigg.ucsd.edu
+#LICENSE:
+# Copyright 2015 The Regents of the University of California
+#
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of BiGG Models
+# for educational, research and non-profit purposes, without fee, and
+# without a written agreement is hereby granted, provided that the above
+# copyright notice, this paragraph and the following three paragraphs
+# appear in all copies.
+#
+# Those desiring to incorporate BiGG Models into commercial products or
+# use for commercial purposes should contact the Technology Transfer &
+# Intellectual Property Services, University of California, San Diego,
+# 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910, Ph: (858)
+# 534-5815, FAX: (858) 534-7345, e-mail: invent@ucsd.edu.
+#
+# In no event shall the University of California be liable to any party
+# for direct, indirect, special, incidental, or consequential damages,
+# including lost profits, arising out of the use of this BiGG database,
+# even if the University of California has been advised of the possibility
+# of such damage.
+#
+# The BiGG Models provided herein are on an "as is" basis, and the
+# University of California has no obligation to provide maintenance,
+# support, updates, enhancements, or modifications. The University of
+# California makes no representations and extends no warranties of any
+# kind, either implied or express, including, but not limited to, the
+# implied warranties of merchantability or fitness for a particular
+# purpose, or that the use of the BiGG Models will not infringe any
+# patent, trademark or other rights.
+# (http://bigg.ucsd.edu/)
+#
+#RESOURCE: The Cell Component Ontology
+#VERSION: 25.0 (downloaded on 2021/06/03)
+#URL: https://bioinformatics.ai.sri.com/CCO/
+#LICENSE:
+# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome
+# databases.
+#
+# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive,
+# royalty-free license to use, modify and redistribute the Open Databases
+# (as such term is defined in Exhibit B) and LICENSEE's modified
+# versions thereof on a royalty-free basis, worldwide and for any purpose;
+# provided, in each case, that if LICENSEE modifies any Open Database (the
+# modified version being a "Modified Open Database"), then (i)
+# LICENSEE must provide a copy of the Modified Open Database to SRI (and
+# hereby grants to SRI a nonexclusive, royalty-free license to use,
+# modify, and redistribute the Modified Open Database worldwide and for
+# any purpose and to authorize others to do so); and (ii) any Modified
+# Open Databases, or websites from which such Modified Open Databases may
+# be obtained, must clearly and prominently:
+#
+# (a) identify the Open Databases from which they were derived;
+#
+# (b) include all applicable copyright notices and author lists from the
+# Open Databases from which they were derived; and
+#
+# (c) identify or summarize all modifications that were made.
+#
+# Any distribution of such Modified Open Databases without the required
+# notices is a violation of SRI's and its licensors' copyright and other
+# proprietary rights. All trademarks, service marks, and trade names are
+# proprietary to SRI and its licensors. The Open Databases, including any
+# files incorporated in or generated from the Open Databases and data
+# accompanying the Open Databases, are licensed to LICENSEE by SRI and its
+# licensors, and SRI and its licensors do not transfer title or any other
+# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open
+# Databases except as otherwise specified herein.
+#
+# 2.1.1 If SRI, in its sole discretion, determines that a Modified
+# Database is of sufficient quality and interest to the community to be
+# hosted on biocyc.org, then SRI may (if the Modified Database includes
+# significant curation over the original Open Database it is derived from,
+# or the last version of the Modified Database provided to SRI) provide to
+# LICENSEE a personal, one-year subscription to biocyc at no cost;
+# provided, however, that if LICENSEE edits the Modified Database via a
+# MySQL server operated by SRI or its contractors, such free one-year
+# subscription will be forfeited.
+# (https://biocyc.org/ptools-academic-license.shtml)
+#
+#RESOURCE: ChEBI
+#VERSION: 203 (downloaded on 2021/09/30)
+#URL: https://www.ebi.ac.uk/chebi/
+#LICENSE:
+# All data in the database is non-proprietary or is derived from a
+# non-proprietary source. It is thus freely accessible and available to
+# anyone. In addition, each data item is fully traceable and explicitly
+# referenced to the original source.
+# (https://www.ebi.ac.uk/chebi/aboutChebiForward.do)
+#
+#RESOURCE: enviPath
+#VERSION: (downloaded on 2021/11/24)
+#URL: https://envipath.org
+#LICENSE:
+# The core data sets of enviPath are licensed under the Creative Commons
+# Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)
+# license. This allows you to use them in a non-commercial context, for
+# example if you work at a University or for a public research institute.
+# You can even redistribute and modify the data using the same license. If
+# you want to use the data commercially, contact us; we offer commercial
+# license agreements.
+# We summarized how you can use the data on our license page.
+# (https://envipath.com/license/)
+#
+#RESOURCE: HMDB
+#VERSION: 4.0 (downloaded on 2021/06/18)
+#URL: https://hmdb.ca
+#LICENSE:
+# HMDB is offered to the public as a freely available resource. Use and
+# re-distribution of the data, in whole or in part, for commercial
+# purposes requires explicit permission of the authors and explicit
+# acknowledgment of the source material (HMDB) and the original
+# publication.
+# (https://hmdb.ca/about)
+#
+#RESOURCE: KEGG
+#VERSION: 98.0+/06-11, Jun 21 (downloaded on 2021/06/11)
+#URL: https://www.kegg.jp
+#LICENSE:
+# Academic users may freely use the KEGG website and may also freely link
+# to the KEGG website.
+# Non-academic users may use the KEGG website as end users for
+# non-commercial purposes, but any other use requires a license agreement.
+# Academic users who utilize KEGG for providing academic services are
+# requested to obtain a KEGG FTP subscription for organizational use,
+# which includes a proper license agreement.
+# Non-academic users and Academic users intending to use KEGG for
+# commercial purposes are requested to obtain a license agreement through
+# KEGG's exclusive licensing agent, Pathway Solutions.
+# (https://www.kegg.jp/kegg/legal.html)
+#
+#RESOURCE: LipidMaps
+#VERSION: 2021-05-28 (downloaded on 2021/06/11)
+#URL: https://www.lipidmaps.org
+#LICENSE:
+# The Lipidomics Gateway is provided on an "as is" basis, without warranty
+# or representation of any kind, express or implied. The content of the
+# Lipidomics Gateway website is protected by international copyright,
+# trademark and other laws. You may download articles and web pages from
+# this site for your personal, non-commercial use only, provided that you
+# keep intact all authorship, copyright and other proprietary notices. The
+# Featured Lipid can also be used for educational purposes, provided that
+# credit is given to the Lipidomics Gateway. If you use the Lipidomics
+# Gateway, you accept these terms. The LIPID MAPS Consortium reserves the
+# right to modify these terms at any time.
+# (https://www.lipidmaps.org/about/)
+#
+#RESOURCE: MetaCyc
+#VERSION: 25.0 (downloaded on 2021/06/03)
+#URL: https://metacyc.org
+#LICENSE:
+# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome
+# databases.
+#
+# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive,
+# royalty-free license to use, modify and redistribute the Open Databases
+# (as such term is defined in Exhibit B) and LICENSEE's modified
+# versions thereof on a royalty-free basis, worldwide and for any purpose;
+# provided, in each case, that if LICENSEE modifies any Open Database (the
+# modified version being a "Modified Open Database"), then (i)
+# LICENSEE must provide a copy of the Modified Open Database to SRI (and
+# hereby grants to SRI a nonexclusive, royalty-free license to use,
+# modify, and redistribute the Modified Open Database worldwide and for
+# any purpose and to authorize others to do so); and (ii) any Modified
+# Open Databases, or websites from which such Modified Open Databases may
+# be obtained, must clearly and prominently:
+#
+# (a) identify the Open Databases from which they were derived;
+#
+# (b) include all applicable copyright notices and author lists from the
+# Open Databases from which they were derived; and
+#
+# (c) identify or summarize all modifications that were made.
+#
+# Any distribution of such Modified Open Databases without the required
+# notices is a violation of SRI's and its licensors' copyright and other
+# proprietary rights. All trademarks, service marks, and trade names are
+# proprietary to SRI and its licensors. The Open Databases, including any
+# files incorporated in or generated from the Open Databases and data
+# accompanying the Open Databases, are licensed to LICENSEE by SRI and its
+# licensors, and SRI and its licensors do not transfer title or any other
+# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open
+# Databases except as otherwise specified herein.
+#
+# 2.1.1 If SRI, in its sole discretion, determines that a Modified
+# Database is of sufficient quality and interest to the community to be
+# hosted on biocyc.org, then SRI may (if the Modified Database includes
+# significant curation over the original Open Database it is derived from,
+# or the last version of the Modified Database provided to SRI) provide to
+# LICENSEE a personal, one-year subscription to biocyc at no cost;
+# provided, however, that if LICENSEE edits the Modified Database via a
+# MySQL server operated by SRI or its contractors, such free one-year
+# subscription will be forfeited.
+# (https://biocyc.org/ptools-academic-license.shtml)
+#
+#RESOURCE: Reactome
+#VERSION: 77 June 14, 2021 (downloaded on 2021/09/03)
+#URL: https://reactome.org
+#LICENSE:
+# Reactome is an open source and open access resource, available to anyone.
+# Usage of Reactome material is covered by two Creative Commons licenses:
+#
+# The terms of the Creative Commons Public Domain (CC0) License apply to all
+# Reactome annotation files, e.g. identifier mapping data, specialized data
+# files, and interaction data derived from Reactome.
+# (https://reactome.org/license/)
+#
+#RESOURCE: Rhea
+#VERSION: 119 (downloaded on 2021/11/03)
+#URL: https://www.rhea-db.org
+#LICENSE:
+# All data in Rhea is freely accessible and available for anyone to use under
+# the Creative Commons Attribution License.
+# (https://www.rhea-db.org/documentation)
+#
+#RESOURCE: SABIO-RK
+#VERSION: Software Update: 2021/05/11 -- Database Release: 2021/05/28 (downloaded on 2021/07/01)
+#URL: http://sabiork.h-its.org
+#LICENSE:
+# HITS gGmbH owns the SABIO-RK database, its interfaces and its
+# associated documentation (all referred to in the following as
+# "Database"). You should carefully read the following terms and
+# conditions before using this Database. Your use of this Database
+# indicates your acceptance of this license agreement and all terms and
+# conditions. You are hereby granted a non-exclusive and non-transferable
+# license to use the Database according to the following terms and
+# conditions. This license is to use the Database for Non-Commercial
+# Purpose only. Non-Commercial Purpose means the use of the Database
+# solely for internal non-commercial research and academic purposes.
+# Non-Commercial Purpose excludes, without limitation, any use of the
+# Database, as part of, or in any way in connection with a product or
+# service which is sold, offered for sale, licensed, leased, loaned, or
+# rented. Permission to use this Database for Non-Commercial Purpose is
+# hereby granted without fee and subject to the following terms of this
+# license.
+#
+# Commercial Use
+# If you desire to use the Database for profit-making or commercial
+# purposes, you agree to negotiate in good faith a license with the HITS
+# prior to such profit-making or commercial use. The HITS shall have no
+# obligation to grant such license to you, and may grant exclusive or
+# non-exclusive licenses to others. You agree to notify the HITS of any
+# inquiries you have for commercial use of the Database and/or its
+# modifications. You may contact the following email to discuss commercial
+# use: sabiork at h-its.org
+#
+# Governing Law
+# This Agreement is governed by the law of the Federal Republic of
+# Germany. The application of the UN Convention on the Sale of Goods is
+# excluded.
+#
+# Disclaimer of Warranty
+# Because this Database is licensed free of charge, there is no warranty
+# for the data contained in it or for the methods used for its querying. The
+# HITS makes no warranty or representation that the operation of the
+# Database in this compilation will be error-free, and the HITS is under
+# no obligation to provide any services, by way of maintenance, update, or
+# otherwise.
+#
+# THIS DATABASE AND THE ACCOMPANYING FILES ARE LICENSED "AS IS" AND
+# WITHOUT WARRANTIES AS TO PERFORMANCE OR MERCHANTABILITY OR ANY OTHER
+# WARRANTIES WHETHER EXPRESSED OR IMPLIED. NO WARRANTY OF FITNESS FOR A
+# PARTICULAR PURPOSE IS OFFERED. THE ENTIRE RISK AS TO THE QUALITY AND
+# PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
+# DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
+# CORRECTION.
+#
+# Limitation of Liability
+# IN NO EVENT WILL HITS, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+# REDISTRIBUTE THE DATABASE AS PERMITTED ABOVE, BE LIABLE TO YOU FOR
+# DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL
+# DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM
+# (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED
+# INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF
+# THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF VTIP AND HITS
+# OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+#
+# Reference to SABIO-RK
+# Users will cite SABIO-RK in publications or presentations, whenever the
+# data used was extracted from the database.
+#
+# Termination
+# This agreement is effective until terminated. You may terminate this
+# agreement at any time by destroying all material associated with the
+# database (e.g., documentation or web service clients) in your possession
+# and by stopping any access to the database directly or from software
+# generated by you. This agreement will terminate immediately without
+# notice from the HITS if you fail to comply with any of the terms and
+# conditions of this license. This agreement will also terminate
+# immediately without notice from the HITS if it is found to implement
+# patented algorithms or contain copyrighted code not owned by or licensed
+# to the HITS for the purpose of its inclusion in the SABIO-RK Database.
+# This agreement cannot be terminated by any other mechanism or for any
+# other reason than those stated herein.
+#
+# Place of Court
+# The exclusive venue for all disputes arising from or in connection with
+# this Agreement is Mannheim, Germany (HRB 337446), when the Licensee is a
+# business person, a legal entity governed by public law, or a special
+# fund governed by public law, or does not have a general place of
+# jurisdiction within the Federal Republic of Germany. Address all
+# correspondence regarding this license to the electronic mail address:
+# sabiork at h-its.org. Any inquiries and comments regarding bugs, bug
+# fixes, enhancements, modifications or any other similar issues should be
+# directed to: sabiork at h-its.org
+#
+# Copyright 2007 by HITS, gGmbH. All rights reserved.
+# (http://sabiork.h-its.org/layouts/content/termscondition.gsp)
+#
+#RESOURCE: The SEED
+#VERSION: 2.6.1 (July 31, 2020) (downloaded on 2021/08/09)
+#URL: https://modelseed.org
+#LICENSE:
+# All tools and datasets that make up the SEED are in the public domain.
+# (https://modelseed.org)
+#
+#RESOURCE: SwissLipids
+#VERSION: (downloaded on 2021/07/29)
+#URL: https://www.swisslipids.org
+#LICENSE:
+# SwissLipids is licensed under a Creative Commons
+# Attribution-NonCommercial-NoDerivatives 4.0 International License.
+#
+# Commercial users and those who wish to use this work for commercial
+# purposes should contact the SIB technology transfer officer at:
+# marc.filliettaz@genebio.com
+# (https://www.swisslipids.org/#/downloads)
+#source ID description
+BIOMASS BIOMASS BIOMASS
+CHEBI:57783 MNXM738702 NADPH||2'-O-phosphonatoadenosine 5'-{3-[1-(3-carbamoyl-1,4-dihydropyridin-1-yl)-1,4-anhydro-D-ribitol-5-yl] diphosphate}||NADPH tetraanion||NADPH(4-)
+CHEBI:77932 MNXM97613 tetracycline||(1S,4aS,11S,11aS,12aS)-3-carbamoyl-1-(dimethylazaniumyl)-4a,5,7,11-tetrahydroxy-11-methyl-4,6-dioxo-1,4,4a,6,11,11a,12,12a-octahydrotetracen-2-olate||tetracycline zwitterion
+CHEBI:132727 MNXM162730 11a-hydroxytetracycline||(1S,4aR5aS,11S,11aR,12aS)-3-carbamoyl-1-(dimethylazaniumyl)-4a,5a,7,11-tetrahydroxy-11-methyl-4,5,6-trioxo-1,4,4a,5,5a,6,11,11a,12,12a-decahydrotetracen-2-olate||11a-hydroxytetracycline zwitterion
+chebi:15377 WATER H2O||BOUND WATER||HOH||WATER||Wasser||Water||[OH2]||acqua||agua||aqua||dihydridooxygen||dihydrogen oxide||eau||hydrogen hydroxide||oxidane||water
+CHEBI:58349 MNXM5 NADP(+)||2'-O-phosphonatoadenosine 5'-{3-[1-(3-carbamoylpyridinio)-1,4-anhydro-D-ribitol-5-yl] diphosphate}||NADP trianion||NADP(3-)
+CHEBI:74986 MNXM737425 1,2-di-(9Z-octadecenoyl)-sn-glycero-3-phosphoethanolamine||1,2-dioleoyl-sn-glycero-3-phosphoethanolamine zwitterion||1-(9Z)-octadecenoyl-2-(9Z)-octadecenoyl-sn-glycero-3-phosphoethanolamine zwitterion||1-C18:1(omega-9)-2-C18:1(omega-9)-phosphatidylethanolamine zwitterion||2-azaniumylethyl (2R)-2,3-bis[(9Z)-octadec-9-enoyloxy]propyl phosphate
+chebi:14389 MNXM738220 secondary/obsolete/fantasy identifier
+CHEBI:15378 MNXM1 H(+)||H+||Hydron||hydrogen(1+)||hydron
+CHEBI:30823 MNXM1107708 (9Z)-octadecenoate||(9Z)-octadec-9-enoate||(Z)-9-octadecenoic acid, ion(1-)||Oleat||cis-9-octadecenoate||oleate||oleic acid anion
\ No newline at end of file
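The chem_xref.tsv rows above map a source identifier (first column, e.g.
chebi:57783; note the mixed CHEBI:/chebi: casing) to an MNXref identifier,
with '||'-separated synonyms in the description column. A sketch of building
a lookup from it, assuming the three-column "#source ID description" layout;
parse_chem_xref is an illustrative name, not code from this patch:

def parse_chem_xref(path):
    '''Map each lower-cased source id to (MNXref id, synonym list).'''
    xrefs = {}
    with open(path, encoding='utf-8') as fle:
        for line in fle:
            if line.startswith('#') or not line.strip():
                continue
            fields = line.rstrip('\n').split('\t')
            names = fields[2].split('||') if len(fields) > 2 else []
            xrefs[fields[0].lower()] = (fields[1], names)
    return xrefs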
diff --git a/metanetx_uniprot/TestingFiles/ncbitaxon.json b/metanetx_uniprot/TestingFiles/ncbitaxon.json
new file mode 100644
index 00000000..2324a45b
--- /dev/null
+++ b/metanetx_uniprot/TestingFiles/ncbitaxon.json
@@ -0,0 +1,188 @@
+{
+ "graphs" : [ {
+ "nodes" : [ {
+ "id" : "http://purl.obolibrary.org/obo/NCBITaxon_817",
+ "meta" : {
+ "xrefs" : [ {
+ "val" : "PMID:16559622"
+ }, {
+ "val" : "GC_ID:11"
+ }, {
+ "val" : "PMID:28066339"
+ } ],
+ "synonyms" : [ {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Bacteroides incommunis",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Pseudobacterium fragilis",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Ristella uncata",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Ristella incommunis",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Bacteroides inaequalis",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Pseudobacterium incommunis",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Bacteroides uncatus",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Sphaerophorus inaequalis",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Fusiformis fragilis",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Sphaerophorus intermedius",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Pseudobacterium inaequalis",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Bacillus fragilis",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Ristella fragilis",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Pseudobacterium uncatum",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ } ],
+ "basicPropertyValues" : [ {
+ "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId",
+ "val" : "NCBITaxon:665938"
+ }, {
+ "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId",
+ "val" : "NCBITaxon:33929"
+ }, {
+ "pred" : "http://purl.obolibrary.org/obo/ncbitaxon#has_rank",
+ "val" : "http://purl.obolibrary.org/obo/NCBITaxon_species"
+ }, {
+ "pred" : "http://www.geneontology.org/formats/oboInOwl#hasOBONamespace",
+ "val" : "ncbi_taxonomy"
+ }, {
+ "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId",
+ "val" : "NCBITaxon:469587"
+ } ]
+ },
+ "type" : "CLASS",
+ "lbl" : "Bacteroides fragilis"
+ }, {
+ "id" : "http://purl.obolibrary.org/obo/NCBITaxon_562",
+ "meta" : {
+ "xrefs" : [ {
+ "val" : "GC_ID:11"
+ }, {
+ "val" : "PMID:10319482"
+ } ],
+ "synonyms" : [ {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Enterococcus coli",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasExactSynonym",
+ "val" : "Escherichia/Shigella coli",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#equivalent_name"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Bacillus coli",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Bacterium coli",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasRelatedSynonym",
+ "val" : "Bacterium coli commune",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym"
+ }, {
+ "pred" : "hasExactSynonym",
+ "val" : "E. coli",
+ "xrefs" : [ ],
+ "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#common_name"
+ } ],
+ "basicPropertyValues" : [ {
+ "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId",
+ "val" : "NCBITaxon:1806490"
+ }, {
+ "pred" : "http://purl.obolibrary.org/obo/ncbitaxon#has_rank",
+ "val" : "http://purl.obolibrary.org/obo/NCBITaxon_species"
+ }, {
+ "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId",
+ "val" : "NCBITaxon:469598"
+ }, {
+ "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId",
+ "val" : "NCBITaxon:1637691"
+ }, {
+ "pred" : "http://www.geneontology.org/formats/oboInOwl#hasOBONamespace",
+ "val" : "ncbi_taxonomy"
+ }, {
+ "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId",
+ "val" : "NCBITaxon:662104"
+ }, {
+ "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId",
+ "val" : "NCBITaxon:662101"
+ } ]
+ },
+ "type" : "CLASS",
+ "lbl" : "Escherichia coli"
+ } ],
+ "edges" : [ {
+ "sub" : "http://purl.obolibrary.org/obo/NCBITaxon_295405",
+ "pred" : "is_a",
+ "obj" : "http://purl.obolibrary.org/obo/NCBITaxon_817"
+ }, {
+ "sub" : "http://purl.obolibrary.org/obo/NCBITaxon_1389418",
+ "pred" : "is_a",
+ "obj" : "http://purl.obolibrary.org/obo/NCBITaxon_562"
+ } ],
+ "id" : "http://purl.obolibrary.org/obo/ncbitaxon.owl",
+ "meta" : {
+ "subsets" : [ ],
+ "xrefs" : [ ],
+ "basicPropertyValues" : [ ]
+ },
+ "equivalentNodesSets" : [ ],
+ "logicalDefinitionAxioms" : [ ],
+  "domainRangeAxioms" : [ ],
+ "propertyChainAxioms" : [ ]
+ } ]
+}
\ No newline at end of file
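ncbitaxon.json above is an obographs-style document: taxa sit in
graphs[0]['nodes'] (each with 'id', 'lbl' and optional synonyms) and parent
links in graphs[0]['edges'] as is_a triples. A sketch of extracting labels
and child/parent pairs under those assumptions; load_taxon_graph is an
illustrative name, not code from this patch:

import json


def load_taxon_graph(path):
    '''Return ({short id: label}, [(child, parent), ...]).'''
    with open(path, encoding='utf-8') as fle:
        graph = json.load(fle)['graphs'][0]

    # e.g. http://purl.obolibrary.org/obo/NCBITaxon_562 -> NCBITaxon_562
    labels = {node['id'].rsplit('/', 1)[-1]: node.get('lbl', '')
              for node in graph['nodes']}
    is_a = [(edge['sub'].rsplit('/', 1)[-1], edge['obj'].rsplit('/', 1)[-1])
            for edge in graph['edges'] if edge['pred'] == 'is_a']
    return labels, is_a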
diff --git a/metanetx_uniprot/TestingFiles/reac_prop.tsv b/metanetx_uniprot/TestingFiles/reac_prop.tsv
new file mode 100644
index 00000000..75826bdb
--- /dev/null
+++ b/metanetx_uniprot/TestingFiles/reac_prop.tsv
@@ -0,0 +1,359 @@
+### MetaNetX/MNXref reconciliation ###
+#Based on the following resources:
+#
+#RESOURCE: MetaNetX/MNXref
+#VERSION: 4.4
+#DATE: 2022/03/16
+#URL: https://www.metanetx.org
+#LICENSE:
+# MetaNetX copyright 2011 SystemsX, SIB Swiss Institute of Bioinformatics
+# Except where otherwise noted, the data available from this site are
+# licensed under a Creative Commons Attribution 4.0 International License.
+# MNXref uses information on cellular compartments, reactions, and
+# metabolites that is sourced from a number of external resources. The
+# licensing agreements of those resources are specified in each of the
+# downloadable files listed below. For each compound, reaction and
+# cellular compartment in the MNXref namespace we indicate which external
+# resource provided the information used in MNXref. Compounds and
+# reactions in the MNXref namespace may be identical to, or differ from,
+# those in the external resource. In either case the data from MNXref may
+# be considered to be subject to the original licensing restrictions of
+# the external resource.
+# (https://www.metanetx.org/mnxdoc/mnxref.html)
+#
+#RESOURCE: BiGG
+#VERSION: 1.6.0, last updated: 2019/10/31 (downloaded on 2021/07/23)
+#URL: http://bigg.ucsd.edu
+#LICENSE:
+# Copyright 2015 The Regents of the University of California
+#
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of BiGG Models
+# for educational, research and non-profit purposes, without fee, and
+# without a written agreement is hereby granted, provided that the above
+# copyright notice, this paragraph and the following three paragraphs
+# appear in all copies.
+#
+# Those desiring to incorporate BiGG Models into commercial products or
+# use for commercial purposes should contact the Technology Transfer &
+# Intellectual Property Services, University of California, San Diego,
+# 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910, Ph: (858)
+# 534-5815, FAX: (858) 534-7345, e-mail: invent@ucsd.edu.
+#
+# In no event shall the University of California be liable to any party
+# for direct, indirect, special, incidental, or consequential damages,
+# including lost profits, arising out of the use of this BiGG database,
+# even if the University of California has been advised of the possibility
+# of such damage.
+#
+# The BiGG Models provided herein are on an "as is" basis, and the
+# University of California has no obligation to provide maintenance,
+# support, updates, enhancements, or modifications. The University of
+# California makes no representations and extends no warranties of any
+# kind, either implied or express, including, but not limited to, the
+# implied warranties of merchantability or fitness for a particular
+# purpose, or that the use of the BiGG Models will not infringe any
+# patent, trademark or other rights.
+# (http://bigg.ucsd.edu/)
+#
+#RESOURCE: The Cell Component Ontology
+#VERSION: 25.0 (downloaded on 2021/06/03)
+#URL: https://bioinformatics.ai.sri.com/CCO/
+#LICENSE:
+# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome
+# databases.
+#
+# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive,
+# royalty-free license to use, modify and redistribute the Open Databases
+# (as such term is defined in Exhibit B) and LICENSEE's modified
+# versions thereof on a royalty-free basis, worldwide and for any purpose;
+# provided, in each case, that if LICENSEE modifies any Open Database (the
+# modified version being a "Modified Open Database"), then (i)
+# LICENSEE must provide a copy of the Modified Open Database to SRI (and
+# hereby grants to SRI a nonexclusive, royalty-free license to use,
+# modify, and redistribute the Modified Open Database worldwide and for
+# any purpose and to authorize others to do so); and (ii) any Modified
+# Open Databases, or websites from which such Modified Open Databases may
+# be obtained, must clearly and prominently:
+#
+# (a) identify the Open Databases from which they were derived;
+#
+# (b) include all applicable copyright notices and author lists from the
+# Open Databases from which they were derived; and
+#
+# (c) identify or summarize all modifications that were made.
+#
+# Any distribution of such Modified Open Databases without the required
+# notices is a violation of SRI's and its licensors' copyright and other
+# proprietary rights. All trademarks, service marks, and trade names are
+# proprietary to SRI and its licensors. The Open Databases, including any
+# files incorporated in or generated from the Open Databases and data
+# accompanying the Open Databases, are licensed to LICENSEE by SRI and its
+# licensors, and SRI and its licensors do not transfer title or any other
+# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open
+# Databases except as otherwise specified herein.
+#
+# 2.1.1 If SRI, in its sole discretion, determines that a Modified
+# Database is of sufficient quality and interest to the community to be
+# hosted on biocyc.org, then SRI may (if the Modified Database includes
+# significant curation over the original Open Database it is derived from,
+# or the last version of the Modified Database provided to SRI) provide to
+# LICENSEE a personal, one-year subscription to biocyc at no cost;
+# provided, however, that if LICENSEE edits the Modified Database via a
+# MySQL server operated by SRI or its contractors, such free one-year
+# subscription will be forfeited.
+# (https://biocyc.org/ptools-academic-license.shtml)
+#
+#RESOURCE: ChEBI
+#VERSION: 203 (downloaded on 2021/09/30)
+#URL: https://www.ebi.ac.uk/chebi/
+#LICENSE:
+# All data in the database is non-proprietary or is derived from a
+# non-proprietary source. It is thus freely accessible and available to
+# anyone. In addition, each data item is fully traceable and explicitly
+# referenced to the original source.
+# (https://www.ebi.ac.uk/chebi/aboutChebiForward.do)
+#
+#RESOURCE: enviPath
+#VERSION: (downloaded on 2021/11/24)
+#URL: https://envipath.org
+#LICENSE:
+# The core data sets of enviPath are licensed under the Creative Commons
+# Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)
+# license. This allows you to use them in a non-commercial context, for
+# example if you work at a University or for a public research institute.
+# You can even redistribute and modify the data using the same license. If
+# you want to use the data commercially, contact us; we offer commercial
+# license agreements.
+# We summarized how you can use the data on our license page.
+# (https://envipath.com/license/)
+#
+#RESOURCE: HMDB
+#VERSION: 4.0 (downloaded on 2021/06/18)
+#URL: https://hmdb.ca
+#LICENSE:
+# HMDB is offered to the public as a freely available resource. Use and
+# re-distribution of the data, in whole or in part, for commercial
+# purposes requires explicit permission of the authors and explicit
+# acknowledgment of the source material (HMDB) and the original
+# publication.
+# (https://hmdb.ca/about)
+#
+#RESOURCE: KEGG
+#VERSION: 98.0+/06-11, Jun 21 (downloaded on 2021/06/11)
+#URL: https://www.kegg.jp
+#LICENSE:
+# Academic users may freely use the KEGG website and may also freely link
+# to the KEGG website.
+# Non-academic users may use the KEGG website as end users for
+# non-commercial purposes, but any other use requires a license agreement.
+# Academic users who utilize KEGG for providing academic services are
+# requested to obtain a KEGG FTP subscription for organizational use,
+# which includes a proper license agreement.
+# Non-academic users and Academic users intending to use KEGG for
+# commercial purposes are requested to obtain a license agreement through
+# KEGG's exclusive licensing agent, Pathway Solutions.
+# (https://www.kegg.jp/kegg/legal.html)
+#
+#RESOURCE: LipidMaps
+#VERSION: 2021-05-28 (downloaded on 2021/06/11)
+#URL: https://www.lipidmaps.org
+#LICENSE:
+# The Lipidomics Gateway is provided on an "as is" basis, without warranty
+# or representation of any kind, express or implied. The content of the
+# Lipidomics Gateway website is protected by international copyright,
+# trademark and other laws. You may download articles and web pages from
+# this site for your personal, non-commercial use only, provided that you
+# keep intact all authorship, copyright and other proprietary notices. The
+# Featured Lipid can also be used for educational purposes, provided that
+# credit is given to the Lipidomics Gateway. If you use the Lipidomics
+# Gateway, you accept these terms. The LIPID MAPS Consortium reserves the
+# right to modify these terms at any time.
+# (https://www.lipidmaps.org/about/)
+#
+#RESOURCE: MetaCyc
+#VERSION: 25.0 (downloaded on 2021/06/03)
+#URL: https://metacyc.org
+#LICENSE:
+# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome
+# databases.
+#
+# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive,
+# royalty-free license to use, modify and redistribute the Open Databases
+# (as such term is defined in Exhibit B) and LICENSEE's modified
+# versions thereof on a royalty-free basis, worldwide and for any purpose;
+# provided, in each case, that if LICENSEE modifies any Open Database (the
+# modified version being a "Modified Open Database"), then (i)
+# LICENSEE must provide a copy of the Modified Open Database to SRI (and
+# hereby grants to SRI a nonexclusive, royalty-free license to use,
+# modify, and redistribute the Modified Open Database worldwide and for
+# any purpose and to authorize others to do so); and (ii) any Modified
+# Open Databases, or websites from which such Modified Open Databases may
+# be obtained, must clearly and prominently:
+#
+# (a) identify the Open Databases from which they were derived;
+#
+# (b) include all applicable copyright notices and author lists from the
+# Open Databases from which they were derived; and
+#
+# (c) identify or summarize all modifications that were made.
+#
+# Any distribution of such Modified Open Databases without the required
+# notices is a violation of SRI's and its licensors' copyright and other
+# proprietary rights. All trademarks, service marks, and trade names are
+# proprietary to SRI and its licensors. The Open Databases, including any
+# files incorporated in or generated from the Open Databases and data
+# accompanying the Open Databases, are licensed to LICENSEE by SRI and its
+# licensors, and SRI and its licensors do not transfer title or any other
+# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open
+# Databases except as otherwise specified herein.
+#
+# 2.1.1 If SRI, in its sole discretion, determines that a Modified
+# Database is of sufficient quality and interest to the community to be
+# hosted on biocyc.org, then SRI may (if the Modified Database includes
+# significant curation over the original Open Database it is derived from,
+# or the last version of the Modified Database provided to SRI) provide to
+# LICENSEE a personal, one-year subscription to biocyc at no cost;
+# provided, however, that if LICENSEE edits the Modified Database via a
+# MySQL server operated by SRI or its contractors, such free one-year
+# subscription will be forfeited.
+# (https://biocyc.org/ptools-academic-license.shtml)
+#
+#RESOURCE: Reactome
+#VERSION: 77 June 14, 2021 (downloaded on 2021/09/03)
+#URL: https://reactome.org
+#LICENSE:
+# Reactome is an open source and open access resource, available to anyone.
+# Usage of Reactome material is covered by two Creative Commons licenses:
+#
+# The terms of the Creative Commons Public Domain (CC0) License apply to all
+# Reactome annotation files, e.g. identifier mapping data, specialized data
+# files, and interaction data derived from Reactome.
+# (https://reactome.org/license/)
+#
+#RESOURCE: Rhea
+#VERSION: 119 (downloaded on 2021/11/03)
+#URL: https://www.rhea-db.org
+#LICENSE:
+# All data in Rhea is freely accessible and available for anyone to use under
+# the Creative Commons Attribution License.
+# (https://www.rhea-db.org/documentation)
+#
+#RESOURCE: SABIO-RK
+#VERSION: Software Update: 2021/05/11 -- Database Release: 2021/05/28 (downloaded on 2021/07/01)
+#URL: http://sabiork.h-its.org
+#LICENSE:
+# HITS gGmbH owns the SABIO-RK database, its interfaces and its
+# associated documentation (all referred to in the following as
+# "Database"). You should carefully read the following terms and
+# conditions before using this Database. Your use of this Database
+# indicates your acceptance of this license agreement and all terms and
+# conditions. You are hereby granted a non-exclusive and non-transferable
+# license to use the Database according to the following terms and
+# conditions. This license is to use the Database for Non-Commercial
+# Purpose only. Non-Commercial Purpose means the use of the Database
+# solely for internal non-commercial research and academic purposes.
+# Non-Commercial Purpose excludes, without limitation, any use of the
+# Database, as part of, or in any way in connection with a product or
+# service which is sold, offered for sale, licensed, leased, loaned, or
+# rented. Permission to use this Database for Non-Commercial Purpose is
+# hereby granted without fee and subject to the following terms of this
+# license.
+#
+# Commercial Use
+# If you desire to use the Database for profit-making or commercial
+# purposes, you agree to negotiate in good faith a license with the HITS
+# prior to such profit-making or commercial use. The HITS shall have no
+# obligation to grant such license to you, and may grant exclusive or
+# non-exclusive licenses to others. You agree to notify the HITS of any
+# inquiries you have for commercial use of the Database and/or its
+# modifications. You may contact the following email to discuss commercial
+# use: sabiork at h-its.org
+#
+# Governing Law
+# This Agreement is governed by the law of the Federal Republic of
+# Germany. The application of the UN Convention on the Sale of Goods is
+# excluded.
+#
+# Disclaimer of Warranty
+# Because this Database is licensed free of charge, there is no warranty
+# for the data contained in it or for the methods used for its querying. The
+# HITS makes no warranty or representation that the operation of the
+# Database in this compilation will be error-free, and the HITS is under
+# no obligation to provide any services, by way of maintenance, update, or
+# otherwise.
+#
+# THIS DATABASE AND THE ACCOMPANYING FILES ARE LICENSED "AS IS" AND
+# WITHOUT WARRANTIES AS TO PERFORMANCE OR MERCHANTABILITY OR ANY OTHER
+# WARRANTIES WHETHER EXPRESSED OR IMPLIED. NO WARRANTY OF FITNESS FOR A
+# PARTICULAR PURPOSE IS OFFERED. THE ENTIRE RISK AS TO THE QUALITY AND
+# PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
+# DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
+# CORRECTION.
+#
+# Limitation of Liability
+# IN NO EVENT WILL HITS, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+# REDISTRIBUTE THE DATABASE AS PERMITTED ABOVE, BE LIABLE TO YOU FOR
+# DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL
+# DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM
+# (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED
+# INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF
+# THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF VTIP AND HITS
+# OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+#
+# Reference to SABIO-RK
+# Users will cite SABIO-RK in publications or presentations, whenever the
+# data used was extracted from the database.
+#
+# Termination
+# This agreement is effective until terminated. You may terminate this
+# agreement at any time by destroying all material associated with the
+# database (e.g., documentation or web service clients) in your possession
+# and by stopping any access to the database directly or from software
+# generated by you. This agreement will terminate immediately without
+# notice from the HITS if you fail to comply with any of the terms and
+# conditions of this license. This agreement will also terminate
+# immediately without notice from the HITS if it is found to implement
+# patented algorithms or contain copyrighted code not owned by or licensed
+# to the HITS for the purpose of its inclusion in the SABIO-RK Database.
+# This agreement cannot be terminated by any other mechanism or for any
+# other reason than those stated herein.
+#
+# Place of Court
+# The exclusive venue for all disputes arising from or in connection with
+# this Agreement is Mannheim, Germany (HRB 337446), when the Licensee is a
+# business person, a legal entity governed by public law, or a special
+# fund governed by public law, or does not have a general place of
+# jurisdiction within the Federal Republic of Germany. Address all
+# correspondence regarding this license to the electronic mail address:
+# sabiork at h-its.org. Any inquiries and comments regarding bugs, bug
+# fixes, enhancements, modifications or any other similar issues should be
+# directed to: sabiork at h-its.org
+#
+# Copyright 2007 by HITS, gGmbH. All rights reserved.
+# (http://sabiork.h-its.org/layouts/content/termscondition.gsp)
+#
+#RESOURCE: The SEED
+#VERSION: 2.6.1 (July 31, 2020) (downloaded on 2021/08/09)
+#URL: https://modelseed.org
+#LICENSE:
+# All tools and datasets that make up the SEED are in the public domain.
+# (https://modelseed.org)
+#
+#RESOURCE: SwissLipids
+#VERSION: (downloaded on 2021/07/29)
+#URL: https://www.swisslipids.org
+#LICENSE:
+# SwissLipids is licensed under a Creative Commons
+# Attribution-NonCommercial-NoDerivatives 4.0 International License.
+#
+# Commercial users and those who wish to use this work for commercial
+# purposes should contact the SIB technology transfer officer at:
+# marc.filliettaz@genebio.com
+# (https://www.swisslipids.org/#/downloads)
+#ID mnx_equation reference classifs is_balanced is_transport
+EMPTY = mnx:EMPTY B
+MNXR114744 1 MNXM162730@MNXD1 + 1 MNXM5@MNXD1 + 1 WATER@MNXD1 = 1 MNXM1@MNXD1 + 1 MNXM735438@MNXD1 + 1 MNXM738702@MNXD1 + 1 MNXM97613@MNXD1 rheaR:50004 1.14.13.231 B
+MNXR171656 1 MNXM5@MNXD1 + 1 MNXM743287@MNXD1 + 1 WATER@MNXD1 = 1 MNXM735438@MNXD1 + 1 MNXM738702@MNXD1 + 1 MNXM743286@MNXD1 rheaR:61444
+MNXR168222 1 MNXM1089988@MNXD1 + 1 MNXM1102167@MNXD1 = 1 MNXM1089989@MNXD1 + 1 MNXM1102072@MNXD1 rheaR:42776 2.1.1.180
+MNXR165961 1 MNXM1107698@MNXD1 + 1 WATER@MNXD1 = 1 MNXM1108087@MNXD1 + 1 MNXM728579@MNXD1 rheaR:18689 3.1.1.32
+MNXR171532 2 MNXM1107708@MNXD1 + 2 MNXM1@MNXD1 + 1 MNXM734941@MNXD1 = 1 MNXM737425@MNXD1 + 2 WATER@MNXD1 rheaR:60624 B
\ No newline at end of file
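Each reac_prop.tsv row above carries an mnx_equation in which every
participant is written as "coefficient chemical@compartment", joined by
" + " within a side and " = " between substrates and products. A sketch of
splitting such an equation, counting substrates as negative; parse_equation
is an illustrative name, not code from this patch:

def parse_equation(equation):
    '''Return (signed coefficient, chemical, compartment) tuples.'''
    participants = []
    left, _, right = equation.partition(' = ')
    for side, sign in ((left.strip(), -1), (right.strip(), 1)):
        if not side:
            continue
        for term in side.split(' + '):
            coeff, compound = term.split(' ', 1)
            chem, _, compartment = compound.partition('@')
            participants.append((sign * int(coeff), chem, compartment))
    return participants

For MNXR165961 above this yields [(-1, 'MNXM1107698', 'MNXD1'),
(-1, 'WATER', 'MNXD1'), (1, 'MNXM1108087', 'MNXD1'),
(1, 'MNXM728579', 'MNXD1')].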
diff --git a/metanetx_uniprot/TestingFiles/reac_xref.tsv b/metanetx_uniprot/TestingFiles/reac_xref.tsv
new file mode 100644
index 00000000..d03bb0c0
--- /dev/null
+++ b/metanetx_uniprot/TestingFiles/reac_xref.tsv
@@ -0,0 +1,365 @@
+### MetaNetX/MNXref reconciliation ###
+#Based on the following resources:
+#
+#RESOURCE: MetaNetX/MNXref
+#VERSION: 4.4
+#DATE: 2022/03/16
+#URL: https://www.metanetx.org
+#LICENSE:
+# MetaNetX copyright 2011 SystemsX, SIB Swiss Institute of Bioinformatics
+# Except where otherwise noted, the data available from this site are
+# licensed under a Creative Commons Attribution 4.0 International License.
+# MNXref uses information on cellular compartments, reactions, and
+# metabolites that is sourced from a number of external resources. The
+# licensing agreements of those resources are specified in each of the
+# downloadable files listed below. For each compound, reaction and
+# cellular compartment in the MNXref namespace we indicate which external
+# resource provided the information used in MNXref. Compounds and
+# reactions in the MNXref namespace may be identical to, or differ from,
+# those in the external resource. In either case the data from MNXref may
+# be considered to be subject to the original licensing restrictions of
+# the external resource.
+# (https://www.metanetx.org/mnxdoc/mnxref.html)
+#
+#RESOURCE: BiGG
+#VERSION: 1.6.0, last updated: 2019/10/31 (downloaded on 2021/07/23)
+#URL: http://bigg.ucsd.edu
+#LICENSE:
+# Copyright 2015 The Regents of the University of California
+#
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of BiGG Models
+# for educational, research and non-profit purposes, without fee, and
+# without a written agreement is hereby granted, provided that the above
+# copyright notice, this paragraph and the following three paragraphs
+# appear in all copies.
+#
+# Those desiring to incorporate BiGG Models into commercial products or
+# use for commercial purposes should contact the Technology Transfer &
+# Intellectual Property Services, University of California, San Diego,
+# 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910, Ph: (858)
+# 534-5815, FAX: (858) 534-7345, e-mail: invent@ucsd.edu.
+#
+# In no event shall the University of California be liable to any party
+# for direct, indirect, special, incidental, or consequential damages,
+# including lost profits, arising out of the use of this BiGG database,
+# even if the University of California has been advised of the possibility
+# of such damage.
+#
+# The BiGG Models provided herein are on an "as is" basis, and the
+# University of California has no obligation to provide maintenance,
+# support, updates, enhancements, or modifications. The University of
+# California makes no representations and extends no warranties of any
+# kind, either implied or express, including, but not limited to, the
+# implied warranties of merchantability or fitness for a particular
+# purpose, or that the use of the BiGG Models will not infringe any
+# patent, trademark or other rights.
+# (http://bigg.ucsd.edu/)
+#
+#RESOURCE: The Cell Component Ontology
+#VERSION: 25.0 (downloaded on 2021/06/03)
+#URL: https://bioinformatics.ai.sri.com/CCO/
+#LICENSE:
+# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome
+# databases.
+#
+# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive,
+# royalty-free license to use, modify and redistribute the Open Databases
+# (as such term is defined in Exhibit B) and LICENSEE's modified
+# versions thereof on a royalty-free basis, worldwide and for any purpose;
+# provided, in each case, that if LICENSEE modifies any Open Database (the
+# modified version being a "Modified Open Database"), then (i)
+# LICENSEE must provide a copy of the Modified Open Database to SRI (and
+# hereby grants to SRI a nonexclusive, royalty-free license to use,
+# modify, and redistribute the Modified Open Database worldwide and for
+# any purpose and to authorize others to do so); and (ii) any Modified
+# Open Databases, or websites from which such Modified Open Databases may
+# be obtained, must clearly and prominently:
+#
+# (a) identify the Open Databases from which they were derived;
+#
+# (b) include all applicable copyright notices and author lists from the
+# Open Databases from which they were derived; and
+#
+# (c) identify or summarize all modifications that were made.
+#
+# Any distribution of such Modified Open Databases without the required
+# notices is a violation of SRI's and its licensors' copyright and other
+# proprietary rights. All trademarks, service marks, and trade names are
+# proprietary to SRI and its licensors. The Open Databases, including any
+# files incorporated in or generated from the Open Databases and data
+# accompanying the Open Databases, are licensed to LICENSEE by SRI and its
+# licensors, and SRI and its licensors do not transfer title or any other
+# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open
+# Databases except as otherwise specified herein.
+#
+# 2.1.1 If SRI, in its sole discretion, determines that a Modified
+# Database is of sufficient quality and interest to the community to be
+# hosted on biocyc.org, then SRI may (if the Modified Database includes
+# significant curation over the original Open Database it is derived from,
+# or the last version of the Modified Database provided to SRI) provide to
+# LICENSEE a personal, one-year subscription to biocyc at no cost;
+# provided, however, that if LICENSEE edits the Modified Database via a
+# MySQL server operated by SRI or its contractors, such free one-year
+# subscription will be forfeited.
+# (https://biocyc.org/ptools-academic-license.shtml)
+#
+#RESOURCE: ChEBI
+#VERSION: 203 (downloaded on 2021/09/30)
+#URL: https://www.ebi.ac.uk/chebi/
+#LICENSE:
+# All data in the database is non-proprietary or is derived from a
+# non-proprietary source. It is thus freely accessible and available to
+# anyone. In addition, each data item is fully traceable and explicitly
+# referenced to the original source.
+# (https://www.ebi.ac.uk/chebi/aboutChebiForward.do)
+#
+#RESOURCE: enviPath
+#VERSION: (downloaded on 2021/11/24)
+#URL: https://envipath.org
+#LICENSE:
+# The core data sets of enviPath are licensed under the Creative Commons
+# Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)
+# license. This allows you to use them in a non-commercial context, for
+# example if you work at a University or for a public research institute.
+# You can even redistribute and modify the data using the same license. If
+# you want to use the data commercially, contact us; we offer commercial
+# license agreements.
+# We summarized how you can use the data on our license page.
+# (https://envipath.com/license/)
+#
+#RESOURCE: HMDB
+#VERSION: 4.0 (downloaded on 2021/06/18)
+#URL: https://hmdb.ca
+#LICENSE:
+# HMDB is offered to the public as a freely available resource. Use and
+# re-distribution of the data, in whole or in part, for commercial
+# purposes requires explicit permission of the authors and explicit
+# acknowledgment of the source material (HMDB) and the original
+# publication.
+# (https://hmdb.ca/about)
+#
+#RESOURCE: KEGG
+#VERSION: 98.0+/06-11, Jun 21 (downloaded on 2021/06/11)
+#URL: https://www.kegg.jp
+#LICENSE:
+# Academic users may freely use the KEGG website and may also freely link
+# to the KEGG website.
+# Non-academic users may use the KEGG website as end users for
+# non-commercial purposes, but any other use requires a license agreement.
+# Academic users who utilize KEGG for providing academic services are
+# requested to obtain a KEGG FTP subscription for organizational use,
+# which includes a proper license agreement.
+# Non-academic users and Academic users intending to use KEGG for
+# commercial purposes are requested to obtain a license agreement through
+# KEGG's exclusive licensing agent, Pathway Solutions.
+# (https://www.kegg.jp/kegg/legal.html)
+#
+#RESOURCE: LipidMaps
+#VERSION: 2021-05-28 (downloaded on 2021/06/11)
+#URL: https://www.lipidmaps.org
+#LICENSE:
+# The Lipidomics Gateway is provided on an "as is" basis, without warranty
+# or representation of any kind, express or implied. The content of the
+# Lipidomics Gateway website is protected by international copyright,
+# trademark and other laws. You may download articles and web pages from
+# this site for your personal, non-commercial use only, provided that you
+# keep intact all authorship, copyright and other proprietary notices. The
+# Featured Lipid can also be used for educational purposes, provided that
+# credit is given to the Lipidomics Gateway. If you use the Lipidomics
+# Gateway, you accept these terms. The LIPID MAPS Consortium reserves the
+# right to modify these terms at any time.
+# (https://www.lipidmaps.org/about/)
+#
+#RESOURCE: MetaCyc
+#VERSION: 25.0 (downloaded on 2021/06/03)
+#URL: https://metacyc.org
+#LICENSE:
+# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome
+# databases.
+#
+# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive,
+# royalty-free license to use, modify and redistribute the Open Databases
+# (as such term is defined in Exhibit B) and LICENSEE's modified
+# versions thereof on a royalty-free basis, worldwide and for any purpose;
+# provided, in each case, that if LICENSEE modifies any Open Database (the
+# modified version being a "Modified Open Database"), then (i)
+# LICENSEE must provide a copy of the Modified Open Database to SRI (and
+# hereby grants to SRI a nonexclusive, royalty-free license to use,
+# modify, and redistribute the Modified Open Database worldwide and for
+# any purpose and to authorize others to do so); and (ii) any Modified
+# Open Databases, or websites from which such Modified Open Databases may
+# be obtained, must clearly and prominently:
+#
+# (a) identify the Open Databases from which they were derived:
+#
+# (b) include all applicable copyright notices and author lists from the
+# Open Databases from which they were derived; and
+#
+# (c) identify or summarize all modifications that were made.
+#
+# Any distribution of such Modified Open Databases without the required
+# notices is a violation of SRI's and its licensors' copyright and other
+# proprietary rights. All trademarks, service marks, and trade names are
+# proprietary to SRI and its licensors. The Open Databases, including any
+# files incorporated in or generated from the Open Databases and data
+# accompanying the Open Databases, are licensed to LICENSEE by SRI and its
+# licensors, and SRI and its licensors do not transfer title or any other
+# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open
+# Databases except as otherwise specified herein.
+#
+# 2.1.1 If SRI, in its sole discretion, determines that a Modified
+# Database is of sufficient quality and interest to the community to be
+# hosted on biocyc.org, then SRI may (if the Modified Database includes
+# significant curation over the original Open Database it is derived from,
+# or the last version of the Modified Database provided to SRI) provide to
+# LICENSEE a personal, one-year subscription to biocyc at no cost;
+# provided, however, that if LICENSEE edits the Modified Database via a
+# MySQL server operated by SRI or its contractors, such free one-year
+# subscription will be forfeited.
+# (https://biocyc.org/ptools-academic-license.shtml)
+#
+#RESOURCE: Reactome
+#VERSION: 77 June 14, 2021 (downloaded on 2021/09/03)
+#URL: https://reactome.org
+#LICENSE:
+# Reactome is an open source and open access resource, available to anyone.
+# Usage of Reactome material is covered by two Creative Commons licenses:
+#
+# The terms of the Creative Commons Public Domain (CC0) License apply to all
+# Reactome annotation files, e.g. identifier mapping data, specialized data
+# files, and interaction data derived from Reactome.
+# (https://reactome.org/license/)
+#
+#RESOURCE: Rhea
+#VERSION: 119 (downloaded on 2021/11/03)
+#URL: https://www.rhea-db.org
+#LICENSE:
+# All data in Rhea is freely accessible and available for anyone to use under
+# the Creative Commons Attribution License.
+# (https://www.rhea-db.org/documentation)
+#
+#RESOURCE: SABIO-RK
+#VERSION: Software Update: 2021/05/11 -- Database Release: 2021/05/28 (downloaded on 2021/07/01)
+#URL: http://sabiork.h-its.org
+#LICENSE:
+# HITS, gGmbH HITS own the SABIO-RK database, its interfaces and its
+# associated documentation (all referred to in the following as
+# "Database"). You should carefully read the following terms and
+# conditions before using this Database. Your use of this Database
+# indicates your acceptance of this license agreement and all terms and
+# conditions.You are hereby granted a non-exclusive and non-transferable
+# license to use the Database according to the following terms and
+# conditions. This license is to use the Database for Non-Commercial
+# Purpose only. Non-Commercial Purpose means the use of the Database
+# solely for internal non-commercial research and academic purposes.
+# Non-Commercial Purpose excludes, without limitation, any use of the
+# Database, as part of, or in any way in connection with a product or
+# service which is sold, offered for sale, licensed, leased, loaned, or
+# rented. Permission to use this Database for Non-Commercial Purpose is
+# hereby granted without fee and subject to the following terms of this
+# license.
+#
+# Commercial Use
+# If you desire to use the Database for profit-making or commercial
+# purposes, you agree to negotiate in good faith a license with the HITS
+# prior to such profit-making or commercial use. The HITS shall have no
+# obligation to grant such license to you, and may grant exclusive or
+# non-exclusive licenses to others. You agree to notify the HITS of any
+# inquiries you have for commercial use of the Database and/or its
+# modifications. You may contact the following email to discuss commercial
+# use: sabiork at h-its.org
+#
+# Governing Law
+# This Agreement is governed by the law of the Federal Republic of
+# Germany. The application of the UN Convention on the Sale of Goods is
+# excluded.
+#
+# Disclaimer of Warranty
+# Because this Database is licensed free of charge, there is no warranty
+# for the data in it contained and the methods used for its querying. The
+# HITS makes no warranty or representation that the operation of the
+# Database in this compilation will be error-free, and the HITS is under
+# no obligation to provide any services, by way of maintenance, update, or
+# otherwise.
+#
+# THIS DATABASE AND THE ACCOMPANYING FILES ARE LICENSED "AS IS" AND
+# WITHOUT WARRANTIES AS TO PERFORMANCE OR MERCHANTABILITY OR ANY OTHER
+# WARRANTIES WHETHER EXPRESSED OR IMPLIED. NO WARRANTY OF FITNESS FOR A
+# PARTICULAR PURPOSE IS OFFERED. THE ENTIRE RISK AS TO THE QUALITY AND
+# PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
+# DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
+# CORRECTION.
+#
+# Limitation of Liability
+# IN NO EVENT WILL HITS, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+# REDISTRIBUTE THE DATABASE AS PERMITTED ABOVE, BE LIABLE TO YOU FOR
+# DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL
+# DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM
+# (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED
+# INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF
+# THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF VTIP AND HITS
+# OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+#
+# Reference to SABIO-RK Users will cite SABIO-RK in publications or
+# presentations, whenever the data used was extracted from the database.
+# Termination This agreement is effective until terminated. You may
+# terminate this agreement at any time by destroying all associated
+# material (e.g., documentation or web service clients) to the database in
+# your possession and by stopping any access to the database directly or
+# from software generated by you. This agreement will terminate
+# immediately without notice from and HITS if you fail to comply with any
+# of the terms and conditions of this license. This agreement will also
+# terminate immediately without notice from the HITS if it is found to
+# implement patented algorithms or contain copyrighted code not owned or
+# licensed the HITS for the purpose of its inclusion in the SABIO-RK
+# Database. This agreement cannot be terminated by any other mechanism or
+# for any other reason than those stated herein.
+#
+# Place of Court
+# The exclusive venue for all disputes arising from or in connection with
+# this Agreement is Mannheim, Germany (HRB 337446), when the Licensee is a
+# business person, a legal entity governed by public law, or a special
+# fund governed by public law, or does not have a general place of
+# jurisdiction within the Federal Republic of Germany. Address all
+# correspondence regarding this license to electronic mail address:
+# sabiork at h-its.org Any inquiries and comments regarding bugs, bug
+# fixes, enhancements, modifications or any other similar issues should be
+# directed to: sabiork at h-its.org
+#
+# Copyright 2007 by HITS, gGmbH. All rights reserved.
+# (http://sabiork.h-its.org/layouts/content/termscondition.gsp)
+#
+#RESOURCE: The SEED
+#VERSION: 2.6.1 (July 31, 2020) (downloaded on 2021/08/09)
+#URL: https://modelseed.org
+#LICENSE:
+# All tools and datasets that make up the SEED are in the public domain.
+# (https://modelseed.org)
+#
+#RESOURCE: SwissLipids
+#VERSION: (downloaded on 2021/07/29)
+#URL: https://www.swisslipids.org
+#LICENSE:
+# SwissLipids is licensed under a Creative Commons Attribution-Non
+# Commercial-NoDerivatives 4.0 International License.
+#
+# Commercial users and those who wish to use this work for commercial
+# purposes please contact the SIB technology transfer officer at:
+# marc.filliettaz@genebio.com
+# (https://www.swisslipids.org/#/downloads)
+#source ID description
+EMPTY EMPTY Empty equation
+rhea:50004 MNXR114744 1 chebi:15378@rheaC:comp + 1 chebi:15379@rheaC:comp + 1 chebi:57783@rheaC:comp + 1 chebi:77932@rheaC:comp > 1 chebi:132727@rheaC:comp + 1 chebi:15377@rheaC:comp + 1 chebi:58349@rheaC:comp
+rheaR:50004 MNXR114744 1 chebi:15378@rheaC:comp + 1 chebi:15379@rheaC:comp + 1 chebi:57783@rheaC:comp + 1 chebi:77932@rheaC:comp > 1 chebi:132727@rheaC:comp + 1 chebi:15377@rheaC:comp + 1 chebi:58349@rheaC:comp
+rhea:61444 MNXR171656 1 chebi:144644@rheaC:comp + 1 chebi:15378@rheaC:comp + 1 chebi:15379@rheaC:comp + 1 chebi:57783@rheaC:comp > 1 chebi:144645@rheaC:comp + 1 chebi:15377@rheaC:comp + 1 chebi:58349@rheaC:comp
+rheaR:61444 MNXR171656 1 chebi:144644@rheaC:comp + 1 chebi:15378@rheaC:comp + 1 chebi:15379@rheaC:comp + 1 chebi:57783@rheaC:comp > 1 chebi:144645@rheaC:comp + 1 chebi:15377@rheaC:comp + 1 chebi:58349@rheaC:comp
+rhea:42776 MNXR168222 1 chebi:59789@rheaC:comp + 1 rheaG:10228@rheaC:comp > 1 chebi:15378@rheaC:comp + 1 chebi:57856@rheaC:comp + 1 rheaG:10227@rheaC:comp
+rheaR:42776 MNXR168222 1 chebi:59789@rheaC:comp + 1 rheaG:10228@rheaC:comp > 1 chebi:15378@rheaC:comp + 1 chebi:57856@rheaC:comp + 1 rheaG:10227@rheaC:comp
+rhea:18690 MNXR165961 1 chebi:15377@rheaC:comp + 1 chebi:57643@rheaC:comp --> 1 chebi:15378@rheaC:comp + 1 chebi:28868@rheaC:comp + 1 chebi:57875@rheaC:comp
+rheaR:18690 MNXR165961 1 chebi:15377@rheaC:comp + 1 chebi:57643@rheaC:comp --> 1 chebi:15378@rheaC:comp + 1 chebi:28868@rheaC:comp + 1 chebi:57875@rheaC:comp
+rhea:60624 MNXR171532 1 chebi:74986@rheaC:comp + 2 chebi:15377@rheaC:comp > 1 chebi:143890@rheaC:comp + 2 chebi:15378@rheaC:comp + 2 chebi:30823@rheaC:comp
+rheaR:60624 MNXR171532 1 chebi:74986@rheaC:comp + 2 chebi:15377@rheaC:comp > 1 chebi:143890@rheaC:comp + 2 chebi:15378@rheaC:comp + 2 chebi:30823@rheaC:comp
+rhea:60625 MNXR171532 1 chebi:74986@rheaC:comp + 2 chebi:15377@rheaC:comp --> 1 chebi:143890@rheaC:comp + 2 chebi:15378@rheaC:comp + 2 chebi:30823@rheaC:comp
+rheaR:60625 MNXR171532 1 chebi:74986@rheaC:comp + 2 chebi:15377@rheaC:comp --> 1 chebi:143890@rheaC:comp + 2 chebi:15378@rheaC:comp + 2 chebi:30823@rheaC:comp
\ No newline at end of file
From 8af33bd38add6b77073d8167be1b775a8974fce4 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Fri, 15 Sep 2023 12:35:27 -0600
Subject: [PATCH 22/29] Update seq_utils.py
---
metanetx_uniprot/seq_utils.py | 53 +++++++++++++++++++++++++++++++++--
1 file changed, 51 insertions(+), 2 deletions(-)
diff --git a/metanetx_uniprot/seq_utils.py b/metanetx_uniprot/seq_utils.py
index 43682c9a..d0233a41 100644
--- a/metanetx_uniprot/seq_utils.py
+++ b/metanetx_uniprot/seq_utils.py
@@ -138,10 +138,59 @@ def _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values, ver
batch = organism_ids[i:min(i + batch_size, len(organism_ids))]
query = '%20OR%20'.join(['organism_id:' + organism_id for organism_id in batch])
url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \
- '&format=tsv&size=500&fields=organism_id%2C' + '%2C'.join([parse.quote(field)
+ '&format=tsv&size=500&keywords=Reference+proteome&fields=organism_id%2C' + '%2C'.join([parse.quote(field)
# '&format=tsv&size=1&fields=organism_id%2C' + '%2C'.join([parse.quote(field)
for field in fields])
-
_parse_uniprot_data(url, values)
return values
+
+def parse_response(res, values):
+    '''Parses one TSV response page, appending a dict per data row to values.'''
+    headers = None
+
+    for line in res.iter_lines():
+        line = line.decode('utf-8')
+        tokens = line.strip().split('\t')
+
+        if headers is None:
+            # The first line of each page is the TSV header.
+            headers = tokens
+        else:
+            # Build the row dict directly; reusing 'res' here would shadow
+            # the response object.
+            values.append(dict(zip(headers, tokens)))
+
+    return values
+
+
+def get_jobs(url, values):
+    '''Follows UniProt cursor pagination, accumulating every page into values.'''
+    session = requests.Session()
+
+    page = session.get(url)
+    parse_response(page, values)
+
+    # UniProt exposes the next cursor as a Link header with rel="next",
+    # surfaced by requests as page.links['next']['url'].
+    while 'next' in page.links:
+        page = session.get(page.links['next']['url'])
+        parse_response(page, values)
+
+def _get_uniprot_batch_reference_proteome(url):
+    '''Returns every row of a paginated UniProt query as a list of dicts.'''
+    values = []
+    get_jobs(url, values)
+    return values
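The helpers above implement cursor pagination against the UniProt REST API:
each response carries a Link header with rel="next", which requests exposes
as response.links['next']['url']. A minimal usage sketch, assuming `requests`
is imported at the top of seq_utils.py; the proteomes query URL is the one
build_taxa_ids.py passes in later in this series:

    # Illustrative only: drives the pagination helpers end to end.
    from seq_utils import _get_uniprot_batch_reference_proteome

    # Bacteria (taxonomy_id:2) or Archaea (2157) reference proteomes, 500 rows per page.
    URL = ('https://rest.uniprot.org/proteomes/search?&format=tsv'
           '&query=%28%28taxonomy_id%3A2%29%20OR%20%28taxonomy_id%3A2157%29%29'
           '%20AND%20%28proteome_type%3A1%29&size=500')

    rows = _get_uniprot_batch_reference_proteome(URL)  # list of dicts keyed by TSV header
    organism_ids = {row['Organism Id'] for row in rows}
    print(len(organism_ids), 'reference-proteome taxa')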
From fb852481dc0e40a08c8a9e01285d375d7eb4561a Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Fri, 15 Sep 2023 12:50:03 -0600
Subject: [PATCH 23/29] Update README.md
---
metanetx_uniprot/README.md | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/metanetx_uniprot/README.md b/metanetx_uniprot/README.md
index 31028033..14bd7102 100644
--- a/metanetx_uniprot/README.md
+++ b/metanetx_uniprot/README.md
@@ -9,8 +9,14 @@ Access chemical, reaction, enzyme, and organism information from the following s
- Rhea
- UniProt
-To run:
+To run the full pipeline to get all relationships:
```
python build.py ~/biochem4j ',' 1
```
+
+To run and only get reference proteome taxa that also exist in kg-microbe:
+```
+python build_taxa_ids.py ~/biochem4j 1
+```
+*Note, uses ncbitaxon.json (build from kg-microbe) which is included in the Files directory.
From ee68a063d9124a5d26884b45e425153af1adc1af Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Fri, 15 Sep 2023 12:50:50 -0600
Subject: [PATCH 24/29] Add files via upload
---
metanetx_uniprot/build_taxa_ids.py | 154 +++++++++++++++++++++++++++++
1 file changed, 154 insertions(+)
create mode 100644 metanetx_uniprot/build_taxa_ids.py
diff --git a/metanetx_uniprot/build_taxa_ids.py b/metanetx_uniprot/build_taxa_ids.py
new file mode 100644
index 00000000..fb325e6d
--- /dev/null
+++ b/metanetx_uniprot/build_taxa_ids.py
@@ -0,0 +1,154 @@
+
+## Output all taxa IDs that exist in kg-microbe and as reference proteomes in UniProt.
+
+
+import multiprocessing
+import os
+import sys
+import tarfile
+import tempfile
+import urllib
+from urllib.request import urlretrieve
+
+from kgx.cli.cli_utils import transform
+import pandas as pd
+from seq_utils import _get_uniprot_batch_reference_proteome
+
+import utils, seq_utils
+
+
+__NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
+
+__UNIPROT_REFERENCE_PROTEOMES_URL = 'https://rest.uniprot.org/proteomes/search?&format=tsv&query=%28%28taxonomy_id%3A2%29%20OR%20%28taxonomy_id%3A2157%29%29%20AND%20%28proteome_type%3A1%29&size=500'
+
+def build_csv(dest_dir, num_threads):
+    '''Build database CSV files.'''
+ #writer = utils.Writer(dest_dir)
+
+ # Get Organism data:
+ print('Parsing NCBI Taxonomy')
+    load(dest_dir) #--> writes Organism.tsv
+
+
+
+def load(output_dir, source=__NCBITAXONOMY_URL, ref_source=__UNIPROT_REFERENCE_PROTEOMES_URL):
+ '''Loads NCBI Taxonomy data.'''
+ #To get data directly from NCBI Taxon
+ #nodes_filename, names_filename = _get_ncbi_taxonomy_files(source)
+ #nodes, rels = _parse_nodes(nodes_filename, array_delimiter)
+ #_parse_names(nodes, names_filename, array_delimiter)
+ #######
+ #To get data from kg-microbe
+ nodes_filename = os.getcwd()+'/Files/ncbitaxon.json'
+ #For testing
+ #nodes_filename = os.getcwd()+'/TestingFiles/ncbitaxon.json'
+ print('parsing ncbi taxon json file')
+ kgx_nodes_json = _parse_nodes_kgmicrobe(nodes_filename,'ncbitaxon_transformed',output_dir)
+
+ nodes,nodes_df = transform_kgx_output_format(kgx_nodes_json)
+
+ #Constrain by those that have reference proteomes, don't use if testing
+ ref_organisms = _get_uniprot_batch_reference_proteome(ref_source)
+ ref_organism_ids = [str(k['Organism Id']) for k in ref_organisms]
+ node_vals = [i for i in nodes if i in ref_organism_ids]
+
+ node_vals = ['NCBITaxon:' + i for i in node_vals]
+ kgx_nodes_json_subset = nodes_df[nodes_df['id'].isin(node_vals)]
+ kgx_nodes_json_subset.to_csv(output_dir+'/Organism.tsv', index=False, sep='\t')
+ print('Wrote file: ',output_dir+'/Organism.tsv')
+
+def _get_ncbi_taxonomy_files(source):
+ '''Downloads and extracts NCBI Taxonomy files.'''
+ temp_dir = tempfile.gettempdir()
+ temp_gzipfile = tempfile.NamedTemporaryFile()
+ urlretrieve(source, temp_gzipfile.name)
+
+ temp_tarfile = tarfile.open(temp_gzipfile.name, 'r:gz')
+ temp_tarfile.extractall(temp_dir)
+
+ temp_gzipfile.close()
+ temp_tarfile.close()
+
+ return os.path.join(temp_dir, 'nodes.dmp'), \
+ os.path.join(temp_dir, 'names.dmp')
+
+def _parse_nodes_kgmicrobe(filename, output_name,output_dir):
+ '''Parses nodes file.'''
+
+ transform(inputs=[filename], input_format='obojson', output= os.path.join(output_dir, output_name), output_format='tsv')
+
+ return output_dir+'/'+output_name+'_nodes.tsv'
+
+def transform_kgx_output_format(transformed_nodes_tsv):
+
+ labels = pd.read_csv(transformed_nodes_tsv, sep = '\t', usecols = ['id','name'])
+
+ nodes = []
+
+ #Get node IDs to help subset according to reference proteomes
+ for i in range(len(labels)):
+ tax_id = labels.iloc[i].loc['id'].split('NCBITaxon:')[1]
+ nodes.append(tax_id)
+
+ return nodes,labels
+
+
+def _parse_nodes(filename):
+ '''Parses nodes file.'''
+ nodes = {}
+ rels = []
+
+ with open(filename, 'r') as textfile:
+ for line in textfile:
+ tokens = [x.strip() for x in line.split('|')]
+ tax_id = tokens[0]
+
+ if tax_id != '1':
+ rels.append([tax_id, 'is_a', tokens[1]])
+
+ nodes[tax_id] = {'taxonomy:ID(Organism)': tax_id,
+ ':LABEL':
+ 'Organism' + ',' + tokens[2]}
+
+ return nodes, rels
+
+
+def _parse_names(nodes, filename):
+ '''Parses names file.'''
+
+ with open(filename, 'r') as textfile:
+ for line in textfile:
+ tokens = [x.strip() for x in line.split('|')]
+ node = nodes[tokens[0]]
+
+ if 'name' not in node:
+ node['name'] = tokens[1]
+ node['names:string[]'] = set([node['name']])
+ else:
+ node['names:string[]'].add(tokens[1])
+
+ for _, node in nodes.items():
+ if 'names:string[]' in node:
+ node['names:string[]'] = \
+ ','.join(node['names:string[]'])
+
+
+def main(args):
+ '''main method'''
+ num_threads = 0
+
+    # Usage: build_taxa_ids.py <dest_dir> <num_threads> (no array delimiter,
+    # unlike build.py), so the thread count is args[1].
+    if len(args) > 1:
+        try:
+            num_threads = int(args[1])
+        except ValueError:
+            if args[1] == 'True':
+                num_threads = multiprocessing.cpu_count()
+
+ print('Running build with ' + str(num_threads) + ' threads')
+
+ build_csv(args[0], num_threads)
+
+
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
\ No newline at end of file
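At its core, load() above intersects two ID sets: NCBITaxon IDs parsed from
the kg-microbe KGX nodes file and organism IDs returned by the UniProt
reference-proteome query. A condensed sketch of that filtering step, reusing
the 'id' column and 'Organism Id' key from the code above:

    import pandas as pd

    def filter_reference_proteome_taxa(nodes_df, ref_organisms):
        '''Keeps only kg-microbe taxa that UniProt lists as reference proteomes.'''
        ref_ids = {str(row['Organism Id']) for row in ref_organisms}
        # 'NCBITaxon:562' -> '562', matching UniProt's numeric organism IDs.
        tax_ids = nodes_df['id'].str.split('NCBITaxon:').str[-1]
        return nodes_df[tax_ids.isin(ref_ids)]

    # nodes_df = pd.read_csv('ncbitaxon_transformed_nodes.tsv', sep='\t')  # path illustrative
    # filter_reference_proteome_taxa(nodes_df, ref_organisms).to_csv('Organism.tsv', sep='\t', index=False)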
From 79638d7925b65aea0f3e96bf5441ae4a883cfbb0 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Fri, 15 Sep 2023 12:51:28 -0600
Subject: [PATCH 25/29] Update README.md
---
metanetx_uniprot/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/metanetx_uniprot/README.md b/metanetx_uniprot/README.md
index 14bd7102..a9e9503c 100644
--- a/metanetx_uniprot/README.md
+++ b/metanetx_uniprot/README.md
@@ -19,4 +19,4 @@ To run and only get reference proteome taxa that also exist in kg-microbe:
```
python build_taxa_ids.py ~/biochem4j 1
```
-*Note, uses ncbitaxon.json (build from kg-microbe) which is included in the Files directory.
+*Note, uses ncbitaxon.json (built from kg-microbe) which is expected to be in the Files directory.
From bbaca9427141218411686ce2ff0af6a432e285c7 Mon Sep 17 00:00:00 2001
From: bsantan <70932395+bsantan@users.noreply.github.com>
Date: Mon, 18 Sep 2023 13:08:40 -0600
Subject: [PATCH 26/29] Updated to introduce go_utils, rhea2go, kg-phenio, and
PheKnowLator resources to graph.
---
metanetx_uniprot/build.py | 13 +-
metanetx_uniprot/build_taxa_ids.py | 161 +++++++++++++++++++++++
metanetx_uniprot/chemical_utils.py | 3 +
metanetx_uniprot/enzyme_utils.py | 18 ++-
metanetx_uniprot/go_utils.py | 10 ++
metanetx_uniprot/mnxref_utils.py | 104 +++++++++++++--
metanetx_uniprot/reaction_utils.py | 202 +++++++++++++++++++++++++++--
metanetx_uniprot/rhea_utils.py | 16 ++-
metanetx_uniprot/seq_utils.py | 53 +++++++-
9 files changed, 541 insertions(+), 39 deletions(-)
create mode 100644 metanetx_uniprot/build_taxa_ids.py
create mode 100644 metanetx_uniprot/go_utils.py
diff --git a/metanetx_uniprot/build.py b/metanetx_uniprot/build.py
index 0ac9524b..edf45b74 100644
--- a/metanetx_uniprot/build.py
+++ b/metanetx_uniprot/build.py
@@ -1,9 +1,6 @@
'''
SYNBIOCHEM-DB (c) University of Manchester 2015
-'''
-SYNBIOCHEM-DB (c) University of Manchester 2015
-
SYNBIOCHEM-DB is licensed under the MIT License.
To view a copy of this license, visit .
@@ -30,7 +27,7 @@ def build_csv(dest_dir, array_delimiter, num_threads):
chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter)
-
+ ## ChEBI download currently fails with urllib.error.URLError, so ChEBI parsing is disabled below:
#print('Parsing ChEBI')
#chebi_utils.load(chem_man, writer)
@@ -51,12 +48,12 @@ def build_csv(dest_dir, array_delimiter, num_threads):
reaction_ids = rhea_utils.load(reac_man, num_threads=num_threads)
reac_man.write_files(writer) #--> writes Enzyme_Reaction.tsv
- #
print('Parsing MNXref')
- mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer, reaction_ids)
- mnx_loader.load() #--> writes Reaction_Chemical.tsv
+ mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer, reaction_ids, process_ids,ncbi_taxonomy_utils,array_delimiter)
+ print('mxn loading')
+ mnx_loader.load() #--> writes Reaction_Chemical.tsv, Chemical_Process.tsv, ##NOT WORKING: Process_Disease.tsv, Process_Phenotype.tsv
- #chem_man.write_files(writer)
+ chem_man.write_files(writer) #--> writes Chemicals.tsv
def main(args):
diff --git a/metanetx_uniprot/build_taxa_ids.py b/metanetx_uniprot/build_taxa_ids.py
new file mode 100644
index 00000000..fd57ee34
--- /dev/null
+++ b/metanetx_uniprot/build_taxa_ids.py
@@ -0,0 +1,161 @@
+
+## Output all taxa IDs that exist in kg-microbe and as reference proteomes in UniProt.
+
+
+import multiprocessing
+import os
+import sys
+import tarfile
+import tempfile
+import urllib
+from urllib.request import urlretrieve
+
+from kgx.cli.cli_utils import transform
+import pandas as pd
+from seq_utils import _get_uniprot_batch_reference_proteome
+
+import utils, seq_utils
+
+
+__NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
+
+__UNIPROT_REFERENCE_PROTEOMES_URL = 'https://rest.uniprot.org/proteomes/search?&format=tsv&query=%28%28taxonomy_id%3A2%29%20OR%20%28taxonomy_id%3A2157%29%29%20AND%20%28proteome_type%3A1%29&size=500'
+
+def build_csv(dest_dir, num_threads):
+    '''Build database CSV files.'''
+
+ # Get Organism data:
+ print('Parsing NCBI Taxonomy')
+    load(dest_dir) #--> writes Organism.tsv
+
+
+
+def load(output_dir, source=__NCBITAXONOMY_URL, ref_source=__UNIPROT_REFERENCE_PROTEOMES_URL):
+ '''Loads NCBI Taxonomy data.'''
+ #To get data directly from NCBI Taxon
+ #nodes_filename, names_filename = _get_ncbi_taxonomy_files(source)
+ #nodes, rels = _parse_nodes(nodes_filename, array_delimiter)
+ #_parse_names(nodes, names_filename, array_delimiter)
+ #######
+ #To get data from kg-microbe
+ nodes_filename = os.getcwd()+'/Files/ncbitaxon_nodes.tsv' #ncbitaxon.json
+ #For testing
+ #nodes_filename = os.getcwd()+'/TestingFiles/ncbitaxon.json'
+ print('parsing ncbi taxon tsv file') #json
+ #_parse_nodes_kgmicrobe only used if reading ncbitaxon.json
+ #kgx_nodes_file = _parse_nodes_kgmicrobe(nodes_filename,'ncbitaxon_transformed',output_dir)
+ print('length of ncbitaxon_nodes.tsv: ',len(pd.read_csv(nodes_filename,sep='\t'))) #kgx_nodes))
+
+ #Update to kgx_nodes_file if ncbitaxon.json is input
+ nodes,nodes_df = transform_kgx_output_format(nodes_filename) #kgx_nodes_file)
+
+ #Constrain by those that have reference proteomes, don't use if testing
+ ref_organisms = _get_uniprot_batch_reference_proteome(ref_source)
+ ref_organism_ids = [str(k['Organism Id']) for k in ref_organisms]
+ node_vals = [i for i in nodes if i in ref_organism_ids]
+
+ nodes_not_in_refProteome = list(set(ref_organism_ids) - set(nodes))
+ print('nodes_not_in_refProteome: ',nodes_not_in_refProteome)
+
+ node_vals = ['NCBITaxon:' + i for i in node_vals]
+ kgx_nodes_subset = nodes_df[nodes_df['id'].isin(node_vals)]
+ kgx_nodes_subset.to_csv(output_dir+'/Organism.tsv', index=False, sep='\t')
+ print('Wrote file: ',output_dir+'/Organism.tsv')
+
+def _get_ncbi_taxonomy_files(source):
+ '''Downloads and extracts NCBI Taxonomy files.'''
+ temp_dir = tempfile.gettempdir()
+ temp_gzipfile = tempfile.NamedTemporaryFile()
+ urlretrieve(source, temp_gzipfile.name)
+
+ temp_tarfile = tarfile.open(temp_gzipfile.name, 'r:gz')
+ temp_tarfile.extractall(temp_dir)
+
+ temp_gzipfile.close()
+ temp_tarfile.close()
+
+ return os.path.join(temp_dir, 'nodes.dmp'), \
+ os.path.join(temp_dir, 'names.dmp')
+
+def _parse_nodes_kgmicrobe(filename, output_name,output_dir):
+ '''Parses nodes file.'''
+
+ transform(inputs=[filename], input_format='tsv', output= os.path.join(output_dir, output_name), output_format='tsv') #obojson
+
+ return output_dir+'/'+output_name+'_nodes.tsv'
+
+def transform_kgx_output_format(transformed_nodes_tsv):
+
+ labels = pd.read_csv(transformed_nodes_tsv, sep = '\t', usecols = ['id','name'])
+
+ nodes = []
+
+ #Get node IDs to help subset according to reference proteomes
+ for i in range(len(labels)):
+ try:
+ tax_id = labels.iloc[i].loc['id'].split('NCBITaxon:')[1]
+ nodes.append(tax_id)
+ except IndexError: print(labels.iloc[i].loc['id'])
+
+ return nodes,labels
+
+
+def _parse_nodes(filename):
+ '''Parses nodes file.'''
+ nodes = {}
+ rels = []
+
+ with open(filename, 'r') as textfile:
+ for line in textfile:
+ tokens = [x.strip() for x in line.split('|')]
+ tax_id = tokens[0]
+
+ if tax_id != '1':
+ rels.append([tax_id, 'is_a', tokens[1]])
+
+ nodes[tax_id] = {'taxonomy:ID(Organism)': tax_id,
+ ':LABEL':
+ 'Organism' + ',' + tokens[2]}
+
+ return nodes, rels
+
+
+def _parse_names(nodes, filename):
+ '''Parses names file.'''
+
+ with open(filename, 'r') as textfile:
+ for line in textfile:
+ tokens = [x.strip() for x in line.split('|')]
+ node = nodes[tokens[0]]
+
+ if 'name' not in node:
+ node['name'] = tokens[1]
+ node['names:string[]'] = set([node['name']])
+ else:
+ node['names:string[]'].add(tokens[1])
+
+ for _, node in nodes.items():
+ if 'names:string[]' in node:
+ node['names:string[]'] = \
+ ','.join(node['names:string[]'])
+
+
+def main(args):
+ '''main method'''
+ num_threads = 0
+
+    # Usage: build_taxa_ids.py <dest_dir> <num_threads> (no array delimiter,
+    # unlike build.py), so the thread count is args[1].
+    if len(args) > 1:
+        try:
+            num_threads = int(args[1])
+        except ValueError:
+            if args[1] == 'True':
+                num_threads = multiprocessing.cpu_count()
+
+ print('Running build with ' + str(num_threads) + ' threads')
+
+ build_csv(args[0], num_threads)
+
+
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
\ No newline at end of file
diff --git a/metanetx_uniprot/chemical_utils.py b/metanetx_uniprot/chemical_utils.py
index 30567ed4..dc3fb310 100644
--- a/metanetx_uniprot/chemical_utils.py
+++ b/metanetx_uniprot/chemical_utils.py
@@ -29,6 +29,9 @@ def write_files(self, writer):
'''Write neo4j import files.'''
return writer.write_nodes(self.__nodes.values(), 'Chemical')
+ def write_rels(self, writer, rels):
+ return writer.write_rels(rels, 'Chemical', 'Process')
+
def add_chemical(self, properties):
'''Adds a chemical to the collection of nodes, ensuring uniqueness.'''
chem_id, chebi_ent = self.__get_chem_id(properties)
diff --git a/metanetx_uniprot/enzyme_utils.py b/metanetx_uniprot/enzyme_utils.py
index d6ea4969..95de560e 100644
--- a/metanetx_uniprot/enzyme_utils.py
+++ b/metanetx_uniprot/enzyme_utils.py
@@ -18,12 +18,18 @@ class EnzymeManager(object):
def __init__(self):
'''Constructor.'''
self.__nodes = {}
+ self.__node_enzymes = {}
self.__org_enz_rels = []
def get_nodes(self):
'''Gets enzyme nodes.'''
return self.__nodes.values()
+ def get_enz_nodes(self):
+ #nodes_enzymes_df = pd.DataFrame(self.__node_enzymes.items(), columns=['entity_uri', 'label'])
+ return self.__node_enzymes.values()
+
+
def get_org_enz_rels(self):
'''Gets organism-to-enzyme relationships.'''
return self.__org_enz_rels
@@ -49,10 +55,10 @@ def add_uniprot_data(self, enzyme_ids, source, num_threads=0):
if 'Organism (ID)' in uniprot_value else None
if 'Entry name' in uniprot_value:
- enzyme_node['entry'] = uniprot_value['Entry name']
+ enzyme_node['entry'] = 'Uniprot:'+uniprot_value['Entry name']
if 'Protein names' in uniprot_value:
- enzyme_node['names'] = uniprot_value['Protein names']
+ enzyme_node['names'] = 'Uniprot:'+uniprot_value['Protein names']
if enzyme_node['names']:
enzyme_node['name'] = enzyme_node['names'][0]
@@ -91,7 +97,7 @@ def add_uniprot_data_organism(self, organism_ids, source, num_threads=0):
enzyme_node['entry'] = entry['Entry']
if 'Protein names' in entry:
- enzyme_node['names'] = entry['Protein names']
+ enzyme_node['names'] = entry['Protein names'][0]
if 'names' in entry.keys():
enzyme_node['name'] = entry['names'][0]
@@ -100,7 +106,9 @@ def add_uniprot_data_organism(self, organism_ids, source, num_threads=0):
enzyme_node['ec-code'] = entry['EC number']
if organism_id:
- self.__org_enz_rels.append([organism_id, 'expresses',entry['Entry'], {'source': source}])
+ self.__org_enz_rels.append(['NCBITaxon:'+organism_id, 'expresses','Uniprot:'+entry['Entry'], {'source': source}])
+
+ self.__node_enzymes['Uniprot:'+entry['Entry']] = {'entity_uri':'Uniprot:'+entry['Entry'], 'label':enzyme_node['names']}
return uniprot_values
-
+
\ No newline at end of file
diff --git a/metanetx_uniprot/go_utils.py b/metanetx_uniprot/go_utils.py
new file mode 100644
index 00000000..8d8fab92
--- /dev/null
+++ b/metanetx_uniprot/go_utils.py
@@ -0,0 +1,10 @@
+from kgx.cli.cli_utils import transform
+import os
+
+go_plus_file = '/Users/brooksantangelo/Documents/HunterLab/Exploration/biochem4j/kg-microbe/metanetx_uniprot/Files/go-plus.owl'
+
+
+output_dir = '/Users/brooksantangelo/Documents/HunterLab/biochem4j/biochem4j/'
+name = 'go_plus_transformed'
+
+transform(inputs=[go_plus_file], input_format='xml', output= os.path.join(output_dir, name), output_format='tsv')
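go_utils.py is a one-shot conversion script with hardcoded local paths. A
parameterized sketch of the same KGX call, with placeholder arguments:

    import os
    from kgx.cli.cli_utils import transform

    def convert_go_plus(go_plus_owl, output_dir, name='go_plus_transformed'):
        '''Converts a GO-Plus OWL (XML) file into KGX TSV nodes/edges files.'''
        transform(inputs=[go_plus_owl], input_format='xml',
                  output=os.path.join(output_dir, name), output_format='tsv')
        return os.path.join(output_dir, name + '_nodes.tsv')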
diff --git a/metanetx_uniprot/mnxref_utils.py b/metanetx_uniprot/mnxref_utils.py
index aa231e22..7eb3fafb 100644
--- a/metanetx_uniprot/mnxref_utils.py
+++ b/metanetx_uniprot/mnxref_utils.py
@@ -44,32 +44,38 @@ def __init__(self, source=_METANETX_URL):
def get_chem_data(self):
'''Gets chemical data.'''
if not self.__chem_data:
- self.__read_chem_prop()
+ mxn_chebi_mapping = self.__read_chem_prop()
self.__read_xref('chem_xref.tsv', self.__chem_data, True)
- return self.__chem_data
+ return self.__chem_data,mxn_chebi_mapping
def get_reac_data(self,reaction_ids):
'''Gets reaction data.'''
if not self.__reac_data:
- mxn_reaction_ids = self.__read_reac_prop(reaction_ids)
+ mxn_reaction_ids,mxn_rhea_mapping = self.__read_reac_prop(reaction_ids)
self.__read_xref('reac_xref.tsv', self.__reac_data, False)
#Only include reaction data for reactions in reaction_ids
self.__reac_data = {key:val for key,val in self.__reac_data.items() if key in mxn_reaction_ids}
- return self.__reac_data
+ return self.__reac_data,mxn_rhea_mapping
def __read_chem_prop(self):
'''Read chemical properties and create Nodes.'''
chem_prop_keys = ['id', 'name', 'reference','formula', 'charge:float',
'mass:float', 'inchi', 'inchikey', 'smiles']
+ mxn_chebi_mapping = {}
+
for values in self.__read_data('chem_prop.tsv'):
if not values[0].startswith('#'):
values[0] = self.__parse_id(values[0])
values[2] = self.__parse_id(values[2])
props = dict(zip(chem_prop_keys, values))
+
+ #For mapping mxn IDs to Chebi Ids
+ mxn_chebi_mapping[values[0]] = values[2]
+
props.pop('reference')
_convert_to_float(props, 'charge:float')
_convert_to_float(props, 'mass:float')
@@ -77,6 +83,8 @@ def __read_chem_prop(self):
if value != ''}
self.__chem_data[values[0]] = props
+ return mxn_chebi_mapping
+
def __read_xref(self, filename, data, chemical):
'''Read xrefs and update Nodes.'''
xref_keys = ['XREF', 'MNX_ID', 'Description']
@@ -113,15 +121,18 @@ def __read_reac_prop(self,reaction_ids):
##Relabel reaction ids by MXN id rather than rhea id
mxn_reaction_ids = []
+ mxn_rhea_mapping = {}
+
for values in self.__read_data('reac_prop.tsv'):
if not values[0].startswith('#'):
if values[0] == 'EMPTY': continue
values[0] = self.__parse_id(values[0])
values[2] = self.__parse_id(values[2])
-
+ #Grab MXN id if in reaction IDs from filtering by organisms/enzymes
try:
if 'rhea' in values[2].split(':')[0].lower() and values[2].split(':')[1] in reaction_ids:
mxn_reaction_ids.append(values[0])
+ mxn_rhea_mapping[values[0]] = values[2].split(':')[1]
except IndexError: continue
props = dict(zip(reac_prop_keys, values))
@@ -143,7 +154,7 @@ def __read_reac_prop(self,reaction_ids):
print('WARNING: Suspected polymerisation reaction: ' + \
values[0] + '\t' + str(props))
- return mxn_reaction_ids
+ return mxn_reaction_ids,mxn_rhea_mapping
def __add_chem(self, chem_id):
'''Adds a chemical with given id.'''
@@ -192,11 +203,14 @@ def __parse_id(self, item_id):
class MnxRefLoader(object):
'''Loads MNXref data into neo4j format.'''
- def __init__(self, chem_man, reac_man, writer,reaction_ids):
+ def __init__(self, chem_man, reac_man, writer,reaction_ids,process_ids,ncbi_taxonomy_utils,array_delimiter):
self.__chem_man = chem_man
self.__reac_man = reac_man
self.__writer = writer
self.__reactions = reaction_ids
+ self.__processes = process_ids
+ self.__ncbi_tax = ncbi_taxonomy_utils
+ self.__array_delimiter = array_delimiter
def load(self):
'''Loads MnxRef data from chem_prop.tsv, chem_xref.tsv,
@@ -204,14 +218,77 @@ def load(self):
reader = MnxRefReader()
#First gets all chemical data from MxnRef (chem_xref and chem_prop) and adds to __chem_man
- for properties in reader.get_chem_data().values():
- properties['mnx'] = properties.pop('id')
+ c_vals,mxn_chebi_mapping = reader.get_chem_data()
+ for properties in c_vals.values():
+ #Includes chemical as chebi ID if you use reference
+ properties['mnx'] = properties.pop('id') #'reference')
self.__chem_man.add_chemical(properties)
- #Then gets reaction data from reac_xref and reac_prop and adds to __chem_man
- rels = self.__add_reac_nodes(reader.get_reac_data(self.__reactions))
+        #Then gets reaction data from reac_xref and reac_prop and adds it to __chem_man, only for reaction ids found to be linked to organisms
+ reac_data,mxn_rhea_mapping = reader.get_reac_data(self.__reactions)
+ chem_rels = self.__add_reac_nodes(reac_data)
+
+ #Convert rxn id's to Rhea (get mappings from reac_prop) and chemicals to CHEBI IDs
+ #rels is list of lists
+ #print('mxn_chebi_mapping: ',mxn_chebi_mapping)
+ mxn_chebi_mapping['MNXM1'] = 'chebi:24636'
+ mxn_chebi_mapping['WATER'] = 'chebi:15377'
+
+ chemical_ids = []
+
+ for i in enumerate(chem_rels):
+ #MXN ids to rhea ids
+ #reac_ids should have rhea to help identify
+ chem_rels[i[0]][0] = 'Rhea:'+mxn_rhea_mapping[i[1][0]]
+ try:
+ #MXN ids to chebi ids
+ chem_rels[i[0]][2] = mxn_chebi_mapping[i[1][2]]
+ except KeyError:
+ if 'WATER' in i[1][2]:
+ mxn_chebi_mapping[i[1][2]] = 'chebi:15377'
+ chem_rels[i[0]][2] = mxn_chebi_mapping[i[1][2]]
+ else:
+ print('could not map chemical to chebi ID: ',i[1][2])
+ chemical_ids.append(chem_rels[i[0]][2])
+
+
+ #Gets all chemicals from reac_data and adds go processes, and gets all go processes from rhea2go and adds chemicals
+
+ print('self.__processes in mxnref load: ',self.__processes)
+ print('length of self.__processes in mxnref load: ',len(self.__processes))
+ #go plus
+ go_plus_filename = os.getcwd()+'/Files/GO-PLUS.csv'
+ go_plus_rels,process_ids = self.__reac_man.read_go_plus(go_plus_filename,self.__processes,chemical_ids)
+
+ print('go_plus_rels: ',go_plus_rels[0:5])
+
+ #HPO
+ hpo_kgx_nodes_json = os.getcwd()+'/Files/hp_kgx_tsv_nodes.tsv'
+ hpo_kgx_edges_json = os.getcwd()+'/Files/hp_kgx_tsv_edges.tsv'
+ #kgx_nodes_json,kgx_edges_json = self.__ncbi_tax._parse_nodes_kgmicrobe(go_plus_filename, self.__array_delimiter, 'hpo_transformed')
+ nodes,rels = self.__reac_man.transform_kgx_output_format_hp(hpo_kgx_nodes_json,hpo_kgx_edges_json)
+        #Constrain phenotype-process rels to the processes filtered previously
+ hpo_rels = []
+ for i in rels:
+ if i[0] in process_ids or i[2] in process_ids:
+ hpo_rels.append(i)
+
+ n1 = self.__writer.write_nodes(nodes, 'Phenotype') #node_vals #- works
+ f1 = self.__writer.write_rels(hpo_rels, 'Process', 'Phenotype') #rel_vals
+
+ #PKL for GO-MONDO
+ #pkl_rels = self.__reac_man.get_process_disease_pkl_data(os.getcwd()+'/Files/PheKnowLator_v3.0.2_full_instance_relationsOnly_OWLNETS_Triples_Identifiers.txt',os.getcwd()+'/Files/PheKnowLator_v3.0.2_full_instance_relationsOnly_OWLNETS_NodeLabels.txt',self.__processes)
+
+ #KG-phenio for GO-MONDO
+ phenio_rels = self.__reac_man.get_process_disease_phenio_data(os.getcwd()+'/Files/phenio_merged-kg_edges.tsv',os.getcwd()+'/Files/phenio_merged-kg_nodes.tsv',process_ids)
+
+ f2 = self.__writer.write_rels(go_plus_rels, 'GoPlus_Chemical', 'Process') #- works
+
+ f3 = self.__writer.write_rels(chem_rels, 'Reaction', 'Chemical') #-works
+ print('phenio_rels: ',phenio_rels[0:5])
+ f4 = self.__writer.write_rels(phenio_rels, 'Phenio_Process', 'Disease')
- return [], [self.__writer.write_rels(rels, 'Reaction', 'Chemical')]
+ return [] #,[self.__writer.write_rels(chem_rels, 'Reaction', 'Chemical')], [self.__writer.write_rels(pkl_rels, 'Process', 'Disease')]
def __add_reac_nodes(self, reac_data):
'''Get reaction nodes from data.'''
@@ -250,6 +327,7 @@ def __add_reac_nodes(self, reac_data):
reac_id = self.__reac_man.add_reaction('mnx', mnx_id,
properties)
+ #reac_id_def looks like {'MNXR165961': [[None, 0, -1.0, 'MNXM1107698'], [None, 0, -1.0, 'WATER@MNXD1'], [None, 0, 1.0, 'MNXM1108087'], [None, 0, 1.0, 'MNXM728579']], 'MNXR171532': [['C18H33O2', -1, -2.0, 'MNXM1107708'], [None, 0, -2.0, 'MNXM1'], [None, 0, -1.0, 'MNXM734941'], ['C41H78NO8P', 0, 1.0, 'MNXM737425'], [None, 0, 2.0, 'WATER@MNXD1']]}
reac_id_def[reac_id] = balanced_def
chem_id_mass = self.__chem_man.get_props('monoisotopic_mass:float',
@@ -277,6 +355,8 @@ def __add_reac_nodes(self, reac_data):
reac_cofactors.extend(pair)
for term in defn:
+ cof_chebi_id = term[3]
+ react_chebi_id = term[2]
rels.append([reac_id,
'has_cofactor' if term[3] in reac_cofactors
else 'has_reactant',
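Within MnxRefLoader.load() above, the reaction-chemical rels are rewritten
from internal MNX identifiers to external ones: reaction IDs via
mxn_rhea_mapping (built in __read_reac_prop) and chemical IDs via
mxn_chebi_mapping (built in __read_chem_prop). A condensed sketch of that
relabelling, assuming each rel has the shape [reaction_id, predicate,
chemical_id, props]:

    def relabel_rels(chem_rels, mxn_rhea_mapping, mxn_chebi_mapping):
        '''Sketch: rewrites MNX reaction/chemical IDs to Rhea/ChEBI IDs in place.'''
        for rel in chem_rels:
            rel[0] = 'Rhea:' + mxn_rhea_mapping[rel[0]]
            # Unmapped chemicals are left as MNX IDs here; the loader above
            # special-cases water and logs everything else it cannot map.
            rel[2] = mxn_chebi_mapping.get(rel[2], rel[2])
        return chem_rels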
diff --git a/metanetx_uniprot/reaction_utils.py b/metanetx_uniprot/reaction_utils.py
index f13c429c..f6a5f390 100644
--- a/metanetx_uniprot/reaction_utils.py
+++ b/metanetx_uniprot/reaction_utils.py
@@ -10,6 +10,9 @@
from enzyme_utils import EnzymeManager
from numpy import *
+import pandas as pd
+from tqdm import tqdm
+import csv
class ReactionManager(object):
@@ -21,6 +24,7 @@ def __init__(self):
self.__reac_ids = {}
self.__reac_enz_rels = []
self.__enz_reac_rels = []
+ self.__go_reac_rels = []
self.__org_enz_rels = []
self.__enz_man = EnzymeManager()
@@ -29,13 +33,18 @@ def write_files(self, writer):
return ([writer.write_nodes(self.__nodes.values(),
'Reaction'),
writer.write_nodes(self.__enz_man.get_nodes(),
- 'Enzyme')],
+ 'Enzyme'),
+ writer.write_nodes(self.__enz_man.get_enz_nodes(),
+ 'Enzyme_nodes')],
[writer.write_rels(self.__reac_enz_rels,
'Reaction', 'Enzyme'),
#Gets reactions connected to all enzymes
writer.write_rels(self.__enz_reac_rels,
'Reaction', 'Enzyme'),
- writer.write_rels(self.__enz_man.get_org_enz_rels(),
+ #Gets reactions connected to all go processes
+ writer.write_rels(self.__go_reac_rels,
+ 'Reaction', 'Process'),
+ writer.write_rels(self.__enz_man.get_org_enz_rels(),
'Organism', 'Enzyme')])
def add_reaction(self, source, reac_id, properties):
@@ -62,6 +71,9 @@ def add_reaction(self, source, reac_id, properties):
else:
self.__nodes[reac_id].update(properties)
+ print('from add_reaction in reaction_utils.py')
+ print(self.__nodes.values())
+
return reac_id
def add_react_to_enz(self, data, source, num_threads=0):
@@ -72,12 +84,13 @@ def add_react_to_enz(self, data, source, num_threads=0):
# Create Enzyme nodes:
self.__enz_man.add_uniprot_data(enzyme_ids, source, num_threads)
- def add_react_to_enz_organism(self, data, source, num_threads=0):
+ #data here is rhea-enzyme file, go_data is rhea-go file
+ def add_react_to_enz_organism(self, data, source, go_data, num_threads=0):
#Create Reaction relationships
- reaction_ids = self.__create_enz_react(data, source)
+ reaction_ids,process_ids = self.__create_enz_react(data, go_data, source)
- return reaction_ids
+ return reaction_ids,process_ids
def __create_react_enz(self, data, source):
'''Creates Reaction and Enzyme nodes and their Relationships.'''
@@ -94,26 +107,48 @@ def __create_react_enz(self, data, source):
return list(set(enzyme_ids))
- def __create_enz_react(self, data, source):
+ def __create_enz_react(self, data, go_data, source):
'''Creates Reaction and Enzyme nodes and their Relationships.'''
print('adding reaction to enzyme relationships')
reaction_ids = []
+ process_ids = []
enzyme_ids = self.__enz_man.get_nodes()
for enz_id in enzyme_ids:
+ #Gets relationships between reactions and enzymes from Rhea only if they exist in the enzymes pulled from organism filtering step
reac_ids = [key for key, value in data.items() if enz_id['entry'] in value]
+
reaction_ids = reaction_ids+reac_ids
for j in reac_ids:
- self.__enz_reac_rels.append([j, 'catalysed_by',
- enz_id['entry'],
+ #reac_ids should have rhea to help identify and protein should have UniProt
+ self.__enz_reac_rels.append(['Rhea:'+j, 'catalysed_by',
+ 'Uniprot:'+enz_id['entry'],
+ {'source': source}])
+
+ print('adding reaction to process relationships')
+ #Gets relationships between reactions and Go processes from Rhea only if they exist in above reaction ids
+ go_reac_ids = [key for key, value in go_data.items() if key in reaction_ids]
+ reaction_ids = reaction_ids+go_reac_ids
+
+ for j in go_reac_ids:
+ rxns = go_data[j]
+ for k in rxns:
+ process_ids.append(k)
+ #reac_ids should have rhea to help identify
+ self.__go_reac_rels.append(['Rhea:'+j, 'affects',
+ k,
{'source': source}])
- return list(set(reaction_ids))
+
+ return list(set(reaction_ids)),list(set(process_ids))
def add_org_to_enz(self, nodes, source, num_threads=0):
'''Submit data to the graph.'''
# Create Organism nodes:
organism_ids = self.__create_organism_ids(nodes, source)
+ print('number of orgs for just reference proteomes')
+ print(len(organism_ids))
+
## For testing
#organism_ids = organism_ids[0:10]
@@ -126,3 +161,152 @@ def __create_organism_ids(self, data, source):
return ids
+ def read_go_plus(self,go_plus_file,process_ids,chemical_ids):
+ '''Read chemical properties and create Nodes.'''
+ go_keys = ['Class ID', 'Preferred Label', 'Synonyms','Definitions','Obsolete','CUI','Semantic Types','Parents']
+
+ rels = []
+
+ d = pd.read_csv(go_plus_file, delimiter=',',keep_default_na=False)
+ go_data = d[go_keys]
+ go_data = go_data.replace(regex=['http://purl.obolibrary.org/obo/'],value='').replace(regex=['_'],value=':')
+
+ #Create go-plus nodes
+ #add to nodes: http://www.w3.org/2000/01/rdf-schema#label
+
+ d = d.drop(go_keys,axis=1) #+['Parents'], axis=1)
+ #Update values
+ #Ensure subject is not deprecated
+ d = d[d['http://www.w3.org/2002/07/owl#deprecated'] != 'TRUE']
+ d = d.replace(regex=['http://purl.obolibrary.org/obo/'],value='').replace(regex=['_'],value=':')
+ d = d.replace(regex=['go#'],value='')
+
+ #Update columns
+ #Columns to ignore
+ cols_to_drop = ['http://data.bioontology.org/metadata/prefixIRI','http://data.bioontology.org/metadata/treeView','go#','http://purl.obolibrary.org/obo/IAO_','http://www.w3.org/2000/01/rdf-schema#','http://www.w3.org/2004/02/skos/core#','http://www.w3.org/2002/07/owl#deprecated','http://www.w3.org/2000/01/rdf-schema#label','http://purl.org/dc/terms/','obsolete ','has_narrow_synonym','has_obo_format_version','has_obo_namespace','has_related_synonym','has_scope','has_synonym_type','definition','http://www.geneontology.org/formats/oboInOwl#id','has_alternative_id','http://purl.obolibrary.org/obo/go#creation_date','http://www.geneontology.org/formats/oboInOwl#creation_date','synonym_type_property','Systematic synonym','temporally related to','term replaced by','term tracker item','title','http://www.geneontology.org/formats/oboInOwl#created_by','has_exact_synonym']
+ cols_to_drop = d.columns[d.columns.str.contains('|'.join(cols_to_drop))]
+ d = d.drop(cols_to_drop, axis=1)
+    #Two metadata namespaces define 'contains' and 'develops_from'; prefix the BioPortal versions to disambiguate
+ d.columns = d.columns.str.replace('http://data.bioontology.org/metadata/obo/contains','biontology_contains', regex=False)
+ d.columns = d.columns.str.replace('http://data.bioontology.org/metadata/obo/develops_from','biontology_develops_from', regex=False)
+ d.columns = d.columns.str.replace('http://data.bioontology.org/metadata/obo/','', regex=False)
+ d.columns = d.columns.str.replace('http://purl.obolibrary.org/obo/', '', regex=False)
+ d.columns = d.columns.str.replace('http://www.geneontology.org/formats/oboInOwl#', '', regex=False)
+
+ for i in tqdm(range(len(d))):
+ s_id = go_data.iloc[i].loc['Class ID']
+ for p_label in d.columns:
+ if d.iloc[i].loc[p_label] != '':
+ if (s_id in chemical_ids or p_label in process_ids) or (s_id in process_ids or p_label in chemical_ids):
+ all_objects = d.iloc[i].loc[p_label].split('|')
+ for j in all_objects:
+ rels.append([s_id, p_label,
+ j,
+ {'source': 'go-plus'}])
+
+ go_process_ids = []
+ for i, v in enumerate(rels):
+ for x in v:
+ if "GO:" in x:
+ go_process_ids.append(x)
+
+ go_process_ids = list(set(go_process_ids))
+
+ print('len process_ids before adding go plus terms: ',len(process_ids))
+ process_ids = process_ids+go_process_ids
+ process_ids = list(set(process_ids))
+ print('len process_ids after adding go plus terms: ',len(process_ids))
+
+ return rels,process_ids
+
+ def transform_kgx_output_format_hp(self,transformed_nodes_tsv,transformed_edges_tsv):
+
+ labels = pd.read_csv(transformed_nodes_tsv, sep = '\t', usecols = ['id','name'])
+ triples_df = pd.read_csv(transformed_edges_tsv,sep = '\t', usecols = ['subject', 'object', 'predicate'])
+ triples_df.columns.str.lower()
+
+ nodes = {}
+ rels = []
+
+
+ #Constrain rels and nodes to only GO process: HP relationships
+ #Constrain rels and nodes to only GO processes that are used in prior rels
+ for i in range(len(triples_df)):
+ s = triples_df.iloc[i].loc['subject']
+ p = triples_df.iloc[i].loc['predicate']
+ o = triples_df.iloc[i].loc['object']
+ if ('GO:' in s and 'HP:' in o) or ('GO:' in o and 'HP:' in s):
+ rels.append([s, p, o])
+
+
+ for i in range(len(labels)):
+        #Keep only node IDs that actually appear in the collected rels
+        if any(labels.iloc[i].loc['id'] in sublist for sublist in rels):
+ nodes[labels.iloc[i].loc['id']] = {'class:ID': labels.iloc[i].loc['id'],
+ ':LABEL':
+ labels.iloc[i].loc['id'].split(':')[0]}
+
+ return nodes,rels
+
+ def process_pkl_files(self,triples_file,labels_file):
+
+ triples_df = pd.read_csv(triples_file,sep = ' ', quoting=csv.QUOTE_NONE)
+ triples_df.columns.str.lower()
+
+ triples_df.replace({'<': ''}, regex=True, inplace=True)
+ triples_df.replace({'>': ''}, regex=True, inplace=True)
+
+ labels = pd.read_csv(labels_file, sep = ' ', quoting=csv.QUOTE_NONE)
+ labels.columns.str.lower()
+
+ #Remove brackets from URI
+ labels['entity_uri'] = labels['entity_uri'].str.replace("<","")
+ labels['entity_uri'] = labels['entity_uri'].str.replace(">","")
+
+
+ return triples_df,labels
+
+ def get_process_disease_pkl_data(self,triples_file,labels_file,process_ids):
+
+ print('Extracting PKL relationships')
+ triples_df, labels_dict = self.process_pkl_files(triples_file,labels_file)
+
+ rels = []
+
+ for i in tqdm(range(len(triples_df))):
+ if triples_df.iloc[i].loc['object'] in process_ids and 'MONDO_' in triples_df.iloc[i].loc['subject']:
+ rels.append([triples_df.iloc[i].loc['subject'].replace('http://purl.obolibrary.org/obo/','').replace('_',':'), labels_dict.loc[labels_dict['entity_uri'] == triples_df.iloc[i].loc['predicate'],'label'].values[0],
+ triples_df.iloc[i].loc['object'].replace('http://purl.obolibrary.org/obo/','').replace('_',':'),
+ {'source': 'pheknowlator'}])
+
+ return rels
+
+
+ def process_kg_phenio_files(self,triples_file,labels_file):
+
+ triples_df = pd.read_csv(triples_file,sep = '\t', usecols = ['subject', 'object', 'predicate'])
+ triples_df.columns.str.lower()
+
+ labels = pd.read_csv(labels_file, sep = '\t', usecols = ['id','category', 'name','description'])
+ labels.columns = ['entity_uri','category', 'label','description/definition']
+
+ triples_df_relevant = triples_df.loc[((triples_df['subject'].str.contains('MONDO:')) & (triples_df['object'].str.contains('GO:'))) | ((triples_df['object'].str.contains('MONDO:')) & (triples_df['subject'].str.contains('GO:')))]
+
+ #1785727 total, 435 total MONDO/GO or GO/MONDO relationships
+ print(len(triples_df),len(triples_df_relevant))
+
+ return triples_df_relevant,labels
+
+ def get_process_disease_phenio_data(self,triples_file,labels_file,process_ids):
+
+ print('Extracting kg-phenio relationships')
+ triples_df, labels_dict = self.process_kg_phenio_files(triples_file,labels_file)
+
+ rels = []
+
+ for i in tqdm(range(len(triples_df))):
+ if triples_df.iloc[i].loc['object'] in process_ids and 'MONDO:' in triples_df.iloc[i].loc['subject']:
+ rels.append([triples_df.iloc[i].loc['subject'], triples_df.iloc[i].loc['predicate'],
+ triples_df.iloc[i].loc['object'],
+ {'source': 'kg-phenio'}])
+
+ return rels
\ No newline at end of file
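transform_kgx_output_format_hp above filters KGX edges row by row; the same
GO<->HP constraint can be expressed as a single vectorized pandas mask, as in
this sketch using the same column names:

    import pandas as pd

    def go_hp_edges(edges_tsv):
        '''Sketch: returns [subject, predicate, object] triples linking GO and HP terms.'''
        df = pd.read_csv(edges_tsv, sep='\t', usecols=['subject', 'predicate', 'object'])
        go_s = df['subject'].str.contains('GO:', na=False)
        hp_s = df['subject'].str.contains('HP:', na=False)
        go_o = df['object'].str.contains('GO:', na=False)
        hp_o = df['object'].str.contains('HP:', na=False)
        return df[(go_s & hp_o) | (go_o & hp_s)][['subject', 'predicate', 'object']].values.tolist()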
diff --git a/metanetx_uniprot/rhea_utils.py b/metanetx_uniprot/rhea_utils.py
index 5c612d90..a9a9ff6a 100644
--- a/metanetx_uniprot/rhea_utils.py
+++ b/metanetx_uniprot/rhea_utils.py
@@ -17,21 +17,31 @@
#For test, also update load function
#__RHEA_URL = os.getcwd()+'/TestingFiles/rhea2uniprot_sprot.txt'
-def load(reaction_manager, source=__RHEA_URL, num_threads=0):
+__RHEA_GO_URL = 'ftp://ftp.expasy.org/databases/rhea/tsv/rhea2go.tsv'
+#__RHEA_GO_URL = os.getcwd()+'/TestingFiles/rhea2go_NOTREAL.txt'
+
+def load(reaction_manager, source=__RHEA_URL, go_source = __RHEA_GO_URL, num_threads=0):
'''Loads Rhea data.'''
# Parse data:
temp_file = tempfile.NamedTemporaryFile()
urlretrieve(source, temp_file.name)
data = _parse(temp_file.name)
+
+
+ temp_file = tempfile.NamedTemporaryFile()
+ urlretrieve(go_source, temp_file.name)
+ go_data = _parse(temp_file.name)
+
##If using test data
#data = _parse(source)
+ #go_data = _parse(go_source)
######Not sure why source is Rhea here, calls to UniProt
#Remove, since this goes from rhea2uniprot to uniprot enzymes. use add_org_to_enz function in ncbi_taxonomy_utils instead
#reaction_manager.add_react_to_enz(data, 'rhea', num_threads)
- reaction_ids = reaction_manager.add_react_to_enz_organism(data, 'rhea', num_threads)
+ reaction_ids,process_ids = reaction_manager.add_react_to_enz_organism(data, 'rhea', go_data, num_threads)
- return reaction_ids
+ return reaction_ids,process_ids
def _parse(filename):
diff --git a/metanetx_uniprot/seq_utils.py b/metanetx_uniprot/seq_utils.py
index 43682c9a..75a80bee 100644
--- a/metanetx_uniprot/seq_utils.py
+++ b/metanetx_uniprot/seq_utils.py
@@ -138,10 +138,59 @@ def _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values, ver
batch = organism_ids[i:min(i + batch_size, len(organism_ids))]
query = '%20OR%20'.join(['organism_id:' + organism_id for organism_id in batch])
url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \
- '&format=tsv&size=500&fields=organism_id%2C' + '%2C'.join([parse.quote(field)
+ '&format=tsv&size=500&keywords=Reference+proteome&fields=organism_id%2C' + '%2C'.join([parse.quote(field)
# '&format=tsv&size=1&fields=organism_id%2C' + '%2C'.join([parse.quote(field)
for field in fields])
-
_parse_uniprot_data(url, values)
return values
+
+def parse_response(res, values):
+    '''Parses one TSV response page, appending a dict per data row to values.'''
+    headers = None
+
+    for line in res.iter_lines():
+        line = line.decode('utf-8')
+        tokens = line.strip().split('\t')
+
+        if headers is None:
+            # The first line of each page is the TSV header.
+            headers = tokens
+        else:
+            # Build the row dict directly; reusing 'res' here would shadow
+            # the response object.
+            values.append(dict(zip(headers, tokens)))
+
+    return values
+
+
+def get_jobs(url, values):
+    '''Follows UniProt cursor pagination, accumulating every page into values.'''
+    session = requests.Session()
+
+    page = session.get(url)
+    parse_response(page, values)
+
+    # UniProt exposes the next cursor as a Link header with rel="next",
+    # surfaced by requests as page.links['next']['url'].
+    while 'next' in page.links:
+        page = session.get(page.links['next']['url'])
+        parse_response(page, values)
+
+def _get_uniprot_batch_reference_proteome(url):
+    '''Returns every row of a paginated UniProt query as a list of dicts.'''
+    values = []
+    get_jobs(url, values)
+    return values
\ No newline at end of file
From a4449ed9ebb4a63f4d0997b6aabb4743547ee367 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Thu, 21 Sep 2023 21:13:19 -0600
Subject: [PATCH 27/29] Add files via upload
Adding code to create labels (combined_kgx_merged-kg_nodes.csv) and edges (combined_kg.csv) files for full graph.
---
metanetx_uniprot/combine_rels.py | 64 +++++++++++
metanetx_uniprot/create_labels_file.py | 151 +++++++++++++++++++++++++
2 files changed, 215 insertions(+)
create mode 100644 metanetx_uniprot/combine_rels.py
create mode 100644 metanetx_uniprot/create_labels_file.py
diff --git a/metanetx_uniprot/combine_rels.py b/metanetx_uniprot/combine_rels.py
new file mode 100644
index 00000000..0446c8c1
--- /dev/null
+++ b/metanetx_uniprot/combine_rels.py
@@ -0,0 +1,64 @@
+import os
+import pandas as pd
+import argparse
+
+
+def parse_kg_file(kg_filename):
+
+ kg = pd.read_csv(kg_filename,delimiter=';')
+
+    if len(kg.columns) == 3:
+        kg.columns = ['subject', 'predicate', 'object']
+    if len(kg.columns) == 4:
+        kg.columns = ['subject', 'predicate', 'object', 'source']
+ kg = kg[['subject','predicate','object']]
+
+ return kg
+
+def concat_kgs(kg1,kg2):
+
+ combined_kg = pd.concat([kg1, kg2], axis=0)
+ combined_kg = combined_kg.drop_duplicates().reset_index(drop=True)
+
+ return combined_kg
+
+#Define arguments for each required and optional input
+def defineArguments():
+ parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ parser.add_argument("--directory",dest="Directory",required=True,help="Directory")
+
+ return parser
+
+def main():
+
+ #rels_files_dir = '/Users/brooksantangelo/Documents/HunterLab/Exploration/biochem4j/kg-microbe/metanetx_uniprot/refProteome/LocalRun_0915
+
+ #Generate argument parser and define arguments
+ parser = defineArguments()
+ args = parser.parse_args()
+
+ directory = args.Directory
+
+ rels_files_dir = directory+'/rels/'
+ rels_files = os.listdir(rels_files_dir)
+
+ rels_files = [i for i in rels_files if 'combined_kg' not in i]
+
+ kg_0 = parse_kg_file(rels_files_dir+rels_files[0])
+
+ for fname in rels_files[1:]:
+
+ if fname.endswith('.csv'):
+
+ kg = parse_kg_file(rels_files_dir+fname)
+ kg_0 = concat_kgs(kg_0,kg)
+
+ kg_0.to_csv(rels_files_dir + 'combined_kg.csv', sep = "\t", index = False)
+
+
+if __name__ == '__main__':
+ main()
+
+
+
+
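combine_rels.py stitches every per-relationship CSV found under
<directory>/rels/ into one deduplicated combined_kg.csv. A hypothetical
invocation and equivalent programmatic use (the directory and file names
below are placeholders):

    #   python combine_rels.py --directory ~/biochem4j

    from combine_rels import parse_kg_file, concat_kgs

    kg = parse_kg_file('~/biochem4j/rels/Reaction_Chemical.csv')  # placeholder file name
    kg = concat_kgs(kg, parse_kg_file('~/biochem4j/rels/Organism_Enzyme.csv'))
    kg.to_csv('~/biochem4j/rels/combined_kg.csv', sep='\t', index=False)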
diff --git a/metanetx_uniprot/create_labels_file.py b/metanetx_uniprot/create_labels_file.py
new file mode 100644
index 00000000..4d25eab4
--- /dev/null
+++ b/metanetx_uniprot/create_labels_file.py
@@ -0,0 +1,151 @@
+
+
+
+from tqdm import tqdm
+import pandas as pd
+import argparse
+from collections import defaultdict
+
+
+
+def process_kg_covid19_files(triples_file,labels_file):
+ triples_df = pd.read_csv(triples_file,sep = '\t', usecols = ['subject', 'object', 'predicate'])
+ triples_df.columns.str.lower()
+
+ labels = pd.read_csv(labels_file, sep = '\t', usecols = ['id','category', 'name','description'])
+
+ triples_df_relevant = triples_df.loc[((triples_df['subject'].str.contains('MONDO:')) & (triples_df['object'].str.contains('GO:'))) | ((triples_df['object'].str.contains('MONDO:')) & (triples_df['subject'].str.contains('GO:')))]
+
+ labels_relevant = labels.loc[(labels['id'].str.contains('MONDO:')) | (labels['id'].str.contains('GO:')) | (labels['id'].str.contains('CHEBI:')) | (labels['id'].str.contains('NCBITaxon:'))]
+
+ #1785727 total, 435 total MONDO/GO or GO/MONDO relationships
+ print(len(labels_relevant),len(labels))
+
+ return triples_df_relevant,labels_relevant
+
+def get_process_disease_phenio_data(triples_file,labels_file,process_ids):
+
+    print('Extracting kg-phenio relationships')
+    triples_df, labels_df = process_kg_covid19_files(triples_file,labels_file)
+
+    rels = []
+
+    #Collect disease -> biological-process edges for the given GO process ids
+    for i in tqdm(range(len(triples_df))):
+        if triples_df.iloc[i].loc['object'] in process_ids and 'MONDO:' in triples_df.iloc[i].loc['subject']:
+            rels.append([triples_df.iloc[i].loc['subject'], triples_df.iloc[i].loc['predicate'],
+                         triples_df.iloc[i].loc['object'],
+                         {'source': 'kg-phenio'}])
+
+    return rels
+
+#Define arguments for each required and optional input
+def defineArguments():
+ parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument("--directory",dest="Directory",required=True,help="Run directory containing the nodes/ and rels/ subdirectories")
+
+ return parser
+
+def main():
+
+    #Generate argument parser and define arguments
+    parser = defineArguments()
+    args = parser.parse_args()
+
+    directory = args.Directory
+
+    #Local copies of the kg-phenio nodes/edges files; update these paths for your machine
+    phenio_labels_file = '/Users/brooksantangelo/Documents/HunterLab/Exploration/kg-phenio/phenio_merged-kg_nodes.tsv'
+    phenio_triples_file = '/Users/brooksantangelo/Documents/HunterLab/Exploration/kg-phenio/phenio_merged-kg_edges.tsv'
+
+    #Local copies of the kg-covid19 nodes/edges files (updated 6/19 based on file location); update these too
+    kg_covid19_triples_file = '/Users/brooksantangelo/Documents/HunterLab/Cartoomics/PostRevisionUpdates/Inputs/kg-covid19/merged-kg_edges.tsv'
+    kg_covid19_labels_file = '/Users/brooksantangelo/Documents/HunterLab/Cartoomics/PostRevisionUpdates/Inputs/kg-covid19/merged-kg_nodes.tsv'
+
+ enzyme_file = directory + '/nodes' + '/Enzyme.csv'
+
+ kg_filename = directory + '/rels' + '/combined_kg.csv'
+
+ kg = pd.read_csv(kg_filename,delimiter='\t')
+ kg = kg[['subject','object']]
+ kg_vals = pd.unique(kg[['subject', 'object']].values.ravel()).tolist()
+ kg_vals = [str(x) for x in kg_vals]
+
+ phenio_triples,phenio_labels = process_kg_covid19_files(phenio_triples_file,phenio_labels_file)
+ covid19_triples,covid19_labels = process_kg_covid19_files(kg_covid19_triples_file,kg_covid19_labels_file)
+
+ enzyme_df = pd.read_csv(enzyme_file,delimiter=';')
+ enz_list = []
+
+ #Get uri (ex: O88037) and labels (ex: Probable SapB synthase) for all enzymes
+ print('extracting enzyme labels')
+ for i in range(len(enzyme_df)):
+ enz_list.append({'id': 'Uniprot:'+enzyme_df.iloc[i].loc['uniprot:ID(Enzyme)'] ,
+ 'category': 'biolink:Protein' ,
+ 'name': enzyme_df.iloc[i].loc['names'],
+ 'description': ''})
+
+ enzyme_new_df = pd.DataFrame(enz_list)
+
+ kg_list = []
+ #Convert all uris that exist in phenio or kg-covid19 to labels
+    for i in tqdm(kg_vals):
+        #Determine category of node (assumes GO terms are biological processes);
+        #the elif/else chain prevents a stale category carrying over between iterations
+        if 'NCBITaxon:' in i: cat = 'biolink:OrganismalEntity'
+        elif 'MONDO:' in i: cat = 'biolink:Disease'
+        elif 'CHEBI:' in i: cat = 'biolink:ChemicalSubstance'
+        elif 'GO:' in i: cat = 'biolink:BiologicalProcess'
+        else: cat = None
+ try:
+ kg_list.append({'id': i ,
+ 'category': cat ,
+ 'name': phenio_labels.loc[phenio_labels['id'] == i,'name'].values[0],
+ 'description': ''})
+ except (KeyError,IndexError):
+ #print('val doesnt exist in phenio: ',i)
+ pass
+ try:
+ kg_list.append({'id': i ,
+ 'category': cat ,
+ 'name': covid19_labels.loc[covid19_labels['id'] == i,'name'].values[0],
+ 'description': ''})
+ except (KeyError,IndexError):
+ #print('val doesnt exist in kg-covid19: ',i)
+ pass
+
+ kg_new_df = pd.DataFrame(kg_list)
+
+ #Combine enzymes df with other labels from phenio and kg-covid19
+ combined_nodes = pd.concat([kg_new_df, enzyme_new_df], axis=0)
+
+ #Add Rhea labels:
+ rhea_vals = [i for i in kg_vals if 'rhea' in i.lower()]
+ rhea_list = []
+ #Dictionary to output Rhea nodes in current kg form, not kgx
+ rhea_labels = {}
+ for i in rhea_vals:
+ rhea_list.append({'id': i ,
+ 'category': 'biolink:Reaction' ,
+ 'name': i,
+ 'description': ''})
+ rhea_labels[i] = {'id':i, 'label':i}
+
+ #Output Rhea_nodes file
+ rhea_kg_df = pd.DataFrame(rhea_labels.values())
+ rhea_kg_df.to_csv(directory + '/nodes' + '/Rhea_nodes.csv', index=False, encoding='utf-8', sep=';')
+
+ rhea_new_df = pd.DataFrame(rhea_list)
+
+ #Combine all df label types and output
+ combined_nodes = pd.concat([combined_nodes, rhea_new_df], axis=0)
+ combined_nodes.to_csv(directory + '/combined_kgx_merged-kg_nodes.csv',sep='\t',index=False)
+
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
From 414fca0751211afca55bda4a82d72155b181417c Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Thu, 21 Sep 2023 21:18:43 -0600
Subject: [PATCH 28/29] Update README.md
---
metanetx_uniprot/README.md | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/metanetx_uniprot/README.md b/metanetx_uniprot/README.md
index a9e9503c..5b76a0d0 100644
--- a/metanetx_uniprot/README.md
+++ b/metanetx_uniprot/README.md
@@ -3,7 +3,7 @@
Code is reused from Biochem4j: https://github.com/neilswainston/biochem4j/tree/master/sbcdb
Access chemical, reaction, enzyme, and organism information from the following sources:
-- libchebipy
+- libchebipy (note: the _parsers.py file in this repo must replace the installed libchebipy copy at ~/libchebipy/_parsers.py)
- NCBITaxonomy
- MetaNetX
- Rhea
@@ -20,3 +20,12 @@ To run and only get reference proteome taxa that also exist in kg-microbe:
python build_taxa_ids.py ~/biochem4j 1
```
*Note, uses ncbitaxon.json (built from kg-microbe) which is expected to be in the Files directory.
+
+To build the entire graph by combining all separate triples files and creating a kgx-format nodes file (both scripts take the run directory and look in its rels/ and nodes/ subdirectories themselves):
+```
+python combine_rels.py --directory ~/biochem4j
+python create_labels_file.py --directory ~/biochem4j
+```
+This will output the following files:
+- ~/biochem4j/rels/combined_kg.csv
+- ~/biochem4j/combined_kgx_merged-kg_nodes.csv
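+
+To sanity-check the combined edges file (a minimal sketch; note that combined_kg.csv is tab-delimited despite its extension):
+```
+python -c "import pandas as pd; kg = pd.read_csv('~/biochem4j/rels/combined_kg.csv', sep='\t'); print(kg.columns.tolist(), len(kg))"
+```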
From 350e05a66be3261e9214e19a668c570a246571e2 Mon Sep 17 00:00:00 2001
From: Brook Santangelo <70932395+bsantan@users.noreply.github.com>
Date: Thu, 21 Sep 2023 21:19:28 -0600
Subject: [PATCH 29/29] Add files via upload
Updated to use wget instead of urlretrieve due to FTP issues.
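
In get_file(), the change is:

    # before: urlretrieve(urlparse.urljoin(url, filename), filepath)
    # after:  wget.download(url + filename, out=filepath)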
---
metanetx_uniprot/_parsers.py | 686 +++++++++++++++++++++++++++++++++++
1 file changed, 686 insertions(+)
create mode 100644 metanetx_uniprot/_parsers.py
diff --git a/metanetx_uniprot/_parsers.py b/metanetx_uniprot/_parsers.py
new file mode 100644
index 00000000..b646bf16
--- /dev/null
+++ b/metanetx_uniprot/_parsers.py
@@ -0,0 +1,686 @@
+'''
+libChEBIpy (c) University of Manchester 2015
+
+libChEBIpy is licensed under the MIT License.
+
+To view a copy of this license, visit .
+
+@author: neilswainston
+'''
+import calendar
+import datetime
+import gzip
+import io
+import os.path
+import re
+import tempfile
+import zipfile
+
+import wget
+
+from ._comment import Comment
+from ._compound_origin import CompoundOrigin
+from ._database_accession import DatabaseAccession
+from ._formula import Formula
+from ._name import Name
+from ._reference import Reference
+from ._relation import Relation
+from ._structure import Structure
+
+
+__ALL_IDS = {}
+__ALL_NAMES = {}
+__COMMENTS = {}
+__COMPOUND_ORIGINS = {}
+__CHARGES = {}
+__CREATED_BYS = {}
+__DATABASE_ACCESSIONS = {}
+__DEFAULT_STRUCTURE_IDS = []
+__DEFINITIONS = {}
+__FORMULAE = {}
+__INCHIS = {}
+__INCHI_KEYS = {}
+__INCOMINGS = {}
+__MASSES = {}
+__MODIFIED_ONS = {}
+__NAMES = {}
+__OUTGOINGS = {}
+__PARENT_IDS = {}
+__SMILES = {}
+__SOURCES = {}
+__STARS = {}
+__STATUSES = {}
+
+__DOWNLOAD_PARAMS = {'path': os.path.join(os.path.expanduser('~'), 'libChEBI'),
+ 'auto_update': True}
+
+
+def set_download_cache_path(path):
+ '''Sets download cache path.'''
+ __DOWNLOAD_PARAMS['path'] = path
+
+
+def set_auto_update(auto_update):
+ '''Sets auto update flag.'''
+ __DOWNLOAD_PARAMS['auto_update'] = auto_update
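+
+# Usage sketch: both setters simply update __DOWNLOAD_PARAMS and should be
+# called before any get_* accessor triggers a download:
+#   set_download_cache_path('/tmp/libChEBI')
+#   set_auto_update(False)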
+
+
+def get_formulae(chebi_id):
+ '''Returns formulae'''
+ if not __FORMULAE:
+ __parse_chemical_data()
+
+ return __FORMULAE[chebi_id] if chebi_id in __FORMULAE else []
+
+
+def get_all_formulae(chebi_ids):
+ '''Returns all formulae'''
+ all_formulae = [get_formulae(chebi_id) for chebi_id in chebi_ids]
+ return [x for sublist in all_formulae for x in sublist]
+
+
+def get_mass(chebi_id):
+ '''Returns mass'''
+ if not __MASSES:
+ __parse_chemical_data()
+
+ return __MASSES[chebi_id] if chebi_id in __MASSES else float('NaN')
+
+
+def get_charge(chebi_id):
+ '''Returns charge'''
+ if not __CHARGES:
+ __parse_chemical_data()
+
+ return __CHARGES[chebi_id] if chebi_id in __CHARGES else float('NaN')
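+
+# Note for callers: get_mass and get_charge return float('NaN') for unknown
+# ids, so check results with math.isnan() rather than an equality comparison.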
+
+
+def __parse_chemical_data():
+ '''Gets and parses file'''
+ filename = get_file('chemical_data.tsv')
+
+ with io.open(filename, 'r', encoding='cp1252') as textfile:
+ next(textfile)
+
+ for line in textfile:
+ tokens = line.strip().split('\t')
+
+ if tokens[3] == 'FORMULA':
+ # Many seemingly contradictory formulae exist,
+ # depending upon the source database
+ chebi_id = int(tokens[1])
+
+ if chebi_id not in __FORMULAE:
+ __FORMULAE[chebi_id] = []
+
+ # Append formula:
+ form = Formula(tokens[4], tokens[2])
+ __FORMULAE[chebi_id].append(form)
+
+ elif tokens[3] == 'MASS':
+ __MASSES[int(tokens[1])] = float(tokens[4])
+
+ elif tokens[3] == 'CHARGE':
+ __CHARGES[int(tokens[1])] = int(tokens[4]
+ if tokens[4][-1] != '-'
+ else '-' + tokens[4][:-1])
+
+
+def get_comments(chebi_id):
+ '''Returns comments'''
+ if not __COMMENTS:
+ __parse_comments()
+
+ return __COMMENTS[chebi_id] if chebi_id in __COMMENTS else []
+
+
+def get_all_comments(chebi_ids):
+ '''Returns all comments'''
+ all_comments = [get_comments(chebi_id) for chebi_id in chebi_ids]
+ return [x for sublist in all_comments for x in sublist]
+
+
+def __parse_comments():
+ '''Gets and parses file'''
+ filename = get_file('comments.tsv')
+
+ with io.open(filename, 'r', encoding='cp1252') as textfile:
+ next(textfile)
+
+ for line in textfile:
+ tokens = line.strip().split('\t')
+ chebi_id = int(tokens[1])
+
+ if chebi_id not in __COMMENTS:
+ __COMMENTS[chebi_id] = []
+
+ # Append Comment:
+            com = Comment(tokens[3],
+                          tokens[4],
+                          tokens[5],
+                          datetime.datetime.strptime(tokens[2], '%Y-%m-%d'))
+
+ __COMMENTS[chebi_id].append(com)
+
+
+def get_compound_origins(chebi_id):
+ '''Returns compound origins'''
+ if not __COMPOUND_ORIGINS:
+ __parse_compound_origins()
+ return __COMPOUND_ORIGINS[chebi_id] if chebi_id in \
+ __COMPOUND_ORIGINS else []
+
+
+def get_all_compound_origins(chebi_ids):
+ '''Returns all compound origins'''
+ all_compound_origins = [get_compound_origins(chebi_id)
+ for chebi_id in chebi_ids]
+ return [x for sublist in all_compound_origins for x in sublist]
+
+
+def __parse_compound_origins():
+ '''Gets and parses file'''
+ filename = get_file('compound_origins.tsv')
+
+ with io.open(filename, 'r', encoding='cp1252') as textfile:
+ next(textfile)
+
+ for line in textfile:
+ tokens = line.strip().split('\t')
+
+ if len(tokens) > 10:
+ chebi_id = int(tokens[1])
+
+ if chebi_id not in __COMPOUND_ORIGINS:
+ __COMPOUND_ORIGINS[chebi_id] = []
+
+ # Append CompoundOrigin:
+ comp_orig = CompoundOrigin(tokens[2], tokens[3],
+ tokens[4], tokens[5],
+ tokens[6], tokens[7],
+ tokens[8], tokens[9],
+ tokens[10])
+ __COMPOUND_ORIGINS[chebi_id].append(comp_orig)
+
+
+def get_status(chebi_id):
+ '''Returns status'''
+ if not __STATUSES:
+ __parse_compounds()
+
+ return __STATUSES[chebi_id] if chebi_id in __STATUSES else None
+
+
+def get_source(chebi_id):
+ '''Returns source'''
+ if not __SOURCES:
+ __parse_compounds()
+
+ return __SOURCES[chebi_id] if chebi_id in __SOURCES else None
+
+
+def get_parent_id(chebi_id):
+ '''Returns parent id'''
+ if not __PARENT_IDS:
+ __parse_compounds()
+
+ return __PARENT_IDS[chebi_id] if chebi_id in __PARENT_IDS else float('NaN')
+
+
+def get_all_ids(chebi_id):
+ '''Returns all ids'''
+ if not __ALL_IDS:
+ __parse_compounds()
+
+ return __ALL_IDS[chebi_id] if chebi_id in __ALL_IDS else []
+
+
+def get_name(chebi_id):
+ '''Returns name'''
+ if not __NAMES:
+ __parse_compounds()
+
+ return __NAMES[chebi_id] if chebi_id in __NAMES else None
+
+
+def get_definition(chebi_id):
+ '''Returns definition'''
+ if not __DEFINITIONS:
+ __parse_compounds()
+
+ return __DEFINITIONS[chebi_id] if chebi_id in __DEFINITIONS else None
+
+
+def get_modified_on(chebi_id):
+ '''Returns modified on'''
+ if not __MODIFIED_ONS:
+ __parse_compounds()
+
+ return __MODIFIED_ONS[chebi_id] if chebi_id in __MODIFIED_ONS else None
+
+
+def get_all_modified_on(chebi_ids):
+ '''Returns all modified on'''
+ all_modified_ons = [get_modified_on(chebi_id) for chebi_id in chebi_ids]
+ all_modified_ons = [modified_on for modified_on in all_modified_ons
+ if modified_on is not None]
+ return None if not all_modified_ons else sorted(all_modified_ons)[-1]
+
+
+def get_created_by(chebi_id):
+ '''Returns created by'''
+ if not __CREATED_BYS:
+ __parse_compounds()
+
+    return __CREATED_BYS[chebi_id] if chebi_id in __CREATED_BYS else None
+
+
+def get_star(chebi_id):
+ '''Returns star'''
+ if not __STARS:
+ __parse_compounds()
+
+ return __STARS[chebi_id] if chebi_id in __STARS else float('NaN')
+
+
+def __parse_compounds():
+ '''Gets and parses file'''
+ filename = get_file('compounds.tsv.gz')
+
+ with io.open(filename, 'r', encoding='cp1252') as textfile:
+ next(textfile)
+
+ for line in textfile:
+ tokens = line.strip().split('\t')
+ chebi_id = int(tokens[0])
+
+ __STATUSES[chebi_id] = tokens[1]
+ __SOURCES[chebi_id] = tokens[3]
+
+ parent_id_token = tokens[4]
+ __PARENT_IDS[chebi_id] = float('NaN') \
+ if parent_id_token == 'null' \
+ else int(parent_id_token)
+ __put_all_ids(chebi_id, chebi_id)
+
+ if parent_id_token != 'null':
+ parent_id = int(parent_id_token)
+ __put_all_ids(parent_id, chebi_id)
+
+ __NAMES[chebi_id] = None if tokens[5] == 'null' else tokens[5]
+ __DEFINITIONS[chebi_id] = None if tokens[6] == 'null' \
+ else tokens[6]
+ __MODIFIED_ONS[chebi_id] = None if tokens[7] == 'null' \
+ else datetime.datetime.strptime(tokens[7], '%Y-%m-%d')
+ __CREATED_BYS[chebi_id] = None if tokens[8] == 'null' \
+ or len(tokens) == 9 else tokens[8]
+ __STARS[chebi_id] = float('NaN') \
+ if tokens[9 if len(tokens) > 9 else 8] == 'null' \
+ else int(tokens[9 if len(tokens) > 9 else 8])
+
+
+def __put_all_ids(parent_id, child_id):
+    '''Registers child_id under parent_id in the __ALL_IDS map.'''
+ if parent_id in __ALL_IDS:
+ __ALL_IDS[parent_id].append(child_id)
+ else:
+ __ALL_IDS[parent_id] = [child_id]
+
+
+def get_database_accessions(chebi_id):
+ '''Returns database accession'''
+ if not __DATABASE_ACCESSIONS:
+ __parse_database_accessions()
+
+ return __DATABASE_ACCESSIONS[chebi_id] if chebi_id in \
+ __DATABASE_ACCESSIONS else []
+
+
+def get_all_database_accessions(chebi_ids):
+ '''Returns all database accessions'''
+ all_database_accessions = [get_database_accessions(chebi_id)
+ for chebi_id in chebi_ids]
+ return [x for sublist in all_database_accessions for x in sublist]
+
+
+def __parse_database_accessions():
+ '''Gets and parses file'''
+ filename = get_file('database_accession.tsv')
+
+ with io.open(filename, 'r', encoding='cp1252') as textfile:
+ next(textfile)
+
+ for line in textfile:
+ tokens = line.strip().split('\t')
+ chebi_id = int(tokens[1])
+
+ if chebi_id not in __DATABASE_ACCESSIONS:
+ __DATABASE_ACCESSIONS[chebi_id] = []
+
+ # Append DatabaseAccession:
+ dat_acc = DatabaseAccession(tokens[3], tokens[4], tokens[2])
+
+ __DATABASE_ACCESSIONS[chebi_id].append(dat_acc)
+
+
+def get_inchi(chebi_id):
+ '''Returns InChI string'''
+ if not __INCHIS:
+ __parse_inchi()
+
+ return __INCHIS[chebi_id] if chebi_id in __INCHIS else None
+
+
+def __parse_inchi():
+ '''Gets and parses file'''
+ filename = get_file('chebiId_inchi.tsv')
+
+ with io.open(filename, 'r', encoding='cp1252') as textfile:
+ next(textfile)
+
+ for line in textfile:
+ tokens = line.strip().split('\t')
+ __INCHIS[int(tokens[0])] = tokens[1]
+
+
+def get_names(chebi_id):
+ '''Returns names'''
+ if not __ALL_NAMES:
+ __parse_names()
+
+ return __ALL_NAMES[chebi_id] if chebi_id in __ALL_NAMES else []
+
+
+def get_all_names(chebi_ids):
+ '''Returns all names'''
+ all_names = [get_names(chebi_id) for chebi_id in chebi_ids]
+ return [x for sublist in all_names for x in sublist]
+
+
+def __parse_names():
+ '''Gets and parses file'''
+ filename = get_file('names.tsv.gz')
+
+ with io.open(filename, 'r', encoding='cp1252') as textfile:
+ next(textfile)
+
+ for line in textfile:
+ tokens = line.strip().split('\t')
+ chebi_id = int(tokens[1])
+
+ if chebi_id not in __ALL_NAMES:
+ __ALL_NAMES[chebi_id] = []
+
+ # Append Name:
+ nme = Name(tokens[4],
+ tokens[2],
+ tokens[3],
+ tokens[5] == 'T',
+ tokens[6])
+
+ __ALL_NAMES[chebi_id].append(nme)
+
+
+def get_references(chebi_ids):
+ '''Returns references'''
+ references = []
+ chebi_ids = [str(chebi_id) for chebi_id in chebi_ids]
+
+ filename = get_file('reference.tsv.gz')
+
+ with io.open(filename, 'r', encoding='cp1252') as textfile:
+ next(textfile)
+
+ for line in textfile:
+ tokens = line.strip().split('\t')
+
+ if tokens[0] in chebi_ids:
+ # Append Reference:
+                if len(tokens) > 4:
+                    ref = Reference(tokens[1], tokens[2], tokens[3],
+                                    tokens[4])
+ else:
+ ref = Reference(tokens[1], tokens[2])
+
+ references.append(ref)
+ return references
+
+
+def get_outgoings(chebi_id):
+ '''Returns outgoings'''
+ if not __OUTGOINGS:
+ __parse_relation()
+
+ return __OUTGOINGS[chebi_id] if chebi_id in __OUTGOINGS else []
+
+
+def get_all_outgoings(chebi_ids):
+ '''Returns all outgoings'''
+ all_outgoings = [get_outgoings(chebi_id) for chebi_id in chebi_ids]
+ return [x for sublist in all_outgoings for x in sublist]
+
+
+def get_incomings(chebi_id):
+ '''Returns incomings'''
+ if not __INCOMINGS:
+ __parse_relation()
+
+ return __INCOMINGS[chebi_id] if chebi_id in __INCOMINGS else []
+
+
+def get_all_incomings(chebi_ids):
+ '''Returns all incomings'''
+ all_incomings = [get_incomings(chebi_id) for chebi_id in chebi_ids]
+ return [x for sublist in all_incomings for x in sublist]
+
+
+def __parse_relation():
+    '''Gets and parses file'''
+    relation_filename = get_file('relation.tsv')
+
+    with io.open(relation_filename, 'r', encoding='cp1252') as relation_textfile:
+        next(relation_textfile)
+
+        for line in relation_textfile:
+            tokens = line.strip().split('\t')
+
+            source_chebi_id = int(tokens[3])
+            target_chebi_id = int(tokens[2])
+            typ = tokens[1]
+
+            if source_chebi_id not in __OUTGOINGS:
+                __OUTGOINGS[source_chebi_id] = []
+
+            if target_chebi_id not in __INCOMINGS:
+                __INCOMINGS[target_chebi_id] = []
+
+            target_relation = Relation(typ, str(target_chebi_id), tokens[4])
+            source_relation = Relation(typ, str(source_chebi_id), tokens[4])
+
+            __OUTGOINGS[source_chebi_id].append(target_relation)
+            __INCOMINGS[target_chebi_id].append(source_relation)
+
+
+def get_inchi_key(chebi_id):
+ '''Returns InChI key'''
+ if not __INCHI_KEYS:
+ __parse_structures()
+
+ return __INCHI_KEYS[chebi_id] if chebi_id in __INCHI_KEYS else None
+
+
+def get_smiles(chebi_id):
+    '''Returns SMILES'''
+ if not __SMILES:
+ __parse_structures()
+
+ return __SMILES[chebi_id] if chebi_id in __SMILES else None
+
+
+def get_mol(chebi_id):
+ '''Returns mol'''
+ chebi_id_regexp = '^\\d+\\,' + str(chebi_id) + '\\,.*'
+ mol_file_end_regexp = '\",mol,\\dD,[Y\\|N],[Y\\|N]$'
+ this_structure = []
+
+ filename = get_file('structures.csv.gz')
+
+ with io.open(filename, 'r', encoding='cp1252') as textfile:
+ in_chebi_id = False
+
+ next(textfile)
+
+ for line in textfile:
+ if in_chebi_id or line[0].isdigit():
+ if re.match(chebi_id_regexp, line):
+ tokens = line.strip().split(',')
+ in_chebi_id = True
+ this_structure = []
+ this_structure.append(','.join(tokens[2:])
+ .replace('\"', ''))
+ this_structure.append('\n')
+ elif in_chebi_id:
+
+ if re.match(mol_file_end_regexp, line):
+ tokens = line.strip().split(',')
+
+ if _is_default_structure(tokens[3]):
+ tokens = line.strip().split(',')
+ this_structure.append(tokens[0].replace('\"', ''))
+ return Structure(''.join(this_structure),
+ Structure.mol,
+ int(tokens[2][0]))
+
+ # else:
+ this_structure = []
+ in_chebi_id = False
+ continue
+
+ this_structure.append(line)
+
+ return None
+
+
+def get_mol_filename(chebi_id):
+ '''Returns mol file'''
+ mol = get_mol(chebi_id)
+
+ if mol is None:
+ return None
+
+    # Keyword args ensure the id is the prefix and .mol the suffix
+    file_descriptor, mol_filename = tempfile.mkstemp(
+        prefix=str(chebi_id) + '_', suffix='.mol')
+ mol_file = open(mol_filename, 'w')
+ mol_file.write(mol.get_structure())
+ mol_file.close()
+ os.close(file_descriptor)
+
+ return mol_filename
+
+
+def __parse_structures():
+    '''Gets and parses structures file, caching InChIKey and SMILES structures'''
+ filename = get_file('structures.csv.gz')
+
+ with io.open(filename, 'r', encoding='cp1252') as textfile:
+ next(textfile)
+
+ for line in textfile:
+ tokens = line.strip().split(',')
+
+ if len(tokens) == 7:
+ if tokens[3] == 'InChIKey':
+ __INCHI_KEYS[int(tokens[1])] = \
+ Structure(tokens[2],
+ Structure.InChIKey,
+ int(tokens[4][0]))
+ elif tokens[3] == 'SMILES':
+ __SMILES[int(tokens[1])] = \
+ Structure(tokens[2],
+ Structure.SMILES,
+ int(tokens[4][0]))
+
+
+def get_file(filename):
+ '''Downloads filename from ChEBI FTP site'''
+ destination = __DOWNLOAD_PARAMS['path']
+ filepath = os.path.join(destination, filename)
+
+ if not __is_current(filepath):
+
+ if not os.path.exists(destination):
+ os.makedirs(destination)
+
+        url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' + \
+            'Flat_file_tab_delimited/'
+
+        # wget is used instead of urlretrieve due to FTP issues
+        wget.download(url + filename, out=filepath)
+
+
+ if filepath.endswith('.zip'):
+ zfile = zipfile.ZipFile(filepath, 'r')
+ filepath = os.path.join(destination, zfile.namelist()[0])
+ zfile.extractall(destination)
+ elif filepath.endswith('.gz'):
+ unzipped_filepath = filepath[:-len('.gz')]
+
+ if os.path.exists(unzipped_filepath) \
+ and __is_current(unzipped_filepath):
+ filepath = unzipped_filepath
+ else:
+ input_file = gzip.open(filepath, 'rb')
+ filepath = os.path.join(destination, input_file.name[:-len('.gz')])
+ output_file = open(filepath, 'wb')
+
+ for line in input_file:
+ output_file.write(line)
+
+ input_file.close()
+ output_file.close()
+
+ return filepath
+
+
+def __is_current(filepath):
+ '''Checks whether file is current'''
+ if not __DOWNLOAD_PARAMS['auto_update']:
+ return True
+
+ if not os.path.isfile(filepath):
+ return False
+
+ return datetime.datetime.utcfromtimestamp(os.path.getmtime(filepath)) \
+ > __get_last_update_time()
+
+
+def __get_last_update_time():
+ '''Returns last FTP site update time'''
+ now = datetime.datetime.utcnow()
+
+ # Get the first Tuesday of the month
+ first_tuesday = __get_first_tuesday(now)
+
+ if first_tuesday < now:
+ return first_tuesday
+ # else:
+ first_of_month = datetime.datetime(now.year, now.month, 1)
+ last_month = first_of_month + datetime.timedelta(days=-1)
+ return __get_first_tuesday(last_month)
+
+
+def __get_first_tuesday(this_date):
+ '''Get the first Tuesday of the month'''
+ month_range = calendar.monthrange(this_date.year, this_date.month)
+ first_of_month = datetime.datetime(this_date.year, this_date.month, 1)
+ first_tuesday_day = (calendar.TUESDAY - month_range[0]) % 7
+ first_tuesday = first_of_month + datetime.timedelta(days=first_tuesday_day)
+ return first_tuesday
+
+
+def _is_default_structure(def_struct):
+ '''Is default structure?'''
+ return def_struct.upper() == 'Y'