From cbab34cdd5248ea9c723252026a13691cc17a0ac Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Thu, 16 Mar 2023 19:05:48 -0600 Subject: [PATCH 01/29] Create README.md --- metanetx_uniprot/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 metanetx_uniprot/README.md diff --git a/metanetx_uniprot/README.md b/metanetx_uniprot/README.md new file mode 100644 index 00000000..1165c3cc --- /dev/null +++ b/metanetx_uniprot/README.md @@ -0,0 +1 @@ +Code is reused from Biochem4j: https://github.com/neilswainston/biochem4j/tree/master/sbcdb From f875441805acf65b2e03f16fb79b15e306cb090a Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Thu, 16 Mar 2023 19:07:24 -0600 Subject: [PATCH 02/29] Add files via upload --- metanetx_uniprot/build.py | 73 ++++++ metanetx_uniprot/chebi_utils.py | 39 +++ metanetx_uniprot/chemical_utils.py | 172 ++++++++++++ metanetx_uniprot/enzyme_utils.py | 65 +++++ metanetx_uniprot/index.py | 32 +++ metanetx_uniprot/init.cql | 35 +++ metanetx_uniprot/kegg_utils.py | 93 +++++++ metanetx_uniprot/mnxref_utils.py | 291 +++++++++++++++++++++ metanetx_uniprot/namespace_utils.py | 61 +++++ metanetx_uniprot/ncbi_taxonomy_utils.py | 93 +++++++ metanetx_uniprot/reaction_utils.py | 82 ++++++ metanetx_uniprot/rhea_utils.py | 63 +++++ metanetx_uniprot/seq_utils.py | 112 ++++++++ metanetx_uniprot/spectra_utils.py | 122 +++++++++ metanetx_uniprot/test/__init__.py | 9 + metanetx_uniprot/test/test_enzyme_utils.py | 39 +++ metanetx_uniprot/test/test_mnxref_utils.py | 37 +++ metanetx_uniprot/utils.py | 73 ++++++ 18 files changed, 1491 insertions(+) create mode 100644 metanetx_uniprot/build.py create mode 100644 metanetx_uniprot/chebi_utils.py create mode 100644 metanetx_uniprot/chemical_utils.py create mode 100644 metanetx_uniprot/enzyme_utils.py create mode 100644 metanetx_uniprot/index.py create mode 100644 metanetx_uniprot/init.cql create mode 100644 metanetx_uniprot/kegg_utils.py create mode 100644 metanetx_uniprot/mnxref_utils.py create mode 100644 metanetx_uniprot/namespace_utils.py create mode 100644 metanetx_uniprot/ncbi_taxonomy_utils.py create mode 100644 metanetx_uniprot/reaction_utils.py create mode 100644 metanetx_uniprot/rhea_utils.py create mode 100644 metanetx_uniprot/seq_utils.py create mode 100644 metanetx_uniprot/spectra_utils.py create mode 100644 metanetx_uniprot/test/__init__.py create mode 100644 metanetx_uniprot/test/test_enzyme_utils.py create mode 100644 metanetx_uniprot/test/test_mnxref_utils.py create mode 100644 metanetx_uniprot/utils.py diff --git a/metanetx_uniprot/build.py b/metanetx_uniprot/build.py new file mode 100644 index 00000000..c276e641 --- /dev/null +++ b/metanetx_uniprot/build.py @@ -0,0 +1,73 @@ +''' +SYNBIOCHEM-DB (c) University of Manchester 2015 + +SYNBIOCHEM-DB is licensed under the MIT License. + +To view a copy of this license, visit . + +@author: neilswainston +''' +import multiprocessing +import sys + +import chebi_utils, chemical_utils, mnxref_utils, \ + ncbi_taxonomy_utils, reaction_utils, rhea_utils, spectra_utils, utils, seq_utils #, kegg_utils + + +def build_csv(dest_dir, array_delimiter, num_threads): + '''Build database CSV files.''' + writer = utils.Writer(dest_dir) + + # Get Organism data: + print('Parsing NCBI Taxonomy') + #ncbi_taxonomy_utils.load(writer, array_delimiter) + + # Get Chemical and Reaction data. 
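+    # (Note: the ChemicalManager and ReactionManager below are shared by
+    # all of the loaders, so chemicals and reactions referenced by more
+    # than one source - MNXref, ChEBI, Rhea - are merged into single
+    # unique nodes before the csv files are written.)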
+ # Write chemistry csv files: + chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter) + reac_man = reaction_utils.ReactionManager() + + + #print('Parsing MNXref') + mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer) + mnx_loader.load() + + print('Parsing ChEBI') + #chebi_utils.load(chem_man, writer) + + ####Using all memory (120+Gb) and eventually is killed + # Get Spectrum data: + #print('Parsing spectrum data') + #spectra_utils.load(writer, chem_man, array_delimiter=array_delimiter) + + #chem_man.write_files(writer) + + ####Not including KEGG for now + # Get Reaction / Enzyme / Organism data: + #print('Parsing KEGG') + #kegg_utils.load(reac_man, num_threads=num_threads) + + + print('Parsing Rhea') + #rhea_utils.load(reac_man, num_threads=num_threads) + #reac_man.write_files(writer) + + +def main(args): + '''main method''' + num_threads = 0 + + if len(args) > 2: + try: + num_threads = int(args[2]) + except ValueError: + if args[2] == 'True': + num_threads = multiprocessing.cpu_count() + + print('Running build with ' + str(num_threads) + ' threads') + + build_csv(args[0], args[1], num_threads) + + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/metanetx_uniprot/chebi_utils.py b/metanetx_uniprot/chebi_utils.py new file mode 100644 index 00000000..284a687d --- /dev/null +++ b/metanetx_uniprot/chebi_utils.py @@ -0,0 +1,39 @@ +''' +SYNBIOCHEM-DB (c) University of Manchester 2015 + +SYNBIOCHEM-DB is licensed under the MIT License. + +To view a copy of this license, visit . + +@author: neilswainston +''' +from libchebipy._chebi_entity import ChebiEntity + + +def load(chem_manager, writer): + '''Loads ChEBI data from libChEBIpy.''' + chebi_ids = [] + rels = [] + + _add_node('CHEBI:24431', chebi_ids, rels, chem_manager) + + writer.write_rels(rels, 'Chemical', 'Chemical') + + +def _add_node(chebi_id, chebi_ids, rels, chem_manager): + '''Constructs a node from libChEBI.''' + if chebi_id not in chebi_ids: + chebi_ids.append(chebi_id) + + chem_id, entity = chem_manager.add_chemical({'chebi': chebi_id}) + + for incoming in entity.get_incomings(): + target_id = incoming.get_target_chebi_id() + + chebi_ent = ChebiEntity(target_id) + + if chebi_ent.get_parent_id(): + target_id = chebi_ent.get_parent_id() + + _add_node(target_id, chebi_ids, rels, chem_manager) + rels.append([target_id, incoming.get_type(), chem_id]) diff --git a/metanetx_uniprot/chemical_utils.py b/metanetx_uniprot/chemical_utils.py new file mode 100644 index 00000000..30567ed4 --- /dev/null +++ b/metanetx_uniprot/chemical_utils.py @@ -0,0 +1,172 @@ +''' +SYNBIOCHEM-DB (c) University of Manchester 2015 + +SYNBIOCHEM-DB is licensed under the MIT License. + +To view a copy of this license, visit . 
+ +@author: neilswainston +''' +import math +import uuid + +from libchebipy._chebi_entity import ChebiEntity, ChebiException + +import namespace_utils as ns_utils +from synbiochem.utils import chem_utils + + +class ChemicalManager(object): + '''Class to implement a manager of Chemical data.''' + + def __init__(self, array_delimiter): + '''Constructor.''' + self.__array_delimiter = array_delimiter + self.__nodes = {} + self.__chem_ids = {} + + def write_files(self, writer): + '''Write neo4j import files.''' + return writer.write_nodes(self.__nodes.values(), 'Chemical') + + def add_chemical(self, properties): + '''Adds a chemical to the collection of nodes, ensuring uniqueness.''' + chem_id, chebi_ent = self.__get_chem_id(properties) + + if 'charge:float' in properties: + charge = properties.pop('charge:float') + + if not math.isnan(charge): + properties['charge:float'] = int(charge) + + if chem_id not in self.__nodes: + properties[':LABEL'] = 'Chemical' + properties['id:ID(Chemical)'] = chem_id + properties['source'] = 'chebi' if 'chebi' in properties else 'mnx' + + _normalise_mass(properties) + self.__nodes[chem_id] = properties + else: + self.__nodes[chem_id].update(properties) + + return chem_id, chebi_ent + + def get_props(self, prop, default=None): + '''Gets all chem_ids to property as a dict.''' + return {key: self.__nodes[chem_id].get(prop, default) + for key, chem_id in self.__chem_ids.items()} + + def get_prop(self, chem_id, prop, default=None): + '''Gets a property.''' + return self.__nodes[self.__chem_ids[chem_id]].get(prop, default) + + def __get_chem_id(self, properties): + '''Manages chemical id mapping.''' + chebi_id = properties.get('chebi', None) + chebi_ent = None + + if chebi_id: + try: + chebi_id, chebi_ent = _get_chebi_data(chebi_id, properties, + self.__array_delimiter) + except ChebiException as exception: + properties.pop('chebi') + chebi_id = None + print(exception) + except ValueError as exception: + properties.pop('chebi') + chebi_id = None + print(exception) + + mnx_id = properties.get('mnx', None) + inchi_id = properties.get('inchi', None) + + if chebi_id: + self.__chem_ids[chebi_id] = chebi_id + + if inchi_id: + self.__chem_ids[inchi_id] = chebi_id + + if mnx_id: + self.__chem_ids[mnx_id] = chebi_id + + return chebi_id, chebi_ent + + if inchi_id: + chem_id = self.__chem_ids.get(inchi_id, None) + + if chem_id: + return chem_id, None + + if mnx_id: + chem_id = self.__chem_ids.get(mnx_id, None) + + if chem_id: + return chem_id, None + + if inchi_id: + self.__chem_ids[inchi_id] = mnx_id + + self.__chem_ids[mnx_id] = mnx_id + return mnx_id, None + + new_id = str(uuid.uuid4()) + self.__chem_ids[inchi_id] = new_id + + return new_id, None + + +def _get_chebi_data(chebi_id, properties, array_delimiter): + '''Gets ChEBI data.''' + chebi_ent = ChebiEntity(str(chebi_id)) + + if chebi_ent.get_parent_id(): + chebi_id = chebi_ent.get_parent_id() + else: + chebi_id = chebi_ent.get_id() + + properties['chebi'] = chebi_id + + formula = chebi_ent.get_formula() + charge = chebi_ent.get_charge() + inchi = chebi_ent.get_inchi() + smiles = chebi_ent.get_smiles() + + if formula: + properties['formula'] = formula + + if not math.isnan(charge): + properties['charge:float'] = charge + + if inchi: + properties['inchi'] = inchi + + if smiles: + properties['smiles'] = smiles + + properties['name'] = chebi_ent.get_name() + properties['names:string[]'] = \ + array_delimiter.join([name.get_name() + for name in chebi_ent.get_names()] + + [chebi_ent.get_name()]) + + for db_acc in 
chebi_ent.get_database_accessions():
+        namespace = ns_utils.resolve_namespace(
+            db_acc.get_type(), True)
+
+        if namespace is not None:
+            properties[namespace] = db_acc.get_accession_number()
+
+    return chebi_id, chebi_ent
+
+
+def _normalise_mass(properties):
+    '''Removes ambiguity in mass values by recalculating according to chemical
+    formula.'''
+    properties.pop('mass:float', None)
+
+    if 'formula' in properties and properties['formula'] is not None:
+        mono_mass = chem_utils.get_molecular_mass(properties['formula'])
+
+        if not math.isnan(mono_mass):
+            properties['monoisotopic_mass:float'] = mono_mass
diff --git a/metanetx_uniprot/enzyme_utils.py b/metanetx_uniprot/enzyme_utils.py
new file mode 100644
index 00000000..6f90b475
--- /dev/null
+++ b/metanetx_uniprot/enzyme_utils.py
@@ -0,0 +1,65 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
+
+@author: neilswainston
+'''
+#from synbiochem.utils import seq_utils
+import queue
+from seq_utils import *
+
+
+class EnzymeManager(object):
+    '''Class to implement a manager of Enzyme data.'''
+
+    def __init__(self):
+        '''Constructor.'''
+        self.__nodes = {}
+        self.__org_enz_rels = []
+
+    def get_nodes(self):
+        '''Gets enzyme nodes.'''
+        return self.__nodes.values()
+
+    def get_org_enz_rels(self):
+        '''Gets organism-to-enzyme relationships.'''
+        return self.__org_enz_rels
+
+    def add_uniprot_data(self, enzyme_ids, source, num_threads=0):
+        '''Gets UniProt data.'''
+
+        #fields = ['entry name', 'protein names', 'organism-id', 'ec']
+        fields = ['id', 'protein_name', 'organism_id', 'ec']
+        enzyme_ids = [enzyme_id for enzyme_id in enzyme_ids
+                      if enzyme_id not in self.__nodes]
+        uniprot_values = get_uniprot_values(enzyme_ids, fields,
+                                            batch_size=128,  # changed to 128 from 512
+                                            verbose=False,  # changed to False
+                                            num_threads=num_threads)
+
+        for uniprot_id, uniprot_value in uniprot_values.items():
+            enzyme_node = {':LABEL': 'Enzyme',
+                           'uniprot:ID(Enzyme)': uniprot_id}
+            self.__nodes[uniprot_id] = enzyme_node
+
+            organism_id = uniprot_value.pop('Organism (ID)') \
+                if 'Organism (ID)' in uniprot_value else None
+
+            if 'Entry name' in uniprot_value:
+                enzyme_node['entry'] = uniprot_value['Entry name']
+
+            if 'Protein names' in uniprot_value:
+                enzyme_node['names'] = uniprot_value['Protein names']
+
+                if enzyme_node['names']:
+                    enzyme_node['name'] = enzyme_node['names'][0]
+
+            if 'EC number' in uniprot_value:
+                enzyme_node['ec-code'] = uniprot_value['EC number']
+
+            if organism_id:
+                self.__org_enz_rels.append([organism_id, 'expresses',
+                                            uniprot_id, {'source': source}])
diff --git a/metanetx_uniprot/index.py b/metanetx_uniprot/index.py
new file mode 100644
index 00000000..3adb2499
--- /dev/null
+++ b/metanetx_uniprot/index.py
@@ -0,0 +1,32 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
+
+@author: neilswainston
+'''
+import os
+import subprocess
+import sys
+
+
+def index_db(db_loc):
+    '''Index database.'''
+    directory = os.path.dirname(os.path.realpath(__file__))
+    filename = os.path.join(directory, 'init.cql')
+
+    with open(filename, 'r') as init_file:
+        for line in init_file:
+            params = ['neo4j-shell', '-path', db_loc, '-c', line.strip()]
+            subprocess.call(params)
+
+
+def main(argv):
+    '''main method'''
+    index_db(argv[0])
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/metanetx_uniprot/init.cql b/metanetx_uniprot/init.cql
new file mode 100644
index 00000000..7e7216e9
--- /dev/null
+++ b/metanetx_uniprot/init.cql
@@ -0,0 +1,35 @@
+CREATE CONSTRAINT ON (n:Organism) ASSERT n.taxonomy IS UNIQUE;
+CREATE CONSTRAINT ON (n:Enzyme) ASSERT n.entry IS UNIQUE;
+CREATE CONSTRAINT ON (n:Enzyme) ASSERT n.uniprot IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.`bigg.reaction` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.id IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.`kegg.reaction` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.metacyc IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.mnx IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.reactome IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.rhea IS UNIQUE;
+CREATE CONSTRAINT ON (n:Reaction) ASSERT n.seed IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`bigg.metabolite` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.cas IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chebi IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chemidplus IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chemspider IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.drugbank IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.hmdb IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.id IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.compound` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.drug` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.glycan` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.knapsack IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.lipidmaps IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.metacyc IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.mnx IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.molbase IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.pdb IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.pubmed IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.reactome IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.resid IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`seed.compound` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`umbbd.compound` IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.unipathway IS UNIQUE;
+CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`wikipedia.en` IS UNIQUE;
\ No newline at end of file
diff --git a/metanetx_uniprot/kegg_utils.py b/metanetx_uniprot/kegg_utils.py
new file mode 100644
index 00000000..95c8d65b
--- /dev/null
+++ b/metanetx_uniprot/kegg_utils.py
@@ -0,0 +1,93 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
+
+@author: neilswainston
+'''
+from collections import defaultdict
+from urllib.error import URLError
+from urllib.request import urlopen
+
+from synbiochem.utils import thread_utils
+
+
+def load(reaction_manager, organisms=None, num_threads=0):
+    '''Loads KEGG data.'''
+
+    if organisms is None:
+        organisms = \
+            sorted([line.decode('utf-8').split()[1] for line in
+                    urlopen('http://rest.kegg.jp/list/organism')])
+
+    # EC to gene, gene to Uniprot:
+    ec_genes, gene_uniprots = _get_gene_data(organisms, num_threads)
+
+    data = defaultdict(list)
+
+    # KEGG Reaction to EC:
+    kegg_reac_ec = _parse_url('http://rest.kegg.jp/link/ec/reaction')
+
+    for kegg_reac, ec_terms in kegg_reac_ec.items():
+        for ec_term in ec_terms:
+            if ec_term in ec_genes:
+                for gene in ec_genes[ec_term]:
+                    if gene in gene_uniprots:
+                        uniprots = [val[3:] for val in gene_uniprots[gene]]
+                        data[kegg_reac[3:]].extend(uniprots)
+
+    reaction_manager.add_react_to_enz(data, 'kegg.reaction', num_threads)
+
+
+def _get_gene_data(organisms, num_threads):
+    '''Gets gene data.'''
+    ec_genes = defaultdict(list)
+    gene_uniprots = defaultdict(list)
+
+    if num_threads:
+        thread_pool = thread_utils.ThreadPool(num_threads)
+
+        for org in organisms:
+            thread_pool.add_task(_parse_organism, org, ec_genes, gene_uniprots)
+
+        thread_pool.wait_completion()
+    else:
+        for org in organisms:
+            _parse_organism(org, ec_genes, gene_uniprots)
+
+    return ec_genes, gene_uniprots
+
+
+def _parse_organism(org, ec_genes, gene_uniprots):
+    '''Parse organism.'''
+    print('KEGG: loading ' + org)
+
+    for key, value in _parse_url('http://rest.kegg.jp/link/' + org.lower() +
+                                 '/enzyme').items():
+        ec_genes[key].extend(value)
+
+    for key, value in _parse_url('http://rest.kegg.jp/conv/uniprot/' +
+                                 org.lower()).items():
+        gene_uniprots[key].extend(value)
+
+
+def _parse_url(url, attempts=16):
+    '''Parses url to form key to list of values dictionary.'''
+    data = defaultdict(list)
+
+    for _ in range(attempts):
+        try:
+            for line in urlopen(url):
+                tokens = line.decode('utf-8').split()
+
+                if len(tokens) > 1:
+                    data[tokens[0]].append(tokens[1])
+
+            return data
+        except URLError as err:
+            # Take no action, but try again...
+            print('\t'.join([url, str(err)]))
+
+    return data
diff --git a/metanetx_uniprot/mnxref_utils.py b/metanetx_uniprot/mnxref_utils.py
new file mode 100644
index 00000000..cbb67687
--- /dev/null
+++ b/metanetx_uniprot/mnxref_utils.py
@@ -0,0 +1,291 @@
+'''
+SYNBIOCHEM-DB (c) University of Manchester 2015
+
+SYNBIOCHEM-DB is licensed under the MIT License.
+
+To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
+ +@author: neilswainston +''' +# pylint: disable=no-member +# pylint: disable=too-few-public-methods +# pylint: disable=too-many-locals +from collections import Counter +import csv +import itertools +import math +import re +import urllib +from urllib.request import urlopen +import requests + +import numpy +#from subliminal import balance + +import namespace_utils +from synbiochem.utils import chem_utils + + +_METANETX_URL = 'http://metanetx.org/cgi-bin/mnxget/mnxref/' + + +class MnxRefReader(object): + '''Class to read MnxRef data from the chem_prop.tsv, the chem_xref.tsv and + reac_prop.tsv files.''' + + def __init__(self, source=_METANETX_URL): + self.__source = source + self.__mnx_id_patt = re.compile(r'(MNX[MR])(\d+)') + self.__chem_data = {} + self.__reac_data = {} + + def get_chem_data(self): + '''Gets chemical data.''' + if not self.__chem_data: + self.__read_chem_prop() + self.__read_xref('chem_xref.tsv', self.__chem_data, True) + + return self.__chem_data + + def get_reac_data(self): + '''Gets reaction data.''' + if not self.__reac_data: + self.__read_reac_prop() + self.__read_xref('reac_xref.tsv', self.__reac_data, False) + + return self.__reac_data + + def __read_chem_prop(self): + '''Read chemical properties and create Nodes.''' + chem_prop_keys = ['id', 'name', 'reference','formula', 'charge:float', + 'mass:float', 'inchi', 'inchikey', 'smiles'] + + for values in self.__read_data('chem_prop.tsv'): + if not values[0].startswith('#'): + values[0] = self.__parse_id(values[0]) + values[2] = self.__parse_id(values[2]) + props = dict(zip(chem_prop_keys, values)) + props.pop('reference') + _convert_to_float(props, 'charge:float') + _convert_to_float(props, 'mass:float') + props = {key: value for key, value in props.items() + if value != ''} + self.__chem_data[values[0]] = props + + def __read_xref(self, filename, data, chemical): + '''Read xrefs and update Nodes.''' + xref_keys = ['XREF', 'MNX_ID', 'Description'] + + for values in self.__read_data(filename): + if not values[0].startswith('#'): + xrefs = dict(zip(xref_keys[:len(values)], values)) + evidence = 'none' + + if evidence == 'identity' or evidence == 'structural': + xrefs['MNX_ID'] = self.__parse_id(xrefs['MNX_ID']) + xref = xrefs['XREF'].split(':') + + if xrefs['MNX_ID'] in data: + entry = data[xrefs['MNX_ID']] + self.__add_xref(xref, entry, chemical) + + def __add_xref(self, xref, entry, chemical): + '''Adds an xref.''' + namespace = namespace_utils.resolve_namespace(xref[0], + chemical) + + if namespace is not None: + xref[1] = self.__parse_id(xref[1]) + + entry[namespace] = xref[1] \ + if namespace != 'chebi' \ + else 'CHEBI:' + xref[1] + + def __read_reac_prop(self): + '''Read reaction properties and create Nodes.''' + reac_prop_keys = ['id', 'equation', 'reference', 'ec', 'balance', 'transport'] + + for values in self.__read_data('reac_prop.tsv'): + if not values[0].startswith('#'): + values[0] = self.__parse_id(values[0]) + values[2] = self.__parse_id(values[2]) + + props = dict(zip(reac_prop_keys, values)) + props.pop('reference') + + try: + participants = chem_utils.parse_equation( + props.pop('equation')) + + for participant in participants: + participant[0] = self.__parse_id(participant[0]) + + if participant[0] not in self.__chem_data: + self.__add_chem(participant[0]) + + props['reac_defn'] = participants + self.__reac_data[values[0]] = props + except ValueError: + print('WARNING: Suspected polymerisation reaction: ' + \ + values[0] + '\t' + str(props)) + + def __add_chem(self, chem_id): + '''Adds a chemical 
with given id.''' + props = {'id': chem_id} + self.__chem_data[chem_id] = props + return props + + def __read_data(self, filename): + '''Downloads and reads tab-limited files into lists of lists of + strings.''' + with requests.Session() as s: + download = s.get(self.__source + filename) + + decoded_content = download.content.decode('utf-8') + + cr = csv.reader(decoded_content.splitlines(), delimiter='\t') + my_list = list(cr) + return my_list + + + def __parse_id(self, item_id): + '''Parses mnx ids.''' + matches = self.__mnx_id_patt.findall(item_id) + + for mat in matches: + return mat[0] + str(int(mat[1])) + + return item_id + + +class MnxRefLoader(object): + '''Loads MNXref data into neo4j format.''' + + def __init__(self, chem_man, reac_man, writer): + self.__chem_man = chem_man + self.__reac_man = reac_man + self.__writer = writer + + def load(self): + '''Loads MnxRef data from chem_prop.tsv, chem_xref.tsv, + reac_prop.tsv and reac_xref.tsv files.''' + reader = MnxRefReader() + + for properties in reader.get_chem_data().values(): + properties['mnx'] = properties.pop('id') + self.__chem_man.add_chemical(properties) + + rels = self.__add_reac_nodes(reader.get_reac_data()) + + return [], [self.__writer.write_rels(rels, 'Reaction', 'Chemical')] + + def __add_reac_nodes(self, reac_data): + '''Get reaction nodes from data.''' + reac_id_def = {} + + for properties in reac_data.values(): + reac_def = [] + mnx_id = properties.pop('id') + + # Remove equation and description (may be inconsistent with + # balanced reaction): + if 'description' in properties: + properties.pop('description') + + for prt in properties.pop('reac_defn'): + chem_id, _ = self.__chem_man.add_chemical({'mnx': prt[0]}) + + reac_def.append([self.__chem_man.get_prop(prt[0], 'formula'), + self.__chem_man.get_prop(prt[0], + 'charge:float', 0), + prt[1], + chem_id]) + + #NOT BALANCING REACTIONS since this library doesn't seem to exist anymore + ''' + if all([values[0] is not None for values in reac_def]): + balanced, _, balanced_def = balance.balance_reac(reac_def) + #properties['balance'] = balanced + else: + properties['balance'] = 'unknown' + balanced_def = reac_def + ''' + properties['balance'] = 'unknown' + balanced_def = reac_def + + + reac_id = self.__reac_man.add_reaction('mnx', mnx_id, + properties) + reac_id_def[reac_id] = balanced_def + + chem_id_mass = self.__chem_man.get_props('monoisotopic_mass:float', + float('NaN')) + cofactors = [chem_id + for chem_id, mass in chem_id_mass.items() + if mass > 0 and mass < 44] # Assume mass < CO2 = cofactor + + cofactor_pairs = _calc_cofactors(reac_id_def.values(), cofactors) + rels = [] + + for reac_id, defn in reac_id_def.items(): + reactants = [term[3] for term in defn if term[2] < 0] + products = [term[3] for term in defn if term[2] > 0] + reac_cofactors = [] + + # Set metabolites as cofactors: + for met in [term[3] for term in defn]: + if met in cofactors: + reac_cofactors.append(met) + + # Set pairs as cofactors: + for pair in itertools.product(reactants, products): + if tuple(sorted(pair)) in cofactor_pairs: + reac_cofactors.extend(pair) + + for term in defn: + rels.append([reac_id, + 'has_cofactor' if term[3] in reac_cofactors + else 'has_reactant', + term[3], + {'stoichiometry:float': term[2]}]) + + return rels + + +def _calc_cofactors(reaction_defs, cofactors, cutoff=0.8): + '''Calculates cofactors.''' + pairs = Counter() + + # Calculate all reactant / product pairs... 
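+    # (Pairs that co-occur across many reactions - e.g. NAD(+)/NADH or
+    # ATP/ADP - are assumed to be cofactor pairs; _filter below keeps the
+    # pairs whose counts exceed a cutoff fitted to the power-law tail of
+    # the pair-frequency histogram.)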
+ for reaction_def in reaction_defs: + reactants = [term[3] for term in reaction_def if term[2] < 0 and + term[3] not in cofactors] + products = [term[3] for term in reaction_def if term[2] > 0 and + term[3] not in cofactors] + + pairs.update([tuple(sorted(pair)) + for pair in itertools.product(reactants, products)]) + + return _filter(pairs, cutoff) + + +def _filter(counter, cutoff): + '''Filter counter items according to cutoff.''' + # Count occurences of pairs, then bin into a histogram... + hist_counter = Counter(counter.values()) + + # Fit straight-line to histogram log-log plot and filter... + x_val, y_val = zip(*list(hist_counter.items())) + m_val, b_val = numpy.polyfit(numpy.log(x_val), numpy.log(y_val), 1) + + return [item[0] for item in counter.items() + if item[1] > math.exp(cutoff * -b_val / m_val)] + + +def _convert_to_float(dictionary, key): + '''Converts a key value in a dictionary to a float.''' + if dictionary.get(key, None): + dictionary[key] = float(dictionary[key] if dictionary[key] != 'NA' else 'NaN') + else: + # Remove key: + dictionary.pop(key, None) diff --git a/metanetx_uniprot/namespace_utils.py b/metanetx_uniprot/namespace_utils.py new file mode 100644 index 00000000..bb6bd665 --- /dev/null +++ b/metanetx_uniprot/namespace_utils.py @@ -0,0 +1,61 @@ +''' +synbiochem (c) University of Manchester 2015 + +synbiochem is licensed under the MIT License. + +To view a copy of this license, visit . + +@author: neilswainston +''' +__CHEMICAL_NAMESPACE = { + # value (namespace) corresponds to identifiers.org: + 'bigg': 'bigg.metabolite', + 'CAS Registry Number': 'cas', + 'chebi': 'chebi', + 'ChemIDplus accession': 'chemidplus', + 'Chemspider accession': 'chemspider', + 'DrugBank accession': 'drugbank', + 'hmdb': 'hmdb', + 'HMDB accession': 'hmdb', + 'kegg': 'kegg.compound', + 'KEGG COMPOUND accession': 'kegg.compound', + 'KEGG DRUG accession': 'kegg.drug', + 'KEGG GLYCAN accession': 'kegg.glycan', + 'KNApSAcK accession': 'knapsack', + 'lipidmaps': 'lipidmaps', + 'LIPID MAPS instance accession': 'lipidmaps', + 'MolBase accession': 'molbase', + 'PDB accession': 'pdb', + 'PubMed citation': 'pubmed', + 'reactome': 'reactome', + 'RESID accession': 'resid', + 'seed': 'seed.compound', + 'umbbd': 'umbbd.compound', + 'UM-BBD compID': 'umbbd.compound', + 'upa': 'unipathway', + 'Wikipedia accession': 'wikipedia.en', + + # Not in identifiers.org: + 'metacyc': 'metacyc', + 'MetaCyc accession': 'metacyc', + 'mnx': 'mnx' +} + +__REACTION_NAMESPACE = { + # value (namespace) corresponds to identifiers.org: + 'bigg': 'bigg.reaction', + 'kegg': 'kegg.reaction', + 'reactome': 'reactome', + 'rhea': 'rhea', + 'seed': 'seed', + + # Not in identifiers.org: + 'metacyc': 'metacyc', + 'mnx': 'mnx', +} + + +def resolve_namespace(name, chemical): + '''Maps name to distinct namespace from identifiers.org.''' + namespace = __CHEMICAL_NAMESPACE if chemical else __REACTION_NAMESPACE + return namespace[name] if name in namespace else None diff --git a/metanetx_uniprot/ncbi_taxonomy_utils.py b/metanetx_uniprot/ncbi_taxonomy_utils.py new file mode 100644 index 00000000..8b7bd1d4 --- /dev/null +++ b/metanetx_uniprot/ncbi_taxonomy_utils.py @@ -0,0 +1,93 @@ +''' +SYNBIOCHEM-DB (c) University of Manchester 2015 + +SYNBIOCHEM-DB is licensed under the MIT License. + +To view a copy of this license, visit . 
+ +@author: neilswainston +''' +import os +import sys +import tarfile +import tempfile +import urllib +from urllib.request import urlretrieve + + +__NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz' + + +def load(writer, array_delimiter, source=__NCBITAXONOMY_URL): + '''Loads NCBI Taxonomy data.''' + nodes_filename, names_filename = _get_ncbi_taxonomy_files(source) + nodes, rels = _parse_nodes(nodes_filename, array_delimiter) + _parse_names(nodes, names_filename, array_delimiter) + + writer.write_nodes(nodes.values(), 'Organism') + writer.write_rels(rels, 'Organism', 'Organism') + + +def _get_ncbi_taxonomy_files(source): + '''Downloads and extracts NCBI Taxonomy files.''' + temp_dir = tempfile.gettempdir() + temp_gzipfile = tempfile.NamedTemporaryFile() + urlretrieve(source, temp_gzipfile.name) + + temp_tarfile = tarfile.open(temp_gzipfile.name, 'r:gz') + temp_tarfile.extractall(temp_dir) + + temp_gzipfile.close() + temp_tarfile.close() + + return os.path.join(temp_dir, 'nodes.dmp'), \ + os.path.join(temp_dir, 'names.dmp') + + +def _parse_nodes(filename, array_delimiter): + '''Parses nodes file.''' + nodes = {} + rels = [] + + with open(filename, 'r') as textfile: + for line in textfile: + tokens = [x.strip() for x in line.split('|')] + tax_id = tokens[0] + + if tax_id != '1': + rels.append([tax_id, 'is_a', tokens[1]]) + + nodes[tax_id] = {'taxonomy:ID(Organism)': tax_id, + ':LABEL': + 'Organism' + array_delimiter + tokens[2]} + + return nodes, rels + + +def _parse_names(nodes, filename, array_delimiter): + '''Parses names file.''' + + with open(filename, 'r') as textfile: + for line in textfile: + tokens = [x.strip() for x in line.split('|')] + node = nodes[tokens[0]] + + if 'name' not in node: + node['name'] = tokens[1] + node['names:string[]'] = set([node['name']]) + else: + node['names:string[]'].add(tokens[1]) + + for _, node in nodes.items(): + if 'names:string[]' in node: + node['names:string[]'] = \ + array_delimiter.join(node['names:string[]']) + + +def main(argv): + '''main method''' + load(*argv) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/metanetx_uniprot/reaction_utils.py b/metanetx_uniprot/reaction_utils.py new file mode 100644 index 00000000..2a6d9394 --- /dev/null +++ b/metanetx_uniprot/reaction_utils.py @@ -0,0 +1,82 @@ +''' +SYNBIOCHEM-DB (c) University of Manchester 2015 + +SYNBIOCHEM-DB is licensed under the MIT License. + +To view a copy of this license, visit . 
+ +@author: neilswainston +''' +from enzyme_utils import EnzymeManager + + +class ReactionManager(object): + '''Class to implement a manager of Reaction data.''' + + def __init__(self): + '''Constructor.''' + self.__nodes = {} + self.__reac_ids = {} + self.__reac_enz_rels = [] + self.__org_enz_rels = [] + self.__enz_man = EnzymeManager() + + def write_files(self, writer): + '''Write neo4j import files.''' + return ([writer.write_nodes(self.__nodes.values(), + 'Reaction'), + writer.write_nodes(self.__enz_man.get_nodes(), + 'Enzyme')], + [writer.write_rels(self.__reac_enz_rels, + 'Reaction', 'Enzyme'), + writer.write_rels(self.__enz_man.get_org_enz_rels(), + 'Organism', 'Enzyme')]) + + def add_reaction(self, source, reac_id, properties): + '''Adds a reaction to the collection of nodes, ensuring uniqueness.''' + reac_id = self.__reac_ids[source + reac_id] \ + if source + reac_id in self.__reac_ids else reac_id + + if reac_id not in self.__nodes: + properties[':LABEL'] = 'Reaction' + properties['id:ID(Reaction)'] = reac_id + properties['source'] = source + properties[source] = reac_id + self.__nodes[reac_id] = properties + + if 'mnx' in properties: + self.__reac_ids['mnx' + properties['mnx']] = reac_id + + if 'kegg.reaction' in properties: + self.__reac_ids[ + 'kegg.reaction' + properties['kegg.reaction']] = reac_id + + if 'rhea' in properties: + self.__reac_ids['rhea' + properties['rhea']] = reac_id + else: + self.__nodes[reac_id].update(properties) + + return reac_id + + def add_react_to_enz(self, data, source, num_threads=0): + '''Submit data to the graph.''' + # Create Reaction and Enzyme nodes: + enzyme_ids = self.__create_react_enz(data, source) + + # Create Enzyme nodes: + self.__enz_man.add_uniprot_data(enzyme_ids, source, num_threads) + + def __create_react_enz(self, data, source): + '''Creates Reaction and Enzyme nodes and their Relationships.''' + enzyme_ids = [] + + for reac_id, uniprot_ids in data.items(): + reac_id = self.add_reaction(source, reac_id, {}) + + for uniprot_id in uniprot_ids: + enzyme_ids.append(uniprot_id) + self.__reac_enz_rels.append([reac_id, 'catalysed_by', + uniprot_id, + {'source': source}]) + + return list(set(enzyme_ids)) diff --git a/metanetx_uniprot/rhea_utils.py b/metanetx_uniprot/rhea_utils.py new file mode 100644 index 00000000..3d926091 --- /dev/null +++ b/metanetx_uniprot/rhea_utils.py @@ -0,0 +1,63 @@ +''' +SYNBIOCHEM-DB (c) University of Manchester 2015 + +SYNBIOCHEM-DB is licensed under the MIT License. + +To view a copy of this license, visit . 
+ +@author: neilswainston +''' +import tempfile +import urllib +from urllib.request import urlretrieve + + +__RHEA_URL = 'ftp://ftp.expasy.org/databases/rhea/tsv/rhea2uniprot%5Fsprot.tsv' + + +def load(reaction_manager, source=__RHEA_URL, num_threads=0): + '''Loads Rhea data.''' + # Parse data: + temp_file = tempfile.NamedTemporaryFile() + urlretrieve(source, temp_file.name) + data = _parse(temp_file.name) + ''' + ###For testing, uncomment the following code + data_small = dict() + for key in sorted(data)[:50]: + data_small[key] = data[key] + data.clear() + data.update(data_small) + ''' + ######Not sure why source is Rhea here, calls to UniProt + reaction_manager.add_react_to_enz(data, 'rhea', num_threads) + + +def _parse(filename): + '''Parses file.''' + data = {} + + with open(filename, 'r') as textfile: + next(textfile) + + for line in textfile: + tokens = line.split('\t') + + if len(tokens) == 4: + uniprot_id = tokens[3].strip() + + if not tokens[0] or not tokens[2]: + print(','.join(tokens)) + + _add(data, tokens[0], uniprot_id) + _add(data, tokens[2], uniprot_id) + + return data + + +def _add(data, rhea_id, uniprot_id): + '''Adds Rhea id and Uniprot id to data.''' + if rhea_id in data: + data[rhea_id].append(uniprot_id) + else: + data[rhea_id] = [uniprot_id] diff --git a/metanetx_uniprot/seq_utils.py b/metanetx_uniprot/seq_utils.py new file mode 100644 index 00000000..892b3a6c --- /dev/null +++ b/metanetx_uniprot/seq_utils.py @@ -0,0 +1,112 @@ +''' +synbiochem (c) University of Manchester 2015 + +synbiochem is licensed under the MIT License. + +To view a copy of this license, visit . + +@author: neilswainston +''' +from collections import defaultdict +import itertools +import operator +import os +import random +import re +import ssl +from subprocess import call +import tempfile +from urllib import parse + +from Bio import Seq, SeqIO, SeqRecord +from Bio.Blast import NCBIXML +from Bio.Data import CodonTable +from Bio.Restriction import Restriction, Restriction_Dictionary +from Bio.SeqUtils.MeltingTemp import Tm_NN +import requests +from synbiochem.biochem4j import taxonomy +from synbiochem.utils import thread_utils +import queue + +import numpy as np + +def get_uniprot_values(uniprot_ids, fields, batch_size, verbose=False, + num_threads=0): + '''Gets dictionary of ids to values from Uniprot.''' + values = [] + + if num_threads: + thread_pool = thread_utils.ThreadPool(num_threads) + + for i in range(0, len(uniprot_ids), batch_size): + thread_pool.add_task(_get_uniprot_batch, uniprot_ids, i, + batch_size, fields, values, verbose) + + thread_pool.wait_completion() + else: + for i in range(0, len(uniprot_ids), batch_size): + _get_uniprot_batch(uniprot_ids, i, batch_size, fields, values, + verbose) + + return {value['Entry']: value for value in values} + + +def search_uniprot(query, fields, limit=128): + '''Gets dictionary of ids to values from Uniprot.''' + values = [] + + url = 'http://www.uniprot.org/uniprot/?query=' + parse.quote(query) + \ + '&sort=score&limit=' + str(limit) + \ + '&format=tab&columns=id,' + ','.join([parse.quote(field) + for field in fields]) + + _parse_uniprot_data(url, values) + + return values + + +def _get_uniprot_batch(uniprot_ids, i, batch_size, fields, values, verbose): + '''Get batch of Uniprot data.''' + if verbose: + print('seq_utils: getting Uniprot values ' + str(i) + ' - ' + + str(min(i + batch_size, len(uniprot_ids))) + ' / ' + + str(len(uniprot_ids))) + + #If getting values in batch Remove 'accession:' + from start of join([HERE .....]) and accession: 
from query=HERE + batch = uniprot_ids[i:min(i + batch_size, len(uniprot_ids))] + query = '%20OR%20'.join(['accession:' + uniprot_id for uniprot_id in batch]) + url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \ + '&format=tsv&fields=accession%2C' + '%2C'.join([parse.quote(field) + for field in fields]) + + _parse_uniprot_data(url, values) + + +def _parse_uniprot_data(url, values): + '''Parses Uniprot data.''' + headers = None + + try: + resp = requests.get(url, allow_redirects=True) + + for line in resp.iter_lines(): + line = line.decode('utf-8') + tokens = line.strip().split('\t') + + if headers is None: + headers = tokens + else: + resp = dict(zip(headers, tokens)) + + if 'Protein names' in resp: + regexp = re.compile(r'(?<=\()[^)]*(?=\))|^[^(][^()]*') + names = regexp.findall(resp.pop('Protein names')) + resp['Protein names'] = [nme.strip() for nme in names] + + for key in resp: + if key.startswith('Cross-reference'): + resp[key] = resp[key].split(';') + + values.append(resp) + except Exception as err: + print(err) \ No newline at end of file diff --git a/metanetx_uniprot/spectra_utils.py b/metanetx_uniprot/spectra_utils.py new file mode 100644 index 00000000..1efce1bb --- /dev/null +++ b/metanetx_uniprot/spectra_utils.py @@ -0,0 +1,122 @@ +''' +SYNBIOCHEM-DB (c) University of Manchester 2015 + +SYNBIOCHEM-DB is licensed under the MIT License. + +To view a copy of this license, visit . + +@author: neilswainston +''' +import os +import tempfile +import urllib +import zipfile +from urllib.request import urlretrieve + +import ijson + + +__MONA_URL = 'http://mona.fiehnlab.ucdavis.edu/rest/downloads/retrieve/' + \ + 'd2eb33f0-b22e-49a7-bc31-eb951f8347b2' + +__MONA_FILENAME = 'MoNA-export-All_Spectra.json' + +_NAME_MAP = {'kegg': 'kegg.compound', + 'molecular formula': 'formula', + 'total exact mass': 'monoisotopic_mass:float'} + + +def load(writer, chem_manager, + array_delimiter='|', url=__MONA_URL, filename=__MONA_FILENAME): + '''Build Spectrum nodes and relationships.''' + nodes = [] + rels = [] + + records = _parse(_get_file(url, filename), array_delimiter) + + for record in records: + chem_id, _ = chem_manager.add_chemical(record['chemical']) + nodes.append(record['spectrum']) + rels.append([chem_id, 'has', record['spectrum']['id:ID(Spectrum)']]) + + return [writer.write_nodes(nodes, 'Spectrum')], \ + [writer.write_rels(rels, 'Chemical', 'Spectrum')] + + +def _parse(filename, array_delimiter): + '''Parses MoNA json file.''' + records = [] + record = {'chemical': {'names:string[]': []}, + 'spectrum': {':LABEL': 'Spectrum', 'tags:string[]': []}} + name = None + + for prefix, typ, value in ijson.parse(open(filename)): + if prefix == 'item' and typ == 'start_map': + record = {'chemical': {'names:string[]': []}, + 'spectrum': {':LABEL': 'Spectrum', + 'tags:string[]': []}} + elif prefix == 'item.compound.item.inchi': + record['chemical']['inchi'] = value + elif prefix == 'item.compound.item.names.item.name': + if 'name' not in record['chemical']: + record['chemical']['name'] = value + record['chemical']['names:string[]'].append(value) + elif prefix == 'item.compound.item.metaData.item.name' or \ + prefix == 'item.metaData.item.name': + name = _normalise_name(value.lower()) + elif prefix == 'item.compound.item.metaData.item.value': + _parse_compound_metadata(name, value, record) + name = None + elif prefix == 'item.id': + record['spectrum']['id:ID(Spectrum)'] = value + elif prefix == 'item.metaData.item.value': + record['spectrum'][name] = value + name = None + elif prefix == 
'item.spectrum': + values = [float(val) for term in value.split() + for val in term.split(':')] + record['spectrum']['m/z:float[]'] = \ + array_delimiter.join(map(str, values[0::2])) + record['spectrum']['I:float[]'] = \ + array_delimiter.join(map(str, values[1::2])) + elif prefix == 'item.tags.item.text': + record['spectrum']['tags:string[]'].append(value) + elif prefix == 'item' and typ == 'end_map': + records.append(record) + + return records + + +def _get_file(url, filename): + '''Gets file from url.''' + destination = os.path.join(os.path.expanduser('~'), 'MoNA') + + if not os.path.exists(destination): + os.makedirs(destination) + + filepath = os.path.join(destination, filename) + + if not os.path.exists(filepath): + tmp_file = tempfile.NamedTemporaryFile(delete=False) + urlretrieve(url, tmp_file.name) + zfile = zipfile.ZipFile(tmp_file.name, 'r') + filepath = os.path.join(destination, zfile.namelist()[0]) + zfile.extractall(destination) + + return filepath + + +def _parse_compound_metadata(name, value, record): + '''Parses compound metadata.''' + if name == 'chebi' and isinstance(value, str): + value = value.replace('CHEBI:', '').split()[0] + + record['chemical'][_normalise_name(name)] = value + + +def _normalise_name(name): + '''Normalises name in name:value pairs.''' + if name in _NAME_MAP: + return _NAME_MAP[name] + + return name.replace(':', '_') diff --git a/metanetx_uniprot/test/__init__.py b/metanetx_uniprot/test/__init__.py new file mode 100644 index 00000000..e0aa1f5e --- /dev/null +++ b/metanetx_uniprot/test/__init__.py @@ -0,0 +1,9 @@ +''' +synbiochem (c) University of Manchester 2015 + +synbiochem is licensed under the MIT License. + +To view a copy of this license, visit . + +@author: neilswainston +''' diff --git a/metanetx_uniprot/test/test_enzyme_utils.py b/metanetx_uniprot/test/test_enzyme_utils.py new file mode 100644 index 00000000..c0318f65 --- /dev/null +++ b/metanetx_uniprot/test/test_enzyme_utils.py @@ -0,0 +1,39 @@ +''' +synbiochem (c) University of Manchester 2015 + +synbiochem is licensed under the MIT License. + +To view a copy of this license, visit . + +@author: neilswainston +''' +# pylint: disable=too-many-public-methods +import unittest + +from sbcdb.enzyme_utils import EnzymeManager + + +class TestEnzymeManager(unittest.TestCase): + '''Test class for EnzymeManager.''' + + def setUp(self): + unittest.TestCase.setUp(self) + self.__manager = EnzymeManager() + + def test_add_uniprot_data(self): + '''Tests add_uniprot_data method.''' + enzyme_ids = ['P19367', 'Q2KNB7'] + + # Test unthreaded: + self.__manager.add_uniprot_data(enzyme_ids, source='test') + self.assertEqual(len(enzyme_ids), len(self.__manager.get_nodes())) + + # Test threaded: + self.__manager.add_uniprot_data(enzyme_ids, source='test', + num_threads=24) + self.assertEqual(len(enzyme_ids), len(self.__manager.get_nodes())) + + +if __name__ == "__main__": + # import sys;sys.argv = ['', 'Test.testName'] + unittest.main() diff --git a/metanetx_uniprot/test/test_mnxref_utils.py b/metanetx_uniprot/test/test_mnxref_utils.py new file mode 100644 index 00000000..88a3da23 --- /dev/null +++ b/metanetx_uniprot/test/test_mnxref_utils.py @@ -0,0 +1,37 @@ +''' +synbiochem (c) University of Manchester 2015 + +synbiochem is licensed under the MIT License. + +To view a copy of this license, visit . 
+ +@author: neilswainston +''' +# pylint: disable=too-many-public-methods +import unittest + +from sbcdb.mnxref_utils import MnxRefReader + + +class TestMnxRefReader(unittest.TestCase): + '''Test class for MnxRefReader.''' + + def setUp(self): + unittest.TestCase.setUp(self) + reader = MnxRefReader() + self.__chem_data = reader.get_chem_data() + self.__reac_data = reader.get_reac_data() + + def test_get_chem_data(self): + '''Tests get_chem_data method.''' + self.assertEquals(self.__chem_data['MNXM1354']['chebi'], 'CHEBI:58282') + + def test_get_reac_data(self): + '''Tests get_chem_data method.''' + eqn = '1 MNXM1 + 1 MNXM6 + 1 MNXM97401 = 1 MNXM5 + 1 MNXM97393' + self.assertEquals(self.__reac_data['MNXR62989']['equation'], eqn) + + +if __name__ == "__main__": + # import sys;sys.argv = ['', 'Test.testName'] + unittest.main() diff --git a/metanetx_uniprot/utils.py b/metanetx_uniprot/utils.py new file mode 100644 index 00000000..67639e71 --- /dev/null +++ b/metanetx_uniprot/utils.py @@ -0,0 +1,73 @@ +''' +synbiochem (c) University of Manchester 2016 + +synbiochem is licensed under the MIT License. + +To view a copy of this license, visit . + +@author: neilswainston +''' +# pylint: disable=invalid-name +# pylint: disable=too-many-arguments +import os +from shutil import rmtree + +import pandas as pd + + +class Writer(object): + '''CSV file writer class for biochem4j files.''' + + def __init__(self, dest_dir): + self.__nodes_dir = os.path.join(os.path.abspath(dest_dir), 'nodes') + self.__rels_dir = os.path.join(os.path.abspath(dest_dir), 'rels') + + if os.path.exists(self.__nodes_dir): + rmtree(self.__nodes_dir) + + os.makedirs(self.__nodes_dir) + + if os.path.exists(self.__rels_dir): + rmtree(self.__rels_dir) + + os.makedirs(self.__rels_dir) + + def write_nodes(self, nodes, group, separator=';'): + '''Writes Nodes to csv file.''' + if not nodes: + return None + + df = pd.DataFrame(nodes) + df.dropna(axis=1, how='all', inplace=True) + + filename = os.path.join(self.__nodes_dir, group + '.csv') + df.to_csv(filename, index=False, encoding='utf-8', sep=separator) + print('just wrote: ',filename) + + return filename + + def write_rels(self, rels, group_start, group_end, separator=';'): + '''Writes Relationships to csv file.''' + if not rels: + return None + + columns = [':START_ID(' + group_start + ')', + ':TYPE', + ':END_ID(' + group_end + ')'] + + if len(rels[0]) > 3: + columns.append('PROPERTIES') + + df = pd.DataFrame(rels, columns=columns) + + if len(rels[0]) > 3: + props_df = pd.DataFrame(list(df['PROPERTIES'])) + df.drop('PROPERTIES', axis=1, inplace=True) + df = df.join(props_df) + + filename = os.path.join(self.__rels_dir, + group_start + '_' + group_end + '.csv') + df.to_csv(filename, index=False, encoding='utf-8', sep=separator) + print('just wrote: ',filename) + + return filename From 2ca61d9a57af567a5c9e91243a2d9202e2039dc4 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Thu, 16 Mar 2023 19:11:46 -0600 Subject: [PATCH 03/29] Update README.md --- metanetx_uniprot/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/metanetx_uniprot/README.md b/metanetx_uniprot/README.md index 1165c3cc..31028033 100644 --- a/metanetx_uniprot/README.md +++ b/metanetx_uniprot/README.md @@ -1 +1,16 @@ +# MetaNetX and UniProt Content + Code is reused from Biochem4j: https://github.com/neilswainston/biochem4j/tree/master/sbcdb + +Access chemical, reaction, enzyme, and organism information from the following sources: +- libchebipy +- 
NCBITaxonomy +- MetaNetX +- Rhea +- UniProt + +To run: + +``` +python build.py ~/biochem4j ',' 1 +``` From 4f7e473d108915308ca65117b9acc69856d43570 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Thu, 16 Mar 2023 19:12:24 -0600 Subject: [PATCH 04/29] Update build.py --- metanetx_uniprot/build.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/metanetx_uniprot/build.py b/metanetx_uniprot/build.py index c276e641..4376ea1a 100644 --- a/metanetx_uniprot/build.py +++ b/metanetx_uniprot/build.py @@ -20,7 +20,7 @@ def build_csv(dest_dir, array_delimiter, num_threads): # Get Organism data: print('Parsing NCBI Taxonomy') - #ncbi_taxonomy_utils.load(writer, array_delimiter) + ncbi_taxonomy_utils.load(writer, array_delimiter) # Get Chemical and Reaction data. # Write chemistry csv files: @@ -28,19 +28,19 @@ def build_csv(dest_dir, array_delimiter, num_threads): reac_man = reaction_utils.ReactionManager() - #print('Parsing MNXref') + print('Parsing MNXref') mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer) mnx_loader.load() print('Parsing ChEBI') - #chebi_utils.load(chem_man, writer) + chebi_utils.load(chem_man, writer) ####Using all memory (120+Gb) and eventually is killed # Get Spectrum data: #print('Parsing spectrum data') #spectra_utils.load(writer, chem_man, array_delimiter=array_delimiter) - #chem_man.write_files(writer) + chem_man.write_files(writer) ####Not including KEGG for now # Get Reaction / Enzyme / Organism data: @@ -49,8 +49,8 @@ def build_csv(dest_dir, array_delimiter, num_threads): print('Parsing Rhea') - #rhea_utils.load(reac_man, num_threads=num_threads) - #reac_man.write_files(writer) + rhea_utils.load(reac_man, num_threads=num_threads) + reac_man.write_files(writer) def main(args): From fe005b52ba3fa341be8fd6bd5d01bac2f0ae3865 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Thu, 16 Mar 2023 19:13:45 -0600 Subject: [PATCH 05/29] Add files via upload --- notebooks/Uniprot_API_test.ipynb | 303 +++++++++++++++++++++++++++++++ 1 file changed, 303 insertions(+) create mode 100644 notebooks/Uniprot_API_test.ipynb diff --git a/notebooks/Uniprot_API_test.ipynb b/notebooks/Uniprot_API_test.ipynb new file mode 100644 index 00000000..c5eec384 --- /dev/null +++ b/notebooks/Uniprot_API_test.ipynb @@ -0,0 +1,303 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 94, + "id": "underlying-necessity", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "class ReactionManager(object):\n", + " #Class to implement a manager of Reaction data.\n", + "\n", + " def __init__(self):\n", + " #Constructor.\n", + " self.__nodes = {}\n", + " self.__reac_ids = {}\n", + " self.__reac_enz_rels = []\n", + " self.__org_enz_rels = []\n", + " self.__enz_man = EnzymeManager()\n", + "'''\n", + "\n", + "\n", + "def add_uniprot_data(enzyme_ids, source, num_threads=0):\n", + " print(enzyme_ids)\n", + " '''Gets Uniprot data.'''\n", + "\n", + " #fields = ['entry name', 'protein names', 'organism-id', 'ec']\n", + " fields = ['id', 'protein_name', 'organism_id', 'ec']\n", + " #enzyme_ids = [enzyme_id for enzyme_id in enzyme_ids if enzyme_id not in self.__nodes]\n", + " uniprot_values = get_uniprot_values(enzyme_ids, fields,batch_size=128,verbose=False,num_threads=num_threads)\n", + "\n", + " print('add_uniprot_data function: added uniprot values: ',len(uniprot_values))\n", + "\n", + "\n", + "\n", + " for uniprot_id, uniprot_value in 
uniprot_values.items():\n", + " enzyme_node = {':LABEL': 'Enzyme',\n", + " 'uniprot:ID(Enzyme)': uniprot_id}\n", + " #self.__nodes[uniprot_id] = enzyme_node\n", + "\n", + " organism_id = uniprot_value.pop('Organism (ID)') \\\n", + " if 'Organism (ID)' in uniprot_value else None\n", + "\n", + " if 'Entry name' in uniprot_value:\n", + " enzyme_node['entry'] = uniprot_value['Entry name']\n", + "\n", + " if 'Protein names' in uniprot_value:\n", + " enzyme_node['names'] = uniprot_value['Protein names']\n", + "\n", + " if enzyme_node['names']:\n", + " enzyme_node['name'] = enzyme_node['names'][0]\n", + "\n", + " if 'EC number' in uniprot_value:\n", + " enzyme_node['ec-code'] = uniprot_value['EC number']\n", + "\n", + " #if organism_id:\n", + " #self.__org_enz_rels.append([organism_id, 'expresses',uniprot_id, {'source': source}])\n", + " \n", + "\n", + "def get_uniprot_values(uniprot_ids, fields, batch_size, verbose=False, num_threads=0):\n", + " values = []\n", + "\n", + " if num_threads:\n", + " thread_pool = thread_utils.ThreadPool(num_threads)\n", + "\n", + " for i in range(0, len(uniprot_ids), batch_size):\n", + " thread_pool.add_task(_get_uniprot_batch, uniprot_ids, i,batch_size, fields, values, verbose)\n", + "\n", + " thread_pool.wait_completion()\n", + " else:\n", + " for i in range(0, len(uniprot_ids), batch_size):\n", + " _get_uniprot_batch(uniprot_ids, i, batch_size, fields, values,verbose)\n", + "\n", + " return {value['Entry']: value for value in values}\n", + "\n", + "\n", + "\n", + "def _get_uniprot_batch(uniprot_ids, i, batch_size, fields, values, verbose):\n", + " '''Get batch of Uniprot data.'''\n", + " if verbose:\n", + " print('seq_utils: getting Uniprot values ' + str(i) + ' - ' +\n", + " str(min(i + batch_size, len(uniprot_ids))) + ' / ' +\n", + " str(len(uniprot_ids)))\n", + "\n", + " #If getting values in batch Remove 'accession:' + from start of join([HERE .....]) and accession: from query=HERE\n", + " batch = uniprot_ids[i:min(i + batch_size, len(uniprot_ids))]\n", + " query = '%20OR%20'.join(['accession:' + uniprot_id for uniprot_id in batch])\n", + " url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \\\n", + " '&format=tsv&fields=accession%2C' + '%2C'.join([parse.quote(field)\n", + " for field in fields])\n", + "\n", + " print(url)\n", + "\n", + " _parse_uniprot_data(url, values)\n", + " \n", + " \n", + "def _parse_uniprot_data(url, values):\n", + " '''Parses Uniprot data.'''\n", + " headers = None\n", + "\n", + " try:\n", + " resp = requests.get(url, allow_redirects=True)\n", + "\n", + " for line in resp.iter_lines():\n", + " line = line.decode('utf-8')\n", + " tokens = line.strip().split('\\t')\n", + "\n", + " if headers is None:\n", + " headers = tokens\n", + " else:\n", + " resp = dict(zip(headers, tokens))\n", + "\n", + " if 'Protein names' in resp:\n", + " regexp = re.compile(r'(?<=\\()[^)]*(?=\\))|^[^(][^()]*')\n", + " names = regexp.findall(resp.pop('Protein names'))\n", + " resp['Protein names'] = [nme.strip() for nme in names]\n", + "\n", + " for key in resp:\n", + " if key.startswith('Cross-reference'):\n", + " resp[key] = resp[key].split(';')\n", + " values.append(resp)\n", + " print('values from parse_uniprot_data: ',type(values))\n", + " return values\n", + " except Exception as err:\n", + " print(err)\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "russian-dispatch", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['B4RBW1', 'A9BIS7', 'B5Z3E3']\n", + 
"https://rest.uniprot.org/uniprotkb/search?query=accession:B4RBW1%20OR%20accession:A9BIS7%20OR%20accession:B5Z3E3&format=tsv&fields=accession%2Cid%2Cprotein_name%2Corganism_id%2Cec\n", + "add_uniprot_data function: added uniprot values: 3\n" + ] + } + ], + "source": [ + "### Query by protein ID\n", + "\n", + "\n", + "from urllib import parse\n", + "import requests\n", + "import re\n", + "\n", + "\n", + "num_threads = 1\n", + "source = 'rhea'\n", + "enzyme_ids = ['B4RBW1', 'A9BIS7', 'B5Z3E3']\n", + "\n", + "add_uniprot_data(enzyme_ids, source)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "removable-gibraltar", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "#Download then work with it\n", + "\n", + "def add_uniprot_data_organism(organism_ids, source, num_threads=0):\n", + " print(organism_ids)\n", + " '''Gets Uniprot data.'''\n", + "\n", + " #fields = ['entry name', 'protein names', 'organism-id', 'ec']\n", + " fields = ['id', 'protein_name', 'organism_id', 'ec']\n", + " #enzyme_ids = [enzyme_id for enzyme_id in enzyme_ids if enzyme_id not in self.__nodes]\n", + " organism_values = get_uniprot_values_organism(organism_ids, fields,batch_size=128,verbose=False,num_threads=num_threads)\n", + "\n", + " print('add_uniprot_data function: added uniprot values: ',len(organism_values))\n", + "\n", + "\n", + "\n", + " for uniprot_id, uniprot_value in organism_values.items():\n", + " enzyme_node = {':LABEL': 'Enzyme',\n", + " 'uniprot:ID(Enzyme)': uniprot_id}\n", + " #self.__nodes[uniprot_id] = enzyme_node\n", + "\n", + " organism_id = uniprot_value.pop('Organism (ID)') \\\n", + " if 'Organism (ID)' in uniprot_value else None\n", + "\n", + " if 'Entry name' in uniprot_value:\n", + " enzyme_node['entry'] = uniprot_value['Entry name']\n", + "\n", + " if 'Protein names' in uniprot_value:\n", + " enzyme_node['names'] = uniprot_value['Protein names']\n", + "\n", + " if enzyme_node['names']:\n", + " enzyme_node['name'] = enzyme_node['names'][0]\n", + "\n", + " if 'EC number' in uniprot_value:\n", + " enzyme_node['ec-code'] = uniprot_value['EC number']\n", + "\n", + " #if organism_id:\n", + " #self.__org_enz_rels.append([organism_id, 'expresses',uniprot_id, {'source': source}])\n", + " \n", + " return organism_values\n", + "\n", + "def get_uniprot_values_organism(organism_ids, fields, batch_size, verbose=False, num_threads=0):\n", + " values = []\n", + "\n", + " for i in range(0, len(organism_ids), batch_size):\n", + " values = _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values,verbose)\n", + "\n", + " return {value['Organism (ID)']: value for value in values}\n", + "\n", + "\n", + "def _get_uniprot_batch_organism(uniprot_ids, i, batch_size, fields, values, verbose):\n", + " '''Get batch of Uniprot data.'''\n", + " if verbose:\n", + " print('seq_utils: getting Uniprot values ' + str(i) + ' - ' +\n", + " str(min(i + batch_size, len(uniprot_ids))) + ' / ' +\n", + " str(len(uniprot_ids)))\n", + "\n", + " #If getting values in batch Remove 'accession:' + from start of join([HERE .....]) and accession: from query=HERE\n", + " batch = uniprot_ids[i:min(i + batch_size, len(uniprot_ids))]\n", + " query = '%20OR%20'.join(['organism_id:' + uniprot_id for uniprot_id in batch])\n", + " url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \\\n", + " '&format=tsv&fields=organism_id%2C' + '%2C'.join([parse.quote(field)\n", + " for field in fields])\n", + "\n", + " print('_get_uniprot_batch_organism url: ',url)\n", + "\n", + " values = 
_parse_uniprot_data(url, values)\n", + " return values\n", + " \n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "removed-unemployment", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['226900', '296591']\n", + "_get_uniprot_batch_organism url: https://rest.uniprot.org/uniprotkb/search?query=organism_id:226900%20OR%20organism_id:296591&format=tsv&fields=organism_id%2Cid%2Cprotein_name%2Corganism_id%2Cec\n", + "values from parse_uniprot_data: \n", + "add_uniprot_data function: added uniprot values: 2\n", + "{'226900': {'Entry Name': 'GLMU_BACCR', 'EC number': '2.3.1.157; 2.7.7.23', 'Protein names': ['Bifunctional protein GlmU [Includes: UDP-N-acetylglucosamine pyrophosphorylase', 'EC 2.7.7.23', 'N-acetylglucosamine-1-phosphate uridyltransferase', 'EC 2.3.1.157']}, '296591': {'Entry Name': 'RLMN_POLSJ', 'EC number': '2.1.1.192', 'Protein names': ['Dual-specificity RNA methyltransferase RlmN', 'EC 2.1.1.192', '23S rRNA (adenine(2503', '2', '23S rRNA m2A2503 methyltransferase', 'Ribosomal RNA large subunit methyltransferase N', 'tRNA (adenine(37', '2', 'tRNA m2A37 methyltransferase']}}\n" + ] + } + ], + "source": [ + "### Query by organism ID\n", + "\n", + "#query = 'https://rest.uniprot.org/uniprotkb/search?query=organism_id:226900'\n", + "\n", + "\n", + "source = 'rhea'\n", + "organism_ids = ['226900','296591']\n", + "\n", + "organism_values = add_uniprot_data_organism(organism_ids, source)\n", + "\n", + "print(organism_values)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "pleased-coaching", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From e3cf90a5251aef3950ddcdd63fc1a307040f3281 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Mon, 20 Mar 2023 16:42:09 -0600 Subject: [PATCH 06/29] Update ncbi_taxonomy_utils.py Update to use ncbi_taxon input from kg-microbe --- metanetx_uniprot/ncbi_taxonomy_utils.py | 49 +++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/metanetx_uniprot/ncbi_taxonomy_utils.py b/metanetx_uniprot/ncbi_taxonomy_utils.py index 8b7bd1d4..5fbc603f 100644 --- a/metanetx_uniprot/ncbi_taxonomy_utils.py +++ b/metanetx_uniprot/ncbi_taxonomy_utils.py @@ -14,15 +14,23 @@ import urllib from urllib.request import urlretrieve +from kgx.cli.cli_utils import transform +import pandas as pd + __NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz' def load(writer, array_delimiter, source=__NCBITAXONOMY_URL): '''Loads NCBI Taxonomy data.''' - nodes_filename, names_filename = _get_ncbi_taxonomy_files(source) - nodes, rels = _parse_nodes(nodes_filename, array_delimiter) - _parse_names(nodes, names_filename, array_delimiter) + #nodes_filename, names_filename = _get_ncbi_taxonomy_files(source) + ####Update filepath accordingly + nodes_filename = '~/kg_microbe/kg-microbe/data/raw/ncbitaxon.json' + #nodes, rels = _parse_nodes(nodes_filename, array_delimiter) + print('parsing ncbi taxon json file') + kgx_nodes_json,kgx_edges_json = 
_parse_nodes_kgmicrobe(nodes_filename, array_delimiter) + nodes,rels = transform_kgx_output_format(kgx_nodes_json,kgx_edges_json) + #_parse_names(nodes, names_filename, array_delimiter) writer.write_nodes(nodes.values(), 'Organism') writer.write_rels(rels, 'Organism', 'Organism') @@ -43,6 +51,40 @@ def _get_ncbi_taxonomy_files(source): return os.path.join(temp_dir, 'nodes.dmp'), \ os.path.join(temp_dir, 'names.dmp') +def _parse_nodes_kgmicrobe(filename, array_delimiter): + '''Parses nodes file.''' + + ####Update filepath accordingly + output_dir = '~/biochem4j/' + name = 'ncbitaxon_transformed' + + transform(inputs=[filename], input_format='obojson', output= os.path.join(output_dir, name), output_format='tsv') + + return output_dir+name+'_nodes.tsv',output_dir+name+'_edges.tsv' + +def transform_kgx_output_format(transformed_nodes_tsv,transformed_edges_tsv): + + labels = pd.read_csv(transformed_nodes_tsv, sep = '\t', usecols = ['id','name']) + triples_df = pd.read_csv(transformed_edges_tsv,sep = '\t', usecols = ['subject', 'object', 'predicate']) + triples_df.columns.str.lower() + + nodes = {} + rels = [] + + for i in range(len(labels)): + tax_id = labels.iloc[i].loc['id'] + nodes[tax_id] = {'taxonomy:ID(Organism)': tax_id, + ':LABEL': + 'Organism,unknown'} + + for i in range(len(triples_df)): + s = triples_df.iloc[i].loc['subject'] + p = triples_df.iloc[i].loc['predicate'] + o = triples_df.iloc[i].loc['object'] + rels.append([s, p, o]) + + return nodes,rels + def _parse_nodes(filename, array_delimiter): '''Parses nodes file.''' @@ -61,6 +103,7 @@ def _parse_nodes(filename, array_delimiter): ':LABEL': 'Organism' + array_delimiter + tokens[2]} + print(list(nodes.values())[0:5]) return nodes, rels From 32b80d9d646d81481738dcbe62d6b2cad547fbf1 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Wed, 22 Mar 2023 16:08:06 -0600 Subject: [PATCH 07/29] Update build.py Update to ingest all uniprot/rhea relationships based on kg-microbe microbes - comment out all other code (for now) --- metanetx_uniprot/build.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/metanetx_uniprot/build.py b/metanetx_uniprot/build.py index 4376ea1a..3fce3299 100644 --- a/metanetx_uniprot/build.py +++ b/metanetx_uniprot/build.py @@ -17,30 +17,34 @@ def build_csv(dest_dir, array_delimiter, num_threads): '''Build database CSV files.''' writer = utils.Writer(dest_dir) + + reac_man = reaction_utils.ReactionManager() # Get Organism data: print('Parsing NCBI Taxonomy') - ncbi_taxonomy_utils.load(writer, array_delimiter) + ncbi_taxonomy_utils.load(reac_man, writer, array_delimiter) + # Get Chemical and Reaction data. 
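Two pandas details in transform_kgx_output_format above deserve care: triples_df.columns.str.lower() returns a new Index that is discarded (no column is actually renamed), and per-row .iloc access is slow on a taxonomy-sized edge table. A sketch with both addressed, assuming the *_nodes.tsv and *_edges.tsv columns that the KGX transform emits here:

    import pandas as pd

    def kgx_tsv_to_graph(nodes_tsv, edges_tsv):
        '''Build Organism nodes and rels from KGX node/edge TSV files.'''
        labels = pd.read_csv(nodes_tsv, sep='\t', usecols=['id', 'name'])
        triples = pd.read_csv(edges_tsv, sep='\t',
                              usecols=['subject', 'predicate', 'object'])
        triples.columns = triples.columns.str.lower()  # assign, do not discard

        nodes = {row.id: {'taxonomy:ID(Organism)': row.id,
                          ':LABEL': 'Organism,unknown'}
                 for row in labels.itertuples(index=False)}
        rels = [[row.subject, row.predicate, row.object]
                for row in triples.itertuples(index=False)]
        return nodes, rels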
# Write chemistry csv files: - chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter) - reac_man = reaction_utils.ReactionManager() + #chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter) + # May be duplicate line + #reac_man = reaction_utils.ReactionManager() - print('Parsing MNXref') - mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer) - mnx_loader.load() + #print('Parsing MNXref') + #mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer) + #mnx_loader.load() - print('Parsing ChEBI') - chebi_utils.load(chem_man, writer) + #print('Parsing ChEBI') + #chebi_utils.load(chem_man, writer) ####Using all memory (120+Gb) and eventually is killed # Get Spectrum data: #print('Parsing spectrum data') #spectra_utils.load(writer, chem_man, array_delimiter=array_delimiter) - chem_man.write_files(writer) + #chem_man.write_files(writer) ####Not including KEGG for now # Get Reaction / Enzyme / Organism data: From 008e24e8cdc8449113d72fbcbd0632f241a5f930 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Wed, 22 Mar 2023 16:09:49 -0600 Subject: [PATCH 08/29] Update ncbi_taxonomy_utils.py Update to ingest all uniprot/rhea relationships based on kg-microbe microbes - Use kg-microbe NCBITaxon.json input rather than ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz --- metanetx_uniprot/ncbi_taxonomy_utils.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/metanetx_uniprot/ncbi_taxonomy_utils.py b/metanetx_uniprot/ncbi_taxonomy_utils.py index 5fbc603f..f5ffe788 100644 --- a/metanetx_uniprot/ncbi_taxonomy_utils.py +++ b/metanetx_uniprot/ncbi_taxonomy_utils.py @@ -21,20 +21,24 @@ __NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz' -def load(writer, array_delimiter, source=__NCBITAXONOMY_URL): +def load(reaction_manager, writer, array_delimiter, source=__NCBITAXONOMY_URL): '''Loads NCBI Taxonomy data.''' + #Not used currently #nodes_filename, names_filename = _get_ncbi_taxonomy_files(source) - ####Update filepath accordingly - nodes_filename = '~/kg_microbe/kg-microbe/data/raw/ncbitaxon.json' #nodes, rels = _parse_nodes(nodes_filename, array_delimiter) + #_parse_names(nodes, names_filename, array_delimiter) + ####### + nodes_filename = '/Users/brooksantangelo/Documents/HunterLab/Exploration/kg_microbe/kg-microbe/data/raw/ncbitaxon.json' print('parsing ncbi taxon json file') kgx_nodes_json,kgx_edges_json = _parse_nodes_kgmicrobe(nodes_filename, array_delimiter) nodes,rels = transform_kgx_output_format(kgx_nodes_json,kgx_edges_json) - #_parse_names(nodes, names_filename, array_delimiter) writer.write_nodes(nodes.values(), 'Organism') writer.write_rels(rels, 'Organism', 'Organism') + print('adding organism-enzyme relationships') + reaction_manager.add_org_to_enz(nodes, 'uniprot') + def _get_ncbi_taxonomy_files(source): '''Downloads and extracts NCBI Taxonomy files.''' @@ -54,8 +58,7 @@ def _get_ncbi_taxonomy_files(source): def _parse_nodes_kgmicrobe(filename, array_delimiter): '''Parses nodes file.''' - ####Update filepath accordingly - output_dir = '~/biochem4j/' + output_dir = '/Users/brooksantangelo/Documents/HunterLab/biochem4j/biochem4j/' name = 'ncbitaxon_transformed' transform(inputs=[filename], input_format='obojson', output= os.path.join(output_dir, name), output_format='tsv') @@ -72,7 +75,7 @@ def transform_kgx_output_format(transformed_nodes_tsv,transformed_edges_tsv): rels = [] for i in range(len(labels)): - tax_id = 
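The reordering above is load-bearing: the ReactionManager must exist before the taxonomy load, because load() now registers organism-to-enzyme links through reaction_manager.add_org_to_enz. A minimal driver showing that wiring, assuming the modules from this patch series are importable; the delimiter default is illustrative:

    import ncbi_taxonomy_utils
    import reaction_utils
    import utils

    def run_organism_stage(dest_dir, array_delimiter=';'):
        '''Run only the Organism/Enzyme stage of the build.'''
        writer = utils.Writer(dest_dir)
        reac_man = reaction_utils.ReactionManager()   # created first...
        ncbi_taxonomy_utils.load(reac_man, writer,    # ...so load() can call
                                 array_delimiter)     # add_org_to_enz on it
        return reac_man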
labels.iloc[i].loc['id'] + tax_id = labels.iloc[i].loc['id'].split('NCBITaxon:')[1] nodes[tax_id] = {'taxonomy:ID(Organism)': tax_id, ':LABEL': 'Organism,unknown'} @@ -127,6 +130,7 @@ def _parse_names(nodes, filename, array_delimiter): array_delimiter.join(node['names:string[]']) + def main(argv): '''main method''' load(*argv) From 89bd3325e1f67fd0b735a581a09466662a087f03 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Wed, 22 Mar 2023 16:11:02 -0600 Subject: [PATCH 09/29] Update reaction_utils.py Update to ingest all uniprot/rhea relationships based on kg-microbe microbes - output Enzyme_Reaction.tsv and Organism_Enzyme.tsv based on kg-microbe nodes --> UniProt enzymes --> Rhea reactions --- metanetx_uniprot/reaction_utils.py | 46 +++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/metanetx_uniprot/reaction_utils.py b/metanetx_uniprot/reaction_utils.py index 2a6d9394..058875eb 100644 --- a/metanetx_uniprot/reaction_utils.py +++ b/metanetx_uniprot/reaction_utils.py @@ -9,6 +9,8 @@ ''' from enzyme_utils import EnzymeManager +from numpy import * + class ReactionManager(object): '''Class to implement a manager of Reaction data.''' @@ -18,6 +20,7 @@ def __init__(self): self.__nodes = {} self.__reac_ids = {} self.__reac_enz_rels = [] + self.__enz_reac_rels = [] self.__org_enz_rels = [] self.__enz_man = EnzymeManager() @@ -29,6 +32,8 @@ def write_files(self, writer): 'Enzyme')], [writer.write_rels(self.__reac_enz_rels, 'Reaction', 'Enzyme'), + writer.write_rels(self.__enz_reac_rels, + 'Enzyme', 'Reaction'), writer.write_rels(self.__enz_man.get_org_enz_rels(), 'Organism', 'Enzyme')]) @@ -64,7 +69,12 @@ def add_react_to_enz(self, data, source, num_threads=0): enzyme_ids = self.__create_react_enz(data, source) # Create Enzyme nodes: - self.__enz_man.add_uniprot_data(enzyme_ids, source, num_threads) + self.__enz_man.add_uniprot_data(enzyme_ids, source, num_threads) + + def add_react_to_enz_organism(self, data, source, num_threads=0): + + #Create Reaction relationships + reaction_ids = self.__create_enz_react(data, source) def __create_react_enz(self, data, source): '''Creates Reaction and Enzyme nodes and their Relationships.''' @@ -80,3 +90,37 @@ def __create_react_enz(self, data, source): {'source': source}]) return list(set(enzyme_ids)) + + def __create_enz_react(self, data, source): + '''Creates Reaction and Enzyme nodes and their Relationships.''' + print('adding reaction to enzyme relationships') + reaction_ids = [] + enzyme_ids = self.__enz_man.get_nodes() + + for enz_id in enzyme_ids: + reac_ids = [key for key, value in data.items() if enz_id['entry'] in value] + reaction_ids = reaction_ids+reac_ids + for j in reac_ids: + self.__enz_reac_rels.append([j, 'catalysed_by', + enz_id['entry'], + {'source': source}]) + + return list(set(reaction_ids)) + + def add_org_to_enz(self, nodes, source, num_threads=0): + '''Submit data to the graph.''' + # Create Organism nodes: + organism_ids = self.__create_organism_ids(nodes, source) + + ## For testing + #organism_ids = organism_ids[0:10] + + # Create Organism and Enzyme nodes: + self.__enz_man.add_uniprot_data_organism(organism_ids, source, num_threads) + + def __create_organism_ids(self, data, source): + + ids = unique(list(data.keys())) + + return ids + From f8fe4cbd44f9f4fd6752cea26c106bbc1c8db02a Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Wed, 22 Mar 2023 16:11:52 -0600 Subject: [PATCH 10/29] Update 
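As written, __create_enz_react rescans every rhea2uniprot entry once per enzyme node, which is quadratic in the catalogue size, and __create_organism_ids pulls in numpy via a wildcard import for what a plain set() already does. A sketch that inverts the mapping once up front, assuming data maps a Rhea reaction id to the UniProt accessions annotated to it:

    from collections import defaultdict

    def index_reactions_by_enzyme(data):
        '''Map UniProt accession -> set of Rhea reaction ids.'''
        by_enzyme = defaultdict(set)
        for reac_id, accessions in data.items():
            for accession in accessions:
                by_enzyme[accession].add(reac_id)
        return by_enzyme

    # by_enzyme = index_reactions_by_enzyme({'50004': ['Q01911'],
    #                                        '61444': ['Q01911']})
    # by_enzyme['Q01911'] -> {'50004', '61444'}, in one pass over the data.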
rhea_utils.py Update to ingest all uniprot/rhea relationships based on kg-microbe microbes - Get Rhea reactions based on enzymes expressed by kg-microbe set of microbes from nbcbitaxon.json --- metanetx_uniprot/rhea_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/metanetx_uniprot/rhea_utils.py b/metanetx_uniprot/rhea_utils.py index 3d926091..1fa6e91c 100644 --- a/metanetx_uniprot/rhea_utils.py +++ b/metanetx_uniprot/rhea_utils.py @@ -30,7 +30,9 @@ def load(reaction_manager, source=__RHEA_URL, num_threads=0): data.update(data_small) ''' ######Not sure why source is Rhea here, calls to UniProt - reaction_manager.add_react_to_enz(data, 'rhea', num_threads) + #Remove, since this goes from rhea2uniprot to uniprot enzymes. use add_org_to_enz function in ncbi_taxonomy_utils instead + #reaction_manager.add_react_to_enz(data, 'rhea', num_threads) + reaction_manager.add_react_to_enz_organism(data, 'rhea', num_threads) def _parse(filename): From df7978266af3c0c1c0a0db099f49f2381830c1b6 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Wed, 22 Mar 2023 16:12:42 -0600 Subject: [PATCH 11/29] Update enzyme_utils.py Update to ingest all uniprot/rhea relationships based on kg-microbe microbes - query uniprot based on organism_ids, not enzyme_ids --- metanetx_uniprot/enzyme_utils.py | 43 ++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/metanetx_uniprot/enzyme_utils.py b/metanetx_uniprot/enzyme_utils.py index 6f90b475..707a8399 100644 --- a/metanetx_uniprot/enzyme_utils.py +++ b/metanetx_uniprot/enzyme_utils.py @@ -63,3 +63,46 @@ def add_uniprot_data(self, enzyme_ids, source, num_threads=0): if organism_id: self.__org_enz_rels.append([organism_id, 'expresses', uniprot_id, {'source': source}]) + + #Builds into reactionManager + def add_uniprot_data_organism(self, organism_ids, source, num_threads=0): + '''Gets Uniprot data.''' + + #fields = ['entry name', 'protein names', 'organism-id', 'ec'] + fields = ['id', 'accession','protein_name', 'organism_id', 'ec'] + #enzyme_ids = [enzyme_id for enzyme_id in enzyme_ids if enzyme_id not in self.__nodes] + print('querying uniprot for enzymes per organism') + uniprot_values = get_uniprot_values_organism(organism_ids, fields, + batch_size=128, + verbose=False, + num_threads=num_threads) + + print('add_uniprot_data function: added uniprot values: ',len(uniprot_values)) + + + + print('adding uniprot data to graph') + for uniprot_id, uniprot_value in tqdm(uniprot_values.items()): + enzyme_node = {':LABEL': 'Enzyme', + 'uniprot:ID(Enzyme)': uniprot_id} + self.__nodes[uniprot_id] = enzyme_node + + organism_id = uniprot_value.pop('Organism (ID)') \ + if 'Organism (ID)' in uniprot_value else None + + if 'Entry' in uniprot_value: + enzyme_node['entry'] = uniprot_value['Entry'] + + if 'Protein names' in uniprot_value: + enzyme_node['names'] = uniprot_value['Protein names'] + + if enzyme_node['names']: + enzyme_node['name'] = enzyme_node['names'][0] + + if 'EC number' in uniprot_value: + enzyme_node['ec-code'] = uniprot_value['EC number'] + + if organism_id: + self.__org_enz_rels.append([organism_id, 'expresses',uniprot_value['Entry'], {'source': source}]) + + return uniprot_values From f15bb82102f5aa8fc05fbc8fa5154f33b4a29af5 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Wed, 22 Mar 2023 16:13:22 -0600 Subject: [PATCH 12/29] Update seq_utils.py Update to ingest all uniprot/rhea relationships based on kg-microbe microbes - 
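Note that get_uniprot_values_organism keys its result dict by 'Organism (ID)', so the uniprot_id loop variable above is really a taxon id, and each Enzyme node ends up keyed by organism rather than by accession, one node per organism (later patches in this series rework exactly this). A sketch keyed by the UniProt accession instead, assuming rows carry 'Entry' and 'Organism (ID)' columns:

    def enzyme_nodes_from_rows(rows, source):
        '''Build Enzyme nodes keyed by accession, plus organism links.'''
        nodes, org_enz_rels = {}, []
        for row in rows:
            accession = row['Entry']
            nodes[accession] = {':LABEL': 'Enzyme',
                                'uniprot:ID(Enzyme)': accession}
            if 'Organism (ID)' in row:
                org_enz_rels.append([row['Organism (ID)'], 'expresses',
                                     accession, {'source': source}])
        return nodes, org_enz_rels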
query uniprot based on organism_ids, not enzyme_ids --- metanetx_uniprot/seq_utils.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/metanetx_uniprot/seq_utils.py b/metanetx_uniprot/seq_utils.py index 892b3a6c..1776f94a 100644 --- a/metanetx_uniprot/seq_utils.py +++ b/metanetx_uniprot/seq_utils.py @@ -29,6 +29,8 @@ import queue import numpy as np +from tqdm import tqdm +import sys def get_uniprot_values(uniprot_ids, fields, batch_size, verbose=False, num_threads=0): @@ -107,6 +109,36 @@ def _parse_uniprot_data(url, values): if key.startswith('Cross-reference'): resp[key] = resp[key].split(';') + if 'Error messages' in resp: + print(resp); sys.exit() values.append(resp) except Exception as err: - print(err) \ No newline at end of file + print(err) + + +def get_uniprot_values_organism(organism_ids, fields, batch_size, verbose=False, num_threads=0): + values = [] + + for i in tqdm(range(0, len(organism_ids), batch_size)): + values = _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values,verbose) + + return {value['Organism (ID)']: value for value in values} + +def _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values, verbose): + '''Get batch of Uniprot data.''' + if verbose: + print('seq_utils: getting Uniprot values ' + str(i) + ' - ' + + str(min(i + batch_size, len(organism_ids))) + ' / ' + + str(len(organism_ids))) + + #If getting values in batch Remove 'accession:' + from start of join([HERE .....]) and accession: from query=HERE + batch = organism_ids[i:min(i + batch_size, len(organism_ids))] + query = '%20OR%20'.join(['organism_id:' + organism_id for organism_id in batch]) + url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \ + '&format=tsv&fields=organism_id%2C' + '%2C'.join([parse.quote(field) + for field in fields]) + + #print('_get_uniprot_batch_organism url: ',url) + + _parse_uniprot_data(url, values) + return values From c183254fec387219c6b0436c71badaffd45b13c5 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Mon, 3 Apr 2023 08:46:00 -0600 Subject: [PATCH 13/29] Update build.py Include enzyme to reaction, and reaction to chemical relationships --- metanetx_uniprot/build.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/metanetx_uniprot/build.py b/metanetx_uniprot/build.py index 3fce3299..0ac9524b 100644 --- a/metanetx_uniprot/build.py +++ b/metanetx_uniprot/build.py @@ -1,6 +1,9 @@ ''' SYNBIOCHEM-DB (c) University of Manchester 2015 +''' +SYNBIOCHEM-DB (c) University of Manchester 2015 + SYNBIOCHEM-DB is licensed under the MIT License. To view a copy of this license, visit . @@ -10,31 +13,23 @@ import multiprocessing import sys -import chebi_utils, chemical_utils, mnxref_utils, \ - ncbi_taxonomy_utils, reaction_utils, rhea_utils, spectra_utils, utils, seq_utils #, kegg_utils +import chebi_utils, chemical_utils, mnxref_utils, ncbi_taxonomy_utils, reaction_utils, rhea_utils, spectra_utils, utils, seq_utils #, kegg_utils def build_csv(dest_dir, array_delimiter, num_threads): '''Build database CSV files.''' writer = utils.Writer(dest_dir) - reac_man = reaction_utils.ReactionManager() - + # Get Organism data: print('Parsing NCBI Taxonomy') - ncbi_taxonomy_utils.load(reac_man, writer, array_delimiter) + ncbi_taxonomy_utils.load(reac_man, writer, array_delimiter) #--> writes Organism_Enzyme.tsv - # Get Chemical and Reaction data. 
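In the seq_utils hunk above, a row carrying an 'Error messages' column triggers print(resp); sys.exit(), which kills the entire build on one bad batch. A sketch that raises instead, so the caller can log, retry or skip; the column name follows the diff, the exception type is illustrative:

    class UniprotQueryError(RuntimeError):
        '''A UniProt result row reported an error instead of data.'''

    def check_uniprot_row(row):
        '''Return the parsed row, raising if UniProt flagged an error.'''
        if 'Error messages' in row:
            raise UniprotQueryError(str(row))
        return row

A further caution on the build.py hunk just above: it inserts a second ''' and a duplicate copyright line into the existing module docstring, which closes the docstring early and turns the remaining licence lines into code, so the module no longer imports (SyntaxError); dropping those three added lines restores the single well-formed header.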
# Write chemistry csv files: - #chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter) - # May be duplicate line - #reac_man = reaction_utils.ReactionManager() + chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter) - #print('Parsing MNXref') - #mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer) - #mnx_loader.load() #print('Parsing ChEBI') #chebi_utils.load(chem_man, writer) @@ -44,7 +39,6 @@ def build_csv(dest_dir, array_delimiter, num_threads): #print('Parsing spectrum data') #spectra_utils.load(writer, chem_man, array_delimiter=array_delimiter) - #chem_man.write_files(writer) ####Not including KEGG for now # Get Reaction / Enzyme / Organism data: @@ -53,8 +47,16 @@ def build_csv(dest_dir, array_delimiter, num_threads): print('Parsing Rhea') - rhea_utils.load(reac_man, num_threads=num_threads) - reac_man.write_files(writer) + ##Returns rhea reaction ids + reaction_ids = rhea_utils.load(reac_man, num_threads=num_threads) + reac_man.write_files(writer) #--> writes Enzyme_Reaction.tsv + + # + print('Parsing MNXref') + mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer, reaction_ids) + mnx_loader.load() #--> writes Reaction_Chemical.tsv + + #chem_man.write_files(writer) def main(args): @@ -73,5 +75,7 @@ def main(args): build_csv(args[0], args[1], num_threads) + + if __name__ == '__main__': main(sys.argv[1:]) From 133222ed0e22192ed433dfa7180a9328c9869b8c Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Mon, 3 Apr 2023 08:46:51 -0600 Subject: [PATCH 14/29] Update mnxref_utils.py Include enzyme to reaction, and reaction to chemical relationships. Also include testing dataset --- metanetx_uniprot/mnxref_utils.py | 56 ++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/metanetx_uniprot/mnxref_utils.py b/metanetx_uniprot/mnxref_utils.py index cbb67687..aa231e22 100644 --- a/metanetx_uniprot/mnxref_utils.py +++ b/metanetx_uniprot/mnxref_utils.py @@ -24,10 +24,12 @@ import namespace_utils from synbiochem.utils import chem_utils +import os _METANETX_URL = 'http://metanetx.org/cgi-bin/mnxget/mnxref/' - +#For test, also update __read_data function +#_METANETX_URL = os.getcwd()+'/TestingFiles/' class MnxRefReader(object): '''Class to read MnxRef data from the chem_prop.tsv, the chem_xref.tsv and @@ -47,12 +49,15 @@ def get_chem_data(self): return self.__chem_data - def get_reac_data(self): + def get_reac_data(self,reaction_ids): '''Gets reaction data.''' if not self.__reac_data: - self.__read_reac_prop() + mxn_reaction_ids = self.__read_reac_prop(reaction_ids) self.__read_xref('reac_xref.tsv', self.__reac_data, False) + #Only include reaction data for reactions in reaction_ids + self.__reac_data = {key:val for key,val in self.__reac_data.items() if key in mxn_reaction_ids} + return self.__reac_data def __read_chem_prop(self): @@ -101,15 +106,24 @@ def __add_xref(self, xref, entry, chemical): if namespace != 'chebi' \ else 'CHEBI:' + xref[1] - def __read_reac_prop(self): + def __read_reac_prop(self,reaction_ids): '''Read reaction properties and create Nodes.''' reac_prop_keys = ['id', 'equation', 'reference', 'ec', 'balance', 'transport'] + ##Relabel reaction ids by MXN id rather than rhea id + mxn_reaction_ids = [] + for values in self.__read_data('reac_prop.tsv'): - if not values[0].startswith('#'): + if not values[0].startswith('#'): + if values[0] == 'EMPTY': continue values[0] = self.__parse_id(values[0]) values[2] = 
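A small point on the reference-column test just below: reaction_ids arrives as a list, so values[2].split(':')[1] in reaction_ids costs O(len(reaction_ids)) for every row of reac_prop.tsv. A sketch of the same check against a set, following the diff's own convention of matching 'rhea' in the xref prefix:

    def make_rhea_filter(reaction_ids):
        '''Return a predicate telling whether an xref hits a wanted Rhea id.'''
        wanted = set(reaction_ids)          # O(1) membership

        def keep(xref):
            prefix, _, local_id = xref.partition(':')  # never raises, unlike split(':')[1]
            return 'rhea' in prefix.lower() and local_id in wanted

        return keep

    # keep = make_rhea_filter(['50004', '61444'])
    # keep('rheaR:50004') -> True ; keep('keggR:R00001') -> False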
self.__parse_id(values[2]) + try: + if 'rhea' in values[2].split(':')[0].lower() and values[2].split(':')[1] in reaction_ids: + mxn_reaction_ids.append(values[0]) + except IndexError: continue + props = dict(zip(reac_prop_keys, values)) props.pop('reference') @@ -129,6 +143,8 @@ def __read_reac_prop(self): print('WARNING: Suspected polymerisation reaction: ' + \ values[0] + '\t' + str(props)) + return mxn_reaction_ids + def __add_chem(self, chem_id): '''Adds a chemical with given id.''' props = {'id': chem_id} @@ -138,6 +154,7 @@ def __add_chem(self, chem_id): def __read_data(self, filename): '''Downloads and reads tab-limited files into lists of lists of strings.''' + with requests.Session() as s: download = s.get(self.__source + filename) @@ -146,7 +163,21 @@ def __read_data(self, filename): cr = csv.reader(decoded_content.splitlines(), delimiter='\t') my_list = list(cr) return my_list - + ''' + ###Reads downloaded file for offline testing + #cr = csv.reader((self.__source + filename).splitlines(), delimiter='\t') + import pandas as pd + cr = pd.read_csv(self.__source + filename, delimiter='\t', comment='#',header=None) + cr_d = [] + for i in range(len(cr)): + l = [] + for j in range(len(cr.columns)): + l.append(cr.iloc[i,j]) + cr_d.append(l) + + return cr_d + ''' + def __parse_id(self, item_id): '''Parses mnx ids.''' @@ -161,21 +192,24 @@ def __parse_id(self, item_id): class MnxRefLoader(object): '''Loads MNXref data into neo4j format.''' - def __init__(self, chem_man, reac_man, writer): + def __init__(self, chem_man, reac_man, writer,reaction_ids): self.__chem_man = chem_man self.__reac_man = reac_man self.__writer = writer + self.__reactions = reaction_ids def load(self): '''Loads MnxRef data from chem_prop.tsv, chem_xref.tsv, reac_prop.tsv and reac_xref.tsv files.''' reader = MnxRefReader() + #First gets all chemical data from MxnRef (chem_xref and chem_prop) and adds to __chem_man for properties in reader.get_chem_data().values(): properties['mnx'] = properties.pop('id') self.__chem_man.add_chemical(properties) - rels = self.__add_reac_nodes(reader.get_reac_data()) + #Then gets reaction data from reac_xref and reac_prop and adds to __chem_man + rels = self.__add_reac_nodes(reader.get_reac_data(self.__reactions)) return [], [self.__writer.write_rels(rels, 'Reaction', 'Chemical')] @@ -276,8 +310,10 @@ def _filter(counter, cutoff): # Fit straight-line to histogram log-log plot and filter... x_val, y_val = zip(*list(hist_counter.items())) - m_val, b_val = numpy.polyfit(numpy.log(x_val), numpy.log(y_val), 1) - + l_x_val = numpy.log(x_val)[0] + l_y_val = numpy.log(y_val)[0] + if l_x_val == 0.0: l_x_val += 0.01 + m_val, b_val = numpy.polyfit([l_x_val], [l_y_val], 1) return [item[0] for item in counter.items() if item[1] > math.exp(cutoff * -b_val / m_val)] From 1d5d048f86aefd3bf6bdbb7fad0406ba7464c2a2 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Mon, 3 Apr 2023 08:47:21 -0600 Subject: [PATCH 15/29] Update ncbi_taxonomy_utils.py Include enzyme to reaction, and reaction to chemical relationships. 
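The _filter change above hands numpy.polyfit a single histogram point ([l_x_val], [l_y_val]); a degree-1 fit needs at least two distinct points, so the slope is undetermined and the 0.01 nudge only masks the failure. A guarded sketch; the keep-everything fallback for degenerate histograms is an assumption, not the original behaviour:

    import math
    from collections import Counter

    import numpy

    def filter_by_histogram(counter, cutoff):
        '''Keep items whose count clears a threshold fitted in log-log space.'''
        hist = Counter(counter.values())          # count -> frequency
        x_val, y_val = zip(*hist.items())

        if len(x_val) < 2:                        # polyfit needs two points
            return list(counter)

        m_val, b_val = numpy.polyfit(numpy.log(x_val), numpy.log(y_val), 1)
        if m_val == 0:                            # flat fit, no usable cutoff
            return list(counter)

        threshold = math.exp(cutoff * -b_val / m_val)
        return [item for item, count in counter.items() if count > threshold]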
Also include testing dataset --- metanetx_uniprot/ncbi_taxonomy_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metanetx_uniprot/ncbi_taxonomy_utils.py b/metanetx_uniprot/ncbi_taxonomy_utils.py index f5ffe788..e563331d 100644 --- a/metanetx_uniprot/ncbi_taxonomy_utils.py +++ b/metanetx_uniprot/ncbi_taxonomy_utils.py @@ -28,7 +28,8 @@ def load(reaction_manager, writer, array_delimiter, source=__NCBITAXONOMY_URL): #nodes, rels = _parse_nodes(nodes_filename, array_delimiter) #_parse_names(nodes, names_filename, array_delimiter) ####### - nodes_filename = '/Users/brooksantangelo/Documents/HunterLab/Exploration/kg_microbe/kg-microbe/data/raw/ncbitaxon.json' + nodes_filename = os.getcwd()+'/Files/ncbitaxon.json' + #nodes_filename = os.getcwd()+'/TestingFiles/ncbitaxon.json' print('parsing ncbi taxon json file') kgx_nodes_json,kgx_edges_json = _parse_nodes_kgmicrobe(nodes_filename, array_delimiter) nodes,rels = transform_kgx_output_format(kgx_nodes_json,kgx_edges_json) @@ -64,7 +65,7 @@ def _parse_nodes_kgmicrobe(filename, array_delimiter): transform(inputs=[filename], input_format='obojson', output= os.path.join(output_dir, name), output_format='tsv') return output_dir+name+'_nodes.tsv',output_dir+name+'_edges.tsv' - + def transform_kgx_output_format(transformed_nodes_tsv,transformed_edges_tsv): labels = pd.read_csv(transformed_nodes_tsv, sep = '\t', usecols = ['id','name']) @@ -106,7 +107,6 @@ def _parse_nodes(filename, array_delimiter): ':LABEL': 'Organism' + array_delimiter + tokens[2]} - print(list(nodes.values())[0:5]) return nodes, rels From 303a824c8991da51047b3f113f420fa7db5eb907 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Mon, 3 Apr 2023 08:47:50 -0600 Subject: [PATCH 16/29] Update rhea_utils.py Include enzyme to reaction, and reaction to chemical relationships. Also include testing dataset --- metanetx_uniprot/rhea_utils.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/metanetx_uniprot/rhea_utils.py b/metanetx_uniprot/rhea_utils.py index 1fa6e91c..5c612d90 100644 --- a/metanetx_uniprot/rhea_utils.py +++ b/metanetx_uniprot/rhea_utils.py @@ -10,29 +10,28 @@ import tempfile import urllib from urllib.request import urlretrieve +import os __RHEA_URL = 'ftp://ftp.expasy.org/databases/rhea/tsv/rhea2uniprot%5Fsprot.tsv' - +#For test, also update load function +#__RHEA_URL = os.getcwd()+'/TestingFiles/rhea2uniprot_sprot.txt' def load(reaction_manager, source=__RHEA_URL, num_threads=0): '''Loads Rhea data.''' # Parse data: + temp_file = tempfile.NamedTemporaryFile() urlretrieve(source, temp_file.name) data = _parse(temp_file.name) - ''' - ###For testing, uncomment the following code - data_small = dict() - for key in sorted(data)[:50]: - data_small[key] = data[key] - data.clear() - data.update(data_small) - ''' + ##If using test data + #data = _parse(source) ######Not sure why source is Rhea here, calls to UniProt #Remove, since this goes from rhea2uniprot to uniprot enzymes. 
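PATCH 16 switches between the Expasy FTP URL and the local fixture by editing the module constant and the load() body. A small helper that accepts either, so no commented alternates are needed (names are illustrative):

    import os
    import tempfile
    from urllib.request import urlretrieve

    def fetch_rhea2uniprot(source):
        '''Return a local path for source, downloading it when remote.'''
        if os.path.exists(source):                # local testing fixture
            return source
        handle = tempfile.NamedTemporaryFile(delete=False, suffix='.tsv')
        handle.close()
        urlretrieve(source, handle.name)          # ftp:// or https:// URL
        return handle.name

    # fetch_rhea2uniprot(os.getcwd() + '/TestingFiles/rhea2uniprot_sprot.txt')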
use add_org_to_enz function in ncbi_taxonomy_utils instead #reaction_manager.add_react_to_enz(data, 'rhea', num_threads) - reaction_manager.add_react_to_enz_organism(data, 'rhea', num_threads) + reaction_ids = reaction_manager.add_react_to_enz_organism(data, 'rhea', num_threads) + + return reaction_ids def _parse(filename): From 27f711187b83f839da79194f395d9953e4f58d3c Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Mon, 3 Apr 2023 08:48:23 -0600 Subject: [PATCH 17/29] Update reaction_utils.py Include enzyme to reaction, and reaction to chemical relationships. Also include testing dataset --- metanetx_uniprot/reaction_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/metanetx_uniprot/reaction_utils.py b/metanetx_uniprot/reaction_utils.py index 058875eb..f13c429c 100644 --- a/metanetx_uniprot/reaction_utils.py +++ b/metanetx_uniprot/reaction_utils.py @@ -32,8 +32,9 @@ def write_files(self, writer): 'Enzyme')], [writer.write_rels(self.__reac_enz_rels, 'Reaction', 'Enzyme'), + #Gets reactions connected to all enzymes writer.write_rels(self.__enz_reac_rels, - 'Enzyme', 'Reaction'), + 'Reaction', 'Enzyme'), writer.write_rels(self.__enz_man.get_org_enz_rels(), 'Organism', 'Enzyme')]) @@ -76,6 +77,8 @@ def add_react_to_enz_organism(self, data, source, num_threads=0): #Create Reaction relationships reaction_ids = self.__create_enz_react(data, source) + return reaction_ids + def __create_react_enz(self, data, source): '''Creates Reaction and Enzyme nodes and their Relationships.''' enzyme_ids = [] @@ -104,7 +107,6 @@ def __create_enz_react(self, data, source): self.__enz_reac_rels.append([j, 'catalysed_by', enz_id['entry'], {'source': source}]) - return list(set(reaction_ids)) def add_org_to_enz(self, nodes, source, num_threads=0): From 009b1876dbcd986562bfedc05cd48a754d6a900b Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Mon, 3 Apr 2023 08:48:49 -0600 Subject: [PATCH 18/29] Update seq_utils.py Include enzyme to reaction, and reaction to chemical relationships. 
Also include testing dataset --- metanetx_uniprot/seq_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/metanetx_uniprot/seq_utils.py b/metanetx_uniprot/seq_utils.py index 1776f94a..43682c9a 100644 --- a/metanetx_uniprot/seq_utils.py +++ b/metanetx_uniprot/seq_utils.py @@ -122,7 +122,10 @@ def get_uniprot_values_organism(organism_ids, fields, batch_size, verbose=False, for i in tqdm(range(0, len(organism_ids), batch_size)): values = _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values,verbose) - return {value['Organism (ID)']: value for value in values} + ##Issue: Only returns one enzyme per organism + #return {value['Organism (ID)']: value for value in values} + ##Returns list of dicts for each organism-id enzyme entry + return values def _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values, verbose): '''Get batch of Uniprot data.''' @@ -135,10 +138,10 @@ def _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values, ver batch = organism_ids[i:min(i + batch_size, len(organism_ids))] query = '%20OR%20'.join(['organism_id:' + organism_id for organism_id in batch]) url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \ - '&format=tsv&fields=organism_id%2C' + '%2C'.join([parse.quote(field) + '&format=tsv&size=500&fields=organism_id%2C' + '%2C'.join([parse.quote(field) + # '&format=tsv&size=1&fields=organism_id%2C' + '%2C'.join([parse.quote(field) for field in fields]) - #print('_get_uniprot_batch_organism url: ',url) _parse_uniprot_data(url, values) return values From cc29ad241a668244cac2b31d25bd82a1d07cfe27 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Mon, 3 Apr 2023 08:49:17 -0600 Subject: [PATCH 19/29] Update enzyme_utils.py Include enzyme to reaction, and reaction to chemical relationships. 
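The size=500 added above is also the maximum page size for rest.uniprot.org searches; larger result sets are paginated, and per UniProt's documentation the next page is advertised in the HTTP Link response header. Without following it, organisms with more than 500 matching entries are silently truncated. A cursor-following sketch using the parsed resp.links that requests exposes:

    import requests

    def fetch_all_pages(url):
        '''Yield the body of every result page for a UniProt search URL.'''
        while url:
            resp = requests.get(url, timeout=60)
            resp.raise_for_status()
            yield resp.text
            url = resp.links.get('next', {}).get('url')  # None ends the loop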
Also include testing dataset --- metanetx_uniprot/enzyme_utils.py | 38 +++++++++++++++----------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/metanetx_uniprot/enzyme_utils.py b/metanetx_uniprot/enzyme_utils.py index 707a8399..d6ea4969 100644 --- a/metanetx_uniprot/enzyme_utils.py +++ b/metanetx_uniprot/enzyme_utils.py @@ -68,41 +68,39 @@ def add_uniprot_data(self, enzyme_ids, source, num_threads=0): def add_uniprot_data_organism(self, organism_ids, source, num_threads=0): '''Gets Uniprot data.''' - #fields = ['entry name', 'protein names', 'organism-id', 'ec'] fields = ['id', 'accession','protein_name', 'organism_id', 'ec'] - #enzyme_ids = [enzyme_id for enzyme_id in enzyme_ids if enzyme_id not in self.__nodes] print('querying uniprot for enzymes per organism') + ##Uniprot returns list of dicts for each entry uniprot_values = get_uniprot_values_organism(organism_ids, fields, batch_size=128, verbose=False, num_threads=num_threads) - print('add_uniprot_data function: added uniprot values: ',len(uniprot_values)) - - - print('adding uniprot data to graph') - for uniprot_id, uniprot_value in tqdm(uniprot_values.items()): + + ##To return all organism-enzyme entries + for entry in tqdm(uniprot_values): enzyme_node = {':LABEL': 'Enzyme', - 'uniprot:ID(Enzyme)': uniprot_id} - self.__nodes[uniprot_id] = enzyme_node + 'uniprot:ID(Enzyme)': entry['Entry']} + self.__nodes[entry['Entry']] = enzyme_node - organism_id = uniprot_value.pop('Organism (ID)') \ - if 'Organism (ID)' in uniprot_value else None + organism_id = entry['Organism (ID)'] \ + if 'Organism (ID)' in entry.keys() else None - if 'Entry' in uniprot_value: - enzyme_node['entry'] = uniprot_value['Entry'] + if 'Entry' in entry.keys(): + enzyme_node['entry'] = entry['Entry'] - if 'Protein names' in uniprot_value: - enzyme_node['names'] = uniprot_value['Protein names'] + if 'Protein names' in entry: + enzyme_node['names'] = entry['Protein names'] - if enzyme_node['names']: - enzyme_node['name'] = enzyme_node['names'][0] + if 'names' in entry.keys(): + enzyme_node['name'] = entry['names'][0] - if 'EC number' in uniprot_value: - enzyme_node['ec-code'] = uniprot_value['EC number'] + if 'EC number' in entry: + enzyme_node['ec-code'] = entry['EC number'] if organism_id: - self.__org_enz_rels.append([organism_id, 'expresses',uniprot_value['Entry'], {'source': source}]) + self.__org_enz_rels.append([organism_id, 'expresses',entry['Entry'], {'source': source}]) return uniprot_values + From a069bc0f469a479638f55122f221c5a1a797de7a Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Mon, 3 Apr 2023 08:50:27 -0600 Subject: [PATCH 20/29] Create rhea2uniprot_sprot.txt Include enzyme to reaction, and reaction to chemical relationships. 
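One slip in the rewritten loop above: the protein-name list is stored on enzyme_node, but the follow-up test reads if 'names' in entry.keys(), and UniProt rows have no 'names' column, so enzyme_node['name'] is never set (the pre-patch code checked enzyme_node['names']). A suggested correction wrapped as a helper:

    def set_display_name(enzyme_node, entry):
        '''Copy protein names from a UniProt row and derive the display name.'''
        if 'Protein names' in entry:
            enzyme_node['names'] = entry['Protein names']
        if enzyme_node.get('names'):
            enzyme_node['name'] = enzyme_node['names'][0]
        return enzyme_node

    # set_display_name({}, {'Protein names': ['RlmN', 'EC 2.1.1.192']})
    # -> {'names': ['RlmN', 'EC 2.1.1.192'], 'name': 'RlmN'}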
Also include testing dataset --- metanetx_uniprot/TestingFiles/rhea2uniprot_sprot.txt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 metanetx_uniprot/TestingFiles/rhea2uniprot_sprot.txt diff --git a/metanetx_uniprot/TestingFiles/rhea2uniprot_sprot.txt b/metanetx_uniprot/TestingFiles/rhea2uniprot_sprot.txt new file mode 100644 index 00000000..05b819cc --- /dev/null +++ b/metanetx_uniprot/TestingFiles/rhea2uniprot_sprot.txt @@ -0,0 +1,6 @@ +50004 UN 50004 Q01911 +61444 UN 61444 Q01911 +42776 UN 42776 A8C927 +18690 LR 18689 P0DTE9 +60624 UN 60624 P0DTE9 +60625 LR 60624 P0DTE9 From c3c8d410a9b3ad2b3da5b83086f502e01f8a56ae Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Mon, 3 Apr 2023 08:50:48 -0600 Subject: [PATCH 21/29] Add files via upload Include enzyme to reaction, and reaction to chemical relationships. Also include testing dataset --- metanetx_uniprot/TestingFiles/chem_prop.tsv | 358 ++++++++++++++++++ metanetx_uniprot/TestingFiles/chem_xref.tsv | 362 ++++++++++++++++++ metanetx_uniprot/TestingFiles/ncbitaxon.json | 188 ++++++++++ metanetx_uniprot/TestingFiles/reac_prop.tsv | 359 ++++++++++++++++++ metanetx_uniprot/TestingFiles/reac_xref.tsv | 365 +++++++++++++++++++ 5 files changed, 1632 insertions(+) create mode 100644 metanetx_uniprot/TestingFiles/chem_prop.tsv create mode 100644 metanetx_uniprot/TestingFiles/chem_xref.tsv create mode 100644 metanetx_uniprot/TestingFiles/ncbitaxon.json create mode 100644 metanetx_uniprot/TestingFiles/reac_prop.tsv create mode 100644 metanetx_uniprot/TestingFiles/reac_xref.tsv diff --git a/metanetx_uniprot/TestingFiles/chem_prop.tsv b/metanetx_uniprot/TestingFiles/chem_prop.tsv new file mode 100644 index 00000000..d4f28677 --- /dev/null +++ b/metanetx_uniprot/TestingFiles/chem_prop.tsv @@ -0,0 +1,358 @@ +### MetaNetX/MNXref reconciliation ### +#Based on the following resources: +# +#RESOURCE: MetaNetX/MNXref +#VERSION: 4.4 +#DATE: 2022/03/16 +#URL: https://www.metanetx.org +#LICENSE: +# MetaNetX copyright 2011 SystemsX, SIB Swiss Institute of Bioinformatics +# Except where otherwise noted, the data available from this site are +# licensed under a Creative Commons Attribution 4.0 International License. +# MNXref uses information on cellular compartments, reactions, and +# metabolites that is sourced from a number of external resources. The +# licensing agreements of those resources are specified in each of the +# downloadable files listed below. For each compound, reaction and +# cellular compartment in the MNXref namespace we indicate which external +# resource provided the information used in MNXref. Compounds and +# reactions in the MNXref namespace may be identical to, or differ from, +# those in the external resource. In either case the data from MNXref may +# be considered to be subject to the original licensing restrictions of +# the external resource. +# (https://www.metanetx.org/mnxdoc/mnxref.html) +# +#RESOURCE: BiGG +#VERSION: 1.6.0, last updated: 2019/10/31 (downloaded on 2021/07/23) +#URL: http://bigg.ucsd.edu +#LICENSE: +# Copyright 2015 The Regents of the University of California +# +# All Rights Reserved +# +# Permission to use, copy, modify and distribute any part of BiGG Models +# for educational, research and non-profit purposes, without fee, and +# without a written agreement is hereby granted, provided that the above +# copyright notice, this paragraph and the following three paragraphs +# appear in all copies. 
+# +# Those desiring to incorporate BiGG Models into commercial products or +# use for commercial purposes should contact the Technology Transfer & +# Intellectual Property Services, University of California, San Diego, +# 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910, Ph: (858) +# 534-5815, FAX: (858) 534-7345, e-mail: invent@ucsd.edu. +# +# In no event shall the University of California be liable to any party +# for direct, indirect, special, incidental, or consequential damages, +# including lost profits, arising out of the use of this bigg database, +# even if the University of California has been advised of the possibility +# of such damage. +# +# The BiGG Models provided herein is on an "as is" basis, and the +# University of California has no obligation to provide maintenance, +# support, updates, enhancements, or modifications. The University of +# California makes no representations and extends no warranties of any +# kind, either implied or express, including, but not limited to, the +# implied warranties of merchantability or fitness for a particular +# purpose, or that the use of the BiGG Models will not infringe any +# patent, trademark or other rights. +# (http://bigg.ucsd.edu/) +# +#RESOURCE: The Cell Component Ontology +#VERSION: 25.0 (downloaded on 2021/06/03) +#URL: https://bioinformatics.ai.sri.com/CCO/ +#LICENSE: +# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome +# databases. +# +# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive, +# royalty-free license to use, modify and redistribute the Open Databases +# (as such term is defined in Exhibit B) and LICENSEE's modified +# versions thereof on a royalty-free basis, worldwide and for any purpose; +# provided, in each case, that if LICENSEE modifies any Open Database (the +# modified version being a "Modified Open Database"), then (i) +# LICENSEE must provide a copy of the Modified Open Database to SRI (and +# hereby grants to SRI a nonexclusive, royalty-free license to use, +# modify, and redistribute the Modified Open Database worldwide and for +# any purpose and to authorize others to do so); and (ii) any Modified +# Open Databases, or websites from which such Modified Open Databases may +# be obtained, must clearly and prominently: +# +# (a) identify the Open Databases from which they were derived: +# +# (b) include all applicable copyright notices and author lists from the +# Open Databases from which they were derived; and +# +# (c) identify or summarize all modifications that were made. +# +# Any distribution of such Modified Open Databases without the required +# notices is a violation of SRI's and its licensors' copyright and other +# proprietary rights. All trademarks, service marks, and trade names are +# proprietary to SRI and its licensors. The Open Databases, including any +# files incorporated in or generated from the Open Databases and data +# accompanying the Open Databases, are licensed to LICENSEE by SRI and its +# licensors, and SRI and its licensors do not transfer title or any other +# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open +# Databases except as otherwise specified herein. 
+# +# 2.1.1 If SRI, in its sole discretion, determines that a Modified +# Database is of sufficient quality and interest to the community to be +# hosted on biocyc.org, then SRI may (if the Modified Database includes +# significant curation over the original Open Database it is derived from, +# or the last version of the Modified Database provided to SRI) provide to +# LICENSEE a personal, one-year subscription to biocyc at no cost; +# provided, however, that if LICENSEE edits the Modified Database via a +# MySQL server operated by SRI or its contractors, such free one-year +# subscription will be forfeited. +# (https://biocyc.org/ptools-academic-license.shtml) +# +#RESOURCE: ChEBI +#VERSION: 203 (downloaded on 2021/09/30) +#URL: https://www.ebi.ac.uk/chebi/ +#LICENSE: +# All data in the database is non-proprietary or is derived from a +# non-proprietary source. It is thus freely accessible and available to +# anyone. In addition, each data item is fully traceable and explicitly +# referenced to the original source. +# (https://www.ebi.ac.uk/chebi/aboutChebiForward.do) +# +#RESOURCE: enviPath +#VERSION: (downloaded on 2021/11/24) +#URL: https://envipath.org +#LICENSE: +# The core data sets of enviPath are licensed under the Creative Commons +# Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) +# license. This allows you to use them in a non-commercial context, for +# example if you work at a University or for a public research institute. +# You can even redistribute and modify the data using the same license. If +# you want to use the data commercially, contact us, we offer commercial +# license agreements. +# We summarized how you can use the data on our license page. +# (https://envipath.com/license/) +# +#RESOURCE: HMDB +#VERSION: 4.0 (downloaded on 2021/06/18) +#URL: https://hmdb.ca +#LICENSE: +# HMDB is offered to the public as a freely available resource. Use and +# re-distribution of the data, in whole or in part, for commercial +# purposes requires explicit permission of the authors and explicit +# acknowledgment of the source material (HMDB) and the original +# publication. +# (https://hmdb.ca/about) +# +#RESOURCE: KEGG +#VERSION: 98.0+/06-11, Jun 21 (downloaded on 2021/06/11) +#URL: https://www.kegg.jp +#LICENSE: +# Academic users may freely use the KEGG website and may also freely link +# to the KEGG website. +# Non-academic users may use the KEGG website as end users for +# non-commercial purposes, but any other use requires a license agreement. +# Academic users who utilize KEGG for providing academic services are +# requested to obtain a KEGG FTP subscription for organizational use, +# which includes a proper license agreement. +# Non-academic users and Academic users intending to use KEGG for +# commercial purposes are requested to obtain a license agreement through +# KEGG's exclusive licensing agent, Pathway Solutions. +# (https://www.kegg.jp/kegg/legal.html) +# +#RESOURCE: LipidMaps +#VERSION: 2021-05-28 (downloaded on 2021/06/11) +#URL: https://www.lipidmaps.org +#LICENSE: +# The Lipidomics Gateway is provided on an "as is" basis, without warranty +# or representation of any kind, express or implied. The content of the +# Lipidomics Gateway website is protected by international copyright, +# trademark and other laws. You may download articles and web pages from +# this site for your personal, non-commercial use only, provided that you +# keep intact all authorship, copyright and other proprietary notices. 
The +# Featured Lipid can also be used for educational purposes, provided that +# credit is given to the Lipidomics Gateway. If you use the Lipidomics +# Gateway, you accept these terms. The LIPID MAPS Consortium reserves the +# right to modify these terms at any time. +# (https://www.lipidmaps.org/about/) +# +#RESOURCE: MetaCyc +#VERSION: 25.0 (downloaded on 2021/06/03) +#URL: https://metacyc.org +#LICENSE: +# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome +# databases. +# +# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive, +# royalty-free license to use, modify and redistribute the Open Databases +# (as such term is defined in Exhibit B) and LICENSEE's modified +# versions thereof on a royalty-free basis, worldwide and for any purpose; +# provided, in each case, that if LICENSEE modifies any Open Database (the +# modified version being a "Modified Open Database"), then (i) +# LICENSEE must provide a copy of the Modified Open Database to SRI (and +# hereby grants to SRI a nonexclusive, royalty-free license to use, +# modify, and redistribute the Modified Open Database worldwide and for +# any purpose and to authorize others to do so); and (ii) any Modified +# Open Databases, or websites from which such Modified Open Databases may +# be obtained, must clearly and prominently: +# +# (a) identify the Open Databases from which they were derived: +# +# (b) include all applicable copyright notices and author lists from the +# Open Databases from which they were derived; and +# +# (c) identify or summarize all modifications that were made. +# +# Any distribution of such Modified Open Databases without the required +# notices is a violation of SRI's and its licensors' copyright and other +# proprietary rights. All trademarks, service marks, and trade names are +# proprietary to SRI and its licensors. The Open Databases, including any +# files incorporated in or generated from the Open Databases and data +# accompanying the Open Databases, are licensed to LICENSEE by SRI and its +# licensors, and SRI and its licensors do not transfer title or any other +# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open +# Databases except as otherwise specified herein. +# +# 2.1.1 If SRI, in its sole discretion, determines that a Modified +# Database is of sufficient quality and interest to the community to be +# hosted on biocyc.org, then SRI may (if the Modified Database includes +# significant curation over the original Open Database it is derived from, +# or the last version of the Modified Database provided to SRI) provide to +# LICENSEE a personal, one-year subscription to biocyc at no cost; +# provided, however, that if LICENSEE edits the Modified Database via a +# MySQL server operated by SRI or its contractors, such free one-year +# subscription will be forfeited. +# (https://biocyc.org/ptools-academic-license.shtml) +# +#RESOURCE: Reactome +#VERSION: 77 June 14, 2021 (downloaded on 2021/09/03) +#URL: https://reactome.org +#LICENSE: +# Reactome is an open source and open access resource, available to anyone. +# Usage of Reactome material is covered by two Creative Commons licenses: +# +# The terms of the Creative Commons Public Domain (CC0) License apply to all +# Reactome annotation files, e.g. identifier mapping data, specialized data +# files, and interaction data derived from Reactome. 
+# (https://reactome.org/license/) +# +#RESOURCE: Rhea +#VERSION: 119 (downloaded on 2021/11/03) +#URL: https://www.rhea-db.org +#LICENSE: +# All data in Rhea is freely accessible and available for anyone to use under +# the Creative Commons Attribution License. +# (https://www.rhea-db.org/documentation) +# +#RESOURCE: SABIO-RK +#VERSION: Software Update: 2021/05/11 -- Database Release: 2021/05/28 (downloaded on 2021/07/01) +#URL: http://sabiork.h-its.org +#LICENSE: +# HITS, gGmbH HITS own the SABIO-RK database, its interfaces and its +# associated documentation (all referred to in the following as +# "Database"). You should carefully read the following terms and +# conditions before using this Database. Your use of this Database +# indicates your acceptance of this license agreement and all terms and +# conditions.You are hereby granted a non-exclusive and non-transferable +# license to use the Database according to the following terms and +# conditions. This license is to use the Database for Non-Commercial +# Purpose only. Non-Commercial Purpose means the use of the Database +# solely for internal non-commercial research and academic purposes. +# Non-Commercial Purpose excludes, without limitation, any use of the +# Database, as part of, or in any way in connection with a product or +# service which is sold, offered for sale, licensed, leased, loaned, or +# rented. Permission to use this Database for Non-Commercial Purpose is +# hereby granted without fee and subject to the following terms of this +# license. +# +# Commercial Use +# If you desire to use the Database for profit-making or commercial +# purposes, you agree to negotiate in good faith a license with the HITS +# prior to such profit-making or commercial use. The HITS shall have no +# obligation to grant such license to you, and may grant exclusive or +# non-exclusive licenses to others. You agree to notify the HITS of any +# inquiries you have for commercial use of the Database and/or its +# modifications. You may contact the following email to discuss commercial +# use: sabiork at h-its.org +# +# Governing Law +# This Agreement is governed by the law of the Federal Republic of +# Germany. The application of the UN Convention on the Sale of Goods is +# excluded. +# +# Disclaimer of Warranty +# Because this Database is licensed free of charge, there is no warranty +# for the data in it contained and the methods used for its querying. The +# HITS makes no warranty or representation that the operation of the +# Database in this compilation will be error-free, and the HITS is under +# no obligation to provide any services, by way of maintenance, update, or +# otherwise. +# +# THIS DATABASE AND THE ACCOMPANYING FILES ARE LICENSED "AS IS" AND +# WITHOUT WARRANTIES AS TO PERFORMANCE OR MERCHANTABILITY OR ANY OTHER +# WARRANTIES WHETHER EXPRESSED OR IMPLIED. NO WARRANTY OF FITNESS FOR A +# PARTICULAR PURPOSE IS OFFERED. THE ENTIRE RISK AS TO THE QUALITY AND +# PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE +# DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR +# CORRECTION. 
+# +# Limitation of Liability +# IN NO EVENT WILL HITS, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +# REDISTRIBUTE THE DATABASE AS PERMITTED ABOVE, BE LIABLE TO YOU FOR +# DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL +# DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM +# (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED +# INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF +# THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF VTIP AND HITS +# OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. +# +# Reference to SABIO-RK Users will cite SABIO-RK in publications or +# presentations, whenever the data used was extracted from the database. +# Termination This agreement is effective until terminated. You may +# terminate this agreement at any time by destroying all associated +# material (e.g., documentation or web service clients) to the database in +# your possession and by stopping any access to the database directly or +# from software generated by you. This agreement will terminate +# immediately without notice from and HITS if you fail to comply with any +# of the terms and conditions of this license. This agreement will also +# terminate immediately without notice from the HITS if it is found to +# implement patented algorithms or contain copyrighted code not owned or +# licensed the HITS for the purpose of its inclusion in the SABIO-RK +# Database. This agreement cannot be terminated by any other mechanism or +# for any other reason than those stated herein. +# +# Place of Court +# The exclusive venue for all disputes arising from or in connection with +# this Agreement is Mannheim, Germany (HRB 337446), when the Licensee is a +# business person, a legal entity governed by public law, or a special +# fund governed by public law, or does not have a general place of +# jurisdiction within the Federal Republic of Germany. Address all +# correspondence regarding this license to electronic mail address: +# sabiork at h-its.org Any inquiries and comments regarding bugs, bug +# fixes, enhancements, modifications or any other similar issues should be +# directed to: sabiork at h-its.org +# +# Copyright 2007 by HITS, gGmbH. All rights reserved. +# (http://sabiork.h-its.org/layouts/content/termscondition.gsp) +# +#RESOURCE: The SEED +#VERSION: 2.6.1 (July 31, 2020) (downloaded on 2021/08/09) +#URL: https://modelseed.org +#LICENSE: +# All tools and datasets that make up the SEED are in the public domain. +# (https://modelseed.org) +# +#RESOURCE: SwissLipids +#VERSION: (downloaded on 2021/07/29) +#URL: https://www.swisslipids.org +#LICENSE: +# SwissLipids is licensed under a Creative Commons Attribution-Non +# Commercial-NoDerivatives 4.0 International License. 
+# +# Commercial users and those who wish to use this work for commercial +# purposes please contact the SIB technology transfer officer at: +# marc.filliettaz@genebio.com +# (https://www.swisslipids.org/#/downloads) +#ID name reference formula charge mass InChI InChIKey SMILES +MNXM738702 NADPH chebi:57783 C21H26N7O17P3 -4 741.06200 InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-17)28(8-26-12)21-16(44-46(33,34)35)14(30)11(43-21)6-41-48(38,39)45-47(36,37)40-5-10-13(29)15(31)20(42-10)27-3-1-2-9(4-27)18(23)32/h1,3-4,7-8,10-11,13-16,20-21,29-31H,2,5-6H2,(H2,23,32)(H,36,37)(H,38,39)(H2,22,24,25)(H2,33,34,35)/p-4/t10-,11-,13-,14-,15-,16-,20-,21-/m1/s1 InChIKey=ACFIXJIJDZMPPO-NNYOXOHSSA-J NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)([O-])[O-])[C@@H]3O)[C@@H](O)[C@H]2O)C=CC1 +MNXM97613 tetracycline chebi:77932 C22H24N2O8 0 444.15327 InChI=1S/C22H24N2O8/c1-21(31)8-5-4-6-11(25)12(8)16(26)13-9(21)7-10-15(24(2)3)17(27)14(20(23)30)19(29)22(10,32)18(13)28/h4-6,9-10,15,25,27-28,31-32H,7H2,1-3H3,(H2,23,30)/t9-,10-,15-,21+,22-/m0/s1 InChIKey=OFVLGDICTFRJMM-WESIUVDSSA-N C[NH+](C)[C@@H]1C([O-])=C(C(N)=O)C(=O)[C@@]2(O)C(O)=C3C(=O)c4c(O)cccc4[C@@](C)(O)[C@H]3C[C@@H]12 +MNXM162730 11a-hydroxytetracycline chebi:132727 C22H24N2O9 0 460.14818 InChI=1S/C22H24N2O9/c1-20(31)8-5-4-6-10(25)12(8)16(27)22(33)11(20)7-9-14(24(2)3)15(26)13(18(23)29)17(28)21(9,32)19(22)30/h4-6,9,11,14,25-26,31-33H,7H2,1-3H3,(H2,23,29)/t9-,11+,14-,20+,21+,22-/m0/s1 InChIKey=FWVRSACGGAUWNP-BWOONYPSSA-N C[NH+](C)[C@@H]1C([O-])=C(C(N)=O)C(=O)[C@@]2(O)C(=O)[C@@]3(O)C(=O)c4c(O)cccc4[C@@](C)(O)[C@H]3C[C@@H]12 +MNXM5 NADP(+) chebi:58349 C21H25N7O17P3 -3 740.05362 InChI=1S/C21H28N7O17P3/c22-17-12-19(25-7-24-17)28(8-26-12)21-16(44-46(33,34)35)14(30)11(43-21)6-41-48(38,39)45-47(36,37)40-5-10-13(29)15(31)20(42-10)27-3-1-2-9(4-27)18(23)32/h1-4,7-8,10-11,13-16,20-21,29-31H,5-6H2,(H7-,22,23,24,25,32,33,34,35,36,37,38,39)/p-3/t10-,11-,13-,14-,15-,16-,20-,21-/m1/s1 InChIKey=XJLXINKUBYWONI-NNYOXOHSSA-K NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)([O-])[O-])[C@@H]3O)[C@@H](O)[C@H]2O)c1 +MNXM737425 1,2-di-(9Z-octadecenoyl)-sn-glycero-3-phosphoethanolamine chebi:74986 C41H78NO8P 0 743.54651 InChI=1S/C41H78NO8P/c1-3-5-7-9-11-13-15-17-19-21-23-25-27-29-31-33-40(43)47-37-39(38-49-51(45,46)48-36-35-42)50-41(44)34-32-30-28-26-24-22-20-18-16-14-12-10-8-6-4-2/h17-20,39H,3-16,21-38,42H2,1-2H3,(H,45,46)/b19-17-,20-18-/t39-/m1/s1 InChIKey=MWRBNPKJOOWZPW-NYVOMTAGSA-N CCCCCCCC/C=C\CCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[NH3+])OC(=O)CCCCCCC/C=C\CCCCCCCC +MNXM1107708 (9Z)-octadecenoate chebi:30823 C18H33O2 -1 281.24860 InChI=1S/C18H34O2/c1-2-3-4-5-6-7-8-9-10-11-12-13-14-15-16-17-18(19)20/h9-10H,2-8,11-17H2,1H3,(H,19,20)/p-1/b10-9- InChIKey=ZQPPMHVWECSIRJ-KTKRTIGZSA-M CCCCCCCC/C=C\CCCCCCCC(=O)[O-] \ No newline at end of file diff --git a/metanetx_uniprot/TestingFiles/chem_xref.tsv b/metanetx_uniprot/TestingFiles/chem_xref.tsv new file mode 100644 index 00000000..9ce7e27d --- /dev/null +++ b/metanetx_uniprot/TestingFiles/chem_xref.tsv @@ -0,0 +1,362 @@ +### MetaNetX/MNXref reconciliation ### +#Based on the following resources: +# +#RESOURCE: MetaNetX/MNXref +#VERSION: 4.4 +#DATE: 2022/03/16 +#URL: https://www.metanetx.org +#LICENSE: +# MetaNetX copyright 2011 SystemsX, SIB Swiss Institute of Bioinformatics +# Except where otherwise noted, the data available from this site are +# licensed under a Creative Commons Attribution 4.0 International License. 
+# MNXref uses information on cellular compartments, reactions, and +# metabolites that is sourced from a number of external resources. The +# licensing agreements of those resources are specified in each of the +# downloadable files listed below. For each compound, reaction and +# cellular compartment in the MNXref namespace we indicate which external +# resource provided the information used in MNXref. Compounds and +# reactions in the MNXref namespace may be identical to, or differ from, +# those in the external resource. In either case the data from MNXref may +# be considered to be subject to the original licensing restrictions of +# the external resource. +# (https://www.metanetx.org/mnxdoc/mnxref.html) +# +#RESOURCE: BiGG +#VERSION: 1.6.0, last updated: 2019/10/31 (downloaded on 2021/07/23) +#URL: http://bigg.ucsd.edu +#LICENSE: +# Copyright 2015 The Regents of the University of California +# +# All Rights Reserved +# +# Permission to use, copy, modify and distribute any part of BiGG Models +# for educational, research and non-profit purposes, without fee, and +# without a written agreement is hereby granted, provided that the above +# copyright notice, this paragraph and the following three paragraphs +# appear in all copies. +# +# Those desiring to incorporate BiGG Models into commercial products or +# use for commercial purposes should contact the Technology Transfer & +# Intellectual Property Services, University of California, San Diego, +# 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910, Ph: (858) +# 534-5815, FAX: (858) 534-7345, e-mail: invent@ucsd.edu. +# +# In no event shall the University of California be liable to any party +# for direct, indirect, special, incidental, or consequential damages, +# including lost profits, arising out of the use of this bigg database, +# even if the University of California has been advised of the possibility +# of such damage. +# +# The BiGG Models provided herein is on an "as is" basis, and the +# University of California has no obligation to provide maintenance, +# support, updates, enhancements, or modifications. The University of +# California makes no representations and extends no warranties of any +# kind, either implied or express, including, but not limited to, the +# implied warranties of merchantability or fitness for a particular +# purpose, or that the use of the BiGG Models will not infringe any +# patent, trademark or other rights. +# (http://bigg.ucsd.edu/) +# +#RESOURCE: The Cell Component Ontology +#VERSION: 25.0 (downloaded on 2021/06/03) +#URL: https://bioinformatics.ai.sri.com/CCO/ +#LICENSE: +# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome +# databases. +# +# 2.1 Open Databases. 
SRI hereby grants to LICENSEE a non-exclusive, +# royalty-free license to use, modify and redistribute the Open Databases +# (as such term is defined in Exhibit B) and LICENSEE's modified +# versions thereof on a royalty-free basis, worldwide and for any purpose; +# provided, in each case, that if LICENSEE modifies any Open Database (the +# modified version being a "Modified Open Database"), then (i) +# LICENSEE must provide a copy of the Modified Open Database to SRI (and +# hereby grants to SRI a nonexclusive, royalty-free license to use, +# modify, and redistribute the Modified Open Database worldwide and for +# any purpose and to authorize others to do so); and (ii) any Modified +# Open Databases, or websites from which such Modified Open Databases may +# be obtained, must clearly and prominently: +# +# (a) identify the Open Databases from which they were derived: +# +# (b) include all applicable copyright notices and author lists from the +# Open Databases from which they were derived; and +# +# (c) identify or summarize all modifications that were made. +# +# Any distribution of such Modified Open Databases without the required +# notices is a violation of SRI's and its licensors' copyright and other +# proprietary rights. All trademarks, service marks, and trade names are +# proprietary to SRI and its licensors. The Open Databases, including any +# files incorporated in or generated from the Open Databases and data +# accompanying the Open Databases, are licensed to LICENSEE by SRI and its +# licensors, and SRI and its licensors do not transfer title or any other +# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open +# Databases except as otherwise specified herein. +# +# 2.1.1 If SRI, in its sole discretion, determines that a Modified +# Database is of sufficient quality and interest to the community to be +# hosted on biocyc.org, then SRI may (if the Modified Database includes +# significant curation over the original Open Database it is derived from, +# or the last version of the Modified Database provided to SRI) provide to +# LICENSEE a personal, one-year subscription to biocyc at no cost; +# provided, however, that if LICENSEE edits the Modified Database via a +# MySQL server operated by SRI or its contractors, such free one-year +# subscription will be forfeited. +# (https://biocyc.org/ptools-academic-license.shtml) +# +#RESOURCE: ChEBI +#VERSION: 203 (downloaded on 2021/09/30) +#URL: https://www.ebi.ac.uk/chebi/ +#LICENSE: +# All data in the database is non-proprietary or is derived from a +# non-proprietary source. It is thus freely accessible and available to +# anyone. In addition, each data item is fully traceable and explicitly +# referenced to the original source. +# (https://www.ebi.ac.uk/chebi/aboutChebiForward.do) +# +#RESOURCE: enviPath +#VERSION: (downloaded on 2021/11/24) +#URL: https://envipath.org +#LICENSE: +# The core data sets of enviPath are licensed under the Creative Commons +# Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) +# license. This allows you to use them in a non-commercial context, for +# example if you work at a University or for a public research institute. +# You can even redistribute and modify the data using the same license. If +# you want to use the data commercially, contact us, we offer commercial +# license agreements. +# We summarized how you can use the data on our license page. 
+# (https://envipath.com/license/) +# +#RESOURCE: HMDB +#VERSION: 4.0 (downloaded on 2021/06/18) +#URL: https://hmdb.ca +#LICENSE: +# HMDB is offered to the public as a freely available resource. Use and +# re-distribution of the data, in whole or in part, for commercial +# purposes requires explicit permission of the authors and explicit +# acknowledgment of the source material (HMDB) and the original +# publication. +# (https://hmdb.ca/about) +# +#RESOURCE: KEGG +#VERSION: 98.0+/06-11, Jun 21 (downloaded on 2021/06/11) +#URL: https://www.kegg.jp +#LICENSE: +# Academic users may freely use the KEGG website and may also freely link +# to the KEGG website. +# Non-academic users may use the KEGG website as end users for +# non-commercial purposes, but any other use requires a license agreement. +# Academic users who utilize KEGG for providing academic services are +# requested to obtain a KEGG FTP subscription for organizational use, +# which includes a proper license agreement. +# Non-academic users and Academic users intending to use KEGG for +# commercial purposes are requested to obtain a license agreement through +# KEGG's exclusive licensing agent, Pathway Solutions. +# (https://www.kegg.jp/kegg/legal.html) +# +#RESOURCE: LipidMaps +#VERSION: 2021-05-28 (downloaded on 2021/06/11) +#URL: https://www.lipidmaps.org +#LICENSE: +# The Lipidomics Gateway is provided on an "as is" basis, without warranty +# or representation of any kind, express or implied. The content of the +# Lipidomics Gateway website is protected by international copyright, +# trademark and other laws. You may download articles and web pages from +# this site for your personal, non-commercial use only, provided that you +# keep intact all authorship, copyright and other proprietary notices. The +# Featured Lipid can also be used for educational purposes, provided that +# credit is given to the Lipidomics Gateway. If you use the Lipidomics +# Gateway, you accept these terms. The LIPID MAPS Consortium reserves the +# right to modify these terms at any time. +# (https://www.lipidmaps.org/about/) +# +#RESOURCE: MetaCyc +#VERSION: 25.0 (downloaded on 2021/06/03) +#URL: https://metacyc.org +#LICENSE: +# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome +# databases. +# +# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive, +# royalty-free license to use, modify and redistribute the Open Databases +# (as such term is defined in Exhibit B) and LICENSEE's modified +# versions thereof on a royalty-free basis, worldwide and for any purpose; +# provided, in each case, that if LICENSEE modifies any Open Database (the +# modified version being a "Modified Open Database"), then (i) +# LICENSEE must provide a copy of the Modified Open Database to SRI (and +# hereby grants to SRI a nonexclusive, royalty-free license to use, +# modify, and redistribute the Modified Open Database worldwide and for +# any purpose and to authorize others to do so); and (ii) any Modified +# Open Databases, or websites from which such Modified Open Databases may +# be obtained, must clearly and prominently: +# +# (a) identify the Open Databases from which they were derived: +# +# (b) include all applicable copyright notices and author lists from the +# Open Databases from which they were derived; and +# +# (c) identify or summarize all modifications that were made. +# +# Any distribution of such Modified Open Databases without the required +# notices is a violation of SRI's and its licensors' copyright and other +# proprietary rights. 
All trademarks, service marks, and trade names are +# proprietary to SRI and its licensors. The Open Databases, including any +# files incorporated in or generated from the Open Databases and data +# accompanying the Open Databases, are licensed to LICENSEE by SRI and its +# licensors, and SRI and its licensors do not transfer title or any other +# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open +# Databases except as otherwise specified herein. +# +# 2.1.1 If SRI, in its sole discretion, determines that a Modified +# Database is of sufficient quality and interest to the community to be +# hosted on biocyc.org, then SRI may (if the Modified Database includes +# significant curation over the original Open Database it is derived from, +# or the last version of the Modified Database provided to SRI) provide to +# LICENSEE a personal, one-year subscription to biocyc at no cost; +# provided, however, that if LICENSEE edits the Modified Database via a +# MySQL server operated by SRI or its contractors, such free one-year +# subscription will be forfeited. +# (https://biocyc.org/ptools-academic-license.shtml) +# +#RESOURCE: Reactome +#VERSION: 77 June 14, 2021 (downloaded on 2021/09/03) +#URL: https://reactome.org +#LICENSE: +# Reactome is an open source and open access resource, available to anyone. +# Usage of Reactome material is covered by two Creative Commons licenses: +# +# The terms of the Creative Commons Public Domain (CC0) License apply to all +# Reactome annotation files, e.g. identifier mapping data, specialized data +# files, and interaction data derived from Reactome. +# (https://reactome.org/license/) +# +#RESOURCE: Rhea +#VERSION: 119 (downloaded on 2021/11/03) +#URL: https://www.rhea-db.org +#LICENSE: +# All data in Rhea is freely accessible and available for anyone to use under +# the Creative Commons Attribution License. +# (https://www.rhea-db.org/documentation) +# +#RESOURCE: SABIO-RK +#VERSION: Software Update: 2021/05/11 -- Database Release: 2021/05/28 (downloaded on 2021/07/01) +#URL: http://sabiork.h-its.org +#LICENSE: +# HITS, gGmbH HITS own the SABIO-RK database, its interfaces and its +# associated documentation (all referred to in the following as +# "Database"). You should carefully read the following terms and +# conditions before using this Database. Your use of this Database +# indicates your acceptance of this license agreement and all terms and +# conditions.You are hereby granted a non-exclusive and non-transferable +# license to use the Database according to the following terms and +# conditions. This license is to use the Database for Non-Commercial +# Purpose only. Non-Commercial Purpose means the use of the Database +# solely for internal non-commercial research and academic purposes. +# Non-Commercial Purpose excludes, without limitation, any use of the +# Database, as part of, or in any way in connection with a product or +# service which is sold, offered for sale, licensed, leased, loaned, or +# rented. Permission to use this Database for Non-Commercial Purpose is +# hereby granted without fee and subject to the following terms of this +# license. +# +# Commercial Use +# If you desire to use the Database for profit-making or commercial +# purposes, you agree to negotiate in good faith a license with the HITS +# prior to such profit-making or commercial use. The HITS shall have no +# obligation to grant such license to you, and may grant exclusive or +# non-exclusive licenses to others. 
You agree to notify the HITS of any +# inquiries you have for commercial use of the Database and/or its +# modifications. You may contact the following email to discuss commercial +# use: sabiork at h-its.org +# +# Governing Law +# This Agreement is governed by the law of the Federal Republic of +# Germany. The application of the UN Convention on the Sale of Goods is +# excluded. +# +# Disclaimer of Warranty +# Because this Database is licensed free of charge, there is no warranty +# for the data in it contained and the methods used for its querying. The +# HITS makes no warranty or representation that the operation of the +# Database in this compilation will be error-free, and the HITS is under +# no obligation to provide any services, by way of maintenance, update, or +# otherwise. +# +# THIS DATABASE AND THE ACCOMPANYING FILES ARE LICENSED "AS IS" AND +# WITHOUT WARRANTIES AS TO PERFORMANCE OR MERCHANTABILITY OR ANY OTHER +# WARRANTIES WHETHER EXPRESSED OR IMPLIED. NO WARRANTY OF FITNESS FOR A +# PARTICULAR PURPOSE IS OFFERED. THE ENTIRE RISK AS TO THE QUALITY AND +# PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE +# DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR +# CORRECTION. +# +# Limitation of Liability +# IN NO EVENT WILL HITS, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +# REDISTRIBUTE THE DATABASE AS PERMITTED ABOVE, BE LIABLE TO YOU FOR +# DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL +# DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM +# (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED +# INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF +# THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF VTIP AND HITS +# OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. +# +# Reference to SABIO-RK Users will cite SABIO-RK in publications or +# presentations, whenever the data used was extracted from the database. +# Termination This agreement is effective until terminated. You may +# terminate this agreement at any time by destroying all associated +# material (e.g., documentation or web service clients) to the database in +# your possession and by stopping any access to the database directly or +# from software generated by you. This agreement will terminate +# immediately without notice from and HITS if you fail to comply with any +# of the terms and conditions of this license. This agreement will also +# terminate immediately without notice from the HITS if it is found to +# implement patented algorithms or contain copyrighted code not owned or +# licensed the HITS for the purpose of its inclusion in the SABIO-RK +# Database. This agreement cannot be terminated by any other mechanism or +# for any other reason than those stated herein. +# +# Place of Court +# The exclusive venue for all disputes arising from or in connection with +# this Agreement is Mannheim, Germany (HRB 337446), when the Licensee is a +# business person, a legal entity governed by public law, or a special +# fund governed by public law, or does not have a general place of +# jurisdiction within the Federal Republic of Germany. Address all +# correspondence regarding this license to electronic mail address: +# sabiork at h-its.org Any inquiries and comments regarding bugs, bug +# fixes, enhancements, modifications or any other similar issues should be +# directed to: sabiork at h-its.org +# +# Copyright 2007 by HITS, gGmbH. All rights reserved. 
+# (http://sabiork.h-its.org/layouts/content/termscondition.gsp) +# +#RESOURCE: The SEED +#VERSION: 2.6.1 (July 31, 2020) (downloaded on 2021/08/09) +#URL: https://modelseed.org +#LICENSE: +# All tools and datasets that make up the SEED are in the public domain. +# (https://modelseed.org) +# +#RESOURCE: SwissLipids +#VERSION: (downloaded on 2021/07/29) +#URL: https://www.swisslipids.org +#LICENSE: +# SwissLipids is licensed under a Creative Commons Attribution-Non +# Commercial-NoDerivatives 4.0 International License. +# +# Commercial users and those who wish to use this work for commercial +# purposes please contact the SIB technology transfer officer at: +# marc.filliettaz@genebio.com +# (https://www.swisslipids.org/#/downloads) +#source ID description +BIOMASS BIOMASS BIOMASS +CHEBI:57783 MNXM738702 NADPH||2'-O-phosphonatoadenosine 5'-{3-[1-(3-carbamoyl-1,4-dihydropyridin-1-yl)-1,4-anhydro-D-ribitol-5-yl] diphosphate}||NADPH tetraanion||NADPH(4-) +CHEBI:77932 MNXM97613 tetracycline||(1S,4aS,11S,11aS,12aS)-3-carbamoyl-1-(dimethylazaniumyl)-4a,5,7,11-tetrahydroxy-11-methyl-4,6-dioxo-1,4,4a,6,11,11a,12,12a-octahydrotetracen-2-olate||tetracycline zwitterion +CHEBI:132727 MNXM162730 11a-hydroxytetracycline||(1S,4aR5aS,11S,11aR,12aS)-3-carbamoyl-1-(dimethylazaniumyl)-4a,5a,7,11-tetrahydroxy-11-methyl-4,5,6-trioxo-1,4,4a,5,5a,6,11,11a,12,12a-decahydrotetracen-2-olate||11a-hydroxytetracycline zwitterion +chebi:15377 WATER H2O||BOUND WATER||HOH||WATER||Wasser||Water||[OH2]||acqua||agua||aqua||dihydridooxygen||dihydrogen oxide||eau||hydrogen hydroxide||oxidane||water +CHEBI:58349 MNXM5 NADP(+)||2'-O-phosphonatoadenosine 5'-{3-[1-(3-carbamoylpyridinio)-1,4-anhydro-D-ribitol-5-yl] diphosphate}||NADP trianion||NADP(3-) +CHEBI:74986 MNXM737425 1,2-di-(9Z-octadecenoyl)-sn-glycero-3-phosphoethanolamine||1,2-dioleoyl-sn-glycero-3-phosphoethanolamine zwitterion||1-(9Z)-octadecenoyl-2-(9Z)-octadecenoyl-sn-glycero-3-phosphoethanolamine zwitterion||1-C18:1(omega-9)-2-C18:1(omega-9)-phosphatidylethanolamine zwitterion||2-azaniumylethyl (2R)-2,3-bis[(9Z)-octadec-9-enoyloxy]propyl phosphate +chebi:14389 MNXM738220 secondary/obsolete/fantasy identifier +CHEBI:15378 MNXM1 H(+)||H+||Hydron||hydrogen(1+)||hydron +CHEBI:30823 MNXM1107708 (9Z)-octadecenoate||(9Z)-octadec-9-enoate||(Z)-9-octadecenoic acid, ion(1-)||Oleat||cis-9-octadecenoate||oleate||oleic acid anion \ No newline at end of file diff --git a/metanetx_uniprot/TestingFiles/ncbitaxon.json b/metanetx_uniprot/TestingFiles/ncbitaxon.json new file mode 100644 index 00000000..2324a45b --- /dev/null +++ b/metanetx_uniprot/TestingFiles/ncbitaxon.json @@ -0,0 +1,188 @@ +{ + "graphs" : [ { + "nodes" : [ { + "id" : "http://purl.obolibrary.org/obo/NCBITaxon_817", + "meta" : { + "xrefs" : [ { + "val" : "PMID:16559622" + }, { + "val" : "GC_ID:11" + }, { + "val" : "PMID:28066339" + } ], + "synonyms" : [ { + "pred" : "hasRelatedSynonym", + "val" : "Bacteroides incommunis", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Pseudobacterium fragilis", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Ristella uncata", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Ristella incommunis", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" 
: "Bacteroides inaequalis", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Pseudobacterium incommunis", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Bacteroides uncatus", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Sphaerophorus inaequalis", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Fusiformis fragilis", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Sphaerophorus intermedius", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Pseudobacterium inaequalis", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Bacillus fragilis", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Ristella fragilis", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Pseudobacterium uncatum", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + } ], + "basicPropertyValues" : [ { + "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId", + "val" : "NCBITaxon:665938" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId", + "val" : "NCBITaxon:33929" + }, { + "pred" : "http://purl.obolibrary.org/obo/ncbitaxon#has_rank", + "val" : "http://purl.obolibrary.org/obo/NCBITaxon_species" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#hasOBONamespace", + "val" : "ncbi_taxonomy" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId", + "val" : "NCBITaxon:469587" + } ] + }, + "type" : "CLASS", + "lbl" : "Bacteroides fragilis" + }, { + "id" : "http://purl.obolibrary.org/obo/NCBITaxon_562", + "meta" : { + "xrefs" : [ { + "val" : "GC_ID:11" + }, { + "val" : "PMID:10319482" + } ], + "synonyms" : [ { + "pred" : "hasRelatedSynonym", + "val" : "Enterococcus coli", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasExactSynonym", + "val" : "Escherichia/Shigella coli", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#equivalent_name" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Bacillus coli", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Bacterium coli", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasRelatedSynonym", + "val" : "Bacterium coli commune", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#synonym" + }, { + "pred" : "hasExactSynonym", + "val" : "E. 
coli", + "xrefs" : [ ], + "synonymType" : "http://purl.obolibrary.org/obo/ncbitaxon#common_name" + } ], + "basicPropertyValues" : [ { + "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId", + "val" : "NCBITaxon:1806490" + }, { + "pred" : "http://purl.obolibrary.org/obo/ncbitaxon#has_rank", + "val" : "http://purl.obolibrary.org/obo/NCBITaxon_species" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId", + "val" : "NCBITaxon:469598" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId", + "val" : "NCBITaxon:1637691" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#hasOBONamespace", + "val" : "ncbi_taxonomy" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId", + "val" : "NCBITaxon:662104" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#hasAlternativeId", + "val" : "NCBITaxon:662101" + } ] + }, + "type" : "CLASS", + "lbl" : "Escherichia coli" + } ], + "edges" : [ { + "sub" : "http://purl.obolibrary.org/obo/NCBITaxon_295405", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/NCBITaxon_817" + }, { + "sub" : "http://purl.obolibrary.org/obo/NCBITaxon_1389418", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/NCBITaxon_562" + } ], + "id" : "http://purl.obolibrary.org/obo/ncbitaxon.owl", + "meta" : { + "subsets" : [ ], + "xrefs" : [ ], + "basicPropertyValues" : [ ] + }, + "equivalentNodesSets" : [ ], + "logicalDefinitionAxioms" : [ ], + "`domainRangeAxioms`" : [ ], + "propertyChainAxioms" : [ ] + } ] +} \ No newline at end of file diff --git a/metanetx_uniprot/TestingFiles/reac_prop.tsv b/metanetx_uniprot/TestingFiles/reac_prop.tsv new file mode 100644 index 00000000..75826bdb --- /dev/null +++ b/metanetx_uniprot/TestingFiles/reac_prop.tsv @@ -0,0 +1,359 @@ +### MetaNetX/MNXref reconciliation ### +#Based on the following resources: +# +#RESOURCE: MetaNetX/MNXref +#VERSION: 4.4 +#DATE: 2022/03/16 +#URL: https://www.metanetx.org +#LICENSE: +# MetaNetX copyright 2011 SystemsX, SIB Swiss Institute of Bioinformatics +# Except where otherwise noted, the data available from this site are +# licensed under a Creative Commons Attribution 4.0 International License. +# MNXref uses information on cellular compartments, reactions, and +# metabolites that is sourced from a number of external resources. The +# licensing agreements of those resources are specified in each of the +# downloadable files listed below. For each compound, reaction and +# cellular compartment in the MNXref namespace we indicate which external +# resource provided the information used in MNXref. Compounds and +# reactions in the MNXref namespace may be identical to, or differ from, +# those in the external resource. In either case the data from MNXref may +# be considered to be subject to the original licensing restrictions of +# the external resource. +# (https://www.metanetx.org/mnxdoc/mnxref.html) +# +#RESOURCE: BiGG +#VERSION: 1.6.0, last updated: 2019/10/31 (downloaded on 2021/07/23) +#URL: http://bigg.ucsd.edu +#LICENSE: +# Copyright 2015 The Regents of the University of California +# +# All Rights Reserved +# +# Permission to use, copy, modify and distribute any part of BiGG Models +# for educational, research and non-profit purposes, without fee, and +# without a written agreement is hereby granted, provided that the above +# copyright notice, this paragraph and the following three paragraphs +# appear in all copies. 
+# +# Those desiring to incorporate BiGG Models into commercial products or +# use for commercial purposes should contact the Technology Transfer & +# Intellectual Property Services, University of California, San Diego, +# 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910, Ph: (858) +# 534-5815, FAX: (858) 534-7345, e-mail: invent@ucsd.edu. +# +# In no event shall the University of California be liable to any party +# for direct, indirect, special, incidental, or consequential damages, +# including lost profits, arising out of the use of this bigg database, +# even if the University of California has been advised of the possibility +# of such damage. +# +# The BiGG Models provided herein is on an "as is" basis, and the +# University of California has no obligation to provide maintenance, +# support, updates, enhancements, or modifications. The University of +# California makes no representations and extends no warranties of any +# kind, either implied or express, including, but not limited to, the +# implied warranties of merchantability or fitness for a particular +# purpose, or that the use of the BiGG Models will not infringe any +# patent, trademark or other rights. +# (http://bigg.ucsd.edu/) +# +#RESOURCE: The Cell Component Ontology +#VERSION: 25.0 (downloaded on 2021/06/03) +#URL: https://bioinformatics.ai.sri.com/CCO/ +#LICENSE: +# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome +# databases. +# +# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive, +# royalty-free license to use, modify and redistribute the Open Databases +# (as such term is defined in Exhibit B) and LICENSEE's modified +# versions thereof on a royalty-free basis, worldwide and for any purpose; +# provided, in each case, that if LICENSEE modifies any Open Database (the +# modified version being a "Modified Open Database"), then (i) +# LICENSEE must provide a copy of the Modified Open Database to SRI (and +# hereby grants to SRI a nonexclusive, royalty-free license to use, +# modify, and redistribute the Modified Open Database worldwide and for +# any purpose and to authorize others to do so); and (ii) any Modified +# Open Databases, or websites from which such Modified Open Databases may +# be obtained, must clearly and prominently: +# +# (a) identify the Open Databases from which they were derived: +# +# (b) include all applicable copyright notices and author lists from the +# Open Databases from which they were derived; and +# +# (c) identify or summarize all modifications that were made. +# +# Any distribution of such Modified Open Databases without the required +# notices is a violation of SRI's and its licensors' copyright and other +# proprietary rights. All trademarks, service marks, and trade names are +# proprietary to SRI and its licensors. The Open Databases, including any +# files incorporated in or generated from the Open Databases and data +# accompanying the Open Databases, are licensed to LICENSEE by SRI and its +# licensors, and SRI and its licensors do not transfer title or any other +# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open +# Databases except as otherwise specified herein. 
+# +# 2.1.1 If SRI, in its sole discretion, determines that a Modified +# Database is of sufficient quality and interest to the community to be +# hosted on biocyc.org, then SRI may (if the Modified Database includes +# significant curation over the original Open Database it is derived from, +# or the last version of the Modified Database provided to SRI) provide to +# LICENSEE a personal, one-year subscription to biocyc at no cost; +# provided, however, that if LICENSEE edits the Modified Database via a +# MySQL server operated by SRI or its contractors, such free one-year +# subscription will be forfeited. +# (https://biocyc.org/ptools-academic-license.shtml) +# +#RESOURCE: ChEBI +#VERSION: 203 (downloaded on 2021/09/30) +#URL: https://www.ebi.ac.uk/chebi/ +#LICENSE: +# All data in the database is non-proprietary or is derived from a +# non-proprietary source. It is thus freely accessible and available to +# anyone. In addition, each data item is fully traceable and explicitly +# referenced to the original source. +# (https://www.ebi.ac.uk/chebi/aboutChebiForward.do) +# +#RESOURCE: enviPath +#VERSION: (downloaded on 2021/11/24) +#URL: https://envipath.org +#LICENSE: +# The core data sets of enviPath are licensed under the Creative Commons +# Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) +# license. This allows you to use them in a non-commercial context, for +# example if you work at a University or for a public research institute. +# You can even redistribute and modify the data using the same license. If +# you want to use the data commercially, contact us, we offer commercial +# license agreements. +# We summarized how you can use the data on our license page. +# (https://envipath.com/license/) +# +#RESOURCE: HMDB +#VERSION: 4.0 (downloaded on 2021/06/18) +#URL: https://hmdb.ca +#LICENSE: +# HMDB is offered to the public as a freely available resource. Use and +# re-distribution of the data, in whole or in part, for commercial +# purposes requires explicit permission of the authors and explicit +# acknowledgment of the source material (HMDB) and the original +# publication. +# (https://hmdb.ca/about) +# +#RESOURCE: KEGG +#VERSION: 98.0+/06-11, Jun 21 (downloaded on 2021/06/11) +#URL: https://www.kegg.jp +#LICENSE: +# Academic users may freely use the KEGG website and may also freely link +# to the KEGG website. +# Non-academic users may use the KEGG website as end users for +# non-commercial purposes, but any other use requires a license agreement. +# Academic users who utilize KEGG for providing academic services are +# requested to obtain a KEGG FTP subscription for organizational use, +# which includes a proper license agreement. +# Non-academic users and Academic users intending to use KEGG for +# commercial purposes are requested to obtain a license agreement through +# KEGG's exclusive licensing agent, Pathway Solutions. +# (https://www.kegg.jp/kegg/legal.html) +# +#RESOURCE: LipidMaps +#VERSION: 2021-05-28 (downloaded on 2021/06/11) +#URL: https://www.lipidmaps.org +#LICENSE: +# The Lipidomics Gateway is provided on an "as is" basis, without warranty +# or representation of any kind, express or implied. The content of the +# Lipidomics Gateway website is protected by international copyright, +# trademark and other laws. You may download articles and web pages from +# this site for your personal, non-commercial use only, provided that you +# keep intact all authorship, copyright and other proprietary notices. 
The +# Featured Lipid can also be used for educational purposes, provided that +# credit is given to the Lipidomics Gateway. If you use the Lipidomics +# Gateway, you accept these terms. The LIPID MAPS Consortium reserves the +# right to modify these terms at any time. +# (https://www.lipidmaps.org/about/) +# +#RESOURCE: MetaCyc +#VERSION: 25.0 (downloaded on 2021/06/03) +#URL: https://metacyc.org +#LICENSE: +# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome +# databases. +# +# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive, +# royalty-free license to use, modify and redistribute the Open Databases +# (as such term is defined in Exhibit B) and LICENSEE's modified +# versions thereof on a royalty-free basis, worldwide and for any purpose; +# provided, in each case, that if LICENSEE modifies any Open Database (the +# modified version being a "Modified Open Database"), then (i) +# LICENSEE must provide a copy of the Modified Open Database to SRI (and +# hereby grants to SRI a nonexclusive, royalty-free license to use, +# modify, and redistribute the Modified Open Database worldwide and for +# any purpose and to authorize others to do so); and (ii) any Modified +# Open Databases, or websites from which such Modified Open Databases may +# be obtained, must clearly and prominently: +# +# (a) identify the Open Databases from which they were derived: +# +# (b) include all applicable copyright notices and author lists from the +# Open Databases from which they were derived; and +# +# (c) identify or summarize all modifications that were made. +# +# Any distribution of such Modified Open Databases without the required +# notices is a violation of SRI's and its licensors' copyright and other +# proprietary rights. All trademarks, service marks, and trade names are +# proprietary to SRI and its licensors. The Open Databases, including any +# files incorporated in or generated from the Open Databases and data +# accompanying the Open Databases, are licensed to LICENSEE by SRI and its +# licensors, and SRI and its licensors do not transfer title or any other +# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open +# Databases except as otherwise specified herein. +# +# 2.1.1 If SRI, in its sole discretion, determines that a Modified +# Database is of sufficient quality and interest to the community to be +# hosted on biocyc.org, then SRI may (if the Modified Database includes +# significant curation over the original Open Database it is derived from, +# or the last version of the Modified Database provided to SRI) provide to +# LICENSEE a personal, one-year subscription to biocyc at no cost; +# provided, however, that if LICENSEE edits the Modified Database via a +# MySQL server operated by SRI or its contractors, such free one-year +# subscription will be forfeited. +# (https://biocyc.org/ptools-academic-license.shtml) +# +#RESOURCE: Reactome +#VERSION: 77 June 14, 2021 (downloaded on 2021/09/03) +#URL: https://reactome.org +#LICENSE: +# Reactome is an open source and open access resource, available to anyone. +# Usage of Reactome material is covered by two Creative Commons licenses: +# +# The terms of the Creative Commons Public Domain (CC0) License apply to all +# Reactome annotation files, e.g. identifier mapping data, specialized data +# files, and interaction data derived from Reactome. 
+# (https://reactome.org/license/) +# +#RESOURCE: Rhea +#VERSION: 119 (downloaded on 2021/11/03) +#URL: https://www.rhea-db.org +#LICENSE: +# All data in Rhea is freely accessible and available for anyone to use under +# the Creative Commons Attribution License. +# (https://www.rhea-db.org/documentation) +# +#RESOURCE: SABIO-RK +#VERSION: Software Update: 2021/05/11 -- Database Release: 2021/05/28 (downloaded on 2021/07/01) +#URL: http://sabiork.h-its.org +#LICENSE: +# HITS, gGmbH HITS own the SABIO-RK database, its interfaces and its +# associated documentation (all referred to in the following as +# "Database"). You should carefully read the following terms and +# conditions before using this Database. Your use of this Database +# indicates your acceptance of this license agreement and all terms and +# conditions.You are hereby granted a non-exclusive and non-transferable +# license to use the Database according to the following terms and +# conditions. This license is to use the Database for Non-Commercial +# Purpose only. Non-Commercial Purpose means the use of the Database +# solely for internal non-commercial research and academic purposes. +# Non-Commercial Purpose excludes, without limitation, any use of the +# Database, as part of, or in any way in connection with a product or +# service which is sold, offered for sale, licensed, leased, loaned, or +# rented. Permission to use this Database for Non-Commercial Purpose is +# hereby granted without fee and subject to the following terms of this +# license. +# +# Commercial Use +# If you desire to use the Database for profit-making or commercial +# purposes, you agree to negotiate in good faith a license with the HITS +# prior to such profit-making or commercial use. The HITS shall have no +# obligation to grant such license to you, and may grant exclusive or +# non-exclusive licenses to others. You agree to notify the HITS of any +# inquiries you have for commercial use of the Database and/or its +# modifications. You may contact the following email to discuss commercial +# use: sabiork at h-its.org +# +# Governing Law +# This Agreement is governed by the law of the Federal Republic of +# Germany. The application of the UN Convention on the Sale of Goods is +# excluded. +# +# Disclaimer of Warranty +# Because this Database is licensed free of charge, there is no warranty +# for the data in it contained and the methods used for its querying. The +# HITS makes no warranty or representation that the operation of the +# Database in this compilation will be error-free, and the HITS is under +# no obligation to provide any services, by way of maintenance, update, or +# otherwise. +# +# THIS DATABASE AND THE ACCOMPANYING FILES ARE LICENSED "AS IS" AND +# WITHOUT WARRANTIES AS TO PERFORMANCE OR MERCHANTABILITY OR ANY OTHER +# WARRANTIES WHETHER EXPRESSED OR IMPLIED. NO WARRANTY OF FITNESS FOR A +# PARTICULAR PURPOSE IS OFFERED. THE ENTIRE RISK AS TO THE QUALITY AND +# PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE +# DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR +# CORRECTION. 
+# +# Limitation of Liability +# IN NO EVENT WILL HITS, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +# REDISTRIBUTE THE DATABASE AS PERMITTED ABOVE, BE LIABLE TO YOU FOR +# DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL +# DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM +# (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED +# INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF +# THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF VTIP AND HITS +# OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. +# +# Reference to SABIO-RK Users will cite SABIO-RK in publications or +# presentations, whenever the data used was extracted from the database. +# Termination This agreement is effective until terminated. You may +# terminate this agreement at any time by destroying all associated +# material (e.g., documentation or web service clients) to the database in +# your possession and by stopping any access to the database directly or +# from software generated by you. This agreement will terminate +# immediately without notice from and HITS if you fail to comply with any +# of the terms and conditions of this license. This agreement will also +# terminate immediately without notice from the HITS if it is found to +# implement patented algorithms or contain copyrighted code not owned or +# licensed the HITS for the purpose of its inclusion in the SABIO-RK +# Database. This agreement cannot be terminated by any other mechanism or +# for any other reason than those stated herein. +# +# Place of Court +# The exclusive venue for all disputes arising from or in connection with +# this Agreement is Mannheim, Germany (HRB 337446), when the Licensee is a +# business person, a legal entity governed by public law, or a special +# fund governed by public law, or does not have a general place of +# jurisdiction within the Federal Republic of Germany. Address all +# correspondence regarding this license to electronic mail address: +# sabiork at h-its.org Any inquiries and comments regarding bugs, bug +# fixes, enhancements, modifications or any other similar issues should be +# directed to: sabiork at h-its.org +# +# Copyright 2007 by HITS, gGmbH. All rights reserved. +# (http://sabiork.h-its.org/layouts/content/termscondition.gsp) +# +#RESOURCE: The SEED +#VERSION: 2.6.1 (July 31, 2020) (downloaded on 2021/08/09) +#URL: https://modelseed.org +#LICENSE: +# All tools and datasets that make up the SEED are in the public domain. +# (https://modelseed.org) +# +#RESOURCE: SwissLipids +#VERSION: (downloaded on 2021/07/29) +#URL: https://www.swisslipids.org +#LICENSE: +# SwissLipids is licensed under a Creative Commons Attribution-Non +# Commercial-NoDerivatives 4.0 International License. 
+# +# Commercial users and those who wish to use this work for commercial +# purposes please contact the SIB technology transfer officer at: +# marc.filliettaz@genebio.com +# (https://www.swisslipids.org/#/downloads) +#ID mnx_equation reference classifs is_balanced is_transport +EMPTY = mnx:EMPTY B +MNXR114744 1 MNXM162730@MNXD1 + 1 MNXM5@MNXD1 + 1 WATER@MNXD1 = 1 MNXM1@MNXD1 + 1 MNXM735438@MNXD1 + 1 MNXM738702@MNXD1 + 1 MNXM97613@MNXD1 rheaR:50004 1.14.13.231 B +MNXR171656 1 MNXM5@MNXD1 + 1 MNXM743287@MNXD1 + 1 WATER@MNXD1 = 1 MNXM735438@MNXD1 + 1 MNXM738702@MNXD1 + 1 MNXM743286@MNXD1 rheaR:61444 +MNXR168222 1 MNXM1089988@MNXD1 + 1 MNXM1102167@MNXD1 = 1 MNXM1089989@MNXD1 + 1 MNXM1102072@MNXD1 rheaR:42776 2.1.1.180 +MNXR165961 1 MNXM1107698@MNXD1 + 1 WATER@MNXD1 = 1 MNXM1108087@MNXD1 + 1 MNXM728579@MNXD1 rheaR:18689 3.1.1.32 +MNXR171532 2 MNXM1107708@MNXD1 + 2 MNXM1@MNXD1 + 1 MNXM734941@MNXD1 = 1 MNXM737425@MNXD1 + 2 WATER@MNXD1 rheaR:60624 B +MNXR171532 2 MNXM1107708@MNXD1 + 2 MNXM1@MNXD1 + 1 MNXM734941@MNXD1 = 1 MNXM737425@MNXD1 + 2 WATER@MNXD1 rheaR:60624 B \ No newline at end of file diff --git a/metanetx_uniprot/TestingFiles/reac_xref.tsv b/metanetx_uniprot/TestingFiles/reac_xref.tsv new file mode 100644 index 00000000..d03bb0c0 --- /dev/null +++ b/metanetx_uniprot/TestingFiles/reac_xref.tsv @@ -0,0 +1,365 @@ +### MetaNetX/MNXref reconciliation ### +#Based on the following resources: +# +#RESOURCE: MetaNetX/MNXref +#VERSION: 4.4 +#DATE: 2022/03/16 +#URL: https://www.metanetx.org +#LICENSE: +# MetaNetX copyright 2011 SystemsX, SIB Swiss Institute of Bioinformatics +# Except where otherwise noted, the data available from this site are +# licensed under a Creative Commons Attribution 4.0 International License. +# MNXref uses information on cellular compartments, reactions, and +# metabolites that is sourced from a number of external resources. The +# licensing agreements of those resources are specified in each of the +# downloadable files listed below. For each compound, reaction and +# cellular compartment in the MNXref namespace we indicate which external +# resource provided the information used in MNXref. Compounds and +# reactions in the MNXref namespace may be identical to, or differ from, +# those in the external resource. In either case the data from MNXref may +# be considered to be subject to the original licensing restrictions of +# the external resource. +# (https://www.metanetx.org/mnxdoc/mnxref.html) +# +#RESOURCE: BiGG +#VERSION: 1.6.0, last updated: 2019/10/31 (downloaded on 2021/07/23) +#URL: http://bigg.ucsd.edu +#LICENSE: +# Copyright 2015 The Regents of the University of California +# +# All Rights Reserved +# +# Permission to use, copy, modify and distribute any part of BiGG Models +# for educational, research and non-profit purposes, without fee, and +# without a written agreement is hereby granted, provided that the above +# copyright notice, this paragraph and the following three paragraphs +# appear in all copies. +# +# Those desiring to incorporate BiGG Models into commercial products or +# use for commercial purposes should contact the Technology Transfer & +# Intellectual Property Services, University of California, San Diego, +# 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910, Ph: (858) +# 534-5815, FAX: (858) 534-7345, e-mail: invent@ucsd.edu. 
+# +# In no event shall the University of California be liable to any party +# for direct, indirect, special, incidental, or consequential damages, +# including lost profits, arising out of the use of this bigg database, +# even if the University of California has been advised of the possibility +# of such damage. +# +# The BiGG Models provided herein is on an "as is" basis, and the +# University of California has no obligation to provide maintenance, +# support, updates, enhancements, or modifications. The University of +# California makes no representations and extends no warranties of any +# kind, either implied or express, including, but not limited to, the +# implied warranties of merchantability or fitness for a particular +# purpose, or that the use of the BiGG Models will not infringe any +# patent, trademark or other rights. +# (http://bigg.ucsd.edu/) +# +#RESOURCE: The Cell Component Ontology +#VERSION: 25.0 (downloaded on 2021/06/03) +#URL: https://bioinformatics.ai.sri.com/CCO/ +#LICENSE: +# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome +# databases. +# +# 2.1 Open Databases. SRI hereby grants to LICENSEE a non-exclusive, +# royalty-free license to use, modify and redistribute the Open Databases +# (as such term is defined in Exhibit B) and LICENSEE's modified +# versions thereof on a royalty-free basis, worldwide and for any purpose; +# provided, in each case, that if LICENSEE modifies any Open Database (the +# modified version being a "Modified Open Database"), then (i) +# LICENSEE must provide a copy of the Modified Open Database to SRI (and +# hereby grants to SRI a nonexclusive, royalty-free license to use, +# modify, and redistribute the Modified Open Database worldwide and for +# any purpose and to authorize others to do so); and (ii) any Modified +# Open Databases, or websites from which such Modified Open Databases may +# be obtained, must clearly and prominently: +# +# (a) identify the Open Databases from which they were derived: +# +# (b) include all applicable copyright notices and author lists from the +# Open Databases from which they were derived; and +# +# (c) identify or summarize all modifications that were made. +# +# Any distribution of such Modified Open Databases without the required +# notices is a violation of SRI's and its licensors' copyright and other +# proprietary rights. All trademarks, service marks, and trade names are +# proprietary to SRI and its licensors. The Open Databases, including any +# files incorporated in or generated from the Open Databases and data +# accompanying the Open Databases, are licensed to LICENSEE by SRI and its +# licensors, and SRI and its licensors do not transfer title or any other +# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open +# Databases except as otherwise specified herein. +# +# 2.1.1 If SRI, in its sole discretion, determines that a Modified +# Database is of sufficient quality and interest to the community to be +# hosted on biocyc.org, then SRI may (if the Modified Database includes +# significant curation over the original Open Database it is derived from, +# or the last version of the Modified Database provided to SRI) provide to +# LICENSEE a personal, one-year subscription to biocyc at no cost; +# provided, however, that if LICENSEE edits the Modified Database via a +# MySQL server operated by SRI or its contractors, such free one-year +# subscription will be forfeited. 
+# (https://biocyc.org/ptools-academic-license.shtml) +# +#RESOURCE: ChEBI +#VERSION: 203 (downloaded on 2021/09/30) +#URL: https://www.ebi.ac.uk/chebi/ +#LICENSE: +# All data in the database is non-proprietary or is derived from a +# non-proprietary source. It is thus freely accessible and available to +# anyone. In addition, each data item is fully traceable and explicitly +# referenced to the original source. +# (https://www.ebi.ac.uk/chebi/aboutChebiForward.do) +# +#RESOURCE: enviPath +#VERSION: (downloaded on 2021/11/24) +#URL: https://envipath.org +#LICENSE: +# The core data sets of enviPath are licensed under the Creative Commons +# Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) +# license. This allows you to use them in a non-commercial context, for +# example if you work at a University or for a public research institute. +# You can even redistribute and modify the data using the same license. If +# you want to use the data commercially, contact us, we offer commercial +# license agreements. +# We summarized how you can use the data on our license page. +# (https://envipath.com/license/) +# +#RESOURCE: HMDB +#VERSION: 4.0 (downloaded on 2021/06/18) +#URL: https://hmdb.ca +#LICENSE: +# HMDB is offered to the public as a freely available resource. Use and +# re-distribution of the data, in whole or in part, for commercial +# purposes requires explicit permission of the authors and explicit +# acknowledgment of the source material (HMDB) and the original +# publication. +# (https://hmdb.ca/about) +# +#RESOURCE: KEGG +#VERSION: 98.0+/06-11, Jun 21 (downloaded on 2021/06/11) +#URL: https://www.kegg.jp +#LICENSE: +# Academic users may freely use the KEGG website and may also freely link +# to the KEGG website. +# Non-academic users may use the KEGG website as end users for +# non-commercial purposes, but any other use requires a license agreement. +# Academic users who utilize KEGG for providing academic services are +# requested to obtain a KEGG FTP subscription for organizational use, +# which includes a proper license agreement. +# Non-academic users and Academic users intending to use KEGG for +# commercial purposes are requested to obtain a license agreement through +# KEGG's exclusive licensing agent, Pathway Solutions. +# (https://www.kegg.jp/kegg/legal.html) +# +#RESOURCE: LipidMaps +#VERSION: 2021-05-28 (downloaded on 2021/06/11) +#URL: https://www.lipidmaps.org +#LICENSE: +# The Lipidomics Gateway is provided on an "as is" basis, without warranty +# or representation of any kind, express or implied. The content of the +# Lipidomics Gateway website is protected by international copyright, +# trademark and other laws. You may download articles and web pages from +# this site for your personal, non-commercial use only, provided that you +# keep intact all authorship, copyright and other proprietary notices. The +# Featured Lipid can also be used for educational purposes, provided that +# credit is given to the Lipidomics Gateway. If you use the Lipidomics +# Gateway, you accept these terms. The LIPID MAPS Consortium reserves the +# right to modify these terms at any time. +# (https://www.lipidmaps.org/about/) +# +#RESOURCE: MetaCyc +#VERSION: 25.0 (downloaded on 2021/06/03) +#URL: https://metacyc.org +#LICENSE: +# "Open Databases" means the EcoCyc and MetaCyc Pathway/genome +# databases. +# +# 2.1 Open Databases. 
SRI hereby grants to LICENSEE a non-exclusive, +# royalty-free license to use, modify and redistribute the Open Databases +# (as such term is defined in Exhibit B) and LICENSEE's modified +# versions thereof on a royalty-free basis, worldwide and for any purpose; +# provided, in each case, that if LICENSEE modifies any Open Database (the +# modified version being a "Modified Open Database"), then (i) +# LICENSEE must provide a copy of the Modified Open Database to SRI (and +# hereby grants to SRI a nonexclusive, royalty-free license to use, +# modify, and redistribute the Modified Open Database worldwide and for +# any purpose and to authorize others to do so); and (ii) any Modified +# Open Databases, or websites from which such Modified Open Databases may +# be obtained, must clearly and prominently: +# +# (a) identify the Open Databases from which they were derived: +# +# (b) include all applicable copyright notices and author lists from the +# Open Databases from which they were derived; and +# +# (c) identify or summarize all modifications that were made. +# +# Any distribution of such Modified Open Databases without the required +# notices is a violation of SRI's and its licensors' copyright and other +# proprietary rights. All trademarks, service marks, and trade names are +# proprietary to SRI and its licensors. The Open Databases, including any +# files incorporated in or generated from the Open Databases and data +# accompanying the Open Databases, are licensed to LICENSEE by SRI and its +# licensors, and SRI and its licensors do not transfer title or any other +# rights in the Open Databases to LICENSEE. LICENSEE may not use the Open +# Databases except as otherwise specified herein. +# +# 2.1.1 If SRI, in its sole discretion, determines that a Modified +# Database is of sufficient quality and interest to the community to be +# hosted on biocyc.org, then SRI may (if the Modified Database includes +# significant curation over the original Open Database it is derived from, +# or the last version of the Modified Database provided to SRI) provide to +# LICENSEE a personal, one-year subscription to biocyc at no cost; +# provided, however, that if LICENSEE edits the Modified Database via a +# MySQL server operated by SRI or its contractors, such free one-year +# subscription will be forfeited. +# (https://biocyc.org/ptools-academic-license.shtml) +# +#RESOURCE: Reactome +#VERSION: 77 June 14, 2021 (downloaded on 2021/09/03) +#URL: https://reactome.org +#LICENSE: +# Reactome is an open source and open access resource, available to anyone. +# Usage of Reactome material is covered by two Creative Commons licenses: +# +# The terms of the Creative Commons Public Domain (CC0) License apply to all +# Reactome annotation files, e.g. identifier mapping data, specialized data +# files, and interaction data derived from Reactome. +# (https://reactome.org/license/) +# +#RESOURCE: Rhea +#VERSION: 119 (downloaded on 2021/11/03) +#URL: https://www.rhea-db.org +#LICENSE: +# All data in Rhea is freely accessible and available for anyone to use under +# the Creative Commons Attribution License. +# (https://www.rhea-db.org/documentation) +# +#RESOURCE: SABIO-RK +#VERSION: Software Update: 2021/05/11 -- Database Release: 2021/05/28 (downloaded on 2021/07/01) +#URL: http://sabiork.h-its.org +#LICENSE: +# HITS, gGmbH HITS own the SABIO-RK database, its interfaces and its +# associated documentation (all referred to in the following as +# "Database"). 
You should carefully read the following terms and +# conditions before using this Database. Your use of this Database +# indicates your acceptance of this license agreement and all terms and +# conditions.You are hereby granted a non-exclusive and non-transferable +# license to use the Database according to the following terms and +# conditions. This license is to use the Database for Non-Commercial +# Purpose only. Non-Commercial Purpose means the use of the Database +# solely for internal non-commercial research and academic purposes. +# Non-Commercial Purpose excludes, without limitation, any use of the +# Database, as part of, or in any way in connection with a product or +# service which is sold, offered for sale, licensed, leased, loaned, or +# rented. Permission to use this Database for Non-Commercial Purpose is +# hereby granted without fee and subject to the following terms of this +# license. +# +# Commercial Use +# If you desire to use the Database for profit-making or commercial +# purposes, you agree to negotiate in good faith a license with the HITS +# prior to such profit-making or commercial use. The HITS shall have no +# obligation to grant such license to you, and may grant exclusive or +# non-exclusive licenses to others. You agree to notify the HITS of any +# inquiries you have for commercial use of the Database and/or its +# modifications. You may contact the following email to discuss commercial +# use: sabiork at h-its.org +# +# Governing Law +# This Agreement is governed by the law of the Federal Republic of +# Germany. The application of the UN Convention on the Sale of Goods is +# excluded. +# +# Disclaimer of Warranty +# Because this Database is licensed free of charge, there is no warranty +# for the data in it contained and the methods used for its querying. The +# HITS makes no warranty or representation that the operation of the +# Database in this compilation will be error-free, and the HITS is under +# no obligation to provide any services, by way of maintenance, update, or +# otherwise. +# +# THIS DATABASE AND THE ACCOMPANYING FILES ARE LICENSED "AS IS" AND +# WITHOUT WARRANTIES AS TO PERFORMANCE OR MERCHANTABILITY OR ANY OTHER +# WARRANTIES WHETHER EXPRESSED OR IMPLIED. NO WARRANTY OF FITNESS FOR A +# PARTICULAR PURPOSE IS OFFERED. THE ENTIRE RISK AS TO THE QUALITY AND +# PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE +# DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR +# CORRECTION. +# +# Limitation of Liability +# IN NO EVENT WILL HITS, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +# REDISTRIBUTE THE DATABASE AS PERMITTED ABOVE, BE LIABLE TO YOU FOR +# DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL +# DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM +# (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED +# INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF +# THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF VTIP AND HITS +# OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. +# +# Reference to SABIO-RK Users will cite SABIO-RK in publications or +# presentations, whenever the data used was extracted from the database. +# Termination This agreement is effective until terminated. 
You may +# terminate this agreement at any time by destroying all associated +# material (e.g., documentation or web service clients) to the database in +# your possession and by stopping any access to the database directly or +# from software generated by you. This agreement will terminate +# immediately without notice from and HITS if you fail to comply with any +# of the terms and conditions of this license. This agreement will also +# terminate immediately without notice from the HITS if it is found to +# implement patented algorithms or contain copyrighted code not owned or +# licensed the HITS for the purpose of its inclusion in the SABIO-RK +# Database. This agreement cannot be terminated by any other mechanism or +# for any other reason than those stated herein. +# +# Place of Court +# The exclusive venue for all disputes arising from or in connection with +# this Agreement is Mannheim, Germany (HRB 337446), when the Licensee is a +# business person, a legal entity governed by public law, or a special +# fund governed by public law, or does not have a general place of +# jurisdiction within the Federal Republic of Germany. Address all +# correspondence regarding this license to electronic mail address: +# sabiork at h-its.org Any inquiries and comments regarding bugs, bug +# fixes, enhancements, modifications or any other similar issues should be +# directed to: sabiork at h-its.org +# +# Copyright 2007 by HITS, gGmbH. All rights reserved. +# (http://sabiork.h-its.org/layouts/content/termscondition.gsp) +# +#RESOURCE: The SEED +#VERSION: 2.6.1 (July 31, 2020) (downloaded on 2021/08/09) +#URL: https://modelseed.org +#LICENSE: +# All tools and datasets that make up the SEED are in the public domain. +# (https://modelseed.org) +# +#RESOURCE: SwissLipids +#VERSION: (downloaded on 2021/07/29) +#URL: https://www.swisslipids.org +#LICENSE: +# SwissLipids is licensed under a Creative Commons Attribution-Non +# Commercial-NoDerivatives 4.0 International License. 
+# +# Commercial users and those who wish to use this work for commercial +# purposes please contact the SIB technology transfer officer at: +# marc.filliettaz@genebio.com +# (https://www.swisslipids.org/#/downloads) +#source ID description +EMPTY EMPTY Empty equation +rhea:50004 MNXR114744 1 chebi:15378@rheaC:comp + 1 chebi:15379@rheaC:comp + 1 chebi:57783@rheaC:comp + 1 chebi:77932@rheaC:comp 1 chebi:132727@rheaC:comp + 1 chebi:15377@rheaC:comp + 1 chebi:58349@rheaC:comp +rheaR:50004 MNXR114744 1 chebi:15378@rheaC:comp + 1 chebi:15379@rheaC:comp + 1 chebi:57783@rheaC:comp + 1 chebi:77932@rheaC:comp 1 chebi:132727@rheaC:comp + 1 chebi:15377@rheaC:comp + 1 chebi:58349@rheaC:comp +rhea:61444 MNXR171656 1 chebi:144644@rheaC:comp + 1 chebi:15378@rheaC:comp + 1 chebi:15379@rheaC:comp + 1 chebi:57783@rheaC:comp 1 chebi:144645@rheaC:comp + 1 chebi:15377@rheaC:comp + 1 chebi:58349@rheaC:comp +rheaR:61444 MNXR171656 1 chebi:144644@rheaC:comp + 1 chebi:15378@rheaC:comp + 1 chebi:15379@rheaC:comp + 1 chebi:57783@rheaC:comp 1 chebi:144645@rheaC:comp + 1 chebi:15377@rheaC:comp + 1 chebi:58349@rheaC:comp +rhea:42776 MNXR168222 1 chebi:59789@rheaC:comp + 1 rheaG:10228@rheaC:comp 1 chebi:15378@rheaC:comp + 1 chebi:57856@rheaC:comp + 1 rheaG:10227@rheaC:comp +rheaR:42776 MNXR168222 1 chebi:59789@rheaC:comp + 1 rheaG:10228@rheaC:comp 1 chebi:15378@rheaC:comp + 1 chebi:57856@rheaC:comp + 1 rheaG:10227@rheaC:comp +rhea:18690 MNXR165961 1 chebi:15377@rheaC:comp + 1 chebi:57643@rheaC:comp --> 1 chebi:15378@rheaC:comp + 1 chebi:28868@rheaC:comp + 1 chebi:57875@rheaC:comp +rheaR:18690 MNXR165961 1 chebi:15377@rheaC:comp + 1 chebi:57643@rheaC:comp --> 1 chebi:15378@rheaC:comp + 1 chebi:28868@rheaC:comp + 1 chebi:57875@rheaC:comp +rhea:60624 MNXR171532 1 chebi:74986@rheaC:comp + 2 chebi:15377@rheaC:comp 1 chebi:143890@rheaC:comp + 2 chebi:15378@rheaC:comp + 2 chebi:30823@rheaC:comp +rheaR:60624 MNXR171532 1 chebi:74986@rheaC:comp + 2 chebi:15377@rheaC:comp 1 chebi:143890@rheaC:comp + 2 chebi:15378@rheaC:comp + 2 chebi:30823@rheaC:comp +rhea:60625 MNXR171532 1 chebi:74986@rheaC:comp + 2 chebi:15377@rheaC:comp --> 1 chebi:143890@rheaC:comp + 2 chebi:15378@rheaC:comp + 2 chebi:30823@rheaC:comp +rheaR:60625 MNXR171532 1 chebi:74986@rheaC:comp + 2 chebi:15377@rheaC:comp --> 1 chebi:143890@rheaC:comp + 2 chebi:15378@rheaC:comp + 2 chebi:30823@rheaC:comp \ No newline at end of file From 8af33bd38add6b77073d8167be1b775a8974fce4 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Fri, 15 Sep 2023 12:35:27 -0600 Subject: [PATCH 22/29] Update seq_utils.py --- metanetx_uniprot/seq_utils.py | 53 +++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/metanetx_uniprot/seq_utils.py b/metanetx_uniprot/seq_utils.py index 43682c9a..d0233a41 100644 --- a/metanetx_uniprot/seq_utils.py +++ b/metanetx_uniprot/seq_utils.py @@ -138,10 +138,59 @@ def _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values, ver batch = organism_ids[i:min(i + batch_size, len(organism_ids))] query = '%20OR%20'.join(['organism_id:' + organism_id for organism_id in batch]) url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \ - '&format=tsv&size=500&fields=organism_id%2C' + '%2C'.join([parse.quote(field) + '&format=tsv&size=500&keywords=Reference+proteome&fields=organism_id%2C' + '%2C'.join([parse.quote(field) # '&format=tsv&size=1&fields=organism_id%2C' + '%2C'.join([parse.quote(field) for field in fields]) - _parse_uniprot_data(url, values) 
return values + +def parse_response(res,values): + + headers = None + + for line in res.iter_lines(): + line = line.decode('utf-8') + tokens = line.strip().split('\t') + + if headers is None: + headers = tokens + else: + res = dict(zip(headers, tokens)) + #print(res) + #print(type(res)) + #print(type(values)) + values.append(res) + + #print(values) + + return values + + +def get_jobs(url,values): + + session = requests.Session() + + paging = True + + first_page = session.get(url) + first_response = parse_response(first_page,values) + + while paging == True: + + if 'next' in first_page.links: + next_url = first_page.links['next']['url'] + next_page = session.get(next_url) + next_response = parse_response(next_page,values) + first_page = next_page + else: + paging = False + break + +def _get_uniprot_batch_reference_proteome(url): + + values = [] + + get_jobs(url,values) + + return values From fb852481dc0e40a08c8a9e01285d375d7eb4561a Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Fri, 15 Sep 2023 12:50:03 -0600 Subject: [PATCH 23/29] Update README.md --- metanetx_uniprot/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/metanetx_uniprot/README.md b/metanetx_uniprot/README.md index 31028033..14bd7102 100644 --- a/metanetx_uniprot/README.md +++ b/metanetx_uniprot/README.md @@ -9,8 +9,14 @@ Access chemical, reaction, enzyme, and organism information from the following s - Rhea - UniProt -To run: +To run the full pipeline to get all relationships: ``` python build.py ~/biochem4j ',' 1 ``` + +To run and only get reference proteome taxa that also exist in kg-microbe: +``` +python build_taxa_ids.py ~/biochem4j 1 +``` +*Note, uses ncbitaxon.json (build from kg-microbe) which is included in the Files directory. From ee68a063d9124a5d26884b45e425153af1adc1af Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Fri, 15 Sep 2023 12:50:50 -0600 Subject: [PATCH 24/29] Add files via upload --- metanetx_uniprot/build_taxa_ids.py | 154 +++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 metanetx_uniprot/build_taxa_ids.py diff --git a/metanetx_uniprot/build_taxa_ids.py b/metanetx_uniprot/build_taxa_ids.py new file mode 100644 index 00000000..fb325e6d --- /dev/null +++ b/metanetx_uniprot/build_taxa_ids.py @@ -0,0 +1,154 @@ + +## Output all taxa IDs that exist in kg-microbe and as reference proteomes in UniProt. 
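+##
+## Usage (see README): python build_taxa_ids.py ~/biochem4j 1
+##
+## Rough data flow (a summary of the code below, not a separate spec):
+##   1. read Files/ncbitaxon.json (built from kg-microbe) and convert it to
+##      KGX TSV via kgx.cli.cli_utils.transform;
+##   2. page through the UniProt reference-proteome REST query
+##      (seq_utils._get_uniprot_batch_reference_proteome) and collect each
+##      record's 'Organism Id';
+##   3. keep only the NCBITaxon IDs present in both sets and write them to
+##      <dest_dir>/Organism.tsv.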
+ + +import os +import sys +import tarfile +import tempfile +import urllib +from urllib.request import urlretrieve + +from kgx.cli.cli_utils import transform +import pandas as pd +from seq_utils import _get_uniprot_batch_reference_proteome + +import utils, seq_utils + + +__NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz' + +__UNIPROT_REFERENCE_PROTEOMES_URL = 'https://rest.uniprot.org/proteomes/search?&format=tsv&query=%28%28taxonomy_id%3A2%29%20OR%20%28taxonomy_id%3A2157%29%29%20AND%20%28proteome_type%3A1%29&size=500' + +def build_csv(dest_dir, num_threads): + #'''Build database CSV files.''' + #writer = utils.Writer(dest_dir) + + # Get Organism data: + print('Parsing NCBI Taxonomy') + load(dest_dir) #--> writes Organism_Enzyme.tsv + + + +def load(output_dir, source=__NCBITAXONOMY_URL, ref_source=__UNIPROT_REFERENCE_PROTEOMES_URL): + '''Loads NCBI Taxonomy data.''' + #To get data directly from NCBI Taxon + #nodes_filename, names_filename = _get_ncbi_taxonomy_files(source) + #nodes, rels = _parse_nodes(nodes_filename, array_delimiter) + #_parse_names(nodes, names_filename, array_delimiter) + ####### + #To get data from kg-microbe + nodes_filename = os.getcwd()+'/Files/ncbitaxon.json' + #For testing + #nodes_filename = os.getcwd()+'/TestingFiles/ncbitaxon.json' + print('parsing ncbi taxon json file') + kgx_nodes_json = _parse_nodes_kgmicrobe(nodes_filename,'ncbitaxon_transformed',output_dir) + + nodes,nodes_df = transform_kgx_output_format(kgx_nodes_json) + + #Constrain by those that have reference proteomes, don't use if testing + ref_organisms = _get_uniprot_batch_reference_proteome(ref_source) + ref_organism_ids = [str(k['Organism Id']) for k in ref_organisms] + node_vals = [i for i in nodes if i in ref_organism_ids] + + node_vals = ['NCBITaxon:' + i for i in node_vals] + kgx_nodes_json_subset = nodes_df[nodes_df['id'].isin(node_vals)] + kgx_nodes_json_subset.to_csv(output_dir+'/Organism.tsv', index=False, sep='\t') + print('Wrote file: ',output_dir+'/Organism.tsv') + +def _get_ncbi_taxonomy_files(source): + '''Downloads and extracts NCBI Taxonomy files.''' + temp_dir = tempfile.gettempdir() + temp_gzipfile = tempfile.NamedTemporaryFile() + urlretrieve(source, temp_gzipfile.name) + + temp_tarfile = tarfile.open(temp_gzipfile.name, 'r:gz') + temp_tarfile.extractall(temp_dir) + + temp_gzipfile.close() + temp_tarfile.close() + + return os.path.join(temp_dir, 'nodes.dmp'), \ + os.path.join(temp_dir, 'names.dmp') + +def _parse_nodes_kgmicrobe(filename, output_name,output_dir): + '''Parses nodes file.''' + + transform(inputs=[filename], input_format='obojson', output= os.path.join(output_dir, output_name), output_format='tsv') + + return output_dir+'/'+output_name+'_nodes.tsv' + +def transform_kgx_output_format(transformed_nodes_tsv): + + labels = pd.read_csv(transformed_nodes_tsv, sep = '\t', usecols = ['id','name']) + + nodes = [] + + #Get node IDs to help subset according to reference proteomes + for i in range(len(labels)): + tax_id = labels.iloc[i].loc['id'].split('NCBITaxon:')[1] + nodes.append(tax_id) + + return nodes,labels + + +def _parse_nodes(filename): + '''Parses nodes file.''' + nodes = {} + rels = [] + + with open(filename, 'r') as textfile: + for line in textfile: + tokens = [x.strip() for x in line.split('|')] + tax_id = tokens[0] + + if tax_id != '1': + rels.append([tax_id, 'is_a', tokens[1]]) + + nodes[tax_id] = {'taxonomy:ID(Organism)': tax_id, + ':LABEL': + 'Organism' + ',' + tokens[2]} + + return nodes, rels + + +def _parse_names(nodes, filename): + 
'''Parses names file.''' + + with open(filename, 'r') as textfile: + for line in textfile: + tokens = [x.strip() for x in line.split('|')] + node = nodes[tokens[0]] + + if 'name' not in node: + node['name'] = tokens[1] + node['names:string[]'] = set([node['name']]) + else: + node['names:string[]'].add(tokens[1]) + + for _, node in nodes.items(): + if 'names:string[]' in node: + node['names:string[]'] = \ + ','.join(node['names:string[]']) + + +def main(args): + '''main method''' + num_threads = 0 + + if len(args) > 2: + try: + num_threads = int(args[2]) + except ValueError: + if args[2] == 'True': + num_threads = multiprocessing.cpu_count() + + print('Running build with ' + str(num_threads) + ' threads') + + build_csv(args[0], num_threads) + + + + +if __name__ == '__main__': + main(sys.argv[1:]) \ No newline at end of file From 79638d7925b65aea0f3e96bf5441ae4a883cfbb0 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Fri, 15 Sep 2023 12:51:28 -0600 Subject: [PATCH 25/29] Update README.md --- metanetx_uniprot/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metanetx_uniprot/README.md b/metanetx_uniprot/README.md index 14bd7102..a9e9503c 100644 --- a/metanetx_uniprot/README.md +++ b/metanetx_uniprot/README.md @@ -19,4 +19,4 @@ To run and only get reference proteome taxa that also exist in kg-microbe: ``` python build_taxa_ids.py ~/biochem4j 1 ``` -*Note, uses ncbitaxon.json (build from kg-microbe) which is included in the Files directory. +*Note, uses ncbitaxon.json (built from kg-microbe) which is expected to be in the Files directory. From bbaca9427141218411686ce2ff0af6a432e285c7 Mon Sep 17 00:00:00 2001 From: bsantan <70932395+bsantan@users.noreply.github.com> Date: Mon, 18 Sep 2023 13:08:40 -0600 Subject: [PATCH 26/29] Updated to introduce go_utils, rhea2go, kg-phenio, and PheKnowLator resources to graph. --- metanetx_uniprot/build.py | 13 +- metanetx_uniprot/build_taxa_ids.py | 161 +++++++++++++++++++++++ metanetx_uniprot/chemical_utils.py | 3 + metanetx_uniprot/enzyme_utils.py | 18 ++- metanetx_uniprot/go_utils.py | 10 ++ metanetx_uniprot/mnxref_utils.py | 104 +++++++++++++-- metanetx_uniprot/reaction_utils.py | 202 +++++++++++++++++++++++++++-- metanetx_uniprot/rhea_utils.py | 16 ++- metanetx_uniprot/seq_utils.py | 53 +++++++- 9 files changed, 541 insertions(+), 39 deletions(-) create mode 100644 metanetx_uniprot/build_taxa_ids.py create mode 100644 metanetx_uniprot/go_utils.py diff --git a/metanetx_uniprot/build.py b/metanetx_uniprot/build.py index 0ac9524b..edf45b74 100644 --- a/metanetx_uniprot/build.py +++ b/metanetx_uniprot/build.py @@ -1,9 +1,6 @@ ''' SYNBIOCHEM-DB (c) University of Manchester 2015 -''' -SYNBIOCHEM-DB (c) University of Manchester 2015 - SYNBIOCHEM-DB is licensed under the MIT License. To view a copy of this license, visit . 
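For orientation, the hunks below rewire build_csv so that Rhea-derived reaction and GO-process IDs feed MnxRefLoader. A minimal sketch of the resulting flow, assuming the names used in this patch (note: rhea_utils.load now returns a (reaction_ids, process_ids) tuple per the rhea_utils.py change later in this patch, but the hunk below leaves the single-value assignment in place, so the sketch unpacks both to keep process_ids defined):

```
# Sketch only: the data flow these hunks wire up, not a drop-in build.py.
import chemical_utils, mnxref_utils, ncbi_taxonomy_utils, reaction_utils, \
    rhea_utils, utils


def build_csv_sketch(dest_dir, array_delimiter, num_threads):
    writer = utils.Writer(dest_dir)
    chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter)
    reac_man = reaction_utils.ReactionManager()

    # rhea_utils.load now yields reaction IDs and GO process IDs in one call
    # (the hunk below still assigns the result to reaction_ids alone, which
    # would leave process_ids undefined):
    reaction_ids, process_ids = rhea_utils.load(reac_man,
                                                num_threads=num_threads)
    reac_man.write_files(writer)  # Enzyme_Reaction.tsv etc.

    # MetaNetX / GO-PLUS / kg-phenio resources are joined onto those IDs:
    mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer,
                                           reaction_ids, process_ids,
                                           ncbi_taxonomy_utils,
                                           array_delimiter)
    mnx_loader.load()  # Reaction_Chemical.tsv, Chemical_Process.tsv, ...
    chem_man.write_files(writer)  # Chemicals.tsv
```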
@@ -30,7 +27,7 @@ def build_csv(dest_dir, array_delimiter, num_threads): chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter) - + ## Getting error: urllib.error.URLError: #print('Parsing ChEBI') #chebi_utils.load(chem_man, writer) @@ -51,12 +48,12 @@ def build_csv(dest_dir, array_delimiter, num_threads): reaction_ids = rhea_utils.load(reac_man, num_threads=num_threads) reac_man.write_files(writer) #--> writes Enzyme_Reaction.tsv - # print('Parsing MNXref') - mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer, reaction_ids) - mnx_loader.load() #--> writes Reaction_Chemical.tsv + mnx_loader = mnxref_utils.MnxRefLoader(chem_man, reac_man, writer, reaction_ids, process_ids,ncbi_taxonomy_utils,array_delimiter) + print('mxn loading') + mnx_loader.load() #--> writes Reaction_Chemical.tsv, Chemical_Process.tsv, ##NOT WORKING: Process_Disease.tsv, Process_Phenotype.tsv - #chem_man.write_files(writer) + chem_man.write_files(writer) #--> writes Chemicals.tsv def main(args): diff --git a/metanetx_uniprot/build_taxa_ids.py b/metanetx_uniprot/build_taxa_ids.py new file mode 100644 index 00000000..fd57ee34 --- /dev/null +++ b/metanetx_uniprot/build_taxa_ids.py @@ -0,0 +1,161 @@ + +## Output all taxa IDs that exist in kg-microbe and as reference proteomes in UniProt. + + +import os +import sys +import tarfile +import tempfile +import urllib +from urllib.request import urlretrieve + +from kgx.cli.cli_utils import transform +import pandas as pd +from seq_utils import _get_uniprot_batch_reference_proteome + +import utils, seq_utils + + +__NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz' + +__UNIPROT_REFERENCE_PROTEOMES_URL = 'https://rest.uniprot.org/proteomes/search?&format=tsv&query=%28%28taxonomy_id%3A2%29%20OR%20%28taxonomy_id%3A2157%29%29%20AND%20%28proteome_type%3A1%29&size=500' + +def build_csv(dest_dir, num_threads): + #'''Build database CSV files.''' + + # Get Organism data: + print('Parsing NCBI Taxonomy') + load(dest_dir) #--> writes Organism_Enzyme.tsv + + + +def load(output_dir, source=__NCBITAXONOMY_URL, ref_source=__UNIPROT_REFERENCE_PROTEOMES_URL): + '''Loads NCBI Taxonomy data.''' + #To get data directly from NCBI Taxon + #nodes_filename, names_filename = _get_ncbi_taxonomy_files(source) + #nodes, rels = _parse_nodes(nodes_filename, array_delimiter) + #_parse_names(nodes, names_filename, array_delimiter) + ####### + #To get data from kg-microbe + nodes_filename = os.getcwd()+'/Files/ncbitaxon_nodes.tsv' #ncbitaxon.json + #For testing + #nodes_filename = os.getcwd()+'/TestingFiles/ncbitaxon.json' + print('parsing ncbi taxon tsv file') #json + #_parse_nodes_kgmicrobe only used if reading ncbitaxon.json + #kgx_nodes_file = _parse_nodes_kgmicrobe(nodes_filename,'ncbitaxon_transformed',output_dir) + print('length of ncbitaxon_nodes.tsv: ',len(pd.read_csv(nodes_filename,sep='\t'))) #kgx_nodes)) + + #Update to kgx_nodes_file if ncbitaxon.json is input + nodes,nodes_df = transform_kgx_output_format(nodes_filename) #kgx_nodes_file) + + #Constrain by those that have reference proteomes, don't use if testing + ref_organisms = _get_uniprot_batch_reference_proteome(ref_source) + ref_organism_ids = [str(k['Organism Id']) for k in ref_organisms] + node_vals = [i for i in nodes if i in ref_organism_ids] + + nodes_not_in_refProteome = list(set(ref_organism_ids) - set(nodes)) + print('nodes_not_in_refProteome: ',nodes_not_in_refProteome) + + node_vals = ['NCBITaxon:' + i for i in node_vals] + kgx_nodes_subset = 
nodes_df[nodes_df['id'].isin(node_vals)] + kgx_nodes_subset.to_csv(output_dir+'/Organism.tsv', index=False, sep='\t') + print('Wrote file: ',output_dir+'/Organism.tsv') + +def _get_ncbi_taxonomy_files(source): + '''Downloads and extracts NCBI Taxonomy files.''' + temp_dir = tempfile.gettempdir() + temp_gzipfile = tempfile.NamedTemporaryFile() + urlretrieve(source, temp_gzipfile.name) + + temp_tarfile = tarfile.open(temp_gzipfile.name, 'r:gz') + temp_tarfile.extractall(temp_dir) + + temp_gzipfile.close() + temp_tarfile.close() + + return os.path.join(temp_dir, 'nodes.dmp'), \ + os.path.join(temp_dir, 'names.dmp') + +def _parse_nodes_kgmicrobe(filename, output_name,output_dir): + '''Parses nodes file.''' + + transform(inputs=[filename], input_format='tsv', output= os.path.join(output_dir, output_name), output_format='tsv') #obojson + + return output_dir+'/'+output_name+'_nodes.tsv' + +def transform_kgx_output_format(transformed_nodes_tsv): + + labels = pd.read_csv(transformed_nodes_tsv, sep = '\t', usecols = ['id','name']) + + nodes = [] + + #Get node IDs to help subset according to reference proteomes + for i in range(len(labels)): + try: + tax_id = labels.iloc[i].loc['id'].split('NCBITaxon:')[1] + nodes.append(tax_id) + except IndexError: print(labels.iloc[i].loc['id']) + + return nodes,labels + + +def _parse_nodes(filename): + '''Parses nodes file.''' + nodes = {} + rels = [] + + with open(filename, 'r') as textfile: + for line in textfile: + tokens = [x.strip() for x in line.split('|')] + tax_id = tokens[0] + + if tax_id != '1': + rels.append([tax_id, 'is_a', tokens[1]]) + + nodes[tax_id] = {'taxonomy:ID(Organism)': tax_id, + ':LABEL': + 'Organism' + ',' + tokens[2]} + + return nodes, rels + + +def _parse_names(nodes, filename): + '''Parses names file.''' + + with open(filename, 'r') as textfile: + for line in textfile: + tokens = [x.strip() for x in line.split('|')] + node = nodes[tokens[0]] + + if 'name' not in node: + node['name'] = tokens[1] + node['names:string[]'] = set([node['name']]) + else: + node['names:string[]'].add(tokens[1]) + + for _, node in nodes.items(): + if 'names:string[]' in node: + node['names:string[]'] = \ + ','.join(node['names:string[]']) + + +def main(args): + '''main method''' + num_threads = 0 + + if len(args) > 2: + try: + num_threads = int(args[2]) + except ValueError: + if args[2] == 'True': + num_threads = multiprocessing.cpu_count() + + print('Running build with ' + str(num_threads) + ' threads') + + build_csv(args[0], num_threads) + + + + +if __name__ == '__main__': + main(sys.argv[1:]) \ No newline at end of file diff --git a/metanetx_uniprot/chemical_utils.py b/metanetx_uniprot/chemical_utils.py index 30567ed4..dc3fb310 100644 --- a/metanetx_uniprot/chemical_utils.py +++ b/metanetx_uniprot/chemical_utils.py @@ -29,6 +29,9 @@ def write_files(self, writer): '''Write neo4j import files.''' return writer.write_nodes(self.__nodes.values(), 'Chemical') + def write_rels(self, writer, rels): + return writer.write_rels(rels, 'Chemical', 'Process') + def add_chemical(self, properties): '''Adds a chemical to the collection of nodes, ensuring uniqueness.''' chem_id, chebi_ent = self.__get_chem_id(properties) diff --git a/metanetx_uniprot/enzyme_utils.py b/metanetx_uniprot/enzyme_utils.py index d6ea4969..95de560e 100644 --- a/metanetx_uniprot/enzyme_utils.py +++ b/metanetx_uniprot/enzyme_utils.py @@ -18,12 +18,18 @@ class EnzymeManager(object): def __init__(self): '''Constructor.''' self.__nodes = {} + self.__node_enzymes = {} self.__org_enz_rels = [] def 
get_nodes(self): '''Gets enzyme nodes.''' return self.__nodes.values() + def get_enz_nodes(self): + #nodes_enzymes_df = pd.DataFrame(self.__node_enzymes.items(), columns=['entity_uri', 'label']) + return self.__node_enzymes.values() + + def get_org_enz_rels(self): '''Gets organism-to-enzyme relationships.''' return self.__org_enz_rels @@ -49,10 +55,10 @@ def add_uniprot_data(self, enzyme_ids, source, num_threads=0): if 'Organism (ID)' in uniprot_value else None if 'Entry name' in uniprot_value: - enzyme_node['entry'] = uniprot_value['Entry name'] + enzyme_node['entry'] = 'Uniprot:'+uniprot_value['Entry name'] if 'Protein names' in uniprot_value: - enzyme_node['names'] = uniprot_value['Protein names'] + enzyme_node['names'] = 'Uniprot:'+uniprot_value['Protein names'] if enzyme_node['names']: enzyme_node['name'] = enzyme_node['names'][0] @@ -91,7 +97,7 @@ def add_uniprot_data_organism(self, organism_ids, source, num_threads=0): enzyme_node['entry'] = entry['Entry'] if 'Protein names' in entry: - enzyme_node['names'] = entry['Protein names'] + enzyme_node['names'] = entry['Protein names'][0] if 'names' in entry.keys(): enzyme_node['name'] = entry['names'][0] @@ -100,7 +106,9 @@ def add_uniprot_data_organism(self, organism_ids, source, num_threads=0): enzyme_node['ec-code'] = entry['EC number'] if organism_id: - self.__org_enz_rels.append([organism_id, 'expresses',entry['Entry'], {'source': source}]) + self.__org_enz_rels.append(['NCBITaxon:'+organism_id, 'expresses','Uniprot:'+entry['Entry'], {'source': source}]) + + self.__node_enzymes['Uniprot:'+entry['Entry']] = {'entity_uri':'Uniprot:'+entry['Entry'], 'label':enzyme_node['names']} return uniprot_values - + \ No newline at end of file diff --git a/metanetx_uniprot/go_utils.py b/metanetx_uniprot/go_utils.py new file mode 100644 index 00000000..8d8fab92 --- /dev/null +++ b/metanetx_uniprot/go_utils.py @@ -0,0 +1,10 @@ +from kgx.cli.cli_utils import transform +import os + +go_plus_file = '/Users/brooksantangelo/Documents/HunterLab/Exploration/biochem4j/kg-microbe/metanetx_uniprot/Files/go-plus.owl' + + +output_dir = '/Users/brooksantangelo/Documents/HunterLab/biochem4j/biochem4j/' +name = 'go_plus_transformed' + +transform(inputs=[go_plus_file], input_format='xml', output= os.path.join(output_dir, name), output_format='tsv') diff --git a/metanetx_uniprot/mnxref_utils.py b/metanetx_uniprot/mnxref_utils.py index aa231e22..7eb3fafb 100644 --- a/metanetx_uniprot/mnxref_utils.py +++ b/metanetx_uniprot/mnxref_utils.py @@ -44,32 +44,38 @@ def __init__(self, source=_METANETX_URL): def get_chem_data(self): '''Gets chemical data.''' if not self.__chem_data: - self.__read_chem_prop() + mxn_chebi_mapping = self.__read_chem_prop() self.__read_xref('chem_xref.tsv', self.__chem_data, True) - return self.__chem_data + return self.__chem_data,mxn_chebi_mapping def get_reac_data(self,reaction_ids): '''Gets reaction data.''' if not self.__reac_data: - mxn_reaction_ids = self.__read_reac_prop(reaction_ids) + mxn_reaction_ids,mxn_rhea_mapping = self.__read_reac_prop(reaction_ids) self.__read_xref('reac_xref.tsv', self.__reac_data, False) #Only include reaction data for reactions in reaction_ids self.__reac_data = {key:val for key,val in self.__reac_data.items() if key in mxn_reaction_ids} - return self.__reac_data + return self.__reac_data,mxn_rhea_mapping def __read_chem_prop(self): '''Read chemical properties and create Nodes.''' chem_prop_keys = ['id', 'name', 'reference','formula', 'charge:float', 'mass:float', 'inchi', 'inchikey', 'smiles'] + 
mxn_chebi_mapping = {} + for values in self.__read_data('chem_prop.tsv'): if not values[0].startswith('#'): values[0] = self.__parse_id(values[0]) values[2] = self.__parse_id(values[2]) props = dict(zip(chem_prop_keys, values)) + + #For mapping mxn IDs to Chebi Ids + mxn_chebi_mapping[values[0]] = values[2] + props.pop('reference') _convert_to_float(props, 'charge:float') _convert_to_float(props, 'mass:float') @@ -77,6 +83,8 @@ def __read_chem_prop(self): if value != ''} self.__chem_data[values[0]] = props + return mxn_chebi_mapping + def __read_xref(self, filename, data, chemical): '''Read xrefs and update Nodes.''' xref_keys = ['XREF', 'MNX_ID', 'Description'] @@ -113,15 +121,18 @@ def __read_reac_prop(self,reaction_ids): ##Relabel reaction ids by MXN id rather than rhea id mxn_reaction_ids = [] + mxn_rhea_mapping = {} + for values in self.__read_data('reac_prop.tsv'): if not values[0].startswith('#'): if values[0] == 'EMPTY': continue values[0] = self.__parse_id(values[0]) values[2] = self.__parse_id(values[2]) - + #Grab MXN id if in reaction IDs from filtering by organisms/enzymes try: if 'rhea' in values[2].split(':')[0].lower() and values[2].split(':')[1] in reaction_ids: mxn_reaction_ids.append(values[0]) + mxn_rhea_mapping[values[0]] = values[2].split(':')[1] except IndexError: continue props = dict(zip(reac_prop_keys, values)) @@ -143,7 +154,7 @@ def __read_reac_prop(self,reaction_ids): print('WARNING: Suspected polymerisation reaction: ' + \ values[0] + '\t' + str(props)) - return mxn_reaction_ids + return mxn_reaction_ids,mxn_rhea_mapping def __add_chem(self, chem_id): '''Adds a chemical with given id.''' @@ -192,11 +203,14 @@ def __parse_id(self, item_id): class MnxRefLoader(object): '''Loads MNXref data into neo4j format.''' - def __init__(self, chem_man, reac_man, writer,reaction_ids): + def __init__(self, chem_man, reac_man, writer,reaction_ids,process_ids,ncbi_taxonomy_utils,array_delimiter): self.__chem_man = chem_man self.__reac_man = reac_man self.__writer = writer self.__reactions = reaction_ids + self.__processes = process_ids + self.__ncbi_tax = ncbi_taxonomy_utils + self.__array_delimiter = array_delimiter def load(self): '''Loads MnxRef data from chem_prop.tsv, chem_xref.tsv, @@ -204,14 +218,77 @@ def load(self): reader = MnxRefReader() #First gets all chemical data from MxnRef (chem_xref and chem_prop) and adds to __chem_man - for properties in reader.get_chem_data().values(): - properties['mnx'] = properties.pop('id') + c_vals,mxn_chebi_mapping = reader.get_chem_data() + for properties in c_vals.values(): + #Includes chemical as chebi ID if you use reference + properties['mnx'] = properties.pop('id') #'reference') self.__chem_man.add_chemical(properties) - #Then gets reaction data from reac_xref and reac_prop and adds to __chem_man - rels = self.__add_reac_nodes(reader.get_reac_data(self.__reactions)) + #Then gets reaction data from reac_xref and reac_prop and adds to __chem_man only for reaction ids founds linked to organisms + reac_data,mxn_rhea_mapping = reader.get_reac_data(self.__reactions) + chem_rels = self.__add_reac_nodes(reac_data) + + #Convert rxn id's to Rhea (get mappings from reac_prop) and chemicals to CHEBI IDs + #rels is list of lists + #print('mxn_chebi_mapping: ',mxn_chebi_mapping) + mxn_chebi_mapping['MNXM1'] = 'chebi:24636' + mxn_chebi_mapping['WATER'] = 'chebi:15377' + + chemical_ids = [] + + for i in enumerate(chem_rels): + #MXN ids to rhea ids + #reac_ids should have rhea to help identify + chem_rels[i[0]][0] = 
'Rhea:'+mxn_rhea_mapping[i[1][0]] + try: + #MXN ids to chebi ids + chem_rels[i[0]][2] = mxn_chebi_mapping[i[1][2]] + except KeyError: + if 'WATER' in i[1][2]: + mxn_chebi_mapping[i[1][2]] = 'chebi:15377' + chem_rels[i[0]][2] = mxn_chebi_mapping[i[1][2]] + else: + print('could not map chemical to chebi ID: ',i[1][2]) + chemical_ids.append(chem_rels[i[0]][2]) + + + #Gets all chemicals from reac_data and adds go processes, and gets all go processes from rhea2go and adds chemicals + + print('self.__processes in mxnref load: ',self.__processes) + print('length of self.__processes in mxnref load: ',len(self.__processes)) + #go plus + go_plus_filename = os.getcwd()+'/Files/GO-PLUS.csv' + go_plus_rels,process_ids = self.__reac_man.read_go_plus(go_plus_filename,self.__processes,chemical_ids) + + print('go_plus_rels: ',go_plus_rels[0:5]) + + #HPO + hpo_kgx_nodes_json = os.getcwd()+'/Files/hp_kgx_tsv_nodes.tsv' + hpo_kgx_edges_json = os.getcwd()+'/Files/hp_kgx_tsv_edges.tsv' + #kgx_nodes_json,kgx_edges_json = self.__ncbi_tax._parse_nodes_kgmicrobe(go_plus_filename, self.__array_delimiter, 'hpo_transformed') + nodes,rels = self.__reac_man.transform_kgx_output_format_hp(hpo_kgx_nodes_json,hpo_kgx_edges_json) + #Contrain pehnotype - process rels from processes filtered previously + hpo_rels = [] + for i in rels: + if i[0] in process_ids or i[2] in process_ids: + hpo_rels.append(i) + + n1 = self.__writer.write_nodes(nodes, 'Phenotype') #node_vals #- works + f1 = self.__writer.write_rels(hpo_rels, 'Process', 'Phenotype') #rel_vals + + #PKL for GO-MONDO + #pkl_rels = self.__reac_man.get_process_disease_pkl_data(os.getcwd()+'/Files/PheKnowLator_v3.0.2_full_instance_relationsOnly_OWLNETS_Triples_Identifiers.txt',os.getcwd()+'/Files/PheKnowLator_v3.0.2_full_instance_relationsOnly_OWLNETS_NodeLabels.txt',self.__processes) + + #KG-phenio for GO-MONDO + phenio_rels = self.__reac_man.get_process_disease_phenio_data(os.getcwd()+'/Files/phenio_merged-kg_edges.tsv',os.getcwd()+'/Files/phenio_merged-kg_nodes.tsv',process_ids) + + f2 = self.__writer.write_rels(go_plus_rels, 'GoPlus_Chemical', 'Process') #- works + + f3 = self.__writer.write_rels(chem_rels, 'Reaction', 'Chemical') #-works + print('phenio_rels: ',phenio_rels[0:5]) + f4 = self.__writer.write_rels(phenio_rels, 'Phenio_Process', 'Disease') - return [], [self.__writer.write_rels(rels, 'Reaction', 'Chemical')] + return [] #,[self.__writer.write_rels(chem_rels, 'Reaction', 'Chemical')], [self.__writer.write_rels(pkl_rels, 'Process', 'Disease')] def __add_reac_nodes(self, reac_data): '''Get reaction nodes from data.''' @@ -250,6 +327,7 @@ def __add_reac_nodes(self, reac_data): reac_id = self.__reac_man.add_reaction('mnx', mnx_id, properties) + #reac_id_def looks like {'MNXR165961': [[None, 0, -1.0, 'MNXM1107698'], [None, 0, -1.0, 'WATER@MNXD1'], [None, 0, 1.0, 'MNXM1108087'], [None, 0, 1.0, 'MNXM728579']], 'MNXR171532': [['C18H33O2', -1, -2.0, 'MNXM1107708'], [None, 0, -2.0, 'MNXM1'], [None, 0, -1.0, 'MNXM734941'], ['C41H78NO8P', 0, 1.0, 'MNXM737425'], [None, 0, 2.0, 'WATER@MNXD1']]} reac_id_def[reac_id] = balanced_def chem_id_mass = self.__chem_man.get_props('monoisotopic_mass:float', @@ -277,6 +355,8 @@ def __add_reac_nodes(self, reac_data): reac_cofactors.extend(pair) for term in defn: + cof_chebi_id = term[3] + react_chebi_id = term[2] rels.append([reac_id, 'has_cofactor' if term[3] in reac_cofactors else 'has_reactant', diff --git a/metanetx_uniprot/reaction_utils.py b/metanetx_uniprot/reaction_utils.py index f13c429c..f6a5f390 100644 --- 
a/metanetx_uniprot/reaction_utils.py +++ b/metanetx_uniprot/reaction_utils.py @@ -10,6 +10,9 @@ from enzyme_utils import EnzymeManager from numpy import * +import pandas as pd +from tqdm import tqdm +import csv class ReactionManager(object): @@ -21,6 +24,7 @@ def __init__(self): self.__reac_ids = {} self.__reac_enz_rels = [] self.__enz_reac_rels = [] + self.__go_reac_rels = [] self.__org_enz_rels = [] self.__enz_man = EnzymeManager() @@ -29,13 +33,18 @@ def write_files(self, writer): return ([writer.write_nodes(self.__nodes.values(), 'Reaction'), writer.write_nodes(self.__enz_man.get_nodes(), - 'Enzyme')], + 'Enzyme'), + writer.write_nodes(self.__enz_man.get_enz_nodes(), + 'Enzyme_nodes')], [writer.write_rels(self.__reac_enz_rels, 'Reaction', 'Enzyme'), #Gets reactions connected to all enzymes writer.write_rels(self.__enz_reac_rels, 'Reaction', 'Enzyme'), - writer.write_rels(self.__enz_man.get_org_enz_rels(), + #Gets reactions connected to all go processes + writer.write_rels(self.__go_reac_rels, + 'Reaction', 'Process'), + writer.write_rels(self.__enz_man.get_org_enz_rels(), 'Organism', 'Enzyme')]) def add_reaction(self, source, reac_id, properties): @@ -62,6 +71,9 @@ def add_reaction(self, source, reac_id, properties): else: self.__nodes[reac_id].update(properties) + print('from add_reaction in reaction_utils.py') + print(self.__nodes.values()) + return reac_id def add_react_to_enz(self, data, source, num_threads=0): @@ -72,12 +84,13 @@ def add_react_to_enz(self, data, source, num_threads=0): # Create Enzyme nodes: self.__enz_man.add_uniprot_data(enzyme_ids, source, num_threads) - def add_react_to_enz_organism(self, data, source, num_threads=0): + #data here is rhea-enzyme file, go_data is rhea-go file + def add_react_to_enz_organism(self, data, source, go_data, num_threads=0): #Create Reaction relationships - reaction_ids = self.__create_enz_react(data, source) + reaction_ids,process_ids = self.__create_enz_react(data, go_data, source) - return reaction_ids + return reaction_ids,process_ids def __create_react_enz(self, data, source): '''Creates Reaction and Enzyme nodes and their Relationships.''' @@ -94,26 +107,48 @@ def __create_react_enz(self, data, source): return list(set(enzyme_ids)) - def __create_enz_react(self, data, source): + def __create_enz_react(self, data, go_data, source): '''Creates Reaction and Enzyme nodes and their Relationships.''' print('adding reaction to enzyme relationships') reaction_ids = [] + process_ids = [] enzyme_ids = self.__enz_man.get_nodes() for enz_id in enzyme_ids: + #Gets relationships between reactions and enzymes from Rhea only if they exist in the enzymes pulled from organism filtering step reac_ids = [key for key, value in data.items() if enz_id['entry'] in value] + reaction_ids = reaction_ids+reac_ids for j in reac_ids: - self.__enz_reac_rels.append([j, 'catalysed_by', - enz_id['entry'], + #reac_ids should have rhea to help identify and protein should have UniProt + self.__enz_reac_rels.append(['Rhea:'+j, 'catalysed_by', + 'Uniprot:'+enz_id['entry'], + {'source': source}]) + + print('adding reaction to process relationships') + #Gets relationships between reactions and Go processes from Rhea only if they exist in above reaction ids + go_reac_ids = [key for key, value in go_data.items() if key in reaction_ids] + reaction_ids = reaction_ids+go_reac_ids + + for j in go_reac_ids: + rxns = go_data[j] + for k in rxns: + process_ids.append(k) + #reac_ids should have rhea to help identify + self.__go_reac_rels.append(['Rhea:'+j, 'affects', + k, 
{'source': source}]) - return list(set(reaction_ids)) + + return list(set(reaction_ids)),list(set(process_ids)) def add_org_to_enz(self, nodes, source, num_threads=0): '''Submit data to the graph.''' # Create Organism nodes: organism_ids = self.__create_organism_ids(nodes, source) + print('number of orgs for just reference proteomes') + print(len(organism_ids)) + ## For testing #organism_ids = organism_ids[0:10] @@ -126,3 +161,152 @@ def __create_organism_ids(self, data, source): return ids + def read_go_plus(self,go_plus_file,process_ids,chemical_ids): + '''Read chemical properties and create Nodes.''' + go_keys = ['Class ID', 'Preferred Label', 'Synonyms','Definitions','Obsolete','CUI','Semantic Types','Parents'] + + rels = [] + + d = pd.read_csv(go_plus_file, delimiter=',',keep_default_na=False) + go_data = d[go_keys] + go_data = go_data.replace(regex=['http://purl.obolibrary.org/obo/'],value='').replace(regex=['_'],value=':') + + #Create go-plus nodes + #add to nodes: http://www.w3.org/2000/01/rdf-schema#label + + d = d.drop(go_keys,axis=1) #+['Parents'], axis=1) + #Update values + #Ensure subject is not deprecated + d = d[d['http://www.w3.org/2002/07/owl#deprecated'] != 'TRUE'] + d = d.replace(regex=['http://purl.obolibrary.org/obo/'],value='').replace(regex=['_'],value=':') + d = d.replace(regex=['go#'],value='') + + #Update columns + #Columns to ignore + cols_to_drop = ['http://data.bioontology.org/metadata/prefixIRI','http://data.bioontology.org/metadata/treeView','go#','http://purl.obolibrary.org/obo/IAO_','http://www.w3.org/2000/01/rdf-schema#','http://www.w3.org/2004/02/skos/core#','http://www.w3.org/2002/07/owl#deprecated','http://www.w3.org/2000/01/rdf-schema#label','http://purl.org/dc/terms/','obsolete ','has_narrow_synonym','has_obo_format_version','has_obo_namespace','has_related_synonym','has_scope','has_synonym_type','definition','http://www.geneontology.org/formats/oboInOwl#id','has_alternative_id','http://purl.obolibrary.org/obo/go#creation_date','http://www.geneontology.org/formats/oboInOwl#creation_date','synonym_type_property','Systematic synonym','temporally related to','term replaced by','term tracker item','title','http://www.geneontology.org/formats/oboInOwl#created_by','has_exact_synonym'] + cols_to_drop = d.columns[d.columns.str.contains('|'.join(cols_to_drop))] + d = d.drop(cols_to_drop, axis=1) + #There are 2 contains relationships, develops_from + d.columns = d.columns.str.replace('http://data.bioontology.org/metadata/obo/contains','biontology_contains', regex=False) + d.columns = d.columns.str.replace('http://data.bioontology.org/metadata/obo/develops_from','biontology_develops_from', regex=False) + d.columns = d.columns.str.replace('http://data.bioontology.org/metadata/obo/','', regex=False) + d.columns = d.columns.str.replace('http://purl.obolibrary.org/obo/', '', regex=False) + d.columns = d.columns.str.replace('http://www.geneontology.org/formats/oboInOwl#', '', regex=False) + + for i in tqdm(range(len(d))): + s_id = go_data.iloc[i].loc['Class ID'] + for p_label in d.columns: + if d.iloc[i].loc[p_label] != '': + if (s_id in chemical_ids or p_label in process_ids) or (s_id in process_ids or p_label in chemical_ids): + all_objects = d.iloc[i].loc[p_label].split('|') + for j in all_objects: + rels.append([s_id, p_label, + j, + {'source': 'go-plus'}]) + + go_process_ids = [] + for i, v in enumerate(rels): + for x in v: + if "GO:" in x: + go_process_ids.append(x) + + go_process_ids = list(set(go_process_ids)) + + print('len process_ids before adding go plus 
terms: ',len(process_ids)) + process_ids = process_ids+go_process_ids + process_ids = list(set(process_ids)) + print('len process_ids after adding go plus terms: ',len(process_ids)) + + return rels,process_ids + + def transform_kgx_output_format_hp(self,transformed_nodes_tsv,transformed_edges_tsv): + + labels = pd.read_csv(transformed_nodes_tsv, sep = '\t', usecols = ['id','name']) + triples_df = pd.read_csv(transformed_edges_tsv,sep = '\t', usecols = ['subject', 'object', 'predicate']) + triples_df.columns.str.lower() + + nodes = {} + rels = [] + + + #Constrain rels and nodes to only GO process: HP relationships + #Constrain rels and nodes to only GO processes that are used in prior rels + for i in range(len(triples_df)): + s = triples_df.iloc[i].loc['subject'] + p = triples_df.iloc[i].loc['predicate'] + o = triples_df.iloc[i].loc['object'] + if ('GO:' in s and 'HP:' in o) or ('GO:' in o and 'HP:' in s): + rels.append([s, p, o]) + + + for i in range(len(labels)): + if any(labels.iloc[i].loc['id'] in sublist for sublist in labels): + nodes[labels.iloc[i].loc['id']] = {'class:ID': labels.iloc[i].loc['id'], + ':LABEL': + labels.iloc[i].loc['id'].split(':')[0]} + + return nodes,rels + + def process_pkl_files(self,triples_file,labels_file): + + triples_df = pd.read_csv(triples_file,sep = ' ', quoting=csv.QUOTE_NONE) + triples_df.columns.str.lower() + + triples_df.replace({'<': ''}, regex=True, inplace=True) + triples_df.replace({'>': ''}, regex=True, inplace=True) + + labels = pd.read_csv(labels_file, sep = ' ', quoting=csv.QUOTE_NONE) + labels.columns.str.lower() + + #Remove brackets from URI + labels['entity_uri'] = labels['entity_uri'].str.replace("<","") + labels['entity_uri'] = labels['entity_uri'].str.replace(">","") + + + return triples_df,labels + + def get_process_disease_pkl_data(self,triples_file,labels_file,process_ids): + + print('Extracting PKL relationships') + triples_df, labels_dict = self.process_pkl_files(triples_file,labels_file) + + rels = [] + + for i in tqdm(range(len(triples_df))): + if triples_df.iloc[i].loc['object'] in process_ids and 'MONDO_' in triples_df.iloc[i].loc['subject']: + rels.append([triples_df.iloc[i].loc['subject'].replace('http://purl.obolibrary.org/obo/','').replace('_',':'), labels_dict.loc[labels_dict['entity_uri'] == triples_df.iloc[i].loc['predicate'],'label'].values[0], + triples_df.iloc[i].loc['object'].replace('http://purl.obolibrary.org/obo/','').replace('_',':'), + {'source': 'pheknowlator'}]) + + return rels + + + def process_kg_phenio_files(self,triples_file,labels_file): + + triples_df = pd.read_csv(triples_file,sep = '\t', usecols = ['subject', 'object', 'predicate']) + triples_df.columns.str.lower() + + labels = pd.read_csv(labels_file, sep = '\t', usecols = ['id','category', 'name','description']) + labels.columns = ['entity_uri','category', 'label','description/definition'] + + triples_df_relevant = triples_df.loc[((triples_df['subject'].str.contains('MONDO:')) & (triples_df['object'].str.contains('GO:'))) | ((triples_df['object'].str.contains('MONDO:')) & (triples_df['subject'].str.contains('GO:')))] + + #1785727 total, 435 total MONDO/GO or GO/MONDO relationships + print(len(triples_df),len(triples_df_relevant)) + + return triples_df_relevant,labels + + def get_process_disease_phenio_data(self,triples_file,labels_file,process_ids): + + print('Extracting kg-phenio relationships') + triples_df, labels_dict = self.process_kg_phenio_files(triples_file,labels_file) + + rels = [] + + for i in tqdm(range(len(triples_df))): + if 
triples_df.iloc[i].loc['object'] in process_ids and 'MONDO:' in triples_df.iloc[i].loc['subject']: + rels.append([triples_df.iloc[i].loc['subject'], triples_df.iloc[i].loc['predicate'], + triples_df.iloc[i].loc['object'], + {'source': 'kg-phenio'}]) + + return rels \ No newline at end of file diff --git a/metanetx_uniprot/rhea_utils.py b/metanetx_uniprot/rhea_utils.py index 5c612d90..a9a9ff6a 100644 --- a/metanetx_uniprot/rhea_utils.py +++ b/metanetx_uniprot/rhea_utils.py @@ -17,21 +17,31 @@ #For test, also update load function #__RHEA_URL = os.getcwd()+'/TestingFiles/rhea2uniprot_sprot.txt' -def load(reaction_manager, source=__RHEA_URL, num_threads=0): +__RHEA_GO_URL = 'ftp://ftp.expasy.org/databases/rhea/tsv/rhea2go.tsv' +#__RHEA_GO_URL = os.getcwd()+'/TestingFiles/rhea2go_NOTREAL.txt' + +def load(reaction_manager, source=__RHEA_URL, go_source = __RHEA_GO_URL, num_threads=0): '''Loads Rhea data.''' # Parse data: temp_file = tempfile.NamedTemporaryFile() urlretrieve(source, temp_file.name) data = _parse(temp_file.name) + + + temp_file = tempfile.NamedTemporaryFile() + urlretrieve(go_source, temp_file.name) + go_data = _parse(temp_file.name) + ##If using test data #data = _parse(source) + #go_data = _parse(go_source) ######Not sure why source is Rhea here, calls to UniProt #Remove, since this goes from rhea2uniprot to uniprot enzymes. use add_org_to_enz function in ncbi_taxonomy_utils instead #reaction_manager.add_react_to_enz(data, 'rhea', num_threads) - reaction_ids = reaction_manager.add_react_to_enz_organism(data, 'rhea', num_threads) + reaction_ids,process_ids = reaction_manager.add_react_to_enz_organism(data, 'rhea', go_data, num_threads) - return reaction_ids + return reaction_ids,process_ids def _parse(filename): diff --git a/metanetx_uniprot/seq_utils.py b/metanetx_uniprot/seq_utils.py index 43682c9a..75a80bee 100644 --- a/metanetx_uniprot/seq_utils.py +++ b/metanetx_uniprot/seq_utils.py @@ -138,10 +138,59 @@ def _get_uniprot_batch_organism(organism_ids, i, batch_size, fields, values, ver batch = organism_ids[i:min(i + batch_size, len(organism_ids))] query = '%20OR%20'.join(['organism_id:' + organism_id for organism_id in batch]) url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \ - '&format=tsv&size=500&fields=organism_id%2C' + '%2C'.join([parse.quote(field) + '&format=tsv&size=500&keywords=Reference+proteome&fields=organism_id%2C' + '%2C'.join([parse.quote(field) # '&format=tsv&size=1&fields=organism_id%2C' + '%2C'.join([parse.quote(field) for field in fields]) - _parse_uniprot_data(url, values) return values + +def parse_response(res,values): + + headers = None + + for line in res.iter_lines(): + line = line.decode('utf-8') + tokens = line.strip().split('\t') + + if headers is None: + headers = tokens + else: + res = dict(zip(headers, tokens)) + #print(res) + #print(type(res)) + #print(type(values)) + values.append(res) + + #print(values) + + return values + + +def get_jobs(url,values): + + session = requests.Session() + + paging = True + + first_page = session.get(url) + first_response = parse_response(first_page,values) + + while paging == True: + + if 'next' in first_page.links: + next_url = first_page.links['next']['url'] + next_page = session.get(next_url) + next_response = parse_response(next_page,values) + first_page = next_page + else: + paging = False + break + +def _get_uniprot_batch_reference_proteome(url): + + values = [] + + get_jobs(url,values) + + return values \ No newline at end of file From a4449ed9ebb4a63f4d0997b6aabb4743547ee367 Mon Sep 17 
00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Thu, 21 Sep 2023 21:13:19 -0600 Subject: [PATCH 27/29] Add files via upload Adding code to create labels (combined_kgx_merged-kg_nodes.csv) and edges (combined_kg.csv) files for full graph. --- metanetx_uniprot/combine_rels.py | 64 +++++++++++ metanetx_uniprot/create_labels_file.py | 151 +++++++++++++++++++++++++ 2 files changed, 215 insertions(+) create mode 100644 metanetx_uniprot/combine_rels.py create mode 100644 metanetx_uniprot/create_labels_file.py diff --git a/metanetx_uniprot/combine_rels.py b/metanetx_uniprot/combine_rels.py new file mode 100644 index 00000000..0446c8c1 --- /dev/null +++ b/metanetx_uniprot/combine_rels.py @@ -0,0 +1,64 @@ +import os +import pandas as pd +import argparse + + +def parse_kg_file(kg_filename): + + kg = pd.read_csv(kg_filename,delimiter=';') + + if len(kg.columns) == 3: kg.columns = [['subject','predicate','object']] + if len(kg.columns) == 4: + kg.columns = [['subject','predicate','object','source']] + kg = kg[['subject','predicate','object']] + + return kg + +def concat_kgs(kg1,kg2): + + combined_kg = pd.concat([kg1, kg2], axis=0) + combined_kg = combined_kg.drop_duplicates().reset_index(drop=True) + + return combined_kg + +#Define arguments for each required and optional input +def defineArguments(): + parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("--directory",dest="Directory",required=True,help="Directory") + + return parser + +def main(): + + #rels_files_dir = '/Users/brooksantangelo/Documents/HunterLab/Exploration/biochem4j/kg-microbe/metanetx_uniprot/refProteome/LocalRun_0915 + + #Generate argument parser and define arguments + parser = defineArguments() + args = parser.parse_args() + + directory = args.Directory + + rels_files_dir = directory+'/rels/' + rels_files = os.listdir(rels_files_dir) + + rels_files = [i for i in rels_files if 'combined_kg' not in i] + + kg_0 = parse_kg_file(rels_files_dir+rels_files[0]) + + for fname in rels_files[1:]: + + if fname.endswith('.csv'): + + kg = parse_kg_file(rels_files_dir+fname) + kg_0 = concat_kgs(kg_0,kg) + + kg_0.to_csv(rels_files_dir + 'combined_kg.csv', sep = "\t", index = False) + + +if __name__ == '__main__': + main() + + + + diff --git a/metanetx_uniprot/create_labels_file.py b/metanetx_uniprot/create_labels_file.py new file mode 100644 index 00000000..4d25eab4 --- /dev/null +++ b/metanetx_uniprot/create_labels_file.py @@ -0,0 +1,151 @@ + + + +from tqdm import tqdm +import pandas as pd +import argparse +from collections import defaultdict + + + +def process_kg_covid19_files(triples_file,labels_file): + triples_df = pd.read_csv(triples_file,sep = '\t', usecols = ['subject', 'object', 'predicate']) + triples_df.columns.str.lower() + + labels = pd.read_csv(labels_file, sep = '\t', usecols = ['id','category', 'name','description']) + + triples_df_relevant = triples_df.loc[((triples_df['subject'].str.contains('MONDO:')) & (triples_df['object'].str.contains('GO:'))) | ((triples_df['object'].str.contains('MONDO:')) & (triples_df['subject'].str.contains('GO:')))] + + labels_relevant = labels.loc[(labels['id'].str.contains('MONDO:')) | (labels['id'].str.contains('GO:')) | (labels['id'].str.contains('CHEBI:')) | (labels['id'].str.contains('NCBITaxon:'))] + + #1785727 total, 435 total MONDO/GO or GO/MONDO relationships + print(len(labels_relevant),len(labels)) + + return triples_df_relevant,labels_relevant + +def 
get_process_disease_phenio_data(triples_file,labels_file,process_ids): + + print('Extracting kg-phenio relationships') + triples_df, labels_dict = process_kg_covid19_files(triples_file,labels_file) + + #triples_df = triples_df.replace(regex=['http://purl.obolibrary.org/obo/'],value='').replace(regex=['_'],value=':') + + rels = [] + + for i in tqdm(range(len(triples_df))): + if triples_df.iloc[i].loc['object'] in process_ids and 'MONDO:' in triples_df.iloc[i].loc['subject']: + #if ('GO_' in triples_df.iloc[i].loc['subject'] and 'MONDO_' in triples_df.iloc[i].loc['object']) or ('GO_' in triples_df.iloc[i].loc['object'] and 'MONDO_' in triples_df.iloc[i].loc['subject']): + print(triples_df.iloc[i]) + rels.append([triples_df.iloc[i].loc['subject'], triples_df.iloc[i].loc['predicate'], + triples_df.iloc[i].loc['object'], + {'source': 'kg-phenio'}]) + + return rels + +#Define arguments for each required and optional input +def defineArguments(): + parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("--directory",dest="Directory",required=True,help="Directory") + + return parser + +def main(): + + #directory = '/Users/brooksantangelo/Documents/HunterLab/Exploration/biochem4j/kg-microbe/metanetx_uniprot/refProteome/LocalRun_0915' + + #Generate argument parser and define arguments + parser = defineArguments() + args = parser.parse_args() + + directory = args.Directory + + phenio_labels_file = '/Users/brooksantangelo/Documents/HunterLab/Exploration/kg-phenio/phenio_merged-kg_nodes.tsv' + phenio_triples_file = '/Users/brooksantangelo/Documents/HunterLab/Exploration/kg-phenio/phenio_merged-kg_edges.tsv' + + #Updated 6/19 based on file location + kg_covid19_triples_file = '/Users/brooksantangelo/Documents/HunterLab/Cartoomics/PostRevisionUpdates/Inputs/kg-covid19/merged-kg_edges.tsv' + kg_covid19_labels_file = '/Users/brooksantangelo/Documents/HunterLab/Cartoomics/PostRevisionUpdates/Inputs/kg-covid19/merged-kg_nodes.tsv' + + enzyme_file = directory + '/nodes' + '/Enzyme.csv' + + kg_filename = directory + '/rels' + '/combined_kg.csv' + + kg = pd.read_csv(kg_filename,delimiter='\t') + kg = kg[['subject','object']] + kg_vals = pd.unique(kg[['subject', 'object']].values.ravel()).tolist() + kg_vals = [str(x) for x in kg_vals] + + kg_labels = {} + + phenio_triples,phenio_labels = process_kg_covid19_files(phenio_triples_file,phenio_labels_file) + covid19_triples,covid19_labels = process_kg_covid19_files(kg_covid19_triples_file,kg_covid19_labels_file) + + enzyme_df = pd.read_csv(enzyme_file,delimiter=';') + enz_list = [] + + #Get uri (ex: O88037) and labels (ex: Probable SapB synthase) for all enzymes + print('extracting enzyme labels') + for i in range(len(enzyme_df)): + enz_list.append({'id': 'Uniprot:'+enzyme_df.iloc[i].loc['uniprot:ID(Enzyme)'] , + 'category': 'biolink:Protein' , + 'name': enzyme_df.iloc[i].loc['names'], + 'description': ''}) + + enzyme_new_df = pd.DataFrame(enz_list) + + kg_list = [] + #Convert all uris that exist in phenio or kg-covid19 to labels + for i in tqdm(kg_vals): + #Determine category of node. What if GO term is not biological process? 
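+        #(Assumption: GO IDs reaching this file came from the process-focused
+        # rhea2go / GO-PLUS filtering upstream, so biolink:BiologicalProcess
+        # is used; a GO function or component term would be mislabelled here.)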
+ if 'NCBITaxon:' in i: cat = 'biolink:OrganismalEntity' + if 'MONDO:' in i: cat = 'biolink:Disease' + if 'CHEBI:' in i: cat = 'biolink:ChemicalSubstance' + if 'GO:' in i: cat = 'biolink:BiologicalProcess' + try: + kg_list.append({'id': i , + 'category': cat , + 'name': phenio_labels.loc[phenio_labels['id'] == i,'name'].values[0], + 'description': ''}) + except (KeyError,IndexError): + #print('val doesnt exist in phenio: ',i) + pass + try: + kg_list.append({'id': i , + 'category': cat , + 'name': covid19_labels.loc[covid19_labels['id'] == i,'name'].values[0], + 'description': ''}) + except (KeyError,IndexError): + #print('val doesnt exist in kg-covid19: ',i) + pass + + kg_new_df = pd.DataFrame(kg_list) + + #Combine enzymes df with other labels from phenio and kg-covid19 + combined_nodes = pd.concat([kg_new_df, enzyme_new_df], axis=0) + + #Add Rhea labels: + rhea_vals = [i for i in kg_vals if 'rhea' in i.lower()] + rhea_list = [] + #Dictionary to output Rhea nodes in current kg form, not kgx + rhea_labels = {} + for i in rhea_vals: + rhea_list.append({'id': i , + 'category': 'biolink:Reaction' , + 'name': i, + 'description': ''}) + rhea_labels[i] = {'id':i, 'label':i} + + #Output Rhea_nodes file + rhea_kg_df = pd.DataFrame(rhea_labels.values()) + rhea_kg_df.to_csv(directory + '/nodes' + '/Rhea_nodes.csv', index=False, encoding='utf-8', sep=';') + + rhea_new_df = pd.DataFrame(rhea_list) + + #Combine all df label types and output + combined_nodes = pd.concat([combined_nodes, rhea_new_df], axis=0) + combined_nodes.to_csv(directory + '/combined_kgx_merged-kg_nodes.csv',sep='\t',index=False) + + +if __name__ == '__main__': + main() \ No newline at end of file From 414fca0751211afca55bda4a82d72155b181417c Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Thu, 21 Sep 2023 21:18:43 -0600 Subject: [PATCH 28/29] Update README.md --- metanetx_uniprot/README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/metanetx_uniprot/README.md b/metanetx_uniprot/README.md index a9e9503c..5b76a0d0 100644 --- a/metanetx_uniprot/README.md +++ b/metanetx_uniprot/README.md @@ -3,7 +3,7 @@ Code is reused from Biochem4j: https://github.com/neilswainston/biochem4j/tree/master/sbcdb Access chemical, reaction, enzyme, and organism information from the following sources: -- libchebipy +- libchebipy (note, the _parsers.py file found in this repo must be updated for the libchebipy library at ~/libchebipy/_parsers.py) - NCBITaxonomy - MetaNetX - Rhea @@ -20,3 +20,12 @@ To run and only get reference proteome taxa that also exist in kg-microbe: python build_taxa_ids.py ~/biochem4j 1 ``` *Note, uses ncbitaxon.json (built from kg-microbe) which is expected to be in the Files directory. 
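+
+*Note, the full pipeline (build.py) also reads GO-PLUS.csv, hp_kgx_tsv_nodes.tsv / hp_kgx_tsv_edges.tsv, and phenio_merged-kg_nodes.tsv / phenio_merged-kg_edges.tsv from the same Files directory.
+
+For the graph-combining steps described next, pass the top-level output directory: combine_rels.py and create_labels_file.py append /rels and /nodes to --directory themselves, so `--directory ~/biochem4j` is what matches the output paths listed below.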
+ +To build the entire graph by combining all separate triples files, and creating a kgx format nodes file: +``` +python combine_rels.py --directory ~/biochem4j/rels +python create_labels_file.py --directory ~/biochem4j/rels +``` +This will output the following files: +- ~/biochem4j/rels/combined_kg.csv +- ~/biochem4j/combined_kgx_merged-kg_nodes.csv From 350e05a66be3261e9214e19a668c570a246571e2 Mon Sep 17 00:00:00 2001 From: Brook Santangelo <70932395+bsantan@users.noreply.github.com> Date: Thu, 21 Sep 2023 21:19:28 -0600 Subject: [PATCH 29/29] Add files via upload Updated to use wget instead of url retrieve due to ftp issues --- metanetx_uniprot/_parsers.py | 686 +++++++++++++++++++++++++++++++++++ 1 file changed, 686 insertions(+) create mode 100644 metanetx_uniprot/_parsers.py diff --git a/metanetx_uniprot/_parsers.py b/metanetx_uniprot/_parsers.py new file mode 100644 index 00000000..b646bf16 --- /dev/null +++ b/metanetx_uniprot/_parsers.py @@ -0,0 +1,686 @@ +''' +libChEBIpy (c) University of Manchester 2015 + +libChEBIpy is licensed under the MIT License. + +To view a copy of this license, visit . + +@author: neilswainston +''' +import calendar +import datetime +import gzip +import io +import os.path +import re +import tempfile +import zipfile + +import six.moves.urllib.parse as urlparse +from six.moves.urllib.request import urlretrieve, urlcleanup + +from ._comment import Comment +from ._compound_origin import CompoundOrigin +from ._database_accession import DatabaseAccession +from ._formula import Formula +from ._name import Name +from ._reference import Reference +from ._relation import Relation +from ._structure import Structure +import wget + + +__ALL_IDS = {} +__ALL_NAMES = {} +__COMMENTS = {} +__COMPOUND_ORIGINS = {} +__CHARGES = {} +__CREATED_BYS = {} +__DATABASE_ACCESSIONS = {} +__DEFAULT_STRUCTURE_IDS = [] +__DEFINITIONS = {} +__FORMULAE = {} +__INCHIS = {} +__INCHI_KEYS = {} +__INCOMINGS = {} +__MASSES = {} +__MODIFIED_ONS = {} +__NAMES = {} +__OUTGOINGS = {} +__PARENT_IDS = {} +__SMILES = {} +__SOURCES = {} +__STARS = {} +__STATUSES = {} + +__DOWNLOAD_PARAMS = {'path': os.path.join(os.path.expanduser('~'), 'libChEBI'), + 'auto_update': True} + + +def set_download_cache_path(path): + '''Sets download cache path.''' + __DOWNLOAD_PARAMS['path'] = path + + +def set_auto_update(auto_update): + '''Sets auto update flag.''' + __DOWNLOAD_PARAMS['auto_update'] = auto_update + + +def get_formulae(chebi_id): + '''Returns formulae''' + if not __FORMULAE: + __parse_chemical_data() + + return __FORMULAE[chebi_id] if chebi_id in __FORMULAE else [] + + +def get_all_formulae(chebi_ids): + '''Returns all formulae''' + all_formulae = [get_formulae(chebi_id) for chebi_id in chebi_ids] + return [x for sublist in all_formulae for x in sublist] + + +def get_mass(chebi_id): + '''Returns mass''' + if not __MASSES: + __parse_chemical_data() + + return __MASSES[chebi_id] if chebi_id in __MASSES else float('NaN') + + +def get_charge(chebi_id): + '''Returns charge''' + if not __CHARGES: + __parse_chemical_data() + + return __CHARGES[chebi_id] if chebi_id in __CHARGES else float('NaN') + + +def __parse_chemical_data(): + '''Gets and parses file''' + filename = get_file('chemical_data.tsv') + + with io.open(filename, 'r', encoding='cp1252') as textfile: + next(textfile) + + for line in textfile: + tokens = line.strip().split('\t') + + if tokens[3] == 'FORMULA': + # Many seemingly contradictory formulae exist, + # depending upon the source database + chebi_id = int(tokens[1]) + + if chebi_id not in 
__FORMULAE:
+                    __FORMULAE[chebi_id] = []
+
+                # Append formula:
+                form = Formula(tokens[4], tokens[2])
+                __FORMULAE[chebi_id].append(form)
+
+            elif tokens[3] == 'MASS':
+                __MASSES[int(tokens[1])] = float(tokens[4])
+
+            elif tokens[3] == 'CHARGE':
+                __CHARGES[int(tokens[1])] = int(tokens[4]
+                                                if tokens[4][-1] != '-'
+                                                else '-' + tokens[4][:-1])
+
+
+def get_comments(chebi_id):
+    '''Returns comments'''
+    if not __COMMENTS:
+        __parse_comments()
+
+    return __COMMENTS[chebi_id] if chebi_id in __COMMENTS else []
+
+
+def get_all_comments(chebi_ids):
+    '''Returns all comments'''
+    all_comments = [get_comments(chebi_id) for chebi_id in chebi_ids]
+    return [x for sublist in all_comments for x in sublist]
+
+
+def __parse_comments():
+    '''Gets and parses file'''
+    filename = get_file('comments.tsv')
+
+    with io.open(filename, 'r', encoding='cp1252') as textfile:
+        next(textfile)
+
+        for line in textfile:
+            tokens = line.strip().split('\t')
+            chebi_id = int(tokens[1])
+
+            if chebi_id not in __COMMENTS:
+                __COMMENTS[chebi_id] = []
+
+            # Append Comment ('%m' parses the month; '%M' would parse
+            # minutes):
+            com = Comment(tokens[3],
+                          tokens[4],
+                          tokens[5],
+                          datetime.datetime.strptime(tokens[2], '%Y-%m-%d'))
+
+            __COMMENTS[chebi_id].append(com)
+
+
+def get_compound_origins(chebi_id):
+    '''Returns compound origins'''
+    if not __COMPOUND_ORIGINS:
+        __parse_compound_origins()
+    return __COMPOUND_ORIGINS[chebi_id] if chebi_id in \
+        __COMPOUND_ORIGINS else []
+
+
+def get_all_compound_origins(chebi_ids):
+    '''Returns all compound origins'''
+    all_compound_origins = [get_compound_origins(chebi_id)
+                            for chebi_id in chebi_ids]
+    return [x for sublist in all_compound_origins for x in sublist]
+
+
+def __parse_compound_origins():
+    '''Gets and parses file'''
+    filename = get_file('compound_origins.tsv')
+
+    with io.open(filename, 'r', encoding='cp1252') as textfile:
+        next(textfile)
+
+        for line in textfile:
+            tokens = line.strip().split('\t')
+
+            if len(tokens) > 10:
+                chebi_id = int(tokens[1])
+
+                if chebi_id not in __COMPOUND_ORIGINS:
+                    __COMPOUND_ORIGINS[chebi_id] = []
+
+                # Append CompoundOrigin:
+                comp_orig = CompoundOrigin(tokens[2], tokens[3],
+                                           tokens[4], tokens[5],
+                                           tokens[6], tokens[7],
+                                           tokens[8], tokens[9],
+                                           tokens[10])
+                __COMPOUND_ORIGINS[chebi_id].append(comp_orig)
+
+
+def get_status(chebi_id):
+    '''Returns status'''
+    if not __STATUSES:
+        __parse_compounds()
+
+    return __STATUSES[chebi_id] if chebi_id in __STATUSES else None
+
+
+def get_source(chebi_id):
+    '''Returns source'''
+    if not __SOURCES:
+        __parse_compounds()
+
+    return __SOURCES[chebi_id] if chebi_id in __SOURCES else None
+
+
+def get_parent_id(chebi_id):
+    '''Returns parent id'''
+    if not __PARENT_IDS:
+        __parse_compounds()
+
+    return __PARENT_IDS[chebi_id] if chebi_id in __PARENT_IDS else float('NaN')
+
+
+def get_all_ids(chebi_id):
+    '''Returns all ids'''
+    if not __ALL_IDS:
+        __parse_compounds()
+
+    return __ALL_IDS[chebi_id] if chebi_id in __ALL_IDS else []
+
+
+def get_name(chebi_id):
+    '''Returns name'''
+    if not __NAMES:
+        __parse_compounds()
+
+    return __NAMES[chebi_id] if chebi_id in __NAMES else None
+
+
+def get_definition(chebi_id):
+    '''Returns definition'''
+    if not __DEFINITIONS:
+        __parse_compounds()
+
+    return __DEFINITIONS[chebi_id] if chebi_id in __DEFINITIONS else None
+
+
+def get_modified_on(chebi_id):
+    '''Returns modified on'''
+    if not __MODIFIED_ONS:
+        __parse_compounds()
+
+    return __MODIFIED_ONS[chebi_id] if chebi_id in __MODIFIED_ONS else None
+
+
+def get_all_modified_on(chebi_ids):
+    '''Returns all modified on'''
+    all_modified_ons = [get_modified_on(chebi_id) for chebi_id in chebi_ids]
+    all_modified_ons = [modified_on for modified_on in all_modified_ons
+                        if modified_on is not None]
+    return None if not all_modified_ons else sorted(all_modified_ons)[-1]
+
+
+def get_created_by(chebi_id):
+    '''Returns created by'''
+    if not __CREATED_BYS:
+        __parse_compounds()
+
+    # Guard against chebi_ids missing from __CREATED_BYS:
+    return __CREATED_BYS[chebi_id] if chebi_id in __CREATED_BYS else None
+
+
+def get_star(chebi_id):
+    '''Returns star'''
+    if not __STARS:
+        __parse_compounds()
+
+    return __STARS[chebi_id] if chebi_id in __STARS else float('NaN')
+
+
+def __parse_compounds():
+    '''Gets and parses file'''
+    filename = get_file('compounds.tsv.gz')
+
+    with io.open(filename, 'r', encoding='cp1252') as textfile:
+        next(textfile)
+
+        for line in textfile:
+            tokens = line.strip().split('\t')
+            chebi_id = int(tokens[0])
+
+            __STATUSES[chebi_id] = tokens[1]
+            __SOURCES[chebi_id] = tokens[3]
+
+            parent_id_token = tokens[4]
+            __PARENT_IDS[chebi_id] = float('NaN') \
+                if parent_id_token == 'null' \
+                else int(parent_id_token)
+            __put_all_ids(chebi_id, chebi_id)
+
+            if parent_id_token != 'null':
+                parent_id = int(parent_id_token)
+                __put_all_ids(parent_id, chebi_id)
+
+            __NAMES[chebi_id] = None if tokens[5] == 'null' else tokens[5]
+            __DEFINITIONS[chebi_id] = None if tokens[6] == 'null' \
+                else tokens[6]
+            __MODIFIED_ONS[chebi_id] = None if tokens[7] == 'null' \
+                else datetime.datetime.strptime(tokens[7], '%Y-%m-%d')
+            __CREATED_BYS[chebi_id] = None if tokens[8] == 'null' \
+                or len(tokens) == 9 else tokens[8]
+            __STARS[chebi_id] = float('NaN') \
+                if tokens[9 if len(tokens) > 9 else 8] == 'null' \
+                else int(tokens[9 if len(tokens) > 9 else 8])
+
+
+def __put_all_ids(parent_id, child_id):
+    '''Registers child_id against parent_id'''
+    if parent_id in __ALL_IDS:
+        __ALL_IDS[parent_id].append(child_id)
+    else:
+        __ALL_IDS[parent_id] = [child_id]
+
+
+def get_database_accessions(chebi_id):
+    '''Returns database accession'''
+    if not __DATABASE_ACCESSIONS:
+        __parse_database_accessions()
+
+    return __DATABASE_ACCESSIONS[chebi_id] if chebi_id in \
+        __DATABASE_ACCESSIONS else []
+
+
+def get_all_database_accessions(chebi_ids):
+    '''Returns all database accessions'''
+    all_database_accessions = [get_database_accessions(chebi_id)
+                               for chebi_id in chebi_ids]
+    return [x for sublist in all_database_accessions for x in sublist]
+
+
+def __parse_database_accessions():
+    '''Gets and parses file'''
+    filename = get_file('database_accession.tsv')
+
+    with io.open(filename, 'r', encoding='cp1252') as textfile:
+        next(textfile)
+
+        for line in textfile:
+            tokens = line.strip().split('\t')
+            chebi_id = int(tokens[1])
+
+            if chebi_id not in __DATABASE_ACCESSIONS:
+                __DATABASE_ACCESSIONS[chebi_id] = []
+
+            # Append DatabaseAccession:
+            dat_acc = DatabaseAccession(tokens[3], tokens[4], tokens[2])
+
+            __DATABASE_ACCESSIONS[chebi_id].append(dat_acc)
+
+
+def get_inchi(chebi_id):
+    '''Returns InChI string'''
+    if not __INCHIS:
+        __parse_inchi()
+
+    return __INCHIS[chebi_id] if chebi_id in __INCHIS else None
+
+
+def __parse_inchi():
+    '''Gets and parses file'''
+    filename = get_file('chebiId_inchi.tsv')
+
+    with io.open(filename, 'r', encoding='cp1252') as textfile:
+        next(textfile)
+
+        for line in textfile:
+            tokens = line.strip().split('\t')
+            __INCHIS[int(tokens[0])] = tokens[1]
+
+
+def get_names(chebi_id):
+    '''Returns names'''
+    if not __ALL_NAMES:
+        __parse_names()
+
+    return __ALL_NAMES[chebi_id] if chebi_id in __ALL_NAMES else []
+
+
+def get_all_names(chebi_ids):
+    '''Returns all names'''
+    all_names = [get_names(chebi_id)
+                 for chebi_id in chebi_ids]
+    return [x for sublist in all_names for x in sublist]
+
+
+def __parse_names():
+    '''Gets and parses file'''
+    filename = get_file('names.tsv.gz')
+
+    with io.open(filename, 'r', encoding='cp1252') as textfile:
+        next(textfile)
+
+        for line in textfile:
+            tokens = line.strip().split('\t')
+            chebi_id = int(tokens[1])
+
+            if chebi_id not in __ALL_NAMES:
+                __ALL_NAMES[chebi_id] = []
+
+            # Append Name:
+            nme = Name(tokens[4],
+                       tokens[2],
+                       tokens[3],
+                       tokens[5] == 'T',
+                       tokens[6])
+
+            __ALL_NAMES[chebi_id].append(nme)
+
+
+def get_references(chebi_ids):
+    '''Returns references'''
+    references = []
+    chebi_ids = [str(chebi_id) for chebi_id in chebi_ids]
+
+    filename = get_file('reference.tsv.gz')
+
+    with io.open(filename, 'r', encoding='cp1252') as textfile:
+        next(textfile)
+
+        for line in textfile:
+            tokens = line.strip().split('\t')
+
+            if tokens[0] in chebi_ids:
+                # Append Reference:
+                if len(tokens) > 3:
+                    ref = Reference(tokens[1], tokens[2], tokens[3],
+                                    tokens[4])
+                else:
+                    ref = Reference(tokens[1], tokens[2])
+
+                references.append(ref)
+    return references
+
+
+def get_outgoings(chebi_id):
+    '''Returns outgoings'''
+    if not __OUTGOINGS:
+        __parse_relation()
+
+    return __OUTGOINGS[chebi_id] if chebi_id in __OUTGOINGS else []
+
+
+def get_all_outgoings(chebi_ids):
+    '''Returns all outgoings'''
+    all_outgoings = [get_outgoings(chebi_id) for chebi_id in chebi_ids]
+    return [x for sublist in all_outgoings for x in sublist]
+
+
+def get_incomings(chebi_id):
+    '''Returns incomings'''
+    if not __INCOMINGS:
+        __parse_relation()
+
+    return __INCOMINGS[chebi_id] if chebi_id in __INCOMINGS else []
+
+
+def get_all_incomings(chebi_ids):
+    '''Returns all incomings'''
+    all_incomings = [get_incomings(chebi_id) for chebi_id in chebi_ids]
+    return [x for sublist in all_incomings for x in sublist]
+
+
+def __parse_relation():
+    '''Gets and parses file'''
+    relation_filename = get_file('relation.tsv')
+
+    # Context manager ensures the file handle is closed:
+    with open(relation_filename, 'r') as relation_textfile:
+        next(relation_textfile)
+
+        for line in relation_textfile:
+            tokens = line.strip().split('\t')
+
+            source_chebi_id = int(tokens[3])
+            target_chebi_id = int(tokens[2])
+            typ = tokens[1]
+
+            if source_chebi_id not in __OUTGOINGS:
+                __OUTGOINGS[source_chebi_id] = []
+
+            if target_chebi_id not in __INCOMINGS:
+                __INCOMINGS[target_chebi_id] = []
+
+            target_relation = Relation(typ, str(target_chebi_id), tokens[4])
+            source_relation = Relation(typ, str(source_chebi_id), tokens[4])
+
+            __OUTGOINGS[source_chebi_id].append(target_relation)
+            __INCOMINGS[target_chebi_id].append(source_relation)
+
+
+def get_inchi_key(chebi_id):
+    '''Returns InChI key'''
+    if not __INCHI_KEYS:
+        __parse_structures()
+
+    return __INCHI_KEYS[chebi_id] if chebi_id in __INCHI_KEYS else None
+
+
+def get_smiles(chebi_id):
+    '''Returns SMILES string'''
+    if not __SMILES:
+        __parse_structures()
+
+    return __SMILES[chebi_id] if chebi_id in __SMILES else None
+
+
+def get_mol(chebi_id):
+    '''Returns mol'''
+    chebi_id_regexp = '^\\d+\\,' + str(chebi_id) + '\\,.*'
+    mol_file_end_regexp = '\",mol,\\dD,[Y\\|N],[Y\\|N]$'
+    this_structure = []
+
+    filename = get_file('structures.csv.gz')
+
+    with io.open(filename, 'r', encoding='cp1252') as textfile:
+        in_chebi_id = False
+
+        next(textfile)
+
+        for line in textfile:
+            if in_chebi_id or line[0].isdigit():
+                if re.match(chebi_id_regexp, line):
+                    tokens = line.strip().split(',')
+                    in_chebi_id = True
+                    this_structure = []
+                    this_structure.append(','.join(tokens[2:])
+                                          .replace('\"', ''))
+                    this_structure.append('\n')
+                elif in_chebi_id:
+
+                    if re.match(mol_file_end_regexp, line):
+                        tokens = line.strip().split(',')
+
+                        if _is_default_structure(tokens[3]):
+                            this_structure.append(tokens[0].replace('\"', ''))
+                            return Structure(''.join(this_structure),
+                                             Structure.mol,
+                                             int(tokens[2][0]))
+
+                        # else:
+                        this_structure = []
+                        in_chebi_id = False
+                        continue
+
+                    this_structure.append(line)
+
+    return None
+
+
+def get_mol_filename(chebi_id):
+    '''Returns mol file'''
+    mol = get_mol(chebi_id)
+
+    if mol is None:
+        return None
+
+    # Keyword arguments, since mkstemp's positional order is
+    # (suffix, prefix); this yields '<chebi_id>_XXXXXX.mol' files:
+    file_descriptor, mol_filename = tempfile.mkstemp(
+        prefix=str(chebi_id) + '_', suffix='.mol')
+    mol_file = open(mol_filename, 'w')
+    mol_file.write(mol.get_structure())
+    mol_file.close()
+    os.close(file_descriptor)
+
+    return mol_filename
+
+
+def __parse_structures():
+    '''Gets and parses file'''
+    filename = get_file('structures.csv.gz')
+
+    with io.open(filename, 'r', encoding='cp1252') as textfile:
+        next(textfile)
+
+        for line in textfile:
+            tokens = line.strip().split(',')
+
+            if len(tokens) == 7:
+                if tokens[3] == 'InChIKey':
+                    __INCHI_KEYS[int(tokens[1])] = \
+                        Structure(tokens[2],
+                                  Structure.InChIKey,
+                                  int(tokens[4][0]))
+                elif tokens[3] == 'SMILES':
+                    __SMILES[int(tokens[1])] = \
+                        Structure(tokens[2],
+                                  Structure.SMILES,
+                                  int(tokens[4][0]))
+
+
+def get_file(filename):
+    '''Downloads filename from ChEBI FTP site'''
+    destination = __DOWNLOAD_PARAMS['path']
+    filepath = os.path.join(destination, filename)
+
+    if not __is_current(filepath):
+
+        if not os.path.exists(destination):
+            os.makedirs(destination)
+
+        url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' + \
+            'Flat_file_tab_delimited/'
+
+        # wget copes better with the EBI FTP site than urlretrieve:
+        wget.download(url + filename, out=filepath)
+
+        # urlretrieve(urlparse.urljoin(url, filename), filepath)
+        # urlcleanup()
+
+    if filepath.endswith('.zip'):
+        zfile = zipfile.ZipFile(filepath, 'r')
+        filepath = os.path.join(destination, zfile.namelist()[0])
+        zfile.extractall(destination)
+    elif filepath.endswith('.gz'):
+        unzipped_filepath = filepath[:-len('.gz')]
+
+        if os.path.exists(unzipped_filepath) \
+                and __is_current(unzipped_filepath):
+            filepath = unzipped_filepath
+        else:
+            input_file = gzip.open(filepath, 'rb')
+            filepath = os.path.join(destination,
+                                    input_file.name[:-len('.gz')])
+            output_file = open(filepath, 'wb')
+
+            for line in input_file:
+                output_file.write(line)
+
+            input_file.close()
+            output_file.close()
+
+    return filepath
+
+
+def __is_current(filepath):
+    '''Checks whether file is current'''
+    if not __DOWNLOAD_PARAMS['auto_update']:
+        return True
+
+    if not os.path.isfile(filepath):
+        return False
+
+    return datetime.datetime.utcfromtimestamp(os.path.getmtime(filepath)) \
+        > __get_last_update_time()
+
+
+def __get_last_update_time():
+    '''Returns last FTP site update time'''
+    now = datetime.datetime.utcnow()
+
+    # Get the first Tuesday of the month
+    first_tuesday = __get_first_tuesday(now)
+
+    if first_tuesday < now:
+        return first_tuesday
+    # else:
+    first_of_month = datetime.datetime(now.year, now.month, 1)
+    last_month = first_of_month + datetime.timedelta(days=-1)
+    return __get_first_tuesday(last_month)
+
+
+def __get_first_tuesday(this_date):
+    '''Get the first Tuesday of the month'''
+    month_range = calendar.monthrange(this_date.year, this_date.month)
+    first_of_month = datetime.datetime(this_date.year, this_date.month, 1)
+    first_tuesday_day = (calendar.TUESDAY - month_range[0]) % 7
+    first_tuesday = first_of_month + datetime.timedelta(days=first_tuesday_day)
+    return first_tuesday
+
+
+def _is_default_structure(def_struct):
+    '''Is default structure?'''
+    return def_struct.upper() == 'Y'
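
For reference, a minimal smoke test of the patched parser module: a sketch, assuming _parsers.py has replaced the installed copy as the README describes, and that the wget package is installed. The ChEBI ID, cache path, and printed values are illustrative only:
```
from libchebipy import _parsers as parsers

# The first call for each file triggers a wget download into the cache path.
parsers.set_download_cache_path('/tmp/libChEBI')

chebi_id = 15377  # CHEBI:15377, water
print(parsers.get_name(chebi_id))            # e.g. 'water'
print(parsers.get_mass(chebi_id))            # mass from chemical_data.tsv
print(parsers.get_charge(chebi_id))          # charge, or NaN if absent
print(len(parsers.get_incomings(chebi_id)),  # relations from relation.tsv
      'incoming relations')
```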