From 1371b156f91a272a6ad5100c4c8628299dd206ab Mon Sep 17 00:00:00 2001 From: Andrew Latham Date: Thu, 20 Jun 2024 05:20:31 -0700 Subject: [PATCH 1/5] Added prepare_protein_library function to spatiotemporal --- modules/spatiotemporal/README.md | 29 +++ .../pyext/src/composition_scoring.py | 71 +++++++ .../pyext/src/prepare_protein_library.py | 189 ++++++++++++++++++ 3 files changed, 289 insertions(+) create mode 100644 modules/spatiotemporal/pyext/src/prepare_protein_library.py diff --git a/modules/spatiotemporal/README.md b/modules/spatiotemporal/README.md index 4e8db2878f..2e234dc8c7 100644 --- a/modules/spatiotemporal/README.md +++ b/modules/spatiotemporal/README.md @@ -147,6 +147,17 @@ prot - string, protein or subcomplex we are interested in finding subcomplex_components - subcomplexes or components in a given node, which can be accessed by graphNode.get_subcomplex_components() +### calc_likelihood_state(exp_comp_map, t, state) +Function that calculates the likelihood of an individual state, used by prepare_protein_library. Returns a single float, the weight of the protein copy number based on a Gaussian likelihood function. + +exp_comp_map - dictionary, which describes protein stoicheometery. The key describes the protein, which should correspond to names within the expected_subcomplexes. Only copy numbers for proteins +or subcomplexes included in this dictionary will be scored. For each of these proteins, a csv file should be provided with protein copy number data. The csv file should have 3 columns, +1) "Time", which matches up to the possible times in the graph, 2) "mean", the average protein copy number at that time point from experiment, and 3) "std", the standard deviation of that protein copy number from experiment. + +t - string, time at which the composition likelihood should be calculated. Should match one a possible value in the first column of the exp_comp_map. + +state - list of integers, an array of the number of protein copy numbers for which the likelihood will be calculated. This array should list the proteins in the same order as the exp_comp_map. + ## score_graph.py ### score_graph(nodes,keys): @@ -257,4 +268,22 @@ width - string, width of each node on the dag ### draw_dag_in_graphviz(nodes, coloring=None, draw_label=True, fontname="Helvetica", fontsize="18", penscale=0.6, arrowsize=1.2, height="0.6",width="0.6"): Function used by draw_dag() to render the graph using graphviz. Takes a list of graphNodes (nodes) and initializes the nodes and edges. Coloring is expected to be a list of RGBA strings specifying how to color each node. Expected to be same length as nodes. +## prepare_protein_library.py + +### prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, template_topology='', template_dict={}, match_final_state=True) +Function that reads in experimental stoicheometery data and calculates which compositions and location assignments should be sampled for spatiotemporal modeling, which are saved as config files. Optionally, a PMI topology file can be provided, in which case topology files for each composition and location assignment are also written. +The output is 3 types of files: 1. *_time.config - configuration files, which list the proteins included at each time point for each model 2. time.txt - protein copy number files. Each row is a protein copy number state and each column is the protein copy number in that state. Note that each protein copy number state can result in multiple location assignments. 3. *_-_time-topol.txt - topology files for each copy number and location assignment. + +times - list of strings, the times at which the stoicheometery data should be read. + +exp_comp_map - dictionary, which describes protein stoicheometery. The key describes the protein, which should correspond to names within the expected_subcomplexes. Only copy numbers for proteins or subcomplexes included in this dictionary will be scored. For each of these proteins, a csv file should be provided with protein copy number data. The csv file should have 3 columns, 1) "Time", which matches up to the possible times in the graph, 2) "mean", the average protein copy number at that time point from experiment, and 3) "std", the standard deviation of that protein copy number from experiment. + +expected_subcomplexes - list of all possible subcomplex strings in the model. Should be a list without duplicates of all components in the subcomplex configuration files. + +nmodels - int, number of models with different protein copy numbers to generate at each time point. + +template_topology: string, name of the topology file for the complete complex (default: '', no topology files are output) + +template_dict: dictionary for connecting the spatiotemporal model to the topology file. The keys (string) are the names of the proteins, defined by the expected_complexes variable. The values (list) are the names of all proteins in the topology file that should have the same copy number as the labeled protein, specifically the "molecule_name." (default: {}, no topology files are output) +match_final_state: Boolean, determines whether to fix the final state to the state defined by expected_subcomplexes. True enforces this match and thus ensures that the final time has only one state. (default: True) \ No newline at end of file diff --git a/modules/spatiotemporal/pyext/src/composition_scoring.py b/modules/spatiotemporal/pyext/src/composition_scoring.py index c7340828d6..cca5c2025b 100644 --- a/modules/spatiotemporal/pyext/src/composition_scoring.py +++ b/modules/spatiotemporal/pyext/src/composition_scoring.py @@ -115,3 +115,74 @@ def calc_likelihood(exp_comp_map, nodes): # add state weight to node node.add_score(float(weight)) return nodes + +def calc_likelihood_state(exp_comp_map, t, state): + """ + Function that adds a score for the compositional likelihood for all + states, similar to how composition_likelihood_function calculates the + composition likelihood of a node. Used by prepare_protein_library. + The composition likelihood assumes a Gaussian distribution for copy + number of each protein or subcomplex with means and standard + deviatiations derived from experiment. Returns the nodes, with the + new weights added. + + @param exp_comp_map: dictionary, which describes protein stoicheometery. + The key describes the protein, which should correspond to names + within the expected_subcomplexes. Only copy numbers for proteins + or subcomplexes included in this dictionary will be scored. For + each of these proteins, a csv file should be provided with protein + copy number data. The csv file should have 3 columns, + 1) "Time", which matches up to the possible times in the graph, + 2) "mean", the average protein copy number at that time point + from experiment, and 3) "std", the standard deviation of that + protein copy number from experiment. + @param t: string, time at which the composition likelihood should be + calculated. Should match one a possible value in the first column + of the exp_comp_map. + @param state: list of integers, an array of the number of protein copy numbers + for which the likelihood will be calculated. This array should list + the proteins in the same order as the exp_comp_map. + @return weight: float, the weight of the graphNode according to the composition + likelihood function. + """ + import pandas as pd + # Data is stored as a dictionary of dictionaries. The first dictionary + # references which protein you are refering to. + # the 2nd dictionary references which time you are refering to. The return + # is the mean or standard deviation of the protein copy number + mean = {} + std = {} + state_cn={} + count=0 + # import csv file as pandas data frame + for prot in exp_comp_map.keys(): + prot_dict_mean = {} + prot_dict_std = {} + state_cn[prot]=state[count] + if os.path.exists(exp_comp_map[prot]): + exp = pd.read_csv(exp_comp_map[prot]) + else: + raise Exception( + "Error!!! Check exp_comp_map. Unable to find composition " + "file: " + exp_comp_map[prot] + '\nClosing...') + for i in range(len(exp)): + prot_dict_mean[exp['Time'][i]] = exp['mean'][i] + prot_dict_std[exp['Time'][i]] = exp['std'][i] + mean[prot] = prot_dict_mean + std[prot] = prot_dict_std + count += 1 + # compute the compositional likelihood of the nodes + weight=0 + for prot in exp_comp_map.keys(): + # x counts the number of proteins of a given type in the node + x = state_cn[prot] + # check std is greater than 0 + if std[prot][t] > 0: + pass + else: + warnings.warn( + 'WARNING!!! Standard deviation of protein ' + prot + + ' 0 or less at time ' + t + + '. May lead to illogical results.') + weight += (0.5 * ((x - mean[prot][t]) / std[prot][t]) ** 2 + np.log(std[prot][t] * np.sqrt(2 * np.pi))) + return weight \ No newline at end of file diff --git a/modules/spatiotemporal/pyext/src/prepare_protein_library.py b/modules/spatiotemporal/pyext/src/prepare_protein_library.py new file mode 100644 index 0000000000..6e8928c400 --- /dev/null +++ b/modules/spatiotemporal/pyext/src/prepare_protein_library.py @@ -0,0 +1,189 @@ +"""@namespace IMP.spatiotemporal.prepare_protein_library + Function for preparing spatiotemporal models for sampling. +""" +import numpy as np +import itertools +import pandas as pd +from IMP.spatiotemporal import composition_scoring + +def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, template_topology='', template_dict={}, match_final_state=True): + """ + Function that reads in experimental stoicheometery data and calculates which compositions and location + assignments should be sampled for spatiotemporal modeling, which are saved as config files. Optionally, a PMI + topology file can be provided, in which case topology files for each composition and location assignment are + also written. + The output is 3 types of files: + 1. *_time.config - configuration files, which list the proteins included at each time point for each model + 2. time.txt - protein copy number files. Each row is a protein copy number state and each column is the + protein copy number in that state. Note that each protein copy number state can result in multiple + location assignments. + 3. *_time_topol.txt - topology files for each copy number and location assignment. + + @param times: list of strings, the times at which the stoicheometery data should be read. + @param exp_comp_map: dictionary, which describes protein stoicheometery. + The key describes the protein, which should correspond to names + within the expected_subcomplexes. Only copy numbers for proteins + or subcomplexes included in this dictionary will be scored. For + each of these proteins, a csv file should be provided with protein + copy number data. The csv file should have 3 columns, + 1) "Time", which matches up to the possible times in the graph, + 2) "mean", the average protein copy number at that time point + from experiment, and 3) "std", the standard deviation of that + protein copy number from experiment. + @param expected_subcomplexes: list of all possible subcomplex strings + in the model. Should be a list without duplicates of + all components in the subcomplex configuration files. + @param nmodels: int, number of models with different protein copy numbers to generate at each time point. + @param template_topology: string, name of the topology file for the complete complex. + (default: '', no topology files are output) + @param template_dict: dictionary for connecting the spatiotemporal model to the topology file. + The keys (string) are the names of the proteins, defined by the expected_complexes variable. + The values (list) are the names of all proteins in the topology file that should have the same copy number + as the labeled protein, specifically the "molecule_name." + (default: {}, no topology files are output) + @param match_final_state: Boolean, determines whether to fix the final state to the state defined by + expected_subcomplexes. True enforces this match and thus ensures that the final time has only one state. + (default: True) + """ + # Assert that all inputs are the correct variable type + if not isinstance(times, list): + raise TypeError("times should be of type list") + if not isinstance(exp_comp_map, dict): + raise TypeError("times should be of type dict") + if not isinstance(nmodels, int): + raise TypeError("nmodels should be of type int") + if not isinstance(template_topology, str): + raise TypeError("template_topology should be of type str") + if not isinstance(template_dict, dict): + raise TypeError("template_dict should be of type dict") + if not isinstance(match_final_state, bool): + raise TypeError("match_final_state should be of type bool") + # Whether or not topology files should be written + include_topology = False + # calculate final copy numbers based on the expected complexes + final_CN=np.zeros(len(exp_comp.keys()),dtype=int) + for i, key in enumerate(exp_comp.keys()): + for subcomplex in expected_subcomplexes: + if key in subcomplex: + final_CN[i] += 1 + # Enumerate all possible compositions (independent of time) + # start by computing the range for each protein copy number + copy_num=[] + for CN in final_CN: + copy_num.append(range(CN+1)) + # Convert the range of protein copy numbers to all possible combinations of states + all_library = [l for l in itertools.product(*copy_num)] + # remove empty state + empty_state=[] + for i in range(len(final_CN)): + empty_state.append(0) + all_library.pop(all_library.index(tuple(empty_state))) + + # compare the complete composition library against the composition model + # at each time and extract top scoring compositions + for time in times: + # if match_final_state is True, restrict the final state to match the known copy numbers + if time==times[len(times)-1] and match_final_state: + olist=[list(final_CN)] + state_list=[] + # Keep all expected complexes + state_list.append(expected_subcomplexes) + # otherwise, calculate the nmodels most likely copy numbers for each time point + else: + # caclulate the weights for each state + unnormalized_weights =[] + for state in all_library: + unnormalized_weights.append(composition_scoring.calc_likelihood_state(exp_comp_map,time,state)) + unw = np.array(unnormalized_weights) + print(time) + print(all_library) + print(unw) + # get top scoring nmodels + mindx = np.argsort(unw)[0:nmodels] + # write out library with the top scoring models + olist = [] + state_list = [] + for m in mindx: + state = all_library[m] + # convert state counts to protein list + olist.append(list(state)) + # Loops over each copy number state + for state in olist: + cn_list=[] + # Loops over each protein + for i, cn in enumerate(state): + sub_list=[] + # initiate found_subcomplex + found_subcomplex = [] + for subcomplex in expected_subcomplexes: + # See which subcomplexes this protein appears in + if list(exp_comp_map.keys())[i] in subcomplex: + found_subcomplex.append(subcomplex) + # find proteins using combinatorics + prots=list(itertools.combinations(found_subcomplex, state[i])) + for prot in prots: + sub_list.append(prot) + if len(sub_list[0])>0: + cn_list.append(sub_list) + # combine the list of copy numbers + all_cn = [l for l in itertools.product(*cn_list)] + # format the copy number list from a list (all copy number combinations) of tuples (copy number of 1 + # state) of tuples (name of 1 protein) to a list (all copies) of lists (all proteins) + # loop over all cn + for cn in all_cn: + cn_list2=[] + # select each copy number + for n in cn: + # append each protein in each copy number to list + for prot in n: + cn_list2.append(prot) + state_list.append(cn_list2) + # write top "scoring" compositions to file + oary = np.array(olist, dtype=int) + header='' + for prot_name in exp_comp.keys(): + header=header+str(prot_name)+'\t\t\t\t' + np.savetxt( time + ".txt", oary,header=header) + + # write protein config library to file + for indx,prot_list in enumerate(state_list): + with open(str(indx+1) + "_" + time + ".config", "w") as fh: + for prot in prot_list: + fh.write(prot +"\n") + + if len(template_topology)>0: + include_topology=True + # write topology file for each state + for indx,prot_list in enumerate(state_list): + # Names of proteins to keep, according to template file + keep_prots=[] + # loop over each entry of prot_list and convert to the template file name + for prot in prot_list: + if prot in template_dict.keys(): + keep_prots.extend(template_dict[prot]) + else: + raise Exception("Protein " + prot + ' does not exist in template_dict\nClosing...') + # open new topology file + with open(str(indx+1) + "_" + time + "_topol.txt", "w") as fh: + old=open(template_topology,'r') + line=old.readline() + while line: + line_split = line.split('|') + # keep blank lines + if len(line_split)<2: + fh.write(line) + else: + # Keep lines where the protein name is in the template_dict + if line_split[1] in keep_prots: + fh.write(line) + # Keep instruction line + elif line_split[1] == 'molecule_name ': + fh.write(line) + line=old.readline() + old.close() + fh.close() + if include_topology: + print('Successfully calculated the most likely configurations, and saved them to configuration and topology ' + 'files.') + else: + print('Successfully calculated the most likely configurations, and saved them to configuration files.') \ No newline at end of file From 95539298c0dde96a54ba6b8724fa272581136840 Mon Sep 17 00:00:00 2001 From: Andrew Latham Date: Thu, 20 Jun 2024 05:22:23 -0700 Subject: [PATCH 2/5] Update README.md --- modules/spatiotemporal/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/spatiotemporal/README.md b/modules/spatiotemporal/README.md index 2e234dc8c7..929a0c0368 100644 --- a/modules/spatiotemporal/README.md +++ b/modules/spatiotemporal/README.md @@ -272,7 +272,7 @@ Function used by draw_dag() to render the graph using graphviz. Takes a list of ### prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, template_topology='', template_dict={}, match_final_state=True) Function that reads in experimental stoicheometery data and calculates which compositions and location assignments should be sampled for spatiotemporal modeling, which are saved as config files. Optionally, a PMI topology file can be provided, in which case topology files for each composition and location assignment are also written. -The output is 3 types of files: 1. *_time.config - configuration files, which list the proteins included at each time point for each model 2. time.txt - protein copy number files. Each row is a protein copy number state and each column is the protein copy number in that state. Note that each protein copy number state can result in multiple location assignments. 3. *_-_time-topol.txt - topology files for each copy number and location assignment. +The output is 3 types of files: 1. *_time.config - configuration files, which list the proteins included at each time point for each model 2. time.txt - protein copy number files. Each row is a protein copy number state and each column is the protein copy number in that state. Note that each protein copy number state can result in multiple location assignments. 3. *_time_topol.txt - topology files for each copy number and location assignment. times - list of strings, the times at which the stoicheometery data should be read. From d4090459c912cb80c79d1dc50fa26d4aa61fbc88 Mon Sep 17 00:00:00 2001 From: Andrew Latham Date: Sun, 23 Jun 2024 06:15:54 -0700 Subject: [PATCH 3/5] Updated code for preparing protein library --- modules/spatiotemporal/README.md | 2 ++ .../pyext/src/prepare_protein_library.py | 31 ++++++++++++------ .../spatiotemporal/test/test_make_graph.py | 32 +++++++++++++++++-- 3 files changed, 53 insertions(+), 12 deletions(-) diff --git a/modules/spatiotemporal/README.md b/modules/spatiotemporal/README.md index 929a0c0368..6734c40acf 100644 --- a/modules/spatiotemporal/README.md +++ b/modules/spatiotemporal/README.md @@ -282,6 +282,8 @@ expected_subcomplexes - list of all possible subcomplex strings in the model. Sh nmodels - int, number of models with different protein copy numbers to generate at each time point. +output_dir - string, directory where the output will be written. Empty string assumes the current working directory. (default: '') + template_topology: string, name of the topology file for the complete complex (default: '', no topology files are output) template_dict: dictionary for connecting the spatiotemporal model to the topology file. The keys (string) are the names of the proteins, defined by the expected_complexes variable. The values (list) are the names of all proteins in the topology file that should have the same copy number as the labeled protein, specifically the "molecule_name." (default: {}, no topology files are output) diff --git a/modules/spatiotemporal/pyext/src/prepare_protein_library.py b/modules/spatiotemporal/pyext/src/prepare_protein_library.py index 6e8928c400..d95f92d839 100644 --- a/modules/spatiotemporal/pyext/src/prepare_protein_library.py +++ b/modules/spatiotemporal/pyext/src/prepare_protein_library.py @@ -5,8 +5,9 @@ import itertools import pandas as pd from IMP.spatiotemporal import composition_scoring +import os -def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, template_topology='', template_dict={}, match_final_state=True): +def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, output_dir='', template_topology='', template_dict={}, match_final_state=True): """ Function that reads in experimental stoicheometery data and calculates which compositions and location assignments should be sampled for spatiotemporal modeling, which are saved as config files. Optionally, a PMI @@ -34,6 +35,8 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, in the model. Should be a list without duplicates of all components in the subcomplex configuration files. @param nmodels: int, number of models with different protein copy numbers to generate at each time point. + @param output_dir: string, directory where the output will be written. + Empty string assumes the current working directory. @param template_topology: string, name of the topology file for the complete complex. (default: '', no topology files are output) @param template_dict: dictionary for connecting the spatiotemporal model to the topology file. @@ -50,19 +53,30 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, raise TypeError("times should be of type list") if not isinstance(exp_comp_map, dict): raise TypeError("times should be of type dict") + if not isinstance(expected_subcomplexes, list): + raise TypeError("nmodels should be of type list") if not isinstance(nmodels, int): raise TypeError("nmodels should be of type int") + if not isinstance(output_dir, str): + raise TypeError("output_dir should be of type str") if not isinstance(template_topology, str): raise TypeError("template_topology should be of type str") if not isinstance(template_dict, dict): raise TypeError("template_dict should be of type dict") if not isinstance(match_final_state, bool): raise TypeError("match_final_state should be of type bool") + # make output_dir if necessary + if len(output_dir) > 0: + if os.path.exists(output_dir): + os.chdir(output_dir) + else: + os.mkdir(output_dir) + os.chdir(output_dir) # Whether or not topology files should be written include_topology = False # calculate final copy numbers based on the expected complexes - final_CN=np.zeros(len(exp_comp.keys()),dtype=int) - for i, key in enumerate(exp_comp.keys()): + final_CN=np.zeros(len(exp_comp_map.keys()),dtype=int) + for i, key in enumerate(exp_comp_map.keys()): for subcomplex in expected_subcomplexes: if key in subcomplex: final_CN[i] += 1 @@ -95,9 +109,6 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, for state in all_library: unnormalized_weights.append(composition_scoring.calc_likelihood_state(exp_comp_map,time,state)) unw = np.array(unnormalized_weights) - print(time) - print(all_library) - print(unw) # get top scoring nmodels mindx = np.argsort(unw)[0:nmodels] # write out library with the top scoring models @@ -141,13 +152,13 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, # write top "scoring" compositions to file oary = np.array(olist, dtype=int) header='' - for prot_name in exp_comp.keys(): + for prot_name in exp_comp_map.keys(): header=header+str(prot_name)+'\t\t\t\t' - np.savetxt( time + ".txt", oary,header=header) + np.savetxt(time + ".txt", oary, header=header) # write protein config library to file for indx,prot_list in enumerate(state_list): - with open(str(indx+1) + "_" + time + ".config", "w") as fh: + with open(str(indx + 1) + "_" + time + ".config", "w") as fh: for prot in prot_list: fh.write(prot +"\n") @@ -164,7 +175,7 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, else: raise Exception("Protein " + prot + ' does not exist in template_dict\nClosing...') # open new topology file - with open(str(indx+1) + "_" + time + "_topol.txt", "w") as fh: + with open(str(indx + 1) + "_" + time + "_topol.txt", "w") as fh: old=open(template_topology,'r') line=old.readline() while line: diff --git a/modules/spatiotemporal/test/test_make_graph.py b/modules/spatiotemporal/test/test_make_graph.py index 44148e33f1..f98f76ad37 100644 --- a/modules/spatiotemporal/test/test_make_graph.py +++ b/modules/spatiotemporal/test/test_make_graph.py @@ -6,6 +6,7 @@ import IMP.test import IMP.spatiotemporal as spatiotemporal import IMP.spatiotemporal.graphNode as graphNode +import IMP.spatiotemporal.prepare_protein_library as prepare_protein_library import shutil import os import sys @@ -17,14 +18,41 @@ def setup_system(): Function to set up initial variables """ # Input variables. - dict = {'0min': 2, '5min': 3, '10min': 2} + time_dict = {'0min': 2, '5min': 3, '10min': 2} subcomplexes = ['A1', 'A2', 'B1', 'B2'] # exp_comp_map is a dictionary that describes protein stoicheometery. The key describes the protein, which should correspond to names within the expected_subcomplexes. For each of these proteins, a csv file should be provided with protein copy number data exp_comp = {'A': 'exp_comp_A.csv', 'B': 'exp_comp_B.csv'} - return dict, subcomplexes, exp_comp + return time_dict, subcomplexes, exp_comp class Tests(IMP.test.TestCase): + def test_prepare_protein_library(self): + """ + Test setting up a preparing a protein library for spatiotemporal library + """ + # set input dir + state_dict, expected_subcomplexes, exp_comp_map = setup_system() + with IMP.test.temporary_directory() as tmpdir: + input = os.path.join(tmpdir, 'data/') + shutil.copytree(self.get_input_file_name('data/'), input) + # set output dir + output = self.get_tmp_file_name('output') + # run code + exp_comp_map = {'A': input+'exp_comp_A.csv', 'B': input+'exp_comp_B.csv'} + prepare_protein_library.prepare_protein_library(list(state_dict.keys()), exp_comp_map, expected_subcomplexes, 2, output_dir=output) + # check copy numbers + CN_0min=np.loadtxt(output+'/0min.txt') + self.assertAlmostEqual(np.sum(CN_0min[0][:]), 1.0, delta=1e-4) + self.assertAlmostEqual(CN_0min[0][0], 1.0, delta=1e-4) + # check configuration file + check_config=open(output+'/4_0min.config','r') + line1=check_config.readline() + line2=check_config.readline() + check_config.close() + self.assertEqual(line1[0:2], 'A1') + self.assertEqual(line2[0:2], 'B2') + + def test_graph_setup(self): """ Test setting up a graph. Tests functionality of graphNode.py From 3c9301b24b53eb0eeaa6827229191c3d01e89dfe Mon Sep 17 00:00:00 2001 From: Andrew Latham Date: Sun, 23 Jun 2024 06:43:08 -0700 Subject: [PATCH 4/5] Update composition_scoring.py --- modules/spatiotemporal/pyext/src/composition_scoring.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/spatiotemporal/pyext/src/composition_scoring.py b/modules/spatiotemporal/pyext/src/composition_scoring.py index cca5c2025b..cd31d17f66 100644 --- a/modules/spatiotemporal/pyext/src/composition_scoring.py +++ b/modules/spatiotemporal/pyext/src/composition_scoring.py @@ -139,9 +139,10 @@ def calc_likelihood_state(exp_comp_map, t, state): @param t: string, time at which the composition likelihood should be calculated. Should match one a possible value in the first column of the exp_comp_map. - @param state: list of integers, an array of the number of protein copy numbers - for which the likelihood will be calculated. This array should list - the proteins in the same order as the exp_comp_map. + @param state: list of integers, an array of the number of protein copy + numbers for which the likelihood will be calculated. + This array should list the proteins in the same order as + the exp_comp_map. @return weight: float, the weight of the graphNode according to the composition likelihood function. """ From c3905e7bf534c82be8dfc2e3cdfb5e64d636d98b Mon Sep 17 00:00:00 2001 From: Andrew Latham Date: Tue, 25 Jun 2024 09:16:34 -0700 Subject: [PATCH 5/5] Fixed Ben's comments and flake8 --- .../pyext/src/composition_scoring.py | 22 +-- .../pyext/src/prepare_protein_library.py | 150 +++++++++++------- 2 files changed, 102 insertions(+), 70 deletions(-) diff --git a/modules/spatiotemporal/pyext/src/composition_scoring.py b/modules/spatiotemporal/pyext/src/composition_scoring.py index cd31d17f66..a3d318c53d 100644 --- a/modules/spatiotemporal/pyext/src/composition_scoring.py +++ b/modules/spatiotemporal/pyext/src/composition_scoring.py @@ -100,7 +100,7 @@ def calc_likelihood(exp_comp_map, nodes): if os.path.exists(exp_comp_map[prot]): exp = pd.read_csv(exp_comp_map[prot]) else: - raise Exception( + raise FileNotFoundError( "Error!!! Check exp_comp_map. Unable to find composition " "file: " + exp_comp_map[prot] + '\nClosing...') for i in range(len(exp)): @@ -116,6 +116,7 @@ def calc_likelihood(exp_comp_map, nodes): node.add_score(float(weight)) return nodes + def calc_likelihood_state(exp_comp_map, t, state): """ Function that adds a score for the compositional likelihood for all @@ -143,8 +144,8 @@ def calc_likelihood_state(exp_comp_map, t, state): numbers for which the likelihood will be calculated. This array should list the proteins in the same order as the exp_comp_map. - @return weight: float, the weight of the graphNode according to the composition - likelihood function. + @return weight: float, the weight of the graphNode according to the + composition likelihood function. """ import pandas as pd # Data is stored as a dictionary of dictionaries. The first dictionary @@ -153,17 +154,17 @@ def calc_likelihood_state(exp_comp_map, t, state): # is the mean or standard deviation of the protein copy number mean = {} std = {} - state_cn={} - count=0 + state_cn = {} + count = 0 # import csv file as pandas data frame for prot in exp_comp_map.keys(): prot_dict_mean = {} prot_dict_std = {} - state_cn[prot]=state[count] + state_cn[prot] = state[count] if os.path.exists(exp_comp_map[prot]): exp = pd.read_csv(exp_comp_map[prot]) else: - raise Exception( + raise FileNotFoundError( "Error!!! Check exp_comp_map. Unable to find composition " "file: " + exp_comp_map[prot] + '\nClosing...') for i in range(len(exp)): @@ -173,7 +174,7 @@ def calc_likelihood_state(exp_comp_map, t, state): std[prot] = prot_dict_std count += 1 # compute the compositional likelihood of the nodes - weight=0 + weight = 0 for prot in exp_comp_map.keys(): # x counts the number of proteins of a given type in the node x = state_cn[prot] @@ -185,5 +186,6 @@ def calc_likelihood_state(exp_comp_map, t, state): 'WARNING!!! Standard deviation of protein ' + prot + ' 0 or less at time ' + t + '. May lead to illogical results.') - weight += (0.5 * ((x - mean[prot][t]) / std[prot][t]) ** 2 + np.log(std[prot][t] * np.sqrt(2 * np.pi))) - return weight \ No newline at end of file + weight += (0.5 * ((x - mean[prot][t]) / std[prot][t]) ** 2 + + np.log(std[prot][t] * np.sqrt(2 * np.pi))) + return weight diff --git a/modules/spatiotemporal/pyext/src/prepare_protein_library.py b/modules/spatiotemporal/pyext/src/prepare_protein_library.py index d95f92d839..065c62e721 100644 --- a/modules/spatiotemporal/pyext/src/prepare_protein_library.py +++ b/modules/spatiotemporal/pyext/src/prepare_protein_library.py @@ -3,25 +3,34 @@ """ import numpy as np import itertools -import pandas as pd from IMP.spatiotemporal import composition_scoring import os -def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, output_dir='', template_topology='', template_dict={}, match_final_state=True): + +def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, + nmodels, output_dir='', template_topology='', + template_dict={}, match_final_state=True): """ - Function that reads in experimental stoicheometery data and calculates which compositions and location - assignments should be sampled for spatiotemporal modeling, which are saved as config files. Optionally, a PMI - topology file can be provided, in which case topology files for each composition and location assignment are - also written. + Function that reads in experimental stoicheometery data and calculates + which compositions and location assignments should be sampled for + spatiotemporal modeling, which are saved as config files. Optionally, + a PMI topology file can be provided, in which case topology files + for each composition and location assignment are also written. The output is 3 types of files: - 1. *_time.config - configuration files, which list the proteins included at each time point for each model - 2. time.txt - protein copy number files. Each row is a protein copy number state and each column is the - protein copy number in that state. Note that each protein copy number state can result in multiple + 1. *_time.config - configuration files, which list the proteins + included at each time point for each model + 2. time.txt - protein copy number files. Each row is a protein + copy number state and each column is the + protein copy number in that state. Note that each + protein copy number state can result in multiple location assignments. - 3. *_time_topol.txt - topology files for each copy number and location assignment. + 3. *_time_topol.txt - topology files for each copy number + and location assignment. - @param times: list of strings, the times at which the stoicheometery data should be read. - @param exp_comp_map: dictionary, which describes protein stoicheometery. + @param times: list of strings, the times at which the stoicheometery + data should be read. + @param exp_comp_map: dictionary, which describes protein + stoicheometery. The key describes the protein, which should correspond to names within the expected_subcomplexes. Only copy numbers for proteins or subcomplexes included in this dictionary will be scored. For @@ -34,18 +43,24 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, @param expected_subcomplexes: list of all possible subcomplex strings in the model. Should be a list without duplicates of all components in the subcomplex configuration files. - @param nmodels: int, number of models with different protein copy numbers to generate at each time point. + @param nmodels: int, number of models with different protein copy + numbers to generate at each time point. @param output_dir: string, directory where the output will be written. Empty string assumes the current working directory. - @param template_topology: string, name of the topology file for the complete complex. + @param template_topology: string, name of the topology file + for the complete complex. (default: '', no topology files are output) - @param template_dict: dictionary for connecting the spatiotemporal model to the topology file. - The keys (string) are the names of the proteins, defined by the expected_complexes variable. - The values (list) are the names of all proteins in the topology file that should have the same copy number - as the labeled protein, specifically the "molecule_name." + @param template_dict: dictionary for connecting the spatiotemporal + model to the topology file. The keys (string) are the names of + the proteins, defined by the expected_complexes variable. + The values (list) are the names of all proteins in the topology + file that should have the same copy number + as the labeled protein, specifically the "molecule_name." (default: {}, no topology files are output) - @param match_final_state: Boolean, determines whether to fix the final state to the state defined by - expected_subcomplexes. True enforces this match and thus ensures that the final time has only one state. + @param match_final_state: Boolean, determines whether to fix the + final state to the state defined by expected_subcomplexes. + True enforces this match and thus ensures that the final + time has only one state. (default: True) """ # Assert that all inputs are the correct variable type @@ -68,27 +83,28 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, # make output_dir if necessary if len(output_dir) > 0: if os.path.exists(output_dir): - os.chdir(output_dir) + pass else: os.mkdir(output_dir) - os.chdir(output_dir) + os.chdir(output_dir) # Whether or not topology files should be written include_topology = False # calculate final copy numbers based on the expected complexes - final_CN=np.zeros(len(exp_comp_map.keys()),dtype=int) + final_CN = np.zeros(len(exp_comp_map.keys()), dtype=int) for i, key in enumerate(exp_comp_map.keys()): for subcomplex in expected_subcomplexes: if key in subcomplex: final_CN[i] += 1 # Enumerate all possible compositions (independent of time) # start by computing the range for each protein copy number - copy_num=[] + copy_num = [] for CN in final_CN: copy_num.append(range(CN+1)) - # Convert the range of protein copy numbers to all possible combinations of states - all_library = [l for l in itertools.product(*copy_num)] + # Convert the range of protein copy numbers to all possible + # combinations of states + all_library = [iterable for iterable in itertools.product(*copy_num)] # remove empty state - empty_state=[] + empty_state = [] for i in range(len(final_CN)): empty_state.append(0) all_library.pop(all_library.index(tuple(empty_state))) @@ -96,18 +112,22 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, # compare the complete composition library against the composition model # at each time and extract top scoring compositions for time in times: - # if match_final_state is True, restrict the final state to match the known copy numbers - if time==times[len(times)-1] and match_final_state: - olist=[list(final_CN)] - state_list=[] + # if match_final_state is True, restrict the final state to match + # the known copy numbers + if time == times[len(times)-1] and match_final_state: + olist = [list(final_CN)] + state_list = [] # Keep all expected complexes state_list.append(expected_subcomplexes) - # otherwise, calculate the nmodels most likely copy numbers for each time point + # otherwise, calculate the nmodels most likely copy numbers + # for each time point else: # caclulate the weights for each state - unnormalized_weights =[] + unnormalized_weights = [] for state in all_library: - unnormalized_weights.append(composition_scoring.calc_likelihood_state(exp_comp_map,time,state)) + unnormalized_weights.append( + composition_scoring.calc_likelihood_state( + exp_comp_map, time, state)) unw = np.array(unnormalized_weights) # get top scoring nmodels mindx = np.argsort(unw)[0:nmodels] @@ -120,10 +140,10 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, olist.append(list(state)) # Loops over each copy number state for state in olist: - cn_list=[] + cn_list = [] # Loops over each protein for i, cn in enumerate(state): - sub_list=[] + sub_list = [] # initiate found_subcomplex found_subcomplex = [] for subcomplex in expected_subcomplexes: @@ -131,18 +151,21 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, if list(exp_comp_map.keys())[i] in subcomplex: found_subcomplex.append(subcomplex) # find proteins using combinatorics - prots=list(itertools.combinations(found_subcomplex, state[i])) + prots = list(itertools.combinations(found_subcomplex, + state[i])) for prot in prots: sub_list.append(prot) - if len(sub_list[0])>0: + if len(sub_list[0]) > 0: cn_list.append(sub_list) # combine the list of copy numbers - all_cn = [l for l in itertools.product(*cn_list)] - # format the copy number list from a list (all copy number combinations) of tuples (copy number of 1 - # state) of tuples (name of 1 protein) to a list (all copies) of lists (all proteins) + all_cn = [iterable for iterable in itertools.product(*cn_list)] + # format the copy number list from a list (all copy number + # combinations) of tuples (copy number of 1 + # state) of tuples (name of 1 protein) to a list + # (all copies) of lists (all proteins) # loop over all cn for cn in all_cn: - cn_list2=[] + cn_list2 = [] # select each copy number for n in cn: # append each protein in each copy number to list @@ -151,50 +174,57 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, state_list.append(cn_list2) # write top "scoring" compositions to file oary = np.array(olist, dtype=int) - header='' + header = '' for prot_name in exp_comp_map.keys(): - header=header+str(prot_name)+'\t\t\t\t' + header = header+str(prot_name)+'\t\t\t\t' np.savetxt(time + ".txt", oary, header=header) # write protein config library to file - for indx,prot_list in enumerate(state_list): + for indx, prot_list in enumerate(state_list): with open(str(indx + 1) + "_" + time + ".config", "w") as fh: for prot in prot_list: - fh.write(prot +"\n") + fh.write(prot + "\n") - if len(template_topology)>0: - include_topology=True + if len(template_topology) > 0: + include_topology = True # write topology file for each state - for indx,prot_list in enumerate(state_list): + for indx, prot_list in enumerate(state_list): # Names of proteins to keep, according to template file - keep_prots=[] - # loop over each entry of prot_list and convert to the template file name + keep_prots = [] + # loop over each entry of prot_list and + # convert to the template file name for prot in prot_list: if prot in template_dict.keys(): keep_prots.extend(template_dict[prot]) else: - raise Exception("Protein " + prot + ' does not exist in template_dict\nClosing...') + raise KeyError("Protein " + prot + + ' does not exist in template_dict' + '\nClosing...') # open new topology file - with open(str(indx + 1) + "_" + time + "_topol.txt", "w") as fh: - old=open(template_topology,'r') - line=old.readline() + with open(str(indx + 1) + "_" + time + + "_topol.txt", "w") as fh: + old = open(template_topology, 'r') + line = old.readline() while line: line_split = line.split('|') # keep blank lines - if len(line_split)<2: + if len(line_split) < 2: fh.write(line) else: - # Keep lines where the protein name is in the template_dict + # Keep lines where the protein name + # is in the template_dict if line_split[1] in keep_prots: fh.write(line) # Keep instruction line elif line_split[1] == 'molecule_name ': fh.write(line) - line=old.readline() + line = old.readline() old.close() fh.close() if include_topology: - print('Successfully calculated the most likely configurations, and saved them to configuration and topology ' + print('Successfully calculated the most likely configurations,' + ' and saved them to configuration and topology ' 'files.') else: - print('Successfully calculated the most likely configurations, and saved them to configuration files.') \ No newline at end of file + print('Successfully calculated the most likely configurations,' + ' and saved them to configuration files.')