Merge pull request #1096 from alatham13/develop

Added functionality to spatiotemporal
salilab · Jun 25, 2024 · f0e76b4 · f0e76b4
2 parents 8b7eb21 + c3905e7
commit f0e76b4
Show file tree

Hide file tree

Showing 4 changed files with 366 additions and 3 deletions.
diff --git a/modules/spatiotemporal/README.md b/modules/spatiotemporal/README.md
@@ -147,6 +147,17 @@ prot - string, protein or subcomplex we are interested in finding
 
 subcomplex_components - subcomplexes or components in a given node, which can be accessed by graphNode.get_subcomplex_components()
 
+### calc_likelihood_state(exp_comp_map, t, state)
+Function that calculates the likelihood of an individual state, used by prepare_protein_library. Returns a single float, the weight of the protein copy number based on a Gaussian likelihood function.
+
+exp_comp_map - dictionary, which describes protein stoichiometry. The key describes the protein, which should correspond to names within the expected_subcomplexes. Only copy numbers for proteins
+or subcomplexes included in this dictionary will be scored. For each of these proteins, a csv file should be provided with protein copy number data. The csv file should have 3 columns,
+1) "Time", which matches up to the possible times in the graph, 2) "mean", the average protein copy number at that time point from experiment, and 3) "std", the standard deviation of that protein copy number from experiment.
+
+t - string, time at which the composition likelihood should be calculated. Should match one of the possible values in the "Time" column of the csv files listed in exp_comp_map.
+
+state - list of integers, an array of the number of protein copy numbers for which the likelihood will be calculated. This array should list the proteins in the same order as the exp_comp_map.
+
 ## score_graph.py
 
 ### score_graph(nodes,keys):
@@ -257,4 +268,24 @@ width - string, width of each node on the dag
 ### draw_dag_in_graphviz(nodes, coloring=None, draw_label=True, fontname="Helvetica", fontsize="18", penscale=0.6, arrowsize=1.2, height="0.6",width="0.6"):
 Function used by draw_dag() to render the graph using graphviz. Takes a list of graphNodes (nodes) and initializes the nodes and edges. Coloring is expected to be a list of RGBA strings specifying how to color each node. Expected to be same length as nodes.
 
+## prepare_protein_library.py
+
+### prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, output_dir='', template_topology='', template_dict={}, match_final_state=True)
+Function that reads in experimental stoichiometry data and calculates which compositions and location assignments should be sampled for spatiotemporal modeling, which are saved as config files. Optionally, a PMI topology file can be provided, in which case topology files for each composition and location assignment are also written.
+The output is 3 types of files: 1. *_time.config - configuration files, which list the proteins included at each time point for each model 2. time.txt - protein copy number files. Each row is a protein copy number state and each column is the protein copy number in that state. Note that each protein copy number state can result in multiple location assignments. 3. *_time_topol.txt - topology files for each copy number and location assignment.
+
+times - list of strings, the times at which the stoichiometry data should be read.
+
+exp_comp_map - dictionary, which describes protein stoichiometry. The key describes the protein, which should correspond to names within the expected_subcomplexes. Only copy numbers for proteins or subcomplexes included in this dictionary will be scored. For each of these proteins, a csv file should be provided with protein copy number data. The csv file should have 3 columns, 1) "Time", which matches up to the possible times in the graph, 2) "mean", the average protein copy number at that time point from experiment, and 3) "std", the standard deviation of that protein copy number from experiment.
+
+expected_subcomplexes - list of all possible subcomplex strings in the model. Should be a list without duplicates of all components in the subcomplex configuration files.
+
+nmodels - int, number of models with different protein copy numbers to generate at each time point.
+
+output_dir - string, directory where the output will be written. Empty string assumes the current working directory. (default: '')
+
+template_topology: string, name of the topology file for the complete complex (default: '', no topology files are output)
+
+template_dict: dictionary for connecting the spatiotemporal model to the topology file. The keys (string) are the names of the proteins, defined by the expected_subcomplexes variable. The values (list) are the names of all proteins in the topology file that should have the same copy number as the labeled protein, specifically the "molecule_name." (default: {}, no topology files are output)
 
+match_final_state: Boolean, determines whether to fix the final state to the state defined by expected_subcomplexes. True enforces this match and thus ensures that the final time has only one state. (default: True)
diff --git a/modules/spatiotemporal/pyext/src/composition_scoring.py b/modules/spatiotemporal/pyext/src/composition_scoring.py
@@ -100,7 +100,7 @@ def calc_likelihood(exp_comp_map, nodes):
         if os.path.exists(exp_comp_map[prot]):
             exp = pd.read_csv(exp_comp_map[prot])
         else:
-            raise Exception(
+            raise FileNotFoundError(
                 "Error!!! Check exp_comp_map. Unable to find composition "
                 "file: " + exp_comp_map[prot] + '\nClosing...')
         for i in range(len(exp)):
@@ -115,3 +115,77 @@ def calc_likelihood(exp_comp_map, nodes):
         # add state weight to node
         node.add_score(float(weight))
     return nodes
+
+
+def calc_likelihood_state(exp_comp_map, t, state):
+    """
+    Function that calculates the compositional likelihood score of a
+    single copy number state at one time point, similar to how
+    calc_likelihood scores the composition of a node. Used by
+    prepare_protein_library. The score assumes a Gaussian distribution
+    for the copy number of each protein or subcomplex, with means and
+    standard deviations read from the experimental csv files. Returns
+    the summed negative log-likelihood; lower values are more likely.
+
+    @param exp_comp_map: dictionary, which describes protein stoichiometry.
+           The key describes the protein, which should correspond to names
+           within the expected_subcomplexes. Only copy numbers for proteins
+           or subcomplexes included in this dictionary will be scored. For
+           each of these proteins, a csv file should be provided with protein
+           copy number data. The csv file should have 3 columns,
+           1) "Time", which matches up to the possible times in the graph,
+           2) "mean", the average protein copy number at that time point
+           from experiment, and 3) "std", the standard deviation of that
+           protein copy number from experiment.
+    @param t: string, time at which the composition likelihood should be
+            calculated. Must match a "Time" value in each csv file.
+    @param state: list of integers, an array of the number of protein copy
+            numbers for which the likelihood will be calculated.
+            This array should list the proteins in the same order as
+            the exp_comp_map.
+    @return weight: float, the summed Gaussian negative log-likelihood of
+            the state over all proteins in exp_comp_map.
+    @exception FileNotFoundError: if a csv file in exp_comp_map is missing.
+    """
+    import pandas as pd
+    # Data is stored as a dictionary of dictionaries. The first dictionary
+    # is keyed by protein name.
+    # The 2nd dictionary is keyed by time. The value stored
+    # is the mean or standard deviation of the protein copy number
+    mean = {}
+    std = {}
+    state_cn = {}
+    count = 0
+    # import csv file as pandas data frame
+    for prot in exp_comp_map.keys():
+        prot_dict_mean = {}
+        prot_dict_std = {}
+        state_cn[prot] = state[count]
+        if os.path.exists(exp_comp_map[prot]):
+            exp = pd.read_csv(exp_comp_map[prot])
+        else:
+            raise FileNotFoundError(
+                "Error!!! Check exp_comp_map. Unable to find composition "
+                "file: " + exp_comp_map[prot] + '\nClosing...')
+        for i in range(len(exp)):
+            prot_dict_mean[exp['Time'][i]] = exp['mean'][i]
+            prot_dict_std[exp['Time'][i]] = exp['std'][i]
+        mean[prot] = prot_dict_mean
+        std[prot] = prot_dict_std
+        count += 1
+    # compute the compositional likelihood of the state
+    weight = 0
+    for prot in exp_comp_map.keys():
+        # x is the copy number of this protein in the state
+        x = state_cn[prot]
+        # check std is greater than 0; otherwise only warn and continue
+        if std[prot][t] > 0:
+            pass
+        else:
+            warnings.warn(
+                'WARNING!!! Standard deviation of protein ' + prot
+                + ' 0 or less at time ' + t
+                + '. May lead to illogical results.')
+        weight += (0.5 * ((x - mean[prot][t]) / std[prot][t]) ** 2 +
+                   np.log(std[prot][t] * np.sqrt(2 * np.pi)))
+    return weight
diff --git a/modules/spatiotemporal/pyext/src/prepare_protein_library.py b/modules/spatiotemporal/pyext/src/prepare_protein_library.py
@@ -0,0 +1,230 @@
+"""@namespace IMP.spatiotemporal.prepare_protein_library
+   Function for preparing spatiotemporal models for sampling.
+"""
+import numpy as np
+import itertools
+from IMP.spatiotemporal import composition_scoring
+import os
+
+
+def prepare_protein_library(times, exp_comp_map, expected_subcomplexes,
+                            nmodels, output_dir='', template_topology='',
+                            template_dict={}, match_final_state=True):
+    """
+        Function that reads in experimental stoichiometry data and calculates
+        which compositions and location assignments should be sampled for
+        spatiotemporal modeling, which are saved as config files. Optionally,
+        a PMI topology file can be provided, in which case topology files
+        for each composition and location assignment are also written.
+        The output is 3 types of files:
+        1. *_time.config - configuration files, which list the proteins
+            included at each time point for each model
+        2. time.txt - protein copy number files. Each row is a protein
+            copy number state and each column is the
+            protein copy number in that state. Note that each
+            protein copy number state can result in multiple
+            location assignments.
+        3. *_time_topol.txt - topology files for each copy number
+            and location assignment.
+
+        @param times: list of strings, the times at which the stoichiometry
+           data should be read.
+        @param exp_comp_map: dictionary, which describes protein
+           stoichiometry.
+           The key describes the protein, which should correspond to names
+           within the expected_subcomplexes. Only copy numbers for proteins
+           or subcomplexes included in this dictionary will be scored. For
+           each of these proteins, a csv file should be provided with protein
+           copy number data. The csv file should have 3 columns,
+           1) "Time", which matches up to the possible times in the graph,
+           2) "mean", the average protein copy number at that time point
+           from experiment, and 3) "std", the standard deviation of that
+           protein copy number from experiment.
+        @param expected_subcomplexes: list of all possible subcomplex strings
+           in the model. Should be a list without duplicates of
+           all components in the subcomplex configuration files.
+        @param nmodels: int, number of models with different protein copy
+           numbers to generate at each time point.
+        @param output_dir: string, directory where the output will be written.
+           Empty string assumes the current working directory.
+        @param template_topology: string, name of the topology file
+            for the complete complex.
+            (default: '', no topology files are output)
+        @param template_dict: dictionary for connecting the spatiotemporal
+            model to the topology file. The keys (string) are the names of
+            the proteins, defined by the expected_subcomplexes variable.
+            The values (list) are the names of all proteins in the topology
+            file that should have the same copy number
+            as the labeled protein, specifically the "molecule_name."
+            (default: {}, no topology files are output)
+        @param match_final_state: Boolean, determines whether to fix the
+            final state to the state defined by expected_subcomplexes.
+            True enforces this match and thus ensures that the final
+            time has only one state.
+            (default: True)
+        """
+    # Validate all input types up front; a mismatch raises TypeError
+    if not isinstance(times, list):
+        raise TypeError("times should be of type list")
+    if not isinstance(exp_comp_map, dict):
+        raise TypeError("exp_comp_map should be of type dict")
+    if not isinstance(expected_subcomplexes, list):
+        raise TypeError("expected_subcomplexes should be of type list")
+    if not isinstance(nmodels, int):
+        raise TypeError("nmodels should be of type int")
+    if not isinstance(output_dir, str):
+        raise TypeError("output_dir should be of type str")
+    if not isinstance(template_topology, str):
+        raise TypeError("template_topology should be of type str")
+    if not isinstance(template_dict, dict):
+        raise TypeError("template_dict should be of type dict")
+    if not isinstance(match_final_state, bool):
+        raise TypeError("match_final_state should be of type bool")
+    # make output_dir if necessary, then make it the working directory
+    if len(output_dir) > 0:
+        if os.path.exists(output_dir):
+            pass
+        else:
+            os.mkdir(output_dir)
+        os.chdir(output_dir)
+    # Whether or not topology files should be written
+    include_topology = False
+    # calculate final copy numbers based on the expected complexes
+    final_CN = np.zeros(len(exp_comp_map.keys()), dtype=int)
+    for i, key in enumerate(exp_comp_map.keys()):
+        for subcomplex in expected_subcomplexes:
+            if key in subcomplex:
+                final_CN[i] += 1
+    # Enumerate all possible compositions (independent of time)
+    # start by computing the range for each protein copy number
+    copy_num = []
+    for CN in final_CN:
+        copy_num.append(range(CN+1))
+    # Convert the range of protein copy numbers to all possible
+    # combinations of states
+    all_library = [iterable for iterable in itertools.product(*copy_num)]
+    # remove empty state
+    empty_state = []
+    for i in range(len(final_CN)):
+        empty_state.append(0)
+    all_library.pop(all_library.index(tuple(empty_state)))
+
+    # compare the complete composition library against the composition model
+    # at each time and extract top scoring compositions
+    for time in times:
+        # if match_final_state is True, restrict the final state to match
+        # the known copy numbers
+        if time == times[len(times)-1] and match_final_state:
+            olist = [list(final_CN)]
+            state_list = []
+            # Keep all expected complexes
+            state_list.append(expected_subcomplexes)
+        # otherwise, calculate the nmodels most likely copy numbers
+        # for each time point
+        else:
+            # calculate the weights for each state
+            unnormalized_weights = []
+            for state in all_library:
+                unnormalized_weights.append(
+                    composition_scoring.calc_likelihood_state(
+                        exp_comp_map, time, state))
+            unw = np.array(unnormalized_weights)
+            # get top scoring nmodels (the nmodels smallest weights)
+            mindx = np.argsort(unw)[0:nmodels]
+            # write out library with the top scoring models
+            olist = []
+            state_list = []
+            for m in mindx:
+                state = all_library[m]
+                # convert state counts to protein list
+                olist.append(list(state))
+            # Loops over each copy number state
+            for state in olist:
+                cn_list = []
+                # Loops over each protein
+                for i, cn in enumerate(state):
+                    sub_list = []
+                    # initiate found_subcomplex
+                    found_subcomplex = []
+                    for subcomplex in expected_subcomplexes:
+                        # See which subcomplexes this protein appears in
+                        if list(exp_comp_map.keys())[i] in subcomplex:
+                            found_subcomplex.append(subcomplex)
+                    # find proteins using combinatorics
+                    prots = list(itertools.combinations(found_subcomplex,
+                                                        state[i]))
+                    for prot in prots:
+                        sub_list.append(prot)
+                    if len(sub_list[0]) > 0:
+                        cn_list.append(sub_list)
+                # combine the list of copy numbers
+                all_cn = [iterable for iterable in itertools.product(*cn_list)]
+                # format the copy number list from a list (all copy number
+                # combinations) of tuples (copy number of 1
+                # state) of tuples (name of 1 protein) to a list
+                # (all copies) of lists (all proteins)
+                # loop over all cn
+                for cn in all_cn:
+                    cn_list2 = []
+                    # select each copy number
+                    for n in cn:
+                        # append each protein in each copy number to list
+                        for prot in n:
+                            cn_list2.append(prot)
+                    state_list.append(cn_list2)
+        # write top "scoring" compositions to file
+        oary = np.array(olist, dtype=int)
+        header = ''
+        for prot_name in exp_comp_map.keys():
+            header = header+str(prot_name)+'\t\t\t\t'
+        np.savetxt(time + ".txt", oary, header=header)
+
+        # write protein config library to file
+        for indx, prot_list in enumerate(state_list):
+            with open(str(indx + 1) + "_" + time + ".config", "w") as fh:
+                for prot in prot_list:
+                    fh.write(prot + "\n")
+
+        if len(template_topology) > 0:
+            include_topology = True
+            # write topology file for each state
+            for indx, prot_list in enumerate(state_list):
+                # Names of proteins to keep, according to template file
+                keep_prots = []
+                # loop over each entry of prot_list and
+                # convert to the template file name
+                for prot in prot_list:
+                    if prot in template_dict.keys():
+                        keep_prots.extend(template_dict[prot])
+                    else:
+                        raise KeyError("Protein " + prot
+                                       + ' does not exist in template_dict'
+                                         '\nClosing...')
+                # open new topology file
+                with open(str(indx + 1) + "_" + time +
+                          "_topol.txt", "w") as fh:
+                    old = open(template_topology, 'r')
+                    line = old.readline()
+                    while line:
+                        line_split = line.split('|')
+                        # keep blank lines
+                        if len(line_split) < 2:
+                            fh.write(line)
+                        else:
+                            # Keep lines where the protein name
+                            # is in the template_dict
+                            if line_split[1] in keep_prots:
+                                fh.write(line)
+                            # Keep instruction line
+                            elif line_split[1] == 'molecule_name ':
+                                fh.write(line)
+                        line = old.readline()
+                    old.close()
+                    fh.close()
+    if include_topology:
+        print('Successfully calculated the most likely configurations,'
+              ' and saved them to configuration and topology '
+              'files.')
+    else:
+        print('Successfully calculated the most likely configurations,'
+              ' and saved them to configuration files.')