From 06ec3154700ab7bfec863b83d45e35e69d685919 Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Fri, 16 Jul 2021 11:36:08 -0700 Subject: [PATCH 01/23] Added metrics.py file. --- pensa/comparison/metrics.py | 156 ++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 pensa/comparison/metrics.py diff --git a/pensa/comparison/metrics.py b/pensa/comparison/metrics.py new file mode 100644 index 00000000..d159781d --- /dev/null +++ b/pensa/comparison/metrics.py @@ -0,0 +1,156 @@ +import numpy as np +from pensa import * + + + """ + Calculates the average and maximum Jensen-Shannon distance and the Kullback-Leibler divergences for each feature from two ensembles. Each of four functions uses the relative_entropy_analysis function with the same parameters. + + Parameters + ---------- + features_a : list of str + Feature names of the first ensemble. + Can be obtained from features object via .describe(). + features_b : list of str + Feature names of the first ensemble. + Can be obtained from features object via .describe(). + Must be the same as features_a. Provided as a sanity check. + all_data_a : float array + Trajectory data from the first ensemble. Format: [frames,frame_data]. + all_data_b : float array + Trajectory data from the second ensemble. + For kld functions, the second ensemble should be the reference ensemble. + Format: [frames,frame_data]. + bin_width : float, default=None + Bin width for the axis to compare the distributions on. + If bin_width is None, bin_num (see below) bins are used and the width is determined from the common histogram. + bin_num : int, default=10 + Number of bins for the axis to compare the distributions on (only if bin_width=None). + verbose : bool, default=True + Print intermediate results. + override_name_check : bool, default=False + Only check number of features, not their names. + + Returns + ------- + Each function returns one value. + + average_jsd : float + Average Jensen-Shannon distance from two ensembles. + max_jsd : float + Maximum Jensen-Shannon distance from two ensembles. + average_kld : float + Average Kullback-Leibler divergence from two ensembles. + max_kld : float + Maximum Kullback-Leibler divergence from two ensembles. 
+ """ + +def average_jsd(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): + _, data_jsdist, _, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) + return np.mean(data_jsdist) + + +def max_jsd(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): + _, data_jsdist, _, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) + return np.max(data_jsdist) + + +def average_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): + _, _, data_kld_ab, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) + return np.mean(data_kld_ab) + + +def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): + _, _, data_kld_ab, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) + return np.max(data_kld_ab) + + + """ + Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. Each of two functions uses the kolmogorov_smirnov_analysis function with the same parameters. + + Parameters + ---------- + features_a : list of str + Feature names of the first ensemble. + Can be obtained from features object via .describe(). + features_b : list of str + Feature names of the first ensemble. + Can be obtained from features object via .describe(). + Must be the same as features_a. Provided as a sanity check. + all_data_a : float array + Trajectory data from the first ensemble. Format: [frames,frame_data]. + all_data_b : float array + Trajectory data from the second ensemble. Format: [frames,frame_data]. + verbose : bool, default=True + Print intermediate results. + override_name_check : bool, default=False + Only check number of features, not their names. + + Returns + ------- + Each function returns one value. + + average_kss : float + Average Kolmogorov-Smirnov statistic for two distributions. + max_kss : float + Maximum Kolmogorov-Smirnov statistic for two distributions. + """ + + +def average_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, data_kss, _ = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.mean(data_kss) + + +def max_kss(): + _, data_kss, _ = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.max(data_kss) + + + """ + Calculates average and maximum State Specific Information statistic for a feature across two ensembles. Each of two functions uses the ssi_ensemble_analysis function with the same parameters. + + Parameters + ---------- + features_a : list of str + Feature names of the first ensemble. + features_b : list of str + Feature names of the first ensemble. + Must be the same as features_a. Provided as a sanity check. + all_data_a : float array + Trajectory data from the first ensemble. 
Format: [frames,frame_data]. + all_data_b : float array + Trajectory data from the second ensemble. Format: [frames,frame_data]. + torsions : str + Torsion angles to use for SSI, including backbone - 'bb', and sidechain - 'sc'. + Default is None. + pocket_occupancy : bool, optional + Set to 'True' if the data input is pocket occupancy distribution. + The default is None. + pbc : bool, optional + If true, the apply periodic bounary corrections on angular distribution inputs. + The input for periodic correction must be radians. The default is True. + verbose : bool, default=True + Print intermediate results. + write_plots : bool, optional + If true, visualise the states over the raw distribution. The default is None. + override_name_check : bool, default=False + Only check number of features, not their names. + + Returns + ------- + Each function returns one value. + + average_ssi : float + Average of State Specific Information for a feature across two ensembles. + max_ssi : float + Maximum of State Specific Information for a feature across two ensembles. + """ + +def average_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbc=True, verbose=True, write_plots=None, override_name_check=False): + _, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check) + return np.mean(data_ssi) + + +def max_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbs=True, verbose=True, write_plots=None, override_name_check=False): + _, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check) + return np.max(data_ssi) From a8b191ae2d3ba3027bbb5a117b8aa86af22e4c69 Mon Sep 17 00:00:00 2001 From: Jasper J McAvity Date: Tue, 20 Jul 2021 22:05:13 -0700 Subject: [PATCH 02/23] Added metrics.py to __init__.py and added ksp functions to metrics.py. --- pensa/comparison/__init__.py | 2 +- pensa/comparison/metrics.py | 29 +++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/pensa/comparison/__init__.py b/pensa/comparison/__init__.py index a4b0802c..79e6e2b7 100644 --- a/pensa/comparison/__init__.py +++ b/pensa/comparison/__init__.py @@ -2,4 +2,4 @@ from .relative_entropy import * from .statespecific import * from .visualization import * - +from .metrics import * diff --git a/pensa/comparison/metrics.py b/pensa/comparison/metrics.py index d159781d..b6d7ad3e 100644 --- a/pensa/comparison/metrics.py +++ b/pensa/comparison/metrics.py @@ -1,8 +1,9 @@ import numpy as np from pensa import * +from pensa.comparison import * - """ +""" Calculates the average and maximum Jensen-Shannon distance and the Kullback-Leibler divergences for each feature from two ensembles. Each of four functions uses the relative_entropy_analysis function with the same parameters. Parameters @@ -42,7 +43,7 @@ Average Kullback-Leibler divergence from two ensembles. max_kld : float Maximum Kullback-Leibler divergence from two ensembles. 
- """ +""" def average_jsd(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): _, data_jsdist, _, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) @@ -64,8 +65,8 @@ def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_ return np.max(data_kld_ab) - """ - Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. Each of two functions uses the kolmogorov_smirnov_analysis function with the same parameters. +""" + Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. Each of four functions uses the kolmogorov_smirnov_analysis function with the same parameters. Parameters ---------- @@ -93,7 +94,11 @@ def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_ Average Kolmogorov-Smirnov statistic for two distributions. max_kss : float Maximum Kolmogorov-Smirnov statistic for two distributions. - """ + average_ksp : float + Average Kolmogorov-Smirnov p-value for two distributions. + max_ksp : float + Maximum Kolmogorov-Smirnov statistic for two distributions. +""" def average_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): @@ -101,12 +106,20 @@ def average_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, ov return np.mean(data_kss) -def max_kss(): +def max_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): _, data_kss, _ = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) return np.max(data_kss) +def average_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.mean(data_ksp) + + +def max_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.max(data_ksp) - """ +""" Calculates average and maximum State Specific Information statistic for a feature across two ensembles. Each of two functions uses the ssi_ensemble_analysis function with the same parameters. Parameters @@ -144,7 +157,7 @@ def max_kss(): Average of State Specific Information for a feature across two ensembles. max_ssi : float Maximum of State Specific Information for a feature across two ensembles. - """ +""" def average_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbc=True, verbose=True, write_plots=None, override_name_check=False): _, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check) From 72c5bda763db4a395441b326f21e4e16e490595e Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Tue, 20 Jul 2021 22:07:40 -0700 Subject: [PATCH 03/23] Updated documentation of return values for get_structure_features. 
--- pensa/features/pyemma_features.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pensa/features/pyemma_features.py b/pensa/features/pyemma_features.py index ee60b067..c9f18cf7 100644 --- a/pensa/features/pyemma_features.py +++ b/pensa/features/pyemma_features.py @@ -41,9 +41,9 @@ def get_structure_features(pdb, xtc, start_frame=0, step_width=1, cossin=False, Returns ------- - feature_names : list of str + feature_names : dict of lists of str Names of all features - features_data : numpy array + features_data : dict of numpy arrays Data for all features """ From b0f2343d1ca650a6c2c9af67f396c55571919c72 Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Tue, 20 Jul 2021 22:10:59 -0700 Subject: [PATCH 04/23] Added min_ksp function. --- pensa/comparison/metrics.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pensa/comparison/metrics.py b/pensa/comparison/metrics.py index b6d7ad3e..f4a5bdea 100644 --- a/pensa/comparison/metrics.py +++ b/pensa/comparison/metrics.py @@ -66,7 +66,7 @@ def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_ """ - Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. Each of four functions uses the kolmogorov_smirnov_analysis function with the same parameters. + Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. Each of five functions uses the kolmogorov_smirnov_analysis function with the same parameters. Parameters ---------- @@ -98,6 +98,8 @@ def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_ Average Kolmogorov-Smirnov p-value for two distributions. max_ksp : float Maximum Kolmogorov-Smirnov statistic for two distributions. + min_ksp : float + Minimum Kolmogorov-Smirnov statistic for two distributions. """ @@ -119,6 +121,10 @@ def max_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, overri _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) return np.max(data_ksp) +def min_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.min(data_ksp) + """ Calculates average and maximum State Specific Information statistic for a feature across two ensembles. Each of two functions uses the ssi_ensemble_analysis function with the same parameters. From 5945b90adffc69c090355b21687254b19af24d68 Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Wed, 4 Aug 2021 16:46:08 -0700 Subject: [PATCH 05/23] Added pca_sampling_efficiency metric. 
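The new metric fits a PCA on the reference ensemble, projects both ensembles
onto its first num_pc components, and returns the ratio of the products of the
per-component variances, i.e. an approximate ratio of the volumes sampled in
principal-component space. A rough usage sketch (variable names are
placeholders; both arrays use the usual [frames,frame_data] feature layout):

    from pensa.comparison import pca_sampling_efficiency
    # Values near 1 indicate similar spread; values below 1 indicate that the
    # test ensemble covers less of the reference's principal-component space.
    pca_se = pca_sampling_efficiency(ref_data, test_data, num_pc=2)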
---
 pensa/comparison/metrics.py | 46 +++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/pensa/comparison/metrics.py b/pensa/comparison/metrics.py
index f4a5bdea..b276f9b1 100644
--- a/pensa/comparison/metrics.py
+++ b/pensa/comparison/metrics.py
@@ -1,6 +1,8 @@
 import numpy as np
 from pensa import *
 from pensa.comparison import *
+import random
+import math
 
 
 """
@@ -173,3 +175,47 @@ def average_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, p
 def max_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbs=True, verbose=True, write_plots=None, override_name_check=False):
     _, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check)
     return np.max(data_ssi)
+
+
+"""
+    Calculates the relative sampling efficiency of test data based on reference data.
+
+    Parameters
+    ----------
+    ref_data : float array
+        Trajectory data from the reference ensemble. Format: [frames,frame_data].
+    test_data : float array
+        Trajectory data from the test ensemble. Format: [frames,frame_data].
+    num_pc : int
+        Number of principal components used.
+
+    Returns
+    -------
+    pca_se : float
+        Sampling efficiency of test data based on reference data.
+
+"""
+
+def pca_sampling_efficiency(ref_data, test_data, num_pc=2):
+    pca = calculate_pca(ref_data)
+
+    _, ref_components = get_components_pca(ref_data, num_pc, pca=pca)
+    _, test_components = get_components_pca(test_data, num_pc, pca=pca)
+
+    ref_var = np.var(ref_components, axis=0)
+    test_var = np.var(test_components, axis=0)
+
+    ref_vol = np.prod(ref_var)
+    test_vol = np.prod(test_var)
+
+    pca_se = test_vol / ref_vol
+
+    return pca_se
+
+
+
+
+
+
+
+

From 544d53a1a762b62bc8887792d2e7f2c9c9b2e46f Mon Sep 17 00:00:00 2001
From: Jasper McAvity
Date: Wed, 4 Aug 2021 16:49:00 -0700
Subject: [PATCH 06/23] Added txt_features file.

---
 pensa/features/txt_features.py | 46 ++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 pensa/features/txt_features.py

diff --git a/pensa/features/txt_features.py b/pensa/features/txt_features.py
new file mode 100644
index 00000000..c9206360
--- /dev/null
+++ b/pensa/features/txt_features.py
@@ -0,0 +1,46 @@
+import numpy as np
+from pensa import *
+import random
+import math
+
+
+
+def get_txt_features_ala2(filename, num_frames, cossin=False):
+    phi = []
+    psi = []
+
+    with open(filename) as f:
+        for s in f.readlines():
+            if s == 'phi\n' or s == 'psi\n':
+                curr = s.strip()  # remember whether we are in the phi or the psi block
+            elif s == '\n':
+                curr = 'psi'
+            else:
+                if curr == 'phi':
+                    phi.append(float(s))
+                else:
+                    psi.append(float(s))
+
+    if len(phi) > num_frames:
+        temp = list(zip(phi, psi))
+        random.shuffle(temp)
+        phi, psi = zip(*temp)
+
+    features = []
+    if not cossin:
+        features = np.zeros((num_frames, 2))
+        for i in range(num_frames):
+            features[i, 0] = phi[i]
+            features[i, 1] = psi[i]
+    else:
+        features = np.zeros((num_frames, 4))
+        for i in range(num_frames):
+            features[i, 0] = math.cos(phi[i])
+            features[i, 1] = math.sin(phi[i])
+            features[i, 2] = math.cos(psi[i])
+            features[i, 3] = math.sin(psi[i])
+
+    return features
+
+
+

From b64491cdff74b66b8be5d1b20d980b595de01f3d Mon Sep 17 00:00:00 2001
From: Jasper McAvity
Date: Wed, 4 Aug 2021 16:50:31 -0700
Subject: [PATCH 07/23] Added txt_features to __init__ file.
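This makes get_txt_features_ala2 available directly from pensa.features. A
usage sketch, assuming a text file in the layout the parser expects (a 'phi'
header line, one angle per line, then a blank line or 'psi' header starting
the psi block); the file name and frame count here are placeholders:

    from pensa.features import get_txt_features_ala2
    # Returns a [num_frames, 2] array of (phi, psi) values as read from the file,
    # or a [num_frames, 4] array of their cosines and sines if cossin=True.
    features = get_txt_features_ala2("ala2_torsions.txt", num_frames=1000, cossin=True)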
--- pensa/features/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pensa/features/__init__.py b/pensa/features/__init__.py index 4c9a5bde..4a9a0ed4 100644 --- a/pensa/features/__init__.py +++ b/pensa/features/__init__.py @@ -8,4 +8,5 @@ from .mda_distances import * from .processing import * from .atom_features import * -from .water_features import * +from .water_features import * +from .txt_features import * From 6b50514366295e3ac9f702dc0fac9ba205fdd30d Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Thu, 12 Aug 2021 17:04:42 -0700 Subject: [PATCH 08/23] Added sorting functions to processing.py. --- pensa/features/processing.py | 59 ++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/pensa/features/processing.py b/pensa/features/processing.py index 55c76af0..5aa8efb7 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -218,6 +218,48 @@ def sort_sincos_torsions_by_resnum(tors, data): new_data = data[:,new_order] return new_tors, new_data +def sort_torsions_by_resnum(tors, data): + """ + Sort torsion features by the residue number.. + Parameters + ---------- + tors : list of str + The list of torsion features. + Returns + ------- + new_tors : list of str + The sorted list of torsion features. + """ + renamed = [] + for t in tors: + rn = t.split(' ')[-1] + ft = t.split(' ')[0] + renamed.append('%09i %s'%(int(rn),ft)) + new_order = np.argsort(renamed) + new_tors = np.array(tors)[new_order].tolist() + new_data = data[:,new_order] + return new_tors, new_data + +def sort_features_alphabetically(tors, data): + """ + Sort torsion features alphabetically. + Parameters + ---------- + tors : list of str + The list of torsion features. + Returns + ------- + new_tors : list of str + The sorted list of torsion features. 
+ """ + renamed = [] + for t in tors: + renamed.append(t) + new_order = np.argsort(renamed) + new_tors = np.array(tors)[new_order].tolist() + new_data = data[:,new_order] + return new_tors, new_data + def sort_distances_by_resnum(dist, data): """ @@ -241,13 +283,24 @@ def sort_distances_by_resnum(dist, data): return new_dist, new_data -def select_common_features(features_a, features_b): +def select_common_features(features_a, features_b, boolean=True): intersect = set(features_a).intersection(features_b) - is_common_a = [f in intersect for f in features_a] - is_common_b = [f in intersect for f in features_b] + if boolean: + is_common_a = [f in intersect for f in features_a] + is_common_b = [f in intersect for f in features_b] + else: + is_common_a = [f for f in features_a if f in intersect] + is_common_b = [f for f in features_b if f in intersect] return np.array(is_common_a), np.array(is_common_b) +def get_common_features_data(features_a, features_b, data_a, data_b): + is_common_a, is_common_b = select_common_features(features_a, features_b) + new_data_a = data_a[:,is_common_a] + new_data_b = data_b[:, is_common_b] + new_features_a, new_features_b = select_common_features(features_a, features_b, boolean=False) + return new_features_a, new_features_b, new_data_a, new_data_b + # -- Utilities to process feature data -- From 64e158d528136bf76070c9b59e3241dbd118d9f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Mon, 16 Aug 2021 21:21:17 -0700 Subject: [PATCH 09/23] test script for Sherlock --- test.sb | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 test.sb diff --git a/test.sb b/test.sb new file mode 100644 index 00000000..9faa2f29 --- /dev/null +++ b/test.sb @@ -0,0 +1,16 @@ +#!/bin/bash + +#SBATCH --time=24:00:00 +#SBATCH --mem=20G +#SBATCH --partition=rondror +#SBATCH --qos=high_p + +# Activate PENSA environment +source /home/users/mvoegele/miniconda3/etc/profile.d/conda.sh +conda activate /oak/stanford/groups/rondror/users/mvoegele/envs/pensa_dev + +# Run the tests (ignoring diffnets for now) +pytest --ignore pensa/diffnets/tests/test_api.py \ + --ignore pensa/diffnets/tests/test_cli.py \ + --ignore pensa/diffnets/tests/test_diffnets.py + From b5010bbb3fb0473f48233909c7e636722e609d69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Mon, 16 Aug 2021 21:35:22 -0700 Subject: [PATCH 10/23] ignore slurm files --- .gitignore | 3 +++ test.sb => run-test.sb | 0 2 files changed, 3 insertions(+) rename test.sb => run-test.sb (100%) diff --git a/.gitignore b/.gitignore index 98a70d86..2814d516 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,9 @@ tests/test_data/MOR-*/.*.npz *.ipynb .DS_Store +# Files from workload manager +slurm*.out + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/test.sb b/run-test.sb similarity index 100% rename from test.sb rename to run-test.sb From 4cb1c235f5bc4fdfb69fe595569c88741e5a07cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Mon, 16 Aug 2021 22:03:40 -0700 Subject: [PATCH 11/23] add general function to sort coord traj --- pensa/preprocessing/coordinates.py | 100 +++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 28 deletions(-) diff --git a/pensa/preprocessing/coordinates.py b/pensa/preprocessing/coordinates.py index aa476316..0e83ec3d 100644 --- a/pensa/preprocessing/coordinates.py +++ b/pensa/preprocessing/coordinates.py @@ -7,33 +7,6 @@ # -- Functions to preprocess trajectories -- -def align_coordinates(ref, pdb, trj_list, 
out_name, sel_string='all', start_frame=0): - """ - Aligns selected coordinates from a trajectory file. - - Parameters - ---------- - ref : str - File name for reference topology. - Can read all MDAnalysis-compatible topology formats. - pdb : str - File name for reference PDB file. - trj_list : list of str - File names for the input trajectory. - Can read all MDAnalysis-compatible trajectory formats. - out_name : str - Core of the file names for the output files - start_frame : int, optional - First frame to read from the trajectory. - """ - # Read the reference+PDB files and align selected parts. - u = mda.Universe(ref, pdb) - for trj in trj_list: - mobile = mda.Universe(ref, trj) - #mobile.trajectory = mobile.trajectory[start_frame:] - alignment = align.AlignTraj(mobile, u, select=sel_string, filename=f'{out_name}.xtc') - alignment.run() - def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, @@ -114,7 +87,7 @@ def extract_coordinates_combined(ref, trj, sel_string, out_name, start_frame=0, def merge_coordinates(ref_files, trj_files, out_name, segid=None): """ - Merges the trajectories of several different systems or system parts. + Merge several trajectories of the same system or system part. All trajectories must be (at least) as long as the first one. Parameters @@ -161,3 +134,74 @@ def merge_coordinates(ref_files, trj_files, out_name, segid=None): W.write(c.atoms) return univ + +def align_coordinates(ref, pdb, trj_list, out_name, sel_string='all', start_frame=0): + """ + Aligns selected coordinates from a trajectory file. + + Parameters + ---------- + ref : str + File name for reference topology. + Can read all MDAnalysis-compatible topology formats. + pdb : str + File name for reference PDB file. + trj_list : list of str + File names for the input trajectory. + Can read all MDAnalysis-compatible trajectory formats. + out_name : str + Core of the file names for the output files + start_frame : int, optional + First frame to read from the trajectory. + """ + # Read the reference+PDB files and align selected parts. + u = mda.Universe(ref, pdb) + for trj in trj_list: + mobile = mda.Universe(ref, trj) + #mobile.trajectory = mobile.trajectory[start_frame:] + alignment = align.AlignTraj(mobile, u, select=sel_string, filename=f'{out_name}.xtc') + alignment.run() + + +def sort_coordinates(values, ref_name, trj_name, out_name, start_frame=0): + """ + Sort coordinate frames along an array of values. + + Parameters + ---------- + values: float array. + Values along which to sort the trajectory. + ref_name: string. + reference topology for the trajectory. + trj_name: string. + Trajetory from which the frames are picked. + Usually the same as the values are from. + out_name: string. + Name of the output files + start_frame: int + Offset of the data with respect to the trajectories. + + Returns + ------- + sort_idx: float array + Sorted indices of the values. + oidx_sort: float array + Sorted indices of the trajectory. 
+ + """ + # Remember the index in the simulation (taking into account cutoff) + oidx = np.arange(len(values))+start_frame + # Define the MDAnalysis trajectory from where the frames come + u = mda.Universe(ref_name, trj_name) + a = u.select_atoms('all') + # Sort everything along the projection on the values + sort_idx = np.argsort(values) + oidx_sort = oidx[sort_idx] + # Write out sorted trajectory + with mda.Writer(out_name, a.n_atoms) as W: + for i in range(len(values)): + ts = u.trajectory[oidx_sort[i]] + W.write(a) + return sort_idx, oidx_sort + + From caa9065e127a91a20453a0dae743cd9d1beb205f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Mon, 16 Aug 2021 22:04:29 -0700 Subject: [PATCH 12/23] use general coord sort function for simple pca/tica write-out --- pensa/dimensionality/pca.py | 28 +++++++++++----------------- pensa/dimensionality/tica.py | 20 +++++++------------- tests/test_workflow.py | 18 +++++++----------- 3 files changed, 25 insertions(+), 41 deletions(-) diff --git a/pensa/dimensionality/pca.py b/pensa/dimensionality/pca.py index 0947dff1..2b6d9c76 100644 --- a/pensa/dimensionality/pca.py +++ b/pensa/dimensionality/pca.py @@ -3,7 +3,7 @@ from pyemma.util.contexts import settings import MDAnalysis as mda import matplotlib.pyplot as plt - +from pensa.preprocessing import sort_coordinates # --- METHODS FOR PRINCIPAL COMPONENT ANALYSIS --- @@ -186,30 +186,24 @@ def sort_traj_along_pc(data, pca, start_frame, top, trj, out_name, num_pc=3): out_name : str Core part of the name of the output files + Returns + ------- + all_proj : list + All projections on the principal components """ # Remember the index in the simulation (taking into account cutoff) oidx = np.arange(len(data))+start_frame - # Define the MDAnalysis trajectories from where the frames come - u = mda.Universe(top,trj) - a = u.select_atoms('all') - return_str = [] - all_proj = [] + # Initialize output + all_sort = [] # Loop through the principal components for evi in range(num_pc): # Project the combined data on the principal component proj = project_on_pc(data,evi,pca=pca) - all_proj.append(proj) # Sort everything along the projection onto the PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_pc"+str(evi+1)+".xtc", a.n_atoms) as W: - for i in range(data.shape[0]): - ts = u.trajectory[oidx_sort[i]] - W.write(a) - return_str.append(a) - return return_str, all_proj + out_xtc = out_name+"_pc"+str(evi+1)+".xtc" + sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) + all_sort.append(oidx_sort) + return all_sort def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_pc=3): diff --git a/pensa/dimensionality/tica.py b/pensa/dimensionality/tica.py index 07a0c370..41b82d59 100644 --- a/pensa/dimensionality/tica.py +++ b/pensa/dimensionality/tica.py @@ -3,7 +3,7 @@ from pyemma.util.contexts import settings import MDAnalysis as mda import matplotlib.pyplot as plt - +from pensa.preprocessing import sort_coordinates # --- METHODS FOR TIME-LAGGED INDEPENDENT COMPONENT ANALYSIS --- @@ -186,23 +186,17 @@ def sort_traj_along_tic(data, tica, start_frame, top, trj, out_name, num_tic=3): """ # Remember the index in the simulation (taking into account cutoff) oidx = np.arange(len(data))+start_frame - # Define the MDAnalysis trajectories from where the frames come - u = mda.Universe(top,trj) - a = u.select_atoms('all') + 
# Initialize output + all_sort = [] # Loop through the time-lagged independent components for evi in range(num_tic): # Project the combined data on the time-lagged independent component proj = project_on_tic(data,evi,tica=tica) # Sort everything along the projection onto the TIC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the TIC - with mda.Writer(out_name+"_tic"+str(evi+1)+".xtc", a.n_atoms) as W: - for i in range(data.shape[0]): - ts = u.trajectory[oidx_sort[i]] - W.write(a) - return oidx_sort + out_xtc = out_name+"_tic"+str(evi+1)+".xtc" + sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) + all_sort.append(oidx_sort) + return all_sort def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_tic=3): diff --git a/tests/test_workflow.py b/tests/test_workflow.py index b1898e54..aa5c951c 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -94,10 +94,10 @@ def setUp(self): pca_a = calculate_pca(self.sim_a_tmr_data['bb-torsions']) pca_features(pca_a, self.sim_a_tmr_feat['bb-torsions'], 3, 0.4) plt.close() - self.sort_traj, self.all_proj = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], pca_a, 0, - test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", num_pc=3) + self.all_sort = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], pca_a, 0, + test_data_path + "/traj/condition-a_receptor.gro", + test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/pca/condition-a_receptor_by_tmr", num_pc=3) # -- Compare projections self.val = compare_projections(self.sim_a_tmr_data['bb-torsions'], @@ -277,11 +277,7 @@ def test_sort_trajs_along_pc(self): for ele in self.sort_common_traj: self.assertEqual(ele.n_atoms, 2322) - self.assertEqual(len(self.sort_traj), 90) - for ele in self.sort_traj: - self.assertEqual(ele.n_atoms, 2322) - - self.assertEqual(len(self.all_proj), 3) + self.assertEqual(len(self.all_sort), 3) # -- sort_trajs_along_common_tic() def test_sort_trajs_along_common_tic(self): @@ -298,11 +294,11 @@ def test_sort_trajs_along_common_tic(self): # -- sort_traj_along_tic() def test_sort_traj_along_tic(self): - oidx = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], self.tica_combined, 0, + all_sort = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], self.tica_combined, 0, test_data_path + "/traj/condition-a_receptor.gro", test_data_path + "/traj/condition-a_receptor.xtc", test_data_path + "/pca/condition-a_receptor_by_tmr", num_tic=3) - self.assertEqual(len(oidx), 30) + self.assertEqual(len(all_sort), 3) # -- compare_projections() From a42d6d0d4a0d1615114eb058991dfb1443d91704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Mon, 16 Aug 2021 22:33:57 -0700 Subject: [PATCH 13/23] output both index types for pca/tica sort --- pensa/dimensionality/pca.py | 27 +++++++++++++++------------ pensa/dimensionality/tica.py | 21 ++++++++++++++------- tests/test_workflow.py | 16 ++++++++-------- 3 files changed, 37 insertions(+), 27 deletions(-) diff --git a/pensa/dimensionality/pca.py b/pensa/dimensionality/pca.py index 2b6d9c76..61928d64 100644 --- a/pensa/dimensionality/pca.py +++ b/pensa/dimensionality/pca.py @@ -188,22 +188,25 @@ def sort_traj_along_pc(data, pca, start_frame, top, trj, out_name, num_pc=3): Returns ------- - all_proj : list - All projections on the 
principal components - """ - # Remember the index in the simulation (taking into account cutoff) - oidx = np.arange(len(data))+start_frame + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component + + """ # Initialize output - all_sort = [] - # Loop through the principal components + sorted_indices_data = [] + sorted_indices_traj = [] + # Loop through the time-lagged independent components for evi in range(num_pc): - # Project the combined data on the principal component - proj = project_on_pc(data,evi,pca=pca) - # Sort everything along the projection onto the PC + # Project the combined data on the time-lagged independent component + proj = project_on_pc(data, evi, pca=pca) + # Sort everything along the projection onto the TIC out_xtc = out_name+"_pc"+str(evi+1)+".xtc" sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) - all_sort.append(oidx_sort) - return all_sort + sorted_indices_data.append(sort_idx) + sorted_indices_traj.append(oidx_sort) + return sorted_indices_data, sorted_indices_traj def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_pc=3): diff --git a/pensa/dimensionality/tica.py b/pensa/dimensionality/tica.py index 41b82d59..67775cb5 100644 --- a/pensa/dimensionality/tica.py +++ b/pensa/dimensionality/tica.py @@ -182,21 +182,28 @@ def sort_traj_along_tic(data, tica, start_frame, top, trj, out_name, num_tic=3): Should be the same as data was from. out_name : str Core part of the name of the output files. + + Returns + ------- + sorted_indices_data : list + Sorted indices of the data array for each independent components + sorted_indices_traj : list + Sorted indices of the coordinate frames for each independent components """ - # Remember the index in the simulation (taking into account cutoff) - oidx = np.arange(len(data))+start_frame # Initialize output - all_sort = [] - # Loop through the time-lagged independent components + sorted_indices_data = [] + sorted_indices_traj = [] + # Loop through the independent components for evi in range(num_tic): - # Project the combined data on the time-lagged independent component + # Project the combined data on the independent component proj = project_on_tic(data,evi,tica=tica) # Sort everything along the projection onto the TIC out_xtc = out_name+"_tic"+str(evi+1)+".xtc" sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) - all_sort.append(oidx_sort) - return all_sort + sorted_indices_data.append(sort_idx) + sorted_indices_traj.append(oidx_sort) + return sorted_indices_data, sorted_indices_traj def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_tic=3): diff --git a/tests/test_workflow.py b/tests/test_workflow.py index aa5c951c..b9ddd99f 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -94,10 +94,10 @@ def setUp(self): pca_a = calculate_pca(self.sim_a_tmr_data['bb-torsions']) pca_features(pca_a, self.sim_a_tmr_feat['bb-torsions'], 3, 0.4) plt.close() - self.all_sort = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], pca_a, 0, - test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", num_pc=3) + self.all_sort, _ = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], pca_a, 0, + test_data_path + 
"/traj/condition-a_receptor.gro", + test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/pca/condition-a_receptor_by_tmr", num_pc=3) # -- Compare projections self.val = compare_projections(self.sim_a_tmr_data['bb-torsions'], @@ -294,10 +294,10 @@ def test_sort_trajs_along_common_tic(self): # -- sort_traj_along_tic() def test_sort_traj_along_tic(self): - all_sort = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], self.tica_combined, 0, - test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", num_tic=3) + all_sort, _ = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], self.tica_combined, 0, + test_data_path + "/traj/condition-a_receptor.gro", + test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/pca/condition-a_receptor_by_tmr", num_tic=3) self.assertEqual(len(all_sort), 3) From 8c9d7aa81fe7b784aac0acd685124a1a97fb0cdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Mon, 16 Aug 2021 23:45:03 -0700 Subject: [PATCH 14/23] add function to sort traj by mda feature --- pensa/features/mda_distances.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/pensa/features/mda_distances.py b/pensa/features/mda_distances.py index d997d8a1..b6d07727 100644 --- a/pensa/features/mda_distances.py +++ b/pensa/features/mda_distances.py @@ -178,3 +178,36 @@ def get_gpcr_calpha_distances(pdb, xtc, gpcr_name, res_dbnum, return names, distlabels, data +def sort_traj_along_mda_feature(feat, data, feature_name, ref_name, trj_name, out_name, start_frame=0): + """ + Sort a trajectory along a feature. + + Parameters + ---------- + feat : list of str + List with all feature names. + data : float array + Feature values data from the simulation. + feature_name : str + Name of the selected feature. + ref_name: string + Reference topology for the trajectory. + trj_name: string + Trajetory from which the frames are picked. + Usually the same as the values are from. + out_name: string. + Name of the output files. + start_frame: int + Offset of the data with respect to the trajectories. + + Returns + ------- + d_sorted: float array + Sorted data of the selected feature. + + """ + d = pensa.get_feature_data(feat, data, feature_name) + sort_idx, oidx_sort = pensa.sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) + d_sorted = d[sort_idx] + return d_sorted + From bb79bd11e0ccf6d388b7d6b7cc9392a237a9bf30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Tue, 17 Aug 2021 01:55:52 -0700 Subject: [PATCH 15/23] add function to sort a traj by one feature --- pensa/features/mda_distances.py | 33 --------------------------- pensa/features/processing.py | 37 +++++++++++++++++++++++++++++-- pensa/features/pyemma_features.py | 37 ++++++++++++++++++++++++++++++- 3 files changed, 71 insertions(+), 36 deletions(-) diff --git a/pensa/features/mda_distances.py b/pensa/features/mda_distances.py index b6d07727..d997d8a1 100644 --- a/pensa/features/mda_distances.py +++ b/pensa/features/mda_distances.py @@ -178,36 +178,3 @@ def get_gpcr_calpha_distances(pdb, xtc, gpcr_name, res_dbnum, return names, distlabels, data -def sort_traj_along_mda_feature(feat, data, feature_name, ref_name, trj_name, out_name, start_frame=0): - """ - Sort a trajectory along a feature. - - Parameters - ---------- - feat : list of str - List with all feature names. 
- data : float array - Feature values data from the simulation. - feature_name : str - Name of the selected feature. - ref_name: string - Reference topology for the trajectory. - trj_name: string - Trajetory from which the frames are picked. - Usually the same as the values are from. - out_name: string. - Name of the output files. - start_frame: int - Offset of the data with respect to the trajectories. - - Returns - ------- - d_sorted: float array - Sorted data of the selected feature. - - """ - d = pensa.get_feature_data(feat, data, feature_name) - sort_idx, oidx_sort = pensa.sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) - d_sorted = d[sort_idx] - return d_sorted - diff --git a/pensa/features/processing.py b/pensa/features/processing.py index 55c76af0..2cf1839e 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -9,8 +9,7 @@ import matplotlib.pyplot as plt import os import warnings -#from pensa.features import * - +from pensa.preprocessing import sort_coordinates # -- Utilities to extract time series -- @@ -283,3 +282,37 @@ def correct_angle_periodicity(angle): return new_angle +# Process trajectories according to feature data + +def sort_traj_along_feature(feat, data, feature_name, ref_name, trj_name, out_name, start_frame=0): + """ + Sort a trajectory along a feature. + + Parameters + ---------- + feat : list of str + List with all feature names. + data : float array + Feature values data from the simulation. + feature_name : str + Name of the selected feature. + ref_name: string + Reference topology for the trajectory. + trj_name: string + Trajetory from which the frames are picked. + Usually the same as the values are from. + out_name: string. + Name of the output files. + start_frame: int + Offset of the data with respect to the trajectories. + + Returns + ------- + d_sorted: float array + Sorted data of the selected feature. + + """ + d = get_feature_data(feat, data, feature_name) + sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) + d_sorted = d[sort_idx] + return d_sorted diff --git a/pensa/features/pyemma_features.py b/pensa/features/pyemma_features.py index ee60b067..b1cf0e11 100644 --- a/pensa/features/pyemma_features.py +++ b/pensa/features/pyemma_features.py @@ -10,7 +10,8 @@ import numpy as np import pyemma from pyemma.util.contexts import settings - +from pensa.features.processing import get_feature_timeseries +from pensa.preprocessing.coordinates import sort_coordinates # -- Loading the Features -- @@ -169,4 +170,38 @@ def _remove_resnum_offset(features, offset): return new_features +def sort_traj_along_pyemma_feature(feat, data, feature_name, feature_type, ref_name, trj_name, out_name, start_frame=0): + """ + Sort a trajectory along a PyEMMA feature. + + Parameters + ---------- + feat : list of str + List with all feature names. + data : float array + Feature values data from the simulation. + feature_name : str + Name of the selected feature. + feature_type : str + Type of the selected feature. + ref_name: string + Reference topology for the trajectory. + trj_name: string + Trajetory from which the frames are picked. + Usually the same as the values are from. + out_name: string. + Name of the output files. + start_frame: int + Offset of the data with respect to the trajectories. + + Returns + ------- + d_sorted: float array + Sorted data of the selected feature. 
+ + """ + d = get_feature_timeseries(feat, data, feature_type, feature_name) + sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) + d_sorted = d[sort_idx] + return d_sorted From 4aad56b42c4feb2fe920b8e45b2010c4ee8b6f83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Tue, 17 Aug 2021 13:06:12 -0700 Subject: [PATCH 16/23] log and doc for traj sort functions --- pensa/features/processing.py | 3 ++- pensa/preprocessing/coordinates.py | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pensa/features/processing.py b/pensa/features/processing.py index 2cf1839e..52f7c948 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -284,7 +284,7 @@ def correct_angle_periodicity(angle): # Process trajectories according to feature data -def sort_traj_along_feature(feat, data, feature_name, ref_name, trj_name, out_name, start_frame=0): +def sort_traj_along_feature(feat, data, feature_name, ref_name, trj_name, out_name, start_frame=0, verbose=False): """ Sort a trajectory along a feature. @@ -312,6 +312,7 @@ def sort_traj_along_feature(feat, data, feature_name, ref_name, trj_name, out_na Sorted data of the selected feature. """ + if verbose: print('Sorting along feature '+feature_name) d = get_feature_data(feat, data, feature_name) sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) d_sorted = d[sort_idx] diff --git a/pensa/preprocessing/coordinates.py b/pensa/preprocessing/coordinates.py index 0e83ec3d..8110ee8a 100644 --- a/pensa/preprocessing/coordinates.py +++ b/pensa/preprocessing/coordinates.py @@ -163,7 +163,7 @@ def align_coordinates(ref, pdb, trj_list, out_name, sel_string='all', start_fram alignment.run() -def sort_coordinates(values, ref_name, trj_name, out_name, start_frame=0): +def sort_coordinates(values, ref_name, trj_name, out_name, start_frame=0, verbose=False): """ Sort coordinate frames along an array of values. @@ -189,10 +189,15 @@ def sort_coordinates(values, ref_name, trj_name, out_name, start_frame=0): Sorted indices of the trajectory. """ - # Remember the index in the simulation (taking into account cutoff) + # Remember the index in the simulation (taking into account offset) oidx = np.arange(len(values))+start_frame # Define the MDAnalysis trajectory from where the frames come + if verbose: print('Loading:', ref_name, trj_name) u = mda.Universe(ref_name, trj_name) + if verbose: + print('Trajectory length:', len(u.trajectory)) + print('Number of values: ', len(values)) + print('Trajectory offset:', start_frame) a = u.select_atoms('all') # Sort everything along the projection on the values sort_idx = np.argsort(values) From 4414b1ccfe0f02594fb9e4961fbf3a964775b9b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Wed, 18 Aug 2021 00:03:54 -0700 Subject: [PATCH 17/23] make dim. 
reduction functions more consistent --- docs/tut-5-dimensionality.rst | 14 +- pensa/dimensionality/__init__.py | 1 + pensa/dimensionality/pca.py | 298 ++++++------------ pensa/dimensionality/tica.py | 268 ++++++---------- pensa/dimensionality/visualization.py | 183 +++++++++++ pensa/features/processing.py | 3 +- pensa/preprocessing/coordinates.py | 152 +++++++-- ...calculate_combined_principal_components.py | 4 +- tests/test_workflow.py | 34 +- 9 files changed, 521 insertions(+), 436 deletions(-) create mode 100644 pensa/dimensionality/visualization.py diff --git a/docs/tut-5-dimensionality.rst b/docs/tut-5-dimensionality.rst index 93c072c1..f053eed9 100644 --- a/docs/tut-5-dimensionality.rst +++ b/docs/tut-5-dimensionality.rst @@ -89,13 +89,12 @@ entire receptor, sorted by the PCs of the transmembrane region. _ = sort_trajs_along_common_pc(sim_a_tmr_data['bb-torsions'], sim_b_tmr_data['bb-torsions'], - feature_start_frame, "traj/condition-a_receptor.gro", "traj/condition-b_receptor.gro", "traj/condition-a_receptor.xtc", "traj/condition-b_receptor.xtc", "pca/receptor_by_tmr", - num_pc=3) + num_pc=3, start_frame=feature_start_frame) The above function deals with the special case of two input trajectories. We also provide the functions for a single one (see @@ -121,8 +120,9 @@ Here are the major steps of a PCA demonstrated for a single simulation. .. code:: python - _, __ = sort_traj_along_pc(sim_a_tmr_data['bb-torsions'], - pca_a, feature_start_frame, - "traj/condition-a_receptor.gro", - "traj/condition-a_receptor.xtc", - "pca/condition-a_receptor_by_tmr", num_pc=3) + _, __, ___ = sort_traj_along_pc(sim_a_tmr_data['bb-torsions'], + "traj/condition-a_receptor.gro", + "traj/condition-a_receptor.xtc", + "pca/condition-a_receptor_by_tmr", + start_frame = feature_start_frame, + pca=pca_a, num_pc=3) diff --git a/pensa/dimensionality/__init__.py b/pensa/dimensionality/__init__.py index 3ae57a17..bbf189db 100644 --- a/pensa/dimensionality/__init__.py +++ b/pensa/dimensionality/__init__.py @@ -1,3 +1,4 @@ +from .visualization import * from .pca import * from .tica import * diff --git a/pensa/dimensionality/pca.py b/pensa/dimensionality/pca.py index 61928d64..6d79d4cb 100644 --- a/pensa/dimensionality/pca.py +++ b/pensa/dimensionality/pca.py @@ -3,11 +3,14 @@ from pyemma.util.contexts import settings import MDAnalysis as mda import matplotlib.pyplot as plt -from pensa.preprocessing import sort_coordinates +from pensa.preprocessing import sort_coordinates, merge_and_sort_coordinates +from .visualization import project_on_eigenvector, sort_traj_along_projection # --- METHODS FOR PRINCIPAL COMPONENT ANALYSIS --- +# http://www.emma-project.org/latest/api/generated/pyemma.coordinates.pca.html + def calculate_pca(data): """ @@ -115,14 +118,11 @@ def project_on_pc(data, ev_idx, pca=None): Value along the PC for each frame. """ - # Perform PCA if none is provided + # Perform PCA if none is provided. if pca is None: - pca = pyemma.coordinates.pca(data) #,dim=3) - # Project the features onto the principal components - projection = np.zeros(data.shape[0]) - for ti in range(data.shape[0]): - projection[ti] = np.dot(data[ti],pca.eigenvectors[:,ev_idx]) - # Return the value along the PC for each frame + pca = pyemma.coordinates.pca(data) + # Project the features onto the principal components. 
+ projection = project_on_eigenvector(data, ev_idx, pca) return projection @@ -164,20 +164,14 @@ def get_components_pca(data, num, pca=None, prefix=''): return comp_names, np.array(components).T -def sort_traj_along_pc(data, pca, start_frame, top, trj, out_name, num_pc=3): +def sort_traj_along_pc(data, top, trj, out_name, pca=None, num_pc=3, start_frame=0): """ - Sort a trajectory along given principal components. + Sort a trajectory along principal components. Parameters ---------- data : float array Trajectory data [frames,frame_data]. - pca : PCA obj - Principal components information. - num_pc : int - Sort along the first num_pc principal components. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top : str File name of the reference topology for the trajectory. trj : str @@ -185,31 +179,38 @@ def sort_traj_along_pc(data, pca, start_frame, top, trj, out_name, num_pc=3): Should be the same as data was from. out_name : str Core part of the name of the output files + pca : PCA obj, optional + Principal components information. + If none is provided, it will be calculated. + Defaults to None. + num_pc : int, optional + Sort along the first num_pc principal components. + Defaults to 3. + start_frame : int, optional + Offset of the data with respect to the trajectories (defined below). + Defaults to 0. Returns ------- + sorted_proj: list + sorted projections on each principal component sorted_indices_data : list Sorted indices of the data array for each principal component sorted_indices_traj : list Sorted indices of the coordinate frames for each principal component """ - # Initialize output - sorted_indices_data = [] - sorted_indices_traj = [] - # Loop through the time-lagged independent components - for evi in range(num_pc): - # Project the combined data on the time-lagged independent component - proj = project_on_pc(data, evi, pca=pca) - # Sort everything along the projection onto the TIC - out_xtc = out_name+"_pc"+str(evi+1)+".xtc" - sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) - sorted_indices_data.append(sort_idx) - sorted_indices_traj.append(oidx_sort) - return sorted_indices_data, sorted_indices_traj + # Calculate the principal components if they are not given. + if pca is None: + pca = pyemma.coordinates.pca(all_data, dim=3) + # Sort the trajectory along them. + sorted_proj, sorted_indices_data, sorted_indices_traj = sort_traj_along_projection( + data, pca, top, trj, out_name, num_comp=num_pc, start_frame = start_frame + ) + return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_pc=3): +def sort_trajs_along_common_pc(data_a, data_b, top_a, top_b, trj_a, trj_b, out_name, num_pc=3, start_frame=0): """ Sort two trajectories along their most important common principal components. @@ -219,8 +220,6 @@ def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, Trajectory data [frames,frame_data]. data_b : float array Trajectory data [frames,frame_data]. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top_a : str Reference topology for the first trajectory. top_b : str @@ -233,48 +232,30 @@ def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, Should be the same as data_b was from. out_name : str Core part of the name of the output files. 
- + num_pc : int, optional + Sort along the first num_pc principal components. + Defaults to 3. + start_frame : int or list of int + Offset of the data with respect to the trajectories. + Defaults to 0. + + Returns + ------- + sorted_proj: list + sorted projections on each principal component + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component + """ - # Combine the input data - data = np.concatenate([data_a,data_b],0) - # Remember which simulation the data came frome - cond = np.concatenate([np.ones(len(data_a)), np.zeros(len(data_b))]) - # Remember the index in the respective simulation (taking into account cutoff) - oidx = np.concatenate([np.arange(len(data_a))+start_frame, - np.arange(len(data_b))+start_frame]) - # Calculate the principal components - pca = pyemma.coordinates.pca(data,dim=3) - # Define the MDAnalysis trajectories from where the frames come - ua = mda.Universe(top_a,trj_a) - ub = mda.Universe(top_b,trj_b) - # ... and select all atoms - aa = ua.select_atoms('all') - ab = ub.select_atoms('all') - return_str = [] - # Loop over principal components. - for evi in range(num_pc): - # Project the combined data on the principal component - proj = project_on_pc(data,evi,pca=pca) - # Sort everything along the projection on th resp. PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - cond_sort = cond[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_pc"+str(evi+1)+".xtc", aa.n_atoms) as W: - for i in range(data.shape[0]): - if cond_sort[i] == 1: # G-protein bound - ts = ua.trajectory[oidx_sort[i]] - W.write(aa) - return_str.append(aa) - elif cond_sort[i] == 0: # arrestin bound - ts = ub.trajectory[oidx_sort[i]] - W.write(ab) - return_str.append(ab) - return return_str + sorted_proj, sorted_indices_data, sorted_indices_traj = sort_mult_trajs_along_common_pc( + [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_pc=3, start_frame = start_frame + ) + return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_mult_trajs_along_common_pc(data, start_frame, top, trj, out_name, num_pc=3): +def sort_mult_trajs_along_common_pc(data, top, trj, out_name, num_pc=3, start_frame=0): """ Sort multiple trajectories along their most important common principal components. @@ -282,8 +263,6 @@ def sort_mult_trajs_along_common_pc(data, start_frame, top, trj, out_name, num_p ---------- data : list of float arrays List of trajectory data arrays, each [frames,frame_data]. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top : list of str Reference topology files. trj : list of str @@ -291,148 +270,47 @@ def sort_mult_trajs_along_common_pc(data, start_frame, top, trj, out_name, num_p trj[i] should be the same as data[i] was from. out_name : str Core part of the name of the output files. + num_pc : int, optional + Sort along the first num_pc principal components. + Defaults to 3. + start_frame : int or list of int + Offset of the data with respect to the trajectories. + Defaults to 0. 
+ + Returns + ------- + sorted_proj: list + sorted projections on each principal component + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component """ num_frames = [len(d) for d in data] num_traj = len(data) + if type(start_frame) == int: + start_frame *= np.ones(num_traj) + start_frame = start_frame.tolist() # Combine the input data - data = np.concatenate(data,0) - # Remember which simulation the data came frome - cond = np.concatenate([i*np.ones(num_frames[i],dtype=int) for i in range(num_traj)]) - # Remember the index in the respective simulation (taking into account cutoff) - oidx = np.concatenate([np.arange(num_frames[i])+start_frame for i in range(num_traj)]) - # Calculate the principal components - pca = pyemma.coordinates.pca(data,dim=3) - # Define the MDAnalysis trajectories from where the frames come - univs = [] - atoms = [] - for j in range(num_traj): - u = mda.Universe(top[j],trj[j]) - print('Length of trajectory',len(u.trajectory)) - univs.append(u) - atoms.append(u.select_atoms('all')) + all_data = np.concatenate(data,0) + # Calculate the principal component + pca = pyemma.coordinates.pca(all_data, dim=3) + # Initialize output + sorted_proj = [] + sorted_indices_data = [] + sorted_indices_traj = [] # Loop over principal components. for evi in range(num_pc): # Project the combined data on the principal component - proj = project_on_pc(data,evi,pca=pca) - # Sort everything along the projection on th resp. PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - cond_sort = cond[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_pc"+str(evi+1)+".xtc", atoms[0].n_atoms) as W: - for i in range(data.shape[0]): - j = cond_sort[i] - o = oidx_sort[i] - uj = univs[j] - ts = uj.trajectory[o] - W.write(atoms[j]) - return - - -def compare_projections(data_a, data_b, pca, num=3, saveas=None, label_a=None, label_b=None): - """ - Compare two datasets along a given principal component. - - Parameters - ---------- - data_a : float array - Trajectory data [frames,frame_data] - data_b : float array - Trajectory data [frames,frame_data] - pca : PCA object - Principal components information. - num : int - Number of principal components to plot. - saveas : str, optional - Name of the output file. - label_a : str, optional - Label for the first dataset. - label_b : str, optional - Label for the second dataset. 
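# A usage sketch for sort_mult_trajs_along_common_pc() as rewritten above: any
# number of ensembles can be passed, and start_frame may be one offset or one
# offset per trajectory. The arrays are synthetic stand-ins and the file names
# are placeholders; only the call signature comes from the function itself.
import numpy as np
from pensa.dimensionality.pca import sort_mult_trajs_along_common_pc

data = [np.random.rand(500, 24) for _ in range(3)]            # stand-in feature sets
tops = ["traj/condition-%s.gro" % c for c in "abc"]           # placeholder topologies
trjs = ["traj/condition-%s.xtc" % c for c in "abc"]           # placeholder trajectories
sorted_proj, sorted_idx_data, sorted_idx_traj = sort_mult_trajs_along_common_pc(
    data, tops, trjs, out_name="pca/combined_receptor",
    num_pc=3, start_frame=[0, 0, 2000],                       # per-trajectory offsets
)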
- - """ - # Start the figure - fig,ax = plt.subplots(num, 2, figsize=[8,3*num], dpi=300) - val = [] - # Loop over PCs - for evi in range(num): - # Calculate values along PC for each frame - proj_a = project_on_pc(data_a, evi, pca=pca) - proj_b = project_on_pc(data_b, evi, pca=pca) - # Plot the time series in the left panel - ax[evi,0].plot(proj_a, alpha=0.5, label=label_a) - ax[evi,0].plot(proj_b, alpha=0.5, label=label_b) - ax[evi,0].set_xlabel('frame number') - ax[evi,0].set_ylabel('PC %i'%(evi+1)) - # Plot the histograms in the right panel - ax[evi,1].hist(proj_a, bins=30, alpha=0.5, density=True, label=label_a) - ax[evi,1].hist(proj_b, bins=30, alpha=0.5, density=True, label=label_b) - ax[evi,1].set_xlabel('PC %i'%(evi+1)) - ax[evi,1].set_ylabel('frequency') - # Legend - if label_a and label_b: - ax[evi,0].legend() - ax[evi,1].legend() - val.append([proj_a, proj_b]) - fig.tight_layout() - # Save the figure - if saveas is not None: - fig.savefig(saveas, dpi=300) - return val - - -def compare_mult_projections(data, pca, num=3, saveas=None, labels=None, colors=None): - """ - Compare two datasets along a given principal component. - - Parameters - ---------- - data : list of float arrays - Data from multiple trajectories [frames,frame_data] - pca : PCA object - Principal components information. - num : int - Number of principal components to plot. - saveas : str, optional - Name of the output file. - labels : list of str, optional - Labels for the datasets. If provided, it must have the same length as data. - - """ - if labels is not None: - assert len(labels) == len(data) - else: - labels = [None for _ in range(len(data))] - if colors is not None: - assert len(colors) == len(data) - else: - colors = ['C%i'%num for num in range(len(data))] - # Start the figure - fig,ax = plt.subplots(num, 2, figsize=[9,3*num], dpi=300) - # Loop over PCs - for evi in range(num): - for j,d in enumerate(data): - # Calculate values along PC for each frame - proj = project_on_pc(d, evi, pca=pca) - # Plot the time series in the left panel - ax[evi,0].plot(proj, alpha=0.5, - label=labels[j], color=colors[j]) - # Plot the histograms in the right panel - ax[evi,1].hist(proj, bins=30, alpha=0.5, density=True, - label=labels[j], color=colors[j]) - # Axis labels - ax[evi,0].set_xlabel('frame number') - ax[evi,0].set_ylabel('PC %i'%(evi+1)) - ax[evi,1].set_xlabel('PC %i'%(evi+1)) - ax[evi,1].set_ylabel('frequency') - # Legend - if labels[0] is not None: - ax[evi,0].legend() - ax[evi,1].legend() - fig.tight_layout() - # Save the figure - if saveas is not None: - fig.savefig(saveas, dpi=300) - return + proj = [project_on_pc(d, evi, pca=pca) for d in data] + # Sort everything along the projection on the respective PC + out_xtc = out_name+"_pc"+str(evi+1)+".xtc" + proj_sort, sort_idx, oidx_sort = merge_and_sort_coordinates( + proj, top, trj, out_xtc, start_frame=start_frame, verbose=False + ) + sorted_proj.append(proj_sort) + sorted_indices_data.append(sort_idx) + sorted_indices_traj.append(oidx_sort) + return sorted_proj, sorted_indices_data, sorted_indices_traj + diff --git a/pensa/dimensionality/tica.py b/pensa/dimensionality/tica.py index 67775cb5..01546eaa 100644 --- a/pensa/dimensionality/tica.py +++ b/pensa/dimensionality/tica.py @@ -3,11 +3,13 @@ from pyemma.util.contexts import settings import MDAnalysis as mda import matplotlib.pyplot as plt -from pensa.preprocessing import sort_coordinates +from pensa.preprocessing import sort_coordinates, merge_and_sort_coordinates +from .visualization import 
project_on_eigenvector, sort_traj_along_projection # --- METHODS FOR TIME-LAGGED INDEPENDENT COMPONENT ANALYSIS --- -# http://emma-project.org/latest/api/generated/pyemma.coordinates.tica.html#pyemma.coordinates.tica + +# http://emma-project.org/latest/api/generated/pyemma.coordinates.tica.html def calculate_tica(data): @@ -43,7 +45,7 @@ def tica_eigenvalues_plot(tica, num=12, plot_file=None): Path and name of the file to save the plot. """ - # Plot eigenvalues over component numbers + # Plot eigenvalues over component numbers. fig,ax = plt.subplots(1, 1, figsize=[4,3], dpi=300) componentnr = np.arange(num)+1 eigenvalues = tica.eigenvalues[:num] @@ -51,7 +53,7 @@ def tica_eigenvalues_plot(tica, num=12, plot_file=None): ax.set_xlabel('component number') ax.set_ylabel('eigenvalue') fig.tight_layout() - # Save the figure to a file + # Save the figure to a file. if plot_file: fig.savefig(plot_file, dpi=300) return componentnr, eigenvalues @@ -75,7 +77,7 @@ def tica_features(tica, features, num, threshold, plot_file=None): Path and name of the file to save the plot. """ - # Plot the highest TIC correlations and print relevant features + # Plot the highest TIC correlations and print relevant features. fig,ax = plt.subplots(num,1,figsize=[4,num*3],dpi=300,sharex=True) for i in range(num): relevant = tica.feature_TIC_correlation[:,i]**2 > threshold**2 @@ -87,7 +89,7 @@ def tica_features(tica, features, num, threshold, plot_file=None): ax[i].set_xlabel('feature index') ax[i].set_ylabel('correlation with TIC%i'%(i+1)) fig.tight_layout() - # Save the figure to a file + # Save the figure to a file. if plot_file: fig.savefig(plot_file,dpi=300) return test_feature @@ -112,14 +114,11 @@ def project_on_tic(data, ev_idx, tica=None): Value along the TIC for each frame. """ - # Perform TICA if none is provided + # Perform TICA if none is provided. if tica is None: - tica = pyemma.coordinates.tica(data) #,dim=3) - # Project the features onto the time-lagged independent components - projection = np.zeros(data.shape[0]) - for ti in range(data.shape[0]): - projection[ti] = np.dot(data[ti],tica.eigenvectors[:,ev_idx]) - # Return the value along the TIC for each frame + tica = pyemma.coordinates.tica(data) + # Project the features onto the time-lagged independent components. + projection = project_on_eigenvector(data, ev_idx, tica) return projection @@ -145,10 +144,10 @@ def get_components_tica(data, num, tica=None, prefix=''): Component data [frames,components] """ - # Perform tICA if none is provided + # Perform tICA if none is provided. if tica is None: tica = pyemma.coordinates.tica(data) - # Project the features onto the principal components + # Project the features onto the principal components. comp_names = [] components = [] for ev_idx in range(num): @@ -157,56 +156,57 @@ def get_components_tica(data, num, tica=None, prefix=''): projection[ti] = np.dot(data[ti],tica.eigenvectors[:,ev_idx]) components.append(projection) comp_names.append(prefix+'IC'+str(ev_idx+1)) - # Return the names and data + # Return the names and data. return comp_names, np.array(components).T -def sort_traj_along_tic(data, tica, start_frame, top, trj, out_name, num_tic=3): +def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, start_frame=0): """ - Sort a trajectory along given time-lagged independent components. + Sort a trajectory along independent components. Parameters ---------- data : float array Trajectory data [frames,frame_data]. - tica : TICA obj - Time-lagged independent components information. 
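# A small self-contained check of project_on_tic() with a pre-computed TICA
# object: the projection is simply the dot product of each frame with the
# selected eigenvector. Only synthetic data is used; the import path follows
# the module edited in this diff.
import numpy as np
from pensa.dimensionality.tica import calculate_tica, project_on_tic

data = np.cumsum(np.random.randn(500, 8), axis=0)   # correlated toy time series
tica = calculate_tica(data)                         # default lag time
proj0 = project_on_tic(data, 0, tica=tica)          # projection on TIC 1
assert proj0.shape == (500,)
assert np.allclose(proj0, data @ tica.eigenvectors[:, 0])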
- num_tic : int - Sort along the first num_tic time-lagged independent components. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top : str File name of the reference topology for the trajectory. trj : str File name of the trajetory from which the frames are picked. Should be the same as data was from. out_name : str - Core part of the name of the output files. - + Core part of the name of the output files + tica : tICA obj, optional + Time-lagged independent components information. + If none is provided, it will be calculated. + Defaults to None. + num_ic : int, optional + Sort along the first num_ic independent components. + Defaults to 3. + start_frame : int, optional + Offset of the data with respect to the trajectories (defined below). + Defaults to 0. + Returns ------- + sorted_proj: list + sorted projections on each principal component sorted_indices_data : list - Sorted indices of the data array for each independent components + Sorted indices of the data array for each principal component sorted_indices_traj : list - Sorted indices of the coordinate frames for each independent components - - """ - # Initialize output - sorted_indices_data = [] - sorted_indices_traj = [] - # Loop through the independent components - for evi in range(num_tic): - # Project the combined data on the independent component - proj = project_on_tic(data,evi,tica=tica) - # Sort everything along the projection onto the TIC - out_xtc = out_name+"_tic"+str(evi+1)+".xtc" - sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) - sorted_indices_data.append(sort_idx) - sorted_indices_traj.append(oidx_sort) - return sorted_indices_data, sorted_indices_traj + Sorted indices of the coordinate frames for each principal component + + """ + # Calculate the principal components if they are not given. + if tica is None: + tica = pyemma.coordinates.tica(all_data, dim=3) + # Sort the trajectory along them. + sorted_proj, sorted_indices_data, sorted_indices_traj = sort_traj_along_projection( + data, tica, top, trj, out_name, num_comp=num_ic, start_frame=start_frame + ) + return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_tic=3): +def sort_trajs_along_common_tic(data_a, data_b, top_a, top_b, trj_a, trj_b, out_name, num_ic=3, start_frame=0): """ Sort two trajectories along their most important common time-lagged independent components. @@ -216,8 +216,6 @@ def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a Trajectory data [frames,frame_data]. data_b : float array Trajectory data [frames,frame_data]. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top_a : str Reference topology for the first trajectory. top_b : str @@ -230,54 +228,37 @@ def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a Should be the same as data_b was from. out_name : str Core part of the name of the output files. + num_ic : int, optional + Sort along the first num_ic independent components. + Defaults to 3. + start_frame : int, optional + Offset of the data with respect to the trajectories (defined below). + Defaults to 0. 
+ + Returns + ------- + sorted_proj: list + sorted projections on each principal component + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component """ - # Combine the input data - data = np.concatenate([data_a,data_b],0) - # Remember which simulation the data came frome - cond = np.concatenate([np.ones(len(data_a)), np.zeros(len(data_b))]) - # Remember the index in the respective simulation (taking into account cutoff) - oidx = np.concatenate([np.arange(len(data_a))+start_frame, - np.arange(len(data_b))+start_frame]) - # Calculate the time-lagged independent components - tica = pyemma.coordinates.tica(data,dim=3) - # Define the MDAnalysis trajectories from where the frames come - ua = mda.Universe(top_a,trj_a) - ub = mda.Universe(top_b,trj_b) - # ... and select all atoms - aa = ua.select_atoms('all') - ab = ub.select_atoms('all') - # Loop over time-lagged independent components. - for evi in range(num_tic): - # Project the combined data on the time-lagged independent component - proj = project_on_tic(data,evi,tica=tica) - # Sort everything along the projection on th resp. PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - cond_sort = cond[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_tic"+str(evi+1)+".xtc", aa.n_atoms) as W: - for i in range(data.shape[0]): - if cond_sort[i] == 1: # G-protein bound - ts = ua.trajectory[oidx_sort[i]] - W.write(aa) - elif cond_sort[i] == 0: # arrestin bound - ts = ub.trajectory[oidx_sort[i]] - W.write(ab) - return proj, oidx_sort + sorted_proj, sorted_indices_data, sorted_indices_traj = sort_mult_trajs_along_common_tic( + [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_ic=3, start_frame = start_frame + ) + return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_mult_trajs_along_common_tic(data, start_frame, top, trj, out_name, num_tic=3): +def sort_mult_trajs_along_common_tic(data, top, trj, out_name, num_ic=3, start_frame=0): """ - Sort multiple trajectories along their most important common time-lagged independent components. + Sort multiple trajectories along their most important independent components. Parameters ---------- data : list of float arrays List of trajectory data arrays, each [frames,frame_data]. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top : list of str Reference topology files. trj : list of str @@ -285,88 +266,47 @@ def sort_mult_trajs_along_common_tic(data, start_frame, top, trj, out_name, num_ trj[i] should be the same as data[i] was from. out_name : str Core part of the name of the output files. + num_ic : int, optional + Sort along the first num_ic independent components. + Defaults to 3. + start_frame : int or list of int + Offset of the data with respect to the trajectories. + Defaults to 0. 
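# A usage sketch for the two-ensemble wrapper sort_trajs_along_common_tic(),
# which now just forwards to sort_mult_trajs_along_common_tic(). The feature
# arrays are synthetic stand-ins and the file names are placeholders.
import numpy as np
from pensa.dimensionality.tica import sort_trajs_along_common_tic

data_a = np.cumsum(np.random.randn(400, 12), axis=0)
data_b = np.cumsum(np.random.randn(400, 12), axis=0)
sorted_proj, sorted_idx_data, sorted_idx_traj = sort_trajs_along_common_tic(
    data_a, data_b,
    "traj/condition-a_receptor.gro", "traj/condition-b_receptor.gro",  # placeholders
    "traj/condition-a_receptor.xtc", "traj/condition-b_receptor.xtc",  # placeholders
    out_name="tica/receptor_common", num_ic=3,
)
# sorted_proj[0] holds the sorted projections of all frames on the first TIC.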
+ + Returns + ------- + sorted_proj: list + sorted projections on each independent component + sorted_indices_data : list + Sorted indices of the data array for each independent component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each independent component """ num_frames = [len(d) for d in data] num_traj = len(data) + if type(start_frame) == int: + start_frame *= np.ones(num_traj) + start_frame = start_frame.tolist() # Combine the input data - data = np.concatenate(data,0) - # Remember which simulation the data came frome - cond = np.concatenate([i*np.ones(num_frames[i]) for i in range(num_traj)]) - # Remember the index in the respective simulation (taking into account cutoff) - oidx = np.concatenate([np.arange(num_frames[i])+start_frame for i in range(num_traj)]) - # Calculate the time-lagged independent components - tica = pyemma.coordinates.tica(data,dim=3) - # Define the MDAnalysis trajectories from where the frames come - univs = [] - atoms = [] - for j in range(num_traj): - u = mda.Universe(top[j],trj[j]) - univs.append(u) - atoms.append(u.select_atoms('all')) - # Loop over time-lagged independent component. - for evi in range(num_tic): - # Project the combined data on the time-lagged independent component - proj = project_on_tic(data,evi,tica=tica) - # Sort everything along the projection on th resp. PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - cond_sort = cond[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_tic"+str(evi+1)+".xtc", atoms[0].n_atoms) as W: - for i in range(data.shape[0]): - j = cond_sort[i] - ts = univs[j].trajectory[oidx_sort[i]] - W.write(atoms[j]) - return - - -def compare_projections_tica(data_a, data_b, tica, num=3, saveas=None, label_a=None, label_b=None): - """ - Compare two datasets along a given time-lagged indepedent component. - - Parameters - ---------- - data_a : float array - Trajectory data [frames,frame_data] - data_b : float array - Trajectory data [frames,frame_data] - tica : TICA object - Time-lagged independent components information. - num : int, default=3 - Number of time-lagged independent components to plot. - saveas : str, optional - Name of the output file. - label_a : str, optional - Label for the first dataset. - label_b : str, optional - Label for the second dataset. 
- - """ - # Start the figure - fig,ax = plt.subplots(num, 2, figsize=[8,3*num], dpi=300) - # Loop over PCs - for evi in range(num): - # Calculate values along TIC for each frame - proj_a = project_on_tic(data_a, evi, tica=tica) - proj_b = project_on_tic(data_b, evi, tica=tica) - # Plot the time series in the left panel - ax[evi,0].plot(proj_a, alpha=0.5, label=label_a) - ax[evi,0].plot(proj_b, alpha=0.5, label=label_b) - ax[evi,0].set_xlabel('frame number') - ax[evi,0].set_ylabel('TIC %i'%(evi+1)) - # Plot the histograms in the right panel - ax[evi,1].hist(proj_a, bins=30, alpha=0.5, density=True, label=label_a) - ax[evi,1].hist(proj_b, bins=30, alpha=0.5, density=True, label=label_b) - ax[evi,1].set_xlabel('TIC %i'%(evi+1)) - ax[evi,1].set_ylabel('frequency') - # Legend - if label_a and label_b: - ax[evi,0].legend() - ax[evi,1].legend() - fig.tight_layout() - # Save the figure - if saveas is not None: - fig.savefig(saveas, dpi=300) - return + all_data = np.concatenate(data,0) + # Calculate the independent components + tica = pyemma.coordinates.tica(all_data, dim=3) + # Initialize output + sorted_proj = [] + sorted_indices_data = [] + sorted_indices_traj = [] + # Loop over principal components. + for evi in range(num_ic): + # Project the combined data on the independent component + proj = [project_on_tic(d, evi, tica=tica) for d in data] + # Sort everything along the projection on the respective independent component + out_xtc = out_name+"_tic"+str(evi+1)+".xtc" + proj_sort, sort_idx, oidx_sort = merge_and_sort_coordinates( + proj, top, trj, out_xtc, start_frame=start_frame, verbose=False + ) + sorted_proj.append(proj_sort) + sorted_indices_data.append(sort_idx) + sorted_indices_traj.append(oidx_sort) + return sorted_proj, sorted_indices_data, sorted_indices_traj + diff --git a/pensa/dimensionality/visualization.py b/pensa/dimensionality/visualization.py new file mode 100644 index 00000000..7263a7a3 --- /dev/null +++ b/pensa/dimensionality/visualization.py @@ -0,0 +1,183 @@ +import numpy as np +import pyemma +from pyemma.util.contexts import settings +import MDAnalysis as mda +import matplotlib.pyplot as plt +from pensa.preprocessing import sort_coordinates, merge_and_sort_coordinates + + +def project_on_eigenvector(data, ev_idx, ana): + """ + Projects a trajectory onto an eigenvector of its PCA/tICA. + + Parameters + ---------- + data : float array + Trajectory data [frames,frame_data]. + ev_idx : int + Index of the eigenvector to project on (starts with zero). + ana : PCA or tICA obj + Information of pre-calculated PCA or tICA. + Must be calculated for the same features (but not necessarily the same trajectory). + + Returns + ------- + projection : float array + Value along the PC for each frame. + + """ + # Project the features onto the components + projection = np.zeros(data.shape[0]) + for ti in range(data.shape[0]): + projection[ti] = np.dot(data[ti], ana.eigenvectors[:,ev_idx]) + # Return the value along the PC for each frame + return projection + + +def compare_projections(data_a, data_b, ana, num=3, saveas=None, label_a=None, label_b=None): + """ + Compare two datasets along the components of a PCA or tICA. + + Parameters + ---------- + data_a : float array + Trajectory data [frames,frame_data]. + data_b : float array + Trajectory data [frames,frame_data]. + ana : PCA or tICA object + Components analysis information. + num : int + Number of components to plot. + saveas : str, optional + Name of the output file. + label_a : str, optional + Label for the first dataset. 
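# A self-contained check of project_on_eigenvector(): it behaves identically
# for PCA and TICA objects because both expose an .eigenvectors matrix. Only
# synthetic data and a plain PyEMMA PCA are used here.
import numpy as np
import pyemma
from pensa.dimensionality.visualization import project_on_eigenvector

data = np.random.randn(300, 10)
pca = pyemma.coordinates.pca(data)
proj = project_on_eigenvector(data, 0, pca)
assert proj.shape == (300,)
assert np.allclose(proj, data @ pca.eigenvectors[:, 0])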
+ label_b : str, optional + Label for the second dataset. + + Returns: + -------- + projections : list of float arrays + Projections of the trajectory on each component. + + """ + if label_a is not None and label_b is not None: + labels = [label_a, label_b] + else: + labels = None + projections = compare_mult_projections([data_a, data_b], ana, num=num, saveas=saveas, labels=labels, colors=None) + return projections + + +def compare_mult_projections(data, ana, num=3, saveas=None, labels=None, colors=None): + """ + Compare multiple datasets along the components of a PCA or tICA. + + Parameters + ---------- + data : list of float arrays + Data from multiple trajectories [frames,frame_data]. + ana : PCA or tICA object + Components analysis information. + num : int + Number of principal components to plot. + saveas : str, optional + Name of the output file. + labels : list of str, optional + Labels for the datasets. If provided, it must have the same length as data. + + Returns: + -------- + projections : list of float arrays + Projections of the trajectory on each principal component. + + """ + if labels is not None: + assert len(labels) == len(data) + else: + labels = [None for _ in range(len(data))] + if colors is not None: + assert len(colors) == len(data) + else: + colors = ['C%i'%num for num in range(len(data))] + # Start the figure + fig,ax = plt.subplots(num, 2, figsize=[9,3*num], dpi=300) + # Loop over components + projections = [] + for evi in range(num): + proj_evi = [] + for j,d in enumerate(data): + # Calculate values along PC for each frame + proj = project_on_eigenvector(d, evi, ana) + # Plot the time series in the left panel + ax[evi,0].plot(proj, alpha=0.5, + label=labels[j], color=colors[j]) + # Plot the histograms in the right panel + ax[evi,1].hist(proj, bins=30, alpha=0.5, density=True, + label=labels[j], color=colors[j]) + proj_evi.append(proj) + projections.append(proj_evi) + # Axis labels + ax[evi,0].set_xlabel('frame number') + ax[evi,0].set_ylabel('PC %i'%(evi+1)) + ax[evi,1].set_xlabel('PC %i'%(evi+1)) + ax[evi,1].set_ylabel('frequency') + # Legend + if labels[0] is not None: + ax[evi,0].legend() + ax[evi,1].legend() + fig.tight_layout() + # Save the figure + if saveas is not None: + fig.savefig(saveas, dpi=300) + return projections + + +def sort_traj_along_projection(data, ana, top, trj, out_name, num_comp=3, start_frame=0): + """ + Sort a trajectory along given principal components. + + Parameters + ---------- + data : float array + Trajectory data [frames,frame_data]. + ana : PCA or tICA obj + Components information. + top : str + File name of the reference topology for the trajectory. + trj : str + File name of the trajetory from which the frames are picked. + Should be the same as data was from. + out_name : str + Core part of the name of the output files + num_comp : int, optional + Sort along the first num_comp components. + start_frame : int, optional + Offset of the data with respect to the trajectories (defined below). 
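# A usage sketch for compare_mult_projections() with three synthetic ensembles
# that share one PCA. The output file name is arbitrary; the returned list is
# organised per component, then per data set.
import numpy as np
import pyemma
from pensa.dimensionality.visualization import compare_mult_projections

ensembles = [np.random.randn(200, 6) + shift for shift in (0.0, 0.5, 1.0)]
pca = pyemma.coordinates.pca(np.concatenate(ensembles, 0))
projections = compare_mult_projections(
    ensembles, pca, num=2,
    labels=["cond A", "cond B", "cond C"],
    saveas="pc_comparison.png",
)
assert len(projections) == 2 and len(projections[0]) == 3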
+ + Returns + ------- + sorted_proj: list + sorted projections on each component + sorted_indices_data : list + Sorted indices of the data array for each component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each component + + """ + # Initialize output + sorted_proj = [] + sorted_indices_data = [] + sorted_indices_traj = [] + # Loop through the principal components + for evi in range(num_comp): + # Project the combined data on the principal component + proj = project_on_eigenvector(data, evi, ana) + # Sort everything along the projection onto the PC + out_xtc = out_name+"_pc"+str(evi+1)+".xtc" + proj_sort, sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) + sorted_proj.append(proj_sort) + sorted_indices_data.append(sort_idx) + sorted_indices_traj.append(oidx_sort) + return sorted_proj, sorted_indices_data, sorted_indices_traj + diff --git a/pensa/features/processing.py b/pensa/features/processing.py index 52f7c948..34ec2858 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -314,6 +314,5 @@ def sort_traj_along_feature(feat, data, feature_name, ref_name, trj_name, out_na """ if verbose: print('Sorting along feature '+feature_name) d = get_feature_data(feat, data, feature_name) - sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) - d_sorted = d[sort_idx] + d_sorted, sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) return d_sorted diff --git a/pensa/preprocessing/coordinates.py b/pensa/preprocessing/coordinates.py index 8110ee8a..db3a9a68 100644 --- a/pensa/preprocessing/coordinates.py +++ b/pensa/preprocessing/coordinates.py @@ -9,15 +9,15 @@ # -- Functions to preprocess trajectories -- -def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, +def extract_coordinates(top, pdb, trj_list, out_name, sel_string, start_frame=0, rename_segments=None, residues_offset=0 ): """ Extracts selected coordinates from a trajectory file. Parameters ---------- - ref : str - File name for reference topology. + top : str + File name for topology. Can read all MDAnalysis-compatible topology formats. pdb : str File name for the reference PDB file. @@ -30,8 +30,8 @@ def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, First frame to read from the trajectory. """ - # Read the reference+PDB files and extract selected parts. - u = mda.Universe(ref,pdb) + # Read the topology+PDB files and extract selected parts. + u = mda.Universe(top,pdb) u.residues.resids -= residues_offset selection = u.select_atoms(sel_string) num_at = selection.n_atoms @@ -43,7 +43,7 @@ def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, # Read the trajectories and extract selected parts. with mda.Writer(out_name+'.xtc', selection.n_atoms) as W: for trj in trj_list: - u = mda.Universe(ref,trj) + u = mda.Universe(top,trj) u.residues.resids -= residues_offset selection = u.select_atoms(sel_string) for ts in u.trajectory[start_frame:]: @@ -51,14 +51,14 @@ def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, return num_at -def extract_coordinates_combined(ref, trj, sel_string, out_name, start_frame=0, verbose=False): +def extract_coordinates_combined(top, trj, sel_string, out_name, start_frame=0, verbose=False): """ Extracts selected coordinates from several trajectory files. 
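# A usage sketch for the renamed preprocessing arguments: the first argument of
# extract_coordinates() is now called 'top' and accepts any
# MDAnalysis-compatible topology. All file names and the selection string are
# placeholders for a real system.
from pensa.preprocessing.coordinates import extract_coordinates

num_at = extract_coordinates(
    top="system/condition-a.psf",                 # placeholder topology
    pdb="system/condition-a.pdb",                 # placeholder reference PDB
    trj_list=["traj/condition-a_run1.xtc",
              "traj/condition-a_run2.xtc"],       # placeholder trajectories
    out_name="traj/condition-a_receptor",
    sel_string="protein and segid A",             # placeholder MDAnalysis selection
    start_frame=100,
)
print("atoms in selection:", num_at)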
Parameters ---------- - ref : list of str - File names for the reference topologies. + top : list of str + File names for the topologies. Can read all MDAnalysis-compatible topology formats. trj : list of str File names for the input trajectories. @@ -70,12 +70,12 @@ def extract_coordinates_combined(ref, trj, sel_string, out_name, start_frame=0, """ # Determine the number of atoms from the first trajectory - u = mda.Universe(ref[0], trj[0]) + u = mda.Universe(top[0], trj[0]) selection = u.select_atoms(sel_string[0]) num_at = selection.n_atoms # Go through trajectories and write selections with mda.Writer(out_name+'.xtc', num_at) as W: - for r, t, s in zip(ref, trj, sel_string): + for r, t, s in zip(top, trj, sel_string): print(r, t) if verbose: print(s) u = mda.Universe(r, t) @@ -85,14 +85,14 @@ def extract_coordinates_combined(ref, trj, sel_string, out_name, start_frame=0, return num_at -def merge_coordinates(ref_files, trj_files, out_name, segid=None): +def merge_coordinates(top_files, trj_files, out_name, segid=None): """ Merge several trajectories of the same system or system part. All trajectories must be (at least) as long as the first one. Parameters ---------- - ref_files : str[] + top_files : str[] List of input topology files. trj_files : str[]: List of input trajectory files. @@ -107,10 +107,10 @@ def merge_coordinates(ref_files, trj_files, out_name, segid=None): MDAnalysis universe of the merged system. """ - num_parts = len(ref_files) + num_parts = len(top_files) assert num_parts == len(trj_files) # Create an array of universes - u = [ mda.Universe(ref_files[i],trj_files[i]) for i in range(num_parts) ] + u = [ mda.Universe(top_files[i],trj_files[i]) for i in range(num_parts) ] num_frames = len(u[0].trajectory) new_num_at = sum([len(ui.atoms) for ui in u]) # Create the merged starting structure @@ -135,17 +135,17 @@ def merge_coordinates(ref_files, trj_files, out_name, segid=None): return univ -def align_coordinates(ref, pdb, trj_list, out_name, sel_string='all', start_frame=0): +def align_coordinates(top, pdb, trj_list, out_name, sel_string='all', start_frame=0): """ Aligns selected coordinates from a trajectory file. Parameters ---------- - ref : str - File name for reference topology. + top : str + File name for the topology. Can read all MDAnalysis-compatible topology formats. pdb : str - File name for reference PDB file. + File name for the reference PDB file. trj_list : list of str File names for the input trajectory. Can read all MDAnalysis-compatible trajectory formats. @@ -154,46 +154,49 @@ def align_coordinates(ref, pdb, trj_list, out_name, sel_string='all', start_fram start_frame : int, optional First frame to read from the trajectory. """ - # Read the reference+PDB files and align selected parts. - u = mda.Universe(ref, pdb) + # Read the topology+PDB files and align selected parts. + u = mda.Universe(top, pdb) for trj in trj_list: - mobile = mda.Universe(ref, trj) + mobile = mda.Universe(top, trj) #mobile.trajectory = mobile.trajectory[start_frame:] alignment = align.AlignTraj(mobile, u, select=sel_string, filename=f'{out_name}.xtc') alignment.run() -def sort_coordinates(values, ref_name, trj_name, out_name, start_frame=0, verbose=False): +def sort_coordinates(values, top_name, trj_name, out_name, start_frame=0, verbose=False): """ - Sort coordinate frames along an array of values. + Sort coordinate frames along corresponding values. Parameters ---------- - values: float array. + values: float array Values along which to sort the trajectory. - ref_name: string. 
- reference topology for the trajectory. - trj_name: string. + top_name: str + Topology for the trajectory. + trj_name: str Trajetory from which the frames are picked. Usually the same as the values are from. - out_name: string. - Name of the output files + out_name: str + Name of the output trajectory (usual format is .xtc). start_frame: int Offset of the data with respect to the trajectories. Returns ------- + data_sort: float array + Sorted values of the input data. sort_idx: float array Sorted indices of the values. oidx_sort: float array Sorted indices of the trajectory. + data_sort: float array """ # Remember the index in the simulation (taking into account offset) - oidx = np.arange(len(values))+start_frame + oidx = np.arange(len(values)) + start_frame # Define the MDAnalysis trajectory from where the frames come - if verbose: print('Loading:', ref_name, trj_name) - u = mda.Universe(ref_name, trj_name) + if verbose: print('Loading:', top_name, trj_name) + u = mda.Universe(top_name, trj_name) if verbose: print('Trajectory length:', len(u.trajectory)) print('Number of values: ', len(values)) @@ -201,12 +204,93 @@ def sort_coordinates(values, ref_name, trj_name, out_name, start_frame=0, verbos a = u.select_atoms('all') # Sort everything along the projection on the values sort_idx = np.argsort(values) + data_sort = values[sort_idx] oidx_sort = oidx[sort_idx] # Write out sorted trajectory with mda.Writer(out_name, a.n_atoms) as W: for i in range(len(values)): ts = u.trajectory[oidx_sort[i]] W.write(a) - return sort_idx, oidx_sort + return data_sort, sort_idx, oidx_sort +def merge_and_sort_coordinates(values, top_names, trj_names, out_name, start_frame=0, verbose=False): + """ + Write multiple trajectories of coordinate frames into one trajectory, sorted along corresponding values. + + Parameters + ---------- + values: list of float arrays + Values along which to sort the trajectory. + top_names: list of str + topology for the trajectory. + trj_names: list of str + Trajetory from which the frames are picked. + Usually the same as the values are from. + out_name: str + Name of the output trajectory (usual format is .xtc). + start_frame: int or list of int + Offsets of the data with respect to the trajectories. + Defaults to zero. + + Returns + ------- + data_sort: float array + Sorted values of the input data. + sort_idx: float array + Sorted indices of the values. + oidx_sort: float array + Sorted indices of the trajectory. 
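# A usage sketch for the extended sort_coordinates() return signature: the
# sorted values are now returned as well, so callers no longer have to
# re-index the input array themselves. The value file and trajectory files are
# placeholders; the import line is the one used in this diff.
import numpy as np
from pensa.preprocessing import sort_coordinates

values = np.loadtxt("results/condition-a_feature.dat")     # placeholder per-frame values
data_sort, sort_idx, oidx_sort = sort_coordinates(
    values,
    "traj/condition-a_receptor.gro",                       # placeholder topology
    "traj/condition-a_receptor.xtc",                       # placeholder trajectory
    "sorted/condition-a_by_feature.xtc",                   # output trajectory
    start_frame=0,
)
assert np.all(data_sort == values[sort_idx])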
+ + """ + assert type(values) == list and type(top_names) == list and type(trj_names) == list + # Get some stats + num_traj = len(values) + num_frames = [len(val) for val in values] + # Number of values must be consistent with topologies and trajectories + assert num_traj == len(top_names) + assert num_traj == len(trj_names) + # Set offset if not provided + assert type(start_frame) == int or len(start_frame) == num_traj + if type(start_frame) == int: + start_frame *= np.ones(num_traj) + start_frame = start_frame.tolist() + # Make sure the start indices are integers (MDA does not accept floats for indexing a trajectory) + start_frame = [int(sf) for sf in start_frame] + + # Combine the input data + data = np.concatenate(values) + # Remember which simulation the data came frome + cond = np.concatenate([i*np.ones(num_frames[i], dtype=int) for i in range(num_traj)]) + # Remember the index in the respective simulation (taking into account cutoff) + oidx = np.concatenate([np.arange(num_frames[i], dtype=int) + start_frame[i] for i in range(num_traj)]) + assert type(oidx[0]==int) + + # Define the MDAnalysis trajectories from where the frames come + univs = [] + atoms = [] + for j in range(num_traj): + u = mda.Universe(top_names[j],trj_names[j]) + if verbose: print('Length of trajectory',len(u.trajectory)) + univs.append(u) + atoms.append(u.select_atoms('all')) + # Make sure the numbers of atoms are the same in each trajectory + assert atoms[j].n_atoms == atoms[0].n_atoms + + # Sort everything along the data + sort_idx = np.argsort(data) + data_sort = data[sort_idx] + cond_sort = cond[sort_idx] + oidx_sort = oidx[sort_idx] + + # Write the trajectory, ordered along the PC + with mda.Writer(out_name, atoms[0].n_atoms) as W: + for i in range(len(data)): + j = cond_sort[i] + o = oidx_sort[i] + uj = univs[j] + ts = uj.trajectory[o] + W.write(atoms[j]) + + return data_sort, sort_idx, oidx_sort + diff --git a/scripts/calculate_combined_principal_components.py b/scripts/calculate_combined_principal_components.py index 3e0b58d7..d25fe8a3 100644 --- a/scripts/calculate_combined_principal_components.py +++ b/scripts/calculate_combined_principal_components.py @@ -68,9 +68,9 @@ args.num_components, args.feat_threshold, plot_file=args.out_plots+"_"+ftype+"_feature_correlation.pdf") # Sort each of the trajectories along the top components of combined data - sort_trajs_along_common_pc(data_a[ftype], data_b[ftype], args.start_frame, + sort_trajs_along_common_pc(data_a[ftype], data_b[ftype], args.ref_file_a, args.ref_file_b, args.trj_file_a, args.trj_file_b, - args.out_pc, num_pc=args.num_components) + args.out_pc, num_pc=args.num_components, start_frame=args.start_frame) # Plot histograms of both simulations along the common PCs compare_projections(data_a[ftype], data_b[ftype], pca, num=args.num_components, saveas=args.out_plots+"_"+ftype+"_pc-comparison.pdf") diff --git a/tests/test_workflow.py b/tests/test_workflow.py index b9ddd99f..5ef9a44c 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -81,23 +81,23 @@ def setUp(self): # -- Sort trajectory along common pc self.sort_common_traj = sort_trajs_along_common_pc(self.sim_a_tmr_data['bb-torsions'], self.sim_b_tmr_data['bb-torsions'], - start_frame, test_data_path + "/traj/condition-a_receptor.gro", test_data_path + "/traj/condition-b_receptor.gro", test_data_path + "/traj/condition-a_receptor.xtc", test_data_path + "/traj/condition-b_receptor.xtc", test_data_path + "/pca/receptor_by_tmr", - num_pc=3) + num_pc=3, start_frame = start_frame) 
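# A usage sketch for merge_and_sort_coordinates(): frames from several
# trajectories are written into a single output file, ordered by the supplied
# per-frame values. The arrays are stand-ins and the file names placeholders;
# start_frame can be a single offset or one offset per trajectory.
import numpy as np
from pensa.preprocessing import merge_and_sort_coordinates

values_a = np.random.rand(1000)          # stand-in for projections of ensemble A
values_b = np.random.rand(800)           # stand-in for projections of ensemble B
data_sort, sort_idx, oidx_sort = merge_and_sort_coordinates(
    [values_a, values_b],
    ["traj/condition-a_receptor.gro", "traj/condition-b_receptor.gro"],   # placeholders
    ["traj/condition-a_receptor.xtc", "traj/condition-b_receptor.xtc"],   # placeholders
    "receptor_sorted_by_value.xtc",
    start_frame=[0, 2000],               # one offset per input trajectory
)
# oidx_sort maps every output frame back to its index in the source trajectory.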
plt.close() # -- Sort trajectory pc pca_a = calculate_pca(self.sim_a_tmr_data['bb-torsions']) pca_features(pca_a, self.sim_a_tmr_feat['bb-torsions'], 3, 0.4) plt.close() - self.all_sort, _ = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], pca_a, 0, - test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", num_pc=3) + self.all_sort, _, _ = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], + test_data_path + "/traj/condition-a_receptor.gro", + test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/pca/condition-a_receptor_by_tmr", + pca=pca_a, num_pc=3) # -- Compare projections self.val = compare_projections(self.sim_a_tmr_data['bb-torsions'], @@ -273,31 +273,31 @@ def test_tica_features(self): # -- sort_trajs_along_common_pc() + sort_traj_along_pc() + project_on_pc() def test_sort_trajs_along_pc(self): - self.assertEqual(len(self.sort_common_traj), 180) for ele in self.sort_common_traj: - self.assertEqual(ele.n_atoms, 2322) + self.assertEqual(len(ele), 3) self.assertEqual(len(self.all_sort), 3) # -- sort_trajs_along_common_tic() def test_sort_trajs_along_common_tic(self): - proj, atom = sort_trajs_along_common_tic(self.sim_a_tmr_data['bb-torsions'], - self.sim_b_tmr_data['bb-torsions'], 0, + sproj, sidx_data, sidx_traj = sort_trajs_along_common_tic(self.sim_a_tmr_data['bb-torsions'], + self.sim_b_tmr_data['bb-torsions'], test_data_path + "/traj/condition-a_receptor.gro", test_data_path + "/traj/condition-b_receptor.gro", test_data_path + "/traj/condition-a_receptor.xtc", test_data_path + "/traj/condition-b_receptor.xtc", test_data_path + "/tica/receptor_by_tmr", - num_tic=3) - self.assertEqual(len(proj), 60) - self.assertEqual(len(atom), 60) + num_ic=3) + self.assertEqual(len(sproj[0]), 60) + self.assertEqual(len(sidx_data[0]), 60) # -- sort_traj_along_tic() def test_sort_traj_along_tic(self): - all_sort, _ = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], self.tica_combined, 0, - test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", num_tic=3) + all_sort, _, _ = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], + test_data_path + "/traj/condition-a_receptor.gro", + test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/pca/condition-a_receptor_by_tmr", + tica = self.tica_combined, num_ic=3) self.assertEqual(len(all_sort), 3) From cdb837816cd4e74a828e54a715bc03d132a38c77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Wed, 18 Aug 2021 01:01:35 -0700 Subject: [PATCH 18/23] dim and lag options + bugfixes --- pensa/dimensionality/pca.py | 33 +++++++++------ pensa/dimensionality/tica.py | 79 ++++++++++++++++++++++-------------- 2 files changed, 69 insertions(+), 43 deletions(-) diff --git a/pensa/dimensionality/pca.py b/pensa/dimensionality/pca.py index 6d79d4cb..2edbf756 100644 --- a/pensa/dimensionality/pca.py +++ b/pensa/dimensionality/pca.py @@ -12,7 +12,7 @@ # http://www.emma-project.org/latest/api/generated/pyemma.coordinates.pca.html -def calculate_pca(data): +def calculate_pca(data, dim=-1): """ Performs a PyEMMA PCA on the provided data. @@ -20,6 +20,9 @@ def calculate_pca(data): ---------- data : float array Trajectory data [frames,frame_data]. + dim : int, optional, default -1 + The number of dimensions (principal components) to project onto. 
+        -1 means all numerically available dimensions will be used.
+
     Returns
     -------
@@ -27,7 +30,7 @@
         Principal components information.
 
     """
-    pca = pyemma.coordinates.pca(data)
+    pca = pyemma.coordinates.pca(data, dim=dim)
     return pca
 
 
@@ -98,7 +101,7 @@ def pca_features(pca, features, num, threshold, plot_file=None):
     return test_graph, test_corr
 
 
-def project_on_pc(data, ev_idx, pca=None):
+def project_on_pc(data, ev_idx, pca=None, dim=-1):
     """
     Projects a trajectory onto an eigenvector of its PCA.
 
@@ -111,7 +114,9 @@
     pca : PCA obj, optional
         Information of pre-calculated PCA. Defaults to None.
         Must be calculated for the same features (but not necessarily the same trajectory).
-
+    dim : int, optional, default -1
+        The number of dimensions (principal components) to project onto.
+        Only used if pca is not provided.
     Returns
     -------
         projection : float array
@@ -119,14 +124,13 @@
 
     """
     # Perform PCA if none is provided.
-    if pca is None:
-        pca = pyemma.coordinates.pca(data)
+    if pca is None: pca = calculate_pca(data, dim=dim)
     # Project the features onto the principal components.
     projection = project_on_eigenvector(data, ev_idx, pca)
     return projection
 
 
-def get_components_pca(data, num, pca=None, prefix=''):
+def get_components_pca(data, num, pca=None, dim=-1, prefix=''):
     """
     Projects a trajectory onto the first num eigenvectors of its PCA.
 
@@ -139,6 +143,11 @@
     pca : PCA obj, optional
         Information of pre-calculated PCA. Defaults to None.
         Must be calculated for the same features (but not necessarily the same trajectory).
+    dim : int, optional, default = -1
+        The number of dimensions (principal components) to project onto.
+        Only used if pca is not provided.
+    prefix : str, optional, default = ''
+        First part of the component names. Second part is "PC" and the component number.
 
     Returns
     -------
@@ -149,8 +158,7 @@
     """
     # Perform PCA if none is provided
-    if pca is None:
-        pca = pyemma.coordinates.pca(data)
+    if pca is None: pca = calculate_pca(data, dim=dim)
     # Project the features onto the principal components
     comp_names = []
     components = []
@@ -201,8 +209,7 @@
     """
     # Calculate the principal components if they are not given.
-    if pca is None:
-        pca = pyemma.coordinates.pca(all_data, dim=3)
+    if pca is None: pca = calculate_pca(data, dim=num_pc)
     # Sort the trajectory along them.
     sorted_proj, sorted_indices_data, sorted_indices_traj = sort_traj_along_projection(
         data, pca, top, trj, out_name, num_comp=num_pc, start_frame = start_frame
@@ -250,7 +257,7 @@ def sort_trajs_along_common_pc(data_a, data_b, top_a, top_b, trj_a, trj_b, out_n
     """
     sorted_proj, sorted_indices_data, sorted_indices_traj = sort_mult_trajs_along_common_pc(
-        [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_pc=3, start_frame = start_frame
+        [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_pc=num_pc, start_frame = start_frame
     )
     return sorted_proj, sorted_indices_data, sorted_indices_traj
@@ -295,7 +302,7 @@ def sort_mult_trajs_along_common_pc(data, top, trj, out_name, num_pc=3, start_fr
     # Combine the input data
     all_data = np.concatenate(data,0)
     # Calculate the principal component
-    pca = pyemma.coordinates.pca(all_data, dim=3)
+    pca = calculate_pca(all_data)
     # Initialize output
     sorted_proj = []
     sorted_indices_data = []
diff --git a/pensa/dimensionality/tica.py b/pensa/dimensionality/tica.py
index 01546eaa..062ef5b0 100644
--- a/pensa/dimensionality/tica.py
+++ b/pensa/dimensionality/tica.py
@@ -12,7 +12,7 @@
 # http://emma-project.org/latest/api/generated/pyemma.coordinates.tica.html
 
 
-def calculate_tica(data):
+def calculate_tica(data, dim=-1, lag=10):
     """
     Performs a PyEMMA TICA on the provided data.
 
@@ -20,6 +20,11 @@
     ----------
     data : float array
         Trajectory data. Format: [frames,frame_data].
+    dim : int, optional, default -1
+        The number of dimensions (independent components) to project onto.
+        -1 means all numerically available dimensions will be used.
+    lag : int, optional, default = 10
+        The lag time, in multiples of the input time step.
 
     Returns
     -------
@@ -27,13 +32,13 @@
         Time-lagged independent component information.
 
     """
-    tica = pyemma.coordinates.tica(data)
+    tica = pyemma.coordinates.tica(data, dim=dim, lag=lag)
     return tica
 
 
 def tica_eigenvalues_plot(tica, num=12, plot_file=None):
     """
-    Plots the highest eigenvalues over the numberr of the time-lagged independent components.
+    Plots the highest eigenvalues over the number of the time-lagged independent components.
 
     Parameters
     ----------
@@ -94,7 +99,7 @@ def tica_features(tica, features, num, threshold, plot_file=None):
     return test_feature
 
 
-def project_on_tic(data, ev_idx, tica=None):
+def project_on_tic(data, ev_idx, tica=None, dim=-1, lag=10):
     """
     Projects a trajectory onto an eigenvector of its TICA.
 
@@ -107,22 +112,27 @@
     tica : TICA obj, optional
         Information of pre-calculated TICA.
         Must be calculated for the same features (but not necessarily the same trajectory).
-
+    dim : int, optional, default -1
+        The number of dimensions (independent components) to project onto.
+        Only used if tica is not provided.
+    lag : int, optional, default = 10
+        The lag time, in multiples of the input time step.
+        Only used if tica is not provided.
+
     Returns
     -------
         projection : float array
             Value along the TIC for each frame.
 
     """
-    # Perform TICA if none is provided.
-    if tica is None:
-        tica = pyemma.coordinates.tica(data)
+    # Perform standard TICA if none is provided.
+    if tica is None: tica = calculate_tica(data, dim=dim, lag=lag)
     # Project the features onto the time-lagged independent components.
projection = project_on_eigenvector(data, ev_idx, tica) return projection -def get_components_tica(data, num, tica=None, prefix=''): +def get_components_tica(data, num, tica=None, dim=-1, lag=10, prefix=''): """ Projects a trajectory onto the first num eigenvectors of its tICA. @@ -135,6 +145,14 @@ def get_components_tica(data, num, tica=None, prefix=''): tica : tICA obj, optional Information of pre-calculated tICA. Defaults to None. Must be calculated for the same features (but not necessarily the same trajectory). + dim : int, optional, default -1 + The number of dimensions (independent components) to project onto. + Only used if tica is not provided. + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + prefix : str, optional, default = '' + First part of the component names. Second part is "IC"+ Returns ------- @@ -145,8 +163,7 @@ def get_components_tica(data, num, tica=None, prefix=''): """ # Perform tICA if none is provided. - if tica is None: - tica = pyemma.coordinates.tica(data) + if tica is None: calculate_tica(data, lag=lag) # Project the features onto the principal components. comp_names = [] components = [] @@ -160,7 +177,7 @@ def get_components_tica(data, num, tica=None, prefix=''): return comp_names, np.array(components).T -def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, start_frame=0): +def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, lag=10, start_frame=0): """ Sort a trajectory along independent components. @@ -179,12 +196,13 @@ def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, start_fra Time-lagged independent components information. If none is provided, it will be calculated. Defaults to None. - num_ic : int, optional + num_ic : int, optional, default = 3 Sort along the first num_ic independent components. - Defaults to 3. - start_frame : int, optional + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + start_frame : int, optional, default = 0 Offset of the data with respect to the trajectories (defined below). - Defaults to 0. Returns ------- @@ -197,8 +215,7 @@ def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, start_fra """ # Calculate the principal components if they are not given. - if tica is None: - tica = pyemma.coordinates.tica(all_data, dim=3) + if tica is None: tica = calculate_tica(all_data, dim=num_ic, lag=lag) # Sort the trajectory along them. sorted_proj, sorted_indices_data, sorted_indices_traj = sort_traj_along_projection( data, tica, top, trj, out_name, num_comp=num_ic, start_frame=start_frame @@ -206,7 +223,7 @@ def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, start_fra return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_trajs_along_common_tic(data_a, data_b, top_a, top_b, trj_a, trj_b, out_name, num_ic=3, start_frame=0): +def sort_trajs_along_common_tic(data_a, data_b, top_a, top_b, trj_a, trj_b, out_name, num_ic=3, lag=10, start_frame=0): """ Sort two trajectories along their most important common time-lagged independent components. @@ -228,12 +245,13 @@ def sort_trajs_along_common_tic(data_a, data_b, top_a, top_b, trj_a, trj_b, out_ Should be the same as data_b was from. out_name : str Core part of the name of the output files. - num_ic : int, optional + num_ic : int, optional, default = 3 Sort along the first num_ic independent components. - Defaults to 3. 
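# A self-contained sketch of the new dim/lag options: a tICA with an explicit
# lag time and named components extracted from it. The synthetic, correlated
# data only serves to make the call runnable.
import numpy as np
from pensa.dimensionality.tica import calculate_tica, get_components_tica

data = np.cumsum(np.random.randn(1000, 12), axis=0)
tica = calculate_tica(data, lag=50)                    # lag of 50 frames instead of 10
names, components = get_components_tica(data, 3, tica=tica, prefix="combined ")
assert names == ["combined IC1", "combined IC2", "combined IC3"]
assert components.shape == (1000, 3)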
- start_frame : int, optional + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + start_frame : int, optional, default = 0 Offset of the data with respect to the trajectories (defined below). - Defaults to 0. Returns ------- @@ -246,12 +264,12 @@ def sort_trajs_along_common_tic(data_a, data_b, top_a, top_b, trj_a, trj_b, out_ """ sorted_proj, sorted_indices_data, sorted_indices_traj = sort_mult_trajs_along_common_tic( - [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_ic=3, start_frame = start_frame + [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_ic=num_ic, lag=lag, start_frame = start_frame ) return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_mult_trajs_along_common_tic(data, top, trj, out_name, num_ic=3, start_frame=0): +def sort_mult_trajs_along_common_tic(data, top, trj, out_name, num_ic=3, lag=10, start_frame=0): """ Sort multiple trajectories along their most important independent components. @@ -266,12 +284,13 @@ def sort_mult_trajs_along_common_tic(data, top, trj, out_name, num_ic=3, start_f trj[i] should be the same as data[i] was from. out_name : str Core part of the name of the output files. - num_ic : int, optional + num_ic : int, optional, default = 3 Sort along the first num_ic independent components. - Defaults to 3. - start_frame : int or list of int + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + start_frame : int or list of int, default = 0 Offset of the data with respect to the trajectories. - Defaults to 0. Returns ------- @@ -291,7 +310,7 @@ def sort_mult_trajs_along_common_tic(data, top, trj, out_name, num_ic=3, start_f # Combine the input data all_data = np.concatenate(data,0) # Calculate the independent components - tica = pyemma.coordinates.tica(all_data, dim=3) + tica = pyemma.coordinates.tica(all_data, lag=lag) # Initialize output sorted_proj = [] sorted_indices_data = [] From 4e411fa21dae74a3b06f1adc20f9ebeb83cbad11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Wed, 18 Aug 2021 01:08:35 -0700 Subject: [PATCH 19/23] bugfix --- pensa/dimensionality/tica.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pensa/dimensionality/tica.py b/pensa/dimensionality/tica.py index 062ef5b0..df473b43 100644 --- a/pensa/dimensionality/tica.py +++ b/pensa/dimensionality/tica.py @@ -215,11 +215,11 @@ def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, lag=10, s """ # Calculate the principal components if they are not given. - if tica is None: tica = calculate_tica(all_data, dim=num_ic, lag=lag) + if tica is None: tica = calculate_tica(data, dim=num_ic, lag=lag) # Sort the trajectory along them. 
sorted_proj, sorted_indices_data, sorted_indices_traj = sort_traj_along_projection( data, tica, top, trj, out_name, num_comp=num_ic, start_frame=start_frame - ) + ) return sorted_proj, sorted_indices_data, sorted_indices_traj From db49325eb7e6b7612eaa69570d7b4253f8021d73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Thu, 19 Aug 2021 08:03:23 -0700 Subject: [PATCH 20/23] fix bugs in distance heatmap --- pensa/comparison/visualization.py | 74 ++++++++++++++++++------------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/pensa/comparison/visualization.py b/pensa/comparison/visualization.py index 44131d06..0cb2f5a0 100644 --- a/pensa/comparison/visualization.py +++ b/pensa/comparison/visualization.py @@ -175,7 +175,7 @@ def pair_features_heatmap(feat_names, feat_diff, plot_filename, separator=' - ', def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, - vmin=None, vmax=None, symmetric=True, verbose=True, cbar_label=None): + vmin=None, vmax=None, symmetric=True, verbose=False, cbar_label=None, tick_step=50): """ Visualizes data per residue pair in a heatmap. @@ -187,21 +187,23 @@ def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, Data to be plotted for each residue-pair feature. plot_filename : str Name of the file for the plot. - res1_pos : int + res1_pos : int, optional, default = 2 Position of the 1st residue ID in the feature name when separated by ' '. - res2_pos : int + res2_pos : int, optional, default = 6 Position of the 2nd residue ID in the feature name when separated by ' '. - vmin : float, optional + vmin : float, optional, default = None Minimum value for the heatmap. - vmax : float, optional + vmax : float, optional, default = None Maximum value for the heatmap. - symmetric : bool, optional + symmetric : bool, optional, default = True The matrix is symmetric and values provided only for the upper or lower triangle. Defaults to True. - verbose : bool, optional + verbose : bool, optional, default = False Print numbers of first and last residue. Defaults to True. - cbar_label : str, optional + cbar_label : str, optional, default = None Label for the color bar. + tick_step : int, optional, default = 50 + Step between two ticks on the plot axes. Returns ------- @@ -209,17 +211,21 @@ def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, Matrix with the values of the difference/divergence. 
""" - firstres = int(feat_names[0].split(' ')[res1_pos]) - lastres = int(feat_names[-1].split(' ')[res1_pos]) - if verbose: - print('first res:', firstres, ', last res:', lastres) - size = lastres-firstres+2 + # Find first and last residue + rn1 = [int(fn.split(' ')[res1_pos]) for fn in feat_names] + rn2 = [int(fn.split(' ')[res2_pos]) for fn in feat_names] + resnums = np.concatenate([np.array(rn1,dtype=int),np.array(rn2,dtype=int)]) + first_res = resnums.min() + last_res = resnums.max() + if verbose: print('first res:', first_res, ', last res:', last_res) + # Create a 2D array with the values + size = last_res - first_res + 1 diff = np.zeros([size,size]) - for n,name in enumerate(feat_names): + for n, name in enumerate(feat_names): splitname = name.split(' ') - resi,resj = int(splitname[res1_pos]),int(splitname[res2_pos]) - i = resi - firstres - j = resj - firstres + resi,resj = int(splitname[res1_pos]), int(splitname[res2_pos]) + i = resi - first_res + j = resj - first_res diff[i,j] = feat_diff[n] if symmetric: diff[j,i] = feat_diff[n] @@ -227,10 +233,15 @@ def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, fig,ax = plt.subplots(1,1,figsize=[6,4],dpi=300) img = ax.imshow(diff, vmin=vmin, vmax=vmax) ax.xaxis.set_ticks_position('top') - ax.set_xticks(np.arange(50-firstres,lastres-firstres+1,50)) - ax.set_yticks(np.arange(50-firstres,lastres-firstres+1,50)) - ax.set_xticklabels(np.arange(50,lastres+1,50)) - ax.set_yticklabels(np.arange(50,lastres+1,50)) + # Find position for the first tick + first_tick = 0 + while first_res > first_tick: + first_tick += tick_step + # Ticks and labels + ax.set_xticks(np.arange(first_tick-first_res, size, tick_step)) + ax.set_yticks(np.arange(first_tick-first_res, size, tick_step)) + ax.set_xticklabels(np.arange(first_tick, last_res+1, tick_step)) + ax.set_yticklabels(np.arange(first_tick, last_res+1, tick_step)) ax.xaxis.set_label_position('top') ax.set_xlabel('residue number') ax.set_ylabel('residue number') @@ -241,7 +252,8 @@ def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, def distances_visualization(dist_names, dist_diff, plot_filename, - vmin=None, vmax=None, verbose=True, cbar_label=None): + vmin=None, vmax=None, verbose=True, + cbar_label=None, tick_step=50): """ Visualizes distance features for pairs of residues in a heatmap. @@ -254,25 +266,27 @@ def distances_visualization(dist_names, dist_diff, plot_filename, Data for each distance feature. plot_filename : str Name of the file for the plot. - vmin : float, optional + vmin : float, optional, default = None Minimum value for the heatmap. - vmax : float, optional + vmax : float, optional, default = None Maximum value for the heatmap. - verbose : bool, optional + verbose : bool, optional, default = False Print numbers of first and last residue. Defaults to True. - cbar_label : str, optional + cbar_label : str, optional, default = None Label for the color bar. - + tick_step : int, optional, default = 50 + Step between two ticks on the plot axes. + Returns ------- diff : float array Distance matrix. 
""" - if verbose: - print('Plotting heatmap for distance features.') + if verbose: print('Plotting heatmap for distance features.') diff = resnum_heatmap(dist_names, dist_diff, plot_filename, res1_pos=2, res2_pos=6, - vmin=vmin, vmax=vmax, verbose=verbose, cbar_label=cbar_label) + vmin=vmin, vmax=vmax, verbose=verbose, cbar_label=cbar_label, + tick_step=tick_step) return diff From 7316f6aba88d8de22c590c6f9b59936bdabe6319 Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Wed, 8 Sep 2021 16:55:34 -0700 Subject: [PATCH 21/23] Added docstrings to various functions in modified files. --- pensa/comparison/metrics.py | 1 + pensa/features/processing.py | 44 ++++++++++++++++++++++++++++++++-- pensa/features/txt_features.py | 22 ++++++++++++++++- 3 files changed, 64 insertions(+), 3 deletions(-) diff --git a/pensa/comparison/metrics.py b/pensa/comparison/metrics.py index b276f9b1..ecb7aaca 100644 --- a/pensa/comparison/metrics.py +++ b/pensa/comparison/metrics.py @@ -1,6 +1,7 @@ import numpy as np from pensa import * from pensa.comparison import * +from pensa.dimensionality import * import random import math diff --git a/pensa/features/processing.py b/pensa/features/processing.py index 5aa8efb7..af267485 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -263,7 +263,7 @@ def sort_features_alphabetically(tors, data): def sort_distances_by_resnum(dist, data): """ - Sort distance features by the residue number.. + Sort distance features by the residue number. Parameters ---------- dist : list of str @@ -284,6 +284,24 @@ def sort_distances_by_resnum(dist, data): def select_common_features(features_a, features_b, boolean=True): + """ + Finds features in common between two trajectories. + + Parameters + ---------- + features_a : list of str + First set of features. + features_b : list of str + Second set of features. + boolean : bool + Determines if returned array contains booleans or features. + Returns + ------- + common_a : np array of bool or str + Common features taken from features_a. + common_b : np array of bool or str + Common features taken from features_b. + """ intersect = set(features_a).intersection(features_b) if boolean: is_common_a = [f in intersect for f in features_a] @@ -291,10 +309,32 @@ def select_common_features(features_a, features_b, boolean=True): else: is_common_a = [f for f in features_a if f in intersect] is_common_b = [f for f in features_b if f in intersect] - return np.array(is_common_a), np.array(is_common_b) + common_a = np.array(is_common_a) + common_b = n.array(is_common_b) + return common_a, common_b def get_common_features_data(features_a, features_b, data_a, data_b): + """ + Finds common features and corresponding data from two trajectories. + + Parameters + ---------- + features_a : list of str + First set of features. + features_b : list of str + Second set of features. + data_a : float array + Data from first trajectory. + data_b : float array + Data from second trajectory. + Returns + ------- + new_features_a, new_features_b : np array of str + Common features between the two trajectories. + new_data_a, new_data_b : float array + Data corresponding to common features between the two trajectories. 
+ """ is_common_a, is_common_b = select_common_features(features_a, features_b) new_data_a = data_a[:,is_common_a] new_data_b = data_b[:, is_common_b] diff --git a/pensa/features/txt_features.py b/pensa/features/txt_features.py index c9206360..7f57cac5 100644 --- a/pensa/features/txt_features.py +++ b/pensa/features/txt_features.py @@ -6,9 +6,29 @@ def get_txt_features_ala2(filename, num_frames, cossin=False): + """ + Parses features for ala2 from a text file. + The text file must be formatted with "phi", followed by all phi angles, a blank line, + followed by "psi" and all psi angles, with one angle per line. + + Parameters + ---------- + filename : str + File name of the text file. + num_frames : int + Maximum number of trajectory frames used in features array. + cossin : bool + Determines if the features array contains torsion angles or the sin and cos of torsion angles. + + Returns + ------- + features : numpy array + Data for all features + + """ phi = [] psi = [] - + curr = 'phi' with open(filename) as f: for s in f.readlines(): if s == 'phi\n' or s == 'psi\n': From 4ae2fb64d6026ded442e5666b5b7727ec55eaaba Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Wed, 8 Sep 2021 22:36:37 -0700 Subject: [PATCH 22/23] Fixed typo in processing.py --- pensa/features/processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pensa/features/processing.py b/pensa/features/processing.py index af267485..4003a3e6 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -310,7 +310,7 @@ def select_common_features(features_a, features_b, boolean=True): is_common_a = [f for f in features_a if f in intersect] is_common_b = [f for f in features_b if f in intersect] common_a = np.array(is_common_a) - common_b = n.array(is_common_b) + common_b = np.array(is_common_b) return common_a, common_b From 9adc9b8e56b401e69469958651986350e40faf8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Fri, 10 Sep 2021 06:55:52 -0700 Subject: [PATCH 23/23] v0.2.5 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4dd2d823..ddb50063 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup(name='pensa', - version='0.2.4', + version='0.2.5', description='PENSA - protein ensemble analysis', url='http://github.com/drorlab/pensa', author='Martin Voegele, Neil Thomson, Sang Truong, Jasper McAvity',