From 06ec3154700ab7bfec863b83d45e35e69d685919 Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Fri, 16 Jul 2021 11:36:08 -0700 Subject: [PATCH 01/23] Added metrics.py file. --- pensa/comparison/metrics.py | 156 ++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 pensa/comparison/metrics.py diff --git a/pensa/comparison/metrics.py b/pensa/comparison/metrics.py new file mode 100644 index 00000000..d159781d --- /dev/null +++ b/pensa/comparison/metrics.py @@ -0,0 +1,156 @@ +import numpy as np +from pensa import * + + + """ + Calculates the average and maximum Jensen-Shannon distance and the Kullback-Leibler divergences for each feature from two ensembles. Each of four functions uses the relative_entropy_analysis function with the same parameters. + + Parameters + ---------- + features_a : list of str + Feature names of the first ensemble. + Can be obtained from features object via .describe(). + features_b : list of str + Feature names of the first ensemble. + Can be obtained from features object via .describe(). + Must be the same as features_a. Provided as a sanity check. + all_data_a : float array + Trajectory data from the first ensemble. Format: [frames,frame_data]. + all_data_b : float array + Trajectory data from the second ensemble. + For kld functions, the second ensemble should be the reference ensemble. + Format: [frames,frame_data]. + bin_width : float, default=None + Bin width for the axis to compare the distributions on. + If bin_width is None, bin_num (see below) bins are used and the width is determined from the common histogram. + bin_num : int, default=10 + Number of bins for the axis to compare the distributions on (only if bin_width=None). + verbose : bool, default=True + Print intermediate results. + override_name_check : bool, default=False + Only check number of features, not their names. + + Returns + ------- + Each function returns one value. + + average_jsd : float + Average Jensen-Shannon distance from two ensembles. + max_jsd : float + Maximum Jensen-Shannon distance from two ensembles. + average_kld : float + Average Kullback-Leibler divergence from two ensembles. + max_kld : float + Maximum Kullback-Leibler divergence from two ensembles. 
+ """ + +def average_jsd(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): + _, data_jsdist, _, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) + return np.mean(data_jsdist) + + +def max_jsd(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): + _, data_jsdist, _, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) + return np.max(data_jsdist) + + +def average_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): + _, _, data_kld_ab, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) + return np.mean(data_kld_ab) + + +def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): + _, _, data_kld_ab, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) + return np.max(data_kld_ab) + + + """ + Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. Each of two functions uses the kolmogorov_smirnov_analysis function with the same parameters. + + Parameters + ---------- + features_a : list of str + Feature names of the first ensemble. + Can be obtained from features object via .describe(). + features_b : list of str + Feature names of the first ensemble. + Can be obtained from features object via .describe(). + Must be the same as features_a. Provided as a sanity check. + all_data_a : float array + Trajectory data from the first ensemble. Format: [frames,frame_data]. + all_data_b : float array + Trajectory data from the second ensemble. Format: [frames,frame_data]. + verbose : bool, default=True + Print intermediate results. + override_name_check : bool, default=False + Only check number of features, not their names. + + Returns + ------- + Each function returns one value. + + average_kss : float + Average Kolmogorov-Smirnov statistic for two distributions. + max_kss : float + Maximum Kolmogorov-Smirnov statistic for two distributions. + """ + + +def average_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, data_kss, _ = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.mean(data_kss) + + +def max_kss(): + _, data_kss, _ = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.max(data_kss) + + + """ + Calculates average and maximum State Specific Information statistic for a feature across two ensembles. Each of two functions uses the ssi_ensemble_analysis function with the same parameters. + + Parameters + ---------- + features_a : list of str + Feature names of the first ensemble. + features_b : list of str + Feature names of the first ensemble. + Must be the same as features_a. Provided as a sanity check. + all_data_a : float array + Trajectory data from the first ensemble. 
Format: [frames,frame_data]. + all_data_b : float array + Trajectory data from the second ensemble. Format: [frames,frame_data]. + torsions : str + Torsion angles to use for SSI, including backbone - 'bb', and sidechain - 'sc'. + Default is None. + pocket_occupancy : bool, optional + Set to 'True' if the data input is pocket occupancy distribution. + The default is None. + pbc : bool, optional + If true, the apply periodic bounary corrections on angular distribution inputs. + The input for periodic correction must be radians. The default is True. + verbose : bool, default=True + Print intermediate results. + write_plots : bool, optional + If true, visualise the states over the raw distribution. The default is None. + override_name_check : bool, default=False + Only check number of features, not their names. + + Returns + ------- + Each function returns one value. + + average_ssi : float + Average of State Specific Information for a feature across two ensembles. + max_ssi : float + Maximum of State Specific Information for a feature across two ensembles. + """ + +def average_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbc=True, verbose=True, write_plots=None, override_name_check=False): + _, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check) + return np.mean(data_ssi) + + +def max_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbs=True, verbose=True, write_plots=None, override_name_check=False): + _, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check) + return np.max(data_ssi) From a8b191ae2d3ba3027bbb5a117b8aa86af22e4c69 Mon Sep 17 00:00:00 2001 From: Jasper J McAvity Date: Tue, 20 Jul 2021 22:05:13 -0700 Subject: [PATCH 02/23] Added metrics.py to __init__.py and added ksp functions to metrics.py. --- pensa/comparison/__init__.py | 2 +- pensa/comparison/metrics.py | 29 +++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/pensa/comparison/__init__.py b/pensa/comparison/__init__.py index a4b0802c..79e6e2b7 100644 --- a/pensa/comparison/__init__.py +++ b/pensa/comparison/__init__.py @@ -2,4 +2,4 @@ from .relative_entropy import * from .statespecific import * from .visualization import * - +from .metrics import * diff --git a/pensa/comparison/metrics.py b/pensa/comparison/metrics.py index d159781d..b6d7ad3e 100644 --- a/pensa/comparison/metrics.py +++ b/pensa/comparison/metrics.py @@ -1,8 +1,9 @@ import numpy as np from pensa import * +from pensa.comparison import * - """ +""" Calculates the average and maximum Jensen-Shannon distance and the Kullback-Leibler divergences for each feature from two ensembles. Each of four functions uses the relative_entropy_analysis function with the same parameters. Parameters @@ -42,7 +43,7 @@ Average Kullback-Leibler divergence from two ensembles. max_kld : float Maximum Kullback-Leibler divergence from two ensembles. 
- """ +""" def average_jsd(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): _, data_jsdist, _, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) @@ -64,8 +65,8 @@ def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_ return np.max(data_kld_ab) - """ - Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. Each of two functions uses the kolmogorov_smirnov_analysis function with the same parameters. +""" + Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. Each of four functions uses the kolmogorov_smirnov_analysis function with the same parameters. Parameters ---------- @@ -93,7 +94,11 @@ def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_ Average Kolmogorov-Smirnov statistic for two distributions. max_kss : float Maximum Kolmogorov-Smirnov statistic for two distributions. - """ + average_ksp : float + Average Kolmogorov-Smirnov p-value for two distributions. + max_ksp : float + Maximum Kolmogorov-Smirnov statistic for two distributions. +""" def average_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): @@ -101,12 +106,20 @@ def average_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, ov return np.mean(data_kss) -def max_kss(): +def max_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): _, data_kss, _ = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) return np.max(data_kss) +def average_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.mean(data_ksp) + + +def max_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.max(data_ksp) - """ +""" Calculates average and maximum State Specific Information statistic for a feature across two ensembles. Each of two functions uses the ssi_ensemble_analysis function with the same parameters. Parameters @@ -144,7 +157,7 @@ def max_kss(): Average of State Specific Information for a feature across two ensembles. max_ssi : float Maximum of State Specific Information for a feature across two ensembles. - """ +""" def average_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbc=True, verbose=True, write_plots=None, override_name_check=False): _, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check) From 72c5bda763db4a395441b326f21e4e16e490595e Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Tue, 20 Jul 2021 22:07:40 -0700 Subject: [PATCH 03/23] Updated documentation of return values for get_structure_features. 
--- pensa/features/pyemma_features.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pensa/features/pyemma_features.py b/pensa/features/pyemma_features.py index ee60b067..c9f18cf7 100644 --- a/pensa/features/pyemma_features.py +++ b/pensa/features/pyemma_features.py @@ -41,9 +41,9 @@ def get_structure_features(pdb, xtc, start_frame=0, step_width=1, cossin=False, Returns ------- - feature_names : list of str + feature_names : dict of lists of str Names of all features - features_data : numpy array + features_data : dict of numpy arrays Data for all features """ From b0f2343d1ca650a6c2c9af67f396c55571919c72 Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Tue, 20 Jul 2021 22:10:59 -0700 Subject: [PATCH 04/23] Added min_ksp function. --- pensa/comparison/metrics.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pensa/comparison/metrics.py b/pensa/comparison/metrics.py index b6d7ad3e..f4a5bdea 100644 --- a/pensa/comparison/metrics.py +++ b/pensa/comparison/metrics.py @@ -66,7 +66,7 @@ def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_ """ - Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. Each of four functions uses the kolmogorov_smirnov_analysis function with the same parameters. + Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. Each of five functions uses the kolmogorov_smirnov_analysis function with the same parameters. Parameters ---------- @@ -98,6 +98,8 @@ def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_ Average Kolmogorov-Smirnov p-value for two distributions. max_ksp : float Maximum Kolmogorov-Smirnov statistic for two distributions. + min_ksp : float + Minimum Kolmogorov-Smirnov statistic for two distributions. """ @@ -119,6 +121,10 @@ def max_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, overri _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) return np.max(data_ksp) +def min_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.min(data_ksp) + """ Calculates average and maximum State Specific Information statistic for a feature across two ensembles. Each of two functions uses the ssi_ensemble_analysis function with the same parameters. From 5945b90adffc69c090355b21687254b19af24d68 Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Wed, 4 Aug 2021 16:46:08 -0700 Subject: [PATCH 05/23] Added pca_sampling_efficiency metric. 
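The new metric fits a PCA on the reference ensemble, projects both ensembles
onto its first num_pc components, and returns the ratio of the products of the
per-component variances, i.e. an approximate ratio of the volumes sampled in
principal-component space. A rough usage sketch (variable names are
placeholders; both arrays use the usual [frames,frame_data] feature layout):

    from pensa.comparison import pca_sampling_efficiency
    # Values near 1 indicate similar spread; values below 1 indicate that the
    # test ensemble covers less of the reference's principal-component space.
    pca_se = pca_sampling_efficiency(ref_data, test_data, num_pc=2)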
---
 pensa/comparison/metrics.py | 46 +++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/pensa/comparison/metrics.py b/pensa/comparison/metrics.py
index f4a5bdea..b276f9b1 100644
--- a/pensa/comparison/metrics.py
+++ b/pensa/comparison/metrics.py
@@ -1,6 +1,8 @@
 import numpy as np
 from pensa import *
 from pensa.comparison import *
+import random
+import math
 
 
 """
@@ -173,3 +175,47 @@ def average_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, p
 def max_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbs=True, verbose=True, write_plots=None, override_name_check=False):
     _, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check)
     return np.max(data_ssi)
+
+
+"""
+    Calculates the relative sampling efficiency of test data based on reference data.
+
+    Parameters
+    ----------
+    ref_data : float array
+        Trajectory data from the reference ensemble. Format: [frames,frame_data].
+    test_data : float array
+        Trajectory data from the test ensemble. Format: [frames,frame_data].
+    num_pc : int
+        Number of principal components used.
+
+    Returns
+    -------
+    pca_se : float
+        Sampling efficiency of test data based on reference data.
+
+"""
+
+def pca_sampling_efficiency(ref_data, test_data, num_pc=2):
+    pca = calculate_pca(ref_data)
+
+    _, ref_components = get_components_pca(ref_data, num_pc, pca=pca)
+    _, test_components = get_components_pca(test_data, num_pc, pca=pca)
+
+    ref_var = np.var(ref_components, axis=0)
+    test_var = np.var(test_components, axis=0)
+
+    ref_vol = np.prod(ref_var)
+    test_vol = np.prod(test_var)
+
+    pca_se = test_vol / ref_vol
+
+    return pca_se
+
+
+
+
+
+
+
+

From 544d53a1a762b62bc8887792d2e7f2c9c9b2e46f Mon Sep 17 00:00:00 2001
From: Jasper McAvity
Date: Wed, 4 Aug 2021 16:49:00 -0700
Subject: [PATCH 06/23] Added txt_features file.

---
 pensa/features/txt_features.py | 46 ++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 pensa/features/txt_features.py

diff --git a/pensa/features/txt_features.py b/pensa/features/txt_features.py
new file mode 100644
index 00000000..c9206360
--- /dev/null
+++ b/pensa/features/txt_features.py
@@ -0,0 +1,46 @@
+import numpy as np
+from pensa import *
+import random
+import math
+
+
+
+def get_txt_features_ala2(filename, num_frames, cossin=False):
+    phi = []
+    psi = []
+
+    with open(filename) as f:
+        for s in f.readlines():
+            if s == 'phi\n' or s == 'psi\n':
+                curr = s.strip()  # remember whether we are in the phi or the psi block
+            elif s == '\n':
+                curr = 'psi'
+            else:
+                if curr == 'phi':
+                    phi.append(float(s))
+                else:
+                    psi.append(float(s))
+
+    if len(phi) > num_frames:
+        temp = list(zip(phi, psi))
+        random.shuffle(temp)
+        phi, psi = zip(*temp)
+
+    features = []
+    if not cossin:
+        features = np.zeros((num_frames, 2))
+        for i in range(num_frames):
+            features[i, 0] = phi[i]
+            features[i, 1] = psi[i]
+    else:
+        features = np.zeros((num_frames, 4))
+        for i in range(num_frames):
+            features[i, 0] = math.cos(phi[i])
+            features[i, 1] = math.sin(phi[i])
+            features[i, 2] = math.cos(psi[i])
+            features[i, 3] = math.sin(psi[i])
+
+    return features
+
+
+

From b64491cdff74b66b8be5d1b20d980b595de01f3d Mon Sep 17 00:00:00 2001
From: Jasper McAvity
Date: Wed, 4 Aug 2021 16:50:31 -0700
Subject: [PATCH 07/23] Added txt_features to __init__ file.
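This makes get_txt_features_ala2 available directly from pensa.features. A
usage sketch, assuming a text file in the layout the parser expects (a 'phi'
header line, one angle per line, then a blank line or 'psi' header starting
the psi block); the file name and frame count here are placeholders:

    from pensa.features import get_txt_features_ala2
    # Returns a [num_frames, 2] array of (phi, psi) values as read from the file,
    # or a [num_frames, 4] array of their cosines and sines if cossin=True.
    features = get_txt_features_ala2("ala2_torsions.txt", num_frames=1000, cossin=True)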
--- pensa/features/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pensa/features/__init__.py b/pensa/features/__init__.py index 4c9a5bde..4a9a0ed4 100644 --- a/pensa/features/__init__.py +++ b/pensa/features/__init__.py @@ -8,4 +8,5 @@ from .mda_distances import * from .processing import * from .atom_features import * -from .water_features import * +from .water_features import * +from .txt_features import * From 6b50514366295e3ac9f702dc0fac9ba205fdd30d Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Thu, 12 Aug 2021 17:04:42 -0700 Subject: [PATCH 08/23] Added sorting functions to processing.py. --- pensa/features/processing.py | 59 ++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/pensa/features/processing.py b/pensa/features/processing.py index 55c76af0..5aa8efb7 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -218,6 +218,48 @@ def sort_sincos_torsions_by_resnum(tors, data): new_data = data[:,new_order] return new_tors, new_data +def sort_torsions_by_resnum(tors, data): + """ + Sort torsion features by the residue number.. + Parameters + ---------- + tors : list of str + The list of torsion features. + Returns + ------- + new_tors : list of str + The sorted list of torsion features. + """ + renamed = [] + for t in tors: + rn = t.split(' ')[-1] + ft = t.split(' ')[0] + renamed.append('%09i %s'%(int(rn),ft)) + new_order = np.argsort(renamed) + new_tors = np.array(tors)[new_order].tolist() + new_data = data[:,new_order] + return new_tors, new_data + +def sort_features_alphabetically(tors, data): + """ + Sort torsion features alphabetically. + Parameters + ---------- + tors : list of str + The list of torsion features. + Returns + ------- + new_tors : list of str + The sorted list of torsion features. 
+ """ + renamed = [] + for t in tors: + renamed.append(t) + new_order = np.argsort(renamed) + new_tors = np.array(tors)[new_order].tolist() + new_data = data[:,new_order] + return new_tors, new_data + def sort_distances_by_resnum(dist, data): """ @@ -241,13 +283,24 @@ def sort_distances_by_resnum(dist, data): return new_dist, new_data -def select_common_features(features_a, features_b): +def select_common_features(features_a, features_b, boolean=True): intersect = set(features_a).intersection(features_b) - is_common_a = [f in intersect for f in features_a] - is_common_b = [f in intersect for f in features_b] + if boolean: + is_common_a = [f in intersect for f in features_a] + is_common_b = [f in intersect for f in features_b] + else: + is_common_a = [f for f in features_a if f in intersect] + is_common_b = [f for f in features_b if f in intersect] return np.array(is_common_a), np.array(is_common_b) +def get_common_features_data(features_a, features_b, data_a, data_b): + is_common_a, is_common_b = select_common_features(features_a, features_b) + new_data_a = data_a[:,is_common_a] + new_data_b = data_b[:, is_common_b] + new_features_a, new_features_b = select_common_features(features_a, features_b, boolean=False) + return new_features_a, new_features_b, new_data_a, new_data_b + # -- Utilities to process feature data -- From 64e158d528136bf76070c9b59e3241dbd118d9f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Mon, 16 Aug 2021 21:21:17 -0700 Subject: [PATCH 09/23] test script for Sherlock --- test.sb | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 test.sb diff --git a/test.sb b/test.sb new file mode 100644 index 00000000..9faa2f29 --- /dev/null +++ b/test.sb @@ -0,0 +1,16 @@ +#!/bin/bash + +#SBATCH --time=24:00:00 +#SBATCH --mem=20G +#SBATCH --partition=rondror +#SBATCH --qos=high_p + +# Activate PENSA environment +source /home/users/mvoegele/miniconda3/etc/profile.d/conda.sh +conda activate /oak/stanford/groups/rondror/users/mvoegele/envs/pensa_dev + +# Run the tests (ignoring diffnets for now) +pytest --ignore pensa/diffnets/tests/test_api.py \ + --ignore pensa/diffnets/tests/test_cli.py \ + --ignore pensa/diffnets/tests/test_diffnets.py + From b5010bbb3fb0473f48233909c7e636722e609d69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Mon, 16 Aug 2021 21:35:22 -0700 Subject: [PATCH 10/23] ignore slurm files --- .gitignore | 3 +++ test.sb => run-test.sb | 0 2 files changed, 3 insertions(+) rename test.sb => run-test.sb (100%) diff --git a/.gitignore b/.gitignore index 98a70d86..2814d516 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,9 @@ tests/test_data/MOR-*/.*.npz *.ipynb .DS_Store +# Files from workload manager +slurm*.out + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/test.sb b/run-test.sb similarity index 100% rename from test.sb rename to run-test.sb From 4cb1c235f5bc4fdfb69fe595569c88741e5a07cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Mon, 16 Aug 2021 22:03:40 -0700 Subject: [PATCH 11/23] add general function to sort coord traj --- pensa/preprocessing/coordinates.py | 100 +++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 28 deletions(-) diff --git a/pensa/preprocessing/coordinates.py b/pensa/preprocessing/coordinates.py index aa476316..0e83ec3d 100644 --- a/pensa/preprocessing/coordinates.py +++ b/pensa/preprocessing/coordinates.py @@ -7,33 +7,6 @@ # -- Functions to preprocess trajectories -- -def align_coordinates(ref, pdb, trj_list, 
out_name, sel_string='all', start_frame=0): - """ - Aligns selected coordinates from a trajectory file. - - Parameters - ---------- - ref : str - File name for reference topology. - Can read all MDAnalysis-compatible topology formats. - pdb : str - File name for reference PDB file. - trj_list : list of str - File names for the input trajectory. - Can read all MDAnalysis-compatible trajectory formats. - out_name : str - Core of the file names for the output files - start_frame : int, optional - First frame to read from the trajectory. - """ - # Read the reference+PDB files and align selected parts. - u = mda.Universe(ref, pdb) - for trj in trj_list: - mobile = mda.Universe(ref, trj) - #mobile.trajectory = mobile.trajectory[start_frame:] - alignment = align.AlignTraj(mobile, u, select=sel_string, filename=f'{out_name}.xtc') - alignment.run() - def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, @@ -114,7 +87,7 @@ def extract_coordinates_combined(ref, trj, sel_string, out_name, start_frame=0, def merge_coordinates(ref_files, trj_files, out_name, segid=None): """ - Merges the trajectories of several different systems or system parts. + Merge several trajectories of the same system or system part. All trajectories must be (at least) as long as the first one. Parameters @@ -161,3 +134,74 @@ def merge_coordinates(ref_files, trj_files, out_name, segid=None): W.write(c.atoms) return univ + +def align_coordinates(ref, pdb, trj_list, out_name, sel_string='all', start_frame=0): + """ + Aligns selected coordinates from a trajectory file. + + Parameters + ---------- + ref : str + File name for reference topology. + Can read all MDAnalysis-compatible topology formats. + pdb : str + File name for reference PDB file. + trj_list : list of str + File names for the input trajectory. + Can read all MDAnalysis-compatible trajectory formats. + out_name : str + Core of the file names for the output files + start_frame : int, optional + First frame to read from the trajectory. + """ + # Read the reference+PDB files and align selected parts. + u = mda.Universe(ref, pdb) + for trj in trj_list: + mobile = mda.Universe(ref, trj) + #mobile.trajectory = mobile.trajectory[start_frame:] + alignment = align.AlignTraj(mobile, u, select=sel_string, filename=f'{out_name}.xtc') + alignment.run() + + +def sort_coordinates(values, ref_name, trj_name, out_name, start_frame=0): + """ + Sort coordinate frames along an array of values. + + Parameters + ---------- + values: float array. + Values along which to sort the trajectory. + ref_name: string. + reference topology for the trajectory. + trj_name: string. + Trajetory from which the frames are picked. + Usually the same as the values are from. + out_name: string. + Name of the output files + start_frame: int + Offset of the data with respect to the trajectories. + + Returns + ------- + sort_idx: float array + Sorted indices of the values. + oidx_sort: float array + Sorted indices of the trajectory. 
+ + """ + # Remember the index in the simulation (taking into account cutoff) + oidx = np.arange(len(values))+start_frame + # Define the MDAnalysis trajectory from where the frames come + u = mda.Universe(ref_name, trj_name) + a = u.select_atoms('all') + # Sort everything along the projection on the values + sort_idx = np.argsort(values) + oidx_sort = oidx[sort_idx] + # Write out sorted trajectory + with mda.Writer(out_name, a.n_atoms) as W: + for i in range(len(values)): + ts = u.trajectory[oidx_sort[i]] + W.write(a) + return sort_idx, oidx_sort + + From caa9065e127a91a20453a0dae743cd9d1beb205f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Mon, 16 Aug 2021 22:04:29 -0700 Subject: [PATCH 12/23] use general coord sort function for simple pca/tica write-out --- pensa/dimensionality/pca.py | 28 +++++++++++----------------- pensa/dimensionality/tica.py | 20 +++++++------------- tests/test_workflow.py | 18 +++++++----------- 3 files changed, 25 insertions(+), 41 deletions(-) diff --git a/pensa/dimensionality/pca.py b/pensa/dimensionality/pca.py index 0947dff1..2b6d9c76 100644 --- a/pensa/dimensionality/pca.py +++ b/pensa/dimensionality/pca.py @@ -3,7 +3,7 @@ from pyemma.util.contexts import settings import MDAnalysis as mda import matplotlib.pyplot as plt - +from pensa.preprocessing import sort_coordinates # --- METHODS FOR PRINCIPAL COMPONENT ANALYSIS --- @@ -186,30 +186,24 @@ def sort_traj_along_pc(data, pca, start_frame, top, trj, out_name, num_pc=3): out_name : str Core part of the name of the output files + Returns + ------- + all_proj : list + All projections on the principal components """ # Remember the index in the simulation (taking into account cutoff) oidx = np.arange(len(data))+start_frame - # Define the MDAnalysis trajectories from where the frames come - u = mda.Universe(top,trj) - a = u.select_atoms('all') - return_str = [] - all_proj = [] + # Initialize output + all_sort = [] # Loop through the principal components for evi in range(num_pc): # Project the combined data on the principal component proj = project_on_pc(data,evi,pca=pca) - all_proj.append(proj) # Sort everything along the projection onto the PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_pc"+str(evi+1)+".xtc", a.n_atoms) as W: - for i in range(data.shape[0]): - ts = u.trajectory[oidx_sort[i]] - W.write(a) - return_str.append(a) - return return_str, all_proj + out_xtc = out_name+"_pc"+str(evi+1)+".xtc" + sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) + all_sort.append(oidx_sort) + return all_sort def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_pc=3): diff --git a/pensa/dimensionality/tica.py b/pensa/dimensionality/tica.py index 07a0c370..41b82d59 100644 --- a/pensa/dimensionality/tica.py +++ b/pensa/dimensionality/tica.py @@ -3,7 +3,7 @@ from pyemma.util.contexts import settings import MDAnalysis as mda import matplotlib.pyplot as plt - +from pensa.preprocessing import sort_coordinates # --- METHODS FOR TIME-LAGGED INDEPENDENT COMPONENT ANALYSIS --- @@ -186,23 +186,17 @@ def sort_traj_along_tic(data, tica, start_frame, top, trj, out_name, num_tic=3): """ # Remember the index in the simulation (taking into account cutoff) oidx = np.arange(len(data))+start_frame - # Define the MDAnalysis trajectories from where the frames come - u = mda.Universe(top,trj) - a = u.select_atoms('all') + 
# Initialize output + all_sort = [] # Loop through the time-lagged independent components for evi in range(num_tic): # Project the combined data on the time-lagged independent component proj = project_on_tic(data,evi,tica=tica) # Sort everything along the projection onto the TIC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the TIC - with mda.Writer(out_name+"_tic"+str(evi+1)+".xtc", a.n_atoms) as W: - for i in range(data.shape[0]): - ts = u.trajectory[oidx_sort[i]] - W.write(a) - return oidx_sort + out_xtc = out_name+"_tic"+str(evi+1)+".xtc" + sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) + all_sort.append(oidx_sort) + return all_sort def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_tic=3): diff --git a/tests/test_workflow.py b/tests/test_workflow.py index b1898e54..aa5c951c 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -94,10 +94,10 @@ def setUp(self): pca_a = calculate_pca(self.sim_a_tmr_data['bb-torsions']) pca_features(pca_a, self.sim_a_tmr_feat['bb-torsions'], 3, 0.4) plt.close() - self.sort_traj, self.all_proj = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], pca_a, 0, - test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", num_pc=3) + self.all_sort = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], pca_a, 0, + test_data_path + "/traj/condition-a_receptor.gro", + test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/pca/condition-a_receptor_by_tmr", num_pc=3) # -- Compare projections self.val = compare_projections(self.sim_a_tmr_data['bb-torsions'], @@ -277,11 +277,7 @@ def test_sort_trajs_along_pc(self): for ele in self.sort_common_traj: self.assertEqual(ele.n_atoms, 2322) - self.assertEqual(len(self.sort_traj), 90) - for ele in self.sort_traj: - self.assertEqual(ele.n_atoms, 2322) - - self.assertEqual(len(self.all_proj), 3) + self.assertEqual(len(self.all_sort), 3) # -- sort_trajs_along_common_tic() def test_sort_trajs_along_common_tic(self): @@ -298,11 +294,11 @@ def test_sort_trajs_along_common_tic(self): # -- sort_traj_along_tic() def test_sort_traj_along_tic(self): - oidx = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], self.tica_combined, 0, + all_sort = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], self.tica_combined, 0, test_data_path + "/traj/condition-a_receptor.gro", test_data_path + "/traj/condition-a_receptor.xtc", test_data_path + "/pca/condition-a_receptor_by_tmr", num_tic=3) - self.assertEqual(len(oidx), 30) + self.assertEqual(len(all_sort), 3) # -- compare_projections() From a42d6d0d4a0d1615114eb058991dfb1443d91704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Mon, 16 Aug 2021 22:33:57 -0700 Subject: [PATCH 13/23] output both index types for pca/tica sort --- pensa/dimensionality/pca.py | 27 +++++++++++++++------------ pensa/dimensionality/tica.py | 21 ++++++++++++++------- tests/test_workflow.py | 16 ++++++++-------- 3 files changed, 37 insertions(+), 27 deletions(-) diff --git a/pensa/dimensionality/pca.py b/pensa/dimensionality/pca.py index 2b6d9c76..61928d64 100644 --- a/pensa/dimensionality/pca.py +++ b/pensa/dimensionality/pca.py @@ -188,22 +188,25 @@ def sort_traj_along_pc(data, pca, start_frame, top, trj, out_name, num_pc=3): Returns ------- - all_proj : list - All projections on the 
principal components - """ - # Remember the index in the simulation (taking into account cutoff) - oidx = np.arange(len(data))+start_frame + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component + + """ # Initialize output - all_sort = [] - # Loop through the principal components + sorted_indices_data = [] + sorted_indices_traj = [] + # Loop through the time-lagged independent components for evi in range(num_pc): - # Project the combined data on the principal component - proj = project_on_pc(data,evi,pca=pca) - # Sort everything along the projection onto the PC + # Project the combined data on the time-lagged independent component + proj = project_on_pc(data, evi, pca=pca) + # Sort everything along the projection onto the TIC out_xtc = out_name+"_pc"+str(evi+1)+".xtc" sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) - all_sort.append(oidx_sort) - return all_sort + sorted_indices_data.append(sort_idx) + sorted_indices_traj.append(oidx_sort) + return sorted_indices_data, sorted_indices_traj def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_pc=3): diff --git a/pensa/dimensionality/tica.py b/pensa/dimensionality/tica.py index 41b82d59..67775cb5 100644 --- a/pensa/dimensionality/tica.py +++ b/pensa/dimensionality/tica.py @@ -182,21 +182,28 @@ def sort_traj_along_tic(data, tica, start_frame, top, trj, out_name, num_tic=3): Should be the same as data was from. out_name : str Core part of the name of the output files. + + Returns + ------- + sorted_indices_data : list + Sorted indices of the data array for each independent components + sorted_indices_traj : list + Sorted indices of the coordinate frames for each independent components """ - # Remember the index in the simulation (taking into account cutoff) - oidx = np.arange(len(data))+start_frame # Initialize output - all_sort = [] - # Loop through the time-lagged independent components + sorted_indices_data = [] + sorted_indices_traj = [] + # Loop through the independent components for evi in range(num_tic): - # Project the combined data on the time-lagged independent component + # Project the combined data on the independent component proj = project_on_tic(data,evi,tica=tica) # Sort everything along the projection onto the TIC out_xtc = out_name+"_tic"+str(evi+1)+".xtc" sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) - all_sort.append(oidx_sort) - return all_sort + sorted_indices_data.append(sort_idx) + sorted_indices_traj.append(oidx_sort) + return sorted_indices_data, sorted_indices_traj def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_tic=3): diff --git a/tests/test_workflow.py b/tests/test_workflow.py index aa5c951c..b9ddd99f 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -94,10 +94,10 @@ def setUp(self): pca_a = calculate_pca(self.sim_a_tmr_data['bb-torsions']) pca_features(pca_a, self.sim_a_tmr_feat['bb-torsions'], 3, 0.4) plt.close() - self.all_sort = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], pca_a, 0, - test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", num_pc=3) + self.all_sort, _ = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], pca_a, 0, + test_data_path + 
"/traj/condition-a_receptor.gro", + test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/pca/condition-a_receptor_by_tmr", num_pc=3) # -- Compare projections self.val = compare_projections(self.sim_a_tmr_data['bb-torsions'], @@ -294,10 +294,10 @@ def test_sort_trajs_along_common_tic(self): # -- sort_traj_along_tic() def test_sort_traj_along_tic(self): - all_sort = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], self.tica_combined, 0, - test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", num_tic=3) + all_sort, _ = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], self.tica_combined, 0, + test_data_path + "/traj/condition-a_receptor.gro", + test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/pca/condition-a_receptor_by_tmr", num_tic=3) self.assertEqual(len(all_sort), 3) From 8c9d7aa81fe7b784aac0acd685124a1a97fb0cdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Mon, 16 Aug 2021 23:45:03 -0700 Subject: [PATCH 14/23] add function to sort traj by mda feature --- pensa/features/mda_distances.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/pensa/features/mda_distances.py b/pensa/features/mda_distances.py index d997d8a1..b6d07727 100644 --- a/pensa/features/mda_distances.py +++ b/pensa/features/mda_distances.py @@ -178,3 +178,36 @@ def get_gpcr_calpha_distances(pdb, xtc, gpcr_name, res_dbnum, return names, distlabels, data +def sort_traj_along_mda_feature(feat, data, feature_name, ref_name, trj_name, out_name, start_frame=0): + """ + Sort a trajectory along a feature. + + Parameters + ---------- + feat : list of str + List with all feature names. + data : float array + Feature values data from the simulation. + feature_name : str + Name of the selected feature. + ref_name: string + Reference topology for the trajectory. + trj_name: string + Trajetory from which the frames are picked. + Usually the same as the values are from. + out_name: string. + Name of the output files. + start_frame: int + Offset of the data with respect to the trajectories. + + Returns + ------- + d_sorted: float array + Sorted data of the selected feature. + + """ + d = pensa.get_feature_data(feat, data, feature_name) + sort_idx, oidx_sort = pensa.sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) + d_sorted = d[sort_idx] + return d_sorted + From bb79bd11e0ccf6d388b7d6b7cc9392a237a9bf30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Tue, 17 Aug 2021 01:55:52 -0700 Subject: [PATCH 15/23] add function to sort a traj by one feature --- pensa/features/mda_distances.py | 33 --------------------------- pensa/features/processing.py | 37 +++++++++++++++++++++++++++++-- pensa/features/pyemma_features.py | 37 ++++++++++++++++++++++++++++++- 3 files changed, 71 insertions(+), 36 deletions(-) diff --git a/pensa/features/mda_distances.py b/pensa/features/mda_distances.py index b6d07727..d997d8a1 100644 --- a/pensa/features/mda_distances.py +++ b/pensa/features/mda_distances.py @@ -178,36 +178,3 @@ def get_gpcr_calpha_distances(pdb, xtc, gpcr_name, res_dbnum, return names, distlabels, data -def sort_traj_along_mda_feature(feat, data, feature_name, ref_name, trj_name, out_name, start_frame=0): - """ - Sort a trajectory along a feature. - - Parameters - ---------- - feat : list of str - List with all feature names. 
- data : float array - Feature values data from the simulation. - feature_name : str - Name of the selected feature. - ref_name: string - Reference topology for the trajectory. - trj_name: string - Trajetory from which the frames are picked. - Usually the same as the values are from. - out_name: string. - Name of the output files. - start_frame: int - Offset of the data with respect to the trajectories. - - Returns - ------- - d_sorted: float array - Sorted data of the selected feature. - - """ - d = pensa.get_feature_data(feat, data, feature_name) - sort_idx, oidx_sort = pensa.sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) - d_sorted = d[sort_idx] - return d_sorted - diff --git a/pensa/features/processing.py b/pensa/features/processing.py index 55c76af0..2cf1839e 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -9,8 +9,7 @@ import matplotlib.pyplot as plt import os import warnings -#from pensa.features import * - +from pensa.preprocessing import sort_coordinates # -- Utilities to extract time series -- @@ -283,3 +282,37 @@ def correct_angle_periodicity(angle): return new_angle +# Process trajectories according to feature data + +def sort_traj_along_feature(feat, data, feature_name, ref_name, trj_name, out_name, start_frame=0): + """ + Sort a trajectory along a feature. + + Parameters + ---------- + feat : list of str + List with all feature names. + data : float array + Feature values data from the simulation. + feature_name : str + Name of the selected feature. + ref_name: string + Reference topology for the trajectory. + trj_name: string + Trajetory from which the frames are picked. + Usually the same as the values are from. + out_name: string. + Name of the output files. + start_frame: int + Offset of the data with respect to the trajectories. + + Returns + ------- + d_sorted: float array + Sorted data of the selected feature. + + """ + d = get_feature_data(feat, data, feature_name) + sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) + d_sorted = d[sort_idx] + return d_sorted diff --git a/pensa/features/pyemma_features.py b/pensa/features/pyemma_features.py index ee60b067..b1cf0e11 100644 --- a/pensa/features/pyemma_features.py +++ b/pensa/features/pyemma_features.py @@ -10,7 +10,8 @@ import numpy as np import pyemma from pyemma.util.contexts import settings - +from pensa.features.processing import get_feature_timeseries +from pensa.preprocessing.coordinates import sort_coordinates # -- Loading the Features -- @@ -169,4 +170,38 @@ def _remove_resnum_offset(features, offset): return new_features +def sort_traj_along_pyemma_feature(feat, data, feature_name, feature_type, ref_name, trj_name, out_name, start_frame=0): + """ + Sort a trajectory along a PyEMMA feature. + + Parameters + ---------- + feat : list of str + List with all feature names. + data : float array + Feature values data from the simulation. + feature_name : str + Name of the selected feature. + feature_type : str + Type of the selected feature. + ref_name: string + Reference topology for the trajectory. + trj_name: string + Trajetory from which the frames are picked. + Usually the same as the values are from. + out_name: string. + Name of the output files. + start_frame: int + Offset of the data with respect to the trajectories. + + Returns + ------- + d_sorted: float array + Sorted data of the selected feature. 
+ + """ + d = get_feature_timeseries(feat, data, feature_type, feature_name) + sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) + d_sorted = d[sort_idx] + return d_sorted From 4aad56b42c4feb2fe920b8e45b2010c4ee8b6f83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Tue, 17 Aug 2021 13:06:12 -0700 Subject: [PATCH 16/23] log and doc for traj sort functions --- pensa/features/processing.py | 3 ++- pensa/preprocessing/coordinates.py | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pensa/features/processing.py b/pensa/features/processing.py index 2cf1839e..52f7c948 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -284,7 +284,7 @@ def correct_angle_periodicity(angle): # Process trajectories according to feature data -def sort_traj_along_feature(feat, data, feature_name, ref_name, trj_name, out_name, start_frame=0): +def sort_traj_along_feature(feat, data, feature_name, ref_name, trj_name, out_name, start_frame=0, verbose=False): """ Sort a trajectory along a feature. @@ -312,6 +312,7 @@ def sort_traj_along_feature(feat, data, feature_name, ref_name, trj_name, out_na Sorted data of the selected feature. """ + if verbose: print('Sorting along feature '+feature_name) d = get_feature_data(feat, data, feature_name) sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) d_sorted = d[sort_idx] diff --git a/pensa/preprocessing/coordinates.py b/pensa/preprocessing/coordinates.py index 0e83ec3d..8110ee8a 100644 --- a/pensa/preprocessing/coordinates.py +++ b/pensa/preprocessing/coordinates.py @@ -163,7 +163,7 @@ def align_coordinates(ref, pdb, trj_list, out_name, sel_string='all', start_fram alignment.run() -def sort_coordinates(values, ref_name, trj_name, out_name, start_frame=0): +def sort_coordinates(values, ref_name, trj_name, out_name, start_frame=0, verbose=False): """ Sort coordinate frames along an array of values. @@ -189,10 +189,15 @@ def sort_coordinates(values, ref_name, trj_name, out_name, start_frame=0): Sorted indices of the trajectory. """ - # Remember the index in the simulation (taking into account cutoff) + # Remember the index in the simulation (taking into account offset) oidx = np.arange(len(values))+start_frame # Define the MDAnalysis trajectory from where the frames come + if verbose: print('Loading:', ref_name, trj_name) u = mda.Universe(ref_name, trj_name) + if verbose: + print('Trajectory length:', len(u.trajectory)) + print('Number of values: ', len(values)) + print('Trajectory offset:', start_frame) a = u.select_atoms('all') # Sort everything along the projection on the values sort_idx = np.argsort(values) From 4414b1ccfe0f02594fb9e4961fbf3a964775b9b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Wed, 18 Aug 2021 00:03:54 -0700 Subject: [PATCH 17/23] make dim. 
reduction functions more consistent --- docs/tut-5-dimensionality.rst | 14 +- pensa/dimensionality/__init__.py | 1 + pensa/dimensionality/pca.py | 298 ++++++------------ pensa/dimensionality/tica.py | 268 ++++++---------- pensa/dimensionality/visualization.py | 183 +++++++++++ pensa/features/processing.py | 3 +- pensa/preprocessing/coordinates.py | 152 +++++++-- ...calculate_combined_principal_components.py | 4 +- tests/test_workflow.py | 34 +- 9 files changed, 521 insertions(+), 436 deletions(-) create mode 100644 pensa/dimensionality/visualization.py diff --git a/docs/tut-5-dimensionality.rst b/docs/tut-5-dimensionality.rst index 93c072c1..f053eed9 100644 --- a/docs/tut-5-dimensionality.rst +++ b/docs/tut-5-dimensionality.rst @@ -89,13 +89,12 @@ entire receptor, sorted by the PCs of the transmembrane region. _ = sort_trajs_along_common_pc(sim_a_tmr_data['bb-torsions'], sim_b_tmr_data['bb-torsions'], - feature_start_frame, "traj/condition-a_receptor.gro", "traj/condition-b_receptor.gro", "traj/condition-a_receptor.xtc", "traj/condition-b_receptor.xtc", "pca/receptor_by_tmr", - num_pc=3) + num_pc=3, start_frame=feature_start_frame) The above function deals with the special case of two input trajectories. We also provide the functions for a single one (see @@ -121,8 +120,9 @@ Here are the major steps of a PCA demonstrated for a single simulation. .. code:: python - _, __ = sort_traj_along_pc(sim_a_tmr_data['bb-torsions'], - pca_a, feature_start_frame, - "traj/condition-a_receptor.gro", - "traj/condition-a_receptor.xtc", - "pca/condition-a_receptor_by_tmr", num_pc=3) + _, __, ___ = sort_traj_along_pc(sim_a_tmr_data['bb-torsions'], + "traj/condition-a_receptor.gro", + "traj/condition-a_receptor.xtc", + "pca/condition-a_receptor_by_tmr", + start_frame = feature_start_frame, + pca=pca_a, num_pc=3) diff --git a/pensa/dimensionality/__init__.py b/pensa/dimensionality/__init__.py index 3ae57a17..bbf189db 100644 --- a/pensa/dimensionality/__init__.py +++ b/pensa/dimensionality/__init__.py @@ -1,3 +1,4 @@ +from .visualization import * from .pca import * from .tica import * diff --git a/pensa/dimensionality/pca.py b/pensa/dimensionality/pca.py index 61928d64..6d79d4cb 100644 --- a/pensa/dimensionality/pca.py +++ b/pensa/dimensionality/pca.py @@ -3,11 +3,14 @@ from pyemma.util.contexts import settings import MDAnalysis as mda import matplotlib.pyplot as plt -from pensa.preprocessing import sort_coordinates +from pensa.preprocessing import sort_coordinates, merge_and_sort_coordinates +from .visualization import project_on_eigenvector, sort_traj_along_projection # --- METHODS FOR PRINCIPAL COMPONENT ANALYSIS --- +# http://www.emma-project.org/latest/api/generated/pyemma.coordinates.pca.html + def calculate_pca(data): """ @@ -115,14 +118,11 @@ def project_on_pc(data, ev_idx, pca=None): Value along the PC for each frame. """ - # Perform PCA if none is provided + # Perform PCA if none is provided. if pca is None: - pca = pyemma.coordinates.pca(data) #,dim=3) - # Project the features onto the principal components - projection = np.zeros(data.shape[0]) - for ti in range(data.shape[0]): - projection[ti] = np.dot(data[ti],pca.eigenvectors[:,ev_idx]) - # Return the value along the PC for each frame + pca = pyemma.coordinates.pca(data) + # Project the features onto the principal components. 
+ projection = project_on_eigenvector(data, ev_idx, pca) return projection @@ -164,20 +164,14 @@ def get_components_pca(data, num, pca=None, prefix=''): return comp_names, np.array(components).T -def sort_traj_along_pc(data, pca, start_frame, top, trj, out_name, num_pc=3): +def sort_traj_along_pc(data, top, trj, out_name, pca=None, num_pc=3, start_frame=0): """ - Sort a trajectory along given principal components. + Sort a trajectory along principal components. Parameters ---------- data : float array Trajectory data [frames,frame_data]. - pca : PCA obj - Principal components information. - num_pc : int - Sort along the first num_pc principal components. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top : str File name of the reference topology for the trajectory. trj : str @@ -185,31 +179,38 @@ def sort_traj_along_pc(data, pca, start_frame, top, trj, out_name, num_pc=3): Should be the same as data was from. out_name : str Core part of the name of the output files + pca : PCA obj, optional + Principal components information. + If none is provided, it will be calculated. + Defaults to None. + num_pc : int, optional + Sort along the first num_pc principal components. + Defaults to 3. + start_frame : int, optional + Offset of the data with respect to the trajectories (defined below). + Defaults to 0. Returns ------- + sorted_proj: list + sorted projections on each principal component sorted_indices_data : list Sorted indices of the data array for each principal component sorted_indices_traj : list Sorted indices of the coordinate frames for each principal component """ - # Initialize output - sorted_indices_data = [] - sorted_indices_traj = [] - # Loop through the time-lagged independent components - for evi in range(num_pc): - # Project the combined data on the time-lagged independent component - proj = project_on_pc(data, evi, pca=pca) - # Sort everything along the projection onto the TIC - out_xtc = out_name+"_pc"+str(evi+1)+".xtc" - sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) - sorted_indices_data.append(sort_idx) - sorted_indices_traj.append(oidx_sort) - return sorted_indices_data, sorted_indices_traj + # Calculate the principal components if they are not given. + if pca is None: + pca = pyemma.coordinates.pca(all_data, dim=3) + # Sort the trajectory along them. + sorted_proj, sorted_indices_data, sorted_indices_traj = sort_traj_along_projection( + data, pca, top, trj, out_name, num_comp=num_pc, start_frame = start_frame + ) + return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_pc=3): +def sort_trajs_along_common_pc(data_a, data_b, top_a, top_b, trj_a, trj_b, out_name, num_pc=3, start_frame=0): """ Sort two trajectories along their most important common principal components. @@ -219,8 +220,6 @@ def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, Trajectory data [frames,frame_data]. data_b : float array Trajectory data [frames,frame_data]. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top_a : str Reference topology for the first trajectory. top_b : str @@ -233,48 +232,30 @@ def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, Should be the same as data_b was from. out_name : str Core part of the name of the output files. 
- + num_pc : int, optional + Sort along the first num_pc principal components. + Defaults to 3. + start_frame : int or list of int + Offset of the data with respect to the trajectories. + Defaults to 0. + + Returns + ------- + sorted_proj: list + sorted projections on each principal component + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component + """ - # Combine the input data - data = np.concatenate([data_a,data_b],0) - # Remember which simulation the data came frome - cond = np.concatenate([np.ones(len(data_a)), np.zeros(len(data_b))]) - # Remember the index in the respective simulation (taking into account cutoff) - oidx = np.concatenate([np.arange(len(data_a))+start_frame, - np.arange(len(data_b))+start_frame]) - # Calculate the principal components - pca = pyemma.coordinates.pca(data,dim=3) - # Define the MDAnalysis trajectories from where the frames come - ua = mda.Universe(top_a,trj_a) - ub = mda.Universe(top_b,trj_b) - # ... and select all atoms - aa = ua.select_atoms('all') - ab = ub.select_atoms('all') - return_str = [] - # Loop over principal components. - for evi in range(num_pc): - # Project the combined data on the principal component - proj = project_on_pc(data,evi,pca=pca) - # Sort everything along the projection on th resp. PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - cond_sort = cond[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_pc"+str(evi+1)+".xtc", aa.n_atoms) as W: - for i in range(data.shape[0]): - if cond_sort[i] == 1: # G-protein bound - ts = ua.trajectory[oidx_sort[i]] - W.write(aa) - return_str.append(aa) - elif cond_sort[i] == 0: # arrestin bound - ts = ub.trajectory[oidx_sort[i]] - W.write(ab) - return_str.append(ab) - return return_str + sorted_proj, sorted_indices_data, sorted_indices_traj = sort_mult_trajs_along_common_pc( + [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_pc=3, start_frame = start_frame + ) + return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_mult_trajs_along_common_pc(data, start_frame, top, trj, out_name, num_pc=3): +def sort_mult_trajs_along_common_pc(data, top, trj, out_name, num_pc=3, start_frame=0): """ Sort multiple trajectories along their most important common principal components. @@ -282,8 +263,6 @@ def sort_mult_trajs_along_common_pc(data, start_frame, top, trj, out_name, num_p ---------- data : list of float arrays List of trajectory data arrays, each [frames,frame_data]. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top : list of str Reference topology files. trj : list of str @@ -291,148 +270,47 @@ def sort_mult_trajs_along_common_pc(data, start_frame, top, trj, out_name, num_p trj[i] should be the same as data[i] was from. out_name : str Core part of the name of the output files. + num_pc : int, optional + Sort along the first num_pc principal components. + Defaults to 3. + start_frame : int or list of int + Offset of the data with respect to the trajectories. + Defaults to 0. 
+ + Returns + ------- + sorted_proj: list + sorted projections on each principal component + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component """ num_frames = [len(d) for d in data] num_traj = len(data) + if type(start_frame) == int: + start_frame *= np.ones(num_traj) + start_frame = start_frame.tolist() # Combine the input data - data = np.concatenate(data,0) - # Remember which simulation the data came frome - cond = np.concatenate([i*np.ones(num_frames[i],dtype=int) for i in range(num_traj)]) - # Remember the index in the respective simulation (taking into account cutoff) - oidx = np.concatenate([np.arange(num_frames[i])+start_frame for i in range(num_traj)]) - # Calculate the principal components - pca = pyemma.coordinates.pca(data,dim=3) - # Define the MDAnalysis trajectories from where the frames come - univs = [] - atoms = [] - for j in range(num_traj): - u = mda.Universe(top[j],trj[j]) - print('Length of trajectory',len(u.trajectory)) - univs.append(u) - atoms.append(u.select_atoms('all')) + all_data = np.concatenate(data,0) + # Calculate the principal component + pca = pyemma.coordinates.pca(all_data, dim=3) + # Initialize output + sorted_proj = [] + sorted_indices_data = [] + sorted_indices_traj = [] # Loop over principal components. for evi in range(num_pc): # Project the combined data on the principal component - proj = project_on_pc(data,evi,pca=pca) - # Sort everything along the projection on th resp. PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - cond_sort = cond[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_pc"+str(evi+1)+".xtc", atoms[0].n_atoms) as W: - for i in range(data.shape[0]): - j = cond_sort[i] - o = oidx_sort[i] - uj = univs[j] - ts = uj.trajectory[o] - W.write(atoms[j]) - return - - -def compare_projections(data_a, data_b, pca, num=3, saveas=None, label_a=None, label_b=None): - """ - Compare two datasets along a given principal component. - - Parameters - ---------- - data_a : float array - Trajectory data [frames,frame_data] - data_b : float array - Trajectory data [frames,frame_data] - pca : PCA object - Principal components information. - num : int - Number of principal components to plot. - saveas : str, optional - Name of the output file. - label_a : str, optional - Label for the first dataset. - label_b : str, optional - Label for the second dataset. 
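# A usage sketch for sort_mult_trajs_along_common_pc() as rewritten above: any
# number of ensembles can be passed, and start_frame may be one offset or one
# offset per trajectory. The arrays are synthetic stand-ins and the file names
# are placeholders; only the call signature comes from the function itself.
import numpy as np
from pensa.dimensionality.pca import sort_mult_trajs_along_common_pc

data = [np.random.rand(500, 24) for _ in range(3)]            # stand-in feature sets
tops = ["traj/condition-%s.gro" % c for c in "abc"]           # placeholder topologies
trjs = ["traj/condition-%s.xtc" % c for c in "abc"]           # placeholder trajectories
sorted_proj, sorted_idx_data, sorted_idx_traj = sort_mult_trajs_along_common_pc(
    data, tops, trjs, out_name="pca/combined_receptor",
    num_pc=3, start_frame=[0, 0, 2000],                       # per-trajectory offsets
)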
- - """ - # Start the figure - fig,ax = plt.subplots(num, 2, figsize=[8,3*num], dpi=300) - val = [] - # Loop over PCs - for evi in range(num): - # Calculate values along PC for each frame - proj_a = project_on_pc(data_a, evi, pca=pca) - proj_b = project_on_pc(data_b, evi, pca=pca) - # Plot the time series in the left panel - ax[evi,0].plot(proj_a, alpha=0.5, label=label_a) - ax[evi,0].plot(proj_b, alpha=0.5, label=label_b) - ax[evi,0].set_xlabel('frame number') - ax[evi,0].set_ylabel('PC %i'%(evi+1)) - # Plot the histograms in the right panel - ax[evi,1].hist(proj_a, bins=30, alpha=0.5, density=True, label=label_a) - ax[evi,1].hist(proj_b, bins=30, alpha=0.5, density=True, label=label_b) - ax[evi,1].set_xlabel('PC %i'%(evi+1)) - ax[evi,1].set_ylabel('frequency') - # Legend - if label_a and label_b: - ax[evi,0].legend() - ax[evi,1].legend() - val.append([proj_a, proj_b]) - fig.tight_layout() - # Save the figure - if saveas is not None: - fig.savefig(saveas, dpi=300) - return val - - -def compare_mult_projections(data, pca, num=3, saveas=None, labels=None, colors=None): - """ - Compare two datasets along a given principal component. - - Parameters - ---------- - data : list of float arrays - Data from multiple trajectories [frames,frame_data] - pca : PCA object - Principal components information. - num : int - Number of principal components to plot. - saveas : str, optional - Name of the output file. - labels : list of str, optional - Labels for the datasets. If provided, it must have the same length as data. - - """ - if labels is not None: - assert len(labels) == len(data) - else: - labels = [None for _ in range(len(data))] - if colors is not None: - assert len(colors) == len(data) - else: - colors = ['C%i'%num for num in range(len(data))] - # Start the figure - fig,ax = plt.subplots(num, 2, figsize=[9,3*num], dpi=300) - # Loop over PCs - for evi in range(num): - for j,d in enumerate(data): - # Calculate values along PC for each frame - proj = project_on_pc(d, evi, pca=pca) - # Plot the time series in the left panel - ax[evi,0].plot(proj, alpha=0.5, - label=labels[j], color=colors[j]) - # Plot the histograms in the right panel - ax[evi,1].hist(proj, bins=30, alpha=0.5, density=True, - label=labels[j], color=colors[j]) - # Axis labels - ax[evi,0].set_xlabel('frame number') - ax[evi,0].set_ylabel('PC %i'%(evi+1)) - ax[evi,1].set_xlabel('PC %i'%(evi+1)) - ax[evi,1].set_ylabel('frequency') - # Legend - if labels[0] is not None: - ax[evi,0].legend() - ax[evi,1].legend() - fig.tight_layout() - # Save the figure - if saveas is not None: - fig.savefig(saveas, dpi=300) - return + proj = [project_on_pc(d, evi, pca=pca) for d in data] + # Sort everything along the projection on the respective PC + out_xtc = out_name+"_pc"+str(evi+1)+".xtc" + proj_sort, sort_idx, oidx_sort = merge_and_sort_coordinates( + proj, top, trj, out_xtc, start_frame=start_frame, verbose=False + ) + sorted_proj.append(proj_sort) + sorted_indices_data.append(sort_idx) + sorted_indices_traj.append(oidx_sort) + return sorted_proj, sorted_indices_data, sorted_indices_traj + diff --git a/pensa/dimensionality/tica.py b/pensa/dimensionality/tica.py index 67775cb5..01546eaa 100644 --- a/pensa/dimensionality/tica.py +++ b/pensa/dimensionality/tica.py @@ -3,11 +3,13 @@ from pyemma.util.contexts import settings import MDAnalysis as mda import matplotlib.pyplot as plt -from pensa.preprocessing import sort_coordinates +from pensa.preprocessing import sort_coordinates, merge_and_sort_coordinates +from .visualization import 
project_on_eigenvector, sort_traj_along_projection # --- METHODS FOR TIME-LAGGED INDEPENDENT COMPONENT ANALYSIS --- -# http://emma-project.org/latest/api/generated/pyemma.coordinates.tica.html#pyemma.coordinates.tica + +# http://emma-project.org/latest/api/generated/pyemma.coordinates.tica.html def calculate_tica(data): @@ -43,7 +45,7 @@ def tica_eigenvalues_plot(tica, num=12, plot_file=None): Path and name of the file to save the plot. """ - # Plot eigenvalues over component numbers + # Plot eigenvalues over component numbers. fig,ax = plt.subplots(1, 1, figsize=[4,3], dpi=300) componentnr = np.arange(num)+1 eigenvalues = tica.eigenvalues[:num] @@ -51,7 +53,7 @@ def tica_eigenvalues_plot(tica, num=12, plot_file=None): ax.set_xlabel('component number') ax.set_ylabel('eigenvalue') fig.tight_layout() - # Save the figure to a file + # Save the figure to a file. if plot_file: fig.savefig(plot_file, dpi=300) return componentnr, eigenvalues @@ -75,7 +77,7 @@ def tica_features(tica, features, num, threshold, plot_file=None): Path and name of the file to save the plot. """ - # Plot the highest TIC correlations and print relevant features + # Plot the highest TIC correlations and print relevant features. fig,ax = plt.subplots(num,1,figsize=[4,num*3],dpi=300,sharex=True) for i in range(num): relevant = tica.feature_TIC_correlation[:,i]**2 > threshold**2 @@ -87,7 +89,7 @@ def tica_features(tica, features, num, threshold, plot_file=None): ax[i].set_xlabel('feature index') ax[i].set_ylabel('correlation with TIC%i'%(i+1)) fig.tight_layout() - # Save the figure to a file + # Save the figure to a file. if plot_file: fig.savefig(plot_file,dpi=300) return test_feature @@ -112,14 +114,11 @@ def project_on_tic(data, ev_idx, tica=None): Value along the TIC for each frame. """ - # Perform TICA if none is provided + # Perform TICA if none is provided. if tica is None: - tica = pyemma.coordinates.tica(data) #,dim=3) - # Project the features onto the time-lagged independent components - projection = np.zeros(data.shape[0]) - for ti in range(data.shape[0]): - projection[ti] = np.dot(data[ti],tica.eigenvectors[:,ev_idx]) - # Return the value along the TIC for each frame + tica = pyemma.coordinates.tica(data) + # Project the features onto the time-lagged independent components. + projection = project_on_eigenvector(data, ev_idx, tica) return projection @@ -145,10 +144,10 @@ def get_components_tica(data, num, tica=None, prefix=''): Component data [frames,components] """ - # Perform tICA if none is provided + # Perform tICA if none is provided. if tica is None: tica = pyemma.coordinates.tica(data) - # Project the features onto the principal components + # Project the features onto the principal components. comp_names = [] components = [] for ev_idx in range(num): @@ -157,56 +156,57 @@ def get_components_tica(data, num, tica=None, prefix=''): projection[ti] = np.dot(data[ti],tica.eigenvectors[:,ev_idx]) components.append(projection) comp_names.append(prefix+'IC'+str(ev_idx+1)) - # Return the names and data + # Return the names and data. return comp_names, np.array(components).T -def sort_traj_along_tic(data, tica, start_frame, top, trj, out_name, num_tic=3): +def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, start_frame=0): """ - Sort a trajectory along given time-lagged independent components. + Sort a trajectory along independent components. Parameters ---------- data : float array Trajectory data [frames,frame_data]. - tica : TICA obj - Time-lagged independent components information. 
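# A small self-contained check of project_on_tic() with a pre-computed TICA
# object: the projection is simply the dot product of each frame with the
# selected eigenvector. Only synthetic data is used; the import path follows
# the module edited in this diff.
import numpy as np
from pensa.dimensionality.tica import calculate_tica, project_on_tic

data = np.cumsum(np.random.randn(500, 8), axis=0)   # correlated toy time series
tica = calculate_tica(data)                         # default lag time
proj0 = project_on_tic(data, 0, tica=tica)          # projection on TIC 1
assert proj0.shape == (500,)
assert np.allclose(proj0, data @ tica.eigenvectors[:, 0])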
- num_tic : int - Sort along the first num_tic time-lagged independent components. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top : str File name of the reference topology for the trajectory. trj : str File name of the trajetory from which the frames are picked. Should be the same as data was from. out_name : str - Core part of the name of the output files. - + Core part of the name of the output files + tica : tICA obj, optional + Time-lagged independent components information. + If none is provided, it will be calculated. + Defaults to None. + num_ic : int, optional + Sort along the first num_ic independent components. + Defaults to 3. + start_frame : int, optional + Offset of the data with respect to the trajectories (defined below). + Defaults to 0. + Returns ------- + sorted_proj: list + sorted projections on each principal component sorted_indices_data : list - Sorted indices of the data array for each independent components + Sorted indices of the data array for each principal component sorted_indices_traj : list - Sorted indices of the coordinate frames for each independent components - - """ - # Initialize output - sorted_indices_data = [] - sorted_indices_traj = [] - # Loop through the independent components - for evi in range(num_tic): - # Project the combined data on the independent component - proj = project_on_tic(data,evi,tica=tica) - # Sort everything along the projection onto the TIC - out_xtc = out_name+"_tic"+str(evi+1)+".xtc" - sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) - sorted_indices_data.append(sort_idx) - sorted_indices_traj.append(oidx_sort) - return sorted_indices_data, sorted_indices_traj + Sorted indices of the coordinate frames for each principal component + + """ + # Calculate the principal components if they are not given. + if tica is None: + tica = pyemma.coordinates.tica(all_data, dim=3) + # Sort the trajectory along them. + sorted_proj, sorted_indices_data, sorted_indices_traj = sort_traj_along_projection( + data, tica, top, trj, out_name, num_comp=num_ic, start_frame=start_frame + ) + return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_tic=3): +def sort_trajs_along_common_tic(data_a, data_b, top_a, top_b, trj_a, trj_b, out_name, num_ic=3, start_frame=0): """ Sort two trajectories along their most important common time-lagged independent components. @@ -216,8 +216,6 @@ def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a Trajectory data [frames,frame_data]. data_b : float array Trajectory data [frames,frame_data]. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top_a : str Reference topology for the first trajectory. top_b : str @@ -230,54 +228,37 @@ def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a Should be the same as data_b was from. out_name : str Core part of the name of the output files. + num_ic : int, optional + Sort along the first num_ic independent components. + Defaults to 3. + start_frame : int, optional + Offset of the data with respect to the trajectories (defined below). + Defaults to 0. 
+ + Returns + ------- + sorted_proj: list + sorted projections on each principal component + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component """ - # Combine the input data - data = np.concatenate([data_a,data_b],0) - # Remember which simulation the data came frome - cond = np.concatenate([np.ones(len(data_a)), np.zeros(len(data_b))]) - # Remember the index in the respective simulation (taking into account cutoff) - oidx = np.concatenate([np.arange(len(data_a))+start_frame, - np.arange(len(data_b))+start_frame]) - # Calculate the time-lagged independent components - tica = pyemma.coordinates.tica(data,dim=3) - # Define the MDAnalysis trajectories from where the frames come - ua = mda.Universe(top_a,trj_a) - ub = mda.Universe(top_b,trj_b) - # ... and select all atoms - aa = ua.select_atoms('all') - ab = ub.select_atoms('all') - # Loop over time-lagged independent components. - for evi in range(num_tic): - # Project the combined data on the time-lagged independent component - proj = project_on_tic(data,evi,tica=tica) - # Sort everything along the projection on th resp. PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - cond_sort = cond[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_tic"+str(evi+1)+".xtc", aa.n_atoms) as W: - for i in range(data.shape[0]): - if cond_sort[i] == 1: # G-protein bound - ts = ua.trajectory[oidx_sort[i]] - W.write(aa) - elif cond_sort[i] == 0: # arrestin bound - ts = ub.trajectory[oidx_sort[i]] - W.write(ab) - return proj, oidx_sort + sorted_proj, sorted_indices_data, sorted_indices_traj = sort_mult_trajs_along_common_tic( + [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_ic=3, start_frame = start_frame + ) + return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_mult_trajs_along_common_tic(data, start_frame, top, trj, out_name, num_tic=3): +def sort_mult_trajs_along_common_tic(data, top, trj, out_name, num_ic=3, start_frame=0): """ - Sort multiple trajectories along their most important common time-lagged independent components. + Sort multiple trajectories along their most important independent components. Parameters ---------- data : list of float arrays List of trajectory data arrays, each [frames,frame_data]. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top : list of str Reference topology files. trj : list of str @@ -285,88 +266,47 @@ def sort_mult_trajs_along_common_tic(data, start_frame, top, trj, out_name, num_ trj[i] should be the same as data[i] was from. out_name : str Core part of the name of the output files. + num_ic : int, optional + Sort along the first num_ic independent components. + Defaults to 3. + start_frame : int or list of int + Offset of the data with respect to the trajectories. + Defaults to 0. 
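# A usage sketch for the two-ensemble wrapper sort_trajs_along_common_tic(),
# which now just forwards to sort_mult_trajs_along_common_tic(). The feature
# arrays are synthetic stand-ins and the file names are placeholders.
import numpy as np
from pensa.dimensionality.tica import sort_trajs_along_common_tic

data_a = np.cumsum(np.random.randn(400, 12), axis=0)
data_b = np.cumsum(np.random.randn(400, 12), axis=0)
sorted_proj, sorted_idx_data, sorted_idx_traj = sort_trajs_along_common_tic(
    data_a, data_b,
    "traj/condition-a_receptor.gro", "traj/condition-b_receptor.gro",  # placeholders
    "traj/condition-a_receptor.xtc", "traj/condition-b_receptor.xtc",  # placeholders
    out_name="tica/receptor_common", num_ic=3,
)
# sorted_proj[0] holds the sorted projections of all frames on the first TIC.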
+ + Returns + ------- + sorted_proj: list + sorted projections on each independent component + sorted_indices_data : list + Sorted indices of the data array for each independent component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each independent component """ num_frames = [len(d) for d in data] num_traj = len(data) + if type(start_frame) == int: + start_frame *= np.ones(num_traj) + start_frame = start_frame.tolist() # Combine the input data - data = np.concatenate(data,0) - # Remember which simulation the data came frome - cond = np.concatenate([i*np.ones(num_frames[i]) for i in range(num_traj)]) - # Remember the index in the respective simulation (taking into account cutoff) - oidx = np.concatenate([np.arange(num_frames[i])+start_frame for i in range(num_traj)]) - # Calculate the time-lagged independent components - tica = pyemma.coordinates.tica(data,dim=3) - # Define the MDAnalysis trajectories from where the frames come - univs = [] - atoms = [] - for j in range(num_traj): - u = mda.Universe(top[j],trj[j]) - univs.append(u) - atoms.append(u.select_atoms('all')) - # Loop over time-lagged independent component. - for evi in range(num_tic): - # Project the combined data on the time-lagged independent component - proj = project_on_tic(data,evi,tica=tica) - # Sort everything along the projection on th resp. PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - cond_sort = cond[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_tic"+str(evi+1)+".xtc", atoms[0].n_atoms) as W: - for i in range(data.shape[0]): - j = cond_sort[i] - ts = univs[j].trajectory[oidx_sort[i]] - W.write(atoms[j]) - return - - -def compare_projections_tica(data_a, data_b, tica, num=3, saveas=None, label_a=None, label_b=None): - """ - Compare two datasets along a given time-lagged indepedent component. - - Parameters - ---------- - data_a : float array - Trajectory data [frames,frame_data] - data_b : float array - Trajectory data [frames,frame_data] - tica : TICA object - Time-lagged independent components information. - num : int, default=3 - Number of time-lagged independent components to plot. - saveas : str, optional - Name of the output file. - label_a : str, optional - Label for the first dataset. - label_b : str, optional - Label for the second dataset. 
- - """ - # Start the figure - fig,ax = plt.subplots(num, 2, figsize=[8,3*num], dpi=300) - # Loop over PCs - for evi in range(num): - # Calculate values along TIC for each frame - proj_a = project_on_tic(data_a, evi, tica=tica) - proj_b = project_on_tic(data_b, evi, tica=tica) - # Plot the time series in the left panel - ax[evi,0].plot(proj_a, alpha=0.5, label=label_a) - ax[evi,0].plot(proj_b, alpha=0.5, label=label_b) - ax[evi,0].set_xlabel('frame number') - ax[evi,0].set_ylabel('TIC %i'%(evi+1)) - # Plot the histograms in the right panel - ax[evi,1].hist(proj_a, bins=30, alpha=0.5, density=True, label=label_a) - ax[evi,1].hist(proj_b, bins=30, alpha=0.5, density=True, label=label_b) - ax[evi,1].set_xlabel('TIC %i'%(evi+1)) - ax[evi,1].set_ylabel('frequency') - # Legend - if label_a and label_b: - ax[evi,0].legend() - ax[evi,1].legend() - fig.tight_layout() - # Save the figure - if saveas is not None: - fig.savefig(saveas, dpi=300) - return + all_data = np.concatenate(data,0) + # Calculate the independent components + tica = pyemma.coordinates.tica(all_data, dim=3) + # Initialize output + sorted_proj = [] + sorted_indices_data = [] + sorted_indices_traj = [] + # Loop over principal components. + for evi in range(num_ic): + # Project the combined data on the independent component + proj = [project_on_tic(d, evi, tica=tica) for d in data] + # Sort everything along the projection on the respective independent component + out_xtc = out_name+"_tic"+str(evi+1)+".xtc" + proj_sort, sort_idx, oidx_sort = merge_and_sort_coordinates( + proj, top, trj, out_xtc, start_frame=start_frame, verbose=False + ) + sorted_proj.append(proj_sort) + sorted_indices_data.append(sort_idx) + sorted_indices_traj.append(oidx_sort) + return sorted_proj, sorted_indices_data, sorted_indices_traj + diff --git a/pensa/dimensionality/visualization.py b/pensa/dimensionality/visualization.py new file mode 100644 index 00000000..7263a7a3 --- /dev/null +++ b/pensa/dimensionality/visualization.py @@ -0,0 +1,183 @@ +import numpy as np +import pyemma +from pyemma.util.contexts import settings +import MDAnalysis as mda +import matplotlib.pyplot as plt +from pensa.preprocessing import sort_coordinates, merge_and_sort_coordinates + + +def project_on_eigenvector(data, ev_idx, ana): + """ + Projects a trajectory onto an eigenvector of its PCA/tICA. + + Parameters + ---------- + data : float array + Trajectory data [frames,frame_data]. + ev_idx : int + Index of the eigenvector to project on (starts with zero). + ana : PCA or tICA obj + Information of pre-calculated PCA or tICA. + Must be calculated for the same features (but not necessarily the same trajectory). + + Returns + ------- + projection : float array + Value along the PC for each frame. + + """ + # Project the features onto the components + projection = np.zeros(data.shape[0]) + for ti in range(data.shape[0]): + projection[ti] = np.dot(data[ti], ana.eigenvectors[:,ev_idx]) + # Return the value along the PC for each frame + return projection + + +def compare_projections(data_a, data_b, ana, num=3, saveas=None, label_a=None, label_b=None): + """ + Compare two datasets along the components of a PCA or tICA. + + Parameters + ---------- + data_a : float array + Trajectory data [frames,frame_data]. + data_b : float array + Trajectory data [frames,frame_data]. + ana : PCA or tICA object + Components analysis information. + num : int + Number of components to plot. + saveas : str, optional + Name of the output file. + label_a : str, optional + Label for the first dataset. 
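# A self-contained check of project_on_eigenvector(): it behaves identically
# for PCA and TICA objects because both expose an .eigenvectors matrix. Only
# synthetic data and a plain PyEMMA PCA are used here.
import numpy as np
import pyemma
from pensa.dimensionality.visualization import project_on_eigenvector

data = np.random.randn(300, 10)
pca = pyemma.coordinates.pca(data)
proj = project_on_eigenvector(data, 0, pca)
assert proj.shape == (300,)
assert np.allclose(proj, data @ pca.eigenvectors[:, 0])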
+ label_b : str, optional + Label for the second dataset. + + Returns: + -------- + projections : list of float arrays + Projections of the trajectory on each component. + + """ + if label_a is not None and label_b is not None: + labels = [label_a, label_b] + else: + labels = None + projections = compare_mult_projections([data_a, data_b], ana, num=num, saveas=saveas, labels=labels, colors=None) + return projections + + +def compare_mult_projections(data, ana, num=3, saveas=None, labels=None, colors=None): + """ + Compare multiple datasets along the components of a PCA or tICA. + + Parameters + ---------- + data : list of float arrays + Data from multiple trajectories [frames,frame_data]. + ana : PCA or tICA object + Components analysis information. + num : int + Number of principal components to plot. + saveas : str, optional + Name of the output file. + labels : list of str, optional + Labels for the datasets. If provided, it must have the same length as data. + + Returns: + -------- + projections : list of float arrays + Projections of the trajectory on each principal component. + + """ + if labels is not None: + assert len(labels) == len(data) + else: + labels = [None for _ in range(len(data))] + if colors is not None: + assert len(colors) == len(data) + else: + colors = ['C%i'%num for num in range(len(data))] + # Start the figure + fig,ax = plt.subplots(num, 2, figsize=[9,3*num], dpi=300) + # Loop over components + projections = [] + for evi in range(num): + proj_evi = [] + for j,d in enumerate(data): + # Calculate values along PC for each frame + proj = project_on_eigenvector(d, evi, ana) + # Plot the time series in the left panel + ax[evi,0].plot(proj, alpha=0.5, + label=labels[j], color=colors[j]) + # Plot the histograms in the right panel + ax[evi,1].hist(proj, bins=30, alpha=0.5, density=True, + label=labels[j], color=colors[j]) + proj_evi.append(proj) + projections.append(proj_evi) + # Axis labels + ax[evi,0].set_xlabel('frame number') + ax[evi,0].set_ylabel('PC %i'%(evi+1)) + ax[evi,1].set_xlabel('PC %i'%(evi+1)) + ax[evi,1].set_ylabel('frequency') + # Legend + if labels[0] is not None: + ax[evi,0].legend() + ax[evi,1].legend() + fig.tight_layout() + # Save the figure + if saveas is not None: + fig.savefig(saveas, dpi=300) + return projections + + +def sort_traj_along_projection(data, ana, top, trj, out_name, num_comp=3, start_frame=0): + """ + Sort a trajectory along given principal components. + + Parameters + ---------- + data : float array + Trajectory data [frames,frame_data]. + ana : PCA or tICA obj + Components information. + top : str + File name of the reference topology for the trajectory. + trj : str + File name of the trajetory from which the frames are picked. + Should be the same as data was from. + out_name : str + Core part of the name of the output files + num_comp : int, optional + Sort along the first num_comp components. + start_frame : int, optional + Offset of the data with respect to the trajectories (defined below). 
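# A usage sketch for compare_mult_projections() with three synthetic ensembles
# that share one PCA. The output file name is arbitrary; the returned list is
# organised per component, then per data set.
import numpy as np
import pyemma
from pensa.dimensionality.visualization import compare_mult_projections

ensembles = [np.random.randn(200, 6) + shift for shift in (0.0, 0.5, 1.0)]
pca = pyemma.coordinates.pca(np.concatenate(ensembles, 0))
projections = compare_mult_projections(
    ensembles, pca, num=2,
    labels=["cond A", "cond B", "cond C"],
    saveas="pc_comparison.png",
)
assert len(projections) == 2 and len(projections[0]) == 3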
+ + Returns + ------- + sorted_proj: list + sorted projections on each component + sorted_indices_data : list + Sorted indices of the data array for each component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each component + + """ + # Initialize output + sorted_proj = [] + sorted_indices_data = [] + sorted_indices_traj = [] + # Loop through the principal components + for evi in range(num_comp): + # Project the combined data on the principal component + proj = project_on_eigenvector(data, evi, ana) + # Sort everything along the projection onto the PC + out_xtc = out_name+"_pc"+str(evi+1)+".xtc" + proj_sort, sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) + sorted_proj.append(proj_sort) + sorted_indices_data.append(sort_idx) + sorted_indices_traj.append(oidx_sort) + return sorted_proj, sorted_indices_data, sorted_indices_traj + diff --git a/pensa/features/processing.py b/pensa/features/processing.py index 52f7c948..34ec2858 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -314,6 +314,5 @@ def sort_traj_along_feature(feat, data, feature_name, ref_name, trj_name, out_na """ if verbose: print('Sorting along feature '+feature_name) d = get_feature_data(feat, data, feature_name) - sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) - d_sorted = d[sort_idx] + d_sorted, sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) return d_sorted diff --git a/pensa/preprocessing/coordinates.py b/pensa/preprocessing/coordinates.py index 8110ee8a..db3a9a68 100644 --- a/pensa/preprocessing/coordinates.py +++ b/pensa/preprocessing/coordinates.py @@ -9,15 +9,15 @@ # -- Functions to preprocess trajectories -- -def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, +def extract_coordinates(top, pdb, trj_list, out_name, sel_string, start_frame=0, rename_segments=None, residues_offset=0 ): """ Extracts selected coordinates from a trajectory file. Parameters ---------- - ref : str - File name for reference topology. + top : str + File name for topology. Can read all MDAnalysis-compatible topology formats. pdb : str File name for the reference PDB file. @@ -30,8 +30,8 @@ def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, First frame to read from the trajectory. """ - # Read the reference+PDB files and extract selected parts. - u = mda.Universe(ref,pdb) + # Read the topology+PDB files and extract selected parts. + u = mda.Universe(top,pdb) u.residues.resids -= residues_offset selection = u.select_atoms(sel_string) num_at = selection.n_atoms @@ -43,7 +43,7 @@ def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, # Read the trajectories and extract selected parts. with mda.Writer(out_name+'.xtc', selection.n_atoms) as W: for trj in trj_list: - u = mda.Universe(ref,trj) + u = mda.Universe(top,trj) u.residues.resids -= residues_offset selection = u.select_atoms(sel_string) for ts in u.trajectory[start_frame:]: @@ -51,14 +51,14 @@ def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, return num_at -def extract_coordinates_combined(ref, trj, sel_string, out_name, start_frame=0, verbose=False): +def extract_coordinates_combined(top, trj, sel_string, out_name, start_frame=0, verbose=False): """ Extracts selected coordinates from several trajectory files. 
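# A usage sketch for the renamed preprocessing arguments: the first argument of
# extract_coordinates() is now called 'top' and accepts any
# MDAnalysis-compatible topology. All file names and the selection string are
# placeholders for a real system.
from pensa.preprocessing.coordinates import extract_coordinates

num_at = extract_coordinates(
    top="system/condition-a.psf",                 # placeholder topology
    pdb="system/condition-a.pdb",                 # placeholder reference PDB
    trj_list=["traj/condition-a_run1.xtc",
              "traj/condition-a_run2.xtc"],       # placeholder trajectories
    out_name="traj/condition-a_receptor",
    sel_string="protein and segid A",             # placeholder MDAnalysis selection
    start_frame=100,
)
print("atoms in selection:", num_at)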
Parameters ---------- - ref : list of str - File names for the reference topologies. + top : list of str + File names for the topologies. Can read all MDAnalysis-compatible topology formats. trj : list of str File names for the input trajectories. @@ -70,12 +70,12 @@ def extract_coordinates_combined(ref, trj, sel_string, out_name, start_frame=0, """ # Determine the number of atoms from the first trajectory - u = mda.Universe(ref[0], trj[0]) + u = mda.Universe(top[0], trj[0]) selection = u.select_atoms(sel_string[0]) num_at = selection.n_atoms # Go through trajectories and write selections with mda.Writer(out_name+'.xtc', num_at) as W: - for r, t, s in zip(ref, trj, sel_string): + for r, t, s in zip(top, trj, sel_string): print(r, t) if verbose: print(s) u = mda.Universe(r, t) @@ -85,14 +85,14 @@ def extract_coordinates_combined(ref, trj, sel_string, out_name, start_frame=0, return num_at -def merge_coordinates(ref_files, trj_files, out_name, segid=None): +def merge_coordinates(top_files, trj_files, out_name, segid=None): """ Merge several trajectories of the same system or system part. All trajectories must be (at least) as long as the first one. Parameters ---------- - ref_files : str[] + top_files : str[] List of input topology files. trj_files : str[]: List of input trajectory files. @@ -107,10 +107,10 @@ def merge_coordinates(ref_files, trj_files, out_name, segid=None): MDAnalysis universe of the merged system. """ - num_parts = len(ref_files) + num_parts = len(top_files) assert num_parts == len(trj_files) # Create an array of universes - u = [ mda.Universe(ref_files[i],trj_files[i]) for i in range(num_parts) ] + u = [ mda.Universe(top_files[i],trj_files[i]) for i in range(num_parts) ] num_frames = len(u[0].trajectory) new_num_at = sum([len(ui.atoms) for ui in u]) # Create the merged starting structure @@ -135,17 +135,17 @@ def merge_coordinates(ref_files, trj_files, out_name, segid=None): return univ -def align_coordinates(ref, pdb, trj_list, out_name, sel_string='all', start_frame=0): +def align_coordinates(top, pdb, trj_list, out_name, sel_string='all', start_frame=0): """ Aligns selected coordinates from a trajectory file. Parameters ---------- - ref : str - File name for reference topology. + top : str + File name for the topology. Can read all MDAnalysis-compatible topology formats. pdb : str - File name for reference PDB file. + File name for the reference PDB file. trj_list : list of str File names for the input trajectory. Can read all MDAnalysis-compatible trajectory formats. @@ -154,46 +154,49 @@ def align_coordinates(ref, pdb, trj_list, out_name, sel_string='all', start_fram start_frame : int, optional First frame to read from the trajectory. """ - # Read the reference+PDB files and align selected parts. - u = mda.Universe(ref, pdb) + # Read the topology+PDB files and align selected parts. + u = mda.Universe(top, pdb) for trj in trj_list: - mobile = mda.Universe(ref, trj) + mobile = mda.Universe(top, trj) #mobile.trajectory = mobile.trajectory[start_frame:] alignment = align.AlignTraj(mobile, u, select=sel_string, filename=f'{out_name}.xtc') alignment.run() -def sort_coordinates(values, ref_name, trj_name, out_name, start_frame=0, verbose=False): +def sort_coordinates(values, top_name, trj_name, out_name, start_frame=0, verbose=False): """ - Sort coordinate frames along an array of values. + Sort coordinate frames along corresponding values. Parameters ---------- - values: float array. + values: float array Values along which to sort the trajectory. - ref_name: string. 
- reference topology for the trajectory. - trj_name: string. + top_name: str + Topology for the trajectory. + trj_name: str Trajetory from which the frames are picked. Usually the same as the values are from. - out_name: string. - Name of the output files + out_name: str + Name of the output trajectory (usual format is .xtc). start_frame: int Offset of the data with respect to the trajectories. Returns ------- + data_sort: float array + Sorted values of the input data. sort_idx: float array Sorted indices of the values. oidx_sort: float array Sorted indices of the trajectory. + data_sort: float array """ # Remember the index in the simulation (taking into account offset) - oidx = np.arange(len(values))+start_frame + oidx = np.arange(len(values)) + start_frame # Define the MDAnalysis trajectory from where the frames come - if verbose: print('Loading:', ref_name, trj_name) - u = mda.Universe(ref_name, trj_name) + if verbose: print('Loading:', top_name, trj_name) + u = mda.Universe(top_name, trj_name) if verbose: print('Trajectory length:', len(u.trajectory)) print('Number of values: ', len(values)) @@ -201,12 +204,93 @@ def sort_coordinates(values, ref_name, trj_name, out_name, start_frame=0, verbos a = u.select_atoms('all') # Sort everything along the projection on the values sort_idx = np.argsort(values) + data_sort = values[sort_idx] oidx_sort = oidx[sort_idx] # Write out sorted trajectory with mda.Writer(out_name, a.n_atoms) as W: for i in range(len(values)): ts = u.trajectory[oidx_sort[i]] W.write(a) - return sort_idx, oidx_sort + return data_sort, sort_idx, oidx_sort +def merge_and_sort_coordinates(values, top_names, trj_names, out_name, start_frame=0, verbose=False): + """ + Write multiple trajectories of coordinate frames into one trajectory, sorted along corresponding values. + + Parameters + ---------- + values: list of float arrays + Values along which to sort the trajectory. + top_names: list of str + topology for the trajectory. + trj_names: list of str + Trajetory from which the frames are picked. + Usually the same as the values are from. + out_name: str + Name of the output trajectory (usual format is .xtc). + start_frame: int or list of int + Offsets of the data with respect to the trajectories. + Defaults to zero. + + Returns + ------- + data_sort: float array + Sorted values of the input data. + sort_idx: float array + Sorted indices of the values. + oidx_sort: float array + Sorted indices of the trajectory. 
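# A usage sketch for the extended sort_coordinates() return signature: the
# sorted values are now returned as well, so callers no longer have to
# re-index the input array themselves. The value file and trajectory files are
# placeholders; the import line is the one used in this diff.
import numpy as np
from pensa.preprocessing import sort_coordinates

values = np.loadtxt("results/condition-a_feature.dat")     # placeholder per-frame values
data_sort, sort_idx, oidx_sort = sort_coordinates(
    values,
    "traj/condition-a_receptor.gro",                       # placeholder topology
    "traj/condition-a_receptor.xtc",                       # placeholder trajectory
    "sorted/condition-a_by_feature.xtc",                   # output trajectory
    start_frame=0,
)
assert np.all(data_sort == values[sort_idx])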
+ + """ + assert type(values) == list and type(top_names) == list and type(trj_names) == list + # Get some stats + num_traj = len(values) + num_frames = [len(val) for val in values] + # Number of values must be consistent with topologies and trajectories + assert num_traj == len(top_names) + assert num_traj == len(trj_names) + # Set offset if not provided + assert type(start_frame) == int or len(start_frame) == num_traj + if type(start_frame) == int: + start_frame *= np.ones(num_traj) + start_frame = start_frame.tolist() + # Make sure the start indices are integers (MDA does not accept floats for indexing a trajectory) + start_frame = [int(sf) for sf in start_frame] + + # Combine the input data + data = np.concatenate(values) + # Remember which simulation the data came frome + cond = np.concatenate([i*np.ones(num_frames[i], dtype=int) for i in range(num_traj)]) + # Remember the index in the respective simulation (taking into account cutoff) + oidx = np.concatenate([np.arange(num_frames[i], dtype=int) + start_frame[i] for i in range(num_traj)]) + assert type(oidx[0]==int) + + # Define the MDAnalysis trajectories from where the frames come + univs = [] + atoms = [] + for j in range(num_traj): + u = mda.Universe(top_names[j],trj_names[j]) + if verbose: print('Length of trajectory',len(u.trajectory)) + univs.append(u) + atoms.append(u.select_atoms('all')) + # Make sure the numbers of atoms are the same in each trajectory + assert atoms[j].n_atoms == atoms[0].n_atoms + + # Sort everything along the data + sort_idx = np.argsort(data) + data_sort = data[sort_idx] + cond_sort = cond[sort_idx] + oidx_sort = oidx[sort_idx] + + # Write the trajectory, ordered along the PC + with mda.Writer(out_name, atoms[0].n_atoms) as W: + for i in range(len(data)): + j = cond_sort[i] + o = oidx_sort[i] + uj = univs[j] + ts = uj.trajectory[o] + W.write(atoms[j]) + + return data_sort, sort_idx, oidx_sort + diff --git a/scripts/calculate_combined_principal_components.py b/scripts/calculate_combined_principal_components.py index 3e0b58d7..d25fe8a3 100644 --- a/scripts/calculate_combined_principal_components.py +++ b/scripts/calculate_combined_principal_components.py @@ -68,9 +68,9 @@ args.num_components, args.feat_threshold, plot_file=args.out_plots+"_"+ftype+"_feature_correlation.pdf") # Sort each of the trajectories along the top components of combined data - sort_trajs_along_common_pc(data_a[ftype], data_b[ftype], args.start_frame, + sort_trajs_along_common_pc(data_a[ftype], data_b[ftype], args.ref_file_a, args.ref_file_b, args.trj_file_a, args.trj_file_b, - args.out_pc, num_pc=args.num_components) + args.out_pc, num_pc=args.num_components, start_frame=args.start_frame) # Plot histograms of both simulations along the common PCs compare_projections(data_a[ftype], data_b[ftype], pca, num=args.num_components, saveas=args.out_plots+"_"+ftype+"_pc-comparison.pdf") diff --git a/tests/test_workflow.py b/tests/test_workflow.py index b9ddd99f..5ef9a44c 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -81,23 +81,23 @@ def setUp(self): # -- Sort trajectory along common pc self.sort_common_traj = sort_trajs_along_common_pc(self.sim_a_tmr_data['bb-torsions'], self.sim_b_tmr_data['bb-torsions'], - start_frame, test_data_path + "/traj/condition-a_receptor.gro", test_data_path + "/traj/condition-b_receptor.gro", test_data_path + "/traj/condition-a_receptor.xtc", test_data_path + "/traj/condition-b_receptor.xtc", test_data_path + "/pca/receptor_by_tmr", - num_pc=3) + num_pc=3, start_frame = start_frame) 
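# A usage sketch for merge_and_sort_coordinates(): frames from several
# trajectories are written into a single output file, ordered by the supplied
# per-frame values. The arrays are stand-ins and the file names placeholders;
# start_frame can be a single offset or one offset per trajectory.
import numpy as np
from pensa.preprocessing import merge_and_sort_coordinates

values_a = np.random.rand(1000)          # stand-in for projections of ensemble A
values_b = np.random.rand(800)           # stand-in for projections of ensemble B
data_sort, sort_idx, oidx_sort = merge_and_sort_coordinates(
    [values_a, values_b],
    ["traj/condition-a_receptor.gro", "traj/condition-b_receptor.gro"],   # placeholders
    ["traj/condition-a_receptor.xtc", "traj/condition-b_receptor.xtc"],   # placeholders
    "receptor_sorted_by_value.xtc",
    start_frame=[0, 2000],               # one offset per input trajectory
)
# oidx_sort maps every output frame back to its index in the source trajectory.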
plt.close() # -- Sort trajectory pc pca_a = calculate_pca(self.sim_a_tmr_data['bb-torsions']) pca_features(pca_a, self.sim_a_tmr_feat['bb-torsions'], 3, 0.4) plt.close() - self.all_sort, _ = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], pca_a, 0, - test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", num_pc=3) + self.all_sort, _, _ = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], + test_data_path + "/traj/condition-a_receptor.gro", + test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/pca/condition-a_receptor_by_tmr", + pca=pca_a, num_pc=3) # -- Compare projections self.val = compare_projections(self.sim_a_tmr_data['bb-torsions'], @@ -273,31 +273,31 @@ def test_tica_features(self): # -- sort_trajs_along_common_pc() + sort_traj_along_pc() + project_on_pc() def test_sort_trajs_along_pc(self): - self.assertEqual(len(self.sort_common_traj), 180) for ele in self.sort_common_traj: - self.assertEqual(ele.n_atoms, 2322) + self.assertEqual(len(ele), 3) self.assertEqual(len(self.all_sort), 3) # -- sort_trajs_along_common_tic() def test_sort_trajs_along_common_tic(self): - proj, atom = sort_trajs_along_common_tic(self.sim_a_tmr_data['bb-torsions'], - self.sim_b_tmr_data['bb-torsions'], 0, + sproj, sidx_data, sidx_traj = sort_trajs_along_common_tic(self.sim_a_tmr_data['bb-torsions'], + self.sim_b_tmr_data['bb-torsions'], test_data_path + "/traj/condition-a_receptor.gro", test_data_path + "/traj/condition-b_receptor.gro", test_data_path + "/traj/condition-a_receptor.xtc", test_data_path + "/traj/condition-b_receptor.xtc", test_data_path + "/tica/receptor_by_tmr", - num_tic=3) - self.assertEqual(len(proj), 60) - self.assertEqual(len(atom), 60) + num_ic=3) + self.assertEqual(len(sproj[0]), 60) + self.assertEqual(len(sidx_data[0]), 60) # -- sort_traj_along_tic() def test_sort_traj_along_tic(self): - all_sort, _ = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], self.tica_combined, 0, - test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", num_tic=3) + all_sort, _, _ = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], + test_data_path + "/traj/condition-a_receptor.gro", + test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/pca/condition-a_receptor_by_tmr", + tica = self.tica_combined, num_ic=3) self.assertEqual(len(all_sort), 3) From cdb837816cd4e74a828e54a715bc03d132a38c77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Wed, 18 Aug 2021 01:01:35 -0700 Subject: [PATCH 18/23] dim and lag options + bugfixes --- pensa/dimensionality/pca.py | 33 +++++++++------ pensa/dimensionality/tica.py | 79 ++++++++++++++++++++++-------------- 2 files changed, 69 insertions(+), 43 deletions(-) diff --git a/pensa/dimensionality/pca.py b/pensa/dimensionality/pca.py index 6d79d4cb..2edbf756 100644 --- a/pensa/dimensionality/pca.py +++ b/pensa/dimensionality/pca.py @@ -12,7 +12,7 @@ # http://www.emma-project.org/latest/api/generated/pyemma.coordinates.pca.html -def calculate_pca(data): +def calculate_pca(data, dim=-1): """ Performs a PyEMMA PCA on the provided data. @@ -20,6 +20,9 @@ def calculate_pca(data): ---------- data : float array Trajectory data [frames,frame_data]. + dim : int, optional, default -1 + The number of dimensions (principal components) to project onto. 
+        -1 means all numerically available dimensions will be used.
+
     Returns
     -------
@@ -27,7 +30,7 @@
         Principal components information.
 
     """
-    pca = pyemma.coordinates.pca(data)
+    pca = pyemma.coordinates.pca(data, dim=dim)
     return pca
 
 
@@ -98,7 +101,7 @@ def pca_features(pca, features, num, threshold, plot_file=None):
     return test_graph, test_corr
 
 
-def project_on_pc(data, ev_idx, pca=None):
+def project_on_pc(data, ev_idx, pca=None, dim=-1):
     """
     Projects a trajectory onto an eigenvector of its PCA.
 
@@ -111,7 +114,9 @@
     pca : PCA obj, optional
         Information of pre-calculated PCA. Defaults to None.
         Must be calculated for the same features (but not necessarily the same trajectory).
-
+    dim : int, optional, default -1
+        The number of dimensions (principal components) to project onto.
+        Only used if pca is not provided.
     Returns
     -------
         projection : float array
@@ -119,14 +124,13 @@
 
     """
     # Perform PCA if none is provided.
-    if pca is None:
-        pca = pyemma.coordinates.pca(data)
+    if pca is None: pca = calculate_pca(data, dim=dim)
     # Project the features onto the principal components.
     projection = project_on_eigenvector(data, ev_idx, pca)
     return projection
 
 
-def get_components_pca(data, num, pca=None, prefix=''):
+def get_components_pca(data, num, pca=None, dim=-1, prefix=''):
     """
     Projects a trajectory onto the first num eigenvectors of its PCA.
 
@@ -139,6 +143,11 @@
     pca : PCA obj, optional
         Information of pre-calculated PCA. Defaults to None.
         Must be calculated for the same features (but not necessarily the same trajectory).
+    dim : int, optional, default = -1
+        The number of dimensions (principal components) to project onto.
+        Only used if pca is not provided.
+    prefix : str, optional, default = ''
+        First part of the component names. Second part is "PC" and the component number.
 
     Returns
     -------
@@ -149,8 +158,7 @@
     """
     # Perform PCA if none is provided
-    if pca is None:
-        pca = pyemma.coordinates.pca(data)
+    if pca is None: pca = calculate_pca(data, dim=dim)
     # Project the features onto the principal components
     comp_names = []
     components = []
@@ -201,8 +209,7 @@
     """
     # Calculate the principal components if they are not given.
-    if pca is None:
-        pca = pyemma.coordinates.pca(all_data, dim=3)
+    if pca is None: pca = calculate_pca(data, dim=num_pc)
     # Sort the trajectory along them.
     sorted_proj, sorted_indices_data, sorted_indices_traj = sort_traj_along_projection(
         data, pca, top, trj, out_name, num_comp=num_pc, start_frame = start_frame
@@ -250,7 +257,7 @@ def sort_trajs_along_common_pc(data_a, data_b, top_a, top_b, trj_a, trj_b, out_n
     """
     sorted_proj, sorted_indices_data, sorted_indices_traj = sort_mult_trajs_along_common_pc(
-        [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_pc=3, start_frame = start_frame
+        [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_pc=num_pc, start_frame = start_frame
     )
     return sorted_proj, sorted_indices_data, sorted_indices_traj
@@ -295,7 +302,7 @@ def sort_mult_trajs_along_common_pc(data, top, trj, out_name, num_pc=3, start_fr
     # Combine the input data
     all_data = np.concatenate(data,0)
     # Calculate the principal component
-    pca = pyemma.coordinates.pca(all_data, dim=3)
+    pca = calculate_pca(all_data)
     # Initialize output
     sorted_proj = []
     sorted_indices_data = []
diff --git a/pensa/dimensionality/tica.py b/pensa/dimensionality/tica.py
index 01546eaa..062ef5b0 100644
--- a/pensa/dimensionality/tica.py
+++ b/pensa/dimensionality/tica.py
@@ -12,7 +12,7 @@
 # http://emma-project.org/latest/api/generated/pyemma.coordinates.tica.html
 
 
-def calculate_tica(data):
+def calculate_tica(data, dim=-1, lag=10):
     """
     Performs a PyEMMA TICA on the provided data.
 
@@ -20,6 +20,11 @@
     ----------
     data : float array
         Trajectory data. Format: [frames,frame_data].
+    dim : int, optional, default -1
+        The number of dimensions (independent components) to project onto.
+        -1 means all numerically available dimensions will be used.
+    lag : int, optional, default = 10
+        The lag time, in multiples of the input time step.
 
     Returns
     -------
@@ -27,13 +32,13 @@
         Time-lagged independent component information.
 
     """
-    tica = pyemma.coordinates.tica(data)
+    tica = pyemma.coordinates.tica(data, dim=dim, lag=lag)
     return tica
 
 
 def tica_eigenvalues_plot(tica, num=12, plot_file=None):
     """
-    Plots the highest eigenvalues over the numberr of the time-lagged independent components.
+    Plots the highest eigenvalues over the number of the time-lagged independent components.
 
     Parameters
     ----------
@@ -94,7 +99,7 @@ def tica_features(tica, features, num, threshold, plot_file=None):
     return test_feature
 
 
-def project_on_tic(data, ev_idx, tica=None):
+def project_on_tic(data, ev_idx, tica=None, dim=-1, lag=10):
     """
     Projects a trajectory onto an eigenvector of its TICA.
 
@@ -107,22 +112,27 @@
     tica : TICA obj, optional
         Information of pre-calculated TICA.
         Must be calculated for the same features (but not necessarily the same trajectory).
-
+    dim : int, optional, default -1
+        The number of dimensions (independent components) to project onto.
+        Only used if tica is not provided.
+    lag : int, optional, default = 10
+        The lag time, in multiples of the input time step.
+        Only used if tica is not provided.
+
     Returns
     -------
         projection : float array
             Value along the TIC for each frame.
 
     """
-    # Perform TICA if none is provided.
-    if tica is None:
-        tica = pyemma.coordinates.tica(data)
+    # Perform standard TICA if none is provided.
+    if tica is None: tica = calculate_tica(data, dim=dim, lag=lag)
     # Project the features onto the time-lagged independent components.
projection = project_on_eigenvector(data, ev_idx, tica) return projection -def get_components_tica(data, num, tica=None, prefix=''): +def get_components_tica(data, num, tica=None, dim=-1, lag=10, prefix=''): """ Projects a trajectory onto the first num eigenvectors of its tICA. @@ -135,6 +145,14 @@ def get_components_tica(data, num, tica=None, prefix=''): tica : tICA obj, optional Information of pre-calculated tICA. Defaults to None. Must be calculated for the same features (but not necessarily the same trajectory). + dim : int, optional, default -1 + The number of dimensions (independent components) to project onto. + Only used if tica is not provided. + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + prefix : str, optional, default = '' + First part of the component names. Second part is "IC"+ Returns ------- @@ -145,8 +163,7 @@ def get_components_tica(data, num, tica=None, prefix=''): """ # Perform tICA if none is provided. - if tica is None: - tica = pyemma.coordinates.tica(data) + if tica is None: calculate_tica(data, lag=lag) # Project the features onto the principal components. comp_names = [] components = [] @@ -160,7 +177,7 @@ def get_components_tica(data, num, tica=None, prefix=''): return comp_names, np.array(components).T -def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, start_frame=0): +def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, lag=10, start_frame=0): """ Sort a trajectory along independent components. @@ -179,12 +196,13 @@ def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, start_fra Time-lagged independent components information. If none is provided, it will be calculated. Defaults to None. - num_ic : int, optional + num_ic : int, optional, default = 3 Sort along the first num_ic independent components. - Defaults to 3. - start_frame : int, optional + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + start_frame : int, optional, default = 0 Offset of the data with respect to the trajectories (defined below). - Defaults to 0. Returns ------- @@ -197,8 +215,7 @@ def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, start_fra """ # Calculate the principal components if they are not given. - if tica is None: - tica = pyemma.coordinates.tica(all_data, dim=3) + if tica is None: tica = calculate_tica(all_data, dim=num_ic, lag=lag) # Sort the trajectory along them. sorted_proj, sorted_indices_data, sorted_indices_traj = sort_traj_along_projection( data, tica, top, trj, out_name, num_comp=num_ic, start_frame=start_frame @@ -206,7 +223,7 @@ def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, start_fra return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_trajs_along_common_tic(data_a, data_b, top_a, top_b, trj_a, trj_b, out_name, num_ic=3, start_frame=0): +def sort_trajs_along_common_tic(data_a, data_b, top_a, top_b, trj_a, trj_b, out_name, num_ic=3, lag=10, start_frame=0): """ Sort two trajectories along their most important common time-lagged independent components. @@ -228,12 +245,13 @@ def sort_trajs_along_common_tic(data_a, data_b, top_a, top_b, trj_a, trj_b, out_ Should be the same as data_b was from. out_name : str Core part of the name of the output files. - num_ic : int, optional + num_ic : int, optional, default = 3 Sort along the first num_ic independent components. - Defaults to 3. 
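# A self-contained sketch of the new dim/lag options: a tICA with an explicit
# lag time and named components extracted from it. The synthetic, correlated
# data only serves to make the call runnable.
import numpy as np
from pensa.dimensionality.tica import calculate_tica, get_components_tica

data = np.cumsum(np.random.randn(1000, 12), axis=0)
tica = calculate_tica(data, lag=50)                    # lag of 50 frames instead of 10
names, components = get_components_tica(data, 3, tica=tica, prefix="combined ")
assert names == ["combined IC1", "combined IC2", "combined IC3"]
assert components.shape == (1000, 3)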
- start_frame : int, optional + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + start_frame : int, optional, default = 0 Offset of the data with respect to the trajectories (defined below). - Defaults to 0. Returns ------- @@ -246,12 +264,12 @@ def sort_trajs_along_common_tic(data_a, data_b, top_a, top_b, trj_a, trj_b, out_ """ sorted_proj, sorted_indices_data, sorted_indices_traj = sort_mult_trajs_along_common_tic( - [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_ic=3, start_frame = start_frame + [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_ic=num_ic, lag=lag, start_frame = start_frame ) return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_mult_trajs_along_common_tic(data, top, trj, out_name, num_ic=3, start_frame=0): +def sort_mult_trajs_along_common_tic(data, top, trj, out_name, num_ic=3, lag=10, start_frame=0): """ Sort multiple trajectories along their most important independent components. @@ -266,12 +284,13 @@ def sort_mult_trajs_along_common_tic(data, top, trj, out_name, num_ic=3, start_f trj[i] should be the same as data[i] was from. out_name : str Core part of the name of the output files. - num_ic : int, optional + num_ic : int, optional, default = 3 Sort along the first num_ic independent components. - Defaults to 3. - start_frame : int or list of int + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + start_frame : int or list of int, default = 0 Offset of the data with respect to the trajectories. - Defaults to 0. Returns ------- @@ -291,7 +310,7 @@ def sort_mult_trajs_along_common_tic(data, top, trj, out_name, num_ic=3, start_f # Combine the input data all_data = np.concatenate(data,0) # Calculate the independent components - tica = pyemma.coordinates.tica(all_data, dim=3) + tica = pyemma.coordinates.tica(all_data, lag=lag) # Initialize output sorted_proj = [] sorted_indices_data = [] From 4e411fa21dae74a3b06f1adc20f9ebeb83cbad11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Wed, 18 Aug 2021 01:08:35 -0700 Subject: [PATCH 19/23] bugfix --- pensa/dimensionality/tica.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pensa/dimensionality/tica.py b/pensa/dimensionality/tica.py index 062ef5b0..df473b43 100644 --- a/pensa/dimensionality/tica.py +++ b/pensa/dimensionality/tica.py @@ -215,11 +215,11 @@ def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, lag=10, s """ # Calculate the principal components if they are not given. - if tica is None: tica = calculate_tica(all_data, dim=num_ic, lag=lag) + if tica is None: tica = calculate_tica(data, dim=num_ic, lag=lag) # Sort the trajectory along them. 
sorted_proj, sorted_indices_data, sorted_indices_traj = sort_traj_along_projection( data, tica, top, trj, out_name, num_comp=num_ic, start_frame=start_frame - ) + ) return sorted_proj, sorted_indices_data, sorted_indices_traj From db49325eb7e6b7612eaa69570d7b4253f8021d73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Thu, 19 Aug 2021 08:03:23 -0700 Subject: [PATCH 20/23] fix bugs in distance heatmap --- pensa/comparison/visualization.py | 74 ++++++++++++++++++------------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/pensa/comparison/visualization.py b/pensa/comparison/visualization.py index 44131d06..0cb2f5a0 100644 --- a/pensa/comparison/visualization.py +++ b/pensa/comparison/visualization.py @@ -175,7 +175,7 @@ def pair_features_heatmap(feat_names, feat_diff, plot_filename, separator=' - ', def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, - vmin=None, vmax=None, symmetric=True, verbose=True, cbar_label=None): + vmin=None, vmax=None, symmetric=True, verbose=False, cbar_label=None, tick_step=50): """ Visualizes data per residue pair in a heatmap. @@ -187,21 +187,23 @@ def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, Data to be plotted for each residue-pair feature. plot_filename : str Name of the file for the plot. - res1_pos : int + res1_pos : int, optional, default = 2 Position of the 1st residue ID in the feature name when separated by ' '. - res2_pos : int + res2_pos : int, optional, default = 6 Position of the 2nd residue ID in the feature name when separated by ' '. - vmin : float, optional + vmin : float, optional, default = None Minimum value for the heatmap. - vmax : float, optional + vmax : float, optional, default = None Maximum value for the heatmap. - symmetric : bool, optional + symmetric : bool, optional, default = True The matrix is symmetric and values provided only for the upper or lower triangle. Defaults to True. - verbose : bool, optional + verbose : bool, optional, default = False Print numbers of first and last residue. Defaults to True. - cbar_label : str, optional + cbar_label : str, optional, default = None Label for the color bar. + tick_step : int, optional, default = 50 + Step between two ticks on the plot axes. Returns ------- @@ -209,17 +211,21 @@ def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, Matrix with the values of the difference/divergence. 
""" - firstres = int(feat_names[0].split(' ')[res1_pos]) - lastres = int(feat_names[-1].split(' ')[res1_pos]) - if verbose: - print('first res:', firstres, ', last res:', lastres) - size = lastres-firstres+2 + # Find first and last residue + rn1 = [int(fn.split(' ')[res1_pos]) for fn in feat_names] + rn2 = [int(fn.split(' ')[res2_pos]) for fn in feat_names] + resnums = np.concatenate([np.array(rn1,dtype=int),np.array(rn2,dtype=int)]) + first_res = resnums.min() + last_res = resnums.max() + if verbose: print('first res:', first_res, ', last res:', last_res) + # Create a 2D array with the values + size = last_res - first_res + 1 diff = np.zeros([size,size]) - for n,name in enumerate(feat_names): + for n, name in enumerate(feat_names): splitname = name.split(' ') - resi,resj = int(splitname[res1_pos]),int(splitname[res2_pos]) - i = resi - firstres - j = resj - firstres + resi,resj = int(splitname[res1_pos]), int(splitname[res2_pos]) + i = resi - first_res + j = resj - first_res diff[i,j] = feat_diff[n] if symmetric: diff[j,i] = feat_diff[n] @@ -227,10 +233,15 @@ def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, fig,ax = plt.subplots(1,1,figsize=[6,4],dpi=300) img = ax.imshow(diff, vmin=vmin, vmax=vmax) ax.xaxis.set_ticks_position('top') - ax.set_xticks(np.arange(50-firstres,lastres-firstres+1,50)) - ax.set_yticks(np.arange(50-firstres,lastres-firstres+1,50)) - ax.set_xticklabels(np.arange(50,lastres+1,50)) - ax.set_yticklabels(np.arange(50,lastres+1,50)) + # Find position for the first tick + first_tick = 0 + while first_res > first_tick: + first_tick += tick_step + # Ticks and labels + ax.set_xticks(np.arange(first_tick-first_res, size, tick_step)) + ax.set_yticks(np.arange(first_tick-first_res, size, tick_step)) + ax.set_xticklabels(np.arange(first_tick, last_res+1, tick_step)) + ax.set_yticklabels(np.arange(first_tick, last_res+1, tick_step)) ax.xaxis.set_label_position('top') ax.set_xlabel('residue number') ax.set_ylabel('residue number') @@ -241,7 +252,8 @@ def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, def distances_visualization(dist_names, dist_diff, plot_filename, - vmin=None, vmax=None, verbose=True, cbar_label=None): + vmin=None, vmax=None, verbose=True, + cbar_label=None, tick_step=50): """ Visualizes distance features for pairs of residues in a heatmap. @@ -254,25 +266,27 @@ def distances_visualization(dist_names, dist_diff, plot_filename, Data for each distance feature. plot_filename : str Name of the file for the plot. - vmin : float, optional + vmin : float, optional, default = None Minimum value for the heatmap. - vmax : float, optional + vmax : float, optional, default = None Maximum value for the heatmap. - verbose : bool, optional + verbose : bool, optional, default = False Print numbers of first and last residue. Defaults to True. - cbar_label : str, optional + cbar_label : str, optional, default = None Label for the color bar. - + tick_step : int, optional, default = 50 + Step between two ticks on the plot axes. + Returns ------- diff : float array Distance matrix. 
""" - if verbose: - print('Plotting heatmap for distance features.') + if verbose: print('Plotting heatmap for distance features.') diff = resnum_heatmap(dist_names, dist_diff, plot_filename, res1_pos=2, res2_pos=6, - vmin=vmin, vmax=vmax, verbose=verbose, cbar_label=cbar_label) + vmin=vmin, vmax=vmax, verbose=verbose, cbar_label=cbar_label, + tick_step=tick_step) return diff From 7316f6aba88d8de22c590c6f9b59936bdabe6319 Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Wed, 8 Sep 2021 16:55:34 -0700 Subject: [PATCH 21/23] Added docstrings to various functions in modified files. --- pensa/comparison/metrics.py | 1 + pensa/features/processing.py | 44 ++++++++++++++++++++++++++++++++-- pensa/features/txt_features.py | 22 ++++++++++++++++- 3 files changed, 64 insertions(+), 3 deletions(-) diff --git a/pensa/comparison/metrics.py b/pensa/comparison/metrics.py index b276f9b1..ecb7aaca 100644 --- a/pensa/comparison/metrics.py +++ b/pensa/comparison/metrics.py @@ -1,6 +1,7 @@ import numpy as np from pensa import * from pensa.comparison import * +from pensa.dimensionality import * import random import math diff --git a/pensa/features/processing.py b/pensa/features/processing.py index 5aa8efb7..af267485 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -263,7 +263,7 @@ def sort_features_alphabetically(tors, data): def sort_distances_by_resnum(dist, data): """ - Sort distance features by the residue number.. + Sort distance features by the residue number. Parameters ---------- dist : list of str @@ -284,6 +284,24 @@ def sort_distances_by_resnum(dist, data): def select_common_features(features_a, features_b, boolean=True): + """ + Finds features in common between two trajectories. + + Parameters + ---------- + features_a : list of str + First set of features. + features_b : list of str + Second set of features. + boolean : bool + Determines if returned array contains booleans or features. + Returns + ------- + common_a : np array of bool or str + Common features taken from features_a. + common_b : np array of bool or str + Common features taken from features_b. + """ intersect = set(features_a).intersection(features_b) if boolean: is_common_a = [f in intersect for f in features_a] @@ -291,10 +309,32 @@ def select_common_features(features_a, features_b, boolean=True): else: is_common_a = [f for f in features_a if f in intersect] is_common_b = [f for f in features_b if f in intersect] - return np.array(is_common_a), np.array(is_common_b) + common_a = np.array(is_common_a) + common_b = n.array(is_common_b) + return common_a, common_b def get_common_features_data(features_a, features_b, data_a, data_b): + """ + Finds common features and corresponding data from two trajectories. + + Parameters + ---------- + features_a : list of str + First set of features. + features_b : list of str + Second set of features. + data_a : float array + Data from first trajectory. + data_b : float array + Data from second trajectory. + Returns + ------- + new_features_a, new_features_b : np array of str + Common features between the two trajectories. + new_data_a, new_data_b : float array + Data corresponding to common features between the two trajectories. 
+ """ is_common_a, is_common_b = select_common_features(features_a, features_b) new_data_a = data_a[:,is_common_a] new_data_b = data_b[:, is_common_b] diff --git a/pensa/features/txt_features.py b/pensa/features/txt_features.py index c9206360..7f57cac5 100644 --- a/pensa/features/txt_features.py +++ b/pensa/features/txt_features.py @@ -6,9 +6,29 @@ def get_txt_features_ala2(filename, num_frames, cossin=False): + """ + Parses features for ala2 from a text file. + The text file must be formatted with "phi", followed by all phi angles, a blank line, + followed by "psi" and all psi angles, with one angle per line. + + Parameters + ---------- + filename : str + File name of the text file. + num_frames : int + Maximum number of trajectory frames used in features array. + cossin : bool + Determines if the features array contains torsion angles or the sin and cos of torsion angles. + + Returns + ------- + features : numpy array + Data for all features + + """ phi = [] psi = [] - + curr = 'phi' with open(filename) as f: for s in f.readlines(): if s == 'phi\n' or s == 'psi\n': From 4ae2fb64d6026ded442e5666b5b7727ec55eaaba Mon Sep 17 00:00:00 2001 From: Jasper McAvity Date: Wed, 8 Sep 2021 22:36:37 -0700 Subject: [PATCH 22/23] Fixed typo in processing.py --- pensa/features/processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pensa/features/processing.py b/pensa/features/processing.py index af267485..4003a3e6 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -310,7 +310,7 @@ def select_common_features(features_a, features_b, boolean=True): is_common_a = [f for f in features_a if f in intersect] is_common_b = [f for f in features_b if f in intersect] common_a = np.array(is_common_a) - common_b = n.array(is_common_b) + common_b = np.array(is_common_b) return common_a, common_b From 9adc9b8e56b401e69469958651986350e40faf8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20V=C3=B6gele?= Date: Fri, 10 Sep 2021 06:55:52 -0700 Subject: [PATCH 23/23] v0.2.5 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4dd2d823..ddb50063 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup(name='pensa', - version='0.2.4', + version='0.2.5', description='PENSA - protein ensemble analysis', url='http://github.com/drorlab/pensa', author='Martin Voegele, Neil Thomson, Sang Truong, Jasper McAvity',