diff --git a/.gitignore b/.gitignore index 98a70d86..2814d516 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,9 @@ tests/test_data/MOR-*/.*.npz *.ipynb .DS_Store +# Files from workload manager +slurm*.out + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/docs/tut-5-dimensionality.rst b/docs/tut-5-dimensionality.rst index 93c072c1..f053eed9 100644 --- a/docs/tut-5-dimensionality.rst +++ b/docs/tut-5-dimensionality.rst @@ -89,13 +89,12 @@ entire receptor, sorted by the PCs of the transmembrane region. _ = sort_trajs_along_common_pc(sim_a_tmr_data['bb-torsions'], sim_b_tmr_data['bb-torsions'], - feature_start_frame, "traj/condition-a_receptor.gro", "traj/condition-b_receptor.gro", "traj/condition-a_receptor.xtc", "traj/condition-b_receptor.xtc", "pca/receptor_by_tmr", - num_pc=3) + num_pc=3, start_frame=feature_start_frame) The above function deals with the special case of two input trajectories. We also provide the functions for a single one (see @@ -121,8 +120,9 @@ Here are the major steps of a PCA demonstrated for a single simulation. .. 
code:: python - _, __ = sort_traj_along_pc(sim_a_tmr_data['bb-torsions'], - pca_a, feature_start_frame, - "traj/condition-a_receptor.gro", - "traj/condition-a_receptor.xtc", - "pca/condition-a_receptor_by_tmr", num_pc=3) + _, __, ___ = sort_traj_along_pc(sim_a_tmr_data['bb-torsions'], + "traj/condition-a_receptor.gro", + "traj/condition-a_receptor.xtc", + "pca/condition-a_receptor_by_tmr", + start_frame = feature_start_frame, + pca=pca_a, num_pc=3) diff --git a/pensa/comparison/__init__.py b/pensa/comparison/__init__.py index a4b0802c..79e6e2b7 100644 --- a/pensa/comparison/__init__.py +++ b/pensa/comparison/__init__.py @@ -2,4 +2,4 @@ from .relative_entropy import * from .statespecific import * from .visualization import * - +from .metrics import * diff --git a/pensa/comparison/metrics.py b/pensa/comparison/metrics.py new file mode 100644 index 00000000..ecb7aaca --- /dev/null +++ b/pensa/comparison/metrics.py @@ -0,0 +1,222 @@ +import numpy as np +from pensa import * +from pensa.comparison import * +from pensa.dimensionality import * +import random +import math + + +""" + Calculates the average and maximum Jensen-Shannon distance and the Kullback-Leibler divergences for each feature from two ensembles. Each of four functions uses the relative_entropy_analysis function with the same parameters. + + Parameters + ---------- + features_a : list of str + Feature names of the first ensemble. + Can be obtained from features object via .describe(). + features_b : list of str + Feature names of the second ensemble. + Can be obtained from features object via .describe(). + Must be the same as features_a. Provided as a sanity check. + all_data_a : float array + Trajectory data from the first ensemble. Format: [frames,frame_data]. + all_data_b : float array + Trajectory data from the second ensemble. + For kld functions, the second ensemble should be the reference ensemble. + Format: [frames,frame_data].
+ bin_width : float, default=None + Bin width for the axis to compare the distributions on. + If bin_width is None, bin_num (see below) bins are used and the width is determined from the common histogram. + bin_num : int, default=10 + Number of bins for the axis to compare the distributions on (only if bin_width=None). + verbose : bool, default=True + Print intermediate results. + override_name_check : bool, default=False + Only check number of features, not their names. + + Returns + ------- + Each function returns one value. + + average_jsd : float + Average Jensen-Shannon distance from two ensembles. + max_jsd : float + Maximum Jensen-Shannon distance from two ensembles. + average_kld : float + Average Kullback-Leibler divergence from two ensembles. + max_kld : float + Maximum Kullback-Leibler divergence from two ensembles. +""" + +def average_jsd(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): + _, data_jsdist, _, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) + return np.mean(data_jsdist) + + +def max_jsd(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): + _, data_jsdist, _, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) + return np.max(data_jsdist) + + +def average_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): + _, _, data_kld_ab, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) + return np.mean(data_kld_ab) + + +def max_kld(features_a, features_b, all_data_a, all_data_b, 
bin_width=None, bin_num=10, verbose=True, override_name_check=False): + _, _, data_kld_ab, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) + return np.max(data_kld_ab) + + +""" + Calculates the average and maximum Kolmogorov-Smirnov statistic and the average, maximum, and minimum Kolmogorov-Smirnov p-value for two distributions. Each of five functions uses the kolmogorov_smirnov_analysis function with the same parameters. + + Parameters + ---------- + features_a : list of str + Feature names of the first ensemble. + Can be obtained from features object via .describe(). + features_b : list of str + Feature names of the second ensemble. + Can be obtained from features object via .describe(). + Must be the same as features_a. Provided as a sanity check. + all_data_a : float array + Trajectory data from the first ensemble. Format: [frames,frame_data]. + all_data_b : float array + Trajectory data from the second ensemble. Format: [frames,frame_data]. + verbose : bool, default=True + Print intermediate results. + override_name_check : bool, default=False + Only check number of features, not their names. + + Returns + ------- + Each function returns one value. + + average_kss : float + Average Kolmogorov-Smirnov statistic for two distributions. + max_kss : float + Maximum Kolmogorov-Smirnov statistic for two distributions. + average_ksp : float + Average Kolmogorov-Smirnov p-value for two distributions. + max_ksp : float + Maximum Kolmogorov-Smirnov p-value for two distributions. + min_ksp : float + Minimum Kolmogorov-Smirnov p-value for two distributions.
+""" + + +def average_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, data_kss, _ = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.mean(data_kss) + + +def max_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, data_kss, _ = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.max(data_kss) + +def average_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.mean(data_ksp) + + +def max_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.max(data_ksp) + +def min_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): + _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) + return np.min(data_ksp) + +""" + Calculates average and maximum State Specific Information statistic for a feature across two ensembles. Each of two functions uses the ssi_ensemble_analysis function with the same parameters. + + Parameters + ---------- + features_a : list of str + Feature names of the first ensemble. + features_b : list of str + Feature names of the second ensemble. + Must be the same as features_a. Provided as a sanity check. + all_data_a : float array + Trajectory data from the first ensemble. Format: [frames,frame_data].
+ all_data_b : float array + Trajectory data from the second ensemble. Format: [frames,frame_data]. + torsions : str + Torsion angles to use for SSI, including backbone - 'bb', and sidechain - 'sc'. + Default is None. + pocket_occupancy : bool, optional + Set to 'True' if the data input is pocket occupancy distribution. + The default is None. + pbc : bool, optional + If true, apply periodic boundary corrections on angular distribution inputs. + The input for periodic correction must be radians. The default is True. + verbose : bool, default=True + Print intermediate results. + write_plots : bool, optional + If true, visualise the states over the raw distribution. The default is None. + override_name_check : bool, default=False + Only check number of features, not their names. + + Returns + ------- + Each function returns one value. + + average_ssi : float + Average of State Specific Information for a feature across two ensembles. + max_ssi : float + Maximum of State Specific Information for a feature across two ensembles.
+""" + +def average_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbc=True, verbose=True, write_plots=None, override_name_check=False): + _, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check) + return np.mean(data_ssi) + + +def max_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbc=True, verbose=True, write_plots=None, override_name_check=False): + _, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check) + return np.max(data_ssi) + + +""" + Calculates the relative sampling efficiency of test data based on reference data. + + Parameters + ---------- + ref_data : float array + Trajectory data from the reference ensemble. Format: [frames,frame_data]. + test_data : float array + Trajectory data from the test ensemble. Format: [frames,frame_data]. + num_pc : int + Number of principal components used. + + Returns + ------- + pca_se : float + Sampling efficiency of test data based on reference data.
+ +""" + +def pca_sampling_efficiency(ref_data, test_data, num_pc=2): + pca = calculate_pca(ref_data) + + _, ref_components = get_components_pca(ref_data, num_pc, pca=pca) + _, test_components = get_components_pca(test_data, num_pc, pca=pca) + + ref_var = np.var(ref_components, axis=0) + test_var = np.var(test_components, axis=0) + + ref_vol = np.prod(ref_var) + test_vol = np.prod(test_var) + + pca_se = test_vol / ref_vol + + return pca_se + + + + + + + + diff --git a/pensa/comparison/visualization.py b/pensa/comparison/visualization.py index 44131d06..0cb2f5a0 100644 --- a/pensa/comparison/visualization.py +++ b/pensa/comparison/visualization.py @@ -175,7 +175,7 @@ def pair_features_heatmap(feat_names, feat_diff, plot_filename, separator=' - ', def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, - vmin=None, vmax=None, symmetric=True, verbose=True, cbar_label=None): + vmin=None, vmax=None, symmetric=True, verbose=False, cbar_label=None, tick_step=50): """ Visualizes data per residue pair in a heatmap. @@ -187,21 +187,23 @@ def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, Data to be plotted for each residue-pair feature. plot_filename : str Name of the file for the plot. - res1_pos : int + res1_pos : int, optional, default = 2 Position of the 1st residue ID in the feature name when separated by ' '. - res2_pos : int + res2_pos : int, optional, default = 6 Position of the 2nd residue ID in the feature name when separated by ' '. - vmin : float, optional + vmin : float, optional, default = None Minimum value for the heatmap. - vmax : float, optional + vmax : float, optional, default = None Maximum value for the heatmap. - symmetric : bool, optional + symmetric : bool, optional, default = True The matrix is symmetric and values provided only for the upper or lower triangle. Defaults to True. - verbose : bool, optional + verbose : bool, optional, default = False Print numbers of first and last residue. 
Defaults to True. - cbar_label : str, optional + cbar_label : str, optional, default = None Label for the color bar. + tick_step : int, optional, default = 50 + Step between two ticks on the plot axes. Returns ------- @@ -209,17 +211,21 @@ def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, Matrix with the values of the difference/divergence. """ - firstres = int(feat_names[0].split(' ')[res1_pos]) - lastres = int(feat_names[-1].split(' ')[res1_pos]) - if verbose: - print('first res:', firstres, ', last res:', lastres) - size = lastres-firstres+2 + # Find first and last residue + rn1 = [int(fn.split(' ')[res1_pos]) for fn in feat_names] + rn2 = [int(fn.split(' ')[res2_pos]) for fn in feat_names] + resnums = np.concatenate([np.array(rn1,dtype=int),np.array(rn2,dtype=int)]) + first_res = resnums.min() + last_res = resnums.max() + if verbose: print('first res:', first_res, ', last res:', last_res) + # Create a 2D array with the values + size = last_res - first_res + 1 diff = np.zeros([size,size]) - for n,name in enumerate(feat_names): + for n, name in enumerate(feat_names): splitname = name.split(' ') - resi,resj = int(splitname[res1_pos]),int(splitname[res2_pos]) - i = resi - firstres - j = resj - firstres + resi,resj = int(splitname[res1_pos]), int(splitname[res2_pos]) + i = resi - first_res + j = resj - first_res diff[i,j] = feat_diff[n] if symmetric: diff[j,i] = feat_diff[n] @@ -227,10 +233,15 @@ def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, fig,ax = plt.subplots(1,1,figsize=[6,4],dpi=300) img = ax.imshow(diff, vmin=vmin, vmax=vmax) ax.xaxis.set_ticks_position('top') - ax.set_xticks(np.arange(50-firstres,lastres-firstres+1,50)) - ax.set_yticks(np.arange(50-firstres,lastres-firstres+1,50)) - ax.set_xticklabels(np.arange(50,lastres+1,50)) - ax.set_yticklabels(np.arange(50,lastres+1,50)) + # Find position for the first tick + first_tick = 0 + while first_res > first_tick: + first_tick += tick_step + # 
Ticks and labels + ax.set_xticks(np.arange(first_tick-first_res, size, tick_step)) + ax.set_yticks(np.arange(first_tick-first_res, size, tick_step)) + ax.set_xticklabels(np.arange(first_tick, last_res+1, tick_step)) + ax.set_yticklabels(np.arange(first_tick, last_res+1, tick_step)) ax.xaxis.set_label_position('top') ax.set_xlabel('residue number') ax.set_ylabel('residue number') @@ -241,7 +252,8 @@ def resnum_heatmap(feat_names, feat_diff, plot_filename, res1_pos=2, res2_pos=6, def distances_visualization(dist_names, dist_diff, plot_filename, - vmin=None, vmax=None, verbose=True, cbar_label=None): + vmin=None, vmax=None, verbose=True, + cbar_label=None, tick_step=50): """ Visualizes distance features for pairs of residues in a heatmap. @@ -254,25 +266,27 @@ def distances_visualization(dist_names, dist_diff, plot_filename, Data for each distance feature. plot_filename : str Name of the file for the plot. - vmin : float, optional + vmin : float, optional, default = None Minimum value for the heatmap. - vmax : float, optional + vmax : float, optional, default = None Maximum value for the heatmap. - verbose : bool, optional + verbose : bool, optional, default = True Print numbers of first and last residue. Defaults to True. - cbar_label : str, optional + cbar_label : str, optional, default = None Label for the color bar. - + tick_step : int, optional, default = 50 + Step between two ticks on the plot axes. + Returns ------- diff : float array Distance matrix.
""" - if verbose: - print('Plotting heatmap for distance features.') + if verbose: print('Plotting heatmap for distance features.') diff = resnum_heatmap(dist_names, dist_diff, plot_filename, res1_pos=2, res2_pos=6, - vmin=vmin, vmax=vmax, verbose=verbose, cbar_label=cbar_label) + vmin=vmin, vmax=vmax, verbose=verbose, cbar_label=cbar_label, + tick_step=tick_step) return diff diff --git a/pensa/dimensionality/__init__.py b/pensa/dimensionality/__init__.py index 3ae57a17..bbf189db 100644 --- a/pensa/dimensionality/__init__.py +++ b/pensa/dimensionality/__init__.py @@ -1,3 +1,4 @@ +from .visualization import * from .pca import * from .tica import * diff --git a/pensa/dimensionality/pca.py b/pensa/dimensionality/pca.py index 0947dff1..2edbf756 100644 --- a/pensa/dimensionality/pca.py +++ b/pensa/dimensionality/pca.py @@ -3,13 +3,16 @@ from pyemma.util.contexts import settings import MDAnalysis as mda import matplotlib.pyplot as plt - +from pensa.preprocessing import sort_coordinates, merge_and_sort_coordinates +from .visualization import project_on_eigenvector, sort_traj_along_projection # --- METHODS FOR PRINCIPAL COMPONENT ANALYSIS --- +# http://www.emma-project.org/latest/api/generated/pyemma.coordinates.pca.html + -def calculate_pca(data): +def calculate_pca(data, dim=-1): """ Performs a PyEMMA PCA on the provided data. @@ -17,6 +20,9 @@ def calculate_pca(data): ---------- data : float array Trajectory data [frames,frame_data]. + dim : int, optional, default -1 + The number of dimensions (principal components) to project onto. + -1 means all numerically available dimensions will be used. Returns ------- @@ -24,7 +30,7 @@ def calculate_pca(data): Principal components information.
""" - pca = pyemma.coordinates.pca(data) + pca = pyemma.coordinates.pca(data, dim=dim) return pca @@ -95,7 +101,7 @@ def pca_features(pca, features, num, threshold, plot_file=None): return test_graph, test_corr -def project_on_pc(data, ev_idx, pca=None): +def project_on_pc(data, ev_idx, pca=None, dim=-1): """ Projects a trajectory onto an eigenvector of its PCA. @@ -108,25 +114,23 @@ def project_on_pc(data, ev_idx, pca=None): pca : PCA obj, optional Information of pre-calculated PCA. Defaults to None. Must be calculated for the same features (but not necessarily the same trajectory). - + dim : int, optional, default -1 + The number of dimensions (principal components) to project onto. + Only used if pca is not provided. Returns ------- projection : float array Value along the PC for each frame. """ - # Perform PCA if none is provided - if pca is None: - pca = pyemma.coordinates.pca(data) #,dim=3) - # Project the features onto the principal components - projection = np.zeros(data.shape[0]) - for ti in range(data.shape[0]): - projection[ti] = np.dot(data[ti],pca.eigenvectors[:,ev_idx]) - # Return the value along the PC for each frame + # Perform PCA if none is provided. + if pca is None: pca = calculate_pca(data, dim=dim) + # Project the features onto the principal components. + projection = project_on_eigenvector(data, ev_idx, pca) return projection -def get_components_pca(data, num, pca=None, prefix=''): +def get_components_pca(data, num, pca=None, dim=-1, prefix=''): """ Projects a trajectory onto the first num eigenvectors of its PCA. @@ -139,6 +143,11 @@ def get_components_pca(data, num, pca=None, prefix=''): pca : PCA obj, optional Information of pre-calculated PCA. Defaults to None. Must be calculated for the same features (but not necessarily the same trajectory). + dim : int, optional, default = -1 + The number of dimensions (principal components) to project onto. + Only used if pca is not provided.
+ prefix : str, optional, default = '' + First part of the component names. Second part is "PC"+ Returns ------- @@ -149,8 +158,7 @@ def get_components_pca(data, num, pca=None, prefix=''): """ # Perform PCA if none is provided - if pca is None: - pca = pyemma.coordinates.pca(data) + if pca is None: pca = calculate_pca(data, dim=dim) # Project the features onto the principal components comp_names = [] components = [] @@ -164,20 +172,14 @@ def get_components_pca(data, num, pca=None, prefix=''): return comp_names, np.array(components).T -def sort_traj_along_pc(data, pca, start_frame, top, trj, out_name, num_pc=3): +def sort_traj_along_pc(data, top, trj, out_name, pca=None, num_pc=3, start_frame=0): """ - Sort a trajectory along given principal components. + Sort a trajectory along principal components. Parameters ---------- data : float array Trajectory data [frames,frame_data]. - pca : PCA obj - Principal components information. - num_pc : int - Sort along the first num_pc principal components. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top : str File name of the reference topology for the trajectory. trj : str @@ -185,34 +187,37 @@ def sort_traj_along_pc(data, pca, start_frame, top, trj, out_name, num_pc=3): Should be the same as data was from. out_name : str Core part of the name of the output files + pca : PCA obj, optional + Principal components information. + If none is provided, it will be calculated. + Defaults to None. + num_pc : int, optional + Sort along the first num_pc principal components. + Defaults to 3. + start_frame : int, optional + Offset of the data with respect to the trajectories (defined below). + Defaults to 0.
- """ - # Remember the index in the simulation (taking into account cutoff) - oidx = np.arange(len(data))+start_frame - # Define the MDAnalysis trajectories from where the frames come - u = mda.Universe(top,trj) - a = u.select_atoms('all') - return_str = [] - all_proj = [] - # Loop through the principal components - for evi in range(num_pc): - # Project the combined data on the principal component - proj = project_on_pc(data,evi,pca=pca) - all_proj.append(proj) - # Sort everything along the projection onto the PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_pc"+str(evi+1)+".xtc", a.n_atoms) as W: - for i in range(data.shape[0]): - ts = u.trajectory[oidx_sort[i]] - W.write(a) - return_str.append(a) - return return_str, all_proj + Returns + ------- + sorted_proj: list + sorted projections on each principal component + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component + + """ + # Calculate the principal components if they are not given. + if pca is None: pca = calculate_pca(data, dim=num_pc) + # Sort the trajectory along them. + sorted_proj, sorted_indices_data, sorted_indices_traj = sort_traj_along_projection( + data, pca, top, trj, out_name, num_comp=num_pc, start_frame = start_frame + ) + return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_pc=3): +def sort_trajs_along_common_pc(data_a, data_b, top_a, top_b, trj_a, trj_b, out_name, num_pc=3, start_frame=0): """ Sort two trajectories along their most important common principal components. @@ -222,8 +227,6 @@ def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, Trajectory data [frames,frame_data].
data_b : float array Trajectory data [frames,frame_data]. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top_a : str Reference topology for the first trajectory. top_b : str @@ -236,48 +239,30 @@ def sort_trajs_along_common_pc(data_a, data_b, start_frame, top_a, top_b, trj_a, Should be the same as data_b was from. out_name : str Core part of the name of the output files. - + num_pc : int, optional + Sort along the first num_pc principal components. + Defaults to 3. + start_frame : int or list of int + Offset of the data with respect to the trajectories. + Defaults to 0. + + Returns + ------- + sorted_proj: list + sorted projections on each principal component + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component + """ - # Combine the input data - data = np.concatenate([data_a,data_b],0) - # Remember which simulation the data came frome - cond = np.concatenate([np.ones(len(data_a)), np.zeros(len(data_b))]) - # Remember the index in the respective simulation (taking into account cutoff) - oidx = np.concatenate([np.arange(len(data_a))+start_frame, - np.arange(len(data_b))+start_frame]) - # Calculate the principal components - pca = pyemma.coordinates.pca(data,dim=3) - # Define the MDAnalysis trajectories from where the frames come - ua = mda.Universe(top_a,trj_a) - ub = mda.Universe(top_b,trj_b) - # ... and select all atoms - aa = ua.select_atoms('all') - ab = ub.select_atoms('all') - return_str = [] - # Loop over principal components. - for evi in range(num_pc): - # Project the combined data on the principal component - proj = project_on_pc(data,evi,pca=pca) - # Sort everything along the projection on th resp. 
PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - cond_sort = cond[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_pc"+str(evi+1)+".xtc", aa.n_atoms) as W: - for i in range(data.shape[0]): - if cond_sort[i] == 1: # G-protein bound - ts = ua.trajectory[oidx_sort[i]] - W.write(aa) - return_str.append(aa) - elif cond_sort[i] == 0: # arrestin bound - ts = ub.trajectory[oidx_sort[i]] - W.write(ab) - return_str.append(ab) - return return_str + sorted_proj, sorted_indices_data, sorted_indices_traj = sort_mult_trajs_along_common_pc( + [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_pc=num_pc, start_frame = start_frame + ) + return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_mult_trajs_along_common_pc(data, start_frame, top, trj, out_name, num_pc=3): +def sort_mult_trajs_along_common_pc(data, top, trj, out_name, num_pc=3, start_frame=0): """ Sort multiple trajectories along their most important common principal components. @@ -285,8 +270,6 @@ def sort_mult_trajs_along_common_pc(data, start_frame, top, trj, out_name, num_p ---------- data : list of float arrays List of trajectory data arrays, each [frames,frame_data]. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top : list of str Reference topology files. trj : list of str @@ -294,148 +277,47 @@ def sort_mult_trajs_along_common_pc(data, start_frame, top, trj, out_name, num_p trj[i] should be the same as data[i] was from. out_name : str Core part of the name of the output files. + num_pc : int, optional + Sort along the first num_pc principal components. + Defaults to 3. + start_frame : int or list of int + Offset of the data with respect to the trajectories. + Defaults to 0. 
+ + Returns + ------- + sorted_proj: list + sorted projections on each principal component + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component """ num_frames = [len(d) for d in data] num_traj = len(data) + if isinstance(start_frame, (int, np.integer)): + start_frame *= np.ones(num_traj, dtype=int) + start_frame = start_frame.tolist() # Combine the input data - data = np.concatenate(data,0) - # Remember which simulation the data came frome - cond = np.concatenate([i*np.ones(num_frames[i],dtype=int) for i in range(num_traj)]) - # Remember the index in the respective simulation (taking into account cutoff) - oidx = np.concatenate([np.arange(num_frames[i])+start_frame for i in range(num_traj)]) - # Calculate the principal components - pca = pyemma.coordinates.pca(data,dim=3) - # Define the MDAnalysis trajectories from where the frames come - univs = [] - atoms = [] - for j in range(num_traj): - u = mda.Universe(top[j],trj[j]) - print('Length of trajectory',len(u.trajectory)) - univs.append(u) - atoms.append(u.select_atoms('all')) + all_data = np.concatenate(data,0) + # Calculate the principal component + pca = calculate_pca(all_data) + # Initialize output + sorted_proj = [] + sorted_indices_data = [] + sorted_indices_traj = [] # Loop over principal components. for evi in range(num_pc): # Project the combined data on the principal component - proj = project_on_pc(data,evi,pca=pca) - # Sort everything along the projection on th resp.
PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - cond_sort = cond[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_pc"+str(evi+1)+".xtc", atoms[0].n_atoms) as W: - for i in range(data.shape[0]): - j = cond_sort[i] - o = oidx_sort[i] - uj = univs[j] - ts = uj.trajectory[o] - W.write(atoms[j]) - return - - -def compare_projections(data_a, data_b, pca, num=3, saveas=None, label_a=None, label_b=None): - """ - Compare two datasets along a given principal component. - - Parameters - ---------- - data_a : float array - Trajectory data [frames,frame_data] - data_b : float array - Trajectory data [frames,frame_data] - pca : PCA object - Principal components information. - num : int - Number of principal components to plot. - saveas : str, optional - Name of the output file. - label_a : str, optional - Label for the first dataset. - label_b : str, optional - Label for the second dataset. - - """ - # Start the figure - fig,ax = plt.subplots(num, 2, figsize=[8,3*num], dpi=300) - val = [] - # Loop over PCs - for evi in range(num): - # Calculate values along PC for each frame - proj_a = project_on_pc(data_a, evi, pca=pca) - proj_b = project_on_pc(data_b, evi, pca=pca) - # Plot the time series in the left panel - ax[evi,0].plot(proj_a, alpha=0.5, label=label_a) - ax[evi,0].plot(proj_b, alpha=0.5, label=label_b) - ax[evi,0].set_xlabel('frame number') - ax[evi,0].set_ylabel('PC %i'%(evi+1)) - # Plot the histograms in the right panel - ax[evi,1].hist(proj_a, bins=30, alpha=0.5, density=True, label=label_a) - ax[evi,1].hist(proj_b, bins=30, alpha=0.5, density=True, label=label_b) - ax[evi,1].set_xlabel('PC %i'%(evi+1)) - ax[evi,1].set_ylabel('frequency') - # Legend - if label_a and label_b: - ax[evi,0].legend() - ax[evi,1].legend() - val.append([proj_a, proj_b]) - fig.tight_layout() - # Save the figure - if saveas is not None: - fig.savefig(saveas, dpi=300) - return val - - -def 
compare_mult_projections(data, pca, num=3, saveas=None, labels=None, colors=None): - """ - Compare two datasets along a given principal component. - - Parameters - ---------- - data : list of float arrays - Data from multiple trajectories [frames,frame_data] - pca : PCA object - Principal components information. - num : int - Number of principal components to plot. - saveas : str, optional - Name of the output file. - labels : list of str, optional - Labels for the datasets. If provided, it must have the same length as data. - - """ - if labels is not None: - assert len(labels) == len(data) - else: - labels = [None for _ in range(len(data))] - if colors is not None: - assert len(colors) == len(data) - else: - colors = ['C%i'%num for num in range(len(data))] - # Start the figure - fig,ax = plt.subplots(num, 2, figsize=[9,3*num], dpi=300) - # Loop over PCs - for evi in range(num): - for j,d in enumerate(data): - # Calculate values along PC for each frame - proj = project_on_pc(d, evi, pca=pca) - # Plot the time series in the left panel - ax[evi,0].plot(proj, alpha=0.5, - label=labels[j], color=colors[j]) - # Plot the histograms in the right panel - ax[evi,1].hist(proj, bins=30, alpha=0.5, density=True, - label=labels[j], color=colors[j]) - # Axis labels - ax[evi,0].set_xlabel('frame number') - ax[evi,0].set_ylabel('PC %i'%(evi+1)) - ax[evi,1].set_xlabel('PC %i'%(evi+1)) - ax[evi,1].set_ylabel('frequency') - # Legend - if labels[0] is not None: - ax[evi,0].legend() - ax[evi,1].legend() - fig.tight_layout() - # Save the figure - if saveas is not None: - fig.savefig(saveas, dpi=300) - return + proj = [project_on_pc(d, evi, pca=pca) for d in data] + # Sort everything along the projection on the respective PC + out_xtc = out_name+"_pc"+str(evi+1)+".xtc" + proj_sort, sort_idx, oidx_sort = merge_and_sort_coordinates( + proj, top, trj, out_xtc, start_frame=start_frame, verbose=False + ) + sorted_proj.append(proj_sort) + sorted_indices_data.append(sort_idx) + 
sorted_indices_traj.append(oidx_sort) + return sorted_proj, sorted_indices_data, sorted_indices_traj + diff --git a/pensa/dimensionality/tica.py b/pensa/dimensionality/tica.py index 07a0c370..df473b43 100644 --- a/pensa/dimensionality/tica.py +++ b/pensa/dimensionality/tica.py @@ -3,14 +3,16 @@ from pyemma.util.contexts import settings import MDAnalysis as mda import matplotlib.pyplot as plt - +from pensa.preprocessing import sort_coordinates, merge_and_sort_coordinates +from .visualization import project_on_eigenvector, sort_traj_along_projection # --- METHODS FOR TIME-LAGGED INDEPENDENT COMPONENT ANALYSIS --- -# http://emma-project.org/latest/api/generated/pyemma.coordinates.tica.html#pyemma.coordinates.tica + +# http://emma-project.org/latest/api/generated/pyemma.coordinates.tica.html -def calculate_tica(data): +def calculate_tica(data, dim=-1, lag=10): """ Performs a PyEMMA TICA on the provided data. @@ -18,6 +20,11 @@ def calculate_tica(data): ---------- data : float array Trajectory data. Format: [frames,frame_data]. + dim : int, optional, default -1 + The number of dimensions (independent components) to project onto. + -1 means all numerically available dimensions will be used. + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. Returns ------- @@ -25,13 +32,13 @@ def calculate_tica(data): Time-lagged independent component information. """ - tica = pyemma.coordinates.tica(data) + tica = pyemma.coordinates.tica(data, lag=lag) return tica def tica_eigenvalues_plot(tica, num=12, plot_file=None): """ - Plots the highest eigenvalues over the numberr of the time-lagged independent components. + Plots the highest eigenvalues over the number of the time-lagged independent components. Parameters ---------- @@ -43,7 +50,7 @@ def tica_eigenvalues_plot(tica, num=12, plot_file=None): Path and name of the file to save the plot. """ - # Plot eigenvalues over component numbers + # Plot eigenvalues over component numbers. 
fig,ax = plt.subplots(1, 1, figsize=[4,3], dpi=300) componentnr = np.arange(num)+1 eigenvalues = tica.eigenvalues[:num] @@ -51,7 +58,7 @@ def tica_eigenvalues_plot(tica, num=12, plot_file=None): ax.set_xlabel('component number') ax.set_ylabel('eigenvalue') fig.tight_layout() - # Save the figure to a file + # Save the figure to a file. if plot_file: fig.savefig(plot_file, dpi=300) return componentnr, eigenvalues @@ -75,7 +82,7 @@ def tica_features(tica, features, num, threshold, plot_file=None): Path and name of the file to save the plot. """ - # Plot the highest TIC correlations and print relevant features + # Plot the highest TIC correlations and print relevant features. fig,ax = plt.subplots(num,1,figsize=[4,num*3],dpi=300,sharex=True) for i in range(num): relevant = tica.feature_TIC_correlation[:,i]**2 > threshold**2 @@ -87,12 +94,12 @@ def tica_features(tica, features, num, threshold, plot_file=None): ax[i].set_xlabel('feature index') ax[i].set_ylabel('correlation with TIC%i'%(i+1)) fig.tight_layout() - # Save the figure to a file + # Save the figure to a file. if plot_file: fig.savefig(plot_file,dpi=300) return test_feature -def project_on_tic(data, ev_idx, tica=None): +def project_on_tic(data, ev_idx, tica=None, dim=-1, lag=10): """ Projects a trajectory onto an eigenvector of its TICA. @@ -105,25 +112,27 @@ def project_on_tic(data, ev_idx, tica=None): tica : TICA obj, optional Information of pre-calculated TICA. Must be calculated for the same features (but not necessarily the same trajectory). - + dim : int, optional, default -1 + The number of dimensions (independent components) to project onto. + Only used if tica is not provided. + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + Returns ------- projection : float array Value along the TIC for each frame. 
""" - # Perform TICA if none is provided - if tica is None: - tica = pyemma.coordinates.tica(data) #,dim=3) - # Project the features onto the time-lagged independent components - projection = np.zeros(data.shape[0]) - for ti in range(data.shape[0]): - projection[ti] = np.dot(data[ti],tica.eigenvectors[:,ev_idx]) - # Return the value along the TIC for each frame + # Perform standard TICA if none is provided. + if tica is None: calculate_tica(data, dim=dim, lag=lag) + # Project the features onto the time-lagged independent components. + projection = project_on_eigenvector(data, ev_idx, tica) return projection -def get_components_tica(data, num, tica=None, prefix=''): +def get_components_tica(data, num, tica=None, dim=-1, lag=10, prefix=''): """ Projects a trajectory onto the first num eigenvectors of its tICA. @@ -136,6 +145,14 @@ def get_components_tica(data, num, tica=None, prefix=''): tica : tICA obj, optional Information of pre-calculated tICA. Defaults to None. Must be calculated for the same features (but not necessarily the same trajectory). + dim : int, optional, default -1 + The number of dimensions (independent components) to project onto. + Only used if tica is not provided. + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + prefix : str, optional, default = '' + First part of the component names. Second part is "IC"+ Returns ------- @@ -145,10 +162,9 @@ def get_components_tica(data, num, tica=None, prefix=''): Component data [frames,components] """ - # Perform tICA if none is provided - if tica is None: - tica = pyemma.coordinates.tica(data) - # Project the features onto the principal components + # Perform tICA if none is provided. + if tica is None: calculate_tica(data, lag=lag) + # Project the features onto the principal components. 
comp_names = [] components = [] for ev_idx in range(num): @@ -157,55 +173,57 @@ def get_components_tica(data, num, tica=None, prefix=''): projection[ti] = np.dot(data[ti],tica.eigenvectors[:,ev_idx]) components.append(projection) comp_names.append(prefix+'IC'+str(ev_idx+1)) - # Return the names and data + # Return the names and data. return comp_names, np.array(components).T -def sort_traj_along_tic(data, tica, start_frame, top, trj, out_name, num_tic=3): +def sort_traj_along_tic(data, top, trj, out_name, tica=None, num_ic=3, lag=10, start_frame=0): """ - Sort a trajectory along given time-lagged independent components. + Sort a trajectory along independent components. Parameters ---------- data : float array Trajectory data [frames,frame_data]. - tica : TICA obj - Time-lagged independent components information. - num_tic : int - Sort along the first num_tic time-lagged independent components. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top : str File name of the reference topology for the trajectory. trj : str File name of the trajetory from which the frames are picked. Should be the same as data was from. out_name : str - Core part of the name of the output files. + Core part of the name of the output files + tica : tICA obj, optional + Time-lagged independent components information. + If none is provided, it will be calculated. + Defaults to None. + num_ic : int, optional, default = 3 + Sort along the first num_ic independent components. + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + start_frame : int, optional, default = 0 + Offset of the data with respect to the trajectories (defined below). 
- """ - # Remember the index in the simulation (taking into account cutoff) - oidx = np.arange(len(data))+start_frame - # Define the MDAnalysis trajectories from where the frames come - u = mda.Universe(top,trj) - a = u.select_atoms('all') - # Loop through the time-lagged independent components - for evi in range(num_tic): - # Project the combined data on the time-lagged independent component - proj = project_on_tic(data,evi,tica=tica) - # Sort everything along the projection onto the TIC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the TIC - with mda.Writer(out_name+"_tic"+str(evi+1)+".xtc", a.n_atoms) as W: - for i in range(data.shape[0]): - ts = u.trajectory[oidx_sort[i]] - W.write(a) - return oidx_sort + Returns + ------- + sorted_proj: list + sorted projections on each principal component + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component + + """ + # Calculate the principal components if they are not given. + if tica is None: tica = calculate_tica(data, dim=num_ic, lag=lag) + # Sort the trajectory along them. + sorted_proj, sorted_indices_data, sorted_indices_traj = sort_traj_along_projection( + data, tica, top, trj, out_name, num_comp=num_ic, start_frame=start_frame + ) + return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a, trj_b, out_name, num_tic=3): +def sort_trajs_along_common_tic(data_a, data_b, top_a, top_b, trj_a, trj_b, out_name, num_ic=3, lag=10, start_frame=0): """ Sort two trajectories along their most important common time-lagged independent components. @@ -215,8 +233,6 @@ def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a Trajectory data [frames,frame_data]. 
data_b : float array Trajectory data [frames,frame_data]. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top_a : str Reference topology for the first trajectory. top_b : str @@ -229,54 +245,38 @@ def sort_trajs_along_common_tic(data_a, data_b, start_frame, top_a, top_b, trj_a Should be the same as data_b was from. out_name : str Core part of the name of the output files. + num_ic : int, optional, default = 3 + Sort along the first num_ic independent components. + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + start_frame : int, optional, default = 0 + Offset of the data with respect to the trajectories (defined below). + + Returns + ------- + sorted_proj: list + sorted projections on each principal component + sorted_indices_data : list + Sorted indices of the data array for each principal component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each principal component """ - # Combine the input data - data = np.concatenate([data_a,data_b],0) - # Remember which simulation the data came frome - cond = np.concatenate([np.ones(len(data_a)), np.zeros(len(data_b))]) - # Remember the index in the respective simulation (taking into account cutoff) - oidx = np.concatenate([np.arange(len(data_a))+start_frame, - np.arange(len(data_b))+start_frame]) - # Calculate the time-lagged independent components - tica = pyemma.coordinates.tica(data,dim=3) - # Define the MDAnalysis trajectories from where the frames come - ua = mda.Universe(top_a,trj_a) - ub = mda.Universe(top_b,trj_b) - # ... and select all atoms - aa = ua.select_atoms('all') - ab = ub.select_atoms('all') - # Loop over time-lagged independent components. - for evi in range(num_tic): - # Project the combined data on the time-lagged independent component - proj = project_on_tic(data,evi,tica=tica) - # Sort everything along the projection on th resp. 
PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - cond_sort = cond[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_tic"+str(evi+1)+".xtc", aa.n_atoms) as W: - for i in range(data.shape[0]): - if cond_sort[i] == 1: # G-protein bound - ts = ua.trajectory[oidx_sort[i]] - W.write(aa) - elif cond_sort[i] == 0: # arrestin bound - ts = ub.trajectory[oidx_sort[i]] - W.write(ab) - return proj, oidx_sort + sorted_proj, sorted_indices_data, sorted_indices_traj = sort_mult_trajs_along_common_tic( + [data_a, data_b], [top_a, top_b], [trj_a, trj_b], out_name, num_ic=num_ic, lag=lag, start_frame = start_frame + ) + return sorted_proj, sorted_indices_data, sorted_indices_traj -def sort_mult_trajs_along_common_tic(data, start_frame, top, trj, out_name, num_tic=3): +def sort_mult_trajs_along_common_tic(data, top, trj, out_name, num_ic=3, lag=10, start_frame=0): """ - Sort multiple trajectories along their most important common time-lagged independent components. + Sort multiple trajectories along their most important independent components. Parameters ---------- data : list of float arrays List of trajectory data arrays, each [frames,frame_data]. - start_frame : int - Offset of the data with respect to the trajectories (defined below). top : list of str Reference topology files. trj : list of str @@ -284,88 +284,48 @@ def sort_mult_trajs_along_common_tic(data, start_frame, top, trj, out_name, num_ trj[i] should be the same as data[i] was from. out_name : str Core part of the name of the output files. + num_ic : int, optional, default = 3 + Sort along the first num_ic independent components. + lag : int, optional, default = 10 + The lag time, in multiples of the input time step. + Only used if tica is not provided. + start_frame : int or list of int, default = 0 + Offset of the data with respect to the trajectories. 
+ + Returns + ------- + sorted_proj: list + sorted projections on each independent component + sorted_indices_data : list + Sorted indices of the data array for each independent component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each independent component """ num_frames = [len(d) for d in data] num_traj = len(data) + if type(start_frame) == int: + start_frame *= np.ones(num_traj) + start_frame = start_frame.tolist() # Combine the input data - data = np.concatenate(data,0) - # Remember which simulation the data came frome - cond = np.concatenate([i*np.ones(num_frames[i]) for i in range(num_traj)]) - # Remember the index in the respective simulation (taking into account cutoff) - oidx = np.concatenate([np.arange(num_frames[i])+start_frame for i in range(num_traj)]) - # Calculate the time-lagged independent components - tica = pyemma.coordinates.tica(data,dim=3) - # Define the MDAnalysis trajectories from where the frames come - univs = [] - atoms = [] - for j in range(num_traj): - u = mda.Universe(top[j],trj[j]) - univs.append(u) - atoms.append(u.select_atoms('all')) - # Loop over time-lagged independent component. - for evi in range(num_tic): - # Project the combined data on the time-lagged independent component - proj = project_on_tic(data,evi,tica=tica) - # Sort everything along the projection on th resp. PC - sort_idx = np.argsort(proj) - proj_sort = proj[sort_idx] - cond_sort = cond[sort_idx] - oidx_sort = oidx[sort_idx] - # Write the trajectory, ordered along the PC - with mda.Writer(out_name+"_tic"+str(evi+1)+".xtc", atoms[0].n_atoms) as W: - for i in range(data.shape[0]): - j = cond_sort[i] - ts = univs[j].trajectory[oidx_sort[i]] - W.write(atoms[j]) - return - - -def compare_projections_tica(data_a, data_b, tica, num=3, saveas=None, label_a=None, label_b=None): - """ - Compare two datasets along a given time-lagged indepedent component. 
- - Parameters - ---------- - data_a : float array - Trajectory data [frames,frame_data] - data_b : float array - Trajectory data [frames,frame_data] - tica : TICA object - Time-lagged independent components information. - num : int, default=3 - Number of time-lagged independent components to plot. - saveas : str, optional - Name of the output file. - label_a : str, optional - Label for the first dataset. - label_b : str, optional - Label for the second dataset. - - """ - # Start the figure - fig,ax = plt.subplots(num, 2, figsize=[8,3*num], dpi=300) - # Loop over PCs - for evi in range(num): - # Calculate values along TIC for each frame - proj_a = project_on_tic(data_a, evi, tica=tica) - proj_b = project_on_tic(data_b, evi, tica=tica) - # Plot the time series in the left panel - ax[evi,0].plot(proj_a, alpha=0.5, label=label_a) - ax[evi,0].plot(proj_b, alpha=0.5, label=label_b) - ax[evi,0].set_xlabel('frame number') - ax[evi,0].set_ylabel('TIC %i'%(evi+1)) - # Plot the histograms in the right panel - ax[evi,1].hist(proj_a, bins=30, alpha=0.5, density=True, label=label_a) - ax[evi,1].hist(proj_b, bins=30, alpha=0.5, density=True, label=label_b) - ax[evi,1].set_xlabel('TIC %i'%(evi+1)) - ax[evi,1].set_ylabel('frequency') - # Legend - if label_a and label_b: - ax[evi,0].legend() - ax[evi,1].legend() - fig.tight_layout() - # Save the figure - if saveas is not None: - fig.savefig(saveas, dpi=300) - return + all_data = np.concatenate(data,0) + # Calculate the independent components + tica = pyemma.coordinates.tica(all_data, lag=lag) + # Initialize output + sorted_proj = [] + sorted_indices_data = [] + sorted_indices_traj = [] + # Loop over principal components. 
+ for evi in range(num_ic): + # Project the combined data on the independent component + proj = [project_on_tic(d, evi, tica=tica) for d in data] + # Sort everything along the projection on the respective independent component + out_xtc = out_name+"_tic"+str(evi+1)+".xtc" + proj_sort, sort_idx, oidx_sort = merge_and_sort_coordinates( + proj, top, trj, out_xtc, start_frame=start_frame, verbose=False + ) + sorted_proj.append(proj_sort) + sorted_indices_data.append(sort_idx) + sorted_indices_traj.append(oidx_sort) + return sorted_proj, sorted_indices_data, sorted_indices_traj + diff --git a/pensa/dimensionality/visualization.py b/pensa/dimensionality/visualization.py new file mode 100644 index 00000000..7263a7a3 --- /dev/null +++ b/pensa/dimensionality/visualization.py @@ -0,0 +1,183 @@ +import numpy as np +import pyemma +from pyemma.util.contexts import settings +import MDAnalysis as mda +import matplotlib.pyplot as plt +from pensa.preprocessing import sort_coordinates, merge_and_sort_coordinates + + +def project_on_eigenvector(data, ev_idx, ana): + """ + Projects a trajectory onto an eigenvector of its PCA/tICA. + + Parameters + ---------- + data : float array + Trajectory data [frames,frame_data]. + ev_idx : int + Index of the eigenvector to project on (starts with zero). + ana : PCA or tICA obj + Information of pre-calculated PCA or tICA. + Must be calculated for the same features (but not necessarily the same trajectory). + + Returns + ------- + projection : float array + Value along the PC for each frame. + + """ + # Project the features onto the components + projection = np.zeros(data.shape[0]) + for ti in range(data.shape[0]): + projection[ti] = np.dot(data[ti], ana.eigenvectors[:,ev_idx]) + # Return the value along the PC for each frame + return projection + + +def compare_projections(data_a, data_b, ana, num=3, saveas=None, label_a=None, label_b=None): + """ + Compare two datasets along the components of a PCA or tICA. 
+ + Parameters + ---------- + data_a : float array + Trajectory data [frames,frame_data]. + data_b : float array + Trajectory data [frames,frame_data]. + ana : PCA or tICA object + Components analysis information. + num : int + Number of components to plot. + saveas : str, optional + Name of the output file. + label_a : str, optional + Label for the first dataset. + label_b : str, optional + Label for the second dataset. + + Returns: + -------- + projections : list of float arrays + Projections of the trajectory on each component. + + """ + if label_a is not None and label_b is not None: + labels = [label_a, label_b] + else: + labels = None + projections = compare_mult_projections([data_a, data_b], ana, num=num, saveas=saveas, labels=labels, colors=None) + return projections + + +def compare_mult_projections(data, ana, num=3, saveas=None, labels=None, colors=None): + """ + Compare multiple datasets along the components of a PCA or tICA. + + Parameters + ---------- + data : list of float arrays + Data from multiple trajectories [frames,frame_data]. + ana : PCA or tICA object + Components analysis information. + num : int + Number of principal components to plot. + saveas : str, optional + Name of the output file. + labels : list of str, optional + Labels for the datasets. If provided, it must have the same length as data. + + Returns: + -------- + projections : list of float arrays + Projections of the trajectory on each principal component. 
+ + """ + if labels is not None: + assert len(labels) == len(data) + else: + labels = [None for _ in range(len(data))] + if colors is not None: + assert len(colors) == len(data) + else: + colors = ['C%i'%num for num in range(len(data))] + # Start the figure + fig,ax = plt.subplots(num, 2, figsize=[9,3*num], dpi=300) + # Loop over components + projections = [] + for evi in range(num): + proj_evi = [] + for j,d in enumerate(data): + # Calculate values along PC for each frame + proj = project_on_eigenvector(d, evi, ana) + # Plot the time series in the left panel + ax[evi,0].plot(proj, alpha=0.5, + label=labels[j], color=colors[j]) + # Plot the histograms in the right panel + ax[evi,1].hist(proj, bins=30, alpha=0.5, density=True, + label=labels[j], color=colors[j]) + proj_evi.append(proj) + projections.append(proj_evi) + # Axis labels + ax[evi,0].set_xlabel('frame number') + ax[evi,0].set_ylabel('PC %i'%(evi+1)) + ax[evi,1].set_xlabel('PC %i'%(evi+1)) + ax[evi,1].set_ylabel('frequency') + # Legend + if labels[0] is not None: + ax[evi,0].legend() + ax[evi,1].legend() + fig.tight_layout() + # Save the figure + if saveas is not None: + fig.savefig(saveas, dpi=300) + return projections + + +def sort_traj_along_projection(data, ana, top, trj, out_name, num_comp=3, start_frame=0): + """ + Sort a trajectory along given principal components. + + Parameters + ---------- + data : float array + Trajectory data [frames,frame_data]. + ana : PCA or tICA obj + Components information. + top : str + File name of the reference topology for the trajectory. + trj : str + File name of the trajetory from which the frames are picked. + Should be the same as data was from. + out_name : str + Core part of the name of the output files + num_comp : int, optional + Sort along the first num_comp components. + start_frame : int, optional + Offset of the data with respect to the trajectories (defined below). 
+ + Returns + ------- + sorted_proj: list + sorted projections on each component + sorted_indices_data : list + Sorted indices of the data array for each component + sorted_indices_traj : list + Sorted indices of the coordinate frames for each component + + """ + # Initialize output + sorted_proj = [] + sorted_indices_data = [] + sorted_indices_traj = [] + # Loop through the principal components + for evi in range(num_comp): + # Project the combined data on the principal component + proj = project_on_eigenvector(data, evi, ana) + # Sort everything along the projection onto the PC + out_xtc = out_name+"_pc"+str(evi+1)+".xtc" + proj_sort, sort_idx, oidx_sort = sort_coordinates(proj, top, trj, out_xtc, start_frame=start_frame) + sorted_proj.append(proj_sort) + sorted_indices_data.append(sort_idx) + sorted_indices_traj.append(oidx_sort) + return sorted_proj, sorted_indices_data, sorted_indices_traj + diff --git a/pensa/features/__init__.py b/pensa/features/__init__.py index 4c9a5bde..4a9a0ed4 100644 --- a/pensa/features/__init__.py +++ b/pensa/features/__init__.py @@ -8,4 +8,5 @@ from .mda_distances import * from .processing import * from .atom_features import * -from .water_features import * +from .water_features import * +from .txt_features import * diff --git a/pensa/features/processing.py b/pensa/features/processing.py index 55c76af0..c09e233e 100644 --- a/pensa/features/processing.py +++ b/pensa/features/processing.py @@ -9,8 +9,7 @@ import matplotlib.pyplot as plt import os import warnings -#from pensa.features import * - +from pensa.preprocessing import sort_coordinates # -- Utilities to extract time series -- @@ -218,10 +217,52 @@ def sort_sincos_torsions_by_resnum(tors, data): new_data = data[:,new_order] return new_tors, new_data +def sort_torsions_by_resnum(tors, data): + """ + Sort torsion features by the residue number.. + Parameters + ---------- + tors : list of str + The list of torsion features. 
+ Returns + ------- + new_tors : list of str + The sorted list of torsion features. + """ + renamed = [] + for t in tors: + rn = t.split(' ')[-1] + ft = t.split(' ')[0] + renamed.append('%09i %s'%(int(rn),ft)) + new_order = np.argsort(renamed) + new_tors = np.array(tors)[new_order].tolist() + new_data = data[:,new_order] + return new_tors, new_data + +def sort_features_alphabetically(tors, data): + """ + Sort torsion features alphabetically. + Parameters + ---------- + tors : list of str + The list of torsion features. + Returns + ------- + new_tors : list of str + The sorted list of torsion features. + """ + renamed = [] + for t in tors: + renamed.append(t) + new_order = np.argsort(renamed) + new_tors = np.array(tors)[new_order].tolist() + new_data = data[:,new_order] + return new_tors, new_data + def sort_distances_by_resnum(dist, data): """ - Sort distance features by the residue number.. + Sort distance features by the residue number. Parameters ---------- dist : list of str @@ -241,13 +282,64 @@ def sort_distances_by_resnum(dist, data): return new_dist, new_data -def select_common_features(features_a, features_b): +def select_common_features(features_a, features_b, boolean=True): + """ + Finds features in common between two trajectories. + + Parameters + ---------- + features_a : list of str + First set of features. + features_b : list of str + Second set of features. + boolean : bool + Determines if returned array contains booleans or features. + Returns + ------- + common_a : np array of bool or str + Common features taken from features_a. + common_b : np array of bool or str + Common features taken from features_b. 
+ """ intersect = set(features_a).intersection(features_b) - is_common_a = [f in intersect for f in features_a] - is_common_b = [f in intersect for f in features_b] - return np.array(is_common_a), np.array(is_common_b) + if boolean: + is_common_a = [f in intersect for f in features_a] + is_common_b = [f in intersect for f in features_b] + else: + is_common_a = [f for f in features_a if f in intersect] + is_common_b = [f for f in features_b if f in intersect] + common_a = np.array(is_common_a) + common_b = np.array(is_common_b) + return common_a, common_b +def get_common_features_data(features_a, features_b, data_a, data_b): + """ + Finds common features and corresponding data from two trajectories. + + Parameters + ---------- + features_a : list of str + First set of features. + features_b : list of str + Second set of features. + data_a : float array + Data from first trajectory. + data_b : float array + Data from second trajectory. + Returns + ------- + new_features_a, new_features_b : np array of str + Common features between the two trajectories. + new_data_a, new_data_b : float array + Data corresponding to common features between the two trajectories. + """ + is_common_a, is_common_b = select_common_features(features_a, features_b) + new_data_a = data_a[:,is_common_a] + new_data_b = data_b[:, is_common_b] + new_features_a, new_features_b = select_common_features(features_a, features_b, boolean=False) + return new_features_a, new_features_b, new_data_a, new_data_b + # -- Utilities to process feature data -- @@ -283,3 +375,37 @@ def correct_angle_periodicity(angle): return new_angle +# Process trajectories according to feature data + +def sort_traj_along_feature(feat, data, feature_name, ref_name, trj_name, out_name, start_frame=0, verbose=False): + """ + Sort a trajectory along a feature. + + Parameters + ---------- + feat : list of str + List with all feature names. + data : float array + Feature values data from the simulation. 
+ feature_name : str + Name of the selected feature. + ref_name: string + Reference topology for the trajectory. + trj_name: string + Trajetory from which the frames are picked. + Usually the same as the values are from. + out_name: string. + Name of the output files. + start_frame: int + Offset of the data with respect to the trajectories. + + Returns + ------- + d_sorted: float array + Sorted data of the selected feature. + + """ + if verbose: print('Sorting along feature '+feature_name) + d = get_feature_data(feat, data, feature_name) + d_sorted, sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) + return d_sorted diff --git a/pensa/features/pyemma_features.py b/pensa/features/pyemma_features.py index ee60b067..8efe1690 100644 --- a/pensa/features/pyemma_features.py +++ b/pensa/features/pyemma_features.py @@ -10,7 +10,8 @@ import numpy as np import pyemma from pyemma.util.contexts import settings - +from pensa.features.processing import get_feature_timeseries +from pensa.preprocessing.coordinates import sort_coordinates # -- Loading the Features -- @@ -41,9 +42,9 @@ def get_structure_features(pdb, xtc, start_frame=0, step_width=1, cossin=False, Returns ------- - feature_names : list of str + feature_names : dict of lists of str Names of all features - features_data : numpy array + features_data : dict of numpy arrays Data for all features """ @@ -169,4 +170,38 @@ def _remove_resnum_offset(features, offset): return new_features +def sort_traj_along_pyemma_feature(feat, data, feature_name, feature_type, ref_name, trj_name, out_name, start_frame=0): + """ + Sort a trajectory along a PyEMMA feature. + + Parameters + ---------- + feat : list of str + List with all feature names. + data : float array + Feature values data from the simulation. + feature_name : str + Name of the selected feature. + feature_type : str + Type of the selected feature. + ref_name: string + Reference topology for the trajectory. 
+ trj_name: string + Trajetory from which the frames are picked. + Usually the same as the values are from. + out_name: string. + Name of the output files. + start_frame: int + Offset of the data with respect to the trajectories. + + Returns + ------- + d_sorted: float array + Sorted data of the selected feature. + + """ + d = get_feature_timeseries(feat, data, feature_type, feature_name) + sort_idx, oidx_sort = sort_coordinates(d, ref_name, trj_name, out_name, start_frame=start_frame) + d_sorted = d[sort_idx] + return d_sorted diff --git a/pensa/features/txt_features.py b/pensa/features/txt_features.py new file mode 100644 index 00000000..7f57cac5 --- /dev/null +++ b/pensa/features/txt_features.py @@ -0,0 +1,66 @@ +import numpy as np +from pensa import * +import random +import math + + + +def get_txt_features_ala2(filename, num_frames, cossin=False): + """ + Parses features for ala2 from a text file. + The text file must be formatted with "phi", followed by all phi angles, a blank line, + followed by "psi" and all psi angles, with one angle per line. + + Parameters + ---------- + filename : str + File name of the text file. + num_frames : int + Maximum number of trajectory frames used in features array. + cossin : bool + Determines if the features array contains torsion angles or the sin and cos of torsion angles. 
+ + Returns + ------- + features : numpy array + Data for all features + + """ + phi = [] + psi = [] + curr = 'phi' + with open(filename) as f: + for s in f.readlines(): + if s == 'phi\n' or s == 'psi\n': + continue + if s == '\n': + curr = 'psi' + else: + if curr == 'phi': + phi.append(float(s)) + else: + psi.append(float(s)) + + if len(phi) > num_frames: + temp = list(zip(phi, psi)) + random.shuffle(temp) + phi, psi = zip(*temp) + + features = [] + if not cossin: + features = np.zeros((num_frames, 2)) + for i in range(num_frames): + features[i, 0] = phi[i] + reatures[i, 1] = psi[i] + else: + features = np.zeros((num_frames, 4)) + for i in range(num_frames): + features[i, 0] = math.cos(phi[i]) + features[i, 1] = math.sin(phi[i]) + features[i, 2] = math.cos(psi[i]) + features[i, 3] = math.sin(psi[i]) + + return features + + + diff --git a/pensa/preprocessing/coordinates.py b/pensa/preprocessing/coordinates.py index aa476316..db3a9a68 100644 --- a/pensa/preprocessing/coordinates.py +++ b/pensa/preprocessing/coordinates.py @@ -7,44 +7,17 @@ # -- Functions to preprocess trajectories -- -def align_coordinates(ref, pdb, trj_list, out_name, sel_string='all', start_frame=0): - """ - Aligns selected coordinates from a trajectory file. - - Parameters - ---------- - ref : str - File name for reference topology. - Can read all MDAnalysis-compatible topology formats. - pdb : str - File name for reference PDB file. - trj_list : list of str - File names for the input trajectory. - Can read all MDAnalysis-compatible trajectory formats. - out_name : str - Core of the file names for the output files - start_frame : int, optional - First frame to read from the trajectory. - """ - # Read the reference+PDB files and align selected parts. 
- u = mda.Universe(ref, pdb) - for trj in trj_list: - mobile = mda.Universe(ref, trj) - #mobile.trajectory = mobile.trajectory[start_frame:] - alignment = align.AlignTraj(mobile, u, select=sel_string, filename=f'{out_name}.xtc') - alignment.run() - -def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, +def extract_coordinates(top, pdb, trj_list, out_name, sel_string, start_frame=0, rename_segments=None, residues_offset=0 ): """ Extracts selected coordinates from a trajectory file. Parameters ---------- - ref : str - File name for reference topology. + top : str + File name for topology. Can read all MDAnalysis-compatible topology formats. pdb : str File name for the reference PDB file. @@ -57,8 +30,8 @@ def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, First frame to read from the trajectory. """ - # Read the reference+PDB files and extract selected parts. - u = mda.Universe(ref,pdb) + # Read the topology+PDB files and extract selected parts. + u = mda.Universe(top,pdb) u.residues.resids -= residues_offset selection = u.select_atoms(sel_string) num_at = selection.n_atoms @@ -70,7 +43,7 @@ def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, # Read the trajectories and extract selected parts. with mda.Writer(out_name+'.xtc', selection.n_atoms) as W: for trj in trj_list: - u = mda.Universe(ref,trj) + u = mda.Universe(top,trj) u.residues.resids -= residues_offset selection = u.select_atoms(sel_string) for ts in u.trajectory[start_frame:]: @@ -78,14 +51,14 @@ def extract_coordinates(ref, pdb, trj_list, out_name, sel_string, start_frame=0, return num_at -def extract_coordinates_combined(ref, trj, sel_string, out_name, start_frame=0, verbose=False): +def extract_coordinates_combined(top, trj, sel_string, out_name, start_frame=0, verbose=False): """ Extracts selected coordinates from several trajectory files. 
Parameters ---------- - ref : list of str - File names for the reference topologies. + top : list of str + File names for the topologies. Can read all MDAnalysis-compatible topology formats. trj : list of str File names for the input trajectories. @@ -97,12 +70,12 @@ def extract_coordinates_combined(ref, trj, sel_string, out_name, start_frame=0, """ # Determine the number of atoms from the first trajectory - u = mda.Universe(ref[0], trj[0]) + u = mda.Universe(top[0], trj[0]) selection = u.select_atoms(sel_string[0]) num_at = selection.n_atoms # Go through trajectories and write selections with mda.Writer(out_name+'.xtc', num_at) as W: - for r, t, s in zip(ref, trj, sel_string): + for r, t, s in zip(top, trj, sel_string): print(r, t) if verbose: print(s) u = mda.Universe(r, t) @@ -112,14 +85,14 @@ def extract_coordinates_combined(ref, trj, sel_string, out_name, start_frame=0, return num_at -def merge_coordinates(ref_files, trj_files, out_name, segid=None): +def merge_coordinates(top_files, trj_files, out_name, segid=None): """ - Merges the trajectories of several different systems or system parts. + Merge several trajectories of the same system or system part. All trajectories must be (at least) as long as the first one. Parameters ---------- - ref_files : str[] + top_files : str[] List of input topology files. trj_files : str[]: List of input trajectory files. @@ -134,10 +107,10 @@ def merge_coordinates(ref_files, trj_files, out_name, segid=None): MDAnalysis universe of the merged system. 
def align_coordinates(top, pdb, trj_list, out_name, sel_string='all', start_frame=0):
    """
    Aligns selected coordinates from a trajectory file.

    Parameters
    ----------
        top : str
            File name for the topology.
            Can read all MDAnalysis-compatible topology formats.
        pdb : str
            File name for the reference PDB file.
        trj_list : list of str
            File names for the input trajectory.
            Can read all MDAnalysis-compatible trajectory formats.
        out_name : str
            Core of the file names for the output files
        sel_string : str, optional
            MDAnalysis selection string for the atoms used in the alignment.
            Defaults to 'all'.
        start_frame : int, optional
            First frame to read from the trajectory.

    Notes
    -----
    Every trajectory in trj_list is written to the same file
    (out_name + '.xtc'), so with several inputs only the last
    alignment remains on disk.

    """
    # Read the topology+PDB files and align selected parts.
    reference = mda.Universe(top, pdb)
    for trj in trj_list:
        mobile = mda.Universe(top, trj)
        alignment = align.AlignTraj(mobile, reference, select=sel_string,
                                    filename=f'{out_name}.xtc')
        # Skip the frames before start_frame (0 processes the whole trajectory).
        alignment.run(start=start_frame)


def sort_coordinates(values, top_name, trj_name, out_name, start_frame=0, verbose=False):
    """
    Sort coordinate frames along corresponding values.

    Parameters
    ----------
    values: float array
        Values along which to sort the trajectory (one value per frame).
    top_name: str
        Topology for the trajectory.
    trj_name: str
        Trajectory from which the frames are picked.
        Usually the same as the values are from.
    out_name: str
        Name of the output trajectory (usual format is .xtc).
    start_frame: int
        Offset of the data with respect to the trajectories.
    verbose: bool, optional
        If True, print the input files and the sizes involved.

    Returns
    -------
    data_sort: float array
        Sorted values of the input data.
    sort_idx: int array
        Indices into the values that put them in ascending order.
    oidx_sort: int array
        Trajectory frame indices (offset included) in the order written.

    Raises
    ------
    IndexError
        If the values, shifted by start_frame, reach beyond the trajectory.

    """
    # Accept plain sequences as well; fancy indexing below needs an array.
    values = np.asarray(values)
    # Remember the index in the simulation (taking into account offset)
    oidx = np.arange(len(values)) + start_frame
    # Define the MDAnalysis trajectory from where the frames come
    if verbose:
        print('Loading:', top_name, trj_name)
    u = mda.Universe(top_name, trj_name)
    if verbose:
        print('Trajectory length:', len(u.trajectory))
        print('Number of values: ', len(values))
        print('Trajectory offset:', start_frame)
    # Fail before any output is written instead of midway through the file.
    if len(oidx) > 0 and oidx.max() >= len(u.trajectory):
        raise IndexError(
            'Values with offset reach frame %i but the trajectory has only %i frames.'
            % (oidx.max(), len(u.trajectory))
        )
    a = u.select_atoms('all')
    # Sort everything along the projection on the values
    sort_idx = np.argsort(values)
    data_sort = values[sort_idx]
    oidx_sort = oidx[sort_idx]
    # Write out sorted trajectory
    with mda.Writer(out_name, a.n_atoms) as W:
        for frame in oidx_sort:
            # Indexing the trajectory makes that frame the current one,
            # so the atom group written next carries its coordinates.
            u.trajectory[frame]
            W.write(a)
    return data_sort, sort_idx, oidx_sort


def merge_and_sort_coordinates(values, top_names, trj_names, out_name, start_frame=0, verbose=False):
    """
    Write multiple trajectories of coordinate frames into one trajectory, sorted along corresponding values.

    Parameters
    ----------
    values: list of float arrays
        Values along which to sort the trajectory (one array per trajectory).
    top_names: list of str
        Topologies for the trajectories.
    trj_names: list of str
        Trajectories from which the frames are picked.
        Usually the same as the values are from.
    out_name: str
        Name of the output trajectory (usual format is .xtc).
    start_frame: int or list of int
        Offsets of the data with respect to the trajectories.
        A single int is applied to every trajectory.
        Defaults to zero.
    verbose: bool, optional
        If True, print the length of each trajectory while loading.

    Returns
    -------
    data_sort: float array
        Sorted values of the combined input data.
    sort_idx: int array
        Indices into the concatenated values that put them in ascending order.
    oidx_sort: int array
        Frame indices within the respective source trajectory (offset
        included), in the order written.

    """
    sequence_types = (list, tuple)
    assert isinstance(values, sequence_types), 'values must be a list'
    assert isinstance(top_names, sequence_types), 'top_names must be a list'
    assert isinstance(trj_names, sequence_types), 'trj_names must be a list'
    # Get some stats
    num_traj = len(values)
    num_frames = [len(val) for val in values]
    # Number of values must be consistent with topologies and trajectories
    assert num_traj == len(top_names), 'need one topology per array of values'
    assert num_traj == len(trj_names), 'need one trajectory per array of values'
    # Use the same offset everywhere if only a single one is provided
    if isinstance(start_frame, (int, np.integer)):
        start_frame = [start_frame] * num_traj
    assert len(start_frame) == num_traj, 'need one start frame per trajectory'
    # Make sure the start indices are integers (MDA does not accept floats for indexing a trajectory)
    start_frame = [int(sf) for sf in start_frame]

    # Combine the input data
    data = np.concatenate(values)
    # Remember which simulation the data came from
    cond = np.concatenate([np.full(n, i, dtype=int) for i, n in enumerate(num_frames)])
    # Remember the index in the respective simulation (taking into account cutoff)
    oidx = np.concatenate([np.arange(n, dtype=int) + sf for n, sf in zip(num_frames, start_frame)])
    assert np.issubdtype(oidx.dtype, np.integer), 'frame indices must be integers'

    # Define the MDAnalysis trajectories from where the frames come
    univs = []
    atoms = []
    for top_name, trj_name in zip(top_names, trj_names):
        u = mda.Universe(top_name, trj_name)
        if verbose:
            print('Length of trajectory', len(u.trajectory))
        univs.append(u)
        atoms.append(u.select_atoms('all'))
        # Make sure the numbers of atoms are the same in each trajectory
        assert atoms[-1].n_atoms == atoms[0].n_atoms

    # Sort everything along the data
    sort_idx = np.argsort(data)
    data_sort = data[sort_idx]
    cond_sort = cond[sort_idx]
    oidx_sort = oidx[sort_idx]

    # Write the trajectory, ordered along the values
    with mda.Writer(out_name, atoms[0].n_atoms) as W:
        for j, o in zip(cond_sort, oidx_sort):
            # Indexing the trajectory makes that frame the current one,
            # so the atom group written next carries its coordinates.
            univs[j].trajectory[o]
            W.write(atoms[j])

    return data_sort, sort_idx, oidx_sort
--- /dev/null +++ b/run-test.sb @@ -0,0 +1,16 @@ +#!/bin/bash + +#SBATCH --time=24:00:00 +#SBATCH --mem=20G +#SBATCH --partition=rondror +#SBATCH --qos=high_p + +# Activate PENSA environment +source /home/users/mvoegele/miniconda3/etc/profile.d/conda.sh +conda activate /oak/stanford/groups/rondror/users/mvoegele/envs/pensa_dev + +# Run the tests (ignoring diffnets for now) +pytest --ignore pensa/diffnets/tests/test_api.py \ + --ignore pensa/diffnets/tests/test_cli.py \ + --ignore pensa/diffnets/tests/test_diffnets.py + diff --git a/scripts/calculate_combined_principal_components.py b/scripts/calculate_combined_principal_components.py index 3e0b58d7..d25fe8a3 100644 --- a/scripts/calculate_combined_principal_components.py +++ b/scripts/calculate_combined_principal_components.py @@ -68,9 +68,9 @@ args.num_components, args.feat_threshold, plot_file=args.out_plots+"_"+ftype+"_feature_correlation.pdf") # Sort each of the trajectories along the top components of combined data - sort_trajs_along_common_pc(data_a[ftype], data_b[ftype], args.start_frame, + sort_trajs_along_common_pc(data_a[ftype], data_b[ftype], args.ref_file_a, args.ref_file_b, args.trj_file_a, args.trj_file_b, - args.out_pc, num_pc=args.num_components) + args.out_pc, num_pc=args.num_components, start_frame=args.start_frame) # Plot histograms of both simulations along the common PCs compare_projections(data_a[ftype], data_b[ftype], pca, num=args.num_components, saveas=args.out_plots+"_"+ftype+"_pc-comparison.pdf") diff --git a/setup.py b/setup.py index 4dd2d823..ddb50063 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup(name='pensa', - version='0.2.4', + version='0.2.5', description='PENSA - protein ensemble analysis', url='http://github.com/drorlab/pensa', author='Martin Voegele, Neil Thomson, Sang Truong, Jasper McAvity', diff --git a/tests/test_workflow.py b/tests/test_workflow.py index b1898e54..5ef9a44c 100644 --- a/tests/test_workflow.py +++ 
b/tests/test_workflow.py @@ -81,23 +81,23 @@ def setUp(self): # -- Sort trajectory along common pc self.sort_common_traj = sort_trajs_along_common_pc(self.sim_a_tmr_data['bb-torsions'], self.sim_b_tmr_data['bb-torsions'], - start_frame, test_data_path + "/traj/condition-a_receptor.gro", test_data_path + "/traj/condition-b_receptor.gro", test_data_path + "/traj/condition-a_receptor.xtc", test_data_path + "/traj/condition-b_receptor.xtc", test_data_path + "/pca/receptor_by_tmr", - num_pc=3) + num_pc=3, start_frame = start_frame) plt.close() # -- Sort trajectory pc pca_a = calculate_pca(self.sim_a_tmr_data['bb-torsions']) pca_features(pca_a, self.sim_a_tmr_feat['bb-torsions'], 3, 0.4) plt.close() - self.sort_traj, self.all_proj = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], pca_a, 0, - test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", num_pc=3) + self.all_sort, _, _ = sort_traj_along_pc(self.sim_a_tmr_data['bb-torsions'], + test_data_path + "/traj/condition-a_receptor.gro", + test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/pca/condition-a_receptor_by_tmr", + pca=pca_a, num_pc=3) # -- Compare projections self.val = compare_projections(self.sim_a_tmr_data['bb-torsions'], @@ -273,36 +273,32 @@ def test_tica_features(self): # -- sort_trajs_along_common_pc() + sort_traj_along_pc() + project_on_pc() def test_sort_trajs_along_pc(self): - self.assertEqual(len(self.sort_common_traj), 180) for ele in self.sort_common_traj: - self.assertEqual(ele.n_atoms, 2322) - - self.assertEqual(len(self.sort_traj), 90) - for ele in self.sort_traj: - self.assertEqual(ele.n_atoms, 2322) + self.assertEqual(len(ele), 3) - self.assertEqual(len(self.all_proj), 3) + self.assertEqual(len(self.all_sort), 3) # -- sort_trajs_along_common_tic() def test_sort_trajs_along_common_tic(self): - proj, atom = 
sort_trajs_along_common_tic(self.sim_a_tmr_data['bb-torsions'], - self.sim_b_tmr_data['bb-torsions'], 0, + sproj, sidx_data, sidx_traj = sort_trajs_along_common_tic(self.sim_a_tmr_data['bb-torsions'], + self.sim_b_tmr_data['bb-torsions'], test_data_path + "/traj/condition-a_receptor.gro", test_data_path + "/traj/condition-b_receptor.gro", test_data_path + "/traj/condition-a_receptor.xtc", test_data_path + "/traj/condition-b_receptor.xtc", test_data_path + "/tica/receptor_by_tmr", - num_tic=3) - self.assertEqual(len(proj), 60) - self.assertEqual(len(atom), 60) + num_ic=3) + self.assertEqual(len(sproj[0]), 60) + self.assertEqual(len(sidx_data[0]), 60) # -- sort_traj_along_tic() def test_sort_traj_along_tic(self): - oidx = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], self.tica_combined, 0, - test_data_path + "/traj/condition-a_receptor.gro", - test_data_path + "/traj/condition-a_receptor.xtc", - test_data_path + "/pca/condition-a_receptor_by_tmr", num_tic=3) - self.assertEqual(len(oidx), 30) + all_sort, _, _ = sort_traj_along_tic(self.sim_a_tmr_data['bb-torsions'], + test_data_path + "/traj/condition-a_receptor.gro", + test_data_path + "/traj/condition-a_receptor.xtc", + test_data_path + "/pca/condition-a_receptor_by_tmr", + tica = self.tica_combined, num_ic=3) + self.assertEqual(len(all_sort), 3) # -- compare_projections()