diff --git a/pypef/__init__.py b/pypef/__init__.py index 8e000d9..42404b4 100644 --- a/pypef/__init__.py +++ b/pypef/__init__.py @@ -17,4 +17,4 @@ # §Equal contribution -VERSION = '0.2.0-alpha' +VERSION = '0.2.1-alpha' diff --git a/pypef/dca/encoding.py b/pypef/dca/encoding.py index e7ac0e6..7255f71 100644 --- a/pypef/dca/encoding.py +++ b/pypef/dca/encoding.py @@ -285,14 +285,14 @@ def __read_plmc_v2(self, filename, precision): self.wt_aa_pos = [] for aa, pos in zip(self._target_seq, self.index_list): self.wt_aa_pos.append(str(aa) + str(pos)) + print(f'Evaluating gap content of PLMC parameter file... ' + f'First amino acid position used in the MSA (PLMC params file) is ' + f'{self._target_seq[0]}{self.index_list[0]} and the last position ' + f'used is {self._target_seq[-1]}{self.index_list[-1]}.') + if len(not_valid) > 0: + print(f'\nFurther, non-included positions are:') + print(str(not_valid)[1:-1]) if self.verbose: - print(f'Evaluating gap content of PLMC parameter file... ' - f'First amino acid position used in the MSA (PLMC params file) is ' - f'{self._target_seq[0]}{self.index_list[0]} and the last position ' - f'used is {self._target_seq[-1]}{self.index_list[-1]}.') - if len(not_valid) > 0: - print(f'\nFurther, non-included positions are:') - print(str(not_valid)[1:-1]) print(f'\nSummary of all effective positions represented in the MSA ' f'based on wild-type sequence ({len(valid)} encoded positions):') for aa_pos in self.wt_aa_pos: diff --git a/pypef/dca/hybrid_model.py b/pypef/dca/hybrid_model.py index 562df93..bc29859 100644 --- a/pypef/dca/hybrid_model.py +++ b/pypef/dca/hybrid_model.py @@ -35,7 +35,7 @@ import matplotlib.pyplot as plt from pypef.utils.variant_data import get_sequences_from_file, remove_nan_encoded_positions from pypef.dca.encoding import DCAEncoding, get_dca_data_parallel, get_encoded_sequence, ActiveSiteError -from pypef.ml.regression import predictions_out +from pypef.ml.regression import predictions_out, plot_y_true_vs_y_pred np.warnings.filterwarnings('error', category=np.VisibleDeprecationWarning) @@ -570,7 +570,7 @@ def run( """ -def get_model_and_save_pkl( +def generate_model_and_save_pkl( xs: list, ys: list, dca_encoder: DCAEncoding, @@ -650,51 +650,17 @@ def get_model_and_save_pkl( pass print(f'Save model as Pickle file... HYBRIDMODEL') pickle.dump( - {'hybrid_model': hybrid_model, 'beta_1': beta_1, 'beta_2': beta_2, - 'spearman_rho': test_spearman_r, 'regressor': reg}, + { + 'hybrid_model': hybrid_model, + 'beta_1': beta_1, + 'beta_2': beta_2, + 'spearman_rho': test_spearman_r, + 'regressor': reg + }, open('Pickles/HYBRIDMODEL', 'wb') ) -def plot_y_true_vs_y_pred( - y_true: np.ndarray, - y_pred: np.ndarray, - variants: np.ndarray, # just required for labeling - label=False -): - """ - Plots predicted versus true values using the hybrid model for prediction. - Function called by function predict_ps. - """ - spearman_rho = spearmanr(y_true, y_pred)[0] - - figure, ax = plt.subplots() - ax.scatter(y_true, y_pred, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.7, - label=fr'$\rho$ = {spearman_rho:.3f}') - ax.legend() - ax.set_xlabel(r'$y_\mathrm{true}$' + fr' ($N$ = {len(y_true)})') - ax.set_ylabel(r'$y_\mathrm{pred}$' + fr' ($N$ = {len(y_pred)})') - print('\nPlotting...') - if label: - from adjustText import adjust_text - print('Adjusting variant labels for plotting can take some ' - 'time (the limit for labeling is 150 data points)...') - if len(y_true) < 150: - texts = [ax.text(y_true[i], y_pred[i], txt, fontsize=4) - for i, txt in enumerate(variants)] - adjust_text( - texts, only_move={'points': 'y', 'text': 'y'}, force_points=0.5, lim=250) - else: - print("Terminating label process. Too many variants " - "(> 150) for plotting (labels would overlap).") - file_name = 'DCA_Hybrid_Model_LS_TS_Performance.png' - # i = 1 - # while os.path.isfile(file_name): - # i += 1 # iterate until finding an unused file name - # file_name = f'DCA_Hybrid_Model_LS_TS_Performance({i}).png' - plt.savefig(file_name, dpi=500) - - def performance_ls_ts( ls_fasta: str, ts_fasta: str, @@ -799,13 +765,18 @@ def performance_ls_ts( pass print(f'Save model as Pickle file... HYBRIDMODEL') pickle.dump( - {'hybrid_model': hybrid_model, 'beta_1': beta_1, 'beta_2': beta_2, - 'spearman_rho': test_spearman_r, 'regressor': reg}, + { + 'hybrid_model': hybrid_model, + 'beta_1': beta_1, + 'beta_2': beta_2, + 'spearman_rho': test_spearman_r, + 'regressor': reg + }, open('Pickles/HYBRIDMODEL', 'wb') ) -def predict_ps( # "predict pmult" +def predict_ps( # also predicting "pmult" dirs prediction_dict: dict, params_file: str, threads: int, @@ -974,7 +945,14 @@ def predict_ps( # "predict pmult" print('Testing performance...') print(f'Spearman\'s rho = {spearmanr(y_true, y_pred)[0]:.3f}') if figure is not None: - plot_y_true_vs_y_pred(np.array(y_true), np.array(y_pred), np.array(variants), label) + plot_y_true_vs_y_pred( + np.array(y_true), np.array(y_pred), np.array(variants), label, hybrid=True + ) + else: + raise SystemError( + 'Define set(s) for prediction (e.g. \'-p PS.fasta\' or ' + 'created prediction set folder, e.g. \'--pmult --drecomb\')' + ) def predict_directed_evolution( diff --git a/pypef/dca/run.py b/pypef/dca/run.py index b6e3340..1f40733 100644 --- a/pypef/dca/run.py +++ b/pypef/dca/run.py @@ -22,7 +22,7 @@ from pypef.utils.variant_data import read_csv, remove_nan_encoded_positions from pypef.dca.encoding import DCAEncoding, get_dca_data_parallel -from pypef.dca.hybrid_model import performance_ls_ts, predict_ps, get_model_and_save_pkl +from pypef.dca.hybrid_model import performance_ls_ts, predict_ps, generate_model_and_save_pkl from pypef.utils.low_n_mutation_extrapolation import performance_mutation_extrapolation, low_n @@ -77,7 +77,8 @@ def run_pypef_hybrid_modeling(arguments): if arguments['train_and_save']: dca_encode = DCAEncoding( params_file=arguments['--params'], - separator=arguments['--mutation_sep'] + separator=arguments['--mutation_sep'], + verbose=False ) variants, fitnesses, _ = read_csv(arguments['--input']) @@ -95,7 +96,7 @@ def run_pypef_hybrid_modeling(arguments): encoded_sequences, fitnesses = remove_nan_encoded_positions(copy.copy(encoded_sequences_), fitnesses) assert len(encoded_sequences) == len(variants) == len(fitnesses) - get_model_and_save_pkl( + generate_model_and_save_pkl( xs=encoded_sequences, ys=fitnesses, dca_encoder=dca_encode, diff --git a/pypef/main.py b/pypef/main.py index 944e01b..391086e 100644 --- a/pypef/main.py +++ b/pypef/main.py @@ -161,7 +161,8 @@ [--params PLMC_FILE] [--threads THREADS] [--nofft] [--negative] pypef ml --encoding ENCODING_TECHNIQUE --model MODEL12345 --pmult [--drecomb] [--trecomb] [--qarecomb] [--qirecomb] - [--ddiverse] [--tdiverse] [--qdiverse] [--nofft] [--negative] [--params PLMC_FILE] [--threads THREADS] + [--ddiverse] [--tdiverse] [--qdiverse] + [--regressor TYPE] [--nofft] [--negative] [--params PLMC_FILE] [--threads THREADS] pypef ml --encoding ENCODING_TECHNIQUE directevo --model MODEL12345 --wt WT_SEQ [--input CSV_FILE] [--y_wt WT_FITNESS] [--numiter NUM_ITER] [--numtraj NUM_TRAJ] [--temp TEMPERATURE] [--nofft] [--negative] [--usecsv] [--csvaa] [--drop THRESHOLD] [--params PLMC_FILE] diff --git a/pypef/ml/regression.py b/pypef/ml/regression.py index 4c0e591..b0436cf 100644 --- a/pypef/ml/regression.py +++ b/pypef/ml/regression.py @@ -660,8 +660,6 @@ def formatted_output( for row in data: f.write("".join(str(value).ljust(col_width) for value in row) + '\n') - return () - def cross_validation( x: np.ndarray, @@ -830,29 +828,35 @@ def save_model( os.mkdir('Pickles') except FileExistsError: pass - - try: - os.remove('CV_performance/_CV_Results.txt') - except FileNotFoundError: - pass - file = open('CV_performance/_CV_Results.txt', 'w') - file.write('5-fold cross-validated performance of top ' - 'models for validation set across all data.\n\n') - if no_fft: - file.write("No FFT used in this model construction, performance represents" - " model accuracies on raw encoded sequence data.\n\n") - file.close() - for t in range(threshold): + cv_filename, name = '', '' + train_sequences, train_variants, y_train = get_sequences_from_file(training_set) + test_sequences, test_variants, y_test = get_sequences_from_file(test_set) + for i, t in enumerate(range(threshold)): try: idx = performance_list[t][0] parameter = performance_list[t][7] + if i == 0: + if encoding == 'onehot' or encoding == 'dca': + name = idx + else: + name = get_basename(idx) + cv_filename = f'CV_performance/{name}_{regressor}_CV_Results.txt' + try: + os.remove(cv_filename) + except FileNotFoundError: + pass + file = open(cv_filename, 'w') + file.write('5-fold cross-validated performance of top ' + 'models for validation set across all data.\n\n') + if no_fft: + file.write("No FFT used in this model construction, performance represents" + " model accuracies on raw encoded sequence data.\n\n") + file.close() # Estimating the CV performance of the n_component-fitted model on all data - sequences_train, variants_train, y_train = get_sequences_from_file(training_set) - sequences_test, variants_test, y_test = get_sequences_from_file(test_set) if encoding == 'aaidx': # AAindex encoding technique - x_aaidx_train = AAIndexEncoding(full_aaidx_txt_path(idx), sequences_train) - x_aaidx_test = AAIndexEncoding(full_aaidx_txt_path(idx), sequences_test) + x_aaidx_train = AAIndexEncoding(full_aaidx_txt_path(idx), train_sequences) + x_aaidx_test = AAIndexEncoding(full_aaidx_txt_path(idx), test_sequences) if no_fft is False: # use FFT on encoded sequences (default) x_train, _ = x_aaidx_train.collect_encoded_sequences() x_test, _ = x_aaidx_test.collect_encoded_sequences() @@ -860,20 +864,22 @@ def save_model( _, x_train = x_aaidx_train.collect_encoded_sequences() _, x_test = x_aaidx_test.collect_encoded_sequences() elif encoding == 'onehot': # OneHot encoding technique - x_onehot_train = OneHotEncoding(sequences_train) - x_onehot_test = OneHotEncoding(sequences_test) + x_onehot_train = OneHotEncoding(train_sequences) + x_onehot_test = OneHotEncoding(test_sequences) x_train = x_onehot_train.collect_encoded_sequences() x_test = x_onehot_test.collect_encoded_sequences() else: # DCA dca_encoder = DCAEncoding(couplings_file, verbose=False) if threads > 1: # parallelization of encoding, NaNs are already being removed by the called function - train_variants, x_train, y_train = get_dca_data_parallel(variants_train, y_train, dca_encoder, threads) - test_variants, x_test, y_test = get_dca_data_parallel(variants_test, y_test, dca_encoder, threads) + train_variants, x_train, y_train = get_dca_data_parallel(train_variants, y_train, dca_encoder, threads) + test_variants, x_test, y_test = get_dca_data_parallel(test_variants, y_test, dca_encoder, threads) else: # encode using a single thread - x_train = dca_encoder.collect_encoded_sequences(variants_train) - x_test = dca_encoder.collect_encoded_sequences(variants_test) - x_train, y_train = remove_nan_encoded_positions(x_train, y_train) - x_test, y_test = remove_nan_encoded_positions(x_test, y_test) + x_train_ = dca_encoder.collect_encoded_sequences(train_variants) + x_test_ = dca_encoder.collect_encoded_sequences(test_variants) + x_train, y_train = remove_nan_encoded_positions(copy.copy(x_train_), y_train) + x_train, train_variants = remove_nan_encoded_positions(copy.copy(x_train_), y_train) + x_test, y_test = remove_nan_encoded_positions(copy.copy(x_test_), y_test) + x_test, test_variants = remove_nan_encoded_positions(copy.copy(x_test_), y_test) x = np.concatenate([x_train, x_test]) y = np.concatenate([y_train, y_test]) @@ -885,28 +891,28 @@ def save_model( r_squared, rmse, nrmse, pearson_r, spearman_rho = \ get_performances(y_test_total, y_predicted_total) - if encoding == 'onehot' or encoding == 'dca': - name = idx - else: - name = get_basename(idx) - with open('CV_performance/_CV_Results.txt', 'a') as f: + + with open(cv_filename, 'a') as f: f.write('Regression type: {}; Parameter: {}; Encoding index: {}\n'.format( regressor.upper(), parameter, name)) f.write('R2 = {:.5f}; RMSE = {:.5f}; NRMSE = {:.5f}; Pearson\'s r = {:.5f};' ' Spearman\'s rho = {:.5f}\n\n'.format(r_squared, rmse, nrmse, pearson_r, spearman_rho)) figure, ax = plt.subplots() - ax.scatter(y_test_total, y_predicted_total, marker='o', s=20, linewidths=0.5, edgecolor='black') + legend = r'$R^2$' + f' = {r_squared:.3f}' + f'\nRMSE = {rmse:.3f}' + f'\nNRMSE = {nrmse:.3f}' + \ + f'\nPearson\'s ' + r'$r$'+f' = {pearson_r:.3f}' + f'\nSpearman\'s ' + \ + fr'$\rho$ = {spearman_rho:.3f}' + '\n' + fr'($N$ = {len(y_test_total)})' + ax.scatter( + y_test_total, y_predicted_total, + marker='o', s=20, linewidths=0.5, edgecolor='black', label=legend, alpha=0.8 + ) ax.plot([min(y_test_total) - 1, max(y_test_total) + 1], - [min(y_predicted_total) - 1, max(y_predicted_total) + 1], 'k', lw=2) - ax.legend([ - '$R^2$ = {}\nRMSE = {}\nNRMSE = {}\nPearson\'s $r$ = {}\nSpearman\'s '.format( - round(r_squared, 3), round(rmse, 3), round(nrmse, 3), round(pearson_r, 3)) - + r'$\rho$ = {}'.format(str(round(spearman_rho, 3))) - ]) + [min(y_predicted_total) - 1, max(y_predicted_total) + 1], 'k', lw=0.5) + ax.set_xlabel('Measured') ax.set_ylabel('Predicted') - plt.savefig('CV_performance/' + name + '_' + str(n_samples) + '-fold-CV.png', dpi=250) + ax.legend(prop={'size': 8}) + plt.savefig('CV_performance/' + f'{name}_{regressor}_{n_samples}' + '-fold-CV.png', dpi=250) plt.close('all') if train_on_all: # Train model hyperparameters based on all available data (training + test set) @@ -916,6 +922,20 @@ def save_model( # fit (only) on full learning set (FFT or noFFT is defined already above) regressor_.fit(x_train, y_train) + y_test_pred = [] # 2D prediction array output to 1D + for value in regressor_.predict(x_test): + y_test_pred.append(float(value)) + y_test_pred = np.array(y_test_pred) + + plot_y_true_vs_y_pred( + y_true=y_test, + y_pred=y_test_pred, + variants=test_variants, + label=True, + hybrid=False, + name=f'CV_performance/{name}_{regressor}_' + ) + file = open(os.path.join(path, 'Pickles/' + name), 'wb') pickle.dump(regressor_, file) file.close() @@ -1024,7 +1044,7 @@ def predictions_out( predictions, model, prediction_set, - path='./' + path: str = '' ): """ Writes predictions (of the new sequence space) to text file(s). @@ -1038,16 +1058,15 @@ def predictions_out( col_width = max(len(str(value)) for row in data for value in row) + 5 head = ['Name', 'Prediction'] - with open(path + 'Predictions_' + str(model) + '_' + str(prediction_set)[:-6] + '.txt', 'w') as f: + path_ = os.path.join(path, 'Predictions_' + str(model) + '_' + str(prediction_set)[:-6] + '.txt') + with open(path_, 'w') as f: f.write("".join(caption.ljust(col_width) for caption in head) + '\n') f.write(len(head)*col_width*'-' + '\n') for row in data: f.write("".join(str(value).ljust(col_width) for value in row) + '\n') - return () - -def plot( +def predict_and_plot( path, fasta_file, model, @@ -1099,7 +1118,6 @@ def plot( f"-v VS.fasta -s 10 and pay attention to capitalization of model " f"names (e.g. -m PONP930101 or -m ONEHOT." ) - y_pred = [] try: # predicting (again) with (re-)loaded model (that was trained on training or full data) @@ -1120,12 +1138,14 @@ def plot( r_squared, rmse, nrmse, pearson_r, spearman_rho = \ get_performances(y_test, y_pred) legend = '$R^2$ = {}\nRMSE = {}\nNRMSE = {}\nPearson\'s $r$ = {}\nSpearman\'s '.format( - round(r_squared, 3), round(rmse, 3), round(nrmse, 3), round(pearson_r, 3)) \ - + r'$\rho$ = {}'.format(round(spearman_rho, 3)) + round(r_squared, 3), round(rmse, 3), round(nrmse, 3), round(pearson_r, 3)) + \ + r'$\rho$ = {}'.format(round(spearman_rho, 3)) + \ + '\n' + r'($N$ = {})'.format(len(y_test)) x = np.linspace(min(y_pred) - 1, max(y_pred) + 1, 100) fig, ax = plt.subplots() ax.scatter(y_test, y_pred, label=legend, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.8) + ax.legend(prop={'size': 8}) ax.plot(x, x, color='black', linewidth=0.5) # plot diagonal line if label is not False: print('Adjusting variant labels for plotting can take some ' @@ -1191,3 +1211,55 @@ def plot( plt.ylabel('Predicted') plt.legend(prop={'size': 8}) plt.savefig(str(model) + '_' + str(fasta_file[:-6]) + '.png', dpi=500) + + +def plot_y_true_vs_y_pred( + y_true: np.ndarray, + y_pred: np.ndarray, + variants: np.ndarray, # just required for labeling + label=False, + hybrid=False, + name: str = '' +): + """ + Plots predicted versus true values using the hybrid model for prediction. + Function called by function predict_ps. + """ + figure, ax = plt.subplots() + if hybrid: + spearman_rho = stats.spearmanr(y_true, y_pred)[0] + ax.scatter(y_true, y_pred, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.7, + label=f'Spearman\'s' + fr'$\rho$ = {spearman_rho:.3f}') + file_name = name + 'DCA_Hybrid_Model_LS_TS_Performance.png' + else: + r_squared, rmse, nrmse, pearson_r, spearman_rho = get_performances( + y_true=list(y_true), y_pred=list(y_pred) + ) + ax.scatter( + y_true, y_pred, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.7, + label=r'$R^2$' + f' = {r_squared:.3f}' + f'\nRMSE = {rmse:.3f}' + f'\nNRMSE = {nrmse:.3f}' + + f'\nPearson\'s ' + r'$r$'+f' = {pearson_r:.3f}' + f'\nSpearman\'s ' + + fr'$\rho$ = {spearman_rho:.3f}' + '\n' + fr'($N$ = {len(y_true)})' + ) + file_name = name + 'ML_Model_LS_TS_Performance.png' + ax.legend(prop={'size': 8}) + ax.set_xlabel('Measured') + ax.set_ylabel('Predicted') + print('\nPlotting...') + if label: + print('Adjusting variant labels for plotting can take some ' + 'time (the limit for labeling is 150 data points)...') + if len(y_true) < 150: + texts = [ax.text(y_true[i], y_pred[i], txt, fontsize=4) + for i, txt in enumerate(variants)] + adjust_text( + texts, only_move={'points': 'y', 'text': 'y'}, force_points=0.5, lim=250) + else: + print("Terminating label process. Too many variants " + "(> 150) for plotting (labels would overlap).") + + # i = 1 + # while os.path.isfile(file_name): + # i += 1 # iterate until finding an unused file name + # file_name = f'DCA_Hybrid_Model_LS_TS_Performance({i}).png' + plt.savefig(file_name, dpi=500) \ No newline at end of file diff --git a/pypef/ml/run.py b/pypef/ml/run.py index d42844f..a772dfa 100644 --- a/pypef/ml/run.py +++ b/pypef/ml/run.py @@ -28,7 +28,7 @@ # importing own modules from pypef.ml.regression import ( read_models, formatted_output, performance_list, save_model, predict, - predictions_out, plot + predictions_out, predict_and_plot ) from pypef.utils.low_n_mutation_extrapolation import low_n, performance_mutation_extrapolation from pypef.utils.variant_data import absolute_path_cwd_file @@ -108,7 +108,7 @@ def run_pypef_pure_ml(arguments): elif arguments['--figure'] is not None and arguments['--model'] is not None: # plotting path = os.getcwd() - plot( + predict_and_plot( path=path, fasta_file=arguments['--figure'], model=arguments['--model'], diff --git a/pypef/utils/prediction_sets.py b/pypef/utils/prediction_sets.py index 5681e68..10c9f91 100644 --- a/pypef/utils/prediction_sets.py +++ b/pypef/utils/prediction_sets.py @@ -89,6 +89,8 @@ def make_recombinations_double(arr: tuple) -> list: ['A217N', 'R219S'], ['A217N', 'L249Y'], ['R219S', 'L249Y']] """ doubles = [] + arr_pos = [int(substitution[0][1:-1]) for substitution in arr] + arr_pos, arr = zip(*sorted(zip(arr_pos, arr), key=lambda x: x[0])) for i in tqdm(range(len(arr))): for j in range(len(arr)): if j > i: @@ -120,12 +122,16 @@ def make_recombinations_triple(arr: list): ['L215F', 'R219S', 'L249Y'], ['A217N', 'R219S', 'L249Y']] """ length = len(arr) + arr_pos = [int(substitution[0][1:-1]) for substitution in arr] + arr_pos, arr = zip(*sorted(zip(arr_pos, arr), key=lambda x: x[0])) triples = [] for i in tqdm(range(length)): for j in range(length): for k in range(length): if k > j > i: - if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] != (arr[k][0])[1:-1]: + if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] and \ + (arr[i][0])[1:-1] != (arr[k][0])[1:-1] and \ + (arr[j][0])[1:-1] != (arr[k][0])[1:-1]: triples.append([arr[i][0], arr[j][0], arr[k][0]]) if len(triples) >= 8E04: yield triples @@ -152,13 +158,20 @@ def make_recombinations_quadruple(arr): [['L215F', 'A217N', 'R219S', 'L249Y']] """ length = len(arr) + arr_pos = [int(substitution[0][1:-1]) for substitution in arr] + arr_pos, arr = zip(*sorted(zip(arr_pos, arr), key=lambda x: x[0])) quadruples = [] for i in tqdm(range(length)): for j in range(length): for k in range(length): for l in range(length): if l > k > j > i: - if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] != (arr[k][0])[1:-1] != (arr[l][0])[1:-1]: + if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] and \ + (arr[i][0])[1:-1] != (arr[k][0])[1:-1] and \ + (arr[i][0])[1:-1] != (arr[l][0])[1:-1] and \ + (arr[j][0])[1:-1] != (arr[k][0])[1:-1] and \ + (arr[j][0])[1:-1] != (arr[l][0])[1:-1] and \ + (arr[k][0])[1:-1] != (arr[l][0])[1:-1]: quadruples.append([arr[i][0], arr[j][0], arr[k][0], arr[l][0]]) if len(quadruples) >= 8E04: yield quadruples @@ -170,13 +183,16 @@ def make_recombinations_quintuple(arr): """ Make quintuple recombination variants. - :parameter arr: List of single substitutions in tuple, e.g., - (['L215F'], ['A217N'], ['R219S'], ['L249Y'], ['P252I]) + :parameter arr: List(s) of all available single substitution(s) + in tuple, e.g., + (['L215F'], ['A217N'], ['R219S'], ['L249Y'], ['P252I']) :returns quintuples: List of quintuple substitution lists, e.g., [['L215F', 'A217N', 'R219S', 'L249Y', 'P252I']] """ length = len(arr) + arr_pos = [int(substitution[0][1:-1]) for substitution in arr] + arr_pos, arr = zip(*sorted(zip(arr_pos, arr), key=lambda x: x[0])) quintuples = [] for i in tqdm(range(length)): for j in range(length): @@ -184,11 +200,16 @@ def make_recombinations_quintuple(arr): for l in range(length): for m in range(length): if m > l > k > j > i: - if (arr[i][0])[1:-1] \ - != (arr[j][0])[1:-1] \ - != (arr[k][0])[1:-1] \ - != (arr[l][0])[1:-1] \ - != (arr[m][0][1:-1]): + if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] and \ + (arr[i][0])[1:-1] != (arr[k][0])[1:-1] and \ + (arr[i][0])[1:-1] != (arr[l][0])[1:-1] and \ + (arr[i][0])[1:-1] != (arr[m][0])[1:-1] and \ + (arr[j][0])[1:-1] != (arr[k][0])[1:-1] and \ + (arr[j][0])[1:-1] != (arr[l][0])[1:-1] and \ + (arr[j][0])[1:-1] != (arr[m][0])[1:-1] and \ + (arr[k][0])[1:-1] != (arr[l][0])[1:-1] and \ + (arr[k][0])[1:-1] != (arr[m][0])[1:-1] and \ + (arr[l][0])[1:-1] != (arr[m][0])[1:-1]: quintuples.append([arr[i][0], arr[j][0], arr[k][0], arr[l][0], arr[m][0]]) if len(quintuples) >= 8E04: yield quintuples @@ -309,7 +330,7 @@ def make_combinations_double_all_diverse_and_all_positions(wt_seq, aminoacids): and pos_2[0] != pos_2[-1] \ and pos_1[1:-1] != pos_2[1:-1]: doubles.append(tuple([pos_1, pos_2])) # tuple needed for - if len(doubles) >= 8E04: # list(dict()): + if len(doubles) >= 8E04: # list(dict()): doubles = list(dict.fromkeys(doubles)) # removes duplicated list entries counter += len(doubles) yield doubles @@ -377,3 +398,6 @@ def make_combinations_quadruple_all_diverse(arr, aminoacids): quadruples = [] quadruples = list(dict.fromkeys(quadruples)) yield quadruples + +if __name__ == '__main__': + make_recombinations_quintuple((['A86V'], ['T91S'], ['M108Q'], ['A109E'], ['T111P'], ['A86S'], ['T91E'], ['M108L'], ['A109S'], ['T111G'], ['M108R'], ['T111N'], ['T91V'], ['M108T'], ['A109G'], ['T111F'], ['T91A'], ['A109M'], ['A86D'], ['T91R'], ['A109K'], ['T111D'], ['T91Q'], ['A109V'], ['T111S'], ['A86C'], ['T91L'], ['A109T'], ['M108S'], ['A109F'], ['T111L'], ['A86T'], ['A109Q'], ['M108A'], ['A109P'], ['T111Q'], ['A86N'], ['T91Y'], ['A109L'], ['T111A'], ['T91F'], ['A109Y'], ['A86I'], ['A109D'], ['M108K'], ['M108I'], ['T91N'], ['T111C'], ['T91M'], ['T91C'], ['M108P'], ['T111M'], ['T91H'], ['M108C'], ['M108F'], ['M108G'], ['A109N'], ['M108E'], ['A109W'], ['M108W'], ['A109I'], ['T91P'], ['M108H'], ['T91D'], ['A109R'], ['T91I'], ['M108Y'], ['T91G'], ['T91W'], ['A86R'], ['T91K'], ['T111Y'], ['M108D'], ['A86W'], ['M108V'], ['T111I'], ['M108N'], ['A109C'], ['A109H'])) \ No newline at end of file diff --git a/pypef/utils/run.py b/pypef/utils/run.py index 5870940..7a791b3 100644 --- a/pypef/utils/run.py +++ b/pypef/utils/run.py @@ -175,7 +175,8 @@ def run_pypef_utils(arguments): if arguments['hybrid'] or arguments['--encoding'] == 'dca': dca_encoder = DCAEncoding( params_file=arguments['--params'], - separator=arguments['--sep'] + separator=arguments['--sep'], + verbose=False ) if arguments['ml']: ml_or_hybrid = 'ml' @@ -278,13 +279,13 @@ def run_pypef_utils(arguments): threads = abs(arguments['--threads']) if arguments['--threads'] is not None else 1 threads = threads + 1 if threads == 0 else threads print(f'Using {threads} thread(s) for running...') - if threads > 1: - ray.init() dca_encode = DCAEncoding( params_file=arguments['--params'], - separator=arguments['--mutation_sep'] + separator=arguments['--mutation_sep'], + verbose=False ) if threads > 1: + ray.init() variants, encoded_sequences, fitnesses = get_dca_data_parallel( variants=variants, fitnesses=fitnesses, diff --git a/setup.py b/setup.py index e329464..6679f40 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='pypef', - version='0.2.0', + version='0.2.1', author='Niklas Siedhoff & Alexander-Maurice Illig', author_email='n.siedhoff@biotec.rwth-aachen.de', license='CC BY-NC-SA 4.0', @@ -28,13 +28,11 @@ package_data={'pypef': ['ml/AAindex/*', 'ml/AAindex/Refined_cluster_indices_r0.93_r0.97/*']}, include_package_data=True, install_requires=[requirements], - python_requires='>=3.7', + python_requires='>=3.9', keywords='Pythonic Protein Engineering Framework', classifiers=[ 'Development Status :: 3 - Alpha', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Topic :: Scientific/Engineering :: Bio-Informatics',