diff --git a/.gitignore b/.gitignore index e604e19..4f83ab9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ build/ dist/ pypef.egg-info/ -workflow/.ipynb_checkpoints/ \ No newline at end of file +workflow/.ipynb_checkpoints/ +.idea/ \ No newline at end of file diff --git a/pypef/__init__.py b/pypef/__init__.py index 7df4ce4..900a0f6 100644 --- a/pypef/__init__.py +++ b/pypef/__init__.py @@ -17,4 +17,4 @@ # §Equal contribution -VERSION = '0.2.2-alpha' +VERSION = '0.2.3-alpha' diff --git a/pypef/main.py b/pypef/main.py index 391086e..8d1dd7e 100644 --- a/pypef/main.py +++ b/pypef/main.py @@ -152,7 +152,7 @@ [--conc] pypef ml --encoding ENCODING_TECHNIQUE --ls LEARNING_SET --ts TEST_SET [--save NUMBER] [--regressor TYPE] [--nofft] [--all] [--params PLMC_FILE] - [--sort METRIC_INT] [--threads THREADS] [--color] + [--sort METRIC_INT] [--threads THREADS] pypef ml --show [MODELS] pypef ml --encoding ENCODING_TECHNIQUE --model MODEL12345 --figure TS_FOR_PLOTTING diff --git a/pypef/ml/regression.py b/pypef/ml/regression.py index 5065199..5499eab 100644 --- a/pypef/ml/regression.py +++ b/pypef/ml/regression.py @@ -828,30 +828,29 @@ def save_model( os.mkdir('Pickles') except FileExistsError: pass - cv_filename, name = '', '' train_sequences, train_variants, y_train = get_sequences_from_file(training_set) test_sequences, test_variants, y_test = get_sequences_from_file(test_set) for i, t in enumerate(range(threshold)): try: idx = performance_list[t][0] parameter = performance_list[t][7] - if i == 0: - if encoding == 'onehot' or encoding == 'dca': - name = idx - else: - name = get_basename(idx) - cv_filename = os.path.join('CV_performance', f'{name}_{regressor}_CV_Results.txt') - try: - os.remove(cv_filename) - except FileNotFoundError: - pass - file = open(cv_filename, 'w') - file.write('5-fold cross-validated performance of top ' - 'models for validation set across all data.\n\n') - if no_fft: - file.write("No FFT used in this model construction, performance 
represents" - " model accuracies on raw encoded sequence data.\n\n") - file.close() + + if encoding == 'onehot' or encoding == 'dca': + name = idx + else: + name = get_basename(idx) + cv_filename = os.path.join('CV_performance', f'{name}_{regressor.upper()}_CV_Results.txt') + try: + os.remove(cv_filename) + except FileNotFoundError: + pass + file = open(cv_filename, 'w') + file.write('5-fold cross-validated performance of top ' + 'models for validation set across all data.\n\n') + if no_fft: + file.write("No FFT used in this model construction, performance represents" + " model accuracies on raw encoded sequence data.\n\n") + file.close() # Estimating the CV performance of the n_component-fitted model on all data if encoding == 'aaidx': # AAindex encoding technique @@ -877,9 +876,11 @@ def save_model( x_train_ = dca_encoder.collect_encoded_sequences(train_variants) x_test_ = dca_encoder.collect_encoded_sequences(test_variants) x_train, y_train = remove_nan_encoded_positions(copy.copy(x_train_), y_train) - x_train, train_variants = remove_nan_encoded_positions(copy.copy(x_train_), y_train) + x_train, train_variants = remove_nan_encoded_positions(copy.copy(x_train_), train_variants) x_test, y_test = remove_nan_encoded_positions(copy.copy(x_test_), y_test) - x_test, test_variants = remove_nan_encoded_positions(copy.copy(x_test_), y_test) + x_test, test_variants = remove_nan_encoded_positions(copy.copy(x_test_), test_variants) + assert len(x_train) == len(y_train) == len(train_variants) + assert len(x_test) == len(y_test) == len(test_variants) x = np.concatenate([x_train, x_test]) y = np.concatenate([y_train, y_test]) @@ -912,7 +913,7 @@ def save_model( ax.set_xlabel('Measured') ax.set_ylabel('Predicted') ax.legend(prop={'size': 8}) - plt.savefig(os.path.join('CV_performance', f'{name}_{regressor}_{n_samples}-fold-CV.png'), dpi=300) + plt.savefig(os.path.join('CV_performance', f'{name}_{regressor.upper()}_{n_samples}-fold-CV.png'), dpi=300) plt.close('all') if 
train_on_all: # Train model hyperparameters based on all available data (training + test set) @@ -933,7 +934,7 @@ def save_model( variants=test_variants, label=True, hybrid=False, - name=os.path.join('CV_performance', f'{name}_{regressor}_') + name=f'{name}_{regressor.upper()}_' ) file = open(os.path.join(path, 'Pickles', name), 'wb') @@ -1141,7 +1142,7 @@ def predict_and_plot( round(r_squared, 3), round(rmse, 3), round(nrmse, 3), round(pearson_r, 3)) + \ r'$\rho$ = {}'.format(round(spearman_rho, 3)) + \ '\n' + r'($N$ = {})'.format(len(y_test)) - x = np.linspace(min(y_pred) - 1, max(y_pred) + 1, 100) + x = np.linspace(min(y_pred), max(y_pred), 100) fig, ax = plt.subplots() ax.scatter(y_test, y_pred, label=legend, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.8) @@ -1194,7 +1195,7 @@ def predict_and_plot( plt.hlines(y=(y_wt + limit_y_pred) - (((y_wt + limit_y_pred) - (y_wt - limit_y_pred)) / 2), xmin=y_wt - limit_y_true, xmax=y_wt + limit_y_true, color='grey', linewidth=0.5) crossline = np.linspace(y_wt - limit_y_true, y_wt + limit_y_true) - plt.plot(crossline, crossline, color='black', linewidth=0.5) + plt.plot(crossline, crossline, color='black', linewidth=0.25) steps = float(abs(max(y_pred))) gradient = [] for x in np.linspace(0, steps, 100): @@ -1242,6 +1243,8 @@ def plot_y_true_vs_y_pred( fr'$\rho$ = {spearman_rho:.3f}' + '\n' + fr'($N$ = {len(y_true)})' ) file_name = name + 'ML_Model_LS_TS_Performance.png' + x = np.linspace(min(y_pred), max(y_pred), 100) + ax.plot(x, x, color='black', linewidth=0.25) # plot diagonal line ax.legend(prop={'size': 8}) ax.set_xlabel('Measured') ax.set_ylabel('Predicted') @@ -1262,4 +1265,4 @@ def plot_y_true_vs_y_pred( # while os.path.isfile(file_name): # i += 1 # iterate until finding an unused file name # file_name = f'DCA_Hybrid_Model_LS_TS_Performance({i}).png' - plt.savefig(file_name, dpi=500) \ No newline at end of file + plt.savefig(file_name, dpi=500) diff --git a/pypef/ml/run.py b/pypef/ml/run.py index 
a772dfa..90eedf5 100644 --- a/pypef/ml/run.py +++ b/pypef/ml/run.py @@ -92,6 +92,7 @@ def run_pypef_pure_ml(arguments): no_fft=arguments['--nofft'], minimum_r2=0.0 ) + save_model( path=path, performance_list=encoding_performance_list, diff --git a/pypef/utils/learning_test_sets.py b/pypef/utils/learning_test_sets.py index 8d786e9..ae09d7b 100644 --- a/pypef/utils/learning_test_sets.py +++ b/pypef/utils/learning_test_sets.py @@ -366,21 +366,28 @@ def make_sub_ls_ts_randomly( def make_fasta_ls_ts( filename, - wt, - sub, - val -): # sub = substitution, val = value + wt_seq, + substitutions, + fitness_values +): """ Creates learning and test sets (.fasta style files) - sub: list + filename: str + String for defining the filename for the learning and test set "fasta-like" files. + wt_seq: str + Wild-type sequence as string + substitutions: list List of substiutuions of a single variant of the format: - - Single substitution variant, e.g. variant A123C: [['A123C']] - - Higher variants, e.g. variant A123C/D234E/F345G: [['A123C'], ['D234E], ['F345G']] + - Single substitution variant, e.g. variant A123C: ['A123C'] + - Higher variants, e.g. variant A123C/D234E/F345G: ['A123C', 'D234E', 'F345G'] + --> Full substitutions list, e.g.: [['A123C'], ['A123C', 'D234E', 'F345G']] + fitness_values: list + List of ints/floats of the variant fitness values, e.g. 
for two variants: [1.4, 0.8] """ myfile = open(filename, 'w') - for i, var in enumerate(sub): # var are lists of (single or multiple) substitutions - temp = list(wt) + for i, var in enumerate(substitutions): # var are lists of (single or multiple) substitutions + temp = list(wt_seq) name = '' separation = 0 if var == ['WT']: @@ -397,28 +404,33 @@ def make_fasta_ls_ts( name += '/' + single_var separation += 1 print(f'>{name}', file=myfile) - print(f';{val[i]}', file=myfile) + print(f';{fitness_values[i]}', file=myfile) print(''.join(temp), file=myfile) # print(name+';'+str(val[i])+';'+''.join(temp), file=myfile) # output: CSV format myfile.close() def get_seqs_from_var_name( - wt, - sub, - val + wt_seq, + substitutions, + fitness_values ) -> tuple[list, list, list]: """ Similar to function above but just returns sequences - variant: list + wt_seq: str + Wild-type sequence as string + substitutions: list List of substiutuions of a single variant of the format: - - Single substitution variant, e.g. variant A123C: [['A123C']] - - Higher variants, e.g. variant A123C/D234E/F345G: [['A123C'], ['D234E], ['F345G']] + - Single substitution variant, e.g. variant A123C: ['A123C'] + - Higher variants, e.g. variant A123C/D234E/F345G: ['A123C', 'D234E', 'F345G'] + --> Full substitutions list, e.g.: [['A123C'], ['A123C', 'D234E', 'F345G']] + fitness_values: list + List of ints/floats of the variant fitness values, e.g. 
for two variants: [1.4, 0.8] """ variant, values, sequences = [], [], [] - for i, var in enumerate(sub): # var are lists of (single or multiple) substitutions - temp = list(wt) + for i, var in enumerate(substitutions): # var are lists of (single or multiple) substitutions + temp = list(wt_seq) name = '' separation = 0 if var == ['WT']: @@ -435,7 +447,7 @@ def get_seqs_from_var_name( name += '/' + single_var separation += 1 variant.append(name) - values.append(val[i]) + values.append(fitness_values[i]) sequences.append(''.join(temp)) return variant, values, sequences diff --git a/requirements.txt b/requirements.txt index 5bfeb5a..8b220e8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,4 @@ adjustText scikit-learn biopython schema -ray[default] \ No newline at end of file +ray[default]<2.0.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 755eba6..d5a5a0d 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# for installation run me with: pip install . --use-feature=in-tree-build +# for installation run me with: pip install . from setuptools import setup, find_packages @@ -10,7 +10,7 @@ setup( name='pypef', - version='0.2.2', + version='0.2.3', author='Niklas Siedhoff & Alexander-Maurice Illig', author_email='n.siedhoff@biotec.rwth-aachen.de', license='CC BY-NC-SA 4.0', @@ -34,6 +34,7 @@ 'Development Status :: 3 - Alpha', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', 'Topic :: Scientific/Engineering :: Bio-Informatics', 'Topic :: Scientific/Engineering :: Artificial Intelligence' ],