v 0.2.3

Protein-Engineering-Framework · Aug 27, 2022 · a2cdf03 · a2cdf03
1 parent 9416890
commit a2cdf03
Show file tree

Hide file tree

Showing 8 changed files with 68 additions and 50 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 build/
 dist/
 pypef.egg-info/
-workflow/.ipynb_checkpoints/
+workflow/.ipynb_checkpoints/
+.idea/
diff --git a/pypef/__init__.py b/pypef/__init__.py
@@ -17,4 +17,4 @@
 # §Equal contribution
 
 
-VERSION = '0.2.2-alpha'
+VERSION = '0.2.3-alpha'
diff --git a/pypef/main.py b/pypef/main.py
@@ -152,7 +152,7 @@
         [--conc]
     pypef ml --encoding ENCODING_TECHNIQUE --ls LEARNING_SET --ts TEST_SET
         [--save NUMBER] [--regressor TYPE] [--nofft] [--all] [--params PLMC_FILE]
-        [--sort METRIC_INT] [--threads THREADS] [--color]
+        [--sort METRIC_INT] [--threads THREADS]
     pypef ml --show
         [MODELS]
     pypef ml --encoding ENCODING_TECHNIQUE --model MODEL12345 --figure TS_FOR_PLOTTING

diff --git a/pypef/ml/regression.py b/pypef/ml/regression.py
@@ -828,30 +828,29 @@ def save_model(
         os.mkdir('Pickles')
     except FileExistsError:
         pass
-    cv_filename, name = '', ''
     train_sequences, train_variants, y_train = get_sequences_from_file(training_set)
     test_sequences, test_variants, y_test = get_sequences_from_file(test_set)
     for i, t in enumerate(range(threshold)):
         try:
             idx = performance_list[t][0]
             parameter = performance_list[t][7]
-            if i == 0:
-                if encoding == 'onehot' or encoding == 'dca':
-                    name = idx
-                else:
-                    name = get_basename(idx)
-                cv_filename = os.path.join('CV_performance', f'{name}_{regressor}_CV_Results.txt')
-                try:
-                    os.remove(cv_filename)
-                except FileNotFoundError:
-                    pass
-                file = open(cv_filename, 'w')
-                file.write('5-fold cross-validated performance of top '
-                           'models for validation set across all data.\n\n')
-                if no_fft:
-                    file.write("No FFT used in this model construction, performance represents"
-                               " model accuracies on raw encoded sequence data.\n\n")
-                file.close()
+
+            if encoding == 'onehot' or encoding == 'dca':
+                name = idx
+            else:
+                name = get_basename(idx)
+            cv_filename = os.path.join('CV_performance', f'{name}_{regressor.upper()}_CV_Results.txt')
+            try:
+                os.remove(cv_filename)
+            except FileNotFoundError:
+                pass
+            file = open(cv_filename, 'w')
+            file.write('5-fold cross-validated performance of top '
+                       'models for validation set across all data.\n\n')
+            if no_fft:
+                file.write("No FFT used in this model construction, performance represents"
+                           " model accuracies on raw encoded sequence data.\n\n")
+            file.close()
 
             # Estimating the CV performance of the n_component-fitted model on all data
             if encoding == 'aaidx':  # AAindex encoding technique
@@ -877,9 +876,11 @@ def save_model(
                     x_train_ = dca_encoder.collect_encoded_sequences(train_variants)
                     x_test_ = dca_encoder.collect_encoded_sequences(test_variants)
                     x_train, y_train = remove_nan_encoded_positions(copy.copy(x_train_), y_train)
-                    x_train, train_variants = remove_nan_encoded_positions(copy.copy(x_train_), y_train)
+                    x_train, train_variants = remove_nan_encoded_positions(copy.copy(x_train_), train_variants)
                     x_test, y_test = remove_nan_encoded_positions(copy.copy(x_test_), y_test)
-                    x_test, test_variants = remove_nan_encoded_positions(copy.copy(x_test_), y_test)
+                    x_test, test_variants = remove_nan_encoded_positions(copy.copy(x_test_), test_variants)
+                    assert len(x_train) == len(y_train) == len(train_variants)
+                    assert len(x_test) == len(y_test) == len(test_variants)
 
             x = np.concatenate([x_train, x_test])
             y = np.concatenate([y_train, y_test])
@@ -912,7 +913,7 @@ def save_model(
             ax.set_xlabel('Measured')
             ax.set_ylabel('Predicted')
             ax.legend(prop={'size': 8})
-            plt.savefig(os.path.join('CV_performance', f'{name}_{regressor}_{n_samples}-fold-CV.png'), dpi=300)
+            plt.savefig(os.path.join('CV_performance', f'{name}_{regressor.upper()}_{n_samples}-fold-CV.png'), dpi=300)
             plt.close('all')
 
             if train_on_all:  # Train model hyperparameters based on all available data (training + test set)
@@ -933,7 +934,7 @@ def save_model(
                 variants=test_variants,
                 label=True,
                 hybrid=False,
-                name=os.path.join('CV_performance', f'{name}_{regressor}_')
+                name=f'{name}_{regressor.upper()}_'
             )
 
             file = open(os.path.join(path, 'Pickles', name), 'wb')
@@ -1141,7 +1142,7 @@ def predict_and_plot(
         round(r_squared, 3), round(rmse, 3), round(nrmse, 3), round(pearson_r, 3)) + \
              r'$\rho$ = {}'.format(round(spearman_rho, 3)) + \
              '\n' + r'($N$ = {})'.format(len(y_test))
-    x = np.linspace(min(y_pred) - 1, max(y_pred) + 1, 100)
+    x = np.linspace(min(y_pred), max(y_pred), 100)
     fig, ax = plt.subplots()
     ax.scatter(y_test, y_pred,
                label=legend, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.8)
@@ -1194,7 +1195,7 @@ def predict_and_plot(
         plt.hlines(y=(y_wt + limit_y_pred) - (((y_wt + limit_y_pred) - (y_wt - limit_y_pred)) / 2),
                    xmin=y_wt - limit_y_true, xmax=y_wt + limit_y_true, color='grey', linewidth=0.5)
         crossline = np.linspace(y_wt - limit_y_true, y_wt + limit_y_true)
-        plt.plot(crossline, crossline, color='black', linewidth=0.5)
+        plt.plot(crossline, crossline, color='black', linewidth=0.25)
         steps = float(abs(max(y_pred)))
         gradient = []
         for x in np.linspace(0, steps, 100):
@@ -1242,6 +1243,8 @@ def plot_y_true_vs_y_pred(
                   fr'$\rho$ = {spearman_rho:.3f}' + '\n' + fr'($N$ = {len(y_true)})'
         )
         file_name = name + 'ML_Model_LS_TS_Performance.png'
+    x = np.linspace(min(y_pred), max(y_pred), 100)
+    ax.plot(x, x, color='black', linewidth=0.25)  # plot diagonal line
     ax.legend(prop={'size': 8})
     ax.set_xlabel('Measured')
     ax.set_ylabel('Predicted')
@@ -1262,4 +1265,4 @@ def plot_y_true_vs_y_pred(
     # while os.path.isfile(file_name):
     #     i += 1  # iterate until finding an unused file name
     #     file_name = f'DCA_Hybrid_Model_LS_TS_Performance({i}).png'
-    plt.savefig(file_name, dpi=500)
+    plt.savefig(file_name, dpi=500)
diff --git a/pypef/ml/run.py b/pypef/ml/run.py
@@ -92,6 +92,7 @@ def run_pypef_pure_ml(arguments):
                     no_fft=arguments['--nofft'],
                     minimum_r2=0.0
                 )
+
                 save_model(
                     path=path,
                     performance_list=encoding_performance_list,

diff --git a/pypef/utils/learning_test_sets.py b/pypef/utils/learning_test_sets.py
@@ -366,21 +366,28 @@ def make_sub_ls_ts_randomly(
 
 def make_fasta_ls_ts(
         filename,
-        wt,
-        sub,
-        val
-):  # sub = substitution, val = value
+        wt_seq,
+        substitutions,
+        fitness_values
+):
     """
     Creates learning and test sets (.fasta style files)
 
-    sub: list
+    filename: str
+        String for defining the filename for the learning and test set "fasta-like" files.
+    wt_seq: str
+        Wild-type sequence as string
+    substitutions: list
         List of substitutions of a single variant of the format:
-            - Single substitution variant, e.g. variant A123C: [['A123C']]
-            - Higher variants, e.g. variant A123C/D234E/F345G: [['A123C'], ['D234E], ['F345G']]
+            - Single substitution variant, e.g. variant A123C: ['A123C']
+            - Higher variants, e.g. variant A123C/D234E/F345G: ['A123C', 'D234E', 'F345G']
+            --> Full substitutions list, e.g.: [['A123C'], ['A123C', 'D234E', 'F345G']]
+    fitness_values: list
+        List of ints/floats of the variant fitness values, e.g. for two variants: [1.4, 0.8]
     """
     myfile = open(filename, 'w')
-    for i, var in enumerate(sub):  # var are lists of (single or multiple) substitutions
-        temp = list(wt)
+    for i, var in enumerate(substitutions):  # var are lists of (single or multiple) substitutions
+        temp = list(wt_seq)
         name = ''
         separation = 0
         if var == ['WT']:
@@ -397,28 +404,33 @@ def make_fasta_ls_ts(
                     name += '/' + single_var
                 separation += 1
         print(f'>{name}', file=myfile)
-        print(f';{val[i]}', file=myfile)
+        print(f';{fitness_values[i]}', file=myfile)
         print(''.join(temp), file=myfile)
         # print(name+';'+str(val[i])+';'+''.join(temp), file=myfile)  # output: CSV format
     myfile.close()
 
 
 def get_seqs_from_var_name(
-        wt,
-        sub,
-        val
+        wt_seq,
+        substitutions,
+        fitness_values
 ) -> tuple[list, list, list]:
     """
     Similar to function above but just returns sequences
 
-    variant: list
+    wt_seq: str
+        Wild-type sequence as string
+    substitutions: list
         List of substitutions of a single variant of the format:
-            - Single substitution variant, e.g. variant A123C: [['A123C']]
-            - Higher variants, e.g. variant A123C/D234E/F345G: [['A123C'], ['D234E], ['F345G']]
+            - Single substitution variant, e.g. variant A123C: ['A123C']
+            - Higher variants, e.g. variant A123C/D234E/F345G: ['A123C', 'D234E', 'F345G']
+            --> Full substitutions list, e.g.: [['A123C'], ['A123C', 'D234E', 'F345G']]
+    fitness_values: list
+        List of ints/floats of the variant fitness values, e.g. for two variants: [1.4, 0.8]
     """
     variant, values, sequences = [], [], []
-    for i, var in enumerate(sub):  # var are lists of (single or multiple) substitutions
-        temp = list(wt)
+    for i, var in enumerate(substitutions):  # var are lists of (single or multiple) substitutions
+        temp = list(wt_seq)
         name = ''
         separation = 0
         if var == ['WT']:
@@ -435,7 +447,7 @@ def get_seqs_from_var_name(
                     name += '/' + single_var
                 separation += 1
         variant.append(name)
-        values.append(val[i])
+        values.append(fitness_values[i])
         sequences.append(''.join(temp))
 
     return variant, values, sequences
diff --git a/requirements.txt b/requirements.txt
@@ -8,4 +8,4 @@ adjustText
 scikit-learn
 biopython
 schema
-ray[default]
+ray[default]<2.0.0
diff --git a/setup.py b/setup.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# for installation run me with: pip install . --use-feature=in-tree-build
+# for installation run me with: pip install .
 
 
 from setuptools import setup, find_packages
@@ -10,7 +10,7 @@
 
 setup(
     name='pypef',
-    version='0.2.2',
+    version='0.2.3',
     author='Niklas Siedhoff & Alexander-Maurice Illig',
     author_email='[email protected]',
     license='CC BY-NC-SA 4.0',
@@ -34,6 +34,7 @@
         'Development Status :: 3 - Alpha',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
         'Topic :: Scientific/Engineering :: Bio-Informatics',
         'Topic :: Scientific/Engineering :: Artificial Intelligence'
     ],
Original file line number	Diff line number	Diff line change
Expand Up		@@ -17,4 +17,4 @@
		# §Equal contribution


		VERSION = '0.2.2-alpha'
		VERSION = '0.2.3-alpha'