update v0.2.1

Protein-Engineering-Framework · Jul 9, 2022 · df7839d · df7839d
1 parent 0b0b909
commit df7839d
Show file tree

Hide file tree

Showing 10 changed files with 202 additions and 127 deletions.
diff --git a/pypef/__init__.py b/pypef/__init__.py
@@ -17,4 +17,4 @@
 # §Equal contribution
 
 
-VERSION = '0.2.0-alpha'
+VERSION = '0.2.1-alpha'
diff --git a/pypef/dca/encoding.py b/pypef/dca/encoding.py
@@ -285,14 +285,14 @@ def __read_plmc_v2(self, filename, precision):
             self.wt_aa_pos = []
             for aa, pos in zip(self._target_seq, self.index_list):
                 self.wt_aa_pos.append(str(aa) + str(pos))
+            print(f'Evaluating gap content of PLMC parameter file... '
+                  f'First amino acid position used in the MSA (PLMC params file) is '
+                  f'{self._target_seq[0]}{self.index_list[0]} and the last position '
+                  f'used is {self._target_seq[-1]}{self.index_list[-1]}.')
+            if len(not_valid) > 0:
+                print(f'\nFurther, non-included positions are:')
+                print(str(not_valid)[1:-1])
             if self.verbose:
-                print(f'Evaluating gap content of PLMC parameter file... '
-                      f'First amino acid position used in the MSA (PLMC params file) is '
-                      f'{self._target_seq[0]}{self.index_list[0]} and the last position '
-                      f'used is {self._target_seq[-1]}{self.index_list[-1]}.')
-                if len(not_valid) > 0:
-                    print(f'\nFurther, non-included positions are:')
-                    print(str(not_valid)[1:-1])
                     print(f'\nSummary of all effective positions represented in the MSA '
                           f'based on wild-type sequence ({len(valid)} encoded positions):')
                     for aa_pos in self.wt_aa_pos:

diff --git a/pypef/dca/hybrid_model.py b/pypef/dca/hybrid_model.py
@@ -35,7 +35,7 @@
 import matplotlib.pyplot as plt
 from pypef.utils.variant_data import get_sequences_from_file, remove_nan_encoded_positions
 from pypef.dca.encoding import DCAEncoding, get_dca_data_parallel, get_encoded_sequence, ActiveSiteError
-from pypef.ml.regression import predictions_out
+from pypef.ml.regression import predictions_out, plot_y_true_vs_y_pred
 
 np.warnings.filterwarnings('error', category=np.VisibleDeprecationWarning)
 
@@ -570,7 +570,7 @@ def run(
 """
 
 
-def get_model_and_save_pkl(
+def generate_model_and_save_pkl(
         xs: list,
         ys: list,
         dca_encoder: DCAEncoding,
@@ -650,51 +650,17 @@ def get_model_and_save_pkl(
         pass
     print(f'Save model as Pickle file... HYBRIDMODEL')
     pickle.dump(
-        {'hybrid_model': hybrid_model, 'beta_1': beta_1, 'beta_2': beta_2,
-         'spearman_rho': test_spearman_r, 'regressor': reg},
+        {
+            'hybrid_model': hybrid_model,
+            'beta_1': beta_1,
+            'beta_2': beta_2,
+            'spearman_rho': test_spearman_r,
+            'regressor': reg
+        },
         open('Pickles/HYBRIDMODEL', 'wb')
     )
 
 
-def plot_y_true_vs_y_pred(
-        y_true: np.ndarray,
-        y_pred: np.ndarray,
-        variants: np.ndarray,  # just required for labeling
-        label=False
-):
-    """
-    Plots predicted versus true values using the hybrid model for prediction.
-    Function called by function predict_ps.
-    """
-    spearman_rho = spearmanr(y_true, y_pred)[0]
-
-    figure, ax = plt.subplots()
-    ax.scatter(y_true, y_pred, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.7,
-               label=fr'$\rho$ = {spearman_rho:.3f}')
-    ax.legend()
-    ax.set_xlabel(r'$y_\mathrm{true}$' + fr' ($N$ = {len(y_true)})')
-    ax.set_ylabel(r'$y_\mathrm{pred}$' + fr' ($N$ = {len(y_pred)})')
-    print('\nPlotting...')
-    if label:
-        from adjustText import adjust_text
-        print('Adjusting variant labels for plotting can take some '
-              'time (the limit for labeling is 150 data points)...')
-        if len(y_true) < 150:
-            texts = [ax.text(y_true[i], y_pred[i], txt, fontsize=4)
-                     for i, txt in enumerate(variants)]
-            adjust_text(
-                texts, only_move={'points': 'y', 'text': 'y'}, force_points=0.5, lim=250)
-        else:
-            print("Terminating label process. Too many variants "
-                  "(> 150) for plotting (labels would overlap).")
-    file_name = 'DCA_Hybrid_Model_LS_TS_Performance.png'
-    # i = 1
-    # while os.path.isfile(file_name):
-    #     i += 1  # iterate until finding an unused file name
-    #     file_name = f'DCA_Hybrid_Model_LS_TS_Performance({i}).png'
-    plt.savefig(file_name, dpi=500)
-
-
 def performance_ls_ts(
         ls_fasta: str,
         ts_fasta: str,
@@ -799,13 +765,18 @@ def performance_ls_ts(
         pass
     print(f'Save model as Pickle file... HYBRIDMODEL')
     pickle.dump(
-        {'hybrid_model': hybrid_model, 'beta_1': beta_1, 'beta_2': beta_2,
-         'spearman_rho': test_spearman_r, 'regressor': reg},
+        {
+            'hybrid_model': hybrid_model,
+            'beta_1': beta_1,
+            'beta_2': beta_2,
+            'spearman_rho': test_spearman_r,
+            'regressor': reg
+        },
         open('Pickles/HYBRIDMODEL', 'wb')
     )
 
 
-def predict_ps(  # "predict pmult"
+def predict_ps(  # also predicting "pmult" dirs
         prediction_dict: dict,
         params_file: str,
         threads: int,
@@ -974,7 +945,14 @@ def predict_ps(  # "predict pmult"
         print('Testing performance...')
         print(f'Spearman\'s rho = {spearmanr(y_true, y_pred)[0]:.3f}')
         if figure is not None:
-            plot_y_true_vs_y_pred(np.array(y_true), np.array(y_pred), np.array(variants), label)
+            plot_y_true_vs_y_pred(
+                np.array(y_true), np.array(y_pred), np.array(variants), label, hybrid=True
+            )
+    else:
+        raise SystemError(
+            'Define set(s) for prediction (e.g. \'-p PS.fasta\' or '
+            'created prediction set folder, e.g. \'--pmult --drecomb\')'
+        )
 
 
 def predict_directed_evolution(

diff --git a/pypef/dca/run.py b/pypef/dca/run.py
@@ -22,7 +22,7 @@
 
 from pypef.utils.variant_data import read_csv, remove_nan_encoded_positions
 from pypef.dca.encoding import DCAEncoding, get_dca_data_parallel
-from pypef.dca.hybrid_model import performance_ls_ts, predict_ps, get_model_and_save_pkl
+from pypef.dca.hybrid_model import performance_ls_ts, predict_ps, generate_model_and_save_pkl
 from pypef.utils.low_n_mutation_extrapolation import performance_mutation_extrapolation, low_n
 
 
@@ -77,7 +77,8 @@ def run_pypef_hybrid_modeling(arguments):
     if arguments['train_and_save']:
         dca_encode = DCAEncoding(
             params_file=arguments['--params'],
-            separator=arguments['--mutation_sep']
+            separator=arguments['--mutation_sep'],
+            verbose=False
         )
 
         variants, fitnesses, _ = read_csv(arguments['--input'])
@@ -95,7 +96,7 @@ def run_pypef_hybrid_modeling(arguments):
             encoded_sequences, fitnesses = remove_nan_encoded_positions(copy.copy(encoded_sequences_), fitnesses)
         assert len(encoded_sequences) == len(variants) == len(fitnesses)
 
-        get_model_and_save_pkl(
+        generate_model_and_save_pkl(
             xs=encoded_sequences,
             ys=fitnesses,
             dca_encoder=dca_encode,

diff --git a/pypef/main.py b/pypef/main.py
@@ -161,7 +161,8 @@
         [--params PLMC_FILE] [--threads THREADS] [--nofft] [--negative]
     pypef ml --encoding ENCODING_TECHNIQUE --model MODEL12345 --pmult
         [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
-        [--ddiverse] [--tdiverse] [--qdiverse] [--nofft] [--negative] [--params PLMC_FILE] [--threads THREADS]
+        [--ddiverse] [--tdiverse] [--qdiverse]
+        [--regressor TYPE] [--nofft] [--negative] [--params PLMC_FILE] [--threads THREADS]
     pypef ml --encoding ENCODING_TECHNIQUE directevo --model MODEL12345 --wt WT_SEQ
         [--input CSV_FILE] [--y_wt WT_FITNESS] [--numiter NUM_ITER] [--numtraj NUM_TRAJ] [--temp TEMPERATURE]
         [--nofft] [--negative] [--usecsv] [--csvaa] [--drop THRESHOLD] [--params PLMC_FILE]
Original file line number	Diff line number	Diff line change
Expand Up		@@ -17,4 +17,4 @@
		# §Equal contribution


		-VERSION = '0.2.0-alpha'
		+VERSION = '0.2.1-alpha'