Upload v0.3.4

Protein-Engineering-Framework · Jul 28, 2024 · 5862936 · 5862936
1 parent 4325bce
commit 5862936
Show file tree

Hide file tree

Showing 30 changed files with 1,045 additions and 436 deletions.
diff --git a/.github/imgs/ML_Model_Performance_DCA_GREMLIN.png b/.github/imgs/ML_Model_Performance_DCA_GREMLIN.png
diff --git a/.github/imgs/ML_Model_Performance_DCA_PLS.png b/.github/imgs/ML_Model_Performance_DCA_PLS.png
diff --git a/.github/imgs/multi_point_mut_performance_PGym_DCA.png b/.github/imgs/multi_point_mut_performance_PGym_DCA.png
diff --git a/.github/imgs/multi_point_mut_performance_PGym_Hybrid.png b/.github/imgs/multi_point_mut_performance_PGym_Hybrid.png
diff --git a/.github/imgs/runtimes.png b/.github/imgs/runtimes.png
diff --git a/.github/imgs/runtimes_2.png b/.github/imgs/runtimes_2.png
diff --git a/.github/imgs/single_point_mut_performance_PGyM_DCA.png b/.github/imgs/single_point_mut_performance_PGyM_DCA.png
diff --git a/.github/imgs/single_point_mut_performance_PGym_Hybrid.png b/.github/imgs/single_point_mut_performance_PGym_Hybrid.png
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -17,9 +17,9 @@ jobs:
         python-version: ["3.9", "3.10", "3.11"]
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Display Path and Python version

diff --git a/.gitignore b/.gitignore
diff --git a/README.md b/README.md
diff --git a/pypef/__init__.py b/pypef/__init__.py
@@ -17,4 +17,4 @@
 # §Equal contribution
 
 
-__version__ = '0.3.3-alpha'
+__version__ = '0.3.4'
diff --git a/pypef/dca/gremlin_inference.py b/pypef/dca/gremlin_inference.py
@@ -46,71 +46,142 @@
     https://doi.org/10.1103/PhysRevE.87.012707
 """
 
+from __future__ import annotations
+
 import logging
 logger = logging.getLogger('pypef.dca.params_inference')
 
-from os import mkdir
+from os import mkdir, PathLike, environ
 import pickle
 import numpy as np
 import matplotlib.pyplot as plt
+from Bio import AlignIO
 from scipy.spatial.distance import pdist, squareform
 from scipy.special import logsumexp
 from scipy.stats import boxcox
 import pandas as pd
 import tensorflow as tf
 tf.get_logger().setLevel('DEBUG')
-
-from pypef.utils.variant_data import get_sequences_from_file
+# Uncomment to hide GPU devices
+#environ['CUDA_VISIBLE_DEVICES'] = '-1'  
 
 
 class GREMLIN:
+    """
+    Alphabet char order in GREMLIN: "ARNDCQEGHILKMFPSTWYV-".
+    gap_cutoff = 0.5 and eff_cutoff = 0.8 is proposed by Hopf et al.;
+    here, by default all columns are used (gap_cutoff >= 1.0 and eff_cutoff > 1.0).
+    v_ini represent the weighted frequency of each amino acid at each position, i.e.,
+    np.log(np.sum(onehot_cat_msa.T * self.msa_weights, -1).T + pseudo_count).
+    """
     def __init__(
             self,
-            alignment: str,
+            alignment: str | PathLike,
             char_alphabet: str = "ARNDCQEGHILKMFPSTWYV-",
             wt_seq=None,
+            offset=0,
             optimize=True,
             gap_cutoff=0.5,
             eff_cutoff=0.8,
-            opt_iter=100
+            opt_iter=100,
+            max_msa_seqs: int | None = 10000,
+            seqs=None
     ):
+        """
+        Loads and integer-encodes the MSA, removes gappy columns, computes
+        the sequence weights, initializes the v and W terms and, if
+        requested, optimizes them.
+
+        alignment: str | PathLike
+            Path to the MSA file; only read if seqs is None.
+        char_alphabet: str
+            Alphabet; the position of a character defines its integer state.
+        wt_seq: str | None
+            Wild-type sequence. If None, the first MSA sequence is used.
+            Its length must match the number of MSA columns.
+        offset: int
+            Stored as self.offset (not used further in the constructor).
+        optimize: bool
+            If True, run_opt_tf() is run and v_opt / w_opt are set.
+        gap_cutoff: float
+            Columns with a gap fraction >= gap_cutoff are removed
+            (see filt_gaps).
+        eff_cutoff: float
+            Stored as self.eff_cutoff.
+        opt_iter: int
+            Stored as self.opt_iter.
+        max_msa_seqs: int | None
+            Maximum number of MSA sequences to use; None means
+            practically no limit (1E9).
+        seqs: sequence of str | None
+            Already loaded MSA sequences; if given, the alignment file is
+            not read and integer indices are used as sequence IDs.
+
+        Raises SystemError if the wild-type sequence length does not
+        match the number of MSA columns.
+        """
+        if not tf.config.list_physical_devices('GPU'):
+            logger.info('Using CPU for GREMLIN computations...')
+        else:
+            logger.info('Using GPU for GREMLIN computations...')
+
         self.char_alphabet = char_alphabet
+        self.allowed_chars = "ARNDCQEGHILKMFPSTWYV-"
+        self.allowed_chars += self.allowed_chars.lower()
+        self.offset = offset
         self.gap_cutoff = gap_cutoff
         self.eff_cutoff = eff_cutoff
         self.opt_iter = opt_iter
+        self.gaps_1_indexed = None
+        if max_msa_seqs is None:
+            self.max_msa_seqs = 1E9
+        else:
+            self.max_msa_seqs = max_msa_seqs
         self.states = len(self.char_alphabet)
-        self.seqs, _, _ = get_sequences_from_file(alignment)
+        logger.info('Loading MSA...')
+        if seqs is None:
+            self.seqs, self.seq_ids = self.get_sequences_from_msa(alignment)
+        else:
+            self.seqs = seqs
+            self.seq_ids = np.array([n for n in range(len(self.seqs))])
+        logger.info(f'Found {len(self.seqs)} sequences in the MSA...')
         self.msa_ori = self.get_msa_ori()
+        logger.info(f'MSA shape: {np.shape(self.msa_ori)}')
         self.n_col_ori = self.msa_ori.shape[1]
         if wt_seq is not None:
             self.wt_seq = wt_seq
         else:  # Taking the first sequence in the MSA as wild type sequence
-            logger.info("No wild-type sequence provided: The first sequence "
-                        "in the MSA is considered the wild-type sequence.")
             self.wt_seq = "".join([self.char_alphabet[i] for i in self.msa_ori[0]])
+            logger.info(f"No wild-type sequence provided: The first sequence "
+                        f"in the MSA is considered the wild-type sequence "
+                        f"(Length: {len(self.wt_seq)}):\n{self.wt_seq}\n")
         if len(self.wt_seq) != self.n_col_ori:
-            raise SystemError("Length of (provided) wild-type sequence does not match "
-                              "number of MSA columns, i.e., common MSA sequence length.")
+            raise SystemError(f"Length of (provided) wild-type sequence ({len(self.wt_seq)}) "
+                              f"does not match number of MSA columns ({self.n_col_ori}), "
+                              f"i.e., common MSA sequence length.")
+        logger.info('Filtering gaps...')
         self.msa_trimmed, self.v_idx, self.w_idx, self.w_rel_idx, self.gaps = self.filt_gaps(self.msa_ori)
+        logger.info('Getting effective sequence weights...')
         self.msa_weights = self.get_eff_msa_weights(self.msa_trimmed)
         self.n_eff = np.sum(self.msa_weights)
         self.n_row = self.msa_trimmed.shape[0]
         self.n_col = self.msa_trimmed.shape[1]
+        logger.info('Initializing v and W terms based on MSA frequencies...')
         self.v_ini, self.w_ini, self.aa_counts = self.initialize_v_w(remove_gap_entries=False)
+        self.aa_freqs = self.aa_counts / self.n_row
         self.optimize = optimize
         if self.optimize:
-            self.v_opt, self.w_opt = self.run_opt_tf()
+            # NOTE(review): run_opt_tf() appears to already drop the gap state
+            # before returning, so the '*_with_gaps' arrays may in fact be
+            # gap-free and the slicing below a no-op -- confirm.
+            self.v_opt_with_gaps, self.w_opt_with_gaps = self.run_opt_tf()
+            no_gap_states = self.states - 1
+            # No trailing comma here: it would wrap the array in a 1-tuple
+            self.v_opt = self.v_opt_with_gaps[:, :no_gap_states]
+            self.w_opt = self.w_opt_with_gaps[:, :no_gap_states, :, :no_gap_states]
         self.x_wt = self.collect_encoded_sequences(np.atleast_1d(self.wt_seq))
 
+    def get_sequences_from_msa(self, msa_file: str | PathLike):
+        """
+        Reads all records of a multiple sequence alignment (MSA) file
+        and returns their sequences and identifiers.
+
+        The file is parsed with Biopython's AlignIO using the "fasta"
+        format. No trimming or filtering is done here.
+
+        msa_file: str | PathLike
+            Path to MSA in FASTA or A2M format.
+
+        Returns a tuple (sequences, seq_ids) of two NumPy arrays holding
+        the aligned sequences and the record IDs as strings, both in
+        file order.
+        """
+        sequences = []
+        seq_ids = []
+        # Context manager so that the file handle is closed after parsing
+        with open(msa_file) as msa_handle:
+            alignment = AlignIO.read(msa_handle, "fasta")
+        for record in alignment:
+            sequences.append(str(record.seq))
+            seq_ids.append(str(record.id))
+        return np.array(sequences), np.array(seq_ids)
+
     def a2n_dict(self):
+        """
+        Convert alphabet to numerical integer values, i.e., map each
+        character of char_alphabet to its position in the alphabet.
+        E.g., for the default alphabet "ARNDCQEGHILKMFPSTWYV-":
+        {"A": 0, "R": 1, "N": 2, ...}
+        """
         a2n = {}
         for a, n in zip(self.char_alphabet, range(self.states)):
             a2n[a] = n
         return a2n
 
     def aa2int(self, aa):
-        """convert single aa into numerical integer value, e.g.:
-        "A" -> 0 or "-" to 21 dependent on char_alphabet"""
+        """
+        convert single aa into numerical integer value, e.g.:
+        "A" -> 0 or "-" -> 20 (for the default char_alphabet)
+        """
         a2n = self.a2n_dict()
         if aa in a2n:
             return a2n[aa]
@@ -119,7 +190,8 @@ def aa2int(self, aa):
 
     def seq2int(self, aa_seqs):
         """
-        convert a single sequence or a list of sequences into a list of integer sequences, e.g.:
+        Convert a single sequence or a list of sequences 
+        into a list of integer sequences, e.g.:
         ["ACD","EFG"] -> [[0,4,3], [6,13,7]]
         """
         if type(aa_seqs) == str:
@@ -139,28 +211,38 @@ def get_v_idx_w_idx(self):
         return self.v_idx, self.w_idx
 
     def get_msa_ori(self):
-        """converts list of sequences to msa"""
+        """
+        Converts the loaded sequences (self.seqs) to an integer-encoded MSA.
+
+        Each character is upper-cased and encoded with aa2int. At most
+        self.max_msa_seqs sequences are used; any further sequences are
+        ignored. No sequences are filtered out here.
+        NOTE(review): how unknown characters are encoded depends on
+        aa2int -- confirm there.
+
+        Returns the encoded MSA as NumPy array (one row per sequence).
+        """
         msa_ori = []
-        for seq in self.seqs:
-            msa_ori.append([self.aa2int(aa.upper()) for aa in seq])
+        for i, seq in enumerate(self.seqs):
+            if i >= self.max_msa_seqs:
+                logger.info(f'Reached max. number of MSA sequences ({self.max_msa_seqs})...')
+                break
+            msa_ori.append([self.aa2int(aa.upper()) for aa in seq])
         msa_ori = np.array(msa_ori)
         return msa_ori
 
     def filt_gaps(self, msa_ori):
-        """filters alignment to remove gappy positions"""
+        """
+        Filters alignment to remove gappy positions.
+
+        The gap state is the last alphabet state (self.states - 1).
+        Columns whose gap fraction is >= self.gap_cutoff are removed.
+        As a side effect, self.gaps_1_indexed is set to the removed
+        column positions (1-indexed).
+
+        Returns a tuple of
+        - the MSA restricted to the remaining columns,
+        - v_idx: indices of the remaining columns in the original MSA,
+        - w_idx: all pairs (i < j) of remaining columns, given as
+          column indices of the original MSA,
+        - w_rel_idx: the same pairs, given as column indices of the
+          trimmed MSA,
+        - gaps: indices of the removed columns (0-indexed).
+        """
         tmp = (msa_ori == self.states - 1).astype(float)
         non_gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] < self.gap_cutoff)[0]
-
         gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] >= self.gap_cutoff)[0]
-        logger.info(f'Gap positions (removed from MSA; 0-indexed):\n{gaps}')
+        self.gaps_1_indexed = [g+1 for g in gaps]
+        logger.info(f'Gap positions (removed from MSA; 1-indexed):\n{self.gaps_1_indexed}')
         ncol_trimmed = len(non_gaps)
+        logger.info(f'Positions remaining: {ncol_trimmed} of {np.shape(msa_ori)[1]} '
+                    f'({(ncol_trimmed / np.shape(msa_ori)[1]) * 100 :.2f}%)')
         v_idx = non_gaps
         w_idx = v_idx[np.stack(np.triu_indices(ncol_trimmed, 1), -1)]
         w_rel_idx = np.stack(np.triu_indices(ncol_trimmed, 1), -1)
         return msa_ori[:, non_gaps], v_idx, w_idx, w_rel_idx, gaps
 
     def get_eff_msa_weights(self, msa):
-        """compute effective weight for each sequence"""
+        """Compute effective weight for each sequence"""
         # pairwise identity
         pdistance_msa = pdist(msa, "hamming")
         msa_sm = 1.0 - squareform(pdistance_msa)
@@ -174,9 +256,11 @@ def l2(x):
         return np.sum(np.square(x))
 
     def objective(self, v, w=None, flattened=True):
-        """Same objective function as used in run_opt_tf below
+        """
+        Same objective function as used in run_opt_tf below
         but here only using numpy not TensorFlow functions.
-        Potentially helpful for implementing SciPy optimizers."""
+        Potentially helpful for implementing SciPy optimizers.
+        """
         if w is None:
             w = self.w_ini
         onehot_cat_msa = np.eye(self.states)[self.msa_trimmed]
@@ -352,9 +436,10 @@ def feed(feed_all=False):
             # save the v and w parameters of the MRF
             v_opt = sess.run(v)
             w_opt = sess.run(w)
-
         no_gap_states = self.states - 1
         return v_opt[:, :no_gap_states], w_opt[:, :no_gap_states, :, :no_gap_states]
+        #return v_opt, w_opt
+
 
     def initialize_v_w(self, remove_gap_entries=True):
         """
@@ -390,6 +475,9 @@ def get_v_w_opt(self):
             )
 
     def get_score(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.0, recompute_z=False):
+        """
+        Computes the GREMLIN score for a given sequence or list of sequences.
+        """
         if v is None or w is None:
             if self.optimize:
                 v, w = self.v_opt, self.w_opt

diff --git a/pypef/dca/hybrid_model.py b/pypef/dca/hybrid_model.py
@@ -33,6 +33,8 @@
 import numpy as np
 import sklearn.base
 from scipy.stats import spearmanr
+from sklearnex import patch_sklearn
+patch_sklearn(verbose=False)
 from sklearn.linear_model import Ridge
 from sklearn.model_selection import GridSearchCV, train_test_split
 from scipy.optimize import differential_evolution
@@ -50,24 +52,23 @@
 
 
 class DCAHybridModel:
-    alphas = np.logspace(-6, 6, 100)  # Grid for the parameter 'alpha'.
-    parameter_range = [(0, 1), (0, 1)]  # Parameter range of 'beta_1' and 'beta_2' with lb <= x <= ub
     # TODO: Implementation of other regression techniques (CVRegression models)
-
     def __init__(
             self,
-            alphas=alphas,
-            parameter_range=None,
             x_train: np.ndarray = None,
             y_train: np.ndarray = None,
             x_test: np.ndarray = None,  # not necessary for training
             y_test: np.ndarray = None,  # not necessary for training
-            x_wt=None
+            x_wt=None,
+            alphas=None,     # Ridge regression grid for the parameter 'alpha'
+            parameter_range=None,  # Parameter range of 'beta_1' and 'beta_2' with lower bound <= x <= upper bound
     ):
         if parameter_range is None:
-            parameter_range = parameter_range
-        self._alphas = alphas
-        self._parameter_range = parameter_range
+            parameter_range = [(0, 1), (0, 1)] 
+        if alphas is None:
+            alphas = np.logspace(-6, 6, 100)
+        self.alphas = alphas
+        self.parameter_range = parameter_range
         self.x_train = x_train
         self.y_train = y_train
         self.x_test = x_test
@@ -148,7 +149,7 @@ def _delta_e(
         of the variant and wild-type sequence.
 
         dE = E (variant) - E (wild-type)
-        with E = \sum_{i} h_i (o_i) + \sum_{i<j} J_{ij} (o_i, o_j)
+        with E = sum_{i} h_i (o_i) + sum_{i<j} J_{ij} (o_i, o_j)
 
         Parameters
         ----------
@@ -198,7 +199,7 @@ def ridge_predictor(
         Ridge object trained on 'x_train' and 'y_train' (cv=5)
         with optimized 'alpha'. 
         """
-        grid = GridSearchCV(Ridge(), {'alpha': self._alphas}, cv=5)
+        grid = GridSearchCV(Ridge(), {'alpha': self.alphas}, cv=5)
         grid.fit(x_train, y_train)
         return Ridge(**grid.best_params_).fit(x_train, y_train)
 
@@ -637,7 +638,8 @@ def save_model_to_dict_pickle(
     if model_type is None:
         model_type = 'MODEL'
 
-    logger.info(f'Save model as Pickle file... {model_type}')
+    pkl_path = os.path.abspath(f'Pickles/{model_type}')
+    logger.info(f'Saving model as Pickle file ({pkl_path})...')
     pickle.dump(
         {
             'model': model,
@@ -693,7 +695,7 @@ def plmc_or_gremlin_encoding(
             logger.info(f"Following positions are frequent gap positions in the MSA "
                         f"and cannot be considered for effective modeling, i.e., "
                         f"substitutions at these positions are removed as these would be "
-                        f"predicted as wild type:\n{[gap + 1 for gap in model.gaps]}.\n"
+                        f"predicted with wild-type fitness:\n{[gap + 1 for gap in model.gaps]}.\n"
                         f"Effective positions (N={len(model.v_idx)}) are:\n"
                         f"{[v_pos + 1 for v_pos in model.v_idx]}")
         xs, x_wt, variants, sequences, ys_true = gremlin_encoding(
@@ -1025,14 +1027,17 @@ def performance_ls_ts(
 
         save_model_to_dict_pickle(model, model_type, None, None, spearmanr(y_test, ys_pred)[0], None)
 
+        model_type = f'{model_type}_no_ML'
+
     else:
         raise SystemError('No Test Set given for performance estimation.')
 
     spearman_rho = spearmanr(y_test, ys_pred)
     logger.info(f'Spearman Rho = {spearman_rho[0]:.3f}')
 
     plot_y_true_vs_y_pred(
-        np.array(y_test), np.array(ys_pred), np.array(test_variants), label=label, hybrid=True
+        np.array(y_test), np.array(ys_pred), np.array(test_variants), 
+        label=label, hybrid=True, name=model_type
     )
 
 

diff --git a/pypef/dca/plmc_encoding.py b/pypef/dca/plmc_encoding.py
@@ -76,9 +76,9 @@ class InvalidVariantError(Exception):
 
     def __init__(self, variant: str):
+        """
+        Stores the offending variant and builds the error message
+        that is passed on to the base exception.
+
+        variant: str
+            Variant string that does not follow the required scheme.
+        """
         self.variant = variant
-        message = "The entered variant '%s' does not follow the required scheme " \
-                  "(integer enclosed by two one letter code representations of amino acids). " \
-                  "Check separator or variant." % self.variant
+        message = f"The entered variant '{self.variant}' does not follow the required scheme " \
+                  f"(integer enclosed by two one letter code representations of amino acids). " \
+                  f"Check separator or variant."
         self.message = message
         super().__init__(self.message)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -17,4 +17,4 @@
		# §Equal contribution


		__version__ = '0.3.3-alpha'
		__version__ = '0.3.4'