v0.3.2

Protein-Engineering-Framework · Aug 17, 2023 · ec8f077 · ec8f077
1 parent fcfd8db
commit ec8f077
Show file tree

Hide file tree

Showing 9 changed files with 1,687 additions and 506 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -0,0 +1,38 @@
+# This workflow will install Python dependencies, run tests and lint with multiple versions of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: build
+
+on: [push]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+
+    runs-on: [ubuntu-latest, windows-latest, macos-latest]
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11"]
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest
diff --git a/README.md b/README.md
@@ -37,6 +37,7 @@ Preprint available at bioRxiv: https://doi.org/10.1101/2022.06.07.495081.
 # PyPEF: Pythonic Protein Engineering Framework
 [![PyPI version](https://img.shields.io/pypi/v/PyPEF?color=blue)](https://pypi.org/project/pypef/)
 [![Python version](https://img.shields.io/pypi/pyversions/PyPEF)](https://www.python.org/downloads/)
+![Build](https://github.com/Protein-Engineering-Framework/PyPEF/actions/workflows/build.yml/badge.svg)
 
 a framework written in Python 3 for performing sequence-based machine learning-assisted protein engineering to predict a protein's fitness from its sequence using different forms of sequence encoding:
 - One-hot encoding

diff --git a/pypef/__init__.py b/pypef/__init__.py
@@ -17,4 +17,4 @@
 # §Equal contribution
 
 
-__version__ = '0.3.1-alpha'
+__version__ = '0.3.2-alpha'
diff --git a/pypef/dca/gremlin_inference.py b/pypef/dca/gremlin_inference.py
@@ -79,7 +79,6 @@ def __init__(
         self.eff_cutoff = eff_cutoff
         self.opt_iter = opt_iter
         self.states = len(self.char_alphabet)
-        self.a2n = self.a2n_dict()
         self.seqs, _, _ = get_sequences_from_file(alignment)
         self.msa_ori = self.get_msa_ori()
         self.n_col_ori = self.msa_ori.shape[1]
@@ -97,7 +96,7 @@ def __init__(
         self.n_eff = np.sum(self.msa_weights)
         self.n_row = self.msa_trimmed.shape[0]
         self.n_col = self.msa_trimmed.shape[1]
-        self.v_ini, self.w_ini = self.initialize_v_w(remove_gap_entries=False)
+        self.v_ini, self.w_ini, self.aa_counts = self.initialize_v_w(remove_gap_entries=False)
         self.optimize = optimize
         if self.optimize:
             self.v_opt, self.w_opt = self.run_opt_tf()
@@ -110,27 +109,30 @@ def a2n_dict(self):
         return a2n
 
     def aa2int(self, aa):
-        """convert single aa into numerical integer value, e.g.
+        """convert single aa into numerical integer value, e.g.:
         "A" -> 0 or "-" to 21 dependent on char_alphabet"""
-        if aa in self.a2n:
-            return self.a2n[aa]
+        a2n = self.a2n_dict()
+        if aa in a2n:
+            return a2n[aa]
         else:  # for unknown characters insert Gap character
-            return self.a2n['-']
+            return a2n['-']
 
-    def str2int(self, x):
+    def seq2int(self, aa_seqs):
         """
-        convert a list of strings into list of integers
-        Example: ["ACD","EFG"] -> [[0,4,3], [6,13,7]]
+        convert a single sequence or a list of sequences into a list of integer sequences, e.g.:
+        ["ACD","EFG"] -> [[0,4,3], [6,13,7]]
         """
-        if type(x) == list:
-            x = np.array(x)
-        if x.dtype.type is np.str_:
-            if x.ndim == 0:  # single seq
-                return np.array([self.aa2int(aa) for aa in str(x)])
+        if type(aa_seqs) == str:
+            aa_seqs = np.array(aa_seqs)
+        if type(aa_seqs) == list:
+            aa_seqs = np.array(aa_seqs)
+        if aa_seqs.dtype.type is np.str_:
+            if aa_seqs.ndim == 0:  # single seq
+                return np.array([self.aa2int(aa) for aa in str(aa_seqs)])
             else:  # list of seqs
-                return np.array([[self.aa2int(aa) for aa in seq] for seq in x])
+                return np.array([[self.aa2int(aa) for aa in seq] for seq in aa_seqs])
         else:
-            return x
+            return aa_seqs
 
     @property
     def get_v_idx_w_idx(self):
@@ -150,7 +152,7 @@ def filt_gaps(self, msa_ori):
         non_gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] < self.gap_cutoff)[0]
 
         gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] >= self.gap_cutoff)[0]
-        logger.info(f'Gap positions (removed from msa):\n{gaps}')
+        logger.info(f'Gap positions (removed from MSA; 0-indexed):\n{gaps}')
         ncol_trimmed = len(non_gaps)
         v_idx = non_gaps
         w_idx = v_idx[np.stack(np.triu_indices(ncol_trimmed, 1), -1)]
@@ -362,17 +364,19 @@ def initialize_v_w(self, remove_gap_entries=True):
         """
         w_ini = np.zeros((self.n_col, self.states, self.n_col, self.states))
         onehot_cat_msa = np.eye(self.states)[self.msa_trimmed]
+        aa_counts = np.sum(onehot_cat_msa, axis=0)
         pseudo_count = 0.01 * np.log(self.n_eff)
         v_ini = np.log(np.sum(onehot_cat_msa.T * self.msa_weights, -1).T + pseudo_count)
         v_ini = v_ini - np.mean(v_ini, -1, keepdims=True)
-        # loss_score_ini = self.objective(v_ini, w_ini, flattened=False)  # * self.n_eff
+        # loss_score_ini = self.objective(v_ini, w_ini, flattened=False)
 
         if remove_gap_entries:
             no_gap_states = self.states - 1
             v_ini = v_ini[:, :no_gap_states]
             w_ini = w_ini[:, :no_gap_states, :, :no_gap_states]
+            aa_counts = aa_counts[:, :no_gap_states]
 
-        return v_ini, w_ini
+        return v_ini, w_ini, aa_counts
 
     @property
     def get_v_w_opt(self):
@@ -390,10 +394,10 @@ def get_score(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.0
             if self.optimize:
                 v, w = self.v_opt, self.w_opt
             else:
-                v, w = self.v_ini, self.w_ini
+                v, w, _ = self.initialize_v_w(remove_gap_entries=True)
         if v_idx is None:
             v_idx = self.v_idx
-        seqs_int = self.str2int(seqs)
+        seqs_int = self.seq2int(seqs)
         # if length of sequence != length of model use only
         # valid positions (v_idx) from the trimmed alignment
         try:
@@ -439,7 +443,7 @@ def get_score(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.0
         else:
             return np.sum(h, axis=-1) - h_wt_seq
 
-    def get_wt_score(self, wt_seq=None, v=None, w=None):
+    def get_wt_score(self, wt_seq=None, v=None, w=None, encode=False):
         if wt_seq is None:
             wt_seq = self.wt_seq
         if v is None or w is None:
@@ -448,7 +452,7 @@ def get_wt_score(self, wt_seq=None, v=None, w=None):
             else:
                 v, w = self.v_ini, self.w_ini
         wt_seq = np.array(wt_seq, dtype=str)
-        return self.get_score(wt_seq, v, w)
+        return self.get_score(wt_seq, v, w, encode=encode)
 
     def collect_encoded_sequences(self, seqs, v=None, w=None, v_idx=None):
         """
@@ -541,17 +545,22 @@ def plot_correlation_matrix(self, matrix_type: str = 'apc', set_diag_zero=True):
         else:
             ax.imshow(matrix, cmap='Blues')
         tick_pos = ax.get_xticks()
+        tick_pos = np.array([int(t) for t in tick_pos])
         tick_pos[-1] = matrix.shape[0]
-        tick_pos[2:] -= 1
+        if tick_pos[2] > 1:
+            tick_pos[2:] -= 1
         ax.set_xticks(tick_pos)
         ax.set_yticks(tick_pos)
         labels = [item.get_text() for item in ax.get_xticklabels()]
-        labels = [labels[0]] + [str(int(label) + 1) for label in labels[1:]]
+        try:
+            labels = [labels[0]] + [str(int(label) + 1) for label in labels[1:]]
+        except ValueError:
+            pass
         ax.set_xticklabels(labels)
         ax.set_yticklabels(labels)
         ax.set_xlim(-1, matrix.shape[0])
         ax.set_ylim(-1, matrix.shape[0])
-        plt.title(matrix_type)
+        plt.title(matrix_type.upper())
         plt.savefig(f'{matrix_type}.png', dpi=500)
         plt.close('all')
 

diff --git a/scripts/GREMLIN_numba/README.md b/scripts/GREMLIN_numba/README.md
@@ -0,0 +1,15 @@
+## GREMLIN in Python using numba
+
+GREMLIN_CPP-like (L-BFGS and CG-based MSA-DCA optimization) port to Python using numba.
+
+
+GREMLIN_CPP (https://github.com/sokrypton/GREMLIN_CPP) is licensed under  
+
+----------------------------------------------------------------------------
+
+"THE BEER-WARE LICENSE" (Revision 42):
+<[email protected]> wrote this file.  As long as you retain this notice you
+can do whatever you want with this stuff. If we meet some day, and you think
+this stuff is worth it, you can buy me a beer in return.  Sergey Ovchinnikov
+
+----------------------------------------------------------------------------
Original file line number	Diff line number	Diff line change
Expand Up		@@ -17,4 +17,4 @@
		# §Equal contribution


		-__version__ = '0.3.1-alpha'
		+__version__ = '0.3.2-alpha'