Skip to content

Commit

Permalink
v0.3.2
Browse files Browse the repository at this point in the history
  • Loading branch information
niklases committed Aug 17, 2023
1 parent fcfd8db commit ec8f077
Show file tree
Hide file tree
Showing 9 changed files with 1,687 additions and 506 deletions.
38 changes: 38 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# This workflow installs the Python dependencies, lints with flake8 and runs the
# tests with pytest on several operating systems and Python versions.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: build

on: [push]

permissions:
  contents: read

jobs:
  build:

    # A list passed directly to `runs-on` is a set of labels that ONE runner must
    # carry all at once; no hosted runner is Ubuntu, Windows and macOS at the same
    # time. The operating systems therefore belong in the matrix, and each matrix
    # job selects its own runner here.
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        python-version: ["3.9", "3.10", "3.11"]

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      # The `if [ -f ... ]` test below is POSIX shell syntax; Windows runners
      # default to PowerShell, so bash is requested explicitly.
      shell: bash
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Preprint available at bioRxiv: https://doi.org/10.1101/2022.06.07.495081.
# PyPEF: Pythonic Protein Engineering Framework
[![PyPI version](https://img.shields.io/pypi/v/PyPEF?color=blue)](https://pypi.org/project/pypef/)
[![Python version](https://img.shields.io/pypi/pyversions/PyPEF)](https://www.python.org/downloads/)
![Build](https://github.com/Protein-Engineering-Framework/PyPEF/actions/workflows/build.yml/badge.svg)

A framework written in Python 3 for performing sequence-based machine learning-assisted protein engineering to predict a protein's fitness from its sequence using different forms of sequence encoding:
- One-hot encoding
Expand Down
2 changes: 1 addition & 1 deletion pypef/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
# §Equal contribution


__version__ = '0.3.1-alpha'
__version__ = '0.3.2-alpha'
61 changes: 35 additions & 26 deletions pypef/dca/gremlin_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ def __init__(
self.eff_cutoff = eff_cutoff
self.opt_iter = opt_iter
self.states = len(self.char_alphabet)
self.a2n = self.a2n_dict()
self.seqs, _, _ = get_sequences_from_file(alignment)
self.msa_ori = self.get_msa_ori()
self.n_col_ori = self.msa_ori.shape[1]
Expand All @@ -97,7 +96,7 @@ def __init__(
self.n_eff = np.sum(self.msa_weights)
self.n_row = self.msa_trimmed.shape[0]
self.n_col = self.msa_trimmed.shape[1]
self.v_ini, self.w_ini = self.initialize_v_w(remove_gap_entries=False)
self.v_ini, self.w_ini, self.aa_counts = self.initialize_v_w(remove_gap_entries=False)
self.optimize = optimize
if self.optimize:
self.v_opt, self.w_opt = self.run_opt_tf()
Expand All @@ -110,27 +109,30 @@ def a2n_dict(self):
return a2n

def aa2int(self, aa):
"""convert single aa into numerical integer value, e.g.
"""convert single aa into numerical integer value, e.g.:
"A" -> 0 or "-" to 21 dependent on char_alphabet"""
if aa in self.a2n:
return self.a2n[aa]
a2n = self.a2n_dict()
if aa in a2n:
return a2n[aa]
else: # for unknown characters insert Gap character
return self.a2n['-']
return a2n['-']

def str2int(self, x):
def seq2int(self, aa_seqs):
"""
convert a list of strings into list of integers
Example: ["ACD","EFG"] -> [[0,4,3], [6,13,7]]
convert a single sequence or a list of sequences into a list of integer sequences, e.g.:
["ACD","EFG"] -> [[0,4,3], [6,13,7]]
"""
if type(x) == list:
x = np.array(x)
if x.dtype.type is np.str_:
if x.ndim == 0: # single seq
return np.array([self.aa2int(aa) for aa in str(x)])
if type(aa_seqs) == str:
aa_seqs = np.array(aa_seqs)
if type(aa_seqs) == list:
aa_seqs = np.array(aa_seqs)
if aa_seqs.dtype.type is np.str_:
if aa_seqs.ndim == 0: # single seq
return np.array([self.aa2int(aa) for aa in str(aa_seqs)])
else: # list of seqs
return np.array([[self.aa2int(aa) for aa in seq] for seq in x])
return np.array([[self.aa2int(aa) for aa in seq] for seq in aa_seqs])
else:
return x
return aa_seqs

@property
def get_v_idx_w_idx(self):
Expand All @@ -150,7 +152,7 @@ def filt_gaps(self, msa_ori):
non_gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] < self.gap_cutoff)[0]

gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] >= self.gap_cutoff)[0]
logger.info(f'Gap positions (removed from msa):\n{gaps}')
logger.info(f'Gap positions (removed from MSA; 0-indexed):\n{gaps}')
ncol_trimmed = len(non_gaps)
v_idx = non_gaps
w_idx = v_idx[np.stack(np.triu_indices(ncol_trimmed, 1), -1)]
Expand Down Expand Up @@ -362,17 +364,19 @@ def initialize_v_w(self, remove_gap_entries=True):
"""
w_ini = np.zeros((self.n_col, self.states, self.n_col, self.states))
onehot_cat_msa = np.eye(self.states)[self.msa_trimmed]
aa_counts = np.sum(onehot_cat_msa, axis=0)
pseudo_count = 0.01 * np.log(self.n_eff)
v_ini = np.log(np.sum(onehot_cat_msa.T * self.msa_weights, -1).T + pseudo_count)
v_ini = v_ini - np.mean(v_ini, -1, keepdims=True)
# loss_score_ini = self.objective(v_ini, w_ini, flattened=False) # * self.n_eff
# loss_score_ini = self.objective(v_ini, w_ini, flattened=False)

if remove_gap_entries:
no_gap_states = self.states - 1
v_ini = v_ini[:, :no_gap_states]
w_ini = w_ini[:, :no_gap_states, :, :no_gap_states]
aa_counts = aa_counts[:, :no_gap_states]

return v_ini, w_ini
return v_ini, w_ini, aa_counts

@property
def get_v_w_opt(self):
Expand All @@ -390,10 +394,10 @@ def get_score(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.0
if self.optimize:
v, w = self.v_opt, self.w_opt
else:
v, w = self.v_ini, self.w_ini
v, w, _ = self.initialize_v_w(remove_gap_entries=True)
if v_idx is None:
v_idx = self.v_idx
seqs_int = self.str2int(seqs)
seqs_int = self.seq2int(seqs)
# if length of sequence != length of model use only
# valid positions (v_idx) from the trimmed alignment
try:
Expand Down Expand Up @@ -439,7 +443,7 @@ def get_score(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.0
else:
return np.sum(h, axis=-1) - h_wt_seq

def get_wt_score(self, wt_seq=None, v=None, w=None):
def get_wt_score(self, wt_seq=None, v=None, w=None, encode=False):
if wt_seq is None:
wt_seq = self.wt_seq
if v is None or w is None:
Expand All @@ -448,7 +452,7 @@ def get_wt_score(self, wt_seq=None, v=None, w=None):
else:
v, w = self.v_ini, self.w_ini
wt_seq = np.array(wt_seq, dtype=str)
return self.get_score(wt_seq, v, w)
return self.get_score(wt_seq, v, w, encode=encode)

def collect_encoded_sequences(self, seqs, v=None, w=None, v_idx=None):
"""
Expand Down Expand Up @@ -541,17 +545,22 @@ def plot_correlation_matrix(self, matrix_type: str = 'apc', set_diag_zero=True):
else:
ax.imshow(matrix, cmap='Blues')
tick_pos = ax.get_xticks()
tick_pos = np.array([int(t) for t in tick_pos])
tick_pos[-1] = matrix.shape[0]
tick_pos[2:] -= 1
if tick_pos[2] > 1:
tick_pos[2:] -= 1
ax.set_xticks(tick_pos)
ax.set_yticks(tick_pos)
labels = [item.get_text() for item in ax.get_xticklabels()]
labels = [labels[0]] + [str(int(label) + 1) for label in labels[1:]]
try:
labels = [labels[0]] + [str(int(label) + 1) for label in labels[1:]]
except ValueError:
pass
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlim(-1, matrix.shape[0])
ax.set_ylim(-1, matrix.shape[0])
plt.title(matrix_type)
plt.title(matrix_type.upper())
plt.savefig(f'{matrix_type}.png', dpi=500)
plt.close('all')

Expand Down
15 changes: 15 additions & 0 deletions scripts/GREMLIN_numba/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
## GREMLIN in Python using numba

A GREMLIN_CPP-like port (L-BFGS- and CG-based MSA-DCA optimization) to Python using numba.


GREMLIN_CPP (https://github.com/sokrypton/GREMLIN_CPP) is licensed under the following license:

----------------------------------------------------------------------------

"THE BEER-WARE LICENSE" (Revision 42):
<[email protected]> wrote this file. As long as you retain this notice you
can do whatever you want with this stuff. If we meet some day, and you think
this stuff is worth it, you can buy me a beer in return. Sergey Ovchinnikov

----------------------------------------------------------------------------
Loading

0 comments on commit ec8f077

Please sign in to comment.