Skip to content

Commit

Permalink
Upload v0.3.4
Browse files Browse the repository at this point in the history
  • Loading branch information
niklases committed Jul 28, 2024
1 parent 4325bce commit 5862936
Show file tree
Hide file tree
Showing 30 changed files with 1,045 additions and 436 deletions.
Binary file added .github/imgs/ML_Model_Performance_DCA_GREMLIN.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added .github/imgs/ML_Model_Performance_DCA_PLS.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added .github/imgs/runtimes.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added .github/imgs/runtimes_2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ jobs:
python-version: ["3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Display Path and Python version
Expand Down
616 changes: 305 additions & 311 deletions .gitignore

Large diffs are not rendered by default.

74 changes: 55 additions & 19 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pypef/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
# §Equal contribution


__version__ = '0.3.3-alpha'
__version__ = '0.3.4'
136 changes: 112 additions & 24 deletions pypef/dca/gremlin_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,71 +46,142 @@
https://doi.org/10.1103/PhysRevE.87.012707
"""

from __future__ import annotations

import logging
logger = logging.getLogger('pypef.dca.params_inference')

from os import mkdir
from os import mkdir, PathLike, environ
import pickle
import numpy as np
import matplotlib.pyplot as plt
from Bio import AlignIO
from scipy.spatial.distance import pdist, squareform
from scipy.special import logsumexp
from scipy.stats import boxcox
import pandas as pd
import tensorflow as tf
tf.get_logger().setLevel('DEBUG')

from pypef.utils.variant_data import get_sequences_from_file
# Uncomment to hide GPU devices
#environ['CUDA_VISIBLE_DEVICES'] = '-1'


class GREMLIN:
"""
Alphabet char order in GREMLIN: "ARNDCQEGHILKMFPSTWYV-".
gap_cutoff = 0.5 and eff_cutoff = 0.8 are proposed by Hopf et al. and are the default values here;
to use all columns instead, set gap_cutoff >= 1.0 and eff_cutoff > 1.0.
v_ini represents the log of the weighted frequency (plus a pseudo count) of each amino acid at each position, i.e.,
np.log(np.sum(onehot_cat_msa.T * self.msa_weights, -1).T + pseudo_count).
"""
def __init__(
        self,
        alignment: str | PathLike,
        char_alphabet: str = "ARNDCQEGHILKMFPSTWYV-",
        wt_seq=None,
        offset=0,
        optimize=True,
        gap_cutoff=0.5,
        eff_cutoff=0.8,
        opt_iter=100,
        max_msa_seqs: int | None = 10000,
        seqs=None
):
    """
    Load the MSA, trim gap-rich columns, compute sequence weights,
    initialize the v and W terms and (optionally) optimize them.

    Parameters
    ----------
    alignment: str | PathLike
        Path to the MSA file; only read when 'seqs' is None.
    char_alphabet: str
        Alphabet used for integer encoding; its length defines the
        number of states and its last character is the gap state.
    wt_seq: str | None
        Wild-type sequence. If None, the first MSA sequence is used.
    offset: int
        Stored as self.offset.
    optimize: bool
        If True, run_opt_tf() is run and v_opt/w_opt are set.
    gap_cutoff: float
        Columns with a gap fraction >= gap_cutoff are removed.
    eff_cutoff: float
        Stored as self.eff_cutoff.
    opt_iter: int
        Stored as self.opt_iter.
    max_msa_seqs: int | None
        Maximum number of MSA sequences to encode; None means
        (practically) no limit.
    seqs: sequence of str | None
        Already loaded MSA sequences; if given, 'alignment' is not read.

    Raises
    ------
    SystemError
        If the wild-type sequence length differs from the number
        of MSA columns.
    """
    if not tf.config.list_physical_devices('GPU'):
        logger.info('Using CPU for GREMLIN computations...')
    else:
        logger.info('Using GPU for GREMLIN computations...')

    self.char_alphabet = char_alphabet
    self.allowed_chars = "ARNDCQEGHILKMFPSTWYV-"
    self.allowed_chars += self.allowed_chars.lower()
    self.offset = offset
    self.gap_cutoff = gap_cutoff
    self.eff_cutoff = eff_cutoff
    self.opt_iter = opt_iter
    self.gaps_1_indexed = None
    # Identity check: '== None' can be overridden by custom __eq__
    if max_msa_seqs is None:
        self.max_msa_seqs = 1E9
    else:
        self.max_msa_seqs = max_msa_seqs
    self.states = len(self.char_alphabet)
    logger.info('Loading MSA...')
    if seqs is None:
        self.seqs, self.seq_ids = self.get_sequences_from_msa(alignment)
    else:
        self.seqs = seqs
        # No identifiers available: use running indices instead
        self.seq_ids = np.arange(len(self.seqs))
    logger.info(f'Found {len(self.seqs)} sequences in the MSA...')
    self.msa_ori = self.get_msa_ori()
    logger.info(f'MSA shape: {np.shape(self.msa_ori)}')
    self.n_col_ori = self.msa_ori.shape[1]
    if wt_seq is not None:
        self.wt_seq = wt_seq
    else:  # Taking the first sequence in the MSA as wild type sequence
        self.wt_seq = "".join([self.char_alphabet[i] for i in self.msa_ori[0]])
        logger.info(f"No wild-type sequence provided: The first sequence "
                    f"in the MSA is considered the wild-type sequence "
                    f"(Length: {len(self.wt_seq)}):\n{self.wt_seq}\n")
    if len(self.wt_seq) != self.n_col_ori:
        raise SystemError(f"Length of (provided) wild-type sequence ({len(self.wt_seq)}) "
                          f"does not match number of MSA columns ({self.n_col_ori}), "
                          f"i.e., common MSA sequence length.")
    logger.info('Filtering gaps...')
    self.msa_trimmed, self.v_idx, self.w_idx, self.w_rel_idx, self.gaps = self.filt_gaps(self.msa_ori)
    logger.info('Getting effective sequence weights...')
    self.msa_weights = self.get_eff_msa_weights(self.msa_trimmed)
    self.n_eff = np.sum(self.msa_weights)
    self.n_row = self.msa_trimmed.shape[0]
    self.n_col = self.msa_trimmed.shape[1]
    logger.info('Initializing v and W terms based on MSA frequencies...')
    self.v_ini, self.w_ini, self.aa_counts = self.initialize_v_w(remove_gap_entries=False)
    self.aa_freqs = self.aa_counts / self.n_row
    self.optimize = optimize
    if self.optimize:
        self.v_opt_with_gaps, self.w_opt_with_gaps = self.run_opt_tf()
        no_gap_states = self.states - 1
        # No trailing comma here: it would wrap the array in a 1-tuple
        self.v_opt = self.v_opt_with_gaps[:, :no_gap_states]
        self.w_opt = self.w_opt_with_gaps[:, :no_gap_states, :, :no_gap_states]
    self.x_wt = self.collect_encoded_sequences(np.atleast_1d(self.wt_seq))

def get_sequences_from_msa(self, msa_file: str):
    """
    Read all sequences and their identifiers from an MSA file.

    The file is parsed with Bio.AlignIO using the "fasta" format.

    Parameters
    ----------
    msa_file: str
        Path to MSA in FASTA or A2M format.

    Returns
    -------
    sequences: np.ndarray
        The aligned sequences as strings, in file order.
    seq_ids: np.ndarray
        The record identifiers as strings, in the same order.
    """
    # Context manager closes the file handle even if parsing fails
    with open(msa_file) as msa_handle:
        alignment = AlignIO.read(msa_handle, "fasta")
    sequences = [str(record.seq) for record in alignment]
    seq_ids = [str(record.id) for record in alignment]
    return np.array(sequences), np.array(seq_ids)

def a2n_dict(self):
    """
    Map each alphabet character to its integer index, e.g.,
    for the default alphabet: {"A": 0, "R": 1, "N": 2, ...}
    """
    return dict(zip(self.char_alphabet, range(self.states)))

def aa2int(self, aa):
"""convert single aa into numerical integer value, e.g.:
"A" -> 0 or "-" to 21 dependent on char_alphabet"""
"""
Convert a single aa into its numerical integer value, e.g.:
"A" -> 0 or "-" -> 20 for the default char_alphabet
"""
a2n = self.a2n_dict()
if aa in a2n:
return a2n[aa]
Expand All @@ -119,7 +190,8 @@ def aa2int(self, aa):

def seq2int(self, aa_seqs):
"""
convert a single sequence or a list of sequences into a list of integer sequences, e.g.:
Convert a single sequence or a list of sequences
into a list of integer sequences, e.g.:
["ACD","EFG"] -> [[0,4,3], [6,13,7]]
"""
if type(aa_seqs) == str:
Expand All @@ -139,28 +211,38 @@ def get_v_idx_w_idx(self):
return self.v_idx, self.w_idx

def get_msa_ori(self):
    """
    Converts the list of sequences to an integer-encoded MSA.
    Each (upper-cased) character is encoded with aa2int; at most
    max_msa_seqs sequences are encoded, the rest is skipped.
    """
    encoded_rows = []
    for row_no, (sequence, _seq_id) in enumerate(zip(self.seqs, self.seq_ids)):
        if row_no >= self.max_msa_seqs:
            logger.info(f'Reached max. number of MSA sequences ({self.max_msa_seqs})...')
            break
        encoded_rows.append([self.aa2int(residue.upper()) for residue in sequence])
    return np.array(encoded_rows)

def filt_gaps(self, msa_ori):
    """
    Filters the alignment to remove gappy positions, i.e., columns
    whose gap fraction is >= gap_cutoff.

    Returns the trimmed MSA, the kept column indices (v_idx), the
    pairs of kept column indices (w_idx), the same pairs relative to
    the trimmed MSA (w_rel_idx), and the removed column indices (gaps).
    Also stores the removed positions 1-indexed in self.gaps_1_indexed.
    """
    # The gap state is the last state of the alphabet
    is_gap = (msa_ori == self.states - 1).astype(float)
    gap_fraction = np.sum(is_gap.T, -1).T / msa_ori.shape[0]
    non_gaps = np.where(gap_fraction < self.gap_cutoff)[0]
    gaps = np.where(gap_fraction >= self.gap_cutoff)[0]
    self.gaps_1_indexed = [g+1 for g in gaps]
    logger.info(f'Gap positions (removed from MSA; 1-indexed):\n{self.gaps_1_indexed}')
    ncol_trimmed = len(non_gaps)
    logger.info(f'Positions remaining: {ncol_trimmed} of {np.shape(msa_ori)[1]} '
                f'({(ncol_trimmed / np.shape(msa_ori)[1]) * 100 :.2f}%)')
    # Upper-triangle index pairs (i < j) over the trimmed columns
    w_rel_idx = np.stack(np.triu_indices(ncol_trimmed, 1), -1)
    v_idx = non_gaps
    w_idx = v_idx[w_rel_idx]
    return msa_ori[:, non_gaps], v_idx, w_idx, w_rel_idx, gaps

def get_eff_msa_weights(self, msa):
"""compute effective weight for each sequence"""
"""Compute effective weight for each sequence"""
# pairwise identity
pdistance_msa = pdist(msa, "hamming")
msa_sm = 1.0 - squareform(pdistance_msa)
Expand All @@ -174,9 +256,11 @@ def l2(x):
return np.sum(np.square(x))

def objective(self, v, w=None, flattened=True):
"""Same objective function as used in run_opt_tf below
"""
Same objective function as used in run_opt_tf below
but here only using numpy not TensorFlow functions.
Potentially helpful for implementing SciPy optimizers."""
Potentially helpful for implementing SciPy optimizers.
"""
if w is None:
w = self.w_ini
onehot_cat_msa = np.eye(self.states)[self.msa_trimmed]
Expand Down Expand Up @@ -352,9 +436,10 @@ def feed(feed_all=False):
# save the v and w parameters of the MRF
v_opt = sess.run(v)
w_opt = sess.run(w)

no_gap_states = self.states - 1
return v_opt[:, :no_gap_states], w_opt[:, :no_gap_states, :, :no_gap_states]
#return v_opt, w_opt


def initialize_v_w(self, remove_gap_entries=True):
"""
Expand Down Expand Up @@ -390,6 +475,9 @@ def get_v_w_opt(self):
)

def get_score(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.0, recompute_z=False):
"""
Computes the GREMLIN score for a given sequence or list of sequences.
"""
if v is None or w is None:
if self.optimize:
v, w = self.v_opt, self.w_opt
Expand Down
33 changes: 19 additions & 14 deletions pypef/dca/hybrid_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
import numpy as np
import sklearn.base
from scipy.stats import spearmanr
from sklearnex import patch_sklearn
patch_sklearn(verbose=False)
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, train_test_split
from scipy.optimize import differential_evolution
Expand All @@ -50,24 +52,23 @@


class DCAHybridModel:
alphas = np.logspace(-6, 6, 100) # Grid for the parameter 'alpha'.
parameter_range = [(0, 1), (0, 1)] # Parameter range of 'beta_1' and 'beta_2' with lb <= x <= ub
# TODO: Implementation of other regression techniques (CVRegression models)

def __init__(
self,
alphas=alphas,
parameter_range=None,
x_train: np.ndarray = None,
y_train: np.ndarray = None,
x_test: np.ndarray = None, # not necessary for training
y_test: np.ndarray = None, # not necessary for training
x_wt=None
x_wt=None,
alphas=None, # Ridge regression grid for the parameter 'alpha'
parameter_range=None, # Parameter range of 'beta_1' and 'beta_2' with lower bound <= x <= upper bound
):
if parameter_range is None:
parameter_range = parameter_range
self._alphas = alphas
self._parameter_range = parameter_range
parameter_range = [(0, 1), (0, 1)]
if alphas is None:
alphas = np.logspace(-6, 6, 100)
self.alphas = alphas
self.parameter_range = parameter_range
self.x_train = x_train
self.y_train = y_train
self.x_test = x_test
Expand Down Expand Up @@ -148,7 +149,7 @@ def _delta_e(
of the variant and wild-type sequence.
dE = E (variant) - E (wild-type)
with E = \sum_{i} h_i (o_i) + \sum_{i<j} J_{ij} (o_i, o_j)
with E = sum_{i} h_i (o_i) + sum_{i<j} J_{ij} (o_i, o_j)
Parameters
----------
Expand Down Expand Up @@ -198,7 +199,7 @@ def ridge_predictor(
Ridge object trained on 'x_train' and 'y_train' (cv=5)
with optimized 'alpha'.
"""
grid = GridSearchCV(Ridge(), {'alpha': self._alphas}, cv=5)
grid = GridSearchCV(Ridge(), {'alpha': self.alphas}, cv=5)
grid.fit(x_train, y_train)
return Ridge(**grid.best_params_).fit(x_train, y_train)

Expand Down Expand Up @@ -637,7 +638,8 @@ def save_model_to_dict_pickle(
if model_type is None:
model_type = 'MODEL'

logger.info(f'Save model as Pickle file... {model_type}')
pkl_path = os.path.abspath(f'Pickles/{model_type}')
logger.info(f'Saving model as Pickle file ({pkl_path})...')
pickle.dump(
{
'model': model,
Expand Down Expand Up @@ -693,7 +695,7 @@ def plmc_or_gremlin_encoding(
logger.info(f"Following positions are frequent gap positions in the MSA "
f"and cannot be considered for effective modeling, i.e., "
f"substitutions at these positions are removed as these would be "
f"predicted as wild type:\n{[gap + 1 for gap in model.gaps]}.\n"
f"predicted with wild-type fitness:\n{[gap + 1 for gap in model.gaps]}.\n"
f"Effective positions (N={len(model.v_idx)}) are:\n"
f"{[v_pos + 1 for v_pos in model.v_idx]}")
xs, x_wt, variants, sequences, ys_true = gremlin_encoding(
Expand Down Expand Up @@ -1025,14 +1027,17 @@ def performance_ls_ts(

save_model_to_dict_pickle(model, model_type, None, None, spearmanr(y_test, ys_pred)[0], None)

model_type = f'{model_type}_no_ML'

else:
raise SystemError('No Test Set given for performance estimation.')

spearman_rho = spearmanr(y_test, ys_pred)
logger.info(f'Spearman Rho = {spearman_rho[0]:.3f}')

plot_y_true_vs_y_pred(
np.array(y_test), np.array(ys_pred), np.array(test_variants), label=label, hybrid=True
np.array(y_test), np.array(ys_pred), np.array(test_variants),
label=label, hybrid=True, name=model_type
)


Expand Down
6 changes: 3 additions & 3 deletions pypef/dca/plmc_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ class InvalidVariantError(Exception):

def __init__(self, variant: str):
self.variant = variant
message = "The entered variant '%s' does not follow the required scheme " \
"(integer enclosed by two one letter code representations of amino acids). " \
"Check separator or variant." % self.variant
message = f"The entered variant '{self.variant}' does not follow the required scheme " \
f"(integer enclosed by two one letter code representations of amino acids). " \
f"Check separator or variant."
self.message = message
super().__init__(self.message)

Expand Down
Loading

0 comments on commit 5862936

Please sign in to comment.