diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..fc9e9c3
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,43 @@
+# This workflow will install Python dependencies, run tests and lint with multiple versions of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: build
+
+on: [push]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+
+    # A list under `runs-on` is a set of labels that ONE runner must carry in full,
+    # so the operating systems are fanned out through the matrix instead.
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        python-version: ["3.9", "3.10", "3.11"]
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      # The `[ -f ... ]` test is Bash syntax; Windows runners default to PowerShell.
+      shell: bash
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest
diff --git a/README.md b/README.md
index 29ba344..096ac71 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@ Preprint available at bioRxiv: https://doi.org/10.1101/2022.06.07.495081.
# PyPEF: Pythonic Protein Engineering Framework [![PyPI version](https://img.shields.io/pypi/v/PyPEF?color=blue)](https://pypi.org/project/pypef/) [![Python version](https://img.shields.io/pypi/pyversions/PyPEF)](https://www.python.org/downloads/) +![Build](https://github.com/Protein-Engineering-Framework/PyPEF/actions/workflows/build.yml/badge.svg) a framework written in Python 3 for performing sequence-based machine learning-assisted protein engineering to predict a protein's fitness from its sequence using different forms of sequence encoding: - One-hot encoding diff --git a/pypef/__init__.py b/pypef/__init__.py index 1fe65ba..16d8b1f 100644 --- a/pypef/__init__.py +++ b/pypef/__init__.py @@ -17,4 +17,4 @@ # §Equal contribution -__version__ = '0.3.1-alpha' +__version__ = '0.3.2-alpha' diff --git a/pypef/dca/gremlin_inference.py b/pypef/dca/gremlin_inference.py index bf13576..cc2cbc3 100644 --- a/pypef/dca/gremlin_inference.py +++ b/pypef/dca/gremlin_inference.py @@ -79,7 +79,6 @@ def __init__( self.eff_cutoff = eff_cutoff self.opt_iter = opt_iter self.states = len(self.char_alphabet) - self.a2n = self.a2n_dict() self.seqs, _, _ = get_sequences_from_file(alignment) self.msa_ori = self.get_msa_ori() self.n_col_ori = self.msa_ori.shape[1] @@ -97,7 +96,7 @@ def __init__( self.n_eff = np.sum(self.msa_weights) self.n_row = self.msa_trimmed.shape[0] self.n_col = self.msa_trimmed.shape[1] - self.v_ini, self.w_ini = self.initialize_v_w(remove_gap_entries=False) + self.v_ini, self.w_ini, self.aa_counts = self.initialize_v_w(remove_gap_entries=False) self.optimize = optimize if self.optimize: self.v_opt, self.w_opt = self.run_opt_tf() @@ -110,27 +109,30 @@ def a2n_dict(self): return a2n def aa2int(self, aa): - """convert single aa into numerical integer value, e.g. 
+ """convert single aa into numerical integer value, e.g.: "A" -> 0 or "-" to 21 dependent on char_alphabet""" - if aa in self.a2n: - return self.a2n[aa] + a2n = self.a2n_dict() + if aa in a2n: + return a2n[aa] else: # for unknown characters insert Gap character - return self.a2n['-'] + return a2n['-'] - def str2int(self, x): + def seq2int(self, aa_seqs): """ - convert a list of strings into list of integers - Example: ["ACD","EFG"] -> [[0,4,3], [6,13,7]] + convert a single sequence or a list of sequences into a list of integer sequences, e.g.: + ["ACD","EFG"] -> [[0,4,3], [6,13,7]] """ - if type(x) == list: - x = np.array(x) - if x.dtype.type is np.str_: - if x.ndim == 0: # single seq - return np.array([self.aa2int(aa) for aa in str(x)]) + if type(aa_seqs) == str: + aa_seqs = np.array(aa_seqs) + if type(aa_seqs) == list: + aa_seqs = np.array(aa_seqs) + if aa_seqs.dtype.type is np.str_: + if aa_seqs.ndim == 0: # single seq + return np.array([self.aa2int(aa) for aa in str(aa_seqs)]) else: # list of seqs - return np.array([[self.aa2int(aa) for aa in seq] for seq in x]) + return np.array([[self.aa2int(aa) for aa in seq] for seq in aa_seqs]) else: - return x + return aa_seqs @property def get_v_idx_w_idx(self): @@ -150,7 +152,7 @@ def filt_gaps(self, msa_ori): non_gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] < self.gap_cutoff)[0] gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] >= self.gap_cutoff)[0] - logger.info(f'Gap positions (removed from msa):\n{gaps}') + logger.info(f'Gap positions (removed from MSA; 0-indexed):\n{gaps}') ncol_trimmed = len(non_gaps) v_idx = non_gaps w_idx = v_idx[np.stack(np.triu_indices(ncol_trimmed, 1), -1)] @@ -362,17 +364,19 @@ def initialize_v_w(self, remove_gap_entries=True): """ w_ini = np.zeros((self.n_col, self.states, self.n_col, self.states)) onehot_cat_msa = np.eye(self.states)[self.msa_trimmed] + aa_counts = np.sum(onehot_cat_msa, axis=0) pseudo_count = 0.01 * np.log(self.n_eff) v_ini = 
np.log(np.sum(onehot_cat_msa.T * self.msa_weights, -1).T + pseudo_count) v_ini = v_ini - np.mean(v_ini, -1, keepdims=True) - # loss_score_ini = self.objective(v_ini, w_ini, flattened=False) # * self.n_eff + # loss_score_ini = self.objective(v_ini, w_ini, flattened=False) if remove_gap_entries: no_gap_states = self.states - 1 v_ini = v_ini[:, :no_gap_states] w_ini = w_ini[:, :no_gap_states, :, :no_gap_states] + aa_counts = aa_counts[:, :no_gap_states] - return v_ini, w_ini + return v_ini, w_ini, aa_counts @property def get_v_w_opt(self): @@ -390,10 +394,10 @@ def get_score(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.0 if self.optimize: v, w = self.v_opt, self.w_opt else: - v, w = self.v_ini, self.w_ini + v, w, _ = self.initialize_v_w(remove_gap_entries=True) if v_idx is None: v_idx = self.v_idx - seqs_int = self.str2int(seqs) + seqs_int = self.seq2int(seqs) # if length of sequence != length of model use only # valid positions (v_idx) from the trimmed alignment try: @@ -439,7 +443,7 @@ def get_score(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.0 else: return np.sum(h, axis=-1) - h_wt_seq - def get_wt_score(self, wt_seq=None, v=None, w=None): + def get_wt_score(self, wt_seq=None, v=None, w=None, encode=False): if wt_seq is None: wt_seq = self.wt_seq if v is None or w is None: @@ -448,7 +452,7 @@ def get_wt_score(self, wt_seq=None, v=None, w=None): else: v, w = self.v_ini, self.w_ini wt_seq = np.array(wt_seq, dtype=str) - return self.get_score(wt_seq, v, w) + return self.get_score(wt_seq, v, w, encode=encode) def collect_encoded_sequences(self, seqs, v=None, w=None, v_idx=None): """ @@ -541,17 +545,22 @@ def plot_correlation_matrix(self, matrix_type: str = 'apc', set_diag_zero=True): else: ax.imshow(matrix, cmap='Blues') tick_pos = ax.get_xticks() + tick_pos = np.array([int(t) for t in tick_pos]) tick_pos[-1] = matrix.shape[0] - tick_pos[2:] -= 1 + if tick_pos[2] > 1: + tick_pos[2:] -= 1 ax.set_xticks(tick_pos) 
ax.set_yticks(tick_pos) labels = [item.get_text() for item in ax.get_xticklabels()] - labels = [labels[0]] + [str(int(label) + 1) for label in labels[1:]] + try: + labels = [labels[0]] + [str(int(label) + 1) for label in labels[1:]] + except ValueError: + pass ax.set_xticklabels(labels) ax.set_yticklabels(labels) ax.set_xlim(-1, matrix.shape[0]) ax.set_ylim(-1, matrix.shape[0]) - plt.title(matrix_type) + plt.title(matrix_type.upper()) plt.savefig(f'{matrix_type}.png', dpi=500) plt.close('all') diff --git a/scripts/GREMLIN_numba/README.md b/scripts/GREMLIN_numba/README.md new file mode 100644 index 0000000..e52a68b --- /dev/null +++ b/scripts/GREMLIN_numba/README.md @@ -0,0 +1,15 @@ +## GREMLIN in Python using numba + +GREMLIN_CPP-like (L-BFGS and CG-based MSA-DCA optimization) port to Python using numba. + + +GREMLIN_CPP (https://github.com/sokrypton/GREMLIN_CPP) is licensed under + +---------------------------------------------------------------------------- + +"THE BEER-WARE LICENSE" (Revision 42): + wrote this file. As long as you retain this notice you +can do whatever you want with this stuff. If we meet some day, and you think +this stuff is worth it, you can buy me a beer in return. Sergey Ovchinnikov + +---------------------------------------------------------------------------- diff --git a/scripts/GREMLIN_numba/gremlin_inference_numba.py b/scripts/GREMLIN_numba/gremlin_inference_numba.py new file mode 100644 index 0000000..fc09c82 --- /dev/null +++ b/scripts/GREMLIN_numba/gremlin_inference_numba.py @@ -0,0 +1,799 @@ +# Gremlin in Python + +""" +------------------------------------------------------------ +"THE BEERWARE LICENSE" (Revision 42): + wrote this code. As long as you retain this + notice, you can do whatever you want with this stuff. If we meet + someday, and you think this stuff is worth it, you can buy me a + beer in return. 
--Sergey Ovchinnikov + ------------------------------------------------------------ + If you use this code, please cite the following papers: + + Balakrishnan, Sivaraman, Hetunandan Kamisetty, Jaime G. Carbonell, + Su-In Lee, and Christopher James Langmead. + "Learning generative models for protein fold families." + Proteins: Structure, Function, and Bioinformatics 79, no. 4 (2011): 1061-1078. + + Kamisetty, Hetunandan, Sergey Ovchinnikov, and David Baker. + "Assessing the utility of coevolution-based residue–residue + contact predictions in a sequence-and structure-rich era." + Proceedings of the National Academy of Sciences (2013): 201314045. +""" + +# Using numba for getting closer to C/C++ performance: +# https://numba.pydata.org/numba-doc/latest/user/performance-tips.html +# https://numba.pydata.org/numba-doc/latest/user/parallel.html#numba-parallel-supported + +import numpy as np +from numba import njit, prange +from typing import List + + +def get_sequences_from_file(file: str) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + "Get_Sequences" reads (learning and test) .fasta and + .fasta-like ".fasl" format files and extracts the name, + the target value and the sequence of the protein. + Only takes one-liner sequences for correct input. + See example directory for required fasta file format. + Make sure every marker (> and ;) is separated by a + space ' ' from the value respectively name. 
+ file: str : File in FASTA or A2M format + """ + sequences = [] + values = [] + names_of_mutations = [] + + with open(file, 'r') as f: + words = "" + for line in f: + if line.startswith('>'): + if words != "": + sequences.append(words) + words = line.split('>') + names_of_mutations.append(words[1].strip()) + words = "" + + elif line.startswith('#'): + pass # are comments + + elif line.startswith(';'): + if words != "": + sequences.append(words) + words = line.split(';') + values.append(float(words[1].strip())) + words = "" + + else: + try: + words += line.strip() + except IndexError: + raise IndexError("Learning or Validation sets (.fasta) likely " + "have emtpy lines (e.g. at end of file)") + if words != "": + sequences.append(words) + # Check consistency + if len(values) != 0: + if len(sequences) != len(values): + raise SystemError( + f'Error: Number of sequences does not fit with number of target values! ' + f'Number of sequences: {str(len(sequences))}, Number of target values: {str(len(values))}.' + ) + + return np.array(sequences), np.array(names_of_mutations), np.array(values) + + +def get_seqs_from_var_name( + wt_seq, + substitutions, + fitness_values +) -> tuple[list, list, list]: + """ + Similar to function above but just returns sequences + + wt: str + Wild-type sequence as string + substitutions: list + List of substiutuions of a single variant of the format: + - Single substitution variant, e.g. variant A123C: ['A123C'] + - Higher variants, e.g. variant A123C/D234E/F345G: ['A123C', 'D234E, 'F345G'] + --> Full substitutions list, e.g.: [['A123C'], ['A123C', 'D234E, 'F345G']] + fitness_values: list + List of ints/floats of the variant fitness values, e.g. 
for two variants: [1.4, 0.8] + """ + variant, values, sequences = [], [], [] + for i, var in enumerate(substitutions): # var are lists of (single or multiple) substitutions + temp = list(wt_seq) + name = '' + separation = 0 + if var == ['WT']: + name = 'WT' + else: + for single_var in var: # single entries of substitution list + position_index = int(str(single_var)[1:-1]) - 1 + new_amino_acid = str(single_var)[-1] + temp[position_index] = new_amino_acid + # checking if multiple entries are inside list + if separation == 0: + name += single_var + else: + name += '/' + single_var + separation += 1 + variant.append(name) + values.append(fitness_values[i]) + sequences.append(''.join(temp)) + + return variant, values, sequences + + +def a2n_dict(char_alphabet="ARNDCQEGHILKMFPSTWYV-"): + states = len(char_alphabet) + a2n = {} + for a, n in zip(char_alphabet, range(states)): + a2n[a] = n + return a2n + + +def aa2int(aa): + """convert single aa into numerical integer value, e.g. + "A" -> 0 or "-" to 21 dependent on char_alphabet""" + a2n = a2n_dict() + if aa in a2n: + return a2n[aa] + else: # for unknown characters insert Gap character + return a2n['-'] + + +def seqs2int(msa_seqs): + """converts list of sequences to msa""" + msa_ori = [] + for seq in msa_seqs: + msa_ori.append([aa2int(aa.upper()) for aa in seq]) + msa_ori = np.array(msa_ori) + return msa_ori + + +def filt_gaps(msa_ori, gap_cutoff=0.5): + """filters alignment to remove gappy positions""" + tmp = (msa_ori == 21 - 1).astype(float) + non_gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] < gap_cutoff)[0] + gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] >= gap_cutoff)[0] + return msa_ori[:, non_gaps], gaps + + +def aa_counts_to_v_ini(): + pass + + +@njit +def get_1d_position(i_pos: int, aa_pos: int, na: int = 21) -> int: + """ + positions for 1-bd term + """ + return i_pos * na + aa_pos + + +@njit +def get_2d_position(w: int, a: int, b: int, nc: int, na: int = 21) -> int: + """ + positions for 2-bd 
term + + w: int from range 0, 1, ..., pair_size-1 + a: int from range 0, 1, ..., na-1 (=20) + b: int from range 0, 1, ..., na-1 (=20) + nc: Number of trimmed MSA columns for shifting position by N1 = nc * na + """ + return nc * na + w * na * na + a * na + b + + +@njit +def get_h_entropy(msa_trimmed, weights, na : int = 21): + """ + Transform AA counts to entropy = counts * eff[sequence_n] + """ + nr = msa_trimmed.shape[0] + nc = msa_trimmed.shape[1] + freq = np.zeros(nc * na) + for i in range(nc): + for n in range(nr): + d = get_1d_position(i, msa_trimmed[n][i]) + freq[d] += weights[n] + return freq + + +@njit +def get_all_pairs(nc): + """ + Coupling pairs of two, resulting in shape (int(nc * (nc - 1) / 2), 2) + """ + pairs = [] + for i in range(nc): + for j in range(i + 1, nc): + pairs.append([i, j]) + return pairs + + +@njit +def get_eff_weights(msa_trimmed: np.ndarray, eff_cutoff: float = 0.8): + """ + Weights alignment sequences according to sequence similarity: + If sequences are too identical, they get down-weighted. 
+ + :return: + """ + nr = msa_trimmed.shape[0] + nc = msa_trimmed.shape[1] + chk = nc * eff_cutoff + N_ = np.ones(nr) + eff_weights = np.ones(nr) + for n in prange(nr): + w = N_[n] + for m in range(n+1, nr): + hm = 0 + for i in range(nc): + if msa_trimmed[n][i] == msa_trimmed[m][i]: + hm += 1 + if hm > chk: + N_[m] += 1 + w += 1 + eff_weights[n] = 1.0 / w + neff = np.sum(eff_weights) + return neff, eff_weights + + +@njit +def eval_v( + x: np.ndarray, + msa_trimmed: np.ndarray, + na: int = 21, + lam_v=0.01 +) -> tuple[float, np.ndarray]: + """ + 1-bd term + :return: float + """ + # nr = msa_trimmed.shape[0] + nc = msa_trimmed.shape[1] + g = np.zeros(nc * na) # Reset g + neff, weights = get_eff_weights(msa_trimmed) + f = get_h_entropy(msa_trimmed, weights) + + fx = 0.0 + reg = 0.0 + PC = np.zeros(shape=(nc, na)) # (P)robability of (C)olumn entries + + for i in range(nc): + d = get_1d_position(i, 0) + for a in range(na): + PC[i][a] += x[d] + d += 1 + + for i in range(nc): + # compute Z for each column + Z = 0.0 + for a in range(na): + Z += np.exp(PC[i][a]) + Z = np.log(Z) # The natural logarithm is logarithm in base e + for a in range(na): + fx += f[get_1d_position(i, a)] * (PC[i][a] - Z) + PC[i][a] = np.exp(PC[i][a] - Z) + for i in range(nc): + d = get_1d_position(i, 0) + for a in range(na): + g[d] += (PC[i][a] * neff) - f[get_1d_position(i, a)] + (lam_v * 2 * x[d]) # Changing of gradient + reg += lam_v * np.power(x[d], 2) + d += 1 + + return -1.0 * (fx - reg), g + + +@njit(parallel=True) +def eval_vw( + x: np.ndarray, + msa_trimmed: np.ndarray, + na: int = 21, + lam_v: float = 0.01 +) -> tuple[float, np.ndarray]: + """ + Updates + (g)radient (array) + and fx (float) + according to given input + x (array). 
+ N1 = nc * na + N2 = N1 + pair_size * na * na + = N1 + + """ + nr = msa_trimmed.shape[0] + nc = msa_trimmed.shape[1] + pairs = get_all_pairs(nc) + pair_size = int(nc * (nc - 1) / 2) + N1 = nc * na + N2 = N1 + pair_size * na * na + g = np.zeros(N2) # Reset g + neff, weights = get_eff_weights(msa_trimmed) + lam_w = lam_v * (nc - 1.0) * (na - 1.0) + + fx = 0.0 + reg = 0.0 + PCN = np.zeros(shape=(nr, nc, na)) # (P)robability of (C)olumn entries?! + + for n in prange(nr): + # precompute sum(V+W) for each position "i" and amino acids "a" + # assuming all other positions are fixed + PC = np.zeros(shape=(nc, na)) + # for each position i + for i in range(nc): + # for each amino acid + for a in range(na): + # 1-body-term + PC[i][a] += x[get_1d_position(i, a)] + if True: # if (N == msa.N2) + for w in range(pair_size): + i = pairs[w][0] + j = pairs[w][1] + xni = msa_trimmed[n][i] + xnj = msa_trimmed[n][j] + for a in range(na): + PC[i][a] += x[get_2d_position(w, a, xnj, nc)] + PC[j][a] += x[get_2d_position(w, xni, a, nc)] + for i in range(nc): + # compute local Z + Z = 0.0 + for a in range(na): + Z += np.exp(PC[i][a]) + Z = np.log(Z) + + # compute fx + xni = msa_trimmed[n][i] + fx += (PC[i][xni] - Z) * weights[n] + + # needed for (g)radient calculation + for a in range(na): + PCN[n][i][a] = np.exp(PC[i][a] - Z) * weights[n] + # compute (g)radient for 1bd + if True: # if (N >= msa.N1) + for i in prange(nc): + for n in range(nr): + xni = msa_trimmed[n][i] + g[get_1d_position(i, xni)] -= weights[n] + for a in range(na): + g[get_1d_position(i, a)] += PCN[n][i][a] + # compute (g)radient for 2bd + if True: # if (N == msa.N2) + for w in prange(pair_size): + i = pairs[w][0] + j = pairs[w][1] + for n in range(nr): + xni = msa_trimmed[n][i] + xnj = msa_trimmed[n][j] + g[get_2d_position(w, xni, xnj, nc)] -= 2.0 * weights[n] + for a in range(na): + g[get_2d_position(w, a, xnj, nc)] += PCN[n][i][a] + g[get_2d_position(w, xni, a, nc)] += PCN[n][j][a] + # compute (reg)ularization and 
(g)raident for 1bd + if True: # if (N >= msa.N1) + for d in prange(N1): + reg += lam_v * np.power(x[d], 2) + g[d] += lam_v * 2.0 * x[d] + # compute (reg)ularization and (g)radient for 2bd + if True: # if (N == msa.N2) + for d in prange(N1, N2): + reg += lam_w * np.power(x[d], 2) + g[d] += lam_w * 2.0 * x[d] + # flip direction, since we are passing function to a minimizer + return -1.0 * (fx - reg), g + + +@njit +def deep_copy(arr: list | np.ndarray): + """ + Just makes a copy similar to the copy (deepcopy) module. + Required for making a copy of 1D-arrays using numba. + """ + ret: List[float] = list() + for value in arr: + ret.append(value) + return np.array(ret) + + +@njit(parallel=True) +def lbfgs(msa_trimmed, func, mode: str = 'vw', max_iter: int = 100, na: int = 21): + """ + Limited-memory Broyden–Fletcher–Goldfarb–Shanno (LBFGS) + --------------------------------------------------------------------------- + Adopted from: GREMLIN_CPP, https://github.com/sokrypton/GREMLIN_CPP, that is + Adopted from: https://github.com/js850/lbfgs_cpp + "THE BEER-WARE LICENSE" (Revision 42): + wrote this function. As long as you retain this notice + you can do whatever you want with this stuff. 
If we meet some day, and you + think this stuff is worth it, you can buy me a beer in return Jacob Stevenson + --------------------------------------------------------------------------- + modified to remove convergence criteria, will continue until max_iter + modified to remove maxstep check + --------------------------------------------------------------------------- + + func: Target function returning a scalar value to be minimized + :return: + """ + # nr = msa_trimmed.shape[0] + nc = msa_trimmed.shape[1] + + max_f_rise = 1e-4 + H0 = 0.1 + # maxstep = 0.2 + # pairs = get_all_pairs(nc) + pair_size = int(nc * (nc - 1) / 2) + N1 = nc * na + N2 = N1 + pair_size * na * na + + if mode == 'v': + N = nc * na + elif mode == 'vw': + N = N2 + else: + raise SystemError("Unknown mode.") + + M = 5 + y = np.zeros(shape=(M, N)) + s = np.zeros(shape=(M, N)) + x_new = np.zeros(N) + rho = np.zeros(M) + step = np.zeros(N) + + # Compute target (f)unction value + fx_new, g_new = func(x_new, msa_trimmed) + + print('lbfgs::iter S_S fx: ', fx_new, 'gnorm:', np.linalg.norm(g_new)) + + for iteration in range(max_iter): + g_old = deep_copy(g_new) + x_old = deep_copy(x_new) + fx_old = fx_new + if iteration == 0: + gnorm = np.linalg.norm(g_old) + if gnorm > 1.0: + gnorm = 1.0 / gnorm + for n in range(N): + step[n] = -gnorm * H0 * g_old[n] + else: # iteration > 0 + step = deep_copy(g_old) + jmin = iteration - M + if jmin < 0: + jmin = 0 + jmax = iteration + + # i = 0 + # beta = 0.0 + alpha = np.zeros(M) + + for j in range(jmax - 1, jmin - 1, -1): + i = j % M + alpha[i] = rho[i] * np.dot(s[i], step) + for n in range(N): + step[n] -= alpha[i] * y[i][n] + # Scale the step size by H0 + for n in range(N): + step[n] *= H0 + for j in range(jmin, jmax): + i = j % M + beta = rho[i] * np.dot(y[i], step) + for n in range(N): + step[n] += s[i][n] * (alpha[i] - beta) + # invert the step to point downhill + for n in range(N): + step[n] *= -1 + ############################################### + # 
backtracking_linesearch + ############################################### + # fnew = 0.0 + # if the step is pointing uphill, invert it + if np.dot(step, g_old) > 0.0: + for n in range(N): + step[n] *= -1 + attempt = 0 + factor = 1.0 + # stepsize = np.linalg.norm(step) + + step_sum = 0.0 + for step_i in step: + step_sum += step_i + # make sure the step is no larger than maxstep + # if (factor * stepsize > maxstep){factor = maxstep/stepsize;} + for nred in range(10): + attempt += 1 + for n in range(N): + x_new[n] = x_old[n] + factor * step[n] + fx_new, g_new = func(x_new, msa_trimmed) # new func calculation + df = fx_new - fx_old + if df < max_f_rise: + break + else: + factor /= 10.0 + # stepsize = stepsize * factor + ############################################### + # update_memory + ############################################### + klocal = iteration % M + for n in range(N): + y[klocal][n] = g_new[n] - g_old[n] + s[klocal][n] = x_new[n] - x_old[n] + ys = np.dot(y[klocal], s[klocal]) + if ys == 0.0: + ys = 1.0 + rho[klocal] = 1.0 / ys + + yy = np.dot(y[klocal], y[klocal]) + if yy == 0.0: + yy = 1.0 + H0 = ys / yy + try: + if (iteration + 1) % int(max_iter / 10) == 0: + print(iteration + 1, '/', max_iter, ' : ', fx_new) + except: # bare 'except' used for numba functionality + print(iteration + 1, '/', max_iter, ' : ', fx_new) + + return x_new + + +@njit +def cg(msa_trimmed, func, mode: str = 'vw', max_iter: int = 100, na: int = 21): + """ + Nonlinear Conjugate Gradient (CG) + --------------------------------------------------------------------------- + Adopted from: GREMLIN_CPP, https://github.com/sokrypton/GREMLIN_CPP, that is + Adopted from: https://bitbucket.org/soedinglab/libconjugrad + CCMpred is released under the GNU Affero General Public License v3 or later. 
+ --------------------------------------------------------------------------- + modified to remove convergence criteria, will continue until max_iter + --------------------------------------------------------------------------- + + :return: + """ + # nr = msa_trimmed.shape[0] + nc = msa_trimmed.shape[1] + pair_size = int(nc * (nc - 1) / 2) + N1 = nc * na + N2 = N1 + pair_size * na * na + + if mode == 'v': + N = N1 + elif mode == 'vw': + N = N2 + else: + raise SystemError("Unknown mode.") + + # epsilon = 1e-5 + ftol = 1e-4 + wolfe = 0.1 + alpha_mul = 0.5 + max_line = 10 + + s = np.zeros(N) + + gnorm_old = 0.0 + alpha_old = 0.0 + dg_old = 0.0 + + x = np.zeros(N) + + fx, g = func(x, msa_trimmed) + + gnorm = np.dot(g, g) + + # dg = 0.0 + alpha = 1.0 / np.sqrt(gnorm) + + print("# cg::iter S_S fx: ", fx, " gnorm: ", np.sqrt(gnorm)) + + for iteration in range(max_iter): + if iteration == 0: + for n in range(N): + s[n] = -g[n] + dg = np.dot(s, g) + else: # iteration > 0 + # Fletcher-Reeves + beta = gnorm / gnorm_old + for n in range(N): + s[n] = s[n] * beta - g[n] + dg = np.dot(s, g) + alpha = alpha_old * dg_old / dg + ######################################### + # Linesearch + ######################################### + attempts = 0 + dg_ini = dg + dg_test = dg_ini * ftol + fx_ini = fx + old_alpha = 0.0 + for line in range(max_line): + attempts += 1 + step = alpha - old_alpha + for n in range(N): + x[n] += s[n] * step + fx_step, g = func(x, msa_trimmed) + if fx_step <= fx_ini + alpha * dg_test: + if np.dot(s, g) < wolfe * dg_ini: + fx = fx_step + break + old_alpha = alpha + alpha *= alpha_mul + ######################################### + gnorm_old = gnorm + gnorm = np.dot(g, g) + alpha_old = alpha + dg_old = dg + try: + if (iteration + 1) % int(max_iter / 10) == 0: + print(iteration + 1, '/', max_iter, ' : ', fx) + except: # bare 'except' used for numba functionality + print(iteration + 1, '/', max_iter, ' : ', fx) + return x + + +def get_seqs_onehot_1bd(seqs_int): + 
new_seqs_oh = [] + for arr in seqs_int: + new_seq_oh = np.zeros(len(arr) * 21, dtype=int) + for i, a, in enumerate(arr): + d = get_1d_position(i, a) + new_seq_oh[d] += 1 + new_seqs_oh.append(new_seq_oh) + return np.array(new_seqs_oh) + + +def get_pair_size(seqs_int): + for arr in seqs_int[:1]: + c = 0 + for a_i in range(len(arr)): + for b_i in range(a_i + 1, len(arr)): + c += 1 + return c + + +@njit +def oh_1bd_predict(seqs_int, x_opt, na: int = 21): + nc = seqs_int.shape[1] + N1 = nc * na + y_pred_all: List[float] = list() + for arr in seqs_int: + new_seq_oh = np.zeros(N1) + for i, a, in enumerate(arr): + d = get_1d_position(i, a) + new_seq_oh[d] += 1 + # Predict here directly to save memory instead + # of storing whole one-hot vector + y_pred = np.sum(new_seq_oh * x_opt) + y_pred_all.append(y_pred) + return y_pred_all + + +@njit(parallel=True) +def oh_2bd_predict(seqs_int, x_opt, na: int = 21): + nc = seqs_int.shape[1] + pair_size = int(nc * (nc - 1) / 2) + N2 = nc * na + pair_size * na * na + y_pred_all: List[float] = list() + pairs = get_all_pairs(nc) + for arr in seqs_int: + new_seq_oh = np.zeros(N2) + for i, a, in enumerate(arr): + d = get_1d_position(i, a) + new_seq_oh[d] += 1 + for w in prange(pair_size): + i = pairs[w][0] + j = pairs[w][1] + xni = arr[i] + xnj = arr[j] + new_seq_oh[get_2d_position(w, xni, xnj, nc)] += 1 + # Predict here directly to save memory instead + # of storing whole one-hot vector + y_pred = np.sum(new_seq_oh * x_opt) + y_pred_all.append(y_pred) + return y_pred_all + + +@njit +def oh_1bd_1d_encode(seqs_int, x_opt): + """ + Encoding the input sequences (amino acids to integer-transformed + trimmed MSA sequences). + As V, i.e., 1-body-transformed, sequences require much memory to + be stored, encoding leads to len(sequence) encoded sequences, i.e., + adding up sequence position-dependent probabilities. 
+ + :param seqs_int: + :return: + """ + x_oh_pos_all: List[float] = list() + for arr in seqs_int: + x_oh_pos = np.zeros(np.shape(seqs_int)[1]) + for i, a, in enumerate(arr): + d = get_1d_position(i, a) + x_oh_pos[i] += x_opt[d] + x_oh_pos_all.append(x_oh_pos) + + return x_oh_pos_all + + + +@njit(parallel=True) +def oh_2bd_1d_encode(seqs_int, x_opt): + """ + Encoding the input sequences (amino acids to integer-transformed + trimmed MSA sequences). + As V+W, i.e., 2-body-transformed, sequences require much memory to + be stored, encoding leads to len(sequence) encoded sequences, i.e., + adding up sequence position-dependent probabilities. + + :param seqs_int: + :return: + """ + nc = seqs_int.shape[1] + pairs = get_all_pairs(nc) + pair_size = int(nc * (nc - 1) / 2) + + x_oh_pos_all: List[float] = list() + for arr in seqs_int: + x_oh_pos = np.zeros(np.shape(seqs_int)[1]) + for i, a, in enumerate(arr): + d = get_1d_position(i, a) + x_oh_pos[i] += x_opt[d] + for w in prange(pair_size): + i = pairs[w][0] + j = pairs[w][1] + xni = arr[i] + xnj = arr[j] + d = get_2d_position(w, xni, xnj, nc) + x_oh_pos[i] += x_opt[d] + x_oh_pos_all.append(x_oh_pos) + + return x_oh_pos_all + + +def aa_count_predict(msa_trimmed, sequences_int_trimmed): + """ + Simple encoding based on amino acid counts (MSA column-based counts/frequencies). + + :param msa_trimmed: Multiple sequence alignment (MSA) in trimmed form. + :param sequences_int_trimmed: Int-converted input sequences to be used + for fitness prediction. + + :return: Predicted fitness values of input sequences. 
+ """ + onehot_cat_msa = np.eye(21)[msa_trimmed] + aa_counts = np.sum(onehot_cat_msa, axis=0) + sequences_int_trimmed_onehot = np.eye(21)[sequences_int_trimmed] + tmp = [] + for seq_oh in sequences_int_trimmed_onehot: + tmp.append(seq_oh.flatten()) + sequences_int_trimmed_onehot = np.array(tmp) + x_aac_pred = sequences_int_trimmed_onehot * aa_counts.flatten() + y_pred_aac = np.sum(x_aac_pred, axis=1) + return y_pred_aac + + +# def sum_encoded_seqs(x_seqs): +# return np.sum(x_seqs, axis=1) +# +# +# def oh_2bd_pred(seqs_int, x_opt): +# nc = seqs_int.shape[1] +# pairs = get_all_pairs(nc) +# pair_size = int(nc * (nc - 1) / 2) +# x_oh_pos_all: List[float] = list(list()) +# for arr in seqs_int: +# sum_x = 0.0 +# for i, a, in enumerate(arr): +# d = get_1d_position(i, a) +# sum_x += x_opt[d] +# for w in prange(pair_size): +# i = pairs[w][0] +# j = pairs[w][1] +# xni = arr[i] +# xnj = arr[j] +# d = get_2d_position(w, xni, xnj, nc) +# sum_x += x_opt[d] +# x_oh_pos_all.append(sum_x) +# return x_oh_pos_all diff --git a/scripts/GREMLIN_numba/using_gremlin_functionalities.ipynb b/scripts/GREMLIN_numba/using_gremlin_functionalities.ipynb new file mode 100644 index 0000000..1087f58 --- /dev/null +++ b/scripts/GREMLIN_numba/using_gremlin_functionalities.ipynb @@ -0,0 +1,304 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "64ae8419", + "metadata": {}, + "source": [ + "### Maximization of MSA sequence probability using L-BFGS (and CG) \n", + "Port of GREMLIN_CPP (https://github.com/sokrypton/GREMLIN_CPP) using Python and numba." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4274632d", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.stats import spearmanr" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cd9f9db9", + "metadata": {}, + "outputs": [], + "source": [ + "# Local .py file imports\n", + "from gremlin_inference_numba import (\n", + " get_sequences_from_file, seqs2int, filt_gaps, lbfgs, cg, get_seqs_from_var_name,\n", + " eval_v, eval_vw, oh_1bd_predict, oh_2bd_predict, aa_count_predict\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3df7d2d4", + "metadata": {}, + "source": [ + "Loading MSA sequences, converting alphabetical amino acid sequences to seuqences of integer values, and trimming MSA." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3cb55570", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['mskgeELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKqhDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMadkqKNGIKVNFKIRHNIEDGSVQLADHyqQNTPIGDGPVLLPDNHYLSTQSALSKDpNEKRDHMVLLEFVTAAGithgmdelyk'\n", + " 'metgrALFSKPMTCQTEIDGEINGNKFKVVGNGDS-PGGGDFSIHAYCTTGELPMSWVVLGSPLQYGFHMFSGYPDDII..HYFQECFPEGYILTRSLRFEYDGTLTTTHHYSLEGNCVKAKVTLKGEGFDPNGPTMTKEEEQHPSQVQIFPH....GSGIRLLSNVVFKKKDGTTQLALQdcSVKPLGSRDVPLPNVHFLRTQIIQKKDdSDKRDHVVQREIAIAEH..........'\n", + " '..rgrALFSNSMTSKTEIDGEINGKKFKVVGEGDS-PGGGDFTIRAYCTTGELPMSWVVMGSPLQYGFHMLSHYPDDIV..HYFQECFPEGYTLTRKLRFEGDGTLTTHHRYELAGTCVKAKVSLTGESFDPSGPTMTKTVEQLPNQVQVFPH....ADGIRLLSDVVFVKNDGTTQIAHQdcSVKPLATRKITLPRFHFLHTQISQWKDrSDKRDHVVQREVSKAE-..........'\n", + " ...\n", + " 'mersaSFFTGTAKSKVIAEIMVDDTEYKVSGEGFACPLEGHQTLELHCSGKAMSINWSILGTIIQSNFKLFTQYTGSCV.yDFFKTSFPGGLKVETTASFSDGAVIRGNSSLTYVKDTVICRCNIQCEGFCEESPARARDLGQTLPCYEVIEG..ykADEVTCTMDLEWNDSDKQKYLCRLesSFVSGGTGNF-APPRHFIGHHFKITDK.SPNNLHFAQRCKSRANRi.........'\n", + " 
'mersiSLFTGTAKSKVIATVKIDDMEYTITGEGFACPTEGQQNLELHCSGSALPINWCILGTIIQCNFKLFTQYKGSNE.yDFFKTSFPGGLKVEYVGSFVDGAEVSGNSTMSYVKDTVICRCSIQCKGFSEESPARAHDLGQTLTCYEVVEG..lrADEVTSQVSLEWLDGYNQKYACRLnsSVKSSGSQGNFSPSRHFIGHHFKVTDK.SPNNLHFAQRLKSRACSi.........'\n", + " 'mersiSLFTGTAKSKVIATVKIDDMEYTITGEGFACPTEGNQNLELHCSGTALPINWCILGTIIQCNFKLFTQYKGSNE.yDFFKTSFPGGLKVEYVGSFVDGAEVSGNSTMSYVKDTVICRCSIQCKGFSEESPARAHDLGQTLTCYEVVEG..lrADEVTSQVTLEWLDGYNQKYACRLnsSVKSSGSQGNFSPSRHFIGHHFKVTDK.SPNNLHFAQRLKSRACSi.........']\n", + "[[12 15 11 ... 10 18 11]\n", + " [12 6 16 ... 20 20 20]\n", + " [20 20 1 ... 20 20 20]\n", + " ...\n", + " [12 6 1 ... 20 20 20]\n", + " [12 6 1 ... 20 20 20]\n", + " [12 6 1 ... 20 20 20]]\n", + "(1046, 238)\n", + "[[11 7 6 ... 0 0 7]\n", + " [16 7 1 ... 0 6 8]\n", + " [ 1 7 1 ... 0 6 20]\n", + " ...\n", + " [ 1 15 0 ... 0 2 1]\n", + " [ 1 15 9 ... 0 4 15]\n", + " [ 1 15 9 ... 0 4 15]]\n", + "(1046, 220)\n" + ] + } + ], + "source": [ + "alignment = '../../workflow/test_dataset_avgfp/uref100_avgfp_jhmmer_119.a2m'\n", + "msa, *_ = get_sequences_from_file(alignment)\n", + "print(msa)\n", + "msa_int = seqs2int(msa)\n", + "print(msa_int)\n", + "print(msa_int.shape)\n", + "msa_trimmed, gaps = filt_gaps(msa_int)\n", + "print(msa_trimmed)\n", + "print(msa_trimmed.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "7f7083fa", + "metadata": {}, + "source": [ + "Optimization of MSA probability using L-BFGS and CG." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8cf645de", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "lbfgs::iter S_S fx: 107388.27079431039 gnorm: 1225.1515293096882\n", + "10 / 100 : 60609.63108277181\n", + "20 / 100 : 59947.64281511534\n", + "30 / 100 : 59869.53178623354\n", + "40 / 100 : 59859.941731911356\n", + "50 / 100 : 59857.879720644436\n", + "60 / 100 : 59857.31594100573\n", + "70 / 100 : 59857.15028889863\n", + "80 / 100 : 59857.09504682756\n", + "90 / 100 : 59857.08003258002\n", + "100 / 100 : 59857.07454351859\n", + "lbfgs::iter S_S fx: 107388.27079430803 gnorm: 15388.644842127675\n", + "10 / 100 : 31533.55176383793\n", + "20 / 100 : 31060.84534749837\n", + "30 / 100 : 28888.573386249183\n", + "40 / 100 : 28396.431248611527\n", + "50 / 100 : 28342.800784800973\n", + "60 / 100 : 28254.50087272058\n", + "70 / 100 : 28227.888482528473\n", + "80 / 100 : 28188.92690343361\n", + "90 / 100 : 28171.676017789905\n", + "100 / 100 : 28158.183075984813\n", + "# cg::iter S_S fx: 107388.27079431039 gnorm: 1225.15152930969\n", + "10 / 100 : 95195.27951793215\n", + "20 / 100 : 83096.38618130801\n", + "30 / 100 : 71395.61723248461\n", + "40 / 100 : 67400.86027747851\n", + "50 / 100 : 66669.79690088452\n", + "60 / 100 : 65910.63369043467\n", + "70 / 100 : 65151.95901760452\n", + "80 / 100 : 64395.652024425486\n", + "90 / 100 : 63644.68778084792\n", + "100 / 100 : 62911.661334753415\n", + "# cg::iter S_S fx: 107388.27079430803 gnorm: 15388.644842126007\n", + "10 / 100 : 35864.91179731878\n", + "20 / 100 : 31408.292000635145\n", + "30 / 100 : 31130.30687731922\n", + "40 / 100 : 31071.685376873065\n", + "50 / 100 : 31003.20159049482\n", + "60 / 100 : 30932.300252526038\n", + "70 / 100 : 30859.601343470487\n", + "80 / 100 : 30785.998696509025\n", + "90 / 100 : 30712.083040509264\n", + "100 / 100 : 30637.811822276144\n" + ] + } + ], + "source": [ + "# Limited-memory Broyden–Fletcher–Goldfarb–Shanno 
(L-BFGS) algorithm\n", + "x_opt_1bd = lbfgs(msa_trimmed, eval_v, mode='v', max_iter=100)\n", + "x_opt_2bd = lbfgs(msa_trimmed, eval_vw, mode='vw', max_iter=100)\n", + "# Conjugate Gradient (CG) method\n", + "x_opt_1bd_cg = cg(msa_trimmed, eval_v, mode='v', max_iter=100)\n", + "x_opt_2bd_cg = cg(msa_trimmed, eval_vw, mode='vw', max_iter=100)" + ] + }, + { + "cell_type": "markdown", + "id": "7477db54", + "metadata": {}, + "source": [ + "Loading collected literature sequences, converting to integer sequences, and trimming sequences, to predict fitness and compare to true measured fitness later." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "54954ad2", + "metadata": {}, + "outputs": [], + "source": [ + "wt_sequence = 'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTL' \\\n", + " 'VTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLV' \\\n", + " 'NRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLAD' \\\n", + " 'HYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK'\n", + "variant_fitness_data = pd.read_csv('../../workflow/test_dataset_avgfp/avgfp.csv', sep=';')\n", + "variants = variant_fitness_data.iloc[:2000, 0].values # \"just\" using 2000 variants for faster processing\n", + "fitness_values = variant_fitness_data.iloc[:2000, 1].values\n", + "variants_split = []\n", + "for variant in variants:\n", + " variants_split.append(variant.split('/'))\n", + "variants, fitness_values, sequences = get_seqs_from_var_name(wt_sequence, variants_split, fitness_values)\n", + "sequences_int = seqs2int(sequences)\n", + "sequences_int_trimmed = np.delete(sequences_int, gaps, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "66dc4b88", + "metadata": {}, + "source": [ + "L-BFGS optimization: Rank correlation of measured and predicted sequence fitness." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1119ebba", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AA counts: SignificanceResult(statistic=0.5029247668771557, pvalue=1.0850574512849578e-128)\n", + "1-body term (V): SignificanceResult(statistic=0.6585532465117758, pvalue=5.395952272751471e-249)\n", + "2-body term (VW): SignificanceResult(statistic=0.718349826225515, pvalue=3.4027793e-317)\n" + ] + } + ], + "source": [ + "y_pred_1bd = oh_1bd_predict(sequences_int_trimmed, x_opt_1bd)\n", + "y_pred_2bd = oh_2bd_predict(sequences_int_trimmed, x_opt_2bd)\n", + "y_pred_aac = aa_count_predict(msa_trimmed, sequences_int_trimmed)\n", + "\n", + "# 1D-sequence encodings for further machine learning tasks\n", + "#x_pred_1bd = oh_1bd_1d_encode(sequences_int_trimmed, x_opt_1bd)\n", + "#x_pred_2bd = oh_2bd_1d_encode(sequences_int_trimmed, x_opt_2bd)\n", + "\n", + "print(f\"{'AA counts:':<18}\", spearmanr(fitness_values, y_pred_aac))\n", + "print(f\"{'1-body term (V):':<18}\", spearmanr(fitness_values, y_pred_1bd))\n", + "print(f\"{'2-body term (VW):':<18}\", spearmanr(fitness_values, y_pred_2bd))" + ] + }, + { + "cell_type": "markdown", + "id": "61fab7bc", + "metadata": {}, + "source": [ + "CG optimization: Rank correlation of measured and predicted sequence fitness." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3cb353d5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AA counts: SignificanceResult(statistic=0.5029247668771557, pvalue=1.0850574512849578e-128)\n", + "1-body term (V): SignificanceResult(statistic=0.5459010681666135, pvalue=9.995612748011931e-156)\n", + "2-body term (VW): SignificanceResult(statistic=0.6943000690360119, pvalue=9.57615754836693e-288)\n" + ] + } + ], + "source": [ + "y_pred_1bd = oh_1bd_predict(sequences_int_trimmed, x_opt_1bd_cg)\n", + "y_pred_2bd = oh_2bd_predict(sequences_int_trimmed, x_opt_2bd_cg)\n", + "y_pred_aac = aa_count_predict(msa_trimmed, sequences_int_trimmed)\n", + "\n", + "# 1D-sequence encodings for further machine learning tasks\n", + "#x_pred_1bd = oh_1bd_1d_encode(sequences_int_trimmed, x_opt_1bd)\n", + "#x_pred_2bd = oh_2bd_1d_encode(sequences_int_trimmed, x_opt_2bd)\n", + "\n", + "print(f\"{'AA counts:':<18}\", spearmanr(fitness_values, y_pred_aac))\n", + "print(f\"{'1-body term (V):':<18}\", spearmanr(fitness_values, y_pred_1bd))\n", + "print(f\"{'2-body term (VW):':<18}\", spearmanr(fitness_values, y_pred_2bd))" + ] + }, + { + "cell_type": "markdown", + "id": "a7249221", + "metadata": {}, + "source": [ + "Notebook end." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pypef_dev", + "language": "python", + "name": "pypef_dev" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/workflow/run_cli_tests_linux.sh b/workflow/run_cli_tests_linux.sh index 2db1b9a..81dcbf5 100644 --- a/workflow/run_cli_tests_linux.sh +++ b/workflow/run_cli_tests_linux.sh @@ -1,476 +1,483 @@ -#!/bin/bash - -### Bash script for testing some PyPEF CLI commands -### based on the two datasets provided (ANEH and avGFP) -printf 'For successful running, following files are required:\n\nin test_dataset_aneh/\n\tSequence_WT_ANEH.fasta\n\t37_ANEH_variants.csv -\tANEH_jhmmer.a2m\n\tANEH_72.6.params (generated using PLMC or dowloaded from https://github.com/niklases/PyPEF/blob/main/workflow/test_dataset_aneh/ANEH_72.6.params)\n -in test_dataset_avgfp/\n\tP42212_F64L.fasta\n\tavGFP.csv\n\turef100_avgfp_jhmmer_119.a2m -\turef100_avgfp_jhmmer_119_plmc_42.6.params (generated using PLMC or dowloaded from https://github.com/niklases/PyPEF/blob/main/workflow/test_dataset_avgfp/uref100_avgfp_jhmmer_119_plmc_42.6.params)\n\n' - -set -x # echo on -set -e # exit on (PyPEF) errors -export PS4='+(Line ${LINENO}): ' # echo script line numbers - -### RUN ME WITH -### $ ./run_cli_tests_linux.sh # printing STDOUT and STDERR to terminal -### $ ./run_cli_tests_linux.sh &> test_cli_run.log # writing STDOUT and STDERR to log file - -### if using downloaded/locally stored pypef .py files: -############### CHANGE THIS PATHS AND USED THREADS, REQUIRES PYTHON ENVIRONMENT WITH PRE-INSTALLED MODULES ############### -export PYTHONPATH=${PYTHONPATH}:/path/to/pypef-main # -pypef='python3 /path/to/pypef-main/pypef/main.py' # # 
-########################################################################################################################## -### else just use pip-installed pypef version (uncomment): # -#pypef=pypef # -########################################################################################################################## -threads=16 # -########################################################################################################################## - -### threads=1 shows progress bar where possible -### CV-based mlp and rf regression option optimization take a long time and related testing commands are commented out/not included herein - -### Pure ML (and some hybrid model) tests on ANEH dataset -cd 'test_dataset_aneh' -####################################################################### -echo - -$pypef --version -echo -$pypef -h -echo -$pypef mklsts -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -echo - -$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor pls -echo -$pypef ml --show -echo -$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor pls_loocv -echo -$pypef ml --show -echo -$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor ridge -echo -$pypef ml --show -echo -$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor lasso -echo -$pypef ml --show -echo -$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor elasticnet -echo -$pypef ml --show -echo -#$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor mlp -#$pypef ml --show -#$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor rf -#$pypef ml --show - -$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor pls --threads $threads -echo -$pypef ml --show -echo -$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor pls_loocv --threads $threads -echo -$pypef ml --show -echo -$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor ridge --threads $threads -echo -$pypef ml --show -echo -$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor lasso --threads $threads -echo -$pypef ml --show -echo 
-$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor elasticnet --threads $threads -echo -$pypef ml --show -echo - -$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor pls --nofft --threads $threads -echo -$pypef ml --show -echo -$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor pls_loocv --nofft --threads $threads -echo -$pypef ml --show -echo -$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor ridge --nofft --threads $threads -echo -$pypef ml --show -echo -$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor lasso --nofft --threads $threads -echo -$pypef ml --show -echo -$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor elasticnet --nofft --threads $threads -echo -$pypef ml --show -echo - -$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params ANEH_72.6.params --threads $threads -echo -$pypef ml --show -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls_loocv --params ANEH_72.6.params --threads $threads -echo -$pypef ml --show -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor ridge --params ANEH_72.6.params --threads $threads -echo -$pypef ml --show -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor lasso --params ANEH_72.6.params --threads $threads -echo -$pypef ml --show -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor elasticnet --params ANEH_72.6.params --threads $threads -echo -$pypef ml --show -echo - -$pypef param_inference --msa ANEH_jhmmer.a2m --opt_iter 100 -echo -$pypef save_msa_info --msa ANEH_jhmmer.a2m -w Sequence_WT_ANEH.fasta --opt_iter 100 -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params GREMLIN -echo -$pypef ml --show -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls_loocv --params GREMLIN -echo -$pypef ml --show -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor ridge --params GREMLIN -echo -$pypef ml --show -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor lasso --params GREMLIN -echo -$pypef ml --show -echo -$pypef ml -e dca -l LS.fasl -t 
TS.fasl --regressor elasticnet --params GREMLIN -echo -$pypef ml --show -echo - -$pypef ml -e aaidx -m FAUJ880104 -t TS.fasl -echo -$pypef ml -e onehot -m ONEHOT -t TS.fasl -echo -$pypef ml -e dca -m MLplmc -t TS.fasl --params ANEH_72.6.params --threads $threads -echo -$pypef ml -e aaidx -m FAUJ880104 -t TS.fasl --label -echo -$pypef ml -e onehot -m ONEHOT -t TS.fasl --label -echo -$pypef ml -e dca -m MLplmc -t TS.fasl --label --params ANEH_72.6.params --threads $threads -echo - -$pypef mkps -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -echo -$pypef ml -e aaidx -m FAUJ880104 -p 37_ANEH_variants_prediction_set.fasta -echo -$pypef ml -e onehot -m ONEHOT -p 37_ANEH_variants_prediction_set.fasta -echo -$pypef ml -e dca -m MLplmc -p 37_ANEH_variants_prediction_set.fasta --params ANEH_72.6.params --threads $threads -echo -$pypef ml -e dca -m MLgremlin -p 37_ANEH_variants_prediction_set.fasta --params GREMLIN -echo - -$pypef mkps -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta --drecomb --trecomb --qarecomb --qirecomb --ddiverse -echo -$pypef ml -e aaidx -m FAUJ880104 --pmult --drecomb --trecomb --qarecomb --qirecomb --ddiverse -echo -$pypef ml -e onehot -m ONEHOT --pmult --drecomb --trecomb --qarecomb --qirecomb --ddiverse -echo -$pypef ml -e dca -m MLplmc --params ANEH_72.6.params --pmult --drecomb --trecomb --qarecomb --qirecomb --ddiverse --threads $threads -echo -$pypef ml -e dca -m MLgremlin --params GREMLIN --pmult --drecomb --trecomb --qarecomb --qirecomb --ddiverse -echo - -$pypef ml -e aaidx directevo -m FAUJ880104 -w Sequence_WT_ANEH.fasta --y_wt -1.5 --negative -echo -$pypef ml -e onehot directevo -m ONEHOT -w Sequence_WT_ANEH.fasta --y_wt -1.5 --negative -echo -$pypef ml -e dca directevo -m MLplmc -w Sequence_WT_ANEH.fasta --y_wt -1.5 --negative --params ANEH_72.6.params -echo -$pypef ml -e dca directevo -m MLgremlin -w Sequence_WT_ANEH.fasta --y_wt -1.5 --negative --params GREMLIN -echo -$pypef ml -e aaidx directevo -m FAUJ880104 -w 
Sequence_WT_ANEH.fasta -y -1.5 --numiter 10 --numtraj 8 --negative -echo -$pypef ml -e onehot directevo -m ONEHOT -w Sequence_WT_ANEH.fasta -y -1.5 --numiter 10 --numtraj 8 --negative -echo -$pypef ml -e dca directevo -m MLplmc -w Sequence_WT_ANEH.fasta -y -1.5 --numiter 10 --numtraj 8 --negative --params ANEH_72.6.params -echo -$pypef ml -e dca directevo -m MLgremlin -w Sequence_WT_ANEH.fasta -y -1.5 --numiter 10 --numtraj 8 --negative --params GREMLIN -echo -$pypef ml -e aaidx directevo -m FAUJ880104 -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -y -1.5 --temp 0.1 --usecsv --csvaa --negative -echo -$pypef ml -e onehot directevo -m ONEHOT -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -y -1.5 --temp 0.1 --usecsv --csvaa --negative -echo -$pypef ml -e dca directevo -m MLplmc -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -y -1.5 --temp 0.1 --usecsv --csvaa --negative --params ANEH_72.6.params -echo -$pypef ml -e dca directevo -m MLgremlin -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -y -1.5 --temp 0.1 --usecsv --csvaa --negative --params GREMLIN -echo - -$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor pls --nofft -echo -$pypef ml --show -echo -$pypef ml -e aaidx directevo -m WEBA780101 -w Sequence_WT_ANEH.fasta -y -1.5 --negative --nofft -echo - -$pypef encode -i 37_ANEH_variants.csv -e aaidx -m FAUJ880104 -w Sequence_WT_ANEH.fasta -echo -$pypef encode -i 37_ANEH_variants.csv -e onehot -w Sequence_WT_ANEH.fasta -echo -$pypef encode -i 37_ANEH_variants.csv -e dca -w Sequence_WT_ANEH.fasta --params ANEH_72.6.params --threads $threads -echo -mv 37_ANEH_variants_dca_encoded.csv 37_ANEH_variants_plmc_dca_encoded.csv -echo -$pypef encode -i 37_ANEH_variants.csv -e dca -w Sequence_WT_ANEH.fasta --params GREMLIN -echo -mv 37_ANEH_variants_dca_encoded.csv 37_ANEH_variants_gremlin_dca_encoded.csv -echo - -$pypef ml low_n -i 37_ANEH_variants_aaidx_encoded.csv -echo -$pypef ml low_n -i 37_ANEH_variants_onehot_encoded.csv -echo -$pypef ml low_n -i 
37_ANEH_variants_plmc_dca_encoded.csv -echo -$pypef ml low_n -i 37_ANEH_variants_gremlin_dca_encoded.csv -echo - -$pypef ml extrapolation -i 37_ANEH_variants_aaidx_encoded.csv -echo -$pypef ml extrapolation -i 37_ANEH_variants_onehot_encoded.csv -echo -$pypef ml extrapolation -i 37_ANEH_variants_plmc_dca_encoded.csv -echo -$pypef ml extrapolation -i 37_ANEH_variants_gremlin_dca_encoded.csv -echo - -$pypef ml extrapolation -i 37_ANEH_variants_aaidx_encoded.csv --conc -echo -$pypef ml extrapolation -i 37_ANEH_variants_onehot_encoded.csv --conc -echo -$pypef ml extrapolation -i 37_ANEH_variants_plmc_dca_encoded.csv --conc -echo -$pypef ml extrapolation -i 37_ANEH_variants_gremlin_dca_encoded.csv --conc -echo - -$pypef hybrid train_and_save -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta --params ANEH_72.6.params --fit_size 0.66 --threads $threads -echo -$pypef hybrid -l LS.fasl -t TS.fasl --params ANEH_72.6.params --threads $threads -echo -$pypef hybrid -m HYBRIDplmc -t TS.fasl --params ANEH_72.6.params --threads $threads -echo - -$pypef hybrid train_and_save -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta --params GREMLIN --fit_size 0.66 -echo -$pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN -echo -$pypef hybrid -m HYBRIDgremlin -t TS.fasl --params GREMLIN -echo - -$pypef mkps -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -echo -$pypef hybrid -m HYBRIDplmc -p 37_ANEH_variants_prediction_set.fasta --params ANEH_72.6.params --threads $threads -echo -$pypef hybrid -m HYBRIDplmc --params ANEH_72.6.params --pmult --drecomb --threads $threads -echo - -$pypef hybrid directevo -m HYBRIDplmc -w Sequence_WT_ANEH.fasta --y_wt -1.5 --negative --params ANEH_72.6.params -echo -$pypef hybrid directevo -m HYBRIDplmc -w Sequence_WT_ANEH.fasta -y -1.5 --numiter 10 --numtraj 8 --negative --params ANEH_72.6.params -echo -$pypef hybrid directevo -m HYBRIDplmc -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -y -1.5 --temp 0.1 --usecsv --csvaa --negative --params 
ANEH_72.6.params -echo - -$pypef encode -i 37_ANEH_variants.csv -e dca -w Sequence_WT_ANEH.fasta --params ANEH_72.6.params --threads $threads -echo -$pypef hybrid low_n -i 37_ANEH_variants_dca_encoded.csv -echo -$pypef hybrid extrapolation -i 37_ANEH_variants_dca_encoded.csv -echo -$pypef hybrid extrapolation -i 37_ANEH_variants_dca_encoded.csv --conc -echo - - -### Hybrid model (and some pure ML and pure DCA) tests on avGFP dataset -cd '../test_dataset_avgfp' -####################################################################### -echo - -$pypef mklsts -i avGFP.csv -w P42212_F64L.fasta -echo -$pypef param_inference --msa uref100_avgfp_jhmmer_119.a2m --opt_iter 100 -echo -# Check MSA coevolution info -$pypef save_msa_info --msa uref100_avgfp_jhmmer_119.a2m -w P42212_F64L.fasta --opt_iter 100 -### -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --params GREMLIN -echo -$pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN -echo -# Similar to line above -$pypef hybrid -t TS.fasl --params GREMLIN -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --params GREMLIN -echo -$pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN -echo -$pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN -echo -# pure statistical -$pypef hybrid -t TS.fasl --params GREMLIN -echo - - -# using .params file -$pypef ml -e dca -l LS.fasl -t TS.fasl --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads -echo -# ML LS/TS -$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params GREMLIN -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params uref100_avgfp_jhmmer_119_plmc_42.6.params -echo -# Transforming .params file to DCAEncoding and using DCAEncoding Pickle; output file: Pickles/MLplmc. -# That means using uref100_avgfp_jhmmer_119_plmc_42.6.params or PLMC as params file is identical. 
-$pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params PLMC --threads $threads -echo -# ml only TS -$pypef ml -e dca -m MLplmc -t TS.fasl --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads -echo -$pypef ml -e dca -m MLgremlin -t TS.fasl --params GREMLIN --threads $threads -echo - - -echo -$pypef ml -e dca -l LS.fasl -t TS.fasl --params PLMC --threads $threads -echo -$pypef hybrid -l LS.fasl -t TS.fasl --params PLMC --threads $threads -echo -$pypef hybrid -m PLMC -t TS.fasl --params PLMC --threads $threads -echo - -# pure statistical -$pypef hybrid -t TS.fasl --params PLMC --threads $threads -echo -$pypef hybrid -t TS.fasl --params GREMLIN -echo -$pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN -echo -$pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN -echo -$pypef save_msa_info --msa uref100_avgfp_jhmmer_119.a2m -w P42212_F64L.fasta --opt_iter 100 -# train and save only for hybrid -$pypef hybrid train_and_save -i avGFP.csv --params GREMLIN --wt P42212_F64L.fasta -echo -# Encode CSV -$pypef encode -e dca -i avGFP.csv --wt P42212_F64L.fasta --params GREMLIN -echo -$pypef encode --encoding dca -i avGFP.csv --wt P42212_F64L.fasta --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads 12 - -#Extrapolation -echo -$pypef ml low_n -i avGFP_dca_encoded.csv --regressor ridge -echo -$pypef ml extrapolation -i avGFP_dca_encoded.csv --regressor ridge -echo -$pypef ml extrapolation -i avGFP_dca_encoded.csv --conc --regressor ridge -echo - -# Direct Evo -$pypef ml -e dca directevo -m MLgremlin --wt P42212_F64L.fasta --params GREMLIN -echo -$pypef ml -e dca directevo -m MLplmc --wt P42212_F64L.fasta --params PLMC -echo -$pypef hybrid directevo -m GREMLIN --wt P42212_F64L.fasta --params GREMLIN -echo -$pypef hybrid directevo -m PLMC --wt P42212_F64L.fasta --params PLMC -echo -$pypef hybrid directevo --wt P42212_F64L.fasta --params GREMLIN -echo -$pypef hybrid 
directevo --wt P42212_F64L.fasta --params PLMC -echo - -$pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN -echo -$pypef hybrid -m HYBRIDgremlin -t TS.fasl --params GREMLIN -echo - -### Similar to old CLI run test from here - -$pypef encode -i avGFP.csv -e dca -w P42212_F64L.fasta --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads -echo -$pypef encode -i avGFP.csv -e onehot -w P42212_F64L.fasta -echo -$pypef ml -e aaidx -l LS.fasl -t TS.fasl --threads $threads -echo -$pypef ml --show -echo -$pypef encode -i avGFP.csv -e aaidx -m GEIM800103 -w P42212_F64L.fasta -echo - -$pypef hybrid train_and_save -i avGFP.csv --params uref100_avgfp_jhmmer_119_plmc_42.6.params --fit_size 0.66 -w P42212_F64L.fasta --threads $threads -echo -$pypef hybrid -l LS.fasl -t TS.fasl --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads -echo -$pypef hybrid -m HYBRIDplmc -t TS.fasl --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads -echo - -# No training set given -$pypef hybrid -t TS.fasl --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads -echo -$pypef ml -e dca -m MLplmc -t TS.fasl --params uref100_avgfp_jhmmer_119_plmc_42.6.params --label --threads $threads -echo - -$pypef mkps -i avGFP.csv -w P42212_F64L.fasta -echo -$pypef hybrid -m HYBRIDplmc -p avGFP_prediction_set.fasta --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads -echo -$pypef mkps -i avGFP.csv -w P42212_F64L.fasta --drecomb -#$pypef hybrid -m HYBRID --params uref100_avgfp_jhmmer_119_plmc_42.6.params --pmult --drecomb --threads $threads # many single variants for recombination, takes too long -echo - -$pypef hybrid directevo -m HYBRIDplmc -w P42212_F64L.fasta --params uref100_avgfp_jhmmer_119_plmc_42.6.params -echo -$pypef hybrid directevo -m HYBRIDplmc -w P42212_F64L.fasta --numiter 10 --numtraj 8 --params uref100_avgfp_jhmmer_119_plmc_42.6.params -echo -$pypef hybrid directevo -m HYBRIDplmc -i avGFP.csv -w P42212_F64L.fasta --temp 
0.1 --usecsv --csvaa --params uref100_avgfp_jhmmer_119_plmc_42.6.params - -$pypef hybrid low_n -i avGFP_dca_encoded.csv -echo -$pypef hybrid extrapolation -i avGFP_dca_encoded.csv -echo -$pypef hybrid extrapolation -i avGFP_dca_encoded.csv --conc -echo - -$pypef ml low_n -i avGFP_dca_encoded.csv --regressor ridge -echo -$pypef ml extrapolation -i avGFP_dca_encoded.csv --regressor ridge -echo -$pypef ml extrapolation -i avGFP_dca_encoded.csv --conc --regressor ridge -echo - -$pypef ml low_n -i avGFP_onehot_encoded.csv --regressor pls -echo -$pypef ml extrapolation -i avGFP_onehot_encoded.csv --regressor pls -echo -$pypef ml extrapolation -i avGFP_onehot_encoded.csv --conc --regressor pls -echo - -$pypef ml low_n -i avGFP_aaidx_encoded.csv --regressor ridge -echo -$pypef ml extrapolation -i avGFP_aaidx_encoded.csv --regressor ridge -echo -$pypef ml extrapolation -i avGFP_aaidx_encoded.csv --conc --regressor ridge -echo - +#!/bin/bash + +### Bash script for testing some PyPEF CLI commands +### based on the two datasets provided (ANEH and avGFP) +### REQUIRES MINICONDA OR ANACONDA BEING INSTALLED +printf 'For successful running, following files are required:\n\nin test_dataset_aneh/\n\tSequence_WT_ANEH.fasta\n\t37_ANEH_variants.csv +\tANEH_jhmmer.a2m\n\tANEH_72.6.params (generated using PLMC or dowloaded from https://github.com/niklases/PyPEF/blob/main/workflow/test_dataset_aneh/ANEH_72.6.params)\n +in test_dataset_avgfp/\n\tP42212_F64L.fasta\n\tavGFP.csv\n\turef100_avgfp_jhmmer_119.a2m +\turef100_avgfp_jhmmer_119_plmc_42.6.params (generated using PLMC or dowloaded from https://github.com/niklases/PyPEF/blob/main/workflow/test_dataset_avgfp/uref100_avgfp_jhmmer_119_plmc_42.6.params)\n\n' + +set -x # echo on +set -e # exit on (PyPEF) errors +export PS4='+(Line ${LINENO}): ' # echo script line numbers + +### RUN ME WITH +### $ ./run_cli_tests_linux.sh # printing STDOUT and STDERR to terminal +### $ ./run_cli_tests_linux.sh &> test_cli_run.log # writing STDOUT and STDERR 
to log file + +### if using downloaded/locally stored pypef .py files: +########################################################################################################################## +conda env remove -n pypef # +conda create -n pypef python=3.10 -y # +eval "$(conda shell.bash hook)" # +conda activate pypef # +python -m pip install -r ../requirements.txt # +path=$( echo ${PWD%/*} ) # +export PYTHONPATH=${PYTHONPATH}:$path # +pypef='python3 '$path'/pypef/main.py' # # +########################################################################################################################## +### else just use pip-installed pypef version (uncomment): # +#pypef=pypef # +########################################################################################################################## +threads=12 # +########################################################################################################################## + +### threads=1 shows progress bar where possible +### CV-based mlp and rf regression option optimization take a long time and related testing commands are commented out/not included herein + +### Pure ML (and some hybrid model) tests on ANEH dataset +cd 'test_dataset_aneh' +####################################################################### +echo + +$pypef --version +echo +$pypef -h +echo +$pypef mklsts -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta +echo + +$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor pls +echo +$pypef ml --show +echo +$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor pls_loocv +echo +$pypef ml --show +echo +$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor ridge +echo +$pypef ml --show +echo +$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor lasso +echo +$pypef ml --show +echo +$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor elasticnet +echo +$pypef ml --show +echo +#$pypef ml -e onehot -l LS.fasl -t TS.fasl --regressor mlp +#$pypef ml --show +#$pypef ml -e onehot -l LS.fasl -t 
TS.fasl --regressor rf +#$pypef ml --show + +$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor pls --threads $threads +echo +$pypef ml --show +echo +$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor pls_loocv --threads $threads +echo +$pypef ml --show +echo +$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor ridge --threads $threads +echo +$pypef ml --show +echo +$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor lasso --threads $threads +echo +$pypef ml --show +echo +$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor elasticnet --threads $threads +echo +$pypef ml --show +echo + +$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor pls --nofft --threads $threads +echo +$pypef ml --show +echo +$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor pls_loocv --nofft --threads $threads +echo +$pypef ml --show +echo +$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor ridge --nofft --threads $threads +echo +$pypef ml --show +echo +$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor lasso --nofft --threads $threads +echo +$pypef ml --show +echo +$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor elasticnet --nofft --threads $threads +echo +$pypef ml --show +echo + +$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params ANEH_72.6.params --threads $threads +echo +$pypef ml --show +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls_loocv --params ANEH_72.6.params --threads $threads +echo +$pypef ml --show +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor ridge --params ANEH_72.6.params --threads $threads +echo +$pypef ml --show +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor lasso --params ANEH_72.6.params --threads $threads +echo +$pypef ml --show +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor elasticnet --params ANEH_72.6.params --threads $threads +echo +$pypef ml --show +echo + +$pypef param_inference --msa ANEH_jhmmer.a2m --opt_iter 100 +echo +$pypef save_msa_info --msa ANEH_jhmmer.a2m -w 
Sequence_WT_ANEH.fasta --opt_iter 100 +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params GREMLIN +echo +$pypef ml --show +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls_loocv --params GREMLIN +echo +$pypef ml --show +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor ridge --params GREMLIN +echo +$pypef ml --show +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor lasso --params GREMLIN +echo +$pypef ml --show +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor elasticnet --params GREMLIN +echo +$pypef ml --show +echo + +$pypef ml -e aaidx -m FAUJ880104 -t TS.fasl +echo +$pypef ml -e onehot -m ONEHOT -t TS.fasl +echo +$pypef ml -e dca -m MLplmc -t TS.fasl --params ANEH_72.6.params --threads $threads +echo +$pypef ml -e aaidx -m FAUJ880104 -t TS.fasl --label +echo +$pypef ml -e onehot -m ONEHOT -t TS.fasl --label +echo +$pypef ml -e dca -m MLplmc -t TS.fasl --label --params ANEH_72.6.params --threads $threads +echo + +$pypef mkps -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta +echo +$pypef ml -e aaidx -m FAUJ880104 -p 37_ANEH_variants_prediction_set.fasta +echo +$pypef ml -e onehot -m ONEHOT -p 37_ANEH_variants_prediction_set.fasta +echo +$pypef ml -e dca -m MLplmc -p 37_ANEH_variants_prediction_set.fasta --params ANEH_72.6.params --threads $threads +echo +$pypef ml -e dca -m MLgremlin -p 37_ANEH_variants_prediction_set.fasta --params GREMLIN +echo + +$pypef mkps -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta --drecomb --trecomb --qarecomb --qirecomb --ddiverse +echo +$pypef ml -e aaidx -m FAUJ880104 --pmult --drecomb --trecomb --qarecomb --qirecomb --ddiverse +echo +$pypef ml -e onehot -m ONEHOT --pmult --drecomb --trecomb --qarecomb --qirecomb --ddiverse +echo +$pypef ml -e dca -m MLplmc --params ANEH_72.6.params --pmult --drecomb --trecomb --qarecomb --qirecomb --ddiverse --threads $threads +echo +$pypef ml -e dca -m MLgremlin --params GREMLIN --pmult --drecomb --trecomb --qarecomb --qirecomb --ddiverse 
+echo + +$pypef ml -e aaidx directevo -m FAUJ880104 -w Sequence_WT_ANEH.fasta --y_wt -1.5 --negative +echo +$pypef ml -e onehot directevo -m ONEHOT -w Sequence_WT_ANEH.fasta --y_wt -1.5 --negative +echo +$pypef ml -e dca directevo -m MLplmc -w Sequence_WT_ANEH.fasta --y_wt -1.5 --negative --params ANEH_72.6.params +echo +$pypef ml -e dca directevo -m MLgremlin -w Sequence_WT_ANEH.fasta --y_wt -1.5 --negative --params GREMLIN +echo +$pypef ml -e aaidx directevo -m FAUJ880104 -w Sequence_WT_ANEH.fasta -y -1.5 --numiter 10 --numtraj 8 --negative +echo +$pypef ml -e onehot directevo -m ONEHOT -w Sequence_WT_ANEH.fasta -y -1.5 --numiter 10 --numtraj 8 --negative +echo +$pypef ml -e dca directevo -m MLplmc -w Sequence_WT_ANEH.fasta -y -1.5 --numiter 10 --numtraj 8 --negative --params ANEH_72.6.params +echo +$pypef ml -e dca directevo -m MLgremlin -w Sequence_WT_ANEH.fasta -y -1.5 --numiter 10 --numtraj 8 --negative --params GREMLIN +echo +$pypef ml -e aaidx directevo -m FAUJ880104 -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -y -1.5 --temp 0.1 --usecsv --csvaa --negative +echo +$pypef ml -e onehot directevo -m ONEHOT -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -y -1.5 --temp 0.1 --usecsv --csvaa --negative +echo +$pypef ml -e dca directevo -m MLplmc -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -y -1.5 --temp 0.1 --usecsv --csvaa --negative --params ANEH_72.6.params +echo +$pypef ml -e dca directevo -m MLgremlin -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -y -1.5 --temp 0.1 --usecsv --csvaa --negative --params GREMLIN +echo + +$pypef ml -e aaidx -l LS.fasl -t TS.fasl --regressor pls --nofft +echo +$pypef ml --show +echo +$pypef ml -e aaidx directevo -m WEBA780101 -w Sequence_WT_ANEH.fasta -y -1.5 --negative --nofft +echo + +$pypef encode -i 37_ANEH_variants.csv -e aaidx -m FAUJ880104 -w Sequence_WT_ANEH.fasta +echo +$pypef encode -i 37_ANEH_variants.csv -e onehot -w Sequence_WT_ANEH.fasta +echo +$pypef encode -i 37_ANEH_variants.csv -e dca -w 
Sequence_WT_ANEH.fasta --params ANEH_72.6.params --threads $threads +echo +mv 37_ANEH_variants_dca_encoded.csv 37_ANEH_variants_plmc_dca_encoded.csv +echo +$pypef encode -i 37_ANEH_variants.csv -e dca -w Sequence_WT_ANEH.fasta --params GREMLIN +echo +mv 37_ANEH_variants_dca_encoded.csv 37_ANEH_variants_gremlin_dca_encoded.csv +echo + +$pypef ml low_n -i 37_ANEH_variants_aaidx_encoded.csv +echo +$pypef ml low_n -i 37_ANEH_variants_onehot_encoded.csv +echo +$pypef ml low_n -i 37_ANEH_variants_plmc_dca_encoded.csv +echo +$pypef ml low_n -i 37_ANEH_variants_gremlin_dca_encoded.csv +echo + +$pypef ml extrapolation -i 37_ANEH_variants_aaidx_encoded.csv +echo +$pypef ml extrapolation -i 37_ANEH_variants_onehot_encoded.csv +echo +$pypef ml extrapolation -i 37_ANEH_variants_plmc_dca_encoded.csv +echo +$pypef ml extrapolation -i 37_ANEH_variants_gremlin_dca_encoded.csv +echo + +$pypef ml extrapolation -i 37_ANEH_variants_aaidx_encoded.csv --conc +echo +$pypef ml extrapolation -i 37_ANEH_variants_onehot_encoded.csv --conc +echo +$pypef ml extrapolation -i 37_ANEH_variants_plmc_dca_encoded.csv --conc +echo +$pypef ml extrapolation -i 37_ANEH_variants_gremlin_dca_encoded.csv --conc +echo + +$pypef hybrid train_and_save -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta --params ANEH_72.6.params --fit_size 0.66 --threads $threads +echo +$pypef hybrid -l LS.fasl -t TS.fasl --params ANEH_72.6.params --threads $threads +echo +$pypef hybrid -m HYBRIDplmc -t TS.fasl --params ANEH_72.6.params --threads $threads +echo + +$pypef hybrid train_and_save -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta --params GREMLIN --fit_size 0.66 +echo +$pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN +echo +$pypef hybrid -m HYBRIDgremlin -t TS.fasl --params GREMLIN +echo + +$pypef mkps -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta +echo +$pypef hybrid -m HYBRIDplmc -p 37_ANEH_variants_prediction_set.fasta --params ANEH_72.6.params --threads $threads +echo +$pypef hybrid -m HYBRIDplmc --params 
ANEH_72.6.params --pmult --drecomb --threads $threads +echo + +$pypef hybrid directevo -m HYBRIDplmc -w Sequence_WT_ANEH.fasta --y_wt -1.5 --negative --params ANEH_72.6.params +echo +$pypef hybrid directevo -m HYBRIDplmc -w Sequence_WT_ANEH.fasta -y -1.5 --numiter 10 --numtraj 8 --negative --params ANEH_72.6.params +echo +$pypef hybrid directevo -m HYBRIDplmc -i 37_ANEH_variants.csv -w Sequence_WT_ANEH.fasta -y -1.5 --temp 0.1 --usecsv --csvaa --negative --params ANEH_72.6.params +echo + +$pypef encode -i 37_ANEH_variants.csv -e dca -w Sequence_WT_ANEH.fasta --params ANEH_72.6.params --threads $threads +echo +$pypef hybrid low_n -i 37_ANEH_variants_dca_encoded.csv +echo +$pypef hybrid extrapolation -i 37_ANEH_variants_dca_encoded.csv +echo +$pypef hybrid extrapolation -i 37_ANEH_variants_dca_encoded.csv --conc +echo + + +### Hybrid model (and some pure ML and pure DCA) tests on avGFP dataset +cd '../test_dataset_avgfp' +####################################################################### +echo + +$pypef mklsts -i avGFP.csv -w P42212_F64L.fasta +echo +$pypef param_inference --msa uref100_avgfp_jhmmer_119.a2m --opt_iter 100 +echo +# Check MSA coevolution info +$pypef save_msa_info --msa uref100_avgfp_jhmmer_119.a2m -w P42212_F64L.fasta --opt_iter 100 +### +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --params GREMLIN +echo +$pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN +echo +# Similar to line above +$pypef hybrid -t TS.fasl --params GREMLIN +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --params GREMLIN +echo +$pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN +echo +$pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN +echo +# pure statistical +$pypef hybrid -t TS.fasl --params GREMLIN +echo + + +# using .params file +$pypef ml -e dca -l LS.fasl -t TS.fasl --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads +echo +# ML LS/TS +$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params GREMLIN +echo +$pypef ml -e dca -l LS.fasl -t 
TS.fasl --regressor pls --params uref100_avgfp_jhmmer_119_plmc_42.6.params +echo +# Transforming .params file to DCAEncoding and using DCAEncoding Pickle; output file: Pickles/MLplmc. +# That means using uref100_avgfp_jhmmer_119_plmc_42.6.params or PLMC as params file is identical. +$pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params PLMC --threads $threads +echo +# ML: test set only (saved models passed via -m, no learning set) +$pypef ml -e dca -m MLplmc -t TS.fasl --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads +echo +$pypef ml -e dca -m MLgremlin -t TS.fasl --params GREMLIN --threads $threads +echo + + +echo +$pypef ml -e dca -l LS.fasl -t TS.fasl --params PLMC --threads $threads +echo +$pypef hybrid -l LS.fasl -t TS.fasl --params PLMC --threads $threads +echo +$pypef hybrid -m PLMC -t TS.fasl --params PLMC --threads $threads +echo + +# pure statistical +$pypef hybrid -t TS.fasl --params PLMC --threads $threads +echo +$pypef hybrid -t TS.fasl --params GREMLIN +echo +$pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN +echo +$pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN +echo +$pypef save_msa_info --msa uref100_avgfp_jhmmer_119.a2m -w P42212_F64L.fasta --opt_iter 100 +# train and save only for hybrid +$pypef hybrid train_and_save -i avGFP.csv --params GREMLIN --wt P42212_F64L.fasta +echo +# Encode CSV +$pypef encode -e dca -i avGFP.csv --wt P42212_F64L.fasta --params GREMLIN +echo +$pypef encode --encoding dca -i avGFP.csv --wt P42212_F64L.fasta --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads + +# Low-N and extrapolation +echo +$pypef ml low_n -i avGFP_dca_encoded.csv --regressor ridge +echo +$pypef ml extrapolation -i avGFP_dca_encoded.csv --regressor ridge +echo +$pypef ml extrapolation -i avGFP_dca_encoded.csv --conc --regressor ridge +echo + +# Direct Evo +$pypef ml -e dca directevo -m MLgremlin --wt P42212_F64L.fasta --params GREMLIN +echo +$pypef ml -e dca directevo -m MLplmc --wt 
P42212_F64L.fasta --params PLMC +echo +$pypef hybrid directevo -m GREMLIN --wt P42212_F64L.fasta --params GREMLIN +echo +$pypef hybrid directevo -m PLMC --wt P42212_F64L.fasta --params PLMC +echo +$pypef hybrid directevo --wt P42212_F64L.fasta --params GREMLIN +echo +$pypef hybrid directevo --wt P42212_F64L.fasta --params PLMC +echo + +$pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN +echo +$pypef hybrid -m HYBRIDgremlin -t TS.fasl --params GREMLIN +echo + +### Similar to old CLI run test from here + +$pypef encode -i avGFP.csv -e dca -w P42212_F64L.fasta --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads +echo +$pypef encode -i avGFP.csv -e onehot -w P42212_F64L.fasta +echo +$pypef ml -e aaidx -l LS.fasl -t TS.fasl --threads $threads +echo +$pypef ml --show +echo +$pypef encode -i avGFP.csv -e aaidx -m GEIM800103 -w P42212_F64L.fasta +echo + +$pypef hybrid train_and_save -i avGFP.csv --params uref100_avgfp_jhmmer_119_plmc_42.6.params --fit_size 0.66 -w P42212_F64L.fasta --threads $threads +echo +$pypef hybrid -l LS.fasl -t TS.fasl --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads +echo +$pypef hybrid -m HYBRIDplmc -t TS.fasl --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads +echo + +# No training set given +$pypef hybrid -t TS.fasl --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads +echo +$pypef ml -e dca -m MLplmc -t TS.fasl --params uref100_avgfp_jhmmer_119_plmc_42.6.params --label --threads $threads +echo + +$pypef mkps -i avGFP.csv -w P42212_F64L.fasta +echo +$pypef hybrid -m HYBRIDplmc -p avGFP_prediction_set.fasta --params uref100_avgfp_jhmmer_119_plmc_42.6.params --threads $threads +echo +$pypef mkps -i avGFP.csv -w P42212_F64L.fasta --drecomb +#$pypef hybrid -m HYBRID --params uref100_avgfp_jhmmer_119_plmc_42.6.params --pmult --drecomb --threads $threads # many single variants for recombination, takes too long +echo + +$pypef hybrid directevo -m HYBRIDplmc -w 
P42212_F64L.fasta --params uref100_avgfp_jhmmer_119_plmc_42.6.params +echo +$pypef hybrid directevo -m HYBRIDplmc -w P42212_F64L.fasta --numiter 10 --numtraj 8 --params uref100_avgfp_jhmmer_119_plmc_42.6.params +echo +$pypef hybrid directevo -m HYBRIDplmc -i avGFP.csv -w P42212_F64L.fasta --temp 0.1 --usecsv --csvaa --params uref100_avgfp_jhmmer_119_plmc_42.6.params + +$pypef hybrid low_n -i avGFP_dca_encoded.csv +echo +$pypef hybrid extrapolation -i avGFP_dca_encoded.csv +echo +$pypef hybrid extrapolation -i avGFP_dca_encoded.csv --conc +echo + +$pypef ml low_n -i avGFP_dca_encoded.csv --regressor ridge +echo +$pypef ml extrapolation -i avGFP_dca_encoded.csv --regressor ridge +echo +$pypef ml extrapolation -i avGFP_dca_encoded.csv --conc --regressor ridge +echo + +$pypef ml low_n -i avGFP_onehot_encoded.csv --regressor pls +echo +$pypef ml extrapolation -i avGFP_onehot_encoded.csv --regressor pls +echo +$pypef ml extrapolation -i avGFP_onehot_encoded.csv --conc --regressor pls +echo + +$pypef ml low_n -i avGFP_aaidx_encoded.csv --regressor ridge +echo +$pypef ml extrapolation -i avGFP_aaidx_encoded.csv --regressor ridge +echo +$pypef ml extrapolation -i avGFP_aaidx_encoded.csv --conc --regressor ridge +echo + echo 'All tests finished without error!' 
\ No newline at end of file diff --git a/workflow/run_cli_tests_win.ps1 b/workflow/run_cli_tests_win.ps1 index 3ff7453..fb3dab0 100644 --- a/workflow/run_cli_tests_win.ps1 +++ b/workflow/run_cli_tests_win.ps1 @@ -1,4 +1,6 @@ +### PowerShell script for testing some PyPEF CLI commands ### based on the two datasets provided (ANEH and avGFP) +### REQUIRES MINICONDA OR ANACONDA BEING INSTALLED Write-Host "For successful running, following files are required:`n`nin test_dataset_aneh/ `tSequence_WT_ANEH.fasta`n`t37_ANEH_variants.csv`n`tANEH_jhmmer.a2m `tANEH_72.6.params (generated using PLMC or dowloaded from https://github.com/niklases/PyPEF/blob/main/workflow/test_dataset_aneh/ANEH_72.6.params) @@ -19,14 +21,20 @@ function ExitOnExitCode { if ($LastExitCode) { Write-Host "PyPEF command error; ### $ .\run_cli_tests_win.ps1 # printing STDOUT and STDERR to terminal ### if using downloaded/locally stored pypef .py files: -############### CHANGE THIS PATHS AND USED THREADS, REQUIRES PYTHON ENVIRONMENT WITH PRE-INSTALLED MODULES ############### -$env:PYTHONPATH="C:\path\to\pypef-main" # -function pypef { python C:\path\to\pypef-main\pypef\main.py @args } # +########################################################################################################################## +conda env remove -n pypef # +conda create -n pypef python=3.10 -y # +conda activate pypef # +$path=Get-Location # +$path=Split-Path -Path $path -Parent # +python -m pip install -r $path\requirements.txt # +$env:PYTHONPATH=$path # +function pypef { python $path\pypef\main.py @args } # ########################################################################################################################## ### else just use pip-installed pypef version (uncomment): # #pypef = pypef # ########################################################################################################################## -$threads = 16 # +$threads = 12 # 
########################################################################################################################## ### threads=1 shows progress bar where possible