From a7d6afc13a50d735836b4cd0d9ddeb87b0026970 Mon Sep 17 00:00:00 2001 From: niklases <60261497+niklases@users.noreply.github.com> Date: Fri, 5 Jan 2024 15:17:14 +0100 Subject: [PATCH] Upload v0.3.3 --- .github/workflows/build.yml | 4 +- .gitignore | 4 + .vscode/launch.json | 209 ++ .vscode/settings.json | 3 + README.md | 76 +- pypef/__init__.py | 40 +- pypef/dca/dca_run.py | 286 +-- pypef/dca/gremlin_inference.py | 1300 +++++----- pypef/dca/hybrid_model.py | 2431 ++++++++++--------- pypef/dca/plmc_encoding.py | 1714 ++++++------- pypef/main.py | 890 +++---- pypef/ml/ml_run.py | 440 ++-- pypef/ml/regression.py | 2196 ++++++++--------- pypef/utils/directed_evolution.py | 708 +++--- pypef/utils/learning_test_sets.py | 808 +++--- pypef/utils/low_n_mutation_extrapolation.py | 876 +++---- pypef/utils/performance.py | 120 +- pypef/utils/plot.py | 162 +- pypef/utils/prediction_sets.py | 842 ++++--- pypef/utils/sto2a2m.py | 190 +- pypef/utils/to_file.py | 92 +- pypef/utils/utils_run.py | 704 +++--- pypef/utils/variant_data.py | 920 +++---- scripts/CLI/run_cli_tests_linux.sh | 14 +- scripts/CLI/run_cli_tests_win.ps1 | 16 +- setup.py | 102 +- tests/test_api_functions.py | 61 + 27 files changed, 7798 insertions(+), 7410 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 tests/test_api_functions.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5bbf760..455fb3a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -34,6 +34,6 @@ jobs: run: | # stop the build if there are Python syntax errors or undefined names flake8 ./pypef --count --select=E9,F63,F7,F82 --show-source --statistics - - name: Export Pythonpath and run CLI PyPEF version test with pytest + - name: Export Pythonpath and run PyPEF API and CLI version test with pytest run: | - export PYTHONPATH="${PYTHONPATH}:${PWD}" && python -m pytest tests/cli/ + export PYTHONPATH="${PYTHONPATH}:${PWD}" && python -m pytest tests/ diff --git a/.gitignore b/.gitignore index 3f1f9f3..6ae8115 100644 --- a/.gitignore +++ b/.gitignore @@ -386,3 +386,7 @@ scripts/Setup/linux/AVGFP/uref100_avgfp_jhmmer_119.a2m scripts/Setup/linux/AVGFP/avGFP.csv scripts/Setup/linux/api_encoding_train_test.py scripts/Setup/linux/apc.png +datasets/ANEH/KARS160122_PLS_LOOCV_ML_Model_Performance.png +datasets/ANEH/CV_performance/KARS160122_PLS_LOOCV_5-fold-CV.png +datasets/ANEH/CV_performance/KARS160122_PLS_LOOCV_CV_Results.txt +datasets/AVGFP/Predictions_Hybrid_TopTS.txt diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..036fea6 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,209 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. 
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: PyPEF Help", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "args": [ + "--help" + ] + }, + + { + "name": "Python: PyPEF MKLSTS ANEH", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/ANEH/", + "args": [ + "mklsts", + "--wt", "${workspaceFolder}/datasets/ANEH/Sequence_WT_ANEH.fasta", + "--input", "${workspaceFolder}/datasets/ANEH/37_ANEH_variants.csv" + ] + }, + + { + "name": "Python: PyPEF MKLSTS avGFP", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/AVGFP/", + "args": [ + "mklsts", + "--wt", "P42212_F64L.fasta", + "--input", "avGFP.csv" + ] + }, + + { + "name": "Python: PyPEF ml -e onehot pls_loocv", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/ANEH", + "args": [ + "ml", + "-e", "onehot", + "-l", "LS.fasl", + "-t", "TS.fasl", + "--regressor", "pls_loocv" + ] + }, + + { // GREMLIN zero-shot steps: + // 1. $pypef param_inference --msa uref100_avgfp_jhmmer_119.a2m --opt_iter 100 + // 2. $pypef hybrid -t TS.fasl --params GREMLIN + // or + // 2. $pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN + "name": "Python: PyPEF save GREMLIN avGFP model", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/AVGFP/", + "args": [ + "param_inference", + "--msa", "uref100_avgfp_jhmmer_119.a2m", + "--opt_iter", "100" + ] + }, + + { + "name": "Python: PyPEF hybrid/only-TS-zero-shot GREMLIN-DCA avGFP", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/AVGFP/", + "args": [ + "hybrid", + //"-m", "GREMLIN", // optional, not required + "--ts", "TS.fasl", + "--params", "GREMLIN" + ] + }, + + { + "name": "Python: PyPEF hybrid/only-PS-zero-shot GREMLIN-DCA avGFP", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/AVGFP/", + "args": [ + "hybrid", + //"-m", "GREMLIN", // optional, not required + "--ps", "TS.fasl", + "--params", "GREMLIN" + ] + }, + + { // PLMC zero-shot steps: + // 1. $pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params + // 2. 
$pypef hybrid -t TS.fasl --params PLMC + "name": "Python: PyPEF save PLMC avGFP model", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/AVGFP/", + "args": [ + "param_inference", + "--params", "uref100_avgfp_jhmmer_119_plmc_42.6.params" + ] + }, + + { + "name": "Python: PyPEF hybrid/only-TS-zero-shot PLMC-DCA avGFP", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/AVGFP/", + "args": [ + "hybrid", + "--ts", "TS.fasl", + "--params", "PLMC", + "--threads", "24" + ] + }, + + { + "name": "Python: PyPEF hybrid/only-PS-zero-shot PLMC-DCA avGFP", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/AVGFP/", + "args": [ + "hybrid", + //"-m", "PLMC", // optional, not required + "--ps", "TS.fasl", + "--params", "uref100_avgfp_jhmmer_119_plmc_42.6.params", + "--threads", "24" + ] + }, + + { + "name": "Python: PyPEF hybrid/only-PS-zero-shot PLMC-DCA variant 2 avGFP", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/AVGFP/", + "args": [ + "hybrid", + //"-m", "PLMC", // optional, not required + "--ps", "TS.fasl", + "--params", "PLMC", + "--threads", "24" + ] + }, + + { + "name": "Python: PyPEF !wrong! MSA input format (STO)", + "type": "python", + "request": "launch", + "env": {"PYTHONPATH": "${workspaceFolder}"}, + "program": "${workspaceFolder}/pypef/main.py", + "console": "integratedTerminal", + "justMyCode": true, + "cwd": "${workspaceFolder}/datasets/AVGFP/", + "args": [ + "param_inference", + "--msa", "uref100_avgfp_jhmmer_119.sto" + ] + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..2b6b3b4 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "markdown.extension.toc.updateOnSave": false +} diff --git a/README.md b/README.md index ac6d55c..9cf1b59 100644 --- a/README.md +++ b/README.md @@ -17,20 +17,22 @@ Preprint available at bioRxiv: https://doi.org/10.1101/2022.06.07.495081. *§*Equal contribution
--- + ## Table of Contents -- [PyPEF](#pypef) -- [Installation](#installation) -- [Requirements](#requirements) -- [Running Examples](#examples) -- [Tutorial](#tutorial) -- [Encoding Technique Options](#encoding-options) -- [Modeling Techniques](#modeling-techniques) - - [Pure Machine Learning (ML)-based Modeling](#pure-ml) - - [Hybrid Modeling](#hybrid-modeling) -- [Model Hyperparameter Grids for Training](#grids) -- [Setting Up the Scripts Yourself](#set-up) -- [Preprocessing for DCA-based Sequence Encoding](#dca-preprocessing) -- [API Usage for Sequence Encoding](#api-usage) +[PyPEF: Pythonic Protein Engineering Framework](#pypef-pythonic-protein-engineering-framework) + - [Quick Installation](#quick-installation) + - [Requirements](#requirements) + - [Running Examples](#running-examples) + - [Tutorial](#tutorial) + - [Encoding Technique Options](#encoding-technique-options) + - [Modeling Techniques](#modeling-techniques) + - [Pure Machine Learning (ML)-based Modeling](#pure-machine-learning-ml-based-modeling) + - [Hybrid Modeling Using the MERGE Method](#hybrid-modeling-using-the-merge-method) + - [Model Hyperparameter Grids for Training](#model-hyperparameter-grids-for-training) + - [Setting Up the Scripts Yourself](#setting-up-the-scripts-yourself) + - [Preprocessing for DCA-based Sequence Encoding](#preprocessing-for-dca-based-sequence-encoding) + - [Unsupervised/zero-shot prediction](#unsupervisedzero-shot-prediction) + - [API Usage for Sequence Encoding](#api-usage-for-sequence-encoding) --- @@ -278,7 +280,7 @@ Copy the notebook URL in your internet browser and select the Workflow_PyPEF.ipy ## Modeling Techniques ### Pure Machine Learning (ML)-based Modeling -Serveral linear and non-linear modeling options are available by default to construct supervised regression models based on the generated sequence features, i.e. encoded sequences. +Several linear and non-linear modeling options are available by default to construct supervised regression models based on the generated sequence features, i.e. encoded sequences. Regression models are trained, i.e. model hyperparameters are optimized, by *k*-fold (by default, fivefold) cross-validation on training samples. Here, the model aims to map the encoded variant sequences, i.e. the features (***X***), to the corresponding fitness labels (***y***) such that *f(***X***)* --> ***y***, while cross-validation and/or using a model implementing a penalty is necessary for better model generalization behavior. The following regression options from [Scikit-learn](https://scikit-learn.org/stable/) are implemented (for optimized hyperparameters, see the Model Hyperparameter Grids for Training section below): - [Partial Least Squares Regression (linear model)](https://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html) @@ -376,38 +378,80 @@ python3 ./pypef/main.py pypef sto2a2m --sto ANEH_jhmmer.sto ``` -5. Now you can follow approaches 5.1 (using GREMLIN; implemented in TensorFlow) or 5.2 (using plmc; extern parameter generation in C). +5. Now you can follow approaches 5.1 (using GREMLIN; implemented in TensorFlow) or 5.2 (using plmc; external parameter generation in C). 5.1. 
Running GREMLIN on the generated MSA (in FASTA or A2M format): + ``` pypef param_inference --msa ANEH_jhmmer.a2m -w WT_SEQUENCE.FASTA --opt_iter 250 ``` + The pickled GREMLIN file can then be used for encoding new/test sequences: + ``` pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params GREMLIN ``` + Or for hybrid modeling: + ``` pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN ``` 5.2 After [installing plmc](https://github.com/debbiemarkslab/plmc#compilation), generate the evolutionary coupling file, which is used for encoding sequences. For example, set `-le` to the value output by `sto2a2m`: + ``` plmc -o ANEH_72.6.params -le 72.6 -m 100 -g -f WT_ANEH ANEH_jhmmer.a2m ``` The output parameter (.params) file can be used for encoding sequences with the DCA-based encoding technique (`-e dca`) by providing it to PyPEF; e.g. for pure ML modeling: + ``` pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params ANEH_72.6.params ``` + Or for hybrid modeling: + ``` pypef hybrid -l LS.fasl -t TS.fasl --params ANEH_72.6.params ``` + +## Unsupervised/zero-shot prediction +Several methods have been developed that allow unsupervised prediction of a protein's fitness based on its sequence (and/or structure). +These methods have the advantage that no prior knowledge of a protein's fitness is required for prediction, while the predicted score is assumed to correlate with the protein's natural fitness. +DCA itself is a statistical/unsupervised method based on MSA information that outperforms simpler MSA-based methods (such as (un)coupled raw MSA sequence frequencies or BLOSUM scores), e.g., see [scripts/GREMLIN_numba/using_gremlin_functionalities.ipynb](scripts/GREMLIN_numba/using_gremlin_functionalities.ipynb). +To make zero-shot predictions using PyPEF (plmc-DCA or GREMLIN-DCA), simply do not provide a training set (no `-l` flag, only a `-t` or `-p` flag) for hybrid modeling, e.g., for the avGFP data, try + +``` +pypef param_inference --msa uref100_avgfp_jhmmer_119.a2m +pypef hybrid -t TS.fasl --params GREMLIN +pypef hybrid -p PS.fasta --params GREMLIN +``` + +using the GREMLIN parameters, or, + +``` +pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params +pypef hybrid -t TS.fasl --params PLMC +pypef hybrid -p PS.fasta --params PLMC +``` + +using the plmc parameters. + +Other well-performing zero-shot prediction methods with available source code are: + +- ESM-1v/ESM-2 (https://github.com/facebookresearch/esm) +- DeepSequence (https://github.com/debbiemarkslab/DeepSequence) +- EVcouplings (plmc-DCA, https://github.com/debbiemarkslab/EVcouplings) +- EVE (https://github.com/OATML/EVE) +- Tranception (https://github.com/OATML-Markslab/Tranception) + +This list is by no means complete; see the ProteinGym [repository](https://github.com/OATML-Markslab/ProteinGym) and [website](https://proteingym.org/) for a more detailed overview of available methods and achieved performances (as well as for obtaining many benchmark datasets). + ## API Usage for Sequence Encoding -For script-based encoding of sequences using PyPEF and the available AAindex-, OneHot- or DCA-based techniques, the classes and corresponding functions can be imported, i.e. `OneHotEncoding`, `AAIndexEncoding`, `GREMLIN` (DCA), `PLMC` (DCA), and `DCAHybridModel`. In addition, implemented functions for CV-based tuning of regression models can be used to train and validate models, eventually deriving them to obtain performances on retained data for testing. 
An exemplary script and a Jupyter notebook for CV-based (low-*N*) tuning of models and using them for testing is provided at [scripts/Encoding_low_N/api_encoding_train_test.py]( scripts/Encoding_low_N/api_encoding_train_test.py) and [scripts/Encoding_low_N/api_encoding_train_test.ipynb](scripts/Encoding_low_N/api_encoding_train_test.ipynb), respectively. +For script-based encoding of sequences using PyPEF and the available AAindex-, OneHot- or DCA-based techniques, the classes and corresponding functions can be imported, i.e. `OneHotEncoding`, `AAIndexEncoding`, `GREMLIN` (DCA), `PLMC` (DCA), and `DCAHybridModel`. In addition, the implemented functions for CV-based tuning of regression models can be used to train and validate models and, eventually, to obtain performances on retained test data. An exemplary script and a Jupyter notebook for CV-based (low-*N*) tuning of models and using them for testing are provided at [scripts/Encoding_low_N/api_encoding_train_test.py](scripts/Encoding_low_N/api_encoding_train_test.py) and [scripts/Encoding_low_N/api_encoding_train_test.ipynb](scripts/Encoding_low_N/api_encoding_train_test.ipynb), respectively.
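+As a minimal sketch of such API-based encoding and model fitting (a hypothetical example, not one of the shipped scripts; the MSA/FASL file names are placeholders, and the return order of `get_sequences_from_file` is assumed to be sequences, variant names, and fitness values, as used in `pypef/dca/gremlin_inference.py`):
+
+```python
+from scipy.stats import spearmanr
+from sklearn.cross_decomposition import PLSRegression
+from pypef.dca.gremlin_inference import GREMLIN
+from pypef.utils.variant_data import get_sequences_from_file
+
+# Infer DCA parameters from the MSA; without a wt_seq argument,
+# the first MSA sequence is taken as the wild type
+gremlin = GREMLIN('ANEH_jhmmer.a2m', optimize=True, opt_iter=100)
+
+# Load sequences and fitness labels from the FASL files created by
+# `pypef mklsts` (assumed return order: sequences, names, fitness values)
+train_seqs, _, y_train = get_sequences_from_file('LS.fasl')
+test_seqs, _, y_test = get_sequences_from_file('TS.fasl')
+
+# DCA encodings: one statistical energy per non-gapped MSA position,
+# i.e., arrays of shape (n_sequences, n_positions)
+x_train = gremlin.collect_encoded_sequences(train_seqs)
+x_test = gremlin.collect_encoded_sequences(test_seqs)
+
+# Any Scikit-learn regressor can consume these encodings, e.g., PLS
+pls = PLSRegression(n_components=5).fit(x_train, y_train)
+print(spearmanr(y_test, pls.predict(x_test).flatten()))
+```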

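+The `GREMLIN` class also exposes the zero-shot scoring used by the CLI commands in the [Unsupervised/zero-shot prediction](#unsupervisedzero-shot-prediction) section above; a sketch with placeholder inputs (`variant_seqs` must hold full-length variant sequences matching the MSA wild-type length):
+
+```python
+from pypef.dca.gremlin_inference import GREMLIN
+
+# Infer DCA parameters from the MSA (placeholder file name)
+gremlin = GREMLIN('uref100_avgfp_jhmmer_119.a2m', optimize=True, opt_iter=100)
+
+# Wild-type statistical energy as the reference score
+h_wt = gremlin.get_wt_score()
+
+# Scores relative to wild type; higher scores are assumed to
+# correlate with higher fitness
+variant_seqs = [gremlin.wt_seq]  # placeholder, replace with real variants
+scores = gremlin.get_score(variant_seqs, h_wt_seq=h_wt)
+```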
diff --git a/pypef/__init__.py b/pypef/__init__.py index 16d8b1f..6e981a5 100644 --- a/pypef/__init__.py +++ b/pypef/__init__.py @@ -1,20 +1,20 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - - -__version__ = '0.3.2-alpha' +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + + +__version__ = '0.3.3-alpha' diff --git a/pypef/dca/dca_run.py b/pypef/dca/dca_run.py index 07f6a61..1447a98 100644 --- a/pypef/dca/dca_run.py +++ b/pypef/dca/dca_run.py @@ -1,143 +1,143 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. 
Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -import logging -logger = logging.getLogger('pypef.dca.dca_run') -import ray - -from pypef.utils.variant_data import read_csv, get_wt_sequence -from pypef.dca.plmc_encoding import save_plmc_dca_encoding_model -from pypef.dca.hybrid_model import get_model_and_type, performance_ls_ts, predict_ps, generate_model_and_save_pkl -from pypef.dca.gremlin_inference import save_gremlin_as_pickle, save_corr_csv, plot_all_corr_mtx -from pypef.utils.low_n_mutation_extrapolation import performance_mutation_extrapolation, low_n - - -def run_pypef_hybrid_modeling(arguments): - threads = abs(arguments['--threads']) if arguments['--threads'] is not None else 1 - threads = threads + 1 if threads == 0 else threads - try: - _, model_type = get_model_and_type(arguments['--params'], arguments['--mutation_sep']) - except TypeError: - model_type = 'undefined' - except SystemError: - model_type = 'undefined' - if model_type in ['GREMLIN', 'DCAHybridModel'] and threads > 1: - logger.info(f'No (Ray) parallelization for {model_type} model...') - elif model_type not in ['GREMLIN', 'DCAHybridModel'] and threads > 1: - ray.init() - logger.info(f'Using {threads} threads for running...') - if model_type == 'DCAHybridModel': - logger.info( - f"Note that the hybrid model only optimizes model performances in terms of " - f"Spearman's correlation of measured versus predicted values. Further, the " - f"hybrid approach uses only Ridge regression for supervised ML-based hybrid " - f"model contribution. In hybrid modeling, the ranks of predictions are " - f"important and not the exact predicted value." 
- ) - - if arguments['--ts']: - performance_ls_ts( - ls_fasta=arguments['--ls'], - ts_fasta=arguments['--ts'], - threads=threads, - params_file=arguments['--params'], - model_pickle_file=arguments['--model'], - substitution_sep=arguments['--mutation_sep'], - label=arguments['--label'] - ) - - elif arguments['--params'] and arguments['--model']: - prediction_dict = {} - prediction_dict.update({ - 'drecomb': arguments['--drecomb'], - 'trecomb': arguments['--trecomb'], - 'qarecomb': arguments['--qarecomb'], - 'qirecomb': arguments['--qirecomb'], - 'ddiverse': arguments['--ddiverse'], - 'tdiverse': arguments['--tdiverse'], - 'qdiverse': arguments['--qdiverse'] - }) - - predict_ps( - prediction_dict=prediction_dict, - params_file=arguments['--params'], - threads=threads, - separator=arguments['--mutation_sep'], - model_pickle_file=arguments['--model'], - prediction_set=arguments['--ps'], - negative=arguments['--negative'] - ) - - elif arguments['train_and_save']: - variants, fitnesses, _ = read_csv(arguments['--input']) - generate_model_and_save_pkl( - variants=variants, - ys_true=fitnesses, - params_file=arguments['--params'], - wt=arguments['--wt'], - train_percent_fit=arguments['--fit_size'], - test_percent=arguments['--test_size'], - random_state=arguments['--rnd_state'], - substitution_sep=arguments['--mutation_sep'], - threads=arguments['--threads'] - ) - - elif arguments['low_n'] or arguments['extrapolation']: - if arguments['low_n']: - low_n( - encoded_csv=arguments['--input'], - hybrid_modeling=arguments['hybrid'] - ) - elif arguments['extrapolation']: - performance_mutation_extrapolation( - encoded_csv=arguments['--input'], - cv_regressor=arguments['--regressor'], - conc=arguments['--conc'], - hybrid_modeling=arguments['hybrid'] - ) - - elif arguments['param_inference']: - if arguments['--msa']: - save_gremlin_as_pickle( - alignment=arguments['--msa'], - wt_seq=get_wt_sequence(arguments['--wt']), - opt_iter=arguments['--opt_iter'] - ) - elif arguments['--params']: - save_plmc_dca_encoding_model( - params_file=arguments['--params'], - substitution_sep=arguments['--mutation_sep'] - ) - - elif arguments['save_msa_info']: - gremlin = save_gremlin_as_pickle( - alignment=arguments['--msa'], - wt_seq=get_wt_sequence(arguments['--wt']), - opt_iter=arguments['--opt_iter'] - ) - save_corr_csv(gremlin) - plot_all_corr_mtx(gremlin) - - else: - performance_ls_ts( - ls_fasta=arguments['--ls'], - ts_fasta=arguments['--ts'], - threads=threads, - params_file=arguments['--params'], - substitution_sep=arguments['--mutation_sep'] - ) +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. 
Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +import logging +logger = logging.getLogger('pypef.dca.dca_run') +import ray + +from pypef.utils.variant_data import read_csv, get_wt_sequence +from pypef.dca.plmc_encoding import save_plmc_dca_encoding_model +from pypef.dca.hybrid_model import get_model_and_type, performance_ls_ts, predict_ps, generate_model_and_save_pkl +from pypef.dca.gremlin_inference import save_gremlin_as_pickle, save_corr_csv, plot_all_corr_mtx +from pypef.utils.low_n_mutation_extrapolation import performance_mutation_extrapolation, low_n + + +def run_pypef_hybrid_modeling(arguments): + threads = abs(arguments['--threads']) if arguments['--threads'] is not None else 1 + threads = threads + 1 if threads == 0 else threads + try: + _, model_type = get_model_and_type(arguments['--params'], arguments['--mutation_sep']) + except TypeError: + model_type = 'undefined' + except SystemError: + model_type = 'undefined' + if model_type in ['GREMLIN', 'DCAHybridModel'] and threads > 1: + logger.info(f'No (Ray) parallelization for {model_type} model...') + elif model_type not in ['GREMLIN', 'DCAHybridModel'] and threads > 1: + ray.init() + logger.info(f'Using {threads} threads for running...') + if model_type == 'DCAHybridModel': + logger.info( + f"Note that the hybrid model only optimizes model performances in terms of " + f"Spearman's correlation of measured versus predicted values. Further, the " + f"hybrid approach uses only Ridge regression for supervised ML-based hybrid " + f"model contribution. In hybrid modeling, the ranks of predictions are " + f"important and not the exact predicted value." 
+ ) + + if arguments['--ts']: + performance_ls_ts( + ls_fasta=arguments['--ls'], + ts_fasta=arguments['--ts'], + threads=threads, + params_file=arguments['--params'], + model_pickle_file=arguments['--model'], + substitution_sep=arguments['--mutation_sep'], + label=arguments['--label'] + ) + + elif arguments['--params'] and arguments['--model'] or arguments['--ps']: + prediction_dict = {} + prediction_dict.update({ + 'drecomb': arguments['--drecomb'], + 'trecomb': arguments['--trecomb'], + 'qarecomb': arguments['--qarecomb'], + 'qirecomb': arguments['--qirecomb'], + 'ddiverse': arguments['--ddiverse'], + 'tdiverse': arguments['--tdiverse'], + 'qdiverse': arguments['--qdiverse'] + }) + + predict_ps( + prediction_dict=prediction_dict, + params_file=arguments['--params'], + threads=threads, + separator=arguments['--mutation_sep'], + model_pickle_file=arguments['--model'], + prediction_set=arguments['--ps'], + negative=arguments['--negative'] + ) + + elif arguments['train_and_save']: + variants, fitnesses, _ = read_csv(arguments['--input']) + generate_model_and_save_pkl( + variants=variants, + ys_true=fitnesses, + params_file=arguments['--params'], + wt=arguments['--wt'], + train_percent_fit=arguments['--fit_size'], + test_percent=arguments['--test_size'], + random_state=arguments['--rnd_state'], + substitution_sep=arguments['--mutation_sep'], + threads=arguments['--threads'] + ) + + elif arguments['low_n'] or arguments['extrapolation']: + if arguments['low_n']: + low_n( + encoded_csv=arguments['--input'], + hybrid_modeling=arguments['hybrid'] + ) + elif arguments['extrapolation']: + performance_mutation_extrapolation( + encoded_csv=arguments['--input'], + cv_regressor=arguments['--regressor'], + conc=arguments['--conc'], + hybrid_modeling=arguments['hybrid'] + ) + + elif arguments['param_inference']: + if arguments['--msa']: + save_gremlin_as_pickle( + alignment=arguments['--msa'], + wt_seq=get_wt_sequence(arguments['--wt']), + opt_iter=arguments['--opt_iter'] + ) + elif arguments['--params']: + save_plmc_dca_encoding_model( + params_file=arguments['--params'], + substitution_sep=arguments['--mutation_sep'] + ) + + elif arguments['save_msa_info']: + gremlin = save_gremlin_as_pickle( + alignment=arguments['--msa'], + wt_seq=get_wt_sequence(arguments['--wt']), + opt_iter=arguments['--opt_iter'] + ) + save_corr_csv(gremlin) + plot_all_corr_mtx(gremlin) + + else: + performance_ls_ts( + ls_fasta=arguments['--ls'], + ts_fasta=arguments['--ts'], + threads=threads, + params_file=arguments['--params'], + substitution_sep=arguments['--mutation_sep'] + ) diff --git a/pypef/dca/gremlin_inference.py b/pypef/dca/gremlin_inference.py index cc2cbc3..b715f60 100644 --- a/pypef/dca/gremlin_inference.py +++ b/pypef/dca/gremlin_inference.py @@ -1,650 +1,650 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 17 May 2023 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. 
Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -""" -Code taken from GREMLIN repository available at https://github.com/sokrypton/GREMLIN_CPP/ -and adapted (put functions into a class termed GREMLIN) and used under the -"THE BEER-WARE LICENSE" (Revision 42): - ---------------------------------------------------------------------------------------- - "THE BEER-WARE LICENSE" (Revision 42): - wrote this file. As long as you retain this notice you - can do whatever you want with this stuff. If we meet some day, and you think - this stuff is worth it, you can buy me a beer in return. Sergey Ovchinnikov - ---------------------------------------------------------------------------------------- ---> Thanks for sharing the great code, I will gladly provide you a beer or two. (Niklas) -Code mainly taken from -https://github.com/sokrypton/GREMLIN_CPP/blob/master/GREMLIN_TF.ipynb - -References: -[1] Kamisetty, H., Ovchinnikov, S., & Baker, D. - Assessing the utility of coevolution-based residue–residue contact predictions in a - sequence- and structure-rich era. - Proceedings of the National Academy of Sciences, 2013, 110, 15674-15679 - https://www.pnas.org/doi/10.1073/pnas.1314045110 -[2] Balakrishnan, S., Kamisetty, H., Carbonell, J. G., Lee, S.-I., & Langmead, C. J. - Learning generative models for protein fold families. - Proteins, 79(4), 2011, 1061–78. - https://doi.org/10.1002/prot.22934 -[3] Ekeberg, M., Lövkvist, C., Lan, Y., Weigt, M., & Aurell, E. - Improved contact prediction in proteins: Using pseudolikelihoods to infer Potts models. - Physical Review E, 87(1), 2013, 012707. 
doi:10.1103/PhysRevE.87.012707 - https://doi.org/10.1103/PhysRevE.87.012707 -""" - -import logging -logger = logging.getLogger('pypef.dca.params_inference') - -from os import mkdir -import pickle -import numpy as np -import matplotlib.pyplot as plt -from scipy.spatial.distance import pdist, squareform -from scipy.special import logsumexp -from scipy.stats import boxcox -import pandas as pd -import tensorflow as tf -tf.get_logger().setLevel('DEBUG') - -from pypef.utils.variant_data import get_sequences_from_file - - -class GREMLIN: - def __init__( - self, - alignment: str, - char_alphabet: str = "ARNDCQEGHILKMFPSTWYV-", - wt_seq=None, - optimize=True, - gap_cutoff=0.5, - eff_cutoff=0.8, - opt_iter=100 - ): - self.char_alphabet = char_alphabet - self.gap_cutoff = gap_cutoff - self.eff_cutoff = eff_cutoff - self.opt_iter = opt_iter - self.states = len(self.char_alphabet) - self.seqs, _, _ = get_sequences_from_file(alignment) - self.msa_ori = self.get_msa_ori() - self.n_col_ori = self.msa_ori.shape[1] - if wt_seq is not None: - self.wt_seq = wt_seq - else: # Taking the first sequence in the MSA as wild type sequence - logger.info("No wild-type sequence provided: The first sequence " - "in the MSA is considered the wild-type sequence.") - self.wt_seq = "".join([self.char_alphabet[i] for i in self.msa_ori[0]]) - if len(self.wt_seq) != self.n_col_ori: - raise SystemError("Length of (provided) wild-type sequence does not match " - "number of MSA columns, i.e., common MSA sequence length.") - self.msa_trimmed, self.v_idx, self.w_idx, self.w_rel_idx, self.gaps = self.filt_gaps(self.msa_ori) - self.msa_weights = self.get_eff_msa_weights(self.msa_trimmed) - self.n_eff = np.sum(self.msa_weights) - self.n_row = self.msa_trimmed.shape[0] - self.n_col = self.msa_trimmed.shape[1] - self.v_ini, self.w_ini, self.aa_counts = self.initialize_v_w(remove_gap_entries=False) - self.optimize = optimize - if self.optimize: - self.v_opt, self.w_opt = self.run_opt_tf() - self.x_wt = self.collect_encoded_sequences(np.atleast_1d(self.wt_seq)) - - def a2n_dict(self): - a2n = {} - for a, n in zip(self.char_alphabet, range(self.states)): - a2n[a] = n - return a2n - - def aa2int(self, aa): - """convert single aa into numerical integer value, e.g.: - "A" -> 0 or "-" to 21 dependent on char_alphabet""" - a2n = self.a2n_dict() - if aa in a2n: - return a2n[aa] - else: # for unknown characters insert Gap character - return a2n['-'] - - def seq2int(self, aa_seqs): - """ - convert a single sequence or a list of sequences into a list of integer sequences, e.g.: - ["ACD","EFG"] -> [[0,4,3], [6,13,7]] - """ - if type(aa_seqs) == str: - aa_seqs = np.array(aa_seqs) - if type(aa_seqs) == list: - aa_seqs = np.array(aa_seqs) - if aa_seqs.dtype.type is np.str_: - if aa_seqs.ndim == 0: # single seq - return np.array([self.aa2int(aa) for aa in str(aa_seqs)]) - else: # list of seqs - return np.array([[self.aa2int(aa) for aa in seq] for seq in aa_seqs]) - else: - return aa_seqs - - @property - def get_v_idx_w_idx(self): - return self.v_idx, self.w_idx - - def get_msa_ori(self): - """converts list of sequences to msa""" - msa_ori = [] - for seq in self.seqs: - msa_ori.append([self.aa2int(aa.upper()) for aa in seq]) - msa_ori = np.array(msa_ori) - return msa_ori - - def filt_gaps(self, msa_ori): - """filters alignment to remove gappy positions""" - tmp = (msa_ori == self.states - 1).astype(float) - non_gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] < self.gap_cutoff)[0] - - gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] >= 
self.gap_cutoff)[0] - logger.info(f'Gap positions (removed from MSA; 0-indexed):\n{gaps}') - ncol_trimmed = len(non_gaps) - v_idx = non_gaps - w_idx = v_idx[np.stack(np.triu_indices(ncol_trimmed, 1), -1)] - w_rel_idx = np.stack(np.triu_indices(ncol_trimmed, 1), -1) - return msa_ori[:, non_gaps], v_idx, w_idx, w_rel_idx, gaps - - def get_eff_msa_weights(self, msa): - """compute effective weight for each sequence""" - # pairwise identity - pdistance_msa = pdist(msa, "hamming") - msa_sm = 1.0 - squareform(pdistance_msa) - # weight for each sequence - msa_w = (msa_sm >= self.eff_cutoff).astype(float) - msa_w = 1 / np.sum(msa_w, -1) - return msa_w - - @staticmethod - def l2(x): - return np.sum(np.square(x)) - - def objective(self, v, w=None, flattened=True): - """Same objective function as used in run_opt_tf below - but here only using numpy not TensorFlow functions. - Potentially helpful for implementing SciPy optimizers.""" - if w is None: - w = self.w_ini - onehot_cat_msa = np.eye(self.states)[self.msa_trimmed] - if flattened: - v = np.reshape(v, (self.n_col, self.states)) - w = np.reshape(w, (self.n_col, self.states, self.n_col, self.states)) - ######################################## - # Pseudo-Log-Likelihood - ######################################## - # v + w - vw = v + np.tensordot(onehot_cat_msa, w, 2) - # Hamiltonian - h = np.sum(np.multiply(onehot_cat_msa, vw), axis=(1, 2)) - # local z (partition function) - z = np.sum(np.log(np.sum(np.exp(vw), axis=2)), axis=1) - # Pseudo-Log-Likelihood - pll = h - z - ######################################## - # Regularization - ######################################## - l2_v = 0.01 * self.l2(v) - l2_w = 0.01 * self.l2(w) * 0.5 * (self.n_col - 1) * (self.states - 1) - # loss function to minimize - loss = -np.sum(pll * self.msa_weights) / np.sum(self.msa_weights) - loss = loss + (l2_v + l2_w) / self.n_eff - return loss - - @staticmethod - def opt_adam(loss, name, var_list=None, lr=1.0, b1=0.9, b2=0.999, b_fix=False): - """ - Adam optimizer [https://arxiv.org/abs/1412.6980] with first and second moments - mt and - vt (greek letter nu) at time steps t, respectively. - Note by GREMLIN authors: this is a modified version of adam optimizer. - More specifically, we replace "vt" with sum(g*g) instead of (g*g). - Furthermore, we find that disabling the bias correction - (b_fix=False) speeds up convergence for our case. - """ - if var_list is None: - var_list = tf.compat.v1.trainable_variables() - gradients = tf.gradients(loss, var_list) - if b_fix: - t = tf.Variable(0.0, "t") - opt = [] - for n, (x, g) in enumerate(zip(var_list, gradients)): - if g is not None: - ini = dict(initializer=tf.zeros_initializer, trainable=False) - mt = tf.compat.v1.get_variable(name + "_mt_" + str(n), shape=list(x.shape), **ini) - vt = tf.compat.v1.get_variable(name + "_vt_" + str(n), shape=[], **ini) - - mt_tmp = b1 * mt + (1 - b1) * g - vt_tmp = b2 * vt + (1 - b2) * tf.reduce_sum(tf.square(g)) - lr_tmp = lr / (tf.sqrt(vt_tmp) + 1e-8) - - if b_fix: - lr_tmp = lr_tmp * tf.sqrt(1 - tf.pow(b2, t)) / (1 - tf.pow(b1, t)) - - opt.append(x.assign_add(-lr_tmp * mt_tmp)) - opt.append(vt.assign(vt_tmp)) - opt.append(mt.assign(mt_tmp)) - - if b_fix: - opt.append(t.assign_add(1.0)) - return tf.group(opt) - - @staticmethod - def sym_w(w): - """ - Symmetrize input matrix of shape (x,y,x,y) - As the full couplings matrix W might/will be slightly "unsymmetrical" - it will be symmetrized according to one half being "mirrored". 
- """ - x = w.shape[0] - w = w * np.reshape(1 - np.eye(x), (x, 1, x, 1)) - w = w + tf.transpose(w, [2, 3, 0, 1]) - return w - - @staticmethod - def l2_tf(x): - return tf.reduce_sum(tf.square(x)) - - def run_opt_tf(self, opt_rate=1.0, batch_size=None): - """ - For optimization of v and w ADAM is used here (L-BFGS-B not (yet) implemented - for TF 2.x, e.g. using scipy.optimize.minimize). - Gaps (char '-' respectively '21') included. - """ - ############################################################## - # SETUP COMPUTE GRAPH - ############################################################## - # kill any existing tensorflow graph - tf.compat.v1.reset_default_graph() - tf.compat.v1.disable_eager_execution() - - # msa (multiple sequence alignment) - msa = tf.compat.v1.placeholder(tf.int32, shape=(None, self.n_col), name="msa") - - # one-hot encode msa - oh_msa = tf.one_hot(msa, self.states) - - # msa weights - msa_weights = tf.compat.v1.placeholder(tf.float32, shape=(None,), name="msa_weights") - - # 1-body-term of the MRF - v = tf.compat.v1.get_variable(name="v", - shape=[self.n_col, self.states], - initializer=tf.compat.v1.zeros_initializer) - - # 2-body-term of the MRF - w = tf.compat.v1.get_variable(name="w", - shape=[self.n_col, self.states, self.n_col, self.states], - initializer=tf.compat.v1.zeros_initializer) - - # symmetrize w - w = self.sym_w(w) - - ######################################## - # Pseudo-Log-Likelihood - ######################################## - # v + w - vw = v + tf.tensordot(oh_msa, w, 2) - - # Hamiltonian - h = tf.reduce_sum(tf.multiply(oh_msa, vw), axis=(1, 2)) - # partition function Z - z = tf.reduce_sum(tf.reduce_logsumexp(vw, axis=2), axis=1) - - # Pseudo-Log-Likelihood - pll = h - z - - ######################################## - # Regularization - ######################################## - l2_v = 0.01 * self.l2_tf(v) - lw_w = 0.01 * self.l2_tf(w) * 0.5 * (self.n_col - 1) * (self.states - 1) - - # loss function to minimize - loss = -tf.reduce_sum(pll * msa_weights) / tf.reduce_sum(msa_weights) - loss = loss + (l2_v + lw_w) / self.n_eff - - ############################################################## - # MINIMIZE LOSS FUNCTION - ############################################################## - opt = self.opt_adam(loss, "adam", lr=opt_rate) - # initialize V (local fields) - msa_cat = tf.keras.utils.to_categorical(self.msa_trimmed, self.states) - pseudo_count = 0.01 * np.log(self.n_eff) - v_ini = np.log(np.sum(msa_cat.T * self.msa_weights, -1).T + pseudo_count) - v_ini = v_ini - np.mean(v_ini, -1, keepdims=True) - - # generate input/feed - def feed(feed_all=False): - if batch_size is None or feed_all: - return {msa: self.msa_trimmed, msa_weights: self.msa_weights} - else: - idx = np.random.randint(0, self.n_row, size=batch_size) - return {msa: self.msa_trimmed[idx], msa_weights: self.msa_weights[idx]} - - with tf.compat.v1.Session() as sess: - # initialize variables V (local fields) and W (couplings) - sess.run(tf.compat.v1.global_variables_initializer()) - sess.run(v.assign(v_ini)) - # compute loss across all data - get_loss = lambda: round(sess.run(loss, feed(feed_all=True)) * self.n_eff, 2) - logger.info(f"Initial loss: {get_loss()}. 
Starting parameter optimization...") - for i in range(self.opt_iter): - sess.run(opt, feed()) - try: - if (i + 1) % int(self.opt_iter / 10) == 0: - logger.info(f"Iteration {(i + 1)} {get_loss()}") - except ZeroDivisionError: - logger.info(f"Iteration {(i + 1)} {get_loss()}") - # save the v and w parameters of the MRF - v_opt = sess.run(v) - w_opt = sess.run(w) - - no_gap_states = self.states - 1 - return v_opt[:, :no_gap_states], w_opt[:, :no_gap_states, :, :no_gap_states] - - def initialize_v_w(self, remove_gap_entries=True): - """ - For optimization of v and w ADAM is used here (L-BFGS-B not (yet) - implemented for TF 2.x, e.g. using scipy.optimize.minimize). - Gaps (char '-' respectively '21') included. - """ - w_ini = np.zeros((self.n_col, self.states, self.n_col, self.states)) - onehot_cat_msa = np.eye(self.states)[self.msa_trimmed] - aa_counts = np.sum(onehot_cat_msa, axis=0) - pseudo_count = 0.01 * np.log(self.n_eff) - v_ini = np.log(np.sum(onehot_cat_msa.T * self.msa_weights, -1).T + pseudo_count) - v_ini = v_ini - np.mean(v_ini, -1, keepdims=True) - # loss_score_ini = self.objective(v_ini, w_ini, flattened=False) - - if remove_gap_entries: - no_gap_states = self.states - 1 - v_ini = v_ini[:, :no_gap_states] - w_ini = w_ini[:, :no_gap_states, :, :no_gap_states] - aa_counts = aa_counts[:, :no_gap_states] - - return v_ini, w_ini, aa_counts - - @property - def get_v_w_opt(self): - try: - return self.v_opt, self.w_opt - except AttributeError: - raise SystemError( - "No v_opt and w_opt available, this means GREMLIN " - "has not been initialized setting optimize to True, " - "e.g., try GREMLIN('Alignment.fasta', optimize=True)." - ) - - def get_score(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.0, recompute_z=False): - if v is None or w is None: - if self.optimize: - v, w = self.v_opt, self.w_opt - else: - v, w, _ = self.initialize_v_w(remove_gap_entries=True) - if v_idx is None: - v_idx = self.v_idx - seqs_int = self.seq2int(seqs) - # if length of sequence != length of model use only - # valid positions (v_idx) from the trimmed alignment - try: - if seqs_int.shape[-1] != len(v_idx): - seqs_int = seqs_int[..., v_idx] - except IndexError: - raise SystemError( - "The loaded GREMLIN parameter model does not match the input model " - "in terms of sequence encoding shape or is a gap-substituted sequence. " - "E.g., when providing two different DCA models/parameters provided by: " - "\"-m DCA\" and \"--params GREMLIN\", where -m DCA represents a ml input " - "model potentially generated using plmc parameters and --params GREMLIN " - "provides differently encoded sequences generated using GREMLIN." - ) - - # one hot encode - x = np.eye(self.states)[seqs_int] - # aa_pos_counts = np.sum(x, axis=0) - - # get non-gap positions - # no_gap = 1.0 - x[..., -1] - - # remove gap from one-hot-encoding - x = x[..., :-1] - - # compute score - vw = v + np.tensordot(x, w, 2) - - # ============================================================================================ - # Note, Z (the partition function) is a constant. In GREMLIN, V, W & Z are estimated using all - # the original weighted input sequence(s). It is NOT recommended to recalculate z with a - # different set of sequences. Given the common ERROR of recomputing Z, we include the option - # to do so, for comparison. 
- # ============================================================================================ - h = np.sum(np.multiply(x, vw), axis=-1) - - if encode: - return h - - if recompute_z: - z = logsumexp(vw, axis=-1) - return np.sum((h - z), axis=-1) - h_wt_seq - else: - return np.sum(h, axis=-1) - h_wt_seq - - def get_wt_score(self, wt_seq=None, v=None, w=None, encode=False): - if wt_seq is None: - wt_seq = self.wt_seq - if v is None or w is None: - if self.optimize: - v, w = self.v_opt, self.w_opt - else: - v, w = self.v_ini, self.w_ini - wt_seq = np.array(wt_seq, dtype=str) - return self.get_score(wt_seq, v, w, encode=encode) - - def collect_encoded_sequences(self, seqs, v=None, w=None, v_idx=None): - """ - Wrapper function for encoding input sequences using the self.get_score - function with encode set to True. - """ - xs = self.get_score(seqs, v, w, v_idx, encode=True) - return xs - - @staticmethod - def normalize(apc_mat): - """ - Normalization of APC matrix for getting z-Score matrix - """ - dim = apc_mat.shape[0] - apc_mat_flat = apc_mat.flatten() - x, _ = boxcox(apc_mat_flat - np.amin(apc_mat_flat) + 1.0) - x_mean = np.mean(x) - x_std = np.std(x) - x = (x - x_mean) / x_std - x = x.reshape(dim, dim) - return x - - def mtx_gaps_as_zeros(self, gap_reduced_mtx, insert_gap_zeros=True): - """ - Inserts zeros at gap positions of the (L,L) matrices, - i.e., raw/apc/zscore matrices. - """ - if insert_gap_zeros: - gap_reduced_mtx = list(gap_reduced_mtx) - mtx_zeroed = [] - c = 0 - for i in range(self.n_col_ori): - mtx_i = [] - c_i = 0 - if i in self.gaps: - mtx_zeroed.append(list(np.zeros(self.n_col_ori))) - else: - for j in range(self.n_col_ori): - if j in self.gaps: - mtx_i.append(0.0) - else: - mtx_i.append(gap_reduced_mtx[c][c_i]) - c_i += 1 - mtx_zeroed.append(mtx_i) - c += 1 - return np.array(mtx_zeroed) - - else: - return gap_reduced_mtx - - def get_correlation_matrix(self, matrix_type: str = 'apc', insert_gap_zeros=False): - """ - Requires optimized w matrix (of shape (L, 20, L, 20)) - inputs - ------------------------------------------------------ - w : coevolution shape=(L,A,L,A) - ------------------------------------------------------ - outputs - ------------------------------------------------------ - raw : l2norm(w) shape=(L,L) - apc : apc(raw) shape=(L,L) - zscore : normalize(apc) shape=(L,L) - """ - # l2norm of 20x20 matrices (note: gaps already excluded) - raw = np.sqrt(np.sum(np.square(self.w_opt), (1, 3))) - - # apc (average product correction) - ap = np.sum(raw, 0, keepdims=True) * np.sum(raw, 1, keepdims=True) / np.sum(raw) - apc = raw - ap - - if matrix_type == 'apc': - return self.mtx_gaps_as_zeros(apc, insert_gap_zeros=insert_gap_zeros) - elif matrix_type == 'raw': - return self.mtx_gaps_as_zeros(raw, insert_gap_zeros=insert_gap_zeros) - elif matrix_type == 'zscore' or matrix_type == 'z_score': - return self.mtx_gaps_as_zeros(self.normalize(apc), insert_gap_zeros=insert_gap_zeros) - else: - raise SystemError("Unknown matrix type. 
Choose between 'apc', 'raw', or 'zscore'.") - - def plot_correlation_matrix(self, matrix_type: str = 'apc', set_diag_zero=True): - matrix = self.get_correlation_matrix(matrix_type, insert_gap_zeros=True) - if set_diag_zero: - np.fill_diagonal(matrix, 0.0) - - fig, ax = plt.subplots(figsize=(10, 10)) - - if matrix_type == 'zscore' or matrix_type == 'z_score': - ax.imshow(matrix, cmap='Blues', interpolation='none', vmin=1, vmax=3) - else: - ax.imshow(matrix, cmap='Blues') - tick_pos = ax.get_xticks() - tick_pos = np.array([int(t) for t in tick_pos]) - tick_pos[-1] = matrix.shape[0] - if tick_pos[2] > 1: - tick_pos[2:] -= 1 - ax.set_xticks(tick_pos) - ax.set_yticks(tick_pos) - labels = [item.get_text() for item in ax.get_xticklabels()] - try: - labels = [labels[0]] + [str(int(label) + 1) for label in labels[1:]] - except ValueError: - pass - ax.set_xticklabels(labels) - ax.set_yticklabels(labels) - ax.set_xlim(-1, matrix.shape[0]) - ax.set_ylim(-1, matrix.shape[0]) - plt.title(matrix_type.upper()) - plt.savefig(f'{matrix_type}.png', dpi=500) - plt.close('all') - - def get_top_coevolving_residues(self, wt_seq=None, min_distance=0, sort_by="apc"): - if wt_seq is None: - wt_seq = self.wt_seq - if wt_seq is None: - raise SystemError("Getting top co-evolving residues requires " - "the wild type sequence as input.") - raw = self.get_correlation_matrix(matrix_type='raw') - apc = self.get_correlation_matrix(matrix_type='apc') - zscore = self.get_correlation_matrix(matrix_type='zscore') - - # Explore top co-evolving residue pairs - i_rel_idx = self.w_rel_idx[:, 0] - j_rel_idx = self.w_rel_idx[:, 1] - - apc_flat = [] - zscore_flat = [] - raw_flat = [] - for i, _ in enumerate(i_rel_idx): - raw_flat.append(raw[i_rel_idx[i]][j_rel_idx[i]]) - apc_flat.append(apc[i_rel_idx[i]][j_rel_idx[i]]) - zscore_flat.append(zscore[i_rel_idx[i]][j_rel_idx[i]]) - - i_idx = self.w_idx[:, 0] - j_idx = self.w_idx[:, 1] - - i_aa = [f"{wt_seq[i]}_{i + 1}" for i in i_idx] - j_aa = [f"{wt_seq[j]}_{j + 1}" for j in j_idx] - - # load mtx into pandas dataframe - mtx = { - "i": i_idx, "j": j_idx, "apc": apc_flat, "zscore": zscore_flat, - "raw": raw_flat, "i_aa": i_aa, "j_aa": j_aa - } - df_mtx = pd.DataFrame(mtx, columns=["i", "j", "apc", "zscore", "raw", "i_aa", "j_aa"]) - df_mtx_sorted = df_mtx.sort_values(sort_by, ascending=False) - - # get contacts with sequence separation > min_distance - df_mtx_sorted_mindist = df_mtx_sorted.loc[df_mtx['j'] - df_mtx['i'] > min_distance] - - return df_mtx_sorted_mindist - - -""" -GREMLIN class helper functions below. 
-""" - - -def save_gremlin_as_pickle(alignment: str, wt_seq: str, opt_iter: int = 100): - """ - Function for getting and/or saving (optimized or unoptimized) GREMLIN model - """ - logger.info(f'Inferring GREMLIN DCA parameters based on the provided MSA...') - gremlin = GREMLIN(alignment, wt_seq=wt_seq, optimize=True, opt_iter=opt_iter) - try: - mkdir('Pickles') - except FileExistsError: - pass - - logger.info(f'Saving GREMLIN model as Pickle file...') - pickle.dump( - { - 'model': gremlin, - 'model_type': 'GREMLINpureDCA', - 'beta_1': None, - 'beta_2': None, - 'spearman_rho': None, - 'regressor': None - }, - open('Pickles/GREMLIN', 'wb') - ) - return gremlin - - -def plot_all_corr_mtx(gremlin: GREMLIN): - gremlin.plot_correlation_matrix(matrix_type='raw') - gremlin.plot_correlation_matrix(matrix_type='apc') - gremlin.plot_correlation_matrix(matrix_type='zscore') - - -def save_corr_csv(gremlin: GREMLIN, min_distance: int = 0, sort_by: str = 'apc'): - df_mtx_sorted_mindist = gremlin.get_top_coevolving_residues( - min_distance=min_distance, sort_by=sort_by - ) - df_mtx_sorted_mindist.to_csv(f"coevolution_{sort_by}_sorted.csv") +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 17 May 2023 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +""" +Code taken from GREMLIN repository available at https://github.com/sokrypton/GREMLIN_CPP/ +and adapted (put functions into a class termed GREMLIN) and used under the +"THE BEER-WARE LICENSE" (Revision 42): + ---------------------------------------------------------------------------------------- + "THE BEER-WARE LICENSE" (Revision 42): + wrote this file. As long as you retain this notice you + can do whatever you want with this stuff. If we meet some day, and you think + this stuff is worth it, you can buy me a beer in return. Sergey Ovchinnikov + ---------------------------------------------------------------------------------------- +--> Thanks for sharing the great code, I will gladly provide you a beer or two. (Niklas) +Code mainly taken from +https://github.com/sokrypton/GREMLIN_CPP/blob/master/GREMLIN_TF.ipynb + +References: +[1] Kamisetty, H., Ovchinnikov, S., & Baker, D. + Assessing the utility of coevolution-based residue–residue contact predictions in a + sequence- and structure-rich era. + Proceedings of the National Academy of Sciences, 2013, 110, 15674-15679 + https://www.pnas.org/doi/10.1073/pnas.1314045110 +[2] Balakrishnan, S., Kamisetty, H., Carbonell, J. G., Lee, S.-I., & Langmead, C. J. + Learning generative models for protein fold families. + Proteins, 79(4), 2011, 1061–78. + https://doi.org/10.1002/prot.22934 +[3] Ekeberg, M., Lövkvist, C., Lan, Y., Weigt, M., & Aurell, E. 
+ Improved contact prediction in proteins: Using pseudolikelihoods to infer Potts models. + Physical Review E, 87(1), 2013, 012707. doi:10.1103/PhysRevE.87.012707 + https://doi.org/10.1103/PhysRevE.87.012707 +""" + +import logging +logger = logging.getLogger('pypef.dca.params_inference') + +from os import mkdir +import pickle +import numpy as np +import matplotlib.pyplot as plt +from scipy.spatial.distance import pdist, squareform +from scipy.special import logsumexp +from scipy.stats import boxcox +import pandas as pd +import tensorflow as tf +tf.get_logger().setLevel('DEBUG') + +from pypef.utils.variant_data import get_sequences_from_file + + +class GREMLIN: + def __init__( + self, + alignment: str, + char_alphabet: str = "ARNDCQEGHILKMFPSTWYV-", + wt_seq=None, + optimize=True, + gap_cutoff=0.5, + eff_cutoff=0.8, + opt_iter=100 + ): + self.char_alphabet = char_alphabet + self.gap_cutoff = gap_cutoff + self.eff_cutoff = eff_cutoff + self.opt_iter = opt_iter + self.states = len(self.char_alphabet) + self.seqs, _, _ = get_sequences_from_file(alignment) + self.msa_ori = self.get_msa_ori() + self.n_col_ori = self.msa_ori.shape[1] + if wt_seq is not None: + self.wt_seq = wt_seq + else: # Taking the first sequence in the MSA as wild type sequence + logger.info("No wild-type sequence provided: The first sequence " + "in the MSA is considered the wild-type sequence.") + self.wt_seq = "".join([self.char_alphabet[i] for i in self.msa_ori[0]]) + if len(self.wt_seq) != self.n_col_ori: + raise SystemError("Length of (provided) wild-type sequence does not match " + "number of MSA columns, i.e., common MSA sequence length.") + self.msa_trimmed, self.v_idx, self.w_idx, self.w_rel_idx, self.gaps = self.filt_gaps(self.msa_ori) + self.msa_weights = self.get_eff_msa_weights(self.msa_trimmed) + self.n_eff = np.sum(self.msa_weights) + self.n_row = self.msa_trimmed.shape[0] + self.n_col = self.msa_trimmed.shape[1] + self.v_ini, self.w_ini, self.aa_counts = self.initialize_v_w(remove_gap_entries=False) + self.optimize = optimize + if self.optimize: + self.v_opt, self.w_opt = self.run_opt_tf() + self.x_wt = self.collect_encoded_sequences(np.atleast_1d(self.wt_seq)) + + def a2n_dict(self): + a2n = {} + for a, n in zip(self.char_alphabet, range(self.states)): + a2n[a] = n + return a2n + + def aa2int(self, aa): + """convert single aa into numerical integer value, e.g.: + "A" -> 0 or "-" to 21 dependent on char_alphabet""" + a2n = self.a2n_dict() + if aa in a2n: + return a2n[aa] + else: # for unknown characters insert Gap character + return a2n['-'] + + def seq2int(self, aa_seqs): + """ + convert a single sequence or a list of sequences into a list of integer sequences, e.g.: + ["ACD","EFG"] -> [[0,4,3], [6,13,7]] + """ + if type(aa_seqs) == str: + aa_seqs = np.array(aa_seqs) + if type(aa_seqs) == list: + aa_seqs = np.array(aa_seqs) + if aa_seqs.dtype.type is np.str_: + if aa_seqs.ndim == 0: # single seq + return np.array([self.aa2int(aa) for aa in str(aa_seqs)]) + else: # list of seqs + return np.array([[self.aa2int(aa) for aa in seq] for seq in aa_seqs]) + else: + return aa_seqs + + @property + def get_v_idx_w_idx(self): + return self.v_idx, self.w_idx + + def get_msa_ori(self): + """converts list of sequences to msa""" + msa_ori = [] + for seq in self.seqs: + msa_ori.append([self.aa2int(aa.upper()) for aa in seq]) + msa_ori = np.array(msa_ori) + return msa_ori + + def filt_gaps(self, msa_ori): + """filters alignment to remove gappy positions""" + tmp = (msa_ori == self.states - 1).astype(float) + non_gaps = 
np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] < self.gap_cutoff)[0] + + gaps = np.where(np.sum(tmp.T, -1).T / msa_ori.shape[0] >= self.gap_cutoff)[0] + logger.info(f'Gap positions (removed from MSA; 0-indexed):\n{gaps}') + ncol_trimmed = len(non_gaps) + v_idx = non_gaps + w_idx = v_idx[np.stack(np.triu_indices(ncol_trimmed, 1), -1)] + w_rel_idx = np.stack(np.triu_indices(ncol_trimmed, 1), -1) + return msa_ori[:, non_gaps], v_idx, w_idx, w_rel_idx, gaps + + def get_eff_msa_weights(self, msa): + """compute effective weight for each sequence""" + # pairwise identity + pdistance_msa = pdist(msa, "hamming") + msa_sm = 1.0 - squareform(pdistance_msa) + # weight for each sequence + msa_w = (msa_sm >= self.eff_cutoff).astype(float) + msa_w = 1 / np.sum(msa_w, -1) + return msa_w + + @staticmethod + def l2(x): + return np.sum(np.square(x)) + + def objective(self, v, w=None, flattened=True): + """Same objective function as used in run_opt_tf below + but here only using numpy not TensorFlow functions. + Potentially helpful for implementing SciPy optimizers.""" + if w is None: + w = self.w_ini + onehot_cat_msa = np.eye(self.states)[self.msa_trimmed] + if flattened: + v = np.reshape(v, (self.n_col, self.states)) + w = np.reshape(w, (self.n_col, self.states, self.n_col, self.states)) + ######################################## + # Pseudo-Log-Likelihood + ######################################## + # v + w + vw = v + np.tensordot(onehot_cat_msa, w, 2) + # Hamiltonian + h = np.sum(np.multiply(onehot_cat_msa, vw), axis=(1, 2)) + # local z (partition function) + z = np.sum(np.log(np.sum(np.exp(vw), axis=2)), axis=1) + # Pseudo-Log-Likelihood + pll = h - z + ######################################## + # Regularization + ######################################## + l2_v = 0.01 * self.l2(v) + l2_w = 0.01 * self.l2(w) * 0.5 * (self.n_col - 1) * (self.states - 1) + # loss function to minimize + loss = -np.sum(pll * self.msa_weights) / np.sum(self.msa_weights) + loss = loss + (l2_v + l2_w) / self.n_eff + return loss + + @staticmethod + def opt_adam(loss, name, var_list=None, lr=1.0, b1=0.9, b2=0.999, b_fix=False): + """ + Adam optimizer [https://arxiv.org/abs/1412.6980] with first and second moments + mt and + vt (greek letter nu) at time steps t, respectively. + Note by GREMLIN authors: this is a modified version of adam optimizer. + More specifically, we replace "vt" with sum(g*g) instead of (g*g). + Furthermore, we find that disabling the bias correction + (b_fix=False) speeds up convergence for our case. 
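+
+        As a sketch of the update rule actually implemented below (b1, b2,
+        lr, and b_fix as in the signature; g is the gradient):
+            m_t = b1 * m_{t-1} + (1 - b1) * g
+            v_t = b2 * v_{t-1} + (1 - b2) * sum(g * g)   # scalar second moment
+            x  <-  x - (lr / (sqrt(v_t) + 1e-8)) * m_t
+        With b_fix=True, the step size is additionally multiplied by the
+        Adam bias-correction factor sqrt(1 - b2^t) / (1 - b1^t).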
+ """ + if var_list is None: + var_list = tf.compat.v1.trainable_variables() + gradients = tf.gradients(loss, var_list) + if b_fix: + t = tf.Variable(0.0, "t") + opt = [] + for n, (x, g) in enumerate(zip(var_list, gradients)): + if g is not None: + ini = dict(initializer=tf.zeros_initializer, trainable=False) + mt = tf.compat.v1.get_variable(name + "_mt_" + str(n), shape=list(x.shape), **ini) + vt = tf.compat.v1.get_variable(name + "_vt_" + str(n), shape=[], **ini) + + mt_tmp = b1 * mt + (1 - b1) * g + vt_tmp = b2 * vt + (1 - b2) * tf.reduce_sum(tf.square(g)) + lr_tmp = lr / (tf.sqrt(vt_tmp) + 1e-8) + + if b_fix: + lr_tmp = lr_tmp * tf.sqrt(1 - tf.pow(b2, t)) / (1 - tf.pow(b1, t)) + + opt.append(x.assign_add(-lr_tmp * mt_tmp)) + opt.append(vt.assign(vt_tmp)) + opt.append(mt.assign(mt_tmp)) + + if b_fix: + opt.append(t.assign_add(1.0)) + return tf.group(opt) + + @staticmethod + def sym_w(w): + """ + Symmetrize input matrix of shape (x,y,x,y) + As the full couplings matrix W might/will be slightly "unsymmetrical" + it will be symmetrized according to one half being "mirrored". + """ + x = w.shape[0] + w = w * np.reshape(1 - np.eye(x), (x, 1, x, 1)) + w = w + tf.transpose(w, [2, 3, 0, 1]) + return w + + @staticmethod + def l2_tf(x): + return tf.reduce_sum(tf.square(x)) + + def run_opt_tf(self, opt_rate=1.0, batch_size=None): + """ + For optimization of v and w ADAM is used here (L-BFGS-B not (yet) implemented + for TF 2.x, e.g. using scipy.optimize.minimize). + Gaps (char '-' respectively '21') included. + """ + ############################################################## + # SETUP COMPUTE GRAPH + ############################################################## + # kill any existing tensorflow graph + tf.compat.v1.reset_default_graph() + tf.compat.v1.disable_eager_execution() + + # msa (multiple sequence alignment) + msa = tf.compat.v1.placeholder(tf.int32, shape=(None, self.n_col), name="msa") + + # one-hot encode msa + oh_msa = tf.one_hot(msa, self.states) + + # msa weights + msa_weights = tf.compat.v1.placeholder(tf.float32, shape=(None,), name="msa_weights") + + # 1-body-term of the MRF + v = tf.compat.v1.get_variable(name="v", + shape=[self.n_col, self.states], + initializer=tf.compat.v1.zeros_initializer) + + # 2-body-term of the MRF + w = tf.compat.v1.get_variable(name="w", + shape=[self.n_col, self.states, self.n_col, self.states], + initializer=tf.compat.v1.zeros_initializer) + + # symmetrize w + w = self.sym_w(w) + + ######################################## + # Pseudo-Log-Likelihood + ######################################## + # v + w + vw = v + tf.tensordot(oh_msa, w, 2) + + # Hamiltonian + h = tf.reduce_sum(tf.multiply(oh_msa, vw), axis=(1, 2)) + # partition function Z + z = tf.reduce_sum(tf.reduce_logsumexp(vw, axis=2), axis=1) + + # Pseudo-Log-Likelihood + pll = h - z + + ######################################## + # Regularization + ######################################## + l2_v = 0.01 * self.l2_tf(v) + lw_w = 0.01 * self.l2_tf(w) * 0.5 * (self.n_col - 1) * (self.states - 1) + + # loss function to minimize + loss = -tf.reduce_sum(pll * msa_weights) / tf.reduce_sum(msa_weights) + loss = loss + (l2_v + lw_w) / self.n_eff + + ############################################################## + # MINIMIZE LOSS FUNCTION + ############################################################## + opt = self.opt_adam(loss, "adam", lr=opt_rate) + # initialize V (local fields) + msa_cat = tf.keras.utils.to_categorical(self.msa_trimmed, self.states) + pseudo_count = 0.01 * np.log(self.n_eff) + v_ini = 
np.log(np.sum(msa_cat.T * self.msa_weights, -1).T + pseudo_count) + v_ini = v_ini - np.mean(v_ini, -1, keepdims=True) + + # generate input/feed + def feed(feed_all=False): + if batch_size is None or feed_all: + return {msa: self.msa_trimmed, msa_weights: self.msa_weights} + else: + idx = np.random.randint(0, self.n_row, size=batch_size) + return {msa: self.msa_trimmed[idx], msa_weights: self.msa_weights[idx]} + + with tf.compat.v1.Session() as sess: + # initialize variables V (local fields) and W (couplings) + sess.run(tf.compat.v1.global_variables_initializer()) + sess.run(v.assign(v_ini)) + # compute loss across all data + get_loss = lambda: round(sess.run(loss, feed(feed_all=True)) * self.n_eff, 2) + logger.info(f"Initial loss: {get_loss()}. Starting parameter optimization...") + for i in range(self.opt_iter): + sess.run(opt, feed()) + try: + if (i + 1) % int(self.opt_iter / 10) == 0: + logger.info(f"Iteration {(i + 1)} {get_loss()}") + except ZeroDivisionError: + logger.info(f"Iteration {(i + 1)} {get_loss()}") + # save the v and w parameters of the MRF + v_opt = sess.run(v) + w_opt = sess.run(w) + + no_gap_states = self.states - 1 + return v_opt[:, :no_gap_states], w_opt[:, :no_gap_states, :, :no_gap_states] + + def initialize_v_w(self, remove_gap_entries=True): + """ + For optimization of v and w ADAM is used here (L-BFGS-B not (yet) + implemented for TF 2.x, e.g. using scipy.optimize.minimize). + Gaps (char '-' respectively '21') included. + """ + w_ini = np.zeros((self.n_col, self.states, self.n_col, self.states)) + onehot_cat_msa = np.eye(self.states)[self.msa_trimmed] + aa_counts = np.sum(onehot_cat_msa, axis=0) + pseudo_count = 0.01 * np.log(self.n_eff) + v_ini = np.log(np.sum(onehot_cat_msa.T * self.msa_weights, -1).T + pseudo_count) + v_ini = v_ini - np.mean(v_ini, -1, keepdims=True) + # loss_score_ini = self.objective(v_ini, w_ini, flattened=False) + + if remove_gap_entries: + no_gap_states = self.states - 1 + v_ini = v_ini[:, :no_gap_states] + w_ini = w_ini[:, :no_gap_states, :, :no_gap_states] + aa_counts = aa_counts[:, :no_gap_states] + + return v_ini, w_ini, aa_counts + + @property + def get_v_w_opt(self): + try: + return self.v_opt, self.w_opt + except AttributeError: + raise SystemError( + "No v_opt and w_opt available, this means GREMLIN " + "has not been initialized setting optimize to True, " + "e.g., try GREMLIN('Alignment.fasta', optimize=True)." + ) + + def get_score(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.0, recompute_z=False): + if v is None or w is None: + if self.optimize: + v, w = self.v_opt, self.w_opt + else: + v, w, _ = self.initialize_v_w(remove_gap_entries=True) + if v_idx is None: + v_idx = self.v_idx + seqs_int = self.seq2int(seqs) + # if length of sequence != length of model use only + # valid positions (v_idx) from the trimmed alignment + try: + if seqs_int.shape[-1] != len(v_idx): + seqs_int = seqs_int[..., v_idx] + except IndexError: + raise SystemError( + "The loaded GREMLIN parameter model does not match the input model " + "in terms of sequence encoding shape or is a gap-substituted sequence. " + "E.g., when providing two different DCA models/parameters provided by: " + "\"-m DCA\" and \"--params GREMLIN\", where -m DCA represents a ml input " + "model potentially generated using plmc parameters and --params GREMLIN " + "provides differently encoded sequences generated using GREMLIN." 
+ ) + + # one hot encode + x = np.eye(self.states)[seqs_int] + # aa_pos_counts = np.sum(x, axis=0) + + # get non-gap positions + # no_gap = 1.0 - x[..., -1] + + # remove gap from one-hot-encoding + x = x[..., :-1] + + # compute score + vw = v + np.tensordot(x, w, 2) + + # ============================================================================================ + # Note, Z (the partition function) is a constant. In GREMLIN, V, W & Z are estimated using all + # the original weighted input sequence(s). It is NOT recommended to recalculate z with a + # different set of sequences. Given the common ERROR of recomputing Z, we include the option + # to do so, for comparison. + # ============================================================================================ + h = np.sum(np.multiply(x, vw), axis=-1) + + if encode: + return h + + if recompute_z: + z = logsumexp(vw, axis=-1) + return np.sum((h - z), axis=-1) - h_wt_seq + else: + return np.sum(h, axis=-1) - h_wt_seq + + def get_wt_score(self, wt_seq=None, v=None, w=None, encode=False): + if wt_seq is None: + wt_seq = self.wt_seq + if v is None or w is None: + if self.optimize: + v, w = self.v_opt, self.w_opt + else: + v, w = self.v_ini, self.w_ini + wt_seq = np.array(wt_seq, dtype=str) + return self.get_score(wt_seq, v, w, encode=encode) + + def collect_encoded_sequences(self, seqs, v=None, w=None, v_idx=None): + """ + Wrapper function for encoding input sequences using the self.get_score + function with encode set to True. + """ + xs = self.get_score(seqs, v, w, v_idx, encode=True) + return xs + + @staticmethod + def normalize(apc_mat): + """ + Normalization of APC matrix for getting z-Score matrix + """ + dim = apc_mat.shape[0] + apc_mat_flat = apc_mat.flatten() + x, _ = boxcox(apc_mat_flat - np.amin(apc_mat_flat) + 1.0) + x_mean = np.mean(x) + x_std = np.std(x) + x = (x - x_mean) / x_std + x = x.reshape(dim, dim) + return x + + def mtx_gaps_as_zeros(self, gap_reduced_mtx, insert_gap_zeros=True): + """ + Inserts zeros at gap positions of the (L,L) matrices, + i.e., raw/apc/zscore matrices. 
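+
+        Worked example (sketch): with n_col_ori = 4 and gaps = [1], a
+        gap-reduced 3x3 matrix
+            [[a, b, c],
+             [d, e, f],
+             [g, h, i]]
+        is expanded to the 4x4 matrix
+            [[a, 0, b, c],
+             [0, 0, 0, 0],
+             [d, 0, e, f],
+             [g, 0, h, i]].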
+ """ + if insert_gap_zeros: + gap_reduced_mtx = list(gap_reduced_mtx) + mtx_zeroed = [] + c = 0 + for i in range(self.n_col_ori): + mtx_i = [] + c_i = 0 + if i in self.gaps: + mtx_zeroed.append(list(np.zeros(self.n_col_ori))) + else: + for j in range(self.n_col_ori): + if j in self.gaps: + mtx_i.append(0.0) + else: + mtx_i.append(gap_reduced_mtx[c][c_i]) + c_i += 1 + mtx_zeroed.append(mtx_i) + c += 1 + return np.array(mtx_zeroed) + + else: + return gap_reduced_mtx + + def get_correlation_matrix(self, matrix_type: str = 'apc', insert_gap_zeros=False): + """ + Requires optimized w matrix (of shape (L, 20, L, 20)) + inputs + ------------------------------------------------------ + w : coevolution shape=(L,A,L,A) + ------------------------------------------------------ + outputs + ------------------------------------------------------ + raw : l2norm(w) shape=(L,L) + apc : apc(raw) shape=(L,L) + zscore : normalize(apc) shape=(L,L) + """ + # l2norm of 20x20 matrices (note: gaps already excluded) + raw = np.sqrt(np.sum(np.square(self.w_opt), (1, 3))) + + # apc (average product correction) + ap = np.sum(raw, 0, keepdims=True) * np.sum(raw, 1, keepdims=True) / np.sum(raw) + apc = raw - ap + + if matrix_type == 'apc': + return self.mtx_gaps_as_zeros(apc, insert_gap_zeros=insert_gap_zeros) + elif matrix_type == 'raw': + return self.mtx_gaps_as_zeros(raw, insert_gap_zeros=insert_gap_zeros) + elif matrix_type == 'zscore' or matrix_type == 'z_score': + return self.mtx_gaps_as_zeros(self.normalize(apc), insert_gap_zeros=insert_gap_zeros) + else: + raise SystemError("Unknown matrix type. Choose between 'apc', 'raw', or 'zscore'.") + + def plot_correlation_matrix(self, matrix_type: str = 'apc', set_diag_zero=True): + matrix = self.get_correlation_matrix(matrix_type, insert_gap_zeros=True) + if set_diag_zero: + np.fill_diagonal(matrix, 0.0) + + fig, ax = plt.subplots(figsize=(10, 10)) + + if matrix_type == 'zscore' or matrix_type == 'z_score': + ax.imshow(matrix, cmap='Blues', interpolation='none', vmin=1, vmax=3) + else: + ax.imshow(matrix, cmap='Blues') + tick_pos = ax.get_xticks() + tick_pos = np.array([int(t) for t in tick_pos]) + tick_pos[-1] = matrix.shape[0] + if tick_pos[2] > 1: + tick_pos[2:] -= 1 + ax.set_xticks(tick_pos) + ax.set_yticks(tick_pos) + labels = [item.get_text() for item in ax.get_xticklabels()] + try: + labels = [labels[0]] + [str(int(label) + 1) for label in labels[1:]] + except ValueError: + pass + ax.set_xticklabels(labels) + ax.set_yticklabels(labels) + ax.set_xlim(-1, matrix.shape[0]) + ax.set_ylim(-1, matrix.shape[0]) + plt.title(matrix_type.upper()) + plt.savefig(f'{matrix_type}.png', dpi=500) + plt.close('all') + + def get_top_coevolving_residues(self, wt_seq=None, min_distance=0, sort_by="apc"): + if wt_seq is None: + wt_seq = self.wt_seq + if wt_seq is None: + raise SystemError("Getting top co-evolving residues requires " + "the wild type sequence as input.") + raw = self.get_correlation_matrix(matrix_type='raw') + apc = self.get_correlation_matrix(matrix_type='apc') + zscore = self.get_correlation_matrix(matrix_type='zscore') + + # Explore top co-evolving residue pairs + i_rel_idx = self.w_rel_idx[:, 0] + j_rel_idx = self.w_rel_idx[:, 1] + + apc_flat = [] + zscore_flat = [] + raw_flat = [] + for i, _ in enumerate(i_rel_idx): + raw_flat.append(raw[i_rel_idx[i]][j_rel_idx[i]]) + apc_flat.append(apc[i_rel_idx[i]][j_rel_idx[i]]) + zscore_flat.append(zscore[i_rel_idx[i]][j_rel_idx[i]]) + + i_idx = self.w_idx[:, 0] + j_idx = self.w_idx[:, 1] + + i_aa = [f"{wt_seq[i]}_{i + 1}" 
for i in i_idx] + j_aa = [f"{wt_seq[j]}_{j + 1}" for j in j_idx] + + # load mtx into pandas dataframe + mtx = { + "i": i_idx, "j": j_idx, "apc": apc_flat, "zscore": zscore_flat, + "raw": raw_flat, "i_aa": i_aa, "j_aa": j_aa + } + df_mtx = pd.DataFrame(mtx, columns=["i", "j", "apc", "zscore", "raw", "i_aa", "j_aa"]) + df_mtx_sorted = df_mtx.sort_values(sort_by, ascending=False) + + # get contacts with sequence separation > min_distance + df_mtx_sorted_mindist = df_mtx_sorted.loc[df_mtx['j'] - df_mtx['i'] > min_distance] + + return df_mtx_sorted_mindist + + +""" +GREMLIN class helper functions below. +""" + + +def save_gremlin_as_pickle(alignment: str, wt_seq: str, opt_iter: int = 100): + """ + Function for getting and/or saving (optimized or unoptimized) GREMLIN model + """ + logger.info(f'Inferring GREMLIN DCA parameters based on the provided MSA...') + gremlin = GREMLIN(alignment, wt_seq=wt_seq, optimize=True, opt_iter=opt_iter) + try: + mkdir('Pickles') + except FileExistsError: + pass + + logger.info(f'Saving GREMLIN model as Pickle file...') + pickle.dump( + { + 'model': gremlin, + 'model_type': 'GREMLINpureDCA', + 'beta_1': None, + 'beta_2': None, + 'spearman_rho': None, + 'regressor': None + }, + open('Pickles/GREMLIN', 'wb') + ) + return gremlin + + +def plot_all_corr_mtx(gremlin: GREMLIN): + gremlin.plot_correlation_matrix(matrix_type='raw') + gremlin.plot_correlation_matrix(matrix_type='apc') + gremlin.plot_correlation_matrix(matrix_type='zscore') + + +def save_corr_csv(gremlin: GREMLIN, min_distance: int = 0, sort_by: str = 'apc'): + df_mtx_sorted_mindist = gremlin.get_top_coevolving_residues( + min_distance=min_distance, sort_by=sort_by + ) + df_mtx_sorted_mindist.to_csv(f"coevolution_{sort_by}_sorted.csv") diff --git a/pypef/dca/hybrid_model.py b/pypef/dca/hybrid_model.py index f6724e6..509fc0f 100644 --- a/pypef/dca/hybrid_model.py +++ b/pypef/dca/hybrid_model.py @@ -1,1207 +1,1224 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. 
Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -# Contains Python code used for the approach presented in our 'hybrid modeling' paper -# Preprint available at: https://doi.org/10.1101/2022.06.07.495081 -# Code available at: https://github.com/Protein-Engineering-Framework/Hybrid_Model - -from __future__ import annotations - -import os -import pickle -from os import listdir -from os.path import isfile, join -from typing import Union -import logging -logger = logging.getLogger('pypef.dca.hybrid_model') - -import numpy as np -import sklearn.base -from scipy.stats import spearmanr -from sklearn.linear_model import Ridge -from sklearn.model_selection import GridSearchCV, train_test_split -from scipy.optimize import differential_evolution - -from pypef.utils.variant_data import ( - get_sequences_from_file, get_seqs_from_var_name, - remove_nan_encoded_positions, get_wt_sequence, split_variants -) - -from pypef.dca.plmc_encoding import PLMC, get_dca_data_parallel, get_encoded_sequence, EffectiveSiteError -from pypef.utils.to_file import predictions_out -from pypef.utils.plot import plot_y_true_vs_y_pred -import pypef.dca.gremlin_inference -from pypef.dca.gremlin_inference import GREMLIN - - -class DCAHybridModel: - alphas = np.logspace(-6, 6, 100) # Grid for the parameter 'alpha'. - parameter_range = [(0, 1), (0, 1)] # Parameter range of 'beta_1' and 'beta_2' with lb <= x <= ub - # TODO: Implementation of other regression techniques (CVRegression models) - - def __init__( - self, - alphas=alphas, - parameter_range=None, - x_train: np.ndarray = None, - y_train: np.ndarray = None, - x_test: np.ndarray = None, # not necessary for training - y_test: np.ndarray = None, # not necessary for training - x_wt=None - ): - if parameter_range is None: - parameter_range = parameter_range - self._alphas = alphas - self._parameter_range = parameter_range - self.x_train = x_train - self.y_train = y_train - self.x_test = x_test - self.y_test = y_test - self.X = np.concatenate((x_train, x_test), axis=0) if self.x_test is not None else self.x_train - self.y = np.concatenate((y_train, y_test), axis=0) if self.y_test is not None else self.y_train - self.x_wild_type = x_wt - self._spearmanr_dca = self._spearmanr_dca() - self.beta_1, self.beta_2, self.regressor = self.settings(self.x_train, self.y_train) - - @staticmethod - def spearmanr( - y1: np.ndarray, - y2: np.ndarray - ) -> float: - """ - Parameters - ---------- - y1 : np.ndarray - Array of target fitness values. - y2 : np.ndarray - Array of predicted fitness values. - - Returns - ------- - Spearman's rank correlation coefficient. - """ - return spearmanr(y1, y2)[0] - - @staticmethod - def _standardize( - x: np.ndarray, - axis=0 - ) -> np.ndarray: - """ - Standardizes the input array x by subtracting the mean - and dividing it by the (sample) standard deviation. - - Parameters - ---------- - x : np.ndarray - Array to be standardized. - axis : integer (default=0) - Axis to exectute operations on. - - Returns - ------- - Standardized version of 'x'. - """ - return np.subtract(x, np.mean(x, axis=axis)) / np.std(x, axis=axis, ddof=1) - - def _delta_x( - self, - x: np.ndarray - ) -> np.ndarray: - """ - Substracts for each variant the encoded wild-type sequence - from its encoded sequence. 
- - Parameters - ---------- - x : np.ndarray - Array of encoded variant sequences (matrix X). - - Returns - ------- - Array of encoded variant sequences with substracted encoded - wild-type sequence. - """ - return np.subtract(x, self.x_wild_type) - - def _delta_e( - self, - x: np.ndarray - ) -> np.ndarray: - """ - Calculates the difference of the statistical energy 'dE' - of the variant and wild-type sequence. - - dE = E (variant) - E (wild-type) - with E = \sum_{i} h_i (o_i) + \sum_{i float: - """ - Returns - ------- - Spearman's rank correlation coefficient of the full - data and the statistical DCA predictions (difference - of statistical energies). Used to adjust the sign - of hybrid predictions, i.e. - beta_1 * y_dca + beta_2 * y_ridge - or - beta_1 * y_dca - beta_2 * y_ridge. - """ - y_dca = self._delta_e(self.X) - return self.spearmanr(self.y, y_dca) - - def ridge_predictor( - self, - x_train: np.ndarray, - y_train: np.ndarray, - ) -> object: - """ - Sets the parameter 'alpha' for ridge regression. - - Parameters - ---------- - x_train : np.ndarray - Array of the encoded sequences for training. - y_train : np.ndarray - Associated fitness values to the sequences present - in 'x_train'. - - Returns - ------- - Ridge object trained on 'x_train' and 'y_train' (cv=5) - with optimized 'alpha'. - """ - grid = GridSearchCV(Ridge(), {'alpha': self._alphas}, cv=5) - grid.fit(x_train, y_train) - return Ridge(**grid.best_params_).fit(x_train, y_train) - - def _y_hybrid( - self, - y_dca: np.ndarray, - y_ridge: np.ndarray, - beta_1: float, - beta_2: float - ) -> np.ndarray: - """ - Chooses sign for connecting the parts of the hybrid model. - - Parameters - ---------- - y_dca : np.ndarray - Difference of the statistical energies of variants - and wild-type. - y_ridge : np.ndarray - (Ridge) predicted fitness values of the variants. - b1 : float - Float between [0,1] coefficient for regulating DCA - model contribution. - b2 : float - Float between [0,1] coefficient for regulating ML - model contribution. - - Returns - ------- - The predicted fitness value-representatives of the hybrid - model. - """ - # Uncomment lines below to see if correlation between - # y_true and y_dca is positive or negative: - # logger.info(f'Positive or negative correlation of (all data) y_true ' - # f'and y_dca (+/-?): {self._spearmanr_dca:.3f}') - if self._spearmanr_dca >= 0: - return beta_1 * y_dca + beta_2 * y_ridge - else: # negative correlation - return beta_1 * y_dca - beta_2 * y_ridge - - def _adjust_betas( - self, - y: np.ndarray, - y_dca: np.ndarray, - y_ridge: np.ndarray - ) -> np.ndarray: - """ - Find parameters that maximize the absolut Spearman rank - correlation coefficient using differential evolution. - - Parameters - ---------- - y : np.ndarray - Array of fitness values. - y_dca : np.ndarray - Difference of the statistical energies of variants - and wild-type. - y_ridge : np.ndarray - (Ridge) predicted fitness values of the variants. - - Returns - ------- - 'beta_1' and 'beta_2' that maximize the absolut Spearman rank correlation - coefficient. - """ - loss = lambda b: -np.abs(self.spearmanr(y, b[0] * y_dca + b[1] * y_ridge)) - minimizer = differential_evolution(loss, bounds=self.parameter_range, tol=1e-4) - return minimizer.x - - def settings( - self, - x_train: np.ndarray, - y_train: np.ndarray, - train_size_fit=0.66, - random_state=42 - ) -> tuple: - """ - Get the adjusted parameters 'beta_1', 'beta_2', and the - tuned regressor of the hybrid model. 
- - Parameters - ---------- - x_train : np.ndarray - Encoded sequences of the variants in the training set. - y_train : np.ndarray - Fitness values of the variants in the training set. - train_size_fit : float [0,1] (default 0.66) - Fraction to split training set into another - training and testing set. - random_state : int (default=224) - Random state used to split. - - Returns - ------- - Tuple containing the adjusted parameters 'beta_1' and 'beta_2', - as well as the tuned regressor of the hybrid model. - """ - try: - X_ttrain, X_ttest, y_ttrain, y_ttest = train_test_split( - x_train, y_train, - train_size=train_size_fit, - random_state=random_state - ) - - except ValueError: - """ - Not enough sequences to construct a sub-training and sub-testing - set when splitting the training set. - - Machine learning/adjusting the parameters 'beta_1' and 'beta_2' not - possible -> return parameter setting for 'EVmutation' model. - """ - return 1.0, 0.0, None - - """ - The sub-training set 'y_ttrain' is subjected to a five-fold cross - validation. This leads to the constraint that at least two sequences - need to be in the 20 % of that set in order to allow a ranking. - - If this is not given -> return parameter setting for 'EVmutation' model. - """ - y_ttrain_min_cv = int(0.2 * len(y_ttrain)) # 0.2 because of five-fold cross validation (1/5) - if y_ttrain_min_cv < 2: - return 1.0, 0.0, None - - y_dca_ttest = self._delta_e(X_ttest) - - ridge = self.ridge_predictor(X_ttrain, y_ttrain) - y_ridge_ttest = ridge.predict(X_ttest) - - beta1, beta2 = self._adjust_betas(y_ttest, y_dca_ttest, y_ridge_ttest) - return beta1, beta2, ridge - - def hybrid_prediction( - self, - x: np.ndarray, - reg: object, # any regression-based estimator (from sklearn) - beta_1: float, - beta_2: float - ) -> np.ndarray: - """ - Use the regressor 'reg' and the parameters 'beta_1' - and 'beta_2' for constructing a hybrid model and - predicting the fitness associates of 'X'. - - Parameters - ---------- - x : np.ndarray - Encoded sequences X used for prediction. - reg : object - Tuned ridge regressor for the hybrid model. - beta_1 : float - Float for regulating EVmutation model contribution. - beta_2 : float - Float for regulating Ridge regressor contribution. - - Returns - ------- - Predicted fitness associates of 'X' using the - hybrid model. - """ - y_dca = self._delta_e(x) - if reg is None: - y_ridge = np.random.random(len(y_dca)) # in order to suppress error - else: - y_ridge = reg.predict(x) - # adjusting: + or - on all data --> +-beta_1 * y_dca + beta_2 * y_ridge - return self._y_hybrid(y_dca, y_ridge, beta_1, beta_2) - - def split_performance( - self, - train_size: float = 0.8, - n_runs: int = 10, - seed: int = 42, - save_model: bool = False - ) -> dict: - """ - Estimates performance of the model. - - Parameters - ---------- - train_size : int or float (default=0.8) - Number of samples in the training dataset - or fraction of full dataset used for training. - n_runs : int (default=10) - Number of different splits to perform. - seed : int (default=42) - Seed for random generator. - save_model : bool (default=False) - If True, model is saved using pickle, else not. - - Returns - ------- - data : dict - Contains information about hybrid model parameters - and performance results. 
- """ - data = {} - np.random.seed(seed) - - for r, random_state in enumerate(np.random.randint(100, size=n_runs)): - x_train, x_test, y_train, y_test = train_test_split( - self.X, self.y, train_size=train_size, random_state=random_state) - beta_1, beta_2, reg = self.settings(x_train, y_train) - if beta_2 == 0.0: - alpha = np.nan - else: - if save_model: - pickle.dumps(reg) - alpha = reg.alpha - data.update( - {f'{len(y_train)}_{r}': - { - 'no_run': r, - 'n_y_train': len(y_train), - 'n_y_test': len(y_test), - 'rnd_state': random_state, - 'spearman_rho': self.spearmanr( - y_test, self.hybrid_prediction( - x_test, reg, beta_1, beta_2 - ) - ), - 'beta_1': beta_1, - 'beta_2': beta_2, - 'alpha': alpha - } - } - ) - - return data - - def ls_ts_performance(self): - beta_1, beta_2, reg = self.settings( - x_train=self.x_train, - y_train=self.y_train - ) - spearman_r = self.spearmanr( - self.y_test, - self.hybrid_prediction(self.x_test, reg, beta_1, beta_2) - ) - return spearman_r, reg, beta_1, beta_2 - - def train_and_test( - self, - train_percent_fit: float = 0.66, - random_state: int = 42 - ): - """ - Description - ---------- - Trains the hybrid model on a relative number of all variants - and returns the individual model contribution weights beta_1 (DCA) - and beta_2 (ML) as well as the hyperparameter-tuned regression model, - e.g. to save all the hybrid model parameters for later loading as - Pickle file. - - Parameters - ---------- - train_percent_fit: float (default = 0.66) - Relative number of variants used for model fitting (not - hyperparameter validation. Default of 0.66 and overall train - size of 0.8 means the total size for least squares fitting - is 0.8 * 0.66 = 0.528, thus for hyperparameter validation - the size is 0.8 * 0.33 = 0.264 and for testing the size is - 1 - 0.528 - 0.264 = 0.208. - random_state: int (default = 42) - Random state for splitting (and reproduction of results). - - Returns - ---------- - beta_1: float - DCA model contribution to hybrid model predictions. - beta_2: float - ML model contribution to hybrid model predictions. - reg: object - sklearn Estimator class, e.g. sklearn.linear_model.Ridge - fitted and with optimized hyperparameters (e.g. alpha). - self._spearmanr_dca: float - To determine, if spearmanr_dca (i.e. DCA correlation) and measured - fitness values is positive (>= 0) or negative (< 0). - test_spearman_r : float - Achieved performance in terms of Spearman's rank correlation - between measured and predicted test set variant fitness values. - """ - beta_1, beta_2, reg = self.settings( - x_train=self.x_train, - y_train=self.y_train, - train_size_fit=train_percent_fit, - random_state=random_state - ) - - if len(self.y_test) > 0: - test_spearman_r = self.spearmanr( - self.y_test, - self.hybrid_prediction( - self.x_test, reg, beta_1, beta_2 - ) - ) - else: - test_spearman_r = None - return beta_1, beta_2, reg, self._spearmanr_dca, test_spearman_r - - def get_train_sizes(self) -> np.ndarray: - """ - Generates a list of train sizes to perform low-n with. - - Returns - ------- - Numpy array of train sizes up to 80% (i.e. 0.8 * N_variants). 
- """ - eighty_percent = int(len(self.y) * 0.8) - - train_sizes = np.sort(np.concatenate([ - np.arange(15, 50, 5), np.arange(50, 100, 10), - np.arange(100, 150, 20), [160, 200, 250, 300, eighty_percent], - np.arange(400, 1100, 100) - ])) - - idx_max = np.where(train_sizes >= eighty_percent)[0][0] + 1 - return train_sizes[:idx_max] - - def run( - self, - train_sizes: list = None, - n_runs: int = 10 - ) -> dict: - """ - - Returns - ---------- - data: dict - Performances of the split with size of the - training set = train_size and size of the - test set = N_variants - train_size. - """ - data = {} - for t, train_size in enumerate(train_sizes): - logger.info(f'{t + 1}/{len(train_sizes)}:{train_size}') - data.update(self.split_performance(train_size=train_size, n_runs=n_runs)) - return data - - -""" -Below: Some helper functions that call or are dependent on the DCAHybridModel class. -""" - - -def check_model_type(model: dict | DCAHybridModel | PLMC | GREMLIN): - """ - Checks type/instance of model. - """ - if type(model) == dict: - try: - model = model['model'] - except KeyError: - raise SystemError("Unknown model dictionary taken from Pickle file.") - if type(model) == pypef.dca.plmc_encoding.PLMC: - return 'PLMC' - elif type(model) == pypef.dca.hybrid_model.DCAHybridModel: - return 'Hybrid' - elif type(model) == pypef.dca.gremlin_inference.GREMLIN: - return 'GREMLIN' - elif isinstance(model, sklearn.base.BaseEstimator): - raise SystemError("Loaded an sklearn ML model. For pure ML-based modeling the " - "\'ml\' flag has to be used instead of the \'hybrid\' flag.") - else: - raise SystemError('Unknown model/unknown Pickle file.') - - -def get_model_path(model: str): - try: - if isfile(model): - model_path = model - elif isfile(f'Pickles/{model}'): - model_path = f'Pickles/{model}' - else: - raise SystemError("Did not find specified model file.") - return model_path - except TypeError: - raise SystemError("No provided model. " - "Specify a model for DCA-based encoding.") - - -def get_model_and_type(params_file: str, substitution_sep: str = '/'): - file_path = get_model_path(params_file) - try: - with open(file_path, 'rb') as read_pkl_file: - model = pickle.load(read_pkl_file) - model_type = check_model_type(model) - except pickle.UnpicklingError: - model_type = 'PLMC_Params' - - if model_type == 'PLMC_Params': - model = PLMC( - params_file=params_file, - separator=substitution_sep, - verbose=False - ) - model_type = 'PLMC' - - else: # --> elif model_type in ['PLMC', 'GREMLIN', 'Hybrid']: - model = model['model'] - - return model, model_type - - -def save_model_to_dict_pickle( - model: DCAHybridModel | PLMC | GREMLIN, - model_type: str | None = None, - beta_1: float | None = None, - beta_2: float | None = None, - spearman_r: float | None = None, - regressor: sklearn.base.BaseEstimator = None -): - try: - os.mkdir('Pickles') - except FileExistsError: - pass - - if model_type is None: - model_type = 'MODEL' - # else: - # model_type += '_MODEL' - logger.info(f'Save model as Pickle file... 
{model_type}') - pickle.dump( - { - 'model': model, - 'model_type': model_type, - 'beta_1': beta_1, - 'beta_2': beta_2, - 'spearman_rho': spearman_r, - 'regressor': regressor - }, - open(f'Pickles/{model_type}', 'wb') - ) - - -global_model = None -global_model_type = None - - -def plmc_or_gremlin_encoding( - variants, - sequences, - ys_true, - params_file, - substitution_sep='/', - threads=1, - verbose=True, - use_global_model=False -): - """ - Decides based on the params file input type which DCA encoding to be performed, i.e., - GREMLIN or PLMC. - If use_global_model==True, to avoid each time pickle model file getting loaded, which - is quite inefficient when performing directed evolution, i.e., encoding of single - sequences, a global model is stored at the first evolution step and used in the - subsequent steps. - """ - global global_model, global_model_type - if ys_true is None: - ys_true = np.zeros(np.shape(sequences)) - if use_global_model: - if global_model is None: - global_model, global_model_type = get_model_and_type(params_file, substitution_sep) - model, model_type = global_model, global_model_type - else: - model, model_type = global_model, global_model_type - else: - model, model_type = get_model_and_type(params_file, substitution_sep) - if model_type == 'PLMC': - xs, x_wt, variants, sequences, ys_true = plmc_encoding( - model, variants, sequences, ys_true, threads, verbose - ) - elif model_type == 'GREMLIN': - if verbose: - logger.info(f"Following positions are frequent gap positions in the MSA " - f"and cannot be considered for effective modeling, i.e., " - f"substitutions at these positions are removed as these would be " - f"predicted as wild type:\n{[gap + 1 for gap in model.gaps]}.\n" - f"Effective positions (N={len(model.v_idx)}) are:\n" - f"{[v_pos + 1 for v_pos in model.v_idx]}") - xs, x_wt, variants, sequences, ys_true = gremlin_encoding( - model, variants, sequences, ys_true, - shift_pos=1, substitution_sep=substitution_sep - ) - else: - raise SystemError( - f"Found a {model_type.lower()} model as input. Please train a new " - f"hybrid model on the provided LS/TS datasets." - ) - assert len(xs) == len(variants) == len(sequences) == len(ys_true) - return xs, variants, sequences, ys_true, x_wt, model, model_type - - -def gremlin_encoding(gremlin: GREMLIN, variants, sequences, ys_true, shift_pos=1, substitution_sep='/'): - """ - Gets X and x_wt for DCA prediction: delta_Hamiltonian respectively - delta_E = np.subtract(X, x_wt), with X = encoded sequences of variants. - Also removes variants, sequences, and y_trues at MSA gap positions. - """ - variants, sequences, ys_true = np.atleast_1d(variants), np.atleast_1d(sequences), np.atleast_1d(ys_true) - variants, sequences, ys_true = remove_gap_pos( - gremlin.gaps, variants, sequences, ys_true, - shift_pos=shift_pos, substitution_sep=substitution_sep - ) - try: - xs = gremlin.get_score(sequences, encode=True) - except SystemError: - xs = [] - x_wt = gremlin.get_score(np.atleast_1d(gremlin.wt_seq), encode=True) - return xs, x_wt, variants, sequences, ys_true - - -def plmc_encoding(plmc: PLMC, variants, sequences, ys_true, threads=1, verbose=False): - """ - Gets X and x_wt for DCA prediction: delta_E = np.subtract(X, x_wt), - with X = encoded sequences of variants. - Also removes variants, sequences, and y_trues at MSA gap positions. 
- """ - target_seq, index = plmc.get_target_seq_and_index() - wt_name = target_seq[0] + str(index[0]) + target_seq[0] - if verbose: - logger.info(f"Using to-self-substitution '{wt_name}' as wild type reference. " - f"Encoding variant sequences. This might take some time...") - x_wt = get_encoded_sequence(wt_name, plmc) - if threads > 1: - # Hyperthreading, NaNs are already being removed by the called function - variants, sequences, xs, ys_true = get_dca_data_parallel( - variants, sequences, ys_true, plmc, threads, verbose=verbose) - else: - x_ = plmc.collect_encoded_sequences(variants) - # NaNs must still be removed - xs, variants, sequences, ys_true = remove_nan_encoded_positions( - x_, variants, sequences, ys_true - ) - return xs, x_wt, variants, sequences, ys_true - - -def remove_gap_pos( - gaps, - variants, - sequences, - fitnesses, - shift_pos=1, - substitution_sep='/' -): - """ - Remove gap postions from input variants, sequences, and fitness values - based on input gaps (gap positions). - Note that by default, gap positions are shifted by +1 to match the input - variant identifiers (e.g., variant A123C is removed if gap pos is 122; (122 += 1). - - Returns - ----------- - variants_v - Variants with substitutions at valid sequence positions, i.e., at non-gap positions - sequences_v - Sequences of variants with substitutions at valid sequence positions, i.e., at non-gap positions - fitnesses_v - Fitness values of variants with substitutions at valid sequence positions, i.e., at non-gap positions - """ - variants_v, sequences_v, fitnesses_v = [], [], [] - valid = [] - for i, variant in enumerate(variants): - variant = variant.split(substitution_sep) - for var in variant: - if int(var[1:-1]) not in [gap + shift_pos for gap in gaps]: - if i not in valid: - valid.append(i) - variants_v.append(variants[i]) - sequences_v.append(sequences[i]) - fitnesses_v.append(fitnesses[i]) - return variants_v, sequences_v, fitnesses_v - - -def get_delta_e_statistical_model( - x_test: np.ndarray, - x_wt: np.ndarray -): - """ - Description - ----------- - Delta_E means difference in evolutionary energy in plmc terms. - In other words, this is the delta of the sum of Hamiltonian-encoded - sequences of local fields and couplings of encoded sequence and wild-type - sequence in GREMLIN terms. - - Parameters - ----------- - x_test: np.ndarray [2-dim] - Encoded sequences to be subtracted by x_wt to compute delta E. - x_wt: np.ndarray [1-dim] - Encoded wild-type sequence. - - Returns - ----------- - delta_e: np.ndarray [1-dim] - Summed subtracted encoded sequences. - - """ - delta_x = np.subtract(x_test, x_wt) - delta_e = np.sum(delta_x, axis=1) - return delta_e - - -def generate_model_and_save_pkl( - variants, - ys_true, - params_file, - wt, - train_percent_fit: float = 0.66, # percent of all data: 0.8 * 0.66 - test_percent: float = 0.2, - random_state: int = 42, - substitution_sep = '/', - threads=1 -): - """ - Description - ----------- - Save (Ridge) regression model (trained and with tuned alpha parameter) - with betas (beta_1 and beta_2) as dictionary-structured pickle file. - - Parameters - ---------- - test_percent: float - Percent of DataFrame data used for testing. The remaining data is - used for training (fitting and validation). - train_percent_fit: float - Percent of DataFrame data to train on. - The remaining data is used for validation. - random_state: int - Random seed for splitting in train and test data for reproducing results. 
- - Returns - ---------- - () - Just saving model parameters as pickle file. - """ - wt_seq = get_wt_sequence(wt) - variants_splitted = split_variants(variants, substitution_sep) - variants, ys_true, sequences = get_seqs_from_var_name(wt_seq, variants_splitted, ys_true) - - xs, variants, sequences, ys_true, x_wt, model, model_type = plmc_or_gremlin_encoding( - variants, sequences, ys_true, params_file, substitution_sep, threads) - - logger.info( - f'Train size (fitting): {train_percent_fit * 100:.1f} % of training data ' - f'({((1 - test_percent) * train_percent_fit) * 100:.1f} % of all data)\n' - f'Train size validation: {(1 - train_percent_fit) * 100:.1f} % of training data ' - f'({((1 - test_percent) * (1 - train_percent_fit)) * 100:.1f} % of all data)\n' - f'Test size: {test_percent * 100:.1f} % ({test_percent * 100:.1f} % of all data)\n' - f'Using random state: {random_state}...\n' - ) - - x_train, x_test, y_train, y_test = train_test_split( - xs, ys_true, test_size=test_percent, random_state=random_state - ) - - hybrid_model = DCAHybridModel( - x_train=x_train, - y_train=y_train, - x_test=x_test, - y_test=y_test, - x_wt=x_wt - ) - - beta_1, beta_2, reg, spearman_dca, test_spearman_r = hybrid_model.train_and_test( - train_percent_fit=train_percent_fit, - random_state=random_state - ) - if reg is None: - alpha_ = 'None' - else: - alpha_ = f'{reg.alpha:.3f}' - logger.info( - f'Individual model weights and regressor hyperparameters:\n' - f'Hybrid model individual model contributions:\nBeta1 (DCA): ' - f'{beta_1:.3f}, Beta2 (ML): {beta_2:.3f} (' - f'regressor: Ridge(alpha={alpha_}))\n' - f'Test performance: Spearman\'s rho = {test_spearman_r:.3f}' - ) - try: - os.mkdir('Pickles') - except FileExistsError: - pass - model_name = f'HYBRID{model_type.lower()}' - save_model_to_dict_pickle(hybrid_model, model_name, beta_1, beta_2, test_spearman_r, reg) - - -def performance_ls_ts( - ls_fasta: str | None, - ts_fasta: str | None, - threads: int, - params_file: str, - model_pickle_file: str | None = None, - substitution_sep: str = '/', - label=False -): - """ - Description - ----------- - Computes performance based on a (linear) regression model trained - on the training set by optimizing model hyperparameters based on - validation performances on training subsets (default: 5-fold CV) - and predicting test set entries using the hyperparmeter-tuned model - to estimate performance for model generalization. - - Parameters - ----------- - ls_fasta: str - Fasta-like file with fitness values. Will be read and extracted - for training the regressor. - ts_fasta: str - Fasta-like file with fitness values. Used for computing performance - of the tuned regressor for test set entries (performance metric of - measured and predicted fitness values). - threads: int - Number of threads to use for parallel computing using Ray. - params_file: str - PLMC parameter file (containing evolutionary, i.e. MSA-based local - and coupling terms. - model: str - Model to load for TS prediction. - separator: str - Character to split the variant to obtain the single substitutions - (default='/'). - - Returns - ----------- - None - Just plots test results (predicted fitness vs. measured fitness) - using def plot_y_true_vs_y_pred. 
- """ - test_sequences, test_variants, y_test = get_sequences_from_file(ts_fasta) - - if ls_fasta is not None and ts_fasta is not None: - train_sequences, train_variants, y_train = get_sequences_from_file(ls_fasta) - x_train, train_variants, train_sequences, y_train, x_wt, _, model_type = plmc_or_gremlin_encoding( - train_variants, train_sequences, y_train, params_file, substitution_sep, threads - ) - - x_test, test_variants, test_sequences, y_test, *_ = plmc_or_gremlin_encoding( - test_variants, test_sequences, y_test, params_file, substitution_sep, threads, verbose=False - ) - - logger.info(f"\nInitial training set variants: {len(train_sequences)}. " - f"Remaining: {len(train_variants)} (after removing " - f"substitutions at gap positions).\nInitial test set " - f"variants: {len(test_sequences)}. Remaining: {len(test_variants)} " - f"(after removing substitutions at gap positions)." - ) - - hybrid_model = DCAHybridModel( - x_train=np.array(x_train), - y_train=np.array(y_train), - x_test=np.array(x_test), - y_test=np.array(y_test), - x_wt=x_wt - ) - model_name = f'HYBRID{model_type.lower()}' - - spearman_r, reg, beta_1, beta_2 = hybrid_model.ls_ts_performance() - ys_pred = hybrid_model.hybrid_prediction(np.array(x_test), reg, beta_1, beta_2) - - if reg is None: - alpha_ = 'None' - else: - alpha_ = f'{reg.alpha:.3f}' - logger.info( - f'Individual model weights and regressor hyperparameters:\n' - f'Hybrid model individual model contributions: Beta1 (DCA): ' - f'{beta_1:.3f}, Beta2 (ML): {beta_2:.3f} (regressor: ' - f'Ridge(alpha={alpha_}))\nTesting performance...' - ) - - save_model_to_dict_pickle(hybrid_model, model_name, beta_1, beta_2, spearman_r, reg) - - elif ts_fasta is not None and model_pickle_file is not None and params_file is not None: - logger.info(f'Taking model from saved model (Pickle file): {model_pickle_file}...') - - model, model_type = get_model_and_type(model_pickle_file) - - if model_type != 'Hybrid': # same as below in next elif - x_test, test_variants, test_sequences, y_test, x_wt, *_ = plmc_or_gremlin_encoding( - test_variants, test_sequences, y_test, model_pickle_file, substitution_sep, threads, False) - ys_pred = get_delta_e_statistical_model(x_test, x_wt) - else: # Hybrid model input requires params from plmc or GREMLIN model - beta_1, beta_2, reg = model.beta_1, model.beta_2, model.regressor - x_test, test_variants, test_sequences, y_test, *_ = plmc_or_gremlin_encoding( - test_variants, test_sequences, y_test, params_file, - substitution_sep, threads, False - ) - ys_pred = model.hybrid_prediction(x_test, reg, beta_1, beta_2) - - elif ts_fasta is not None and model_pickle_file is None: # no LS provided --> statistical modeling / no ML - logger.info(f'No learning set provided, falling back to statistical DCA model: ' - f'no adjustments of individual hybrid model parameters (beta_1 and beta_2).') - test_sequences, test_variants, y_test = get_sequences_from_file(ts_fasta) - x_test, test_variants, test_sequences, y_test, x_wt, model, model_type = plmc_or_gremlin_encoding( - test_variants, test_sequences, y_test, params_file, substitution_sep, threads - ) - - logger.info(f"Initial test set variants: {len(test_sequences)}. 
" - f"Remaining: {len(test_variants)} (after removing " - f"substitutions at gap positions).") - - ys_pred = get_delta_e_statistical_model(x_test, x_wt) - - save_model_to_dict_pickle(model, model_type, None, None, spearmanr(y_test, ys_pred)[0], None) - - else: - raise SystemError('No Test Set given for performance estimation.') - - spearman_rho = spearmanr(y_test, ys_pred) - logger.info(f'Spearman Rho = {spearman_rho[0]:.3f}') - - plot_y_true_vs_y_pred( - np.array(y_test), np.array(ys_pred), np.array(test_variants), label=label, hybrid=True - ) - - -def predict_ps( # also predicting "pmult" dict directories - prediction_dict: dict, - threads: int, - separator: str, - model_pickle_file: str, - params_file: str = None, - prediction_set: str = None, - negative: bool = False -): - """ - Description - ----------- - Predicting the fitness of sequences of a prediction set - or multiple prediction sets that were exemplary created with - 'pypef mkps' based on single substitutional variant data - provided in a CSV and the wild type sequence: - pypef mkps --wt WT_SEQ --input CSV_FILE - [--drop THRESHOLD] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb] - [--ddiverse] [--tdiverse] [--qdiverse] - - Parameters - ----------- - prediction_dict: dict - Contains arguments which directory to predict, e.g. {'drecomb': True}, - than predicts prediction files that are present in this directory, e.g. - in directory './Recomb_Double_Split'. - params_file: str - PLMC couplings parameter file - threads: int - Threads used for parallelization for DCA-based sequence encoding - separator: str - Separator of individual substitution of variants, default '/' - model_pickle_file: str - Pickle file containing the hybrid model and model parameters in - a dictionary format - test_set: str = None - Test set for prediction and plotting of predictions (contains - true fitness values of variants). - prediction_set: str = None - Prediction set for prediction, does not contain true fitness values. - figure: str = None - Plotting the test set predictions and the corresponding true fitness - values. - label: bool = False - If True, plots associated variant names of predicted variants. - negative: bool = False - If true, negative defines improved variants having a reduced/negative - fitness compared to wild type. - - - Returns - ----------- - () - Writes sorted predictions to files (for [--drecomb] [--trecomb] - [--qarecomb] [--qirecomb] [--ddiverse] [--tdiverse] [--qdiverse] - in the respective created folders). - - """ - logger.info(f'Taking model from saved model (Pickle file): {model_pickle_file}...') - - model, model_type = get_model_and_type(model_pickle_file) - - if model_type == 'PLMC': - logger.info(f'No hybrid model provided – falling back to a statistical DCA model.') - elif model_type == 'Hybrid': - beta_1, beta_2, reg = model.beta_1, model.beta_2, model.regressor - if reg is None: - alpha_ = 'None' - else: - alpha_ = f'{reg.alpha:.3f}' - logger.info( - f'Individual model weights and regressor hyperparameters:\n' - f'Hybrid model individual model contributions: Beta1 (DCA): {beta_1:.3f}, ' - f'Beta2 (ML): {beta_2:.3f} (regressor: Ridge(alpha={alpha_})).' - ) - - pmult = [ - 'Recomb_Double_Split', 'Recomb_Triple_Split', 'Recomb_Quadruple_Split', - 'Recomb_Quintuple_Split', 'Diverse_Double_Split', 'Diverse_Triple_Split', - 'Diverse_Quadruple_Split' - ] - if True in prediction_dict.values(): - for ps, path in zip(prediction_dict.values(), pmult): - if ps: # if True, run prediction in this directory, e.g. 
for drecomb - logger.info(f'Running predictions for variant-sequence files in directory {path}...') - all_y_v_pred = [] - files = [f for f in listdir(path) if isfile(join(path, f)) if f.endswith('.fasta')] - for i, file in enumerate(files): # collect and predict for each file in the directory - logger.info(f'Encoding files ({i + 1}/{len(files)}) for prediction...\n') - file_path = os.path.join(path, file) - sequences, variants, _ = get_sequences_from_file(file_path) - if model_type != 'Hybrid': - x_test, test_variants, x_wt, *_ = plmc_or_gremlin_encoding( - variants, sequences, None, model, threads=threads, verbose=False, - substitution_sep=separator) - ys_pred = get_delta_e_statistical_model(x_test, x_wt) - else: # Hybrid model input requires params from plmc or GREMLIN model - ##encoding_model, encoding_model_type = get_model_and_type(params_file) - x_test, test_variants, *_ = plmc_or_gremlin_encoding( - variants, sequences, None, params_file, - threads=threads, verbose=False, substitution_sep=separator - ) - ys_pred = model.hybrid_prediction(x_test, reg, beta_1, beta_2) - for k, y in enumerate(ys_pred): - all_y_v_pred.append((ys_pred[k], variants[k])) - if negative: # sort by fitness value - all_y_v_pred = sorted(all_y_v_pred, key=lambda x: x[0], reverse=False) - else: - all_y_v_pred = sorted(all_y_v_pred, key=lambda x: x[0], reverse=True) - predictions_out( - predictions=all_y_v_pred, - model='Hybrid', - prediction_set=f'Top{path}', - path=path - ) - else: # check next task to do, e.g., predicting triple substituted variants, e.g. trecomb - continue - - elif prediction_set is not None: - sequences, variants, _ = get_sequences_from_file(prediction_set) - # NaNs are already being removed by the called function - if model_type != 'Hybrid': # statistical DCA model - xs, variants, _, _, x_wt, *_ = plmc_or_gremlin_encoding( - variants, sequences, None, params_file, - threads=threads, verbose=False, substitution_sep=separator) - ys_pred = get_delta_e_statistical_model(xs, x_wt) - else: # Hybrid model input requires params from plmc or GREMLIN model - xs, variants, *_ = plmc_or_gremlin_encoding( - variants, sequences, None, params_file, - threads=threads, verbose=True, substitution_sep=separator - ) - ys_pred = model.hybrid_prediction(xs, reg, beta_1, beta_2) - assert len(xs) == len(variants) - y_v_pred = zip(ys_pred, variants) - y_v_pred = sorted(y_v_pred, key=lambda x: x[0], reverse=True) - predictions_out( - predictions=y_v_pred, - model='Hybrid', - prediction_set=f'Top{prediction_set}' - ) - - -def predict_directed_evolution( - encoder: str, - variant: str, - sequence: str, - hybrid_model_data_pkl: str -) -> Union[str, list]: - """ - Perform directed in silico evolution and predict the fitness of a - (randomly) selected variant using the hybrid model. This function opens - the stored DCAHybridModel and the model parameters to predict the fitness - of the variant encoded herein using the PLMC class. If the variant - cannot be encoded (based on the PLMC params file), returns 'skip'. Else, - returning the predicted fitness value and the variant name. 
- """ - if hybrid_model_data_pkl is not None: - model, model_type = get_model_and_type(hybrid_model_data_pkl) - else: - model_type = 'StatisticalModel' # any name != 'Hybrid' - - if model_type != 'Hybrid': # statistical DCA model - xs, variant, _, _, x_wt, *_ = plmc_or_gremlin_encoding( - variant, sequence, None, encoder, verbose=False, use_global_model=True) - if not list(xs): - return 'skip' - y_pred = get_delta_e_statistical_model(xs, x_wt) - else: # model_type == 'Hybrid': Hybrid model input requires params from PLMC or GREMLIN model - xs, variant, *_ = plmc_or_gremlin_encoding( - variant, sequence, None, encoder, verbose=False, use_global_model=True - ) - if not list(xs): - return 'skip' - try: - y_pred = model.hybrid_prediction(np.atleast_2d(xs), model.regressor, model.beta_1, model.beta_2)[0] - except ValueError: - raise SystemError( - "Probably a different model was used for encoding than for modeling; " - "e.g. using a HYBRIDgremlin model in combination with parameters taken from a PLMC file." - ) - y_pred = float(y_pred) - - return [(y_pred, variant[0][1:])] +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +# Contains Python code used for the approach presented in our 'hybrid modeling' paper +# Preprint available at: https://doi.org/10.1101/2022.06.07.495081 +# Code available at: https://github.com/Protein-Engineering-Framework/Hybrid_Model + +from __future__ import annotations + +import os +import pickle +from os import listdir +from os.path import isfile, join +from typing import Union +import logging +logger = logging.getLogger('pypef.dca.hybrid_model') + +import numpy as np +import sklearn.base +from scipy.stats import spearmanr +from sklearn.linear_model import Ridge +from sklearn.model_selection import GridSearchCV, train_test_split +from scipy.optimize import differential_evolution + +from pypef.utils.variant_data import ( + get_sequences_from_file, get_seqs_from_var_name, + remove_nan_encoded_positions, get_wt_sequence, split_variants +) + +from pypef.dca.plmc_encoding import PLMC, get_dca_data_parallel, get_encoded_sequence, EffectiveSiteError +from pypef.utils.to_file import predictions_out +from pypef.utils.plot import plot_y_true_vs_y_pred +import pypef.dca.gremlin_inference +from pypef.dca.gremlin_inference import GREMLIN + + +class DCAHybridModel: + alphas = np.logspace(-6, 6, 100) # Grid for the parameter 'alpha'. 
+    parameter_range = [(0, 1), (0, 1)]  # Parameter range of 'beta_1' and 'beta_2' with lb <= x <= ub
+    # TODO: Implementation of other regression techniques (CVRegression models)
+
+    def __init__(
+            self,
+            alphas=alphas,
+            parameter_range=None,
+            x_train: np.ndarray = None,
+            y_train: np.ndarray = None,
+            x_test: np.ndarray = None,  # not necessary for training
+            y_test: np.ndarray = None,  # not necessary for training
+            x_wt=None
+    ):
+        if parameter_range is None:
+            parameter_range = DCAHybridModel.parameter_range  # fall back to the class-level default range
+        self._alphas = alphas
+        self._parameter_range = parameter_range
+        self.x_train = x_train
+        self.y_train = y_train
+        self.x_test = x_test
+        self.y_test = y_test
+        self.X = np.concatenate((x_train, x_test), axis=0) if self.x_test is not None else self.x_train
+        self.y = np.concatenate((y_train, y_test), axis=0) if self.y_test is not None else self.y_train
+        self.x_wild_type = x_wt
+        self._spearmanr_dca = self._spearmanr_dca()
+        self.beta_1, self.beta_2, self.regressor = self.settings(self.x_train, self.y_train)
+
+    @staticmethod
+    def spearmanr(
+            y1: np.ndarray,
+            y2: np.ndarray
+    ) -> float:
+        """
+        Parameters
+        ----------
+        y1 : np.ndarray
+            Array of target fitness values.
+        y2 : np.ndarray
+            Array of predicted fitness values.
+
+        Returns
+        -------
+        Spearman's rank correlation coefficient.
+        """
+        return spearmanr(y1, y2)[0]
+
+    @staticmethod
+    def _standardize(
+            x: np.ndarray,
+            axis=0
+    ) -> np.ndarray:
+        """
+        Standardizes the input array x by subtracting the mean
+        and dividing it by the (sample) standard deviation.
+
+        Parameters
+        ----------
+        x : np.ndarray
+            Array to be standardized.
+        axis : integer (default=0)
+            Axis to execute operations on.
+
+        Returns
+        -------
+        Standardized version of 'x'.
+        """
+        return np.subtract(x, np.mean(x, axis=axis)) / np.std(x, axis=axis, ddof=1)
+
+    def _delta_x(
+            self,
+            x: np.ndarray
+    ) -> np.ndarray:
+        """
+        Subtracts for each variant the encoded wild-type sequence
+        from its encoded sequence.
+
+        Parameters
+        ----------
+        x : np.ndarray
+            Array of encoded variant sequences (matrix X).
+
+        Returns
+        -------
+        Array of encoded variant sequences with subtracted encoded
+        wild-type sequence.
+        """
+        return np.subtract(x, self.x_wild_type)
+
+    def _delta_e(
+            self,
+            x: np.ndarray
+    ) -> np.ndarray:
+        """
+        Calculates the difference of the statistical energy 'dE'
+        of the variant and wild-type sequence.
+
+        dE = E (variant) - E (wild-type)
+        with E = \sum_{i} h_i (o_i) + \sum_{i<j} J_{ij} (o_i, o_j)
+
+        Parameters
+        ----------
+        x : np.ndarray
+            Array of encoded variant sequences (matrix X).
+
+        Returns
+        -------
+        Difference of the statistical energies between variants
+        and wild-type.
+        """
+        return np.sum(self._delta_x(x), axis=1)
+
+    def _spearmanr_dca(self) -> float:
+        """
+        Returns
+        -------
+        Spearman's rank correlation coefficient of the full
+        data and the statistical DCA predictions (difference
+        of statistical energies). Used to adjust the sign
+        of hybrid predictions, i.e.
+        beta_1 * y_dca + beta_2 * y_ridge
+        or
+        beta_1 * y_dca - beta_2 * y_ridge.
+        """
+        y_dca = self._delta_e(self.X)
+        return self.spearmanr(self.y, y_dca)
+
+    def ridge_predictor(
+            self,
+            x_train: np.ndarray,
+            y_train: np.ndarray,
+    ) -> object:
+        """
+        Sets the parameter 'alpha' for ridge regression.
+
+        Parameters
+        ----------
+        x_train : np.ndarray
+            Array of the encoded sequences for training.
+        y_train : np.ndarray
+            Associated fitness values to the sequences present
+            in 'x_train'.
+
+        Returns
+        -------
+        Ridge object trained on 'x_train' and 'y_train' (cv=5)
+        with optimized 'alpha'.
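+
+        Example
+        -------
+        A minimal illustrative sketch (toy arrays; the names and data
+        are hypothetical and not part of the original code):
+
+        >>> import numpy as np
+        >>> x_toy = np.random.random((25, 10))  # 25 toy-encoded variants
+        >>> y_toy = np.random.random(25)        # 25 toy fitness values
+        >>> hm = DCAHybridModel(x_train=x_toy, y_train=y_toy, x_wt=np.zeros(10))
+        >>> ridge = hm.ridge_predictor(x_toy, y_toy)  # 5-fold CV over the alpha grid
+        >>> y_hat = ridge.predict(x_toy)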
+        """
+        grid = GridSearchCV(Ridge(), {'alpha': self._alphas}, cv=5)
+        grid.fit(x_train, y_train)
+        return Ridge(**grid.best_params_).fit(x_train, y_train)
+
+    def _y_hybrid(
+            self,
+            y_dca: np.ndarray,
+            y_ridge: np.ndarray,
+            beta_1: float,
+            beta_2: float
+    ) -> np.ndarray:
+        """
+        Chooses the sign for connecting the parts of the hybrid model.
+
+        Parameters
+        ----------
+        y_dca : np.ndarray
+            Difference of the statistical energies of variants
+            and wild-type.
+        y_ridge : np.ndarray
+            (Ridge) predicted fitness values of the variants.
+        beta_1 : float
+            Coefficient in [0, 1] regulating the DCA model
+            contribution.
+        beta_2 : float
+            Coefficient in [0, 1] regulating the ML model
+            contribution.
+
+        Returns
+        -------
+        The fitness values predicted by the hybrid model.
+        """
+        # Uncomment lines below to see if correlation between
+        # y_true and y_dca is positive or negative:
+        # logger.info(f'Positive or negative correlation of (all data) y_true '
+        #             f'and y_dca (+/-?): {self._spearmanr_dca:.3f}')
+        if self._spearmanr_dca >= 0:
+            return beta_1 * y_dca + beta_2 * y_ridge
+        else:  # negative correlation
+            return beta_1 * y_dca - beta_2 * y_ridge
+
+    def _adjust_betas(
+            self,
+            y: np.ndarray,
+            y_dca: np.ndarray,
+            y_ridge: np.ndarray
+    ) -> np.ndarray:
+        """
+        Finds the parameters that maximize the absolute Spearman rank
+        correlation coefficient using differential evolution.
+
+        Parameters
+        ----------
+        y : np.ndarray
+            Array of fitness values.
+        y_dca : np.ndarray
+            Difference of the statistical energies of variants
+            and wild-type.
+        y_ridge : np.ndarray
+            (Ridge) predicted fitness values of the variants.
+
+        Returns
+        -------
+        'beta_1' and 'beta_2' that maximize the absolute Spearman rank
+        correlation coefficient.
+        """
+        loss = lambda b: -np.abs(self.spearmanr(y, b[0] * y_dca + b[1] * y_ridge))
+        minimizer = differential_evolution(loss, bounds=self._parameter_range, tol=1e-4)  # use the range set in __init__
+        return minimizer.x
+
+    def settings(
+            self,
+            x_train: np.ndarray,
+            y_train: np.ndarray,
+            train_size_fit=0.66,
+            random_state=42
+    ) -> tuple:
+        """
+        Get the adjusted parameters 'beta_1', 'beta_2', and the
+        tuned regressor of the hybrid model.
+
+        Parameters
+        ----------
+        x_train : np.ndarray
+            Encoded sequences of the variants in the training set.
+        y_train : np.ndarray
+            Fitness values of the variants in the training set.
+        train_size_fit : float [0,1] (default 0.66)
+            Fraction to split training set into another
+            training and testing set.
+        random_state : int (default=42)
+            Random state used to split.
+
+        Returns
+        -------
+        Tuple containing the adjusted parameters 'beta_1' and 'beta_2',
+        as well as the tuned regressor of the hybrid model.
+        """
+        try:
+            X_ttrain, X_ttest, y_ttrain, y_ttest = train_test_split(
+                x_train, y_train,
+                train_size=train_size_fit,
+                random_state=random_state
+            )
+
+        except ValueError:
+            """
+            Not enough sequences to construct a sub-training and sub-testing
+            set when splitting the training set.
+
+            Machine learning/adjusting the parameters 'beta_1' and 'beta_2' not
+            possible -> return parameter setting for 'EVmutation' model.
+            """
+            return 1.0, 0.0, None
+
+        """
+        The sub-training set 'y_ttrain' is subjected to a five-fold cross
+        validation. This leads to the constraint that at least two sequences
+        need to be in each 20 % fold of that set in order to allow a ranking.
+
+        If this is not given -> return parameter setting for 'EVmutation' model.
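+
+        Worked example (illustrative numbers, not from the original code):
+        for len(y_ttrain) = 8, int(0.2 * 8) = 1 < 2, so (1.0, 0.0, None),
+        i.e. the pure 'EVmutation'-like DCA setting, is returned; for
+        len(y_ttrain) = 10, int(0.2 * 10) = 2 and 'beta_1'/'beta_2' are
+        adjusted via differential evolution.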
+ """ + y_ttrain_min_cv = int(0.2 * len(y_ttrain)) # 0.2 because of five-fold cross validation (1/5) + if y_ttrain_min_cv < 2: + return 1.0, 0.0, None + + y_dca_ttest = self._delta_e(X_ttest) + + ridge = self.ridge_predictor(X_ttrain, y_ttrain) + y_ridge_ttest = ridge.predict(X_ttest) + + beta1, beta2 = self._adjust_betas(y_ttest, y_dca_ttest, y_ridge_ttest) + return beta1, beta2, ridge + + def hybrid_prediction( + self, + x: np.ndarray, + reg: object, # any regression-based estimator (from sklearn) + beta_1: float, + beta_2: float + ) -> np.ndarray: + """ + Use the regressor 'reg' and the parameters 'beta_1' + and 'beta_2' for constructing a hybrid model and + predicting the fitness associates of 'X'. + + Parameters + ---------- + x : np.ndarray + Encoded sequences X used for prediction. + reg : object + Tuned ridge regressor for the hybrid model. + beta_1 : float + Float for regulating EVmutation model contribution. + beta_2 : float + Float for regulating Ridge regressor contribution. + + Returns + ------- + Predicted fitness associates of 'X' using the + hybrid model. + """ + y_dca = self._delta_e(x) + if reg is None: + y_ridge = np.random.random(len(y_dca)) # in order to suppress error + else: + y_ridge = reg.predict(x) + # adjusting: + or - on all data --> +-beta_1 * y_dca + beta_2 * y_ridge + return self._y_hybrid(y_dca, y_ridge, beta_1, beta_2) + + def split_performance( + self, + train_size: float = 0.8, + n_runs: int = 10, + seed: int = 42, + save_model: bool = False + ) -> dict: + """ + Estimates performance of the model. + + Parameters + ---------- + train_size : int or float (default=0.8) + Number of samples in the training dataset + or fraction of full dataset used for training. + n_runs : int (default=10) + Number of different splits to perform. + seed : int (default=42) + Seed for random generator. + save_model : bool (default=False) + If True, model is saved using pickle, else not. + + Returns + ------- + data : dict + Contains information about hybrid model parameters + and performance results. + """ + data = {} + np.random.seed(seed) + + for r, random_state in enumerate(np.random.randint(100, size=n_runs)): + x_train, x_test, y_train, y_test = train_test_split( + self.X, self.y, train_size=train_size, random_state=random_state) + beta_1, beta_2, reg = self.settings(x_train, y_train) + if beta_2 == 0.0: + alpha = np.nan + else: + if save_model: + pickle.dumps(reg) + alpha = reg.alpha + data.update( + {f'{len(y_train)}_{r}': + { + 'no_run': r, + 'n_y_train': len(y_train), + 'n_y_test': len(y_test), + 'rnd_state': random_state, + 'spearman_rho': self.spearmanr( + y_test, self.hybrid_prediction( + x_test, reg, beta_1, beta_2 + ) + ), + 'beta_1': beta_1, + 'beta_2': beta_2, + 'alpha': alpha + } + } + ) + + return data + + def ls_ts_performance(self): + beta_1, beta_2, reg = self.settings( + x_train=self.x_train, + y_train=self.y_train + ) + spearman_r = self.spearmanr( + self.y_test, + self.hybrid_prediction(self.x_test, reg, beta_1, beta_2) + ) + return spearman_r, reg, beta_1, beta_2 + + def train_and_test( + self, + train_percent_fit: float = 0.66, + random_state: int = 42 + ): + """ + Description + ---------- + Trains the hybrid model on a relative number of all variants + and returns the individual model contribution weights beta_1 (DCA) + and beta_2 (ML) as well as the hyperparameter-tuned regression model, + e.g. to save all the hybrid model parameters for later loading as + Pickle file. 
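+        For instance (an illustrative sketch; 'hybrid_model' stands for a
+        constructed DCAHybridModel instance):
+
+        >>> beta_1, beta_2, reg, rho_dca, rho_test = hybrid_model.train_and_test()
+        >>> save_model_to_dict_pickle(hybrid_model, 'HYBRIDgremlin',
+        ...                           beta_1, beta_2, rho_test, reg)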
+
+        Parameters
+        ----------
+        train_percent_fit: float (default = 0.66)
+            Relative number of variants used for model fitting (not
+            hyperparameter validation). A default of 0.66 and an overall
+            train size of 0.8 means the total size for least squares
+            fitting is 0.8 * 0.66 = 0.528, thus for hyperparameter
+            validation the size is 0.8 * (1 - 0.66) = 0.272 and for
+            testing the size is 1 - 0.528 - 0.272 = 0.2.
+        random_state: int (default = 42)
+            Random state for splitting (and reproduction of results).
+
+        Returns
+        ----------
+        beta_1: float
+            DCA model contribution to hybrid model predictions.
+        beta_2: float
+            ML model contribution to hybrid model predictions.
+        reg: object
+            sklearn Estimator class, e.g. sklearn.linear_model.Ridge
+            fitted and with optimized hyperparameters (e.g. alpha).
+        self._spearmanr_dca: float
+            Indicates whether the correlation of the DCA predictions
+            with the measured fitness values is positive (>= 0) or
+            negative (< 0).
+        test_spearman_r : float
+            Achieved performance in terms of Spearman's rank correlation
+            between measured and predicted test set variant fitness values.
+        """
+        beta_1, beta_2, reg = self.settings(
+            x_train=self.x_train,
+            y_train=self.y_train,
+            train_size_fit=train_percent_fit,
+            random_state=random_state
+        )
+
+        if len(self.y_test) > 0:
+            test_spearman_r = self.spearmanr(
+                self.y_test,
+                self.hybrid_prediction(
+                    self.x_test, reg, beta_1, beta_2
+                )
+            )
+        else:
+            test_spearman_r = None
+        return beta_1, beta_2, reg, self._spearmanr_dca, test_spearman_r
+
+    def get_train_sizes(self) -> np.ndarray:
+        """
+        Generates a list of training set sizes to perform low-N
+        performance estimation with.
+
+        Returns
+        -------
+        Numpy array of train sizes up to 80 % (i.e. 0.8 * N_variants).
+        """
+        eighty_percent = int(len(self.y) * 0.8)
+
+        train_sizes = np.sort(np.concatenate([
+            np.arange(15, 50, 5), np.arange(50, 100, 10),
+            np.arange(100, 150, 20), [160, 200, 250, 300, eighty_percent],
+            np.arange(400, 1100, 100)
+        ]))
+
+        idx_max = np.where(train_sizes >= eighty_percent)[0][0] + 1
+        return train_sizes[:idx_max]
+
+    def run(
+            self,
+            train_sizes: list = None,
+            n_runs: int = 10
+    ) -> dict:
+        """
+        Estimates the low-N performance for each given training set
+        size by performing 'n_runs' random splits per size.
+
+        Returns
+        ----------
+        data: dict
+            Performances of the split with size of the
+            training set = train_size and size of the
+            test set = N_variants - train_size.
+        """
+        data = {}
+        for t, train_size in enumerate(train_sizes):
+            logger.info(f'{t + 1}/{len(train_sizes)}: {train_size}')
+            data.update(self.split_performance(train_size=train_size, n_runs=n_runs))
+        return data
+
+
+"""
+Below: Some helper functions that call or depend on the DCAHybridModel class.
+"""
+
+
+def check_model_type(model: dict | DCAHybridModel | PLMC | GREMLIN):
+    """
+    Checks the type/instance of the model.
+    """
+    if type(model) == dict:
+        try:
+            model = model['model']
+        except KeyError:
+            raise SystemError("Unknown model dictionary taken from Pickle file.")
+    if type(model) == pypef.dca.plmc_encoding.PLMC:
+        return 'PLMC'
+    elif type(model) == pypef.dca.hybrid_model.DCAHybridModel:
+        return 'Hybrid'
+    elif type(model) == pypef.dca.gremlin_inference.GREMLIN:
+        return 'GREMLIN'
+    elif isinstance(model, sklearn.base.BaseEstimator):
+        raise SystemError("Loaded an sklearn ML model. For pure ML-based modeling the "
+                          "\'ml\' flag has to be used instead of the \'hybrid\' flag.")
+    else:
+        raise SystemError('Unknown model/unknown Pickle file.')
+
+
+def get_model_path(model: str):
+    """
+    Checks if the model Pickle file exists in the CWD
+    and then in the ./Pickles directory.
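+
+    Example
+    -------
+    Illustrative only (assumes a model file named 'GREMLIN' was saved
+    to ./Pickles beforehand):
+
+    >>> get_model_path('GREMLIN')
+    'Pickles/GREMLIN'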
+    """
+    try:
+        if isfile(model):
+            model_path = model
+        elif isfile(f'Pickles/{model}'):
+            model_path = f'Pickles/{model}'
+        else:
+            raise SystemError("Did not find specified model file in current working directory "
+                              "or /Pickles subdirectory. Make sure to train/save a model first "
+                              "(e.g., for saving a GREMLIN model, type \"pypef param_inference --msa TARGET_MSA.a2m\" "
+                              "or, for saving a plmc model, type \"pypef param_inference --params TARGET_PLMC.params\").")
+        return model_path
+    except TypeError:
+        raise SystemError("No model provided. "
+                          "Specify a model for DCA-based encoding.")
+
+
+def get_model_and_type(
+        params_file: str,
+        substitution_sep: str = '/'
+):
+    """
+    Tries to unpickle the given file to identify the model type and
+    loads the model, either from the loaded Pickle dictionary or, for
+    raw (binary) plmc parameter files, by constructing a new PLMC model.
+    """
+    file_path = get_model_path(params_file)
+    try:
+        with open(file_path, 'rb') as read_pkl_file:
+            model = pickle.load(read_pkl_file)
+            model_type = check_model_type(model)
+    except pickle.UnpicklingError:
+        model_type = 'PLMC_Params'
+
+    if model_type == 'PLMC_Params':
+        model = PLMC(
+            params_file=params_file,
+            separator=substitution_sep,
+            verbose=False
+        )
+        model_type = 'PLMC'
+
+    else:  # --> elif model_type in ['PLMC', 'GREMLIN', 'Hybrid']:
+        model = model['model']
+
+    return model, model_type
+
+
+def save_model_to_dict_pickle(
+        model: DCAHybridModel | PLMC | GREMLIN,
+        model_type: str | None = None,
+        beta_1: float | None = None,
+        beta_2: float | None = None,
+        spearman_r: float | None = None,
+        regressor: sklearn.base.BaseEstimator = None
+):
+    try:
+        os.mkdir('Pickles')
+    except FileExistsError:
+        pass
+
+    if model_type is None:
+        model_type = 'MODEL'
+
+    logger.info(f'Saving model as Pickle file ({model_type})...')
+    with open(f'Pickles/{model_type}', 'wb') as write_pkl_file:  # close the file handle after dumping
+        pickle.dump(
+            {
+                'model': model,
+                'model_type': model_type,
+                'beta_1': beta_1,
+                'beta_2': beta_2,
+                'spearman_rho': spearman_r,
+                'regressor': regressor
+            },
+            write_pkl_file
+        )
+
+
+global_model = None
+global_model_type = None
+
+
+def plmc_or_gremlin_encoding(
+        variants,
+        sequences,
+        ys_true,
+        params_file,
+        substitution_sep='/',
+        threads=1,
+        verbose=True,
+        use_global_model=False
+):
+    """
+    Decides, based on the type of the input params file, which DCA
+    encoding is performed, i.e., GREMLIN or PLMC.
+    If use_global_model is True, the model is loaded from the Pickle
+    file only once and stored globally at the first directed evolution
+    step; all subsequent steps reuse that global model, which avoids the
+    repeated (and, for encoding single sequences, quite inefficient)
+    loading of the model file.
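+
+    Example
+    -------
+    An illustrative call (variant, sequence, and model names are
+    placeholders); passing ys_true=None yields zero-filled dummy
+    fitness values:
+
+    >>> xs, vs, seqs, ys, x_wt, model, model_type = plmc_or_gremlin_encoding(
+    ...     ['A123C'], [variant_sequence], None, 'GREMLIN')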
+ """ + global global_model, global_model_type + if ys_true is None: + ys_true = np.zeros(np.shape(sequences)) + if use_global_model: + if global_model is None: + global_model, global_model_type = get_model_and_type(params_file, substitution_sep) + model, model_type = global_model, global_model_type + else: + model, model_type = global_model, global_model_type + else: + model, model_type = get_model_and_type(params_file, substitution_sep) + if model_type == 'PLMC': + xs, x_wt, variants, sequences, ys_true = plmc_encoding( + model, variants, sequences, ys_true, threads, verbose + ) + elif model_type == 'GREMLIN': + if verbose: + logger.info(f"Following positions are frequent gap positions in the MSA " + f"and cannot be considered for effective modeling, i.e., " + f"substitutions at these positions are removed as these would be " + f"predicted as wild type:\n{[gap + 1 for gap in model.gaps]}.\n" + f"Effective positions (N={len(model.v_idx)}) are:\n" + f"{[v_pos + 1 for v_pos in model.v_idx]}") + xs, x_wt, variants, sequences, ys_true = gremlin_encoding( + model, variants, sequences, ys_true, + shift_pos=1, substitution_sep=substitution_sep + ) + else: + raise SystemError( + f"Found a {model_type.lower()} model as input. Please train a new " + f"hybrid model on the provided LS/TS datasets." + ) + assert len(xs) == len(variants) == len(sequences) == len(ys_true) + return xs, variants, sequences, ys_true, x_wt, model, model_type + + +def gremlin_encoding(gremlin: GREMLIN, variants, sequences, ys_true, shift_pos=1, substitution_sep='/'): + """ + Gets X and x_wt for DCA prediction: delta_Hamiltonian respectively + delta_E = np.subtract(X, x_wt), with X = encoded sequences of variants. + Also removes variants, sequences, and y_trues at MSA gap positions. + """ + variants, sequences, ys_true = np.atleast_1d(variants), np.atleast_1d(sequences), np.atleast_1d(ys_true) + variants, sequences, ys_true = remove_gap_pos( + gremlin.gaps, variants, sequences, ys_true, + shift_pos=shift_pos, substitution_sep=substitution_sep + ) + try: + xs = gremlin.get_score(sequences, encode=True) + except SystemError: + xs = [] + x_wt = gremlin.get_score(np.atleast_1d(gremlin.wt_seq), encode=True) + return xs, x_wt, variants, sequences, ys_true + + +def plmc_encoding(plmc: PLMC, variants, sequences, ys_true, threads=1, verbose=False): + """ + Gets X and x_wt for DCA prediction: delta_E = np.subtract(X, x_wt), + with X = encoded sequences of variants. + Also removes variants, sequences, and y_trues at MSA gap positions. + """ + target_seq, index = plmc.get_target_seq_and_index() + wt_name = target_seq[0] + str(index[0]) + target_seq[0] + if verbose: + logger.info(f"Using to-self-substitution '{wt_name}' as wild type reference. " + f"Encoding variant sequences. This might take some time...") + x_wt = get_encoded_sequence(wt_name, plmc) + if threads > 1: + # Hyperthreading, NaNs are already being removed by the called function + variants, sequences, xs, ys_true = get_dca_data_parallel( + variants, sequences, ys_true, plmc, threads, verbose=verbose) + else: + x_ = plmc.collect_encoded_sequences(variants) + # NaNs must still be removed + xs, variants, sequences, ys_true = remove_nan_encoded_positions( + x_, variants, sequences, ys_true + ) + return xs, x_wt, variants, sequences, ys_true + + +def remove_gap_pos( + gaps, + variants, + sequences, + fitnesses, + shift_pos=1, + substitution_sep='/' +): + """ + Remove gap postions from input variants, sequences, and fitness values + based on input gaps (gap positions). 
+ Note that by default, gap positions are shifted by +1 to match the input + variant identifiers (e.g., variant A123C is removed if gap pos is 122; (122 += 1). + + Returns + ----------- + variants_v + Variants with substitutions at valid sequence positions, i.e., at non-gap positions + sequences_v + Sequences of variants with substitutions at valid sequence positions, i.e., at non-gap positions + fitnesses_v + Fitness values of variants with substitutions at valid sequence positions, i.e., at non-gap positions + """ + variants_v, sequences_v, fitnesses_v = [], [], [] + valid = [] + for i, variant in enumerate(variants): + variant = variant.split(substitution_sep) + for var in variant: + if int(var[1:-1]) not in [gap + shift_pos for gap in gaps]: + if i not in valid: + valid.append(i) + variants_v.append(variants[i]) + sequences_v.append(sequences[i]) + fitnesses_v.append(fitnesses[i]) + return variants_v, sequences_v, fitnesses_v + + +def get_delta_e_statistical_model( + x_test: np.ndarray, + x_wt: np.ndarray +): + """ + Description + ----------- + Delta_E means difference in evolutionary energy in plmc terms. + In other words, this is the delta of the sum of Hamiltonian-encoded + sequences of local fields and couplings of encoded sequence and wild-type + sequence in GREMLIN terms. + + Parameters + ----------- + x_test: np.ndarray [2-dim] + Encoded sequences to be subtracted by x_wt to compute delta E. + x_wt: np.ndarray [1-dim] + Encoded wild-type sequence. + + Returns + ----------- + delta_e: np.ndarray [1-dim] + Summed subtracted encoded sequences. + + """ + delta_x = np.subtract(x_test, x_wt) + delta_e = np.sum(delta_x, axis=1) + return delta_e + + +def generate_model_and_save_pkl( + variants, + ys_true, + params_file, + wt, + train_percent_fit: float = 0.66, # percent of all data: 0.8 * 0.66 + test_percent: float = 0.2, + random_state: int = 42, + substitution_sep = '/', + threads=1 +): + """ + Description + ----------- + Save (Ridge) regression model (trained and with tuned alpha parameter) + with betas (beta_1 and beta_2) as dictionary-structured pickle file. + + Parameters + ---------- + test_percent: float + Percent of DataFrame data used for testing. The remaining data is + used for training (fitting and validation). + train_percent_fit: float + Percent of DataFrame data to train on. + The remaining data is used for validation. + random_state: int + Random seed for splitting in train and test data for reproducing results. + + Returns + ---------- + () + Just saving model parameters as pickle file. 
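+
+    Example
+    -------
+    An illustrative call (all inputs are placeholders):
+
+    >>> generate_model_and_save_pkl(
+    ...     variants=['A123C', 'D45E/F67G'], ys_true=[0.8, 1.2],
+    ...     params_file='GREMLIN', wt='WT_SEQUENCE.fasta')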
+ """ + wt_seq = get_wt_sequence(wt) + variants_splitted = split_variants(variants, substitution_sep) + variants, ys_true, sequences = get_seqs_from_var_name(wt_seq, variants_splitted, ys_true) + + xs, variants, sequences, ys_true, x_wt, model, model_type = plmc_or_gremlin_encoding( + variants, sequences, ys_true, params_file, substitution_sep, threads) + + logger.info( + f'Train size (fitting): {train_percent_fit * 100:.1f} % of training data ' + f'({((1 - test_percent) * train_percent_fit) * 100:.1f} % of all data)\n' + f'Train size validation: {(1 - train_percent_fit) * 100:.1f} % of training data ' + f'({((1 - test_percent) * (1 - train_percent_fit)) * 100:.1f} % of all data)\n' + f'Test size: {test_percent * 100:.1f} % ({test_percent * 100:.1f} % of all data)\n' + f'Using random state: {random_state}...\n' + ) + + x_train, x_test, y_train, y_test = train_test_split( + xs, ys_true, test_size=test_percent, random_state=random_state + ) + + hybrid_model = DCAHybridModel( + x_train=x_train, + y_train=y_train, + x_test=x_test, + y_test=y_test, + x_wt=x_wt + ) + + beta_1, beta_2, reg, spearman_dca, test_spearman_r = hybrid_model.train_and_test( + train_percent_fit=train_percent_fit, + random_state=random_state + ) + if reg is None: + alpha_ = 'None' + else: + alpha_ = f'{reg.alpha:.3f}' + logger.info( + f'Individual model weights and regressor hyperparameters:\n' + f'Hybrid model individual model contributions:\nBeta1 (DCA): ' + f'{beta_1:.3f}, Beta2 (ML): {beta_2:.3f} (' + f'regressor: Ridge(alpha={alpha_}))\n' + f'Test performance: Spearman\'s rho = {test_spearman_r:.3f}' + ) + try: + os.mkdir('Pickles') + except FileExistsError: + pass + model_name = f'HYBRID{model_type.lower()}' + save_model_to_dict_pickle(hybrid_model, model_name, beta_1, beta_2, test_spearman_r, reg) + + +def performance_ls_ts( + ls_fasta: str | None, + ts_fasta: str | None, + threads: int, + params_file: str, + model_pickle_file: str | None = None, + substitution_sep: str = '/', + label=False +): + """ + Description + ----------- + Computes performance based on a (linear) regression model trained + on the training set by optimizing model hyperparameters based on + validation performances on training subsets (default: 5-fold CV) + and predicting test set entries using the hyperparmeter-tuned model + to estimate performance for model generalization. + + Parameters + ----------- + ls_fasta: str + Fasta-like file with fitness values. Will be read and extracted + for training the regressor. + ts_fasta: str + Fasta-like file with fitness values. Used for computing performance + of the tuned regressor for test set entries (performance metric of + measured and predicted fitness values). + threads: int + Number of threads to use for parallel computing using Ray. + params_file: str + PLMC parameter file (containing evolutionary, i.e. MSA-based local + and coupling terms. + model: str + Model to load for TS prediction. + separator: str + Character to split the variant to obtain the single substitutions + (default='/'). + + Returns + ----------- + None + Just plots test results (predicted fitness vs. measured fitness) + using def plot_y_true_vs_y_pred. 
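+
+    Example
+    -------
+    An illustrative call (file and model names are placeholders):
+
+    >>> performance_ls_ts('LS.fasl', 'TS.fasl', threads=1, params_file='GREMLIN')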
+ """ + test_sequences, test_variants, y_test = get_sequences_from_file(ts_fasta) + + if ls_fasta is not None and ts_fasta is not None: + train_sequences, train_variants, y_train = get_sequences_from_file(ls_fasta) + x_train, train_variants, train_sequences, y_train, x_wt, _, model_type = plmc_or_gremlin_encoding( + train_variants, train_sequences, y_train, params_file, substitution_sep, threads + ) + + x_test, test_variants, test_sequences, y_test, *_ = plmc_or_gremlin_encoding( + test_variants, test_sequences, y_test, params_file, substitution_sep, threads, verbose=False + ) + + logger.info(f"\nInitial training set variants: {len(train_sequences)}. " + f"Remaining: {len(train_variants)} (after removing " + f"substitutions at gap positions).\nInitial test set " + f"variants: {len(test_sequences)}. Remaining: {len(test_variants)} " + f"(after removing substitutions at gap positions)." + ) + + hybrid_model = DCAHybridModel( + x_train=np.array(x_train), + y_train=np.array(y_train), + x_test=np.array(x_test), + y_test=np.array(y_test), + x_wt=x_wt + ) + model_name = f'HYBRID{model_type.lower()}' + + spearman_r, reg, beta_1, beta_2 = hybrid_model.ls_ts_performance() + ys_pred = hybrid_model.hybrid_prediction(np.array(x_test), reg, beta_1, beta_2) + + if reg is None: + alpha_ = 'None' + else: + alpha_ = f'{reg.alpha:.3f}' + logger.info( + f'Individual model weights and regressor hyperparameters:\n' + f'Hybrid model individual model contributions: Beta1 (DCA): ' + f'{beta_1:.3f}, Beta2 (ML): {beta_2:.3f} (regressor: ' + f'Ridge(alpha={alpha_}))\nTesting performance...' + ) + + save_model_to_dict_pickle(hybrid_model, model_name, beta_1, beta_2, spearman_r, reg) + + elif ts_fasta is not None and model_pickle_file is not None and params_file is not None: + logger.info(f'Taking model from saved model (Pickle file): {model_pickle_file}...') + + model, model_type = get_model_and_type(model_pickle_file) + + if model_type != 'Hybrid': # same as below in next elif + x_test, test_variants, test_sequences, y_test, x_wt, *_ = plmc_or_gremlin_encoding( + test_variants, test_sequences, y_test, model_pickle_file, substitution_sep, threads, False) + ys_pred = get_delta_e_statistical_model(x_test, x_wt) + else: # Hybrid model input requires params from plmc or GREMLIN model + beta_1, beta_2, reg = model.beta_1, model.beta_2, model.regressor + x_test, test_variants, test_sequences, y_test, *_ = plmc_or_gremlin_encoding( + test_variants, test_sequences, y_test, params_file, + substitution_sep, threads, False + ) + ys_pred = model.hybrid_prediction(x_test, reg, beta_1, beta_2) + + elif ts_fasta is not None and model_pickle_file is None: # no LS provided --> statistical modeling / no ML + logger.info(f'No learning set provided, falling back to statistical DCA model: ' + f'no adjustments of individual hybrid model parameters (beta_1 and beta_2).') + test_sequences, test_variants, y_test = get_sequences_from_file(ts_fasta) + x_test, test_variants, test_sequences, y_test, x_wt, model, model_type = plmc_or_gremlin_encoding( + test_variants, test_sequences, y_test, params_file, substitution_sep, threads + ) + + logger.info(f"Initial test set variants: {len(test_sequences)}. 
" + f"Remaining: {len(test_variants)} (after removing " + f"substitutions at gap positions).") + + ys_pred = get_delta_e_statistical_model(x_test, x_wt) + + save_model_to_dict_pickle(model, model_type, None, None, spearmanr(y_test, ys_pred)[0], None) + + else: + raise SystemError('No Test Set given for performance estimation.') + + spearman_rho = spearmanr(y_test, ys_pred) + logger.info(f'Spearman Rho = {spearman_rho[0]:.3f}') + + plot_y_true_vs_y_pred( + np.array(y_test), np.array(ys_pred), np.array(test_variants), label=label, hybrid=True + ) + + +def predict_ps( # also predicting "pmult" dict directories + prediction_dict: dict, + threads: int, + separator: str, + model_pickle_file: str, + params_file: str = None, + prediction_set: str = None, + negative: bool = False +): + """ + Description + ----------- + Predicting the fitness of sequences of a prediction set + or multiple prediction sets that were exemplary created with + 'pypef mkps' based on single substitutional variant data + provided in a CSV and the wild type sequence: + pypef mkps --wt WT_SEQ --input CSV_FILE + [--drop THRESHOLD] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb] + [--ddiverse] [--tdiverse] [--qdiverse] + + Parameters + ----------- + prediction_dict: dict + Contains arguments which directory to predict, e.g. {'drecomb': True}, + than predicts prediction files that are present in this directory, e.g. + in directory './Recomb_Double_Split'. + params_file: str + PLMC couplings parameter file + threads: int + Threads used for parallelization for DCA-based sequence encoding + separator: str + Separator of individual substitution of variants, default '/' + model_pickle_file: str + Pickle file containing the hybrid model and model parameters in + a dictionary format + test_set: str = None + Test set for prediction and plotting of predictions (contains + true fitness values of variants). + prediction_set: str = None + Prediction set for prediction, does not contain true fitness values. + figure: str = None + Plotting the test set predictions and the corresponding true fitness + values. + label: bool = False + If True, plots associated variant names of predicted variants. + negative: bool = False + If true, negative defines improved variants having a reduced/negative + fitness compared to wild type. + + + Returns + ----------- + () + Writes sorted predictions to files (for [--drecomb] [--trecomb] + [--qarecomb] [--qirecomb] [--ddiverse] [--tdiverse] [--qdiverse] + in the respective created folders). + + """ + if model_pickle_file is None: + model_pickle_file = params_file + logger.info(f'Trying to load model from saved parameters (Pickle file): {model_pickle_file}...') + else: + logger.info(f'Loading model from saved model (Pickle file): {model_pickle_file}...') + model, model_type = get_model_and_type(model_pickle_file) + + if model_type == 'PLMC' or model_type == 'GREMLIN': + logger.info(f'No hybrid model provided – falling back to a statistical DCA model.') + elif model_type == 'Hybrid': + beta_1, beta_2, reg = model.beta_1, model.beta_2, model.regressor + if reg is None: + alpha_ = 'None' + else: + alpha_ = f'{reg.alpha:.3f}' + logger.info( + f'Individual model weights and regressor hyperparameters:\n' + f'Hybrid model individual model contributions: Beta1 (DCA): {beta_1:.3f}, ' + f'Beta2 (ML): {beta_2:.3f} (regressor: Ridge(alpha={alpha_})).' 
+ ) + + pmult = [ + 'Recomb_Double_Split', 'Recomb_Triple_Split', 'Recomb_Quadruple_Split', + 'Recomb_Quintuple_Split', 'Diverse_Double_Split', 'Diverse_Triple_Split', + 'Diverse_Quadruple_Split' + ] + if True in prediction_dict.values(): + for ps, path in zip(prediction_dict.values(), pmult): + if ps: # if True, run prediction in this directory, e.g. for drecomb + logger.info(f'Running predictions for variant-sequence files in directory {path}...') + all_y_v_pred = [] + files = [f for f in listdir(path) if isfile(join(path, f)) if f.endswith('.fasta')] + for i, file in enumerate(files): # collect and predict for each file in the directory + logger.info(f'Encoding files ({i + 1}/{len(files)}) for prediction...\n') + file_path = os.path.join(path, file) + sequences, variants, _ = get_sequences_from_file(file_path) + if model_type != 'Hybrid': + x_test, test_variants, x_wt, *_ = plmc_or_gremlin_encoding( + variants, sequences, None, model, threads=threads, verbose=False, + substitution_sep=separator) + ys_pred = get_delta_e_statistical_model(x_test, x_wt) + else: # Hybrid model input requires params from plmc or GREMLIN model + ##encoding_model, encoding_model_type = get_model_and_type(params_file) + x_test, test_variants, *_ = plmc_or_gremlin_encoding( + variants, sequences, None, params_file, + threads=threads, verbose=False, substitution_sep=separator + ) + ys_pred = model.hybrid_prediction(x_test, reg, beta_1, beta_2) + for k, y in enumerate(ys_pred): + all_y_v_pred.append((ys_pred[k], variants[k])) + if negative: # sort by fitness value + all_y_v_pred = sorted(all_y_v_pred, key=lambda x: x[0], reverse=False) + else: + all_y_v_pred = sorted(all_y_v_pred, key=lambda x: x[0], reverse=True) + predictions_out( + predictions=all_y_v_pred, + model='Hybrid', + prediction_set=f'Top{path}', + path=path + ) + else: # check next task to do, e.g., predicting triple substituted variants, e.g. trecomb + continue + + elif prediction_set is not None: + sequences, variants, _ = get_sequences_from_file(prediction_set) + # NaNs are already being removed by the called function + if model_type != 'Hybrid': # statistical DCA model + xs, variants, _, _, x_wt, *_ = plmc_or_gremlin_encoding( + variants, sequences, None, params_file, + threads=threads, verbose=False, substitution_sep=separator) + ys_pred = get_delta_e_statistical_model(xs, x_wt) + else: # Hybrid model input requires params from plmc or GREMLIN model + xs, variants, *_ = plmc_or_gremlin_encoding( + variants, sequences, None, params_file, + threads=threads, verbose=True, substitution_sep=separator + ) + ys_pred = model.hybrid_prediction(xs, reg, beta_1, beta_2) + assert len(xs) == len(variants) + y_v_pred = zip(ys_pred, variants) + y_v_pred = sorted(y_v_pred, key=lambda x: x[0], reverse=True) + predictions_out( + predictions=y_v_pred, + model='Hybrid', + prediction_set=f'Top{prediction_set}' + ) + + +def predict_directed_evolution( + encoder: str, + variant: str, + sequence: str, + hybrid_model_data_pkl: str +) -> Union[str, list]: + """ + Perform directed in silico evolution and predict the fitness of a + (randomly) selected variant using the hybrid model. This function opens + the stored DCAHybridModel and the model parameters to predict the fitness + of the variant encoded herein using the PLMC class. If the variant + cannot be encoded (based on the PLMC params file), returns 'skip'. Else, + returning the predicted fitness value and the variant name. 
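+
+    Example
+    -------
+    An illustrative call (arguments are placeholders); the result is
+    either 'skip' or a one-element list [(y_pred, variant_name)]:
+
+    >>> result = predict_directed_evolution(
+    ...     'GREMLIN', 'A123C', variant_sequence, 'HYBRIDgremlin')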
+ """ + if hybrid_model_data_pkl is not None: + model, model_type = get_model_and_type(hybrid_model_data_pkl) + else: + model_type = 'StatisticalModel' # any name != 'Hybrid' + + if model_type != 'Hybrid': # statistical DCA model + xs, variant, _, _, x_wt, *_ = plmc_or_gremlin_encoding( + variant, sequence, None, encoder, verbose=False, use_global_model=True) + if not list(xs): + return 'skip' + y_pred = get_delta_e_statistical_model(xs, x_wt) + else: # model_type == 'Hybrid': Hybrid model input requires params from PLMC or GREMLIN model + xs, variant, *_ = plmc_or_gremlin_encoding( + variant, sequence, None, encoder, verbose=False, use_global_model=True + ) + if not list(xs): + return 'skip' + try: + y_pred = model.hybrid_prediction(np.atleast_2d(xs), model.regressor, model.beta_1, model.beta_2)[0] + except ValueError: + raise SystemError( + "Probably a different model was used for encoding than for modeling; " + "e.g. using a HYBRIDgremlin model in combination with parameters taken from a PLMC file." + ) + y_pred = float(y_pred) + + return [(y_pred, variant[0][1:])] diff --git a/pypef/dca/plmc_encoding.py b/pypef/dca/plmc_encoding.py index bc39a4c..acafbb7 100644 --- a/pypef/dca/plmc_encoding.py +++ b/pypef/dca/plmc_encoding.py @@ -1,857 +1,857 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -""" -Contains Python code used for the approach presented in our 'hybrid modeling' paper -Preprint available at: https://doi.org/10.1101/2022.06.07.495081 -Code available at: https://github.com/Protein-Engineering-Framework/Hybrid_Model - -The included class 'CouplingsModel' has been taken from the script 'model.py' as part of the -EVmutation module (https://github.com/debbiemarkslab/EVmutation) written by Thomas Hopf in the -labs of Debora Marks and Chris Sander at Harvard Medical School and modified (shortened). -See also: https://doi.org/10.1038/nbt.3769 -Hopf, T. A., Ingraham, J. B., Poelwijk, F.J., Schärfe, C.P.I., Springer, M., Sander, C., & Marks, D. S. (2016). -Mutation effects predicted from sequence co-variation. Nature Biotechnology, in press. - -References: -[1] Hopf, T. A., Ingraham, J. B., Poelwijk, F.J., Schärfe, C.P.I., Springer, M., Sander, C., & Marks, D. S. - Mutation effects predicted from sequence co-variation. - Nature Biotechnology, 35, 2017, 128–135 - https://doi.org/10.1038/nbt.3769 -[2] Hopf T. A., Green A. G., Schubert B., et al. - The EVcouplings Python framework for coevolutionary sequence analysis. - Bioinformatics 35, 2019, 1582–1584 - https://doi.org/10.1093/bioinformatics/bty862 -[3] Ekeberg, M., Lövkvist, C., Lan, Y., Weigt, M., & Aurell, E. 
- Improved contact prediction in proteins: Using pseudolikelihoods to infer Potts models. - Physical Review E, 87(1), 2013, 012707. doi:10.1103/PhysRevE.87.012707 - https://doi.org/10.1103/PhysRevE.87.012707 -""" - - -import os -from collections.abc import Iterable -import logging -logger = logging.getLogger('pypef.dca.encoding') - -import numpy as np -import ray -from tqdm import tqdm -import pickle - -from pypef.utils.variant_data import amino_acids - -_SLICE = np.s_[:] - - -class InvalidVariantError(Exception): - """ - Description - ----------- - Exception raised when entered variant does not follow the required scheme - (integer enclosed by two one-letter code representations of amino acids). - - Attributes - ---------- - variant: str - Variant that causes the error - message: str - Explanation of the error - """ - - def __init__(self, variant: str): - self.variant = variant - message = "The entered variant '%s' does not follow the required scheme " \ - "(integer enclosed by two one letter code representations of amino acids). " \ - "Check separator or variant." % self.variant - self.message = message - super().__init__(self.message) - - -class EffectiveSiteError(Exception): - """ - Description - ----------- - Exception raised when requested position is not implemented in the DCA model. - - Attributes - ---------- - position: int - Position that causes the error - variant: str - Variant including that position - message: str - Explanation of the error - """ - - def __init__(self, position: int, variant: str, verbose: bool = True): - self.position = position - self.variant = variant - self.verbose = verbose - message = f"The position {self.position} of variant '{self.variant}' is " \ - f"not an effective site in the DCA model and thus cannot be predicted." - if self.verbose: - logger.info(message) - self.message = message - super().__init__(self.message) - - -def is_valid_substitution(substitution: str) -> bool: - """ - Description - ----------- - A substitution has to follow the scheme: - First character: (wild-type/substituted) amino acid in one-letter code representation - Last character: (introduced) amino acid in one-letter code representation - In between: position (of substitution) - - If the entered substitution does not follow this scheme (integer enclosed by two one - letter code representations of amino acids) return False, else return True. - - Parameters - ----------- - substitution : str - Substitution as string: Integer enclosed by two letters representing - the wild-type (first) and variant amino acid (last) in one letter code. - - Returns - ------- - boolian - """ - if not substitution[0] in amino_acids: # not accepting format IntegerAA, e.g., 123D - return False - - if not substitution[-1] in amino_acids: - return False - - try: - int(substitution[1:-1]) - except ValueError: - return False - - return True - - -def is_valid_variant(variant: str, separator='/') -> bool: - """ - Description - ----------- - Gets the single substitutions of the variant and checks if they follow the required scheme. - - If the entered substitution does not follow this scheme (integer enclosed by two one- - letter code representations of amino acids) return False, else return True. - - Parameters - ---------- - variant : str - Joined string of integers enclosed by two letters representing the wild type - and variant amino acid in the single letter code. -> Check separator - separator : str - Character to split the variant to obtain the single substitutions (default=','). 
- - Returns - ------- - boolian - """ - for substitution in variant.split(separator): - if not is_valid_substitution(substitution): - return False - - return True - - -def get_single_substitutions(variant: str, separator='/') -> Iterable: - """ - Description - ----------- - Generator that extracts and returns the single substitutions of the entered variant. - - Parameters - ---------- - See 'is_valid_variant' for an explanation. - - Returns - ------- - Generator object - """ - if is_valid_variant(variant, separator): - for substitution in variant.split(separator): - yield substitution - - else: - raise InvalidVariantError(variant) - - -class CouplingsModel: - """ - Class to store parameters of pairwise undirected graphical model of sequences - and compute evolutionary couplings, sequence statistical energies, etc. - """ - def __init__( - self, - filename, - precision="float32", - verbose: bool = False, - **kwargs - ): - """ - Initializes the object with raw values read from binary .Jij file - - Parameters - ---------- - filename : str - Binary Jij file containing model parameters from plmc software - alphabet : str - Symbols corresponding to model states (e.g. "-ACGT"). - precision : {"float32", "float64"}, default: "float32" - Sets if input file has single (float32) or double precision (float64) - """ - self.index_map = None - self._target_seq = None - self._index_list = None - self.x_wt = None - self.verbose = verbose - try: - self.__read_plmc_v2(filename, precision) - except TypeError or FileNotFoundError: - raise SystemError( - "Did not find (specified) PLMC parameter file. " - "The parameter file is required for DCA-based " - "encoding and can be provided via the flag " - "--params PLMC_FILE." - ) - self.alphabet_map = {s: i for i, s in enumerate(self.alphabet)} - - # in non-gap mode, focus sequence is still coded with a gap character, - # but gap is not part of model alphabet anymore; so if mapping crashes - # that means there is a non-alphabet character in sequence array - # and therefore there is no focus sequence. - try: - self.target_seq_mapped = np.array([self.alphabet_map[x] for x in self.target_seq]) - self.has_target_seq = (np.sum(self.target_seq_mapped) > 0) - except KeyError: - self.target_seq_mapped = np.zeros(shape=np.shape(self.l), dtype=np.int32) - self.has_target_seq = False - - def __read_plmc_v2(self, filename, precision): - """ - Read updated Jij file format from plmc. 
- - Parameters - ---------- - filename : str - Binary Jij file containing model parameters - precision : {"float32", "float64"} - Sets if input file has single or double precision - - """ - with open(filename, "rb") as f: - # model length, number of symbols, valid/invalid sequences - # and iterations - self.l, self.num_symbols, self.n_valid, self.n_invalid, self.num_iter = ( - np.fromfile(f, "int32", 5) - ) - - # theta, regularization weights, and effective number of samples - self.theta, self.lambda_h, self.lambda_j, self.lambda_group, self.n_eff = ( - np.fromfile(f, precision, 5) - ) - - # Read alphabet (make sure we get proper unicode rather than byte string) - self.alphabet = np.fromfile( - f, "S1", self.num_symbols - ).astype("U1") - - # weights of individual sequences (after clustering) - self.weights = np.fromfile( - f, precision, self.n_valid + self.n_invalid - ) - - # target sequence and index mapping, again ensure unicode - self._target_seq = np.fromfile(f, "S1", self.l).astype("U1") - self.index_list = np.fromfile(f, "int32", self.l) - - # Analyzing Positions included in the PLMC file (based on the MSA) - not_valid, valid = [], [] - for num in range(self.index_list[0], self.index_list[-1] + 1, 1): - if num not in self.index_list: - not_valid.append(num) - else: - valid.append(num) - self.wt_aa_pos = [] - for aa, pos in zip(self._target_seq, self.index_list): - self.wt_aa_pos.append(str(aa) + str(pos)) - if self.verbose: - logger.info(f'Evaluating gap content of PLMC parameter file... ' - f'First amino acid position used in the MSA (PLMC params file) is ' - f'{self._target_seq[0]}{self.index_list[0]} and the last position ' - f'used is {self._target_seq[-1]}{self.index_list[-1]}.') - if len(not_valid) > 0: - logger.info(f'Further, non-included positions are:\n{str(not_valid)[1:-1]}') - logger.info(f'Summary of all effective positions represented in the MSA ' - f'based on wild-type sequence ({len(valid)} encoded positions):\n' - f'{str([aa_pos for aa_pos in self.wt_aa_pos])[1:-1]}'.replace("'", "")) - - # single site frequencies f_i and fields h_i - self.f_i, = np.fromfile( - f, dtype=(precision, (self.l, self.num_symbols)), count=1 - ) - - self.h_i, = np.fromfile( - f, dtype=(precision, (self.l, self.num_symbols)), count=1 - ) - - # pair frequencies f_ij and pair couplings J_ij / J_ij - self.f_ij = np.zeros( - (self.l, self.l, self.num_symbols, self.num_symbols) - ) - - self.j_ij = np.zeros( - (self.l, self.l, self.num_symbols, self.num_symbols) - ) - - for i in range(self.l - 1): - for j in range(i + 1, self.l): - self.f_ij[i, j], = np.fromfile( - f, dtype=(precision, (self.num_symbols, self.num_symbols)), - count=1 - ) - self.f_ij[j, i] = self.f_ij[i, j].T - - for i in range(self.l - 1): - for j in range(i + 1, self.l): - self.j_ij[i, j], = np.fromfile( - f, dtype=(precision, (self.num_symbols, self.num_symbols)), - count=1 - ) - - self.j_ij[j, i] = self.j_ij[i, j].T - - def get_target_seq_and_index(self): - """ - Gets and returns the target sequence of encodeable positions as - well as the index list of encodeable positions that are the - corresponding amino acid positions of the wild type sequence (1-indexed). 
- - Returns - ---------- - self._target_seq: list - List of single letter strings of the wild-type amino acids - at the encodeable positions - self._index_list: list - List of integers of encodeable amino acid positions - """ - return self._target_seq, self._index_list - - @property - def target_seq(self): - """ - Target/Focus sequence of model used for delta_hamiltonian - calculations (including single and double mutation matrices) - """ - return self._target_seq - - @target_seq.setter - def target_seq(self, sequence): - """ - Define a new target sequence - - Parameters - ---------- - sequence : str, or list of chars - Define a new default sequence for relative Hamiltonian - calculations (e.g. energy difference relative to wild-type - sequence). - Length of sequence must correspond to model length (self.l) - """ - if len(sequence) != self.l: - raise ValueError(f"Sequence length inconsistent with model length: {len(sequence)} != {self.l}") - - if isinstance(sequence, str): - sequence = list(sequence) - - self._target_seq = np.array(sequence) - self.target_seq_mapped = np.array([self.alphabet_map[x] for x in self.target_seq]) - self.has_target_seq = True - - @property - def index_list(self): - """ - Target/Focus sequence of model used for delta_hamiltonian - calculations (including single and double mutation matrices) - """ - return self._index_list - - @index_list.setter - def index_list(self, mapping): - """ - Define a new number mapping for sequences - - Parameters - ---------- - mapping: list of int - Sequence indices of the positions in the model. - Length of list must correspond to model length (self.l) - """ - if len(mapping) != self.l: - raise ValueError(f"Mapping length inconsistent with model length: {len(mapping)} != {self.l}") - - self._index_list = np.array(mapping) - self.index_map = {b: a for a, b in enumerate(self.index_list)} - - def __map(self, indices, mapping): - """ - Applies a mapping either to a single index, or to a list of indices - - Parameters - ---------- - indices : Iterable of items to be mapped, or single item - mapping: Dictionary containing mapping into new space - - Returns - ------- - Iterable, or single item - Items mapped into new space - """ - if ((isinstance(indices, Iterable) and not isinstance(indices, str)) or - (isinstance(indices, str) and len(indices) > 1)): - return np.array([mapping[i] for i in indices]) - else: - return mapping[indices] - - def __4d_access(self, matrix, i=None, j=None, a_i=None, a_j=None): - """ - Provides shortcut access to column pair properties - (e.g. J_ij or f_ij matrices) - - Parameters - ----------- - i : Iterable(int) or int - Position(s) on first matrix axis - j : Iterable(int) or int - Position(s) on second matrix axis - a_i : Iterable(str) or str - Symbols corresponding to first matrix axis - a_j : Iterable(str) or str - Symbols corresponding to second matrix axis - - Returns - ------- - np.array - 4D matrix "matrix" sliced according to values i, j, a_i and a_j - """ - i = self.__map(i, self.index_map) if i is not None else _SLICE - j = self.__map(j, self.index_map) if j is not None else _SLICE - a_i = self.__map(a_i, self.alphabet_map) if a_i is not None else _SLICE - a_j = self.__map(a_j, self.alphabet_map) if a_j is not None else _SLICE - return matrix[i, j, a_i, a_j] - - def __2d_access(self, matrix, i=None, a_i=None): - """ - Provides shortcut access to single-column properties - (e.g. 
f_i or h_i matrices) - - Parameters - ----------- - i : Iterable(int) or int - Position(s) on first matrix axis - a_i : Iterable(str) or str - Symbols corresponding to first matrix axis - - Returns - ------- - np.array - 2D matrix "matrix" sliced according to values i and a_i - """ - i = self.__map(i, self.index_map) if i is not None else _SLICE - a_i = self.__map(a_i, self.alphabet_map) if a_i is not None else _SLICE - return matrix[i, a_i] - - def get_jij(self, i=None, j=None, a_i=None, a_j=None): - """ - Quick access to J_ij matrix with automatic index mapping. - See __4d_access for explanation of parameters. - """ - return self.__4d_access(self.j_ij, i, j, a_i, a_j) - - def get_hi(self, i=None, a_i=None): - """ - Quick access to h_i matrix with automatic index mapping. - See __2d_access for explanation of parameters. - """ - return self.__2d_access(self.h_i, i, a_i) - - -class PLMC(CouplingsModel): - def __init__( - self, - params_file: str, - separator: str = '/', - verbose: bool = True - ): - """ - Class for performing the 'DCA encoding'. - - Attributes - ---------- - params_file: str - Binary parameter file outputted by PLMC. - """ - super().__init__(filename=params_file) # inherit functions and variables from class CouplingsModel - self.verbose = verbose - self.separator = separator - target_seq, index = self.get_target_seq_and_index() - self.x_wt = self.collect_encoded_sequences(target_seq[0] + str(index[0]) + target_seq[0]) - - def _get_position_internal(self, position: int): - """ - Description - ----------- - Returns the "internal position" of an amino acid, e.g., D19V is the desired substitution, - but the fasta sequence starts from residue 3, i.e., the first two residues are "missing". - The DCA model will then recognize D19 as D17. In order to avoid wrong assignments, - it is inevitable to calculate the "internal position" 'i'. - - Parameters - ---------- - position : int - Position of interest - - Returns - ------- - i : int - "Internal position" that may differ due to different starting residue. - None - If the requested position is not an active site. - """ - offset = 0 - i = position - offset - if i in self.index_list: - return i - else: - return None - - def sum_ji(self, i: int, a_i: str, sequence: np.ndarray) -> float: - """ - Description - ----------- - Calculates the sum of all site-site interaction terms when site 'i' is occupied with amino acid 'a_i'. - - Parameters - ---------- - i : int - "Internal position" see '_get_position_internal' for an explanation. - a_i : str - Introduced amino acid at 'i' in one-letter code representation. - sequence: np.ndarray - Sequence of the variant as numpy array. - - Returns - ------- - j_i : float - Sum J(i) of all site-site interaction terms acting on position 'i' when occupied with 'a_i'. - """ - j_i_sum = 0.0 - for j, a_j in zip(self.index_list, sequence): - j_i_sum += self.get_jij(i=i, a_i=a_i, j=j, a_j=a_j) - - return j_i_sum - - @staticmethod - def _unpack_substitution(substitution: str) -> tuple: - """ - Description - ----------- - Turns string representation of variant into tuple. - - Parameters - ---------- - substitution : str - Substitution as string: Integer enclosed by two letters representing - the wild-type (first) and variant amino acid (last) in one letter code. 
- - Returns - ------- - substitution : tuple - (wild-type amino acid, position, variant amino acid) - """ - return substitution[0], int(substitution[1:-1]), substitution[-1] - - def check_substitution_naming_against_wt(self, substitution: str, variant: str): - """ - Checks whether the amino acid to substitute of the variant matches - the amino acid of the wild type at this position. - """ - if substitution[:-1] not in self.wt_aa_pos: - wild_type_aa, position, a_i = self._unpack_substitution(substitution) - raise SystemError( - f"The variant naming scheme is not fitting to the PLMC " - f"scheme. Substitution {substitution} of variant {variant} has " - f"the amino acid {wild_type_aa} at position {position}, which " - f"does not match the wild type sequence used as target for DCA-" - f"based coupling parameter file generation. See summary of " - f"(effective) wild-type positions and amino acids above. Please " - f"check your input variant data or generate a new parameter file " - f"for encoding." - ) - - def encode_variant(self, variant: str) -> np.ndarray: - """ - Description - ----------- - Encodes the variant using its "DCA representation". - - Parameters - ---------- - variant : str - Joined string of integers enclosed by two letters representing the wild type - and variant amino acid in the single letter code. -> Check separator - - Returns - ------- - x_var : np.ndarray - Encoded sequence of the variant. - """ - sequence = self.target_seq.copy() - for substitution in get_single_substitutions(variant, self.separator): # e.g. A123C/D234E --> A123C, D234C - wild_type_aa, position, a_i = self._unpack_substitution(substitution) - - i = self._get_position_internal(position) - if not i: - raise EffectiveSiteError(position, variant, self.verbose) - - self.check_substitution_naming_against_wt(substitution, variant) - i_mapped = self.index_map[i] - sequence[i_mapped] = a_i - - x_var = np.zeros(sequence.size, dtype=float) - for idx, (i, a_i) in enumerate(zip(self.index_list, sequence)): - x_var[idx] = self.get_hi(i, a_i) + 0.5 * self.sum_ji(i, a_i, sequence) - - return x_var - - def collect_encoded_sequences(self, variants: list) -> np.ndarray: - """ - Description - ----------- - Collects all encoded sequences based on input variant names. - - Parameters - ---------- - variants: list - Variant name, e.g. 'A13E', or 'D127F' (wild-type sequence - would be defined by substitution to itself, e.g. 'F17F'). - - Returns - ---------- - encoded_sequences: list - List of all collected encoded sequences (features) for all - inputted variants. - non_encoded_variants_list_pos:list - Internal array/list position for variants that could not be - encoded due to the underlying MSA (inter-gap threshold for - computing of local and coupling PLMC parameters). These list - positions must then be removed for the corresponding fitness - arrays/lists for training and testing the model. 
- """ - encoded_sequences = [] - if len(np.atleast_1d(variants)) == 1: # do not show progress bar for single variant - set_silence = True # thus, also not for directed evolution - else: - set_silence = False - for i, variant in enumerate(tqdm(np.atleast_1d(variants), disable=set_silence)): - try: - encoded_sequences.append(self.encode_variant(variant)) - except EffectiveSiteError: - encoded_sequences.append([None]) - - return np.array(encoded_sequences, dtype='object') - - -""" -Below: Some helper functions to run the PLMC class and get -the encoded sequences in parallel (threading) using Ray and -to construct a pandas.DataFrame to store the encoded sequences -(features) and the associated fitness values in a CSV file. -""" - - -def save_plmc_dca_encoding_model(params_file, substitution_sep='/'): - """ - Just converts plmc params from raw binary to - Pickle-saved PLMC class. - """ - logger.info("Transforming the provided plmc params file " - "to a PLMC Pickle file (Pickles/PLMC).") - plmc = PLMC( - params_file=params_file, - separator=substitution_sep, - verbose=False - ) - try: - os.mkdir('Pickles') - except FileExistsError: - pass - pickle.dump({ - 'model': plmc, - 'model_type': 'PLMCpureDCA', - 'beta_1': None, - 'beta_2': None, - 'spearman_rho': None, - 'regressor': None - }, - open(f'Pickles/PLMC', 'wb') - ) - - -def get_encoded_sequence( - variant: str, - dca_encode: PLMC -): - """ - Description - ----------- - Gets encoded sequence based on input variant name and a preexisting - PLMC instance. - - Parameters - ---------- - variant: str - Variant name, e.g. 'A13E', or 'D127F'. Wild-type sequence - is defined by substitution to itself, e.g. 'F17F'. - dca_encode: PLMC class object - For encoding sequences, see above: class PLMC. - """ - try: - encoded_seq = dca_encode.encode_variant(variant) - except EffectiveSiteError: # position not included in processed MSA - return - - return encoded_seq - - -@ray.remote -def _get_data_parallel( - variants: list, - sequences: list, - fitnesses: list, - dca_encode: PLMC, - data: list -) -> list: - """ - Description - ----------- - Get the variant name, the associated fitness value, and its ("DCA"-)encoded sequence. - - Parameters - ---------- - variants : list - List of strings containing the variants to be encoded. - fitnesses : list - List of floats (1d) containing the fitness values associated to the variants. - dca_encode : object - Initialized 'PLMC' class object. - data : manager.list() - Manager.list() object to store the output of multiple processors. - - Returns - ------- - data : manager.list() - Filled list with variant names, fitnesses, and encoded sequence. - """ - for i, (variant, sequence, fitness) in enumerate(zip(variants, sequences, fitnesses)): - try: - data.append([variant, sequence, dca_encode.encode_variant(variant), fitness]) - except EffectiveSiteError: # do not append non-encoded sequences and - pass # associated fitness values - - return data - - -def get_dca_data_parallel( - variants: list, - sequences: list, - fitnesses: list, - dca_encode: PLMC, - threads: int, - verbose=True -) -> tuple[list, list, list, list]: - """ - Description - ----------- - This function allows to generate the encoded sequences based on the variants - given in 'csv_file' in a parallel manner. - - Parameters - ---------- - variants: list (or np.ndarray) - Variant names. - sequences: list (or np.ndarray) - Variant-associated protein sequences. - fitnesses: list (or np.ndarray) - Variant-associated fitness values. 
- dca_encode : object - Initialized 'PLMC' class object. - threads : int - Number of processes to be used for parallel execution. - n_cores = 1 defines no threading (not using Ray). - verbose: bool - Logging message on/off. - - Returns - ------- - data: numpy.ndarray - Filled numpy array including variant names, fitnesses, and encoded sequences. - non_effective_subs: list - List of variant names that cannot be used for modelling as they are not effective - positions in the underlying MSA used for generating local and coupling terms. - """ - if verbose: - logger.info(f'{len(variants)} input variants. Encoding variant sequences using parameters ' - f'taken from plmc generated file. This might take some time...') - - idxs_nan = np.array([i for i, b in enumerate(np.isnan(fitnesses)) if b]) # find fitness NaNs - if idxs_nan.size > 0: # remove NaNs if present - logger.info(f'Fitness NaNs are: {idxs_nan}') - fitnesses = np.delete(fitnesses, idxs_nan) - variants = np.delete(variants, idxs_nan) - - variants_split = np.array_split(variants, threads) # split array in n_cores parts - sequences_split = np.array_split(sequences, threads) # for Ray parallelization - fitnesses_split = np.array_split(fitnesses, threads) - results = ray.get([ - _get_data_parallel.remote( - variants_split[i], - sequences_split[i], - fitnesses_split[i], - dca_encode, - [] - ) for i in range(len(variants_split)) - ]) - - data = [item for sublist in results for item in sublist] # fusing all the individual results - variants = [item[0] for item in data] - sequences = [item[1] for item in data] - xs = [item[2] for item in data] - fitnesses = [item[3] for item in data] - - if verbose: - logger.info(f'{len(data)} variants after NaN-valued and non-effective ' - f'site-substituted variant (EffectiveSiteError) dropping.') - - return variants, sequences, xs, fitnesses +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +""" +Contains Python code used for the approach presented in our 'hybrid modeling' paper +Preprint available at: https://doi.org/10.1101/2022.06.07.495081 +Code available at: https://github.com/Protein-Engineering-Framework/Hybrid_Model + +The included class 'CouplingsModel' has been taken from the script 'model.py' as part of the +EVmutation module (https://github.com/debbiemarkslab/EVmutation) written by Thomas Hopf in the +labs of Debora Marks and Chris Sander at Harvard Medical School and modified (shortened). +See also: https://doi.org/10.1038/nbt.3769 +Hopf, T. A., Ingraham, J. B., Poelwijk, F.J., Schärfe, C.P.I., Springer, M., Sander, C., & Marks, D. S. (2016). +Mutation effects predicted from sequence co-variation. Nature Biotechnology, in press. 
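+
+For reference: the DCA encoding implemented below derives from the Potts model
+statistical energy of a sequence sigma,
+    E(sigma) = sum_i h_i(sigma_i) + sum_(i<j) J_ij(sigma_i, sigma_j),
+and each per-position feature used for encoding is the term
+    x_i = h_i(sigma_i) + 0.5 * sum_j J_ij(sigma_i, sigma_j)
+(see 'encode_variant' and 'sum_ji' below; the factor 0.5 corrects for double
+counting of the symmetric couplings).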
+ +References: +[1] Hopf, T. A., Ingraham, J. B., Poelwijk, F.J., Schärfe, C.P.I., Springer, M., Sander, C., & Marks, D. S. + Mutation effects predicted from sequence co-variation. + Nature Biotechnology, 35, 2017, 128–135 + https://doi.org/10.1038/nbt.3769 +[2] Hopf T. A., Green A. G., Schubert B., et al. + The EVcouplings Python framework for coevolutionary sequence analysis. + Bioinformatics 35, 2019, 1582–1584 + https://doi.org/10.1093/bioinformatics/bty862 +[3] Ekeberg, M., Lövkvist, C., Lan, Y., Weigt, M., & Aurell, E. + Improved contact prediction in proteins: Using pseudolikelihoods to infer Potts models. + Physical Review E, 87(1), 2013, 012707. doi:10.1103/PhysRevE.87.012707 + https://doi.org/10.1103/PhysRevE.87.012707 +""" + + +import os +from collections.abc import Iterable +import logging +logger = logging.getLogger('pypef.dca.encoding') + +import numpy as np +import ray +from tqdm import tqdm +import pickle + +from pypef.utils.variant_data import amino_acids + +_SLICE = np.s_[:] + + +class InvalidVariantError(Exception): + """ + Description + ----------- + Exception raised when entered variant does not follow the required scheme + (integer enclosed by two one-letter code representations of amino acids). + + Attributes + ---------- + variant: str + Variant that causes the error + message: str + Explanation of the error + """ + + def __init__(self, variant: str): + self.variant = variant + message = "The entered variant '%s' does not follow the required scheme " \ + "(integer enclosed by two one letter code representations of amino acids). " \ + "Check separator or variant." % self.variant + self.message = message + super().__init__(self.message) + + +class EffectiveSiteError(Exception): + """ + Description + ----------- + Exception raised when requested position is not implemented in the DCA model. + + Attributes + ---------- + position: int + Position that causes the error + variant: str + Variant including that position + message: str + Explanation of the error + """ + + def __init__(self, position: int, variant: str, verbose: bool = True): + self.position = position + self.variant = variant + self.verbose = verbose + message = f"The position {self.position} of variant '{self.variant}' is " \ + f"not an effective site in the DCA model and thus cannot be predicted." + if self.verbose: + logger.info(message) + self.message = message + super().__init__(self.message) + + +def is_valid_substitution(substitution: str) -> bool: + """ + Description + ----------- + A substitution has to follow the scheme: + First character: (wild-type/substituted) amino acid in one-letter code representation + Last character: (introduced) amino acid in one-letter code representation + In between: position (of substitution) + + If the entered substitution does not follow this scheme (integer enclosed by two one + letter code representations of amino acids) return False, else return True. + + Parameters + ----------- + substitution : str + Substitution as string: Integer enclosed by two letters representing + the wild-type (first) and variant amino acid (last) in one letter code. 
+
+    Returns
+    -------
+    bool
+    """
+    if substitution[0] not in amino_acids:  # not accepting format IntegerAA, e.g., 123D
+        return False
+
+    if substitution[-1] not in amino_acids:
+        return False
+
+    try:
+        int(substitution[1:-1])
+    except ValueError:
+        return False
+
+    return True
+
+
+def is_valid_variant(variant: str, separator='/') -> bool:
+    """
+    Description
+    -----------
+    Gets the single substitutions of the variant and checks if they follow the required scheme.
+
+    If any substitution does not follow this scheme (integer enclosed by two one-
+    letter code representations of amino acids) return False, else return True.
+
+    Parameters
+    ----------
+    variant : str
+        Joined string of integers enclosed by two letters representing the wild type
+        and variant amino acid in the single letter code. -> Check separator
+    separator : str
+        Character to split the variant to obtain the single substitutions (default='/').
+
+    Returns
+    -------
+    bool
+    """
+    for substitution in variant.split(separator):
+        if not is_valid_substitution(substitution):
+            return False
+
+    return True
+
+
+def get_single_substitutions(variant: str, separator='/') -> Iterable:
+    """
+    Description
+    -----------
+    Generator that extracts and returns the single substitutions of the entered variant.
+
+    Parameters
+    ----------
+    See 'is_valid_variant' for an explanation.
+
+    Returns
+    -------
+    Generator object
+    """
+    if is_valid_variant(variant, separator):
+        for substitution in variant.split(separator):
+            yield substitution
+
+    else:
+        raise InvalidVariantError(variant)
+
+
+class CouplingsModel:
+    """
+    Class to store parameters of pairwise undirected graphical model of sequences
+    and compute evolutionary couplings, sequence statistical energies, etc.
+    """
+    def __init__(
+            self,
+            filename,
+            precision="float32",
+            verbose: bool = False,
+            **kwargs
+    ):
+        """
+        Initializes the object with raw values read from binary .Jij file
+
+        Parameters
+        ----------
+        filename : str
+            Binary Jij file containing model parameters from plmc software
+        precision : {"float32", "float64"}, default: "float32"
+            Sets if input file has single (float32) or double (float64) precision
+        verbose : bool
+            Logging messages on/off
+        """
+        self.index_map = None
+        self._target_seq = None
+        self._index_list = None
+        self.x_wt = None
+        self.verbose = verbose
+        try:
+            self.__read_plmc_v2(filename, precision)
+        except (TypeError, FileNotFoundError):  # tuple syntax required to catch both exception types
+            raise SystemError(
+                "Did not find (specified) PLMC parameter file. "
+                "The parameter file is required for DCA-based "
+                "encoding and can be provided via the flag "
+                "--params PLMC_FILE."
+            )
+        self.alphabet_map = {s: i for i, s in enumerate(self.alphabet)}
+
+        # in non-gap mode, focus sequence is still coded with a gap character,
+        # but gap is not part of model alphabet anymore; so if mapping crashes
+        # that means there is a non-alphabet character in sequence array
+        # and therefore there is no focus sequence.
+        try:
+            self.target_seq_mapped = np.array([self.alphabet_map[x] for x in self.target_seq])
+            self.has_target_seq = (np.sum(self.target_seq_mapped) > 0)
+        except KeyError:
+            self.target_seq_mapped = np.zeros(shape=np.shape(self.l), dtype=np.int32)
+            self.has_target_seq = False
+
+    def __read_plmc_v2(self, filename, precision):
+        """
+        Read updated Jij file format from plmc.
+ + Parameters + ---------- + filename : str + Binary Jij file containing model parameters + precision : {"float32", "float64"} + Sets if input file has single or double precision + + """ + with open(filename, "rb") as f: + # model length, number of symbols, valid/invalid sequences + # and iterations + self.l, self.num_symbols, self.n_valid, self.n_invalid, self.num_iter = ( + np.fromfile(f, "int32", 5) + ) + + # theta, regularization weights, and effective number of samples + self.theta, self.lambda_h, self.lambda_j, self.lambda_group, self.n_eff = ( + np.fromfile(f, precision, 5) + ) + + # Read alphabet (make sure we get proper unicode rather than byte string) + self.alphabet = np.fromfile( + f, "S1", self.num_symbols + ).astype("U1") + + # weights of individual sequences (after clustering) + self.weights = np.fromfile( + f, precision, self.n_valid + self.n_invalid + ) + + # target sequence and index mapping, again ensure unicode + self._target_seq = np.fromfile(f, "S1", self.l).astype("U1") + self.index_list = np.fromfile(f, "int32", self.l) + + # Analyzing Positions included in the PLMC file (based on the MSA) + not_valid, valid = [], [] + for num in range(self.index_list[0], self.index_list[-1] + 1, 1): + if num not in self.index_list: + not_valid.append(num) + else: + valid.append(num) + self.wt_aa_pos = [] + for aa, pos in zip(self._target_seq, self.index_list): + self.wt_aa_pos.append(str(aa) + str(pos)) + if self.verbose: + logger.info(f'Evaluating gap content of PLMC parameter file... ' + f'First amino acid position used in the MSA (PLMC params file) is ' + f'{self._target_seq[0]}{self.index_list[0]} and the last position ' + f'used is {self._target_seq[-1]}{self.index_list[-1]}.') + if len(not_valid) > 0: + logger.info(f'Further, non-included positions are:\n{str(not_valid)[1:-1]}') + logger.info(f'Summary of all effective positions represented in the MSA ' + f'based on wild-type sequence ({len(valid)} encoded positions):\n' + f'{str([aa_pos for aa_pos in self.wt_aa_pos])[1:-1]}'.replace("'", "")) + + # single site frequencies f_i and fields h_i + self.f_i, = np.fromfile( + f, dtype=(precision, (self.l, self.num_symbols)), count=1 + ) + + self.h_i, = np.fromfile( + f, dtype=(precision, (self.l, self.num_symbols)), count=1 + ) + + # pair frequencies f_ij and pair couplings J_ij / J_ij + self.f_ij = np.zeros( + (self.l, self.l, self.num_symbols, self.num_symbols) + ) + + self.j_ij = np.zeros( + (self.l, self.l, self.num_symbols, self.num_symbols) + ) + + for i in range(self.l - 1): + for j in range(i + 1, self.l): + self.f_ij[i, j], = np.fromfile( + f, dtype=(precision, (self.num_symbols, self.num_symbols)), + count=1 + ) + self.f_ij[j, i] = self.f_ij[i, j].T + + for i in range(self.l - 1): + for j in range(i + 1, self.l): + self.j_ij[i, j], = np.fromfile( + f, dtype=(precision, (self.num_symbols, self.num_symbols)), + count=1 + ) + + self.j_ij[j, i] = self.j_ij[i, j].T + + def get_target_seq_and_index(self): + """ + Gets and returns the target sequence of encodeable positions as + well as the index list of encodeable positions that are the + corresponding amino acid positions of the wild type sequence (1-indexed). 
+
+        Returns
+        -------
+        self._target_seq: list
+            List of single letter strings of the wild-type amino acids
+            at the encodeable positions
+        self._index_list: list
+            List of integers of encodeable amino acid positions
+        """
+        return self._target_seq, self._index_list
+
+    @property
+    def target_seq(self):
+        """
+        Target/Focus sequence of model used for delta_hamiltonian
+        calculations (including single and double mutation matrices)
+        """
+        return self._target_seq
+
+    @target_seq.setter
+    def target_seq(self, sequence):
+        """
+        Define a new target sequence
+
+        Parameters
+        ----------
+        sequence : str, or list of chars
+            Define a new default sequence for relative Hamiltonian
+            calculations (e.g. energy difference relative to wild-type
+            sequence).
+            Length of sequence must correspond to model length (self.l)
+        """
+        if len(sequence) != self.l:
+            raise ValueError(f"Sequence length inconsistent with model length: {len(sequence)} != {self.l}")
+
+        if isinstance(sequence, str):
+            sequence = list(sequence)
+
+        self._target_seq = np.array(sequence)
+        self.target_seq_mapped = np.array([self.alphabet_map[x] for x in self.target_seq])
+        self.has_target_seq = True
+
+    @property
+    def index_list(self):
+        """
+        Index mapping of the model, i.e., the list of sequence positions
+        (in original, e.g. wild-type, numbering) represented in the model
+        """
+        return self._index_list
+
+    @index_list.setter
+    def index_list(self, mapping):
+        """
+        Define a new number mapping for sequences
+
+        Parameters
+        ----------
+        mapping : list of int
+            Sequence indices of the positions in the model.
+            Length of list must correspond to model length (self.l)
+        """
+        if len(mapping) != self.l:
+            raise ValueError(f"Mapping length inconsistent with model length: {len(mapping)} != {self.l}")
+
+        self._index_list = np.array(mapping)
+        self.index_map = {b: a for a, b in enumerate(self.index_list)}
+
+    def __map(self, indices, mapping):
+        """
+        Applies a mapping either to a single index, or to a list of indices
+
+        Parameters
+        ----------
+        indices : Iterable of items to be mapped, or single item
+        mapping : Dictionary containing mapping into new space
+
+        Returns
+        -------
+        Iterable, or single item
+            Items mapped into new space
+        """
+        if ((isinstance(indices, Iterable) and not isinstance(indices, str)) or
+                (isinstance(indices, str) and len(indices) > 1)):
+            return np.array([mapping[i] for i in indices])
+        else:
+            return mapping[indices]
+
+    def __4d_access(self, matrix, i=None, j=None, a_i=None, a_j=None):
+        """
+        Provides shortcut access to column pair properties
+        (e.g. J_ij or f_ij matrices)
+
+        Parameters
+        -----------
+        i : Iterable(int) or int
+            Position(s) on first matrix axis
+        j : Iterable(int) or int
+            Position(s) on second matrix axis
+        a_i : Iterable(str) or str
+            Symbols corresponding to first matrix axis
+        a_j : Iterable(str) or str
+            Symbols corresponding to second matrix axis
+
+        Returns
+        -------
+        np.array
+            4D matrix "matrix" sliced according to values i, j, a_i and a_j
+        """
+        i = self.__map(i, self.index_map) if i is not None else _SLICE
+        j = self.__map(j, self.index_map) if j is not None else _SLICE
+        a_i = self.__map(a_i, self.alphabet_map) if a_i is not None else _SLICE
+        a_j = self.__map(a_j, self.alphabet_map) if a_j is not None else _SLICE
+        return matrix[i, j, a_i, a_j]
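+
+    # A minimal usage sketch (hypothetical parameter file name; positions and
+    # symbols must be contained in index_list and the model alphabet):
+    #     model = CouplingsModel('PLMC_FILE.params')
+    #     jij = model.get_jij(i=13, j=17, a_i='A', a_j='E')  # coupling term J_ij
+    #     hi = model.get_hi(i=13, a_i='A')                   # local field term h_i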
+
+    def __2d_access(self, matrix, i=None, a_i=None):
+        """
+        Provides shortcut access to single-column properties
+        (e.g. f_i or h_i matrices)
+
+        Parameters
+        -----------
+        i : Iterable(int) or int
+            Position(s) on first matrix axis
+        a_i : Iterable(str) or str
+            Symbols corresponding to first matrix axis
+
+        Returns
+        -------
+        np.array
+            2D matrix "matrix" sliced according to values i and a_i
+        """
+        i = self.__map(i, self.index_map) if i is not None else _SLICE
+        a_i = self.__map(a_i, self.alphabet_map) if a_i is not None else _SLICE
+        return matrix[i, a_i]
+
+    def get_jij(self, i=None, j=None, a_i=None, a_j=None):
+        """
+        Quick access to J_ij matrix with automatic index mapping.
+        See __4d_access for explanation of parameters.
+        """
+        return self.__4d_access(self.j_ij, i, j, a_i, a_j)
+
+    def get_hi(self, i=None, a_i=None):
+        """
+        Quick access to h_i matrix with automatic index mapping.
+        See __2d_access for explanation of parameters.
+        """
+        return self.__2d_access(self.h_i, i, a_i)
+
+
+class PLMC(CouplingsModel):
+    def __init__(
+            self,
+            params_file: str,
+            separator: str = '/',
+            verbose: bool = True
+    ):
+        """
+        Class for performing the 'DCA encoding'.
+
+        Attributes
+        ----------
+        params_file: str
+            Binary parameter file output by plmc.
+        """
+        super().__init__(filename=params_file)  # inherit functions and variables from class CouplingsModel
+        self.verbose = verbose
+        self.separator = separator
+        target_seq, index = self.get_target_seq_and_index()
+        self.x_wt = self.collect_encoded_sequences(target_seq[0] + str(index[0]) + target_seq[0])
+
+    def _get_position_internal(self, position: int):
+        """
+        Description
+        -----------
+        Returns the "internal position" of an amino acid, e.g., D19V is the desired substitution,
+        but the fasta sequence starts from residue 3, i.e., the first two residues are "missing".
+        The DCA model will then recognize D19 as D17. In order to avoid wrong assignments,
+        it is necessary to calculate the "internal position" 'i'.
+
+        Parameters
+        ----------
+        position : int
+            Position of interest
+
+        Returns
+        -------
+        i : int
+            "Internal position" that may differ due to different starting residue.
+        None
+            If the requested position is not an effective site.
+        """
+        offset = 0
+        i = position - offset
+        if i in self.index_list:
+            return i
+        else:
+            return None
+
+    def sum_ji(self, i: int, a_i: str, sequence: np.ndarray) -> float:
+        """
+        Description
+        -----------
+        Calculates the sum of all site-site interaction terms when site 'i' is occupied with amino acid 'a_i'.
+
+        Parameters
+        ----------
+        i : int
+            "Internal position", see '_get_position_internal' for an explanation.
+        a_i : str
+            Introduced amino acid at 'i' in one-letter code representation.
+        sequence: np.ndarray
+            Sequence of the variant as numpy array.
+
+        Returns
+        -------
+        j_i_sum : float
+            Sum J(i) of all site-site interaction terms acting on position 'i' when occupied with 'a_i'.
+        """
+        j_i_sum = 0.0
+        for j, a_j in zip(self.index_list, sequence):
+            j_i_sum += self.get_jij(i=i, a_i=a_i, j=j, a_j=a_j)
+
+        return j_i_sum
+
+    @staticmethod
+    def _unpack_substitution(substitution: str) -> tuple:
+        """
+        Description
+        -----------
+        Turns the string representation of a substitution into a tuple.
+
+        Parameters
+        ----------
+        substitution : str
+            Substitution as string: Integer enclosed by two letters representing
+            the wild-type (first) and variant amino acid (last) in one letter code.
+
+        Returns
+        -------
+        substitution : tuple
+            (wild-type amino acid, position, variant amino acid)
+        """
+        return substitution[0], int(substitution[1:-1]), substitution[-1]
+
+    def check_substitution_naming_against_wt(self, substitution: str, variant: str):
+        """
+        Checks whether the amino acid to be substituted in the variant matches
+        the amino acid of the wild type at this position.
+        """
+        if substitution[:-1] not in self.wt_aa_pos:
+            wild_type_aa, position, a_i = self._unpack_substitution(substitution)
+            raise SystemError(
+                f"The variant naming scheme does not fit the PLMC "
+                f"scheme. Substitution {substitution} of variant {variant} has "
+                f"the amino acid {wild_type_aa} at position {position}, which "
+                f"does not match the wild type sequence used as target for DCA-"
+                f"based coupling parameter file generation. See summary of "
+                f"(effective) wild-type positions and amino acids above. Please "
+                f"check your input variant data or generate a new parameter file "
+                f"for encoding."
+            )
+
+    def encode_variant(self, variant: str) -> np.ndarray:
+        """
+        Description
+        -----------
+        Encodes the variant using its "DCA representation".
+
+        Parameters
+        ----------
+        variant : str
+            Joined string of integers enclosed by two letters representing the wild type
+            and variant amino acid in the single letter code. -> Check separator
+
+        Returns
+        -------
+        x_var : np.ndarray
+            Encoded sequence of the variant.
+        """
+        sequence = self.target_seq.copy()
+        for substitution in get_single_substitutions(variant, self.separator):  # e.g. A123C/D234E --> A123C, D234E
+            wild_type_aa, position, a_i = self._unpack_substitution(substitution)
+
+            i = self._get_position_internal(position)
+            if i is None:  # _get_position_internal returns None for non-effective sites
+                raise EffectiveSiteError(position, variant, self.verbose)
+
+            self.check_substitution_naming_against_wt(substitution, variant)
+            i_mapped = self.index_map[i]
+            sequence[i_mapped] = a_i
+
+        x_var = np.zeros(sequence.size, dtype=float)
+        for idx, (i, a_i) in enumerate(zip(self.index_list, sequence)):
+            x_var[idx] = self.get_hi(i, a_i) + 0.5 * self.sum_ji(i, a_i, sequence)
+
+        return x_var
+
+    def collect_encoded_sequences(self, variants: list) -> np.ndarray:
+        """
+        Description
+        -----------
+        Collects all encoded sequences based on input variant names.
+
+        Parameters
+        ----------
+        variants: list
+            Variant names, e.g. 'A13E', or 'D127F' (the wild-type sequence
+            would be defined by substitution to itself, e.g. 'F17F').
+
+        Returns
+        -------
+        encoded_sequences: np.ndarray
+            Array of all collected encoded sequences (features) for all
+            input variants. Variants that could not be encoded due to the
+            underlying MSA (inter-gap threshold for computing of local and
+            coupling PLMC parameters) are represented by [None] entries;
+            these array/list positions must then be removed from the
+            corresponding fitness arrays/lists for training and testing
+            the model.
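+
+        Example
+        -------
+        A minimal sketch (assuming an existing plmc parameter file; the
+        file name is a placeholder):
+            plmc = PLMC(params_file='PLMC_FILE.params')
+            xs = plmc.collect_encoded_sequences(['A13E', 'D127F'])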
+        """
+        encoded_sequences = []
+        if len(np.atleast_1d(variants)) == 1:  # do not show progress bar for single variant
+            set_silence = True                 # thus, also not for directed evolution
+        else:
+            set_silence = False
+        for variant in tqdm(np.atleast_1d(variants), disable=set_silence):
+            try:
+                encoded_sequences.append(self.encode_variant(variant))
+            except EffectiveSiteError:
+                encoded_sequences.append([None])
+
+        return np.array(encoded_sequences, dtype='object')
+
+
+"""
+Below: Some helper functions to run the PLMC class and to get
+the encoded sequences (features) and the associated fitness
+values in parallel using Ray.
+"""
+
+
+def save_plmc_dca_encoding_model(params_file, substitution_sep='/'):
+    """
+    Converts the raw binary plmc parameters into a
+    pickled PLMC class instance.
+    """
+    logger.info("Transforming the provided plmc params file "
+                "to a PLMC Pickle file (Pickles/PLMC).")
+    plmc = PLMC(
+        params_file=params_file,
+        separator=substitution_sep,
+        verbose=False
+    )
+    try:
+        os.mkdir('Pickles')
+    except FileExistsError:
+        pass
+    with open('Pickles/PLMC', 'wb') as f:  # with statement closes the file handle after dumping
+        pickle.dump({
+            'model': plmc,
+            'model_type': 'PLMCpureDCA',
+            'beta_1': None,
+            'beta_2': None,
+            'spearman_rho': None,
+            'regressor': None
+        }, f)
+
+
+def get_encoded_sequence(
+        variant: str,
+        dca_encode: PLMC
+):
+    """
+    Description
+    -----------
+    Gets the encoded sequence based on the input variant name and a
+    preexisting PLMC instance.
+
+    Parameters
+    ----------
+    variant: str
+        Variant name, e.g. 'A13E', or 'D127F'. Wild-type sequence
+        is defined by substitution to itself, e.g. 'F17F'.
+    dca_encode: PLMC class object
+        For encoding sequences, see above: class PLMC.
+    """
+    try:
+        encoded_seq = dca_encode.encode_variant(variant)
+    except EffectiveSiteError:  # position not included in processed MSA
+        return
+
+    return encoded_seq
+
+
+@ray.remote
+def _get_data_parallel(
+        variants: list,
+        sequences: list,
+        fitnesses: list,
+        dca_encode: PLMC,
+        data: list
+) -> list:
+    """
+    Description
+    -----------
+    Gets the variant names, the associated fitness values, and the ("DCA"-)encoded sequences.
+
+    Parameters
+    ----------
+    variants : list
+        List of strings containing the variants to be encoded.
+    sequences : list
+        List of variant-associated protein sequences.
+    fitnesses : list
+        List of floats (1d) containing the fitness values associated to the variants.
+    dca_encode : object
+        Initialized 'PLMC' class object.
+    data : list
+        List to store the output of the parallel (Ray) processes.
+
+    Returns
+    -------
+    data : list
+        Filled list with variant names, sequences, encoded sequences, and fitnesses.
+    """
+    for variant, sequence, fitness in zip(variants, sequences, fitnesses):
+        try:
+            data.append([variant, sequence, dca_encode.encode_variant(variant), fitness])
+        except EffectiveSiteError:  # do not append non-encoded sequences and
+            pass                    # associated fitness values
+
+    return data
+
+
+def get_dca_data_parallel(
+        variants: list,
+        sequences: list,
+        fitnesses: list,
+        dca_encode: PLMC,
+        threads: int,
+        verbose=True
+) -> tuple[list, list, list, list]:
+    """
+    Description
+    -----------
+    Generates the encoded sequences for the input variants in a
+    parallel manner (using Ray).
+
+    Parameters
+    ----------
+    variants: list (or np.ndarray)
+        Variant names.
+    sequences: list (or np.ndarray)
+        Variant-associated protein sequences.
+    fitnesses: list (or np.ndarray)
+        Variant-associated fitness values.
+ dca_encode : object + Initialized 'PLMC' class object. + threads : int + Number of processes to be used for parallel execution. + n_cores = 1 defines no threading (not using Ray). + verbose: bool + Logging message on/off. + + Returns + ------- + data: numpy.ndarray + Filled numpy array including variant names, fitnesses, and encoded sequences. + non_effective_subs: list + List of variant names that cannot be used for modelling as they are not effective + positions in the underlying MSA used for generating local and coupling terms. + """ + if verbose: + logger.info(f'{len(variants)} input variants. Encoding variant sequences using parameters ' + f'taken from plmc generated file. This might take some time...') + + idxs_nan = np.array([i for i, b in enumerate(np.isnan(fitnesses)) if b]) # find fitness NaNs + if idxs_nan.size > 0: # remove NaNs if present + logger.info(f'Fitness NaNs are: {idxs_nan}') + fitnesses = np.delete(fitnesses, idxs_nan) + variants = np.delete(variants, idxs_nan) + + variants_split = np.array_split(variants, threads) # split array in n_cores parts + sequences_split = np.array_split(sequences, threads) # for Ray parallelization + fitnesses_split = np.array_split(fitnesses, threads) + results = ray.get([ + _get_data_parallel.remote( + variants_split[i], + sequences_split[i], + fitnesses_split[i], + dca_encode, + [] + ) for i in range(len(variants_split)) + ]) + + data = [item for sublist in results for item in sublist] # fusing all the individual results + variants = [item[0] for item in data] + sequences = [item[1] for item in data] + xs = [item[2] for item in data] + fitnesses = [item[3] for item in data] + + if verbose: + logger.info(f'{len(data)} variants after NaN-valued and non-effective ' + f'site-substituted variant (EffectiveSiteError) dropping.') + + return variants, sequences, xs, fitnesses diff --git a/pypef/main.py b/pypef/main.py index 662eff8..1c35d32 100644 --- a/pypef/main.py +++ b/pypef/main.py @@ -1,436 +1,454 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - - -# docstring for argument parsing using docopts -""" -PyPEF - Pythonic Protein Engineering Framework - -written by Niklas Siedhoff and Alexander-Maurice Illig. - -Modeling options ----------------- - I. Pure ML modeling - ------------------- - PyPEF provides three encoding options for training machine learning models, i.e. - regression models trained by supervised learning: - - 1. DCA: Direct coupling analysis (DCA) based on evolutionary couplings (input: - coupling parameter file generated by the C framework plmc) or - generating parameters using TensorFlow-based GREMLIN (input: MSA). - - 2. 
AAidx: Based on AAindex descriptors (566 amino acid descriptor files - taken from the AAindex database). - - 3. OneHot: One-hot encoding representing the occurrence of an - amino acid at a sequence position as a single 1 and 19 0's. - - Any encoding technique enables pure ML-based modeling, see - https://doi.org/10.1021/acs.jcim.1c00099 - and DCA-based sequence encoding enables a hybrid modeling approach, see - https://doi.org/10.1101/2022.06.07.495081 - - If an MSA can be constructed for the target sequence, e.g. using Jackhmmer, - encoding option 1 will likely outperform encoding option 2. - However, encoding option 2 provides a static encoding technique that is - independent from the evolutionary history of a target sequence and - without the need for MSA construction. - Here, the AAidx encodings for modeling are compared, i.e. validated, with respect - to their performance on the test set (comparable to an hyperparameter search - for finding the best static encoding set for model inference). - Further, one-hot encoding (encoding option 3) provides a simple but fast and often - well-performing encoding option that will likely outperform the AAindex-based - technique for model generalization. - - II. Hybrid modeling - ------------------- - Constructing a hybrid model that combines pure statistical DCA-based prediction (a - variant's relative 'evolutionary energy' to the wild type) and DCA-encoding based - training of a ML model similar to pure ML modeling option I.1. - Based on features generated from the direct coupling analysis (.params file output - using the plmc framework or provided MSA and running GREMLIN). - Individual model contributions are optimization only based on Spearman's correlation - coefficient and thus, only variant fitness ranks are to be considered for evaluating - model performance, not the exact predicted fitness value. For regression, up to now - only L2-regularized linear regression (Ridge regression) is provided as modeling option. - - -Running example of training, testing, and using a pure ML model for prediction ------------------------------------------------------------------------------- -Exemplary running of PyPEF for training a pure ML model using encoding option 2 -based on features generated from the AAIndex database (566 amino acid descriptor -indices taken from the AAIndex database). - 1. Create files for training and testing from variant-fitness CSV data: - pypef mklsts -i variant_and_fitness.csv -w wt_sequence.fasta - 2. Train and validate models: - pypef ml -e onehot -l LS.fasta -t TS.fasta --regressor pls - 3. Plot the test set entries against test set predictions (creates PNG figure, MODEL is - the chosen AAindex, the ML-DCA model, or here the ONEHOT model): - pypef ml -e onehot -m ONEHOT -t TS.fasta - 4. Create files for prediction: - - Single file: - pypef mkps -w wt_sequence.fasta -i variant_fitness.csv - - Recombinant/diverse prediction files: - pypef mkps -w wt_sequence.fasta -i variant_fitness.csv - [--drecomb] [--trecomb] [--qarecomb] [--qirecomb] - [--ddiverse] [--tdiverse] [--qdiverse] - 5. Predict (unknown/new) variants: - - Single file: - pypef ml -e aaidx -m MODEL -p Prediction_Set.fasta - - Recombinant/diverse prediction files in created prediction set folders: - pypef ml -e aaidx -m MODEL --pmult [--drecomb] [...] [--qdiverse] - - Directed evolution – for performing and plotting in silico evolution trajectories: - pypef ml -e aaidx directevo -m MODEL [...] 
-Note: The commands for hybrid modeling are very similar to the commands for pure ML modeling, -see pypef -h for possible commands. - -For generating DCA parameters using GREMLIN, you have to provide an MSA in FASTA or A2M format: -pypef param_inference --msa MSA_FILE --wt WT_FASTA [--opt_iter 100] - - -Helpful commands for data conversion ------------------------------------------------ -Creation of learning and test sets – splitting CSV variant-fitness data: - pypef mklsts --wt WT_FASTA --input CSV_FILE - [--drop THRESHOLD] [--numrnd NUMBER] - -Creation of prediction sets from CSV data (using single-substituted variant data): - pypef mkps --wt WT_FASTA --input CSV_FILE - [--drop THRESHOLD] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb] - [--ddiverse] [--tdiverse] [--qdiverse] - -Encoding a CSV file (for further performance studies such as "low N" or -"mutational extrapolation" engineering tasks: - pypef encode --input CSV_FILE --encoding ENCODING_TECHNIQUE --wt WT_FASTA - [--params PARAM_FILE] [--y_wt WT_FITNESS] [--model MODEL] [--nofft] - [--threads THREADS] [--sep CSV_COLUMN_SEPARATOR] [--fitness_key FITNESS_KEY] - -Converting a STO alignment file to A2M format: - pypef sto2a2m --sto STO_MSA_FILE - [--inter_gap INTER_GAP] [--intra_gap INTRA_GAP] - - -Usage: - pypef mklsts --wt WT_FASTA --input CSV_FILE - [--drop THRESHOLD] [--sep CSV_COLUMN_SEPARATOR] [--mutation_sep MUTATION_SEPARATOR] [--numrnd NUMBER] - pypef mkps --wt WT_FASTA [--input CSV_FILE] - [--drop THRESHOLD] [--ssm] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb] - [--ddiverse] [--tdiverse] [--qdiverse] - pypef param_inference - [--msa MSA_FILE] [--params PARAM_FILE] - [--wt WT_FASTA] [--opt_iter N_ITER] - pypef save_msa_info --msa MSA_FILE --wt WT_FASTA - [--opt_iter N_ITER] - pypef encode --input CSV_FILE --encoding ENCODING_TECHNIQUE --wt WT_FASTA - [--params PARAM_FILE] [--y_wt WT_FITNESS] [--model MODEL] [--nofft] - [--threads THREADS] - [--sep CSV_COLUMN_SEPARATOR] [--fitness_key FITNESS_KEY] - pypef reformat_csv --input CSV_FILE - [--sep CSV_COLUMN_SEPARATOR] [--mutation_sep MUTATION_SEPARATOR] [--fitness_key FITNESS_KEY] - pypef shift_pos --input CSV_FILE --offset OFFSET - [--sep CSV_COLUMN_SEPARATOR] [--mutation_sep MUTATION_SEPARATOR] [--fitness_key FITNESS_KEY] - pypef sto2a2m --sto STO_MSA_FILE [--inter_gap INTER_GAP] [--intra_gap INTRA_GAP] - pypef hybrid --ts TEST_SET - [--model MODEL] [--params PARAM_FILE] - [--ls LEARNING_SET] [--label] [--threads THREADS] - pypef hybrid --model MODEL --params PARAM_FILE - [--ts TEST_SET] [--label] - [--ps PREDICTION_SET] [--pmult] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb] - [--ddiverse] [--tdiverse] [--qdiverse] [--negative] - [--threads THREADS] - pypef hybrid directevo --wt WT_FASTA --params PARAM_FILE - [--model MODEL] - [--input CSV_FILE] [--y_wt WT_FITNESS] [--numiter NUM_ITER] - [--numtraj NUM_TRAJ] [--temp TEMPERATURE] - [--negative] [--usecsv] [--csvaa] [--drop THRESHOLD] - pypef hybrid train_and_save --input CSV_FILE --params PARAM_FILE --wt WT_FASTA - [--fit_size REL_LEARN_FIT_SIZE] [--test_size REL_TEST_SIZE] - [--threads THREADS] [--sep CSV_COLUMN_SEPARATOR] - [--fitness_key FITNESS_KEY] [--rnd_state RND_STATE] - pypef hybrid low_n --input ENCODED_CSV_FILE - pypef hybrid extrapolation --input ENCODED_CSV_FILE - [--conc] - pypef ml --encoding ENCODING_TECHNIQUE --ls LEARNING_SET --ts TEST_SET - [--save NUMBER] [--regressor TYPE] [--nofft] [--all] [--params PARAM_FILE] - [--sort METRIC_INT] [--threads THREADS] [--label] - pypef ml --encoding 
ENCODING_TECHNIQUE --model MODEL --ts TEST_SET - [--nofft] [--params PARAM_FILE] [--threads THREADS] [--label] - pypef ml --show - [MODELS] - pypef ml --encoding ENCODING_TECHNIQUE --model MODEL --ps PREDICTION_SET - [--params PARAM_FILE] [--threads THREADS] [--nofft] [--negative] - pypef ml --encoding ENCODING_TECHNIQUE --model MODEL --pmult - [--drecomb] [--trecomb] [--qarecomb] [--qirecomb] - [--ddiverse] [--tdiverse] [--qdiverse] - [--regressor TYPE] [--nofft] [--negative] [--params PARAM_FILE] [--threads THREADS] - pypef ml --encoding ENCODING_TECHNIQUE directevo --model MODEL --wt WT_FASTA - [--input CSV_FILE] [--y_wt WT_FITNESS] [--numiter NUM_ITER] [--numtraj NUM_TRAJ] [--temp TEMPERATURE] - [--nofft] [--negative] [--usecsv] [--csvaa] [--drop THRESHOLD] [--params PARAM_FILE] - pypef ml low_n --input ENCODED_CSV_FILE - [--regressor TYPE] - pypef ml extrapolation --input ENCODED_CSV_FILE - [--regressor TYPE] [--conc] - - -Options: - --all Finally training on all data [default: False]. - --conc Concatenating mutational level variants for predicting variants - from next higher level [default: False]. - --csvaa Directed evolution csv amino acid substitutions, - requires flag "--usecsv" [default: False]. - --ddiverse Create/predict double natural diverse variants [default: False]. - --drecomb Create/predict double recombinants [default: False]. - -d --drop THRESHOLD Below threshold variants will be discarded from the - data [default: -9E09]. - -e --encoding ENCODING_TECHNIQUE Sets technique used for encoding sequences for constructing regression models; - choose between 'aaidx' (AAIndex-based encoding), 'onehot' (OneHot-based encoding), - and DCA encoding using Gremlin/plmc (DCA-based encoding) [default: onehot]. - --fitness_key FITNESS_KEY Label of CSV fitness column. Else uses second column. - -h --help Show this screen [default: False]. - -i --input CSV_FILE Input data file in .csv format. - --inter_gap INTER_GAP Fraction to delete all positions with more than - 'inter_gap' * 100 % gaps (columnar trimming) [default: 0.3]. - --intra_gap INTRA_GAP Fraction to delete all sequences with more than - 'intra_gap' * 100 % gaps after being columnar trimmed - (line trimming) [default: 0.5]. - --label Label the plot instances [default: False]. - -l --ls LEARNING_SET Input learning set in .fasta format. - -m --model MODEL Model (pickle file) for plotting of validation or for - performing predictions. - --msa MSA_FILE Multiple sequence alignment (MSA) ins FASTA or A2M format for - inferring DCA parameters. - --mutation_sep MUTATION_SEP Mutation separator [default: /]. - --mutation_extrapolation Mutation extrapolation [default: False]. - --negative Set if more negative values define better variants [default: False]. - --nofft Raw sequence input, i.e., no FFT for establishing protein spectra - as vector inputs, only implemented as option for AAindex-based - sequence encoding [default: False]. - -n --numrnd NUMBER Number of randomly created Learning and Validation - datasets [default: 0]. - --numiter NUM_ITER Number of mutation iterations per evolution trajectory [default: 5]. - --numtraj NUM_TRAJ Number of trajectories, i.e., evolution pathways [default: 5]. - -o --offset OFFSET Offset for shifting substitution positions of the input CSV file [default: 0]. - --opt_iter N_ITER Number of iterations for GREMLIN-based optimization of local fields - and couplings [default: 100]. - --params PARAM_FILE Input PLMC couplings parameter file. 
- -u --pmult Predict for all prediction files in folder for recombinants - or for diverse variants [default: False]. - -p --ps PREDICTION_SET Prediction set for performing predictions using a trained Model. - --qdiverse Create quadruple natural diverse variants [default: False]. - --qarecomb Create/predict quadruple recombinants [default: False]. - --qirecomb Create/predict quintuple recombinants [default: False]. - --regressor TYPE Type of regression (R.) to use, options: PLS CV R.: pls, - PLS LOOCV R.: pls_loocv, Random Forest CV R.: rf, SVM CV R.: svr, - MLP CV R.: mlp, Ridge CV R.: ridge (or l2), - LassoLars CV R.: lassolars (or l1) [default: pls]. - --rnd_splits RND_SPLITS Number of random splits for Low N testing [default: 5]. - --rnd_state RND_STATE Sets the random state for reproduction, only implemented - for hybrid train_and_save [default: 42]. - -s --save NUMBER Number of models to be saved as pickle files [default: 5]. - --sep CSV_COLUMN_SEPARATOR CSV Column separator [default: ;]. - --show Show achieved model performances from Model_Results.txt. - --sort METRIC_INT Rank models based on metric {1: R^2, 2: RMSE, 3: NRMSE, - 4: Pearson's r, 5: Spearman's rho} [default: 1]. - --ssm Create single-saturation mutagenesis prediction set (does not - require CSV input) [default: False]. - --sto STO_MSA_FILE The input MSA file in STO (Stockholm) format. - --tdiverse Create/predict triple natural diverse variants [default: False]. - --temp TEMPERATURE "Temperature" of Metropolis-Hastings criterion [default: 0.01] - --threads THREADS Parallel computing of training and validation of models. - Number of threads used in parallel computing, by default - no hyperthreading. - --fit_size REL_LEARN_FIT_SIZE Relative size of the train set for initial fitting. The remaining data - for training is used for hyperparameter optimization on train subsets - used for validation, while in sum the total data for training is - training data = train_fit data + train_test(validation) data - = all data - test data. - The default of 0.66 means that 34 % of the train data is taken for - train_test validation [default: 0.66]. - --test_size REL_TEST_SIZE Relative size of the test set; if set to 0.0 the trained model - will not be tested [default: 0.2]. - --trecomb Create/predict triple recombinants [default: False]. - --usecsv Perform directed evolution on single variant csv position - data [default: False]. - -t --ts TEST_SET Input validation set in .fasta format. - --version Show version [default: False]. - -w --wt WT_FASTA Input wild-type sequence file (in FASTA format). - --wt_pos WT_POSITION Row position of encoded wild-type in encoding CSV file (0-indexed) [default: 0]. - -y --y_wt WT_FITNESS Fitness value (y) of wild-type [default: 1.0]. - encode Encoding [default: False]. - hybrid Hybrid modeling based on DCA-derived sequence encoding [default: False]. - ml Pure machine learning modeling based on encoded sequences [default: False]. - MODELS Number of saved models to show [default: 5]. - onehot OneHot-based encoding [default: False]. - param_inference Inferring DCA params using the GREMLIN approach [default: False]. - reformat_csv Reformat input CSV with indicated column and mutation separators to default - CSV style (column separator ';' and mutation separator '/') [default: False.] - save_msa_info Optimize local fields and couplings of MSA based on GREMLIN DCA approach and - save resulting coupling matrix and highly coevolved amino acids. 
- shift_pos Shift positions of all variant substitutions of the input CSV - file (identical to reformat_csv when setting --offset to 0) [default: False.] - sto2a2m Transform multiple sequence alignment from STO format to - A2M format [default: False]. -""" - - -from os import environ -try: - environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # only print TensorFlow errors, set to '0' or comment -except KeyError: # lines for seeing TensorFlow infos and warnings - pass - -from sys import argv, version_info -from pypef import __version__ -if version_info[0] < 3 or version_info[1] < 9: - raise SystemError(f"The current version of PyPEF (v {__version__}) " - f"requires at least Python 3.9 or higher.") - -import time -from datetime import timedelta -import logging - -from docopt import docopt -from schema import Schema, SchemaError, Optional, Or, Use - -from pypef.ml.ml_run import run_pypef_pure_ml -from pypef.dca.dca_run import run_pypef_hybrid_modeling -from pypef.utils.utils_run import run_pypef_utils - -logger = logging.getLogger("pypef") -logger.setLevel(logging.INFO) - -ch = logging.StreamHandler() -ch.setLevel(logging.DEBUG) - -formatter = logging.Formatter( - "%(asctime)s.%(msecs)03d %(levelname)s %(filename)s:%(lineno)d -- %(message)s", - "%Y-%m-%d %H:%M:%S" -) -ch.setFormatter(formatter) -logger.addHandler(ch) - - -schema = Schema({ - Optional('--all'): bool, - Optional('--conc'): bool, - Optional('--csvaa'): bool, - Optional('--ddiverse'): bool, - Optional('--drecomb'): bool, - Optional('--drop'): Use(float), - Optional('--encoding'): Use(str), - Optional('--fitness_key'): Or(None, str), - Optional('--fit_size'): Use(float), - Optional('--help'): bool, - Optional('--input'): Or(None, str), - Optional('--inter_gap'): Use(float), - Optional('--intra_gap'): Use(float), - Optional('--label'): bool, - Optional('--ls'): Or(None, str), - Optional('--model'): Or(None, str), - Optional('--msa'): Or(None, str), - Optional('--mutation_sep'): Or(None, str), - Optional('--negative'): bool, - Optional('--nofft'): bool, - Optional('--numrnd'): Use(int), - Optional('--numiter'): Use(int), - Optional('--numtraj'): Use(int), - Optional('--offset'): Use(int), - Optional('--opt_iter'): Use(int), - Optional('--params'): Or(None, str), - Optional('--pmult'): bool, - Optional('--ps'): Or(None, str), - Optional('--qdiverse'): bool, - Optional('--qarecomb'): bool, - Optional('--qirecomb'): bool, - Optional('--regressor'): Or(None, str), - Optional('--rnd_splits'): Use(int), - Optional('--rnd_state'): Use(int), - Optional('--save'): Use(int), - Optional('--sep'): Or(None, str), - Optional('--show'): Use(int), - Optional('--sort'): Use(int), - Optional('--ssm'): bool, - Optional('--sto'): Or(None, str), - Optional('--tdiverse'): bool, - Optional('--temp'): Use(float), - Optional('--test_size'): Use(float), - Optional('--threads'): Or(None, Use(int)), - Optional('--train_size'): Use(float), - Optional('--trecomb'): bool, - Optional('--usecsv'): bool, - Optional('--ts'): Or(None, str), - Optional('--wt'): Or(None, str), - Optional('--wt_pos'): Use(int), - Optional('--y_wt'): Or(None, Use(float)), - Optional('aaidx'): bool, - Optional('param_inference'): bool, - Optional('hybrid'): bool, - Optional('directevo'): bool, - Optional('encode'): bool, - Optional('extrapolation'): bool, - Optional('low_n'): bool, - Optional('mklsts'): bool, - Optional('mkps'): bool, - Optional('ml'): bool, - Optional('MODELS'): Or(None, Use(int)), - Optional('onehot'): bool, - Optional('reformat_csv'): bool, - Optional('save_msa_info'): bool, - 
Optional('shift_pos'): bool, - Optional('sto2a2m'): bool, - Optional('train_and_save'): bool, -}) - - -def validate(args): - try: - args = schema.validate(args) - return args - except SchemaError as e: - exit(e) - - -def run_main(): - """ - Entry point for pip-installed version. - """ - arguments = docopt(__doc__, version=__version__) - start_time = time.time() - logger.debug(f'main.py __name__: {__name__}, version: {__version__}') - logger.debug(str(argv)[1:-1].replace("\'", "").replace(",", "")) - logger.debug(f'\n{arguments}') - arguments = validate(arguments) - if arguments['directevo']: - run_pypef_utils(arguments) - elif arguments['ml']: - run_pypef_pure_ml(arguments) - elif arguments['hybrid'] or arguments['param_inference'] or arguments['save_msa_info']: - run_pypef_hybrid_modeling(arguments) - else: - run_pypef_utils(arguments) - - elapsed = str(timedelta(seconds=time.time()-start_time)).split(".")[0] - elapsed = f'{elapsed.split(":")[0]} h {elapsed.split(":")[1]} min {elapsed.split(":")[2]} s' - logger.info(f'Done! (Run time: {elapsed})') - - -if __name__ == '__main__': - """ - Entry point for direct file run. - """ - run_main() +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + + +# docstring for argument parsing using docopts +""" +PyPEF - Pythonic Protein Engineering Framework + +written by Niklas Siedhoff and Alexander-Maurice Illig. + +Modeling options +---------------- + I. Pure ML modeling + ------------------- + PyPEF provides three encoding options for training machine learning models, i.e. + regression models trained by supervised learning: + + 1. DCA: Direct coupling analysis (DCA) based on evolutionary couplings (input: + coupling parameter file generated by the C framework plmc) or + generating parameters using TensorFlow-based GREMLIN (input: MSA). + + 2. AAidx: Based on AAindex descriptors (566 amino acid descriptor files + taken from the AAindex database). + + 3. OneHot: One-hot encoding representing the occurrence of an + amino acid at a sequence position as a single 1 and 19 0's. + + Any encoding technique enables pure ML-based modeling, see + https://doi.org/10.1021/acs.jcim.1c00099 + and DCA-based sequence encoding enables a hybrid modeling approach, see + https://doi.org/10.1101/2022.06.07.495081 + + If an MSA can be constructed for the target sequence, e.g. using Jackhmmer, + encoding option 1 will likely outperform encoding option 2. + However, encoding option 2 provides a static encoding technique that is + independent from the evolutionary history of a target sequence and + without the need for MSA construction. + Here, the AAidx encodings for modeling are compared, i.e. 
validated, with respect
+    to their performance on the test set (comparable to a hyperparameter search
+    for finding the best static encoding set for model inference).
+    Further, one-hot encoding (encoding option 3) provides a simple but fast and often
+    well-performing encoding option that will likely outperform the AAindex-based
+    technique for model generalization.
+
+    II. Hybrid modeling
+    -------------------
+    Constructing a hybrid model that combines pure statistical DCA-based prediction (a
+    variant's relative 'evolutionary energy' to the wild type) and DCA-encoding-based
+    training of an ML model similar to pure ML modeling option I.1.
+    The hybrid model is based on features generated from direct coupling analysis
+    (a .params file output using the plmc framework, or parameters inferred from a
+    provided MSA by running GREMLIN).
+    Individual model contributions are optimized based only on Spearman's correlation
+    coefficient and thus, only variant fitness ranks are to be considered for evaluating
+    model performance, not the exact predicted fitness value. For regression, up to now
+    only L2-regularized linear regression (Ridge regression) is provided as modeling option.
+
+
+Running example of training, testing, and using a pure ML model for prediction
+------------------------------------------------------------------------------
+Exemplary running of PyPEF for training a pure ML model, shown for encoding
+option 3 (one-hot) for training/testing and encoding option 2 (AAindex-based
+features, i.e., 566 amino acid descriptor indices taken from the AAindex
+database) for prediction.
+    1. Create files for training and testing from variant-fitness CSV data:
+        pypef mklsts -i variant_and_fitness.csv -w wt_sequence.fasta
+    2. Train and validate models:
+        pypef ml -e onehot -l LS.fasta -t TS.fasta --regressor pls
+    3. Plot the test set entries against test set predictions (creates PNG figure, MODEL is
+        the chosen AAindex, the ML-DCA model, or here the ONEHOT model):
+        pypef ml -e onehot -m ONEHOT -t TS.fasta
+    4. Create files for prediction:
+        - Single file:
+        pypef mkps -w wt_sequence.fasta -i variant_fitness.csv
+        - Recombinant/diverse prediction files:
+        pypef mkps -w wt_sequence.fasta -i variant_fitness.csv
+        [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
+        [--ddiverse] [--tdiverse] [--qdiverse]
+    5. Predict (unknown/new) variants:
+        - Single file:
+        pypef ml -e aaidx -m MODEL -p Prediction_Set.fasta
+        - Recombinant/diverse prediction files in created prediction set folders:
+        pypef ml -e aaidx -m MODEL --pmult [--drecomb] [...] [--qdiverse]
+        - Directed evolution – for performing and plotting in silico evolution trajectories:
+        pypef ml -e aaidx directevo -m MODEL [...]
+Note: The commands for hybrid modeling are very similar to the commands for pure ML modeling,
+see pypef -h for possible commands.
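+
+Running example of hybrid modeling (a sketch analogous to the pure ML example
+above; PARAM_FILE is a plmc .params file or GREMLIN parameters inferred via
+'pypef param_inference', see below):
+    1. Create files for training and testing:
+        pypef mklsts -i variant_and_fitness.csv -w wt_sequence.fasta
+    2. Train and test the hybrid model:
+        pypef hybrid -l LS.fasta -t TS.fasta --params PARAM_FILE
+    3. Predict variants of a prediction set:
+        pypef hybrid -m MODEL -p Prediction_Set.fasta --params PARAM_FILE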
+
+For generating DCA parameters using GREMLIN, you must provide an MSA in FASTA or A2M format:
+pypef param_inference --msa MSA_FILE --wt WT_FASTA [--opt_iter 100]
+
+
+Helpful commands for data conversion
+------------------------------------
+Creation of learning and test sets – splitting CSV variant-fitness data:
+    pypef mklsts --wt WT_FASTA --input CSV_FILE
+        [--drop THRESHOLD] [--numrnd NUMBER]
+
+Creation of prediction sets from CSV data (using single-substituted variant data):
+    pypef mkps --wt WT_FASTA --input CSV_FILE
+        [--drop THRESHOLD] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
+        [--ddiverse] [--tdiverse] [--qdiverse]
+
+Encoding a CSV file (for further performance studies such as "low N" or
+"mutational extrapolation" engineering tasks):
+    pypef encode --input CSV_FILE --encoding ENCODING_TECHNIQUE --wt WT_FASTA
+        [--params PARAM_FILE] [--y_wt WT_FITNESS] [--model MODEL] [--nofft]
+        [--threads THREADS] [--sep CSV_COLUMN_SEPARATOR] [--fitness_key FITNESS_KEY]
+
+Converting a STO alignment file to A2M format:
+    pypef sto2a2m --sto STO_MSA_FILE
+        [--inter_gap INTER_GAP] [--intra_gap INTRA_GAP]
+
+
+Usage:
+    pypef mklsts --wt WT_FASTA --input CSV_FILE
+        [--drop THRESHOLD] [--sep CSV_COLUMN_SEPARATOR] [--mutation_sep MUTATION_SEPARATOR] [--numrnd NUMBER]
+    pypef mkps --wt WT_FASTA [--input CSV_FILE]
+        [--drop THRESHOLD] [--ssm] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
+        [--ddiverse] [--tdiverse] [--qdiverse]
+    pypef param_inference
+        [--msa MSA_FILE] [--params PARAM_FILE]
+        [--wt WT_FASTA] [--opt_iter N_ITER]
+    pypef save_msa_info --msa MSA_FILE --wt WT_FASTA
+        [--opt_iter N_ITER]
+    pypef encode --input CSV_FILE --encoding ENCODING_TECHNIQUE --wt WT_FASTA
+        [--params PARAM_FILE] [--y_wt WT_FITNESS] [--model MODEL] [--nofft]
+        [--threads THREADS]
+        [--sep CSV_COLUMN_SEPARATOR] [--fitness_key FITNESS_KEY]
+    pypef reformat_csv --input CSV_FILE
+        [--sep CSV_COLUMN_SEPARATOR] [--mutation_sep MUTATION_SEPARATOR] [--fitness_key FITNESS_KEY]
+    pypef shift_pos --input CSV_FILE --offset OFFSET
+        [--sep CSV_COLUMN_SEPARATOR] [--mutation_sep MUTATION_SEPARATOR] [--fitness_key FITNESS_KEY]
+    pypef sto2a2m --sto STO_MSA_FILE [--inter_gap INTER_GAP] [--intra_gap INTRA_GAP]
+    pypef hybrid
+        [--ts TEST_SET] [--ps PREDICTION_SET]
+        [--model MODEL] [--params PARAM_FILE]
+        [--ls LEARNING_SET] [--label] [--threads THREADS]
+    pypef hybrid --model MODEL --params PARAM_FILE
+        [--ts TEST_SET] [--label]
+        [--ps PREDICTION_SET] [--pmult] [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
+        [--ddiverse] [--tdiverse] [--qdiverse] [--negative]
+        [--threads THREADS]
+    pypef hybrid directevo --wt WT_FASTA --params PARAM_FILE
+        [--model MODEL]
+        [--input CSV_FILE] [--y_wt WT_FITNESS] [--numiter NUM_ITER]
+        [--numtraj NUM_TRAJ] [--temp TEMPERATURE]
+        [--negative] [--usecsv] [--csvaa] [--drop THRESHOLD]
+    pypef hybrid train_and_save --input CSV_FILE --params PARAM_FILE --wt WT_FASTA
+        [--fit_size REL_LEARN_FIT_SIZE] [--test_size REL_TEST_SIZE]
+        [--threads THREADS] [--sep CSV_COLUMN_SEPARATOR]
+        [--fitness_key FITNESS_KEY] [--rnd_state RND_STATE]
+    pypef hybrid low_n --input ENCODED_CSV_FILE
+    pypef hybrid extrapolation --input ENCODED_CSV_FILE
+        [--conc]
+    pypef ml --encoding ENCODING_TECHNIQUE --ls LEARNING_SET --ts TEST_SET
+        [--save NUMBER] [--regressor TYPE] [--nofft] [--all] [--params PARAM_FILE]
+        [--sort METRIC_INT] [--threads THREADS] [--label]
+    pypef ml --encoding ENCODING_TECHNIQUE --model MODEL --ts TEST_SET
+        [--nofft] [--params PARAM_FILE] [--threads THREADS] [--label]
+    pypef ml --show
+        [MODELS]
+    pypef ml --encoding ENCODING_TECHNIQUE --model MODEL --ps PREDICTION_SET
+        [--params PARAM_FILE] [--threads THREADS] [--nofft] [--negative]
+    pypef ml --encoding ENCODING_TECHNIQUE --model MODEL --pmult
+        [--drecomb] [--trecomb] [--qarecomb] [--qirecomb]
+        [--ddiverse] [--tdiverse] [--qdiverse]
+        [--regressor TYPE] [--nofft] [--negative] [--params PARAM_FILE] [--threads THREADS]
+    pypef ml --encoding ENCODING_TECHNIQUE directevo --model MODEL --wt WT_FASTA
+        [--input CSV_FILE] [--y_wt WT_FITNESS] [--numiter NUM_ITER] [--numtraj NUM_TRAJ] [--temp TEMPERATURE]
+        [--nofft] [--negative] [--usecsv] [--csvaa] [--drop THRESHOLD] [--params PARAM_FILE]
+    pypef ml low_n --input ENCODED_CSV_FILE
+        [--regressor TYPE]
+    pypef ml extrapolation --input ENCODED_CSV_FILE
+        [--regressor TYPE] [--conc]
+
+
+Options:
+  --all                            Finally train on all data [default: False].
+  --conc                           Concatenate mutational-level variants for predicting variants
+                                   of the next higher level [default: False].
+  --csvaa                          Directed evolution CSV amino acid substitutions;
+                                   requires flag "--usecsv" [default: False].
+  --ddiverse                       Create/predict double natural diverse variants [default: False].
+  --drecomb                        Create/predict double recombinants [default: False].
+  -d --drop THRESHOLD              Variants below the threshold will be discarded from the
+                                   data [default: -9E09].
+  -e --encoding ENCODING_TECHNIQUE Sets the technique used for encoding sequences for constructing regression models;
+                                   choose between 'aaidx' (AAindex-based encoding), 'onehot' (OneHot-based encoding),
+                                   and DCA encoding using GREMLIN/plmc (DCA-based encoding) [default: onehot].
+  --fitness_key FITNESS_KEY        Label of the CSV fitness column; otherwise, the second column is used.
+  -h --help                        Show this screen [default: False].
+  -i --input CSV_FILE              Input data file in .csv format.
+  --inter_gap INTER_GAP            Fraction for deleting all positions with more than
+                                   'inter_gap' * 100 % gaps (columnar trimming) [default: 0.3].
+  --intra_gap INTRA_GAP            Fraction for deleting all sequences with more than
+                                   'intra_gap' * 100 % gaps after columnar trimming
+                                   (line trimming) [default: 0.5].
+  --label                          Label the plot instances [default: False].
+  -l --ls LEARNING_SET             Input learning set in .fasta format.
+  -m --model MODEL                 Model (pickle file) for plotting of validation or for
+                                   performing predictions.
+  --msa MSA_FILE                   Multiple sequence alignment (MSA) in FASTA or A2M format for
+                                   inferring DCA parameters.
+  --mutation_sep MUTATION_SEP      Mutation separator [default: /].
+  --mutation_extrapolation         Mutation extrapolation [default: False].
+  --negative                       Set if more negative values define better variants [default: False].
+  --nofft                          Raw sequence input, i.e., no FFT for establishing protein spectra
+                                   as vector inputs; only implemented as an option for AAindex-based
+                                   sequence encoding [default: False].
+  -n --numrnd NUMBER               Number of randomly created learning and validation
+                                   datasets [default: 0].
+  --numiter NUM_ITER               Number of mutation iterations per evolution trajectory [default: 5].
+  --numtraj NUM_TRAJ               Number of trajectories, i.e., evolution pathways [default: 5].
+  -o --offset OFFSET               Offset for shifting substitution positions of the input CSV file [default: 0].
+  --opt_iter N_ITER                Number of iterations for GREMLIN-based optimization of local fields
+                                   and couplings [default: 100].
+  --params PARAM_FILE              Input (plmc or GREMLIN) DCA parameter file.
+  -u --pmult                       Predict for all prediction files in a folder for recombinants
+                                   or for diverse variants [default: False].
+  -p --ps PREDICTION_SET           Prediction set for performing predictions using a trained model.
+  --qdiverse                       Create quadruple natural diverse variants [default: False].
+  --qarecomb                       Create/predict quadruple recombinants [default: False].
+  --qirecomb                       Create/predict quintuple recombinants [default: False].
+  --regressor TYPE                 Type of regression (R.) to use, options: PLS CV R.: pls,
+                                   PLS LOOCV R.: pls_loocv, Random Forest CV R.: rf, SVM CV R.: svr,
+                                   MLP CV R.: mlp, Ridge CV R.: ridge (or l2),
+                                   LassoLars CV R.: lassolars (or l1) [default: pls].
+  --rnd_splits RND_SPLITS          Number of random splits for Low N testing [default: 5].
+  --rnd_state RND_STATE            Sets the random state for reproducibility; only implemented
+                                   for hybrid train_and_save [default: 42].
+  -s --save NUMBER                 Number of models to be saved as pickle files [default: 5].
+  --sep CSV_COLUMN_SEPARATOR       CSV column separator [default: ;].
+  --show                           Show achieved model performances from Model_Results.txt.
+  --sort METRIC_INT                Rank models based on metric {1: R^2, 2: RMSE, 3: NRMSE,
+                                   4: Pearson's r, 5: Spearman's rho} [default: 1].
+  --ssm                            Create a single-saturation mutagenesis prediction set (does not
+                                   require CSV input) [default: False].
+  --sto STO_MSA_FILE               The input MSA file in STO (Stockholm) format.
+  --tdiverse                       Create/predict triple natural diverse variants [default: False].
+  --temp TEMPERATURE               "Temperature" of the Metropolis-Hastings criterion [default: 0.01].
+  --threads THREADS                Number of threads used for parallel computing of model
+                                   training and validation; by default, no hyperthreading.
+  --fit_size REL_LEARN_FIT_SIZE    Relative size of the train set for initial fitting. The remaining data
+                                   for training is used for hyperparameter optimization on train subsets
+                                   used for validation, while in sum the total data for training is
+                                   training data = train_fit data + train_test (validation) data
+                                   = all data - test data.
+                                   The default of 0.66 means that 34 % of the train data is taken for
+                                   train_test validation [default: 0.66].
+  --test_size REL_TEST_SIZE        Relative size of the test set; if set to 0.0, the trained model
+                                   will not be tested [default: 0.2].
+  --trecomb                        Create/predict triple recombinants [default: False].
+  --usecsv                         Perform directed evolution on single-variant CSV position
+                                   data [default: False].
+  -t --ts TEST_SET                 Input test set in .fasta format.
+  --version                        Show version [default: False].
+  -w --wt WT_FASTA                 Input wild-type sequence file (in FASTA format).
+  --wt_pos WT_POSITION             Row position of the encoded wild type in the encoding CSV file (0-indexed) [default: 0].
+  -y --y_wt WT_FITNESS             Fitness value (y) of the wild type [default: 1.0].
+  encode                           Encoding [default: False].
+  hybrid                           Hybrid modeling based on DCA-derived sequence encoding [default: False].
+  ml                               Pure machine learning modeling based on encoded sequences [default: False].
+  MODELS                           Number of saved models to show [default: 5].
+  onehot                           OneHot-based encoding [default: False].
+  param_inference                  Inferring DCA parameters using the GREMLIN approach [default: False].
+  reformat_csv                     Reformat an input CSV with indicated column and mutation separators to default
+                                   CSV style (column separator ';' and mutation separator '/') [default: False].
+  save_msa_info                    Optimize local fields and couplings of the MSA based on the GREMLIN DCA
+                                   approach and save the resulting coupling matrix and highly coevolved
+                                   amino acids.
+  shift_pos                        Shift positions of all variant substitutions of the input CSV
+                                   file (identical to reformat_csv when setting --offset to 0) [default: False].
+  sto2a2m                          Transform a multiple sequence alignment from STO format to
+                                   A2M format [default: False].
+"""
+
+
+from os import environ
+try:
+    environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 3 = TensorFlow INFO, WARNING, and ERROR messages are not printed
+except KeyError:
+    pass
+
+from sys import argv, version_info
+from pypef import __version__
+if version_info[:2] < (3, 9):
+    raise SystemError(f"The current version of PyPEF (v {__version__}) "
+                      f"requires Python 3.9 or higher.")
+
+import time
+from datetime import timedelta
+import logging
+
+from docopt import docopt
+from schema import Schema, SchemaError, Optional, Or, Use
+
+from pypef.ml.ml_run import run_pypef_pure_ml
+from pypef.dca.dca_run import run_pypef_hybrid_modeling
+from pypef.utils.utils_run import run_pypef_utils
+
+logger = logging.getLogger("pypef")
+logger.setLevel(logging.INFO)
+
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+
+formatter = logging.Formatter(
+    "%(asctime)s.%(msecs)03d %(levelname)s %(filename)s:%(lineno)d -- %(message)s",
+    "%Y-%m-%d %H:%M:%S"
+)
+ch.setFormatter(formatter)
+logger.addHandler(ch)
+
+schema = Schema({
+    Optional('--all'): bool,
+    Optional('--conc'): bool,
+    Optional('--csvaa'): bool,
+    Optional('--ddiverse'): bool,
+    Optional('--drecomb'): bool,
+    Optional('--drop'): Use(float),
+    Optional('--encoding'): Use(str),
+    Optional('--fitness_key'): Or(None, str),
+    Optional('--fit_size'): Use(float),
+    Optional('--help'): bool,
+    Optional('--input'): Or(None, str),
+    Optional('--inter_gap'): Use(float),
+    Optional('--intra_gap'): Use(float),
+    Optional('--label'): bool,
+    Optional('--ls'): Or(None, str),
+    Optional('--model'): Or(None, str),
+    Optional('--msa'): Or(None, str),
+    Optional('--mutation_sep'): Or(None, str),
+    Optional('--negative'): bool,
+    Optional('--nofft'): bool,
+    Optional('--numrnd'): Use(int),
+    Optional('--numiter'): Use(int),
+    Optional('--numtraj'): Use(int),
+    Optional('--offset'): Use(int),
+    Optional('--opt_iter'): Use(int),
+    Optional('--params'): Or(None, str),
+    Optional('--pmult'): bool,
+    Optional('--ps'): Or(None, str),
+    Optional('--qdiverse'): bool,
+    Optional('--qarecomb'): bool,
+    Optional('--qirecomb'): bool,
+    Optional('--regressor'): Or(None, str),
+    Optional('--rnd_splits'): Use(int),
+    Optional('--rnd_state'): Use(int),
+    Optional('--save'): Use(int),
+    Optional('--sep'): Or(None, str),
+    Optional('--show'): Use(int),
+    Optional('--sort'): Use(int),
+    Optional('--ssm'): bool,
+    Optional('--sto'): Or(None, str),
+    Optional('--tdiverse'): bool,
+    Optional('--temp'): Use(float),
+    Optional('--test_size'): Use(float),
+    Optional('--threads'): Or(None, Use(int)),
+    Optional('--train_size'): Use(float),
+    Optional('--trecomb'): bool,
+    Optional('--usecsv'): bool,
+    Optional('--ts'): Or(None, str),
+    Optional('--wt'): Or(None, str),
+    Optional('--wt_pos'): Use(int),
+    Optional('--y_wt'): Or(None, Use(float)),
+    Optional('aaidx'): bool,
+    Optional('param_inference'): bool,
+    Optional('hybrid'): bool,
+    Optional('directevo'): bool,
+    Optional('encode'): bool,
+    Optional('extrapolation'): bool,
+    Optional('low_n'): bool,
+    Optional('mklsts'): bool,
+    Optional('mkps'): bool,
+    Optional('ml'): bool,
+    Optional('MODELS'): Or(None, Use(int)),
+    Optional('onehot'): bool,
+    Optional('reformat_csv'): bool,
+    Optional('save_msa_info'): bool,
+    Optional('shift_pos'): bool,
+    Optional('sto2a2m'): bool,
+    Optional('train_and_save'): bool,
+})
+
+
+def validate(args):
+    """
+    Validate (docopt) arguments.
+
+    Parameters
+    ----------
+    args: dict
+        Key-value pairs of arguments,
+        e.g.,
+        {'mklsts': True,
+         '--wt': 'WT_Seq.fasta',
+         '--input': 'Variant-Fitness.csv'}
+
+    Returns
+    -------
+    args: dict
+        The validated arguments (exits with the schema error message
+        if validation fails).
+    """
+    try:
+        args = schema.validate(args)
+        return args
+    except SchemaError as e:
+        exit(e)
+
+
+def run_main():
+    """
+    Entry point for pip-installed version.
+    Arguments are parsed from the module docstring using docopt,
+    which creates an argument dict.
+    """
+    arguments = docopt(__doc__, version=__version__)
+    start_time = time.time()
+    logger.debug(f'main.py __name__: {__name__}, version: {__version__}')
+    logger.debug(str(argv)[1:-1].replace("\'", "").replace(",", ""))
+    logger.debug(f'\n{arguments}')
+    arguments = validate(arguments)
+    if arguments['directevo']:
+        run_pypef_utils(arguments)
+    elif arguments['ml']:
+        run_pypef_pure_ml(arguments)
+    elif arguments['hybrid'] or arguments['param_inference'] or arguments['save_msa_info']:
+        run_pypef_hybrid_modeling(arguments)
+    else:
+        run_pypef_utils(arguments)
+
+    elapsed = str(timedelta(seconds=time.time() - start_time)).split(".")[0]
+    elapsed = f'{elapsed.split(":")[0]} h {elapsed.split(":")[1]} min {elapsed.split(":")[2]} s'
+    logger.info(f'Done! (Run time: {elapsed})')
+
+
+if __name__ == '__main__':
+    """
+    Entry point for direct file run.
+    """
+    run_main()
diff --git a/pypef/ml/ml_run.py b/pypef/ml/ml_run.py
index f8a5d1f..1b1629c 100644
--- a/pypef/ml/ml_run.py
+++ b/pypef/ml/ml_run.py
@@ -1,220 +1,220 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# Created on 05 October 2020
-# @authors: Niklas Siedhoff, Alexander-Maurice Illig
-# @contact:
-# PyPEF - Pythonic Protein Engineering Framework
-# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0)
-# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode
-
-# PyPEF – An Integrated Framework for Data-Driven Protein Engineering
-# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476
-# https://doi.org/10.1021/acs.jcim.1c00099
-# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,*
-# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany
-# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany
-# *Corresponding author
-# §Equal contribution
-
-import os
-from os import listdir
-from os.path import isfile, join
-import ray
-import logging
-logger = logging.getLogger('pypef.ml.ml_run')
-
-from pypef.ml.parallelization import aaindex_performance_parallel
-from pypef.ml.regression import (
-    read_models, formatted_output, performance_list, save_model, predict, predict_ts
-)
-from pypef.utils.to_file import predictions_out
-from pypef.utils.low_n_mutation_extrapolation import low_n, performance_mutation_extrapolation
-
-
-def run_pypef_pure_ml(arguments):
-    """
-    Running the program, importing all required self-made modules and
-    running them dependent on user-passed input arguments using docopt
-    for argument parsing.
- """ - threads = abs(arguments['--threads']) if arguments['--threads'] is not None else 1 - threads = threads + 1 if threads == 0 else threads - if threads > 1: - ray.init() - if arguments['--show']: - if arguments['MODELS'] != str(5): - try: - print(read_models(int(arguments['MODELS']))) - except ValueError: - print(read_models(5)) - except TypeError: - print(read_models(5)) - else: - print(read_models(5)) - - else: - if arguments['--ls'] is not None and arguments['--ts'] is not None: # LS --> TS - if arguments['--model'] is None: - path = os.getcwd() - try: - t_save = int(arguments['--save']) - except ValueError: - t_save = 5 - # Parallelization of AAindex iteration if threads is not None (but int) - if threads > 1 and arguments['--encoding'] == 'aaidx': - logger.info(f'Using {threads} threads for parallel computing. Running...') - encoding_performance_list = aaindex_performance_parallel( - train_set=arguments['--ls'], - test_set=arguments['--ts'], - cores=threads, - regressor=arguments['--regressor'], - no_fft=arguments['--nofft'], - sort=arguments['--sort'] - ) - - else: # run using a single core or use onehot or DCA-based encoding for modeling - encoding_performance_list = performance_list( - train_set=arguments['--ls'], - test_set=arguments['--ts'], - encoding=arguments['--encoding'], - regressor=arguments['--regressor'], - no_fft=arguments['--nofft'], - sort=arguments['--sort'], - couplings_file=arguments['--params'], # only for DCA - threads=threads # only for DCA - ) - - formatted_output( - performance_list=encoding_performance_list, - no_fft=arguments['--nofft'], - minimum_r2=-1E9 - ) - - # save_model encodes variants again (possibly change) - save_model( - path=path, - performances=encoding_performance_list, - training_set=arguments['--ls'], - test_set=arguments['--ts'], - threshold=t_save, - encoding=arguments['--encoding'], - regressor=arguments['--regressor'], - no_fft=arguments['--nofft'], - train_on_all=arguments['--all'], - couplings_file=arguments['--params'], # only for DCA - threads=threads, # only for DCA - label=arguments['--label'] - ) - - # Prediction of single .fasta file - elif arguments['--ts'] is not None and arguments['--model'] is not None: - predict_ts( - path=os.getcwd(), - model=arguments['--model'], - test_set=arguments['--ts'], - encoding=arguments['--encoding'], - no_fft=arguments['--nofft'], - couplings_file=arguments['--params'], # only for DCA - label=arguments['--label'], - threads=threads - ) - - elif arguments['--ps'] is not None and arguments['--model'] is not None: - predictions = predict( - path=os.getcwd(), - prediction_set=arguments['--ps'], - model=arguments['--model'], - encoding=arguments['--encoding'], - mult_path=None, - no_fft=arguments['--nofft'], - couplings_file=arguments['--params'], # only for DCA - threads=threads # only for DCA - ) - if predictions == 'skip' and not arguments['--params']: - raise SystemError("No couplings file provided. 
DCA-based sequence encoding " - "requires a (plmc or GREMLIN) parameter file.") - if arguments['--negative']: - predictions = sorted(predictions, key=lambda x: x[0], reverse=False) - predictions_out( - predictions=predictions, - model=arguments['--model'], - prediction_set=arguments['--ps'] - ) - - # Prediction on recombinant/diverse variant folder data - elif arguments['--pmult'] and arguments['--model'] is not None: - path = os.getcwd() - recombs_total = [] - recomb_d, recomb_t, recomb_qa, recomb_qi = \ - '/Recomb_Double_Split/', '/Recomb_Triple_Split/', \ - '/Recomb_Quadruple_Split/', '/Recomb_Quintuple_Split/' - diverse_d, diverse_t, diverse_q = \ - '/Diverse_Double_Split/', '/Diverse_Triple_Split/', '/Diverse_Quadruple_Split/' - if arguments['--drecomb']: - recombs_total.append(recomb_d) - if arguments['--trecomb']: - recombs_total.append(recomb_t) - if arguments['--qarecomb']: - recombs_total.append(recomb_qa) - if arguments['--qirecomb']: - recombs_total.append(recomb_qi) - if arguments['--ddiverse']: - recombs_total.append(diverse_d) - if arguments['--tdiverse']: - recombs_total.append(diverse_t) - if arguments['--qdiverse']: - recombs_total.append(diverse_q) - if arguments['--drecomb'] is False \ - and arguments['--trecomb'] is False \ - and arguments['--qarecomb'] is False \ - and arguments['--qirecomb'] is False \ - and arguments['--ddiverse'] is False \ - and arguments['--tdiverse'] is False \ - and arguments['--qdiverse'] is False: - raise SystemError('Define prediction target for --pmult, e.g. --pmult --drecomb.') - - for args in recombs_total: - predictions_total = [] - logger.info(f'Running predictions for variant-sequence files in directory {args[1:-1]}...') - path_recomb = path + args - files = [path_recomb + f for f in listdir(path_recomb) - if isfile(join(path_recomb, f)) if f.endswith('.fasta')] - for i, file in enumerate(files): - logger.info(f'Encoding files ({i+1}/{len(files)}) for prediction...') - predictions = predict( - path=path, - prediction_set=file, - model=arguments['--model'], - encoding=arguments['--encoding'], - mult_path=path_recomb, - no_fft=arguments['--nofft'], - couplings_file=arguments['--params'], # only for DCA - threads=threads # only for DCA - ) - for pred in predictions: - predictions_total.append(pred) # if array gets too large? 
- predictions_total = list(dict.fromkeys(predictions_total)) # removing duplicates from list - if arguments['--negative']: - predictions_total = sorted(predictions_total, key=lambda x: x[0], reverse=False) - else: - predictions_total = sorted(predictions_total, key=lambda x: x[0], reverse=True) - - predictions_out( - predictions=predictions_total, - model=arguments['--model'], - prediction_set=f'Top{args[1:-1]}', - path=path_recomb - ) - - elif arguments['extrapolation']: - performance_mutation_extrapolation( - encoded_csv=arguments['--input'], - cv_regressor=arguments['--regressor'], - conc=arguments['--conc'] - ) - - elif arguments['low_n']: - low_n( - encoded_csv=arguments['--input'], - cv_regressor=arguments['--regressor'] - ) +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +import os +from os import listdir +from os.path import isfile, join +import ray +import logging +logger = logging.getLogger('pypef.ml.ml_run') + +from pypef.ml.parallelization import aaindex_performance_parallel +from pypef.ml.regression import ( + read_models, formatted_output, performance_list, save_model, predict, predict_ts +) +from pypef.utils.to_file import predictions_out +from pypef.utils.low_n_mutation_extrapolation import low_n, performance_mutation_extrapolation + + +def run_pypef_pure_ml(arguments): + """ + Running the program, importing all required self-made modules and + running them dependent on user-passed input arguments using docopt + for argument parsing. + """ + threads = abs(arguments['--threads']) if arguments['--threads'] is not None else 1 + threads = threads + 1 if threads == 0 else threads + if threads > 1: + ray.init() + if arguments['--show']: + if arguments['MODELS'] != str(5): + try: + print(read_models(int(arguments['MODELS']))) + except ValueError: + print(read_models(5)) + except TypeError: + print(read_models(5)) + else: + print(read_models(5)) + + else: + if arguments['--ls'] is not None and arguments['--ts'] is not None: # LS --> TS + if arguments['--model'] is None: + path = os.getcwd() + try: + t_save = int(arguments['--save']) + except ValueError: + t_save = 5 + # Parallelization of AAindex iteration if threads is not None (but int) + if threads > 1 and arguments['--encoding'] == 'aaidx': + logger.info(f'Using {threads} threads for parallel computing. 
Running...') + encoding_performance_list = aaindex_performance_parallel( + train_set=arguments['--ls'], + test_set=arguments['--ts'], + cores=threads, + regressor=arguments['--regressor'], + no_fft=arguments['--nofft'], + sort=arguments['--sort'] + ) + + else: # run using a single core or use onehot or DCA-based encoding for modeling + encoding_performance_list = performance_list( + train_set=arguments['--ls'], + test_set=arguments['--ts'], + encoding=arguments['--encoding'], + regressor=arguments['--regressor'], + no_fft=arguments['--nofft'], + sort=arguments['--sort'], + couplings_file=arguments['--params'], # only for DCA + threads=threads # only for DCA + ) + + formatted_output( + performance_list=encoding_performance_list, + no_fft=arguments['--nofft'], + minimum_r2=-1E9 + ) + + # save_model encodes variants again (possibly change) + save_model( + path=path, + performances=encoding_performance_list, + training_set=arguments['--ls'], + test_set=arguments['--ts'], + threshold=t_save, + encoding=arguments['--encoding'], + regressor=arguments['--regressor'], + no_fft=arguments['--nofft'], + train_on_all=arguments['--all'], + couplings_file=arguments['--params'], # only for DCA + threads=threads, # only for DCA + label=arguments['--label'] + ) + + # Prediction of single .fasta file + elif arguments['--ts'] is not None and arguments['--model'] is not None: + predict_ts( + path=os.getcwd(), + model=arguments['--model'], + test_set=arguments['--ts'], + encoding=arguments['--encoding'], + no_fft=arguments['--nofft'], + couplings_file=arguments['--params'], # only for DCA + label=arguments['--label'], + threads=threads + ) + + elif arguments['--ps'] is not None and arguments['--model'] is not None: + predictions = predict( + path=os.getcwd(), + prediction_set=arguments['--ps'], + model=arguments['--model'], + encoding=arguments['--encoding'], + mult_path=None, + no_fft=arguments['--nofft'], + couplings_file=arguments['--params'], # only for DCA + threads=threads # only for DCA + ) + if predictions == 'skip' and not arguments['--params']: + raise SystemError("No couplings file provided. 
DCA-based sequence encoding " + "requires a (plmc or GREMLIN) parameter file.") + if arguments['--negative']: + predictions = sorted(predictions, key=lambda x: x[0], reverse=False) + predictions_out( + predictions=predictions, + model=arguments['--model'], + prediction_set=arguments['--ps'] + ) + + # Prediction on recombinant/diverse variant folder data + elif arguments['--pmult'] and arguments['--model'] is not None: + path = os.getcwd() + recombs_total = [] + recomb_d, recomb_t, recomb_qa, recomb_qi = \ + '/Recomb_Double_Split/', '/Recomb_Triple_Split/', \ + '/Recomb_Quadruple_Split/', '/Recomb_Quintuple_Split/' + diverse_d, diverse_t, diverse_q = \ + '/Diverse_Double_Split/', '/Diverse_Triple_Split/', '/Diverse_Quadruple_Split/' + if arguments['--drecomb']: + recombs_total.append(recomb_d) + if arguments['--trecomb']: + recombs_total.append(recomb_t) + if arguments['--qarecomb']: + recombs_total.append(recomb_qa) + if arguments['--qirecomb']: + recombs_total.append(recomb_qi) + if arguments['--ddiverse']: + recombs_total.append(diverse_d) + if arguments['--tdiverse']: + recombs_total.append(diverse_t) + if arguments['--qdiverse']: + recombs_total.append(diverse_q) + if arguments['--drecomb'] is False \ + and arguments['--trecomb'] is False \ + and arguments['--qarecomb'] is False \ + and arguments['--qirecomb'] is False \ + and arguments['--ddiverse'] is False \ + and arguments['--tdiverse'] is False \ + and arguments['--qdiverse'] is False: + raise SystemError('Define prediction target for --pmult, e.g. --pmult --drecomb.') + + for args in recombs_total: + predictions_total = [] + logger.info(f'Running predictions for variant-sequence files in directory {args[1:-1]}...') + path_recomb = path + args + files = [path_recomb + f for f in listdir(path_recomb) + if isfile(join(path_recomb, f)) if f.endswith('.fasta')] + for i, file in enumerate(files): + logger.info(f'Encoding files ({i+1}/{len(files)}) for prediction...') + predictions = predict( + path=path, + prediction_set=file, + model=arguments['--model'], + encoding=arguments['--encoding'], + mult_path=path_recomb, + no_fft=arguments['--nofft'], + couplings_file=arguments['--params'], # only for DCA + threads=threads # only for DCA + ) + for pred in predictions: + predictions_total.append(pred) # if array gets too large? 
+ predictions_total = list(dict.fromkeys(predictions_total)) # removing duplicates from list + if arguments['--negative']: + predictions_total = sorted(predictions_total, key=lambda x: x[0], reverse=False) + else: + predictions_total = sorted(predictions_total, key=lambda x: x[0], reverse=True) + + predictions_out( + predictions=predictions_total, + model=arguments['--model'], + prediction_set=f'Top{args[1:-1]}', + path=path_recomb + ) + + elif arguments['extrapolation']: + performance_mutation_extrapolation( + encoded_csv=arguments['--input'], + cv_regressor=arguments['--regressor'], + conc=arguments['--conc'] + ) + + elif arguments['low_n']: + low_n( + encoded_csv=arguments['--input'], + cv_regressor=arguments['--regressor'] + ) diff --git a/pypef/ml/regression.py b/pypef/ml/regression.py index c758bc7..8dbcf0d 100644 --- a/pypef/ml/regression.py +++ b/pypef/ml/regression.py @@ -1,1096 +1,1100 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -""" -Main modules for regression/ML including feature generation -(i.e. sequence encoding), cross-validation-based hyperparameter -tuning, prediction, and plotting routines. 
-""" - - -import os -from typing import Union - -import logging -logger = logging.getLogger('pypef.ml.regression') -import matplotlib -matplotlib.use('Agg') # no plt.show(), just save plot -import matplotlib.pyplot as plt -import numpy as np -import pickle -from tqdm import tqdm # progress bars -from sklearn.model_selection import LeaveOneOut -from sklearn.model_selection import KFold -from sklearn.metrics import mean_squared_error -from sklearn.model_selection import GridSearchCV # default: refit=True - -# import regression models -from sklearn.cross_decomposition import PLSRegression -from sklearn.ensemble import RandomForestRegressor -from sklearn.svm import SVR -from sklearn.neural_network import MLPRegressor -from sklearn.linear_model import Ridge, Lasso, ElasticNet - -from pypef.utils.variant_data import ( - amino_acids, get_sequences_from_file, get_basename -) -from pypef.utils.plot import plot_y_true_vs_y_pred -from pypef.utils.performance import get_performances -from pypef.dca.hybrid_model import plmc_or_gremlin_encoding - -import warnings -warnings.filterwarnings(action='ignore', category=RuntimeWarning, module='numpy') -# ignoring warnings of PLS regression when using n_components -warnings.filterwarnings(action='ignore', category=RuntimeWarning, module='sklearn') -warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn') -warnings.filterwarnings(action='ignore', category=DeprecationWarning, module='sklearn') -# FutureWarning: The default of 'normalize' will be set to False in version 1.2 and deprecated in version 1.4. -# If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. -# sklearn: The default value of 'normalize' should be changed to False in linear models where now normalize=True -warnings.filterwarnings(action='ignore', category=FutureWarning, module='sklearn') - - -# globals -x_train = None -x_test = None -y_train = None -y_test = None -train_variants = None -test_variants = None -train_sequences = None -test_sequences = None -model_type = None - - -def read_models(number): - """ - reads the models found in the file Model_Results.txt. - If no model was trained, the .txt file does not exist. - """ - try: - ls = "" - with open('Model_Results.txt', 'r') as file: - for i, lines in enumerate(file): - if i == 0: - if lines[:6] == 'No FFT': - number += 2 - if i <= number + 1: - ls += lines - return ls - except FileNotFoundError: - return "No Model_Results.txt found." - - -def full_aaidx_txt_path(filename): - """ - returns the path of an index inside the folder /AAindex/, - e.g. /path/to/pypef/ml/AAindex/FAUJ880104.txt. - """ - modules_path = os.path.dirname(os.path.abspath(__file__)) - return os.path.join(modules_path, 'AAindex', filename) - - -def path_aaindex_dir(): - """ - returns the absolute path to the /AAindex folder, - e.g. /path/to/AAindex/. - """ - return os.path.join(os.path.dirname(os.path.abspath(__file__)), 'AAindex') - - -class AAIndex: - """ - gets all the information that are given in each AAindex file. - For the program routine it provides the library to enable translation - of the alphabetical amino acid sequence to an array of numericals. 
- """ - def __init__(self, filename): - self.file = filename - self.accession_number = None - self.data_description = None - self.pmid = None - self.authors = None - self.title_of_article = None - self.journal_reference = None - - def general_information(self): - """ - Gets and allocates general information based on the AAindex file - format defined by file sections 'H', 'D', 'E', 'A', 'T', 'J' - """ - with open(self.file, 'r') as f: - for line in f: - # try/ except "removes" empty lines. - try: - words = line.split() - id_letter = words[0] - except IndexError: - break - - # Extract some general information about AAindex file. - if id_letter == 'H': - self.accession_number = words[1] - elif id_letter == 'D': - self.data_description = words[1] - elif id_letter == 'E': - self.pmid = words[1:] - elif id_letter == 'A': - self.authors = ' '.join(words[1:]) - elif id_letter == 'T': - self.title_of_article = ' '.join(words[1:]) - elif id_letter == 'J': - self.journal_reference = ' '.join(words[1:]) - - def encoding_dictionary(self): - """ - Get numerical values of AAindex for each amino acid - """ - try: - with open(self.file, 'r') as f: - for line in f: - # try/ except "removes" empty lines - try: - words = line.split() - id_letter = words[0] - except IndexError: - break - - # Extract numerical values of AAindex. - if id_letter == 'I': - - keys = [] - for word in words[1:]: - keys.append(word[0]) - keys.append(word[-1]) - - values = [] - for row in range(2): - line = f.readline() - strings = line.split() - for idx, string in enumerate(strings): - # Some amino acids may have no value - try: - strings[idx] = float(string) - except ValueError: - strings[idx] = None - values.append(strings) - values = np.reshape(np.array(values).T, len(keys)) - - return dict(zip(keys, values)) - except FileNotFoundError: - raise FileNotFoundError( - "Probably you used an encoding technique option in combination with a model " - "that was created using another encoding option (e.g. pypef ml -e aaidx -m " - "ONEHOT -p TS.fasta) which is not allowed." - ) - - -class AAIndexEncoding: - """ - converts the string sequence into a list of numericals - using the AAindex translation library; Fourier trans- - forming the numerical array that was translated by - get_numerical_sequence --> do_fourier,computing the input - matrices X and Y for the regressor (get_x_and_y). - Returns FFT-ed encoded sequences (amplitudes), - and raw_encoded sequences (raw_numerical_sequences). - """ - def __init__( - self, - aaindex_file=None, - sequences: list = None, - ): - aaidx = AAIndex(aaindex_file) - self.dictionary = aaidx.encoding_dictionary() - self.sequences = sequences - - def get_numerical_sequence(self, sequence): - return np.array([self.dictionary[aminoacid] for aminoacid in sequence]) - - @staticmethod - def do_fourier(sequence): - """ - This static function does the Fast Fourier Transform. - Since the condition - - len(Array) = 2^k -> k = log_2(len(Array)) , k in N - - must be satisfied, the array must be reshaped (zero padding) - if k is no integer value. The verbose parameter prints also - the real and imaginary part separately. 
- """ - threshold = 1e-8 # errors due to computer uncertainties - k = np.log2(sequence.size) # get exponent k - mean = np.mean(sequence, axis=0) # calculate mean of numerical array - sequence = np.subtract(sequence, mean) # subtract mean to avoid artificial effects of FT - - if abs(int(k) - k) > threshold: # check if length of array fulfills previous equation - numerical_sequence_reshaped = np.zeros(pow(2, (int(k) + 1))) # reshape array - for index, value in enumerate(sequence): - numerical_sequence_reshaped[index] = value - sequence = numerical_sequence_reshaped - - fourier_transformed = np.fft.fft(sequence) # FFT - ft_real = np.real(fourier_transformed) - ft_imag = np.imag(fourier_transformed) - - x = np.linspace(1, sequence.size, sequence.size) # frequencies - x = x / max(x) # normalization of frequency - - amplitude = ft_real * ft_real + ft_imag * ft_imag - - if max(amplitude) != 0: - amplitude = np.true_divide(amplitude, max(amplitude)) # normalization of amplitude - - return amplitude, x - - def aaidx_and_or_fft_encode_sequence(self, sequence): - """ - getting the input matrices X (FFT amplitudes) and Y (variant labels) - """ - num = self.get_numerical_sequence(sequence) - # Numerical sequence gets expended by zeros so that also different - # lengths of sequences can be processed using '--nofft' option - k = np.log2(len(num)) - if abs(int(k) - k) > 1e-8: # check if length of array fulfills previous equation - raw_numerical_seq = np.append(num, np.zeros(pow(2, (int(k) + 1)) - len(num))) # reshape array - else: - raw_numerical_seq = num - - if None not in num: # Not all amino acids could be encoded with the corresponding AAindex - amplitudes_, frequencies_ = self.do_fourier(num) # --> None values in encoded sequence - else: # If None in encoded Sequence, do not further use encoding (and FFT not possible) - return [None], [None] - # Fourier spectra are mirrored at frequency = 0.5 -> No more information at higher frequencies - half = len(frequencies_) // 2 # // for integer division - amplitude = amplitudes_[:half] # FFT-ed encoded amino acid sequences - # Appended zeros of raw encoding allow also prediction of differently sizes sequences - - # return -> X_fft_encoding, X_raw_encoding - return amplitude, raw_numerical_seq - - def collect_encoded_sequences(self): - """ - Loop over all sequences to encode each and collect - and return all encoded sequences - """ - # There may be amino acids without a value in AAindex - # Skip these indices - fft_encoded_sequences, raw_encoded_sequences = [], [] - for sequence in self.sequences: - fft_encoded_sequence, raw_encoded_sequence = self.aaidx_and_or_fft_encode_sequence(sequence) - if None in raw_encoded_sequence: - return 'skip', 'skip' # skipping this AAindex - else: - fft_encoded_sequences.append(fft_encoded_sequence) - raw_encoded_sequences.append(raw_encoded_sequence) - - return fft_encoded_sequences, raw_encoded_sequences - - -class OneHotEncoding: - """ - Generates an one-hot encoding, i.e. represents - the current amino acid at a position as 1 and - the other (19) amino acids as 0. Thus, the encoding - of a sequence has the length 20 x sequence length. - E.g. 
'ACDY' --> [1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1] - """ - def __init__( - self, - sequences: list, - verbose=True - ): - self.sequences = sequences - self.amino_acids = amino_acids # imported, 20 standard AAs - self.verbose = verbose - - def encoding_dict(self) -> dict[str, np.ndarray]: - encoding_dict = {} - for idx, amino_acid in enumerate(self.amino_acids): - encoding_vector = np.zeros(20, dtype=int) - encoding_vector[idx] = 1 - encoding_dict.update({amino_acid: encoding_vector}) - return encoding_dict - - def one_hot_encode_sequence(self, sequence: str) -> np.ndarray: - encoded_sequence = [] - for aminoacid in sequence: - encoded_sequence.append(self.encoding_dict()[aminoacid]) - return np.concatenate(encoded_sequence) - - def collect_encoded_sequences(self, verbose: bool = None) -> np.ndarray: - if verbose is None: - disable = not self.verbose - else: - disable = not verbose - if len(np.atleast_1d(self.sequences)) == 1: # always silence for single (wt) sequence - disable = True - encoded_sequences = [] - for sequence in tqdm(self.sequences, disable=disable): - encoded_sequences.append(self.one_hot_encode_sequence(sequence)) - return np.array(encoded_sequences) - - -def pls_loocv( - x_train: np.ndarray, - y_train: np.ndarray -) -> Union[tuple[str, str], tuple[PLSRegression, dict]]: - """ - PLS regression with LOOCV n_components tuning as described by Cadet et al. - https://doi.org/10.1186/s12859-018-2407-8 - https://doi.org/10.1038/s41598-018-35033-y - Hyperparameter (N component) tuning of PLS regressor, can achieve slightly better - """ - mean_squared_error_list = [] - for n_comp in range(1, 10): # n_comp = 1, 2,..., 9 - try: - pls = PLSRegression(n_components=n_comp) - loo = LeaveOneOut() - y_pred_loo = [] - y_test_loo = [] - for train, test in loo.split(x_train): - x_learn_loo = [] - y_learn_loo = [] - x_test_loo = [] - for j in train: - x_learn_loo.append(x_train[j]) - y_learn_loo.append(y_train[j]) - for k in test: - x_test_loo.append(x_train[k]) - y_test_loo.append(y_train[k]) - x_learn_loo = np.array(x_learn_loo) - x_test_loo = np.array(x_test_loo) - y_learn_loo = np.array(y_learn_loo) - try: - pls.fit(x_learn_loo, y_learn_loo) - except ValueError: # scipy/linalg/decomp_svd.py ValueError: - continue # illegal value in %dth argument of internal gesdd - y_pred_loo.append(pls.predict(x_test_loo)[0][0]) - except np.linalg.LinAlgError: # numpy.linalg.LinAlgError: SVD did not converge - continue - try: - mse = mean_squared_error(y_test_loo, y_pred_loo) - mean_squared_error_list.append(mse) - except ValueError: # MSE could not be calculated (No values due to numpy.linalg.LinAlgErrors) - return 'skip', 'skip' - mean_squared_error_list = np.array(mean_squared_error_list) - idx = np.where(mean_squared_error_list == np.min(mean_squared_error_list))[0][0] + 1 - # Model is fitted with best n_components (lowest MSE) - best_params = {'n_components': idx} - regressor_ = PLSRegression(n_components=best_params.get('n_components')) - - return regressor_, best_params - - -def cv_regression_options(regressor: str) -> GridSearchCV: - """ - Returns the CVRegressor with the tunable regression-specific hyperparameter grid - for training a regression model. 
- Regression options are - - Partial Least Squares Regression - - Random Forest Regression - - Support Vector Machines Regression - - Multilayer Perceptron Regression - - Ridge Regression - - Lasso Regression - - ElasticNet Regression - """ - if regressor == 'pls': - params = {'n_components': list(np.arange(1, 10))} # n_comp = 1, 2,..., 9 - regressor_ = GridSearchCV(PLSRegression(), param_grid=params, cv=5) # iid in future versions redundant - - elif regressor == 'rf': - params = { # similar parameter grid as Xu et al., https://doi.org/10.1021/acs.jcim.0c00073 - 'random_state': [42], # state determined - 'n_estimators': [100, 250, 500, 1000], # number of individual decision trees in the forest - 'max_features': ['auto', 'sqrt', 'log2'] # “auto” -> max_features=n_features, - # “sqrt” -> max_features=sqrt(n_features) “log2” -> max_features=log2(n_features) - } - regressor_ = GridSearchCV(RandomForestRegressor(), param_grid=params, cv=5) - - elif regressor == 'svr': - params = { # similar parameter grid as Xu et al. - 'C': [2 ** 0, 2 ** 2, 2 ** 4, 2 ** 6, 2 ** 8, 2 ** 10, 2 ** 12], # Regularization parameter - 'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001] # often 1 / n_features or 1 / (n_features * X.var()) - } - regressor_ = GridSearchCV(SVR(), param_grid=params, cv=5) - - elif regressor == 'mlp': - params = { - # feedforward network trained via backpropagation – here only using a single hidden layer - 'hidden_layer_sizes': [i for i in range(1, 12)], # size of hidden layer [(1,), (2,), ..., (12,)] - 'activation': ['relu'], # rectified linear unit - 'solver': ['adam', 'lbfgs'], # ADAM: A Method for Stochastic Optimization , or Limited-memory BFGS - 'learning_rate': ['constant'], # learning rate given by ‘learning_rate_init’ - 'learning_rate_init': [0.001, 0.01, 0.1], # only used when solver=’sgd’ or ‘adam’ - 'max_iter': [1000, 200], # for stochastic solvers (‘sgd’, ‘adam’) determines epochs - 'random_state': [42] - } - regressor_ = GridSearchCV(MLPRegressor(), param_grid=params, cv=5) - - elif regressor == 'elasticnet' or regressor == 'l1l2': - # Linear regression with combined L1 and L2 priors as regularizer. - # min(w): ||y - Xw||^2_2 + alpha*l1_ratio*||w||_1 + 0.5*alpha*(1 - l1_ratio)*||w||^2_2 - params = { - 'alpha': np.logspace(-6, 6, num=100) # {1.000E-06, 1.322E-06, 1.748E-06, ..., 1.000E06} - } - regressor_ = GridSearchCV(ElasticNet(), param_grid=params, cv=5) - - elif regressor == 'ridge' or regressor == 'l2': - # Performs L2 regularization, i.e., adds penalty equivalent to square of the magnitude of coefficients - # Majorly used to prevent overfitting, since it includes all the features - # min(w): ||y - Xw||^2_2 + alpha*||w||^2_2 - # in case of exorbitantly high features, it will pose computational challenges. - params = { - # alpha = 0 is equivalent to an ordinary least square regression - # higher values of alpha reduce overfitting, significantly high values can - # cause underfitting as well (e.g., regularization strength alpha = 5) - 'alpha': np.logspace(-6, 6, num=100) # {1.000E-06, 1.322E-06, 1.748E-06, ..., 1.000E06} - } - regressor_ = GridSearchCV(Ridge(), param_grid=params, cv=5) - - elif regressor == 'lasso' or regressor == 'l1': - # Lasso model fit with Least Angle Regression a.k.a. Lars. 
- # Performs L1 regularization, i.e., adds penalty equivalent to absolute value of the magnitude of coefficients - # min(w): ||y - Xw||^2_2 + alpha*||w||_1 - # Provides sparse solutions: computationally efficient as features with zero coefficients can be ignored - params = { - # alpha = 0 is equivalent to an ordinary least square Regression - 'alpha': np.logspace(-6, 6, num=100) # {1.000E-06, 1.322E-06, 1.748E-06, ..., 1.000E06} - } - regressor_ = GridSearchCV(Lasso(), param_grid=params, cv=5) - - else: - raise SystemError("Did not find specified regression model as valid option. See '--help' for valid " - "regression model options.") - - return regressor_ - - -def get_regressor_performances( - x_learn: list, - x_test: list, - y_learn: list, - y_test: list, - regressor: str = 'pls', - verbose: bool = False -): - """ - The function get_regressor_performances takes features and labels from the - learning and test set. - - When using 'pls_loocv' as regressor, the MSE is calculated for all LOOCV - sets for predicted vs true labels (mse = mean_squared_error(y_test_loo, y_pred_loo) - for a fixed number of components for PLS regression. - In the next iteration, the number of components is increased by 1 (number_of_components += 1) - and the MSE is calculated for this regressor. The loop breaks if i > 9. - Finally, the model of the single AAindex model with the lowest MSE is chosen. - - When using other regressors the parameters are tuned using GridSearchCV. - - This function returnes performance (R2, (N)RMSE, Pearson's r) and model parameters. - """ - regressor = regressor.lower() - best_params = None - - if regressor == 'pls_loocv': # PLS LOOCV tuning - regressor_, best_params = pls_loocv(x_learn, y_learn) - if regressor_ == 'skip': - return [None, None, None, None, None, regressor, None] - - # other regression options (k-fold CV tuning) - else: - regressor_ = cv_regression_options(regressor) - try: - if verbose: - logger.info('CV-based training of regression model...') - regressor_.fit(x_learn, y_learn) # fit model - except ValueError: # scipy/linalg/decomp_svd.py --> ValueError('illegal value in %dth argument of internal gesdd' - return [None, None, None, None, None, regressor, None] - - if regressor != 'pls_loocv': # take best parameters for the regressor and the AAindex - best_params = regressor_.best_params_ - - y_pred = [] - try: - for y_p in regressor_.predict(x_test): # predict validation entries with fitted model - y_pred.append(float(y_p)) - except ValueError: - raise ValueError("Above error message exception indicates that your test set may be empty.") - - r2, rmse, nrmse, pearson_r, spearman_rho = get_performances(y_test, y_pred) - - return r2, rmse, nrmse, pearson_r, spearman_rho, regressor, best_params - - -def performance_list( - train_set: str, - test_set: str, - encoding: str = 'aaidx', - regressor: str = 'pls', - no_fft: bool = False, - sort: str = '1', - couplings_file: str = None, - threads: int = 1 # for parallelization of DCA-based encoding -): - """ - returns the sorted list of all the model parameters and the - performance values (R2 etc.) from function get_performances. 
- """ - global x_train, y_train, train_variants, train_sequences, \ - x_test, y_test, test_variants, test_sequences, \ - model_type - encoding = encoding.lower() - performance_list = [] - train_sequences, train_variants, y_train = get_sequences_from_file(train_set) - test_sequences, test_variants, y_test = get_sequences_from_file(test_set) - if encoding == 'onehot': # OneHot-based encoding - x_onehot_train = OneHotEncoding(train_sequences) - x_onehot_test = OneHotEncoding(test_sequences) - x_train = x_onehot_train.collect_encoded_sequences() - x_test = x_onehot_test.collect_encoded_sequences() - r2, rmse, nrmse, pearson_r, spearman_rho, regression_model, \ - params = get_regressor_performances(x_train, x_test, y_train, y_test, regressor, verbose=True) - if r2 is not None: # get_regressor_performances() returns None for metrics if MSE can't be calculated - performance_list.append([ - encoding.upper(), r2, rmse, nrmse, pearson_r, - spearman_rho, regression_model, params - ]) - elif encoding == 'dca': # 'plmc' or encoding == 'gremlin': - x_train, train_variants, train_sequences, y_train, x_wt, model, model_type = plmc_or_gremlin_encoding( - train_variants, train_sequences, y_train, couplings_file, threads=threads - ) - x_test, test_variants, test_sequences, y_test, x_wt, model, model_type = plmc_or_gremlin_encoding( - test_variants, test_sequences, y_test, couplings_file, threads=threads - ) - - r2, rmse, nrmse, pearson_r, spearman_rho, regression_model, \ - params = get_regressor_performances(x_train, x_test, y_train, y_test, regressor, verbose=True) - if r2 is not None: # get_regressor_performances() returns None for metrics if MSE can't be calculated - performance_list.append([ - encoding.upper(), r2, rmse, nrmse, pearson_r, - spearman_rho, regression_model, params - ]) - - else: # AAindex-based encoding - aa_indices = [file for file in os.listdir(path_aaindex_dir()) if file.endswith('.txt')] - # loop over the 566 AAindex entries, encode with each AAindex and test performance - # can be seen as a AAindex hyperparameter search on the test set --> also see CV performance - # in created folder across all data to ensure a relatively well generalizable model - for index, aaindex in enumerate(tqdm(aa_indices)): - x_aaidx_train = AAIndexEncoding(full_aaidx_txt_path(aaindex), train_sequences) - if not no_fft: # X is FFT-ed of encoded alphabetical sequence - x_train, _ = x_aaidx_train.collect_encoded_sequences() - else: # X is raw encoded of alphabetical sequence - _, x_train = x_aaidx_train.collect_encoded_sequences() - x_aaidx_test = AAIndexEncoding(full_aaidx_txt_path(aaindex), test_sequences) - if not no_fft: # X is FFT-ed of the encoded alphabetical sequence - x_test, _ = x_aaidx_test.collect_encoded_sequences() - else: # X is the raw encoded of alphabetical sequence - _, x_test = x_aaidx_test.collect_encoded_sequences() - # If x_learn or x_test contains None, the sequence could not be (fully) encoded --> Skip - if x_train == 'skip' or x_test == 'skip': - continue # skip the rest and do next iteration - r2, rmse, nrmse, pearson_r, spearman_rho, regression_model, \ - params = get_regressor_performances(x_train, x_test, y_train, y_test, regressor) - if r2 is not None: # get_regressor_performances() returns None for metrics if MSE can't be calculated - performance_list.append([ - aaindex, r2, rmse, nrmse, pearson_r, - spearman_rho, regression_model, params - ]) - - try: - sort = int(sort) - if sort == 2 or sort == 3: - performance_list.sort(key=lambda x: x[sort]) - else: - 
performance_list.sort(key=lambda x: x[sort], reverse=True) - - except ValueError: - raise ValueError("Choose between options 1 to 5 (R2, RMSE, NRMSE, Pearson's r, Spearman's rho.") - - return performance_list - - -def formatted_output( - performance_list, - no_fft=False, - minimum_r2=-1E9 -): - """ - Takes the sorted list from function r2_list and writes the model names with an R2 ≥ 0 - as well as the corresponding parameters for each model so that the user gets - a list (Model_Results.txt) of the top ranking models for the given validation set. - """ - - index, value, value2, value3, value4, value5, regression_model, params = [], [], [], [], [], [], [], [] - - for (idx, val, val2, val3, val4, val5, r_m, pam) in performance_list: - if val >= minimum_r2: - index.append(get_basename(idx)) - value.append('{:f}'.format(val)) - value2.append('{:f}'.format(val2)) - value3.append('{:f}'.format(val3)) - value4.append('{:f}'.format(val4)) - value5.append('{:f}'.format(val5)) - regression_model.append(r_m.upper()) - params.append(pam) - - if len(value) == 0: # Criterion of not finding suitable model is defined by Minimum_R2 - raise SystemError(f'No model with minimum R2 ({minimum_r2}).') - - data = np.array([index, value, value2, value3, value4, value5, regression_model, params]).T - col_width = max(len(str(value)) for row in data for value in row[:-1]) + 5 - - head = ['Index', 'R2', 'RMSE', 'NRMSE', 'Pearson\'s r', 'Spearman\'s rho', 'Regression', 'Model parameters'] - with open('Model_Results.txt', 'w') as f: - if no_fft is not False: - f.write("No FFT used in this model construction, performance" - " represents model accuracies on raw encoded sequence data.\n\n") - - heading = "".join(caption.ljust(col_width) for caption in head) + '\n' - f.write(heading) - - row_length = [] - for row in data: - row_ = "".join(str(value).ljust(col_width) for value in row) + '\n' - row_length.append(len(row_)) - row_length_max = max(row_length) - f.write(row_length_max * '-' + '\n') - - for row in data: - f.write("".join(str(value).ljust(col_width) for value in row) + '\n') - - -def cross_validation( - x: np.ndarray, - y: np.ndarray, - regressor_, # Union[PLSRegression, Ridge, Lasso, ElasticNet, ...] - n_samples: int = 5): - """ - Perform k-fold cross-validation on the input data (encoded sequences and - corresponding fitness values) with default k = 5. Returns all predicted - fitness values of the length y (e.g. (1/5)*len(y) * 5 = 1*len(y)). - """ - # perform k-fold cross-validation on all data - # k = Number of splits, change for changing k in k-fold splitting, default: 5 - y_test_total = [] - y_predicted_total = [] - - kf = KFold(n_splits=n_samples, shuffle=True) - - for train_index, test_index in kf.split(y): - y = np.array(y) - try: - x_train, x_test = x[train_index], x[test_index] - y_train, y_test = y[train_index], y[test_index] - - for numbers in y_test: - y_test_total.append(numbers) - regressor_.fit(x_train, y_train) - y_pred_test = regressor_.predict(x_test) - - for values in y_pred_test: - y_predicted_total.append(float(values)) - except UserWarning: - continue - - return y_test_total, y_predicted_total - - -def get_regressor( - regressor: str, - parameter: dict -): - """ - Returns the tuned CVRegressor with the tuned hyperparameters. 
- Regression options are - - Partial Least Squares Regression - - Random Forest Regression - - Support Vector Machines Regression - - Multilayer Perceptron Regression - - Ridge Regression - - Lasso Regression - - ElasticNet Regression - """ - if regressor == 'pls' or regressor == 'pls_loocv': - # n_components according to lowest MSE for validation set - regressor_ = PLSRegression(n_components=parameter.get('n_components')) - - elif regressor == 'rf': - regressor_ = RandomForestRegressor( - random_state=parameter.get('random_state'), - n_estimators=parameter.get('n_estimators'), - max_features=parameter.get('max_features') - ) - - elif regressor == 'svr': - regressor_ = SVR(C=parameter.get('C'), gamma=parameter.get('gamma')) - - elif regressor == 'mlp': - regressor_ = MLPRegressor( - hidden_layer_sizes=parameter.get('hidden_layer_sizes'), - activation=parameter.get('activation'), - solver=parameter.get('solver'), - learning_rate=parameter.get('learning_rate'), - learning_rate_init=parameter.get('learning_rate_init'), - max_iter=parameter.get('max_iter'), - random_state=parameter.get('random_state') - ) - - elif regressor == 'ridge' or regressor == 'l2': - regressor_ = Ridge( - alpha=parameter.get('alpha') - ) - - elif regressor == 'lasso' or regressor == 'l1': - regressor_ = Lasso( - alpha=parameter.get('alpha') - ) - - elif regressor == 'elasticnet' or regressor == 'l1l2': - regressor_ = ElasticNet( - alpha=parameter.get('alpha') - ) - - else: - raise SystemError("Did not find specified regression model as valid option. " - "See '--help' for valid regression model options.") - - return regressor_ - - -def encode_based_on_type( - encoding: str, - variants, - sequences, - y_true=None, - couplings_file=None, - idx=None, - threads=1, - no_fft=True, - substitution_sep='/', - verbose=True -): - if y_true is None: - y_true = np.zeros(np.shape(sequences)) - if encoding == 'aaidx': # AAindex encoding technique - encoder = AAIndexEncoding(full_aaidx_txt_path(idx), list(np.atleast_1d(sequences))) - if no_fft is False: # use FFT on encoded sequences (default) - x, _ = encoder.collect_encoded_sequences() - else: # use raw encoding (no FFT used on encoded sequences) - _, x = encoder.collect_encoded_sequences() - elif encoding == 'onehot': # OneHot encoding technique - encoder = OneHotEncoding(sequences) - x = encoder.collect_encoded_sequences() - elif encoding == 'dca': # PLMC or GREMLIN-based encoding - if len(sequences) == 1: - use_global_model = True - else: - use_global_model = False - x, variants, sequences, y_true, x_wt, model, model_type = plmc_or_gremlin_encoding( - variants, sequences, y_true, couplings_file, substitution_sep, threads, verbose, use_global_model - ) - else: - raise SystemError("Unknown encoding option.") - - assert len(x) == len(variants) == len(sequences) == len(y_true) - return x, variants, sequences, y_true - - -def crossval_on_all(x_train, x_test, y_train, y_test, regressor: str, parameter, idx=None, no_fft=False): - """ - Use e.g. 80 % of all data for learning (hyperparameter tuning via validation) - and e.g. 20 % for testing 5 times for 5-fold cross validation. 
- """ - name = get_basename(idx) - cv_filename = os.path.join('CV_performance', f'{name}_{regressor.upper()}_CV_Results.txt') - try: - os.remove(cv_filename) - except FileNotFoundError: - pass - file = open(cv_filename, 'w') - file.write('5-fold cross-validated performance of top ' - 'models for validation set across all data.\n\n') - if no_fft: - file.write("No FFT used in this model construction, performance represents" - " model accuracies on raw encoded sequence data.\n\n") - file.close() - - x = np.concatenate([x_train, x_test]) - y = np.concatenate([y_train, y_test]) - - regressor_ = get_regressor(regressor, parameter) - # perform 5-fold cross-validation on all data (on X and Y) - n_samples = 5 - y_test_total, y_predicted_total = cross_validation(x, y, regressor_, n_samples) - - r_squared, rmse, nrmse, pearson_r, spearman_rho = \ - get_performances(y_test_total, y_predicted_total) - - with open(cv_filename, 'a') as f: - f.write('Regression type: {}; Parameter: {}; Encoding index: {}\n'.format( - regressor.upper(), parameter, name)) - f.write('R2 = {:.5f}; RMSE = {:.5f}; NRMSE = {:.5f}; Pearson\'s r = {:.5f};' - ' Spearman\'s rho = {:.5f}\n\n'.format(r_squared, rmse, nrmse, pearson_r, spearman_rho)) - - figure, ax = plt.subplots() - legend = r'$R^2$' + f' = {r_squared:.3f}' + f'\nRMSE = {rmse:.3f}' + f'\nNRMSE = {nrmse:.3f}' + \ - f'\nPearson\'s ' + r'$r$' + f' = {pearson_r:.3f}' + f'\nSpearman\'s ' + \ - fr'$\rho$ = {spearman_rho:.3f}' + '\n' + fr'($N$ = {len(y_test_total)})' - ax.scatter( - y_test_total, y_predicted_total, - marker='o', s=20, linewidths=0.5, edgecolor='black', label=legend, alpha=0.8 - ) - ax.plot([min(y_test_total) - 1, max(y_test_total) + 1], - [min(y_predicted_total) - 1, max(y_predicted_total) + 1], 'k', lw=0.5) - - ax.set_xlabel('Measured') - ax.set_ylabel('Predicted') - ax.legend(prop={'size': 8}) - plt.savefig( - os.path.join('CV_performance', f'{name}_{regressor.upper()}_{n_samples}-fold-CV.png'), - dpi=500 - ) - plt.close('all') - - -def save_model( - path, - performances, - training_set, - test_set, - threshold=5, - encoding='aaidx', - regressor='pls', - no_fft=False, - train_on_all=False, - couplings_file=None, - threads: int = 1, - label=False -): - """ - Function Save_Model saves the best -s THRESHOLD models as 'Pickle' - files (pickle.dump), which can be loaded again for doing predictions. - Also, in Save_Model included is the def cross_validation-based computing - of the k-fold CV performance of the n component-optimized model on all - data (learning + test set); by default k is 5 (n_samples = 5). - Plots of the CV performance for the t best models are stored inside the - folder CV_performance. 
- """ - global x_train, y_train, train_variants, train_sequences, \ - x_test, y_test, test_variants, test_sequences, \ - model_type - logger.info('Encoding and cross validation on all data (creating folder CV_performance)...') - regressor = regressor.lower() - try: - os.mkdir('CV_performance') - except FileExistsError: - pass - try: - os.mkdir('Pickles') - except FileExistsError: - pass - if encoding != 'aaidx' and x_train is not None and x_test is not None: - pass # take global encodings instead of recomputing DCA encodings - else: - train_sequences, train_variants, y_train = get_sequences_from_file(training_set) - test_sequences, test_variants, y_test = get_sequences_from_file(test_set) - for i, t in enumerate(range(threshold)): - try: - idx = performances[t][0] - parameter = performances[t][7] - - if encoding != 'aaidx' and x_train is not None and x_test is not None: - pass # take global encodings instead of recomputing DCA encodings - else: - x_train, train_variants, train_sequences, y_train = encode_based_on_type( - encoding, train_variants, train_sequences, y_train, couplings_file, idx, threads, no_fft - ) - x_test, test_variants, test_sequences, y_test = encode_based_on_type( - encoding, test_variants, test_sequences, y_test, couplings_file, idx, threads, no_fft - ) - - crossval_on_all(x_train, x_test, y_train, y_test, regressor, parameter, idx, no_fft) - regressor_ = get_regressor(regressor, parameter) - if train_on_all: # Train model hyperparameters based on all available data (training + test set) - # But, no generalization performance can be estimated as the model also trained on the test set - x = np.concatenate([x_train, x_test]) - y = np.concatenate([y_train, y_test]) - regressor_.fit(x, y) - else: - # fit (only) on full learning set (FFT or noFFT is defined already above) - regressor_.fit(x_train, y_train) - # 2D prediction array output to 1D - y_test_pred = np.array(regressor_.predict(x_test)).flatten() - plot_y_true_vs_y_pred( - y_true=y_test, - y_pred=y_test_pred, - variants=test_variants, - label=label, - hybrid=False, - name=f'{get_basename(idx)}_{regressor.upper()}_' - ) - name = get_basename(idx) - if model_type in ['PLMC', 'GREMLIN'] and encoding not in ['aaidx', 'onehot']: - name = 'ML' + model_type.lower() - logger.info(f'Saving model as {name}') - file = open(os.path.join(path, 'Pickles', name), 'wb') - pickle.dump(regressor_, file) - file.close() - - except IndexError: - raise IndexError - # break - - if encoding == 'onehot' or encoding == 'dca': # only 1 model/encoding --> - break # no further iteration needed, thus break loop - - -def predict( - path, - model, - prediction_set=None, - encoding='aaidx', - mult_path=None, - no_fft=False, - variants=None, - sequences=None, - couplings_file=None, - threads: int = 1, # for parallelization of DCA-based encoding - substitution_sep='/', - verbose=False -): - """ - The function Predict is used to perform predictions. - Saved pickle files of models will be loaded again: - mod = pickle.load(file) - and used for predicting the label y (y = mod.predict(x)) - of sequences given in the Prediction_Set.fasta. 
- """ - # model defines pickle to load (and thus determines encoding AAidx) - file = open(os.path.join(path, 'Pickles', str(model)), 'rb') - loaded_model = pickle.load(file) - file.close() - idx = None - if encoding == 'aaidx': - idx = model + '.txt' - if sequences is None and variants is None: # File-based prediction - sequences, variants, _ = get_sequences_from_file(prediction_set, mult_path) - - try: - x, variants, sequences, _ = encode_based_on_type( - encoding, variants, sequences, None, couplings_file, - idx, threads, no_fft, substitution_sep, verbose - ) - except SystemError: - return 'skip' - if type(x) == list: - if not x: - return 'skip' - elif type(x) == np.ndarray: - if not x.any(): - return 'skip' - - assert len(variants) == len(x) - - try: - ys = loaded_model.predict(x) - except ValueError: - raise SystemError( - "If you used an encoding such as onehot, make sure to use the correct model, e.g. -m ONEHOT. " - "If you used an AAindex-encoded model you likely tried to predict using a model encoded with " - "(or without) FFT featurization ('--nofft') while the model was trained without (or with) FFT " - "featurization so check Model_Results.txt line 1, if the models were trained with or without FFT." - ) - except AttributeError: - raise SystemError( - "The model specified is likely a hybrid or pure statistical DCA (and no pure ML model).\n" - "Check the specified model provided via the \'-m\' flag." - ) - - predictions = [(float(ys[i]), variants[i]) for i in range(len(ys))] # List of tuples - - # Pay attention if increased negative values would define a better variant --> use negative flag - predictions.sort() - predictions.reverse() # if predictions array is too large? - return predictions - - -def predict_ts( - path, - model, - test_set=None, - encoding='aaidx', - idx=None, - no_fft=False, - couplings_file=None, - label=False, - threads: int = 1 # for parallelization of DCA-based encoding -): - """ - The function Predict is used to perform predictions. - Saved pickle files of models will be loaded again: - mod = pickle.load(file) - and used for predicting the label y (y = mod.predict(x)) - of sequences given in the Prediction_Set.fasta. - """ - file = open(os.path.join(path, 'Pickles', str(model)), 'rb') - loaded_model = pickle.load(file) - file.close() - if type(loaded_model) == dict: - loaded_model = loaded_model['model'] - if encoding == 'aaidx': - idx = model + '.txt' - - sequences, variants, y_test = get_sequences_from_file(test_set) - x, variants, sequences, y_test, *_ = encode_based_on_type( - encoding, variants, sequences, y_test, couplings_file, idx, threads, no_fft - ) - if type(x) == list: - if not x: - return 'skip' - elif type(x) == np.ndarray: - if not x.any(): - return 'skip' - if encoding != 'aaidx': - idx = encoding - - assert len(variants) == len(x) - - try: - y_pred = loaded_model.predict(x) - y_pred = list(np.array(y_pred).flatten()) - except ValueError: - raise SystemError( - "If you used an encoding such as onehot, make sure to use the correct model, e.g. -m ONEHOT. " - "If you used an AAindex-encoded model you likely tried to predict using a model encoded with " - "(or without) FFT featurization ('--nofft') while the model was trained without (or with) FFT " - "featurization so check Model_Results.txt line 1, if the models were trained with or without FFT." 
- ) - except AttributeError: - raise SystemError( - "The model specified is likely a hybrid or pure statistical DCA (and no pure ML model).\n" - "Check the specified model provided via the \'-m\' flag." - ) - - plot_y_true_vs_y_pred(y_test, y_pred, variants, label, hybrid=False, name=f'{get_basename(idx).upper()}_') +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +""" +Main modules for regression/ML including feature generation +(i.e. sequence encoding), cross-validation-based hyperparameter +tuning, prediction, and plotting routines. +""" + + +import os +from typing import Union + +import logging +logger = logging.getLogger('pypef.ml.regression') +import matplotlib +matplotlib.use('Agg') # no plt.show(), just save plot +import matplotlib.pyplot as plt +import numpy as np +import pickle +from tqdm import tqdm # progress bars +from sklearn.model_selection import LeaveOneOut +from sklearn.model_selection import KFold +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import GridSearchCV # default: refit=True + +# import regression models +from sklearn.cross_decomposition import PLSRegression +from sklearn.ensemble import RandomForestRegressor +from sklearn.svm import SVR +from sklearn.neural_network import MLPRegressor +from sklearn.linear_model import Ridge, Lasso, ElasticNet + +from pypef.utils.variant_data import ( + amino_acids, get_sequences_from_file, get_basename +) +from pypef.utils.plot import plot_y_true_vs_y_pred +from pypef.utils.performance import get_performances +from pypef.dca.hybrid_model import plmc_or_gremlin_encoding + +import warnings +warnings.filterwarnings(action='ignore', category=RuntimeWarning, module='numpy') +# ignoring warnings of PLS regression when using n_components +warnings.filterwarnings(action='ignore', category=RuntimeWarning, module='sklearn') +warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn') +warnings.filterwarnings(action='ignore', category=DeprecationWarning, module='sklearn') +# FutureWarning: The default of 'normalize' will be set to False in version 1.2 and deprecated in version 1.4. +# If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. 
+# sklearn: The default value of 'normalize' should be changed to False in linear models where now normalize=True +warnings.filterwarnings(action='ignore', category=FutureWarning, module='sklearn') + + +# globals +x_train = None +x_test = None +y_train = None +y_test = None +train_variants = None +test_variants = None +train_sequences = None +test_sequences = None +model_type = None + + +def read_models(number): + """ + reads the models found in the file Model_Results.txt. + If no model was trained, the .txt file does not exist. + """ + try: + ls = "" + with open('Model_Results.txt', 'r') as file: + for i, lines in enumerate(file): + if i == 0: + if lines[:6] == 'No FFT': + number += 2 + if i <= number + 1: + ls += lines + return ls + except FileNotFoundError: + return "No Model_Results.txt found." + + +def full_aaidx_txt_path(filename): + """ + returns the path of an index inside the folder /AAindex/, + e.g. /path/to/pypef/ml/AAindex/FAUJ880104.txt. + """ + modules_path = os.path.dirname(os.path.abspath(__file__)) + return os.path.join(modules_path, 'AAindex', filename) + + +def path_aaindex_dir(): + """ + returns the absolute path to the /AAindex folder, + e.g. /path/to/AAindex/. + """ + return os.path.join(os.path.dirname(os.path.abspath(__file__)), 'AAindex') + + +class AAIndex: + """ + gets all the information that are given in each AAindex file. + For the program routine it provides the library to enable translation + of the alphabetical amino acid sequence to an array of numericals. + """ + def __init__(self, filename): + self.file = filename + self.accession_number = None + self.data_description = None + self.pmid = None + self.authors = None + self.title_of_article = None + self.journal_reference = None + + def general_information(self): + """ + Gets and allocates general information based on the AAindex file + format defined by file sections 'H', 'D', 'E', 'A', 'T', 'J' + """ + with open(self.file, 'r') as f: + for line in f: + # try/ except "removes" empty lines. + try: + words = line.split() + id_letter = words[0] + except IndexError: + break + + # Extract some general information about AAindex file. + if id_letter == 'H': + self.accession_number = words[1] + elif id_letter == 'D': + self.data_description = words[1] + elif id_letter == 'E': + self.pmid = words[1:] + elif id_letter == 'A': + self.authors = ' '.join(words[1:]) + elif id_letter == 'T': + self.title_of_article = ' '.join(words[1:]) + elif id_letter == 'J': + self.journal_reference = ' '.join(words[1:]) + + def encoding_dictionary(self): + """ + Get numerical values of AAindex for each amino acid + """ + try: + with open(self.file, 'r') as f: + for line in f: + # try/ except "removes" empty lines + try: + words = line.split() + id_letter = words[0] + except IndexError: + break + + # Extract numerical values of AAindex. + if id_letter == 'I': + + keys = [] + for word in words[1:]: + keys.append(word[0]) + keys.append(word[-1]) + + values = [] + for row in range(2): + line = f.readline() + strings = line.split() + for idx, string in enumerate(strings): + # Some amino acids may have no value + try: + strings[idx] = float(string) + except ValueError: + strings[idx] = None + values.append(strings) + values = np.reshape(np.array(values).T, len(keys)) + + return dict(zip(keys, values)) + except FileNotFoundError: + raise FileNotFoundError( + "Probably you used an encoding technique option in combination with a model " + "that was created using another encoding option (e.g. 
pypef ml -e aaidx -m " + "ONEHOT -p TS.fasta) which is not allowed." + ) + + + class AAIndexEncoding: + """ + converts the string sequence into a list of numericals + using the AAindex translation library; Fourier trans- + forming the numerical array that was translated by + get_numerical_sequence --> do_fourier, computing the input + matrices X for the regressor. + Returns FFT-ed encoded sequences (amplitudes), + and raw encoded sequences (raw_numerical_sequences). + """ + def __init__( + self, + aaindex_file=None, + sequences: list = None, + ): + aaidx = AAIndex(aaindex_file) + self.dictionary = aaidx.encoding_dictionary() + self.sequences = sequences + + def get_numerical_sequence(self, sequence): + return np.array([self.dictionary[aminoacid] for aminoacid in sequence]) + + @staticmethod + def do_fourier(sequence): + """ + This static function does the Fast Fourier Transform. + Since the condition + + len(Array) = 2^k -> k = log_2(len(Array)), k in N + + must be satisfied, the array must be reshaped (zero padding) + if k is not an integer value. + """ + threshold = 1e-8 # errors due to computer uncertainties + k = np.log2(sequence.size) # get exponent k + mean = np.mean(sequence, axis=0) # calculate mean of numerical array + sequence = np.subtract(sequence, mean) # subtract mean to avoid artificial effects of FT + + if abs(int(k) - k) > threshold: # check if length of array fulfills previous equation + numerical_sequence_reshaped = np.zeros(pow(2, (int(k) + 1))) # reshape array + for index, value in enumerate(sequence): + numerical_sequence_reshaped[index] = value + sequence = numerical_sequence_reshaped + + fourier_transformed = np.fft.fft(sequence) # FFT + ft_real = np.real(fourier_transformed) + ft_imag = np.imag(fourier_transformed) + + x = np.linspace(1, sequence.size, sequence.size) # frequencies + x = x / max(x) # normalization of frequency + + amplitude = ft_real * ft_real + ft_imag * ft_imag + + if max(amplitude) != 0: + amplitude = np.true_divide(amplitude, max(amplitude)) # normalization of amplitude + + return amplitude, x + + def aaidx_and_or_fft_encode_sequence(self, sequence): + """ + Encodes the sequence numerically and returns the FFT-based encoding (amplitudes) + and the raw numerical encoding. + """ + num = self.get_numerical_sequence(sequence) + # Numerical sequence gets expanded by zeros so that also different + # lengths of sequences can be processed using '--nofft' option + k = np.log2(len(num)) + if abs(int(k) - k) > 1e-8: # check if length of array fulfills previous equation + raw_numerical_seq = np.append(num, np.zeros(pow(2, (int(k) + 1)) - len(num))) # reshape array + else: + raw_numerical_seq = num + + if None not in num: # all amino acids could be encoded with the corresponding AAindex + amplitudes_, frequencies_ = self.do_fourier(num) + else: # None values in encoded sequence: do not further use encoding (FFT not possible) + return [None], [None] + # Fourier spectra are mirrored at frequency = 0.5 -> No more information at higher frequencies + half = len(frequencies_) // 2 # // for integer division + amplitude = amplitudes_[:half] # FFT-ed encoded amino acid sequences + # Appended zeros of raw encoding also allow prediction of differently sized sequences + + # return -> X_fft_encoding, X_raw_encoding + return amplitude, raw_numerical_seq + + def collect_encoded_sequences(self): + """ + Loop over all sequences to encode each and collect + and return all encoded sequences
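The FFT featurization above (mean-centering, zero-padding to the next power of two, amplitude normalization, and keeping only the first, non-mirrored half of the spectrum) can be reproduced in isolation. A minimal sketch on a hypothetical four-entry index; the toy_index values are stand-ins, not real AAindex data:

import numpy as np

toy_index = {'A': 0.62, 'C': 0.29, 'D': -0.90, 'Y': 0.26}  # hypothetical index values

def fft_amplitudes(sequence: str, index: dict) -> np.ndarray:
    num = np.array([index[aa] for aa in sequence], dtype=float)
    num = num - num.mean()                    # remove offset before the FFT
    k = np.log2(num.size)
    if abs(int(k) - k) > 1e-8:                # zero-pad to the next power of two
        padded = np.zeros(2 ** (int(k) + 1))
        padded[:num.size] = num
        num = padded
    ft = np.fft.fft(num)
    amp = ft.real ** 2 + ft.imag ** 2
    if amp.max() != 0:
        amp = amp / amp.max()                 # normalize amplitudes
    return amp[:len(amp) // 2]                # spectrum is mirrored at f = 0.5

print(fft_amplitudes('ACDY', toy_index))      # 2 amplitude features for 4 residues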
+ """ + # There may be amino acids without a value in AAindex + # Skip these indices + fft_encoded_sequences, raw_encoded_sequences = [], [] + for sequence in self.sequences: + fft_encoded_sequence, raw_encoded_sequence = self.aaidx_and_or_fft_encode_sequence(sequence) + if None in raw_encoded_sequence: + return 'skip', 'skip' # skipping this AAindex + else: + fft_encoded_sequences.append(fft_encoded_sequence) + raw_encoded_sequences.append(raw_encoded_sequence) + + return fft_encoded_sequences, raw_encoded_sequences + + +class OneHotEncoding: + """ + Generates an one-hot encoding, i.e. represents + the current amino acid at a position as 1 and + the other (19) amino acids as 0. Thus, the encoding + of a sequence has the length 20 x sequence length. + E.g. 'ACDY' --> [1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1] + """ + def __init__( + self, + sequences: list, + verbose=True + ): + self.sequences = sequences + self.amino_acids = amino_acids # imported, 20 standard AAs + self.verbose = verbose + + def encoding_dict(self) -> dict[str, np.ndarray]: + encoding_dict = {} + for idx, amino_acid in enumerate(self.amino_acids): + encoding_vector = np.zeros(20, dtype=int) + encoding_vector[idx] = 1 + encoding_dict.update({amino_acid: encoding_vector}) + return encoding_dict + + def one_hot_encode_sequence(self, sequence: str) -> np.ndarray: + encoded_sequence = [] + for aminoacid in sequence: + encoded_sequence.append(self.encoding_dict()[aminoacid]) + return np.concatenate(encoded_sequence) + + def collect_encoded_sequences(self, verbose: bool = None) -> np.ndarray: + if verbose is None: + disable = not self.verbose + else: + disable = not verbose + if len(np.atleast_1d(self.sequences)) == 1: # always silence for single (wt) sequence + disable = True + encoded_sequences = [] + for sequence in tqdm(self.sequences, disable=disable): + encoded_sequences.append(self.one_hot_encode_sequence(sequence)) + return np.array(encoded_sequences) + + +def pls_loocv( + x_train: np.ndarray, + y_train: np.ndarray +) -> Union[tuple[str, str], tuple[PLSRegression, dict]]: + """ + PLS regression with LOOCV n_components tuning as described by Cadet et al. + https://doi.org/10.1186/s12859-018-2407-8 + https://doi.org/10.1038/s41598-018-35033-y + Hyperparameter (N component) tuning of PLS regressor, can achieve slightly better + results than e.g. 5-fold CV. 
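The hand-rolled LOOCV loop of pls_loocv can be expressed compactly with scikit-learn helpers. This is only a sketch of the tuning idea on random placeholder data, not the package's actual code path:

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut, cross_val_predict

rng = np.random.default_rng(42)
x = rng.normal(size=(30, 16))                          # placeholder encoded sequences
y = 2.0 * x[:, 0] + rng.normal(scale=0.1, size=30)     # placeholder fitness values

mse_per_n = {}
for n_comp in range(1, 10):                            # n_comp = 1, 2, ..., 9
    y_loo = cross_val_predict(PLSRegression(n_components=n_comp), x, y, cv=LeaveOneOut())
    mse_per_n[n_comp] = mean_squared_error(y, y_loo)
best_n = min(mse_per_n, key=mse_per_n.get)             # lowest LOOCV MSE wins
print(f'Best n_components: {best_n}')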
+ """ + mean_squared_error_list = [] + for n_comp in range(1, 10): # n_comp = 1, 2,..., 9 + try: + pls = PLSRegression(n_components=n_comp) + loo = LeaveOneOut() + y_pred_loo = [] + y_test_loo = [] + for train, test in loo.split(x_train): + x_learn_loo = [] + y_learn_loo = [] + x_test_loo = [] + for j in train: + x_learn_loo.append(x_train[j]) + y_learn_loo.append(y_train[j]) + for k in test: + x_test_loo.append(x_train[k]) + y_test_loo.append(y_train[k]) + x_learn_loo = np.array(x_learn_loo) + x_test_loo = np.array(x_test_loo) + y_learn_loo = np.array(y_learn_loo) + try: + pls.fit(x_learn_loo, y_learn_loo) + except ValueError: # scipy/linalg/decomp_svd.py ValueError: + continue # illegal value in %dth argument of internal gesdd + y_pred___ = pls.predict(x_test_loo)[0] + y_pred_loo.append(pls.predict(x_test_loo)[0]) + except np.linalg.LinAlgError: # numpy.linalg.LinAlgError: SVD did not converge + continue + try: + mse = mean_squared_error(y_test_loo, y_pred_loo) + mean_squared_error_list.append(mse) + except ValueError: # MSE could not be calculated (No values due to numpy.linalg.LinAlgErrors) + return 'skip', 'skip' + mean_squared_error_list = np.array(mean_squared_error_list) + idx = np.where(mean_squared_error_list == np.min(mean_squared_error_list))[0][0] + 1 + # Model is fitted with best n_components (lowest MSE) + best_params = {'n_components': idx} + regressor_ = PLSRegression(n_components=best_params.get('n_components')) + + return regressor_, best_params + + +def cv_regression_options(regressor: str) -> GridSearchCV: + """ + Returns the CVRegressor with the tunable regression-specific hyperparameter grid + for training a regression model. + Regression options are + - Partial Least Squares Regression + - Random Forest Regression + - Support Vector Machines Regression + - Multilayer Perceptron Regression + - Ridge Regression + - Lasso Regression + - ElasticNet Regression + """ + if regressor == 'pls': + params = {'n_components': list(np.arange(1, 10))} # n_comp = 1, 2,..., 9 + regressor_ = GridSearchCV(PLSRegression(), param_grid=params, cv=5) # iid in future versions redundant + + elif regressor == 'rf': + params = { # similar parameter grid as Xu et al., https://doi.org/10.1021/acs.jcim.0c00073 + 'random_state': [42], # state determined + 'n_estimators': [100, 250, 500, 1000], # number of individual decision trees in the forest + 'max_features': ['auto', 'sqrt', 'log2'] # “auto” -> max_features=n_features, + # “sqrt” -> max_features=sqrt(n_features) “log2” -> max_features=log2(n_features) + } + regressor_ = GridSearchCV(RandomForestRegressor(), param_grid=params, cv=5) + + elif regressor == 'svr': + params = { # similar parameter grid as Xu et al. 
+ 'C': [2 ** 0, 2 ** 2, 2 ** 4, 2 ** 6, 2 ** 8, 2 ** 10, 2 ** 12], # Regularization parameter + 'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001] # often 1 / n_features or 1 / (n_features * X.var()) + } + regressor_ = GridSearchCV(SVR(), param_grid=params, cv=5) + + elif regressor == 'mlp': + params = { + # feedforward network trained via backpropagation – here only using a single hidden layer + 'hidden_layer_sizes': [i for i in range(1, 12)], # size of the single hidden layer: 1, 2, ..., 11 + 'activation': ['relu'], # rectified linear unit + 'solver': ['adam', 'lbfgs'], # ADAM: A Method for Stochastic Optimization, or Limited-memory BFGS + 'learning_rate': ['constant'], # learning rate given by ‘learning_rate_init’ + 'learning_rate_init': [0.001, 0.01, 0.1], # only used when solver=’sgd’ or ‘adam’ + 'max_iter': [1000, 200], # for stochastic solvers (‘sgd’, ‘adam’) determines epochs + 'random_state': [42] + } + regressor_ = GridSearchCV(MLPRegressor(), param_grid=params, cv=5) + + elif regressor == 'elasticnet' or regressor == 'l1l2': + # Linear regression with combined L1 and L2 priors as regularizer. + # min(w): ||y - Xw||^2_2 + alpha*l1_ratio*||w||_1 + 0.5*alpha*(1 - l1_ratio)*||w||^2_2 + params = { + 'alpha': np.logspace(-6, 6, num=100) # {1.000E-06, 1.322E-06, 1.748E-06, ..., 1.000E06} + } + regressor_ = GridSearchCV(ElasticNet(), param_grid=params, cv=5) + + elif regressor == 'ridge' or regressor == 'l2': + # Performs L2 regularization, i.e., adds a penalty equivalent to the square of the magnitude of coefficients + # Mainly used to prevent overfitting; since all features are kept, + # very high feature counts can pose computational challenges. + # min(w): ||y - Xw||^2_2 + alpha*||w||^2_2 + params = { + # alpha = 0 is equivalent to an ordinary least squares regression + # higher values of alpha reduce overfitting, significantly high values can + # cause underfitting as well (e.g., regularization strength alpha = 5) + 'alpha': np.logspace(-6, 6, num=100) # {1.000E-06, 1.322E-06, 1.748E-06, ..., 1.000E06} + } + regressor_ = GridSearchCV(Ridge(), param_grid=params, cv=5) + + elif regressor == 'lasso' or regressor == 'l1': + # Lasso model fit via coordinate descent (LARS-based fitting is available as sklearn's LassoLars). + # Performs L1 regularization, i.e., adds a penalty equivalent to the absolute value of the magnitude of coefficients + # min(w): ||y - Xw||^2_2 + alpha*||w||_1 + # Provides sparse solutions: computationally efficient as features with zero coefficients can be ignored + params = { + # alpha = 0 is equivalent to an ordinary least squares regression + 'alpha': np.logspace(-6, 6, num=100) # {1.000E-06, 1.322E-06, 1.748E-06, ..., 1.000E06} + } + regressor_ = GridSearchCV(Lasso(), param_grid=params, cv=5) + + else: + raise SystemError("Did not find specified regression model as valid option. See '--help' for valid " + "regression model options.") + + return regressor_ + + + def get_regressor_performances( + x_learn: list, + x_test: list, + y_learn: list, + y_test: list, + regressor: str = 'pls', + verbose: bool = False + ): + """ + The function get_regressor_performances takes features and labels from the + learning and test set. + + When using 'pls_loocv' as regressor, the MSE is calculated for all LOOCV + sets for predicted vs true labels (mse = mean_squared_error(y_test_loo, y_pred_loo)) + for a fixed number of components for PLS regression. + In the next iteration, the number of components is increased by 1 + and the MSE is calculated for this regressor. The loop ends after n_components = 9.
+ Finally, the number of components yielding the lowest MSE is chosen. + + When using other regressors, the parameters are tuned using GridSearchCV. + + Returns + ------- + Performances (R2, RMSE, NRMSE, Pearson's r, Spearman's rho), regressor type, and model parameters. + """ + regressor = regressor.lower() + best_params = None + + if regressor == 'pls_loocv': # PLS LOOCV tuning + regressor_, best_params = pls_loocv(x_learn, y_learn) + if regressor_ == 'skip': + return [None, None, None, None, None, regressor, None] + + # other regression options (k-fold CV tuning) + else: + regressor_ = cv_regression_options(regressor) + try: + if verbose: + logger.info('CV-based training of regression model...') + regressor_.fit(x_learn, y_learn) # fit model + except ValueError: # scipy/linalg/decomp_svd.py --> ValueError('illegal value in %dth argument of internal gesdd') + return [None, None, None, None, None, regressor, None] + + if regressor != 'pls_loocv': # take best parameters for the regressor and the AAindex + best_params = regressor_.best_params_ + + y_pred = [] + try: + for y_p in regressor_.predict(x_test): # predict validation entries with fitted model + y_pred.append(float(y_p)) + except ValueError: + raise ValueError("The above exception indicates that your test set may be empty.") + + r2, rmse, nrmse, pearson_r, spearman_rho = get_performances(y_test, y_pred) + + return r2, rmse, nrmse, pearson_r, spearman_rho, regressor, best_params + + + def performance_list( + train_set: str, + test_set: str, + encoding: str = 'aaidx', + regressor: str = 'pls', + no_fft: bool = False, + sort: str = '1', + couplings_file: str = None, + threads: int = 1 # for parallelization of DCA-based encoding + ): + """ + returns the sorted list of all the model parameters and the + performance values (R2 etc.) from function get_performances.
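The sort option used by performance_list follows one convention: the error columns 2 (RMSE) and 3 (NRMSE) are sorted ascending since lower errors are better, every other metric column descending. A toy illustration with hypothetical rows:

toy_performances = [
    ['FAUJ880104', 0.61, 0.90, 0.30, 0.80, 0.78, 'pls', {'n_components': 5}],  # hypothetical
    ['GEIM800103', 0.72, 0.75, 0.25, 0.86, 0.84, 'pls', {'n_components': 7}],  # hypothetical
]
sort = 1                                                       # 1 = R2 column
toy_performances.sort(key=lambda row: row[sort], reverse=sort not in (2, 3))
print([row[0] for row in toy_performances])                    # best model first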
+ """ + global x_train, y_train, train_variants, train_sequences, \ + x_test, y_test, test_variants, test_sequences, \ + model_type + encoding = encoding.lower() + performance_list = [] + train_sequences, train_variants, y_train = get_sequences_from_file(train_set) + test_sequences, test_variants, y_test = get_sequences_from_file(test_set) + if encoding == 'onehot': # OneHot-based encoding + x_onehot_train = OneHotEncoding(train_sequences) + x_onehot_test = OneHotEncoding(test_sequences) + x_train = x_onehot_train.collect_encoded_sequences() + x_test = x_onehot_test.collect_encoded_sequences() + r2, rmse, nrmse, pearson_r, spearman_rho, regression_model, \ + params = get_regressor_performances(x_train, x_test, y_train, y_test, regressor, verbose=True) + if r2 is not None: # get_regressor_performances() returns None for metrics if MSE can't be calculated + performance_list.append([ + encoding.upper(), r2, rmse, nrmse, pearson_r, + spearman_rho, regression_model, params + ]) + elif encoding == 'dca': # 'plmc' or encoding == 'gremlin': + x_train, train_variants, train_sequences, y_train, x_wt, model, model_type = plmc_or_gremlin_encoding( + train_variants, train_sequences, y_train, couplings_file, threads=threads + ) + x_test, test_variants, test_sequences, y_test, x_wt, model, model_type = plmc_or_gremlin_encoding( + test_variants, test_sequences, y_test, couplings_file, threads=threads + ) + + r2, rmse, nrmse, pearson_r, spearman_rho, regression_model, \ + params = get_regressor_performances(x_train, x_test, y_train, y_test, regressor, verbose=True) + if r2 is not None: # get_regressor_performances() returns None for metrics if MSE can't be calculated + performance_list.append([ + encoding.upper(), r2, rmse, nrmse, pearson_r, + spearman_rho, regression_model, params + ]) + + else: # AAindex-based encoding + aa_indices = [file for file in os.listdir(path_aaindex_dir()) if file.endswith('.txt')] + # loop over the 566 AAindex entries, encode with each AAindex and test performance + # can be seen as a AAindex hyperparameter search on the test set --> also see CV performance + # in created folder across all data to ensure a relatively well generalizable model + for index, aaindex in enumerate(tqdm(aa_indices)): + x_aaidx_train = AAIndexEncoding(full_aaidx_txt_path(aaindex), train_sequences) + if not no_fft: # X is FFT-ed of encoded alphabetical sequence + x_train, _ = x_aaidx_train.collect_encoded_sequences() + else: # X is raw encoded of alphabetical sequence + _, x_train = x_aaidx_train.collect_encoded_sequences() + x_aaidx_test = AAIndexEncoding(full_aaidx_txt_path(aaindex), test_sequences) + if not no_fft: # X is FFT-ed of the encoded alphabetical sequence + x_test, _ = x_aaidx_test.collect_encoded_sequences() + else: # X is the raw encoded of alphabetical sequence + _, x_test = x_aaidx_test.collect_encoded_sequences() + # If x_learn or x_test contains None, the sequence could not be (fully) encoded --> Skip + if x_train == 'skip' or x_test == 'skip': + continue # skip the rest and do next iteration + r2, rmse, nrmse, pearson_r, spearman_rho, regression_model, \ + params = get_regressor_performances(x_train, x_test, y_train, y_test, regressor) + if r2 is not None: # get_regressor_performances() returns None for metrics if MSE can't be calculated + performance_list.append([ + aaindex, r2, rmse, nrmse, pearson_r, + spearman_rho, regression_model, params + ]) + + try: + sort = int(sort) + if sort == 2 or sort == 3: + performance_list.sort(key=lambda x: x[sort]) + else: + 
performance_list.sort(key=lambda x: x[sort], reverse=True) + + except ValueError: + raise ValueError("Choose between options 1 to 5 (R2, RMSE, NRMSE, Pearson's r, Spearman's rho).") + + return performance_list + + + def formatted_output( + performance_list, + no_fft=False, + minimum_r2=-1E9 + ): + """ + Takes the sorted list from function performance_list and writes the model names with an R2 ≥ minimum_r2 + as well as the corresponding parameters for each model so that the user gets + a list (Model_Results.txt) of the top ranking models for the given validation set. + """ + + index, value, value2, value3, value4, value5, regression_model, params = [], [], [], [], [], [], [], [] + + for (idx, val, val2, val3, val4, val5, r_m, pam) in performance_list: + if val >= minimum_r2: + index.append(get_basename(idx)) + value.append('{:f}'.format(val)) + value2.append('{:f}'.format(val2)) + value3.append('{:f}'.format(val3)) + value4.append('{:f}'.format(val4)) + value5.append('{:f}'.format(val5)) + regression_model.append(r_m.upper()) + params.append(pam) + + if len(value) == 0: # Criterion of not finding a suitable model is defined by minimum_r2 + raise SystemError(f'No model with minimum R2 ({minimum_r2}).') + + data = np.array([index, value, value2, value3, value4, value5, regression_model, params]).T + col_width = max(len(str(value)) for row in data for value in row[:-1]) + 5 + + head = ['Index', 'R2', 'RMSE', 'NRMSE', 'Pearson\'s r', 'Spearman\'s rho', 'Regression', 'Model parameters'] + with open('Model_Results.txt', 'w') as f: + if no_fft: + f.write("No FFT used in this model construction, performance" + " represents model accuracies on raw encoded sequence data.\n\n") + + heading = "".join(caption.ljust(col_width) for caption in head) + '\n' + f.write(heading) + + row_length = [] + for row in data: + row_ = "".join(str(value).ljust(col_width) for value in row) + '\n' + row_length.append(len(row_)) + row_length_max = max(row_length) + f.write(row_length_max * '-' + '\n') + + for row in data: + f.write("".join(str(value).ljust(col_width) for value in row) + '\n') + + + def cross_validation( + x: np.ndarray, + y: np.ndarray, + regressor_, # Union[PLSRegression, Ridge, Lasso, ElasticNet, ...] + n_samples: int = 5): + """ + Perform k-fold cross-validation on the input data (encoded sequences and + corresponding fitness values) with default k = 5. Returns all predicted + fitness values of the length y (e.g. (1/5)*len(y) * 5 = 1*len(y)). + """ + # perform k-fold cross-validation on all data + # k = Number of splits, change for changing k in k-fold splitting, default: 5 + y_test_total = [] + y_predicted_total = [] + + kf = KFold(n_splits=n_samples, shuffle=True) + + for train_index, test_index in kf.split(y): + y = np.array(y) + try: + x_train, x_test = x[train_index], x[test_index] + y_train, y_test = y[train_index], y[test_index] + + for numbers in y_test: + y_test_total.append(numbers) + regressor_.fit(x_train, y_train) + y_pred_test = regressor_.predict(x_test) + + for values in y_pred_test: + y_predicted_total.append(float(values)) + except UserWarning: + continue + + return y_test_total, y_predicted_total + + + def get_regressor( + regressor: str, + parameter: dict + ): + """ + Returns the regressor initialized with the tuned hyperparameters.
+ Regression options are + - Partial Least Squares Regression + - Random Forest Regression + - Support Vector Machines Regression + - Multilayer Perceptron Regression + - Ridge Regression + - Lasso Regression + - ElasticNet Regression + """ + if regressor == 'pls' or regressor == 'pls_loocv': + # n_components according to lowest MSE for validation set + regressor_ = PLSRegression(n_components=parameter.get('n_components')) + + elif regressor == 'rf': + regressor_ = RandomForestRegressor( + random_state=parameter.get('random_state'), + n_estimators=parameter.get('n_estimators'), + max_features=parameter.get('max_features') + ) + + elif regressor == 'svr': + regressor_ = SVR(C=parameter.get('C'), gamma=parameter.get('gamma')) + + elif regressor == 'mlp': + regressor_ = MLPRegressor( + hidden_layer_sizes=parameter.get('hidden_layer_sizes'), + activation=parameter.get('activation'), + solver=parameter.get('solver'), + learning_rate=parameter.get('learning_rate'), + learning_rate_init=parameter.get('learning_rate_init'), + max_iter=parameter.get('max_iter'), + random_state=parameter.get('random_state') + ) + + elif regressor == 'ridge' or regressor == 'l2': + regressor_ = Ridge( + alpha=parameter.get('alpha') + ) + + elif regressor == 'lasso' or regressor == 'l1': + regressor_ = Lasso( + alpha=parameter.get('alpha') + ) + + elif regressor == 'elasticnet' or regressor == 'l1l2': + regressor_ = ElasticNet( + alpha=parameter.get('alpha') + ) + + else: + raise SystemError("Did not find specified regression model as valid option. " + "See '--help' for valid regression model options.") + + return regressor_ + + +def encode_based_on_type( + encoding: str, + variants, + sequences, + y_true=None, + couplings_file=None, + idx=None, + threads=1, + no_fft=True, + substitution_sep='/', + verbose=True +): + if y_true is None: + y_true = np.zeros(np.shape(sequences)) + if encoding == 'aaidx': # AAindex encoding technique + encoder = AAIndexEncoding(full_aaidx_txt_path(idx), list(np.atleast_1d(sequences))) + if no_fft is False: # use FFT on encoded sequences (default) + x, _ = encoder.collect_encoded_sequences() + else: # use raw encoding (no FFT used on encoded sequences) + _, x = encoder.collect_encoded_sequences() + elif encoding == 'onehot': # OneHot encoding technique + encoder = OneHotEncoding(sequences) + x = encoder.collect_encoded_sequences() + elif encoding == 'dca': # PLMC or GREMLIN-based encoding + if len(sequences) == 1: + use_global_model = True + else: + use_global_model = False + x, variants, sequences, y_true, x_wt, model, model_type = plmc_or_gremlin_encoding( + variants, sequences, y_true, couplings_file, substitution_sep, threads, verbose, use_global_model + ) + else: + raise SystemError("Unknown encoding option.") + + assert len(x) == len(variants) == len(sequences) == len(y_true) + return x, variants, sequences, y_true + + +def crossval_on_all(x_train, x_test, y_train, y_test, regressor: str, parameter, idx=None, no_fft=False): + """ + Use e.g. 80 % of all data for learning (hyperparameter tuning via validation) + and e.g. 20 % for testing 5 times for 5-fold cross validation. 
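In other words, crossval_on_all pools the training and test data, collects out-of-fold predictions so that each sample is predicted exactly once, and scores the pooled predictions. A minimal sketch on random placeholder data; the metric formulas below assume common definitions, while the actual ones are implemented by get_performances in pypef.utils.performance:

import numpy as np
from scipy import stats
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
x_tr, x_te = rng.normal(size=(20, 6)), rng.normal(size=(5, 6))  # placeholder encodings
w = rng.normal(size=6)
y_tr, y_te = x_tr @ w, x_te @ w                                 # placeholder fitness

x = np.concatenate([x_tr, x_te])                                # pool all data
y = np.concatenate([y_tr, y_te])

y_true_all, y_pred_all = [], []
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(y):
    model = Ridge(alpha=1.0).fit(x[train_idx], y[train_idx])
    y_true_all.extend(y[test_idx])                              # each sample tested once
    y_pred_all.extend(np.ravel(model.predict(x[test_idx])))

rmse = float(np.sqrt(np.mean((np.array(y_true_all) - np.array(y_pred_all)) ** 2)))
print(rmse, stats.pearsonr(y_true_all, y_pred_all)[0], stats.spearmanr(y_true_all, y_pred_all)[0])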
+ """ + name = get_basename(idx) + cv_filename = os.path.join('CV_performance', f'{name}_{regressor.upper()}_CV_Results.txt') + try: + os.remove(cv_filename) + except FileNotFoundError: + pass + file = open(cv_filename, 'w') + file.write('5-fold cross-validated performance of top ' + 'models for validation set across all data.\n\n') + if no_fft: + file.write("No FFT used in this model construction, performance represents" + " model accuracies on raw encoded sequence data.\n\n") + file.close() + + x = np.concatenate([x_train, x_test]) + y = np.concatenate([y_train, y_test]) + + regressor_ = get_regressor(regressor, parameter) + # perform 5-fold cross-validation on all data (on X and Y) + n_samples = 5 + y_test_total, y_predicted_total = cross_validation(x, y, regressor_, n_samples) + + r_squared, rmse, nrmse, pearson_r, spearman_rho = \ + get_performances(y_test_total, y_predicted_total) + + with open(cv_filename, 'a') as f: + f.write('Regression type: {}; Parameter: {}; Encoding index: {}\n'.format( + regressor.upper(), parameter, name)) + f.write('R2 = {:.5f}; RMSE = {:.5f}; NRMSE = {:.5f}; Pearson\'s r = {:.5f};' + ' Spearman\'s rho = {:.5f}\n\n'.format(r_squared, rmse, nrmse, pearson_r, spearman_rho)) + + figure, ax = plt.subplots() + legend = r'$R^2$' + f' = {r_squared:.3f}' + f'\nRMSE = {rmse:.3f}' + f'\nNRMSE = {nrmse:.3f}' + \ + f'\nPearson\'s ' + r'$r$' + f' = {pearson_r:.3f}' + f'\nSpearman\'s ' + \ + fr'$\rho$ = {spearman_rho:.3f}' + '\n' + fr'($N$ = {len(y_test_total)})' + ax.scatter( + y_test_total, y_predicted_total, + marker='o', s=20, linewidths=0.5, edgecolor='black', label=legend, alpha=0.8 + ) + ax.plot([min(y_test_total) - 1, max(y_test_total) + 1], + [min(y_predicted_total) - 1, max(y_predicted_total) + 1], 'k', lw=0.5) + + ax.set_xlabel('Measured') + ax.set_ylabel('Predicted') + ax.legend(prop={'size': 8}) + plt.savefig( + os.path.join('CV_performance', f'{name}_{regressor.upper()}_{n_samples}-fold-CV.png'), + dpi=500 + ) + plt.close('all') + + +def save_model( + path, + performances, + training_set, + test_set, + threshold=5, + encoding='aaidx', + regressor='pls', + no_fft=False, + train_on_all=False, + couplings_file=None, + threads: int = 1, + label=False +): + """ + Function Save_Model saves the best -s THRESHOLD models as 'Pickle' + files (pickle.dump), which can be loaded again for doing predictions. + Also, in Save_Model included is the def cross_validation-based computing + of the k-fold CV performance of the n component-optimized model on all + data (learning + test set); by default k is 5 (n_samples = 5). + Plots of the CV performance for the t best models are stored inside the + folder CV_performance. 
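The pickle round trip that save_model and the prediction functions below rely on, shown in isolation; 'DEMO_MODEL' is a hypothetical file name under the same Pickles/ layout:

import os
import pickle
import numpy as np
from sklearn.cross_decomposition import PLSRegression

x = np.random.default_rng(2).normal(size=(20, 8))   # placeholder encodings
y = x[:, 0]                                         # placeholder fitness
model = PLSRegression(n_components=2).fit(x, y)

os.makedirs('Pickles', exist_ok=True)
with open(os.path.join('Pickles', 'DEMO_MODEL'), 'wb') as f:
    pickle.dump(model, f)                           # what save_model does per top model
with open(os.path.join('Pickles', 'DEMO_MODEL'), 'rb') as f:
    reloaded = pickle.load(f)                       # what predict()/predict_ts() do
assert np.allclose(model.predict(x), reloaded.predict(x))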
+ """ + global x_train, y_train, train_variants, train_sequences, \ + x_test, y_test, test_variants, test_sequences, \ + model_type + logger.info('Encoding and cross validation on all data (creating folder CV_performance)...') + regressor = regressor.lower() + try: + os.mkdir('CV_performance') + except FileExistsError: + pass + try: + os.mkdir('Pickles') + except FileExistsError: + pass + if encoding != 'aaidx' and x_train is not None and x_test is not None: + pass # take global encodings instead of recomputing DCA encodings + else: + train_sequences, train_variants, y_train = get_sequences_from_file(training_set) + test_sequences, test_variants, y_test = get_sequences_from_file(test_set) + for i, t in enumerate(range(threshold)): + try: + idx = performances[t][0] + parameter = performances[t][7] + + if encoding != 'aaidx' and x_train is not None and x_test is not None: + pass # take global encodings instead of recomputing DCA encodings + else: + x_train, train_variants, train_sequences, y_train = encode_based_on_type( + encoding, train_variants, train_sequences, y_train, couplings_file, idx, threads, no_fft + ) + x_test, test_variants, test_sequences, y_test = encode_based_on_type( + encoding, test_variants, test_sequences, y_test, couplings_file, idx, threads, no_fft + ) + + crossval_on_all(x_train, x_test, y_train, y_test, regressor, parameter, idx, no_fft) + regressor_ = get_regressor(regressor, parameter) + if train_on_all: # Train model hyperparameters based on all available data (training + test set) + # But, no generalization performance can be estimated as the model also trained on the test set + x = np.concatenate([x_train, x_test]) + y = np.concatenate([y_train, y_test]) + regressor_.fit(x, y) + else: + # fit (only) on full learning set (FFT or noFFT is defined already above) + regressor_.fit(x_train, y_train) + # 2D prediction array output to 1D + y_test_pred = np.array(regressor_.predict(x_test)).flatten() + plot_y_true_vs_y_pred( + y_true=y_test, + y_pred=y_test_pred, + variants=test_variants, + label=label, + hybrid=False, + name=f'{get_basename(idx)}_{regressor.upper()}_' + ) + name = get_basename(idx) + if model_type in ['PLMC', 'GREMLIN'] and encoding not in ['aaidx', 'onehot']: + name = 'ML' + model_type.lower() + logger.info(f'Saving model as {name}') + file = open(os.path.join(path, 'Pickles', name), 'wb') + pickle.dump(regressor_, file) + file.close() + + except IndexError: + raise IndexError + # break + + if encoding == 'onehot' or encoding == 'dca': # only 1 model/encoding --> + break # no further iteration needed, thus break loop + + +def predict( + path, + model, + prediction_set=None, + encoding='aaidx', + mult_path=None, + no_fft=False, + variants=None, + sequences=None, + couplings_file=None, + threads: int = 1, # for parallelization of DCA-based encoding + substitution_sep='/', + verbose=False +): + """ + The function Predict is used to perform predictions. + Saved pickle files of models will be loaded again: + mod = pickle.load(file) + and used for predicting the label y (y = mod.predict(x)) + of sequences given in the Prediction_Set.fasta. 
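The return value of predict() is simply a best-first list of (fitness, variant) tuples; with the negative flag the caller treats lower values as better. A toy illustration with hypothetical variants and values:

ys = [0.8, 1.4, 0.2]                              # hypothetical predicted fitness values
variants = ['L215F', 'A217N', 'R219S']            # hypothetical variant names
predictions = sorted(zip(map(float, ys), variants), reverse=True)
print(predictions[0])                             # (1.4, 'A217N'): top-ranked variant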
+ """ + # model defines pickle to load (and thus determines encoding AAidx) + file = open(os.path.join(path, 'Pickles', str(model)), 'rb') + loaded_model = pickle.load(file) + file.close() + idx = None + if encoding == 'aaidx': + idx = model + '.txt' + if sequences is None and variants is None: # File-based prediction + sequences, variants, _ = get_sequences_from_file(prediction_set, mult_path) + + try: + x, variants, sequences, _ = encode_based_on_type( + encoding, variants, sequences, None, couplings_file, + idx, threads, no_fft, substitution_sep, verbose + ) + except SystemError: + return 'skip' + if type(x) == list: + if not x: + return 'skip' + elif type(x) == np.ndarray: + if not x.any(): + return 'skip' + + assert len(variants) == len(x) + + try: + ys = loaded_model.predict(x) + except ValueError: + raise SystemError( + "If you used an encoding such as onehot, make sure to use the correct model, e.g. -m ONEHOT. " + "If you used an AAindex-encoded model you likely tried to predict using a model encoded with " + "(or without) FFT featurization ('--nofft') while the model was trained without (or with) FFT " + "featurization so check Model_Results.txt line 1, if the models were trained with or without FFT." + ) + except AttributeError: + raise SystemError( + "The model specified is likely a hybrid or pure statistical DCA (and no pure ML model).\n" + "Check the specified model provided via the \'-m\' flag." + ) + + predictions = [(float(ys[i]), variants[i]) for i in range(len(ys))] # List of tuples + + # Pay attention if increased negative values would define a better variant --> use negative flag + predictions.sort() + predictions.reverse() # if predictions array is too large? + return predictions + + +def predict_ts( + path, + model, + test_set=None, + encoding='aaidx', + idx=None, + no_fft=False, + couplings_file=None, + label=False, + threads: int = 1 # for parallelization of DCA-based encoding +): + """ + The function Predict is used to perform predictions. + Saved pickle files of models will be loaded again: + mod = pickle.load(file) + and used for predicting the label y (y = mod.predict(x)) + of sequences given in the Prediction_Set.fasta. + """ + file = open(os.path.join(path, 'Pickles', str(model)), 'rb') + loaded_model = pickle.load(file) + file.close() + if type(loaded_model) == dict: + loaded_model = loaded_model['model'] + if encoding == 'aaidx': + idx = model + '.txt' + + sequences, variants, y_test = get_sequences_from_file(test_set) + x, variants, sequences, y_test, *_ = encode_based_on_type( + encoding, variants, sequences, y_test, couplings_file, idx, threads, no_fft + ) + if type(x) == list: + if not x: + return 'skip' + elif type(x) == np.ndarray: + if not x.any(): + return 'skip' + if encoding != 'aaidx': + idx = encoding + + assert len(variants) == len(x) + + try: + y_pred = loaded_model.predict(x) + y_pred = list(np.array(y_pred).flatten()) + except ValueError: + raise SystemError( + "If you used an encoding such as onehot, make sure to use the correct model, e.g. -m ONEHOT. " + "If you used an AAindex-encoded model you likely tried to predict using a model encoded with " + "(or without) FFT featurization ('--nofft') while the model was trained without (or with) FFT " + "featurization so check Model_Results.txt line 1, if the models were trained with or without FFT." 
+ ) + except AttributeError: + raise SystemError( + "The model specified is likely a hybrid or pure statistical DCA (and no pure ML model).\n" + "Check the specified model provided via the \'-m\' flag." + ) + + plot_y_true_vs_y_pred(y_test, y_pred, variants, label, hybrid=False, name=f'{get_basename(idx).upper()}_') diff --git a/pypef/utils/directed_evolution.py b/pypef/utils/directed_evolution.py index 1c3a83e..58a958b 100644 --- a/pypef/utils/directed_evolution.py +++ b/pypef/utils/directed_evolution.py @@ -1,354 +1,354 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -""" -Modules for performing random evolution walks -similar as presented by Biswas et al. -""" - - -from __future__ import annotations -import os -import re -import random - -import matplotlib.pyplot as plt -import numpy as np -import warnings -from adjustText import adjust_text -import logging -logger = logging.getLogger('pypef.utils.directed_evolution') - -from pypef.ml.regression import predict -from pypef.dca.hybrid_model import predict_directed_evolution - -# ignoring warnings of scikit-learn regression -warnings.filterwarnings(action='ignore', category=RuntimeWarning, module='sklearn') -warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn') - - -class DirectedEvolution: - # Class attributes (None) - def __init__( # Instance attributes - self, - ml_or_hybrid: str, - encoding: str, - s_wt: str, - y_wt: float, - single_vars: list, - num_iterations: int, - num_trajectories: int, - amino_acids: list, - temp: float, - path: str, - model: str = None, - no_fft: bool = False, - dca_encoder: str | None = None, - usecsv: bool = False, - csvaa: bool = False, - negative: bool = False - ): - """ - Runs in silico directed evolution and plots and writes trajectories. - - Parameters - ---------- - ml_or_hybrid: str - 'ml' or 'hybrid' - encoding: str - 'aaidx' or 'dca' - s_wt: str - WT sequence, s_wt = get_wt_sequence(arguments['--wt']) - y_wt: float - WT fitness, y_wt = arguments['--y_wt'] - single_vars: list - single substituted protein variants; used for recombination - of variants. 
Obtained from the CSV file with get_variants: - single_variants, single_values, higher_variants, higher_values = \ - get_variants(df, amino_acids, s_wt) - num_iterations: int - Number of tried steps in the evolution process - num_trajectories: int - Number of independent evolutionary trajectories - amino_acids: list - Usually the 20 standard amino acids - temp: float - (Boltzmann) 'Temperature' of the Metropolis-Hastings algorithm for - accepting new trajectory variants - path: str - Just current working directory (os.getcwd()) - model: str - Loaded Pickle file for regression/hybrid modeling. - no_fft: bool - If True, not using FFT for AAindex-based encoding - dca_encoder = None or PLMC object - dca_encoder = PLMC( - params_file=arguments['--plmc_params'], - separator=arguments['--sep'] - ) - usecsv: bool - Using only CSV variants for recombination (but all 20 amino acids) - csvaa: bool - Using only CSV variants for recombination (but all amino acids that - are present CSV) - negative: bool - More negative variants define improved variants - """ - self.ml_or_hybrid = ml_or_hybrid - self.encoding = encoding - self.s_wt = s_wt - self.y_wt = y_wt - self.single_vars = single_vars - self.num_iterations = num_iterations - self.num_trajectories = num_trajectories - self.amino_acids = amino_acids - self.temp = temp - self.path = path - self.model = model - self.no_fft = no_fft # for AAidx only - self.dca_encoder = dca_encoder - self.usecsv = usecsv - self.csvaa = csvaa - self.negative = negative - self.de_step_counter = 0 # DE steps - self.traj_counter = 0 # Trajectory counter - - def mutate_sequence( - self, - seq: str, - prev_mut_loc: int - ): - """ - Parameters - ---------- - seq: str, - Initial sequence to be mutated, must not be WT Seq but can - also itself be already substituted (iterative sequence substitutions) - prev_mut_loc: int - Previous position mutated, new position will be randomly chosen within - a range, by default: new_pos = previous_pos +- 8 - - Produces a mutant sequence (integer representation), given an initial sequence - and the previous position of mutation. - - """ - try: - os.mkdir('EvoTraj') - except FileExistsError: - pass - - var_seq_list = [] - rand_loc = random.randint(prev_mut_loc - 8, prev_mut_loc + 8) # find random position to mutate - while (rand_loc <= 0) or (rand_loc >= len(seq)): - rand_loc = random.randint(prev_mut_loc - 8, prev_mut_loc + 8) - aa_list = self.amino_acids - if self.usecsv: # Only perform directed evolution on positional csv variant data, - pos_list = [] # else: aa_list = amino_acids - aa_list = [] # overwrite aa_list = self.amino_acids - for aa_positions_aa in self.single_vars: # getting each single variant, e.g. of [['L215F'], ['A217N']] - for variant in aa_positions_aa: # just unpacking the variant, e.g. ['L215F'] -> 'L215F' - pos_int = int(re.findall(r"\d+", variant)[0]) - if pos_int not in pos_list: - pos_list.append(pos_int) - if self.csvaa: - new_aa = str(variant[-1:]) # new AA from known variant, e.g. 'F' from 'L215F' - if new_aa not in aa_list: - aa_list.append(new_aa) - else: - aa_list = self.amino_acids # new AA can be any of the 20 standard AA's - # Select closest position to single AA positions: - # However, this means that it is more probable that starting with lower substitution - # positions new substitution positions will likely be shifted towards higher positions. - # And for higher substitution positions new substitutions will likely be at lower positions. 
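The closest-position lookup described in the comments above, shown in isolation with hypothetical single-substitution variants:

import random
import re

single_vars = [['L215F'], ['A217N'], ['F290S']]   # hypothetical CSV variants
pos_list = sorted({int(re.findall(r"\d+", v)[0]) for group in single_vars for v in group})

rand_loc = random.randint(210, 300)               # randomly proposed position
closest = min(pos_list, key=lambda p: abs(p - rand_loc))  # snap to nearest known position
print(rand_loc, '->', closest)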
- absolute_difference_function = lambda list_value: abs(list_value - rand_loc) - try: - closest_loc = min(pos_list, key=absolute_difference_function) - except ValueError: - raise ValueError("No positions for recombination found. Likely no single " - "substituted variants were found in provided .csv file.") - rand_loc = closest_loc - 1 # - 1 as position is shifted by one when starting with 0 index - rand_aa = random.choice(aa_list) # find random amino acid to mutate to - seq_list = list(seq) - seq_list[rand_loc] = rand_aa # update sequence to have new amino acid at randomly chosen position - seq_m = ''.join(seq_list) - var = str(rand_loc + 1) + str(rand_aa) - var_seq_list.append((var, seq_m)) # list of tuples - - return var_seq_list - - @staticmethod - def assert_trajectory_sequences(v_traj, s_traj): - """ - Making sure that sequence mutations have been introduced correctly - (for last sequence only). - """ - for i, variant in enumerate(v_traj[1:]): # [1:] as not checking for WT - variant_position = int(re.findall(r"\d+", variant)[0]) - 1 - variant_amino_acid = str(variant[-1]) - assert variant_amino_acid == s_traj[i+1][variant_position] # checking AA of last trajectory sequence - - def in_silico_de(self): - """ - Perform directed evolution by randomly selecting a sequence - position for substitution and randomly choose the amino acid - to substitute to. New sequence gets accepted if meeting the - Metropolis criterion and will be taken for new substitution - iteration. Metropolis-Hastings-driven directed evolution, - similar to Biswas et al.: - Low-N protein engineering with data-efficient deep learning, - see https://github.com/ivanjayapurna/low-n-protein-engineering/tree/master/directed-evo - """ - # iterate through the trial mutation steps for the directed evolution trajectory - # m = 1 (only 1 mutation per step) instead of (np.random.poisson(2) + 1) - v_traj, s_traj, y_traj = [], [], [] - v_traj.append('WT') - y_traj.append(self.y_wt) - s_traj.append(self.s_wt) - accepted = 0 - logger.info(f"Step 0: WT --> {self.y_wt:.3f}") - for iteration in range(self.num_iterations): # num_iterations - self.de_step_counter = iteration - - if accepted == 0: - prior_mutation_location = random.randint(0, len(self.s_wt)) # not really "prior" as first - else: # get prior mutation position - prior_mutation_location = int(re.findall(r"\d+", v_traj[-1])[0]) - prior_y = y_traj[-1] # prior y, always at [-1] - prior_sequence = s_traj[-1] # prior sequence, always at [-1] - - new_var_seq = self.mutate_sequence( - seq=prior_sequence, - prev_mut_loc=prior_mutation_location - ) - - new_variant = new_var_seq[0][0] # int + string char, e.g. '17A' - new_full_variant = str(self.s_wt[int(new_variant[:-1])-1]) + new_variant # full variant name, e.g. 
'F17A' - new_sequence = new_var_seq[0][1] - # encode and predict new sequence fitness - if self.ml_or_hybrid == 'ml': - predictions = predict( # AAidx, OneHot, or DCA-based pure ML prediction - path=self.path, - model=self.model, - encoding=self.encoding, - variants=np.atleast_1d(new_full_variant), - sequences=np.atleast_1d(new_sequence), - no_fft=self.no_fft, - couplings_file=self.dca_encoder - ) - - else: # hybrid modeling and prediction - predictions = predict_directed_evolution( - encoder=self.dca_encoder, - variant=self.s_wt[int(new_variant[:-1]) - 1] + new_variant, - sequence=new_sequence, - hybrid_model_data_pkl=self.model - ) - if predictions != 'skip': - logger.info(f"Step {self.de_step_counter + 1}: " - f"{self.s_wt[int(new_variant[:-1]) - 1]}{new_variant} --> {predictions[0][0]:.3f}") - else: # skip if variant cannot be encoded by DCA-based encoding technique - logger.info(f"Step {self.de_step_counter + 1}: " - f"{self.s_wt[int(new_variant[:-1]) - 1]}{new_variant} --> {predictions}") - continue - new_y, new_var = predictions[0][0], predictions[0][1] # new_var == new_variant nonetheless - # probability function for trial sequence - # The lower the fitness (y) of the new variant, the higher are the chances to get excluded - with warnings.catch_warnings(): # catching Overflow warning - warnings.simplefilter("ignore") - try: - boltz = np.exp(((new_y - prior_y) / self.temp), dtype=np.longfloat) - if self.negative: - boltz = np.exp((-(new_y - prior_y) / self.temp), dtype=np.longfloat) - except OverflowError: - boltz = 1 - p = min(1, boltz) - rand_var = random.random() # random float between 0 and 1 - if rand_var < p: # Metropolis-Hastings update selection criterion, else do nothing (do not accept variant) - v_traj.append(new_var) # update the variant naming trajectory - y_traj.append(new_y) # update the fitness trajectory records - s_traj.append(new_sequence) # update the sequence trajectory records - accepted += 1 - - self.assert_trajectory_sequences(v_traj, s_traj) - - return v_traj, s_traj, y_traj - - def run_de_trajectories(self): - """ - Runs the directed evolution by addressing the in_silico_de - function and plots the evolution trajectories. - """ - v_records = [] # initialize list of sequence variant names - s_records = [] # initialize list of sequence records - y_records = [] # initialize list of fitness score records - # i = counter, iterate through however many mutation trajectories we want to sample - for i in range(self.num_trajectories): - self.traj_counter = i - # call the directed evolution function, outputting the trajectory - # sequence and fitness score records - v_traj, s_traj, y_traj = self.in_silico_de() - v_records.append(v_traj) # update variant naming full mutagenesis trajectory - s_records.append(s_traj) # update the sequence full mutagenesis trajectory - y_records.append(y_traj) # update the fitness full mutagenesis trajectory - - return s_records, v_records, y_records - - def plot_trajectories(self): - """ - Plots evolutionary trajectories and saves steps - in CSV file. 
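For clarity, a stand-alone sketch (assumed helper name, not part of the patch) of the Metropolis acceptance step implemented in in_silico_de above: improving proposals are always kept, while worsening ones survive with probability exp(delta_y / temp), so lower temperatures make the walk greedier.

import math
import random

def accept(new_y: float, prior_y: float, temp: float, negative: bool = False) -> bool:
    """Sketch of the Metropolis-Hastings acceptance criterion."""
    delta = new_y - prior_y
    if negative:  # lower fitness values count as improvements
        delta = -delta
    try:
        boltz = math.exp(delta / temp)
    except OverflowError:  # very large positive delta: always accept
        boltz = 1.0
    return random.random() < min(1.0, boltz)

# E.g., with temp=0.01, accept(1.05, 1.00, 0.01) is practically always True,
# while accept(0.95, 1.00, 0.01) succeeds with probability exp(-5), about 0.7 %.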
- """ - s_records, v_records, y_records = self.run_de_trajectories() - # Idea: Standardizing DCA-HybridModel predictions as just trained by Spearman's rho - # e.g., meaning that fitness values could differ only at the 6th decimal place and only - # predicted fitness ranks matter and not associated fitness values - fig, ax = plt.subplots(figsize=(10,6)) # figsize=(10, 6) - ax.locator_params(integer=True) - y_records_ = [] - for i, fitness_array in enumerate(y_records): - ax.plot(np.arange(1, len(fitness_array) + 1, 1), fitness_array, - '-o', alpha=0.7, markeredgecolor='black', label='EvoTraj' + str(i + 1)) - y_records_.append(fitness_array) - label_x_y_name = [] - traj_max_len = 0 - for i, v_record in enumerate(v_records): # i = 1, 2, 3, .., ; v_record = variant label array - for j, v in enumerate(v_record): # j = 1, 2, 3, ..., ; v = variant name; y_records[i][j] = fitness - if len(v_record) > traj_max_len: - traj_max_len = len(v_record) - if i == 0: # j + 1 -> x-axis position shifted by 1 - label_x_y_name.append(ax.text(j + 1, y_records_[i][j], v, size=7)) - else: - if v != 'WT': # only plot 'WT' name once at i == 0 - label_x_y_name.append(ax.text(j + 1, y_records_[i][j], v, size=7)) - adjust_text(label_x_y_name, only_move={'points': 'y', 'text': 'y'}, force_points=0.6) - ax.legend() - plt.xticks(np.arange(1, traj_max_len + 1, 1), np.arange(1, traj_max_len + 1, 1)) - - plt.ylabel('Predicted fitness') - plt.xlabel('Mutation trial steps') - plt.tight_layout() - plt.savefig(str(self.model) + '_DE_trajectories.png', dpi=500) - plt.clf() - - with open(os.path.join('EvoTraj', 'Trajectories.csv'), 'w') as file: - file.write('Trajectory;Variant;Sequence;Fitness\n') - for i in range(self.num_trajectories): - v_records_str = str(v_records[i])[1:-1].replace("'", "") - s_records_str = str(s_records[i])[1:-1].replace("'", "") - y_records_str = str(y_records[i])[1:-1] - file.write(f'{i+1};{v_records_str};{s_records_str};{y_records_str}\n') +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +""" +Modules for performing random evolution walks +similar as presented by Biswas et al. 
+""" + + +from __future__ import annotations +import os +import re +import random + +import matplotlib.pyplot as plt +import numpy as np +import warnings +from adjustText import adjust_text +import logging +logger = logging.getLogger('pypef.utils.directed_evolution') + +from pypef.ml.regression import predict +from pypef.dca.hybrid_model import predict_directed_evolution + +# ignoring warnings of scikit-learn regression +warnings.filterwarnings(action='ignore', category=RuntimeWarning, module='sklearn') +warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn') + + +class DirectedEvolution: + # Class attributes (None) + def __init__( # Instance attributes + self, + ml_or_hybrid: str, + encoding: str, + s_wt: str, + y_wt: float, + single_vars: list, + num_iterations: int, + num_trajectories: int, + amino_acids: list, + temp: float, + path: str, + model: str = None, + no_fft: bool = False, + dca_encoder: str | None = None, + usecsv: bool = False, + csvaa: bool = False, + negative: bool = False + ): + """ + Runs in silico directed evolution and plots and writes trajectories. + + Parameters + ---------- + ml_or_hybrid: str + 'ml' or 'hybrid' + encoding: str + 'aaidx' or 'dca' + s_wt: str + WT sequence, s_wt = get_wt_sequence(arguments['--wt']) + y_wt: float + WT fitness, y_wt = arguments['--y_wt'] + single_vars: list + single substituted protein variants; used for recombination + of variants. Obtained from the CSV file with get_variants: + single_variants, single_values, higher_variants, higher_values = \ + get_variants(df, amino_acids, s_wt) + num_iterations: int + Number of tried steps in the evolution process + num_trajectories: int + Number of independent evolutionary trajectories + amino_acids: list + Usually the 20 standard amino acids + temp: float + (Boltzmann) 'Temperature' of the Metropolis-Hastings algorithm for + accepting new trajectory variants + path: str + Just current working directory (os.getcwd()) + model: str + Loaded Pickle file for regression/hybrid modeling. + no_fft: bool + If True, not using FFT for AAindex-based encoding + dca_encoder = None or PLMC object + dca_encoder = PLMC( + params_file=arguments['--plmc_params'], + separator=arguments['--sep'] + ) + usecsv: bool + Using only CSV variants for recombination (but all 20 amino acids) + csvaa: bool + Using only CSV variants for recombination (but all amino acids that + are present CSV) + negative: bool + More negative variants define improved variants + """ + self.ml_or_hybrid = ml_or_hybrid + self.encoding = encoding + self.s_wt = s_wt + self.y_wt = y_wt + self.single_vars = single_vars + self.num_iterations = num_iterations + self.num_trajectories = num_trajectories + self.amino_acids = amino_acids + self.temp = temp + self.path = path + self.model = model + self.no_fft = no_fft # for AAidx only + self.dca_encoder = dca_encoder + self.usecsv = usecsv + self.csvaa = csvaa + self.negative = negative + self.de_step_counter = 0 # DE steps + self.traj_counter = 0 # Trajectory counter + + def mutate_sequence( + self, + seq: str, + prev_mut_loc: int + ): + """ + Parameters + ---------- + seq: str, + Initial sequence to be mutated, must not be WT Seq but can + also itself be already substituted (iterative sequence substitutions) + prev_mut_loc: int + Previous position mutated, new position will be randomly chosen within + a range, by default: new_pos = previous_pos +- 8 + + Produces a mutant sequence (integer representation), given an initial sequence + and the previous position of mutation. 
+ + """ + try: + os.mkdir('EvoTraj') + except FileExistsError: + pass + + var_seq_list = [] + rand_loc = random.randint(prev_mut_loc - 8, prev_mut_loc + 8) # find random position to mutate + while (rand_loc <= 0) or (rand_loc >= len(seq)): + rand_loc = random.randint(prev_mut_loc - 8, prev_mut_loc + 8) + aa_list = self.amino_acids + if self.usecsv: # Only perform directed evolution on positional csv variant data, + pos_list = [] # else: aa_list = amino_acids + aa_list = [] # overwrite aa_list = self.amino_acids + for aa_positions_aa in self.single_vars: # getting each single variant, e.g. of [['L215F'], ['A217N']] + for variant in aa_positions_aa: # just unpacking the variant, e.g. ['L215F'] -> 'L215F' + pos_int = int(re.findall(r"\d+", variant)[0]) + if pos_int not in pos_list: + pos_list.append(pos_int) + if self.csvaa: + new_aa = str(variant[-1:]) # new AA from known variant, e.g. 'F' from 'L215F' + if new_aa not in aa_list: + aa_list.append(new_aa) + else: + aa_list = self.amino_acids # new AA can be any of the 20 standard AA's + # Select closest position to single AA positions: + # However, this means that it is more probable that starting with lower substitution + # positions new substitution positions will likely be shifted towards higher positions. + # And for higher substitution positions new substitutions will likely be at lower positions. + absolute_difference_function = lambda list_value: abs(list_value - rand_loc) + try: + closest_loc = min(pos_list, key=absolute_difference_function) + except ValueError: + raise ValueError("No positions for recombination found. Likely no single " + "substituted variants were found in provided .csv file.") + rand_loc = closest_loc - 1 # - 1 as position is shifted by one when starting with 0 index + rand_aa = random.choice(aa_list) # find random amino acid to mutate to + seq_list = list(seq) + seq_list[rand_loc] = rand_aa # update sequence to have new amino acid at randomly chosen position + seq_m = ''.join(seq_list) + var = str(rand_loc + 1) + str(rand_aa) + var_seq_list.append((var, seq_m)) # list of tuples + + return var_seq_list + + @staticmethod + def assert_trajectory_sequences(v_traj, s_traj): + """ + Making sure that sequence mutations have been introduced correctly + (for last sequence only). + """ + for i, variant in enumerate(v_traj[1:]): # [1:] as not checking for WT + variant_position = int(re.findall(r"\d+", variant)[0]) - 1 + variant_amino_acid = str(variant[-1]) + assert variant_amino_acid == s_traj[i+1][variant_position] # checking AA of last trajectory sequence + + def in_silico_de(self): + """ + Perform directed evolution by randomly selecting a sequence + position for substitution and randomly choose the amino acid + to substitute to. New sequence gets accepted if meeting the + Metropolis criterion and will be taken for new substitution + iteration. 
Metropolis-Hastings-driven directed evolution, + similar to Biswas et al.: + Low-N protein engineering with data-efficient deep learning, + see https://github.com/ivanjayapurna/low-n-protein-engineering/tree/master/directed-evo + """ + # iterate through the trial mutation steps for the directed evolution trajectory + # m = 1 (only 1 mutation per step) instead of (np.random.poisson(2) + 1) + v_traj, s_traj, y_traj = [], [], [] + v_traj.append('WT') + y_traj.append(self.y_wt) + s_traj.append(self.s_wt) + accepted = 0 + logger.info(f"Step 0: WT --> {self.y_wt:.3f}") + for iteration in range(self.num_iterations): # num_iterations + self.de_step_counter = iteration + + if accepted == 0: + prior_mutation_location = random.randint(0, len(self.s_wt)) # not really "prior" as first + else: # get prior mutation position + prior_mutation_location = int(re.findall(r"\d+", v_traj[-1])[0]) + prior_y = y_traj[-1] # prior y, always at [-1] + prior_sequence = s_traj[-1] # prior sequence, always at [-1] + + new_var_seq = self.mutate_sequence( + seq=prior_sequence, + prev_mut_loc=prior_mutation_location + ) + + new_variant = new_var_seq[0][0] # int + string char, e.g. '17A' + new_full_variant = str(self.s_wt[int(new_variant[:-1])-1]) + new_variant # full variant name, e.g. 'F17A' + new_sequence = new_var_seq[0][1] + # encode and predict new sequence fitness + if self.ml_or_hybrid == 'ml': + predictions = predict( # AAidx, OneHot, or DCA-based pure ML prediction + path=self.path, + model=self.model, + encoding=self.encoding, + variants=np.atleast_1d(new_full_variant), + sequences=np.atleast_1d(new_sequence), + no_fft=self.no_fft, + couplings_file=self.dca_encoder + ) + + else: # hybrid modeling and prediction + predictions = predict_directed_evolution( + encoder=self.dca_encoder, + variant=self.s_wt[int(new_variant[:-1]) - 1] + new_variant, + sequence=new_sequence, + hybrid_model_data_pkl=self.model + ) + if predictions != 'skip': + logger.info(f"Step {self.de_step_counter + 1}: " + f"{self.s_wt[int(new_variant[:-1]) - 1]}{new_variant} --> {predictions[0][0]:.3f}") + else: # skip if variant cannot be encoded by DCA-based encoding technique + logger.info(f"Step {self.de_step_counter + 1}: " + f"{self.s_wt[int(new_variant[:-1]) - 1]}{new_variant} --> {predictions}") + continue + new_y, new_var = predictions[0][0], predictions[0][1] # new_var == new_variant nonetheless + # probability function for trial sequence + # The lower the fitness (y) of the new variant, the higher are the chances to get excluded + with warnings.catch_warnings(): # catching Overflow warning + warnings.simplefilter("ignore") + try: + boltz = np.exp(((new_y - prior_y) / self.temp), dtype=np.longfloat) + if self.negative: + boltz = np.exp((-(new_y - prior_y) / self.temp), dtype=np.longfloat) + except OverflowError: + boltz = 1 + p = min(1, boltz) + rand_var = random.random() # random float between 0 and 1 + if rand_var < p: # Metropolis-Hastings update selection criterion, else do nothing (do not accept variant) + v_traj.append(new_var) # update the variant naming trajectory + y_traj.append(new_y) # update the fitness trajectory records + s_traj.append(new_sequence) # update the sequence trajectory records + accepted += 1 + + self.assert_trajectory_sequences(v_traj, s_traj) + + return v_traj, s_traj, y_traj + + def run_de_trajectories(self): + """ + Runs the directed evolution by addressing the in_silico_de + function and plots the evolution trajectories. 
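Condensed, the accept/reject loop that in_silico_de and run_de_trajectories implement looks roughly as follows (illustrative only; predict_fn stands in for the pure-ML or hybrid predictor, and sketch_mutate/accept are the sketches shown earlier):

import random
import re

def sketch_trajectory(s_wt: str, y_wt: float, predict_fn, num_iterations: int, temp: float):
    """Sketch of one Metropolis-Hastings directed-evolution trajectory."""
    v_traj, s_traj, y_traj = ['WT'], [s_wt], [y_wt]
    prev_loc = random.randint(0, len(s_wt))  # random start position
    for _ in range(num_iterations):
        var, seq = sketch_mutate(s_traj[-1], prev_loc)  # propose a substitution
        y = predict_fn(seq)                             # model-predicted fitness
        if accept(y, y_traj[-1], temp):                 # Metropolis criterion
            v_traj.append(var)
            s_traj.append(seq)
            y_traj.append(y)
            prev_loc = int(re.findall(r'\d+', var)[0])  # next window centers here
    return v_traj, s_traj, y_traj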
+ """ + v_records = [] # initialize list of sequence variant names + s_records = [] # initialize list of sequence records + y_records = [] # initialize list of fitness score records + # i = counter, iterate through however many mutation trajectories we want to sample + for i in range(self.num_trajectories): + self.traj_counter = i + # call the directed evolution function, outputting the trajectory + # sequence and fitness score records + v_traj, s_traj, y_traj = self.in_silico_de() + v_records.append(v_traj) # update variant naming full mutagenesis trajectory + s_records.append(s_traj) # update the sequence full mutagenesis trajectory + y_records.append(y_traj) # update the fitness full mutagenesis trajectory + + return s_records, v_records, y_records + + def plot_trajectories(self): + """ + Plots evolutionary trajectories and saves steps + in CSV file. + """ + s_records, v_records, y_records = self.run_de_trajectories() + # Idea: Standardizing DCA-HybridModel predictions as just trained by Spearman's rho + # e.g., meaning that fitness values could differ only at the 6th decimal place and only + # predicted fitness ranks matter and not associated fitness values + fig, ax = plt.subplots(figsize=(10,6)) # figsize=(10, 6) + ax.locator_params(integer=True) + y_records_ = [] + for i, fitness_array in enumerate(y_records): + ax.plot(np.arange(1, len(fitness_array) + 1, 1), fitness_array, + '-o', alpha=0.7, markeredgecolor='black', label='EvoTraj' + str(i + 1)) + y_records_.append(fitness_array) + label_x_y_name = [] + traj_max_len = 0 + for i, v_record in enumerate(v_records): # i = 1, 2, 3, .., ; v_record = variant label array + for j, v in enumerate(v_record): # j = 1, 2, 3, ..., ; v = variant name; y_records[i][j] = fitness + if len(v_record) > traj_max_len: + traj_max_len = len(v_record) + if i == 0: # j + 1 -> x-axis position shifted by 1 + label_x_y_name.append(ax.text(j + 1, y_records_[i][j], v, size=7)) + else: + if v != 'WT': # only plot 'WT' name once at i == 0 + label_x_y_name.append(ax.text(j + 1, y_records_[i][j], v, size=7)) + adjust_text(label_x_y_name, only_move={'points': 'y', 'text': 'y'}, force_points=0.6) + ax.legend() + plt.xticks(np.arange(1, traj_max_len + 1, 1), np.arange(1, traj_max_len + 1, 1)) + + plt.ylabel('Predicted fitness') + plt.xlabel('Mutation trial steps') + plt.tight_layout() + plt.savefig(str(self.model) + '_DE_trajectories.png', dpi=500) + plt.clf() + + with open(os.path.join('EvoTraj', 'Trajectories.csv'), 'w') as file: + file.write('Trajectory;Variant;Sequence;Fitness\n') + for i in range(self.num_trajectories): + v_records_str = str(v_records[i])[1:-1].replace("'", "") + s_records_str = str(s_records[i])[1:-1].replace("'", "") + y_records_str = str(y_records[i])[1:-1] + file.write(f'{i+1};{v_records_str};{s_records_str};{y_records_str}\n') diff --git a/pypef/utils/learning_test_sets.py b/pypef/utils/learning_test_sets.py index 0549025..017bdb8 100644 --- a/pypef/utils/learning_test_sets.py +++ b/pypef/utils/learning_test_sets.py @@ -1,404 +1,404 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information 
and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -""" -Modules for creating training and test sets -from input CSV file (with value separators sep=',' or sep=';') -having the CSV-format - -HEADER_VARIANTS;HEADER_VARIANTS_FITNESS -VARIANT_1;FITNESS_VALUE_1 -VARIANT_2;FITNESS_VALUE_2 -... - -according to the self devised training and test set convention -> VARIANT_NAME_1 -; FITNESS_1 -VARIANT_SEQUENCE_1 -> VARIANT_NAME_2 -; FITNESS_2 -VARIANT_SEQUENCE_2 -... -""" - -import logging -logger = logging.getLogger('pypef.utils.learning_test_sets') - -import numpy as np -import random -import pandas as pd -import re - - -def csv_input(csv_file): - """ - Gets input data from defined .csv file (that contains variant names and fitness labels) - """ - if csv_file is None: - raise FileNotFoundError( - f'Did not find (specified) csv file! ' - f'Used csv input file instead: {str(csv_file)}.' - ) - return csv_file - - -def drop_rows( - csv_file, - amino_acids, - threshold_drop, - csv_sep: str = ';', - mutation_sep: str = '/' -): - """ - Drops rows from .csv data if below defined fitness threshold or if - amino acid/variant name is unknown or if fitness label is not a digit. - """ - separator = ';' - try: - df_raw = pd.read_csv(csv_file, sep=separator, usecols=[0, 1]) - except ValueError: - separator = ',' - df_raw = pd.read_csv(csv_file, sep=separator, usecols=[0, 1]) - except FileNotFoundError: - raise FileNotFoundError( - f"Specify the input CSV file containing the variant-fitness data. " - f"Required CSV format: variant{csv_sep}fitness.") - - label = df_raw.iloc[:, 1] - sequence = df_raw.iloc[:, 0] - - dropping_rows = [] - - for i, row in enumerate(label): - try: - row = float(row) - if row < threshold_drop: - dropping_rows.append(i) - except ValueError: - dropping_rows.append(i) - - for i, variant in enumerate(sequence): - try: - if mutation_sep in variant: - m = re.split(rf'{mutation_sep}', variant) - for a, splits in enumerate(m): - if splits[0].isdigit() and variant[-1] in amino_acids: - continue - elif splits[0] not in amino_acids or splits[-1] not in amino_acids: - if i not in dropping_rows: - dropping_rows.append(i) - else: - if ',' in variant or ';' in variant or '\t' in variant: - raise SystemError("Found invalid characters (';', ',', or tabulator) in variants. " - "Check the --mutation_sep flag and try specifying it, e.g. --mutation_sep \',\'.") - if variant[0].isdigit() and variant[-1] in amino_acids: - continue - elif variant not in ['wt', 'wild_type']: - if variant[0] not in amino_acids or variant[-1] not in amino_acids: - dropping_rows.append(i) - - except TypeError: - raise TypeError('You might consider checking the input .csv for empty first two columns,' - ' e.g. in the last row.') - - logger.info(f'No. of dropped rows: {len(dropping_rows)}. 
' - f'Total given variants (if provided plus WT): {len(df_raw)}') - - df = df_raw.drop(dropping_rows) - df.dropna(inplace=True) - df.reset_index(drop=True, inplace=True) - - return df - - -def get_variants( - df, - amino_acids, - wild_type_sequence, - mutation_sep: str = '/' -): - """ - Gets variants and divides and counts the variant data for single substituted - and higher substituted variants. Raises NameError if variant naming is not - matching the given wild-type sequence, e.g. if variant A17C would define - a substitution at residue Ala-17 to Cys but the wild-type sequence has no Ala - at position 17. - """ - x = df.iloc[:, 0] - y = df.iloc[:, 1] - wt_position = None - single_variants, higher_variants, index_higher, index_lower, \ - higher_values, single_values = [], [], [], [], [], [] - single, double, triple, quadruple, quintuple, sextuple, septuple,\ - octuple, nonuple, decuple, higher = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - for i, variant in enumerate(x): - if mutation_sep in variant: - count = variant.count(mutation_sep) - if count == 1: - double += 1 - elif count == 2: - triple += 1 - elif count == 3: - quadruple += 1 - elif count == 4: - quintuple += 1 - elif count == 5: - sextuple += 1 - elif count == 6: - septuple += 1 - elif count == 7: - octuple += 1 - elif count == 8: - nonuple += 1 - elif count == 9: - decuple += 1 - else: - higher += 1 - m = re.split(rf'{mutation_sep}', variant) - for a, splits in enumerate(m): - if splits[0].isdigit() or splits[0] in amino_acids and splits[-1] in amino_acids: - new = int(re.findall(r'\d+', splits)[0]) - if splits[0] in amino_acids: - if splits[0] != wild_type_sequence[new - 1]: - raise NameError( - 'Position of amino acids in given sequence does not match the given ' - 'positions in the input data! E.g. see position {} and position {} being {} ' - 'in the given sequence'.format(variant, new, wild_type_sequence[new - 1]) - ) - higher_var = wild_type_sequence[new - 1] + str(new) + str(splits[-1]) - m[a] = higher_var - if a == len(m) - 1: - higher_variants.append(m) - if i not in index_higher: - index_higher.append(i) - else: - single += 1 - if variant.upper() == 'WT' or variant.upper() == 'WILD_TYPE': - wt_position = i - - continue - - elif variant[0].isdigit() or variant[0] in amino_acids and variant[-1] in amino_acids: - try: - num = int(re.findall(r'\d+', variant)[0]) - except IndexError: - raise IndexError('Wrong input format. Please check if the input CSV corresponds to the ' - 'required input style (while the wild-type protein must be designated as \'WT\').') - if variant[0] in amino_acids: - try: - if variant[0] != wild_type_sequence[num - 1]: - raise NameError('Position of amino acids in given sequence does not match the given ' - 'positions in the input data! E.g. 
see position {} and position {} being {}' - ' in the given sequence.'.format(variant, num, wild_type_sequence[num - 1])) - except IndexError: - raise IndexError("Found variant sequence position {} in data which " - "is out of range of wild-type sequence length.".format(str(num))) - try: - full_variant = wild_type_sequence[num - 1] + str(num) + str(variant[-1]) - except IndexError: - raise IndexError("Found variant sequence position {} in data which " - "is out of range of wild-type sequence length.".format(str(num))) - single_variants.append([full_variant]) - if i not in index_lower: - index_lower.append(i) - logger.info( - '\nSingle (for mklsts if provided plus WT): {}\nDouble: {}\nTriple: {}\nQuadruple: {}\nQuintuple: {}\n' - 'Sextuple: {}\nSeptuple: {}\nOctuple: {}\nNonuple: {}\nDecuple: {}\nHigher (>Decuple): {}'.format( - single, double, triple, quadruple, quintuple, sextuple, septuple, octuple, nonuple, decuple, higher - ) - ) - for vals in y[index_higher]: - higher_values.append(vals) - for vals in y[index_lower]: - single_values.append(vals) - if wt_position is not None: - single_variants.append(['WT']) - single_values.append(y[wt_position]) - - single_variants, single_values = tuple(single_variants), tuple(single_values) - higher_variants, higher_values = tuple(higher_variants), tuple(higher_values) - - return single_variants, single_values, higher_variants, higher_values - - -def make_sub_ls_ts( - single_variants, - single_values, - higher_variants, - higher_values, - directed_evolution=False): - """ - Creates learning and test sets, fills learning set with single substituted variants and splits - rest (higher substituted) for learning and test sets: 3/4 to LS and 1/4 to TS - """ - logger.info(f'No. of single substituted variants (if provided plus WT): {len(single_variants)}.' - f'No. of values: {len(single_values)}.') - logger.info(f'No. of higher substituted variants: {len(higher_variants)}. ' - f'No. of values: {len(higher_values)}.') - - if len(single_values) != len(single_variants): - logger.info(f'Error due to different lengths for given variants and label! ' - 'No. of single substituted variants: {len(single_variants)}. ' - 'Number of given values: {len(single_values)}.') - - if len(higher_values) != len(higher_variants): - logger.info(f'Error due to different lengths for given variants and label! ' - f'No. of higher subst. variants: {len(higher_variants)}. ' - f'Number of given values: {len(higher_values)}.') - - # 1. 
CREATION OF LS AND TS SPLIT FOR SINGLE FOR LS AND HIGHER VARIANTS FOR TS - all_variants = single_variants + higher_variants - all_values = single_values + higher_values - sub_ts = [] # Substitutions of TS - values_ts = [] # Values of TS - sub_ls = [] - values_ls = [] - - if directed_evolution is False: - if len(higher_variants) != 0: - sub_ls = list(single_variants) # Substitutions of LS - values_ls = list(single_values) # Values of LS - for i in range(len(higher_variants)): - if len(higher_variants) < 6: # if less than 6 higher variants --> all higher variants to TS - sub_ts.append(higher_variants[i]) - values_ts.append(higher_values[i]) - elif (i % 3) == 0 and i != 0: # 1/4 of higher variants to TS, 3/4 to LS - sub_ts.append(higher_variants[i]) - values_ts.append(higher_values[i]) - else: # 3/4 to LS - sub_ls.append(higher_variants[i]) - values_ls.append(higher_values[i]) - else: # if no higher substituted variants are available split 80%/20% - random_nums = [] - range_list = np.arange(0, len(all_variants)) - while len(sub_ls) < len(all_variants) * 4 // 5: # 80 % Learning Set - random_num = random.choice(range_list) - if random_num not in random_nums: - random_nums.append(random_num) - sub_ls.append(all_variants[random_num]) - values_ls.append(all_values[random_num]) - else: # 20 % Test Set - for num in range_list: - if num not in random_nums: - sub_ts.append(all_variants[num]) - values_ts.append(all_values[num]) - - return sub_ls, values_ls, sub_ts, values_ts - - -def make_sub_ls_ts_randomly( - single_variants, - single_values, - higher_variants, - higher_values -): - """ - Creation of learning set and test set by randomly splitting sets - """ - length = len(single_variants) + len(higher_variants) - range_list = np.arange(0, length) - - ts = [] - ls = [] - while len(ls) < length * 4 // 5: - random_num = random.choice(range_list) - if random_num not in ls: - ls.append(random_num) - - for j in range_list: - if j not in ls: - ts.append(j) - - combined = single_variants + higher_variants # substitutions - combined2 = single_values + higher_values # values - - sub_ls = [] - values_ls = [] - tot_sub_ls, tot_values_ls = [], [] - tot_sub_ts, tot_values_ts = [], [] - - for i in ls: - sub_ls.append(combined[i]) - values_ls.append(combined2[i]) - - sub_ts = [] - values_ts = [] - for j in ts: - sub_ts.append(combined[j]) - values_ts.append(combined2[j]) - - for subs in sub_ls: - for subs2 in sub_ts: - if subs == subs2: - logger.warning(f'\n LS and TS overlap for: {subs} - ' - f'You might want to consider checking the provided ' - f'datasets for multiple entries') - - tot_sub_ls.append(sub_ls) - tot_values_ls.append(values_ls) - tot_sub_ts.append(sub_ts) - tot_values_ts.append(values_ts) - - return tot_sub_ls[0], tot_values_ls[0], tot_sub_ts[0], tot_values_ts[0] - - -def make_fasta_ls_ts( - filename, - wt_seq, - substitutions, - fitness_values -): - """ - Creates learning and test sets (.fasta style-like files with fitness values - indicated by starting semicolon ';') - - filename: str - String for defining the filename for the learning and test set "fasta-like" files. - wt: str - Wild-type sequence as string - substitutions: list - List of substitutions of a single variant of the format: - - Single substitution variant, e.g. variant A123C: ['A123C'] - - Higher variants, e.g. variant A123C/D234E/F345G: ['A123C', 'D234E, 'F345G'] - --> Full substitutions list, e.g.: [['A123C'], ['A123C', 'D234E, 'F345G']] - fitness_values: list - List of ints/floats of the variant fitness values, e.g. 
for two variants: [1.4, 0.8] - """ - myfile = open(filename, 'w') - for i, var in enumerate(substitutions): # var are lists of (single or multiple) substitutions - temp = list(wt_seq) - name = '' - separation = 0 - if var == ['WT']: - name = 'WT' - else: - for single_var in var: # single entries of substitution list - position_index = int(str(single_var)[1:-1]) - 1 - new_amino_acid = str(single_var)[-1] - temp[position_index] = new_amino_acid - # checking if multiple entries are inside list - if separation == 0: - name += single_var - else: - name += '/' + single_var - separation += 1 - print(f'>{name}', file=myfile) - print(f';{fitness_values[i]}', file=myfile) - print(''.join(temp), file=myfile) - # print(name+';'+str(val[i])+';'+''.join(temp), file=myfile) # uncomment output: CSV format - myfile.close() +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +""" +Modules for creating training and test sets +from input CSV file (with value separators sep=',' or sep=';') +having the CSV-format + +HEADER_VARIANTS;HEADER_VARIANTS_FITNESS +VARIANT_1;FITNESS_VALUE_1 +VARIANT_2;FITNESS_VALUE_2 +... + +according to the self devised training and test set convention +> VARIANT_NAME_1 +; FITNESS_1 +VARIANT_SEQUENCE_1 +> VARIANT_NAME_2 +; FITNESS_2 +VARIANT_SEQUENCE_2 +... +""" + +import logging +logger = logging.getLogger('pypef.utils.learning_test_sets') + +import numpy as np +import random +import pandas as pd +import re + + +def csv_input(csv_file): + """ + Gets input data from defined .csv file (that contains variant names and fitness labels) + """ + if csv_file is None: + raise FileNotFoundError( + f'Did not find (specified) csv file! ' + f'Used csv input file instead: {str(csv_file)}.' + ) + return csv_file + + +def drop_rows( + csv_file, + amino_acids, + threshold_drop, + csv_sep: str = ';', + mutation_sep: str = '/' +): + """ + Drops rows from .csv data if below defined fitness threshold or if + amino acid/variant name is unknown or if fitness label is not a digit. + """ + separator = ';' + try: + df_raw = pd.read_csv(csv_file, sep=separator, usecols=[0, 1]) + except ValueError: + separator = ',' + df_raw = pd.read_csv(csv_file, sep=separator, usecols=[0, 1]) + except FileNotFoundError: + raise FileNotFoundError( + f"Specify the input CSV file containing the variant-fitness data. 
" + f"Required CSV format: variant{csv_sep}fitness.") + + label = df_raw.iloc[:, 1] + sequence = df_raw.iloc[:, 0] + + dropping_rows = [] + + for i, row in enumerate(label): + try: + row = float(row) + if row < threshold_drop: + dropping_rows.append(i) + except ValueError: + dropping_rows.append(i) + + for i, variant in enumerate(sequence): + try: + if mutation_sep in variant: + m = re.split(rf'{mutation_sep}', variant) + for a, splits in enumerate(m): + if splits[0].isdigit() and variant[-1] in amino_acids: + continue + elif splits[0] not in amino_acids or splits[-1] not in amino_acids: + if i not in dropping_rows: + dropping_rows.append(i) + else: + if ',' in variant or ';' in variant or '\t' in variant: + raise SystemError("Found invalid characters (';', ',', or tabulator) in variants. " + "Check the --mutation_sep flag and try specifying it, e.g. --mutation_sep \',\'.") + if variant[0].isdigit() and variant[-1] in amino_acids: + continue + elif variant not in ['wt', 'wild_type']: + if variant[0] not in amino_acids or variant[-1] not in amino_acids: + dropping_rows.append(i) + + except TypeError: + raise TypeError('You might consider checking the input .csv for empty first two columns,' + ' e.g. in the last row.') + + logger.info(f'No. of dropped rows: {len(dropping_rows)}. ' + f'Total given variants (if provided plus WT): {len(df_raw)}') + + df = df_raw.drop(dropping_rows) + df.dropna(inplace=True) + df.reset_index(drop=True, inplace=True) + + return df + + +def get_variants( + df, + amino_acids, + wild_type_sequence, + mutation_sep: str = '/' +): + """ + Gets variants and divides and counts the variant data for single substituted + and higher substituted variants. Raises NameError if variant naming is not + matching the given wild-type sequence, e.g. if variant A17C would define + a substitution at residue Ala-17 to Cys but the wild-type sequence has no Ala + at position 17. + """ + x = df.iloc[:, 0] + y = df.iloc[:, 1] + wt_position = None + single_variants, higher_variants, index_higher, index_lower, \ + higher_values, single_values = [], [], [], [], [], [] + single, double, triple, quadruple, quintuple, sextuple, septuple,\ + octuple, nonuple, decuple, higher = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + for i, variant in enumerate(x): + if mutation_sep in variant: + count = variant.count(mutation_sep) + if count == 1: + double += 1 + elif count == 2: + triple += 1 + elif count == 3: + quadruple += 1 + elif count == 4: + quintuple += 1 + elif count == 5: + sextuple += 1 + elif count == 6: + septuple += 1 + elif count == 7: + octuple += 1 + elif count == 8: + nonuple += 1 + elif count == 9: + decuple += 1 + else: + higher += 1 + m = re.split(rf'{mutation_sep}', variant) + for a, splits in enumerate(m): + if splits[0].isdigit() or splits[0] in amino_acids and splits[-1] in amino_acids: + new = int(re.findall(r'\d+', splits)[0]) + if splits[0] in amino_acids: + if splits[0] != wild_type_sequence[new - 1]: + raise NameError( + 'Position of amino acids in given sequence does not match the given ' + 'positions in the input data! E.g. 
see position {} and position {} being {} ' + 'in the given sequence'.format(variant, new, wild_type_sequence[new - 1]) + ) + higher_var = wild_type_sequence[new - 1] + str(new) + str(splits[-1]) + m[a] = higher_var + if a == len(m) - 1: + higher_variants.append(m) + if i not in index_higher: + index_higher.append(i) + else: + single += 1 + if variant.upper() == 'WT' or variant.upper() == 'WILD_TYPE': + wt_position = i + + continue + + elif variant[0].isdigit() or variant[0] in amino_acids and variant[-1] in amino_acids: + try: + num = int(re.findall(r'\d+', variant)[0]) + except IndexError: + raise IndexError('Wrong input format. Please check if the input CSV corresponds to the ' + 'required input style (while the wild-type protein must be designated as \'WT\').') + if variant[0] in amino_acids: + try: + if variant[0] != wild_type_sequence[num - 1]: + raise NameError('Position of amino acids in given sequence does not match the given ' + 'positions in the input data! E.g. see position {} and position {} being {}' + ' in the given sequence.'.format(variant, num, wild_type_sequence[num - 1])) + except IndexError: + raise IndexError("Found variant sequence position {} in data which " + "is out of range of wild-type sequence length.".format(str(num))) + try: + full_variant = wild_type_sequence[num - 1] + str(num) + str(variant[-1]) + except IndexError: + raise IndexError("Found variant sequence position {} in data which " + "is out of range of wild-type sequence length.".format(str(num))) + single_variants.append([full_variant]) + if i not in index_lower: + index_lower.append(i) + logger.info( + '\nSingle (for mklsts if provided plus WT): {}\nDouble: {}\nTriple: {}\nQuadruple: {}\nQuintuple: {}\n' + 'Sextuple: {}\nSeptuple: {}\nOctuple: {}\nNonuple: {}\nDecuple: {}\nHigher (>Decuple): {}'.format( + single, double, triple, quadruple, quintuple, sextuple, septuple, octuple, nonuple, decuple, higher + ) + ) + for vals in y[index_higher]: + higher_values.append(vals) + for vals in y[index_lower]: + single_values.append(vals) + if wt_position is not None: + single_variants.append(['WT']) + single_values.append(y[wt_position]) + + single_variants, single_values = tuple(single_variants), tuple(single_values) + higher_variants, higher_values = tuple(higher_variants), tuple(higher_values) + + return single_variants, single_values, higher_variants, higher_values + + +def make_sub_ls_ts( + single_variants, + single_values, + higher_variants, + higher_values, + directed_evolution=False): + """ + Creates learning and test sets, fills learning set with single substituted variants and splits + rest (higher substituted) for learning and test sets: 3/4 to LS and 1/4 to TS + """ + logger.info(f'No. of single substituted variants (if provided plus WT): {len(single_variants)}.' + f'No. of values: {len(single_values)}.') + logger.info(f'No. of higher substituted variants: {len(higher_variants)}. ' + f'No. of values: {len(higher_values)}.') + + if len(single_values) != len(single_variants): + logger.info(f'Error due to different lengths for given variants and label! ' + 'No. of single substituted variants: {len(single_variants)}. ' + 'Number of given values: {len(single_values)}.') + + if len(higher_values) != len(higher_variants): + logger.info(f'Error due to different lengths for given variants and label! ' + f'No. of higher subst. variants: {len(higher_variants)}. ' + f'Number of given values: {len(higher_values)}.') + + # 1. 
CREATION OF LS AND TS SPLIT FOR SINGLE FOR LS AND HIGHER VARIANTS FOR TS + all_variants = single_variants + higher_variants + all_values = single_values + higher_values + sub_ts = [] # Substitutions of TS + values_ts = [] # Values of TS + sub_ls = [] + values_ls = [] + + if directed_evolution is False: + if len(higher_variants) != 0: + sub_ls = list(single_variants) # Substitutions of LS + values_ls = list(single_values) # Values of LS + for i in range(len(higher_variants)): + if len(higher_variants) < 6: # if less than 6 higher variants --> all higher variants to TS + sub_ts.append(higher_variants[i]) + values_ts.append(higher_values[i]) + elif (i % 3) == 0 and i != 0: # 1/4 of higher variants to TS, 3/4 to LS + sub_ts.append(higher_variants[i]) + values_ts.append(higher_values[i]) + else: # 3/4 to LS + sub_ls.append(higher_variants[i]) + values_ls.append(higher_values[i]) + else: # if no higher substituted variants are available split 80%/20% + random_nums = [] + range_list = np.arange(0, len(all_variants)) + while len(sub_ls) < len(all_variants) * 4 // 5: # 80 % Learning Set + random_num = random.choice(range_list) + if random_num not in random_nums: + random_nums.append(random_num) + sub_ls.append(all_variants[random_num]) + values_ls.append(all_values[random_num]) + else: # 20 % Test Set + for num in range_list: + if num not in random_nums: + sub_ts.append(all_variants[num]) + values_ts.append(all_values[num]) + + return sub_ls, values_ls, sub_ts, values_ts + + +def make_sub_ls_ts_randomly( + single_variants, + single_values, + higher_variants, + higher_values +): + """ + Creation of learning set and test set by randomly splitting sets + """ + length = len(single_variants) + len(higher_variants) + range_list = np.arange(0, length) + + ts = [] + ls = [] + while len(ls) < length * 4 // 5: + random_num = random.choice(range_list) + if random_num not in ls: + ls.append(random_num) + + for j in range_list: + if j not in ls: + ts.append(j) + + combined = single_variants + higher_variants # substitutions + combined2 = single_values + higher_values # values + + sub_ls = [] + values_ls = [] + tot_sub_ls, tot_values_ls = [], [] + tot_sub_ts, tot_values_ts = [], [] + + for i in ls: + sub_ls.append(combined[i]) + values_ls.append(combined2[i]) + + sub_ts = [] + values_ts = [] + for j in ts: + sub_ts.append(combined[j]) + values_ts.append(combined2[j]) + + for subs in sub_ls: + for subs2 in sub_ts: + if subs == subs2: + logger.warning(f'\n LS and TS overlap for: {subs} - ' + f'You might want to consider checking the provided ' + f'datasets for multiple entries.') + + tot_sub_ls.append(sub_ls) + tot_values_ls.append(values_ls) + tot_sub_ts.append(sub_ts) + tot_values_ts.append(values_ts) + + return tot_sub_ls[0], tot_values_ls[0], tot_sub_ts[0], tot_values_ts[0] + + +def make_fasta_ls_ts( + filename, + wt_seq, + substitutions, + fitness_values +): + """ + Creates learning and test sets (.fasta style-like files with fitness values + indicated by starting semicolon ';') + + filename: str + String for defining the filename for the learning and test set "fasta-like" files. + wt: str + Wild-type sequence as string + substitutions: list + List of substitutions of a single variant of the format: + - Single substitution variant, e.g. variant A123C: ['A123C'] + - Higher variants, e.g. variant A123C/D234E/F345G: ['A123C', 'D234E, 'F345G'] + --> Full substitutions list, e.g.: [['A123C'], ['A123C', 'D234E, 'F345G']] + fitness_values: list + List of ints/floats of the variant fitness values, e.g. 
for two variants: [1.4, 0.8] + """ + myfile = open(filename, 'w') + for i, var in enumerate(substitutions): # var are lists of (single or multiple) substitutions + temp = list(wt_seq) + name = '' + separation = 0 + if var == ['WT']: + name = 'WT' + else: + for single_var in var: # single entries of substitution list + position_index = int(str(single_var)[1:-1]) - 1 + new_amino_acid = str(single_var)[-1] + temp[position_index] = new_amino_acid + # checking if multiple entries are inside list + if separation == 0: + name += single_var + else: + name += '/' + single_var + separation += 1 + print(f'>{name}', file=myfile) + print(f';{fitness_values[i]}', file=myfile) + print(''.join(temp), file=myfile) + # print(name+';'+str(val[i])+';'+''.join(temp), file=myfile) # uncomment output: CSV format + myfile.close() diff --git a/pypef/utils/low_n_mutation_extrapolation.py b/pypef/utils/low_n_mutation_extrapolation.py index bd63023..1bc1d0a 100644 --- a/pypef/utils/low_n_mutation_extrapolation.py +++ b/pypef/utils/low_n_mutation_extrapolation.py @@ -1,438 +1,438 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -import os -import random -import logging -logger = logging.getLogger('pypef.utils.low_n_mutation_extrapolation') - -import pandas as pd -import numpy as np -from scipy import stats -import matplotlib.pyplot as plt -import pickle -from tqdm import tqdm - -from pypef.ml.regression import cv_regression_options -from pypef.dca.hybrid_model import DCAHybridModel -from pypef.utils.variant_data import process_df_encoding, get_basename - - -def get_train_sizes(number_variants) -> np.ndarray: - """ - Generates a list of train sizes to perform low-n with. - Returns - ------- - Numpy array of train sizes up to 80% (i.e. 0.8 * N_variants). - """ - eighty_percent = int(number_variants * 0.8) - train_sizes = np.sort(np.concatenate([ - np.arange(15, 50, 5), np.arange(50, 100, 10), - np.arange(100, 150, 20), [160, 200, 250, 300, eighty_percent], - np.arange(400, 1100, 100) - ])) - idx_max = np.where(train_sizes >= eighty_percent)[0][0] + 1 - return train_sizes[:idx_max] - - -def plot_low_n( - train_sizes: list, - avg_spearmanr: list, - stddev_spearmanr: list, - plt_name: str = '' -): - """ - Plot the performance results of the low N engineering task. 
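As a quick numeric check of the train-size schedule defined above (assuming the function is imported from this module): for a dataset of 200 variants, the sizes are capped at 0.8 * 200 = 160.

from pypef.utils.low_n_mutation_extrapolation import get_train_sizes

print(get_train_sizes(200))
# -> [ 15  20  25  30  35  40  45  50  60  70  80  90 100 120 140 160]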
- """ - logger.info('Plotting...') - plt.plot(train_sizes, avg_spearmanr, 'ko--', linewidth=1, markersize=1.5) - plt.fill_between( - np.array(train_sizes), - np.array(avg_spearmanr) + np.array(stddev_spearmanr), - np.array(avg_spearmanr) - np.array(stddev_spearmanr), - alpha=0.5 - ) - plt.ylim(0, max(np.array(avg_spearmanr) * 1.1 + np.array(stddev_spearmanr))) - plt.xlabel('Train sizes') - plt.ylabel(r"Spearman's $\rho$") - - plt.savefig(plt_name.split(os.sep)[-1] + '.png', dpi=500) - plt.clf() - - -def low_n( - encoded_csv: str, - cv_regressor: str = None, - n_runs: int = 10, - hybrid_modeling: bool = False, - train_size_train: float = 0.66 -): - """ - Performs the "low N protein engineering task" learning on distinct - numbers of encoded_variant_sequences-fitness data to predict the - left out data (full dataset - train dataset). Maximum sizes of - learning sets is 0.8 * full dataset (and thus maximal size of test - set 0.2 * full dataset). - """ - df = pd.read_csv(encoded_csv, sep=';', comment='#') - if df.shape[1] == 1: - df = pd.read_csv(encoded_csv, sep=',', comment='#') - if df.shape[1] == 1: - df = pd.read_csv(encoded_csv, sep='\t', comment='#') - if cv_regressor: - name = 'ml_' + cv_regressor - if cv_regressor == 'pls_loocv': - raise SystemError( - 'PLS LOOCV is not (yet) implemented for the extrapolation task. ' - 'Please choose another CV regression option.' - ) - regressor = cv_regression_options(cv_regressor) - elif hybrid_modeling: - name = 'hybrid_ridge' - n_variants = df.shape[0] - train_sizes = get_train_sizes(n_variants).tolist() - variants, x, y = process_df_encoding(df) - if not x.any(): - raise SystemError("Provided CSV file seems to have no encoding columns " - "(required input CSV format: column 1: variant, column 2: " - "variant fitness value, column 3 and ongoing columns: " - "encoding feature values).") - - avg_spearmanr, stddev_spearmanr = [], [] - # test_sizes = [n_variants - size for size in train_sizes] - if hybrid_modeling: - logger.info('Using first CSV row/entry as wild type reference...') - for size in tqdm(train_sizes): - spearmanr_nruns = [] - for _ in range(n_runs): - train_idxs = random.sample(range(n_variants - 1), int(size)) - test_idxs = [] - for n in range(n_variants - 1): - if n not in train_idxs: - test_idxs.append(n) - x_train, y_train = x[train_idxs], y[train_idxs] - x_test, y_test = x[test_idxs], y[test_idxs] - - if hybrid_modeling: - x_wt = x[0] # WT should be first CSV variant entry - hybrid_model = DCAHybridModel( - x_train=x_train, - y_train=y_train, - x_test=x_test, # only used for adjusting +/- sign of y_dca, can also be None - y_test=y_test, # only used for adjusting +/- sign of y_dca, can also be None - x_wt=x_wt - ) - beta_1, beta_2, reg = hybrid_model.settings( - x_train, y_train, train_size_fit=train_size_train - ) - spearmanr_nruns.append( - hybrid_model.spearmanr( - y_test, - hybrid_model.hybrid_prediction( - x_test, reg, beta_1, beta_2 - ) - ) - ) - - else: # ML - regressor.fit(x_train, y_train) - # Best CV params: best_params = regressor.best_params_ - y_pred = regressor.predict(x_test) - spearmanr_nruns.append(stats.spearmanr(y_test, y_pred)[0]) - avg_spearmanr.append(np.mean(spearmanr_nruns)) - stddev_spearmanr.append(np.std(spearmanr_nruns, ddof=1)) - - plot_low_n( - train_sizes, - avg_spearmanr, - stddev_spearmanr, - 'low_N_' + str(encoded_csv).split('.')[0] + '_' + name - ) - - return train_sizes, avg_spearmanr, stddev_spearmanr - - -def count_mutation_levels_and_get_dfs(df_encoding) -> tuple: - """ - The input dataframe 
(from the sequence encoding CSV file) is split - according to levels of variant substitutions. Substitution seperator - is '/'. - """ - single_variants_index, all_higher_variants_index = [], [] - double_i, triple_i, quadruple_i, quintuple_i, sextuple_i, \ - septuple_i, octuple_i, nonuple_i, higher_nine_i = [], [], [], [], [], [], [], [], [] - for i, row in enumerate(df_encoding.iloc[:, 0]): # iterate over variant column - if '/' in row: # TypeError: argument of type 'float' is not iterable if empty columns are (at end of) CSV - all_higher_variants_index.append(i) - if row.count('/') == 1: - double_i.append(i) - elif row.count('/') == 2: - triple_i.append(i) - elif row.count('/') == 3: - quadruple_i.append(i) - elif row.count('/') == 4: - quintuple_i.append(i) - elif row.count('/') == 5: - sextuple_i.append(i) - elif row.count('/') == 6: - septuple_i.append(i) - elif row.count('/') == 7: - octuple_i.append(i) - elif row.count('/') == 8: - nonuple_i.append(i) - elif row.count('/') >= 9: - higher_nine_i.append(i) - else: - single_variants_index.append(i) - logger.info(f'\nNo. Singles: {len(single_variants_index)}\nNo. All higher: {len(all_higher_variants_index)}\n' - f'2: {len(double_i)}\n3: {len(triple_i)}\n4: {len(quadruple_i)}\n' - f'5: {len(quintuple_i)}\n6: {len(sextuple_i)}\n7: {len(septuple_i)}\n' - f'8: {len(octuple_i)}\n9: {len(nonuple_i)}\n>=10: {len(higher_nine_i)}') - return ( - df_encoding.iloc[single_variants_index, :], - df_encoding.iloc[double_i, :], - df_encoding.iloc[triple_i, :], - df_encoding.iloc[quadruple_i, :], - df_encoding.iloc[quintuple_i, :], - df_encoding.iloc[sextuple_i, :], - df_encoding.iloc[septuple_i, :], - df_encoding.iloc[octuple_i, :], - df_encoding.iloc[nonuple_i, :], - df_encoding.iloc[higher_nine_i, :], - df_encoding.iloc[all_higher_variants_index, :], - ) - - -def performance_mutation_extrapolation( - encoded_csv: str, - cv_regressor: str = None, - train_size: float = 0.66, - conc: bool = False, - save_model: bool = True, - hybrid_modeling: bool = False -) -> dict: - """ - Train on distinct mutation levels, e.g. only single-substituted samples - of encoded_variant_sequences-fitness data to predict distinct levels - of higher substituted variants (i.e. 1->2, 1->3, 1->4 etc.). Also can - train on concatenated levels of substitution-fitness data using the flag - --conc, i.e. conc = True (i.e. 1->2, 1+2->3, 1+2+3->4, etc.). - """ - df = pd.read_csv(encoded_csv, sep=';', comment='#') - if df.shape[1] == 1: - df = pd.read_csv(encoded_csv, sep=',', comment='#') - if df.shape[1] == 1: - df = pd.read_csv(encoded_csv, sep='\t', comment='#') - - df_mut_lvl = count_mutation_levels_and_get_dfs(df) - name = '' - if save_model: - try: - os.mkdir('Pickles') - except FileExistsError: - pass - if hybrid_modeling: - regressor = None - name = 'hybrid_ridge_' + get_basename(encoded_csv) - elif cv_regressor: - name = 'ml_' + cv_regressor + '_' + get_basename(encoded_csv) - if cv_regressor == 'pls_loocv': - raise SystemError( - 'PLS LOOCV is not implemented for the extrapolation ' - 'task. Please choose another CV regressor.' 
- ) - regressor = cv_regression_options(cv_regressor) - beta_1, beta_2 = None, None - else: - regressor = None - hybrid_model = None - data = {} - collected_levels = [] - for i_m, mutation_level_df in enumerate(df_mut_lvl): - if mutation_level_df.shape[0] != 0: - collected_levels.append(i_m) - train_idx_appended = [] - if len(collected_levels) > 1: - train_idx = collected_levels[0] - train_df = df_mut_lvl[train_idx] - train_variants, x_train, y_train = process_df_encoding(train_df) - all_higher_df = df_mut_lvl[-1] # only used for adjusting +/- of y_dca - all_higher_variants, x_all_higher, y_all_higher = process_df_encoding(all_higher_df) - if hybrid_modeling: - x_wt = x_train[0] - hybrid_model = DCAHybridModel( - x_train=x_train, - y_train=y_train, - x_test=x_all_higher, # only used for adjusting +/- of y_dca, can also be None but - y_test=y_all_higher, # higher risk of wrong sign assignment of beta_1 (y_dca) - x_wt=x_wt - ) - beta_1, beta_2, reg = hybrid_model.settings( - x_train, y_train, train_size_fit=train_size) - pickle.dump( - {'hybrid_model': hybrid_model, 'beta_1': beta_1, 'beta_2': beta_2, - 'spearman_rho': float('nan'), 'regressor': reg}, - open(os.path.join('Pickles', 'HYBRID_LVL_1'), 'wb') - ) - elif cv_regressor: - logger.info('Fitting regressor on lvl 1 substitution data...') - regressor.fit(x_train, y_train) - if save_model: - logger.info(f'Saving model as Pickle file: ML_LVL_1') - pickle.dump(regressor, open(os.path.join('Pickles', 'ML_LVL_1'), 'wb')) - for i, _ in enumerate(tqdm(collected_levels)): - if i < len(collected_levels) - 1: # not last i else error, last entry is: lvl 1 --> all higher variants - test_idx = collected_levels[i + 1] - test_df = df_mut_lvl[test_idx] - test_variants, x_test, y_test = process_df_encoding(test_df) - if not conc: - # For training on distinct iterated level i, uncomment lines below: - # train_idx = collected_levels[i] - # train_df = self.mutation_level_dfs[train_idx] - # train_variants, x_train, y_train = self._process_df_encoding(train_df) - if hybrid_modeling: - data.update({ - test_idx + 1: - { - 'hybrid_model': hybrid_model, - 'max_train_lvl': train_idx + 1, - 'n_y_train': len(y_train), - 'test_lvl': test_idx + 1, - 'n_y_test': len(y_test), - 'spearman_rho': hybrid_model.spearmanr( - y_test, - hybrid_model.hybrid_prediction( - x_test, reg, beta_1, beta_2 - ) - ), - 'beta_1': beta_1, - 'beta_2': beta_2, - 'regressor': reg - } - }) - else: # ML - data.update({ - test_idx + 1: - { - 'regressor': regressor, - 'max_train_lvl': train_idx + 1, - 'n_y_train': len(y_train), - 'test_lvl': test_idx + 1, - 'n_y_test': len(y_test), - 'spearman_rho': stats.spearmanr( - y_test, # Call predict on the BaseSearchCV estimator - regressor.predict(x_test) # with the best found parameters - )[0] - } - }) - - else: # conc mode, training on mutational levels i: 1, ..., max(i)-1 - train_idx_appended.append(collected_levels[i]) - if i < len(collected_levels) - 2: # -2 as not the last (all_higher) ## i != 0 and - train_df_appended_conc = pd.DataFrame() - for idx in train_idx_appended: - train_df_appended_conc = pd.concat( - [train_df_appended_conc, df_mut_lvl[idx]]) - train_variants_conc, x_train_conc, y_train_conc = \ - process_df_encoding(train_df_appended_conc) - if hybrid_modeling: # updating hybrid model params with newly inputted concatenated train data - beta_1_conc, beta_2_conc, reg_conc = hybrid_model.settings( - x_train_conc, - y_train_conc, - train_size_fit=train_size - ) - data.update({ - test_idx + 1: - { - 'hybrid_model': hybrid_model, - 
'max_train_lvl': train_idx_appended[-1] + 1, - 'n_y_train': len(y_train_conc), - 'test_lvl': test_idx + 1, - 'n_y_test': len(y_test), - 'spearman_rho': hybrid_model.spearmanr( - y_test, - hybrid_model.hybrid_prediction( - x_test, reg_conc, beta_1_conc, beta_2_conc - ) - ), - 'beta_1': beta_1_conc, - 'beta_2': beta_2_conc, - 'regressor': reg_conc - } - }) - else: # ML updating pureML regression model params with newly inputted concatenated train data - # Fitting regressor on concatenated substitution data - regressor.fit(x_train_conc, y_train_conc) - data.update({ - test_idx + 1: - { - 'max_train_lvl': train_idx_appended[-1] + 1, - 'n_y_train': len(y_train_conc), - 'test_lvl': test_idx + 1, - 'n_y_test': len(y_test), - 'spearman_rho': stats.spearmanr( - y_test, # Call predict on the BaseSearchCV estimator - regressor.predict(x_test) # with the best found parameters - )[0], - 'regressor': regressor - } - }) - plot_extrapolation(data, name, conc) - - return data - - -def plot_extrapolation( - extrapolation_data: dict, - name: str = '', - conc=False -): - """ - Plot extrapolation results. - """ - logger.info('Plotting...') - test_lvls, spearman_rhos, label_infos = [], [], [] - for test_lvl, result_dict in extrapolation_data.items(): - if result_dict['spearman_rho'] is np.nan: - continue - test_lvls.append(test_lvl) - spearman_rhos.append(result_dict['spearman_rho']) - label_infos.append( - r'$\leq$' + str(result_dict['max_train_lvl']) + r'$\rightarrow$' + str(result_dict['test_lvl']) + - '\n' + str(result_dict['n_y_train']) + r'$\rightarrow$' + str(result_dict['n_y_test']) - ) - label_infos[0] = 'Lvl: ' + label_infos[0].split('\n')[0] + '\n' + r'$N$: ' + label_infos[0].split('\n')[1] - if not conc: - label_infos[-1] = label_infos[-1][6] + r'$\rightarrow$' + '>' + \ - label_infos[-1][6] + '\n' + label_infos[-1].split('\n')[1] - plt.plot(test_lvls, spearman_rhos, 'x--', markeredgecolor='k', linewidth=0.7, markersize=4) - plt.fill_between( - np.array(test_lvls), - np.repeat(min(spearman_rhos), len(spearman_rhos)), - np.array(spearman_rhos), - alpha=0.3 - ) - if conc: - name += '_train_concat_lvls' - else: - name += '_train_lvl_1' - plt.xticks(test_lvls, label_infos, fontsize=5) - plt.ylabel(r"Spearman's $\rho$") - name = name.split(os.sep)[-1] + '_extrapolation.png' - plt.savefig(name, dpi=500) - plt.clf() +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. 
Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +import os +import random +import logging +logger = logging.getLogger('pypef.utils.low_n_mutation_extrapolation') + +import pandas as pd +import numpy as np +from scipy import stats +import matplotlib.pyplot as plt +import pickle +from tqdm import tqdm + +from pypef.ml.regression import cv_regression_options +from pypef.dca.hybrid_model import DCAHybridModel +from pypef.utils.variant_data import process_df_encoding, get_basename + + +def get_train_sizes(number_variants) -> np.ndarray: + """ + Generates a list of train sizes to perform low-n with. + Returns + ------- + Numpy array of train sizes up to 80% (i.e. 0.8 * N_variants). + """ + eighty_percent = int(number_variants * 0.8) + train_sizes = np.sort(np.concatenate([ + np.arange(15, 50, 5), np.arange(50, 100, 10), + np.arange(100, 150, 20), [160, 200, 250, 300, eighty_percent], + np.arange(400, 1100, 100) + ])) + idx_max = np.where(train_sizes >= eighty_percent)[0][0] + 1 + return train_sizes[:idx_max] + + +def plot_low_n( + train_sizes: list, + avg_spearmanr: list, + stddev_spearmanr: list, + plt_name: str = '' +): + """ + Plot the performance results of the low N engineering task. + """ + logger.info('Plotting...') + plt.plot(train_sizes, avg_spearmanr, 'ko--', linewidth=1, markersize=1.5) + plt.fill_between( + np.array(train_sizes), + np.array(avg_spearmanr) + np.array(stddev_spearmanr), + np.array(avg_spearmanr) - np.array(stddev_spearmanr), + alpha=0.5 + ) + plt.ylim(0, max(np.array(avg_spearmanr) * 1.1 + np.array(stddev_spearmanr))) + plt.xlabel('Train sizes') + plt.ylabel(r"Spearman's $\rho$") + + plt.savefig(plt_name.split(os.sep)[-1] + '.png', dpi=500) + plt.clf() + + +def low_n( + encoded_csv: str, + cv_regressor: str = None, + n_runs: int = 10, + hybrid_modeling: bool = False, + train_size_train: float = 0.66 +): + """ + Performs the "low N protein engineering task" learning on distinct + numbers of encoded_variant_sequences-fitness data to predict the + left out data (full dataset - train dataset). Maximum sizes of + learning sets is 0.8 * full dataset (and thus maximal size of test + set 0.2 * full dataset). + """ + df = pd.read_csv(encoded_csv, sep=';', comment='#') + if df.shape[1] == 1: + df = pd.read_csv(encoded_csv, sep=',', comment='#') + if df.shape[1] == 1: + df = pd.read_csv(encoded_csv, sep='\t', comment='#') + if cv_regressor: + name = 'ml_' + cv_regressor + if cv_regressor == 'pls_loocv': + raise SystemError( + 'PLS LOOCV is not (yet) implemented for the extrapolation task. ' + 'Please choose another CV regression option.' 
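As a quick sanity check of the train-size schedule that get_train_sizes produces (a minimal sketch; the import path follows the module's logger name, pypef.utils.low_n_mutation_extrapolation):

from pypef.utils.low_n_mutation_extrapolation import get_train_sizes

# For 200 variants, 80% corresponds to 160 samples, so the schedule is
# truncated after the first size >= 160:
print(get_train_sizes(200))
# expected: [ 15  20  25  30  35  40  45  50  60  70  80  90 100 120 140 160]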
+            )
+        regressor = cv_regression_options(cv_regressor)
+    elif hybrid_modeling:
+        name = 'hybrid_ridge'
+    n_variants = df.shape[0]
+    train_sizes = get_train_sizes(n_variants).tolist()
+    variants, x, y = process_df_encoding(df)
+    if not x.any():
+        raise SystemError("Provided CSV file seems to have no encoding columns "
+                          "(required input CSV format: column 1: variant, column 2: "
+                          "variant fitness value, column 3 and ongoing columns: "
+                          "encoding feature values).")
+
+    avg_spearmanr, stddev_spearmanr = [], []
+    # test_sizes = [n_variants - size for size in train_sizes]
+    if hybrid_modeling:
+        logger.info('Using first CSV row/entry as wild type reference...')
+    for size in tqdm(train_sizes):
+        spearmanr_nruns = []
+        for _ in range(n_runs):
+            train_idxs = random.sample(range(n_variants - 1), int(size))
+            test_idxs = []
+            for n in range(n_variants - 1):
+                if n not in train_idxs:
+                    test_idxs.append(n)
+            x_train, y_train = x[train_idxs], y[train_idxs]
+            x_test, y_test = x[test_idxs], y[test_idxs]
+
+            if hybrid_modeling:
+                x_wt = x[0]  # WT should be first CSV variant entry
+                hybrid_model = DCAHybridModel(
+                    x_train=x_train,
+                    y_train=y_train,
+                    x_test=x_test,  # only used for adjusting +/- sign of y_dca, can also be None
+                    y_test=y_test,  # only used for adjusting +/- sign of y_dca, can also be None
+                    x_wt=x_wt
+                )
+                beta_1, beta_2, reg = hybrid_model.settings(
+                    x_train, y_train, train_size_fit=train_size_train
+                )
+                spearmanr_nruns.append(
+                    hybrid_model.spearmanr(
+                        y_test,
+                        hybrid_model.hybrid_prediction(
+                            x_test, reg, beta_1, beta_2
+                        )
+                    )
+                )
+
+            else:  # ML
+                regressor.fit(x_train, y_train)
+                # Best CV params: best_params = regressor.best_params_
+                y_pred = regressor.predict(x_test)
+                spearmanr_nruns.append(stats.spearmanr(y_test, y_pred)[0])
+        avg_spearmanr.append(np.mean(spearmanr_nruns))
+        stddev_spearmanr.append(np.std(spearmanr_nruns, ddof=1))
+
+    plot_low_n(
+        train_sizes,
+        avg_spearmanr,
+        stddev_spearmanr,
+        'low_N_' + str(encoded_csv).split('.')[0] + '_' + name
+    )
+
+    return train_sizes, avg_spearmanr, stddev_spearmanr
+
+
+def count_mutation_levels_and_get_dfs(df_encoding) -> tuple:
+    """
+    The input dataframe (from the sequence encoding CSV file) is split
+    according to levels of variant substitutions. Substitution separator
+    is '/'.
+    """
+    single_variants_index, all_higher_variants_index = [], []
+    double_i, triple_i, quadruple_i, quintuple_i, sextuple_i, \
+        septuple_i, octuple_i, nonuple_i, higher_nine_i = [], [], [], [], [], [], [], [], []
+    for i, row in enumerate(df_encoding.iloc[:, 0]):  # iterate over variant column
+        if '/' in row:  # TypeError: argument of type 'float' is not iterable if empty columns are (at end of) CSV
+            all_higher_variants_index.append(i)
+            if row.count('/') == 1:
+                double_i.append(i)
+            elif row.count('/') == 2:
+                triple_i.append(i)
+            elif row.count('/') == 3:
+                quadruple_i.append(i)
+            elif row.count('/') == 4:
+                quintuple_i.append(i)
+            elif row.count('/') == 5:
+                sextuple_i.append(i)
+            elif row.count('/') == 6:
+                septuple_i.append(i)
+            elif row.count('/') == 7:
+                octuple_i.append(i)
+            elif row.count('/') == 8:
+                nonuple_i.append(i)
+            elif row.count('/') >= 9:
+                higher_nine_i.append(i)
+        else:
+            single_variants_index.append(i)
+    logger.info(f'\nNo. Singles: {len(single_variants_index)}\nNo. 
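A usage sketch for the low_n routine defined above; 'my_dca_encoded.csv' is a hypothetical encoding CSV (column 1: variant, column 2: fitness, columns 3 and onward: features), with the wild type as the first row when hybrid_modeling=True. Besides returning the statistics, the call writes the corresponding low-N performance plot as a PNG.

from pypef.utils.low_n_mutation_extrapolation import low_n

train_sizes, mean_rhos, std_rhos = low_n(
    encoded_csv='my_dca_encoded.csv',  # hypothetical file name
    hybrid_modeling=True,              # hybrid DCA/ridge instead of a CV regressor
    n_runs=10,                         # random train/test resamplings per train size
)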
All higher: {len(all_higher_variants_index)}\n' + f'2: {len(double_i)}\n3: {len(triple_i)}\n4: {len(quadruple_i)}\n' + f'5: {len(quintuple_i)}\n6: {len(sextuple_i)}\n7: {len(septuple_i)}\n' + f'8: {len(octuple_i)}\n9: {len(nonuple_i)}\n>=10: {len(higher_nine_i)}') + return ( + df_encoding.iloc[single_variants_index, :], + df_encoding.iloc[double_i, :], + df_encoding.iloc[triple_i, :], + df_encoding.iloc[quadruple_i, :], + df_encoding.iloc[quintuple_i, :], + df_encoding.iloc[sextuple_i, :], + df_encoding.iloc[septuple_i, :], + df_encoding.iloc[octuple_i, :], + df_encoding.iloc[nonuple_i, :], + df_encoding.iloc[higher_nine_i, :], + df_encoding.iloc[all_higher_variants_index, :], + ) + + +def performance_mutation_extrapolation( + encoded_csv: str, + cv_regressor: str = None, + train_size: float = 0.66, + conc: bool = False, + save_model: bool = True, + hybrid_modeling: bool = False +) -> dict: + """ + Train on distinct mutation levels, e.g. only single-substituted samples + of encoded_variant_sequences-fitness data to predict distinct levels + of higher substituted variants (i.e. 1->2, 1->3, 1->4 etc.). Also can + train on concatenated levels of substitution-fitness data using the flag + --conc, i.e. conc = True (i.e. 1->2, 1+2->3, 1+2+3->4, etc.). + """ + df = pd.read_csv(encoded_csv, sep=';', comment='#') + if df.shape[1] == 1: + df = pd.read_csv(encoded_csv, sep=',', comment='#') + if df.shape[1] == 1: + df = pd.read_csv(encoded_csv, sep='\t', comment='#') + + df_mut_lvl = count_mutation_levels_and_get_dfs(df) + name = '' + if save_model: + try: + os.mkdir('Pickles') + except FileExistsError: + pass + if hybrid_modeling: + regressor = None + name = 'hybrid_ridge_' + get_basename(encoded_csv) + elif cv_regressor: + name = 'ml_' + cv_regressor + '_' + get_basename(encoded_csv) + if cv_regressor == 'pls_loocv': + raise SystemError( + 'PLS LOOCV is not implemented for the extrapolation ' + 'task. Please choose another CV regressor.' 
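A toy illustration of the '/'-based level splitting implemented by count_mutation_levels_and_get_dfs (a sketch with made-up variants and fitness values):

import pandas as pd
from pypef.utils.low_n_mutation_extrapolation import count_mutation_levels_and_get_dfs

toy = pd.DataFrame({
    'variant': ['L215F', 'A217N', 'L215F/A217N', 'L215F/A217N/R219S'],
    'fitness': [1.2, 0.8, 1.5, 0.3],
})
singles, doubles, triples, *_, all_higher = count_mutation_levels_and_get_dfs(toy)
# singles: 2 rows, doubles: 1 row, triples: 1 row;
# all_higher collects every variant containing at least one '/' (2 rows here)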
+ ) + regressor = cv_regression_options(cv_regressor) + beta_1, beta_2 = None, None + else: + regressor = None + hybrid_model = None + data = {} + collected_levels = [] + for i_m, mutation_level_df in enumerate(df_mut_lvl): + if mutation_level_df.shape[0] != 0: + collected_levels.append(i_m) + train_idx_appended = [] + if len(collected_levels) > 1: + train_idx = collected_levels[0] + train_df = df_mut_lvl[train_idx] + train_variants, x_train, y_train = process_df_encoding(train_df) + all_higher_df = df_mut_lvl[-1] # only used for adjusting +/- of y_dca + all_higher_variants, x_all_higher, y_all_higher = process_df_encoding(all_higher_df) + if hybrid_modeling: + x_wt = x_train[0] + hybrid_model = DCAHybridModel( + x_train=x_train, + y_train=y_train, + x_test=x_all_higher, # only used for adjusting +/- of y_dca, can also be None but + y_test=y_all_higher, # higher risk of wrong sign assignment of beta_1 (y_dca) + x_wt=x_wt + ) + beta_1, beta_2, reg = hybrid_model.settings( + x_train, y_train, train_size_fit=train_size) + pickle.dump( + {'hybrid_model': hybrid_model, 'beta_1': beta_1, 'beta_2': beta_2, + 'spearman_rho': float('nan'), 'regressor': reg}, + open(os.path.join('Pickles', 'HYBRID_LVL_1'), 'wb') + ) + elif cv_regressor: + logger.info('Fitting regressor on lvl 1 substitution data...') + regressor.fit(x_train, y_train) + if save_model: + logger.info(f'Saving model as Pickle file: ML_LVL_1') + pickle.dump(regressor, open(os.path.join('Pickles', 'ML_LVL_1'), 'wb')) + for i, _ in enumerate(tqdm(collected_levels)): + if i < len(collected_levels) - 1: # not last i else error, last entry is: lvl 1 --> all higher variants + test_idx = collected_levels[i + 1] + test_df = df_mut_lvl[test_idx] + test_variants, x_test, y_test = process_df_encoding(test_df) + if not conc: + # For training on distinct iterated level i, uncomment lines below: + # train_idx = collected_levels[i] + # train_df = self.mutation_level_dfs[train_idx] + # train_variants, x_train, y_train = self._process_df_encoding(train_df) + if hybrid_modeling: + data.update({ + test_idx + 1: + { + 'hybrid_model': hybrid_model, + 'max_train_lvl': train_idx + 1, + 'n_y_train': len(y_train), + 'test_lvl': test_idx + 1, + 'n_y_test': len(y_test), + 'spearman_rho': hybrid_model.spearmanr( + y_test, + hybrid_model.hybrid_prediction( + x_test, reg, beta_1, beta_2 + ) + ), + 'beta_1': beta_1, + 'beta_2': beta_2, + 'regressor': reg + } + }) + else: # ML + data.update({ + test_idx + 1: + { + 'regressor': regressor, + 'max_train_lvl': train_idx + 1, + 'n_y_train': len(y_train), + 'test_lvl': test_idx + 1, + 'n_y_test': len(y_test), + 'spearman_rho': stats.spearmanr( + y_test, # Call predict on the BaseSearchCV estimator + regressor.predict(x_test) # with the best found parameters + )[0] + } + }) + + else: # conc mode, training on mutational levels i: 1, ..., max(i)-1 + train_idx_appended.append(collected_levels[i]) + if i < len(collected_levels) - 2: # -2 as not the last (all_higher) ## i != 0 and + train_df_appended_conc = pd.DataFrame() + for idx in train_idx_appended: + train_df_appended_conc = pd.concat( + [train_df_appended_conc, df_mut_lvl[idx]]) + train_variants_conc, x_train_conc, y_train_conc = \ + process_df_encoding(train_df_appended_conc) + if hybrid_modeling: # updating hybrid model params with newly inputted concatenated train data + beta_1_conc, beta_2_conc, reg_conc = hybrid_model.settings( + x_train_conc, + y_train_conc, + train_size_fit=train_size + ) + data.update({ + test_idx + 1: + { + 'hybrid_model': hybrid_model, + 
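For later reuse, the level-1 hybrid model pickled above (Pickles/HYBRID_LVL_1, written during performance_mutation_extrapolation with hybrid modeling) can be reloaded; a minimal sketch using the dictionary keys from the pickle.dump call:

import os
import pickle

with open(os.path.join('Pickles', 'HYBRID_LVL_1'), 'rb') as f:
    saved = pickle.load(f)
hybrid_model = saved['hybrid_model']
beta_1, beta_2, reg = saved['beta_1'], saved['beta_2'], saved['regressor']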
'max_train_lvl': train_idx_appended[-1] + 1, + 'n_y_train': len(y_train_conc), + 'test_lvl': test_idx + 1, + 'n_y_test': len(y_test), + 'spearman_rho': hybrid_model.spearmanr( + y_test, + hybrid_model.hybrid_prediction( + x_test, reg_conc, beta_1_conc, beta_2_conc + ) + ), + 'beta_1': beta_1_conc, + 'beta_2': beta_2_conc, + 'regressor': reg_conc + } + }) + else: # ML updating pureML regression model params with newly inputted concatenated train data + # Fitting regressor on concatenated substitution data + regressor.fit(x_train_conc, y_train_conc) + data.update({ + test_idx + 1: + { + 'max_train_lvl': train_idx_appended[-1] + 1, + 'n_y_train': len(y_train_conc), + 'test_lvl': test_idx + 1, + 'n_y_test': len(y_test), + 'spearman_rho': stats.spearmanr( + y_test, # Call predict on the BaseSearchCV estimator + regressor.predict(x_test) # with the best found parameters + )[0], + 'regressor': regressor + } + }) + plot_extrapolation(data, name, conc) + + return data + + +def plot_extrapolation( + extrapolation_data: dict, + name: str = '', + conc=False +): + """ + Plot extrapolation results. + """ + logger.info('Plotting...') + test_lvls, spearman_rhos, label_infos = [], [], [] + for test_lvl, result_dict in extrapolation_data.items(): + if result_dict['spearman_rho'] is np.nan: + continue + test_lvls.append(test_lvl) + spearman_rhos.append(result_dict['spearman_rho']) + label_infos.append( + r'$\leq$' + str(result_dict['max_train_lvl']) + r'$\rightarrow$' + str(result_dict['test_lvl']) + + '\n' + str(result_dict['n_y_train']) + r'$\rightarrow$' + str(result_dict['n_y_test']) + ) + label_infos[0] = 'Lvl: ' + label_infos[0].split('\n')[0] + '\n' + r'$N$: ' + label_infos[0].split('\n')[1] + if not conc: + label_infos[-1] = label_infos[-1][6] + r'$\rightarrow$' + '>' + \ + label_infos[-1][6] + '\n' + label_infos[-1].split('\n')[1] + plt.plot(test_lvls, spearman_rhos, 'x--', markeredgecolor='k', linewidth=0.7, markersize=4) + plt.fill_between( + np.array(test_lvls), + np.repeat(min(spearman_rhos), len(spearman_rhos)), + np.array(spearman_rhos), + alpha=0.3 + ) + if conc: + name += '_train_concat_lvls' + else: + name += '_train_lvl_1' + plt.xticks(test_lvls, label_infos, fontsize=5) + plt.ylabel(r"Spearman's $\rho$") + name = name.split(os.sep)[-1] + '_extrapolation.png' + plt.savefig(name, dpi=500) + plt.clf() diff --git a/pypef/utils/performance.py b/pypef/utils/performance.py index 7aacffd..aa9e2c9 100644 --- a/pypef/utils/performance.py +++ b/pypef/utils/performance.py @@ -1,61 +1,61 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. 
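A usage sketch for the extrapolation routine above; 'my_dca_encoded.csv' is again a hypothetical encoding CSV whose first data row is the wild type:

from pypef.utils.low_n_mutation_extrapolation import performance_mutation_extrapolation

data = performance_mutation_extrapolation(
    encoded_csv='my_dca_encoded.csv',  # hypothetical file name
    hybrid_modeling=True,
    conc=True,  # train on concatenated levels 1..i to predict level i+1
)
for test_lvl, res in data.items():
    print(test_lvl, res['max_train_lvl'], res['n_y_train'], res['spearman_rho'])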
Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -import warnings -import numpy as np -from scipy import stats -from sklearn.metrics import mean_squared_error, r2_score - - -def get_performances( - y_true: list, - y_pred: list -) -> tuple[float, float, float, float, float]: - """ - Description - ----------- - Gets performance metrics (R^2, RMSE, NRMSE, Pearson's r, Spearman's rho) - between y_true and y_pred. - - Parameters - ----------- - y_true: list - Measured fitness values. - y_pred: list - Predicted fitness values. - - Returns - ----------- - r_squared: float - rmse: float - nrmse: float - pearson_r: float - spearman_rho: float - """ - y_true = list(y_true) - y_pred = list(y_pred) - r_squared = r2_score(y_true, y_pred) - rmse = np.sqrt(mean_squared_error(y_true, y_pred)) - stddev = np.std(y_true, ddof=1) - nrmse = rmse / stddev - with warnings.catch_warnings(): # catching RunTime warning when there's no variance in an array, - warnings.simplefilter("ignore") # e.g. [2, 2, 2, 2] which would mean divide by zero - pearson_r = np.corrcoef(y_true, y_pred)[0][1] - spearman_rho = stats.spearmanr(y_true, y_pred)[0] - +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +import warnings +import numpy as np +from scipy import stats +from sklearn.metrics import mean_squared_error, r2_score + + +def get_performances( + y_true: list, + y_pred: list +) -> tuple[float, float, float, float, float]: + """ + Description + ----------- + Gets performance metrics (R^2, RMSE, NRMSE, Pearson's r, Spearman's rho) + between y_true and y_pred. + + Parameters + ----------- + y_true: list + Measured fitness values. + y_pred: list + Predicted fitness values. + + Returns + ----------- + r_squared: float + rmse: float + nrmse: float + pearson_r: float + spearman_rho: float + """ + y_true = list(y_true) + y_pred = list(y_pred) + r_squared = r2_score(y_true, y_pred) + rmse = np.sqrt(mean_squared_error(y_true, y_pred)) + stddev = np.std(y_true, ddof=1) + nrmse = rmse / stddev + with warnings.catch_warnings(): # catching RunTime warning when there's no variance in an array, + warnings.simplefilter("ignore") # e.g. 
[2, 2, 2, 2] which would mean divide by zero + pearson_r = np.corrcoef(y_true, y_pred)[0][1] + spearman_rho = stats.spearmanr(y_true, y_pred)[0] + return r_squared, rmse, nrmse, pearson_r, spearman_rho \ No newline at end of file diff --git a/pypef/utils/plot.py b/pypef/utils/plot.py index a7feefa..85d0fee 100644 --- a/pypef/utils/plot.py +++ b/pypef/utils/plot.py @@ -1,81 +1,81 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -import numpy as np -from scipy import stats -import matplotlib.pyplot as plt -from adjustText import adjust_text -import logging -logger = logging.getLogger('pypef.ml.regression') - -from pypef.utils.performance import get_performances - - -def plot_y_true_vs_y_pred( - y_true: np.ndarray, - y_pred: np.ndarray, - variants: np.ndarray, # just required for labeling - label=False, - hybrid=False, - name: str = '' -): - """ - Plots predicted versus true values using the hybrid model for prediction. - Function called by function predict_ps. - """ - figure, ax = plt.subplots() - if hybrid: - spearman_rho = stats.spearmanr(y_true, y_pred)[0] - ax.scatter(y_true, y_pred, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.7, - label=f'Spearman\'s ' + fr'$\rho$ = {spearman_rho:.3f}' + '\n' + fr'($N$ = {len(y_true)})') - file_name = name + 'DCA_Hybrid_Model_Performance.png' - else: - r_squared, rmse, nrmse, pearson_r, spearman_rho = get_performances( - y_true=y_true, y_pred=y_pred - ) - ax.scatter( - y_true, y_pred, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.7, - label=r'$R^2$' + f' = {r_squared:.3f}' + f'\nRMSE = {rmse:.3f}' + f'\nNRMSE = {nrmse:.3f}' + - f'\nPearson\'s ' + r'$r$'+f' = {pearson_r:.3f}' + f'\nSpearman\'s ' + - fr'$\rho$ = {spearman_rho:.3f}' + '\n' + fr'($N$ = {len(y_true)})' - ) - file_name = name + 'ML_Model_Performance.png' - # x = np.linspace(min(y_pred), max(y_pred), 100) - # ax.plot(x, x, color='black', linewidth=0.25) # plot diagonal line - ax.legend(prop={'size': 8}) - ax.set_xlabel('Measured') - ax.set_ylabel('Predicted') - logger.info('Plotting...') - if label: - logger.info('Adjusting variant labels for plotting can take some ' - 'time (the limit for labeling is 150 data points)...') - if len(y_true) < 150: - texts = [ax.text(y_true[i], y_pred[i], txt, fontsize=4) - for i, txt in enumerate(variants)] - adjust_text( - texts, only_move={'points': 'y', 'text': 'y'}, force_points=0.5, lim=250) - else: - logger.info("Terminating label process. 
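A quick usage sketch for get_performances (note that NRMSE here is the RMSE divided by the sample standard deviation of the measured values):

from pypef.utils.performance import get_performances

y_true = [0.10, 0.45, 0.35, 0.80, 0.62]
y_pred = [0.14, 0.40, 0.33, 0.71, 0.60]
r_squared, rmse, nrmse, pearson_r, spearman_rho = get_performances(y_true, y_pred)
print(f'R2 = {r_squared:.3f}, RMSE = {rmse:.3f}, NRMSE = {nrmse:.3f}, '
      f'r = {pearson_r:.3f}, rho = {spearman_rho:.3f}')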
Too many variants " - "(> 150) for plotting (labels would overlap).") - # Uncomment for renaming new plots - # i = 1 - # while os.path.isfile(file_name): - # i += 1 # iterate until finding an unused file name - # file_name = f'DCA_Hybrid_Model_LS_TS_Performance({i}).png' - plt.savefig(file_name, dpi=500) - plt.close('all') +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +import numpy as np +from scipy import stats +import matplotlib.pyplot as plt +from adjustText import adjust_text +import logging +logger = logging.getLogger('pypef.ml.regression') + +from pypef.utils.performance import get_performances + + +def plot_y_true_vs_y_pred( + y_true: np.ndarray, + y_pred: np.ndarray, + variants: np.ndarray, # just required for labeling + label=False, + hybrid=False, + name: str = '' +): + """ + Plots predicted versus true values using the hybrid model for prediction. + Function called by function predict_ps. + """ + figure, ax = plt.subplots() + if hybrid: + spearman_rho = stats.spearmanr(y_true, y_pred)[0] + ax.scatter(y_true, y_pred, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.7, + label=f'Spearman\'s ' + fr'$\rho$ = {spearman_rho:.3f}' + '\n' + fr'($N$ = {len(y_true)})') + file_name = name + 'DCA_Hybrid_Model_Performance.png' + else: + r_squared, rmse, nrmse, pearson_r, spearman_rho = get_performances( + y_true=y_true, y_pred=y_pred + ) + ax.scatter( + y_true, y_pred, marker='o', s=20, linewidths=0.5, edgecolor='black', alpha=0.7, + label=r'$R^2$' + f' = {r_squared:.3f}' + f'\nRMSE = {rmse:.3f}' + f'\nNRMSE = {nrmse:.3f}' + + f'\nPearson\'s ' + r'$r$'+f' = {pearson_r:.3f}' + f'\nSpearman\'s ' + + fr'$\rho$ = {spearman_rho:.3f}' + '\n' + fr'($N$ = {len(y_true)})' + ) + file_name = name + 'ML_Model_Performance.png' + # x = np.linspace(min(y_pred), max(y_pred), 100) + # ax.plot(x, x, color='black', linewidth=0.25) # plot diagonal line + ax.legend(prop={'size': 8}) + ax.set_xlabel('Measured') + ax.set_ylabel('Predicted') + logger.info('Plotting...') + if label: + logger.info('Adjusting variant labels for plotting can take some ' + 'time (the limit for labeling is 150 data points)...') + if len(y_true) < 150: + texts = [ax.text(y_true[i], y_pred[i], txt, fontsize=4) + for i, txt in enumerate(variants)] + adjust_text( + texts, only_move={'points': 'y', 'text': 'y'}, force_points=0.5, lim=250) + else: + logger.info("Terminating label process. 
Too many variants " + "(> 150) for plotting (labels would overlap).") + # Uncomment for renaming new plots + # i = 1 + # while os.path.isfile(file_name): + # i += 1 # iterate until finding an unused file name + # file_name = f'DCA_Hybrid_Model_LS_TS_Performance({i}).png' + plt.savefig(file_name, dpi=500) + plt.close('all') diff --git a/pypef/utils/prediction_sets.py b/pypef/utils/prediction_sets.py index df4e252..c3f8aa7 100644 --- a/pypef/utils/prediction_sets.py +++ b/pypef/utils/prediction_sets.py @@ -1,429 +1,413 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -""" -Modules for making prediction files -from input CSV file (with value separators sep=',' or sep=';') -having the CSV-format - -HEADER_VARIANTS;HEADER_VARIANTS_FITNESS -VARIANT_1;FITNESS_VALUE_1 -VARIANT_2;FITNESS_VALUE_2 -... - -according to the self devised prediction set convention -> VARIANT_NAME_1 -VARIANT_SEQUENCE_1 -> VARIANT_NAME_2 -VARIANT_SEQUENCE_2 -... -""" - - -import os -import numpy as np -from tqdm import tqdm - - -def make_fasta_ps( - filename, - wt, - substitutions -): - """ - Creates prediction sets (.fasta style files, i.e. without fitness values) - """ - myfile = open(filename, 'w') - count = 0 - for i, var in enumerate(substitutions): - temporary = list(wt) - name = '' - separation = 0 - for single_var in var: - position_index = int(str(single_var)[1:-1]) - 1 - new_amino_acid = str(single_var)[-1] - temporary[position_index] = new_amino_acid - if separation == 0: - name += single_var - else: - name += '/' + single_var - separation += 1 - print('>', name, file=myfile) - print(''.join(temporary), file=myfile) - count += 1 - myfile.close() - - -def make_recombinations_double(arr: tuple) -> list: - """ - Description - ----------- - Make double recombinant variants. - - Parameters - ---------- - arr : tuple - Lists if single substitutions in tuple, e.g., - (['L215F'], ['A217N'], ['R219S'], ['L249Y']) - - Returns - ------- - doubles : list - List of double substitution lists, e.g., - [['L215F', 'A217N'], ['L215F', 'R219S'], ['L215F', 'L249Y'], - ['A217N', 'R219S'], ['A217N', 'L249Y'], ['R219S', 'L249Y']] - """ - doubles = [] - arr_pos = [int(substitution[0][1:-1]) for substitution in arr] - arr_pos, arr = zip(*sorted(zip(arr_pos, arr), key=lambda x: x[0])) - for i in tqdm(range(len(arr))): - for j in range(len(arr)): - if j > i: - if (arr[i][0])[1:-1] != (arr[j][0])[1:-1]: - doubles.append([arr[i][0], arr[j][0]]) - if len(doubles) >= 8E04: - yield doubles - doubles = [] - yield doubles - - -def make_recombinations_triple(arr: list): - """ - Description - ----------- - Make triple recombinant variants. 
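A usage sketch for plot_y_true_vs_y_pred with made-up values; hybrid=True labels the scatter with Spearman's rho only, and the plot is written as <name>DCA_Hybrid_Model_Performance.png:

import numpy as np
from pypef.utils.plot import plot_y_true_vs_y_pred

y_true = np.array([1.0, 1.4, 0.7, 2.1])
y_pred = np.array([1.1, 1.3, 0.9, 1.8])
variants = np.array(['L215F', 'A217N', 'R219S', 'L249Y'])
plot_y_true_vs_y_pred(y_true, y_pred, variants, label=True, hybrid=True, name='TEST_')
# writes TEST_DCA_Hybrid_Model_Performance.png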
- - Parameters - ---------- - arr: list - List of single substitutions in tuple, e.g., - (['L215F'], ['A217N'], ['R219S'], ['L249Y']) - - Returns - ------- - triples: list - List of triple substitution lists, e.g., - [['L215F', 'A217N', 'R219S'], ['L215F', 'A217N', 'L249Y'], - ['L215F', 'R219S', 'L249Y'], ['A217N', 'R219S', 'L249Y']] - """ - length = len(arr) - arr_pos = [int(substitution[0][1:-1]) for substitution in arr] - arr_pos, arr = zip(*sorted(zip(arr_pos, arr), key=lambda x: x[0])) - triples = [] - for i in tqdm(range(length)): - for j in range(length): - for k in range(length): - if k > j > i: - if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] and \ - (arr[i][0])[1:-1] != (arr[k][0])[1:-1] and \ - (arr[j][0])[1:-1] != (arr[k][0])[1:-1]: - triples.append([arr[i][0], arr[j][0], arr[k][0]]) - if len(triples) >= 8E04: - yield triples - triples = [] - yield triples - - -def make_recombinations_quadruple(arr): - """ - Description - ----------- - Make quadruple recombination variants. - - Parameters - ---------- - arr: list - List of single substitutions in tuple, e.g., - (['L215F'], ['A217N'], ['R219S'], ['L249Y']) - - Returns - ------- - quadruples: list - List of quadruple substitution lists, e.g., - [['L215F', 'A217N', 'R219S', 'L249Y']] - """ - length = len(arr) - arr_pos = [int(substitution[0][1:-1]) for substitution in arr] - arr_pos, arr = zip(*sorted(zip(arr_pos, arr), key=lambda x: x[0])) - quadruples = [] - for i in tqdm(range(length)): - for j in range(length): - for k in range(length): - for l in range(length): - if l > k > j > i: - if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] and \ - (arr[i][0])[1:-1] != (arr[k][0])[1:-1] and \ - (arr[i][0])[1:-1] != (arr[l][0])[1:-1] and \ - (arr[j][0])[1:-1] != (arr[k][0])[1:-1] and \ - (arr[j][0])[1:-1] != (arr[l][0])[1:-1] and \ - (arr[k][0])[1:-1] != (arr[l][0])[1:-1]: - quadruples.append([arr[i][0], arr[j][0], arr[k][0], arr[l][0]]) - if len(quadruples) >= 8E04: - yield quadruples - quadruples = [] - yield quadruples - - -def make_recombinations_quintuple(arr): - """ - Make quintuple recombination variants. 
- - :parameter arr: List(s) of all available single substitution(s) - in tuple, e.g., - (['L215F'], ['A217N'], ['R219S'], ['L249Y'], ['P252I']) - - :returns quintuples: List of quintuple substitution lists, e.g., - [['L215F', 'A217N', 'R219S', 'L249Y', 'P252I']] - """ - length = len(arr) - arr_pos = [int(substitution[0][1:-1]) for substitution in arr] - arr_pos, arr = zip(*sorted(zip(arr_pos, arr), key=lambda x: x[0])) - quintuples = [] - for i in tqdm(range(length)): - for j in range(length): - for k in range(length): - for l in range(length): - for m in range(length): - if m > l > k > j > i: - if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] and \ - (arr[i][0])[1:-1] != (arr[k][0])[1:-1] and \ - (arr[i][0])[1:-1] != (arr[l][0])[1:-1] and \ - (arr[i][0])[1:-1] != (arr[m][0])[1:-1] and \ - (arr[j][0])[1:-1] != (arr[k][0])[1:-1] and \ - (arr[j][0])[1:-1] != (arr[l][0])[1:-1] and \ - (arr[j][0])[1:-1] != (arr[m][0])[1:-1] and \ - (arr[k][0])[1:-1] != (arr[l][0])[1:-1] and \ - (arr[k][0])[1:-1] != (arr[m][0])[1:-1] and \ - (arr[l][0])[1:-1] != (arr[m][0])[1:-1]: - quintuples.append([arr[i][0], arr[j][0], arr[k][0], arr[l][0], arr[m][0]]) - if len(quintuples) >= 8E04: - yield quintuples - quintuples = [] - yield quintuples - - -def make_directory_and_enter(directory): - """ - Makes directory for recombined or diverse prediction sets - """ - previous_working_directory = os.getcwd() - try: - if not os.path.exists(os.path.dirname(directory)): - os.mkdir(directory) - except OSError: - pass - os.chdir(directory) - - return previous_working_directory - - -def create_split_files( - array, - single_variants, - wt_sequence, - name, - no -): - """ - Creates split files from given variants for yielded recombined or diverse variants. - """ - if len(array) > 0: - number_of_split_files = len(array) / (len(single_variants) * 20 ** 3) - number_of_split_files = round(number_of_split_files) - if number_of_split_files == 0: - number_of_split_files += 1 - split = np.array_split(array, number_of_split_files) - pwd = make_directory_and_enter(name + '_Split') - for i in split: - name_ = name + '_Split' + str(no) + '.fasta' - make_fasta_ps(name_, wt_sequence, i) - - os.chdir(pwd) - - return () - - -def make_ssm_singles(wt_seq, aminoacids): - """ - Making diverse single-saturation mutagenesis dataset, i.e., all - 19 amino acid substitutions at each wild-type sequence position. - """ - ssm_singles = [] - for i, aa_wt in enumerate(wt_seq): - for aa in aminoacids: - if aa_wt != aa: - ssm_singles.append([f'{aa_wt}{i+1}{aa}']) - return ssm_singles - - -def make_combinations_double_all_diverse(arr, aminoacids): - """ - Make double substituted naturally diverse variants - - :parameter arr: List of single substitutions in tuple, e.g., - (['L215F'], ['A217N'], ['R219S'], ['L249Y']) - :parameter aminoacids: List of amino acids to combine, e.g., all 20 naturally occuring: - ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', - 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] - - :returns doubles: List of double (fully) diverse substitution tuples, e.g., - [('L215A', 'A217C'), ('L215A', 'A217D'), ('L215A', 'A217E'), ('L215A', 'A217F'), ..., - ('R219Y', 'L249T'), ('R219Y', 'L249V'), ('R219Y', 'L249W'), ('R219Y', 'L249Y')] - """ - doubles = [] - for i in tqdm(range(len(arr))): - for j in range(i + 1, len(arr)): - for k in aminoacids: - for l in aminoacids: - """ - Make sure that following substitution types are not - included for prediction. Examples: - 1. 
Both simultaneous substitutions define exactly the - same substitution at the same position, e.g., A1C/A1C: - (arr[i][0])[1:-1] != (arr[j][0])[1:-1] - 2. "To-Wild-Type-Substitutions" at a single position e.g., A1A: - ((arr[i][0])[:-1] + k)[0] != ((arr[i][0])[:-1] + k)[-1] # e.g., A1A - ((arr[j][0])[:-1] + l)[0] != ((arr[j][0])[:-1] + l)[-1] # e.g., C2C - 3. Just reversed substitution patterns, e.g., A1C/A2D and A2D/A1C - in doubles tuple (only possible until results not emptied/yielded - and should generally not occur often): - not tuple([(arr[j][0])[:-1] + l, (arr[i][0])[:-1] + k]) in doubles - """ - if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] and \ - ((arr[i][0])[:-1] + k)[0] != ((arr[i][0])[:-1] + k)[-1] and \ - ((arr[j][0])[:-1] + l)[0] != ((arr[j][0])[:-1] + l)[-1] and \ - not tuple([(arr[j][0])[:-1] + l, (arr[i][0])[:-1] + k]) in doubles: - doubles.append(tuple([(arr[i][0])[:-1] + k, (arr[j][0])[:-1] + l])) # tuple needed for - if len(doubles) >= 8E04: # list(dict()): - doubles = list(dict.fromkeys(doubles)) # removes duplicated list entries - yield doubles - doubles = [] - doubles = list(dict.fromkeys(doubles)) - yield doubles - - -def make_combinations_double_all_diverse_and_all_positions(wt_seq, aminoacids): - """ - Make double substituted naturally diverse variants - - :parameter arr: List of single substitutions in tuple, e.g., - (['L215F'], ['A217N'], ['R219S'], ['L249Y']) - :parameter aminoacids: List of amino acids to combine, e.g., all 20 naturally occuring: - ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', - 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] - - :returns doubles: List of double (fully) diverse substitution lists, e.g., - [('L215A', 'A217C'), ('L215A', 'A217D'), ('L215A', 'A217E'), ('L215A', 'A217F'), ..., - ('R219Y', 'L249T'), ('R219Y', 'L249V'), ('R219Y', 'L249W'), ('R219Y', 'L249Y')] - """ - counter = 0 - doubles = [] - for i in tqdm(range(len(wt_seq))): - for j in range(i + 1, len(wt_seq)): - for k in aminoacids: - pos_1 = wt_seq[i] + str(i + 1) + str(k) - for l in aminoacids: - pos_2 = wt_seq[j] + str(j + 1) + str(l) - if pos_1[0] != pos_1[-1] \ - and pos_2[0] != pos_2[-1] \ - and pos_1[1:-1] != pos_2[1:-1]: - doubles.append(tuple([pos_1, pos_2])) # tuple needed for - if len(doubles) >= 8E04: # list(dict()): - doubles = list(dict.fromkeys(doubles)) # removes duplicated list entries - counter += len(doubles) - yield doubles - doubles = [] - doubles = list(dict.fromkeys(doubles)) - yield doubles - - -def make_combinations_triple_all_diverse(arr, aminoacids): - """ - Make triple substituted naturally diverse variants. - Analogous to function "make_combinations_double_all_diverse" - but yielding three combined substitutions. 
- """ - triples = [] - for i in tqdm(range(len(arr))): - for j in range(i + 1, len(arr)): - for k in range(j + 1, len(arr)): - for l in aminoacids: - for m in aminoacids: - for n in aminoacids: - if ((arr[i][0])[1:-1]) != ((arr[j][0])[1:-1]) != ((arr[k][0])[1:-1]) and\ - ((arr[i][0])[:-1] + l)[0] != ((arr[i][0])[:-1] + l)[-1] and\ - ((arr[j][0])[:-1] + m)[0] != ((arr[j][0])[:-1] + m)[-1] and\ - ((arr[k][0])[:-1] + n)[0] != ((arr[k][0])[:-1] + n)[-1]: - triples.append(tuple([(arr[i][0])[:-1] + l, (arr[j][0])[:-1] + m, - (arr[k][0])[:-1] + n])) - if len(triples) >= 8E04: - triples = list(dict.fromkeys(triples)) # transfer to dict and back to list - yield triples - triples = [] - triples = list(dict.fromkeys(triples)) - yield triples - - -def make_combinations_quadruple_all_diverse(arr, aminoacids): - """ - Make quadruple substituted naturally diverse variants. - Analogous to function "make_combinations_double_all_diverse" - but yielding four combined substitutions. - """ - quadruples = [] - for i in tqdm(range(len(arr))): - for j in range(i + 1, len(arr)): - for k in range(j + 1, len(arr)): - for l in range(k + 1, len(arr)): - for m in aminoacids: - for n in aminoacids: - for o in aminoacids: - for p in aminoacids: - if ((arr[i][0])[1:-1]) \ - != ((arr[j][0])[1:-1]) \ - != ((arr[k][0])[1:-1]) \ - != ((arr[l][0])[1:-1]) \ - and\ - ((arr[i][0])[:-1] + m)[0] != ((arr[i][0])[:-1] + m)[-1] and\ - ((arr[j][0])[:-1] + n)[0] != ((arr[j][0])[:-1] + n)[-1] and\ - ((arr[k][0])[:-1] + o)[0] != ((arr[k][0])[:-1] + o)[-1] and\ - ((arr[l][0])[:-1] + p)[0] != ((arr[l][0])[:-1] + p)[-1]: - quadruples.append(tuple([(arr[i][0])[:-1] + m, (arr[j][0])[:-1] + n, - (arr[k][0])[:-1] + o, (arr[l][0])[:-1] + p])) - if len(quadruples) >= 8E04: - quadruples = list(dict.fromkeys(quadruples)) # transfer to dict - yield quadruples # and back to list - quadruples = [] - quadruples = list(dict.fromkeys(quadruples)) - yield quadruples - - -if __name__ == '__main__': - k = list(make_recombinations_quintuple(( - ['A86V'], ['T91S'], ['M108Q'], ['A109E'], ['T111P'], ['A86S'], ['T91E'], ['M108L'], ['A109S'], ['T111G'], - ['M108R'], ['T111N'], ['T91V'], ['M108T'], ['A109G'], ['T111F'], ['T91A'], ['A109M'], ['A86D'], ['T91R'], - ['A109K'], ['T111D'], ['T91Q'], ['A109V'], ['T111S'], ['A86C'], ['T91L'], ['A109T'], ['M108S'], ['A109F'], - ['T111L'], ['A86T'], ['A109Q'], ['M108A'], ['A109P'], ['T111Q'], ['A86N'], ['T91Y'], ['A109L'], ['T111A'], - ['T91F'], ['A109Y'], ['A86I'], ['A109D'], ['M108K'], ['M108I'], ['T91N'], ['T111C'], ['T91M'], ['T91C'], - ['M108P'], ['T111M'], ['T91H'], ['M108C'], ['M108F'], ['M108G'], ['A109N'], ['M108E'], ['A109W'], ['M108W'], - ['A109I'], ['T91P'], ['M108H'], ['T91D'], ['A109R'], ['T91I'], ['M108Y'], ['T91G'], ['T91W'], ['A86R'], - ['T91K'], ['T111Y'], ['M108D'], ['A86W'], ['M108V'], ['T111I'], ['M108N'], ['A109C'], ['A109H'] - ))) - for i, k_ in enumerate(k): - print(i + 1, np.shape(k_)) - # (10 * 80,000 (* 5)) + (1 * 2503 (* 5)) +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# 
https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +""" +Modules for making prediction files +from input CSV file (with value separators sep=',' or sep=';') +having the CSV-format + +HEADER_VARIANTS;HEADER_VARIANTS_FITNESS +VARIANT_1;FITNESS_VALUE_1 +VARIANT_2;FITNESS_VALUE_2 +... + +according to the self devised prediction set convention +> VARIANT_NAME_1 +VARIANT_SEQUENCE_1 +> VARIANT_NAME_2 +VARIANT_SEQUENCE_2 +... +""" + + +import os +import numpy as np +from tqdm import tqdm + + +def make_fasta_ps( + filename, + wt, + substitutions +): + """ + Creates prediction sets (.fasta style files, i.e. without fitness values). + """ + myfile = open(filename, 'w') + count = 0 + for i, var in enumerate(substitutions): + temporary = list(wt) + name = '' + separation = 0 + for single_var in var: + position_index = int(str(single_var)[1:-1]) - 1 + new_amino_acid = str(single_var)[-1] + temporary[position_index] = new_amino_acid + if separation == 0: + name += single_var + else: + name += '/' + single_var + separation += 1 + print('>', name, file=myfile) + print(''.join(temporary), file=myfile) + count += 1 + myfile.close() + + +def make_recombinations_double(arr: tuple) -> list: + """ + Description + ----------- + Make double recombinant variants. + + Parameters + ---------- + arr : tuple + Lists if single substitutions in tuple, e.g., + (['L215F'], ['A217N'], ['R219S'], ['L249Y']) + + Returns + ------- + doubles : list + List of double substitution lists, e.g., + [['L215F', 'A217N'], ['L215F', 'R219S'], ['L215F', 'L249Y'], + ['A217N', 'R219S'], ['A217N', 'L249Y'], ['R219S', 'L249Y']] + """ + doubles = [] + arr_pos = [int(substitution[0][1:-1]) for substitution in arr] + arr_pos, arr = zip(*sorted(zip(arr_pos, arr), key=lambda x: x[0])) + for i in tqdm(range(len(arr))): + for j in range(len(arr)): + if j > i: + if (arr[i][0])[1:-1] != (arr[j][0])[1:-1]: + doubles.append([arr[i][0], arr[j][0]]) + if len(doubles) >= 8E04: + yield doubles + doubles = [] + yield doubles + + +def make_recombinations_triple(arr: list): + """ + Description + ----------- + Make triple recombinant variants. + + Parameters + ---------- + arr: list + List of single substitutions in tuple, e.g., + (['L215F'], ['A217N'], ['R219S'], ['L249Y']) + + Returns + ------- + triples: list + List of triple substitution lists, e.g., + [['L215F', 'A217N', 'R219S'], ['L215F', 'A217N', 'L249Y'], + ['L215F', 'R219S', 'L249Y'], ['A217N', 'R219S', 'L249Y']] + """ + length = len(arr) + arr_pos = [int(substitution[0][1:-1]) for substitution in arr] + arr_pos, arr = zip(*sorted(zip(arr_pos, arr), key=lambda x: x[0])) + triples = [] + for i in tqdm(range(length)): + for j in range(length): + for k in range(length): + if k > j > i: + if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] and \ + (arr[i][0])[1:-1] != (arr[k][0])[1:-1] and \ + (arr[j][0])[1:-1] != (arr[k][0])[1:-1]: + triples.append([arr[i][0], arr[j][0], arr[k][0]]) + if len(triples) >= 8E04: + yield triples + triples = [] + yield triples + + +def make_recombinations_quadruple(arr): + """ + Description + ----------- + Make quadruple recombination variants. 
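Note that despite the -> list annotation, the make_recombinations_* functions are generators that yield chunks of at most 80,000 variant lists; a consumption sketch:

from pypef.utils.prediction_sets import make_recombinations_double

singles = (['L215F'], ['A217N'], ['R219S'], ['L249Y'])
for chunk in make_recombinations_double(singles):
    print(len(chunk), chunk[:2])
# -> 6 [['L215F', 'A217N'], ['L215F', 'R219S']]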
+ + Parameters + ---------- + arr: list + List of single substitutions in tuple, e.g., + (['L215F'], ['A217N'], ['R219S'], ['L249Y']) + + Returns + ------- + quadruples: list + List of quadruple substitution lists, e.g., + [['L215F', 'A217N', 'R219S', 'L249Y']] + """ + length = len(arr) + arr_pos = [int(substitution[0][1:-1]) for substitution in arr] + arr_pos, arr = zip(*sorted(zip(arr_pos, arr), key=lambda x: x[0])) + quadruples = [] + for i in tqdm(range(length)): + for j in range(length): + for k in range(length): + for l in range(length): + if l > k > j > i: + if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] and \ + (arr[i][0])[1:-1] != (arr[k][0])[1:-1] and \ + (arr[i][0])[1:-1] != (arr[l][0])[1:-1] and \ + (arr[j][0])[1:-1] != (arr[k][0])[1:-1] and \ + (arr[j][0])[1:-1] != (arr[l][0])[1:-1] and \ + (arr[k][0])[1:-1] != (arr[l][0])[1:-1]: + quadruples.append([arr[i][0], arr[j][0], arr[k][0], arr[l][0]]) + if len(quadruples) >= 8E04: + yield quadruples + quadruples = [] + yield quadruples + + +def make_recombinations_quintuple(arr): + """ + Make quintuple recombination variants. + + :parameter arr: List(s) of all available single substitution(s) + in tuple, e.g., + (['L215F'], ['A217N'], ['R219S'], ['L249Y'], ['P252I']) + + :returns quintuples: List of quintuple substitution lists, e.g., + [['L215F', 'A217N', 'R219S', 'L249Y', 'P252I']] + """ + length = len(arr) + arr_pos = [int(substitution[0][1:-1]) for substitution in arr] + arr_pos, arr = zip(*sorted(zip(arr_pos, arr), key=lambda x: x[0])) + quintuples = [] + for i in tqdm(range(length)): + for j in range(length): + for k in range(length): + for l in range(length): + for m in range(length): + if m > l > k > j > i: + if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] and \ + (arr[i][0])[1:-1] != (arr[k][0])[1:-1] and \ + (arr[i][0])[1:-1] != (arr[l][0])[1:-1] and \ + (arr[i][0])[1:-1] != (arr[m][0])[1:-1] and \ + (arr[j][0])[1:-1] != (arr[k][0])[1:-1] and \ + (arr[j][0])[1:-1] != (arr[l][0])[1:-1] and \ + (arr[j][0])[1:-1] != (arr[m][0])[1:-1] and \ + (arr[k][0])[1:-1] != (arr[l][0])[1:-1] and \ + (arr[k][0])[1:-1] != (arr[m][0])[1:-1] and \ + (arr[l][0])[1:-1] != (arr[m][0])[1:-1]: + quintuples.append([arr[i][0], arr[j][0], arr[k][0], arr[l][0], arr[m][0]]) + if len(quintuples) >= 8E04: + yield quintuples + quintuples = [] + yield quintuples + + +def make_directory_and_enter(directory): + """ + Makes directory for recombined or diverse prediction sets. + """ + previous_working_directory = os.getcwd() + try: + if not os.path.exists(os.path.dirname(directory)): + os.mkdir(directory) + except OSError: + pass + os.chdir(directory) + + return previous_working_directory + + +def create_split_files( + array, + single_variants, + wt_sequence, + name, + no +): + """ + Creates split files from given variants for yielded recombined or diverse variants. + """ + if len(array) > 0: + number_of_split_files = len(array) / (len(single_variants) * 20 ** 3) + number_of_split_files = round(number_of_split_files) + if number_of_split_files == 0: + number_of_split_files += 1 + split = np.array_split(array, number_of_split_files) + pwd = make_directory_and_enter(name + '_Split') + for i in split: + name_ = name + '_Split' + str(no) + '.fasta' + make_fasta_ps(name_, wt_sequence, i) + + os.chdir(pwd) + + return () + + +def make_ssm_singles(wt_seq, aminoacids): + """ + Making diverse single-saturation mutagenesis dataset, i.e., all + 19 amino acid substitutions at each wild-type sequence position. 
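The yielded chunks are typically written out via create_split_files; a sketch reusing single substitutions from the module's former __main__ demo (the wild-type sequence here is a placeholder and only needs to cover the highest substituted position):

from pypef.utils.prediction_sets import make_recombinations_quintuple, create_split_files

singles = (['A86V'], ['T91S'], ['M108Q'], ['A109E'], ['T111P'])
wt_seq = 'M' * 120  # placeholder sequence of sufficient length
for no, chunk in enumerate(make_recombinations_quintuple(singles), start=1):
    create_split_files(chunk, singles, wt_seq, 'Recomb', no)
# writes Recomb_Split/Recomb_Split1.fasta (a single quintuple here)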
+    """
+    ssm_singles = []
+    for i, aa_wt in enumerate(wt_seq):
+        for aa in aminoacids:
+            if aa_wt != aa:
+                ssm_singles.append([f'{aa_wt}{i+1}{aa}'])
+    return ssm_singles
+
+
+def make_combinations_double_all_diverse(arr, aminoacids):
+    """
+    Make double substituted naturally diverse variants.
+
+    :parameter arr: List of single substitutions in tuple, e.g.,
+        (['L215F'], ['A217N'], ['R219S'], ['L249Y'])
+    :parameter aminoacids: List of amino acids to combine, e.g., all 20 naturally occurring:
+        ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
+         'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
+
+    :returns doubles: List of double (fully) diverse substitution tuples, e.g.,
+        [('L215A', 'A217C'), ('L215A', 'A217D'), ('L215A', 'A217E'), ('L215A', 'A217F'), ...,
+         ('R219Y', 'L249T'), ('R219Y', 'L249V'), ('R219Y', 'L249W'), ('R219Y', 'L249Y')]
+    """
+    doubles = []
+    for i in tqdm(range(len(arr))):
+        for j in range(i + 1, len(arr)):
+            for k in aminoacids:
+                for l in aminoacids:
+                    """
+                    Make sure that the following substitution types are not
+                    included for prediction. Examples:
+                    1. Both simultaneous substitutions define exactly the
+                       same substitution at the same position, e.g., A1C/A1C:
+                       (arr[i][0])[1:-1] != (arr[j][0])[1:-1]
+                    2. "To-Wild-Type-Substitutions" at a single position, e.g., A1A:
+                       ((arr[i][0])[:-1] + k)[0] != ((arr[i][0])[:-1] + k)[-1]  # e.g., A1A
+                       ((arr[j][0])[:-1] + l)[0] != ((arr[j][0])[:-1] + l)[-1]  # e.g., C2C
+                    3. Just reversed substitution patterns, e.g., A1C/A2D and A2D/A1C
+                       in the doubles tuple (only possible as long as results have not
+                       been emptied/yielded and should generally occur rarely):
+                       not tuple([(arr[j][0])[:-1] + l, (arr[i][0])[:-1] + k]) in doubles
+                    """
+                    if (arr[i][0])[1:-1] != (arr[j][0])[1:-1] and \
+                            ((arr[i][0])[:-1] + k)[0] != ((arr[i][0])[:-1] + k)[-1] and \
+                            ((arr[j][0])[:-1] + l)[0] != ((arr[j][0])[:-1] + l)[-1] and \
+                            not tuple([(arr[j][0])[:-1] + l, (arr[i][0])[:-1] + k]) in doubles:
+                        doubles.append(tuple([(arr[i][0])[:-1] + k, (arr[j][0])[:-1] + l]))  # tuples (hashable) are
+                        if len(doubles) >= 8E04:                                             # needed for list(dict())
+                            doubles = list(dict.fromkeys(doubles))  # removes duplicated list entries
+                            yield doubles
+                            doubles = []
+    doubles = list(dict.fromkeys(doubles))
+    yield doubles
+
+
+def make_combinations_double_all_diverse_and_all_positions(wt_seq, aminoacids):
+    """
+    Make double substituted naturally diverse variants at all
+    wild-type sequence positions.
+
+    :parameter wt_seq: Wild-type sequence as string, defining the
+        positions and original amino acids for substitution
+    :parameter aminoacids: List of amino acids to combine, e.g., all 20 naturally occurring:
+        ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
+         'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
+
+    :returns doubles: List of double (fully) diverse substitution tuples, e.g.,
+        [('L215A', 'A217C'), ('L215A', 'A217D'), ('L215A', 'A217E'), ('L215A', 'A217F'), ...,
+         ('R219Y', 'L249T'), ('R219Y', 'L249V'), ('R219Y', 'L249W'), ('R219Y', 'L249Y')]
+    """
+    counter = 0
+    doubles = []
+    for i in tqdm(range(len(wt_seq))):
+        for j in range(i + 1, len(wt_seq)):
+            for k in aminoacids:
+                pos_1 = wt_seq[i] + str(i + 1) + str(k)
+                for l in aminoacids:
+                    pos_2 = wt_seq[j] + str(j + 1) + str(l)
+                    if pos_1[0] != pos_1[-1] \
+                            and pos_2[0] != pos_2[-1] \
+                            and pos_1[1:-1] != pos_2[1:-1]:
+                        doubles.append(tuple([pos_1, pos_2]))  # tuples (hashable) are
+                        if len(doubles) >= 8E04:               # needed for list(dict())
+                            doubles = list(dict.fromkeys(doubles))  # removes duplicated list entries
+                            counter += len(doubles)
+                            yield doubles
+                            doubles = []
+    doubles = list(dict.fromkeys(doubles))
+    yield doubles
+
+
+def make_combinations_triple_all_diverse(arr, aminoacids):
+    """
+    Make triple substituted naturally diverse variants.
+    Analogous to function "make_combinations_double_all_diverse"
+    but yielding three combined substitutions.
+    """
+    triples = []
+    for i in tqdm(range(len(arr))):
+        for j in range(i + 1, len(arr)):
+            for k in range(j + 1, len(arr)):
+                for l in aminoacids:
+                    for m in aminoacids:
+                        for n in aminoacids:
+                            if ((arr[i][0])[1:-1]) != ((arr[j][0])[1:-1]) != ((arr[k][0])[1:-1]) and\
+                                    ((arr[i][0])[:-1] + l)[0] != ((arr[i][0])[:-1] + l)[-1] and\
+                                    ((arr[j][0])[:-1] + m)[0] != ((arr[j][0])[:-1] + m)[-1] and\
+                                    ((arr[k][0])[:-1] + n)[0] != ((arr[k][0])[:-1] + n)[-1]:
+                                triples.append(tuple([(arr[i][0])[:-1] + l, (arr[j][0])[:-1] + m,
+                                                      (arr[k][0])[:-1] + n]))
+                                if len(triples) >= 8E04:
+                                    triples = list(dict.fromkeys(triples))  # transfer to dict and back to list
+                                    yield triples
+                                    triples = []
+    triples = list(dict.fromkeys(triples))
+    yield triples
+
+
+def make_combinations_quadruple_all_diverse(arr, aminoacids):
+    """
+    Make quadruple substituted naturally diverse variants.
+    Analogous to function "make_combinations_double_all_diverse"
+    but yielding four combined substitutions.
+    """
+    quadruples = []
+    for i in tqdm(range(len(arr))):
+        for j in range(i + 1, len(arr)):
+            for k in range(j + 1, len(arr)):
+                for l in range(k + 1, len(arr)):
+                    for m in aminoacids:
+                        for n in aminoacids:
+                            for o in aminoacids:
+                                for p in aminoacids:
+                                    if ((arr[i][0])[1:-1]) \
+                                            != ((arr[j][0])[1:-1]) \
+                                            != ((arr[k][0])[1:-1]) \
+                                            != ((arr[l][0])[1:-1]) \
+                                            and\
+                                            ((arr[i][0])[:-1] + m)[0] != ((arr[i][0])[:-1] + m)[-1] and\
+                                            ((arr[j][0])[:-1] + n)[0] != ((arr[j][0])[:-1] + n)[-1] and\
+                                            ((arr[k][0])[:-1] + o)[0] != ((arr[k][0])[:-1] + o)[-1] and\
+                                            ((arr[l][0])[:-1] + p)[0] != ((arr[l][0])[:-1] + p)[-1]:
+                                        quadruples.append(tuple([(arr[i][0])[:-1] + m, (arr[j][0])[:-1] + n,
+                                                                 (arr[k][0])[:-1] + o, (arr[l][0])[:-1] + p]))
+                                        if len(quadruples) >= 8E04:
+                                            quadruples = list(dict.fromkeys(quadruples))  # transfer to dict
+                                            yield quadruples                              # and back to list
+                                            quadruples = []
+    quadruples = list(dict.fromkeys(quadruples))
+    yield quadruples
diff --git a/pypef/utils/sto2a2m.py b/pypef/utils/sto2a2m.py
index 51efd49..f13b5b2 100644
--- a/pypef/utils/sto2a2m.py
+++ b/pypef/utils/sto2a2m.py
@@ -1,95 +1,95 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# Created on 05 October 2020
-# @authors: Niklas Siedhoff, Alexander-Maurice Illig
-# @contact:
-# PyPEF - Pythonic Protein Engineering Framework
-# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0)
-# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode
-
-# PyPEF – An Integrated Framework for Data-Driven Protein Engineering
-# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476
-# https://doi.org/10.1021/acs.jcim.1c00099
-# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,*
-# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany
-# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany
-# *Corresponding author
-# §Equal contribution
-
-import logging
-logger = logging.getLogger('pypef.utils.sto2a2m')
-
-import numpy as np
-from tqdm import tqdm
-from Bio import AlignIO
-
-
-def convert_sto2a2m(
-    sto_file: str,
-    inter_gap: float,
-    intra_gap: float
-):
-    """
-    Converts alignment in format STO to A2M format.
- Removes specific sequences with inter and/or intra gaps - over specific thresholds. More alignment type transferring - options can be performed using the reformat.pl script of the - HH-suite; (c) Johannes Soeding, 2012: - https://github.com/soedinglab/hh-suite/blob/master/scripts/reformat.pl - """ - # Generate the a2m output filename - a2m_file = f"{sto_file.split('.sto')[0]}.a2m" - - # Load the stockholm alignment - logger.info('Loading MSA in stockholm format...') - sto_alignment = AlignIO.read(sto_file, 'stockholm') - logger.info('Trimming MSA...') - # Save this 'raw' multiple sequence alignment as numpy array - raw_msa = [] - for record in tqdm(sto_alignment): - raw_msa.append(np.array(record.seq)) - raw_msa = np.array(raw_msa) - - # 1st processing step - # Delete all positions, where WT has a gap to obtain the 'trimmed' MSA - ungap_pos = np.where(raw_msa[0] == "-") - msa_trimmed = np.array([np.delete(seq, ungap_pos) for seq in raw_msa]) - - # 2nd processing step - # Remove ("lower") all positions with more than 'inter_gap'*100 % gaps (columnar trimming) - count_gaps = np.count_nonzero(msa_trimmed == '-', axis=0) / msa_trimmed.shape[0] - lower = [idx for idx, count in enumerate(count_gaps) if count > inter_gap] - msa_trimmed_T = msa_trimmed.T - for idx in lower: - msa_trimmed_T[idx] = np.char.lower(msa_trimmed_T[idx]) - # replace all columns that are "removed" due to high gap content and have an "-" element by "." - msa_trimmed_T[idx] = np.where(msa_trimmed_T[idx] == '-', '.', msa_trimmed_T[idx]) - msa_trimmed_inter_gap = msa_trimmed_T.T - - # 3rd processing step - # Remove all sequences with more than 'intra_gap'*100 % gaps (line trimming) - target_len = len(msa_trimmed_inter_gap[0]) - gap_content = (np.count_nonzero(msa_trimmed_inter_gap == "-", axis=1) + np.count_nonzero( - msa_trimmed_inter_gap == ".", axis=1)) / target_len - delete = np.where(gap_content > intra_gap)[0] - msa_final = np.delete(msa_trimmed_inter_gap, delete, axis=0) - seqs_cls = [seq_cls for idx, seq_cls in enumerate(sto_alignment) if not idx in delete] - chunk_size = 60 - with open(a2m_file, 'w') as f: - for i, (seq, seq_cls) in enumerate(zip(msa_final, seqs_cls)): - if i == 0: - f.write(f'>TARGET_SEQ\n') - else: - f.write('>' + seq_cls.id + '\n') - for chunk in [seq[x:x + chunk_size] for x in range(0, len(seq), chunk_size)]: - f.write("".join(chunk) + '\n') - - # Get number of sequences and effective sites in the alignment - n_seqs = msa_final.shape[0] - n_sites = sum(1 for char in msa_final[0] if char.isupper()) - logger.info(f'Generated trimmed MSA {a2m_file} in A2M format:\n' - f'No. of sequences: {n_seqs}\n' - f'No. of effective sites: {n_sites} (out of {target_len} sites)\n' - f'-le --lambdae: {0.2 * (n_sites - 1):.1f}') - - return n_seqs, n_sites, target_len +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. 
Davari1,*
+# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany
+# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany
+# *Corresponding author
+# §Equal contribution
+
+import logging
+logger = logging.getLogger('pypef.utils.sto2a2m')
+
+import numpy as np
+from tqdm import tqdm
+from Bio import AlignIO
+
+
+def convert_sto2a2m(
+    sto_file: str,
+    inter_gap: float,
+    intra_gap: float
+):
+    """
+    Converts an alignment in STO (Stockholm) format to A2M format.
+    Drops alignment columns and sequences whose gap content exceeds
+    the given 'inter_gap' (columnar) and 'intra_gap' (per-sequence)
+    thresholds. Further alignment format conversions can be performed
+    using the reformat.pl script of the HH-suite; (c) Johannes Soeding, 2012:
+    https://github.com/soedinglab/hh-suite/blob/master/scripts/reformat.pl
+    """
+    # Generate the a2m output filename
+    a2m_file = f"{sto_file.split('.sto')[0]}.a2m"
+
+    # Load the Stockholm alignment
+    logger.info('Loading MSA in Stockholm format...')
+    sto_alignment = AlignIO.read(sto_file, 'stockholm')
+    logger.info('Trimming MSA...')
+    # Save this 'raw' multiple sequence alignment as numpy array
+    raw_msa = []
+    for record in tqdm(sto_alignment):
+        raw_msa.append(np.array(record.seq))
+    raw_msa = np.array(raw_msa)
+
+    # 1st processing step
+    # Delete all positions, where WT has a gap to obtain the 'trimmed' MSA
+    ungap_pos = np.where(raw_msa[0] == "-")
+    msa_trimmed = np.array([np.delete(seq, ungap_pos) for seq in raw_msa])
+
+    # 2nd processing step
+    # Remove ("lower") all positions with more than 'inter_gap'*100 % gaps (columnar trimming)
+    count_gaps = np.count_nonzero(msa_trimmed == '-', axis=0) / msa_trimmed.shape[0]
+    lower = [idx for idx, count in enumerate(count_gaps) if count > inter_gap]
+    msa_trimmed_T = msa_trimmed.T
+    for idx in lower:
+        msa_trimmed_T[idx] = np.char.lower(msa_trimmed_T[idx])
+        # replace all columns that are "removed" due to high gap content and have an "-" element by "."
+        msa_trimmed_T[idx] = np.where(msa_trimmed_T[idx] == '-', '.', msa_trimmed_T[idx])
+    msa_trimmed_inter_gap = msa_trimmed_T.T
+
+    # 3rd processing step
+    # Remove all sequences with more than 'intra_gap'*100 % gaps (line trimming)
+    target_len = len(msa_trimmed_inter_gap[0])
+    gap_content = (np.count_nonzero(msa_trimmed_inter_gap == "-", axis=1) + np.count_nonzero(
+        msa_trimmed_inter_gap == ".", axis=1)) / target_len
+    delete = np.where(gap_content > intra_gap)[0]
+    msa_final = np.delete(msa_trimmed_inter_gap, delete, axis=0)
+    seqs_cls = [seq_cls for idx, seq_cls in enumerate(sto_alignment) if not idx in delete]
+    chunk_size = 60
+    with open(a2m_file, 'w') as f:
+        for i, (seq, seq_cls) in enumerate(zip(msa_final, seqs_cls)):
+            if i == 0:
+                f.write('>TARGET_SEQ\n')
+            else:
+                f.write('>' + seq_cls.id + '\n')
+            for chunk in [seq[x:x + chunk_size] for x in range(0, len(seq), chunk_size)]:
+                f.write("".join(chunk) + '\n')
+
+    # Get number of sequences and effective sites in the alignment
+    n_seqs = msa_final.shape[0]
+    n_sites = sum(1 for char in msa_final[0] if char.isupper())
+    logger.info(f'Generated trimmed MSA {a2m_file} in A2M format:\n'
+                f'No. of sequences: {n_seqs}\n'
+                f'No. 
of effective sites: {n_sites} (out of {target_len} sites)\n' + f'-le --lambdae: {0.2 * (n_sites - 1):.1f}') + + return n_seqs, n_sites, target_len diff --git a/pypef/utils/to_file.py b/pypef/utils/to_file.py index bd8d3f9..9cdb638 100644 --- a/pypef/utils/to_file.py +++ b/pypef/utils/to_file.py @@ -1,46 +1,46 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -import os -import numpy as np - - -def predictions_out( - predictions, - model, - prediction_set, - path: str = '' -): - """ - Writes predictions (of the new sequence space) to text file(s). - """ - name, value = [], [] - for (val, nam) in predictions: - name.append(nam) - value.append('{:f}'.format(val)) - - data = np.array([name, value]).T - col_width = max(len(str(value)) for row in data for value in row) + 5 - - head = ['Name', 'Prediction'] - path_ = os.path.join(path, 'Predictions_' + str(model) + '_' + str(prediction_set.split('.')[0]) + '.txt') - with open(path_, 'w') as f: - f.write("".join(caption.ljust(col_width) for caption in head) + '\n') - f.write(len(head)*col_width*'-' + '\n') - for row in data: - f.write("".join(str(value).ljust(col_width) for value in row) + '\n') +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +import os +import numpy as np + + +def predictions_out( + predictions, + model, + prediction_set, + path: str = '' +): + """ + Writes predictions (of the new sequence space) to text file(s). 
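+
+    A rough sketch of the resulting file layout (variant names and values
+    are made up for illustration; the column width adapts to the longest
+    entry):
+
+        Name             Prediction
+        ---------------------------
+        A123C            1.234567
+        A123C/D234E      0.987654
+
+    The file is written to 'Predictions_<model>_<prediction set name>.txt'.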
+ """ + name, value = [], [] + for (val, nam) in predictions: + name.append(nam) + value.append('{:f}'.format(val)) + + data = np.array([name, value]).T + col_width = max(len(str(value)) for row in data for value in row) + 5 + + head = ['Name', 'Prediction'] + path_ = os.path.join(path, 'Predictions_' + str(model) + '_' + str(prediction_set.split('.')[0]) + '.txt') + with open(path_, 'w') as f: + f.write("".join(caption.ljust(col_width) for caption in head) + '\n') + f.write(len(head)*col_width*'-' + '\n') + for row in data: + f.write("".join(str(value).ljust(col_width) for value in row) + '\n') diff --git a/pypef/utils/utils_run.py b/pypef/utils/utils_run.py index 35a6152..5e661e9 100644 --- a/pypef/utils/utils_run.py +++ b/pypef/utils/utils_run.py @@ -1,352 +1,352 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -import os - -import logging -logger = logging.getLogger('pypef.utils.utils_run') - -import numpy as np -import re - -from pypef.utils.variant_data import ( - amino_acids, generate_dataframe_and_save_csv, - get_basename, read_csv_and_shift_pos_ints, - get_seqs_from_var_name, get_wt_sequence -) - -from pypef.utils.learning_test_sets import ( - csv_input, drop_rows, get_variants, make_sub_ls_ts, - make_sub_ls_ts_randomly, make_fasta_ls_ts -) -from pypef.utils.prediction_sets import ( - make_fasta_ps, make_recombinations_double, make_recombinations_triple, - make_recombinations_quadruple, make_recombinations_quintuple, - create_split_files, make_combinations_double_all_diverse, - make_combinations_triple_all_diverse, make_combinations_quadruple_all_diverse, - make_ssm_singles -) # not yet implemented: make_combinations_double_all_diverse_and_all_positions - -from pypef.utils.directed_evolution import DirectedEvolution -from pypef.utils.sto2a2m import convert_sto2a2m - -from pypef.ml.regression import OneHotEncoding, AAIndexEncoding, full_aaidx_txt_path -from pypef.dca.hybrid_model import plmc_or_gremlin_encoding - - -def run_pypef_utils(arguments): - if arguments['mklsts']: - wt_sequence = get_wt_sequence(arguments['--wt']) - t_drop = float(arguments['--drop']) - - logger.info(f'Length of provided sequence: {len(wt_sequence)} amino acids.') - df = drop_rows(arguments['--input'], amino_acids, t_drop, arguments['--sep'], arguments['--mutation_sep']) - no_rnd = arguments['--numrnd'] - - single_variants, single_values, higher_variants, higher_values = get_variants( - df, amino_acids, wt_sequence, arguments['--mutation_sep'] - ) - logger.info(f'Number of single variants: {len(single_variants)}.') - if len(single_variants) == 0: - logger.info('Found NO single substitution variants for possible recombination!') - sub_ls, 
val_ls, sub_ts, val_ts = make_sub_ls_ts( - single_variants, single_values, higher_variants, higher_values) - logger.info('Tip: You can edit your LS and TS datasets just by ' - 'cutting/pasting between the LS and TS fasta datasets.') - - logger.info('Creating LS dataset...') - make_fasta_ls_ts('LS.fasl', wt_sequence, sub_ls, val_ls) - logger.info('Creating TS dataset...') - make_fasta_ls_ts('TS.fasl', wt_sequence, sub_ts, val_ts) - - try: - no_rnd = int(no_rnd) - except ValueError: - no_rnd = 0 - if no_rnd != 0: - random_set_counter = 1 - no_rnd = int(no_rnd) - while random_set_counter <= no_rnd: - sub_ls, val_ls, sub_ts, val_ts = make_sub_ls_ts_randomly( - single_variants, single_values, - higher_variants, higher_values - ) - make_fasta_ls_ts('LS_random_' + str(random_set_counter) + '.fasl', wt_sequence, sub_ls, val_ls) - make_fasta_ls_ts('TS_random_' + str(random_set_counter) + '.fasl', wt_sequence, sub_ts, val_ts) - random_set_counter += 1 - - elif arguments['mkps']: - wt_sequence = get_wt_sequence(arguments['--wt']) - if not arguments['--ssm']: - try: - csv_file = csv_input(arguments['--input']) - except FileNotFoundError: - raise SystemError("If creating prediction sets ('mkps') a CSV input is " - "required (if not running 'mkps --ssm').") - t_drop = float(arguments['--drop']) - df = drop_rows(csv_file, amino_acids, t_drop) - drop_wt = [] - for i in range(len(df)): - if df.iloc[i, 0] == 'WT': - logger.info('Dropping wild-type (WT) from DataFrame as it cannot be used for (re-)combination.') - drop_wt.append(i) - df = df.drop(drop_wt).reset_index(drop=True) - - logger.info(f'Length of provided sequence: {len(wt_sequence)} amino acids.') - single_variants, _, higher_variants, _ = get_variants(df, amino_acids, wt_sequence) - logger.info(f'Using single substitution variants for (re-)combination. ' - f'Number of single variants: {len(single_variants)}.') - if len(single_variants) == 0: - logger.info('Found NO single substitution variants for possible recombination! ' - 'No prediction files can be created!') - - if arguments['--drecomb']: - logger.info('Creating Recomb_Double_Split...') - for no, files in enumerate(make_recombinations_double(single_variants)): - double_mutants = np.array(files) - create_split_files(double_mutants, single_variants, wt_sequence, 'Recomb_Double', no) - - if arguments['--trecomb']: - logger.info('Creating Recomb_Triple_Split...') - for no, files in enumerate(make_recombinations_triple(single_variants)): - triple_mutants = np.array(files) - create_split_files(triple_mutants, single_variants, wt_sequence, 'Recomb_Triple', no) - - if arguments['--qarecomb']: - logger.info('Beware that this step might require much disk space as PyPEF is ' - 'creating prediction files in TXT format. Creating Recomb_Quadruple_Split...') - for no, files in enumerate(make_recombinations_quadruple(single_variants)): - quadruple_mutants = np.array(files) - create_split_files(quadruple_mutants, single_variants, wt_sequence, 'Recomb_Quadruple', no) - - if arguments['--qirecomb']: - logger.info('Beware that this step might require much disk space as PyPEF is ' - 'creating prediction files in plain text format. 
Creating Recomb_Quintuple_Split...') - for no, files in enumerate(make_recombinations_quintuple(single_variants)): - quintuple_mutants = np.array(files) - create_split_files(quintuple_mutants, single_variants, wt_sequence, 'Recomb_Quintuple', no) - - if arguments['--ddiverse']: - logger.info('Creating Diverse_Double_Split...') - # if functions required, uncomment the next two lines and comment the other ones - # for no, files in enumerate( - # make_recombinations_double_all_diverse_and_all_positions(wt_sequence, amino_acids)): - for no, files in enumerate(make_combinations_double_all_diverse(single_variants, amino_acids)): - doubles = np.array(files) - create_split_files(doubles, single_variants, wt_sequence, 'Diverse_Double', no + 1) - - if arguments['--tdiverse']: - logger.info('Beware that this step might require much disk space as PyPEF is ' - 'creating prediction files in plain text format. Creating Diverse_Triple_Split... ') - for no, files in enumerate(make_combinations_triple_all_diverse(single_variants, amino_acids)): - triples = np.array(files) - create_split_files(triples, single_variants, wt_sequence, 'Diverse_Triple', no + 1) - - if arguments['--qdiverse']: - logger.info('Beware that this step might require much disk space as PyPEF is ' - 'creating prediction files in plain text format. Creating Diverse_Quadruple_Split...') - for no, files in enumerate(make_combinations_quadruple_all_diverse(single_variants, amino_acids)): - quadruples = np.array(files) - create_split_files(quadruples, single_variants, wt_sequence, 'Diverse_Quadruple', no + 1) - - if arguments['--ssm']: - singles = make_ssm_singles(wt_sequence, amino_acids) - make_fasta_ps('ssm_singles.fasta', wt_sequence, np.array(singles)) - - if True not in [ - arguments['--drecomb'], arguments['--trecomb'], arguments['--qarecomb'], - arguments['--qirecomb'], arguments['--ddiverse'], arguments['--tdiverse'], - arguments['--qdiverse'], arguments['--ssm'] - ]: - logger.info(f'\nMaking prediction set fasta file from {csv_file}...\n') - make_fasta_ps( - filename=f'{get_basename(csv_file)}_prediction_set.fasta', - wt=wt_sequence, - substitutions=tuple(list(single_variants) + list(higher_variants)) - ) - - # Metropolis-Hastings-driven directed evolution, similar to Biswas et al.: - # Low-N protein engineering with data-efficient deep learning, - # see https://github.com/ivanjayapurna/low-n-protein-engineering/tree/master/directed-evo - elif arguments['directevo']: - if arguments['hybrid'] or arguments['--encoding'] == 'dca': - dca_encoder = arguments['--params'] - if arguments['ml']: - ml_or_hybrid = 'ml' - else: - ml_or_hybrid = 'hybrid' - else: - dca_encoder = None - ml_or_hybrid = 'ml' - # Prediction using a saved model Pickle file specific AAindex used for encoding - # Model saved in Pickle file also for DCA-based encoding, a default file name - logger.info('Not counting WT as variant in directed evolution ' - 'as it cannot be used for (re-)combination.') - path = os.getcwd() - try: - # "temperature" parameter: determines sensitivity of Metropolis-Hastings acceptance criteria - temp = float(arguments['--temp']) - # how many subsequent mutation trials per simulated evolution trajectory - num_iterations = int(arguments['--numiter']) - # how many separate evolution trajectories to run - num_trajectories = int(arguments['--numtraj']) - except ValueError: - raise ValueError("Define flags 'numiter' and 'numtraj' as integer and 'temp' as float.") - s_wt = get_wt_sequence(arguments['--wt']) - y_wt = arguments['--y_wt'] - negative = 
arguments['--negative'] - # Metropolis-Hastings-driven directed evolution on single mutant position csv data - usecsv = arguments['--usecsv'] - if usecsv: - csv_file = csv_input(arguments['--input']) - t_drop = float(arguments['--drop']) - logger.info(f'Length of provided sequence: {len(s_wt)} amino acids.') - df = drop_rows(csv_file, amino_acids, t_drop) - drop_wt = [] - for i in range(len(df)): - if df.iloc[i, 0] == 'WT': - logger.info('Using fitness value (y_WT) for wild-type (WT) as specified in CSV.') - drop_wt.append(i) - y_wt = df.iloc[i, 1] - df = df.drop(drop_wt).reset_index(drop=True) - single_variants, single_values, higher_variants, higher_values = \ - get_variants(df, amino_acids, s_wt) - logger.info(f'Number of single variants: {len(single_variants)}.') - if len(single_variants) == 0: - logger.info('Found NO single substitution variants for possible recombination!') - single_vars, single_ys = list(single_variants), list(single_values) # only tuples to lists - - else: - single_vars = None # What happens now? (Full diverse?) - # Metropolis-Hastings-driven directed evolution on single mutant .csv amino acid substitution data - csvaa = arguments['--csvaa'] # only use identified substitutions --> taken from CSV file - logger.info('Running evolution trajectories and plotting...') - DirectedEvolution( - ml_or_hybrid=ml_or_hybrid, - encoding=arguments['--encoding'], - s_wt=s_wt, - y_wt=y_wt, - single_vars=single_vars, - num_iterations=num_iterations, - num_trajectories=num_trajectories, - amino_acids=amino_acids, - temp=temp, - path=path, - model=arguments['--model'], - no_fft=arguments['--nofft'], - dca_encoder=dca_encoder, - usecsv=usecsv, - csvaa=csvaa, - negative=negative - ).plot_trajectories() - - - elif arguments['sto2a2m']: - convert_sto2a2m( - sto_file=arguments['--sto'], - inter_gap=arguments['--inter_gap'], - intra_gap=arguments['--intra_gap'] - ) - - elif arguments['reformat_csv']: - read_csv_and_shift_pos_ints( - infile=arguments['--input'], - offset=0, - col_sep=arguments['--sep'], - substitution_sep=arguments['--mutation_sep'] - ) - - elif arguments['shift_pos']: - read_csv_and_shift_pos_ints( - infile=arguments['--input'], - offset=arguments['--offset'], - col_sep=arguments['--sep'], - substitution_sep=arguments['--mutation_sep'] - ) - - elif arguments['encode']: # sole parallelized task for utils for DCA encoding - df = drop_rows(arguments['--input'], amino_acids, arguments['--drop']) - wt_sequence = get_wt_sequence(arguments['--wt']) - logger.info(f'Length of provided sequence: {len(wt_sequence)} amino acids.') - single_variants, single_values, higher_variants, higher_values = get_variants( - df, amino_acids, wt_sequence) - variants = list(single_variants) + list(higher_variants) - ys_true = list(single_values) + list(higher_values) - variants, ys_true, sequences = get_seqs_from_var_name(wt_sequence, variants, ys_true) - assert len(variants) == len(ys_true) == len(sequences) - logger.info('Encoding variant sequences...') - - if arguments['--encoding'] == 'dca': - threads = abs(arguments['--threads']) if arguments['--threads'] is not None else 1 - threads = threads + 1 if threads == 0 else threads - logger.info(f'Using {threads} thread(s) for running...') - xs, variants, sequences, ys_true, x_wt, model, model_type = plmc_or_gremlin_encoding( - variants=variants, - sequences=sequences, - ys_true=ys_true, - params_file=arguments['--params'], - substitution_sep=arguments['--mutation_sep'], - threads=threads, - verbose=True - ) - assert len(xs) == len(variants) == 
len(ys_true) - - if variants[0][0] != variants[0][-1]: # WT is required for DCA-based hybrid modeling - if arguments['--y_wt'] is not None: - y_wt = arguments['--y_wt'] - else: - y_wt = 1 - # better using re then: wt = variants[0][0] + str(variants[0][1:-1] + variants[0][0]) - wt = variants[0][0] + re.findall(r"\d+", variants[0])[0] + variants[0][0] - variants = list(variants) - variants.insert(0, wt) # inserting WT at pos. 0 - xs = list(xs) - xs.insert(0, list(x_wt.flatten())) - ys_true = list(ys_true) - ys_true.insert(0, y_wt) # set WT fitness to 1 or use arguments y_wt? - - elif arguments['--encoding'] == 'onehot': - onehot_encoder = OneHotEncoding(sequences) - xs = onehot_encoder.collect_encoded_sequences() - - elif arguments['--encoding'] == 'aaidx': - if arguments['--model'] is None: - raise SystemError( - "Define the AAindex to use for encoding with the " - "flag --model AAINDEX, e.g.: --model CORJ870104." - ) - aa_index_encoder = AAIndexEncoding( - full_aaidx_txt_path(arguments['--model'] + '.txt'), sequences - ) - x_fft, x_raw = aa_index_encoder.collect_encoded_sequences() - if arguments['--nofft']: - xs = x_raw - else: - xs = x_fft - - else: - raise SystemError("Unknown encoding option.") - logger.info(f'{len(variants)} variants (plus inserted WT) remained after encoding. ' - f'Saving to encoding CSV file...') - generate_dataframe_and_save_csv( # put WT at pos. 0 for hybrid low_N or extrapolation - variants=variants, - sequence_encodings=xs, - fitnesses=ys_true, - csv_file=arguments['--input'], - encoding_type=arguments['--encoding'] - ) +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. 
Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +import os + +import logging +logger = logging.getLogger('pypef.utils.utils_run') + +import numpy as np +import re + +from pypef.utils.variant_data import ( + amino_acids, generate_dataframe_and_save_csv, + get_basename, read_csv_and_shift_pos_ints, + get_seqs_from_var_name, get_wt_sequence +) + +from pypef.utils.learning_test_sets import ( + csv_input, drop_rows, get_variants, make_sub_ls_ts, + make_sub_ls_ts_randomly, make_fasta_ls_ts +) +from pypef.utils.prediction_sets import ( + make_fasta_ps, make_recombinations_double, make_recombinations_triple, + make_recombinations_quadruple, make_recombinations_quintuple, + create_split_files, make_combinations_double_all_diverse, + make_combinations_triple_all_diverse, make_combinations_quadruple_all_diverse, + make_ssm_singles +) # not yet implemented: make_combinations_double_all_diverse_and_all_positions + +from pypef.utils.directed_evolution import DirectedEvolution +from pypef.utils.sto2a2m import convert_sto2a2m + +from pypef.ml.regression import OneHotEncoding, AAIndexEncoding, full_aaidx_txt_path +from pypef.dca.hybrid_model import plmc_or_gremlin_encoding + + +def run_pypef_utils(arguments): + if arguments['mklsts']: + wt_sequence = get_wt_sequence(arguments['--wt']) + t_drop = float(arguments['--drop']) + + logger.info(f'Length of provided sequence: {len(wt_sequence)} amino acids.') + df = drop_rows(arguments['--input'], amino_acids, t_drop, arguments['--sep'], arguments['--mutation_sep']) + no_rnd = arguments['--numrnd'] + + single_variants, single_values, higher_variants, higher_values = get_variants( + df, amino_acids, wt_sequence, arguments['--mutation_sep'] + ) + logger.info(f'Number of single variants: {len(single_variants)}.') + if len(single_variants) == 0: + logger.info('Found NO single substitution variants for possible recombination!') + sub_ls, val_ls, sub_ts, val_ts = make_sub_ls_ts( + single_variants, single_values, higher_variants, higher_values) + logger.info('Tip: You can edit your LS and TS datasets just by ' + 'cutting/pasting between the LS and TS fasta datasets.') + + logger.info('Creating LS dataset...') + make_fasta_ls_ts('LS.fasl', wt_sequence, sub_ls, val_ls) + logger.info('Creating TS dataset...') + make_fasta_ls_ts('TS.fasl', wt_sequence, sub_ts, val_ts) + + try: + no_rnd = int(no_rnd) + except ValueError: + no_rnd = 0 + if no_rnd != 0: + random_set_counter = 1 + no_rnd = int(no_rnd) + while random_set_counter <= no_rnd: + sub_ls, val_ls, sub_ts, val_ts = make_sub_ls_ts_randomly( + single_variants, single_values, + higher_variants, higher_values + ) + make_fasta_ls_ts('LS_random_' + str(random_set_counter) + '.fasl', wt_sequence, sub_ls, val_ls) + make_fasta_ls_ts('TS_random_' + str(random_set_counter) + '.fasl', wt_sequence, sub_ts, val_ts) + random_set_counter += 1 + + elif arguments['mkps']: + wt_sequence = get_wt_sequence(arguments['--wt']) + if not arguments['--ssm']: + try: + csv_file = csv_input(arguments['--input']) + except FileNotFoundError: + raise SystemError("If creating prediction sets ('mkps') a CSV input is " + "required (if not running 'mkps --ssm').") + t_drop = float(arguments['--drop']) + df = drop_rows(csv_file, amino_acids, t_drop) + drop_wt = [] + for i in range(len(df)): + if df.iloc[i, 0] == 'WT': + logger.info('Dropping wild-type (WT) 
from DataFrame as it cannot be used for (re-)combination.') + drop_wt.append(i) + df = df.drop(drop_wt).reset_index(drop=True) + + logger.info(f'Length of provided sequence: {len(wt_sequence)} amino acids.') + single_variants, _, higher_variants, _ = get_variants(df, amino_acids, wt_sequence) + logger.info(f'Using single substitution variants for (re-)combination. ' + f'Number of single variants: {len(single_variants)}.') + if len(single_variants) == 0: + logger.info('Found NO single substitution variants for possible recombination! ' + 'No prediction files can be created!') + + if arguments['--drecomb']: + logger.info('Creating Recomb_Double_Split...') + for no, files in enumerate(make_recombinations_double(single_variants)): + double_mutants = np.array(files) + create_split_files(double_mutants, single_variants, wt_sequence, 'Recomb_Double', no) + + if arguments['--trecomb']: + logger.info('Creating Recomb_Triple_Split...') + for no, files in enumerate(make_recombinations_triple(single_variants)): + triple_mutants = np.array(files) + create_split_files(triple_mutants, single_variants, wt_sequence, 'Recomb_Triple', no) + + if arguments['--qarecomb']: + logger.info('Beware that this step might require much disk space as PyPEF is ' + 'creating prediction files in TXT format. Creating Recomb_Quadruple_Split...') + for no, files in enumerate(make_recombinations_quadruple(single_variants)): + quadruple_mutants = np.array(files) + create_split_files(quadruple_mutants, single_variants, wt_sequence, 'Recomb_Quadruple', no) + + if arguments['--qirecomb']: + logger.info('Beware that this step might require much disk space as PyPEF is ' + 'creating prediction files in plain text format. Creating Recomb_Quintuple_Split...') + for no, files in enumerate(make_recombinations_quintuple(single_variants)): + quintuple_mutants = np.array(files) + create_split_files(quintuple_mutants, single_variants, wt_sequence, 'Recomb_Quintuple', no) + + if arguments['--ddiverse']: + logger.info('Creating Diverse_Double_Split...') + # if functions required, uncomment the next two lines and comment the other ones + # for no, files in enumerate( + # make_recombinations_double_all_diverse_and_all_positions(wt_sequence, amino_acids)): + for no, files in enumerate(make_combinations_double_all_diverse(single_variants, amino_acids)): + doubles = np.array(files) + create_split_files(doubles, single_variants, wt_sequence, 'Diverse_Double', no + 1) + + if arguments['--tdiverse']: + logger.info('Beware that this step might require much disk space as PyPEF is ' + 'creating prediction files in plain text format. Creating Diverse_Triple_Split... ') + for no, files in enumerate(make_combinations_triple_all_diverse(single_variants, amino_acids)): + triples = np.array(files) + create_split_files(triples, single_variants, wt_sequence, 'Diverse_Triple', no + 1) + + if arguments['--qdiverse']: + logger.info('Beware that this step might require much disk space as PyPEF is ' + 'creating prediction files in plain text format. 
Creating Diverse_Quadruple_Split...') + for no, files in enumerate(make_combinations_quadruple_all_diverse(single_variants, amino_acids)): + quadruples = np.array(files) + create_split_files(quadruples, single_variants, wt_sequence, 'Diverse_Quadruple', no + 1) + + if arguments['--ssm']: + singles = make_ssm_singles(wt_sequence, amino_acids) + make_fasta_ps('ssm_singles.fasta', wt_sequence, np.array(singles)) + + if True not in [ + arguments['--drecomb'], arguments['--trecomb'], arguments['--qarecomb'], + arguments['--qirecomb'], arguments['--ddiverse'], arguments['--tdiverse'], + arguments['--qdiverse'], arguments['--ssm'] + ]: + logger.info(f'\nMaking prediction set fasta file from {csv_file}...\n') + make_fasta_ps( + filename=f'{get_basename(csv_file)}_prediction_set.fasta', + wt=wt_sequence, + substitutions=tuple(list(single_variants) + list(higher_variants)) + ) + + # Metropolis-Hastings-driven directed evolution, similar to Biswas et al.: + # Low-N protein engineering with data-efficient deep learning, + # see https://github.com/ivanjayapurna/low-n-protein-engineering/tree/master/directed-evo + elif arguments['directevo']: + if arguments['hybrid'] or arguments['--encoding'] == 'dca': + dca_encoder = arguments['--params'] + if arguments['ml']: + ml_or_hybrid = 'ml' + else: + ml_or_hybrid = 'hybrid' + else: + dca_encoder = None + ml_or_hybrid = 'ml' + # Prediction using a saved model Pickle file specific AAindex used for encoding + # Model saved in Pickle file also for DCA-based encoding, a default file name + logger.info('Not counting WT as variant in directed evolution ' + 'as it cannot be used for (re-)combination.') + path = os.getcwd() + try: + # "temperature" parameter: determines sensitivity of Metropolis-Hastings acceptance criteria + temp = float(arguments['--temp']) + # how many subsequent mutation trials per simulated evolution trajectory + num_iterations = int(arguments['--numiter']) + # how many separate evolution trajectories to run + num_trajectories = int(arguments['--numtraj']) + except ValueError: + raise ValueError("Define flags 'numiter' and 'numtraj' as integer and 'temp' as float.") + s_wt = get_wt_sequence(arguments['--wt']) + y_wt = arguments['--y_wt'] + negative = arguments['--negative'] + # Metropolis-Hastings-driven directed evolution on single mutant position csv data + usecsv = arguments['--usecsv'] + if usecsv: + csv_file = csv_input(arguments['--input']) + t_drop = float(arguments['--drop']) + logger.info(f'Length of provided sequence: {len(s_wt)} amino acids.') + df = drop_rows(csv_file, amino_acids, t_drop) + drop_wt = [] + for i in range(len(df)): + if df.iloc[i, 0] == 'WT': + logger.info('Using fitness value (y_WT) for wild-type (WT) as specified in CSV.') + drop_wt.append(i) + y_wt = df.iloc[i, 1] + df = df.drop(drop_wt).reset_index(drop=True) + single_variants, single_values, higher_variants, higher_values = \ + get_variants(df, amino_acids, s_wt) + logger.info(f'Number of single variants: {len(single_variants)}.') + if len(single_variants) == 0: + logger.info('Found NO single substitution variants for possible recombination!') + single_vars, single_ys = list(single_variants), list(single_values) # only tuples to lists + + else: + single_vars = None # What happens now? (Full diverse?) 
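+        # Note on --temp (standard Metropolis-Hastings behavior; see
+        # pypef/utils/directed_evolution.py for the exact criterion used):
+        # a proposed variant is typically accepted with probability
+        # min(1, exp((y_new - y_old) / temp)), so smaller temperatures give
+        # greedier trajectories and larger ones more explorative sampling.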
+ # Metropolis-Hastings-driven directed evolution on single mutant .csv amino acid substitution data + csvaa = arguments['--csvaa'] # only use identified substitutions --> taken from CSV file + logger.info('Running evolution trajectories and plotting...') + DirectedEvolution( + ml_or_hybrid=ml_or_hybrid, + encoding=arguments['--encoding'], + s_wt=s_wt, + y_wt=y_wt, + single_vars=single_vars, + num_iterations=num_iterations, + num_trajectories=num_trajectories, + amino_acids=amino_acids, + temp=temp, + path=path, + model=arguments['--model'], + no_fft=arguments['--nofft'], + dca_encoder=dca_encoder, + usecsv=usecsv, + csvaa=csvaa, + negative=negative + ).plot_trajectories() + + + elif arguments['sto2a2m']: + convert_sto2a2m( + sto_file=arguments['--sto'], + inter_gap=arguments['--inter_gap'], + intra_gap=arguments['--intra_gap'] + ) + + elif arguments['reformat_csv']: + read_csv_and_shift_pos_ints( + infile=arguments['--input'], + offset=0, + col_sep=arguments['--sep'], + substitution_sep=arguments['--mutation_sep'] + ) + + elif arguments['shift_pos']: + read_csv_and_shift_pos_ints( + infile=arguments['--input'], + offset=arguments['--offset'], + col_sep=arguments['--sep'], + substitution_sep=arguments['--mutation_sep'] + ) + + elif arguments['encode']: # sole parallelized task for utils for DCA encoding + df = drop_rows(arguments['--input'], amino_acids, arguments['--drop']) + wt_sequence = get_wt_sequence(arguments['--wt']) + logger.info(f'Length of provided sequence: {len(wt_sequence)} amino acids.') + single_variants, single_values, higher_variants, higher_values = get_variants( + df, amino_acids, wt_sequence) + variants = list(single_variants) + list(higher_variants) + ys_true = list(single_values) + list(higher_values) + variants, ys_true, sequences = get_seqs_from_var_name(wt_sequence, variants, ys_true) + assert len(variants) == len(ys_true) == len(sequences) + logger.info('Encoding variant sequences...') + + if arguments['--encoding'] == 'dca': + threads = abs(arguments['--threads']) if arguments['--threads'] is not None else 1 + threads = threads + 1 if threads == 0 else threads + logger.info(f'Using {threads} thread(s) for running...') + xs, variants, sequences, ys_true, x_wt, model, model_type = plmc_or_gremlin_encoding( + variants=variants, + sequences=sequences, + ys_true=ys_true, + params_file=arguments['--params'], + substitution_sep=arguments['--mutation_sep'], + threads=threads, + verbose=True + ) + assert len(xs) == len(variants) == len(ys_true) + + if variants[0][0] != variants[0][-1]: # WT is required for DCA-based hybrid modeling + if arguments['--y_wt'] is not None: + y_wt = arguments['--y_wt'] + else: + y_wt = 1 + # better using re then: wt = variants[0][0] + str(variants[0][1:-1] + variants[0][0]) + wt = variants[0][0] + re.findall(r"\d+", variants[0])[0] + variants[0][0] + variants = list(variants) + variants.insert(0, wt) # inserting WT at pos. 0 + xs = list(xs) + xs.insert(0, list(x_wt.flatten())) + ys_true = list(ys_true) + ys_true.insert(0, y_wt) # set WT fitness to 1 or use arguments y_wt? + + elif arguments['--encoding'] == 'onehot': + onehot_encoder = OneHotEncoding(sequences) + xs = onehot_encoder.collect_encoded_sequences() + + elif arguments['--encoding'] == 'aaidx': + if arguments['--model'] is None: + raise SystemError( + "Define the AAindex to use for encoding with the " + "flag --model AAINDEX, e.g.: --model CORJ870104." 
+ ) + aa_index_encoder = AAIndexEncoding( + full_aaidx_txt_path(arguments['--model'] + '.txt'), sequences + ) + x_fft, x_raw = aa_index_encoder.collect_encoded_sequences() + if arguments['--nofft']: + xs = x_raw + else: + xs = x_fft + + else: + raise SystemError("Unknown encoding option.") + logger.info(f'{len(variants)} variants (plus inserted WT) remained after encoding. ' + f'Saving to encoding CSV file...') + generate_dataframe_and_save_csv( # put WT at pos. 0 for hybrid low_N or extrapolation + variants=variants, + sequence_encodings=xs, + fitnesses=ys_true, + csv_file=arguments['--input'], + encoding_type=arguments['--encoding'] + ) diff --git a/pypef/utils/variant_data.py b/pypef/utils/variant_data.py index b73dc5a..a6c726b 100644 --- a/pypef/utils/variant_data.py +++ b/pypef/utils/variant_data.py @@ -1,452 +1,468 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Created on 05 October 2020 -# @authors: Niklas Siedhoff, Alexander-Maurice Illig -# @contact: -# PyPEF - Pythonic Protein Engineering Framework -# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) -# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode - -# PyPEF – An Integrated Framework for Data-Driven Protein Engineering -# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 -# https://doi.org/10.1021/acs.jcim.1c00099 -# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* -# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany -# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany -# *Corresponding author -# §Equal contribution - -from __future__ import annotations -import os -import numpy as np -import pandas as pd - - -amino_acids = [ - 'A', 'C', 'D', 'E', 'F', - 'G', 'H', 'I', 'K', 'L', - 'M', 'N', 'P', 'Q', 'R', - 'S', 'T', 'V', 'W', 'Y' -] - - -def get_wt_sequence(sequence_fasta): - """ - Gets wild-type sequence from defined input file (can be pure sequence or fasta style) - """ - if sequence_fasta is None: - return None - wild_type_sequence = "" - try: - with open(sequence_fasta, 'r') as sf: - for lines in sf.readlines(): - if lines.startswith(">"): - continue - lines = ''.join(lines.split()) - wild_type_sequence += lines - except FileNotFoundError: - raise FileNotFoundError("Did not find FASTA file. Check/specify input FASTA " - "sequence file for getting the wild-type sequence.") - return wild_type_sequence - - -def read_models(number): - """ - reads the models found in the file Model_Results.txt. - If no model was trained, the .txt file does not exist. - """ - try: - ls = "" - with open('Model_Results.txt', 'r') as file: - for i, lines in enumerate(file): - if i == 0: - if lines[:6] == 'No FFT': - number += 2 - if i <= number + 1: - ls += lines - return ls - except FileNotFoundError: - return "No Model_Results.txt found." - - -def absolute_path_cwd_file(file): - """ - Get the current working directory - """ - if file is None: - return None - return os.path.join(os.getcwd(), file) - - -def path_aaidx_txt_path_from_utils(filename): - """ - returns the relative path to the /AAindex folder from the utils directory, - e.g. path/to/pypef/utils/../aaidx/AAindex/FAUJ880104.txt. 
- """ - modules_path = os.path.dirname(os.path.abspath(__file__)) - return os.path.join(modules_path, '..', 'ml', 'AAindex', f'{filename}.txt') - - -def get_sequences_from_file( - fasta: str, - mult_path: str | None = None -) -> (np.ndarray, np.ndarray, np.ndarray): - """ - "Get_Sequences" reads (learning and test) .fasta and - .fasta-like ".fasl" format files and extracts the name, - the target value and the sequence of the protein. - Only takes one-liner sequences for correct input. - See example directory for required fasta file format. - Make sure every marker (> and ;) is seperated by a - space ' ' from the value respectively name. - """ - if mult_path is not None: - os.chdir(mult_path) - - sequences = [] - values = [] - names_of_mutations = [] - - with open(fasta, 'r') as f: - words = "" - for line in f: - if line.startswith('>'): - if words != "": - sequences.append(words) - words = line.split('>') - names_of_mutations.append(words[1].strip()) - words = "" - - elif line.startswith('#'): - pass # are comments - - elif line.startswith(';'): - if words != "": - sequences.append(words) - words = line.split(';') - values.append(float(words[1].strip())) - words = "" - - else: - try: - words += line.strip() - except IndexError: - raise IndexError("Learning or Validation sets (.fasta) likely " - "have emtpy lines (e.g. at end of file)") - if words != "": - sequences.append(words) - # Check consistency - if len(values) != 0: - if len(sequences) != len(values): - raise SystemError( - f'Error: Number of sequences does not fit with number of target values! ' - f'Number of sequences: {str(len(sequences))}, Number of target values: {str(len(values))}.' - ) - if mult_path is not None: - os.chdir('..') - - return np.array(sequences), np.array(names_of_mutations), np.array(values) - - -def remove_nan_encoded_positions( - xs: np.ndarray | list, - *yss -): - """ - Removes encoded sequence (x) of sequence list xs when NaNs occur in x. - Also removes the corresponding fitness value y (f(x) --> y) at position i. - ys can also be any type of list, e.g. variants or sequences. - """ - xs = list(xs) - temp = [] - for ys in yss: - try: - ys = list(np.atleast_1d(ys)) - if isinstance(ys, pd.Series): - temp.append(list(ys)) - elif ys is None: - if len(yss) == 1: - temp = (None,) - else: - temp.append([None]) - else: - if type(ys) == np.ndarray: - if np.array(ys).ndim == 0: - temp.append([list(np.atleast_1d(ys).tolist())]) - else: - temp.append(list(np.atleast_1d(ys).tolist())) - else: - temp.append(list(ys)) - except ValueError: - temp.append(list(ys)) - if temp: - yss = temp - if not yss == () and not yss == (None,): - for i, ys in enumerate(yss): - assert len(xs) == len(ys), "Number of input sequences to be compared unequal." - try: - for j, x in enumerate(xs): - if np.shape(np.array(xs, dtype='object'))[1] and np.shape(np.array(ys, dtype='object'))[1]: - assert len(xs[j]) == len(ys[j]), "Length of input sequences to be compared unequal." - except IndexError: - break - drop = [] - for i, x in enumerate(xs): - try: - if None in x: - drop.append(i) - except TypeError: - raise TypeError( - "Take lists of lists as input, e.g., for single sequence " - "[[1, 2, 3, 4]]." 
- ) - drop = sorted(drop, reverse=True) - for idx in drop: - del xs[idx] - if not yss == () and not yss == (None,): - for ys in yss: - del ys[idx] - if len(yss) == 1: - return np.array(xs, dtype='object'), np.array(yss[0]) - - return np.array(xs, dtype='object'), *np.array(yss, dtype='object') - - -def get_basename(filename: str) -> str: - """ - Description - ----------- - Extracts and returns the basename of the filename. - - Parameters - ---------- - filename: str - - Returns - ------- - str - os.path.basename (filename) string without filename extension - """ - return os.path.basename(filename).split('.')[0] - - -def get_seqs_from_var_name( - wt_seq, - substitutions, - fitness_values -) -> tuple[list, list, list]: - """ - Similar to function above but just returns sequences - - wt: str - Wild-type sequence as string - substitutions: list - List of substiutuions of a single variant of the format: - - Single substitution variant, e.g. variant A123C: ['A123C'] - - Higher variants, e.g. variant A123C/D234E/F345G: ['A123C', 'D234E, 'F345G'] - --> Full substitutions list, e.g.: [['A123C'], ['A123C', 'D234E, 'F345G']] - fitness_values: list - List of ints/floats of the variant fitness values, e.g. for two variants: [1.4, 0.8] - """ - variant, values, sequences = [], [], [] - for i, var in enumerate(substitutions): # var are lists of (single or multiple) substitutions - temp = list(wt_seq) - name = '' - separation = 0 - if var == ['WT']: - name = 'WT' - else: - for single_var in var: # single entries of substitution list - position_index = int(str(single_var)[1:-1]) - 1 - new_amino_acid = str(single_var)[-1] - temp[position_index] = new_amino_acid - # checking if multiple entries are inside list - if separation == 0: - name += single_var - else: - name += '/' + single_var - separation += 1 - variant.append(name) - values.append(fitness_values[i]) - sequences.append(''.join(temp)) - - return variant, values, sequences - - -def split_variants(variants, sep='/'): - """ - Splits variants according to mutation separator. - """ - variants_splitted = [] - for variant in variants: - variants_splitted.append(variant.split(sep)) - return variants_splitted - - -def read_csv( - file_name: str, - fitness_key: str = None -) -> tuple[list, list, list]: - """ - Description - ----------- - Reads input CSV file and return variants names and - associated fitness values. - - Parameters - ---------- - file_name: str - Name of CSV file to read. - fitness_key: str - Name of column containing the fitness values. - If None, column 1 (0-indexed) will be taken. - - Returns - ------- - variants: np.ndarray - Array of variant names - fitnesses: - Array of fitness values - """ - df = pd.read_csv(file_name, sep=';', comment='#') - if df.shape[1] == 1: - df = pd.read_csv(file_name, sep=',', comment='#') - if df.shape[1] == 1: - df = pd.read_csv(file_name, sep='\t', comment='#') - if fitness_key is not None: - fitnesses = df[fitness_key].to_numpy(dtype=float) - else: - fitnesses = list(df.iloc[:, 1].to_numpy(dtype=float)) - variants = list(df.iloc[:, 0].to_numpy(dtype=str)) - features = list(df.iloc[:, 2:].to_numpy(dtype=float)) - - return variants, fitnesses, features - - -def generate_dataframe_and_save_csv( - variants: list, - sequence_encodings: list, - fitnesses: list, - csv_file: str, - encoding_type: str = '', - save_df_as_csv: bool = True -) -> pd.DataFrame: - """ - Description - ----------- - Creates a pandas.DataFrame from the input data (numpy array including - variant names, fitnesses, and encoded sequences). 
- Writes pandas.DataFrame to a specified CSV file follwing the scheme: - variants; fitness values; encoded sequences - - Parameters - ---------- - variants: list - Variant names. - fitnesses: list - Sequence-associated fitness value. - sequence_encodings: list - Sequence encodings (feature matrix) of sequences. - csv_file : str - Name of the csv file containing variant names and associated fitness values. - encoding_type: str = '' - Defines name for saved CSV file based on the chosen encoding technique: - 'aaidx', 'onehot', or 'dca'. - save_df_as_csv : bool - Writing DataFrame (Substitution;Fitness;Encoding_Features) to CSV (False/True). - - Returns - ------- - df_dca: pandas.DataFrame - Dataframe with variant names, fitness values, and features (encoded sequences). - If save_df_as_csv is True also writes DF to CSV. - """ - x = np.stack(sequence_encodings) - feature_dict = {} # Collecting features for each MSA position i - for i in range(x.shape[1]): # (encoding at pos. i) in a dict - feature_dict[f'X{i + 1:d}'] = x[:, i] - - df_dca = pd.DataFrame() - df_dca.insert(0, 'variant', variants) - df_dca.insert(1, 'y', fitnesses) - df_dca = pd.concat([df_dca, pd.DataFrame(feature_dict)], axis=1) - - if save_df_as_csv: - filename = f'{get_basename(csv_file)}_{encoding_type}_encoded.csv' - df_dca.to_csv(filename, sep=';', index=False) - - return df_dca - - -def process_df_encoding(df_encoding) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Extracts the array of names, encoded sequences, and fitness values - of the variants from the dataframe 'self.df_encoding'. - It is mandatory that 'df_encoding' contains the names of the - variants in the first column, the associated fitness value in the - second column, and the encoded sequence starting from the third - column. - - Returns - ------- - Tuple of variant names, encoded sequences, and fitness values. - """ - return ( - df_encoding.iloc[:, 0].to_numpy(), - df_encoding.iloc[:, 2:].to_numpy(), - df_encoding.iloc[:, 1].to_numpy() - ) - - -def read_csv_and_shift_pos_ints( - infile: str, - offset: int = 0, - col_sep: str = ';', - substitution_sep: str = '/', - target_column: int = 1 -): - """ - Shifts position of substitutions of variants for all variants in the provided - CSV file and saves the position-shifted variants with the corresponding fitness - values to a new CSV file. 
- """ - df = pd.read_csv(infile, sep=col_sep, comment='#') - if df.shape[1] == 1: - df = pd.read_csv(infile, sep=',', comment='#') - if df.shape[1] == 1: - df = pd.read_csv(infile, sep='\t', comment='#') - try: - df = df.dropna(subset=df.columns[[target_column]]) # if specific column has a NaN drop entire row - except IndexError: - raise IndexError("Did only detect a single column which might indicate a missing " - "target value column / a wrong specification of the CSV column " - "spearator character (e.g., --sep \';\').") - - column_1 = df.iloc[:, 0] - column_2 = df.iloc[:, target_column].to_numpy() - - new_col = [] - - for variant in column_1: - if substitution_sep in variant: - split_vars_list = [] - splitted_var = variant.split(substitution_sep) - for s_var in splitted_var: - new_var_int = int(s_var[1:-1]) - offset - new_variant = s_var[0] + str(new_var_int) + s_var[-1] - split_vars_list.append(new_variant) - new_variant = '' - for i, v in enumerate(split_vars_list): - if i != len(split_vars_list) - 1: - new_variant += f'{v}/' # substitution_sep replaced by '/' - else: - new_variant += v - new_col.append(new_variant) - else: - new_var_int = int(variant[1:-1]) - offset - new_variant = variant[0] + str(new_var_int) + variant[-1] - new_col.append(new_variant) - - data = np.array([new_col, column_2]).T - new_df = pd.DataFrame(data, columns=['variant', 'fitness']) - new_df.to_csv(infile[:-4] + '_new' + infile[-4:], sep=';', index=False) +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Created on 05 October 2020 +# @authors: Niklas Siedhoff, Alexander-Maurice Illig +# @contact: +# PyPEF - Pythonic Protein Engineering Framework +# Released under Creative Commons Attribution-NonCommercial 4.0 International Public License (CC BY-NC 4.0) +# For more information about the license see https://creativecommons.org/licenses/by-nc/4.0/legalcode + +# PyPEF – An Integrated Framework for Data-Driven Protein Engineering +# Journal of Chemical Information and Modeling, 2021, 61, 3463-3476 +# https://doi.org/10.1021/acs.jcim.1c00099 +# Niklas E. Siedhoff1,§, Alexander-Maurice Illig1,§, Ulrich Schwaneberg1,2, Mehdi D. Davari1,* +# 1Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany +# 2DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany +# *Corresponding author +# §Equal contribution + +from __future__ import annotations +import os +import numpy as np +import pandas as pd + + +amino_acids = [ + 'A', 'C', 'D', 'E', 'F', + 'G', 'H', 'I', 'K', 'L', + 'M', 'N', 'P', 'Q', 'R', + 'S', 'T', 'V', 'W', 'Y' +] + + +def get_wt_sequence(sequence_fasta): + """ + Gets wild-type sequence from defined input file (can be pure sequence or fasta style). + """ + if sequence_fasta is None: + return None + wild_type_sequence = "" + try: + with open(sequence_fasta, 'r') as sf: + for lines in sf.readlines(): + if lines.startswith(">"): + continue + lines = ''.join(lines.split()) + wild_type_sequence += lines + except FileNotFoundError: + raise FileNotFoundError("Did not find FASTA file. Check/specify input FASTA " + "sequence file for getting the wild-type sequence.") + return wild_type_sequence + + +def read_models(number): + """ + Reads the models found in the file Model_Results.txt. + If no model was trained, the .txt file does not exist. 
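+
+    A minimal usage sketch (assumes a Model_Results.txt written by a
+    previous 'ml' run in the current working directory):
+
+    >>> read_models(5)  # returns the header plus the top five model entries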
+    """
+    try:
+        ls = ""
+        with open('Model_Results.txt', 'r') as file:
+            for i, lines in enumerate(file):
+                if i == 0:
+                    if lines[:6] == 'No FFT':
+                        number += 2
+                if i <= number + 1:
+                    ls += lines
+        return ls
+    except FileNotFoundError:
+        return "No Model_Results.txt found."
+
+
+def absolute_path_cwd_file(file):
+    """
+    Returns the absolute path to the given file based on
+    the current working directory.
+    """
+    if file is None:
+        return None
+    return os.path.join(os.getcwd(), file)
+
+
+def path_aaidx_txt_path_from_utils(filename):
+    """
+    Returns the relative path to the /AAindex folder from the utils directory,
+    e.g. path/to/pypef/utils/../ml/AAindex/FAUJ880104.txt.
+    """
+    modules_path = os.path.dirname(os.path.abspath(__file__))
+    return os.path.join(modules_path, '..', 'ml', 'AAindex', f'{filename}.txt')
+
+
+def get_sequences_from_file(
+    fasta: str,
+    mult_path: str | None = None
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Reads (learning and test) .fasta and FASTA-like ".fasl"
+    format files and extracts the name, the target value, and
+    the sequence of each protein.
+    Sequences must be given as one-liners for correct input.
+    Make sure every marker (> and ;) is separated by a
+    space ' ' from the name or value, respectively.
+
+    Returns
+    -------
+    np.array(sequences), np.array(names_of_mutations), np.array(values)
+    """
+    if mult_path is not None:
+        os.chdir(mult_path)
+
+    sequences = []
+    values = []
+    names_of_mutations = []
+
+    allowed_chars = "ABCDEFGHIKLMNPQRSTVWYX-."
+    allowed_chars += allowed_chars.lower()
+
+    with open(fasta, 'r') as f:
+        words = ""
+        for line in f:
+            if line.startswith('>'):
+                if words != "":
+                    sequences.append(words)
+                words = line.split('>')
+                names_of_mutations.append(words[1].strip())
+                words = ""
+
+            elif line.startswith('#'):
+                pass  # comment lines are skipped
+
+            elif line.startswith(';'):
+                if words != "":
+                    sequences.append(words)
+                words = line.split(';')
+                values.append(float(words[1].strip()))
+                words = ""
+
+            else:
+                try:
+                    line = line.strip()
+                    for c in line:
+                        if c not in allowed_chars:
+                            raise SystemError(
+                                f"The input file(s) (MSA or train/test sets) contain(s) unknown protein sequence characters "
+                                f"(e.g.: \"{c}\"). Note that an MSA has to be provided in FASTA or A2M format (or formatted as "
+                                f"pure linebreak-separated sequences).")
+                    words += line
+                except IndexError:
+                    raise IndexError("Sequences in input file(s) likely "
+                                     "have empty lines (e.g. at end of file)")
+        if words != "":
+            sequences.append(words)
+    # Check consistency
+    if len(values) != 0:
+        if len(sequences) != len(values):
+            raise SystemError(
+                f'Error: Number of sequences does not match the number of target values! '
+                f'Number of sequences: {str(len(sequences))}, number of target values: {str(len(values))}.'
+            )
+    if mult_path is not None:
+        os.chdir('..')
+
+    return np.array(sequences), np.array(names_of_mutations), np.array(values)
+
+
+def get_seqs_from_var_name(
+    wt_seq: str,
+    substitutions: list,
+    fitness_values: list
+) -> tuple[list, list, list]:
+    """
+    Similar to the function "get_sequences_from_file", but instead of getting
+    sequences from a FASTA file, it constructs them directly from the wild-type
+    sequence and the variant specifiers.
+
+    wt_seq: str
+        Wild-type sequence as string
+    substitutions: list
+        List of amino acid substitutions of a single variant of the format:
+        - Single substitution variant, e.g. variant A123C: ['A123C']
+        - Higher variants, e.g. variant A123C/D234E/F345G: ['A123C', 'D234E', 'F345G']
+          --> Full substitutions list, e.g.: [['A123C'], ['A123C', 'D234E', 'F345G']]
+    fitness_values: list
+        List of ints/floats of the variant fitness values, e.g. for two variants: [1.4, 0.8]
+    """
+    variant, values, sequences = [], [], []
+    for i, var in enumerate(substitutions):  # var are lists of (single or multiple) substitutions
+        temp = list(wt_seq)
+        name = ''
+        separation = 0
+        if var == ['WT']:
+            name = 'WT'
+        else:
+            for single_var in var:  # single entries of the substitution list
+                position_index = int(str(single_var)[1:-1]) - 1
+                new_amino_acid = str(single_var)[-1]
+                temp[position_index] = new_amino_acid
+                # check if multiple substitutions are inside the list
+                if separation == 0:
+                    name += single_var
+                else:
+                    name += '/' + single_var
+                separation += 1
+        variant.append(name)
+        values.append(fitness_values[i])
+        sequences.append(''.join(temp))
+
+    return variant, values, sequences
+
+
+def remove_nan_encoded_positions(
+    xs: np.ndarray | list,
+    *yss
+):
+    """
+    Removes an encoded sequence x from the list of encoded sequences xs
+    if NaNs/Nones occur in x, and also removes the corresponding entry y
+    (f(x) --> y) at position i. ys can be any type of associated list,
+    e.g. fitness values, variants, or sequences.
+    """
+    xs = list(xs)
+    temp = []
+    for ys in yss:
+        if ys is None:
+            # Keep a None placeholder so that the number of returned arrays is preserved
+            if len(yss) == 1:
+                temp = (None,)
+            else:
+                temp.append([None])
+        elif isinstance(ys, pd.Series):
+            temp.append(list(ys))
+        else:
+            try:
+                temp.append(np.atleast_1d(ys).tolist())
+            except ValueError:  # ragged input cannot be converted to an array
+                temp.append(list(ys))
+    if temp:
+        yss = temp
+    if not yss == () and not yss == (None,):
+        for i, ys in enumerate(yss):
+            assert len(xs) == len(ys), "Number of input sequences to be compared unequal."
+            try:
+                for j, x in enumerate(xs):
+                    if np.shape(np.array(xs, dtype='object'))[1] and np.shape(np.array(ys, dtype='object'))[1]:
+                        assert len(xs[j]) == len(ys[j]), "Length of input sequences to be compared unequal."
+            except IndexError:
+                break
+    drop = []
+    for i, x in enumerate(xs):
+        try:
+            if None in x:
+                drop.append(i)
+        except TypeError:
+            raise TypeError(
+                "Take lists of lists as input, e.g., for a single sequence "
+                "[[1, 2, 3, 4]]."
+            )
+    drop = sorted(drop, reverse=True)
+    for idx in drop:
+        del xs[idx]
+        if not yss == () and not yss == (None,):
+            for ys in yss:
+                del ys[idx]
+    if len(yss) == 1:
+        return np.array(xs, dtype='object'), np.array(yss[0])
+
+    return np.array(xs, dtype='object'), *np.array(yss, dtype='object')
+
+
+def get_basename(filename: str) -> str:
+    """
+    Description
+    -----------
+    Extracts and returns the basename of the filename.
+
+    Parameters
+    ----------
+    filename: str
+
+    Returns
+    -------
+    str
+        os.path.basename(filename) string without the filename extension
+    """
+    return os.path.basename(filename).split('.')[0]
+
+
+def split_variants(variants, sep='/'):
+    """
+    Splits variants according to the given mutation separator.
+    """
+    variants_splitted = []
+    for variant in variants:
+        variants_splitted.append(variant.split(sep))
+    return variants_splitted
+
+
+def read_csv(
+    file_name: str,
+    fitness_key: str | None = None
+) -> tuple[list, list, list]:
+    """
+    Description
+    -----------
+    Reads an input CSV file and returns variant names and
+    associated fitness values.
+
+    Parameters
+    ----------
+    file_name: str
+        Name of the CSV file to read.
+    fitness_key: str
+        Name of the column containing the fitness values.
+        If None, column 1 (0-indexed) will be taken.
+
+    Returns
+    -------
+    variants: list
+        List of variant names.
+    fitnesses: list
+        List of fitness values.
+    features: list
+        List of encoded sequences (empty if the CSV contains no encoding columns).
+    """
+    df = pd.read_csv(file_name, sep=';', comment='#')
+    if df.shape[1] == 1:
+        df = pd.read_csv(file_name, sep=',', comment='#')
+        if df.shape[1] == 1:
+            df = pd.read_csv(file_name, sep='\t', comment='#')
+    if fitness_key is not None:
+        fitnesses = list(df[fitness_key].to_numpy(dtype=float))
+    else:
+        fitnesses = list(df.iloc[:, 1].to_numpy(dtype=float))
+    variants = list(df.iloc[:, 0].to_numpy(dtype=str))
+    features = list(df.iloc[:, 2:].to_numpy(dtype=float))
+
+    return variants, fitnesses, features
+
+
+def generate_dataframe_and_save_csv(
+    variants: list,
+    sequence_encodings: list,
+    fitnesses: list,
+    csv_file: str,
+    encoding_type: str = '',
+    save_df_as_csv: bool = True
+) -> pd.DataFrame:
+    """
+    Description
+    -----------
+    Creates a pandas.DataFrame from the input data (numpy array including
+    variant names, fitnesses, and encoded sequences).
+    Writes the pandas.DataFrame to a specified CSV file following the scheme:
+    variants; fitness values; encoded sequences
+
+    Parameters
+    ----------
+    variants: list
+        Variant names.
+    fitnesses: list
+        Sequence-associated fitness values.
+    sequence_encodings: list
+        Sequence encodings (feature matrix) of sequences.
+    csv_file : str
+        Name of the CSV file containing variant names and associated fitness values.
+    encoding_type: str = ''
+        Defines the name of the saved CSV file based on the chosen encoding technique:
+        'aaidx', 'onehot', or 'dca'.
+    save_df_as_csv : bool
+        Whether to write the DataFrame (Substitution;Fitness;Encoding_Features) to a CSV file (True/False).
+
+    Returns
+    -------
+    df_dca: pandas.DataFrame
+        Dataframe with variant names, fitness values, and features (encoded sequences).
+        If save_df_as_csv is True, the DataFrame is also written to a CSV file.
+    """
+    x = np.stack(sequence_encodings)
+    feature_dict = {}            # Collecting features for each MSA position i
+    for i in range(x.shape[1]):  # (encoding at pos. i) in a dict
+        feature_dict[f'X{i + 1:d}'] = x[:, i]
+
+    df_dca = pd.DataFrame()
+    df_dca.insert(0, 'variant', variants)
+    df_dca.insert(1, 'y', fitnesses)
+    df_dca = pd.concat([df_dca, pd.DataFrame(feature_dict)], axis=1)
+
+    if save_df_as_csv:
+        filename = f'{get_basename(csv_file)}_{encoding_type}_encoded.csv'
+        df_dca.to_csv(filename, sep=';', index=False)
+
+    return df_dca
+
+
+def process_df_encoding(df_encoding) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Extracts the arrays of names, encoded sequences, and fitness values
+    of the variants from the dataframe 'df_encoding'.
+    It is mandatory that 'df_encoding' contains the names of the
+    variants in the first column, the associated fitness value in the
+    second column, and the encoded sequence starting from the third
+    column.
+
+    Returns
+    -------
+    Tuple of variant names, encoded sequences, and fitness values.
+    """
+    return (
+        df_encoding.iloc[:, 0].to_numpy(),
+        df_encoding.iloc[:, 2:].to_numpy(),
+        df_encoding.iloc[:, 1].to_numpy()
+    )
+
+
+def read_csv_and_shift_pos_ints(
+    infile: str,
+    offset: int = 0,
+    col_sep: str = ';',
+    substitution_sep: str = '/',
+    target_column: int = 1
+):
+    """
+    Shifts the substitution positions of all variants in the provided CSV
+    file by the given offset and saves the position-shifted variants with
+    the corresponding fitness values to a new CSV file.
+    """
+    df = pd.read_csv(infile, sep=col_sep, comment='#')
+    if df.shape[1] == 1:
+        df = pd.read_csv(infile, sep=',', comment='#')
+        if df.shape[1] == 1:
+            df = pd.read_csv(infile, sep='\t', comment='#')
+    try:
+        df = df.dropna(subset=df.columns[[target_column]])  # drop the entire row if the target column has a NaN
+    except IndexError:
+        raise IndexError("Only detected a single column, which might indicate a missing "
+                         "target value column or a wrong specification of the CSV column "
+                         "separator character (e.g., --sep \';\').")
+
+    column_1 = df.iloc[:, 0]
+    column_2 = df.iloc[:, target_column].to_numpy()
+
+    new_col = []
+
+    for variant in column_1:
+        if substitution_sep in variant:
+            split_vars_list = []
+            splitted_var = variant.split(substitution_sep)
+            for s_var in splitted_var:
+                new_var_int = int(s_var[1:-1]) - offset
+                new_variant = s_var[0] + str(new_var_int) + s_var[-1]
+                split_vars_list.append(new_variant)
+            new_variant = ''
+            for i, v in enumerate(split_vars_list):
+                if i != len(split_vars_list) - 1:
+                    new_variant += f'{v}/'  # substitution_sep replaced by '/'
+                else:
+                    new_variant += v
+            new_col.append(new_variant)
+        else:
+            new_var_int = int(variant[1:-1]) - offset
+            new_variant = variant[0] + str(new_var_int) + variant[-1]
+            new_col.append(new_variant)
+
+    data = np.array([new_col, column_2]).T
+    new_df = pd.DataFrame(data, columns=['variant', 'fitness'])
+    new_df.to_csv(infile[:-4] + '_new' + infile[-4:], sep=';', index=False)
diff --git a/scripts/CLI/run_cli_tests_linux.sh b/scripts/CLI/run_cli_tests_linux.sh
index 4601370..e155447 100644
--- a/scripts/CLI/run_cli_tests_linux.sh
+++ b/scripts/CLI/run_cli_tests_linux.sh
@@ -12,7 +12,7 @@ set -x  # echo on
 set -e  # exit on (PyPEF) errors
 export PS4='+(Line ${LINENO}): '  # echo script line numbers
 
-### RUN ME WITH
+### RUN ME FROM THIS FILE'S DIRECTORY:
 ### $ ./run_cli_tests_linux.sh                      # printing STDOUT and STDERR to terminal
 ### $ ./run_cli_tests_linux.sh &> test_cli_run.log  # writing STDOUT and STDERR to log file
 
@@ -368,11 +368,21 @@ echo
 $pypef hybrid -m PLMC -t TS.fasl --params PLMC --threads $threads
 echo
 
-# pure statistical
+# Hybrid: pure statistical
 $pypef hybrid -t TS.fasl --params PLMC --threads $threads
 echo
+$pypef hybrid -p TS.fasl --params PLMC --threads $threads
+echo
+# Same as above command
+$pypef hybrid -p TS.fasl -m PLMC --params PLMC --threads $threads
+echo
 $pypef hybrid -t TS.fasl --params GREMLIN
 echo
+$pypef hybrid -p TS.fasl --params GREMLIN
+echo
+# Same as above command
+$pypef hybrid -p TS.fasl -m GREMLIN --params GREMLIN
+echo
 $pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN
 echo
 $pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN
diff --git a/scripts/CLI/run_cli_tests_win.ps1 b/scripts/CLI/run_cli_tests_win.ps1
index daac44a..96f0a40 100644
--- a/scripts/CLI/run_cli_tests_win.ps1
+++ b/scripts/CLI/run_cli_tests_win.ps1
@@ -511,13 +511,27 @@ pypef hybrid -m PLMC -t TS.fasl --params PLMC --threads $threads
 ExitOnExitCode
 Write-Host
 
-# pure statistical
+# Hybrid: pure statistical
 pypef hybrid -t TS.fasl --params PLMC --threads $threads
 ExitOnExitCode
 Write-Host
+pypef hybrid -p TS.fasl --params PLMC --threads $threads
+ExitOnExitCode
+Write-Host
+# Same as above command
+pypef hybrid -p TS.fasl -m PLMC --params PLMC --threads $threads
+ExitOnExitCode
+Write-Host
 pypef hybrid -t TS.fasl --params GREMLIN
 ExitOnExitCode
 Write-Host
+pypef hybrid -p TS.fasl --params GREMLIN
+ExitOnExitCode
+Write-Host
+# Same as above command
+pypef hybrid -p TS.fasl -m GREMLIN --params GREMLIN
+ExitOnExitCode
+Write-Host
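+# Note: the 'hybrid' calls above that pass --params without an already trained
+# model (-m) perform purely statistical (zero-shot) DCA prediction; -t/--ts
+# scores a test set, while -p/--ps writes predictions for a prediction set.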
pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN ExitOnExitCode Write-Host diff --git a/setup.py b/setup.py index 2ebc828..7e7de20 100644 --- a/setup.py +++ b/setup.py @@ -1,49 +1,53 @@ -#!/usr/bin/env python3 -# for installation run me with: pip install . -# or with: pip install -e . - - -from setuptools import setup, find_packages -from pypef import __version__ - - -with open("requirements.txt", "r", encoding="utf-8") as install_requirements: - requirements = install_requirements.read() - -setup( - name='pypef', - version=__version__.split('-')[0], - author='Niklas Siedhoff & Alexander-Maurice Illig', - author_email='n.siedhoff@biotec.rwth-aachen.de', - license='CC BY-NC-SA 4.0', - description='A command-line interface (CLI) tool for performing data-driven protein engineering ' - 'by building machine learning (ML)-trained regression models from sequence variant ' - 'fitness data (in CSV format) based on different techniques for protein sequence encoding. ' - 'Next to building pure ML models, \'hybrid modeling\' is also possible using a blended ' - 'model optimized for predictive contributions of a statistical and an ML-based prediction.', - long_description='For detailed description including a short Jupyter Notebook-based ' - 'tutorial please refer to the GitHub page.', - long_description_content_type='text/markdown', - url='https://github.com/Protein-Engineering-Framework/PyPEF', - py_modules=['pypef'], - packages=find_packages(include=['pypef', 'pypef.*']), - package_data={'pypef': ['ml/AAindex/*', 'ml/AAindex/Refined_cluster_indices_r0.93_r0.97/*']}, - include_package_data=True, - install_requires=[requirements], - python_requires='>= 3.9, < 3.12', - keywords='Pythonic Protein Engineering Framework', - classifiers=[ - 'Development Status :: 3 - Alpha', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Topic :: Scientific/Engineering :: Artificial Intelligence' - ], - entry_points={ - 'console_scripts': [ - 'pypef = pypef.main:run_main' - ], - } -) +#!/usr/bin/env python3 +# for installation run me with: pip install . +# or for editable/develop mode with: pip install -e . + + +from setuptools import setup, find_packages +from pathlib import Path + +from pypef import __version__ + + +this_directory = Path(__file__).parent +long_description = Path.joinpath(this_directory, "README.md").read_text() + +with open("requirements.txt", "r", encoding="utf-8") as install_requirements: + requirements = install_requirements.read() + +setup( + name='pypef', + version=__version__.split('-')[0], + author='Niklas Siedhoff & Alexander-Maurice Illig', + author_email='n.siedhoff@biotec.rwth-aachen.de', + license='CC BY-NC-SA 4.0', + description='A command-line interface (CLI) tool for performing data-driven protein engineering ' + 'by building machine learning (ML)-trained regression models from sequence variant ' + 'fitness data (in CSV format) based on different techniques for protein sequence encoding. 
'
+                'Next to building pure ML models, "hybrid modeling" is also possible using a blended '
+                'model optimized for predictive contributions of a statistical and an ML-based prediction.',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    url='https://github.com/Protein-Engineering-Framework/PyPEF',
+    py_modules=['pypef'],
+    packages=find_packages(include=['pypef', 'pypef.*']),
+    package_data={'pypef': ['ml/AAindex/*', 'ml/AAindex/Refined_cluster_indices_r0.93_r0.97/*']},
+    include_package_data=True,
+    install_requires=[requirements],
+    python_requires='>= 3.9, < 3.12',
+    keywords='Pythonic Protein Engineering Framework',
+    classifiers=[
+        'Development Status :: 3 - Alpha',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
+        'Topic :: Scientific/Engineering :: Bio-Informatics',
+        'Topic :: Scientific/Engineering :: Artificial Intelligence'
+    ],
+    entry_points={
+        'console_scripts': [
+            'pypef = pypef.main:run_main'
+        ],
+    }
+)
diff --git a/tests/test_api_functions.py b/tests/test_api_functions.py
new file mode 100644
index 0000000..9970b0d
--- /dev/null
+++ b/tests/test_api_functions.py
@@ -0,0 +1,61 @@
+
+import os.path
+import numpy as np
+
+from pypef.ml.regression import AAIndexEncoding, full_aaidx_txt_path, get_regressor_performances
+from pypef.dca.gremlin_inference import GREMLIN
+from pypef.utils.variant_data import get_sequences_from_file
+
+
+msa_file = os.path.abspath(
+    os.path.join(
+        os.path.abspath(__file__),
+        '../../datasets/AVGFP/uref100_avgfp_jhmmer_119.a2m'
+    )
+)
+
+ls_b = os.path.abspath(
+    os.path.join(
+        os.path.abspath(__file__),
+        '../../datasets/ANEH/LS_B.fasl'
+    )
+)
+
+ts_b = os.path.abspath(
+    os.path.join(
+        os.path.abspath(__file__),
+        '../../datasets/ANEH/TS_B.fasl'
+    )
+)
+
+
+def test_gremlin():
+    g = GREMLIN(
+        alignment=msa_file,
+        char_alphabet="ARNDCQEGHILKMFPSTWYV-",
+        wt_seq=None,
+        optimize=True,
+        gap_cutoff=0.5,
+        eff_cutoff=0.8,
+        opt_iter=100
+    )
+    wt_score = g.get_wt_score()  # TensorFlow results are only reproducible to about 1 decimal place
+    np.testing.assert_almost_equal(wt_score, 1203.549234202937, decimal=1)
+
+
+def test_dataset_b_results():
+    train_seqs, train_vars, train_ys = get_sequences_from_file(ls_b)
+    test_seqs, test_vars, test_ys = get_sequences_from_file(ts_b)
+    aaindex = "WOLR810101.txt"
+    x_fft_train, _ = AAIndexEncoding(full_aaidx_txt_path(aaindex), train_seqs).collect_encoded_sequences()
+    x_fft_test, _ = AAIndexEncoding(full_aaidx_txt_path(aaindex), test_seqs).collect_encoded_sequences()
+    performances = get_regressor_performances(
+        x_learn=x_fft_train,
+        x_test=x_fft_test,
+        y_learn=train_ys,
+        y_test=test_ys,
+        regressor='pls_loocv'
+    )
+    # Dataset B PLS_LOOCV results: R², RMSE, NRMSE, Pearson's r, Spearman's rho
+    np.testing.assert_almost_equal(performances[:5], [0.72, 14.48, 0.52, 0.86, 0.89], decimal=2)
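+
+
+# The following tests are minimal usage sketches of the pypef.utils.variant_data
+# helpers; the wild-type sequence, variants, and fitness values are toy data.
+def test_get_seqs_from_var_name_toy_example():
+    # get_seqs_from_var_name builds variant sequences directly from the wild-type
+    # sequence and substitution lists (1-indexed positions, e.g. 'A2C' mutates position 2)
+    from pypef.utils.variant_data import get_seqs_from_var_name
+    wt_seq = 'MAKV'
+    substitutions = [['A2C'], ['A2C', 'K3E']]  # one single- and one double-substituted variant
+    fitnesses = [1.4, 0.8]
+    names, values, sequences = get_seqs_from_var_name(wt_seq, substitutions, fitnesses)
+    assert names == ['A2C', 'A2C/K3E']
+    assert sequences == ['MCKV', 'MCEV']
+    assert values == [1.4, 0.8]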
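+
+
+def test_get_sequences_from_file_toy_example(tmp_path):
+    # Sketch of the expected .fasl format: '>' name lines and ';' target value
+    # lines (markers separated by a space), with one-line sequences in between
+    from pypef.utils.variant_data import get_sequences_from_file
+    fasl_file = tmp_path / 'toy.fasl'
+    fasl_file.write_text('> A2C\nMCKV\n; 1.4\n> WT\nMAKV\n; 1.0\n')
+    seqs, names, values = get_sequences_from_file(str(fasl_file))
+    assert list(names) == ['A2C', 'WT']
+    assert list(seqs) == ['MCKV', 'MAKV']
+    assert list(values) == [1.4, 1.0]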
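+
+
+def test_remove_nan_encoded_positions_toy_example():
+    # Encoded sequences containing None are dropped together with their
+    # associated target values (here, the second of the three encodings)
+    from pypef.utils.variant_data import remove_nan_encoded_positions
+    xs = [[1.0, 2.0], [None, 3.0], [4.0, 5.0]]
+    ys = [0.1, 0.2, 0.3]
+    xs_clean, ys_clean = remove_nan_encoded_positions(xs, ys)
+    assert len(xs_clean) == 2
+    assert list(ys_clean) == [0.1, 0.3]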
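+
+
+def test_read_csv_and_shift_pos_ints_toy_example(tmp_path):
+    # An offset of 100 shifts variant 'A123C/D234E' to 'A23C/D134E'; the result
+    # is written next to the input file with a '_new' filename suffix
+    from pypef.utils.variant_data import read_csv_and_shift_pos_ints
+    infile = tmp_path / 'variants.csv'
+    infile.write_text('variant;fitness\nA123C/D234E;1.5\n')
+    read_csv_and_shift_pos_ints(str(infile), offset=100)
+    out_lines = (tmp_path / 'variants_new.csv').read_text().splitlines()
+    assert out_lines[1] == 'A23C/D134E;1.5'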