Upload v0.3.3

Protein-Engineering-Framework · Jan 5, 2024 · a7d6afc · a7d6afc
1 parent caf921e
commit a7d6afc
Show file tree

Hide file tree

Showing 27 changed files with 7,798 additions and 7,410 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -34,6 +34,6 @@ jobs:
       run: |
         # stop the build if there are Python syntax errors or undefined names
         flake8 ./pypef --count --select=E9,F63,F7,F82 --show-source --statistics
-    - name: Export Pythonpath and run CLI PyPEF version test with pytest
+    - name: Export Pythonpath and run PyPEF API and CLI version test with pytest
       run: |
-        export PYTHONPATH="${PYTHONPATH}:${PWD}" && python -m pytest tests/cli/
+        export PYTHONPATH="${PYTHONPATH}:${PWD}" && python -m pytest tests/
diff --git a/.gitignore b/.gitignore
@@ -386,3 +386,7 @@ scripts/Setup/linux/AVGFP/uref100_avgfp_jhmmer_119.a2m
 scripts/Setup/linux/AVGFP/avGFP.csv
 scripts/Setup/linux/api_encoding_train_test.py
 scripts/Setup/linux/apc.png
+datasets/ANEH/KARS160122_PLS_LOOCV_ML_Model_Performance.png
+datasets/ANEH/CV_performance/KARS160122_PLS_LOOCV_5-fold-CV.png
+datasets/ANEH/CV_performance/KARS160122_PLS_LOOCV_CV_Results.txt
+datasets/AVGFP/Predictions_Hybrid_TopTS.txt
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,209 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: PyPEF Help",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "args": [
+                "--help"
+            ]
+        },
+
+        {
+            "name": "Python: PyPEF MKLSTS ANEH",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/ANEH/",
+            "args": [
+                "mklsts", 
+                "--wt", "${workspaceFolder}/datasets/ANEH/Sequence_WT_ANEH.fasta", 
+                "--input", "${workspaceFolder}/datasets/ANEH/37_ANEH_variants.csv"
+            ]
+        },
+
+        {
+            "name": "Python: PyPEF MKLSTS avGFP",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "mklsts", 
+                "--wt", "P42212_F64L.fasta", 
+                "--input", "avGFP.csv"
+            ]
+        },
+
+        {
+            "name": "Python: PyPEF ml -e onehot pls_loocv",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/ANEH",
+            "args": [
+                "ml", 
+                "-e", "onehot", 
+                "-l", "LS.fasl",
+                "-t", "TS.fasl",
+                "--regressor", "pls_loocv"
+            ]
+        },
+
+        {   // GREMLIN zero-shot steps:
+            // 1. $pypef param_inference --msa uref100_avgfp_jhmmer_119.a2m --opt_iter 100
+            // 2. $pypef hybrid -t TS.fasl --params GREMLIN
+            // or
+            // 2. $pypef hybrid -m GREMLIN -t TS.fasl --params GREMLIN
+            "name": "Python: PyPEF save GREMLIN avGFP model",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "param_inference", 
+                "--msa", "uref100_avgfp_jhmmer_119.a2m", 
+                "--opt_iter", "100"
+            ]
+        },
+
+        {
+            "name": "Python: PyPEF hybrid/only-TS-zero-shot GREMLIN-DCA avGFP",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "hybrid", 
+                //"-m", "GREMLIN",   // optional, not required  
+                "--ts", "TS.fasl", 
+                "--params", "GREMLIN"
+            ]
+        },
+
+        {
+            "name": "Python: PyPEF hybrid/only-PS-zero-shot GREMLIN-DCA avGFP",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "hybrid", 
+                //"-m", "GREMLIN",   // optional, not required  
+                "--ps", "TS.fasl", 
+                "--params", "GREMLIN"
+            ]
+        },
+
+        {   // PLMC zero-shot steps:
+            // 1. $pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params
+            // 2. $pypef hybrid -t TS.fasl --params PLMC
+            "name": "Python: PyPEF save PLMC avGFP model",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "param_inference", 
+                "--params", "uref100_avgfp_jhmmer_119_plmc_42.6.params"
+            ]
+        },
+
+        {   
+            "name": "Python: PyPEF hybrid/only-TS-zero-shot PLMC-DCA avGFP",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "hybrid", 
+                "--ts", "TS.fasl", 
+                "--params", "PLMC",
+                "--threads", "24"
+            ]
+        },
+
+        {
+            "name": "Python: PyPEF hybrid/only-PS-zero-shot PLMC-DCA avGFP",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "hybrid", 
+                //"-m", "PLMC",   // optional, not required  
+                "--ps", "TS.fasl", 
+                "--params", "uref100_avgfp_jhmmer_119_plmc_42.6.params",
+                "--threads", "24"
+            ]
+        },
+
+        {
+            "name": "Python: PyPEF hybrid/only-PS-zero-shot PLMC-DCA variant 2 avGFP",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "hybrid", 
+                //"-m", "PLMC",   // optional, not required  
+                "--ps", "TS.fasl", 
+                "--params", "PLMC",
+                "--threads", "24"
+            ]
+        },
+
+        {
+            "name": "Python: PyPEF !wrong! MSA input format (STO)",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "param_inference", 
+                "--msa", "uref100_avgfp_jhmmer_119.sto"
+            ]
+        }
+    ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "markdown.extension.toc.updateOnSave": false
+}
diff --git a/README.md b/README.md
@@ -17,20 +17,22 @@ Preprint available at bioRxiv: https://doi.org/10.1101/2022.06.07.495081.
 <sup>*§*</sup><sub>Equal contribution</sub> <br>
 
 ---
+
 ## Table of Contents
-- [PyPEF](#pypef)
-- [Installation](#installation)
-- [Requirements](#requirements)
-- [Running Examples](#examples)
-- [Tutorial](#tutorial)
-- [Encoding Technique Options](#encoding-options)
-- [Modeling Techniques](#modeling-techniques)
-    - [Pure Machine Learning (ML)-based Modeling](#pure-ml)
-    - [Hybrid Modeling](#hybrid-modeling)
-- [Model Hyperparameter Grids for Training](#grids)
-- [Setting Up the Scripts Yourself](#set-up)
-- [Preprocessing for DCA-based Sequence Encoding](#dca-preprocessing)
-- [API Usage for Sequence Encoding](#api-usage)
+[PyPEF: Pythonic Protein Engineering Framework](#pypef-pythonic-protein-engineering-framework)
+  - [Quick Installation](#quick-installation)
+  - [Requirements](#requirements)
+  - [Running Examples](#running-examples)
+  - [Tutorial](#tutorial)
+  - [Encoding Technique Options](#encoding-technique-options)
+  - [Modeling Techniques](#modeling-techniques)
+    - [Pure Machine Learning (ML)-based Modeling](#pure-machine-learning-ml-based-modeling)
+    - [Hybrid Modeling Using the MERGE Method](#hybrid-modeling-using-the-merge-method)
+  - [Model Hyperparameter Grids for Training](#model-hyperparameter-grids-for-training)
+  - [Setting Up the Scripts Yourself](#setting-up-the-scripts-yourself)
+  - [Preprocessing for DCA-based Sequence Encoding](#preprocessing-for-dca-based-sequence-encoding)
+  - [Unsupervised/zero-shot prediction](#unsupervisedzero-shot-prediction)
+  - [API Usage for Sequence Encoding](#api-usage-for-sequence-encoding)
 ---
 
 <a name="pypef"></a>
@@ -278,7 +280,7 @@ Copy the notebook URL in your internet browser and select the Workflow_PyPEF.ipy
 ## Modeling Techniques
 <a name="pure-ml"></a>
 ### Pure Machine Learning (ML)-based Modeling
-Serveral linear and non-linear modeling options are available by default to construct supervised regression models based on the generated sequence features, i.e. encoded sequences. 
+Several linear and non-linear modeling options are available by default to construct supervised regression models based on the generated sequence features, i.e. encoded sequences.
 Regression models are trained, i.e. model hyperparameters are optimized, by *k*- fold (by default, fivefold) cross-validation on training samples. Here, the model aims to map the encoded variant sequences that are the features (***X***) for predicting the corresponding fitness labels (***y***) such that *f(***X***)* --> ***y*** – while cross-validation and/or using a model implementing a penalty will be necessary for better model generalization behavior.
 Following regression options from [Scikit-learn](https://scikit-learn.org/stable/) are implemented (for optimized hyperparameters, see Model Hyperparameters section below):
 - [Partial Least Squares Regression (linear model)](https://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html)
@@ -376,38 +378,80 @@ python3 ./pypef/main.py
    pypef sto2a2m --sto ANEH_jhmmer.sto
    ```
 
-5. Now you can follow approaches 5.1 (using GREMLIN; implemented in TensorFlow) or 5.2 (using plmc; extern parameter generation in C). 
+5. Now you can follow approaches 5.1 (using GREMLIN; implemented in TensorFlow) or 5.2 (using plmc; extern parameter generation in C).
 
     5.1. Running GREMLIN on the generated MSA (in FASTA or A2M format):
+
     ```
     pypef param_inference --msa ANEH_jhmmer.a2m -w WT_SEQUENCE.FASTA --opt_iter 250
     ```
+
     The pickled GREMLIN file can then be used for encoding new/test sequences:
+
     ```
     pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params GREMLIN
     ```
+
     Or for hybrid modeling:
+
     ```
     pypef hybrid -l LS.fasl -t TS.fasl --params GREMLIN
     ```
 
     5.2 After [installing plmc](https://github.com/debbiemarkslab/plmc#compilation), generate the evolutionary coupling file, which is used for encoding sequences. For example, set `-le` to the value output by `sto2a2m`:
+
     ```
     plmc -o ANEH_72.6.params -le 72.6 -m 100 -g -f WT_ANEH ANEH_jhmmer.a2m
     ```
     
     The output parameter (.params) file can be used for encoding sequences with the DCA-based encoding technique (`-e dca`) by providing it to PyPEF; e.g. for pure ML modeling:
+
     ```
     pypef ml -e dca -l LS.fasl -t TS.fasl --regressor pls --params ANEH_72.6.params
     ```
+
     Or for hybrid modeling:
+
     ```
     pypef hybrid -l LS.fasl -t TS.fasl --params ANEH_72.6.params
     ```
 
+<a name="zero-shot-prediction"></a>
+## Unsupervised/zero-shot prediction
+Several developed methods allow unsupervised prediction of a proteins fitness based on its sequence (and/or structure).
+These methods have the advantage that no initial knowledge about a proteins fitness is required for prediction, while a correlation of the predicted score and a protein's natural fitness is assumed.
+DCA itself is a statistical/unsupervised method based on MSA information that outperforms simpler MSA-based methods (such as (un)coupled raw MSA sequence frequencies or BLOSUM scores), e.g., see [scripts/GREMLIN_numba/using_gremlin_functionalities.ipynb](scripts/GREMLIN_numba/using_gremlin_functionalities.ipynb).
+To make zero-shot predictions using PyPEF (plmc-DCA or GREMLIN-DCA) just do not provide a training set (no `-l` flag, only a `-t` or `-p` flag) for hybrid modeling, e.g., for the avGFP data, try
+
+```
+pypef param_inference --msa uref100_avgfp_jhmmer_119.a2m
+pypef hybrid -t TS.fasl --params GREMLIN
+pypef hybrid -p PS.fasta --params GREMLIN
+```
+
+using the GREMLIN parameters, or,
+
+```
+pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params
+pypef hybrid -t TS.fasl --params PLMC
+pypef hybrid -p PS.fasta --params PLMC
+```
+
+using the plmc parameters.
+
+Other well-performing zero-shot prediction methods with available source code are:
+
+- ESM-1v/ESM-2 (https://github.com/facebookresearch/esm)
+- DeepSequence (https://github.com/debbiemarkslab/DeepSequence)
+- EVcouplings (plmc-DCA, https://github.com/debbiemarkslab/EVcouplings)
+- EVE (https://github.com/OATML/EVE)
+- Tranception (https://github.com/OATML-Markslab/Tranception)
+  
+This list is by no means complete, see ProteinGym [repository](https://github.com/OATML-Markslab/ProteinGym) and [website](https://proteingym.org/) for a more detailed overview of available methods and achieved performances (as well as for getting many benchmark data sets).
+
 <a name="api-usage"></a>
 ## API Usage for Sequence Encoding
-For script-based encoding of sequences using PyPEF and the available AAindex-, OneHot- or DCA-based techniques, the classes and corresponding functions can be imported, i.e. `OneHotEncoding`, `AAIndexEncoding`, `GREMLIN` (DCA),  `PLMC` (DCA), and `DCAHybridModel`. In addition, implemented functions for CV-based tuning of regression models can be used to train and validate models, eventually deriving them to obtain performances on retained data for testing. An exemplary script and a Jupyter notebook for CV-based (low-*N*) tuning of models and using them for testing is provided at [scripts/Encoding_low_N/api_encoding_train_test.py]( scripts/Encoding_low_N/api_encoding_train_test.py) and [scripts/Encoding_low_N/api_encoding_train_test.ipynb](scripts/Encoding_low_N/api_encoding_train_test.ipynb), respectively.
+For script-based encoding of sequences using PyPEF and the available AAindex-, OneHot- or DCA-based techniques, the classes and corresponding functions can be imported, i.e. `OneHotEncoding`, `AAIndexEncoding`, `GREMLIN` (DCA),  `PLMC` (DCA), and `DCAHybridModel`. In addition, implemented functions for CV-based tuning of regression models can be used to train and validate models, eventually deriving them to obtain performances on retained data for testing. An exemplary script and a Jupyter notebook for CV-based (low-*N*) tuning of models and using them for testing is provided at [scripts/Encoding_low_N/api_encoding_train_test.py](scripts/Encoding_low_N/api_encoding_train_test.py) and [scripts/Encoding_low_N/api_encoding_train_test.ipynb](scripts/Encoding_low_N/api_encoding_train_test.ipynb), respectively.
 
 <p align="center">
     <img src=".github/imgs/low_N_avGFP_extrapolation.png" alt="drawing" width="500"/>