diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index cd0089675..0360a7222 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -1,6 +1,6 @@ -name: "Install Python and deeprank2" +name: "Install Python and DeepRank2" -description: "Installs Python, updates pip and installs deeprank2 together with its dependencies." +description: "Installs Python, updates pip and installs DeepRank2 together with its dependencies." inputs: python-version: @@ -27,8 +27,10 @@ runs: with: update-conda: true python-version: ${{ inputs.python-version }} - conda-channels: anaconda - - run: conda --version + conda-channels: pytorch, pyg, bioconda, defaults, sbl, conda-forge + - run: | + conda --version + conda env list shell: bash {0} - name: Python info shell: bash -e {0} @@ -41,16 +43,16 @@ runs: CMAKE_INSTALL_PREFIX: .local if: runner.os == 'Linux' run: | - # Install dependencies not handled by setuptools + # Install deeprank2 conda dependencies ## DSSP - sudo apt-get install -y dssp + conda install -c sbl dssp>=4.2.2.1 ## MSMS - conda install -c bioconda msms + conda install -c bioconda msms>=2.6.1 ## PyTorch, PyG, PyG adds ### Installing for CPU only on the CI - conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 -c pytorch - pip install torch_geometric==2.3.1 - pip install torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-$(python3 -c "import torch; print(torch.__version__)")+cpu.html + conda install pytorch=2.1.1 torchvision=0.16.1 torchaudio=2.1.1 cpuonly=2.0.* -c pytorch + conda install pyg=2.4.0 -c pyg + pip install torch_scatter==2.1.2 torch_sparse==0.6.18 torch_cluster==1.6.3 torch_spline_conv==1.2.2 -f https://data.pyg.org/whl/torch-2.1.0+cpu.html - name: Install dependencies on MacOS shell: bash {0} env: @@ -59,26 +61,7 @@ runs: run: | # Install dependencies not handled 
by setuptools ## DSSP - git clone https://github.com/PDB-REDO/libcifpp.git --recurse-submodules - cd libcifpp - cmake -S . -B build -DCMAKE_INSTALL_PREFIX=$HOME/.local -DCMAKE_BUILD_TYPE=Release - cmake --build build - cmake --install build - ####### - git clone https://github.com/mhekkel/libmcfp.git - cd libmcfp - mkdir build - cd build - cmake .. - cmake --build . - cmake --install . - ####### - git clone https://github.com/PDB-REDO/dssp.git - cd dssp - mkdir build - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release - cmake --build build - cmake --install build + conda install -c sbl dssp>=4.2.2.1 ## MSMS cd /tmp/ wget http://mgltools.scripps.edu/downloads/tars/releases/MSMSRELEASE/REL2.6.1/msms_i86Linux2_2.6.1.tar.gz diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4eb534ded..bb41a32a7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -37,7 +37,7 @@ jobs: fail-fast: false matrix: os: ["ubuntu-latest"] - python-version: ["3.10", "3.11"] + python-version: ["3.10"] # ["3.10", "3.11"] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/coveralls.yml b/.github/workflows/coveralls.yml index ba0f5a391..8a7632985 100644 --- a/.github/workflows/coveralls.yml +++ b/.github/workflows/coveralls.yml @@ -46,7 +46,7 @@ jobs: python-version: ${{ matrix.python-version }} extras-require: test - name: Run unit tests with coverage - run: pytest --cov --cov-append --cov-report xml --cov-report term --cov-report html + run: pytest --cov --cov-append --cov-report xml --cov-fail-under=80 --cov-report term --cov-report html - name: Coveralls env: GITHUB_TOKEN: ${{ secrets.github_token }} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..f4b52528a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,32 @@ +# Pull base image +FROM --platform=linux/x86_64 condaforge/miniforge3:23.3.1-1 + +# Add files +ADD ./tutorials /home/deeprank2/tutorials +ADD ./env/environment.yml /home/deeprank2 +ADD 
./env/requirements.txt /home/deeprank2 + +# Install +RUN \ + apt update -y && + apt install unzip -y && + ## GCC + apt install -y gcc && + ## Conda and pip deps + mamba env create -f /home/deeprank2/environment.yml && + ## Get the data for running the tutorials + if [ -d "/home/deeprank2/tutorials/data_raw" ]; then rm -Rf /home/deeprank2/tutorials/data_raw; fi && + if [ -d "/home/deeprank2/tutorials/data_processed" ]; then rm -Rf /home/deeprank2/tutorials/data_processed; fi && + wget https://zenodo.org/records/8349335/files/data_raw.zip && + unzip data_raw.zip -d data_raw && + mv data_raw /home/deeprank2/tutorials + +# Activate the environment +RUN echo "source activate deeprank2" >~/.bashrc +ENV PATH /opt/conda/envs/deeprank2/bin:$PATH + +# Define working directory +WORKDIR /home/deeprank2 + +# Define default command +CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--NotebookApp.token=''","--NotebookApp.password=''", "--allow-root"] diff --git a/README.md b/README.md index d0ae4356b..60b83b595 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ -# Deeprank2 +# DeepRank2 -| Badges | | -| :------------: | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **fairness** | [![fair-software.eu](https://img.shields.io/badge/fair--software.eu-%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F-green)](https://fair-software.eu) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/6403/badge)](https://bestpractices.coreinfrastructure.org/projects/6403) | 
-| **package** | [![PyPI version](https://badge.fury.io/py/deeprank2.svg)](https://badge.fury.io/py/deeprank2) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/f3f98b2d1883493ead50e3acaa23f2cc)](https://app.codacy.com/gh/DeepRank/deeprank2?utm_source=github.com&utm_medium=referral&utm_content=DeepRank/deeprank2&utm_campaign=Badge_Grade) | -| **docs** | [![Documentation Status](https://readthedocs.org/projects/deeprank2/badge/?version=latest)](https://deeprank2.readthedocs.io/en/latest/?badge=latest) [![DOI](https://zenodo.org/badge/450496579.svg)](https://zenodo.org/badge/latestdoi/450496579) | -| **tests** | [![Build Status](https://github.com/DeepRank/deeprank2/actions/workflows/build.yml/badge.svg)](https://github.com/DeepRank/deeprank2/actions) ![Linting status](https://github.com/DeepRank/deeprank2/actions/workflows/linting.yml/badge.svg?branch=main) [![Coverage Status](https://coveralls.io/repos/github/DeepRank/deeprank2/badge.svg?branch=main)](https://coveralls.io/github/DeepRank/deeprank2?branch=main) ![Python](https://img.shields.io/badge/python-3.10-blue.svg) ![Python](https://img.shields.io/badge/python-3.11-blue.svg) | -| **running on** | ![Ubuntu](https://img.shields.io/badge/Ubuntu-E95420?style=for-the-badge&logo=ubuntu&logoColor=white) | -| **license** | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/license/apache-2-0/) | +| Badges | | +| :------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| 
**fairness** | [![fair-software.eu](https://img.shields.io/badge/fair--software.eu-%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F-green)](https://fair-software.eu) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/6403/badge)](https://bestpractices.coreinfrastructure.org/projects/6403) | +| **package** | [![PyPI version](https://badge.fury.io/py/deeprank2.svg)](https://badge.fury.io/py/deeprank2) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/f3f98b2d1883493ead50e3acaa23f2cc)](https://app.codacy.com/gh/DeepRank/deeprank2?utm_source=github.com&utm_medium=referral&utm_content=DeepRank/deeprank2&utm_campaign=Badge_Grade) | +| **docs** | [![Documentation Status](https://readthedocs.org/projects/deeprank2/badge/?version=latest)](https://deeprank2.readthedocs.io/en/latest/?badge=latest) [![DOI](https://zenodo.org/badge/450496579.svg)](https://zenodo.org/badge/latestdoi/450496579) | +| **tests** | [![Build Status](https://github.com/DeepRank/deeprank2/actions/workflows/build.yml/badge.svg)](https://github.com/DeepRank/deeprank2/actions) ![Linting status](https://github.com/DeepRank/deeprank2/actions/workflows/linting.yml/badge.svg?branch=main) [![Coverage Status](https://coveralls.io/repos/github/DeepRank/deeprank2/badge.svg?branch=main)](https://coveralls.io/github/DeepRank/deeprank2?branch=main) ![Python](https://img.shields.io/badge/python-3.10-blue.svg) | +| **running on** | ![Ubuntu](https://img.shields.io/badge/Ubuntu-E95420?style=for-the-badge&logo=ubuntu&logoColor=white) | +| **license** | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/license/apache-2-0/) | ## Overview @@ -34,54 +34,102 @@ DeepRank2 extensive documentation can be found [here](https://deeprank2.rtfd.io/ ## Table of contents -- [Deeprank2](#deeprank2) +- [DeepRank2](#deeprank2) - [Overview](#overview) - [Table of contents](#table-of-contents) - [Installation](#installation) - - 
[Dependencies](#dependencies) - - [Deeprank2 Package](#deeprank2-package) - - [Test installation](#test-installation) - - [Contributing](#contributing) + - [Containerized Installation](#containerized-installation) + - [Local/remote installation](#localremote-installation) + - [YML file installation](#yml-file-installation) + - [Manual installation](#manual-installation) + - [Testing DeepRank2 installation](#testing-deeprank2-installation) + - [Contributing](#contributing) + - [Using DeepRank2](#using-deeprank2) - [Data generation](#data-generation) - [Datasets](#datasets) - [GraphDataset](#graphdataset) - [GridDataset](#griddataset) - [Training](#training) - - [Run a pre-trained model on new data](#run-a-pre-trained-model-on-new-data) + - [Run a pre-trained model on new data](#run-a-pre-trained-model-on-new-data) - [Computational performances](#computational-performances) - [Package development](#package-development) ## Installation -The package officially supports ubuntu-latest OS only, whose functioning is widely tested through the continuous integration workflows. +There are two ways to install DeepRank2: -### Dependencies +1. In a [dockerized container](#containerized-installation). This allows you to use DeepRank2, including all the notebooks within the container (a protected virtual space), without worrying about your operating system or installation of dependencies. + - We recommend this installation for inexperienced users and to learn to use or test our software, e.g. using the provided [tutorials](tutorials/TUTORIAL.md). However, resources might be limited in this installation and we would not recommend using it for large datasets or on high-performance computing facilities. +2. [Local installation](#localremote-installation) on your system. This allows you to use the full potential of DeepRank2, but requires a few additional steps during installation. 
+ - We recommend this installation for more experienced users, for larger projects, and for (potential) [contributors](#contributing) to the codebase. -Before installing deeprank2 you need to install some dependencies. We advise to use a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) with Python >= 3.10 installed. The following dependency installation instructions are updated as of 14/09/2023, but in case of issues during installation always refer to the official documentation which is linked below: +### Containerized Installation -- [MSMS](https://anaconda.org/bioconda/msms): `conda install -c bioconda msms`. - - [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. -- [PyTorch](https://pytorch.org/get-started/locally/) - - We support torch's CPU library as well as CUDA. - - Currently, the package is tested using [PyTorch 2.0.1](https://pytorch.org/get-started/previous-versions/#v201). -- [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`. -- [DSSP 4](https://swift.cmbi.umcn.nl/gv/dssp/) - - Check if `dssp` is installed: `dssp --version`. If this gives an error or shows a version lower than 4: - - on ubuntu 22.04 or newer: `sudo apt-get install dssp`. If the package cannot be located, first run `sudo apt-get update`. - - on older versions of ubuntu or on mac or lacking sudo priviliges: install from [here](https://github.com/pdb-redo/dssp), following the instructions listed. Alternatively, follow [this](https://github.com/PDB-REDO/libcifpp/issues/49) thread. -- [GCC](https://gcc.gnu.org/install/) - - Check if gcc is installed: `gcc --version`. If this gives an error, run `sudo apt-get install gcc`. -- For MacOS with M1 chip users only install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html). 
+In order to try out the package without worrying about your OS and without the need of installing all the required dependencies, we created a `Dockerfile` that can be used for taking care of everything in a suitable container. + +For this, you first need to install [Docker](https://docs.docker.com/engine/install/) on your system. Then run the following commands. You may need to have sudo permission for some steps, in which case the commands below can be preceded by `sudo`: + +```bash +# Clone the DeepRank2 repository and enter its root directory +git clone https://github.com/DeepRank/deeprank2 +cd deeprank2 + +# Build and run the Docker image +docker build -t deeprank2 . +docker run -p 8888:8888 deeprank2 +``` + +Next, open a browser and go to `http://localhost:8888` to access the application running inside the Docker container. From there you can use DeepRank2, e.g. to run the tutorial notebooks. + +More details about the tutorials' contents can be found [here](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). Note that in the docker container only the raw PDB files are downloaded, which needed as a starting point for the tutorials. You can obtain the processed HDF5 files by running the `data_generation_xxx.ipynb` notebooks. Because Docker containers are limited in memory resources, we limit the number of data points processed in the tutorials. Please [install the package locally](#localremote-installation) to fully leverage its capabilities. + +If after running the tutorials you want to remove the (quite large) Docker image from your machine, you must first [stop the container](https://docs.docker.com/engine/reference/commandline/stop/) and can then [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/). More general information about Docker can be found on the [official website docs](https://docs.docker.com/get-started/). 
+ +### Local/remote installation + +Local installation is formally only supported on the latest stable release of ubuntu, for which widespread automated testing through continuous integration workflows has been set up. However, it is likely that the package runs smoothly on other operating systems as well. -## Deeprank2 Package +Before installing DeepRank2 please ensure you have [GCC](https://gcc.gnu.org/install/) installed: if running `gcc --version` gives an error, run `sudo apt-get install gcc`. -Once the dependencies are installed, you can install the latest stable release of deeprank2 using the PyPi package manager: +#### YML file installation + +You can use the provided YML file for creating a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) containing the latest stable release of DeepRank2 and all its dependencies. +This will install the CPU-only version of DeepRank2 on Python 3.10. +Note that this will not work for MacOS. Do the [Manual Installation](#manual-installation) instead. ```bash -pip install deeprank2 +# Clone the DeepRank2 repository and enter its root directory +git clone https://github.com/DeepRank/deeprank2 +cd deeprank2 + +# Ensure you are in your base environment +conda activate +# Create the environment +conda env create -f env/environment.yml +# Activate the environment +conda activate deeprank2 ``` -Alternatively, get all the new developments by cloning the repo and installing the editable version of the package with: +See instructions below to [test](#testing-deeprank2-installation) that the installation was succesful. + +#### Manual installation + +If you want to use the GPUs, choose a specific python version, are a MacOS user, or if the YML installation was not succesful, you can install the package manually. We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). 
+If you have any issues during installation of dependencies, please refer to the official documentation for each package (linked below), as our instructions may be out of date (last tested on 19 Jan 2024): + +- [DSSP 4](https://anaconda.org/sbl/dssp): `conda install -c sbl dssp` +- [MSMS](https://anaconda.org/bioconda/msms): `conda install -c bioconda msms` + - [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. +- [PyTorch](https://pytorch.org/get-started/locally/): `conda install pytorch torchvision torchaudio cpuonly -c pytorch` + - Pytorch regularly publishes updates and not all newest versions will work stably with DeepRank2. Currently, the package is tested using [PyTorch 2.1.1](https://pytorch.org/get-started/previous-versions/#v211). + - We support torch's CPU library as well as CUDA. +- [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`. + - The exact command to install pyg will depend on the version of pytorch you are using. Please refer to the source's installation instructions (we recommend using the pip installation for this as it also shows the command for the dependencies). +- For MacOS with M1 chip users: install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html). + +Finally install deeprank2 itself: `pip install deeprank2`. + +Alternatively, get the latest updates by cloning the repo and installing the editable version of the package with: ```bash git clone https://github.com/DeepRank/deeprank2 @@ -89,21 +137,23 @@ cd deeprank2 pip install -e .'[test]' ``` -The `test` extra is optional, and can be used to install test-related dependencies useful during the development. +The `test` extra is optional, and can be used to install test-related dependencies, useful during development. 
+ +#### Testing DeepRank2 installation -### Test installation +You can check that all components were installed correctly, using pytest. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). -If you have installed the package from a cloned repository (second option above), you can check that all components were installed correctly, using pytest. The quick test should be sufficient to ensure that the software works, while the full test (a few minutes) will cover a much broader range of settings to ensure everything is correct. -Run `pytest tests/test_integration.py` for the quick test or just `pytest` for the full test (expect a few minutes to run). +First run `pip install pytest`, if you did not install it above. Then run `pytest tests/test_integration.py` for the quick test or just `pytest` for the full test (expect a few minutes to run). -### Contributing +## Contributing If you would like to contribute to the package in any way, please see [our guidelines](CONTRIBUTING.rst). -The following section serves as a first guide to start using the package, using protein-protein Interface (PPI) queries -as example. For an enhanced learning experience, we provide in-depth [tutorial notebooks](https://github.com/DeepRank/deeprank2/tree/main/tutorials) for generating PPI data, generating SVR data, and for the training pipeline. +## Using DeepRank2 + +The following section serves as a first guide to start using the package, using protein-protein Interface (PPI) queries as example. For an enhanced learning experience, we provide in-depth [tutorial notebooks](https://github.com/DeepRank/deeprank2/tree/main/tutorials) for generating PPI data, generating SVR data, and for the training pipeline. For more details, see the [extended documentation](https://deeprank2.rtfd.io/). 
### Data generation @@ -313,7 +363,7 @@ trainer.test() ``` -### Run a pre-trained model on new data +#### Run a pre-trained model on new data If you want to analyze new PDB files using a pre-trained model, the first step is to process and save them into HDF5 files [as we have done above](#data-generation). diff --git a/docs/features.md b/docs/features.md index de3e9e639..3ed576ccd 100644 --- a/docs/features.md +++ b/docs/features.md @@ -4,7 +4,9 @@ Features implemented in the code-base are defined in `deeprank2.feature` subpack ## Custom features -Users can add custom features by creating a new module and placing it in `deeprank2.feature` subpackage. One requirement for any feature module is to implement an `add_features` function, as shown below. This will be used in `deeprank2.models.query` to add the features to the nodes or edges of the graph. +Users can add custom features by cloning the repository, creating a new module and placing it in `deeprank2.feature` subpackage. The custom features can then be used by installing the package in editable mode (see [here](https://deeprank2.readthedocs.io/en/latest/installation.html#install-deeprank2) for more details). We strongly recommend submitting a pull request (PR) to merge the new feature into the official repository. + +One requirement for any feature module is to implement an `add_features` function, as shown below. This will be used in `deeprank2.models.query` to add the features to the nodes or edges of the graph. ```python @@ -20,6 +22,59 @@ def add_features( pass ``` +Additionally, the nomenclature of the custom feature should be added in `deeprank2.domain.edgestorage` or `deeprank2.domain.nodestorage`, depending on which type of feature it is. 
+ +As an example, this is the implementation of the node feature `res_type`, which represents the one-hot encoding of the amino acid residue and is defined in `deeprank2.features.components` module: + +```python +from deeprank2.domain import nodestorage as Nfeat +from deeprank2.molstruct.atom import Atom +from deeprank2.molstruct.residue import Residue, SingleResidueVariant +from deeprank2.utils.graph import Graph + +def add_features( + pdb_path: str, graph: Graph, + single_amino_acid_variant: Optional[SingleResidueVariant] = None + ): + + for node in graph.nodes: + if isinstance(node.id, Residue): + residue = node.id + elif isinstance(node.id, Atom): + atom = node.id + residue = atom.residue + else: + raise TypeError(f"Unexpected node type: {type(node.id)}") + + node.features[Nfeat.RESTYPE] = residue.amino_acid.onehot +``` + +`RESTYPE` is the name of the variable assigned to the feature `res_type` in `deeprank2.domain.nodestorage`. In order to use the feature from DeepRank2 API, its module needs to be imported and specified during the queries processing: + +```python +from deeprank2.features import components + +feature_modules = [components] + +# Save data into 3D-graphs only +hdf5_paths = queries.process( + "/", + feature_modules = feature_modules) +``` + +Then, the feature `res_type` can be used from the DeepRank2 datasets API: + +```python +from deeprank2.dataset import GraphDataset + +node_features = ["res_type"] + +dataset = GraphDataset( + hdf5_path = hdf5_paths, + node_features = node_features +) +``` + The following is a brief description of the features already implemented in the code-base, for each features' module. 
## Default node features diff --git a/docs/getstarted.md b/docs/getstarted.md index bb5d3c311..9a98462d7 100644 --- a/docs/getstarted.md +++ b/docs/getstarted.md @@ -137,7 +137,7 @@ As representative example, the following is the HDF5 structure generated by the └── binary ``` -This entry represents the interface between the two proteins contained in the `.pdb` file, at the residue level. `edge_features` and `node_features` are specific for the graph-like representation of the PPI, while `grid_points` and `mapped_features` refer to the grid mapped from the graph. Each data point generated by deeprank2 has the above structure, apart from the features and the target that are specified by the user. +This entry represents the interface between the two proteins contained in the `.pdb` file, at the residue level. `edge_features` and `node_features` are specific for the graph-like representation of the PPI, while `grid_points` and `mapped_features` refer to the grid mapped from the graph. Each data point generated by DeepRank2 has the above structure, apart from the features and the target that are specified by the user. It is always a good practice to first explore the data, and then make decision about splitting them in training, test and validation sets. For this purpose, users can either use [HDFView](https://www.hdfgroup.org/downloads/hdfview/), a visual tool written in Java for browsing and editing HDF5 files, or Python packages such as [h5py](https://docs.h5py.org/en/stable/). Few examples for the latter: @@ -360,7 +360,7 @@ trainer.test() ### Results export and visualization -The user can specify a deeprank2 exporter or a custom one in `output_exporters` parameter of the Trainer class, together with the path where to save the results. Exporters are used for storing predictions information collected later on during training and testing. 
Example: +The user can specify a DeepRank2 exporter or a custom one in `output_exporters` parameter of the Trainer class, together with the path where to save the results. Exporters are used for storing predictions information collected later on during training and testing. Example: ```python from deeprank2.trainer import Trainer diff --git a/docs/index.rst b/docs/index.rst index 3825a07c2..fe3d4084d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -70,7 +70,6 @@ Package reference :doc:`reference/deeprank2` This section documents the DeepRank2 API. - Indices and tables ================== diff --git a/docs/installation.md b/docs/installation.md index 0b40d3197..af2443ea9 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,33 +1,90 @@ +# Table of contents + +- [Table of contents](#table-of-contents) +- [Installation](#installation) + - [Containerized Installation](#containerized-installation) + - [Local/remote installation](#localremote-installation) + - [YML file installation](#yml-file-installation) + - [Manual installation](#manual-installation) + - [Testing DeepRank2 installation](#testing-deeprank2-installation) +- [Contributing](#contributing) + # Installation -The package officially supports ubuntu-latest OS only, whose functioning is widely tested through the continuous integration workflows. +There are two ways to install DeepRank2: -## Dependencies +1. In a [dockerized container](#containerized-installation). This allows you to use DeepRank2, including all the notebooks within the container (a protected virtual space), without worrying about your operating system or installation of dependencies. + - We recommend this installation for inexperienced users and to learn to use or test our software, e.g. using the provided [tutorials](tutorials/TUTORIAL.md). However, resources might be limited in this installation and we would not recommend using it for large datasets or on high-performance computing facilities. +2. 
[Local installation](#localremote-installation) on your system. This allows you to use the full potential of DeepRank2, but requires a few additional steps during installation. + - We recommend this installation for more experienced users, for larger projects, and for (potential) [contributors](#contributing) to the codebase. -Before installing deeprank2 you need to install some dependencies. We advise to use a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) with Python >= 3.10 installed. The following dependency installation instructions are updated as of 14/09/2023, but in case of issues during installation always refer to the official documentation which is linked below: +## Containerized Installation -- [MSMS](https://anaconda.org/bioconda/msms): `conda install -c bioconda msms`. - - [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. -- [PyTorch](https://pytorch.org/get-started/locally/) - - We support torch's CPU library as well as CUDA. -- [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`. -- [DSSP 4](https://swift.cmbi.umcn.nl/gv/dssp/) -- Check if `dssp` is installed: `dssp --version`. If this gives an error or shows a version lower than 4: -- on ubuntu 22.04 or newer: `sudo apt-get install dssp`. If the package cannot be located, first run `sudo apt-get update`. -- on older versions of ubuntu or on mac or lacking sudo priviliges: install from [here](https://github.com/pdb-redo/dssp), following the instructions listed. -- [GCC](https://gcc.gnu.org/install/) -- Check if gcc is installed: `gcc --version`. If this gives an error, run `sudo apt-get install gcc`. -- For MacOS with M1 chip users only install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html). 
+In order to try out the package without worrying about your OS and without the need of installing all the required dependencies, we created a `Dockerfile` that can be used for taking care of everything in a suitable container. + +For this, you first need to install [Docker](https://docs.docker.com/engine/install/) on your system. Then run the following commands. You may need to have sudo permission for some steps, in which case the commands below can be preceded by `sudo`: + +```bash +# Clone the DeepRank2 repository and enter its root directory +git clone https://github.com/DeepRank/deeprank2 +cd deeprank2 + +# Build and run the Docker image +docker build -t deeprank2 . +docker run -p 8888:8888 deeprank2 +``` + +Next, open a browser and go to `http://localhost:8888` to access the application running inside the Docker container. From there you can use DeepRank2, e.g. to run the tutorial notebooks. + +More details about the tutorials' contents can be found [here](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). Note that in the docker container only the raw PDB files are downloaded, which needed as a starting point for the tutorials. You can obtain the processed HDF5 files by running the `data_generation_xxx.ipynb` notebooks. Because Docker containers are limited in memory resources, we limit the number of data points processed in the tutorials. Please [install the package locally](#localremote-installation) to fully leverage its capabilities. + +If after running the tutorials you want to remove the (quite large) Docker image from your machine, you must first [stop the container](https://docs.docker.com/engine/reference/commandline/stop/) and can then [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/). More general information about Docker can be found on the [official website docs](https://docs.docker.com/get-started/). 
-## Deeprank2 Package +## Local/remote installation -Once the dependencies are installed, you can install the latest stable release of deeprank2 using the PyPi package manager: +Local installation is formally only supported on the latest stable release of Ubuntu, for which widespread automated testing through continuous integration workflows has been set up. However, it is likely that the package runs smoothly on other operating systems as well. + +Before installing DeepRank2 please ensure you have [GCC](https://gcc.gnu.org/install/) installed: if running `gcc --version` gives an error, run `sudo apt-get install gcc`. + +### YML file installation + +You can use the provided YML file for creating a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) containing the latest stable release of DeepRank2 and all its dependencies. +This will install the CPU-only version of DeepRank2 on Python 3.10. +Note that this will not work for MacOS. Do the [Manual Installation](#manual-installation) instead. ```bash -pip install deeprank2 +# Clone the DeepRank2 repository and enter its root directory +git clone https://github.com/DeepRank/deeprank2 +cd deeprank2 + +# Ensure you are in your base environment +conda activate +# Create the environment +conda env create -f env/environment.yml +# Activate the environment +conda activate deeprank2 ``` -Alternatively, get all the new developments by cloning the repo and installing the editable version of the package with: +See instructions below to [test](#testing-deeprank2-installation) that the installation was successful. + +### Manual installation + +If you want to use the GPUs, choose a specific Python version, are a MacOS user, or if the YML installation was not successful, you can install the package manually. We advise doing this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html).
+If you have any issues during installation of dependencies, please refer to the official documentation for each package (linked below), as our instructions may be out of date (last tested on 19 Jan 2024): + +- [DSSP 4](https://anaconda.org/sbl/dssp): `conda install -c sbl dssp` +- [MSMS](https://anaconda.org/bioconda/msms): `conda install -c bioconda msms` + - [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. +- [PyTorch](https://pytorch.org/get-started/locally/): `conda install pytorch torchvision torchaudio cpuonly -c pytorch` + - PyTorch regularly publishes updates and not all newest versions will work stably with DeepRank2. Currently, the package is tested using [PyTorch 2.1.1](https://pytorch.org/get-started/previous-versions/#v211). + - We support torch's CPU library as well as CUDA. +- [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`. + - The exact command to install PyG will depend on the version of PyTorch you are using. Please refer to the source's installation instructions (we recommend using the pip installation for this as it also shows the command for the dependencies). +- For MacOS with M1 chip users: install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html). + +Finally, install DeepRank2 itself: `pip install deeprank2`. + +Alternatively, get the latest updates by cloning the repo and installing the editable version of the package with: ```bash git clone https://github.com/DeepRank/deeprank2 @@ -35,15 +92,16 @@ cd deeprank2 pip install -e .'[test]' ``` -The `test` extra is optional, and can be used to install test-related dependencies useful during the development. +The `test` extra is optional, and can be used to install test-related dependencies, useful during development.
+ +### Testing DeepRank2 installation -## Test installation +You can check that all components were installed correctly, using pytest. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). -If you have installed the package from a cloned repository (second option above), you can check that all components were installed correctly, using pytest. The quick test should be sufficient to ensure that the software works, while the full test (a few minutes) will cover a much broader range of settings to ensure everything is correct. -Run `pytest tests/test_integration.py` for the quick test or just `pytest` for the full test (expect a few minutes to run). +First run `pip install pytest`, if you did not install it above. Then run `pytest tests/test_integration.py` for the quick test or just `pytest` for the full test (expect a few minutes to run). -## Contributing +# Contributing If you would like to contribute to the package in any way, please see [our guidelines](CONTRIBUTING.rst). 
diff --git a/docs/reference/deeprankcore.rst b/docs/reference/deeprank2.rst similarity index 100% rename from docs/reference/deeprankcore.rst rename to docs/reference/deeprank2.rst diff --git a/env/environment.yml b/env/environment.yml new file mode 100644 index 000000000..037f735f9 --- /dev/null +++ b/env/environment.yml @@ -0,0 +1,22 @@ +name: deeprank2 +channels: + - pytorch + - pyg + - bioconda + - defaults + - conda-forge + - sbl +dependencies: + - pip==23.3.* + - python==3.10.* + - msms==2.6.1 + - dssp>=4.2.2.1 + - pytorch==2.1.1 + - pytorch-mutex==1.0.* + - torchvision==0.16.1 + - torchaudio==2.1.1 + - cpuonly==2.0.* + - pyg==2.4.0 + - notebook==7.0.6 + - pip: + - --requirement requirements.txt diff --git a/env/requirements.txt b/env/requirements.txt new file mode 100644 index 000000000..23b468d33 --- /dev/null +++ b/env/requirements.txt @@ -0,0 +1,6 @@ +--find-links https://data.pyg.org/whl/torch-2.1.0+cpu.html +torch_scatter==2.1.2 +torch_sparse==0.6.18 +torch_cluster==1.6.3 +torch_spline_conv==1.2.2 +deeprank2==2.1.2 diff --git a/paper/paper.md b/paper/paper.md index 13a10b3ae..e4537c680 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -58,7 +58,7 @@ bibliography: paper.bib [comment]: <> (CHECK FOR AUTHORS: Do the summary describe the high-level functionality and purpose of the software for a diverse, non-specialist audience?) -We present DeepRank2, a deep learning (DL) framework geared towards making predictions on 3D protein structures for variety of biologically relevant applications. Our software can be used for predicting structural properties in drug design, immunotherapy, or designing novel proteins, among other fields. DeepRank2 allows for transformation and storage of 3D representations of both protein-protein interfaces (PPIs) and protein single-residue variants (SRVs) into either graphs or volumetric grids containing structural and physico-chemical information. 
These can be used for training neural networks for a variety of patterns of interest, using either our pre-implemented training pipeline for graph neural networks (GNNs) or convolutional neural networks (CNNs) or external pipelines. The entire framework flowchart is visualized in \autoref{fig:flowchart}. The package is fully open-source, follows the community-endorsed FAIR principles for research software, provides user-friendly APIs, publicily available [documentation](https://deeprank2.readthedocs.io/en/latest/), and in-depth [tutorials](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). +We present DeepRank2, a deep learning (DL) framework geared towards making predictions on 3D protein structures for variety of biologically relevant applications. Our software can be used for predicting structural properties in drug design, immunotherapy, or designing novel proteins, among other fields. DeepRank2 allows for transformation and storage of 3D representations of both protein-protein interfaces (PPIs) and protein single-residue variants (SRVs) into either graphs or volumetric grids containing structural and physico-chemical information. These can be used for training neural networks for a variety of patterns of interest, using either our pre-implemented training pipeline for graph neural networks (GNNs) or convolutional neural networks (CNNs) or external pipelines. The entire framework flowchart is visualized in \autoref{fig:flowchart}. The package is fully open-source, follows the community-endorsed FAIR principles for research software, provides user-friendly APIs, publicly available [documentation](https://deeprank2.readthedocs.io/en/latest/), and in-depth [tutorials](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). [comment]: <> (CHECK FOR AUTHORS: Do the authors clearly state what problems the software is designed to solve and who the target audience is?) 
[comment]: <> (CHECK FOR AUTHORS: Do the authors describe how this software compares to other commonly-used packages?) @@ -74,12 +74,12 @@ The 3D structure of proteins and protein complexes provides fundamental informat In the past decades, a variety of experimental methods (e.g., X-ray crystallography, nuclear magnetic resonance, cryogenic electron microscopy) have determined and accumulated a large number of atomic-resolution 3D structures of proteins and protein-protein complexes [@schwede_protein_2013]. Since experimental determination of structures is a tedious and expensive process, several computational prediction methods have been developed over the past decades, exploiting classical molecular modelling [@rosetta; @modeller; @haddock], and, more recently, DL [@alphafold_2021; @alphafold_multi]. The large amount of data available makes it possible to use DL to leverage 3D structures and learn their complex patterns. Unlike other machine learning (ML) techniques, deep neural networks hold the promise of learning from millions of data points without reaching a performance plateau quickly, which is made computationally feasible by hardware accelerators (i.e., GPUs, TPUs) and parallel file system technologies. [comment]: <> (Examples of DL with PPIs and SRVs) -The main types of data structures in vogue for representing 3D structures are 3D grids, graphs and surfaces. 3D CNNs have been trained on 3D grids for the classification of biological vs. crystallographic PPIs [@renaud_deeprank_2021], and for the scoring of models of protein-protein complexes generated by computational docking [@renaud_deeprank_2021; @dove]. Gaiza et al. have applied geodesic CNNs to extract protein interaction fingerprints by applying 2D CNNs on spread-out protein surface patches [@gainza2023novo]. 
3D CNNs have been used for exploiting protein structure data for predicting mutation-induced changes in protein stability [@mut_cnn; @ramakrishnan2023] and identifying novel gain-of-function mutations [@shroff]. Contrary to CNNs, in GNNs the convolution operations on graphs can rely on the relative local connectivity between nodes and not on the data orientation, making graphs rotationally invariant. Additionally, GNNs can accept any size of graph, while in a CNN the size of the 3D grid for all input data needs to be the same, which may be problematic for datasets containing highly variable in size structures. Based on these arguments, different GNN-based tools have been designed to predict patterns from PPIs [@dove_gnn; @fout_protein_nodate; @reau_deeprank-gnn_2022]. Eisman et al. developed a rotation-equivariant neural network trained on point-based representation of the protein atomic structure to classify PPIs [@rot_eq_gnn]. +The main types of data structures in vogue for representing 3D structures are 3D grids, graphs, and surfaces. 3D CNNs have been trained on 3D grids for the classification of biological vs. crystallographic PPIs [@renaud_deeprank_2021], and for the scoring of models of protein-protein complexes generated by computational docking [@renaud_deeprank_2021; @dove]. Gaiza et al. have applied geodesic CNNs to extract protein interaction fingerprints by applying 2D CNNs on spread-out protein surface patches [@gainza2023novo]. 3D CNNs have been used for exploiting protein structure data for predicting mutation-induced changes in protein stability [@mut_cnn; @ramakrishnan2023] and identifying novel gain-of-function mutations [@shroff]. Contrary to CNNs, in GNNs the convolution operations on graphs can rely on the relative local connectivity between nodes and not on the data orientation, making graphs rotationally invariant. 
Additionally, GNNs can accept any size of graph, while in a CNN the size of the 3D grid for all input data needs to be the same, which may be problematic for datasets containing highly variable in size structures. Based on these arguments, different GNN-based tools have been designed to predict patterns from PPIs [@dove_gnn; @fout_protein_nodate; @reau_deeprank-gnn_2022]. Eisman et al. developed a rotation-equivariant neural network trained on point-based representation of the protein atomic structure to classify PPIs [@rot_eq_gnn]. # Statement of need [comment]: <> (Motivation for a flexible framework) -Data mining 3D structures of proteins presents several challenges. These include complex physico-chemical rules governing structural features, the possibility of characterizartion at different scales (e.g., atom-level, residue level, and secondary structure level), and the large diversity in shape and size. Furthermore, because a structure can easily comprise of hundreds to thousands of residues (and ~15 times as many atoms), efficient processing and featurization of many structures is critical to handle the computational cost and file storage requirements. Existing software solutions are often highly specialized and not developed as reusable and flexible frameworks, and cannot be easily adapted to diverse applications and predictive tasks. Examples include DeepAtom [@deepatom] for protein-ligand binding affinity prediction only, and MaSIF [@gainza2023novo] for deciphering patterns in protein surfaces. While some frameworks, such as TorchProtein and TorchDrug [@torchdrug], configure themselves as general-purpose ML libraries for both molecular sequences and 3D structures, they only implement geometric-related features and do not incorporate fundamental physico-chemical information in the 3D representation of molecules. +Data mining 3D structures of proteins presents several challenges. 
These include complex physico-chemical rules governing structural features, the possibility of characterization at different scales (e.g., atom-level, residue level, and secondary structure level), and the large diversity in shape and size. Furthermore, because a structure can easily comprise of hundreds to thousands of residues (and ~15 times as many atoms), efficient processing and featurization of many structures is critical to handle the computational cost and file storage requirements. Existing software solutions are often highly specialized and not developed as reusable and flexible frameworks, and cannot be easily adapted to diverse applications and predictive tasks. Examples include DeepAtom [@deepatom] for protein-ligand binding affinity prediction only, and MaSIF [@gainza2023novo] for deciphering patterns in protein surfaces. While some frameworks, such as TorchProtein and TorchDrug [@torchdrug], configure themselves as general-purpose ML libraries for both molecular sequences and 3D structures, they only implement geometric-related features and do not incorporate fundamental physico-chemical information in the 3D representation of molecules. These limitations create a growing demand for a generic and flexible DL framework that researchers can readily utilize for their specific research questions while cutting down the tedious data preprocessing stages. Generic DL frameworks have already emerged in diverse scientific fields, such as computational chemistry (e.g., DeepChem [@deepchem]) and condensed matter physics (e.g., NetKet [@netket]), which have promoted collaborative efforts, facilitated novel insights, and benefited from continuous improvements and maintenance by engaged user communities. @@ -94,7 +94,7 @@ As input, DeepRank2 takes [PDB-formatted](https://www.cgl.ucsf.edu/chimera/docs/ The physico-chemical and geometrical features are then computed and assigned to each node and edge. 
The user can choose which features to generate from several pre-existing options defined in the package, or define custom features modules, as explained in the documentation. Examples of pre-defined node features are the type of the amino acid, its size and polarity, as well as more complex features such as its buried surface area and secondary structure features. Examples of pre-defined edge features are distance, covalency, and potential energy. A detailed list of predefined features can be found in the [documentation's features page](https://deeprank2.readthedocs.io/en/latest/features.html). Graphs can either be used directly or mapped to volumetric grids (i.e., 3D image-like representations), together with their features. Multiple CPUs can be used to parallelize and speed up the featurization process. The processed data are saved into HDF5 files, designed to efficiently store and organize big data. Users can then use the data for any ML or DL framework suited for the application. Specifically, graphs can be used for the training of GNNs, and 3D grids can be used for the training of CNNs. -DeepRank2 also provides convenient pre-implemented modules for training simple [PyTorch](https://pytorch.org/)-based GNNs and CNNs using the data generated in the previous step. Alternatively, users can implement custom PyTorch networks in the DeepRank package (or export the data to external software). Data can be loaded across multiple CPUs, and the training can be run on GPUs. The data stored within the HDF5 files are read into customized datasets, and the user-friendly API allows for selection of individual features (from those generated above), definition of the targets, and the predictive task (classfication or regression), among other settings. Then the datasets can be used for training, validating, and testing the chosen neural network. The final model and results can be saved using built-in data exporter modules. 
+DeepRank2 also provides convenient pre-implemented modules for training simple [PyTorch](https://pytorch.org/)-based GNNs and CNNs using the data generated in the previous step. Alternatively, users can implement custom PyTorch networks in the DeepRank package (or export the data to external software). Data can be loaded across multiple CPUs, and the training can be run on GPUs. The data stored within the HDF5 files are read into customized datasets, and the user-friendly API allows for selection of individual features (from those generated above), definition of the targets, and the predictive task (classification or regression), among other settings. Then the datasets can be used for training, validating, and testing the chosen neural network. The final model and results can be saved using built-in data exporter modules. DeepRank2 embraces the best practices of open-source development by utilizing platforms like GitHub and Git, unit testing (as of August 2023 coverage is 83%), continuous integration, automatic documentation, and Findable, Accessible, Interoperable, and Reusable (FAIR) principles. Detailed [documentation](https://deeprank2.readthedocs.io/en/latest/?badge=latest) and [tutorials](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md) for getting started with the package are publicly available. The project aims to create high-quality software that can be easily accessed, used, and contributed to by a wide range of researchers. @@ -102,6 +102,6 @@ We believe this project will have a positive impact across the all of structural # Acknowledgements -This work was supported by the [Netherlands eScience Center](https://www.esciencecenter.nl/) under grant number NLESC.OEC.2021.008, and [SURF](https://www.surf.nl/en) infrastructure, and was developed in collaboration with the [Department of Medical BioSciences](https://www.radboudumc.nl/en/research/departments/medical-biosciences) at RadboudUMC (Hypatia Fellowship, Rv819.52706). 
This work was also supported from NVIDIA Acamedic Award. +This work was supported by the [Netherlands eScience Center](https://www.esciencecenter.nl/) under grant number NLESC.OEC.2021.008, and [SURF](https://www.surf.nl/en) infrastructure, and was developed in collaboration with the [Department of Medical BioSciences](https://www.radboudumc.nl/en/research/departments/medical-biosciences) at RadboudUMC (Hypatia Fellowship, Rv819.52706). This work was also supported by the NVIDIA Academic Award. # References diff --git a/tutorials/TUTORIAL.md b/tutorials/TUTORIAL.md index d41135c2e..c6622a44a 100644 --- a/tutorials/TUTORIAL.md +++ b/tutorials/TUTORIAL.md @@ -33,7 +33,7 @@ PDB models and target data used in this tutorial have been retrieved from [Ramak - Clone the [deeprank2 repository](https://github.com/DeepRank/deeprank2). - From within the repository, run `pip install ".[tutorials]"` 3. To test whether the package has been succesfully installed: - - Navigate to your deeprank2 folder. + - Navigate to your `deeprank2` folder. - Run `pytest tests`. All tests should pass at this point.
## Running the notebooks diff --git a/tutorials/data_generation_ppi.ipynb b/tutorials/data_generation_ppi.ipynb index 2bcc213d1..8330e9a8b 100644 --- a/tutorials/data_generation_ppi.ipynb +++ b/tutorials/data_generation_ppi.ipynb @@ -102,7 +102,9 @@ "data_path = os.path.join(\"data_raw\", \"ppi\")\n", "processed_data_path = os.path.join(\"data_processed\", \"ppi\")\n", "os.makedirs(os.path.join(processed_data_path, \"residue\"))\n", - "os.makedirs(os.path.join(processed_data_path, \"atomic\"))" + "os.makedirs(os.path.join(processed_data_path, \"atomic\"))\n", + "# Flag limit_data as True if you are running on a machine with limited memory (e.g., Docker container)\n", + "limit_data = True" ] }, { @@ -140,7 +142,10 @@ " return pdb_files, bas\n", "\n", "\n", - "pdb_files, bas = get_pdb_files_and_target_data(data_path)" + "pdb_files, bas = get_pdb_files_and_target_data(data_path)\n", + "\n", + "if limit_data:\n", + " pdb_files = pdb_files[:15]" ] }, { @@ -208,7 +213,7 @@ " if count % 20 == 0:\n", " print(f\"{count} queries added to the collection.\")\n", "\n", - "print(f\"Queries ready to be processed.\\n\")" + "print(\"Queries ready to be processed.\\n\")" ] }, { @@ -250,7 +255,9 @@ " grid_map_method=grid_map_method,\n", ")\n", "\n", - "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.')" + "print(\n", + " f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.'\n", + ")" ] }, { @@ -353,7 +360,9 @@ "metadata": {}, "outputs": [], "source": [ - "fname = os.path.join(processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"]))\n", + "fname = os.path.join(\n", + " processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"])\n", + ")\n", "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", "\n", "im = img.imread(fname + \".png\")\n", @@ -445,7 +454,7 @@ " if count % 20 == 0:\n", " print(f\"{count} queries added to the collection.\")\n", "\n", - "print(f\"Queries ready to be processed.\\n\")" + "print(\"Queries ready to be processed.\\n\")" ] }, { @@ -471,7 +480,9 @@ " grid_map_method=grid_map_method,\n", ")\n", "\n", - "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.')" + "print(\n", + " f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.'\n", + ")" ] }, { diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index d4835ff4f..11a776051 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -103,7 +103,9 @@ "data_path = os.path.join(\"data_raw\", \"srv\")\n", "processed_data_path = os.path.join(\"data_processed\", \"srv\")\n", "os.makedirs(os.path.join(processed_data_path, \"residue\"))\n", - "os.makedirs(os.path.join(processed_data_path, \"atomic\"))" + "os.makedirs(os.path.join(processed_data_path, \"atomic\"))\n", + "# Flag limit_data as True if you are running on a machine with limited memory (e.g., Docker container)\n", + "limit_data = True" ] }, { @@ -146,7 +148,10 @@ " return pdb_files, res_numbers, res_wildtypes, res_variants, targets\n", "\n", "\n", - "pdb_files, res_numbers, res_wildtypes, res_variants, targets = get_pdb_files_and_target_data(data_path)" + "pdb_files, res_numbers, res_wildtypes, res_variants, targets = get_pdb_files_and_target_data(data_path)\n", + "\n", + "if limit_data:\n", + " pdb_files = pdb_files[:15]" ] }, { @@ -261,7 +266,9 @@ " grid_map_method=grid_map_method,\n", ")\n", "\n", - "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.')" + "print(\n", + " f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.'\n", + ")" ] }, { @@ -371,7 +378,9 @@ "metadata": {}, "outputs": [], "source": [ - "fname = os.path.join(processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"]))\n", + "fname = os.path.join(\n", + " processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"])\n", + ")\n", "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", "\n", "im = img.imread(fname + \".png\")\n", @@ -465,7 +474,7 @@ " if count % 20 == 0:\n", " print(f\"{count} queries added to the collection.\")\n", "\n", - "print(f\"Queries ready to be processed.\\n\")" + "print(\"Queries ready to be processed.\\n\")" ] }, { @@ -491,7 +500,9 @@ " grid_map_method=grid_map_method,\n", ")\n", "\n", - "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.')" + "print(\n", + " f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.'\n", + ")" ] }, {