diff --git a/.github/workflows/_ghcr.yml b/.github/workflows/_ghcr.yml new file mode 100644 index 00000000..abdc910f --- /dev/null +++ b/.github/workflows/_ghcr.yml @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: 2022 Dusan Mijatovic (dv4all) +# SPDX-FileCopyrightText: 2022 dv4all +# SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) +# SPDX-FileCopyrightText: 2024 Netherlands eScience Center +# +# SPDX-License-Identifier: Apache-2.0 + +name: reusable ghcr.io module + +on: + workflow_call: + inputs: + ghcr_user: + required: true + description: User for logging to ghcr.io (use github.actor) + type: string + base_image_name: + required: true + description: Base image name incl. ghcr.io + type: string + image_tag: + required: true + description: Image tag (version) + type: string + dockerfile: + required: true + description: Location and name of docker file + type: string + docker_context: + required: true + description: Docker context for the build command + type: string + secrets: + token: + required: true + outputs: + image_created: + description: Full image name after upload to ghcr.io + value: ${{jobs.build_and_push.outputs.image_build}} + image_uploaded: + description: Confirmation that image is uploaded to ghcr.io + value: ${{jobs.build_and_push.outputs.image_pushed}} + +jobs: + build_and_push: + name: build and push image + runs-on: ubuntu-22.04 + outputs: + image_build: ${{steps.build_image.outputs.image_build}} + image_pushed: ${{steps.push_image.outputs.image_pushed}} + steps: + - name: checkout + # https://github.com/actions/checkout + uses: actions/checkout@v4 + - name: build + id: build_image + run: | + IMAGE_TAG_VERSION=${{inputs.base_image_name}}:${{inputs.image_tag}} + IMAGE_TAG_LATEST=${{inputs.base_image_name}}:latest + echo image_tag_version $IMAGE_TAG_VERSION + docker build -t $IMAGE_TAG_VERSION -t $IMAGE_TAG_LATEST -f ${{inputs.dockerfile}} ${{inputs.docker_context}} + echo "image_build=$IMAGE_TAG_VERSION" >> $GITHUB_OUTPUT
+ - name: push to ghcr.io + id: push_image + run: | + echo login + echo "${{secrets.token}}" | docker login https://ghcr.io -u ${{inputs.ghcr_user}} --password-stdin + echo push auth image with all tags + docker push ${{inputs.base_image_name}} --all-tags + echo "image_pushed=true" >> $GITHUB_OUTPUT diff --git a/.github/workflows/build-latest-release.yml b/.github/workflows/build-latest-release.yml index 9daa9936..9689e5cd 100644 --- a/.github/workflows/build-latest-release.yml +++ b/.github/workflows/build-latest-release.yml @@ -3,7 +3,7 @@ name: build (latest release) # Only trigger, when the release workflow succeeded on: workflow_run: - workflows: ["Build and upload to PyPI"] + workflows: ["Build and upload to PyPI and ghcr.io"] types: - completed @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # ["3.10", "3.11"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell diff --git a/.github/workflows/build-repo-frozen-env.yml b/.github/workflows/build-repo-frozen-env.yml index 47f29dce..47ab259e 100644 --- a/.github/workflows/build-repo-frozen-env.yml +++ b/.github/workflows/build-repo-frozen-env.yml @@ -36,7 +36,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # ["3.10", "3.11"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell diff --git a/.github/workflows/build-repo.yml b/.github/workflows/build-repo.yml index 7af4cc7a..e777c5d9 100644 --- a/.github/workflows/build-repo.yml +++ b/.github/workflows/build-repo.yml @@ -36,7 +36,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # ["3.10", "3.11"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml index 
6851c52d..e63baf7f 100644 --- a/.github/workflows/cffconvert.yml +++ b/.github/workflows/cffconvert.yml @@ -8,7 +8,7 @@ on: jobs: validate: name: "validate" - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Check out a copy of the repository uses: actions/checkout@v3 diff --git a/.github/workflows/coveralls.yml b/.github/workflows/coveralls.yml index eb4feff2..0193c441 100644 --- a/.github/workflows/coveralls.yml +++ b/.github/workflows/coveralls.yml @@ -36,7 +36,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml index 8b5159f7..d044846b 100644 --- a/.github/workflows/draft-pdf.yml +++ b/.github/workflows/draft-pdf.yml @@ -5,7 +5,7 @@ on: jobs: paper: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 name: Paper Draft steps: - name: Checkout diff --git a/.github/workflows/fair-software.yml b/.github/workflows/fair-software.yml index f20d3c84..f336c6ac 100644 --- a/.github/workflows/fair-software.yml +++ b/.github/workflows/fair-software.yml @@ -11,7 +11,7 @@ jobs: verify: if: github.event.pull_request.draft == false name: "fair-software" - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: fair-software/howfairis-github-action@0.2.1 name: Measure compliance with fair-software.eu recommendations diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index b870e6e4..a9369bfd 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -36,7 +36,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell diff --git a/.github/workflows/markdown-link-check.yml b/.github/workflows/markdown-link-check.yml index 016e3812..a988faf8 100644 --- 
a/.github/workflows/markdown-link-check.yml +++ b/.github/workflows/markdown-link-check.yml @@ -23,7 +23,7 @@ jobs: markdown-link-check: if: github.event.pull_request.draft == false name: Check markdown links - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 - uses: gaurav-nelson/github-action-markdown-link-check@v1 diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index f4e9d1f1..758b3030 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -34,7 +34,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # ["3.10", "3.11"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell diff --git a/.github/workflows/release_pypi.yml b/.github/workflows/release_pypi.yml index 0f85061b..7639baa2 100644 --- a/.github/workflows/release_pypi.yml +++ b/.github/workflows/release_pypi.yml @@ -1,4 +1,4 @@ -name: Build and upload to PyPI +name: Build and upload to PyPI and ghcr.io on: workflow_dispatch: @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell @@ -36,8 +36,8 @@ jobs: path: dist/* upload_test_pypi: - needs: [build] - runs-on: ubuntu-latest + needs: build + runs-on: ubuntu-22.04 if: github.event_name == 'workflow_dispatch' steps: - uses: actions/download-artifact@v3 @@ -51,8 +51,8 @@ jobs: repository_url: https://test.pypi.org/legacy/ upload_pypi: - needs: [build] - runs-on: ubuntu-latest + needs: build + runs-on: ubuntu-22.04 if: github.event_name == 'release' && github.event.action == 'published' steps: - uses: actions/download-artifact@v3 @@ -63,3 +63,39 @@ jobs: with: user: __token__ password: ${{ secrets.PYPI_TOKEN_DEEPRANK2 }} + + read_only_version: + needs: upload_pypi + name: Read version from TOML + runs-on: ubuntu-22.04 + outputs: + 
version: ${{ steps.get_version.outputs.VERSION }} + repo_lowercase: ${{ steps.repo_lowercase.outputs.REPO_LOWERCASE }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Read version from TOML + id: get_version + run: | + VERSION=$(grep '^version =' pyproject.toml | awk -F '"' '{print $2}') + echo "VERSION=$VERSION" >> $GITHUB_OUTPUT + + - name: Convert repository name to lowercase + id: repo_lowercase + run: | + REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + echo "REPO_LOWERCASE=$REPO_LOWERCASE" >> $GITHUB_OUTPUT + + upload_docker_image: + needs: read_only_version + name: Upload Docker image to ghcr.io + uses: ./.github/workflows/_ghcr.yml + with: + ghcr_user: ${{github.actor}} + base_image_name: ghcr.io/${{ needs.read_only_version.outputs.repo_lowercase }} + image_tag: ${{ needs.read_only_version.outputs.version }} + dockerfile: ./Dockerfile + docker_context: . + secrets: + token: ${{secrets.GITHUB_TOKEN}} diff --git a/.github/workflows/stale_issue_pr.yml b/.github/workflows/stale_issue_pr.yml index d184c9c9..74ef3f5a 100644 --- a/.github/workflows/stale_issue_pr.yml +++ b/.github/workflows/stale_issue_pr.yml @@ -5,7 +5,7 @@ on: jobs: close-issues: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: issues: write pull-requests: write diff --git a/.github/workflows/test-docker-image.yml b/.github/workflows/test-docker-image.yml new file mode 100644 index 00000000..35eb6bbe --- /dev/null +++ b/.github/workflows/test-docker-image.yml @@ -0,0 +1,64 @@ +name: test latest docker image + +# Only trigger, when the release workflow succeeded +on: + workflow_run: + workflows: ["Build and upload to PyPI and ghcr.io"] + types: + - completed + +jobs: + test_latest_docker_image: + runs-on: ubuntu-22.04 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: 
${{ secrets.GITHUB_TOKEN }} + + - name: Pull latest Docker image + run: | + REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + docker pull ghcr.io/$REPO_LOWERCASE:latest + + - name: Run tests in Docker container + run: | + + REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + docker run --name test_container -d ghcr.io/$REPO_LOWERCASE:latest tail -f /dev/null + + PROJECT_DIR="/app" + TEST_DIR="$PROJECT_DIR/tests" + echo "Project directory: $PROJECT_DIR" + echo "Tests directory: $TEST_DIR" + + # Create project directory and copy tests folder + docker exec test_container mkdir -p $PROJECT_DIR + docker cp tests test_container:$TEST_DIR + + # Verify the directory structure + echo "Contents of project directory:" + docker exec test_container ls -la $PROJECT_DIR + echo "Contents of tests directory:" + docker exec test_container ls -la $TEST_DIR + + # Install pytest + docker exec test_container pip install pytest + + # Run pytest from the project directory + echo "Running pytest from the project directory:" + docker exec -w $PROJECT_DIR test_container python -m pytest tests -v + + # Clean up + docker stop test_container + docker rm test_container + + - name: Output test results + if: failure() + run: | + echo "Tests failed. Please check the test output above for more details." diff --git a/Dockerfile b/Dockerfile index 98d7eb80..9cbe9900 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,8 @@ RUN \ echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> /etc/skel/.bashrc && \ echo ". 
${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> ~/.bashrc -ADD ./env/deeprank2.yml /home/deeprank2 +ADD ./env/deeprank2.yml /home/deeprank2/ +ADD ./tutorials /home/deeprank2/tutorials RUN \ ## Create the environment and install the dependencies @@ -35,7 +36,7 @@ RUN \ # Get the data for running the tutorials if [ -d "/home/deeprank2/tutorials/data_raw" ]; then rm -Rf /home/deeprank2/tutorials/data_raw; fi && \ if [ -d "/home/deeprank2/tutorials/data_processed" ]; then rm -Rf /home/deeprank2/tutorials/data_processed; fi && \ - wget https://zenodo.org/records/8349335/files/data_raw.zip && \ + wget https://zenodo.org/records/13709906/files/data_raw.zip && \ unzip data_raw.zip -d data_raw && \ mv data_raw /home/deeprank2/tutorials && \ apt-get clean && \ @@ -45,8 +46,6 @@ RUN \ find ${CONDA_DIR} -follow -type f -name '*.pyc' -delete && \ conda clean --force-pkgs-dirs --all --yes -ADD ./tutorials /home/deeprank2/tutorials - ENV PATH /opt/conda/envs/deeprank2/bin:$PATH # Define working directory diff --git a/README.md b/README.md index c9cd5187..49e028af 100644 --- a/README.md +++ b/README.md @@ -41,27 +41,30 @@ Main features: πŸ“£ [Discussions](https://github.com/DeepRank/deeprank2/discussions) -## Table of contents +## Table of Contents - [DeepRank2](#deeprank2) - [Overview](#overview) - - [Table of contents](#table-of-contents) + - [Table of Contents](#table-of-contents) - [Installation](#installation) - [Containerized Installation](#containerized-installation) - - [Local/remote installation](#localremote-installation) - - [YML file installation (recommended)](#yml-file-installation-recommended) - - [Manual installation (customizable)](#manual-installation-customizable) - - [Testing DeepRank2 installation](#testing-deeprank2-installation) + - [Pull and Run the Pre-build Docker Image (Recommended)](#pull-and-run-the-pre-build-docker-image-recommended) + - [Build the Docker Image Manually](#build-the-docker-image-manually) + - [Removing the Docker 
Image](#removing-the-docker-image) + - [Local/remote Installation](#localremote-installation) + - [YML File Installation (Recommended)](#yml-file-installation-recommended) + - [Manual Installation (Customizable)](#manual-installation-customizable) + - [Testing DeepRank2 Installation](#testing-deeprank2-installation) - [Contributing](#contributing) - [Using DeepRank2](#using-deeprank2) - - [Data generation](#data-generation) + - [Data Generation](#data-generation) - [Datasets](#datasets) - [GraphDataset](#graphdataset) - [GridDataset](#griddataset) - [Training](#training) - - [Run a pre-trained model on new data](#run-a-pre-trained-model-on-new-data) - - [Computational performances](#computational-performances) - - [Package development](#package-development) + - [Run a Pre-trained Model on New Data](#run-a-pre-trained-model-on-new-data) + - [Computational Performances](#computational-performances) + - [Package Development](#package-development) ## Installation @@ -74,33 +77,59 @@ There are two ways to install DeepRank2: ### Containerized Installation -In order to try out the package without worrying about your OS and without the need of installing all the required dependencies, we created a `Dockerfile` that can be used for taking care of everything in a suitable container. +We provide a pre-built Docker image hosted on GitHub Packages, allowing you to use DeepRank2 without worrying about installing dependencies or configuring your system. This is the recommended method for trying out the package quickly. -For this, you first need to install [Docker](https://docs.docker.com/engine/install/) on your system. Then run the following commands. You may need to have sudo permission for some steps, in which case the commands below can be preceded by `sudo`: +#### Pull and Run the Pre-build Docker Image (Recommended) + +- Install [Docker](https://docs.docker.com/engine/install/) on your system, if not already installed. 
+- Pull the latest Docker image from GitHub Packages by running the following command: + +```bash +docker pull ghcr.io/deeprank/deeprank2:latest +``` + +- Run the container from the pulled image: + +```bash +docker run -p 8888:8888 ghcr.io/deeprank/deeprank2:latest +``` + +- Once the container is running, open your browser and navigate to `http://localhost:8888` to access the DeepRank2 application. + +From here, you can use DeepRank2, including running the tutorial notebooks. More details about the tutorials can be found [here](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). Note that the Docker container downloads only the raw PDB files required for the tutorials. To generate processed HDF5 files, you will need to run the `data_generation_xxx.ipynb` notebooks. Since Docker containers may have limited memory resources, we reduce the number of data points processed in the tutorials. To fully utilize the package, consider [installing it locally](#localremote-installation). + +#### Build the Docker Image Manually + +If you prefer to build the Docker image yourself or run into issues with the pre-built image, you can manually build and run the container as follows: + +- Install [Docker](https://docs.docker.com/engine/install/) on your system, if not already installed. +- Clone the DeepRank2 repository and navigate to its root directory: ```bash -# Clone the DeepRank2 repository and enter its root directory git clone https://github.com/DeepRank/deeprank2 cd deeprank2 +``` -# Build and run the Docker image +- Build and run the Docker image: + +```bash docker build -t deeprank2 . docker run -p 8888:8888 deeprank2 ``` -Next, open a browser and go to `http://localhost:8888` to access the application running inside the Docker container. From there you can use DeepRank2, e.g. to run the tutorial notebooks. +- Once the container is running, open your browser and navigate to `http://localhost:8888` to access the DeepRank2 application. 
-More details about the tutorials' contents can be found [here](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). Note that in the docker container only the raw PDB files are downloaded, which needed as a starting point for the tutorials. You can obtain the processed HDF5 files by running the `data_generation_xxx.ipynb` notebooks. Because Docker containers are limited in memory resources, we limit the number of data points processed in the tutorials. Please [install the package locally](#localremote-installation) to fully leverage its capabilities. +#### Removing the Docker Image -If after running the tutorials you want to remove the (quite large) Docker image from your machine, you must first [stop the container](https://docs.docker.com/engine/reference/commandline/stop/) and can then [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/). More general information about Docker can be found on the [official website docs](https://docs.docker.com/get-started/). +If you no longer need the Docker image (which can be quite large), you can remove it after stopping the container. Follow the [container stop](https://docs.docker.com/engine/reference/commandline/stop/) and [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/) instructions. For more general information on Docker, refer to the [Docker documentation](https://docs.docker.com/get-started/) directly. -### Local/remote installation +### Local/remote Installation Local installation is formally only supported on the latest stable release of ubuntu, for which widespread automated testing through continuous integration workflows has been set up. However, it is likely that the package runs smoothly on other operating systems as well. Before installing DeepRank2 please ensure you have [GCC](https://gcc.gnu.org/install/) installed: if running `gcc --version` gives an error, run `sudo apt-get install gcc`. 
-#### YML file installation (recommended) +#### YML File Installation (Recommended) You can use the provided YML file for creating a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) via [mamba](https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html), containing the latest stable release of DeepRank2 and all its dependencies. This will install the CPU-only version of DeepRank2 on Python 3.10. @@ -117,7 +146,7 @@ pip install deeprank2 We also provide a frozen environment YML file located at `env/deeprank2_frozen.yml` with all dependencies set to fixed versions. The `env/deeprank2_frozen.yml` file provides a frozen environment with all dependencies set to fixed versions. This ensures reproducibility of experiments and results by preventing changes in package versions that could occur due to updates or modifications in the default `env/deeprank2.yml`. Use this frozen environment file for a stable and consistent setup, particularly if you encounter issues with the default environment file. -#### Manual installation (customizable) +#### Manual Installation (Customizable) If you want to use the GPUs, choose a specific python version (note that at the moment we support python 3.10 only), are a MacOS user, or if the YML installation was not successful, you can install the package manually. We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). @@ -143,7 +172,7 @@ pip install -e .'[test]' The `test` extra is optional, and can be used to install test-related dependencies, useful during development. -#### Testing DeepRank2 installation +#### Testing DeepRank2 Installation If you have cloned the repository, you can check that all components were installed correctly using `pytest`. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). 
@@ -160,7 +189,7 @@ If you would like to contribute to the package in any way, please see [our guide The following section serves as a first guide to start using the package, using protein-protein Interface (PPI) queries as example. For an enhanced learning experience, we provide in-depth [tutorial notebooks](https://github.com/DeepRank/deeprank2/tree/main/tutorials) for generating PPI data, generating SVR data, and for the training pipeline. For more details, see the [extended documentation](https://deeprank2.rtfd.io/). -### Data generation +### Data Generation For each protein-protein complex (or protein structure containing a missense variant), a `Query` can be created and added to the `QueryCollection` object, to be processed later on. Two subtypes of `Query` exist: `ProteinProteinInterfaceQuery` and `SingleResidueVariantQuery`. @@ -370,7 +399,7 @@ trainer.test() ``` -#### Run a pre-trained model on new data +#### Run a Pre-trained Model on New Data If you want to analyze new PDB files using a pre-trained model, the first step is to process and save them into HDF5 files [as we have done above](#data-generation). @@ -404,7 +433,7 @@ trainer.test() For more details about how to run a pre-trained model on new data, see the [docs](https://deeprank2.readthedocs.io/en/latest/getstarted.html#run-a-pre-trained-model-on-new-data). -## Computational performances +## Computational Performances We measured the efficiency of data generation in DeepRank2 using the tutorials' [PDB files](https://zenodo.org/record/8187806) (~100 data points per data set), averaging the results run on Apple M1 Pro, using a single CPU. Parameter settings were: atomic resolution, `distance_cutoff` of 5.5 Γ…, radius (for SRV only) of 10 Γ…. 
The [features modules](https://deeprank2.readthedocs.io/en/latest/features.html) used were `components`, `contact`, `exposure`, `irc`, `secondary_structure`, `surfacearea`, for a total of 33 features for PPIs and 26 for SRVs (the latter do not use `irc` features). @@ -414,6 +443,6 @@ Parameter settings were: atomic resolution, `distance_cutoff` of 5.5 Γ…, radius | PPIs | graph only: **2.99** (std 0.23)
graph+grid: **11.35** (std 1.30) | graph only: **0.54** (std 0.07)
graph+grid: **16.09** (std 0.44) | | SRVs | graph only: **2.20** (std 0.08)
graph+grid: **2.85** (std 0.10) | graph only: **0.05** (std 0.01)
graph+grid: **17.52** (std 0.59) | -## Package development +## Package Development If you're looking for developer documentation, go [here](https://github.com/DeepRank/deeprank2/blob/dev/README.dev.md). diff --git a/docs/source/docking.md b/docs/source/docking.md index eb5f83ae..f7e05091 100644 --- a/docs/source/docking.md +++ b/docs/source/docking.md @@ -1,4 +1,4 @@ -# Docking scores +# Docking Scores The following scores have been developed for evaluating the quality of the protein-protein models produced by computational methods (docking models), and all of them compare the structural similarity between the decoys (computationally generated structures) and the experimentally solved native structures. To calculate these measures, the interface between the two interacting protein molecules is defined as any pair of heavy atoms from the two molecules within 5Γ… of each other. @@ -11,7 +11,7 @@ The following scores have been developed for evaluating the quality of the prote See https://onlinelibrary.wiley.com/doi/abs/10.1002/prot.10393 for more details about `capri_class`, `lrmsd`, `irmsd`, and `fnat`. See https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0161879 for more details about `dockq`. -## Compute and add docking scores +## Compute and Add Docking Scores The following code snippet shows an example of how to use deeprank2 to compute the docking scores for a given docking model, and how to add one of the scores (e.g., `dockq`) as a target to the already processed data. diff --git a/docs/source/features.md b/docs/source/features.md index 6fa59ea6..3271f6e9 100644 --- a/docs/source/features.md +++ b/docs/source/features.md @@ -2,7 +2,7 @@ Features implemented in the code-base are defined in `deeprank2.feature` subpackage. -## Custom features +## Custom Features Users can add custom features by cloning the repository, creating a new module and placing it in `deeprank2.feature` subpackage. 
The custom features can then be used by installing the package in editable mode (see [here](https://deeprank2.readthedocs.io/en/latest/installation.html#install-deeprank2) for more details). We strongly recommend submitting a pull request (PR) to merge the new feature into the official repository. @@ -77,15 +77,15 @@ dataset = GraphDataset( The following is a brief description of the features already implemented in the code-base, for each features' module. -## Default node features +## Default Node Features For atomic graphs, when features relate to residues then _all_ atoms of one residue receive the feature value for that residue. -### Core properties of atoms and residues: `deeprank2.features.components` +### Core Properties of Atoms and Residues: `deeprank2.features.components` These features relate to the chemical components (atoms and amino acid residues) of which the graph is composed. Detailed information and descrepancies between sources are described can be found in `deeprank2.domain.aminoacidlist.py`. -#### Atom properties: +#### Atom Properties: These features are only used in atomic graphs. @@ -93,7 +93,7 @@ These features are only used in atomic graphs. - `atom_charge`: Atomic charge in Coulomb (float). Taken from `deeprank2.domain.forcefield.patch.top`. - `pdb_occupancy`: Proportion of structures where the atom was detected at this position (float). In some cases a single atom was detected at different positions, in which case separate structures exist whose occupancies sum to 1. Only the highest occupancy atom is used by deeprank2. -#### Residue properties: +#### Residue Properties: - `res_type`: One-hot encoding of the amino acid residue (size 20). - `polarity`: One-hot encoding of the polarity of the amino acid (options: NONPOLAR, POLAR, NEGATIVE, POSITIVE). Note that sources vary on the polarity for few of the amino acids; see detailed information in `deeprank2.domain.aminoacidlist.py`. 
@@ -104,14 +104,14 @@ These features are only used in atomic graphs. - `hb_donors`, `hb_acceptors`: The number of hydrogen bond donor/acceptor atoms in the residue (int). Hydrogen bonds are noncovalent intermolecular interactions formed between an hydrogen atom (partially positively charged) bound to a small, highly electronegative atom (O, N, F) with an unshared electron pair. -#### Properties related to variant residues: +#### Properties Related to Variant Residues: These features are only used in SingleResidueVariant queries. - `variant_res`: One-hot encoding of variant amino acid (size 20). - `diff_charge`, `diff_polarity`, `diff_size`, `diff_mass`, `diff_pI`, `diff_hb_donors`, `diff_hb_acceptors`: Subtraction of the wildtype value of indicated feature from the variant value. For example, if the variant has 4 hb_donors and the wildtype has 5, then `diff_hb_donors == -1`. -### Conservation features: `deeprank2.features.conservation` +### Conservation Features: `deeprank2.features.conservation` These features relate to the conservation state of individual residues. @@ -120,36 +120,36 @@ These features relate to the conservation state of individual residues. - `conservation` (only used in SingleResidueVariant queries): Conservation of the wild type amino acid (float). _More details required._ - `diff_conservation` (only used in SingleResidueVariant queries): Subtraction of wildtype conservation from the variant conservation (float). -### Protein context features: +### Protein Context Features: -#### Surface exposure: `deeprank2.features.exposure` +#### Surface Exposure: `deeprank2.features.exposure` These features relate to the exposure of residues to the surface, and are computed using [biopython](https://biopython.org/docs/1.81/api/Bio.PDB.html). Note that these features can only be calculated per residue and not per atom. 
- `res_depth`: [Residue depth](https://en.wikipedia.org/wiki/Residue_depth) is the average distance (in Γ…) of the residue to the closest molecule of bulk water (float). See also [`Bio.PDB.ResidueDepth`](https://biopython.org/docs/1.75/api/Bio.PDB.ResidueDepth.html). - `hse`: [Half sphere exposure (HSE)](https://en.wikipedia.org/wiki/Half_sphere_exposure) is a protein solvent exposure measure indicating how buried an amino acid residue is in a protein (3 float values, see [Bio.PDB.HSExposure](https://biopython.org/docs/dev/api/Bio.PDB.HSExposure.html#module-Bio.PDB.HSExposure) for details). -#### Surface accessibility: `deeprank2.features.surfacearea` +#### Surface Accessibility: `deeprank2.features.surfacearea` These features relate to the surface area of the residue, and are computed using [freesasa](https://freesasa.github.io). Note that these features can only be calculated per residue and not per atom. - `sasa`: [Solvent-Accessible Surface Area](https://en.wikipedia.org/wiki/Accessible_surface_area) is the surface area (in Γ…^2) of a biomolecule that is accessible to the solvent (float). - `bsa`: Buried Surface Area is the surface area (in Γ…^2) that is buried away from the solvent when two or more proteins or subunits associate to form a complex, i.e. it measures the size of the complex interface (float). -#### Secondary structure: `deeprank2.features.secondary_structure` +#### Secondary Structure: `deeprank2.features.secondary_structure` - `sec_struct`: One-hot encoding of the [DSSP]() assigned secondary structure of the amino acid, using the three major classes (HELIX, STRAND, COIL). Calculated using [DSSP4](https://github.com/PDB-REDO/dssp). -#### Inter-residue contacts (IRCs): `deeprank2.features.irc` +#### Inter-residue Contacts (IRCs): `deeprank2.features.irc` These features are only calculated for ProteinProteinInterface queries. - `irc_total`: The number of residues on the other chain that are within a cutoff distance of 5.5 Γ… (int). 
- `irc_nonpolar_nonpolar`, `irc_nonpolar_polar`, `irc_nonpolar_negative`, `irc_nonpolar_positive`, `irc_polar_polar`, `irc_polar_negative`, `irc_polar_positive`, `irc_negative_negative`, `irc_positive_positive`, `irc_negative_positive`: As above, but for specific residue polarity pairings. -## Default edge features +## Default Edge Features -### Contact features: `deeprank2.features.contact` +### Contact Features: `deeprank2.features.contact` These features relate to relationships between individual nodes. For atomic graphs, when features relate to residues then _all_ atoms of one residue receive the feature value for that residue. @@ -166,7 +166,7 @@ These features relate to the structural relationship between nodes. - `same_res`: Boolean indicating whether atoms belong to the same residue (1) or separate residues (0). Only used in atomic graphs. - `covalent`: Boolean indicating whether nodes are covalently bound (1) or not (0). Note that covalency is not directly assessed, but any edge with a maximum distance of 2.1 Γ… is considered covalent. -#### Nonbond energies: +#### Nonbond Energies: These features measure nonbond energy potentials between nodes, and are calculated using [OPLS forcefield](https://en.wikipedia.org/wiki/OPLS). For residue graphs, the pairwise sum of potentials for all atoms from each residue is used. Note that no distance cutoff is used and the radius of influence is assumed to be infinite, although the potentials tends to 0 at large distance. Also edges are only assigned within a given cutoff radius when graphs are created. diff --git a/docs/source/getstarted.md b/docs/source/getstarted.md index cca87829..9c264df0 100644 --- a/docs/source/getstarted.md +++ b/docs/source/getstarted.md @@ -1,9 +1,9 @@ -# Get started +# Get Started The following section serves as a first guide to start using the package, using protein-protein interface (PPI) queries as example. 
For an enhanced learning experience, we provide in-depth [tutorial notebooks](https://github.com/DeepRank/deeprank2/tree/main/tutorials) for generating PPI data, generating SRVs data, and for the training pipeline. -## Data generation +## Data Generation For each protein-protein complex (or protein structure containing a missense variant), a `Query` can be created and added to the `QueryCollection` object, to be processed later on. Two subtypes of `Query` exist: `ProteinProteinInterfaceQuery` and `SingleResidueVariantQuery`. @@ -89,7 +89,7 @@ hdf5_paths = queries.process( grid_map_method = MapMethod.GAUSSIAN) ``` -## Data exploration +## Data Exploration As representative example, the following is the HDF5 structure generated by the previous phase for `1ATN_1w.pdb`, so for one single graph, for the graph + grid case: @@ -199,7 +199,7 @@ dataset_test = GraphDataset( ) ``` -#### Transforming features +#### Transforming Features For the `GraphDataset` class it is possible to define a dictionary to indicate which transformations to apply to the features, being the transformations lambda functions and/or standardization. If `True`, standardization is applied after transformation, if the latter is present. Example: @@ -360,7 +360,7 @@ trainer.test() ``` -### Results export and visualization +### Results Export and Visualization The user can specify a DeepRank2 exporter or a custom one in `output_exporters` parameter of the Trainer class, together with the path where to save the results. Exporters are used for storing predictions information collected later on during training and testing. Example: @@ -411,7 +411,7 @@ fig.update_layout( ) ``` -## Run a pre-trained model on new data +## Run a Pre-trained Model on New Data If you want to run a pre-trained model on new PDB files, the first step is to process and save them into HDF5 files. 
Let's suppose that the model has been trained with `ProteinProteinInterfaceQuery` queries mapped to graphs: diff --git a/docs/source/installation.md b/docs/source/installation.md index b945f260..9a0f22ea 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -11,27 +11,53 @@ There are two ways to install DeepRank2: (containerized-installation)= -In order to try out the package without worrying about your OS and without the need of installing all the required dependencies, we created a `Dockerfile` that can be used for taking care of everything in a suitable container. +We provide a pre-built Docker image hosted on GitHub Packages, allowing you to use DeepRank2 without worrying about installing dependencies or configuring your system. This is the recommended method for trying out the package quickly. -For this, you first need to install [Docker](https://docs.docker.com/engine/install/) on your system. Then run the following commands. You may need to have sudo permission for some steps, in which case the commands below can be preceded by `sudo`: +### Pull and Run the Pre-built Docker Image (Recommended) + +- Install [Docker](https://docs.docker.com/engine/install/) on your system, if not already installed. +- Pull the latest Docker image from GitHub Packages by running the following command: + +```bash +docker pull ghcr.io/deeprank/deeprank2:latest +``` + +- Run the container from the pulled image: + +```bash +docker run -p 8888:8888 ghcr.io/deeprank/deeprank2:latest +``` + +- Once the container is running, open your browser and navigate to `http://localhost:8888` to access the DeepRank2 application. + +From here, you can use DeepRank2, including running the tutorial notebooks. More details about the tutorials can be found [here](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). Note that the Docker container downloads only the raw PDB files required for the tutorials. 
To generate processed HDF5 files, you will need to run the `data_generation_xxx.ipynb` notebooks. Since Docker containers may have limited memory resources, we reduce the number of data points processed in the tutorials. To fully utilize the package, consider [installing it locally](#localremote-installation). + +### Build the Docker Image Manually + +If you prefer to build the Docker image yourself or run into issues with the pre-built image, you can manually build and run the container as follows: + +- Install [Docker](https://docs.docker.com/engine/install/) on your system, if not already installed. +- Clone the DeepRank2 repository and navigate to its root directory: ```bash -# Clone the DeepRank2 repository and enter its root directory git clone https://github.com/DeepRank/deeprank2 cd deeprank2 +``` -# Build and run the Docker image +- Build and run the Docker image: + +```bash docker build -t deeprank2 . docker run -p 8888:8888 deeprank2 ``` -Next, open a browser and go to `http://localhost:8888` to access the application running inside the Docker container. From there you can use DeepRank2, e.g. to run the tutorial notebooks. +- Once the container is running, open your browser and navigate to `http://localhost:8888` to access the DeepRank2 application. -More details about the tutorials' contents can be found [here](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). Note that in the docker container only the raw PDB files are downloaded, which needed as a starting point for the tutorials. You can obtain the processed HDF5 files by running the `data_generation_xxx.ipynb` notebooks. Because Docker containers are limited in memory resources, we limit the number of data points processed in the tutorials. Please [install the package locally](#localremote-installation) to fully leverage its capabilities. 
+### Removing the Docker Image -If after running the tutorials you want to remove the (quite large) Docker image from your machine, you must first [stop the container](https://docs.docker.com/engine/reference/commandline/stop/) and can then [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/). More general information about Docker can be found on the [official website docs](https://docs.docker.com/get-started/). +If you no longer need the Docker image (which can be quite large), you can remove it after stopping the container. Follow the [container stop](https://docs.docker.com/engine/reference/commandline/stop/) and [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/) instructions. For more general information on Docker, refer to the [Docker documentation](https://docs.docker.com/get-started/) directly. -## Local/remote installation +## Local/Remote Installation (localremote-installation)= @@ -39,7 +65,7 @@ Local installation is formally only supported on the latest stable release of ub Before installing DeepRank2 please ensure you have [GCC](https://gcc.gnu.org/install/) installed: if running `gcc --version` gives an error, run `sudo apt-get install gcc`. -## YML file installation (recommended) +## YML File Installation (Recommended) You can use the provided YML file for creating a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) via [mamba](https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html), containing the latest stable release of DeepRank2 and all its dependencies. This will install the CPU-only version of DeepRank2 on Python 3.10. @@ -56,7 +82,7 @@ pip install deeprank2 We also provide a frozen environment YML file located at `env/deeprank2_frozen.yml` with all dependencies set to fixed versions. The `env/deeprank2_frozen.yml` file provides a frozen environment with all dependencies set to fixed versions. 
This ensures reproducibility of experiments and results by preventing changes in package versions that could occur due to updates or modifications in the default `env/deeprank2.yml`. Use this frozen environment file for a stable and consistent setup, particularly if you encounter issues with the default environment file. -## Manual installation (customizable) +## Manual Installation (Customizable) (manual-installation)= @@ -84,7 +110,7 @@ pip install -e .'[test]' The `test` extra is optional, and can be used to install test-related dependencies, useful during development. -## Testing DeepRank2 installation +## Testing DeepRank2 Installation If you have cloned the repository, you can check that all components were installed correctly using `pytest`. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). diff --git a/tests/perf/srv_perf.py b/tests/perf/srv_perf.py index 94be374e..645030cb 100644 --- a/tests/perf/srv_perf.py +++ b/tests/perf/srv_perf.py @@ -88,7 +88,7 @@ def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list, list, list, list]: - csv_data = pd.read_csv(os.path.join(data_path, "srv_target_values.csv")) + csv_data = pd.read_csv(os.path.join(data_path, "srv_target_values_curated.csv")) # before running this script change .ent to .pdb pdb_files = glob.glob(os.path.join(data_path, "pdb", "*.pdb")) pdb_files.sort() diff --git a/tutorials/data_generation_ppi.ipynb b/tutorials/data_generation_ppi.ipynb index 98cd77fa..8106e363 100644 --- a/tutorials/data_generation_ppi.ipynb +++ b/tutorials/data_generation_ppi.ipynb @@ -1,559 +1,559 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data preparation for protein-protein interfaces\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction\n", - "\n", - "\n", - "\n", - "This tutorial will demonstrate the use of 
DeepRank2 for generating protein-protein interface (PPI) graphs and saving them as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format) files, using [PBD files]() of protein-protein complexes as input.\n", - "\n", - "In this data processing phase, for each protein-protein complex an interface is selected according to a distance threshold that the user can customize, and it is mapped to a graph. Nodes either represent residues or atoms, and edges are the interactions between them. Each node and edge can have several different features, which are generated and added during the processing phase as well. Optionally, the graphs can be mapped to volumetric grids (i.e., 3D image-like representations), together with their features. The mapped data are finally saved into HDF5 files, and can be used for later models' training (for details go to [training_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/training_ppi.ipynb) tutorial). In particular, graphs can be used for the training of Graph Neural Networks (GNNs), and grids can be used for the training of Convolutional Neural Networks (CNNs).\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Input Data\n", - "\n", - "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/7997585). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. 
The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", - "\n", - "Note that the dataset contains only 100 data points, which is not enough to develop an impactful predictive model, and the scope of its use is indeed only demonstrative and informative for the users.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Utilities\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Libraries\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The libraries needed for this tutorial:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import contextlib\n", - "import glob\n", - "import os\n", - "from pathlib import Path\n", - "\n", - "import h5py\n", - "import matplotlib.image as img\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "\n", - "from deeprank2.dataset import GraphDataset\n", - "from deeprank2.features import components, contact\n", - "from deeprank2.query import ProteinProteinInterfaceQuery, QueryCollection\n", - "from deeprank2.utils.grid import GridSettings, MapMethod" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Raw files and paths\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The paths for reading raw data and saving the processed ones:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_path = os.path.join(\"data_raw\", \"ppi\")\n", - "processed_data_path = os.path.join(\"data_processed\", \"ppi\")\n", - "residue_data_path = os.path.join(processed_data_path, \"residue\")\n", - "atomic_data_path = os.path.join(processed_data_path, 
\"atomic\")\n", - "\n", - "for output_path in [residue_data_path, atomic_data_path]:\n", - " os.makedirs(output_path, exist_ok=True)\n", - " if any(Path(output_path).iterdir()):\n", - " msg = f\"Please store any required data from `./{output_path}` and delete the folder.\\nThen re-run this cell to continue.\"\n", - " raise FileExistsError(msg)\n", - "\n", - "# Flag limit_data as True if you are running on a machine with limited memory (e.g., Docker container)\n", - "limit_data = True" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Raw data are PDB files in `data_raw/ppi/pdb/`, which contains atomic coordinates of the protein-protein complexes of interest, so in our case of pMHC complexes.\n", - "- Target data, so in our case the BA values for the pMHC complex, are in `data_raw/ppi/BA_values.csv`.\n", - "- The final PPI processed data will be saved in `data_processed/ppi/` folder, which in turns contains a folder for residue-level data and another one for atomic-level data. 
More details about such different levels will come a few cells below.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`get_pdb_files_and_target_data` is an helper function used to retrieve the raw pdb files names in a list and the BA target values from a CSV containing the IDs of the PDB models as well:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list[float]]:\n", - " csv_data = pd.read_csv(os.path.join(data_path, \"BA_values.csv\"))\n", - " pdb_files = glob.glob(os.path.join(data_path, \"pdb\", \"*.pdb\"))\n", - " pdb_files.sort()\n", - " pdb_ids_csv = [pdb_file.split(\"/\")[-1].split(\".\")[0] for pdb_file in pdb_files]\n", - " with contextlib.suppress(KeyError):\n", - " csv_data_indexed = csv_data.set_index(\"ID\")\n", - " csv_data_indexed = csv_data_indexed.loc[pdb_ids_csv]\n", - " bas = csv_data_indexed.measurement_value.tolist()\n", - "\n", - " return pdb_files, bas\n", - "\n", - "\n", - "pdb_files, bas = get_pdb_files_and_target_data(data_path)\n", - "\n", - "if limit_data:\n", - " pdb_files = pdb_files[:15]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `QueryCollection` and `Query` objects\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For each protein-protein complex, so for each data point, a query can be created and added to the `QueryCollection` object, to be processed later on.\n", - "\n", - "A query takes as inputs:\n", - "\n", - "- A `.pdb` file, representing the protein-protein structural complex.\n", - "- The resolution (`\"residue\"` or `\"atom\"`), i.e. whether each node should represent an amino acid residue or an atom.\n", - "- The ids of the two chains composing the complex. 
In our use case, \"M\" indicates the MHC protein chain and \"P\" the peptide chain.\n", - "- The interaction radius, which determines the threshold distance (in Γ…ngstrΓΆm) for residues/atoms surrounding the interface that will be included in the graph.\n", - "- The target values associated with the query. For each query/data point, in the use case demonstrated in this tutorial will add two targets: \"BA\" and \"binary\". The first represents the actual BA value of the complex in nM, while the second represents its binary mapping, being 0 (BA > 500 nM) a not-binding complex and 1 (BA <= 500 nM) a binding one.\n", - "- The max edge distance, which is the maximum distance between two nodes to generate an edge between them.\n", - "- Optional: The correspondent [Position-Specific Scoring Matrices (PSSMs)](https://en.wikipedia.org/wiki/Position_weight_matrix), in the form of .pssm files. PSSMs are optional and will not be used in this tutorial.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Residue-level PPIs using `ProteinProteinInterfaceQuery`\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = QueryCollection()\n", - "\n", - "influence_radius = 8 # max distance in Γ… between two interacting residues/atoms of two proteins\n", - "max_edge_length = 8\n", - "binary_target_value = 500\n", - "\n", - "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", - "for i in range(len(pdb_files)):\n", - " queries.add(\n", - " ProteinProteinInterfaceQuery(\n", - " pdb_path=pdb_files[i],\n", - " resolution=\"residue\",\n", - " chain_ids=[\"M\", \"P\"],\n", - " influence_radius=influence_radius,\n", - " max_edge_length=max_edge_length,\n", - " targets={\n", - " \"binary\": int(float(bas[i]) <= binary_target_value),\n", - " \"BA\": bas[i], # continuous target value\n", - " },\n", - " ),\n", - " )\n", - " if i + 1 % 20 == 0:\n", - " 
print(f\"{i+1} queries added to the collection.\")\n", - "\n", - "print(f\"{i+1} queries ready to be processed.\\n\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Notes on `process()` method\n", - "\n", - "Once all queries have been added to the `QueryCollection` instance, they can be processed. Main parameters of the `process()` method, include:\n", - "\n", - "- `prefix` sets the output file location.\n", - "- `feature_modules` allows you to choose which feature generating modules you want to use. By default, the basic features contained in `deeprank2.features.components` and `deeprank2.features.contact` are generated. Users can add custom features by creating a new module and placing it in the `deeprank2.feature` subpackage. A complete and detailed list of the pre-implemented features per module and more information about how to add custom features can be found [here](https://deeprank2.readthedocs.io/en/latest/features.html).\n", - " - Note that all features generated by a module will be added if that module was selected, and there is no way to only generate specific features from that module. However, during the training phase shown in `training_ppi.ipynb`, it is possible to select only a subset of available features.\n", - "- `cpu_count` can be used to specify how many processes to be run simultaneously, and will coincide with the number of HDF5 files generated. By default it takes all available CPU cores and HDF5 files are squashed into a single file using the `combine_output` setting.\n", - "- Optional: If you want to include grids in the HDF5 files, which represent the mapping of the graphs to a volumetric box, you need to define `grid_settings` and `grid_map_method`, as shown in the example below. 
If they are `None` (default), only graphs are saved.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "grid_settings = GridSettings( # None if you don't want grids\n", - " # the number of points on the x, y, z edges of the cube\n", - " points_counts=[35, 30, 30],\n", - " # x, y, z sizes of the box in Γ…\n", - " sizes=[1.0, 1.0, 1.0],\n", - ")\n", - "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", - "\n", - "queries.process(\n", - " prefix=os.path.join(processed_data_path, \"residue\", \"proc\"),\n", - " feature_modules=[components, contact],\n", - " cpu_count=8,\n", - " combine_output=False,\n", - " grid_settings=grid_settings,\n", - " grid_map_method=grid_map_method,\n", - ")\n", - "\n", - "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.')" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Exploring data\n", - "\n", - "As representative example, the following is the HDF5 structure generated by the previous code for `BA-100600.pdb`, so for one single graph, which represents one PPI, for the graph + grid case:\n", - "\n", - "```bash\n", - "└── residue-ppi:M-P:BA-100600\n", - " |\n", - " β”œβ”€β”€ edge_features\n", - " β”‚ β”œβ”€β”€ _index\n", - " β”‚ β”œβ”€β”€ _name\n", - " β”‚ β”œβ”€β”€ covalent\n", - " β”‚ β”œβ”€β”€ distance\n", - " β”‚ β”œβ”€β”€ electrostatic\n", - " β”‚ β”œβ”€β”€ same_chain\n", - " β”‚ └── vanderwaals\n", - " |\n", - " β”œβ”€β”€ node_features\n", - " β”‚ β”œβ”€β”€ _chain_id\n", - " β”‚ β”œβ”€β”€ _name\n", - " β”‚ β”œβ”€β”€ _position\n", - " β”‚ β”œβ”€β”€ hb_acceptors\n", - " β”‚ β”œβ”€β”€ hb_donors\n", - " β”‚ β”œβ”€β”€ polarity\n", - " β”‚ β”œβ”€β”€ res_charge\n", - " β”‚ β”œβ”€β”€ res_mass\n", - " | β”œβ”€β”€ res_pI\n", - " | β”œβ”€β”€ res_size\n", - " | └── res_type\n", - " |\n", - " β”œβ”€β”€ grid_points\n", - " β”‚ β”œβ”€β”€ 
center\n", - " β”‚ β”œβ”€β”€ x\n", - " β”‚ β”œβ”€β”€ y\n", - " β”‚ └── z\n", - " |\n", - " β”œβ”€β”€ mapped_features\n", - " β”‚ β”œβ”€β”€ _position_000\n", - " β”‚ β”œβ”€β”€ _position_001\n", - " β”‚ β”œβ”€β”€ _position_002\n", - " β”‚ β”œβ”€β”€ covalent\n", - " β”‚ β”œβ”€β”€ distance\n", - " β”‚ β”œβ”€β”€ electrostatic\n", - " β”‚ β”œβ”€β”€ polarity_000\n", - " β”‚ β”œβ”€β”€ polarity_001\n", - " β”‚ β”œβ”€β”€ polarity_002\n", - " β”‚ β”œβ”€β”€ polarity_003\n", - " | β”œβ”€β”€ ...\n", - " | └── vanderwaals\n", - " |\n", - " └── target_values\n", - " β”‚ β”œβ”€β”€ BA\n", - " └── binary\n", - "```\n", - "\n", - "`edge_features`, `node_features`, `mapped_features` are [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which contain [HDF5 Datasets](https://docs.h5py.org/en/stable/high/dataset.html) (e.g., `_index`, `electrostatic`, etc.), which in turn contains features values in the form of arrays. `edge_features` and `node_features` refer specificly to the graph representation, while `grid_points` and `mapped_features` refer to the grid mapped from the graph. Each data point generated by deeprank2 has the above structure, with the features and the target changing according to the user's settings. Features starting with `_` are present for human inspection of the data, but they are not used for training models.\n", - "\n", - "It is always a good practice to first explore the data, and then make decision about splitting them in training, test and validation sets. There are different possible ways for doing it.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Pandas dataframe\n", - "\n", - "The edge and node features just generated can be explored by instantiating the `GraphDataset` object, and then using `hdf5_to_pandas` method which converts node and edge features into a [Pandas](https://pandas.pydata.org/) dataframe. 
Each row represents a ppi in the form of a graph.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "processed_data = glob.glob(os.path.join(processed_data_path, \"residue\", \"*.hdf5\"))\n", - "dataset = GraphDataset(processed_data, target=\"binary\")\n", - "dataset_df = dataset.hdf5_to_pandas()\n", - "dataset_df.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also generate histograms for looking at the features distributions. An example:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fname = os.path.join(processed_data_path, \"residue\", \"res_mass_distance_electrostatic\")\n", - "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", - "\n", - "im = img.imread(fname + \".png\")\n", - "plt.figure(figsize=(15, 10))\n", - "fig = plt.imshow(im)\n", - "fig.axes.get_xaxis().set_visible(False)\n", - "fig.axes.get_yaxis().set_visible(False)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Other tools\n", - "\n", - "- [HDFView](https://www.hdfgroup.org/downloads/hdfview/), a visual tool written in Java for browsing and editing HDF5 files.\n", - " As representative example, the following is the structure for `BA-100600.pdb` seen from HDF5View:\n", - "\n", - " \n", - "\n", - " Using this tool you can inspect the values of the features visually, for each data point.\n", - "\n", - "- Python packages such as [h5py](https://docs.h5py.org/en/stable/index.html). 
Examples:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with h5py.File(processed_data[0], \"r\") as hdf5:\n", - " # List of all graphs in hdf5, each graph representing a ppi\n", - " ids = list(hdf5.keys())\n", - " print(f\"IDs of PPIs in {processed_data[0]}: {ids}\")\n", - " node_features = list(hdf5[ids[0]][\"node_features\"])\n", - " print(f\"Node features: {node_features}\")\n", - " edge_features = list(hdf5[ids[0]][\"edge_features\"])\n", - " print(f\"Edge features: {edge_features}\")\n", - " target_features = list(hdf5[ids[0]][\"target_values\"])\n", - " print(f\"Targets features: {target_features}\")\n", - " # Polarity feature for ids[0], numpy.ndarray\n", - " node_feat_polarity = hdf5[ids[0]][\"node_features\"][\"polarity\"][:]\n", - " print(f\"Polarity feature shape: {node_feat_polarity.shape}\")\n", - " # Electrostatic feature for ids[0], numpy.ndarray\n", - " edge_feat_electrostatic = hdf5[ids[0]][\"edge_features\"][\"electrostatic\"][:]\n", - " print(f\"Electrostatic feature shape: {edge_feat_electrostatic.shape}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Atomic-level PPIs using `ProteinProteinInterfaceQuery`\n", - "\n", - "Graphs can also be generated at an atomic resolution, very similarly to what has just been done for residue-level.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = QueryCollection()\n", - "\n", - "influence_radius = 5 # max distance in Γ… between two interacting residues/atoms of two proteins\n", - "max_edge_length = 5\n", - "binary_target_value = 500\n", - "\n", - "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", - "for i in range(len(pdb_files)):\n", - " queries.add(\n", - " ProteinProteinInterfaceQuery(\n", - " pdb_path=pdb_files[i],\n", - " resolution=\"atom\",\n", - " chain_ids=[\"M\", 
\"P\"],\n", - " influence_radius=influence_radius,\n", - " max_edge_length=max_edge_length,\n", - " targets={\n", - " \"binary\": int(float(bas[i]) <= binary_target_value),\n", - " \"BA\": bas[i], # continuous target value\n", - " },\n", - " ),\n", - " )\n", - " if i + 1 % 20 == 0:\n", - " print(f\"{i+1} queries added to the collection.\")\n", - "\n", - "print(f\"{i+1} queries ready to be processed.\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "grid_settings = GridSettings( # None if you don't want grids\n", - " # the number of points on the x, y, z edges of the cube\n", - " points_counts=[35, 30, 30],\n", - " # x, y, z sizes of the box in Γ…\n", - " sizes=[1.0, 1.0, 1.0],\n", - ")\n", - "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", - "\n", - "queries.process(\n", - " prefix=os.path.join(processed_data_path, \"atomic\", \"proc\"),\n", - " feature_modules=[components, contact],\n", - " cpu_count=8,\n", - " combine_output=False,\n", - " grid_settings=grid_settings,\n", - " grid_map_method=grid_map_method,\n", - ")\n", - "\n", - "print(f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.')" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Again, the data can be inspected using `hdf5_to_pandas` function.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "processed_data = glob.glob(os.path.join(processed_data_path, \"atomic\", \"*.hdf5\"))\n", - "dataset = GraphDataset(processed_data, target=\"binary\")\n", - "dataset_df = dataset.hdf5_to_pandas()\n", - "dataset_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fname = os.path.join(processed_data_path, \"atomic\", \"atom_charge\")\n", - "dataset.save_hist(features=\"atom_charge\", fname=fname)\n", - "\n", - "im = img.imread(fname + \".png\")\n", - "plt.figure(figsize=(8, 8))\n", - "fig = plt.imshow(im)\n", - "fig.axes.get_xaxis().set_visible(False)\n", - "fig.axes.get_yaxis().set_visible(False)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that some of the features are different from the ones generated with the residue-level queries. There are indeed features in `deeprank2.features.components` module which are generated only in atomic graphs, i.e. 
`atom_type`, `atom_charge`, and `pdb_occupancy`, because they don't make sense only in the atomic graphs' representation.\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "deeprank2", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data preparation for protein-protein interfaces\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "\n", + "\n", + "This tutorial will demonstrate the use of DeepRank2 for generating protein-protein interface (PPI) graphs and saving them as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format) files, using [PBD files]() of protein-protein complexes as input.\n", + "\n", + "In this data processing phase, for each protein-protein complex an interface is selected according to a distance threshold that the user can customize, and it is mapped to a graph. Nodes either represent residues or atoms, and edges are the interactions between them. Each node and edge can have several different features, which are generated and added during the processing phase as well. Optionally, the graphs can be mapped to volumetric grids (i.e., 3D image-like representations), together with their features. The mapped data are finally saved into HDF5 files, and can be used for later models' training (for details go to [training_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/training_ppi.ipynb) tutorial). 
In particular, graphs can be used for the training of Graph Neural Networks (GNNs), and grids can be used for the training of Convolutional Neural Networks (CNNs).\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Input Data\n", + "\n", + "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/7997585). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", + "\n", + "Note that the dataset contains only 100 data points, which is not enough to develop an impactful predictive model, and the scope of its use is indeed only demonstrative and informative for the users.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Utilities\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Libraries\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The libraries needed for this tutorial:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import contextlib\n", + "import glob\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "import h5py\n", + "import matplotlib.image as img\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "\n", + "from deeprank2.dataset import GraphDataset\n", + "from deeprank2.features import components, contact\n", + "from deeprank2.query import ProteinProteinInterfaceQuery, QueryCollection\n", + "from deeprank2.utils.grid import GridSettings, MapMethod" + ] + }, + { + "attachments": {}, + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### Raw files and paths\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The paths for reading raw data and saving the processed ones:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_path = os.path.join(\"data_raw\", \"ppi\")\n", + "processed_data_path = os.path.join(\"data_processed\", \"ppi\")\n", + "residue_data_path = os.path.join(processed_data_path, \"residue\")\n", + "atomic_data_path = os.path.join(processed_data_path, \"atomic\")\n", + "\n", + "for output_path in [residue_data_path, atomic_data_path]:\n", + " os.makedirs(output_path, exist_ok=True)\n", + " if any(Path(output_path).iterdir()):\n", + " msg = f\"Please store any required data from `./{output_path}` and delete the folder.\\nThen re-run this cell to continue.\"\n", + " raise FileExistsError(msg)\n", + "\n", + "# Flag limit_data as True if you are running on a machine with limited memory (e.g., Docker container)\n", + "limit_data = True" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Raw data are PDB files in `data_raw/ppi/pdb/`, which contains atomic coordinates of the protein-protein complexes of interest, so in our case of pMHC complexes.\n", + "- Target data, so in our case the BA values for the pMHC complex, are in `data_raw/ppi/BA_values.csv`.\n", + "- The final PPI processed data will be saved in `data_processed/ppi/` folder, which in turns contains a folder for residue-level data and another one for atomic-level data. 
More details about such different levels will come a few cells below.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`get_pdb_files_and_target_data` is an helper function used to retrieve the raw pdb files names in a list and the BA target values from a CSV containing the IDs of the PDB models as well:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list[float]]:\n", + " csv_data = pd.read_csv(os.path.join(data_path, \"BA_values.csv\"))\n", + " pdb_files = glob.glob(os.path.join(data_path, \"pdb\", \"*.pdb\"))\n", + " pdb_files.sort()\n", + " pdb_ids_csv = [pdb_file.split(\"/\")[-1].split(\".\")[0] for pdb_file in pdb_files]\n", + " with contextlib.suppress(KeyError):\n", + " csv_data_indexed = csv_data.set_index(\"ID\")\n", + " csv_data_indexed = csv_data_indexed.loc[pdb_ids_csv]\n", + " bas = csv_data_indexed.measurement_value.tolist()\n", + "\n", + " return pdb_files, bas\n", + "\n", + "\n", + "pdb_files, bas = get_pdb_files_and_target_data(data_path)\n", + "\n", + "if limit_data:\n", + " pdb_files = pdb_files[:15]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `QueryCollection` and `Query` objects\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For each protein-protein complex, so for each data point, a query can be created and added to the `QueryCollection` object, to be processed later on.\n", + "\n", + "A query takes as inputs:\n", + "\n", + "- A `.pdb` file, representing the protein-protein structural complex.\n", + "- The resolution (`\"residue\"` or `\"atom\"`), i.e. whether each node should represent an amino acid residue or an atom.\n", + "- The ids of the two chains composing the complex. 
In our use case, \"M\" indicates the MHC protein chain and \"P\" the peptide chain.\n", + "- The interaction radius, which determines the threshold distance (in Γ…ngstrΓΆm) for residues/atoms surrounding the interface that will be included in the graph.\n", + "- The target values associated with the query. For each query/data point, in the use case demonstrated in this tutorial we will add two targets: \"BA\" and \"binary\". The first represents the actual BA value of the complex in nM, while the second represents its binary mapping, being 0 (BA > 500 nM) a non-binding complex and 1 (BA <= 500 nM) a binding one.\n", + "- The max edge distance, which is the maximum distance between two nodes to generate an edge between them.\n", + "- Optional: The correspondent [Position-Specific Scoring Matrices (PSSMs)](https://en.wikipedia.org/wiki/Position_weight_matrix), in the form of .pssm files. PSSMs are optional and will not be used in this tutorial.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Residue-level PPIs using `ProteinProteinInterfaceQuery`\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "queries = QueryCollection()\n", + "\n", + "influence_radius = 8 # max distance in Γ… between two interacting residues/atoms of two proteins\n", + "max_edge_length = 8\n", + "binary_target_value = 500\n", + "\n", + "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", + "for i in range(len(pdb_files)):\n", + " queries.add(\n", + " ProteinProteinInterfaceQuery(\n", + " pdb_path=pdb_files[i],\n", + " resolution=\"residue\",\n", + " chain_ids=[\"M\", \"P\"],\n", + " influence_radius=influence_radius,\n", + " max_edge_length=max_edge_length,\n", + " targets={\n", + " \"binary\": int(float(bas[i]) <= binary_target_value),\n", + " \"BA\": bas[i], # continuous target value\n", + " },\n", + " ),\n", + " )\n", + " if (i + 1) % 20 == 0:\n", + " 
print(f\"{i+1} queries added to the collection.\")\n", + "\n", + "print(f\"{i+1} queries ready to be processed.\\n\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Notes on `process()` method\n", + "\n", + "Once all queries have been added to the `QueryCollection` instance, they can be processed. Main parameters of the `process()` method, include:\n", + "\n", + "- `prefix` sets the output file location.\n", + "- `feature_modules` allows you to choose which feature generating modules you want to use. By default, the basic features contained in `deeprank2.features.components` and `deeprank2.features.contact` are generated. Users can add custom features by creating a new module and placing it in the `deeprank2.feature` subpackage. A complete and detailed list of the pre-implemented features per module and more information about how to add custom features can be found [here](https://deeprank2.readthedocs.io/en/latest/features.html).\n", + " - Note that all features generated by a module will be added if that module was selected, and there is no way to only generate specific features from that module. However, during the training phase shown in `training_ppi.ipynb`, it is possible to select only a subset of available features.\n", + "- `cpu_count` can be used to specify how many processes to be run simultaneously, and will coincide with the number of HDF5 files generated. By default it takes all available CPU cores and HDF5 files are squashed into a single file using the `combine_output` setting.\n", + "- Optional: If you want to include grids in the HDF5 files, which represent the mapping of the graphs to a volumetric box, you need to define `grid_settings` and `grid_map_method`, as shown in the example below. 
If they are `None` (default), only graphs are saved.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grid_settings = GridSettings( # None if you don't want grids\n", + " # the number of points on the x, y, z edges of the cube\n", + " points_counts=[35, 30, 30],\n", + " # x, y, z sizes of the box in Γ…\n", + " sizes=[1.0, 1.0, 1.0],\n", + ")\n", + "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", + "\n", + "queries.process(\n", + " prefix=os.path.join(processed_data_path, \"residue\", \"proc\"),\n", + " feature_modules=[components, contact],\n", + " cpu_count=8,\n", + " combine_output=False,\n", + " grid_settings=grid_settings,\n", + " grid_map_method=grid_map_method,\n", + ")\n", + "\n", + "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exploring data\n", + "\n", + "As representative example, the following is the HDF5 structure generated by the previous code for `BA-100600.pdb`, so for one single graph, which represents one PPI, for the graph + grid case:\n", + "\n", + "```bash\n", + "└── residue-ppi:M-P:BA-100600\n", + " |\n", + " β”œβ”€β”€ edge_features\n", + " β”‚ β”œβ”€β”€ _index\n", + " β”‚ β”œβ”€β”€ _name\n", + " β”‚ β”œβ”€β”€ covalent\n", + " β”‚ β”œβ”€β”€ distance\n", + " β”‚ β”œβ”€β”€ electrostatic\n", + " β”‚ β”œβ”€β”€ same_chain\n", + " β”‚ └── vanderwaals\n", + " |\n", + " β”œβ”€β”€ node_features\n", + " β”‚ β”œβ”€β”€ _chain_id\n", + " β”‚ β”œβ”€β”€ _name\n", + " β”‚ β”œβ”€β”€ _position\n", + " β”‚ β”œβ”€β”€ hb_acceptors\n", + " β”‚ β”œβ”€β”€ hb_donors\n", + " β”‚ β”œβ”€β”€ polarity\n", + " β”‚ β”œβ”€β”€ res_charge\n", + " β”‚ β”œβ”€β”€ res_mass\n", + " | β”œβ”€β”€ res_pI\n", + " | β”œβ”€β”€ res_size\n", + " | └── res_type\n", + " |\n", + " β”œβ”€β”€ grid_points\n", + " β”‚ β”œβ”€β”€ 
center\n", + " β”‚ β”œβ”€β”€ x\n", + " β”‚ β”œβ”€β”€ y\n", + " β”‚ └── z\n", + " |\n", + " β”œβ”€β”€ mapped_features\n", + " β”‚ β”œβ”€β”€ _position_000\n", + " β”‚ β”œβ”€β”€ _position_001\n", + " β”‚ β”œβ”€β”€ _position_002\n", + " β”‚ β”œβ”€β”€ covalent\n", + " β”‚ β”œβ”€β”€ distance\n", + " β”‚ β”œβ”€β”€ electrostatic\n", + " β”‚ β”œβ”€β”€ polarity_000\n", + " β”‚ β”œβ”€β”€ polarity_001\n", + " β”‚ β”œβ”€β”€ polarity_002\n", + " β”‚ β”œβ”€β”€ polarity_003\n", + " | β”œβ”€β”€ ...\n", + " | └── vanderwaals\n", + " |\n", + " └── target_values\n", + " β”‚ β”œβ”€β”€ BA\n", + " └── binary\n", + "```\n", + "\n", + "`edge_features`, `node_features`, `mapped_features` are [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which contain [HDF5 Datasets](https://docs.h5py.org/en/stable/high/dataset.html) (e.g., `_index`, `electrostatic`, etc.), which in turn contains features values in the form of arrays. `edge_features` and `node_features` refer specificly to the graph representation, while `grid_points` and `mapped_features` refer to the grid mapped from the graph. Each data point generated by deeprank2 has the above structure, with the features and the target changing according to the user's settings. Features starting with `_` are present for human inspection of the data, but they are not used for training models.\n", + "\n", + "It is always a good practice to first explore the data, and then make decision about splitting them in training, test and validation sets. There are different possible ways for doing it.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Pandas dataframe\n", + "\n", + "The edge and node features just generated can be explored by instantiating the `GraphDataset` object, and then using `hdf5_to_pandas` method which converts node and edge features into a [Pandas](https://pandas.pydata.org/) dataframe. 
Each row represents a ppi in the form of a graph.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "processed_data = glob.glob(os.path.join(processed_data_path, \"residue\", \"*.hdf5\"))\n", + "dataset = GraphDataset(processed_data, target=\"binary\")\n", + "dataset_df = dataset.hdf5_to_pandas()\n", + "dataset_df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also generate histograms for looking at the features distributions. An example:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fname = os.path.join(processed_data_path, \"residue\", \"res_mass_distance_electrostatic\")\n", + "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", + "\n", + "im = img.imread(fname + \".png\")\n", + "plt.figure(figsize=(15, 10))\n", + "fig = plt.imshow(im)\n", + "fig.axes.get_xaxis().set_visible(False)\n", + "fig.axes.get_yaxis().set_visible(False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Other tools\n", + "\n", + "- [HDFView](https://www.hdfgroup.org/downloads/hdfview/), a visual tool written in Java for browsing and editing HDF5 files.\n", + " As representative example, the following is the structure for `BA-100600.pdb` seen from HDF5View:\n", + "\n", + " \n", + "\n", + " Using this tool you can inspect the values of the features visually, for each data point.\n", + "\n", + "- Python packages such as [h5py](https://docs.h5py.org/en/stable/index.html). 
Examples:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with h5py.File(processed_data[0], \"r\") as hdf5:\n", + " # List of all graphs in hdf5, each graph representing a ppi\n", + " ids = list(hdf5.keys())\n", + " print(f\"IDs of PPIs in {processed_data[0]}: {ids}\")\n", + " node_features = list(hdf5[ids[0]][\"node_features\"])\n", + " print(f\"Node features: {node_features}\")\n", + " edge_features = list(hdf5[ids[0]][\"edge_features\"])\n", + " print(f\"Edge features: {edge_features}\")\n", + " target_features = list(hdf5[ids[0]][\"target_values\"])\n", + " print(f\"Targets features: {target_features}\")\n", + " # Polarity feature for ids[0], numpy.ndarray\n", + " node_feat_polarity = hdf5[ids[0]][\"node_features\"][\"polarity\"][:]\n", + " print(f\"Polarity feature shape: {node_feat_polarity.shape}\")\n", + " # Electrostatic feature for ids[0], numpy.ndarray\n", + " edge_feat_electrostatic = hdf5[ids[0]][\"edge_features\"][\"electrostatic\"][:]\n", + " print(f\"Electrostatic feature shape: {edge_feat_electrostatic.shape}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Atomic-level PPIs using `ProteinProteinInterfaceQuery`\n", + "\n", + "Graphs can also be generated at an atomic resolution, very similarly to what has just been done for residue-level.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "queries = QueryCollection()\n", + "\n", + "influence_radius = 5 # max distance in Γ… between two interacting residues/atoms of two proteins\n", + "max_edge_length = 5\n", + "binary_target_value = 500\n", + "\n", + "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", + "for i in range(len(pdb_files)):\n", + " queries.add(\n", + " ProteinProteinInterfaceQuery(\n", + " pdb_path=pdb_files[i],\n", + " resolution=\"atom\",\n", + " chain_ids=[\"M\", 
\"P\"],\n", + " influence_radius=influence_radius,\n", + " max_edge_length=max_edge_length,\n", + " targets={\n", + " \"binary\": int(float(bas[i]) <= binary_target_value),\n", + " \"BA\": bas[i], # continuous target value\n", + " },\n", + " ),\n", + " )\n", + " if (i + 1) % 20 == 0:\n", + " print(f\"{i+1} queries added to the collection.\")\n", + "\n", + "print(f\"{i+1} queries ready to be processed.\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grid_settings = GridSettings( # None if you don't want grids\n", + " # the number of points on the x, y, z edges of the cube\n", + " points_counts=[35, 30, 30],\n", + " # x, y, z sizes of the box in Γ…\n", + " sizes=[1.0, 1.0, 1.0],\n", + ")\n", + "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", + "\n", + "queries.process(\n", + " prefix=os.path.join(processed_data_path, \"atomic\", \"proc\"),\n", + " feature_modules=[components, contact],\n", + " cpu_count=8,\n", + " combine_output=False,\n", + " grid_settings=grid_settings,\n", + " grid_map_method=grid_map_method,\n", + ")\n", + "\n", + "print(f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, the data can be inspected using `hdf5_to_pandas` function.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "processed_data = glob.glob(os.path.join(processed_data_path, \"atomic\", \"*.hdf5\"))\n", + "dataset = GraphDataset(processed_data, target=\"binary\")\n", + "dataset_df = dataset.hdf5_to_pandas()\n", + "dataset_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fname = os.path.join(processed_data_path, \"atomic\", \"atom_charge\")\n", + "dataset.save_hist(features=\"atom_charge\", fname=fname)\n", + "\n", + "im = img.imread(fname + \".png\")\n", + "plt.figure(figsize=(8, 8))\n", + "fig = plt.imshow(im)\n", + "fig.axes.get_xaxis().set_visible(False)\n", + "fig.axes.get_yaxis().set_visible(False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that some of the features are different from the ones generated with the residue-level queries. There are indeed features in `deeprank2.features.components` module which are generated only in atomic graphs, i.e. 
`atom_type`, `atom_charge`, and `pdb_occupancy`, because they don't make sense only in the atomic graphs' representation.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deeprank2", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index 7dcde58a..83254395 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -1,583 +1,583 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data preparation for single-residue variants\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction\n", - "\n", - "\n", - "\n", - "This tutorial will demonstrate the use of DeepRank2 for generating single-residue variants (SRVs) graphs and saving them as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format) files, using [PBD files]() of protein structures as input.\n", - "\n", - "In this data processing phase, a local neighborhood around the mutated residue is selected for each SRV according to a radius threshold that the user can customize. All atoms or residues within the threshold are mapped as the nodes to a graph and the interactions between them are the edges of the graph. Each node and edge can have several distinct (structural or physico-chemical) features, which are generated and added during the processing phase as well. Optionally, the graphs can be mapped to volumetric grids (i.e., 3D image-like representations), together with their features. 
Finally, the mapped data are saved as HDF5 files, which can be used for training predictive models (for details see [training_ppi.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/training_ppi.ipynb) tutorial). In particular, graphs can be used for the training of Graph Neural Networks (GNNs), and grids can be used for the training of Convolutional Neural Networks (CNNs).\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Input Data\n", - "\n", - "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/7997585). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", - "\n", - "Note that the dataset contains only 96 data points, which is not enough to develop an impactful predictive model, and the scope of its use is indeed only demonstrative and informative for the users.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Utilities\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Libraries\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The libraries needed for this tutorial:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import contextlib\n", - "import glob\n", - "import os\n", - "from pathlib import Path\n", - "\n", - "import h5py\n", - "import matplotlib.image as img\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "\n", - "from deeprank2.dataset import GraphDataset\n", - "from 
deeprank2.domain.aminoacidlist import amino_acids_by_code\n", - "from deeprank2.features import components, contact\n", - "from deeprank2.query import QueryCollection, SingleResidueVariantQuery\n", - "from deeprank2.utils.grid import GridSettings, MapMethod" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Raw files and paths\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The paths for reading raw data and saving the processed ones:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_path = os.path.join(\"data_raw\", \"srv\")\n", - "processed_data_path = os.path.join(\"data_processed\", \"srv\")\n", - "residue_data_path = os.path.join(processed_data_path, \"residue\")\n", - "atomic_data_path = os.path.join(processed_data_path, \"atomic\")\n", - "\n", - "for output_path in [residue_data_path, atomic_data_path]:\n", - " os.makedirs(output_path, exist_ok=True)\n", - " if any(Path(output_path).iterdir()):\n", - " msg = f\"Please store any required data from `./{output_path}` and delete the folder.\\nThen re-run this cell to continue.\"\n", - " raise FileExistsError(msg)\n", - "\n", - "# Flag limit_data as True if you are running on a machine with limited memory (e.g., Docker container)\n", - "limit_data = False" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Raw data are PDB files in `data_raw/srv/pdb/`, which contains atomic coordinates of the protein structure containing the variant.\n", - "- Target data, so in our case pathogenic versus benign labels, are in `data_raw/srv/srv_target_values.csv`.\n", - "- The final SRV processed data will be saved in `data_processed/srv/` folder, which in turns contains a folder for residue-level data and another one for atomic-level data. 
More details about such different levels will come a few cells below.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`get_pdb_files_and_target_data` is an helper function used to retrieve the raw pdb files names, SRVs information and target values in a list from the CSV:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list[int], list[str], list[str], list[float]]:\n", - " csv_data = pd.read_csv(os.path.join(data_path, \"srv_target_values_curated.csv\"))\n", - " pdb_files = glob.glob(os.path.join(data_path, \"pdb\", \"*.ent\"))\n", - " pdb_files.sort()\n", - " pdb_file_names = [os.path.basename(pdb_file) for pdb_file in pdb_files]\n", - " csv_data_indexed = csv_data.set_index(\"pdb_file\")\n", - " with contextlib.suppress(KeyError):\n", - " csv_data_indexed = csv_data_indexed.loc[pdb_file_names]\n", - " res_numbers = csv_data_indexed.res_number.tolist()\n", - " res_wildtypes = csv_data_indexed.res_wildtype.tolist()\n", - " res_variants = csv_data_indexed.res_variant.tolist()\n", - " targets = csv_data_indexed.target.tolist()\n", - " pdb_names = csv_data_indexed.index.tolist()\n", - " pdb_files = [data_path + \"/pdb/\" + pdb_name for pdb_name in pdb_names]\n", - "\n", - " return pdb_files, res_numbers, res_wildtypes, res_variants, targets\n", - "\n", - "\n", - "pdb_files, res_numbers, res_wildtypes, res_variants, targets = get_pdb_files_and_target_data(data_path)\n", - "\n", - "if limit_data:\n", - " pdb_files = pdb_files[:15]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `QueryCollection` and `Query` objects\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For each SRV, so for each data point, a query can be created and added to the `QueryCollection` object, to be processed 
later on. Different types of queries exist, based on the molecular resolution needed:\n", - "\n", - "A query takes as inputs:\n", - "\n", - "- A `.pdb` file, representing the protein structure containing the SRV.\n", - "- The resolution (`\"residue\"` or `\"atom\"`), i.e. whether each node should represent an amino acid residue or an atom.\n", - "- The chain id of the SRV.\n", - "- The residue number of the missense mutation.\n", - "- The insertion code, used when two residues have the same numbering. The combination of residue numbering and insertion code defines the unique residue.\n", - "- The wildtype amino acid.\n", - "- The variant amino acid.\n", - "- The interaction radius, which determines the threshold distance (in Γ…ngstrΓΆm) for residues/atoms surrounding the mutation that will be included in the graph.\n", - "- The target values associated with the query. For each query/data point, in the use case demonstrated in this tutorial will add a 0 if the SRV belongs to the benign class, and 1 if it belongs to the pathogenic one.\n", - "- The max edge distance, which is the maximum distance between two nodes to generate an edge between them.\n", - "- Optional: The correspondent [Position-Specific Scoring Matrices (PSSMs)](https://en.wikipedia.org/wiki/Position_weight_matrix), per chain identifier, in the form of .pssm files. 
PSSMs are optional and will not be used in this tutorial.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Residue-level SRV: `SingleResidueVariantQuery`\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = QueryCollection()\n", - "\n", - "influence_radius = 10.0 # radius to select the local neighborhood around the SRV\n", - "max_edge_length = 4.5 # ??\n", - "\n", - "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", - "for i in range(len(pdb_files)):\n", - " queries.add(\n", - " SingleResidueVariantQuery(\n", - " pdb_path=pdb_files[i],\n", - " resolution=\"residue\",\n", - " chain_ids=\"A\",\n", - " variant_residue_number=res_numbers[i],\n", - " insertion_code=None,\n", - " wildtype_amino_acid=amino_acids_by_code[res_wildtypes[i]],\n", - " variant_amino_acid=amino_acids_by_code[res_variants[i]],\n", - " targets={\"binary\": targets[i]},\n", - " influence_radius=influence_radius,\n", - " max_edge_length=max_edge_length,\n", - " ),\n", - " )\n", - " if i + 1 % 20 == 0:\n", - " print(f\"{i+1} queries added to the collection.\")\n", - "\n", - "print(f\"{i+1} queries ready to be processed.\\n\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Notes on `process()` method\n", - "\n", - "Once all queries have been added to the `QueryCollection` instance, they can be processed. Main parameters of the `process()` method, include:\n", - "\n", - "- `prefix` sets the output file location.\n", - "- `feature_modules` allows you to choose which feature generating modules you want to use. By default, the basic features contained in `deeprank2.features.components` and `deeprank2.features.contact` are generated. Users can add custom features by creating a new module and placing it in the `deeprank2.feature` subpackage. 
A complete and detailed list of the pre-implemented features per module and more information about how to add custom features can be found [here](https://deeprank2.readthedocs.io/en/latest/features.html).\n", - " - Note that all features generated by a module will be added if that module was selected, and there is no way to only generate specific features from that module. However, during the training phase shown in `training_ppi.ipynb`, it is possible to select only a subset of available features.\n", - "- `cpu_count` can be used to specify how many processes to be run simultaneously, and will coincide with the number of HDF5 files generated. By default it takes all available CPU cores and HDF5 files are squashed into a single file using the `combine_output` setting.\n", - "- Optional: If you want to include grids in the HDF5 files, which represent the mapping of the graphs to a volumetric box, you need to define `grid_settings` and `grid_map_method`, as shown in the example below. If they are `None` (default), only graphs are saved.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "grid_settings = GridSettings( # None if you don't want grids\n", - " # the number of points on the x, y, z edges of the cube\n", - " points_counts=[35, 30, 30],\n", - " # x, y, z sizes of the box in Γ…\n", - " sizes=[1.0, 1.0, 1.0],\n", - ")\n", - "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", - "\n", - "queries.process(\n", - " prefix=os.path.join(processed_data_path, \"residue\", \"proc\"),\n", - " feature_modules=[components, contact],\n", - " cpu_count=8,\n", - " combine_output=False,\n", - " grid_settings=grid_settings,\n", - " grid_map_method=grid_map_method,\n", - ")\n", - "\n", - "print(f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.')" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Exploring data\n", - "\n", - "As representative example, the following is the HDF5 structure generated by the previous code for `pdb2ooh.ent`, so for one single graph, which represents one protein structure containing a SRV in position 112, for the graph + grid case:\n", - "\n", - "```bash\n", - "└── residue-graph:A:112:Threonine->Isoleucine:pdb2ooh\n", - " |\n", - " β”œβ”€β”€ edge_features\n", - " β”‚ β”œβ”€β”€ _index\n", - " β”‚ β”œβ”€β”€ _name\n", - " β”‚ β”œβ”€β”€ covalent\n", - " β”‚ β”œβ”€β”€ distance\n", - " β”‚ β”œβ”€β”€ electrostatic\n", - " β”‚ β”œβ”€β”€ same_chain\n", - " β”‚ └── vanderwaals\n", - " |\n", - " β”œβ”€β”€ node_features\n", - " β”‚ β”œβ”€β”€ _chain_id\n", - " β”‚ β”œβ”€β”€ _name\n", - " β”‚ β”œβ”€β”€ _position\n", - " β”‚ β”œβ”€β”€ diff_charge\n", - " β”‚ β”œβ”€β”€ diff_hb_donors\n", - " β”‚ β”œβ”€β”€ diff_hb_acceptors\n", - " β”‚ β”œβ”€β”€ diff_mass\n", - " β”‚ β”œβ”€β”€ diff_pI\n", - " β”‚ β”œβ”€β”€ diff_polarity\n", - " β”‚ β”œβ”€β”€ diff_size\n", - " β”‚ β”œβ”€β”€ hb_acceptors\n", - " β”‚ β”œβ”€β”€ hb_donors\n", - " β”‚ β”œβ”€β”€ polarity\n", - " β”‚ β”œβ”€β”€ res_charge\n", - " β”‚ β”œβ”€β”€ res_mass\n", - " | β”œβ”€β”€ res_pI\n", - " | β”œβ”€β”€ res_size\n", - " | β”œβ”€β”€ res_type\n", - " | └── variant_res\n", - " |\n", - " β”œβ”€β”€ grid_points\n", - " β”‚ β”œβ”€β”€ center\n", - " β”‚ β”œβ”€β”€ x\n", - " β”‚ β”œβ”€β”€ y\n", - " β”‚ └── z\n", - " |\n", - " β”œβ”€β”€ mapped_features\n", - " β”‚ β”œβ”€β”€ _position_000\n", - " β”‚ β”œβ”€β”€ _position_001\n", - " β”‚ β”œβ”€β”€ _position_002\n", - " β”‚ β”œβ”€β”€ covalent\n", - " β”‚ β”œβ”€β”€ distance\n", - " β”‚ β”œβ”€β”€ electrostatic\n", - " β”‚ β”œβ”€β”€ diff_polarity_000\n", - " β”‚ β”œβ”€β”€ diff_polarity_001\n", - " β”‚ β”œβ”€β”€ diff_polarity_002\n", - " β”‚ β”œβ”€β”€ diff_polarity_003\n", - " | 
β”œβ”€β”€ ...\n", - " | └── vanderwaals\n", - " |\n", - " └── target_values\n", - " └── binary\n", - "```\n", - "\n", - "`edge_features`, `node_features`, `mapped_features` are [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which contain [HDF5 Datasets](https://docs.h5py.org/en/stable/high/dataset.html) (e.g., `_index`, `electrostatic`, etc.), which in turn contains features values in the form of arrays. `edge_features` and `node_features` refer specificly to the graph representation, while `grid_points` and `mapped_features` refer to the grid mapped from the graph. Each data point generated by deeprank2 has the above structure, with the features and the target changing according to the user's settings. Features starting with `_` are present for human inspection of the data, but they are not used for training models.\n", - "\n", - "It is always a good practice to first explore the data, and then make decision about splitting them in training, test and validation sets. There are different possible ways for doing it.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Pandas dataframe\n", - "\n", - "The edge and node features just generated can be explored by instantiating the `GraphDataset` object, and then using `hdf5_to_pandas` method which converts node and edge features into a [Pandas](https://pandas.pydata.org/) dataframe. Each row represents a ppi in the form of a graph.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "processed_data = glob.glob(os.path.join(processed_data_path, \"residue\", \"*.hdf5\"))\n", - "dataset = GraphDataset(processed_data, target=\"binary\")\n", - "dataset_df = dataset.hdf5_to_pandas()\n", - "dataset_df.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also generate histograms for looking at the features distributions. 
An example:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fname = os.path.join(processed_data_path, \"residue\", \"res_mass_distance_electrostatic\")\n", - "\n", - "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", - "\n", - "im = img.imread(fname + \".png\")\n", - "plt.figure(figsize=(15, 10))\n", - "fig = plt.imshow(im)\n", - "fig.axes.get_xaxis().set_visible(False)\n", - "fig.axes.get_yaxis().set_visible(False)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Other tools\n", - "\n", - "- [HDFView](https://www.hdfgroup.org/downloads/hdfview/), a visual tool written in Java for browsing and editing HDF5 files.\n", - " As representative example, the following is the structure for `pdb2ooh.ent` seen from HDF5View:\n", - "\n", - " \n", - "\n", - " Using this tool you can inspect the values of the features visually, for each data point.\n", - "\n", - "- Python packages such as [h5py](https://docs.h5py.org/en/stable/index.html). 
Examples:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with h5py.File(processed_data[0], \"r\") as hdf5:\n", - " # List of all graphs in hdf5, each graph representing\n", - " # a SRV and its sourrouding environment\n", - " ids = list(hdf5.keys())\n", - " print(f\"IDs of SRVs in {processed_data[0]}: {ids}\")\n", - " node_features = list(hdf5[ids[0]][\"node_features\"])\n", - " print(f\"Node features: {node_features}\")\n", - " edge_features = list(hdf5[ids[0]][\"edge_features\"])\n", - " print(f\"Edge features: {edge_features}\")\n", - " target_features = list(hdf5[ids[0]][\"target_values\"])\n", - " print(f\"Targets features: {target_features}\")\n", - " # Polarity feature for ids[0], numpy.ndarray\n", - " node_feat_polarity = hdf5[ids[0]][\"node_features\"][\"polarity\"][:]\n", - " print(f\"Polarity feature shape: {node_feat_polarity.shape}\")\n", - " # Electrostatic feature for ids[0], numpy.ndarray\n", - " edge_feat_electrostatic = hdf5[ids[0]][\"edge_features\"][\"electrostatic\"][:]\n", - " print(f\"Electrostatic feature shape: {edge_feat_electrostatic.shape}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Atomic-level SRV: `SingleResidueVariantQuery`\n", - "\n", - "Graphs can also be generated at an atomic resolution, very similarly to what has just been done for residue-level.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = QueryCollection()\n", - "\n", - "influence_radius = 10.0 # radius to select the local neighborhood around the SRV\n", - "max_edge_length = 4.5 # ??\n", - "\n", - "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", - "for i in range(len(pdb_files)):\n", - " queries.add(\n", - " SingleResidueVariantQuery(\n", - " pdb_path=pdb_files[i],\n", - " resolution=\"atom\",\n", - " chain_ids=\"A\",\n", - " 
variant_residue_number=res_numbers[i],\n", - " insertion_code=None,\n", - " wildtype_amino_acid=amino_acids_by_code[res_wildtypes[i]],\n", - " variant_amino_acid=amino_acids_by_code[res_variants[i]],\n", - " targets={\"binary\": targets[i]},\n", - " influence_radius=influence_radius,\n", - " max_edge_length=max_edge_length,\n", - " ),\n", - " )\n", - " if i + 1 % 20 == 0:\n", - " print(f\"{i+1} queries added to the collection.\")\n", - "\n", - "print(f\"{i+1} queries ready to be processed.\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "grid_settings = GridSettings( # None if you don't want grids\n", - " # the number of points on the x, y, z edges of the cube\n", - " points_counts=[35, 30, 30],\n", - " # x, y, z sizes of the box in Γ…\n", - " sizes=[1.0, 1.0, 1.0],\n", - ")\n", - "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", - "\n", - "queries.process(\n", - " prefix=os.path.join(processed_data_path, \"atomic\", \"proc\"),\n", - " feature_modules=[components, contact],\n", - " cpu_count=8,\n", - " combine_output=False,\n", - " grid_settings=grid_settings,\n", - " grid_map_method=grid_map_method,\n", - ")\n", - "\n", - "print(f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.')" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Again, the data can be inspected using `hdf5_to_pandas` function.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "processed_data = glob.glob(os.path.join(processed_data_path, \"atomic\", \"*.hdf5\"))\n", - "dataset = GraphDataset(processed_data, target=\"binary\")\n", - "dataset_df = dataset.hdf5_to_pandas()\n", - "dataset_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fname = os.path.join(processed_data_path, \"atomic\", \"atom_charge\")\n", - "dataset.save_hist(features=\"atom_charge\", fname=fname)\n", - "\n", - "im = img.imread(fname + \".png\")\n", - "plt.figure(figsize=(8, 8))\n", - "fig = plt.imshow(im)\n", - "fig.axes.get_xaxis().set_visible(False)\n", - "fig.axes.get_yaxis().set_visible(False)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that some of the features are different from the ones generated with the residue-level queries. There are indeed features in `deeprank2.features.components` module which are generated only in atomic graphs, i.e. 
`atom_type`, `atom_charge`, and `pdb_occupancy`, because they don't make sense only in the atomic graphs' representation.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "deeprankcore", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data preparation for single-residue variants\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "\n", + "\n", + "This tutorial will demonstrate the use of DeepRank2 for generating single-residue variants (SRVs) graphs and saving them as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format) files, using [PBD files]() of protein structures as input.\n", + "\n", + "In this data processing phase, a local neighborhood around the mutated residue is selected for each SRV according to a radius threshold that the user can customize. All atoms or residues within the threshold are mapped as the nodes to a graph and the interactions between them are the edges of the graph. Each node and edge can have several distinct (structural or physico-chemical) features, which are generated and added during the processing phase as well. Optionally, the graphs can be mapped to volumetric grids (i.e., 3D image-like representations), together with their features. 
Finally, the mapped data are saved as HDF5 files, which can be used for training predictive models (for details see [training_ppi.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/training_ppi.ipynb) tutorial). In particular, graphs can be used for the training of Graph Neural Networks (GNNs), and grids can be used for the training of Convolutional Neural Networks (CNNs).\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Input Data\n", + "\n", + "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/7997585). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", + "\n", + "Note that the dataset contains only 96 data points, which is not enough to develop an impactful predictive model, and the scope of its use is indeed only demonstrative and informative for the users.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Utilities\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Libraries\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The libraries needed for this tutorial:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import contextlib\n", + "import glob\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "import h5py\n", + "import matplotlib.image as img\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "\n", + "from deeprank2.dataset import GraphDataset\n", + "from 
deeprank2.domain.aminoacidlist import amino_acids_by_code\n", + "from deeprank2.features import components, contact\n", + "from deeprank2.query import QueryCollection, SingleResidueVariantQuery\n", + "from deeprank2.utils.grid import GridSettings, MapMethod" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Raw files and paths\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The paths for reading raw data and saving the processed ones:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_path = os.path.join(\"data_raw\", \"srv\")\n", + "processed_data_path = os.path.join(\"data_processed\", \"srv\")\n", + "residue_data_path = os.path.join(processed_data_path, \"residue\")\n", + "atomic_data_path = os.path.join(processed_data_path, \"atomic\")\n", + "\n", + "for output_path in [residue_data_path, atomic_data_path]:\n", + " os.makedirs(output_path, exist_ok=True)\n", + " if any(Path(output_path).iterdir()):\n", + " msg = f\"Please store any required data from `./{output_path}` and delete the folder.\\nThen re-run this cell to continue.\"\n", + " raise FileExistsError(msg)\n", + "\n", + "# Flag limit_data as True if you are running on a machine with limited memory (e.g., Docker container)\n", + "limit_data = False" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Raw data are PDB files in `data_raw/srv/pdb/`, which contains atomic coordinates of the protein structure containing the variant.\n", + "- Target data, so in our case pathogenic versus benign labels, are in `data_raw/srv/srv_target_values_curated.csv`.\n", + "- The final SRV processed data will be saved in `data_processed/srv/` folder, which in turns contains a folder for residue-level data and another one for atomic-level data. 
More details about such different levels will come a few cells below.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`get_pdb_files_and_target_data` is an helper function used to retrieve the raw pdb files names, SRVs information and target values in a list from the CSV:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list[int], list[str], list[str], list[float]]:\n", + " csv_data = pd.read_csv(os.path.join(data_path, \"srv_target_values_curated.csv\"))\n", + " pdb_files = glob.glob(os.path.join(data_path, \"pdb\", \"*.ent\"))\n", + " pdb_files.sort()\n", + " pdb_file_names = [os.path.basename(pdb_file) for pdb_file in pdb_files]\n", + " csv_data_indexed = csv_data.set_index(\"pdb_file\")\n", + " with contextlib.suppress(KeyError):\n", + " csv_data_indexed = csv_data_indexed.loc[pdb_file_names]\n", + " res_numbers = csv_data_indexed.res_number.tolist()\n", + " res_wildtypes = csv_data_indexed.res_wildtype.tolist()\n", + " res_variants = csv_data_indexed.res_variant.tolist()\n", + " targets = csv_data_indexed.target.tolist()\n", + " pdb_names = csv_data_indexed.index.tolist()\n", + " pdb_files = [data_path + \"/pdb/\" + pdb_name for pdb_name in pdb_names]\n", + "\n", + " return pdb_files, res_numbers, res_wildtypes, res_variants, targets\n", + "\n", + "\n", + "pdb_files, res_numbers, res_wildtypes, res_variants, targets = get_pdb_files_and_target_data(data_path)\n", + "\n", + "if limit_data:\n", + " pdb_files = pdb_files[:15]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `QueryCollection` and `Query` objects\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For each SRV, so for each data point, a query can be created and added to the `QueryCollection` object, to be processed 
later on. Different types of queries exist, based on the molecular resolution needed:\n", + "\n", + "A query takes as inputs:\n", + "\n", + "- A `.pdb` file, representing the protein structure containing the SRV.\n", + "- The resolution (`\"residue\"` or `\"atom\"`), i.e. whether each node should represent an amino acid residue or an atom.\n", + "- The chain id of the SRV.\n", + "- The residue number of the missense mutation.\n", + "- The insertion code, used when two residues have the same numbering. The combination of residue numbering and insertion code defines the unique residue.\n", + "- The wildtype amino acid.\n", + "- The variant amino acid.\n", + "- The interaction radius, which determines the threshold distance (in Γ…ngstrΓΆm) for residues/atoms surrounding the mutation that will be included in the graph.\n", + "- The target values associated with the query. For each query/data point, in the use case demonstrated in this tutorial will add a 0 if the SRV belongs to the benign class, and 1 if it belongs to the pathogenic one.\n", + "- The max edge distance, which is the maximum distance between two nodes to generate an edge between them.\n", + "- Optional: The correspondent [Position-Specific Scoring Matrices (PSSMs)](https://en.wikipedia.org/wiki/Position_weight_matrix), per chain identifier, in the form of .pssm files. 
PSSMs are optional and will not be used in this tutorial.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Residue-level SRV: `SingleResidueVariantQuery`\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "queries = QueryCollection()\n", + "\n", + "influence_radius = 10.0 # radius to select the local neighborhood around the SRV\n", + "max_edge_length = 4.5 # ??\n", + "\n", + "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", + "for i in range(len(pdb_files)):\n", + " queries.add(\n", + " SingleResidueVariantQuery(\n", + " pdb_path=pdb_files[i],\n", + " resolution=\"residue\",\n", + " chain_ids=\"A\",\n", + " variant_residue_number=res_numbers[i],\n", + " insertion_code=None,\n", + " wildtype_amino_acid=amino_acids_by_code[res_wildtypes[i]],\n", + " variant_amino_acid=amino_acids_by_code[res_variants[i]],\n", + " targets={\"binary\": targets[i]},\n", + " influence_radius=influence_radius,\n", + " max_edge_length=max_edge_length,\n", + " ),\n", + " )\n", + " if i + 1 % 20 == 0:\n", + " print(f\"{i+1} queries added to the collection.\")\n", + "\n", + "print(f\"{i+1} queries ready to be processed.\\n\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Notes on `process()` method\n", + "\n", + "Once all queries have been added to the `QueryCollection` instance, they can be processed. Main parameters of the `process()` method, include:\n", + "\n", + "- `prefix` sets the output file location.\n", + "- `feature_modules` allows you to choose which feature generating modules you want to use. By default, the basic features contained in `deeprank2.features.components` and `deeprank2.features.contact` are generated. Users can add custom features by creating a new module and placing it in the `deeprank2.feature` subpackage. 
A complete and detailed list of the pre-implemented features per module and more information about how to add custom features can be found [here](https://deeprank2.readthedocs.io/en/latest/features.html).\n", + " - Note that all features generated by a module will be added if that module was selected, and there is no way to only generate specific features from that module. However, during the training phase shown in `training_ppi.ipynb`, it is possible to select only a subset of available features.\n", + "- `cpu_count` can be used to specify how many processes to be run simultaneously, and will coincide with the number of HDF5 files generated. By default it takes all available CPU cores and HDF5 files are squashed into a single file using the `combine_output` setting.\n", + "- Optional: If you want to include grids in the HDF5 files, which represent the mapping of the graphs to a volumetric box, you need to define `grid_settings` and `grid_map_method`, as shown in the example below. If they are `None` (default), only graphs are saved.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grid_settings = GridSettings( # None if you don't want grids\n", + " # the number of points on the x, y, z edges of the cube\n", + " points_counts=[35, 30, 30],\n", + " # x, y, z sizes of the box in Γ…\n", + " sizes=[1.0, 1.0, 1.0],\n", + ")\n", + "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", + "\n", + "queries.process(\n", + " prefix=os.path.join(processed_data_path, \"residue\", \"proc\"),\n", + " feature_modules=[components, contact],\n", + " cpu_count=8,\n", + " combine_output=False,\n", + " grid_settings=grid_settings,\n", + " grid_map_method=grid_map_method,\n", + ")\n", + "\n", + "print(f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exploring data\n", + "\n", + "As representative example, the following is the HDF5 structure generated by the previous code for `pdb2ooh.ent`, so for one single graph, which represents one protein structure containing a SRV in position 112, for the graph + grid case:\n", + "\n", + "```bash\n", + "└── residue-graph:A:112:Threonine->Isoleucine:pdb2ooh\n", + " |\n", + " β”œβ”€β”€ edge_features\n", + " β”‚ β”œβ”€β”€ _index\n", + " β”‚ β”œβ”€β”€ _name\n", + " β”‚ β”œβ”€β”€ covalent\n", + " β”‚ β”œβ”€β”€ distance\n", + " β”‚ β”œβ”€β”€ electrostatic\n", + " β”‚ β”œβ”€β”€ same_chain\n", + " β”‚ └── vanderwaals\n", + " |\n", + " β”œβ”€β”€ node_features\n", + " β”‚ β”œβ”€β”€ _chain_id\n", + " β”‚ β”œβ”€β”€ _name\n", + " β”‚ β”œβ”€β”€ _position\n", + " β”‚ β”œβ”€β”€ diff_charge\n", + " β”‚ β”œβ”€β”€ diff_hb_donors\n", + " β”‚ β”œβ”€β”€ diff_hb_acceptors\n", + " β”‚ β”œβ”€β”€ diff_mass\n", + " β”‚ β”œβ”€β”€ diff_pI\n", + " β”‚ β”œβ”€β”€ diff_polarity\n", + " β”‚ β”œβ”€β”€ diff_size\n", + " β”‚ β”œβ”€β”€ hb_acceptors\n", + " β”‚ β”œβ”€β”€ hb_donors\n", + " β”‚ β”œβ”€β”€ polarity\n", + " β”‚ β”œβ”€β”€ res_charge\n", + " β”‚ β”œβ”€β”€ res_mass\n", + " | β”œβ”€β”€ res_pI\n", + " | β”œβ”€β”€ res_size\n", + " | β”œβ”€β”€ res_type\n", + " | └── variant_res\n", + " |\n", + " β”œβ”€β”€ grid_points\n", + " β”‚ β”œβ”€β”€ center\n", + " β”‚ β”œβ”€β”€ x\n", + " β”‚ β”œβ”€β”€ y\n", + " β”‚ └── z\n", + " |\n", + " β”œβ”€β”€ mapped_features\n", + " β”‚ β”œβ”€β”€ _position_000\n", + " β”‚ β”œβ”€β”€ _position_001\n", + " β”‚ β”œβ”€β”€ _position_002\n", + " β”‚ β”œβ”€β”€ covalent\n", + " β”‚ β”œβ”€β”€ distance\n", + " β”‚ β”œβ”€β”€ electrostatic\n", + " β”‚ β”œβ”€β”€ diff_polarity_000\n", + " β”‚ β”œβ”€β”€ diff_polarity_001\n", + " β”‚ β”œβ”€β”€ diff_polarity_002\n", + " β”‚ β”œβ”€β”€ diff_polarity_003\n", + " | 
β”œβ”€β”€ ...\n", + " | └── vanderwaals\n", + " |\n", + " └── target_values\n", + " └── binary\n", + "```\n", + "\n", + "`edge_features`, `node_features`, `mapped_features` are [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which contain [HDF5 Datasets](https://docs.h5py.org/en/stable/high/dataset.html) (e.g., `_index`, `electrostatic`, etc.), which in turn contains features values in the form of arrays. `edge_features` and `node_features` refer specificly to the graph representation, while `grid_points` and `mapped_features` refer to the grid mapped from the graph. Each data point generated by deeprank2 has the above structure, with the features and the target changing according to the user's settings. Features starting with `_` are present for human inspection of the data, but they are not used for training models.\n", + "\n", + "It is always a good practice to first explore the data, and then make decision about splitting them in training, test and validation sets. There are different possible ways for doing it.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Pandas dataframe\n", + "\n", + "The edge and node features just generated can be explored by instantiating the `GraphDataset` object, and then using `hdf5_to_pandas` method which converts node and edge features into a [Pandas](https://pandas.pydata.org/) dataframe. Each row represents a ppi in the form of a graph.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "processed_data = glob.glob(os.path.join(processed_data_path, \"residue\", \"*.hdf5\"))\n", + "dataset = GraphDataset(processed_data, target=\"binary\")\n", + "dataset_df = dataset.hdf5_to_pandas()\n", + "dataset_df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also generate histograms for looking at the features distributions. 
An example:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fname = os.path.join(processed_data_path, \"residue\", \"res_mass_distance_electrostatic\")\n", + "\n", + "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", + "\n", + "im = img.imread(fname + \".png\")\n", + "plt.figure(figsize=(15, 10))\n", + "fig = plt.imshow(im)\n", + "fig.axes.get_xaxis().set_visible(False)\n", + "fig.axes.get_yaxis().set_visible(False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Other tools\n", + "\n", + "- [HDFView](https://www.hdfgroup.org/downloads/hdfview/), a visual tool written in Java for browsing and editing HDF5 files.\n", + " As representative example, the following is the structure for `pdb2ooh.ent` seen from HDF5View:\n", + "\n", + " \n", + "\n", + " Using this tool you can inspect the values of the features visually, for each data point.\n", + "\n", + "- Python packages such as [h5py](https://docs.h5py.org/en/stable/index.html). 
Examples:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with h5py.File(processed_data[0], \"r\") as hdf5:\n", + " # List of all graphs in hdf5, each graph representing\n", + " # a SRV and its sourrouding environment\n", + " ids = list(hdf5.keys())\n", + " print(f\"IDs of SRVs in {processed_data[0]}: {ids}\")\n", + " node_features = list(hdf5[ids[0]][\"node_features\"])\n", + " print(f\"Node features: {node_features}\")\n", + " edge_features = list(hdf5[ids[0]][\"edge_features\"])\n", + " print(f\"Edge features: {edge_features}\")\n", + " target_features = list(hdf5[ids[0]][\"target_values\"])\n", + " print(f\"Targets features: {target_features}\")\n", + " # Polarity feature for ids[0], numpy.ndarray\n", + " node_feat_polarity = hdf5[ids[0]][\"node_features\"][\"polarity\"][:]\n", + " print(f\"Polarity feature shape: {node_feat_polarity.shape}\")\n", + " # Electrostatic feature for ids[0], numpy.ndarray\n", + " edge_feat_electrostatic = hdf5[ids[0]][\"edge_features\"][\"electrostatic\"][:]\n", + " print(f\"Electrostatic feature shape: {edge_feat_electrostatic.shape}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Atomic-level SRV: `SingleResidueVariantQuery`\n", + "\n", + "Graphs can also be generated at an atomic resolution, very similarly to what has just been done for residue-level.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "queries = QueryCollection()\n", + "\n", + "influence_radius = 10.0 # radius to select the local neighborhood around the SRV\n", + "max_edge_length = 4.5 # ??\n", + "\n", + "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", + "for i in range(len(pdb_files)):\n", + " queries.add(\n", + " SingleResidueVariantQuery(\n", + " pdb_path=pdb_files[i],\n", + " resolution=\"atom\",\n", + " chain_ids=\"A\",\n", + " 
variant_residue_number=res_numbers[i],\n", + " insertion_code=None,\n", + " wildtype_amino_acid=amino_acids_by_code[res_wildtypes[i]],\n", + " variant_amino_acid=amino_acids_by_code[res_variants[i]],\n", + " targets={\"binary\": targets[i]},\n", + " influence_radius=influence_radius,\n", + " max_edge_length=max_edge_length,\n", + " ),\n", + " )\n", + " if (i + 1) % 20 == 0:\n", + " print(f\"{i+1} queries added to the collection.\")\n", + "\n", + "print(f\"{i+1} queries ready to be processed.\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grid_settings = GridSettings( # None if you don't want grids\n", + " # the number of points on the x, y, z edges of the cube\n", + " points_counts=[35, 30, 30],\n", + " # x, y, z sizes of the box in Å\n", + " sizes=[1.0, 1.0, 1.0],\n", + ")\n", + "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", + "\n", + "queries.process(\n", + " prefix=os.path.join(processed_data_path, \"atomic\", \"proc\"),\n", + " feature_modules=[components, contact],\n", + " cpu_count=8,\n", + " combine_output=False,\n", + " grid_settings=grid_settings,\n", + " grid_map_method=grid_map_method,\n", + ")\n", + "\n", + "print(f'The queries processing is done. 
The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, the data can be inspected using `hdf5_to_pandas` function.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "processed_data = glob.glob(os.path.join(processed_data_path, \"atomic\", \"*.hdf5\"))\n", + "dataset = GraphDataset(processed_data, target=\"binary\")\n", + "dataset_df = dataset.hdf5_to_pandas()\n", + "dataset_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fname = os.path.join(processed_data_path, \"atomic\", \"atom_charge\")\n", + "dataset.save_hist(features=\"atom_charge\", fname=fname)\n", + "\n", + "im = img.imread(fname + \".png\")\n", + "plt.figure(figsize=(8, 8))\n", + "fig = plt.imshow(im)\n", + "fig.axes.get_xaxis().set_visible(False)\n", + "fig.axes.get_yaxis().set_visible(False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that some of the features are different from the ones generated with the residue-level queries. There are indeed features in `deeprank2.features.components` module which are generated only in atomic graphs, i.e. 
`atom_type`, `atom_charge`, and `pdb_occupancy`, because they only make sense in the atomic graphs' representation.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deeprankcore", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb index b3445cc4..db64ae53 100644 --- a/tutorials/training.ipynb +++ b/tutorials/training.ipynb @@ -1,773 +1,773 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training Neural Networks\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction\n", - "\n", - "\n", - "\n", - "This tutorial will demonstrate the use of DeepRank2 for training graph neural networks (GNNs) and convolutional neural networks (CNNs) using protein-protein interface (PPI) or single-residue variant (SRV) data for classification and regression predictive tasks.\n", - "\n", - "This tutorial assumes that the PPI data of interest have already been generated and saved as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), with the data structure that DeepRank2 expects. This data can be generated using the [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb) tutorial or downloaded from Zenodo at [this record address](https://zenodo.org/record/8349335). 
For more details on the data structure, please refer to the other tutorial, which also contains a detailed description of how the data is generated from PDB files.\n", - "\n", - "This tutorial assumes also a basic knowledge of the [PyTorch](https://pytorch.org/) framework, on top of which the machine learning pipeline of DeepRank2 has been developed, for which many online tutorials exist.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Input data\n", - "\n", - "If you have previously run `data_generation_ppi.ipynb` or `data_generation_srv.ipynb` notebook, then their output can be directly used as input for this tutorial.\n", - "\n", - "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/7997585). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. 
The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", - "\n", - "Note that the datasets contain only ~100 data points each, which is not enough to develop an impactful predictive model, and the scope of their use is indeed only demonstrative and informative for the users.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Utilities\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Libraries\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The libraries needed for this tutorial:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import glob\n", - "import logging\n", - "import os\n", - "import warnings\n", - "\n", - "import h5py\n", - "import numpy as np\n", - "import pandas as pd\n", - "import plotly.express as px\n", - "import torch\n", - "from sklearn.metrics import accuracy_score, auc, f1_score, precision_score, recall_score, roc_curve\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "from deeprank2.dataset import GraphDataset, GridDataset\n", - "from deeprank2.neuralnets.cnn.model3d import CnnClassification\n", - "from deeprank2.neuralnets.gnn.vanilla_gnn import VanillaNetwork\n", - "from deeprank2.trainer import Trainer\n", - "from deeprank2.utils.exporters import HDF5OutputExporter\n", - "\n", - "np.seterr(divide=\"ignore\")\n", - "np.seterr(invalid=\"ignore\")\n", - "\n", - "logging.basicConfig(level=logging.INFO)\n", - "\n", - "warnings.filterwarnings(\"ignore\")\n", - "\n", - "# ruff: noqa: PD901" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Paths and sets\n", - "\n", - "The paths for reading the processed data:\n" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_type = \"ppi\"\n", - "level = \"residue\"\n", - "processed_data_path = os.path.join(\"data_processed\", data_type, level)\n", - "input_data_path = glob.glob(os.path.join(processed_data_path, \"*.hdf5\"))\n", - "output_path = os.path.join(\"data_processed\", data_type, level) # for saving predictions results" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `data_type` can be either \"ppi\" or \"srv\", depending on which application the user is most interested in. The `level` can be either \"residue\" or \"atomic\", and refers to the structural resolution, where each node either represents a single residue or a single atom from the molecular structure.\n", - "\n", - "In this tutorial, we will use PPI residue-level data by default, but the same code can be applied to SRV or/and atomic-level data with no changes, apart from setting `data_type` and `level` parameters in the cell above.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A Pandas DataFrame containing data points' IDs and the binary target values can be defined:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_dict = {}\n", - "df_dict[\"entry\"] = []\n", - "df_dict[\"target\"] = []\n", - "for fname in input_data_path:\n", - " with h5py.File(fname, \"r\") as hdf5:\n", - " for mol in hdf5:\n", - " target_value = float(hdf5[mol][\"target_values\"][\"binary\"][()])\n", - " df_dict[\"entry\"].append(mol)\n", - " df_dict[\"target\"].append(target_value)\n", - "\n", - "df = pd.DataFrame(data=df_dict)\n", - "df.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As explained in [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb), for each data 
point there are two targets: \"BA\" and \"binary\". The first represents the strength of the interaction between two molecules that bind reversibly (interact) in nM, while the second represents its binary mapping, being 0 (BA > 500 nM) a not-binding complex and 1 (BA <= 500 nM) binding one.\n", - "\n", - "For SRVs, each data point has a single target, \"binary\", which is 0 if the SRV is considered benign, and 1 if it is pathogenic, as explained in [data_generation_srv.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/data_generation_srv.ipynb).\n", - "\n", - "The pandas DataFrame `df` is used only to split data points into training, validation and test sets according to the \"binary\" target - using target stratification to keep the proportion of 0s and 1s constant among the different sets. Training and validation sets will be used during the training for updating the network weights, while the test set will be held out as an independent test and will be used later for the model evaluation.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_train, df_test = train_test_split(df, test_size=0.1, stratify=df.target, random_state=42)\n", - "df_train, df_valid = train_test_split(df_train, test_size=0.2, stratify=df_train.target, random_state=42)\n", - "\n", - "print(\"Data statistics:\\n\")\n", - "print(f\"Total samples: {len(df)}\\n\")\n", - "print(f\"Training set: {len(df_train)} samples, {round(100*len(df_train)/len(df))}%\")\n", - "print(f\"\\t- Class 0: {len(df_train[df_train.target == 0])} samples, {round(100*len(df_train[df_train.target == 0])/len(df_train))}%\")\n", - "print(f\"\\t- Class 1: {len(df_train[df_train.target == 1])} samples, {round(100*len(df_train[df_train.target == 1])/len(df_train))}%\")\n", - "print(f\"Validation set: {len(df_valid)} samples, {round(100*len(df_valid)/len(df))}%\")\n", - "print(f\"\\t- Class 0: {len(df_valid[df_valid.target == 0])} samples, 
{round(100*len(df_valid[df_valid.target == 0])/len(df_valid))}%\")\n", - "print(f\"\\t- Class 1: {len(df_valid[df_valid.target == 1])} samples, {round(100*len(df_valid[df_valid.target == 1])/len(df_valid))}%\")\n", - "print(f\"Testing set: {len(df_test)} samples, {round(100*len(df_test)/len(df))}%\")\n", - "print(f\"\\t- Class 0: {len(df_test[df_test.target == 0])} samples, {round(100*len(df_test[df_test.target == 0])/len(df_test))}%\")\n", - "print(f\"\\t- Class 1: {len(df_test[df_test.target == 1])} samples, {round(100*len(df_test[df_test.target == 1])/len(df_test))}%\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Classification example\n", - "\n", - "A GNN and a CNN can be trained for a classification predictive task, which consists in predicting the \"binary\" target values.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### GNN\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### GraphDataset\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For training GNNs the user can create `GraphDataset` instances. This class inherits from `DeeprankDataset` class, which in turns inherits from `Dataset` [PyTorch geometric class](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/data/dataset.html), a base class for creating graph datasets.\n", - "\n", - "A few notes about `GraphDataset` parameters:\n", - "\n", - "- By default, all features contained in the HDF5 files are used, but the user can specify `node_features` and `edge_features` in `GraphDataset` if not all of them are needed. 
See the [docs](https://deeprank2.readthedocs.io/en/latest/features.html) for more details about all the possible pre-implemented features.\n", - "- For regression, `task` should be set to `regress` and the `target` to `BA`, which is a continuous variable and therefore suitable for regression tasks.\n", - "- For the `GraphDataset` class it is possible to define a dictionary to indicate which transformations to apply to the features, being the transformations lambda functions and/or standardization.\n", - " - If the `standardize` key is `True`, standardization is applied after transformation. Standardization consists in applying the following formula on each feature's value: ${x' = \\frac{x - \\mu}{\\sigma}}$, being ${\\mu}$ the mean and ${\\sigma}$ the standard deviation. Standardization is a scaling method where the values are centered around mean with a unit standard deviation.\n", - " - The transformation to apply can be speficied as a lambda function as a value of the key `transform`, which defaults to `None`.\n", - " - Since in the provided example standardization is applied, the training features' means and standard deviations need to be used for scaling validation and test sets. For doing so, `train_source` parameter is used. When `train_source` parameter is set, it will be used to scale the validation/testing sets. You need to pass `features_transform` to the training dataset only, since in other cases it will be ignored and only the one of `train_source` will be considered.\n", - " - Note that transformations have not currently been implemented for the `GridDataset` class.\n", - " - In the example below a logarithmic transformation and then the standardization are applied to all the features. 
It is also possible to use specific features as keys for indicating that transformation and/or standardization need to be apply to few features only.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "target = \"binary\"\n", - "task = \"classif\"\n", - "node_features = [\"res_type\"]\n", - "edge_features = [\"distance\"]\n", - "features_transform = {\"all\": {\"transform\": lambda x: np.cbrt(x), \"standardize\": True}}\n", - "\n", - "print(\"Loading training data...\")\n", - "dataset_train = GraphDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", - " node_features=node_features,\n", - " edge_features=edge_features,\n", - " features_transform=features_transform,\n", - " target=target,\n", - " task=task,\n", - ")\n", - "print(\"\\nLoading validation data...\")\n", - "dataset_val = GraphDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", - " train_source=dataset_train,\n", - ")\n", - "print(\"\\nLoading test data...\")\n", - "dataset_test = GraphDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", - " train_source=dataset_train,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Trainer\n", - "\n", - "The class `Trainer` implements training, validation and testing of PyTorch-based neural networks.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A few notes about `Trainer` parameters:\n", - "\n", - "- `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. 
The `Trainer` class takes care of formatting the output shape according to the task. This tutorial uses a simple network, `VanillaNetwork` (implemented in `deeprank2.neuralnets.gnn.vanilla_gnn`). All GNN architectures already implemented in the pakcage can be found [here](https://github.com/DeepRank/deeprank-core/tree/main/deeprank2/neuralnets/gnn) and can be used for training or as a basis for implementing new ones.\n", - "- `class_weights` is used for classification tasks only and assigns class weights based on the training dataset content to account for any potential inbalance between the classes. In this case the dataset is balanced (50% 0 and 50% 1), so it is not necessary to use it. It defaults to False.\n", - "- `cuda` and `ngpu` are used for indicating whether to use CUDA and how many GPUs. By default, CUDA is not used and `ngpu` is 0.\n", - "- The user can specify a deeprank2 exporter or a custom one in `output_exporters` parameter, together with the path where to save the results. Exporters are used for storing predictions information collected later on during training and testing. Later the results saved by `HDF5OutputExporter` will be read and evaluated.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Training\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer = Trainer(\n", - " neuralnet=VanillaNetwork,\n", - " dataset_train=dataset_train,\n", - " dataset_val=dataset_val,\n", - " dataset_test=dataset_test,\n", - " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"gnn_{task}\"))],\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The default optimizer is `torch.optim.Adam`. 
It is possible to specify optimizer's parameters or to use another PyTorch optimizer object:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer = torch.optim.SGD\n", - "lr = 1e-3\n", - "weight_decay = 0.001\n", - "\n", - "trainer.configure_optimizers(optimizer, lr, weight_decay)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The default loss function for classification is `torch.nn.CrossEntropyLoss` and for regression it is `torch.nn.MSELoss`. It is also possible to set some other PyTorch loss functions by using `Trainer.set_lossfunction` method, although not all are currently implemented.\n", - "\n", - "Then the model can be trained using the `train()` method of the `Trainer` class.\n", - "\n", - "A few notes about `train()` method parameters:\n", - "\n", - "- `earlystop_patience`, `earlystop_maxgap` and `min_epoch` are used for controlling early stopping logic. `earlystop_patience` indicates the number of epochs after which the training ends if the validation loss does not improve. `earlystop_maxgap` indicated the maximum difference allowed between validation and training loss, and `min_epoch` is the minimum number of epochs to be reached before evaluating `maxgap`.\n", - "- If `validate` is set to `True`, validation is performed on an independent dataset, which has been called `dataset_val` few cells above. If set to `False`, validation is performed on the training dataset itself (not recommended).\n", - "- `num_workers` can be set for indicating how many subprocesses to use for data loading. 
The default is 0 and it means that the data will be loaded in the main process.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "epochs = 20\n", - "batch_size = 8\n", - "earlystop_patience = 5\n", - "earlystop_maxgap = 0.1\n", - "min_epoch = 10\n", - "\n", - "trainer.train(\n", - " nepoch=epochs,\n", - " batch_size=batch_size,\n", - " earlystop_patience=earlystop_patience,\n", - " earlystop_maxgap=earlystop_maxgap,\n", - " min_epoch=min_epoch,\n", - " validate=True,\n", - " filename=os.path.join(output_path, f\"gnn_{task}\", \"model.pth.tar\"),\n", - ")\n", - "\n", - "epoch = trainer.epoch_saved_model\n", - "print(f\"Model saved at epoch {epoch}\")\n", - "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", - "print(f\"Total # of parameters: {pytorch_total_params}\")\n", - "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", - "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Testing\n", - "\n", - "And the trained model can be tested on `dataset_test`:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer.test()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Results visualization\n", - "\n", - "Finally, the results saved by `HDF5OutputExporter` can be inspected, which can be found in the `data/ppi/gnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`. Note that the folder contains the saved pre-trained model as well.\n", - "\n", - "`output_exporter.hdf5` contains [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which refer to each phase, e.g. training and testing if both are run, only one of them otherwise. 
Training phase includes validation results as well. This HDF5 file can be read as a Pandas Dataframe:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_train = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n", - "output_test = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n", - "output_train.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n", - "\n", - "For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeeprankDataset instances. 
For classification tasks, if `classes` is not specified (as in this example case), it is defaulted to [0, 1].\n", - "\n", - "The loss across the epochs can be plotted for the training and the validation sets:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", - "\n", - "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", - "\n", - "fig.update_layout(\n", - " xaxis_title=\"Epoch #\",\n", - " yaxis_title=\"Loss\",\n", - " title=\"Loss vs epochs - GNN training\",\n", - " width=700,\n", - " height=400,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now a few metrics of interest for classification tasks can be printed out: the [area under the ROC curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) (AUC), and for a threshold of 0.5 the [precision, recall, accuracy and f1 score](https://en.wikipedia.org/wiki/Precision_and_recall#Definition).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "skip-execution" - ] - }, - "outputs": [], - "source": [ - "threshold = 0.5\n", - "df = pd.concat([output_train, output_test])\n", - "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", - "\n", - "for dataset in [\"training\", \"validation\", \"testing\"]:\n", - " df_plot_phase = df_plot[(df_plot.phase == dataset)]\n", - " y_true = df_plot_phase.target\n", - " y_score = np.array(df_plot_phase.output.tolist())[:, 1]\n", - "\n", - " print(f\"\\nMetrics for {dataset}:\")\n", - " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", - " auc_score = auc(fpr_roc, tpr_roc)\n", - " print(f\"AUC: {round(auc_score, 1)}\")\n", - " print(f\"Considering a threshold of 
{threshold}\")\n", - " y_pred = (y_score > threshold) * 1\n", - " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", - " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that the poor performance of this network is due to the small number of datapoints used in this tutorial. For a more reliable network we suggest using a number of data points on the order of at least tens of thousands.\n", - "\n", - "The same exercise can be repeated but using grids instead of graphs and CNNs instead of GNNs.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### CNN\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### GridDataset\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For training CNNs the user can create `GridDataset` instances.\n", - "\n", - "A few notes about `GridDataset` parameters:\n", - "\n", - "- By default, all features contained in the HDF5 files are used, but the user can specify `features` in `GridDataset` if not all of them are needed. Since grids features are derived from node and edge features mapped from graphs to grid, the easiest way to see which features are available is to look at the HDF5 file, as explained in detail in `data_generation_ppi.ipynb` and `data_generation_srv.ipynb`, section \"Other tools\".\n", - "- As is the case for a `GraphDataset`, `task` can be assigned to `regress` and `target` to `BA` to perform a regression task. 
As mentioned previously, we do not provide sample data to perform a regression task for SRVs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "target = \"binary\"\n", - "task = \"classif\"\n", - "\n", - "print(\"Loading training data...\")\n", - "dataset_train = GridDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", - " target=target,\n", - " task=task,\n", - ")\n", - "print(\"\\nLoading validation data...\")\n", - "dataset_val = GridDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", - " train_source=dataset_train,\n", - ")\n", - "print(\"\\nLoading test data...\")\n", - "dataset_test = GridDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", - " train_source=dataset_train,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Trainer\n", - "\n", - "As for graphs, the class `Trainer` is used for training, validation and testing of the PyTorch-based CNN.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Also in this case, `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. This tutorial uses `CnnClassification` (implemented in `deeprank2.neuralnets.cnn.model3d`). 
All CNN architectures already implemented in the pakcage can be found [here](https://github.com/DeepRank/deeprank2/tree/main/deeprank2/neuralnets/cnn) and can be used for training or as a basis for implementing new ones.\n", - "- The rest of the `Trainer` parameters can be used as explained already for graphs.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Training\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer = torch.optim.SGD\n", - "lr = 1e-3\n", - "weight_decay = 0.001\n", - "epochs = 20\n", - "batch_size = 8\n", - "earlystop_patience = 5\n", - "earlystop_maxgap = 0.1\n", - "min_epoch = 10\n", - "\n", - "trainer = Trainer(\n", - " neuralnet=CnnClassification,\n", - " dataset_train=dataset_train,\n", - " dataset_val=dataset_val,\n", - " dataset_test=dataset_test,\n", - " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"cnn_{task}\"))],\n", - ")\n", - "\n", - "trainer.configure_optimizers(optimizer, lr, weight_decay)\n", - "\n", - "trainer.train(\n", - " nepoch=epochs,\n", - " batch_size=batch_size,\n", - " earlystop_patience=earlystop_patience,\n", - " earlystop_maxgap=earlystop_maxgap,\n", - " min_epoch=min_epoch,\n", - " validate=True,\n", - " filename=os.path.join(output_path, f\"cnn_{task}\", \"model.pth.tar\"),\n", - ")\n", - "\n", - "epoch = trainer.epoch_saved_model\n", - "print(f\"Model saved at epoch {epoch}\")\n", - "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", - "print(f\"Total # of parameters: {pytorch_total_params}\")\n", - "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", - "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Testing\n", - "\n", - "And the trained model can be tested on 
`dataset_test`:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer.test()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Results visualization\n", - "\n", - "As for GNNs, the results saved by `HDF5OutputExporter` can be inspected, and are saved in the `data/ppi/cnn_classif/` or `data/srv/cnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`, together with the saved pre-trained model.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_train = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n", - "output_test = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n", - "output_train.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Also in this case, the loss across the epochs can be plotted for the training and the validation sets:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", - "\n", - "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", - "\n", - "fig.update_layout(\n", - " xaxis_title=\"Epoch #\",\n", - " yaxis_title=\"Loss\",\n", - " title=\"Loss vs epochs - CNN training\",\n", - " width=700,\n", - " height=400,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And some metrics of interest for classification tasks:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "threshold = 0.5\n", - "df = pd.concat([output_train, output_test])\n", - "df_plot = df[(df.epoch == 
trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", - "\n", - "for dataset in [\"training\", \"validation\", \"testing\"]:\n", - " df_plot_phase = df_plot[(df_plot.phase == dataset)]\n", - " y_true = df_plot_phase.target\n", - " y_score = np.array(df_plot_phase.output.tolist())[:, 1]\n", - "\n", - " print(f\"\\nMetrics for {dataset}:\")\n", - " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", - " auc_score = auc(fpr_roc, tpr_roc)\n", - " print(f\"AUC: {round(auc_score, 1)}\")\n", - " print(f\"Considering a threshold of {threshold}\")\n", - " y_pred = (y_score > threshold) * 1\n", - " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", - " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It's important to note that the dataset used in this analysis is not sufficiently large to provide conclusive and reliable insights. Depending on your specific application, you might find regression, classification, GNNs, and/or CNNs to be valuable options. 
Feel free to choose the approach that best aligns with your particular problem!\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "deeprank2", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training Neural Networks\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "\n", + "\n", + "This tutorial will demonstrate the use of DeepRank2 for training graph neural networks (GNNs) and convolutional neural networks (CNNs) using protein-protein interface (PPI) or single-residue variant (SRV) data for classification and regression predictive tasks.\n", + "\n", + "This tutorial assumes that the PPI data of interest have already been generated and saved as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), with the data structure that DeepRank2 expects. This data can be generated using the [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb) tutorial or downloaded from Zenodo at [this record address](https://zenodo.org/record/7997585). 
For more details on the data structure, please refer to the other tutorial, which also contains a detailed description of how the data is generated from PDB files.\n", + "\n", + "This tutorial assumes also a basic knowledge of the [PyTorch](https://pytorch.org/) framework, on top of which the machine learning pipeline of DeepRank2 has been developed, for which many online tutorials exist.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Input data\n", + "\n", + "If you have previously run `data_generation_ppi.ipynb` or `data_generation_srv.ipynb` notebook, then their output can be directly used as input for this tutorial.\n", + "\n", + "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/7997585). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. 
The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", + "\n", + "Note that the datasets contain only ~100 data points each, which is not enough to develop an impactful predictive model, and the scope of their use is indeed only demonstrative and informative for the users.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Utilities\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Libraries\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The libraries needed for this tutorial:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "import logging\n", + "import os\n", + "import warnings\n", + "\n", + "import h5py\n", + "import numpy as np\n", + "import pandas as pd\n", + "import plotly.express as px\n", + "import torch\n", + "from sklearn.metrics import accuracy_score, auc, f1_score, precision_score, recall_score, roc_curve\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "from deeprank2.dataset import GraphDataset, GridDataset\n", + "from deeprank2.neuralnets.cnn.model3d import CnnClassification\n", + "from deeprank2.neuralnets.gnn.vanilla_gnn import VanillaNetwork\n", + "from deeprank2.trainer import Trainer\n", + "from deeprank2.utils.exporters import HDF5OutputExporter\n", + "\n", + "np.seterr(divide=\"ignore\")\n", + "np.seterr(invalid=\"ignore\")\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "# ruff: noqa: PD901" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Paths and sets\n", + "\n", + "The paths for reading the processed data:\n" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_type = \"ppi\"\n", + "level = \"residue\"\n", + "processed_data_path = os.path.join(\"data_processed\", data_type, level)\n", + "input_data_path = glob.glob(os.path.join(processed_data_path, \"*.hdf5\"))\n", + "output_path = os.path.join(\"data_processed\", data_type, level) # for saving predictions results" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `data_type` can be either \"ppi\" or \"srv\", depending on which application the user is most interested in. The `level` can be either \"residue\" or \"atomic\", and refers to the structural resolution, where each node either represents a single residue or a single atom from the molecular structure.\n", + "\n", + "In this tutorial, we will use PPI residue-level data by default, but the same code can be applied to SRV or/and atomic-level data with no changes, apart from setting `data_type` and `level` parameters in the cell above.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A Pandas DataFrame containing data points' IDs and the binary target values can be defined:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_dict = {}\n", + "df_dict[\"entry\"] = []\n", + "df_dict[\"target\"] = []\n", + "for fname in input_data_path:\n", + " with h5py.File(fname, \"r\") as hdf5:\n", + " for mol in hdf5:\n", + " target_value = float(hdf5[mol][\"target_values\"][\"binary\"][()])\n", + " df_dict[\"entry\"].append(mol)\n", + " df_dict[\"target\"].append(target_value)\n", + "\n", + "df = pd.DataFrame(data=df_dict)\n", + "df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As explained in [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb), for each data 
point there are two targets: \"BA\" and \"binary\". The first represents the strength of the interaction between two molecules that bind reversibly (interact) in nM, while the second represents its binary mapping, being 0 (BA > 500 nM) a not-binding complex and 1 (BA <= 500 nM) binding one.\n", + "\n", + "For SRVs, each data point has a single target, \"binary\", which is 0 if the SRV is considered benign, and 1 if it is pathogenic, as explained in [data_generation_srv.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/data_generation_srv.ipynb).\n", + "\n", + "The pandas DataFrame `df` is used only to split data points into training, validation and test sets according to the \"binary\" target - using target stratification to keep the proportion of 0s and 1s constant among the different sets. Training and validation sets will be used during the training for updating the network weights, while the test set will be held out as an independent test and will be used later for the model evaluation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_train, df_test = train_test_split(df, test_size=0.1, stratify=df.target, random_state=42)\n", + "df_train, df_valid = train_test_split(df_train, test_size=0.2, stratify=df_train.target, random_state=42)\n", + "\n", + "print(\"Data statistics:\\n\")\n", + "print(f\"Total samples: {len(df)}\\n\")\n", + "print(f\"Training set: {len(df_train)} samples, {round(100*len(df_train)/len(df))}%\")\n", + "print(f\"\\t- Class 0: {len(df_train[df_train.target == 0])} samples, {round(100*len(df_train[df_train.target == 0])/len(df_train))}%\")\n", + "print(f\"\\t- Class 1: {len(df_train[df_train.target == 1])} samples, {round(100*len(df_train[df_train.target == 1])/len(df_train))}%\")\n", + "print(f\"Validation set: {len(df_valid)} samples, {round(100*len(df_valid)/len(df))}%\")\n", + "print(f\"\\t- Class 0: {len(df_valid[df_valid.target == 0])} samples, 
{round(100*len(df_valid[df_valid.target == 0])/len(df_valid))}%\")\n", + "print(f\"\\t- Class 1: {len(df_valid[df_valid.target == 1])} samples, {round(100*len(df_valid[df_valid.target == 1])/len(df_valid))}%\")\n", + "print(f\"Testing set: {len(df_test)} samples, {round(100*len(df_test)/len(df))}%\")\n", + "print(f\"\\t- Class 0: {len(df_test[df_test.target == 0])} samples, {round(100*len(df_test[df_test.target == 0])/len(df_test))}%\")\n", + "print(f\"\\t- Class 1: {len(df_test[df_test.target == 1])} samples, {round(100*len(df_test[df_test.target == 1])/len(df_test))}%\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classification example\n", + "\n", + "A GNN and a CNN can be trained for a classification predictive task, which consists in predicting the \"binary\" target values.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GNN\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### GraphDataset\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For training GNNs the user can create `GraphDataset` instances. This class inherits from `DeeprankDataset` class, which in turns inherits from `Dataset` [PyTorch geometric class](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/data/dataset.html), a base class for creating graph datasets.\n", + "\n", + "A few notes about `GraphDataset` parameters:\n", + "\n", + "- By default, all features contained in the HDF5 files are used, but the user can specify `node_features` and `edge_features` in `GraphDataset` if not all of them are needed. 
See the [docs](https://deeprank2.readthedocs.io/en/latest/features.html) for more details about all the possible pre-implemented features.\n", + "- For regression, `task` should be set to `regress` and the `target` to `BA`, which is a continuous variable and therefore suitable for regression tasks.\n", + "- For the `GraphDataset` class it is possible to define a dictionary to indicate which transformations to apply to the features, being the transformations lambda functions and/or standardization.\n", + " - If the `standardize` key is `True`, standardization is applied after transformation. Standardization consists in applying the following formula on each feature's value: ${x' = \\frac{x - \\mu}{\\sigma}}$, being ${\\mu}$ the mean and ${\\sigma}$ the standard deviation. Standardization is a scaling method where the values are centered around mean with a unit standard deviation.\n", + " - The transformation to apply can be specified as a lambda function as a value of the key `transform`, which defaults to `None`.\n", + " - Since in the provided example standardization is applied, the training features' means and standard deviations need to be used for scaling validation and test sets. For doing so, `train_source` parameter is used. When `train_source` parameter is set, it will be used to scale the validation/testing sets. You need to pass `features_transform` to the training dataset only, since in other cases it will be ignored and only the one of `train_source` will be considered.\n", + " - Note that transformations have not currently been implemented for the `GridDataset` class.\n", + " - In the example below a cube root transformation and then the standardization are applied to all the features. 
It is also possible to use specific features as keys for indicating that transformation and/or standardization need to be applied to a few features only.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "target = \"binary\"\n", + "task = \"classif\"\n", + "node_features = [\"res_type\"]\n", + "edge_features = [\"distance\"]\n", + "features_transform = {\"all\": {\"transform\": lambda x: np.cbrt(x), \"standardize\": True}}\n", + "\n", + "print(\"Loading training data...\")\n", + "dataset_train = GraphDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", + " node_features=node_features,\n", + " edge_features=edge_features,\n", + " features_transform=features_transform,\n", + " target=target,\n", + " task=task,\n", + ")\n", + "print(\"\\nLoading validation data...\")\n", + "dataset_val = GraphDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", + " train_source=dataset_train,\n", + ")\n", + "print(\"\\nLoading test data...\")\n", + "dataset_test = GraphDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", + " train_source=dataset_train,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Trainer\n", + "\n", + "The class `Trainer` implements training, validation and testing of PyTorch-based neural networks.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A few notes about `Trainer` parameters:\n", + "\n", + "- `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. 
The `Trainer` class takes care of formatting the output shape according to the task. This tutorial uses a simple network, `VanillaNetwork` (implemented in `deeprank2.neuralnets.gnn.vanilla_gnn`). All GNN architectures already implemented in the package can be found [here](https://github.com/DeepRank/deeprank2/tree/main/deeprank2/neuralnets/gnn) and can be used for training or as a basis for implementing new ones.\n", + "- `class_weights` is used for classification tasks only and assigns class weights based on the training dataset content to account for any potential imbalance between the classes. In this case the dataset is balanced (50% 0 and 50% 1), so it is not necessary to use it. It defaults to False.\n", + "- `cuda` and `ngpu` are used for indicating whether to use CUDA and how many GPUs. By default, CUDA is not used and `ngpu` is 0.\n", + "- The user can specify a deeprank2 exporter or a custom one in `output_exporters` parameter, together with the path where to save the results. Exporters are used for storing predictions information collected later on during training and testing. Later the results saved by `HDF5OutputExporter` will be read and evaluated.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Training\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainer = Trainer(\n", + " neuralnet=VanillaNetwork,\n", + " dataset_train=dataset_train,\n", + " dataset_val=dataset_val,\n", + " dataset_test=dataset_test,\n", + " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"gnn_{task}\"))],\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default optimizer is `torch.optim.Adam`. 
It is possible to specify optimizer's parameters or to use another PyTorch optimizer object:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = torch.optim.SGD\n", + "lr = 1e-3\n", + "weight_decay = 0.001\n", + "\n", + "trainer.configure_optimizers(optimizer, lr, weight_decay)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default loss function for classification is `torch.nn.CrossEntropyLoss` and for regression it is `torch.nn.MSELoss`. It is also possible to set some other PyTorch loss functions by using `Trainer.set_lossfunction` method, although not all are currently implemented.\n", + "\n", + "Then the model can be trained using the `train()` method of the `Trainer` class.\n", + "\n", + "A few notes about `train()` method parameters:\n", + "\n", + "- `earlystop_patience`, `earlystop_maxgap` and `min_epoch` are used for controlling early stopping logic. `earlystop_patience` indicates the number of epochs after which the training ends if the validation loss does not improve. `earlystop_maxgap` indicates the maximum difference allowed between validation and training loss, and `min_epoch` is the minimum number of epochs to be reached before evaluating `maxgap`.\n", + "- If `validate` is set to `True`, validation is performed on an independent dataset, which has been called `dataset_val` a few cells above. If set to `False`, validation is performed on the training dataset itself (not recommended).\n", + "- `num_workers` can be set for indicating how many subprocesses to use for data loading. 
The default is 0 and it means that the data will be loaded in the main process.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epochs = 20\n", + "batch_size = 8\n", + "earlystop_patience = 5\n", + "earlystop_maxgap = 0.1\n", + "min_epoch = 10\n", + "\n", + "trainer.train(\n", + " nepoch=epochs,\n", + " batch_size=batch_size,\n", + " earlystop_patience=earlystop_patience,\n", + " earlystop_maxgap=earlystop_maxgap,\n", + " min_epoch=min_epoch,\n", + " validate=True,\n", + " filename=os.path.join(output_path, f\"gnn_{task}\", \"model.pth.tar\"),\n", + ")\n", + "\n", + "epoch = trainer.epoch_saved_model\n", + "print(f\"Model saved at epoch {epoch}\")\n", + "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", + "print(f\"Total # of parameters: {pytorch_total_params}\")\n", + "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", + "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Testing\n", + "\n", + "And the trained model can be tested on `dataset_test`:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainer.test()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Results visualization\n", + "\n", + "Finally, the results saved by `HDF5OutputExporter` can be inspected, which can be found in the `data/ppi/gnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`. Note that the folder contains the saved pre-trained model as well.\n", + "\n", + "`output_exporter.hdf5` contains [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which refer to each phase, e.g. training and testing if both are run, only one of them otherwise. 
Training phase includes validation results as well. This HDF5 file can be read as a Pandas Dataframe:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_train = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n", + "output_test = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n", + "output_train.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n", + "\n", + "For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeeprankDataset instances. 
For classification tasks, if `classes` is not specified (as in this example case), it is defaulted to [0, 1].\n", + "\n", + "The loss across the epochs can be plotted for the training and the validation sets:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", + "\n", + "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", + "\n", + "fig.update_layout(\n", + " xaxis_title=\"Epoch #\",\n", + " yaxis_title=\"Loss\",\n", + " title=\"Loss vs epochs - GNN training\",\n", + " width=700,\n", + " height=400,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now a few metrics of interest for classification tasks can be printed out: the [area under the ROC curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) (AUC), and for a threshold of 0.5 the [precision, recall, accuracy and f1 score](https://en.wikipedia.org/wiki/Precision_and_recall#Definition).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "skip-execution" + ] + }, + "outputs": [], + "source": [ + "threshold = 0.5\n", + "df = pd.concat([output_train, output_test])\n", + "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", + "\n", + "for dataset in [\"training\", \"validation\", \"testing\"]:\n", + " df_plot_phase = df_plot[(df_plot.phase == dataset)]\n", + " y_true = df_plot_phase.target\n", + " y_score = np.array(df_plot_phase.output.tolist())[:, 1]\n", + "\n", + " print(f\"\\nMetrics for {dataset}:\")\n", + " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", + " auc_score = auc(fpr_roc, tpr_roc)\n", + " print(f\"AUC: {round(auc_score, 1)}\")\n", + " print(f\"Considering a threshold of 
{threshold}\")\n", + " y_pred = (y_score > threshold) * 1\n", + " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", + " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the poor performance of this network is due to the small number of datapoints used in this tutorial. For a more reliable network we suggest using a number of data points on the order of at least tens of thousands.\n", + "\n", + "The same exercise can be repeated but using grids instead of graphs and CNNs instead of GNNs.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CNN\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### GridDataset\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For training CNNs the user can create `GridDataset` instances.\n", + "\n", + "A few notes about `GridDataset` parameters:\n", + "\n", + "- By default, all features contained in the HDF5 files are used, but the user can specify `features` in `GridDataset` if not all of them are needed. Since grids features are derived from node and edge features mapped from graphs to grid, the easiest way to see which features are available is to look at the HDF5 file, as explained in detail in `data_generation_ppi.ipynb` and `data_generation_srv.ipynb`, section \"Other tools\".\n", + "- As is the case for a `GraphDataset`, `task` can be assigned to `regress` and `target` to `BA` to perform a regression task. 
As mentioned previously, we do not provide sample data to perform a regression task for SRVs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "target = \"binary\"\n", + "task = \"classif\"\n", + "\n", + "print(\"Loading training data...\")\n", + "dataset_train = GridDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", + " target=target,\n", + " task=task,\n", + ")\n", + "print(\"\\nLoading validation data...\")\n", + "dataset_val = GridDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", + " train_source=dataset_train,\n", + ")\n", + "print(\"\\nLoading test data...\")\n", + "dataset_test = GridDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", + " train_source=dataset_train,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Trainer\n", + "\n", + "As for graphs, the class `Trainer` is used for training, validation and testing of the PyTorch-based CNN.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Also in this case, `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. This tutorial uses `CnnClassification` (implemented in `deeprank2.neuralnets.cnn.model3d`). 
All CNN architectures already implemented in the package can be found [here](https://github.com/DeepRank/deeprank2/tree/main/deeprank2/neuralnets/cnn) and can be used for training or as a basis for implementing new ones.\n", + "- The rest of the `Trainer` parameters can be used as explained already for graphs.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Training\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = torch.optim.SGD\n", + "lr = 1e-3\n", + "weight_decay = 0.001\n", + "epochs = 20\n", + "batch_size = 8\n", + "earlystop_patience = 5\n", + "earlystop_maxgap = 0.1\n", + "min_epoch = 10\n", + "\n", + "trainer = Trainer(\n", + " neuralnet=CnnClassification,\n", + " dataset_train=dataset_train,\n", + " dataset_val=dataset_val,\n", + " dataset_test=dataset_test,\n", + " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"cnn_{task}\"))],\n", + ")\n", + "\n", + "trainer.configure_optimizers(optimizer, lr, weight_decay)\n", + "\n", + "trainer.train(\n", + " nepoch=epochs,\n", + " batch_size=batch_size,\n", + " earlystop_patience=earlystop_patience,\n", + " earlystop_maxgap=earlystop_maxgap,\n", + " min_epoch=min_epoch,\n", + " validate=True,\n", + " filename=os.path.join(output_path, f\"cnn_{task}\", \"model.pth.tar\"),\n", + ")\n", + "\n", + "epoch = trainer.epoch_saved_model\n", + "print(f\"Model saved at epoch {epoch}\")\n", + "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", + "print(f\"Total # of parameters: {pytorch_total_params}\")\n", + "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", + "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Testing\n", + "\n", + "And the trained model can be tested on 
`dataset_test`:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainer.test()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Results visualization\n", + "\n", + "As for GNNs, the results saved by `HDF5OutputExporter` can be inspected, and are saved in the `data/ppi/cnn_classif/` or `data/srv/cnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`, together with the saved pre-trained model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_train = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n", + "output_test = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n", + "output_train.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Also in this case, the loss across the epochs can be plotted for the training and the validation sets:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", + "\n", + "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", + "\n", + "fig.update_layout(\n", + " xaxis_title=\"Epoch #\",\n", + " yaxis_title=\"Loss\",\n", + " title=\"Loss vs epochs - CNN training\",\n", + " width=700,\n", + " height=400,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And some metrics of interest for classification tasks:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "threshold = 0.5\n", + "df = pd.concat([output_train, output_test])\n", + "df_plot = df[(df.epoch == 
trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", + "\n", + "for dataset in [\"training\", \"validation\", \"testing\"]:\n", + " df_plot_phase = df_plot[(df_plot.phase == dataset)]\n", + " y_true = df_plot_phase.target\n", + " y_score = np.array(df_plot_phase.output.tolist())[:, 1]\n", + "\n", + " print(f\"\\nMetrics for {dataset}:\")\n", + " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", + " auc_score = auc(fpr_roc, tpr_roc)\n", + " print(f\"AUC: {round(auc_score, 1)}\")\n", + " print(f\"Considering a threshold of {threshold}\")\n", + " y_pred = (y_score > threshold) * 1\n", + " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", + " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's important to note that the dataset used in this analysis is not sufficiently large to provide conclusive and reliable insights. Depending on your specific application, you might find regression, classification, GNNs, and/or CNNs to be valuable options. Feel free to choose the approach that best aligns with your particular problem!\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deeprank2", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 }