From 6fb26b34aba79e9d7f40ee2be0f2adc8d620cb5b Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Mon, 22 Jan 2024 13:13:23 +0100 Subject: [PATCH] style: auto format non-python files on save and currently format more non-python files --- .github/ISSUE_TEMPLATE/bug_report.md | 14 +- .github/ISSUE_TEMPLATE/feature_request.md | 8 +- .vscode/settings.json | 34 ++- CHANGELOG.md | 245 +++++++-------- README.dev.md | 12 +- README.md | 50 ++-- docs/features.md | 20 +- docs/getstarted.md | 1 + docs/installation.md | 25 +- paper/paper.bib | 2 +- paper/paper.md | 21 +- tests/data/hdf5/_generate_testdata.ipynb | 203 +++++++------ tutorials/TUTORIAL.md | 7 +- tutorials/data_generation_ppi.ipynb | 211 ++++++------- tutorials/data_generation_srv.ipynb | 248 ++++++++-------- tutorials/training.ipynb | 347 +++++++++++----------- 16 files changed, 747 insertions(+), 701 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index ec9ba868c..baf8836c3 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,16 +1,16 @@ --- name: Bug report about: Create a report to help us improve -title: 'Bug: ' -labels: 'bug' -assignees: '' - +title: "Bug: " +labels: "bug" +assignees: "" --- **Describe the bug** A clear and concise description of what the bug is. **Environment:** + - OS system: - Version: - Branch commit ID: @@ -19,11 +19,11 @@ A clear and concise description of what the bug is. **To Reproduce** Steps/commands/screenshots to reproduce the behaviour: - 1. +1. - 2. +2. - 3. +3. **Expected Results** A clear and concise description of what you expected to happen. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 49a07b8cb..174d8b991 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,10 +1,9 @@ --- name: Feature request about: Suggest an idea for this project -title: 'Add/edit' -labels: 'feature' -assignees: '' - +title: "Add/edit" +labels: "feature" +assignees: "" --- **Is your feature request related to a problem? Please describe.** @@ -21,6 +20,7 @@ Add any other context or screenshots about the feature request here. **Checks for the developer** After having implemented the request, please remember to: + - [ ] Add all the necessary tests. Make sure that the parameter functionality is well tested, from all points of views. - [ ] Add the proper documentation to the source code (docstrings). - [ ] Add the proper documentation to the readme. Examples about how using the new feature should be clear and easy to follow. 
diff --git a/.vscode/settings.json b/.vscode/settings.json index f0f387d5e..a519af241 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,19 +1,23 @@ { - // Python - "[python]": { - "editor.formatOnSave": true, - "editor.codeActionsOnSave": { - "source.fixAll": "explicit" - }, - "editor.defaultFormatter": "charliermarsh.ruff" + // Python + "[python]": { + "editor.formatOnSave": true, + "editor.codeActionsOnSave": { + "source.fixAll": "explicit" }, - "autoDocstring.docstringFormat": "google", + "editor.defaultFormatter": "charliermarsh.ruff" + }, + "autoDocstring.docstringFormat": "google", - // Notebooks - "notebook.lineNumbers": "on", - "notebook.formatOnSave.enabled": true, - "notebook.codeActionsOnSave": { - "notebook.source.fixAll": "explicit", - }, - "notebook.diff.ignoreMetadata": true, + // Notebooks + "notebook.lineNumbers": "on", + "notebook.formatOnSave.enabled": true, + "notebook.codeActionsOnSave": { + "notebook.source.fixAll": "explicit" + }, + "notebook.diff.ignoreMetadata": true, + + // Format all files on save + "editor.formatOnSave": true, + "editor.defaultFormatter": "esbenp.prettier-vscode" } diff --git a/CHANGELOG.md b/CHANGELOG.md index e27e0cfc8..a2732a8a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,97 +5,106 @@ ### Main changes #### Refactor -* refactor: make `preprocess` use all available feature modules as default by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/247 -* refactor: move preprocess function to `QueryDataset` class and rename by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/252 -* refactor: save preprocessed data into one .hdf5 file as default by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/250 -* refactor: clean up `GraphDataset` and `Trainer` class by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/255 -* refactor: reorganize deeprank2.utils.metrics module by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/262 -* refactor: fix `transform_sigmoid` logic and move it to `GraphDataset` class by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/288 -* refactor: add grid dataset class and make the trainer class work with it. 
by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/294 -* refactor: update deprecated dataloader import by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/310 -* refactor: move tests/_utils.py to tests/__init__.py by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/322 -* refactor: delete all outputs from unit tests after run by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/324 -* refactor: test_contact.py function naming and output by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/372 -* refactor: split test contact.py by @joyceljy in https://github.com/DeepRank/deeprank-core/pull/369 -* refactor: change __repr__ of AminoAcid to 3 letter code by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/384 -* refactor: make feature modules and tests uniform and ditch duplicate code by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/400 + +- refactor: make `preprocess` use all available feature modules as default by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/247 +- refactor: move preprocess function to `QueryDataset` class and rename by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/252 +- refactor: save preprocessed data into one .hdf5 file as default by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/250 +- refactor: clean up `GraphDataset` and `Trainer` class by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/255 +- refactor: reorganize deeprank2.utils.metrics module by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/262 +- refactor: fix `transform_sigmoid` logic and move it to `GraphDataset` class by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/288 +- refactor: add grid dataset class and make the trainer class work with it. 
by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/294 +- refactor: update deprecated dataloader import by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/310 +- refactor: move tests/\_utils.py to tests/**init**.py by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/322 +- refactor: delete all outputs from unit tests after run by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/324 +- refactor: test_contact.py function naming and output by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/372 +- refactor: split test contact.py by @joyceljy in https://github.com/DeepRank/deeprank-core/pull/369 +- refactor: change **repr** of AminoAcid to 3 letter code by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/384 +- refactor: make feature modules and tests uniform and ditch duplicate code by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/400 #### Features -* feat: improve amino acid features by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/272 -* feat: add `test_size` equivalent of `val_size` to Trainer class by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/291 -* feat: add the option to have a grid box of different x,y and z dimensions by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/292 -* feat: add early stopping to `Trainer.train` by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/303 -* feat: add hist module for plotting raw hdf5 files features distributions by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/261 -* feat: allow for different loss functions other than the default by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/313 -* feat: center the grids as in the old deeprank by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/323 -* feat: add data augmentation for grids by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/336 -* feat: insert features standardization option in`DeeprankDataset` children classes by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/326 -* feat: add log transformation option for plotting features' hist by @joyceljy in https://github.com/DeepRank/deeprank-core/pull/389 -* feat: add inter-residue contact (IRC) node features by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/333 -* feat: add feature module for secondary structure by @DTRademaker in https://github.com/DeepRank/deeprank-core/pull/387 -* feat: use dictionary for flexibly transforming and standardizing features by @joyceljy in https://github.com/DeepRank/deeprank-core/pull/418 + +- feat: improve amino acid features by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/272 +- feat: add `test_size` equivalent of `val_size` to Trainer class by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/291 +- feat: add the option to have a grid box of different x,y and z dimensions by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/292 +- feat: add early stopping to `Trainer.train` by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/303 +- feat: add hist module for plotting raw hdf5 files features distributions by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/261 +- feat: allow for different loss functions other than the default by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/313 +- feat: center the grids as in the old deeprank by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/323 +- feat: add data augmentation for grids 
by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/336 +- feat: insert features standardization option in`DeeprankDataset` children classes by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/326 +- feat: add log transformation option for plotting features' hist by @joyceljy in https://github.com/DeepRank/deeprank-core/pull/389 +- feat: add inter-residue contact (IRC) node features by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/333 +- feat: add feature module for secondary structure by @DTRademaker in https://github.com/DeepRank/deeprank-core/pull/387 +- feat: use dictionary for flexibly transforming and standardizing features by @joyceljy in https://github.com/DeepRank/deeprank-core/pull/418 #### Fix -* fix: list all submodules imported from deeprank2.features using pkgutil by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/263 -* fix: let `classes` argument be also categorical by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/286 -* fix: makes sure that the `map_feature` function can handle single value features. by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/289 -* fix: raise exception for invalid optimizer by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/307 -* fix: `num_workers` parameter of Dataloader object by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/319 -* fix: gpu usage by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/334 -* fix: gpu and `entry_names` usage by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/335 -* fix: data generation threading locked by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/330 -* fix: `__hash__` circular dependency issue by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/341 -* fix: make sure that Grid data also has target values, like graph data by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/347 -* fix: change the internal structure of the grid data to match the graph data by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/352 -* fix: conflicts in package by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/386 -* fix: correct usage of nonbond energy for close contacts by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/368 -* fix: Incorrect number of datapoints loaded to model by @joyceljy in https://github.com/DeepRank/deeprank-core/pull/397 -* fix: pytorch 2.0 by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/406 -* fix: covalent bonds cannot link nodes on separate branches by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/408 -* fix: `Trainer` error when only `dataset_test` and `pretrained_model` are used by @ntxxt in https://github.com/DeepRank/deeprank-core/pull/413 -* fix: check PSSMs by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/401 -* fix: only check pssms if conservation module was used by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/425 -* fix: epoch number in `test()` and test on the correct model by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/427 -* fix: convert list of arrays into arrays before converting to Pytorch tensor by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/438 + +- fix: list all submodules imported from deeprank2.features using pkgutil by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/263 +- fix: let `classes` argument be also categorical by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/286 +- fix: makes 
sure that the `map_feature` function can handle single value features. by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/289 +- fix: raise exception for invalid optimizer by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/307 +- fix: `num_workers` parameter of Dataloader object by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/319 +- fix: gpu usage by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/334 +- fix: gpu and `entry_names` usage by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/335 +- fix: data generation threading locked by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/330 +- fix: `__hash__` circular dependency issue by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/341 +- fix: make sure that Grid data also has target values, like graph data by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/347 +- fix: change the internal structure of the grid data to match the graph data by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/352 +- fix: conflicts in package by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/386 +- fix: correct usage of nonbond energy for close contacts by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/368 +- fix: Incorrect number of datapoints loaded to model by @joyceljy in https://github.com/DeepRank/deeprank-core/pull/397 +- fix: pytorch 2.0 by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/406 +- fix: covalent bonds cannot link nodes on separate branches by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/408 +- fix: `Trainer` error when only `dataset_test` and `pretrained_model` are used by @ntxxt in https://github.com/DeepRank/deeprank-core/pull/413 +- fix: check PSSMs by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/401 +- fix: only check pssms if conservation module was used by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/425 +- fix: epoch number in `test()` and test on the correct model by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/427 +- fix: convert list of arrays into arrays before converting to Pytorch tensor by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/438 #### Docs -* docs: add verbose arg to QueryCollection class by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/267 -* docs: improve `clustering_method` description and default value by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/293 -* docs: uniform docstrings format in modules by @joyceljy -* docs: incorrect usage of Union in Optional type hints by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/370 -* docs: improve docs for default exporter and results visualization by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/414 -* docs: update feature documentations by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/419 -* docs: add instructions for `GridDataset` by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/421 -* docs: fix getstarted hierarchy by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/422 -* docs: update dssp 4 install instructions by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/437 -* docs: change `external_distance_cutoff` and `interface_distance_cutoff` to `distance_cutoff` by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/246 + +- docs: add verbose arg to QueryCollection class by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/267 +- docs: 
improve `clustering_method` description and default value by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/293 +- docs: uniform docstrings format in modules by @joyceljy +- docs: incorrect usage of Union in Optional type hints by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/370 +- docs: improve docs for default exporter and results visualization by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/414 +- docs: update feature documentations by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/419 +- docs: add instructions for `GridDataset` by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/421 +- docs: fix getstarted hierarchy by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/422 +- docs: update dssp 4 install instructions by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/437 +- docs: change `external_distance_cutoff` and `interface_distance_cutoff` to `distance_cutoff` by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/246 #### Performances -* perf: features.contact by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/220 -* perf: suppress warnings in pytest and from PDBParser by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/249 -* perf: add try except clause to `_preprocess_one_query` method of `QueryCollection` class by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/264 -* perf: improve `process` speed for residue based graph building by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/274 -* perf: add `cuda` and `ngpu` parameters to the `Trainer` class by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/311 -* perf: accelerate indexing of HDF5 files by @joyceljy in https://github.com/DeepRank/deeprank-core/pull/362 + +- perf: features.contact by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/220 +- perf: suppress warnings in pytest and from PDBParser by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/249 +- perf: add try except clause to `_preprocess_one_query` method of `QueryCollection` class by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/264 +- perf: improve `process` speed for residue based graph building by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/274 +- perf: add `cuda` and `ngpu` parameters to the `Trainer` class by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/311 +- perf: accelerate indexing of HDF5 files by @joyceljy in https://github.com/DeepRank/deeprank-core/pull/362 #### Style -* style: restructure deeprank2 package and subpackages by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/240 -* style: reorganize features/contact.py by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/260 -* style: add .vscode settings.json by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/404 + +- style: restructure deeprank2 package and subpackages by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/240 +- style: reorganize features/contact.py by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/260 +- style: add .vscode settings.json by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/404 #### Test -* test: make sure that the grid orientation is as in the original deeprank for `ProteinProteinInterfaceAtomicQuery` by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/312 -* test: check that the grid for residue-based protein-protein interfaces has the same center and orientation as in the 
original deeprank. by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/339 -* test: improve `utils/test_graph.py` module by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/420 + +- test: make sure that the grid orientation is as in the original deeprank for `ProteinProteinInterfaceAtomicQuery` by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/312 +- test: check that the grid for residue-based protein-protein interfaces has the same center and orientation as in the original deeprank. by @cbaakman in https://github.com/DeepRank/deeprank-core/pull/339 +- test: improve `utils/test_graph.py` module by @gcroci2 in https://github.com/DeepRank/deeprank-core/pull/420 #### CI -* ci: do not close stale issues or PRs by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/327 -* ci: remove incorrect message for stale branches by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/415 -* ci: automatically check markdown links by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/433 + +- ci: do not close stale issues or PRs by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/327 +- ci: remove incorrect message for stale branches by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/415 +- ci: automatically check markdown links by @DaniBodor in https://github.com/DeepRank/deeprank-core/pull/433 ### New Contributors -* @joyceljy made their first contribution in https://github.com/DeepRank/deeprank-core/pull/361 -* @ntxxt made their first contribution in https://github.com/DeepRank/deeprank-core/pull/413 + +- @joyceljy made their first contribution in https://github.com/DeepRank/deeprank-core/pull/361 +- @ntxxt made their first contribution in https://github.com/DeepRank/deeprank-core/pull/413 **Full Changelog**: https://github.com/DeepRank/deeprank-core/compare/v1.0.0...v2.0.0 @@ -105,25 +114,25 @@ Released on Oct 24, 2022 ### Added -* `weight_decay` parameter to NeuralNet #155 -* Exporter for generating a unique .csv file containing results per epoch #151 -* Automatized testing of all available features modules #163 -* `optimizer` parameter to NeuralNet #154 -* `atom` node feature #168 +- `weight_decay` parameter to NeuralNet #155 +- Exporter for generating a unique .csv file containing results per epoch #151 +- Automatized testing of all available features modules #163 +- `optimizer` parameter to NeuralNet #154 +- `atom` node feature #168 ### Changed -* `index` parameter of NeuralNet is now called `subset` #159 -* `percent` parameter of NeuralNet is now called `val_size`, and the logic behing it has been improved #183 -* Aligned the package to PyTorch high-level frameworks #172 - * NeuralNet is now called Trainer -* Clearer features names #145 -* Changed definitions in storage.py #150 -* `MAX_COVALENT_DISTANCE` is now 2.1 instead of 3 #205 +- `index` parameter of NeuralNet is now called `subset` #159 +- `percent` parameter of NeuralNet is now called `val_size`, and the logic behing it has been improved #183 +- Aligned the package to PyTorch high-level frameworks #172 + - NeuralNet is now called Trainer +- Clearer features names #145 +- Changed definitions in storage.py #150 +- `MAX_COVALENT_DISTANCE` is now 2.1 instead of 3 #205 ### Removed -* `threshold` input parameter from NeuralNet #157 +- `threshold` input parameter from NeuralNet #157 ## 0.2.0 @@ -131,17 +140,17 @@ Released on Aug 10, 2022 ### Added -* Automatic version bumping using `bump2version` with `.bumpversion.cfg` #126 -* `cffconvert.yml` to the CI 
workflow #139 -* Integration test for the Machine Learning pipeline #95 -* The package now is tested also on Python 3.10 #165 +- Automatic version bumping using `bump2version` with `.bumpversion.cfg` #126 +- `cffconvert.yml` to the CI workflow #139 +- Integration test for the Machine Learning pipeline #95 +- The package now is tested also on Python 3.10 #165 ### Changed -* Test PyPI package before publishing, by triggering a `workflow_dispatch` event from the Actions tab on `release.yml` workflow file #123 -* Coveralls is now working again #124 -* Wrong Zenodo entry has been corrected #138 -* Improved CUDA support (added for data tensors) #132 +- Test PyPI package before publishing, by triggering a `workflow_dispatch` event from the Actions tab on `release.yml` workflow file #123 +- Coveralls is now working again #124 +- Wrong Zenodo entry has been corrected #138 +- Improved CUDA support (added for data tensors) #132 ## 0.1.1 @@ -149,28 +158,28 @@ Released on June 28, 2022 ### Added -* Graph class #48 -* Tensorboard #15 -* CI Linting #30 -* Name, affiliation and orcid to `.zenodo.json` #18 -* Metrics class #17 -* QueryDataset class #53 -* Unit tests for NeuralNet class #86 -* Error message if you pick the wrong metrics #110 -* Unit tests for HDF5Dataset class parameters #82 -* Installation from PyPI in the readme #122 +- Graph class #48 +- Tensorboard #15 +- CI Linting #30 +- Name, affiliation and orcid to `.zenodo.json` #18 +- Metrics class #17 +- QueryDataset class #53 +- Unit tests for NeuralNet class #86 +- Error message if you pick the wrong metrics #110 +- Unit tests for HDF5Dataset class parameters #82 +- Installation from PyPI in the readme #122 ### Changed -* `test_process()` does not fail anymore #47 -* Tests have been speded up #36 -* `multiprocessing.Queue` has been replaced with `multiprocessing.pool.map` in PreProcessor #56 -* `test_preprocess.py` does not fail anymore on Mac M1 #74 -* It's now possible to pass your own train/test split to NeuralNet class #81 -* HDF5Dataset class now is used in the UX #83 -* IndexError running `NeuralNet.train()` has been fixed #89 -* pip installation has been fixed -* Repository has been renamed deeprank-core, and the package deeprank2 #101 -* The zero-division like error from TensorboardBinaryClassificationExporter has been fixed #112 -* h5xplorer is installed through `setup.cfg` file #121 -* Sphinx docs have been fixed #108 +- `test_process()` does not fail anymore #47 +- Tests have been speded up #36 +- `multiprocessing.Queue` has been replaced with `multiprocessing.pool.map` in PreProcessor #56 +- `test_preprocess.py` does not fail anymore on Mac M1 #74 +- It's now possible to pass your own train/test split to NeuralNet class #81 +- HDF5Dataset class now is used in the UX #83 +- IndexError running `NeuralNet.train()` has been fixed #89 +- pip installation has been fixed +- Repository has been renamed deeprank-core, and the package deeprank2 #101 +- The zero-division like error from TensorboardBinaryClassificationExporter has been fixed #112 +- h5xplorer is installed through `setup.cfg` file #121 +- Sphinx docs have been fixed #108 diff --git a/README.dev.md b/README.dev.md index 7b00c6847..9fdd58b1c 100644 --- a/README.dev.md +++ b/README.dev.md @@ -50,6 +50,8 @@ If you are using VS code, please install and activate the [Ruff extension](https Otherwise, please ensure check both linting (`ruff fix .`) and formatting (`ruff format .`) before requesting a review. +We use [prettier](https://prettier.io/) for formatting most other files. 
If you are editing or adding non-python files and using VS code, the [Prettier extension](https://marketplace.visualstudio.com/items?itemName=esbenp.prettier-vscode) can be installed to auto-format these files as well. + ## Versioning Bumping the version across all files is done before creating a new package release, running `bump2version [part]` from command line after having installed [bump2version](https://pypi.org/project/bump2version/) on your local environment. Instead of `[part]`, type the part of the version to increase, e.g. minor. The settings in `.bumpversion.cfg` will take care of updating all the files containing version strings. @@ -57,9 +59,12 @@ Bumping the version across all files is done before creating a new package relea ## Branching workflow We use a [Git Flow](https://nvie.com/posts/a-successful-git-branching-model/)-inspired branching workflow for development. DeepRank2's repository is based on two main branches with infinite lifetime: + - `main` — this branch contains production (stable) code. All development code is merged into `main` in sometime. - `dev` — this branch contains pre-production code. When the features are finished then they are merged into `dev`. + During the development cycle, three main supporting branches are used: + - Feature branches - Branches that branch off from `dev` and must merge into `dev`: used to develop new features for the upcoming releases. - Hotfix branches - Branches that branch off from `main` and must merge into `main` and `dev`: necessary to act immediately upon an undesired status of `main`. - Release branches - Branches that branch off from `dev` and must merge into `main` and `dev`: support preparation of a new production release. They allow many minor bug to be fixed and preparation of meta-data for a release. @@ -77,12 +82,13 @@ During the development cycle, three main supporting branches are used: 1. Branch from `dev` and prepare the branch for the release (e.g., removing the unnecessary dev files such as the current one, fix minor bugs if necessary). 2. [Bump the version](https://github.com/DeepRank/deeprank2/blob/dev/README.dev.md#versioning). 3. Verify that the information in `CITATION.cff` is correct (update the release date), and that `.zenodo.json` contains equivalent data. -5. Merge the release branch into `main` (and `dev`), and [run the tests](https://github.com/DeepRank/deeprank2/blob/dev/README.dev.md#running-the-tests). -6. Go to https://github.com/DeepRank/deeprank2/releases and draft a new release; create a new tag for the release, generate release notes automatically and adjust them, and finally publish the release as latest. This will trigger [a GitHub action](https://github.com/DeepRank/deeprank2/actions/workflows/release.yml) that will take care of publishing the package on PyPi. -7. Update the doi in `CITATION.cff` with the one corresponding to the new release. +4. Merge the release branch into `main` (and `dev`), and [run the tests](https://github.com/DeepRank/deeprank2/blob/dev/README.dev.md#running-the-tests). +5. Go to https://github.com/DeepRank/deeprank2/releases and draft a new release; create a new tag for the release, generate release notes automatically and adjust them, and finally publish the release as latest. This will trigger [a GitHub action](https://github.com/DeepRank/deeprank2/actions/workflows/release.yml) that will take care of publishing the package on PyPi. +6. Update the doi in `CITATION.cff` with the one corresponding to the new release. 
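For reference, here is a minimal sketch of the corresponding command-line steps (formatting and version bumping) described above; the Prettier invocation assumes a Node.js/`npx` setup and is only one way to run it:

```bash
# Format Python code with Ruff (same formatter used by the VS Code extension)
ruff format .

# Format non-Python files (Markdown, JSON, YAML) with Prettier (assumes npx is available)
npx prettier --write .

# Bump the version before a release, e.g. the minor part; driven by .bumpversion.cfg
bump2version minor
```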
## UML Code-base class diagrams updated on 02/11/2023, generated with https://www.gituml.com (save the images and open them in the browser for zooming). + - Data processing classes and functions: - ML pipeline classes and functions: diff --git a/README.md b/README.md index 4820b55b7..d0ae4356b 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # Deeprank2 -| Badges | | -|:----:|----| -| **fairness** | [![fair-software.eu](https://img.shields.io/badge/fair--software.eu-%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F-green)](https://fair-software.eu) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/6403/badge)](https://bestpractices.coreinfrastructure.org/projects/6403) | -| **package** | [![PyPI version](https://badge.fury.io/py/deeprank2.svg)](https://badge.fury.io/py/deeprank2) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/f3f98b2d1883493ead50e3acaa23f2cc)](https://app.codacy.com/gh/DeepRank/deeprank2?utm_source=github.com&utm_medium=referral&utm_content=DeepRank/deeprank2&utm_campaign=Badge_Grade) | -| **docs** | [![Documentation Status](https://readthedocs.org/projects/deeprank2/badge/?version=latest)](https://deeprank2.readthedocs.io/en/latest/?badge=latest) [![DOI](https://zenodo.org/badge/450496579.svg)](https://zenodo.org/badge/latestdoi/450496579) | -| **tests** | [![Build Status](https://github.com/DeepRank/deeprank2/actions/workflows/build.yml/badge.svg)](https://github.com/DeepRank/deeprank2/actions) ![Linting status](https://github.com/DeepRank/deeprank2/actions/workflows/linting.yml/badge.svg?branch=main) [![Coverage Status](https://coveralls.io/repos/github/DeepRank/deeprank2/badge.svg?branch=main)](https://coveralls.io/github/DeepRank/deeprank2?branch=main) ![Python](https://img.shields.io/badge/python-3.10-blue.svg) ![Python](https://img.shields.io/badge/python-3.11-blue.svg) | -| **running on** | ![Ubuntu](https://img.shields.io/badge/Ubuntu-E95420?style=for-the-badge&logo=ubuntu&logoColor=white) | -| **license** | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/license/apache-2-0/) | +| Badges | | +| :------------: | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **fairness** | [![fair-software.eu](https://img.shields.io/badge/fair--software.eu-%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F-green)](https://fair-software.eu) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/6403/badge)](https://bestpractices.coreinfrastructure.org/projects/6403) | +| **package** | [![PyPI version](https://badge.fury.io/py/deeprank2.svg)](https://badge.fury.io/py/deeprank2) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/f3f98b2d1883493ead50e3acaa23f2cc)](https://app.codacy.com/gh/DeepRank/deeprank2?utm_source=github.com&utm_medium=referral&utm_content=DeepRank/deeprank2&utm_campaign=Badge_Grade) | +| **docs** | [![Documentation 
Status](https://readthedocs.org/projects/deeprank2/badge/?version=latest)](https://deeprank2.readthedocs.io/en/latest/?badge=latest) [![DOI](https://zenodo.org/badge/450496579.svg)](https://zenodo.org/badge/latestdoi/450496579) | +| **tests** | [![Build Status](https://github.com/DeepRank/deeprank2/actions/workflows/build.yml/badge.svg)](https://github.com/DeepRank/deeprank2/actions) ![Linting status](https://github.com/DeepRank/deeprank2/actions/workflows/linting.yml/badge.svg?branch=main) [![Coverage Status](https://coveralls.io/repos/github/DeepRank/deeprank2/badge.svg?branch=main)](https://coveralls.io/github/DeepRank/deeprank2?branch=main) ![Python](https://img.shields.io/badge/python-3.10-blue.svg) ![Python](https://img.shields.io/badge/python-3.11-blue.svg) | +| **running on** | ![Ubuntu](https://img.shields.io/badge/Ubuntu-E95420?style=for-the-badge&logo=ubuntu&logoColor=white) | +| **license** | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/license/apache-2-0/) | ## Overview @@ -18,6 +18,7 @@ DeepRank2 is an open-source deep learning (DL) framework for data mining of prot DeepRank2 allows for transformation of (pdb formatted) molecular data into 3D representations (either grids or graphs) containing structural and physico-chemical information, which can be used for training neural networks. DeepRank2 also offers a pre-implemented training pipeline, using either [CNNs](https://en.wikipedia.org/wiki/Convolutional_neural_network) (for grids) or [GNNs](https://en.wikipedia.org/wiki/Graph_neural_network) (for graphs), as well as output exporters for evaluating performances. Main features: + - Predefined atom-level and residue-level feature types - e.g. atom/residue type, charge, size, potential energy - All features' documentation is available [here](https://deeprank2.readthedocs.io/en/latest/features.html) @@ -58,19 +59,19 @@ The package officially supports ubuntu-latest OS only, whose functioning is wide Before installing deeprank2 you need to install some dependencies. We advise to use a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) with Python >= 3.10 installed. The following dependency installation instructions are updated as of 14/09/2023, but in case of issues during installation always refer to the official documentation which is linked below: -* [MSMS](https://anaconda.org/bioconda/msms): `conda install -c bioconda msms`. - * [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. -* [PyTorch](https://pytorch.org/get-started/locally/) - * We support torch's CPU library as well as CUDA. - * Currently, the package is tested using [PyTorch 2.0.1](https://pytorch.org/get-started/previous-versions/#v201). -* [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`. -* [DSSP 4](https://swift.cmbi.umcn.nl/gv/dssp/) - * Check if `dssp` is installed: `dssp --version`. If this gives an error or shows a version lower than 4: - * on ubuntu 22.04 or newer: `sudo apt-get install dssp`. If the package cannot be located, first run `sudo apt-get update`. - * on older versions of ubuntu or on mac or lacking sudo priviliges: install from [here](https://github.com/pdb-redo/dssp), following the instructions listed. Alternatively, follow [this](https://github.com/PDB-REDO/libcifpp/issues/49) thread. 
-* [GCC](https://gcc.gnu.org/install/) - * Check if gcc is installed: `gcc --version`. If this gives an error, run `sudo apt-get install gcc`. -* For MacOS with M1 chip users only install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html). +- [MSMS](https://anaconda.org/bioconda/msms): `conda install -c bioconda msms`. + - [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. +- [PyTorch](https://pytorch.org/get-started/locally/) + - We support torch's CPU library as well as CUDA. + - Currently, the package is tested using [PyTorch 2.0.1](https://pytorch.org/get-started/previous-versions/#v201). +- [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`. +- [DSSP 4](https://swift.cmbi.umcn.nl/gv/dssp/) + - Check if `dssp` is installed: `dssp --version`. If this gives an error or shows a version lower than 4: + - on ubuntu 22.04 or newer: `sudo apt-get install dssp`. If the package cannot be located, first run `sudo apt-get update`. + - on older versions of ubuntu or on mac or lacking sudo priviliges: install from [here](https://github.com/pdb-redo/dssp), following the instructions listed. Alternatively, follow [this](https://github.com/PDB-REDO/libcifpp/issues/49) thread. +- [GCC](https://gcc.gnu.org/install/) + - Check if gcc is installed: `gcc --version`. If this gives an error, run `sudo apt-get install gcc`. +- For MacOS with M1 chip users only install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html). ## Deeprank2 Package @@ -110,6 +111,7 @@ For more details, see the [extended documentation](https://deeprank2.rtfd.io/). For each protein-protein complex (or protein structure containing a missense variant), a `Query` can be created and added to the `QueryCollection` object, to be processed later on. Two subtypes of `Query` exist: `ProteinProteinInterfaceQuery` and `SingleResidueVariantQuery`. A `Query` takes as inputs: + - a `.pdb` file, representing the protein-protein structure, - the resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom, - the ids of the chains composing the structure, and @@ -350,10 +352,10 @@ For more details about how to run a pre-trained model on new data, see the [docs We measured the efficiency of data generation in DeepRank2 using the tutorials' [PDB files](https://zenodo.org/record/8187806) (~100 data points per data set), averaging the results run on Apple M1 Pro, using a single CPU. Parameter settings were: atomic resolution, `distance_cutoff` of 5.5 Å, radius (for SRV only) of 10 Å. The [features modules](https://deeprank2.readthedocs.io/en/latest/features.html) used were `components`, `contact`, `exposure`, `irc`, `secondary_structure`, `surfacearea`, for a total of 33 features for PPIs and 26 for SRVs (the latter do not use `irc` features). -| | Data processing speed
<br />[seconds/structure] | Memory <br />[megabyte/structure] |
-|------|:--------------------------------------------------------:|:--------------------------------------------------------:|
+| | Data processing speed <br />[seconds/structure] | Memory <br />[megabyte/structure] |
+| ---- | :--------------------------------------------------------------------: | :--------------------------------------------------------------------: |
| PPIs | graph only: **2.99** (std 0.23) <br />graph+grid: **11.35** (std 1.30) | graph only: **0.54** (std 0.07) <br />graph+grid: **16.09** (std 0.44) |
-| SRVs | graph only: **2.20** (std 0.08) <br />graph+grid: **2.85** (std 0.10) | graph only: **0.05** (std 0.01) <br />graph+grid: **17.52** (std 0.59) |
+| SRVs | graph only: **2.20** (std 0.08) <br />graph+grid: **2.85** (std 0.10) | graph only: **0.05** (std 0.01) <br />
graph+grid: **17.52** (std 0.59) | ## Package development diff --git a/docs/features.md b/docs/features.md index 32b052bf6..de3e9e639 100644 --- a/docs/features.md +++ b/docs/features.md @@ -2,7 +2,6 @@ Features implemented in the code-base are defined in `deeprank2.feature` subpackage. - ## Custom features Users can add custom features by creating a new module and placing it in `deeprank2.feature` subpackage. One requirement for any feature module is to implement an `add_features` function, as shown below. This will be used in `deeprank2.models.query` to add the features to the nodes or edges of the graph. @@ -24,12 +23,15 @@ def add_features( The following is a brief description of the features already implemented in the code-base, for each features' module. ## Default node features + For atomic graphs, when features relate to residues then _all_ atoms of one residue receive the feature value for that residue. ### Core properties of atoms and residues: `deeprank2.features.components` + These features relate to the chemical components (atoms and amino acid residues) of which the graph is composed. Detailed information and descrepancies between sources are described can be found in `deeprank2.domain.aminoacidlist.py`. #### Atom properties: + These features are only used in atomic graphs. - `atom_type`: One-hot encoding of the atomic element. Options are: C, O, N, S, P, H. @@ -37,6 +39,7 @@ These features are only used in atomic graphs. - `pdb_occupancy`: Proportion of structures where the atom was detected at this position (float). In some cases a single atom was detected at different positions, in which case separate structures exist whose occupancies sum to 1. Only the highest occupancy atom is used by deeprank2. #### Residue properties: + - `res_type`: One-hot encoding of the amino acid residue (size 20). - `polarity`: One-hot encoding of the polarity of the amino acid (options: NONPOLAR, POLAR, NEGATIVE, POSITIVE). Note that sources vary on the polarity for few of the amino acids; see detailed information in `deeprank2.domain.aminoacidlist.py`. - `res_size`: The number of non-hydrogen atoms in the side chain (int). @@ -47,28 +50,32 @@ These features are only used in atomic graphs. - `hb_donors`, `hb_acceptors`: The number of hydrogen bond donor/acceptor atoms in the residue (int). Hydrogen bonds are noncovalent intermolecular interactions formed between an hydrogen atom (partially positively charged) bound to a small, highly electronegative atom (O, N, F) with an unshared electron pair. #### Properties related to variant residues: + These features are only used in SingleResidueVariant queries. - `variant_res`: One-hot encoding of variant amino acid (size 20). - `diff_charge`, `diff_polarity`, `diff_size`, `diff_mass`, `diff_pI`, `diff_hb_donors`, `diff_hb_acceptors`: Subtraction of the wildtype value of indicated feature from the variant value. For example, if the variant has 4 hb_donors and the wildtype has 5, then `diff_hb_donors == -1`. ### Conservation features: `deeprank2.features.conservation` + These features relate to the conservation state of individual residues. - `pssm`: [Position-specific scoring matrix](https://en.wikipedia.org/wiki/Position_weight_matrix) (also known as position weight matrix, PWM) values relative to the residue, is a score of the conservation of the amino acid along all 20 amino acids. - `info_content`: Information content is the difference between the given PSSM for an amino acid and a uniform distribution (float). 
-- `conservation` (only used in SingleResidueVariant queries): Conservation of the wild type amino acid (float). *More details required.* +- `conservation` (only used in SingleResidueVariant queries): Conservation of the wild type amino acid (float). _More details required._ - `diff_conservation` (only used in SingleResidueVariant queries): Subtraction of wildtype conservation from the variant conservation (float). ### Protein context features: #### Surface exposure: `deeprank2.features.exposure` + These features relate to the exposure of residues to the surface, and are computed using [biopython](https://biopython.org/docs/1.81/api/Bio.PDB.html). Note that these features can only be calculated per residue and not per atom. - `res_depth`: [Residue depth](https://en.wikipedia.org/wiki/Residue_depth) is the average distance (in Å) of the residue to the closest molecule of bulk water (float). See also [`Bio.PDB.ResidueDepth`](https://biopython.org/docs/1.75/api/Bio.PDB.ResidueDepth.html). - `hse`: [Half sphere exposure (HSE)](https://en.wikipedia.org/wiki/Half_sphere_exposure) is a protein solvent exposure measure indicating how buried an amino acid residue is in a protein (3 float values, see [Bio.PDB.HSExposure](https://biopython.org/docs/dev/api/Bio.PDB.HSExposure.html#module-Bio.PDB.HSExposure) for details). #### Surface accessibility: `deeprank2.features.surfacearea` + These features relate to the surface area of the residue, and are computed using [freesasa](https://freesasa.github.io). Note that these features can only be calculated per residue and not per atom. - `sasa`: [Solvent-Accessible Surface Area](https://en.wikipedia.org/wiki/Accessible_surface_area) is the surface area (in Å^2) of a biomolecule that is accessible to the solvent (float). @@ -76,31 +83,36 @@ These features relate to the surface area of the residue, and are computed using #### Secondary structure: `deeprank2.features.secondary_structure` -- `sec_struct`: One-hot encoding of the [DSSP](https://en.wikipedia.org/wiki/DSSP_(algorithm)) assigned secondary structure of the amino acid, using the three major classes (HELIX, STRAND, COIL). Calculated using [DSSP4](https://github.com/PDB-REDO/dssp). +- `sec_struct`: One-hot encoding of the [DSSP]() assigned secondary structure of the amino acid, using the three major classes (HELIX, STRAND, COIL). Calculated using [DSSP4](https://github.com/PDB-REDO/dssp). #### Inter-residue contacts (IRCs): `deeprank2.features.irc` + These features are only calculated for ProteinProteinInterface queries. - `irc_total`: The number of residues on the other chain that are within a cutoff distance of 5.5 Å (int). - `irc_nonpolar_nonpolar`, `irc_nonpolar_polar`, `irc_nonpolar_negative`, `irc_nonpolar_positive`, `irc_polar_polar`, `irc_polar_negative`, `irc_polar_positive`, `irc_negative_negative`, `irc_positive_positive`, `irc_negative_positive`: As above, but for specific residue polarity pairings. - ## Default edge features ### Contact features: `deeprank2.features.contact` + These features relate to relationships between individual nodes. For atomic graphs, when features relate to residues then _all_ atoms of one residue receive the feature value for that residue. #### Distance: + - `distance`: Interatomic distance between atoms in Å, computed from the xyz atomic coordinates taken from the .pdb file (float). For residue graphs, the the minimum distance between any atom of each residues is used. #### Structure: + These features relate to the structural relationship between nodes. 
+ - `same_chain`: Boolean indicating whether the edge connects nodes belonging to the same chain (1) or separate chains (0). - `same_res`: Boolean indicating whether atoms belong to the same residue (1) or separate residues (0). Only used in atomic graphs. - `covalent`: Boolean indicating whether nodes are covalently bound (1) or not (0). Note that covalency is not directly assessed, but any edge with a maximum distance of 2.1 Å is considered covalent. #### Nonbond energies: + These features measure nonbond energy potentials between nodes. For residue graphs, the pairwise sum of potentials for all atoms from each residue is used. Note that no distance cutoff is used and the radius of influence is assumed to be infinite, although the potentials tends to 0 at large distance. Also edges are only assigned within a given cutoff radius when graphs are created. Nonbond energies are set to 0 for any atom pairs (on the same chain) that are within a cutoff radius of 3.6 Å, as these are assumed to be covalent neighbors or linked by no more than 2 covalent bonds (i.e. 1-3 pairs). diff --git a/docs/getstarted.md b/docs/getstarted.md index 231e309d5..bb5d3c311 100644 --- a/docs/getstarted.md +++ b/docs/getstarted.md @@ -9,6 +9,7 @@ For more details, see the [extended documentation](https://deeprank2.rtfd.io/). For each protein-protein complex (or protein structure containing a missense variant), a `Query` can be created and added to the `QueryCollection` object, to be processed later on. Two subtypes of `Query` exist: `ProteinProteinInterfaceQuery` and `SingleResidueVariantQuery`. A `Query` takes as inputs: + - a `.pdb` file, representing the protein-protein structure, - the resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom, - the ids of the chains composing the structure, and diff --git a/docs/installation.md b/docs/installation.md index a979ed550..0b40d3197 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -6,19 +6,18 @@ The package officially supports ubuntu-latest OS only, whose functioning is wide Before installing deeprank2 you need to install some dependencies. We advise to use a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) with Python >= 3.10 installed. The following dependency installation instructions are updated as of 14/09/2023, but in case of issues during installation always refer to the official documentation which is linked below: -* [MSMS](https://anaconda.org/bioconda/msms): `conda install -c bioconda msms`. - * [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. -* [PyTorch](https://pytorch.org/get-started/locally/) - * We support torch's CPU library as well as CUDA. -* [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`. -* [DSSP 4](https://swift.cmbi.umcn.nl/gv/dssp/) - * Check if `dssp` is installed: `dssp --version`. If this gives an error or shows a version lower than 4: - * on ubuntu 22.04 or newer: `sudo apt-get install dssp`. If the package cannot be located, first run `sudo apt-get update`. - * on older versions of ubuntu or on mac or lacking sudo priviliges: install from [here](https://github.com/pdb-redo/dssp), following the instructions listed. -* [GCC](https://gcc.gnu.org/install/) - * Check if gcc is installed: `gcc --version`. 
If this gives an error, run `sudo apt-get install gcc`. -* For MacOS with M1 chip users only install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html). - +- [MSMS](https://anaconda.org/bioconda/msms): `conda install -c bioconda msms`. + - [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users. +- [PyTorch](https://pytorch.org/get-started/locally/) + - We support torch's CPU library as well as CUDA. +- [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`. +- [DSSP 4](https://swift.cmbi.umcn.nl/gv/dssp/) +- Check if `dssp` is installed: `dssp --version`. If this gives an error or shows a version lower than 4: +- on ubuntu 22.04 or newer: `sudo apt-get install dssp`. If the package cannot be located, first run `sudo apt-get update`. +- on older versions of ubuntu or on mac or lacking sudo priviliges: install from [here](https://github.com/pdb-redo/dssp), following the instructions listed. +- [GCC](https://gcc.gnu.org/install/) +- Check if gcc is installed: `gcc --version`. If this gives an error, run `sudo apt-get install gcc`. +- For MacOS with M1 chip users only install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html). ## Deeprank2 Package diff --git a/paper/paper.bib b/paper/paper.bib index b7ba32e15..350a6cac5 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -256,4 +256,4 @@ @article{modeller author={Sanchez, R and Sali, A}, journal={Google Scholar There is no corresponding record for this reference}, year={1997} -} \ No newline at end of file +} diff --git a/paper/paper.md b/paper/paper.md index a8dbbb0f5..13a10b3ae 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,5 +1,5 @@ --- -title: 'DeepRank2: Mining 3D Protein Structures with Geometric Deep Learning' +title: "DeepRank2: Mining 3D Protein Structures with Geometric Deep Learning" tags: - Python - PyTorch @@ -44,18 +44,18 @@ authors: orcid: 0000-0002-2613-538X affiliation: 2 affiliations: - - name: Netherlands eScience Center, Amsterdam, The Netherlands - index: 1 - - name: Department of Medical BioSciences, Radboud University Medical Center, Nijmegen, The Netherlands - index: 2 - - name: Independent Researcher - index: 3 + - name: Netherlands eScience Center, Amsterdam, The Netherlands + index: 1 + - name: Department of Medical BioSciences, Radboud University Medical Center, Nijmegen, The Netherlands + index: 2 + - name: Independent Researcher + index: 3 date: 08 August 2023 bibliography: paper.bib - --- # Summary + [comment]: <> (CHECK FOR AUTHORS: Do the summary describe the high-level functionality and purpose of the software for a diverse, non-specialist audience?) We present DeepRank2, a deep learning (DL) framework geared towards making predictions on 3D protein structures for variety of biologically relevant applications. Our software can be used for predicting structural properties in drug design, immunotherapy, or designing novel proteins, among other fields. DeepRank2 allows for transformation and storage of 3D representations of both protein-protein interfaces (PPIs) and protein single-residue variants (SRVs) into either graphs or volumetric grids containing structural and physico-chemical information. 
These can be used for training neural networks for a variety of patterns of interest, using either our pre-implemented training pipeline for graph neural networks (GNNs) or convolutional neural networks (CNNs) or external pipelines. The entire framework flowchart is visualized in \autoref{fig:flowchart}. The package is fully open-source, follows the community-endorsed FAIR principles for research software, provides user-friendly APIs, publicily available [documentation](https://deeprank2.readthedocs.io/en/latest/), and in-depth [tutorials](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). @@ -88,8 +88,9 @@ These limitations create a growing demand for a generic and flexible DL framewor DeepRank2 allows to transform and store 3D representations of both PPIs and SRVs into 3D grids or graphs containing both geometric and physico-chemical information, and provides a DL pipeline that can be used for training pre-implemented neural networks for a given pattern of interest to the user. DeepRank2 is an improved and unified version of three previously developed packages: [DeepRank](https://github.com/DeepRank/deeprank), [DeepRank-GNN](https://github.com/DeepRank/Deeprank-GNN), and [DeepRank-Mut](https://github.com/DeepRank/DeepRank-Mut). As input, DeepRank2 takes [PDB-formatted](https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html) atomic structures, which is one of the standard and most widely used formats in the field of structural biology. These are mapped to graphs, where nodes can represent either residues or atoms, as chosen by the user, and edges represent the interactions between them. The user can configure two types of 3D structures as input for the featurization phase: -- PPIs, for mining interaction patterns within protein-protein complexes; -- SRVs, for mining mutation phenotypes within protein structures. + +- PPIs, for mining interaction patterns within protein-protein complexes; +- SRVs, for mining mutation phenotypes within protein structures. The physico-chemical and geometrical features are then computed and assigned to each node and edge. The user can choose which features to generate from several pre-existing options defined in the package, or define custom features modules, as explained in the documentation. Examples of pre-defined node features are the type of the amino acid, its size and polarity, as well as more complex features such as its buried surface area and secondary structure features. Examples of pre-defined edge features are distance, covalency, and potential energy. A detailed list of predefined features can be found in the [documentation's features page](https://deeprank2.readthedocs.io/en/latest/features.html). Graphs can either be used directly or mapped to volumetric grids (i.e., 3D image-like representations), together with their features. Multiple CPUs can be used to parallelize and speed up the featurization process. The processed data are saved into HDF5 files, designed to efficiently store and organize big data. Users can then use the data for any ML or DL framework suited for the application. Specifically, graphs can be used for the training of GNNs, and 3D grids can be used for the training of CNNs. 
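As an illustration of the workflow described above, below is a minimal, hypothetical sketch of the featurization step; the file path, chain IDs, and target values are placeholders, and the same API is shown in full in the tutorials and in the test-data notebook that follows:

```python
from deeprank2.query import ProteinProteinInterfaceQuery, QueryCollection

# Collect the PDB-formatted structures to be featurized.
queries = QueryCollection()
queries.add(
    ProteinProteinInterfaceQuery(
        pdb_path="example_complex.pdb",  # placeholder input structure
        resolution="residue",  # each node is a residue; use "atom" for atomic graphs
        chain_ids=["A", "B"],  # placeholder chains forming the interface
        targets={"binary": 1},  # placeholder target value(s) used later for training
    )
)

# Compute node and edge features and write the graphs to HDF5 files;
# grids can additionally be produced via the grid_settings/grid_map_method arguments.
output_paths = queries.process(cpu_count=1, prefix="example_ppi")
```

The resulting HDF5 files can then be loaded through the package's dataset classes (e.g. `GraphDataset` or `GridDataset`) and passed to the `Trainer` class for GNN or CNN training, as described in the documentation.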
diff --git a/tests/data/hdf5/_generate_testdata.ipynb b/tests/data/hdf5/_generate_testdata.ipynb index 617b4a8f1..b2fc2677f 100644 --- a/tests/data/hdf5/_generate_testdata.ipynb +++ b/tests/data/hdf5/_generate_testdata.ipynb @@ -7,31 +7,36 @@ "outputs": [], "source": [ "from pathlib import Path\n", + "\n", "import pkg_resources as pkg\n", + "\n", "PATH_DEEPRANK_CORE = Path(pkg.resource_filename(\"deeprank2\", \"\"))\n", "ROOT = PATH_DEEPRANK_CORE.parent\n", "PATH_TEST = ROOT / \"tests\"\n", - "from deeprank2.query import (\n", - " QueryCollection,\n", - " ProteinProteinInterfaceQuery,\n", - " SingleResidueVariantQuery)\n", - "from deeprank2.tools.target import compute_ppi_scores\n", - "from deeprank2.dataset import save_hdf5_keys\n", - "from deeprank2.domain.aminoacidlist import alanine, phenylalanine\n", "import glob\n", "import os\n", "import re\n", "import sys\n", + "\n", "import h5py\n", "import numpy as np\n", - "import pandas as pd" + "import pandas as pd\n", + "\n", + "from deeprank2.dataset import save_hdf5_keys\n", + "from deeprank2.domain.aminoacidlist import alanine, phenylalanine\n", + "from deeprank2.query import (\n", + " ProteinProteinInterfaceQuery,\n", + " QueryCollection,\n", + " SingleResidueVariantQuery,\n", + ")\n", + "from deeprank2.tools.target import compute_ppi_scores" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- Generating 1ATN_ppi.hdf5" + "- Generating 1ATN_ppi.hdf5\n" ] }, { @@ -41,7 +46,9 @@ "outputs": [], "source": [ "import warnings\n", + "\n", "from Bio import BiopythonWarning\n", + "\n", "from deeprank2.utils.grid import GridSettings, MapMethod\n", "\n", "with warnings.catch_warnings():\n", @@ -57,37 +64,38 @@ " str(PATH_TEST / \"data/pdb/1ATN/1ATN_1w.pdb\"),\n", " str(PATH_TEST / \"data/pdb/1ATN/1ATN_2w.pdb\"),\n", " str(PATH_TEST / \"data/pdb/1ATN/1ATN_3w.pdb\"),\n", - " str(PATH_TEST / \"data/pdb/1ATN/1ATN_4w.pdb\")]\n", + " str(PATH_TEST / \"data/pdb/1ATN/1ATN_4w.pdb\"),\n", + " ]\n", "\n", " queries = QueryCollection()\n", "\n", " for pdb_path in pdb_paths:\n", " # Append data points\n", " targets = compute_ppi_scores(pdb_path, ref_path)\n", - " queries.add(ProteinProteinInterfaceQuery(\n", - " pdb_path = pdb_path,\n", - " resolution = \"residue\",\n", - " chain_ids = [chain_id1, chain_id2],\n", - " targets = targets,\n", - " pssm_paths = {\n", - " chain_id1: pssm_path1,\n", - " chain_id2: pssm_path2\n", - " }\n", - " ))\n", + " queries.add(\n", + " ProteinProteinInterfaceQuery(\n", + " pdb_path=pdb_path,\n", + " resolution=\"residue\",\n", + " chain_ids=[chain_id1, chain_id2],\n", + " targets=targets,\n", + " pssm_paths={chain_id1: pssm_path1, chain_id2: pssm_path2},\n", + " )\n", + " )\n", "\n", " # Generate graphs and save them in hdf5 files\n", - " output_paths = queries.process(cpu_count=1,\n", - " prefix='1ATN_ppi',\n", - " grid_settings=GridSettings([20, 20, 20], [20.0, 20.0, 20.0]),\n", - " grid_map_method=MapMethod.GAUSSIAN,\n", - " )" + " output_paths = queries.process(\n", + " cpu_count=1,\n", + " prefix=\"1ATN_ppi\",\n", + " grid_settings=GridSettings([20, 20, 20], [20.0, 20.0, 20.0]),\n", + " grid_map_method=MapMethod.GAUSSIAN,\n", + " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- Generating residue.hdf5" + "- Generating residue.hdf5\n" ] }, { @@ -97,54 +105,53 @@ "outputs": [], "source": [ "# Local data\n", - "project_folder = '/home/dbodor/git/DeepRank/deeprank-core/tests/data/sample_25_07122022/'\n", - "csv_file_name = 'BA_pMHCI_human_quantitative.csv'\n", - "models_folder_name = 
'exp_nmers_all_HLA_quantitative'\n", - "data = 'pMHCI'\n", - "resolution = 'residue' # either 'residue' or 'atom'\n", - "influence_radius = 15 # max distance in Å between two interacting residues/atoms of two proteins\n", - "max_edge_length = 15 # max distance in Å between to create an edge\n", + "project_folder = \"/home/dbodor/git/DeepRank/deeprank-core/tests/data/sample_25_07122022/\"\n", + "csv_file_name = \"BA_pMHCI_human_quantitative.csv\"\n", + "models_folder_name = \"exp_nmers_all_HLA_quantitative\"\n", + "data = \"pMHCI\"\n", + "resolution = \"residue\" # either 'residue' or 'atom'\n", + "influence_radius = 15 # max distance in Å between two interacting residues/atoms of two proteins\n", + "max_edge_length = 15 # max distance in Å between to create an edge\n", "\n", - "csv_file_path = f'{project_folder}data/external/processed/I/{csv_file_name}'\n", - "models_folder_path = f'{project_folder}data/{data}/features_input_folder/{models_folder_name}'\n", + "csv_file_path = f\"{project_folder}data/external/processed/I/{csv_file_name}\"\n", + "models_folder_path = f\"{project_folder}data/{data}/features_input_folder/{models_folder_name}\"\n", "\n", - "pdb_files = glob.glob(os.path.join(models_folder_path + '/pdb', '*.pdb'))\n", + "pdb_files = glob.glob(os.path.join(models_folder_path + \"/pdb\", \"*.pdb\"))\n", "pdb_files.sort()\n", - "print(f'{len(pdb_files)} pdbs found.')\n", - "pssm_m = glob.glob(os.path.join(models_folder_path + '/pssm', '*.M.*.pssm'))\n", + "print(f\"{len(pdb_files)} pdbs found.\")\n", + "pssm_m = glob.glob(os.path.join(models_folder_path + \"/pssm\", \"*.M.*.pssm\"))\n", "pssm_m.sort()\n", - "print(f'{len(pssm_m)} MHC pssms found.')\n", - "pssm_p = glob.glob(os.path.join(models_folder_path + '/pssm', '*.P.*.pssm'))\n", + "print(f\"{len(pssm_m)} MHC pssms found.\")\n", + "pssm_p = glob.glob(os.path.join(models_folder_path + \"/pssm\", \"*.P.*.pssm\"))\n", "pssm_p.sort()\n", - "print(f'{len(pssm_p)} peptide pssms found.')\n", + "print(f\"{len(pssm_p)} peptide pssms found.\")\n", "csv_data = pd.read_csv(csv_file_path)\n", "csv_data.cluster = csv_data.cluster.fillna(-1)\n", - "pdb_ids_csv = [pdb_file.split('/')[-1].split('.')[0].replace('-', '_') for pdb_file in pdb_files]\n", - "clusters = [csv_data[csv_data.ID == pdb_id].cluster.values[0] for pdb_id in pdb_ids_csv]\n", - "bas = [csv_data[csv_data.ID == pdb_id].measurement_value.values[0] for pdb_id in pdb_ids_csv]\n", + "pdb_ids_csv = [pdb_file.split(\"/\")[-1].split(\".\")[0].replace(\"-\", \"_\") for pdb_file in pdb_files]\n", + "clusters = [csv_data[pdb_id == csv_data.ID].cluster.values[0] for pdb_id in pdb_ids_csv]\n", + "bas = [csv_data[pdb_id == csv_data.ID].measurement_value.values[0] for pdb_id in pdb_ids_csv]\n", "\n", "queries = QueryCollection()\n", - "print(f'Adding {len(pdb_files)} queries to the query collection ...')\n", + "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", "for i in range(len(pdb_files)):\n", " queries.add(\n", " ProteinProteinInterfaceQuery(\n", - " pdb_path = pdb_files[i],\n", - " resolution = \"residue\",\n", - " chain_ids = [\"M\", \"P\"],\n", - " influence_radius = influence_radius,\n", - " max_edge_length = max_edge_length,\n", - " targets = {\n", - " 'binary': int(float(bas[i]) <= 500), # binary target value\n", - " 'BA': bas[i], # continuous target value\n", - " 'cluster': clusters[i]\n", - " },\n", - " pssm_paths = {\n", - " \"M\": pssm_m[i],\n", - " \"P\": pssm_p[i]\n", - " }))\n", - "print(f'Queries created and ready to be processed.\\n')\n", + " 
pdb_path=pdb_files[i],\n", + " resolution=\"residue\",\n", + " chain_ids=[\"M\", \"P\"],\n", + " influence_radius=influence_radius,\n", + " max_edge_length=max_edge_length,\n", + " targets={\n", + " \"binary\": int(float(bas[i]) <= 500), # binary target value\n", + " \"BA\": bas[i], # continuous target value\n", + " \"cluster\": clusters[i],\n", + " },\n", + " pssm_paths={\"M\": pssm_m[i], \"P\": pssm_p[i]},\n", + " )\n", + " )\n", + "print(\"Queries created and ready to be processed.\\n\")\n", "\n", - "output_paths = queries.process(prefix='residue')\n", + "output_paths = queries.process(prefix=\"residue\")\n", "print(output_paths)" ] }, @@ -153,7 +160,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "- Generating train.hdf5, valid.hdf5, test.hdf5" + "- Generating train.hdf5, valid.hdf5, test.hdf5\n" ] }, { @@ -163,20 +170,19 @@ "outputs": [], "source": [ "# dividing hdf5 file in train, valid, test\n", - "hdf5_path = 'residue.hdf5'\n", + "hdf5_path = \"residue.hdf5\"\n", "train_clusters = [3, 4, 5, 2]\n", "val_clusters = [1, 8]\n", "test_clusters = [6]\n", - "target = 'target_values'\n", - "feature = 'cluster'\n", + "target = \"target_values\"\n", + "feature = \"cluster\"\n", "\n", "clusters = {}\n", "train_ids = []\n", "val_ids = []\n", "test_ids = []\n", "\n", - "with h5py.File(hdf5_path, 'r') as hdf5:\n", - "\n", + "with h5py.File(hdf5_path, \"r\") as hdf5:\n", " for key in hdf5.keys():\n", " feature_value = float(hdf5[key][target][feature][()])\n", " if feature_value in train_clusters:\n", @@ -191,24 +197,23 @@ " else:\n", " clusters[int(feature_value)] = 1\n", "\n", + " print(f\"Trainset contains {len(train_ids)} data points, {round(100*len(train_ids)/len(hdf5.keys()), 2)}% of the total data.\")\n", + " print(f\"Validation set contains {len(val_ids)} data points, {round(100*len(val_ids)/len(hdf5.keys()), 2)}% of the total data.\")\n", + " print(f\"Test set contains {len(test_ids)} data points, {round(100*len(test_ids)/len(hdf5.keys()), 2)}% of the total data.\\n\")\n", "\n", - " print(f'Trainset contains {len(train_ids)} data points, {round(100*len(train_ids)/len(hdf5.keys()), 2)}% of the total data.')\n", - " print(f'Validation set contains {len(val_ids)} data points, {round(100*len(val_ids)/len(hdf5.keys()), 2)}% of the total data.')\n", - " print(f'Test set contains {len(test_ids)} data points, {round(100*len(test_ids)/len(hdf5.keys()), 2)}% of the total data.\\n')\n", - "\n", - " for (key, value) in dict(sorted(clusters.items(), key=lambda x:x[1], reverse=True)).items():\n", - " print(f'Group with value {key}: {value} data points, {round(100*value/len(hdf5.keys()), 2)}% of total data.')\n", + " for key, value in dict(sorted(clusters.items(), key=lambda x: x[1], reverse=True)).items():\n", + " print(f\"Group with value {key}: {value} data points, {round(100*value/len(hdf5.keys()), 2)}% of total data.\")\n", "\n", - "save_hdf5_keys(hdf5_path, train_ids, 'train.hdf5', hardcopy = True)\n", - "save_hdf5_keys(hdf5_path, val_ids, 'valid.hdf5', hardcopy = True)\n", - "save_hdf5_keys(hdf5_path, test_ids, 'test.hdf5', hardcopy = True)" + "save_hdf5_keys(hdf5_path, train_ids, \"train.hdf5\", hardcopy=True)\n", + "save_hdf5_keys(hdf5_path, val_ids, \"valid.hdf5\", hardcopy=True)\n", + "save_hdf5_keys(hdf5_path, test_ids, \"test.hdf5\", hardcopy=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- Generating variants.hdf5" + "- Generating variants.hdf5\n" ] }, { @@ -225,21 +230,19 @@ "\n", "for number in range(1, count_queries + 1):\n", " query = 
SingleResidueVariantQuery(\n", - "		pdb_path = pdb_path,\n", - "		resolution = \"residue\",\n", - "		chain_ids = \"A\",\n", - "		variant_residue_number = number,\n", - "		insertion_code = None,\n", - "		wildtype_amino_acid = alanine,\n", - "		variant_amino_acid = phenylalanine,\n", - "		pssm_paths = {\n", - "			\"A\": str(PATH_TEST / \"data/pssm/3C8P/3C8P.A.pdb.pssm\"),\n", - "			\"B\": str(PATH_TEST / \"data/pssm/3C8P/3C8P.B.pdb.pssm\")},\n", - "		targets = targets\n", + "        pdb_path=pdb_path,\n", + "        resolution=\"residue\",\n", + "        chain_ids=\"A\",\n", + "        variant_residue_number=number,\n", + "        insertion_code=None,\n", + "        wildtype_amino_acid=alanine,\n", + "        variant_amino_acid=phenylalanine,\n", + "        pssm_paths={\"A\": str(PATH_TEST / \"data/pssm/3C8P/3C8P.A.pdb.pssm\"), \"B\": str(PATH_TEST / \"data/pssm/3C8P/3C8P.B.pdb.pssm\")},\n", + "        targets=targets,\n", "    )\n", "    queries.add(query)\n", "\n", - "output_paths = queries.process(cpu_count = 1, prefix='variants')" + "output_paths = queries.process(cpu_count=1, prefix=\"variants\")" ] }, { @@ -247,7 +250,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "- Generating atom.hdf5" + "- Generating atom.hdf5\n" ] }, { @@ -265,26 +268,22 @@ " str(PATH_TEST / \"data/pdb/1ATN/1ATN_1w.pdb\"),\n", " str(PATH_TEST / \"data/pdb/1ATN/1ATN_2w.pdb\"),\n", " str(PATH_TEST / \"data/pdb/1ATN/1ATN_3w.pdb\"),\n", - " str(PATH_TEST / \"data/pdb/1ATN/1ATN_4w.pdb\")]\n", + " str(PATH_TEST / \"data/pdb/1ATN/1ATN_4w.pdb\"),\n", + "]\n", "\n", "queries = QueryCollection()\n", "\n", "for pdb_path in pdb_paths:\n", "    # Append data points\n", "    targets = compute_ppi_scores(pdb_path, ref_path)\n", - "    queries.add(ProteinProteinInterfaceQuery(\n", - "        pdb_path = pdb_path,\n", - "        resolution=\"atom\",\n", - "        chain_ids = [chain_id1, chain_id2],\n", - "        targets = targets,\n", - "        pssm_paths = {\n", - "            chain_id1: pssm_path1,\n", - "            chain_id2: pssm_path2\n", - "        }\n", - "    ))\n", + "    queries.add(\n", + "        ProteinProteinInterfaceQuery(\n", + "            pdb_path=pdb_path, resolution=\"atom\", chain_ids=[chain_id1, chain_id2], targets=targets, pssm_paths={chain_id1: pssm_path1, chain_id2: pssm_path2}\n", + "        )\n", + "    )\n", "\n", "# Generate graphs and save them in hdf5 files\n", - "output_paths = queries.process(cpu_count=1, prefix = 'atom')" + "output_paths = queries.process(cpu_count=1, prefix=\"atom\")" ] } ], diff --git a/tutorials/TUTORIAL.md b/tutorials/TUTORIAL.md index 5ede2257a..d41135c2e 100644 --- a/tutorials/TUTORIAL.md +++ b/tutorials/TUTORIAL.md @@ -1,9 +1,10 @@ ## Introduction The tutorial notebooks in this folder can be run to learn how to use DeepRank2. -- There are two tutorial notebooks for data generation, which demonstrate how to create *.hdf5-formatted input training data from raw *.pdb-formatted data using DeepRank2. + +- There are two tutorial notebooks for data generation, which demonstrate how to create \*.hdf5-formatted input training data from raw \*.pdb-formatted data using DeepRank2. - protein-protein interface (PPI) data ([data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb)); - - single-residue variant (SRV) data ([data_generation_srv.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_srv.ipynb)). + - single-residue variant (SRV) data ([data_generation_srv.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_srv.ipynb)). - The [training tutorial](tutorials/training_ppi.ipynb) will demonstrate how to train neural networks using DeepRank2.
### Use cases @@ -35,9 +36,9 @@ PDB models and target data used in this tutorial have been retrieved from [Ramak - Navigate to your deeprank2 folder. - Run `pytest tests`. All tests should pass at this point. - ## Running the notebooks The tutorial notebooks can be run: + - from inside your IDE, if it has that functionality (e.g., VS Code), - on JupyterLab by navigating to the tutorials directory in your terminal and running `jupyter-lab`. diff --git a/tutorials/data_generation_ppi.ipynb b/tutorials/data_generation_ppi.ipynb index 1dcc41972..2bcc213d1 100644 --- a/tutorials/data_generation_ppi.ipynb +++ b/tutorials/data_generation_ppi.ipynb @@ -5,7 +5,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Data preparation for protein-protein interfaces" + "# Data preparation for protein-protein interfaces\n" ] }, { @@ -17,9 +17,9 @@ "\n", "\n", "\n", - "This tutorial will demonstrate the use of DeepRank2 for generating protein-protein interface (PPI) graphs and saving them as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format) files, using [PBD files](https://en.wikipedia.org/wiki/Protein_Data_Bank_(file_format)) of protein-protein complexes as input.\n", + "This tutorial will demonstrate the use of DeepRank2 for generating protein-protein interface (PPI) graphs and saving them as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), using [PDB files](https://en.wikipedia.org/wiki/Protein_Data_Bank_(file_format)) of protein-protein complexes as input.\n", "\n", - "In this data processing phase, for each protein-protein complex an interface is selected according to a distance threshold that the user can customize, and it is mapped to a graph. Nodes either represent residues or atoms, and edges are the interactions between them. Each node and edge can have several different features, which are generated and added during the processing phase as well. Optionally, the graphs can be mapped to volumetric grids (i.e., 3D image-like representations), together with their features. The mapped data are finally saved into HDF5 files, and can be used for later models' training (for details go to [training_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/training_ppi.ipynb) tutorial). In particular, graphs can be used for the training of Graph Neural Networks (GNNs), and grids can be used for the training of Convolutional Neural Networks (CNNs)." + "In this data processing phase, for each protein-protein complex an interface is selected according to a distance threshold that the user can customize, and it is mapped to a graph. Nodes either represent residues or atoms, and edges are the interactions between them. Each node and edge can have several different features, which are generated and added during the processing phase as well. Optionally, the graphs can be mapped to volumetric grids (i.e., 3D image-like representations), together with their features. The mapped data are finally saved into HDF5 files, and can be used for later models' training (for details go to [training_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/training_ppi.ipynb) tutorial). In particular, graphs can be used for the training of Graph Neural Networks (GNNs), and grids can be used for the training of Convolutional Neural Networks (CNNs).\n" ] }, { @@ -31,7 +31,7 @@ "\n", "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/8349335). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`.
Unzip it, and save the `data_raw/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", "\n", - "Note that the dataset contains only 100 data points, which is not enough to develop an impactful predictive model, and the scope of its use is indeed only demonstrative and informative for the users." + "Note that the dataset contains only 100 data points, which is not enough to develop an impactful predictive model, and the scope of its use is indeed only demonstrative and informative for the users.\n" ] }, { @@ -39,7 +39,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Utilities" + "## Utilities\n" ] }, { @@ -47,7 +47,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Libraries" + "### Libraries\n" ] }, { @@ -55,7 +55,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The libraries needed for this tutorial:" + "The libraries needed for this tutorial:\n" ] }, { @@ -82,7 +82,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Raw files and paths" + "### Raw files and paths\n" ] }, { @@ -90,7 +90,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The paths for reading raw data and saving the processed ones:" + "The paths for reading raw data and saving the processed ones:\n" ] }, { @@ -112,7 +112,7 @@ "source": [ "- Raw data are PDB files in `data_raw/ppi/pdb/`, which contains atomic coordinates of the protein-protein complexes of interest, so in our case of pMHC complexes.\n", "- Target data, so in our case the BA values for the pMHC complex, are in `data_raw/ppi/BA_values.csv`.\n", - "- The final PPI processed data will be saved in `data_processed/ppi/` folder, which in turns contains a folder for residue-level data and another one for atomic-level data. More details about such different levels will come a few cells below." + "- The final PPI processed data will be saved in `data_processed/ppi/` folder, which in turns contains a folder for residue-level data and another one for atomic-level data. 
More details about such different levels will come a few cells below.\n" ] }, { @@ -120,7 +120,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`get_pdb_files_and_target_data` is an helper function used to retrieve the raw pdb files names in a list and the BA target values from a CSV containing the IDs of the PDB models as well:" + "`get_pdb_files_and_target_data` is an helper function used to retrieve the raw pdb files names in a list and the BA target values from a CSV containing the IDs of the PDB models as well:\n" ] }, { @@ -130,14 +130,15 @@ "outputs": [], "source": [ "def get_pdb_files_and_target_data(data_path):\n", - "\tcsv_data = pd.read_csv(os.path.join(data_path, \"BA_values.csv\"))\n", - "\tpdb_files = glob.glob(os.path.join(data_path, \"pdb\", '*.pdb'))\n", - "\tpdb_files.sort()\n", - "\tpdb_ids_csv = [pdb_file.split('/')[-1].split('.')[0] for pdb_file in pdb_files]\n", - "\tcsv_data_indexed = csv_data.set_index('ID')\n", - "\tcsv_data_indexed = csv_data_indexed.loc[pdb_ids_csv]\n", - "\tbas = csv_data_indexed.measurement_value.values.tolist()\n", - "\treturn pdb_files, bas\n", + " csv_data = pd.read_csv(os.path.join(data_path, \"BA_values.csv\"))\n", + " pdb_files = glob.glob(os.path.join(data_path, \"pdb\", \"*.pdb\"))\n", + " pdb_files.sort()\n", + " pdb_ids_csv = [pdb_file.split(\"/\")[-1].split(\".\")[0] for pdb_file in pdb_files]\n", + " csv_data_indexed = csv_data.set_index(\"ID\")\n", + " csv_data_indexed = csv_data_indexed.loc[pdb_ids_csv]\n", + " bas = csv_data_indexed.measurement_value.values.tolist()\n", + " return pdb_files, bas\n", + "\n", "\n", "pdb_files, bas = get_pdb_files_and_target_data(data_path)" ] @@ -147,7 +148,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## `QueryCollection` and `Query` objects" + "## `QueryCollection` and `Query` objects\n" ] }, { @@ -165,7 +166,7 @@ "- The interaction radius, which determines the threshold distance (in Ångström) for residues/atoms surrounding the interface that will be included in the graph.\n", "- The target values associated with the query. For each query/data point, in the use case demonstrated in this tutorial will add two targets: \"BA\" and \"binary\". The first represents the actual BA value of the complex in nM, while the second represents its binary mapping, being 0 (BA > 500 nM) a not-binding complex and 1 (BA <= 500 nM) a binding one.\n", "- The max edge distance, which is the maximum distance between two nodes to generate an edge between them.\n", - "- Optional: The correspondent [Position-Specific Scoring Matrices (PSSMs)](https://en.wikipedia.org/wiki/Position_weight_matrix), in the form of .pssm files. PSSMs are optional and will not be used in this tutorial." + "- Optional: The correspondent [Position-Specific Scoring Matrices (PSSMs)](https://en.wikipedia.org/wiki/Position_weight_matrix), in the form of .pssm files. 
PSSMs are optional and will not be used in this tutorial.\n" ] }, { @@ -173,7 +174,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Residue-level PPIs using `ProteinProteinInterfaceQuery`" + "## Residue-level PPIs using `ProteinProteinInterfaceQuery`\n" ] }, { @@ -187,25 +188,27 @@ "influence_radius = 8 # max distance in Å between two interacting residues/atoms of two proteins\n", "max_edge_length = 8\n", "\n", - "print(f'Adding {len(pdb_files)} queries to the query collection ...')\n", + "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", "count = 0\n", "for i in range(len(pdb_files)):\n", - "\tqueries.add(\n", - "\t\tProteinProteinInterfaceQuery(\n", - "\t\t\tpdb_path = pdb_files[i],\n", - "\t\t\tresolution = \"residue\",\n", - "\t\t\tchain_ids = [\"M\", \"P\"],\n", - "\t\t\tinfluence_radius = influence_radius,\n", - "\t\t\tmax_edge_length = max_edge_length,\n", - "\t\t\ttargets = {\n", - "\t\t\t\t'binary': int(float(bas[i]) <= 500), # binary target value\n", - "\t\t\t\t'BA': bas[i], # continuous target value\n", - "\t\t\t\t}))\n", - "\tcount +=1\n", - "\tif count % 20 == 0:\n", - "\t\tprint(f'{count} queries added to the collection.')\n", + " queries.add(\n", + " ProteinProteinInterfaceQuery(\n", + " pdb_path=pdb_files[i],\n", + " resolution=\"residue\",\n", + " chain_ids=[\"M\", \"P\"],\n", + " influence_radius=influence_radius,\n", + " max_edge_length=max_edge_length,\n", + " targets={\n", + " \"binary\": int(float(bas[i]) <= 500), # binary target value\n", + " \"BA\": bas[i], # continuous target value\n", + " },\n", + " )\n", + " )\n", + " count += 1\n", + " if count % 20 == 0:\n", + " print(f\"{count} queries added to the collection.\")\n", "\n", - "print(f'Queries ready to be processed.\\n')" + "print(f\"Queries ready to be processed.\\n\")" ] }, { @@ -221,7 +224,7 @@ "- `feature_modules` allows you to choose which feature generating modules you want to use. By default, the basic features contained in `deeprank2.features.components` and `deeprank2.features.contact` are generated. Users can add custom features by creating a new module and placing it in the `deeprank2.feature` subpackage. A complete and detailed list of the pre-implemented features per module and more information about how to add custom features can be found [here](https://deeprank2.readthedocs.io/en/latest/features.html).\n", " - Note that all features generated by a module will be added if that module was selected, and there is no way to only generate specific features from that module. However, during the training phase shown in `training_ppi.ipynb`, it is possible to select only a subset of available features.\n", "- `cpu_count` can be used to specify how many processes to be run simultaneously, and will coincide with the number of HDF5 files generated. By default it takes all available CPU cores and HDF5 files are squashed into a single file using the `combine_output` setting.\n", - "- Optional: If you want to include grids in the HDF5 files, which represent the mapping of the graphs to a volumetric box, you need to define `grid_settings` and `grid_map_method`, as shown in the example below. If they are `None` (default), only graphs are saved." + "- Optional: If you want to include grids in the HDF5 files, which represent the mapping of the graphs to a volumetric box, you need to define `grid_settings` and `grid_map_method`, as shown in the example below. 
If they are `None` (default), only graphs are saved.\n" ] }, { @@ -230,20 +233,22 @@ "metadata": {}, "outputs": [], "source": [ - "grid_settings = GridSettings( # None if you don't want grids\n", - "\t# the number of points on the x, y, z edges of the cube\n", - "\tpoints_counts = [35, 30, 30],\n", - "\t# x, y, z sizes of the box in Å\n", - "\tsizes = [1.0, 1.0, 1.0])\n", - "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", + "grid_settings = GridSettings( # None if you don't want grids\n", + " # the number of points on the x, y, z edges of the cube\n", + " points_counts=[35, 30, 30],\n", + " # x, y, z sizes of the box in Å\n", + " sizes=[1.0, 1.0, 1.0],\n", + ")\n", + "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", "\n", "queries.process(\n", - "\tprefix = os.path.join(processed_data_path, \"residue\", \"proc\"),\n", - "\tfeature_modules = [components, contact],\n", - " cpu_count = 8,\n", - "\tcombine_output = False,\n", - "\tgrid_settings = grid_settings,\n", - "\tgrid_map_method = grid_map_method)\n", + " prefix=os.path.join(processed_data_path, \"residue\", \"proc\"),\n", + " feature_modules=[components, contact],\n", + " cpu_count=8,\n", + " combine_output=False,\n", + " grid_settings=grid_settings,\n", + " grid_map_method=grid_map_method,\n", + ")\n", "\n", "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.')" ] @@ -309,7 +314,7 @@ "\n", "`edge_features`, `node_features`, `mapped_features` are [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which contain [HDF5 Datasets](https://docs.h5py.org/en/stable/high/dataset.html) (e.g., `_index`, `electrostatic`, etc.), which in turn contains features values in the form of arrays. `edge_features` and `node_features` refer specificly to the graph representation, while `grid_points` and `mapped_features` refer to the grid mapped from the graph. Each data point generated by deeprank2 has the above structure, with the features and the target changing according to the user's settings. Features starting with `_` are present for human inspection of the data, but they are not used for training models.\n", "\n", - "It is always a good practice to first explore the data, and then make decision about splitting them in training, test and validation sets. There are different possible ways for doing it." + "It is always a good practice to first explore the data, and then make decision about splitting them in training, test and validation sets. There are different possible ways for doing it.\n" ] }, { @@ -319,7 +324,7 @@ "source": [ "#### Pandas dataframe\n", "\n", - "The edge and node features just generated can be explored by instantiating the `GraphDataset` object, and then using `hdf5_to_pandas` method which converts node and edge features into a [Pandas](https://pandas.pydata.org/) dataframe. Each row represents a ppi in the form of a graph. " + "The edge and node features just generated can be explored by instantiating the `GraphDataset` object, and then using `hdf5_to_pandas` method which converts node and edge features into a [Pandas](https://pandas.pydata.org/) dataframe. Each row represents a ppi in the form of a graph.\n" ] }, { @@ -339,7 +344,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can also generate histograms for looking at the features distributions. An example:" + "We can also generate histograms for looking at the features distributions. 
An example:\n" ] }, { @@ -349,12 +354,10 @@ "outputs": [], "source": [ "fname = os.path.join(processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"]))\n", - "dataset.save_hist(\n", - " features = [\"res_mass\", \"distance\", \"electrostatic\"],\n", - " fname = fname)\n", + "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", "\n", "im = img.imread(fname + \".png\")\n", - "plt.figure(figsize = (15,10))\n", + "plt.figure(figsize=(15, 10))\n", "fig = plt.imshow(im)\n", "fig.axes.get_xaxis().set_visible(False)\n", "fig.axes.get_yaxis().set_visible(False)" @@ -369,12 +372,12 @@ "\n", "- [HDFView](https://www.hdfgroup.org/downloads/hdfview/), a visual tool written in Java for browsing and editing HDF5 files.\n", " As representative example, the following is the structure for `BA-100600.pdb` seen from HDF5View:\n", - " \n", + "\n", " \n", "\n", - " Using this tool you can inspect the values of the features visually, for each data point. \n", + " Using this tool you can inspect the values of the features visually, for each data point.\n", "\n", - "- Python packages such as [h5py](https://docs.h5py.org/en/stable/index.html). Examples:" + "- Python packages such as [h5py](https://docs.h5py.org/en/stable/index.html). Examples:\n" ] }, { @@ -386,19 +389,19 @@ "with h5py.File(processed_data[0], \"r\") as hdf5:\n", " # List of all graphs in hdf5, each graph representing a ppi\n", " ids = list(hdf5.keys())\n", - " print(f'IDs of PPIs in {processed_data[0]}: {ids}')\n", - " node_features = list(hdf5[ids[0]][\"node_features\"]) \n", - " print(f'Node features: {node_features}')\n", + " print(f\"IDs of PPIs in {processed_data[0]}: {ids}\")\n", + " node_features = list(hdf5[ids[0]][\"node_features\"])\n", + " print(f\"Node features: {node_features}\")\n", " edge_features = list(hdf5[ids[0]][\"edge_features\"])\n", - " print(f'Edge features: {edge_features}')\n", + " print(f\"Edge features: {edge_features}\")\n", " target_features = list(hdf5[ids[0]][\"target_values\"])\n", - " print(f'Targets features: {target_features}')\n", + " print(f\"Targets features: {target_features}\")\n", " # Polarity feature for ids[0], numpy.ndarray\n", " node_feat_polarity = hdf5[ids[0]][\"node_features\"][\"polarity\"][:]\n", - " print(f'Polarity feature shape: {node_feat_polarity.shape}')\n", + " print(f\"Polarity feature shape: {node_feat_polarity.shape}\")\n", " # Electrostatic feature for ids[0], numpy.ndarray\n", " edge_feat_electrostatic = hdf5[ids[0]][\"edge_features\"][\"electrostatic\"][:]\n", - " print(f'Electrostatic feature shape: {edge_feat_electrostatic.shape}')" + " print(f\"Electrostatic feature shape: {edge_feat_electrostatic.shape}\")" ] }, { @@ -408,7 +411,7 @@ "source": [ "## Atomic-level PPIs using `ProteinProteinInterfaceQuery`\n", "\n", - "Graphs can also be generated at an atomic resolution, very similarly to what has just been done for residue-level. 
" + "Graphs can also be generated at an atomic resolution, very similarly to what has just been done for residue-level.\n" ] }, { @@ -422,25 +425,27 @@ "influence_radius = 5 # max distance in Å between two interacting residues/atoms of two proteins\n", "max_edge_length = 5\n", "\n", - "print(f'Adding {len(pdb_files)} queries to the query collection ...')\n", + "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", "count = 0\n", "for i in range(len(pdb_files)):\n", - "\tqueries.add(\n", - "\t\tProteinProteinInterfaceQuery(\n", - "\t\t\tpdb_path = pdb_files[i],\n", - "\t\t\tresolution = \"atom\",\n", - "\t\t\tchain_ids = [\"M\",\"P\"],\n", - "\t\t\tinfluence_radius = influence_radius,\n", - "\t\t\tmax_edge_length = max_edge_length,\n", - "\t\t\ttargets = {\n", - "\t\t\t\t'binary': int(float(bas[i]) <= 500), # binary target value\n", - "\t\t\t\t'BA': bas[i], # continuous target value\n", - "\t\t\t\t}))\n", - "\tcount +=1\n", - "\tif count % 20 == 0:\n", - "\t\tprint(f'{count} queries added to the collection.')\n", + " queries.add(\n", + " ProteinProteinInterfaceQuery(\n", + " pdb_path=pdb_files[i],\n", + " resolution=\"atom\",\n", + " chain_ids=[\"M\", \"P\"],\n", + " influence_radius=influence_radius,\n", + " max_edge_length=max_edge_length,\n", + " targets={\n", + " \"binary\": int(float(bas[i]) <= 500), # binary target value\n", + " \"BA\": bas[i], # continuous target value\n", + " },\n", + " )\n", + " )\n", + " count += 1\n", + " if count % 20 == 0:\n", + " print(f\"{count} queries added to the collection.\")\n", "\n", - "print(f'Queries ready to be processed.\\n')" + "print(f\"Queries ready to be processed.\\n\")" ] }, { @@ -449,20 +454,22 @@ "metadata": {}, "outputs": [], "source": [ - "grid_settings = GridSettings( # None if you don't want grids\n", - "\t# the number of points on the x, y, z edges of the cube\n", - "\tpoints_counts = [35, 30, 30],\n", - "\t# x, y, z sizes of the box in Å\n", - "\tsizes = [1.0, 1.0, 1.0])\n", - "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", + "grid_settings = GridSettings( # None if you don't want grids\n", + " # the number of points on the x, y, z edges of the cube\n", + " points_counts=[35, 30, 30],\n", + " # x, y, z sizes of the box in Å\n", + " sizes=[1.0, 1.0, 1.0],\n", + ")\n", + "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", "\n", "queries.process(\n", - "\tprefix = os.path.join(processed_data_path, \"atomic\", \"proc\"),\n", - "\tfeature_modules = [components, contact],\n", - " cpu_count = 8,\n", - "\tcombine_output = False,\n", - "\tgrid_settings = grid_settings,\n", - "\tgrid_map_method = grid_map_method)\n", + " prefix=os.path.join(processed_data_path, \"atomic\", \"proc\"),\n", + " feature_modules=[components, contact],\n", + " cpu_count=8,\n", + " combine_output=False,\n", + " grid_settings=grid_settings,\n", + " grid_map_method=grid_map_method,\n", + ")\n", "\n", "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.')" ] @@ -472,7 +479,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Again, the data can be inspected using `hdf5_to_pandas` function." 
+ "Again, the data can be inspected using `hdf5_to_pandas` function.\n" ] }, { @@ -494,12 +501,10 @@ "outputs": [], "source": [ "fname = os.path.join(processed_data_path, \"atomic\", \"atom_charge\")\n", - "dataset.save_hist(\n", - " features = \"atom_charge\",\n", - " fname = fname)\n", + "dataset.save_hist(features=\"atom_charge\", fname=fname)\n", "\n", "im = img.imread(fname + \".png\")\n", - "plt.figure(figsize = (8,8))\n", + "plt.figure(figsize=(8, 8))\n", "fig = plt.imshow(im)\n", "fig.axes.get_xaxis().set_visible(False)\n", "fig.axes.get_yaxis().set_visible(False)" @@ -510,7 +515,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note that some of the features are different from the ones generated with the residue-level queries. There are indeed features in `deeprank2.features.components` module which are generated only in atomic graphs, i.e. `atom_type`, `atom_charge`, and `pdb_occupancy`, because they don't make sense only in the atomic graphs' representation." + "Note that some of the features are different from the ones generated with the residue-level queries. There are indeed features in `deeprank2.features.components` module which are generated only in atomic graphs, i.e. `atom_type`, `atom_charge`, and `pdb_occupancy`, because they don't make sense only in the atomic graphs' representation.\n" ] } ], diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index f8c3ffddf..d4835ff4f 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -5,7 +5,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Data preparation for single-residue variants" + "# Data preparation for single-residue variants\n" ] }, { @@ -17,9 +17,9 @@ "\n", "\n", "\n", - "This tutorial will demonstrate the use of DeepRank2 for generating single-residue variants (SRVs) graphs and saving them as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format) files, using [PBD files](https://en.wikipedia.org/wiki/Protein_Data_Bank_(file_format)) of protein structures as input.\n", + "This tutorial will demonstrate the use of DeepRank2 for generating single-residue variants (SRVs) graphs and saving them as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format) files, using [PBD files]() of protein structures as input.\n", "\n", - "In this data processing phase, a local neighborhood around the mutated residue is selected for each SRV according to a radius threshold that the user can customize. All atoms or residues within the threshold are mapped as the nodes to a graph and the interactions between them are the edges of the graph. Each node and edge can have several distinct (structural or physico-chemical) features, which are generated and added during the processing phase as well. Optionally, the graphs can be mapped to volumetric grids (i.e., 3D image-like representations), together with their features. Finally, the mapped data are saved as HDF5 files, which can be used for training predictive models (for details see [training_ppi.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/training_ppi.ipynb) tutorial). In particular, graphs can be used for the training of Graph Neural Networks (GNNs), and grids can be used for the training of Convolutional Neural Networks (CNNs)." + "In this data processing phase, a local neighborhood around the mutated residue is selected for each SRV according to a radius threshold that the user can customize. 
All atoms or residues within the threshold are mapped as the nodes to a graph and the interactions between them are the edges of the graph. Each node and edge can have several distinct (structural or physico-chemical) features, which are generated and added during the processing phase as well. Optionally, the graphs can be mapped to volumetric grids (i.e., 3D image-like representations), together with their features. Finally, the mapped data are saved as HDF5 files, which can be used for training predictive models (for details see [training_ppi.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/training_ppi.ipynb) tutorial). In particular, graphs can be used for the training of Graph Neural Networks (GNNs), and grids can be used for the training of Convolutional Neural Networks (CNNs).\n" ] }, { @@ -31,7 +31,7 @@ "\n", "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/8349335). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", "\n", - "Note that the dataset contains only 96 data points, which is not enough to develop an impactful predictive model, and the scope of its use is indeed only demonstrative and informative for the users." + "Note that the dataset contains only 96 data points, which is not enough to develop an impactful predictive model, and the scope of its use is indeed only demonstrative and informative for the users.\n" ] }, { @@ -39,7 +39,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Utilities" + "## Utilities\n" ] }, { @@ -47,7 +47,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Libraries" + "### Libraries\n" ] }, { @@ -55,7 +55,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The libraries needed for this tutorial:" + "The libraries needed for this tutorial:\n" ] }, { @@ -75,7 +75,7 @@ "from deeprank2.domain.aminoacidlist import amino_acids_by_code\n", "from deeprank2.features import components, contact\n", "from deeprank2.utils.grid import GridSettings, MapMethod\n", - "from deeprank2.dataset import GraphDataset\n" + "from deeprank2.dataset import GraphDataset" ] }, { @@ -83,7 +83,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Raw files and paths" + "### Raw files and paths\n" ] }, { @@ -91,7 +91,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The paths for reading raw data and saving the processed ones:" + "The paths for reading raw data and saving the processed ones:\n" ] }, { @@ -111,9 +111,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "- Raw data are PDB files in `data_raw/srv/pdb/`, which contains atomic coordinates of the protein structure containing the variant. \n", + "- Raw data are PDB files in `data_raw/srv/pdb/`, which contains atomic coordinates of the protein structure containing the variant.\n", "- Target data, so in our case pathogenic versus benign labels, are in `data_raw/srv/srv_target_values.csv`.\n", - "- The final SRV processed data will be saved in `data_processed/srv/` folder, which in turns contains a folder for residue-level data and another one for atomic-level data. More details about such different levels will come a few cells below." 
+ "- The final SRV processed data will be saved in `data_processed/srv/` folder, which in turns contains a folder for residue-level data and another one for atomic-level data. More details about such different levels will come a few cells below.\n" ] }, { @@ -121,7 +121,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`get_pdb_files_and_target_data` is an helper function used to retrieve the raw pdb files names, SRVs information and target values in a list from the CSV:" + "`get_pdb_files_and_target_data` is an helper function used to retrieve the raw pdb files names, SRVs information and target values in a list from the CSV:\n" ] }, { @@ -131,19 +131,20 @@ "outputs": [], "source": [ "def get_pdb_files_and_target_data(data_path):\n", - "\tcsv_data = pd.read_csv(os.path.join(data_path, \"srv_target_values.csv\"))\n", - "\tpdb_files = glob.glob(os.path.join(data_path, \"pdb\", '*.ent'))\n", - "\tpdb_files.sort()\n", - "\tpdb_file_names = [os.path.basename(pdb_file) for pdb_file in pdb_files]\n", - "\tcsv_data_indexed = csv_data.set_index('pdb_file')\n", - "\tcsv_data_indexed = csv_data_indexed.loc[pdb_file_names]\n", - "\tres_numbers = csv_data_indexed.res_number.values.tolist()\n", - "\tres_wildtypes = csv_data_indexed.res_wildtype.values.tolist()\n", - "\tres_variants = csv_data_indexed.res_variant.values.tolist()\n", - "\ttargets = csv_data_indexed.target.values.tolist()\n", - "\tpdb_names = csv_data_indexed.index.values.tolist()\n", - "\tpdb_files = [data_path + \"/pdb/\" + pdb_name for pdb_name in pdb_names]\n", - "\treturn pdb_files, res_numbers, res_wildtypes, res_variants, targets\n", + " csv_data = pd.read_csv(os.path.join(data_path, \"srv_target_values.csv\"))\n", + " pdb_files = glob.glob(os.path.join(data_path, \"pdb\", \"*.ent\"))\n", + " pdb_files.sort()\n", + " pdb_file_names = [os.path.basename(pdb_file) for pdb_file in pdb_files]\n", + " csv_data_indexed = csv_data.set_index(\"pdb_file\")\n", + " csv_data_indexed = csv_data_indexed.loc[pdb_file_names]\n", + " res_numbers = csv_data_indexed.res_number.values.tolist()\n", + " res_wildtypes = csv_data_indexed.res_wildtype.values.tolist()\n", + " res_variants = csv_data_indexed.res_variant.values.tolist()\n", + " targets = csv_data_indexed.target.values.tolist()\n", + " pdb_names = csv_data_indexed.index.values.tolist()\n", + " pdb_files = [data_path + \"/pdb/\" + pdb_name for pdb_name in pdb_names]\n", + " return pdb_files, res_numbers, res_wildtypes, res_variants, targets\n", + "\n", "\n", "pdb_files, res_numbers, res_wildtypes, res_variants, targets = get_pdb_files_and_target_data(data_path)" ] @@ -153,7 +154,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## `QueryCollection` and `Query` objects" + "## `QueryCollection` and `Query` objects\n" ] }, { @@ -163,7 +164,6 @@ "source": [ "For each SRV, so for each data point, a query can be created and added to the `QueryCollection` object, to be processed later on. Different types of queries exist, based on the molecular resolution needed:\n", "\n", - "\n", "A query takes as inputs:\n", "\n", "- A `.pdb` file, representing the protein structure containing the SRV.\n", @@ -171,12 +171,12 @@ "- The chain id of the SRV.\n", "- The residue number of the missense mutation.\n", "- The insertion code, used when two residues have the same numbering. The combination of residue numbering and insertion code defines the unique residue.\n", - "- The wildtype amino acid. \n", - "- The variant amino acid. 
\n", + "- The wildtype amino acid.\n", + "- The variant amino acid.\n", "- The interaction radius, which determines the threshold distance (in Ångström) for residues/atoms surrounding the mutation that will be included in the graph.\n", - "- The target values associated with the query. For each query/data point, in the use case demonstrated in this tutorial will add a 0 if the SRV belongs to the benign class, and 1 if it belongs to the pathogenic one. \n", + "- The target values associated with the query. For each query/data point, in the use case demonstrated in this tutorial will add a 0 if the SRV belongs to the benign class, and 1 if it belongs to the pathogenic one.\n", "- The max edge distance, which is the maximum distance between two nodes to generate an edge between them.\n", - "- Optional: The correspondent [Position-Specific Scoring Matrices (PSSMs)](https://en.wikipedia.org/wiki/Position_weight_matrix), per chain identifier, in the form of .pssm files. PSSMs are optional and will not be used in this tutorial." + "- Optional: The correspondent [Position-Specific Scoring Matrices (PSSMs)](https://en.wikipedia.org/wiki/Position_weight_matrix), per chain identifier, in the form of .pssm files. PSSMs are optional and will not be used in this tutorial.\n" ] }, { @@ -184,7 +184,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Residue-level SRV: `SingleResidueVariantQuery`" + "## Residue-level SRV: `SingleResidueVariantQuery`\n" ] }, { @@ -195,29 +195,31 @@ "source": [ "queries = QueryCollection()\n", "\n", - "influence_radius = 10.0 # radius to select the local neighborhood around the SRV\n", - "max_edge_length = 4.5 # ??\n", + "influence_radius = 10.0 # radius to select the local neighborhood around the SRV\n", + "max_edge_length = 4.5 # ??\n", "\n", - "print(f'Adding {len(pdb_files)} queries to the query collection ...')\n", + "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", "count = 0\n", "for i in range(len(pdb_files)):\n", - "\tqueries.add(SingleResidueVariantQuery(\n", - "\t\tpdb_path = pdb_files[i],\n", - "\t\tresolution = \"residue\",\n", - "\t\tchain_ids = \"A\",\n", - "\t\tvariant_residue_number = res_numbers[i],\n", - "\t\tinsertion_code = None,\n", - "\t\twildtype_amino_acid = amino_acids_by_code[res_wildtypes[i]],\n", - "\t\tvariant_amino_acid = amino_acids_by_code[res_variants[i]],\n", - "\t\ttargets = {'binary': targets[i]},\n", - "\t\tinfluence_radius = influence_radius,\n", - "\t\tmax_edge_length = max_edge_length,\n", - "\t\t))\n", - "\tcount +=1\n", - "\tif count % 20 == 0:\n", - "\t\tprint(f'{count} queries added to the collection.')\n", - "\n", - "print(f'Queries ready to be processed.\\n')" + " queries.add(\n", + " SingleResidueVariantQuery(\n", + " pdb_path=pdb_files[i],\n", + " resolution=\"residue\",\n", + " chain_ids=\"A\",\n", + " variant_residue_number=res_numbers[i],\n", + " insertion_code=None,\n", + " wildtype_amino_acid=amino_acids_by_code[res_wildtypes[i]],\n", + " variant_amino_acid=amino_acids_by_code[res_variants[i]],\n", + " targets={\"binary\": targets[i]},\n", + " influence_radius=influence_radius,\n", + " max_edge_length=max_edge_length,\n", + " )\n", + " )\n", + " count += 1\n", + " if count % 20 == 0:\n", + " print(f\"{count} queries added to the collection.\")\n", + "\n", + "print(f\"Queries ready to be processed.\\n\")" ] }, { @@ -233,7 +235,7 @@ "- `feature_modules` allows you to choose which feature generating modules you want to use. 
By default, the basic features contained in `deeprank2.features.components` and `deeprank2.features.contact` are generated. Users can add custom features by creating a new module and placing it in the `deeprank2.feature` subpackage. A complete and detailed list of the pre-implemented features per module and more information about how to add custom features can be found [here](https://deeprank2.readthedocs.io/en/latest/features.html).\n", " - Note that all features generated by a module will be added if that module was selected, and there is no way to only generate specific features from that module. However, during the training phase shown in `training_ppi.ipynb`, it is possible to select only a subset of available features.\n", "- `cpu_count` can be used to specify how many processes to be run simultaneously, and will coincide with the number of HDF5 files generated. By default it takes all available CPU cores and HDF5 files are squashed into a single file using the `combine_output` setting.\n", - "- Optional: If you want to include grids in the HDF5 files, which represent the mapping of the graphs to a volumetric box, you need to define `grid_settings` and `grid_map_method`, as shown in the example below. If they are `None` (default), only graphs are saved." + "- Optional: If you want to include grids in the HDF5 files, which represent the mapping of the graphs to a volumetric box, you need to define `grid_settings` and `grid_map_method`, as shown in the example below. If they are `None` (default), only graphs are saved.\n" ] }, { @@ -242,20 +244,22 @@ "metadata": {}, "outputs": [], "source": [ - "grid_settings = GridSettings( # None if you don't want grids\n", - "\t# the number of points on the x, y, z edges of the cube\n", - "\tpoints_counts = [35, 30, 30],\n", - "\t# x, y, z sizes of the box in Å\n", - "\tsizes = [1.0, 1.0, 1.0])\n", - "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", + "grid_settings = GridSettings( # None if you don't want grids\n", + " # the number of points on the x, y, z edges of the cube\n", + " points_counts=[35, 30, 30],\n", + " # x, y, z sizes of the box in Å\n", + " sizes=[1.0, 1.0, 1.0],\n", + ")\n", + "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", "\n", "queries.process(\n", - "\tprefix = os.path.join(processed_data_path, \"residue\", \"proc\"),\n", - "\tfeature_modules = [components, contact],\n", - " cpu_count = 8,\n", - "\tcombine_output = False,\n", - "\tgrid_settings = grid_settings,\n", - "\tgrid_map_method = grid_map_method)\n", + " prefix=os.path.join(processed_data_path, \"residue\", \"proc\"),\n", + " feature_modules=[components, contact],\n", + " cpu_count=8,\n", + " combine_output=False,\n", + " grid_settings=grid_settings,\n", + " grid_map_method=grid_map_method,\n", + ")\n", "\n", "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"residue\")}.')" ] @@ -328,7 +332,7 @@ "\n", "`edge_features`, `node_features`, `mapped_features` are [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which contain [HDF5 Datasets](https://docs.h5py.org/en/stable/high/dataset.html) (e.g., `_index`, `electrostatic`, etc.), which in turn contains features values in the form of arrays. `edge_features` and `node_features` refer specificly to the graph representation, while `grid_points` and `mapped_features` refer to the grid mapped from the graph. 
Each data point generated by deeprank2 has the above structure, with the features and the target changing according to the user's settings. Features starting with `_` are present for human inspection of the data, but they are not used for training models.\n", "\n", - "It is always a good practice to first explore the data, and then make decision about splitting them in training, test and validation sets. There are different possible ways for doing it." + "It is always a good practice to first explore the data, and then make decision about splitting them in training, test and validation sets. There are different possible ways for doing it.\n" ] }, { @@ -338,7 +342,7 @@ "source": [ "#### Pandas dataframe\n", "\n", - "The edge and node features just generated can be explored by instantiating the `GraphDataset` object, and then using `hdf5_to_pandas` method which converts node and edge features into a [Pandas](https://pandas.pydata.org/) dataframe. Each row represents a ppi in the form of a graph. " + "The edge and node features just generated can be explored by instantiating the `GraphDataset` object, and then using `hdf5_to_pandas` method which converts node and edge features into a [Pandas](https://pandas.pydata.org/) dataframe. Each row represents a ppi in the form of a graph.\n" ] }, { @@ -358,7 +362,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can also generate histograms for looking at the features distributions. An example:" + "We can also generate histograms for looking at the features distributions. An example:\n" ] }, { @@ -368,12 +372,10 @@ "outputs": [], "source": [ "fname = os.path.join(processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"]))\n", - "dataset.save_hist(\n", - " features = [\"res_mass\", \"distance\", \"electrostatic\"],\n", - " fname = fname)\n", + "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", "\n", "im = img.imread(fname + \".png\")\n", - "plt.figure(figsize = (15,10))\n", + "plt.figure(figsize=(15, 10))\n", "fig = plt.imshow(im)\n", "fig.axes.get_xaxis().set_visible(False)\n", "fig.axes.get_yaxis().set_visible(False)" @@ -388,12 +390,12 @@ "\n", "- [HDFView](https://www.hdfgroup.org/downloads/hdfview/), a visual tool written in Java for browsing and editing HDF5 files.\n", " As representative example, the following is the structure for `pdb2ooh.ent` seen from HDF5View:\n", - " \n", + "\n", " \n", "\n", - " Using this tool you can inspect the values of the features visually, for each data point. \n", + " Using this tool you can inspect the values of the features visually, for each data point.\n", "\n", - "- Python packages such as [h5py](https://docs.h5py.org/en/stable/index.html). Examples:" + "- Python packages such as [h5py](https://docs.h5py.org/en/stable/index.html). 
Examples:\n" ] }, { @@ -406,19 +408,19 @@ " # List of all graphs in hdf5, each graph representing\n", " # a SRV and its sourrouding environment\n", " ids = list(hdf5.keys())\n", - " print(f'IDs of SRVs in {processed_data[0]}: {ids}')\n", - " node_features = list(hdf5[ids[0]][\"node_features\"]) \n", - " print(f'Node features: {node_features}')\n", + " print(f\"IDs of SRVs in {processed_data[0]}: {ids}\")\n", + " node_features = list(hdf5[ids[0]][\"node_features\"])\n", + " print(f\"Node features: {node_features}\")\n", " edge_features = list(hdf5[ids[0]][\"edge_features\"])\n", - " print(f'Edge features: {edge_features}')\n", + " print(f\"Edge features: {edge_features}\")\n", " target_features = list(hdf5[ids[0]][\"target_values\"])\n", - " print(f'Targets features: {target_features}')\n", + " print(f\"Targets features: {target_features}\")\n", " # Polarity feature for ids[0], numpy.ndarray\n", " node_feat_polarity = hdf5[ids[0]][\"node_features\"][\"polarity\"][:]\n", - " print(f'Polarity feature shape: {node_feat_polarity.shape}')\n", + " print(f\"Polarity feature shape: {node_feat_polarity.shape}\")\n", " # Electrostatic feature for ids[0], numpy.ndarray\n", " edge_feat_electrostatic = hdf5[ids[0]][\"edge_features\"][\"electrostatic\"][:]\n", - " print(f'Electrostatic feature shape: {edge_feat_electrostatic.shape}')" + " print(f\"Electrostatic feature shape: {edge_feat_electrostatic.shape}\")" ] }, { @@ -428,7 +430,7 @@ "source": [ "## Atomic-level SRV: `SingleResidueVariantQuery`\n", "\n", - "Graphs can also be generated at an atomic resolution, very similarly to what has just been done for residue-level. " + "Graphs can also be generated at an atomic resolution, very similarly to what has just been done for residue-level.\n" ] }, { @@ -439,29 +441,31 @@ "source": [ "queries = QueryCollection()\n", "\n", - "influence_radius = 10.0 # radius to select the local neighborhood around the SRV\n", - "max_edge_length = 4.5 # ??\n", + "influence_radius = 10.0 # radius to select the local neighborhood around the SRV\n", + "max_edge_length = 4.5 # ??\n", "\n", - "print(f'Adding {len(pdb_files)} queries to the query collection ...')\n", + "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", "count = 0\n", "for i in range(len(pdb_files)):\n", - "\tqueries.add(SingleResidueVariantQuery(\n", - "\t\tpdb_path = pdb_files[i],\n", - " \t\tresolution = \"atom\",\n", - "\t\tchain_ids = \"A\",\n", - "\t\tvariant_residue_number = res_numbers[i],\n", - "\t\tinsertion_code = None,\n", - "\t\twildtype_amino_acid = amino_acids_by_code[res_wildtypes[i]],\n", - "\t\tvariant_amino_acid = amino_acids_by_code[res_variants[i]],\n", - "\t\ttargets = {'binary': targets[i]},\n", - "\t\tinfluence_radius = influence_radius,\n", - "\t\tmax_edge_length = max_edge_length,\n", - "\t\t))\n", - "\tcount +=1\n", - "\tif count % 20 == 0:\n", - "\t\tprint(f'{count} queries added to the collection.')\n", - "\n", - "print(f'Queries ready to be processed.\\n')" + " queries.add(\n", + " SingleResidueVariantQuery(\n", + " pdb_path=pdb_files[i],\n", + " resolution=\"atom\",\n", + " chain_ids=\"A\",\n", + " variant_residue_number=res_numbers[i],\n", + " insertion_code=None,\n", + " wildtype_amino_acid=amino_acids_by_code[res_wildtypes[i]],\n", + " variant_amino_acid=amino_acids_by_code[res_variants[i]],\n", + " targets={\"binary\": targets[i]},\n", + " influence_radius=influence_radius,\n", + " max_edge_length=max_edge_length,\n", + " )\n", + " )\n", + " count += 1\n", + " if count % 20 == 0:\n", + " 
print(f\"{count} queries added to the collection.\")\n", + "\n", + "print(f\"Queries ready to be processed.\\n\")" ] }, { @@ -470,20 +474,22 @@ "metadata": {}, "outputs": [], "source": [ - "grid_settings = GridSettings( # None if you don't want grids\n", - "\t# the number of points on the x, y, z edges of the cube\n", - "\tpoints_counts = [35, 30, 30],\n", - "\t# x, y, z sizes of the box in Å\n", - "\tsizes = [1.0, 1.0, 1.0])\n", - "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", + "grid_settings = GridSettings( # None if you don't want grids\n", + " # the number of points on the x, y, z edges of the cube\n", + " points_counts=[35, 30, 30],\n", + " # x, y, z sizes of the box in Å\n", + " sizes=[1.0, 1.0, 1.0],\n", + ")\n", + "grid_map_method = MapMethod.GAUSSIAN # None if you don't want grids\n", "\n", "queries.process(\n", - "\tprefix = os.path.join(processed_data_path, \"atomic\", \"proc\"),\n", - "\tfeature_modules = [components, contact],\n", - " cpu_count = 8,\n", - "\tcombine_output = False,\n", - "\tgrid_settings = grid_settings,\n", - "\tgrid_map_method = grid_map_method)\n", + " prefix=os.path.join(processed_data_path, \"atomic\", \"proc\"),\n", + " feature_modules=[components, contact],\n", + " cpu_count=8,\n", + " combine_output=False,\n", + " grid_settings=grid_settings,\n", + " grid_map_method=grid_map_method,\n", + ")\n", "\n", "print(f'The queries processing is done. The generated HDF5 files are in {os.path.join(processed_data_path, \"atomic\")}.')" ] @@ -493,7 +499,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Again, the data can be inspected using `hdf5_to_pandas` function." + "Again, the data can be inspected using `hdf5_to_pandas` function.\n" ] }, { @@ -515,12 +521,10 @@ "outputs": [], "source": [ "fname = os.path.join(processed_data_path, \"atomic\", \"atom_charge\")\n", - "dataset.save_hist(\n", - " features = \"atom_charge\",\n", - " fname = fname)\n", + "dataset.save_hist(features=\"atom_charge\", fname=fname)\n", "\n", "im = img.imread(fname + \".png\")\n", - "plt.figure(figsize = (8,8))\n", + "plt.figure(figsize=(8, 8))\n", "fig = plt.imshow(im)\n", "fig.axes.get_xaxis().set_visible(False)\n", "fig.axes.get_yaxis().set_visible(False)" @@ -531,7 +535,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note that some of the features are different from the ones generated with the residue-level queries. There are indeed features in `deeprank2.features.components` module which are generated only in atomic graphs, i.e. `atom_type`, `atom_charge`, and `pdb_occupancy`, because they don't make sense only in the atomic graphs' representation." + "Note that some of the features are different from the ones generated with the residue-level queries. There are indeed features in `deeprank2.features.components` module which are generated only in atomic graphs, i.e. `atom_type`, `atom_charge`, and `pdb_occupancy`, because they don't make sense only in the atomic graphs' representation.\n" ] } ], diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb index 6e0340970..784bc03c7 100644 --- a/tutorials/training.ipynb +++ b/tutorials/training.ipynb @@ -5,7 +5,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Training Neural Networks" + "# Training Neural Networks\n" ] }, { @@ -21,7 +21,7 @@ "\n", "This tutorial assumes that the PPI data of interest have already been generated and saved as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), with the data structure that DeepRank2 expects. 
This data can be generated using the [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb) tutorial or downloaded from Zenodo at [this record address](https://zenodo.org/record/8349335). For more details on the data structure, please refer to the other tutorial, which also contains a detailed description of how the data is generated from PDB files.\n", "\n", - "This tutorial assumes also a basic knowledge of the [PyTorch](https://pytorch.org/) framework, on top of which the machine learning pipeline of DeepRank2 has been developed, for which many online tutorials exist." + "This tutorial assumes also a basic knowledge of the [PyTorch](https://pytorch.org/) framework, on top of which the machine learning pipeline of DeepRank2 has been developed, for which many online tutorials exist.\n" ] }, { @@ -33,9 +33,9 @@ "\n", "If you have previously run `data_generation_ppi.ipynb` or `data_generation_srv.ipynb` notebook, then their output can be directly used as input for this tutorial.\n", "\n", - "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/8349335). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial. \n", + "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/8349335). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", "\n", - "Note that the datasets contain only ~100 data points each, which is not enough to develop an impactful predictive model, and the scope of their use is indeed only demonstrative and informative for the users." 
+ "Note that the datasets contain only ~100 data points each, which is not enough to develop an impactful predictive model, and the scope of their use is indeed only demonstrative and informative for the users.\n" ] }, { @@ -43,7 +43,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Utilities" + "## Utilities\n" ] }, { @@ -51,7 +51,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Libraries" + "### Libraries\n" ] }, { @@ -59,7 +59,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The libraries needed for this tutorial:" + "The libraries needed for this tutorial:\n" ] }, { @@ -74,19 +74,15 @@ "import h5py\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import (\n", - " roc_curve,\n", - " auc,\n", - " precision_score,\n", - " recall_score,\n", - " accuracy_score,\n", - " f1_score)\n", + "from sklearn.metrics import roc_curve, auc, precision_score, recall_score, accuracy_score, f1_score\n", "import plotly.express as px\n", "import torch\n", "import numpy as np\n", - "np.seterr(divide = 'ignore')\n", - "np.seterr(invalid='ignore')\n", + "\n", + "np.seterr(divide=\"ignore\")\n", + "np.seterr(invalid=\"ignore\")\n", "import pandas as pd\n", + "\n", "logging.basicConfig(level=logging.INFO)\n", "from deeprank2.dataset import GraphDataset, GridDataset\n", "from deeprank2.trainer import Trainer\n", @@ -94,7 +90,8 @@ "from deeprank2.neuralnets.cnn.model3d import CnnClassification\n", "from deeprank2.utils.exporters import HDF5OutputExporter\n", "import warnings\n", - "warnings.filterwarnings('ignore')" + "\n", + "warnings.filterwarnings(\"ignore\")" ] }, { @@ -104,7 +101,7 @@ "source": [ "### Paths and sets\n", "\n", - "The paths for reading the processed data:" + "The paths for reading the processed data:\n" ] }, { @@ -116,8 +113,8 @@ "data_type = \"ppi\"\n", "level = \"residue\"\n", "processed_data_path = os.path.join(\"data_processed\", data_type, level)\n", - "input_data_path = glob.glob(os.path.join(processed_data_path, '*.hdf5'))\n", - "output_path = os.path.join(\"data_processed\", data_type, level) # for saving predictions results" + "input_data_path = glob.glob(os.path.join(processed_data_path, \"*.hdf5\"))\n", + "output_path = os.path.join(\"data_processed\", data_type, level) # for saving predictions results" ] }, { @@ -127,7 +124,7 @@ "source": [ "The `data_type` can be either \"ppi\" or \"srv\", depending on which application the user is most interested in. The `level` can be either \"residue\" or \"atomic\", and refers to the structural resolution, where each node either represents a single residue or a single atom from the molecular structure.\n", "\n", - "In this tutorial, we will use PPI residue-level data by default, but the same code can be applied to SRV or/and atomic-level data with no changes, apart from setting `data_type` and `level` parameters in the cell above." 
+ "In this tutorial, we will use PPI residue-level data by default, but the same code can be applied to SRV or/and atomic-level data with no changes, apart from setting `data_type` and `level` parameters in the cell above.\n" ] }, { @@ -135,7 +132,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "A Pandas DataFrame containing data points' IDs and the binary target values can be defined:" + "A Pandas DataFrame containing data points' IDs and the binary target values can be defined:\n" ] }, { @@ -145,14 +142,14 @@ "outputs": [], "source": [ "df_dict = {}\n", - "df_dict['entry'] = []\n", - "df_dict['target'] = []\n", + "df_dict[\"entry\"] = []\n", + "df_dict[\"target\"] = []\n", "for fname in input_data_path:\n", - " with h5py.File(fname, 'r') as hdf5:\n", + " with h5py.File(fname, \"r\") as hdf5:\n", " for mol in hdf5.keys():\n", " target_value = float(hdf5[mol][\"target_values\"][\"binary\"][()])\n", - " df_dict['entry'].append(mol)\n", - " df_dict['target'].append(target_value)\n", + " df_dict[\"entry\"].append(mol)\n", + " df_dict[\"target\"].append(target_value)\n", "\n", "df = pd.DataFrame(data=df_dict)\n", "df.head()" @@ -165,9 +162,9 @@ "source": [ "As explained in [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb), for each data point there are two targets: \"BA\" and \"binary\". The first represents the strength of the interaction between two molecules that bind reversibly (interact) in nM, while the second represents its binary mapping, being 0 (BA > 500 nM) a not-binding complex and 1 (BA <= 500 nM) binding one.\n", "\n", - "For SRVs, each data point has a single target, \"binary\", which is 0 if the SRV is considered benign, and 1 if it is pathogenic, as explained in [data_generation_srv.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/data_generation_srv.ipynb). \n", + "For SRVs, each data point has a single target, \"binary\", which is 0 if the SRV is considered benign, and 1 if it is pathogenic, as explained in [data_generation_srv.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/data_generation_srv.ipynb).\n", "\n", - "The pandas DataFrame `df` is used only to split data points into training, validation and test sets according to the \"binary\" target - using target stratification to keep the proportion of 0s and 1s constant among the different sets. Training and validation sets will be used during the training for updating the network weights, while the test set will be held out as an independent test and will be used later for the model evaluation." + "The pandas DataFrame `df` is used only to split data points into training, validation and test sets according to the \"binary\" target - using target stratification to keep the proportion of 0s and 1s constant among the different sets. 
Training and validation sets will be used during the training for updating the network weights, while the test set will be held out as an independent test and will be used later for the model evaluation.\n" ] }, { @@ -179,17 +176,17 @@ "df_train, df_test = train_test_split(df, test_size=0.1, stratify=df.target, random_state=42)\n", "df_train, df_valid = train_test_split(df_train, test_size=0.2, stratify=df_train.target, random_state=42)\n", "\n", - "print(f'Data statistics:\\n')\n", - "print(f'Total samples: {len(df)}\\n')\n", - "print(f'Training set: {len(df_train)} samples, {round(100*len(df_train)/len(df))}%')\n", - "print(f'\\t- Class 0: {len(df_train[df_train.target == 0])} samples, {round(100*len(df_train[df_train.target == 0])/len(df_train))}%')\n", - "print(f'\\t- Class 1: {len(df_train[df_train.target == 1])} samples, {round(100*len(df_train[df_train.target == 1])/len(df_train))}%')\n", - "print(f'Validation set: {len(df_valid)} samples, {round(100*len(df_valid)/len(df))}%')\n", - "print(f'\\t- Class 0: {len(df_valid[df_valid.target == 0])} samples, {round(100*len(df_valid[df_valid.target == 0])/len(df_valid))}%')\n", - "print(f'\\t- Class 1: {len(df_valid[df_valid.target == 1])} samples, {round(100*len(df_valid[df_valid.target == 1])/len(df_valid))}%')\n", - "print(f'Testing set: {len(df_test)} samples, {round(100*len(df_test)/len(df))}%')\n", - "print(f'\\t- Class 0: {len(df_test[df_test.target == 0])} samples, {round(100*len(df_test[df_test.target == 0])/len(df_test))}%')\n", - "print(f'\\t- Class 1: {len(df_test[df_test.target == 1])} samples, {round(100*len(df_test[df_test.target == 1])/len(df_test))}%')" + "print(f\"Data statistics:\\n\")\n", + "print(f\"Total samples: {len(df)}\\n\")\n", + "print(f\"Training set: {len(df_train)} samples, {round(100*len(df_train)/len(df))}%\")\n", + "print(f\"\\t- Class 0: {len(df_train[df_train.target == 0])} samples, {round(100*len(df_train[df_train.target == 0])/len(df_train))}%\")\n", + "print(f\"\\t- Class 1: {len(df_train[df_train.target == 1])} samples, {round(100*len(df_train[df_train.target == 1])/len(df_train))}%\")\n", + "print(f\"Validation set: {len(df_valid)} samples, {round(100*len(df_valid)/len(df))}%\")\n", + "print(f\"\\t- Class 0: {len(df_valid[df_valid.target == 0])} samples, {round(100*len(df_valid[df_valid.target == 0])/len(df_valid))}%\")\n", + "print(f\"\\t- Class 1: {len(df_valid[df_valid.target == 1])} samples, {round(100*len(df_valid[df_valid.target == 1])/len(df_valid))}%\")\n", + "print(f\"Testing set: {len(df_test)} samples, {round(100*len(df_test)/len(df))}%\")\n", + "print(f\"\\t- Class 0: {len(df_test[df_test.target == 0])} samples, {round(100*len(df_test[df_test.target == 0])/len(df_test))}%\")\n", + "print(f\"\\t- Class 1: {len(df_test[df_test.target == 1])} samples, {round(100*len(df_test[df_test.target == 1])/len(df_test))}%\")" ] }, { @@ -199,7 +196,7 @@ "source": [ "## Classification example\n", "\n", - "A GNN and a CNN can be trained for a classification predictive task, which consists in predicting the \"binary\" target values. " + "A GNN and a CNN can be trained for a classification predictive task, which consists in predicting the \"binary\" target values.\n" ] }, { @@ -207,7 +204,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### GNN" + "### GNN\n" ] }, { @@ -215,7 +212,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### GraphDataset" + "#### GraphDataset\n" ] }, { @@ -226,14 +223,15 @@ "For training GNNs the user can create `GraphDataset` instances. 
This class inherits from the `DeeprankDataset` class, which in turn inherits from the `Dataset` [PyTorch Geometric class](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/data/dataset.html), a base class for creating graph datasets.\n", "\n", "A few notes about `GraphDataset` parameters:\n", - "- By default, all features contained in the HDF5 files are used, but the user can specify `node_features` and `edge_features` in `GraphDataset` if not all of them are needed. See the [docs](https://deeprank2.readthedocs.io/en/latest/features.html) for more details about all the possible pre-implemented features. \n", + "\n", + "- By default, all features contained in the HDF5 files are used, but the user can specify `node_features` and `edge_features` in `GraphDataset` if not all of them are needed. See the [docs](https://deeprank2.readthedocs.io/en/latest/features.html) for more details about all the possible pre-implemented features.\n", "- For regression, `task` should be set to `regress` and the `target` to `BA`, which is a continuous variable and therefore suitable for regression tasks.\n", "- For the `GraphDataset` class it is possible to define a dictionary to indicate which transformations to apply to the features, the transformations being lambda functions and/or standardization.\n", " - If the `standardize` key is `True`, standardization is applied after transformation. Standardization consists of applying the following formula to each feature's value: ${x' = \\frac{x - \\mu}{\\sigma}}$, where ${\\mu}$ is the mean and ${\\sigma}$ the standard deviation. Standardization is a scaling method where the values are centered around the mean with a unit standard deviation.\n", " - The transformation to apply can be specified as a lambda function as the value of the key `transform`, which defaults to `None`.\n", - " - Since in the provided example standardization is applied, the training features' means and standard deviations need to be used for scaling validation and test sets. For doing so, `train_source` parameter is used. When `train_source` parameter is set, it will be used to scale the validation/testing sets. You need to pass `features_transform` to the training dataset only, since in other cases it will be ignored and only the one of `train_source` will be considered. \n", - " - Note that transformations have not currently been implemented for the `GridDataset` class. \n", - " - In the example below a logarithmic transformation and then the standardization are applied to all the features. It is also possible to use specific features as keys for indicating that transformation and/or standardization need to be apply to few features only." + " - Since in the provided example standardization is applied, the training features' means and standard deviations need to be used for scaling the validation and test sets. For doing so, the `train_source` parameter is used. When `train_source` is set, it will be used to scale the validation/testing sets. You need to pass `features_transform` to the training dataset only, since in other cases it will be ignored and only that of `train_source` will be considered.\n", + " - Note that transformations have not currently been implemented for the `GridDataset` class.\n", + " - In the example below a logarithmic transformation and then the standardization are applied to all the features. 
It is also possible to use specific features as keys for indicating that transformation and/or standardization need to be apply to few features only.\n" ] }, { @@ -246,29 +244,29 @@ "task = \"classif\"\n", "node_features = [\"res_type\"]\n", "edge_features = [\"distance\"]\n", - "features_transform = {'all': {'transform': lambda x: np.cbrt(x), 'standardize': True}}\n", + "features_transform = {\"all\": {\"transform\": lambda x: np.cbrt(x), \"standardize\": True}}\n", "\n", - "print('Loading training data...')\n", + "print(\"Loading training data...\")\n", "dataset_train = GraphDataset(\n", - " hdf5_path = input_data_path,\n", - " subset = list(df_train.entry), # selects only data points with ids in df_train.entry\n", - " node_features = node_features,\n", - " edge_features = edge_features,\n", - " features_transform = features_transform,\n", - " target = target,\n", - " task = task\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", + " node_features=node_features,\n", + " edge_features=edge_features,\n", + " features_transform=features_transform,\n", + " target=target,\n", + " task=task,\n", ")\n", - "print('\\nLoading validation data...')\n", + "print(\"\\nLoading validation data...\")\n", "dataset_val = GraphDataset(\n", - " hdf5_path = input_data_path,\n", - " subset = list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", - " train_source = dataset_train\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", + " train_source=dataset_train,\n", ")\n", - "print('\\nLoading test data...')\n", + "print(\"\\nLoading test data...\")\n", "dataset_test = GraphDataset(\n", - " hdf5_path = input_data_path,\n", - " subset = list(df_test.entry), # selects only data points with ids in df_test.entry\n", - " train_source = dataset_train\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", + " train_source=dataset_train,\n", ")" ] }, @@ -279,7 +277,7 @@ "source": [ "#### Trainer\n", "\n", - "The class `Trainer` implements training, validation and testing of PyTorch-based neural networks. " + "The class `Trainer` implements training, validation and testing of PyTorch-based neural networks.\n" ] }, { @@ -288,10 +286,11 @@ "metadata": {}, "source": [ "A few notes about `Trainer` parameters:\n", + "\n", "- `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. The `Trainer` class takes care of formatting the output shape according to the task. This tutorial uses a simple network, `NaiveNetwork` (implemented in `deeprank2.neuralnets.gnn.naive_gnn`). All GNN architectures already implemented in the pakcage can be found [here](https://github.com/DeepRank/deeprank-core/tree/main/deeprank2/neuralnets/gnn) and can be used for training or as a basis for implementing new ones.\n", "- `class_weights` is used for classification tasks only and assigns class weights based on the training dataset content to account for any potential inbalance between the classes. In this case the dataset is balanced (50% 0 and 50% 1), so it is not necessary to use it. It defaults to False.\n", "- `cuda` and `ngpu` are used for indicating whether to use CUDA and how many GPUs. 
By default, CUDA is not used and `ngpu` is 0.\n", - "- The user can specify a deeprank2 exporter or a custom one in `output_exporters` parameter, together with the path where to save the results. Exporters are used for storing predictions information collected later on during training and testing. Later the results saved by `HDF5OutputExporter` will be read and evaluated." + "- The user can specify a deeprank2 exporter or a custom one in `output_exporters` parameter, together with the path where to save the results. Exporters are used for storing predictions information collected later on during training and testing. Later the results saved by `HDF5OutputExporter` will be read and evaluated.\n" ] }, { @@ -299,7 +298,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Training" + "##### Training\n" ] }, { @@ -309,11 +308,11 @@ "outputs": [], "source": [ "trainer = Trainer(\n", - " neuralnet = NaiveNetwork,\n", - " dataset_train = dataset_train,\n", - " dataset_val = dataset_val,\n", - " dataset_test = dataset_test,\n", - " output_exporters = [HDF5OutputExporter(os.path.join(output_path, f\"gnn_{task}\"))]\n", + " neuralnet=NaiveNetwork,\n", + " dataset_train=dataset_train,\n", + " dataset_val=dataset_val,\n", + " dataset_test=dataset_test,\n", + " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"gnn_{task}\"))],\n", ")" ] }, @@ -322,7 +321,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The default optimizer is `torch.optim.Adam`. It is possible to specify optimizer's parameters or to use another PyTorch optimizer object:" + "The default optimizer is `torch.optim.Adam`. It is possible to specify optimizer's parameters or to use another PyTorch optimizer object:\n" ] }, { @@ -348,9 +347,10 @@ "Then the model can be trained using the `train()` method of the `Trainer` class.\n", "\n", "A few notes about `train()` method parameters:\n", + "\n", "- `earlystop_patience`, `earlystop_maxgap` and `min_epoch` are used for controlling early stopping logic. `earlystop_patience` indicates the number of epochs after which the training ends if the validation loss does not improve. `earlystop_maxgap` indicated the maximum difference allowed between validation and training loss, and `min_epoch` is the minimum number of epochs to be reached before evaluating `maxgap`.\n", "- If `validate` is set to `True`, validation is performed on an independent dataset, which has been called `dataset_val` few cells above. If set to `False`, validation is performed on the training dataset itself (not recommended).\n", - "- `num_workers` can be set for indicating how many subprocesses to use for data loading. The default is 0 and it means that the data will be loaded in the main process." + "- `num_workers` can be set for indicating how many subprocesses to use for data loading. 
The default is 0 and it means that the data will be loaded in the main process.\n" ] }, { @@ -366,20 +366,21 @@ "min_epoch = 10\n", "\n", "trainer.train(\n", - " nepoch = epochs,\n", - " batch_size = batch_size,\n", - " earlystop_patience = earlystop_patience,\n", - " earlystop_maxgap = earlystop_maxgap,\n", - " min_epoch = min_epoch,\n", - " validate = True,\n", - " filename = os.path.join(output_path, f\"gnn_{task}\", \"model.pth.tar\"))\n", + " nepoch=epochs,\n", + " batch_size=batch_size,\n", + " earlystop_patience=earlystop_patience,\n", + " earlystop_maxgap=earlystop_maxgap,\n", + " min_epoch=min_epoch,\n", + " validate=True,\n", + " filename=os.path.join(output_path, f\"gnn_{task}\", \"model.pth.tar\"),\n", + ")\n", "\n", "epoch = trainer.epoch_saved_model\n", "print(f\"Model saved at epoch {epoch}\")\n", "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", - "print(f'Total # of parameters: {pytorch_total_params}')\n", + "print(f\"Total # of parameters: {pytorch_total_params}\")\n", "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", - "print(f'Total # of trainable parameters: {pytorch_trainable_params}')" + "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" ] }, { @@ -389,7 +390,7 @@ "source": [ "##### Testing\n", "\n", - "And the trained model can be tested on `dataset_test`:" + "And the trained model can be tested on `dataset_test`:\n" ] }, { @@ -410,7 +411,7 @@ "\n", "Finally, the results saved by `HDF5OutputExporter` can be inspected, which can be found in the `data/ppi/gnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`. Note that the folder contains the saved pre-trained model as well.\n", "\n", - "`output_exporter.hdf5` contains [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which refer to each phase, e.g. training and testing if both are run, only one of them otherwise. Training phase includes validation results as well. This HDF5 file can be read as a Pandas Dataframe:" + "`output_exporter.hdf5` contains [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which refer to each phase, e.g. training and testing if both are run, only one of them otherwise. Training phase includes validation results as well. 
This HDF5 file can be read as a Pandas Dataframe:\n" ] }, { @@ -419,8 +420,12 @@ "metadata": {}, "outputs": [], "source": [ - "output_train = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n", - "output_test = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n", + "output_train = pd.read_hdf(\n", + " os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n", + ")\n", + "output_test = pd.read_hdf(\n", + " os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n", + ")\n", "output_train.head()" ] }, @@ -431,7 +436,7 @@ "source": [ "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n", "\n", - "For example, the loss across the epochs can be plotted for the training and the validation sets:" + "For example, the loss across the epochs can be plotted for the training and the validation sets:\n" ] }, { @@ -440,20 +445,16 @@ "metadata": {}, "outputs": [], "source": [ - "fig = px.line(\n", - " output_train,\n", - " x='epoch',\n", - " y='loss',\n", - " color='phase',\n", - " markers=True)\n", + "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", "\n", "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", "\n", "fig.update_layout(\n", - " xaxis_title='Epoch #',\n", - " yaxis_title='Loss',\n", - " title='Loss vs epochs - GNN training',\n", - " width=700, height=400,\n", + " xaxis_title=\"Epoch #\",\n", + " yaxis_title=\"Loss\",\n", + " title=\"Loss vs epochs - GNN training\",\n", + " width=700,\n", + " height=400,\n", ")" ] }, @@ -462,7 +463,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "And now a few metrics of interest for classification tasks can be printed out: the [area under the ROC curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) (AUC), and for a threshold of 0.5 the [precision, recall, accuracy and f1 score](https://en.wikipedia.org/wiki/Precision_and_recall#Definition)." 
+ "And now a few metrics of interest for classification tasks can be printed out: the [area under the ROC curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) (AUC), and for a threshold of 0.5 the [precision, recall, accuracy and f1 score](https://en.wikipedia.org/wiki/Precision_and_recall#Definition).\n" ] }, { @@ -473,23 +474,23 @@ "source": [ "threshold = 0.5\n", "df = pd.concat([output_train, output_test])\n", - "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == 'testing'))]\n", + "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", "\n", - "for idx, set in enumerate(['training', 'validation', 'testing']):\n", + "for idx, set in enumerate([\"training\", \"validation\", \"testing\"]):\n", " df_plot_phase = df_plot[(df_plot.phase == set)]\n", " y_true = df_plot_phase.target\n", " y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]\n", "\n", - " print(f'\\nMetrics for {set}:')\n", + " print(f\"\\nMetrics for {set}:\")\n", " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", " auc_score = auc(fpr_roc, tpr_roc)\n", - " print(f'AUC: {round(auc_score, 1)}')\n", - " print(f'Considering a threshold of {threshold}')\n", - " y_pred = (y_score > threshold)*1\n", - " print(f'- Precision: {round(precision_score(y_true, y_pred), 1)}')\n", - " print(f'- Recall: {round(recall_score(y_true, y_pred), 1)}')\n", - " print(f'- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}')\n", - " print(f'- F1: {round(f1_score(y_true, y_pred), 1)}')" + " print(f\"AUC: {round(auc_score, 1)}\")\n", + " print(f\"Considering a threshold of {threshold}\")\n", + " y_pred = (y_score > threshold) * 1\n", + " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", + " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" ] }, { @@ -499,7 +500,7 @@ "source": [ "Note that the poor performance of this network is due to the small number of datapoints used in this tutorial. For a more reliable network we suggest using a number of data points on the order of at least tens of thousands.\n", "\n", - "The same exercise can be repeated but using grids instead of graphs and CNNs instead of GNNs." + "The same exercise can be repeated but using grids instead of graphs and CNNs instead of GNNs.\n" ] }, { @@ -507,7 +508,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### CNN" + "### CNN\n" ] }, { @@ -515,7 +516,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### GridDataset" + "#### GridDataset\n" ] }, { @@ -526,8 +527,9 @@ "For training CNNs the user can create `GridDataset` instances.\n", "\n", "A few notes about `GridDataset` parameters:\n", - "- By default, all features contained in the HDF5 files are used, but the user can specify `features` in `GridDataset` if not all of them are needed. Since grids features are derived from node and edge features mapped from graphs to grid, the easiest way to see which features are available is to look at the HDF5 file, as explained in detail in `data_generation_ppi.ipynb` and `data_generation_srv.ipynb`, section \"Other tools\". \n", - "- As is the case for a `GraphDataset`, `task` can be assigned to `regress` and `target` to `BA` to perform a regression task. 
As mentioned previously, we do not provide sample data to perform a regression task for SRVs." + "\n", + "- By default, all features contained in the HDF5 files are used, but the user can specify `features` in `GridDataset` if not all of them are needed. Since grids features are derived from node and edge features mapped from graphs to grid, the easiest way to see which features are available is to look at the HDF5 file, as explained in detail in `data_generation_ppi.ipynb` and `data_generation_srv.ipynb`, section \"Other tools\".\n", + "- As is the case for a `GraphDataset`, `task` can be assigned to `regress` and `target` to `BA` to perform a regression task. As mentioned previously, we do not provide sample data to perform a regression task for SRVs.\n" ] }, { @@ -539,24 +541,24 @@ "target = \"binary\"\n", "task = \"classif\"\n", "\n", - "print('Loading training data...')\n", + "print(\"Loading training data...\")\n", "dataset_train = GridDataset(\n", - " hdf5_path = input_data_path,\n", - " subset = list(df_train.entry), # selects only data points with ids in df_train.entry\n", - " target = target,\n", - " task = task\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", + " target=target,\n", + " task=task,\n", ")\n", - "print('\\nLoading validation data...')\n", + "print(\"\\nLoading validation data...\")\n", "dataset_val = GridDataset(\n", - " hdf5_path = input_data_path,\n", - " subset = list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", - " train_source = dataset_train\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", + " train_source=dataset_train,\n", ")\n", - "print('\\nLoading test data...')\n", + "print(\"\\nLoading test data...\")\n", "dataset_test = GridDataset(\n", - " hdf5_path = input_data_path,\n", - " subset = list(df_test.entry), # selects only data points with ids in df_test.entry\n", - " train_source = dataset_train \n", + " hdf5_path=input_data_path,\n", + " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", + " train_source=dataset_train,\n", ")" ] }, @@ -567,7 +569,7 @@ "source": [ "#### Trainer\n", "\n", - "As for graphs, the class `Trainer` is used for training, validation and testing of the PyTorch-based CNN. " + "As for graphs, the class `Trainer` is used for training, validation and testing of the PyTorch-based CNN.\n" ] }, { @@ -576,7 +578,7 @@ "metadata": {}, "source": [ "- Also in this case, `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. This tutorial uses `CnnClassification` (implemented in `deeprank2.neuralnets.cnn.model3d`). All CNN architectures already implemented in the pakcage can be found [here](https://github.com/DeepRank/deeprank2/tree/main/deeprank2/neuralnets/cnn) and can be used for training or as a basis for implementing new ones.\n", - "- The rest of the `Trainer` parameters can be used as explained already for graphs." 
+ "- The rest of the `Trainer` parameters can be used as explained already for graphs.\n" ] }, { @@ -584,7 +586,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Training" + "##### Training\n" ] }, { @@ -603,30 +605,31 @@ "min_epoch = 10\n", "\n", "trainer = Trainer(\n", - " neuralnet = CnnClassification,\n", - " dataset_train = dataset_train,\n", - " dataset_val = dataset_val,\n", - " dataset_test = dataset_test,\n", - " output_exporters = [HDF5OutputExporter(os.path.join(output_path, f\"cnn_{task}\"))]\n", + " neuralnet=CnnClassification,\n", + " dataset_train=dataset_train,\n", + " dataset_val=dataset_val,\n", + " dataset_test=dataset_test,\n", + " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"cnn_{task}\"))],\n", ")\n", "\n", "trainer.configure_optimizers(optimizer, lr, weight_decay)\n", "\n", "trainer.train(\n", - " nepoch = epochs,\n", - " batch_size = batch_size,\n", - " earlystop_patience = earlystop_patience,\n", - " earlystop_maxgap = earlystop_maxgap,\n", - " min_epoch = min_epoch,\n", - " validate = True,\n", - " filename = os.path.join(output_path, f\"cnn_{task}\", \"model.pth.tar\"))\n", + " nepoch=epochs,\n", + " batch_size=batch_size,\n", + " earlystop_patience=earlystop_patience,\n", + " earlystop_maxgap=earlystop_maxgap,\n", + " min_epoch=min_epoch,\n", + " validate=True,\n", + " filename=os.path.join(output_path, f\"cnn_{task}\", \"model.pth.tar\"),\n", + ")\n", "\n", "epoch = trainer.epoch_saved_model\n", "print(f\"Model saved at epoch {epoch}\")\n", "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", - "print(f'Total # of parameters: {pytorch_total_params}')\n", + "print(f\"Total # of parameters: {pytorch_total_params}\")\n", "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", - "print(f'Total # of trainable parameters: {pytorch_trainable_params}')" + "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" ] }, { @@ -636,7 +639,7 @@ "source": [ "##### Testing\n", "\n", - "And the trained model can be tested on `dataset_test`:" + "And the trained model can be tested on `dataset_test`:\n" ] }, { @@ -655,7 +658,7 @@ "source": [ "##### Results visualization\n", "\n", - "As for GNNs, the results saved by `HDF5OutputExporter` can be inspected, and are saved in the `data/ppi/cnn_classif/` or `data/srv/cnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`, together with the saved pre-trained model. 
" + "As for GNNs, the results saved by `HDF5OutputExporter` can be inspected, and are saved in the `data/ppi/cnn_classif/` or `data/srv/cnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`, together with the saved pre-trained model.\n" ] }, { @@ -664,8 +667,12 @@ "metadata": {}, "outputs": [], "source": [ - "output_train = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n", - "output_test = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n", + "output_train = pd.read_hdf(\n", + " os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\"\n", + ")\n", + "output_test = pd.read_hdf(\n", + " os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\"\n", + ")\n", "output_train.head()" ] }, @@ -674,7 +681,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Also in this case, the loss across the epochs can be plotted for the training and the validation sets:" + "Also in this case, the loss across the epochs can be plotted for the training and the validation sets:\n" ] }, { @@ -683,20 +690,16 @@ "metadata": {}, "outputs": [], "source": [ - "fig = px.line(\n", - " output_train,\n", - " x='epoch',\n", - " y='loss',\n", - " color='phase',\n", - " markers=True)\n", + "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", "\n", "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", "\n", "fig.update_layout(\n", - " xaxis_title='Epoch #',\n", - " yaxis_title='Loss',\n", - " title='Loss vs epochs - CNN training',\n", - " width=700, height=400,\n", + " xaxis_title=\"Epoch #\",\n", + " yaxis_title=\"Loss\",\n", + " title=\"Loss vs epochs - CNN training\",\n", + " width=700,\n", + " height=400,\n", ")" ] }, @@ -705,7 +708,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "And some metrics of interest for classification tasks:" + "And some metrics of interest for classification tasks:\n" ] }, { @@ -716,23 +719,23 @@ "source": [ "threshold = 0.5\n", "df = pd.concat([output_train, output_test])\n", - "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == 'testing'))]\n", + "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", "\n", - "for idx, set in enumerate(['training', 'validation', 'testing']):\n", + "for idx, set in enumerate([\"training\", \"validation\", \"testing\"]):\n", " df_plot_phase = df_plot[(df_plot.phase == set)]\n", " y_true = df_plot_phase.target\n", " y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]\n", "\n", - " print(f'\\nMetrics for {set}:')\n", + " print(f\"\\nMetrics for {set}:\")\n", " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", " auc_score = auc(fpr_roc, tpr_roc)\n", - " print(f'AUC: {round(auc_score, 1)}')\n", - " print(f'Considering a threshold of {threshold}')\n", - " y_pred = (y_score > threshold)*1\n", - " print(f'- Precision: {round(precision_score(y_true, y_pred), 1)}')\n", - " print(f'- Recall: {round(recall_score(y_true, y_pred), 1)}')\n", - " print(f'- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}')\n", - " print(f'- F1: {round(f1_score(y_true, y_pred), 1)}')" + " print(f\"AUC: {round(auc_score, 1)}\")\n", + " print(f\"Considering a threshold of {threshold}\")\n", + " y_pred = (y_score > threshold) * 1\n", + " print(f\"- Precision: 
{round(precision_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", + " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" ] }, { @@ -740,7 +743,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It's important to note that the dataset used in this analysis is not sufficiently large to provide conclusive and reliable insights. Depending on your specific application, you might find regression, classification, GNNs, and/or CNNs to be valuable options. Feel free to choose the approach that best aligns with your particular problem!" + "It's important to note that the dataset used in this analysis is not sufficiently large to provide conclusive and reliable insights. Depending on your specific application, you might find regression, classification, GNNs, and/or CNNs to be valuable options. Feel free to choose the approach that best aligns with your particular problem!\n" ] } ], @@ -760,7 +763,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" }, "orig_nbformat": 4 },