diff --git a/deeprank2/features/exposure.py b/deeprank2/features/exposure.py index 88d512ad7..b5d4aa71f 100644 --- a/deeprank2/features/exposure.py +++ b/deeprank2/features/exposure.py @@ -1,11 +1,9 @@ import logging import signal import sys -import warnings from typing import NoReturn import numpy as np -from Bio.PDB.Atom import PDBConstructionWarning from Bio.PDB.HSExposure import HSExposureCA from Bio.PDB.PDBParser import PDBParser from Bio.PDB.ResidueDepth import get_surface, residue_depth @@ -42,9 +40,8 @@ def add_features( # noqa:D103 signal.signal(signal.SIGINT, handle_sigint) signal.signal(signal.SIGALRM, handle_timeout) - with warnings.catch_warnings(record=PDBConstructionWarning): - parser = PDBParser() - structure = parser.get_structure("_tmp", pdb_path) + parser = PDBParser(QUIET=True) + structure = parser.get_structure("_tmp", pdb_path) bio_model = structure[0] try: diff --git a/deeprank2/features/irc.py b/deeprank2/features/irc.py index f1d0f4c07..249cbd1bd 100644 --- a/deeprank2/features/irc.py +++ b/deeprank2/features/irc.py @@ -40,7 +40,11 @@ def __init__(self, residue: tuple[str, int, str], polarity: Polarity): self.connections["all"] = [] -def get_IRCs(pdb_path: str, chains: list[str], cutoff: float = 5.5) -> dict[str, _ContactDensity]: +def get_IRCs( + pdb_path: str, + chains: list[str], + cutoff: float = 5.5, +) -> dict[str, _ContactDensity]: """Get all close contact residues from the opposite chain. 
Args: diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/deeprank2/tools/pdbprep/hydrogenation.py b/deeprank2/tools/pdbprep/hydrogenation.py new file mode 100644 index 000000000..40733742f --- /dev/null +++ b/deeprank2/tools/pdbprep/hydrogenation.py @@ -0,0 +1,438 @@ +from dataclasses import dataclass +from tempfile import TemporaryFile + +from openmm import LangevinIntegrator, unit +from openmm import app as mmapp +from pdb2pqr.config import FORCE_FIELDS +from pdb2pqr.main import main_driver as pdb2pqr + +_MIN_ATOMS_TO_PROTONATE = 5 # TODO: why do we need this check? +_TEMPERATURE = 310 + + +def add_hydrogens( + pdb_str: str, + max_iterations: int = 100, + constraint_tolerance: float = 1e-05, + random_seed: int = 917, +) -> str: + """Add hydrogens to pdb file. + + Args: + pdb_str: String representation of pdb file, preprocessed using deeprank2.tools.pdbprep.preprocess. + max_iterations: Maximum number of iterations to perform during energy minimization. Defaults to 100. + constraint_tolerance: Distance tolerance of LangevinIntegrator within which constraints are maintained, as a + fraction of the constrained distance. Defaults to 1e-05. + random_seed: Random seed for LangevinIntegrator. 
+ """ + with TemporaryFile(mode="w", suffix="pdb") as input_pdb, TemporaryFile(mode="r") as output_pdb: + input_pdb.write(pdb_str) + + # PARAMETERS + forcefield_model = "amber14-all.xml" #'charmm36.xml' + water_model = "amber14/tip3p.xml" #'charmm36/tip3p-pme-b.xml' + platform_properties = {"Threads": str(1)} + + # PREPARES MODEL + forcefield = mmapp.ForceField(forcefield_model, water_model) + structure = mmapp.PDBFile(input_pdb) + protonated_sequence = _detect_protonation_state(pdb_str) + + model = mmapp.Modeller(structure.topology, structure.positions) + model.addHydrogens(forcefield=forcefield, variants=protonated_sequence) + + structure.positions = model.positions + structure.topology = model.topology + + system = forcefield.createSystem(structure.topology) + + integrator = LangevinIntegrator( + temperature=_TEMPERATURE * unit.kelvin, + frictionCoeff=1.0 / unit.picosecond, + stepSize=2.0 * unit.femtosecond, + ) + + integrator.setRandomNumberSeed(random_seed) + integrator.setConstraintTolerance(constraint_tolerance) + + simulation = mmapp.Simulation( + structure.topology, + system, + integrator, + platformProperties=platform_properties, + ) + + context = simulation.context + context.setPositions(model.positions) + + state = context.getState(getEnergy=True) + ini_ene = state.getPotentialEnergy().value_in_unit(unit.kilocalorie_per_mole) # noqa:F841 TODO: check if this line is needed + simulation.minimizeEnergy(maxIterations=max_iterations) + structure.positions = context.getState(getPositions=True).getPositions() + + # TODO: check whether these lines need to be repeated or whether that's a typo. 
+ state = context.getState(getEnergy=True) + simulation.minimizeEnergy(maxIterations=max_iterations) + structure.positions = context.getState(getPositions=True).getPositions() + + mmapp.PDBFile.writeFile(structure.topology, structure.positions, output_pdb) + + return output_pdb.read() + + +def _detect_protonation_state(pdb_str: str) -> list[str | None]: + """Detect protonation states and return them as a sequence of alternative residue names.""" + _calculate_protonation_state(pdb_str) + + pdb_lines = pdb_str.splitlines() + protonable_residues = ("HIS", "ASP", "GLU", "CYS", "LYS") + + # initialize + prev_resid = None + prev_resname = None + atoms_in_residue = set() + residues = [] + + for i, line in enumerate(pdb_lines): + if not line.startswith("ATOM"): + continue + + resid = line[21:26] # chain ID + res number + resname = line[17:20] + atom_name = line[12:16].strip() + + if (resid != prev_resid and len(atoms_in_residue) >= _MIN_ATOMS_TO_PROTONATE) or i == len(pdb_lines): + if prev_resname in protonable_residues: + residues.append(_protonation_resname(prev_resname, atoms_in_residue)) + else: + residues.append(None) + atoms_in_residue.clear() + + atoms_in_residue.add(atom_name) + prev_resid = resid + prev_resname = resname + + return residues + + +def _protonation_resname( # noqa:PLR0911 + resname: str, + atoms_in_residue: list[str], +) -> str: + """Return alternate residue name based on protonation state.""" + if resname == "HIS": + if "HD1" in atoms_in_residue and "HE2" in atoms_in_residue: + return "HIP" + if "HD1" in atoms_in_residue: + return "HID" + if "HE2" in atoms_in_residue: + return "HIE" + return "HIN" + + if resname == "ASP" and ("HD2" in atoms_in_residue or "HD1" in atoms_in_residue): + return "ASN" + + if resname == "GLU" and ("HE2" in atoms_in_residue or "HE1" in atoms_in_residue): + return "GLH" + + if resname == "LYS" and not all(_a in atoms_in_residue for _a in ("HZ1", "HZ2", "HZ3")): + return "LYN" + + if resname == "CYS" and "HG" not in 
atoms_in_residue: + return "CYX" + + return resname + + +def _calculate_protonation_state( + pdb_str: str, + forcefield: str = "AMBER", +) -> str: + """Calculate the protonation states using PDB2PQR. + + PDB2PQR can only use files (no strings) as input and output, which is why this function is wrapped inside + TemporaryFile context managers. + """ + with TemporaryFile(mode="w", suffix="pdb") as input_pdb, TemporaryFile(mode="r") as output_pdb: + input_pdb.write(pdb_str) + + input_args = _Pdb2pqrArgs(input_pdb, output_pdb, forcefield) + pdb2pqr(input_args) + + return output_pdb.read() + + +@dataclass +class _Pdb2pqrArgs: + """Input arguments to `main_driver` function of PDB2PQR. + + These are usually given via CLI using argparse. All arguments, including those kept as default need to be given to + `main_driver` if called from script. + The argument given to `main_driver` is accessed via dot notation and is iterated over, which is why this is created + as a dataclass with an iterator. + + Args*: + input_path: Input file path. + output_pqr: Output file path. + ff: Name of the selected forcefield. + + *all other arguments should remain untouched. + + Raises: + ValueError: if the forcefield is not recognized + """ + + input_path: str + output_pqr: str + ff: str = "AMBER" + + # arguments set different from default + debump: bool = True + keep_chain: bool = True + log_level: str = "CRITICAL" + + # arguments kept as default + ph: float = 7.0 + assign_only: bool = False + clean: bool = False + userff: None = None + ffout: None = None + usernames: None = None + ligand: None = None + neutraln: bool = False + neutralc: bool = False + drop_water: bool = False + pka_method: None = None + opt: bool = True + include_header: bool = False + whitespace: bool = False + pdb_output: None = None + apbs_input: None = None + + def __post_init__(self): + self._index = 0 + if self.ff.lower() not in FORCE_FIELDS: + msg = f"Forcefield {self.ff} not recognized. Valid options: {FORCE_FIELDS}." 
+ raise ValueError(msg) + if self.ff.lower() != "amber": + msg = f"Forcefield given as {self.ff}. Currently only AMBER forcefield is implemented." + raise NotImplementedError(msg) + + def __iter__(self): + return self + + def __next__(self): + settings = vars(self) + if self._index < len(settings): + setting = list(settings)[self._index] + self._index += 1 + return setting + raise StopIteration + + +# Part of this module is modified from https://github.com/DeepRank/pdbprep/blob/main/ +# (original module names: detect_protonation.py and add_hydrogens.py), +# written by João M.C. Teixeira (https://github.com/joaomcteixeira) +# published under the following license: + +# Apache License +# Version 2.0, January 2004 +# http://www.apache.org/licenses/ + +# TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +# 1. Definitions. + +# "License" shall mean the terms and conditions for use, reproduction, +# and distribution as defined by Sections 1 through 9 of this document. + +# "Licensor" shall mean the copyright owner or entity authorized by +# the copyright owner that is granting the License. + +# "Legal Entity" shall mean the union of the acting entity and all +# other entities that control, are controlled by, or are under common +# control with that entity. For the purposes of this definition, +# "control" means (i) the power, direct or indirect, to cause the +# direction or management of such entity, whether by contract or +# otherwise, or (ii) ownership of fifty percent (50%) or more of the +# outstanding shares, or (iii) beneficial ownership of such entity. + +# "You" (or "Your") shall mean an individual or Legal Entity +# exercising permissions granted by this License. + +# "Source" form shall mean the preferred form for making modifications, +# including but not limited to software source code, documentation +# source, and configuration files.
+ +# "Object" form shall mean any form resulting from mechanical +# transformation or translation of a Source form, including but +# not limited to compiled object code, generated documentation, +# and conversions to other media types. + +# "Work" shall mean the work of authorship, whether in Source or +# Object form, made available under the License, as indicated by a +# copyright notice that is included in or attached to the work +# (an example is provided in the Appendix below). + +# "Derivative Works" shall mean any work, whether in Source or Object +# form, that is based on (or derived from) the Work and for which the +# editorial revisions, annotations, elaborations, or other modifications +# represent, as a whole, an original work of authorship. For the purposes +# of this License, Derivative Works shall not include works that remain +# separable from, or merely link (or bind by name) to the interfaces of, +# the Work and Derivative Works thereof. + +# "Contribution" shall mean any work of authorship, including +# the original version of the Work and any modifications or additions +# to that Work or Derivative Works thereof, that is intentionally +# submitted to Licensor for inclusion in the Work by the copyright owner +# or by an individual or Legal Entity authorized to submit on behalf of +# the copyright owner. For the purposes of this definition, "submitted" +# means any form of electronic, verbal, or written communication sent +# to the Licensor or its representatives, including but not limited to +# communication on electronic mailing lists, source code control systems, +# and issue tracking systems that are managed by, or on behalf of, the +# Licensor for the purpose of discussing and improving the Work, but +# excluding communication that is conspicuously marked or otherwise +# designated in writing by the copyright owner as "Not a Contribution." 
+ +# "Contributor" shall mean Licensor and any individual or Legal Entity +# on behalf of whom a Contribution has been received by Licensor and +# subsequently incorporated within the Work. + +# 2. Grant of Copyright License. Subject to the terms and conditions of +# this License, each Contributor hereby grants to You a perpetual, +# worldwide, non-exclusive, no-charge, royalty-free, irrevocable +# copyright license to reproduce, prepare Derivative Works of, +# publicly display, publicly perform, sublicense, and distribute the +# Work and such Derivative Works in Source or Object form. + +# 3. Grant of Patent License. Subject to the terms and conditions of +# this License, each Contributor hereby grants to You a perpetual, +# worldwide, non-exclusive, no-charge, royalty-free, irrevocable +# (except as stated in this section) patent license to make, have made, +# use, offer to sell, sell, import, and otherwise transfer the Work, +# where such license applies only to those patent claims licensable +# by such Contributor that are necessarily infringed by their +# Contribution(s) alone or by combination of their Contribution(s) +# with the Work to which such Contribution(s) was submitted. If You +# institute patent litigation against any entity (including a +# cross-claim or counterclaim in a lawsuit) alleging that the Work +# or a Contribution incorporated within the Work constitutes direct +# or contributory patent infringement, then any patent licenses +# granted to You under this License for that Work shall terminate +# as of the date such litigation is filed. + +# 4. Redistribution. 
You may reproduce and distribute copies of the +# Work or Derivative Works thereof in any medium, with or without +# modifications, and in Source or Object form, provided that You +# meet the following conditions: + +# (a) You must give any other recipients of the Work or +# Derivative Works a copy of this License; and + +# (b) You must cause any modified files to carry prominent notices +# stating that You changed the files; and + +# (c) You must retain, in the Source form of any Derivative Works +# that You distribute, all copyright, patent, trademark, and +# attribution notices from the Source form of the Work, +# excluding those notices that do not pertain to any part of +# the Derivative Works; and + +# (d) If the Work includes a "NOTICE" text file as part of its +# distribution, then any Derivative Works that You distribute must +# include a readable copy of the attribution notices contained +# within such NOTICE file, excluding those notices that do not +# pertain to any part of the Derivative Works, in at least one +# of the following places: within a NOTICE text file distributed +# as part of the Derivative Works; within the Source form or +# documentation, if provided along with the Derivative Works; or, +# within a display generated by the Derivative Works, if and +# wherever such third-party notices normally appear. The contents +# of the NOTICE file are for informational purposes only and +# do not modify the License. You may add Your own attribution +# notices within Derivative Works that You distribute, alongside +# or as an addendum to the NOTICE text from the Work, provided +# that such additional attribution notices cannot be construed +# as modifying the License. 
+ +# You may add Your own copyright statement to Your modifications and +# may provide additional or different license terms and conditions +# for use, reproduction, or distribution of Your modifications, or +# for any such Derivative Works as a whole, provided Your use, +# reproduction, and distribution of the Work otherwise complies with +# the conditions stated in this License. + +# 5. Submission of Contributions. Unless You explicitly state otherwise, +# any Contribution intentionally submitted for inclusion in the Work +# by You to the Licensor shall be under the terms and conditions of +# this License, without any additional terms or conditions. +# Notwithstanding the above, nothing herein shall supersede or modify +# the terms of any separate license agreement you may have executed +# with Licensor regarding such Contributions. + +# 6. Trademarks. This License does not grant permission to use the trade +# names, trademarks, service marks, or product names of the Licensor, +# except as required for reasonable and customary use in describing the +# origin of the Work and reproducing the content of the NOTICE file. + +# 7. Disclaimer of Warranty. Unless required by applicable law or +# agreed to in writing, Licensor provides the Work (and each +# Contributor provides its Contributions) on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied, including, without limitation, any warranties or conditions +# of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +# PARTICULAR PURPOSE. You are solely responsible for determining the +# appropriateness of using or redistributing the Work and assume any +# risks associated with Your exercise of permissions under this License. + +# 8. Limitation of Liability. 
In no event and under no legal theory, +# whether in tort (including negligence), contract, or otherwise, +# unless required by applicable law (such as deliberate and grossly +# negligent acts) or agreed to in writing, shall any Contributor be +# liable to You for damages, including any direct, indirect, special, +# incidental, or consequential damages of any character arising as a +# result of this License or out of the use or inability to use the +# Work (including but not limited to damages for loss of goodwill, +# work stoppage, computer failure or malfunction, or any and all +# other commercial damages or losses), even if such Contributor +# has been advised of the possibility of such damages. + +# 9. Accepting Warranty or Additional Liability. While redistributing +# the Work or Derivative Works thereof, You may choose to offer, +# and charge a fee for, acceptance of support, warranty, indemnity, +# or other liability obligations and/or rights consistent with this +# License. However, in accepting such obligations, You may act only +# on Your own behalf and on Your sole responsibility, not on behalf +# of any other Contributor, and only if You agree to indemnify, +# defend, and hold each Contributor harmless for any liability +# incurred by, or claims asserted against, such Contributor by reason +# of your accepting any such warranty or additional liability. + +# END OF TERMS AND CONDITIONS + +# APPENDIX: How to apply the Apache License to your work. + +# To apply the Apache License to your work, attach the following +# boilerplate notice, with the fields enclosed by brackets "[]" +# replaced with your own identifying information. (Don't include +# the brackets!) The text should be enclosed in the appropriate +# comment syntax for the file format. We also recommend that a +# file or class name and description of purpose be included on the +# same "printed page" as the copyright notice for easier +# identification within third-party archives. 
+ +# Copyright [yyyy] [name of copyright owner] + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/deeprank2/tools/pdbprep/preprocess.py b/deeprank2/tools/pdbprep/preprocess.py new file mode 100644 index 000000000..c0cbf68d0 --- /dev/null +++ b/deeprank2/tools/pdbprep/preprocess.py @@ -0,0 +1,111 @@ +from pathlib import Path +from tempfile import TemporaryFile + +from pdbtools import pdb_delresname, pdb_fixinsert, pdb_keepcoord, pdb_reatom, pdb_reres, pdb_rplresname, pdb_selaltloc, pdb_sort, pdb_tidy +from Pras_Server.RunType import InitRunType as PRAS + +from deeprank2.domain.aminoacidlist import amino_acids_by_code, amino_acids_by_letter + + +def preprocess_pdbs( + pdb_path: str | Path, + rename_residues: dict[str, str] | None = None, +) -> str: + """Preprocess a pdb file for adding/fixing hydrogens. + + Args: + pdb_path: Path of pdb file to preprocess. + rename_residues: Dictionary mapping of non-standard residue names (keys) to their standard names. + Defaults to: + { + "MSE": "MET", + "HIP": "HIS", + "HIE": "HIS", + "HID": "HIS", + "HSE": "HIS", + "HSD": "HIS", + } + """ + with Path(pdb_path).open("r") as f: + pdb_str = f.read() + + pdb_str = _run_pdb_tools(pdb_str, rename_residues) + pdb_str = _add_missing_heavy_atoms(pdb_str) + + return pdb_str # noqa: RET504 + + +def _run_pdb_tools( + pdb_str: str, + rename_residues: dict[str, str] | None = None, +) -> str: + """Preprocesses pdb files using pdb-tools (Bonvin lab). 
+ + Files undergo a number of pruning steps: + 1. Scrape non-atomic records. + 2. Scrape water molecules. + 3. Replace non-standard residue names with their standard counterparts. + A default library is used for this, which can be replaced using the `rename_residues` argument. + 4. Scrape lower occupancy atoms in case of alternate locations. + Note that in case of equal occupancy, the first record is always used. + 5. Delete insertion codes and shift the residue numbering of downstream residues. + 6. Sort records by chain and residues. + 7. Renumber residues on each chain from 1. + 8. Renumber atoms from 1. + 9. Tidy up to somewhat adhere to pdb format specifications. + + Raises: + ValueError: if an invalid amino acid (3-letter or 1-letter) code is given as a value to rename_residues. + + Returns: + str: Updated pdb + """ + if not rename_residues: + rename_residues = { + "MSE": "MET", + "HIP": "HIS", + "HIE": "HIS", + "HID": "HIS", + "HSE": "HIS", + "HSD": "HIS", + } + else: + for new_res in rename_residues.values(): + if new_res not in amino_acids_by_code and new_res not in amino_acids_by_letter: + msg = f"{new_res} is not a valid amino-acid code." + raise ValueError(msg) + + # sequentially run individual tools from pdb-tools + new_pdb = pdb_keepcoord.run(pdb_str) # Scrape non-atomic records + new_pdb = pdb_delresname.run(new_pdb, ("HOH",)) # Scrape water molecules + + for old, new in rename_residues.items(): + new_pdb = pdb_rplresname.run(new_pdb, old, new) # Replace non-standard residue names with their standard counterparts + + new_pdb = pdb_selaltloc.run(new_pdb) # Scrape lower occupancy atoms in case of alternate locations + new_pdb = pdb_fixinsert.run(new_pdb, []) # Delete insertion codes and shift the residue numbering of downstream residues. 
+ new_pdb = pdb_sort.run(new_pdb, "CR") # Sort records by chain and residues + new_pdb = pdb_reres.run(new_pdb, 1) # Renumber residues on each chain from 1 + new_pdb = pdb_reatom.run(new_pdb, 1) # Renumber atoms from 1 + new_pdb = pdb_tidy.run(new_pdb) # Tidy up to somewhat adhere to pdb format specifications + + return "".join(list(new_pdb)) + + +def _add_missing_heavy_atoms(pdb_str: str) -> str: + """Add missing heavy atoms (usually many) using PRAS. + + PRAS can only use files (no strings) as input and output, which is why this function is wrapped inside + TemporaryFile context managers. + + Returns: + str: Updated pdb + """ + with TemporaryFile(mode="w", suffix="pdb", encoding="utf-8") as input_pdb, TemporaryFile(mode="r", encoding="utf-8") as output_pdb: + input_pdb.write(pdb_str) + + fixing = PRAS(ofname=output_pdb) + fixing.fname = input_pdb + fixing.ProcessOther() # write to specified filename + + return output_pdb.read() diff --git a/env/deeprank2-docker.yml b/env/deeprank2-docker.yml index 440daf9c3..6f43c592e 100644 --- a/env/deeprank2-docker.yml +++ b/env/deeprank2-docker.yml @@ -41,5 +41,10 @@ dependencies: - ruff>=0.3.0 - dill>=0.3.8 - pyarrow>=15.0.0 + - openmm>=8.0.0 + - chardet>=4.0.0 - pip: + - pdb-tools==2.5.0 + - Pras-Server==1.2.1 + - pdb2pqr==3.6.2 - --requirement requirements-docker.txt diff --git a/env/deeprank2.yml b/env/deeprank2.yml index 6127fcb66..1c237710e 100644 --- a/env/deeprank2.yml +++ b/env/deeprank2.yml @@ -41,3 +41,9 @@ dependencies: - ruff>=0.3.0 - dill>=0.3.8 - pyarrow>=15.0.0 + - openmm>=8.0.0 + - chardet>=4.0.0 + - pip: + - pdb-tools==2.5.0 + - Pras-Server==1.2.1 + - pdb2pqr==3.6.2 diff --git a/pyproject.toml b/pyproject.toml index 7978fdbfa..ab22e8330 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,10 @@ ignore = [ "S311", # insecure random generators "PT011", # pytest-raises-too-broad "SIM108", # Use ternary operator + # TODO formatting + "TD002", # Missing TODO author + "TD003", # Missing TODO link + 
"FIX002", # Consider resolving the issue instead # Unwanted docstrings "D100", # Missing module docstring "D104", # Missing public package docstring diff --git a/tests/tools/_PRAS.ipynb b/tests/tools/_PRAS.ipynb new file mode 100644 index 000000000..59e44af2a --- /dev/null +++ b/tests/tools/_PRAS.ipynb @@ -0,0 +1,163 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Testing PRAS functionality\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ruff: noqa" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "fixed /home/dbodor/git/DeepRank/DeepRank2/tests/Untitled_3C8P_pdbprep_joao.pdb.pdb\n" + ] + } + ], + "source": [ + "from Pras_Server.RunType import InitRunType\n", + "\n", + "fixing = InitRunType(\n", + " rotamer=\"\",\n", + " mutation=\"\",\n", + " pdb_faspr=\"\",\n", + " keep_ligand=\"\",\n", + " chain_no=\"\",\n", + " addh=False,\n", + " ss=False,\n", + " raman=False,\n", + " ofname=False,\n", + " pdbid=False,\n", + " his_p=False,\n", + ")\n", + "\n", + "pdbs = [\"/home/dbodor/git/DeepRank/DeepRank2/tests/Untitled_3C8P_pdbprep_joao.pdb\"]\n", + "fixing.pdbid = pdbs\n", + "\n", + "fixing.ProcessOther()" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from pprint import pprint\n", + "\n", + "with Path(\"/home/dbodor/git/DeepRank/DeepRank2/tests/Untitled_3C8P_pdbprep_joao.pdb\").open(\"r\") as f:\n", + " orig = f.read().splitlines()\n", + "\n", + "with Path(\"/home/dbodor/git/DeepRank/DeepRank2/tests/Untitled_3C8P_pras_local.pdb\").open(\"r\") as f:\n", + " fixed = f.read().splitlines()[1:]\n", + "\n", + "with Path(\"/home/dbodor/git/DeepRank/DeepRank2/tests/Untitled_3C8P_pras_joao.pdb\").open(\"r\") as f:\n", + " joao = f.read().splitlines()[1:]\n", + "\n", + 
"\n", + "# fixed[0].strip() == orig[0].strip()\n", + "# orig[0].strip()\n", + "\n", + "for i, x in enumerate(fixed):\n", + " if x.strip() != joao[i].strip():\n", + " print(i)\n", + "\n", + "# fixed[0].strip() == joao[0].strip()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import time\n", + "\n", + "from Pras_Server.RunType import InitRunType\n", + "\n", + "startTime = time.time()\n", + "out = [\n", + " \"xxxx.pdb\",\n", + " \"zzzz.pdb\",\n", + " \"ssss.pdb\",\n", + "] # size of this list MUST be equal to the total number of PDB structures in your working directory\n", + "k = 0\n", + "for i in os.listdir(os.getcwd()):\n", + " if i.endswith(\".pdb\") or i.endswith(\".ent\"):\n", + " try:\n", + " out[k]\n", + " except:\n", + " print(\"Index error. Size of the name list is small\")\n", + " sys.exit()\n", + " fixing = InitRunType(\n", + " rotamer=\"\",\n", + " mutation=\"\",\n", + " pdb_faspr=\"\",\n", + " keep_ligand=\"\",\n", + " chain_no=\"\",\n", + " addh=False,\n", + " ss=False,\n", + " raman=False,\n", + " ofname=out[k],\n", + " pdbid=False,\n", + " his_p=False,\n", + " )\n", + " fixing.fname = i\n", + " fixing.ProcessOther()\n", + " k += 1\n", + "print(f\"The program took {time.time() - startTime} second !\")" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "string = \"fsvfdskl\"\n", + "\n", + "with open(\"Output.txt\", \"w\") as text_file:\n", + " print(string, file=text_file)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "DR2", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} 
diff --git a/tests/tools/_pdb_tools.ipynb b/tests/tools/_pdb_tools.ipynb new file mode 100644 index 000000000..0b9ddd335 --- /dev/null +++ b/tests/tools/_pdb_tools.ipynb @@ -0,0 +1,436 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Testing pdb-tools functionality\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# ruff: noqa" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['101M/101M.pdb',\n", + " '1A0Z/1A0Z.pdb',\n", + " '1A6B/1A6B.pdb',\n", + " '1ATN/1ATN_1w.pdb',\n", + " '1ATN/1ATN_2w.pdb',\n", + " '1ATN/1ATN_3w.pdb',\n", + " '1ATN/1ATN_4w.pdb',\n", + " '1CRN/1CRN.pdb',\n", + " '1ak4/1ak4.pdb',\n", + " '2g98/pdb2g98.pdb',\n", + " '3C8P/3C8P.pdb',\n", + " '3MRC/3MRC.pdb',\n", + " '9api/9api.pdb']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_pdb_files = \"\"\"\n", + "101M/101M.pdb\n", + "1A0Z/1A0Z.pdb\n", + "1A6B/1A6B.pdb\n", + "1ATN/1ATN_1w.pdb\n", + "1ATN/1ATN_2w.pdb\n", + "1ATN/1ATN_3w.pdb\n", + "1ATN/1ATN_4w.pdb\n", + "1CRN/1CRN.pdb\n", + "1ak4/1ak4.pdb\n", + "2g98/pdb2g98.pdb\n", + "3C8P/3C8P.pdb\n", + "3MRC/3MRC.pdb\n", + "9api/9api.pdb\n", + "\"\"\"\n", + "\n", + "all_pdb_files = all_pdb_files.splitlines()[1:]\n", + "base_folder = \"/home/dbodor/git/DeepRank/DeepRank2/tests/data/pdb//\"\n", + "\n", + "all_pdb_files" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('3C8P/3C8P.pdb', 1488, 'A'),\n", + " ('3C8P/3C8P.pdb', 1490, 'B'),\n", + " ('3C8P/3C8P.pdb', 1492, 'A'),\n", + " ('3C8P/3C8P.pdb', 1494, 'B'),\n", + " ('3MRC/3MRC.pdb', 566, 'A'),\n", + " ('3MRC/3MRC.pdb', 567, 'B'),\n", + " ('3MRC/3MRC.pdb', 570, 'A'),\n", + " ('3MRC/3MRC.pdb', 571, 'B'),\n", + " ('3MRC/3MRC.pdb', 572, 'A'),\n", + " ('3MRC/3MRC.pdb', 573,
'B'),\n", + " ('3MRC/3MRC.pdb', 691, 'A'),\n", + " ('3MRC/3MRC.pdb', 692, 'B'),\n", + " ('3MRC/3MRC.pdb', 695, 'A'),\n", + " ('3MRC/3MRC.pdb', 696, 'B'),\n", + " ('3MRC/3MRC.pdb', 697, 'A'),\n", + " ('3MRC/3MRC.pdb', 698, 'B'),\n", + " ('3MRC/3MRC.pdb', 699, 'A'),\n", + " ('3MRC/3MRC.pdb', 700, 'B'),\n", + " ('3MRC/3MRC.pdb', 701, 'A'),\n", + " ('3MRC/3MRC.pdb', 702, 'B'),\n", + " ('3MRC/3MRC.pdb', 703, 'A'),\n", + " ('3MRC/3MRC.pdb', 704, 'B'),\n", + " ('3MRC/3MRC.pdb', 748, 'A'),\n", + " ('3MRC/3MRC.pdb', 749, 'B'),\n", + " ('3MRC/3MRC.pdb', 752, 'A'),\n", + " ('3MRC/3MRC.pdb', 753, 'B'),\n", + " ('3MRC/3MRC.pdb', 754, 'A'),\n", + " ('3MRC/3MRC.pdb', 755, 'B'),\n", + " ('3MRC/3MRC.pdb', 756, 'A'),\n", + " ('3MRC/3MRC.pdb', 757, 'B'),\n", + " ('3MRC/3MRC.pdb', 805, 'A'),\n", + " ('3MRC/3MRC.pdb', 806, 'B'),\n", + " ('3MRC/3MRC.pdb', 809, 'A'),\n", + " ('3MRC/3MRC.pdb', 810, 'B'),\n", + " ('3MRC/3MRC.pdb', 811, 'A'),\n", + " ('3MRC/3MRC.pdb', 812, 'B'),\n", + " ('3MRC/3MRC.pdb', 813, 'A'),\n", + " ('3MRC/3MRC.pdb', 814, 'B'),\n", + " ('3MRC/3MRC.pdb', 815, 'A'),\n", + " ('3MRC/3MRC.pdb', 816, 'B'),\n", + " ('3MRC/3MRC.pdb', 817, 'A'),\n", + " ('3MRC/3MRC.pdb', 818, 'B'),\n", + " ('3MRC/3MRC.pdb', 1060, 'A'),\n", + " ('3MRC/3MRC.pdb', 1061, 'B'),\n", + " ('3MRC/3MRC.pdb', 1064, 'A'),\n", + " ('3MRC/3MRC.pdb', 1065, 'B'),\n", + " ('3MRC/3MRC.pdb', 1066, 'A'),\n", + " ('3MRC/3MRC.pdb', 1067, 'B'),\n", + " ('3MRC/3MRC.pdb', 1068, 'A'),\n", + " ('3MRC/3MRC.pdb', 1069, 'B'),\n", + " ('3MRC/3MRC.pdb', 1070, 'A'),\n", + " ('3MRC/3MRC.pdb', 1071, 'B'),\n", + " ('3MRC/3MRC.pdb', 1093, 'A'),\n", + " ('3MRC/3MRC.pdb', 1094, 'B'),\n", + " ('3MRC/3MRC.pdb', 1097, 'A'),\n", + " ('3MRC/3MRC.pdb', 1098, 'B'),\n", + " ('3MRC/3MRC.pdb', 1099, 'A'),\n", + " ('3MRC/3MRC.pdb', 1100, 'B'),\n", + " ('3MRC/3MRC.pdb', 1101, 'A'),\n", + " ('3MRC/3MRC.pdb', 1102, 'B'),\n", + " ('3MRC/3MRC.pdb', 1103, 'A'),\n", + " ('3MRC/3MRC.pdb', 1104, 'B'),\n", + " ('3MRC/3MRC.pdb', 1105, 'A'),\n", + " 
('3MRC/3MRC.pdb', 1106, 'B'),\n", + " ('3MRC/3MRC.pdb', 1107, 'A'),\n", + " ('3MRC/3MRC.pdb', 1108, 'B'),\n", + " ('3MRC/3MRC.pdb', 1109, 'A'),\n", + " ('3MRC/3MRC.pdb', 1110, 'B'),\n", + " ('3MRC/3MRC.pdb', 1340, 'A'),\n", + " ('3MRC/3MRC.pdb', 1341, 'B'),\n", + " ('3MRC/3MRC.pdb', 1344, 'A'),\n", + " ('3MRC/3MRC.pdb', 1345, 'B'),\n", + " ('3MRC/3MRC.pdb', 1346, 'A'),\n", + " ('3MRC/3MRC.pdb', 1347, 'B'),\n", + " ('3MRC/3MRC.pdb', 1348, 'A'),\n", + " ('3MRC/3MRC.pdb', 1349, 'B'),\n", + " ('3MRC/3MRC.pdb', 1515, 'A'),\n", + " ('3MRC/3MRC.pdb', 1516, 'B'),\n", + " ('3MRC/3MRC.pdb', 1519, 'A'),\n", + " ('3MRC/3MRC.pdb', 1520, 'B'),\n", + " ('3MRC/3MRC.pdb', 1521, 'A'),\n", + " ('3MRC/3MRC.pdb', 1522, 'B'),\n", + " ('3MRC/3MRC.pdb', 1523, 'A'),\n", + " ('3MRC/3MRC.pdb', 1524, 'B'),\n", + " ('3MRC/3MRC.pdb', 1525, 'A'),\n", + " ('3MRC/3MRC.pdb', 1526, 'B'),\n", + " ('3MRC/3MRC.pdb', 1527, 'A'),\n", + " ('3MRC/3MRC.pdb', 1528, 'B'),\n", + " ('3MRC/3MRC.pdb', 1530, 'A'),\n", + " ('3MRC/3MRC.pdb', 1531, 'B'),\n", + " ('3MRC/3MRC.pdb', 1534, 'A'),\n", + " ('3MRC/3MRC.pdb', 1535, 'B'),\n", + " ('3MRC/3MRC.pdb', 1536, 'A'),\n", + " ('3MRC/3MRC.pdb', 1537, 'B'),\n", + " ('3MRC/3MRC.pdb', 1538, 'A'),\n", + " ('3MRC/3MRC.pdb', 1539, 'B'),\n", + " ('3MRC/3MRC.pdb', 1540, 'A'),\n", + " ('3MRC/3MRC.pdb', 1541, 'B'),\n", + " ('3MRC/3MRC.pdb', 1542, 'A'),\n", + " ('3MRC/3MRC.pdb', 1543, 'B'),\n", + " ('3MRC/3MRC.pdb', 1544, 'A'),\n", + " ('3MRC/3MRC.pdb', 1545, 'B'),\n", + " ('3MRC/3MRC.pdb', 1546, 'A'),\n", + " ('3MRC/3MRC.pdb', 1547, 'B'),\n", + " ('3MRC/3MRC.pdb', 1548, 'A'),\n", + " ('3MRC/3MRC.pdb', 1549, 'B'),\n", + " ('3MRC/3MRC.pdb', 1630, 'A'),\n", + " ('3MRC/3MRC.pdb', 1631, 'B'),\n", + " ('3MRC/3MRC.pdb', 1634, 'A'),\n", + " ('3MRC/3MRC.pdb', 1635, 'B'),\n", + " ('3MRC/3MRC.pdb', 1636, 'A'),\n", + " ('3MRC/3MRC.pdb', 1637, 'B'),\n", + " ('3MRC/3MRC.pdb', 1638, 'A'),\n", + " ('3MRC/3MRC.pdb', 1639, 'B'),\n", + " ('3MRC/3MRC.pdb', 1640, 'A'),\n", + " ('3MRC/3MRC.pdb', 1641, 
'B'),\n", + " ('3MRC/3MRC.pdb', 1642, 'A'),\n", + " ('3MRC/3MRC.pdb', 1643, 'B'),\n", + " ('3MRC/3MRC.pdb', 1786, 'A'),\n", + " ('3MRC/3MRC.pdb', 1787, 'B'),\n", + " ('3MRC/3MRC.pdb', 1790, 'A'),\n", + " ('3MRC/3MRC.pdb', 1791, 'B'),\n", + " ('3MRC/3MRC.pdb', 1792, 'A'),\n", + " ('3MRC/3MRC.pdb', 1793, 'B'),\n", + " ('3MRC/3MRC.pdb', 1794, 'A'),\n", + " ('3MRC/3MRC.pdb', 1795, 'B'),\n", + " ('3MRC/3MRC.pdb', 1796, 'A'),\n", + " ('3MRC/3MRC.pdb', 1797, 'B'),\n", + " ('3MRC/3MRC.pdb', 1798, 'A'),\n", + " ('3MRC/3MRC.pdb', 1799, 'B'),\n", + " ('3MRC/3MRC.pdb', 1944, 'A'),\n", + " ('3MRC/3MRC.pdb', 1945, 'B'),\n", + " ('3MRC/3MRC.pdb', 1948, 'A'),\n", + " ('3MRC/3MRC.pdb', 1949, 'B'),\n", + " ('3MRC/3MRC.pdb', 1950, 'A'),\n", + " ('3MRC/3MRC.pdb', 1951, 'B'),\n", + " ('3MRC/3MRC.pdb', 1952, 'A'),\n", + " ('3MRC/3MRC.pdb', 1953, 'B'),\n", + " ('3MRC/3MRC.pdb', 2129, 'A'),\n", + " ('3MRC/3MRC.pdb', 2130, 'B'),\n", + " ('3MRC/3MRC.pdb', 2133, 'A'),\n", + " ('3MRC/3MRC.pdb', 2134, 'B'),\n", + " ('3MRC/3MRC.pdb', 2135, 'A'),\n", + " ('3MRC/3MRC.pdb', 2136, 'B'),\n", + " ('3MRC/3MRC.pdb', 2137, 'A'),\n", + " ('3MRC/3MRC.pdb', 2138, 'B'),\n", + " ('3MRC/3MRC.pdb', 2139, 'A'),\n", + " ('3MRC/3MRC.pdb', 2140, 'B'),\n", + " ('3MRC/3MRC.pdb', 2141, 'A'),\n", + " ('3MRC/3MRC.pdb', 2142, 'B'),\n", + " ('3MRC/3MRC.pdb', 2305, 'A'),\n", + " ('3MRC/3MRC.pdb', 2306, 'B'),\n", + " ('3MRC/3MRC.pdb', 2309, 'A'),\n", + " ('3MRC/3MRC.pdb', 2310, 'B'),\n", + " ('3MRC/3MRC.pdb', 2311, 'A'),\n", + " ('3MRC/3MRC.pdb', 2312, 'B'),\n", + " ('3MRC/3MRC.pdb', 2411, 'A'),\n", + " ('3MRC/3MRC.pdb', 2412, 'B'),\n", + " ('3MRC/3MRC.pdb', 2415, 'A'),\n", + " ('3MRC/3MRC.pdb', 2416, 'B'),\n", + " ('3MRC/3MRC.pdb', 2417, 'A'),\n", + " ('3MRC/3MRC.pdb', 2418, 'B'),\n", + " ('3MRC/3MRC.pdb', 2419, 'A'),\n", + " ('3MRC/3MRC.pdb', 2420, 'B'),\n", + " ('3MRC/3MRC.pdb', 2421, 'A'),\n", + " ('3MRC/3MRC.pdb', 2422, 'B'),\n", + " ('3MRC/3MRC.pdb', 2423, 'A'),\n", + " ('3MRC/3MRC.pdb', 2424, 'B'),\n", + " 
('3MRC/3MRC.pdb', 2425, 'A'),\n", + " ('3MRC/3MRC.pdb', 2426, 'B'),\n", + " ('3MRC/3MRC.pdb', 2427, 'A'),\n", + " ('3MRC/3MRC.pdb', 2428, 'B'),\n", + " ('3MRC/3MRC.pdb', 2523, 'A'),\n", + " ('3MRC/3MRC.pdb', 2524, 'B'),\n", + " ('3MRC/3MRC.pdb', 2527, 'A'),\n", + " ('3MRC/3MRC.pdb', 2528, 'B'),\n", + " ('3MRC/3MRC.pdb', 2529, 'A'),\n", + " ('3MRC/3MRC.pdb', 2530, 'B'),\n", + " ('3MRC/3MRC.pdb', 2531, 'A'),\n", + " ('3MRC/3MRC.pdb', 2532, 'B'),\n", + " ('3MRC/3MRC.pdb', 2533, 'A'),\n", + " ('3MRC/3MRC.pdb', 2534, 'B'),\n", + " ('3MRC/3MRC.pdb', 2535, 'A'),\n", + " ('3MRC/3MRC.pdb', 2536, 'B'),\n", + " ('3MRC/3MRC.pdb', 2747, 'A'),\n", + " ('3MRC/3MRC.pdb', 2748, 'B'),\n", + " ('3MRC/3MRC.pdb', 2751, 'A'),\n", + " ('3MRC/3MRC.pdb', 2752, 'B'),\n", + " ('3MRC/3MRC.pdb', 2753, 'A'),\n", + " ('3MRC/3MRC.pdb', 2754, 'B'),\n", + " ('3MRC/3MRC.pdb', 2755, 'A'),\n", + " ('3MRC/3MRC.pdb', 2756, 'B'),\n", + " ('3MRC/3MRC.pdb', 2757, 'A'),\n", + " ('3MRC/3MRC.pdb', 2758, 'B'),\n", + " ('3MRC/3MRC.pdb', 2759, 'A'),\n", + " ('3MRC/3MRC.pdb', 2760, 'B'),\n", + " ('3MRC/3MRC.pdb', 2761, 'A'),\n", + " ('3MRC/3MRC.pdb', 2762, 'B'),\n", + " ('3MRC/3MRC.pdb', 2921, 'A'),\n", + " ('3MRC/3MRC.pdb', 2922, 'B'),\n", + " ('3MRC/3MRC.pdb', 2925, 'A'),\n", + " ('3MRC/3MRC.pdb', 2926, 'B'),\n", + " ('3MRC/3MRC.pdb', 2927, 'A'),\n", + " ('3MRC/3MRC.pdb', 2928, 'B')]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tell_me = {}\n", + "\n", + "for file in all_pdb_files:\n", + " with open(base_folder + file, \"r\") as f:\n", + " records = f.read().splitlines()\n", + "\n", + " for i, line in enumerate(records):\n", + " if line.startswith((\"ATOM\", \"HETATOM\")):\n", + " ins_code = slice(26, 27)\n", + " if line[ins_code].strip():\n", + " inscode: list = tell_me.setdefault(\"inscode\", [])\n", + " inscode.append((file, i, line[ins_code]))\n", + "\n", + " alt_loc = slice(16, 17)\n", + " if line[alt_loc].strip():\n", + " altloc: list = 
tell_me.setdefault(\"altloc\", [])\n", + " altloc.append((file, i, line[alt_loc]))\n", + "\n", + " resname = slice(17, 20)\n", + " if line[resname] == \"HOH\":\n", + " water: list = tell_me.setdefault(\"water\", [])\n", + " water.append((file, i, line[resname]))\n", + "\n", + "tell_me[\"altloc\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from deeprank2.tools.pdbprep.preprocess import preprocess_pdbs\n", + "\n", + "fname = \"/home/dbodor/git/DeepRank/DeepRank2/tests/data/pdb/3C8P/3C8P.pdb\"\n", + "result = preprocess_pdbs(fname).splitlines()\n", + "\n", + "file_from_Joao = \"/home/dbodor/git/DeepRank/DeepRank2/tests/Untitled_3C8P_pdbprep.pdb\"\n", + "with open(file_from_Joao) as f:\n", + " pdb_joao = f.read().splitlines()\n", + "\n", + "for i, x in enumerate(pdb_joao):\n", + " if result[i] != x:\n", + " print(i)\n", + " print(result[i])\n", + " print(x)\n", + " break\n", + "\n", + "result == pdb_joao" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ATOM 1 N LYS A 1 30.878 18.880 20.164 1.00 23.18 N ',\n", + " 'ATOM 2 CA LYS A 1 30.166 17.611 20.202 1.00 19.48 C ',\n", + " 'ATOM 3 C LYS A 1 30.047 17.111 18.767 1.00 15.10 C ',\n", + " 'ATOM 4 O LYS A 1 29.712 17.858 17.855 1.00 17.10 O ',\n", + " 'ATOM 5 CB LYS A 1 28.769 17.677 20.811 1.00 22.38 C ',\n", + " 'ATOM 6 CG LYS A 1 28.057 16.336 20.971 1.00 25.07 C ',\n", + " 'ATOM 7 CD LYS A 1 26.866 16.306 21.952 1.00 26.33 C ',\n", + " 'ATOM 8 CE LYS A 1 26.623 14.928 22.552 1.00 32.03 C ',\n", + " 'ATOM 9 NZ LYS A 1 25.509 14.800 23.557 1.00 42.05 N ',\n", + " 'ATOM 10 N SER A 2 30.314 15.859 18.520 1.00 13.53 N ']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fname = 
\"/home/dbodor/git/DeepRank/DeepRank2/tests/data/pdb/3C8P/3C8P.pdb\"\n", + "result = preprocess_pdbs(fname).splitlines()\n", + "\n", + "result[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'101.338 38.470 -1.931'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = \"ATOM 1 N VAL A 1 101.338 38.470 -1.931 1.00 53.52 N \"\n", + "Y = \"01234567890123456789012345678901234567890123456789012345678901234567890123456789\"\n", + "Z = \"00000000001111111111222222222233333333334444444444555555555566666666667777777777\"\n", + "X[31:54]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "DR2", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/tools/test_pdbprep.py b/tests/tools/test_pdbprep.py new file mode 100644 index 000000000..f4f3dede5 --- /dev/null +++ b/tests/tools/test_pdbprep.py @@ -0,0 +1,55 @@ +from pathlib import Path + +import pytest + +from deeprank2.tools.pdbprep.preprocess import preprocess_pdbs + + +@pytest.fixture(scope="module") +def pdb_file() -> Path: + return Path("tests/data/pdb/3C8P/3C8P.pdb") + # with Path("tests/data/pdb/3C8P/3C8P.pdb").open('r') as f: + # records = + + +def test_pdbtools(pdb_file: Path) -> None: + processed = preprocess_pdbs(pdb_file).splitlines() + + with pdb_file.open("r") as pdb: + original = pdb.read().splitlines() + + resname_cols = slice(17, 20) + altloc_cols = slice(16, 17) # noqa: F841 + coordinate_cols = slice(31, 54) # noqa: F841 + + # 
check that only atomic records were preserved + original_openings = [r.split()[0] for r in original] + processed_openings = [r.split()[0] for r in processed] + + scraped_record_types = ("HEADER", "TITLE", "COMPND", "REMARK") + kept_record_types = ("ATOM",) + + for record in scraped_record_types: + assert record in original_openings + assert record not in processed_openings + + for record in kept_record_types: + assert record in original_openings + assert record in processed_openings + + # check that no water remains + original_resnames = [r[resname_cols] for r in original] + processed_resnames = [r[resname_cols] for r in processed] + assert "HOH" in original_resnames + assert "HOH" not in processed_resnames + + # untested (but confirmed in Jupyter notebook): + # - select altloc (this file) + # - residue renumbering (this file) + # - atom renumbering (file 1ak4) + # - replace residue names (with dummy names) + # + # untested and no good test data: + # - fix insertion codes + # - sort + # - tidy (not sure what it does)