DeepRank · DaniBodor · Mar 12, 2024 · Mar 12, 2024 · Mar 12, 2024 · Mar 12, 2024
diff --git a/deeprank2/tools/pdbprep/__init__.py b/deeprank2/tools/pdbprep/__init__.py
@@ -0,0 +1,124 @@
+# All code in this subpackage has been adapted from https://github.com/DeepRank/pdbprep,
+# which is published under an Apache 2.0 licence
+
+import sys
+from collections import defaultdict
+from collections.abc import Generator
+from typing import TextIO
+
+# define record columns for each datum
+_ATOMNAME_COLS = slice(12, 16)
+_RESNAME_COLS = slice(17, 20)
+_CHAIN_COLS = slice(21, 22)
+_RESNUM_COLS = slice(22, 27)  # this includes both the residue number and insertion code
+_OCCUPANCY_COLS = slice(54, 60)
+
+
+def write_pdb(new_pdb: list, pdbfh: TextIO) -> None:
+    """Writes new pdb files."""
+    try:
+        _buffer = []
+        _buffer_size = 5000  # write N lines at a time
+        for lineno, line in enumerate(new_pdb):
+            if not (lineno % _buffer_size):
+                sys.stdout.write("".join(_buffer))
+                _buffer = []
+            _buffer.append(line)
+
+        sys.stdout.write("".join(_buffer))
+        sys.stdout.flush()
+    except OSError:
+        # This is here to catch Broken Pipes
+        # for example to use 'head' or 'tail' without
+        # the error message showing up
+        pass
+
+    # last line of the script
+    # We can close it even if it is sys.stdin
+    pdbfh.close()
+    sys.exit(0)
+
+
+def _prune_records(fhandle: TextIO) -> Generator[str]:
+    """Prune records before processing.
+
+    Scraps non-atomic records and records from water molecule.
+    Replaces non-standard residue names by their standard counterparts.
+    """
+    atomic_record = ("ATOM", "HETATM")  # TODO: check if we need to keep ANISOU and TER records as well?
+    water = "HOH"
+    standard_resnames = {
+        "MSE": "MET",
+        "HIP": "HIS",
+        "HIE": "HIS",
+        "HID": "HIS",
+        "HSE": "HIS",
+        "HSD": "HIS",
+    }
+
+    for i, record in enumerate(fhandle):
+        resname = record[_RESNAME_COLS]
+        if record.startswith(atomic_record) and resname != water and i not in _find_low_occ_records(fhandle):
+            # TODO: if within a single file mixed residue nomenclature is used, it is not detected by _find_low_occ_records
+            # probably fix this by running these in separate functions rather than all at once.
+            standardized_resname = standard_resnames.get(resname, resname)
+            record = record[: _RESNAME_COLS.start] + standardized_resname + record[_RESNAME_COLS.stop :]  # noqa: PLW2901
+            yield record
+
+
+def _find_low_occ_records(pdb: list[str]) -> list[int]:
+    """Helper function to identify records with lowest occupancy alternate locations.
+
+    In case an atom is detected at more than one position (e.g. due to alternate conformations), the structure will
+    contain the same atom multiple times with separate "alternate location indicators" (col 17 of the pdb record).
+    Each location will have a certain occupancy, i.e. proportion of structures where this particular location is found
+    (and thus all occupancies for a given atom sum to 1).
+
+    This function first identifies atoms that are listed more than once in a pdb file, based on their chain identifier
+    (col 22), residue sequence number (col 23-26), and atom name (col 13-16). It then identifies the record with the
+    highest occupancy for each atom (in case of equal occupancy, the first entry is considered higher). From this, a
+    list of indices is returned representing the records that do not contain the highest occupancy for the atom in that
+    record.
+
+    Args:
+        pdb: list of records (lines) from a pdb file
+
+    Returns:
+        list of indices of records that do not contain the highest occupancy location
+    """
+    # define record columns for each datum
+
+    atom_indentiers = [record[_CHAIN_COLS] + record[_RESNUM_COLS] + record[_ATOMNAME_COLS] for record in pdb]
+
+    # create a dictionary containing only duplicated atom_indentiers (keys) and their indices in pdb (values)
+    # from: https://stackoverflow.com/a/11236042/5170442
+    duplicates = defaultdict(list)
+    for i, atom in enumerate(atom_indentiers):
+        duplicates[atom].append(i)
+    duplicates = {k: v for k, v in duplicates.items() if len(v) > 1}
+
+    highest_occupancies = {}
+    for atom, record_indices in duplicates.items():
+        highest_occ = 0
+        for i in record_indices:
+            occupancy = pdb[i][_OCCUPANCY_COLS]
+            if occupancy > highest_occ:
+                # only keep the record with the highest occupancy; in case of tie keep the first
+                highest_occ = occupancy
+                highest_occupancies[atom] = i
+    return [x for xs in duplicates.values() for x in xs if x not in highest_occupancies.values()]
+
+
+def pdb_prep(fhandle: TextIO) -> None:
+    """Run all steps from pdb prep repo."""
+    # step 1 - keep coordinates: removes non coordinate lines for simplicity
+    # step 2 - delresname: remove waters
+    # step 3 - rplresname: convert residue names to standard names, ex: MSE to MET
+    # step 4 - selaltloc: select most probable alternative location
+    _new_pdb = _prune_records(fhandle)
+
+    # step 5 - fixinsert: fix inserts
+    # step 6 - sort: sort chains and resides, necessary for OpenMM
+    # step 7 - reres: renumber residues from 1
+    # step 8 - reatom: renumber atoms from 1
+    # step 9 - tidy: tidy cleans the PDB, adds TER, etc.
diff --git a/pyproject.toml b/pyproject.toml
@@ -78,7 +78,7 @@ ignore = [
     "PLR0913", # Too many arguments in function definition
     "D102",    # Missing docstring in public method
     # Unwanted
-    "FBT",    # Using boolean arguments
+    "FBT",    # Disallow using booleans as function arguments
     "ANN101", # Missing type annotation for `self` in method
     "ANN102", # Missing type annotation for `cls` in classmethod
     "ANN204", # Missing return type annotation for special (dunder) method
@@ -87,6 +87,10 @@ ignore = [
     "S311",   # insecure random generators
     "PT011",  # pytest-raises-too-broad
     "SIM108", # Use ternary operator
+    # TODO formatting
+    "TD002",  # Missing TODO author
+    "TD003",  # Missing TODO link
+    "FIX002", # Consider resolving the issue instead 
     # Unwanted docstrings
     "D100", # Missing module docstring
     "D104", # Missing public package docstring