Merge pull request #492 from DeepRank/480_new

refactor: `Query` classes and related code
DeepRank · Nov 7, 2023 · 97db708 · 97db708
2 parents fe86f62 + c2151e7
commit 97db708
Show file tree

Hide file tree

Showing 20 changed files with 796 additions and 1,186 deletions.
diff --git a/README.md b/README.md
@@ -51,7 +51,7 @@ DeepRank2 extensive documentation can be found [here](https://deeprank2.rtfd.io/
 
 ## Installation
 
-The package officially supports ubuntu-latest OS only, whose functioning is widely tested through the continuous integration workflows. 
+The package officially supports ubuntu-latest OS only, whose functioning is widely tested through the continuous integration workflows.
 
 ### Dependencies
 
@@ -65,9 +65,9 @@ Before installing deeprank2 you need to install some dependencies. We advise to
 *  [DSSP 4](https://swift.cmbi.umcn.nl/gv/dssp/)
     * Check if `dssp` is installed: `dssp --version`. If this gives an error or shows a version lower than 4:
       * on ubuntu 22.04 or newer: `sudo apt-get install dssp`. If the package cannot be located, first run `sudo apt-get update`.
-      * on older versions of ubuntu or on mac or lacking sudo priviliges: install from [here](https://github.com/pdb-redo/dssp), following the instructions listed. Alternatively, follow [this](https://github.com/PDB-REDO/libcifpp/issues/49) thread. 
+      * on older versions of ubuntu or on mac or lacking sudo privileges: install from [here](https://github.com/pdb-redo/dssp), following the instructions listed. Alternatively, follow [this](https://github.com/PDB-REDO/libcifpp/issues/49) thread.
 *  [GCC](https://gcc.gnu.org/install/)
-    * Check if gcc is installed: `gcc --version`. If this gives an error, run `sudo apt-get install gcc`.  
+    * Check if gcc is installed: `gcc --version`. If this gives an error, run `sudo apt-get install gcc`.
 *  For MacOS with M1 chip users only install [the conda version of PyTables](https://www.pytables.org/usersguide/installation.html).
 
 ### Deeprank2 Package
@@ -105,25 +105,24 @@ For more details, see the [extended documentation](https://deeprank2.rtfd.io/).
 
 ### Data generation
 
-For each protein-protein complex (or protein structure containing a SRV), a query can be created and added to the `QueryCollection` object, to be processed later on. Different types of queries exist:
-- In a `ProteinProteinInterfaceResidueQuery` and `SingleResidueVariantResidueQuery`, each node represents one amino acid residue.
-- In a `ProteinProteinInterfaceAtomicQuery` and `SingleResidueVariantAtomicQuery`, each node represents one atom within the amino acid residues.
+For each protein-protein complex (or protein structure containing a missense variant), a `Query` can be created and added to the `QueryCollection` object, to be processed later on. Two subtypes of `Query` exist: `ProteinProteinInterfaceQuery` and `SingleResidueVariantQuery`.
 
-A query takes as inputs:
-- a `.pdb` file, representing the protein-protein structure
+A `Query` takes as inputs:
+- a `.pdb` file, representing the protein-protein structure,
+- the resolution (`"residue"` or `"atom"`), i.e. whether each node should represent an amino acid residue or an atom,
 - the ids of the chains composing the structure, and
 - optionally, the correspondent position-specific scoring matrices (PSSMs), in the form of `.pssm` files.
 
 ```python
-from deeprank2.query import QueryCollection, ProteinProteinInterfaceResidueQuery
+from deeprank2.query import QueryCollection, ProteinProteinInterfaceQuery
 
 queries = QueryCollection()
 
 # Append data points
-queries.add(ProteinProteinInterfaceResidueQuery(
+queries.add(ProteinProteinInterfaceQuery(
     pdb_path = "tests/data/pdb/1ATN/1ATN_1w.pdb",
-    chain_id1 = "A",
-    chain_id2 = "B",
+    resolution = "residue",
+    chain_ids = ["A", "B"],
     targets = {
         "binary": 0
     },
@@ -132,10 +131,10 @@ queries.add(ProteinProteinInterfaceResidueQuery(
         "B": "tests/data/pssm/1ATN/1ATN.B.pdb.pssm"
     }
 ))
-queries.add(ProteinProteinInterfaceResidueQuery(
+queries.add(ProteinProteinInterfaceQuery(
     pdb_path = "tests/data/pdb/1ATN/1ATN_2w.pdb",
-    chain_id1 = "A",
-    chain_id2 = "B",
+    resolution = "residue",
+    chain_ids = ["A", "B"],
     targets = {
         "binary": 1
     },
@@ -144,10 +143,10 @@ queries.add(ProteinProteinInterfaceResidueQuery(
         "B": "tests/data/pssm/1ATN/1ATN.B.pdb.pssm"
     }
 ))
-queries.add(ProteinProteinInterfaceResidueQuery(
+queries.add(ProteinProteinInterfaceQuery(
     pdb_path = "tests/data/pdb/1ATN/1ATN_3w.pdb",
-    chain_id1 = "A",
-    chain_id2 = "B",
+    resolution = "residue",
+    chain_ids = ["A", "B"],
     targets = {
         "binary": 0
     },

diff --git a/deeprank2/dataset.py b/deeprank2/dataset.py
@@ -346,14 +346,14 @@ def save_hist( # pylint: disable=too-many-arguments, too-many-branches, useless-
 
             for row, feat in enumerate(features_df):
                 if isinstance(self.df[feat].values[0], np.ndarray):
-                    if(log):
+                    if log:
                         log_data = np.log(np.concatenate(self.df[feat].values))
                         log_data[log_data == -np.inf] = 0
                         axs[row].hist(log_data, bins=bins)
                     else:
                         axs[row].hist(np.concatenate(self.df[feat].values), bins=bins)
                 else:
-                    if(log):
+                    if log:
                         log_data = np.log(self.df[feat].values)
                         log_data[log_data == -np.inf] = 0
                         axs[row].hist(log_data, bins=bins)
@@ -366,14 +366,14 @@ def save_hist( # pylint: disable=too-many-arguments, too-many-branches, useless-
             fig = plt.figure(figsize=figsize)
             ax = fig.add_subplot(111)
             if isinstance(self.df[features_df[0]].values[0], np.ndarray):
-                if(log):
+                if log:
                     log_data = np.log(np.concatenate(self.df[features_df[0]].values))
                     log_data[log_data == -np.inf] = 0
                     ax.hist(log_data, bins=bins)
                 else:
                     ax.hist(np.concatenate(self.df[features_df[0]].values), bins=bins)
             else:
-                if(log):
+                if log:
                     log_data = np.log(self.df[features_df[0]].values)
                     log_data[log_data == -np.inf] = 0
                     ax.hist(log_data, bins=bins)

diff --git a/deeprank2/domain/aminoacidlist.py b/deeprank2/domain/aminoacidlist.py
@@ -353,17 +353,15 @@
     ]
 
 def convert_aa_nomenclature(aa: str, output_type: Optional[int] = None):
-
-    # pylint: disable = raise-missing-from
     try:
         if len(aa) == 1:
             aa: AminoAcid = [entry for entry in amino_acids if entry.one_letter_code.lower() == aa.lower()][0]
         elif len(aa) == 3:
             aa: AminoAcid = [entry for entry in amino_acids if entry.three_letter_code.lower() == aa.lower()][0]
         else:
             aa: AminoAcid = [entry for entry in amino_acids if entry.name.lower() == aa.lower()][0]
-    except IndexError:
-        raise ValueError(f'{aa} is not a valid amino acid.')
+    except IndexError as e:
+        raise ValueError(f'{aa} is not a valid amino acid.') from e
 
     if not output_type:
         return aa.name

diff --git a/deeprank2/features/exposure.py b/deeprank2/features/exposure.py
@@ -52,25 +52,24 @@ def add_features( # pylint: disable=unused-argument
         signal.alarm(0)
     except TimeoutError as e:
         raise TimeoutError('Bio.PDB.ResidueDepth.get_surface timed out.') from e
-    else:
-        hse = HSExposureCA(bio_model)
-
-        # These can only be calculated per residue, not per atom.
-        # So for atomic graphs, every atom gets its residue's value.
-        for node in graph.nodes:
-            if isinstance(node.id, Residue):
-                residue = node.id
-            elif isinstance(node.id, Atom):
-                atom = node.id
-                residue = atom.residue
-            else:
-                raise TypeError(f"Unexpected node type: {type(node.id)}")
-
-            bio_residue = bio_model[residue.chain.id][residue.number]
-            node.features[Nfeat.RESDEPTH] = residue_depth(bio_residue, surface)
-            hse_key = (residue.chain.id, (" ", residue.number, space_if_none(residue.insertion_code)))
-
-            if hse_key in hse:
-                node.features[Nfeat.HSE] = np.array(hse[hse_key], dtype=np.float64)
-            else:
-                node.features[Nfeat.HSE] = np.array((0, 0, 0), dtype=np.float64)
+
+    # These can only be calculated per residue, not per atom.
+    # So for atomic graphs, every atom gets its residue's value.
+    hse = HSExposureCA(bio_model)
+    for node in graph.nodes:
+        if isinstance(node.id, Residue):
+            residue = node.id
+        elif isinstance(node.id, Atom):
+            atom = node.id
+            residue = atom.residue
+        else:
+            raise TypeError(f"Unexpected node type: {type(node.id)}")
+
+        bio_residue = bio_model[residue.chain.id][residue.number]
+        node.features[Nfeat.RESDEPTH] = residue_depth(bio_residue, surface)
+        hse_key = (residue.chain.id, (" ", residue.number, space_if_none(residue.insertion_code)))
+
+        if hse_key in hse:
+            node.features[Nfeat.HSE] = np.array(hse[hse_key], dtype=np.float64)
+        else:
+            node.features[Nfeat.HSE] = np.array((0, 0, 0), dtype=np.float64)