Merge pull request #5 from BioinfoMachineLearning/develop

amorehead · web-flow · commit 48f396333e6e · 2021-10-28T23:46:42.000-05:00
Allow input PDBs with custom filenames and make feature imputation function name unique
diff --git a/project/datasets/builder/impute_missing_feature_values.py b/project/datasets/builder/impute_missing_feature_values.py
@@ -4,7 +4,7 @@
 
 import click
 from parallel import submit_jobs
-from project.utils.dips_plus_utils import impute_missing_feature_values
+from project.utils.dips_plus_utils import impute_postprocessed_missing_feature_values
 
 
 # -------------------------------------------------------------------------------------------------------------------------------------
@@ -29,7 +29,7 @@ def main(output_dir: str, impute_atom_features: bool, advanced_logging: bool, nu
     inputs = [(pair_filename.as_posix(), pair_filename.as_posix(), impute_atom_features, advanced_logging)
               for pair_filename in Path(output_dir).rglob('*.dill')]
     # Without impute_atom_features set to True, non-CA atoms will be filtered out after writing updated pairs
-    submit_jobs(impute_missing_feature_values, inputs, num_cpus)
+    submit_jobs(impute_postprocessed_missing_feature_values, inputs, num_cpus)
 
 
 if __name__ == '__main__':
diff --git a/project/utils/deepinteract_utils.py b/project/utils/deepinteract_utils.py
@@ -30,7 +30,7 @@
 from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
 
 from project.utils.deepinteract_constants import FEAT_COLS, ALLOWABLE_FEATS, D3TO1
-from project.utils.dips_plus_utils import postprocess_pruned_pairs
+from project.utils.dips_plus_utils import postprocess_pruned_pairs, impute_postprocessed_missing_feature_values
 from project.utils.graph_utils import prot_df_to_dgl_graph_feats
 from project.utils.protein_feature_utils import GeometricProteinFeatures
 
@@ -573,9 +573,11 @@ def create_input_dir_struct(input_dataset_dir: str, pdb_code: str):
     _, _ = dir_struct_create_proc.communicate()  # Wait until the directory structure creation cmd is finished
 
 
-def copy_input_to_raw_dir(input_dataset_dir: str, pdb_filepath: str, pdb_code: str):
+def copy_input_to_raw_dir(input_dataset_dir: str, pdb_filepath: str, pdb_code: str, chain_indic: str):
     """Make a copy of the input PDB file in the newly-created raw directory."""
-    input_copy_cmd = f'cp {pdb_filepath} {os.path.join(input_dataset_dir, "raw", pdb_code)}'
+    filename = db.get_pdb_code(pdb_filepath) + f'_{chain_indic}.pdb' \
+        if chain_indic not in pdb_filepath else db.get_pdb_name(pdb_filepath)
+    input_copy_cmd = f'cp {pdb_filepath} {os.path.join(input_dataset_dir, "raw", pdb_code, filename)}'
     input_copy_proc = subprocess.Popen(input_copy_cmd.split(), stdout=subprocess.PIPE, cwd=os.getcwd())
     _, _ = input_copy_proc.communicate()  # Wait until the input copy cmd is finished
 
@@ -590,6 +592,7 @@ def make_dataset(input_dataset_dir='datasets/Input/raw', output_dir='datasets/In
     pa.parse_all(input_dataset_dir, parsed_dir, num_cpus)
 
     complexes_dill = os.path.join(output_dir, 'complexes/complexes.dill')
+    os.remove(complexes_dill)  # Ensure that pairs are made every time this function is called
     comp.complexes(parsed_dir, complexes_dill, source_type)
     complexes = comp.read_complexes(complexes_dill)
     pairs_dir = os.path.join(output_dir, 'pairs')
@@ -697,7 +700,7 @@ def impute_missing_feature_values(output_dir='datasets/Input/final/raw',
     inputs = [(pair_filename.as_posix(), pair_filename.as_posix(), impute_atom_features, advanced_logging)
               for pair_filename in Path(output_dir).rglob('*.dill')]
     # Without impute_atom_features set to True, non-CA atoms will be filtered out after writing updated pairs
-    par.submit_jobs(impute_missing_feature_values, inputs, num_cpus)
+    par.submit_jobs(impute_postprocessed_missing_feature_values, inputs, num_cpus)
 
 
 def convert_input_pdb_files_to_pair(left_pdb_filepath: str, right_pdb_filepath: str, input_dataset_dir: str,
@@ -707,8 +710,8 @@ def convert_input_pdb_files_to_pair(left_pdb_filepath: str, right_pdb_filepath:
     pdb_code = db.get_pdb_group(list(ca.get_complex_pdb_codes([left_pdb_filepath, right_pdb_filepath]))[0])
     # Iteratively execute the PDB file feature generation process
     create_input_dir_struct(input_dataset_dir, pdb_code)
-    copy_input_to_raw_dir(input_dataset_dir, left_pdb_filepath, pdb_code)
-    copy_input_to_raw_dir(input_dataset_dir, right_pdb_filepath, pdb_code)
+    copy_input_to_raw_dir(input_dataset_dir, left_pdb_filepath, pdb_code, 'l_u')
+    copy_input_to_raw_dir(input_dataset_dir, right_pdb_filepath, pdb_code, 'r_u')
     make_dataset(os.path.join(input_dataset_dir, 'raw'), os.path.join(input_dataset_dir, 'interim'))
     generate_psaia_features(psaia_dir=psaia_dir,
                             psaia_config=psaia_config,
diff --git a/project/utils/dips_plus_utils.py b/project/utils/dips_plus_utils.py
@@ -394,9 +394,10 @@ def __should_keep_postprocessed(raw_pdb_dir: str, pair_filename: str, source_typ
         # Identify if a given complex contains DSSP-derivable secondary structure features
         raw_pdb_filenames.append(get_raw_pdb_filename_from_interim_filename(interim_filename, raw_pdb_dir, source_type))
         pair_dssp_dict = get_dssp_dict_for_pdb_file(raw_pdb_filenames[i])
-        if not pair_dssp_dict and source_type not in ['input']:
+        if source_type.lower() not in ['input'] and not pair_dssp_dict:
             return pair, raw_pdb_filenames[i], False  # Discard pair missing DSSP-derivable secondary structure features
-        if pair.df0.shape[0] > ATOM_COUNT_LIMIT or pair.df1.shape[0] > ATOM_COUNT_LIMIT:
+        if source_type.lower() not in ['input'] \
+                and (pair.df0.shape[0] > ATOM_COUNT_LIMIT or pair.df1.shape[0] > ATOM_COUNT_LIMIT):
             return pair, raw_pdb_filenames[i], False  # Discard pair exceeding atom count limit to reduce comp. complex.
     return pair, raw_pdb_filenames, True
 
@@ -458,8 +459,8 @@ def postprocess_pruned_pair(raw_pdb_filenames: List[str], external_feats_dir: st
             rd_dict = get_msms_rd_dict_for_pdb_model(structure[0])  # RD only retrieved for first model
 
             # Get protrusion indices using PSAIA
-            psaia_filepath = os.path.relpath(os.path.splitext(os.path.split(raw_pdb_filename)[-1])[0])
-            psaia_filename = [path for path in Path(external_feats_dir).rglob(f'{psaia_filepath}*.tbl')][0]  # 1st path
+            pdb_code = db.get_pdb_code(raw_pdb_filename)
+            psaia_filename = [path for path in Path(external_feats_dir).rglob(f'{pdb_code}*.tbl')][0]  # 1st path
             psaia_df = get_df_from_psaia_tbl_file(psaia_filename)
 
             # Extract half-sphere exposure (HSE) statistics for each PDB model (including HSAAC and CN values)
@@ -836,8 +837,8 @@ def determine_nan_fill_value(column: pd.Series, imputation_method='median'):
     return imputation_value if column.isna().sum().sum() <= NUM_ALLOWABLE_NANS else 0
 
 
-def impute_missing_feature_values(input_pair_filename: str, output_pair_filename: str,
-                                  impute_atom_features: bool, advanced_logging: bool):
+def impute_postprocessed_missing_feature_values(input_pair_filename: str, output_pair_filename: str,
+                                                impute_atom_features: bool, advanced_logging: bool):
     """Impute missing feature values in a postprocessed dataset."""
     # Look at a .dill file in the given output directory
     postprocessed_pair: pa.Pair = pd.read_pickle(input_pair_filename)