Skip to content

Commit

Permalink
fix: structural info check for DavisKibaDataset
Browse files Browse the repository at this point in the history
  • Loading branch information
jyaacoub committed Nov 13, 2023
1 parent bfabacf commit 29d7586
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 11 deletions.
25 changes: 15 additions & 10 deletions src/data_processing/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ def process(self):
pro_feat=self.feature_opt,
aln_file=self.aln_p(code),
# for foldseek feats
pdb_fp=self.pdb_p(code),
pdb_fp=self.pdb_p(code),
pddlt_fp=self.pddlt_p(code))
except Exception as e:
raise Exception(f"error on protein graph creation for code {code}") from e
Expand Down Expand Up @@ -595,7 +595,8 @@ def pdb_p(self, code, safe=True):
code = re.sub(r'[()]', '_', code)
# davis and kiba don't have their own structures, so these must be generated
# beforehand using af or some other method.
if self.edge_opt not in cfg.STRUCT_EDGE_OPT: return None
if (self.edge_opt not in cfg.STRUCT_EDGE_OPT) and \
(self.feature_opt not in cfg.STRUCT_PRO_FEAT_OPT): return None

file = glob(os.path.join(self.af_conf_dir, f'highQ/{code}_unrelaxed_rank_001*.pdb'))
# should only be one file
Expand Down Expand Up @@ -730,17 +731,21 @@ def pre_process(self):
no_cmap = [c for c in codes if not os.path.isfile(self.cmap_p(c))]
print(f'Number of codes without cmap files: {len(no_cmap)} out of {len(codes)}')

# Checking that structure and af_confs files are present if edgeW is anm or af2
# Checking that structure and af_confs files are present if required:
no_confs = []
if self.edge_opt in cfg.STRUCT_EDGE_OPT:
no_confs = [c for c in codes if (
(self.pdb_p(c, safe=False) is None) or # no highQ structure
(len(self.af_conf_files(c)) < 2))] # not enough af confirmations.
if self.edge_opt in cfg.STRUCT_EDGE_OPT or self.feature_opt in cfg.STRUCT_PRO_FEAT_OPT:
if self.feature_opt == 'foldseek':
# we only need HighQ structures for foldseek
no_confs = [c for c in codes if (self.pdb_p(c, safe=False) is None)]
else:
no_confs = [c for c in codes if (
(self.pdb_p(c, safe=False) is None) or # no highQ structure
(len(self.af_conf_files(c)) < 2))] # only if not for foldseek

# WARNING: TEMPORARY FIX FOR DAVIS (TESK1 highQ structure is mismatched...)
no_confs.append('TESK1')
# WARNING: TEMPORARY FIX FOR DAVIS (TESK1 highQ structure is mismatched...)
no_confs.append('TESK1')

print(f'Number of codes missing af2 configurations: {len(no_confs)} / {len(codes)}')
print(f'Number of codes missing af2 configurations: {len(no_confs)} / {len(codes)}')

invalid_codes = set(no_aln + no_cmap + no_confs)
# filtering out invalid codes:
Expand Down
4 changes: 3 additions & 1 deletion src/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@

STRUCT_EDGE_OPT = ['anm', 'af2', 'af2-anm'] # edge options that require structural info (pdbs)
EDGE_OPT = ['simple', 'binary'] + STRUCT_EDGE_OPT
PRO_FEAT_OPT = ['nomsa', 'msa', 'shannon', 'foldseek']

STRUCT_PRO_FEAT_OPT = ['foldseek'] # requires structural info (pdbs)
PRO_FEAT_OPT = ['nomsa', 'msa', 'shannon'] + STRUCT_PRO_FEAT_OPT

LIG_FEAT_OPT = [None, 'original']
LIG_EDGE_OPT = [None, 'binary']
Expand Down

0 comments on commit 29d7586

Please sign in to comment.