From a92c68bea271749368fb805a4a38276ec8b12e1c Mon Sep 17 00:00:00 2001 From: Ben Webb Date: Fri, 23 Aug 2024 10:54:00 -0700 Subject: [PATCH] Set sequence offset on mmCIF file read When reading an mmCIF file, set each chain's sequence offset using the internal and author-provided numbering of its first residue (rather than leaving it as zero). This gives us a better chance of having the correct seq_id in any future output mmCIF file. --- modules/atom/src/mmcif.cpp | 16 +++++++++++++--- modules/atom/test/test_mmcif.py | 10 ++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/modules/atom/src/mmcif.cpp b/modules/atom/src/mmcif.cpp index 06a091576a..bc211fd9ef 100644 --- a/modules/atom/src/mmcif.cpp +++ b/modules/atom/src/mmcif.cpp @@ -133,8 +133,9 @@ class AtomSiteCategory : public Category { return true; } - void get_chain_particle(const std::string &chain, + bool get_chain_particle(const std::string &chain, const std::string &label_asym_id) { + bool new_chain = false; if (cp_ == nullptr || chain != curr_chain_) { curr_chain_ = chain; std::pair root_chain(root_p_, chain); @@ -144,11 +145,13 @@ class AtomSiteCategory : public Category { Chain(cp_).set_label_asym_id(label_asym_id); Hierarchy(root_p_).add_child(Chain(cp_)); chain_map_[root_chain] = cp_; + new_chain = true; } else { cp_ = chain_map_[root_chain]; } rp_ = nullptr; // make sure we get a new residue } + return new_chain; } // Replace at most maxlen chars in dest, starting at pos, with repl @@ -178,10 +181,11 @@ class AtomSiteCategory : public Category { // Use author-provided chain ID if available std::string label_asym_id = chain_.as_str(); + bool new_chain; if (strlen(auth_chain_.as_str()) > 0) { - get_chain_particle(auth_chain_.as_str(), label_asym_id); + new_chain = get_chain_particle(auth_chain_.as_str(), label_asym_id); } else { - get_chain_particle(label_asym_id, label_asym_id); + new_chain = get_chain_particle(label_asym_id, label_asym_id); } std::string auth_seq_id_str = 
auth_seq_id_.as_str(); // Check if new residue @@ -200,6 +204,12 @@ class AtomSiteCategory : public Category { // if auth_seq_id is blank, use seq_id instead if (endptr == start) auth_seq_id = seq_id; char one_icode = 32; // default insertion code (space) + + // Set the chain's sequence offset based on the first residue numbering + if (new_chain) { + Chain(cp_).set_sequence_offset(auth_seq_id - seq_id); + } + // if auth_seq_id is not blank and contains something after the number, // use the first character of that as the insertion code if (endptr != start && *endptr) { diff --git a/modules/atom/test/test_mmcif.py b/modules/atom/test/test_mmcif.py index 8d0c958c67..c3cb63200c 100644 --- a/modules/atom/test/test_mmcif.py +++ b/modules/atom/test/test_mmcif.py @@ -415,6 +415,16 @@ def test_chain_selector_multi_char(self): for x in IMP.atom.get_by_type(mp, IMP.atom.CHAIN_TYPE)] self.assertEqual([c.get_id() for c in chains], ['ZB']) + def test_chain_read_offset(self): + """Check reading of chain sequence offset from an mmCIF file""" + m = IMP.Model() + + mp = IMP.atom.read_mmcif(self.get_input_file_name('chaintest.cif'), m, + IMP.atom.ChainPDBSelector(["ZK"])) + chains = [IMP.atom.Chain(x) + for x in IMP.atom.get_by_type(mp, IMP.atom.CHAIN_TYPE)] + self.assertEqual(chains[0].get_sequence_offset(), 286) + if __name__ == '__main__': IMP.test.main()