Merge pull request #313 from marrink-lab/DNA-exten/itp

[1] Dna exten/itp
marrink-lab · Sep 26, 2023 · 84d0c25 · 84d0c25
2 parents b8fb8b8 + f9117f5
commit 84d0c25
Show file tree

Hide file tree

Showing 15 changed files with 6,034 additions and 19 deletions.
diff --git a/bin/polyply b/bin/polyply
@@ -74,6 +74,9 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
                            help='A linear sequence of residue names.')
     seq_group.add_argument('-seqf', dest='seq_file', type=Path,
                            help='A graph input file (JSON|TXT|FASTA|IG)')
+    dna_group = parser_gen_itp.add_argument_group('DNA specifc options')
+    dna_group.add_argument('-dsdna', dest='dsdna', action='store_true',
+                           help='complement single sequence to dsDNA sequence')
 
     parser_gen_itp.set_defaults(func=gen_itp)
 

diff --git a/polyply/data/martini2/DNA_M2.ff b/polyply/data/martini2/DNA_M2.ff
@@ -566,3 +566,28 @@ BB3 +BB2
 BB2 BB3 +BB1 +BB2     1  180.00000     2     3 {"group": "link"}
 BB3 +BB1 +BB2 +BB3    9   85.00000     2     2 {"group": "link", "version": "1"}
 BB3 +BB1 +BB2 +BB3    9  160.00000     2     3 {"group": "link", "version": "2"}
+
+;
+; CIRCULAR MARTINI DNA
+;
+
+[ link ]
+resname "DT|DG|DA|DC"
+[ atoms ]
+BB3 { }
+>BB1 { }
+[ bonds ]
+BB3  >BB1 1  0.35300 10000 {"group": "link-circle", "edge": false}
+[ angles ]
+BB2 BB3  >BB1  2  102.00000   150 {"group": "backbone-circle", "edge": false}
+BB3 >BB1 >BB2  2  106.00000    75 {"group": "backbone-circle", "edge": false}
+[ exclusions ]
+#meta {"edge": false}
+BB2 >BB1
+BB3 >BB2
+[ dihedrals ]
+BB2 BB3 >BB1 >BB2     1  180.00000     2     3 {"group": "link-circle", "edge": false}
+BB3 >BB1 >BB2 >BB3    9   85.00000     2     2 {"group": "link-circle", "version": "1", "edge": false}
+BB3 >BB1 >BB2 >BB3    9  160.00000     2     3 {"group": "link-circle", "version": "2", "edge": false}
+[ edges ]
+BB3 >BB1 {"linktype": "circle"}
diff --git a/polyply/src/apply_links.py b/polyply/src/apply_links.py
@@ -489,7 +489,12 @@ def run_molecule(self, meta_molecule):
 
         # take care to remove nodes if there are any scheduled for removal
         # we do this here becuase that's more efficent
-        molecule.remove_nodes_from(self.nodes_to_remove)
+        if self.nodes_to_remove:
+            molecule.remove_nodes_from(self.nodes_to_remove)
+            # make sure the residue graph is updated; this takes care that
+            # nodes are also removed from the fragment graphs in the
+            # meta_molecule.nodes['graph'] attribute
+            meta_molecule.relabel_and_redo_res_graph(mapping={})
         # now we add all interactions but not the ones that contain the removed
         # nodes
         for inter_type in self.applied_links:

diff --git a/polyply/src/gen_dna.py b/polyply/src/gen_dna.py
@@ -0,0 +1,106 @@
+# Copyright 2022 University of Groningen
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import networkx as nx
+from tqdm import tqdm
+
+BASE_LIBRARY = {"DA": "DT", "DT": "DA", "DG": "DC", "DC": "DG",
+                "DA5": "DT3", "DT5": "DA3", "DG5": "DC3", "DC5": "DG3",
+                "DA3": "DT5", "DT3": "DA5", "DG3": "DC5", "DC3": "DG5"
+                }
+
+def _dna_edge_iterator(meta_molecule, source):
+    """
+    Traversal method for DNA specifc meta-molecule.
+    Should become obsolete once vermouth commits to
+    directed graphs.
+    """
+    first_node = source
+    count = 0
+    while True:
+        neighbors = meta_molecule.neighbors(source)
+        src_resid = meta_molecule.nodes[source]["resid"]
+        for next_node in neighbors:
+            next_resid = meta_molecule.nodes[next_node]["resid"]
+            diff = src_resid - next_resid
+            if diff == 1 :
+                yield (source, next_node)
+                source = next_node
+                break
+
+            if next_resid > src_resid and next_node == first_node:
+                yield (source, next_node)
+                return
+        else:
+            return
+
+def complement_dsDNA(meta_molecule):
+    """
+    Given a meta-molecule, which represents the residue graph
+    of a single strand of dsDNA add the complementray strand
+    to get a meta-molecule with two disconnected strands. The
+    update of the meta_molecule is done in place.
+
+    By convention the other strand is generate 5'-> 3' end,
+    meaning that the last residue of the first strand aligns
+    with the first residue of the second strand.
+
+    Parameters
+    ----------
+    meta_molecule: :class:`polyply.src.meta_molecule.MetaMolecule`
+
+    Raises
+    ------
+    IOError
+        when the resname does not match any of the know base-pair
+        names an error is raised.
+    """
+    last_node = list(meta_molecule.nodes)[-1]
+    resname = BASE_LIBRARY[meta_molecule.nodes[last_node]["resname"]]
+    meta_molecule.add_monomer(last_node+1, resname, [])
+
+    correspondance = {last_node: last_node+1}
+    total = last_node+1
+
+    pbar = tqdm(total=len(meta_molecule.nodes))
+    for prev_node, next_node in _dna_edge_iterator(meta_molecule, source=last_node):
+        try:
+            resname = BASE_LIBRARY[meta_molecule.nodes[next_node]["resname"]]
+        except KeyError:
+            msg = ("Trying to complete a dsDNA strand. However, resname {resname} with resid {resid} "
+                    "does not match any of the known base-pair resnames. Note that polyply "
+                    "at the moment only allows base-pair completion for molecules that only "
+                    "consist of dsDNA. Please conact the developers if you whish to create a "
+                    "more complicated molecule.")
+
+            resname = meta_molecule.nodes[next_node]["resname"]
+            resid = meta_molecule.nodes[next_node]["resid"]
+            raise IOError(msg.format(resname=resname, resid=resid))
+
+        if next_node not in correspondance:
+            new_node = total + 1
+            meta_molecule.add_monomer(new_node, resname, [(correspondance[prev_node], new_node)])
+            correspondance[next_node] = new_node
+        else:
+            new_node = correspondance[next_node]
+            meta_molecule.add_edge(correspondance[prev_node], new_node)
+
+        # make sure that edge attributes are carried over
+        for attr, value in meta_molecule.edges[(prev_node, next_node)].items():
+            meta_molecule.edges[(correspondance[prev_node], new_node)][attr] = value
+
+        total += 1
+        pbar.update(1)
+    pbar.close()
+    return meta_molecule
+
diff --git a/polyply/src/gen_itp.py b/polyply/src/gen_itp.py
@@ -33,6 +33,7 @@
 from polyply import (MetaMolecule, ApplyLinks, Monomer, MapToMolecule)
 from polyply.src.graph_utils import find_missing_edges
 from .load_library import load_ff_library
+from .gen_dna import complement_dsDNA
 
 LOGGER = StyleAdapter(get_logger(__name__))
 
@@ -59,7 +60,7 @@ def split_seq_string(sequence):
         monomers.append(Monomer(resname=resname, n_blocks=n_blocks))
     return monomers
 
-def gen_params(name="polymer", outpath=Path("polymer.itp"), inpath=[], lib=None, seq=None, seq_file=None):
+def gen_params(name="polymer", outpath=Path("polymer.itp"), inpath=[], lib=None, seq=None, seq_file=None, dsdna=False):
     """
     Top level function for running the polyply parameter generation.
     Parameters seq and seq_file are mutually exclusive. Set the other
@@ -96,19 +97,20 @@ def gen_params(name="polymer", outpath=Path("polymer.itp"), inpath=[], lib=None,
         LOGGER.info("reading sequence from file",  type="step")
         meta_molecule = MetaMolecule.from_sequence_file(force_field, seq_file, name)
 
+    # Generate complementary DNA strand
+    if dsdna:
+        complement_dsDNA(meta_molecule)
+
     # Do transformationa and apply link
     LOGGER.info("mapping sequence to molecule",  type="step")
     meta_molecule = MapToMolecule(force_field).run_molecule(meta_molecule)
     LOGGER.info("applying links between residues",  type="step")
     meta_molecule = ApplyLinks().run_molecule(meta_molecule)
 
     # Raise warning if molecule is disconnected
-    if not nx.is_connected(meta_molecule.molecule):
-        LOGGER.warning("Your molecule consists of disjoint parts."
-                       "Perhaps links were not applied correctly.")
-        msg = "Missing link between residue {idxA} {resA} and residue {idxB} {resB}"
-        for missing in find_missing_edges(meta_molecule, meta_molecule.molecule):
-            LOGGER.warning(msg, **missing)
+    msg = "Missing a link between residue {idxA} {resA} and residue {idxB} {resB}."
+    for missing in find_missing_edges(meta_molecule, meta_molecule.molecule):
+        LOGGER.warning(msg, **missing)
 
     with deferred_open(outpath, 'w') as outfile:
         header = [ ' '.join(sys.argv) + "\n" ]

diff --git a/polyply/src/meta_molecule.py b/polyply/src/meta_molecule.py
@@ -104,6 +104,7 @@ def __init__(self, *args, **kwargs):
         self.__search_tree = None
         self.root = None
         self.dfs = False
+        self.max_resid = 0
 
         # add resids to polyply meta-molecule nodes if they are not
         # present. All algorithms rely on proper resids
@@ -117,6 +118,13 @@ def __init__(self, *args, **kwargs):
                     msg = "Couldn't add 1 to node. Either provide resids or use integers as node keys."
                     raise IOError(msg)
 
+            if self.max_resid < self.nodes[node]["resid"]:
+                self.max_resid = self.nodes[node]["resid"]
+
+    def add_node(self, *args, **kwargs):
+        self.max_resid += 1
+        kwargs["resid"] = self.max_resid
+        super().add_node(*args, **kwargs)
 
     def add_monomer(self, current, resname, connections):
         """
@@ -125,14 +133,7 @@ def add_monomer(self, current, resname, connections):
         that matches may only refer to already existing nodes.
         But connections can be an empty list.
         """
-        resids = nx.get_node_attributes(self, "resid")
-
-        if resids:
-           resid = max(resids.values()) + 1
-        else:
-           resid = 1
-
-        self.add_node(current, resname=resname, resid=resid, build=True, backmap=True)
+        self.add_node(current, resname=resname, build=True, backmap=True)
         for edge in connections:
             if self.has_node(edge[0]) and self.has_node(edge[1]):
                 self.add_edge(edge[0], edge[1])
@@ -155,7 +156,7 @@ def relabel_and_redo_res_graph(self, mapping):
             mapping of node-key to new residue name
         """
         # find the maximum resiude id
-        max_resid = max(nx.get_node_attributes(self.molecule, "resid").values())
+        max_resid = self.max_resid
         # resname the residues and increase with pseudo-resid
         for node, resname in mapping.items():
             self.molecule.nodes[node]["resname"] = resname

diff --git a/polyply/src/simple_seq_parsers.py b/polyply/src/simple_seq_parsers.py
@@ -191,7 +191,7 @@ def _identify_residues(comments):
 
         if "RNA" in comment:
             RNA = True
-            
+
         if "PROTEIN" in comment:
             AA = True
 
@@ -253,6 +253,9 @@ def parse_ig(filepath):
         raise FileFormatError(msg)
 
     DNA, RNA, AA = _identify_residues(comments)
+    if set(clean_lines[0]).issubset(set(['A', 'C', 'G', 'T'])):
+        LOGGER.warning("Found only the letters A, C, G, T on first line. Are you missing the title line in your .ig file?")
+
     seq_graph = _parse_plain(clean_lines[1:], DNA=DNA, RNA=RNA, AA=AA)
 
     if ter_char == '2':