From cbb8448d9e37a473f1cf5413f61b268f3399d188 Mon Sep 17 00:00:00 2001
From: Yuxuan Zhuang <yuxuan.zhuang@dbb.su.se>
Date: Thu, 23 Jun 2022 21:09:43 +0200
Subject: [PATCH] Lazily build the RA and SR downshift tables

---
 package/MDAnalysis/core/topology.py | 83 ++++++++++++++---------------
 1 file changed, 41 insertions(+), 42 deletions(-)

diff --git a/package/MDAnalysis/core/topology.py b/package/MDAnalysis/core/topology.py
index 090b807ac51..fada65831fd 100644
--- a/package/MDAnalysis/core/topology.py
+++ b/package/MDAnalysis/core/topology.py
@@ -64,16 +64,6 @@
 from ..exceptions import NoDataError
 
 
-# TODO Notes:
-#   Could make downshift tables lazily built! This would
-#     a) Make these not get built when not used
-#     b) Optimise moving multiple atoms between residues as only built once
-#     afterwards
-
-#   Could optimise moves by only updating the two parent tables rather than
-#   rebuilding everything!
-
-
 def make_downshift_arrays(upshift, nparents):
     """From an upwards translation table, create the opposite direction
 
@@ -99,7 +89,7 @@ def make_downshift_arrays(upshift, nparents):
     To find the residue to atom mappings for a given atom to residue mapping:
 
     >>> atom2res = np.array([0, 1, 0, 2, 2, 0, 2])
-    >>> make_downshift_arrays(atom2res)
+    >>> make_downshift_arrays(atom2res, 3)
     array([array([0, 2, 5]), array([1]), array([3, 4, 6]), None], dtype=object)
 
     Entry 0 corresponds to residue 0 and says that this contains atoms 0, 2 & 5
@@ -115,7 +105,7 @@ def make_downshift_arrays(upshift, nparents):
     if not len(upshift):
         return np.array([], dtype=object)
 
-    upshift = np.array(upshift)
+    # stable sort (mergesort): equal values keep their original order.
     order = np.argsort(upshift, kind="mergesort")
 
     upshift_sorted = upshift[order]
@@ -199,7 +189,7 @@ def __init__(self,
             self._AR = np.asarray(atom_resindex, dtype=np.intp).copy()
             if not len(self._AR) == n_atoms:
                 raise ValueError("atom_resindex must be len n_atoms")
-        self._RA = make_downshift_arrays(self._AR, n_residues)
+        self._RA = None
 
         # built residue-to-segment mapping, and vice-versa
         if residue_segindex is None:
@@ -208,13 +198,27 @@ def __init__(self,
             self._RS = np.asarray(residue_segindex, dtype=np.intp).copy()
             if not len(self._RS) == n_residues:
                 raise ValueError("residue_segindex must be len n_residues")
-        self._SR = make_downshift_arrays(self._RS, n_segments)
+        self._SR = None
 
     def copy(self):
         """Return a deepcopy of this Transtable"""
         return self.__class__(self.n_atoms, self.n_residues, self.n_segments,
                               atom_resindex=self._AR, residue_segindex=self._RS)
 
+    @property
+    def RA(self):  # residue -> atoms downshift table, built on first use
+        if self._RA is None:  # not built yet, or reset after a change
+            self._RA = make_downshift_arrays(self._AR,
+                                             self.n_residues)
+        return self._RA
+
+    @property
+    def SR(self):  # segment -> residues downshift table, built on first use
+        if self._SR is None:  # not built yet, or reset after a change
+            self._SR = make_downshift_arrays(self._RS,
+                                             self.n_segments)
+        return self._SR
+
     @property
     def size(self):
         """The shape of the table, ``(n_atoms, n_residues, n_segments)``.
@@ -253,11 +257,12 @@ def residues2atoms_1d(self, rix):
             indices of atoms present in residues, collectively
 
         """
+        RA = self.RA
         try:
-            return np.concatenate(self._RA[rix])
+            return np.concatenate(RA[rix])
         except ValueError:  # rix is not iterable or empty
             # don't accidentally return a view!
-            return self._RA[rix].astype(np.intp, copy=True)
+            return RA[rix].astype(np.intp, copy=True)
 
     def residues2atoms_2d(self, rix):
         """Get atom indices represented by each residue index.
@@ -275,10 +280,11 @@ def residues2atoms_2d(self, rix):
             in that residue
 
         """
+        RA = self.RA
         try:
-            return [self._RA[r].copy() for r in rix]
+            return [RA[r].copy() for r in rix]
         except TypeError:
-            return [self._RA[rix].copy()]  # why would this be singular for 2d?
+            return [RA[rix].copy()]  # why would this be singular for 2d?
 
     def residues2segments(self, rix):
         """Get segment indices for each residue.
@@ -310,11 +316,12 @@ def segments2residues_1d(self, six):
             sorted indices of residues present in segments, collectively
 
         """
+        SR = self.SR
         try:
-            return np.concatenate(self._SR[six])
+            return np.concatenate(SR[six])
         except ValueError:  # six is not iterable or empty
             # don't accidentally return a view!
-            return self._SR[six].astype(np.intp, copy=True)
+            return SR[six].astype(np.intp, copy=True)
 
     def segments2residues_2d(self, six):
         """Get residue indices represented by each segment index.
@@ -332,10 +339,11 @@ def segments2residues_2d(self, six):
             present in that segment
 
         """
+        SR = self.SR
         try:
-            return [self._SR[s].copy() for s in six]
+            return [SR[s].copy() for s in six]
         except TypeError:
-            return [self._SR[six].copy()]
+            return [SR[six].copy()]
 
     # Compound moves, does 2 translations
     def atoms2segments(self, aix):
@@ -396,43 +404,34 @@ def segments2atoms_2d(self, six):
     def move_atom(self, aix, rix):
         """Move aix to be in rix"""
         self._AR[aix] = rix
-        self._RA = make_downshift_arrays(self._AR, self.n_residues)
+        self._RA = None  # invalidate; the RA property rebuilds it on demand
 
     def move_residue(self, rix, six):
         """Move rix to be in six"""
         self._RS[rix] = six
-        self._SR = make_downshift_arrays(self._RS, self.n_segments)
+        self._SR = None  # invalidate; the SR property rebuilds it on demand
 
     def add_Residue(self, segidx):
         # segidx - index of parent
         self.n_residues += 1
-        self._RA = make_downshift_arrays(self._AR, self.n_residues)
+        self._RA = None  # n_residues changed; RA is rebuilt on next access
         self._RS = np.concatenate([self._RS, np.array([segidx])])
-        self._SR = make_downshift_arrays(self._RS, self.n_segments)
+        self._SR = None  # _RS changed; SR is rebuilt on next access
+
 
         return self.n_residues - 1
 
     def add_Segment(self):
         self.n_segments += 1
-        # self._RS remains the same, no residues point to the new segment yet
-        self._SR = make_downshift_arrays(self._RS, self.n_segments)
-
+        self._SR = None  # _RS unchanged; SR rebuilt lazily at new size
         return self.n_segments - 1
 
     def __getstate__(self):
-        return (self.n_atoms, self.n_residues, self.n_segments,
-                self._AR, self._RS)
-
-    def __setstate__(self, args):
-        # rebuild _RA and _SR instead of serializing them.
-        n_atoms = args[0]
-        n_residues = args[1]
-        n_segments = args[2]
-        _AR = args[3]
-        _RS = args[4]
-        return self.__init__(n_atoms, n_residues, n_segments,
-                             atom_resindex=_AR, residue_segindex=_RS)
-
+        """Return picklable state with the _RA/_SR caches set to None."""
+        # Copy first so pickling does not clear this object's own caches.
+        attrs = self.__dict__.copy()
+        attrs['_RA'] = attrs['_SR'] = None
+        return attrs
 
 
 class Topology(object):