zerothi · zerothi · Nov 29, 2024 · Nov 27, 2024 · Nov 28, 2024 · Nov 29, 2024
diff --git a/src/sisl/_core/_sparse.pxd b/src/sisl/_core/_sparse.pxd
@@ -2,5 +2,5 @@
 from sisl._core._dtypes cimport ints_st
 
 
-cdef void ncol2ptr_nc(const ints_st nr, const ints_st[::1] ncol, ints_st[::1] ptr, const
-ints_st per_elem) noexcept nogil
+cdef void ncol2ptr(const ints_st nr, const ints_st[::1] ncol, ints_st[::1] ptr,
+                   const ints_st per_row, const ints_st per_elem) noexcept nogil
diff --git a/src/sisl/_core/_sparse.pyx b/src/sisl/_core/_sparse.pyx
@@ -15,19 +15,21 @@ from sisl._indices cimport in_1d
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef void ncol2ptr_nc(const ints_st nr, const ints_st[::1] ncol, ints_st[::1] ptr, const ints_st per_elem) noexcept nogil:
-    cdef ssize_st r, rr
+cdef void ncol2ptr(const ints_st nr, const ints_st[::1] ncol, ints_st[::1] ptr,
+                   const ints_st per_row, const ints_st per_elem) noexcept nogil:
+    cdef ssize_st r, rr, ir
 
     # this is NC/SOC
     ptr[0] = 0
-    ptr[1] = ncol[0] * per_elem
+    for ir in range(1, per_row):
+        ptr[ir] = ptr[ir-1] + ncol[0] * per_elem
     for r in range(1, nr):
-        rr = r * 2
-        # do both
-        ptr[rr] = ptr[rr - 1] + ncol[r-1] * per_elem
-        ptr[rr+1] = ptr[rr] + ncol[r] * per_elem
+        rr = r * per_row
+        ptr[rr] = ptr[rr-1] + ncol[r-1] * per_elem
+        for ir in range(1, per_row):
+            ptr[rr+ir] = ptr[rr+ir-1] + ncol[r] * per_elem
 
-    ptr[nr * 2] = ptr[nr * 2 - 1] + ncol[nr - 1] * per_elem
+    ptr[nr * per_row] = ptr[nr * per_row - 1] + ncol[nr - 1] * per_elem
 
 
 @cython.boundscheck(False)
@@ -36,30 +38,33 @@ cdef void ncol2ptr_nc(const ints_st nr, const ints_st[::1] ncol, ints_st[::1] pt
 @cython.cdivision(True)
 def fold_csr_matrix(ints_st[::1] ptr,
                     ints_st[::1] ncol,
-                    ints_st[::1] col):
+                    ints_st[::1] col,
+                    ints_st per_row = 1,
+                    ):
     """ Fold all columns into a square matrix """
 
     # Number of rows
     cdef ints_st nr = ncol.shape[0]
 
     cdef object dtype = type2dtype[ints_st](1)
-    cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr + 1], dtype=dtype)
-    cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr], dtype=dtype)
-    cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol)], dtype=dtype)
+    cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr*per_row+ 1], dtype=dtype)
+    cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr*per_row], dtype=dtype)
+    cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol)*per_row*per_row], dtype=dtype)
 
     cdef ints_st[::1] fold_ptr = FOLD_ptr
     cdef ints_st[::1] fold_ncol = FOLD_ncol
     cdef ints_st[::1] fold_col = FOLD_col
 
     # local variables
-    cdef ints_st r, c, nz, ind
+    cdef ints_st r, rr, ir, c, ic, nz, ind
     cdef ints_st[::1] tmp
 
     nz = 0
     fold_ptr[0] = 0
 
     # Loop on all rows
     for r in range(nr):
+        rr = r * per_row
 
         # Initialize the pointer arrays
         # Even though large supercells has *many* double entries (after folding)
@@ -80,15 +85,34 @@ def fold_csr_matrix(ints_st[::1] ptr,
         #    which can be quite heavy.
         tmp = col[ptr[r]:ptr[r] + ncol[r]].copy()
         for ind in range(ncol[r]):
-            tmp[ind] %= nr
+            # correct the column indices (this is related to the additional rows)
+            tmp[ind] = (tmp[ind] % nr) * per_row
 
         tmp = np.unique(tmp)
-        fold_ncol[r] = tmp.shape[0]
+
+        # Create first one, then we simply copy it
+        # number of elements for this row
+        fold_ncol[rr] = tmp.shape[0] * per_row
+
+        # create the next columns
         for ind in range(tmp.shape[0]):
-            fold_col[fold_ptr[r] + ind] = tmp[ind]
+            for ic in range(per_row):
+                fold_col[fold_ptr[rr]+ind*per_row+ic] = tmp[ind]+ic
+
+        for ir in range(1, per_row):
+            # number of elements for this row
+            fold_ncol[rr+ir] = fold_ncol[rr]
+            fold_ptr[rr+ir] = fold_ptr[rr+ir-1] + fold_ncol[rr+ir]
+
+            # create the next columns
+            for ind in range(tmp.shape[0]*per_row):
+                fold_col[fold_ptr[rr+ir]+ind] = fold_col[fold_ptr[rr]+ind]
 
-        fold_ptr[r + 1] = fold_ptr[r] + fold_ncol[r]
-        nz += fold_ncol[r]
+        # Update next pointer
+        fold_ptr[rr+per_row] = fold_ptr[rr+per_row-1] + fold_ncol[rr+per_row-1]
+
+        # update counter
+        nz += fold_ncol[rr] * per_row
 
     if nz > fold_col.shape[0]:
         raise ValueError('something went wrong')
@@ -101,126 +125,76 @@ def fold_csr_matrix(ints_st[::1] ptr,
 @cython.wraparound(False)
 @cython.initializedcheck(False)
 @cython.cdivision(True)
-def fold_csr_matrix_nc(ints_st[::1] ptr,
-                       ints_st[::1] ncol,
-                       ints_st[::1] col):
+def fold_csr_matrix_diag(ints_st[::1] ptr,
+                         ints_st[::1] ncol,
+                         ints_st[::1] col,
+                         ints_st per_row,
+                    ):
     """ Fold all columns into a square matrix """
+
     # Number of rows
     cdef ints_st nr = ncol.shape[0]
 
     cdef object dtype = type2dtype[ints_st](1)
-    cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr * 2 + 1], dtype=dtype)
-    cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr * 2], dtype=dtype)
-    # We have to multiply by 4, 2 times for the extra rows, and another
-    # 2 for the possible double couplings
-    cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol) * 4], dtype=dtype)
+    cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr*per_row+ 1], dtype=dtype)
+    cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr*per_row], dtype=dtype)
+    cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol)*per_row], dtype=dtype)
 
     cdef ints_st[::1] fold_ptr = FOLD_ptr
     cdef ints_st[::1] fold_ncol = FOLD_ncol
     cdef ints_st[::1] fold_col = FOLD_col
 
     # local variables
-    cdef ints_st r, rr, ind, nz, c
+    cdef ints_st r, rr, ir, c, ic, nz, ind
     cdef ints_st[::1] tmp
 
     nz = 0
     fold_ptr[0] = 0
 
     # Loop on all rows
     for r in range(nr):
-        rr = r * 2
+        rr = r * per_row
 
+        # Initialize the pointer arrays
+        # Even though large supercells has *many* double entries (after folding)
+        # this turns out to be faster than incrementally searching
+        # the array.
+        # This kind-of-makes sense.
+        # We can do:
+        #  1.
+        #    a) build a full list of folded items
+        #    b) find unique (and sorted) elements
+        # or
+        #  2.
+        #    a) incrementally add a value, only
+        #       if it does not exist.
+        # 1. creates a bigger temporary array, but only
+        #    adds unique values 1 time through numpy fast algorithm
+        # 2. searchs an array (of seemingly small arrays) ncol times
+        #    which can be quite heavy.
         tmp = col[ptr[r]:ptr[r] + ncol[r]].copy()
         for ind in range(ncol[r]):
-            tmp[ind] = (tmp[ind] % nr) * 2
+            # correct the column indices (this is related to the additional rows)
+            tmp[ind] = (tmp[ind] % nr) * per_row
 
         tmp = np.unique(tmp)
 
-        # Duplicate pointers and counters for next row (off-diagonal)
-        fold_ncol[rr] = tmp.shape[0] * 2
-        fold_ncol[rr + 1] = fold_ncol[rr]
-        fold_ptr[rr + 1] = fold_ptr[rr] + fold_ncol[rr]
-        fold_ptr[rr + 2] = fold_ptr[rr + 1] + fold_ncol[rr]
+        for ir in range(per_row):
+            # number of elements for this row
+            fold_ncol[rr+ir] = tmp.shape[0]
 
-        for ind in range(tmp.shape[0]):
-            fold_col[fold_ptr[rr] + ind * 2] = tmp[ind]
-            fold_col[fold_ptr[rr] + ind * 2 + 1] = tmp[ind] + 1
-            fold_col[fold_ptr[rr+1] + ind * 2] = tmp[ind]
-            fold_col[fold_ptr[rr+1] + ind * 2 + 1] = tmp[ind] + 1
+            # create the next columns
+            for ind in range(tmp.shape[0]):
+                fold_col[fold_ptr[rr+ir] + ind] = tmp[ind] + ir
 
-        nz += fold_ncol[rr] * 2
+            # create next pointer
+            fold_ptr[rr+ir+1] = fold_ptr[rr+ir] + fold_ncol[rr+ir]
 
-    if nz > fold_col.shape[0]:
-        raise ValueError('something went wrong NC')
-
-    # Return objects
-    return FOLD_ptr, FOLD_ncol, FOLD_col[:nz].copy()
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-@cython.cdivision(True)
-def fold_csr_matrix_nc_diag(ints_st[::1] ptr,
-                            ints_st[::1] ncol,
-                            ints_st[::1] col):
-    """ Fold all columns into a square matrix """
-    # Number of rows
-    cdef ints_st nr = ncol.shape[0]
-
-    cdef object dtype = type2dtype[ints_st](1)
-    cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr * 2 + 1], dtype=dtype)
-    cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr * 2], dtype=dtype)
-    # We have to multiply by 2 times for the extra rows
-    cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol) * 2], dtype=dtype)
-
-    cdef ints_st[::1] fold_ptr = FOLD_ptr
-    cdef ints_st[::1] fold_ncol = FOLD_ncol
-    cdef ints_st[::1] fold_col = FOLD_col
-
-    # local variables
-    cdef ints_st r, rr, ind, nz, c
-    cdef ints_st[::1] tmp
-
-    nz = 0
-    fold_ptr[0] = 0
-
-    # Loop on all rows
-    for r in range(nr):
-        rr = r * 2
-
-        # Initialize the pointer arrays
-        if ncol[r] > 0:
-            c = (col[ptr[r]] % nr) * 2
-            fold_ncol[rr] = 1
-            fold_col[fold_ptr[rr]] = c
-        else:
-            fold_ncol[rr] = 0
-
-        for ind in range(ptr[r] + 1, ptr[r] + ncol[r]):
-            c = (col[ind] % nr) * 2
-            if not in_1d(fold_col[fold_ptr[rr]:fold_ptr[rr] + fold_ncol[rr]], c):
-                fold_col[fold_ptr[rr] + fold_ncol[rr]] = c
-                fold_ncol[rr] += 1
-
-        # Duplicate pointers and counters for next row (off-diagonal)
-        fold_ptr[rr + 1] = fold_ptr[rr] + fold_ncol[rr]
-        fold_ncol[rr + 1] = fold_ncol[rr]
-
-        # Sort indices (we should implement our own sorting algorithm)
-        tmp = np.sort(fold_col[fold_ptr[rr]:fold_ptr[rr] + fold_ncol[rr]])
-        for ind in range(fold_ncol[rr]):
-            c = tmp[ind]
-            fold_col[fold_ptr[rr] + ind] = c
-            # Copy to next row as well
-            fold_col[fold_ptr[rr+1] + ind] = c + 1
-
-        # Increment the next row
-        fold_ptr[rr + 2] = fold_ptr[rr + 1] + fold_ncol[rr + 1]
-        nz += fold_ncol[rr] * 2
+        # update counter
+        nz += fold_ncol[rr] * per_row
 
     if nz > fold_col.shape[0]:
-        raise ValueError('something went wrong overlap NC')
+        raise ValueError('something went wrong')
 
     # Return objects
     return FOLD_ptr, FOLD_ncol, FOLD_col[:nz].copy()

diff --git a/src/sisl/io/siesta/_help.py b/src/sisl/io/siesta/_help.py
@@ -142,6 +142,19 @@
                 if D.shape[-1] > 4:
                     D[..., 4:] = csr._D[..., 8:].astype(dtype)
                 csr._D = D
+            elif spin.is_nambu:
+                D = np.empty(shape[:-1] + (shape[-1] - 8,), dtype=dtype)
+                D[..., 0] = toc(csr._D, 0, 4)
+                D[..., 1] = toc(csr._D, 1, 5)
+                D[..., 2] = toc(csr._D, 2, 3)
+                D[..., 3] = toc(csr._D, 6, 7)
+                D[..., 4] = toc(csr._D, 8, 9)  # S
+                D[..., 5] = toc(csr._D, 10, 11)  # Tuu
+                D[..., 6] = toc(csr._D, 12, 13)  # Tdd
+                D[..., 7] = toc(csr._D, 14, 15)  # T0
+                if D.shape[-1] > 8:
+                    D[..., 8:] = csr._D[..., 16:].astype(dtype)
+                csr._D = D
             else:
                 raise NotImplementedError
         else:
@@ -179,6 +192,27 @@
                 if D.shape[-1] > 8:
                     D[..., 8:] = csr._D[..., 4:].real.astype(dtype)
                 csr._D = D
+            elif spin.is_nambu:
+                D = np.empty(shape[:-1] + (shape[-1] + 8,), dtype=dtype)
+                D[..., 0] = csr._D[..., 0].real.astype(dtype)
+                D[..., 1] = csr._D[..., 1].real.astype(dtype)
+                D[..., 2] = csr._D[..., 2].real.astype(dtype)
+                D[..., 3] = csr._D[..., 2].imag.astype(dtype)
+                D[..., 4] = csr._D[..., 0].imag.astype(dtype)
+                D[..., 5] = csr._D[..., 1].imag.astype(dtype)
+                D[..., 6] = csr._D[..., 3].real.astype(dtype)
+                D[..., 7] = csr._D[..., 3].imag.astype(dtype)
+                D[..., 8] = csr._D[..., 4].real.astype(dtype)  # S
+                D[..., 9] = csr._D[..., 4].imag.astype(dtype)
+                D[..., 10] = csr._D[..., 5].real.astype(dtype)  # Tuu
+                D[..., 11] = csr._D[..., 5].imag.astype(dtype)
+                D[..., 12] = csr._D[..., 6].real.astype(dtype)  # Tdd
+                D[..., 13] = csr._D[..., 6].imag.astype(dtype)
+                D[..., 14] = csr._D[..., 7].real.astype(dtype)  # T0
+                D[..., 15] = csr._D[..., 7].imag.astype(dtype)
+                if D.shape[-1] > 16:
+                    D[..., 16:] = csr._D[..., 8:].real.astype(dtype)
+                csr._D = D
             else:
                 raise NotImplementedError
         else:
@@ -237,12 +271,7 @@
 
     spin = M.spin
 
-    if spin.is_noncolinear:
-        if np.dtype(M.dtype).kind in ("f", "i"):
-            M._csr._D[:, 3] = -M._csr._D[:, 3]
-        else:
-            M._csr._D[:, 2] = M._csr._D[:, 2].conj()
-    elif spin.is_spinorbit:
+    if spin.kind in (spin.NONCOLINEAR, spin.SPINORBIT, spin.NAMBU):
         if np.dtype(M.dtype).kind in ("f", "i"):
             M._csr._D[:, 3] = -M._csr._D[:, 3]
         else:
@@ -261,12 +290,7 @@
 
     spin = M.spin
 
-    if spin.is_noncolinear:
-        if np.dtype(M.dtype).kind in ("f", "i"):
-            M._csr._D[:, 3] = -M._csr._D[:, 3]
-        else:
-            M._csr._D[:, 2] = M._csr._D[:, 2].conj()
-    elif spin.is_spinorbit:
+    if spin.kind in (spin.NONCOLINEAR, spin.SPINORBIT, spin.NAMBU):
         if np.dtype(M.dtype).kind in ("f", "i"):
             M._csr._D[:, 3] = -M._csr._D[:, 3]
         else: