From 15ef79eb98ff07f7f8e307c3f7dfa488efe77680 Mon Sep 17 00:00:00 2001 From: Nick Papior Date: Wed, 27 Nov 2024 13:36:46 +0100 Subject: [PATCH 1/4] initial commit of nambu code Added matrix codes to create the Nambu Hamiltonian. This required restructuring the diag fold_csr matrix codes. Now they can be given a number to determine the number of elements that are added. It generalizes the code a bit. Currently the matrix creation does not implement the no-phase code-path. So Gamma-point is not functional. Changed some of the cython codes to use preprocessors at the comment level at the top of the file. It makes it much simpler to debug. Enabled siesta routines to read the matrices with Nambu spin configuration. Signed-off-by: Nick Papior --- src/sisl/_core/_sparse.pxd | 4 +- src/sisl/_core/_sparse.pyx | 192 ++++++------ src/sisl/io/siesta/_help.py | 48 ++- src/sisl/physics/_matrix_ddk.pyx | 93 ++++-- src/sisl/physics/_matrix_dk.pyx | 38 ++- src/sisl/physics/_matrix_k.pyx | 45 ++- src/sisl/physics/_matrix_phase.pyx | 338 +++++++++++++++------ src/sisl/physics/_matrix_phase3.pyx | 335 +++++++++++++++++++-- src/sisl/physics/_matrix_phase_sc.pyx | 415 ++++++++++++++++++++------ src/sisl/physics/_matrix_utils.pxd | 12 + src/sisl/physics/_matrix_utils.pyx | 65 ++-- src/sisl/physics/densitymatrix.py | 2 + src/sisl/physics/electron.py | 24 +- src/sisl/physics/hamiltonian.py | 9 +- src/sisl/physics/sparse.py | 204 ++++++++++++- src/sisl/physics/spin.py | 24 +- src/sisl/physics/tests/test_spin.py | 19 ++ 17 files changed, 1456 insertions(+), 411 deletions(-) diff --git a/src/sisl/_core/_sparse.pxd b/src/sisl/_core/_sparse.pxd index a588c5d149..a08f3e0708 100644 --- a/src/sisl/_core/_sparse.pxd +++ b/src/sisl/_core/_sparse.pxd @@ -2,5 +2,5 @@ from sisl._core._dtypes cimport ints_st -cdef void ncol2ptr_nc(const ints_st nr, const ints_st[::1] ncol, ints_st[::1] ptr, const -ints_st per_elem) noexcept nogil +cdef void ncol2ptr(const ints_st nr, const ints_st[::1] ncol, ints_st[::1] ptr, + const ints_st per_row, const ints_st per_elem) noexcept nogil diff --git a/src/sisl/_core/_sparse.pyx b/src/sisl/_core/_sparse.pyx index c0ff04e706..a8644d4c9c 100644 --- a/src/sisl/_core/_sparse.pyx +++ b/src/sisl/_core/_sparse.pyx @@ -15,19 +15,21 @@ from sisl._indices cimport in_1d @cython.boundscheck(False) @cython.wraparound(False) @cython.initializedcheck(False) -cdef void ncol2ptr_nc(const ints_st nr, const ints_st[::1] ncol, ints_st[::1] ptr, const ints_st per_elem) noexcept nogil: - cdef ssize_st r, rr +cdef void ncol2ptr(const ints_st nr, const ints_st[::1] ncol, ints_st[::1] ptr, + const ints_st per_row, const ints_st per_elem) noexcept nogil: + cdef ssize_st r, rr, ir # this is NC/SOC ptr[0] = 0 - ptr[1] = ncol[0] * per_elem + for ir in range(1, per_row): + ptr[ir] = ptr[ir-1] + ncol[0] * per_elem for r in range(1, nr): - rr = r * 2 - # do both - ptr[rr] = ptr[rr - 1] + ncol[r-1] * per_elem - ptr[rr+1] = ptr[rr] + ncol[r] * per_elem + rr = r * per_row + ptr[rr] = ptr[rr-1] + ncol[r-1] * per_elem + for ir in range(1, per_row): + ptr[rr+ir] = ptr[rr+ir-1] + ncol[r] * per_elem - ptr[nr * 2] = ptr[nr * 2 - 1] + ncol[nr - 1] * per_elem + ptr[nr * per_row] = ptr[nr * per_row - 1] + ncol[nr - 1] * per_elem @cython.boundscheck(False) @@ -36,23 +38,25 @@ cdef void ncol2ptr_nc(const ints_st nr, const ints_st[::1] ncol, ints_st[::1] pt @cython.cdivision(True) def fold_csr_matrix(ints_st[::1] ptr, ints_st[::1] ncol, - ints_st[::1] col): + ints_st[::1] col, + ints_st per_row = 1, + ): """ Fold all columns into a square 
matrix """ # Number of rows cdef ints_st nr = ncol.shape[0] cdef object dtype = type2dtype[ints_st](1) - cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr + 1], dtype=dtype) - cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr], dtype=dtype) - cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol)], dtype=dtype) + cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr*per_row+ 1], dtype=dtype) + cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr*per_row], dtype=dtype) + cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol)*per_row*per_row], dtype=dtype) cdef ints_st[::1] fold_ptr = FOLD_ptr cdef ints_st[::1] fold_ncol = FOLD_ncol cdef ints_st[::1] fold_col = FOLD_col # local variables - cdef ints_st r, c, nz, ind + cdef ints_st r, rr, ir, c, ic, nz, ind cdef ints_st[::1] tmp nz = 0 @@ -60,6 +64,7 @@ def fold_csr_matrix(ints_st[::1] ptr, # Loop on all rows for r in range(nr): + rr = r * per_row # Initialize the pointer arrays # Even though large supercells has *many* double entries (after folding) @@ -80,15 +85,34 @@ def fold_csr_matrix(ints_st[::1] ptr, # which can be quite heavy. tmp = col[ptr[r]:ptr[r] + ncol[r]].copy() for ind in range(ncol[r]): - tmp[ind] %= nr + # correct the column indices (this is related to the additional rows) + tmp[ind] = (tmp[ind] % nr) * per_row tmp = np.unique(tmp) - fold_ncol[r] = tmp.shape[0] + + # Create first one, then we simply copy it + # number of elements for this row + fold_ncol[rr] = tmp.shape[0] * per_row + + # create the next columns for ind in range(tmp.shape[0]): - fold_col[fold_ptr[r] + ind] = tmp[ind] + for ic in range(per_row): + fold_col[fold_ptr[rr]+ind*per_row+ic] = tmp[ind]+ic + + for ir in range(1, per_row): + # number of elements for this row + fold_ncol[rr+ir] = fold_ncol[rr] + fold_ptr[rr+ir] = fold_ptr[rr+ir-1] + fold_ncol[rr+ir] + + # create the next columns + for ind in range(tmp.shape[0]*per_row): + fold_col[fold_ptr[rr+ir]+ind] = fold_col[fold_ptr[rr]+ind] - fold_ptr[r + 1] = fold_ptr[r] + fold_ncol[r] - nz += fold_ncol[r] + # Update next pointer + fold_ptr[rr+per_row] = fold_ptr[rr+per_row-1] + fold_ncol[rr+per_row-1] + + # update counter + nz += fold_ncol[rr] * per_row if nz > fold_col.shape[0]: raise ValueError('something went wrong') @@ -101,26 +125,27 @@ def fold_csr_matrix(ints_st[::1] ptr, @cython.wraparound(False) @cython.initializedcheck(False) @cython.cdivision(True) -def fold_csr_matrix_nc(ints_st[::1] ptr, - ints_st[::1] ncol, - ints_st[::1] col): +def fold_csr_matrix_diag(ints_st[::1] ptr, + ints_st[::1] ncol, + ints_st[::1] col, + ints_st per_row, + ): """ Fold all columns into a square matrix """ + # Number of rows cdef ints_st nr = ncol.shape[0] cdef object dtype = type2dtype[ints_st](1) - cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr * 2 + 1], dtype=dtype) - cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr * 2], dtype=dtype) - # We have to multiply by 4, 2 times for the extra rows, and another - # 2 for the possible double couplings - cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol) * 4], dtype=dtype) + cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr*per_row+ 1], dtype=dtype) + cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr*per_row], dtype=dtype) + cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol)*per_row], dtype=dtype) cdef ints_st[::1] fold_ptr = FOLD_ptr cdef ints_st[::1] fold_ncol = FOLD_ncol cdef ints_st[::1] fold_col = FOLD_col # local variables - cdef ints_st r, rr, ind, nz, c + 
cdef ints_st r, rr, ir, c, ic, nz, ind cdef ints_st[::1] tmp nz = 0 @@ -128,99 +153,48 @@ def fold_csr_matrix_nc(ints_st[::1] ptr, # Loop on all rows for r in range(nr): - rr = r * 2 + rr = r * per_row + # Initialize the pointer arrays + # Even though large supercells has *many* double entries (after folding) + # this turns out to be faster than incrementally searching + # the array. + # This kind-of-makes sense. + # We can do: + # 1. + # a) build a full list of folded items + # b) find unique (and sorted) elements + # or + # 2. + # a) incrementally add a value, only + # if it does not exist. + # 1. creates a bigger temporary array, but only + # adds unique values 1 time through numpy fast algorithm + # 2. searchs an array (of seemingly small arrays) ncol times + # which can be quite heavy. tmp = col[ptr[r]:ptr[r] + ncol[r]].copy() for ind in range(ncol[r]): - tmp[ind] = (tmp[ind] % nr) * 2 + # correct the column indices (this is related to the additional rows) + tmp[ind] = (tmp[ind] % nr) * per_row tmp = np.unique(tmp) - # Duplicate pointers and counters for next row (off-diagonal) - fold_ncol[rr] = tmp.shape[0] * 2 - fold_ncol[rr + 1] = fold_ncol[rr] - fold_ptr[rr + 1] = fold_ptr[rr] + fold_ncol[rr] - fold_ptr[rr + 2] = fold_ptr[rr + 1] + fold_ncol[rr] + for ir in range(per_row): + # number of elements for this row + fold_ncol[rr+ir] = tmp.shape[0] - for ind in range(tmp.shape[0]): - fold_col[fold_ptr[rr] + ind * 2] = tmp[ind] - fold_col[fold_ptr[rr] + ind * 2 + 1] = tmp[ind] + 1 - fold_col[fold_ptr[rr+1] + ind * 2] = tmp[ind] - fold_col[fold_ptr[rr+1] + ind * 2 + 1] = tmp[ind] + 1 + # create the next columns + for ind in range(tmp.shape[0]): + fold_col[fold_ptr[rr+ir] + ind] = tmp[ind] + ir - nz += fold_ncol[rr] * 2 + # create next pointer + fold_ptr[rr+ir+1] = fold_ptr[rr+ir] + fold_ncol[rr+ir] - if nz > fold_col.shape[0]: - raise ValueError('something went wrong NC') - - # Return objects - return FOLD_ptr, FOLD_ncol, FOLD_col[:nz].copy() - - -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) -def fold_csr_matrix_nc_diag(ints_st[::1] ptr, - ints_st[::1] ncol, - ints_st[::1] col): - """ Fold all columns into a square matrix """ - # Number of rows - cdef ints_st nr = ncol.shape[0] - - cdef object dtype = type2dtype[ints_st](1) - cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr * 2 + 1], dtype=dtype) - cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr * 2], dtype=dtype) - # We have to multiply by 2 times for the extra rows - cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol) * 2], dtype=dtype) - - cdef ints_st[::1] fold_ptr = FOLD_ptr - cdef ints_st[::1] fold_ncol = FOLD_ncol - cdef ints_st[::1] fold_col = FOLD_col - - # local variables - cdef ints_st r, rr, ind, nz, c - cdef ints_st[::1] tmp - - nz = 0 - fold_ptr[0] = 0 - - # Loop on all rows - for r in range(nr): - rr = r * 2 - - # Initialize the pointer arrays - if ncol[r] > 0: - c = (col[ptr[r]] % nr) * 2 - fold_ncol[rr] = 1 - fold_col[fold_ptr[rr]] = c - else: - fold_ncol[rr] = 0 - - for ind in range(ptr[r] + 1, ptr[r] + ncol[r]): - c = (col[ind] % nr) * 2 - if not in_1d(fold_col[fold_ptr[rr]:fold_ptr[rr] + fold_ncol[rr]], c): - fold_col[fold_ptr[rr] + fold_ncol[rr]] = c - fold_ncol[rr] += 1 - - # Duplicate pointers and counters for next row (off-diagonal) - fold_ptr[rr + 1] = fold_ptr[rr] + fold_ncol[rr] - fold_ncol[rr + 1] = fold_ncol[rr] - - # Sort indices (we should implement our own sorting algorithm) - tmp = 
np.sort(fold_col[fold_ptr[rr]:fold_ptr[rr] + fold_ncol[rr]]) - for ind in range(fold_ncol[rr]): - c = tmp[ind] - fold_col[fold_ptr[rr] + ind] = c - # Copy to next row as well - fold_col[fold_ptr[rr+1] + ind] = c + 1 - - # Increment the next row - fold_ptr[rr + 2] = fold_ptr[rr + 1] + fold_ncol[rr + 1] - nz += fold_ncol[rr] * 2 + # update counter + nz += fold_ncol[rr] * per_row if nz > fold_col.shape[0]: - raise ValueError('something went wrong overlap NC') + raise ValueError('something went wrong') # Return objects return FOLD_ptr, FOLD_ncol, FOLD_col[:nz].copy() diff --git a/src/sisl/io/siesta/_help.py b/src/sisl/io/siesta/_help.py index d7ac67585d..27f638b70d 100644 --- a/src/sisl/io/siesta/_help.py +++ b/src/sisl/io/siesta/_help.py @@ -142,6 +142,19 @@ def toc(D, re, im): if D.shape[-1] > 4: D[..., 4:] = csr._D[..., 8:].astype(dtype) csr._D = D + elif spin.is_nambu: + D = np.empty(shape[:-1] + (shape[-1] - 8,), dtype=dtype) + D[..., 0] = toc(csr._D, 0, 4) + D[..., 1] = toc(csr._D, 1, 5) + D[..., 2] = toc(csr._D, 2, 3) + D[..., 3] = toc(csr._D, 6, 7) + D[..., 4] = toc(csr._D, 8, 9) # S + D[..., 5] = toc(csr._D, 10, 11) # Tuu + D[..., 6] = toc(csr._D, 12, 13) # Tdd + D[..., 7] = toc(csr._D, 14, 15) # T0 + if D.shape[-1] > 8: + D[..., 8:] = csr._D[..., 16:].astype(dtype) + csr._D = D else: raise NotImplementedError else: @@ -179,6 +192,27 @@ def toc(D, re, im): if D.shape[-1] > 8: D[..., 8:] = csr._D[..., 4:].real.astype(dtype) csr._D = D + elif spin.is_nambu: + D = np.empty(shape[:-1] + (shape[-1] + 8,), dtype=dtype) + D[..., 0] = csr._D[..., 0].real.astype(dtype) + D[..., 1] = csr._D[..., 1].real.astype(dtype) + D[..., 2] = csr._D[..., 2].real.astype(dtype) + D[..., 3] = csr._D[..., 2].imag.astype(dtype) + D[..., 4] = csr._D[..., 0].imag.astype(dtype) + D[..., 5] = csr._D[..., 1].imag.astype(dtype) + D[..., 6] = csr._D[..., 3].real.astype(dtype) + D[..., 7] = csr._D[..., 3].imag.astype(dtype) + D[..., 8] = csr._D[..., 4].real.astype(dtype) # S + D[..., 9] = csr._D[..., 4].imag.astype(dtype) + D[..., 10] = csr._D[..., 5].real.astype(dtype) # Tuu + D[..., 11] = csr._D[..., 5].imag.astype(dtype) + D[..., 12] = csr._D[..., 6].real.astype(dtype) # Tdd + D[..., 13] = csr._D[..., 6].imag.astype(dtype) + D[..., 14] = csr._D[..., 7].real.astype(dtype) # T0 + D[..., 15] = csr._D[..., 7].imag.astype(dtype) + if D.shape[-1] > 16: + D[..., 16:] = csr._D[..., 8:].real.astype(dtype) + csr._D = D else: raise NotImplementedError else: @@ -237,12 +271,7 @@ def _mat_siesta2sisl(M, dtype: Optional[np.dtype] = None) -> None: spin = M.spin - if spin.is_noncolinear: - if np.dtype(M.dtype).kind in ("f", "i"): - M._csr._D[:, 3] = -M._csr._D[:, 3] - else: - M._csr._D[:, 2] = M._csr._D[:, 2].conj() - elif spin.is_spinorbit: + if spin.kind in (spin.NONCOLINEAR, spin.SPINORBIT, spin.NAMBU): if np.dtype(M.dtype).kind in ("f", "i"): M._csr._D[:, 3] = -M._csr._D[:, 3] else: @@ -261,12 +290,7 @@ def _mat_sisl2siesta(M, dtype: Optional[np.dtype] = None) -> None: spin = M.spin - if spin.is_noncolinear: - if np.dtype(M.dtype).kind in ("f", "i"): - M._csr._D[:, 3] = -M._csr._D[:, 3] - else: - M._csr._D[:, 2] = M._csr._D[:, 2].conj() - elif spin.is_spinorbit: + if spin.kind in (spin.NONCOLINEAR, spin.SPINORBIT, spin.NAMBU): if np.dtype(M.dtype).kind in ("f", "i"): M._csr._D[:, 3] = -M._csr._D[:, 3] else: diff --git a/src/sisl/physics/_matrix_ddk.pyx b/src/sisl/physics/_matrix_ddk.pyx index fc83ac76df..a3139e8194 100644 --- a/src/sisl/physics/_matrix_ddk.pyx +++ b/src/sisl/physics/_matrix_ddk.pyx @@ -11,7 +11,10 @@ from 
sisl._core._dtypes cimport floats_st from ._matrix_phase3 import * from ._phase import * -__all__ = ["matrix_ddk", "matrix_ddk_nc", "matrix_ddk_nc_diag", "matrix_ddk_so"] +__all__ = ["matrix_ddk", "matrix_ddk_nc", "matrix_ddk_diag", + "matrix_ddk_so", + "matrix_ddk_nambu" +] def _phase_ddk(gauge, M, sc, cnp.ndarray[floats_st] k, dtype): @@ -25,16 +28,7 @@ def _phase_ddk(gauge, M, sc, cnp.ndarray[floats_st] k, dtype): # two dependent variables # We always do the Voigt representation # Rd = dx^2, dy^2, dz^2, dzy, dxz, dyx - if gauge == 'cell': - phases = phase_rsc(sc, k, dtype).reshape(-1, 1) - Rs = np.dot(sc.sc_off, sc.cell) - Rd = - (Rs * Rs * phases).astype(dtype, copy=False) - Ro = - (np.roll(Rs, 1, axis=1) * phases).astype(dtype, copy=False) # z, x, y - Ro *= np.roll(Rs, -1, axis=1) # y, z, x - del phases, Rs - p_opt = 1 - - elif gauge == 'atom': + if gauge == 'atom': M.finalize() rij = M.Rij()._csr._D phases = phase_rij(rij, sc, k, dtype).reshape(-1, 1) @@ -44,6 +38,17 @@ def _phase_ddk(gauge, M, sc, cnp.ndarray[floats_st] k, dtype): del rij, phases p_opt = 0 + elif gauge == 'cell': + phases = phase_rsc(sc, k, dtype).reshape(-1, 1) + Rs = np.dot(sc.sc_off, sc.cell) + Rd = - (Rs * Rs * phases).astype(dtype, copy=False) + Ro = - (np.roll(Rs, 1, axis=1) * phases).astype(dtype, copy=False) # z, x, y + Ro *= np.roll(Rs, -1, axis=1) # y, z, x + del phases, Rs + p_opt = 1 + + assert p_opt >= 0, "Not implemented" + return p_opt, Rd, Ro @@ -101,7 +106,8 @@ def matrix_ddk_nc(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format): return dd -def matrix_ddk_nc_diag(gauge, M, const int idx, sc, cnp.ndarray[floats_st] k, dtype, format): +def matrix_ddk_diag(gauge, M, const int idx, const int per_row, + sc, cnp.ndarray[floats_st] k, dtype, format): dtype = phase_dtype(k, M.dtype, dtype, True) p_opt, Rd, Ro = _phase_ddk(gauge, M, sc, k, dtype) @@ -118,20 +124,32 @@ def matrix_ddk_nc_diag(gauge, M, const int idx, sc, cnp.ndarray[floats_st] k, dt csr = M._csr if format in ("array", "matrix", "dense"): - dxx = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rxx, p_opt) - dyy = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ryy, p_opt) - dzz = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rzz, p_opt) - dzy = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rzy, p_opt) - dxz = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rxz, p_opt) - dyx = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ryx, p_opt) + dxx = _phase_array_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rxx, p_opt, + per_row) + dyy = _phase_array_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ryy, p_opt, + per_row) + dzz = _phase_array_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rzz, p_opt, + per_row) + dzy = _phase_array_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rzy, p_opt, + per_row) + dxz = _phase_array_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rxz, p_opt, + per_row) + dyx = _phase_array_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ryx, p_opt, + per_row) else: - dxx = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rxx, p_opt).asformat(format) - dyy = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ryy, p_opt).asformat(format) - dzz = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rzz, p_opt).asformat(format) - dzy = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rzy, p_opt).asformat(format) - dxz = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rxz, p_opt).asformat(format) - dyx = 
_phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ryx, p_opt).asformat(format) + dxx = _phase_csr_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rxx, p_opt, + per_row).asformat(format) + dyy = _phase_csr_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ryy, p_opt, + per_row).asformat(format) + dzz = _phase_csr_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rzz, p_opt, + per_row).asformat(format) + dzy = _phase_csr_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rzy, p_opt, + per_row).asformat(format) + dxz = _phase_csr_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rxz, p_opt, + per_row).asformat(format) + dyx = _phase_csr_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ryx, p_opt, + per_row).asformat(format) return dxx, dyy, dzz, dzy, dxz, dyx @@ -161,3 +179,30 @@ def matrix_ddk_so(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format): dd[5] = dd[5].asformat(format) return dd + + +def matrix_ddk_nambu(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format): + dtype = phase_dtype(k, M.dtype, dtype, True) + p_opt, Rd, Ro = _phase_ddk(gauge, M, sc, k, dtype) + + # Return list + dd = [None, None, None, None, None, None] + + csr = M._csr + + if format in ("array", "matrix", "dense"): + dd[:3] = _phase3_array_nambu(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt) + dd[3:] = _phase3_array_nambu(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt) + + else: + # Default must be something else. + dd[:3] = _phase3_csr_nambu(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt) + dd[3:] = _phase3_csr_nambu(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt) + dd[0] = dd[0].asformat(format) + dd[1] = dd[1].asformat(format) + dd[2] = dd[2].asformat(format) + dd[3] = dd[3].asformat(format) + dd[4] = dd[4].asformat(format) + dd[5] = dd[5].asformat(format) + + return dd diff --git a/src/sisl/physics/_matrix_dk.pyx b/src/sisl/physics/_matrix_dk.pyx index 3a937f3495..10a35c6977 100644 --- a/src/sisl/physics/_matrix_dk.pyx +++ b/src/sisl/physics/_matrix_dk.pyx @@ -14,7 +14,8 @@ from ._matrix_phase import * from ._matrix_phase3 import * from ._phase import * -__all__ = ["matrix_dk", "matrik_dk_nc", "matrik_dk_nc_diag", "matrik_dk_so"] +__all__ = ["matrix_dk", "matrik_dk_nc", "matrik_dk_diag", "matrik_dk_so", + "matrix_dk_nambu"] def _phase_dk(gauge, M, sc, cnp.ndarray[floats_st] k, dtype): @@ -37,6 +38,9 @@ def _phase_dk(gauge, M, sc, cnp.ndarray[floats_st] k, dtype): iRs = (1j * np.dot(sc.sc_off, sc.cell) * iRs).astype(dtype, copy=False) p_opt = 1 + + assert p_opt >= 0, "Not implemented" + return p_opt, iRs @@ -68,7 +72,8 @@ def matrix_dk_nc(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format): return d1.asformat(format), d2.asformat(format), d3.asformat(format) -def matrix_dk_nc_diag(gauge, M, const ints_st idx, sc, cnp.ndarray[floats_st] k, dtype, format): +def matrix_dk_diag(gauge, M, const ints_st idx, const ints_st per_row, + sc, cnp.ndarray[floats_st] k, dtype, format): dtype = phase_dtype(k, M.dtype, dtype, True) p_opt, iRs = _phase_dk(gauge, M, sc, k, dtype) @@ -80,14 +85,17 @@ def matrix_dk_nc_diag(gauge, M, const ints_st idx, sc, cnp.ndarray[floats_st] k, csr = M._csr if format in ("array", "matrix", "dense"): - x = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phx, p_opt) - y = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phy, p_opt) - z = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phz, p_opt) + x = _phase_array_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phx, p_opt, + per_row) + y = _phase_array_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phy, p_opt, + 
per_row) + z = _phase_array_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phz, p_opt, + per_row) else: - x = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phx, p_opt).asformat(format) - y = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phy, p_opt).asformat(format) - z = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phz, p_opt).asformat(format) + x = _phase_csr_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phx, p_opt, per_row).asformat(format) + y = _phase_csr_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phy, p_opt, per_row).asformat(format) + z = _phase_csr_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phz, p_opt, per_row).asformat(format) return x, y, z @@ -104,3 +112,17 @@ def matrix_dk_so(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format): # Default must be something else. d1, d2, d3 = _phase3_csr_so(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt) return d1.asformat(format), d2.asformat(format), d3.asformat(format) + + +def matrix_dk_nambu(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format): + dtype = phase_dtype(k, M.dtype, dtype, True) + p_opt, iRs = _phase_dk(gauge, M, sc, k, dtype) + + csr = M._csr + + if format in ("array", "matrix", "dense"): + return _phase3_array_nambu(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt) + + # Default must be something else. + d1, d2, d3 = _phase3_csr_nambu(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt) + return d1.asformat(format), d2.asformat(format), d3.asformat(format) diff --git a/src/sisl/physics/_matrix_k.pyx b/src/sisl/physics/_matrix_k.pyx index eb8a78e14f..f004a8dec1 100644 --- a/src/sisl/physics/_matrix_k.pyx +++ b/src/sisl/physics/_matrix_k.pyx @@ -13,7 +13,8 @@ from ._matrix_phase_sc import * from ._phase import * from ._phase cimport is_gamma -__all__ = ["matrix_k", "matrix_k_nc", "matrix_k_so", "matrix_k_nc_diag"] +__all__ = ["matrix_k", "matrix_k_nc", "matrix_k_so", "matrix_k_diag", +"matrix_k_nambu"] def _phase_k(gauge, M, sc, cnp.ndarray[floats_st] K, dtype): @@ -97,7 +98,8 @@ def matrix_k_nc(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format): return _phase_csr_nc(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt).asformat(format) -def matrix_k_nc_diag(gauge, M, const ints_st idx, sc, cnp.ndarray[floats_st] k, dtype, format): +def matrix_k_diag(gauge, M, const ints_st idx, const ints_st per_row, + sc, cnp.ndarray[floats_st] k, dtype, format): dtype = phase_dtype(k, M.dtype, dtype, True) p_opt, phases = _phase_k(gauge, M, sc, k, dtype) @@ -111,14 +113,18 @@ def matrix_k_nc_diag(gauge, M, const ints_st idx, sc, cnp.ndarray[floats_st] k, nc = M.geometry.no_s if format in ("array", "matrix", "dense"): - return _phase_sc_array_nc_diag(csr.ptr, csr.ncol, csr.col, nc, csr._D, idx, phases, p_opt) + return _phase_sc_array_diag(csr.ptr, csr.ncol, csr.col, nc, csr._D, idx, + phases, p_opt, per_row) - return _phase_sc_csr_nc_diag(csr.ptr, csr.ncol, csr.col, nc, csr._D, idx, phases, p_opt).asformat(format) + return _phase_sc_csr_diag(csr.ptr, csr.ncol, csr.col, nc, csr._D, idx, phases, + p_opt, per_row).asformat(format) if format in ("array", "matrix", "dense"): - return _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt) + return _phase_array_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt, + per_row) - return _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt).asformat(format) + return _phase_csr_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt, + per_row).asformat(format) def matrix_k_so(gauge, M, sc, 
cnp.ndarray[floats_st] k, dtype, format): @@ -143,3 +149,30 @@ def matrix_k_so(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format): return _phase_array_so(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt) return _phase_csr_so(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt).asformat(format) + + +def matrix_k_nambu(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format): + dtype = phase_dtype(k, M.dtype, dtype, True) + p_opt, phases = _phase_k(gauge, M, sc, k, dtype) + + # TODO right now nambu does not have p_opt < 0 + assert p_opt >= 0, "Not implemented" + + csr = M._csr + + if format.startswith("sc:") or format == "sc": + if format == "sc": + format = "csr" + else: + format = format[3:] + nc = M.geometry.no_s + + if format in ("array", "matrix", "dense"): + return _phase_sc_array_nambu(csr.ptr, csr.ncol, csr.col, nc, csr._D, phases, p_opt) + + return _phase_sc_csr_nambu(csr.ptr, csr.ncol, csr.col, nc, csr._D, phases, p_opt).asformat(format) + + if format in ("array", "matrix", "dense"): + return _phase_array_nambu(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt) + + return _phase_csr_nambu(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt).asformat(format) diff --git a/src/sisl/physics/_matrix_phase.pyx b/src/sisl/physics/_matrix_phase.pyx index fba3e6d0cb..39e8632ad6 100644 --- a/src/sisl/physics/_matrix_phase.pyx +++ b/src/sisl/physics/_matrix_phase.pyx @@ -1,6 +1,7 @@ # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True cimport cython import numpy as np @@ -11,11 +12,7 @@ from scipy.sparse import csr_matrix from sisl._indices cimport _index_sorted -from sisl._core._sparse import ( - fold_csr_matrix, - fold_csr_matrix_nc, - fold_csr_matrix_nc_diag, -) +from sisl._core._sparse import fold_csr_matrix, fold_csr_matrix_diag from sisl._core._dtypes cimport ( complexs_st, @@ -29,8 +26,11 @@ from sisl._core._dtypes cimport ( ) from ._matrix_utils cimport ( + _f_matrix_box_nambu, _f_matrix_box_nc, _f_matrix_box_so, + _matrix_box_nambu_cmplx, + _matrix_box_nambu_real, _matrix_box_nc_cmplx, _matrix_box_nc_real, _matrix_box_so_cmplx, @@ -42,10 +42,12 @@ __all__ = [ "_phase_array", "_phase_csr_nc", "_phase_array_nc", - "_phase_csr_nc_diag", - "_phase_array_nc_diag", + "_phase_csr_diag", + "_phase_array_diag", "_phase_csr_so", "_phase_array_so", + "_phase_csr_nambu", + "_phase_array_nambu", ] """ @@ -66,10 +68,6 @@ p_opt == 1: """ -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase_csr(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -126,10 +124,6 @@ def _phase_csr(ints_st[::1] ptr, return csr_matrix((V, V_COL, V_PTR), shape=(nr, nr)) -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase_array(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -171,20 +165,17 @@ def _phase_array(ints_st[::1] ptr, return V -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) -def _phase_csr_nc_diag(ints_st[::1] ptr, - ints_st[::1] ncol, - ints_st[::1] col, - numerics_st[:, ::1] D, - const int idx, - complexs_st[::1] phases, - const int p_opt): +def _phase_csr_diag(ints_st[::1] ptr, + ints_st[::1] ncol, + ints_st[::1] col, + numerics_st[:, ::1] D, + const int idx, + 
complexs_st[::1] phases, + const int p_opt, + const int per_row): # Now create the folded sparse elements - V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc_diag(ptr, ncol, col) + V_PTR, V_NCOL, V_COL = fold_csr_matrix_diag(ptr, ncol, col, per_row) cdef ints_st[::1] v_ptr = V_PTR cdef ints_st[::1] v_ncol = V_NCOL cdef ints_st[::1] v_col = V_COL @@ -196,115 +187,108 @@ def _phase_csr_nc_diag(ints_st[::1] ptr, # Local columns cdef ints_st nr = ncol.shape[0] - cdef ints_st r, rr, ind, s, s_idx, c + cdef ints_st r, rr, ind, s, s_idx, c, ic cdef complexs_st d with nogil: if p_opt == -1: for r in range(nr): - rr = r * 2 + rr = r * per_row for ind in range(ptr[r], ptr[r] + ncol[r]): - c = (col[ind] % nr) * 2 + c = (col[ind] % nr) * per_row tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]] s_idx = _index_sorted(tmp, c) d = D[ind, idx] - v[v_ptr[rr] + s_idx] += d - v[v_ptr[rr+1] + s_idx] += d + for ic in range(per_row): + v[v_ptr[rr+ic] + s_idx] += d elif p_opt == 0: for r in range(nr): - rr = r * 2 + rr = r * per_row for ind in range(ptr[r], ptr[r] + ncol[r]): - c = (col[ind] % nr) * 2 + c = (col[ind] % nr) * per_row tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]] s_idx = _index_sorted(tmp, c) d = (phases[ind] * D[ind, idx]) - v[v_ptr[rr] + s_idx] += d - v[v_ptr[rr+1] + s_idx] += d + for ic in range(per_row): + v[v_ptr[rr+ic] + s_idx] += d else: for r in range(nr): - rr = r * 2 + rr = r * per_row for ind in range(ptr[r], ptr[r] + ncol[r]): - c = (col[ind] % nr) * 2 + c = (col[ind] % nr) * per_row s = col[ind] / nr tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]] s_idx = _index_sorted(tmp, c) d = (phases[s] * D[ind, idx]) - v[v_ptr[rr] + s_idx] += d - v[v_ptr[rr+1] + s_idx] += d + for ic in range(per_row): + v[v_ptr[rr+ic] + s_idx] += d - nr = nr * 2 + nr = nr * per_row return csr_matrix((V, V_COL, V_PTR), shape=(nr, nr)) -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) -def _phase_array_nc_diag(ints_st[::1] ptr, - ints_st[::1] ncol, - ints_st[::1] col, - numerics_st[:, ::1] D, - const int idx, - complexs_st[::1] phases, - const int p_opt): +def _phase_array_diag(ints_st[::1] ptr, + ints_st[::1] ncol, + ints_st[::1] col, + numerics_st[:, ::1] D, + const int idx, + complexs_st[::1] phases, + const int p_opt, + const int per_row): cdef ints_st[::1] tmp cdef ints_st nr = ncol.shape[0] cdef object dtype = type2dtype[complexs_st](1) - cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] V = np.zeros([nr * 2, nr * 2], dtype=dtype) + cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] V = np.zeros([nr * per_row, nr * per_row], dtype=dtype) cdef complexs_st[:, ::1] v = V # Local columns - cdef ints_st r, rr, ind, s, c + cdef ints_st r, rr, ind, s, c, ic cdef complexs_st d with nogil: if p_opt == -1: for r in range(nr): - rr = r * 2 + rr = r * per_row for ind in range(ptr[r], ptr[r] + ncol[r]): - c = (col[ind] % nr) * 2 + c = (col[ind] % nr) * per_row d = D[ind, idx] - v[rr, c] += d - v[rr + 1, c + 1] += d + for ic in range(per_row): + v[rr + ic, c + ic] += d elif p_opt == 0: for r in range(nr): - rr = r * 2 + rr = r * per_row for ind in range(ptr[r], ptr[r] + ncol[r]): - c = (col[ind] % nr) * 2 + c = (col[ind] % nr) * per_row d = (phases[ind] * D[ind, idx]) - v[rr, c] += d - v[rr + 1, c + 1] += d + for ic in range(per_row): + v[rr + ic, c + ic] += d else: for r in range(nr): - rr = r * 2 + rr = r * per_row for ind in range(ptr[r], ptr[r] + ncol[r]): - c = (col[ind] % nr) * 2 + c = (col[ind] % nr) * per_row s = col[ind] / nr d = (phases[s] * D[ind, idx]) - v[rr, c] += 
d - v[rr + 1, c + 1] += d + for ic in range(per_row): + v[rr + ic, c + ic] += d return V -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase_csr_nc(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -313,7 +297,7 @@ def _phase_csr_nc(ints_st[::1] ptr, const int p_opt): # Now create the folded sparse elements - V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(ptr, ncol, col) + V_PTR, V_NCOL, V_COL = fold_csr_matrix(ptr, ncol, col, 2) cdef ints_st[::1] v_ptr = V_PTR cdef ints_st[::1] v_ncol = V_NCOL cdef ints_st[::1] v_col = V_COL @@ -392,10 +376,6 @@ def _phase_csr_nc(ints_st[::1] ptr, return csr_matrix((V, V_COL, V_PTR), shape=(nr, nr)) -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase_array_nc(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -468,11 +448,6 @@ def _phase_array_nc(ints_st[::1] ptr, return V - -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase_csr_so(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -481,7 +456,7 @@ def _phase_csr_so(ints_st[::1] ptr, const int p_opt): # Now create the folded sparse elements - V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(ptr, ncol, col) + V_PTR, V_NCOL, V_COL = fold_csr_matrix(ptr, ncol, col, 2) cdef ints_st[::1] v_ptr = V_PTR cdef ints_st[::1] v_ncol = V_NCOL cdef ints_st[::1] v_col = V_COL @@ -559,10 +534,6 @@ def _phase_csr_so(ints_st[::1] ptr, return csr_matrix((V, V_COL, V_PTR), shape=(nr, nr)) -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase_array_so(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -631,3 +602,200 @@ def _phase_array_so(ints_st[::1] ptr, v[rr + 1, c + 1] += M[3] return V + + +def _phase_csr_nambu(ints_st[::1] ptr, + ints_st[::1] ncol, + ints_st[::1] col, + numerics_st[:, ::1] D, + complexs_st[::1] phases, + const int p_opt): + + # Now create the folded sparse elements + V_PTR, V_NCOL, V_COL = fold_csr_matrix(ptr, ncol, col, 4) + cdef ints_st[::1] v_ptr = V_PTR + cdef ints_st[::1] v_ncol = V_NCOL + cdef ints_st[::1] v_col = V_COL + cdef ints_st[::1] tmp + + cdef object dtype = type2dtype[complexs_st](1) + cdef cnp.ndarray[complexs_st, mode='c'] V = np.zeros([v_col.shape[0]], dtype=dtype) + cdef complexs_st[::1] v = V + + # Local columns + cdef ints_st nr = ncol.shape[0] + cdef ints_st r, rr, ind, s, s_idx, c + + cdef complexs_st ph + cdef _f_matrix_box_nambu func + cdef numerics_st *d + cdef complexs_st *M = [0, 0, 0, 0, 0, 0, 0, 0] + + if numerics_st in complexs_st: + func = _matrix_box_nambu_cmplx + else: + func = _matrix_box_nambu_real + + with nogil: + if p_opt == -1: + pass + + elif p_opt == 0: + for r in range(nr): + rr = r * 4 + for ind in range(ptr[r], ptr[r] + ncol[r]): + c = (col[ind] % nr) * 4 + ph = phases[ind] + + tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]] + s_idx = _index_sorted(tmp, c) + + d = &D[ind, 0] + func(d, ph, M) + v[v_ptr[rr] + s_idx] += M[0] + v[v_ptr[rr] + s_idx+1] += M[1] + v[v_ptr[rr+1] + s_idx] += M[2] + v[v_ptr[rr+1] + s_idx+1] += M[3] + # Delta + v[v_ptr[rr] + s_idx+2] += M[4] + v[v_ptr[rr] + s_idx+3] += M[5] + v[v_ptr[rr+1] + s_idx+2] += M[6] + v[v_ptr[rr+1] + s_idx+3] += M[7] + # Delta^dagger + v[v_ptr[rr+2] + s_idx] += M[4].conjugate() + v[v_ptr[rr+2] + s_idx+1] += M[6].conjugate() + v[v_ptr[rr+3] + s_idx] += M[5].conjugate() + v[v_ptr[rr+3] + s_idx+1] += M[7].conjugate() 
+ # -H^* + v[v_ptr[rr+2] + s_idx+2] += -M[0].conjugate() + v[v_ptr[rr+2] + s_idx+3] += -M[1].conjugate() + v[v_ptr[rr+3] + s_idx+2] += -M[2].conjugate() + v[v_ptr[rr+3] + s_idx+3] += -M[3].conjugate() + + else: + for r in range(nr): + rr = r * 4 + for ind in range(ptr[r], ptr[r] + ncol[r]): + c = (col[ind] % nr) * 4 + s = col[ind] / nr + ph = phases[s] + + tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]] + s_idx = _index_sorted(tmp, c) + + d = &D[ind, 0] + func(d, ph, M) + v[v_ptr[rr] + s_idx] += M[0] + v[v_ptr[rr] + s_idx+1] += M[1] + v[v_ptr[rr+1] + s_idx] += M[2] + v[v_ptr[rr+1] + s_idx+1] += M[3] + # Delta + v[v_ptr[rr] + s_idx+2] += M[4] + v[v_ptr[rr] + s_idx+3] += M[5] + v[v_ptr[rr+1] + s_idx+2] += M[6] + v[v_ptr[rr+1] + s_idx+3] += M[7] + # Delta^dagger + v[v_ptr[rr+2] + s_idx] += M[4].conjugate() + v[v_ptr[rr+2] + s_idx+1] += M[6].conjugate() + v[v_ptr[rr+3] + s_idx] += M[5].conjugate() + v[v_ptr[rr+3] + s_idx+1] += M[7].conjugate() + # -H^* + v[v_ptr[rr+2] + s_idx+2] += -M[0].conjugate() + v[v_ptr[rr+2] + s_idx+3] += -M[1].conjugate() + v[v_ptr[rr+3] + s_idx+2] += -M[2].conjugate() + v[v_ptr[rr+3] + s_idx+3] += -M[3].conjugate() + + nr = nr * 4 + return csr_matrix((V, V_COL, V_PTR), shape=(nr, nr)) + + +def _phase_array_nambu(ints_st[::1] ptr, + ints_st[::1] ncol, + ints_st[::1] col, + numerics_st[:, ::1] D, + complexs_st[::1] phases, + const int p_opt): + + cdef ints_st nr = ncol.shape[0] + + cdef object dtype = type2dtype[complexs_st](1) + cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] V = np.zeros([nr * 4, nr * 4], dtype=dtype) + cdef complexs_st[:, ::1] v = V + + # Local columns + cdef ints_st r, rr, s, c, ind + + cdef complexs_st ph + cdef _f_matrix_box_nambu func + cdef numerics_st *d + cdef complexs_st *M = [0, 0, 0, 0, 0, 0, 0, 0] + + if numerics_st in complexs_st: + func = _matrix_box_nambu_cmplx + else: + func = _matrix_box_nambu_real + + with nogil: + if p_opt == -1: + pass + + elif p_opt == 0: + for r in range(nr): + rr = r * 4 + for ind in range(ptr[r], ptr[r] + ncol[r]): + c = (col[ind] % nr) * 4 + ph = phases[ind] + + d = &D[ind, 0] + func(d, ph, M) + v[rr, c] += M[0] + v[rr, c+1] += M[1] + v[rr+1, c] += M[2] + v[rr+1, c+1] += M[3] + # Delta + v[rr, c+2] += M[4] + v[rr, c+3] += M[5] + v[rr+1, c+2] += M[6] + v[rr+1, c+3] += M[7] + # Delta^dagger + v[rr+2, c] += M[4].conjugate() + v[rr+2, c+1] += M[6].conjugate() + v[rr+3, c] += M[5].conjugate() + v[rr+3, c+1] += M[7].conjugate() + # -H^* + v[rr+2, c+2] += -M[0].conjugate() + v[rr+2, c+3] += -M[1].conjugate() + v[rr+3, c+2] += -M[2].conjugate() + v[rr+3, c+3] += -M[3].conjugate() + + else: + for r in range(nr): + rr = r * 4 + for ind in range(ptr[r], ptr[r] + ncol[r]): + c = (col[ind] % nr) * 4 + s = col[ind] / nr + ph = phases[s] + + d = &D[ind, 0] + func(d, ph, M) + v[rr, c] += M[0] + v[rr, c+1] += M[1] + v[rr+1, c] += M[2] + v[rr+1, c+1] += M[3] + # Delta + v[rr, c+2] += M[4] + v[rr, c+3] += M[5] + v[rr+1, c+2] += M[6] + v[rr+1, c+3] += M[7] + # Delta^dagger + v[rr+2, c] += M[4].conjugate() + v[rr+2, c+1] += M[6].conjugate() + v[rr+3, c] += M[5].conjugate() + v[rr+3, c+1] += M[7].conjugate() + # -H^* + v[rr+2, c+2] += -M[0].conjugate() + v[rr+2, c+3] += -M[1].conjugate() + v[rr+3, c+2] += -M[2].conjugate() + v[rr+3, c+3] += -M[3].conjugate() + + return V diff --git a/src/sisl/physics/_matrix_phase3.pyx b/src/sisl/physics/_matrix_phase3.pyx index 220a8570b9..517f862188 100644 --- a/src/sisl/physics/_matrix_phase3.pyx +++ b/src/sisl/physics/_matrix_phase3.pyx @@ -1,6 +1,7 @@ # This Source Code Form is subject to the 
terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True cimport cython import numpy as np @@ -11,7 +12,7 @@ from scipy.sparse import csr_matrix from sisl._indices cimport _index_sorted -from sisl._core._sparse import fold_csr_matrix, fold_csr_matrix_nc +from sisl._core._sparse import fold_csr_matrix from sisl._core._dtypes cimport ( complexs_st, @@ -24,8 +25,11 @@ from sisl._core._dtypes cimport ( ) from ._matrix_utils cimport ( + _f_matrix_box_nambu, _f_matrix_box_nc, _f_matrix_box_so, + _matrix_box_nambu_cmplx, + _matrix_box_nambu_real, _matrix_box_nc_cmplx, _matrix_box_nc_real, _matrix_box_so_cmplx, @@ -39,13 +43,11 @@ __all__ = [ "_phase3_array_nc", "_phase3_csr_so", "_phase3_array_so", + "_phase3_csr_nambu", + "_phase3_array_nambu", ] -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase3_csr(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -99,10 +101,6 @@ def _phase3_csr(ints_st[::1] ptr, -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase3_array(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -149,10 +147,6 @@ def _phase3_array(ints_st[::1] ptr, # Non-collinear code ### -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase3_csr_nc(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -161,7 +155,7 @@ def _phase3_csr_nc(ints_st[::1] ptr, const int p_opt): # Now create the folded sparse elements - V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(ptr, ncol, col) + V_PTR, V_NCOL, V_COL = fold_csr_matrix(ptr, ncol, col, 2) cdef ints_st[::1] v_ptr = V_PTR cdef ints_st[::1] v_ncol = V_NCOL cdef ints_st[::1] v_col = V_COL @@ -252,10 +246,6 @@ def _phase3_csr_nc(ints_st[::1] ptr, return csr_matrix((Vx, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vy, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vz, V_COL, V_PTR), shape=(nr, nr)) -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase3_array_nc(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -350,10 +340,6 @@ def _phase3_array_nc(ints_st[::1] ptr, # Spin-orbit coupling matrices ### -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase3_csr_so(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -362,7 +348,7 @@ def _phase3_csr_so(ints_st[::1] ptr, const int p_opt): # Now create the folded sparse elements - V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(ptr, ncol, col) + V_PTR, V_NCOL, V_COL = fold_csr_matrix(ptr, ncol, col, 2) cdef ints_st[::1] v_ptr = V_PTR cdef ints_st[::1] v_ncol = V_NCOL cdef ints_st[::1] v_col = V_COL @@ -453,10 +439,6 @@ def _phase3_csr_so(ints_st[::1] ptr, return csr_matrix((Vx, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vy, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vz, V_COL, V_PTR), shape=(nr, nr)) -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase3_array_so(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -547,3 +529,302 @@ def _phase3_array_so(ints_st[::1] ptr, vz[rr+1, c+1] += M[3] return Vx, Vy, Vz + + +def _phase3_csr_nambu(ints_st[::1] ptr, + ints_st[::1] ncol, + 
ints_st[::1] col, + numerics_st[:, ::1] D, + complexs_st[:, ::1] phases, + const int p_opt): + + # Now create the folded sparse elements + V_PTR, V_NCOL, V_COL = fold_csr_matrix(ptr, ncol, col, 4) + cdef ints_st[::1] v_ptr = V_PTR + cdef ints_st[::1] v_ncol = V_NCOL + cdef ints_st[::1] v_col = V_COL + + cdef object dtype = type2dtype[complexs_st](1) + cdef cnp.ndarray[complexs_st, mode='c'] Vx = np.zeros([v_col.shape[0]], dtype=dtype) + cdef cnp.ndarray[complexs_st, mode='c'] Vy = np.zeros([v_col.shape[0]], dtype=dtype) + cdef cnp.ndarray[complexs_st, mode='c'] Vz = np.zeros([v_col.shape[0]], dtype=dtype) + cdef complexs_st ph + + # Local columns (not in NC form) + cdef ints_st nr = ncol.shape[0] + cdef ints_st r, rr, ind, s, c + cdef ints_st s_idx + cdef _f_matrix_box_nambu func + cdef numerics_st *d + cdef complexs_st *M = [0, 0, 0, 0, 0, 0, 0, 0] + + if numerics_st in complexs_st: + func = _matrix_box_nambu_cmplx + else: + func = _matrix_box_nambu_real + + with nogil: + if p_opt == 0: + pass + + else: + for r in range(nr): + rr = r * 4 + for ind in range(ptr[r], ptr[r] + ncol[r]): + c = (col[ind] % nr) * 4 + s = col[ind] / nr + + s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c) + + d = &D[ind, 0] + + ph = phases[s, 0] + func(d, ph, M) + Vx[v_ptr[rr] + s_idx] += M[0] + Vx[v_ptr[rr] + s_idx+1] += M[1] + Vx[v_ptr[rr+1] + s_idx] += M[2] + Vx[v_ptr[rr+1] + s_idx+1] += M[3] + # Delta + Vx[v_ptr[rr] + s_idx+2] += M[4] + Vx[v_ptr[rr] + s_idx+3] += M[5] + Vx[v_ptr[rr+1] + s_idx+2] += M[6] + Vx[v_ptr[rr+1] + s_idx+3] += M[7] + # Delta^dagger + Vx[v_ptr[rr+2] + s_idx] += M[4].conjugate() + Vx[v_ptr[rr+2] + s_idx+1] += M[6].conjugate() + Vx[v_ptr[rr+3] + s_idx] += M[5].conjugate() + Vx[v_ptr[rr+3] + s_idx+1] += M[7].conjugate() + # -H^* + Vx[v_ptr[rr+2] + s_idx+2] += -M[0].conjugate() + Vx[v_ptr[rr+2] + s_idx+3] += -M[1].conjugate() + Vx[v_ptr[rr+3] + s_idx+2] += -M[2].conjugate() + Vx[v_ptr[rr+3] + s_idx+3] += -M[3].conjugate() + + ph = phases[s, 1] + func(d, ph, M) + Vy[v_ptr[rr] + s_idx] += M[0] + Vy[v_ptr[rr] + s_idx+1] += M[1] + Vy[v_ptr[rr+1] + s_idx] += M[2] + Vy[v_ptr[rr+1] + s_idx+1] += M[3] + # Delta + Vy[v_ptr[rr] + s_idx+2] += M[4] + Vy[v_ptr[rr] + s_idx+3] += M[5] + Vy[v_ptr[rr+1] + s_idx+2] += M[6] + Vy[v_ptr[rr+1] + s_idx+3] += M[7] + # Delta^dagger + Vy[v_ptr[rr+2] + s_idx] += M[4].conjugate() + Vy[v_ptr[rr+2] + s_idx+1] += M[6].conjugate() + Vy[v_ptr[rr+3] + s_idx] += M[5].conjugate() + Vy[v_ptr[rr+3] + s_idx+1] += M[7].conjugate() + # -H^* + Vy[v_ptr[rr+2] + s_idx+2] += -M[0].conjugate() + Vy[v_ptr[rr+2] + s_idx+3] += -M[1].conjugate() + Vy[v_ptr[rr+3] + s_idx+2] += -M[2].conjugate() + Vy[v_ptr[rr+3] + s_idx+3] += -M[3].conjugate() + + ph = phases[s, 2] + func(d, ph, M) + Vz[v_ptr[rr] + s_idx] += M[0] + Vz[v_ptr[rr] + s_idx+1] += M[1] + Vz[v_ptr[rr+1] + s_idx] += M[2] + Vz[v_ptr[rr+1] + s_idx+1] += M[3] + # Delta + Vz[v_ptr[rr] + s_idx+2] += M[4] + Vz[v_ptr[rr] + s_idx+3] += M[5] + Vz[v_ptr[rr+1] + s_idx+2] += M[6] + Vz[v_ptr[rr+1] + s_idx+3] += M[7] + # Delta^dagger + Vz[v_ptr[rr+2] + s_idx] += M[4].conjugate() + Vz[v_ptr[rr+2] + s_idx+1] += M[6].conjugate() + Vz[v_ptr[rr+3] + s_idx] += M[5].conjugate() + Vz[v_ptr[rr+3] + s_idx+1] += M[7].conjugate() + # -H^* + Vz[v_ptr[rr+2] + s_idx+2] += -M[0].conjugate() + Vz[v_ptr[rr+2] + s_idx+3] += -M[1].conjugate() + Vz[v_ptr[rr+3] + s_idx+2] += -M[2].conjugate() + Vz[v_ptr[rr+3] + s_idx+3] += -M[3].conjugate() + + nr = nr * 4 + return csr_matrix((Vx, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vy, V_COL, V_PTR), 
shape=(nr, nr)), csr_matrix((Vz, V_COL, V_PTR), shape=(nr, nr)) + + +def _phase3_array_nambu(ints_st[::1] ptr, + ints_st[::1] ncol, + ints_st[::1] col, + numerics_st[:, ::1] D, + complexs_st[:, ::1] phases, + const int p_opt): + + cdef ints_st nr = ncol.shape[0] + + cdef object dtype = type2dtype[complexs_st](1) + cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] Vx = np.zeros([nr * 4, nr * 4], dtype=dtype) + cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] Vy = np.zeros([nr * 4, nr * 4], dtype=dtype) + cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] Vz = np.zeros([nr * 4, nr * 4], dtype=dtype) + cdef complexs_st[:, ::1] vx = Vx + cdef complexs_st[:, ::1] vy = Vy + cdef complexs_st[:, ::1] vz = Vz + + cdef complexs_st ph + cdef ints_st r, rr, ind, s, c + cdef ints_st s_idx + cdef _f_matrix_box_nambu func + cdef numerics_st *d + cdef complexs_st *M = [0, 0, 0, 0, 0, 0, 0, 0] + + if numerics_st in complexs_st: + func = _matrix_box_nambu_cmplx + else: + func = _matrix_box_nambu_real + + with nogil: + if p_opt == 0: + for r in range(nr): + rr = r * 4 + for ind in range(ptr[r], ptr[r] + ncol[r]): + c = (col[ind] % nr) * 4 + + d = &D[ind, 0] + + ph = phases[ind, 0] + func(d, ph, M) + vx[rr, c] += M[0] + vx[rr, c+1] += M[1] + vx[rr+1, c] += M[2] + vx[rr+1, c+1] += M[3] + # Delta + vx[rr, c+2] += M[4] + vx[rr, c+3] += M[5] + vx[rr+1, c+2] += M[6] + vx[rr+1, c+3] += M[7] + # Delta^dagger + vx[rr+2, c] += M[4].conjugate() + vx[rr+2, c+1] += M[6].conjugate() + vx[rr+3, c] += M[5].conjugate() + vx[rr+3, c+1] += M[7].conjugate() + # -H^* + vx[rr+2, c+2] += -M[0].conjugate() + vx[rr+2, c+3] += -M[1].conjugate() + vx[rr+3, c+2] += -M[2].conjugate() + vx[rr+3, c+3] += -M[3].conjugate() + + ph = phases[ind, 1] + func(d, ph, M) + vy[rr, c] += M[0] + vy[rr, c+1] += M[1] + vy[rr+1, c] += M[2] + vy[rr+1, c+1] += M[3] + # Delta + vy[rr, c+2] += M[4] + vy[rr, c+3] += M[5] + vy[rr+1, c+2] += M[6] + vy[rr+1, c+3] += M[7] + # Delta^dagger + vy[rr+2, c] += M[4].conjugate() + vy[rr+2, c+1] += M[6].conjugate() + vy[rr+3, c] += M[5].conjugate() + vy[rr+3, c+1] += M[7].conjugate() + # -H^* + vy[rr+2, c+2] += -M[0].conjugate() + vy[rr+2, c+3] += -M[1].conjugate() + vy[rr+3, c+2] += -M[2].conjugate() + vy[rr+3, c+3] += -M[3].conjugate() + + ph = phases[ind, 2] + func(d, ph, M) + vz[rr, c] += M[0] + vz[rr, c+1] += M[1] + vz[rr+1, c] += M[2] + vz[rr+1, c+1] += M[3] + # Delta + vz[rr, c+2] += M[4] + vz[rr, c+3] += M[5] + vz[rr+1, c+2] += M[6] + vz[rr+1, c+3] += M[7] + # Delta^dagger + vz[rr+2, c] += M[4].conjugate() + vz[rr+2, c+1] += M[6].conjugate() + vz[rr+3, c] += M[5].conjugate() + vz[rr+3, c+1] += M[7].conjugate() + # -H^* + vz[rr+2, c+2] += -M[0].conjugate() + vz[rr+2, c+3] += -M[1].conjugate() + vz[rr+3, c+2] += -M[2].conjugate() + vz[rr+3, c+3] += -M[3].conjugate() + + else: + for r in range(nr): + rr = r * 4 + for ind in range(ptr[r], ptr[r] + ncol[r]): + c = (col[ind] % nr) * 4 + s = col[ind] / nr + + d = &D[ind, 0] + + ph = phases[s, 0] + func(d, ph, M) + vx[rr, c] += M[0] + vx[rr, c+1] += M[1] + vx[rr+1, c] += M[2] + vx[rr+1, c+1] += M[3] + # Delta + vx[rr, c+2] += M[4] + vx[rr, c+3] += M[5] + vx[rr+1, c+2] += M[6] + vx[rr+1, c+3] += M[7] + # Delta^dagger + vx[rr+2, c] += M[4].conjugate() + vx[rr+2, c+1] += M[6].conjugate() + vx[rr+3, c] += M[5].conjugate() + vx[rr+3, c+1] += M[7].conjugate() + # -H^* + vx[rr+2, c+2] += -M[0].conjugate() + vx[rr+2, c+3] += -M[1].conjugate() + vx[rr+3, c+2] += -M[2].conjugate() + vx[rr+3, c+3] += -M[3].conjugate() + + ph = phases[s, 1] + func(d, ph, M) + vy[rr, c] += M[0] + vy[rr, 
c+1] += M[1] + vy[rr+1, c] += M[2] + vy[rr+1, c+1] += M[3] + # Delta + vy[rr, c+2] += M[4] + vy[rr, c+3] += M[5] + vy[rr+1, c+2] += M[6] + vy[rr+1, c+3] += M[7] + # Delta^dagger + vy[rr+2, c] += M[4].conjugate() + vy[rr+2, c+1] += M[6].conjugate() + vy[rr+3, c] += M[5].conjugate() + vy[rr+3, c+1] += M[7].conjugate() + # -H^* + vy[rr+2, c+2] += -M[0].conjugate() + vy[rr+2, c+3] += -M[1].conjugate() + vy[rr+3, c+2] += -M[2].conjugate() + vy[rr+3, c+3] += -M[3].conjugate() + + ph = phases[s, 2] + func(d, ph, M) + vz[rr, c] += M[0] + vz[rr, c+1] += M[1] + vz[rr+1, c] += M[2] + vz[rr+1, c+1] += M[3] + # Delta + vz[rr, c+2] += M[4] + vz[rr, c+3] += M[5] + vz[rr+1, c+2] += M[6] + vz[rr+1, c+3] += M[7] + # Delta^dagger + vz[rr+2, c] += M[4].conjugate() + vz[rr+2, c+1] += M[6].conjugate() + vz[rr+3, c] += M[5].conjugate() + vz[rr+3, c+1] += M[7].conjugate() + # -H^* + vz[rr+2, c+2] += -M[0].conjugate() + vz[rr+2, c+3] += -M[1].conjugate() + vz[rr+3, c+2] += -M[2].conjugate() + vz[rr+3, c+3] += -M[3].conjugate() + + return Vx, Vy, Vz diff --git a/src/sisl/physics/_matrix_phase_sc.pyx b/src/sisl/physics/_matrix_phase_sc.pyx index ad9ede2faf..a2cd0e47c0 100644 --- a/src/sisl/physics/_matrix_phase_sc.pyx +++ b/src/sisl/physics/_matrix_phase_sc.pyx @@ -1,6 +1,7 @@ # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True cimport cython import numpy as np @@ -19,12 +20,15 @@ from sisl._core._dtypes cimport ( ssize_st, type2dtype, ) -from sisl._core._sparse cimport ncol2ptr_nc +from sisl._core._sparse cimport ncol2ptr from sisl._indices cimport _index_sorted from ._matrix_utils cimport ( + _f_matrix_box_nambu, _f_matrix_box_nc, _f_matrix_box_so, + _matrix_box_nambu_cmplx, + _matrix_box_nambu_real, _matrix_box_nc_cmplx, _matrix_box_nc_real, _matrix_box_so_cmplx, @@ -36,16 +40,15 @@ __all__ = [ "_phase_sc_array", "_phase_sc_csr_nc", "_phase_sc_array_nc", - "_phase_sc_csr_nc_diag", - "_phase_sc_array_nc_diag", + "_phase_sc_csr_diag", + "_phase_sc_array_diag", "_phase_sc_csr_so", "_phase_sc_array_so", + "_phase_sc_csr_nambu", + "_phase_sc_array_nambu", ] -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) + def _phase_sc_csr(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -111,10 +114,6 @@ def _phase_sc_csr(ints_st[::1] ptr, return csr_matrix((V, V_COL, V_PTR), shape=(nr, nc)) -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase_sc_array(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -154,10 +153,6 @@ def _phase_sc_array(ints_st[::1] ptr, return V -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase_sc_csr_nc(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -193,7 +188,7 @@ def _phase_sc_csr_nc(ints_st[::1] ptr, func = _matrix_box_nc_real # We have to do it manually due to the double elements per matrix element - ncol2ptr_nc(nr, ncol, v_ptr, 2) + ncol2ptr(nr, ncol, v_ptr, 2, 2) with nogil: if p_opt == -1: @@ -270,10 +265,6 @@ def _phase_sc_csr_nc(ints_st[::1] ptr, return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nc * 2)) -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def 
_phase_sc_array_nc(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -341,25 +332,22 @@ def _phase_sc_array_nc(ints_st[::1] ptr, return V -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) -def _phase_sc_csr_nc_diag(ints_st[::1] ptr, - ints_st[::1] ncol, - ints_st[::1] col, - const ints_st nc, - numerics_st[:, ::1] D, - const int idx, - complexs_st[::1] phases, - const int p_opt): +def _phase_sc_csr_diag(ints_st[::1] ptr, + ints_st[::1] ncol, + ints_st[::1] col, + const ints_st nc, + numerics_st[:, ::1] D, + const int idx, + complexs_st[::1] phases, + const int p_opt, + const int per_row): # Now copy the sparse matrix form cdef ints_st nr = ncol.shape[0] cdef object idtype = type2dtype[ints_st](1) - cdef cnp.ndarray[ints_st, mode='c'] V_PTR = np.empty([nr*2 + 1], dtype=idtype) - cdef cnp.ndarray[ints_st, mode='c'] V_NCOL = np.empty([nr*2], dtype=idtype) - cdef cnp.ndarray[ints_st, mode='c'] V_COL = np.empty([inline_sum(ncol)*2], dtype=idtype) + cdef cnp.ndarray[ints_st, mode='c'] V_PTR = np.empty([nr*per_row + 1], dtype=idtype) + cdef cnp.ndarray[ints_st, mode='c'] V_NCOL = np.empty([nr*per_row], dtype=idtype) + cdef cnp.ndarray[ints_st, mode='c'] V_COL = np.empty([inline_sum(ncol)*per_row], dtype=idtype) cdef ints_st[::1] v_ptr = V_PTR cdef ints_st[::1] v_ncol = V_NCOL @@ -369,128 +357,119 @@ def _phase_sc_csr_nc_diag(ints_st[::1] ptr, cdef cnp.ndarray[complexs_st, mode='c'] V = np.zeros([v_col.shape[0]], dtype=dtype) cdef complexs_st[::1] v = V - cdef ints_st r, rr, cind, c, nz, ind + cdef ints_st r, rr, ir, cind, c, nz, ind, ic cdef complexs_st ph - # We have to do it manually due to the double elements per matrix element - ncol2ptr_nc(nr, ncol, v_ptr, 1) + # We have to do it manually due to the double elements per row, but only + # one per column + ncol2ptr(nr, ncol, v_ptr, per_row, 1) with nogil: if p_opt == -1: for r in range(nr): - rr = r * 2 - v_ncol[rr] = ncol[r] - v_ncol[rr+1] = ncol[r] + rr = r * per_row + for ir in range(per_row): + v_ncol[rr+ir] = ncol[r] cind = 0 for ind in range(ptr[r], ptr[r] + ncol[r]): - c = col[ind] * 2 + c = col[ind] * per_row - v[v_ptr[rr] + cind] = D[ind, idx] - v_col[v_ptr[rr] + cind] = c - v[v_ptr[rr+1] + cind] = D[ind, idx] - v_col[v_ptr[rr+1] + cind] = c + 1 + for ic in range(per_row): + v[v_ptr[rr+ic] + cind] = D[ind, idx] + v_col[v_ptr[rr+ic] + cind] = c + ic cind = cind + 1 elif p_opt == 0: for r in range(nr): - rr = r * 2 - v_ncol[rr] = ncol[r] * 2 - v_ncol[rr+1] = ncol[r] * 2 + rr = r * per_row + for ir in range(per_row): + v_ncol[rr+ir] = ncol[r] cind = 0 for ind in range(ptr[r], ptr[r] + ncol[r]): - c = col[ind] * 2 + c = col[ind] * per_row ph = phases[ind] - v[v_ptr[rr] + cind] = (D[ind, idx] * ph) - v_col[v_ptr[rr] + cind] = c - v[v_ptr[rr+1] + cind] = (D[ind, idx] * ph) - v_col[v_ptr[rr+1] + cind] = c + 1 + for ic in range(per_row): + v[v_ptr[rr+ic] + cind] = (D[ind, idx] * ph) + v_col[v_ptr[rr+ic] + cind] = c + ic cind = cind + 1 else: for r in range(nr): - rr = r * 2 - v_ncol[rr] = ncol[r] * 2 - v_ncol[rr+1] = ncol[r] * 2 + rr = r * per_row + for ir in range(per_row): + v_ncol[rr+ir] = ncol[r] cind = 0 for ind in range(ptr[r], ptr[r] + ncol[r]): - c = col[ind] * 2 + c = col[ind] * per_row ph = phases[col[ind] / nr] - - v[v_ptr[rr] + cind] = (D[ind, idx] * ph) - v_col[v_ptr[rr] + cind] = c - v[v_ptr[rr+1] + cind] = (D[ind, idx] * ph) - v_col[v_ptr[rr+1] + cind] = c + 1 + for ic in range(per_row): + v[v_ptr[rr+ic] + cind] = (D[ind, idx] * ph) + v_col[v_ptr[rr+ic] + 
cind] = c + ic cind = cind + 1 - return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nc * 2)) + return csr_matrix((V, V_COL, V_PTR), shape=(nr * per_row, nc * per_row)) -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) -def _phase_sc_array_nc_diag(ints_st[::1] ptr, - ints_st[::1] ncol, - ints_st[::1] col, - const ints_st nc, - numerics_st[:, ::1] D, - const int idx, - complexs_st[::1] phases, - const int p_opt): +def _phase_sc_array_diag(ints_st[::1] ptr, + ints_st[::1] ncol, + ints_st[::1] col, + const ints_st nc, + numerics_st[:, ::1] D, + const int idx, + complexs_st[::1] phases, + const int p_opt, + const int per_row): cdef ints_st nr = ncol.shape[0] cdef object dtype = type2dtype[complexs_st](1) - cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] V = np.zeros([nr*2, nc*2], dtype=dtype) + cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] V = np.zeros([nr*per_row, nc*per_row], dtype=dtype) cdef complexs_st[:, ::1] v = V cdef complexs_st d - cdef ints_st r, rr, c, nz, ind + cdef ints_st r, rr, c, nz, ind, ic with nogil: if p_opt == -1: for r in range(nr): - rr = r * 2 + rr = r * per_row for ind in range(ptr[r], ptr[r] + ncol[r]): - c = col[ind] * 2 + c = col[ind] * per_row d = D[ind, idx] - v[rr, c] = d - v[rr+1, c+1] = d + for ic in range(per_row): + v[rr+ic, c+ic] = d elif p_opt == 0: for r in range(nr): - rr = r * 2 + rr = r * per_row for ind in range(ptr[r], ptr[r] + ncol[r]): - c = col[ind] * 2 + c = col[ind] * per_row d = (D[ind, idx] * phases[ind]) - v[rr, c] = d - v[rr+1, c+1] = d + for ic in range(per_row): + v[rr+ic, c+ic] = d else: for r in range(nr): - rr = r * 2 + rr = r * per_row for ind in range(ptr[r], ptr[r] + ncol[r]): - c = col[ind] * 2 + c = col[ind] * per_row d = (D[ind, idx] * phases[col[ind] / nr]) - v[rr, c] = d - v[rr+1, c+1] = d + for ic in range(per_row): + v[rr+ic, c+ic] = d return V -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) + def _phase_sc_csr_so(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -526,7 +505,7 @@ def _phase_sc_csr_so(ints_st[::1] ptr, func = _matrix_box_so_real # We have to do it manually due to the double elements per matrix element - ncol2ptr_nc(nr, ncol, v_ptr, 2) + ncol2ptr(nr, ncol, v_ptr, 2, 2) with nogil: if p_opt == -1: @@ -603,10 +582,6 @@ def _phase_sc_csr_so(ints_st[::1] ptr, return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nc * 2)) -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.initializedcheck(False) -@cython.cdivision(True) def _phase_sc_array_so(ints_st[::1] ptr, ints_st[::1] ncol, ints_st[::1] col, @@ -673,3 +648,245 @@ def _phase_sc_array_so(ints_st[::1] ptr, v[rr+1, c+1] = M[3] return V + + +def _phase_sc_csr_nambu(ints_st[::1] ptr, + ints_st[::1] ncol, + ints_st[::1] col, + const ints_st nc, + numerics_st[:, ::1] D, + complexs_st[::1] phases, + const int p_opt): + + # Now copy the sparse matrix form + cdef ints_st nr = ncol.shape[0] + cdef object idtype = type2dtype[ints_st](1) + cdef cnp.ndarray[ints_st, mode='c'] V_PTR = np.empty([nr*4 + 1], dtype=idtype) + cdef cnp.ndarray[ints_st, mode='c'] V_NCOL = np.empty([nr*4], dtype=idtype) + cdef cnp.ndarray[ints_st, mode='c'] V_COL = np.empty([inline_sum(ncol)*16], dtype=idtype) + + cdef ints_st[::1] v_ptr = V_PTR + cdef ints_st[::1] v_ncol = V_NCOL + cdef ints_st[::1] v_col = V_COL + + cdef object dtype = type2dtype[complexs_st](1) + cdef cnp.ndarray[complexs_st, mode='c'] V = np.zeros([v_col.shape[0]], dtype=dtype) + cdef 
complexs_st[::1] v = V + + cdef ints_st r, rr, cind, c, nz, ind + cdef complexs_st ph + cdef _f_matrix_box_nambu func + cdef numerics_st *d + cdef complexs_st *M = [0, 0, 0, 0, 0, 0, 0, 0] + + if numerics_st in complexs_st: + func = _matrix_box_nambu_cmplx + else: + func = _matrix_box_nambu_real + + # We have to do it manually due to the quadrouble elements per matrix element + ncol2ptr(nr, ncol, v_ptr, 4, 4) + + with nogil: + if p_opt == -1: + pass + + elif p_opt == 0: + for r in range(nr): + rr = r * 4 + v_ncol[rr] = ncol[r] * 4 + v_ncol[rr+1] = ncol[r] * 4 + v_ncol[rr+2] = ncol[r] * 4 + v_ncol[rr+3] = ncol[r] * 4 + + cind = 0 + for ind in range(ptr[r], ptr[r] + ncol[r]): + c = col[ind] * 4 + ph = phases[ind] + + d = &D[ind, 0] + func(d, ph, M) + + v[v_ptr[rr] + cind] = M[0] + v_col[v_ptr[rr] + cind] = c + v[v_ptr[rr] + cind+1] = M[1] + v_col[v_ptr[rr] + cind+1] = c + 1 + v[v_ptr[rr+1] + cind] = M[2] + v_col[v_ptr[rr+1] + cind] = c + v[v_ptr[rr+1] + cind+1] = M[3] + v_col[v_ptr[rr+1] + cind+1] = c + 1 + # Delta + v[v_ptr[rr] + cind+2] = M[4] + v_col[v_ptr[rr] + cind+2] = c + 2 + v[v_ptr[rr] + cind+3] = M[5] + v_col[v_ptr[rr] + cind+3] = c + 3 + v[v_ptr[rr+1] + cind+2] = M[6] + v_col[v_ptr[rr+1] + cind+2] = c + 2 + v[v_ptr[rr+1] + cind+3] = M[7] + v_col[v_ptr[rr+1] + cind+3] = c + 3 + # Delta^dagger + v[v_ptr[rr+2] + cind] = M[4].conjugate() + v_col[v_ptr[rr+2] + cind] = c + v[v_ptr[rr+2] + cind+1] = M[6].conjugate() + v_col[v_ptr[rr+2] + cind+1] = c + 1 + v[v_ptr[rr+3] + cind] = M[5].conjugate() + v_col[v_ptr[rr+3] + cind] = c + v[v_ptr[rr+3] + cind+1] = M[7].conjugate() + v_col[v_ptr[rr+3] + cind+1] = c + 1 + # -H^* + v[v_ptr[rr+2] + cind+2] = -M[0].conjugate() + v_col[v_ptr[rr+2] + cind+2] = c+2 + v[v_ptr[rr+2] + cind+3] = -M[1].conjugate() + v_col[v_ptr[rr+2] + cind+3] = c + 3 + v[v_ptr[rr+3] + cind+2] = -M[2].conjugate() + v_col[v_ptr[rr+3] + cind+2] = c +2 + v[v_ptr[rr+3] + cind+3] = -M[3].conjugate() + v_col[v_ptr[rr+3] + cind+3] = c + 3 + + cind = cind + 4 + + else: + for r in range(nr): + rr = r * 4 + v_ncol[rr] = ncol[r] * 4 + v_ncol[rr+1] = ncol[r] * 4 + v_ncol[rr+2] = ncol[r] * 4 + v_ncol[rr+3] = ncol[r] * 4 + + cind = 0 + for ind in range(ptr[r], ptr[r] + ncol[r]): + c = col[ind] * 4 + ph = phases[col[ind] / nr] + + d = &D[ind, 0] + func(d, ph, M) + + v[v_ptr[rr] + cind] = M[0] + v_col[v_ptr[rr] + cind] = c + v[v_ptr[rr] + cind+1] = M[1] + v_col[v_ptr[rr] + cind+1] = c + 1 + v[v_ptr[rr+1] + cind] = M[2] + v_col[v_ptr[rr+1] + cind] = c + v[v_ptr[rr+1] + cind+1] = M[3] + v_col[v_ptr[rr+1] + cind+1] = c + 1 + # Delta + v[v_ptr[rr] + cind+2] = M[4] + v_col[v_ptr[rr] + cind+2] = c + 2 + v[v_ptr[rr] + cind+3] = M[5] + v_col[v_ptr[rr] + cind+3] = c + 3 + v[v_ptr[rr+1] + cind+2] = M[6] + v_col[v_ptr[rr+1] + cind+2] = c + 2 + v[v_ptr[rr+1] + cind+3] = M[7] + v_col[v_ptr[rr+1] + cind+3] = c + 3 + # Delta^dagger + v[v_ptr[rr+2] + cind] = M[4].conjugate() + v_col[v_ptr[rr+2] + cind] = c + v[v_ptr[rr+2] + cind+1] = M[6].conjugate() + v_col[v_ptr[rr+2] + cind+1] = c + 1 + v[v_ptr[rr+3] + cind] = M[5].conjugate() + v_col[v_ptr[rr+3] + cind] = c + v[v_ptr[rr+3] + cind+1] = M[7].conjugate() + v_col[v_ptr[rr+3] + cind+1] = c + 1 + # -H^* + v[v_ptr[rr+2] + cind+2] = -M[0].conjugate() + v_col[v_ptr[rr+2] + cind+2] = c+2 + v[v_ptr[rr+2] + cind+3] = -M[1].conjugate() + v_col[v_ptr[rr+2] + cind+3] = c + 3 + v[v_ptr[rr+3] + cind+2] = -M[2].conjugate() + v_col[v_ptr[rr+3] + cind+2] = c +2 + v[v_ptr[rr+3] + cind+3] = -M[3].conjugate() + v_col[v_ptr[rr+3] + cind+3] = c + 3 + + cind = cind + 4 + + 
return csr_matrix((V, V_COL, V_PTR), shape=(nr * 4, nc * 4)) + + +def _phase_sc_array_nambu(ints_st[::1] ptr, + ints_st[::1] ncol, + ints_st[::1] col, + const ints_st nc, + numerics_st[:, ::1] D, + complexs_st[::1] phases, + const int p_opt): + + cdef ints_st nr = ncol.shape[0] + + cdef object dtype = type2dtype[complexs_st](1) + cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] V = np.zeros([nr*4, nc*4], dtype=dtype) + cdef complexs_st[:, ::1] v = V + + cdef complexs_st ph + cdef ints_st r, rr, c, nz, ind + cdef _f_matrix_box_nambu func + cdef numerics_st *d + cdef complexs_st *M = [0, 0, 0, 0, 0, 0, 0, 0] + + if numerics_st in complexs_st: + func = _matrix_box_nambu_cmplx + else: + func = _matrix_box_nambu_real + + with nogil: + if p_opt == -1: + pass + + elif p_opt == 0: + for r in range(nr): + rr = r * 4 + for ind in range(ptr[r], ptr[r] + ncol[r]): + c = col[ind] * 4 + ph = phases[ind] + + d = &D[ind, 0] + func(d, ph, M) + v[rr, c] = M[0] + v[rr, c+1] = M[1] + v[rr+1, c] = M[2] + v[rr+1, c+1] = M[3] + # Delta + v[rr, c+2] = M[4] + v[rr, c+3] = M[5] + v[rr+1, c+2] = M[6] + v[rr+1, c+3] = M[7] + # Delta^dagger + v[rr+2, c] = M[4].conjugate() + v[rr+2, c+1] = M[6].conjugate() + v[rr+3, c] = M[5].conjugate() + v[rr+3, c+1] = M[7].conjugate() + # -H^* + v[rr+2, c+2] = -M[0].conjugate() + v[rr+2, c+3] = -M[1].conjugate() + v[rr+3, c+2] = -M[2].conjugate() + v[rr+3, c+3] = -M[3].conjugate() + + else: + for r in range(nr): + rr = r * 4 + for ind in range(ptr[r], ptr[r] + ncol[r]): + c = col[ind] * 4 + ph = phases[col[ind] / nr] + + d = &D[ind, 0] + func(d, ph, M) + v[rr, c] = M[0] + v[rr, c+1] = M[1] + v[rr+1, c] = M[2] + v[rr+1, c+1] = M[3] + # Delta + v[rr, c+2] = M[4] + v[rr, c+3] = M[5] + v[rr+1, c+2] = M[6] + v[rr+1, c+3] = M[7] + # Delta^dagger + v[rr+2, c] = M[4].conjugate() + v[rr+2, c+1] = M[6].conjugate() + v[rr+3, c] = M[5].conjugate() + v[rr+3, c+1] = M[7].conjugate() + # -H^* + v[rr+2, c+2] = -M[0].conjugate() + v[rr+2, c+3] = -M[1].conjugate() + v[rr+3, c+2] = -M[2].conjugate() + v[rr+3, c+3] = -M[3].conjugate() + + return V diff --git a/src/sisl/physics/_matrix_utils.pxd b/src/sisl/physics/_matrix_utils.pxd index b235ca106b..4f3d68b7cb 100644 --- a/src/sisl/physics/_matrix_utils.pxd +++ b/src/sisl/physics/_matrix_utils.pxd @@ -36,3 +36,15 @@ cdef void _matrix_box_so_real(const reals_st *data, cdef void _matrix_box_so_cmplx(const _internal_complexs_st *data, const complexs_st phase, complexs_st *M) noexcept nogil + +ctypedef void(*_f_matrix_box_nambu)(const numerics_st *data, + const complexs_st phase, + complexs_st *M) noexcept nogil + +cdef void _matrix_box_nambu_real(const reals_st *data, + const complexs_st phase, + complexs_st *M) noexcept nogil + +cdef void _matrix_box_nambu_cmplx(const _internal_complexs_st *data, + const complexs_st phase, + complexs_st *M) noexcept nogil diff --git a/src/sisl/physics/_matrix_utils.pyx b/src/sisl/physics/_matrix_utils.pyx index 7b0e2fb904..806785ae7e 100644 --- a/src/sisl/physics/_matrix_utils.pyx +++ b/src/sisl/physics/_matrix_utils.pyx @@ -1,6 +1,7 @@ # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
 cimport cython
 
 import numpy as np
@@ -19,13 +20,29 @@ M[0] == spin[0, 0]
 M[1] == spin[0, 1]
 M[2] == spin[1, 0]
 M[3] == spin[1, 1]
+
+For Nambu spin configurations the spin box is 4x4,
+with the block structure:
+
+             | M^ee          Delta |   | M^ee          Delta     |
+   M_nambu = |                     | = |                         |
+             | Delta^dagger  M^hh  |   | Delta^dagger  -(M^ee)^* |
+
+So we only return M^ee and Delta.
+The Delta matrix is stored in the singlet (S) + triplet (Tuu, Tdd, T0) terms.
+The Delta expansion looks like this:
+
+        | Tuu       S + T0 |
+Delta = |                  |
+        | -S + T0   Tdd    |
+
+M[4] == Delta[0, 0]
+M[5] == Delta[0, 1]
+M[6] == Delta[1, 0]
+M[7] == Delta[1, 1]
 """
 
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-@cython.cdivision(True)
 cdef inline void _matrix_box_nc_real(const reals_st *data,
                                      const complexs_st phase,
                                      complexs_st *M) noexcept nogil:
@@ -35,10 +52,6 @@ cdef inline void _matrix_box_nc_real(const reals_st *data,
     M[3] = (data[1] * phase)
 
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-@cython.cdivision(True)
 cdef inline void _matrix_box_nc_cmplx(const _internal_complexs_st *data,
                                       const complexs_st phase,
                                       complexs_st *M) noexcept nogil:
@@ -48,10 +61,6 @@ cdef inline void _matrix_box_nc_cmplx(const _internal_complexs_st *data,
     M[3] = (data[1] * phase)
 
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-@cython.cdivision(True)
 cdef inline void _matrix_box_so_real(const reals_st *data,
                                      const complexs_st phase,
                                      complexs_st *M) noexcept nogil:
@@ -61,11 +70,6 @@ cdef inline void _matrix_box_so_real(const reals_st *data,
     M[3] = ((data[1] + 1j * data[5]) * phase)
 
 
-# necessary to double the interfaces
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-@cython.cdivision(True)
 cdef inline void _matrix_box_so_cmplx(const _internal_complexs_st *data,
                                       const complexs_st phase,
                                       complexs_st *M) noexcept nogil:
@@ -73,3 +77,30 @@ cdef inline void _matrix_box_so_cmplx(const _internal_complexs_st *data,
     M[1] = (data[2] * phase)
     M[2] = (data[3] * phase)
     M[3] = (data[1] * phase)
+
+
+cdef inline void _matrix_box_nambu_real(const reals_st *data,
+                                        const complexs_st phase,
+                                        complexs_st *M) noexcept nogil:
+    M[0] = ((data[0] + 1j * data[4]) * phase)
+    M[1] = ((data[2] + 1j * data[3]) * phase)
+    M[2] = ((data[6] + 1j * data[7]) * phase)
+    M[3] = ((data[1] + 1j * data[5]) * phase)
+    # delta matrix stored in [8-15]
+    M[4] = ((data[10] + 1j * data[11]) * phase)
+    M[5] = ((data[8] + data[14] + 1j * (data[9] + data[15])) * phase)
+    M[6] = ((-data[8] + data[14] + 1j * (-data[9] + data[15])) * phase)
+    M[7] = ((data[12] + 1j * data[13]) * phase)
+
+
+cdef inline void _matrix_box_nambu_cmplx(const _internal_complexs_st *data,
+                                         const complexs_st phase,
+                                         complexs_st *M) noexcept nogil:
+    M[0] = (data[0] * phase)
+    M[1] = (data[2] * phase)
+    M[2] = (data[3] * phase)
+    M[3] = (data[1] * phase)
+    M[4] = (data[5] * phase)
+    M[5] = ((data[4] + data[7]) * phase)
+    M[6] = ((-data[4] + data[7]) * phase)
+    M[7] = (data[6] * phase)
diff --git a/src/sisl/physics/densitymatrix.py b/src/sisl/physics/densitymatrix.py
index 892c036576..6e24f12c2d 100644
--- a/src/sisl/physics/densitymatrix.py
+++ b/src/sisl/physics/densitymatrix.py
@@ -813,6 +813,8 @@ def density(
 
             csrDM = csr.tocsr(dim=0) * spinor[0] + csr.tocsr(dim=1) * spinor[1]
 
+        elif self.spin.is_nambu:
+            raise NotImplementedError("Nambu spin configuration not implemented")
         else:
             csrDM = csr.tocsr(dim=0)
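
To make the 4x4 layout above concrete, here is a minimal NumPy sketch (illustrative values only, not part of the patch) of the spin box that the Nambu phase routines (_phase_sc_csr_nambu / _phase_sc_array_nambu) assemble per coupling element from M^ee and Delta. The particle-hole structure keeps the full box Hermitian whenever M^ee itself is Hermitian, irrespective of Delta.

    import numpy as np

    # Illustrative 2x2 electron block (chosen Hermitian) and a generic pairing block
    Hee = np.array([[0.1, 0.3 + 0.4j],
                    [0.3 - 0.4j, 0.2]])
    Delta = np.array([[0.3, 0.5 + 0.6j],
                      [-0.1 + 0.6j, 0.4]])   # need not be Hermitian

    # Same block layout as the v[rr+i, c+j] assignments in _phase_sc_array_nambu:
    # M^ee upper-left, Delta upper-right, Delta^dagger lower-left, -(M^ee)^* lower-right
    M_nambu = np.block([[Hee, Delta],
                        [Delta.conj().T, -Hee.conj()]])

    # Hermitian as long as Hee is Hermitian, for any Delta
    assert np.allclose(M_nambu, M_nambu.conj().T)

The values are made up; the actual routines fill exactly these entries, per coupling element, into the sparse or dense output.
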
diff --git a/src/sisl/physics/electron.py b/src/sisl/physics/electron.py
index 0610058ed6..e60cabe070 100644
--- a/src/sisl/physics/electron.py
+++ b/src/sisl/physics/electron.py
@@ -962,7 +962,11 @@ def shc(
 
     dtype = eigenstate_kwargs.get("dtype", np.complex128)
 
-    m = _create_sigma(H.no, sigma, dtype, eigenstate_kwargs.get("format", "csr"))
+    if H.spin.is_nambu:
+        no = H.no * 2
+    else:
+        no = H.no
+    m = _create_sigma(no, sigma, dtype, eigenstate_kwargs.get("format", "csr"))
 
     # To reduce (heavily) the computational load, we pre-setup the
     # operators here.
@@ -1252,7 +1256,9 @@ def _berry(eigenstates):
 
 
 @set_module("sisl.physics.electron")
-def wavefunction(v, grid, geometry=None, k=None, spinor=0, spin=None, eta=None):
+def wavefunction(
+    v, grid, geometry=None, k=None, spinor=0, spin: Optional[Spin] = None, eta=None
+):
     r"""Add the wave-function (`Orbital.psi`) component of each orbital to the grid
 
     This routine calculates the real-space wave-function components in the
@@ -1316,7 +1322,7 @@ def wavefunction(v, grid, geometry=None, k=None, spinor=0, spin=None, eta=None):
        eigenstate object has been created from a parent object with a `Spin` object
        contained, *and* if the spin-configuration is non-colinear or spin-orbit coupling.
        Default to the first spinor component.
-    spin : Spin, optional
+    spin :
        specification of the spin configuration of the orbital coefficients. This only has
        influence for non-colinear wavefunctions where `spinor` choice is important.
    eta : bool, optional
@@ -1363,15 +1369,21 @@ def wavefunction(v, grid, geometry=None, k=None, spinor=0, spin=None, eta=None):
 
     if spin is None:
         if len(v) // 2 == geometry.no:
-            # We can see from the input that the vector *must* be a non-colinear calculation
+            # the input corresponds to a non-collinear calculation
             v = v.reshape(-1, 2)[:, spinor]
             info(
                 "wavefunction: assumes the input wavefunction coefficients to originate from a non-colinear calculation!"
             )
+        elif len(v) // 4 == geometry.no:
+            # the input corresponds to a Nambu calculation
+            v = v.reshape(-1, 4)[:, spinor]
+            info(
+                "wavefunction: assumes the input wavefunction coefficients to originate from a Nambu calculation!"
+            )
 
     elif spin.kind > Spin.POLARIZED:
-        # For non-colinear cases the user selects the spinor component.
-        v = v.reshape(-1, 2)[:, spinor]
+        # For non-colinear and Nambu cases the user selects the spinor component.
+        v = v.reshape(-1, spin.spinor)[:, spinor]
 
     if len(v) != geometry.no:
         raise ValueError(
diff --git a/src/sisl/physics/hamiltonian.py b/src/sisl/physics/hamiltonian.py
index 60b02e0f86..e22f262037 100644
--- a/src/sisl/physics/hamiltonian.py
+++ b/src/sisl/physics/hamiltonian.py
@@ -320,14 +320,19 @@ def shift(self, E):
             # When the energy is zero, there is no shift
             return
 
+        if self.spin.is_nambu:
+            nspin = 2
+        else:
+            nspin = self.spin.spinor
+
         if self.orthogonal:
             for i in range(self.shape[0]):
-                for j in range(self.spin.spinor):
+                for j in range(nspin):
                     self[i, i, j] = self[i, i, j] + E[j]
         else:
             # For non-collinear and SO only the diagonal (real) components
             # should be shifted.
- for i in range(self.spin.spinor): + for i in range(nspin): self._csr._D[:, i].real += self._csr._D[:, self.S_idx].real * E[i] def eigenvalue(self, k=(0, 0, 0), gauge: GaugeType = "cell", **kwargs): diff --git a/src/sisl/physics/sparse.py b/src/sisl/physics/sparse.py index 95b3a59862..243986bc81 100644 --- a/src/sisl/physics/sparse.py +++ b/src/sisl/physics/sparse.py @@ -19,9 +19,21 @@ from sisl.messages import warn from sisl.typing import AtomsIndex, GaugeType, KPoint -from ._matrix_ddk import matrix_ddk, matrix_ddk_nc, matrix_ddk_nc_diag, matrix_ddk_so -from ._matrix_dk import matrix_dk, matrix_dk_nc, matrix_dk_nc_diag, matrix_dk_so -from ._matrix_k import matrix_k, matrix_k_nc, matrix_k_nc_diag, matrix_k_so +from ._matrix_ddk import ( + matrix_ddk, + matrix_ddk_diag, + matrix_ddk_nambu, + matrix_ddk_nc, + matrix_ddk_so, +) +from ._matrix_dk import ( + matrix_dk, + matrix_dk_diag, + matrix_dk_nambu, + matrix_dk_nc, + matrix_dk_so, +) +from ._matrix_k import matrix_k, matrix_k_diag, matrix_k_nambu, matrix_k_nc, matrix_k_so from .spin import Spin __all__ = ["SparseOrbitalBZ", "SparseOrbitalBZSpin"] @@ -675,8 +687,31 @@ def _ddSk_non_colinear( chosen gauge """ k = _a.asarrayd(k).ravel() - return matrix_ddk_nc_diag( - gauge, self, self.S_idx, self.lattice, k, dtype, format + return matrix_ddk_diag( + gauge, self, self.S_idx, 2, self.lattice, k, dtype, format + ) + + def _ddSk_nambu( + self, + k: KPoint = (0, 0, 0), + dtype=None, + gauge: GaugeType = "cell", + format: str = "csr", + ): + r"""Overlap matrix in a `scipy.sparse.csr_matrix` at `k` for Nambu spin, differentiated with respect to `k` + + Parameters + ---------- + k : array_like, optional + k-point (default is Gamma point) + dtype : numpy.dtype, optional + default to `numpy.complex128` + gauge : + chosen gauge + """ + k = _a.asarrayd(k).ravel() + return matrix_ddk_diag( + gauge, self, self.S_idx, 4, self.lattice, k, dtype, format ) def eig( @@ -835,6 +870,7 @@ def __init__( 2: Spin.POLARIZED, 4: Spin.NONCOLINEAR, 8: Spin.SPINORBIT, + 16: Spin.NAMBU, }.get(dim) else: spin = kwargs.pop("spin") @@ -910,6 +946,42 @@ def _reset(self): self.ddPk = self._ddPk_spin_orbit self.ddSk = self._ddSk_non_colinear + elif self.spin.is_nambu: + if self.dkind in ("f", "i"): + self.M11r = 0 + self.M22r = 1 + self.M12r = 2 + self.M12i = 3 + self.M11i = 4 + self.M22i = 5 + self.M21r = 6 + self.M21i = 7 + self.MSr = 8 + self.MSi = 9 + self.MT11r = 10 + self.MT11i = 11 + self.MT22r = 12 + self.MT22i = 13 + self.MT0r = 14 + self.MT0i = 15 + else: + self.M11 = 0 + self.M22 = 1 + self.M12 = 2 + self.M21 = 3 + self.MS = 4 + self.MT11 = 5 + self.MT22 = 6 + self.MT0 = 7 + + # The overlap is the same as non-collinear + self.Pk = self._Pk_nambu + self.Sk = self._Sk_nambu + self.dPk = self._dPk_nambu + self.dSk = self._dSk_nambu + self.ddPk = self._ddPk_nambu + self.ddSk = self._ddSk_nambu + if self.orthogonal: self.Sk = self._Sk_diagonal @@ -975,6 +1047,8 @@ def create_construct(self, R, params): dtype_cplx = dtype_real_to_complex(self.dtype) is_complex = self.dkind == "c" + if self.spin.is_nambu: + raise NotImplementedError if self.spin.is_spinorbit: if is_complex: nv = 4 @@ -1092,7 +1166,8 @@ def __repr__(self): Spin.POLARIZED: "polarized", Spin.NONCOLINEAR: "noncolinear", Spin.SPINORBIT: "spinorbit", - }.get(self.spin._kind, f"unkown({self.spin._kind})") + Spin.NAMBU: "nambu", + }.get(self.spin.kind, f"unkown({self.spin.kind})") return f"<{self.__module__}.{self.__class__.__name__} na={g.na}, no={g.no}, nsc={g.nsc}, dim={self.dim}, nnz={self.nnz}, spin={spin}>" 
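
As a reading aid for the _reset layout above, this is a hedged NumPy sketch (the data row d is made up and a unit phase is assumed) of how the 16 real Nambu components map onto the complex electron block and the Delta block, mirroring _matrix_box_nambu_real from _matrix_utils.pyx:

    import numpy as np

    # One matrix element with 16 real components, indexed as in _reset:
    # [M11r, M22r, M12r, M12i, M11i, M22i, M21r, M21i,
    #  MSr, MSi, MT11r, MT11i, MT22r, MT22i, MT0r, MT0i]
    d = np.arange(1, 17) / 10

    # Electron spin box, same expressions as _matrix_box_nambu_real with phase = 1
    Hee = np.array([[d[0] + 1j * d[4], d[2] + 1j * d[3]],
                    [d[6] + 1j * d[7], d[1] + 1j * d[5]]])

    # Delta built from the singlet S and the triplets Tuu, Tdd, T0
    S, Tuu, Tdd, T0 = (d[8] + 1j * d[9], d[10] + 1j * d[11],
                       d[12] + 1j * d[13], d[14] + 1j * d[15])
    Delta = np.array([[Tuu, S + T0],
                      [-S + T0, Tdd]])

For complex data (dkind == "c") the same two boxes are read directly from the 8 complex components M11, M22, M12, M21, MS, MT11, MT22 and MT0.
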
def _Pk_unpolarized( @@ -1180,6 +1255,27 @@ def _Pk_spin_orbit( k = _a.asarrayd(k).ravel() return matrix_k_so(gauge, self, self.lattice, k, dtype, format) + def _Pk_nambu( + self, + k: KPoint = (0, 0, 0), + dtype=None, + gauge: GaugeType = "cell", + format: str = "csr", + ): + r"""Sparse matrix (`scipy.sparse.csr_matrix`) at `k` for a Nambu system + + Parameters + ---------- + k : array_like, optional + k-point (default is Gamma point) + dtype : numpy.dtype, optional + default to `numpy.complex128` + gauge : + chosen gauge + """ + k = _a.asarrayd(k).ravel() + return matrix_k_nambu(gauge, self, self.lattice, k, dtype, format) + def _dPk_unpolarized( self, k: KPoint = (0, 0, 0), @@ -1251,7 +1347,7 @@ def _dPk_spin_orbit( gauge: GaugeType = "cell", format: str = "csr", ): - r"""Tuple of sparse matrix (`scipy.sparse.csr_matrix`) at `k` for a non-collinear system, differentiated with respect to `k` + r"""Tuple of sparse matrix (`scipy.sparse.csr_matrix`) at `k` for a spin-orbit system, differentiated with respect to `k` Parameters ---------- @@ -1265,6 +1361,27 @@ def _dPk_spin_orbit( k = _a.asarrayd(k).ravel() return matrix_dk_so(gauge, self, self.lattice, k, dtype, format) + def _dPk_nambu( + self, + k: KPoint = (0, 0, 0), + dtype=None, + gauge: GaugeType = "cell", + format: str = "csr", + ): + r"""Tuple of sparse matrix (`scipy.sparse.csr_matrix`) at `k` for a Nambu spin system, differentiated with respect to `k` + + Parameters + ---------- + k : array_like, optional + k-point (default is Gamma point) + dtype : numpy.dtype, optional + default to `numpy.complex128` + gauge : + chosen gauge + """ + k = _a.asarrayd(k).ravel() + return matrix_dk_nambu(gauge, self, self.lattice, k, dtype, format) + def _ddPk_non_colinear( self, k: KPoint = (0, 0, 0), @@ -1293,7 +1410,7 @@ def _ddPk_spin_orbit( gauge: GaugeType = "cell", format: str = "csr", ): - r"""Tuple of sparse matrix (`scipy.sparse.csr_matrix`) at `k` for a non-collinear system, differentiated with respect to `k` + r"""Tuple of sparse matrix (`scipy.sparse.csr_matrix`) at `k` for a spin-orbit system, differentiated with respect to `k` Parameters ---------- @@ -1307,6 +1424,27 @@ def _ddPk_spin_orbit( k = _a.asarrayd(k).ravel() return matrix_ddk_so(gauge, self, self.lattice, k, dtype, format) + def _ddPk_nambu( + self, + k: KPoint = (0, 0, 0), + dtype=None, + gauge: GaugeType = "cell", + format: str = "csr", + ): + r"""Tuple of sparse matrix (`scipy.sparse.csr_matrix`) at `k` for a Nambu system, differentiated with respect to `k` + + Parameters + ---------- + k : array_like, optional + k-point (default is Gamma point) + dtype : numpy.dtype, optional + default to `numpy.complex128` + gauge : + chosen gauge + """ + k = _a.asarrayd(k).ravel() + return matrix_ddk_nambu(gauge, self, self.lattice, k, dtype, format) + def _Sk( self, k: KPoint = (0, 0, 0), @@ -1346,7 +1484,28 @@ def _Sk_non_colinear( chosen gauge """ k = _a.asarrayd(k).ravel() - return matrix_k_nc_diag(gauge, self, self.S_idx, self.lattice, k, dtype, format) + return matrix_k_diag(gauge, self, self.S_idx, 2, self.lattice, k, dtype, format) + + def _Sk_nambu( + self, + k: KPoint = (0, 0, 0), + dtype=None, + gauge: GaugeType = "cell", + format: str = "csr", + ): + r"""Overlap matrix (`scipy.sparse.csr_matrix`) at `k` for a Nambu system + + Parameters + ---------- + k : array_like, optional + k-point (default is Gamma point) + dtype : numpy.dtype, optional + default to `numpy.complex128` + gauge : + chosen gauge + """ + k = _a.asarrayd(k).ravel() + return matrix_k_diag(gauge, self, 
self.S_idx, 4, self.lattice, k, dtype, format) def _dSk_non_colinear( self, @@ -1367,8 +1526,31 @@ def _dSk_non_colinear( chosen gauge """ k = _a.asarrayd(k).ravel() - return matrix_dk_nc_diag( - gauge, self, self.S_idx, self.lattice, k, dtype, format + return matrix_dk_diag( + gauge, self, self.S_idx, 2, self.lattice, k, dtype, format + ) + + def _dSk_nambu( + self, + k: KPoint = (0, 0, 0), + dtype=None, + gauge: GaugeType = "cell", + format: str = "csr", + ): + r"""Overlap matrix (`scipy.sparse.csr_matrix`) at `k` for a Nambu system + + Parameters + ---------- + k : array_like, optional + k-point (default is Gamma point) + dtype : numpy.dtype, optional + default to `numpy.complex128` + gauge : + chosen gauge + """ + k = _a.asarrayd(k).ravel() + return matrix_dk_diag( + gauge, self, self.S_idx, 4, self.lattice, k, dtype, format ) def eig( diff --git a/src/sisl/physics/spin.py b/src/sisl/physics/spin.py index 133b6d7983..338fe15d21 100644 --- a/src/sisl/physics/spin.py +++ b/src/sisl/physics/spin.py @@ -28,6 +28,8 @@ class Spin: True >>> Spin(Spin.SPINORBIT, dtype=np.complex128) == Spin("spin-orbit") == Spin("so") == Spin("soc") True + >>> Spin(Spin.NAMBU) == Spin("nambu") == Spin("bdg") + True Note that a data-type may be associated with a spin-object. This is not to say that the data-type is used in the configuration, but merely that it helps @@ -48,6 +50,8 @@ class Spin: NONCOLINEAR = 2 #: Constant for a spin-orbit spin configuration SPINORBIT = 3 + #: Constant for a Nambu spin configuration + NAMBU = 4 #: The :math:`\boldsymbol\sigma_x` Pauli matrix X = np.array([[0, 1], [1, 0]], np.complex128) @@ -67,8 +71,8 @@ def __init__(self, kind: Union[str, int] = "unpolarized"): kind = kind.lower() kind = { - "unpolarized": Spin.UNPOLARIZED, "": Spin.UNPOLARIZED, + "unpolarized": Spin.UNPOLARIZED, Spin.UNPOLARIZED: Spin.UNPOLARIZED, "colinear": Spin.POLARIZED, "collinear": Spin.POLARIZED, @@ -87,6 +91,9 @@ def __init__(self, kind: Union[str, int] = "unpolarized"): "so": Spin.SPINORBIT, "soc": Spin.SPINORBIT, Spin.SPINORBIT: Spin.SPINORBIT, + "nambu": Spin.NAMBU, + "bdg": Spin.NAMBU, + Spin.NAMBU: Spin.NAMBU, }.get(kind) if kind is None: raise ValueError( @@ -104,7 +111,9 @@ def __str__(self) -> str: return f"{self.__class__.__name__}{{polarized}}" if self.is_noncolinear: return f"{self.__class__.__name__}{{non-colinear}}" - return f"{self.__class__.__name__}{{spin-orbit}}" + if self.is_spinorbit: + return f"{self.__class__.__name__}{{spin-orbit}}" + return f"{self.__class__.__name__}{{nambu}}" def copy(self): """Create a copy of the spin-object""" @@ -125,6 +134,7 @@ def size(self, dtype: np.dtype) -> int: self.POLARIZED: 2, self.NONCOLINEAR: 3, self.SPINORBIT: 4, + self.NAMBU: 8, }[self.kind] return { @@ -132,13 +142,16 @@ def size(self, dtype: np.dtype) -> int: self.POLARIZED: 2, self.NONCOLINEAR: 4, self.SPINORBIT: 8, + self.NAMBU: 16, }[self.kind] @property def spinor(self) -> int: - """Number of spinor components (1 or 2)""" + """Number of spinor components (1, 2 or 4)""" if self.is_unpolarized: return 1 + if self.is_nambu: + return 4 return 2 @property @@ -178,6 +191,11 @@ def is_spinorbit(self) -> bool: """True if the configuration is spin-orbit""" return self.kind == Spin.SPINORBIT + @property + def is_nambu(self) -> bool: + """True if the configuration is Nambu""" + return self.kind == Spin.NAMBU + # Comparisons def __lt__(self, other) -> bool: return self.kind < other.kind diff --git a/src/sisl/physics/tests/test_spin.py b/src/sisl/physics/tests/test_spin.py index 
f05bb45ad5..29eaf363cd 100644 --- a/src/sisl/physics/tests/test_spin.py +++ b/src/sisl/physics/tests/test_spin.py @@ -25,6 +25,9 @@ def test_spin_init(): "spin-orbit", "so", Spin.SPINORBIT, + "nambu", + "bdg", + Spin.NAMBU, ]: s = Spin(val) str(s) @@ -37,20 +40,24 @@ def test_spin_comparisons(): s2 = Spin("p") s3 = Spin("nc") s4 = Spin("so") + s5 = Spin("nambu") assert s1.kind == Spin.UNPOLARIZED assert s2.kind == Spin.POLARIZED assert s3.kind == Spin.NONCOLINEAR assert s4.kind == Spin.SPINORBIT + assert s5.kind == Spin.NAMBU assert s1 == s1.copy() assert s2 == s2.copy() assert s3 == s3.copy() assert s4 == s4.copy() + assert s5 == s5.copy() assert s1 < s2 assert s2 < s3 assert s3 < s4 + assert s4 < s5 assert s1 <= s2 assert s2 <= s3 @@ -59,30 +66,42 @@ def test_spin_comparisons(): assert s2 > s1 assert s3 > s2 assert s4 > s3 + assert s5 > s4 assert s2 >= s1 assert s3 >= s2 assert s4 >= s3 + assert s5 >= s4 assert s1.is_unpolarized assert not s1.is_polarized assert not s1.is_noncolinear assert not s1.is_spinorbit + assert not s1.is_nambu assert not s2.is_unpolarized assert s2.is_polarized assert not s2.is_noncolinear assert not s2.is_spinorbit + assert not s2.is_nambu assert not s3.is_unpolarized assert not s3.is_polarized assert s3.is_noncolinear assert not s3.is_spinorbit + assert not s3.is_nambu assert not s4.is_unpolarized assert not s4.is_polarized assert not s4.is_noncolinear assert s4.is_spinorbit + assert not s4.is_nambu + + assert not s5.is_unpolarized + assert not s5.is_polarized + assert not s5.is_noncolinear + assert not s5.is_spinorbit + assert s5.is_nambu def test_spin_unaccepted_arg(): From 13edcb98f284b65d0ee17179535f17a9177ad1be Mon Sep 17 00:00:00 2001 From: Nick Papior Date: Thu, 28 Nov 2024 14:24:39 +0100 Subject: [PATCH 2/4] finalized the PDOS for nambu and finalized more methods - nambu PDOS - nambu transpose - nambu berry-phase stuff works - nambu trs is *NOT* implemented. Added more tests for complex data-types which has been completed in this branch. Signed-off-by: Nick Papior --- src/sisl/physics/_ufuncs_electron.py | 6 +- src/sisl/physics/electron.py | 59 ++++++- src/sisl/physics/self_energy.py | 24 ++- src/sisl/physics/sparse.py | 147 ++++++++++++++++- src/sisl/physics/tests/test_physics_sparse.py | 156 ++++++++++++++---- 5 files changed, 346 insertions(+), 46 deletions(-) diff --git a/src/sisl/physics/_ufuncs_electron.py b/src/sisl/physics/_ufuncs_electron.py index 82b9721d2b..6e9307e121 100644 --- a/src/sisl/physics/_ufuncs_electron.py +++ b/src/sisl/physics/_ufuncs_electron.py @@ -340,7 +340,11 @@ def Jz(M, d): dtype = np.result_type(state.dtype, state.info.get("dtype", np.complex128)) # no is not including the spin-dimension - m = _create_sigma(H.no, sigma, dtype, state.info.get("format", "csr")) + if H.spin.is_nambu: + no = H.no * 2 + else: + no = H.no + m = _create_sigma(no, sigma, dtype, state.info.get("format", "csr")) def J(M, d): nonlocal m, J_axes diff --git a/src/sisl/physics/electron.py b/src/sisl/physics/electron.py index e60cabe070..c03fcfd496 100644 --- a/src/sisl/physics/electron.py +++ b/src/sisl/physics/electron.py @@ -231,6 +231,7 @@ def PDOS(E, eig, state, S=None, distribution="gaussian", spin=None): projected DOS calculated at energies, has dimension ``(1, state.shape[1], len(E))``. For non-colinear calculations it will be ``(4, state.shape[1] // 2, len(E))``, ordered as indicated in the above list. + For Nambu calculations it will be ``(8, state.shape[1] // 4, len(E))``. 
""" if isinstance(distribution, str): distribution = get_distribution(distribution) @@ -243,11 +244,67 @@ def PDOS(E, eig, state, S=None, distribution="gaussian", spin=None): if S.shape[1] == state.shape[1] // 2: spin = Spin("nc") S = S[::2, ::2] + elif S.shape[1] == state.shape[1] // 4: + spin = Spin("nambu") + S = S[::4, ::4] else: spin = Spin() # check for non-colinear (or SO) - if spin.kind > Spin.POLARIZED: + if spin.kind > Spin.SPINORBIT: + # Non colinear eigenvectors + if S.shape[1] == state.shape[1]: + # Since we are going to reshape the eigen-vectors + # to more easily get the mixed states, we can reduce the overlap matrix + S = S[::4, ::4] + + # Initialize data + PDOS = empty([8, state.shape[1] // 4, len(E)], dtype=state.real.dtype) + + # Do spin-box calculations: + # PDOS[:4] = electron + # PDOS[0] = total DOS (diagonal) + # PDOS[1] = x == < psi | \sigma_x S | psi > + # PDOS[2] = y == < psi | \sigma_y S | psi > + # PDOS[3] = z == < psi | \sigma_z S | psi > + # PDOS[4:] = hole + + d = distribution(E - eig[0]).reshape(1, -1) + cs = conj(state[0]).reshape(-1, 4) + v = S @ state[0].reshape(-1, 4) + D1 = (cs * v).real # uu,dd PDOS + PDOS[0, :, :] = D1[..., [0, 1]].sum(1).reshape(-1, 1) * d # total DOS + PDOS[3, :, :] = (D1[:, 0] - D1[:, 1]).reshape(-1, 1) * d # z-dos + PDOS[4, :, :] = D1[..., [2, 3]].sum(1).reshape(-1, 1) * d # total DOS + PDOS[7, :, :] = (D1[:, 2] - D1[:, 3]).reshape(-1, 1) * d # z-dos + D1 = (cs[:, 1] * v[:, 0]).reshape(-1, 1) # d,u + D2 = (cs[:, 0] * v[:, 1]).reshape(-1, 1) # u,d + PDOS[1, :, :] = (D1.real + D2.real) * d # x-dos + PDOS[2, :, :] = (D2.imag - D1.imag) * d # y-dos + D1 = (cs[:, 3] * v[:, 2]).reshape(-1, 1) # d,u + D2 = (cs[:, 2] * v[:, 3]).reshape(-1, 1) # u,d + PDOS[5, :, :] = (D1.real + D2.real) * d # x-dos + PDOS[6, :, :] = (D2.imag - D1.imag) * d # y-dos + for i in range(1, len(eig)): + d = distribution(E - eig[i]).reshape(1, -1) + cs = conj(state[i]).reshape(-1, 4) + v = S @ state[i].reshape(-1, 4) + D1 = (cs * v).real + PDOS[0, :, :] += D1[..., [0, 1]].sum(1).reshape(-1, 1) * d # total DOS + PDOS[3, :, :] += (D1[:, 0] - D1[:, 1]).reshape(-1, 1) * d # z-dos + PDOS[4, :, :] += D1[..., [2, 3]].sum(1).reshape(-1, 1) * d # total DOS + PDOS[7, :, :] += (D1[:, 2] - D1[:, 3]).reshape(-1, 1) * d # z-dos + D1 = (cs[:, 1] * v[:, 0]).reshape(-1, 1) # d,u + D2 = (cs[:, 0] * v[:, 1]).reshape(-1, 1) # u,d + PDOS[1, :, :] += (D1.real + D2.real) * d # x-dos + PDOS[2, :, :] += (D2.imag - D1.imag) * d # y-dos + D1 = (cs[:, 3] * v[:, 2]).reshape(-1, 1) # d,u + D2 = (cs[:, 2] * v[:, 3]).reshape(-1, 1) # u,d + PDOS[5, :, :] += (D1.real + D2.real) * d # x-dos + PDOS[6, :, :] += (D2.imag - D1.imag) * d # y-dos + + elif spin.kind > Spin.POLARIZED: + # check for non-colinear (or SO) # Non colinear eigenvectors if S.shape[1] == state.shape[1]: # Since we are going to reshape the eigen-vectors diff --git a/src/sisl/physics/self_energy.py b/src/sisl/physics/self_energy.py index 11c21c0475..e5e0188e83 100644 --- a/src/sisl/physics/self_energy.py +++ b/src/sisl/physics/self_energy.py @@ -939,7 +939,13 @@ def setup(self, **options): V_atoms = self.real_space_coupling(True)[1] orbs = P0.a2o(V_atoms, True) try: - if not P0.spin.is_diagonal: + if P0.spin.is_nambu: + # expand in case we have a non-colinear|spin-orbit + orbs = np.repeat(orbs, 4) * 4 + orbs[1::4] += 1 + orbs[2::4] += 2 + orbs[3::4] += 3 + elif not P0.spin.is_diagonal: # expand in case we have a non-colinear|spin-orbit orbs = np.repeat(orbs, 2) * 2 orbs[1::2] += 1 @@ -1438,7 +1444,13 @@ def __init__(self, semi, 
surface, k_axes, unfold=(1, 1, 1), **options): # Surface orbitals to put in the semi-infinite self-energy into. orbs = self.surface.geometry.a2o(atoms, True) try: - if not self.surface.spin.is_diagonal: + if self.surface.spin.is_nambu: + # expand in case we have a non-colinear|spin-orbit + orbs = np.repeat(orbs, 4) * 4 + orbs[1::4] += 1 + orbs[2::4] += 2 + orbs[3::4] += 3 + elif not self.surface.spin.is_diagonal: # expand in case we have a non-colinear|spin-orbit orbs = np.repeat(orbs, 2) * 2 orbs[1::2] += 1 @@ -1678,7 +1690,13 @@ def setup(self, **options): V_atoms = self.real_space_coupling(True)[1] orbs = P0.a2o(V_atoms, True) try: - if not P0.spin.is_diagonal: + if P0.spin.is_nambu: + # expand in case we have a non-colinear|spin-orbit + orbs = np.repeat(orbs, 4) * 4 + orbs[1::4] += 1 + orbs[2::4] += 2 + orbs[3::4] += 3 + elif not P0.spin.is_diagonal: # expand in case we have a non-colinear|spin-orbit orbs = np.repeat(orbs, 2) * 2 orbs[1::2] += 1 diff --git a/src/sisl/physics/sparse.py b/src/sisl/physics/sparse.py index 243986bc81..c0a4024814 100644 --- a/src/sisl/physics/sparse.py +++ b/src/sisl/physics/sparse.py @@ -1048,8 +1048,101 @@ def create_construct(self, R, params): is_complex = self.dkind == "c" if self.spin.is_nambu: - raise NotImplementedError - if self.spin.is_spinorbit: + if is_complex: + nv = 8 + # Hermitian parameters + # The input order is [uu, dd, ud, du] + paramsH = [ + [ + # H^ee + p[0].conjugate(), + p[1].conjugate(), + p[3].conjugate(), + p[2].conjugate(), + # delta, note the singlet + -p[4].conjugate(), + p[5].conjugate(), + p[6].conjugate(), + p[7].conjugate(), + # because it is already off-diagonal + *p[8:], + ] + for p in params + ] + else: + nv = 16 + # Hermitian parameters + # The input order is [Ruu, Rdd, Rud, Iud, Iuu, Idd, Rdu, idu] + # [ RS, IS, RTu, ITu, RTd, ITd, RT0, IT0] + # delta, note the singlet! + paramsH = [ + [ + p[0], + p[1], + p[6], + -p[7], + -p[4], + -p[5], + p[2], + -p[3], + -p[8], + p[9], + p[10], + -p[11], + p[12], + -p[13], + p[14], + -p[15], + *p[16:], + ] + for p in params + ] + if not self.orthogonal: + nv += 1 + + # ensure we have correct number of values + assert all(len(p) == nv for p in params) + + if R[0] <= 0.1001: # no atom closer than 0.1001 Ang! + # We check that the the parameters here is Hermitian + p = params[0] + if is_complex: + Me = np.array([[p[0], p[2]], [p[3], p[1]]], dtype_cplx) + # do Delta + p = p[4:] + Md = np.array( + [[p[1], p[0] + p[3]], [-p[0] + p[3], p[2]]], dtype_cplx + ) + else: + Me = np.array( + [ + [p[0] + 1j * p[4], p[2] + 1j * p[3]], + [p[6] + 1j * p[7], p[1] + 1j * p[5]], + ], + dtype_cplx, + ) + # do Delta + p = p[8:] + Md = np.array( + [ + [p[2] + 1j * p[3], p[0] + p[6] + 1j * (p[1] + p[7])], + [-p[0] + p[6] + 1j * (-p[1] + p[7]), p[4] + 1j * p[5]], + ], + dtype_cplx, + ) + if not np.allclose(Me, Me.T.conjugate()): + warn( + f"{self.__class__.__name__}.create_construct is NOT " + "Hermitian for M^e on-site terms. This is your responsibility! " + "The code will continue silently, be AWARE!" + ) + if not np.allclose(Md, Md.T.conjugate()): + warn( + f"{self.__class__.__name__}.create_construct is NOT " + "Hermitian for Delta on-site terms. This is your responsibility! " + "The code will continue silently, be AWARE!" 
+ ) + elif self.spin.is_spinorbit: if is_complex: nv = 4 # Hermitian parameters @@ -1690,7 +1783,40 @@ def transpose(self, hermitian: bool = False, spin: bool = True, sort: bool = Tru sp = self.spin D = new._csr._D - if sp.is_spinorbit: + if sp.is_nambu: + if hermitian and spin: + # conjugate the imaginary value and transpose spin-box + if self.dkind in ("f", "i"): + # imaginary components (including transposing) + # 12,11,22,21 + D[:, [3, 4, 5, 7]] = -D[:, [7, 4, 5, 3]] + # R12 <-> R21 + D[:, [2, 6]] = D[:, [6, 2]] + # real S, otherwise imaginary components of Delta + D[:, [8, 11, 13, 15]] = -D[:, [8, 11, 13, 15]] + else: + D[:, [0, 1, 2, 3]] = np.conj(D[:, [0, 1, 3, 2]]) + # delta values + D[:, 4:8] = np.conj(D[:, 4:8]) + D[:, 4] = -D[:, 4] + elif hermitian: + # conjugate the imaginary value + if self.dkind in ("f", "i"): + # imaginary components + # 12,11,22,21 + D[:, [3, 4, 5, 7, 9, 11, 13, 15]] *= -1.0 + else: + D[:, :] = np.conj(D[:, :]) + elif spin: + # transpose spin-box, 12 <-> 21 + if self.dkind in ("f", "i"): + D[:, [2, 3, 6, 7]] = D[:, [6, 7, 2, 3]] + D[:, [8, 9]] = -D[:, [8, 9]] + else: + D[:, [2, 3]] = D[:, [3, 2]] + D[:, 4] = -D[:, 4] + + elif sp.is_spinorbit: if hermitian and spin: # conjugate the imaginary value and transpose spin-box if self.dkind in ("f", "i"): @@ -1751,7 +1877,10 @@ def trs(self): D = new._csr._D # Apply Pauli-Y on the left and right of each spin-box - if sp.is_spinorbit: + if sp.is_nambu: + raise NotImplementedError + + elif sp.is_spinorbit: if self.dkind in ("f", "i"): # [R11, R22, R12, I12, I11, I22, R21, I21] # [R11, R22] = [R22, R11] @@ -1761,13 +1890,19 @@ def trs(self): # [R12, R21] = -[R21, R12] (Y @ Y) D[:, [4, 5, 2, 6]] = -D[:, [5, 4, 6, 2]] else: - raise NotImplementedError + # [R11, R22, R12, I12, I11, I22, R21, I21] + # [11, 22] = [22, 11]^* + D[:, [0, 1]] = np.conj(D[:, [1, 0]]) + # [12, 21] = -[21, 12]^* (Y @ Y) + D[:, [2, 3]] = -np.conj(D[:, [3, 2]]) + elif sp.is_noncolinear: if self.dkind in ("f", "i"): # [R11, R22, R12, I12] D[:, 2] = -D[:, 2] else: - raise NotImplementedError + # [R11, R22, 12] + D[:, 2] = -np.conj(D[:, 2]) return new diff --git a/src/sisl/physics/tests/test_physics_sparse.py b/src/sisl/physics/tests/test_physics_sparse.py index d6ace99ac5..a9add7820a 100644 --- a/src/sisl/physics/tests/test_physics_sparse.py +++ b/src/sisl/physics/tests/test_physics_sparse.py @@ -160,7 +160,7 @@ def test_sparse_orbital_bz_non_colinear(): M.finalize() MT = M.transpose() - MH = M.transpose(True) + MH = M.transpose(hermitian=True) assert np.abs((M - MT)._csr._D).sum() != 0 # For a non-collinear with construct we don't take @@ -177,7 +177,7 @@ def test_sparse_orbital_bz_non_colinear_trs_kramers_theorem(): M.construct(([0.1, 1.44], [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]])) M.finalize() - M = (M + M.transpose(True)) * 0.5 + M = (M + M.transpose(hermitian=True)) * 0.5 MTRS = (M + M.trs()) * 0.5 # This will in principle also work for M since the above parameters preserve @@ -188,37 +188,115 @@ def test_sparse_orbital_bz_non_colinear_trs_kramers_theorem(): assert np.allclose(eig1, eig2) -def test_sparse_orbital_bz_spin_orbit_warns_hermitian(): - M = SparseOrbitalBZSpin(geom.graphene(), spin=Spin("SO")) +def _so_real2cmplx(p): + return [p[0] + 1j * p[4], p[1] + 1j * p[5], p[2] + 1j * p[3], p[6] + 1j * p[7]] + + +@pytest.mark.parametrize("dtype", [np.float64, np.complex128]) +def test_sparse_orbital_bz_spin_orbit_warns_hermitian(dtype): + M = SparseOrbitalBZSpin(geom.graphene(), spin=Spin("SO"), dtype=dtype) + + p0 = np.arange(1, 9) / 10 + 
p1 = np.arange(2, 10) / 10 + + if dtype == np.complex128: + p0 = _so_real2cmplx(p0) + p1 = _so_real2cmplx(p1) with pytest.warns(SislWarning, match="Hermitian"): - M.construct( - ( - [0.1, 1.44], - [ - [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], - [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], - ], - ) + M.construct(([0.1, 1.44], [p0, p1])) + + +@pytest.mark.parametrize("dtype", [np.float64, np.complex128]) +def test_sparse_orbital_bz_spin_orbit(dtype): + M = SparseOrbitalBZSpin(geom.graphene(), spin=Spin("SO"), dtype=dtype) + + p0 = [0.1, 0.2, 0.3, 0.4, 0.0, 0.0, 0.3, -0.4] + p1 = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + + if dtype == np.complex128: + p0 = _so_real2cmplx(p0) + p1 = _so_real2cmplx(p1) + + M.construct( + ( + [0.1, 1.44], + [p0, p1], ) + ) + M.finalize() + + MT = M.transpose() + MH = M.transpose(hermitian=True) + + assert np.abs((M - MT)._csr._D).sum() != 0 + assert np.abs((M - MH)._csr._D).sum() == 0 + assert np.abs((MT - MH)._csr._D).sum() != 0 -def test_sparse_orbital_bz_spin_orbit(): - M = SparseOrbitalBZSpin(geom.graphene(), spin=Spin("SO")) +def _nambu_cmplx2real(p): + return [ + p[0].real, + p[1].real, + p[2].real, + p[2].imag, + p[0].imag, + p[1].imag, + p[3].real, + p[3].imag, + p[4].real, + p[4].imag, + p[5].real, + p[5].imag, + p[6].real, + p[6].imag, + p[7].real, + p[7].imag, + ] + + +@pytest.mark.parametrize("dtype", [np.float64, np.complex128]) +def test_sparse_orbital_bz_nambu(dtype): + M = SparseOrbitalBZSpin(geom.graphene(), spin=Spin("nambu"), dtype=dtype) + + p0 = [ + 0.1 + 1j * 0.0, + 0.2 + 1j * 0.0, + 0.3 + 1j * 0.4, + 0.3 - 1j * 0.4, + # onsite S must have zero real + # onsite triplet states must have 0 imaginary + 1j * 0.6, + 0.3, + 0.4, + 0.3, + ] + + p1 = [ + 0.2 + 1j * 0.6, + 0.3 + 1j * 0.7, + 0.4 + 1j * 0.5, + 0.3 + 1j * 0.9, + 0.3 + 1j * 0.7, + 0.4 + 1j * 0.8, + 0.5 + 1j * 0.6, + 0.4 + 1j * 1.0, + ] + + if dtype == np.float64: + p0 = _nambu_cmplx2real(p0) + p1 = _nambu_cmplx2real(p1) M.construct( ( [0.1, 1.44], - [ - [0.1, 0.2, 0.3, 0.4, 0.0, 0.0, 0.3, -0.4], - [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], - ], + [p0, p1], ) ) M.finalize() MT = M.transpose() - MH = M.transpose(True) + MH = M.transpose(hermitian=True) assert np.abs((M - MT)._csr._D).sum() != 0 assert np.abs((M - MH)._csr._D).sum() == 0 @@ -226,21 +304,26 @@ def test_sparse_orbital_bz_spin_orbit(): @pytest.mark.filterwarnings("ignore", message="*is NOT Hermitian for on-site") -def test_sparse_orbital_bz_spin_orbit_trs_kramers_theorem(): - M = SparseOrbitalBZSpin(geom.graphene(), spin="SO") +@pytest.mark.parametrize("dtype", [np.float64, np.complex128]) +def test_sparse_orbital_bz_spin_orbit_trs_kramers_theorem(dtype): + M = SparseOrbitalBZSpin(geom.graphene(), spin="SO", dtype=dtype) + + p0 = np.arange(1, 9) / 10 + p1 = np.arange(2, 10) / 10 + + if dtype == np.complex128: + p0 = _so_real2cmplx(p0) + p1 = _so_real2cmplx(p1) M.construct( ( [0.1, 1.44], - [ - [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], - [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], - ], + [p0, p1], ) ) M.finalize() - M = (M + M.transpose(True)) / 2 + M = (M + M.transpose(hermitian=True)) / 2 MTRS = (M + M.trs()) * 0.5 # This will in principle also work for M since the above parameters preserve @@ -251,22 +334,25 @@ def test_sparse_orbital_bz_spin_orbit_trs_kramers_theorem(): assert np.allclose(eig1, eig2) -@pytest.mark.filterwarnings("ignore", message="*is NOT Hermitian for on-site") -@pytest.mark.xfail(reason="Construct does not impose hermitian property") -def test_sparse_orbital_bz_spin_orbit_hermitian_not(): - M = 
SparseOrbitalBZSpin(geom.graphene(), spin="SO") +@pytest.mark.parametrize("dtype", [np.float64, np.complex128]) +def test_sparse_orbital_bz_spin_orbit_hermitian_not(dtype): + M = SparseOrbitalBZSpin(geom.graphene(), spin="SO", dtype=dtype) + + p0 = [0.1, 0.2, 0.3, 0.4, 0.0, 0.0, 0.3, -0.4] + p1 = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + + if dtype == np.complex128: + p0 = _so_real2cmplx(p0) + p1 = _so_real2cmplx(p1) M.construct( ( [0.1, 1.44], - [ - [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], - [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], - ], + [p0, p1], ) ) M.finalize() - new = (M + M.transpose(True)) / 2 + new = (M + M.transpose(hermitian=True)) / 2 assert np.abs((M - new)._csr._D).sum() == 0 From 846bb07e5625f2016037a8ed760a424e1ecc5a41 Mon Sep 17 00:00:00 2001 From: Nick Papior Date: Fri, 29 Nov 2024 10:24:58 +0100 Subject: [PATCH 3/4] added spin extraction to siesta stdout Signed-off-by: Nick Papior --- src/sisl/io/siesta/stdout.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sisl/io/siesta/stdout.py b/src/sisl/io/siesta/stdout.py index 4841abf3e9..89b73a71f4 100644 --- a/src/sisl/io/siesta/stdout.py +++ b/src/sisl/io/siesta/stdout.py @@ -44,6 +44,8 @@ def _parse_spin(attr, instance, match): """Parse 'redata: Spin configuration *= '""" opt = match.string.split("=")[-1].strip() + if opt.startswith("nambu"): + return Spin("nambu") if opt.startswith("spin-orbit"): return Spin("spin-orbit") if opt.startswith("collinear") or opt.startswith("colinear"): From 09ab9840187c138fa3cd9555a45192e3dcedad4b Mon Sep 17 00:00:00 2001 From: Nick Papior Date: Fri, 29 Nov 2024 10:47:08 +0100 Subject: [PATCH 4/4] Added warning when using nambu spin + changelog Signed-off-by: Nick Papior --- CHANGELOG.md | 2 ++ src/sisl/physics/spin.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 868d719b58..89b6eb443d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,8 @@ we hit release version 1.0.0. import sisl sisl.geom.graphene +- added Nambu spin configuration, this is still experimental + ### Fixed - `projection` arguments of several functions has been streamlined diff --git a/src/sisl/physics/spin.py b/src/sisl/physics/spin.py index 338fe15d21..c77a241f82 100644 --- a/src/sisl/physics/spin.py +++ b/src/sisl/physics/spin.py @@ -8,6 +8,7 @@ import numpy as np from sisl._internal import set_module +from sisl.messages import warn __all__ = ["Spin"] @@ -100,6 +101,11 @@ def __init__(self, kind: Union[str, int] = "unpolarized"): f"{self.__class__.__name__} initialization went wrong because of wrong " "kind specification. Could not determine the kind of spin!" ) + if kind == Spin.NAMBU: + warn( + "Using untested Nambu spin-configuration, please be aware " + "that this is largely untested code!" + ) # Now assert the checks self._kind = kind
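
To round off the series, here is a usage sketch along the lines of the new test_sparse_orbital_bz_nambu test (values are illustrative and the feature is explicitly experimental; creating the Spin object emits the warning added above):

    import numpy as np

    from sisl import geom
    from sisl.physics import Spin
    from sisl.physics.sparse import SparseOrbitalBZSpin

    # Emits the "untested Nambu spin-configuration" warning introduced in this patch
    M = SparseOrbitalBZSpin(geom.graphene(), spin=Spin("nambu"), dtype=np.complex128)

    # 8 complex values per element: [uu, dd, ud, du, S, Tuu, Tdd, T0].
    # On-site, the singlet is purely imaginary and the triplets purely real,
    # so both the electron block and Delta are Hermitian on-site.
    p0 = [0.1, 0.2, 0.3 + 0.4j, 0.3 - 0.4j, 0.6j, 0.3, 0.4, 0.3]
    p1 = [0.2 + 0.6j, 0.3 + 0.7j, 0.4 + 0.5j, 0.3 + 0.9j,
          0.3 + 0.7j, 0.4 + 0.8j, 0.5 + 0.6j, 0.4 + 1.0j]
    M.construct(([0.1, 1.44], [p0, p1]))
    M.finalize()

    # These parameters give a Hermitian matrix, so the Hermitian transpose is a no-op
    MH = M.transpose(hermitian=True)
    assert np.abs((M - MH)._csr._D).sum() == 0
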