diff --git a/poetry.lock b/poetry.lock
index 4078fe40..33872263 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
 
 [[package]]
 name = "antlr4-python3-runtime"
@@ -809,7 +809,7 @@ torch = ["torch (>=2.1.1,<3.0.0)"]
 type = "git"
 url = "https://github.com/qiboteam/qibo.git"
 reference = "HEAD"
-resolved_reference = "a90c0184092ae18277d80bc4fafd8a407c7bc1c0"
+resolved_reference = "953852301abce89eae60ac4195e167678bf6de9d"
 
 [[package]]
 name = "scipy"
diff --git a/pyproject.toml b/pyproject.toml
index 57d859c1..58d56394 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,7 @@ classifiers = [
 [tool.poetry.dependencies]
 python = "^3.9,<3.13"
 numba = ">=0.59.0"
-qibo = { git = "https://github.com/qiboteam/qibo.git" }
+qibo = { git="https://github.com/qiboteam/qibo.git" }
 scipy = "^1.10.1"
 psutil = "^5.9.5"
 
diff --git a/src/qibojit/backends/clifford_operations_cpu.py b/src/qibojit/backends/clifford_operations_cpu.py
index 9d595165..5e7d479f 100644
--- a/src/qibojit/backends/clifford_operations_cpu.py
+++ b/src/qibojit/backends/clifford_operations_cpu.py
@@ -3,12 +3,22 @@
 import numpy as np
 from numba import njit, prange, uint64
 
+PARALLEL = False
 
-@njit("b1[:,:](b1[:,:], u8, u8)", parallel=True, cache=True)
+
+@njit("Tuple((u1[:], u1[:,:], u1[:,:]))(u1[:,:], u8)", parallel=PARALLEL, cache=True)
+def _get_rxz(symplectic_matrix, nqubits):
+    return (
+        symplectic_matrix[:, -1],
+        symplectic_matrix[:, :nqubits],
+        symplectic_matrix[:, nqubits:-1],
+    )
+
+
+@njit("u1[:,:](u1[:,:], u8, u8)", parallel=PARALLEL, cache=True)
 def H(symplectic_matrix, q, nqubits):
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = r[i] ^ (x[i, q] & z[i, q])
         tmp = symplectic_matrix[i, q]
@@ -17,27 +27,23 @@ def H(symplectic_matrix, q, nqubits):
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8, u8)", parallel=PARALLEL, cache=True)
 def CNOT(symplectic_matrix, control_q, target_q, nqubits):
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
 
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
-        symplectic_matrix[i, -1] = r[i] ^ (x[i, control_q] & z[i, target_q]) & (
+        r[i] = r[i] ^ (x[i, control_q] & z[i, target_q]) & (
             x[i, target_q] ^ ~z[i, control_q]
         )
-        symplectic_matrix[i, target_q] = x[i, target_q] ^ x[i, control_q]
-        symplectic_matrix[i, nqubits + control_q] = z[i, control_q] ^ z[i, target_q]
+        x[i, target_q] = x[i, target_q] ^ x[i, control_q]
+        z[i, control_q] = z[i, control_q] ^ z[i, target_q]
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8, u8)", parallel=PARALLEL, cache=True)
 def CZ(symplectic_matrix, control_q, target_q, nqubits):
     """Decomposition --> H-CNOT-H"""
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
 
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = (
@@ -48,28 +54,26 @@ def CZ(symplectic_matrix, control_q, target_q, nqubits):
         )
         z_control_q = x[i, target_q] ^ z[i, control_q]
         z_target_q = z[i, target_q] ^ x[i, control_q]
-        symplectic_matrix[i, nqubits + control_q] = z_control_q
-        symplectic_matrix[i, nqubits + target_q] = z_target_q
+        z[i, control_q] = z_control_q
+        z[i, target_q] = z_target_q
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8)", parallel=PARALLEL, cache=True)
 def S(symplectic_matrix, q, nqubits):
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = r[i] ^ (x[i, q] & z[i, q])
         symplectic_matrix[i, nqubits + q] = z[i, q] ^ x[i, q]
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8)", parallel=PARALLEL, cache=True)
 def Z(symplectic_matrix, q, nqubits):
     """Decomposition --> S-S"""
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = r[i] ^ (
             (x[i, q] & z[i, q]) ^ x[i, q] & (z[i, q] ^ x[i, q])
@@ -77,12 +81,11 @@ def Z(symplectic_matrix, q, nqubits):
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8)", parallel=PARALLEL, cache=True)
 def X(symplectic_matrix, q, nqubits):
     """Decomposition --> H-S-S-H"""
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = (
             r[i] ^ (z[i, q] & (z[i, q] ^ x[i, q])) ^ (z[i, q] & x[i, q])
@@ -90,12 +93,11 @@ def X(symplectic_matrix, q, nqubits):
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8)", parallel=PARALLEL, cache=True)
 def Y(symplectic_matrix, q, nqubits):
     """Decomposition --> S-S-H-S-S-H"""
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = (
             r[i] ^ (z[i, q] & (z[i, q] ^ x[i, q])) ^ (x[i, q] & (z[i, q] ^ x[i, q]))
@@ -103,48 +105,44 @@ def Y(symplectic_matrix, q, nqubits):
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8)", parallel=PARALLEL, cache=True)
 def SX(symplectic_matrix, q, nqubits):
     """Decomposition --> H-S-H"""
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = r[i] ^ (z[i, q] & (z[i, q] ^ x[i, q]))
         symplectic_matrix[i, q] = z[i, q] ^ x[i, q]
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8)", parallel=PARALLEL, cache=True)
 def SDG(symplectic_matrix, q, nqubits):
     """Decomposition --> S-S-S"""
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = r[i] ^ (x[i, q] & (z[i, q] ^ x[i, q]))
         symplectic_matrix[i, nqubits + q] = z[i, q] ^ x[i, q]
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8)", parallel=PARALLEL, cache=True)
 def SXDG(symplectic_matrix, q, nqubits):
     """Decomposition --> H-S-S-S-H"""
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = r[i] ^ (z[i, q] & x[i, q])
         symplectic_matrix[i, q] = z[i, q] ^ x[i, q]
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8)", parallel=PARALLEL, cache=True)
 def RY_pi(symplectic_matrix, q, nqubits):
     """Decomposition --> H-S-S"""
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = r[i] ^ (x[i, q] & (z[i, q] ^ x[i, q]))
         zq = symplectic_matrix[i, nqubits + q]
@@ -153,12 +151,11 @@ def RY_pi(symplectic_matrix, q, nqubits):
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8)", parallel=PARALLEL, cache=True)
 def RY_3pi_2(symplectic_matrix, q, nqubits):
     """Decomposition --> H-S-S"""
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = r[i] ^ (z[i, q] & (z[i, q] ^ x[i, q]))
         zq = symplectic_matrix[i, nqubits + q]
@@ -167,12 +164,11 @@ def RY_3pi_2(symplectic_matrix, q, nqubits):
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8, u8)", parallel=PARALLEL, cache=True)
 def SWAP(symplectic_matrix, control_q, target_q, nqubits):
     """Decomposition --> CNOT-CNOT-CNOT"""
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = (
             r[i]
@@ -199,12 +195,11 @@ def SWAP(symplectic_matrix, control_q, target_q, nqubits):
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8, u8)", parallel=PARALLEL, cache=True)
 def iSWAP(symplectic_matrix, control_q, target_q, nqubits):
     """Decomposition --> H-CNOT-CNOT-H-S-S"""
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = (
             r[i]
@@ -233,12 +228,11 @@ def iSWAP(symplectic_matrix, control_q, target_q, nqubits):
     return symplectic_matrix
 
 
-@njit("b1[:,:](b1[:,:], u8, u8, u8)", parallel=True, cache=True)
+@njit("u1[:,:](u1[:,:], u8, u8, u8)", parallel=PARALLEL, cache=True)
 def CY(symplectic_matrix, control_q, target_q, nqubits):
     """Decomposition --> S-CNOT-SDG"""
-    r = symplectic_matrix[:-1, -1]
-    x = symplectic_matrix[:-1, :nqubits]
-    z = symplectic_matrix[:-1, nqubits:-1]
+    r, x, z = _get_rxz(symplectic_matrix, nqubits)
+
     for i in prange(symplectic_matrix.shape[0]):  # pylint: disable=not-an-iterable
         symplectic_matrix[i, -1] = (
             r[i]
@@ -261,33 +255,26 @@ def CY(symplectic_matrix, control_q, target_q, nqubits):
 
 @njit(
     [
-        "b1[:,:](b1[:,:], u8[:], u8[:], u8, b1)",
-        "b1[:,:](b1[:,:], u4[:], u4[:], u4, b1)",
+        "u1[:,:](u1[:,:], u8[:], u8[:], u8, b1)",
+        "u1[:,:](u1[:,:], u4[:], u4[:], u4, b1)",
     ],
-    parallel=True,
+    parallel=PARALLEL,
     cache=True,
     fastmath=True,
 )
 def _rowsum(symplectic_matrix, h, i, nqubits, determined=False):
-    xi, xh = symplectic_matrix[i, :nqubits], symplectic_matrix[h, :nqubits]
-    zi, zh = symplectic_matrix[i, nqubits:-1], symplectic_matrix[h, nqubits:-1]
+    xi, zi = symplectic_matrix[i, :nqubits], symplectic_matrix[i, nqubits:-1]
+    xh, zh = symplectic_matrix[h, :nqubits], symplectic_matrix[h, nqubits:-1]
     if determined:
-        g_r = np.array([False for _ in range(h.shape[0])])
+        g_r = np.zeros(h.shape[0], dtype=np.uint8)
         g_xi_xh = xi.copy()
         g_zi_zh = xi.copy()
     for j in prange(len(h)):  # pylint: disable=not-an-iterable
-        exp = np.zeros(nqubits, dtype=uint64)
-        x1_eq_z1 = (xi[j] ^ zi[j]) == False
-        x1_neq_z1 = ~x1_eq_z1
-        x1_eq_0 = xi[j] == False
-        x1_eq_1 = ~x1_eq_0
-        ind2 = x1_eq_z1 & x1_eq_1
-        ind3 = x1_eq_1 & x1_neq_z1
-        ind4 = x1_eq_0 & x1_neq_z1
-        exp[ind2] = zh[j, ind2].astype(uint64) - xh[j, ind2].astype(uint64)
-        exp[ind3] = zh[j, ind3].astype(uint64) * (2 * xh[j, ind3].astype(uint64) - 1)
-        exp[ind4] = xh[j, ind4].astype(uint64) * (1 - 2 * zh[j, ind4].astype(uint64))
-
+        exp = (
+            2 * (xi[j] * xh[j] * (zh[j] - zi[j]) + zi[j] * zh[j] * (xi[j] - xh[j]))
+            - xi[j] * zh[j]
+            + xh[j] * zi[j]
+        )
         r = (
             2 * symplectic_matrix[h[j], -1]
             + 2 * symplectic_matrix[i[j], -1]
diff --git a/src/qibojit/backends/clifford_operations_gpu.py b/src/qibojit/backends/clifford_operations_gpu.py
index 1039c2bc..9698176b 100644
--- a/src/qibojit/backends/clifford_operations_gpu.py
+++ b/src/qibojit/backends/clifford_operations_gpu.py
@@ -1,8 +1,8 @@
 """Set of custom CuPy operations for the Clifford backend."""
 
-from functools import cache
-
 import cupy as cp  # pylint: disable=E0401
+import numpy
+from qibo.backends._clifford_operations import _dim, _get_rxz, _packed_size, _pad_size
 from scipy import sparse
 
 np = cp
@@ -11,53 +11,51 @@
 GRIDDIM_2D = (1024, 1024)
 
 
-@cache
-def _get_dim(nqubits):
-    return 2 * nqubits + 1
-
-
 apply_one_qubit_kernel = """
 extern "C"
-__global__ void apply_{}(bool* symplectic_matrix, const int q, const int nqubits, const int qz, const int dim) {{
-    _apply_{}(symplectic_matrix, q, nqubits, qz, dim);
+__global__ void apply_{}(unsigned char* symplectic_matrix, const int q, const int qz, const int nrows, const int ncolumns) {{
+    _apply_{}(symplectic_matrix, q, qz, nrows, ncolumns);
 }}
 """
 
 apply_two_qubits_kernel = """
 extern "C"
-__global__ void apply_{}(bool* symplectic_matrix, const int control_q, const int target_q, const int nqubits, const int cqz, const int tqz, const int dim) {{
-    _apply_{}(symplectic_matrix, control_q, target_q, nqubits, cqz, tqz, dim);
+__global__ void apply_{}(unsigned char* symplectic_matrix, const int control_q, const int target_q, const int cqz, const int tqz, const int nrows, const int ncolumns) {{
+    _apply_{}(symplectic_matrix, control_q, target_q, cqz, tqz, nrows, ncolumns);
 }}
 """
 
 
 def one_qubit_kernel_launcher(kernel, symplectic_matrix, q, nqubits):
     qz = nqubits + q
-    dim = _get_dim(nqubits)
-    return kernel((GRIDDIM,), (BLOCKDIM,), (symplectic_matrix, q, nqubits, qz, dim))
+    ncolumns = _dim(nqubits)
+    nrows = _packed_size(ncolumns)
+    return kernel((GRIDDIM,), (BLOCKDIM,), (symplectic_matrix, q, qz, nrows, ncolumns))
 
 
 def two_qubits_kernel_launcher(kernel, symplectic_matrix, control_q, target_q, nqubits):
     cqz = nqubits + control_q
     tqz = nqubits + target_q
-    dim = _get_dim(nqubits)
+    ncolumns = _dim(nqubits)
+    nrows = _packed_size(ncolumns)
     return kernel(
         (GRIDDIM,),
         (BLOCKDIM,),
-        (symplectic_matrix, control_q, target_q, nqubits, cqz, tqz, dim),
+        (symplectic_matrix, control_q, target_q, cqz, tqz, nrows, ncolumns),
     )
 
 
 apply_H = """
-__device__ void _apply_H(bool* symplectic_matrix, const int& q, const int& nqubits, const int& qz, const int& dim) {
+__device__ void _apply_H(unsigned char* symplectic_matrix, const int& q, const int& qz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = symplectic_matrix[i * dim + last] ^ (symplectic_matrix[i * dim + q] & symplectic_matrix[i * dim + qz]);
-        const bool tmp = symplectic_matrix[i * dim + q];
-        symplectic_matrix[i * dim + q] = symplectic_matrix[i * dim + qz];
-        symplectic_matrix[i * dim + qz] = tmp;
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = symplectic_matrix[row_idx + last] ^ (symplectic_matrix[row_idx + q] & symplectic_matrix[row_idx + qz]);
+        const unsigned char tmp = symplectic_matrix[row_idx + q];
+        symplectic_matrix[row_idx + q] = symplectic_matrix[row_idx + qz];
+        symplectic_matrix[row_idx + qz] = tmp;
     };
 }
 """ + apply_one_qubit_kernel.format(
@@ -73,19 +71,20 @@ def H(symplectic_matrix, q, nqubits):
 
 
 apply_CNOT = """
-__device__ void _apply_CNOT(bool* symplectic_matrix, const int& control_q, const int& target_q, const int& nqubits, const int& cqz, const int& tqz, const int& dim) {
+__device__ void _apply_CNOT(unsigned char* symplectic_matrix, const int& control_q, const int& target_q, const int& cqz, const int& tqz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = symplectic_matrix[i * dim + last] ^ (
-            symplectic_matrix[i * dim + control_q] & symplectic_matrix[i * dim + tqz]
-        ) & (symplectic_matrix[i * dim + target_q] ^ symplectic_matrix[i * dim + cqz] ^ 1);
-        symplectic_matrix[i * dim + target_q] = (
-            symplectic_matrix[i * dim + target_q] ^ symplectic_matrix[i * dim + control_q]
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = symplectic_matrix[row_idx + last] ^ (
+            symplectic_matrix[row_idx + control_q] & symplectic_matrix[row_idx + tqz]
+        ) & (symplectic_matrix[row_idx + target_q] ^ ~symplectic_matrix[row_idx + cqz]);
+        symplectic_matrix[row_idx + target_q] = (
+            symplectic_matrix[row_idx + target_q] ^ symplectic_matrix[row_idx + control_q]
         );
-        symplectic_matrix[i * dim + cqz] = (
-            symplectic_matrix[i * dim + cqz] ^ symplectic_matrix[i * dim + tqz]
+        symplectic_matrix[row_idx + cqz] = (
+            symplectic_matrix[row_idx + cqz] ^ symplectic_matrix[row_idx + tqz]
         );
     };
 }
@@ -104,28 +103,29 @@ def CNOT(symplectic_matrix, control_q, target_q, nqubits):
 
 
 apply_CZ = """
-__device__ void _apply_CZ(bool* symplectic_matrix, const int& control_q, const int& target_q, const int& nqubits, const int& cqz, const int& tqz, const int& dim) {
+__device__ void _apply_CZ(unsigned char* symplectic_matrix, const int& control_q, const int& target_q, const int& cqz, const int& tqz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = (
-            symplectic_matrix[i * dim + last]
-            ^ (symplectic_matrix[i * dim + target_q] & symplectic_matrix[i * dim + tqz])
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = (
+            symplectic_matrix[row_idx + last]
+            ^ (symplectic_matrix[row_idx + target_q] & symplectic_matrix[row_idx + tqz])
             ^ (
-                symplectic_matrix[i * dim + control_q]
-                & symplectic_matrix[i * dim + target_q]
-                & (symplectic_matrix[i * dim + tqz] ^ symplectic_matrix[i * dim + cqz] ^ 1)
+                symplectic_matrix[row_idx + control_q]
+                & symplectic_matrix[row_idx + target_q]
+                & (symplectic_matrix[row_idx + tqz] ^ ~symplectic_matrix[row_idx + cqz])
             )
             ^ (
-                symplectic_matrix[i * dim + target_q]
-                & (symplectic_matrix[i * dim + tqz] ^ symplectic_matrix[i * dim + control_q])
+                symplectic_matrix[row_idx + target_q]
+                & (symplectic_matrix[row_idx + tqz] ^ symplectic_matrix[row_idx + control_q])
             )
         );
-        const bool z_control_q = symplectic_matrix[i * dim + target_q] ^ symplectic_matrix[i * dim + cqz];
-        const bool z_target_q = symplectic_matrix[i * dim + tqz] ^ symplectic_matrix[i * dim + control_q];
-        symplectic_matrix[i * dim + cqz] = z_control_q;
-        symplectic_matrix[i * dim + tqz] = z_target_q;
+        const unsigned char z_control_q = symplectic_matrix[row_idx + target_q] ^ symplectic_matrix[row_idx + cqz];
+        const unsigned char z_target_q = symplectic_matrix[row_idx + tqz] ^ symplectic_matrix[row_idx + control_q];
+        symplectic_matrix[row_idx + cqz] = z_control_q;
+        symplectic_matrix[row_idx + tqz] = z_target_q;
     };
 }
 """ + apply_two_qubits_kernel.format(
@@ -143,15 +143,16 @@ def CZ(symplectic_matrix, control_q, target_q, nqubits):
 
 
 apply_S = """
-__device__ void _apply_S(bool* symplectic_matrix, const int& q, const int& nqubits, const int& qz, const int& dim) {
+__device__ void _apply_S(unsigned char* symplectic_matrix, const int& q, const int& qz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = symplectic_matrix[i * dim + last] ^ (
-            symplectic_matrix[i * dim + q] & symplectic_matrix[i * dim + qz]
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = symplectic_matrix[row_idx + last] ^ (
+            symplectic_matrix[row_idx + q] & symplectic_matrix[row_idx + qz]
         );
-        symplectic_matrix[i * dim + qz] = symplectic_matrix[i * dim + qz] ^ symplectic_matrix[i * dim + q];
+        symplectic_matrix[row_idx + qz] = symplectic_matrix[row_idx + qz] ^ symplectic_matrix[row_idx + q];
     };
 }
 """ + apply_one_qubit_kernel.format(
@@ -167,15 +168,16 @@ def S(symplectic_matrix, q, nqubits):
 
 
 apply_Z = """
-__device__ void _apply_Z(bool* symplectic_matrix, const int& q, const int& nqubits, const int& qz, const int& dim) {
+__device__ void _apply_Z(unsigned char* symplectic_matrix, const int& q, const int& qz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = symplectic_matrix[i * dim + last] ^ (
-            (symplectic_matrix[i * dim + q] & symplectic_matrix[i * dim + qz])
-            ^ symplectic_matrix[i * dim + q]
-            & (symplectic_matrix[i * dim + qz] ^ symplectic_matrix[i * dim + q])
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = symplectic_matrix[row_idx + last] ^ (
+            (symplectic_matrix[row_idx + q] & symplectic_matrix[row_idx + qz])
+            ^ symplectic_matrix[row_idx + q]
+            & (symplectic_matrix[row_idx + qz] ^ symplectic_matrix[row_idx + q])
         );
     };
 }
@@ -192,18 +194,19 @@ def Z(symplectic_matrix, q, nqubits):
 
 
 apply_X = """
-__device__ void _apply_X(bool* symplectic_matrix, const int& q, const int& nqubits, const int& qz, const int& dim) {
+__device__ void _apply_X(unsigned char* symplectic_matrix, const int& q, const int& qz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = (
-            symplectic_matrix[i * dim + last]
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = (
+            symplectic_matrix[row_idx + last]
             ^ (
-                symplectic_matrix[i * dim + qz]
-                & (symplectic_matrix[i * dim + qz] ^ symplectic_matrix[i * dim + q])
+                symplectic_matrix[row_idx + qz]
+                & (symplectic_matrix[row_idx + qz] ^ symplectic_matrix[row_idx + q])
             )
-            ^ (symplectic_matrix[i * dim + qz] & symplectic_matrix[i * dim + q])
+            ^ (symplectic_matrix[row_idx + qz] & symplectic_matrix[row_idx + q])
         );
     };
 }
@@ -220,20 +223,21 @@ def X(symplectic_matrix, q, nqubits):
 
 
 apply_Y = """
-__device__ void _apply_Y(bool* symplectic_matrix, const int& q, const int& nqubits, const int& qz, const int& dim) {
+__device__ void _apply_Y(unsigned char* symplectic_matrix, const int& q, const int& qz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = (
-            symplectic_matrix[i * dim + last]
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = (
+            symplectic_matrix[row_idx + last]
             ^ (
-                symplectic_matrix[i * dim + qz]
-                & (symplectic_matrix[i * dim + qz] ^ symplectic_matrix[i * dim + q])
+                symplectic_matrix[row_idx + qz]
+                & (symplectic_matrix[row_idx + qz] ^ symplectic_matrix[row_idx + q])
             )
             ^ (
-                symplectic_matrix[i * dim + q]
-                & (symplectic_matrix[i * dim + qz] ^ symplectic_matrix[i * dim + q])
+                symplectic_matrix[row_idx + q]
+                & (symplectic_matrix[row_idx + qz] ^ symplectic_matrix[row_idx + q])
             )
         );
     };
@@ -251,16 +255,17 @@ def Y(symplectic_matrix, q, nqubits):
 
 
 apply_SX = """
-__device__ void _apply_SX(bool* symplectic_matrix, const int& q, const int& nqubits, const int& qz, const int& dim) {
+__device__ void _apply_SX(unsigned char* symplectic_matrix, const int& q, const int& qz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = symplectic_matrix[i * dim + last] ^ (
-            symplectic_matrix[i * dim + qz]
-            & (symplectic_matrix[i * dim + qz] ^ symplectic_matrix[i * dim + q])
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = symplectic_matrix[row_idx + last] ^ (
+            symplectic_matrix[row_idx + qz]
+            & (symplectic_matrix[row_idx + qz] ^ symplectic_matrix[row_idx + q])
         );
-        symplectic_matrix[i * dim + q] = symplectic_matrix[i * dim + qz] ^ symplectic_matrix[i * dim + q];
+        symplectic_matrix[row_idx + q] = symplectic_matrix[row_idx + qz] ^ symplectic_matrix[row_idx + q];
     };
 }
 """ + apply_one_qubit_kernel.format(
@@ -276,16 +281,17 @@ def SX(symplectic_matrix, q, nqubits):
 
 
 apply_SDG = """
-__device__ void _apply_SDG(bool* symplectic_matrix, const int& q, const int& nqubits, const int& qz, const int& dim) {
+__device__ void _apply_SDG(unsigned char* symplectic_matrix, const int& q, const int& qz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = symplectic_matrix[i * dim + last] ^ (
-            symplectic_matrix[i * dim + q]
-            & (symplectic_matrix[i * dim + qz] ^ symplectic_matrix[i * dim + q])
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = symplectic_matrix[row_idx + last] ^ (
+            symplectic_matrix[row_idx + q]
+            & (symplectic_matrix[row_idx + qz] ^ symplectic_matrix[row_idx + q])
         );
-        symplectic_matrix[i * dim + qz] = symplectic_matrix[i * dim + qz] ^ symplectic_matrix[i * dim + q];
+        symplectic_matrix[row_idx + qz] = symplectic_matrix[row_idx + qz] ^ symplectic_matrix[row_idx + q];
     };
 }
 """ + apply_one_qubit_kernel.format(
@@ -301,15 +307,16 @@ def SDG(symplectic_matrix, q, nqubits):
 
 
 apply_SXDG = """
-__device__ void _apply_SXDG(bool* symplectic_matrix, const int& q, const int& nqubits, const int& qz, const int& dim) {
+__device__ void _apply_SXDG(unsigned char* symplectic_matrix, const int& q, const int& qz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = symplectic_matrix[i * dim + last] ^ (
-            symplectic_matrix[i * dim + qz] & symplectic_matrix[i * dim + q]
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = symplectic_matrix[row_idx + last] ^ (
+            symplectic_matrix[row_idx + qz] & symplectic_matrix[row_idx + q]
         );
-        symplectic_matrix[i * dim + q] = symplectic_matrix[i * dim + qz] ^ symplectic_matrix[i * dim + q];
+        symplectic_matrix[row_idx + q] = symplectic_matrix[row_idx + qz] ^ symplectic_matrix[row_idx + q];
     };
 }
 """ + apply_one_qubit_kernel.format(
@@ -325,18 +332,19 @@ def SXDG(symplectic_matrix, q, nqubits):
 
 
 apply_RY_pi = """
-__device__ void _apply_RY_pi(bool* symplectic_matrix, const int& q, const int& nqubits, const int& qz, const int& dim) {
+__device__ void _apply_RY_pi(unsigned char* symplectic_matrix, const int& q, const int& qz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = symplectic_matrix[i * dim + last] ^ (
-            symplectic_matrix[i * dim + q]
-            & (symplectic_matrix[i * dim + qz] ^ symplectic_matrix[i * dim + q])
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = symplectic_matrix[row_idx + last] ^ (
+            symplectic_matrix[row_idx + q]
+            & (symplectic_matrix[row_idx + qz] ^ symplectic_matrix[row_idx + q])
         );
-        const bool zq = symplectic_matrix[i * dim + qz];
-        symplectic_matrix[i * dim + qz] = symplectic_matrix[i * dim + q];
-        symplectic_matrix[i * dim + q] = zq;
+        const unsigned char zq = symplectic_matrix[row_idx + qz];
+        symplectic_matrix[row_idx + qz] = symplectic_matrix[row_idx + q];
+        symplectic_matrix[row_idx + q] = zq;
     };
 }
 """ + apply_one_qubit_kernel.format(
@@ -352,18 +360,19 @@ def RY_pi(symplectic_matrix, q, nqubits):
 
 
 apply_RY_3pi_2 = """
-__device__ void _apply_RY_3pi_2(bool* symplectic_matrix, const int& q, const int& nqubits, const int& qz, const int& dim) {
+__device__ void _apply_RY_3pi_2(unsigned char* symplectic_matrix, const int& q, const int& qz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = symplectic_matrix[i * dim + last] ^ (
-            symplectic_matrix[i * dim + qz]
-            & (symplectic_matrix[i * dim + qz] ^ symplectic_matrix[i * dim + q])
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = symplectic_matrix[row_idx + last] ^ (
+            symplectic_matrix[row_idx + qz]
+            & (symplectic_matrix[row_idx + qz] ^ symplectic_matrix[row_idx + q])
         );
-        const bool zq = symplectic_matrix[i * dim + qz];
-        symplectic_matrix[i * dim + qz] = symplectic_matrix[i * dim + q];
-        symplectic_matrix[i * dim + q] = zq;
+        const unsigned char zq = symplectic_matrix[row_idx + qz];
+        symplectic_matrix[row_idx + qz] = symplectic_matrix[row_idx + q];
+        symplectic_matrix[row_idx + q] = zq;
     };
 }
 """ + apply_one_qubit_kernel.format(
@@ -381,42 +390,43 @@ def RY_3pi_2(symplectic_matrix, q, nqubits):
 
 
 apply_SWAP = """
-__device__ void _apply_SWAP(bool* symplectic_matrix, const int& control_q, const int& target_q, const int& nqubits, const int& cqz, const int& tqz, const int& dim) {
+__device__ void _apply_SWAP(unsigned char* symplectic_matrix, const int& control_q, const int& target_q, const int& cqz, const int& tqz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = (
-            symplectic_matrix[i * dim + last]
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = (
+            symplectic_matrix[row_idx + last]
             ^ (
-                symplectic_matrix[i * dim + control_q]
-                & symplectic_matrix[i * dim + tqz]
-                & (symplectic_matrix[i * dim + target_q] ^ symplectic_matrix[i * dim + cqz] ^ 1)
+                symplectic_matrix[row_idx + control_q]
+                & symplectic_matrix[row_idx + tqz]
+                & (symplectic_matrix[row_idx + target_q] ^ ~symplectic_matrix[row_idx + cqz])
             )
             ^ (
-                (symplectic_matrix[i * dim + target_q] ^ symplectic_matrix[i * dim + control_q])
-                & (symplectic_matrix[i * dim + tqz] ^ symplectic_matrix[i * dim + cqz])
-                & (symplectic_matrix[i * dim + tqz] ^ symplectic_matrix[i * dim + control_q] ^ 1)
+                (symplectic_matrix[row_idx + target_q] ^ symplectic_matrix[row_idx + control_q])
+                & (symplectic_matrix[row_idx + tqz] ^ symplectic_matrix[row_idx + cqz])
+                & (symplectic_matrix[row_idx + tqz] ^ ~symplectic_matrix[row_idx + control_q])
             )
             ^ (
-                symplectic_matrix[i * dim + target_q]
-                & symplectic_matrix[i * dim + cqz]
+                symplectic_matrix[row_idx + target_q]
+                & symplectic_matrix[row_idx + cqz]
                 & (
-                    symplectic_matrix[i * dim + control_q]
-                    ^ symplectic_matrix[i * dim + target_q]
-                    ^ symplectic_matrix[i * dim + cqz]
-                    ^ symplectic_matrix[i * dim + tqz] ^ 1
+                    symplectic_matrix[row_idx + control_q]
+                    ^ symplectic_matrix[row_idx + target_q]
+                    ^ symplectic_matrix[row_idx + cqz]
+                    ^ ~symplectic_matrix[row_idx + tqz]
                 )
             )
         );
-        const bool x_cq = symplectic_matrix[i * dim + control_q];
-        const bool x_tq = symplectic_matrix[i * dim + target_q];
-        const bool z_cq = symplectic_matrix[i * dim + cqz];
-        const bool z_tq = symplectic_matrix[i * dim + tqz];
-        symplectic_matrix[i * dim + control_q] = x_tq;
-        symplectic_matrix[i * dim + target_q] = x_cq;
-        symplectic_matrix[i * dim + cqz] = z_tq;
-        symplectic_matrix[i * dim + tqz] = z_cq;
+        const unsigned char x_cq = symplectic_matrix[row_idx + control_q];
+        const unsigned char x_tq = symplectic_matrix[row_idx + target_q];
+        const unsigned char z_cq = symplectic_matrix[row_idx + cqz];
+        const unsigned char z_tq = symplectic_matrix[row_idx + tqz];
+        symplectic_matrix[row_idx + control_q] = x_tq;
+        symplectic_matrix[row_idx + target_q] = x_cq;
+        symplectic_matrix[row_idx + cqz] = z_tq;
+        symplectic_matrix[row_idx + tqz] = z_cq;
     };
 }
 """ + apply_two_qubits_kernel.format(
@@ -434,66 +444,67 @@ def SWAP(symplectic_matrix, control_q, target_q, nqubits):
 
 
 apply_iSWAP = """
-__device__ void _apply_iSWAP(bool* symplectic_matrix, const int& control_q, const int& target_q, const int& nqubits, const int& cqz, const int& tqz, const int& dim) {
+__device__ void _apply_iSWAP(unsigned char* symplectic_matrix, const int& control_q, const int& target_q, const int& cqz, const int& tqz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = (
-            symplectic_matrix[i * dim + last]
-            ^ (symplectic_matrix[i * dim + target_q] & symplectic_matrix[i * dim + tqz])
-            ^ (symplectic_matrix[i * dim + control_q] & symplectic_matrix[i * dim + cqz])
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = (
+            symplectic_matrix[row_idx + last]
+            ^ (symplectic_matrix[row_idx + target_q] & symplectic_matrix[row_idx + tqz])
+            ^ (symplectic_matrix[row_idx + control_q] & symplectic_matrix[row_idx + cqz])
             ^ (
-                symplectic_matrix[i * dim + control_q]
-                & (symplectic_matrix[i * dim + cqz] ^ symplectic_matrix[i * dim + control_q])
+                symplectic_matrix[row_idx + control_q]
+                & (symplectic_matrix[row_idx + cqz] ^ symplectic_matrix[row_idx + control_q])
             )
             ^ (
-                (symplectic_matrix[i * dim + cqz] ^ symplectic_matrix[i * dim + control_q])
-                & (symplectic_matrix[i * dim + tqz] ^ symplectic_matrix[i * dim + target_q])
-                & (symplectic_matrix[i * dim + target_q] ^ symplectic_matrix[i * dim + control_q] ^ 1)
+                (symplectic_matrix[row_idx + cqz] ^ symplectic_matrix[row_idx + control_q])
+                & (symplectic_matrix[row_idx + tqz] ^ symplectic_matrix[row_idx + target_q])
+                & (symplectic_matrix[row_idx + target_q] ^ ~symplectic_matrix[row_idx + control_q])
             )
             ^ (
                 (
-                    symplectic_matrix[i * dim + target_q]
-                    ^ symplectic_matrix[i * dim + cqz]
-                    ^ symplectic_matrix[i * dim + control_q]
+                    symplectic_matrix[row_idx + target_q]
+                    ^ symplectic_matrix[row_idx + cqz]
+                    ^ symplectic_matrix[row_idx + control_q]
                 )
                 & (
-                    symplectic_matrix[i * dim + target_q]
-                    ^ symplectic_matrix[i * dim + tqz]
-                    ^ symplectic_matrix[i * dim + control_q]
+                    symplectic_matrix[row_idx + target_q]
+                    ^ symplectic_matrix[row_idx + tqz]
+                    ^ symplectic_matrix[row_idx + control_q]
                 )
                 & (
-                    symplectic_matrix[i * dim + target_q]
-                    ^ symplectic_matrix[i * dim + tqz]
-                    ^ symplectic_matrix[i * dim + control_q]
-                    ^ symplectic_matrix[i * dim + cqz] ^ 1
+                    symplectic_matrix[row_idx + target_q]
+                    ^ symplectic_matrix[row_idx + tqz]
+                    ^ symplectic_matrix[row_idx + control_q]
+                    ^ ~symplectic_matrix[row_idx + cqz]
                 )
             )
             ^ (
-                symplectic_matrix[i * dim + control_q]
+                symplectic_matrix[row_idx + control_q]
                 & (
-                    symplectic_matrix[i * dim + target_q]
-                    ^ symplectic_matrix[i * dim + control_q]
-                    ^ symplectic_matrix[i * dim + cqz]
+                    symplectic_matrix[row_idx + target_q]
+                    ^ symplectic_matrix[row_idx + control_q]
+                    ^ symplectic_matrix[row_idx + cqz]
                 )
             )
         );
-        const bool z_control_q = (
-            symplectic_matrix[i * dim + target_q]
-            ^ symplectic_matrix[i * dim + tqz]
-            ^ symplectic_matrix[i * dim + control_q]
+        const unsigned char z_control_q = (
+            symplectic_matrix[row_idx + target_q]
+            ^ symplectic_matrix[row_idx + tqz]
+            ^ symplectic_matrix[row_idx + control_q]
         );
-        const bool z_target_q = (
-            symplectic_matrix[i * dim + target_q]
-            ^ symplectic_matrix[i * dim + cqz]
-            ^ symplectic_matrix[i * dim + control_q]
+        const unsigned char z_target_q = (
+            symplectic_matrix[row_idx + target_q]
+            ^ symplectic_matrix[row_idx + cqz]
+            ^ symplectic_matrix[row_idx + control_q]
         );
-        symplectic_matrix[i * dim + cqz] = z_control_q;
-        symplectic_matrix[i * dim + tqz] = z_target_q;
-        const bool tmp = symplectic_matrix[i * dim + control_q];
-        symplectic_matrix[i * dim + control_q] = symplectic_matrix[i * dim + target_q];
-        symplectic_matrix[i * dim + target_q] = tmp;
+        symplectic_matrix[row_idx + cqz] = z_control_q;
+        symplectic_matrix[row_idx + tqz] = z_target_q;
+        const unsigned char tmp = symplectic_matrix[row_idx + control_q];
+        symplectic_matrix[row_idx + control_q] = symplectic_matrix[row_idx + target_q];
+        symplectic_matrix[row_idx + target_q] = tmp;
     };
 }
 """ + apply_two_qubits_kernel.format(
@@ -511,37 +522,38 @@ def iSWAP(symplectic_matrix, control_q, target_q, nqubits):
 
 
 apply_CY = """
-__device__ void _apply_CY(bool* symplectic_matrix, const int& control_q, const int& target_q, const int& nqubits, const int& cqz, const int& tqz, const int& dim) {
+__device__ void _apply_CY(unsigned char* symplectic_matrix, const int& control_q, const int& target_q, const int& cqz, const int& tqz, const int& nrows, const int& ncolumns) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int ntid = gridDim.x * blockDim.x;
-    const int last = dim - 1;
-    for(int i = tid; i < last; i += ntid) {
-        symplectic_matrix[i * dim + last] = (
-            symplectic_matrix[i * dim + last]
+    const int last = ncolumns - 1;
+    for(int i = tid; i < nrows; i += ntid) {
+        unsigned int row_idx = i * ncolumns;
+        symplectic_matrix[row_idx + last] = (
+            symplectic_matrix[row_idx + last]
             ^ (
-                symplectic_matrix[i * dim + target_q]
-                & (symplectic_matrix[i * dim + tqz] ^ symplectic_matrix[i * dim + target_q])
+                symplectic_matrix[row_idx + target_q]
+                & (symplectic_matrix[row_idx + tqz] ^ symplectic_matrix[row_idx + target_q])
             )
             ^ (
-                symplectic_matrix[i * dim + control_q]
-                & (symplectic_matrix[i * dim + target_q] ^ symplectic_matrix[i * dim + tqz])
-                & (symplectic_matrix[i * dim + cqz] ^ symplectic_matrix[i * dim + target_q] ^ 1)
+                symplectic_matrix[row_idx + control_q]
+                & (symplectic_matrix[row_idx + target_q] ^ symplectic_matrix[row_idx + tqz])
+                & (symplectic_matrix[row_idx + cqz] ^ ~symplectic_matrix[row_idx + target_q])
             )
             ^ (
-                (symplectic_matrix[i * dim + target_q] ^ symplectic_matrix[i * dim + control_q])
-                & (symplectic_matrix[i * dim + tqz] ^ symplectic_matrix[i * dim + target_q])
+                (symplectic_matrix[row_idx + target_q] ^ symplectic_matrix[row_idx + control_q])
+                & (symplectic_matrix[row_idx + tqz] ^ symplectic_matrix[row_idx + target_q])
             )
         );
-        const bool x_target_q = symplectic_matrix[i * dim + control_q] ^ symplectic_matrix[i * dim + target_q];
-        const bool z_control_q = (
-            symplectic_matrix[i * dim + cqz]
-            ^ symplectic_matrix[i * dim + tqz]
-            ^ symplectic_matrix[i * dim + target_q]
+        const unsigned char x_target_q = symplectic_matrix[row_idx + control_q] ^ symplectic_matrix[row_idx + target_q];
+        const unsigned char z_control_q = (
+            symplectic_matrix[row_idx + cqz]
+            ^ symplectic_matrix[row_idx + tqz]
+            ^ symplectic_matrix[row_idx + target_q]
         );
-        const bool z_target_q = symplectic_matrix[i * dim + tqz] ^ symplectic_matrix[i * dim + control_q];
-        symplectic_matrix[i * dim + target_q] = x_target_q;
-        symplectic_matrix[i * dim + cqz] = z_control_q;
-        symplectic_matrix[i * dim + tqz] = z_target_q;
+        const unsigned char z_target_q = symplectic_matrix[row_idx + tqz] ^ symplectic_matrix[row_idx + control_q];
+        symplectic_matrix[row_idx + target_q] = x_target_q;
+        symplectic_matrix[row_idx + cqz] = z_control_q;
+        symplectic_matrix[row_idx + tqz] = z_target_q;
     };
 }
 """ + apply_two_qubits_kernel.format(
@@ -559,55 +571,40 @@ def CY(symplectic_matrix, control_q, target_q, nqubits):
 
 
 _apply_rowsum = """
-__device__ void _apply_rowsum(bool* symplectic_matrix, const long* h, const long* i, const int& nqubits, const bool& determined, const int& nrows, long* g_exp, const int& dim) {
+__device__ void _apply_rowsum(unsigned char* symplectic_matrix, const long* h, const long* i, const int& nqubits, const bool& determined, const int& nrows, long* g_exp, const int& dim) {
     unsigned int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int bid_y = blockIdx.y;
     unsigned int ntid_x = gridDim.x * blockDim.x;
     unsigned int nbid_y = gridDim.y;
     const int last = dim - 1;
     __shared__ int exp;
-    if (threadIdx.x == 0) {
-        exp = 0;
-    }
     for(int j = bid_y; j < nrows; j += nbid_y) {
+        unsigned int row_i = i[j] * dim;
+        unsigned int row_h = h[j] * dim;
         for(int k = tid_x; k < nqubits; k += ntid_x) {
             unsigned int kz = nqubits + k;
-            bool x1_eq_z1 = symplectic_matrix[i[j] * dim + k] == symplectic_matrix[i[j] * dim + kz];
-            bool x1_eq_0 = symplectic_matrix[i[j] * dim + k] == false;
-            if (x1_eq_z1) {
-                if (not x1_eq_0) {
-                    exp += ((int) symplectic_matrix[h[j] * dim + kz]) -
-                        (int) symplectic_matrix[h[j] * dim + k];
-                }
-            } else {
-                if (x1_eq_0) {
-                    exp += ((int) symplectic_matrix[h[j] * dim + k]) * (
-                        1 - 2 * (int) symplectic_matrix[h[j] * dim + kz]
-                    );
-                } else {
-                    exp += ((int) symplectic_matrix[h[j] * dim + kz]) * (
-                        2 * (int) symplectic_matrix[h[j] * dim + k] - 1
-                    );
-                }
-            }
+            exp = (
+                2 * (symplectic_matrix[row_i + k] * symplectic_matrix[row_h + k] * (symplectic_matrix[row_h + kz] - symplectic_matrix[row_i + kz]) +
+                symplectic_matrix[row_i + kz] * symplectic_matrix[row_h + kz] * (symplectic_matrix[row_i + k] - symplectic_matrix[row_h + k]))
+                - symplectic_matrix[row_i + k] * symplectic_matrix[row_h + kz]
+                + symplectic_matrix[row_h + k] * symplectic_matrix[row_i + kz]
+            );
         }
         if (threadIdx.x == 0 && tid_x < nqubits) {
             g_exp[j] += exp;
         }
         __syncthreads();
         if (threadIdx.x == 0 && blockIdx.x == 0) {
-            symplectic_matrix[h[j] * dim + last] = (
-                2 * symplectic_matrix[h[j] * dim + last] + 2 * symplectic_matrix[i[j] * dim + last] + g_exp[j]
+            symplectic_matrix[row_h + last] = (
+                2 * symplectic_matrix[row_h + last] + 2 * symplectic_matrix[row_i + last] + g_exp[j]
             ) % 4 != 0;
         }
         for(int k = tid_x; k < nqubits; k += ntid_x) {
             unsigned int kz = nqubits + k;
-            unsigned int row_i = i[j] * dim;
-            unsigned int row_h = h[j] * dim;
-            bool xi_xh = (
+            unsigned char xi_xh = (
                 symplectic_matrix[row_i + k] ^ symplectic_matrix[row_h + k]
             );
-            bool zi_zh = (
+            unsigned char zi_zh = (
                 symplectic_matrix[row_i + kz] ^ symplectic_matrix[row_h + kz]
             );
             if (determined) {
@@ -625,7 +622,7 @@ def CY(symplectic_matrix, control_q, target_q, nqubits):
 apply_rowsum = f"""
 {_apply_rowsum}
 extern "C"
-__global__ void apply_rowsum(bool* symplectic_matrix, const long* h, const long* i, const int nqubits, const bool determined, const int nrows, long* g_exp, const int dim) {{
+__global__ void apply_rowsum(unsigned char* symplectic_matrix, const long* h, const long* i, const int nqubits, const bool determined, const int nrows, long* g_exp, const int dim) {{
     _apply_rowsum(symplectic_matrix, h, i, nqubits, determined, nrows, g_exp, dim);
 }}
 """
@@ -634,60 +631,89 @@ def CY(symplectic_matrix, control_q, target_q, nqubits):
 
 
 def _rowsum(symplectic_matrix, h, i, nqubits, determined=False):
-    dim = _get_dim(nqubits)
     nrows = len(h)
     exp = cp.zeros(len(h), dtype=int)
+    packed_nqubits = _packed_size(nqubits)  # NOTE(review): the callers visible in this patch already pass _packed_size(nqubits) -- confirm the size is not packed twice
+    row_dim = _dim(packed_nqubits)  # handed to the kernel as `dim`, the multiplier it uses to turn a row index into a flat offset
     apply_rowsum(
         GRIDDIM_2D,
         (BLOCKDIM,),
-        (symplectic_matrix, h, i, nqubits, determined, nrows, exp, dim),
+        (symplectic_matrix, h, i, packed_nqubits, determined, nrows, exp, row_dim),
     )
     return symplectic_matrix
 
 
-def _get_p(state, q, nqubits):
-    dim = _get_dim(nqubits)
-    return state.reshape(dim, dim)[nqubits:-1, q].nonzero()[0]
-
-
 def _random_outcome(state, p, q, nqubits):
-    dim = _get_dim(nqubits)
     p = p[0] + nqubits
-    idx_pq = p * dim + q
-    tmp = state[idx_pq].copy()
-    state[idx_pq] = False
-    h = state.reshape(dim, dim)[:-1, q].nonzero()[0]
-    state[idx_pq] = tmp
+    tmp = state[p, q].copy()
+    state[p, q] = 0  # temporarily clear (p, q) so that row p is not selected into h
+    h = state[:-1, q].nonzero()[0]  # every row except the last one with a nonzero entry in column q
+    state[p, q] = tmp  # restore the entry cleared above
     if h.shape[0] > 0:
+        # The row width changes once the matrix is packed, so `dim` is read only after packing.
+        state = _pack_for_measurements(state, nqubits)
+        dim = state.shape[1]
         state = _rowsum(
-            state,
+            state.ravel(),
             h,
             p.astype(cp.uint) * cp.ones(h.shape[0], dtype=np.uint),
-            nqubits,
+            _packed_size(nqubits),
             False,
         )
-    state = state.reshape(dim, dim)
+        state = _unpack_for_measurements(state.reshape(-1, dim), nqubits)
     state[p - nqubits, :] = state[p, :]
     outcome = cp.random.randint(2, size=None, dtype=cp.uint)
-    state[p, :] = False
-    state[p, -1] = outcome.astype(bool)
-    state[p, nqubits + q] = True
-    return state.ravel(), outcome
+    state[p, :] = 0
+    state[p, -1] = outcome.astype(cp.uint8)  # the random bit goes into the last column of row p
+    state[p, nqubits + q] = 1
+    return state, outcome
 
 
 def _determined_outcome(state, q, nqubits):
-    dim = _get_dim(nqubits)
-    state = state.reshape(dim, dim)
-    state[-1, :] = False
-    idx = state[:nqubits, q].nonzero()[0] + nqubits
+    state[-1, :] = 0  # reset the last row before the rowsum call
+    idx = (state[:nqubits, q].nonzero()[0] + nqubits).astype(np.uint)  # rows among the first nqubits with a nonzero entry in column q, shifted by nqubits
+    state = _pack_for_measurements(state, nqubits)
+    dim = state.shape[1]  # packed row width, needed to undo the ravel() once _rowsum returns
     state = _rowsum(
         state.ravel(),
         (2 * nqubits * cp.ones(idx.shape, dtype=np.uint)).astype(np.uint),
         idx.astype(np.uint),
-        nqubits,
+        _packed_size(nqubits),
         True,
     )
-    return state, state[dim * dim - 1].astype(cp.uint)
+    state = _unpack_for_measurements(state.reshape(-1, dim), nqubits)
+    return state, state[-1, -1]  # second value is the bottom-right entry of the unpacked matrix
+
+
+def _packbits(array, axis):
+    # cupy.packbits does not support `axis` yet, so pack with NumPy on the array returned by `.get()` and wrap the result back with cp.array.
+    return cp.array(numpy.packbits(array.get(), axis=axis), dtype=cp.uint8)
+
+
+def _unpackbits(array, axis):
+    return cp.array(numpy.unpackbits(array.get(), axis=axis), dtype=cp.uint8)  # same NumPy round trip as _packbits, in the unpacking direction
+
+
+def _pack_for_measurements(state, nqubits):
+    r, x, z = _get_rxz(state, nqubits)
+    x = _packbits(x, axis=1)  # pack each row of x along the column axis
+    z = _packbits(z, axis=1)  # same packing for z
+    return np.hstack((x, z, r[:, None]))  # r is appended unpacked as the last column; NOTE(review): the other helpers build arrays with cp -- confirm np.hstack returns a device array here
+
+
+def _unpack_for_measurements(state, nqubits):  # undoes _pack_for_measurements: unpack the x/z bits and drop the padding bits
+    xz = _unpackbits(state[:, :-1], axis=1)  # everything but the last column; unpacked x half followed by unpacked z half
+    z_start = xz.shape[1] // 2  # taken from the data itself: a `-padding_size` stop would be `-0`, i.e. an empty slice, when there is no padding
+    x, z = xz[:, :nqubits], xz[:, z_start : z_start + nqubits]
+    return np.hstack((x, z, state[:, -1][:, None]))
+
+
+def _init_state_for_measurements(state, nqubits, collapse):
+    dim = _dim(nqubits)  # only used by the collapse branch
+    if collapse:
+        return _unpackbits(state[None, :], axis=0)[:dim]  # unpack along a new leading axis and keep the first `dim` rows
+    else:
+        return state.copy()  # returns a copy rather than the input array itself
 
 
 def cast(x, dtype=None, copy=False):
@@ -710,12 +736,12 @@ def cast(x, dtype=None, copy=False):
 
 
 def _clifford_pre_execution_reshape(state):
-    return state.ravel()
+    return _packbits(state, axis=0).ravel()  # bit-pack along axis 0, then flatten to 1-D
 
 
 def _clifford_post_execution_reshape(state, nqubits):
-    dim = _get_dim(nqubits)
-    return state.reshape(dim, dim)
+    dim = _dim(nqubits)
+    return _unpackbits(state.reshape(-1, dim), axis=0)[:dim]  # restore `dim` columns, unpack along axis 0, keep only the first `dim` rows
 
 
 def identity_density_matrix(nqubits, normalize: bool = True):