From dafee801a498242072fa19256d53457a1a59173c Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Tue, 18 Jul 2023 14:11:57 -0700
Subject: [PATCH 1/2] sync with internal repo1 (commit b92afcab0) and repo2
 (commit e18a4a4)

---
 benchmarks/README.md                          |   6 +-
 benchmarks/cuquantum_benchmarks/__init__.py   |   2 +-
 benchmarks/cuquantum_benchmarks/_utils.py     |  66 ++-
 .../cuquantum_benchmarks/backends/__init__.py |   4 +-
 .../backends/backend_cutn.py                  |   5 +-
 .../backends/backend_pny.py                   |  25 +-
 .../backends/backend_qiskit.py                |  18 +-
 .../benchmarks/apply_gen_perm_matrix.py       | 147 +++++++
 .../benchmarks/apply_matrix.py                |  46 +-
 .../benchmarks/cusv_sampler.py                | 102 +++++
 .../benchmarks/tensor_decompose.py            | 116 +++++
 benchmarks/cuquantum_benchmarks/config.py     |  63 +--
 .../frontends/frontend_dumper.py              | 151 +++++++
 .../frontends/frontend_pny.py                 |   6 +-
 .../frontends/frontend_qiskit.py              |   4 +-
 benchmarks/cuquantum_benchmarks/run.py        | 217 ++++++---
 .../cuquantum_benchmarks/run_interface.py     | 414 +++++++++++-------
 benchmarks/setup.py                           |   5 +-
 .../cuquantum_benchmarks_tests/test_run.py    | 203 +++++++--
 .../cutensornet/coarse/example22_mpi_auto.py  |   9 -
 20 files changed, 1238 insertions(+), 371 deletions(-)
 create mode 100644 benchmarks/cuquantum_benchmarks/benchmarks/apply_gen_perm_matrix.py
 create mode 100644 benchmarks/cuquantum_benchmarks/benchmarks/cusv_sampler.py
 create mode 100644 benchmarks/cuquantum_benchmarks/benchmarks/tensor_decompose.py
 create mode 100644 benchmarks/cuquantum_benchmarks/frontends/frontend_dumper.py

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 4ff9780..9ee3894 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -14,7 +14,7 @@ pip install .[all]
 ```
 if running outside of the [cuQuantum Appliance container](https://docs.nvidia.com/cuda/cuquantum/latest/appliance/index.html).
 
-**Note: You may have to build `qsimcirq` and `qiskit-aer` GPU support from source if needed.**
+**Note: You may have to build `qsimcirq`, `qiskit-aer`, and `qulacs` GPU support from source if needed.**
 
 Alternatively, you can choose to manage all (required & optional) dependencies yourself via
 ```
@@ -44,7 +44,7 @@ Starting v0.2.0, we offer subcommands for performing benchmarks at different lev
 
 Alternatively, you can launch the benchmark program via `python -m cuquantum_benchmarks`. This is equivalent to the standalone command, and is useful when, say, `pip` installs this package to the user site-package (so that the `cuquantum-benchmarks` command may not be available without modifying `$PATH`).
 
-For GPU backends, it is preferred that `--ngpus` is explicitly set.
+For GPU backends, it is preferred that `--ngpus N` is explicitly set. On a multi-GPU system, the first `N` GPUs would be used. To limit which GPUs can be accessed by the CUDA runtime, use the environment variable `CUDA_VISIBLE_DEVICES` following the CUDA documentation.
 
 For backends that support MPI parallelism, it is assumed that `MPI_COMM_WORLD` is the communicator, and that `mpi4py` is installed. You can run the benchmarks as you would normally do to launch MPI processes: `mpiexec -n N cuquantum-benchmarks ...`. It is preferred if you fully specify the problem (explicitly set `--benchmark` & `--nqubits`).
 
@@ -70,6 +70,8 @@ Currently all environment variables are reserved for internal use only, and are
 
 * `CUTENSORNET_DUMP_TN=txt`
 * `CUTENSORNET_BENCHMARK_TARGET={amplitude,state_vector,expectation}` (pick one)
+* `CUTENSORNET_APPROX_TN_UTILS_PATH`
+* `CUQUANTUM_BENCHMARKS_DUMP_GATES`
 
 ## Development Overview
 
diff --git a/benchmarks/cuquantum_benchmarks/__init__.py b/benchmarks/cuquantum_benchmarks/__init__.py
index 8b51178..336ad30 100644
--- a/benchmarks/cuquantum_benchmarks/__init__.py
+++ b/benchmarks/cuquantum_benchmarks/__init__.py
@@ -2,4 +2,4 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
-__version__ = '0.2.0'
+__version__ = '0.3.0'
diff --git a/benchmarks/cuquantum_benchmarks/_utils.py b/benchmarks/cuquantum_benchmarks/_utils.py
index c814604..299e198 100644
--- a/benchmarks/cuquantum_benchmarks/_utils.py
+++ b/benchmarks/cuquantum_benchmarks/_utils.py
@@ -5,6 +5,7 @@
 import argparse
 import ctypes
 from dataclasses import dataclass
+import functools
 import math
 import json
 import hashlib
@@ -19,6 +20,7 @@
 
 import cupy as cp
 import numpy as np
+import nvtx
 from cuquantum import cudaDataType, ComputeType
 from cuquantum.cutensornet._internal.einsum_parser import create_size_dict
 import psutil
@@ -29,6 +31,15 @@
 logger = logging.getLogger(logger_name)
 
 
+def wrap_with_nvtx(func, msg):
+    """Add NVTX markers to a function with a message."""
+    @functools.wraps(func)
+    def inner(*args, **kwargs):
+        with nvtx.annotate(msg):  # the annotation is entered and exited around each call of func
+            return func(*args, **kwargs)
+    return inner
+
+
 def reseed(seed=1234):
     random.seed(seed)
     np.random.seed(seed)
@@ -162,11 +173,16 @@ def is_running_mpi():
     return MPI
 
 
-def get_num_processes():
+def get_mpi_size():
     MPI = is_running_mpi()
     return MPI.COMM_WORLD.Get_size() if MPI else 1
 
 
+def get_mpi_rank():  # rank of this process in MPI.COMM_WORLD; 0 when is_running_mpi() returns a falsy value
+    MPI = is_running_mpi()
+    return MPI.COMM_WORLD.Get_rank() if MPI else 0
+
+
 def call_by_root(f, root=0):
     """ Call the callable f only by the root process. """
     MPI = is_running_mpi()
@@ -409,7 +425,7 @@ def dump():
     return full_data
 
 
-def load_benchmark_data(filepath, cache_dir, required_subdirs=()):
+def load_benchmark_data(filepath):
     try:
         with open(filepath, 'r') as f:
             full_data = json.load(f)
@@ -419,17 +435,16 @@ def load_benchmark_data(filepath, cache_dir, required_subdirs=()):
         full_data = {}
         logger.debug(f'{filepath} not found')
 
-        # it could be that the cache dirs are not created yet
-        def create_cache():
-            for subdir in required_subdirs:
-                path = os.path.join(cache_dir, subdir)
-                if not os.path.isdir(path):
-                    os.makedirs(path, exist_ok=True)
-        call_by_root(create_cache)
-
     return full_data
 
 
+def create_cache(cache_dir, required_subdirs):  # make sure cache_dir/<subdir> exists for every required subdir
+    for subdir in required_subdirs:
+        path = os.path.join(cache_dir, subdir)
+        if not os.path.isdir(path):
+            os.makedirs(path, exist_ok=True)  # exist_ok: no error if the directory shows up after the isdir() check
+
+
 # TODO: upstream this to cupyx.profiler.benchmark
 class L2flush:
     """ Handly utility for flushing the current device's L2 cache.
@@ -441,7 +456,7 @@ class L2flush:
     https://github.com/NVIDIA/nvbench/blob/main/nvbench/detail/l2flush.cuh.
     """
     def __init__(self):
-        self.l2_size = cp.cuda.Device().attributes['L2CacheSize']
+        self.l2_size = 3 * cp.cuda.Device().attributes['L2CacheSize']
         self.mem = cp.cuda.alloc(self.l2_size) if self.l2_size > 0 else None
 
     def flush(self):
@@ -496,3 +511,32 @@ class _Result: pass
         result.gpu_times = gpu_times
 
     return result
+
+
+class EarlyReturnError(RuntimeError): pass  # raised by the pennylane-dumper backend once the circuit has been dumped
+
+
+is_unique = lambda a: len(set(a)) == len(a)  # True when a has no repeated entries (entries must be hashable)
+is_disjoint = lambda a, b: not bool(set(a) & set(b))  # True when a and b share no entry
+
+
+def check_targets_controls(targets, controls, n_qubits):
+    # simple checks for targets and controls; any violation raises AssertionError with the message shown
+    assert len(targets) >= 1, "must have at least 1 target qubit"
+    assert is_unique(targets), "qubit indices in targets must be unique"
+    assert is_unique(controls), "qubit indices in controls must be unique"
+    assert is_disjoint(targets, controls), "qubit indices in targets and controls must be disjoint"
+    assert all(0 <= q < n_qubits for q in (*targets, *controls)), f"target and control qubit indices must be in range [0, {n_qubits})"  # unpacking (not +) so list/tuple mixes work
+
+
+def check_sequence(seq, expected_size=None, max_size=None, name=''):
+    if expected_size is not None:  # exact-length mode; takes precedence when both sizes are given
+        assert len(seq) == expected_size, f"the provided {name} must be of length {expected_size}"
+        size = expected_size
+    elif max_size is not None:  # bounded-length mode
+        assert len(seq) <= max_size, f"the provided {name} must have length <= {max_size}"
+        size = max_size
+    else:
+        assert False, "either expected_size or max_size must be specified"
+    assert is_unique(seq), f"the provided {name} must have non-repetitive entries"
+    assert all(0 <= i and i < size for i in seq), f"entries in the {name} must be in [0, {size})"  # size is whichever bound was used above
diff --git a/benchmarks/cuquantum_benchmarks/backends/__init__.py b/benchmarks/cuquantum_benchmarks/backends/__init__.py
index f009f67..0cb2cf2 100644
--- a/benchmarks/cuquantum_benchmarks/backends/__init__.py
+++ b/benchmarks/cuquantum_benchmarks/backends/__init__.py
@@ -4,7 +4,8 @@
 
 from .backend_cirq import Cirq
 from .backend_cutn import cuTensorNet
-from .backend_pny import Pny, PnyLightningGpu, PnyLightningCpu, PnyLightningKokkos
+from .backend_pny import (Pny, PnyLightningGpu, PnyLightningCpu,
+                          PnyLightningKokkos, PnyDumper)
 from .backend_qsim import Qsim, QsimCuda, QsimCusv, QsimMgpu
 from .backend_qiskit import Aer, AerCuda, AerCusv, CusvAer
 from .backend_qulacs import QulacsGpu, QulacsCpu
@@ -29,6 +30,7 @@
     'pennylane-lightning-gpu': PnyLightningGpu,
     'pennylane-lightning-qubit': PnyLightningCpu,
     'pennylane-lightning-kokkos': PnyLightningKokkos,
+    'pennylane-dumper': PnyDumper,
     'qulacs-cpu': QulacsCpu,
     'qulacs-gpu': QulacsGpu,
 }
diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py b/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py
index 76f52d1..ba58fb7 100644
--- a/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py
+++ b/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py
@@ -50,6 +50,7 @@ def __init__(self, ngpus, ncpu_threads, precision, **kwargs):
             # cuQuantum Python 22.07 or below
             opts = cutn.NetworkOptions(handle=self.handle)
         self.network_opts = opts
+        self.n_samples = kwargs.pop('nhypersamples')
 
     def __del__(self):
         cutn.destroy(self.handle)
@@ -104,10 +105,12 @@ def preprocess_circuit(self, circuit, *args, **kwargs):
         t1 = time.perf_counter()
         path, opt_info = self.network.contract_path(
             # TODO: samples may be too large for small circuits
-            optimize={'samples': 512, 'threads': self.ncpu_threads})
+            optimize={'samples': self.n_samples, 'threads': self.ncpu_threads})
         t2 = time.perf_counter()
         time_path = t2 - t1
         logger.info(f'contract_path() took {time_path} s')
+        logger.debug(f'# samples: {self.n_samples}')
+        logger.debug(opt_info)
 
         self.path = path
         self.opt_info = opt_info
diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_pny.py b/benchmarks/cuquantum_benchmarks/backends/backend_pny.py
index ad78332..31f3920 100644
--- a/benchmarks/cuquantum_benchmarks/backends/backend_pny.py
+++ b/benchmarks/cuquantum_benchmarks/backends/backend_pny.py
@@ -7,6 +7,7 @@
 import os
 import time
 import warnings
+import sys
 
 import numpy as np
 try:
@@ -15,7 +16,7 @@
     pennylane = None
 
 from .backend import Backend
-from .._utils import is_running_mpi
+from .._utils import call_by_root, EarlyReturnError, is_running_mpi
 
 
 # set up a logger
@@ -80,6 +81,23 @@ def _make_qnode(self, circuit, nshots=1024, **kwargs):
             if self.ngpus != 0:
                 raise ValueError(f"cannot specify --ngpus for the backend {self.identifier}")
             dev = pennylane.device("default.qubit", wires=self.nqubits, shots=nshots, c_dtype=self.dtype)
+        elif self.identifier == "pennylane-dumper":
+            import cloudpickle
+            import cuquantum_benchmarks
+            cloudpickle.register_pickle_by_value(cuquantum_benchmarks)
+
+            # note: before loading the pickle, one should check if the Python version agrees
+            # (probably pennylane's version too)
+            py_major_minor = f'{sys.version_info.major}.{sys.version_info.minor}'
+            circuit_filename = kwargs.pop('circuit_filename')
+            circuit_filename += f"_pny_raw_py{py_major_minor}.pickle"
+            def dump():
+                logger.info(f"dumping pennylane (raw) circuit as {circuit_filename} ...")
+                with open(circuit_filename, 'wb') as f:
+                    cloudpickle.dump(circuit, f)  # use highest protocol
+                    logger.info("early exiting as the dumper task is completed")
+            call_by_root(dump)
+            raise EarlyReturnError
         else:
             raise ValueError(f"the backend {self.identifier} is not recognized")
 
@@ -89,9 +107,9 @@ def _make_qnode(self, circuit, nshots=1024, **kwargs):
     def preprocess_circuit(self, circuit, *args, **kwargs):
         nshots = kwargs.get('nshots', 1024)
         t1 = time.perf_counter()
-        self.circuit = self._make_qnode(circuit, nshots)
+        self.circuit = self._make_qnode(circuit, nshots, **kwargs)
         t2 = time.perf_counter()
-        time_make_qnode = t2-t1
+        time_make_qnode = t2 - t1
         logger.info(f'make qnode took {time_make_qnode} s')
         return {'make_qnode': time_make_qnode}
 
@@ -107,3 +125,4 @@ def run(self, circuit, nshots=1024):
 PnyLightningCpu = functools.partial(Pennylane, identifier='pennylane-lightning-qubit')
 PnyLightningKokkos = functools.partial(Pennylane, identifier='pennylane-lightning-kokkos')
 Pny = functools.partial(Pennylane, identifier='pennylane')
+PnyDumper = functools.partial(Pennylane, identifier='pennylane-dumper')
diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py b/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py
index 4362b53..2b1bde5 100644
--- a/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py
+++ b/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py
@@ -6,6 +6,7 @@
 import functools
 import logging
 import time
+from importlib.metadata import version
 
 import numpy as np
 import cupy as cp
@@ -15,7 +16,7 @@
     qiskit = None
 
 from .backend import Backend
-from .._utils import get_num_processes
+from .._utils import get_mpi_size, get_mpi_rank
 
 
 # set up a logger
@@ -48,8 +49,8 @@ def run(self, circuit, nshots=1024):
             results = self.backend.run(transpiled_qc, shots=nshots, memory=True)
         else:
             results = self.backend.run(transpiled_qc, shots=0, memory=True)
-        # workaround for memory allocation failure for cusvaer 22.11
-        if self.identifier == 'cusvaer':
+        # workaround for memory allocation failure for cusvaer 22.11/23.03
+        if self.identifier == 'cusvaer' and self._need_sync():
             self._synchronize()
 
         post_res_list = results.result().get_memory()
@@ -169,7 +170,7 @@ def create_aer_backend(self, identifier, ngpus, ncpu_threads, *args, **kwargs):
         return backend
 
     def get_aer_blocking_setup(self, ngpus=None):
-        size = get_num_processes()  # check if running MPI
+        size = get_mpi_size()  # check if running MPI
         if size > 1:
             blocking_enable = True
             if self.identifier == 'aer':
@@ -182,11 +183,16 @@ def get_aer_blocking_setup(self, ngpus=None):
             blocking_qubits = None
         return blocking_enable, blocking_qubits
 
+    def _need_sync(self):  # True when the installed cusvaer reports version 0.x with x <= 2
+        ver_str = version('cusvaer')
+        ver = [int(num) for num in ver_str.split('.')[:2]]  # only major/minor are compared; a suffixed patch field must not break int()
+        return ver[0] == 0 and ver[1] <= 2
+
     def _synchronize(self):
-        nprocs = get_num_processes()
+        my_rank = get_mpi_rank()
         ndevices_in_node = cp.cuda.runtime.getDeviceCount()
         # GPU selected in this process
-        device_id = nprocs % ndevices_in_node
+        device_id = my_rank % ndevices_in_node
         cp.cuda.Device(device_id).synchronize()
 
 
diff --git a/benchmarks/cuquantum_benchmarks/benchmarks/apply_gen_perm_matrix.py b/benchmarks/cuquantum_benchmarks/benchmarks/apply_gen_perm_matrix.py
new file mode 100644
index 0000000..7701979
--- /dev/null
+++ b/benchmarks/cuquantum_benchmarks/benchmarks/apply_gen_perm_matrix.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import logging
+
+import cupy as cp
+import numpy as np
+from cupyx.profiler import benchmark
+
+from cuquantum import custatevec as cusv
+
+from .._utils import (check_sequence, check_targets_controls, dtype_to_cuda_type,
+                      precision_str_to_dtype, wrap_with_nvtx)
+
+
+# set up a logger
+logger_name = "cuquantum-benchmarks"
+logger = logging.getLogger(logger_name)
+
+
+def test_apply_generalized_permutation_matrix(
+        n_qubits, dtype_sv,
+        targets, controls, adjoint,
+        diag, dtype_diag, location_diag,  # for D
+        perm_table, location_perm,        # for P
+        n_warmup, n_repeat, *,
+        benchmark_data=None):  # benchmark_data is accepted but never read in this function
+    # Times cusv.apply_generalized_permutation_matrix and returns the averaged (cpu_time, gpu_time). TODO: allow controlling seed?
+    if diag is False and not perm_table:
+        raise ValueError("need to specify at least --has-diag or --has-perm/--perm-table")
+
+    logger.debug(f"{n_qubits=}")
+    logger.debug(f"{dtype_sv=}")
+    logger.debug(f"{targets=}")
+    logger.debug(f"{controls=}")
+    logger.debug(f"{adjoint=}")
+    logger.debug(f"{diag=}")
+    logger.debug(f"{dtype_diag=}")
+    logger.debug(f"{location_diag=}")
+    if isinstance(perm_table, bool) or len(perm_table) <= 16:  # perm_table is a bool or a sequence; long sequences are not logged
+        logger.debug(f"{perm_table=}")
+    else:
+        logger.debug("perm_table = (omitted due to length)")
+    logger.debug(f"{location_perm=}")
+    logger.debug(f"{n_warmup=}")
+    logger.debug(f"{n_repeat=}")
+
+    check_targets_controls(targets, controls, n_qubits)
+    n_targets = len(targets)
+    n_controls = len(controls)
+
+    # cuStateVec handle initialization
+    handle = cusv.create()
+    stream = cp.cuda.Stream()
+    cusv.set_stream(handle, stream.ptr)
+
+    size_sv = (2 ** n_qubits)
+    dtype_sv = precision_str_to_dtype(dtype_sv)
+    sv = cp.ones((size_sv,), dtype=dtype_sv)  # the state vector is always allocated on the device
+    data_type_sv = dtype_to_cuda_type(dtype_sv)
+
+    # the diagonal matrix can live on either host (np) or device (cp)
+    matrix_dim = (2 ** n_targets)
+    dtype_diag = precision_str_to_dtype(dtype_diag)
+    xp_diag = cp if location_diag == 'device' else np
+    if diag:
+        # it's better to just call rng.uniform(), but it's not there until CuPy v12.0.0
+        # rng_diag = xp_diag.random.default_rng(seed=1234)
+        # diag = rng_diag.uniform(0.7, 1.3, size=matrix_dim).astype(dtype_diag)
+        diag = 0.6 * xp_diag.random.random(size=matrix_dim).astype(dtype_diag) + 0.7  # random entries in [0.7, 1.3)
+        if isinstance(diag, cp.ndarray):
+            diag_ptr = diag.data.ptr
+        elif isinstance(diag, np.ndarray):
+            diag_ptr = diag.ctypes.data
+        else:
+            raise ValueError
+    else:
+        diag_ptr = 0  # null pointer: no diagonal is passed
+    data_type_diag = dtype_to_cuda_type(dtype_diag)
+
+    # the permutation table can live on either host (np) or device (cp)
+    xp_perm = cp if location_perm == 'device' else np
+    if perm_table:
+        if perm_table is True:  # no explicit table given: generate a random one
+            original_perm_table = xp_perm.arange(0, matrix_dim, dtype=xp_perm.int64)
+            perm_table = xp_perm.copy(original_perm_table)
+            # it'd have been nice to seed an rng and call rng.shuffle(), but CuPy does
+            # not support it yet...
+            while True:  # reshuffle until the table differs from the identity ordering
+                xp_perm.random.shuffle(perm_table)
+                # check if the matrix is not diagonal
+                if not (original_perm_table == perm_table).all():
+                    break
+        else:  # a user-provided list
+            check_sequence(perm_table, expected_size=matrix_dim, name="perm_table")
+            perm_table = xp_perm.asarray(perm_table, dtype=xp_perm.int64)
+
+        if isinstance(perm_table, cp.ndarray):
+            perm_table_ptr = perm_table.data.ptr
+        elif isinstance(perm_table, np.ndarray):
+            perm_table_ptr = perm_table.ctypes.data
+        else:
+            raise ValueError
+    else:
+        perm_table_ptr = 0  # null pointer: no permutation table is passed
+
+    cp.cuda.Device().synchronize()  # ensure data prep is done before switching stream
+
+    ####################################################################################
+
+    # manage the workspace
+    workspace_size = cusv.apply_generalized_permutation_matrix_get_workspace_size(
+        handle, data_type_sv, n_qubits, perm_table_ptr, diag_ptr,
+        data_type_diag, targets, n_targets, n_controls)
+
+    with stream:
+        if workspace_size > 0:
+            workspace = cp.cuda.alloc(workspace_size)
+            workspace_ptr = workspace.ptr
+        else:
+            workspace_ptr = 0
+
+        # apply diagonal/permutation gate
+        apply_generalized_permutation_matrix = wrap_with_nvtx(
+            cusv.apply_generalized_permutation_matrix,
+            "apply_generalized_permutation_matrix")
+        args = (
+            handle, sv.data.ptr, data_type_sv, n_qubits, perm_table_ptr,
+            diag_ptr, data_type_diag, adjoint, targets, n_targets,
+            controls, 0,  # TODO: support control bit values
+            n_controls, workspace_ptr, workspace_size)
+        result = benchmark(
+            apply_generalized_permutation_matrix,
+            args,
+            n_warmup=n_warmup, n_repeat=n_repeat)
+
+    # destroy handle
+    cusv.destroy(handle)  # NOTE(review): not reached if anything above raises, so the handle would leak on errors
+
+    logger.debug(str(result))
+    cpu_time = np.average(result.cpu_times)
+    gpu_time = np.average(result.gpu_times[0])  # presumably the row for the single measured device -- confirm against cupyx docs
+    memory_footprint = (2. ** (n_qubits - n_controls)) * 2. * np.dtype(dtype_sv).itemsize  # presumably one read + one write per touched amplitude
+    logger.debug(f"effective bandwidth = {memory_footprint / gpu_time * 1e-9} (GB/s)")
+
+    return cpu_time, gpu_time
diff --git a/benchmarks/cuquantum_benchmarks/benchmarks/apply_matrix.py b/benchmarks/cuquantum_benchmarks/benchmarks/apply_matrix.py
index 0e82db2..dd9c0c1 100644
--- a/benchmarks/cuquantum_benchmarks/benchmarks/apply_matrix.py
+++ b/benchmarks/cuquantum_benchmarks/benchmarks/apply_matrix.py
@@ -10,8 +10,9 @@
 
 from cuquantum import custatevec as cusv
 
-from .._utils import (dtype_to_cuda_type, dtype_to_compute_type, precision_str_to_dtype,
-                      random_unitary, L2flush, benchmark_with_prerun)
+from .._utils import (benchmark_with_prerun, check_targets_controls, dtype_to_cuda_type,
+                      dtype_to_compute_type, L2flush, precision_str_to_dtype,
+                      random_unitary, wrap_with_nvtx)
 
 
 # set up a logger
@@ -21,33 +22,26 @@
 
 def test_apply_matrix(
         n_qubits, targets, controls, dtype_sv, dtype_mat, layout, adjoint,
-        n_warmup, n_repeat, location, *, flush_l2=False):
-    logger.debug(f"{n_qubits = }")
-    logger.debug(f"{targets = }")
-    logger.debug(f"{controls = }")
-    logger.debug(f"{dtype_sv = }")
-    logger.debug(f"{dtype_mat = }")
-    logger.debug(f"{layout = }")
-    logger.debug(f"{adjoint = }")
-    logger.debug(f"{location = }")
-    logger.debug(f"{n_warmup = }")
-    logger.debug(f"{n_repeat = }")
-    logger.debug(f"{flush_l2 = }")
+        n_warmup, n_repeat, location, *,
+        flush_l2=False, benchmark_data=None):
+    logger.debug(f"{n_qubits=}")
+    logger.debug(f"{targets=}")
+    logger.debug(f"{controls=}")
+    logger.debug(f"{dtype_sv=}")
+    logger.debug(f"{dtype_mat=}")
+    logger.debug(f"{layout=}")
+    logger.debug(f"{adjoint=}")
+    logger.debug(f"{location=}")
+    logger.debug(f"{n_warmup=}")
+    logger.debug(f"{n_repeat=}")
+    logger.debug(f"{flush_l2=}")
     
     dtype_sv = precision_str_to_dtype(dtype_sv)
     dtype_mat = precision_str_to_dtype(dtype_mat)
     xp = cp if location == 'device' else np
     layout = cusv.MatrixLayout.ROW if layout == "row" else cusv.MatrixLayout.COL
 
-    # simple sanity checks
-    assert len(targets) >= 1, "must have at least 1 target qubit"
-    _targets = set(targets)
-    assert len(_targets) == len(targets), "target qubit IDs cannot overlap"
-    _controls = set(controls)
-    assert len(_controls) == len(controls), "control qubits IDs cannot overlap"
-    assert len(_targets & _controls) == 0, "targets and controls cannot overlap"
-    _involved = targets + controls
-    assert 0 <= min(_involved) and max(_involved) < n_qubits, f"involved qubit IDs must be in range [0, {n_qubits})"
+    check_targets_controls(targets, controls, n_qubits)
 
     size_sv = 2**n_qubits
     n_targets = len(targets)
@@ -109,19 +103,21 @@ def test_apply_matrix(
                 controls, 0, n_controls,  # TODO: support control bit values
                 compute_type, workspace_ptr, workspace_size)
 
+        apply_matrix = wrap_with_nvtx(cusv.apply_matrix, "apply_matrix")
+
         if flush_l2:
             l2flusher = L2flush()
             def f(*args, **kwargs):
                 l2flusher.flush()  # clear L2 cache
 
             result = benchmark_with_prerun(
-                cusv.apply_matrix,
+                apply_matrix,
                 args,
                 n_warmup=n_warmup, n_repeat=n_repeat,
                 pre_run=f)
         else:
             result = benchmark(
-                cusv.apply_matrix,
+                apply_matrix,
                 args,
                 n_warmup=n_warmup, n_repeat=n_repeat)
     
diff --git a/benchmarks/cuquantum_benchmarks/benchmarks/cusv_sampler.py b/benchmarks/cuquantum_benchmarks/benchmarks/cusv_sampler.py
new file mode 100644
index 0000000..d404454
--- /dev/null
+++ b/benchmarks/cuquantum_benchmarks/benchmarks/cusv_sampler.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import logging
+
+import numpy as np
+import cupy as cp
+from cupyx.profiler import benchmark
+
+from cuquantum import custatevec as cusv
+
+from .._utils import (check_sequence, dtype_to_cuda_type, precision_str_to_dtype,
+                      wrap_with_nvtx)
+
+
+# set up a logger
+logger_name = "cuquantum-benchmarks"
+logger = logging.getLogger(logger_name)
+
+
+def test_cusv_sampler(
+        n_qubits, dtype_sv, bit_ordering, n_shots, output_order, n_warmup, n_repeat, *,
+        benchmark_data=None):  # times sampler_preprocess + sampler_sample; benchmark_data is accepted but never read
+    logger.debug(f"{n_qubits=}")
+    logger.debug(f"{dtype_sv=}")
+    logger.debug(f"{bit_ordering=}")
+    logger.debug(f"{n_shots=}")
+    logger.debug(f"{output_order=}")
+    logger.debug(f"{n_warmup=}")
+    logger.debug(f"{n_repeat=}")
+
+    check_sequence(bit_ordering, max_size=n_qubits, name="bit_ordering")
+    dtype_sv = precision_str_to_dtype(dtype_sv)
+    size_sv = (1 << n_qubits)
+
+    # the statevector must reside on device
+    sv = cp.ones((size_sv,), dtype=dtype_sv)
+    sv /= np.sqrt(size_sv)  # uniform amplitudes, normalized so the squared magnitudes sum to 1
+    # assert cp.allclose(cp.sum(cp.abs(sv)**2), 1)
+    data_type_sv = dtype_to_cuda_type(dtype_sv)
+
+    # the output bitstrings must reside on host
+    bit_strings = np.empty((n_shots,), dtype=np.int64)
+
+    # the random numbers (one per shot) must be a host array
+    randnums = np.random.random((n_shots,)).astype(np.float64)
+
+    cp.cuda.Device().synchronize()  # ensure data prep is done before switching stream
+
+    ####################################################################################
+
+    # cuStateVec handle initialization
+    handle = cusv.create()
+    stream = cp.cuda.Stream()
+    cusv.set_stream(handle, stream.ptr)
+
+    # create sampler and check the size of external workspace
+    sampler, workspace_size = cusv.sampler_create(
+        handle, sv.data.ptr, data_type_sv, n_qubits, n_shots)
+
+    with stream:
+        # manage the workspace
+        if workspace_size > 0:
+            workspace = cp.cuda.alloc(workspace_size)
+            workspace_ptr = workspace.ptr
+        else:
+            workspace_ptr = 0  # null pointer: no external workspace requested
+
+        # sample preprocess
+        sampler_preprocess = wrap_with_nvtx(
+            cusv.sampler_preprocess, "sampler_preprocess")
+        args = (handle, sampler, workspace_ptr, workspace_size)
+
+        result1 = benchmark(
+            sampler_preprocess,
+            args,
+            n_warmup=n_warmup, n_repeat=n_repeat)
+        logger.debug(str(result1))
+
+        # sample bit strings
+        sampler_sample = wrap_with_nvtx(
+            cusv.sampler_sample, "sampler_sample")
+        args = (
+            handle, sampler, bit_strings.ctypes.data, bit_ordering, len(bit_ordering),
+            randnums.ctypes.data, n_shots,
+            cusv.SamplerOutput.RANDNUM_ORDER if output_order == "random" else cusv.SamplerOutput.ASCENDING_ORDER)
+
+        result2 = benchmark(
+            sampler_sample,
+            args,
+            n_warmup=n_warmup, n_repeat=n_repeat)
+        logger.debug(str(result2))
+
+    # clean up
+    cusv.sampler_destroy(sampler)
+    cusv.destroy(handle)
+
+    cpu_time = np.average(result1.cpu_times) + np.average(result2.cpu_times)  # preprocess + sample
+    gpu_time = np.average(result1.gpu_times[0]) + np.average(result2.gpu_times[0])
+
+    return cpu_time, gpu_time
diff --git a/benchmarks/cuquantum_benchmarks/benchmarks/tensor_decompose.py b/benchmarks/cuquantum_benchmarks/benchmarks/tensor_decompose.py
new file mode 100644
index 0000000..bb711ba
--- /dev/null
+++ b/benchmarks/cuquantum_benchmarks/benchmarks/tensor_decompose.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import logging
+import os
+import sys
+
+import cupy as cp
+import numpy as np
+from cupyx.profiler import benchmark
+
+import cuquantum.cutensornet as cutn
+from cuquantum.cutensornet import tensor
+
+from .._utils import precision_str_to_dtype, wrap_with_nvtx
+try:
+    path = os.environ.get('CUTENSORNET_APPROX_TN_UTILS_PATH', '')
+    if path and os.path.isfile(path):
+        sys.path.insert(1, os.path.dirname(path))
+    from approxTN_utils import tensor_decompose
+except ImportError:
+    tensor_decompose = None
+
+
+# set up a logger
+logger_name = "cuquantum-benchmarks"
+logger = logging.getLogger(logger_name)
+
+
+def benchmark_tensor_decompose(
+        expr, shape, precision, is_complex, method, algorithm, n_warmup, n_repeats, check_ref, *,
+        benchmark_data=None):  # times tensor.decompose; with check_ref, reference timings are stored in benchmark_data
+    logger.debug(f"{expr=}")
+    logger.debug(f"{shape=}")
+    logger.debug(f"{precision=}")
+    logger.debug(f"{is_complex=}")
+    logger.debug(f"{method=}")
+    logger.debug(f"{algorithm=}")
+    logger.debug(f"{n_warmup=}")
+    logger.debug(f"{n_repeats=}")
+    logger.debug(f"{check_ref=}")
+
+    cp.random.seed(5678)  # TODO: set me
+    handle = cutn.create()
+    options = {'handle': handle}
+    decomp_subscripts = expr
+    benchmark_data = {} if benchmark_data is None else benchmark_data  # the default None must not break the check_ref stores below
+    # sanity checks
+    expr_in = expr.split('->')[0]
+    assert len(expr_in) == len(shape), \
+           f"the input shape {shape} mismatches with the input modes {expr_in}"
+    if check_ref and tensor_decompose is None:  # the optional approxTN_utils import failed at module load
+        raise RuntimeError("--check-reference is not supported")
+
+    dtype_r = precision_str_to_dtype(precision, False)
+    t_in = cp.random.random(shape, dtype=dtype_r)
+    if is_complex:
+        dtype = precision_str_to_dtype(precision)
+        t_in = t_in.astype(dtype)
+        t_in += 1j*cp.random.random(shape, dtype=dtype_r)  # add an independent random imaginary part
+        assert t_in.dtype == dtype
+
+    t_numpy = t_in.get()  # host copy of the input, used for the NumPy reference run
+
+    if method == "QR":
+        kwargs = {'options': options}
+        if check_ref:
+            options_ref = {'method':'qr'}
+    elif method == "SVD":
+        try:
+            kwargs = {'options': options, 'method': tensor.SVDMethod(algorithm=algorithm)}
+        except TypeError as e:  # this SVDMethod does not accept the algorithm keyword
+            if algorithm != "gesvd":
+                raise ValueError(f"{algorithm} requires cuQuantum v23.06+") from e
+            else:
+                kwargs = {'options': options, 'method': tensor.SVDMethod()}
+        if check_ref:
+            options_ref = {'method':'svd'}
+    else:
+        assert False
+    cp.cuda.Device().synchronize()  # ensure data prep is done
+
+    decompose = wrap_with_nvtx(tensor.decompose, "decompose")
+
+    results = benchmark(decompose,
+                        (decomp_subscripts, t_in), kwargs=kwargs,
+                        n_repeat=n_repeats, n_warmup=n_warmup)
+
+    if check_ref:
+        decompose_ref = wrap_with_nvtx(tensor_decompose, "tensor_decompose")
+
+        results_cupy = benchmark(decompose_ref,
+                                 (decomp_subscripts, t_in), kwargs=options_ref,
+                                 n_repeat=n_repeats, n_warmup=n_warmup)
+
+        results_numpy = benchmark(decompose_ref,
+                                  (decomp_subscripts, t_numpy), kwargs=options_ref,
+                                  n_repeat=n_repeats, n_warmup=n_warmup)
+
+    cutn.destroy(handle)
+
+    logger.debug(str(results))
+    if check_ref:
+        logger.debug("ref (CuPy):")
+        logger.debug(str(results_cupy))
+        benchmark_data['cupy_time'] = max(
+            np.average(results_cupy.cpu_times), np.average(results_cupy.gpu_times[0]))
+        logger.debug("ref (NumPy):")
+        logger.debug(str(results_numpy))
+        benchmark_data['numpy_time'] = np.average(results_numpy.cpu_times)
+
+    cpu_time = np.average(results.cpu_times)
+    gpu_time = np.average(results.gpu_times[0])
+
+    return cpu_time, gpu_time
diff --git a/benchmarks/cuquantum_benchmarks/config.py b/benchmarks/cuquantum_benchmarks/config.py
index 178f0d9..04581c6 100644
--- a/benchmarks/cuquantum_benchmarks/config.py
+++ b/benchmarks/cuquantum_benchmarks/config.py
@@ -23,12 +23,6 @@
 
     'qft': {
         'benchmark': QFT,
-        'nqubits': {
-            'default': list(range(16, 32, 4)) + [30],
-            '3090': list(range(16, 32, 4)) + [30],
-            'A6000': list(range(16, 32, 4)) + [30],
-            'A100-SXM4-80GB': list(range(16, 34, 2)) + [33],
-        },
         'config': {
             'measure': True,
         },
@@ -36,12 +30,6 @@
 
     'iqft': {
         'benchmark': IQFT,
-        'nqubits': {
-            'default': list(range(16, 32, 4)) + [30],
-            '3090': list(range(16, 32, 4)) + [30],
-            'A6000': list(range(16, 32, 4)) + [30],
-            'A100-SXM4-80GB': list(range(16, 34, 2)) + [33],
-        },
         'config': {
             'measure': True,
         },
@@ -49,12 +37,6 @@
 
     'ghz': {
         'benchmark': GHZ,
-        'nqubits': {
-            'default': list(range(16, 32, 4)) + [30],
-            '3090': list(range(16, 32, 4)) + [30],
-            'A6000': list(range(16, 32, 4)) + [30],
-            'A100-SXM4-80GB': list(range(16, 34, 2)) + [33],
-        },
         'config': {
             'measure': True,
         },
@@ -62,12 +44,6 @@
 
     'simon': {
         'benchmark': Simon,
-        'nqubits': {
-            'default': list(range(6, 16, 2)) + [15],
-            '3090': list(range(6, 16, 2)) + [15],
-            'A6000': list(range(6, 16, 2)) + [15],
-            'A100-SXM4-80GB': list(range(6, 17, 1)),
-        },
         'config': {
             'measure': True,
         },
@@ -75,12 +51,6 @@
 
     'hidden_shift': {
         'benchmark': HiddenShift,
-        'nqubits': {
-            'default': list(range(16, 32, 4)) + [30],
-            '3090': list(range(16, 32, 4)) + [30],
-            'A6000': list(range(16, 32, 4)) + [30],
-            'A100-SXM4-80GB': list(range(16, 34, 2)) + [33],
-        },
         'config': {
             'measure': True,
         },
@@ -88,12 +58,6 @@
 
     'qaoa': {
         'benchmark': QAOA,
-        'nqubits': {
-            'default': list(range(16, 32, 4)) + [30],
-            '3090': list(range(16, 32, 4)) + [30],
-            'A6000': list(range(16, 32, 4)) + [30],
-            'A100-SXM4-80GB': list(range(16, 34, 2)) + [33],
-        },
         'config': {
             'measure': True,
             'p': 1,
@@ -102,12 +66,6 @@
 
     'qpe': {
         'benchmark': QPE,
-        'nqubits': {
-            'default': list(range(16, 32, 4)) + [30],
-            '3090': list(range(16, 32, 4)) + [30],
-            'A6000': list(range(16, 32, 4)) + [30],
-            'A100-SXM4-80GB': list(range(16, 34, 2)),
-        },
         'config': {
             'measure': True,
             'unfold': False,
@@ -116,9 +74,6 @@
 
     'quantum_volume': {
         'benchmark': QuantumVolume,
-        'nqubits': {
-            'default': list(range(16, 32, 4)) + [30],
-        },
         'config': {
             'measure': True,
         },
@@ -126,12 +81,6 @@
 
     'random': {
         'benchmark': Random,
-        'nqubits': {
-            'default': list(range(16, 32, 4)) + [30],
-            '3090': list(range(16, 32, 4)) + [30],
-            'A6000': list(range(16, 32, 4)) + [30],
-            'A100-SXM4-80GB': list(range(16, 34, 2)),
-        },
         'config': {
             'measure': True,
         },
@@ -152,6 +101,7 @@
             # TODO: even this may not be a good default
             'ncputhreads': multiprocessing.cpu_count() // 2,
             'precision': 'single',
+            'nhypersamples': 32,
         },
     },
 
@@ -295,6 +245,17 @@
         },
     },
 
+    # dummy
+    'pennylane-dumper': {
+        'config': {
+            'nshots': 1024,
+            'nfused': None,
+            'ngpus': 0,
+            'ncputhreads': 1,
+            'precision': 'single',
+        },
+    },
+
     'qulacs-gpu': {
         'config': {
             'nshots': 1024,
diff --git a/benchmarks/cuquantum_benchmarks/frontends/frontend_dumper.py b/benchmarks/cuquantum_benchmarks/frontends/frontend_dumper.py
new file mode 100644
index 0000000..74217d2
--- /dev/null
+++ b/benchmarks/cuquantum_benchmarks/frontends/frontend_dumper.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import cmath
+import logging
+from math import pi
+
+import numpy as np
+
+from .frontend import Frontend
+from .._utils import call_by_root
+
+
+# module-level logger; uses the same "cuquantum-benchmarks" name as the logger in run.py
+logger_name = "cuquantum-benchmarks"
+logger = logging.getLogger(logger_name)
+
+
+class Dumper(Frontend):
+    """Special frontend for dumping the gate sequence as pure text to disk.
+
+    Each gate (or operation) would be stored as 3 lines, with elements separated by whitespace:
+
+      1. n_targets n_controls
+      2. targets controls
+      3. contiguity actual_matrix_data
+
+    Note that the qubit IDs are zero-based. The matrix data is flattened to a 1D contiguous
+    array of length 2**(2*n_targets). The contiguity is a single character "C" (for C-major,
+    or row-major) or "F" (for Fortran-major, or column-major) for how to interpret the matrix.
+    All complex numbers are stored as two real numbers (ex: 0.5-0.1j -> "0.5 -0.1").
+
+    As an example, a CCX gate acting on qubit 0 and controlled by qubits 2 & 4 is stored as
+
+      '''
+      1 2\n
+      0 2 4\n
+      C 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0\n
+      '''
+
+    Currently the measurement operation at the end of the gate sequence is not stored.
+
+    An empty line can be used to separate different gates/operations and improve readability,
+    but it is not required.
+    """
+
+    def __init__(self, nqubits, config):
+        """Read 'precision' and 'circuit_filename' from ``config``; nothing is written here."""
+        precision = config['precision']
+        self.dtype = np.dtype(np.complex64 if precision == 'single' else np.complex128)
+        circuit_filename = config['circuit_filename']
+        self.circuit_filename = circuit_filename.replace('.pickle', '_raw.txt')
+        self.nqubits = nqubits
+        self.order = 'C'  # TODO
+        self.digits = 12  # TODO
+
+    def _dump_op(self, op, targets, controls=()):
+        """Return the 3-line text record (plus a blank separator line) for one operation."""
+        flat = op.astype(self.dtype).reshape(-1, order=self.order).view(self.dtype.char.lower())
+        op = np.array2string(
+            flat, max_line_width=np.inf, precision=self.digits,
+            threshold=flat.size,  # never let numpy abbreviate big matrices with "..."
+        )
+        if isinstance(targets, int):
+            targets = (targets,)
+        if isinstance(controls, int):
+            controls = (controls,)
+
+        op_data = f"{len(targets)} {len(controls)}\n"
+        for t in targets:
+            op_data += f"{t} "
+        for c in controls:
+            op_data += f"{c} "
+        op_data += f"\n{self.order} {op[1:-1]}\n\n"  # op[1:-1] strips the enclosing "[" and "]"
+
+        return op_data
+
+    def _get_rotation_matrix(self, theta, phi, lam):
+        """Return the 2x2 gate matrix (in ``self.dtype``) for the angles theta, phi and lam."""
+        matrix = np.empty((2, 2), dtype=self.dtype)
+        theta *= 0.5
+        matrix[0, 0] = cmath.cos(theta)
+        matrix[0, 1] = - cmath.sin(theta) * cmath.exp(1j*lam)
+        matrix[1, 0] = cmath.sin(theta) * cmath.exp(1j*phi)
+        matrix[1, 1] = cmath.cos(theta) * cmath.exp(1j*(phi+lam))
+        return matrix
+
+    def generateCircuit(self, gateSeq):
+        """Serialize ``gateSeq`` and write it to ``self.circuit_filename`` via ``call_by_root``."""
+        circuit = ''
+        for g in gateSeq:
+            if g.id == 'h':
+                circuit += self._dump_op(
+                    np.asarray([[1, 1], [1, -1]])/np.sqrt(2), g.targets)
+
+            elif g.id == 'x':
+                circuit += self._dump_op(
+                    np.asarray([[0, 1], [1, 0]]), g.targets)
+
+            elif g.id == 'cnot':
+                # TODO: use 4*4 version (merge targets & controls)?
+                circuit += self._dump_op(
+                    np.asarray([[0, 1], [1, 0]]), g.targets, g.controls)
+
+            elif g.id == 'cz':
+                # TODO: use 4*4 version (merge targets & controls)?
+                circuit += self._dump_op(
+                    np.asarray([[1, 0], [0, -1]]), g.targets, g.controls)
+
+            elif g.id == 'rz':
+                circuit += self._dump_op(
+                    self._get_rotation_matrix(0, g.params, 0), g.targets)
+
+            elif g.id == 'rx':
+                circuit += self._dump_op(
+                    self._get_rotation_matrix(g.params, -pi/2, pi/2), g.targets)
+
+            elif g.id == 'ry':
+                circuit += self._dump_op(
+                    self._get_rotation_matrix(g.params, 0, 0), g.targets)
+
+            elif g.id == 'czpowgate':
+                matrix = np.eye(2, dtype=self.dtype)
+                matrix[1, 1] = cmath.exp(1j*pi*g.params)
+                circuit += self._dump_op(matrix, g.targets, g.controls)
+
+            elif g.id == 'swap':
+                assert len(g.targets) == 2
+                matrix = np.eye(4, dtype=self.dtype)
+                matrix[1:3, 1:3] = [[0, 1], [1, 0]]
+                circuit += self._dump_op(matrix, g.targets)
+
+            elif g.id == 'cu':
+                circuit += self._dump_op(g.matrix, g.targets, g.controls)
+
+            elif g.id == 'u':
+                circuit += self._dump_op(g.matrix, g.targets)
+
+            elif g.id == 'measure':
+                pass  # treated as no-op for now
+
+            else:
+                raise NotImplementedError(f"the gate type {g.id} is not defined")
+
+        def dump():
+            logger.info(f"dumping (raw) circuit as {self.circuit_filename} ...")
+            with open(self.circuit_filename, 'w') as f:
+                f.write(circuit)
+
+        call_by_root(dump)
diff --git a/benchmarks/cuquantum_benchmarks/frontends/frontend_pny.py b/benchmarks/cuquantum_benchmarks/frontends/frontend_pny.py
index fe5c32b..1265262 100644
--- a/benchmarks/cuquantum_benchmarks/frontends/frontend_pny.py
+++ b/benchmarks/cuquantum_benchmarks/frontends/frontend_pny.py
@@ -25,7 +25,7 @@ def __init__(self, nqubits, config):
     def generateCircuit(self, gateSeq):
         last_g = gateSeq[-1]
         assert last_g.id == "measure"  # TODO: relax this?
-        
+
         def circuit():
             measured_qs = None
 
@@ -71,5 +71,5 @@ def circuit():
                     raise NotImplementedError(f"The gate type {g.id} is not defined")
             
             return pennylane.sample(wires=measured_qs) 
-    
-        return circuit
\ No newline at end of file
+ 
+        return circuit
diff --git a/benchmarks/cuquantum_benchmarks/frontends/frontend_qiskit.py b/benchmarks/cuquantum_benchmarks/frontends/frontend_qiskit.py
index 98a8211..8c4230a 100644
--- a/benchmarks/cuquantum_benchmarks/frontends/frontend_qiskit.py
+++ b/benchmarks/cuquantum_benchmarks/frontends/frontend_qiskit.py
@@ -57,12 +57,12 @@ def generateCircuit(self, gateSeq):
 
             elif g.id == 'cu':
                 U_gate = UnitaryGate(g.matrix, g.name).control(1)
-                circuit.append(U_gate, [g.controls]+g.targets)
+                circuit.append(U_gate, [g.controls]+g.targets[::-1])
 
             elif g.id == 'u':
                 # TODO: give the gate a name?
                 U_gate = UnitaryGate(g.matrix)
-                circuit.append(U_gate, g.targets)
+                circuit.append(U_gate, g.targets[::-1])
 
             elif g.id == 'measure':
                 circuit.measure(g.targets, g.targets)
diff --git a/benchmarks/cuquantum_benchmarks/run.py b/benchmarks/cuquantum_benchmarks/run.py
index eb0a80a..6f7daa3 100644
--- a/benchmarks/cuquantum_benchmarks/run.py
+++ b/benchmarks/cuquantum_benchmarks/run.py
@@ -11,8 +11,9 @@
 from .config import benchmarks
 from .config import backends as backend_config
 from .frontends import frontends
-from .run_interface import run_interface, BenchApiRunner
-from ._utils import str_to_seq, MPHandler, RawTextAndDefaultArgFormatter
+from .run_interface import BenchApiRunner, BenchCircuitRunner
+from ._utils import (EarlyReturnError, MPHandler, RawTextAndDefaultArgFormatter,
+                     str_to_seq,)
 
 
 frontend_names = [f for f in frontends.keys()]
@@ -20,13 +21,13 @@
 benchmark_names = [b for b in benchmarks.keys()]
 
 
-main_description = api_description = r"""
+main_description = api_description = circuit_description = r"""
 =============== NVIDIA cuQuantum Performance Benchmark Suite ===============
 """
 
 
-circuit_description = r"""
-=============== NVIDIA cuQuantum Performance Benchmark Suite ===============
+circuit_description += r"""
+Note: all frontends and backends are optional and unavailable for use unless installed.
 
 Supported Frontends:
 
@@ -57,8 +58,6 @@
   - qulacs-gpu: runs the Qulacs GPU backend
   - qulacs-cpu: runs the Qulacs CPU backend
 
-Note: all frontends and backends are optional and unavailable for use unless installed.
-
 ============================================================================
 """
 
@@ -82,9 +81,8 @@
                             help=f'set the simulator frontend')
 parser_circuit.add_argument('--backend', type=str, required=True, choices=backend_names,
                             help=f'set the simulator backend that is compatible with the frontend')
-# TODO
-#parser.add_argument('--append', help='only add to existing benchmarking data rather than overwrite any data', action='store_true')
 parser_circuit.add_argument('--new', help='create a new circuit rather than use existing circuit', action='store_true')
+
 # these options make sense to both circuit & api benchmarks, for better UX we need to copy/paste
 parser_circuit.add_argument('--cachedir', type=str, default='.', help='set the directory to cache generated data')
 parser_circuit.add_argument('--nqubits', type=int, help='set the number of qubits for each benchmark (circuit/api)')
@@ -100,30 +98,33 @@
 backend.add_argument('--nfused', type=int, help='set the maximum number of fused qubits for gate matrix fusion')
 backend.add_argument('--precision', type=str, choices=('single', 'double'),
                      help='set the floating-point precision')
-backend.add_argument('--cusvaer-global-index-bits', type=str_to_seq, nargs='?', const='', default=-1,
+
+backend_cusvaer = parser_circuit.add_argument_group('cusvaer-specific options')
+backend_cusvaer.add_argument('--cusvaer-global-index-bits', type=str_to_seq, nargs='?', const='', default=-1,
                      help='set the global index bits to specify the inter-node network structure.  Please refer to the '
                           'cusvaer backend documentation for further details. If not followed by any argument, '
                           'the default (empty sequence) is used; '
                           'otherwise, the argument should be a comma-separated string. '
                           'Setting this option is mandatory for the cusvaer backend and an error otherwise')
-backend.add_argument('--cusvaer-p2p-device-bits', type=int, nargs='?', const=0, default=-1,
+backend_cusvaer.add_argument('--cusvaer-p2p-device-bits', type=int, nargs='?', const=0, default=-1,
                      help='set the number of p2p device bits.  Please refer to the cusvaer backend documentation '
                           'for further details. If not followed by any argument, the default (0) is used. '
                           'Setting this option is mandatory for the cusvaer backend and an error otherwise')
-backend.add_argument('--cusvaer-data-transfer-buffer-bits', type=int, default=26,
+backend_cusvaer.add_argument('--cusvaer-data-transfer-buffer-bits', type=int, default=26,
                      help='set the size of the data transfer buffer in cusvaer.  The size is '
                           'specified as a positive integer.  The buffer sized used is (1 << [#bits]). '
                           'The default is 26 (64 MiB = 1 << 26)')
-backend.add_argument('--cusvaer-comm-plugin-type', type=str, nargs='?', default='mpi_auto',
+backend_cusvaer.add_argument('--cusvaer-comm-plugin-type', type=str, nargs='?', default='mpi_auto',
                      choices=['mpi_auto', 'mpi_openmpi', 'mpi_mpich', 'external', 'self'],
                      help='set the type of comm plugin used for multi-process simulation. '
-                          'Required to set this option when one needs to use a custom comm plugin. '
-                          'Acceptable values are mpi_auto, mpi_openmpi, mpi_mpich and external. '
-                          'The default is mpi_auto.')
-backend.add_argument('--cusvaer-comm-plugin-soname', type=str, nargs='?', default='',
+                          'Required to set this option when one needs to use a custom comm plugin.')
+backend_cusvaer.add_argument('--cusvaer-comm-plugin-soname', type=str, nargs='?', default='',
                      help='specify the name of a shared library used for inter-process communication. '
                           'Required to set this option when one needs to use a custom comm plugin')
 
+backend_cutn = parser_circuit.add_argument_group('cutn-specific options')
+backend_cutn.add_argument('--nhypersamples', type=int, default=32, help='set the number of hypersamples for the pathfinder to explore')
+
 
 # "cuquantum-benchmarks api" subcommand
 parser_api = subparsers.add_parser(
@@ -131,38 +132,130 @@
     description=api_description,
     help="benchmark different APIs from cuQuantum's libraries",
     formatter_class=RawTextAndDefaultArgFormatter)
-parser_api.add_argument('--benchmark', type=str, choices=('apply_matrix',),
-                        help=f'pick the API to benchmark')
+parser_api.add_argument('--benchmark', type=str, required=True,
+                        choices=BenchApiRunner.supported_apis,
+                        help=f'pick the API to benchmark. Specify a benchmark with -h/--help can see detailed help message.')
 parser_api.add_argument('--precision', type=str, choices=('single', 'double'), default='single',
                         help='set the floating-point precision')
-apply_matrix = parser_api.add_argument_group('apply_matrix-specific options')
-
-targets = apply_matrix.add_mutually_exclusive_group(required=True)
-targets.add_argument('--targets', type=str_to_seq,
-                     help="set the (comma-separated) target qubit IDs")
-targets.add_argument('--ntargets', type=int, help='set the number of target qubits')
-
-controls = apply_matrix.add_mutually_exclusive_group(required=False)
-controls.add_argument('--controls', type=str_to_seq,
-                      help="set the (comma-separated) control qubit IDs")
-controls.add_argument('--ncontrols', type=int, help='set the number of target qubits')
-
-apply_matrix.add_argument('--layout', type=str, choices=('row', 'column'), default='row',
-                          help='set the gate matrix layout')
-apply_matrix.add_argument('--adjoint', action='store_true', help='apply the matrix adjoint')
-apply_matrix.add_argument('--location', type=str, choices=('device', 'host'), default='host',
-                          help='set the location of the gate matrix')
-apply_matrix.add_argument('--nqubits', type=int, required=True,
-                          help='set the total number of qubits')
-apply_matrix.add_argument('--flush-cache', action='store_true', help='flush the L2 cache for more accurate timing')
-
 # these options make sense to both circuit & api benchmarks, for better UX we need to copy/paste
+# TODO: set the arguments programmatically to avoid dups
 parser_api.add_argument('--cachedir', type=str, default='.', help='set the directory to cache generated data')
 parser_api.add_argument('--nwarmups', type=int, default=3, help='set the number of warm-up runs for each benchmark')
 parser_api.add_argument('--nrepeats', type=int, default=10, help='set the number of repetitive runs for each benchmark')
 parser_api.add_argument('-v', '--verbose', help='output extra information during benchmarking', action='store_true')
 
 
+# True once add_api_benchmark_options() has registered its options (argparse can't remove them)
+_is_api_benchmark_options_added = False
+
+def add_api_benchmark_options(parser_api, args=None):
+    """Register the options of the API benchmark picked via ``--benchmark``.
+
+    ``args`` is the option list about to be parsed (``None`` means ``sys.argv``).
+    Options are added to ``parser_api`` at most once per process.
+    """
+    global _is_api_benchmark_options_added
+    if _is_api_benchmark_options_added: return
+
+    # hack: we want dynamic behavior but the parser can't do the job properly
+    what_to_parse = sys.argv if args is None else args
+    try:
+        target = what_to_parse[what_to_parse.index('--benchmark') + 1]
+    except (ValueError, IndexError):
+        return
+
+    if target == 'apply_matrix':
+        apply_matrix = parser_api.add_argument_group('apply_matrix-specific options')
+
+        targets = apply_matrix.add_mutually_exclusive_group(required=True)
+        targets.add_argument('--targets', type=str_to_seq,
+                             help="set the (comma-separated) target qubit IDs")
+        targets.add_argument('--ntargets', type=int, help='set the number of target qubits')
+
+        controls = apply_matrix.add_mutually_exclusive_group(required=False)
+        controls.add_argument('--controls', type=str_to_seq,
+                              help="set the (comma-separated) control qubit IDs")
+        controls.add_argument('--ncontrols', type=int, help='set the number of control qubits')
+
+        apply_matrix.add_argument('--layout', type=str, choices=('row', 'column'), default='row',
+                                  help='set the gate matrix layout')
+        apply_matrix.add_argument('--adjoint', action='store_true', help='apply the matrix adjoint')
+        apply_matrix.add_argument('--location', type=str, choices=('device', 'host'), default='host',
+                                  help='set the location of the gate matrix')
+        apply_matrix.add_argument('--nqubits', type=int, required=True,
+                                  help='set the total number of qubits')
+        apply_matrix.add_argument('--flush-cache', action='store_true', help='flush the L2 cache for more accurate timing')
+
+    elif target == 'apply_generalized_permutation_matrix':
+        apply_gen_perm_matrix = parser_api.add_argument_group('apply_generalized_permutation_matrix-specific options')
+        apply_gen_perm_matrix.add_argument('--nqubits', type=int, required=True,
+                                           help='set the total number of qubits')
+
+        targets = apply_gen_perm_matrix.add_mutually_exclusive_group(required=True)
+        targets.add_argument('--targets', type=str_to_seq,
+                             help="set the (comma-separated) target qubit IDs")
+        targets.add_argument('--ntargets', type=int, help='set the number of target qubits')
+
+        controls = apply_gen_perm_matrix.add_mutually_exclusive_group(required=False)
+        controls.add_argument('--controls', type=str_to_seq,
+                              help="set the (comma-separated) control qubit IDs")
+        controls.add_argument('--ncontrols', type=int, help='set the number of control qubits')
+
+        apply_gen_perm_matrix.add_argument('--adjoint', action='store_true',
+                                           help='apply the matrix adjoint')
+        apply_gen_perm_matrix.add_argument('--has-diag', action='store_true',
+                                           help='whether the diagonal matrix is nontrivial (not an identity)')
+        apply_gen_perm_matrix.add_argument('--location-diag', type=str, choices=('device', 'host'), default='host',
+                                           help='set the location of the diagonal matrix')
+        apply_gen_perm_matrix.add_argument('--precision-diag', type=str, choices=('single', 'double'), default='single',
+                                           help='set the floating-point precision of the diagonal matrix')
+
+        perm = apply_gen_perm_matrix.add_mutually_exclusive_group(required=False)
+        perm.add_argument('--has-perm', action='store_true',
+                          help='whether the permutation matrix is nontrivial (not an identity)')
+        perm.add_argument('--perm-table', type=str_to_seq,
+                          help='set the permutation table for constructing a permutation matrix')
+
+        apply_gen_perm_matrix.add_argument('--location-perm', type=str, choices=('device', 'host'), default='host',
+                                           help='set the location of the permutation matrix')
+
+    elif target == 'cusv_sampler':
+        sampler = parser_api.add_argument_group('cusv_sampler-specific options')
+        bitordering = sampler.add_mutually_exclusive_group(required=True)
+        bitordering.add_argument('--bit-ordering', type=str_to_seq,
+                                 help="set the (comma-separated) qubit IDs to sample")
+        bitordering.add_argument('--nbit-ordering', type=int,
+                                 help='set the number of qubits to sample')
+        sampler.add_argument('--nqubits', type=int, required=True,
+                             help='set the total number of qubits')
+        sampler.add_argument('--nshots', type=int, default=1024,
+                             help="set the number of shots")
+        sampler.add_argument('--output-order', choices=('random', 'ascending'), default='ascending',
+                             help='set the order of bit strings in sampled outputs')
+
+    elif target == 'tensor_decompose':
+        tensor_decompose = parser_api.add_argument_group('tensor_decompose-specific options')
+
+        method = tensor_decompose.add_mutually_exclusive_group(required=True)
+        method.add_argument('--method', type=str, choices=('QR', 'SVD'),
+                            help='the method for tensor decomposition; when SVD is chosen, gesvd will be used')
+        method.add_argument('--algorithm', type=str, choices=('gesvd', 'gesvdj', 'gesvdr', 'gesvdp'),
+                            help='the algorithm for SVD decomposition')
+
+        tensor_decompose.add_argument('--expr', type=str, required=True,
+                                      help='an einsum-like expression describing the decomposition; '
+                                           'the expression must be quoted with \' or \"')
+        tensor_decompose.add_argument('--shape', type=str_to_seq, required=True,
+                                      help='the shape of the input tensor')
+        tensor_decompose.add_argument('--is-complex', action='store_true',
+                                      help='whether the input tensor is complex-valued')
+        tensor_decompose.add_argument('--check-reference', action='store_true', default=False)
+
+    else:
+        return  # nothing registered for this target (e.g. a circuit benchmark); a later call may still do so
+    _is_api_benchmark_options_added = True
+
+
 # set up a logger
 logger_name = "cuquantum-benchmarks"
 logger = logging.getLogger(logger_name)
@@ -178,8 +271,8 @@
 
 def run(args=None):
     # we allow args to be a list of cmd options for potential private use cases and tests
+    add_api_benchmark_options(parser_api, args)
     args = parser.parse_args(args)
-    #print(args)
 
     # Since run() might be called multiple times, in such case we don't wanna make any changes
     # to the handler in the 2nd time onward, this ensures we write to the same I/O stream and
@@ -196,13 +289,16 @@ def run(args=None):
         pass
     else:
         logger.setLevel(level)
+    finally:
+        del args.verbose
 
     # dispatch to subcommands
     cmd = args.cmd
+    del args.cmd
     if cmd == "circuit":
-
         selected_benchmarks = benchmarks if args.benchmark == 'all' else {args.benchmark: benchmarks[args.benchmark]}
-        selected_backend = (args.backend, backend_config[args.backend])
+        del args.benchmark
+        config = backend_config[args.backend]
 
         if ((args.frontend == 'cirq' and args.backend not in ('cirq', 'cutn', *[k for k in backends.keys() if k.startswith('qsim')]))
                 or (args.frontend == 'qiskit' and args.backend not in ('cutn', *[k for k in backends.keys() if 'aer' in k]))
@@ -221,34 +317,25 @@ def run(args=None):
             if args.cusvaer_p2p_device_bits != -1:
                 raise ValueError(f"cannot set --cusvaer-p2p-device-bits for backend {args.backend}")
 
-        run_interface(benchmarks=selected_benchmarks,
-                      nqubits_interface=args.nqubits,
-                      ngpus_interface=args.ngpus,
-                      ncpu_threads_interface=args.ncputhreads,
-                      frontend=args.frontend,
-                      backend=selected_backend,
-                      #append=args.append,
-                      nwarmups=args.nwarmups,
-                      nrepeats=args.nrepeats,
-                      nshots_interface=args.nshots,
-                      nfused_interface=args.nfused,
-                      precision_interface=args.precision,
-                      new_circ=args.new,
-                      save=True,
-                      cache_dir=args.cachedir,
-                      cusvaer_global_index_bits=args.cusvaer_global_index_bits,
-                      cusvaer_p2p_device_bits=args.cusvaer_p2p_device_bits,
-                      cusvaer_data_transfer_buffer_bits=args.cusvaer_data_transfer_buffer_bits,
-                      cusvaer_comm_plugin_type=args.cusvaer_comm_plugin_type,
-                      cusvaer_comm_plugin_soname=args.cusvaer_comm_plugin_soname)
+        runner = BenchCircuitRunner(
+            benchmarks=selected_benchmarks,
+            backend_config=config,
+            **vars(args))
 
+        # benchmark & dump result to cachedir
+        try:
+            runner.run()
+        except EarlyReturnError:
+            pass
 
     elif cmd == "api":
-        del args.cmd
         runner = BenchApiRunner(**vars(args))
 
         # benchmark & dump result to cachedir
-        runner.run()
+        try:
+            runner.run()
+        except EarlyReturnError:
+            pass
 
 
 if __name__ == "__main__":
diff --git a/benchmarks/cuquantum_benchmarks/run_interface.py b/benchmarks/cuquantum_benchmarks/run_interface.py
index 409898f..c6fd005 100644
--- a/benchmarks/cuquantum_benchmarks/run_interface.py
+++ b/benchmarks/cuquantum_benchmarks/run_interface.py
@@ -4,18 +4,21 @@
 
 import functools
 import logging
+import math
+import nvtx
 import os
 import pickle
 import random
 import time
-
 import cupy as cp
 
 from .backends import createBackend
 from .frontends import createFrontend
-from ._utils import (call_by_root, gen_run_env, HashableDict, is_running_mpiexec,
-                     load_benchmark_data, report, save_benchmark_data, reseed,
-                     is_running_mpi)
+from ._utils import (
+    call_by_root, create_cache, EarlyReturnError, gen_run_env, get_mpi_rank, HashableDict,
+    is_running_mpiexec, is_running_mpi, load_benchmark_data, report, reseed,
+    save_benchmark_data,
+)
 
 
 # set up a logger
@@ -23,102 +26,68 @@
 logger = logging.getLogger(logger_name)
 
 
-def run_interface(
-        benchmarks, nqubits_interface, ngpus_interface, ncpu_threads_interface, frontend, backend, nwarmups, nrepeats, nshots_interface,
-        nfused_interface, precision_interface, new_circ, save, cache_dir,
-        cusvaer_global_index_bits, cusvaer_p2p_device_bits, cusvaer_data_transfer_buffer_bits, cusvaer_comm_plugin_type, cusvaer_comm_plugin_soname):
-
-    reseed(1234)  # TODO: use a global seed?
-    backend, backend_config = backend  # unpack
-    ngpus = ngpus_interface if ngpus_interface is not None else backend_config['config']['ngpus']
-    ncpu_threads = ncpu_threads_interface if ncpu_threads_interface is not None else backend_config['config']['ncputhreads']
-    nshots = nshots_interface if nshots_interface is not None else backend_config['config']['nshots']
-    nfused = nfused_interface if nfused_interface is not None else backend_config['config']['nfused']
-    precision = precision_interface if precision_interface is not None else backend_config['config']['precision']
+class BenchCircuitRunner:
 
-    general_interface = GeneralInterface(frontend=frontend,
-                                         backend=backend,
-                                         nshots=nshots,
-                                         nfused=nfused,
-                                         precision=precision,
-                                         #append=append,
-                                         new_circ=new_circ,
-                                         save=save)
+    # currently we assume the following subdirectories exist
+    required_subdirs = ('circuits', 'data')
 
-    for benchmark_name in benchmarks.keys(): # Iterate over diferent benchmarks
-        benchmark = benchmarks[benchmark_name]
+    def __init__(self, **kwargs):
+        # use default backend config unless users want to overwrite it
+        self.backend_config = backend_config = kwargs.pop("backend_config")
+        for k in (# generic backend options
+                  "ngpus", "ncputhreads", "nshots", "nfused", "precision",
+                  # cusvaer options
+                  'cusvaer_global_index_bits', 'cusvaer_p2p_device_bits',
+                  'cusvaer_data_transfer_buffer_bits', 'cusvaer_comm_plugin_type',
+                  'cusvaer_comm_plugin_soname',
+                  # cutn options
+                  'nhypersamples'):
+            v = kwargs.pop(k)
+            if k.startswith('cusvaer') or v is not None:
+                setattr(self, k, v)
+            else:
+                setattr(self, k, backend_config['config'][k])
+
+        # To be parsed in run()
+        self._benchmarks = kwargs.pop("benchmarks")
+        self._nqubits = kwargs.pop("nqubits")
+
+        # other common benchmark args
+        self.frontend = kwargs.pop("frontend")
+        self.backend = kwargs.pop("backend")
+        self.cache_dir = kwargs.pop("cachedir")
+        self.nwarmups = kwargs.pop("nwarmups")
+        self.nrepeats = kwargs.pop("nrepeats")
+        self.new_circ = kwargs.pop("new")
+        self.save = True
+        assert len(kwargs) == 0, f"unhandled cmdline args: {kwargs}"
 
-        gpu_device_properties = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)
-        gpu_name = gpu_device_properties['name'].decode('utf-8').split(' ')[-1]
-        if gpu_name not in benchmark['nqubits']:
-            # Use the default config for this benchmark if there is no GPU-specific config
-            gpu_name = 'default'
-        nqubits_list = [nqubits_interface] if nqubits_interface else benchmark['nqubits'][gpu_name]
-
-        benchmark_object = benchmark['benchmark']
-        config = benchmark['config']
-        config['precision'] = precision  # WAR
-
-        for nqubits in nqubits_list: # Iterate over diferent number of qubits
-            run_specific = RunSpecific(benchmark_name=benchmark_name,
-                                       benchmark_object=benchmark_object,
-                                       nqubits=nqubits,
-                                       ngpus=ngpus,
-                                       ncpu_threads=ncpu_threads,
-                                       nwarmups=nwarmups,
-                                       nrepeats=nrepeats,
-                                       config=config,
-                                       general_interface=general_interface,
-                                       cache_dir=cache_dir,
-                                       cusvaer_global_index_bits=cusvaer_global_index_bits,
-                                       cusvaer_p2p_device_bits=cusvaer_p2p_device_bits,
-                                       cusvaer_data_transfer_buffer_bits=cusvaer_data_transfer_buffer_bits,
-                                       cusvaer_comm_plugin_type=cusvaer_comm_plugin_type,
-                                       cusvaer_comm_plugin_soname=cusvaer_comm_plugin_soname)
-            run_specific.run()
-
-
-class GeneralInterface:
-
-    def __init__(self, frontend, backend, nshots, nfused, precision, new_circ, save):
-        self.frontend = frontend
-        self.backend = backend
-        self.nshots = nshots
-        self.nfused = nfused
-        self.precision = precision
-        #self.append = append
-        self.new_circ = new_circ
-        self.save = save
         self.full_data = {}
+        self.benchmark_data = {}
 
+        # it could be that the cache dirs are not created yet
+        call_by_root(functools.partial(create_cache, self.cache_dir, self.required_subdirs))
 
-class RunSpecific:
-
-    def __init__(
-            self, benchmark_name, benchmark_object, nqubits, ngpus, ncpu_threads, nwarmups, nrepeats, config,
-            general_interface, cache_dir,
-            cusvaer_global_index_bits, cusvaer_p2p_device_bits, cusvaer_data_transfer_buffer_bits,
-            cusvaer_comm_plugin_type, cusvaer_comm_plugin_soname):
-        self.benchmark_name = benchmark_name
-        self.benchmark_object = benchmark_object
-        self.nqubits = nqubits
-        self.ngpus = ngpus
-        self.ncpu_threads = ncpu_threads
-        self.nwarmups=nwarmups
-        self.nrepeats=nrepeats
-        self.config = config
-        self.general_interface = general_interface
-        self.benchmark_data = {}
-        self.cache_dir = cache_dir
-        # cusvaer options
-        self.cusvaer_global_index_bits = cusvaer_global_index_bits
-        self.cusvaer_p2p_device_bits = cusvaer_p2p_device_bits
-        self.cusvaer_data_transfer_buffer_bits = cusvaer_data_transfer_buffer_bits
-        self.cusvaer_comm_plugin_type = cusvaer_comm_plugin_type
-        self.cusvaer_comm_plugin_soname = cusvaer_comm_plugin_soname
+    def run(self):
+        """Run every selected benchmark once per problem size via ``_run()``."""
+        if self._nqubits is not None:
+            nqubits_list = [self._nqubits]
+        else:
+            # NOTE(review): the sweep's last size can exceed max_n_qubits by up to 3 -- confirm
+            gpu_prop = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)
+            max_n_qubits = math.floor(math.log2(gpu_prop['totalGlobalMem'] / (8 if self.precision == 'single' else 16)))
+            nqubits_list = list(range(16, max_n_qubits + 4, 4))
+
+        for benchmark_name, b in self._benchmarks.items():
+            benchmark_object, benchmark_config = b['benchmark'], b['config']
+            benchmark_config['precision'] = self.precision  # some frontends may need it
 
-        # currently we assume the following subdirectories exist
-        self.required_subdirs = ('circuits', 'data')
+            for nqubits in nqubits_list:
+                # _run() reads the current job from these attributes
+                self.benchmark_name, self.nqubits = benchmark_name, nqubits
+                self.benchmark_object = benchmark_object
+                self.benchmark_config = benchmark_config
+                self._run()
 
     def _load_or_generate_circuit(self, circuit_filename):
         # We need a mechanism to ensure any incompatible gate_sequence generated
@@ -131,9 +100,17 @@ def _load_or_generate_circuit(self, circuit_filename):
         gate_seq_ver = 1
 
         circuit_filename += f"_v{gate_seq_ver}.pickle"
-        frontend = createFrontend(self.general_interface.frontend, self.nqubits, self.config)
+        frontend = createFrontend(self.frontend, self.nqubits, self.benchmark_config)
+
+        dump_only = bool(os.environ.get('CUQUANTUM_BENCHMARKS_DUMP_GATES', False))
+        if dump_only:
+            # hijack & discard user input
+            from .frontends.frontend_dumper import Dumper
+            frontend = Dumper(
+                self.nqubits,
+                {**self.benchmark_config, 'circuit_filename': circuit_filename})
         try:
-            if self.general_interface.new_circ:
+            if self.new_circ:
                 raise ValueError
 
             # If this circuit has been generated previously, load it
@@ -143,7 +120,7 @@ def _load_or_generate_circuit(self, circuit_filename):
                 logger.debug(f'Circuit loaded from {circuit_filename}')
 
         except:  # Otherwise, generate the circuit and save it
-            gate_sequence = self.benchmark_object.generateGatesSequence(self.nqubits, self.config)
+            gate_sequence = self.benchmark_object.generateGatesSequence(self.nqubits, self.benchmark_config)
             circuit = frontend.generateCircuit(gate_sequence)
             def dump():
                 with open(os.path.join(self.cache_dir, circuit_filename), 'wb') as f:
@@ -151,6 +128,10 @@ def dump():
                     logger.debug(f'Circuit generated and saved to {circuit_filename}')
             call_by_root(dump)
 
+        if dump_only:
+            logger.info("early exiting as the dumper task is completed")
+            raise EarlyReturnError
+
         return circuit
 
     def get_circuit(self, circuit_filename):
@@ -176,6 +157,7 @@ def timer(self, backend, circuit, nshots):
             backend.pre_run(circuit, nshots=nshots)
             backend.run(circuit, nshots)
 
+        annotation_string = f"p{get_mpi_rank()}_run_"
         # actual timing
         for i in range(self.nrepeats):
             backend.pre_run(circuit, nshots=nshots)
@@ -184,7 +166,8 @@ def timer(self, backend, circuit, nshots):
                 start_gpu.record()
             pe1 = time.perf_counter()
 
-            run_dict = backend.run(circuit, nshots)
+            with nvtx.annotate(annotation_string + str(i)):
+                run_dict = backend.run(circuit, nshots)
 
             pe2 = time.perf_counter()
             if self.ngpus > 0:
@@ -212,7 +195,7 @@ def timer(self, backend, circuit, nshots):
 
     def _fix_filename_for_cutn(self, circuit_filename, nqubits):
         target = pauli = None
-        if self.general_interface.backend == 'cutn':
+        if self.backend == 'cutn':
             target = os.environ.get('CUTENSORNET_BENCHMARK_TARGET', 'amplitude')
             circuit_filename += f'_{target}'
             if target == 'expectation':
@@ -221,34 +204,36 @@ def _fix_filename_for_cutn(self, circuit_filename, nqubits):
         return circuit_filename, target, pauli
 
     def extract_backend_version(self):
-        if 'aer' in self.general_interface.backend:
+        if 'aer' in self.backend:
             import qiskit
             version = qiskit.__qiskit_version__['qiskit-aer']
-        elif 'qsim' in self.general_interface.backend:
+        elif 'qsim' in self.backend:
             import qsimcirq
             version = qsimcirq.__version__
-        elif self.general_interface.backend == 'cutn':
+        elif self.backend == 'cutn':
             import cuquantum
             version = cuquantum.cutensornet.get_version()
-        elif self.general_interface.backend == 'cirq':
+        elif self.backend == 'cirq':
             import cirq
             version = cirq.__version__
-        elif self.general_interface.backend == 'naive':
+        elif self.backend == 'naive':
             from .backends import backends
             version = backends['naive'].version
-        elif self.general_interface.backend == 'pennylane':
+        elif self.backend == 'pennylane':
             import pennylane
             version = pennylane.__version__
-        elif self.general_interface.backend == 'pennylane-lightning-gpu':
+        elif self.backend == 'pennylane-lightning-gpu':
             import pennylane_lightning_gpu
             version = pennylane_lightning_gpu.__version__
-        elif self.general_interface.backend == 'pennylane-lightning-qubit':
+        elif self.backend == 'pennylane-lightning-qubit':
             import pennylane_lightning
             version = pennylane_lightning.__version__
-        elif self.general_interface.backend == 'pennylane-lightning-kokkos':
+        elif self.backend == 'pennylane-lightning-kokkos':
             import pennylane_lightning_kokkos
             version = pennylane_lightning_kokkos.__version__
-        elif self.general_interface.backend in ('qulacs-gpu', 'qulacs-cpu'):
+        elif self.backend == 'pennylane-dumper':
+            version = '0'  # dummy
+        elif self.backend in ('qulacs-gpu', 'qulacs-cpu'):
             import qulacs
             version = qulacs.__version__
         else:
@@ -256,19 +241,19 @@ def extract_backend_version(self):
         return version
 
     def extract_frontend_version(self):
-        if self.general_interface.frontend == 'qiskit':
+        if self.frontend == 'qiskit':
             import qiskit
             version = qiskit.__qiskit_version__['qiskit-terra']
-        elif self.general_interface.frontend == 'cirq':
+        elif self.frontend == 'cirq':
             import cirq
             version = cirq.__version__
-        elif self.general_interface.frontend == 'naive':
+        elif self.frontend == 'naive':
             from .frontends import frontends
             version = frontends['naive'].version
-        elif self.general_interface.frontend == 'pennylane':
+        elif self.frontend == 'pennylane':
             import pennylane
             version = pennylane.__version__
-        elif self.general_interface.frontend == 'qulacs':
+        elif self.frontend == 'qulacs':
             import qulacs
             version = qulacs.__version__
         else:
@@ -276,63 +261,56 @@ def extract_frontend_version(self):
         return version
 
     def extract_glue_layer_version(self):
-        if self.general_interface.backend == 'cutn':
+        if self.backend == 'cutn':
             import cuquantum
             glue_ver = f'cuquantum {cuquantum.__version__}'
         else:
             return None
         return glue_ver
 
-    def run(self):
-        measure = self.config['measure']
+    def _run(self):
+        reseed(1234)  # TODO: use a global seed?
+        measure = self.benchmark_config['measure']
 
         # try to load existing perf data, if any
         data_filename = f'{self.benchmark_name}.json'
         filepath = f'{self.cache_dir}/data/{data_filename}'
-        self.general_interface.full_data = load_benchmark_data(
-            filepath, self.cache_dir, self.required_subdirs)
+        self.full_data = load_benchmark_data(filepath)
 
         gpu_device_properties = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)
         gpu_name = gpu_device_properties['name'].decode('utf-8').split(' ')[-1]
         num_qubits = str(self.nqubits)
         num_gpus = str(self.ngpus)
 
-        # FIXME: this is buggy (no early return)
-        # try:
-        #     if (self.general_interface.append
-        #             and num_gpus in self.general_interface.full_data[num_qubits][self.general_interface.frontend+'-v'+frontend_version][self.general_interface.backend+'-v'+backend_version][gpu_name]):
-        #         self.general_interface.logger.info(
-        #             f'Skipping {self.benchmark_name} with {self.nqubits} qubits and {self.ngpus} GPUs [{self.general_interface.backend}-v{backend_version}]')
-        # except KeyError:
-        #     # KeyError means this configuration is not currently benchmarked, so we can continue running
-        #     self.general_interface.logger.debug('Benchmark configuration not found in existing data')
-        #     pass
-
         circuit_filename = f'circuits/{self.benchmark_name}_{self.nqubits}'
-
-        if 'unfold' in self.config.keys() and self.config['unfold']:
+        if 'unfold' in self.benchmark_config.keys() and self.benchmark_config['unfold']:
             circuit_filename += '_unfold'
-        if 'p' in self.config.keys():
-            p = self.config['p']
+        if 'p' in self.benchmark_config.keys():
+            p = self.benchmark_config['p']
             circuit_filename += f'_p{p}'
         if measure:
             circuit_filename += '_measure'
         circuit_filename, target, pauli = self._fix_filename_for_cutn(circuit_filename, self.nqubits)
-        self.general_interface.cutn_target = target
+        self.cutn_target = target
 
         # get circuit
         circuit = self.get_circuit(circuit_filename)
 
         # get backend
+        # TODO: use backend config to simplify this...
         backend = createBackend(
-            self.general_interface.backend, self.ngpus, self.ncpu_threads, self.general_interface.precision,
-            nqubits=self.nqubits,                                      # TODO: backend config
-            cusvaer_global_index_bits=self.cusvaer_global_index_bits,  # cusvaer options
+            self.backend, self.ngpus, self.ncputhreads, self.precision,
+            nqubits=self.nqubits,
+            # cusvaer options
+            cusvaer_global_index_bits=self.cusvaer_global_index_bits,
             cusvaer_p2p_device_bits=self.cusvaer_p2p_device_bits,
             cusvaer_data_transfer_buffer_bits=self.cusvaer_data_transfer_buffer_bits,
             cusvaer_comm_plugin_type=self.cusvaer_comm_plugin_type,
             cusvaer_comm_plugin_soname=self.cusvaer_comm_plugin_soname,
-            nfused=self.general_interface.nfused,                      # only qiskit and qsim
+            # qiskit and qsim
+            nfused=self.nfused,
+            # cutn
+            nhypersamples=self.nhypersamples,
         )
 
         # get versions; it's assumed up to this point, the existence of Python modules for
@@ -340,27 +318,33 @@ def run(self):
         backend_version = self.extract_backend_version()
         frontend_version = self.extract_frontend_version()
         glue_layer_version = self.extract_glue_layer_version()
+        if glue_layer_version is not None:
+            ver_str = f'[{self.frontend}-v{frontend_version} | (glue ver: {glue_layer_version}) | {self.backend}-v{backend_version}]'
+        else:
+            ver_str = f'[{self.frontend}-v{frontend_version} | {self.backend}-v{backend_version}]'
 
         if self.ngpus == 0:
             logger.info(
-                f'* Running {self.benchmark_name} with {self.ncpu_threads} CPU threads, and {self.nqubits} qubits [{self.general_interface.backend}-v{backend_version}]:')
+                f'* Running {self.benchmark_name} with {self.ncputhreads} CPU threads, and {self.nqubits} qubits {ver_str}:'
+            )
         else:
             logger.info(
-                f'* Running {self.benchmark_name} with {self.ngpus} GPUs, and {self.nqubits} qubits [{self.general_interface.backend}-v{backend_version}]:')
+                f'* Running {self.benchmark_name} with {self.ngpus} GPUs, and {self.nqubits} qubits {ver_str}:'
+            )
 
         preprocess_data = backend.preprocess_circuit(
             circuit,
             # only cutn needs these, TODO: backend config
             circuit_filename=os.path.join(self.cache_dir, circuit_filename),
             target=target,
-            pauli=pauli
+            pauli=pauli,
         )
 
         for k in preprocess_data.keys():
             self.benchmark_data[k] = preprocess_data[k]
 
         # run benchmark
-        perf_time, cuda_time, post_time, post_process = self.timer(backend, circuit, self.general_interface.nshots) # nsamples -> nshots
+        perf_time, cuda_time, post_time, post_process = self.timer(backend, circuit, self.nshots) # nsamples -> nshots
 
         # report the result
         run_env = gen_run_env(gpu_device_properties)
@@ -371,7 +355,7 @@ def run(self):
         out = self.canonicalize_benchmark_data(frontend_version, backend_version, run_env, glue_layer_version)
         save_benchmark_data(
             *out,
-            self.general_interface.full_data, filepath, self.general_interface.save)
+            self.full_data, filepath, self.save)
 
     def canonicalize_benchmark_data(self, frontend_version, backend_version, run_env, glue_layer_version):
         """
@@ -415,17 +399,17 @@ def canonicalize_benchmark_data(self, frontend_version, backend_version, run_env
 
         sim_config = HashableDict({
             'frontend': HashableDict({
-                "name": self.general_interface.frontend,
+                "name": self.frontend,
                 "version": frontend_version,
             }),
             'backend': HashableDict({
-                "name": self.general_interface.backend,
+                "name": self.backend,
                 "version": backend_version,
                 "ngpus": self.ngpus,
-                "ncputhreads": self.ncpu_threads,
-                "nshots": self.general_interface.nshots,
-                "nfused": self.general_interface.nfused,
-                "precision": self.general_interface.precision,
+                "ncputhreads": self.ncputhreads,
+                "nshots": self.nshots,
+                "nfused": self.nfused,
+                "precision": self.precision,
                 "with_mpi": is_running_mpiexec(),
             }),
             'glue_layer': HashableDict({
@@ -439,11 +423,11 @@ def canonicalize_benchmark_data(self, frontend_version, backend_version, run_env
         # TODO: record "measure"?
 
         # backend-specific options
-        if self.general_interface.backend == "cusvaer":
+        if self.backend == "cusvaer":
             sim_config["backend"]["cusvaer_global_index_bits"] = self.cusvaer_global_index_bits
             sim_config["backend"]["cusvaer_p2p_device_bits"] = self.cusvaer_p2p_device_bits
-        elif self.general_interface.backend == "cutn":
-            sim_config["backend"]["target"] = self.general_interface.cutn_target
+        elif self.backend == "cutn":
+            sim_config["backend"]["target"] = self.cutn_target
 
         sim_config_hash = sim_config.get_hash()
         self.benchmark_data = {**self.benchmark_data, **sim_config}
@@ -453,35 +437,36 @@ def canonicalize_benchmark_data(self, frontend_version, backend_version, run_env
 
 class BenchApiRunner:
 
-    supported_cusv_apis = ('apply_matrix',)
-    supported_cutn_apis = ()
+    supported_cusv_apis = ('apply_matrix', 'apply_generalized_permutation_matrix', 'cusv_sampler', )
+    supported_cutn_apis = ('tensor_decompose',)
     supported_apis = supported_cusv_apis + supported_cutn_apis
 
+    # currently we assume the following subdirectories exist
+    required_subdirs = ('data',)
+
     def __init__(self, **kwargs):
-        self.num_qubits = kwargs.pop("nqubits")
         self.benchmark = kwargs.pop("benchmark")
         self.cache_dir = kwargs.pop("cachedir")
-        kwargs.pop("verbose")  # don't care
         self.args = kwargs  # just hold the entire group of parsed cmdline args, don't unpack all
 
-        # currently we assume the following subdirectories exist
-        self.required_subdirs = ('data',)
+        # it could be that the cache dirs are not created yet
+        call_by_root(functools.partial(create_cache, self.cache_dir, self.required_subdirs))
 
         # load existing json, if any
         self.data_filename = f"{self.benchmark}.json"
         self.file_path = f'{self.cache_dir}/data/{self.data_filename}'
-        self.full_data = load_benchmark_data(
-            self.file_path, self.cache_dir, self.required_subdirs)
+        self.full_data = load_benchmark_data(self.file_path)
 
     def run(self):
         # prep
         if self.benchmark not in self.supported_apis:
             raise NotImplementedError(f"only {self.supported_apis} is supported for now")
         gpu_device_properties = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)
-        benchmark_data = {}  # dummy
+        benchmark_data = {}
 
         # time the api
-        perf_time, cuda_time = self._run_apply_matrix()
+        bench_func = getattr(self, f"_run_{self.benchmark}")
+        perf_time, cuda_time = bench_func(benchmark_data)  # update benchmark_data in-place
 
         # report the result
         run_env = gen_run_env(gpu_device_properties)
@@ -492,10 +477,11 @@ def run(self):
         out = self.canonicalize_benchmark_data(run_env, benchmark_data)
         save_benchmark_data(*out, self.full_data, self.file_path)
 
-    def _run_apply_matrix(self):
+    def _run_apply_matrix(self, benchmark_data):
         # TODO: It's better to move this method elsewhere, once we support more apis
         from .benchmarks.apply_matrix import test_apply_matrix
         args = self.args
+        self.num_qubits = args.pop("nqubits")
 
         # create targets while keeping args clean for later use
         ntargets = args.pop("ntargets")
@@ -527,6 +513,108 @@ def _run_apply_matrix(self):
             args["nrepeats"],
             args["location"],
             flush_l2=args["flush_cache"],
+            benchmark_data=benchmark_data,
+        )
+
+    def _run_apply_generalized_permutation_matrix(self, benchmark_data):
+        """Normalize the parsed cmdline args and time the generalized
+        permutation matrix benchmark.
+
+        ``benchmark_data`` is forwarded to the benchmark function, whose result
+        is returned as is.
+        """
+        # TODO: It's better to move this method elsewhere, once we support more apis
+        from .benchmarks.apply_gen_perm_matrix import test_apply_generalized_permutation_matrix
+        args = self.args
+        self.num_qubits = args.pop("nqubits")
+
+        # targets: explicit ids win, otherwise take the first ntargets ids
+        ntargets, targets = args.pop("ntargets"), args.pop("targets")
+        if targets is None:
+            targets = tuple(range(ntargets))
+        else:
+            targets = tuple(targets)
+        args["targets"] = targets  # stored back to keep args clean for later use
+
+        # controls: explicit ids win, then a count, otherwise no controls at all
+        ncontrols, controls = args.pop("ncontrols"), args.pop("controls")
+        if controls is not None:
+            controls = tuple(controls)
+        elif ncontrols is None:
+            controls = ()
+        else:
+            controls = tuple(range(ncontrols))
+        args["controls"] = controls
+
+        # perm_table: an explicit table wins; without one, has_perm=False gives an
+        # empty list and anything else gives a bool
+        has_perm, perm_table = args.pop("has_perm"), args.pop("perm_table")
+        if perm_table is not None:
+            perm_table = list(perm_table)
+        elif has_perm is False:
+            perm_table = []
+        else:
+            # used as a flag to fill perm_table randomly later
+            perm_table = bool(has_perm)
+        args["perm_table"] = perm_table
+
+        # run
+        return test_apply_generalized_permutation_matrix(
+            self.num_qubits, args["precision"], targets, controls,
+            int(args["adjoint"]),
+            args["has_diag"], args["precision_diag"], args["location_diag"],
+            args["perm_table"], args["location_perm"],
+            args["nwarmups"], args["nrepeats"],
+            benchmark_data=benchmark_data,
+        )
+
+    def _run_cusv_sampler(self, benchmark_data):
+        """Normalize the parsed cmdline args and time the cusv sampler benchmark."""
+        from .benchmarks.cusv_sampler import test_cusv_sampler
+        args = self.args
+        self.num_qubits = args.pop("nqubits")
+
+        # bit_ordering: explicit ids win, otherwise take the first nbit_ordering ids;
+        # the normalized tuple is stored back to keep args clean for later use
+        nbit_ordering, bit_ordering = args.pop("nbit_ordering"), args.pop("bit_ordering")
+        if bit_ordering is None:
+            bit_ordering = tuple(range(nbit_ordering))
+        else:
+            bit_ordering = tuple(bit_ordering)
+        args["bit_ordering"] = bit_ordering
+
+        # run
+        return test_cusv_sampler(
+            self.num_qubits, args["precision"], bit_ordering,
+            args["nshots"], args["output_order"],
+            args["nwarmups"], args["nrepeats"],
+            benchmark_data=benchmark_data,
+        )
+
+    def _run_tensor_decompose(self, benchmark_data):
+        """Reconcile the method/algorithm args and time the tensor decomposition.
+
+        An explicitly requested ``algorithm`` implies SVD and is kept as is;
+        ``gesvd`` is only the fallback when SVD is requested without one.
+        ``benchmark_data`` is forwarded to the benchmark function, whose result
+        is returned as is.
+        """
+        from .benchmarks.tensor_decompose import benchmark_tensor_decompose
+        args = self.args
+        self.num_qubits = 0  # WAR
+
+        # ensure the combination of method/algorithm is meaningful
+        if args["algorithm"] is not None:
+            args["method"] = "SVD"  # algorithm is set, must be doing SVD
+        elif args["method"] == "SVD":
+            args["algorithm"] = "gesvd"  # SVD without an algorithm: use the fallback
+
+        # run
+        return benchmark_tensor_decompose(
+            args["expr"], tuple(args["shape"]), args["precision"],
+            args["is_complex"], args["method"], args["algorithm"],
+            args["nwarmups"], args["nrepeats"], args["check_reference"],
+            benchmark_data=benchmark_data,
         )
 
     def canonicalize_benchmark_data(self, run_env, benchmark_data):
diff --git a/benchmarks/setup.py b/benchmarks/setup.py
index e6b09e5..72fc6d1 100644
--- a/benchmarks/setup.py
+++ b/benchmarks/setup.py
@@ -32,15 +32,16 @@
     "psutil",
     "scipy",
     "networkx",
+    "nvtx",
 ]
 if importlib.util.find_spec('cuquantum') is None:
-    install_requires.append("cuquantum-python>=22.7")
+    install_requires.append("cuquantum-python>=23.3")
 
 
 setup(
     name="cuquantum-benchmarks",
     version=version,
-    description="NVIDIA cuQuantum Circuit Performance Benchmark Suite",
+    description="NVIDIA cuQuantum Performance Benchmark Suite",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/NVIDIA/cuQuantum",
diff --git a/benchmarks/tests/cuquantum_benchmarks_tests/test_run.py b/benchmarks/tests/cuquantum_benchmarks_tests/test_run.py
index 9bd2743..7857466 100644
--- a/benchmarks/tests/cuquantum_benchmarks_tests/test_run.py
+++ b/benchmarks/tests/cuquantum_benchmarks_tests/test_run.py
@@ -218,6 +218,8 @@ def test_benchmark(self, combo, nqubits, benchmark, precision, tmp_path, visible
                 '--verbose']
         if backend == 'cusvaer':
             cmd += ['--cusvaer-global-index-bits', '--cusvaer-p2p-device-bits']
+        if backend == 'cutn':
+            cmd += ['--nhypersamples', '2']
 
         for cmd_prefix in tests:
             result = subprocess.run(cmd_prefix+cmd, env=env, capture_output=True)
@@ -239,38 +241,36 @@ def test_benchmark(self, combo, nqubits, benchmark, precision, tmp_path, visible
 
 
 # TODO: test invalid cases and ensure we raise errors
-@pytest.mark.parametrize(
-    "args", (
-        ["--nqubits", "4", "--ntargets", "2"],
-        ["--nqubits", "4", "--targets", "2,3"],
-        ["--nqubits", "6", "--ntargets", "3", "--controls", "3"],
-        ["--nqubits", "4", "--targets", "2,3", "--ncontrols", "1"],
-        ["--nqubits", "4", "--targets", "2,3", "--controls", "1"],
+class TestCmdApi:
+
+    @pytest.mark.parametrize(
+        "args", (
+            ["--nqubits", "4", "--ntargets", "2"],
+            ["--nqubits", "4", "--targets", "2,3"],
+            ["--nqubits", "6", "--ntargets", "3", "--controls", "3"],
+            ["--nqubits", "4", "--targets", "2,3", "--ncontrols", "1"],
+            ["--nqubits", "4", "--targets", "2,3", "--controls", "1"],
+        )
     )
-)
-@pytest.mark.parametrize(
-    "matrix_prop", (
-        [],  # default
-        ["--layout", "column", "--adjoint"],
+    @pytest.mark.parametrize(
+        "matrix_prop", (
+            [],  # default
+            ["--layout", "column", "--adjoint"],
+        )
+    )
+    @pytest.mark.parametrize(
+        "precision", ("single", "double")
+    )
+    @pytest.mark.parametrize(
+        "flush", (True, False)
     )
-)
-@pytest.mark.parametrize(
-    "precision", ("single", "double")
-)
-@pytest.mark.parametrize(
-    "flush", (True, False)
-)
-class TestCmdApiApplyMatrix:
-
-    benchmark = "apply_matrix"
-
     def test_apply_matrix(self, args, matrix_prop, precision, flush, tmp_path, visible_device):
-
+        benchmark = 'apply_matrix'
         env = os.environ.copy()
         env["CUDA_VISIBLE_DEVICES"] = str(visible_device)
 
         cmd = [sys.executable, '-m', 'cuquantum_benchmarks', 'api',
-                '--benchmark', self.benchmark,
+                '--benchmark', benchmark,
                 '--precision', precision,
                 '--cachedir', str(tmp_path),
                 # speed up the tests...
@@ -286,7 +286,158 @@ def test_apply_matrix(self, args, matrix_prop, precision, flush, tmp_path, visib
 
         try:
             assert bool(result.check_returncode()) == False
-            cached_json = [f for f in glob.glob(str(tmp_path / f"data/{self.benchmark}.json")) if os.path.isfile(f)]
+            cached_json = [f for f in glob.glob(str(tmp_path / f"data/{benchmark}.json")) if os.path.isfile(f)]
+            assert len(cached_json) == 1  # TODO: test aggregate behavior too?
+        except:
+            # make debugging easier
+            print("stdout:\n", result.stdout.decode())
+            print("stderr:\n", result.stderr.decode())
+            raise
+        finally:
+            print("cmd:\n", ' '.join(cmd))
+
+    @pytest.mark.parametrize(
+        "args", (
+            ("--nqubits", "4", "--ntargets", "2",),
+            ("--nqubits", "4", "--targets", "2,3",),
+            ("--nqubits", "6", "--ntargets", "2", "--controls", "3",),
+            ("--nqubits", "4", "--targets", "1,2", "--ncontrols", "1",),
+            ("--nqubits", "4", "--targets", "2,3", "--controls", "1",),
+        )
+    )
+    @pytest.mark.parametrize(
+        "diag", (
+            (),
+            ("--has-diag", "--location-diag", "device",),
+            ("--has-diag", "--precision-diag", "double", "--precision", "double",),
+        )
+    )
+    @pytest.mark.parametrize(
+        "perm", (
+            ("--has-perm",),
+            ("--has-perm", "--location-perm", "device",),
+            ("--perm-table", "2,3,0,1",),  # NOTE: assumes every case in "args" uses exactly 2 targets
+        )
+    )
+    @pytest.mark.parametrize(
+        "matrix_prop", (
+            (),  # default
+            ("--adjoint",),
+        )
+    )
+    def test_apply_generalized_permutation_matrix(
+            self, args, diag, perm, matrix_prop, tmp_path, visible_device):
+        benchmark = 'apply_generalized_permutation_matrix'
+        env = os.environ.copy()
+        env["CUDA_VISIBLE_DEVICES"] = str(visible_device)
+
+        cmd = [sys.executable, '-m', 'cuquantum_benchmarks', 'api',
+               '--benchmark', benchmark,
+               '--cachedir', str(tmp_path),
+               # speed up the tests...
+               '--nwarmups', '1',
+               '--nrepeats', '1',
+               '--verbose']
+        cmd += args
+        cmd += diag
+        cmd += perm
+        cmd += matrix_prop
+        result = subprocess.run(cmd, env=env, capture_output=True)
+
+        try:
+            assert bool(result.check_returncode()) == False
+            cached_json = [f for f in glob.glob(str(tmp_path / f"data/{benchmark}.json")) if os.path.isfile(f)]
+            assert len(cached_json) == 1  # TODO: test aggregate behavior too?
+        except:
+            # make debugging easier
+            print("stdout:\n", result.stdout.decode())
+            print("stderr:\n", result.stderr.decode())
+            raise
+        finally:
+            print("cmd:\n", ' '.join(cmd))
+
+    @pytest.mark.parametrize(
+        "args", (
+            ("--nqubits", "4", "--nbit-ordering", "2", "--nshots", "256"),
+            ("--nqubits", "4", "--bit-ordering", "2,3", "--output-order", "random"),
+        )
+    )
+    @pytest.mark.parametrize(
+        "precision", ("single", "double")
+    )
+    def test_cusv_sampler(self, args, precision, tmp_path, visible_device):
+        benchmark = 'cusv_sampler'
+        env = os.environ.copy()
+        env["CUDA_VISIBLE_DEVICES"] = str(visible_device)
+
+        cmd = [sys.executable, '-m', 'cuquantum_benchmarks', 'api',
+                '--benchmark', benchmark,
+                '--precision', precision,
+                '--cachedir', str(tmp_path),
+                # speed up the tests...
+                '--nwarmups', '1',
+                '--nrepeats', '1',
+                '--verbose']
+        cmd += args
+        result = subprocess.run(cmd, env=env, capture_output=True)
+
+        try:
+            assert bool(result.check_returncode()) == False
+            cached_json = [f for f in glob.glob(str(tmp_path / f"data/{benchmark}.json")) if os.path.isfile(f)]
+            assert len(cached_json) == 1  # TODO: test aggregate behavior too?
+        except:
+            # make debugging easier
+            print("stdout:\n", result.stdout.decode())
+            print("stderr:\n", result.stderr.decode())
+            raise
+        finally:
+            print("cmd:\n", ' '.join(cmd))
+
+    @pytest.mark.parametrize(
+        "args", (
+            ["--expr", "abc->abx,xc", "--shape", "4,8,4"],
+            ["--expr", "abcd->ax,bcdx", "--shape", "4,8,4,2"],
+        )
+    )
+    @pytest.mark.parametrize(
+        "method", (
+            ("--method", "QR",),
+            ("--method", "SVD",),
+            ("--algorithm", "gesvd"),
+            ("--algorithm", "gesvdj"),
+            ("--algorithm", "gesvdr"),
+            ("--algorithm", "gesvdp"),
+        )
+    )
+    @pytest.mark.parametrize(
+        "precision", ("single", "double")
+    )
+    @pytest.mark.parametrize(
+        "is_complex", (True, False)
+    )
+    def test_tensor_decompose(self, args, method, precision, is_complex, tmp_path, visible_device):
+        benchmark = 'tensor_decompose'
+        env = os.environ.copy()
+        env["CUDA_VISIBLE_DEVICES"] = str(visible_device)
+
+        cmd = [sys.executable, '-m', 'cuquantum_benchmarks', 'api',
+                '--benchmark', benchmark,
+                '--precision', precision,
+                '--cachedir', str(tmp_path),
+                # speed up the tests...
+                '--nwarmups', '1',
+                '--nrepeats', '1',
+                '--verbose']
+        cmd += args
+        cmd += method
+        if is_complex:
+            cmd.append('--is-complex')
+
+        result = subprocess.run(cmd, env=env, capture_output=True)
+
+        try:
+            assert bool(result.check_returncode()) == False
+            cached_json = [f for f in glob.glob(str(tmp_path / f"data/{benchmark}.json")) if os.path.isfile(f)]
             assert len(cached_json) == 1  # TODO: test aggregate behavior too?
         except:
             # make debugging easier
diff --git a/python/samples/cutensornet/coarse/example22_mpi_auto.py b/python/samples/cutensornet/coarse/example22_mpi_auto.py
index 6c6e906..9457f66 100644
--- a/python/samples/cutensornet/coarse/example22_mpi_auto.py
+++ b/python/samples/cutensornet/coarse/example22_mpi_auto.py
@@ -64,15 +64,6 @@
 # Compute the contraction (with distributed path finding & contraction execution)
 result = cuquantum.contract(expr, *operands, options={'device_id' : device_id, 'handle': handle})
 
-# Create a new GPU buffer for verification
-result_cp = cp.empty_like(result)
-
-# Sum the partial contribution from each process on root, with GPU
-if rank == root:
-    comm.Reduce(sendbuf=MPI.IN_PLACE, recvbuf=result_cp, op=MPI.SUM, root=root)
-else:
-    comm.Reduce(sendbuf=result_cp, recvbuf=None, op=MPI.SUM, root=root)
-
 # Check correctness.
 if rank == root:
    result_cp = cp.einsum(expr, *operands, optimize=True)

From 92a18e9d012b405f9599e0534ab46a587cd8b4b6 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Tue, 18 Jul 2023 19:19:07 -0700
Subject: [PATCH 2/2] nit: size the L2flush buffer at 1x (not 3x) the L2 cache size

---
 benchmarks/cuquantum_benchmarks/_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/cuquantum_benchmarks/_utils.py b/benchmarks/cuquantum_benchmarks/_utils.py
index 299e198..76ecbde 100644
--- a/benchmarks/cuquantum_benchmarks/_utils.py
+++ b/benchmarks/cuquantum_benchmarks/_utils.py
@@ -456,7 +456,7 @@ class L2flush:
     https://github.com/NVIDIA/nvbench/blob/main/nvbench/detail/l2flush.cuh.
     """
     def __init__(self):
-        self.l2_size = 3 * cp.cuda.Device().attributes['L2CacheSize']
+        self.l2_size = cp.cuda.Device().attributes['L2CacheSize']
         self.mem = cp.cuda.alloc(self.l2_size) if self.l2_size > 0 else None
 
     def flush(self):