From dafee801a498242072fa19256d53457a1a59173c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 18 Jul 2023 14:11:57 -0700 Subject: [PATCH 1/2] sync with internal repo1 (commit b92afcab0) and repo2 (commit e18a4a4) --- benchmarks/README.md | 6 +- benchmarks/cuquantum_benchmarks/__init__.py | 2 +- benchmarks/cuquantum_benchmarks/_utils.py | 66 ++- .../cuquantum_benchmarks/backends/__init__.py | 4 +- .../backends/backend_cutn.py | 5 +- .../backends/backend_pny.py | 25 +- .../backends/backend_qiskit.py | 18 +- .../benchmarks/apply_gen_perm_matrix.py | 147 +++++++ .../benchmarks/apply_matrix.py | 46 +- .../benchmarks/cusv_sampler.py | 102 +++++ .../benchmarks/tensor_decompose.py | 116 +++++ benchmarks/cuquantum_benchmarks/config.py | 63 +-- .../frontends/frontend_dumper.py | 151 +++++++ .../frontends/frontend_pny.py | 6 +- .../frontends/frontend_qiskit.py | 4 +- benchmarks/cuquantum_benchmarks/run.py | 217 ++++++--- .../cuquantum_benchmarks/run_interface.py | 414 +++++++++++------- benchmarks/setup.py | 5 +- .../cuquantum_benchmarks_tests/test_run.py | 203 +++++++-- .../cutensornet/coarse/example22_mpi_auto.py | 9 - 20 files changed, 1238 insertions(+), 371 deletions(-) create mode 100644 benchmarks/cuquantum_benchmarks/benchmarks/apply_gen_perm_matrix.py create mode 100644 benchmarks/cuquantum_benchmarks/benchmarks/cusv_sampler.py create mode 100644 benchmarks/cuquantum_benchmarks/benchmarks/tensor_decompose.py create mode 100644 benchmarks/cuquantum_benchmarks/frontends/frontend_dumper.py diff --git a/benchmarks/README.md b/benchmarks/README.md index 4ff9780..9ee3894 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -14,7 +14,7 @@ pip install .[all] ``` if running outside of the [cuQuantum Appliance container](https://docs.nvidia.com/cuda/cuquantum/latest/appliance/index.html). 
-**Note: You may have to build `qsimcirq` and `qiskit-aer` GPU support from source if needed.** +**Note: You may have to build `qsimcirq`, `qiskit-aer`, and `qulacs` GPU support from source if needed.** Alternatively, you can choose to manage all (required & optional) dependencies yourself via ``` @@ -44,7 +44,7 @@ Starting v0.2.0, we offer subcommands for performing benchmarks at different lev Alternatively, you can launch the benchmark program via `python -m cuquantum_benchmarks`. This is equivalent to the standalone command, and is useful when, say, `pip` installs this package to the user site-package (so that the `cuquantum-benchmarks` command may not be available without modifying `$PATH`). -For GPU backends, it is preferred that `--ngpus` is explicitly set. +For GPU backends, it is preferred that `--ngpus N` is explicitly set. On a multi-GPU system, the first `N` GPUs would be used. To limit which GPUs can be accessed by the CUDA runtime, use the environment variable `CUDA_VISIBLE_DEVICES` following the CUDA documentation. For backends that support MPI parallelism, it is assumed that `MPI_COMM_WORLD` is the communicator, and that `mpi4py` is installed. You can run the benchmarks as you would normally do to launch MPI processes: `mpiexec -n N cuquantum-benchmarks ...`. It is preferred if you fully specify the problem (explicitly set `--benchmark` & `--nqubits`). 
@@ -70,6 +70,8 @@ Currently all environment variables are reserved for internal use only, and are * `CUTENSORNET_DUMP_TN=txt` * `CUTENSORNET_BENCHMARK_TARGET={amplitude,state_vector,expectation}` (pick one) +* `CUTENSORNET_APPROX_TN_UTILS_PATH` +* `CUQUANTUM_BENCHMARKS_DUMP_GATES` ## Development Overview diff --git a/benchmarks/cuquantum_benchmarks/__init__.py b/benchmarks/cuquantum_benchmarks/__init__.py index 8b51178..336ad30 100644 --- a/benchmarks/cuquantum_benchmarks/__init__.py +++ b/benchmarks/cuquantum_benchmarks/__init__.py @@ -2,4 +2,4 @@ # # SPDX-License-Identifier: BSD-3-Clause -__version__ = '0.2.0' +__version__ = '0.3.0' diff --git a/benchmarks/cuquantum_benchmarks/_utils.py b/benchmarks/cuquantum_benchmarks/_utils.py index c814604..299e198 100644 --- a/benchmarks/cuquantum_benchmarks/_utils.py +++ b/benchmarks/cuquantum_benchmarks/_utils.py @@ -5,6 +5,7 @@ import argparse import ctypes from dataclasses import dataclass +import functools import math import json import hashlib @@ -19,6 +20,7 @@ import cupy as cp import numpy as np +import nvtx from cuquantum import cudaDataType, ComputeType from cuquantum.cutensornet._internal.einsum_parser import create_size_dict import psutil @@ -29,6 +31,15 @@ logger = logging.getLogger(logger_name) +def wrap_with_nvtx(func, msg): + """Add NVTX markers to a function with a message.""" + @functools.wraps(func) + def inner(*args, **kwargs): + with nvtx.annotate(msg): + return func(*args, **kwargs) + return inner + + def reseed(seed=1234): random.seed(seed) np.random.seed(seed) @@ -162,11 +173,16 @@ def is_running_mpi(): return MPI -def get_num_processes(): +def get_mpi_size(): MPI = is_running_mpi() return MPI.COMM_WORLD.Get_size() if MPI else 1 +def get_mpi_rank(): + MPI = is_running_mpi() + return MPI.COMM_WORLD.Get_rank() if MPI else 0 + + def call_by_root(f, root=0): """ Call the callable f only by the root process. 
""" MPI = is_running_mpi() @@ -409,7 +425,7 @@ def dump(): return full_data -def load_benchmark_data(filepath, cache_dir, required_subdirs=()): +def load_benchmark_data(filepath): try: with open(filepath, 'r') as f: full_data = json.load(f) @@ -419,17 +435,16 @@ def load_benchmark_data(filepath, cache_dir, required_subdirs=()): full_data = {} logger.debug(f'{filepath} not found') - # it could be that the cache dirs are not created yet - def create_cache(): - for subdir in required_subdirs: - path = os.path.join(cache_dir, subdir) - if not os.path.isdir(path): - os.makedirs(path, exist_ok=True) - call_by_root(create_cache) - return full_data +def create_cache(cache_dir, required_subdirs): + for subdir in required_subdirs: + path = os.path.join(cache_dir, subdir) + if not os.path.isdir(path): + os.makedirs(path, exist_ok=True) + + # TODO: upstream this to cupyx.profiler.benchmark class L2flush: """ Handly utility for flushing the current device's L2 cache. @@ -441,7 +456,7 @@ class L2flush: https://github.com/NVIDIA/nvbench/blob/main/nvbench/detail/l2flush.cuh. 
""" def __init__(self): - self.l2_size = cp.cuda.Device().attributes['L2CacheSize'] + self.l2_size = 3 * cp.cuda.Device().attributes['L2CacheSize'] self.mem = cp.cuda.alloc(self.l2_size) if self.l2_size > 0 else None def flush(self): @@ -496,3 +511,32 @@ class _Result: pass result.gpu_times = gpu_times return result + + +class EarlyReturnError(RuntimeError): pass + + +is_unique = lambda a: len(set(a)) == len(a) +is_disjoint = lambda a, b: not bool(set(a) & set(b)) + + +def check_targets_controls(targets, controls, n_qubits): + # simple checks for targets and controls + assert len(targets) >= 1, "must have at least 1 target qubit" + assert is_unique(targets), "qubit indices in targets must be unique" + assert is_unique(controls), "qubit indices in controls must be unique" + assert is_disjoint(targets, controls), "qubit indices in targets and controls must be disjoint" + assert all(0 <= q and q < n_qubits for q in targets + controls), f"target and control qubit indices must be in range [0, {n_qubits})" + + +def check_sequence(seq, expected_size=None, max_size=None, name=''): + if expected_size is not None: + assert len(seq) == expected_size, f"the provided {name} must be of length {expected_size}" + size = expected_size + elif max_size is not None: + assert len(seq) <= max_size, f"the provided {name} must have length <= {max_size}" + size = max_size + else: + assert False + assert is_unique(seq), f"the provided {name} must have non-repetitve entries" + assert all(0 <= i and i < size for i in seq), f"entries in the {name} must be in [0, {size})" diff --git a/benchmarks/cuquantum_benchmarks/backends/__init__.py b/benchmarks/cuquantum_benchmarks/backends/__init__.py index f009f67..0cb2cf2 100644 --- a/benchmarks/cuquantum_benchmarks/backends/__init__.py +++ b/benchmarks/cuquantum_benchmarks/backends/__init__.py @@ -4,7 +4,8 @@ from .backend_cirq import Cirq from .backend_cutn import cuTensorNet -from .backend_pny import Pny, PnyLightningGpu, PnyLightningCpu, 
PnyLightningKokkos +from .backend_pny import (Pny, PnyLightningGpu, PnyLightningCpu, + PnyLightningKokkos, PnyDumper) from .backend_qsim import Qsim, QsimCuda, QsimCusv, QsimMgpu from .backend_qiskit import Aer, AerCuda, AerCusv, CusvAer from .backend_qulacs import QulacsGpu, QulacsCpu @@ -29,6 +30,7 @@ 'pennylane-lightning-gpu': PnyLightningGpu, 'pennylane-lightning-qubit': PnyLightningCpu, 'pennylane-lightning-kokkos': PnyLightningKokkos, + 'pennylane-dumper': PnyDumper, 'qulacs-cpu': QulacsCpu, 'qulacs-gpu': QulacsGpu, } diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py b/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py index 76f52d1..ba58fb7 100644 --- a/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py +++ b/benchmarks/cuquantum_benchmarks/backends/backend_cutn.py @@ -50,6 +50,7 @@ def __init__(self, ngpus, ncpu_threads, precision, **kwargs): # cuQuantum Python 22.07 or below opts = cutn.NetworkOptions(handle=self.handle) self.network_opts = opts + self.n_samples = kwargs.pop('nhypersamples') def __del__(self): cutn.destroy(self.handle) @@ -104,10 +105,12 @@ def preprocess_circuit(self, circuit, *args, **kwargs): t1 = time.perf_counter() path, opt_info = self.network.contract_path( # TODO: samples may be too large for small circuits - optimize={'samples': 512, 'threads': self.ncpu_threads}) + optimize={'samples': self.n_samples, 'threads': self.ncpu_threads}) t2 = time.perf_counter() time_path = t2 - t1 logger.info(f'contract_path() took {time_path} s') + logger.debug(f'# samples: {self.n_samples}') + logger.debug(opt_info) self.path = path self.opt_info = opt_info diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_pny.py b/benchmarks/cuquantum_benchmarks/backends/backend_pny.py index ad78332..31f3920 100644 --- a/benchmarks/cuquantum_benchmarks/backends/backend_pny.py +++ b/benchmarks/cuquantum_benchmarks/backends/backend_pny.py @@ -7,6 +7,7 @@ import os import time import warnings +import sys import numpy as np 
try: @@ -15,7 +16,7 @@ pennylane = None from .backend import Backend -from .._utils import is_running_mpi +from .._utils import call_by_root, EarlyReturnError, is_running_mpi # set up a logger @@ -80,6 +81,23 @@ def _make_qnode(self, circuit, nshots=1024, **kwargs): if self.ngpus != 0: raise ValueError(f"cannot specify --ngpus for the backend {self.identifier}") dev = pennylane.device("default.qubit", wires=self.nqubits, shots=nshots, c_dtype=self.dtype) + elif self.identifier == "pennylane-dumper": + import cloudpickle + import cuquantum_benchmarks + cloudpickle.register_pickle_by_value(cuquantum_benchmarks) + + # note: before loading the pickle, one should check if the Python version agrees + # (probably pennylane's version too) + py_major_minor = f'{sys.version_info.major}.{sys.version_info.minor}' + circuit_filename = kwargs.pop('circuit_filename') + circuit_filename += f"_pny_raw_py{py_major_minor}.pickle" + def dump(): + logger.info(f"dumping pennylane (raw) circuit as {circuit_filename} ...") + with open(circuit_filename, 'wb') as f: + cloudpickle.dump(circuit, f) # use highest protocol + logger.info("early exiting as the dumper task is completed") + call_by_root(dump) + raise EarlyReturnError else: raise ValueError(f"the backend {self.identifier} is not recognized") @@ -89,9 +107,9 @@ def _make_qnode(self, circuit, nshots=1024, **kwargs): def preprocess_circuit(self, circuit, *args, **kwargs): nshots = kwargs.get('nshots', 1024) t1 = time.perf_counter() - self.circuit = self._make_qnode(circuit, nshots) + self.circuit = self._make_qnode(circuit, nshots, **kwargs) t2 = time.perf_counter() - time_make_qnode = t2-t1 + time_make_qnode = t2 - t1 logger.info(f'make qnode took {time_make_qnode} s') return {'make_qnode': time_make_qnode} @@ -107,3 +125,4 @@ def run(self, circuit, nshots=1024): PnyLightningCpu = functools.partial(Pennylane, identifier='pennylane-lightning-qubit') PnyLightningKokkos = functools.partial(Pennylane, 
identifier='pennylane-lightning-kokkos') Pny = functools.partial(Pennylane, identifier='pennylane') +PnyDumper = functools.partial(Pennylane, identifier='pennylane-dumper') diff --git a/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py b/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py index 4362b53..2b1bde5 100644 --- a/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py +++ b/benchmarks/cuquantum_benchmarks/backends/backend_qiskit.py @@ -6,6 +6,7 @@ import functools import logging import time +from importlib.metadata import version import numpy as np import cupy as cp @@ -15,7 +16,7 @@ qiskit = None from .backend import Backend -from .._utils import get_num_processes +from .._utils import get_mpi_size, get_mpi_rank # set up a logger @@ -48,8 +49,8 @@ def run(self, circuit, nshots=1024): results = self.backend.run(transpiled_qc, shots=nshots, memory=True) else: results = self.backend.run(transpiled_qc, shots=0, memory=True) - # workaround for memory allocation failure for cusvaer 22.11 - if self.identifier == 'cusvaer': + # workaround for memory allocation failure for cusvaer 22.11/23.03 + if self.identifier == 'cusvaer' and self._need_sync(): self._synchronize() post_res_list = results.result().get_memory() @@ -169,7 +170,7 @@ def create_aer_backend(self, identifier, ngpus, ncpu_threads, *args, **kwargs): return backend def get_aer_blocking_setup(self, ngpus=None): - size = get_num_processes() # check if running MPI + size = get_mpi_size() # check if running MPI if size > 1: blocking_enable = True if self.identifier == 'aer': @@ -182,11 +183,16 @@ def get_aer_blocking_setup(self, ngpus=None): blocking_qubits = None return blocking_enable, blocking_qubits + def _need_sync(self): + ver_str = version('cusvaer') + ver = [int(num) for num in ver_str.split('.')] + return ver[0] == 0 and ver[1] <= 2 + def _synchronize(self): - nprocs = get_num_processes() + my_rank = get_mpi_rank() ndevices_in_node = cp.cuda.runtime.getDeviceCount() # GPU 
selected in this process - device_id = nprocs % ndevices_in_node + device_id = my_rank % ndevices_in_node cp.cuda.Device(device_id).synchronize() diff --git a/benchmarks/cuquantum_benchmarks/benchmarks/apply_gen_perm_matrix.py b/benchmarks/cuquantum_benchmarks/benchmarks/apply_gen_perm_matrix.py new file mode 100644 index 0000000..7701979 --- /dev/null +++ b/benchmarks/cuquantum_benchmarks/benchmarks/apply_gen_perm_matrix.py @@ -0,0 +1,147 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import logging + +import cupy as cp +import numpy as np +from cupyx.profiler import benchmark + +from cuquantum import custatevec as cusv + +from .._utils import (check_sequence, check_targets_controls, dtype_to_cuda_type, + precision_str_to_dtype, wrap_with_nvtx) + + +# set up a logger +logger_name = "cuquantum-benchmarks" +logger = logging.getLogger(logger_name) + + +def test_apply_generalized_permutation_matrix( + n_qubits, dtype_sv, + targets, controls, adjoint, + diag, dtype_diag, location_diag, # for D + perm_table, location_perm, # for P + n_warmup, n_repeat, *, + benchmark_data=None): + # TODO: allow controlling seed? 
+ if diag is False and not perm_table: + raise ValueError("need to specify at least --has-diag or --has-perm/--perm-table") + + logger.debug(f"{n_qubits=}") + logger.debug(f"{dtype_sv=}") + logger.debug(f"{targets=}") + logger.debug(f"{controls=}") + logger.debug(f"{adjoint=}") + logger.debug(f"{diag=}") + logger.debug(f"{dtype_diag=}") + logger.debug(f"{location_diag=}") + if isinstance(perm_table, bool) or len(perm_table) <= 16: + logger.debug(f"{perm_table=}") + else: + logger.debug("perm_table = (omitted due to length)") + logger.debug(f"{location_perm=}") + logger.debug(f"{n_warmup=}") + logger.debug(f"{n_repeat=}") + + check_targets_controls(targets, controls, n_qubits) + n_targets = len(targets) + n_controls = len(controls) + + # cuStateVec handle initialization + handle = cusv.create() + stream = cp.cuda.Stream() + cusv.set_stream(handle, stream.ptr) + + size_sv = (2 ** n_qubits) + dtype_sv = precision_str_to_dtype(dtype_sv) + sv = cp.ones((size_sv,), dtype=dtype_sv) + data_type_sv = dtype_to_cuda_type(dtype_sv) + + # the diagonal matrix can live on either host (np) or device (cp) + matrix_dim = (2 ** n_targets) + dtype_diag = precision_str_to_dtype(dtype_diag) + xp_diag = cp if location_diag == 'device' else np + if diag: + # it's better to just call rng.uniform(), but it's not there until CuPy v12.0.0 + # rng_diag = xp_diag.random.default_rng(seed=1234) + # diag = rng_diag.uniform(0.7, 1.3, size=matrix_dim).astype(dtype_diag) + diag = 0.6 * xp_diag.random.random(size=matrix_dim).astype(dtype_diag) + 0.7 + if isinstance(diag, cp.ndarray): + diag_ptr = diag.data.ptr + elif isinstance(diag, np.ndarray): + diag_ptr = diag.ctypes.data + else: + raise ValueError + else: + diag_ptr = 0 + data_type_diag = dtype_to_cuda_type(dtype_diag) + + # the permutation table can live on either host (np) or device (cp) + xp_perm = cp if location_perm == 'device' else np + if perm_table: + if perm_table is True: + original_perm_table = xp_perm.arange(0, matrix_dim, 
dtype=xp_perm.int64) + perm_table = xp_perm.copy(original_perm_table) + # it'd have been nice to seed an rng and call rng.shuffle(), but CuPy does + # not support it yet... + while True: + xp_perm.random.shuffle(perm_table) + # check if the matrix is not diagonal + if not (original_perm_table == perm_table).all(): + break + else: # a user-provided list + check_sequence(perm_table, expected_size=matrix_dim, name="perm_table") + perm_table = xp_perm.asarray(perm_table, dtype=xp_perm.int64) + + if isinstance(perm_table, cp.ndarray): + perm_table_ptr = perm_table.data.ptr + elif isinstance(perm_table, np.ndarray): + perm_table_ptr = perm_table.ctypes.data + else: + raise ValueError + else: + perm_table_ptr = 0 + + cp.cuda.Device().synchronize() # ensure data prep is done before switching stream + + #################################################################################### + + # manage the workspace + workspace_size = cusv.apply_generalized_permutation_matrix_get_workspace_size( + handle, data_type_sv, n_qubits, perm_table_ptr, diag_ptr, + data_type_diag, targets, n_targets, n_controls) + + with stream: + if workspace_size > 0: + workspace = cp.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 + + # apply diagonal/permutation gate + apply_generalized_permutation_matrix = wrap_with_nvtx( + cusv.apply_generalized_permutation_matrix, + "apply_generalized_permutation_matrix") + args = ( + handle, sv.data.ptr, data_type_sv, n_qubits, perm_table_ptr, + diag_ptr, data_type_diag, adjoint, targets, n_targets, + controls, 0, # TODO: support control bit values + n_controls, workspace_ptr, workspace_size) + result = benchmark( + apply_generalized_permutation_matrix, + args, + n_warmup=n_warmup, n_repeat=n_repeat) + + # destroy handle + cusv.destroy(handle) + + logger.debug(str(result)) + cpu_time = np.average(result.cpu_times) + gpu_time = np.average(result.gpu_times[0]) + memory_footprint = (2. ** (n_qubits - n_controls)) * 2. 
* np.dtype(dtype_sv).itemsize + logger.debug(f"effective bandwidth = {memory_footprint / gpu_time * 1e-9} (GB/s)") + + return cpu_time, gpu_time diff --git a/benchmarks/cuquantum_benchmarks/benchmarks/apply_matrix.py b/benchmarks/cuquantum_benchmarks/benchmarks/apply_matrix.py index 0e82db2..dd9c0c1 100644 --- a/benchmarks/cuquantum_benchmarks/benchmarks/apply_matrix.py +++ b/benchmarks/cuquantum_benchmarks/benchmarks/apply_matrix.py @@ -10,8 +10,9 @@ from cuquantum import custatevec as cusv -from .._utils import (dtype_to_cuda_type, dtype_to_compute_type, precision_str_to_dtype, - random_unitary, L2flush, benchmark_with_prerun) +from .._utils import (benchmark_with_prerun, check_targets_controls, dtype_to_cuda_type, + dtype_to_compute_type, L2flush, precision_str_to_dtype, + random_unitary, wrap_with_nvtx) # set up a logger @@ -21,33 +22,26 @@ def test_apply_matrix( n_qubits, targets, controls, dtype_sv, dtype_mat, layout, adjoint, - n_warmup, n_repeat, location, *, flush_l2=False): - logger.debug(f"{n_qubits = }") - logger.debug(f"{targets = }") - logger.debug(f"{controls = }") - logger.debug(f"{dtype_sv = }") - logger.debug(f"{dtype_mat = }") - logger.debug(f"{layout = }") - logger.debug(f"{adjoint = }") - logger.debug(f"{location = }") - logger.debug(f"{n_warmup = }") - logger.debug(f"{n_repeat = }") - logger.debug(f"{flush_l2 = }") + n_warmup, n_repeat, location, *, + flush_l2=False, benchmark_data=None): + logger.debug(f"{n_qubits=}") + logger.debug(f"{targets=}") + logger.debug(f"{controls=}") + logger.debug(f"{dtype_sv=}") + logger.debug(f"{dtype_mat=}") + logger.debug(f"{layout=}") + logger.debug(f"{adjoint=}") + logger.debug(f"{location=}") + logger.debug(f"{n_warmup=}") + logger.debug(f"{n_repeat=}") + logger.debug(f"{flush_l2=}") dtype_sv = precision_str_to_dtype(dtype_sv) dtype_mat = precision_str_to_dtype(dtype_mat) xp = cp if location == 'device' else np layout = cusv.MatrixLayout.ROW if layout == "row" else cusv.MatrixLayout.COL - # simple sanity 
checks - assert len(targets) >= 1, "must have at least 1 target qubit" - _targets = set(targets) - assert len(_targets) == len(targets), "target qubit IDs cannot overlap" - _controls = set(controls) - assert len(_controls) == len(controls), "control qubits IDs cannot overlap" - assert len(_targets & _controls) == 0, "targets and controls cannot overlap" - _involved = targets + controls - assert 0 <= min(_involved) and max(_involved) < n_qubits, f"involved qubit IDs must be in range [0, {n_qubits})" + check_targets_controls(targets, controls, n_qubits) size_sv = 2**n_qubits n_targets = len(targets) @@ -109,19 +103,21 @@ def test_apply_matrix( controls, 0, n_controls, # TODO: support control bit values compute_type, workspace_ptr, workspace_size) + apply_matrix = wrap_with_nvtx(cusv.apply_matrix, "apply_matrix") + if flush_l2: l2flusher = L2flush() def f(*args, **kwargs): l2flusher.flush() # clear L2 cache result = benchmark_with_prerun( - cusv.apply_matrix, + apply_matrix, args, n_warmup=n_warmup, n_repeat=n_repeat, pre_run=f) else: result = benchmark( - cusv.apply_matrix, + apply_matrix, args, n_warmup=n_warmup, n_repeat=n_repeat) diff --git a/benchmarks/cuquantum_benchmarks/benchmarks/cusv_sampler.py b/benchmarks/cuquantum_benchmarks/benchmarks/cusv_sampler.py new file mode 100644 index 0000000..d404454 --- /dev/null +++ b/benchmarks/cuquantum_benchmarks/benchmarks/cusv_sampler.py @@ -0,0 +1,102 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import logging + +import numpy as np +import cupy as cp +from cupyx.profiler import benchmark + +from cuquantum import custatevec as cusv + +from .._utils import (check_sequence, dtype_to_cuda_type, precision_str_to_dtype, + wrap_with_nvtx) + + +# set up a logger +logger_name = "cuquantum-benchmarks" +logger = logging.getLogger(logger_name) + + +def test_cusv_sampler( + n_qubits, dtype_sv, bit_ordering, n_shots, output_order, n_warmup, n_repeat, *, + 
benchmark_data=None): + logger.debug(f"{n_qubits=}") + logger.debug(f"{dtype_sv=}") + logger.debug(f"{bit_ordering=}") + logger.debug(f"{n_shots=}") + logger.debug(f"{output_order}") + logger.debug(f"{n_warmup=}") + logger.debug(f"{n_repeat=}") + + check_sequence(bit_ordering, max_size=n_qubits, name="bit_ordering") + dtype_sv = precision_str_to_dtype(dtype_sv) + size_sv = (1 << n_qubits) + + # the statevector must reside on device + sv = cp.ones((size_sv,), dtype=dtype_sv) + sv /= np.sqrt(size_sv) + # assert cp.allclose(cp.sum(cp.abs(sv)**2), 1) + data_type_sv = dtype_to_cuda_type(dtype_sv) + + # the output bitstrings must reside on host + bit_strings = np.empty((n_shots,), dtype=np.int64) + + # the random seeds must be a host array + randnums = np.random.random((n_shots,)).astype(np.float64) + + cp.cuda.Device().synchronize() # ensure data prep is done before switching stream + + #################################################################################### + + # cuStateVec handle initialization + handle = cusv.create() + stream = cp.cuda.Stream() + cusv.set_stream(handle, stream.ptr) + + # create sampler and check the size of external workspace + sampler, workspace_size = cusv.sampler_create( + handle, sv.data.ptr, data_type_sv, n_qubits, n_shots) + + with stream: + # manage the workspace + if workspace_size > 0: + workspace = cp.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 + + # sample preprocess + sampler_preprocess = wrap_with_nvtx( + cusv.sampler_preprocess, "sampler_preprocess") + args = (handle, sampler, workspace_ptr, workspace_size) + + result1 = benchmark( + sampler_preprocess, + args, + n_warmup=n_warmup, n_repeat=n_repeat) + logger.debug(str(result1)) + + # sample bit strings + sampler_sample = wrap_with_nvtx( + cusv.sampler_sample, "sampler_sample") + args = ( + handle, sampler, bit_strings.ctypes.data, bit_ordering, len(bit_ordering), + randnums.ctypes.data, n_shots, + cusv.SamplerOutput.RANDNUM_ORDER 
if output_order == "random" else cusv.SamplerOutput.ASCENDING_ORDER) + + result2 = benchmark( + sampler_sample, + args, + n_warmup=n_warmup, n_repeat=n_repeat) + logger.debug(str(result2)) + + # clean up + cusv.sampler_destroy(sampler) + cusv.destroy(handle) + + cpu_time = np.average(result1.cpu_times) + np.average(result2.cpu_times) + gpu_time = np.average(result1.gpu_times[0]) + np.average(result2.gpu_times[0]) + + return cpu_time, gpu_time diff --git a/benchmarks/cuquantum_benchmarks/benchmarks/tensor_decompose.py b/benchmarks/cuquantum_benchmarks/benchmarks/tensor_decompose.py new file mode 100644 index 0000000..bb711ba --- /dev/null +++ b/benchmarks/cuquantum_benchmarks/benchmarks/tensor_decompose.py @@ -0,0 +1,116 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import logging +import os +import sys + +import cupy as cp +import numpy as np +from cupyx.profiler import benchmark + +import cuquantum.cutensornet as cutn +from cuquantum.cutensornet import tensor + +from .._utils import precision_str_to_dtype, wrap_with_nvtx +try: + path = os.environ.get('CUTENSORNET_APPROX_TN_UTILS_PATH', '') + if path and os.path.isfile(path): + sys.path.insert(1, os.path.dirname(path)) + from approxTN_utils import tensor_decompose +except ImportError: + tensor_decompose = None + + +# set up a logger +logger_name = "cuquantum-benchmarks" +logger = logging.getLogger(logger_name) + + +def benchmark_tensor_decompose( + expr, shape, precision, is_complex, method, algorithm, n_warmup, n_repeats, check_ref, *, + benchmark_data=None): + logger.debug(f"{expr=}") + logger.debug(f"{shape=}") + logger.debug(f"{precision=}") + logger.debug(f"{is_complex=}") + logger.debug(f"{method=}") + logger.debug(f"{algorithm=}") + logger.debug(f"{n_warmup=}") + logger.debug(f"{n_repeats=}") + logger.debug(f"{check_ref=}") + + cp.random.seed(5678) # TODO: set me + handle = cutn.create() + options = {'handle': handle} + decomp_subscripts = expr + + # 
sanity checks + expr_in = expr.split('->')[0] + assert len(expr_in) == len(shape), \ + f"the input shape {shape} mismatches with the input modes {expr_in}" + if check_ref and tensor_decompose is None: + raise RuntimeError("--check-reference is not supported") + + dtype_r = precision_str_to_dtype(precision, False) + t_in = cp.random.random(shape, dtype=dtype_r) + if is_complex: + dtype = precision_str_to_dtype(precision) + t_in = t_in.astype(dtype) + t_in += 1j*cp.random.random(shape, dtype=dtype_r) + assert t_in.dtype == dtype + + t_numpy = t_in.get() + + if method == "QR": + kwargs = {'options': options} + if check_ref: + options_ref = {'method':'qr'} + elif method == "SVD": + try: + kwargs = {'options': options, 'method': tensor.SVDMethod(algorithm=algorithm)} + except TypeError as e: + if algorithm != "gesvd": + raise ValueError(f"{algorithm} requires cuQuantum v23.06+") from e + else: + kwargs = {'options': options, 'method': tensor.SVDMethod()} + if check_ref: + options_ref = {'method':'svd'} + else: + assert False + cp.cuda.Device().synchronize() # ensure data prep is done + + decompose = wrap_with_nvtx(tensor.decompose, "decompose") + + results = benchmark(decompose, + (decomp_subscripts, t_in), kwargs=kwargs, + n_repeat=n_repeats, n_warmup=n_warmup) + + if check_ref: + decompose_ref = wrap_with_nvtx(tensor_decompose, "tensor_decompose") + + results_cupy = benchmark(decompose_ref, + (decomp_subscripts, t_in), kwargs=options_ref, + n_repeat=n_repeats, n_warmup=n_warmup) + + results_numpy = benchmark(decompose_ref, + (decomp_subscripts, t_numpy), kwargs=options_ref, + n_repeat=n_repeats, n_warmup=n_warmup) + + cutn.destroy(handle) + + logger.debug(str(results)) + if check_ref: + logger.debug("ref (CuPy):") + logger.debug(str(results_cupy)) + benchmark_data['cupy_time'] = max( + np.average(results_cupy.cpu_times), np.average(results_cupy.gpu_times[0])) + logger.debug("ref (NumPy):") + logger.debug(str(results_numpy)) + benchmark_data['numpy_time'] = 
np.average(results_numpy.cpu_times) + + cpu_time = np.average(results.cpu_times) + gpu_time = np.average(results.gpu_times[0]) + + return cpu_time, gpu_time diff --git a/benchmarks/cuquantum_benchmarks/config.py b/benchmarks/cuquantum_benchmarks/config.py index 178f0d9..04581c6 100644 --- a/benchmarks/cuquantum_benchmarks/config.py +++ b/benchmarks/cuquantum_benchmarks/config.py @@ -23,12 +23,6 @@ 'qft': { 'benchmark': QFT, - 'nqubits': { - 'default': list(range(16, 32, 4)) + [30], - '3090': list(range(16, 32, 4)) + [30], - 'A6000': list(range(16, 32, 4)) + [30], - 'A100-SXM4-80GB': list(range(16, 34, 2)) + [33], - }, 'config': { 'measure': True, }, @@ -36,12 +30,6 @@ 'iqft': { 'benchmark': IQFT, - 'nqubits': { - 'default': list(range(16, 32, 4)) + [30], - '3090': list(range(16, 32, 4)) + [30], - 'A6000': list(range(16, 32, 4)) + [30], - 'A100-SXM4-80GB': list(range(16, 34, 2)) + [33], - }, 'config': { 'measure': True, }, @@ -49,12 +37,6 @@ 'ghz': { 'benchmark': GHZ, - 'nqubits': { - 'default': list(range(16, 32, 4)) + [30], - '3090': list(range(16, 32, 4)) + [30], - 'A6000': list(range(16, 32, 4)) + [30], - 'A100-SXM4-80GB': list(range(16, 34, 2)) + [33], - }, 'config': { 'measure': True, }, @@ -62,12 +44,6 @@ 'simon': { 'benchmark': Simon, - 'nqubits': { - 'default': list(range(6, 16, 2)) + [15], - '3090': list(range(6, 16, 2)) + [15], - 'A6000': list(range(6, 16, 2)) + [15], - 'A100-SXM4-80GB': list(range(6, 17, 1)), - }, 'config': { 'measure': True, }, @@ -75,12 +51,6 @@ 'hidden_shift': { 'benchmark': HiddenShift, - 'nqubits': { - 'default': list(range(16, 32, 4)) + [30], - '3090': list(range(16, 32, 4)) + [30], - 'A6000': list(range(16, 32, 4)) + [30], - 'A100-SXM4-80GB': list(range(16, 34, 2)) + [33], - }, 'config': { 'measure': True, }, @@ -88,12 +58,6 @@ 'qaoa': { 'benchmark': QAOA, - 'nqubits': { - 'default': list(range(16, 32, 4)) + [30], - '3090': list(range(16, 32, 4)) + [30], - 'A6000': list(range(16, 32, 4)) + [30], - 'A100-SXM4-80GB': list(range(16, 
34, 2)) + [33], - }, 'config': { 'measure': True, 'p': 1, @@ -102,12 +66,6 @@ 'qpe': { 'benchmark': QPE, - 'nqubits': { - 'default': list(range(16, 32, 4)) + [30], - '3090': list(range(16, 32, 4)) + [30], - 'A6000': list(range(16, 32, 4)) + [30], - 'A100-SXM4-80GB': list(range(16, 34, 2)), - }, 'config': { 'measure': True, 'unfold': False, @@ -116,9 +74,6 @@ 'quantum_volume': { 'benchmark': QuantumVolume, - 'nqubits': { - 'default': list(range(16, 32, 4)) + [30], - }, 'config': { 'measure': True, }, @@ -126,12 +81,6 @@ 'random': { 'benchmark': Random, - 'nqubits': { - 'default': list(range(16, 32, 4)) + [30], - '3090': list(range(16, 32, 4)) + [30], - 'A6000': list(range(16, 32, 4)) + [30], - 'A100-SXM4-80GB': list(range(16, 34, 2)), - }, 'config': { 'measure': True, }, @@ -152,6 +101,7 @@ # TODO: even this may not be a good default 'ncputhreads': multiprocessing.cpu_count() // 2, 'precision': 'single', + 'nhypersamples': 32, }, }, @@ -295,6 +245,17 @@ }, }, + # dummy + 'pennylane-dumper': { + 'config': { + 'nshots': 1024, + 'nfused': None, + 'ngpus': 0, + 'ncputhreads': 1, + 'precision': 'single', + }, + }, + 'qulacs-gpu': { 'config': { 'nshots': 1024, diff --git a/benchmarks/cuquantum_benchmarks/frontends/frontend_dumper.py b/benchmarks/cuquantum_benchmarks/frontends/frontend_dumper.py new file mode 100644 index 0000000..74217d2 --- /dev/null +++ b/benchmarks/cuquantum_benchmarks/frontends/frontend_dumper.py @@ -0,0 +1,151 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cmath +import logging +from math import pi + +import numpy as np + +from .frontend import Frontend +from .._utils import call_by_root + + +# set up a logger +logger_name = "cuquantum-benchmarks" +logger = logging.getLogger(logger_name) + + +class Dumper(Frontend): + """Special frontend for dumping the gate sequence as pure text to disk. + + Each gate (or operation) would be stored as 3 lines, with elements separated by 1 space: + + 1. 
n_targets n_controls + 2. targets controls + 3. contiguity actual_matrix_data + + Note that the qubit IDs are zero-based. The matrix data is flattened to a 1D contiguous + array of length 2**(2*n_targets). The contiguity is a single character "C" (for C-major, + or row-major) or "F" (for Fortran-major, or column-major) for how to interpret the matrix. + All complex numbers are stored as two real numbers (ex: 0.5-0.1j -> "0.5 -0.1"). + + As an example, a CCX gate acting on qubit 0 and controlled by qubits 2 & 4 is stored as + + ''' + 1 2\n + 0 2 4\n + C 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0\n + ''' + + Currently the measurement operation at the end of the gate sequence is not stored. + + An empty line can be used to separate different gates/operations and improve readability, + but it is not required. + """ + + def __init__(self, nqubits, config): + precision = config['precision'] + self.dtype = np.complex64 if precision == 'single' else np.complex128 + self.dtype = np.dtype(self.dtype) + circuit_filename = config['circuit_filename'] + self.circuit_filename = circuit_filename.replace('.pickle', '_raw.txt') + self.nqubits = nqubits + self.order = 'C' # TODO + self.digits = 12 # TODO + + def _dump_op(self, op, targets, controls=()): + op = np.array2string( + op.astype(self.dtype).reshape(-1, order=self.order).view(self.dtype.char.lower()), + max_line_width=np.inf, + precision=self.digits, + ) + if isinstance(targets, int): + targets = (targets,) + if isinstance(controls, int): + controls = (controls,) + + op_data = f"{len(targets)} {len(controls)}\n" + for t in targets: + op_data += f"{t} " + for c in controls: + op_data += f"{c} " + op_data += f"\n{self.order} " + op_data += f"{op[1:-1]}\n\n" + + return op_data + + def _get_rotation_matrix(self, theta, phi, lam): + matrix = np.empty((2, 2), dtype=self.dtype) + theta *= 0.5 + matrix[0, 0] = cmath.cos(theta) + matrix[0, 1] = - cmath.sin(theta) * cmath.exp(1j*lam) + matrix[1, 0] = cmath.sin(theta) * cmath.exp(1j*phi) + 
matrix[1, 1] = cmath.cos(theta) * cmath.exp(1j*(phi+lam)) + matrix = np.asarray(matrix) + return matrix + + def generateCircuit(self, gateSeq): + circuit = '' + + for g in gateSeq: + if g.id == 'h': + circuit += self._dump_op( + np.asarray([[1, 1], [1, -1]])/np.sqrt(2), g.targets) + + elif g.id == 'x': + circuit += self._dump_op( + np.asarray([[0, 1], [1, 0]]), g.targets) + + elif g.id == 'cnot': + # TODO: use 4*4 version (merge targets & controls)? + circuit += self._dump_op( + np.asarray([[0, 1], [1, 0]]), g.targets, g.controls) + + elif g.id == 'cz': + # TODO: use 4*4 version (merge targets & controls)? + circuit += self._dump_op( + np.asarray([[1, 0], [0, -1]]), g.targets, g.controls) + + elif g.id == 'rz': + circuit += self._dump_op( + self._get_rotation_matrix(0, g.params, 0), g.targets) + + elif g.id == 'rx': + circuit += self._dump_op( + self._get_rotation_matrix(g.params, -pi/2, pi/2), g.targets) + + elif g.id == 'ry': + circuit += self._dump_op( + self._get_rotation_matrix(g.params, 0, 0), g.targets) + + elif g.id == 'czpowgate': + matrix = np.eye(2, dtype=self.dtype) + matrix[1, 1] = cmath.exp(1j*pi*g.params) + circuit += self._dump_op(matrix, g.targets, g.controls) + + elif g.id == 'swap': + assert len(g.targets) == 2 + matrix = np.eye(4, dtype=self.dtype) + matrix[1:3, 1:3] = [[0, 1], [1, 0]] + circuit += self._dump_op(matrix, g.targets) + + elif g.id == 'cu': + circuit += self._dump_op(g.matrix, g.targets, g.controls) + + elif g.id == 'u': + circuit += self._dump_op(g.matrix, g.targets) + + elif g.id == 'measure': + pass # treated as no-op for now + + else: + raise NotImplementedError(f"the gate type {g.id} is not defined") + + def dump(): + logger.info(f"dumping (raw) circuit as {self.circuit_filename} ...") + with open(self.circuit_filename, 'w') as f: + f.write(circuit) + + call_by_root(dump) diff --git a/benchmarks/cuquantum_benchmarks/frontends/frontend_pny.py b/benchmarks/cuquantum_benchmarks/frontends/frontend_pny.py index fe5c32b..1265262 
100644 --- a/benchmarks/cuquantum_benchmarks/frontends/frontend_pny.py +++ b/benchmarks/cuquantum_benchmarks/frontends/frontend_pny.py @@ -25,7 +25,7 @@ def __init__(self, nqubits, config): def generateCircuit(self, gateSeq): last_g = gateSeq[-1] assert last_g.id == "measure" # TODO: relax this? - + def circuit(): measured_qs = None @@ -71,5 +71,5 @@ def circuit(): raise NotImplementedError(f"The gate type {g.id} is not defined") return pennylane.sample(wires=measured_qs) - - return circuit \ No newline at end of file + + return circuit diff --git a/benchmarks/cuquantum_benchmarks/frontends/frontend_qiskit.py b/benchmarks/cuquantum_benchmarks/frontends/frontend_qiskit.py index 98a8211..8c4230a 100644 --- a/benchmarks/cuquantum_benchmarks/frontends/frontend_qiskit.py +++ b/benchmarks/cuquantum_benchmarks/frontends/frontend_qiskit.py @@ -57,12 +57,12 @@ def generateCircuit(self, gateSeq): elif g.id == 'cu': U_gate = UnitaryGate(g.matrix, g.name).control(1) - circuit.append(U_gate, [g.controls]+g.targets) + circuit.append(U_gate, [g.controls]+g.targets[::-1]) elif g.id == 'u': # TODO: give the gate a name? 
U_gate = UnitaryGate(g.matrix) - circuit.append(U_gate, g.targets) + circuit.append(U_gate, g.targets[::-1]) elif g.id == 'measure': circuit.measure(g.targets, g.targets) diff --git a/benchmarks/cuquantum_benchmarks/run.py b/benchmarks/cuquantum_benchmarks/run.py index eb0a80a..6f7daa3 100644 --- a/benchmarks/cuquantum_benchmarks/run.py +++ b/benchmarks/cuquantum_benchmarks/run.py @@ -11,8 +11,9 @@ from .config import benchmarks from .config import backends as backend_config from .frontends import frontends -from .run_interface import run_interface, BenchApiRunner -from ._utils import str_to_seq, MPHandler, RawTextAndDefaultArgFormatter +from .run_interface import BenchApiRunner, BenchCircuitRunner +from ._utils import (EarlyReturnError, MPHandler, RawTextAndDefaultArgFormatter, + str_to_seq,) frontend_names = [f for f in frontends.keys()] @@ -20,13 +21,13 @@ benchmark_names = [b for b in benchmarks.keys()] -main_description = api_description = r""" +main_description = api_description = circuit_description = r""" =============== NVIDIA cuQuantum Performance Benchmark Suite =============== """ -circuit_description = r""" -=============== NVIDIA cuQuantum Performance Benchmark Suite =============== +circuit_description += r""" +Note: all frontends and backends are optional and unavailable for use unless installed. Supported Frontends: @@ -57,8 +58,6 @@ - qulacs-gpu: runs the Qulacs GPU backend - qulacs-cpu: runs the Qulacs CPU backend -Note: all frontends and backends are optional and unavailable for use unless installed. 
- ============================================================================ """ @@ -82,9 +81,8 @@ help=f'set the simulator frontend') parser_circuit.add_argument('--backend', type=str, required=True, choices=backend_names, help=f'set the simulator backend that is compatible with the frontend') -# TODO -#parser.add_argument('--append', help='only add to existing benchmarking data rather than overwrite any data', action='store_true') parser_circuit.add_argument('--new', help='create a new circuit rather than use existing circuit', action='store_true') + # these options make sense to both circuit & api benchmarks, for better UX we need to copy/paste parser_circuit.add_argument('--cachedir', type=str, default='.', help='set the directory to cache generated data') parser_circuit.add_argument('--nqubits', type=int, help='set the number of qubits for each benchmark (circuit/api)') @@ -100,30 +98,33 @@ backend.add_argument('--nfused', type=int, help='set the maximum number of fused qubits for gate matrix fusion') backend.add_argument('--precision', type=str, choices=('single', 'double'), help='set the floating-point precision') -backend.add_argument('--cusvaer-global-index-bits', type=str_to_seq, nargs='?', const='', default=-1, + +backend_cusvaer = parser_circuit.add_argument_group('cusvaer-specific options') +backend_cusvaer.add_argument('--cusvaer-global-index-bits', type=str_to_seq, nargs='?', const='', default=-1, help='set the global index bits to specify the inter-node network structure. Please refer to the ' 'cusvaer backend documentation for further details. If not followed by any argument, ' 'the default (empty sequence) is used; ' 'otherwise, the argument should be a comma-separated string. 
' 'Setting this option is mandatory for the cusvaer backend and an error otherwise') -backend.add_argument('--cusvaer-p2p-device-bits', type=int, nargs='?', const=0, default=-1, +backend_cusvaer.add_argument('--cusvaer-p2p-device-bits', type=int, nargs='?', const=0, default=-1, help='set the number of p2p device bits. Please refer to the cusvaer backend documentation ' 'for further details. If not followed by any argument, the default (0) is used. ' 'Setting this option is mandatory for the cusvaer backend and an error otherwise') -backend.add_argument('--cusvaer-data-transfer-buffer-bits', type=int, default=26, +backend_cusvaer.add_argument('--cusvaer-data-transfer-buffer-bits', type=int, default=26, help='set the size of the data transfer buffer in cusvaer. The size is ' 'specified as a positive integer. The buffer sized used is (1 << [#bits]). ' 'The default is 26 (64 MiB = 1 << 26)') -backend.add_argument('--cusvaer-comm-plugin-type', type=str, nargs='?', default='mpi_auto', +backend_cusvaer.add_argument('--cusvaer-comm-plugin-type', type=str, nargs='?', default='mpi_auto', choices=['mpi_auto', 'mpi_openmpi', 'mpi_mpich', 'external', 'self'], help='set the type of comm plugin used for multi-process simulation. ' - 'Required to set this option when one needs to use a custom comm plugin. ' - 'Acceptable values are mpi_auto, mpi_openmpi, mpi_mpich and external. ' - 'The default is mpi_auto.') -backend.add_argument('--cusvaer-comm-plugin-soname', type=str, nargs='?', default='', + 'Required to set this option when one needs to use a custom comm plugin.') +backend_cusvaer.add_argument('--cusvaer-comm-plugin-soname', type=str, nargs='?', default='', help='specify the name of a shared library used for inter-process communication. 
' 'Required to set this option when one needs to use a custom comm plugin') +backend_cutn = parser_circuit.add_argument_group('cutn-specific options') +backend_cutn.add_argument('--nhypersamples', type=int, default=32, help='set the number of hypersamples for the pathfinder to explore') + # "cuquantum-benchmarks api" subcommand parser_api = subparsers.add_parser( @@ -131,38 +132,130 @@ description=api_description, help="benchmark different APIs from cuQuantum's libraries", formatter_class=RawTextAndDefaultArgFormatter) -parser_api.add_argument('--benchmark', type=str, choices=('apply_matrix',), - help=f'pick the API to benchmark') +parser_api.add_argument('--benchmark', type=str, required=True, + choices=BenchApiRunner.supported_apis, + help=f'pick the API to benchmark. Specify a benchmark with -h/--help can see detailed help message.') parser_api.add_argument('--precision', type=str, choices=('single', 'double'), default='single', help='set the floating-point precision') -apply_matrix = parser_api.add_argument_group('apply_matrix-specific options') - -targets = apply_matrix.add_mutually_exclusive_group(required=True) -targets.add_argument('--targets', type=str_to_seq, - help="set the (comma-separated) target qubit IDs") -targets.add_argument('--ntargets', type=int, help='set the number of target qubits') - -controls = apply_matrix.add_mutually_exclusive_group(required=False) -controls.add_argument('--controls', type=str_to_seq, - help="set the (comma-separated) control qubit IDs") -controls.add_argument('--ncontrols', type=int, help='set the number of target qubits') - -apply_matrix.add_argument('--layout', type=str, choices=('row', 'column'), default='row', - help='set the gate matrix layout') -apply_matrix.add_argument('--adjoint', action='store_true', help='apply the matrix adjoint') -apply_matrix.add_argument('--location', type=str, choices=('device', 'host'), default='host', - help='set the location of the gate matrix') -apply_matrix.add_argument('--nqubits', 
type=int, required=True, - help='set the total number of qubits') -apply_matrix.add_argument('--flush-cache', action='store_true', help='flush the L2 cache for more accurate timing') - # these options make sense to both circuit & api benchmarks, for better UX we need to copy/paste +# TODO: set the arguments programmatically to avoid dups parser_api.add_argument('--cachedir', type=str, default='.', help='set the directory to cache generated data') parser_api.add_argument('--nwarmups', type=int, default=3, help='set the number of warm-up runs for each benchmark') parser_api.add_argument('--nrepeats', type=int, default=10, help='set the number of repetitive runs for each benchmark') parser_api.add_argument('-v', '--verbose', help='output extra information during benchmarking', action='store_true') +# add_api_benchmark_options() can only be called once throughout the process's lifetime +_is_api_benchmark_options_added = False + +def add_api_benchmark_options(parser_api, args=None): + # benchmark-specific options + global _is_api_benchmark_options_added + if _is_api_benchmark_options_added: return + + # hack: we want dynamic behavior but the parser can't do the job properly + target = None + if args is None: + what_to_parse = sys.argv # parsing from cmdline + else: + what_to_parse = args + try: + idx = what_to_parse.index('--benchmark') + target = what_to_parse[idx+1] + except (ValueError, IndexError): + return + assert target is not None + + if target == 'apply_matrix': + apply_matrix = parser_api.add_argument_group('apply_matrix-specific options') + + targets = apply_matrix.add_mutually_exclusive_group(required=True) + targets.add_argument('--targets', type=str_to_seq, + help="set the (comma-separated) target qubit IDs") + targets.add_argument('--ntargets', type=int, help='set the number of target qubits') + + controls = apply_matrix.add_mutually_exclusive_group(required=False) + controls.add_argument('--controls', type=str_to_seq, + help="set the (comma-separated) 
control qubit IDs") + controls.add_argument('--ncontrols', type=int, help='set the number of target qubits') + + apply_matrix.add_argument('--layout', type=str, choices=('row', 'column'), default='row', + help='set the gate matrix layout') + apply_matrix.add_argument('--adjoint', action='store_true', help='apply the matrix adjoint') + apply_matrix.add_argument('--location', type=str, choices=('device', 'host'), default='host', + help='set the location of the gate matrix') + apply_matrix.add_argument('--nqubits', type=int, required=True, + help='set the total number of qubits') + apply_matrix.add_argument('--flush-cache', action='store_true', help='flush the L2 cache for more accurate timing') + + if target == 'apply_generalized_permutation_matrix': + apply_gen_perm_matrix = parser_api.add_argument_group('apply_generalized_permutation_matrix-specific options') + apply_gen_perm_matrix.add_argument('--nqubits', type=int, required=True, + help='set the total number of qubits') + + targets = apply_gen_perm_matrix.add_mutually_exclusive_group(required=True) + targets.add_argument('--targets', type=str_to_seq, + help="set the (comma-separated) target qubit IDs") + targets.add_argument('--ntargets', type=int, help='set the number of target qubits') + + controls = apply_gen_perm_matrix.add_mutually_exclusive_group(required=False) + controls.add_argument('--controls', type=str_to_seq, + help="set the (comma-separated) control qubit IDs") + controls.add_argument('--ncontrols', type=int, help='set the number of control qubits') + + apply_gen_perm_matrix.add_argument('--adjoint', action='store_true', + help='apply the matrix adjoint') + apply_gen_perm_matrix.add_argument('--has-diag', action='store_true', + help='whether the diagonal matrix is nontrivial (not an identity)') + apply_gen_perm_matrix.add_argument('--location-diag', type=str, choices=('device', 'host'), default='host', + help='set the location of the diagonal matrix') + 
apply_gen_perm_matrix.add_argument('--precision-diag', type=str, choices=('single', 'double'), default='single', + help='set the floating-point precision of the diagonal matrix') + + perm = apply_gen_perm_matrix.add_mutually_exclusive_group(required=False) + perm.add_argument('--has-perm', action='store_true', + help='whether the permutation matrix is nontrivial (not an identity)') + perm.add_argument('--perm-table', type=str_to_seq, + help='set the permutation table for constructing a permutation matrix') + + apply_gen_perm_matrix.add_argument('--location-perm', type=str, choices=('device', 'host'), default='host', + help='set the location of the permutation matrix') + + elif target == 'cusv_sampler': + sampler = parser_api.add_argument_group('cusv_sampler-specific options') + bitordering = sampler.add_mutually_exclusive_group(required=True) + bitordering.add_argument('--bit-ordering', type=str_to_seq, + help="set the (comma-separated) qubit IDs to sample") + bitordering.add_argument('--nbit-ordering', type=int, + help='set the number of qubits to sample') + sampler.add_argument('--nqubits', type=int, required=True, + help='set the total number of qubits') + sampler.add_argument('--nshots', type=int, default=1024, + help="set the number of shots") + sampler.add_argument('--output-order', choices=('random', 'ascending'), default='ascending', + help='set the order of bit strings in sampled outputs') + + elif target == 'tensor_decompose': + tensor_decompose = parser_api.add_argument_group('tensor_decompose-specific options') + + method = tensor_decompose.add_mutually_exclusive_group(required=True) + method.add_argument('--method', type=str, choices=('QR', 'SVD'), + help='the method for tensor decomposition; when SVD is chosen, gesvd will be used') + method.add_argument('--algorithm', type=str, choices=('gesvd', 'gesvdj', 'gesvdr', 'gesvdp'), + help='the algorithm for SVD decomposition') + + tensor_decompose.add_argument('--expr', type=str, required=True, + help='an 
einsum-like expression describing the decomposition; ' + 'the expression must be quoted with \' or \"') + tensor_decompose.add_argument('--shape', type=str_to_seq, required=True, + help='the shape of the input tensor') + tensor_decompose.add_argument('--is-complex', action='store_true', + help='whether the input tensor is complex-valued') + tensor_decompose.add_argument('--check-reference', action='store_true', default=False) + + _is_api_benchmark_options_added = True + + # set up a logger logger_name = "cuquantum-benchmarks" logger = logging.getLogger(logger_name) @@ -178,8 +271,8 @@ def run(args=None): # we allow args to be a list of cmd options for potential private use cases and tests + add_api_benchmark_options(parser_api, args) args = parser.parse_args(args) - #print(args) # Since run() might be called multiple times, in such case we don't wanna make any changes # to the handler in the 2nd time onward, this ensures we write to the same I/O stream and @@ -196,13 +289,16 @@ def run(args=None): pass else: logger.setLevel(level) + finally: + del args.verbose # dispatch to subcommands cmd = args.cmd + del args.cmd if cmd == "circuit": - selected_benchmarks = benchmarks if args.benchmark == 'all' else {args.benchmark: benchmarks[args.benchmark]} - selected_backend = (args.backend, backend_config[args.backend]) + del args.benchmark + config = backend_config[args.backend] if ((args.frontend == 'cirq' and args.backend not in ('cirq', 'cutn', *[k for k in backends.keys() if k.startswith('qsim')])) or (args.frontend == 'qiskit' and args.backend not in ('cutn', *[k for k in backends.keys() if 'aer' in k])) @@ -221,34 +317,25 @@ def run(args=None): if args.cusvaer_p2p_device_bits != -1: raise ValueError(f"cannot set --cusvaer-p2p-device-bits for backend {args.backend}") - run_interface(benchmarks=selected_benchmarks, - nqubits_interface=args.nqubits, - ngpus_interface=args.ngpus, - ncpu_threads_interface=args.ncputhreads, - frontend=args.frontend, - 
backend=selected_backend, - #append=args.append, - nwarmups=args.nwarmups, - nrepeats=args.nrepeats, - nshots_interface=args.nshots, - nfused_interface=args.nfused, - precision_interface=args.precision, - new_circ=args.new, - save=True, - cache_dir=args.cachedir, - cusvaer_global_index_bits=args.cusvaer_global_index_bits, - cusvaer_p2p_device_bits=args.cusvaer_p2p_device_bits, - cusvaer_data_transfer_buffer_bits=args.cusvaer_data_transfer_buffer_bits, - cusvaer_comm_plugin_type=args.cusvaer_comm_plugin_type, - cusvaer_comm_plugin_soname=args.cusvaer_comm_plugin_soname) + runner = BenchCircuitRunner( + benchmarks=selected_benchmarks, + backend_config=config, + **vars(args)) + # benchmark & dump result to cachedir + try: + runner.run() + except EarlyReturnError: + pass elif cmd == "api": - del args.cmd runner = BenchApiRunner(**vars(args)) # benchmark & dump result to cachedir - runner.run() + try: + runner.run() + except EarlyReturnError: + pass if __name__ == "__main__": diff --git a/benchmarks/cuquantum_benchmarks/run_interface.py b/benchmarks/cuquantum_benchmarks/run_interface.py index 409898f..c6fd005 100644 --- a/benchmarks/cuquantum_benchmarks/run_interface.py +++ b/benchmarks/cuquantum_benchmarks/run_interface.py @@ -4,18 +4,21 @@ import functools import logging +import math +import nvtx import os import pickle import random import time - import cupy as cp from .backends import createBackend from .frontends import createFrontend -from ._utils import (call_by_root, gen_run_env, HashableDict, is_running_mpiexec, - load_benchmark_data, report, save_benchmark_data, reseed, - is_running_mpi) +from ._utils import ( + call_by_root, create_cache, EarlyReturnError, gen_run_env, get_mpi_rank, HashableDict, + is_running_mpiexec, is_running_mpi, load_benchmark_data, report, reseed, + save_benchmark_data, +) # set up a logger @@ -23,102 +26,68 @@ logger = logging.getLogger(logger_name) -def run_interface( - benchmarks, nqubits_interface, ngpus_interface, 
ncpu_threads_interface, frontend, backend, nwarmups, nrepeats, nshots_interface, - nfused_interface, precision_interface, new_circ, save, cache_dir, - cusvaer_global_index_bits, cusvaer_p2p_device_bits, cusvaer_data_transfer_buffer_bits, cusvaer_comm_plugin_type, cusvaer_comm_plugin_soname): - - reseed(1234) # TODO: use a global seed? - backend, backend_config = backend # unpack - ngpus = ngpus_interface if ngpus_interface is not None else backend_config['config']['ngpus'] - ncpu_threads = ncpu_threads_interface if ncpu_threads_interface is not None else backend_config['config']['ncputhreads'] - nshots = nshots_interface if nshots_interface is not None else backend_config['config']['nshots'] - nfused = nfused_interface if nfused_interface is not None else backend_config['config']['nfused'] - precision = precision_interface if precision_interface is not None else backend_config['config']['precision'] +class BenchCircuitRunner: - general_interface = GeneralInterface(frontend=frontend, - backend=backend, - nshots=nshots, - nfused=nfused, - precision=precision, - #append=append, - new_circ=new_circ, - save=save) + # currently we assume the following subdirectories exist + required_subdirs = ('circuits', 'data') - for benchmark_name in benchmarks.keys(): # Iterate over diferent benchmarks - benchmark = benchmarks[benchmark_name] + def __init__(self, **kwargs): + # use default backend config unless users want to overwrite it + self.backend_config = backend_config = kwargs.pop("backend_config") + for k in (# generic backend options + "ngpus", "ncputhreads", "nshots", "nfused", "precision", + # cusvaer options + 'cusvaer_global_index_bits', 'cusvaer_p2p_device_bits', + 'cusvaer_data_transfer_buffer_bits', 'cusvaer_comm_plugin_type', + 'cusvaer_comm_plugin_soname', + # cutn options + 'nhypersamples'): + v = kwargs.pop(k) + if k.startswith('cusvaer') or v is not None: + setattr(self, k, v) + else: + setattr(self, k, backend_config['config'][k]) + + # To be parsed in run() + 
self._benchmarks = kwargs.pop("benchmarks") + self._nqubits = kwargs.pop("nqubits") + + # other common benchmark args + self.frontend = kwargs.pop("frontend") + self.backend = kwargs.pop("backend") + self.cache_dir = kwargs.pop("cachedir") + self.nwarmups = kwargs.pop("nwarmups") + self.nrepeats = kwargs.pop("nrepeats") + self.new_circ = kwargs.pop("new") + self.save = True + assert len(kwargs) == 0, f"unhandled cmdline args: {kwargs}" - gpu_device_properties = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id) - gpu_name = gpu_device_properties['name'].decode('utf-8').split(' ')[-1] - if gpu_name not in benchmark['nqubits']: - # Use the default config for this benchmark if there is no GPU-specific config - gpu_name = 'default' - nqubits_list = [nqubits_interface] if nqubits_interface else benchmark['nqubits'][gpu_name] - - benchmark_object = benchmark['benchmark'] - config = benchmark['config'] - config['precision'] = precision # WAR - - for nqubits in nqubits_list: # Iterate over diferent number of qubits - run_specific = RunSpecific(benchmark_name=benchmark_name, - benchmark_object=benchmark_object, - nqubits=nqubits, - ngpus=ngpus, - ncpu_threads=ncpu_threads, - nwarmups=nwarmups, - nrepeats=nrepeats, - config=config, - general_interface=general_interface, - cache_dir=cache_dir, - cusvaer_global_index_bits=cusvaer_global_index_bits, - cusvaer_p2p_device_bits=cusvaer_p2p_device_bits, - cusvaer_data_transfer_buffer_bits=cusvaer_data_transfer_buffer_bits, - cusvaer_comm_plugin_type=cusvaer_comm_plugin_type, - cusvaer_comm_plugin_soname=cusvaer_comm_plugin_soname) - run_specific.run() - - -class GeneralInterface: - - def __init__(self, frontend, backend, nshots, nfused, precision, new_circ, save): - self.frontend = frontend - self.backend = backend - self.nshots = nshots - self.nfused = nfused - self.precision = precision - #self.append = append - self.new_circ = new_circ - self.save = save self.full_data = {} + self.benchmark_data = {} + # it could be that 
the cache dirs are not created yet + call_by_root(functools.partial(create_cache, self.cache_dir, self.required_subdirs)) -class RunSpecific: - - def __init__( - self, benchmark_name, benchmark_object, nqubits, ngpus, ncpu_threads, nwarmups, nrepeats, config, - general_interface, cache_dir, - cusvaer_global_index_bits, cusvaer_p2p_device_bits, cusvaer_data_transfer_buffer_bits, - cusvaer_comm_plugin_type, cusvaer_comm_plugin_soname): - self.benchmark_name = benchmark_name - self.benchmark_object = benchmark_object - self.nqubits = nqubits - self.ngpus = ngpus - self.ncpu_threads = ncpu_threads - self.nwarmups=nwarmups - self.nrepeats=nrepeats - self.config = config - self.general_interface = general_interface - self.benchmark_data = {} - self.cache_dir = cache_dir - # cusvaer options - self.cusvaer_global_index_bits = cusvaer_global_index_bits - self.cusvaer_p2p_device_bits = cusvaer_p2p_device_bits - self.cusvaer_data_transfer_buffer_bits = cusvaer_data_transfer_buffer_bits - self.cusvaer_comm_plugin_type = cusvaer_comm_plugin_type - self.cusvaer_comm_plugin_soname = cusvaer_comm_plugin_soname + def run(self): + if self._nqubits is None: + gpu_prop = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id) + max_n_qubits = math.floor(math.log2(gpu_prop['totalGlobalMem'] / (8 if self.precision == 'single' else 16))) + nqubits_list = list(range(16, max_n_qubits + 4, 4)) + else: + nqubits_list = [self._nqubits] + + for benchmark_name in self._benchmarks.keys(): + b = self._benchmarks[benchmark_name] + benchmark_object = b['benchmark'] + benchmark_config = b['config'] + benchmark_config['precision'] = self.precision # some frontends may need it - # currently we assume the following subdirectories exist - self.required_subdirs = ('circuits', 'data') + for nqubits in nqubits_list: + self.benchmark_name = benchmark_name + self.benchmark_object = benchmark_object + self.benchmark_config = benchmark_config + self.nqubits = nqubits + self._run() def 
_load_or_generate_circuit(self, circuit_filename): # We need a mechanism to ensure any incompatible gate_sequence generated @@ -131,9 +100,17 @@ def _load_or_generate_circuit(self, circuit_filename): gate_seq_ver = 1 circuit_filename += f"_v{gate_seq_ver}.pickle" - frontend = createFrontend(self.general_interface.frontend, self.nqubits, self.config) + frontend = createFrontend(self.frontend, self.nqubits, self.benchmark_config) + + dump_only = bool(os.environ.get('CUQUANTUM_BENCHMARKS_DUMP_GATES', False)) + if dump_only: + # hijack & discard user input + from .frontends.frontend_dumper import Dumper + frontend = Dumper( + self.nqubits, + {**self.benchmark_config, 'circuit_filename': circuit_filename}) try: - if self.general_interface.new_circ: + if self.new_circ: raise ValueError # If this circuit has been generated previously, load it @@ -143,7 +120,7 @@ def _load_or_generate_circuit(self, circuit_filename): logger.debug(f'Circuit loaded from {circuit_filename}') except: # Otherwise, generate the circuit and save it - gate_sequence = self.benchmark_object.generateGatesSequence(self.nqubits, self.config) + gate_sequence = self.benchmark_object.generateGatesSequence(self.nqubits, self.benchmark_config) circuit = frontend.generateCircuit(gate_sequence) def dump(): with open(os.path.join(self.cache_dir, circuit_filename), 'wb') as f: @@ -151,6 +128,10 @@ def dump(): logger.debug(f'Circuit generated and saved to {circuit_filename}') call_by_root(dump) + if dump_only: + logger.info("early exiting as the dumper task is completed") + raise EarlyReturnError + return circuit def get_circuit(self, circuit_filename): @@ -176,6 +157,7 @@ def timer(self, backend, circuit, nshots): backend.pre_run(circuit, nshots=nshots) backend.run(circuit, nshots) + annotation_string = f"p{get_mpi_rank()}_run_" # actual timing for i in range(self.nrepeats): backend.pre_run(circuit, nshots=nshots) @@ -184,7 +166,8 @@ def timer(self, backend, circuit, nshots): start_gpu.record() pe1 = 
time.perf_counter() - run_dict = backend.run(circuit, nshots) + with nvtx.annotate(annotation_string + str(i)): + run_dict = backend.run(circuit, nshots) pe2 = time.perf_counter() if self.ngpus > 0: @@ -212,7 +195,7 @@ def timer(self, backend, circuit, nshots): def _fix_filename_for_cutn(self, circuit_filename, nqubits): target = pauli = None - if self.general_interface.backend == 'cutn': + if self.backend == 'cutn': target = os.environ.get('CUTENSORNET_BENCHMARK_TARGET', 'amplitude') circuit_filename += f'_{target}' if target == 'expectation': @@ -221,34 +204,36 @@ def _fix_filename_for_cutn(self, circuit_filename, nqubits): return circuit_filename, target, pauli def extract_backend_version(self): - if 'aer' in self.general_interface.backend: + if 'aer' in self.backend: import qiskit version = qiskit.__qiskit_version__['qiskit-aer'] - elif 'qsim' in self.general_interface.backend: + elif 'qsim' in self.backend: import qsimcirq version = qsimcirq.__version__ - elif self.general_interface.backend == 'cutn': + elif self.backend == 'cutn': import cuquantum version = cuquantum.cutensornet.get_version() - elif self.general_interface.backend == 'cirq': + elif self.backend == 'cirq': import cirq version = cirq.__version__ - elif self.general_interface.backend == 'naive': + elif self.backend == 'naive': from .backends import backends version = backends['naive'].version - elif self.general_interface.backend == 'pennylane': + elif self.backend == 'pennylane': import pennylane version = pennylane.__version__ - elif self.general_interface.backend == 'pennylane-lightning-gpu': + elif self.backend == 'pennylane-lightning-gpu': import pennylane_lightning_gpu version = pennylane_lightning_gpu.__version__ - elif self.general_interface.backend == 'pennylane-lightning-qubit': + elif self.backend == 'pennylane-lightning-qubit': import pennylane_lightning version = pennylane_lightning.__version__ - elif self.general_interface.backend == 'pennylane-lightning-kokkos': + elif self.backend 
== 'pennylane-lightning-kokkos': import pennylane_lightning_kokkos version = pennylane_lightning_kokkos.__version__ - elif self.general_interface.backend in ('qulacs-gpu', 'qulacs-cpu'): + elif self.backend == 'pennylane-dumper': + version = '0' # dummy + elif self.backend in ('qulacs-gpu', 'qulacs-cpu'): import qulacs version = qulacs.__version__ else: @@ -256,19 +241,19 @@ def extract_backend_version(self): return version def extract_frontend_version(self): - if self.general_interface.frontend == 'qiskit': + if self.frontend == 'qiskit': import qiskit version = qiskit.__qiskit_version__['qiskit-terra'] - elif self.general_interface.frontend == 'cirq': + elif self.frontend == 'cirq': import cirq version = cirq.__version__ - elif self.general_interface.frontend == 'naive': + elif self.frontend == 'naive': from .frontends import frontends version = frontends['naive'].version - elif self.general_interface.frontend == 'pennylane': + elif self.frontend == 'pennylane': import pennylane version = pennylane.__version__ - elif self.general_interface.frontend == 'qulacs': + elif self.frontend == 'qulacs': import qulacs version = qulacs.__version__ else: @@ -276,63 +261,56 @@ def extract_frontend_version(self): return version def extract_glue_layer_version(self): - if self.general_interface.backend == 'cutn': + if self.backend == 'cutn': import cuquantum glue_ver = f'cuquantum {cuquantum.__version__}' else: return None return glue_ver - def run(self): - measure = self.config['measure'] + def _run(self): + reseed(1234) # TODO: use a global seed? 
+ measure = self.benchmark_config['measure'] # try to load existing perf data, if any data_filename = f'{self.benchmark_name}.json' filepath = f'{self.cache_dir}/data/{data_filename}' - self.general_interface.full_data = load_benchmark_data( - filepath, self.cache_dir, self.required_subdirs) + self.full_data = load_benchmark_data(filepath) gpu_device_properties = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id) gpu_name = gpu_device_properties['name'].decode('utf-8').split(' ')[-1] num_qubits = str(self.nqubits) num_gpus = str(self.ngpus) - # FIXME: this is buggy (no early return) - # try: - # if (self.general_interface.append - # and num_gpus in self.general_interface.full_data[num_qubits][self.general_interface.frontend+'-v'+frontend_version][self.general_interface.backend+'-v'+backend_version][gpu_name]): - # self.general_interface.logger.info( - # f'Skipping {self.benchmark_name} with {self.nqubits} qubits and {self.ngpus} GPUs [{self.general_interface.backend}-v{backend_version}]') - # except KeyError: - # # KeyError means this configuration is not currently benchmarked, so we can continue running - # self.general_interface.logger.debug('Benchmark configuration not found in existing data') - # pass - circuit_filename = f'circuits/{self.benchmark_name}_{self.nqubits}' - - if 'unfold' in self.config.keys() and self.config['unfold']: + if 'unfold' in self.benchmark_config.keys() and self.benchmark_config['unfold']: circuit_filename += '_unfold' - if 'p' in self.config.keys(): - p = self.config['p'] + if 'p' in self.benchmark_config.keys(): + p = self.benchmark_config['p'] circuit_filename += f'_p{p}' if measure: circuit_filename += '_measure' circuit_filename, target, pauli = self._fix_filename_for_cutn(circuit_filename, self.nqubits) - self.general_interface.cutn_target = target + self.cutn_target = target # get circuit circuit = self.get_circuit(circuit_filename) # get backend + # TODO: use backend config to simplify this... 
backend = createBackend( - self.general_interface.backend, self.ngpus, self.ncpu_threads, self.general_interface.precision, - nqubits=self.nqubits, # TODO: backend config - cusvaer_global_index_bits=self.cusvaer_global_index_bits, # cusvaer options + self.backend, self.ngpus, self.ncputhreads, self.precision, + nqubits=self.nqubits, + # cusvaer options + cusvaer_global_index_bits=self.cusvaer_global_index_bits, cusvaer_p2p_device_bits=self.cusvaer_p2p_device_bits, cusvaer_data_transfer_buffer_bits=self.cusvaer_data_transfer_buffer_bits, cusvaer_comm_plugin_type=self.cusvaer_comm_plugin_type, cusvaer_comm_plugin_soname=self.cusvaer_comm_plugin_soname, - nfused=self.general_interface.nfused, # only qiskit and qsim + # qiskit and qsim + nfused=self.nfused, + # cutn + nhypersamples=self.nhypersamples, ) # get versions; it's assumed up to this point, the existence of Python modules for @@ -340,27 +318,33 @@ def run(self): backend_version = self.extract_backend_version() frontend_version = self.extract_frontend_version() glue_layer_version = self.extract_glue_layer_version() + if glue_layer_version is not None: + ver_str = f'[{self.frontend}-v{frontend_version} | (glue ver: {glue_layer_version}) | {self.backend}-v{backend_version}]' + else: + ver_str = f'[{self.frontend}-v{frontend_version} | {self.backend}-v{backend_version}]' if self.ngpus == 0: logger.info( - f'* Running {self.benchmark_name} with {self.ncpu_threads} CPU threads, and {self.nqubits} qubits [{self.general_interface.backend}-v{backend_version}]:') + f'* Running {self.benchmark_name} with {self.ncputhreads} CPU threads, and {self.nqubits} qubits {ver_str}:' + ) else: logger.info( - f'* Running {self.benchmark_name} with {self.ngpus} GPUs, and {self.nqubits} qubits [{self.general_interface.backend}-v{backend_version}]:') + f'* Running {self.benchmark_name} with {self.ngpus} GPUs, and {self.nqubits} qubits {ver_str}:' + ) preprocess_data = backend.preprocess_circuit( circuit, # only cutn needs these, TODO: 
backend config circuit_filename=os.path.join(self.cache_dir, circuit_filename), target=target, - pauli=pauli + pauli=pauli, ) for k in preprocess_data.keys(): self.benchmark_data[k] = preprocess_data[k] # run benchmark - perf_time, cuda_time, post_time, post_process = self.timer(backend, circuit, self.general_interface.nshots) # nsamples -> nshots + perf_time, cuda_time, post_time, post_process = self.timer(backend, circuit, self.nshots) # nsamples -> nshots # report the result run_env = gen_run_env(gpu_device_properties) @@ -371,7 +355,7 @@ def run(self): out = self.canonicalize_benchmark_data(frontend_version, backend_version, run_env, glue_layer_version) save_benchmark_data( *out, - self.general_interface.full_data, filepath, self.general_interface.save) + self.full_data, filepath, self.save) def canonicalize_benchmark_data(self, frontend_version, backend_version, run_env, glue_layer_version): """ @@ -415,17 +399,17 @@ def canonicalize_benchmark_data(self, frontend_version, backend_version, run_env sim_config = HashableDict({ 'frontend': HashableDict({ - "name": self.general_interface.frontend, + "name": self.frontend, "version": frontend_version, }), 'backend': HashableDict({ - "name": self.general_interface.backend, + "name": self.backend, "version": backend_version, "ngpus": self.ngpus, - "ncputhreads": self.ncpu_threads, - "nshots": self.general_interface.nshots, - "nfused": self.general_interface.nfused, - "precision": self.general_interface.precision, + "ncputhreads": self.ncputhreads, + "nshots": self.nshots, + "nfused": self.nfused, + "precision": self.precision, "with_mpi": is_running_mpiexec(), }), 'glue_layer': HashableDict({ @@ -439,11 +423,11 @@ def canonicalize_benchmark_data(self, frontend_version, backend_version, run_env # TODO: record "measure"? 
# backend-specific options - if self.general_interface.backend == "cusvaer": + if self.backend == "cusvaer": sim_config["backend"]["cusvaer_global_index_bits"] = self.cusvaer_global_index_bits sim_config["backend"]["cusvaer_p2p_device_bits"] = self.cusvaer_p2p_device_bits - elif self.general_interface.backend == "cutn": - sim_config["backend"]["target"] = self.general_interface.cutn_target + elif self.backend == "cutn": + sim_config["backend"]["target"] = self.cutn_target sim_config_hash = sim_config.get_hash() self.benchmark_data = {**self.benchmark_data, **sim_config} @@ -453,35 +437,36 @@ def canonicalize_benchmark_data(self, frontend_version, backend_version, run_env class BenchApiRunner: - supported_cusv_apis = ('apply_matrix',) - supported_cutn_apis = () + supported_cusv_apis = ('apply_matrix', 'apply_generalized_permutation_matrix', 'cusv_sampler', ) + supported_cutn_apis = ('tensor_decompose',) supported_apis = supported_cusv_apis + supported_cutn_apis + # currently we assume the following subdirectories exist + required_subdirs = ('data',) + def __init__(self, **kwargs): - self.num_qubits = kwargs.pop("nqubits") self.benchmark = kwargs.pop("benchmark") self.cache_dir = kwargs.pop("cachedir") - kwargs.pop("verbose") # don't care self.args = kwargs # just hold the entire group of parsed cmdline args, don't unpack all - # currently we assume the following subdirectories exist - self.required_subdirs = ('data',) + # it could be that the cache dirs are not created yet + call_by_root(functools.partial(create_cache, self.cache_dir, self.required_subdirs)) # load existing json, if any self.data_filename = f"{self.benchmark}.json" self.file_path = f'{self.cache_dir}/data/{self.data_filename}' - self.full_data = load_benchmark_data( - self.file_path, self.cache_dir, self.required_subdirs) + self.full_data = load_benchmark_data(self.file_path) def run(self): # prep if self.benchmark not in self.supported_apis: raise NotImplementedError(f"only {self.supported_apis} is 
supported for now") gpu_device_properties = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id) - benchmark_data = {} # dummy + benchmark_data = {} # time the api - perf_time, cuda_time = self._run_apply_matrix() + bench_func = getattr(self, f"_run_{self.benchmark}") + perf_time, cuda_time = bench_func(benchmark_data) # update benchmark_data in-place # report the result run_env = gen_run_env(gpu_device_properties) @@ -492,10 +477,11 @@ def run(self): out = self.canonicalize_benchmark_data(run_env, benchmark_data) save_benchmark_data(*out, self.full_data, self.file_path) - def _run_apply_matrix(self): + def _run_apply_matrix(self, benchmark_data): # TODO: It's better to move this method elsewhere, once we support more apis from .benchmarks.apply_matrix import test_apply_matrix args = self.args + self.num_qubits = args.pop("nqubits") # create targets while keeping args clean for later use ntargets = args.pop("ntargets") @@ -527,6 +513,108 @@ def _run_apply_matrix(self): args["nrepeats"], args["location"], flush_l2=args["flush_cache"], + benchmark_data=benchmark_data, + ) + + def _run_apply_generalized_permutation_matrix(self, benchmark_data): + # TODO: It's better to move this method elsewhere, once we support more apis + from .benchmarks.apply_gen_perm_matrix import test_apply_generalized_permutation_matrix + args = self.args + self.num_qubits = args.pop("nqubits") + + # create targets while keeping args clean for later use + ntargets = args.pop("ntargets") + targets = args.pop("targets") + targets = tuple(range(ntargets)) if targets is None else tuple(targets) + args["targets"] = targets + + # create controls while keeping args clean for later use + ncontrols = args.pop("ncontrols") + controls = args.pop("controls") + if controls is None and ncontrols is None: + controls = () + elif controls is None: + controls = tuple(range(ncontrols)) + else: + controls = tuple(controls) + args["controls"] = controls + + # create perm_table while keeping args clean for later 
use + has_perm = args.pop("has_perm") + perm_table = args.pop("perm_table") + if has_perm is False and perm_table is None: + perm_table = [] + elif perm_table is None: + # used as a flag to fill perm_table randomly later + perm_table = bool(has_perm) + else: + perm_table = list(perm_table) + args["perm_table"] = perm_table + + # run + return test_apply_generalized_permutation_matrix( + self.num_qubits, + args["precision"], + targets, + controls, + int(args["adjoint"]), + args["has_diag"], + args["precision_diag"], + args["location_diag"], + args["perm_table"], + args["location_perm"], + args["nwarmups"], + args["nrepeats"], + benchmark_data=benchmark_data, + ) + + def _run_cusv_sampler(self, benchmark_data): + from .benchmarks.cusv_sampler import test_cusv_sampler + args = self.args + self.num_qubits = args.pop("nqubits") + + # create bit_ordering while keeping args clean for later use + nbit_ordering = args.pop("nbit_ordering") + bit_ordering = args.pop("bit_ordering") + bit_ordering = tuple(range(nbit_ordering)) if bit_ordering is None else tuple(bit_ordering) + args["bit_ordering"] = bit_ordering + + # run + return test_cusv_sampler( + self.num_qubits, + args["precision"], + bit_ordering, + args["nshots"], + args["output_order"], + args["nwarmups"], + args["nrepeats"], + benchmark_data=benchmark_data, + ) + + def _run_tensor_decompose(self, benchmark_data): + from .benchmarks.tensor_decompose import benchmark_tensor_decompose + args = self.args + self.num_qubits = 0 # WAR + + # ensure the combination of method/algorithm is meaningful + if args["method"] == "SVD": + args["algorithm"] = "gesvd" + elif args["algorithm"] is not None: + # algorithm is set, must be doing SVD + args["method"] = "SVD" + + # run + return benchmark_tensor_decompose( + args["expr"], + tuple(args["shape"]), + args["precision"], + args["is_complex"], + args["method"], + args["algorithm"], + args["nwarmups"], + args["nrepeats"], + args["check_reference"], + benchmark_data=benchmark_data, ) 
def canonicalize_benchmark_data(self, run_env, benchmark_data): diff --git a/benchmarks/setup.py b/benchmarks/setup.py index e6b09e5..72fc6d1 100644 --- a/benchmarks/setup.py +++ b/benchmarks/setup.py @@ -32,15 +32,16 @@ "psutil", "scipy", "networkx", + "nvtx", ] if importlib.util.find_spec('cuquantum') is None: - install_requires.append("cuquantum-python>=22.7") + install_requires.append("cuquantum-python>=23.3") setup( name="cuquantum-benchmarks", version=version, - description="NVIDIA cuQuantum Circuit Performance Benchmark Suite", + description="NVIDIA cuQuantum Performance Benchmark Suite", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/NVIDIA/cuQuantum", diff --git a/benchmarks/tests/cuquantum_benchmarks_tests/test_run.py b/benchmarks/tests/cuquantum_benchmarks_tests/test_run.py index 9bd2743..7857466 100644 --- a/benchmarks/tests/cuquantum_benchmarks_tests/test_run.py +++ b/benchmarks/tests/cuquantum_benchmarks_tests/test_run.py @@ -218,6 +218,8 @@ def test_benchmark(self, combo, nqubits, benchmark, precision, tmp_path, visible '--verbose'] if backend == 'cusvaer': cmd += ['--cusvaer-global-index-bits', '--cusvaer-p2p-device-bits'] + if backend == 'cutn': + cmd += ['--nhypersamples', '2'] for cmd_prefix in tests: result = subprocess.run(cmd_prefix+cmd, env=env, capture_output=True) @@ -239,38 +241,36 @@ def test_benchmark(self, combo, nqubits, benchmark, precision, tmp_path, visible # TODO: test invalid cases and ensure we raise errors -@pytest.mark.parametrize( - "args", ( - ["--nqubits", "4", "--ntargets", "2"], - ["--nqubits", "4", "--targets", "2,3"], - ["--nqubits", "6", "--ntargets", "3", "--controls", "3"], - ["--nqubits", "4", "--targets", "2,3", "--ncontrols", "1"], - ["--nqubits", "4", "--targets", "2,3", "--controls", "1"], +class TestCmdApi: + + @pytest.mark.parametrize( + "args", ( + ["--nqubits", "4", "--ntargets", "2"], + ["--nqubits", "4", "--targets", "2,3"], + ["--nqubits", "6", 
"--ntargets", "3", "--controls", "3"], + ["--nqubits", "4", "--targets", "2,3", "--ncontrols", "1"], + ["--nqubits", "4", "--targets", "2,3", "--controls", "1"], + ) ) -) -@pytest.mark.parametrize( - "matrix_prop", ( - [], # default - ["--layout", "column", "--adjoint"], + @pytest.mark.parametrize( + "matrix_prop", ( + [], # default + ["--layout", "column", "--adjoint"], + ) + ) + @pytest.mark.parametrize( + "precision", ("single", "double") + ) + @pytest.mark.parametrize( + "flush", (True, False) ) -) -@pytest.mark.parametrize( - "precision", ("single", "double") -) -@pytest.mark.parametrize( - "flush", (True, False) -) -class TestCmdApiApplyMatrix: - - benchmark = "apply_matrix" - def test_apply_matrix(self, args, matrix_prop, precision, flush, tmp_path, visible_device): - + benchmark = 'apply_matrix' env = os.environ.copy() env["CUDA_VISIBLE_DEVICES"] = str(visible_device) cmd = [sys.executable, '-m', 'cuquantum_benchmarks', 'api', - '--benchmark', self.benchmark, + '--benchmark', benchmark, '--precision', precision, '--cachedir', str(tmp_path), # speed up the tests... @@ -286,7 +286,158 @@ def test_apply_matrix(self, args, matrix_prop, precision, flush, tmp_path, visib try: assert bool(result.check_returncode()) == False - cached_json = [f for f in glob.glob(str(tmp_path / f"data/{self.benchmark}.json")) if os.path.isfile(f)] + cached_json = [f for f in glob.glob(str(tmp_path / f"data/{benchmark}.json")) if os.path.isfile(f)] + assert len(cached_json) == 1 # TODO: test aggregate behavior too? 
+ except: + # make debugging easier + print("stdout:\n", result.stdout.decode()) + print("stderr:\n", result.stderr.decode()) + raise + finally: + print("cmd:\n", ' '.join(cmd)) + + @pytest.mark.parametrize( + "args", ( + ("--nqubits", "4", "--ntargets", "2",), + ("--nqubits", "4", "--targets", "2,3",), + ("--nqubits", "6", "--ntargets", "2", "--controls", "3",), + ("--nqubits", "4", "--targets", "1,2", "--ncontrols", "1",), + ("--nqubits", "4", "--targets", "2,3", "--controls", "1",), + ) + ) + @pytest.mark.parametrize( + "diag", ( + (), + ("--has-diag", "--location-diag", "device",), + ("--has-diag", "--precision-diag", "double", "--precision", "double",), + ) + ) + @pytest.mark.parametrize( + "perm", ( + ("--has-perm",), + ("--has-perm", "--location-perm", "device",), + ("--perm-table", "2,3,0,1",), # this test assumes ntargets=2 always + ) + ) + @pytest.mark.parametrize( + "matrix_prop", ( + (), # default + ("--adjoint",), + ) + ) + def test_apply_generalized_permutation_matrix( + self, args, diag, perm, matrix_prop, tmp_path, visible_device): + benchmark = 'apply_generalized_permutation_matrix' + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = str(visible_device) + + cmd = [sys.executable, '-m', 'cuquantum_benchmarks', 'api', + '--benchmark', benchmark, + '--cachedir', str(tmp_path), + # speed up the tests... + '--nwarmups', '1', + '--nrepeats', '1', + '--verbose'] + cmd += args + cmd += diag + cmd += perm + cmd += matrix_prop + result = subprocess.run(cmd, env=env, capture_output=True) + + try: + assert bool(result.check_returncode()) == False + cached_json = [f for f in glob.glob(str(tmp_path / f"data/{benchmark}.json")) if os.path.isfile(f)] + assert len(cached_json) == 1 # TODO: test aggregate behavior too? 
+ except: + # make debugging easier + print("stdout:\n", result.stdout.decode()) + print("stderr:\n", result.stderr.decode()) + raise + finally: + print("cmd:\n", ' '.join(cmd)) + + @pytest.mark.parametrize( + "args", ( + ("--nqubits", "4", "--nbit-ordering", "2", "--nshots", "256"), + ("--nqubits", "4", "--bit-ordering", "2,3", "--output-order", "random"), + ) + ) + @pytest.mark.parametrize( + "precision", ("single", "double") + ) + def test_cusv_sampler(self, args, precision, tmp_path, visible_device): + benchmark = 'cusv_sampler' + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = str(visible_device) + + cmd = [sys.executable, '-m', 'cuquantum_benchmarks', 'api', + '--benchmark', benchmark, + '--precision', precision, + '--cachedir', str(tmp_path), + # speed up the tests... + '--nwarmups', '1', + '--nrepeats', '1', + '--verbose'] + cmd += args + result = subprocess.run(cmd, env=env, capture_output=True) + + try: + assert bool(result.check_returncode()) == False + cached_json = [f for f in glob.glob(str(tmp_path / f"data/{benchmark}.json")) if os.path.isfile(f)] + assert len(cached_json) == 1 # TODO: test aggregate behavior too? 
+ except: + # make debugging easier + print("stdout:\n", result.stdout.decode()) + print("stderr:\n", result.stderr.decode()) + raise + finally: + print("cmd:\n", ' '.join(cmd)) + + @pytest.mark.parametrize( + "args", ( + ["--expr", "abc->abx,xc", "--shape", "4,8,4"], + ["--expr", "abcd->ax,bcdx", "--shape", "4,8,4,2"], + ) + ) + @pytest.mark.parametrize( + "method", ( + ("--method", "QR",), + ("--method", "SVD",), + ("--algorithm", "gesvd"), + ("--algorithm", "gesvdj"), + ("--algorithm", "gesvdr"), + ("--algorithm", "gesvdp"), + ) + ) + @pytest.mark.parametrize( + "precision", ("single", "double") + ) + @pytest.mark.parametrize( + "is_complex", (True, False) + ) + def test_tensor_decompose(self, args, method, precision, is_complex, tmp_path, visible_device): + benchmark = 'tensor_decompose' + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = str(visible_device) + + cmd = [sys.executable, '-m', 'cuquantum_benchmarks', 'api', + '--benchmark', benchmark, + '--precision', precision, + '--cachedir', str(tmp_path), + # speed up the tests... + '--nwarmups', '1', + '--nrepeats', '1', + '--verbose'] + cmd += args + cmd += method + if is_complex: + cmd.append('--is-complex') + + result = subprocess.run(cmd, env=env, capture_output=True) + + try: + assert bool(result.check_returncode()) == False + cached_json = [f for f in glob.glob(str(tmp_path / f"data/{benchmark}.json")) if os.path.isfile(f)] assert len(cached_json) == 1 # TODO: test aggregate behavior too? 
except: # make debugging easier diff --git a/python/samples/cutensornet/coarse/example22_mpi_auto.py b/python/samples/cutensornet/coarse/example22_mpi_auto.py index 6c6e906..9457f66 100644 --- a/python/samples/cutensornet/coarse/example22_mpi_auto.py +++ b/python/samples/cutensornet/coarse/example22_mpi_auto.py @@ -64,15 +64,6 @@ # Compute the contraction (with distributed path finding & contraction execution) result = cuquantum.contract(expr, *operands, options={'device_id' : device_id, 'handle': handle}) -# Create a new GPU buffer for verification -result_cp = cp.empty_like(result) - -# Sum the partial contribution from each process on root, with GPU -if rank == root: - comm.Reduce(sendbuf=MPI.IN_PLACE, recvbuf=result_cp, op=MPI.SUM, root=root) -else: - comm.Reduce(sendbuf=result_cp, recvbuf=None, op=MPI.SUM, root=root) - # Check correctness. if rank == root: result_cp = cp.einsum(expr, *operands, optimize=True) From 92a18e9d012b405f9599e0534ab46a587cd8b4b6 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 18 Jul 2023 19:19:07 -0700 Subject: [PATCH 2/2] nit --- benchmarks/cuquantum_benchmarks/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/cuquantum_benchmarks/_utils.py b/benchmarks/cuquantum_benchmarks/_utils.py index 299e198..76ecbde 100644 --- a/benchmarks/cuquantum_benchmarks/_utils.py +++ b/benchmarks/cuquantum_benchmarks/_utils.py @@ -456,7 +456,7 @@ class L2flush: https://github.com/NVIDIA/nvbench/blob/main/nvbench/detail/l2flush.cuh. """ def __init__(self): - self.l2_size = 3 * cp.cuda.Device().attributes['L2CacheSize'] + self.l2_size = cp.cuda.Device().attributes['L2CacheSize'] self.mem = cp.cuda.alloc(self.l2_size) if self.l2_size > 0 else None def flush(self):