From 0d18f7c6e7820b78e338470cc2ed2045644f127e Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 12 Jul 2023 14:21:34 -0700 Subject: [PATCH 1/3] sync with internal repo1 (commit b0bf87218) --- python/README.md | 31 +- python/builder/pep517.py | 4 +- python/builder/utils.py | 47 +- python/cuquantum/__init__.py | 8 + python/cuquantum/_version.py | 2 +- python/cuquantum/custatevec/custatevec.pxd | 22 +- python/cuquantum/custatevec/custatevec.pyx | 491 ++++++++++- .../_internal/circuit_converter_utils.py | 36 +- .../_internal/circuit_parser_utils_qiskit.py | 23 +- .../_internal/decomposition_utils.py | 101 ++- .../cutensornet/_internal/einsum_parser.py | 11 +- python/cuquantum/cutensornet/cutensornet.pxd | 82 +- python/cuquantum/cutensornet/cutensornet.pyx | 832 +++++++++++++++++- .../experimental/tensor_network.py | 14 +- python/cuquantum/cutensornet/tensor.py | 95 +- .../cuquantum/cutensornet/tensor_network.py | 8 +- python/cuquantum/utils.pyx | 38 + python/samples/custatevec/batched_abs2sum.py | 52 ++ python/samples/custatevec/batched_collapse.py | 68 ++ .../custatevec/batched_gate_application.py | 78 ++ python/samples/custatevec/batched_measure.py | 59 ++ python/samples/custatevec/initialize_sv.py | 37 + .../approxTN/tensor_svd_example.py | 22 +- .../samples/cutensornet/coarse/example12.py | 1 + .../high_level/marginal_example.py | 123 +++ .../high_level/sampling_example.py | 119 +++ .../tensor/example11-svd_algorithms.py | 27 + .../tensornet_example_gradients.py | 342 +++++++ .../tn_algorithms/mps_algorithms.ipynb | 413 +++++++-- python/setup.py | 18 +- python/tests/cuquantum_tests/__init__.py | 13 + .../custatevec_tests/test_custatevec.py | 707 +++++++++++---- .../cutensornet_tests/__init__.py | 2 + .../cutensornet_tests/approxTN_utils.py | 46 +- .../cutensornet_tests/circuit_utils.py | 405 ++++++++- .../cuquantum_tests/cutensornet_tests/data.py | 19 +- .../cutensornet_tests/test_cutensornet.py | 455 ++++++---- .../cutensornet_tests/test_experimental.py | 28 +- 
.../cutensornet_tests/test_internal.py | 23 + .../cutensornet_tests/test_options.py | 2 +- .../cutensornet_tests/test_tensor.py | 71 +- .../cutensornet_tests/test_utils.py | 38 +- samples/custatevec/CMakeLists.txt | 39 +- samples/custatevec/Makefile | 47 +- samples/custatevec/batched_abs2sum.cu | 82 ++ samples/custatevec/batched_collapse.cu | 103 +++ .../custatevec/batched_gate_application.cu | 109 +++ samples/custatevec/batched_measure.cu | 92 ++ samples/custatevec/initialize_sv.cu | 68 ++ samples/cutensornet/CMakeLists.txt | 18 +- samples/cutensornet/Makefile | 33 +- samples/cutensornet/README.md | 17 + .../approxTN/tensor_svd_example.cu | 26 +- .../high_level/marginal_example.cu | 199 +++++ .../high_level/sampling_example.cu | 184 ++++ samples/cutensornet/tensornet_example.cu | 2 +- .../tensornet_example_gradients.cu | 540 ++++++++++++ 57 files changed, 5827 insertions(+), 745 deletions(-) create mode 100644 python/samples/custatevec/batched_abs2sum.py create mode 100644 python/samples/custatevec/batched_collapse.py create mode 100644 python/samples/custatevec/batched_gate_application.py create mode 100644 python/samples/custatevec/batched_measure.py create mode 100644 python/samples/custatevec/initialize_sv.py create mode 100755 python/samples/cutensornet/high_level/marginal_example.py create mode 100755 python/samples/cutensornet/high_level/sampling_example.py create mode 100644 python/samples/cutensornet/tensor/example11-svd_algorithms.py create mode 100644 python/samples/cutensornet/tensornet_example_gradients.py create mode 100644 samples/custatevec/batched_abs2sum.cu create mode 100644 samples/custatevec/batched_collapse.cu create mode 100644 samples/custatevec/batched_gate_application.cu create mode 100644 samples/custatevec/batched_measure.cu create mode 100644 samples/custatevec/initialize_sv.cu create mode 100644 samples/cutensornet/high_level/marginal_example.cu create mode 100644 samples/cutensornet/high_level/sampling_example.cu create mode 100644 
samples/cutensornet/tensornet_example_gradients.cu diff --git a/python/README.md b/python/README.md index fb5a57e..b40cadf 100644 --- a/python/README.md +++ b/python/README.md @@ -13,9 +13,10 @@ If you already have a Conda environment set up, it is the easiest to install cuQ ``` conda install -c conda-forge cuquantum-python ``` -The Conda solver will install all required dependencies for you. - -**Note**: Currently CUDA 12 support is pending the NVIDIA-led community effort ([conda-forge/staged-recipes#21382](https://github.com/conda-forge/staged-recipes/issues/21382)). Once conda-forge supports CUDA 12 we will make compatible conda packages available. +The Conda solver will install all required dependencies for you. If you need to select a particular CUDA version, say CUDA 12.0, please issue the following command: +``` +conda install -c conda-forge cuquantum-python cuda-version=12.0 +``` ### Install cuQuantum Python from PyPI @@ -26,12 +27,12 @@ you can also install cuQuantum Python this way: pip install cuquantum-python-cuXX ``` with `XX` being `11` (for CUDA 11) or `12` (for CUDA 12). -The `pip` solver will also install all dependencies, with the exception of CuPy, for you (including both cuTENSOR and cuQuantum wheels). Please follow -[CuPy's installation guide](https://docs.cupy.dev/en/stable/install.html). +The `pip` solver will also install all required dependencies for you (including both cuTENSOR and cuQuantum wheels). Notes: -- Users can install cuQuantum Python using `pip install cuquantum-python`, which will attempt to detect the current CUDA environment and choose the appropriate wheel to install. In the event of detection failure, CUDA 11 is assumed. This is subject to change in the future. Installing wheels with the `-cuXX` suffix is encouraged. +- Users can install cuQuantum Python using `pip install --no-cache-dir cuquantum-python`, which will attempt to detect the current CUDA environment and choose the appropriate wheel to install. 
In the event of detection failure, CUDA 11 is assumed. This is subject to change in the future. Installing wheels with the `-cuXX` suffix is encouraged. `--no-cache-dir` is required when using `pip` 23.1+. +- CuPy also uses a similar auto-detection mechanism to determine the correct wheel to install. If in doubt, or if installing `cuquantum-python-cu11`, please follow [CuPy's installation guide](https://docs.cupy.dev/en/stable/install.html) and install it manually. - To manually manage all Python dependencies, append `--no-deps` to `pip install` to bypass the `pip` solver, see below. ### Building and installing cuQuantum Python from source @@ -41,10 +42,10 @@ Notes: The build-time dependencies of the cuQuantum Python package include: * CUDA Toolkit 11.x or 12.x -* cuStateVec 1.3.0+ -* cuTensorNet 2.1.0+ +* cuStateVec 1.4.0+ +* cuTensorNet 2.2.0+ * cuTENSOR 1.6.1+ -* Python 3.8+ +* Python 3.9+ * Cython >=0.29.22,<3 * pip 21.3.1+ * [packaging](https://packaging.pypa.io/en/latest/) @@ -84,12 +85,12 @@ Runtime dependencies of the cuQuantum Python package include: * An NVIDIA GPU with compute capability 7.0+ * Driver: Linux (450.80.02+ for CUDA 11, 525.60.13+ for CUDA 12) * CUDA Toolkit 11.x or 12.x -* cuStateVec 1.3.0+ -* cuTensorNet 2.1.0+ +* cuStateVec 1.4.0+ +* cuTensorNet 2.2.0+ * cuTENSOR 1.6.1+ -* Python 3.8+ -* NumPy v1.19+ -* CuPy v9.5.0+ (see [installation guide](https://docs.cupy.dev/en/stable/install.html)) +* Python 3.9+ +* NumPy v1.21+ +* CuPy v10.0.0+ (see [installation guide](https://docs.cupy.dev/en/stable/install.html)) * PyTorch v1.10+ (optional, see [installation guide](https://pytorch.org/get-started/locally/)) * Qiskit v0.24.0+ (optional, see [installation guide](https://qiskit.org/documentation/getting_started.html)) * Cirq v0.6.0+ (optional, see [installation guide](https://quantumai.google/cirq/install)) @@ -100,7 +101,7 @@ If you install everything from conda-forge, all the required dependencies are ta If you install the pip wheels, CuPy, 
cuTENSOR and cuQuantum (but not CUDA Toolkit or the driver, please make sure the CUDA libraries are visible through your `LD_LIBRARY_PATH`) are installed for you. -If you build cuQuantum Python from source, please make sure the paths to the CUDA, cuQuantum, and cuTENSOR libraries are added +If you build cuQuantum Python from source, please make sure that the paths to the CUDA, cuQuantum, and cuTENSOR libraries are added to your `LD_LIBRARY_PATH` environment variable, and that a compatible CuPy is installed. Known issues: diff --git a/python/builder/pep517.py b/python/builder/pep517.py index 57174b1..66f7c09 100644 --- a/python/builder/pep517.py +++ b/python/builder/pep517.py @@ -30,8 +30,8 @@ def get_requires_for_build_wheel(config_settings=None): # set up version constraints: note that CalVer like 22.03 is normalized to # 22.3 by setuptools, so we must follow the same practice in the constraints; # also, we don't need the patch number here - cuqnt_require = [f'custatevec-cu{utils.cuda_major_ver}~=1.3', # ">=1.3.0,<2" - f'cutensornet-cu{utils.cuda_major_ver}~=2.1', # ">=2.1.0,<3" + cuqnt_require = [f'custatevec-cu{utils.cuda_major_ver}~=1.4', # ">=1.4.0,<2" + f'cutensornet-cu{utils.cuda_major_ver}~=2.2', # ">=2.2.0,<3" ] return _build_meta.get_requires_for_build_wheel(config_settings) + cuqnt_require diff --git a/python/builder/utils.py b/python/builder/utils.py index fac5607..95f5c8a 100644 --- a/python/builder/utils.py +++ b/python/builder/utils.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause import os -import platform import re import site import sys @@ -57,13 +56,10 @@ def check_cuda_version(): # We support CUDA 11/12 starting 23.03 cuda_ver = check_cuda_version() if cuda_ver == '11.0': - cutensor_ver = cuda_ver cuda_major_ver = '11' elif '11.0' < cuda_ver < '12.0': - cutensor_ver = '11' cuda_major_ver = '11' elif '12.0' <= cuda_ver < '13.0': - cutensor_ver = '12' cuda_major_ver = '12' else: raise RuntimeError(f"Unsupported CUDA version: {cuda_ver}") 
@@ -79,20 +75,6 @@ def run(self): building_wheel = True super().run() - def finalize_options(self): - super().finalize_options() - self.root_is_pure = False - - def get_tag(self): - # hack: passing --build-options in cmdline no longer works with PEP 517 backend, - # so we need to overwrite --plat-name here - # refs: - # - https://github.com/pypa/build/issues/480 - # - https://github.com/scikit-build/ninja-python-distributions/pull/85 - impl_tag, abi_tag, _ = super().get_tag() - plat_tag = f"manylinux2014_{platform.machine()}" - return impl_tag, abi_tag, plat_tag - class build_ext(_build_ext): @@ -131,28 +113,14 @@ def _set_library_roots(self): else: cutensornet_root = cuquantum_root - # search order: - # 1. installed "cutensor" package - # 2. env var - for path in py_paths: - path = os.path.join(path, 'cutensor') - if os.path.isdir(os.path.join(path, 'include')): - cutensor_root = path - break - else: - try: - cutensor_root = os.environ['CUTENSOR_ROOT'] - except KeyError as e: - raise RuntimeError('cuTENSOR is not found, please set $CUTENSOR_ROOT') from e - - return custatevec_root, cutensornet_root, cutensor_root + return custatevec_root, cutensornet_root def _prep_includes_libs_rpaths(self): """ Set global vars cusv_incl_dir, cutn_incl_dir, cusv_lib_dir, cutn_lib_dir, cusv_lib, cutn_lib, and extra_linker_flags. 
""" - custatevec_root, cutensornet_root, cutensor_root = self._set_library_roots() + custatevec_root, cutensornet_root = self._set_library_roots() global cusv_incl_dir, cutn_incl_dir cusv_incl_dir = [os.path.join(cuda_path, 'include'), @@ -165,22 +133,20 @@ def _prep_includes_libs_rpaths(self): cusv_lib_dir = [os.path.join(custatevec_root, 'lib'), os.path.join(custatevec_root, 'lib64')] cutn_lib_dir = [os.path.join(cutensornet_root, 'lib'), - os.path.join(cutensornet_root, 'lib64'), - os.path.join(cutensor_root, 'lib'), # wheel - os.path.join(cutensor_root, 'lib', cutensor_ver)] # tarball + os.path.join(cutensornet_root, 'lib64')] global cusv_lib, cutn_lib, extra_linker_flags if not building_wheel: # Note: with PEP-517 the editable mode would not build a wheel for installation # (and we purposely do not support PEP-660). cusv_lib = ['custatevec'] - cutn_lib = ['cutensornet', 'cutensor'] + cutn_lib = ['cutensornet'] extra_linker_flags = [] else: # Note: soname = library major version - # We don't need to link to cuBLAS/cuSOLVER at build time (TODO: perhaps cuTENSOR too...?) 
+ # We don't need to link to cuBLAS/cuSOLVER/cuTensor at build time cusv_lib = [':libcustatevec.so.1'] - cutn_lib = [':libcutensornet.so.2', ':libcutensor.so.1'] + cutn_lib = [':libcutensornet.so.2'] # The rpaths must be adjusted given the following full-wheel installation: # - cuquantum-python: site-packages/cuquantum/{custatevec, cutensornet}/ [=$ORIGIN] # - cusv & cutn: site-packages/cuquantum/lib/ @@ -201,7 +167,6 @@ def _prep_includes_libs_rpaths(self): print("CUDA path:", cuda_path) print("cuStateVec path:", custatevec_root) print("cuTensorNet path:", cutensornet_root) - print("cuTENSOR path:", cutensor_root) print("*"*80+"\n") def build_extension(self, ext): diff --git a/python/cuquantum/__init__.py b/python/cuquantum/__init__.py index fb9ef70..3c9acbc 100644 --- a/python/cuquantum/__init__.py +++ b/python/cuquantum/__init__.py @@ -16,9 +16,14 @@ custatevec.Pauli, custatevec.MatrixLayout, custatevec.MatrixType, + custatevec.MatrixMapType, custatevec.Collapse, custatevec.SamplerOutput, custatevec.DeviceNetworkType, + cutensornet.NetworkAttribute, + custatevec.CommunicatorType, + custatevec.DataTransferType, + custatevec.StateVectorType, cutensornet.ContractionOptimizerInfoAttribute, cutensornet.ContractionOptimizerConfigAttribute, cutensornet.ContractionAutotunePreferenceAttribute, @@ -32,6 +37,9 @@ cutensornet.TensorSVDPartition, cutensornet.TensorSVDInfoAttribute, cutensornet.GateSplitAlgo, + cutensornet.StatePurity, + cutensornet.MarginalAttribute, + cutensornet.SamplerAttribute, ): cutensornet._internal.enum_utils.add_enum_class_doc(enum, chomp="_ATTRIBUTE|_PREFERENCE_ATTRIBUTE") diff --git a/python/cuquantum/_version.py b/python/cuquantum/_version.py index 872e436..19b2289 100644 --- a/python/cuquantum/_version.py +++ b/python/cuquantum/_version.py @@ -5,4 +5,4 @@ # Note: cuQuantum Python follows the cuQuantum SDK version, which is now # switched to YY.MM and is different from individual libraries' (semantic) # versioning scheme. 
-__version__ = '23.03.0' +__version__ = '23.06.0' diff --git a/python/cuquantum/custatevec/custatevec.pxd b/python/cuquantum/custatevec/custatevec.pxd index 5975a88..c654a5a 100644 --- a/python/cuquantum/custatevec/custatevec.pxd +++ b/python/cuquantum/custatevec/custatevec.pxd @@ -14,6 +14,14 @@ from cuquantum.utils cimport (DataType, DeviceAllocType, DeviceFreeType, int2, cdef extern from '' nogil: + # cuStateVec consts + const int CUSTATEVEC_VER_MAJOR + const int CUSTATEVEC_VER_MINOR + const int CUSTATEVEC_VER_PATCH + const int CUSTATEVEC_VERSION + const int CUSTATEVEC_ALLOCATOR_NAME_LEN + const int CUSTATEVEC_MAX_SEGMENT_MASK_SIZE + # cuStateVec types ctypedef void* _Handle 'custatevecHandle_t' ctypedef int64_t _Index 'custatevecIndex_t' @@ -24,10 +32,7 @@ cdef extern from '' nogil: void* ctx DeviceAllocType device_alloc DeviceFreeType device_free - - # Cython limitation: cannot use C defines in declaring a static array, - # so we just have to hard-code CUSTATEVEC_ALLOCATOR_NAME_LEN here... 
- char name[64] + char name[CUSTATEVEC_ALLOCATOR_NAME_LEN] ctypedef void(*LoggerCallbackData 'custatevecLoggerCallbackData_t')( int32_t logLevel, const char* functionName, @@ -69,6 +74,10 @@ cdef extern from '' nogil: CUSTATEVEC_MATRIX_TYPE_UNITARY CUSTATEVEC_MATRIX_TYPE_HERMITIAN + ctypedef enum _MatrixMapType 'custatevecMatrixMapType_t': + CUSTATEVEC_MATRIX_MAP_TYPE_BROADCAST + CUSTATEVEC_MATRIX_MAP_TYPE_MATRIX_INDEXED + ctypedef enum _CollapseOp 'custatevecCollapseOp_t': CUSTATEVEC_COLLAPSE_NONE CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO @@ -92,6 +101,11 @@ cdef extern from '' nogil: CUSTATEVEC_DATA_TRANSFER_TYPE_RECV CUSTATEVEC_DATA_TRANSFER_TYPE_SEND_RECV + ctypedef enum _StateVectorType 'custatevecStateVectorType_t': + CUSTATEVEC_STATE_VECTOR_TYPE_ZERO + CUSTATEVEC_STATE_VECTOR_TYPE_UNIFORM + CUSTATEVEC_STATE_VECTOR_TYPE_GHZ + CUSTATEVEC_STATE_VECTOR_TYPE_W # cuStateVec consts int CUSTATEVEC_VER_MAJOR diff --git a/python/cuquantum/custatevec/custatevec.pyx b/python/cuquantum/custatevec/custatevec.pyx index 502776a..86fca61 100644 --- a/python/cuquantum/custatevec/custatevec.pyx +++ b/python/cuquantum/custatevec/custatevec.pyx @@ -46,18 +46,32 @@ cdef extern from * nogil: int custatevecAbs2SumArray( _Handle, const void*, DataType, const uint32_t, double*, const int32_t*, const uint32_t, const int32_t*, const int32_t*, const uint32_t) + int custatevecAbs2SumArrayBatched( + _Handle, const void*, DataType, const uint32_t, const uint32_t, + const _Index, double*, const _Index, const int32_t*, + const uint32_t, const _Index*, const int32_t*, const uint32_t) int custatevecCollapseOnZBasis( _Handle, void*, DataType, const uint32_t, const int32_t, const int32_t*, const uint32_t, double) int custatevecCollapseByBitString( _Handle, void*, DataType, const uint32_t, const int32_t*, const int32_t*, const uint32_t, double) + int custatevecCollapseByBitStringBatchedGetWorkspaceSize( + _Handle, const uint32_t, const _Index*, const double*, size_t*) + int 
custatevecCollapseByBitStringBatched( + _Handle, void*, DataType, const uint32_t, const uint32_t, + const _Index, const _Index*, const int32_t*, const uint32_t, + const double*, void*, size_t) int custatevecMeasureOnZBasis( _Handle, void*, DataType, const uint32_t, int32_t*, const int32_t*, const uint32_t, const double, _CollapseOp) int custatevecBatchMeasure( _Handle, void*, DataType, const uint32_t, int32_t*, const int32_t*, const uint32_t, const double, _CollapseOp) + int custatevecMeasureBatched( + _Handle, void*, DataType, const uint32_t, const uint32_t, const _Index, + _Index*, const int32_t*, const uint32_t, + const double*, _CollapseOp) int custatevecBatchMeasureWithOffset( _Handle, void*, DataType, const uint32_t, int32_t*, const int32_t*, const uint32_t, const double, _CollapseOp, const double, const double) @@ -74,6 +88,17 @@ cdef extern from * nogil: DataType, _MatrixLayout, const int32_t, const int32_t*, const uint32_t, const int32_t*, const int32_t*, const uint32_t, _ComputeType, void*, size_t) + int custatevecApplyMatrixBatchedGetWorkspaceSize( + _Handle, DataType, const uint32_t, const uint32_t, const _Index, + _MatrixMapType, const int32_t*, const void*, DataType, + _MatrixLayout, const int32_t, const uint32_t, const uint32_t, + const uint32_t, _ComputeType, size_t*) + int custatevecApplyMatrixBatched( + _Handle, void*, DataType, const uint32_t, const uint32_t, _Index, + _MatrixMapType, const int32_t*, const void*, DataType, + _MatrixLayout, const int32_t, const uint32_t, const int32_t*, + const uint32_t, const int32_t*, const int32_t*, const uint32_t, + _ComputeType, void*, size_t) int custatevecComputeExpectationGetWorkspaceSize( _Handle, DataType, const uint32_t, const void*, DataType, _MatrixLayout, const uint32_t, _ComputeType, size_t*) @@ -132,6 +157,8 @@ cdef extern from * nogil: int custatevecTestMatrixType( _Handle, double*, _MatrixType, const void*, DataType, _MatrixLayout, const uint32_t, const int32_t, _ComputeType, void*, size_t) + int 
custatevecInitializeStateVector( + _Handle, void*, DataType, const uint32_t, _StateVectorType) int custatevecGetDeviceMemHandler(_Handle, _DeviceMemHandler*) int custatevecSetDeviceMemHandler(_Handle, const _DeviceMemHandler*) @@ -418,7 +445,6 @@ cpdef abs2sum_array( mask_len (uint32_t): The length of ``mask_ordering``. - .. seealso:: `custatevecAbs2SumArray` """ # bit_ordering can be a pointer address, or a Python sequence @@ -456,6 +482,85 @@ cpdef abs2sum_array( check_status(status) +cpdef abs2sum_array_batched( + intptr_t handle, intptr_t batched_svs, int sv_data_type, uint32_t + n_index_bits, uint32_t n_svs, _Index sv_stride, + intptr_t abs2sum, _Index abs2sum_stride, + bit_ordering, uint32_t bit_ordering_len, + mask_bit_string, mask_ordering, uint32_t mask_len): + """Calculates the batched sum of squared absolute values for a given set of + index bits. + + Args: + handle (intptr_t): The library handle. + batched_svs (intptr_t): The pointer address (as Python :class:`int`) to + the batched statevectors (on device). + sv_data_type (cuquantum.cudaDataType): The data type of the statevector. + n_index_bits (uint32_t): The number of index bits. + n_svs (uint32_t): The number of batched statevectors. + sv_stride (int64_t): The stride between each state vector in the batch. + abs2sum (intptr_t): The pointer address (as Python :class:`int`) to the + array (on either host or device) that would hold the sums. + abs2sum_stride (int64_t): The stride between each ``abs2sum`` array in + the batch. + bit_ordering: A host array of index bit ordering. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of index bit ordering + + bit_ordering_len (uint32_t): The length of ``bit_ordering``. + mask_bit_string: An array for specifying mask values. It can be + + - an :class:`int` as the pointer address to the array (on host or + device) + - a Python sequence of mask values on host + + mask_ordering: A host array of mask ordering. 
It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of index bit ordering + + mask_len (uint32_t): The length of ``mask_ordering``. + + .. seealso:: `custatevecAbs2SumArrayBatched` + """ + # bit_ordering can be a pointer address, or a Python sequence + cdef vector[int32_t] bitOrderingData + cdef int32_t* bitOrderingPtr + if cpython.PySequence_Check(bit_ordering): + bitOrderingData = bit_ordering + bitOrderingPtr = bitOrderingData.data() + else: # a pointer address + bitOrderingPtr = bit_ordering + + # mask_bit_string can be a pointer address, or a Python sequence + cdef vector[_Index] maskBitStringData + cdef _Index* maskBitStringPtr + if cpython.PySequence_Check(mask_bit_string): + maskBitStringData = mask_bit_string + maskBitStringPtr = maskBitStringData.data() + else: # a pointer address + maskBitStringPtr = <_Index*>mask_bit_string + + # mask_ordering can be a pointer address, or a Python sequence + cdef vector[int32_t] maskOrderingData + cdef int32_t* maskOrderingPtr + if cpython.PySequence_Check(mask_ordering): + maskOrderingData = mask_ordering + maskOrderingPtr = maskOrderingData.data() + else: # a pointer address + maskOrderingPtr = mask_ordering + + with nogil: + status = custatevecAbs2SumArrayBatched( + <_Handle>handle, batched_svs, sv_data_type, + n_index_bits, n_svs, sv_stride, + abs2sum, abs2sum_stride, + bitOrderingPtr, bit_ordering_len, + maskBitStringPtr, maskOrderingPtr, mask_len) + check_status(status) + + cpdef collapse_on_z_basis( intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits, int32_t parity, basis_bits, uint32_t n_basis_bits, double norm): @@ -548,6 +653,134 @@ cpdef collapse_by_bitstring( check_status(status) +cpdef size_t collapse_by_bitstring_batched_get_workspace_size( + intptr_t handle, uint32_t n_svs, bit_strings, norms) except*: + """Computes the required workspace size for + :func:`collapse_by_bitstring_batched`. + + Args: + handle (intptr_t): The library handle. 
+ n_svs (uint32_t): The number of batched statevectors. + bit_strings: An array of bit strings. It can be + + - an :class:`int` as the pointer address to the array (on host or + device) + - a Python sequence of bits on host + + norms: An array of normalization factors for the statevectors after + collapse. It can be + + - an :class:`int` as the pointer address to the array (on host or + device) + - a Python sequence of normalization factors on host + + .. seealso:: `custatevecCollapseByBitStringBatchedGetWorkspaceSize` + """ + # bit_strings can be a pointer address, or a Python sequence + cdef vector[_Index] bitStringsData + cdef _Index* bitStringsPtr + if cpython.PySequence_Check(bit_strings): + bitStringsData = bit_strings + bitStringsPtr = bitStringsData.data() + else: # a pointer address + bitStringsPtr = <_Index*>bit_strings + + # norms can be a pointer address, or a Python sequence + cdef vector[double] normsData + cdef double* normsPtr + if cpython.PySequence_Check(norms): + normsData = norms + normsPtr = normsData.data() + else: # a pointer address + normsPtr = norms + + cdef size_t workspace_size + with nogil: + status = custatevecCollapseByBitStringBatchedGetWorkspaceSize( + <_Handle>handle, n_svs, bitStringsPtr, normsPtr, &workspace_size) + check_status(status) + + return workspace_size + + +cpdef collapse_by_bitstring_batched( + intptr_t handle, intptr_t batched_svs, int sv_data_type, + uint32_t n_index_bits, uint32_t n_svs, _Index sv_stride, + bit_strings, bit_ordering, uint32_t bit_string_len, norms, + intptr_t workspace, size_t workspace_size): + """Collapse the batched statevectors to the states specified by the given + bit strings. + + Args: + handle (intptr_t): The library handle. + batched_svs (intptr_t): The pointer address (as Python :class:`int`) to + the batched statevectors (on device). + sv_data_type (cuquantum.cudaDataType): The data type of the + statevectors. + n_index_bits (uint32_t): The number of index bits. 
+ n_svs (uint32_t): The number of batched statevectors. + sv_stride (int64_t): The stride between each state vector in the batch. + bit_strings: An array of bit strings. It can be + + - an :class:`int` as the pointer address to the array (on host or + device) + - a Python sequence of bits on host + + bit_ordering: A host array of bit string ordering. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of bit ordering + + bit_string_len (uint32_t): The length of individual ``bit_string``. + norms: An array of normalization factors for the statevectors after + collapse. It can be + + - an :class:`int` as the pointer address to the array (on host or + device) + - a Python sequence of normalization factors on host + + workspace (intptr_t): The pointer address (as Python :class:`int`) to the + workspace (on device). + workspace_size (size_t): The workspace size (in bytes). + + .. seealso:: `custatevecCollapseByBitStringBatched` + """ + # bit_strings can be a pointer address, or a Python sequence + cdef vector[_Index] bitStringsData + cdef _Index* bitStringsPtr + if cpython.PySequence_Check(bit_strings): + bitStringsData = bit_strings + bitStringsPtr = bitStringsData.data() + else: # a pointer address + bitStringsPtr = <_Index*>bit_strings + + # bit_ordering can be a pointer address, or a Python sequence + cdef vector[int32_t] bitOrderingData + cdef int32_t* bitOrderingPtr + if cpython.PySequence_Check(bit_ordering): + bitOrderingData = bit_ordering + bitOrderingPtr = bitOrderingData.data() + else: # a pointer address + bitOrderingPtr = bit_ordering + + # norms can be a pointer address, or a Python sequence + cdef vector[double] normsData + cdef double* normsPtr + if cpython.PySequence_Check(norms): + normsData = norms + normsPtr = normsData.data() + else: # a pointer address + normsPtr = norms + + with nogil: + status = custatevecCollapseByBitStringBatched( + <_Handle>handle, batched_svs, sv_data_type, + n_index_bits, n_svs, 
sv_stride, + bitStringsPtr, bitOrderingPtr, bit_string_len, normsPtr, + workspace, workspace_size) + check_status(status) + + cpdef int measure_on_z_basis( intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits, basis_bits, const uint32_t n_basis_bits, double rand_num, @@ -635,6 +868,66 @@ cpdef batch_measure( check_status(status) +cpdef measure_batched( + intptr_t handle, intptr_t batched_svs, int sv_data_type, + uint32_t n_index_bits, uint32_t n_svs, int64_t sv_stride, + intptr_t bit_strings, bit_ordering, const uint32_t bit_string_len, + rand_nums, int collapse): + """Performs measurement of a batch of statevectors. + + Args: + handle (intptr_t): The library handle. + batched_svs (intptr_t): The pointer address (as Python :class:`int`) to + the batched statevectors (on device). + sv_data_type (cuquantum.cudaDataType): The data type of the statevector. + n_index_bits (uint32_t): The number of index bits. + n_svs (uint32_t): The number of batched statevectors. + sv_stride (int64_t): The stride between each state vector in the batch. + bit_strings (intptr_t): The pointer address (as Python :class:`int`) to + a host or device array of measured bit strings. + bit_ordering: A host array of bit string ordering. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of bit ordering + + bit_string_len (uint32_t): The length of ``bit_string``. + rand_nums (double): An array of random numbers in [0, 1). It can be + + - an :class:`int` as the pointer address to the array (on host or + device) + - a Python sequence of random numbers + + collapse (Collapse): Indicate the collapse operation. + + .. 
seealso:: `custatevecMeasureBatched` + """ + # bit_ordering can be a pointer address, or a Python sequence + cdef vector[int32_t] bitOrderingData + cdef int32_t* bitOrderingPtr + if cpython.PySequence_Check(bit_ordering): + bitOrderingData = bit_ordering + bitOrderingPtr = bitOrderingData.data() + else: # a pointer address + bitOrderingPtr = bit_ordering + + # rand_nums can be a pointer address, or a Python sequence + cdef vector[double] randNumsData + cdef double* randNumsPtr + if cpython.PySequence_Check(rand_nums): + randNumsData = rand_nums + randNumsPtr = randNumsData.data() + else: # a pointer address + randNumsPtr = rand_nums + + with nogil: + status = custatevecMeasureBatched( + <_Handle>handle, batched_svs, sv_data_type, + n_index_bits, n_svs, sv_stride, + <_Index*>bit_strings, bitOrderingPtr, bit_string_len, + randNumsPtr, <_CollapseOp>collapse) + check_status(status) + + cpdef batch_measure_with_offset( intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits, intptr_t bit_string, bit_ordering, @@ -882,6 +1175,171 @@ cpdef apply_matrix( check_status(status) +cpdef size_t apply_matrix_batched_get_workspace_size( + intptr_t handle, int sv_data_type, uint32_t n_index_bits, + uint32_t n_svs, _Index sv_stride, + int map_type, matrix_indices, intptr_t matrices, int matrix_data_type, + int layout, int32_t adjoint, uint32_t n_matrices, + uint32_t n_targets, uint32_t n_controls, int compute_type) except*: + """Computes the required workspace size for :func:`apply_matrix_batched`. + + Args: + handle (intptr_t): The library handle. + sv_data_type (cuquantum.cudaDataType): The data type of the statevector. + n_index_bits (uint32_t): The number of index bits. + n_svs (uint32_t): The number of batched statevectors. + sv_stride (int64_t): The stride between each state vector in the batch. + map_type (MatrixMapType): Specify the way to assign matrices. 
+ matrix_indices: An array of matrix indices to indicate, in order, which + matrix is to be applied to the statevectors in the batch. It can be + + - an :class:`int` as the pointer address to the array (on host or + device) + - a Python sequence of matrix indices on host + + matrices (intptr_t): The pointer address (as Python :class:`int`) to the + matrices (on either host or device). + matrix_data_type (cuquantum.cudaDataType): The data type of the matrix. + layout (MatrixLayout): The memory layout of the matrix. + adjoint (int32_t): Whether the adjoint of the matrix would be applied. + n_matrices (uint32_t): The number of matrices. + n_targets (uint32_t): The length of ``targets``. + n_controls (uint32_t): The length of ``controls``. + compute_type (cuquantum.ComputeType): The compute type of matrix + multiplication. + + Returns: + size_t: The required workspace size (in bytes). + + .. seealso:: `custatevecApplyMatrixBatchedGetWorkspaceSize` + """ + # matrix_indices can be a pointer address, or a Python sequence + cdef vector[int32_t] matrixIndicesData + cdef int32_t* matrixIndicesPtr + if cpython.PySequence_Check(matrix_indices): + matrixIndicesData = matrix_indices + matrixIndicesPtr = matrixIndicesData.data() + else: # a pointer address + matrixIndicesPtr = matrix_indices + + cdef size_t extraWorkspaceSizeInBytes + with nogil: + status = custatevecApplyMatrixBatchedGetWorkspaceSize( + <_Handle>handle, sv_data_type, n_index_bits, + n_svs, sv_stride, <_MatrixMapType>map_type, + matrixIndicesPtr, matrices, matrix_data_type, + <_MatrixLayout>layout, adjoint, n_matrices, + n_targets, n_controls, <_ComputeType>compute_type, + &extraWorkspaceSizeInBytes) + check_status(status) + return extraWorkspaceSizeInBytes + + +cpdef apply_matrix_batched( + intptr_t handle, intptr_t batched_svs, int sv_data_type, + uint32_t n_index_bits, uint32_t n_svs, _Index sv_stride, + int map_type, matrix_indices, intptr_t matrices, int matrix_data_type, + int layout, int32_t adjoint, uint32_t 
n_matrices, + targets, uint32_t n_targets, + controls, control_bit_values, uint32_t n_controls, + int compute_type, intptr_t workspace, size_t workspace_size): + """Apply the specified gate matrices to the batched statevectors. + + Args: + handle (intptr_t): The library handle. + batched_svs (intptr_t): The pointer address (as Python :class:`int`) to + the batched statevectors (on device). + sv_data_type (cuquantum.cudaDataType): The data type of the statevectors. + n_index_bits (uint32_t): The number of index bits. + n_svs (uint32_t): The number of batched statevectors. + sv_stride (int64_t): The stride between each state vector in the batch. + map_type (MatrixMapType): Specify the way to assign matrices. + matrix_indices: An array of matrix indices to indicate, in order, which + matrix is to be applied to the statevectors in the batch. It can be + + - an :class:`int` as the pointer address to the array (on host or + device) + - a Python sequence of matrix indices on host + + matrices (intptr_t): The pointer address (as Python :class:`int`) to the + matrices (on either host or device). + matrix_data_type (cuquantum.cudaDataType): The data type of the matrix. + layout (MatrixLayout): The memory layout of the matrix. + adjoint (int32_t): Whether the adjoint of the matrix would be applied. + n_matrices (uint32_t): The number of matrices. + targets: A host array of target bits. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of target bits + + n_targets (uint32_t): The length of ``targets``. + controls: A host array of control bits. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of control bits + + control_bit_values: A host array of control bit values. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of control bit values + + n_controls (uint32_t): The length of ``controls``.
+ compute_type (cuquantum.ComputeType): The compute type of matrix + multiplication. + workspace (intptr_t): The pointer address (as Python :class:`int`) to the + workspace (on device). + workspace_size (size_t): The workspace size (in bytes). + + .. seealso:: `custatevecApplyMatrixBatched` + """ + # matrix_indices can be a pointer address, or a Python sequence + cdef vector[int32_t] matrixIndicesData + cdef int32_t* matrixIndicesPtr + if cpython.PySequence_Check(matrix_indices): + matrixIndicesData = matrix_indices + matrixIndicesPtr = matrixIndicesData.data() + else: # a pointer address + matrixIndicesPtr = matrix_indices + + # targets can be a pointer address, or a Python sequence + cdef vector[int32_t] targetsData + cdef int32_t* targetsPtr + if cpython.PySequence_Check(targets): + targetsData = targets + targetsPtr = targetsData.data() + else: # a pointer address + targetsPtr = targets + + # controls can be a pointer address, or a Python sequence + cdef vector[int32_t] controlsData + cdef int32_t* controlsPtr + if cpython.PySequence_Check(controls): + controlsData = controls + controlsPtr = controlsData.data() + else: # a pointer address + controlsPtr = controls + + # control_bit_values can be a pointer address, or a Python sequence + cdef vector[int32_t] controlBitValuesData + cdef int32_t* controlBitValuesPtr + if cpython.PySequence_Check(control_bit_values): + controlBitValuesData = control_bit_values + controlBitValuesPtr = controlBitValuesData.data() + else: # a pointer address + controlBitValuesPtr = control_bit_values + + with nogil: + status = custatevecApplyMatrixBatched( + <_Handle>handle, batched_svs, sv_data_type, + n_index_bits, n_svs, sv_stride, <_MatrixMapType>map_type, + matrixIndicesPtr, matrices, matrix_data_type, + <_MatrixLayout>layout, adjoint, n_matrices, + targetsPtr, n_targets, + controlsPtr, controlBitValuesPtr, n_controls, + <_ComputeType>compute_type, workspace, workspace_size) + check_status(status) + + cpdef size_t 
compute_expectation_get_workspace_size( intptr_t handle, int sv_data_type, uint32_t n_index_bits, intptr_t matrix, int matrix_data_type, int layout, uint32_t n_basis_bits, int compute_type) except*: @@ -1908,6 +2366,26 @@ cpdef double test_matrix_type( return residualNorm +cpdef initialize_state_vector( + intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits, + int sv_type): + """Initialize the state vector. + + Args: + handle (intptr_t): The library handle. + sv (intptr_t): The pointer address (as Python :class:`int`) to the + statevector (on device). + sv_data_type (cuquantum.cudaDataType): The data type of the statevector. + n_index_bits (uint32_t): The number of index bits. + sv_type (StateVectorType): The target quantum state. + """ + with nogil: + status = custatevecInitializeStateVector( + <_Handle>handle, sv, sv_data_type, n_index_bits, + <_StateVectorType>sv_type) + check_status(status) + + cpdef set_device_mem_handler(intptr_t handle, handler): """ Set the device memory handler for cuTensorNet. 
@@ -2709,6 +3187,11 @@ class MatrixType(IntEnum): UNITARY = CUSTATEVEC_MATRIX_TYPE_UNITARY HERMITIAN = CUSTATEVEC_MATRIX_TYPE_HERMITIAN +class MatrixMapType(IntEnum): + """See `custatevecMatrixMapType_t`.""" + BROADCAST = CUSTATEVEC_MATRIX_MAP_TYPE_BROADCAST + MATRIX_INDEXED = CUSTATEVEC_MATRIX_MAP_TYPE_MATRIX_INDEXED + class Collapse(IntEnum): """See `custatevecCollapseOp_t`.""" NONE = CUSTATEVEC_COLLAPSE_NONE @@ -2737,6 +3220,12 @@ class DataTransferType(IntEnum): RECV = CUSTATEVEC_DATA_TRANSFER_TYPE_RECV SEND_RECV = CUSTATEVEC_DATA_TRANSFER_TYPE_SEND_RECV +class StateVectorType(IntEnum): + """See `custatevecStateVectorType_t`.""" + ZERO = CUSTATEVEC_STATE_VECTOR_TYPE_ZERO + UNIFORM = CUSTATEVEC_STATE_VECTOR_TYPE_UNIFORM + GHZ = CUSTATEVEC_STATE_VECTOR_TYPE_GHZ + W = CUSTATEVEC_STATE_VECTOR_TYPE_W del IntEnum diff --git a/python/cuquantum/cutensornet/_internal/circuit_converter_utils.py b/python/cuquantum/cutensornet/_internal/circuit_converter_utils.py index 26bd8ea..6f30798 100644 --- a/python/cuquantum/cutensornet/_internal/circuit_converter_utils.py +++ b/python/cuquantum/cutensornet/_internal/circuit_converter_utils.py @@ -2,12 +2,15 @@ # # SPDX-License-Identifier: BSD-3-Clause +import types + try: import cirq from . import circuit_parser_utils_cirq except ImportError: cirq = circuit_parser_utils_cirq = None import cupy as cp +import numpy as np try: import qiskit from . 
import circuit_parser_utils_qiskit @@ -15,15 +18,18 @@ qiskit = circuit_parser_utils_qiskit = None from .tensor_wrapper import _get_backend_asarray_func +from ...utils import WHITESPACE_UNICODE + EINSUM_SYMBOLS_BASE = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +WHITESPACE_SYMBOLS_ID = None CIRQ_MIN_VERSION = '0.6.0' -QISKIT_MIN_VERSION = '0.24.0' # qiskit metapackage version +QISKIT_MIN_VERSION = '0.24.0' # qiskit metapackage version -import types EMPTY_DICT = types.MappingProxyType({}) + def check_version(package_name, version, minimum_version): """ Check if the current version of a package is above the required minimum. @@ -35,15 +41,35 @@ def check_version(package_name, version, minimum_version): f'current version: {version}') return None + def _get_symbol(i): """ - Return a Unicode as label for index. + Return a unicode as label for index. Whitespace unicode characters are skipped. - .. note:: This function is adopted from `opt_einsum `_ + This function can offer 1113955 (= sys.maxunicode - 140 - 16) unique symbols. 
""" if i < 52: return EINSUM_SYMBOLS_BASE[i] - return chr(i + 140) + + global WHITESPACE_SYMBOLS_ID + if WHITESPACE_SYMBOLS_ID is None: + whitespace = WHITESPACE_UNICODE + WHITESPACE_SYMBOLS_ID = np.asarray([ord(c) for c in whitespace], dtype=np.int32) + WHITESPACE_SYMBOLS_ID = WHITESPACE_SYMBOLS_ID[WHITESPACE_SYMBOLS_ID >= 192] + + # leave "holes" in the integer -> unicode mapping to avoid using whitespaces as symbols + i += 140 + offset = 0 + for hole in WHITESPACE_SYMBOLS_ID: # loop size = 16 + if i + offset < hole: + break + offset += 1 + + try: + return chr(i + offset) + except ValueError as e: + raise ValueError(f"{i=} would exceed unicode limit") from e + def infer_parser(circuit): """ diff --git a/python/cuquantum/cutensornet/_internal/circuit_parser_utils_qiskit.py b/python/cuquantum/cutensornet/_internal/circuit_parser_utils_qiskit.py index c0c7f56..c2e4837 100644 --- a/python/cuquantum/cutensornet/_internal/circuit_parser_utils_qiskit.py +++ b/python/cuquantum/cutensornet/_internal/circuit_parser_utils_qiskit.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause import cupy as cp +import numpy as np from qiskit import QuantumCircuit from qiskit.circuit import Barrier, ControlledGate, Delay, Gate, Measure from qiskit.extensions import UnitaryGate @@ -26,13 +27,14 @@ def get_inverse_circuit(circuit): """ return circuit.inverse() -def get_decomposed_gates(circuit, qubit_map=None, gates=None, gate_process_func=None): +def get_decomposed_gates(circuit, qubit_map=None, gates=None, gate_process_func=None, global_phase=0): """ Return the gate sequence for the given circuit. Compound gates/instructions will be decomposed to either standard gates or customized unitary gates. 
""" if gates is None: gates = [] + global_phase += circuit.global_phase for operation, gate_qubits, _ in circuit: if qubit_map: gate_qubits = [qubit_map[q] for q in gate_qubits] @@ -52,8 +54,8 @@ def get_decomposed_gates(circuit, qubit_map=None, gates=None, gate_process_func= raise ValueError(f'operation type {type(operation)} not supported') # for composite gate, must provide a map from the sub circuit to the original circuit next_qubit_map = dict(zip(operation.definition.qubits, gate_qubits)) - gates = get_decomposed_gates(operation.definition, qubit_map=next_qubit_map, gates=gates, gate_process_func=gate_process_func) - return gates + gates, global_phase = get_decomposed_gates(operation.definition, qubit_map=next_qubit_map, gates=gates, gate_process_func=gate_process_func, global_phase=global_phase) + return gates, global_phase def unfold_circuit(circuit, dtype='complex128', backend=cp): """ @@ -75,12 +77,14 @@ def unfold_circuit(circuit, dtype='complex128', backend=cp): def gate_process_func(operation, gate_qubits): tensor = operation.to_matrix().reshape((2,2)*len(gate_qubits)) tensor = asarray(tensor, dtype=dtype) - if isinstance(operation, ControlledGate): - # in qiskit notation, qubit at high index is the target qubit - gate_qubits = gate_qubits[::-1] - return tensor, gate_qubits + # in qiskit notation, qubits are labelled in the inverse order + return tensor, gate_qubits[::-1] - gates = get_decomposed_gates(circuit, gate_process_func=gate_process_func) + gates, global_phase = get_decomposed_gates(circuit, gate_process_func=gate_process_func, global_phase=0) + if global_phase != 0: + phase = np.exp(1j*global_phase) + phase_gate = asarray([[phase, 0], [0, phase]], dtype=dtype) + gates = [(phase_gate, qubits[:1]), ] + gates return qubits, gates @@ -96,7 +100,7 @@ def get_lightcone_circuit(circuit, coned_qubits): A :class:`qiskit.QuantumCircuit` object that potentially contains less number of gates """ coned_qubits = set(coned_qubits) - gates = 
get_decomposed_gates(circuit) + gates, global_phase = get_decomposed_gates(circuit) newqc = QuantumCircuit(circuit.qubits) ix = len(gates) tail_operations = [] @@ -109,4 +113,5 @@ def get_lightcone_circuit(circuit, coned_qubits): coned_qubits |= qubit_set for operation, gate_qubits in gates[:ix] + tail_operations[::-1]: newqc.append(operation, gate_qubits) + newqc.global_phase = global_phase return newqc diff --git a/python/cuquantum/cutensornet/_internal/decomposition_utils.py b/python/cuquantum/cutensornet/_internal/decomposition_utils.py index 7ada989..56c911f 100644 --- a/python/cuquantum/cutensornet/_internal/decomposition_utils.py +++ b/python/cuquantum/cutensornet/_internal/decomposition_utils.py @@ -33,14 +33,23 @@ 'L2': cutn.TensorSVDNormalization.L2, 'LInf': cutn.TensorSVDNormalization.LINF} +SVD_ALGORITHM_MAP = {'gesvd': cutn.TensorSVDAlgo.GESVD, + 'gesvdj': cutn.TensorSVDAlgo.GESVDJ, + 'gesvdp': cutn.TensorSVDAlgo.GESVDP, + 'gesvdr': cutn.TensorSVDAlgo.GESVDR} + +SVD_ALGORITHM_MAP_TO_STRING = dict((val, key) for key, val in SVD_ALGORITHM_MAP.items()) + SVD_METHOD_CONFIG_MAP = {'abs_cutoff': cutn.TensorSVDConfigAttribute.ABS_CUTOFF, 'rel_cutoff': cutn.TensorSVDConfigAttribute.REL_CUTOFF, 'partition': cutn.TensorSVDConfigAttribute.S_PARTITION, - 'normalization': cutn.TensorSVDConfigAttribute.S_NORMALIZATION} + 'normalization': cutn.TensorSVDConfigAttribute.S_NORMALIZATION, + 'algorithm': cutn.TensorSVDConfigAttribute.ALGO} SVD_INFO_MAP = {'full_extent': cutn.TensorSVDInfoAttribute.FULL_EXTENT, 'reduced_extent': cutn.TensorSVDInfoAttribute.REDUCED_EXTENT, - 'discarded_weight': cutn.TensorSVDInfoAttribute.DISCARDED_WEIGHT} + 'discarded_weight': cutn.TensorSVDInfoAttribute.DISCARDED_WEIGHT, + 'algorithm': cutn.TensorSVDInfoAttribute.ALGO} def compute_combined_size(size_dict, modes): @@ -202,32 +211,44 @@ def parse_decomposition(subscripts, *operands): return operands, inputs, outputs, size_dict, mode_map_user_to_ord, mode_map_ord_to_user, mid_extent -def 
get_svd_config_info_scalar_attr(handle, obj_type, obj, attr): +def get_svd_config_info_scalar_attr(handle, obj_type, obj, attr, svd_algorithm=None): """ Get the data for given attribute of SVDConfig or SVDInfo. """ if obj_type == 'config': - dtype_getter = cutn.tensor_svd_config_get_attribute_dtype + if attr != cutn.TensorSVDConfigAttribute.ALGO_PARAMS: + dtype = cutn.tensor_svd_config_get_attribute_dtype(attr) + else: + if svd_algorithm not in (cutn.TensorSVDAlgo.GESVDJ, cutn.TensorSVDAlgo.GESVDR): + return None + dtype = cutn.tensor_svd_algo_params_get_dtype(svd_algorithm) getter = cutn.tensor_svd_config_get_attribute elif obj_type == 'info': - dtype_getter = cutn.tensor_svd_info_get_attribute_dtype + if attr != cutn.TensorSVDInfoAttribute.ALGO_STATUS: + dtype = cutn.tensor_svd_info_get_attribute_dtype(attr) + else: + if svd_algorithm not in (cutn.TensorSVDAlgo.GESVDJ, cutn.TensorSVDAlgo.GESVDP): + return None + dtype = cutn.tensor_svd_algo_status_get_dtype(svd_algorithm) getter = cutn.tensor_svd_info_get_attribute else: raise ValueError("object type must be either config or info") - - dtype = dtype_getter(attr) data = numpy.empty((1,), dtype=dtype) getter(handle, obj, attr, data.ctypes.data, data.dtype.itemsize) return data -def set_svd_config_scalar_attr(handle, obj, attr, data): +def set_svd_config_scalar_attr(handle, obj, attr, data, svd_algorithm=None): """ Set the data for given attribute of SVDConfig. 
""" - dtype_getter = cutn.tensor_svd_config_get_attribute_dtype setter = cutn.tensor_svd_config_set_attribute - dtype = dtype_getter(attr) + if attr != cutn.TensorSVDConfigAttribute.ALGO_PARAMS: + dtype = cutn.tensor_svd_config_get_attribute_dtype(attr) + else: + if svd_algorithm not in (cutn.TensorSVDAlgo.GESVDJ, cutn.TensorSVDAlgo.GESVDR): + raise ValueError(f"Algorithm specific parameters not supported for {svd_algorithm}") + dtype = cutn.tensor_svd_algo_params_get_dtype(svd_algorithm) if not isinstance(data, numpy.ndarray): data = numpy.asarray(data, dtype=dtype) setter(handle, obj, attr, data.ctypes.data, data.dtype.itemsize) @@ -237,16 +258,24 @@ def parse_svd_config(handle, svd_config, svd_method, logger=None): """ Given an SVDMethod object, set the corresponding attributes in the SVDConfig. """ + svd_algorithm = None for method_attr, attr in SVD_METHOD_CONFIG_MAP.items(): data = getattr(svd_method, method_attr) if method_attr == 'partition': data = PARTITION_MAP[data] elif method_attr == 'normalization': data = NORMALIZATION_MAP[data] + elif method_attr == 'algorithm': + svd_algorithm = data = SVD_ALGORITHM_MAP[data] set_svd_config_scalar_attr(handle, svd_config, attr, data) if logger is not None: logger.info(f"The SVDConfig attribute '{method_attr}' has been set to {data}.") + algo_params = svd_method._get_algo_params() + if algo_params is not None: + set_svd_config_scalar_attr(handle, svd_config, cutn.TensorSVDConfigAttribute.ALGO_PARAMS, algo_params, svd_algorithm=svd_algorithm) + if logger is not None: + logger.info(f"The SVDConfig attribute '{cutn.TensorSVDConfigAttribute.ALGO_PARAMS}' has been set to {algo_params}.") def get_svd_info_dict(handle, svd_info): """ @@ -255,6 +284,13 @@ def get_svd_info_dict(handle, svd_info): info = dict() for key, attr in SVD_INFO_MAP.items(): info[key] = get_svd_config_info_scalar_attr(handle, 'info', svd_info, attr).item() + svd_algorithm = info['algorithm'] + algo_status = get_svd_config_info_scalar_attr(handle, 
'info', svd_info, cutn.TensorSVDInfoAttribute.ALGO_STATUS, svd_algorithm=svd_algorithm) + info['algorithm'] = SVD_ALGORITHM_MAP_TO_STRING[svd_algorithm] + if algo_status is not None: + for name in algo_status.dtype.names: + key = info['algorithm'] + f'_{name}' + info[key] = algo_status[name].item() return info @@ -282,8 +318,6 @@ def parse_decompose_operands_options(options, wrapped_operands, allowed_dtype_na with utils.device_ctx(device_id): handle = cutn.create() - blocking = options.blocking is True or operands_location == 'cpu' - dtype_name = utils.get_operands_dtype(wrapped_operands) if allowed_dtype_names is not None and dtype_name not in allowed_dtype_names: raise ValueError(f"dtype {dtype_name} not supported") @@ -295,7 +329,7 @@ def parse_decompose_operands_options(options, wrapped_operands, allowed_dtype_na internal_options = options.__class__(device_id=device_id, logger=logger, handle=handle, - blocking=blocking, + blocking=options.blocking, compute_type=compute_type, memory_limit=options.memory_limit, allocator=allocator) @@ -309,22 +343,31 @@ def allocate_and_set_workspace(handle, allocator, workspace_desc, pref, mem_spac """ workspace_size = cutn.workspace_get_memory_size(handle, workspace_desc, pref, mem_space, workspace_kind) # Allocate and set workspace - with utils.device_ctx(device_id), stream_ctx: - try: - logger.debug(f"Allocating memory for {task_name}") - workspace_ptr = allocator.memalloc(workspace_size) - except TypeError as e: - message = "The method 'memalloc' in the allocator object must conform to the interface in the "\ - "'BaseCUDAMemoryManager' protocol." 
- raise TypeError(message) from e - - logger.debug(f"Finished allocating memory of size {formatters.MemoryStr(workspace_size)} for decomposition in the context of stream {stream}.") - - device_ptr = utils.get_ptr_from_memory_pointer(workspace_ptr) - cutn.workspace_set_memory(handle, workspace_desc, mem_space, workspace_kind, device_ptr, workspace_size) - logger.debug(f"The workspace memory (device pointer = {device_ptr}) has been set in the workspace descriptor.") - - return workspace_ptr + if mem_space == cutn.Memspace.DEVICE: + with utils.device_ctx(device_id), stream_ctx: + try: + logger.debug(f"Allocating device memory for {task_name}") + workspace_ptr = allocator.memalloc(workspace_size) + except TypeError as e: + message = "The method 'memalloc' in the allocator object must conform to the interface in the "\ + "'BaseCUDAMemoryManager' protocol." + raise TypeError(message) from e + + logger.debug(f"Finished allocating device memory of size {formatters.MemoryStr(workspace_size)} for decomposition in the context of stream {stream}.") + device_ptr = utils.get_ptr_from_memory_pointer(workspace_ptr) + cutn.workspace_set_memory(handle, workspace_desc, mem_space, workspace_kind, device_ptr, workspace_size) + logger.debug(f"The workspace memory (device pointer = {device_ptr}) has been set in the workspace descriptor.") + return workspace_ptr + elif workspace_size != 0: + # host workspace + logger.debug(f"Allocating host memory for {task_name}") + workspace_host = numpy.empty(workspace_size, dtype=numpy.int8) + logger.debug(f"Finished allocating host memory of size {formatters.MemoryStr(workspace_size)} for decomposition.") + cutn.workspace_set_memory(handle, workspace_desc, mem_space, workspace_kind, workspace_host.ctypes.data, workspace_size) + logger.debug(f"The workspace memory (host pointer = {workspace_host.ctypes.data}) has been set in the workspace descriptor.") + return workspace_host + else: + return None def _destroy_tensor_descriptors(desc_tensors): diff 
--git a/python/cuquantum/cutensornet/_internal/einsum_parser.py b/python/cuquantum/cutensornet/_internal/einsum_parser.py index f3c04a5..44da696 100644 --- a/python/cuquantum/cutensornet/_internal/einsum_parser.py +++ b/python/cuquantum/cutensornet/_internal/einsum_parser.py @@ -8,17 +8,20 @@ from collections import Counter from itertools import chain -import string +import re +import sys import numpy as np from . import formatters from .tensor_wrapper import wrap_operands +from ...utils import WHITESPACE_UNICODE DISALLOWED_LABELS = set(['.', '-', '>']) native_to_str = lambda native : "'" + ''.join(s if s is not Ellipsis else '...' for s in native) + "'" + def select_morpher(interleaved, mapper=None): """ Select appropriate function for mode label representation based on string or interleaved format. @@ -48,7 +51,8 @@ def parse_single(single): """ Parse single operand mode labels considering ellipsis. Leading or trailing whitespace, if present, is removed. """ - subexpr = single.strip(string.whitespace).split('...') + whitespace = WHITESPACE_UNICODE + subexpr = single.strip(whitespace).split('...') n = len(subexpr) expr = [[Ellipsis]] * (2*n - 1) expr[::2] = subexpr @@ -60,10 +64,11 @@ def check_single(single): """ Check for disallowed characters used as mode labels for a single operand. 
""" + whitespace = WHITESPACE_UNICODE for s in single: if s is Ellipsis: continue - if s in string.whitespace or s in DISALLOWED_LABELS: + if s in whitespace or s in DISALLOWED_LABELS: return False return True diff --git a/python/cuquantum/cutensornet/cutensornet.pxd b/python/cuquantum/cutensornet/cutensornet.pxd index 3bf3f69..3a21b92 100644 --- a/python/cuquantum/cutensornet/cutensornet.pxd +++ b/python/cuquantum/cutensornet/cutensornet.pxd @@ -13,6 +13,13 @@ from cuquantum.utils cimport DataType, DeviceAllocType, DeviceFreeType, Stream cdef extern from '' nogil: + # cuTensorNet consts + const int CUTENSORNET_MAJOR + const int CUTENSORNET_MINOR + const int CUTENSORNET_PATCH + const int CUTENSORNET_VERSION + const int CUTENSORNET_ALLOCATOR_NAME_LEN + # cuTensorNet types ctypedef void* _Handle 'cutensornetHandle_t' ctypedef int _Status 'cutensornetStatus_t' @@ -26,6 +33,9 @@ cdef extern from '' nogil: ctypedef void* _TensorDescriptor 'cutensornetTensorDescriptor_t' ctypedef void* _TensorSVDConfig 'cutensornetTensorSVDConfig_t' ctypedef void* _TensorSVDInfo 'cutensornetTensorSVDInfo_t' + ctypedef void* _State 'cutensornetState_t' + ctypedef void* _StateMarginal 'cutensornetStateMarginal_t' + ctypedef void* _StateSampler 'cutensornetStateSampler_t' # cuTensorNet structs ctypedef struct _NodePair 'cutensornetNodePair_t': @@ -48,14 +58,34 @@ cdef extern from '' nogil: void* ctx DeviceAllocType device_alloc DeviceFreeType device_free - # Cython limitation: cannot use C defines in declaring a static array, - # so we just have to hard-code CUTENSORNET_ALLOCATOR_NAME_LEN here... 
- char name[64] + char name[CUTENSORNET_ALLOCATOR_NAME_LEN] ctypedef struct _TensorQualifiers 'cutensornetTensorQualifiers_t': - int32_t isConjugate # cannot assign default value to fields in cdef structs - int32_t isConstant # cannot assign default value to fields in cdef structs - + # cannot assign default value to fields in cdef structs + int32_t isConjugate + int32_t isConstant + int32_t requiresGradient + + ctypedef struct _TensorIDList 'cutensornetTensorIDList_t': + int32_t numTensors + int32_t* data + + ctypedef struct _GesvdjParams 'cutensornetGesvdjParams_t': + double tol + int64_t maxSweeps + + ctypedef struct _GesvdrParams 'cutensornetGesvdrParams_t': + int64_t oversampling + int64_t niters + + ctypedef struct _GesvdjStatus 'cutensornetGesvdjStatus_t': + double residual + int64_t sweeps + + ctypedef struct _GesvdpStatus 'cutensornetGesvdpStatus_t': + double errSigma + + # cuTensorNet function pointers ctypedef void(*LoggerCallbackData 'cutensornetLoggerCallbackData_t')( int32_t logLevel, const char* functionName, @@ -79,6 +109,10 @@ cdef extern from '' nogil: CUTENSORNET_OPTIMIZER_COST_TIME CUTENSORNET_OPTIMIZER_COST_TIME_TUNED + ctypedef enum _SmartOption 'cutensornetSmartOption_t': + CUTENSORNET_SMART_OPTION_DISABLED + CUTENSORNET_SMART_OPTION_ENABLED + ctypedef enum _ContractionOptimizerConfigAttribute 'cutensornetContractionOptimizerConfigAttributes_t': CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_NUM_PARTITIONS CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_CUTOFF_SIZE @@ -98,6 +132,8 @@ cdef extern from '' nogil: CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SIMPLIFICATION_DISABLE_DR CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_COST_FUNCTION_OBJECTIVE + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_CACHE_REUSE_NRUNS + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SMART_OPTION ctypedef enum _ContractionOptimizerInfoAttribute 'cutensornetContractionOptimizerInfoAttributes_t': 
CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES @@ -137,6 +173,14 @@ cdef extern from '' nogil: CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF CUTENSORNET_TENSOR_SVD_CONFIG_S_NORMALIZATION CUTENSORNET_TENSOR_SVD_CONFIG_S_PARTITION + CUTENSORNET_TENSOR_SVD_CONFIG_ALGO + CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS + + ctypedef enum _TensorSVDAlgo 'cutensornetTensorSVDAlgo_t': + CUTENSORNET_TENSOR_SVD_ALGO_GESVD + CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ + CUTENSORNET_TENSOR_SVD_ALGO_GESVDP + CUTENSORNET_TENSOR_SVD_ALGO_GESVDR ctypedef enum _TensorSVDNormalization 'cutensornetTensorSVDNormalization_t': CUTENSORNET_TENSOR_SVD_NORMALIZATION_NONE @@ -154,14 +198,26 @@ cdef extern from '' nogil: CUTENSORNET_TENSOR_SVD_INFO_FULL_EXTENT CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT + CUTENSORNET_TENSOR_SVD_INFO_ALGO + CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS ctypedef enum _GateSplitAlgo 'cutensornetGateSplitAlgo_t': CUTENSORNET_GATE_SPLIT_ALGO_DIRECT CUTENSORNET_GATE_SPLIT_ALGO_REDUCED - - # cuTensorNet consts - int CUTENSORNET_MAJOR - int CUTENSORNET_MINOR - int CUTENSORNET_PATCH - int CUTENSORNET_VERSION - int CUTENSORNET_ALLOCATOR_NAME_LEN + + ctypedef enum _StatePurity 'cutensornetStatePurity_t': + CUTENSORNET_STATE_PURITY_PURE + + ctypedef enum _MarginalAttribute 'cutensornetMarginalAttributes_t': + CUTENSORNET_MARGINAL_OPT_NUM_HYPER_SAMPLES + + ctypedef enum _SamplerAttribute 'cutensornetSamplerAttributes_t': + CUTENSORNET_SAMPLER_OPT_NUM_HYPER_SAMPLES + + ctypedef enum _NetworkAttribute 'cutensornetNetworkAttributes_t': + CUTENSORNET_NETWORK_INPUT_TENSORS_NUM_CONSTANT + CUTENSORNET_NETWORK_INPUT_TENSORS_CONSTANT + CUTENSORNET_NETWORK_INPUT_TENSORS_NUM_CONJUGATED + CUTENSORNET_NETWORK_INPUT_TENSORS_CONJUGATED + CUTENSORNET_NETWORK_INPUT_TENSORS_NUM_REQUIRE_GRAD + CUTENSORNET_NETWORK_INPUT_TENSORS_REQUIRE_GRAD diff --git a/python/cuquantum/cutensornet/cutensornet.pyx b/python/cuquantum/cutensornet/cutensornet.pyx index f9b93a3..a5d0c09 
100644 --- a/python/cuquantum/cutensornet/cutensornet.pyx +++ b/python/cuquantum/cutensornet/cutensornet.pyx @@ -38,6 +38,10 @@ cdef extern from * nogil: int32_t, const int64_t[], const int64_t[], const int32_t[], DataType, _ComputeType, _NetworkDescriptor*) int cutensornetDestroyNetworkDescriptor(_NetworkDescriptor) + int cutensornetNetworkGetAttribute( + _Handle, _NetworkDescriptor, _NetworkAttribute, void*, size_t) + int cutensornetNetworkSetAttribute( + _Handle, _NetworkDescriptor, _NetworkAttribute, void*, size_t) int cutensornetGetOutputTensorDetails( const _Handle, const _NetworkDescriptor, int32_t*, size_t*, int32_t*, int64_t*, int64_t*) @@ -68,15 +72,15 @@ cdef extern from * nogil: void* const, uint64_t) int cutensornetWorkspaceSetMemory( const _Handle, _WorkspaceDescriptor, _Memspace, - _WorkspaceKind, - void* const, int64_t) + _WorkspaceKind, void* const, int64_t) int cutensornetWorkspaceGet( const _Handle, const _WorkspaceDescriptor, _Memspace, void**, uint64_t*) int cutensornetWorkspaceGetMemory( const _Handle, const _WorkspaceDescriptor, _Memspace, - _WorkspaceKind, - void**, int64_t*) + _WorkspaceKind, void**, int64_t*) + int cutensornetWorkspacePurgeCache( + _Handle, _WorkspaceDescriptor, _Memspace) int cutensornetDestroyWorkspaceDescriptor(_WorkspaceDescriptor) # optimizer info @@ -224,6 +228,47 @@ cdef extern from * nogil: int cutensornetDistributedGetProcRank(_Handle, int*) int cutensornetDistributedSynchronize(_Handle) + # gradients + int cutensornetComputeGradientsBackward( + const _Handle, _ContractionPlan, const void* const[], + void*, void* const[], int32_t, _WorkspaceDescriptor, Stream) + + # high level API + # state preparation + int cutensornetCreateState( + const _Handle, _StatePurity, int32_t, const int64_t*, + DataType, _State*) + int cutensornetDestroyState(_State) + int cutensornetStateApplyTensor( + const _Handle, _State, int32_t, const int32_t*, void*, + const int64_t*, const int32_t, const int32_t, const int32_t, int64_t*) + int 
cutensornetStateUpdateTensor( + const _Handle, _State, int64_t, void*, int32_t) + + # marginals + int cutensornetCreateMarginal( + const _Handle, _State, int32_t, const int32_t*, + int32_t, const int32_t*, const int64_t*, _StateMarginal*) + int cutensornetMarginalConfigure( + const _Handle, _StateMarginal, _MarginalAttribute, const void*, size_t) + int cutensornetMarginalPrepare( + const _Handle, _StateMarginal, size_t, _WorkspaceDescriptor, Stream) + int cutensornetMarginalCompute( + const _Handle, _StateMarginal, const int64_t*, _WorkspaceDescriptor, void*, Stream) + int cutensornetDestroyMarginal(_StateMarginal) + + # sampling + int cutensornetCreateSampler( + const _Handle, _State, int32_t, const int32_t*, _StateSampler*) + int cutensornetSamplerConfigure( + const _Handle, _StateSampler, _SamplerAttribute, const void*, size_t) + int cutensornetSamplerPrepare( + const _Handle, _StateSampler, size_t, _WorkspaceDescriptor, Stream) + int cutensornetSamplerSample( + const _Handle, _StateSampler, int64_t, + _WorkspaceDescriptor, int64_t*, Stream) + int cutensornetDestroySampler(_StateSampler) + class cuTensorNetError(RuntimeError): def __init__(self, status): @@ -519,6 +564,93 @@ cpdef destroy_network_descriptor(intptr_t tn_desc): check_status(status) +######################### Python specific utility ######################### + +tensor_id_list_dtype = _numpy.dtype( + {'names':['num_tensors','data'], + 'formats': (_numpy.int32, _numpy.intp), + 'itemsize': sizeof(_TensorIDList), + }, align=True +) + +cdef dict network_sizes = { + CUTENSORNET_NETWORK_INPUT_TENSORS_NUM_CONSTANT: _numpy.int32, + CUTENSORNET_NETWORK_INPUT_TENSORS_CONSTANT: tensor_id_list_dtype, + CUTENSORNET_NETWORK_INPUT_TENSORS_NUM_CONJUGATED: _numpy.int32, + CUTENSORNET_NETWORK_INPUT_TENSORS_CONJUGATED: tensor_id_list_dtype, + CUTENSORNET_NETWORK_INPUT_TENSORS_NUM_REQUIRE_GRAD: _numpy.int32, + CUTENSORNET_NETWORK_INPUT_TENSORS_REQUIRE_GRAD: tensor_id_list_dtype, +} + +cpdef 
network_get_attribute_dtype(int attr): + """Get the Python data type of the corresponding network descriptor + attribute. + + Args: + attr (NetworkAttribute): The attribute to query. + + Returns: + The data type of the queried attribute. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for :func:`network_get_attribute` and + :func:`network_set_attribute`. + """ + return network_sizes[attr] + +########################################################################### + + +cpdef network_get_attribute( + intptr_t handle, intptr_t tn_desc, int attr, + intptr_t buf, size_t size): + """Get the network descriptor attribute. + + Args: + handle (intptr_t): The library handle. + tn_desc (intptr_t): The tensor network descriptor. + attr (NetworkAttribute): The attribute to query. + buf (intptr_t): The pointer address (as Python :class:`int`) for storing + the returned attribute value. + size (size_t): The size of ``buf`` (in bytes). + + .. note:: To compute ``size``, use the itemsize of the corresponding data + type, which can be queried using :func:`network_get_attribute_dtype`. + + .. seealso:: `cutensornetNetworkGetAttribute` + """ + with nogil: + status = cutensornetNetworkGetAttribute( + <_Handle>handle, <_NetworkDescriptor>tn_desc, + <_NetworkAttribute>attr, buf, size) + check_status(status) + + +cpdef network_set_attribute( + intptr_t handle, intptr_t tn_desc, int attr, + intptr_t buf, size_t size): + """Set the network descriptor attribute. + + Args: + handle (intptr_t): The library handle. + tn_desc (intptr_t): The tensor network descriptor. + attr (NetworkAttribute): The attribute to set. + buf (intptr_t): The pointer address (as Python :class:`int`) to the + attribute data. + size (size_t): The size of ``buf`` (in bytes). + + .. note:: To compute ``size``, use the itemsize of the corresponding data + type, which can be queried using :func:`network_get_attribute_dtype`. + + .. 
seealso:: `cutensornetNetworkSetAttribute` + """ + with nogil: + status = cutensornetNetworkSetAttribute( + <_Handle>handle, <_NetworkDescriptor>tn_desc, + <_NetworkAttribute>attr, buf, size) + check_status(status) + + cpdef tuple get_output_tensor_details(intptr_t handle, intptr_t tn_desc): """Get the output tensor's metadata. @@ -875,6 +1007,25 @@ cpdef tuple workspace_get_memory( return (workspace_ptr, workspace_size) +cpdef workspace_purge_cache( + intptr_t handle, intptr_t workspace, int mem_space): + """Purge the cached data in the specified memory space. + + Args: + handle (intptr_t): The library handle. + workspace (intptr_t): The workspace descriptor. + mem_space (Memspace): The memory space whose cached data is + purged. + + .. seealso:: `cutensornetWorkspacePurgeCache` + """ + with nogil: + status = cutensornetWorkspacePurgeCache( + <_Handle>handle, <_WorkspaceDescriptor>workspace, + <_Memspace>mem_space) + check_status(status) + + cpdef intptr_t create_contraction_optimizer_info( intptr_t handle, intptr_t tn_desc) except*: """Create a contraction optimizer info object.
@@ -995,6 +1146,34 @@ slicing_config_dtype = _numpy.dtype( }, align=True ) +gesvdj_params_dtype = _numpy.dtype( + {'names': ('tol','max_sweeps'), + 'formats': (_numpy.float64, _numpy.int32), + 'itemsize': sizeof(_GesvdjParams), + }, align=True +) + +gesvdr_params_dtype = _numpy.dtype( + {'names': ('oversampling','niters'), + 'formats': (_numpy.int64, _numpy.int64), + 'itemsize': sizeof(_GesvdrParams), + }, align=True +) + +gesvdj_status_dtype = _numpy.dtype( + {'names': ('residual', 'sweeps'), + 'formats': (_numpy.float64, _numpy.int32), + 'itemsize': sizeof(_GesvdjStatus), + }, align=True +) + +gesvdp_status_dtype = _numpy.dtype( + {'names': ('err_sigma', ), + 'formats': (_numpy.float64, ), + 'itemsize': sizeof(_GesvdpStatus), + }, align=True +) + cdef dict contract_opti_info_sizes = { CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES: _numpy.int64, CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICED_MODES: _numpy.int32, @@ -1220,6 +1399,8 @@ cdef dict contract_opti_cfg_sizes = { CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SIMPLIFICATION_DISABLE_DR: _numpy.int32, CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED: _numpy.int32, CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_COST_FUNCTION_OBJECTIVE: _numpy.int32, # = sizeof(enum value) + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_CACHE_REUSE_NRUNS: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SMART_OPTION: _numpy.int32, # = sizeof(enum value) } cpdef contraction_optimizer_config_get_attribute_dtype(int attr): @@ -1245,6 +1426,9 @@ cpdef contraction_optimizer_config_get_attribute_dtype(int attr): elif attr == CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_COST_FUNCTION_OBJECTIVE: if _numpy.dtype(dtype).itemsize != sizeof(_OptimizerCost): warnings.warn("binary size may be incompatible") + elif attr == CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SMART_OPTION: + if _numpy.dtype(dtype).itemsize != sizeof(_SmartOption): + warnings.warn("binary size may be incompatible") return dtype 
########################################################################### @@ -2028,13 +2212,22 @@ cdef dict tensor_svd_cfg_sizes = { CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF: _numpy.float64, CUTENSORNET_TENSOR_SVD_CONFIG_S_NORMALIZATION: _numpy.int32, # = sizeof(enum value) CUTENSORNET_TENSOR_SVD_CONFIG_S_PARTITION: _numpy.int32, # = sizeof(enum value) + CUTENSORNET_TENSOR_SVD_CONFIG_ALGO: _numpy.int32, # = sizeof(enum value) +} + +cdef dict svd_algo_params_sizes = { + CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ: gesvdj_params_dtype, + CUTENSORNET_TENSOR_SVD_ALGO_GESVDR: gesvdr_params_dtype } cpdef tensor_svd_config_get_attribute_dtype(int attr): """Get the Python data type of the corresponding tensor SVD config attribute. Args: - attr (TensorSVDConfigAttribute): The attribute to query. + attr (TensorSVDConfigAttribute): The attribute to query. + The enum CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS is not supported, + the dtype of which can be queried by :func:`tensor_svd_algo_params_get_dtype`. + Returns: The data type of the queried attribute. @@ -2043,6 +2236,8 @@ cpdef tensor_svd_config_get_attribute_dtype(int attr): allocating memory for :func:`tensor_svd_config_get_attribute` and :func:`tensor_svd_config_set_attribute`. 
""" + if attr == CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS: + raise ValueError("For CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS, use `tensor_svd_algo_params_get_dtype` to get the dtype") dtype = tensor_svd_cfg_sizes[attr] if attr == CUTENSORNET_TENSOR_SVD_CONFIG_S_NORMALIZATION: if _numpy.dtype(dtype).itemsize != sizeof(_TensorSVDNormalization): @@ -2050,8 +2245,28 @@ cpdef tensor_svd_config_get_attribute_dtype(int attr): elif attr == CUTENSORNET_TENSOR_SVD_CONFIG_S_PARTITION: if _numpy.dtype(dtype).itemsize != sizeof(_TensorSVDPartition): warnings.warn("binary size may be incompatible") + elif attr == CUTENSORNET_TENSOR_SVD_CONFIG_ALGO: + if _numpy.dtype(dtype).itemsize != sizeof(_TensorSVDAlgo): + warnings.warn("binary size may be incompatible") return dtype +cpdef tensor_svd_algo_params_get_dtype(int svd_algo): + """Get the Python data type of the corresponding tensor SVD parameters attribute. + + Args: + svd_algo (TensorSVDAlgo): The SVD algorithm to query. + + Returns: + The data type of algorithm parameters for the queried SVD algorithm. The returned dtype is always + a valid NumPy dtype object. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for `CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS`. 
+ """ + if svd_algo not in svd_algo_params_sizes: + raise ValueError(f"Algorithm {svd_algo} does not support tunable parameters.") + return svd_algo_params_sizes[svd_algo] + ########################################################################### @@ -2145,6 +2360,12 @@ cdef dict tensor_svd_info_sizes = { CUTENSORNET_TENSOR_SVD_INFO_FULL_EXTENT: _numpy.int64, CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT: _numpy.int64, CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT: _numpy.float64, + CUTENSORNET_TENSOR_SVD_INFO_ALGO: _numpy.int32, # = sizeof(enum value) +} + +cdef dict svd_algo_status_sizes = { + CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ: gesvdj_status_dtype, + CUTENSORNET_TENSOR_SVD_ALGO_GESVDP: gesvdp_status_dtype } cpdef tensor_svd_info_get_attribute_dtype(int attr): @@ -2152,6 +2373,8 @@ cpdef tensor_svd_info_get_attribute_dtype(int attr): Args: attr (TensorSVDInfoAttribute): The attribute to query. + The enum CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS is not supported, + the dtype of which can be queried by :func:`tensor_svd_algo_status_get_dtype`. Returns: The data type of the queried attribute. The returned dtype is always @@ -2161,8 +2384,27 @@ cpdef tensor_svd_info_get_attribute_dtype(int attr): allocating memory for :func:`tensor_svd_info_get_attribute`. """ + if attr == CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS: + raise ValueError("For CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS, use `tensor_svd_algo_status_get_dtype` to get the dtype") return tensor_svd_info_sizes[attr] +cpdef tensor_svd_algo_status_get_dtype(int svd_algo): + """Get the Python data type of the corresponding tensor SVD status attribute. + + Args: + svd_algo (TensorSVDAlgo): The SVD algorithm to query. + + Returns: + The data type of algorithm status for the queried SVD algorithm. The returned dtype is always + a valid NumPy dtype object. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for `CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS`. 
+ """ + if svd_algo not in svd_algo_status_sizes: + raise ValueError(f"Algorithm {svd_algo} does not support tunable parameters.") + return svd_algo_status_sizes[svd_algo] + ########################################################################### @@ -2482,6 +2724,557 @@ cpdef distributed_synchronize(intptr_t handle): check_status(status) +cpdef compute_gradients_backward( + intptr_t handle, intptr_t plan, + raw_data_in, intptr_t output_gradient, gradients, bint accumulate_output, + intptr_t workspace, intptr_t stream): + """Compute the gradients of the network w.r.t. the input tensors whose + gradients are required. + + The input tensors should form a tensor network that is prescribed by the + tensor network descriptor that was used to create the contraction plan. + + .. warning:: + + This function is experimental and is subject to change in future + releases. + + Args: + handle (intptr_t): The library handle. + plan (intptr_t): The contraction plan handle. + raw_data_in: A host array of pointer addresses (as Python :class:`int`) for + each input tensor (on device). It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + output_gradient (intptr_t): The pointer address (as Python :class:`int`) + to the gradient w.r.t. the output tensor (on device). + gradients: A host array of pointer addresses (as Python :class:`int`) for + each gradient tensor (on device). It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + accumulate_output (bool): Whether to accumulate the data in + ``gradients``. + workspace (intptr_t): The workspace descriptor. + stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + :class:`int`). + + .. 
seealso:: `cutensornetComputeGradientsBackward` + """ + warnings.warn("compute_gradients_backward() is an experimental API and " + "subject to future changes", stacklevel=2) + + # raw_data_in can be a pointer address, or a Python sequence + cdef vector[intptr_t] rawDataInData + cdef void** rawDataInPtr + if cpython.PySequence_Check(raw_data_in): + rawDataInData = raw_data_in + rawDataInPtr = (rawDataInData.data()) + else: # a pointer address + rawDataInPtr = raw_data_in + + # gradients can be a pointer address, or a Python sequence + cdef vector[intptr_t] gradientsData + cdef void** gradientsPtr + if cpython.PySequence_Check(gradients): + gradientsData = gradients + gradientsPtr = (gradientsData.data()) + else: # a pointer address + gradientsPtr = gradients + + with nogil: + status = cutensornetComputeGradientsBackward( + <_Handle>handle, <_ContractionPlan>plan, + rawDataInPtr, output_gradient, gradientsPtr, + accumulate_output, + <_WorkspaceDescriptor>workspace, stream) + check_status(status) + + +cpdef intptr_t create_state( + intptr_t handle, + int purity, int32_t n_state_modes, + state_mode_extents, int data_type) except*: + """Create a tensor network state. + + Args: + handle (intptr_t): The library handle. + purity (cuquantum.cutensornet.StatePurity): The tensor network state purity. + n_state_modes (int32_t): The number of modes of the tensor network states. + state_mode_extents: A host array of extents for each state mode. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + data_type (cuquantum.cudaDataType): The data type of the tensor network state. + + Returns: + intptr_t: An opaque tensor network state handle (as Python :class:`int`). + + .. 
seealso:: `cutensornetCreateState` + """ + # state_mode_extents can be a pointer address, or a Python sequence + cdef vector[int64_t] stateModesExtentsData + cdef int64_t* stateModesExtentsPtr + if cpython.PySequence_Check(state_mode_extents): + if len(state_mode_extents) != n_state_modes: + raise ValueError("size of state_mode_extents not matching n_state_modes") + stateModesExtentsData = state_mode_extents + stateModesExtentsPtr = stateModesExtentsData.data() + else: # a pointer address + stateModesExtentsPtr = state_mode_extents + + cdef _State state + with nogil: + status = cutensornetCreateState( + <_Handle>handle, <_StatePurity>purity, n_state_modes, + stateModesExtentsPtr, data_type, &state) + check_status(status) + return state + + +cpdef destroy_state(intptr_t state): + """Destroy a tensor network state. + + Args: + state (intptr_t): The tensor network state. + + .. seealso:: `cutensornetDestroyState` + """ + with nogil: + status = cutensornetDestroyState(<_State>state) + check_status(status) + + +cpdef int64_t state_apply_tensor( + intptr_t handle, intptr_t state, int32_t n_state_modes, + state_modes, intptr_t tensor_data, tensor_mode_strides, + int32_t immutable, int32_t adjoint, int32_t unitary): + """Apply a tensor operator to the tensor network state. + + Args: + handle (intptr_t): The library handle. + state (intptr_t): The tensor network state. + n_state_modes (int32_t): The number of state modes that the tensor applies on. + state_modes: A host array of modes to specify where the tensor is applied to. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + tensor_data (intptr_t): The tensor data. + tensor_mode_strides: A host array of strides for each mode. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + immutable (int32_t): Whether the tensor is immutable + adjoint (int32_t): Whether the tensor should be considered as adjoint. 
+ unitary (int32_t): Whether the tensor represents a unitary operation. + + Returns: + tensor_id (int64_t): The id that is assigned to the tensor. + + .. seealso:: `cutensornetStateApplyTensor` + """ + # state_modes can be a pointer address, or a Python sequence + cdef int64_t tensor_id + cdef vector[int32_t] stateModesData + cdef int32_t* stateModesPtr + if cpython.PySequence_Check(state_modes): + if len(state_modes) != n_state_modes: + raise ValueError("size of state_modes not matching n_state_modes") + stateModesData = state_modes + stateModesPtr = stateModesData.data() + else: # a pointer address + stateModesPtr = state_modes + + # tensor_mode_strides can be a pointer address, or a Python sequence + cdef vector[int64_t] tensorModesStridesData + cdef int64_t* tensorModesStridesPtr + if cpython.PySequence_Check(tensor_mode_strides): + tensorModesStridesData = tensor_mode_strides + tensorModesStridesPtr = tensorModesStridesData.data() + else: # a pointer address + tensorModesStridesPtr = tensor_mode_strides + + with nogil: + status = cutensornetStateApplyTensor( + <_Handle>handle, <_State>state, n_state_modes, stateModesPtr, tensor_data, + tensorModesStridesPtr, immutable, adjoint, unitary, &tensor_id) + check_status(status) + return tensor_id + + +cpdef state_update_tensor( + intptr_t handle, intptr_t state, + int64_t tensor_id, intptr_t tensor_data, int32_t unitary): + """Update a tensor operand that has been applied to the tensor network state. + + Args: + handle (intptr_t): The library handle. + state (intptr_t): The tensor network state. + tensor_id (int64_t): The id that is assigned to the tensor. + tensor_data (intptr_t): The tensor data. + adjoint (int32_t): Whether the tensor should be considered as adjoint. + unitary (int32_t): Whether the tensor represents a unitary operation. + + .. 
seealso:: `cutensornetStateUpdateTensor` + """ + with nogil: + status = cutensornetStateUpdateTensor( + <_Handle>handle, <_State>state, tensor_id, tensor_data, unitary) + check_status(status) + + +cpdef intptr_t create_marginal( + intptr_t handle, intptr_t state, + int32_t n_marginal_modes, marginal_modes, + int32_t n_projected_modes, projected_modes, marginal_tensor_strides) except*: + """Create a representation for the tensor network state marginal distribution. + + Args: + handle (intptr_t): The library handle. + state (intptr_t): The tensor network state. + n_marginal_modes (int32_t): The number of modes for the marginal. + marginal_modes: A host array of modes for the marginal. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + n_projected_modes (int32_t): The number of modes that are projected out for the marginal. + projected_modes: A host array of projected modes for the marginal. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + marginal_tensor_strides: A host array of strides for the marginal modes. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + Returns: + intptr_t: An opaque tensor network state marginal handle (as Python :class:`int`). + + .. 
seealso:: `cutensornetCreateMarginal` + """ + # marginal_modes can be a pointer address, or a Python sequence + cdef vector[int32_t] marginalModesData + cdef int32_t* marginalModesPtr + if cpython.PySequence_Check(marginal_modes): + if len(marginal_modes) != n_marginal_modes: + raise ValueError("size of marginal_modes not matching n_marginal_modes") + marginalModesData = marginal_modes + marginalModesPtr = marginalModesData.data() + else: # a pointer address + marginalModesPtr = marginal_modes + + # projected_modes can be a pointer address, or a Python sequence + cdef vector[int32_t] projectedModesData + cdef int32_t* projectedModesPtr + if cpython.PySequence_Check(projected_modes): + if len(projected_modes) != n_projected_modes: + raise ValueError("size of projected_modes not matching n_projected_modes") + projectedModesData = projected_modes + projectedModesPtr = projectedModesData.data() + else: # a pointer address + projectedModesPtr = projected_modes + + # marginal_tensor_strides can be a pointer address, or a Python sequence + cdef vector[int64_t] marginalTensorStridesData + cdef int64_t* marginalTensorStridesPtr + if cpython.PySequence_Check(marginal_tensor_strides): + marginalTensorStridesData = marginal_tensor_strides + marginalTensorStridesPtr = marginalTensorStridesData.data() + else: # a pointer address + marginalTensorStridesPtr = marginal_tensor_strides + + cdef _StateMarginal marginal + with nogil: + status = cutensornetCreateMarginal( + <_Handle>handle, <_State>state, + n_marginal_modes, marginalModesPtr, + n_projected_modes, projectedModesPtr, + marginalTensorStridesPtr, &marginal) + check_status(status) + return marginal + + +cdef dict marginal_attribute_sizes = { + CUTENSORNET_MARGINAL_OPT_NUM_HYPER_SAMPLES: _numpy.int64 +} + + +cpdef marginal_get_attribute_dtype(int attr): + """Get the Python data type of the corresponding marginal attribute. + + Args: + attr (MarginalAttribute): The attribute to query. 
+ + Returns: + The data type of the queried attribute. The returned dtype is always + a valid NumPy dtype object. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for :func:`marginal_configure`. + """ + return marginal_attribute_sizes[attr] + + +cpdef marginal_configure(intptr_t handle, intptr_t marginal, int attr, intptr_t buf, size_t size): + """Configures computation of the tensor network state marginal distribution. + + Args: + handle (intptr_t): The library handle. + marginal (intptr_t): The tensor network marginal computation handle. + attr (MarginalAttribute): The attribute to configure. + buf (intptr_t): The pointer address (as Python :class:`int`) to the + attribute data. + size (size_t): The size of ``buf`` (in bytes). + + .. note:: To compute ``size``, use the itemsize of the corresponding data + type, which can be queried using :func:`marginal_get_attribute_dtype`. + + .. seealso:: `cutensornetMarginalConfigure` + """ + with nogil: + status = cutensornetMarginalConfigure( + <_Handle>handle, <_StateMarginal>marginal, + <_MarginalAttribute>attr, + buf, size) + check_status(status) + + +cpdef marginal_prepare( + intptr_t handle, intptr_t marginal, + size_t max_workspace_size_device, intptr_t workspace, intptr_t stream): + """Prepares computation of the tensor network state marginal distribution. + + Args: + handle (intptr_t): The library handle. + marginal (intptr_t): The tensor network marginal computation handle. + max_workspace_size_device (size_t): The maximal device workspace size (in bytes) allowed + for the marginal computation. + workspace (intptr_t): The workspace descriptor. + stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + :class:`int`). + + .. 
seealso:: `cutensornetMarginalPrepare` + """ + with nogil: + status = cutensornetMarginalPrepare( + <_Handle>handle, <_StateMarginal>marginal, + max_workspace_size_device, <_WorkspaceDescriptor>workspace, stream) + check_status(status) + + +cpdef marginal_compute( + intptr_t handle, intptr_t marginal, projected_mode_values, + intptr_t workspace, intptr_t marginal_tensor, intptr_t stream): + """Computes the tensor network state marginal distribution. + + Args: + handle (intptr_t): The library handle. + marginal (intptr_t): The tensor network marginal computation handle. + projected_mode_values: A host array of values for the projected modes. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + workspace (intptr_t): The workspace descriptor. + marginal_tensor (intptr_t): The pointer address (as Python :class:`int`) for storing + the computed marginals. + stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + :class:`int`). + + .. seealso:: `cutensornetMarginalCompute` + """ + # projected_mode_values can be a pointer address, or a Python sequence + cdef vector[int64_t] projectedModeValuesData + cdef int64_t* projectedModeValuesPtr + if cpython.PySequence_Check(projected_mode_values): + projectedModeValuesData = projected_mode_values + projectedModeValuesPtr = projectedModeValuesData.data() + else: # a pointer address + projectedModeValuesPtr = projected_mode_values + + with nogil: + status = cutensornetMarginalCompute( + <_Handle>handle, <_StateMarginal>marginal, + projectedModeValuesPtr, <_WorkspaceDescriptor>workspace, + marginal_tensor, stream) + check_status(status) + + +cpdef destroy_marginal(intptr_t marginal): + """Destroy a tensor network marginal representation. + + Args: + marginal (intptr_t): The tensor network marginal distribution. + + .. 
seealso:: `cutensornetDestroyMarginal` + """ + with nogil: + status = cutensornetDestroyMarginal(<_StateMarginal>marginal) + check_status(status) + + +cpdef intptr_t create_sampler( + intptr_t handle, intptr_t state, + int32_t n_modes_to_sample, modes_to_sample) except*: + """Creates a tensor network state sampler. + + Args: + handle (intptr_t): The library handle. + state (intptr_t): The tensor network state. + n_modes_to_sample (int32_t): The number of modes to sample for the sampler. + modes_to_sample: A host array of modes for the sampler. It can be + + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` + + Returns: + intptr_t: An opaque tensor network state sampler handle (as Python :class:`int`). + + .. seealso:: `cutensornetCreateSampler` + """ + # modes_to_sample can be a pointer address, or a Python sequence + cdef vector[int32_t] modesData + cdef int32_t* modesPtr + if cpython.PySequence_Check(modes_to_sample): + if len(modes_to_sample) != n_modes_to_sample: + raise ValueError("size of modes_to_sample not matching n_modes_to_sample") + modesData = modes_to_sample + modesPtr = modesData.data() + else: # a pointer address + modesPtr = modes_to_sample + + cdef _StateSampler sampler + with nogil: + status = cutensornetCreateSampler( + <_Handle>handle, <_State>state, + n_modes_to_sample, modesPtr, &sampler) + check_status(status) + return sampler + + +cdef dict sampler_attribute_sizes = { + CUTENSORNET_SAMPLER_OPT_NUM_HYPER_SAMPLES: _numpy.int64 +} + + +cpdef sampler_get_attribute_dtype(int attr): + """Get the Python data type of the corresponding sampler attribute. + + Args: + attr (SamplerAttribute): The attribute to query. + + Returns: + The data type of the queried attribute. The returned dtype is always + a valid NumPy dtype object. + + .. note:: This API has no C counterpart and is a convenient helper for + allocating memory for :func:`sampler_configure`. 
+ """ + return sampler_attribute_sizes[attr] + + +cpdef sampler_configure( + intptr_t handle, intptr_t sampler, int attr, intptr_t buf, size_t size): + """Configures the tensor network state sampler. + + Args: + handle (intptr_t): The library handle. + sampler (intptr_t): The tensor network sampler handle. + attr (SamplerAttribute): The attribute to configure. + buf (intptr_t): The pointer address (as Python :class:`int`) to the + attribute data. + size (size_t): The size of ``buf`` (in bytes). + + .. note:: To compute ``size``, use the itemsize of the corresponding data + type, which can be queried using :func:`sampler_get_attribute_dtype`. + + .. seealso:: `cutensornetSamplerConfigure` + """ + with nogil: + status = cutensornetSamplerConfigure( + <_Handle>handle, <_StateSampler>sampler, + <_SamplerAttribute>attr, + buf, size) + check_status(status) + + +cpdef sampler_prepare( + intptr_t handle, intptr_t sampler, size_t max_workspace_size_device, intptr_t workspace, intptr_t stream): + """Prepares the tensor network state sampler. + + Args: + handle (intptr_t): The library handle. + sampler (intptr_t): The tensor network sampler. + max_workspace_size_device (size_t): The maximal device workspace size (in bytes) allowed + for the sampling computation. + workspace (intptr_t): The workspace descriptor. + stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + :class:`int`). + + .. seealso:: `cutensornetSamplerPrepare` + """ + with nogil: + status = cutensornetSamplerPrepare( + <_Handle>handle, <_StateSampler>sampler, + max_workspace_size_device, <_WorkspaceDescriptor>workspace, stream) + check_status(status) + + +cpdef sampler_sample( + intptr_t handle, intptr_t sampler, int64_t n_shots, + intptr_t workspace, intptr_t samples, intptr_t stream): + """Performs sampling of the tensor network state. + + Args: + handle (intptr_t): The library handle. + sampler (intptr_t): The tensor network sampler. 
+ n_shots (int64_t): The number of shots. + workspace (intptr_t): The workspace descriptor. + samples (intptr_t): The pointer address (as Python :class:`int`) for storing + the computed samples. + stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python + :class:`int`). + + .. seealso:: `cutensornetSamplerSample` + """ + with nogil: + status = cutensornetSamplerSample( + <_Handle>handle, <_StateSampler>sampler, n_shots, + <_WorkspaceDescriptor>workspace, + samples, stream) + check_status(status) + + +cpdef destroy_sampler(intptr_t sampler): + """Destroy a tensor network state sampler. + + Args: + sampler (intptr_t): The tensor network state sampler. + + .. seealso:: `cutensornetDestroySampler` + """ + with nogil: + status = cutensornetDestroySampler(<_StateSampler>sampler) + check_status(status) + + +class NetworkAttribute(IntEnum): + """See `cutensornetNetworkAttributes_t`.""" + INPUT_TENSORS_NUM_CONSTANT = CUTENSORNET_NETWORK_INPUT_TENSORS_NUM_CONSTANT + INPUT_TENSORS_CONSTANT = CUTENSORNET_NETWORK_INPUT_TENSORS_CONSTANT + INPUT_TENSORS_NUM_CONJUGATED = CUTENSORNET_NETWORK_INPUT_TENSORS_NUM_CONJUGATED + INPUT_TENSORS_CONJUGATED = CUTENSORNET_NETWORK_INPUT_TENSORS_CONJUGATED + INPUT_TENSORS_NUM_REQUIRE_GRAD = CUTENSORNET_NETWORK_INPUT_TENSORS_NUM_REQUIRE_GRAD + INPUT_TENSORS_REQUIRE_GRAD = CUTENSORNET_NETWORK_INPUT_TENSORS_REQUIRE_GRAD + class GraphAlgo(IntEnum): """See `cutensornetGraphAlgo_t`.""" RB = CUTENSORNET_GRAPH_ALGO_RB @@ -2518,6 +3311,8 @@ class ContractionOptimizerConfigAttribute(IntEnum): SIMPLIFICATION_DISABLE_DR = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SIMPLIFICATION_DISABLE_DR SEED = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED COST_FUNCTION_OBJECTIVE = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_COST_FUNCTION_OBJECTIVE + CACHE_REUSE_NRUNS = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_CACHE_REUSE_NRUNS + SMART_OPTION = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SMART_OPTION class ContractionOptimizerInfoAttribute(IntEnum): """See 
`cutensornetContractionOptimizerInfoAttributes_t`.""" @@ -2563,6 +3358,8 @@ class TensorSVDConfigAttribute(IntEnum): REL_CUTOFF = CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF S_NORMALIZATION = CUTENSORNET_TENSOR_SVD_CONFIG_S_NORMALIZATION S_PARTITION = CUTENSORNET_TENSOR_SVD_CONFIG_S_PARTITION + ALGO = CUTENSORNET_TENSOR_SVD_CONFIG_ALGO + ALGO_PARAMS = CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS class TensorSVDNormalization(IntEnum): """See `cutensornetTensorSVDNormalization_t`.""" @@ -2583,12 +3380,33 @@ class TensorSVDInfoAttribute(IntEnum): FULL_EXTENT = CUTENSORNET_TENSOR_SVD_INFO_FULL_EXTENT REDUCED_EXTENT = CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT DISCARDED_WEIGHT = CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT + ALGO = CUTENSORNET_TENSOR_SVD_INFO_ALGO + ALGO_STATUS = CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS + +class TensorSVDAlgo(IntEnum): + """See `cutensornetTensorSVDAlgo_t`.""" + GESVD = CUTENSORNET_TENSOR_SVD_ALGO_GESVD + GESVDJ = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ + GESVDP = CUTENSORNET_TENSOR_SVD_ALGO_GESVDP + GESVDR = CUTENSORNET_TENSOR_SVD_ALGO_GESVDR class GateSplitAlgo(IntEnum): """See `cutensornetGateSplitAlgo_t`.""" DIRECT = CUTENSORNET_GATE_SPLIT_ALGO_DIRECT REDUCED = CUTENSORNET_GATE_SPLIT_ALGO_REDUCED +class StatePurity(IntEnum): + """See `cutensornetStatePurity_t`.""" + PURE = CUTENSORNET_STATE_PURITY_PURE + +class MarginalAttribute(IntEnum): + """See `cutensornetMarginalAttributes_t`.""" + OPT_NUM_HYPER_SAMPLES = CUTENSORNET_MARGINAL_OPT_NUM_HYPER_SAMPLES + +class SamplerAttribute(IntEnum): + """See `cutensornetSamplerAttributes_t`.""" + OPT_NUM_HYPER_SAMPLES = CUTENSORNET_SAMPLER_OPT_NUM_HYPER_SAMPLES + del IntEnum @@ -2600,8 +3418,8 @@ VERSION = CUTENSORNET_VERSION # numpy dtypes tensor_qualifiers_dtype = _numpy.dtype( - {'names':('is_conjugate', 'is_constant', ), - 'formats': (_numpy.int32, _numpy.int32, ), + {'names':('is_conjugate', 'is_constant', 'requires_gradient'), + 'formats': (_numpy.int32, _numpy.int32, _numpy.int32, ), 'itemsize': 
sizeof(_TensorQualifiers), }, align=True ) diff --git a/python/cuquantum/cutensornet/experimental/tensor_network.py b/python/cuquantum/cutensornet/experimental/tensor_network.py index 14a1ef0..9c8c450 100644 --- a/python/cuquantum/cutensornet/experimental/tensor_network.py +++ b/python/cuquantum/cutensornet/experimental/tensor_network.py @@ -81,19 +81,22 @@ def _gate_split(wrapped_operands, inputs, outputs, size_dict, max_mid_extent, al gate_algorithm, svd_config, options.compute_type, workspace_desc) # Allocate and set workspace - workspace_ptr = decomposition_utils.allocate_and_set_workspace(handle, options.allocator, workspace_desc, - cutn.WorksizePref.MIN, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, options.device_id, + workspaces = dict() + for mem_space in (cutn.Memspace.DEVICE, cutn.Memspace.HOST): + workspaces[mem_space] = decomposition_utils.allocate_and_set_workspace(handle, options.allocator, workspace_desc, + cutn.WorksizePref.MIN, mem_space, cutn.WorkspaceKind.SCRATCH, options.device_id, stream, stream_ctx, options.logger, task_name='contract decomposition') options.logger.info("Starting contract-decompose (gate split)...") timing = bool(options.logger and options.logger.handlers) - if options.blocking: + blocking = options.blocking is True or operands_location == 'cpu' + if blocking: options.logger.info("This call is blocking and will return only after the operation is complete.") else: options.logger.info("This call is non-blocking and will return immediately after the operation is launched on the device.") svd_info = cutn.create_tensor_svd_info(handle) - with utils.device_ctx(options.device_id), utils.cuda_call_ctx(stream, options.blocking, timing) as (last_compute_event, elapsed): + with utils.device_ctx(options.device_id), utils.cuda_call_ctx(stream, blocking, timing) as (last_compute_event, elapsed): cutn.gate_split(handle, input_tensor_descriptors[0], wrapped_operands[0].data_ptr, input_tensor_descriptors[1], wrapped_operands[1].data_ptr, 
@@ -122,6 +125,9 @@ def _gate_split(wrapped_operands, inputs, outputs, size_dict, max_mid_extent, al if reduced_extent != mid_extent: s.tensor = s.tensor[:reduced_extent] finally: + # when host workspace is allocated, synchronize stream before return + if workspaces[cutn.Memspace.HOST] is not None: + stream.synchronize() # Free resources decomposition_utils._destroy_tensor_descriptors(input_tensor_descriptors) decomposition_utils._destroy_tensor_descriptors(output_tensor_descriptors) diff --git a/python/cuquantum/cutensornet/tensor.py b/python/cuquantum/cutensornet/tensor.py index 71de016..52f13d3 100644 --- a/python/cuquantum/cutensornet/tensor.py +++ b/python/cuquantum/cutensornet/tensor.py @@ -13,6 +13,8 @@ import re from typing import Optional +import numpy + from . import cutensornet as cutn from .configuration import NetworkOptions from ._internal import decomposition_utils @@ -22,6 +24,7 @@ DecompositionOptions = dataclasses.make_dataclass("DecompositionOptions", fields=[(field.name, field.type, field) for field in dataclasses.fields(NetworkOptions)], bases=(NetworkOptions,)) DecompositionOptions.__doc__ = re.sub(":class:`cuquantum.Network` object", ":func:`cuquantum.cutensornet.tensor.decompose` and :func:`cuquantum.cutensornet.experimental.contract_decompose` functions", NetworkOptions.__doc__) + def decompose( subscripts, operand, @@ -252,7 +255,6 @@ def decompose( # Create workspace descriptor workspace_desc = cutn.create_workspace_descriptor(handle) - workspace_ptr = None # Compute required workspace size if isinstance(method, QRMethod): @@ -268,21 +270,24 @@ def decompose( ValueError("method must be either a QRMethod/SVDMethod object or a dict that can be used to construct QRMethod/SVDMethod") # Allocate and set workspace - workspace_ptr = decomposition_utils.allocate_and_set_workspace(handle, options.allocator, workspace_desc, - cutn.WorksizePref.MIN, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, options.device_id, - stream, stream_ctx, 
options.logger, task_name='tensor decomposition') + workspaces = dict() + for mem_space in (cutn.Memspace.DEVICE, cutn.Memspace.HOST): + workspaces[mem_space] = decomposition_utils.allocate_and_set_workspace(handle, options.allocator, workspace_desc, + cutn.WorksizePref.MIN, mem_space, cutn.WorkspaceKind.SCRATCH, options.device_id, + stream, stream_ctx, options.logger, task_name='tensor decomposition') svd_info_obj = None # Perform QR/SVD computation logger.info("Starting tensor decomposition...") - if options.blocking: + blocking = options.blocking is True or operands_location == 'cpu' + if blocking: logger.info("This call is blocking and will return only after the operation is complete.") else: logger.info("This call is non-blocking and will return immediately after the operation is launched on the device.") timing = bool(logger and logger.handlers) if isinstance(method, QRMethod): - with utils.device_ctx(options.device_id), utils.cuda_call_ctx(stream, options.blocking, timing) as (last_compute_event, elapsed): + with utils.device_ctx(options.device_id), utils.cuda_call_ctx(stream, blocking, timing) as (last_compute_event, elapsed): cutn.tensor_qr(handle, *input_descriptors, wrapped_operands[0].data_ptr, output_descriptors[0], output_operands[0].data_ptr, @@ -293,7 +298,7 @@ def decompose( logger.info(f"The QR decomposition took {elapsed.data:.3f} ms to complete.") elif isinstance(method, SVDMethod): svd_info = cutn.create_tensor_svd_info(handle) - with utils.device_ctx(options.device_id), utils.cuda_call_ctx(stream, options.blocking, timing) as (last_compute_event, elapsed): + with utils.device_ctx(options.device_id), utils.cuda_call_ctx(stream, blocking, timing) as (last_compute_event, elapsed): cutn.tensor_svd(handle, *input_descriptors, wrapped_operands[0].data_ptr, output_descriptors[0], output_operands[0].data_ptr, @@ -312,6 +317,9 @@ def decompose( if s is not None and reduced_extent != mid_extent: s.tensor = s.tensor[:reduced_extent] finally: + # when 
host workspace is allocated, synchronize stream before return + if workspaces[cutn.Memspace.HOST] is not None: + stream.synchronize() # Free resources if svd_config is not None: cutn.destroy_tensor_svd_config(svd_config) @@ -355,14 +363,36 @@ class SVDInfo: full_extent: The total number of singular values after matricization (before truncation). reduced_extent: The number of remaining singular values after truncation. discarded_weight: The discarded weight for the truncation. + algorithm: The algorithm used in the SVD execution. + gesvdj_residual: The residual for full gesvdj execution. + gesvdj_sweeps: The number of iterations used in the gesvdj execution. + gesvdp_err_sigma: The error sigma in the gesvdp execution. + + .. note:: + + When the SVD algorithm is set to ``"gesvdr"`` with fixed extent truncation enabled in :class:`cuquantum.cutensornet.tensor.SVDMethod`, + the discarded weight will not be computed. """ reduced_extent: int full_extent: int discarded_weight: float + algorithm: str + gesvdj_residual: Optional[float] = None + gesvdj_sweeps: Optional[int] = None + gesvdp_err_sigma: Optional[float] = None def __str__(self): + svd_details = f"Algorithm = {self.algorithm}" + if self.gesvdj_residual is not None: + svd_details += f", residual= {self.gesvdj_residual}" + if self.gesvdj_sweeps is not None: + svd_details += f", sweeps = {self.gesvdj_sweeps}" + if self.gesvdp_err_sigma is not None: + svd_details += f", sigma error = {self.gesvdp_err_sigma}" + s = f"""SVD Information at Runtime: + {svd_details} Total number of singular values after matricization = {self.full_extent} Number of singular values after truncation = {self.reduced_extent} Discarded weight for the truncation = {self.discarded_weight}""" @@ -384,6 +414,16 @@ class SVDMethod: :func:`cuquantum.cutensornet.experimental.contract_decompose` will be `None`. normalization: The specified norm of the singular values (after truncation) will be normalized to 1. 
Currently supports ``None``, ``"L1"``, ``"L2"`` and ``"LInf"``. + algorithm: The SVD algorithm to use. Currently supports ``"gesvd"`` (default), ``"gesvdj"``, ``"gesvdp"`` and ``"gesvdr"``. + gesvdj_tol: The tolerance to use when ``algorithm`` is set to ``"gesvdj"``. Default 0 denotes machine precision. + gesvdj_max_sweeps: The maximal number of sweeps when ``algorithm`` is set to ``"gesvdj"``. Default 0 denotes 100. + gesvdr_oversampling: The size of oversampling when ``algorithm`` is set to ``"gesvdr"``. Default 0 denotes the lower of 4 times ``max_extent`` and the difference between full rank and ``max_extent``. + gesvdr_niters: The number of iteration of power method when ``algorithm`` is set to ``"gesvdr"`` and the default (0) is 10. + + .. note:: + + For detailed explanation on the different SVD algorithms and the corresponding parameters, + please refer to `cuSolver documentation page `_ .. note:: @@ -396,10 +436,26 @@ class SVDMethod: rel_cutoff: Optional[float] = 0.0 partition: Optional[str] = None normalization: Optional[str] = None + algorithm: Optional[str] = 'gesvd' + gesvdj_tol: Optional[float] = 0 + gesvdj_max_sweeps: Optional[int] = 0 + gesvdr_oversampling: Optional[int] = 0 + gesvdr_niters: Optional[int] = 0 def __str__(self): + svd_details = f"Algorithm = {self.algorithm}" + if self.gesvdj_tol is not None: + svd_details += f", tolerance = {self.gesvdj_tol}" + if self.gesvdj_max_sweeps is not None: + svd_details += f", max sweeps = {self.gesvdj_max_sweeps}" + if self.gesvdr_oversampling is not None: + svd_details += f", oversampling = {self.gesvdr_oversampling}" + if self.gesvdr_niters is not None: + svd_details += f", niters = {self.gesvdr_niters}" + s = f"""SVD Method: + {svd_details} Maxmial number of singular values = {self.max_extent} Absolute value cutoff = {self.abs_cutoff} Relative value cutoff = {self.rel_cutoff} @@ -407,3 +463,28 @@ def __str__(self): Singular values normalization = {self.normalization}""" return s + + def 
__post_init__(self): + if self.algorithm not in ('gesvd', 'gesvdj', 'gesvdr', 'gesvdp'): + raise ValueError(f"SVD algorithm {self.algorithm} not supported; currently supports gesvd, gesvdj, gesvdr, gesvdp") + + if (self.gesvdj_tol !=0 or self.gesvdj_max_sweeps !=0) and self.algorithm != 'gesvdj': + raise ValueError(f"gesvdj_tol and gesvdj_max_sweeps can only be set when algorithm is set to gesvdj, found algorithm {self.algorithm}") + + if (self.gesvdr_oversampling !=0 or self.gesvdr_niters !=0) and self.algorithm != 'gesvdr': + raise ValueError(f"gesvdr_oversampling and gesvdr_niters can only be set when algorithm is set to gesvdr, found algorithm {self.algorithm}") + + def _get_algo_params(self): + initialized = False + if self.algorithm in ('gesvdj', 'gesvdr'): + dtype = cutn.tensor_svd_algo_params_get_dtype(decomposition_utils.SVD_ALGORITHM_MAP[self.algorithm]) + algo_params = numpy.zeros(1, dtype=dtype) + for name in dtype.names: + value = getattr(self, f'{self.algorithm}_{name}') + if value != 0: + algo_params[name] = value + initialized = True + if initialized: + return algo_params + else: + return None diff --git a/python/cuquantum/cutensornet/tensor_network.py b/python/cuquantum/cutensornet/tensor_network.py index f27eed1..c6f4528 100644 --- a/python/cuquantum/cutensornet/tensor_network.py +++ b/python/cuquantum/cutensornet/tensor_network.py @@ -532,7 +532,13 @@ def contract_path(self, optimize=None, **kwargs): self._set_optimizer_options(optimize) # Find "optimal" path.
self.logger.info("Finding optimal path as well as sliced modes...") - cutn.contraction_optimize(self.handle, self.network, self.optimizer_config_ptr, self.memory_limit, self.optimizer_info_ptr) + try: + cutn.contraction_optimize( + self.handle, self.network, self.optimizer_config_ptr, self.memory_limit, self.optimizer_info_ptr) + except cutn.cuTensorNetError as e: + if 'INTERRUPTED' in str(e): + raise KeyboardInterrupt from e + raise self.logger.info("Finished finding optimal path as well as sliced modes.") else: self.logger.info("Setting user-provided path...") diff --git a/python/cuquantum/utils.pyx b/python/cuquantum/utils.pyx index 9e0a71b..d3d6bc7 100644 --- a/python/cuquantum/utils.pyx +++ b/python/cuquantum/utils.pyx @@ -145,3 +145,41 @@ class libraryPropertyType(IntEnum): PATCH_LEVEL = 2 del IntEnum + + +# Defined in CPython: +# https://github.com/python/cpython/blob/26bc2cc06128890ac89492eca20e83abe0789c1c/Objects/unicodetype_db.h#L6311-L6349 +cdef int[29] _WHITESPACE_UNICODE_INTS = [ + 0x0009, + 0x000A, + 0x000B, + 0x000C, + 0x000D, + 0x001C, + 0x001D, + 0x001E, + 0x001F, + 0x0020, + 0x0085, + 0x00A0, + 0x1680, + 0x2000, + 0x2001, + 0x2002, + 0x2003, + 0x2004, + 0x2005, + 0x2006, + 0x2007, + 0x2008, + 0x2009, + 0x200A, + 0x2028, + 0x2029, + 0x202F, + 0x205F, + 0x3000, +] + + +WHITESPACE_UNICODE = ''.join(chr(s) for s in _WHITESPACE_UNICODE_INTS) diff --git a/python/samples/custatevec/batched_abs2sum.py b/python/samples/custatevec/batched_abs2sum.py new file mode 100644 index 0000000..343911e --- /dev/null +++ b/python/samples/custatevec/batched_abs2sum.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import cupy as cp + +from cuquantum import custatevec as cusv +from cuquantum import cudaDataType + + +nSVs = 2 +nIndexBits = 3 +svStride = (1 << nIndexBits) + +# square absolute values of state vector elements for 0/2-th bits will be summed up +# bit ordering 
should only live on host. +bitOrdering = np.asarray([1], dtype=np.int32) +bitStringLen = bitOrdering.size + +# 2 state vectors are allocated contiguously in single memory chunk. +d_svs = cp.asarray([[0.0 + 0.0j, 0.0 + 0.1j, 0.1 + 0.1j, 0.1 + 0.2j, + 0.2 + 0.2j, 0.3 + 0.3j, 0.3 + 0.4j, 0.4 + 0.5j], + [0.25 + 0.25j, 0.25 + 0.25j, 0.25 + 0.25j, 0.25 + 0.25j, + 0.25 + 0.25j, 0.25 + 0.25j, 0.25 + 0.25j, 0.25 + 0.25j]], dtype=cp.complex64) + +abs2sumStride = 2 +batchedAbs2sumSize = nSVs * abs2sumStride + +# abs2sum arrays are allocated contiguously in single memory chunk +# Note: abs2sum can also live on the host. +abs2sum = cp.empty(batchedAbs2sumSize, dtype=cp.float64) +abs2sum_res = cp.asarray([0.27, 0.73, 0.5, 0.5], dtype=cp.float64) + +################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# compute abs2sum arrays +cusv.abs2sum_array_batched( + handle, d_svs.data.ptr, cudaDataType.CUDA_C_32F, nIndexBits, nSVs, svStride, + abs2sum.data.ptr, abs2sumStride, + bitOrdering.ctypes.data, bitStringLen, 0, 0, 0) + +# destroy handle +cusv.destroy(handle) + +# check result +if not cp.allclose(abs2sum_res, abs2sum): + raise ValueError("results mismatch") +print("test passed") diff --git a/python/samples/custatevec/batched_collapse.py b/python/samples/custatevec/batched_collapse.py new file mode 100644 index 0000000..b33fc18 --- /dev/null +++ b/python/samples/custatevec/batched_collapse.py @@ -0,0 +1,68 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import cupy as cp + +from cuquantum import custatevec as cusv +from cuquantum import cudaDataType + + +nSVs = 2 +nIndexBits = 3 +svStride = (1 << nIndexBits) + +# 2 state vectors are allocated contiguously in single memory chunk. 
+d_svs = cp.asarray([[0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], + [0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j]], dtype=cp.complex64) + +d_svs_res = cp.asarray([[0.0+0.0j, 0.0+1.0j, 0.0+0.0j, 0.0+0.0j, + 0.0+0.0j, 0.0+0.0j, 0.0+0.0j, 0.0+0.0j], + [0.0+0.0j, 0.0+0.0j, 0.0+0.0j, 0.0+0.0j, + 0.0+0.0j, 0.0+0.0j, 0.6+0.8j, 0.0+0.0j]], dtype=cp.complex64) + +# 2 bitStrings are allocated contiguously in single memory chunk. +# The 1st SV collapses to |001> and the 2nd to |110> +# Note: bitStrings can also live on the host. +bitStrings = cp.asarray([0b001, 0b110], dtype=cp.int64) + +# bit ordering should only live on host. +bitOrdering = np.asarray([0, 1, 2], dtype=np.int32) +bitStringLen = bitOrdering.size + +# 2 norms are allocated contiguously in single memory chunk. +# Note: norms can also live on the host. +norms = cp.asarray([0.01, 0.25], dtype=cp.float64) + +################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# check the size of external workspace +extraWorkspaceSizeInBytes = cusv.collapse_by_bitstring_batched_get_workspace_size( + handle, nSVs, bitStrings.data.ptr, norms.data.ptr) + +# allocate external workspace if necessary +if extraWorkspaceSizeInBytes > 0: + workspace = cp.cuda.alloc(extraWorkspaceSizeInBytes) + workspace_ptr = workspace.ptr +else: + workspace_ptr = 0 + +# collapse the quantum states to the target bitstrings +cusv.collapse_by_bitstring_batched( + handle, d_svs.data.ptr, cudaDataType.CUDA_C_32F, nIndexBits, nSVs, svStride, + bitStrings.data.ptr, bitOrdering.ctypes.data, bitStringLen, norms.data.ptr, + workspace_ptr, extraWorkspaceSizeInBytes) + +# destroy handle +cusv.destroy(handle) + +# check result +if not cp.allclose(d_svs_res, d_svs): + raise ValueError("results mismatch") +print("test passed") diff --git a/python/samples/custatevec/batched_gate_application.py 
b/python/samples/custatevec/batched_gate_application.py new file mode 100644 index 0000000..37cf6bf --- /dev/null +++ b/python/samples/custatevec/batched_gate_application.py @@ -0,0 +1,78 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import cupy as cp + +from cuquantum import custatevec as cusv +from cuquantum import cudaDataType, ComputeType + + +nSVs = 2 +nIndexBits = 3 +svSize = (1 << nIndexBits) +svStride = svSize +adjoint = 0 + +targets = [2] +nTargets = len(targets) +controls = [0, 1] +nControls = len(controls) + +matrixIndices = [1, 0] +nMatrices = len(matrixIndices) + +# 2 state vectors are allocated contiguously in single memory chunk. +d_svs = cp.asarray([[0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], + [0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j]], dtype=cp.complex64) + +d_svs_res = cp.asarray([[0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, -0.4-0.5j], + [0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.4+0.5j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.1+0.2j]], dtype=cp.complex64) + +# 2 gate matrices are allocated contiguously in single memory chunk. +# Note: gate matrices can also live on the host. 
+d_matrices = cp.asarray([[0.0+0.0j, 1.0+0.0j, + 1.0+0.0j, 0.0+0.0j], + [1.0+0.0j, 0.0+0.0j, + 0.0+0.0j, -1.0+0.0j]], dtype=cp.complex64) + +################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# check the size of external workspace +extraWorkspaceSizeInBytes = cusv.apply_matrix_batched_get_workspace_size( + handle, cudaDataType.CUDA_C_32F, nIndexBits, nSVs, svStride, + cusv.MatrixMapType.MATRIX_INDEXED, matrixIndices, d_matrices.data.ptr, + cudaDataType.CUDA_C_32F, cusv.MatrixLayout.ROW, adjoint, nMatrices, + nTargets, nControls, + ComputeType.COMPUTE_32F) + +# allocate external workspace if necessary +if extraWorkspaceSizeInBytes > 0: + workspace = cp.cuda.alloc(extraWorkspaceSizeInBytes) + workspace_ptr = workspace.ptr +else: + workspace_ptr = 0 + +# apply gate +cusv.apply_matrix_batched( + handle, d_svs.data.ptr, cudaDataType.CUDA_C_32F, nIndexBits, nSVs, svStride, + cusv.MatrixMapType.MATRIX_INDEXED, matrixIndices, d_matrices.data.ptr, + cudaDataType.CUDA_C_32F, cusv.MatrixLayout.ROW, adjoint, nMatrices, + targets, nTargets, controls, 0, nControls, + ComputeType.COMPUTE_32F, workspace_ptr, extraWorkspaceSizeInBytes) + +# destroy handle +cusv.destroy(handle) + +# check result +if not cp.allclose(d_svs_res, d_svs): + raise ValueError("results mismatch") +print("test passed") diff --git a/python/samples/custatevec/batched_measure.py b/python/samples/custatevec/batched_measure.py new file mode 100644 index 0000000..138a4bd --- /dev/null +++ b/python/samples/custatevec/batched_measure.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import cupy as cp + +from cuquantum import custatevec as cusv +from cuquantum import cudaDataType + + +nSVs = 2 +nIndexBits = 3 +svStride = (1 << nIndexBits) + +# bit ordering should only live on host. 
+bitOrdering = np.asarray([2, 1, 0], dtype=np.int32) +bitStringLen = bitOrdering.size + +# 2 bitStrings are allocated contiguously in single memory chunk. +# Note: bitStrings can also live on the host. +bitStrings = cp.empty(2, dtype=cp.int64) +bitStrings_res = cp.asarray([0b100, 0b011], dtype=cp.int64) + +# In a real application, random numbers in range [0, 1) will be used. +# Note: randnums can also live on the host. +randnums = cp.asarray([0.009, 0.5], dtype=cp.float64) + +# 2 state vectors are allocated contiguously in single memory chunk. +d_svs = cp.asarray([[0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], + [0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, + 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j]], dtype=cp.complex64) + +d_svs_res = cp.asarray([[0.0+0.0j, 0.0+1.0j, 0.0+0.0j, 0.0+0.0j, + 0.0+0.0j, 0.0+0.0j, 0.0+0.0j, 0.0+0.0j], + [0.0+0.0j, 0.0+0.0j, 0.0+0.0j, 0.0+0.0j, + 0.0+0.0j, 0.0+0.0j, 0.6+0.8j, 0.0+0.0j]], dtype=cp.complex64) + +################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# batched measurement +cusv.measure_batched( + handle, d_svs.data.ptr, cudaDataType.CUDA_C_32F, nIndexBits, nSVs, svStride, + bitStrings.data.ptr, bitOrdering.ctypes.data, bitStringLen, + randnums.data.ptr, cusv.Collapse.NORMALIZE_AND_ZERO) + +# destroy handle +cusv.destroy(handle) + +# check result +if not cp.allclose(d_svs_res, d_svs): + raise ValueError("results mismatch") +if not cp.allclose(bitStrings_res, bitStrings): + raise ValueError("results mismatch") +print("test passed") diff --git a/python/samples/custatevec/initialize_sv.py b/python/samples/custatevec/initialize_sv.py new file mode 100644 index 0000000..47c84ef --- /dev/null +++ b/python/samples/custatevec/initialize_sv.py @@ -0,0 +1,37 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cupy as cp + +from cuquantum import custatevec
as cusv +from cuquantum import cudaDataType + + +nIndexBits = 3 +svSize = (1 << nIndexBits) + +# populate the device memory with junk values (for illustrative purpose only) +# (we create a real random array of twice length, and view it as a complex array) +d_sv = cp.random.random(2*svSize, dtype=cp.float32).view(cp.complex64) + +d_sv_res = cp.asarray([[1.0+0.0j, 0.0+0.0j, 0.0+0.0j, 0.0+0.0j, + 0.0+0.0j, 0.0+0.0j, 0.0+0.0j, 0.0+0.0j]], dtype=cp.complex64) + +################################################################################### + +# cuStateVec handle initialization +handle = cusv.create() + +# initialize the state vector +cusv.initialize_state_vector( + handle, d_sv.data.ptr, cudaDataType.CUDA_C_32F, nIndexBits, + cusv.StateVectorType.ZERO) + +# destroy handle +cusv.destroy(handle) + +# check result +if not cp.allclose(d_sv_res, d_sv): + raise ValueError("results mismatch") +print("test passed") diff --git a/python/samples/cutensornet/approxTN/tensor_svd_example.py b/python/samples/cutensornet/approxTN/tensor_svd_example.py index 70aeb7b..07a5aac 100644 --- a/python/samples/cutensornet/approxTN/tensor_svd_example.py +++ b/python/samples/cutensornet/approxTN/tensor_svd_example.py @@ -91,7 +91,21 @@ cutn.tensor_svd_config_set_attribute(handle, svd_config, cutn.TensorSVDConfigAttribute.REL_CUTOFF, rel_cutoff.ctypes.data, rel_cutoff.dtype.itemsize) -print("Setup SVD truncation parameters.") +# optional: choose gesvdj algorithm with customized parameters. Default is gesvd. 
+algorithm_dtype = cutn.tensor_svd_config_get_attribute_dtype(cutn.TensorSVDConfigAttribute.ALGO) +algorithm = np.array(cutn.TensorSVDAlgo.GESVDJ, dtype=algorithm_dtype) +cutn.tensor_svd_config_set_attribute(handle, + svd_config, cutn.TensorSVDConfigAttribute.ALGO, algorithm.ctypes.data, algorithm.dtype.itemsize) + +algo_params_dtype = cutn.tensor_svd_algo_params_get_dtype(cutn.TensorSVDAlgo.GESVDJ) +algo_params = np.zeros(1, dtype=algo_params_dtype) +algo_params['tol'] = 1e-12 +algo_params['max_sweeps'] = 80 + +cutn.tensor_svd_config_set_attribute(handle, + svd_config, cutn.TensorSVDConfigAttribute.ALGO_PARAMS, algo_params.ctypes.data, algo_params.dtype.itemsize) + +print("Set up SVDConfig to use gesvdj algorithm with truncation") # create SVDInfo to record truncation information svd_info = cutn.create_tensor_svd_info(handle) @@ -172,8 +186,14 @@ svd_info, cutn.TensorSVDInfoAttribute.DISCARDED_WEIGHT, discarded_weight.ctypes.data, discarded_weight.itemsize) discarded_weight = float(discarded_weight) +algo_status_dtype = cutn.tensor_svd_algo_status_get_dtype(cutn.TensorSVDAlgo.GESVDJ) +algo_status = np.empty(1, dtype=algo_status_dtype) +cutn.tensor_svd_info_get_attribute(handle, + svd_info, cutn.TensorSVDInfoAttribute.ALGO_STATUS, algo_status.ctypes.data, algo_status.itemsize) + print(f"Execution time: {min_time_cutensornet} ms") print("SVD truncation info:") +print(f"GESVDJ residual: {algo_status['residual'].item()}, runtime sweeps = {algo_status['sweeps'].item()}") print(f"For fixed extent truncation of {shared_extent}, an absolute cutoff value of {float(abs_cutoff)}, and a relative cutoff value of {float(rel_cutoff)}, full extent {full_extent} is reduced to {reduced_extent}") print(f"Discarded weight: {discarded_weight}") diff --git a/python/samples/cutensornet/coarse/example12.py b/python/samples/cutensornet/coarse/example12.py index 7c939aa..10f2825 100644 --- a/python/samples/cutensornet/coarse/example12.py +++ b/python/samples/cutensornet/coarse/example12.py 
@@ -32,6 +32,7 @@ if flops != flops_np or largest != largest_np: message = f""" Results don't match. +path = {path_np} flops: NumPy = {flops_np}, cuTensorNet = {flops}, largest intermediate: NumPy = {largest_np}, cuTensorNet = {largest} """ diff --git a/python/samples/cutensornet/high_level/marginal_example.py b/python/samples/cutensornet/high_level/marginal_example.py new file mode 100755 index 0000000..c23e208 --- /dev/null +++ b/python/samples/cutensornet/high_level/marginal_example.py @@ -0,0 +1,123 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cupy as cp +import numpy as np + +import cuquantum +from cuquantum import cutensornet as cutn + + +print("cuTensorNet-vers:", cutn.get_version()) +dev = cp.cuda.Device() # get current device +props = cp.cuda.runtime.getDeviceProperties(dev.id) +print("===== device info ======") +print("GPU-name:", props["name"].decode()) +print("GPU-clock:", props["clockRate"]) +print("GPU-memoryClock:", props["memoryClockRate"]) +print("GPU-nSM:", props["multiProcessorCount"]) +print("GPU-major:", props["major"]) +print("GPU-minor:", props["minor"]) +print("========================") + +################################################# +# Marginal computation of a quantum circuit state +################################################# + +# Quantum state configuration +num_qubits = 16 +dim = 2 +qubits_dims = (dim, ) * num_qubits # qubit size +marginal_modes = (0, 1) # open qubits +num_marginal_modes = len(marginal_modes) +print(f"Quantum circuit with {num_qubits} qubits") + +############# +# cuTensorNet +############# + +handle = cutn.create() +stream = cp.cuda.Stream() +data_type = cuquantum.cudaDataType.CUDA_C_64F + +# Define quantum gate tensors on device +gate_h = 2**-0.5 * cp.asarray([[1,1], [1,-1]], dtype='complex128', order='F') +gate_h_strides = 0 + +gate_cx = cp.asarray([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 1], + [0, 0, 1, 0]], 
dtype='complex128').reshape(2,2,2,2, order='F') +gate_cx_strides = 0 + +# Allocate device memory for the reduced density matrix (marginal) +rdm_shape = (dim, ) * 2 * len(marginal_modes) +rdm = cp.empty(rdm_shape, dtype='complex128') +rdm_strides = [stride_in_bytes // rdm.itemsize for stride_in_bytes in rdm.strides] + +# Create the initial quantum state +quantum_state = cutn.create_state(handle, cutn.StatePurity.PURE, num_qubits, qubits_dims, data_type) +print("Created the initial quantum state") + +# Construct the quantum circuit state with gate application +tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 1, (0, ), + gate_h.data.ptr, gate_h_strides, 1, 0, 1) + +for i in range(1, num_qubits): + tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 2, (i-1, i), # target on i-1 while control on i + gate_cx.data.ptr, gate_cx_strides, 1, 0, 1) +print("Quantum gates applied") + +# Specify the desired reduced density matrix (marginal) +marginal = cutn.create_marginal(handle, quantum_state, num_marginal_modes, marginal_modes, 0, 0, rdm_strides) + +free_mem = dev.mem_info[0] +# use half of the total free size +scratch_size = free_mem // 2 +scratch_space = cp.cuda.alloc(scratch_size) +print(f"Allocated {scratch_size} bytes of scratch memory on GPU") + +num_hyper_samples_dtype = cutn.marginal_get_attribute_dtype(cutn.MarginalAttribute.OPT_NUM_HYPER_SAMPLES) +num_hyper_samples = np.asarray(8, dtype=num_hyper_samples_dtype) +cutn.marginal_configure(handle, marginal, + cutn.MarginalAttribute.OPT_NUM_HYPER_SAMPLES, + num_hyper_samples.ctypes.data, num_hyper_samples.dtype.itemsize) + +# Prepare the specified quantum circuit reduced density matrix (marginal) +work_desc = cutn.create_workspace_descriptor(handle) +cutn.marginal_prepare(handle, marginal, scratch_size, work_desc, stream.ptr) +print("Prepared the specified quantum circuit reduced density matrix (marginal)") + +workspace_size_d = cutn.workspace_get_memory_size(handle, + work_desc,
cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) + +if workspace_size_d <= scratch_size: + cutn.workspace_set_memory(handle, work_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) +else: + print("Error:Insufficient workspace size on Device") + cutn.destroy_workspace_descriptor(work_desc) + cutn.destroy_marginal(marginal) + cutn.destroy_state(quantum_state) + cutn.destroy(handle) + del scratch_space + print("Free resource and exit.") + exit() +print("Set the workspace buffer") + +# Compute the specified quantum circuit reduced density matrix (marginal) +cutn.marginal_compute(handle, marginal, 0, work_desc, rdm.data.ptr, stream.ptr) +stream.synchronize() +print("Computed the specified quantum circuit reduced density matrix (marginal)") + +print(f"Reduced density matrix for {num_marginal_modes} qubits") +print(rdm.reshape(dim**num_marginal_modes, dim**num_marginal_modes)) + +cutn.destroy_workspace_descriptor(work_desc) +cutn.destroy_marginal(marginal) +cutn.destroy_state(quantum_state) +cutn.destroy(handle) +del scratch_space +print("Free resource and exit.") \ No newline at end of file diff --git a/python/samples/cutensornet/high_level/sampling_example.py b/python/samples/cutensornet/high_level/sampling_example.py new file mode 100755 index 0000000..948d182 --- /dev/null +++ b/python/samples/cutensornet/high_level/sampling_example.py @@ -0,0 +1,119 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cupy as cp +import numpy as np + +import cuquantum +from cuquantum import cutensornet as cutn + + +print("cuTensorNet-vers:", cutn.get_version()) +dev = cp.cuda.Device() # get current device +props = cp.cuda.runtime.getDeviceProperties(dev.id) +print("===== device info ======") +print("GPU-name:", props["name"].decode()) +print("GPU-clock:", props["clockRate"]) +print("GPU-memoryClock:", props["memoryClockRate"]) +print("GPU-nSM:",
props["multiProcessorCount"]) +print("GPU-major:", props["major"]) +print("GPU-minor:", props["minor"]) +print("========================") + +##################################### +# Sampling of a quantum circuit state +##################################### + +# Quantum state configuration +num_samples = 100 +num_qubits = 16 +dim = 2 +qubits_dims = (dim, ) * num_qubits # qubit size +print(f"Quantum circuit with {num_qubits} qubits") + +############# +# cuTensorNet +############# + +handle = cutn.create() +stream = cp.cuda.Stream() +data_type = cuquantum.cudaDataType.CUDA_C_64F + +# Define quantum gate tensors on device +gate_h = 2**-0.5 * cp.asarray([[1,1], [1,-1]], dtype='complex128', order='F') +gate_h_strides = 0 + +gate_cx = cp.asarray([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 1], + [0, 0, 1, 0]], dtype='complex128').reshape(2,2,2,2, order='F') +gate_cx_strides = 0 + +# Allocate host memory for the samples +samples = np.empty((num_qubits, num_samples), dtype='int64', order='F') # samples are stored in F order with shape (num_qubits, num_samples) +# Create the initial quantum state +quantum_state = cutn.create_state(handle, cutn.StatePurity.PURE, num_qubits, qubits_dims, data_type) +print("Created the initial quantum state") + +# Construct the quantum circuit state with gate application +tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 1, (0, ), + gate_h.data.ptr, gate_h_strides, 1, 0, 1) + +for i in range(1, num_qubits): + tensor_id = cutn.state_apply_tensor( + handle, quantum_state, 2, (i-1, i), # target on i-1 while control on i + gate_cx.data.ptr, gate_cx_strides, 1, 0, 1) + +print("Quantum gates applied") + +# Create the quantum circuit sampler +sampler = cutn.create_sampler(handle, quantum_state, num_qubits, 0) + +free_mem = dev.mem_info[0] +# use half of the total free size +scratch_size = free_mem // 2 +scratch_space = cp.cuda.alloc(scratch_size) +print(f"Allocated {scratch_size} bytes of scratch memory on GPU") +
+num_hyper_samples_dtype = cutn.sampler_get_attribute_dtype(cutn.SamplerAttribute.OPT_NUM_HYPER_SAMPLES) +num_hyper_samples = np.asarray(8, dtype=num_hyper_samples_dtype) +cutn.sampler_configure(handle, sampler, + cutn.SamplerAttribute.OPT_NUM_HYPER_SAMPLES, + num_hyper_samples.ctypes.data, num_hyper_samples.dtype.itemsize) + +# Prepare the quantum circuit sampler +work_desc = cutn.create_workspace_descriptor(handle) +cutn.sampler_prepare(handle, sampler, scratch_size, work_desc, stream.ptr) +print("Prepared the specified quantum circuit state sampler") + +workspace_size_d = cutn.workspace_get_memory_size(handle, + work_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) + +if workspace_size_d <= scratch_size: + cutn.workspace_set_memory(handle, work_desc, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) +else: + print("Error:Insufficient workspace size on Device") + cutn.destroy_workspace_descriptor(work_desc) + cutn.destroy_sampler(sampler) + cutn.destroy_state(quantum_state) + cutn.destroy(handle) + del scratch_space + print("Free resource and exit.") + exit() +print("Set the workspace buffer") + +# Sample the quantum circuit state +cutn.sampler_sample(handle, sampler, num_samples, work_desc, samples.ctypes.data, stream.ptr) +stream.synchronize() +print("Performed quantum circuit state sampling") +print("Bit-string samples:") +print(samples.T) + +cutn.destroy_workspace_descriptor(work_desc) +cutn.destroy_sampler(sampler) +cutn.destroy_state(quantum_state) +cutn.destroy(handle) +del scratch_space +print("Free resource and exit.") diff --git a/python/samples/cutensornet/tensor/example11-svd_algorithms.py b/python/samples/cutensornet/tensor/example11-svd_algorithms.py new file mode 100644 index 0000000..05ffdea --- /dev/null +++ b/python/samples/cutensornet/tensor/example11-svd_algorithms.py @@ -0,0 +1,27 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier:
BSD-3-Clause + +""" +truncated SVD Example using NumPy ndarray with various SVD algorithms. + +The decomposition results are also NumPy ndarrays. +""" +import numpy as np + +from cuquantum import tensor + + +a = np.ones((3,2,4,5)) + +base_options = {'max_extent': 4, + 'abs_cutoff': 0.1, + 'rel_cutoff': 0.1} + + +for algorithm in ('gesvd', 'gesvdj', 'gesvdr', 'gesvdp'): + method = tensor.SVDMethod(algorithm=algorithm, **base_options) + u, s, v, info = tensor.decompose("ijab->ixa,xbj", a, method=method, return_info=True) + print(s) + print(info) + diff --git a/python/samples/cutensornet/tensornet_example_gradients.py b/python/samples/cutensornet/tensornet_example_gradients.py new file mode 100644 index 0000000..434f6cd --- /dev/null +++ b/python/samples/cutensornet/tensornet_example_gradients.py @@ -0,0 +1,342 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cupy as cp +import numpy as np +try: + import torch +except ImportError: + torch = None + +import cuquantum +from cuquantum import cutensornet as cutn + + +print("cuTensorNet-vers:", cutn.get_version()) +dev = cp.cuda.Device() # get current device +props = cp.cuda.runtime.getDeviceProperties(dev.id) +print("===== device info ======") +print("GPU-name:", props["name"].decode()) +print("GPU-clock:", props["clockRate"]) +print("GPU-memoryClock:", props["memoryClockRate"]) +print("GPU-nSM:", props["multiProcessorCount"]) +print("GPU-major:", props["major"]) +print("GPU-minor:", props["minor"]) +print("========================") + +########################################################################################## +# Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m} +# We will execute the contraction and compute the gradients of input tensors A, B, C +########################################################################################## + +print("Include headers and define data types.") + +data_type = 
cuquantum.cudaDataType.CUDA_R_32F +compute_type = cuquantum.ComputeType.COMPUTE_32F +num_inputs = 6 +grad_input_ids = np.asarray((0, 1, 2), dtype=np.int32) + +# Create an array of modes +modes_A = [ord(c) for c in ('a','b','c','d')] +modes_B = [ord(c) for c in ('b','c','d','e')] +modes_C = [ord(c) for c in ('e','g','h')] +modes_D = [ord(c) for c in ('g','h','i','j')] +modes_E = [ord(c) for c in ('i','j','k','l')] +modes_F = [ord(c) for c in ('k','l','m')] +modes_O = [ord(c) for c in ('a','m')] + +# Create an array of extents (shapes) for each tensor +dim = 36 +extent_A = (dim,) * len(modes_A) +extent_B = (dim,) * len(modes_B) +extent_C = (dim,) * len(modes_C) +extent_D = (dim,) * len(modes_D) +extent_E = (dim,) * len(modes_E) +extent_F = (dim,) * len(modes_F) +extent_O = (dim,) * len(modes_O) + +print("Define network, modes, and extents.") + +################# +# Initialize data +################# + +A_d = cp.random.random((np.prod(extent_A),), dtype=np.float32) +B_d = cp.random.random((np.prod(extent_B),), dtype=np.float32) +C_d = cp.random.random((np.prod(extent_C),), dtype=np.float32) +D_d = cp.random.random((np.prod(extent_D),), dtype=np.float32) +E_d = cp.random.random((np.prod(extent_E),), dtype=np.float32) +F_d = cp.random.random((np.prod(extent_F),), dtype=np.float32) +O_d = cp.zeros((np.prod(extent_O),), dtype=np.float32) +raw_data_in_d = (A_d.data.ptr, B_d.data.ptr, C_d.data.ptr, D_d.data.ptr, E_d.data.ptr, F_d.data.ptr) + +# allocate buffers for holding the gradients w.r.t. 
the first 3 input tensors +grads_d = [cp.empty_like(A_d), + cp.empty_like(B_d), + cp.empty_like(C_d), + None, + None, + None] +grads_d_ptr = [grad.data.ptr if grad is not None else 0 for grad in grads_d] + +# output gradients (w.r.t itself, so it's all one) +output_grads_d = cp.ones(extent_O, dtype=np.float32, order='F') + +############# +# cuTensorNet +############# + +stream = cp.cuda.Stream() +handle = cutn.create() + +nmode_A = len(modes_A) +nmode_B = len(modes_B) +nmode_C = len(modes_C) +nmode_D = len(modes_D) +nmode_E = len(modes_E) +nmode_F = len(modes_F) +nmode_O = len(modes_O) + +############################### +# Create Contraction Descriptor +############################### + +modes_in = (modes_A, modes_B, modes_C, modes_D, modes_E, modes_F) +extents_in = (extent_A, extent_B, extent_C, extent_D, extent_E, extent_F) +num_modes_in = (nmode_A, nmode_B, nmode_C, nmode_D, nmode_E, nmode_F) + +# Strides are optional; if no stride (0) is provided, then cuTensorNet assumes a generalized column-major data layout +strides_in = (0, 0, 0, 0, 0, 0) + +# Set up tensor network +desc_net = cutn.create_network_descriptor(handle, + num_inputs, num_modes_in, extents_in, strides_in, modes_in, 0, # inputs + nmode_O, extent_O, 0, modes_O, # output + data_type, compute_type) + +# In this sample we use the new network attributes interface to mark certain +# input tensors as constant, but we can also use the tensor qualifiers as shown +# in other samples (ex: tensornet_example_reuse.py) +net_attr_dtype = cutn.network_get_attribute_dtype(cutn.NetworkAttribute.INPUT_TENSORS_REQUIRE_GRAD) +tensor_ids = np.zeros(1, dtype=net_attr_dtype) +tensor_ids['num_tensors'] = grad_input_ids.size +tensor_ids['data'] = grad_input_ids.ctypes.data +cutn.network_set_attribute( + handle, desc_net, cutn.NetworkAttribute.INPUT_TENSORS_REQUIRE_GRAD, + tensor_ids.ctypes.data, tensor_ids.dtype.itemsize) + +print("Initialize the cuTensorNet library and create a network descriptor.") + 
+##################################################### +# Choose workspace limit based on available resources +##################################################### + +free_mem, total_mem = dev.mem_info +workspace_limit = int(free_mem * 0.9) + +####################### +# Set contraction order +####################### + +# create contraction optimizer info +optimizer_info = cutn.create_contraction_optimizer_info(handle, desc_net) + +# set a predetermined contraction path +path_dtype = cutn.contraction_optimizer_info_get_attribute_dtype(cutn.ContractionOptimizerInfoAttribute.PATH) +path = np.asarray([(0, 1), (0, 4), (0, 3), (0, 2), (0, 1)], dtype=np.int32) +path_obj = np.zeros((1,), dtype=path_dtype) +path_obj["num_contractions"] = num_inputs - 1 +path_obj["data"] = path.ctypes.data + +# provide user-specified contraction path +cutn.contraction_optimizer_info_set_attribute( + handle, optimizer_info, cutn.ContractionOptimizerInfoAttribute.PATH, + path_obj.ctypes.data, path_obj.dtype.itemsize) + +num_slices = 1 + +print("Set predetermined contraction path into cuTensorNet optimizer.") + +############################################################# +# Create workspace descriptor, allocate workspace, and set it +############################################################# + +work_desc = cutn.create_workspace_descriptor(handle) + +# set SCRATCH workspace, which will be used during each network contraction operation, not needed afterwards +cutn.workspace_compute_contraction_sizes(handle, desc_net, optimizer_info, work_desc) +required_scratch_workspace_size = cutn.workspace_get_memory_size( + handle, work_desc, + cutn.WorksizePref.MIN, + cutn.Memspace.DEVICE, + cutn.WorkspaceKind.SCRATCH) +work_scratch = cp.cuda.alloc(required_scratch_workspace_size) +cutn.workspace_set_memory( + handle, work_desc, + cutn.Memspace.DEVICE, + cutn.WorkspaceKind.SCRATCH, + work_scratch.ptr, required_scratch_workspace_size) + +# set CACHE workspace, which will be used across network contraction 
operations +required_cache_workspace_size = cutn.workspace_get_memory_size( + handle, work_desc, + cutn.WorksizePref.MIN, + cutn.Memspace.DEVICE, + cutn.WorkspaceKind.CACHE) +work_cache = cp.cuda.alloc(required_cache_workspace_size) +cutn.workspace_set_memory( + handle, work_desc, + cutn.Memspace.DEVICE, + cutn.WorkspaceKind.CACHE, + work_cache.ptr, required_cache_workspace_size) + +print("Allocated and set up the GPU workspace") + +########################################################### +# Initialize the pair-wise contraction plans (for cuTENSOR) +########################################################### + +plan = cutn.create_contraction_plan(handle, desc_net, optimizer_info, work_desc) + +################################################################################### +# Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel +################################################################################### + +pref = cutn.create_contraction_autotune_preference(handle) + +num_autotuning_iterations = 5 # may be 0 +n_iter_dtype = cutn.contraction_autotune_preference_get_attribute_dtype( + cutn.ContractionAutotunePreferenceAttribute.MAX_ITERATIONS) +num_autotuning_iterations = np.asarray([num_autotuning_iterations], dtype=n_iter_dtype) +cutn.contraction_autotune_preference_set_attribute( + handle, pref, + cutn.ContractionAutotunePreferenceAttribute.MAX_ITERATIONS, + num_autotuning_iterations.ctypes.data, num_autotuning_iterations.dtype.itemsize) + +# Modify the plan again to find the best pair-wise contractions +cutn.contraction_autotune( + handle, plan, raw_data_in_d, O_d.data.ptr, + work_desc, pref, stream.ptr) + +cutn.destroy_contraction_autotune_preference(pref) + +print("Create a contraction plan for cuTENSOR and optionally auto-tune it.") + +########### +# Execution +########### + +# create a cutensornetSliceGroup_t object from a range of slice IDs +slice_group = cutn.create_slice_group_from_id_range(handle, 0, num_slices, 
1) + +min_time_cutn = 1e100 +num_runs = 3 # to get stable perf results +e1 = cp.cuda.Event() +e2 = cp.cuda.Event() + +for i in range(num_runs): + # Contract over all slices. + e1.record(stream) + cutn.contract_slices( + handle, plan, raw_data_in_d, + O_d.data.ptr, + False, work_desc, slice_group, stream.ptr) + cutn.compute_gradients_backward( + handle, plan, raw_data_in_d, + output_grads_d.data.ptr, + grads_d_ptr, + False, work_desc, stream.ptr) + cutn.workspace_purge_cache(handle, work_desc, cutn.Memspace.DEVICE) + e2.record(stream) + + # Synchronize and measure timing + e2.synchronize() + time = cp.cuda.get_elapsed_time(e1, e2) / 1000 # ms -> s + min_time_cutn = min_time_cutn if min_time_cutn < time else time + +print("Contract the network and compute gradients.") + +# free up the workspace +del work_scratch +del work_cache + +# Recall that we set strides to null (0), so the data are in F-contiguous layout, +# including the gradients (which follow the layout of the input tensors) +A_d = A_d.reshape(extent_A, order='F') +B_d = B_d.reshape(extent_B, order='F') +C_d = C_d.reshape(extent_C, order='F') +D_d = D_d.reshape(extent_D, order='F') +E_d = E_d.reshape(extent_E, order='F') +F_d = F_d.reshape(extent_F, order='F') +O_d = O_d.reshape(extent_O, order='F') +grads_d[0] = grads_d[0].reshape(extent_A, order='F') +grads_d[1] = grads_d[1].reshape(extent_B, order='F') +grads_d[2] = grads_d[2].reshape(extent_C, order='F') + +# Compute the contraction reference using cupy.einsum with the same path +path = ['einsum_path'] + path.tolist() +out = cp.einsum("abcd,bcde,egh,ghij,ijkl,klm->am", A_d, B_d, C_d, D_d, E_d, F_d, optimize=path) +if not cp.allclose(out, O_d): + raise RuntimeError("result is incorrect") +print("Check cuTensorNet contraction result against that of cupy.einsum().") + +# Compute the gradient reference using PyTorch +if torch: + if not torch.cuda.is_available(): + # copy data back to CPU + dev = "cpu" + func = cp.asnumpy + torch_cuda = False + else: + # 
zero-copy from CuPy to PyTorch! + dev = "cuda" + func = (lambda x: x) # no op + torch_cuda = True + + A = torch.as_tensor(func(A_d), device=dev) + B = torch.as_tensor(func(B_d), device=dev) + C = torch.as_tensor(func(C_d), device=dev) + D = torch.as_tensor(func(D_d), device=dev) + E = torch.as_tensor(func(E_d), device=dev) + F = torch.as_tensor(func(F_d), device=dev) + output_grads = torch.as_tensor(func(output_grads_d), device=dev) + + # do not need gradient for the last 3 tensors + A.requires_grad_(True) + B.requires_grad_(True) + C.requires_grad_(True) + D.requires_grad_(False) + E.requires_grad_(False) + F.requires_grad_(False) + + # We can use either torch.einsum or opt_einsum.contract to establish the + # computational graph of an einsum op over the PyTorch tensors. Note that + # torch.einsum does not support passing custom contraction paths. + out = torch.einsum("abcd,bcde,egh,ghij,ijkl,klm->am", A, B, C, D, E, F) + out.backward(output_grads) # backprop to populate the inputs' .grad attributes + if not cp.allclose(cp.asarray(out.detach()), O_d): + raise RuntimeError("result is incorrect") + + # If using PyTorch CPU tensors, these move data back to GPU for comparison; + # otherwise, PyTorch GPU tensors are zero-copied as CuPy arrays. 
+ assert cp.allclose(cp.asarray(A.grad), grads_d[0]) + assert cp.allclose(cp.asarray(B.grad), grads_d[1]) + assert cp.allclose(cp.asarray(C.grad), grads_d[2]) + # Note: D.grad, E.grad, and F.grad do not exist + + print("Check cuTensorNet gradient results against those from " + f"PyTorch ({'GPU' if torch_cuda else 'CPU'}).") + +####################################################### + +print(f"Tensor network contraction and back-propagation time (ms): = {min_time_cutn * 1000}") + +cutn.destroy_slice_group(slice_group) +cutn.destroy_contraction_plan(plan) +cutn.destroy_workspace_descriptor(work_desc) +cutn.destroy_contraction_optimizer_info(optimizer_info) +cutn.destroy_network_descriptor(desc_net) +cutn.destroy(handle) + +print("Free resource and exit.") diff --git a/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb b/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb index 74ba4df..c3ccf11 100644 --- a/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb +++ b/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb @@ -15,13 +15,13 @@ "id": "61d9c687-8903-4429-8e8a-2ba3b1e62c92", "metadata": {}, "source": [ - "In this notebook we will go over some examples on how to use cuTensorNet to execute basic MPS algorithms for quantum circuit simulation. \n", + "In this notebook we will go over some examples on how to use `cuquantum-python` to execute some MPS algorithms for quantum circuit simulation. \n", "\n", "This notebook will guide you through the following tasks:\n", " - I. Initializing an entangled quantum circuit as a basis.\n", " - II. Constructing an MPS from a quantum circuit and verifying its correctness.\n", " - III. Computing the expectation value of an MPS, with and without canonicalization.\n", - " - IV. 
Performing MPS-MPO multiplication using two different methods and assessing their accuracy for the approximations.\n", "\n", "Note that we will use `cupy.ndarray` for operands throughout this notebook." ] @@ -68,9 +68,9 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ - "
" + "
" ] }, "execution_count": 2, @@ -118,9 +118,9 @@ " circuit = cirq.Circuit(add_entangled_layers(cirq.LineQubit.range(num_qubits), reps))\n", " return circuit\n", "\n", - "# Example using 8 qubits and 6 repeated entangled layers\n", + "# Example using 8 qubits and 8 repeated entangled layers\n", "num_qubits = 8\n", - "reps = 6 # depth/num of layers\n", + "reps = 8 # depth/num of layers\n", "\n", "backend = 'qiskit' # also works for 'cirq'\n", "circuit = get_entangled_circuit(num_qubits, reps, backend)\n", @@ -144,12 +144,12 @@ "\n", "![mps_diagram.png](attachment:7d5870b7-c80c-4190-b2ed-d598e137dcad.png)\n", "\n", - "Note: In this sample, we've inserted a dummy mode at the beginning and the end of the MPS to make it easier to generalize the handling of boundary conditions. This does not affect the outcome. \n", + "Note: In this sample, we've inserted a dummy mode of size 1 at the beginning and the end of the MPS to make it easier to generalize the handling of boundary conditions. This does not affect the outcome. \n", "\n", "The MPS representation of the circuit can be obtained by iteratively applying all gates to the initial MPS (typically $|\\Psi_0\\rangle = |00...00\\rangle$). \n", "In the subsequent subsections, we'll go through these operations step by step followed by verification of correctness.\n", "\n", - "## II(a) Initialization\n", + "## II(a). Initialization\n", "\n", "We begin by creating the initial MPS state $|\\Psi_0\\rangle = |00...00\\rangle$. " ] @@ -186,7 +186,7 @@ "id": "3ec42238-6f74-4998-a5c6-3f28db1e0051", "metadata": {}, "source": [ - "## II(b) Apply Gates to the MPS\n", + "## II(b). Apply Gates to the MPS\n", "\n", "Given an initial MPS state, we can begin applying all the gates in the quantum circuit to form the final MPS. \n", "The diagram below describes the application of single-qubit and two-qubit gates to an MPS. 
\n", @@ -197,7 +197,7 @@ "\n", "Note: In the case of a two-qubit gate acting on non-adjacent qubits, an iterative swap step is required to first make the two qubits adjacent. \n", "There are multiple ways to achieve this goal. The diagram below describes two typical algorithms to swap the qubit indices of adjacent sites. \n", - "Before the application of gate, adjacent sites are swapped iteratively until the two involved sites are made adjacent. \n", + "Before the application of the gate, adjacent sites are swapped iteratively until the two involved sites are made adjacent. \n", "After the gate operand is applied, a reverse swap sweep is also needed to revert back to the original MPS site ordering.\n", "\n", "![swap_site.png](attachment:bbad5491-2744-4842-9eb8-c5470461ec8b.png)\n", @@ -219,8 +219,8 @@ "site 0, shape: (1, 2, 2)\n", "site 1, shape: (2, 2, 4)\n", "site 2, shape: (4, 2, 8)\n", - "site 3, shape: (8, 2, 8)\n", - "site 4, shape: (8, 2, 8)\n", + "site 3, shape: (8, 2, 16)\n", + "site 4, shape: (16, 2, 8)\n", "site 5, shape: (8, 2, 4)\n", "site 6, shape: (4, 2, 2)\n", "site 7, shape: (2, 2, 1)\n" @@ -305,7 +305,7 @@ "# For two-qubit gates, an SVD is performed with singular values partitioned onto the two MPS sites equally.\n", "# We also set a cutoff value of 1e-12 to filter out computational noise.\n", "exact_gate_algorithm = {'qr_method': False, \n", - " 'svd_method':{'partition': 'UV', 'abs_cutoff':1e-12}}\n", + " 'svd_method':{'partition': 'UV', 'abs_cutoff':1e-12}}\n", "\n", "# Constructing the final MPS\n", "for (gate, qubits) in gates:\n", @@ -347,14 +347,14 @@ "output_type": "stream", "text": [ "The norm of the MPS: 1.000e+00\n", - "State vector difference: 1.166e-15\n" + "State vector difference: 1.790e-15\n" ] } ], "source": [ "class MPSContractionHelper:\n", " \"\"\"\n", - " A helper class to compute various quantities for a given MPS.\n", + " A helper class to compute various quantities for a given MPS using exact contraction.\n", " \n", " 
Interleaved format is used to construct the input args for `cuquantum.contract`. \n", " A concrete example on how the modes are populated for a 7-site MPS is provided below:\n", @@ -398,7 +398,7 @@ " mps_tensors: A list of rank-3 ndarray-like tensor objects. \n", " The indices of the ith tensor are expected to be bonding index to the i-1 tensor, \n", " the physical mode, and then the bonding index to the i+1th tensor.\n", - " options: Specify the contract and decompose options. \n", + " options: Specify the contraction options. \n", "\n", " Returns:\n", " The norm of the MPS.\n", @@ -417,7 +417,7 @@ " mps_tensors: A list of rank-3 ndarray-like tensor objects. \n", " The indices of the ith tensor are expected to be bonding index to the i-1 tensor, \n", " the physical mode, and then the bonding index to the i+1th tensor.\n", - " options: Specify the contract and decompose options. \n", + " options: Specify the contraction options. \n", "\n", " Returns:\n", " An ndarray-like object as the state vector.\n", @@ -429,7 +429,7 @@ " interleaved_inputs.append(output_modes) # output\n", " return self._contract('sv', interleaved_inputs, options=options)\n", " \n", - " def contract_expectation(self, mps_tensors, operator, qubits, options=None, normalize=False):\n", + " def contract_expectation(self, mps_tensors, operator, qubits, normalize=False, options=None):\n", " \"\"\"\n", " Contract the corresponding tensor network to form the state vector representation of the MPS.\n", "\n", @@ -441,8 +441,8 @@ " The modes of the operator are expected to be output qubits followed by input qubits, e.g, \n", " ``A, B, a, b`` where `a, b` denotes the inputs and `A, B'` denotes the outputs. \n", " qubits: A sequence of integers specifying the qubits that the operator is acting on. \n", - " options: Specify the contract and decompose options. \n", " normalize: Whether to scale the expectation value by the normalization factor.\n", + " options: Specify the contraction options. 
\n", "\n", " Returns:\n", " An ndarray-like object as the state vector.\n", @@ -480,7 +480,7 @@ " mpo_tensors: A list of rank-4 ndarray-like tensor objects.\n", " The indics of the ith tensor are expected to be the bonding index to the i-1 tensor, \n", " the output physical mode, the bonding index to the i+1th tensor and then the inputput physical mode.\n", - " options: Specify the contract and decompose options. \n", + " options: Specify the contraction options. \n", "\n", " Returns:\n", " An ndarray-like object as the output state vector.\n", @@ -535,7 +535,7 @@ "source": [ "# III. MPS Canonicalization\n", "\n", - "One nice property of MPS is that its canonical form allows for easy computation of expectation values or reduced density matrices. \n", + "One nice property of the MPS is that its canonical form allows for easy computation of expectation values or reduced density matrices. \n", "\n", "For instance, if one were to compute the expectation value of a two-qubit operator, two approaches shown below can be adopted:\n", "- Contraction of the original $\\langle\\Psi|\\hat{O}|\\Psi\\rangle$ tensor network where $|\\Psi\\rangle$ is now an MPS.\n", @@ -561,8 +561,8 @@ "output_type": "stream", "text": [ "Expectation value at qubits (2, 3)\n", - ": (0.4953677828368589+0.39635222637212664j)\n", - ": (0.495367782836861+0.3963522263721279j)\n", + ": (0.34424195503028576+0.38355529480141526j)\n", + ": (0.3442419550302871+0.3835552948014171j)\n", " equal to ? 
: True\n" ] } @@ -626,9 +626,9 @@ "name": "stdout", "output_type": "stream", "text": [ - " with reduced graph: (0.49536778283685906+0.3963522263721267j)\n", + " with reduced graph: (0.34424195503028576+0.38355529480141537j)\n", "reduced equal to ?: True\n", - "original with reduced graph: (1.1231422600031686+0.8532610561182179j)\n", + "original with reduced graph: (1.1063549716955885+1.0291661220130142j)\n", "Equal to reference?: False (False expected)\n" ] } @@ -700,7 +700,7 @@ "print(f\"reduced equal to ?: {is_canonical_expec_equal}\")\n", "assert is_canonical_expec_equal\n", "\n", - "# For validation, we can also try to compute the expectation value (False) from the reduced graph with the original MPS\n", + "# For validation, we can also compute the expectation value (wrong) from the reduced graph with the original MPS\n", "expec_mps_reduced = contract('ipj,jql,irk,ksl,rspq->', mps_tensors[target_qubits[0]], mps_tensors[target_qubits[1]], mps_tensors[target_qubits[0]].conj(), mps_tensors[target_qubits[1]].conj(), operator, options=options)\n", "print(f\"original with reduced graph: {expec_mps_reduced}\")\n", "\n", @@ -726,15 +726,16 @@ "\n", "![mps_mpo.png](attachment:a1e48907-1e5f-498a-bac9-d2f3f238c08c.png)\n", "\n", - "The following topics will be covered:\n", + "We will describe two types of algorithms to perform the multiplication. Specifically, the following topics will be covered:\n", "\n", "- Initializing a random MPO.\n", - "- Contracting MPS and MPO exactly using a simple algorithm and verifying its correctness.\n", - "- Evaluating the accuracy of approximate MPS MPO multiplication.\n", + "- Contracting MPS and MPO exactly using a direct algorithm and verifying its correctness.\n", + "- Contracting MPS and MPO exactly using an algorithm based on density matrix and verifying its correctness.\n", + "- Evaluating the accuracy of approximate MPS MPO multiplication using the two algorithms above.\n", "\n", "## IV(a). 
Generate a Random MPO\n", "\n", - "Here we first create a random MPO for subsequent operations." + "Here we first create a random MPO for subsequent operations. Like we did with MPS, we will insert a dummy index in the first and the last MPO tensor." ] }, { @@ -758,7 +759,7 @@ " shape = (left_bond, 2, right_bond, 2)\n", " o = cp.random.random(shape) + 1.j * cp.random.random(shape)\n", " # normalize\n", - " o /= contract('aPbp,aPbp->', o, o.conj()) ** .5\n", + " o /= cp.linalg.norm(o)\n", " mpo_tensors.append(o)\n", " return mpo_tensors\n", "\n", @@ -767,25 +768,25 @@ }, { "attachments": { - "61565b4c-c320-42e0-bb0f-8248ce692c5e.png": { - "image/png": "" + "987db8fe-d369-4f9e-b96b-461370ba011c.png": { + "image/png": "" } }, "cell_type": "markdown", "id": "0f80c8e5-dfa5-422c-83c9-7d7ee11bd332", "metadata": {}, "source": [ - "## IV(b). Exact MPS MPO Multiplication\n", + "## IV(b). Exact MPS MPO Multiplication with Direct Algorithm\n", "\n", "There are various algorithms to perform MPS MPO multiplication. In the diagram below we present one simple algorithm based on direct contraction and SVD. \n", "\n", - "![mps_mpo_multiplication.png](attachment:61565b4c-c320-42e0-bb0f-8248ce692c5e.png)\n", + "![direct_mpo_multiplication.png](attachment:987db8fe-d369-4f9e-b96b-461370ba011c.png)\n", "\n", "Note that during SVD, we can partition the singular values to the right tensor such that the resulting MPS we obtain is in the right orthogonal gauge.\n", "\n", "We perform the following checks to verify the correctness of our algorithm:\n", "- Comparing the states presented by the output MPS against direct contraction.\n", - "- Computing the expectation value of a one-body operator acting on the last qubit using a reduced graph. \n" + "- Computing the expectation value of a one-body operator acting on the canonicalization center using a reduced graph. 
\n" ] }, { @@ -798,7 +799,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "After MPS-MPO multiplication\n", + "After exact MPS-MPO multiplication using direct algorithm\n", "site 0, shape: (1, 2, 2)\n", "site 1, shape: (2, 2, 4)\n", "site 2, shape: (4, 2, 8)\n", @@ -807,113 +808,331 @@ "site 5, shape: (32, 2, 16)\n", "site 6, shape: (16, 2, 2)\n", "site 7, shape: (2, 2, 1)\n", - "Max absolute difference compared with reference state vector: 2.040e-17\n", - "Is the canonicalization center moved to the last site?: True\n" + "Max absolute difference compared with reference state vector: 6.934e-18\n", + "Is the canonicalization center moved to the right?: True\n" ] } ], "source": [ - "def multiply_mps_mpo(mps_tensors, mpo_tensors, algorithm, options=None):\n", + "def multiply_mps_mpo_direct(mps_tensors, mpo_tensors, svd_method, canonical_center=\"right\", options=None):\n", " \"\"\"\n", - " Perform MPS MPO multiplication using the algorithm shown above.\n", + " Perform MPS MPO multiplication using the direct algorithm shown above.\n", + " \n", + " Args:\n", + " mps_tensors: A list of rank-3 ndarray-like tensor objects. \n", + " The indices of the ith tensor is expected to be the bonding index to the i-1 tensor, \n", + " the physical mode and then the bonding index to the i+1th tensor.\n", + " mpo_tensors: A list of rank-4 ndarray-like tensor objects.\n", + " The indics of the ith tensor are expected to be the bonding index to the i-1 tensor, \n", + " the output physical mode, the bonding index to the i+1th tensor and then the inputput physical mode.\n", + " svd_method: A tensor.SVDMethod object specifying the options for the SVD truncation. \n", + " For the direct algorithm, the `partition` attribute must be set to `V` for proper canonicalization.\n", + " canonical_center: A string specifying the final canonicalization center, can be either `left` or `right`.\n", + " options: Specify the contract and decompose options. 
\n", " \"\"\"\n", + " if svd_method.partition != 'V':\n", + " raise ValueError(f\"The direct algorithm expects SVDMethod.partition to be V, found {svd_method.partition}\")\n", " num_qubits = len(mps_tensors)\n", - " t = contract('ipj,kmlp->ijlm', mps_tensors[0], mpo_tensors[0], options=options)\n", - " output_mps = []\n", - " for i in range(1, num_qubits):\n", - " mps, _, t = contract_decompose('ijlm,jqr,lnsq->imx,xrsn', t, mps_tensors[i], mpo_tensors[i], algorithm=algorithm, options=options)\n", - " output_mps.append(mps)\n", - " t = t.reshape(-1,2,1)\n", - " output_mps.append(t)\n", + " algorithm = {'qr_method': False,\n", + " 'svd_method': svd_method}\n", + " output_mps = [None,] * num_qubits\n", + " X = None\n", + " if canonical_center == 'right':\n", + " for i in range(num_qubits):\n", + " if X is None:\n", + " X = contract('ipj,kmlp->imlj', mps_tensors[i], mpo_tensors[i], options=options)\n", + " else:\n", + " mps, _, X = contract_decompose('imlj,jqr,lnsq->imx,xnsr', X, mps_tensors[i], mpo_tensors[i], algorithm=algorithm, options=options)\n", + " output_mps[i-1] = mps\n", + " output_mps[-1] = X.reshape(-1,2,1)\n", + " elif canonical_center == 'left':\n", + " for i in range(num_qubits-1, -1, -1):\n", + " if X is None:\n", + " X = contract('ipj,kmlp->ikmj', mps_tensors[i], mpo_tensors[i], options=options)\n", + " else:\n", + " mps, _, X = contract_decompose('ikmj,lqi,npkq->xmj,lnpx', X, mps_tensors[i], mpo_tensors[i], algorithm=algorithm, options=options)\n", + " output_mps[i+1] = mps\n", + " output_mps[0] = X.reshape(1,2,-1)\n", + " else:\n", + " raise ValueError(\"canonical_center must be either left or right\")\n", " return output_mps\n", "\n", - "mult_algorithm = {'qr_method': False,\n", - " 'svd_method': {'partition': 'V', 'rel_cutoff':1e-10}}\n", - "new_mps = multiply_mps_mpo(mps_tensors, mpo_tensors, mult_algorithm, options=options)\n", + "svd_method = tensor.SVDMethod(partition='V', rel_cutoff=1e-10)\n", + "canonical_center = 'right' # target 
canonicalization center, can be 'left' as well\n", "\n", - "print(\"After MPS-MPO multiplication\")\n", - "for i, o in enumerate(new_mps):\n", + "# multiply MPS with MPO using direct algorithm\n", + "mps_mpo_direct = multiply_mps_mpo_direct(mps_tensors, mpo_tensors, svd_method, canonical_center=canonical_center, options=options)\n", + "\n", + "print(\"After exact MPS-MPO multiplication using direct algorithm\")\n", + "for i, o in enumerate(mps_mpo_direct):\n", " print(f\"site {i}, shape: {o.shape}\")\n", "\n", "# compute the state vector after MPS-MPO multiplication\n", - "mps_mpo_sv = mps_helper.contract_state_vector(new_mps, options=options)\n", + "mps_mpo_sv_direct = mps_helper.contract_state_vector(mps_mpo_direct, options=options)\n", + "\n", "# reference state vector from full network contraction\n", - "mps_mpo_ref = mps_helper.contract_mps_mpo_to_state_vector(mps_tensors, mpo_tensors, options=options)\n", - "print(f\"Max absolute difference compared with reference state vector: {abs(mps_mpo_sv-mps_mpo_ref).max():0.3e}\")\n", - "assert cp.allclose(mps_mpo_sv, mps_mpo_ref)\n", + "mps_mpo_sv_ref = mps_helper.contract_mps_mpo_to_state_vector(mps_tensors, mpo_tensors, options=options)\n", + "print(f\"Max absolute difference compared with reference state vector: {abs(mps_mpo_sv_direct-mps_mpo_sv_ref).max():0.3e}\")\n", + "assert cp.allclose(mps_mpo_sv_direct, mps_mpo_sv_ref)\n", "\n", - "# compute the expectation value of an operator on the last qubit\n", + "# compute the expectation value of a one body operator on the canonicalization center\n", "one_body_operator = cp.random.random([2,2]) + 1j* cp.random.random([2,2])\n", - "# reference from contracting the full network\n", - "expec_one_body_ref = mps_helper.contract_expectation(new_mps, one_body_operator, (num_qubits-1, ), options=options)\n", - "# expectation value from the reduced graph on the last qubit\n", - "expec_one_body = contract('ipj,iPj,Pp->', new_mps[-1], new_mps[-1].conj(), one_body_operator, 
options=options)\n", - "\n", - "is_right_canonical = cp.allclose(expec_one_body, expec_one_body_ref)\n", - "print(f\"Is the canonicalization center moved to the last site?: {is_right_canonical}\")\n", - "assert is_right_canonical" + "operator_site = {'right': num_qubits-1,\n", + " 'left': 0}[canonical_center]\n", + "\n", + "# reference expectation value from contracting the full network\n", + "expec_direct_full = mps_helper.contract_expectation(mps_mpo_direct, one_body_operator, (operator_site, ), normalize=True, options=options)\n", + "# expectation value from the reduced graph on the canonical center\n", + "expec_direct_reduced = contract('ipj,iPj,Pp->', mps_mpo_direct[operator_site], mps_mpo_direct[operator_site].conj(), one_body_operator, options=options)\n", + "expec_direct_reduced /= cp.linalg.norm(mps_mpo_direct[operator_site]) ** 2\n", + "\n", + "is_direct_canonical = cp.allclose(expec_direct_full, expec_direct_reduced)\n", + "print(f\"Is the canonicalization center moved to the {canonical_center}?: {is_direct_canonical}\")\n", + "assert is_direct_canonical\n" ] }, { "cell_type": "markdown", - "id": "4194c27f-87c2-4671-881c-7ef351c0ecf8", + "id": "985449dc-afd0-4d95-96f5-06b8ffed9f1f", "metadata": {}, "source": [ - "## IV(c). Accuracy of Approximate MPS MPO contraction\n", + "## IV(c). Exact MPS MPO Multiplication with Density Matrix Algorithm\n", "\n", - "In this subsection we present some experiments on MPS-MPO mulitiplication with approximation. \n", - "Specifically, we will do a sweep of MPS-MPO multiplication with different constraints on the maximal number of singular values to keep for all connecting bonds.\n", - "We examine the accuracy of approximation with the following two criteria:\n", - "- The difference in normalized state vector.\n", - "- The difference in expectation value of a random operator after normalization." + "The density matrix algorithm for MPS MPO multiplication is based on iterative truncation on the density matrix. 
\n", + "For a detailed introduction to the algorithm, users may refer to this [tutorial page](https://tensornetwork.org/mps/algorithms/denmat_mpo_mps/) or this [paper](https://iopscience.iop.org/article/10.1088/1742-5468/2007/10/P10014). \n", + "Since the truncation is performed on the density matrix, approximation using this algorithm is generally expected to be more accurate than the direct approach. \n", + "\n", + "In this subsection, we provide a reference implementation in function `multiply_mps_mpo_density_matrix` and verify its correctness using the same checks as we did with the direct algorithm." ] }, { "cell_type": "code", "execution_count": 10, - "id": "99c9204c-a629-4309-b333-691e8a181088", + "id": "68e9d6ef-16dd-4b25-906c-773f1553658f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "max extent=1, state vector error=4.191e-02, relative error for expectation value=1.770e-02\n", - "max extent=2, state vector error=1.544e-02, relative error for expectation value=2.131e-03\n", - "max extent=4, state vector error=2.894e-03, relative error for expectation value=2.804e-05\n", - "max extent=8, state vector error=2.643e-04, relative error for expectation value=5.410e-07\n", - "max extent=12, state vector error=6.573e-05, relative error for expectation value=3.087e-08\n", - "max extent=16, state vector error=3.419e-06, relative error for expectation value=8.492e-11\n" + "After exact MPS-MPO multiplication using density matrix algorithm\n", + "site 0, shape: (1, 2, 2)\n", + "site 1, shape: (2, 2, 4)\n", + "site 2, shape: (4, 2, 8)\n", + "site 3, shape: (8, 2, 15)\n", + "site 4, shape: (15, 2, 8)\n", + "site 5, shape: (8, 2, 4)\n", + "site 6, shape: (4, 2, 2)\n", + "site 7, shape: (2, 2, 1)\n", + "Max absolute difference compared with reference state vector: 6.240e-09\n", + "Is the canonicalization center moved to the right?: True\n" ] } ], "source": [ - "# For a fair comparison between state vectors, in this vector we normalize
all state vector\n", - "mps_mpo_ref /= cp.linalg.norm(mps_mpo_ref)\n", + "def multiply_mps_mpo_density_matrix(mps_tensors, mpo_tensors, svd_method, canonical_center=\"right\", options=None):\n", + " \"\"\"\n", + " Perform MPS MPO multiplication using the density matrix algorithm.\n", + " \n", + " Args:\n", + " mps_tensors: A list of rank-3 ndarray-like tensor objects. \n", + " The indices of the ith tensor is expected to be the bonding index to the i-1 tensor, \n", + " the physical mode and then the bonding index to the i+1th tensor.\n", + " mpo_tensors: A list of rank-4 ndarray-like tensor objects.\n", + " The indics of the ith tensor are expected to be the bonding index to the i-1 tensor, \n", + " the output physical mode, the bonding index to the i+1th tensor and then the inputput physical mode.\n", + " svd_method: A tensor.SVDMethod object specifying the options for the SVD truncation. \n", + " For the density matrix algorithm, the `partition` attribute must be set to `None` for proper canonicalization.\n", + " canonical_center: A string specifying the final canonicalization center, can be either `left` or `right`.\n", + " options: Specify the contract and decompose options. 
\n", + " \"\"\"\n", + " if svd_method.partition is not None:\n", + " raise ValueError(f\"The density matrix algorithm expects SVDMethod.partition to be None, found {svd_method.partition}\")\n", + " if len(mps_tensors) != len(mpo_tensors):\n", + " raise ValueError(f\"The number of MPS tensor {len(mps_tensors)} does not match that of MPO tensors {len(mpo_tensors)}\")\n", + " \n", + " n = len(mps_tensors)\n", + " algo = {'qr_method': False, 'svd_method': svd_method}\n", + " envs = dict()\n", + " mps_mpo_tensors = []\n", + " for i in range(n):\n", + " # multiply the MPS tensor with the MPO tensor at the same site\n", + " mps_mpo_tensors.append(contract('ijk,lnmj->ilnmk', mps_tensors[i], mpo_tensors[i], options=options))\n", + " new_mps_tensors = [None] * n\n", + " if canonical_center == 'left':\n", + " # construct the environment tensors with an inverse sweep from left to right\n", + " for i in range(n-1):\n", + " if i==0:\n", + " envs[i] = contract('ilnmk,ILnMK->kmKM', mps_mpo_tensors[i], mps_mpo_tensors[i].conj(), options=options)\n", + " else:\n", + " # contract the MPS-MPO with its left env to construct the next env \n", + " envs[i] = contract('ilIL,ilnmk,ILnMK->kmKM', envs[i-1], mps_mpo_tensors[i], mps_mpo_tensors[i].conj(), options=options)\n", + " R_tensor = None\n", + " for i in range(n-1, 0, -1):\n", + " if R_tensor is None:\n", + " # contract and decompose ---MPS_MPO---left_env---MPS_MPO^*--- to compute the new MPS tensor at last site\n", + " u, s, v = contract_decompose('ilIL,ilnmk,ILNMK->xnk,xNK', envs[i-1], mps_mpo_tensors[i], mps_mpo_tensors[i].conj(), algorithm=algo, options=options)\n", + " R_tensor = contract('ilnmk,xnr->ilx', mps_mpo_tensors[i], v, options=options)\n", + " else:\n", + " # contract and decompose R_i---MPS_MPO_i---left_env_i---MPS_MPO_i^*---R_i^* to compute the unitary u as the new compressed MPS tensor\n", + " u, s, v = contract_decompose('ilIL,ilnmk,ILNMK,kmx,KMX->znx,zNX', envs[i-1], mps_mpo_tensors[i], mps_mpo_tensors[i].conj(), 
R_tensor, R_tensor.conj(), algorithm=algo, options=options)\n", + " # form R_i+1 by contracting MPS_MPO_i, V_i and R_i\n", + " R_tensor = contract('ilnmk,znx,kmx->ilz', mps_mpo_tensors[i], v, R_tensor, options=options)\n", + " new_mps_tensors[i] = u\n", + " # contract MPS_MPO_0 with R_0 as the first MPS tensor\n", + " new_mps_tensors[0] = contract('ilnmk,kmx->inx', mps_mpo_tensors[0], R_tensor, options=options)\n", + " elif canonical_center == 'right':\n", + " # construct the environment tensors with an inverse sweep from right to left\n", + " for i in range(n-1, 0, -1):\n", + " if i == n-1:\n", + " envs[i] = contract('ilnmk,ILnMK->ilIL', mps_mpo_tensors[i], mps_mpo_tensors[i].conj(), options=options)\n", + " else:\n", + " # contract the MPS-MPO with its right env to construct the next env \n", + " envs[i] = contract('kmKM,ilnmk,ILnMK->ilIL', envs[i+1], mps_mpo_tensors[i], mps_mpo_tensors[i].conj(), options=options)\n", + " L_tensor = None\n", + " for i in range(n-1):\n", + " if L_tensor is None:\n", + " # contract and decompose ---MPS_MPO---right_env---MPS_MPO^*--- to compute the new MPS tensor at last site\n", + " u, s, v = contract_decompose('kmKM,ilnmk,ILNMK->inx,INx', envs[i+1], mps_mpo_tensors[i], mps_mpo_tensors[i].conj(), algorithm=algo, options=options)\n", + " L_tensor = contract('ilnmk,rnx->xmk', mps_mpo_tensors[i], v, options=options)\n", + " else:\n", + " # contract and decompose L_i---MPS_MPO_i---right_env_i---MPS_MPO_i^*---L_i^* to compute the unitary u as the new compressed MPS tensor\n", + " u, s, v = contract_decompose('kmKM,ilnmk,ILNMK,xli,XLI->xnz,XNz', envs[i+1], mps_mpo_tensors[i], mps_mpo_tensors[i].conj(), L_tensor, L_tensor.conj(), algorithm=algo, options=options)\n", + " L_tensor = contract('ilnmk,xnz,xli->zmk', mps_mpo_tensors[i], v, L_tensor, options=options)\n", + " new_mps_tensors[i] = u\n", + " # contract the last MPS_MPO with L as the last MPS tensor\n", + " new_mps_tensors[n-1] = contract('ilnmk,xli->xnk', mps_mpo_tensors[n-1], 
L_tensor, options=options)\n", + " else:\n", + " raise ValueError(\"sweep direction must be either left or right\")\n", + " \n", + " return new_mps_tensors\n", "\n", - "# We also rescale the expectation values by normalization factor\n", - "expec_one_body_ref = mps_helper.contract_expectation(new_mps, one_body_operator, (num_qubits-1, ), options=options, normalize=True)\n", + "\n", + "svd_method = tensor.SVDMethod(rel_cutoff=1e-10)\n", + "\n", + "# multiply MPS with MPO using density matrix algorithm\n", + "mps_mpo_dm = multiply_mps_mpo_density_matrix(mps_tensors, mpo_tensors, svd_method, canonical_center=canonical_center, options=options)\n", + "print(\"After exact MPS-MPO multiplication using density matrix algorithm\")\n", + "for i, o in enumerate(mps_mpo_dm):\n", + " print(f\"site {i}, shape: {o.shape}\")\n", + "\n", + "# compute the state vector after MPS-MPO multiplication\n", + "mps_mpo_sv_dm = mps_helper.contract_state_vector(mps_mpo_dm, options=options)\n", + "print(f\"Max absolute difference compared with reference state vector: {abs(mps_mpo_sv_dm-mps_mpo_sv_ref).max():0.3e}\")\n", + "assert cp.allclose(mps_mpo_sv_dm, mps_mpo_sv_ref)\n", + "\n", + "# reference from contracting the full network\n", + "expec_dm_full = mps_helper.contract_expectation(mps_mpo_dm, one_body_operator, (operator_site, ), normalize=True, options=options)\n", + "# expectation value from the reduced graph on the canonical center\n", + "expec_dm_reduced = contract('ipj,iPj,Pp->', mps_mpo_dm[operator_site], mps_mpo_dm[operator_site].conj(), one_body_operator, options=options)\n", + "expec_dm_reduced /= cp.linalg.norm(mps_mpo_dm[operator_site]) ** 2\n", + "\n", + "is_dm_canonical = cp.allclose(expec_dm_full, expec_dm_reduced)\n", + "print(f\"Is the canonicalization center moved to the {canonical_center}?: {is_dm_canonical}\")\n", + "assert is_dm_canonical" + ] + }, + { + "cell_type": "markdown", + "id": "4194c27f-87c2-4671-881c-7ef351c0ecf8", + "metadata": {}, + "source": [ + "## 
IV(d). Accuracy of Approximate MPS MPO contraction\n", + "\n", + "In this subsection we present some experiments on the accuracies of the two algorithms for approximate MPS-MPO multiplication. \n", + "Specifically, we will do a sweep of MPS-MPO multiplication with different constraints on the maximal number of singular values to keep for all connecting bonds.\n", + "We examine the accuracy of approximation for these two algorithms with the following two criteria:\n", + "- The difference in state vector.\n", + "- The difference in expectation value of a random operator after normalization." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "99c9204c-a629-4309-b333-691e8a181088", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "err_direct = []\n", + "err_dm = []\n", "\n", "for max_extent in (1, 2, 4, 8, 12, 16):\n", - " naive_algorithm = {'qr_method': False, \n", - " 'svd_method': {'partition': 'V', 'rel_cutoff':1e-10, 'max_extent': max_extent}}\n", - " new_mps = multiply_mps_mpo(mps_tensors, mpo_tensors, naive_algorithm, options=options)\n", - " # compute the normalized state vector\n", - " mps_mpo_sv = mps_helper.contract_state_vector(new_mps, options=options)\n", - " mps_mpo_sv /= cp.linalg.norm(mps_mpo_sv)\n", - " err_sv = abs(mps_mpo_sv-mps_mpo_ref).max()\n", + " svd_setting = {'rel_cutoff': 1e-10, 'max_extent': max_extent}\n", + " \n", + " # 1. Using direct algorithm\n", + " svd_method_direct = tensor.SVDMethod(partition='V', **svd_setting)\n", + " new_mps_direct = multiply_mps_mpo_direct(mps_tensors, mpo_tensors, svd_method_direct, canonical_center=canonical_center, options=options)\n", + " \n", + " # 1a. compute the equivalent state vector\n", + " mps_mpo_sv_direct = mps_helper.contract_state_vector(new_mps_direct, options=options)\n", + " err_sv_direct = abs(mps_mpo_sv_direct - mps_mpo_sv_ref).max()\n", " \n", - " # compute the expectation value with normalization\n", - " expec_one_body = contract('ipj,iPj,Pp->', new_mps[-1], new_mps[-1].conj(), one_body_operator, options=options) \n", - " expec_one_body /= cp.linalg.norm(new_mps[-1]) ** 2\n", - " err_expec = abs(expec_one_body_ref-expec_one_body)/abs(expec_one_body)\n", + " # 1b. 
compute the expectation value with normalization\n", + " expec_direct = contract('ipj,iPj,Pp->', new_mps_direct[operator_site], new_mps_direct[operator_site].conj(), one_body_operator, options=options) \n", + " expec_direct /= cp.linalg.norm(new_mps_direct[operator_site]) ** 2\n", + " err_expec_direct = abs(1-expec_direct/expec_direct_full)\n", " \n", - " print(f\"max extent={max_extent}, state vector error={err_sv:0.3e}, relative error for expectation value={err_expec:0.3e}\")" + " # 2. Using density matrix algorithm\n", + " svd_method_dm = tensor.SVDMethod(**svd_setting)\n", + " new_mps_dm = multiply_mps_mpo_density_matrix(mps_tensors, mpo_tensors, svd_method_dm, canonical_center=canonical_center, options=options)\n", + " \n", + " # 2a. compute the equivalent state vector\n", + " mps_mpo_sv_dm = mps_helper.contract_state_vector(new_mps_dm, options=options)\n", + " err_sv_dm = abs(mps_mpo_sv_dm - mps_mpo_sv_ref).max()\n", + " \n", + " # 2b. compute the expectation value with normalization\n", + " expec_dm = contract('ipj,iPj,Pp->', new_mps_dm[operator_site], new_mps_dm[operator_site].conj(), one_body_operator, options=options) \n", + " expec_dm /= cp.linalg.norm(new_mps_dm[operator_site]) ** 2\n", + " err_expec_dm = abs(1-expec_dm/expec_dm_full)\n", + " \n", + " err_direct.append([max_extent, err_sv_direct.get(), err_expec_direct.get()])\n", + " err_dm.append([max_extent, err_sv_dm.get(), err_expec_dm.get()])\n", + "\n", + "err_direct = np.asarray(err_direct)\n", + "err_dm = np.asarray(err_dm)\n", + "\n", + "# making plots\n", + "import matplotlib.pyplot as plt\n", + "\n", + "fig, axes = plt.subplots(1,2, figsize=(12,6), sharex=True)\n", + "direct_plot_kwargs = {'color': 'k',\n", + " 'marker': 'o',\n", + " 'label': 'Direct Algorithm'}\n", + "\n", + "dm_plot_kwargs = {'color': 'r',\n", + " 'marker': 'o',\n", + " 'label': 'Density Matrix Algorithm'}\n", + "\n", + "axes[0].plot(err_direct[:,0], err_direct[:,1], **direct_plot_kwargs)\n", + "axes[0].plot(err_dm[:,0], 
err_dm[:,1], **dm_plot_kwargs)\n", + "axes[0].set_yscale('log')\n", + "axes[0].legend()\n", + "axes[0].set_xlabel('Bond Dimension')\n", + "axes[0].set_title('Absolute Error in State Vector')\n", + "\n", + "axes[1].plot(err_direct[:,0], err_direct[:,2], **direct_plot_kwargs)\n", + "axes[1].plot(err_dm[:,0], err_dm[:,2], **dm_plot_kwargs)\n", + "axes[1].set_yscale('log')\n", + "axes[1].legend()\n", + "axes[1].set_xlabel('Bond Dimension')\n", + "axes[1].set_title('Relative Error in Expectation Value')\n", + "plt.show()\n" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "ab89e997-b286-4c31-a7e1-562644767c86", "metadata": {}, "outputs": [], diff --git a/python/setup.py b/python/setup.py index 7758c34..b69a8a4 100644 --- a/python/setup.py +++ b/python/setup.py @@ -29,12 +29,17 @@ # - cuTENSOR version is constrained in the cutensornet-cuXX package, so we don't # need to list it install_requires = [ - 'numpy', - # 'cupy', # TODO: use "cupy-wheel" once it's stablized, see https://github.com/cupy/cupy/issues/6688 + 'numpy>=1.21', # 'torch', # <-- PyTorch is optional; also, the PyPI version does not support GPU... 
- f'custatevec-cu{utils.cuda_major_ver}~=1.3', # ">=1.3.0,<2" - f'cutensornet-cu{utils.cuda_major_ver}~=2.1', # ">=2.1.0,<3" - ] + f'custatevec-cu{utils.cuda_major_ver}~=1.4', # ">=1.4.0,<2" + f'cutensornet-cu{utils.cuda_major_ver}~=2.2', # ">=2.2.0,<3" +] +if utils.cuda_major_ver == '11': + # CuPy has 3+ wheels for CUDA 11.x, only the cuquantum-python meta package has + # a chance to resolve the ambiguity properly + pass +elif utils.cuda_major_ver == '12': + install_requires.append('cupy-cuda12x>=10.0') # no ambiguity # Note: the extension attributes are overwritten in build_extension() @@ -91,7 +96,6 @@ "Topic :: Education", "Topic :: Scientific/Engineering", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -104,7 +108,7 @@ packages=find_packages(include=['cuquantum', 'cuquantum.*']), package_data={"": ["*.pxd", "*.pyx", "*.py"],}, zip_safe=False, - python_requires='>=3.8', + python_requires='>=3.9', install_requires=install_requires, tests_require=install_requires+tests_require, cmdclass=cmdclass, diff --git a/python/tests/cuquantum_tests/__init__.py b/python/tests/cuquantum_tests/__init__.py index 3197836..84da4e4 100644 --- a/python/tests/cuquantum_tests/__init__.py +++ b/python/tests/cuquantum_tests/__init__.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: BSD-3-Clause +import atexit +import glob import os import sys import tempfile @@ -23,6 +25,15 @@ sys.path.append(os.getcwd()) +def clean_up_cffi_files(): + files = glob.glob(os.path.join(os.getcwd(), "cuquantum_test_cffi*")) + for f in files: + try: + os.remove(f) + except FileNotFoundError: + pass + + dtype_to_data_type = { numpy.float16: cudaDataType.CUDA_R_16F, numpy.float32: cudaDataType.CUDA_R_32F, @@ -114,6 +125,7 @@ def _get_functor_address(self): self.ffi = ffi _cffi_mod1 = importlib.import_module(mod_name) self.ffi_mod = _cffi_mod1 + 
atexit.register(clean_up_cffi_files) alloc_addr = self._get_address("my_alloc") free_addr = self._get_address("my_free") @@ -173,6 +185,7 @@ def _get_handler_address(self): self.ffi = ffi _cffi_mod2 = importlib.import_module(mod_name) self.ffi_mod = _cffi_mod2 + atexit.register(clean_up_cffi_files) h = self.handler = self.ffi_mod.ffi.new("myHandler*") self.ffi_mod.lib.init_myHandler(h, self.source.encode()) diff --git a/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py b/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py index 0627318..364d3c3 100644 --- a/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py +++ b/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py @@ -4,9 +4,9 @@ import copy -import cupy +import cupy as cp from cupy import testing -import numpy +import numpy as np try: from mpi4py import MPI # init! except ImportError: @@ -39,13 +39,13 @@ def handle(): @testing.parameterize(*testing.product({ 'n_qubits': (3,), - 'dtype': (numpy.complex64, numpy.complex128), + 'dtype': (np.complex64, np.complex128), })) class TestSV: # Base class for all statevector tests def get_sv(self): - arr = cupy.zeros((2**self.n_qubits,), dtype=self.dtype) + arr = cp.zeros((2**self.n_qubits,), dtype=self.dtype) arr[0] = 1 # initialize in |000...00> return arr @@ -57,7 +57,50 @@ def _return_data(self, data, name, dtype, return_value): return 0, 0 else: # return int as void* - data = numpy.asarray(data, dtype=dtype) + data = np.asarray(data, dtype=dtype) + setattr(self, name, data) # keep data alive + return data.ctypes.data, data.size + elif return_value == 'seq': + # data itself is already a flat sequence + return data, len(data) + else: + assert False + + +@testing.parameterize(*testing.product({ + 'n_svs': (3,), + 'n_qubits': (4,), + 'n_extra_qubits': (0, 1), # for padding purpose + 'dtype': (np.complex64, np.complex128), +})) +class TestBatchedSV: + # Base class for all batched statevector tests + + def get_sv(self): + 
arr = cp.zeros((self.n_svs, 2**(self.n_qubits + self.n_extra_qubits)), dtype=self.dtype) + arr[:, 0] = 1 # initialize in |000...00> + self.sv_stride = 2 ** (self.n_qubits + self.n_extra_qubits) # in counts, not bytes + return arr + + # TODO: make this a static method + # TODO: refactor this to a helper class? + def _return_data(self, data, name, dtype, return_value): + if return_value == 'int_d': + if len(data) == 0: + # empty, give it a NULL + return 0, 0 + else: + # return int as void* + data = cp.asarray(data, dtype=dtype) + setattr(self, name, data) # keep data alive + return data.data.ptr, data.size + if return_value == 'int_h': + if len(data) == 0: + # empty, give it a NULL + return 0, 0 + else: + # return int as void* + data = np.asarray(data, dtype=dtype) setattr(self, name, data) # keep data alive return data.ctypes.data, data.size elif return_value == 'seq': @@ -76,14 +119,14 @@ def multi_gpu_handles(request): p2p_required = request.param for dev in range(n_devices): - with cupy.cuda.Device(dev): + with cp.cuda.Device(dev): h = cusv.create() handles.append(h) if p2p_required: for peer in range(n_devices): if dev == peer: continue try: - cupy.cuda.runtime.deviceEnablePeerAccess(peer) + cp.cuda.runtime.deviceEnablePeerAccess(peer) except Exception as e: if 'PeerAccessUnsupported' in str(e): pytest.skip("P2P unsupported") @@ -93,14 +136,14 @@ def multi_gpu_handles(request): yield handles for dev in range(n_devices): - with cupy.cuda.Device(dev): + with cp.cuda.Device(dev): h = handles.pop(0) cusv.destroy(h) if p2p_required: for peer in range(n_devices): if dev == peer: continue try: - cupy.cuda.runtime.deviceDisablePeerAccess(peer) + cp.cuda.runtime.deviceDisablePeerAccess(peer) except Exception as e: if 'PeerAccessNotEnabled' not in str(e): raise @@ -120,7 +163,7 @@ def get_exponent(n): @testing.parameterize(*testing.product({ 'n_qubits': (4,), - 'dtype': (numpy.complex64, numpy.complex128), + 'dtype': (np.complex64, np.complex128), })) class 
TestMultiGpuSV: # TODO: consider making this class more flexible @@ -133,8 +176,8 @@ def get_sv(self): self.sub_sv = [] for dev in range(self.n_devices): - with cupy.cuda.Device(dev): - self.sub_sv.append(cupy.zeros( + with cp.cuda.Device(dev): + self.sub_sv.append(cp.zeros( 2**self.n_local_bits, dtype=self.dtype)) self.sub_sv[0][0] = 1 # initialize in |000...00> return self.sub_sv @@ -147,7 +190,7 @@ def _return_data(self, data, name, dtype, return_value): return 0, 0 else: # return int as void* - data = numpy.asarray(data, dtype=dtype) + data = np.asarray(data, dtype=dtype) setattr(self, name, data) # keep data alive return data.ctypes.data, data.size elif return_value == 'seq': @@ -188,7 +231,7 @@ def test_workspace(self, handle): # cuStateVec does not like a smaller workspace... size = 24*1024**2 assert size > default_workspace_size - memptr = cupy.cuda.alloc(size) + memptr = cp.cuda.alloc(size) cusv.set_workspace(handle, memptr.ptr, size) # should not fail def test_stream(self, handle): @@ -196,17 +239,34 @@ def test_stream(self, handle): assert 0 == cusv.get_stream(handle) # simple set/get round-trip - stream = cupy.cuda.Stream() + stream = cp.cuda.Stream() cusv.set_stream(handle, stream.ptr) assert stream.ptr == cusv.get_stream(handle) +class TestInitSV(TestSV): + + @pytest.mark.parametrize('sv_type', cusv.StateVectorType) + def test_initialize_state_vector(self, handle, sv_type): + sv = self.get_sv() + data_type = dtype_to_data_type[self.dtype] + + if sv_type == cusv.StateVectorType.ZERO: + sv_orig = sv.copy() # already zero-init'd + sv[:] = 1. # reset to something else + cusv.initialize_state_vector( + handle, sv.data.ptr, data_type, self.n_qubits, sv_type) + if sv_type == cusv.StateVectorType.ZERO: + assert (sv == sv_orig).all() + assert cp.allclose(cp.sum(cp.abs(sv)**2), 1.) 
+ + class TestAbs2Sum(TestSV): @pytest.mark.parametrize( 'input_form', ( - {'basis_bits': (numpy.int32, 'int'),}, - {'basis_bits': (numpy.int32, 'seq'),}, + {'basis_bits': (np.int32, 'int'),}, + {'basis_bits': (np.int32, 'seq'),}, ) ) def test_abs2sum_on_z_basis(self, handle, input_form): @@ -220,21 +280,21 @@ def test_abs2sum_on_z_basis(self, handle, input_form): sum0, sum1 = cusv.abs2sum_on_z_basis( handle, sv.data.ptr, data_type, self.n_qubits, True, True, basis_bits, basis_bits_len) - assert numpy.allclose(sum0+sum1, 1) + assert np.allclose(sum0+sum1, 1) assert (sum0 is not None) and (sum1 is not None) # case 2: only sum0 is computed sum0, sum1 = cusv.abs2sum_on_z_basis( handle, sv.data.ptr, data_type, self.n_qubits, True, False, basis_bits, basis_bits_len) - assert numpy.allclose(sum0, 1) + assert np.allclose(sum0, 1) assert (sum0 is not None) and (sum1 is None) # case 3: only sum1 is computed sum0, sum1 = cusv.abs2sum_on_z_basis( handle, sv.data.ptr, data_type, self.n_qubits, False, True, basis_bits, basis_bits_len) - assert numpy.allclose(sum1, 0) + assert np.allclose(sum1, 0) assert (sum0 is None) and (sum1 is not None) # case 4: none is computed @@ -245,19 +305,19 @@ def test_abs2sum_on_z_basis(self, handle, input_form): @pytest.mark.parametrize( 'input_form', ( - {'bit_ordering': (numpy.int32, 'int'),}, - {'bit_ordering': (numpy.int32, 'seq'),}, + {'bit_ordering': (np.int32, 'int'),}, + {'bit_ordering': (np.int32, 'seq'),}, ) ) @pytest.mark.parametrize( - 'xp', (numpy, cupy) + 'xp', (np, cp) ) def test_abs2sum_array_no_mask(self, handle, xp, input_form): # change sv from |000> to 1/\sqrt{2} (|001> + |100>) sv = self.get_sv() sv[0] = 0 - sv[1] = 1./numpy.sqrt(2) - sv[4] = 1./numpy.sqrt(2) + sv[1] = 1./np.sqrt(2) + sv[4] = 1./np.sqrt(2) data_type = dtype_to_data_type[self.dtype] bit_ordering = list(range(self.n_qubits)) @@ -265,7 +325,7 @@ def test_abs2sum_array_no_mask(self, handle, xp, input_form): bit_ordering, 'bit_ordering', 
*input_form['bit_ordering']) # test abs2sum on both host and device abs2sum = xp.zeros((2**bit_ordering_len,), dtype=xp.float64) - abs2sum_ptr = abs2sum.data.ptr if xp is cupy else abs2sum.ctypes.data + abs2sum_ptr = abs2sum.data.ptr if xp is cp else abs2sum.ctypes.data cusv.abs2sum_array( handle, sv.data.ptr, data_type, self.n_qubits, abs2sum_ptr, bit_ordering, bit_ordering_len, 0, 0, 0) @@ -276,12 +336,96 @@ def test_abs2sum_array_no_mask(self, handle, xp, input_form): # TODO(leofang): add more tests for abs2sum_array, such as nontrivial masks +class TestBatchedAbs2Sum(TestBatchedSV): + + @pytest.mark.parametrize( + 'input_form', ( + {'bit_ordering': (np.int32, 'int_h'),}, + {'bit_ordering': (np.int32, 'seq'),}, + ) + ) + @pytest.mark.parametrize( + 'xp', (np, cp) + ) + def test_abs2sum_array_batched_no_mask(self, handle, xp, input_form): + # change sv from |0000> to 1/\sqrt{2} (|0001> + |1000>) + sv = self.get_sv() + sv[..., 0] = 0 + sv[..., 1] = 1./np.sqrt(2) + sv[..., 8] = 1./np.sqrt(2) + + data_type = dtype_to_data_type[self.dtype] + bit_ordering = list(range(self.n_qubits)) + bit_ordering, bit_ordering_len = self._return_data( + bit_ordering, 'bit_ordering', *input_form['bit_ordering']) + + # test abs2sum on both host and device + abs2sum = xp.zeros((self.n_svs, 2**bit_ordering_len,), + dtype=xp.float64) + abs2sum_ptr = abs2sum.data.ptr if xp is cp else abs2sum.ctypes.data + cusv.abs2sum_array_batched( + handle, sv.data.ptr, data_type, self.n_qubits, + self.n_svs, self.sv_stride, + abs2sum_ptr, 2**bit_ordering_len, + bit_ordering, bit_ordering_len, 0, 0, 0) + + assert xp.allclose(abs2sum.sum(), self.n_svs) + assert xp.allclose(abs2sum[..., 1], 0.5) + assert xp.allclose(abs2sum[..., 8], 0.5) + + @pytest.mark.parametrize( + 'input_form', ( + {'bit_ordering': (np.int32, 'int_h'), 'mask_bit_strings': (np.int64, 'int_h'), }, + {'bit_ordering': (np.int32, 'int_h'), 'mask_bit_strings': (np.int64, 'int_d'), }, + {'bit_ordering': (np.int32, 'seq'), 
'mask_bit_strings': (np.int64, 'seq'), }, + ) + ) + @pytest.mark.parametrize( + 'xp', (np, cp) + ) + def test_abs2sum_array_batched_masked(self, handle, xp, input_form): + # change sv from |0000> to 1/\sqrt{2} (|0001> + |1000>) + sv = self.get_sv() + sv[..., 0] = 0 + sv[..., 1] = 1./np.sqrt(2) + sv[..., 8] = 1./np.sqrt(2) + + data_type = dtype_to_data_type[self.dtype] + bit_ordering = list(range(self.n_qubits - 1)) # exclude the last qubit + bit_ordering, bit_ordering_len = self._return_data( + bit_ordering, 'bit_ordering', *input_form['bit_ordering']) + + # mask = 0b1 + mask_bit_strings = np.ones(self.n_svs) + mask_bit_strings, _ = self._return_data( + mask_bit_strings, 'mask_bit_strings', + *input_form['mask_bit_strings']) + mask_bit_ordering = [self.n_qubits - 1] + mask_bit_ordering, mask_len = self._return_data( + mask_bit_ordering, 'mask_bit_ordering', *input_form['bit_ordering']) + + # test abs2sum on both host and device + abs2sum = xp.zeros((self.n_svs, 2**bit_ordering_len,), + dtype=xp.float64) + abs2sum_ptr = abs2sum.data.ptr if xp is cp else abs2sum.ctypes.data + cusv.abs2sum_array_batched( + handle, sv.data.ptr, data_type, self.n_qubits, + self.n_svs, self.sv_stride, + abs2sum_ptr, 2**bit_ordering_len, + bit_ordering, bit_ordering_len, + mask_bit_strings, mask_bit_ordering, mask_len) + + # we mask out half of the values + assert xp.allclose(abs2sum.sum(), self.n_svs * 0.5) + assert xp.allclose(abs2sum[..., 0], 0.5) + + class TestCollapse(TestSV): @pytest.mark.parametrize( 'input_form', ( - {'basis_bits': (numpy.int32, 'int'),}, - {'basis_bits': (numpy.int32, 'seq'),}, + {'basis_bits': (np.int32, 'int'),}, + {'basis_bits': (np.int32, 'seq'),}, ) ) @pytest.mark.parametrize( @@ -299,21 +443,21 @@ def test_collapse_on_z_basis(self, handle, parity, input_form): parity, basis_bits, basis_bits_len, 1) if parity == 0: - assert cupy.allclose(sv.sum(), 1) + assert cp.allclose(sv.sum(), 1) elif parity == 1: - assert cupy.allclose(sv.sum(), 0) + assert 
cp.allclose(sv.sum(), 0) @pytest.mark.parametrize( 'input_form', ( - {'bit_ordering': (numpy.int32, 'int'), 'bitstring': (numpy.int32, 'int')}, - {'bit_ordering': (numpy.int32, 'seq'), 'bitstring': (numpy.int32, 'seq')}, + {'bit_ordering': (np.int32, 'int'), 'bitstring': (np.int32, 'int')}, + {'bit_ordering': (np.int32, 'seq'), 'bitstring': (np.int32, 'seq')}, ) ) def test_collapse_by_bitstring(self, handle, input_form): # change sv to 1/\sqrt{2} (|000> + |111>) sv = self.get_sv() - sv[0] = numpy.sqrt(0.5) - sv[-1] = numpy.sqrt(0.5) + sv[0] = np.sqrt(0.5) + sv[-1] = np.sqrt(0.5) # collapse to |111> bitstring = [1] * self.n_qubits @@ -331,14 +475,64 @@ def test_collapse_by_bitstring(self, handle, input_form): handle, sv.data.ptr, data_type, self.n_qubits, bitstring, bit_ordering, bitstring_len, norm) - assert cupy.allclose(sv.sum(), 1) - assert cupy.allclose(sv[-1], 1) + assert cp.allclose(sv.sum(), 1) + assert cp.allclose(sv[-1], 1) + + +class TestBatchedCollapse(TestBatchedSV): + + @pytest.mark.parametrize( + 'input_form', ( + {'bit_ordering': (np.int32, 'int_h'), 'bitstrings': (np.int64, 'int_d'), 'norms': (np.double, 'int_d')}, + {'bit_ordering': (np.int32, 'int_h'), 'bitstrings': (np.int64, 'int_h'), 'norms': (np.double, 'int_h')}, + {'bit_ordering': (np.int32, 'seq'), 'bitstrings': (np.int64, 'seq'), 'norms': (np.double, 'seq')}, + ) + ) + def test_collapse_by_bitstring_batched(self, handle, input_form): + # change sv to 1/\sqrt{2} (|00...0> + |11...1>) + sv = self.get_sv() + sv[:, 0] = np.sqrt(0.5) + sv[:, 2**self.n_qubits-1] = np.sqrt(0.5) # Note the padding at the end + + bit_ordering = list(range(self.n_qubits)) + bit_ordering, _ = self._return_data( + bit_ordering, 'bit_ordering', *input_form['bit_ordering']) + bitstrings_len = self.n_qubits + data_type = dtype_to_data_type[self.dtype] + + # collapse to |11...1> + bitstrings = [2**self.n_qubits-1] * self.n_svs + bitstrings, _ = self._return_data( + bitstrings, 'bitstrings', *input_form['bitstrings']) + + 
# the sv after collapse is normalized as sv -> sv / \sqrt{norm} + norms = [0.5] * self.n_svs + norms, _ = self._return_data( + norms, 'norms', *input_form['norms']) + + workspace_size = cusv.collapse_by_bitstring_batched_get_workspace_size( + handle, self.n_svs, bitstrings, norms) + if workspace_size > 0: + workspace = cp.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 + + cusv.collapse_by_bitstring_batched( + handle, sv.data.ptr, data_type, self.n_qubits, + self.n_svs, self.sv_stride, + bitstrings, bit_ordering, bitstrings_len, + norms, + workspace_ptr, workspace_size) + cp.cuda.Device().synchronize() + assert cp.allclose(sv[:, 0:2**self.n_qubits].sum(), self.n_svs) + assert cp.allclose(sv[:, 2**self.n_qubits-1], cp.ones(self.n_svs, dtype=self.dtype)) @pytest.mark.parametrize( 'rand', # the choices here ensure we get either parity - (0, numpy.nextafter(1, 0)) + (0, np.nextafter(1, 0)) ) @pytest.mark.parametrize( 'collapse', @@ -348,16 +542,16 @@ class TestMeasure(TestSV): @pytest.mark.parametrize( 'input_form', ( - {'basis_bits': (numpy.int32, 'int'),}, - {'basis_bits': (numpy.int32, 'seq'),}, + {'basis_bits': (np.int32, 'int'),}, + {'basis_bits': (np.int32, 'seq'),}, ) ) def test_measure_on_z_basis(self, handle, rand, collapse, input_form): # change the sv to 1/\sqrt{2} (|000> + |010>) to allow 50-50 chance # of getting either parity sv = self.get_sv() - sv[0] = numpy.sqrt(0.5) - sv[2] = numpy.sqrt(0.5) + sv[0] = np.sqrt(0.5) + sv[2] = np.sqrt(0.5) basis_bits = list(range(self.n_qubits)) basis_bits, basis_bits_len = self._return_data( @@ -372,10 +566,10 @@ def test_measure_on_z_basis(self, handle, rand, collapse, input_form): if collapse == cusv.Collapse.NORMALIZE_AND_ZERO: if parity == 0: # collapse to |000> - assert cupy.allclose(sv[0], 1) + assert cp.allclose(sv[0], 1) elif parity == 1: # collapse to |111> - assert cupy.allclose(sv[2], 1) + assert cp.allclose(sv[2], 1) # sv is collapsed assert not (sv == orig_sv).all() 
else: @@ -384,19 +578,19 @@ def test_measure_on_z_basis(self, handle, rand, collapse, input_form): @pytest.mark.parametrize( 'input_form', ( - {'bit_ordering': (numpy.int32, 'int'),}, - {'bit_ordering': (numpy.int32, 'seq'),}, + {'bit_ordering': (np.int32, 'int'),}, + {'bit_ordering': (np.int32, 'seq'),}, ) ) def test_batch_measure(self, handle, rand, collapse, input_form): # change sv to 1/\sqrt{2} (|000> + |111>) sv = self.get_sv() - sv[0] = numpy.sqrt(0.5) - sv[-1] = numpy.sqrt(0.5) + sv[0] = np.sqrt(0.5) + sv[-1] = np.sqrt(0.5) orig_sv = sv.copy() data_type = dtype_to_data_type[self.dtype] - bitstring = numpy.empty(self.n_qubits, dtype=numpy.int32) + bitstring = np.empty(self.n_qubits, dtype=np.int32) bit_ordering = list(range(self.n_qubits)) bit_ordering, _ = self._return_data( bit_ordering, 'bit_ordering', *input_form['bit_ordering']) @@ -406,32 +600,104 @@ def test_batch_measure(self, handle, rand, collapse, input_form): bitstring.ctypes.data, bit_ordering, bitstring.size, rand, collapse) - if collapse == cusv.Collapse.NORMALIZE_AND_ZERO: - if bitstring.sum() == 0: + if bitstring.sum() == 0: + assert rand == 0 + if collapse == cusv.Collapse.NORMALIZE_AND_ZERO: # collapse to |000> - assert cupy.allclose(sv[0], 1) - elif bitstring.sum() == 3: + assert cp.allclose(sv[0], 1) + # sv is collapsed + assert (sv != orig_sv).any() + else: + # sv is intact + assert (sv == orig_sv).all() + elif bitstring.sum() == self.n_qubits: + assert rand == np.nextafter(1, 0) + if collapse == cusv.Collapse.NORMALIZE_AND_ZERO: # collapse to |111> - assert cupy.allclose(sv[-1], 1) + assert cp.allclose(sv[-1], 1) + # sv is collapsed + assert (sv != orig_sv).any() else: - assert False, f"unexpected bitstring: {bitstring}" - # sv is collapsed - assert not (sv == orig_sv).all() + # sv is intact + assert (sv == orig_sv).all() else: - assert bitstring.sum() in (0, 3) - # sv is intact - assert (sv == orig_sv).all() + assert False, f"unexpected bitstrings: {bitstrings}" + + +class 
TestMeasureBatched(TestBatchedSV): + + @pytest.mark.parametrize( + 'rand', + # the choices here ensure we get either parity + (0, np.nextafter(1, 0)) + ) + @pytest.mark.parametrize( + 'input_form', ( + {'bitstrings': (np.int64, 'int_h'), 'bit_ordering': (np.int32, 'int_h'), 'rand_nums': (np.float64, 'int_h')}, + {'bitstrings': (np.int64, 'int_d'), 'bit_ordering': (np.int32, 'int_h'), 'rand_nums': (np.float64, 'int_d')}, + {'bitstrings': (np.int64, 'int_d'), 'bit_ordering': (np.int32, 'seq'), 'rand_nums': (np.float64, 'seq')}, + ) + ) + @pytest.mark.parametrize('collapse', cusv.Collapse) + @pytest.mark.parametrize('xp', (np, cp)) + def test_measure_batched(self, handle, rand, input_form, collapse, xp): + # change sv to 1/\sqrt{2} (|00...0> + |11...1>) + sv = self.get_sv() + sv[:, 0] = np.sqrt(0.5) + sv[:, 2**self.n_qubits-1] = np.sqrt(0.5) # Note the padding at the end + orig_sv = sv.copy() + + data_type = dtype_to_data_type[self.dtype] + bitstrings = np.empty(self.n_svs, dtype=np.int32) + bitstrings, _ = self._return_data( + bitstrings, 'bitstrings', *input_form['bitstrings']) + bit_ordering = list(range(self.n_qubits)) + bit_ordering, bit_ordering_len = self._return_data( + bit_ordering, 'bit_ordering', *input_form['bit_ordering']) + rand_nums = [rand] * self.n_svs + rand_nums, _ = self._return_data( + rand_nums, 'rand_nums', *input_form['rand_nums']) + + cusv.measure_batched( + handle, sv.data.ptr, data_type, self.n_qubits, + self.n_svs, self.sv_stride, + bitstrings, bit_ordering, bit_ordering_len, + rand_nums, collapse) + + bitstrings = self.bitstrings + if bitstrings.sum() == 0: + assert rand == 0 + if collapse == cusv.Collapse.NORMALIZE_AND_ZERO: + # collapse to |00...0> + assert cp.allclose(sv[:, 0], 1) + # sv is collapsed + assert (sv != orig_sv).any() + else: + # sv is intact + assert (sv == orig_sv).all() + elif bitstrings.sum() == (2**self.n_qubits-1)*self.n_svs: + assert rand == np.nextafter(1, 0) + if collapse == cusv.Collapse.NORMALIZE_AND_ZERO: + # 
collapse to |11...1> + assert cp.allclose(sv[:, 2**self.n_qubits-1], 1) + # sv is collapsed + assert (sv != orig_sv).any() + else: + # sv is intact + assert (sv == orig_sv).all() + else: + assert False, f"unexpected bitstrings: {bitstrings}" class TestApply(TestSV): @pytest.mark.parametrize( 'input_form', ( - {'targets': (numpy.int32, 'int'), 'controls': (numpy.int32, 'int'), + {'targets': (np.int32, 'int'), 'controls': (np.int32, 'int'), # sizeof(enum) == sizeof(int) - 'paulis': (numpy.int32, 'int'),}, - {'targets': (numpy.int32, 'seq'), 'controls': (numpy.int32, 'seq'), - 'paulis': (numpy.int32, 'seq'),}, + 'paulis': (np.int32, 'int'),}, + {'targets': (np.int32, 'seq'), 'controls': (np.int32, 'seq'), + 'paulis': (np.int32, 'seq'),}, ) ) def test_apply_pauli_rotation(self, handle, input_form): @@ -454,25 +720,25 @@ def test_apply_pauli_rotation(self, handle, input_form): cusv.apply_pauli_rotation( handle, sv.data.ptr, data_type, self.n_qubits, - 0.5*numpy.pi, paulis, + 0.5*np.pi, paulis, targets, targets_len, controls, control_values, controls_len) sv *= -1j # result is |111> - assert cupy.allclose(sv[-1], 1) + assert cp.allclose(sv[-1], 1) @pytest.mark.parametrize( 'mempool', (None, 'py-callable', 'cffi', 'cffi_struct') ) @pytest.mark.parametrize( 'input_form', ( - {'targets': (numpy.int32, 'int'), 'controls': (numpy.int32, 'int')}, - {'targets': (numpy.int32, 'seq'), 'controls': (numpy.int32, 'seq')}, + {'targets': (np.int32, 'int'), 'controls': (np.int32, 'int')}, + {'targets': (np.int32, 'seq'), 'controls': (np.int32, 'seq')}, ) ) @pytest.mark.parametrize( - 'xp', (numpy, cupy) + 'xp', (np, cp) ) def test_apply_matrix(self, handle, xp, input_form, mempool): if (isinstance(mempool, str) and mempool.startswith('cffi') @@ -492,7 +758,7 @@ def test_apply_matrix(self, handle, xp, input_form, mempool): # matrix can live on host or device matrix = xp.zeros((2**self.n_qubits, 2**self.n_qubits), dtype=sv.dtype) matrix[-1][0] = 1 - matrix_ptr = matrix.ctypes.data if xp 
is numpy else matrix.data.ptr + matrix_ptr = matrix.ctypes.data if xp is np else matrix.data.ptr if mempool is None: workspace_size = cusv.apply_matrix_get_workspace_size( @@ -500,7 +766,7 @@ def test_apply_matrix(self, handle, xp, input_form, mempool): matrix_ptr, data_type, cusv.MatrixLayout.ROW, 0, targets_len, controls_len, compute_type) if workspace_size: - workspace = cupy.cuda.alloc(workspace_size) + workspace = cp.cuda.alloc(workspace_size) workspace_ptr = workspace.ptr else: workspace_ptr = 0 @@ -521,20 +787,19 @@ def test_apply_matrix(self, handle, xp, input_form, mempool): assert sv[-1] == 1 # output state is |111> - @pytest.mark.parametrize( 'mempool', (None, 'py-callable', 'cffi', 'cffi_struct') ) @pytest.mark.parametrize( 'input_form', ( - {'permutation': (numpy.int64, 'int'), 'basis_bits': (numpy.int32, 'int'), - 'mask_bitstring': (numpy.int32, 'int'), 'mask_ordering': (numpy.int32, 'int')}, - {'permutation': (numpy.int64, 'seq'), 'basis_bits': (numpy.int32, 'seq'), - 'mask_bitstring': (numpy.int32, 'seq'), 'mask_ordering': (numpy.int32, 'seq')}, + {'permutation': (np.int64, 'int'), 'basis_bits': (np.int32, 'int'), + 'mask_bitstring': (np.int32, 'int'), 'mask_ordering': (np.int32, 'int')}, + {'permutation': (np.int64, 'seq'), 'basis_bits': (np.int32, 'seq'), + 'mask_bitstring': (np.int32, 'seq'), 'mask_ordering': (np.int32, 'seq')}, ) ) @pytest.mark.parametrize( - 'xp', (numpy, cupy) + 'xp', (np, cp) ) def test_apply_generalized_permutation_matrix( self, handle, xp, input_form, mempool): @@ -548,14 +813,14 @@ def test_apply_generalized_permutation_matrix( compute_type = dtype_to_compute_type[self.dtype] # TODO(leofang): test permutation on either host or device - permutation = list(numpy.random.permutation(2**self.n_qubits)) + permutation = list(np.random.permutation(2**self.n_qubits)) permutation_data = permutation permutation, permutation_len = self._return_data( permutation, 'permutation', *input_form['permutation']) # diagonal can live on host or 
device diagonal = 10 * xp.ones((2**self.n_qubits, ), dtype=sv.dtype) - diagonal_ptr = diagonal.ctypes.data if xp is numpy else diagonal.data.ptr + diagonal_ptr = diagonal.ctypes.data if xp is np else diagonal.data.ptr basis_bits = list(range(self.n_qubits)) basis_bits, basis_bits_len = self._return_data( @@ -573,7 +838,7 @@ def test_apply_generalized_permutation_matrix( basis_bits, basis_bits_len, mask_len) if workspace_size: - workspace = cupy.cuda.alloc(workspace_size) + workspace = cp.cuda.alloc(workspace_size) workspace_ptr = workspace.ptr else: workspace_ptr = 0 @@ -592,7 +857,85 @@ def test_apply_generalized_permutation_matrix( mask_bitstring, mask_ordering, mask_len, workspace_ptr, workspace_size) - assert cupy.allclose(sv, diagonal[xp.asarray(permutation_data)]) + assert cp.allclose(sv, diagonal[xp.asarray(permutation_data)]) + + +class TestBatchedApply(TestBatchedSV): + + @pytest.mark.parametrize( + 'mempool', (None, 'py-callable', 'cffi', 'cffi_struct') + ) + @pytest.mark.parametrize( + 'input_form', ( + {'matrix_indices': (np.int32, 'int_h'), 'targets': (np.int32, 'int_h'), 'controls': (np.int32, 'int_h')}, + {'matrix_indices': (np.int32, 'int_d'), 'targets': (np.int32, 'int_h'), 'controls': (np.int32, 'int_h')}, + {'matrix_indices': (np.int32, 'seq'), 'targets': (np.int32, 'seq'), 'controls': (np.int32, 'seq')}, + ) + ) + @pytest.mark.parametrize('xp', (np, cp)) + @pytest.mark.parametrize('map_type', cusv.MatrixMapType) + def test_apply_matrix_batched( + self, handle, map_type, xp, input_form, mempool): + if (isinstance(mempool, str) and mempool.startswith('cffi') + and not _can_use_cffi()): + pytest.skip("cannot run cffi tests") + + sv = self.get_sv() + data_type = dtype_to_data_type[self.dtype] + compute_type = dtype_to_compute_type[self.dtype] + targets = list(range(self.n_qubits)) + targets, targets_len = self._return_data( + targets, 'targets', *input_form['targets']) + controls = [] + controls, controls_len = self._return_data( + controls, 
'controls', *input_form['controls']) + + if map_type == cusv.MatrixMapType.BROADCAST: + n_matrices = 1 + elif map_type == cusv.MatrixMapType.MATRIX_INDEXED: + n_matrices = self.n_svs + + # matrices and their indices can live on host or device + matrices = xp.zeros( + (n_matrices, 2**self.n_qubits, 2**self.n_qubits), + dtype=sv.dtype) + matrices[..., -1, 0] = 1 + matrices_ptr = matrices.ctypes.data if xp is np else matrices.data.ptr + matrix_indices = list(range(n_matrices)) + if len(matrix_indices) > 1: + np.random.shuffle(matrix_indices) + matrix_indices, n_matrices = self._return_data( + matrix_indices, 'matrix_indices', *input_form['matrix_indices']) + + if mempool is None: + workspace_size = cusv.apply_matrix_batched_get_workspace_size( + handle, data_type, self.n_qubits, self.n_svs, self.sv_stride, + map_type, matrix_indices, matrices_ptr, data_type, + cusv.MatrixLayout.ROW, 0, n_matrices, + targets_len, controls_len, compute_type) + if workspace_size: + workspace = cp.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 + else: + mr = MemoryResourceFactory(mempool) + handler = mr.get_dev_mem_handler() + cusv.set_device_mem_handler(handle, handler) + + workspace_ptr = 0 + workspace_size = 0 + + cusv.apply_matrix_batched( + handle, sv.data.ptr, data_type, self.n_qubits, + self.n_svs, self.sv_stride, map_type, matrix_indices, + matrices_ptr, data_type, + cusv.MatrixLayout.ROW, 0, n_matrices, + targets, targets_len, + controls, 0, controls_len, + compute_type, workspace_ptr, workspace_size) + + assert (sv[..., 2**self.n_qubits-1] == 1).all() # output state is |11...1> class TestExpect(TestSV): @@ -602,15 +945,15 @@ class TestExpect(TestSV): ) @pytest.mark.parametrize( 'input_form', ( - {'basis_bits': (numpy.int32, 'int'),}, - {'basis_bits': (numpy.int32, 'seq'),}, + {'basis_bits': (np.int32, 'int'),}, + {'basis_bits': (np.int32, 'seq'),}, ) ) @pytest.mark.parametrize( - 'expect_dtype', (numpy.float64, numpy.complex128) + 
'expect_dtype', (np.float64, np.complex128) ) @pytest.mark.parametrize( - 'xp', (numpy, cupy) + 'xp', (np, cp) ) def test_compute_expectation(self, handle, xp, expect_dtype, input_form, mempool): if (isinstance(mempool, str) and mempool.startswith('cffi') @@ -619,7 +962,7 @@ def test_compute_expectation(self, handle, xp, expect_dtype, input_form, mempool # create a uniform sv sv = self.get_sv() - sv[:] = numpy.sqrt(1/(2**self.n_qubits)) + sv[:] = np.sqrt(1/(2**self.n_qubits)) data_type = dtype_to_data_type[self.dtype] compute_type = dtype_to_compute_type[self.dtype] @@ -629,7 +972,7 @@ def test_compute_expectation(self, handle, xp, expect_dtype, input_form, mempool # matrix can live on host or device matrix = xp.ones((2**self.n_qubits, 2**self.n_qubits), dtype=sv.dtype) - matrix_ptr = matrix.ctypes.data if xp is numpy else matrix.data.ptr + matrix_ptr = matrix.ctypes.data if xp is np else matrix.data.ptr if mempool is None: workspace_size = cusv.compute_expectation_get_workspace_size( @@ -637,7 +980,7 @@ def test_compute_expectation(self, handle, xp, expect_dtype, input_form, mempool matrix_ptr, data_type, cusv.MatrixLayout.ROW, basis_bits_len, compute_type) if workspace_size: - workspace = cupy.cuda.alloc(workspace_size) + workspace = cp.cuda.alloc(workspace_size) workspace_ptr = workspace.ptr else: workspace_ptr = 0 @@ -649,10 +992,10 @@ def test_compute_expectation(self, handle, xp, expect_dtype, input_form, mempool workspace_ptr = 0 workspace_size = 0 - expect = numpy.empty((1,), dtype=expect_dtype) + expect = np.empty((1,), dtype=expect_dtype) # TODO(leofang): check if this is relaxed in beta 2 expect_data_type = ( - cudaDataType.CUDA_R_64F if expect_dtype == numpy.float64 + cudaDataType.CUDA_R_64F if expect_dtype == np.float64 else cudaDataType.CUDA_C_64F) cusv.compute_expectation( @@ -668,7 +1011,7 @@ def test_compute_expectation(self, handle, xp, expect_dtype, input_form, mempool def test_compute_expectations_on_pauli_basis(self, handle): # create a uniform 
sv sv = self.get_sv() - sv[:] = numpy.sqrt(1/(2**self.n_qubits)) + sv[:] = np.sqrt(1/(2**self.n_qubits)) data_type = dtype_to_data_type[self.dtype] compute_type = dtype_to_compute_type[self.dtype] @@ -679,16 +1022,16 @@ def test_compute_expectations_on_pauli_basis(self, handle): basis_bits = [[*range(self.n_qubits)] for i in range(len(paulis))] n_basis_bits = [len(basis_bits[i]) for i in range(len(paulis))] - expect = numpy.empty((len(paulis),), dtype=numpy.float64) + expect = np.empty((len(paulis),), dtype=np.float64) cusv.compute_expectations_on_pauli_basis( handle, sv.data.ptr, data_type, self.n_qubits, expect.ctypes.data, paulis, len(paulis), basis_bits, n_basis_bits) - result = numpy.zeros_like(expect) + result = np.zeros_like(expect) result[0] = 1 # for XX...X - assert numpy.allclose(expect, result) + assert np.allclose(expect, result) class TestSampler(TestSV): @@ -698,8 +1041,8 @@ class TestSampler(TestSV): ) @pytest.mark.parametrize( 'input_form', ( - {'bit_ordering': (numpy.int32, 'int'),}, - {'bit_ordering': (numpy.int32, 'seq'),}, + {'bit_ordering': (np.int32, 'int'),}, + {'bit_ordering': (np.int32, 'seq'),}, ) ) def test_sampling(self, handle, input_form, mempool): @@ -709,14 +1052,14 @@ def test_sampling(self, handle, input_form, mempool): # create a uniform sv sv = self.get_sv() - sv[:] = numpy.sqrt(1/(2**self.n_qubits)) + sv[:] = np.sqrt(1/(2**self.n_qubits)) data_type = dtype_to_data_type[self.dtype] compute_type = dtype_to_compute_type[self.dtype] shots = 4096 - bitstrings = numpy.empty((shots,), dtype=numpy.int64) - rand_nums = numpy.random.random((shots,)).astype(numpy.float64) + bitstrings = np.empty((shots,), dtype=np.int64) + rand_nums = np.random.random((shots,)).astype(np.float64) # measure all qubits bit_ordering = list(range(self.n_qubits)) bit_ordering, _ = self._return_data( @@ -726,7 +1069,7 @@ def test_sampling(self, handle, input_form, mempool): handle, sv.data.ptr, data_type, self.n_qubits, shots) if mempool is None: if 
workspace_size: - workspace = cupy.cuda.alloc(workspace_size) + workspace = cp.cuda.alloc(workspace_size) workspace_ptr = workspace.ptr else: workspace_ptr = 0 @@ -756,13 +1099,13 @@ def test_sampling(self, handle, input_form, mempool): finally: cusv.sampler_destroy(sampler) - keys, counts = numpy.unique(bitstrings, return_counts=True) + keys, counts = np.unique(bitstrings, return_counts=True) # keys are the returned bitstrings 000, 001, ..., 111 # the sv has all components, and unique() returns a sorted array, # so the following should hold: - assert (keys == numpy.arange(2**self.n_qubits)).all() + assert (keys == np.arange(2**self.n_qubits)).all() - assert numpy.allclose(norm, 1) + assert np.allclose(norm, 1) # TODO: test counts, which should follow a uniform distribution @@ -773,8 +1116,8 @@ def test_sampling(self, handle, input_form, mempool): # TODO(leofang): test mask_bitstring & mask_ordering @pytest.mark.parametrize( 'input_form', ( - {'bit_ordering': (numpy.int32, 'int'), 'mask_bitstring': (numpy.int32, 'int'), 'mask_ordering': (numpy.int32, 'int')}, - {'bit_ordering': (numpy.int32, 'seq'), 'mask_bitstring': (numpy.int32, 'seq'), 'mask_ordering': (numpy.int32, 'seq')}, + {'bit_ordering': (np.int32, 'int'), 'mask_bitstring': (np.int32, 'int'), 'mask_ordering': (np.int32, 'int')}, + {'bit_ordering': (np.int32, 'seq'), 'mask_bitstring': (np.int32, 'seq'), 'mask_ordering': (np.int32, 'seq')}, ) ) @pytest.mark.parametrize( @@ -789,8 +1132,8 @@ def test_accessor_get(self, handle, readonly, input_form, mempool): # create a monotonically increasing sv sv = self.get_sv() - data = cupy.arange(2**self.n_qubits, dtype=sv.dtype) - data /= cupy.sqrt(data**2) + data = cp.arange(2**self.n_qubits, dtype=sv.dtype) + data /= cp.sqrt(data**2) sv[:] = data data_type = dtype_to_data_type[self.dtype] @@ -818,7 +1161,7 @@ def test_accessor_get(self, handle, readonly, input_form, mempool): try: if mempool is None: if workspace_size: - workspace = cupy.cuda.alloc(workspace_size) + 
workspace = cp.cuda.alloc(workspace_size) workspace_ptr = workspace.ptr else: workspace_ptr = 0 @@ -834,7 +1177,7 @@ def test_accessor_get(self, handle, readonly, input_form, mempool): handle, accessor, workspace_ptr, workspace_size) buf_len = 2**2 - buf = cupy.empty(buf_len, dtype=sv.dtype) + buf = cp.empty(buf_len, dtype=sv.dtype) # copy the last buf_len elements cusv.accessor_get( @@ -851,8 +1194,8 @@ def test_accessor_set(self, handle, readonly, input_form, mempool): # create a monotonically increasing sv sv = self.get_sv() - data = cupy.arange(2**self.n_qubits, dtype=sv.dtype) - data /= cupy.sqrt(data**2) + data = cp.arange(2**self.n_qubits, dtype=sv.dtype) + data /= cp.sqrt(data**2) sv[:] = data data_type = dtype_to_data_type[self.dtype] @@ -880,7 +1223,7 @@ def test_accessor_set(self, handle, readonly, input_form, mempool): try: if mempool is None: if workspace_size: - workspace = cupy.cuda.alloc(workspace_size) + workspace = cp.cuda.alloc(workspace_size) workspace_ptr = workspace.ptr else: workspace_ptr = 0 @@ -896,7 +1239,7 @@ def test_accessor_set(self, handle, readonly, input_form, mempool): handle, accessor, workspace_ptr, workspace_size) buf_len = 2**2 - buf = cupy.zeros(buf_len, dtype=sv.dtype) + buf = cp.zeros(buf_len, dtype=sv.dtype) if readonly: # copy the last buf_len elements would fail @@ -927,15 +1270,15 @@ class TestTestMatrixType: ) @pytest.mark.parametrize( 'input_form', ( - {'targets': (numpy.int32, 'int'), }, - {'targets': (numpy.int32, 'seq'), }, + {'targets': (np.int32, 'int'), }, + {'targets': (np.int32, 'seq'), }, ) ) @pytest.mark.parametrize( - 'dtype', (numpy.complex64, numpy.complex128) + 'dtype', (np.complex64, np.complex128) ) @pytest.mark.parametrize( - 'xp', (numpy, cupy) + 'xp', (np, cp) ) def test_apply_matrix_type( self, handle, xp, dtype, input_form, matrix_type, mempool): @@ -951,7 +1294,7 @@ def test_apply_matrix_type( # choose a trivial matrix data = xp.ones(2**n_targets, dtype=dtype) matrix = xp.diag(data) - matrix_ptr = 
matrix.ctypes.data if xp is numpy else matrix.data.ptr + matrix_ptr = matrix.ctypes.data if xp is np else matrix.data.ptr if mempool is None: workspace_size = cusv.test_matrix_type_get_workspace_size( @@ -959,7 +1302,7 @@ def test_apply_matrix_type( matrix_ptr, data_type, cusv.MatrixLayout.ROW, n_targets, 0, compute_type) if workspace_size: - workspace = cupy.cuda.alloc(workspace_size) + workspace = cp.cuda.alloc(workspace_size) workspace_ptr = workspace.ptr else: workspace_ptr = 0 @@ -975,26 +1318,26 @@ def test_apply_matrix_type( handle, matrix_type, matrix_ptr, data_type, cusv.MatrixLayout.ROW, n_targets, 0, compute_type, workspace_ptr, workspace_size) - assert numpy.isclose(residual, 0) + assert np.isclose(residual, 0) @pytest.mark.parametrize( 'rand', # the choices here ensure we get either parity - (0, numpy.nextafter(1, 0)) + (0, np.nextafter(1, 0)) ) @pytest.mark.parametrize( 'collapse', (cusv.Collapse.NORMALIZE_AND_ZERO, cusv.Collapse.NONE) ) @pytest.mark.skipif( - cupy.cuda.runtime.getDeviceCount() < 2, reason='not enough GPUs') + cp.cuda.runtime.getDeviceCount() < 2, reason='not enough GPUs') class TestBatchMeasureWithSubSV(TestMultiGpuSV): @pytest.mark.parametrize( 'input_form', ( - {'bit_ordering': (numpy.int32, 'int'),}, - {'bit_ordering': (numpy.int32, 'seq'),}, + {'bit_ordering': (np.int32, 'int'),}, + {'bit_ordering': (np.int32, 'seq'),}, ) ) @pytest.mark.parametrize( @@ -1011,28 +1354,28 @@ def test_batch_measure_with_offset( # change sv to 1/\sqrt{2} (|0000> + |1111>), and compute abs2sum; # calling abs2sum_array is also OK, but we focus on testing the target API - cumulative_array = numpy.zeros(self.n_devices+1, dtype=numpy.float64) + cumulative_array = np.zeros(self.n_devices+1, dtype=np.float64) for i_sv in range(self.n_devices): - with cupy.cuda.Device(i_sv): + with cp.cuda.Device(i_sv): if i_sv == 0: # |0 000> is on GPU 0 - sub_sv[i_sv][0] = numpy.sqrt(0.5) + sub_sv[i_sv][0] = np.sqrt(0.5) elif i_sv == 1: # |1 111> is on GPU 1 - 
sub_sv[i_sv][-1] = numpy.sqrt(0.5) - abs2sum = cupy.asnumpy(cupy.sum(cupy.abs(sub_sv[i_sv])**2)) + sub_sv[i_sv][-1] = np.sqrt(0.5) + abs2sum = cp.asnumpy(cp.sum(cp.abs(sub_sv[i_sv])**2)) cumulative_array[i_sv+1] = cumulative_array[i_sv] + abs2sum orig_sub_sv = copy.deepcopy(sub_sv) - bitstring = numpy.empty(self.n_local_bits, dtype=numpy.int32) + bitstring = np.empty(self.n_local_bits, dtype=np.int32) for i_sv in range(self.n_devices): if (cumulative_array[i_sv] <= rand and rand < cumulative_array[i_sv+1]): global_bits = i_sv norm = cumulative_array[-1] offset = cumulative_array[i_sv] - with cupy.cuda.Device(i_sv) as dev: + with cp.cuda.Device(i_sv) as dev: cusv.batch_measure_with_offset( handles[i_sv], sub_sv[i_sv].data.ptr, data_type, self.n_local_bits, bitstring.ctypes.data, @@ -1056,25 +1399,25 @@ def test_batch_measure_with_offset( # the measured sub sv is collapsed (those not measured are intact!) if global_bits == 0: # collapse to |0 000> - with cupy.cuda.Device(0): - assert cupy.allclose(sub_sv[0][0], 1) + with cp.cuda.Device(0): + assert cp.allclose(sub_sv[0][0], 1) assert not (sub_sv[0] == orig_sub_sv[0]).all() - with cupy.cuda.Device(1): + with cp.cuda.Device(1): assert (sub_sv[1] == orig_sub_sv[1]).all() elif global_bits == 1: # collapse to |1 111> - with cupy.cuda.Device(0): + with cp.cuda.Device(0): assert (sub_sv[0] == orig_sub_sv[0]).all() - with cupy.cuda.Device(1): - assert cupy.allclose(sub_sv[1][-1], 1) + with cp.cuda.Device(1): + assert cp.allclose(sub_sv[1][-1], 1) assert not (sub_sv[1] == orig_sub_sv[1]).all() else: assert False, f"unexpected bitstring: {bitstring}" else: # sv is intact - with cupy.cuda.Device(0): + with cp.cuda.Device(0): assert (sub_sv[0] == orig_sub_sv[0]).all() - with cupy.cuda.Device(1): + with cp.cuda.Device(1): assert (sub_sv[1] == orig_sub_sv[1]).all() @@ -1082,18 +1425,18 @@ class TestSwap: @pytest.mark.parametrize( 'input_form', ( - {'swapped_bits': (numpy.int32, 'int'), - 'mask_bitstring': (numpy.int32, 'int'), 
'mask_ordering': (numpy.int32, 'int')}, - {'swapped_bits': (numpy.int32, 'seq'), - 'mask_bitstring': (numpy.int32, 'seq'), 'mask_ordering': (numpy.int32, 'seq')}, + {'swapped_bits': (np.int32, 'int'), + 'mask_bitstring': (np.int32, 'int'), 'mask_ordering': (np.int32, 'int')}, + {'swapped_bits': (np.int32, 'seq'), + 'mask_bitstring': (np.int32, 'seq'), 'mask_ordering': (np.int32, 'seq')}, ) ) @pytest.mark.parametrize( - 'dtype', (numpy.complex64, numpy.complex128) + 'dtype', (np.complex64, np.complex128) ) def test_swap_index_bits(self, handle, dtype, input_form): n_qubits = 4 - sv = cupy.zeros(2**n_qubits, dtype=dtype) + sv = cp.zeros(2**n_qubits, dtype=dtype) data_type = dtype_to_data_type[dtype] # set sv to |0110> @@ -1103,7 +1446,7 @@ def test_swap_index_bits(self, handle, dtype, input_form): swapped_bits = [(0, 2), (1, 3)] n_swapped_bits = len(swapped_bits) if input_form['swapped_bits'][1] == 'int': - swapped_bits_data = numpy.asarray( + swapped_bits_data = np.asarray( swapped_bits, dtype=input_form['swapped_bits'][0]) swapped_bits = swapped_bits_data.ctypes.data @@ -1127,15 +1470,15 @@ def test_swap_index_bits(self, handle, dtype, input_form): 'topology', [t for t in cusv.DeviceNetworkType] ) @pytest.mark.skipif( - cupy.cuda.runtime.getDeviceCount() < 2, reason='not enough GPUs') + cp.cuda.runtime.getDeviceCount() < 2, reason='not enough GPUs') class TestMultiGPUSwap(TestMultiGpuSV): @pytest.mark.parametrize( 'input_form', ( - {'handles': (numpy.intp, 'int'), 'sub_svs': (numpy.intp, 'int'), - 'swapped_bits': (numpy.int32, 'int'), 'mask': (numpy.int32, 'int')}, - {'handles': (numpy.intp, 'seq'), 'sub_svs': (numpy.intp, 'seq'), - 'swapped_bits': (numpy.int32, 'seq'), 'mask': (numpy.int32, 'seq')}, + {'handles': (np.intp, 'int'), 'sub_svs': (np.intp, 'int'), + 'swapped_bits': (np.int32, 'int'), 'mask': (np.int32, 'int')}, + {'handles': (np.intp, 'seq'), 'sub_svs': (np.intp, 'seq'), + 'swapped_bits': (np.int32, 'seq'), 'mask': (np.int32, 'seq')}, ) ) 
@pytest.mark.parametrize( @@ -1152,19 +1495,19 @@ def test_multi_device_swap_index_bits( data_type = dtype_to_data_type[self.dtype] # set sv to |0110> (up to normalization) - with cupy.cuda.Device(0): + with cp.cuda.Device(0): sub_sv[0][0] = 0 sub_sv[0][-2] = 1 if input_form['handles'][1] == 'int': - handles_data = numpy.asarray( + handles_data = np.asarray( handles, dtype=input_form['handles'][0]) handles = handles_data.ctypes.data sub_sv_data = sub_sv sub_sv_ptr_data = [arr.data.ptr for arr in sub_sv] sub_sv = sub_sv_ptr_data if input_form['sub_svs'][1] == 'int': - sub_sv_ptr_data = numpy.asarray( + sub_sv_ptr_data = np.asarray( sub_sv_ptr_data, dtype=input_form['sub_svs'][0]) sub_sv = sub_sv_ptr_data.ctypes.data else: @@ -1173,7 +1516,7 @@ def test_multi_device_swap_index_bits( swapped_bits = [(3, 1)] n_swapped_bits = len(swapped_bits) if input_form['swapped_bits'][1] == 'int': - swapped_bits_data = numpy.asarray( + swapped_bits_data = np.asarray( swapped_bits, dtype=input_form['swapped_bits'][0]) swapped_bits = swapped_bits_data.ctypes.data @@ -1182,10 +1525,10 @@ def test_multi_device_swap_index_bits( mask_ordering = [] mask_len = 0 if input_form['mask'][1] == 'int': - mask_bitstring_data = numpy.asarray( + mask_bitstring_data = np.asarray( mask_bitstring, dtype=input_form['mask'][0]) mask_bitstring = mask_bitstring_data.ctypes.data - mask_ordering_data = numpy.asarray( + mask_ordering_data = np.asarray( mask_ordering, dtype=input_form['mask'][0]) mask_ordering = mask_ordering_data.ctypes.data @@ -1198,9 +1541,9 @@ def test_multi_device_swap_index_bits( # now we should get |1100> sub_sv = sub_sv_data - with cupy.cuda.Device(0): + with cp.cuda.Device(0): assert sub_sv[0][-2] == 0 - with cupy.cuda.Device(1): + with cp.cuda.Device(1): assert sub_sv[1][4] == 1 @@ -1270,7 +1613,7 @@ def test_parameters(self): assert parameters.ptr == new_parameters.ptr assert parameters == new_parameters - new_parameters_arr = numpy.empty( + new_parameters_arr = np.empty( (1,), 
dtype=cusv.sv_swap_parameters_dtype) new_parameters_arr['segment_mask_ordering'][:] = 1 new_parameters = cusv.SVSwapParameters.from_data(new_parameters_arr) @@ -1279,12 +1622,12 @@ def test_parameters(self): assert parameters != new_parameters # negative tests - parameters_arr = numpy.empty( + parameters_arr = np.empty( (2,), dtype=cusv.sv_swap_parameters_dtype) with pytest.raises(ValueError) as e: # wrong size parameters = cusv.SVSwapParameters.from_data(parameters_arr) - parameters_arr = numpy.empty( - (1,), dtype=numpy.float32) + parameters_arr = np.empty( + (1,), dtype=np.float32) with pytest.raises(ValueError) as e: # wrong dtype parameters = cusv.SVSwapParameters.from_data(parameters_arr) parameters_arr = "ABC" @@ -1294,19 +1637,19 @@ def test_parameters(self): class TestWorker: - event = cupy.cuda.Event() - stream = cupy.cuda.Stream() - sv = cupy.zeros((2**4,), dtype=cupy.complex64) + event = cp.cuda.Event() + stream = cp.cuda.Stream() + sv = cp.zeros((2**4,), dtype=cp.complex64) @pytest.mark.parametrize( "worker_args", ((sv.data.ptr, 0, event.ptr, cudaDataType.CUDA_C_32F, stream.ptr),) ) @pytest.mark.parametrize( 'input_form', ( - {'sv': (numpy.intp, 'int'), 'indices': (numpy.int32, 'int'), - 'event': (numpy.intp, 'int')}, - {'sv': (numpy.intp, 'seq'), 'indices': (numpy.int32, 'seq'), - 'event': (numpy.intp, 'seq')}, + {'sv': (np.intp, 'int'), 'indices': (np.int32, 'int'), + 'event': (np.intp, 'int')}, + {'sv': (np.intp, 'seq'), 'indices': (np.int32, 'seq'), + 'event': (np.intp, 'seq')}, ) ) @pytest.mark.parametrize( @@ -1318,30 +1661,30 @@ def test_worker(self, handle, worker_args, input_form, param_form): 0, # set the communicator to null, assuming single process *worker_args) - extra_space = cupy.cuda.alloc(extra_size) + extra_space = cp.cuda.alloc(extra_size) cusv.sv_swap_worker_set_extra_workspace( handle, worker, extra_space.ptr, extra_size) - transfer_space = cupy.cuda.alloc(min_size) + transfer_space = cp.cuda.alloc(min_size) 
cusv.sv_swap_worker_set_transfer_workspace( handle, worker, transfer_space.ptr, min_size) sv = [self.sv.data.ptr] if input_form['sv'][1] == 'int': - sv_data = numpy.asarray( + sv_data = np.asarray( sv, dtype=input_form['sv'][0]) sv = sv_data.ctypes.data indices = [1] if input_form['indices'][1] == 'int': - indices_data = numpy.asarray( + indices_data = np.asarray( indices, dtype=input_form['indices'][0]) indices = indices_data.ctypes.data - dummy = cupy.cuda.Event() + dummy = cp.cuda.Event() event = [dummy.ptr] if input_form['event'][1] == 'int': - event_data = numpy.asarray( + event_data = np.asarray( event, dtype=input_form['event'][0]) event = event_data.ctypes.data @@ -1380,8 +1723,8 @@ class TestScheduler: ) @pytest.mark.parametrize( 'input_form', ( - {'swapped_bits': (numpy.int32, 'int'), 'mask': (numpy.int32, 'int')}, - {'swapped_bits': (numpy.int32, 'seq'), 'mask': (numpy.int32, 'seq')}, + {'swapped_bits': (np.int32, 'int'), 'mask': (np.int32, 'int')}, + {'swapped_bits': (np.int32, 'seq'), 'mask': (np.int32, 'seq')}, ) ) @pytest.mark.parametrize( @@ -1394,7 +1737,7 @@ def test_scheduler(self, handle, scheduler_args, input_form, param_form): swapped_bits = [(0, 1)] n_swapped_bits = len(swapped_bits) if input_form['swapped_bits'][1] == 'int': - swapped_bits_data = numpy.asarray( + swapped_bits_data = np.asarray( swapped_bits, dtype=input_form['swapped_bits'][0]) swapped_bits = swapped_bits_data.ctypes.data @@ -1403,10 +1746,10 @@ def test_scheduler(self, handle, scheduler_args, input_form, param_form): mask_ordering = [] mask_len = 0 if input_form['mask'][1] == 'int': - mask_bitstring_data = numpy.asarray( + mask_bitstring_data = np.asarray( mask_bitstring, dtype=input_form['mask'][0]) mask_bitstring = mask_bitstring_data.ctypes.data - mask_ordering_data = numpy.asarray( + mask_ordering_data = np.asarray( mask_ordering, dtype=input_form['mask'][0]) mask_ordering = mask_ordering_data.ctypes.data @@ -1419,9 +1762,9 @@ def test_scheduler(self, handle, 
scheduler_args, input_form, param_form): elif param_form == "class": params_in = cusv.SVSwapParameters() elif param_form == "ndarray": - params_in = numpy.empty((1,), dtype=cusv.sv_swap_parameters_dtype) + params_in = np.empty((1,), dtype=cusv.sv_swap_parameters_dtype) elif param_form == "int": - params = numpy.empty((1,), dtype=cusv.sv_swap_parameters_dtype) + params = np.empty((1,), dtype=cusv.sv_swap_parameters_dtype) params_in = params.ctypes.data else: assert False diff --git a/python/tests/cuquantum_tests/cutensornet_tests/__init__.py b/python/tests/cuquantum_tests/cutensornet_tests/__init__.py index 3ff1538..8f8d432 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/__init__.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/__init__.py @@ -5,6 +5,8 @@ import cupy as cp try: import torch + if not torch.cuda.is_available(): + raise ImportError torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False except ImportError: diff --git a/python/tests/cuquantum_tests/cutensornet_tests/approxTN_utils.py b/python/tests/cuquantum_tests/cutensornet_tests/approxTN_utils.py index fde2d03..605c4a1 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/approxTN_utils.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/approxTN_utils.py @@ -46,6 +46,8 @@ def parse_split_expression(split_expression): def get_new_modes(used_modes, num): + # Note: cannot use _internal.circuit_converter_utils._get_symbol() here, as this + # module needs to be standalone. We don't need that many symbols here, anyway. 
base_modes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" new_modes = "" for mode in base_modes: @@ -174,7 +176,7 @@ def split_contract_decompose(subscripts): def torch_support_wrapper(func): def new_func(T, *args, **kwargs): backend = infer_backend(T) - if backend not in (cp, np): # torch + if backend not in (cp, np): # torch if T.device.type == 'cpu': out = func(T.numpy(), *args, **kwargs) else: @@ -216,7 +218,8 @@ def matrix_svd( rel_cutoff=0, partition=None, normalization=None, - return_info=True + return_info=True, + **kwargs, ): info = dict() backend = infer_backend(T) @@ -507,13 +510,46 @@ def verify_split_SVD( modes_in, left_modes, right_modes, shared_mode = parse_split_expression(split_expression) shared_mode_idx = left_modes.index(shared_mode) shared_extent = array_u.shape[shared_mode_idx] - if is_exact_split(**split_options) and T is not None: + try: + max_mid_extent = min(array_u.size, array_v.size) // shared_extent + except: + # for torch + max_mid_extent = min(array_u.numel(), array_v.numel()) // shared_extent + max_extent = split_options.pop('max_extent', max_mid_extent) + if is_exact_split(**split_options) and max_extent == max_mid_extent and T is not None: reference = T else: reference = reverse_einsum(split_expression, array_u_ref, array_s_ref, array_v_ref) out = reverse_einsum(split_expression, array_u, array_s, array_v) + if hasattr(out.dtype, "name"): + dtype_name = out.dtype.name + else: + dtype_name = str(out.dtype).split('.')[-1] backend = infer_backend(out) rtol = get_tolerance("svd", out.dtype) # Note: tolerance for gate and svd is equal + if info is not None: + algorithm = info['algorithm'] + else: + algorithm = 'gesvd' + if algorithm == 'gesvdj': + if dtype_name in ['float64', 'complex128']: + rtol = 1e-8 + if 'gesvdj_residual' not in info: + logging.warning("gesvdj_residual not recorded in info; verification may fail due to unknown runtime status") + else: + rtol = max(rtol, info['gesvdj_residual']) + elif algorithm == 
'gesvdp': + if dtype_name in ['float64', 'complex128']: + rtol = 1e-8 + if 'gesvdp_err_sigma' not in info: + logging.warning("gesvdp_err_sigma not recorded in info; verification may fail due to unknown runtime status") + elif info['gesvdp_err_sigma'] > 1e-4: + logging.warning(f"Large err sigma found for gesvdp: {info['gesvdp_err_sigma']}, skipping verification") + return True + elif algorithm == 'gesvdr': + if dtype_name in ['float64', 'complex128']: + rtol = 1e-4 + is_equal = verify_close(reference, out, rtol, True, scale_factor=shared_extent, error_message="Contracted output is not close to the expected outcome") partition = split_options.get("partition", None) @@ -549,7 +585,9 @@ def verify_split_SVD( if info is not None and info_ref is not None: for attr in ["full_extent", "reduced_extent"]: info_equal = info_equal and info[attr] == info_ref[attr] - info_equal = info_equal and (abs(info["discarded_weight"]-info_ref["discarded_weight"]) < rtol) + # For gesvdr, discarded weight is only computed when fix extent truncation is not enabled + if info['algorithm'] != 'gesvdr' or max_extent == max_mid_extent: + info_equal = info_equal and (abs(info["discarded_weight"]-info_ref["discarded_weight"]) < rtol) if not info_equal: info_details = "".join([f"{key}:({info.get(key)}, {info_ref.get(key)}); " for key in info.keys()]) logging.error(f"SVD Info not matching the reference: {info_details}") diff --git a/python/tests/cuquantum_tests/cutensornet_tests/circuit_utils.py b/python/tests/cuquantum_tests/cutensornet_tests/circuit_utils.py index b6a3f03..69f1fa1 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/circuit_utils.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/circuit_utils.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: BSD-3-Clause +from collections import Counter import itertools from types import MappingProxyType @@ -11,22 +12,29 @@ cirq = None import cupy as cp import numpy as np +import pytest try: import torch + if not 
torch.cuda.is_available(): + raise ImportError except ImportError: torch = None try: import qiskit except ImportError: qiskit = None - + from cuquantum import contract, CircuitToEinsum +from cuquantum import cutensornet as cutn +from cuquantum.cutensornet._internal import circuit_parser_utils_cirq, circuit_parser_utils_qiskit from cuquantum.cutensornet._internal.circuit_converter_utils import convert_mode_labels_to_expression from cuquantum.cutensornet._internal.circuit_converter_utils import EINSUM_SYMBOLS_BASE from cuquantum.cutensornet._internal.circuit_converter_utils import get_pauli_gates from cuquantum.cutensornet._internal.circuit_converter_utils import parse_gates_to_mode_labels_operands from cuquantum.cutensornet._internal.utils import infer_object_package -from .test_utils import atol_mapper, rtol_mapper + +from .test_utils import atol_mapper, get_stream_for_backend, rtol_mapper +from .test_cutensornet import manage_resource # note: this implementation would cause pytorch tests being silently skipped @@ -60,16 +68,17 @@ def bitstring_generator(n_qubits, nsample=1): def where_fixed_generator(qubits, nfix_max, nsite_max=None): indices = np.arange(len(qubits)) for nfix in range(nfix_max): - np.random.shuffle(indices) - fixed_sites = [qubits[indices[ix]] for ix in range(nfix)] - bitstring = ''.join(np.random.choice(('0', '1'), nfix)) - fixed = dict(zip(fixed_sites, bitstring)) - if nsite_max is None: - yield fixed - else: - for nsite in range(1, nsite_max): - where = [qubits[indices[ix]] for ix in range(nfix, nfix+nsite)] - yield where, fixed + for _ in range(2): + np.random.shuffle(indices) + fixed_sites = [qubits[indices[ix]] for ix in range(nfix)] + bitstring = ''.join(np.random.choice(('0', '1'), nfix)) + fixed = dict(zip(fixed_sites, bitstring)) + if nsite_max is None: + yield fixed + else: + for nsite in range(1, nsite_max+1): + where = [qubits[indices[ix]] for ix in range(nfix, nfix+nsite)] + yield where, fixed def 
random_pauli_string_generator(n_qubits, num_strings=4): @@ -117,6 +126,19 @@ def get_cirq_random_circuit(n_qubits, n_moments, op_density=0.9, seed=3): for n_moments in N_MOMENTS_RANGE: cirq_circuits.append(get_cirq_random_circuit(n_qubits, n_moments)) +try: + from cuquantum_benchmarks.frontends.frontend_cirq import Cirq as cuqnt_cirq + from cuquantum_benchmarks.benchmarks import qpe, quantum_volume, qaoa + cirq_generators = [qpe.QPE, quantum_volume.QuantumVolume, qaoa.QAOA] + config = {'measure': True, 'unfold': True, 'p': 4} + for generator in cirq_generators: + for n_qubits in (5, 6): + seq = generator.generateGatesSequence(n_qubits, config) + circuit = cuqnt_cirq(n_qubits, config).generateCircuit(seq) + cirq_circuits.append(circuit) +except: + pass + ######################################################### # functions to generate qiskit.QuantumCircuit for testing @@ -167,14 +189,78 @@ def get_qiskit_nested_circuit(): return circ +def get_cc_unitary_gate(seed=None): + # random unitary two qubit gate + from qiskit.extensions import UnitaryGate + if seed is None: + seed = 1234 + rng = np.random.default_rng(seed) + m = rng.standard_normal(size=(4, 4)) + 1j*rng.standard_normal(size=(4, 4)) + q, r = np.linalg.qr(m) + d = np.diag(r) + q *= d/abs(d) + gate = UnitaryGate(q).control(2) + return gate + + +def get_qiskit_multi_control_circuit(): + qubits = qiskit.QuantumRegister(5) + circuit = qiskit.QuantumCircuit(qubits) + for q in qubits: + circuit.h(q) + qs = list(qubits) + # 3 layers of multi-controlled qubits + np.random.seed(0) + for i in range(2): + np.random.shuffle(qs) + ccu_gate = get_cc_unitary_gate(i) + circuit.append(ccu_gate, qs[:4]) + for q in qubits: + if i % 2 == 1: + circuit.h(q) + else: + circuit.x(q) + circuit.global_phase = 0.5 + circuit.p(0.1, qubits[0]) + return circuit + + if qiskit: - qiskit_circuits.append(get_qiskit_composite_circuit()) + circuit = get_qiskit_composite_circuit() + qiskit_circuits.append(circuit.copy()) + circuit.global_phase = 
0.5 + qiskit_circuits.append(circuit) qiskit_circuits.append(get_qiskit_nested_circuit()) + qiskit_circuits.append(get_qiskit_multi_control_circuit()) for n_qubits in N_QUBITS_RANGE: qiskit_circuits.append(get_qiskit_qft_circuit(n_qubits)) for depth in DEPTH_RANGE: qiskit_circuits.append(get_qiskit_random_circuit(n_qubits, depth)) +try: + from cuquantum_benchmarks.frontends.frontend_qiskit import Qiskit as cuqnt_qiskit + from cuquantum_benchmarks.benchmarks import qpe, quantum_volume, qaoa + qiskit_generators = [qpe.QPE, quantum_volume.QuantumVolume, qaoa.QAOA] + config = {'measure': True, 'unfold': True, 'p': 4} + for generator in qiskit_generators: + for n_qubits in (5, 6): + seq = generator.generateGatesSequence(n_qubits, config) + circuit = cuqnt_qiskit(n_qubits, config).generateCircuit(seq) + qiskit_circuits.append(circuit) +except: + pass + + +def compute_histogram_overlap(hist1, hist2, nshots): + # assuming hist1 & hist2 have the same sample size (=nshots) + overlap = 0 + for val, count in hist1.items(): + if val not in hist2: + continue + overlap += min(hist1[val], hist2[val]) + overlap /= nshots + return overlap + ################################################################### # @@ -187,17 +273,21 @@ def get_qiskit_nested_circuit(): ################################################################### class BaseTester: - def __init__(self, circuit, dtype, backend, nsample, nsite_max, nfix_max): + def __init__(self, circuit, dtype, backend, nsample, nsite_max, nfix_max, nshots=5000, seed=1024): self.circuit = circuit self.converter = CircuitToEinsum(circuit, dtype=dtype, backend=backend) self.backend = backend - self.qubits = self.converter.qubits + self.qubits = list(self.converter.qubits) self.n_qubits = self.converter.n_qubits self.dtype = dtype self.sv = None self.nsample = nsample self.nsite_max = max(1, min(nsite_max, self.n_qubits-1)) self.nfix_max = max(min(nfix_max, self.n_qubits-nsite_max-1), 0) + self.nshots = nshots + self.seed = seed + 
self.state_purity = cutn.StatePurity.PURE + self.state_prepared = False def get_state_vector_from_simulator(self): if self.sv is None: @@ -268,20 +358,173 @@ def get_expectation_from_sv(self, pauli_string): def _get_state_vector_from_simulator(self): raise NotImplementedError + + def _get_sampling_from_simulator(self, qubits_to_sample=None, seed=None): + raise NotImplementedError + + def get_sampling_from_sv(self, qubits_to_sample=None, seed=None): + sv = self.get_state_vector_from_simulator() + p = abs(sv) ** 2 + # convert p to double type in case probs does not add up to 1 + if self.backend is np: + p = p.astype('float64') + elif self.backend is cp: + p = cp.asnumpy(p).astype('float64') + elif self.backend is torch: + if p.device.type == 'cpu': + p = p.numpy().astype('float64') + else: + p = p.cpu().numpy().astype('float64') + if qubits_to_sample is not None: + sorted_qubits_to_sample = [q for q in self.qubits if q in qubits_to_sample] + axis = [i for (i, q) in enumerate(self.qubits) if q not in qubits_to_sample] + if axis: + p = p.sum(tuple(axis)) + # potential transpose to match the order of qubits_to_sample + transpose_order = [sorted_qubits_to_sample.index(q) for q in qubits_to_sample] + p = p.transpose(*transpose_order) + # normalize + p /= p.sum() + if seed is not None: + np.random.seed(seed) + samples = np.random.choice(np.arange(p.size), p=p.flat, size=self.nshots) + hist_sv = np.unique(samples, return_counts=True) + return dict(zip(*hist_sv)) + + def maybe_prepare_state(self): + if not self.state_prepared: + if not hasattr(self, 'state'): + raise RuntimeError("state not initialized") + if self.backend is not cp: + raise RuntimeError("This func is only expected to be executed for cupy backend") + gates = self.converter.gates + immutable = 0 + adjoint = 0 + unitary = 1 # assuming all gates unitary + self.operands = [] + for (operand, qubits) in gates: + n_state_modes = len(qubits) + state_modes = [self.qubits.index(q) for q in qubits] + # keep operand 
alive otherwise cupy will re-use the memory space + operand = operand.T.astype(operand.dtype, order=np.random.choice(['C', 'F'])) + self.operands.append(operand) + tensor_mode_strides = [stride_in_bytes//operand.itemsize for stride_in_bytes in operand.strides] + update_tensor = np.random.choice([True, False], p=[0.1, 0.9]) + if update_tensor: + tmp = cp.empty_like(operand) + tensor_id = cutn.state_apply_tensor(self.handle, self.state, n_state_modes, + state_modes, tmp.data.ptr, tensor_mode_strides, + immutable, adjoint, unitary) + cutn.state_update_tensor(self.handle, self.state, tensor_id, operand.data.ptr, unitary) + else: + cutn.state_apply_tensor(self.handle, self.state, n_state_modes, + state_modes, operand.data.ptr, tensor_mode_strides, + immutable, adjoint, unitary) + self.state_prepared = True + + def _run_cutensornet_sampling_marginal(self, task, create_args, execute_args, stream): + self.maybe_prepare_state() + if task == 'marginal': + create_func = cutn.create_marginal + configure_func = cutn.marginal_configure + hyper_sample_attr = cutn.MarginalAttribute.OPT_NUM_HYPER_SAMPLES + num_hyper_samples_dtype = cutn.marginal_get_attribute_dtype(hyper_sample_attr) + prepare_func = cutn.marginal_prepare + execute_func = cutn.marginal_compute + destroy_func = cutn.destroy_marginal + elif task == 'sampler': + create_func = cutn.create_sampler + configure_func = cutn.sampler_configure + hyper_sample_attr = cutn.SamplerAttribute.OPT_NUM_HYPER_SAMPLES + num_hyper_samples_dtype = cutn.sampler_get_attribute_dtype(hyper_sample_attr) + prepare_func = cutn.sampler_prepare + execute_func = cutn.sampler_sample + destroy_func = cutn.destroy_sampler + else: + raise ValueError("only supports marginal and sampler") + + dev = cp.cuda.Device() + free_mem = dev.mem_info[0] + scratch_size = free_mem // 2 # maximal usage of 50% device memory + + task_obj = create_func(self.handle, self.state, *create_args) + num_hyper_samples = np.asarray(8, dtype=num_hyper_samples_dtype) + 
configure_func(self.handle, task_obj, hyper_sample_attr, + num_hyper_samples.ctypes.data, num_hyper_samples.dtype.itemsize) + prepare_func(self.handle, task_obj, scratch_size, self.workspace, stream.ptr) # similar args for marginal and sampler + workspace_size_d = cutn.workspace_get_memory_size(self.handle, + self.workspace, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) + + if workspace_size_d >= scratch_size: + destroy_func(task_obj) + return None + + scratch_space = cp.cuda.alloc(workspace_size_d) + cutn.workspace_set_memory(self.handle, + self.workspace, cutn.Memspace.DEVICE, + cutn.WorkspaceKind.SCRATCH, scratch_space.ptr, workspace_size_d) + + execute_func(self.handle, task_obj, *execute_args, stream.ptr) + stream.synchronize() + destroy_func(task_obj) + return True + + def get_reduced_density_matrix_from_cutn(self, where, fixed=EMPTY_DICT): + n_marginal_modes = len(where) + marginal_modes = [self.qubits.index(q) for q in where] + if fixed: + n_projected_modes = len(fixed) + projected_modes = [] + projected_mode_values = [] + for q, bit in fixed.items(): + projected_modes.append(self.qubits.index(q)) + projected_mode_values.append(int(bit)) + else: + n_projected_modes = projected_modes = projected_mode_values = 0 + + rdm = cp.empty((2,2)*n_marginal_modes, dtype=self.dtype, order=np.random.choice(['C', 'F'])) + rdm_strides = [s // rdm.itemsize for s in rdm.strides] + stream = cp.cuda.get_current_stream() + + create_args = (n_marginal_modes, marginal_modes, n_projected_modes, projected_modes, rdm_strides) + execute_args = (projected_mode_values, self.workspace, rdm.data.ptr) + if self._run_cutensornet_sampling_marginal('marginal', create_args, execute_args, stream): + return rdm + else: + return None + + def get_sampling_from_cutensornet(self, qubits_to_sample=None, seed=None): + if qubits_to_sample is None: + qubits_to_sample = self.qubits + n_modes_to_sample = len(qubits_to_sample) + modes_to_sample = [self.qubits.index(q) 
for q in qubits_to_sample] + samples = np.empty((self.nshots, n_modes_to_sample), dtype='int64', order='C') # equivalent to (n_modes, nshots) in F order + stream = cp.cuda.get_current_stream() + + create_args = (n_modes_to_sample, modes_to_sample) + execute_args = (self.nshots, self.workspace, samples.ctypes.data) + if self._run_cutensornet_sampling_marginal('sampler', create_args, execute_args, stream): + sampling = {} + for bitstring, n_sampling in zip(*np.unique(samples, axis=0, return_counts=True)): + bitstring = np.array2string(bitstring, separator='')[1:-1] + sampling[int(bitstring, 2)] = n_sampling + return sampling + else: + return None def test_qubits(self): - assert len(self.qubits) == self.num_qubits + assert len(self.qubits) == self.n_qubits def test_gates(self): for (gate_operand, qubits) in self.converter.gates: assert gate_operand.ndim == len(qubits) * 2 - assert infer_object_package(gate_operand) is self.backend + assert infer_object_package(gate_operand) == self.backend.__name__ def test_state_vector(self): expression, operands = self.converter.state_vector() sv1 = contract(expression, *operands) sv2 = self.get_state_vector_from_simulator() - self.backend.allclose( + assert self.backend.allclose( sv1, sv2, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) def test_amplitude(self): @@ -289,7 +532,7 @@ def test_amplitude(self): expression, operands = self.converter.amplitude(bitstring) amp1 = contract(expression, *operands) amp2 = self.get_amplitude_from_simulator(bitstring) - self.backend.allclose( + assert self.backend.allclose( amp1, amp2, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) def test_batched_amplitudes(self): @@ -297,72 +540,151 @@ def test_batched_amplitudes(self): expression, operands = self.converter.batched_amplitudes(fixed) batched_amps1 = contract(expression, *operands) batched_amps2 = self.get_batched_amplitudes_from_simulator(fixed) - self.backend.allclose( + assert self.backend.allclose( batched_amps1, 
batched_amps2, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) def test_reduced_density_matrix(self): for where, fixed in where_fixed_generator(self.qubits, self.nfix_max, nsite_max=self.nsite_max): expression1, operands1 = self.converter.reduced_density_matrix(where, fixed=fixed, lightcone=True) expression2, operands2 = self.converter.reduced_density_matrix(where, fixed=fixed, lightcone=False) - assert len(operands1) <= len(operands2) + assert len(operands1) <= len(operands2) + 2 # potential phase handling for qiskit Circuit rdm1 = contract(expression1, *operands1) rdm2 = contract(expression2, *operands2) rdm3 = self.get_reduced_density_matrix_from_simulator(where, fixed=fixed) - self.backend.allclose( + assert self.backend.allclose( rdm1, rdm2, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) - self.backend.allclose( + assert self.backend.allclose( rdm1, rdm3, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) + + if self.backend is cp: + rdm4 = self.get_reduced_density_matrix_from_cutn(where, fixed=fixed) + if rdm4 is not None: + assert self.backend.allclose( + rdm1, rdm4, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) def test_expectation(self): for pauli_string in random_pauli_string_generator(self.n_qubits, 2): expression1, operands1 = self.converter.expectation(pauli_string, lightcone=True) expression2, operands2 = self.converter.expectation(pauli_string, lightcone=False) - assert len(operands1) <= len(operands2) + assert len(operands1) <= len(operands2) + 2 # potential phase handling for qiskit Circuit expec1 = contract(expression1, *operands1) expec2 = contract(expression2, *operands2) expec3 = self.get_expectation_from_sv(pauli_string) - self.backend.allclose( + assert self.backend.allclose( expec1, expec2, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) - self.backend.allclose( + assert self.backend.allclose( expec1, expec3, atol=atol_mapper[self.dtype], rtol=rtol_mapper[self.dtype]) + + def 
test_sampling(self): + full_qubits = list(self.qubits) + np.random.shuffle(full_qubits) + selected_qubits = full_qubits[:len(full_qubits)//2] + + for qubits_to_sample in (None, selected_qubits): + seed = self.seed + nshots = self.nshots + max_try = 3 + overlap_best = 0. + + for counter in range(1, max_try+1): + # build a histogram for the reference impl + hist_ref = self._get_sampling_from_simulator(qubits_to_sample=qubits_to_sample, seed=seed) + + # do the same for cutensornet sampling + hist_cutn = self.get_sampling_from_cutensornet(qubits_to_sample=qubits_to_sample, seed=seed) + + # compute overlap of the histograms (cutn vs ref) + overlap = compute_histogram_overlap(hist_cutn, hist_ref, self.nshots) + if overlap > overlap_best: + overlap_best = overlap + else: + print("WARNING: overlap not improving as nshots increases!") + + # do the same for sampling from the (exactly computed) SV + hist_sv = self.get_sampling_from_sv(qubits_to_sample=qubits_to_sample, seed=seed) + + # compute overlap of the histograms (sv vs ref) + overlap_check = compute_histogram_overlap(hist_sv, hist_ref, self.nshots) + print(f"with nshots = {self.nshots}, {overlap_best = }, {overlap_check = }") + + # to reduce test time we set 95% here, but 99% will also work + if np.round(overlap, decimals=2) < 0.95: + self.nshots *= 10 + print(f"retry with nshots = {self.nshots} ...") + else: + self.nshots = nshots # restore + break + else: + self.nshots = nshots # restore + assert False, f"{overlap_best=} after {counter} retries..." 
+ @manage_resource("handle") + @manage_resource("state") + @manage_resource("workspace") def run_tests(self): self.test_state_vector() self.test_amplitude() self.test_batched_amplitudes() self.test_reduced_density_matrix() self.test_expectation() + self.test_gates() + self.test_qubits() + if self.backend is cp: + # sampling only needed to be tested for cupy backend + self.test_sampling() class CirqTester(BaseTester): def _get_state_vector_from_simulator(self): qubits = self.qubits simulator = cirq.Simulator(dtype=self.dtype) - result = simulator.simulate(self.circuit, qubit_order=qubits) + circuit = circuit_parser_utils_cirq.remove_measurements(self.circuit) + result = simulator.simulate(circuit, qubit_order=qubits) statevector = result.state_vector().reshape((2,)*self.n_qubits) if self.backend is torch: statevector = torch.as_tensor(statevector, dtype=getattr(torch, self.dtype), device='cuda') else: statevector = self.backend.asarray(statevector, dtype=self.dtype) return statevector + + def _get_sampling_from_simulator(self, qubits_to_sample=None, seed=None): + if qubits_to_sample is None: + qubits_to_sample = list(self.qubits) + circuit = circuit_parser_utils_cirq.remove_measurements(self.circuit) + circuit.append(cirq.measure_each(qubits_to_sample)) + circuit.append(cirq.measure(*qubits_to_sample, key='meas')) + result = cirq.sample( + circuit, repetitions=self.nshots, seed=seed, dtype=getattr(np, self.dtype)) + result = result.histogram(key='meas') + sampling = {} + nsamples = 0 + for bitstring, nsample in result.items(): + sampling[int(bitstring)] = nsample + nsamples += nsample + assert nsamples == self.nshots + return sampling class QiskitTester(BaseTester): - def _get_state_vector_from_simulator(self): - # requires qiskit >= 0.24.0 + def _get_precision(self): precision = {'complex64': 'single', 'complex128': 'double'}[self.dtype] + return precision + + def _get_state_vector_from_simulator(self): + # requires qiskit >= 0.24.0 + precision = 
self._get_precision() + circuit = circuit_parser_utils_qiskit.remove_measurements(self.circuit) try: # for qiskit >= 0.25.0 simulator = qiskit.Aer.get_backend('aer_simulator_statevector', precision=precision) - circuit = qiskit.transpile(self.circuit, simulator) + circuit = qiskit.transpile(circuit, simulator) circuit.save_statevector() result = simulator.run(circuit).result() except: # for qiskit 0.24.* - circuit = self.circuit simulator = qiskit.Aer.get_backend('statevector_simulator', precision=precision) result = qiskit.execute(circuit, simulator).result() sv = np.asarray(result.get_statevector()).reshape((2,)*circuit.num_qubits) @@ -374,3 +696,24 @@ def _get_state_vector_from_simulator(self): else: sv = self.backend.asarray(sv, dtype=self.dtype) return sv + + def _get_sampling_from_simulator(self, qubits_to_sample=None, seed=None): + if qubits_to_sample is None: + qubits_to_sample = list(self.qubits) + circuit = self.circuit.remove_final_measurements(inplace=False) + new_creg = circuit._create_creg(len(qubits_to_sample), "meas") + circuit.add_register(new_creg) + circuit.measure(qubits_to_sample, new_creg) + precision = self._get_precision() + backend = qiskit.Aer.get_backend('qasm_simulator', precision=precision) + result = backend.run(qiskit.transpile(circuit, backend), shots=self.nshots, seed=seed).result() + counts = result.get_counts(circuit) + sampling = {} + nsamples = 0 + for bitstring, nsample in counts.items(): + # little endian from qiskit + value = int(bitstring[::-1], 2) + sampling[value] = nsample + nsamples += nsample + assert nsamples == self.nshots + return sampling diff --git a/python/tests/cuquantum_tests/cutensornet_tests/data.py b/python/tests/cuquantum_tests/cutensornet_tests/data.py index 7cab507..2ecee3c 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/data.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/data.py @@ -54,7 +54,7 @@ ["a,b,c->abc", {}, {}, "float64"], ["acdf,jbje,gihb,hfac", {}, {}, "float64"], 
["acdf,jbje,gihb,hfac,gfac,gifabc,hfac", {}, {}, "float64"], - ["chd,bde,agbc,hiad,bdi,cgh,agdb", {}, {}, "float64"], + ["chd,bde,agbc,hiad,bdi,cgh,agdb", {"blocking": "auto"}, {}, "float64"], ["eb,cb,fb->cef", {}, {}, "float64"], ["dd,fb,be,cdb->cef", {}, {}, "float64"], ["bca,cdb,dbf,afc->", {}, {}, "float64"], @@ -62,8 +62,9 @@ ["a,ac,ab,ad,cd,bd,bc->", {}, {}, "float64"], ) + # the expression here should be -# - a list [decomposition_expression, input_tensor_shapes as a list of tuple] +# - a sequence of [decomposition_expression, input_tensor_shapes as a list of tuple] tensor_decomp_expressions = ( ('ab->ax,xb', [(8, 8)]), ('ab->ax,bx', [(8, 8)]), @@ -79,19 +80,21 @@ ('ab->xa,bx', [(8, 6)]), ('abcd->cxa,bdx', [(2, 3, 4, 5)]), ('abcd->cax,bdx', [(2, 3, 4, 5)]), - ('mnijk->jny,kmyi', [(2, 9, 3, 3, 4)]) + ('mnijk->jny,kmyi', [(2, 9, 3, 3, 4)]), ) + # the expression here should be -# - a list [gate_decomposition_expression, input_tensor_shapes as a list of tuple] +# - a sequence of [gate_decomposition_expression, input_tensor_shapes as a list of tuple] gate_decomp_expressions = ( - ('ijk,klm,jlpq->ipk,kqm', [(4, 2, 4), (4, 2, 4), (2, 2, 2, 2)]), - ('ijk,klm,jlpq->kpi,qmk', [(4, 2, 4), (4, 2, 4), (2, 2, 2, 2)]), - ('ijk,klm,jlpq->pki,mkq', [(4, 2, 4), (4, 2, 4), (2, 2, 2, 2)]), + ('ijk,klm,jlpq->ipk,kqm', [(2, 2, 2), (2, 2, 2), (2, 2, 2, 2)]), + ('ijk,klm,jlpq->kpi,qmk', [(2, 2, 2), (2, 2, 2), (2, 2, 2, 2)]), + ('ijk,klm,jlpq->pki,mkq', [(2, 2, 2), (2, 2, 2), (2, 2, 2, 2)]), ('sOD,DdNr,ROrsq->KR,qKdN', [(2, 4, 2), (2, 3, 4, 2), (5, 4, 2, 2, 2)]), ('beQ,cey,cbJj->Je,jQey', [(3, 5, 4), (2, 5, 7), (2, 3, 4, 4)]) ) + # the expression here can be # - a string as a standard contract and decompose expression # - a list of [contract decompose expression, network options, optimize options, kwargs] @@ -101,7 +104,7 @@ 'ijk,klm,jlpq->ipk,kqm', 'abcd,cdef->axb,fex', 'abcd,cdef->axf,bex', - 'sOD,DdNr,ROrsq->KR,qKdN', + ['sOD,DdNr,ROrsq->KR,qKdN', {'blocking': 'auto'}, {}, {}], 
'beQ,cey,cbJj->Je,jQey', 'ijlm,jqr,lqsn->imx,xrsn', ['ijk,klm,jlpq->ipk,kqm', {}, {}, {'return_info': False}], diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py b/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py index 8e39f8b..b2e4291 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py @@ -6,15 +6,21 @@ import functools import os -import cupy +import cupy as cp from cupy import testing -import numpy +import numpy as np try: import mpi4py from mpi4py import MPI # init! except ImportError: mpi4py = MPI = None import pytest +try: + import torch + # unlike in other test modules, we don't check torch.cuda.is_available() + # here because we allow verifying against PyTorch CPU tensors +except: + torch = None import cuquantum from cuquantum import ComputeType, cudaDataType @@ -25,7 +31,7 @@ from . import approxTN_utils from .data import gate_decomp_expressions, tensor_decomp_expressions -from .test_utils import atol_mapper, rtol_mapper +from .test_utils import atol_mapper, get_stream_for_backend, rtol_mapper from .. 
import (_can_use_cffi, dtype_to_compute_type, dtype_to_data_type, MemHandlerTestBase, MemoryResourceFactory, LoggerTestBase) @@ -43,18 +49,21 @@ def manage_resource(name): def decorator(impl): @functools.wraps(impl) def test_func(self, *args, **kwargs): + # "self" refers to the test case try: if name == 'handle': h = cutn.create() elif name == 'dscr': - tn, dtype, input_form, output_form = self.tn, self.dtype, self.input_form, self.output_form + tn, dtype, input_form = self.tn, self.dtype, self.input_form einsum, shapes = tn # unpack - tn = TensorNetworkFactory(einsum, shapes, dtype) + tn = TensorNetworkFactory(einsum, shapes, dtype, order=self.order) i_n_inputs, i_n_modes, i_extents, i_strides, i_modes = \ tn.get_input_metadata(**input_form) o_n_modes, o_extents, o_strides, o_modes = \ - tn.get_output_metadata(**output_form) - i_qualifiers = numpy.zeros(i_n_inputs, dtype=cutn.tensor_qualifiers_dtype) + tn.get_output_metadata(**input_form) + i_qualifiers = np.zeros(i_n_inputs, dtype=cutn.tensor_qualifiers_dtype) + if self.qual is not None: + i_qualifiers['requires_gradient'][:] = True h = cutn.create_network_descriptor( self.handle, i_n_inputs, i_n_modes, i_extents, i_strides, i_modes, i_qualifiers, @@ -94,6 +103,9 @@ def test_func(self, *args, **kwargs): # we use this version to avoid creating a sequence; another API # is tested elsewhere h = cutn.create_slice_group_from_id_range(self.handle, 0, 1, 1) + elif name == 'state': + dtype = dtype_to_data_type[getattr(np, self.dtype)] + h = cutn.create_state(self.handle, self.state_purity, self.n_qubits, (2,)*self.n_qubits, dtype) else: assert False, f'name "{name}" not recognized' setattr(self, name, h) @@ -133,6 +145,8 @@ def test_func(self, *args, **kwargs): elif name == 'slice_group': h = cutn.destroy_slice_group(self.slice_group) del self.slice_group + elif name == 'state': + h = cutn.destroy_state(self.state) return test_func return decorator @@ -168,7 +182,7 @@ class TensorNetworkFactory: # This factory CANNOT be 
reused; once a TN descriptor uses it, it must # be discarded. - def __init__(self, einsum, shapes, dtype): + def __init__(self, einsum, shapes, dtype, *, order='C'): self.einsum = einsum inputs, output = einsum.split('->') if "->" in einsum else (einsum, None) i_shapes, o_shape = shapes[:-1], shapes[-1] @@ -177,30 +191,34 @@ def __init__(self, einsum, shapes, dtype): assert len(output) == len(o_shape) # xp strides in bytes, cutn strides in counts - itemsize = cupy.dtype(dtype).itemsize + itemsize = cp.dtype(dtype).itemsize self.input_tensors = [ - testing.shaped_random(s, cupy, dtype) for s in i_shapes] + testing.shaped_random(s, cp, dtype, seed=i, order=order) + for i, s in enumerate(i_shapes)] self.input_n_modes = [len(i) for i in inputs] self.input_extents = i_shapes - self.input_strides = [[stride // itemsize for stride in arr.strides] for arr in self.input_tensors] + self.input_strides = [[stride // itemsize for stride in arr.strides] + for arr in self.input_tensors] self.input_modes = [tuple([ord(m) for m in i]) for i in inputs] - self.output_tensor = cupy.empty(o_shape, dtype=dtype) + self.output_tensor = cp.empty(o_shape, dtype=dtype, order=order) self.output_n_modes = len(o_shape) self.output_extent = o_shape self.output_stride = [stride // itemsize for stride in self.output_tensor.strides] self.output_mode = tuple([ord(m) for m in output]) + self.gradients = None + def _get_data_type(self, category): if 'n_modes' in category: - return numpy.int32 + return np.int32 elif 'extent' in category: - return numpy.int64 + return np.int64 elif 'stride' in category: - return numpy.int64 + return np.int64 elif 'mode' in category: - return numpy.int32 + return np.int32 elif 'tensor' in category: return None # unused else: @@ -213,38 +231,38 @@ def _return_data(self, category, return_value): if len(data) == 0: # empty, give it a NULL return 0 - elif category == 'input_tensors': + elif category in ('input_tensors', 'gradients'): # special case for device arrays, return 
int as void** - data = numpy.asarray([d.data.ptr for d in data], - dtype=numpy.intp) + data = np.asarray([d.data.ptr for d in data], + dtype=np.intp) setattr(self, f'{category}_ptrs', data) # keep data alive # some data are not nested in nature, so we peek at the first # element to determine elif isinstance(data[0], abc.Sequence): # return int as void** - data = [numpy.asarray(d, dtype=self._get_data_type(category)) + data = [np.asarray(d, dtype=self._get_data_type(category)) for d in data] setattr(self, category, data) # keep data alive - data = numpy.asarray([d.ctypes.data for d in data], - dtype=numpy.intp) + data = np.asarray([d.ctypes.data for d in data], + dtype=np.intp) setattr(self, f'{category}_ptrs', data) # keep data alive else: # return int as void* - data = numpy.asarray(data, dtype=self._get_data_type(category)) + data = np.asarray(data, dtype=self._get_data_type(category)) setattr(self, category, data) # keep data alive return data.ctypes.data elif return_value == 'seq': if len(data) == 0: # empty, leave it as is pass - elif category == 'input_tensors': + elif category in ('input_tensors', 'gradients'): # special case for device arrays data = [d.data.ptr for d in data] setattr(self, f'{category}_ptrs', data) # keep data alive # some data are not nested in nature, so we peek at the first # element to determine elif isinstance(data[0], abc.Sequence): - data = [numpy.asarray(d, dtype=self._get_data_type(category)) + data = [np.asarray(d, dtype=self._get_data_type(category)) for d in data] setattr(self, category, data) # keep data alive else: @@ -278,6 +296,14 @@ def get_input_tensors(self, **kwargs): def get_output_tensor(self): return self.output_tensor.data.ptr + def get_gradient_tensors(self, **kwargs): + if self.gradients is None: + # as of 23.06, the gradient tensors' strides follow those of the + # input tensors + self.gradients = [cp.empty_like(arr) for arr in self.input_tensors] + data = self._return_data('gradients', kwargs['data']) + return 
data + @testing.parameterize(*testing.product({ 'tn': ( @@ -287,8 +313,9 @@ def get_output_tensor(self): ('ab,bc,cd->ad', [(2, 3), (3, 1), (1, 5), (2, 5)]), ), 'dtype': ( - numpy.float32, numpy.float64, numpy.complex64, numpy.complex128 + np.float32, np.float64, np.complex64, np.complex128 ), + # use the same format for both input/output tensors 'input_form': ( {'n_modes': 'int', 'extent': 'int', 'stride': 'int', 'mode': 'int', 'data': 'int'}, @@ -297,10 +324,9 @@ def get_output_tensor(self): {'n_modes': 'seq', 'extent': 'nested_seq', 'stride': 'nested_seq', 'mode': 'seq', 'data': 'seq'}, ), - 'output_form': ( - {'extent': 'int', 'stride': 'int', 'mode': 'int'}, - {'extent': 'seq', 'stride': 'seq', 'mode': 'seq'}, - ) + 'order': ('C', 'F'), + # mainly for gradient tests + 'qual': (None, True), })) class TestTensorNetworkBase: @@ -330,9 +356,9 @@ def test_descriptor_create_destroy(self, API): handle, tensor_dscr) assert num_modes == self.tn.output_n_modes - assert (modes == numpy.asarray(self.tn.output_mode, dtype=numpy.int32)).all() - assert (extents == numpy.asarray(self.tn.output_extent, dtype=numpy.int64)).all() - assert (strides == numpy.asarray(self.tn.output_stride, dtype=numpy.int64)).all() + assert (modes == np.asarray(self.tn.output_mode, dtype=np.int32)).all() + assert (extents == np.asarray(self.tn.output_extent, dtype=np.int64)).all() + assert (strides == np.asarray(self.tn.output_stride, dtype=np.int64)).all() if API == 'new': cutn.destroy_tensor_descriptor(tensor_dscr) @@ -346,14 +372,14 @@ def _get_path(self, handle, info): def _set_path(self, handle, info, path): attr = cutn.ContractionOptimizerInfoAttribute.PATH dtype = cutn.contraction_optimizer_info_get_attribute_dtype(attr) - if not isinstance(path, numpy.ndarray): - path = numpy.ascontiguousarray(path, dtype=numpy.int32) - path_obj = numpy.asarray((path.shape[0], path.ctypes.data), dtype=dtype) + if not isinstance(path, np.ndarray): + path = np.ascontiguousarray(path, dtype=np.int32) + path_obj 
= np.asarray((path.shape[0], path.ctypes.data), dtype=dtype) self._set_scalar_attr(handle, info, attr, path_obj) def _get_scalar_attr(self, handle, info, attr): dtype = cutn.contraction_optimizer_info_get_attribute_dtype(attr) - data = numpy.empty((1,), dtype=dtype) + data = np.empty((1,), dtype=dtype) cutn.contraction_optimizer_info_get_attribute( handle, info, attr, data.ctypes.data, data.dtype.itemsize) @@ -361,8 +387,8 @@ def _get_scalar_attr(self, handle, info, attr): def _set_scalar_attr(self, handle, info, attr, data): dtype = cutn.contraction_optimizer_info_get_attribute_dtype(attr) - if not isinstance(data, numpy.ndarray): - data = numpy.ascontiguousarray(data, dtype=dtype) + if not isinstance(data, np.ndarray): + data = np.ascontiguousarray(data, dtype=dtype) cutn.contraction_optimizer_info_set_attribute( handle, info, attr, data.ctypes.data, data.dtype.itemsize) @@ -421,9 +447,9 @@ def test_optimizer_info_packing_unpacking(self, buffer_form): dtype = cutn.contraction_optimizer_info_get_attribute_dtype(attr) # compute a valid path for the problem - path, _ = numpy.einsum_path( + path, _ = np.einsum_path( tn.einsum, - *[arr for arr in map(lambda a: numpy.broadcast_to(0, a.shape), + *[arr for arr in map(lambda a: np.broadcast_to(0, a.shape), tn.input_tensors)]) # set the path in info (a few other attributes would be computed too) @@ -431,7 +457,7 @@ def test_optimizer_info_packing_unpacking(self, buffer_form): self._set_path(handle, info, path[1:]) buf_size = cutn.contraction_optimizer_info_get_packed_size( handle, info) - buf_data = numpy.empty((buf_size,), dtype=numpy.int8) + buf_data = np.empty((buf_size,), dtype=np.int8) if buffer_form == "int": buf = buf_data.ctypes.data else: # buffer_form == "buf" @@ -481,15 +507,16 @@ def test_optimizer_config_get_set_attribute(self, attr): cutn.ContractionOptimizerConfigAttribute.SLICER_MEMORY_MODEL, cutn.ContractionOptimizerConfigAttribute.SLICER_DISABLE_SLICING, 
cutn.ContractionOptimizerConfigAttribute.SIMPLIFICATION_DISABLE_DR, - cutn.ContractionOptimizerConfigAttribute.COST_FUNCTION_OBJECTIVE): - factor = numpy.asarray([1], dtype=dtype) + cutn.ContractionOptimizerConfigAttribute.COST_FUNCTION_OBJECTIVE, + cutn.ContractionOptimizerConfigAttribute.SMART_OPTION): + factor = np.asarray([1], dtype=dtype) else: - factor = numpy.asarray([30], dtype=dtype) + factor = np.asarray([30], dtype=dtype) cutn.contraction_optimizer_config_set_attribute( handle, config, attr, factor.ctypes.data, factor.dtype.itemsize) # do a round-trip test as a sanity check - factor2 = numpy.zeros_like(factor) + factor2 = np.zeros_like(factor) cutn.contraction_optimizer_config_get_attribute( handle, config, attr, factor2.ctypes.data, factor2.dtype.itemsize) @@ -513,12 +540,12 @@ def test_autotune_preference_get_set_attribute(self, attr): handle, pref = self.handle, self.autotune dtype = cutn.contraction_autotune_preference_get_attribute_dtype(attr) # Hack: assume this is a valid value for all attrs - factor = numpy.asarray([2], dtype=dtype) + factor = np.asarray([2], dtype=dtype) cutn.contraction_autotune_preference_set_attribute( handle, pref, attr, factor.ctypes.data, factor.dtype.itemsize) # do a round-trip test as a sanity check - factor2 = numpy.zeros_like(factor) + factor2 = np.zeros_like(factor) cutn.contraction_autotune_preference_get_attribute( handle, pref, attr, factor2.ctypes.data, factor2.dtype.itemsize) @@ -535,10 +562,10 @@ def test_autotune_preference_get_set_attribute(self, attr): 'autotune', (True, False) ) @pytest.mark.parametrize( - 'contract', (False, "legacy", "slice_group") + 'contract', ("legacy", "slice_group", "gradient") ) @pytest.mark.parametrize( - 'stream', (cupy.cuda.Stream.null, cupy.cuda.Stream(non_blocking=True)) + 'stream', (cp.cuda.Stream.null, get_stream_for_backend(cp)) ) class TestContraction(TestTensorNetworkBase): @@ -551,7 +578,7 @@ class TestContraction(TestTensorNetworkBase): @manage_resource('autotune') 
@manage_resource('workspace') @manage_resource('slice_group') - def test_contraction_workflow( + def test_contraction_gradient_workflow( self, mempool, workspace_pref, autotune, contract, stream): if (isinstance(mempool, str) and mempool.startswith('cffi') and not _can_use_cffi()): @@ -560,54 +587,87 @@ def test_contraction_workflow( # unpack handle, dscr, info, config, pref = self.handle, self.dscr, self.info, self.config, self.autotune workspace = self.workspace - tn, input_form, output_form = self.tn, self.input_form, self.output_form + tn, input_form = self.tn, self.input_form + + # make sure inputs are ready + # TODO: use stream_wait_event to establish stream order is better + cp.cuda.Device().synchronize() if mempool: mr = MemoryResourceFactory(mempool) handler = mr.get_dev_mem_handler() cutn.set_device_mem_handler(handle, handler) - workspace_size = 32*1024**2 # large enough for our test cases + workspace_hint = 32*1024**2 # large enough for our test cases # we have to run this API in any case in order to create a path cutn.contraction_optimize( - handle, dscr, config, workspace_size, info) + handle, dscr, config, workspace_hint, info) + + # for simplicity, compute grads for all tensors + if contract == "gradient": + if self.qual is None: + # set up the grad flag via TN attributes instead of input qualifiers + tensor_id_range = np.arange(len(tn.input_tensors), dtype=np.int32) + net_attr_dtype = cutn.network_get_attribute_dtype( + cutn.NetworkAttribute.INPUT_TENSORS_REQUIRE_GRAD) + tensor_ids = np.zeros(1, dtype=net_attr_dtype) + tensor_ids['num_tensors'] = tensor_id_range.size + tensor_ids['data'] = tensor_id_range.ctypes.data + cutn.network_set_attribute( + handle, dscr, cutn.NetworkAttribute.INPUT_TENSORS_REQUIRE_GRAD, + tensor_ids.ctypes.data, tensor_ids.dtype.itemsize) + # round-trip + tensor_id_range_back = np.zeros_like(tensor_id_range) + tensor_ids['num_tensors'] = tensor_id_range_back.size + tensor_ids['data'] = tensor_id_range_back.ctypes.data + 
cutn.network_get_attribute( + handle, dscr, cutn.NetworkAttribute.INPUT_TENSORS_REQUIRE_GRAD, + tensor_ids.ctypes.data, tensor_ids.dtype.itemsize) + assert (tensor_id_range_back == tensor_id_range).all() + + output_grads = cp.ones_like(tn.output_tensor) # manage workspace + placeholder = [] if mempool is None: - cutn.workspace_compute_sizes(handle, dscr, info, workspace) - required_size_deprecated = cutn.workspace_get_memory_size( - handle, workspace, - getattr(cutn.WorksizePref, f"{workspace_pref.upper()}"), - cutn.Memspace.DEVICE, # TODO: parametrize memspace? - cutn.WorkspaceKind.SCRATCH) cutn.workspace_compute_contraction_sizes(handle, dscr, info, workspace) - required_size = cutn.workspace_get_memory_size( - handle, workspace, - getattr(cutn.WorksizePref, f"{workspace_pref.upper()}"), - cutn.Memspace.DEVICE, # TODO: parametrize memspace? - cutn.WorkspaceKind.SCRATCH) - assert required_size == required_size_deprecated - if workspace_size < required_size: - assert False, \ - f"wrong assumption on the workspace size " \ - f"(given: {workspace_size}, needed: {required_size})" - workspace_ptr = cupy.cuda.alloc(workspace_size) - cutn.workspace_set_memory( - handle, workspace, - cutn.Memspace.DEVICE, - cutn.WorkspaceKind.SCRATCH, - workspace_ptr.ptr, workspace_size) - # round-trip check - assert (workspace_ptr.ptr, workspace_size) == cutn.workspace_get_memory( - handle, workspace, - cutn.Memspace.DEVICE, - cutn.WorkspaceKind.SCRATCH) + for kind in cutn.WorkspaceKind: # for both scratch & cache + required_size = cutn.workspace_get_memory_size( + handle, workspace, + getattr(cutn.WorksizePref, f"{workspace_pref.upper()}"), + cutn.Memspace.DEVICE, # TODO: parametrize memspace? + kind) + if contract != "gradient": + cutn.workspace_compute_sizes(handle, dscr, info, workspace) + required_size_deprecated = cutn.workspace_get_memory_size( + handle, workspace, + getattr(cutn.WorksizePref, f"{workspace_pref.upper()}"), + cutn.Memspace.DEVICE, # TODO: parametrize memspace? 
+ kind) + assert required_size == required_size_deprecated + if workspace_hint < required_size: + assert False, \ + f"wrong assumption on the workspace size " \ + f"(given: {workspace_hint}, needed: {required_size})" + if required_size > 0: + workspace_ptr = cp.cuda.alloc(required_size) + cutn.workspace_set_memory( + handle, workspace, + cutn.Memspace.DEVICE, + kind, + workspace_ptr.ptr, required_size) + placeholder.append(workspace_ptr) # keep it alive + # round-trip check + assert ((workspace_ptr.ptr, required_size) == + cutn.workspace_get_memory(handle, workspace, + cutn.Memspace.DEVICE, kind)) else: - cutn.workspace_set_memory( - handle, workspace, - cutn.Memspace.DEVICE, - cutn.WorkspaceKind.SCRATCH, - 0, -1) # TODO: check custom workspace size? + for kind in cutn.WorkspaceKind: + cutn.workspace_set_memory( + handle, workspace, + cutn.Memspace.DEVICE, + kind, + 0, -1) # TODO: check custom workspace size? plan = None try: @@ -628,7 +688,7 @@ def test_contraction_workflow( tn.get_input_tensors(**input_form), tn.get_output_tensor(), workspace, 0, stream.ptr) - elif contract == "slice_group": + elif contract in ("slice_group", "gradient"): accumulate = 0 cutn.contract_slices( handle, plan, @@ -636,11 +696,49 @@ def test_contraction_workflow( tn.get_output_tensor(), accumulate, workspace, self.slice_group, stream.ptr) + if contract == "gradient": + cutn.compute_gradients_backward( + handle, plan, + tn.get_input_tensors(**input_form), + output_grads.data.ptr, + tn.get_gradient_tensors(**input_form), + accumulate, workspace, stream.ptr) stream.synchronize() finally: if plan is not None: cutn.destroy_contraction_plan(plan) + if contract == "gradient" and torch: + + if not torch.cuda.is_available(): + # copy data back to CPU + dev = "cpu" + func = cp.asnumpy + else: + # zero-copy from CuPy to PyTorch! 
+ dev = "cuda" + func = (lambda x: x) # no op + + inputs = [torch.as_tensor(func(t), device=dev) + for t in tn.input_tensors] + output_grads = torch.as_tensor(func(output_grads), device=dev) + for t in inputs: + t.requires_grad_(True) + assert t.grad is None + + # repeat the same calculation with PyTorch so that it fills up the + # gradients for us to do verification + out = torch.einsum(tn.einsum, *inputs) + out.backward(output_grads) + + # compare gradients + for grad_cutn, in_torch in zip(tn.gradients, inputs): + grad_torch = in_torch.grad + if torch.is_complex(grad_torch): + grad_torch = grad_torch.conj().resolve_conj() + # zero-copy if on GPU + assert cp.allclose(grad_cutn, cp.asarray(grad_torch)) + @pytest.mark.parametrize( 'source', ('int', 'seq', 'range') @@ -652,11 +750,11 @@ def test_slice_group(self, source): # we don't do a simple round-trip test here because there are two # flavors of constructors if source == "int": - ids = numpy.arange(10, dtype=numpy.int64) + ids = np.arange(10, dtype=np.int64) slice_group = cutn.create_slice_group_from_ids( self.handle, ids.ctypes.data, ids.size) elif source == "seq": - ids = numpy.arange(10, dtype=numpy.int64) + ids = np.arange(10, dtype=np.int64) slice_group = cutn.create_slice_group_from_ids( self.handle, ids, ids.size) elif source == "range": @@ -716,7 +814,7 @@ def __init__(self, subscript, shapes, dtype, max_extent=None): self.tensor_names = [f"input_{i}" for i in range(len(shapes))] + ["left", "right"] # note s needs to be explictly managed in the tester function # xp strides in bytes, cutn strides in counts - dtype = cupy.dtype(dtype) + dtype = cp.dtype(dtype) real_dtype = dtype.char.lower() is_complex = dtype.char != real_dtype itemsize = dtype.itemsize @@ -725,13 +823,13 @@ def _get_tensor(name, modes): if name.startswith('input'): shape = [size_dict[mode] for mode in modes] if is_complex: # complex - arr = (cupy.random.random(shape, dtype=real_dtype) - + 1j*cupy.random.random(shape, 
dtype=real_dtype)).astype(dtype) + arr = (cp.random.random(shape, dtype=real_dtype) + + 1j*cp.random.random(shape, dtype=real_dtype)).astype(dtype) else: - arr = cupy.random.random(shape, dtype=dtype) + arr = cp.random.random(shape, dtype=dtype) else: shape = [self.mid_extent if mode == shared_mode_out else size_dict[mode] for mode in modes] - arr = cupy.empty(shape, dtype=dtype, order='F') + arr = cp.empty(shape, dtype=dtype, order='F') return arr for name, modes in zip(self.tensor_names, modes_in + [left_modes_out, right_modes_out]): @@ -744,13 +842,13 @@ def _get_tensor(name, modes): def _get_data_type(self, category): if 'n_modes' in category: - return numpy.int32 + return np.int32 elif 'extent' in category: - return numpy.int64 + return np.int64 elif 'stride' in category: - return numpy.int64 + return np.int64 elif 'mode' in category: - return numpy.int32 + return np.int32 elif 'tensor' in category: return None # unused else: @@ -765,7 +863,7 @@ def _return_data(self, category, return_value): return 0 else: # return int as void* - data = numpy.asarray(data, dtype=self._get_data_type(category)) + data = np.asarray(data, dtype=self._get_data_type(category)) setattr(self, category, data) # keep data alive return data.ctypes.data elif return_value == 'seq': @@ -797,7 +895,7 @@ def get_operands(self, include_inputs=True, include_outputs=True): @testing.parameterize(*testing.product({ 'tn': tensor_decomp_expressions, 'dtype': ( - numpy.float32, numpy.float64, numpy.complex64, numpy.complex128 + np.float32, np.float64, np.complex64, np.complex128 ), 'tensor_form': ( {'extent': 'int', 'stride': 'int', 'mode': 'int'}, @@ -816,7 +914,7 @@ def test_tensor_qr(self): handle, tn, workspace = self.handle, self.tn, self.workspace tensor_in, tensor_q, tensor_r = self.tensor_decom - dtype = cupy.dtype(self.dtype) + dtype = cp.dtype(self.dtype) # prepare workspace cutn.workspace_compute_qr_sizes( @@ -828,7 +926,7 @@ def test_tensor_qr(self): cutn.Memspace.DEVICE, # TODO: 
parametrize memspace? cutn.WorkspaceKind.SCRATCH) if required_size > 0: - workspace_ptr = cupy.cuda.alloc(required_size) + workspace_ptr = cp.cuda.alloc(required_size) cutn.workspace_set_memory( handle, workspace, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, workspace_ptr.ptr, required_size) @@ -837,7 +935,7 @@ def test_tensor_qr(self): handle, workspace, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) # perform QR - stream = cupy.cuda.get_current_stream().ptr # TODO + stream = cp.cuda.get_current_stream().ptr # TODO cutn.tensor_qr( handle, tensor_in, tn.get_tensor_ptr('input_0'), tensor_q, tn.get_tensor_ptr('left'), @@ -852,7 +950,7 @@ def test_tensor_qr(self): @testing.parameterize(*testing.product({ 'tn': tensor_decomp_expressions, 'dtype': ( - numpy.float32, numpy.float64, numpy.complex64, numpy.complex128 + np.float32, np.float64, np.complex64, np.complex128 ), 'tensor_form': ( {'extent': 'int', 'stride': 'int', 'mode': 'int'}, @@ -860,11 +958,11 @@ def test_tensor_qr(self): ), 'options': ( {}, # standard exact svd - {'max_extent': 4, 'normalization':'L1', 'partition':'U'}, # fix extent truncation - {'abs_cutoff': 0.1, 'rel_cutoff': 0.1}, # value based truncation - {'abs_cutoff': 0.1, 'normalization':'L2', 'partition':'V'}, # absolute value based truncation - {'rel_cutoff': 0.1, 'normalization':'LInf', 'partition':'UV'}, # relative value based truncation - {'max_extent': 4, 'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'normalization':'L1', 'partition':'UV'}, # compound truncation + {'max_extent': 4, 'normalization':'L1', 'partition':'U', 'algorithm': 'gesvdr', 'gesvdr_niters': 40}, # fix extent truncation + {'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'algorithm': 'gesvdj', 'gesvdj_tol':1e-14, 'gesvdj_max_sweeps': 80}, # value based truncation + {'abs_cutoff': 0.1, 'normalization':'L2', 'partition':'V', 'algorithm': 'gesvdj'}, # absolute value based truncation + {'rel_cutoff': 0.1, 'normalization':'LInf', 'partition':'UV', 'algorithm': 'gesvdp'}, # relative value 
based truncation + {'max_extent': 4, 'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'normalization':'L1', 'partition':'UV', 'algorithm': 'gesvdp'}, # compound truncation ), })) class TestTensorSVD: @@ -881,7 +979,7 @@ def test_tensor_svd(self): handle, tn, workspace = self.handle, self.tn, self.workspace tensor_in, tensor_u, tensor_v = self.tensor_decom svd_config, svd_info = self.svd_config, self.svd_info - dtype = cupy.dtype(self.dtype) + dtype = cp.dtype(self.dtype) # parse svdConfig svd_method = check_or_create_options(tensor.SVDMethod, self.options, "SVDMethod") @@ -892,29 +990,37 @@ def test_tensor_svd(self): handle, tensor_in, tensor_u, tensor_v, svd_config, workspace) # for now host workspace is always 0, so just query device one # also, it doesn't matter which one (min/recommended/max) is queried - required_size = cutn.workspace_get_memory_size( - handle, workspace, cutn.WorksizePref.MIN, - cutn.Memspace.DEVICE, # TODO: parametrize memspace? - cutn.WorkspaceKind.SCRATCH) - if required_size > 0: - workspace_ptr = cupy.cuda.alloc(required_size) - cutn.workspace_set_memory( - handle, workspace, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, - workspace_ptr.ptr, required_size) - # round-trip check - assert (workspace_ptr.ptr, required_size) == cutn.workspace_get_memory( - handle, workspace, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) + workspaces = {} + allocators = {cutn.Memspace.DEVICE: cp.cuda.alloc, + cutn.Memspace.HOST: lambda nbytes: np.empty(nbytes, dtype=np.int8)} + for mem_space, allocator in allocators.items(): + required_size = cutn.workspace_get_memory_size( + handle, workspace, cutn.WorksizePref.MIN, + mem_space, + cutn.WorkspaceKind.SCRATCH) + if required_size > 0: + workspaces[mem_space] = workspace_ptr = allocator(required_size) # keep alive + if mem_space == cutn.Memspace.DEVICE: + workspace_ptr_address = workspace_ptr.ptr + else: + workspace_ptr_address = workspace_ptr.ctypes.data + cutn.workspace_set_memory( + handle, workspace, mem_space, 
cutn.WorkspaceKind.SCRATCH, + workspace_ptr_address, required_size) + # round-trip check + assert (workspace_ptr_address, required_size) == cutn.workspace_get_memory( + handle, workspace, mem_space, cutn.WorkspaceKind.SCRATCH) partition = self.options.get("partition", None) if partition is None: - s = cupy.empty(tn.mid_extent, dtype=dtype.char.lower()) + s = cp.empty(tn.mid_extent, dtype=dtype.char.lower()) s_ptr = s.data.ptr else: s = None s_ptr = 0 # perform SVD - stream = cupy.cuda.get_current_stream().ptr # TODO + stream = cp.cuda.get_current_stream().ptr # TODO cutn.tensor_svd( handle, tensor_in, tn.get_tensor_ptr('input_0'), tensor_u, tn.get_tensor_ptr('left'), @@ -936,10 +1042,10 @@ def test_tensor_svd(self): if tuple(extent_U_out) != u.shape: strides_U_out = [i * u.itemsize for i in strides_U_out] strides_V_out = [i * v.itemsize for i in strides_V_out] - tn.left_tensor = u = cupy.ndarray(extent_U_out, dtype=u.dtype, memptr=u.data, strides=strides_U_out) + tn.left_tensor = u = cp.ndarray(extent_U_out, dtype=u.dtype, memptr=u.data, strides=strides_U_out) if s is not None: - s = cupy.ndarray(reduced_extent, dtype=s.dtype, memptr=s.data, order='F') - tn.right_tensor = v = cupy.ndarray(extent_V_out, dtype=v.dtype, memptr=v.data, strides=strides_V_out) + s = cp.ndarray(reduced_extent, dtype=s.dtype, memptr=s.data, order='F') + tn.right_tensor = v = cp.ndarray(extent_V_out, dtype=v.dtype, memptr=v.data, strides=strides_V_out) u_ref, s_ref, v_ref, info_ref = approxTN_utils.tensor_decompose( tn.subscript, T, @@ -957,7 +1063,7 @@ def test_tensor_svd(self): @testing.parameterize(*testing.product({ 'tn': gate_decomp_expressions, 'dtype': ( - numpy.float32, numpy.float64, numpy.complex64, numpy.complex128 + np.float32, np.float64, np.complex64, np.complex128 ), 'tensor_form': ( {'extent': 'int', 'stride': 'int', 'mode': 'int'}, @@ -967,18 +1073,18 @@ def test_tensor_svd(self): "direct", "reduced" ), 'options': ( - {}, # exact svd - {'max_extent': 4, 
'normalization':'L1', 'partition':'U'}, # fix extent truncation - {'abs_cutoff': 0.1, 'rel_cutoff': 0.1}, # value based truncation - {'abs_cutoff': 0.1, 'normalization':'L2', 'partition':'V'}, # absolute value based truncation - {'rel_cutoff': 0.1, 'normalization':'LInf', 'partition':'UV'}, # relative value based truncation - {'max_extent': 4, 'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'normalization':'L1', 'partition':'UV'}, # compound truncation + {}, # standard exact svd + {'max_extent': 4, 'normalization':'L1', 'partition':'U', 'algorithm': 'gesvdr', 'gesvdr_niters': 40}, # fix extent truncation + {'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'algorithm': 'gesvdj', 'gesvdj_tol':1e-14, 'gesvdj_max_sweeps': 80}, # value based truncation + {'abs_cutoff': 0.1, 'normalization':'L2', 'partition':'V', 'algorithm': 'gesvdj'}, # absolute value based truncation + {'rel_cutoff': 0.1, 'normalization':'LInf', 'partition':'UV', 'algorithm': 'gesvdp'}, # relative value based truncation + {'max_extent': 4, 'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'normalization':'L1', 'partition':'UV', 'algorithm': 'gesvdp'}, # compound truncation ), })) class TestTensorGate: GATE_ALGO_MAP = {"direct": cutn.GateSplitAlgo.DIRECT, - "reduced": cutn.GateSplitAlgo.REDUCED} + "reduced": cutn.GateSplitAlgo.REDUCED} # There is no easy way for us to test each API independently, so we instead # parametrize the steps and test the whole workflow @@ -999,37 +1105,43 @@ def test_gate_split(self): svd_method = check_or_create_options(tensor.SVDMethod, self.options, "SVDMethod") parse_svd_config(handle, svd_config, svd_method, logger=None) - dtype = cupy.dtype(self.dtype) + dtype = cp.dtype(self.dtype) compute_type = dtype_to_compute_type[self.dtype] # prepare workspace cutn.workspace_compute_gate_split_sizes(handle, tensor_in_a, tensor_in_b, tensor_in_g, tensor_u, tensor_v, gate_algorithm, svd_config, compute_type, workspace) - # for now host workspace is always 0, so just query device one - # also, it doesn't matter 
which one (min/recommended/max) is queried - required_size = cutn.workspace_get_memory_size( - handle, workspace, cutn.WorksizePref.MIN, - cutn.Memspace.DEVICE, # TODO: parametrize memspace? - cutn.WorkspaceKind.SCRATCH) - if required_size > 0: - workspace_ptr = cupy.cuda.alloc(required_size) - cutn.workspace_set_memory( - handle, workspace, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH, - workspace_ptr.ptr, required_size) - # round-trip check - assert (workspace_ptr.ptr, required_size) == cutn.workspace_get_memory( - handle, workspace, cutn.Memspace.DEVICE, cutn.WorkspaceKind.SCRATCH) + workspaces = {} + allocators = {cutn.Memspace.DEVICE: cp.cuda.alloc, + cutn.Memspace.HOST: lambda nbytes: np.empty(nbytes, dtype=np.int8)} + for mem_space, allocator in allocators.items(): + required_size = cutn.workspace_get_memory_size( + handle, workspace, cutn.WorksizePref.MIN, + mem_space, + cutn.WorkspaceKind.SCRATCH) + if required_size > 0: + workspaces[mem_space] = workspace_ptr = allocator(required_size) # keep alive + if mem_space == cutn.Memspace.DEVICE: + workspace_ptr_address = workspace_ptr.ptr + else: + workspace_ptr_address = workspace_ptr.ctypes.data + cutn.workspace_set_memory( + handle, workspace, mem_space, cutn.WorkspaceKind.SCRATCH, + workspace_ptr_address, required_size) + # round-trip check + assert (workspace_ptr_address, required_size) == cutn.workspace_get_memory( + handle, workspace, mem_space, cutn.WorkspaceKind.SCRATCH) partition = self.options.get("partition", None) if partition is None: - s = cupy.empty(tn.mid_extent, dtype=dtype.char.lower()) + s = cp.empty(tn.mid_extent, dtype=dtype.char.lower()) s_ptr = s.data.ptr else: s = None s_ptr = 0 # perform gate split - stream = cupy.cuda.get_current_stream().ptr # TODO + stream = cp.cuda.get_current_stream().ptr # TODO cutn.gate_split(handle, tensor_in_a, tn.get_tensor_ptr('input_0'), tensor_in_b, tn.get_tensor_ptr('input_1'), tensor_in_g, tn.get_tensor_ptr('input_2'), @@ -1052,10 +1164,10 @@ def 
test_gate_split(self): if tuple(extent_U_out) != u.shape: strides_U_out = [i * u.itemsize for i in strides_U_out] strides_V_out = [i * v.itemsize for i in strides_V_out] - tn.left_tensor = u = cupy.ndarray(extent_U_out, dtype=u.dtype, memptr=u.data, strides=strides_U_out) + tn.left_tensor = u = cp.ndarray(extent_U_out, dtype=u.dtype, memptr=u.data, strides=strides_U_out) if s is not None: - s = cupy.ndarray(reduced_extent, dtype=s.dtype, memptr=s.data, order='F') - tn.right_tensor = v = cupy.ndarray(extent_V_out, dtype=v.dtype, memptr=v.data, strides=strides_V_out) + s = cp.ndarray(reduced_extent, dtype=s.dtype, memptr=s.data, order='F') + tn.right_tensor = v = cp.ndarray(extent_V_out, dtype=v.dtype, memptr=v.data, strides=strides_V_out) u_ref, s_ref, v_ref, info_ref = approxTN_utils.gate_decompose( tn.subscript, @@ -1083,7 +1195,7 @@ def test_tensor_svd_config_create_destroy(self): pass @pytest.mark.parametrize( - 'attr', [val for val in cutn.TensorSVDConfigAttribute] + 'attr', [val for val in cutn.TensorSVDConfigAttribute if val != cutn.TensorSVDConfigAttribute.ALGO_PARAMS] ) @manage_resource('handle') @manage_resource('svd_config') @@ -1091,16 +1203,43 @@ def test_tensor_svd_config_get_set_attribute(self, attr): handle, svd_config = self.handle, self.svd_config dtype = cutn.tensor_svd_config_get_attribute_dtype(attr) # Hack: assume this is a valid value for all attrs - factor = numpy.asarray([0.8], dtype=dtype) + factor = np.asarray([0.8], dtype=dtype) cutn.tensor_svd_config_set_attribute( handle, svd_config, attr, factor.ctypes.data, factor.dtype.itemsize) # do a round-trip test as a sanity check - factor2 = numpy.zeros_like(factor) + factor2 = np.zeros_like(factor) cutn.tensor_svd_config_get_attribute( handle, svd_config, attr, factor2.ctypes.data, factor2.dtype.itemsize) assert factor == factor2 + + @pytest.mark.parametrize( + 'svd_algorithm', (cutn.TensorSVDAlgo.GESVDJ, cutn.TensorSVDAlgo.GESVDR) + ) + @manage_resource('handle') + 
@manage_resource('svd_config') + def test_tensor_svd_config_get_set_params_attribute(self, svd_algorithm): + handle, svd_config = self.handle, self.svd_config + # set ALGO first + algo_dtype = cutn.tensor_svd_config_get_attribute_dtype(cutn.TensorSVDConfigAttribute.ALGO) + algo = np.asarray(svd_algorithm, dtype=algo_dtype) + cutn.tensor_svd_config_set_attribute( + handle, svd_config, cutn.TensorSVDConfigAttribute.ALGO, + algo.ctypes.data, algo.dtype.itemsize) + + algo_params_dtype = cutn.tensor_svd_algo_params_get_dtype(svd_algorithm) + # Hack: assume this is a valid value for all SVD parameters + factor = np.asarray([1.8], dtype=algo_params_dtype) # 0 may trigger default behavior, eg, gesvdr_niters set to 0 means default (10) + cutn.tensor_svd_config_set_attribute( + handle, svd_config, cutn.TensorSVDConfigAttribute.ALGO_PARAMS, + factor.ctypes.data, factor.dtype.itemsize) + # do a round-trip test as a sanity check + factor2 = np.zeros_like(factor) + cutn.tensor_svd_config_get_attribute( + handle, svd_config, cutn.TensorSVDConfigAttribute.ALGO_PARAMS, + factor2.ctypes.data, factor2.dtype.itemsize) + assert factor == factor2 @pytest.mark.skipif(mpi4py is None, reason="need mpi4py") diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_experimental.py b/python/tests/cuquantum_tests/cutensornet_tests/test_experimental.py index cac42dc..d0da2ec 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_experimental.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_experimental.py @@ -130,22 +130,20 @@ def test_contract_qr_decompose(self, decompose_expr, xp, dtype, order, stream): self._run_contract_decompose(decompose_expr, xp, dtype, order, stream, algorithm) - @pytest.mark.parametrize( - "svd_method_seed", (None, 0, 1, 2) - ) - def test_contract_svd_decompose(self, decompose_expr, xp, dtype, order, stream, svd_method_seed): - svd_method = gen_rand_svd_method(seed=svd_method_seed) - algorithm = ContractDecomposeAlgorithm(qr_method=False, 
svd_method=svd_method) - self._run_contract_decompose(decompose_expr, xp, dtype, order, stream, algorithm) + def test_contract_svd_decompose(self, decompose_expr, xp, dtype, order, stream): + rng = numpy.random.default_rng(2021) + methods = [tensor.SVDMethod()] + [gen_rand_svd_method(rng) for _ in range(10)] + for svd_method in methods: + algorithm = ContractDecomposeAlgorithm(qr_method=False, svd_method=svd_method) + self._run_contract_decompose(decompose_expr, xp, dtype, order, stream, algorithm) - @pytest.mark.parametrize( - "svd_method_seed", (None, 0, 1, 2) - ) - def test_contract_qr_assisted_svd_decompose(self, decompose_expr, xp, dtype, order, stream, svd_method_seed): - svd_method = gen_rand_svd_method(seed=svd_method_seed) - algorithm = ContractDecomposeAlgorithm(qr_method={}, svd_method=svd_method) - self._run_contract_decompose(decompose_expr, xp, dtype, order, stream, algorithm) + def test_contract_qr_assisted_svd_decompose(self, decompose_expr, xp, dtype, order, stream): + rng = numpy.random.default_rng(2021) + methods = [tensor.SVDMethod()] + [gen_rand_svd_method(rng) for _ in range(10)] + for svd_method in methods: + algorithm = ContractDecomposeAlgorithm(qr_method={}, svd_method=svd_method) + self._run_contract_decompose(decompose_expr, xp, dtype, order, stream, algorithm) class TestContractDecomposeAlgorithm(_OptionsBase): @@ -178,7 +176,7 @@ class TestContractDecomposeInfo(_OptionsBase): intermediate_modes=[(1, 3), (2, 4)])] ) @pytest.mark.parametrize( - 'svd_info', [None, tensor.SVDInfo(reduced_extent=2, full_extent=4, discarded_weight=0.01)] + 'svd_info', [None, tensor.SVDInfo(reduced_extent=2, full_extent=4, discarded_weight=0.01, algorithm='gesvdj')] ) @pytest.mark.parametrize( 'svd_method', [False, {}, tensor.SVDMethod()] diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_internal.py b/python/tests/cuquantum_tests/cutensornet_tests/test_internal.py index 4e97467..32885db 100644 --- 
a/python/tests/cuquantum_tests/cutensornet_tests/test_internal.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_internal.py @@ -1,10 +1,14 @@ +import re +import sys import threading import cupy as cp from cupy.cuda.runtime import getDevice, setDevice import pytest +from cuquantum.cutensornet import _internal from cuquantum.cutensornet._internal import utils +from cuquantum.utils import WHITESPACE_UNICODE class TestDeviceCtx: @@ -85,3 +89,22 @@ def test_one_shot(self): with pytest.raises(Exception): with dev: pass + + +class TestGetSymbol: + + def test_no_whitespace(self): + # Note: max(whitespace_s) = 12288 + out = [] + for i in range(0, 30000): + s = _internal.circuit_converter_utils._get_symbol(i) + assert not s.isspace() + out.append(s) + + # check the mapping is unique + assert len(set(out)) == 30000 + + def test_whitespace_unicode_consistency(self): + all_s = ''.join(chr(c) for c in range(sys.maxunicode+1)) + whitespace_s = ''.join(re.findall(r'\s', all_s)) + assert WHITESPACE_UNICODE == whitespace_s diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_options.py b/python/tests/cuquantum_tests/cutensornet_tests/test_options.py index 9f4079f..a8ab520 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_options.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_options.py @@ -104,7 +104,7 @@ class TestOptimizerInfo(_OptionsBase): options_type = OptimizerInfo - # All fileds in OptimizerInfo are required, so we must test + # All fields in OptimizerInfo are required, so we must test # them at once def test_optimizer_info(self): self.create_options({ diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_tensor.py b/python/tests/cuquantum_tests/cutensornet_tests/test_tensor.py index 2272fa8..502ebc6 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_tensor.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_tensor.py @@ -2,10 +2,11 @@ # # SPDX-License-Identifier: BSD-3-Clause +import copy 
+import dataclasses import sys import cupy -import dataclasses import numpy import pytest @@ -35,12 +36,17 @@ "xp", backend_names ) @pytest.mark.parametrize( - "decompose_expr", list(set([expr[0] for expr in tensor_decomp_expressions])) # filter out duplicated expressions + "decompose_expr", tensor_decomp_expressions +) +@pytest.mark.parametrize( + "blocking", (True, "auto") ) class TestDecompose: - def _run_decompose(self, decompose_expr, xp, dtype, order, stream, method, **kwargs): - factory = DecomposeFactory(decompose_expr) + def _run_decompose( + self, decompose_expr, xp, dtype, order, stream, method, **kwargs): + decompose_expr, shapes = copy.deepcopy(decompose_expr) + factory = DecomposeFactory(decompose_expr, shapes=shapes) operand = factory.generate_operands(factory.input_shapes, xp, dtype, order)[0] backend = sys.modules[infer_object_package(operand)] @@ -49,10 +55,11 @@ def _run_decompose(self, decompose_expr, xp, dtype, order, stream, method, **kwa return_info = kwargs.pop("return_info", False) outputs = tensor.decompose(decompose_expr, - operand, - method=method, - return_info=return_info, - stream=stream) + operand, + method=method, + options={"blocking": kwargs["blocking"]}, + stream=stream, + return_info=return_info) if stream: stream.synchronize() @@ -72,7 +79,7 @@ def _run_decompose(self, decompose_expr, xp, dtype, order, stream, method, **kwa else: u, s, v = outputs u_ref, s_ref, v_ref = outputs_ref - info = None + info = {'algorithm': method.algorithm} info_ref = None assert type(u) is type(v) @@ -90,18 +97,22 @@ def _run_decompose(self, decompose_expr, xp, dtype, order, stream, method, **kwa info_ref=info_ref, **svd_kwargs) - def test_qr(self, decompose_expr, xp, dtype, order, stream): - self._run_decompose(decompose_expr, xp, dtype, order, stream, tensor.QRMethod()) + def test_qr(self, decompose_expr, xp, dtype, order, stream, blocking): + self._run_decompose( + decompose_expr, xp, dtype, order, stream, tensor.QRMethod(), + blocking=blocking) 
- @pytest.mark.parametrize( - "svd_method_seed", (None, 0, 1, 2) - ) @pytest.mark.parametrize( "return_info", (False, True) ) - def test_svd(self, decompose_expr, xp, dtype, order, stream, return_info, svd_method_seed): - method = gen_rand_svd_method(seed=svd_method_seed) - self._run_decompose(decompose_expr, xp, dtype, order, stream, method, return_info=return_info) + def test_svd( + self, decompose_expr, xp, dtype, order, stream, return_info, blocking): + rng = numpy.random.default_rng(2021) + methods = [tensor.SVDMethod()] + [gen_rand_svd_method(rng) for _ in range(10)] + for method in methods: + self._run_decompose( + decompose_expr, xp, dtype, order, stream, method, + blocking=blocking, return_info=return_info) class TestDecompositionOptions(TestNetworkOptions): @@ -134,11 +145,33 @@ def test_partition(self, partition): def test_normalization(self, normalization): self.create_options({'normalization': normalization}) + @pytest.mark.parametrize( + 'algorithm', ['gesvd', 'gesvdj', 'gesvdp', 'gesvdr'] + ) + def test_algorithm(self, algorithm): + options = {'algorithm': algorithm} + if algorithm == 'gesvdj': + options['gesvdj_tol'] = 1e-16 + options['gesvdj_max_sweeps'] = 80 + elif algorithm == 'gesvdr': + options['gesvdr_oversampling'] = 4 + options['gesvdr_niters'] = 8 + self.create_options(options) + class TestSVDInfo(_OptionsBase): options_type = tensor.SVDInfo # All fields are required. Therefore we test them all at once. 
- def test_svd_info(self): - self.create_options({'reduced_extent': 6, 'full_extent': 8, 'discarded_weight': 0.02}) + @pytest.mark.parametrize( + 'algorithm', ['gesvd', 'gesvdj', 'gesvdp', 'gesvdr'] + ) + def test_svd_info(self, algorithm): + info = {'reduced_extent': 6, 'full_extent': 8, 'discarded_weight': 0.02, 'algorithm': algorithm} + if algorithm == 'gesvdj': + info['gesvdj_sweeps'] = 12 + info['gesvdj_residual'] = 1e-12 + elif algorithm == 'gesvdp': + info['gesvdp_err_sigma'] = 1e-8 + self.create_options(info) diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_utils.py b/python/tests/cuquantum_tests/cutensornet_tests/test_utils.py index 9ee56b4..faaabcd 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_utils.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_utils.py @@ -10,6 +10,8 @@ import numpy try: import torch + if not torch.cuda.is_available(): + raise ImportError except ImportError: torch = None @@ -284,6 +286,15 @@ def convert_by_format(self, operands, *, dummy=False): class DecomposeFactory(ExpressionFactory): + def __init__(self, expression, *, shapes=None): + super().__init__(expression) + + if shapes is not None: + # overwrite the base class's dict + inputs, _ = self.expr.split("->") + inputs = inputs.split(",") + self.size_dict = dict((m, e) for k, v in zip(inputs, shapes) for m, e in zip(k, v)) + @property def modes(self): if self._modes is None: @@ -301,20 +312,23 @@ def modes(self): raise ValueError("decomposition does not support interleave format") return self._modes - -def gen_rand_svd_method(seed=None): - if seed is None: - return tensor.SVDMethod() - else: - numpy.random.seed(seed) - method = {"max_extent": numpy.random.randint(1, high=6), - "abs_cutoff": numpy.random.random() / 2.0, # [0, 0.5) - "rel_cutoff": numpy.random.random() / 2.0, # [0, 0.5) - "normalization": numpy.random.choice([None, "L1", "L2", "LInf"]), - "partition": numpy.random.choice([None, "U", "V", "UV"])} - return 
tensor.SVDMethod(**method) +def gen_rand_svd_method(rng): + method = {"max_extent": rng.choice(range(1, 7)), + "abs_cutoff": rng.random() / 2.0, # [0, 0.5) + "rel_cutoff": 0.1 + rng.random() / 2.5 , # [0.1, 0.5) + "normalization": rng.choice([None, "L1", "L2", "LInf"]), + "partition": rng.choice([None, "U", "V", "UV"]), + "algorithm": rng.choice(['gesvd', 'gesvdj', 'gesvdp', 'gesvdr'])} + if method["algorithm"] == 'gesvdj': + method["gesvdj_tol"] = rng.choice([0, 1e-14]) + method["gesvdj_max_sweeps"] = rng.choice([0, 100]) + elif method["algorithm"] == 'gesvdr': + method["gesvdr_niters"] = rng.choice([0, 40]) + # we can't set oversampling as it depends on matrix size here + return tensor.SVDMethod(**method) + # We want to avoid fragmenting the stream-ordered mempools _predefined_streams = { diff --git a/samples/custatevec/CMakeLists.txt b/samples/custatevec/CMakeLists.txt index 04a8e1c..520a447 100644 --- a/samples/custatevec/CMakeLists.txt +++ b/samples/custatevec/CMakeLists.txt @@ -123,20 +123,25 @@ endfunction() add_custom_target(custatevec_examples) -add_custatevec_example(custatevec_examples "cuStateVec.example.gate_application" gate_application.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.permutation_matrix" permutation_matrix.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.diagonal_matrix" diagonal_matrix.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.exponential_pauli" exponential_pauli.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.expectation" expectation.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.expentation_pauli" expectation_pauli.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.sampler" sampler.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.measure_zbasis" measure_zbasis.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.batch_measure" batch_measure.cu) 
-add_custatevec_example(custatevec_examples "cuStateVec.example.accessor_get" accessor_get.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.accessor_set" accessor_set.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.test_matrix_type" test_matrix_type.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.memory_handler" memory_handler.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.swap_index_bits" swap_index_bits.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.mgpu_swap_index_bits" mgpu_swap_index_bits.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.mgpu_sampler" mgpu_sampler.cu) -add_custatevec_example(custatevec_examples "cuStateVec.example.mgpu_batch_measure" mgpu_batch_measure.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.initialize_sv" initialize_sv.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.gate_application" gate_application.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.permutation_matrix" permutation_matrix.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.diagonal_matrix" diagonal_matrix.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.batched_gate_application" batched_gate_application.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.exponential_pauli" exponential_pauli.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.expectation" expectation.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.expentation_pauli" expectation_pauli.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.sampler" sampler.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.measure_zbasis" measure_zbasis.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.batch_measure" batch_measure.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.batched_collapse" 
batched_collapse.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.batched_abs2sum" batched_abs2sum.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.batched_measure" batched_measure.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.accessor_get" accessor_get.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.accessor_set" accessor_set.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.test_matrix_type" test_matrix_type.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.memory_handler" memory_handler.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.swap_index_bits" swap_index_bits.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.mgpu_swap_index_bits" mgpu_swap_index_bits.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.mgpu_sampler" mgpu_sampler.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.mgpu_batch_measure" mgpu_batch_measure.cu) diff --git a/samples/custatevec/Makefile b/samples/custatevec/Makefile index a96ea9a..ffc2649 100644 --- a/samples/custatevec/Makefile +++ b/samples/custatevec/Makefile @@ -19,23 +19,29 @@ ARCH_FLAGS = $(ARCH_FLAGS_SM70) $(ARCH_FLAGS_SM75) $(ARCH_FLAGS_SM80) $(ARC CXX_FLAGS = -std=c++11 $(INCLUDE_DIRS) $(LIBRARY_DIRS) $(ARCH_FLAGS) $(LINKER_FLAGS) all: check-env - nvcc gate_application.cu -o gate_application ${CXX_FLAGS} - nvcc permutation_matrix.cu -o permutation_matrix ${CXX_FLAGS} - nvcc diagonal_matrix.cu -o diagonal_matrix ${CXX_FLAGS} - nvcc exponential_pauli.cu -o exponential_pauli ${CXX_FLAGS} - nvcc expectation.cu -o expectation ${CXX_FLAGS} - nvcc expectation_pauli.cu -o expectation_pauli ${CXX_FLAGS} - nvcc sampler.cu -o sampler ${CXX_FLAGS} - nvcc measure_zbasis.cu -o measure_zbasis ${CXX_FLAGS} - nvcc batch_measure.cu -o batch_measure ${CXX_FLAGS} - nvcc accessor_get.cu -o accessor_get ${CXX_FLAGS} - nvcc accessor_set.cu -o accessor_set 
${CXX_FLAGS} - nvcc test_matrix_type.cu -o test_matrix_type ${CXX_FLAGS} - nvcc memory_handler.cu -o memory_handler ${CXX_FLAGS} - nvcc swap_index_bits.cu -o swap_index_bits ${CXX_FLAGS} - nvcc mgpu_swap_index_bits.cu -o mgpu_swap_index_bits ${CXX_FLAGS} - nvcc mgpu_batch_measure.cu -o mgpu_batch_measure ${CXX_FLAGS} - nvcc mgpu_sampler.cu -o mgpu_sampler ${CXX_FLAGS} + nvcc initialize_sv.cu -o initialize_sv ${CXX_FLAGS} + nvcc gate_application.cu -o gate_application ${CXX_FLAGS} + nvcc permutation_matrix.cu -o permutation_matrix ${CXX_FLAGS} + nvcc diagonal_matrix.cu -o diagonal_matrix ${CXX_FLAGS} + nvcc batched_gate_application.cu -o batched_gate_application ${CXX_FLAGS} + nvcc exponential_pauli.cu -o exponential_pauli ${CXX_FLAGS} + nvcc expectation.cu -o expectation ${CXX_FLAGS} + nvcc expectation_pauli.cu -o expectation_pauli ${CXX_FLAGS} + nvcc sampler.cu -o sampler ${CXX_FLAGS} + nvcc measure_zbasis.cu -o measure_zbasis ${CXX_FLAGS} + nvcc batch_measure.cu -o batch_measure ${CXX_FLAGS} + nvcc batched_collapse.cu -o batched_collapse ${CXX_FLAGS} + nvcc batched_abs2sum.cu -o batched_abs2sum ${CXX_FLAGS} + nvcc batched_measure.cu -o batched_measure ${CXX_FLAGS} + nvcc accessor_get.cu -o accessor_get ${CXX_FLAGS} + nvcc accessor_set.cu -o accessor_set ${CXX_FLAGS} + nvcc test_matrix_type.cu -o test_matrix_type ${CXX_FLAGS} + nvcc memory_handler.cu -o memory_handler ${CXX_FLAGS} + nvcc swap_index_bits.cu -o swap_index_bits ${CXX_FLAGS} + nvcc mgpu_swap_index_bits.cu -o mgpu_swap_index_bits ${CXX_FLAGS} + nvcc mgpu_batch_measure.cu -o mgpu_batch_measure ${CXX_FLAGS} + nvcc mgpu_sampler.cu -o mgpu_sampler ${CXX_FLAGS} + check-env: @ echo "" && \ @@ -50,15 +56,20 @@ check-env: fi clean: - rm -f gate_application \ + rm -f initialize_sv \ + gate_application \ permutation_matrix \ diagonal_matrix \ + batched_gate_application \ exponential_pauli \ expectation \ expectation_pauli \ sampler \ measure_zbasis \ batch_measure \ + batched_collapse \ + abs2sum_batched \ + 
measure_batched \ accessor_get \ accessor_set \ test_matrix_type \ diff --git a/samples/custatevec/batched_abs2sum.cu b/samples/custatevec/batched_abs2sum.cu new file mode 100644 index 0000000..bdc5c3a --- /dev/null +++ b/samples/custatevec/batched_abs2sum.cu @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include // cudaMalloc, cudaMemcpy, etc. +#include // cuDoubleComplex +#include // custatevecAbs2SumArrayBatched +#include // printf +#include // EXIT_FAILURE + +#include "helper.hpp" // HANDLE_ERROR, HANDLE_CUDA_ERROR + +int main(void) { + + const int nSVs = 2; + const int nIndexBits = 3; + const int nSvElms = (1 << nIndexBits); + const int bitOrderingLen = 1; + + // square absolute values of state vector elements for 0/2-th bits will be summed up + const int bitOrdering[] = {1}; + + const custatevecIndex_t svStride = nSvElms; + + // 2 state vectors are allocated contiguously in single memory chunk. + cuDoubleComplex h_svs[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}, + { 0.25, 0.25}, { 0.25, 0.25}, { 0.25, 0.25}, { 0.25, 0.25}, + { 0.25, 0.25}, { 0.25, 0.25}, { 0.25, 0.25}, { 0.25, 0.25}}; + + const custatevecIndex_t abs2sumStride = 2; + const custatevecIndex_t batchedAbs2sumSize = nSVs * abs2sumStride; + + // abs2sum arrays are allocated contiguously in single memory chunk. 
+ double abs2sum[batchedAbs2sumSize]; + const double abs2sum_result[] = {0.27, 0.73, 0.5, 0.5}; + + cuDoubleComplex *d_svs; + HANDLE_CUDA_ERROR( cudaMalloc((void**)&d_svs, nSVs * nSvElms * sizeof(cuDoubleComplex)) ); + + HANDLE_CUDA_ERROR( cudaMemcpy(d_svs, h_svs, nSVs * nSvElms * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice) ); + + //---------------------------------------------------------------------------------------------- + + // custatevec handle initialization + custatevecHandle_t handle; + HANDLE_ERROR( custatevecCreate(&handle) ); + + // compute abs2sum arrays + HANDLE_ERROR( custatevecAbs2SumArrayBatched( + handle, d_svs, CUDA_C_64F, nIndexBits, nSVs, svStride, abs2sum, abs2sumStride, + bitOrdering, bitOrderingLen, nullptr, nullptr, 0) ); + + // destroy handle + HANDLE_ERROR( custatevecDestroy(handle) ); + + //---------------------------------------------------------------------------------------------- + + HANDLE_CUDA_ERROR( cudaDeviceSynchronize() ); + + bool correct = true; + for (custatevecIndex_t i = 0; i < batchedAbs2sumSize; i++) { + if (!almost_equal(abs2sum[i], abs2sum_result[i])) { + correct = false; + break; + } + } + + HANDLE_CUDA_ERROR( cudaFree(d_svs) ); + + if (correct) { + printf("abs2sum_batched example PASSED\n"); + return EXIT_SUCCESS; + } + else { + printf("abs2sum_batched example FAILED: wrong result\n"); + return EXIT_FAILURE; + } +} \ No newline at end of file diff --git a/samples/custatevec/batched_collapse.cu b/samples/custatevec/batched_collapse.cu new file mode 100644 index 0000000..0204ee6 --- /dev/null +++ b/samples/custatevec/batched_collapse.cu @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include // cudaMalloc, cudaMemcpy, etc. 
+#include // cuDoubleComplex +#include // custatevecCollapseByBitStringBatched +#include // printf +#include // EXIT_FAILURE + +#include "helper.hpp" // HANDLE_ERROR, HANDLE_CUDA_ERROR + +int main(void) { + + const int nSVs = 2; + const int nIndexBits = 3; + const int svSize = (1 << nIndexBits); + const int svStride = (1 << nIndexBits); // no padding + + // 2 state vectors are allocated contiguously in single memory chunk. + cuDoubleComplex h_svs[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}, + { 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}}; + cuDoubleComplex h_svs_result[] = {{ 0.0, 0.0}, { 0.0, 1.0}, { 0.0, 0.0}, { 0.0, 0.0}, + { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, + { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, + { 0.0, 0.0}, { 0.0, 0.0}, { 0.6, 0.8}, { 0.0, 0.0}}; + + // 2 bitStrings are allocated contiguously in single memory chunk. + // The 1st SV collapses to |001> and the 2nd to |110> + // Note: bitStrings can also live on the device. + custatevecIndex_t bitStrings[] = {0b001, 0b110}; + + // bit ordering should only live on host + const int32_t bitOrdering[] = {0, 1, 2}; + const uint32_t bitStringLen = nIndexBits; + + // 2 norms are allocated contiguously in single memory chunk. + // Note: norms can also live on the device. 
+ double norms[] = {0.01, 0.25}; + + cuDoubleComplex *d_svs; + HANDLE_CUDA_ERROR( cudaMalloc((void**)&d_svs, nSVs * svSize * sizeof(cuDoubleComplex)) ); + + HANDLE_CUDA_ERROR( cudaMemcpy(d_svs, h_svs, nSVs * svSize * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice) ); + + //--------------------------------------------------------------------------------------------- + + // custatevec handle initialization + custatevecHandle_t handle; + HANDLE_ERROR( custatevecCreate(&handle) ); + + void* extraWorkspace = nullptr; + size_t extraWorkspaceSizeInBytes = 0; + + // check the size of external workspace + HANDLE_ERROR( custatevecCollapseByBitStringBatchedGetWorkspaceSize( + handle, nSVs, bitStrings, norms, &extraWorkspaceSizeInBytes) ); + + // allocate external workspace if necessary + if (extraWorkspaceSizeInBytes > 0) + HANDLE_CUDA_ERROR( cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes) ); + + // collapse the quantum states to the target bitstrings + HANDLE_ERROR( custatevecCollapseByBitStringBatched( + handle, d_svs, CUDA_C_64F, nIndexBits, nSVs, svStride, + bitStrings, bitOrdering, bitStringLen, norms, + extraWorkspace, extraWorkspaceSizeInBytes) ); + + // destroy handle + HANDLE_ERROR( custatevecDestroy(handle) ); + + //--------------------------------------------------------------------------------------------- + + HANDLE_CUDA_ERROR( cudaMemcpy(h_svs, d_svs, nSVs * svSize * sizeof(cuDoubleComplex), + cudaMemcpyDeviceToHost) ); + + bool correct = true; + for (int i = 0; i < nSVs * svSize; i++) { + if (!almost_equal(h_svs[i], h_svs_result[i])) { + correct = false; + break; + } + } + + HANDLE_CUDA_ERROR( cudaFree(d_svs) ); + if (extraWorkspaceSizeInBytes) + HANDLE_CUDA_ERROR( cudaFree(extraWorkspace) ); + + if (correct) { + printf("batched_collapse example PASSED\n"); + return EXIT_SUCCESS; + } + else { + printf("batched_collapse example FAILED: wrong result\n"); + return EXIT_FAILURE; + } + +} diff --git a/samples/custatevec/batched_gate_application.cu 
b/samples/custatevec/batched_gate_application.cu new file mode 100644 index 0000000..de6ee3f --- /dev/null +++ b/samples/custatevec/batched_gate_application.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include // cudaMalloc, cudaMemcpy, etc. +#include // cuDoubleComplex +#include // custatevecApplyMatrixBatched +#include // printf +#include // EXIT_FAILURE + +#include "helper.hpp" // HANDLE_ERROR, HANDLE_CUDA_ERROR + +int main(void) { + + const int nSVs = 2; + const int nIndexBits = 3; + const int nSvSize = (1 << nIndexBits); + const int svStride = nSvSize; + const int nTargets = 1; + const int nControls = 2; + const int adjoint = 0; + + const int targets[] = {2}; + const int controls[] = {0, 1}; + + const int nMatrices = 2; + const int matrixIndices[] = {1, 0}; + + // 2 state vectors are allocated contiguously in single memory chunk. + cuDoubleComplex h_svs[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}, + { 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}}; + cuDoubleComplex h_svs_result[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, {-0.4,-0.5}, + { 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.4, 0.5}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.1, 0.2}}; + // 2 gate matrices are allocated contiguously in single memory chunk. 
+ cuDoubleComplex matrices[] = {{0.0, 0.0}, {1.0, 0.0}, + {1.0, 0.0}, {0.0, 0.0}, + {1.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {-1.0, 0.0}}; + + cuDoubleComplex *d_svs; + HANDLE_CUDA_ERROR( cudaMalloc((void**)&d_svs, nSVs * svStride * sizeof(cuDoubleComplex)) ); + + HANDLE_CUDA_ERROR( cudaMemcpy(d_svs, h_svs, nSVs * svStride * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice) ); + + //--------------------------------------------------------------------------------------------- + + // custatevec handle initialization + custatevecHandle_t handle; + HANDLE_ERROR( custatevecCreate(&handle) ); + + void* extraWorkspace = nullptr; + size_t extraWorkspaceSizeInBytes = 0; + + // check the size of external workspace + HANDLE_ERROR( custatevecApplyMatrixBatchedGetWorkspaceSize( + handle, CUDA_C_64F, nIndexBits, nSVs, svStride, + CUSTATEVEC_MATRIX_MAP_TYPE_MATRIX_INDEXED, matrixIndices, matrices, CUDA_C_64F, + CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, nMatrices, nTargets, nControls, + CUSTATEVEC_COMPUTE_64F, &extraWorkspaceSizeInBytes) ); + + // allocate external workspace if necessary + if (extraWorkspaceSizeInBytes > 0) + HANDLE_CUDA_ERROR( cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes) ); + + // apply gate + HANDLE_ERROR( custatevecApplyMatrixBatched( + handle, d_svs, CUDA_C_64F, nIndexBits, nSVs, svStride, + CUSTATEVEC_MATRIX_MAP_TYPE_MATRIX_INDEXED, matrixIndices, matrices, CUDA_C_64F, + CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, nMatrices, targets, nTargets, controls, + nullptr, nControls, CUSTATEVEC_COMPUTE_64F, extraWorkspace, + extraWorkspaceSizeInBytes) ); + + // destroy handle + HANDLE_ERROR( custatevecDestroy(handle) ); + + //--------------------------------------------------------------------------------------------- + + HANDLE_CUDA_ERROR( cudaMemcpy(h_svs, d_svs, nSVs * svStride * sizeof(cuDoubleComplex), + cudaMemcpyDeviceToHost) ); + + bool correct = true; + for (int i = 0; i < nSVs * svStride; i++) { + if (!almost_equal(h_svs[i], h_svs_result[i])) { + correct = 
false; + break; + } + } + + HANDLE_CUDA_ERROR( cudaFree(d_svs) ); + if (extraWorkspaceSizeInBytes) + HANDLE_CUDA_ERROR( cudaFree(extraWorkspace) ); + + if (correct) { + printf("batched_gate_application example PASSED\n"); + return EXIT_SUCCESS; + } + else { + printf("batched_gate_application example FAILED: wrong result\n"); + return EXIT_FAILURE; + } + +} \ No newline at end of file diff --git a/samples/custatevec/batched_measure.cu b/samples/custatevec/batched_measure.cu new file mode 100644 index 0000000..d10c921 --- /dev/null +++ b/samples/custatevec/batched_measure.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include // cudaMalloc, cudaMemcpy, etc. +#include // cuDoubleComplex +#include // custatevecMeasureBatched +#include // printf +#include // EXIT_FAILURE + +#include "helper.hpp" // HANDLE_ERROR, HANDLE_CUDA_ERROR + +int main(void) { + + const int nSVs = 2; + const int nIndexBits = 3; + const int nSvElms = (1 << nIndexBits); + const int bitStringLen = 3; + + const int bitOrdering[] = {2, 1, 0}; + + const custatevecIndex_t svStride = nSvElms; + + custatevecIndex_t bitStrings[nSVs]; + const custatevecIndex_t bitStrings_result[] = {0b100, 0b011}; + + // In a real application, random numbers in the range [0, 1) will be used. + const double randnums[] = {0.009, 0.5}; + + // 2 state vectors are allocated contiguously in a single memory chunk.
+ cuDoubleComplex h_svs[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}, + { 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1}, { 0.1, 0.2}, + { 0.2, 0.2}, { 0.3, 0.3}, { 0.3, 0.4}, { 0.4, 0.5}}; + cuDoubleComplex h_svs_result[] = {{ 0.0, 0.0}, { 0.0, 1.0}, { 0.0, 0.0}, { 0.0, 0.0}, + { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, + { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, + { 0.0, 0.0}, { 0.0, 0.0}, { 0.6, 0.8}, { 0.0, 0.0}}; + + cuDoubleComplex *d_svs; + HANDLE_CUDA_ERROR( cudaMalloc((void**)&d_svs, nSVs * nSvElms * sizeof(cuDoubleComplex)) ); + + HANDLE_CUDA_ERROR( cudaMemcpy(d_svs, h_svs, nSVs * nSvElms * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice) ); + + //---------------------------------------------------------------------------------------------- + + // custatevec handle initialization + custatevecHandle_t handle; + HANDLE_ERROR( custatevecCreate(&handle) ); + + // batched measurement + HANDLE_ERROR( custatevecMeasureBatched( + handle, d_svs, CUDA_C_64F, nIndexBits, nSVs, svStride, bitStrings, bitOrdering, + bitStringLen, randnums, CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO) ); + + // destroy handle + HANDLE_ERROR( custatevecDestroy(handle) ); + + //---------------------------------------------------------------------------------------------- + + HANDLE_CUDA_ERROR( cudaMemcpy(h_svs, d_svs, nSVs * nSvElms * sizeof(cuDoubleComplex), + cudaMemcpyDeviceToHost) ); + + bool correct = true; + for (int i = 0; i < nSVs * nSvElms; i++) { + if (!almost_equal(h_svs[i], h_svs_result[i])) { + correct = false; + break; + } + } + + for (int i = 0; i < nSVs; i++) { + if (bitStrings[i] != bitStrings_result[i]) { + correct = false; + break; + } + } + + HANDLE_CUDA_ERROR( cudaFree(d_svs) ); + + if (correct) { + printf("measure_batched example PASSED\n"); + return EXIT_SUCCESS; + } + else { + printf("measure_batched example FAILED: wrong result\n"); + return EXIT_FAILURE; + } +} \ No newline at end of file diff 
--git a/samples/custatevec/initialize_sv.cu b/samples/custatevec/initialize_sv.cu new file mode 100644 index 0000000..506d0e5 --- /dev/null +++ b/samples/custatevec/initialize_sv.cu @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include // cudaMalloc, cudaMemcpy, etc. +#include // cuDoubleComplex +#include // custatevecInitializeStateVector +#include // printf +#include // EXIT_FAILURE + +#include "helper.hpp" // HANDLE_ERROR, HANDLE_CUDA_ERROR + +int main(void) { + + const int nIndexBits = 3; + const int svSize = (1 << nIndexBits); + + cuDoubleComplex h_sv[svSize]; + + cuDoubleComplex h_sv_result[] = {{ 1.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, + { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}}; + + cuDoubleComplex *d_sv; + HANDLE_CUDA_ERROR( cudaMalloc((void**)&d_sv, svSize * sizeof(cuDoubleComplex)) ); + + // populate the device memory with junk values (for illustrative purpose only) + HANDLE_CUDA_ERROR( cudaMemset(d_sv, 0x7F, svSize * sizeof(cuDoubleComplex)) ); + + //---------------------------------------------------------------------------------------------- + + // custatevec handle initialization + custatevecHandle_t handle; + HANDLE_ERROR( custatevecCreate(&handle) ); + + // initialize the state vector + HANDLE_ERROR( custatevecInitializeStateVector( + handle, d_sv, CUDA_C_64F, nIndexBits, CUSTATEVEC_STATE_VECTOR_TYPE_ZERO) ); + + // destroy handle + HANDLE_ERROR( custatevecDestroy(handle) ); + + //---------------------------------------------------------------------------------------------- + + HANDLE_CUDA_ERROR( cudaMemcpy(h_sv, d_sv, svSize * sizeof(cuDoubleComplex), + cudaMemcpyDeviceToHost) ); + + bool correct = true; + for (int i = 0; i < svSize; i++) { + if (!almost_equal(h_sv[i], h_sv_result[i])) { + correct = false; + break; + } + } + + HANDLE_CUDA_ERROR( cudaFree(d_sv) ); + + if (correct) { + printf("initialize_sv example PASSED\n"); + return 
EXIT_SUCCESS; + } + else { + printf("initialize_sv example FAILED: wrong result\n"); + return EXIT_FAILURE; + } + +} diff --git a/samples/cutensornet/CMakeLists.txt b/samples/cutensornet/CMakeLists.txt index 7fdcc5e..1bee4d0 100644 --- a/samples/cutensornet/CMakeLists.txt +++ b/samples/cutensornet/CMakeLists.txt @@ -43,12 +43,6 @@ endfunction() # cutensornet_example dependencies # ########################################## -set_with_fallback(CUTENSOR_ROOT NONE) - -if (CUTENSOR_ROOT STREQUAL "") - message(FATAL_ERROR "Please set the environment variables CUTENSOR_ROOT to the path of the cuTENSOR installation.") -endif () - set_with_fallback(CUTENSORNET_ROOT CUQUANTUM_ROOT) if (CUTENSORNET_ROOT STREQUAL "") @@ -84,13 +78,11 @@ function(add_cutensornet_example GROUP_TARGET EXAMPLE_NAME EXAMPLE_SOURCES) ${EXAMPLE_TARGET} PUBLIC ${CUDA_INCLUDE_DIRS} - ${CUTENSOR_ROOT}/include ${CUTENSORNET_ROOT}/include ) target_link_directories( ${EXAMPLE_TARGET} PUBLIC - ${CUTENSOR_ROOT}/lib/11 ${CUTENSORNET_ROOT}/lib ${CUTENSORNET_ROOT}/lib64 ) @@ -98,17 +90,12 @@ function(add_cutensornet_example GROUP_TARGET EXAMPLE_NAME EXAMPLE_SOURCES) ${EXAMPLE_TARGET} PUBLIC cutensornet - cutensor - cudart - cusolver - cublasLt $<$:MPI::MPI_CXX> ) set_target_properties( ${EXAMPLE_TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON - CUDA_SEPARABLE_COMPILATION ON CUDA_ARCHITECTURES "70;75;80" ) @@ -129,7 +116,10 @@ endfunction() add_custom_target(cutensornet_examples) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet" tensornet_example.cu) -add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet" tensornet_example_reuse.cu) +add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.reuse" tensornet_example_reuse.cu) +add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.gradients" tensornet_example_gradients.cu) +add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.marginal" 
high_level/marginal_example.cu) +add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.sampler" high_level/sampling_example.cu) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.svd" approxTN/tensor_svd_example.cu) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.qr" approxTN/tensor_qr_example.cu) add_cutensornet_example(cutensornet_examples "cuTENSORNet.example.tensornet.gate" approxTN/gate_split_example.cu) diff --git a/samples/cutensornet/Makefile b/samples/cutensornet/Makefile index 746a813..3d7f7e7 100644 --- a/samples/cutensornet/Makefile +++ b/samples/cutensornet/Makefile @@ -4,13 +4,12 @@ SHELL := /bin/bash CUDA_PATH := ${CUDA_PATH} -CUTENSOR_ROOT := ${CUTENSOR_ROOT} CUTENSORNET_ROOT ?= ${CUQUANTUM_ROOT} MPI_ROOT := ${MPI_ROOT} - -INCLUDE_DIRS := -I${CUTENSORNET_ROOT}/include -I${CUTENSOR_ROOT}/include -I${MPI_ROOT}/include -LIBRARY_DIRS := -L${CUTENSORNET_ROOT}/lib -L${CUTENSORNET_ROOT}/lib64 -L${CUTENSOR_ROOT}/lib/11 -LINKER_FLAGS := -lcutensornet -lcutensor -lcudart -lcusolver +CUDA_MAJOR_VERSION := $(shell nvcc --version | egrep -o "V[0-9]+.[0-9]+.[0-9]" | cut -c2-3) +INCLUDE_DIRS := -I${CUTENSORNET_ROOT}/include -I${MPI_ROOT}/include +LIBRARY_DIRS := -L${CUTENSORNET_ROOT}/lib -L${CUTENSORNET_ROOT}/lib64 +LINKER_FLAGS := -lcutensornet ARCH_FLAGS_SM70 = -gencode arch=compute_70,code=sm_70 ARCH_FLAGS_SM75 = -gencode arch=compute_75,code=sm_75 @@ -24,10 +23,13 @@ CXX_FLAGS = -std=c++11 $(INCLUDE_DIRS) $(LIBRARY_DIRS) $(LINKER_FLAGS) $( all: check-env ${CUDA_PATH}/bin/nvcc tensornet_example.cu -o tensornet_example ${CXX_FLAGS} ${CUDA_PATH}/bin/nvcc tensornet_example_reuse.cu -o tensornet_example_reuse ${CXX_FLAGS} + ${CUDA_PATH}/bin/nvcc tensornet_example_gradients.cu -o tensornet_example_gradients ${CXX_FLAGS} ${CUDA_PATH}/bin/nvcc approxTN/tensor_svd_example.cu -o tensor_svd_example ${CXX_FLAGS} ${CUDA_PATH}/bin/nvcc approxTN/tensor_qr_example.cu -o tensor_qr_example ${CXX_FLAGS} 
${CUDA_PATH}/bin/nvcc approxTN/gate_split_example.cu -o gate_split_example ${CXX_FLAGS} ${CUDA_PATH}/bin/nvcc approxTN/mps_example.cu -o mps_example ${CXX_FLAGS} + ${CUDA_PATH}/bin/nvcc high_level/marginal_example.cu -o marginal_example ${CXX_FLAGS} + ${CUDA_PATH}/bin/nvcc high_level/sampling_example.cu -o sampling_example ${CXX_FLAGS} ifdef MPI_ROOT ${CUDA_PATH}/bin/nvcc tensornet_example_mpi.cu -Xlinker -rpath,${MPI_ROOT}/lib -L${MPI_ROOT}/lib -o tensornet_example_mpi ${CXX_FLAGS} -lmpi ${CUDA_PATH}/bin/nvcc tensornet_example_mpi_auto.cu -Xlinker -rpath,${MPI_ROOT}/lib -L${MPI_ROOT}/lib -o tensornet_example_mpi_auto ${CXX_FLAGS} -lmpi @@ -36,16 +38,9 @@ all: check-env check-env: @ echo "" && \ echo "CUDA_PATH=${CUDA_PATH}"; \ - echo "CUTENSOR_ROOT=${CUTENSOR_ROOT}"; \ echo "CUTENSORNET_ROOT=${CUTENSORNET_ROOT}"; \ echo "MPI_ROOT=${MPI_ROOT}"; \ echo ""; \ - if [[ -z "${CUTENSOR_ROOT}" ]]; \ - then \ - echo "" && \ - echo "CUTENSOR_ROOT is not set." && \ - exit 1; \ - fi; \ if [[ -z "${CUTENSORNET_ROOT}" ]]; \ then \ echo "" && \ @@ -60,6 +55,14 @@ check-env: fi clean: - rm -f tensornet_example tensornet_example.o tensornet_example_reuse tensornet_example_reuse.o tensornet_example_mpi tensornet_example_mpi.o tensornet_example_mpi_auto tensornet_example_mpi_auto.o - rm -f tensor_qr_example tensor_qr_example.o tensor_svd_example tensor_svd_example.o - rm -f gatesplit_example gatesplit_example.o mps_example mps_example.o + rm -f tensornet_example tensornet_example.o + rm -f tensornet_example_reuse tensornet_example_reuse.o + rm -f tensornet_example_gradients tensornet_example_gradients.o + rm -f tensor_svd_example tensor_svd_example.o + rm -f tensor_qr_example tensor_qr_example.o + rm -f gate_split_example gate_split_example.o + rm -f mps_example mps_example.o + rm -f marginal_example marginal_example.o + rm -f sampling_example sampling_example.o + rm -f tensornet_example_mpi tensornet_example_mpi.o + rm -f tensornet_example_mpi_auto tensornet_example_mpi_auto.o diff 
--git a/samples/cutensornet/README.md b/samples/cutensornet/README.md index 735a723..b618a00 100644 --- a/samples/cutensornet/README.md +++ b/samples/cutensornet/README.md @@ -50,6 +50,12 @@ The tensor SVD sample can be easily executed in a command shell using: ``` The sample for tensor QR, gate split and MPS can also be executed in the same fashion. +**Note**: Depending on how CUDA Toolkit and cuTENSOR are installed, you might need to add them to `LD_LIBRARY_PATH` like this: +``` +export LD_LIBRARY_PATH=$CUDA_PATH/lib64:$CUTENSOR_ROOT/lib/11:$LD_LIBRARY_PATH +``` +The cuTENSOR library path depends on the CUDA major version. Please refer to the [Getting Started](https://docs.nvidia.com/cuda/cuquantum/cutensornet/getting_started.html) page for further details. + ## Support * **Supported SM Architectures:** SM 7.0, SM 7.5, SM 8.0, SM 8.6, SM 9.0 @@ -164,3 +170,14 @@ This sample demonstrates how to: * Mark input tensors as "constant" when creating a tensor network using `cutensornetCreateNetworkDescriptor`, by setting the corresponding `cutensornetTensorQualifiers_t` field. * Provide a cache workspace to the contraction plan which will be used to accelerate the subsequent contractions of the same network. It shows how to query the required cache memory size using `cutensornetWorkspaceGetMemorySize` with a `CUTENSORNET_WORKSPACE_CACHE` workspace-kind, and how to the provide the workspace memory using `cutensornetWorkspaceSetMemory`. * Provide a predefined contraction path to the contraction optimizer by calling `cutensornetContractionOptimizerInfoSetAttribute` with `CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH` attribute. + +### 9. Gradient via back-propagation (`tensornet_example_gradients.cu`) + +This sample demonstrates how to perform back-propagation to compute gradients of a tensor network with respect to selected input tensors. +This sample largely builds on the first sample provided above.
+ +This sample demonstrates how to: +* Mark input tensors for gradient computation when creating a tensor network by calling `cutensornetNetworkSetAttribute`. +* Provide a cache workspace to the contraction plan which will be used to hold intermediate data needed for gradient computation. It shows how to query the required cache memory size using `cutensornetWorkspaceGetMemorySize` with a `CUTENSORNET_WORKSPACE_CACHE` workspace-kind, and how to provide the workspace memory using `cutensornetWorkspaceSetMemory`. +* Call `cutensornetComputeGradientsBackward` to perform the gradient computation. +* Call `cutensornetWorkspacePurgeCache` to clean up the cache and prepare for the next gradient calculation. diff --git a/samples/cutensornet/approxTN/tensor_svd_example.cu b/samples/cutensornet/approxTN/tensor_svd_example.cu index dd523c2..cd856eb 100644 --- a/samples/cutensornet/approxTN/tensor_svd_example.cu +++ b/samples/cutensornet/approxTN/tensor_svd_example.cu @@ -217,12 +217,14 @@ int main() printf("Initialize the cuTensorNet library and create all tensor descriptors.\n"); // Sphinx: #5 - /******************************** - * Setup SVD truncation parameters - *********************************/ + /********************************************** + * Setup SVD algorithm and truncation parameters + ***********************************************/ cutensornetTensorSVDConfig_t svdConfig; HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) ); + + // set up truncation parameters double absCutoff = 1e-2; HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, svdConfig, @@ -236,6 +238,21 @@ int main() &relCutoff, sizeof(relCutoff)) ); + // optional: choose gesvdj algorithm with customized parameters. Default is gesvd.
+ cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; + HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, + svdConfig, + CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, + &svdAlgo, + sizeof(svdAlgo)) ); + cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80}; + HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, + svdConfig, + CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS, + &gesvdjParams, + sizeof(gesvdjParams)) ); + printf("Set up SVDConfig to use GESVDJ algorithm with truncation\n"); + /******************************************************** * Create SVDInfo to record runtime SVD truncation details *********************************************************/ @@ -337,11 +354,14 @@ int main() double discardedWeight{0}; int64_t reducedExtent{0}; + cutensornetGesvdjStatus_t gesvdjStatus; cudaDeviceSynchronize(); // device synchronization. HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, &discardedWeight, sizeof(discardedWeight)) ); HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, &reducedExtent, sizeof(reducedExtent)) ); + HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute( handle, svdInfo, CUTENSORNET_TENSOR_SVD_INFO_ALGO_STATUS, &gesvdjStatus, sizeof(gesvdjStatus)) ); printf("elapsed time: %.2f ms\n", minTimeCUTENSOR * 1000.f); + printf("GESVDJ residual: %.4f, runtime sweeps = %d\n", gesvdjStatus.residual, gesvdjStatus.sweeps); printf("reduced extent found at runtime: %lu\n", reducedExtent); printf("discarded weight: %.2f\n", discardedWeight); diff --git a/samples/cutensornet/high_level/marginal_example.cu b/samples/cutensornet/high_level/marginal_example.cu new file mode 100644 index 0000000..5310a53 --- /dev/null +++ b/samples/cutensornet/high_level/marginal_example.cu @@ -0,0 +1,199 @@ +/* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. 
+ * + * SPDX-License-Identifier: BSD-3-Clause + */ + +// Sphinx: Marginal #1 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + +#define HANDLE_CUTN_ERROR(x) \ +{ const auto err = x; \ + if( err != CUTENSORNET_STATUS_SUCCESS ) \ + { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + + +int main(int argc, char **argv) +{ + constexpr std::size_t fp64size = sizeof(double); + + // Sphinx: Marginal #2 + + // Quantum state configuration + constexpr int32_t numQubits = 16; + const std::vector qubitDims(numQubits,2); // qubit dimensions + constexpr int32_t numMarginalModes = 2; // rank of the marginal (reduced density matrix) + const std::vector marginalModes({0,1}); // open qubits (must be in ascending order) + std::cout << "Quantum circuit: " << numQubits << " qubits\n"; + + // Sphinx: Marginal #3 + + // Initialize the cuTensorNet library + HANDLE_CUDA_ERROR(cudaSetDevice(0)); + cutensornetHandle_t cutnHandle; + HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle)); + std::cout << "Initialized cuTensorNet library on GPU 0\n"; + + // Sphinx: Marginal #4 + + // Define necessary quantum gate tensors in Host memory + const double invsq2 = 1.0 / std::sqrt(2.0); + // Hadamard gate + const std::vector> h_gateH {{invsq2, 0.0}, {invsq2, 0.0}, + {invsq2, 0.0}, {-invsq2, 0.0}}; + // CX gate + const std::vector> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}}; + + // Copy quantum gates to Device memory + void *d_gateH{nullptr}, *d_gateCX{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
+ HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size))); + std::cout << "Allocated quantum gate memory on GPU\n"; + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice)); + std::cout << "Copied quantum gates to GPU memory\n"; + + // Sphinx: Marginal #5 + + // Allocate the specified quantum circuit reduced density matrix (marginal) in Device memory + void *d_rdm{nullptr}; + std::size_t rdmDim = 1; + for(const auto & mode: marginalModes) rdmDim *= qubitDims[mode]; + const std::size_t rdmSize = rdmDim * rdmDim; + HANDLE_CUDA_ERROR(cudaMalloc(&d_rdm, rdmSize * (2 * fp64size))); + + // Sphinx: Marginal #6 + + // Query the free memory on Device + std::size_t freeSize{0}, totalSize{0}; + HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize)); + const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment + void *d_scratch{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize)); + std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU\n"; + + // Sphinx: Marginal #7 + + // Create the initial quantum state + cutensornetState_t quantumState; + HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(), + CUDA_C_64F, &quantumState)); + std::cout << "Created the initial quantum state\n"; + + // Sphinx: Marginal #8 + + // Construct the final quantum circuit state (apply quantum gates) for the GHZ circuit + int64_t id; + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 1, std::vector{{0}}.data(), + d_gateH, nullptr, 1, 0, 1, &id)); + for(int32_t i = 1; i < numQubits; ++i) { + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 2, std::vector{{i-1,i}}.data(), + d_gateCX, nullptr, 1, 0, 1, &id)); + } + std::cout << "Applied quantum gates\n"; + + 
// Sphinx: Marginal #9 + + // Specify the desired reduced density matrix (marginal) + cutensornetStateMarginal_t marginal; + HANDLE_CUTN_ERROR(cutensornetCreateMarginal(cutnHandle, quantumState, numMarginalModes, marginalModes.data(), + 0, nullptr, std::vector{{1,2,4,8}}.data(), &marginal)); // using explicit strides + std::cout << "Created the specified quantum circuit reduced densitry matrix (marginal)\n"; + + // Sphinx: Marginal #10 + + // Configure the computation of the specified quantum circuit reduced density matrix (marginal) + const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder + HANDLE_CUTN_ERROR(cutensornetMarginalConfigure(cutnHandle, marginal, + CUTENSORNET_MARGINAL_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); + + // Sphinx: Marginal #11 + + // Prepare the specified quantum circuit reduced density matrix (marginal) + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc)); + std::cout << "Created the workspace descriptor\n"; + HANDLE_CUTN_ERROR(cutensornetMarginalPrepare(cutnHandle, marginal, scratchSize, workDesc, 0x0)); + std::cout << "Prepared the specified quantum circuit reduced density matrix (marginal)\n"; + + // Sphinx: Marginal #12 + + // Attach the workspace buffer + int64_t worksize {0}; + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &worksize)); + std::cout << "Required scratch GPU workspace size (bytes) = " << worksize << std::endl; + if(worksize <= scratchSize) { + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize)); + }else{ + std::cout << "ERROR: Insufficient workspace size on Device!\n"; + std::abort(); + } + std::cout << "Set the workspace
buffer\n"; + + // Sphinx: Marginal #13 + + // Compute the specified quantum circuit reduced density matrix (marginal) + HANDLE_CUTN_ERROR(cutensornetMarginalCompute(cutnHandle, marginal, nullptr, workDesc, d_rdm, 0)); + std::cout << "Computed the specified quantum circuit reduced density matrix (marginal)\n"; + std::vector> h_rdm(rdmSize); + HANDLE_CUDA_ERROR(cudaMemcpy(h_rdm.data(), d_rdm, rdmSize * (2 * fp64size), cudaMemcpyDeviceToHost)); + std::cout << "Reduced density matrix for " << numMarginalModes << " qubits:\n"; + for(std::size_t i = 0; i < rdmDim; ++i) { + for(std::size_t j = 0; j < rdmDim; ++j) { + std::cout << " " << h_rdm[i + j * rdmDim]; + } + std::cout << std::endl; + } + + // Sphinx: Marginal #14 + + // Destroy the workspace descriptor + HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + std::cout << "Destroyed the workspace descriptor\n"; + + // Destroy the quantum circuit reduced density matrix + HANDLE_CUTN_ERROR(cutensornetDestroyMarginal(marginal)); + std::cout << "Destroyed the quantum circuit state reduced density matrix (marginal)\n"; + + // Destroy the quantum circuit state + HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState)); + std::cout << "Destroyed the quantum circuit state\n"; + + HANDLE_CUDA_ERROR(cudaFree(d_scratch)); + HANDLE_CUDA_ERROR(cudaFree(d_rdm)); + HANDLE_CUDA_ERROR(cudaFree(d_gateCX)); + HANDLE_CUDA_ERROR(cudaFree(d_gateH)); + std::cout << "Freed memory on GPU\n"; + + // Finalize the cuTensorNet library + HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle)); + std::cout << "Finalized the cuTensorNet library\n"; + + return 0; +} diff --git a/samples/cutensornet/high_level/sampling_example.cu b/samples/cutensornet/high_level/sampling_example.cu new file mode 100644 index 0000000..b9b96bb --- /dev/null +++ b/samples/cutensornet/high_level/sampling_example.cu @@ -0,0 +1,184 @@ +/* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES.
+ * + * SPDX-License-Identifier: BSD-3-Clause + */ + +// Sphinx: Sampler #1 + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + +#define HANDLE_CUTN_ERROR(x) \ +{ const auto err = x; \ + if( err != CUTENSORNET_STATUS_SUCCESS ) \ + { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ +}; + + +int main(int argc, char **argv) +{ + constexpr std::size_t fp64size = sizeof(double); + + // Sphinx: Sampler #2 + + // Quantum state configuration + const int64_t numSamples = 100; + const int32_t numQubits = 16; + const std::vector qubitDims(numQubits, 2); // qubit size + std::cout << "Quantum circuit: " << numQubits << " qubits; " << numSamples << " samples\n"; + + // Sphinx: Sampler #3 + + // Initialize the cuTensorNet library + HANDLE_CUDA_ERROR(cudaSetDevice(0)); + cutensornetHandle_t cutnHandle; + HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle)); + std::cout << "Initialized cuTensorNet library on GPU 0\n"; + + // Sphinx: Sampler #4 + + // Define necessary quantum gate tensors in Host memory + const double invsq2 = 1.0 / std::sqrt(2.0); + // Hadamard gate + const std::vector> h_gateH {{invsq2, 0.0}, {invsq2, 0.0}, + {invsq2, 0.0}, {-invsq2, 0.0}}; + // CX gate + const std::vector> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}}; + + // Copy quantum gates to Device memory + void *d_gateH{nullptr}, *d_gateCX{nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size))); + std::cout << "H gate buffer allocated on GPU: " << d_gateH << std::endl; //debug + HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * 
fp64size))); + std::cout << "CX gate buffer allocated on GPU: " << d_gateCX << std::endl; //debug + std::cout << "Allocated quantum gate memory on GPU\n"; + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice)); + std::cout << "Copied quantum gates to GPU memory\n"; + + // Sphinx: Sampler #5 + + // Create the initial quantum state + cutensornetState_t quantumState; + HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(), + CUDA_C_64F, &quantumState)); + std::cout << "Created the initial quantum state\n"; + + // Sphinx: Sampler #6 + + // Construct the quantum circuit state (apply quantum gates) + int64_t id; + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 1, std::vector{{0}}.data(), + d_gateH, nullptr, 1, 0, 1, &id)); + for(int32_t i = 1; i < numQubits; ++i) { + HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 2, std::vector{{i-1,i}}.data(), + d_gateCX, nullptr, 1, 0, 1, &id)); + } + std::cout << "Applied quantum gates\n"; + + // Sphinx: Sampler #7 + + // Create the quantum circuit sampler + cutensornetStateSampler_t sampler; + HANDLE_CUTN_ERROR(cutensornetCreateSampler(cutnHandle, quantumState, numQubits, nullptr, &sampler)); + std::cout << "Created the quantum circuit sampler\n"; + + // Sphinx: Sampler #8 + + // Query the free memory on Device + std::size_t freeSize {0}, totalSize {0}; + HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize)); + const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment + void *d_scratch {nullptr}; + HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize)); + std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU: " + << "[" << d_scratch << ":" << (void*)(((char*)(d_scratch)) + scratchSize) << ")\n"; + 
+ // Sphinx: Sampler #9 + + // Configure the quantum circuit sampler + const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder + HANDLE_CUTN_ERROR(cutensornetSamplerConfigure(cutnHandle, sampler, + CUTENSORNET_SAMPLER_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); + + // Sphinx: Sampler #10 + + // Prepare the quantum circuit sampler + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc)); + HANDLE_CUTN_ERROR(cutensornetSamplerPrepare(cutnHandle, sampler, scratchSize, workDesc, 0x0)); + std::cout << "Prepared the quantum circuit state sampler\n"; + + // Sphinx: Sampler #11 + + // Attach the workspace buffer + int64_t worksize {0}; + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &worksize)); + assert(worksize > 0); + if(worksize <= scratchSize) { + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize)); + }else{ + std::cout << "ERROR: Insufficient workspace size on Device!\n"; + std::abort(); + } + std::cout << "Set the workspace buffer\n"; + + // Sphinx: Sampler #12 + + // Sample the quantum circuit state + std::vector samples(numQubits * numSamples); // samples[SampleId][QubitId] reside in Host memory + HANDLE_CUTN_ERROR(cutensornetSamplerSample(cutnHandle, sampler, numSamples, workDesc, samples.data(), 0)); + std::cout << "Performed quantum circuit state sampling\n"; + std::cout << "Bit-string samples:\n"; + for(int64_t i = 0; i < numSamples; ++i) { + for(int64_t j = 0; j < numQubits; ++j) std::cout << " " << samples[i * numQubits + j]; + std::cout << std::endl; + } + + // Sphinx: Sampler #13 + + // Destroy the workspace descriptor + 
HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + std::cout << "Destroyed the workspace descriptor\n"; + + // Destroy the quantum circuit sampler + HANDLE_CUTN_ERROR(cutensornetDestroySampler(sampler)); + std::cout << "Destroyed the quantum circuit state sampler\n"; + + // Destroy the quantum circuit state + HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState)); + std::cout << "Destroyed the quantum circuit state\n"; + + HANDLE_CUDA_ERROR(cudaFree(d_scratch)); + HANDLE_CUDA_ERROR(cudaFree(d_gateCX)); + HANDLE_CUDA_ERROR(cudaFree(d_gateH)); + std::cout << "Freed memory on GPU\n"; + + // Finalize the cuTensorNet library + HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle)); + std::cout << "Finalized the cuTensorNet library\n"; + + return 0; +} diff --git a/samples/cutensornet/tensornet_example.cu b/samples/cutensornet/tensornet_example.cu index 7637fa3..9ef785c 100644 --- a/samples/cutensornet/tensornet_example.cu +++ b/samples/cutensornet/tensornet_example.cu @@ -418,7 +418,7 @@ int main() R_d, accumulateOutput, workDesc, - sliceGroup, // slternatively, NULL can also be used to contract over all slices instead of specifying a sliceGroup object + sliceGroup, // alternatively, NULL can also be used to contract over all slices instead of specifying a sliceGroup object stream) ); // Synchronize and measure best timing diff --git a/samples/cutensornet/tensornet_example_gradients.cu b/samples/cutensornet/tensornet_example_gradients.cu new file mode 100644 index 0000000..0bc5511 --- /dev/null +++ b/samples/cutensornet/tensornet_example_gradients.cu @@ -0,0 +1,540 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
+ * + * SPDX-License-Identifier: BSD-3-Clause + */ + +// Sphinx: #1 + +#include +#include + +#include +#include +#include + +#include +#include + + +#define HANDLE_ERROR(x) \ +{ const auto err = x; \ + if( err != CUTENSORNET_STATUS_SUCCESS ) \ + { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \ + fflush(stdout); \ + } \ +}; + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \ + fflush(stdout); \ + } \ +}; + + +struct GPUTimer +{ + GPUTimer(cudaStream_t stream): stream_(stream) + { + cudaEventCreate(&start_); + cudaEventCreate(&stop_); + } + + ~GPUTimer() + { + cudaEventDestroy(start_); + cudaEventDestroy(stop_); + } + + void start() + { + cudaEventRecord(start_, stream_); + } + + float seconds() + { + cudaEventRecord(stop_, stream_); + cudaEventSynchronize(stop_); + float time; + cudaEventElapsedTime(&time, start_, stop_); + return time * 1e-3; + } + + private: + cudaEvent_t start_, stop_; + cudaStream_t stream_; +}; + + +int main() +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!"); + + bool verbose = true; + + // Check cuTensorNet version + const size_t cuTensornetVersion = cutensornetGetVersion(); + if(verbose) + printf("cuTensorNet version: %ld\n", cuTensornetVersion); + + // Set GPU device + int numDevices {0}; + HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) ); + const int deviceId = 0; + HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) ); + cudaDeviceProp prop; + HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) ); + + if(verbose) { + printf("===== device info ======\n"); + printf("GPU-name:%s\n", prop.name); + printf("GPU-clock:%d\n", prop.clockRate); + printf("GPU-memoryClock:%d\n", prop.memoryClockRate); + printf("GPU-nSM:%d\n", prop.multiProcessorCount); + printf("GPU-major:%d\n", prop.major); + printf("GPU-minor:%d\n", prop.minor); + 
printf("========================\n"); + } + + typedef float floatType; + cudaDataType_t typeData = CUDA_R_32F; + cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F; + + if(verbose) + printf("Included headers and defined data types\n"); + + // Sphinx: #2 + /********************** + * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m} + * We will execute the contraction and compute the gradients of input tensors A, B, C + **********************/ + + constexpr int32_t numInputs = 6; + std::vector gradInputIDs = {0, 1, 2}; + + // Create vectors of tensor modes + std::vector> modesVec { + {'a','b','c','d'}, + {'b','c','d','e'}, + {'e','g','h'}, + {'g','h','i','j'}, + {'i','j','k','l'}, + {'k','l','m'}, + {'a','m'} + }; + + // Set mode extents + int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ. + std::unordered_map extent; + for (auto &vec: modesVec) + { + for (auto &mode: vec) + { + extent[mode] = sameExtent; + } + } + + // Create a vector of extents for each tensor + std::vector> extentVec; + extentVec.resize(numInputs+1); // hold inputs + output tensors + for (int i = 0; i < numInputs+1; ++i) + { + for (auto mode : modesVec[i]) + extentVec[i].push_back(extent[mode]); + } + + if(verbose) + printf("Defined tensor network, modes, and extents\n"); + + // Sphinx: #3 + /********************** + * Allocating data + **********************/ + + std::vector elementsVec; + elementsVec.resize(numInputs+1); // hold inputs + output tensors + for (int i = 0; i < numInputs+1; ++i) + { + elementsVec[i] = 1; + for (auto mode : modesVec[i]) + elementsVec[i] *= extent[mode]; + } + + size_t totalSize = 0; + std::vector sizeVec; + sizeVec.resize(numInputs+1); // hold inputs + output tensors + for (int i = 0; i < numInputs+1; ++i) + { + sizeVec[i] = sizeof(floatType) * elementsVec[i]; + totalSize += sizeVec[i]; + } + if(verbose) + printf("Total GPU memory used for tensor storage: %.2f GiB\n", + 
(totalSize) / 1024. /1024. / 1024); + + void* rawDataIn_d[numInputs]; + void* O_d; + void* outputActivation_d; + for (int i = 0; i < numInputs; ++i) + { + HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) ); + } + HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs])); + HANDLE_CUDA_ERROR( cudaMalloc((void**) &outputActivation_d, sizeVec[numInputs])); + + floatType* rawDataIn_h[numInputs]; + for (int i = 0; i < numInputs; ++i) + { + rawDataIn_h[i] = (floatType*) malloc(sizeVec[i]); + if (rawDataIn_h[i] == NULL) + { + printf("Error: Host memory allocation failed!\n"); + return -1; + } + } + floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]); + if (O_h == NULL) + { + printf("Error: Host memory allocation failed!\n"); + return -1; + } + floatType *outputActivation_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]); + if (outputActivation_h == NULL) + { + printf("Error: Host memory allocation failed!\n"); + return -1; + } + + void* gradientsOut_d[numInputs] = {nullptr}; + for (auto i : gradInputIDs) + { + HANDLE_CUDA_ERROR( cudaMalloc((void**) &gradientsOut_d[i], sizeVec[i]) ); + } + void* gradientsOut_h[numInputs] = {nullptr}; + for (auto i : gradInputIDs) + { + gradientsOut_h[i] = (floatType*) malloc(sizeVec[i]); + if (gradientsOut_h[i] == NULL) + { + printf("Error: Host memory allocation failed!\n"); + return -1; + } + } + + /******************* + * Initialize data + *******************/ + + memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]); + for (int i = 0; i < numInputs; ++i) + { + for (size_t e = 0; e < elementsVec[i]; ++e) + rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX; + } + for (size_t e = 0; e < elementsVec[numInputs]; ++e) + outputActivation_h[e] = (floatType) 1.0; + + for (int i = 0; i < numInputs; ++i) + { + HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) ); + } + HANDLE_CUDA_ERROR( cudaMemcpy(outputActivation_d, 
outputActivation_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); + + if(verbose) + printf("Allocated GPU memory for data, initialize data, and create library handle\n"); + + /************************* + * cuTensorNet + *************************/ + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cutensornetHandle_t handle; + HANDLE_ERROR( cutensornetCreate(&handle) ); + + // Sphinx: #4 + /******************************* + * Create Network Descriptor + *******************************/ + + int32_t* modesIn[numInputs]; + int32_t numModesIn[numInputs]; + int64_t* extentsIn[numInputs]; + int64_t* stridesIn[numInputs]; + + for (int i = 0; i < numInputs; ++i) + { + modesIn[i] = modesVec[i].data(); + numModesIn[i] = modesVec[i].size(); + extentsIn[i] = extentVec[i].data(); + stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout + } + + // Set up tensor network + cutensornetNetworkDescriptor_t descNet; + HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle, + numInputs, numModesIn, extentsIn, stridesIn, modesIn, NULL, + modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(), + typeData, typeCompute, + &descNet) ); + + /******************************* + * Set input tensor ids that require gradients + *******************************/ + + cutensornetTensorIDList_t tensorIDList { + .numTensors = static_cast(gradInputIDs.size()), + .data = gradInputIDs.data() + }; + + HANDLE_ERROR(cutensornetNetworkSetAttribute(handle, + descNet, + CUTENSORNET_NETWORK_INPUT_TENSORS_REQUIRE_GRAD, + &tensorIDList, + sizeof(cutensornetTensorIDList_t))); + + if(verbose) + printf("Initialized the cuTensorNet library and created a tensor network descriptor\n"); + + // Sphinx: #5 + /******************************* + * Choose workspace limit based on available resources.
+ *******************************/ + + size_t freeMem, totalMem; + HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) ); + uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9); + if(verbose) + printf("Workspace limit = %lu\n", workspaceLimit); + + /******************************* + * Set contraction order + *******************************/ + + // Create contraction optimizer info + cutensornetContractionOptimizerInfo_t optimizerInfo; + HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) ); + + // set a predetermined contraction path + std::vector path{0,1,0,4,0,3,0,2,0,1}; + const auto numContractions = numInputs - 1; + cutensornetContractionPath_t contPath; + contPath.data = reinterpret_cast(const_cast(path.data())); + contPath.numContractions = numContractions; + + // provide user-specified contPath + HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute( + handle, + optimizerInfo, + CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH, + &contPath, + sizeof(contPath))); + int64_t numSlices = 1; + + if(verbose) + printf("Set predetermined contraction path into cuTensorNet optimizer\n"); + + // Sphinx: #6 + /******************************* + * Create workspace descriptor, allocate workspace, and set it. 
+ *******************************/ + + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) ); + + // set SCRATCH workspace, which will be used during each network contraction operation and is not needed afterwards + int64_t requiredWorkspaceSizeScratch = 0; + HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle, + descNet, + optimizerInfo, + workDesc) ); + + HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_MIN, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + &requiredWorkspaceSizeScratch) ); + + void* workScratch = nullptr; + HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) ); + + HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle, + workDesc, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, + workScratch, + requiredWorkspaceSizeScratch) ); + + // set CACHE workspace, which will be used across network contraction operations + int64_t requiredWorkspaceSizeCache = 0; + HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle, + workDesc, + CUTENSORNET_WORKSIZE_PREF_MIN, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_CACHE, + &requiredWorkspaceSizeCache) ); + + void* workCache = nullptr; + HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) ); + + HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle, + workDesc, + CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_CACHE, + workCache, + requiredWorkspaceSizeCache) ); + + if(verbose) + printf("Allocated and set up the GPU workspace\n"); + + // Sphinx: #7 + /******************************* + * Initialize the pairwise contraction plan (for cuTENSOR).
+ *******************************/ + + cutensornetContractionPlan_t plan; + HANDLE_ERROR( cutensornetCreateContractionPlan(handle, + descNet, + optimizerInfo, + workDesc, + &plan) ); + + /******************************* + * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel + * for each pairwise tensor contraction. + *******************************/ + cutensornetContractionAutotunePreference_t autotunePref; + HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle, + &autotunePref) ); + + const int numAutotuningIterations = 5; // may be 0 + HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute( + handle, + autotunePref, + CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS, + &numAutotuningIterations, + sizeof(numAutotuningIterations)) ); + + // Modify the plan again to find the best pair-wise contractions + HANDLE_ERROR( cutensornetContractionAutotune(handle, + plan, + rawDataIn_d, + O_d, + workDesc, + autotunePref, + stream) ); + + HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) ); + + if(verbose) + printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n"); + + // Sphinx: #8 + /********************** + * Execute the tensor network contraction + **********************/ + + // Create a cutensornetSliceGroup_t object from a range of slice IDs + cutensornetSliceGroup_t sliceGroup{}; + HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) ); + + GPUTimer timer {stream}; + double minTimeCUTENSORNET = 1e100; + const int numRuns = 3; // number of repeats to get stable performance results + for (int i = 0; i < numRuns; ++i) + { + HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // restore the output tensor on GPU + HANDLE_CUDA_ERROR( cudaDeviceSynchronize() ); + + /* + * Contract all slices of the tensor network + */ + timer.start(); + + int32_t accumulateOutput = 0; // output tensor data will 
be overwritten + HANDLE_ERROR( cutensornetContractSlices(handle, + plan, + rawDataIn_d, + O_d, + accumulateOutput, + workDesc, + sliceGroup, // alternatively, NULL can also be used to contract over all slices instead of specifying a sliceGroup object + stream) ); + + HANDLE_ERROR( cutensornetComputeGradientsBackward(handle, + plan, + rawDataIn_d, + outputActivation_d, + gradientsOut_d, + accumulateOutput, + workDesc, + stream) ); + + // Purge the cache to make room for the next run to use cache memory + HANDLE_ERROR( cutensornetWorkspacePurgeCache(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE) ); + + // Synchronize and measure best timing + auto time = timer.seconds(); + minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time; + } + + if(verbose) + printf("Contracted the tensor network and computed gradients\n"); + + HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // restore the output tensor on Host + + for (auto i : gradInputIDs) + { + HANDLE_CUDA_ERROR( cudaMemcpy(gradientsOut_h[i], gradientsOut_d[i], sizeVec[i], cudaMemcpyDeviceToHost) ); + } + + /*************************/ + + if(verbose) { + printf("Tensor network contraction and back-propagation time (ms): = %.3f\n", minTimeCUTENSORNET * 1000.f); + } + + // Sphinx: #9 + /*************** + * Free resources + ****************/ + + // Free cuTensorNet resources + HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) ); + HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) ); + HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) ); + HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) ); + HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) ); + HANDLE_ERROR( cutensornetDestroy(handle) ); + + // Free Host memory resources + if (O_h) free(O_h); + if (outputActivation_h) free(outputActivation_h); + for (int i = 0; i < numInputs; ++i) + { + if (rawDataIn_h[i]) + free(rawDataIn_h[i]); + if (gradientsOut_h[i]) + 
free(gradientsOut_h[i]); + } + // Free GPU memory resources + if (workScratch) cudaFree(workScratch); + if (workCache) cudaFree(workCache); + if (O_d) cudaFree(O_d); + if (outputActivation_d) cudaFree(outputActivation_d); + for (int i = 0; i < numInputs; ++i) + { + if (rawDataIn_d[i]) + cudaFree(rawDataIn_d[i]); + if (gradientsOut_d[i]) + cudaFree(gradientsOut_d[i]); + } + if(verbose) + printf("Freed resources and exited\n"); + + return 0; +} From fbe0291672c51ee80b901b3ca14860878c7c9e46 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 12 Jul 2023 17:46:48 -0700 Subject: [PATCH 2/3] update hard-coded urls to new format --- benchmarks/README.md | 2 +- extra/custatevec/README.md | 4 ++-- python/README.md | 2 +- samples/cutensornet/README.md | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 61c3d0e..4ff9780 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -12,7 +12,7 @@ You can install all optional dependencies via ``` pip install .[all] ``` -if running outside of the [cuQuantum Appliance container](https://docs.nvidia.com/cuda/cuquantum/appliance/index.html). +if running outside of the [cuQuantum Appliance container](https://docs.nvidia.com/cuda/cuquantum/latest/appliance/index.html). **Note: You may have to build `qsimcirq` and `qiskit-aer` GPU support from source if needed.** diff --git a/extra/custatevec/README.md b/extra/custatevec/README.md index 420bb2e..b3c8bca 100644 --- a/extra/custatevec/README.md +++ b/extra/custatevec/README.md @@ -1,6 +1,6 @@ ## MPI Comm Plugin Extension -The first version of [multi-node state vector simulator](https://docs.nvidia.com/cuda/cuquantum/appliance/qiskit.html) has been released in cuQuantum Appliance 22.11. It currently supports a limited set of versions of OpenMPI and MPICH. Other MPI libraries are supported by using an extension module called as External CommPlugin. 
+The first version of [multi-node state vector simulator](https://docs.nvidia.com/cuda/cuquantum/latest/appliance/qiskit.html) has been released in cuQuantum Appliance 22.11. It currently supports a limited set of versions of OpenMPI and MPICH. Other MPI libraries are supported by using an extension module called as External CommPlugin. External CommPlugin is a small shared object that wraps MPI functions. A customer needs to build its own external CommPlugin and link it to the MPI library of its preference to create a shared object. Then, by specifying appropriate options to the simulator, the compiled shared object is dynamically loaded to use the MPI library for inter-process communications. ## Prerequisite @@ -25,7 +25,7 @@ $ ls -l ## Simulator options -The custom Comm Plugin object is selected by [cusvaer options](https://docs.nvidia.com/cuda/cuquantum/appliance/cusvaer.html#commplugin), `cusvaer_comm_plugin_type` and `cusvaer_comm_plugin_soname`. +The custom Comm Plugin object is selected by [cusvaer options](https://docs.nvidia.com/cuda/cuquantum/latest/appliance/cusvaer.html#commplugin), `cusvaer_comm_plugin_type` and `cusvaer_comm_plugin_soname`. - `cusvaer_comm_plugin_type`: The value is `cusvaer.CommPluginType.EXTERNAL` - `cusvaer_comm_plugin_soname` The name of the shared object of an external comm plugin diff --git a/python/README.md b/python/README.md index b40cadf..aeead2c 100644 --- a/python/README.md +++ b/python/README.md @@ -2,7 +2,7 @@ ## Documentation -Please visit the [NVIDIA cuQuantum Python documentation](https://docs.nvidia.com/cuda/cuquantum/python). +Please visit the [NVIDIA cuQuantum Python documentation](https://docs.nvidia.com/cuda/cuquantum/latest/python). 
## Installation diff --git a/samples/cutensornet/README.md b/samples/cutensornet/README.md index b618a00..d6b1945 100644 --- a/samples/cutensornet/README.md +++ b/samples/cutensornet/README.md @@ -1,6 +1,6 @@ # cuTensorNet - Samples -* [Documentation](https://docs.nvidia.com/cuda/cuquantum/cutensornet/index.html) +* [Documentation](https://docs.nvidia.com/cuda/cuquantum/latest/cutensornet/index.html) ## Install @@ -54,7 +54,7 @@ The sample for tensor QR, gate split and MPS can also be executed in the same fa ``` export LD_LIBRARY_PATH=$CUDA_PATH/lib64:$CUTENSOR_ROOT/lib/11:$LD_LIBRARY_PATH ``` -The cuTENSOR library path would depend on the CUDA major version. Please refer to the [Getting Started](https://docs.nvidia.com/cuda/cuquantum/cutensornet/getting_started.html) page for further detail. +The cuTENSOR library path would depend on the CUDA major version. Please refer to the [Getting Started](https://docs.nvidia.com/cuda/cuquantum/latest/cutensornet/getting_started.html) page for further detail. 
## Support From de5658f0a3f6f3ade02b57f48662b5ae6a2eddbe Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 13 Jul 2023 09:42:36 -0700 Subject: [PATCH 3/3] sync with internal repo1 (commit 8963e92db) --- extra/custatevec/README.md | 2 +- python/samples/custatevec/batched_abs2sum.py | 2 +- python/samples/custatevec/batched_collapse.py | 2 +- python/samples/custatevec/batched_gate_application.py | 2 +- python/samples/custatevec/batched_measure.py | 2 +- python/samples/custatevec/initialize_sv.py | 2 +- python/setup.py | 2 +- .../tests/cuquantum_tests/cutensornet_tests/test_internal.py | 4 ++++ samples/custatevec/batched_abs2sum.cu | 4 ++-- samples/custatevec/batched_collapse.cu | 2 +- samples/custatevec/batched_gate_application.cu | 4 ++-- samples/custatevec/batched_measure.cu | 4 ++-- samples/custatevec/initialize_sv.cu | 2 +- samples/cutensornet/high_level/marginal_example.cu | 2 +- samples/cutensornet/high_level/sampling_example.cu | 2 +- 15 files changed, 21 insertions(+), 17 deletions(-) diff --git a/extra/custatevec/README.md b/extra/custatevec/README.md index b3c8bca..75227f2 100644 --- a/extra/custatevec/README.md +++ b/extra/custatevec/README.md @@ -1,6 +1,6 @@ ## MPI Comm Plugin Extension -The first version of [multi-node state vector simulator](https://docs.nvidia.com/cuda/cuquantum/latest/appliance/qiskit.html) has been released in cuQuantum Appliance 22.11. It currently supports a limited set of versions of OpenMPI and MPICH. Other MPI libraries are supported by using an extension module called as External CommPlugin. +The first version of [multi-node state vector simulator](https://docs.nvidia.com/cuda/cuquantum/latest/appliance/qiskit.html) has been released in cuQuantum Appliance 22.11. It currently supports a limited set of versions of Open MPI and MPICH. Other MPI libraries are supported by using an extension module (called External CommPlugin). External CommPlugin is a small shared object that wraps MPI functions. 
A customer needs to build its own external CommPlugin and link it to the MPI library of its preference to create a shared object. Then, by specifying appropriate options to the simulator, the compiled shared object is dynamically loaded to use the MPI library for inter-process communications. ## Prerequisite diff --git a/python/samples/custatevec/batched_abs2sum.py b/python/samples/custatevec/batched_abs2sum.py index 343911e..302881f 100644 --- a/python/samples/custatevec/batched_abs2sum.py +++ b/python/samples/custatevec/batched_abs2sum.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES # # SPDX-License-Identifier: BSD-3-Clause diff --git a/python/samples/custatevec/batched_collapse.py b/python/samples/custatevec/batched_collapse.py index b33fc18..7972597 100644 --- a/python/samples/custatevec/batched_collapse.py +++ b/python/samples/custatevec/batched_collapse.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES # # SPDX-License-Identifier: BSD-3-Clause diff --git a/python/samples/custatevec/batched_gate_application.py b/python/samples/custatevec/batched_gate_application.py index 37cf6bf..de3d0e7 100644 --- a/python/samples/custatevec/batched_gate_application.py +++ b/python/samples/custatevec/batched_gate_application.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES # # SPDX-License-Identifier: BSD-3-Clause diff --git a/python/samples/custatevec/batched_measure.py b/python/samples/custatevec/batched_measure.py index 138a4bd..68180d4 100644 --- a/python/samples/custatevec/batched_measure.py +++ b/python/samples/custatevec/batched_measure.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES # # SPDX-License-Identifier: BSD-3-Clause diff 
--git a/python/samples/custatevec/initialize_sv.py b/python/samples/custatevec/initialize_sv.py index 47c84ef..39d9538 100644 --- a/python/samples/custatevec/initialize_sv.py +++ b/python/samples/custatevec/initialize_sv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES # # SPDX-License-Identifier: BSD-3-Clause diff --git a/python/setup.py b/python/setup.py index b69a8a4..4e59fa7 100644 --- a/python/setup.py +++ b/python/setup.py @@ -82,7 +82,7 @@ project_urls={ "Bug Tracker": "https://github.com/NVIDIA/cuQuantum/issues", "User Forum": "https://github.com/NVIDIA/cuQuantum/discussions", - "Documentation": "https://docs.nvidia.com/cuda/cuquantum/python/", + "Documentation": "https://docs.nvidia.com/cuda/cuquantum/latest/python/", "Source Code": "https://github.com/NVIDIA/cuQuantum", }, author="NVIDIA Corporation", diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_internal.py b/python/tests/cuquantum_tests/cutensornet_tests/test_internal.py index 32885db..9178064 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_internal.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_internal.py @@ -1,3 +1,7 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + import re import sys import threading diff --git a/samples/custatevec/batched_abs2sum.cu b/samples/custatevec/batched_abs2sum.cu index bdc5c3a..2eaf838 100644 --- a/samples/custatevec/batched_abs2sum.cu +++ b/samples/custatevec/batched_abs2sum.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
* * SPDX-License-Identifier: BSD-3-Clause */ @@ -79,4 +79,4 @@ int main(void) { printf("abs2sum_batched example FAILED: wrong result\n"); return EXIT_FAILURE; } -} \ No newline at end of file +} diff --git a/samples/custatevec/batched_collapse.cu b/samples/custatevec/batched_collapse.cu index 0204ee6..11b3e4d 100644 --- a/samples/custatevec/batched_collapse.cu +++ b/samples/custatevec/batched_collapse.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. * * SPDX-License-Identifier: BSD-3-Clause */ diff --git a/samples/custatevec/batched_gate_application.cu b/samples/custatevec/batched_gate_application.cu index de6ee3f..fd91119 100644 --- a/samples/custatevec/batched_gate_application.cu +++ b/samples/custatevec/batched_gate_application.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. * * SPDX-License-Identifier: BSD-3-Clause */ @@ -106,4 +106,4 @@ int main(void) { return EXIT_FAILURE; } -} \ No newline at end of file +} diff --git a/samples/custatevec/batched_measure.cu b/samples/custatevec/batched_measure.cu index d10c921..a0ba34f 100644 --- a/samples/custatevec/batched_measure.cu +++ b/samples/custatevec/batched_measure.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. * * SPDX-License-Identifier: BSD-3-Clause */ @@ -89,4 +89,4 @@ int main(void) { printf("measure_batched example FAILED: wrong result\n"); return EXIT_FAILURE; } -} \ No newline at end of file +} diff --git a/samples/custatevec/initialize_sv.cu b/samples/custatevec/initialize_sv.cu index 506d0e5..0e8da36 100644 --- a/samples/custatevec/initialize_sv.cu +++ b/samples/custatevec/initialize_sv.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
* * SPDX-License-Identifier: BSD-3-Clause */ diff --git a/samples/cutensornet/high_level/marginal_example.cu b/samples/cutensornet/high_level/marginal_example.cu index 5310a53..8de8720 100644 --- a/samples/cutensornet/high_level/marginal_example.cu +++ b/samples/cutensornet/high_level/marginal_example.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. +/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. * * SPDX-License-Identifier: BSD-3-Clause */ diff --git a/samples/cutensornet/high_level/sampling_example.cu b/samples/cutensornet/high_level/sampling_example.cu index b9b96bb..3c0c2e0 100644 --- a/samples/cutensornet/high_level/sampling_example.cu +++ b/samples/cutensornet/high_level/sampling_example.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. +/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. * * SPDX-License-Identifier: BSD-3-Clause */