diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 44b01d3..c15ad19 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,10 +18,10 @@ jobs: test: if: github.event_name != 'pull_request' || !contains('OWNER,MEMBER,COLLABORATOR', github.event.pull_request.author_association) name: py${{ matrix.python }} - runs-on: ubuntu-${{ matrix.python == 3.7 && '22.04' || 'latest' }} + runs-on: ubuntu-latest strategy: matrix: - python: [3.7, 3.12] + python: [3.8, 3.12] steps: - uses: actions/checkout@v4 with: {fetch-depth: 0} @@ -47,14 +47,14 @@ jobs: git clean -Xdf pip install build python -m build -n -w \ - -Ccmake.define.CMAKE_CXX_FLAGS="-Wall -Wextra -Wpedantic -Werror -Wno-missing-field-initializers -Wno-unused-parameter -Wno-cast-function-type" + -Ccmake.define.CMAKE_CXX_FLAGS="-Wall -Wextra -Werror -Wno-missing-field-initializers -Wno-unused-parameter -Wno-cast-function-type" cuda: if: github.event_name != 'pull_request' || !contains('OWNER,MEMBER,COLLABORATOR', github.event.pull_request.author_association) name: CUDA py${{ matrix.python }} runs-on: [self-hosted, cuda, python] strategy: matrix: - python: [3.7, 3.12] + python: [3.8, 3.12] steps: - uses: actions/checkout@v4 with: {fetch-depth: 0} @@ -79,7 +79,7 @@ jobs: git clean -Xdf pip install build python -m build -n -w \ - -Ccmake.define.CMAKE_CXX_FLAGS="-Wall -Wextra -Wpedantic -Werror -Wno-missing-field-initializers -Wno-unused-parameter -Wno-cast-function-type" \ + -Ccmake.define.CMAKE_CXX_FLAGS="-Wall -Wextra -Werror -Wno-missing-field-initializers -Wno-unused-parameter -Wno-cast-function-type" \ -Ccmake.define.CMAKE_CUDA_ARCHITECTURES=all - name: Post Run setup-python run: setup-python -p${{ matrix.python }} -Dr diff --git a/CITATION.cff b/CITATION.cff index 988ef4e..cbe2e23 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -13,4 +13,4 @@ authors: identifiers: - type: doi value: 10.5281/zenodo.7013340 -keywords: [Python, C, C++, buffer, vector, array, CUDA, CPython, pybind11, extensions, API] +keywords: [Python, C, C++, buffer, vector, array, CUDA, CPython, nanobind, extensions, API] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6415865..fc8945b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -17,7 +17,7 @@ print("\0".join(c["build-system"]["requires"] + c["project"]["dependencies"] + c git clean -Xdf pip install --no-build-isolation --no-deps -t . -U -v . \ -Ccmake.define.CUVEC_DEBUG=1 - -Ccmake.define.CMAKE_CXX_FLAGS="-Wall -Wextra -Wpedantic -Werror -Wno-missing-field-initializers -Wno-unused-parameter -Wno-cast-function-type" + -Ccmake.define.CMAKE_CXX_FLAGS="-Wall -Wextra -Werror -Wno-missing-field-initializers -Wno-unused-parameter -Wno-cast-function-type" git restore numcu/src # undo deletion of sources ``` diff --git a/README.rst b/README.rst index 827abca..7f2bd2b 100644 --- a/README.rst +++ b/README.rst @@ -14,7 +14,7 @@ Install Requirements: -- Python 3.7 or greater (e.g. via `Anaconda or Miniconda `_, or via ``python3-dev``) +- Python 3.8 or greater (e.g. via `Anaconda or Miniconda `_, or via ``python3-dev``) - (optional) `CUDA SDK/Toolkit `_ (including drivers for an NVIDIA GPU) * note that if the CUDA SDK/Toolkit is installed *after* NumCu, then NumCu must be re-installed to enable CUDA support diff --git a/docs/index.md b/docs/index.md index 91ac6ed..b677651 100644 --- a/docs/index.md +++ b/docs/index.md @@ -18,7 +18,7 @@ pip install numcu Requirements: -- Python 3.7 or greater (e.g. via [Anaconda or Miniconda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/download.html#anaconda-or-miniconda) or via `python3-dev`) +- Python 3.8 or greater (e.g. via [Anaconda or Miniconda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/download.html#anaconda-or-miniconda) or via `python3-dev`) - (optional) [CUDA SDK/Toolkit](https://developer.nvidia.com/cuda-downloads) (including drivers for an NVIDIA GPU) + note that if the CUDA SDK/Toolkit is installed *after* NumCu, then NumCu must be re-installed to enable CUDA support diff --git a/numcu/CMakeLists.txt b/numcu/CMakeLists.txt index 9c5c23a..af6683f 100644 --- a/numcu/CMakeLists.txt +++ b/numcu/CMakeLists.txt @@ -14,7 +14,7 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) endif() cmake_policy(SET CMP0104 NEW) # CMAKE_CUDA_ARCHITECTURES find_package(Python COMPONENTS Interpreter Development.Module REQUIRED) -find_package(pybind11 CONFIG REQUIRED) +find_package(nanobind CONFIG REQUIRED) if(NOT NUMCU_CUDA_OPTIONAL) find_package(CUDAToolkit REQUIRED) enable_language(CUDA) @@ -33,8 +33,7 @@ else() endif() execute_process( COMMAND "${Python_EXECUTABLE}" -c "import cuvec; print(cuvec.include_path)" - OUTPUT_VARIABLE CUVEC_INCLUDE_DIRS - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE CUVEC_INCLUDE_DIRS) if("${CUVEC_INCLUDE_DIRS}" STREQUAL "") message(WARNING "Could not find cuvec includes") else() @@ -71,7 +70,7 @@ file(GLOB SRC LIST_DIRECTORIES false "src/*.cu") include_directories(${Python_INCLUDE_DIRS}) include_directories(${CUVEC_INCLUDE_DIRS}) -pybind11_add_module(${PROJECT_NAME} MODULE WITH_SOABI ${SRC}) +nanobind_add_module(${PROJECT_NAME} ${SRC}) add_library(AMYPAD::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) target_include_directories(${PROJECT_NAME} PUBLIC "$" diff --git a/numcu/__init__.py b/numcu/__init__.py index 3e28987..69f5e9d 100644 --- a/numcu/__init__.py +++ b/numcu/__init__.py @@ -38,7 +38,7 @@ from .lib import add, div, mul p = resources.files('numcu').resolve() -# for C++/CUDA/pybind11 includes +# for C++/CUDA includes include_path = p / 'include' # for use in `cmake -DCMAKE_PREFIX_PATH=...` cmake_prefix = p / 'cmake' diff --git a/numcu/lib.py b/numcu/lib.py index 76a2371..18a9a96 100644 --- a/numcu/lib.py +++ b/numcu/lib.py @@ -24,26 +24,6 @@ def get_namespace(*xs, default=cu): return default # backwards compatibility -def check_cuvec(a, shape, dtype, xp=cu): - """Asserts that CuVec `a` is of `shape` & `dtype`""" - if not isinstance(a, xp.CuVec): - raise TypeError(f"must be a {xp.CuVec}") - elif np.dtype(a.dtype) != np.dtype(dtype): - raise TypeError(f"dtype must be {dtype}: got {a.dtype}") - elif a.shape != shape: - raise IndexError(f"shape must be {shape}: got {a.shape}") - - -def check_similar(*arrays, allow_none=True): - """Asserts that all arrays are `CuVec`s of the same `shape` & `dtype`""" - arrs = tuple(filter(lambda x: x is not None, arrays)) - if not allow_none and len(arrays) != len(arrs): - raise TypeError("must not be None") - shape, dtype, xp = arrs[0].shape, arrs[0].dtype, get_namespace(*arrs) - for a in arrs: - check_cuvec(a, shape, dtype, xp) - - def div(numerator, divisor, default=FLOAT_MAX, output=None, dev_id=0, sync=True): """ Elementwise `output = numerator / divisor if divisor else default` @@ -59,12 +39,10 @@ def div(numerator, divisor, default=FLOAT_MAX, output=None, dev_id=0, sync=True) res = np.divide(numerator, divisor, out=output) res[np.isnan(res)] = default return res + assert numerator.size == divisor.size cu.dev_set(dev_id) - xp = get_namespace(numerator, divisor, output) - numerator = xp.asarray(numerator, 'float32') - divisor = xp.asarray(divisor, 'float32') - output = xp.zeros_like(numerator) if output is None else xp.asarray(output, 'float32') - check_similar(numerator, divisor, output) + if output is None: + output = get_namespace(numerator, divisor, output).zeros_like(numerator) ext.div(numerator, divisor, output, default=default) if sync: cu.dev_sync() return output @@ -81,12 +59,10 @@ def mul(a, b, output=None, dev_id=0, sync=True): sync(bool): whether to `cudaDeviceSynchronize()` after GPU operations. """ if dev_id is False: return np.multiply(a, b, out=output) + assert a.size == b.size cu.dev_set(dev_id) - xp = get_namespace(a, b, output) - a = xp.asarray(a, 'float32') - b = xp.asarray(b, 'float32') - output = xp.zeros_like(a) if output is None else xp.asarray(output, 'float32') - check_similar(a, b, output) + if output is None: + output = get_namespace(a, b, output).zeros_like(a) ext.mul(a, b, output) if sync: cu.dev_sync() return output @@ -103,12 +79,10 @@ def add(a, b, output=None, dev_id=0, sync=True): sync(bool): whether to `cudaDeviceSynchronize()` after GPU operations. """ if dev_id is False: return np.add(a, b, out=output) + assert a.size == b.size cu.dev_set(dev_id) - xp = get_namespace(a, b, output) - a = xp.asarray(a, 'float32') - b = xp.asarray(b, 'float32') - output = xp.zeros_like(a) if output is None else xp.asarray(output, 'float32') - check_similar(a, b, output) + if output is None: + output = get_namespace(a, b, output).zeros_like(a) ext.add(a, b, output) if sync: cu.dev_sync() return output diff --git a/numcu/src/numcu.cu b/numcu/src/numcu.cu index b7d73b7..f631472 100644 --- a/numcu/src/numcu.cu +++ b/numcu/src/numcu.cu @@ -4,35 +4,31 @@ * Copyright (2022) Casper da Costa-Luis */ #include "elemwise.h" // div, mul, add -#include // pybind11 +#include // nanobind, NB_MODULE +#include // ndarray #include // CUDA_PyErr -namespace py = pybind11; +namespace nb = nanobind; +template using Arr = const nb::ndarray; template -void elem_div(py::buffer num, py::buffer den, py::buffer dst, T zeroDivDefault) { - py::buffer_info src_num = num.request(), src_den = den.request(), dst_out = dst.request(true); - div(static_cast(dst_out.ptr), static_cast(src_num.ptr), static_cast(src_den.ptr), - dst_out.size, zeroDivDefault); +void elem_div(Arr &num, Arr &den, Arr &dst, T zeroDivDefault) { + div(dst.data(), num.data(), den.data(), dst.size(), zeroDivDefault); if (CUDA_PyErr()) throw std::runtime_error("CUDA kernel"); } -template void elem_mul(py::buffer a, py::buffer b, py::buffer dst) { - py::buffer_info src_a = a.request(), src_b = b.request(), dst_out = dst.request(true); - mul(static_cast(dst_out.ptr), static_cast(src_a.ptr), static_cast(src_b.ptr), - dst_out.size); +template void elem_mul(Arr &a, Arr &b, Arr &dst) { + mul(dst.data(), a.data(), b.data(), dst.size()); if (CUDA_PyErr()) throw std::runtime_error("CUDA kernel"); } -template void elem_add(py::buffer a, py::buffer b, py::buffer dst) { - py::buffer_info src_a = a.request(), src_b = b.request(), dst_out = dst.request(true); - add(static_cast(dst_out.ptr), static_cast(src_a.ptr), static_cast(src_b.ptr), - dst_out.size); +template void elem_add(Arr &a, Arr &b, Arr &dst) { + add(dst.data(), a.data(), b.data(), dst.size()); if (CUDA_PyErr()) throw std::runtime_error("CUDA kernel"); } -using namespace pybind11::literals; -PYBIND11_MODULE(numcu, m) { +using namespace nb::literals; +NB_MODULE(numcu, m) { m.doc() = "NumCu external module."; m.def("div", &elem_div, "Elementwise division.", "numerator"_a, "divisor"_a, "output"_a, "default"_a = FLOAT_MAX); diff --git a/pyproject.toml b/pyproject.toml index b87b35e..369ddc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools_scm>=7", "scikit-build-core[pyproject]>=0.5", "cuvec>=2.11.0", "pybind11"] +requires = ["setuptools_scm>=7", "scikit-build-core[pyproject]>=0.5", "cuvec>=2.11.0", "nanobind"] build-backend = "scikit_build_core.build" [tool.scikit-build] @@ -26,8 +26,8 @@ dynamic = ["version"] authors = [{name = "Casper da Costa-Luis", email = "casper.dcl@physics.org"}] description = "Numerical CUDA-based Python library built on CuVec" readme = "README.rst" -requires-python = ">=3.7" -keywords = ["Python", "C", "C++", "buffer", "vector", "array", "CUDA", "CPython", "pybind11", "extensions", "API"] +requires-python = ">=3.8" +keywords = ["Python", "C", "C++", "buffer", "vector", "array", "CUDA", "CPython", "nanobind", "extensions", "API"] license = {text = "MPL-2.0"} classifiers = [ "Development Status :: 5 - Production/Stable", @@ -42,7 +42,6 @@ classifiers = [ "Programming Language :: C", "Programming Language :: C++", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10",