diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 44b01d3..c15ad19 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -18,10 +18,10 @@ jobs:
   test:
     if: github.event_name != 'pull_request' || !contains('OWNER,MEMBER,COLLABORATOR', github.event.pull_request.author_association)
     name: py${{ matrix.python }}
-    runs-on: ubuntu-${{ matrix.python == 3.7 && '22.04' || 'latest' }}
+    runs-on: ubuntu-latest
     strategy:
       matrix:
-        python: [3.7, 3.12]
+        python: ['3.8', '3.12']  # quoted: bare versions are YAML floats (3.10 would load as 3.1)
     steps:
     - uses: actions/checkout@v4
       with: {fetch-depth: 0}
@@ -47,14 +47,14 @@ jobs:
         git clean -Xdf
         pip install build
         python -m build -n -w \
-          -Ccmake.define.CMAKE_CXX_FLAGS="-Wall -Wextra -Wpedantic -Werror -Wno-missing-field-initializers -Wno-unused-parameter -Wno-cast-function-type"
+          -Ccmake.define.CMAKE_CXX_FLAGS="-Wall -Wextra -Werror -Wno-missing-field-initializers -Wno-unused-parameter -Wno-cast-function-type"
   cuda:
     if: github.event_name != 'pull_request' || !contains('OWNER,MEMBER,COLLABORATOR', github.event.pull_request.author_association)
     name: CUDA py${{ matrix.python }}
     runs-on: [self-hosted, cuda, python]
     strategy:
       matrix:
-        python: [3.7, 3.12]
+        python: ['3.8', '3.12']  # quoted: bare versions are YAML floats (3.10 would load as 3.1)
     steps:
     - uses: actions/checkout@v4
       with: {fetch-depth: 0}
@@ -79,7 +79,7 @@ jobs:
         git clean -Xdf
         pip install build
         python -m build -n -w \
-          -Ccmake.define.CMAKE_CXX_FLAGS="-Wall -Wextra -Wpedantic -Werror -Wno-missing-field-initializers -Wno-unused-parameter -Wno-cast-function-type" \
+          -Ccmake.define.CMAKE_CXX_FLAGS="-Wall -Wextra -Werror -Wno-missing-field-initializers -Wno-unused-parameter -Wno-cast-function-type" \
           -Ccmake.define.CMAKE_CUDA_ARCHITECTURES=all
     - name: Post Run setup-python
       run: setup-python -p${{ matrix.python }} -Dr
diff --git a/CITATION.cff b/CITATION.cff
index 988ef4e..cbe2e23 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -13,4 +13,4 @@ authors:
 identifiers:
 - type: doi
   value: 10.5281/zenodo.7013340
-keywords: [Python, C, C++, buffer, vector, array, CUDA, CPython, pybind11, extensions, API]
+keywords: [Python, C, C++, buffer, vector, array, CUDA, CPython, nanobind, extensions, API]
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6415865..fc8945b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -17,7 +17,7 @@ print("\0".join(c["build-system"]["requires"] + c["project"]["dependencies"] + c
 git clean -Xdf
 pip install --no-build-isolation --no-deps -t . -U -v . \
-  -Ccmake.define.CUVEC_DEBUG=1
+  -Ccmake.define.CUVEC_DEBUG=1 \
-  -Ccmake.define.CMAKE_CXX_FLAGS="-Wall -Wextra -Wpedantic -Werror -Wno-missing-field-initializers -Wno-unused-parameter -Wno-cast-function-type"
+  -Ccmake.define.CMAKE_CXX_FLAGS="-Wall -Wextra -Werror -Wno-missing-field-initializers -Wno-unused-parameter -Wno-cast-function-type"
 git restore numcu/src # undo deletion of sources
 ```
 
diff --git a/README.rst b/README.rst
index 827abca..7f2bd2b 100644
--- a/README.rst
+++ b/README.rst
@@ -14,7 +14,7 @@ Install
 
 Requirements:
 
-- Python 3.7 or greater (e.g. via `Anaconda or Miniconda <https://docs.conda.io/projects/conda/en/latest/user-guide/install/download.html#anaconda-or-miniconda>`_, or via ``python3-dev``)
+- Python 3.8 or greater (e.g. via `Anaconda or Miniconda <https://docs.conda.io/projects/conda/en/latest/user-guide/install/download.html#anaconda-or-miniconda>`_, or via ``python3-dev``)
 - (optional) `CUDA SDK/Toolkit <https://developer.nvidia.com/cuda-downloads>`_ (including drivers for an NVIDIA GPU)
 
   * note that if the CUDA SDK/Toolkit is installed *after* NumCu, then NumCu must be re-installed to enable CUDA support
diff --git a/docs/index.md b/docs/index.md
index 91ac6ed..b677651 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -18,7 +18,7 @@ pip install numcu
 
 Requirements:
 
-- Python 3.7 or greater (e.g. via [Anaconda or Miniconda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/download.html#anaconda-or-miniconda) or via `python3-dev`)
+- Python 3.8 or greater (e.g. via [Anaconda or Miniconda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/download.html#anaconda-or-miniconda) or via `python3-dev`)
 - (optional) [CUDA SDK/Toolkit](https://developer.nvidia.com/cuda-downloads) (including drivers for an NVIDIA GPU)
   + note that if the CUDA SDK/Toolkit is installed *after* NumCu, then NumCu must be re-installed to enable CUDA support
 
diff --git a/numcu/CMakeLists.txt b/numcu/CMakeLists.txt
index 9c5c23a..af6683f 100644
--- a/numcu/CMakeLists.txt
+++ b/numcu/CMakeLists.txt
@@ -14,7 +14,7 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
 endif()
 cmake_policy(SET CMP0104 NEW)  # CMAKE_CUDA_ARCHITECTURES
 find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
-find_package(pybind11 CONFIG REQUIRED)
+find_package(nanobind CONFIG REQUIRED)
 if(NOT NUMCU_CUDA_OPTIONAL)
   find_package(CUDAToolkit REQUIRED)
   enable_language(CUDA)
@@ -33,8 +33,7 @@ else()
 endif()
 execute_process(
   COMMAND "${Python_EXECUTABLE}" -c "import cuvec; print(cuvec.include_path)"
-  OUTPUT_VARIABLE CUVEC_INCLUDE_DIRS
-  OUTPUT_STRIP_TRAILING_WHITESPACE)
+  OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE CUVEC_INCLUDE_DIRS)
 if("${CUVEC_INCLUDE_DIRS}" STREQUAL "")
   message(WARNING "Could not find cuvec includes")
 else()
@@ -71,7 +70,7 @@ file(GLOB SRC LIST_DIRECTORIES false "src/*.cu")
 include_directories(${Python_INCLUDE_DIRS})
 include_directories(${CUVEC_INCLUDE_DIRS})
 
-pybind11_add_module(${PROJECT_NAME} MODULE WITH_SOABI ${SRC})
+nanobind_add_module(${PROJECT_NAME} ${SRC})
 add_library(AMYPAD::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
 target_include_directories(${PROJECT_NAME} PUBLIC
   "$<BUILD_INTERFACE:${${CMAKE_PROJECT_NAME}_INCLUDE_DIRS}>"
diff --git a/numcu/__init__.py b/numcu/__init__.py
index 3e28987..69f5e9d 100644
--- a/numcu/__init__.py
+++ b/numcu/__init__.py
@@ -38,7 +38,7 @@
     from .lib import add, div, mul
 
 p = resources.files('numcu').resolve()
-# for C++/CUDA/pybind11 includes
+# for C++/CUDA includes
 include_path = p / 'include'
 # for use in `cmake -DCMAKE_PREFIX_PATH=...`
 cmake_prefix = p / 'cmake'
diff --git a/numcu/lib.py b/numcu/lib.py
index 76a2371..18a9a96 100644
--- a/numcu/lib.py
+++ b/numcu/lib.py
@@ -24,26 +24,6 @@ def get_namespace(*xs, default=cu):
     return default # backwards compatibility
 
 
-def check_cuvec(a, shape, dtype, xp=cu):
-    """Asserts that CuVec `a` is of `shape` & `dtype`"""
-    if not isinstance(a, xp.CuVec):
-        raise TypeError(f"must be a {xp.CuVec}")
-    elif np.dtype(a.dtype) != np.dtype(dtype):
-        raise TypeError(f"dtype must be {dtype}: got {a.dtype}")
-    elif a.shape != shape:
-        raise IndexError(f"shape must be {shape}: got {a.shape}")
-
-
-def check_similar(*arrays, allow_none=True):
-    """Asserts that all arrays are `CuVec`s of the same `shape` & `dtype`"""
-    arrs = tuple(filter(lambda x: x is not None, arrays))
-    if not allow_none and len(arrays) != len(arrs):
-        raise TypeError("must not be None")
-    shape, dtype, xp = arrs[0].shape, arrs[0].dtype, get_namespace(*arrs)
-    for a in arrs:
-        check_cuvec(a, shape, dtype, xp)
-
-
 def div(numerator, divisor, default=FLOAT_MAX, output=None, dev_id=0, sync=True):
     """
     Elementwise `output = numerator / divisor if divisor else default`
@@ -59,12 +39,10 @@ def div(numerator, divisor, default=FLOAT_MAX, output=None, dev_id=0, sync=True)
         res = np.divide(numerator, divisor, out=output)
         res[np.isnan(res)] = default
         return res
+    if numerator.size != divisor.size: raise IndexError("numerator & divisor sizes differ")
     cu.dev_set(dev_id)
-    xp = get_namespace(numerator, divisor, output)
-    numerator = xp.asarray(numerator, 'float32')
-    divisor = xp.asarray(divisor, 'float32')
-    output = xp.zeros_like(numerator) if output is None else xp.asarray(output, 'float32')
-    check_similar(numerator, divisor, output)
+    if output is None:
+        output = get_namespace(numerator, divisor, output).zeros_like(numerator)
     ext.div(numerator, divisor, output, default=default)
     if sync: cu.dev_sync()
     return output
@@ -81,12 +59,10 @@ def mul(a, b, output=None, dev_id=0, sync=True):
       sync(bool): whether to `cudaDeviceSynchronize()` after GPU operations.
     """
     if dev_id is False: return np.multiply(a, b, out=output)
+    if a.size != b.size: raise IndexError("a & b sizes differ")
     cu.dev_set(dev_id)
-    xp = get_namespace(a, b, output)
-    a = xp.asarray(a, 'float32')
-    b = xp.asarray(b, 'float32')
-    output = xp.zeros_like(a) if output is None else xp.asarray(output, 'float32')
-    check_similar(a, b, output)
+    if output is None:
+        output = get_namespace(a, b, output).zeros_like(a)
     ext.mul(a, b, output)
     if sync: cu.dev_sync()
     return output
@@ -103,12 +79,10 @@ def add(a, b, output=None, dev_id=0, sync=True):
       sync(bool): whether to `cudaDeviceSynchronize()` after GPU operations.
     """
     if dev_id is False: return np.add(a, b, out=output)
+    if a.size != b.size: raise IndexError("a & b sizes differ")
     cu.dev_set(dev_id)
-    xp = get_namespace(a, b, output)
-    a = xp.asarray(a, 'float32')
-    b = xp.asarray(b, 'float32')
-    output = xp.zeros_like(a) if output is None else xp.asarray(output, 'float32')
-    check_similar(a, b, output)
+    if output is None:
+        output = get_namespace(a, b, output).zeros_like(a)
     ext.add(a, b, output)
     if sync: cu.dev_sync()
     return output
diff --git a/numcu/src/numcu.cu b/numcu/src/numcu.cu
index b7d73b7..f631472 100644
--- a/numcu/src/numcu.cu
+++ b/numcu/src/numcu.cu
@@ -4,35 +4,46 @@
  * Copyright (2022) Casper da Costa-Luis
  */
 #include "elemwise.h"          // div, mul, add
-#include <pybind11/pybind11.h> // pybind11
+#include <nanobind/nanobind.h> // nanobind, NB_MODULE
+#include <nanobind/ndarray.h>  // ndarray
 #include <pycuvec.cuh>         // CUDA_PyErr
+#include <stdexcept>           // out_of_range, runtime_error
 
-namespace py = pybind11;
+namespace nb = nanobind;
+// const `nb::ndarray` handle; element mutability follows the constness of `T`
+template <typename T> using Arr = const nb::ndarray<T>;
+
+// `dst.size()` is the element count handed to div/mul/add, so both inputs must match it.
+// Throws std::out_of_range (surfaces as IndexError in Python) on any size mismatch.
+template <typename T> void check_sizes(Arr<const T> &a, Arr<const T> &b, Arr<T> &dst) {
+  if (a.size() != dst.size() || b.size() != dst.size())
+    throw std::out_of_range("inputs & output must have the same number of elements");
+}
 
+// Binding for `div`: checks sizes, then calls `div()` (elemwise.h) on the raw data pointers.
 template <typename T>
-void elem_div(py::buffer num, py::buffer den, py::buffer dst, T zeroDivDefault) {
-  py::buffer_info src_num = num.request(), src_den = den.request(), dst_out = dst.request(true);
-  div(static_cast<T *>(dst_out.ptr), static_cast<T *>(src_num.ptr), static_cast<T *>(src_den.ptr),
-      dst_out.size, zeroDivDefault);
+void elem_div(Arr<const T> &num, Arr<const T> &den, Arr<T> &dst, T zeroDivDefault) {
+  check_sizes<T>(num, den, dst);
+  div(dst.data(), num.data(), den.data(), dst.size(), zeroDivDefault);
   if (CUDA_PyErr()) throw std::runtime_error("CUDA kernel");
 }
 
-template <typename T> void elem_mul(py::buffer a, py::buffer b, py::buffer dst) {
-  py::buffer_info src_a = a.request(), src_b = b.request(), dst_out = dst.request(true);
-  mul(static_cast<T *>(dst_out.ptr), static_cast<T *>(src_a.ptr), static_cast<T *>(src_b.ptr),
-      dst_out.size);
+// Binding for `mul`: checks sizes, then calls `mul()` (elemwise.h) on the raw data pointers.
+template <typename T> void elem_mul(Arr<const T> &a, Arr<const T> &b, Arr<T> &dst) {
+  check_sizes<T>(a, b, dst);
+  mul(dst.data(), a.data(), b.data(), dst.size());
   if (CUDA_PyErr()) throw std::runtime_error("CUDA kernel");
 }
 
-template <typename T> void elem_add(py::buffer a, py::buffer b, py::buffer dst) {
-  py::buffer_info src_a = a.request(), src_b = b.request(), dst_out = dst.request(true);
-  add(static_cast<T *>(dst_out.ptr), static_cast<T *>(src_a.ptr), static_cast<T *>(src_b.ptr),
-      dst_out.size);
+// Binding for `add`: checks sizes, then calls `add()` (elemwise.h) on the raw data pointers.
+template <typename T> void elem_add(Arr<const T> &a, Arr<const T> &b, Arr<T> &dst) {
+  check_sizes<T>(a, b, dst);
+  add(dst.data(), a.data(), b.data(), dst.size());
   if (CUDA_PyErr()) throw std::runtime_error("CUDA kernel");
 }
 
-using namespace pybind11::literals;
-PYBIND11_MODULE(numcu, m) {
+using namespace nb::literals;
+NB_MODULE(numcu, m) {
   m.doc() = "NumCu external module.";
   m.def("div", &elem_div<float>, "Elementwise division.", "numerator"_a, "divisor"_a, "output"_a,
         "default"_a = FLOAT_MAX);
diff --git a/pyproject.toml b/pyproject.toml
index b87b35e..369ddc2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools_scm>=7", "scikit-build-core[pyproject]>=0.5", "cuvec>=2.11.0", "pybind11"]
+requires = ["setuptools_scm>=7", "scikit-build-core[pyproject]>=0.5", "cuvec>=2.11.0", "nanobind"]
 build-backend = "scikit_build_core.build"
 
 [tool.scikit-build]
@@ -26,8 +26,8 @@ dynamic = ["version"]
 authors = [{name = "Casper da Costa-Luis", email = "casper.dcl@physics.org"}]
 description = "Numerical CUDA-based Python library built on CuVec"
 readme = "README.rst"
-requires-python = ">=3.7"
-keywords = ["Python", "C", "C++", "buffer", "vector", "array", "CUDA", "CPython", "pybind11", "extensions", "API"]
+requires-python = ">=3.8"
+keywords = ["Python", "C", "C++", "buffer", "vector", "array", "CUDA", "CPython", "nanobind", "extensions", "API"]
 license = {text = "MPL-2.0"}
 classifiers = [
     "Development Status :: 5 - Production/Stable",
@@ -42,7 +42,6 @@ classifiers = [
     "Programming Language :: C",
     "Programming Language :: C++",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.7",
     "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",