diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index c02e5c09d..f83e0e223 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -14,6 +14,7 @@ from cuda.core.experimental._launch_config import LaunchConfig from cuda.core.experimental._launcher import launch from cuda.core.experimental._linker import Linker, LinkerOptions +from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 1d874586a..c9a786070 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -8,8 +8,8 @@ from cuda.core.experimental._context import Context, ContextOptions from cuda.core.experimental._event import Event, EventOptions from cuda.core.experimental._graph import GraphBuilder -from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource -from cuda.core.experimental._stream import Stream, StreamOptions, default_stream +from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, MemoryResource, _SynchronousMemoryResource +from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions, default_stream from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( ComputeCapability, @@ -1004,7 +1004,7 @@ def __new__(cls, device_id: Optional[int] = None): ) ) ) == 1: - dev._mr = _DefaultAsyncMempool(dev_id) + dev._mr = DeviceMemoryResource(dev_id) else: dev._mr = _SynchronousMemoryResource(dev_id) @@ -1207,13 +1207,13 @@ def create_context(self, options: ContextOptions = None) -> Context: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") @precondition(_check_context_initialized) - def create_stream(self, obj=None, options: StreamOptions = None) -> Stream: + def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions = None) -> Stream: """Create a Stream object. New stream objects can be created in two different ways: - 1) Create a new CUDA stream with customizable `options`. - 2) Wrap an existing foreign `obj` supporting the __cuda_stream__ protocol. + 1) Create a new CUDA stream with customizable ``options``. + 2) Wrap an existing foreign `obj` supporting the ``__cuda_stream__`` protocol. Option (2) internally holds a reference to the foreign object such that the lifetime is managed. @@ -1224,8 +1224,8 @@ def create_stream(self, obj=None, options: StreamOptions = None) -> Stream: Parameters ---------- - obj : Any, optional - Any object supporting the __cuda_stream__ protocol. + obj : :obj:`~_stream.IsStreamT`, optional + Any object supporting the ``__cuda_stream__`` protocol. options : :obj:`~_stream.StreamOptions`, optional Customizable dataclass for stream creation options. 
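With this change, any object implementing the ``__cuda_stream__`` protocol can be promoted to a first-class `Stream` via `Device.create_stream`. A minimal sketch of the intended usage (the `ForeignStream` class below is hypothetical, and a real stream's address is borrowed purely for illustration):

```python
from cuda.core.experimental import Device


class ForeignStream:
    """Hypothetical third-party stream type implementing __cuda_stream__."""

    def __init__(self, stream_ptr: int):
        self._ptr = stream_ptr

    def __cuda_stream__(self):
        # Protocol version (currently 0) and the cudaStream_t address, both ints.
        return (0, self._ptr)


dev = Device()
dev.set_current()
native = dev.create_stream()  # source of a real stream address for this sketch
wrapped = dev.create_stream(ForeignStream(int(native.handle)))
assert int(wrapped.handle) == int(native.handle)
```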
@@ -1259,7 +1259,7 @@ def create_event(self, options: Optional[EventOptions] = None) -> Event: return Event._init(self._id, self.context._handle, options) @precondition(_check_context_initialized) - def allocate(self, size, stream=None) -> Buffer: + def allocate(self, size, stream: Optional[Stream] = None) -> Buffer: """Allocate device memory from a specified stream. Allocates device memory of `size` bytes on the specified `stream` diff --git a/cuda_core/cuda/core/experimental/_launch_config.py b/cuda_core/cuda/core/experimental/_launch_config.py index 43b7c2109..9640e4a67 100644 --- a/cuda_core/cuda/core/experimental/_launch_config.py +++ b/cuda_core/cuda/core/experimental/_launch_config.py @@ -50,7 +50,8 @@ class LaunchConfig: shmem_size : int, optional Dynamic shared-memory size per thread block in bytes. (Default to size 0) - + cooperative_launch : bool, optional + Whether this config can be used to launch a cooperative kernel. """ # TODO: expand LaunchConfig to include other attributes diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py index 1177d6034..5f191e017 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.py @@ -2,11 +2,12 @@ # # SPDX-License-Identifier: Apache-2.0 +from typing import Union from cuda.core.experimental._kernel_arg_handler import ParamHolder from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config from cuda.core.experimental._module import Kernel -from cuda.core.experimental._stream import Stream +from cuda.core.experimental._stream import IsStreamT, Stream, _try_to_get_stream_ptr from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( _reduce_3_tuple, @@ -34,7 +35,7 @@ def _lazy_init(): _inited = True -def launch(stream, config, kernel, *kernel_args): +def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kernel, *kernel_args): """Launches a :obj:`~_module.Kernel` object with launch-time configuration. @@ -43,7 +44,7 @@ def launch(stream, config, kernel, *kernel_args): stream : :obj:`~_stream.Stream` The stream establishing the stream ordering semantic of a launch. - config : :obj:`~_launcher.LaunchConfig` + config : :obj:`LaunchConfig` Launch configurations inline with options provided by :obj:`~_launcher.LaunchConfig` dataclass. kernel : :obj:`~_module.Kernel` @@ -55,13 +56,15 @@ def launch(stream, config, kernel, *kernel_args): """ if stream is None: raise ValueError("stream cannot be None, stream must either be a Stream object or support __cuda_stream__") - if not isinstance(stream, Stream): + try: + stream_handle = stream.handle + except AttributeError: try: - stream = Stream._init(stream) - except Exception as e: + stream_handle = _try_to_get_stream_ptr(stream) + except Exception: raise ValueError( f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})" - ) from e + ) from None assert_type(kernel, Kernel) _lazy_init() config = check_or_create_options(LaunchConfig, config, "launch config") @@ -78,7 +81,7 @@ def launch(stream, config, kernel, *kernel_args): # rich. 
if _use_ex: drv_cfg = _to_native_launch_config(config) - drv_cfg.hStream = stream.handle + drv_cfg.hStream = stream_handle if config.cooperative_launch: _check_cooperative_launch(kernel, config, stream) handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0)) @@ -86,7 +89,7 @@ # TODO: check if config has any unsupported attrs handle_return( driver.cuLaunchKernel( - int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0 + int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream_handle, args_ptr, 0 ) ) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index e214da0f8..0ef751a09 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -9,17 +9,18 @@ from typing import Optional, Tuple, TypeVar, Union from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule -from cuda.core.experimental._stream import default_stream +from cuda.core.experimental._stream import Stream, default_stream from cuda.core.experimental._utils.cuda_utils import driver, handle_return -PyCapsule = TypeVar("PyCapsule") - - # TODO: define a memory property mixin class and make Buffer and # MemoryResource both inherit from it + +PyCapsule = TypeVar("PyCapsule") +"""Represent the capsule type.""" + DevicePointerT = Union[driver.CUdeviceptr, int, None] -"""A type union of `Cudeviceptr`, `int` and `None` for hinting Buffer.handle.""" +"""A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" class Buffer: @@ -29,19 +30,7 @@ class Buffer: different memory resources are to give access to their memory allocations. - Support for data interchange mechanisms are provided by - establishing both the DLPack and the Python-level buffer - protocols. - - Parameters - ---------- - ptr : Any - Allocated buffer handle object - size : Any - Memory size of the buffer - mr : :obj:`~_memory.MemoryResource`, optional - Memory resource associated with the buffer - + Support for data interchange mechanisms is provided by DLPack. """ class _MembersNeededForFinalize: @@ -55,8 +44,6 @@ def __init__(self, buffer_obj, ptr, size, mr): def close(self, stream=None): if self.ptr and self.mr is not None: - if stream is None: - stream = default_stream() self.mr.deallocate(self.ptr, self.size, stream) self.ptr = 0 self.mr = None @@ -64,10 +51,16 @@ # TODO: handle ownership? (_mr could be None) __slots__ = ("__weakref__", "_mnff") - def __init__(self, ptr, size, mr: MemoryResource = None): + def __new__(cls, *args, **kwargs): + raise RuntimeError("Buffer objects cannot be instantiated directly. Please use MemoryResource APIs.") + + @classmethod + def _init(cls, ptr: DevicePointerT, size: int, mr: Optional[MemoryResource] = None): + self = super().__new__(cls) self._mnff = Buffer._MembersNeededForFinalize(self, ptr, size, mr) + return self - def close(self, stream=None): + def close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. This buffer is released back to their memory resource @@ -75,11 +68,9 @@ Parameters ---------- - stream : Any, optional - The stream object with a __cuda_stream__ protocol to - use for asynchronous deallocation. Defaults to using - the default stream. - + stream : Stream, optional + The stream object to use for asynchronous deallocation. 
If None, + the behavior depends on the underlying memory resource. """ self._mnff.close(stream) @@ -95,7 +86,7 @@ def handle(self) -> DevicePointerT: return self._mnff.ptr @property - def size(self): + def size(self) -> int: """Return the memory size of this buffer.""" return self._mnff.size @@ -125,7 +116,7 @@ def device_id(self) -> int: return self._mnff.mr.device_id raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - def copy_to(self, dst: Buffer = None, *, stream) -> Buffer: + def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. Copies the data from this buffer to the provided dst buffer. @@ -136,7 +127,7 @@ ---------- dst : :obj:`~_memory.Buffer` Source buffer to copy data from - stream : Any + stream : Stream Keyword argument specifying the stream for the asynchronous copy @@ -154,14 +145,14 @@ handle_return(driver.cuMemcpyAsync(dst._mnff.ptr, self._mnff.ptr, self._mnff.size, stream.handle)) return dst - def copy_from(self, src: Buffer, *, stream): + def copy_from(self, src: Buffer, *, stream: Stream): """Copy from the src buffer to this buffer asynchronously on the given stream. Parameters ---------- src : :obj:`~_memory.Buffer` Source buffer to copy data from - stream : Any + stream : Stream Keyword argument specifying the stream for the asynchronous copy @@ -219,107 +210,247 @@ def __release_buffer__(self, buffer: memoryview, /): # Supporting method paired with __buffer__. raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.") + @staticmethod + def from_handle(ptr: DevicePointerT, size: int, mr: Optional[MemoryResource] = None) -> Buffer: + """Create a new :class:`Buffer` object from a pointer. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + Allocated buffer handle object + size : int + Memory size of the buffer + mr : :obj:`~_memory.MemoryResource`, optional + Memory resource associated with the buffer + """ + return Buffer._init(ptr, size, mr=mr) + class MemoryResource(abc.ABC): + """Abstract base class for memory resources that manage allocation and deallocation of buffers. + + Subclasses must implement methods for allocation and deallocation, as well as the properties + associated with this memory resource, which all allocated buffers inherit. (Since every + :class:`Buffer` instance returned by the :meth:`allocate` method holds a reference to the + memory resource that created it, buffer properties are retrieved simply by looking up the + underlying memory resource's respective property.) + """ + __slots__ = ("_handle",) @abc.abstractmethod - def __init__(self, *args, **kwargs): ... + def __init__(self, *args, **kwargs): + """Initialize the memory resource. + + Subclasses may use additional arguments to configure the resource. + """ + ... @abc.abstractmethod - def allocate(self, size, stream=None) -> Buffer: ... + def allocate(self, size: int, stream: Stream = None) -> Buffer: + """Allocate a buffer of the requested size. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : Stream, optional + The stream on which to perform the allocation asynchronously. + If None, it is up to each memory resource implementation to decide + and document the behavior. 
+ + Returns + ------- + Buffer + The allocated buffer object, which can be used for device or host operations + depending on the resource's properties. + """ + ... @abc.abstractmethod - def deallocate(self, ptr, size, stream=None): ... + def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : Stream, optional + The stream on which to perform the deallocation asynchronously. + If None, it is up to each memory resource implementation to decide + and document the behavior. + """ + ... @property @abc.abstractmethod def is_device_accessible(self) -> bool: - # Check if the buffers allocated from this MR can be accessed from - # GPUs. + """bool: True if buffers allocated by this resource can be accessed on the device.""" ... @property @abc.abstractmethod def is_host_accessible(self) -> bool: - # Check if the buffers allocated from this MR can be accessed from - # CPUs. + """bool: True if buffers allocated by this resource can be accessed on the host.""" ... @property @abc.abstractmethod def device_id(self) -> int: - # Return the device ID if this MR is for single devices. Raise an - # exception if it is not. + """int: The device ordinal for which this memory resource is responsible. + + Raises + ------ + RuntimeError + If the resource is not bound to a specific device. + """ ... -class _DefaultAsyncMempool(MemoryResource): +class DeviceMemoryResource(MemoryResource): + """Create a device memory resource that uses the driver's stream-ordered memory pool. + + Parameters + ---------- + device_id : int + Device ordinal for which a memory resource is constructed. The mempool that is + set to *current* on ``device_id`` is used. If no mempool is set to current yet, + the driver uses the *default* mempool on the device. + """ + __slots__ = ("_dev_id",) - def __init__(self, dev_id): - self._handle = handle_return(driver.cuDeviceGetMemPool(dev_id)) - self._dev_id = dev_id + def __init__(self, device_id: int): + self._handle = handle_return(driver.cuDeviceGetMemPool(device_id)) + self._dev_id = device_id - def allocate(self, size, stream=None) -> Buffer: + def allocate(self, size: int, stream: Stream = None) -> Buffer: + """Allocate a buffer of the requested size. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : Stream, optional + The stream on which to perform the allocation asynchronously. + If None, an internal stream is used. + + Returns + ------- + Buffer + The allocated buffer object, which is accessible on the device that this memory + resource was created for. + """ if stream is None: stream = default_stream() ptr = handle_return(driver.cuMemAllocFromPoolAsync(size, self._handle, stream.handle)) - return Buffer(ptr, size, self) + return Buffer._init(ptr, size, self) - def deallocate(self, ptr, size, stream=None): + def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : Stream, optional + The stream on which to perform the deallocation asynchronously. + If None, an internal stream is used. 
+ """ if stream is None: stream = default_stream() handle_return(driver.cuMemFreeAsync(ptr, stream.handle)) @property def is_device_accessible(self) -> bool: + """bool: this memory resource provides device-accessible buffers.""" return True @property def is_host_accessible(self) -> bool: + """bool: this memory resource does not provides host-accessible buffers.""" return False @property def device_id(self) -> int: + """int: the associated device ordinal.""" return self._dev_id -class _DefaultPinnedMemorySource(MemoryResource): +class LegacyPinnedMemoryResource(MemoryResource): + """Create a pinned memory resource that uses legacy cuMemAllocHost/cudaMallocHost + APIs. + """ + def __init__(self): # TODO: support flags from cuMemHostAlloc? self._handle = None - def allocate(self, size, stream=None) -> Buffer: + def allocate(self, size: int, stream: Stream = None) -> Buffer: + """Allocate a buffer of the requested size. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : Stream, optional + Currently ignored + + Returns + ------- + Buffer + The allocated buffer object, which is accessible on both host and device. + """ ptr = handle_return(driver.cuMemAllocHost(size)) - return Buffer(ptr, size, self) + return Buffer._init(ptr, size, self) - def deallocate(self, ptr, size, stream=None): + def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : Stream, optional + The stream on which to perform the deallocation asynchronously. + If None, no synchronization would happen. 
+ """ + if stream: + stream.sync() handle_return(driver.cuMemFreeHost(ptr)) @property def is_device_accessible(self) -> bool: + """bool: this memory resource provides device-accessible buffers.""" return True @property def is_host_accessible(self) -> bool: + """bool: this memory resource provides host-accessible buffers.""" return True @property def device_id(self) -> int: + """This memory resource is not bound to any GPU.""" raise RuntimeError("a pinned memory resource is not bound to any GPU") class _SynchronousMemoryResource(MemoryResource): __slots__ = ("_dev_id",) - def __init__(self, dev_id): + def __init__(self, device_id): self._handle = None - self._dev_id = dev_id + self._dev_id = device_id def allocate(self, size, stream=None) -> Buffer: ptr = handle_return(driver.cuMemAlloc(size)) - return Buffer(ptr, size, self) + return Buffer._init(ptr, size, self) def deallocate(self, ptr, size, stream=None): if stream is None: diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 029505da6..ea488f9fc 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -8,7 +8,7 @@ import warnings import weakref from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Protocol, Tuple, Union if TYPE_CHECKING: import cuda.bindings @@ -44,6 +44,47 @@ class StreamOptions: priority: Optional[int] = None +class IsStreamT(Protocol): + def __cuda_stream__(self) -> Tuple[int, int]: + """ + For any Python object that is meant to be interpreted as a CUDA stream, the intent + can be communicated by implementing this protocol that returns a 2-tuple: The protocol + version number (currently ``0``) and the address of ``cudaStream_t``. Both values + should be Python `int`. + """ + ... + + +def _try_to_get_stream_ptr(obj: IsStreamT): + try: + cuda_stream_attr = obj.__cuda_stream__ + except AttributeError: + raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") from None + + if callable(cuda_stream_attr): + info = cuda_stream_attr() + else: + info = cuda_stream_attr + warnings.simplefilter("once", DeprecationWarning) + warnings.warn( + "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method", + stacklevel=3, + category=DeprecationWarning, + ) + + try: + len_info = len(info) + except TypeError as e: + raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}") from e + if len_info != 2: + raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements") + if info[0] != 0: + raise RuntimeError( + f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" + ) + return driver.CUstream(info[1]) + + class Stream: """Represent a queue of GPU operations that are executed in a specific order. 
@@ -108,42 +149,14 @@ def _per_thread_default(cls): return self @classmethod - def _init(cls, obj=None, *, options: Optional[StreamOptions] = None): + def _init(cls, obj: Optional[IsStreamT] = None, *, options: Optional[StreamOptions] = None): self = super().__new__(cls) self._mnff = Stream._MembersNeededForFinalize(self, None, None, False) if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - cuda_stream_attr = getattr(obj, "__cuda_stream__", None) - if cuda_stream_attr is None: - raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") - if callable(cuda_stream_attr): - info = cuda_stream_attr() - else: - info = cuda_stream_attr - warnings.simplefilter("once", DeprecationWarning) - warnings.warn( - "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method", - stacklevel=3, - category=DeprecationWarning, - ) - try: - len_info = len(info) - except Exception as e: - raise RuntimeError( - f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}" - ) from e - if len_info != 2: - raise RuntimeError( - f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements" - ) - if info[0] != 0: - raise RuntimeError( - f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" - ) - - self._mnff.handle = driver.CUstream(info[1]) + self._mnff.handle = _try_to_get_stream_ptr(obj) # TODO: check if obj is created under the current context/device self._mnff.owner = obj self._nonblocking = None # delayed diff --git a/cuda_core/docs/source/_templates/autosummary/protocol.rst b/cuda_core/docs/source/_templates/autosummary/protocol.rst new file mode 100644 index 000000000..13ce267a6 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/protocol.rst @@ -0,0 +1,28 @@ +.. SPDX-License-Identifier: Apache-2.0 + +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoprotocol:: {{ objname }} + + {% block methods %} + {% if methods %} + .. rubric:: {{ _('Methods') }} + + {% for item in methods %} + .. automethod:: {{ item }} + {%- endfor %} + + {% endif %} + {% endblock %} + + {% block attributes %} + {% if attributes %} + .. rubric:: {{ _('Attributes') }} + + {% for item in attributes %} + .. autoproperty:: {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 2b82dd066..7f9f814d4 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -20,6 +20,10 @@ CUDA runtime Graph GraphBuilder launch + Buffer + MemoryResource + DeviceMemoryResource + LegacyPinnedMemoryResource :template: dataclass.rst diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index 94f5a5df1..8cc27c83a 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -15,11 +15,11 @@ CUDA runtime .. autosummary:: :toctree: generated/ - _memory.Buffer _stream.Stream + _memory.PyCapsule + _memory.DevicePointerT _event.Event _device.DeviceProperties - _launch_config.LaunchConfig _module.KernelAttributes _module.KernelOccupancy _module.ParamInfo @@ -33,4 +33,13 @@ CUDA compilation toolchain :toctree: generated/ _module.Kernel - _module.ObjectCode + + +CUDA protocols +-------------- + +.. 
autosummary:: + :toctree: generated/ + :template: protocol.rst + + _stream.IsStreamT diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index ac0ce27e3..6945f47fd 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -40,6 +40,7 @@ "myst_nb", "enum_tools.autoenum", "sphinx_copybutton", + "sphinx_toolbox.more_autodoc.autoprotocol", ] # Add any paths that contain templates here, relative to this directory. diff --git a/cuda_core/docs/source/contribute.rst b/cuda_core/docs/source/contribute.rst index a7a10b3d3..2965afb50 100644 --- a/cuda_core/docs/source/contribute.rst +++ b/cuda_core/docs/source/contribute.rst @@ -1,5 +1,7 @@ .. SPDX-License-Identifier: Apache-2.0 +.. _contributor_guide: + Contributing ------------ diff --git a/cuda_core/docs/source/interoperability.rst b/cuda_core/docs/source/interoperability.rst index 01e932842..a853e6481 100644 --- a/cuda_core/docs/source/interoperability.rst +++ b/cuda_core/docs/source/interoperability.rst @@ -34,10 +34,10 @@ in Python. While we encourage new Python projects to start using streams (and ot CUDA types) from ``cuda.core``, we understand that there are already several projects exposing their own stream types. -To address this issue, we propose the ``__cuda_stream__`` protocol (currently version -0) as follows: For any Python objects that are meant to be interpreted as a stream, they -should add a ``__cuda_stream__`` method that returns a 2-tuple: The version number -(``0``) and the address of ``cudaStream_t`` (both as Python `int`): +To address this issue, we propose the :attr:`~_stream.IsStreamT.__cuda_stream__` protocol +(currently version 0) as follows: For any Python object that is meant to be interpreted +as a stream, it should add a ``__cuda_stream__`` *method* that returns a 2-tuple: the +version number (``0``) and the address of ``cudaStream_t`` (both as Python `int`): .. code-block:: python @@ -48,11 +48,11 @@ should add a ``__cuda_stream__`` method that returns a 2-tuple: The version numb ... -Then such objects can be understood by ``cuda.core`` anywhere a stream-like object -is needed. +Then such objects can be understood and wrapped by :meth:`Device.create_stream`. -We suggest all existing Python projects that expose a stream class to also support this -protocol wherever a function takes a stream. +We suggest that all existing Python projects that already expose a stream class also support +this protocol. For new Python projects that need to access CUDA streams, we encourage you +to use :class:`~_stream.Stream` from ``cuda.core`` directly. Memory view utilities for CPU/GPU buffers diff --git a/cuda_core/docs/source/release/0.3.0-notes.rst b/cuda_core/docs/source/release/0.3.0-notes.rst index 856cdde1e..5176e1e17 100644 --- a/cuda_core/docs/source/release/0.3.0-notes.rst +++ b/cuda_core/docs/source/release/0.3.0-notes.rst @@ -7,10 +7,11 @@ Released on MM DD, 2025 + Highlights ---------- -- Starting this release ``cuda.core`` is licensed under Apache 2.0. +- Starting with this release, ``cuda.core`` is licensed under Apache 2.0. The biggest implication of this change is that we are open to external contributions now! Please follow the :ref:`Contributor Guide <contributor_guide>` for detailed instructions. - Initial support for CUDA graphs (phase 1). - In this release, we support building a CUDA graph that captures kernel launches. The captured graph can be replayed to reduce latency. Graph split/join and conditional nodes are supported. 
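For context on the launch-path changes in this PR, a condensed end-to-end sketch of compiling and launching a kernel with the updated APIs (the kernel source and option values are illustrative only):

```python
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch

dev = Device()
dev.set_current()
stream = dev.create_stream()

# Illustrative no-op kernel.
code = 'extern "C" __global__ void noop() {}'
arch = "".join(f"{i}" for i in dev.compute_capability)
prog = Program(code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}"))
kernel = prog.compile(target_type="cubin").get_kernel("noop")

config = LaunchConfig(grid=1, block=1)
launch(stream, config, kernel)  # also accepts any object implementing __cuda_stream__
stream.sync()
```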
@@ -19,24 +20,41 @@ Highlights Breaking Changes ---------------- +- The :class:`Buffer` object's ``__init__()`` method is removed; see below. +- The :class:`Buffer` object's :meth:`~Buffer.close` method and destructor now always defer to the underlying memory resource implementation + to decide the behavior if a stream is not explicitly passed. Previously, in this case it always used the default stream, which could + interfere with the memory resource's assumptions. + New features ------------ -- :class:`Kernel` adds :attr:`Kernel.num_arguments` and :attr:`Kernel.arguments_info` for introspection of kernel arguments. (#612) +- :class:`~_module.Kernel` adds :attr:`~_module.Kernel.num_arguments` and :attr:`~_module.Kernel.arguments_info` for introspection of kernel arguments. (#612) - Add pythonic access to kernel occupancy calculation functions via :attr:`Kernel.occupancy`. (#648) - Support launching cooperative kernels by setting :attr:`LaunchConfig.cooperative_launch` to `True`. -- A name can be assigned to :class:`ObjectCode` instances generated by both :class:`Program` and :class:`Linker` through their respective - options. +- A name can be assigned to :class:`ObjectCode` instances generated by both :class:`Program` and :class:`Linker` through their respective options. +- Expose :class:`Buffer`, :class:`DeviceMemoryResource`, :class:`LegacyPinnedMemoryResource`, and :class:`MemoryResource` to the top namespace. + - Before this release, the internal :class:`Buffer` class had an ``__init__()`` constructor. To align with the design of cuda.core objects, + this constructor is removed starting with this release. Users who still need the old behavior should use the :meth:`~Buffer.from_handle` + alternative constructor. +- Add a typing annotation for the :attr:`~_stream.IsStreamT.__cuda_stream__` protocol. New examples ------------ +- Add a PyTorch-based example. +- Split the :class:`StridedMemoryView` example into two (CPU/GPU). + Fixes and enhancements ---------------------- +- ``cuda.core`` now raises clearer and more actionable error messages whenever possible. +- :class:`ObjectCode` can now be pickled. - Look-up of the :attr:`Event.device` and :attr:`Event.context` (the device and CUDA context where an event was created from) is now possible. +- :class:`Event`-based timing is made more robust (also with better error messages). - The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed. +- :attr:`ProgramOptions.ptxas_options` can now accept more than one argument. - The :class:`Device` constructor is made faster. +- The CFFI-based example no longer leaves the intermediate files on disk after it finishes. 
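The example and test updates below follow two recurring patterns: foreign stream wrappers are now passed through `Device.create_stream()` before use, and pinned host memory comes from the now-public `LegacyPinnedMemoryResource`. The allocation pattern, as used verbatim in the tests:

```python
import numpy as np

from cuda.core.experimental import LegacyPinnedMemoryResource

mr = LegacyPinnedMemoryResource()
b = mr.allocate(4)  # 4 bytes of pinned host memory
arr = np.from_dlpack(b).view(np.int32)  # zero-copy NumPy view of the buffer
arr[0] = 42
b.close()  # legacy pinned free; no stream required
```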
diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py index 76a972859..4e8ce5533 100644 --- a/cuda_core/examples/pytorch_example.py +++ b/cuda_core/examples/pytorch_example.py @@ -40,7 +40,7 @@ def __cuda_stream__(self): return (0, stream_id) # Return format required by CUDA Python -s = PyTorchStreamWrapper(pt_stream) +s = dev.create_stream(PyTorchStreamWrapper(pt_stream)) # prepare program arch = "".join(f"{i}" for i in dev.compute_capability) diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index 2d790592c..a9b006b31 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -87,7 +87,7 @@ def __cuda_stream__(self): a = cp.random.random(size, dtype=dtype) b = cp.random.random(size, dtype=dtype) c = cp.empty_like(a) -cp_stream0 = StreamAdaptor(cp.cuda.get_current_stream()) +cp_stream0 = dev0.create_stream(StreamAdaptor(cp.cuda.get_current_stream())) # Establish a stream order to ensure that memory has been initialized before # accessed by the kernel. @@ -102,7 +102,7 @@ def __cuda_stream__(self): x = cp.random.random(size, dtype=dtype) y = cp.random.random(size, dtype=dtype) z = cp.empty_like(a) -cp_stream1 = StreamAdaptor(cp.cuda.get_current_stream()) +cp_stream1 = dev1.create_stream(StreamAdaptor(cp.cuda.get_current_stream())) # Establish a stream order stream1.wait(cp_stream1) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 32d450102..9cee936d8 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -10,8 +10,15 @@ from conftest import skipif_need_cuda_headers import cuda.core.experimental -from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch -from cuda.core.experimental._memory import _DefaultPinnedMemorySource +from cuda.core.experimental import ( + Device, + EventOptions, + LaunchConfig, + LegacyPinnedMemoryResource, + Program, + ProgramOptions, + launch, +) def test_event_init_disabled(): @@ -143,7 +150,7 @@ def test_error_timing_incomplete(): mod = prog.compile(target_type="cubin") ker = mod.get_kernel("wait") - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(4) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py index 97f7c5c60..396dbc99f 100644 --- a/cuda_core/tests/test_graph.py +++ b/cuda_core/tests/test_graph.py @@ -15,11 +15,11 @@ GraphCompleteOptions, GraphDebugPrintOptions, LaunchConfig, + LegacyPinnedMemoryResource, Program, ProgramOptions, launch, ) -from cuda.core.experimental._memory import _DefaultPinnedMemorySource from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return @@ -174,7 +174,7 @@ def test_graph_repeat_capture(init_cuda): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(4) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 @@ -225,7 +225,7 @@ def test_graph_conditional_if(init_cuda, condition_value): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(8) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 @@ -287,7 +287,7 @@ def test_graph_conditional_if_else(init_cuda, condition_value): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + 
mr = LegacyPinnedMemoryResource() b = mr.allocate(8) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 @@ -359,7 +359,7 @@ def test_graph_conditional_switch(init_cuda, condition_value): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(12) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 @@ -451,7 +451,7 @@ def test_graph_conditional_while(init_cuda, condition_value): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(4) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 @@ -497,7 +497,7 @@ def test_graph_child_graph(init_cuda): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(8) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 @@ -550,7 +550,7 @@ def test_graph_update(init_cuda): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(12) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index 635d1fcf5..3a02065de 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -9,8 +9,7 @@ import pytest from conftest import skipif_need_cuda_headers -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch -from cuda.core.experimental._memory import _DefaultPinnedMemorySource +from cuda.core.experimental import Device, LaunchConfig, LegacyPinnedMemoryResource, Program, ProgramOptions, launch def test_launch_config_init(init_cuda): @@ -111,7 +110,7 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value): dev.set_current() # Prepare pinned host array - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(np.dtype(python_type).itemsize) arr = np.from_dlpack(b).view(python_type) arr[:] = 0 diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 5bcc607da..3169c1d19 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -10,8 +10,8 @@ import pytest -from cuda.core.experimental import Device -from cuda.core.experimental._memory import Buffer, DLDeviceType, MemoryResource +from cuda.core.experimental import Buffer, Device, MemoryResource +from cuda.core.experimental._memory import DLDeviceType from cuda.core.experimental._utils.cuda_utils import handle_return @@ -21,7 +21,7 @@ def __init__(self, device): def allocate(self, size, stream=None) -> Buffer: ptr = handle_return(driver.cuMemAlloc(size)) - return Buffer(ptr=ptr, size=size, mr=self) + return Buffer.from_handle(ptr=ptr, size=size, mr=self) def deallocate(self, ptr, size, stream=None): handle_return(driver.cuMemFree(ptr)) @@ -46,7 +46,7 @@ def __init__(self): def allocate(self, size, stream=None) -> Buffer: # Allocate a ctypes buffer of size `size` ptr = (ctypes.c_byte * size)() - return Buffer(ptr=ptr, size=size, mr=self) + return Buffer.from_handle(ptr=ptr, size=size, mr=self) def deallocate(self, ptr, size, stream=None): # the memory is deallocated per the ctypes deallocation at garbage collection time @@ -71,7 +71,7 @@ def __init__(self, device): def allocate(self, size, stream=None) -> Buffer: ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) - return Buffer(ptr=ptr, 
size=size, mr=self) + return Buffer.from_handle(ptr=ptr, size=size, mr=self) def deallocate(self, ptr, size, stream=None): handle_return(driver.cuMemFree(ptr)) @@ -95,7 +95,7 @@ def __init__(self, device): def allocate(self, size, stream=None) -> Buffer: ptr = handle_return(driver.cuMemAllocHost(size)) - return Buffer(ptr=ptr, size=size, mr=self) + return Buffer.from_handle(ptr=ptr, size=size, mr=self) def deallocate(self, ptr, size, stream=None): handle_return(driver.cuMemFreeHost(ptr))
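Finally, a short sketch of the deallocation-behavior change called out in the release notes: passing a stream to `Buffer.close()` orders the free explicitly, while omitting it now defers to the memory resource's documented default rather than the previously implicit default stream (the size and stream here are arbitrary):

```python
from cuda.core.experimental import Device

dev = Device()
dev.set_current()
stream = dev.create_stream()

# Device.allocate() draws from the device's current memory resource
# (a DeviceMemoryResource on devices with memory-pool support).
buf = dev.allocate(64, stream=stream)

# Passing the stream orders the free after preceding work on that stream.
buf.close(stream)
stream.sync()
```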