
Make a few memory management objects public + Miscellaneous doc updates #693

Merged
merged 11 commits on Jun 11, 2025
1 change: 1 addition & 0 deletions cuda_core/cuda/core/experimental/__init__.py
@@ -14,6 +14,7 @@
from cuda.core.experimental._launch_config import LaunchConfig
from cuda.core.experimental._launcher import launch
from cuda.core.experimental._linker import Linker, LinkerOptions
+from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource
from cuda.core.experimental._module import ObjectCode
from cuda.core.experimental._program import Program, ProgramOptions
from cuda.core.experimental._stream import Stream, StreamOptions
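This hunk promotes `DeviceMemoryResource` and `LegacyPinnedMemoryResource` (alongside the already-public `Buffer` and `MemoryResource`) to the public namespace. Below is a minimal usage sketch of the newly public device resource; the exact signatures shown (constructor, `allocate`, `close`) are assumptions inferred from how `DeviceMemoryResource(dev_id)` is instantiated later in this diff, not a documented API:

```python
# Sketch only: assumes DeviceMemoryResource takes a device ordinal and that
# MemoryResource.allocate(size, stream) returns a Buffer, as suggested by
# the _device.py changes below.
from cuda.core.experimental import Device, DeviceMemoryResource

dev = Device()
dev.set_current()
stream = dev.create_stream()

mr = DeviceMemoryResource(dev.device_id)  # stream-ordered mempool resource
buf = mr.allocate(4096, stream=stream)    # 4 KiB of device memory
buf.close(stream)                         # stream-ordered deallocation
```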
18 changes: 9 additions & 9 deletions cuda_core/cuda/core/experimental/_device.py
@@ -8,8 +8,8 @@
from cuda.core.experimental._context import Context, ContextOptions
from cuda.core.experimental._event import Event, EventOptions
from cuda.core.experimental._graph import GraphBuilder
-from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource
-from cuda.core.experimental._stream import Stream, StreamOptions, default_stream
+from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, MemoryResource, _SynchronousMemoryResource
+from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions, default_stream
from cuda.core.experimental._utils.clear_error_support import assert_type
from cuda.core.experimental._utils.cuda_utils import (
ComputeCapability,
@@ -1004,7 +1004,7 @@ def __new__(cls, device_id: Optional[int] = None):
)
)
) == 1:
-dev._mr = _DefaultAsyncMempool(dev_id)
+dev._mr = DeviceMemoryResource(dev_id)
else:
dev._mr = _SynchronousMemoryResource(dev_id)

@@ -1207,13 +1207,13 @@ def create_context(self, options: ContextOptions = None) -> Context:
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189")

@precondition(_check_context_initialized)
-def create_stream(self, obj=None, options: StreamOptions = None) -> Stream:
+def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions = None) -> Stream:
"""Create a Stream object.

New stream objects can be created in two different ways:

-1) Create a new CUDA stream with customizable `options`.
-2) Wrap an existing foreign `obj` supporting the __cuda_stream__ protocol.
+1) Create a new CUDA stream with customizable ``options``.
+2) Wrap an existing foreign `obj` supporting the ``__cuda_stream__`` protocol.

Option (2) internally holds a reference to the foreign object
such that the lifetime is managed.
@@ -1224,8 +1224,8 @@ def create_stream(self, obj=None, options: StreamOptions = None) -> Stream:

Parameters
----------
-obj : Any, optional
-    Any object supporting the __cuda_stream__ protocol.
+obj : :obj:`~_stream.IsStreamT`, optional
+    Any object supporting the ``__cuda_stream__`` protocol.
options : :obj:`~_stream.StreamOptions`, optional
Customizable dataclass for stream creation options.

@@ -1259,7 +1259,7 @@ def create_event(self, options: Optional[EventOptions] = None) -> Event:
return Event._init(self._id, self.context._handle, options)

@precondition(_check_context_initialized)
-def allocate(self, size, stream=None) -> Buffer:
+def allocate(self, size, stream: Optional[Stream] = None) -> Buffer:
"""Allocate device memory from a specified stream.

Allocates device memory of `size` bytes on the specified `stream`
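The `create_stream` docstring above describes two creation paths. Here is a minimal sketch of option (2), wrapping a foreign stream-like object; it assumes the `__cuda_stream__` protocol is a method returning a `(version, stream_pointer)` tuple with version `0`, which is an assumption about the protocol shape rather than something stated in this diff:

```python
# Sketch of a foreign object implementing the __cuda_stream__ protocol.
# The (0, pointer) return shape is an assumed protocol convention.
from cuda.core.experimental import Device

class ForeignStream:
    def __init__(self, ptr: int):
        self._ptr = ptr

    def __cuda_stream__(self) -> tuple[int, int]:
        return (0, self._ptr)  # (protocol version, CUstream address)

dev = Device()
dev.set_current()
native = dev.create_stream()                 # option (1): brand-new stream
foreign = ForeignStream(int(native.handle))  # e.g. a stream owned elsewhere
wrapped = dev.create_stream(obj=foreign)     # option (2): wrap via protocol
```

Per the docstring, the returned `Stream` holds a reference to the foreign object, so the wrapped handle stays valid for the wrapper's lifetime.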
3 changes: 2 additions & 1 deletion cuda_core/cuda/core/experimental/_launch_config.py
@@ -50,7 +50,8 @@ class LaunchConfig:
shmem_size : int, optional
Dynamic shared-memory size per thread block in bytes.
(Default to size 0)

+cooperative_launch : bool, optional
+    Whether this config can be used to launch a cooperative kernel.
"""

# TODO: expand LaunchConfig to include other attributes
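A hedged construction example for the new `cooperative_launch` flag documented above; the grid/block values are arbitrary, and the int-or-tuple field types are an assumption based on the dataclass docstring rather than this hunk:

```python
# Sketch: a LaunchConfig requesting a cooperative launch. Cooperative
# kernels additionally require device support and that all blocks of the
# grid can be resident on the device at once.
from cuda.core.experimental import LaunchConfig

config = LaunchConfig(
    grid=4,                   # blocks per grid
    block=256,                # threads per block
    shmem_size=0,             # dynamic shared memory per block, in bytes
    cooperative_launch=True,  # the flag added in this PR
)
```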
21 changes: 12 additions & 9 deletions cuda_core/cuda/core/experimental/_launcher.py
@@ -2,11 +2,12 @@
#
# SPDX-License-Identifier: Apache-2.0

+from typing import Union

from cuda.core.experimental._kernel_arg_handler import ParamHolder
from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config
from cuda.core.experimental._module import Kernel
-from cuda.core.experimental._stream import Stream
+from cuda.core.experimental._stream import IsStreamT, Stream, _try_to_get_stream_ptr
from cuda.core.experimental._utils.clear_error_support import assert_type
from cuda.core.experimental._utils.cuda_utils import (
_reduce_3_tuple,
@@ -34,7 +35,7 @@ def _lazy_init():
_inited = True


-def launch(stream, config, kernel, *kernel_args):
+def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kernel, *kernel_args):
Collaborator:
It's a bit inconsistent that only this API supports the __cuda_stream__ protocol, while many other APIs only work with explicit Stream objects. We should probably be consistent.

Maybe push this to a follow-up PR?

Member Author:
We already have public examples for PyTorch and CuPy showcasing the use of this protocol. With this PR we made it slightly faster for our own Stream (through try/except). Perhaps we should instead encourage users to use our native objects, and state that using the protocol could add a slight overhead?

"""Launches a :obj:`~_module.Kernel`
object with launch-time configuration.

@@ -43,7 +44,7 @@ def launch(stream, config, kernel, *kernel_args):
stream : :obj:`~_stream.Stream`
The stream establishing the stream ordering semantic of a
launch.
-config : :obj:`~_launcher.LaunchConfig`
+config : :obj:`LaunchConfig`
Launch configurations inline with options provided by
:obj:`~_launcher.LaunchConfig` dataclass.
kernel : :obj:`~_module.Kernel`
@@ -55,13 +56,15 @@ def launch(stream, config, kernel, *kernel_args):
"""
if stream is None:
raise ValueError("stream cannot be None, stream must either be a Stream object or support __cuda_stream__")
-if not isinstance(stream, Stream):
-    try:
-        stream = Stream._init(stream)
-    except Exception as e:
+try:
+    stream_handle = stream.handle
+except AttributeError:
+    try:
+        stream_handle = _try_to_get_stream_ptr(stream)
+    except Exception:
         raise ValueError(
             f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})"
-        ) from e
+        ) from None
assert_type(kernel, Kernel)
_lazy_init()
config = check_or_create_options(LaunchConfig, config, "launch config")
@@ -78,15 +81,15 @@
# rich.
if _use_ex:
drv_cfg = _to_native_launch_config(config)
-drv_cfg.hStream = stream.handle
+drv_cfg.hStream = stream_handle
if config.cooperative_launch:
_check_cooperative_launch(kernel, config, stream)
handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
else:
# TODO: check if config has any unsupported attrs
handle_return(
driver.cuLaunchKernel(
-int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
+int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream_handle, args_ptr, 0
)
)

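The review thread above concerns the dispatch order this hunk introduces: a native `Stream` hits the `.handle` fast path, and only foreign objects pay for the `__cuda_stream__` lookup. A standalone sketch of that try/except (EAFP) pattern follows; `get_stream_ptr` is a hypothetical stand-in for the private `_try_to_get_stream_ptr` helper, and the protocol-tuple handling is an assumption:

```python
# Standalone sketch of the try/except dispatch used in launch() above.
def get_stream_ptr(obj):
    # Hypothetical stand-in for cuda.core's private _try_to_get_stream_ptr.
    info = obj.__cuda_stream__
    version, ptr = info() if callable(info) else info  # tolerate both forms
    if version != 0:
        raise RuntimeError(f"unsupported __cuda_stream__ version: {version}")
    return ptr

def resolve_stream_handle(stream):
    try:
        return stream.handle  # fast path: cuda.core's own Stream
    except AttributeError:
        try:
            return get_stream_ptr(stream)  # slow path: foreign object
        except Exception:
            raise ValueError(
                f"stream must either be a Stream object or support "
                f"__cuda_stream__ (got {type(stream)})"
            ) from None
```

As the author notes, native `Stream` objects skip the protocol lookup entirely, so only the foreign-object path carries the small per-launch overhead.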