diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index c02e5c09d..f83e0e223 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -14,6 +14,7 @@ from cuda.core.experimental._launch_config import LaunchConfig from cuda.core.experimental._launcher import launch from cuda.core.experimental._linker import Linker, LinkerOptions +from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 1d874586a..c9a786070 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -8,8 +8,8 @@ from cuda.core.experimental._context import Context, ContextOptions from cuda.core.experimental._event import Event, EventOptions from cuda.core.experimental._graph import GraphBuilder -from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource -from cuda.core.experimental._stream import Stream, StreamOptions, default_stream +from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, MemoryResource, _SynchronousMemoryResource +from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions, default_stream from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( ComputeCapability, @@ -1004,7 +1004,7 @@ def __new__(cls, device_id: Optional[int] = None): ) ) ) == 1: - dev._mr = _DefaultAsyncMempool(dev_id) + dev._mr = DeviceMemoryResource(dev_id) else: dev._mr = _SynchronousMemoryResource(dev_id) @@ -1207,13 +1207,13 @@ def create_context(self, options: ContextOptions = None) -> Context: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") @precondition(_check_context_initialized) - def create_stream(self, obj=None, options: StreamOptions = None) -> Stream: + def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions = None) -> Stream: """Create a Stream object. New stream objects can be created in two different ways: - 1) Create a new CUDA stream with customizable `options`. - 2) Wrap an existing foreign `obj` supporting the __cuda_stream__ protocol. + 1) Create a new CUDA stream with customizable ``options``. + 2) Wrap an existing foreign `obj` supporting the ``__cuda_stream__`` protocol. Option (2) internally holds a reference to the foreign object such that the lifetime is managed. @@ -1224,8 +1224,8 @@ def create_stream(self, obj=None, options: StreamOptions = None) -> Stream: Parameters ---------- - obj : Any, optional - Any object supporting the __cuda_stream__ protocol. + obj : :obj:`~_stream.IsStreamT`, optional + Any object supporting the ``__cuda_stream__`` protocol. options : :obj:`~_stream.StreamOptions`, optional Customizable dataclass for stream creation options. 
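With this change, any object implementing the ``__cuda_stream__`` protocol can be promoted to a first-class `Stream` via `Device.create_stream`. A minimal sketch of the intended usage (the `ForeignStream` class below is hypothetical, and a real stream's address is borrowed purely for illustration):

```python
from cuda.core.experimental import Device


class ForeignStream:
    """Hypothetical third-party stream type implementing __cuda_stream__."""

    def __init__(self, stream_ptr: int):
        self._ptr = stream_ptr

    def __cuda_stream__(self):
        # Protocol version (currently 0) and the cudaStream_t address, both ints.
        return (0, self._ptr)


dev = Device()
dev.set_current()
native = dev.create_stream()  # source of a real stream address for this sketch
wrapped = dev.create_stream(ForeignStream(int(native.handle)))
assert int(wrapped.handle) == int(native.handle)
```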
@@ -1259,7 +1259,7 @@ def create_event(self, options: Optional[EventOptions] = None) -> Event: return Event._init(self._id, self.context._handle, options) @precondition(_check_context_initialized) - def allocate(self, size, stream=None) -> Buffer: + def allocate(self, size, stream: Optional[Stream] = None) -> Buffer: """Allocate device memory from a specified stream. Allocates device memory of `size` bytes on the specified `stream` diff --git a/cuda_core/cuda/core/experimental/_launch_config.py b/cuda_core/cuda/core/experimental/_launch_config.py index 43b7c2109..9640e4a67 100644 --- a/cuda_core/cuda/core/experimental/_launch_config.py +++ b/cuda_core/cuda/core/experimental/_launch_config.py @@ -50,7 +50,8 @@ class LaunchConfig: shmem_size : int, optional Dynamic shared-memory size per thread block in bytes. (Default to size 0) - + cooperative_launch : bool, optional + Whether this config can be used to launch a cooperative kernel. """ # TODO: expand LaunchConfig to include other attributes diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py index 1177d6034..5f191e017 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.py @@ -2,11 +2,12 @@ # # SPDX-License-Identifier: Apache-2.0 +from typing import Union from cuda.core.experimental._kernel_arg_handler import ParamHolder from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config from cuda.core.experimental._module import Kernel -from cuda.core.experimental._stream import Stream +from cuda.core.experimental._stream import IsStreamT, Stream, _try_to_get_stream_ptr from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( _reduce_3_tuple, @@ -34,7 +35,7 @@ def _lazy_init(): _inited = True -def launch(stream, config, kernel, *kernel_args): +def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kernel, *kernel_args): """Launches a :obj:`~_module.Kernel` object with launch-time configuration. @@ -43,7 +44,7 @@ def launch(stream, config, kernel, *kernel_args): stream : :obj:`~_stream.Stream` The stream establishing the stream ordering semantic of a launch. - config : :obj:`~_launcher.LaunchConfig` + config : :obj:`LaunchConfig` Launch configurations inline with options provided by :obj:`~_launcher.LaunchConfig` dataclass. kernel : :obj:`~_module.Kernel` @@ -55,13 +56,15 @@ def launch(stream, config, kernel, *kernel_args): """ if stream is None: raise ValueError("stream cannot be None, stream must either be a Stream object or support __cuda_stream__") - if not isinstance(stream, Stream): + try: + stream_handle = stream.handle + except AttributeError: try: - stream = Stream._init(stream) - except Exception as e: + stream_handle = _try_to_get_stream_ptr(stream) + except Exception: raise ValueError( f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})" - ) from e + ) from None assert_type(kernel, Kernel) _lazy_init() config = check_or_create_options(LaunchConfig, config, "launch config") @@ -78,7 +81,7 @@ def launch(stream, config, kernel, *kernel_args): # rich. 
if _use_ex: drv_cfg = _to_native_launch_config(config) - drv_cfg.hStream = stream.handle + drv_cfg.hStream = stream_handle if config.cooperative_launch: _check_cooperative_launch(kernel, config, stream) handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0)) @@ -86,7 +89,7 @@ # TODO: check if config has any unsupported attrs handle_return( driver.cuLaunchKernel( - int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0 + int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream_handle, args_ptr, 0 ) ) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index e214da0f8..0ef751a09 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -9,17 +9,18 @@ from typing import Optional, Tuple, TypeVar, Union from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule -from cuda.core.experimental._stream import default_stream +from cuda.core.experimental._stream import Stream, default_stream from cuda.core.experimental._utils.cuda_utils import driver, handle_return -PyCapsule = TypeVar("PyCapsule") - - # TODO: define a memory property mixin class and make Buffer and # MemoryResource both inherit from it + +PyCapsule = TypeVar("PyCapsule") +"""Represent the capsule type.""" + DevicePointerT = Union[driver.CUdeviceptr, int, None] -"""A type union of `Cudeviceptr`, `int` and `None` for hinting Buffer.handle.""" +"""A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" class Buffer: @@ -29,19 +30,7 @@ class Buffer: different memory resources are to give access to their memory allocations. - Support for data interchange mechanisms are provided by - establishing both the DLPack and the Python-level buffer - protocols. - - Parameters - ---------- - ptr : Any - Allocated buffer handle object - size : Any - Memory size of the buffer - mr : :obj:`~_memory.MemoryResource`, optional - Memory resource associated with the buffer - + Support for data interchange mechanisms is provided by DLPack. """ class _MembersNeededForFinalize: @@ -55,8 +44,6 @@ def __init__(self, buffer_obj, ptr, size, mr): def close(self, stream=None): if self.ptr and self.mr is not None: - if stream is None: - stream = default_stream() self.mr.deallocate(self.ptr, self.size, stream) self.ptr = 0 self.mr = None @@ -64,10 +51,16 @@ # TODO: handle ownership? (_mr could be None) __slots__ = ("__weakref__", "_mnff") - def __init__(self, ptr, size, mr: MemoryResource = None): + def __new__(cls, *args, **kwargs): + raise RuntimeError("Buffer objects cannot be instantiated directly. Please use MemoryResource APIs.") + + @classmethod + def _init(cls, ptr: DevicePointerT, size: int, mr: Optional[MemoryResource] = None): + self = super().__new__(cls) self._mnff = Buffer._MembersNeededForFinalize(self, ptr, size, mr) + return self - def close(self, stream=None): + def close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. This buffer is released back to their memory resource @@ -75,11 +68,9 @@ Parameters ---------- - stream : Any, optional - The stream object with a __cuda_stream__ protocol to - use for asynchronous deallocation. Defaults to using - the default stream. - + stream : Stream, optional + The stream object to use for asynchronous deallocation. 
If None, + the behavior depends on the underlying memory resource. """ self._mnff.close(stream) @@ -95,7 +86,7 @@ def handle(self) -> DevicePointerT: return self._mnff.ptr @property - def size(self): + def size(self) -> int: """Return the memory size of this buffer.""" return self._mnff.size @@ -125,7 +116,7 @@ def device_id(self) -> int: return self._mnff.mr.device_id raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - def copy_to(self, dst: Buffer = None, *, stream) -> Buffer: + def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. Copies the data from this buffer to the provided dst buffer. @@ -136,7 +127,7 @@ ---------- dst : :obj:`~_memory.Buffer` Source buffer to copy data from - stream : Any + stream : Stream Keyword argument specifying the stream for the asynchronous copy @@ -154,14 +145,14 @@ handle_return(driver.cuMemcpyAsync(dst._mnff.ptr, self._mnff.ptr, self._mnff.size, stream.handle)) return dst - def copy_from(self, src: Buffer, *, stream): + def copy_from(self, src: Buffer, *, stream: Stream): """Copy from the src buffer to this buffer asynchronously on the given stream. Parameters ---------- src : :obj:`~_memory.Buffer` Source buffer to copy data from - stream : Any + stream : Stream Keyword argument specifying the stream for the asynchronous copy @@ -219,107 +210,247 @@ def __release_buffer__(self, buffer: memoryview, /): # Supporting method paired with __buffer__. raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.") + @staticmethod + def from_handle(ptr: DevicePointerT, size: int, mr: Optional[MemoryResource] = None) -> Buffer: + """Create a new :class:`Buffer` object from a pointer. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + Allocated buffer handle object + size : int + Memory size of the buffer + mr : :obj:`~_memory.MemoryResource`, optional + Memory resource associated with the buffer + """ + return Buffer._init(ptr, size, mr=mr) + class MemoryResource(abc.ABC): + """Abstract base class for memory resources that manage allocation and deallocation of buffers. + + Subclasses must implement methods for allocation and deallocation, as well as the properties + associated with this memory resource, which all allocated buffers inherit. (Since every + :class:`Buffer` instance returned by the :meth:`allocate` method holds a reference to the + memory resource that created it, buffer properties are retrieved simply by looking up the + underlying memory resource's respective property.) + """ + __slots__ = ("_handle",) @abc.abstractmethod - def __init__(self, *args, **kwargs): ... + def __init__(self, *args, **kwargs): + """Initialize the memory resource. + + Subclasses may use additional arguments to configure the resource. + """ + ... @abc.abstractmethod - def allocate(self, size, stream=None) -> Buffer: ... + def allocate(self, size: int, stream: Stream = None) -> Buffer: + """Allocate a buffer of the requested size. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : Stream, optional + The stream on which to perform the allocation asynchronously. + If None, it is up to each memory resource implementation to decide + and document the behavior. 
+ + Returns + ------- + Buffer + The allocated buffer object, which can be used for device or host operations + depending on the resource's properties. + """ + ... @abc.abstractmethod - def deallocate(self, ptr, size, stream=None): ... + def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : Stream, optional + The stream on which to perform the deallocation asynchronously. + If None, it is up to each memory resource implementation to decide + and document the behavior. + """ + ... @property @abc.abstractmethod def is_device_accessible(self) -> bool: - # Check if the buffers allocated from this MR can be accessed from - # GPUs. + """bool: True if buffers allocated by this resource can be accessed on the device.""" ... @property @abc.abstractmethod def is_host_accessible(self) -> bool: - # Check if the buffers allocated from this MR can be accessed from - # CPUs. + """bool: True if buffers allocated by this resource can be accessed on the host.""" ... @property @abc.abstractmethod def device_id(self) -> int: - # Return the device ID if this MR is for single devices. Raise an - # exception if it is not. + """int: The device ordinal for which this memory resource is responsible. + + Raises + ------ + RuntimeError + If the resource is not bound to a specific device. + """ ... -class _DefaultAsyncMempool(MemoryResource): +class DeviceMemoryResource(MemoryResource): + """Create a device memory resource that uses the driver's stream-ordered memory pool. + + Parameters + ---------- + device_id : int + Device ordinal for which a memory resource is constructed. The mempool that is + set to *current* on ``device_id`` is used. If no mempool is set to current yet, + the driver uses the *default* mempool on the device. + """ + __slots__ = ("_dev_id",) - def __init__(self, dev_id): - self._handle = handle_return(driver.cuDeviceGetMemPool(dev_id)) - self._dev_id = dev_id + def __init__(self, device_id: int): + self._handle = handle_return(driver.cuDeviceGetMemPool(device_id)) + self._dev_id = device_id - def allocate(self, size, stream=None) -> Buffer: + def allocate(self, size: int, stream: Stream = None) -> Buffer: + """Allocate a buffer of the requested size. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : Stream, optional + The stream on which to perform the allocation asynchronously. + If None, an internal stream is used. + + Returns + ------- + Buffer + The allocated buffer object, which is accessible on the device that this memory + resource was created for. + """ if stream is None: stream = default_stream() ptr = handle_return(driver.cuMemAllocFromPoolAsync(size, self._handle, stream.handle)) - return Buffer(ptr, size, self) + return Buffer._init(ptr, size, self) - def deallocate(self, ptr, size, stream=None): + def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : Stream, optional + The stream on which to perform the deallocation asynchronously. + If None, an internal stream is used. 
+ """ if stream is None: stream = default_stream() handle_return(driver.cuMemFreeAsync(ptr, stream.handle)) @property def is_device_accessible(self) -> bool: + """bool: this memory resource provides device-accessible buffers.""" return True @property def is_host_accessible(self) -> bool: + """bool: this memory resource does not provides host-accessible buffers.""" return False @property def device_id(self) -> int: + """int: the associated device ordinal.""" return self._dev_id -class _DefaultPinnedMemorySource(MemoryResource): +class LegacyPinnedMemoryResource(MemoryResource): + """Create a pinned memory resource that uses legacy cuMemAllocHost/cudaMallocHost + APIs. + """ + def __init__(self): # TODO: support flags from cuMemHostAlloc? self._handle = None - def allocate(self, size, stream=None) -> Buffer: + def allocate(self, size: int, stream: Stream = None) -> Buffer: + """Allocate a buffer of the requested size. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : Stream, optional + Currently ignored + + Returns + ------- + Buffer + The allocated buffer object, which is accessible on both host and device. + """ ptr = handle_return(driver.cuMemAllocHost(size)) - return Buffer(ptr, size, self) + return Buffer._init(ptr, size, self) - def deallocate(self, ptr, size, stream=None): + def deallocate(self, ptr: DevicePointerT, size: int, stream: Stream = None): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : Stream, optional + The stream on which to perform the deallocation asynchronously. + If None, no synchronization would happen. 
+ """ + if stream: + stream.sync() handle_return(driver.cuMemFreeHost(ptr)) @property def is_device_accessible(self) -> bool: + """bool: this memory resource provides device-accessible buffers.""" return True @property def is_host_accessible(self) -> bool: + """bool: this memory resource provides host-accessible buffers.""" return True @property def device_id(self) -> int: + """This memory resource is not bound to any GPU.""" raise RuntimeError("a pinned memory resource is not bound to any GPU") class _SynchronousMemoryResource(MemoryResource): __slots__ = ("_dev_id",) - def __init__(self, dev_id): + def __init__(self, device_id): self._handle = None - self._dev_id = dev_id + self._dev_id = device_id def allocate(self, size, stream=None) -> Buffer: ptr = handle_return(driver.cuMemAlloc(size)) - return Buffer(ptr, size, self) + return Buffer._init(ptr, size, self) def deallocate(self, ptr, size, stream=None): if stream is None: diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 029505da6..ea488f9fc 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -8,7 +8,7 @@ import warnings import weakref from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Protocol, Tuple, Union if TYPE_CHECKING: import cuda.bindings @@ -44,6 +44,47 @@ class StreamOptions: priority: Optional[int] = None +class IsStreamT(Protocol): + def __cuda_stream__(self) -> Tuple[int, int]: + """ + For any Python object that is meant to be interpreted as a CUDA stream, the intent + can be communicated by implementing this protocol that returns a 2-tuple: The protocol + version number (currently ``0``) and the address of ``cudaStream_t``. Both values + should be Python `int`. + """ + ... + + +def _try_to_get_stream_ptr(obj: IsStreamT): + try: + cuda_stream_attr = obj.__cuda_stream__ + except AttributeError: + raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") from None + + if callable(cuda_stream_attr): + info = cuda_stream_attr() + else: + info = cuda_stream_attr + warnings.simplefilter("once", DeprecationWarning) + warnings.warn( + "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method", + stacklevel=3, + category=DeprecationWarning, + ) + + try: + len_info = len(info) + except TypeError as e: + raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}") from e + if len_info != 2: + raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements") + if info[0] != 0: + raise RuntimeError( + f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" + ) + return driver.CUstream(info[1]) + + class Stream: """Represent a queue of GPU operations that are executed in a specific order. 
@@ -108,42 +149,14 @@ def _per_thread_default(cls): return self @classmethod - def _init(cls, obj=None, *, options: Optional[StreamOptions] = None): + def _init(cls, obj: Optional[IsStreamT] = None, *, options: Optional[StreamOptions] = None): self = super().__new__(cls) self._mnff = Stream._MembersNeededForFinalize(self, None, None, False) if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - cuda_stream_attr = getattr(obj, "__cuda_stream__", None) - if cuda_stream_attr is None: - raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") - if callable(cuda_stream_attr): - info = cuda_stream_attr() - else: - info = cuda_stream_attr - warnings.simplefilter("once", DeprecationWarning) - warnings.warn( - "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method", - stacklevel=3, - category=DeprecationWarning, - ) - try: - len_info = len(info) - except Exception as e: - raise RuntimeError( - f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}" - ) from e - if len_info != 2: - raise RuntimeError( - f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements" - ) - if info[0] != 0: - raise RuntimeError( - f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" - ) - - self._mnff.handle = driver.CUstream(info[1]) + self._mnff.handle = _try_to_get_stream_ptr(obj) # TODO: check if obj is created under the current context/device self._mnff.owner = obj self._nonblocking = None # delayed diff --git a/cuda_core/docs/source/_templates/autosummary/protocol.rst b/cuda_core/docs/source/_templates/autosummary/protocol.rst new file mode 100644 index 000000000..13ce267a6 --- /dev/null +++ b/cuda_core/docs/source/_templates/autosummary/protocol.rst @@ -0,0 +1,28 @@ +.. SPDX-License-Identifier: Apache-2.0 + +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoprotocol:: {{ objname }} + + {% block methods %} + {% if methods %} + .. rubric:: {{ _('Methods') }} + + {% for item in methods %} + .. automethod:: {{ item }} + {%- endfor %} + + {% endif %} + {% endblock %} + + {% block attributes %} + {% if attributes %} + .. rubric:: {{ _('Attributes') }} + + {% for item in attributes %} + .. autoproperty:: {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 2b82dd066..7f9f814d4 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -20,6 +20,10 @@ CUDA runtime Graph GraphBuilder launch + Buffer + MemoryResource + DeviceMemoryResource + LegacyPinnedMemoryResource :template: dataclass.rst diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index 94f5a5df1..8cc27c83a 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -15,11 +15,11 @@ CUDA runtime .. autosummary:: :toctree: generated/ - _memory.Buffer _stream.Stream + _memory.PyCapsule + _memory.DevicePointerT _event.Event _device.DeviceProperties - _launch_config.LaunchConfig _module.KernelAttributes _module.KernelOccupancy _module.ParamInfo @@ -33,4 +33,13 @@ CUDA compilation toolchain :toctree: generated/ _module.Kernel - _module.ObjectCode + + +CUDA protocols +-------------- + +.. 
autosummary:: + :toctree: generated/ + :template: protocol.rst + + _stream.IsStreamT diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index ac0ce27e3..6945f47fd 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -40,6 +40,7 @@ "myst_nb", "enum_tools.autoenum", "sphinx_copybutton", + "sphinx_toolbox.more_autodoc.autoprotocol", ] # Add any paths that contain templates here, relative to this directory. diff --git a/cuda_core/docs/source/contribute.rst b/cuda_core/docs/source/contribute.rst index a7a10b3d3..2965afb50 100644 --- a/cuda_core/docs/source/contribute.rst +++ b/cuda_core/docs/source/contribute.rst @@ -1,5 +1,7 @@ .. SPDX-License-Identifier: Apache-2.0 +.. _contributor_guide: + Contributing ------------ diff --git a/cuda_core/docs/source/interoperability.rst b/cuda_core/docs/source/interoperability.rst index 01e932842..a853e6481 100644 --- a/cuda_core/docs/source/interoperability.rst +++ b/cuda_core/docs/source/interoperability.rst @@ -34,10 +34,10 @@ in Python. While we encourage new Python projects to start using streams (and ot CUDA types) from ``cuda.core``, we understand that there are already several projects exposing their own stream types. -To address this issue, we propose the ``__cuda_stream__`` protocol (currently version -0) as follows: For any Python objects that are meant to be interpreted as a stream, they -should add a ``__cuda_stream__`` method that returns a 2-tuple: The version number -(``0``) and the address of ``cudaStream_t`` (both as Python `int`): +To address this issue, we propose the :attr:`~_stream.IsStreamT.__cuda_stream__` protocol +(currently version 0) as follows: For any Python object that is meant to be interpreted +as a stream, it should add a ``__cuda_stream__`` *method* that returns a 2-tuple: the +version number (``0``) and the address of ``cudaStream_t`` (both as Python `int`): .. code-block:: python @@ -48,11 +48,11 @@ should add a ``__cuda_stream__`` method that returns a 2-tuple: The version numb ... -Then such objects can be understood by ``cuda.core`` anywhere a stream-like object -is needed. +Then such objects can be understood and wrapped by :meth:`Device.create_stream`. -We suggest all existing Python projects that expose a stream class to also support this -protocol wherever a function takes a stream. +We suggest that all existing Python projects that already expose a stream class also support +this protocol. For new Python projects that need to access CUDA streams, we encourage you +to use :class:`~_stream.Stream` from ``cuda.core`` directly. Memory view utilities for CPU/GPU buffers diff --git a/cuda_core/docs/source/release/0.3.0-notes.rst b/cuda_core/docs/source/release/0.3.0-notes.rst index 856cdde1e..5176e1e17 100644 --- a/cuda_core/docs/source/release/0.3.0-notes.rst +++ b/cuda_core/docs/source/release/0.3.0-notes.rst @@ -7,10 +7,11 @@ Released on MM DD, 2025 + Highlights ---------- -- Starting this release ``cuda.core`` is licensed under Apache 2.0. +- Starting with this release, ``cuda.core`` is licensed under Apache 2.0. The biggest implication of this change is that we are open to external contributions now! Please follow the :ref:`Contributor Guide <contributor_guide>` for detailed instructions. - Initial support for CUDA graphs (phase 1). - In this release, we support building a CUDA graph that captures kernel launches. The captured graph can be replayed to reduce latency. Graph split/join and conditional nodes are supported. 
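For context on the launch-path changes in this PR, a condensed end-to-end sketch of compiling and launching a kernel with the updated APIs (the kernel source and option values are illustrative only):

```python
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch

dev = Device()
dev.set_current()
stream = dev.create_stream()

# Illustrative no-op kernel.
code = 'extern "C" __global__ void noop() {}'
arch = "".join(f"{i}" for i in dev.compute_capability)
prog = Program(code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}"))
kernel = prog.compile(target_type="cubin").get_kernel("noop")

config = LaunchConfig(grid=1, block=1)
launch(stream, config, kernel)  # also accepts any object implementing __cuda_stream__
stream.sync()
```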
@@ -19,24 +20,41 @@ Highlights Breaking Changes ---------------- +- The :class:`Buffer` object's ``__init__()`` method is removed; see below. +- The :class:`Buffer` object's :meth:`~Buffer.close` method and destructor now always defer to the underlying memory resource implementation + to decide the behavior if a stream is not explicitly passed. Previously, in this case it always used the default stream, which could + interfere with the memory resource's assumptions. + New features ------------ -- :class:`Kernel` adds :attr:`Kernel.num_arguments` and :attr:`Kernel.arguments_info` for introspection of kernel arguments. (#612) +- :class:`~_module.Kernel` adds :attr:`~_module.Kernel.num_arguments` and :attr:`~_module.Kernel.arguments_info` for introspection of kernel arguments. (#612) - Add pythonic access to kernel occupancy calculation functions via :attr:`Kernel.occupancy`. (#648) - Support launching cooperative kernels by setting :attr:`LaunchConfig.cooperative_launch` to `True`. -- A name can be assigned to :class:`ObjectCode` instances generated by both :class:`Program` and :class:`Linker` through their respective - options. +- A name can be assigned to :class:`ObjectCode` instances generated by both :class:`Program` and :class:`Linker` through their respective options. +- Expose :class:`Buffer`, :class:`DeviceMemoryResource`, :class:`LegacyPinnedMemoryResource`, and :class:`MemoryResource` to the top namespace. + - Before this release, the internal :class:`Buffer` class had an ``__init__()`` constructor. To align with the design of cuda.core objects, + this constructor is removed starting with this release. Users who still need the old behavior should use the :meth:`~Buffer.from_handle` + alternative constructor. +- Add a typing annotation for the :attr:`~_stream.IsStreamT.__cuda_stream__` protocol. New examples ------------ +- Add a PyTorch-based example. +- Split the :class:`StridedMemoryView` example into two (CPU/GPU). + Fixes and enhancements ---------------------- +- ``cuda.core`` now raises clearer and more actionable error messages whenever possible. +- :class:`ObjectCode` can now be pickled. - Look-up of the :attr:`Event.device` and :attr:`Event.context` (the device and CUDA context where an event was created from) is now possible. +- :class:`Event`-based timing is made more robust (also with better error messages). - The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed. +- :attr:`ProgramOptions.ptxas_options` can now accept more than one argument. - The :class:`Device` constructor is made faster. +- The CFFI-based example no longer leaves the intermediate files on disk after it finishes. 
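The example and test updates below follow two recurring patterns: foreign stream wrappers are now passed through `Device.create_stream()` before use, and pinned host memory comes from the now-public `LegacyPinnedMemoryResource`. The allocation pattern, as used verbatim in the tests:

```python
import numpy as np

from cuda.core.experimental import LegacyPinnedMemoryResource

mr = LegacyPinnedMemoryResource()
b = mr.allocate(4)  # 4 bytes of pinned host memory
arr = np.from_dlpack(b).view(np.int32)  # zero-copy NumPy view of the buffer
arr[0] = 42
b.close()  # legacy pinned free; no stream required
```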
diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py index 76a972859..4e8ce5533 100644 --- a/cuda_core/examples/pytorch_example.py +++ b/cuda_core/examples/pytorch_example.py @@ -40,7 +40,7 @@ def __cuda_stream__(self): return (0, stream_id) # Return format required by CUDA Python -s = PyTorchStreamWrapper(pt_stream) +s = dev.create_stream(PyTorchStreamWrapper(pt_stream)) # prepare program arch = "".join(f"{i}" for i in dev.compute_capability) diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index 2d790592c..a9b006b31 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -87,7 +87,7 @@ def __cuda_stream__(self): a = cp.random.random(size, dtype=dtype) b = cp.random.random(size, dtype=dtype) c = cp.empty_like(a) -cp_stream0 = StreamAdaptor(cp.cuda.get_current_stream()) +cp_stream0 = dev0.create_stream(StreamAdaptor(cp.cuda.get_current_stream())) # Establish a stream order to ensure that memory has been initialized before # accessed by the kernel. @@ -102,7 +102,7 @@ def __cuda_stream__(self): x = cp.random.random(size, dtype=dtype) y = cp.random.random(size, dtype=dtype) z = cp.empty_like(a) -cp_stream1 = StreamAdaptor(cp.cuda.get_current_stream()) +cp_stream1 = dev1.create_stream(StreamAdaptor(cp.cuda.get_current_stream())) # Establish a stream order stream1.wait(cp_stream1) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 32d450102..9cee936d8 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -10,8 +10,15 @@ from conftest import skipif_need_cuda_headers import cuda.core.experimental -from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch -from cuda.core.experimental._memory import _DefaultPinnedMemorySource +from cuda.core.experimental import ( + Device, + EventOptions, + LaunchConfig, + LegacyPinnedMemoryResource, + Program, + ProgramOptions, + launch, +) def test_event_init_disabled(): @@ -143,7 +150,7 @@ def test_error_timing_incomplete(): mod = prog.compile(target_type="cubin") ker = mod.get_kernel("wait") - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(4) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py index 97f7c5c60..396dbc99f 100644 --- a/cuda_core/tests/test_graph.py +++ b/cuda_core/tests/test_graph.py @@ -15,11 +15,11 @@ GraphCompleteOptions, GraphDebugPrintOptions, LaunchConfig, + LegacyPinnedMemoryResource, Program, ProgramOptions, launch, ) -from cuda.core.experimental._memory import _DefaultPinnedMemorySource from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return @@ -174,7 +174,7 @@ def test_graph_repeat_capture(init_cuda): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(4) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 @@ -225,7 +225,7 @@ def test_graph_conditional_if(init_cuda, condition_value): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(8) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 @@ -287,7 +287,7 @@ def test_graph_conditional_if_else(init_cuda, condition_value): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + 
mr = LegacyPinnedMemoryResource() b = mr.allocate(8) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 @@ -359,7 +359,7 @@ def test_graph_conditional_switch(init_cuda, condition_value): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(12) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 @@ -451,7 +451,7 @@ def test_graph_conditional_while(init_cuda, condition_value): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(4) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 @@ -497,7 +497,7 @@ def test_graph_child_graph(init_cuda): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(8) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 @@ -550,7 +550,7 @@ def test_graph_update(init_cuda): # Allocate memory launch_stream = Device().create_stream() - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(12) arr = np.from_dlpack(b).view(np.int32) arr[0] = 0 diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index 635d1fcf5..3a02065de 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -9,8 +9,7 @@ import pytest from conftest import skipif_need_cuda_headers -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch -from cuda.core.experimental._memory import _DefaultPinnedMemorySource +from cuda.core.experimental import Device, LaunchConfig, LegacyPinnedMemoryResource, Program, ProgramOptions, launch def test_launch_config_init(init_cuda): @@ -111,7 +110,7 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value): dev.set_current() # Prepare pinned host array - mr = _DefaultPinnedMemorySource() + mr = LegacyPinnedMemoryResource() b = mr.allocate(np.dtype(python_type).itemsize) arr = np.from_dlpack(b).view(python_type) arr[:] = 0 diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 5bcc607da..3169c1d19 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -10,8 +10,8 @@ import pytest -from cuda.core.experimental import Device -from cuda.core.experimental._memory import Buffer, DLDeviceType, MemoryResource +from cuda.core.experimental import Buffer, Device, MemoryResource +from cuda.core.experimental._memory import DLDeviceType from cuda.core.experimental._utils.cuda_utils import handle_return @@ -21,7 +21,7 @@ def __init__(self, device): def allocate(self, size, stream=None) -> Buffer: ptr = handle_return(driver.cuMemAlloc(size)) - return Buffer(ptr=ptr, size=size, mr=self) + return Buffer.from_handle(ptr=ptr, size=size, mr=self) def deallocate(self, ptr, size, stream=None): handle_return(driver.cuMemFree(ptr)) @@ -46,7 +46,7 @@ def __init__(self): def allocate(self, size, stream=None) -> Buffer: # Allocate a ctypes buffer of size `size` ptr = (ctypes.c_byte * size)() - return Buffer(ptr=ptr, size=size, mr=self) + return Buffer.from_handle(ptr=ptr, size=size, mr=self) def deallocate(self, ptr, size, stream=None): # the memory is deallocated per the ctypes deallocation at garbage collection time @@ -71,7 +71,7 @@ def __init__(self, device): def allocate(self, size, stream=None) -> Buffer: ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) - return Buffer(ptr=ptr, 
size=size, mr=self) + return Buffer.from_handle(ptr=ptr, size=size, mr=self) def deallocate(self, ptr, size, stream=None): handle_return(driver.cuMemFree(ptr)) @@ -95,7 +95,7 @@ def __init__(self, device): def allocate(self, size, stream=None) -> Buffer: ptr = handle_return(driver.cuMemAllocHost(size)) - return Buffer(ptr=ptr, size=size, mr=self) + return Buffer.from_handle(ptr=ptr, size=size, mr=self) def deallocate(self, ptr, size, stream=None): handle_return(driver.cuMemFreeHost(ptr))
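Finally, a short sketch of the deallocation-behavior change called out in the release notes: passing a stream to `Buffer.close()` orders the free explicitly, while omitting it now defers to the memory resource's documented default rather than the previously implicit default stream (the size and stream here are arbitrary):

```python
from cuda.core.experimental import Device

dev = Device()
dev.set_current()
stream = dev.create_stream()

# Device.allocate() draws from the device's current memory resource
# (a DeviceMemoryResource on devices with memory-pool support).
buf = dev.allocate(64, stream=stream)

# Passing the stream orders the free after preceding work on that stream.
buf.close(stream)
stream.sync()
```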