From cf3edc92f4f660ebfe16b1e3821d521cc6d673a5 Mon Sep 17 00:00:00 2001 From: "yan.yan" Date: Sun, 18 Aug 2024 16:33:53 +0800 Subject: [PATCH] v0.6.3: fix macos inline kernel bug --- CHANGELOG.md | 8 ++++++-- cumm/inliner/__init__.py | 31 +++++++++++++++++++++++-------- cumm/nvrtc/__init__.py | 5 ----- version.txt | 2 +- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6097971..5f9e60a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,13 @@ # Changelog -## [0.6.2] - 2024-08-27 +## [0.6.3] - 2024-08-18 +### Fixed +- fix macOS bug in inline kernels + +## [0.6.2] - 2024-08-17 ### Fixed - fix mac os bug -## [0.6.1] - 2024-08-27 +## [0.6.1] - 2024-08-17 ### Changed - debug macos ci diff --git a/cumm/inliner/__init__.py b/cumm/inliner/__init__.py index c9d9d7d..49e9bae 100644 --- a/cumm/inliner/__init__.py +++ b/cumm/inliner/__init__.py @@ -284,11 +284,15 @@ def type_conversion_code(self, obj, src_name: str, tgt_name: str, user_arg: Opti # if we use ten.data_ptr(), the result will be # MTLBuffer + data_offset, which will cause # segfault. + tv_dtype_str = _cached_get_torch_dtype_to_tv_str()[obj.dtype] + itemsize = obj.itemsize + # storage_offset() is counted in elements, not bytes, so we need to multiply it by itemsize + # when using the raw device pointer. 
res.extend([ f"__{tgt_name}_tmp0 = {src_name}", f"assert __{tgt_name}_tmp0.dtype == {obj.dtype}", f"{tgt_name} = (EMPTY_TENSOR, kDevicePointer, ", - f" __{tgt_name}_tmp0.untyped_storage().data_ptr(), __{tgt_name}_tmp0.storage_offset())", + f" __{tgt_name}_tmp0.untyped_storage().data_ptr(), __{tgt_name}_tmp0.storage_offset() * {itemsize})", ]) else: res.extend([ @@ -301,6 +305,7 @@ def type_conversion_code(self, obj, src_name: str, tgt_name: str, user_arg: Opti "import torch", "from cumm import tensorview as tv", "kDevicePointer = tv._NVRTCModule.kDevicePointer", + # "kTensor = tv._NVRTCModule.kTensor", "EMPTY_TENSOR = tv.Tensor()", ] else: @@ -400,15 +405,25 @@ def type_conversion_code(self, obj, src_name: str, tgt_name: str, user_arg: Opti raise NotImplementedError assert user_arg is not None and isinstance(user_arg, _NVRTCInlineParams) if user_arg.unchecked_mode: - res.extend([ - f"__{tgt_name}_tmp0 = {src_name}", - # f"assert isinstance(__{tgt_name}_tmp0, {obj_type_str})", - f"{tgt_name} = (tv.full([1], __{tgt_name}_tmp0, {tv_dtype}), kScalar, ", - f" 0, 0)", - ]) + if isinstance(obj, bool): + # bools are func constants (only used in apple metal) + res.extend([ + f"__{tgt_name}_tmp0 = {src_name}", + f"assert isinstance(__{tgt_name}_tmp0, bool)", + f"{tgt_name} = (tv.full([1], __{tgt_name}_tmp0, tv.uint8), kConstant, ", + f" 0, 0)", + ]) + else: + res.extend([ + f"__{tgt_name}_tmp0 = {src_name}", + # f"assert isinstance(__{tgt_name}_tmp0, {obj_type_str})", + f"{tgt_name} = (tv.full([1], __{tgt_name}_tmp0, {tv_dtype}), kScalar, ", + f" 0, 0)", + ]) return "\n".join(res), [ "from cumm import tensorview as tv", - "kScalar = tv._NVRTCModule.kScalar" + "kScalar = tv._NVRTCModule.kScalar", + "kConstant = tv._NVRTCModule.kConstant" ] else: return f"{tgt_name} = {src_name}", [] diff --git a/cumm/nvrtc/__init__.py b/cumm/nvrtc/__init__.py index 28e12e5..37eed50 100644 --- a/cumm/nvrtc/__init__.py +++ b/cumm/nvrtc/__init__.py @@ -735,12 +735,7 @@ def __init__(self, 
self.name_to_meta = self.params.name_to_meta def load(self): - import llvmlite.binding as llvm - _lazy_load_llvm() - # use clang++ to get ir opts = self.params.opts - _lazy_load_lib_for_llvm(self.params.libraries, - self.params.libpaths) with tempfile.TemporaryDirectory() as fdir: inc_dir = Path(fdir) / "include" for k, v in self.params.headers.items(): diff --git a/version.txt b/version.txt index b1d7abc..a0a1517 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.6.2 \ No newline at end of file +0.6.3 \ No newline at end of file