fix many bugs, add some parallel ops and linalg func
FindDefinition committed Aug 11, 2024
1 parent d0b7eb3 commit 5f458bf
Showing 18 changed files with 1,343 additions and 292 deletions.
20 changes: 10 additions & 10 deletions cumm/__init__.py
@@ -22,13 +22,13 @@
from cumm.constants import PACKAGE_ROOT
from ccimport import compat

# if project_is_installed(PACKAGE_NAME) and project_is_editable(
# PACKAGE_NAME) and not CUMM_DISABLE_JIT:
# from cumm.csrc.arrayref import ArrayPtr
# from cumm.tensorview_bind import TensorViewBind, AppleMetalImpl
# pccm.builder.build_pybind([ArrayPtr(), TensorViewBind(), AppleMetalImpl()],
# PACKAGE_ROOT / "core_cc",
# namespace_root=PACKAGE_ROOT,
# load_library=False,
# std="c++17" if compat.InMacOS else "c++14",
# verbose=False)
if project_is_installed(PACKAGE_NAME) and project_is_editable(
PACKAGE_NAME) and not CUMM_DISABLE_JIT:
from cumm.csrc.arrayref import ArrayPtr
from cumm.tensorview_bind import TensorViewBind, AppleMetalImpl
pccm.builder.build_pybind([ArrayPtr(), TensorViewBind(), AppleMetalImpl()],
PACKAGE_ROOT / "core_cc",
namespace_root=PACKAGE_ROOT,
load_library=False,
std="c++17" if compat.InMacOS else "c++14",
verbose=False)
59 changes: 54 additions & 5 deletions cumm/inliner/__init__.py
@@ -143,14 +143,23 @@ def _cached_get_torch_dtype_to_tv():
return _TORCH_DTYPE_TO_TV


RESERVED_NAMES = set([
"threadPositionInGrid",
"threadGroupPositionInGrid",
])

class CUDAMode(enum.Enum):
Kernel1D = "Kernel1D"
Kernel2D = "Kernel2D"
Kernel3D = "Kernel3D"

KernelRaw = "KernelRaw"


def torch_tensor_to_tv(ten,
dtype: Optional[int] = None,
shape: Optional[List[int]] = None):
shape: Optional[List[int]] = None,
to_const: bool = False):
# assert ten.is_contiguous(), "must be contiguous tensor"
device = ten.device
if device.type == "cpu":
@@ -175,9 +184,41 @@ def torch_tensor_to_tv(ten,
shape = list(ten.shape)
if dtype is None:
dtype = _cached_get_torch_dtype_to_tv()[ten.dtype]
if to_const:
return tv.from_const_blob_strided(ptr, shape, list(ten.stride()), dtype,
tv_device, offset)
return tv.from_blob_strided(ptr, shape, list(ten.stride()), dtype,
tv_device, offset)
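
A hedged usage sketch of the new to_const flag; the tensor below is hypothetical and the JIT-built core_cc bindings are assumed to be available:

    import torch
    from cumm.inliner import torch_tensor_to_tv

    ten = torch.arange(6, dtype=torch.float32).reshape(2, 3)  # any cpu/cuda/mps tensor
    writable_view = torch_tensor_to_tv(ten)                   # tv.Tensor sharing ten's memory
    readonly_view = torch_tensor_to_tv(ten, to_const=True)    # built via tv.from_const_blob_strided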

@contextlib.contextmanager
def measure_and_print_torch(name: str = "CUDATimer", *, stream: int = 0, out: Optional[List[float]] = None, enable: bool = True):
if not enable:
yield
else:
import torch
if compat.IsAppleSiliconMacOs:
start_ev = torch.mps.Event(enable_timing=True)
end_ev = torch.mps.Event(enable_timing=True)
start_ev.record()
yield
end_ev.record()
torch.mps.synchronize()
# TODO: event.synchronize() hangs on MPS, so torch.mps.synchronize() above is used instead
# start_ev.synchronize()
# end_ev.synchronize()
duration = start_ev.elapsed_time(end_ev)
print(f"{name} duration: {duration} ms")
else:
start_ev = torch.cuda.Event(enable_timing=True)
end_ev = torch.cuda.Event(enable_timing=True)

start_ev.record(torch.cuda.default_stream())
yield
end_ev.record(torch.cuda.default_stream())
start_ev.synchronize()
end_ev.synchronize()
duration = start_ev.elapsed_time(end_ev)
print(f"{name} duration: {duration} ms")

def get_current_stream():
import torch
@@ -455,7 +496,7 @@ def get_nvrtc_module(self, name: str) -> Optional[Union[CummNVRTCModule, CummMet
def get_nvrtc_kernel_attrs(self, name: str) -> Dict[str, int]:
nvrtc_mod = self.get_nvrtc_module(name)
assert nvrtc_mod is not None
return nvrtc_mod.get_kernel_attrs(nvrtc_mod.get_lowered_name(_NVRTC_FUNC_NAME_FORMAT.format(name)))
return nvrtc_mod.get_kernel_attrs(nvrtc_mod.get_lowered_name(self._get_nvrtc_inline_func_name_for_debug(name)))

def get_save_root(self,
path: Path,
Expand Down Expand Up @@ -538,6 +579,12 @@ def handle_container_code(self, code_str: str, code: pccm.FunctionCode,
code.arguments = new_args + [
pccm.Argument(self.index_name, "uint32_t", attributes=["thread_position_in_grid"])
]
else:
code.arguments = new_args + [
pccm.Argument("threadPositionInGrid", "uint3", attributes=["thread_position_in_grid"]),
pccm.Argument("threadgroupPositionInGrid", "uint3", attributes=["threadgroup_position_in_grid"]),
pccm.Argument("threadPositionInThreadgroup", "uint3", attributes=["thread_position_in_threadgroup"]),
]
code.code_before_func_def = "\n".join(func_constants)
trycatch_ctx = contextlib.nullcontext()
if is_cpu:
@@ -632,20 +679,22 @@ def run_func(self,
*args,
user_args: Optional[_NVRTCInlineParams] = None):
assert user_args is not None
real_name = self._get_nvrtc_inline_func_name_for_debug(name)
launch = user_args.launch.copy()
if launch.ctx is None or not launch.ctx.has_apple_metal_context():
launch.ctx = self.ctx
if isinstance(func, CummMetalModule):
is_kernel_raw = user_args.mode == CUDAMode.KernelRaw
with self.enter_inliner_scope():
res = func.run_kernel(self._get_nvrtc_inline_func_name_for_debug(name), launch, *args, perf_context=user_args.perf_context)
res = func.run_kernel(real_name, launch, *args, perf_context=user_args.perf_context, use_nonuniform_threadgroup=not is_kernel_raw)
if self._mps_context is not None:
self._mps_context.commit()
return res
else:
if user_args.run_in_process:
return func.run_kernel_in_spawn_process(self._get_nvrtc_inline_func_name_for_debug(name), launch, *args)
return func.run_kernel_in_spawn_process(real_name, launch, *args)
else:
return func.run_kernel(self._get_nvrtc_inline_func_name_for_debug(name), launch, *args, perf_context=user_args.perf_context)
return func.run_kernel(real_name, launch, *args, perf_context=user_args.perf_context)

def kernel_raw(self,
name: str,
8 changes: 5 additions & 3 deletions cumm/nvrtc/__init__.py
@@ -760,7 +760,7 @@ def load(self):
try:
subprocess.check_output([
"xcrun", "-sdk", "macosx", "metal", "-c",
f2.name, *opts, "-std=metal3.1",
f2.name, *opts, "-std=metal3.1", # "-frecord-sources", "-gline-tables-only",
"-I",
str(inc_dir), "-o",
str(out_name)
@@ -784,11 +784,13 @@ def load(self):
def run_kernel(self, name: str, launch: tv.LaunchParam,
*args: Union[Tensor, int, float, List[int], List[float],
Tuple[float, ...], Tuple[int, ...]],
perf_context: Optional[ContextManager] = None):
perf_context: Optional[ContextManager] = None,
use_nonuniform_threadgroup: bool = True):
if self._metal_mod is None:
self.load()
assert self._metal_mod is not None
return self._metal_mod.run_kernel(name, launch, *args, perf_context=perf_context)
return self._metal_mod.run_kernel(name, launch, *args, perf_context=perf_context,
use_nonuniform_threadgroup = use_nonuniform_threadgroup)
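
Presumably use_nonuniform_threadgroup switches the Metal backend between dispatchThreads (non-uniform threadgroup sizes, so the grid need not be a multiple of the threadgroup size) and the classic dispatchThreadgroups path that KernelRaw kernels expect; run_func above passes False exactly for KernelRaw mode. A hedged call sketch, where mod, x_tv, and the launch sizes are hypothetical and tv.LaunchParam(blocks, threads) is assumed:

    from cumm import tensorview as tv
    # mod: an already-built CummMetalModule; x_tv: a tv.Tensor on the Metal device
    launch = tv.LaunchParam((32, 1, 1), (256, 1, 1))
    mod.run_kernel("my_kernel", launch, x_tv, use_nonuniform_threadgroup=False)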


if __name__ == "__main__":
14 changes: 10 additions & 4 deletions cumm/tensorview/__init__.py
@@ -188,7 +188,10 @@ def _run_kernel(mod: Union[_NVRTCModule, _MetalModule], name: str, launch: Launc
Tuple[float, ...], Tuple[int, ...]],
perf_context: Optional[ContextManager] = None,
name_to_meta: Optional[Dict[str, NVRTCKernelMeta]] = None,
lower_name: Optional[str] = None):
lower_name: Optional[str] = None,
use_nonuniform_threadgroup: bool = True):
# WARNING: use_nonuniform_threadgroup is only used on Apple Silicon (Metal); it is ignored elsewhere.

# t = time.time()
metas: List[NVRTCArgMeta] = [NVRTCArgMeta(NVRTCArgBaseType.Scalar, False, -1, [])] * len(args)
if name_to_meta:
@@ -294,7 +297,8 @@ def _run_kernel(mod: Union[_NVRTCModule, _MetalModule], name: str, launch: Launc
# print("preprocess time", time.time() - t)
# t = time.time()
res = mod.run_kernel(name, launch.blocks, launch.threads,
launch.smem, launch.ctx, kernel_args)
launch.smem, launch.ctx, kernel_args,
use_nonuniform_threadgroup)
# print(f"kernel {name} time: {time.time() - t}")
return res

@@ -497,8 +501,10 @@ def run_kernel_unchecked(self, name: str, launch: LaunchParam, *args: Tuple[Tens
def run_kernel(self, name: str, launch: LaunchParam,
*args: Union[Tensor, int, float, List[int], List[float],
Tuple[float, ...], Tuple[int, ...]],
perf_context: Optional[ContextManager] = None):
return _run_kernel(self._mod, name, launch, *args, perf_context=perf_context, name_to_meta=self.name_to_meta)
perf_context: Optional[ContextManager] = None,
use_nonuniform_threadgroup: bool = True):
return _run_kernel(self._mod, name, launch, *args, perf_context=perf_context, name_to_meta=self.name_to_meta,
use_nonuniform_threadgroup=use_nonuniform_threadgroup)

def tensor_scalar(self, val, dtype: int):
return full([1], val, dtype)
4 changes: 2 additions & 2 deletions cumm/tensorview_bind.py
@@ -550,10 +550,10 @@ def bind_tensorview(self):
py::class_<tv::MetalModule, std::shared_ptr<tv::MetalModule>> metal_rtc_m(m, "MetalModule");
metal_rtc_m.def(py::init<tv::Tensor>(), py::arg("binary"))
.def(py::init<std::string, std::vector<std::string>>(), py::arg("code"), py::arg("opts"))
.def(py::init<std::string, std::unordered_map<std::string, std::string>, bool>(), py::arg("code"), py::arg("preprocessorMacros"), py::arg("fastMathEnabled") = true)
.def("run_kernel", &tv::MetalModule::run_kernel, py::arg("name"), py::arg("blocks"),
py::arg("threads"), py::arg("smem_size"), py::arg("ctx"),
py::arg("args"));
py::arg("args"), py::arg("use_nonuniform_threadgroup") = true);
py::enum_<tv::NVRTCModule::ArgType>(nvrtc_m, "ArgType")
.value("kTensor", tv::NVRTCModule::ArgType::kTensor)
2 changes: 1 addition & 1 deletion cumm/tensorview_bind_anno.pyi
@@ -203,7 +203,7 @@ class MetalModule:
...

@overload
def __init__(self, code: str, opts: List[str]):
def __init__(self, code: str, preprocessorMacros: Dict[str, str], fastMathEnabled: bool = True):
...

def run_kernel(self, name: str, blocks: List[int], threads: List[int],
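
For reference, a hedged sketch of the updated constructor overload; the import path, macro name, and kernel source below are hypothetical (the binding may be re-exported differently in cumm), and the macro dict maps macro names to replacement text:

    from cumm import tensorview as tv

    src = r"""
    #include <metal_stdlib>
    using namespace metal;
    kernel void fill(device float* x [[buffer(0)]],
                     uint i [[thread_position_in_grid]]) {
        x[i] = FILL_VALUE;
    }
    """
    # fastMathEnabled defaults to true in the binding
    mod = tv.MetalModule(src, {"FILL_VALUE": "1.0f"}, fastMathEnabled=True)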
93 changes: 86 additions & 7 deletions cumm/utils/array_grad_check.py
@@ -3,9 +3,12 @@
from cumm.inliner import NVRTCInlineBuilder
from cumm.common import TensorView, TensorViewCPU, TensorViewNVRTC, TensorViewNVRTCHashKernel, TensorViewArrayLinalg, TensorViewNVRTCDev, EigenLib
from cumm import tensorview as tv

import pccm
from ccimport import compat
def check_array_op_grad(inp: np.ndarray, inp_shape: list[int], out_shape: list[int], op: str, grad_op: str, delta: float = 1e-4):
def check_array_op_grad(inp: np.ndarray, out_shape: list[int], op: str, grad_op: str, delta: float = 1e-4):
np.random.seed(50051)

inp_shape = inp.shape[1:]
num_element = np.prod(inp_shape)
inliner = NVRTCInlineBuilder([TensorViewArrayLinalg], std="c++17")
dtype = np.float64
@@ -26,7 +29,7 @@ def check_array_op_grad(inp: np.ndarray, inp_shape: list[int], out_shape: list[i
my_val_tv = tv.zeros([num_check], tv_dtype, 0)
ref_val_tv = tv.zeros([num_check], tv_dtype, 0)

assert inp_shape == list(inp.shape[1:]), f"{inp_shape}, {inp.shape[1:]}"
assert list(inp_shape) == list(inp.shape[1:]), f"{inp_shape}, {inp.shape[1:]}"
inp_shape_np = np.array(inp_shape, np.int32)
for i in range(num_element):
inp_delta = np.zeros(inp_shape, dtype)
@@ -36,13 +39,12 @@ def check_array_op_grad(inp: np.ndarray, inp_shape: list[int], out_shape: list[i
slice_indexes_str = "".join(map(lambda x: f"[{x}]", slice_indexes))
inliner.kernel_1d(f"check_grad_op_{op}_{grad_op}_{inp_shape}_{dtype_str}", num_check, 0, f"""
namespace op = tv::arrayops;
auto inp_ptr = op::reinterpret_cast_array_nd<{dtype_str}, {inp_shape_str}>($inp_tv);
auto inp_ptr = op::reinterpret_cast_array_nd<{inp_shape_str}>($inp_tv);
auto inp_arr = inp_ptr[i];
auto grad_scale = $grad_scalar;
auto inp_delta_val = $inp_delta;
tv::array_nd<float, 3, 3> out_arr = inp_arr.op<op::{op}>() * grad_scale;
auto out_arr_2 = out_arr.op<op::max>(1.0f);
tv::array_nd<float, {out_shape_str}> out_arr = inp_arr.op<op::{op}>() * grad_scale;
auto out_arr_with_delta = (inp_arr + inp_delta_val).op<op::{op}>() * grad_scale;
auto out_arr_with_delta_sum = op::reshape<-1>(out_arr_with_delta - out_arr).op<op::sum>();
$my_val_tv[i] = op::reshape<-1>(grad_scale.op<op::{grad_op}>(inp_arr))[$index];
@@ -51,5 +53,82 @@ def check_array_op_grad(inp: np.ndarray, inp_shape: list[int], out_shape: list[i

my_val = my_val_tv.cpu().numpy()
ref_val = ref_val_tv.cpu().numpy()
# currently only double gives high-precision results, and Apple Silicon
# doesn't support double, so we just print the values here for now.
print(my_val)
print(ref_val)
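
For clarity, the check above compares an analytic gradient op against a finite difference of the scalar loss L = sum(grad_scale * f(x)): perturbing input element j by delta should change L by roughly delta * dL/dx_j. A pure-NumPy sketch of the same idea, using transpose as a hypothetical example op:

    import numpy as np

    def finite_diff_check(f, grad_f, x, delta=1e-6, seed=50051):
        # grad_f(grad_scale, x) must return dL/dx for L = (grad_scale * f(x)).sum()
        rng = np.random.default_rng(seed)
        grad_scale = rng.uniform(0.5, 1.5, size=f(x).shape)
        analytic = grad_f(grad_scale, x).reshape(-1)
        numeric = np.zeros_like(analytic)
        base = (grad_scale * f(x)).sum()
        for j in range(x.size):
            x_delta = x.reshape(-1).copy()
            x_delta[j] += delta
            numeric[j] = ((grad_scale * f(x_delta.reshape(x.shape))).sum() - base) / delta
        return analytic, numeric

    # transpose: L = (g * x.T).sum(), so dL/dx = g.T (independent of x)
    x = np.random.randn(3, 3)
    analytic, numeric = finite_diff_check(lambda a: a.T, lambda g, a: g.T, x)
    print(np.abs(analytic - numeric).max())  # tiny for this linear op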

def check_array_binary_op_grad(inp_list: list[np.ndarray], out_shape: list[int], op: str, grad_ops: list[str], delta: float = 1e-4):
np.random.seed(50051)
inliner = NVRTCInlineBuilder([TensorViewArrayLinalg], std="c++17")
dtype = np.float64
tv_dtype = tv.float64
dtype_str = "double"
if compat.IsAppleSiliconMacOs:
dtype = np.float32
tv_dtype = tv.float32
dtype_str = "float"
inp_list = [inp.astype(dtype) for inp in inp_list]
inp_shapes = [inp.shape[1:] for inp in inp_list]
inp_shape_strs = [", ".join(map(str, inp_shape)) for inp_shape in inp_shapes]
inp_tvs = [tv.from_numpy(inp).cuda() for inp in inp_list]

grad_scale_np = np.random.uniform(0.5, 1.5, size=out_shape).astype(dtype)
num_check = inp_list[0].shape[0]
# grad_scale_np = np.eye(4, dtype=dtype)
# grad_scale_np[:] = 1
out_shape_str = ", ".join(map(str, out_shape))
my_val_tv = tv.zeros([len(inp_list), num_check], tv_dtype, 0)
ref_val_tv = tv.zeros([len(inp_list), num_check], tv_dtype, 0)
code = pccm.code()
code.raw(f"""
namespace op = tv::arrayops;
auto grad_scale = $grad_scale_np;
""")
inp_deltas = [np.zeros(inp_shape, dtype) for inp_shape in inp_shapes]
for cur_inp_idx in range(len(inp_list)):
code.raw(f"""
auto inp_ptr_{cur_inp_idx} = op::reinterpret_cast_array_nd<{inp_shape_strs[cur_inp_idx]}>($(inp_tvs[{cur_inp_idx}]));
auto inp_arr_{cur_inp_idx} = inp_ptr_{cur_inp_idx}[i];
auto inp_delta_{cur_inp_idx} = $(inp_deltas[{cur_inp_idx}]);
auto inp_with_delta_{cur_inp_idx} = inp_arr_{cur_inp_idx} + inp_delta_{cur_inp_idx};
""")
# op format: inp1.op<op>(inp2, inp3, ...)
inp_arr_str = ", ".join([f"inp_arr_{i}" for i in range(len(inp_list))])
inp_arr_with_delta_str = ", ".join([f"inp_with_delta_{i}" for i in range(len(inp_list))])


inp_arr_start1_str = ", ".join([f"inp_arr_{i}" for i in range(1, len(inp_list))])
inp_arr_with_delta_start1_str = ", ".join([f"inp_with_delta_{i}" for i in range(1, len(inp_list))])

code.raw(f"""
tv::array_nd<float, {out_shape_str}> out_arr = inp_arr_0.op<op::{op}>({inp_arr_start1_str}) * grad_scale;
tv::array_nd<float, {out_shape_str}> out_arr_with_delta = inp_with_delta_0.op<op::{op}>({inp_arr_with_delta_start1_str}) * grad_scale;
auto out_arr_with_delta_sum = op::reshape<-1>(out_arr_with_delta - out_arr).op<op::sum>();
""")
for cur_inp_idx in range(len(inp_list)):
grad_op = grad_ops[cur_inp_idx]
code.raw(f"""
$my_val_tv[{cur_inp_idx} * {num_check} + i] = op::reshape<-1>(grad_scale.op<op::{grad_op}>({inp_arr_str}))[$index];
$ref_val_tv[{cur_inp_idx} * {num_check} + i] = out_arr_with_delta_sum / op::reshape<-1>(inp_delta_{cur_inp_idx})[$index];
""")
for cur_inp_idx in range(len(inp_list)):
num_element = np.prod(inp_shapes[cur_inp_idx])
for j in range(num_element):
inp_deltas[cur_inp_idx].reshape(-1)[j] = delta
index = j
inliner.kernel_1d(f"check_grad_op_{dtype_str}", num_check, 0, code)
inp_deltas[cur_inp_idx].reshape(-1)[j] = 0
my_val = my_val_tv[cur_inp_idx].cpu().numpy()
ref_val = ref_val_tv[cur_inp_idx].cpu().numpy()
# currently only double gives high-precision results, and Apple Silicon
# doesn't support double, so we just print the values here for now.
print(f"------ {op}-{grad_ops[cur_inp_idx]}-{j} ------")
print(my_val)
print(ref_val)
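
A hedged calling sketch for the binary variant; the op and grad-op names below are placeholders, not real tv::arrayops operators, and must be replaced with ops provided by TensorViewArrayLinalg:

    import numpy as np
    # each input carries a leading batch dim (shape[0] = number of independent checks)
    a = np.random.uniform(-1, 1, size=(128, 3, 3))
    b = np.random.uniform(-1, 1, size=(128, 3, 3))
    # placeholder names: one forward op plus one grad op per input
    check_array_binary_op_grad([a, b], [3, 3], "binary_op_placeholder",
                               ["binary_op_grad_lhs", "binary_op_grad_rhs"])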


4 changes: 2 additions & 2 deletions include/tensorview/contexts/core.h
@@ -180,7 +180,7 @@ struct AppleMetalContext {
return command_buffer_ptr_external_;
}
if (!_commandBuffer) {
_commandBuffer = NS::TransferPtr(command_queue_ptr_->commandBuffer());
_commandBuffer = NS::RetainPtr(command_queue_ptr_->commandBuffer());
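      // note: commandBuffer() returns an autoreleased pointer the caller does not own,
      // so it must be retained (NS::RetainPtr) rather than adopted (NS::TransferPtr),
      // which would over-release it on destruction.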
}
return _commandBuffer.get();
}
@@ -232,7 +232,7 @@ struct AppleMetalContext {
TV_ASSERT_RT_ERR(!from_blob_, "you can't synchronize when context is from blob");
TV_ASSERT_RT_ERR(_commandBuffer, "error");
_commandBuffer->commit();
_commandBuffer = NS::TransferPtr(command_queue_ptr_->commandBuffer());
_commandBuffer = NS::RetainPtr(command_queue_ptr_->commandBuffer());
}
void commitAndWait() {
TV_ASSERT_RT_ERR(!from_blob_, "you can't synchronize when context is from blob");
4 changes: 1 addition & 3 deletions include/tensorview/core/array.h
@@ -1038,13 +1038,11 @@ TV_HOST_DEVICE_INLINE constexpr T reduce(TV_METAL_THREAD F &&f,
return detail::array_reduce_impl<N>::run(TV_FORWARD_EXCEPT_METAL(F, f), a);
}

#ifndef TV_METAL_RTC
template <typename T, size_t N, size_t Align = 0>
TV_HOST_DEVICE_INLINE constexpr auto constant(T val) TV_NOEXCEPT_EXCEPT_METAL {
TV_HOST_DEVICE_INLINE constexpr auto constant_array(T val) TV_NOEXCEPT_EXCEPT_METAL {
return detail::constant_impl<T, N, Align>(
val, mp_make_list_c_sequence_reverse<int, N>{});
}
#endif

template <typename T, size_t N>
TV_HOST_DEVICE_INLINE constexpr array<T, N>
(Diffs for the remaining changed files were not loaded.)
