From 74746c5cea5ab8730844c3bf11dc6e4fd9df9c4e Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Thu, 23 Jan 2025 16:38:41 -0800 Subject: [PATCH] Add an augassign benchmark --- .../codegen/component_assignment.py | 5 +- .../benchmarks/codegen/component_augassign.py | 142 ++++++++++++++++++ 2 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 benchmarks/benchmarks/codegen/component_augassign.py diff --git a/benchmarks/benchmarks/codegen/component_assignment.py b/benchmarks/benchmarks/codegen/component_assignment.py index bd0a0b014..38d6ae2f8 100644 --- a/benchmarks/benchmarks/codegen/component_assignment.py +++ b/benchmarks/benchmarks/codegen/component_assignment.py @@ -21,7 +21,10 @@ def component_assignment(a: wp.array(dtype=wp.mat44)): a[tid] = m -class ComponentAssignment: +class CompileModule: + repeat = 10 # Number of samples to run + number = 1 # Number of measurements to make between a single setup and teardown + def setup(self): wp.init() wp.build.clear_kernel_cache() diff --git a/benchmarks/benchmarks/codegen/component_augassign.py b/benchmarks/benchmarks/codegen/component_augassign.py new file mode 100644 index 000000000..6759e830e --- /dev/null +++ b/benchmarks/benchmarks/codegen/component_augassign.py @@ -0,0 +1,142 @@ +# Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import warp as wp + + +@wp.kernel +def matrix_augassign_kernel( + a: wp.array2d(dtype=wp.mat22), + b: wp.array2d(dtype=wp.mat22), + c: wp.array2d(dtype=wp.mat22), + d: wp.array2d(dtype=wp.mat22), +): + i, j = wp.tid() + + m1 = wp.mat22() + m2 = b[i, j] + + m1[0, 0] += m2[0, 0] + m1[0, 1] += m2[0, 1] + m1[1, 0] += m2[1, 0] + m1[1, 1] += m2[1, 1] + + a[i, j] = m1 + + m3 = wp.mat22() + m4 = d[i, j] + + m3[0, 0] -= m4[0, 0] + m3[0, 1] -= m4[0, 1] + m3[1, 0] -= m4[1, 0] + m3[1, 1] -= m4[1, 1] + + c[i, j] = m3 + + +class CompileModule: + repeat = 10 # Number of samples to run + number = 1 # Number of measurements to make between a single setup and teardown + + def setup(self): + wp.init() + wp.build.clear_kernel_cache() + + def teardown(self): + matrix_augassign_kernel.module.unload() + + def time_cuda_codegen(self): + wp.load_module(device="cuda:0") + + def time_cpu_codegen(self): + wp.load_module(device="cpu") + + +class RunForwardKernel: + def setup(self): + wp.init() + wp.build.clear_kernel_cache() + N = (1024, 1024) + self.a = wp.zeros(N, dtype=wp.mat22, device="cuda:0") + self.b = wp.ones(N, dtype=wp.mat22, device="cuda:0") + self.c = wp.zeros(N, dtype=wp.mat22, device="cuda:0") + self.d = wp.ones(N, dtype=wp.mat22, device="cuda:0") + + self.cmd = wp.launch( + matrix_augassign_kernel, N, inputs=[self.a, self.b, self.c, self.d], record_cmd=True, device="cuda:0" + ) + wp.synchronize_device("cuda:0") + + def time_cuda(self): + self.cmd.launch() + wp.synchronize_device("cuda:0") + + def teardown(self): + self.a.zero_() + self.b.fill_(1.0) + self.c.zero_() + self.d.fill_(1.0) + wp.synchronize_device("cuda:0") + + +class RunBackwardKernel: + def setup(self): + wp.init() + wp.build.clear_kernel_cache() + wp.load_module(device="cuda:0") + + N = (1024, 1024) + self.a = wp.zeros(N, dtype=wp.mat22, device="cuda:0", requires_grad=True) + self.b = wp.ones(N, dtype=wp.mat22, device="cuda:0", requires_grad=True) + self.c = wp.zeros(N, dtype=wp.mat22, device="cuda:0", requires_grad=True) + self.d = wp.ones(N, dtype=wp.mat22, device="cuda:0", requires_grad=True) + + with wp.ScopedCapture("cuda:0") as capture: + wp.launch( + matrix_augassign_kernel, + N, + inputs=[self.a, self.b, self.c, self.d], + adj_inputs=[self.a.grad, self.b.grad, self.c.grad, self.d.grad], + adj_outputs=[], + adjoint=True, + device="cuda:0", + ) + self.graph = capture.graph + # Warmup + for _ in range(5): + wp.capture_launch(self.graph) + wp.synchronize_device("cuda:0") + + def time_cuda(self): + wp.capture_launch(self.graph) + wp.synchronize_device("cuda:0") + + def track_cuda_filter(self): + with wp.ScopedTimer("benchmark", cuda_filter=wp.TIMING_KERNEL) as timer: + wp.launch( + matrix_augassign_kernel, + self.a.shape, + inputs=[self.a, self.b, self.c, self.d], + adj_inputs=[self.a.grad, self.b.grad, self.c.grad, self.d.grad], + adj_outputs=[], + adjoint=True, + device="cuda:0", + ) + return timer.timing_results[0].elapsed * 1e3 + + track_cuda_filter.unit = "microseconds" + + def teardown(self): + self.a.zero_() + self.b.fill_(1.0) + self.c.zero_() + self.d.fill_(1.0) + self.a.grad.zero_() + self.b.grad.zero_() + self.c.grad.zero_() + self.d.grad.zero_() + wp.synchronize_device("cuda:0")