Merge branch 'ccrouzet/skip-emit-backward-signature' into 'main'
Skip Emitting Adjoints When Not Needed

Partly addresses GH-442, in which the adjoint kernel signature fails to compile on CTK 11
because it overflows the 4,096-byte limit on CUDA kernel parameters.

See merge request omniverse/warp!1018
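
For context, the user-facing switch is the existing `enable_backward` option. Below is a minimal sketch (not part of this commit; the kernel and array names are invented) of disabling adjoint generation for a whole module, the same way the diff below does for `warp/sim/integrator_vbd.py`:

```python
import warp as wp

wp.init()

# Module-level option: no adjoint (backward) versions of the kernels defined in
# this Python module are emitted in the generated C++/CUDA source.
wp.set_module_options({"enable_backward": False})


@wp.kernel
def scale(x: wp.array(dtype=float), s: float):
    tid = wp.tid()
    x[tid] = x[tid] * s


x = wp.zeros(16, dtype=float)
wp.launch(scale, dim=16, inputs=[x, 2.0])
```

With the option set to `False`, only the forward kernels are generated, so the adjoint signature, whose parameter list is roughly twice as long, is never emitted in the first place.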
shi-eric committed Jan 24, 2025
2 parents 9d17ad9 + 8e657fc commit d9b1305
Showing 6 changed files with 150 additions and 118 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -65,6 +65,7 @@
- `wp.autograd.gradcheck()`, `wp.autograd.jacobian()`, and `wp.autograd.jacobian_fd()` now also accept
arbitrary Python functions that have Warp arrays as inputs and outputs.
- `update_vbo_transforms` kernel launches in the OpenGL renderer are no longer recorded onto the tape.
- Skip emitting backward functions/kernels in the generated C++/CUDA code when `enable_backward` is set to `False`.

### Fixed

196 changes: 108 additions & 88 deletions warp/codegen.py
@@ -3236,7 +3236,7 @@ def get_references(adj) -> Tuple[Dict[str, Any], Dict[Any, Any], Dict[warp.conte
"""

cuda_kernel_template = """
cuda_kernel_template_forward = """
extern "C" __global__ void {name}_cuda_kernel_forward(
{forward_args})
@@ -3251,6 +3251,10 @@ def get_references(adj) -> Tuple[Dict[str, Any], Dict[Any, Any], Dict[warp.conte
{forward_body} }}
}}
"""

cuda_kernel_template_backward = """
extern "C" __global__ void {name}_cuda_kernel_backward(
{reverse_args})
{{
@@ -3266,21 +3270,25 @@ def get_references(adj) -> Tuple[Dict[str, Any], Dict[Any, Any], Dict[warp.conte
"""

cpu_kernel_template = """
cpu_kernel_template_forward = """
void {name}_cpu_kernel_forward(
{forward_args})
{{
{forward_body}}}
"""

cpu_kernel_template_backward = """
void {name}_cpu_kernel_backward(
{reverse_args})
{{
{reverse_body}}}
"""

cpu_module_template = """
cpu_module_template_forward = """
extern "C" {{
@@ -3295,6 +3303,14 @@ def get_references(adj) -> Tuple[Dict[str, Any], Dict[Any, Any], Dict[warp.conte
}}
}}
}} // extern C
"""

cpu_module_template_backward = """
extern "C" {{
WP_API void {name}_cpu_backward(
{reverse_args})
{{
@@ -3309,36 +3325,6 @@ def get_references(adj) -> Tuple[Dict[str, Any], Dict[Any, Any], Dict[warp.conte
"""

cuda_module_header_template = """
extern "C" {{
// Python CUDA entry points
WP_API void {name}_cuda_forward(
void* stream,
{forward_args});
WP_API void {name}_cuda_backward(
void* stream,
{reverse_args});
}} // extern C
"""

cpu_module_header_template = """
extern "C" {{
// Python CPU entry points
WP_API void {name}_cpu_forward(
{forward_args});
WP_API void {name}_cpu_backward(
{reverse_args});
}} // extern C
"""


# converts a constant Python value to equivalent C-repr
def constant_str(value):
@@ -3776,59 +3762,82 @@ def codegen_kernel(kernel, device, options):

adj = kernel.adj

forward_args = ["wp::launch_bounds_t dim"]
reverse_args = ["wp::launch_bounds_t dim"]
if device == "cpu":
template_forward = cpu_kernel_template_forward
template_backward = cpu_kernel_template_backward
elif device == "cuda":
template_forward = cuda_kernel_template_forward
template_backward = cuda_kernel_template_backward
else:
raise ValueError(f"Device {device} is not supported")

template = ""
template_fmt_args = {
"name": kernel.get_mangled_name(),
}

# build forward signature
forward_args = ["wp::launch_bounds_t dim"]
if device == "cpu":
forward_args.append("size_t task_index")
reverse_args.append("size_t task_index")

# forward args
for arg in adj.args:
forward_args.append(arg.ctype() + " var_" + arg.label)
reverse_args.append(arg.ctype() + " var_" + arg.label)

# reverse args
for arg in adj.args:
# indexed array gradients are regular arrays
if isinstance(arg.type, indexedarray):
_arg = Var(arg.label, array(dtype=arg.type.dtype, ndim=arg.type.ndim))
reverse_args.append(_arg.ctype() + " adj_" + arg.label)
else:
reverse_args.append(arg.ctype() + " adj_" + arg.label)

# codegen body
forward_body = codegen_func_forward(adj, func_type="kernel", device=device)
template_fmt_args.update(
{
"forward_args": indent(forward_args),
"forward_body": forward_body,
}
)
template += template_forward

if options["enable_backward"]:
reverse_body = codegen_func_reverse(adj, func_type="kernel", device=device)
else:
reverse_body = ""
# build reverse signature
reverse_args = ["wp::launch_bounds_t dim"]
if device == "cpu":
reverse_args.append("size_t task_index")

if device == "cpu":
template = cpu_kernel_template
elif device == "cuda":
template = cuda_kernel_template
else:
raise ValueError(f"Device {device} is not supported")
for arg in adj.args:
reverse_args.append(arg.ctype() + " var_" + arg.label)

s = template.format(
name=kernel.get_mangled_name(),
forward_args=indent(forward_args),
reverse_args=indent(reverse_args),
forward_body=forward_body,
reverse_body=reverse_body,
)
for arg in adj.args:
# indexed array gradients are regular arrays
if isinstance(arg.type, indexedarray):
_arg = Var(arg.label, array(dtype=arg.type.dtype, ndim=arg.type.ndim))
reverse_args.append(_arg.ctype() + " adj_" + arg.label)
else:
reverse_args.append(arg.ctype() + " adj_" + arg.label)

reverse_body = codegen_func_reverse(adj, func_type="kernel", device=device)
template_fmt_args.update(
{
"reverse_args": indent(reverse_args),
"reverse_body": reverse_body,
}
)
template += template_backward

s = template.format(**template_fmt_args)
return s


def codegen_module(kernel, device="cpu"):
def codegen_module(kernel, device, options):
if device != "cpu":
return ""

# Update the module's options with the ones defined on the kernel, if any.
options = dict(options)
options.update(kernel.options)

adj = kernel.adj

template = ""
template_fmt_args = {
"name": kernel.get_mangled_name(),
}

# build forward signature
forward_args = ["wp::launch_bounds_t dim"]
forward_params = ["dim", "task_index"]
@@ -3842,29 +3851,40 @@ def codegen_module(kernel, device="cpu"):
forward_args.append(f"{arg.ctype()} var_{arg.label}")
forward_params.append("var_" + arg.label)

# build reverse signature
reverse_args = [*forward_args]
reverse_params = [*forward_params]
template_fmt_args.update(
{
"forward_args": indent(forward_args),
"forward_params": indent(forward_params, 3),
}
)
template += cpu_module_template_forward

for arg in adj.args:
if isinstance(arg.type, indexedarray):
# indexed array gradients are regular arrays
_arg = Var(arg.label, array(dtype=arg.type.dtype, ndim=arg.type.ndim))
reverse_args.append(f"const {_arg.ctype()} adj_{arg.label}")
reverse_params.append(f"adj_{_arg.label}")
elif hasattr(arg.type, "_wp_generic_type_str_"):
# vectors and matrices are passed from Python by pointer
reverse_args.append(f"const {arg.ctype()}* adj_{arg.label}")
reverse_params.append(f"*adj_{arg.label}")
else:
reverse_args.append(f"{arg.ctype()} adj_{arg.label}")
reverse_params.append(f"adj_{arg.label}")
if options["enable_backward"]:
# build reverse signature
reverse_args = [*forward_args]
reverse_params = [*forward_params]

s = cpu_module_template.format(
name=kernel.get_mangled_name(),
forward_args=indent(forward_args),
reverse_args=indent(reverse_args),
forward_params=indent(forward_params, 3),
reverse_params=indent(reverse_params, 3),
)
for arg in adj.args:
if isinstance(arg.type, indexedarray):
# indexed array gradients are regular arrays
_arg = Var(arg.label, array(dtype=arg.type.dtype, ndim=arg.type.ndim))
reverse_args.append(f"const {_arg.ctype()} adj_{arg.label}")
reverse_params.append(f"adj_{_arg.label}")
elif hasattr(arg.type, "_wp_generic_type_str_"):
# vectors and matrices are passed from Python by pointer
reverse_args.append(f"const {arg.ctype()}* adj_{arg.label}")
reverse_params.append(f"*adj_{arg.label}")
else:
reverse_args.append(f"{arg.ctype()} adj_{arg.label}")
reverse_params.append(f"adj_{arg.label}")

template_fmt_args.update(
{
"reverse_args": indent(reverse_args),
"reverse_params": indent(reverse_params, 3),
}
)
template += cpu_module_template_backward

s = template.format(**template_fmt_args)
return s
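
Stepping back from the line-by-line diff, the restructured `codegen_kernel()` and `codegen_module()` share one pattern: the template string and its format arguments are built up incrementally, the backward pieces are appended only when `enable_backward` is on, and the result is formatted once at the end. A stripped-down, hypothetical illustration of that pattern (the template strings and argument lists here are invented, not Warp's):

```python
forward_tpl = "void {name}_forward({forward_args});\n"
backward_tpl = "void {name}_backward({reverse_args});\n"


def emit(name: str, enable_backward: bool) -> str:
    template = ""
    fmt_args = {"name": name}

    # The forward entry point is always emitted.
    template += forward_tpl
    fmt_args["forward_args"] = "int n"

    # The backward entry point is appended only on demand, so its format keys
    # need to exist only when the assembled template actually references them.
    if enable_backward:
        template += backward_tpl
        fmt_args["reverse_args"] = "int n, int adj_n"

    return template.format(**fmt_args)


print(emit("scale", enable_backward=False))  # forward declaration only
print(emit("scale", enable_backward=True))   # forward and backward declarations
```
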
32 changes: 20 additions & 12 deletions warp/context.py
@@ -1712,7 +1712,7 @@ def codegen(self, device):

for kernel in self.kernels:
source += warp.codegen.codegen_kernel(kernel, device=device, options=self.options)
source += warp.codegen.codegen_module(kernel, device=device)
source += warp.codegen.codegen_module(kernel, device=device, options=self.options)

# add headers
if device == "cpu":
@@ -1765,20 +1765,26 @@ def get_kernel_hooks(self, kernel):

name = kernel.get_mangled_name()

options = dict(kernel.module.options)
options.update(kernel.options)

if self.device.is_cuda:
forward_name = name + "_cuda_kernel_forward"
forward_kernel = runtime.core.cuda_get_kernel(
self.device.context, self.handle, forward_name.encode("utf-8")
)

backward_name = name + "_cuda_kernel_backward"
backward_kernel = runtime.core.cuda_get_kernel(
self.device.context, self.handle, backward_name.encode("utf-8")
)
if options["enable_backward"]:
backward_name = name + "_cuda_kernel_backward"
backward_kernel = runtime.core.cuda_get_kernel(
self.device.context, self.handle, backward_name.encode("utf-8")
)
else:
backward_kernel = None

# look up the required shared memory size for each kernel from module metadata
forward_smem_bytes = self.meta[forward_name + "_smem_bytes"]
backward_smem_bytes = self.meta[backward_name + "_smem_bytes"]
backward_smem_bytes = self.meta[backward_name + "_smem_bytes"] if options["enable_backward"] else 0

# configure kernels maximum shared memory size
max_smem_bytes = runtime.core.cuda_get_max_shared_memory(self.device.context)
@@ -1788,9 +1794,6 @@ def get_kernel_hooks(self, kernel):
f"Warning: Failed to configure kernel dynamic shared memory for this device, tried to configure {forward_name} kernel for {forward_smem_bytes} bytes, but maximum available is {max_smem_bytes}"
)

options = dict(kernel.module.options)
options.update(kernel.options)

if options["enable_backward"] and not runtime.core.cuda_configure_kernel_shared_memory(
backward_kernel, backward_smem_bytes
):
@@ -1805,9 +1808,14 @@ def get_kernel_hooks(self, kernel):
forward = (
func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_forward").encode("utf-8"))) or None
)
backward = (
func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8"))) or None
)

if options["enable_backward"]:
backward = (
func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8")))
or None
)
else:
backward = None

hooks = KernelHooks(forward, backward)

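The `get_kernel_hooks()` change in warp/context.py relies on the same option lookup, and the merge order matters: per-kernel options take precedence over the module-level ones. A self-contained sketch of that precedence (the `Fake*` classes are stand-ins for illustration, not Warp types):

```python
from dataclasses import dataclass, field


@dataclass
class FakeModule:
    options: dict = field(default_factory=dict)


@dataclass
class FakeKernel:
    module: FakeModule
    options: dict = field(default_factory=dict)


def backward_enabled(kernel: FakeKernel) -> bool:
    # Same merge order as get_kernel_hooks(): start from the module options,
    # then let any per-kernel options override them.
    options = dict(kernel.module.options)
    options.update(kernel.options)
    return options.get("enable_backward", True)  # assuming True as the default


module = FakeModule(options={"enable_backward": False})
print(backward_enabled(FakeKernel(module=module)))                                     # False
print(backward_enabled(FakeKernel(module=module, options={"enable_backward": True})))  # True
```

When the merged options disable the backward pass, the backward hook is simply left as `None` and no `*_cuda_kernel_backward` or `*_cpu_backward` symbol is looked up.
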
2 changes: 2 additions & 0 deletions warp/sim/integrator_vbd.py
@@ -23,6 +23,8 @@
from .integrator import Integrator
from .model import PARTICLE_FLAG_ACTIVE, Control, Model, ModelShapeMaterials, State

wp.set_module_options({"enable_backward": False})

VBD_DEBUG_PRINTING_OPTIONS = {
# "elasticity_force_hessian",
# "contact_force_hessian",