
Commit d5a787d

feat: Automatically generate QDP plugins
1 parent 0a46392 commit d5a787d

8 files changed: +594 -5 lines changed

+151
@@ -0,0 +1,151 @@
"""
.. _auto_generate_converters:

Automatically Generate a Plugin for a Custom Kernel
===================================================================

We are going to demonstrate how to automatically generate a plugin for a custom kernel with Torch-TensorRT,
using the new Python-based plugin system in TensorRT 10.7.

Torch-TensorRT supports falling back to PyTorch implementations of operations in the case that Torch-TensorRT
does not know how to compile them in TensorRT. However, this comes at the cost of a graph break and will reduce the performance of the model.
The easiest way to fix a lack of support for ops is by adding a decomposition (see:
`Writing lowering passes for the Dynamo frontend <https://pytorch.org/TensorRT/contributors/writing_dynamo_aten_lowering_passes.html>`_) - which defines the operator
in terms of PyTorch ops that are supported in Torch-TensorRT - or a converter (see:
`Writing converters for the Dynamo frontend <https://pytorch.org/TensorRT/contributors/dynamo_converters.html>`_) - which defines the operator in terms of TensorRT operators.

In some cases there isn't a great way to do either of these, perhaps because the operator is a custom kernel that is not part of standard PyTorch, or
because TensorRT cannot support it natively.

For these cases, it is possible to use a TensorRT plugin to replace the operator **inside** the TensorRT engine, thereby avoiding
the performance and resource overhead of a graph break.

Previously, this involved a complex process of not only building a performant kernel but also setting it up to run in TensorRT (see: `Using Custom Kernels within TensorRT Engines with Torch-TensorRT <https://pytorch.org/TensorRT/tutorials/_rendered_examples/dynamo/custom_kernel_plugins.html>`_).
With TensorRT 10.7, there is a new Python-native plugin system which greatly streamlines this process. This
plugin system also allows Torch-TensorRT to automatically generate the necessary conversion code to convert the
operation in PyTorch to TensorRT.
"""

# %%
# Writing Custom Operators in PyTorch
# -----------------------------------------
#
# Previous tutorials already cover creating custom operators in PyTorch which later get used with Torch-TensorRT.
# Here we define a simple elementwise multiplication operator in Triton. This operator is then registered as a custom op in PyTorch,
# along with its host launch code, as well as a "meta-kernel". A meta-kernel is a function that describes the shape and data type
# transformations that the operator will perform. This meta-kernel is used by Dynamo and Torch-TensorRT, so it
# is necessary to define.
#
from typing import Tuple

import tensorrt_bindings.plugin as trtp
import torch
import torch_tensorrt
import triton
import triton.language as tl


@triton.jit
def elementwise_scale_mul_kernel(X, Y, Z, a, b, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(0)
    # Compute the range of elements that this thread block will work on
    block_start = pid * BLOCK_SIZE
    # Range of indices this thread will handle
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Load elements from the X and Y tensors
    x_vals = tl.load(X + offsets)
    y_vals = tl.load(Y + offsets)
    # Perform the element-wise multiplication with scale a and offset b
    z_vals = x_vals * y_vals * a + b
    # Store the result in Z
    tl.store(Z + offsets, z_vals)


@torch.library.custom_op("torchtrt_ex::elementwise_scale_mul", mutates_args=())  # type: ignore[misc]
def elementwise_scale_mul(
    X: torch.Tensor, Y: torch.Tensor, b: float = 0.2, a: int = 2
) -> torch.Tensor:
    # Ensure the tensors are on the GPU
    assert X.is_cuda and Y.is_cuda, "Tensors must be on CUDA device."
    assert X.shape == Y.shape, "Tensors must have the same shape."

    # Create output tensor
    Z = torch.empty_like(X)

    # Define block size
    BLOCK_SIZE = 1024

    # Grid of programs
    grid = lambda meta: (X.numel() // meta["BLOCK_SIZE"],)

    # Launch the kernel with parameters a and b
    elementwise_scale_mul_kernel[grid](X, Y, Z, a, b, BLOCK_SIZE=BLOCK_SIZE)

    return Z

# %%
# The meta kernel for an elementwise operation is just the shape and dtype of one of the inputs, since we will not change the shape
# in the course of the operation.


@torch.library.register_fake("torchtrt_ex::elementwise_scale_mul")
def _(x: torch.Tensor, y: torch.Tensor, b: float = 0.2, a: int = 2) -> torch.Tensor:
    return x

# %%
# Here we use the automatic plugin creation feature in Torch-TensorRT, which enables plugin registration using
# the TensorRT QDP APIs.
torch_tensorrt.dynamo.conversion.plugins.generate_plugin(
    "torchtrt_ex::elementwise_scale_mul"
)

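# %%
# Under the hood, ``generate_plugin`` uses the fake kernel registered above to derive the plugin's
# shape and dtype behavior. A hand-written registration with the TensorRT Python plugin API would
# look roughly like the commented sketch below (a hedged sketch only: the ``trtp.register``
# decorator and ``TensorDesc.like()`` helper are assumptions about the QDP API, shown for
# illustration rather than taken from this example):
#
# @trtp.register("torchtrt_ex::elementwise_scale_mul")
# def _(x: trtp.TensorDesc, y: trtp.TensorDesc, b: float, a: int) -> trtp.TensorDesc:
#     # The output descriptor mirrors the first input, matching the fake kernel above.
#     return x.like()
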
# %%
# Generating the Converter
# -------------------------------------------------------------------
# Given that we have defined the custom operator in PyTorch and registered a TensorRT plugin for it, we can now generate the converter for the operation.
# As long as the namespace and names match, the following function will automatically generate the converter for the operation.
torch_tensorrt.dynamo.conversion.plugins.generate_plugin_converter(
    "torchtrt_ex::elementwise_scale_mul", supports_dynamic_shapes=True
)


# %%
# The above two commands can be replaced with the following single call:
# torch_tensorrt.dynamo.conversion.plugins.custom_op("torchtrt_ex::elementwise_scale_mul", supports_dynamic_shapes=True)

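# %%
# If the kernel only handles certain input configurations, a ``capability_validator`` can be passed
# so that unsupported calls fall back to PyTorch instead of being converted. The validator below is
# a hypothetical example (a hedged sketch) using the ``Callable[[Node, CompilationSettings], bool]``
# signature that ``custom_op`` accepts:
#
# torch_tensorrt.dynamo.conversion.plugins.custom_op(
#     "torchtrt_ex::elementwise_scale_mul",
#     capability_validator=lambda node, settings: node.kwargs.get("b", 0.2) >= 0,
#     supports_dynamic_shapes=True,
# )
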
# %%
# Using our converter with a model
# -------------------------------------------------------------------
#
# Now we can use our custom operator in a model and compile it with Torch-TensorRT.
# We can see that the custom operator is used as one of the operations in the forward pass of the model.
# The process of compiling the model at this point is identical to standard Torch-TensorRT usage.
class MyModel(torch.nn.Module):  # type: ignore[misc]
    def __init__(self):
        super().__init__()

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        z = torch.add(x, y)
        res = torch.ops.torchtrt_ex.elementwise_scale_mul.default(x, z, b=0.5)

        return res


my_model = MyModel().to("cuda")
m = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)
n = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)

with torch_tensorrt.logging.errors():
    model_trt = torch_tensorrt.compile(
        my_model, inputs=[m, n], debug=True, min_block_size=1
    )
    for i in range(300):
        res = model_trt(m, n)
        assert torch.allclose(res, my_model(m, n))

print("Ran with custom plugin!")
@@ -1,3 +1,5 @@
from torch_tensorrt.dynamo.conversion.plugins._custom_op import custom_op
from torch_tensorrt.dynamo.conversion.plugins._generate_plugin import generate_plugin
from torch_tensorrt.dynamo.conversion.plugins._generate_plugin_converter import (
    generate_plugin_converter,
)
@@ -0,0 +1,33 @@
from typing import Callable, Optional

from torch.fx.node import Node
from torch_tensorrt.dynamo._settings import CompilationSettings
from torch_tensorrt.dynamo.conversion._ConverterRegistry import ConverterPriority
from torch_tensorrt.dynamo.conversion.plugins._generate_plugin import generate_plugin
from torch_tensorrt.dynamo.conversion.plugins._generate_plugin_converter import (
    generate_plugin_converter,
)


def custom_op(
    op_name: str,
    capability_validator: Optional[Callable[[Node, CompilationSettings], bool]] = None,
    priority: ConverterPriority = ConverterPriority.STANDARD,
    supports_dynamic_shapes: bool = False,
) -> None:
    """
    Generate the plugin and corresponding plugin converter using external kernels and the TensorRT Quick Deployable Plugin APIs.

    Args:
        op_name: The name of the operator for which the plugin is generated automatically.
            There should be an existing kernel and PyTorch custom operation registered under this name.
        capability_validator: A lambda that takes a ``torch.fx.Node`` and determines whether the
            converter can properly handle this Node. If the validator returns ``False``, the subgraph
            partitioner will make sure this Node is run in PyTorch in the compiled graph.
        priority: Allows developers to override existing converters in the converter registry.
        supports_dynamic_shapes: Whether dynamic shapes are supported.
    """
    generate_plugin(op_name)
    generate_plugin_converter(
        op_name, capability_validator, priority, supports_dynamic_shapes
    )
