From 8231129d0f3b67ae623e326450b7047eb262369c Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Mon, 27 Jan 2025 14:32:13 -0600 Subject: [PATCH] [ET-VK] Using push constants for conv2d pw. Pull Request resolved: https://github.com/pytorch/executorch/pull/7814 This diff switches the pointwise (pw) conv2d shader in ExecuTorch's Vulkan backend from uniform buffer objects (UBOs) to push constants for passing its parameters. This optimization improves performance and memory usage. ghstack-source-id: 263238730 @exported-using-ghexport Differential Revision: [D68400677](https://our.internmc.facebook.com/intern/diff/D68400677/) Co-authored-by: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> --- .../runtime/graph/ops/glsl/conv2d_pw.glsl | 23 +++-- .../runtime/graph/ops/impl/Convolution.cpp | 83 ++++++++++++++----- 2 files changed, 78 insertions(+), 28 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index f72c487fa7..0413eb7b7a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -24,11 +24,20 @@ ${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} ${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} ${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} ${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} -${layout_declare_ubo(4, "ivec3", "out_limits")} -${layout_declare_ubo(5, "ivec4", "in_sizes")} -${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")} -${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")} -${layout_declare_ubo(8, "float", "out_min", "float", "out_max")} + +layout(push_constant) uniform restrict Block { + ivec4 out_limits; + ivec4 in_sizes; + ivec2 kernel_size; + ivec2 stride; + ivec2 padding; + ivec2 dilation; + ivec2 overlay_region; + int in_group_size; + int dummy_padding; + float out_min; + float out_max; +}; 
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -70,7 +79,7 @@ void main() { // If the top left position is out of bounds, then this invocation will have // no work to do. - if (any(greaterThanEqual(ivec3(pos[0], gpos.z), out_limits))) { + if (any(greaterThanEqual(ivec3(pos[0], gpos.z), out_limits.xyz))) { return; } @@ -144,7 +153,7 @@ void main() { for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) { const ivec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex]; - if (all(lessThan(ivec3(pos, gpos.z), out_limits))) { + if (all(lessThan(ivec3(pos, gpos.z), out_limits.xyz))) { imageStore(t_out, ivec3(pos, gpos.z), op(sum[i], out_min, out_max)); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 8c369914c1..3c367f334d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -407,27 +407,68 @@ void add_conv2d_node( wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1}; } - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - shader, - wg_size, - graph.create_local_wg_size(wg_size), - // Inputs and Outputs - {{out, vkapi::MemoryAccessType::WRITE}, - {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, - // Shader params buffers - { - t_out->logical_limits_ubo(), - t_in->sizes_ubo(), - graph.create_params_buffer(kernel_params), - graph.create_params_buffer(extra_params), - graph.create_params_buffer(out_params), - }, - // Specialization Constants - {}, - // Resizing Logic - resize_conv2d_node, - {weight_data, stride, padding, dilation, transposed, output_padding})); + if (method == Conv2dMethod::Pointwise) { + const utils::ivec4 kernel_param_size_stride = { + kernel_params.kernel_size[0], + kernel_params.kernel_size[1], + kernel_params.stride[0], + kernel_params.stride[1]}; + + const utils::ivec4 kernel_param_pad_dial = { + kernel_params.padding[0], + 
kernel_params.padding[1], + kernel_params.dilation[0], + kernel_params.dilation[1]}; + + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + shader, + wg_size, + graph.create_local_wg_size(wg_size), + // Inputs and Outputs + {{out, vkapi::MemoryAccessType::WRITE}, + {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, + // Shader params buffers + {}, + // Specialization Constants + {}, + // Resizing Logic + resize_conv2d_node, + {weight_data, stride, padding, dilation, transposed, output_padding}, + { + graph.logical_limits_pc_of(out), + graph.sizes_pc_of(in), + PushConstantDataInfo( + &kernel_param_size_stride, sizeof(kernel_param_size_stride)), + PushConstantDataInfo( + &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)), + PushConstantDataInfo( + &extra_params, sizeof(extra_params), sizeof(utils::ivec4)), + PushConstantDataInfo(&out_params, sizeof(out_params)), + })); + } else { + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + shader, + wg_size, + graph.create_local_wg_size(wg_size), + // Inputs and Outputs + {{out, vkapi::MemoryAccessType::WRITE}, + {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, + // Shader params buffers + { + t_out->logical_limits_ubo(), + t_in->sizes_ubo(), + graph.create_params_buffer(kernel_params), + graph.create_params_buffer(extra_params), + graph.create_params_buffer(out_params), + }, + // Specialization Constants + {}, + // Resizing Logic + resize_conv2d_node, + {weight_data, stride, padding, dilation, transposed, output_padding})); + } } void add_conv1d_node(