[ET-VK] Using push constants for conv2d dw. (#8008)
* [ET-VK] Using shared memory to save position in conv2d dw output op.

Pull Request resolved: #7923

This diff changes the conv2d dw op to save output positions in shared memory, which reduces register usage and improves performance.
ghstack-source-id: 263440666
@exported-using-ghexport

Differential Revision: [D68400890](https://our.internmc.facebook.com/intern/diff/D68400890/)
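
For context, here is a minimal GLSL sketch of the shared-memory idea (the workgroup size, MAX_LOCAL_INVOCATIONS, and the accumulation placeholder are illustrative, not the actual shader): each invocation computes its output position once, parks it in workgroup-shared memory, and reads it back after the accumulation loop instead of keeping it live in a register.

#version 450
layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;

// Illustrative upper bound on invocations per workgroup (8 * 8 * 1).
#define MAX_LOCAL_INVOCATIONS 64

// Output positions live in shared memory rather than per-invocation
// registers, freeing registers for the accumulation loop.
shared ivec3 pos_shared[MAX_LOCAL_INVOCATIONS];

void main() {
  // Compute the output texel position once and park it in shared memory.
  pos_shared[gl_LocalInvocationIndex] = ivec3(gl_GlobalInvocationID);

  // ... kernel-window accumulation loop; the position does not need to
  // stay live in a register across it ...

  // Read the position back only when it is time to write the result.
  const ivec3 out_pos = pos_shared[gl_LocalInvocationIndex];
  // imageStore(t_out, out_pos, result);
}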

* [ET-VK] Using push constants for conv2d dw.

Pull Request resolved: #7928

This diff moves the parameters of the conv2d dw (depthwise) op into push constants in ExecuTorch's Vulkan backend. This optimization reduces memory usage.
ghstack-source-id: 263440665
@exported-using-ghexport

Differential Revision: [D68493849](https://our.internmc.facebook.com/intern/diff/D68493849/)
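
For background, a generic host-side sketch (plain Vulkan, not ExecuTorch's actual dispatch code; the struct and function names here are made up for illustration): push constants are recorded straight into the command buffer with vkCmdPushConstants, so no uniform buffer has to be allocated, bound, or kept alive for these small per-dispatch parameters, which is where the memory saving comes from.

#include <vulkan/vulkan.h>
#include <cstdint>

// Mirrors the push-constant block declared in the shader diff below.
struct Conv2dDwParams {
  int32_t out_limits[4];
  int32_t in_sizes[4];
  int32_t kernel_size[2];
  int32_t stride[2];
  int32_t padding[2];
  int32_t dilation[2];
  int32_t overlay_region[2];
  int32_t in_group_size;
  int32_t dummy_padding;  // keeps out_min/out_max aligned with the GLSL block
  float out_min;
  float out_max;
};

// Record the parameters directly into the command buffer; unlike a uniform
// buffer binding, no VkBuffer or descriptor set is required.
void record_conv2d_dw_params(VkCommandBuffer cmd, VkPipelineLayout layout,
                             const Conv2dDwParams& params) {
  vkCmdPushConstants(cmd, layout, VK_SHADER_STAGE_COMPUTE_BIT,
                     /*offset=*/0, static_cast<uint32_t>(sizeof(params)),
                     &params);
}

The packed parameters come to 88 bytes, well under the 128-byte minimum that the Vulkan spec guarantees for maxPushConstantsSize.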

* [ET-VK] Using TmpTensor for width packed versions of q_linear op shader to reduce memory usage. (#8009)

Pull Request resolved: #7929

This diff introduces the use of temporary tensors to reduce memory usage in the width-packed versions of the q_linear op shader.
ghstack-source-id: 263456691
@exported-using-ghexport

Differential Revision: [D68561647](https://our.internmc.facebook.com/intern/diff/D68561647/)
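
A minimal sketch of the pattern, using only identifiers that appear in the diff below (the TmpTensor semantics are assumed here: a scoped temporary whose backing memory the graph can recycle once it goes out of scope, unlike add_tensor_like, which adds a tensor that lives for the whole graph):

// Before: a permanent graph tensor is created just to hold the width-packed
// copy, so its memory stays allocated for the lifetime of the graph.
//   ValueRef mat1_W_packed = graph.add_tensor_like(mat1, utils::kWidthPacked);

// After: a temporary tensor is used instead; once mat1_tmp goes out of scope
// its memory can be reused by later temporaries, lowering peak memory usage.
TmpTensor mat1_tmp(
    &graph, graph.sizes_of(mat1), graph.dtype_of(mat1), utils::kWidthPacked);
ValueRef mat1_W_packed = mat1_tmp;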

Co-authored-by: Vivek Trivedi <[email protected]>

---------

Co-authored-by: Vivek Trivedi <[email protected]>
pytorchbot and trivedivivek authored Jan 29, 2025
1 parent 0b16f27 commit c5fea7e
Showing 4 changed files with 74 additions and 61 deletions.
21 changes: 15 additions & 6 deletions backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -32,11 +32,20 @@ ${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
${layout_declare_ubo(4, "ivec3", "out_limits")}
${layout_declare_ubo(5, "ivec4", "in_sizes")}
${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}

layout(push_constant) uniform restrict Block {
ivec4 out_limits;
ivec4 in_sizes;
ivec2 kernel_size;
ivec2 stride;
ivec2 padding;
ivec2 dilation;
ivec2 overlay_region;
int in_group_size;
int dummy_padding;
float out_min;
float out_max;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

@@ -127,7 +136,7 @@ void main() {
const ivec3 out_pos = pos_shared[offset_pos_index(gl_LocalInvocationIndex)];
for (int y = 0; y < BATCH_SIZE_Y; y++) {
for (int x = 0; x < BATCH_SIZE_X; x++) {
if (any(greaterThanEqual(ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), out_limits))) {
if (any(greaterThanEqual(ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), out_limits.xyz))) {
continue;
}
imageStore(t_out, ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), op(sum[y][x], out_min, out_max));
@@ -24,11 +24,20 @@ ${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
${layout_declare_ubo(4, "ivec3", "out_limits")}
${layout_declare_ubo(5, "ivec4", "in_sizes")}
${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}

layout(push_constant) uniform restrict Block {
ivec4 out_limits;
ivec4 in_sizes;
ivec2 kernel_size;
ivec2 stride;
ivec2 padding;
ivec2 dilation;
ivec2 overlay_region;
int in_group_size;
int dummy_padding;
float out_min;
float out_max;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

86 changes: 38 additions & 48 deletions backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -407,7 +407,9 @@ void add_conv2d_node(
wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1};
}

if (method == Conv2dMethod::Pointwise) {
vkapi::ParamsBindList param_buffers;
std::vector<PushConstantDataInfo> push_constants;
if (method == Conv2dMethod::Pointwise || method == Conv2dMethod::Depthwise) {
const utils::ivec4 kernel_param_size_stride = {
kernel_params.kernel_size[0],
kernel_params.kernel_size[1],
@@ -420,55 +422,43 @@
kernel_params.dilation[0],
kernel_params.dilation[1]};

graph.execute_nodes().emplace_back(new DispatchNode(
graph,
shader,
wg_size,
graph.create_local_wg_size(wg_size),
// Inputs and Outputs
{{out, vkapi::MemoryAccessType::WRITE},
{{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
// Shader params buffers
{},
// Specialization Constants
{},
// Resizing Logic
resize_conv2d_node,
{weight_data, stride, padding, dilation, transposed, output_padding},
{
graph.logical_limits_pc_of(out),
graph.sizes_pc_of(in),
PushConstantDataInfo(
&kernel_param_size_stride, sizeof(kernel_param_size_stride)),
PushConstantDataInfo(
&kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
PushConstantDataInfo(
&extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
PushConstantDataInfo(&out_params, sizeof(out_params)),
}));
push_constants = {
graph.logical_limits_pc_of(out),
graph.sizes_pc_of(in),
PushConstantDataInfo(
&kernel_param_size_stride, sizeof(kernel_param_size_stride)),
PushConstantDataInfo(
&kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
PushConstantDataInfo(
&extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
PushConstantDataInfo(&out_params, sizeof(out_params)),
};
} else {
graph.execute_nodes().emplace_back(new DispatchNode(
graph,
shader,
wg_size,
graph.create_local_wg_size(wg_size),
// Inputs and Outputs
{{out, vkapi::MemoryAccessType::WRITE},
{{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
// Shader params buffers
{
t_out->logical_limits_ubo(),
t_in->sizes_ubo(),
graph.create_params_buffer(kernel_params),
graph.create_params_buffer(extra_params),
graph.create_params_buffer(out_params),
},
// Specialization Constants
{},
// Resizing Logic
resize_conv2d_node,
{weight_data, stride, padding, dilation, transposed, output_padding}));
param_buffers = {
t_out->logical_limits_ubo(),
t_in->sizes_ubo(),
graph.create_params_buffer(kernel_params),
graph.create_params_buffer(extra_params),
graph.create_params_buffer(out_params),
};
}

graph.execute_nodes().emplace_back(new DispatchNode(
graph,
shader,
wg_size,
graph.create_local_wg_size(wg_size),
// Inputs and Outputs
{{out, vkapi::MemoryAccessType::WRITE},
{{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
// Shader params buffers
param_buffers,
// Specialization Constants
{},
// Resizing Logic
resize_conv2d_node,
{weight_data, stride, padding, dilation, transposed, output_padding},
push_constants));
}

void add_conv1d_node(
9 changes: 7 additions & 2 deletions backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
@@ -73,13 +73,18 @@ void add_q_8w_linear_node(
auto viewFn = VK_GET_OP_FN("aten.view_copy.default");
ValueRef mat1_W_packed = mat1;
ValueRef out_W_packed = out;
// Create temporary tensors to store the width packed versions of mat1 and out
TmpTensor mat1_tmp(
&graph, graph.sizes_of(mat1), graph.dtype_of(mat1), utils::kWidthPacked);
TmpTensor out_tmp(
&graph, graph.sizes_of(out), graph.dtype_of(out), utils::kWidthPacked);
if (!graph.is_buffer_storage(out) &&
graph.packed_dim_of(mat1) != WHCN::kWidthDim) {
// Ensure mat1 is width packed
mat1_W_packed = graph.add_tensor_like(mat1, utils::kWidthPacked);
mat1_W_packed = mat1_tmp;
viewFn(graph, {mat1, graph.add_none(), mat1_W_packed});
// Ensure out is packed correctly
out_W_packed = graph.add_tensor_like(out, utils::kWidthPacked);
out_W_packed = out_tmp;
}
ValueRef q_mat2 = prepack_standard(
graph, q_mat2_data, graph.storage_type_of(out), utils::kWidthPacked);
