Commit 4b79199
Save 16 KiB by moving part of optimized add/sub to .cpp
Noticed that this function has a significant template-independent part, so I factored it out. Ran test/build_optimized_size_test.sh before and after; here is the output of `size cmake-out/size_test_all_optimized_ops` before and after (the other binaries didn't change).

Before:

```
__TEXT    __DATA   __OBJC   others       dec          hex
4456448   98304    0        4296359936   4300914688   1005ac000   cmake-out/test/size_test_all_optimized_ops
```

After:

```
__TEXT    __DATA   __OBJC   others       dec          hex
4440064   98304    0        4296359936   4300898304   1005a8000   cmake-out/test/size_test_all_optimized_ops
```

That is a difference of 16384 bytes (16 KiB).

ghstack-source-id: bc550ec
ghstack-comment-id: 2937733037
Pull-Request-resolved: #11346
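The size win comes from a standard outlining pattern: code in a header template that does not depend on the template parameter is re-emitted for every instantiation (one copy per dtype here), while a non-template function defined in a .cpp is emitted exactly once. Below is a minimal sketch of that pattern under hypothetical names (`widget_ops.h`/`widget_ops.cpp`, `plan_work`, `scale_all`), not the actual ExecuTorch code:

```cpp
// widget_ops.h -- hypothetical header illustrating the pattern.
#pragma once
#include <cstddef>

namespace example {

// Template-independent part: declared here, defined once in widget_ops.cpp,
// so the compiler emits it a single time regardless of how many element
// types instantiate scale_all below.
std::size_t plan_work(std::size_t n);

// Template-dependent part: only the typed loop stays in the header.
template <typename T>
void scale_all(T* data, std::size_t n, T factor) {
  const std::size_t count = plan_work(n);  // shared, non-template planning
  for (std::size_t i = 0; i < count; ++i) {
    data[i] *= factor;
  }
}

}  // namespace example
```

```cpp
// widget_ops.cpp -- hypothetical source file with the single definition.
#include "widget_ops.h"

namespace example {

std::size_t plan_work(std::size_t n) {
  // Validation / planning logic that would otherwise be duplicated in
  // every template instantiation.
  return n;
}

}  // namespace example
```

In this commit, `internal::plan_broadcast_elementwise` plays the role of the outlined non-template helper, and the templated `handle_broadcast_elementwise` keeps only the dtype-dependent vectorized map.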
1 parent: 450d1f9

File tree: 3 files changed, +88 −46 lines


kernels/optimized/cpu/binary_ops.cpp

Lines changed: 59 additions & 0 deletions
```diff
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/optimized/cpu/binary_ops.h>
+
+namespace torch::executor::internal {
+std::optional<BroadcastElementwisePlan> plan_broadcast_elementwise(
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    const Tensor& b,
+    Tensor& out,
+    const ElementwiseOptimizedPath selected_optimized_path) {
+  BroadcastElementwisePlan plan;
+  if ((selected_optimized_path ==
+       ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) ||
+      (selected_optimized_path ==
+       ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments)) {
+    plan.lhs = &b;
+    plan.rhs = &a;
+  } else {
+    // Catch failure to update logic when adding new broadcasting possibility.
+    ET_DCHECK(
+        (selected_optimized_path ==
+         ElementwiseOptimizedPath::kBroadcast2dBy1d) ||
+        (selected_optimized_path ==
+         ElementwiseOptimizedPath::kBroadcastNdByNd));
+    plan.lhs = &a;
+    plan.rhs = &b;
+  }
+  auto error = resize_tensor(out, plan.lhs->sizes());
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      error == Error::Ok,
+      InvalidArgument,
+      std::nullopt,
+      "Failed to resize output tensor.");
+  plan.outer_size = 1;
+  if ((selected_optimized_path == ElementwiseOptimizedPath::kBroadcastNdByNd) ||
+      (selected_optimized_path ==
+       ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments)) {
+    int32_t broadcast_dim = internal::get_broadcast_dim(*plan.lhs, *plan.rhs);
+    int32_t broadcast_dim_lhs = plan.lhs->dim() + broadcast_dim;
+    auto normalized_tensor_size_lhs =
+        get_normalized_tensor_size(*plan.lhs, broadcast_dim_lhs);
+    plan.outer_size = normalized_tensor_size_lhs[0];
+    plan.broadcast_size = normalized_tensor_size_lhs[1];
+    plan.inner_size = normalized_tensor_size_lhs[2];
+  } else {
+    plan.broadcast_size = plan.lhs->sizes()[plan.lhs->dim() - 2];
+    plan.inner_size = plan.lhs->sizes()[plan.lhs->dim() - 1];
+  }
+  return plan;
+}
+} // namespace torch::executor::internal
```

kernels/optimized/cpu/binary_ops.h

Lines changed: 28 additions & 46 deletions
```diff
@@ -13,6 +13,8 @@
 #include <executorch/kernels/portable/cpu/util/broadcast_indexes_range.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
+#include <optional>
+
 namespace torch {
 namespace executor {
 enum class ElementwiseOptimizedPath {
@@ -206,6 +208,23 @@ Tensor& handle_last_dim_broadcast_elementwise(
   return out;
 }
 
+namespace internal {
+struct BroadcastElementwisePlan {
+  const Tensor* lhs;
+  const Tensor* rhs;
+  int64_t outer_size;
+  int64_t broadcast_size;
+  int64_t inner_size;
+};
+
+std::optional<BroadcastElementwisePlan> plan_broadcast_elementwise(
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    const Tensor& b,
+    Tensor& out,
+    const ElementwiseOptimizedPath selected_optimized_path);
+} // namespace internal
+
 template <typename CTYPE, typename Op>
 Tensor& handle_broadcast_elementwise(
     KernelRuntimeContext& ctx,
@@ -223,56 +242,19 @@ Tensor& handle_broadcast_elementwise(
         ctx, vec_fun, a, b, out, selected_optimized_path);
   }
 
-  const Tensor* lhs;
-  const Tensor* rhs;
-  if ((selected_optimized_path ==
-       ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) ||
-      (selected_optimized_path ==
-       ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments)) {
-    lhs = &b;
-    rhs = &a;
-  } else {
-    // Catch failure to update logic when adding new broadcasting possibility.
-    ET_DCHECK(
-        (selected_optimized_path ==
-         ElementwiseOptimizedPath::kBroadcast2dBy1d) ||
-        (selected_optimized_path ==
-         ElementwiseOptimizedPath::kBroadcastNdByNd));
-    lhs = &a;
-    rhs = &b;
-  }
-  auto error = resize_tensor(out, lhs->sizes());
-  ET_KERNEL_CHECK_MSG(
-      ctx,
-      error == Error::Ok,
-      InvalidArgument,
-      out,
-      "Failed to resize output tensor.");
-  int64_t outer_size = 1;
-  int64_t broadcast_size;
-  int64_t inner_size;
-  if ((selected_optimized_path == ElementwiseOptimizedPath::kBroadcastNdByNd) ||
-      (selected_optimized_path ==
-       ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments)) {
-    int32_t broadcast_dim = internal::get_broadcast_dim(*lhs, *rhs);
-    int32_t broadcast_dim_lhs = lhs->dim() + broadcast_dim;
-    auto normalized_tensor_size_lhs =
-        get_normalized_tensor_size(*lhs, broadcast_dim_lhs);
-    outer_size = normalized_tensor_size_lhs[0];
-    broadcast_size = normalized_tensor_size_lhs[1];
-    inner_size = normalized_tensor_size_lhs[2];
-  } else {
-    broadcast_size = lhs->sizes()[lhs->dim() - 2];
-    inner_size = lhs->sizes()[lhs->dim() - 1];
+  auto opt_plan = internal::plan_broadcast_elementwise(
+      ctx, a, b, out, selected_optimized_path);
+  if (!opt_plan) {
+    return out;
   }
   executorch::vec::broadcasting_map_3d_and_unsqueezed_3d<CTYPE, Op>(
       vec_fun,
       out.mutable_data_ptr<CTYPE>(),
-      lhs->const_data_ptr<CTYPE>(),
-      rhs->const_data_ptr<CTYPE>(),
-      outer_size,
-      broadcast_size,
-      inner_size);
+      opt_plan->lhs->const_data_ptr<CTYPE>(),
+      opt_plan->rhs->const_data_ptr<CTYPE>(),
+      opt_plan->outer_size,
+      opt_plan->broadcast_size,
+      opt_plan->inner_size);
   return out;
 }
 } // namespace executor
```

kernels/optimized/cpu/targets.bzl

Lines changed: 1 addition & 0 deletions
```diff
@@ -42,6 +42,7 @@ def define_common_targets():
 
     runtime.cxx_library(
         name = "binary_ops",
+        srcs = ["binary_ops.cpp"],
        exported_headers = ["binary_ops.h"],
        visibility = ["//executorch/kernels/optimized/cpu/...", "@EXECUTORCH_CLIENTS",],
        exported_deps = ["//executorch/runtime/core:core"],
```
