Skip to content

Commit

Permalink
Fix handling of thread features for scalars in Anderson2021 (#7726)
Browse files Browse the repository at this point in the history
* Fix handling of thread features for scalars

* Remove unneeded change
  • Loading branch information
aekul authored Jul 31, 2023
1 parent fca8d96 commit f54bc08
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 48 deletions.
119 changes: 79 additions & 40 deletions src/autoschedulers/anderson2021/LoopNest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,7 @@ double LoopNest::compute_local_mem_stride(double stride, double bytes) const {

// Get the stride over storage_node's storage and its element-wise stride for
// a unit increment in the given thread loops
Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo &thread_info, bool verbose) const {
Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo *thread_info, bool verbose) const {
internal_assert(innermost_storage_dim >= 0);

if (verbose) {
Expand Down Expand Up @@ -756,7 +756,7 @@ Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage
}

Strides strides{storage_strides};
for (const auto &thread_loop_var : thread_info.loop_vars) {
for (const auto &thread_loop_var : thread_info->loop_vars) {
int loop_index = stage->get_loop_index_from_var(thread_loop_var);
bool loop_index_exists = loop_index >= 0;

Expand Down Expand Up @@ -843,7 +843,8 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
return;
}

const ThreadInfo &thread_info = *gpu_loop_info.thread_info;
internal_assert(gpu_loop_info.thread_info != nullptr);
const ThreadInfo *thread_info = gpu_loop_info.thread_info;
bool is_shared_mem = consumer_site.gpu_store_memory_type == GPUMemoryType::Shared;

size_t actual_vector_dim = get_actual_vector_dim(consumer_store_bounds);
Expand Down Expand Up @@ -967,18 +968,29 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
}

template<typename T>
void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose) const {
void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose) const {
int bytes_per_access = node->bytes_per_point;

// If the consumer is a scalar and is compute_root, then it will not be
// surrounded by a gpu_threads loop, in which case thread_info will be null.
// In this case, there is no need to compute the thread/warp-related details
// below, because only a single point is being computed.
if (!thread_info && is_scalar()) {
mem_info.add_access_info(num_requests_per_warp, 1, bytes_per_access);
return;
}

internal_assert(thread_info != nullptr);

Strides strides = compute_strides(jac, innermost_dim, node, store_bounds, thread_info, verbose);

size_t dimensions = thread_info.loop_indices.size();
size_t dimensions = thread_info->loop_indices.size();
strides.dump(verbose);

int bytes_per_access = node->bytes_per_point;

{
int num_requests = thread_info.num_regular_active_warps_per_block * num_requests_per_warp;
int num_requests = thread_info->num_regular_active_warps_per_block * num_requests_per_warp;
Accumulator<T> accumulator(bytes_per_access, dimensions, strides, verbose);
thread_info.for_each_thread_id_in_first_warp(accumulator);
thread_info->for_each_thread_id_in_first_warp(accumulator);

accumulator.add_access_info(
num_requests,
Expand All @@ -987,21 +999,21 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const

if (verbose) {
aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n";
aslog(2) << "num_regular_warps = " << thread_info.num_regular_active_warps_per_block << "\n";
aslog(2) << "num_regular_warps = " << thread_info->num_regular_active_warps_per_block << "\n";
}
}

if (!thread_info.has_tail_warp) {
if (!thread_info->has_tail_warp) {
return;
}

if (verbose) {
aslog(2) << "\nBEGIN tail warp\n";
aslog(2) << "# threads in tail warp: " << thread_info.num_threads_in_final_warp << "\n";
aslog(2) << "# threads in tail warp: " << thread_info->num_threads_in_final_warp << "\n";
}

Accumulator<T> accumulator(bytes_per_access, dimensions, strides, verbose);
thread_info.for_each_thread_id_in_tail_warp(accumulator);
thread_info->for_each_thread_id_in_tail_warp(accumulator);

accumulator.add_access_info(
num_requests_per_warp,
Expand All @@ -1013,18 +1025,27 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const
}
}

template void LoopNest::compute_num_mem_accesses_per_block<GlobalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<GlobalMem> &mem_info, bool verbose) const;
template void LoopNest::compute_num_mem_accesses_per_block<GlobalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<GlobalMem> &mem_info, bool verbose) const;

template void LoopNest::compute_num_mem_accesses_per_block<SharedMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<SharedMem> &mem_info, bool verbose) const;
template void LoopNest::compute_num_mem_accesses_per_block<SharedMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<SharedMem> &mem_info, bool verbose) const;

template<>
void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<LocalMem> &mem_info, bool verbose) const {
void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<LocalMem> &mem_info, bool verbose) const {
int bytes_per_access = node->bytes_per_point;

// If the consumer is a scalar and is compute_root, then it will not be
// surrounded by a gpu_threads loop, in which case thread_info will be null.
// In this case, there is no need to compute the thread/warp-related details
// below, because only a single point is being computed.
if (!thread_info && is_scalar()) {
mem_info.add_access_info(num_requests_per_warp, 1, bytes_per_access);
return;
}

{
int num_requests = thread_info.num_regular_active_warps_per_block * num_requests_per_warp;
int num_requests = thread_info->num_regular_active_warps_per_block * num_requests_per_warp;
LocalAccessAccumulator accumulator(bytes_per_access, verbose);
thread_info.for_each_thread_id_in_first_warp(accumulator);
thread_info->for_each_thread_id_in_first_warp(accumulator);

accumulator.add_access_info(
num_requests,
Expand All @@ -1033,21 +1054,21 @@ void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &

if (verbose) {
aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n";
aslog(2) << "num_regular_warps = " << thread_info.num_regular_active_warps_per_block << "\n";
aslog(2) << "num_regular_warps = " << thread_info->num_regular_active_warps_per_block << "\n";
}
}

if (!thread_info.has_tail_warp) {
if (!thread_info->has_tail_warp) {
return;
}

if (verbose) {
aslog(2) << "\nBEGIN tail warp\n";
aslog(2) << "# threads in tail warp: " << thread_info.num_threads_in_final_warp << "\n";
aslog(2) << "# threads in tail warp: " << thread_info->num_threads_in_final_warp << "\n";
}

LocalAccessAccumulator accumulator(bytes_per_access, verbose);
thread_info.for_each_thread_id_in_tail_warp(accumulator);
thread_info->for_each_thread_id_in_tail_warp(accumulator);

accumulator.add_access_info(
num_requests_per_warp,
Expand All @@ -1074,19 +1095,19 @@ std::pair<double, double> LoopNest::compute_local_mem_store_features(const LoadJ
}

template<typename T>
MemInfoType<T> LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const {
MemInfoType<T> LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const {
MemInfoType<T> mem_info;

compute_num_mem_accesses_per_block<T>(jac, node, consumer_store_bounds, thread_info, consumer_innermost_dim, serial_loop_extents, mem_info, verbose);
return mem_info;
}

template MemInfoType<GlobalMem> LoopNest::compute_mem_store_info<GlobalMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const;
template MemInfoType<GlobalMem> LoopNest::compute_mem_store_info<GlobalMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const;

template MemInfoType<SharedMem> LoopNest::compute_mem_store_info<SharedMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const;
template MemInfoType<SharedMem> LoopNest::compute_mem_store_info<SharedMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const;

template<typename T>
void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType<T> &mem_info, double points_accessed_per_thread, bool verbose) const {
void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo *thread_info, MemInfoType<T> &mem_info, double points_accessed_per_thread, bool verbose) const {
if (producer_has_been_scheduled) {
compute_num_mem_accesses_per_block<T>(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose);

Expand Down Expand Up @@ -1115,7 +1136,7 @@ template void LoopNest::compute_mem_load_features<GlobalMem>(const LoadJacobian
const FunctionDAG::Node *node,
const Bound &producer_store_bounds,
bool producer_has_been_scheduled,
const ThreadInfo &thread_info,
const ThreadInfo *thread_info,
MemInfoType<GlobalMem> &mem_info,
double points_accessed_per_thread,
bool verbose) const;
Expand All @@ -1125,7 +1146,7 @@ template void LoopNest::compute_mem_load_features<SharedMem>(const LoadJacobian
const FunctionDAG::Node *node,
const Bound &producer_store_bounds,
bool producer_has_been_scheduled,
const ThreadInfo &thread_info,
const ThreadInfo *thread_info,
MemInfoType<SharedMem> &mem_info,
double points_accessed_per_thread,
bool verbose) const;
Expand All @@ -1136,7 +1157,7 @@ void LoopNest::compute_mem_load_features<LocalMem>(const LoadJacobian &jac,
const FunctionDAG::Node *node,
const Bound &producer_store_bounds,
bool producer_has_been_scheduled,
const ThreadInfo &thread_info,
const ThreadInfo *thread_info,
MemInfoType<LocalMem> &mem_info,
double points_accessed_per_thread,
bool verbose) const {
Expand Down Expand Up @@ -2163,11 +2184,13 @@ void LoopNest::compute_features(const FunctionDAG &dag,
// The store_at location of the consumer
const auto *consumer_store_site = innermost ? parent : consumer_site.store;

bool inner_serial_loop_extents_computed = false;
std::vector<int64_t> inner_serial_loop_extents;

if (innermost && !stage->store_jacobian->empty()) {
const auto &bounds = consumer_site.store->get_bounds(stage->node);
inner_serial_loop_extents = gpu_loop_info.get_inner_serial_loop_extents(this);
inner_serial_loop_extents_computed = true;
auto store_jac = *stage->store_jacobian;

compute_gpu_store_features(
Expand Down Expand Up @@ -2223,10 +2246,16 @@ void LoopNest::compute_features(const FunctionDAG &dag,
for (const auto &j : e->load_jacobians) {
jacobians.emplace_back(j, e->producer);

if (!inner_serial_loop_extents_computed && !is_scalar()) {
inner_serial_loop_extents = gpu_loop_info.get_inner_serial_loop_extents(this);
inner_serial_loop_extents_computed = true;
}

// Thread loops may not be innermost so in the
// Jacobians we need to account for the stride
// of the inner loops
thread_jacobians.emplace_back(j * inner_serial_loop_extents, e->producer);
// of the inner loops (but only for non-scalars,
// since scalars never have inner serial loops)
thread_jacobians.emplace_back(is_scalar() ? j : j * inner_serial_loop_extents, e->producer);
}
} else {
// Consumer was inlined. Multiply the Jacobians to look through it.
Expand Down Expand Up @@ -2334,7 +2363,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
e->producer,
producer_store_bounds,
producer_has_been_scheduled,
*gpu_loop_info.thread_info,
gpu_loop_info.thread_info,
shared_mem_loads,
points_accessed,
verbose);
Expand Down Expand Up @@ -2365,7 +2394,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
e->producer,
producer_store_bounds,
producer_has_been_scheduled,
*gpu_loop_info.thread_info,
gpu_loop_info.thread_info,
global_mem_loads,
points_accessed,
verbose);
Expand Down Expand Up @@ -2405,7 +2434,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
e->producer,
producer_store_bounds,
producer_has_been_scheduled,
*gpu_loop_info.thread_info,
gpu_loop_info.thread_info,
local_mem_loads,
points_accessed,
verbose);
Expand Down Expand Up @@ -2678,10 +2707,20 @@ void LoopNest::compute_features(const FunctionDAG &dag,
inlined_feat.outer_parallelism = parallelism;
inlined_feat.num_blocks = parallelism;

internal_assert(gpu_loop_info.thread_info);
auto num_warps = it.value() * gpu_loop_info.total_serial_extents() * gpu_loop_info.thread_info->num_warps_per_block * inlined_feat.num_blocks;
inlined_feat.num_warps_per_block += num_warps;
inlined_feat.num_threads_per_block += gpu_loop_info.thread_info->num_threads;
internal_assert(is_scalar() || gpu_loop_info.thread_info);

auto num_warps_per_block = it.value();
auto num_threads_per_block = 1;

// If the func is being inlined into a scalar, then the scalar will not
// be surrounded by block/thread/serial loops, so there's no need to take
// them into account when computing these features.
if (!is_scalar()) {
num_warps_per_block *= gpu_loop_info.total_serial_extents() * gpu_loop_info.thread_info->num_warps_per_block * inlined_feat.num_blocks;
num_threads_per_block = gpu_loop_info.thread_info->num_threads;
}
inlined_feat.num_warps_per_block += num_warps_per_block;
inlined_feat.num_threads_per_block += num_threads_per_block;
double points_computed_per_thread = it.value() * feat.points_computed_per_thread;
inlined_feat.points_computed_per_thread += points_computed_per_thread;

Expand All @@ -2695,9 +2734,9 @@ void LoopNest::compute_features(const FunctionDAG &dag,

intermediate.innermost_pure_loop_extent = feat.innermost_pure_loop_extent;
intermediate.outer_parallelism = parallelism;
intermediate.num_warps_per_block = num_warps;
intermediate.num_warps_per_block = num_warps_per_block;

intermediate.num_threads_per_block = gpu_loop_info.thread_info->num_threads;
intermediate.num_threads_per_block = num_threads_per_block;
intermediate.points_computed_per_thread = points_computed_per_thread;
}
}
Expand Down
Loading

0 comments on commit f54bc08

Please sign in to comment.