reduce: Align device of temp buffer with input tensor #736

Closed
wants to merge 1 commit
88 changes: 68 additions & 20 deletions src/ATen/native/xpu/sycl/Reduce.h
@@ -10,10 +10,14 @@
#include <ATen/native/xpu/sycl/MemoryAccessUtils.h>
#include <ATen/native/xpu/sycl/OffsetCalculator.h>
#include <c10/core/Allocator.h>
#include <c10/core/Stream.h>
#include <c10/macros/Macros.h>
#include <c10/xpu/XPUCachingAllocator.h>
#include <c10/xpu/XPUStream.h>
#include <comm/DeviceProperties.h>
#include <comm/SYCLContext.h>
#include <comm/XPUPair.h>
#include <sycl/queue.hpp>
#include <functional>
#include <iosfwd>
#include <type_traits>
@@ -25,6 +29,7 @@ namespace xpu {

using namespace at::xpu;
using at::detail::Array;
using namespace c10::xpu;

namespace detail {

@@ -246,9 +251,8 @@ struct ReduceConfig {
// inside of SG. It is a functional workaround (WA); we saw test-case
// failures on some platforms supporting SIMD8.
// https://github.com/intel/torch-xpu-ops/issues/698
auto max_sg_sz = syclMinSubGroupSize() == 8
? syclMinSubGroupSize()
: syclMaxSubGroupSize();
auto max_sg_sz = syclMinSubGroupSize() == 8 ? syclMinSubGroupSize()
: syclMaxSubGroupSize();
const int max_num_items = max_wg_sz / output_vec_size;
int dim0_pow2 = dim0 < max_num_items ? static_cast<int>(last_pow2(dim0))
: max_num_items;
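
The hunk above only reflows the SIMD8 workaround; the selected value is unchanged. As a rough illustration of the decision the comment describes, here is a hypothetical sketch, assuming syclMinSubGroupSize()/syclMaxSubGroupSize() boil down to querying the device's supported sub-group sizes (the helper name pick_sub_group_size is invented for illustration):

// Hypothetical sketch, not part of the diff: prefer SIMD8 when the device's
// smallest supported sub-group size is 8 (the workaround above); otherwise
// take the largest supported sub-group size.
#include <algorithm>
#include <sycl/sycl.hpp>

inline size_t pick_sub_group_size(const sycl::device& dev) {
  auto sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
  auto min_sz = *std::min_element(sizes.begin(), sizes.end());
  auto max_sz = *std::max_element(sizes.begin(), sizes.end());
  return min_sz == 8 ? min_sz : max_sz;
}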
@@ -1074,7 +1078,10 @@ class AccumulationBuffer {
numerator_ = 1;
denominator_ = 1;
} else {
buffer_ = c10::GetAllocator(kXPU)->allocate(size);
// buffer_ = c10::GetAllocator(kXPU)->allocate(size);
auto& allocator = *c10::xpu::XPUCachingAllocator::get();
buffer_ = allocator.allocate(size);

acc_ptr_ = (char*)buffer_.get();
numerator_ = acc_t_size;
denominator_ = out_t_size;
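
This hunk is the core of the change: the accumulation buffer is no longer obtained from c10::GetAllocator(kXPU) but from the XPU caching allocator, which, as far as this editor understands it, allocates on the XPU device that is current at call time, so the temporary buffer follows the input tensor's device. A minimal standalone sketch using only the calls that appear in the diff (the free function allocate_acc_buffer is invented for illustration; at::DataPtr in the diff refers to c10::DataPtr):

// Minimal sketch: allocate a temporary buffer through the XPU caching
// allocator so it lands on the currently active XPU device.
#include <c10/core/Allocator.h>
#include <c10/xpu/XPUCachingAllocator.h>

c10::DataPtr allocate_acc_buffer(size_t size) {
  auto& allocator = *c10::xpu::XPUCachingAllocator::get();
  // The returned DataPtr owns the allocation and releases it back to the
  // caching allocator when it goes out of scope.
  return allocator.allocate(size);
}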
@@ -1149,6 +1156,9 @@ inline void gpu_reduce_kernel(
iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 &&
iter.noutputs() >= 1);

std::cout << "current device 1 " << (int)current_device() << "\n"
<< std::endl;

using traits = function_traits<decltype(&ops_t::reduce)>;
using arg_t = typename traits::template arg<0>::type;
static constexpr bool can_accumulate_in_output =
@@ -1179,7 +1189,8 @@ inline void gpu_reduce_kernel(
}
acc_buf_ptr = owned_buf_ptr.get();
}

std::cout << "current device 2 " << (int)current_device() << "\n"
<< std::endl;
if (!can_use_32bit_indexing) {
for (auto& sub_iter : iter.with_32bit_indexing()) {
int64_t sub_iter_base_idx = sub_iter.view_offsets()[0];
@@ -1189,7 +1200,8 @@
}
return;
}

std::cout << "current device 3 " << (int)current_device() << "\n"
<< std::endl;
char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1);
char* out_data = (char*)iter.data_ptr(0);
const auto noutputs = iter.noutputs();
@@ -1201,6 +1213,8 @@
}
char* acc_data = acc_buf_ptr->get_acc_slice(out_data);

std::cout << "current device 4 " << (int)current_device() << "\n"
<< std::endl;
// Start by assuming that each thread handles a single output and all
// the inputs for that output.
int64_t num_outputs = iter.num_output_elements();
@@ -1214,6 +1228,8 @@
int64_t fastest_moving_stride;
bool reduction_on_fastest_striding_dimension;

std::cout << "current device 5 " << (int)current_device() << "\n"
<< std::endl;
if (iter.ndim() > 0) {
// Adjust group size to map group width to fastest changing dimension of
// the input tensor. This grants the best possible memory accessing
@@ -1251,7 +1267,8 @@ inline void gpu_reduce_kernel(
dim0 = 1;
dim1 = 1;
}

std::cout << "current device 6 " << (int)current_device() << "\n"
<< std::endl;
// We do vectorization to gain better memory access; there are two cases
// which we call "vectorize along input" and "vectorize along output". Note
// that the "input/output" here does not mean we are vectorizing load/store
@@ -1284,7 +1301,8 @@ inline void gpu_reduce_kernel(
dim0 /= config.output_vec_size;
}
}

std::cout << "current device 7 " << (int)current_device() << "\n"
<< std::endl;
// Adjust group_width and group_height
// Mapping to launch_reduce_kernel
using R = ReduceOp<scalar_t, ops_t, uint32_t, out_scalar_t, vt0>;
@@ -1302,7 +1320,8 @@
break;
}
}

std::cout << "current device 8 " << (int)current_device() << "\n"
<< std::endl;
int group_width = config.group_width;
int group_height = config.group_height;

@@ -1330,6 +1349,9 @@ inline void gpu_reduce_kernel(
config.output_mult[1] = config.split_output(group_height);
}

std::cout << "current device 9 " << (int)current_device() << "\n"
<< std::endl;

// We are still looking for a general routine to work out the target max WI
// number on the device. For now we use a catch-all configuration.
constexpr int min_values_per_item = 16;
@@ -1361,25 +1383,46 @@ inline void gpu_reduce_kernel(
if (config.groups_per_output > 1) {
config.input_mult[2] = config.split_input(config.groups_per_output);
}

std::cout << "current device 10 " << (int)current_device() << "\n"
<< std::endl;
}

Tensor buffer;
Tensor semaphores;
// Tensor buffer;
// Tensor semaphores;
at::DataPtr buffer;
at::DataPtr semaphores;

if (config.should_global_reduce()) {
// auto allocator = c10::xpu::XPUCachingAllocator::get();
buffer = at::empty(
config.global_memory_size(),
at::TensorOptions().dtype(kChar).device(kXPU));
semaphores = at::empty(
config.semaphore_size(), at::TensorOptions().dtype(kChar).device(kXPU));
// buffer = at::empty(
// config.global_memory_size(),
// at::TensorOptions().dtype(kChar).device(kXPU));
// semaphores = at::empty(
// config.semaphore_size(),
// at::TensorOptions().dtype(kChar).device(kXPU));
std::cout << "current device 11 " << (int)current_device() << "\n"
<< std::endl;
auto& allocator = *c10::xpu::XPUCachingAllocator::get();
buffer = allocator.allocate(config.global_memory_size());
semaphores = allocator.allocate(config.semaphore_size());

getCurrentSYCLQueue().memset(
(void*)buffer.get(), 0, config.global_memory_size());
getCurrentSYCLQueue().memset(
(void*)semaphores.get(), 0, config.semaphore_size());

at::detail::Array<char*, 1> data;
data[0] = (char*)semaphores.data_ptr();
// data[0] = (char*)semaphores.data_ptr();
data[0] = (char*)semaphores.get();
ReduceKernelEmptyFunctor fn;
int vec_size = at::native::memory::can_vectorize_up_to<decltype(fn)>(data);
auto ic = TrivialOffsetCalculator<traits::arity>();
launch_vectorized_kernel(config.semaphore_size(), fn, data, ic, vec_size);
}

std::cout << "current device 12 " << (int)current_device() << "\n"
<< std::endl;
AT_ASSERT(can_use_32bit_indexing);
auto output_calc = make_output_calculator<uint32_t>(iter);
auto input_calc = make_input_calculator<uint32_t>(iter);
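
The hunk above applies the same idea to the global-reduce scratch space: instead of creating char tensors with at::empty(..., device(kXPU)), the buffer and semaphore storage become raw DataPtrs from the caching allocator, zero-filled with a memset on the current SYCL queue. A sketch of that pattern, under the assumption that getCurrentSYCLQueue() in the diff resolves to the queue of the current XPU stream (zeroed_scratch is an invented helper name):

// Sketch: zero-initialized scratch memory on the current XPU device,
// allocated without going through at::empty.
#include <c10/core/Allocator.h>
#include <c10/xpu/XPUCachingAllocator.h>
#include <c10/xpu/XPUStream.h>

c10::DataPtr zeroed_scratch(size_t bytes) {
  auto& allocator = *c10::xpu::XPUCachingAllocator::get();
  c10::DataPtr ptr = allocator.allocate(bytes);
  // memset is asynchronous with respect to the host; the DataPtr must stay
  // alive until the kernels that read the scratch memory have completed.
  c10::xpu::getCurrentXPUStream().queue().memset(ptr.get(), 0, bytes);
  return ptr;
}

In the diff the two DataPtrs are declared in the enclosing scope, so they stay alive until launch_reduce_kernel has been issued, matching how the Tensors behaved before.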
@@ -1392,15 +1435,20 @@
out_data,
out_data_extra,
acc_data,
buffer.defined() ? (void*)buffer.data_ptr() : nullptr,
buffer.defined() ? (int*)semaphores.data_ptr() : nullptr,
// buffer.defined() ? (void*)buffer.data_ptr() : nullptr,
// buffer.defined() ? (int*)semaphores.data_ptr() : nullptr,
(void*)buffer.get(),
(int*)semaphores.get(),
ident,
noutputs,
base_idx);
reduce.accumulate = iter.should_accumulate();
reduce.final_output = iter.is_final_output();

std::cout << "current device 13 " << (int)current_device() << "\n"
<< std::endl;
launch_reduce_kernel(config, reduce);
std::cout << "current device 14 " << (int)current_device() << "\n"
<< std::endl;
}

} // namespace xpu
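
A note on the dropped buffer.defined() guards in the ReduceOp construction above: a Tensor needed the defined() check, but a default-constructed c10::DataPtr already yields nullptr from get(), so passing buffer.get() and semaphores.get() unconditionally produces the same null pointers whenever should_global_reduce() was false. A tiny sketch of that property (null_when_unallocated is an invented name):

#include <cassert>
#include <c10/core/Allocator.h>

void null_when_unallocated() {
  // No allocation performed: DataPtr default-constructs to a null pointer.
  c10::DataPtr buffer;
  assert(buffer.get() == nullptr);
}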