Skip to content

Commit

Permalink
Basic implementation completed
Browse files Browse the repository at this point in the history
  • Loading branch information
szaman19 committed Mar 17, 2023
1 parent d1d1138 commit 69269a7
Show file tree
Hide file tree
Showing 2 changed files with 377 additions and 88 deletions.
216 changes: 208 additions & 8 deletions src/layers/regularizers/distconv/distconv_layer_norm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,10 @@ void LayerNormalization ::calculate_forward_stats(
util::MPIRootPrintStreamInfo() << "WARNING: EMPTY INPUT FOUND \n";
return; // no op for empty inputs
}

const auto& input_dims = input.get_local_shape();
const auto& statistics_dims = statistics.get_local_shape();

const auto local_num_samples = input_0_dims[3];

const auto global_num_samples = statistics_dims[3];

const auto local_sample_size = std::accumulate(input_dims.begin(),
input_dims.end() - 1,
1,
Expand All @@ -61,7 +57,7 @@ void LayerNormalization ::calculate_forward_stats(
local_sample_size);

LocalMat local_statistics(2,
local_num_samples,
global_num_samples,
statistics.get_local_shape(),
2);

Expand Down Expand Up @@ -101,7 +97,79 @@ void LayerNormalization::apply_normalization(
const DCTensor<Allocator>& input,
const DCTensor<Allocator>& statistics,
DCTensor<Allocator>& output)
{}
{
const auto& input_dims = input.get_local_shape();
const auto& statistics_dims = statistics.get_local_shape();
const auto local_num_samples = input_0_dims[3];
const auto global_num_samples = statistics_dims[3];
const auto local_sample_size = std::accumulate(input_dims.begin(),
input_dims.end() - 1,
1,
std::multiplies<int>());

using LocalMat = El::Matrix<DataType, El::Device::GPU>;
const LocalMat local_input(local_sample_size,
local_num_samples,
input.get_buffer(),
local_sample_size);

const LocalMat local_statistics(2,
global_num_samples,
statistics.get_local_shape(),
2);

LocalMat local_output(local_sample_size,
local_num_samples,
output.get_buffer(),
local_sample_size);

const auto local_means = El::View(local_statistics, El::IR(0), El::ALL);
const auto local_vars = El::View(local_statistics, El::IR(1), El::ALL);
{
using namespace hydrogen;
auto sync_info = gpu::get_sync_info(local_statistics);
constexpr size_t block_size = 256;
dim3 block_dims, grid_dims;
block_dims.x = block_size;
grid_dims.x = (local_num_samples + block_size - 1) / block_size;
hydrogen::gpu::LaunchKernel(layer_norm_fp_statistics_kernel<TensorDataType>,
grid_dims,
block_dims,
0,
sync_info,
sample_size,
local_num_samples,
local_means.Buffer(),
local_means.LDim(),
local_vars.Buffer(),
local_vars.LDim());

auto multisync = El::MakeMultiSync(gpu::get_sync_info(local_output),
gpu::get_sync_info(local_statistics),
gpu::get_sync_info(local_input));
constexpr size_t block_size = 256;
dim3 block_dims, grid_dims;
block_dims.x = block_size;
grid_dims.x = (local_sample_size + block_size - 1) / block_size;
grid_dims.y = local_num_samples;
hydrogen::gpu::LaunchKernel(layer_norm_fp_output_kernel<TensorDataType>,
grid_dims,
block_dims,
0,
multisync,
local_num_samples,
local_sample_size,
epsilon,
local_input.LockedBuffer(),
local_input.LDim(),
local_output.Buffer(),
local_output.LDim(),
local_means.LockedBuffer(),
local_means.LDim(),
local_vars.LockedBuffer(),
local_vars.LDim());
}
}

template <typename Backend, typename DataType>
template <typename Allocator>
Expand All @@ -110,15 +178,147 @@ void LayerNormalization::calculate_backward_stats(
const DCTensor<Allocator>& output_grad,
const DCTensor<Allocator>& statistics,
DCTensor<Allocator>& statistics_grad)
{}
{
const auto& input_dims = input.get_local_shape();
const auto& statistics_dims = statistics.get_local_shape();
const auto local_num_samples = input_0_dims[3];
const auto global_num_samples = statistics_dims[3];
const auto local_sample_size = std::accumulate(input_dims.begin(),
input_dims.end() - 1,
1,
std::multiplies<int>());
using LocalMat = El::Matrix<DataType, El::Device::GPU>;
const LocalMat local_input(local_sample_size,
local_num_samples,
input.get_buffer(),
local_sample_size);
const LocalMat local_output_grad(local_sample_size,
local_num_samples,
output_grad.get_buffer(),
local_sample_size);

const LocalMat local_statistics(2,
global_num_samples,
statistics.get_local_shape(),
2);

LocalMat local_statistics_grad(2,
global_num_samples,
statistics_grad.get_buffer(),
2);
{
using namespace hydrogen;
auto multisync =
El::MakeMultiSync(gpu::get_sync_info(local_statistics_grad),
gpu::get_sync_info(local_output_grad),
gpu::get_sync_info(local_statistics),
gpu::get_sync_info(local_input));
constexpr size_t block_size = 256;
dim3 block_dims, grid_dims;
block_dims.x = block_size;
grid_dims.x = (local_sample_size + block_size - 1) / block_size;
grid_dims.y = local_num_samples;
hydrogen::gpu::LaunchKernel(
layer_norm_bp_statistics_grad_kernel<block_size, TensorDataType>,
grid_dims,
block_dims,
0,
multisync,
local_num_samples,
local_sample_size,
m_epsilon,
local_input.LockedBuffer(),
local_input.LDim(),
local_output_grad.LockedBuffer(),
local_output_grad.LDim(),
local_means.LockedBuffer(),
local_means.LDim(),
local_vars.LockedBuffer(),
local_vars.LDim(),
local_means_grad.Buffer(),
local_means_grad.LDim(),
local_vars_grad.Buffer(),
local_vars_grad.LDim());
}
}

/// Compute the gradient of the loss with respect to the layer input.
///
/// Wraps the local GPU buffers of the tensors in non-owning Hydrogen
/// matrices (column-major, one sample per column) and launches
/// `layer_norm_bp_input_grad_kernel` on them.
///
/// @param input            Layer input from the forward pass.
/// @param output_grad      Gradient w.r.t. the layer output.
/// @param statistics       Per-sample statistics; viewed as a 2-row matrix
///                         with row 0 = means and row 1 = variances.
/// @param statistics_grad  Gradients w.r.t. the statistics, same layout
///                         (row 0 = mean gradients, row 1 = variance
///                         gradients).
/// @param input_grad       Output: gradient w.r.t. the layer input.
template <typename Backend, typename DataType>
template <typename Allocator>
void LayerNormalization<Backend, DataType>::apply_grad(
  const DCTensor<Allocator>& input,
  const DCTensor<Allocator>& output_grad,
  const DCTensor<Allocator>& statistics,
  const DCTensor<Allocator>& statistics_grad,
  DCTensor<Allocator>& input_grad)
{
  const auto& input_dims = input.get_local_shape();
  const auto& statistics_dims = statistics.get_local_shape();

  // The sample dimension is read from index 3, i.e. the tensors are treated
  // as 4-D with the sample dimension last.
  const auto local_num_samples = static_cast<std::size_t>(input_dims[3]);
  const auto global_num_samples = static_cast<std::size_t>(statistics_dims[3]);

  // Number of locally stored entries per sample: product of every local
  // dimension except the last. Accumulated in std::size_t so that large
  // tensors do not overflow an int.
  const auto local_sample_size =
    std::accumulate(input_dims.begin(),
                    input_dims.end() - 1,
                    std::size_t{1},
                    std::multiplies<std::size_t>());

  // A zero-sized grid is not a valid CUDA launch configuration, so there is
  // nothing to do when this rank holds no data.
  if (local_num_samples == 0 || local_sample_size == 0) {
    return;
  }

  // Entries per sample over the whole (unpartitioned) tensor.
  // NOTE(review): assumes input.get_shape() returns the global tensor shape
  // and that the kernel's leading `sample_size` argument is this global
  // count -- confirm against the tensor class and the kernel signature.
  const auto& global_input_dims = input.get_shape();
  const auto sample_size =
    std::accumulate(global_input_dims.begin(),
                    global_input_dims.end() - 1,
                    std::size_t{1},
                    std::multiplies<std::size_t>());

  // Non-owning matrix wrappers around the tensors' local buffers.
  using LocalMat = El::Matrix<DataType, El::Device::GPU>;
  const LocalMat local_input(local_sample_size,
                             local_num_samples,
                             input.get_buffer(),
                             local_sample_size);
  const LocalMat local_output_grad(local_sample_size,
                                   local_num_samples,
                                   output_grad.get_buffer(),
                                   local_sample_size);
  const LocalMat local_statistics(2,
                                  global_num_samples,
                                  statistics.get_buffer(),
                                  2);
  const LocalMat local_statistics_grad(2,
                                       global_num_samples,
                                       statistics_grad.get_buffer(),
                                       2);
  LocalMat local_input_grad(local_sample_size,
                            local_num_samples,
                            input_grad.get_buffer(),
                            local_sample_size);

  // Read-only row views: row 0 holds the means, row 1 the variances.
  const auto local_means =
    El::LockedView(local_statistics, El::IR(0), El::ALL);
  const auto local_vars =
    El::LockedView(local_statistics, El::IR(1), El::ALL);
  const auto local_means_grad =
    El::LockedView(local_statistics_grad, El::IR(0), El::ALL);
  const auto local_vars_grad =
    El::LockedView(local_statistics_grad, El::IR(1), El::ALL);

  // The matrix being written goes first, followed by every matrix the
  // kernel reads.
  auto multisync =
    El::MakeMultiSync(hydrogen::gpu::get_sync_info(local_input_grad),
                      hydrogen::gpu::get_sync_info(local_output_grad),
                      hydrogen::gpu::get_sync_info(local_statistics_grad),
                      hydrogen::gpu::get_sync_info(local_statistics),
                      hydrogen::gpu::get_sync_info(local_input));

  // Launch grid: x covers the entries of one sample in blocks of
  // `block_size` threads, y has one row of blocks per local sample.
  constexpr std::size_t block_size = 256;
  dim3 block_dims, grid_dims;
  block_dims.x = block_size;
  grid_dims.x = (local_sample_size + block_size - 1) / block_size;
  grid_dims.y = local_num_samples;

  hydrogen::gpu::LaunchKernel(layer_norm_bp_input_grad_kernel<DataType>,
                              grid_dims,
                              block_dims,
                              0,
                              multisync,
                              sample_size,
                              local_num_samples,
                              local_sample_size,
                              m_epsilon,
                              local_input.LockedBuffer(),
                              local_input.LDim(),
                              local_output_grad.LockedBuffer(),
                              local_output_grad.LDim(),
                              local_input_grad.Buffer(),
                              local_input_grad.LDim(),
                              local_means.LockedBuffer(),
                              local_means.LDim(),
                              local_vars.LockedBuffer(),
                              local_vars.LDim(),
                              local_means_grad.LockedBuffer(),
                              local_means_grad.LDim(),
                              local_vars_grad.LockedBuffer(),
                              local_vars_grad.LDim());
}

#define ETI(T, Backend) \
template class LayerNormalization<Backend, T>; \
Expand Down
Loading

0 comments on commit 69269a7

Please sign in to comment.