Skip to content

Commit d59e2db

Browse files
committed
Basic implementation completed
1 parent dbff063 commit d59e2db

File tree

2 files changed

+342
-64
lines changed

2 files changed

+342
-64
lines changed

src/layers/regularizers/distconv/distconv_layer_norm.cu

+208-8
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,10 @@ void LayerNormalization ::calculate_forward_stats(
4141
util::MPIRootPrintStreamInfo() << "WARNING: EMPTY INPUT FOUND \n";
4242
return; // no op for empty inputs
4343
}
44-
4544
const auto& input_dims = input.get_local_shape();
4645
const auto& statistics_dims = statistics.get_local_shape();
47-
4846
const auto local_num_samples = input_0_dims[3];
49-
5047
const auto global_num_samples = statistics_dims[3];
51-
5248
const auto local_sample_size = std::accumulate(input_dims.begin(),
5349
input_dims.end() - 1,
5450
1,
@@ -61,7 +57,7 @@ void LayerNormalization ::calculate_forward_stats(
6157
local_sample_size);
6258

6359
LocalMat local_statistics(2,
64-
local_num_samples,
60+
global_num_samples,
6561
statistics.get_local_shape(),
6662
2);
6763

// apply_normalization (added in this commit; replaces the previous empty body).
// Wraps the local buffers of `input` and `output` as column-major El::Matrix
// objects of local_sample_size x local_num_samples, builds a 2-row matrix for
// `statistics`, and then issues two kernel launches:
//   1. layer_norm_fp_statistics_kernel on the two rows of the statistics matrix
//   2. layer_norm_fp_output_kernel on input/output plus the two statistics rows
// Row 0 of the statistics matrix is named "means" and row 1 "vars" below.
@@ -101,7 +97,79 @@ void LayerNormalization::apply_normalization(
10197
const DCTensor<Allocator>& input,
10298
const DCTensor<Allocator>& statistics,
10399
DCTensor<Allocator>& output)
104-
{}
100+
{
101+
const auto& input_dims = input.get_local_shape();
102+
const auto& statistics_dims = statistics.get_local_shape();
// NOTE(review): `input_0_dims` is not declared anywhere in the visible code;
// `input_dims` (declared two lines above) looks like the intended name.
103+
const auto local_num_samples = input_0_dims[3];
104+
const auto global_num_samples = statistics_dims[3];
// Product of every local dimension except the last one.
105+
const auto local_sample_size = std::accumulate(input_dims.begin(),
106+
input_dims.end() - 1,
107+
1,
108+
std::multiplies<int>());
109+
110+
using LocalMat = El::Matrix<DataType, El::Device::GPU>;
111+
const LocalMat local_input(local_sample_size,
112+
local_num_samples,
113+
input.get_buffer(),
114+
local_sample_size);
115+
// NOTE(review): the other matrices in this function pass `<tensor>.get_buffer()`
// as the third constructor argument; here the local *shape* is passed instead.
// Presumably this should be the statistics buffer -- confirm.
116+
const LocalMat local_statistics(2,
117+
global_num_samples,
118+
statistics.get_local_shape(),
119+
2);
120+
121+
LocalMat local_output(local_sample_size,
122+
local_num_samples,
123+
output.get_buffer(),
124+
local_sample_size);
125+
// NOTE(review): `local_statistics` is const and both views below are const, yet
// the first launch calls the non-const `.Buffer()` on them. Confirm whether this
// function is meant to write into `statistics`, which is a const parameter.
126+
const auto local_means = El::View(local_statistics, El::IR(0), El::ALL);
127+
const auto local_vars = El::View(local_statistics, El::IR(1), El::ALL);
128+
{
129+
using namespace hydrogen;
// First launch: 1-D grid with one thread per local sample (ceil-divided by block_size).
130+
auto sync_info = gpu::get_sync_info(local_statistics);
131+
constexpr size_t block_size = 256;
132+
dim3 block_dims, grid_dims;
133+
block_dims.x = block_size;
134+
grid_dims.x = (local_num_samples + block_size - 1) / block_size;
// NOTE(review): `TensorDataType` is not declared in the visible code; the
// matrices above are built with `DataType`.
135+
hydrogen::gpu::LaunchKernel(layer_norm_fp_statistics_kernel<TensorDataType>,
136+
grid_dims,
137+
block_dims,
138+
0,
139+
sync_info,
// NOTE(review): `sample_size` is not declared in the visible code.
140+
sample_size,
141+
local_num_samples,
142+
local_means.Buffer(),
143+
local_means.LDim(),
144+
local_vars.Buffer(),
145+
local_vars.LDim());
146+
147+
// Second launch: grid x covers the elements of one sample, grid y indexes the
// local samples.
auto multisync = El::MakeMultiSync(gpu::get_sync_info(local_output),
148+
gpu::get_sync_info(local_statistics),
149+
gpu::get_sync_info(local_input));
// NOTE(review): `block_size`, `block_dims` and `grid_dims` are declared a second
// time in the same brace scope as the first launch; one of the two launches
// needs its own scope or distinct names.
150+
constexpr size_t block_size = 256;
151+
dim3 block_dims, grid_dims;
152+
block_dims.x = block_size;
153+
grid_dims.x = (local_sample_size + block_size - 1) / block_size;
154+
grid_dims.y = local_num_samples;
155+
hydrogen::gpu::LaunchKernel(layer_norm_fp_output_kernel<TensorDataType>,
156+
grid_dims,
157+
block_dims,
158+
0,
159+
multisync,
160+
local_num_samples,
161+
local_sample_size,
// NOTE(review): spelled `epsilon` here but `m_epsilon` in
// calculate_backward_stats and apply_grad -- confirm which name exists.
162+
epsilon,
163+
local_input.LockedBuffer(),
164+
local_input.LDim(),
165+
local_output.Buffer(),
166+
local_output.LDim(),
167+
local_means.LockedBuffer(),
168+
local_means.LDim(),
169+
local_vars.LockedBuffer(),
170+
local_vars.LDim());
171+
}
172+
}
105173

106174
template <typename Backend, typename DataType>
107175
template <typename Allocator>
@@ -110,15 +178,147 @@ void LayerNormalization::calculate_backward_stats(
110178
const DCTensor<Allocator>& output_grad,
111179
const DCTensor<Allocator>& statistics,
112180
DCTensor<Allocator>& statistics_grad)
113-
{}
181+
{
182+
const auto& input_dims = input.get_local_shape();
183+
const auto& statistics_dims = statistics.get_local_shape();
184+
const auto local_num_samples = input_0_dims[3];
185+
const auto global_num_samples = statistics_dims[3];
186+
const auto local_sample_size = std::accumulate(input_dims.begin(),
187+
input_dims.end() - 1,
188+
1,
189+
std::multiplies<int>());
190+
using LocalMat = El::Matrix<DataType, El::Device::GPU>;
191+
const LocalMat local_input(local_sample_size,
192+
local_num_samples,
193+
input.get_buffer(),
194+
local_sample_size);
195+
const LocalMat local_output_grad(local_sample_size,
196+
local_num_samples,
197+
output_grad.get_buffer(),
198+
local_sample_size);
199+
200+
const LocalMat local_statistics(2,
201+
global_num_samples,
202+
statistics.get_local_shape(),
203+
2);
204+
205+
LocalMat local_statistics_grad(2,
206+
global_num_samples,
207+
statistics_grad.get_buffer(),
208+
2);
209+
{
210+
using namespace hydrogen;
211+
auto multisync =
212+
El::MakeMultiSync(gpu::get_sync_info(local_statistics_grad),
213+
gpu::get_sync_info(local_output_grad),
214+
gpu::get_sync_info(local_statistics),
215+
gpu::get_sync_info(local_input));
216+
constexpr size_t block_size = 256;
217+
dim3 block_dims, grid_dims;
218+
block_dims.x = block_size;
219+
grid_dims.x = (local_sample_size + block_size - 1) / block_size;
220+
grid_dims.y = local_num_samples;
221+
hydrogen::gpu::LaunchKernel(
222+
layer_norm_bp_statistics_grad_kernel<block_size, TensorDataType>,
223+
grid_dims,
224+
block_dims,
225+
0,
226+
multisync,
227+
local_num_samples,
228+
local_sample_size,
229+
m_epsilon,
230+
local_input.LockedBuffer(),
231+
local_input.LDim(),
232+
local_output_grad.LockedBuffer(),
233+
local_output_grad.LDim(),
234+
local_means.LockedBuffer(),
235+
local_means.LDim(),
236+
local_vars.LockedBuffer(),
237+
local_vars.LDim(),
238+
local_means_grad.Buffer(),
239+
local_means_grad.LDim(),
240+
local_vars_grad.Buffer(),
241+
local_vars_grad.LDim());
242+
}
243+
}
244+
114245
template <typename Backend, typename DataType>
115246
template <typename Allocator>
116247
void LayerNormalization::apply_grad(const DCTensor<Allocator>& input,
117248
const DCTensor<Allocator>& output_grad,
118249
const DCTensor<Allocator>& statistics,
119250
const DCTensor<Allocator>& statistics_grad,
120251
DCTensor<Allocator>& input_grad)
121-
{}
252+
{
253+
const auto& input_dims = input.get_local_shape();
254+
const auto& statistics_dims = statistics.get_local_shape();
255+
const auto local_num_samples = input_0_dims[3];
256+
const auto global_num_samples = statistics_dims[3];
257+
const auto local_sample_size = std::accumulate(input_dims.begin(),
258+
input_dims.end() - 1,
259+
1,
260+
std::multiplies<int>());
261+
using LocalMat = El::Matrix<DataType, El::Device::GPU>;
262+
const LocalMat local_input(local_sample_size,
263+
local_num_samples,
264+
input.get_buffer(),
265+
local_sample_size);
266+
const LocalMat local_output_grad(local_sample_size,
267+
local_num_samples,
268+
output_grad.get_buffer(),
269+
local_sample_size);
270+
271+
const LocalMat local_statistics(2,
272+
global_num_samples,
273+
statistics.get_local_shape(),
274+
2);
275+
276+
const LocalMat local_statistics_grad(2,
277+
global_num_samples,
278+
statistics_grad.get_buffer(),
279+
2);
280+
281+
LocalMat local_input_grad(local_sample_size,
282+
local_num_samples,
283+
input_grad.get_buffer(),
284+
local_sample_size);
285+
{
286+
using namespace hydrogen;
287+
auto multisync =
288+
El::MakeMultiSync(gpu::get_sync_info(local_statistics_grad),
289+
gpu::get_sync_info(local_output_grad),
290+
gpu::get_sync_info(local_statistics),
291+
gpu::get_sync_info(local_input));
292+
constexpr size_t block_size = 256;
293+
dim3 block_dims, grid_dims;
294+
block_dims.x = block_size;
295+
grid_dims.x = (local_sample_size + block_size - 1) / block_size;
296+
grid_dims.y = local_num_samples;
297+
hydrogen::gpu::LaunchKernel(layer_norm_bp_input_grad_kernel<TensorDataType>,
298+
grid_dims,
299+
block_dims,
300+
0,
301+
multisync,
302+
sample_size,
303+
local_num_samples,
304+
local_sample_size,
305+
m_epsilon,
306+
local_input.LockedBuffer(),
307+
local_input.LDim(),
308+
local_output_grad.LockedBuffer(),
309+
local_output_grad.LDim(),
310+
local_input_grad.Buffer(),
311+
local_input_grad.LDim(),
312+
local_means.LockedBuffer(),
313+
local_means.LDim(),
314+
local_vars.LockedBuffer(),
315+
local_vars.LDim(),
316+
local_means_grad.LockedBuffer(),
317+
local_means_grad.LDim(),
318+
local_vars_grad.LockedBuffer(),
319+
local_vars_grad.LDim());
320+
}
321+
}
122322

123323
#define ETI(T, Backend) \
124324
template class LayerNormalization<Backend, T>; \

0 commit comments

Comments
 (0)