@@ -41,14 +41,10 @@ void LayerNormalization ::calculate_forward_stats(
41
41
util::MPIRootPrintStreamInfo () << " WARNING: EMPTY INPUT FOUND \n " ;
42
42
return ; // no op for empty inputs
43
43
}
44
-
45
44
const auto & input_dims = input.get_local_shape ();
46
45
const auto & statistics_dims = statistics.get_local_shape ();
47
-
48
46
const auto local_num_samples = input_0_dims[3 ];
49
-
50
47
const auto global_num_samples = statistics_dims[3 ];
51
-
52
48
const auto local_sample_size = std::accumulate (input_dims.begin (),
53
49
input_dims.end () - 1 ,
54
50
1 ,
@@ -61,7 +57,7 @@ void LayerNormalization ::calculate_forward_stats(
61
57
local_sample_size);
62
58
63
59
LocalMat local_statistics (2 ,
64
- local_num_samples ,
60
+ global_num_samples ,
65
61
statistics.get_local_shape (),
66
62
2 );
67
63
@@ -101,7 +97,79 @@ void LayerNormalization::apply_normalization(
101
97
const DCTensor<Allocator>& input,
102
98
const DCTensor<Allocator>& statistics,
103
99
DCTensor<Allocator>& output)
104
- {}
100
+ {
101
+ const auto & input_dims = input.get_local_shape ();
102
+ const auto & statistics_dims = statistics.get_local_shape ();
103
+ const auto local_num_samples = input_0_dims[3 ];
104
+ const auto global_num_samples = statistics_dims[3 ];
105
+ const auto local_sample_size = std::accumulate (input_dims.begin (),
106
+ input_dims.end () - 1 ,
107
+ 1 ,
108
+ std::multiplies<int >());
109
+
110
+ using LocalMat = El::Matrix<DataType, El::Device::GPU>;
111
+ const LocalMat local_input (local_sample_size,
112
+ local_num_samples,
113
+ input.get_buffer (),
114
+ local_sample_size);
115
+
116
+ const LocalMat local_statistics (2 ,
117
+ global_num_samples,
118
+ statistics.get_local_shape (),
119
+ 2 );
120
+
121
+ LocalMat local_output (local_sample_size,
122
+ local_num_samples,
123
+ output.get_buffer (),
124
+ local_sample_size);
125
+
126
+ const auto local_means = El::View (local_statistics, El::IR (0 ), El::ALL);
127
+ const auto local_vars = El::View (local_statistics, El::IR (1 ), El::ALL);
128
+ {
129
+ using namespace hydrogen ;
130
+ auto sync_info = gpu::get_sync_info (local_statistics);
131
+ constexpr size_t block_size = 256 ;
132
+ dim3 block_dims, grid_dims;
133
+ block_dims.x = block_size;
134
+ grid_dims.x = (local_num_samples + block_size - 1 ) / block_size;
135
+ hydrogen::gpu::LaunchKernel (layer_norm_fp_statistics_kernel<TensorDataType>,
136
+ grid_dims,
137
+ block_dims,
138
+ 0 ,
139
+ sync_info,
140
+ sample_size,
141
+ local_num_samples,
142
+ local_means.Buffer (),
143
+ local_means.LDim (),
144
+ local_vars.Buffer (),
145
+ local_vars.LDim ());
146
+
147
+ auto multisync = El::MakeMultiSync (gpu::get_sync_info (local_output),
148
+ gpu::get_sync_info (local_statistics),
149
+ gpu::get_sync_info (local_input));
150
+ constexpr size_t block_size = 256 ;
151
+ dim3 block_dims, grid_dims;
152
+ block_dims.x = block_size;
153
+ grid_dims.x = (local_sample_size + block_size - 1 ) / block_size;
154
+ grid_dims.y = local_num_samples;
155
+ hydrogen::gpu::LaunchKernel (layer_norm_fp_output_kernel<TensorDataType>,
156
+ grid_dims,
157
+ block_dims,
158
+ 0 ,
159
+ multisync,
160
+ local_num_samples,
161
+ local_sample_size,
162
+ epsilon,
163
+ local_input.LockedBuffer (),
164
+ local_input.LDim (),
165
+ local_output.Buffer (),
166
+ local_output.LDim (),
167
+ local_means.LockedBuffer (),
168
+ local_means.LDim (),
169
+ local_vars.LockedBuffer (),
170
+ local_vars.LDim ());
171
+ }
172
+ }
105
173
106
174
template <typename Backend, typename DataType>
107
175
template <typename Allocator>
@@ -110,15 +178,147 @@ void LayerNormalization::calculate_backward_stats(
110
178
const DCTensor<Allocator>& output_grad,
111
179
const DCTensor<Allocator>& statistics,
112
180
DCTensor<Allocator>& statistics_grad)
113
- {}
181
+ {
182
+ const auto & input_dims = input.get_local_shape ();
183
+ const auto & statistics_dims = statistics.get_local_shape ();
184
+ const auto local_num_samples = input_0_dims[3 ];
185
+ const auto global_num_samples = statistics_dims[3 ];
186
+ const auto local_sample_size = std::accumulate (input_dims.begin (),
187
+ input_dims.end () - 1 ,
188
+ 1 ,
189
+ std::multiplies<int >());
190
+ using LocalMat = El::Matrix<DataType, El::Device::GPU>;
191
+ const LocalMat local_input (local_sample_size,
192
+ local_num_samples,
193
+ input.get_buffer (),
194
+ local_sample_size);
195
+ const LocalMat local_output_grad (local_sample_size,
196
+ local_num_samples,
197
+ output_grad.get_buffer (),
198
+ local_sample_size);
199
+
200
+ const LocalMat local_statistics (2 ,
201
+ global_num_samples,
202
+ statistics.get_local_shape (),
203
+ 2 );
204
+
205
+ LocalMat local_statistics_grad (2 ,
206
+ global_num_samples,
207
+ statistics_grad.get_buffer (),
208
+ 2 );
209
+ {
210
+ using namespace hydrogen ;
211
+ auto multisync =
212
+ El::MakeMultiSync (gpu::get_sync_info (local_statistics_grad),
213
+ gpu::get_sync_info (local_output_grad),
214
+ gpu::get_sync_info (local_statistics),
215
+ gpu::get_sync_info (local_input));
216
+ constexpr size_t block_size = 256 ;
217
+ dim3 block_dims, grid_dims;
218
+ block_dims.x = block_size;
219
+ grid_dims.x = (local_sample_size + block_size - 1 ) / block_size;
220
+ grid_dims.y = local_num_samples;
221
+ hydrogen::gpu::LaunchKernel (
222
+ layer_norm_bp_statistics_grad_kernel<block_size, TensorDataType>,
223
+ grid_dims,
224
+ block_dims,
225
+ 0 ,
226
+ multisync,
227
+ local_num_samples,
228
+ local_sample_size,
229
+ m_epsilon,
230
+ local_input.LockedBuffer (),
231
+ local_input.LDim (),
232
+ local_output_grad.LockedBuffer (),
233
+ local_output_grad.LDim (),
234
+ local_means.LockedBuffer (),
235
+ local_means.LDim (),
236
+ local_vars.LockedBuffer (),
237
+ local_vars.LDim (),
238
+ local_means_grad.Buffer (),
239
+ local_means_grad.LDim (),
240
+ local_vars_grad.Buffer (),
241
+ local_vars_grad.LDim ());
242
+ }
243
+ }
244
+
114
245
template <typename Backend, typename DataType>
115
246
template <typename Allocator>
116
247
void LayerNormalization::apply_grad (const DCTensor<Allocator>& input,
117
248
const DCTensor<Allocator>& output_grad,
118
249
const DCTensor<Allocator>& statistics,
119
250
const DCTensor<Allocator>& statistics_grad,
120
251
DCTensor<Allocator>& input_grad)
121
- {}
252
+ {
253
+ const auto & input_dims = input.get_local_shape ();
254
+ const auto & statistics_dims = statistics.get_local_shape ();
255
+ const auto local_num_samples = input_0_dims[3 ];
256
+ const auto global_num_samples = statistics_dims[3 ];
257
+ const auto local_sample_size = std::accumulate (input_dims.begin (),
258
+ input_dims.end () - 1 ,
259
+ 1 ,
260
+ std::multiplies<int >());
261
+ using LocalMat = El::Matrix<DataType, El::Device::GPU>;
262
+ const LocalMat local_input (local_sample_size,
263
+ local_num_samples,
264
+ input.get_buffer (),
265
+ local_sample_size);
266
+ const LocalMat local_output_grad (local_sample_size,
267
+ local_num_samples,
268
+ output_grad.get_buffer (),
269
+ local_sample_size);
270
+
271
+ const LocalMat local_statistics (2 ,
272
+ global_num_samples,
273
+ statistics.get_local_shape (),
274
+ 2 );
275
+
276
+ const LocalMat local_statistics_grad (2 ,
277
+ global_num_samples,
278
+ statistics_grad.get_buffer (),
279
+ 2 );
280
+
281
+ LocalMat local_input_grad (local_sample_size,
282
+ local_num_samples,
283
+ input_grad.get_buffer (),
284
+ local_sample_size);
285
+ {
286
+ using namespace hydrogen ;
287
+ auto multisync =
288
+ El::MakeMultiSync (gpu::get_sync_info (local_statistics_grad),
289
+ gpu::get_sync_info (local_output_grad),
290
+ gpu::get_sync_info (local_statistics),
291
+ gpu::get_sync_info (local_input));
292
+ constexpr size_t block_size = 256 ;
293
+ dim3 block_dims, grid_dims;
294
+ block_dims.x = block_size;
295
+ grid_dims.x = (local_sample_size + block_size - 1 ) / block_size;
296
+ grid_dims.y = local_num_samples;
297
+ hydrogen::gpu::LaunchKernel (layer_norm_bp_input_grad_kernel<TensorDataType>,
298
+ grid_dims,
299
+ block_dims,
300
+ 0 ,
301
+ multisync,
302
+ sample_size,
303
+ local_num_samples,
304
+ local_sample_size,
305
+ m_epsilon,
306
+ local_input.LockedBuffer (),
307
+ local_input.LDim (),
308
+ local_output_grad.LockedBuffer (),
309
+ local_output_grad.LDim (),
310
+ local_input_grad.Buffer (),
311
+ local_input_grad.LDim (),
312
+ local_means.LockedBuffer (),
313
+ local_means.LDim (),
314
+ local_vars.LockedBuffer (),
315
+ local_vars.LDim (),
316
+ local_means_grad.LockedBuffer (),
317
+ local_means_grad.LDim (),
318
+ local_vars_grad.LockedBuffer (),
319
+ local_vars_grad.LDim ());
320
+ }
321
+ }
122
322
123
323
#define ETI (T, Backend ) \
124
324
template class LayerNormalization <Backend, T>; \
0 commit comments