From 13da5803f729eeaf425429cad3e323f382f3728b Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Wed, 14 Jun 2023 18:01:31 +0300 Subject: [PATCH 01/27] Added RNN_RELU and RNN_TANH forward training refactored functions --- src/include/miopen/rnn.hpp | 14 + src/ocl/rnnocl.cpp | 555 +++++++++++++++++++++++++++++++++++++ 2 files changed, 569 insertions(+) diff --git a/src/include/miopen/rnn.hpp b/src/include/miopen/rnn.hpp index e3b66b3ce7..2e542ac939 100644 --- a/src/include/miopen/rnn.hpp +++ b/src/include/miopen/rnn.hpp @@ -207,6 +207,20 @@ struct RNNDescriptor : miopenRNNDescriptor Data_t reserveSpace, size_t reserveSpaceSize) const; + void RNNForwardTrainingTanhRelu(Handle& handle, + std::vector& seq_array, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& hxDesc, + ConstData_t hx, + const TensorDescriptor& wDesc, + ConstData_t w, + const TensorDescriptor& yDesc, + Data_t y, + Data_t hy, + Data_t reserveSpace, + size_t reserveSpaceSize) const; + void RNNForwardTraining_MS(Handle& handle, std::vector& seq_array, const TensorDescriptor& xDesc, diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 6b03f4c115..86fb6c1508 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -40,6 +40,534 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_RNNFWD_exp) namespace miopen { +void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, + std::vector& seq_array, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& hxDesc, + ConstData_t hx, + const TensorDescriptor& wDesc, + ConstData_t w, + const TensorDescriptor& yDesc, + Data_t y, + Data_t hy, + Data_t reserveSpace, + size_t reserveSpaceSize) const +{ +#if MIOPEN_USE_GEMM && MIOPEN_BACKEND_HIP + int seq_len = seq_array.size(); + if(seq_len == 0) + return; + + struct ReluWeightOffsets + { + private: + auto hidden_xinput_size(int hidden_sz, int bidirect_mode) const + { + if(bidirect_mode == 0) + return hidden_sz; + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + } + + auto matrix_lin_layer_size(int input_vector_sz, int hidden_vec_sz) const + { + return (input_vector_sz + hidden_vec_sz) * hidden_vec_sz; + } + + size_t bias_start_offset(int input_vector_sz, + int hidden_vec_sz, + int layers_cnt, + int bidirect_mode) const + { + if(bidirect_mode == 0) + return matrix_lin_layer_size(input_vector_sz, hidden_vec_sz) + + static_cast(hidden_vec_sz + hidden_xinput_size(hidden_vec_sz, 0)) * + hidden_vec_sz * static_cast(layers_cnt - 1); + + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + } + + public: + ReluWeightOffsets(int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode) + : in_vec_sz(input_vector_sz), + h_vec_sz(hidden_vec_sz), + x_in_vec_sz(hidden_xinput_size(hidden_vec_sz, 0)), + bias_cnt(bias_mode), + matrix_normal_start_off(matrix_lin_layer_size(input_vector_sz, hidden_vec_sz)), + bias_start_off(bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, 0)) + { + } + + private: + const int in_vec_sz, h_vec_sz; + const int x_in_vec_sz; // for bidirect TODO + + const int + bias_cnt; // 0 - no bias; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec + + const size_t matrix_normal_start_off; + const size_t bias_start_off; + + auto get_input_matrix_size(int layer_id) const + { + return (layer_id > 0 ? 
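+            // Layer 0 consumes the network input (in_vec_sz columns); every
+            // deeper layer consumes the previous layer's hidden output, which
+            // for this unidirectional solver is x_in_vec_sz == h_vec_sz.
+            // Hypothetical sizes: in_vec_sz = 8, h_vec_sz = 4 would give
+            // 8 * 4 = 32 weights for layer 0 and 4 * 4 = 16 per deeper layer.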
x_in_vec_sz : in_vec_sz) * h_vec_sz; + } + + auto get_hidden_matrix_size() const { return h_vec_sz * h_vec_sz; } + auto get_matrix_layer_size(int layer_id) const + { + return get_input_matrix_size(layer_id) + get_hidden_matrix_size(); + } + + int bias_vector_size() const { return h_vec_sz; } + + size_t bias_relative_off(int layer_id, int bias_id) const + { + return static_cast(layer_id * bias_cnt + bias_id) * h_vec_sz; + } + + public: + size_t input_offset(int layer_id) const + { + if(layer_id > 0) + return matrix_normal_start_off + + static_cast(layer_id - 1) * get_matrix_layer_size(layer_id); + else + return 0; + }; + + size_t hidden_offset(int layer_id) const + { + if(layer_id > 0) + return input_offset(layer_id) + static_cast(h_vec_sz * x_in_vec_sz); + else + return input_offset(layer_id) + static_cast(h_vec_sz * in_vec_sz); + }; + + int bias_stride() const { return bias_vector_size(); } + + size_t bias_off(int layer_id, int bias_id) const + { + return bias_start_off + bias_relative_off(layer_id, bias_id); + } + }; + + struct ReluReserveBufferOffsets + { + struct RBuffHelper + { + int element, save_point, batch; + size_t layer, table; + }; + + private: + auto Reserve_Buffer_strides(int save_point_sz, + int batches_per_layer, + int layers, + int bidirect_mode = 0) const + { + const auto element_st = 1; + const auto save_point_st = element_st * save_point_sz; + const auto batch_st = save_point_st; + const auto layer_st = static_cast(batch_st) * batches_per_layer; + const auto table_st = layers * layer_st; + + if(bidirect_mode == 0) + return RBuffHelper{element_st, save_point_st, batch_st, layer_st, table_st}; + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + } + + public: + ReluReserveBufferOffsets(int hidden_vec_sz, + int save_point_sz, + int layers_cnt, + int batches_per_layer) + : h_vec_size(hidden_vec_sz), + save_point_size(save_point_sz), + layers(layers_cnt), + batches_per_layer(batches_per_layer), + strides(Reserve_Buffer_strides(save_point_sz, batches_per_layer, layers_cnt, 0)) + { + } + + const int h_vec_size; + const int save_point_size; + + const int layers; + const int batches_per_layer; + const RBuffHelper strides; + + size_t layer_offset(int layer_id) const + { + return static_cast(layer_id) * strides.layer; + } + + auto layer_stride() const { return strides.layer; } + + auto gemm_write_size() const { return h_vec_size; } + + auto gemm_write_stride() const { return strides.batch; } + + size_t gemm_write_relative_offset(int batch_id) const + { + return static_cast(gemm_write_stride()) * batch_id; + } + + size_t gemm_write_offset(int layer_id, int batch_id) const + { + return layer_offset(layer_id) + static_cast(gemm_write_stride()) * batch_id; + } + + size_t ht_offset(int layer_id, int batch_id) const + { + return strides.table + layer_offset(layer_id) + gemm_write_relative_offset(batch_id); + } + + size_t ht_offset(int layer_id) const { return strides.table + layer_offset(layer_id); } + }; + + std::vector batches; + int in_vec_size = xDesc.GetLengths()[1]; + int out_vec_size = yDesc.GetLengths()[1]; + + int max_batch = seq_array[0]; + int hidden_size; + std::tie(std::ignore, max_batch, hidden_size) = miopen::tien<3>(hxDesc.GetLengths()); + + int total_batch_size = 0; + // accumulated batches per time + std::vector bacc_per_time(seq_len + 1); + + for(int i = 0; i < seq_len; i++) + { + bacc_per_time[i] = total_batch_size; + total_batch_size += seq_array[i]; + batches.push_back(seq_array[i]); + } + bacc_per_time[seq_len] = total_batch_size; + + 
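+    // bacc_per_time is an exclusive prefix sum of the per-time-step batch
+    // sizes: bacc_per_time[t] is the first row of time step t inside the
+    // packed [total_batch_size x vec] buffers and bacc_per_time[seq_len] is
+    // total_batch_size. Hypothetical example: seq_array = {3, 2, 2}
+    // (sequences sorted by decreasing length) yields bacc_per_time =
+    // {0, 3, 5, 7} and total_batch_size = 7.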
auto get_HxBuff_offset = [&](int layer_id) { + return layer_id * (static_cast(hidden_size) * max_batch); + }; + + ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2); + ReluReserveBufferOffsets RBuff(hidden_size, hidden_size, nLayers, total_batch_size); + + ActivationDescriptor activDesc; + + if(rnnMode == miopenRNNRELU) + { + activDesc = {miopenActivationRELU, 1, 0, 1}; + } + else if(rnnMode == miopenRNNTANH) + { + activDesc = {miopenActivationTANH, 1, 1, 1}; + } + + auto call_input_gemm = + [&RBuff, &WeiBuf, &in_vec_size, &handle, &xDesc, reserveSpace, x, w, hidden_size]( + int layer, float beta_t = 1) { + const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size(), + k = layer > 0 ? hidden_size : in_vec_size; + + const int lda = layer > 0 ? RBuff.gemm_write_stride() : in_vec_size, ldb = k, + ldc = RBuff.gemm_write_stride(); + + const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + true, + m, + n, + k, + lda, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + beta_t, // beta + xDesc.GetType(), + false}; + + const auto input_weight_offset = WeiBuf.input_offset(layer); + const auto output_offset = RBuff.layer_offset(layer); + + const auto input_offset = layer > 0 ? RBuff.ht_offset(layer - 1) : 0; + + const auto input_ptr = layer > 0 ? reserveSpace : x; + + const miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc, + input_ptr, + input_offset, + w, + input_weight_offset, + reserveSpace, + output_offset, + GemmBackend_t::miopengemm); + if(gemm_status != miopenStatusSuccess) + MIOPEN_THROW("GEMM execution failure"); + }; + + auto call_bias_add = [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w](int layer, + float beta_t = 0) { + float alpha0 = 1; + float alpha1 = 1; + const auto bias_stride = WeiBuf.bias_stride(); + + const auto bias_desc = + miopen::TensorDescriptor(wDesc.GetType(), + std::vector{1, 1, bias_stride}, + std::vector{bias_stride, bias_stride, 1}); + + const auto hidden_interim_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{1, RBuff.batches_per_layer, WeiBuf.bias_stride()}, + std::vector{ + RBuff.batches_per_layer * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); + + const auto RB_layer_out_off = RBuff.layer_offset(layer); + const auto w_bias_layer_start_off = WeiBuf.bias_off(layer, 0); + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hidden_interim_desc, + reserveSpace, // A + &alpha1, + bias_desc, + w, // B + &beta_t, + hidden_interim_desc, + reserveSpace, // C + RB_layer_out_off, // A offset + w_bias_layer_start_off, // B offset + RB_layer_out_off); // C offset + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hidden_interim_desc, + reserveSpace, + &alpha1, + bias_desc, + w, + &beta_t, + hidden_interim_desc, + reserveSpace, + RB_layer_out_off, + w_bias_layer_start_off + bias_stride, + RB_layer_out_off); + }; + + auto call_hidden_gemm = [&RBuff, + &WeiBuf, + &get_HxBuff_offset, + &bacc_per_time, + &batches, + &handle, + &xDesc, + reserveSpace, + hx, + w, + hidden_size](int layer, int cur_time) { + if(cur_time == 0 && hx == nullptr) + return; + + const int m = batches.at(cur_time), n = RBuff.gemm_write_size(), k = hidden_size; + + const int lda = (cur_time != 0) ? 
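+        // At cur_time == 0 the recurrent GEMM reads h(t-1) from the user
+        // supplied hx tensor, which is densely packed with row stride
+        // hidden_size; at later steps it reads the previous step's activated
+        // output from the reserve space, whose rows are gemm_write_stride()
+        // apart, hence the two different leading dimensions. The product is
+        // accumulated (beta = 1) onto the pre-activations written by the
+        // input GEMM for this layer.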
RBuff.gemm_write_stride() : hidden_size; + + const int ldb = hidden_size, ldc = RBuff.gemm_write_stride(); + + const miopen::GemmDescriptor gemm_desc_hx = GemmDescriptor{false, + false, + true, + m, + n, + k, + lda, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + xDesc.GetType(), + false}; + + const auto ht_offset = (cur_time == 0) + ? get_HxBuff_offset(layer) + : RBuff.ht_offset(layer, bacc_per_time[cur_time - 1]); + + const auto ht_ptr = cur_time > 0 ? reserveSpace : hx; + + const auto RB_batch_save_points_off = + RBuff.gemm_write_offset(layer, bacc_per_time[cur_time]); + + const miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc_hx, + ht_ptr, + ht_offset, + w, + WeiBuf.hidden_offset(layer), + reserveSpace, + RB_batch_save_points_off, + GemmBackend_t::miopengemm); + + if(gemm_status != miopenStatusSuccess) + MIOPEN_THROW("GEMM execution failure"); + }; + + auto call_hidden_state_update = + [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &activDesc, hidden_size]( + int layer_id, int time_id) { + float alpha = 1, beta = 0; + + const std::vector tensor_size{ + 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; + + const std::vector tensor_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; + + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + + const auto RB_layer_save_points_off = + RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]); + + const auto ht_offset = RBuff.ht_offset(layer_id, bacc_per_time[time_id]); + + activDesc.Forward(handle, + &alpha, + // input tensor descriptor + src_desc, + // input pointer + reserveSpace, + &beta, + // output tensor descriptor + dst_desc, + // output pointer + reserveSpace, + // input tensor offset + RB_layer_save_points_off, + // output tensor offset + ht_offset); + }; + + auto call_update_output = [&RBuff, + &get_HxBuff_offset, + &bacc_per_time, + &batches, + &handle, + &wDesc, + reserveSpace, + hy, + max_batch, + hidden_size, + seq_len](int layer_id) { + if(hy == nullptr) + return; + + auto hcy_layer_offset = get_HxBuff_offset(layer_id); + + const std::vector hcy_src_stride{ + RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; + const std::vector hcy_dst_stride{ + static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; + + for(int time_i = seq_len - 1; time_i >= 0; time_i--) + { + auto copy_batch = (time_i == seq_len - 1) ? 
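+            // Sequences are packed in order of decreasing length, so each
+            // sequence emits its final hidden state at the last time step in
+            // which it is still active. Walking backwards, the number of
+            // sequences finishing exactly at time_i is
+            // batches[time_i] - batches[time_i + 1] (every remaining sequence
+            // finishes at the last step), and only those rows are copied to hy.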
batches.at(time_i) + : batches.at(time_i) - batches.at(time_i + 1); + if(copy_batch > 0) + { + auto batch_id_relative = batches.at(time_i) - copy_batch; + auto batch_id_abs = bacc_per_time[time_i] + batch_id_relative; + + auto hcy_batch_offset = batch_id_relative * hidden_size; + + auto src_batch_offset = + RBuff.ht_offset(layer_id) + RBuff.gemm_write_relative_offset(batch_id_abs); + + const std::vector hcy_copy_size{ + 1, static_cast(copy_batch), static_cast(hidden_size)}; + + auto src_desc = + miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); + auto dst_desc = + miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_dst_stride); + + CopyTensor(handle, + src_desc, + reserveSpace, + dst_desc, + hy, + src_batch_offset, + hcy_layer_offset + hcy_batch_offset); + } + } + }; + + if(biasMode != 0u) + for(int layer_id = 0; layer_id < nLayers; layer_id++) + call_bias_add(layer_id); + + for(int layer_id = 0; layer_id < nLayers; layer_id++) + { + call_input_gemm(layer_id); + for(int time = 0; time < seq_len; time++) + { + call_hidden_gemm(layer_id, time); + call_hidden_state_update(layer_id, time); + } + call_update_output(layer_id); + } + + // output tensor copy + { + const std::vector y_copy_size{ + 1, static_cast(total_batch_size), static_cast(out_vec_size)}; + + const std::vector y_src_stride{ + RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; + + const std::vector y_dst_stride{static_cast(out_vec_size * total_batch_size), + static_cast(out_vec_size), + 1}; + + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_src_stride); + auto y_dst_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_dst_stride); + + CopyTensor( + handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.ht_offset(nLayers - 1, 0), 0); + } +#else + (void)handle; + (void)seq_array; + (void)xDesc; + (void)x; + (void)hxDesc; + (void)hx; + (void)cx; + (void)wDesc; + (void)w; + (void)yDesc; + (void)y; + (void)hy; + (void)cy; + (void)reserveSpace; + (void)reserveSpaceSize; + + MIOPEN_THROW("GEMM is not supported"); +#endif +} + void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, std::vector& seq_array, const TensorDescriptor& xDesc, @@ -2215,6 +2743,33 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, } return; } + + if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && nLayers > 1 && + dirMode == miopenRNNunidirection && inputMode != miopenRNNskip && + !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) + { + RNNForwardTrainingTanhRelu(handle, + in_n, + xDesc[0], + x, + hxDesc, + hx, + wDesc, + w, + yDesc[0], + y, + hy, + reserveSpace, + reserveSpaceSize); + if(is_profiling) + { + float eventTime_mS = RNNProfilingEnd(handle, start, stop); + handle.EnableProfiling(true); + handle.ResetKernelTime(); + handle.AccumKernelTime(eventTime_mS); + } + return; + } #endif // MIOPEN_USE_GEMM&& MIOPEN_BACKEND_HIP int in_stride = xDesc[0].GetLengths()[1]; From c3d8ab47ff278fa448bab204cb4c644633c038bb Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Tue, 25 Jul 2023 23:21:59 +0300 Subject: [PATCH 02/27] Added RNNForwardTrainingGRU method --- src/include/miopen/rnn.hpp | 14 + src/include/miopen/rnn_util.hpp | 445 ++++++++++++ src/ocl/rnnocl.cpp | 1127 ++++++++++++++++++++----------- 3 files changed, 1196 insertions(+), 390 deletions(-) diff --git a/src/include/miopen/rnn.hpp b/src/include/miopen/rnn.hpp index 2e542ac939..db16a09453 100644 --- a/src/include/miopen/rnn.hpp +++ b/src/include/miopen/rnn.hpp @@ -207,6 +207,20 @@ struct 
RNNDescriptor : miopenRNNDescriptor Data_t reserveSpace, size_t reserveSpaceSize) const; + void RNNForwardTrainingGRU(Handle& handle, + std::vector& seq_array, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& hxDesc, + ConstData_t hx, + const TensorDescriptor& wDesc, + ConstData_t w, + const TensorDescriptor& yDesc, + Data_t y, + Data_t hy, + Data_t reserveSpace, + size_t reserveSpaceSize) const; + void RNNForwardTrainingTanhRelu(Handle& handle, std::vector& seq_array, const TensorDescriptor& xDesc, diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index 74d82ad68c..0ed4de2b23 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -118,6 +118,451 @@ void LSTMBackwardHiddenStateUpdate(const Handle& handle, std::size_t dcell_offset_pre, std::size_t dhidden_offset, std::size_t f_offset_pre); + +struct GRUOffsets +{ +public: + GRUOffsets(int num_layers, int hidden_size, int total_batch_size) + : num_layers(num_layers), hidden_size(hidden_size), batches_per_layer(total_batch_size) + { + } + + int r_offset() const { return save_point::R * hidden_size; } + + int z_offset() const { return save_point::Z * hidden_size; } + + int c_offset() const { return save_point::С * hidden_size; } + + int hidden_offset() const { return save_point::Ht * hidden_size; } + + size_t batch_offset(int layer_id, int batch_num) const + { + return layer_offset(layer_id) + batch_num * gemm_write_stride(); + } + + int activated_offset() const { return layer_stride() * num_layers; } + + int gemm_write_size() const { return hidden_size; } + + int gemm_write_stride() const { return save_point::Count * hidden_size; } + + int layer_offset(int layer) const { return layer * layer_stride(); } + + int batches_per_layer; + + size_t layer_stride() const { return gemm_write_stride() * batches_per_layer; } + + size_t network_stride() { return layer_stride() * num_layers; } + +private: + int num_layers; + int hidden_size; + + enum save_point + { + Z = 0, + R = 1, + С = 2, + Ht = 3, + Count = 4 + }; +}; + +struct GruWeightOffsets +{ + GruWeightOffsets(int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode) + : weight_stride(matrixes::Count * hidden_vec_sz), + in_vec_sz(input_vector_sz), + h_vec_sz(hidden_vec_sz), + num_layers(layers_cnt) + { + } + + int weight_stride; + const int in_vec_sz, h_vec_sz; + const int num_layers; + + enum matrixes + { + Z = 0, + R = 1, + C = 2, + Count = 3 + }; + +public: + int input_offset(int layer) + { + return layer == 0 ? 0 + : (in_vec_sz + h_vec_sz) * weight_stride + + 2 * h_vec_sz * weight_stride * (layer - 1); + } + + int hidden_offset(int layer) + { + return layer == 0 ? 
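+        // Per-layer weight layout assumed by these offsets: first the input
+        // matrices for the Z, R and C gates (in_vec_sz rows for layer 0,
+        // h_vec_sz rows for deeper layers, each row weight_stride =
+        // 3 * h_vec_sz wide), then the recurrent matrices (h_vec_sz rows).
+        // The recurrent block therefore starts in_vec_sz * weight_stride
+        // (layer 0) or h_vec_sz * weight_stride (layer > 0) past
+        // input_offset(layer).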
input_offset(layer) + in_vec_sz * weight_stride + : input_offset(layer) + h_vec_sz * weight_stride; + } + + int bias_stride() { return matrixes::Count * h_vec_sz; } + int bias_off() + { + return (in_vec_sz + h_vec_sz + 2 * h_vec_sz * (num_layers - 1)) * weight_stride; + } + int bias_off(int layer_id) { return bias_off() + layer_id * weight_stride; } +}; + +struct ReluWeightOffsets +{ +private: + auto hidden_xinput_size(int hidden_sz, int bidirect_mode) const + { + if(bidirect_mode == 0) + return hidden_sz; + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + } + + auto matrix_lin_layer_size(int input_vector_sz, int hidden_vec_sz) const + { + return (input_vector_sz + hidden_vec_sz) * hidden_vec_sz; + } + + size_t bias_start_offset(int input_vector_sz, + int hidden_vec_sz, + int layers_cnt, + int bidirect_mode) const + { + if(bidirect_mode == 0) + return matrix_lin_layer_size(input_vector_sz, hidden_vec_sz) + + static_cast(hidden_vec_sz + hidden_xinput_size(hidden_vec_sz, 0)) * + hidden_vec_sz * static_cast(layers_cnt - 1); + + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + } + +public: + ReluWeightOffsets(int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode) + : in_vec_sz(input_vector_sz), + h_vec_sz(hidden_vec_sz), + x_in_vec_sz(hidden_xinput_size(hidden_vec_sz, 0)), + bias_cnt(bias_mode), + matrix_normal_start_off(matrix_lin_layer_size(input_vector_sz, hidden_vec_sz)), + bias_start_off(bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, 0)) + { + } + +private: + const int in_vec_sz, h_vec_sz; + const int x_in_vec_sz; // for bidirect TODO + + const int bias_cnt; // 0 - no bias; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec + + const size_t matrix_normal_start_off; + const size_t bias_start_off; + + auto get_input_matrix_size(int layer_id) const + { + return (layer_id > 0 ? 
x_in_vec_sz : in_vec_sz) * h_vec_sz; + } + + auto get_hidden_matrix_size() const { return h_vec_sz * h_vec_sz; } + auto get_matrix_layer_size(int layer_id) const + { + return get_input_matrix_size(layer_id) + get_hidden_matrix_size(); + } + + int bias_vector_size() const { return h_vec_sz; } + + size_t bias_relative_off(int layer_id, int bias_id) const + { + return static_cast(layer_id * bias_cnt + bias_id) * h_vec_sz; + } + +public: + size_t input_offset(int layer_id) const + { + if(layer_id > 0) + return matrix_normal_start_off + + static_cast(layer_id - 1) * get_matrix_layer_size(layer_id); + else + return 0; + }; + + size_t hidden_offset(int layer_id) const + { + if(layer_id > 0) + return input_offset(layer_id) + static_cast(h_vec_sz * x_in_vec_sz); + else + return input_offset(layer_id) + static_cast(h_vec_sz * in_vec_sz); + }; + + int bias_stride() const { return bias_vector_size(); } + + size_t bias_off(int layer_id, int bias_id) const + { + return bias_start_off + bias_relative_off(layer_id, bias_id); + } +}; + +struct ReluReserveBufferOffsets +{ + struct RBuffHelper + { + int element, save_point, batch; + size_t layer, table; + }; + +private: + auto Reserve_Buffer_strides(int save_point_sz, + int batches_per_layer, + int layers, + int bidirect_mode = 0) const + { + const auto element_st = 1; + const auto save_point_st = element_st * save_point_sz; + const auto batch_st = save_point_st; + const auto layer_st = static_cast(batch_st) * batches_per_layer; + const auto table_st = layers * layer_st; + + if(bidirect_mode == 0) + return RBuffHelper{element_st, save_point_st, batch_st, layer_st, table_st}; + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + } + +public: + ReluReserveBufferOffsets(int hidden_vec_sz, + int save_point_sz, + int layers_cnt, + int batches_per_layer) + : h_vec_size(hidden_vec_sz), + save_point_size(save_point_sz), + layers(layers_cnt), + batches_per_layer(batches_per_layer), + strides(Reserve_Buffer_strides(save_point_sz, batches_per_layer, layers_cnt, 0)) + { + } + + const int h_vec_size; + const int save_point_size; + + const int layers; + const int batches_per_layer; + const RBuffHelper strides; + + size_t layer_offset(int layer_id) const + { + return static_cast(layer_id) * strides.layer; + } + + auto layer_stride() const { return strides.layer; } + + auto gemm_write_size() const { return h_vec_size; } + + auto gemm_write_stride() const { return strides.batch; } + + size_t gemm_write_relative_offset(int batch_id) const + { + return static_cast(gemm_write_stride()) * batch_id; + } + + size_t gemm_write_offset(int layer_id, int batch_id) const + { + return layer_offset(layer_id) + static_cast(gemm_write_stride()) * batch_id; + } + + size_t ht_offset(int layer_id, int batch_id) const + { + return strides.table + layer_offset(layer_id) + gemm_write_relative_offset(batch_id); + } + + size_t ht_offset(int layer_id) const { return strides.table + layer_offset(layer_id); } +}; + +struct LSTMReserveBufferHelper +{ + struct RBuffHelper + { + int element, save_point, batch; + size_t layer; + }; + +private: + auto Reserve_Buffer_strides(int save_point_sz, + int batches_per_layer, + int save_points, + int bidirect_mode = 0) const + { + const auto element_st = 1; + const auto save_point_st = element_st * save_point_sz; + const auto batch_st = save_point_st * save_points; + const auto layer_st = static_cast(batch_st) * batches_per_layer; + if(bidirect_mode == 0) + return RBuffHelper{element_st, save_point_st, batch_st, layer_st}; + 
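+        // Resulting reserve-buffer layout: every batch row of a layer holds
+        // save_points consecutive blocks of save_point_sz elements (the i, f,
+        // g, o gate pre-activations plus the cell and hidden states), so
+        // batch_st = save_point_sz * save_points and layer_st =
+        // batch_st * batches_per_layer. Hypothetical sizes: save_point_sz = 4
+        // and save_points = 6 make one batch row 24 elements wide.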
MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + } + +public: + enum save_point + { + F = 1, + I = 0, + G = 2, + O = 3, + St = 4, + Ht = 5 + }; + + LSTMReserveBufferHelper(int hidden_vec_sz, + int save_point_sz, + int layers_cnt, + int batches_per_layer, + int save_points, + int gates_cnt) + : h_vec(hidden_vec_sz), + save_point_size(save_point_sz), + layers(layers_cnt), + batches(batches_per_layer), + save_points_cnt(save_points), + gates(gates_cnt), + strides(Reserve_Buffer_strides(save_point_sz, batches, save_points, 0)) + { + } + + const int h_vec; + const int save_point_size; // for bidirect TODO + + const int layers; + const int batches; + const int save_points_cnt; + const int gates; + const RBuffHelper strides; + + size_t layer_offset(int layer) const { return static_cast(layer) * strides.layer; } + auto layer_stride() const { return strides.layer; } + + auto gemm_write_size() const { return h_vec * gates; } + auto gemm_write_stride() const { return strides.batch; } // save_point_size * save_points_cnt + + size_t gemm_write_relative_offset(int batch_id) const + { + return static_cast(gemm_write_stride()) * batch_id; + } + + size_t gemm_write_offset(int layer, int batch_id) const + { + return layer_offset(layer) + static_cast(gemm_write_stride()) * batch_id; + } + + auto ht_relative_offset() const { return save_point::Ht * save_point_size; } + + auto ct_relative_offset() const { return save_point::St * save_point_size; } + + auto get_gate_relative_offset(int gate_id) const { return gate_id * save_point_size; } + + size_t ht_offset(int layer_id, int batch_id) const + { + return layer_offset(layer_id) + gemm_write_relative_offset(batch_id) + ht_relative_offset(); + } + + size_t extra_save_point_offset(int layer_id, int batch_id) const + { + return (static_cast(batches) * layers * gemm_write_stride()) // all data offset + + (static_cast(batches) * layer_id) * h_vec + + static_cast(batch_id * h_vec); + } +}; + +struct LSTMWeightsBufferHelper +{ +private: + auto hidden_xinput_size(int hidden_sz, int bidirect_mode) const + { + if(bidirect_mode == 0) + return hidden_sz; + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + } + + auto matrix_lin_layer_size(int input_vector_sz, int hidden_vec_sz, int gates) const + { + return (input_vector_sz + hidden_vec_sz) * hidden_vec_sz * gates; + } + size_t bias_start_offset( + int input_vector_sz, int hidden_vec_sz, int layers_cnt, int gates, int bidirect_mode) const + { + if(bidirect_mode == 0) + return matrix_lin_layer_size(input_vector_sz, hidden_vec_sz, gates) + + static_cast(hidden_vec_sz + hidden_xinput_size(hidden_vec_sz, 0)) * + hidden_vec_sz * static_cast(layers_cnt - 1) * gates; + + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + } + +public: + LSTMWeightsBufferHelper( + int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode, int gates) + : in_vec(input_vector_sz), + h_vec(hidden_vec_sz), + x_in_vec(hidden_xinput_size(hidden_vec_sz, 0)), + layers(layers_cnt), + gates_cnt(gates), + bias_cnt(bias_mode), + matrix_normal_start_off(matrix_lin_layer_size(input_vector_sz, hidden_vec_sz, gates)), + bias_start_off(bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, gates, 0)) + { + } + + const int in_vec, h_vec; + const int x_in_vec; // for bidirect TODO + + const int layers; + const int gates_cnt; + const int bias_cnt; // 0 - no bisa; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec +private: + const size_t 
matrix_normal_start_off; + const size_t bias_start_off; + +public: + auto get_matrix_x_size(int layer_id) const + { + return (layer_id > 0 ? x_in_vec : in_vec) * h_vec; + } + auto get_matrix_h_size() const { return h_vec * h_vec; } + auto get_matrix_layer_size(int layer_id) const + { + return get_matrix_x_size(layer_id) * gates_cnt + get_matrix_h_size() * gates_cnt; + } + + size_t get_matrix_x_off(int layer_id) const + { + if(layer_id > 0) + return matrix_normal_start_off + + static_cast(layer_id - 1) * get_matrix_layer_size(layer_id); + else + return 0; + }; + + size_t get_matrix_h_off(int layer_id) const + { + if(layer_id > 0) + return get_matrix_x_off(layer_id) + static_cast(h_vec * x_in_vec * gates_cnt); + else + return get_matrix_x_off(layer_id) + static_cast(h_vec * in_vec) * gates_cnt; + }; + + int bias_vector_size() const { return h_vec; } + int bias_vector_mul_gate() const { return bias_vector_size() * gates_cnt; } + int bias_stride() const { return bias_vector_mul_gate(); } + + size_t bias_relative_off(int layer_id, int bias_id) const + { + return static_cast(layer_id * bias_cnt + bias_id) * gates_cnt * h_vec; + } + + size_t get_bias_off(int layer_id, int bias_id) const + { + return bias_start_off + bias_relative_off(layer_id, bias_id); + } +}; } // namespace miopen #endif // GUARD_MIOPEN_RNN_UTIL_HPP_ diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 86fb6c1508..094308b269 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -40,191 +40,712 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_RNNFWD_exp) namespace miopen { -void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, - std::vector& seq_array, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& hxDesc, - ConstData_t hx, - const TensorDescriptor& wDesc, - ConstData_t w, - const TensorDescriptor& yDesc, - Data_t y, - Data_t hy, - Data_t reserveSpace, - size_t reserveSpaceSize) const +void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, + std::vector& seq_array, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& hxDesc, + ConstData_t hx, + const TensorDescriptor& wDesc, + ConstData_t w, + const TensorDescriptor& yDesc, + Data_t y, + Data_t hy, + Data_t reserveSpace, + size_t reserveSpaceSize) const { #if MIOPEN_USE_GEMM && MIOPEN_BACKEND_HIP int seq_len = seq_array.size(); if(seq_len == 0) return; - struct ReluWeightOffsets + int max_batch = seq_array[0]; + int hidden_size; + std::tie(std::ignore, max_batch, hidden_size) = miopen::tien<3>(hxDesc.GetLengths()); + + int in_vec_size = xDesc.GetLengths()[1]; + int out_vec_size = yDesc.GetLengths()[1]; + + GruWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2); + + ActivationDescriptor sigDesc = {miopenActivationLOGISTIC, 1, 0, 1}; + ActivationDescriptor tanhDesc = {miopenActivationTANH, 1, 1, 1}; + + int total_batch_size = 0; + // accumulated batches per time + std::vector bacc_per_time(seq_len + 1); + std::vector batches; + + for(int i = 0; i < seq_len; i++) { - private: - auto hidden_xinput_size(int hidden_sz, int bidirect_mode) const + bacc_per_time[i] = total_batch_size; + total_batch_size += seq_array[i]; + batches.push_back(seq_array[i]); + } + + bacc_per_time[seq_len] = total_batch_size; + + GRUOffsets RBuff(nLayers, hidden_size, total_batch_size); + + auto get_HxBuff_offset = [&](int layer_id) { + return layer_id * (static_cast(hidden_size) * max_batch); + }; + + auto call_gru_input_gemm = [*this, + &RBuff, + &WeiBuf, + &in_vec_size, + &handle, + &xDesc, + &wDesc, + reserveSpace, + x, + w, + 
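+                               // call_gru_input_gemm computes the x-side
+                               // pre-activations x * W^T for the Z, R and C
+                               // gates of the whole layer in one GEMM (n spans
+                               // all three gates), adds the x-side bias when
+                               // biasMode != 0, then parks the C-gate result
+                               // in the Ht slot and zeroes the C slot so the
+                               // recurrent contribution can be accumulated
+                               // there separately.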
hidden_size](int layer_id, float beta_t = 1) { + // n = Rx,Zx,Cx + const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size() * WeiBuf.matrixes::Count, + k = layer_id > 0 ? hidden_size : in_vec_size; + + const int lda = layer_id > 0 ? RBuff.gemm_write_stride() : in_vec_size, ldb = k, + ldc = RBuff.gemm_write_stride(); + + const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + true, + m, + n, + k, + lda, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + beta_t, // beta + xDesc.GetType(), + false}; + + const auto input_weight_offset = WeiBuf.input_offset(layer_id); + const auto output_offset = RBuff.layer_offset(layer_id); + + const auto input_offset = + layer_id > 0 ? RBuff.batch_offset(layer_id - 1, 0) + RBuff.hidden_offset() : 0; + + const auto input_ptr = layer_id > 0 ? reserveSpace : x; + + const miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc, + input_ptr, + input_offset, + w, + input_weight_offset, + reserveSpace, + output_offset, + GemmBackend_t::miopengemm); + if(gemm_status != miopenStatusSuccess) + MIOPEN_THROW("GEMM execution failure"); + + if(biasMode != 0u) { - if(bidirect_mode == 0) - return hidden_sz; - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + float alpha0 = 1; + float alpha1 = 1; + float beta_t = 0; + + const std::vector tensor_size{1, + static_cast(RBuff.batches_per_layer), + static_cast(hidden_size) * + WeiBuf.matrixes::Count}; + + const std::vector tensor_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; + + auto wei_shift_bias_temp = WeiBuf.bias_off() + WeiBuf.weight_stride * 2 * layer_id; + + auto tensor_desc = + miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + + const std::vector weight_size{ + 1, 1, static_cast(hidden_size) * WeiBuf.matrixes::Count}; + + const std::vector weight_stride{ + static_cast(hidden_size) * WeiBuf.matrixes::Count, + static_cast(hidden_size) * WeiBuf.matrixes::Count, + 1}; + + auto wei_desc = miopen::TensorDescriptor(wDesc.GetType(), weight_size, weight_stride); + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + tensor_desc, + reserveSpace, + &alpha1, + wei_desc, + w, + &beta_t, + tensor_desc, + reserveSpace, + output_offset, + wei_shift_bias_temp, + output_offset); } - auto matrix_lin_layer_size(int input_vector_sz, int hidden_vec_sz) const + const std::vector tensor_size{ + 1, static_cast(RBuff.batches_per_layer), static_cast(hidden_size)}; + + const std::vector tensor_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; + + auto desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + + float alpha0 = 0; + float alpha1 = 0; + beta_t = 0; + + CopyTensor(handle, + desc, + reserveSpace, + desc, + reserveSpace, + RBuff.layer_offset(layer_id) + RBuff.c_offset(), + RBuff.layer_offset(layer_id) + RBuff.hidden_offset()); + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + desc, + reserveSpace, + &alpha1, + desc, + reserveSpace, + &beta_t, + desc, + reserveSpace, + RBuff.layer_offset(layer_id) + RBuff.c_offset(), + RBuff.layer_offset(layer_id) + RBuff.c_offset(), + RBuff.layer_offset(layer_id) + RBuff.c_offset()); + }; + + auto call_gru_bias_add = [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w](int layer_id, + float beta_t = 0) { + float alpha0 = 1; + float alpha1 = 1; + const auto bias_stride = WeiBuf.bias_stride(); + + auto wei_shift_bias_temp = + WeiBuf.bias_off() + WeiBuf.weight_stride * 2 
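+            // The GRU weight buffer keeps two bias vectors per layer, each
+            // weight_stride (= 3 * hidden_size) long: the first is folded in
+            // by call_gru_input_gemm, while this lambda adds the second,
+            // recurrent-side bias, hence the extra "+ WeiBuf.weight_stride".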
* layer_id + WeiBuf.weight_stride; + + const auto bias_desc = + miopen::TensorDescriptor(wDesc.GetType(), + std::vector{1, 1, bias_stride}, + std::vector{bias_stride, bias_stride, 1}); + + const auto hidden_interim_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{1, RBuff.batches_per_layer, WeiBuf.bias_stride()}, + std::vector{ + RBuff.batches_per_layer * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); + + const auto RB_layer_out_off = RBuff.layer_offset(layer_id); + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hidden_interim_desc, + reserveSpace, + &alpha1, + bias_desc, + w, + &beta_t, + hidden_interim_desc, + reserveSpace, + RB_layer_out_off, + wei_shift_bias_temp, + RB_layer_out_off); + }; + + auto call_gru_hidden_gemm = [*this, + &RBuff, + &WeiBuf, + &get_HxBuff_offset, + &bacc_per_time, + &batches, + &handle, + &xDesc, + reserveSpace, + hx, + w, + hidden_size](int layer, int cur_time) { + if(cur_time == 0 && hx == nullptr) + return; + + const int m = batches.at(cur_time), n = RBuff.gemm_write_size() * WeiBuf.matrixes::Count, + k = hidden_size; + + const int lda = (cur_time == 0) ? hidden_size : RBuff.gemm_write_stride(); + + const int ldb = hidden_size, ldc = RBuff.gemm_write_stride(); + + const miopen::GemmDescriptor gemm_desc_hx = GemmDescriptor{false, + false, + true, + m, + n, + k, + lda, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + xDesc.GetType(), + false}; + + const auto ht_offset = + (cur_time == 0) + ? get_HxBuff_offset(layer) + : RBuff.batch_offset(layer, bacc_per_time[cur_time - 1]) + RBuff.hidden_offset(); + + const auto ht_ptr = cur_time > 0 ? reserveSpace : hx; + + const auto result_offset = RBuff.batch_offset(layer, bacc_per_time[cur_time]); + + const miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc_hx, + ht_ptr, + ht_offset, + w, + WeiBuf.hidden_offset(layer), + reserveSpace, + result_offset, + GemmBackend_t::miopengemm); + if(gemm_status != miopenStatusSuccess) + MIOPEN_THROW("GEMM execution failure"); + }; + + auto call_gru_activate_rz = [*this, + &RBuff, + &bacc_per_time, + &batches, + &handle, + &wDesc, + reserveSpace, + &sigDesc, + hidden_size](int layer_id, int time_id) { + float alpha = 1, beta = 0; + + const std::vector tensor_size{ + 1, static_cast(batches.at(time_id)), static_cast(hidden_size) * 2}; + + const std::vector tensor_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; + + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + + auto r_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]); + auto r_act_offset = r_offset + RBuff.activated_offset(); + + sigDesc.Forward(handle, + &alpha, + // input tensor descriptor + src_desc, + // input pointer + reserveSpace, + &beta, + // output tensor descriptor + dst_desc, + // output pointer + reserveSpace, + // input tensor offset + r_offset, + // output tensor offset + r_act_offset); + }; + + auto call_gru_compute_c = + [*this, &RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, hidden_size]( + int layer_id, int time_id) { + auto с_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset(); + auto hidden_offset = + RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.hidden_offset(); + auto hidden_act_offset = hidden_offset + RBuff.activated_offset(); + auto r_act_offset = 
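+            // At this point the C slot holds only the recurrent part of the
+            // candidate (the x-side part was parked in the Ht slot and the
+            // slot zeroed by call_gru_input_gemm). The code below rebuilds the
+            // usual GRU candidate pre-activation
+            //     c = (Wx * x + bx) + r .* (Wh * h(t-1) + bh)
+            // (bias terms only when biasMode != 0) by multiplying the C slot
+            // with the activated R gate and adding the parked x-side term.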
RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + + RBuff.r_offset() + RBuff.activated_offset(); + + const std::vector tensor_size{ + 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; + + const std::vector tensor_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; + + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + auto desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + + CopyTensor(handle, desc, reserveSpace, desc, reserveSpace, с_offset, hidden_act_offset); + + float alpha0 = 1; + float alpha1 = 1; + float beta = 0; + + OpTensor(handle, + miopenTensorOpMul, + &alpha0, + desc, + reserveSpace, + &alpha1, + desc, + reserveSpace, + &beta, + desc, + reserveSpace, + r_act_offset, + с_offset, + с_offset); + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + desc, + reserveSpace, + &alpha1, + desc, + reserveSpace, + &beta, + desc, + reserveSpace, + с_offset, + hidden_offset, + с_offset); + }; + + auto call_gru_activate_c_gate = [*this, + &RBuff, + &bacc_per_time, + &batches, + &handle, + &wDesc, + reserveSpace, + &tanhDesc, + hidden_size](int layer_id, int time_id) { + float alpha = 1, beta = 0; + + const std::vector tensor_size{ + 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; + + const std::vector tensor_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; + + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + + auto c_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset(); + auto c_act_offset = c_offset + RBuff.activated_offset(); + + auto z_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.z_offset(); + auto z_act_offset = z_offset + RBuff.activated_offset(); + + tanhDesc.Forward(handle, + &alpha, + // input tensor descriptor + src_desc, + // input pointer + reserveSpace, + &beta, + // output tensor descriptor + dst_desc, + // output pointer + reserveSpace, + // input tensor offset + c_offset, + // output tensor offset + c_act_offset); + }; + + auto call_gru_compute_hidden = [*this, + &RBuff, + &bacc_per_time, + &batches, + &handle, + &wDesc, + reserveSpace, + &sigDesc, + hidden_size, + hx](int layer_id, int time_id) { + const std::vector tensor_size{ + 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; + + const std::vector tensor_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; + + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + + auto hidden_offset = + RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.hidden_offset(); + auto zact_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.z_offset() + + RBuff.activated_offset(); + auto cact_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset() + + RBuff.activated_offset(); + + const std::vector hidden_tensor_size{ + 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; + const std::vector hidden_tensor_stride{ + static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; + auto hidden_tensor_desc = + miopen::TensorDescriptor(wDesc.GetType(), hidden_tensor_size, hidden_tensor_stride); + float alpha0 = -1, alpha1 = 1, beta = 0; + + OpTensor(handle, + miopenTensorOpMul, + &alpha0, + hidden_tensor_desc, + reserveSpace, + 
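+                 // The three OpTensor calls in this lambda assemble the GRU
+                 // state update h(t) = (1 - z) .* c + z .* h(t-1): this Mul
+                 // (alpha0 = -1) writes -z .* c into the Ht slot, the Add
+                 // turns it into (1 - z) .* c, and the final Mul (beta = 1)
+                 // accumulates z .* h(t-1), read from hx when time_id == 0
+                 // and from the previous step's Ht slot otherwise.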
&alpha1, + hidden_tensor_desc, + reserveSpace, + &beta, + hidden_tensor_desc, + reserveSpace, + zact_offset, + cact_offset, + hidden_offset); + + alpha0 = 1; + alpha1 = 1; + beta = 0; + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hidden_tensor_desc, + reserveSpace, + &alpha1, + hidden_tensor_desc, + reserveSpace, + &beta, + hidden_tensor_desc, + reserveSpace, + cact_offset, + hidden_offset, + hidden_offset); + + if(time_id == 0) { - return (input_vector_sz + hidden_vec_sz) * hidden_vec_sz; - } + const std::vector hx_tensor_size{ + 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; + const std::vector hx_tensor_stride{ + static_cast(batches.at(time_id) * hidden_size), + static_cast(hidden_size), + 1}; - size_t bias_start_offset(int input_vector_sz, - int hidden_vec_sz, - int layers_cnt, - int bidirect_mode) const - { - if(bidirect_mode == 0) - return matrix_lin_layer_size(input_vector_sz, hidden_vec_sz) + - static_cast(hidden_vec_sz + hidden_xinput_size(hidden_vec_sz, 0)) * - hidden_vec_sz * static_cast(layers_cnt - 1); + auto hx_tensor_desc = + miopen::TensorDescriptor(wDesc.GetType(), hx_tensor_size, hx_tensor_stride); + auto hx_offset = batches.at(time_id) * hidden_size * layer_id; - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); - } + alpha0 = 1; + alpha1 = 1; + beta = 1; - public: - ReluWeightOffsets(int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode) - : in_vec_sz(input_vector_sz), - h_vec_sz(hidden_vec_sz), - x_in_vec_sz(hidden_xinput_size(hidden_vec_sz, 0)), - bias_cnt(bias_mode), - matrix_normal_start_off(matrix_lin_layer_size(input_vector_sz, hidden_vec_sz)), - bias_start_off(bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, 0)) - { + OpTensor(handle, + miopenTensorOpMul, + &alpha0, + hidden_tensor_desc, + reserveSpace, + &alpha1, + hx_tensor_desc, + hx, + &beta, + hidden_tensor_desc, + reserveSpace, + zact_offset, + hx_offset, + hidden_offset); } + else + { + auto hidden_prev_offset = + RBuff.batch_offset(layer_id, bacc_per_time[time_id - 1]) + RBuff.hidden_offset(); + alpha0 = 1; + alpha1 = 1; + beta = 1; - private: - const int in_vec_sz, h_vec_sz; - const int x_in_vec_sz; // for bidirect TODO + OpTensor(handle, + miopenTensorOpMul, + &alpha0, + hidden_tensor_desc, + reserveSpace, + &alpha1, + hidden_tensor_desc, + reserveSpace, + &beta, + hidden_tensor_desc, + reserveSpace, + zact_offset, + hidden_prev_offset, + hidden_offset); + } + }; - const int - bias_cnt; // 0 - no bias; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec + auto call_gru_update_output = [&RBuff, + &get_HxBuff_offset, + &bacc_per_time, + &batches, + &handle, + &wDesc, + reserveSpace, + hy, + max_batch, + hidden_size, + seq_len](int layer_id) { + if(hy == nullptr) + return; - const size_t matrix_normal_start_off; - const size_t bias_start_off; + auto hcy_layer_offset = get_HxBuff_offset(layer_id); - auto get_input_matrix_size(int layer_id) const - { - return (layer_id > 0 ? 
x_in_vec_sz : in_vec_sz) * h_vec_sz; - } + const std::vector hcy_src_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; + const std::vector hcy_dst_stride{ + static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; - auto get_hidden_matrix_size() const { return h_vec_sz * h_vec_sz; } - auto get_matrix_layer_size(int layer_id) const + for(int time_i = seq_len - 1; time_i >= 0; time_i--) { - return get_input_matrix_size(layer_id) + get_hidden_matrix_size(); - } - - int bias_vector_size() const { return h_vec_sz; } + auto copy_batch = (time_i == seq_len - 1) ? batches.at(time_i) + : batches.at(time_i) - batches.at(time_i + 1); + if(copy_batch > 0) + { + auto batch_id_relative = batches.at(time_i) - copy_batch; + auto batch_id_abs = bacc_per_time[time_i] + batch_id_relative; - size_t bias_relative_off(int layer_id, int bias_id) const - { - return static_cast(layer_id * bias_cnt + bias_id) * h_vec_sz; - } + auto hcy_batch_offset = batch_id_relative * hidden_size; - public: - size_t input_offset(int layer_id) const - { - if(layer_id > 0) - return matrix_normal_start_off + - static_cast(layer_id - 1) * get_matrix_layer_size(layer_id); - else - return 0; - }; + auto src_batch_offset = + RBuff.batch_offset(layer_id, batch_id_abs) + RBuff.hidden_offset(); - size_t hidden_offset(int layer_id) const - { - if(layer_id > 0) - return input_offset(layer_id) + static_cast(h_vec_sz * x_in_vec_sz); - else - return input_offset(layer_id) + static_cast(h_vec_sz * in_vec_sz); - }; + const std::vector hcy_copy_size{ + 1, static_cast(copy_batch), static_cast(hidden_size)}; - int bias_stride() const { return bias_vector_size(); } + auto src_desc = + miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); + auto dst_desc = + miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_dst_stride); - size_t bias_off(int layer_id, int bias_id) const - { - return bias_start_off + bias_relative_off(layer_id, bias_id); + CopyTensor(handle, + src_desc, + reserveSpace, + dst_desc, + hy, + src_batch_offset, + hcy_layer_offset + hcy_batch_offset); + } } }; - struct ReluReserveBufferOffsets - { - struct RBuffHelper - { - int element, save_point, batch; - size_t layer, table; - }; + auto call_gru_hidden_state_update = [&RBuff, + &bacc_per_time, + &batches, + &handle, + &wDesc, + reserveSpace, + &sigDesc, + hidden_size, + call_gru_activate_rz, + call_gru_compute_c, + call_gru_activate_c_gate, + call_gru_compute_hidden](int layer_id, int time_id) { + call_gru_activate_rz(layer_id, time_id); + call_gru_compute_c(layer_id, time_id); + call_gru_activate_c_gate(layer_id, time_id); + call_gru_compute_hidden(layer_id, time_id); + }; - private: - auto Reserve_Buffer_strides(int save_point_sz, - int batches_per_layer, - int layers, - int bidirect_mode = 0) const + for(int layer_id = 0; layer_id < nLayers; layer_id++) + { + call_gru_input_gemm(layer_id); + if(biasMode != 0u) { - const auto element_st = 1; - const auto save_point_st = element_st * save_point_sz; - const auto batch_st = save_point_st; - const auto layer_st = static_cast(batch_st) * batches_per_layer; - const auto table_st = layers * layer_st; - - if(bidirect_mode == 0) - return RBuffHelper{element_st, save_point_st, batch_st, layer_st, table_st}; - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + call_gru_bias_add(layer_id); } - - public: - ReluReserveBufferOffsets(int hidden_vec_sz, - int save_point_sz, - int layers_cnt, - int batches_per_layer) - : 
h_vec_size(hidden_vec_sz), - save_point_size(save_point_sz), - layers(layers_cnt), - batches_per_layer(batches_per_layer), - strides(Reserve_Buffer_strides(save_point_sz, batches_per_layer, layers_cnt, 0)) + for(int time = 0; time < seq_len; time++) { + call_gru_hidden_gemm(layer_id, time); + call_gru_hidden_state_update(layer_id, time); } + call_gru_update_output(layer_id); + } - const int h_vec_size; - const int save_point_size; - - const int layers; - const int batches_per_layer; - const RBuffHelper strides; + // output tensor copy + { + const std::vector y_copy_size{ + 1, static_cast(total_batch_size), static_cast(out_vec_size)}; - size_t layer_offset(int layer_id) const - { - return static_cast(layer_id) * strides.layer; - } + const std::vector y_src_stride{ + RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; - auto layer_stride() const { return strides.layer; } + const std::vector y_dst_stride{static_cast(out_vec_size * total_batch_size), + static_cast(out_vec_size), + 1}; - auto gemm_write_size() const { return h_vec_size; } + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_src_stride); + auto y_dst_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_dst_stride); - auto gemm_write_stride() const { return strides.batch; } + int src_offset = RBuff.layer_offset(nLayers - 1) + RBuff.hidden_offset(); - size_t gemm_write_relative_offset(int batch_id) const - { - return static_cast(gemm_write_stride()) * batch_id; - } + CopyTensor(handle, src_desc, reserveSpace, y_dst_desc, y, src_offset, 0); + } - size_t gemm_write_offset(int layer_id, int batch_id) const - { - return layer_offset(layer_id) + static_cast(gemm_write_stride()) * batch_id; - } +#else + (void)handle; + (void)seq_array; + (void)xDesc; + (void)x; + (void)hxDesc; + (void)hx; + (void)cx; + (void)wDesc; + (void)w; + (void)yDesc; + (void)y; + (void)hy; + (void)cy; + (void)reserveSpace; + (void)reserveSpaceSize; - size_t ht_offset(int layer_id, int batch_id) const - { - return strides.table + layer_offset(layer_id) + gemm_write_relative_offset(batch_id); - } + MIOPEN_THROW("GEMM is not supported"); +#endif +} - size_t ht_offset(int layer_id) const { return strides.table + layer_offset(layer_id); } - }; +void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, + std::vector& seq_array, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& hxDesc, + ConstData_t hx, + const TensorDescriptor& wDesc, + ConstData_t w, + const TensorDescriptor& yDesc, + Data_t y, + Data_t hy, + Data_t reserveSpace, + size_t reserveSpaceSize) const +{ +#if MIOPEN_USE_GEMM && MIOPEN_BACKEND_HIP + int seq_len = seq_array.size(); + if(seq_len == 0) + return; std::vector batches; int in_vec_size = xDesc.GetLengths()[1]; @@ -244,6 +765,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, total_batch_size += seq_array[i]; batches.push_back(seq_array[i]); } + bacc_per_time[seq_len] = total_batch_size; auto get_HxBuff_offset = [&](int layer_id) { @@ -264,7 +786,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, activDesc = {miopenActivationTANH, 1, 1, 1}; } - auto call_input_gemm = + auto call_relu_tan_input_gemm = [&RBuff, &WeiBuf, &in_vec_size, &handle, &xDesc, reserveSpace, x, w, hidden_size]( int layer, float beta_t = 1) { const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size(), @@ -311,8 +833,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_THROW("GEMM execution failure"); }; - auto call_bias_add = [&RBuff, 
&WeiBuf, &handle, &wDesc, reserveSpace, w](int layer, - float beta_t = 0) { + auto call_relu_tan_bias_add = [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w]( + int layer, float beta_t = 0) { float alpha0 = 1; float alpha1 = 1; const auto bias_stride = WeiBuf.bias_stride(); @@ -362,17 +884,17 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, RB_layer_out_off); }; - auto call_hidden_gemm = [&RBuff, - &WeiBuf, - &get_HxBuff_offset, - &bacc_per_time, - &batches, - &handle, - &xDesc, - reserveSpace, - hx, - w, - hidden_size](int layer, int cur_time) { + auto call_relu_tan_hidden_gemm = [&RBuff, + &WeiBuf, + &get_HxBuff_offset, + &bacc_per_time, + &batches, + &handle, + &xDesc, + reserveSpace, + hx, + w, + hidden_size](int layer, int cur_time) { if(cur_time == 0 && hx == nullptr) return; @@ -423,7 +945,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_THROW("GEMM execution failure"); }; - auto call_hidden_state_update = + auto call_relu_tan_hidden_state_update = [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &activDesc, hidden_size]( int layer_id, int time_id) { float alpha = 1, beta = 0; @@ -460,17 +982,17 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, ht_offset); }; - auto call_update_output = [&RBuff, - &get_HxBuff_offset, - &bacc_per_time, - &batches, - &handle, - &wDesc, - reserveSpace, - hy, - max_batch, - hidden_size, - seq_len](int layer_id) { + auto call_relu_tan_update_output = [&RBuff, + &get_HxBuff_offset, + &bacc_per_time, + &batches, + &handle, + &wDesc, + reserveSpace, + hy, + max_batch, + hidden_size, + seq_len](int layer_id) { if(hy == nullptr) return; @@ -516,17 +1038,17 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, if(biasMode != 0u) for(int layer_id = 0; layer_id < nLayers; layer_id++) - call_bias_add(layer_id); + call_relu_tan_bias_add(layer_id); for(int layer_id = 0; layer_id < nLayers; layer_id++) { - call_input_gemm(layer_id); + call_relu_tan_input_gemm(layer_id); for(int time = 0; time < seq_len; time++) { - call_hidden_gemm(layer_id, time); - call_hidden_state_update(layer_id, time); + call_relu_tan_hidden_gemm(layer_id, time); + call_relu_tan_hidden_state_update(layer_id, time); } - call_update_output(layer_id); + call_relu_tan_update_output(layer_id); } // output tensor copy @@ -631,203 +1153,10 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, int gates_cnt = 4; int save_points_cnt = 6; - struct WeightsBufferHelper - { - private: - auto hidden_xinput_size(int hidden_sz, int bidirect_mode) const - { - if(bidirect_mode == 0) - return hidden_sz; - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); - } - - auto matrix_lin_layer_size(int input_vector_sz, int hidden_vec_sz, int gates) const - { - return (input_vector_sz + hidden_vec_sz) * hidden_vec_sz * gates; - } - size_t bias_start_offset(int input_vector_sz, - int hidden_vec_sz, - int layers_cnt, - int gates, - int bidirect_mode) const - { - if(bidirect_mode == 0) - return matrix_lin_layer_size(input_vector_sz, hidden_vec_sz, gates) + - static_cast(hidden_vec_sz + hidden_xinput_size(hidden_vec_sz, 0)) * - hidden_vec_sz * static_cast(layers_cnt - 1) * gates; - - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); - } - - public: - WeightsBufferHelper( - int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode, int gates) - : in_vec(input_vector_sz), - h_vec(hidden_vec_sz), - x_in_vec(hidden_xinput_size(hidden_vec_sz, 0)), - layers(layers_cnt), - 
gates_cnt(gates), - bias_cnt(bias_mode), - matrix_normal_start_off(matrix_lin_layer_size(input_vector_sz, hidden_vec_sz, gates)), - bias_start_off( - bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, gates, 0)) - { - } - - const int in_vec, h_vec; - const int x_in_vec; // for bidirect TODO - - const int layers; - const int gates_cnt; - const int - bias_cnt; // 0 - no bisa; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec - private: - const size_t matrix_normal_start_off; - const size_t bias_start_off; - - public: - auto get_matrix_x_size(int layer_id) const - { - return (layer_id > 0 ? x_in_vec : in_vec) * h_vec; - } - auto get_matrix_h_size() const { return h_vec * h_vec; } - auto get_matrix_layer_size(int layer_id) const - { - return get_matrix_x_size(layer_id) * gates_cnt + get_matrix_h_size() * gates_cnt; - } - - size_t get_matrix_x_off(int layer_id) const - { - if(layer_id > 0) - return matrix_normal_start_off + - static_cast(layer_id - 1) * get_matrix_layer_size(layer_id); - else - return 0; - }; - - size_t get_matrix_h_off(int layer_id) const - { - if(layer_id > 0) - return get_matrix_x_off(layer_id) + - static_cast(h_vec * x_in_vec * gates_cnt); - else - return get_matrix_x_off(layer_id) + static_cast(h_vec * in_vec) * gates_cnt; - }; - - int bias_vector_size() const { return h_vec; } - int bias_vector_mul_gate() const { return bias_vector_size() * gates_cnt; } - int bias_stride() const { return bias_vector_mul_gate(); } - - size_t bias_relative_off(int layer_id, int bias_id) const - { - return static_cast(layer_id * bias_cnt + bias_id) * gates_cnt * h_vec; - } - - size_t get_bias_off(int layer_id, int bias_id) const - { - return bias_start_off + bias_relative_off(layer_id, bias_id); - } - - } WeiBuf(in_vec, hidden_size, nLayers, biasMode * 2, gates_cnt); - - struct ReserveBufferHelper - { - struct RBuffHelper - { - int element, save_point, batch; - size_t layer; - }; - - private: - auto Reserve_Buffer_strides(int save_point_sz, - int batches_per_layer, - int save_points, - int bidirect_mode = 0) const - { - const auto element_st = 1; - const auto save_point_st = element_st * save_point_sz; - const auto batch_st = save_point_st * save_points; - const auto layer_st = static_cast(batch_st) * batches_per_layer; - if(bidirect_mode == 0) - return RBuffHelper{element_st, save_point_st, batch_st, layer_st}; - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); - } - - public: - enum save_point - { - F = 1, - I = 0, - G = 2, - O = 3, - St = 4, - Ht = 5 - }; - - ReserveBufferHelper(int hidden_vec_sz, - int save_point_sz, - int layers_cnt, - int batches_per_layer, - int save_points, - int gates_cnt) - : h_vec(hidden_vec_sz), - save_point_size(save_point_sz), - layers(layers_cnt), - batches(batches_per_layer), - save_points_cnt(save_points), - gates(gates_cnt), - strides(Reserve_Buffer_strides(save_point_sz, batches, save_points, 0)) - { - } - - const int h_vec; - const int save_point_size; // for bidirect TODO - - const int layers; - const int batches; - const int save_points_cnt; - const int gates; - const RBuffHelper strides; - - size_t layer_offset(int layer) const { return static_cast(layer) * strides.layer; } - auto layer_stride() const { return strides.layer; } + LSTMWeightsBufferHelper WeiBuf(in_vec, hidden_size, nLayers, biasMode * 2, gates_cnt); - auto gemm_write_size() const { return h_vec * gates; } - auto gemm_write_stride() const - { - return strides.batch; - } // save_point_size * save_points_cnt - - size_t 
gemm_write_relative_offset(int batch_id) const - { - return static_cast(gemm_write_stride()) * batch_id; - } - - size_t gemm_write_offset(int layer, int batch_id) const - { - return layer_offset(layer) + static_cast(gemm_write_stride()) * batch_id; - } - - auto ht_relative_offset() const { return save_point::Ht * save_point_size; } - - auto ct_relative_offset() const { return save_point::St * save_point_size; } - - auto get_gate_relative_offset(int gate_id) const { return gate_id * save_point_size; } - - size_t ht_offset(int layer_id, int batch_id) const - { - return layer_offset(layer_id) + gemm_write_relative_offset(batch_id) + - ht_relative_offset(); - } - - size_t extra_save_point_offset(int layer_id, int batch_id) const - { - return (static_cast(batches) * layers * gemm_write_stride()) // all data offset - + (static_cast(batches) * layer_id) * h_vec + - static_cast(batch_id * h_vec); - } - - } RBuff(hidden_size, hidden_size, nLayers, total_batch_size, save_points_cnt, gates_cnt); + LSTMReserveBufferHelper RBuff( + hidden_size, hidden_size, nLayers, total_batch_size, save_points_cnt, gates_cnt); auto call_x_gemm = [&RBuff, &WeiBuf, @@ -870,7 +1199,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const auto x_in_offset = layer > 0 ? RBuff.ht_offset(layer - 1, start_b) : static_cast(start_b * InBuff_strides.batch); - const auto in_ptr = layer > 0 ? reserveSpace : x; + const auto in_ptr = layer > 0 ? reserveSpace : x; const miopenStatus_t gemm_status = CallGemm(handle, gemm_desc, @@ -1628,7 +1957,8 @@ void RNNDescriptor::RNNForwardInference(Handle& handle, 1, // beta xDesc[0].GetType(), false}; - miopenStatus_t gemm_status = CallGemm(handle, + + miopenStatus_t gemm_status = CallGemm(handle, gemm_desc, workSpace, prelayer_shift, @@ -2770,6 +3100,33 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, } return; } + + if((rnnMode == miopenGRU) && !use_dropout && nLayers > 0 && dirMode == miopenRNNunidirection && + inputMode != miopenRNNskip && !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) + { + RNNForwardTrainingGRU(handle, + in_n, + xDesc[0], + x, + hxDesc, + hx, + wDesc, + w, + yDesc[0], + y, + hy, + reserveSpace, + reserveSpaceSize); + if(is_profiling) + { + float eventTime_mS = RNNProfilingEnd(handle, start, stop); + handle.EnableProfiling(true); + handle.ResetKernelTime(); + handle.AccumKernelTime(eventTime_mS); + } + return; + } + #endif // MIOPEN_USE_GEMM&& MIOPEN_BACKEND_HIP int in_stride = xDesc[0].GetLengths()[1]; @@ -2780,6 +3137,7 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, int bi_stride = hy_h * bi; size_t wei_shift_bias = (in_h + hy_h + (bi * hy_h + hy_h) * (nLayers - 1)) * wei_stride; + size_t offset; float alpha0, alpha1, beta_t; float alpha = 1, beta = 0; @@ -3074,6 +3432,7 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, hid_shift + hid_off + bs * hy_h); // Update time profileRNNkernels(handle, 1, ctime); + OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -3106,7 +3465,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, sp_size[1] = batch_n; sp_size[2] = wei_stride; sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -3266,7 +3624,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, reserveSpace, static_cast(offset) + ri * wei_len, GemmBackend_t::miopengemm); - if(gemm_status != miopenStatusSuccess) { if(gemm_status == miopenStatusNotImplemented) @@ -3351,7 +3708,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, 1, // beta 
xDesc[0].GetType(), false}; - miopenStatus_t gemm_status = CallGemm(handle, gemm_desc, @@ -3475,7 +3831,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, alpha0 = 1; alpha1 = 1; beta_t = 1; - OpTensor(handle, miopenTensorOpMul, &alpha0, @@ -3564,7 +3919,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); } - if(in_n.at(use_time) > 0) { if(in_n.at(use_time) != in_n.at(cur_time)) @@ -3602,7 +3956,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, } } } - // active cell state tanhDesc.Forward(handle, &alpha, @@ -3663,7 +4016,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, // calculate c gate sp_size[2] = hy_h; sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); - CopyTensor(handle, sp_desc, reserveSpace, @@ -3678,7 +4030,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, alpha0 = 1; alpha1 = 1; beta_t = 0; - OpTensor(handle, miopenTensorOpMul, &alpha0, @@ -3698,7 +4049,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, static_cast(ri) * wei_len); // Update time profileRNNkernels(handle, 1, ctime); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -3738,7 +4088,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, alpha0 = -1; alpha1 = 1; beta_t = 0; - OpTensor(handle, miopenTensorOpMul, &alpha0, @@ -3762,7 +4111,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, alpha0 = 1; alpha1 = 1; beta_t = 0; - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -3794,7 +4142,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, hx_size[2] = hy_h; hx_desc = miopen::TensorDescriptor(wDesc.GetType(), hx_size, hx_stride); - OpTensor(handle, miopenTensorOpMul, &alpha0, @@ -3860,7 +4207,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, sp_desc = miopen::TensorDescriptor( wDesc.GetType(), sp_size, sp_stride); } - OpTensor(handle, miopenTensorOpMul, &alpha0, @@ -3883,7 +4229,6 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, } } } - bacc += in_n.at(ti); } @@ -3912,8 +4257,7 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, if(in_n.at(cur_time) > use_batch) { - offset = hid_shift + cur_batch * hy_stride; - + offset = hid_shift + cur_batch * hy_stride; sp_size[1] = in_n.at(cur_time) - use_batch; sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); @@ -3964,6 +4308,8 @@ void RNNDescriptor::RNNForwardTraining(Handle& handle, y_desc = miopen::TensorDescriptor(wDesc.GetType(), y_size, y_stride); sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); + // DumpGPUMemory(reserveSpace, prelayer_shift, sp_stride[0]); + CopyTensor(handle, sp_desc, reserveSpace, y_desc, y, prelayer_shift, 0); // Update time profileRNNkernels(handle, 2, ctime); @@ -5641,8 +5987,9 @@ void RNNDescriptor::RNNBackwardWeights(Handle& handle, MIOPEN_THROW(miopenStatusBadParm, "Output size doesn't match hidden state size!"); } - int in_stride = in_h; - int hy_stride = hy_h * bi * static_cast(workspaceScale); + int in_stride = in_h; + int hy_stride = hy_h * bi * static_cast(workspaceScale); + std::cout << "nHiddenTensorsPerLayer " << nHiddenTensorsPerLayer << "\n"; int wei_stride = hy_h * bi * static_cast(nHiddenTensorsPerLayer); int uni_stride = hy_h; int bi_stride = hy_h * bi; @@ -6164,7 +6511,7 @@ void RNNDescriptor::RNNBackwardWeights(Handle& handle, { hid_shift = ri == 0 ? (li * batch_n * hy_stride + bacc * hy_stride) : (li * batch_n * hy_stride + baccbi * hy_stride); - cur_time = ri == 0 ? 
ti : seqLen - 1 - ti; + cur_time = ri == 0 ? ti : seqLen - 1 - ti; if(ti > 0) { pre_batch = From f247d7de8f3a6f668c68d3f6958edcb6b1348e16 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Thu, 31 Aug 2023 23:13:35 +0300 Subject: [PATCH 03/27] Added RNN Backward Data refactor --- src/include/miopen/rnn.hpp | 20 ++ src/include/miopen/rnn_util.hpp | 29 +- src/ocl/rnnocl.cpp | 557 ++++++++++++++++++++++++++++++-- 3 files changed, 576 insertions(+), 30 deletions(-) diff --git a/src/include/miopen/rnn.hpp b/src/include/miopen/rnn.hpp index 97cdb66168..1f0fd047d1 100644 --- a/src/include/miopen/rnn.hpp +++ b/src/include/miopen/rnn.hpp @@ -338,6 +338,26 @@ struct RNNDescriptor : miopenRNNDescriptor Data_t reserveSpace, size_t reserveSpaceSize) const; + void RNNBackwardDataPackedTensorsRelu(Handle& handle, + int seqLen, + c_array_view dyDesc, + ConstData_t dy, + ConstData_t dhy, + ConstData_t dcy, + ConstData_t w, + ConstData_t hx, + ConstData_t cx, + c_array_view dxDesc, + Data_t dx, + const TensorDescriptor& dhxDesc, + Data_t dhx, + const TensorDescriptor& dcxDesc, + Data_t dcx, + Data_t workSpace, + size_t workSpaceSize, + Data_t reserveSpace, + size_t reserveSpaceSize) const; + void RNNBackwardDataPackedTensors(Handle& handle, int seqLen, c_array_view dyDesc, diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index 55cb031e4d..c02d958e97 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -242,18 +242,26 @@ struct ReluWeightOffsets } public: - ReluWeightOffsets(int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode) + ReluWeightOffsets(int input_vector_sz, + int hidden_vec_sz, + int layers_cnt, + int bias_mode, + int bidirectional, + int wei_stride = 0) : in_vec_sz(input_vector_sz), h_vec_sz(hidden_vec_sz), x_in_vec_sz(hidden_xinput_size(hidden_vec_sz, 0)), bias_cnt(bias_mode), matrix_normal_start_off(matrix_lin_layer_size(input_vector_sz, hidden_vec_sz)), - bias_start_off(bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, 0)) + bias_start_off(bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, 0)), + bidirectional(bidirectional), + wei_stride(wei_stride) { } private: - const int in_vec_sz, h_vec_sz; + const int in_vec_sz; + const int bidirectional; const int x_in_vec_sz; // for bidirect TODO const int bias_cnt; // 0 - no bias; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec @@ -267,6 +275,7 @@ struct ReluWeightOffsets } auto get_hidden_matrix_size() const { return h_vec_sz * h_vec_sz; } + auto get_matrix_layer_size(int layer_id) const { return get_input_matrix_size(layer_id) + get_hidden_matrix_size(); @@ -280,6 +289,20 @@ struct ReluWeightOffsets } public: + const int h_vec_sz; + const int wei_stride; + + size_t input_weight_offset(int layer_id) const + { + return hidden_weight_offset(layer_id) + h_vec_sz * wei_stride; + } + + size_t hidden_weight_offset(int layer_id) const + { + return in_vec_sz * wei_stride + + layer_id * (bidirectional * h_vec_sz + h_vec_sz) * wei_stride; + } + size_t input_offset(int layer_id) const { if(layer_id > 0) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index cba9d65a9e..24a4472dd2 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -772,7 +772,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, return layer_id * (static_cast(hidden_size) * max_batch); }; - ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2); + int bi = dirMode != 0u ? 
2 : 1; + ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2, bi); ReluReserveBufferOffsets RBuff(hidden_size, hidden_size, nLayers, total_batch_size); ActivationDescriptor activDesc; @@ -2040,8 +2041,9 @@ void RNNDescriptor::RNNForwardInferencePacked(Handle& handle, x_size[2] = hy_h; sp_size[1] = batch_n; sp_size[2] = hy_h; - x_desc = miopen::TensorDescriptor(wDesc.GetType(), x_size, x_stride); - sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); + + x_desc = miopen::TensorDescriptor(wDesc.GetType(), x_size, x_stride); + sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); for(int gi = 0; gi < nHiddenTensorsPerLayer * bi; gi++) { @@ -4595,8 +4597,6 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( y_desc = miopen::TensorDescriptor(wDesc.GetType(), y_size, y_stride); sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); - // DumpGPUMemory(reserveSpace, prelayer_shift, sp_stride[0]); - CopyTensor(handle, sp_desc, reserveSpace, y_desc, y, prelayer_shift, 0); // Update time profileRNNkernels(handle, 2, ctime); @@ -4688,28 +4688,27 @@ void RNNDescriptor::RNNBackwardData(Handle& handle, try { #endif - if(paddingMode == miopenRNNIONotPadded) { - RNNBackwardDataPackedTensors(handle, - seqLen, - dyDesc, - dy, - dhy, - dcy, - w, - hx, - cx, - dxDesc, - dx, - dhxDesc, - dhx, - dcxDesc, - dcx, - workSpace, - workSpaceSize, - reserveSpace, - reserveSpaceSize); + RNNBackwardDataPackedTensorsRelu(handle, + seqLen, + dyDesc, + dy, + dhy, + dcy, + w, + hx, + cx, + dxDesc, + dx, + dhxDesc, + dhx, + dcxDesc, + dcx, + workSpace, + workSpaceSize, + reserveSpace, + reserveSpaceSize); } else { @@ -4790,6 +4789,511 @@ void RNNDescriptor::RNNBackwardData(Handle& handle, #endif } +void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( + Handle& handle, + const int seqLen, + c_array_view dyDesc, + ConstData_t dy, + ConstData_t dhy, + ConstData_t dcy, + ConstData_t w, + ConstData_t hx, + ConstData_t cx, + c_array_view dxDesc, + Data_t dx, + const TensorDescriptor& dhxDesc, + Data_t dhx, + const TensorDescriptor& dcxDesc, + Data_t dcx, + Data_t workSpace, + size_t workSpaceSize, + Data_t reserveSpace, + size_t reserveSpaceSize) const +{ +#if MIOPEN_USE_GEMM + + // reset kernel timer + // if projections supported, dcxDesc.GetLengths()[2] should be used for hidden_size, + // dhxDesc.GetLengths()[2] for proj_size. + if(dhxDesc.GetSize() != dcxDesc.GetSize() || dhxDesc.GetLengths()[2] != dcxDesc.GetLengths()[2]) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + if(paddingMode != miopenRNNIONotPadded) + { + MIOPEN_THROW("Padded IO is not supported by this solver"); + } + + if(workSpaceSize < GetWorkspaceSize(handle, seqLen, dxDesc)) + { + MIOPEN_THROW("Workspace is required"); + } + + if(reserveSpaceSize < GetReserveSize(handle, seqLen, dxDesc)) + { + MIOPEN_THROW("Reservespace is required"); + } + + auto rnn_data_type = dhxDesc.GetType(); + + std::vector in_n; + int in_h = dxDesc[0].GetLengths()[1]; + int hy_d = dhxDesc.GetLengths()[0]; + int hy_n = dhxDesc.GetLengths()[1]; + int hy_h = dhxDesc.GetLengths()[2]; + int out_vec_size = dyDesc[0].GetLengths()[1]; + int bi = dirMode != 0u ? 
2 : 1; + int wei_stride = hy_h * bi * static_cast(nHiddenTensorsPerLayer); + + if(in_h <= 0 || hy_h <= 0 || hy_n <= 0 || hy_d <= 0 || out_vec_size <= 0 || seqLen <= 0) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + int total_batches = 0; + for(int i = 0; i < seqLen; i++) + { + int batchval, inputvec, batchvalout, outputvec; + std::tie(batchval, inputvec) = miopen::tien<2>(dxDesc[i].GetLengths()); + std::tie(batchvalout, outputvec) = miopen::tien<2>(dyDesc[i].GetLengths()); + if(batchval != batchvalout) + { + MIOPEN_THROW(miopenStatusBadParm); + } + if(i == 0) + { + if(batchval <= 0) + { + MIOPEN_THROW(miopenStatusBadParm, "Input batch is ZERO!"); + } + } + else + { + if(batchval > in_n.back() || batchval < 0) + { + MIOPEN_THROW(miopenStatusBadParm, + "Incorrect input batch size at time " + std::to_string(i) + + "! Batch size must not ascend!"); + } + } + in_n.push_back(batchval); + total_batches += dxDesc[i].GetLengths()[0]; + } + + if(out_vec_size != (bi * hy_h)) + { + MIOPEN_THROW(miopenStatusBadParm, "Output size doesn't match hidden state size!"); + } + + if(inputMode == miopenRNNskip) + { + if(in_h != hy_h) + { + MIOPEN_THROW(miopenStatusBadParm, + "The input tensor size must equal to the hidden " + "state size of the network in SKIP_INPUT mode!"); + } + in_h = 0; + } + + // Update time + ActivationDescriptor activDesc; + if(rnnMode == miopenRNNRELU) + { + activDesc = {miopenActivationRELU, 1, 0, 1}; + } + else if(rnnMode == miopenRNNTANH) + { + activDesc = {miopenActivationTANH, 1, 1, 1}; + } + + ReluWeightOffsets WeiBuf(in_h, hy_h, nLayers, bi * 2, 1, wei_stride); + ReluReserveBufferOffsets RBuff(hy_h, hy_h, nLayers, total_batches); + + auto get_HxBuff_offset = [&](int layer_id) { + return layer_id * (static_cast(hy_h) * hy_n); + }; + + auto propagate_input = + [*this, &RBuff, out_vec_size, &handle, &rnn_data_type, workSpace, dy, &WeiBuf, bi, w]( + int nLayers, int layer) { + // Propagate output + // + if(layer == nLayers - 1) + { + const std::vector y_copy_size{1, + static_cast(RBuff.batches_per_layer), + static_cast(out_vec_size)}; + + const std::vector y_dst_stride{ + RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; + + const std::vector y_src_stride{ + static_cast(out_vec_size * RBuff.batches_per_layer), + static_cast(out_vec_size), + 1}; + + auto y_src_desc = + miopen::TensorDescriptor(rnn_data_type, y_copy_size, y_src_stride); + auto y_dst_desc = + miopen::TensorDescriptor(rnn_data_type, y_copy_size, y_dst_stride); + + CopyTensor( + handle, y_src_desc, dy, y_dst_desc, workSpace, 0, RBuff.layer_offset(layer)); + } + // Propagate previous layer + // + else + { + miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + false, + RBuff.batches_per_layer, + RBuff.h_vec_size * bi, + RBuff.h_vec_size * bi, + RBuff.h_vec_size, + RBuff.h_vec_size * bi, + RBuff.h_vec_size, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + rnn_data_type, + false}; + + miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc, + workSpace, + RBuff.layer_offset(layer + 1), + w, + WeiBuf.input_weight_offset(layer), + workSpace, + RBuff.layer_offset(layer), + GemmBackend_t::miopengemm); + + if(gemm_status != miopenStatusSuccess) + { + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } + } + }; + + auto propagate_hidden_output = + [*this, &RBuff, &handle, in_n, &rnn_data_type, dhy, workSpace, &get_HxBuff_offset]( + int layer, int 
accumulated_batches, int ti) { + if(dhy != nullptr) + { + float alpha0 = 1; + float alpha1 = 1; + float beta_t = 0; + + std::vector hx_stride{in_n.at(0) * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + std::vector reserve_stride{ + RBuff.batches_per_layer * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + + std::vector hx_size{1, in_n.at(ti), RBuff.h_vec_size}; + std::vector reserve_size{1, in_n.at(ti), RBuff.h_vec_size}; + + auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); + auto workspace_desc = + miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hx_desc, + dhy, + &alpha1, + workspace_desc, + workSpace, + &beta_t, + workspace_desc, + workSpace, + get_HxBuff_offset(layer), + RBuff.gemm_write_offset(layer, accumulated_batches), + RBuff.gemm_write_offset(layer, accumulated_batches)); + } + }; + + auto propagate_hidden_prev = [*this, + &RBuff, + &handle, + in_n, + &rnn_data_type, + dhy, + workSpace, + &get_HxBuff_offset, + WeiBuf, + w](int layer, int accumulated_batches, int ti) { + if(dhy == nullptr || in_n.at(ti) <= in_n.at(ti + 1)) + return; + + std::vector hx_stride{in_n.at(0) * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + std::vector reserve_stride{ + RBuff.batches_per_layer * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + std::vector hx_size{1, in_n.at(ti) - in_n.at(ti + 1), RBuff.h_vec_size}; + std::vector reserve_size{1, in_n.at(ti) - in_n.at(ti + 1), RBuff.h_vec_size}; + + float alpha0 = 1; + float alpha1 = 1; + float beta_t = 0; + + auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); + auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hx_desc, + dhy, + &alpha1, + reserve_desc, + workSpace, + &beta_t, + reserve_desc, + workSpace, + get_HxBuff_offset(layer) + in_n.at(ti + 1) * RBuff.h_vec_size, + RBuff.gemm_write_offset(layer, accumulated_batches + in_n.at(ti + 1)), + RBuff.gemm_write_offset(layer, accumulated_batches + in_n.at(ti + 1))); + + miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + false, + in_n.at(ti + 1), + RBuff.h_vec_size, + RBuff.h_vec_size, + RBuff.gemm_write_stride(), + RBuff.gemm_write_stride(), + RBuff.gemm_write_stride(), + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + rnn_data_type, + false}; + + miopenStatus_t gemm_status = + CallGemm(handle, + gemm_desc, + workSpace, + RBuff.gemm_write_offset(layer, accumulated_batches + in_n.at(ti)), + w, + WeiBuf.hidden_weight_offset(layer), + workSpace, + RBuff.gemm_write_offset(layer, accumulated_batches), + GemmBackend_t::miopengemm); + + if(gemm_status != miopenStatusSuccess) + { + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } + }; + + auto propagate_hidden = [*this, + &RBuff, + &handle, + seqLen, + in_n, + &rnn_data_type, + workSpace, + reserveSpace, + &activDesc, + propagate_hidden_output, + propagate_hidden_prev](int nLayers, int layer) { + int accumulated_batches = RBuff.batches_per_layer; + std::vector hx_stride{in_n.at(0) * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + std::vector reserve_stride{ + RBuff.batches_per_layer * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + + for(int ti = seqLen - 1; ti >= 0; ti--) + { + accumulated_batches -= in_n.at(ti); + + if(ti == seqLen - 1) + { + propagate_hidden_output(layer, accumulated_batches, ti); + } + else + { + 
propagate_hidden_prev(layer, accumulated_batches, ti); + } + + std::vector reserve_size{1, in_n.at(ti), RBuff.h_vec_size}; + auto reserve_desc = + miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); + + float alpha = 1, beta = 0; + + activDesc.Backward(handle, + &alpha, + reserve_desc, + reserveSpace, + reserve_desc, + workSpace, + reserve_desc, + reserveSpace, + &beta, + reserve_desc, + workSpace, + RBuff.gemm_write_offset(layer, accumulated_batches) + + static_cast(nLayers) * RBuff.batches_per_layer * + RBuff.gemm_write_stride(), + RBuff.gemm_write_offset(layer, accumulated_batches), + RBuff.gemm_write_offset(layer, accumulated_batches), + RBuff.gemm_write_offset(layer, accumulated_batches)); + } + }; + + auto propagate_dhx = [*this, + seqLen, + &RBuff, + &WeiBuf, + &bi, + &rnn_data_type, + in_n, + &handle, + w, + dhx, + hy_n, + in_h, + workSpace](int layer) { + for(int ti = 0; ti < seqLen; ti++) + { + int use_time = ti > 0 ? ti - 1 : 0; + int use_batch = ti > 0 ? in_n.at(use_time) : 0; + + if(in_n.at(ti) <= use_batch) + return; + + miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + false, + (in_n.at(ti) - use_batch), + RBuff.h_vec_size, + RBuff.h_vec_size, + RBuff.gemm_write_stride(), + RBuff.gemm_write_stride(), + RBuff.gemm_write_stride(), + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + rnn_data_type, + false}; + + int hx_shift = layer * hy_n * RBuff.h_vec_size + use_batch * RBuff.h_vec_size; + + miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc, + workSpace, + RBuff.gemm_write_offset(layer, 2 * use_batch), + w, + WeiBuf.hidden_weight_offset(layer), + dhx, + hx_shift, + GemmBackend_t::miopengemm); + + if(gemm_status != miopenStatusSuccess) + { + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } + } + }; + + for(int li = static_cast(nLayers) - 1; li >= 0; li--) + { + propagate_input(nLayers, li); + propagate_hidden(nLayers, li); + propagate_dhx(li); + } + + int in_stride = in_h; + int hy_stride = hy_h * bi * static_cast(workspaceScale); + + miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + false, + total_batches, + in_h, + RBuff.h_vec_size * bi, + hy_stride, + in_stride, + in_stride, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 0, // beta + rnn_data_type, + false}; + miopenStatus_t gemm_status = + CallGemm(handle, gemm_desc, workSpace, 0, w, 0, dx, 0, GemmBackend_t::miopengemm); + + if(gemm_status != miopenStatusSuccess) + { + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } + +#else + (void)handle; + (void)seqLen; + (void)dhy; + (void)dcy; + (void)dyDesc; + (void)dy; + (void)w; + (void)hx; + (void)cx; + (void)dxDesc; + (void)dx; + (void)dhxDesc; + (void)dhx; + (void)dcxDesc; + (void)dcx; + (void)workSpace; + (void)workSpaceSize; + (void)reserveSpace; + (void)reserveSpaceSize; + MIOPEN_THROW("GEMM is not supported"); +#endif +}; + void RNNDescriptor::RNNBackwardDataPackedTensors( Handle& handle, const int seqLen, @@ -6171,7 +6675,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensors( dhx, hx_shift + ri * hy_n * hy_h + use_batch * hy_h, GemmBackend_t::miopengemm); - + if(gemm_status != miopenStatusSuccess) { if(gemm_status == miopenStatusNotImplemented) @@ -6430,7 +6934,6 @@ void RNNDescriptor::RNNBackwardWeights(Handle& handle, reserveSpace, 
reserveSpaceSize); } - #if MIOPEN_BACKEND_HIP } catch(...) From 7bb3be5ec27e3044bbc877dcd6af763cb6592374 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Fri, 15 Sep 2023 21:53:01 +0300 Subject: [PATCH 04/27] Added bidirection support for RNNBackwardData --- src/include/miopen/rnn.hpp | 32 +-- src/include/miopen/rnn_util.hpp | 54 ++-- src/ocl/rnnocl.cpp | 458 +++++++++++++++++++------------- 3 files changed, 321 insertions(+), 223 deletions(-) diff --git a/src/include/miopen/rnn.hpp b/src/include/miopen/rnn.hpp index 1f0fd047d1..eaf1028cdd 100644 --- a/src/include/miopen/rnn.hpp +++ b/src/include/miopen/rnn.hpp @@ -238,6 +238,7 @@ struct RNNDescriptor : miopenRNNDescriptor Data_t hy, Data_t reserveSpace, size_t reserveSpaceSize) const; + void RNNForwardTrainingPackedTensors(Handle& handle, int seqLen, c_array_view xDesc, @@ -339,24 +340,19 @@ struct RNNDescriptor : miopenRNNDescriptor size_t reserveSpaceSize) const; void RNNBackwardDataPackedTensorsRelu(Handle& handle, - int seqLen, - c_array_view dyDesc, - ConstData_t dy, - ConstData_t dhy, - ConstData_t dcy, - ConstData_t w, - ConstData_t hx, - ConstData_t cx, - c_array_view dxDesc, - Data_t dx, - const TensorDescriptor& dhxDesc, - Data_t dhx, - const TensorDescriptor& dcxDesc, - Data_t dcx, - Data_t workSpace, - size_t workSpaceSize, - Data_t reserveSpace, - size_t reserveSpaceSize) const; + int seqLen, + c_array_view dyDesc, + ConstData_t dy, + ConstData_t dhy, + ConstData_t w, + c_array_view dxDesc, + Data_t dx, + const TensorDescriptor& dhxDesc, + Data_t dhx, + Data_t workSpace, + size_t workSpaceSize, + Data_t reserveSpace, + size_t reserveSpaceSize) const; void RNNBackwardDataPackedTensors(Handle& handle, int seqLen, diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index c02d958e97..148373d917 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -34,6 +34,12 @@ namespace miopen { +enum rnn_direction +{ + Forward = 0, + Backward = 1 +}; + #if MIOPEN_BACKEND_HIP inline void RNNProfilingBegin(const miopen::Handle& handle, miopen::HipEventPtr& start, @@ -233,12 +239,9 @@ struct ReluWeightOffsets int layers_cnt, int bidirect_mode) const { - if(bidirect_mode == 0) - return matrix_lin_layer_size(input_vector_sz, hidden_vec_sz) + - static_cast(hidden_vec_sz + hidden_xinput_size(hidden_vec_sz, 0)) * - hidden_vec_sz * static_cast(layers_cnt - 1); - - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + return matrix_lin_layer_size(input_vector_sz, hidden_vec_sz) + + static_cast(hidden_vec_sz + hidden_xinput_size(hidden_vec_sz, 0)) * + hidden_vec_sz * static_cast(layers_cnt - 1); } public: @@ -253,7 +256,8 @@ struct ReluWeightOffsets x_in_vec_sz(hidden_xinput_size(hidden_vec_sz, 0)), bias_cnt(bias_mode), matrix_normal_start_off(matrix_lin_layer_size(input_vector_sz, hidden_vec_sz)), - bias_start_off(bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, 0)), + bias_start_off( + bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, bidirectional)), bidirectional(bidirectional), wei_stride(wei_stride) { @@ -294,13 +298,14 @@ struct ReluWeightOffsets size_t input_weight_offset(int layer_id) const { - return hidden_weight_offset(layer_id) + h_vec_sz * wei_stride; + return hidden_weight_offset(layer_id, 0) + h_vec_sz * wei_stride; } - size_t hidden_weight_offset(int layer_id) const + size_t hidden_weight_offset(int layer_id, int reverse) const { return in_vec_sz * wei_stride + - layer_id * (bidirectional * h_vec_sz + h_vec_sz) * wei_stride; + 
layer_id * (bidirectional * h_vec_sz + h_vec_sz) * wei_stride + + reverse * h_vec_sz * h_vec_sz; } size_t input_offset(int layer_id) const @@ -342,27 +347,29 @@ struct ReluReserveBufferOffsets int layers, int bidirect_mode = 0) const { - const auto element_st = 1; + const auto element_st = bidirect_mode ? 2 : 1; const auto save_point_st = element_st * save_point_sz; const auto batch_st = save_point_st; const auto layer_st = static_cast(batch_st) * batches_per_layer; const auto table_st = layers * layer_st; - if(bidirect_mode == 0) - return RBuffHelper{element_st, save_point_st, batch_st, layer_st, table_st}; - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + return RBuffHelper{element_st, save_point_st, batch_st, layer_st, table_st}; } public: ReluReserveBufferOffsets(int hidden_vec_sz, int save_point_sz, int layers_cnt, - int batches_per_layer) + int batches_per_layer, + int max_batch, + bool bidirect_mode = 0) : h_vec_size(hidden_vec_sz), save_point_size(save_point_sz), layers(layers_cnt), batches_per_layer(batches_per_layer), - strides(Reserve_Buffer_strides(save_point_sz, batches_per_layer, layers_cnt, 0)) + max_batch(max_batch), + strides( + Reserve_Buffer_strides(save_point_sz, batches_per_layer, layers_cnt, bidirect_mode)) { } @@ -372,6 +379,7 @@ struct ReluReserveBufferOffsets const int layers; const int batches_per_layer; const RBuffHelper strides; + const int max_batch; size_t layer_offset(int layer_id) const { @@ -380,7 +388,7 @@ struct ReluReserveBufferOffsets auto layer_stride() const { return strides.layer; } - auto gemm_write_size() const { return h_vec_size; } + auto gemm_write_size() const { return strides.save_point; } auto gemm_write_stride() const { return strides.batch; } @@ -389,14 +397,16 @@ struct ReluReserveBufferOffsets return static_cast(gemm_write_stride()) * batch_id; } - size_t gemm_write_offset(int layer_id, int batch_id) const + size_t gemm_write_offset(int layer_id, int batch_id, int reverse = 0) const { - return layer_offset(layer_id) + static_cast(gemm_write_stride()) * batch_id; + return layer_offset(layer_id) + static_cast(gemm_write_stride()) * batch_id + + reverse * h_vec_size; } - size_t ht_offset(int layer_id, int batch_id) const + size_t ht_offset(int layer_id, int batch_id, int reverse = 0) const { - return strides.table + layer_offset(layer_id) + gemm_write_relative_offset(batch_id); + return strides.table + layer_offset(layer_id) + gemm_write_relative_offset(batch_id) + + reverse * h_vec_size; } size_t ht_offset(int layer_id) const { return strides.table + layer_offset(layer_id); } @@ -472,7 +482,7 @@ struct LSTMReserveBufferHelper return static_cast(gemm_write_stride()) * batch_id; } - size_t gemm_write_offset(int layer, int batch_id) const + size_t gemm_write_offset(int layer, int batch_id, int reverse = 0) const { return layer_offset(layer) + static_cast(gemm_write_stride()) * batch_id; } diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 24a4472dd2..2488c244ec 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -773,8 +773,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, }; int bi = dirMode != 0u ? 
2 : 1; - ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2, bi); - ReluReserveBufferOffsets RBuff(hidden_size, hidden_size, nLayers, total_batch_size); + ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode, bi); + ReluReserveBufferOffsets RBuff(hidden_size, hidden_size, nLayers, total_batch_size, max_batch); ActivationDescriptor activDesc; @@ -4690,7 +4690,26 @@ void RNNDescriptor::RNNBackwardData(Handle& handle, #endif if(paddingMode == miopenRNNIONotPadded) { - RNNBackwardDataPackedTensorsRelu(handle, + if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH)) + { + RNNBackwardDataPackedTensorsRelu(handle, + seqLen, + dyDesc, + dy, + dhy, + w, + dxDesc, + dx, + dhxDesc, + dhx, + workSpace, + workSpaceSize, + reserveSpace, + reserveSpaceSize); + } + else + { + RNNBackwardDataPackedTensors(handle, seqLen, dyDesc, dy, @@ -4709,6 +4728,7 @@ void RNNDescriptor::RNNBackwardData(Handle& handle, workSpaceSize, reserveSpace, reserveSpaceSize); + } } else { @@ -4795,16 +4815,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( c_array_view dyDesc, ConstData_t dy, ConstData_t dhy, - ConstData_t dcy, ConstData_t w, - ConstData_t hx, - ConstData_t cx, c_array_view dxDesc, Data_t dx, const TensorDescriptor& dhxDesc, Data_t dhx, - const TensorDescriptor& dcxDesc, - Data_t dcx, Data_t workSpace, size_t workSpaceSize, Data_t reserveSpace, @@ -4815,10 +4830,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( // reset kernel timer // if projections supported, dcxDesc.GetLengths()[2] should be used for hidden_size, // dhxDesc.GetLengths()[2] for proj_size. - if(dhxDesc.GetSize() != dcxDesc.GetSize() || dhxDesc.GetLengths()[2] != dcxDesc.GetLengths()[2]) - { - MIOPEN_THROW(miopenStatusBadParm); - } if(paddingMode != miopenRNNIONotPadded) { @@ -4837,16 +4848,15 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto rnn_data_type = dhxDesc.GetType(); - std::vector in_n; - int in_h = dxDesc[0].GetLengths()[1]; - int hy_d = dhxDesc.GetLengths()[0]; - int hy_n = dhxDesc.GetLengths()[1]; - int hy_h = dhxDesc.GetLengths()[2]; + std::vector batches; + int input_size = dxDesc[0].GetLengths()[1]; + int max_batch = dhxDesc.GetLengths()[1]; + int hidden_size = dhxDesc.GetLengths()[2]; int out_vec_size = dyDesc[0].GetLengths()[1]; int bi = dirMode != 0u ? 2 : 1; - int wei_stride = hy_h * bi * static_cast(nHiddenTensorsPerLayer); + int wei_stride = hidden_size * bi * static_cast(nHiddenTensorsPerLayer); - if(in_h <= 0 || hy_h <= 0 || hy_n <= 0 || hy_d <= 0 || out_vec_size <= 0 || seqLen <= 0) + if(input_size <= 0 || hidden_size <= 0 || max_batch <= 0 || out_vec_size <= 0 || seqLen <= 0) { MIOPEN_THROW(miopenStatusBadParm); } @@ -4870,31 +4880,31 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } else { - if(batchval > in_n.back() || batchval < 0) + if(batchval > batches.back() || batchval < 0) { MIOPEN_THROW(miopenStatusBadParm, "Incorrect input batch size at time " + std::to_string(i) + "! 
Batch size must not ascend!"); } } - in_n.push_back(batchval); + batches.push_back(batchval); total_batches += dxDesc[i].GetLengths()[0]; } - if(out_vec_size != (bi * hy_h)) + if(out_vec_size != (bi * hidden_size)) { MIOPEN_THROW(miopenStatusBadParm, "Output size doesn't match hidden state size!"); } if(inputMode == miopenRNNskip) { - if(in_h != hy_h) + if(input_size != hidden_size) { MIOPEN_THROW(miopenStatusBadParm, "The input tensor size must equal to the hidden " "state size of the network in SKIP_INPUT mode!"); } - in_h = 0; + input_size = 0; } // Update time @@ -4908,36 +4918,43 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( activDesc = {miopenActivationTANH, 1, 1, 1}; } - ReluWeightOffsets WeiBuf(in_h, hy_h, nLayers, bi * 2, 1, wei_stride); - ReluReserveBufferOffsets RBuff(hy_h, hy_h, nLayers, total_batches); + ReluWeightOffsets WeiBuf(input_size, hidden_size, nLayers, biasMode, bi, wei_stride); + ReluReserveBufferOffsets RBuff(hidden_size, + hidden_size, + nLayers, + total_batches, + max_batch, + dirMode == miopenRNNbidirection); - auto get_HxBuff_offset = [&](int layer_id) { - return layer_id * (static_cast(hy_h) * hy_n); + auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int reverse) { + return (static_cast(hidden_size) * max_batch) * (bi * layer_id + reverse); }; - auto propagate_input = - [*this, &RBuff, out_vec_size, &handle, &rnn_data_type, workSpace, dy, &WeiBuf, bi, w]( - int nLayers, int layer) { + auto propagate_output = + [&RBuff, out_vec_size, &handle, &rnn_data_type, workSpace, dy, &WeiBuf, w](int nLayers, + int layer) { // Propagate output // if(layer == nLayers - 1) { - const std::vector y_copy_size{1, - static_cast(RBuff.batches_per_layer), - static_cast(out_vec_size)}; + const std::vector y_src_size{1, + static_cast(RBuff.batches_per_layer), + static_cast(out_vec_size)}; + + const std::vector y_dst_size{1, + static_cast(RBuff.batches_per_layer), + static_cast(RBuff.gemm_write_size())}; const std::vector y_dst_stride{ - RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; + RBuff.layer_stride(), static_cast(RBuff.gemm_write_size()), 1}; const std::vector y_src_stride{ static_cast(out_vec_size * RBuff.batches_per_layer), static_cast(out_vec_size), 1}; - auto y_src_desc = - miopen::TensorDescriptor(rnn_data_type, y_copy_size, y_src_stride); - auto y_dst_desc = - miopen::TensorDescriptor(rnn_data_type, y_copy_size, y_dst_stride); + auto y_src_desc = miopen::TensorDescriptor(rnn_data_type, y_src_size, y_src_stride); + auto y_dst_desc = miopen::TensorDescriptor(rnn_data_type, y_dst_size, y_dst_stride); CopyTensor( handle, y_src_desc, dy, y_dst_desc, workSpace, 0, RBuff.layer_offset(layer)); @@ -4950,11 +4967,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( false, false, RBuff.batches_per_layer, - RBuff.h_vec_size * bi, - RBuff.h_vec_size * bi, - RBuff.h_vec_size, - RBuff.h_vec_size * bi, - RBuff.h_vec_size, + RBuff.gemm_write_size(), + RBuff.gemm_write_size(), + RBuff.gemm_write_size(), + RBuff.gemm_write_size(), + RBuff.gemm_write_size(), 1, // batch count 0, // Stride A 0, // Stride B @@ -4989,92 +5006,103 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( }; auto propagate_hidden_output = - [*this, &RBuff, &handle, in_n, &rnn_data_type, dhy, workSpace, &get_HxBuff_offset]( - int layer, int accumulated_batches, int ti) { - if(dhy != nullptr) - { - float alpha0 = 1; - float alpha1 = 1; - float beta_t = 0; + [&RBuff, &handle, batches, &rnn_data_type, dhy, workSpace, &get_HxBuff_offset]( + int layer, int 
accumulated_batches, int cur_time, int reverse) { + if(dhy == nullptr) + return; - std::vector hx_stride{in_n.at(0) * RBuff.h_vec_size, RBuff.h_vec_size, 1}; - std::vector reserve_stride{ - RBuff.batches_per_layer * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + float alpha0 = 1; + float alpha1 = 1; + float beta_t = 0; - std::vector hx_size{1, in_n.at(ti), RBuff.h_vec_size}; - std::vector reserve_size{1, in_n.at(ti), RBuff.h_vec_size}; + std::vector hx_stride{batches.at(0) * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + std::vector reserve_stride{(int)RBuff.layer_stride(), RBuff.gemm_write_size(), 1}; - auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); - auto workspace_desc = - miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); + std::vector hx_size{1, batches.at(cur_time), RBuff.h_vec_size}; + std::vector reserve_size{1, batches.at(cur_time), RBuff.h_vec_size}; - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hx_desc, - dhy, - &alpha1, - workspace_desc, - workSpace, - &beta_t, - workspace_desc, - workSpace, - get_HxBuff_offset(layer), - RBuff.gemm_write_offset(layer, accumulated_batches), - RBuff.gemm_write_offset(layer, accumulated_batches)); - } + auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); + auto workspace_desc = + miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hx_desc, + dhy, + &alpha1, + workspace_desc, + workSpace, + &beta_t, + workspace_desc, + workSpace, + get_HxBuff_offset(layer, reverse), + RBuff.gemm_write_offset(layer, accumulated_batches, reverse), + RBuff.gemm_write_offset(layer, accumulated_batches, reverse)); }; - auto propagate_hidden_prev = [*this, - &RBuff, + auto propagate_hidden_prev = [&RBuff, &handle, - in_n, + batches, &rnn_data_type, dhy, workSpace, &get_HxBuff_offset, WeiBuf, - w](int layer, int accumulated_batches, int ti) { - if(dhy == nullptr || in_n.at(ti) <= in_n.at(ti + 1)) - return; + w](int layer, + int accumulated_batches, + int cur_time, + int use_time, + int pre_batch, + int reverse) { + if(reverse == 0 && dhy != nullptr && batches.at(cur_time) > batches.at(use_time)) + { + std::vector hx_stride{batches.at(0) * RBuff.h_vec_size, RBuff.h_vec_size, 1}; - std::vector hx_stride{in_n.at(0) * RBuff.h_vec_size, RBuff.h_vec_size, 1}; - std::vector reserve_stride{ - RBuff.batches_per_layer * RBuff.h_vec_size, RBuff.h_vec_size, 1}; - std::vector hx_size{1, in_n.at(ti) - in_n.at(ti + 1), RBuff.h_vec_size}; - std::vector reserve_size{1, in_n.at(ti) - in_n.at(ti + 1), RBuff.h_vec_size}; + std::vector reserve_stride{(int)RBuff.layer_stride(), RBuff.gemm_write_size(), 1}; - float alpha0 = 1; - float alpha1 = 1; - float beta_t = 0; + std::vector hx_size{ + 1, batches.at(cur_time) - batches.at(use_time), RBuff.h_vec_size}; - auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); - auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); + std::vector reserve_size{ + 1, batches.at(cur_time) - batches.at(use_time), RBuff.h_vec_size}; - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hx_desc, - dhy, - &alpha1, - reserve_desc, - workSpace, - &beta_t, - reserve_desc, - workSpace, - get_HxBuff_offset(layer) + in_n.at(ti + 1) * RBuff.h_vec_size, - RBuff.gemm_write_offset(layer, accumulated_batches + in_n.at(ti + 1)), - RBuff.gemm_write_offset(layer, accumulated_batches + in_n.at(ti + 1))); + float alpha0 = 1; + float alpha1 = 1; + float beta_t = 0; + + auto hx_desc = 
miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); + auto reserve_desc = + miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hx_desc, + dhy, + &alpha1, + reserve_desc, + workSpace, + &beta_t, + reserve_desc, + workSpace, + (get_HxBuff_offset(layer, reverse) + batches.at(use_time) * RBuff.h_vec_size), + RBuff.gemm_write_offset(layer, accumulated_batches + batches.at(use_time)), + RBuff.gemm_write_offset(layer, accumulated_batches + batches.at(use_time))); + } + + if(batches.at(use_time) <= 0) + return; miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, false, false, - in_n.at(ti + 1), + batches.at(use_time), RBuff.h_vec_size, RBuff.h_vec_size, - RBuff.gemm_write_stride(), - RBuff.gemm_write_stride(), - RBuff.gemm_write_stride(), + RBuff.gemm_write_size(), + RBuff.h_vec_size, + RBuff.gemm_write_size(), 1, // batch count 0, // Stride A 0, // Stride B @@ -5088,11 +5116,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( CallGemm(handle, gemm_desc, workSpace, - RBuff.gemm_write_offset(layer, accumulated_batches + in_n.at(ti)), + RBuff.gemm_write_offset(layer, pre_batch, reverse), w, - WeiBuf.hidden_weight_offset(layer), + WeiBuf.hidden_weight_offset(layer, reverse), workSpace, - RBuff.gemm_write_offset(layer, accumulated_batches), + RBuff.gemm_write_offset(layer, accumulated_batches, reverse), GemmBackend_t::miopengemm); if(gemm_status != miopenStatusSuccess) @@ -5112,87 +5140,125 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &RBuff, &handle, seqLen, - in_n, + batches, &rnn_data_type, workSpace, reserveSpace, &activDesc, propagate_hidden_output, propagate_hidden_prev](int nLayers, int layer) { - int accumulated_batches = RBuff.batches_per_layer; - std::vector hx_stride{in_n.at(0) * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + int accumulated_batches = RBuff.batches_per_layer; + int reverse_accumulated_batches = 0; + + std::vector hx_stride{ + batches.at(0) * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; std::vector reserve_stride{ - RBuff.batches_per_layer * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + RBuff.batches_per_layer * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; for(int ti = seqLen - 1; ti >= 0; ti--) { - accumulated_batches -= in_n.at(ti); + accumulated_batches -= batches.at(ti); if(ti == seqLen - 1) { - propagate_hidden_output(layer, accumulated_batches, ti); + propagate_hidden_output(layer, accumulated_batches, ti, rnn_direction::Forward); } else { - propagate_hidden_prev(layer, accumulated_batches, ti); + propagate_hidden_prev(layer, + accumulated_batches, + ti, + ti + 1, + accumulated_batches + batches.at(ti), + rnn_direction::Forward); } - std::vector reserve_size{1, in_n.at(ti), RBuff.h_vec_size}; + std::vector reserve_size{1, batches.at(ti), RBuff.h_vec_size}; auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); float alpha = 1, beta = 0; - activDesc.Backward(handle, - &alpha, - reserve_desc, - reserveSpace, - reserve_desc, - workSpace, - reserve_desc, - reserveSpace, - &beta, - reserve_desc, - workSpace, - RBuff.gemm_write_offset(layer, accumulated_batches) + - static_cast(nLayers) * RBuff.batches_per_layer * - RBuff.gemm_write_stride(), - RBuff.gemm_write_offset(layer, accumulated_batches), - RBuff.gemm_write_offset(layer, accumulated_batches), - RBuff.gemm_write_offset(layer, accumulated_batches)); + activDesc.Backward( + handle, + &alpha, + reserve_desc, + reserveSpace, + reserve_desc, + workSpace, + reserve_desc, + 
reserveSpace, + &beta, + reserve_desc, + workSpace, + RBuff.ht_offset(layer, accumulated_batches, rnn_direction::Forward), + RBuff.gemm_write_offset(layer, accumulated_batches, rnn_direction::Forward), + RBuff.gemm_write_offset(layer, accumulated_batches, rnn_direction::Forward), + RBuff.gemm_write_offset(layer, accumulated_batches, rnn_direction::Forward)); + + if(dirMode == 0u) + continue; + + // Propagate Backward direction + // + if(ti == seqLen - 1) + { + propagate_hidden_output( + layer, reverse_accumulated_batches, seqLen - 1 - ti, rnn_direction::Backward); + } + else + { + propagate_hidden_prev(layer, + reverse_accumulated_batches, + seqLen - 1 - ti, + seqLen - 1 - ti, + reverse_accumulated_batches - batches.at(seqLen - 2 - ti), + rnn_direction::Backward); + } + + std::vector reserve_backward_size{ + 1, batches.at(seqLen - 1 - ti), RBuff.h_vec_size}; + auto reserve_backward_desc = + miopen::TensorDescriptor(rnn_data_type, reserve_backward_size, reserve_stride); + activDesc.Backward( + handle, + &alpha, + reserve_backward_desc, + reserveSpace, + reserve_backward_desc, + workSpace, + reserve_backward_desc, + reserveSpace, + &beta, + reserve_backward_desc, + workSpace, + RBuff.ht_offset(layer, reverse_accumulated_batches, rnn_direction::Backward), + RBuff.gemm_write_offset( + layer, reverse_accumulated_batches, rnn_direction::Backward), + RBuff.gemm_write_offset( + layer, reverse_accumulated_batches, rnn_direction::Backward), + RBuff.gemm_write_offset( + layer, reverse_accumulated_batches, rnn_direction::Backward)); + + reverse_accumulated_batches += batches.at(seqLen - 1 - ti); } }; - auto propagate_dhx = [*this, - seqLen, - &RBuff, - &WeiBuf, - &bi, - &rnn_data_type, - in_n, - &handle, - w, - dhx, - hy_n, - in_h, - workSpace](int layer) { - for(int ti = 0; ti < seqLen; ti++) - { - int use_time = ti > 0 ? ti - 1 : 0; - int use_batch = ti > 0 ? 
in_n.at(use_time) : 0; - - if(in_n.at(ti) <= use_batch) + auto propagate_dhx_prev = + [&RBuff, &WeiBuf, &rnn_data_type, batches, &handle, w, dhx, &get_HxBuff_offset, workSpace]( + int layer, int cur_time, int cur_batch, int use_batch, int reverse) { + if(batches.at(cur_time) <= use_batch) return; miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, false, false, - (in_n.at(ti) - use_batch), + (batches.at(cur_time) - use_batch), + RBuff.h_vec_size, + RBuff.h_vec_size, + RBuff.gemm_write_size(), RBuff.h_vec_size, RBuff.h_vec_size, - RBuff.gemm_write_stride(), - RBuff.gemm_write_stride(), - RBuff.gemm_write_stride(), 1, // batch count 0, // Stride A 0, // Stride B @@ -5202,17 +5268,17 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( rnn_data_type, false}; - int hx_shift = layer * hy_n * RBuff.h_vec_size + use_batch * RBuff.h_vec_size; - - miopenStatus_t gemm_status = CallGemm(handle, - gemm_desc, - workSpace, - RBuff.gemm_write_offset(layer, 2 * use_batch), - w, - WeiBuf.hidden_weight_offset(layer), - dhx, - hx_shift, - GemmBackend_t::miopengemm); + int hx_shift = get_HxBuff_offset(layer, reverse) + use_batch * RBuff.h_vec_size; + miopenStatus_t gemm_status = + CallGemm(handle, + gemm_desc, + workSpace, + RBuff.gemm_write_offset(layer, cur_batch + use_batch, reverse), + w, + WeiBuf.hidden_weight_offset(layer, reverse), + dhx, + hx_shift, + GemmBackend_t::miopengemm); if(gemm_status != miopenStatusSuccess) { @@ -5225,24 +5291,48 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( MIOPEN_LOG_E("GEMM failed"); } } + }; + + auto propagate_dhx = [*this, seqLen, &RBuff, batches, &propagate_dhx_prev](int layer) { + int accumulated_batches = 0; + int reverse_accumulated_batches = RBuff.batches_per_layer; + + for(int ti = 0; ti < seqLen; ti++) + { + int use_time = ti > 0 ? ti - 1 : 0; + int use_batch = ti > 0 ? batches.at(use_time) : 0; + propagate_dhx_prev(layer, ti, accumulated_batches, use_batch, rnn_direction::Forward); + + if(dirMode != 0u) + { + reverse_accumulated_batches -= batches.at(seqLen - 1 - ti); + int reverse_use_time = ti > 0 ? seqLen - ti : 0; + int reverse_use_batch = ti > 0 ? batches.at(reverse_use_time) : 0; + propagate_dhx_prev(layer, + seqLen - 1 - ti, + reverse_accumulated_batches, + reverse_use_batch, + rnn_direction::Backward); + } + accumulated_batches += batches.at(ti); } }; for(int li = static_cast(nLayers) - 1; li >= 0; li--) { - propagate_input(nLayers, li); + propagate_output(nLayers, li); propagate_hidden(nLayers, li); propagate_dhx(li); } - int in_stride = in_h; - int hy_stride = hy_h * bi * static_cast(workspaceScale); + int in_stride = input_size; + int hy_stride = hidden_size * bi * static_cast(workspaceScale); miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, false, false, total_batches, - in_h, + input_size, RBuff.h_vec_size * bi, hy_stride, in_stride, @@ -5510,8 +5600,9 @@ void RNNDescriptor::RNNBackwardDataPackedTensors( y_size[2] = out_h; sp_size[1] = batch_n; sp_size[2] = hy_h * bi; - y_desc = miopen::TensorDescriptor(rnn_data_type, y_size, y_stride); - sp_desc = miopen::TensorDescriptor(rnn_data_type, sp_size, sp_stride); + + y_desc = miopen::TensorDescriptor(rnn_data_type, y_size, y_stride); + sp_desc = miopen::TensorDescriptor(rnn_data_type, sp_size, sp_stride); CopyTensor(handle, y_desc, dy, sp_desc, workSpace, 0, hid_shift + dhd_off); // Update time @@ -5611,6 +5702,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensors( cur_time = ri == 0 ? ti : seqLen - 1 - ti; cur_batch = ri == 0 ? 
bacc : baccbi; offset = hid_shift + cur_batch * hy_stride; + if(ti < seqLen - 1) { use_time = ri == 0 ? ti + 1 : seqLen - 1 - ti; @@ -5684,10 +5776,10 @@ void RNNDescriptor::RNNBackwardDataPackedTensors( &beta_t, sp_desc, workSpace, - hx_shift + ri * hy_n * hy_h + in_n.at(use_time) * hy_h, - offset + dhd_off + static_cast(ri) * hy_h + + hx_shift + in_n.at(use_time) * hy_h, + offset + dhd_off + static_cast(in_n.at(use_time)) * hy_stride, - offset + dhd_off + static_cast(ri) * hy_h + + offset + dhd_off + static_cast(in_n.at(use_time)) * hy_stride); // Update time profileRNNkernels(handle, 1, ctime); @@ -5747,6 +5839,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensors( // Update time profileRNNkernels(handle, 1, ctime); } + miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, false, false, @@ -6675,7 +6768,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensors( dhx, hx_shift + ri * hy_n * hy_h + use_batch * hy_h, GemmBackend_t::miopengemm); - + if(gemm_status != miopenStatusSuccess) { if(gemm_status == miopenStatusNotImplemented) @@ -7043,9 +7136,8 @@ void RNNDescriptor::RNNBackwardWeightsPackedTensors( MIOPEN_THROW(miopenStatusBadParm, "Output size doesn't match hidden state size!"); } - int in_stride = in_h; - int hy_stride = hy_h * bi * static_cast(workspaceScale); - std::cout << "nHiddenTensorsPerLayer " << nHiddenTensorsPerLayer << "\n"; + int in_stride = in_h; + int hy_stride = hy_h * bi * static_cast(workspaceScale); int wei_stride = hy_h * bi * static_cast(nHiddenTensorsPerLayer); int uni_stride = hy_h; int bi_stride = hy_h * bi; From 3c889def133bf07739771daa5abe73954961f780 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Sat, 16 Sep 2023 00:04:47 +0300 Subject: [PATCH 05/27] Unified relu and gru offset interfaces --- src/include/miopen/rnn_util.hpp | 21 ++++++--------- src/ocl/rnnocl.cpp | 47 +++++++++++++++++---------------- 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index 148373d917..e4ad698568 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -134,15 +134,15 @@ struct GRUOffsets { } - int r_offset() const { return save_point::R * hidden_size; } + int r_offset() const { return save_point::R * gemm_write_size(); } - int z_offset() const { return save_point::Z * hidden_size; } + int z_offset() const { return save_point::Z * gemm_write_size(); } - int c_offset() const { return save_point::С * hidden_size; } + int c_offset() const { return save_point::С * gemm_write_size(); } - int hidden_offset() const { return save_point::Ht * hidden_size; } + int hidden_offset() const { return save_point::Ht * gemm_write_size(); } - size_t batch_offset(int layer_id, int batch_num) const + size_t gemm_write_offset(int layer_id, int batch_num) const { return layer_offset(layer_id) + batch_num * gemm_write_stride(); } @@ -151,7 +151,7 @@ struct GRUOffsets int gemm_write_size() const { return hidden_size; } - int gemm_write_stride() const { return save_point::Count * hidden_size; } + int gemm_write_stride() const { return save_point::Count * gemm_write_size(); } int layer_offset(int layer) const { return layer * layer_stride(); } @@ -392,11 +392,6 @@ struct ReluReserveBufferOffsets auto gemm_write_stride() const { return strides.batch; } - size_t gemm_write_relative_offset(int batch_id) const - { - return static_cast(gemm_write_stride()) * batch_id; - } - size_t gemm_write_offset(int layer_id, int batch_id, int reverse = 0) const { return layer_offset(layer_id) + 
static_cast(gemm_write_stride()) * batch_id + @@ -405,11 +400,11 @@ struct ReluReserveBufferOffsets size_t ht_offset(int layer_id, int batch_id, int reverse = 0) const { - return strides.table + layer_offset(layer_id) + gemm_write_relative_offset(batch_id) + + return strides.table + gemm_write_offset(layer_id, batch_id) + reverse * h_vec_size; } - size_t ht_offset(int layer_id) const { return strides.table + layer_offset(layer_id); } + size_t ht_offset(int layer_id) const { return strides.table + layer_offset(layer_id);} }; struct LSTMReserveBufferHelper diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 2488c244ec..5100b230ca 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -131,7 +131,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, const auto output_offset = RBuff.layer_offset(layer_id); const auto input_offset = - layer_id > 0 ? RBuff.batch_offset(layer_id - 1, 0) + RBuff.hidden_offset() : 0; + layer_id > 0 ? RBuff.gemm_write_offset(layer_id - 1, 0) + RBuff.hidden_offset() : 0; const auto input_ptr = layer_id > 0 ? reserveSpace : x; @@ -308,14 +308,14 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, xDesc.GetType(), false}; - const auto ht_offset = - (cur_time == 0) - ? get_HxBuff_offset(layer) - : RBuff.batch_offset(layer, bacc_per_time[cur_time - 1]) + RBuff.hidden_offset(); + const auto ht_offset = (cur_time == 0) + ? get_HxBuff_offset(layer) + : RBuff.gemm_write_offset(layer, bacc_per_time[cur_time - 1]) + + RBuff.hidden_offset(); const auto ht_ptr = cur_time > 0 ? reserveSpace : hx; - const auto result_offset = RBuff.batch_offset(layer, bacc_per_time[cur_time]); + const auto result_offset = RBuff.gemm_write_offset(layer, bacc_per_time[cur_time]); const miopenStatus_t gemm_status = CallGemm(handle, gemm_desc_hx, @@ -351,7 +351,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - auto r_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]); + auto r_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]); auto r_act_offset = r_offset + RBuff.activated_offset(); sigDesc.Forward(handle, @@ -374,11 +374,12 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, auto call_gru_compute_c = [*this, &RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, hidden_size]( int layer_id, int time_id) { - auto с_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset(); + auto с_offset = + RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset(); auto hidden_offset = - RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.hidden_offset(); + RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.hidden_offset(); auto hidden_act_offset = hidden_offset + RBuff.activated_offset(); - auto r_act_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + + auto r_act_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.r_offset() + RBuff.activated_offset(); const std::vector tensor_size{ @@ -449,10 +450,12 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - auto c_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset(); + auto c_offset = + 
RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset(); auto c_act_offset = c_offset + RBuff.activated_offset(); - auto z_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.z_offset(); + auto z_offset = + RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.z_offset(); auto z_act_offset = z_offset + RBuff.activated_offset(); tanhDesc.Forward(handle, @@ -492,11 +495,11 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); auto hidden_offset = - RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.hidden_offset(); - auto zact_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.z_offset() + - RBuff.activated_offset(); - auto cact_offset = RBuff.batch_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset() + - RBuff.activated_offset(); + RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.hidden_offset(); + auto zact_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + + RBuff.z_offset() + RBuff.activated_offset(); + auto cact_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + + RBuff.c_offset() + RBuff.activated_offset(); const std::vector hidden_tensor_size{ 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; @@ -577,7 +580,8 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, else { auto hidden_prev_offset = - RBuff.batch_offset(layer_id, bacc_per_time[time_id - 1]) + RBuff.hidden_offset(); + RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id - 1]) + + RBuff.hidden_offset(); alpha0 = 1; alpha1 = 1; beta = 1; @@ -633,7 +637,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, auto hcy_batch_offset = batch_id_relative * hidden_size; auto src_batch_offset = - RBuff.batch_offset(layer_id, batch_id_abs) + RBuff.hidden_offset(); + RBuff.gemm_write_offset(layer_id, batch_id_abs) + RBuff.hidden_offset(); const std::vector hcy_copy_size{ 1, static_cast(copy_batch), static_cast(hidden_size)}; @@ -1015,9 +1019,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto hcy_batch_offset = batch_id_relative * hidden_size; - auto src_batch_offset = - RBuff.ht_offset(layer_id) + RBuff.gemm_write_relative_offset(batch_id_abs); - const std::vector hcy_copy_size{ 1, static_cast(copy_batch), static_cast(hidden_size)}; @@ -1031,7 +1032,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, reserveSpace, dst_desc, hy, - src_batch_offset, + RBuff.ht_offset(layer_id, batch_id_abs), hcy_layer_offset + hcy_batch_offset); } } From cf967132ce1bb986f90a04373d4c1abc87c7bf63 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Wed, 27 Sep 2023 15:28:50 +0300 Subject: [PATCH 06/27] Lstm minor refactoring --- src/include/miopen/rnn_util.hpp | 519 +++++++++++++++----------------- src/ocl/rnnocl.cpp | 477 ++++++++++++++--------------- 2 files changed, 455 insertions(+), 541 deletions(-) diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index e4ad698568..4dfdce971c 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -126,69 +126,54 @@ void LSTMBackwardHiddenStateUpdate(const Handle& handle, std::size_t dhidden_offset, std::size_t f_offset_pre); -struct GRUOffsets +struct RNNWeightOffsets { -public: - GRUOffsets(int num_layers, int hidden_size, int total_batch_size) - : num_layers(num_layers), hidden_size(hidden_size), batches_per_layer(total_batch_size) - { - } - - int r_offset() 
const { return save_point::R * gemm_write_size(); } - - int z_offset() const { return save_point::Z * gemm_write_size(); } - - int c_offset() const { return save_point::С * gemm_write_size(); } - - int hidden_offset() const { return save_point::Ht * gemm_write_size(); } - - size_t gemm_write_offset(int layer_id, int batch_num) const - { - return layer_offset(layer_id) + batch_num * gemm_write_stride(); - } - - int activated_offset() const { return layer_stride() * num_layers; } - - int gemm_write_size() const { return hidden_size; } - - int gemm_write_stride() const { return save_point::Count * gemm_write_size(); } - - int layer_offset(int layer) const { return layer * layer_stride(); } - int batches_per_layer; - - size_t layer_stride() const { return gemm_write_stride() * batches_per_layer; } - - size_t network_stride() { return layer_stride() * num_layers; } +public: + int input_offset(int layer) const; + int hidden_offset(int layer) const; + int bias_off(); + int bias_off(int layer) const; private: - int num_layers; - int hidden_size; - - enum save_point - { - Z = 0, - R = 1, - С = 2, - Ht = 3, - Count = 4 - }; + int first_layer_offset() const; }; -struct GruWeightOffsets +struct GruWeightOffsets : public RNNWeightOffsets { - GruWeightOffsets(int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode) + GruWeightOffsets(int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_count) : weight_stride(matrixes::Count * hidden_vec_sz), in_vec_sz(input_vector_sz), h_vec_sz(hidden_vec_sz), - num_layers(layers_cnt) + num_layers(layers_cnt), + bias_count(bias_count) + { + } + + int input_offset(int layer) { + return layer == 0 ? 0 : first_layer_offset() + h_vec_sz * 2 * weight_stride * (layer - 1); } + int hidden_offset(int layer) + { + return layer == 0 ? input_offset(layer) + in_vec_sz * weight_stride + : input_offset(layer) + h_vec_sz * weight_stride; + } + + size_t bias_stride() { return matrixes::Count * h_vec_sz; } + int bias_off() + { + return (in_vec_sz + h_vec_sz + bias_count * h_vec_sz * (num_layers - 1)) * weight_stride; + } + int bias_off(int layer_id) { return bias_off() + layer_id * bias_count * weight_stride; } int weight_stride; + +private: const int in_vec_sz, h_vec_sz; const int num_layers; - + const int bi_scale = 0; + const int bias_count = 0; enum matrixes { Z = 0, @@ -196,144 +181,186 @@ struct GruWeightOffsets C = 2, Count = 3 }; + int first_layer_offset() { return (in_vec_sz + h_vec_sz) * weight_stride; } +}; +struct ReluWeightOffsets : public RNNWeightOffsets +{ public: - int input_offset(int layer) + ReluWeightOffsets(int input_vector_sz, + int hidden_vec_sz, + int layers_cnt, + int bias_mode, + int bi_scale, + int wei_stride) + : weight_stride(wei_stride), + in_vec_sz(input_vector_sz), + h_vec_sz(hidden_vec_sz), + num_layers(layers_cnt), + bi_scale(bi_scale), + bias_count(bias_mode) + { + } + + int input_weight_offset(int layer) const { return layer == 0 ? 0 - : (in_vec_sz + h_vec_sz) * weight_stride + - 2 * h_vec_sz * weight_stride * (layer - 1); + : first_layer_offset() + + (h_vec_sz + h_vec_sz * bi_scale) * weight_stride * (layer - 1); } - int hidden_offset(int layer) + int hidden_weight_offset(int layer, int reverse = 0) const { - return layer == 0 ? input_offset(layer) + in_vec_sz * weight_stride - : input_offset(layer) + h_vec_sz * weight_stride; + return layer == 0 ? 
input_weight_offset(layer) + in_vec_sz * weight_stride + + reverse * h_vec_sz * h_vec_sz + : input_weight_offset(layer) + bi_scale * h_vec_sz * weight_stride + + reverse * h_vec_sz * h_vec_sz; } - int bias_stride() { return matrixes::Count * h_vec_sz; } + size_t bias_stride() { return h_vec_sz; } + int bias_off() { - return (in_vec_sz + h_vec_sz + 2 * h_vec_sz * (num_layers - 1)) * weight_stride; + return first_layer_offset() + + (h_vec_sz * bi_scale + h_vec_sz) * (num_layers - 1) * weight_stride; } - int bias_off(int layer_id) { return bias_off() + layer_id * weight_stride; } + + int bias_off(int layer_id) { return bias_off() + bias_count * layer_id * weight_stride; } + int weight_stride; + +private: + const int in_vec_sz, h_vec_sz; + const int num_layers; + const int bi_scale = 1; + const int bias_count = 0; + + int first_layer_offset() const { return (in_vec_sz + h_vec_sz) * weight_stride; } }; -struct ReluWeightOffsets +struct LSTMWeightsBufferHelper : public RNNWeightOffsets { -private: - auto hidden_xinput_size(int hidden_sz, int bidirect_mode) const +public: + const int first_layer_offset() const { return (in_vec_sz + h_vec_sz) * weight_stride; } + +public: + LSTMWeightsBufferHelper( + int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode, int bi_scale) + : weight_stride(hidden_vec_sz * gates_cnt), + in_vec_sz(input_vector_sz), + h_vec_sz(hidden_vec_sz), + num_layers(layers_cnt), + bi_scale(bi_scale), + bias_cnt(bias_mode) { - if(bidirect_mode == 0) - return hidden_sz; - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); } - auto matrix_lin_layer_size(int input_vector_sz, int hidden_vec_sz) const + int input_weight_offset(int layer) const { - return (input_vector_sz + hidden_vec_sz) * hidden_vec_sz; + return layer == 0 ? 0 + : first_layer_offset() + + (h_vec_sz + h_vec_sz * bi_scale) * weight_stride * (layer - 1); } - size_t bias_start_offset(int input_vector_sz, - int hidden_vec_sz, - int layers_cnt, - int bidirect_mode) const + int hidden_weight_offset(int layer, int reverse = 0) const { - return matrix_lin_layer_size(input_vector_sz, hidden_vec_sz) + - static_cast(hidden_vec_sz + hidden_xinput_size(hidden_vec_sz, 0)) * - hidden_vec_sz * static_cast(layers_cnt - 1); + return layer == 0 ? 
input_weight_offset(layer) + in_vec_sz * weight_stride + + reverse * h_vec_sz * h_vec_sz + : input_weight_offset(layer) + bi_scale * h_vec_sz * weight_stride + + reverse * h_vec_sz * h_vec_sz; } -public: - ReluWeightOffsets(int input_vector_sz, - int hidden_vec_sz, - int layers_cnt, - int bias_mode, - int bidirectional, - int wei_stride = 0) - : in_vec_sz(input_vector_sz), - h_vec_sz(hidden_vec_sz), - x_in_vec_sz(hidden_xinput_size(hidden_vec_sz, 0)), - bias_cnt(bias_mode), - matrix_normal_start_off(matrix_lin_layer_size(input_vector_sz, hidden_vec_sz)), - bias_start_off( - bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, bidirectional)), - bidirectional(bidirectional), - wei_stride(wei_stride) + size_t bias_stride() { return bias_vector_mul_gate(); } + + int bias_off() { + return first_layer_offset() + + (h_vec_sz * bi_scale + h_vec_sz) * (num_layers - 1) * weight_stride; } + int bias_off(int layer_id) { return bias_off() + layer_id * bias_cnt * weight_stride; } + + size_t bias_vector_mul_gate() const { return h_vec_sz * gates_cnt; } + + const int weight_stride; + private: + static const int gates_cnt = 4; const int in_vec_sz; - const int bidirectional; - const int x_in_vec_sz; // for bidirect TODO + const int h_vec_sz; + const int num_layers; + const int bi_scale = 1; + const int bias_cnt = 0; +}; - const int bias_cnt; // 0 - no bias; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec +struct RNNOffsets +{ + size_t layer_offset(int layer_id) const; - const size_t matrix_normal_start_off; - const size_t bias_start_off; + size_t layer_stride() const; - auto get_input_matrix_size(int layer_id) const - { - return (layer_id > 0 ? x_in_vec_sz : in_vec_sz) * h_vec_sz; - } + int gemm_write_size() const; - auto get_hidden_matrix_size() const { return h_vec_sz * h_vec_sz; } + size_t gemm_write_stride() const; - auto get_matrix_layer_size(int layer_id) const - { - return get_input_matrix_size(layer_id) + get_hidden_matrix_size(); - } + size_t gemm_write_offset(int layer_id, int batch_id = 0, int reverse = 0) const; - int bias_vector_size() const { return h_vec_sz; } + size_t hidden_offset(int layer_id, int batch_id = 0, int reverse = 0) const; +}; - size_t bias_relative_off(int layer_id, int bias_id) const +struct GRUOffsets : public RNNOffsets +{ +public: + GRUOffsets(int in_vec_size, int hidden_size, int num_layers, int total_batch_size) + : hidden_size(hidden_size), + batches_per_layer(total_batch_size), + in_vec_size(in_vec_size), + num_layers(num_layers) { - return static_cast(layer_id * bias_cnt + bias_id) * h_vec_sz; } -public: - const int h_vec_sz; - const int wei_stride; + size_t layer_offset(int layer_id) const { return layer_id * layer_stride(); } - size_t input_weight_offset(int layer_id) const - { - return hidden_weight_offset(layer_id, 0) + h_vec_sz * wei_stride; - } + size_t layer_stride() const { return gemm_write_stride() * batches_per_layer; } + + int gemm_write_size() const { return hidden_size; } + + size_t gemm_write_stride() const { return save_point::Count * gemm_write_size(); } - size_t hidden_weight_offset(int layer_id, int reverse) const + size_t gemm_write_offset(int layer_id, int batch_num, int reverse = 0) const { - return in_vec_sz * wei_stride + - layer_id * (bidirectional * h_vec_sz + h_vec_sz) * wei_stride + - reverse * h_vec_sz * h_vec_sz; + return layer_offset(layer_id) + batch_num * gemm_write_stride(); } - size_t input_offset(int layer_id) const - { - if(layer_id > 0) - return matrix_normal_start_off + - static_cast(layer_id - 1) * 
get_matrix_layer_size(layer_id); - else - return 0; - }; + size_t hidden_offset() const { return save_point::Ht * gemm_write_size(); } - size_t hidden_offset(int layer_id) const - { - if(layer_id > 0) - return input_offset(layer_id) + static_cast(h_vec_sz * x_in_vec_sz); - else - return input_offset(layer_id) + static_cast(h_vec_sz * in_vec_sz); - }; + const int hidden_size; + const int batches_per_layer; + const int in_vec_size; - int bias_stride() const { return bias_vector_size(); } + int r_offset() const { return save_point::R * gemm_write_size(); } + + int z_offset() const { return save_point::Z * gemm_write_size(); } + + int c_offset() const { return save_point::С * gemm_write_size(); } + + int activated_offset() const { return layer_stride() * num_layers; } - size_t bias_off(int layer_id, int bias_id) const + size_t network_stride() { return layer_stride() * num_layers; } + +private: + int num_layers; + + enum save_point { - return bias_start_off + bias_relative_off(layer_id, bias_id); - } + Z = 0, + R = 1, + С = 2, + Ht = 3, + Count = 4 + }; }; -struct ReluReserveBufferOffsets +struct ReluReserveBufferOffsets : public RNNOffsets { struct RBuffHelper { @@ -347,7 +374,7 @@ struct ReluReserveBufferOffsets int layers, int bidirect_mode = 0) const { - const auto element_st = bidirect_mode ? 2 : 1; + const auto element_st = 1; const auto save_point_st = element_st * save_point_sz; const auto batch_st = save_point_st; const auto layer_st = static_cast(batch_st) * batches_per_layer; @@ -357,239 +384,165 @@ struct ReluReserveBufferOffsets } public: - ReluReserveBufferOffsets(int hidden_vec_sz, - int save_point_sz, + ReluReserveBufferOffsets(int in_vec_size, + int hidden_vec_size, int layers_cnt, int batches_per_layer, int max_batch, bool bidirect_mode = 0) - : h_vec_size(hidden_vec_sz), - save_point_size(save_point_sz), - layers(layers_cnt), + : hidden_size(hidden_vec_size), batches_per_layer(batches_per_layer), + in_vec_size(in_vec_size), + save_point_size(bidirect_mode ? 
hidden_vec_size * 2 : hidden_vec_size), + layers(layers_cnt), max_batch(max_batch), strides( - Reserve_Buffer_strides(save_point_sz, batches_per_layer, layers_cnt, bidirect_mode)) + Reserve_Buffer_strides(save_point_size, batches_per_layer, layers_cnt, bidirect_mode)) { } - const int h_vec_size; - const int save_point_size; - - const int layers; - const int batches_per_layer; - const RBuffHelper strides; - const int max_batch; - size_t layer_offset(int layer_id) const { return static_cast(layer_id) * strides.layer; } - auto layer_stride() const { return strides.layer; } + size_t layer_stride() const { return strides.layer; } - auto gemm_write_size() const { return strides.save_point; } + int gemm_write_size() const { return strides.save_point; } - auto gemm_write_stride() const { return strides.batch; } + size_t gemm_write_stride() const { return strides.batch; } size_t gemm_write_offset(int layer_id, int batch_id, int reverse = 0) const { return layer_offset(layer_id) + static_cast(gemm_write_stride()) * batch_id + - reverse * h_vec_size; + reverse * hidden_size; } - size_t ht_offset(int layer_id, int batch_id, int reverse = 0) const + size_t hidden_offset(int layer_id, int batch_id = 0, int reverse = 0) const { - return strides.table + gemm_write_offset(layer_id, batch_id) + - reverse * h_vec_size; + return strides.table + gemm_write_offset(layer_id, batch_id) + reverse * hidden_size; } - size_t ht_offset(int layer_id) const { return strides.table + layer_offset(layer_id);} + const int hidden_size; + const int batches_per_layer; + const int in_vec_size; + + const int save_point_size; + const int layers; + const int max_batch; + const RBuffHelper strides; }; -struct LSTMReserveBufferHelper +struct LSTMReserveBufferHelper : public RNNOffsets { struct RBuffHelper { int element, save_point, batch; - size_t layer; + size_t layer, table; }; private: + static const int gates_cnt = 4; auto Reserve_Buffer_strides(int save_point_sz, int batches_per_layer, int save_points, + int layers_cnt, int bidirect_mode = 0) const { - const auto element_st = 1; + const auto element_st = bidirect_mode ? 2 : 1; + const auto save_point_st = element_st * save_point_sz; const auto batch_st = save_point_st * save_points; const auto layer_st = static_cast(batch_st) * batches_per_layer; + const auto table_st = layer_st * layers_cnt; + if(bidirect_mode == 0) - return RBuffHelper{element_st, save_point_st, batch_st, layer_st}; + return RBuffHelper{element_st, save_point_st, batch_st, layer_st, table_st}; + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); } public: enum save_point { - F = 1, - I = 0, - G = 2, - O = 3, - St = 4, - Ht = 5 + F = 1, + I = 0, + G = 2, + O = 3, + St = 4, + Ht = 5, + Count = 6 }; - LSTMReserveBufferHelper(int hidden_vec_sz, - int save_point_sz, + LSTMReserveBufferHelper(int hidden_vec_size, int layers_cnt, int batches_per_layer, - int save_points, - int gates_cnt) - : h_vec(hidden_vec_sz), - save_point_size(save_point_sz), + int in_vec_size, + int bidirect_mode = 0) + : hidden_size(hidden_vec_size), + batches_per_layer(batches_per_layer), + in_vec_size(in_vec_size), + save_point_size(bidirect_mode ? 
hidden_vec_size * 2 : hidden_vec_size), layers(layers_cnt), - batches(batches_per_layer), - save_points_cnt(save_points), - gates(gates_cnt), - strides(Reserve_Buffer_strides(save_point_sz, batches, save_points, 0)) + strides(Reserve_Buffer_strides( + save_point_size, batches_per_layer, save_point::Count, layers_cnt, 0)) { } - const int h_vec; - const int save_point_size; // for bidirect TODO - - const int layers; - const int batches; - const int save_points_cnt; - const int gates; - const RBuffHelper strides; - size_t layer_offset(int layer) const { return static_cast(layer) * strides.layer; } - auto layer_stride() const { return strides.layer; } + size_t layer_stride() const { return strides.layer; } - auto gemm_write_size() const { return h_vec * gates; } - auto gemm_write_stride() const { return strides.batch; } // save_point_size * save_points_cnt + int gemm_write_size() const { return save_point_size * gates_cnt; } + size_t gemm_write_stride() const { return strides.batch; } - size_t gemm_write_relative_offset(int batch_id) const + size_t gemm_write_offset(int layer, int batch, int reverse = 0) const { - return static_cast(gemm_write_stride()) * batch_id; + return layer_offset(layer) + static_cast(gemm_write_stride()) * batch; } - size_t gemm_write_offset(int layer, int batch_id, int reverse = 0) const + size_t hidden_offset(int layer, int batch, int reverse = 0) const { - return layer_offset(layer) + static_cast(gemm_write_stride()) * batch_id; + return gemm_write_offset(layer, batch) + save_point::Ht * save_point_size; } - auto ht_relative_offset() const { return save_point::Ht * save_point_size; } - - auto ct_relative_offset() const { return save_point::St * save_point_size; } - - auto get_gate_relative_offset(int gate_id) const { return gate_id * save_point_size; } + const int hidden_size; + const int batches_per_layer; + const int in_vec_size; - size_t ht_offset(int layer_id, int batch_id) const + auto f_offset(int layer, int batch_num) const { - return layer_offset(layer_id) + gemm_write_relative_offset(batch_id) + ht_relative_offset(); + return gemm_write_offset(layer, batch_num) + save_point::F * save_point_size; } - size_t extra_save_point_offset(int layer_id, int batch_id) const + auto i_offset(int layer, int batch_num) const { - return (static_cast(batches) * layers * gemm_write_stride()) // all data offset - + (static_cast(batches) * layer_id) * h_vec + - static_cast(batch_id * h_vec); + return gemm_write_offset(layer, batch_num) + save_point::I * save_point_size; } -}; -struct LSTMWeightsBufferHelper -{ -private: - auto hidden_xinput_size(int hidden_sz, int bidirect_mode) const + auto g_offset(int layer, int batch_num) const { - if(bidirect_mode == 0) - return hidden_sz; - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + return gemm_write_offset(layer, batch_num) + save_point::G * save_point_size; } - auto matrix_lin_layer_size(int input_vector_sz, int hidden_vec_sz, int gates) const + auto o_offset(int layer, int batch_num) const { - return (input_vector_sz + hidden_vec_sz) * hidden_vec_sz * gates; + return gemm_write_offset(layer, batch_num) + save_point::O * save_point_size; } - size_t bias_start_offset( - int input_vector_sz, int hidden_vec_sz, int layers_cnt, int gates, int bidirect_mode) const - { - if(bidirect_mode == 0) - return matrix_lin_layer_size(input_vector_sz, hidden_vec_sz, gates) + - static_cast(hidden_vec_sz + hidden_xinput_size(hidden_vec_sz, 0)) * - hidden_vec_sz * static_cast(layers_cnt - 1) * gates; - - 
MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); - } - -public: - LSTMWeightsBufferHelper( - int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode, int gates) - : in_vec(input_vector_sz), - h_vec(hidden_vec_sz), - x_in_vec(hidden_xinput_size(hidden_vec_sz, 0)), - layers(layers_cnt), - gates_cnt(gates), - bias_cnt(bias_mode), - matrix_normal_start_off(matrix_lin_layer_size(input_vector_sz, hidden_vec_sz, gates)), - bias_start_off(bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, gates, 0)) - { - } - - const int in_vec, h_vec; - const int x_in_vec; // for bidirect TODO + const int save_point_size; // for bidirect TODO const int layers; - const int gates_cnt; - const int bias_cnt; // 0 - no bisa; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec -private: - const size_t matrix_normal_start_off; - const size_t bias_start_off; - -public: - auto get_matrix_x_size(int layer_id) const - { - return (layer_id > 0 ? x_in_vec : in_vec) * h_vec; - } - auto get_matrix_h_size() const { return h_vec * h_vec; } - auto get_matrix_layer_size(int layer_id) const - { - return get_matrix_x_size(layer_id) * gates_cnt + get_matrix_h_size() * gates_cnt; - } - - size_t get_matrix_x_off(int layer_id) const - { - if(layer_id > 0) - return matrix_normal_start_off + - static_cast(layer_id - 1) * get_matrix_layer_size(layer_id); - else - return 0; - }; - - size_t get_matrix_h_off(int layer_id) const - { - if(layer_id > 0) - return get_matrix_x_off(layer_id) + static_cast(h_vec * x_in_vec * gates_cnt); - else - return get_matrix_x_off(layer_id) + static_cast(h_vec * in_vec) * gates_cnt; - }; - - int bias_vector_size() const { return h_vec; } - int bias_vector_mul_gate() const { return bias_vector_size() * gates_cnt; } - int bias_stride() const { return bias_vector_mul_gate(); } + const RBuffHelper strides; - size_t bias_relative_off(int layer_id, int bias_id) const + auto st_offset(int layer, int batch_num, int reverse = 0) { - return static_cast(layer_id * bias_cnt + bias_id) * gates_cnt * h_vec; + return gemm_write_offset(layer, batch_num) + save_point::St * save_point_size; } - size_t get_bias_off(int layer_id, int bias_id) const + size_t extra_save_point_offset(int layer, int batch_num) const { - return bias_start_off + bias_relative_off(layer_id, bias_id); + return strides.table // all data offset + + static_cast(batches_per_layer) * layer * hidden_size + + static_cast(batch_num * hidden_size); } }; diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 5100b230ca..cfd040d951 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -85,7 +85,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, bacc_per_time[seq_len] = total_batch_size; - GRUOffsets RBuff(nLayers, hidden_size, total_batch_size); + GRUOffsets RBuff(in_vec_size, hidden_size, nLayers, total_batch_size); auto get_HxBuff_offset = [&](int layer_id) { return layer_id * (static_cast(hidden_size) * max_batch); @@ -100,11 +100,10 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, &wDesc, reserveSpace, x, - w, - hidden_size](int layer_id, float beta_t = 1) { + w](int layer_id, float beta_t = 1) { // n = Rx,Zx,Cx - const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size() * WeiBuf.matrixes::Count, - k = layer_id > 0 ? hidden_size : in_vec_size; + const int m = RBuff.batches_per_layer, n = WeiBuf.weight_stride, + k = layer_id > 0 ? RBuff.hidden_size : in_vec_size; const int lda = layer_id > 0 ? 
RBuff.gemm_write_stride() : in_vec_size, ldb = k, ldc = RBuff.gemm_write_stride(); @@ -155,25 +154,20 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, const std::vector tensor_size{1, static_cast(RBuff.batches_per_layer), - static_cast(hidden_size) * - WeiBuf.matrixes::Count}; + static_cast(WeiBuf.weight_stride)}; const std::vector tensor_stride{static_cast(RBuff.layer_stride()), static_cast(RBuff.gemm_write_stride()), 1}; - auto wei_shift_bias_temp = WeiBuf.bias_off() + WeiBuf.weight_stride * 2 * layer_id; - auto tensor_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - const std::vector weight_size{ - 1, 1, static_cast(hidden_size) * WeiBuf.matrixes::Count}; + const std::vector weight_size{1, 1, static_cast(WeiBuf.weight_stride)}; - const std::vector weight_stride{ - static_cast(hidden_size) * WeiBuf.matrixes::Count, - static_cast(hidden_size) * WeiBuf.matrixes::Count, - 1}; + const std::vector weight_stride{static_cast(WeiBuf.weight_stride), + static_cast(WeiBuf.weight_stride), + 1}; auto wei_desc = miopen::TensorDescriptor(wDesc.GetType(), weight_size, weight_stride); @@ -189,12 +183,13 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, tensor_desc, reserveSpace, output_offset, - wei_shift_bias_temp, + WeiBuf.bias_off(layer_id), output_offset); } - const std::vector tensor_size{ - 1, static_cast(RBuff.batches_per_layer), static_cast(hidden_size)}; + const std::vector tensor_size{1, + static_cast(RBuff.batches_per_layer), + static_cast(RBuff.hidden_size)}; const std::vector tensor_stride{static_cast(RBuff.layer_stride()), static_cast(RBuff.gemm_write_stride()), @@ -232,23 +227,20 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, auto call_gru_bias_add = [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w](int layer_id, float beta_t = 0) { - float alpha0 = 1; - float alpha1 = 1; - const auto bias_stride = WeiBuf.bias_stride(); + float alpha0 = 1; + float alpha1 = 1; - auto wei_shift_bias_temp = - WeiBuf.bias_off() + WeiBuf.weight_stride * 2 * layer_id + WeiBuf.weight_stride; - - const auto bias_desc = - miopen::TensorDescriptor(wDesc.GetType(), - std::vector{1, 1, bias_stride}, - std::vector{bias_stride, bias_stride, 1}); + const auto bias_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{1, 1, WeiBuf.bias_stride()}, + std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); const auto hidden_interim_desc = miopen::TensorDescriptor( wDesc.GetType(), - std::vector{1, RBuff.batches_per_layer, WeiBuf.bias_stride()}, - std::vector{ - RBuff.batches_per_layer * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); + std::vector{ + 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_stride()}, + std::vector{ + static_cast(RBuff.layer_stride()), RBuff.gemm_write_stride(), 1}); const auto RB_layer_out_off = RBuff.layer_offset(layer_id); @@ -264,7 +256,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, hidden_interim_desc, reserveSpace, RB_layer_out_off, - wei_shift_bias_temp, + WeiBuf.bias_off(layer_id) + WeiBuf.weight_stride, RB_layer_out_off); }; @@ -283,8 +275,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, if(cur_time == 0 && hx == nullptr) return; - const int m = batches.at(cur_time), n = RBuff.gemm_write_size() * WeiBuf.matrixes::Count, - k = hidden_size; + const int m = batches.at(cur_time), n = WeiBuf.weight_stride, k = hidden_size; const int lda = (cur_time == 0) ? 
hidden_size : RBuff.gemm_write_stride(); @@ -308,10 +299,10 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, xDesc.GetType(), false}; - const auto ht_offset = (cur_time == 0) - ? get_HxBuff_offset(layer) - : RBuff.gemm_write_offset(layer, bacc_per_time[cur_time - 1]) + - RBuff.hidden_offset(); + const auto hidden_offset = + (cur_time == 0) ? get_HxBuff_offset(layer) + : RBuff.gemm_write_offset(layer, bacc_per_time[cur_time - 1]) + + RBuff.hidden_offset(); const auto ht_ptr = cur_time > 0 ? reserveSpace : hx; @@ -320,7 +311,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, const miopenStatus_t gemm_status = CallGemm(handle, gemm_desc_hx, ht_ptr, - ht_offset, + hidden_offset, w, WeiBuf.hidden_offset(layer), reserveSpace, @@ -776,9 +767,10 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, return layer_id * (static_cast(hidden_size) * max_batch); }; - int bi = dirMode != 0u ? 2 : 1; - ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode, bi); - ReluReserveBufferOffsets RBuff(hidden_size, hidden_size, nLayers, total_batch_size, max_batch); + int bi = dirMode != 0u ? 2 : 1; + int wei_stride = hidden_size * bi * static_cast(nHiddenTensorsPerLayer); + ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2, bi, wei_stride); + ReluReserveBufferOffsets RBuff(in_vec_size, hidden_size, nLayers, total_batch_size, max_batch); ActivationDescriptor activDesc; @@ -795,9 +787,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, [&RBuff, &WeiBuf, &in_vec_size, &handle, &xDesc, reserveSpace, x, w, hidden_size]( int layer, float beta_t = 1) { const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size(), - k = layer > 0 ? hidden_size : in_vec_size; + k = layer > 0 ? RBuff.hidden_size : RBuff.in_vec_size; - const int lda = layer > 0 ? RBuff.gemm_write_stride() : in_vec_size, ldb = k, + const int lda = layer > 0 ? RBuff.gemm_write_stride() : RBuff.in_vec_size, ldb = k, ldc = RBuff.gemm_write_stride(); const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, @@ -818,10 +810,10 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, xDesc.GetType(), false}; - const auto input_weight_offset = WeiBuf.input_offset(layer); + const auto input_weight_offset = WeiBuf.input_weight_offset(layer); const auto output_offset = RBuff.layer_offset(layer); - const auto input_offset = layer > 0 ? RBuff.ht_offset(layer - 1) : 0; + const auto input_offset = layer > 0 ? RBuff.hidden_offset(layer - 1) : 0; const auto input_ptr = layer > 0 ? 
reserveSpace : x; @@ -838,56 +830,54 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_THROW("GEMM execution failure"); }; - auto call_relu_tan_bias_add = [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w]( - int layer, float beta_t = 0) { - float alpha0 = 1; - float alpha1 = 1; - const auto bias_stride = WeiBuf.bias_stride(); + auto call_relu_tan_bias_add = + [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w](int layer, float beta_t = 0) { + float alpha0 = 1; + float alpha1 = 1; - const auto bias_desc = - miopen::TensorDescriptor(wDesc.GetType(), - std::vector{1, 1, bias_stride}, - std::vector{bias_stride, bias_stride, 1}); + const auto bias_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{1, 1, WeiBuf.bias_stride()}, + std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); - const auto hidden_interim_desc = miopen::TensorDescriptor( - wDesc.GetType(), - std::vector{1, RBuff.batches_per_layer, WeiBuf.bias_stride()}, - std::vector{ - RBuff.batches_per_layer * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); + const auto hidden_interim_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{ + 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_stride()}, + std::vector{RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); - const auto RB_layer_out_off = RBuff.layer_offset(layer); - const auto w_bias_layer_start_off = WeiBuf.bias_off(layer, 0); + const auto RB_layer_out_off = RBuff.layer_offset(layer); - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hidden_interim_desc, - reserveSpace, // A - &alpha1, - bias_desc, - w, // B - &beta_t, - hidden_interim_desc, - reserveSpace, // C - RB_layer_out_off, // A offset - w_bias_layer_start_off, // B offset - RB_layer_out_off); // C offset + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hidden_interim_desc, + reserveSpace, // A + &alpha1, + bias_desc, + w, // B + &beta_t, + hidden_interim_desc, + reserveSpace, // C + RB_layer_out_off, // A offset + WeiBuf.bias_off(layer), // B offset + RB_layer_out_off); // C offset - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hidden_interim_desc, - reserveSpace, - &alpha1, - bias_desc, - w, - &beta_t, - hidden_interim_desc, - reserveSpace, - RB_layer_out_off, - w_bias_layer_start_off + bias_stride, - RB_layer_out_off); - }; + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hidden_interim_desc, + reserveSpace, + &alpha1, + bias_desc, + w, + &beta_t, + hidden_interim_desc, + reserveSpace, + RB_layer_out_off, + WeiBuf.bias_off(layer) + WeiBuf.bias_stride(), + RB_layer_out_off); + }; auto call_relu_tan_hidden_gemm = [&RBuff, &WeiBuf, @@ -898,16 +888,15 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &xDesc, reserveSpace, hx, - w, - hidden_size](int layer, int cur_time) { + w](int layer, int cur_time) { if(cur_time == 0 && hx == nullptr) return; - const int m = batches.at(cur_time), n = RBuff.gemm_write_size(), k = hidden_size; + const int m = batches.at(cur_time), n = RBuff.gemm_write_size(), k = RBuff.hidden_size; - const int lda = (cur_time != 0) ? RBuff.gemm_write_stride() : hidden_size; + const int lda = (cur_time != 0) ? 
RBuff.gemm_write_stride() : RBuff.hidden_size; - const int ldb = hidden_size, ldc = RBuff.gemm_write_stride(); + const int ldb = RBuff.hidden_size, ldc = RBuff.gemm_write_stride(); const miopen::GemmDescriptor gemm_desc_hx = GemmDescriptor{false, false, @@ -927,9 +916,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, xDesc.GetType(), false}; - const auto ht_offset = (cur_time == 0) - ? get_HxBuff_offset(layer) - : RBuff.ht_offset(layer, bacc_per_time[cur_time - 1]); + const auto hidden_offset = (cur_time == 0) + ? get_HxBuff_offset(layer) + : RBuff.hidden_offset(layer, bacc_per_time[cur_time - 1]); const auto ht_ptr = cur_time > 0 ? reserveSpace : hx; @@ -939,9 +928,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const miopenStatus_t gemm_status = CallGemm(handle, gemm_desc_hx, ht_ptr, - ht_offset, + hidden_offset, w, - WeiBuf.hidden_offset(layer), + WeiBuf.hidden_weight_offset(layer), reserveSpace, RB_batch_save_points_off, GemmBackend_t::miopengemm); @@ -951,12 +940,13 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, }; auto call_relu_tan_hidden_state_update = - [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &activDesc, hidden_size]( - int layer_id, int time_id) { + [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &activDesc](int layer_id, + int time_id) { float alpha = 1, beta = 0; - const std::vector tensor_size{ - 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; + const std::vector tensor_size{1, + static_cast(batches.at(time_id)), + static_cast(RBuff.hidden_size)}; const std::vector tensor_stride{static_cast(RBuff.layer_stride()), static_cast(RBuff.gemm_write_stride()), @@ -968,7 +958,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const auto RB_layer_save_points_off = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]); - const auto ht_offset = RBuff.ht_offset(layer_id, bacc_per_time[time_id]); + const auto hidden_offset = RBuff.hidden_offset(layer_id, bacc_per_time[time_id]); activDesc.Forward(handle, &alpha, @@ -984,7 +974,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, // input tensor offset RB_layer_save_points_off, // output tensor offset - ht_offset); + hidden_offset); }; auto call_relu_tan_update_output = [&RBuff, @@ -996,7 +986,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, reserveSpace, hy, max_batch, - hidden_size, seq_len](int layer_id) { if(hy == nullptr) return; @@ -1005,8 +994,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const std::vector hcy_src_stride{ RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; - const std::vector hcy_dst_stride{ - static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; + const std::vector hcy_dst_stride{static_cast(RBuff.hidden_size * max_batch), + static_cast(RBuff.hidden_size), + 1}; for(int time_i = seq_len - 1; time_i >= 0; time_i--) { @@ -1017,10 +1007,10 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto batch_id_relative = batches.at(time_i) - copy_batch; auto batch_id_abs = bacc_per_time[time_i] + batch_id_relative; - auto hcy_batch_offset = batch_id_relative * hidden_size; + auto hcy_batch_offset = batch_id_relative * RBuff.hidden_size; const std::vector hcy_copy_size{ - 1, static_cast(copy_batch), static_cast(hidden_size)}; + 1, static_cast(copy_batch), static_cast(RBuff.hidden_size)}; auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); @@ -1032,19 
+1022,18 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, reserveSpace, dst_desc, hy, - RBuff.ht_offset(layer_id, batch_id_abs), + RBuff.hidden_offset(layer_id, batch_id_abs), hcy_layer_offset + hcy_batch_offset); } } }; - if(biasMode != 0u) - for(int layer_id = 0; layer_id < nLayers; layer_id++) - call_relu_tan_bias_add(layer_id); - for(int layer_id = 0; layer_id < nLayers; layer_id++) { call_relu_tan_input_gemm(layer_id); + if(biasMode != 0u) + call_relu_tan_bias_add(layer_id); + for(int time = 0; time < seq_len; time++) { call_relu_tan_hidden_gemm(layer_id, time); @@ -1069,7 +1058,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto y_dst_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_dst_stride); CopyTensor( - handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.ht_offset(nLayers - 1, 0), 0); + handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.hidden_offset(nLayers - 1, 0), 0); } #else (void)handle; @@ -1152,69 +1141,61 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, return layer_id * (static_cast(hidden_size) * max_batch); }; - int gates_cnt = 4; - int save_points_cnt = 6; + int bi = dirMode != 0u ? 2 : 1; - LSTMWeightsBufferHelper WeiBuf(in_vec, hidden_size, nLayers, biasMode * 2, gates_cnt); + LSTMWeightsBufferHelper WeiBuf(in_vec, hidden_size, nLayers, biasMode * 2, bi); - LSTMReserveBufferHelper RBuff( - hidden_size, hidden_size, nLayers, total_batch_size, save_points_cnt, gates_cnt); + LSTMReserveBufferHelper RBuff(hidden_size, nLayers, total_batch_size, in_vec); - auto call_x_gemm = [&RBuff, - &WeiBuf, - &InBuff_strides, - &bacc_per_time, - &handle, - &xDesc, - reserveSpace, - x, - w, - hidden_size, - in_vec](int layer, int start_time, int time_cnt, float beta_t = 1) { - const auto start_b = bacc_per_time[start_time]; - const auto batch_sz = bacc_per_time[start_time + time_cnt] - start_b; - - const int m = batch_sz, n = RBuff.gemm_write_size(), k = layer > 0 ? hidden_size : in_vec; - const int lda = layer > 0 ? RBuff.gemm_write_stride() : InBuff_strides.batch, ldb = k, - ldc = RBuff.gemm_write_stride(); + auto call_x_gemm = + [&RBuff, &WeiBuf, &InBuff_strides, &bacc_per_time, &handle, &xDesc, reserveSpace, x, w]( + int layer, int start_time, int time_cnt, float beta_t = 1) { + const auto start_b = bacc_per_time[start_time]; + const auto batch_sz = bacc_per_time[start_time + time_cnt] - start_b; - const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, - false, - true, - m, - n, - k, - lda, - ldb, - ldc, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - beta_t, // beta - xDesc.GetType(), - false}; + const int m = batch_sz, n = RBuff.gemm_write_size(), + k = layer > 0 ? RBuff.hidden_size : RBuff.in_vec_size; + const int lda = layer > 0 ? RBuff.gemm_write_stride() : InBuff_strides.batch, ldb = k, + ldc = RBuff.gemm_write_stride(); - const auto wx_off = WeiBuf.get_matrix_x_off(layer); - const auto out_offset = RBuff.gemm_write_offset(layer, start_b); + const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + true, + m, + n, + k, + lda, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + beta_t, // beta + xDesc.GetType(), + false}; - const auto x_in_offset = layer > 0 ? RBuff.ht_offset(layer - 1, start_b) - : static_cast(start_b * InBuff_strides.batch); - const auto in_ptr = layer > 0 ? 
reserveSpace : x; + const auto wx_off = WeiBuf.input_weight_offset(layer); + const auto out_offset = RBuff.gemm_write_offset(layer, start_b); - const miopenStatus_t gemm_status = CallGemm(handle, - gemm_desc, - in_ptr, - x_in_offset, - w, - wx_off, - reserveSpace, - out_offset, - GemmBackend_t::miopengemm); - if(gemm_status != miopenStatusSuccess) - MIOPEN_THROW("GEMM execution failure"); - }; + const auto x_in_offset = layer > 0 + ? RBuff.hidden_offset(layer - 1, start_b) + : static_cast(start_b * InBuff_strides.batch); + const auto in_ptr = layer > 0 ? reserveSpace : x; + + const miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc, + in_ptr, + x_in_offset, + w, + wx_off, + reserveSpace, + out_offset, + GemmBackend_t::miopengemm); + if(gemm_status != miopenStatusSuccess) + MIOPEN_THROW("GEMM execution failure"); + }; auto call_bias_add = [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w](int layer, float beta_t = 0) { @@ -1224,17 +1205,17 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const auto bias_desc = miopen::TensorDescriptor(wDesc.GetType(), - std::vector{1, 1, WeiBuf.bias_vector_mul_gate()}, - std::vector{bias_stride, bias_stride, 1}); + std::vector{1, 1, WeiBuf.bias_vector_mul_gate()}, + std::vector{bias_stride, bias_stride, 1}); const auto hidden_interim_desc = miopen::TensorDescriptor( wDesc.GetType(), - std::vector{1, RBuff.batches, WeiBuf.bias_vector_mul_gate()}, - std::vector{ - RBuff.batches * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); + std::vector{ + 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_vector_mul_gate()}, + std::vector{ + RBuff.batches_per_layer * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); - const auto RB_layer_out_off = RBuff.layer_offset(layer); - const auto w_bias_layer_start_off = WeiBuf.get_bias_off(layer, 0); + const auto RB_layer_out_off = RBuff.layer_offset(layer); OpTensor(handle, miopenTensorOpAdd, @@ -1248,7 +1229,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hidden_interim_desc, reserveSpace, // C RB_layer_out_off, // A offset - w_bias_layer_start_off, // B offset + WeiBuf.bias_off(layer), // B offset RB_layer_out_off); // C offset OpTensor(handle, @@ -1263,7 +1244,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hidden_interim_desc, reserveSpace, RB_layer_out_off, - w_bias_layer_start_off + bias_stride, + WeiBuf.bias_off(layer) + bias_stride, RB_layer_out_off); }; @@ -1276,16 +1257,15 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, &xDesc, reserveSpace, hx, - w, - hidden_size](int layer, int cur_time) { - const int m = in_n.at(cur_time), n = RBuff.gemm_write_size(), k = hidden_size; + w](int layer, int cur_time) { + const int m = in_n.at(cur_time), n = RBuff.gemm_write_size(), k = RBuff.hidden_size; - const int lda = (cur_time != 0) ? RBuff.gemm_write_stride() : hidden_size, - ldb = hidden_size, ldc = RBuff.gemm_write_stride(); + const int lda = (cur_time != 0) ? RBuff.gemm_write_stride() : RBuff.hidden_size, + ldb = RBuff.hidden_size, ldc = RBuff.gemm_write_stride(); const auto hx_ptr_offset = (cur_time == 0) ? 
get_HxBuff_offset(layer) - : RBuff.ht_offset(layer, bacc_per_time[cur_time - 1]); + : RBuff.hidden_offset(layer, bacc_per_time[cur_time - 1]); if(cur_time == 0) if(hx == nullptr) @@ -1319,7 +1299,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hx_ptr, hx_ptr_offset, w, - WeiBuf.get_matrix_h_off(layer), + WeiBuf.hidden_weight_offset(layer), reserveSpace, RB_layer_save_points_off, GemmBackend_t::miopengemm); @@ -1336,11 +1316,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, &wDesc, reserveSpace, cx, - max_batch, - hidden_size](int layer_id, int time_id) { - auto RB_layer_save_points_off = - RBuff.layer_offset(layer_id) + RBuff.gemm_write_relative_offset(bacc_per_time[time_id]); - + max_batch](int layer_id, int time_id) { auto is_seq_begin = time_id == 0; const int direction = 0; @@ -1351,47 +1327,36 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const size_t cx_offset = get_HxBuff_offset(layer_id); - const size_t i_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(0), - f_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(1), - o_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(2), - c_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(3); - - const size_t cell_offset = RB_layer_save_points_off + RBuff.ct_relative_offset(), - hidden_offset = RB_layer_save_points_off + RBuff.ht_relative_offset(); - - const size_t cell_offset_pre = - (time_id == 0) ? 0 - : RBuff.layer_offset(layer_id) + - RBuff.gemm_write_relative_offset(bacc_per_time[time_id - 1]) + - RBuff.ct_relative_offset(); - const size_t activ_cell_offset = RBuff.extra_save_point_offset(layer_id, bacc_per_time[time_id]); + const size_t cell_offset_pre = + (time_id == 0) ? 0 : RBuff.st_offset(layer_id, bacc_per_time[time_id - 1]); + LSTMForwardHiddenStateUpdate(handle, wDesc.GetType(), false, is_seq_begin, - direction, + rnn_direction::Forward, max_batch, cur_batch, use_batch, - - hidden_size, + RBuff.hidden_size, hy_stride, wei_len, wei_stride, cx, cx_offset, reserveSpace, - i_offset, - f_offset, - o_offset, - c_offset, - cell_offset, + RBuff.i_offset(layer_id, bacc_per_time[time_id]), + RBuff.f_offset(layer_id, bacc_per_time[time_id]), + RBuff.g_offset(layer_id, bacc_per_time[time_id]), + RBuff.o_offset(layer_id, bacc_per_time[time_id]), + RBuff.st_offset(layer_id, bacc_per_time[time_id]), + // RBuff.st_offset(layer_id, bacc_per_time[time_id - 1]), cell_offset_pre, activ_cell_offset, - hidden_offset); + RBuff.hidden_offset(layer_id, bacc_per_time[time_id])); }; auto call_hy_cy_update = [&RBuff, @@ -1404,7 +1369,6 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hy, cy, max_batch, - hidden_size, seq_len](int layer_id) { if(hy != nullptr || (cy != nullptr)) { @@ -1413,7 +1377,9 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const std::vector hcy_src_stride{ RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; const std::vector hcy_dst_stride{ - static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; + static_cast(RBuff.hidden_size * max_batch), + static_cast(RBuff.hidden_size), + 1}; for(int time_i = seq_len - 1; time_i >= 0; time_i--) { @@ -1424,13 +1390,10 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, auto batch_id_relative = in_n.at(time_i) - copy_batch; auto batch_id_abs = bacc_per_time[time_i] + batch_id_relative; - auto hcy_batch_offset = batch_id_relative * hidden_size; - - auto src_batch_offset = RBuff.layer_offset(layer_id) + - 
RBuff.gemm_write_relative_offset(batch_id_abs); + auto hcy_batch_offset = batch_id_relative * RBuff.hidden_size; const std::vector hcy_copy_size{ - 1, static_cast(copy_batch), static_cast(hidden_size)}; + 1, static_cast(copy_batch), static_cast(RBuff.hidden_size)}; auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); @@ -1444,7 +1407,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, reserveSpace, dst_desc, hy, - src_batch_offset + RBuff.ht_relative_offset(), + RBuff.hidden_offset(layer_id, batch_id_abs), hcy_layer_offset + hcy_batch_offset); } @@ -1455,7 +1418,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, reserveSpace, dst_desc, cy, - src_batch_offset + RBuff.ct_relative_offset(), + RBuff.st_offset(layer_id, batch_id_abs), hcy_layer_offset + hcy_batch_offset); } } @@ -1671,7 +1634,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, auto y_dst_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_dst_stride); CopyTensor( - handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.ht_offset(nLayers - 1, 0), 0); + handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.hidden_offset(nLayers - 1, 0), 0); } sync_root_to_all_stream_pull(); @@ -2152,7 +2115,6 @@ void RNNDescriptor::RNNForwardInferencePacked(Handle& handle, sp_size[2] = wei_stride; w_desc = miopen::TensorDescriptor(wDesc.GetType(), w_size, w_stride); sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -2224,7 +2186,6 @@ void RNNDescriptor::RNNForwardInferencePacked(Handle& handle, sp_size[1] = batch_n; sp_size[2] = wei_stride; sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -2250,7 +2211,6 @@ void RNNDescriptor::RNNForwardInferencePacked(Handle& handle, w_size[1] = 1; w_size[2] = wei_len; w_desc = miopen::TensorDescriptor(wDesc.GetType(), w_size, w_stride); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -2302,7 +2262,6 @@ void RNNDescriptor::RNNForwardInferencePacked(Handle& handle, sp_size[2] = wei_len; sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -3326,6 +3285,7 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( "! 
Batch size must not ascend!"); } } + in_n.push_back(batchval); batch_n += batchval; } @@ -3353,7 +3313,6 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( cy, reserveSpace, reserveSpaceSize); - if(is_profiling) { float eventTime_mS = RNNProfilingEnd(handle, start, stop); @@ -4919,8 +4878,9 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( activDesc = {miopenActivationTANH, 1, 1, 1}; } - ReluWeightOffsets WeiBuf(input_size, hidden_size, nLayers, biasMode, bi, wei_stride); - ReluReserveBufferOffsets RBuff(hidden_size, + ReluWeightOffsets WeiBuf(input_size, hidden_size, nLayers, biasMode * 2, bi, wei_stride); + + ReluReserveBufferOffsets RBuff(input_size, hidden_size, nLayers, total_batches, @@ -4987,7 +4947,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( workSpace, RBuff.layer_offset(layer + 1), w, - WeiBuf.input_weight_offset(layer), + WeiBuf.input_weight_offset(layer + 1), workSpace, RBuff.layer_offset(layer), GemmBackend_t::miopengemm); @@ -5016,13 +4976,14 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( float alpha1 = 1; float beta_t = 0; - std::vector hx_stride{batches.at(0) * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + std::vector hx_stride{batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; std::vector reserve_stride{(int)RBuff.layer_stride(), RBuff.gemm_write_size(), 1}; - std::vector hx_size{1, batches.at(cur_time), RBuff.h_vec_size}; - std::vector reserve_size{1, batches.at(cur_time), RBuff.h_vec_size}; + std::vector hx_size{1, batches.at(cur_time), RBuff.hidden_size}; + std::vector reserve_size{1, batches.at(cur_time), RBuff.hidden_size}; auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); + auto workspace_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); @@ -5058,15 +5019,15 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( int reverse) { if(reverse == 0 && dhy != nullptr && batches.at(cur_time) > batches.at(use_time)) { - std::vector hx_stride{batches.at(0) * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + std::vector hx_stride{batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; std::vector reserve_stride{(int)RBuff.layer_stride(), RBuff.gemm_write_size(), 1}; std::vector hx_size{ - 1, batches.at(cur_time) - batches.at(use_time), RBuff.h_vec_size}; + 1, batches.at(cur_time) - batches.at(use_time), RBuff.hidden_size}; std::vector reserve_size{ - 1, batches.at(cur_time) - batches.at(use_time), RBuff.h_vec_size}; + 1, batches.at(cur_time) - batches.at(use_time), RBuff.hidden_size}; float alpha0 = 1; float alpha1 = 1; @@ -5087,7 +5048,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &beta_t, reserve_desc, workSpace, - (get_HxBuff_offset(layer, reverse) + batches.at(use_time) * RBuff.h_vec_size), + (get_HxBuff_offset(layer, reverse) + batches.at(use_time) * RBuff.hidden_size), RBuff.gemm_write_offset(layer, accumulated_batches + batches.at(use_time)), RBuff.gemm_write_offset(layer, accumulated_batches + batches.at(use_time))); } @@ -5099,10 +5060,10 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( false, false, batches.at(use_time), - RBuff.h_vec_size, - RBuff.h_vec_size, + RBuff.hidden_size, + RBuff.hidden_size, RBuff.gemm_write_size(), - RBuff.h_vec_size, + RBuff.hidden_size, RBuff.gemm_write_size(), 1, // batch count 0, // Stride A @@ -5147,7 +5108,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( reserveSpace, &activDesc, propagate_hidden_output, - propagate_hidden_prev](int nLayers, int layer) { + propagate_hidden_prev](int layer) { int accumulated_batches = 
RBuff.batches_per_layer; int reverse_accumulated_batches = 0; @@ -5174,7 +5135,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( rnn_direction::Forward); } - std::vector reserve_size{1, batches.at(ti), RBuff.h_vec_size}; + std::vector reserve_size{1, batches.at(ti), RBuff.hidden_size}; auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); @@ -5192,7 +5153,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &beta, reserve_desc, workSpace, - RBuff.ht_offset(layer, accumulated_batches, rnn_direction::Forward), + RBuff.hidden_offset(layer, accumulated_batches, rnn_direction::Forward), RBuff.gemm_write_offset(layer, accumulated_batches, rnn_direction::Forward), RBuff.gemm_write_offset(layer, accumulated_batches, rnn_direction::Forward), RBuff.gemm_write_offset(layer, accumulated_batches, rnn_direction::Forward)); @@ -5218,7 +5179,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } std::vector reserve_backward_size{ - 1, batches.at(seqLen - 1 - ti), RBuff.h_vec_size}; + 1, batches.at(seqLen - 1 - ti), RBuff.hidden_size}; auto reserve_backward_desc = miopen::TensorDescriptor(rnn_data_type, reserve_backward_size, reserve_stride); activDesc.Backward( @@ -5233,7 +5194,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &beta, reserve_backward_desc, workSpace, - RBuff.ht_offset(layer, reverse_accumulated_batches, rnn_direction::Backward), + RBuff.hidden_offset(layer, reverse_accumulated_batches, rnn_direction::Backward), RBuff.gemm_write_offset( layer, reverse_accumulated_batches, rnn_direction::Backward), RBuff.gemm_write_offset( @@ -5255,11 +5216,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( false, false, (batches.at(cur_time) - use_batch), - RBuff.h_vec_size, - RBuff.h_vec_size, + RBuff.hidden_size, + RBuff.hidden_size, RBuff.gemm_write_size(), - RBuff.h_vec_size, - RBuff.h_vec_size, + RBuff.hidden_size, + RBuff.hidden_size, 1, // batch count 0, // Stride A 0, // Stride B @@ -5269,7 +5230,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( rnn_data_type, false}; - int hx_shift = get_HxBuff_offset(layer, reverse) + use_batch * RBuff.h_vec_size; + int hx_shift = get_HxBuff_offset(layer, reverse) + use_batch * RBuff.hidden_size; miopenStatus_t gemm_status = CallGemm(handle, gemm_desc, @@ -5322,7 +5283,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( for(int li = static_cast(nLayers) - 1; li >= 0; li--) { propagate_output(nLayers, li); - propagate_hidden(nLayers, li); + propagate_hidden(li); propagate_dhx(li); } @@ -5334,7 +5295,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( false, total_batches, input_size, - RBuff.h_vec_size * bi, + RBuff.hidden_size * bi, hy_stride, in_stride, in_stride, From c2bfee257b54f1a81c053529bcc14a135d284b00 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Wed, 27 Sep 2023 15:28:50 +0300 Subject: [PATCH 07/27] Lstm minor refactoring --- src/include/miopen/rnn_util.hpp | 519 +++++++++++++++----------------- src/ocl/rnnocl.cpp | 485 ++++++++++++++--------------- 2 files changed, 459 insertions(+), 545 deletions(-) diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index e4ad698568..4dfdce971c 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -126,69 +126,54 @@ void LSTMBackwardHiddenStateUpdate(const Handle& handle, std::size_t dhidden_offset, std::size_t f_offset_pre); -struct GRUOffsets +struct RNNWeightOffsets { -public: - GRUOffsets(int num_layers, int hidden_size, int total_batch_size) - : 
num_layers(num_layers), hidden_size(hidden_size), batches_per_layer(total_batch_size) - { - } - - int r_offset() const { return save_point::R * gemm_write_size(); } - - int z_offset() const { return save_point::Z * gemm_write_size(); } - - int c_offset() const { return save_point::С * gemm_write_size(); } - - int hidden_offset() const { return save_point::Ht * gemm_write_size(); } - - size_t gemm_write_offset(int layer_id, int batch_num) const - { - return layer_offset(layer_id) + batch_num * gemm_write_stride(); - } - - int activated_offset() const { return layer_stride() * num_layers; } - - int gemm_write_size() const { return hidden_size; } - - int gemm_write_stride() const { return save_point::Count * gemm_write_size(); } - - int layer_offset(int layer) const { return layer * layer_stride(); } - int batches_per_layer; - - size_t layer_stride() const { return gemm_write_stride() * batches_per_layer; } - - size_t network_stride() { return layer_stride() * num_layers; } +public: + int input_offset(int layer) const; + int hidden_offset(int layer) const; + int bias_off(); + int bias_off(int layer) const; private: - int num_layers; - int hidden_size; - - enum save_point - { - Z = 0, - R = 1, - С = 2, - Ht = 3, - Count = 4 - }; + int first_layer_offset() const; }; -struct GruWeightOffsets +struct GruWeightOffsets : public RNNWeightOffsets { - GruWeightOffsets(int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode) + GruWeightOffsets(int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_count) : weight_stride(matrixes::Count * hidden_vec_sz), in_vec_sz(input_vector_sz), h_vec_sz(hidden_vec_sz), - num_layers(layers_cnt) + num_layers(layers_cnt), + bias_count(bias_count) + { + } + + int input_offset(int layer) { + return layer == 0 ? 0 : first_layer_offset() + h_vec_sz * 2 * weight_stride * (layer - 1); } + int hidden_offset(int layer) + { + return layer == 0 ? input_offset(layer) + in_vec_sz * weight_stride + : input_offset(layer) + h_vec_sz * weight_stride; + } + + size_t bias_stride() { return matrixes::Count * h_vec_sz; } + int bias_off() + { + return (in_vec_sz + h_vec_sz + bias_count * h_vec_sz * (num_layers - 1)) * weight_stride; + } + int bias_off(int layer_id) { return bias_off() + layer_id * bias_count * weight_stride; } int weight_stride; + +private: const int in_vec_sz, h_vec_sz; const int num_layers; - + const int bi_scale = 0; + const int bias_count = 0; enum matrixes { Z = 0, @@ -196,144 +181,186 @@ struct GruWeightOffsets C = 2, Count = 3 }; + int first_layer_offset() { return (in_vec_sz + h_vec_sz) * weight_stride; } +}; +struct ReluWeightOffsets : public RNNWeightOffsets +{ public: - int input_offset(int layer) + ReluWeightOffsets(int input_vector_sz, + int hidden_vec_sz, + int layers_cnt, + int bias_mode, + int bi_scale, + int wei_stride) + : weight_stride(wei_stride), + in_vec_sz(input_vector_sz), + h_vec_sz(hidden_vec_sz), + num_layers(layers_cnt), + bi_scale(bi_scale), + bias_count(bias_mode) + { + } + + int input_weight_offset(int layer) const { return layer == 0 ? 0 - : (in_vec_sz + h_vec_sz) * weight_stride + - 2 * h_vec_sz * weight_stride * (layer - 1); + : first_layer_offset() + + (h_vec_sz + h_vec_sz * bi_scale) * weight_stride * (layer - 1); } - int hidden_offset(int layer) + int hidden_weight_offset(int layer, int reverse = 0) const { - return layer == 0 ? input_offset(layer) + in_vec_sz * weight_stride - : input_offset(layer) + h_vec_sz * weight_stride; + return layer == 0 ? 
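// Illustrative note (not from the original patch; assumes hypothetical sizes):
// hidden_weight_offset() relies on the flat weight buffer storing, per layer,
// the input-weight block followed by the recurrent block. With in_vec_sz = 4,
// h_vec_sz = 3, bi_scale = 1 and weight_stride = 3 (plain element offsets):
//   layer 0: input block at 0, hidden block at 0 + 4*3 = 12
//   first_layer_offset() = (4 + 3) * 3 = 21
//   layer 1: input block at 21, hidden block at 21 + 1*3*3 = 30
//   layer 2: input block at 21 + (3 + 3)*3 = 39, hidden block at 48
// The reverse * h_vec_sz * h_vec_sz term presumably steps to the second
// direction's H x H block when bi_scale == 2.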
input_weight_offset(layer) + in_vec_sz * weight_stride + + reverse * h_vec_sz * h_vec_sz + : input_weight_offset(layer) + bi_scale * h_vec_sz * weight_stride + + reverse * h_vec_sz * h_vec_sz; } - int bias_stride() { return matrixes::Count * h_vec_sz; } + size_t bias_stride() { return h_vec_sz; } + int bias_off() { - return (in_vec_sz + h_vec_sz + 2 * h_vec_sz * (num_layers - 1)) * weight_stride; + return first_layer_offset() + + (h_vec_sz * bi_scale + h_vec_sz) * (num_layers - 1) * weight_stride; } - int bias_off(int layer_id) { return bias_off() + layer_id * weight_stride; } + + int bias_off(int layer_id) { return bias_off() + bias_count * layer_id * weight_stride; } + int weight_stride; + +private: + const int in_vec_sz, h_vec_sz; + const int num_layers; + const int bi_scale = 1; + const int bias_count = 0; + + int first_layer_offset() const { return (in_vec_sz + h_vec_sz) * weight_stride; } }; -struct ReluWeightOffsets +struct LSTMWeightsBufferHelper : public RNNWeightOffsets { -private: - auto hidden_xinput_size(int hidden_sz, int bidirect_mode) const +public: + const int first_layer_offset() const { return (in_vec_sz + h_vec_sz) * weight_stride; } + +public: + LSTMWeightsBufferHelper( + int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode, int bi_scale) + : weight_stride(hidden_vec_sz * gates_cnt), + in_vec_sz(input_vector_sz), + h_vec_sz(hidden_vec_sz), + num_layers(layers_cnt), + bi_scale(bi_scale), + bias_cnt(bias_mode) { - if(bidirect_mode == 0) - return hidden_sz; - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); } - auto matrix_lin_layer_size(int input_vector_sz, int hidden_vec_sz) const + int input_weight_offset(int layer) const { - return (input_vector_sz + hidden_vec_sz) * hidden_vec_sz; + return layer == 0 ? 0 + : first_layer_offset() + + (h_vec_sz + h_vec_sz * bi_scale) * weight_stride * (layer - 1); } - size_t bias_start_offset(int input_vector_sz, - int hidden_vec_sz, - int layers_cnt, - int bidirect_mode) const + int hidden_weight_offset(int layer, int reverse = 0) const { - return matrix_lin_layer_size(input_vector_sz, hidden_vec_sz) + - static_cast(hidden_vec_sz + hidden_xinput_size(hidden_vec_sz, 0)) * - hidden_vec_sz * static_cast(layers_cnt - 1); + return layer == 0 ? 
input_weight_offset(layer) + in_vec_sz * weight_stride + + reverse * h_vec_sz * h_vec_sz + : input_weight_offset(layer) + bi_scale * h_vec_sz * weight_stride + + reverse * h_vec_sz * h_vec_sz; } -public: - ReluWeightOffsets(int input_vector_sz, - int hidden_vec_sz, - int layers_cnt, - int bias_mode, - int bidirectional, - int wei_stride = 0) - : in_vec_sz(input_vector_sz), - h_vec_sz(hidden_vec_sz), - x_in_vec_sz(hidden_xinput_size(hidden_vec_sz, 0)), - bias_cnt(bias_mode), - matrix_normal_start_off(matrix_lin_layer_size(input_vector_sz, hidden_vec_sz)), - bias_start_off( - bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, bidirectional)), - bidirectional(bidirectional), - wei_stride(wei_stride) + size_t bias_stride() { return bias_vector_mul_gate(); } + + int bias_off() { + return first_layer_offset() + + (h_vec_sz * bi_scale + h_vec_sz) * (num_layers - 1) * weight_stride; } + int bias_off(int layer_id) { return bias_off() + layer_id * bias_cnt * weight_stride; } + + size_t bias_vector_mul_gate() const { return h_vec_sz * gates_cnt; } + + const int weight_stride; + private: + static const int gates_cnt = 4; const int in_vec_sz; - const int bidirectional; - const int x_in_vec_sz; // for bidirect TODO + const int h_vec_sz; + const int num_layers; + const int bi_scale = 1; + const int bias_cnt = 0; +}; - const int bias_cnt; // 0 - no bias; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec +struct RNNOffsets +{ + size_t layer_offset(int layer_id) const; - const size_t matrix_normal_start_off; - const size_t bias_start_off; + size_t layer_stride() const; - auto get_input_matrix_size(int layer_id) const - { - return (layer_id > 0 ? x_in_vec_sz : in_vec_sz) * h_vec_sz; - } + int gemm_write_size() const; - auto get_hidden_matrix_size() const { return h_vec_sz * h_vec_sz; } + size_t gemm_write_stride() const; - auto get_matrix_layer_size(int layer_id) const - { - return get_input_matrix_size(layer_id) + get_hidden_matrix_size(); - } + size_t gemm_write_offset(int layer_id, int batch_id = 0, int reverse = 0) const; - int bias_vector_size() const { return h_vec_sz; } + size_t hidden_offset(int layer_id, int batch_id = 0, int reverse = 0) const; +}; - size_t bias_relative_off(int layer_id, int bias_id) const +struct GRUOffsets : public RNNOffsets +{ +public: + GRUOffsets(int in_vec_size, int hidden_size, int num_layers, int total_batch_size) + : hidden_size(hidden_size), + batches_per_layer(total_batch_size), + in_vec_size(in_vec_size), + num_layers(num_layers) { - return static_cast(layer_id * bias_cnt + bias_id) * h_vec_sz; } -public: - const int h_vec_sz; - const int wei_stride; + size_t layer_offset(int layer_id) const { return layer_id * layer_stride(); } - size_t input_weight_offset(int layer_id) const - { - return hidden_weight_offset(layer_id, 0) + h_vec_sz * wei_stride; - } + size_t layer_stride() const { return gemm_write_stride() * batches_per_layer; } + + int gemm_write_size() const { return hidden_size; } + + size_t gemm_write_stride() const { return save_point::Count * gemm_write_size(); } - size_t hidden_weight_offset(int layer_id, int reverse) const + size_t gemm_write_offset(int layer_id, int batch_num, int reverse = 0) const { - return in_vec_sz * wei_stride + - layer_id * (bidirectional * h_vec_sz + h_vec_sz) * wei_stride + - reverse * h_vec_sz * h_vec_sz; + return layer_offset(layer_id) + batch_num * gemm_write_stride(); } - size_t input_offset(int layer_id) const - { - if(layer_id > 0) - return matrix_normal_start_off + - static_cast(layer_id - 1) * 
get_matrix_layer_size(layer_id); - else - return 0; - }; + size_t hidden_offset() const { return save_point::Ht * gemm_write_size(); } - size_t hidden_offset(int layer_id) const - { - if(layer_id > 0) - return input_offset(layer_id) + static_cast(h_vec_sz * x_in_vec_sz); - else - return input_offset(layer_id) + static_cast(h_vec_sz * in_vec_sz); - }; + const int hidden_size; + const int batches_per_layer; + const int in_vec_size; - int bias_stride() const { return bias_vector_size(); } + int r_offset() const { return save_point::R * gemm_write_size(); } + + int z_offset() const { return save_point::Z * gemm_write_size(); } + + int c_offset() const { return save_point::С * gemm_write_size(); } + + int activated_offset() const { return layer_stride() * num_layers; } - size_t bias_off(int layer_id, int bias_id) const + size_t network_stride() { return layer_stride() * num_layers; } + +private: + int num_layers; + + enum save_point { - return bias_start_off + bias_relative_off(layer_id, bias_id); - } + Z = 0, + R = 1, + С = 2, + Ht = 3, + Count = 4 + }; }; -struct ReluReserveBufferOffsets +struct ReluReserveBufferOffsets : public RNNOffsets { struct RBuffHelper { @@ -347,7 +374,7 @@ struct ReluReserveBufferOffsets int layers, int bidirect_mode = 0) const { - const auto element_st = bidirect_mode ? 2 : 1; + const auto element_st = 1; const auto save_point_st = element_st * save_point_sz; const auto batch_st = save_point_st; const auto layer_st = static_cast(batch_st) * batches_per_layer; @@ -357,239 +384,165 @@ struct ReluReserveBufferOffsets } public: - ReluReserveBufferOffsets(int hidden_vec_sz, - int save_point_sz, + ReluReserveBufferOffsets(int in_vec_size, + int hidden_vec_size, int layers_cnt, int batches_per_layer, int max_batch, bool bidirect_mode = 0) - : h_vec_size(hidden_vec_sz), - save_point_size(save_point_sz), - layers(layers_cnt), + : hidden_size(hidden_vec_size), batches_per_layer(batches_per_layer), + in_vec_size(in_vec_size), + save_point_size(bidirect_mode ? 
hidden_vec_size * 2 : hidden_vec_size), + layers(layers_cnt), max_batch(max_batch), strides( - Reserve_Buffer_strides(save_point_sz, batches_per_layer, layers_cnt, bidirect_mode)) + Reserve_Buffer_strides(save_point_size, batches_per_layer, layers_cnt, bidirect_mode)) { } - const int h_vec_size; - const int save_point_size; - - const int layers; - const int batches_per_layer; - const RBuffHelper strides; - const int max_batch; - size_t layer_offset(int layer_id) const { return static_cast(layer_id) * strides.layer; } - auto layer_stride() const { return strides.layer; } + size_t layer_stride() const { return strides.layer; } - auto gemm_write_size() const { return strides.save_point; } + int gemm_write_size() const { return strides.save_point; } - auto gemm_write_stride() const { return strides.batch; } + size_t gemm_write_stride() const { return strides.batch; } size_t gemm_write_offset(int layer_id, int batch_id, int reverse = 0) const { return layer_offset(layer_id) + static_cast(gemm_write_stride()) * batch_id + - reverse * h_vec_size; + reverse * hidden_size; } - size_t ht_offset(int layer_id, int batch_id, int reverse = 0) const + size_t hidden_offset(int layer_id, int batch_id = 0, int reverse = 0) const { - return strides.table + gemm_write_offset(layer_id, batch_id) + - reverse * h_vec_size; + return strides.table + gemm_write_offset(layer_id, batch_id) + reverse * hidden_size; } - size_t ht_offset(int layer_id) const { return strides.table + layer_offset(layer_id);} + const int hidden_size; + const int batches_per_layer; + const int in_vec_size; + + const int save_point_size; + const int layers; + const int max_batch; + const RBuffHelper strides; }; -struct LSTMReserveBufferHelper +struct LSTMReserveBufferHelper : public RNNOffsets { struct RBuffHelper { int element, save_point, batch; - size_t layer; + size_t layer, table; }; private: + static const int gates_cnt = 4; auto Reserve_Buffer_strides(int save_point_sz, int batches_per_layer, int save_points, + int layers_cnt, int bidirect_mode = 0) const { - const auto element_st = 1; + const auto element_st = bidirect_mode ? 2 : 1; + const auto save_point_st = element_st * save_point_sz; const auto batch_st = save_point_st * save_points; const auto layer_st = static_cast(batch_st) * batches_per_layer; + const auto table_st = layer_st * layers_cnt; + if(bidirect_mode == 0) - return RBuffHelper{element_st, save_point_st, batch_st, layer_st}; + return RBuffHelper{element_st, save_point_st, batch_st, layer_st, table_st}; + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); } public: enum save_point { - F = 1, - I = 0, - G = 2, - O = 3, - St = 4, - Ht = 5 + F = 1, + I = 0, + G = 2, + O = 3, + St = 4, + Ht = 5, + Count = 6 }; - LSTMReserveBufferHelper(int hidden_vec_sz, - int save_point_sz, + LSTMReserveBufferHelper(int hidden_vec_size, int layers_cnt, int batches_per_layer, - int save_points, - int gates_cnt) - : h_vec(hidden_vec_sz), - save_point_size(save_point_sz), + int in_vec_size, + int bidirect_mode = 0) + : hidden_size(hidden_vec_size), + batches_per_layer(batches_per_layer), + in_vec_size(in_vec_size), + save_point_size(bidirect_mode ? 
hidden_vec_size * 2 : hidden_vec_size), layers(layers_cnt), - batches(batches_per_layer), - save_points_cnt(save_points), - gates(gates_cnt), - strides(Reserve_Buffer_strides(save_point_sz, batches, save_points, 0)) + strides(Reserve_Buffer_strides( + save_point_size, batches_per_layer, save_point::Count, layers_cnt, 0)) { } - const int h_vec; - const int save_point_size; // for bidirect TODO - - const int layers; - const int batches; - const int save_points_cnt; - const int gates; - const RBuffHelper strides; - size_t layer_offset(int layer) const { return static_cast(layer) * strides.layer; } - auto layer_stride() const { return strides.layer; } + size_t layer_stride() const { return strides.layer; } - auto gemm_write_size() const { return h_vec * gates; } - auto gemm_write_stride() const { return strides.batch; } // save_point_size * save_points_cnt + int gemm_write_size() const { return save_point_size * gates_cnt; } + size_t gemm_write_stride() const { return strides.batch; } - size_t gemm_write_relative_offset(int batch_id) const + size_t gemm_write_offset(int layer, int batch, int reverse = 0) const { - return static_cast(gemm_write_stride()) * batch_id; + return layer_offset(layer) + static_cast(gemm_write_stride()) * batch; } - size_t gemm_write_offset(int layer, int batch_id, int reverse = 0) const + size_t hidden_offset(int layer, int batch, int reverse = 0) const { - return layer_offset(layer) + static_cast(gemm_write_stride()) * batch_id; + return gemm_write_offset(layer, batch) + save_point::Ht * save_point_size; } - auto ht_relative_offset() const { return save_point::Ht * save_point_size; } - - auto ct_relative_offset() const { return save_point::St * save_point_size; } - - auto get_gate_relative_offset(int gate_id) const { return gate_id * save_point_size; } + const int hidden_size; + const int batches_per_layer; + const int in_vec_size; - size_t ht_offset(int layer_id, int batch_id) const + auto f_offset(int layer, int batch_num) const { - return layer_offset(layer_id) + gemm_write_relative_offset(batch_id) + ht_relative_offset(); + return gemm_write_offset(layer, batch_num) + save_point::F * save_point_size; } - size_t extra_save_point_offset(int layer_id, int batch_id) const + auto i_offset(int layer, int batch_num) const { - return (static_cast(batches) * layers * gemm_write_stride()) // all data offset - + (static_cast(batches) * layer_id) * h_vec + - static_cast(batch_id * h_vec); + return gemm_write_offset(layer, batch_num) + save_point::I * save_point_size; } -}; -struct LSTMWeightsBufferHelper -{ -private: - auto hidden_xinput_size(int hidden_sz, int bidirect_mode) const + auto g_offset(int layer, int batch_num) const { - if(bidirect_mode == 0) - return hidden_sz; - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + return gemm_write_offset(layer, batch_num) + save_point::G * save_point_size; } - auto matrix_lin_layer_size(int input_vector_sz, int hidden_vec_sz, int gates) const + auto o_offset(int layer, int batch_num) const { - return (input_vector_sz + hidden_vec_sz) * hidden_vec_sz * gates; + return gemm_write_offset(layer, batch_num) + save_point::O * save_point_size; } - size_t bias_start_offset( - int input_vector_sz, int hidden_vec_sz, int layers_cnt, int gates, int bidirect_mode) const - { - if(bidirect_mode == 0) - return matrix_lin_layer_size(input_vector_sz, hidden_vec_sz, gates) + - static_cast(hidden_vec_sz + hidden_xinput_size(hidden_vec_sz, 0)) * - hidden_vec_sz * static_cast(layers_cnt - 1) * gates; - - 
MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); - } - -public: - LSTMWeightsBufferHelper( - int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode, int gates) - : in_vec(input_vector_sz), - h_vec(hidden_vec_sz), - x_in_vec(hidden_xinput_size(hidden_vec_sz, 0)), - layers(layers_cnt), - gates_cnt(gates), - bias_cnt(bias_mode), - matrix_normal_start_off(matrix_lin_layer_size(input_vector_sz, hidden_vec_sz, gates)), - bias_start_off(bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, gates, 0)) - { - } - - const int in_vec, h_vec; - const int x_in_vec; // for bidirect TODO + const int save_point_size; // for bidirect TODO const int layers; - const int gates_cnt; - const int bias_cnt; // 0 - no bisa; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec -private: - const size_t matrix_normal_start_off; - const size_t bias_start_off; - -public: - auto get_matrix_x_size(int layer_id) const - { - return (layer_id > 0 ? x_in_vec : in_vec) * h_vec; - } - auto get_matrix_h_size() const { return h_vec * h_vec; } - auto get_matrix_layer_size(int layer_id) const - { - return get_matrix_x_size(layer_id) * gates_cnt + get_matrix_h_size() * gates_cnt; - } - - size_t get_matrix_x_off(int layer_id) const - { - if(layer_id > 0) - return matrix_normal_start_off + - static_cast(layer_id - 1) * get_matrix_layer_size(layer_id); - else - return 0; - }; - - size_t get_matrix_h_off(int layer_id) const - { - if(layer_id > 0) - return get_matrix_x_off(layer_id) + static_cast(h_vec * x_in_vec * gates_cnt); - else - return get_matrix_x_off(layer_id) + static_cast(h_vec * in_vec) * gates_cnt; - }; - - int bias_vector_size() const { return h_vec; } - int bias_vector_mul_gate() const { return bias_vector_size() * gates_cnt; } - int bias_stride() const { return bias_vector_mul_gate(); } + const RBuffHelper strides; - size_t bias_relative_off(int layer_id, int bias_id) const + auto st_offset(int layer, int batch_num, int reverse = 0) { - return static_cast(layer_id * bias_cnt + bias_id) * gates_cnt * h_vec; + return gemm_write_offset(layer, batch_num) + save_point::St * save_point_size; } - size_t get_bias_off(int layer_id, int bias_id) const + size_t extra_save_point_offset(int layer, int batch_num) const { - return bias_start_off + bias_relative_off(layer_id, bias_id); + return strides.table // all data offset + + static_cast(batches_per_layer) * layer * hidden_size + + static_cast(batch_num * hidden_size); } }; diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 5100b230ca..8f18171fd4 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -85,7 +85,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, bacc_per_time[seq_len] = total_batch_size; - GRUOffsets RBuff(nLayers, hidden_size, total_batch_size); + GRUOffsets RBuff(in_vec_size, hidden_size, nLayers, total_batch_size); auto get_HxBuff_offset = [&](int layer_id) { return layer_id * (static_cast(hidden_size) * max_batch); @@ -100,11 +100,10 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, &wDesc, reserveSpace, x, - w, - hidden_size](int layer_id, float beta_t = 1) { + w](int layer_id, float beta_t = 1) { // n = Rx,Zx,Cx - const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size() * WeiBuf.matrixes::Count, - k = layer_id > 0 ? hidden_size : in_vec_size; + const int m = RBuff.batches_per_layer, n = WeiBuf.weight_stride, + k = layer_id > 0 ? RBuff.hidden_size : in_vec_size; const int lda = layer_id > 0 ? 
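// Illustrative note (not from the original patch; sizes below are hypothetical):
// this single GEMM produces the Z, R and C pre-activations for every packed
// time-step of the layer at once: m is the total number of packed batch rows,
// n is WeiBuf.weight_stride = 3 * hidden_size (the Z, R, C blocks side by side),
// and k is the width of the GEMM input (in_vec_size for layer 0, hidden_size
// above it). With hidden_size = 3 and in_vec_size = 4: n = 9, and lda below is
// 4 for layer 0 versus RBuff.gemm_write_stride() = 4 * 3 = 12 for upper layers,
// whose input rows are read back from the reserve buffer (Z, R, C, Ht per row).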
RBuff.gemm_write_stride() : in_vec_size, ldb = k, ldc = RBuff.gemm_write_stride(); @@ -155,25 +154,20 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, const std::vector tensor_size{1, static_cast(RBuff.batches_per_layer), - static_cast(hidden_size) * - WeiBuf.matrixes::Count}; + static_cast(WeiBuf.weight_stride)}; const std::vector tensor_stride{static_cast(RBuff.layer_stride()), static_cast(RBuff.gemm_write_stride()), 1}; - auto wei_shift_bias_temp = WeiBuf.bias_off() + WeiBuf.weight_stride * 2 * layer_id; - auto tensor_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - const std::vector weight_size{ - 1, 1, static_cast(hidden_size) * WeiBuf.matrixes::Count}; + const std::vector weight_size{1, 1, static_cast(WeiBuf.weight_stride)}; - const std::vector weight_stride{ - static_cast(hidden_size) * WeiBuf.matrixes::Count, - static_cast(hidden_size) * WeiBuf.matrixes::Count, - 1}; + const std::vector weight_stride{static_cast(WeiBuf.weight_stride), + static_cast(WeiBuf.weight_stride), + 1}; auto wei_desc = miopen::TensorDescriptor(wDesc.GetType(), weight_size, weight_stride); @@ -189,12 +183,13 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, tensor_desc, reserveSpace, output_offset, - wei_shift_bias_temp, + WeiBuf.bias_off(layer_id), output_offset); } - const std::vector tensor_size{ - 1, static_cast(RBuff.batches_per_layer), static_cast(hidden_size)}; + const std::vector tensor_size{1, + static_cast(RBuff.batches_per_layer), + static_cast(RBuff.hidden_size)}; const std::vector tensor_stride{static_cast(RBuff.layer_stride()), static_cast(RBuff.gemm_write_stride()), @@ -232,23 +227,20 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, auto call_gru_bias_add = [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w](int layer_id, float beta_t = 0) { - float alpha0 = 1; - float alpha1 = 1; - const auto bias_stride = WeiBuf.bias_stride(); + float alpha0 = 1; + float alpha1 = 1; - auto wei_shift_bias_temp = - WeiBuf.bias_off() + WeiBuf.weight_stride * 2 * layer_id + WeiBuf.weight_stride; - - const auto bias_desc = - miopen::TensorDescriptor(wDesc.GetType(), - std::vector{1, 1, bias_stride}, - std::vector{bias_stride, bias_stride, 1}); + const auto bias_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{1, 1, WeiBuf.bias_stride()}, + std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); const auto hidden_interim_desc = miopen::TensorDescriptor( wDesc.GetType(), - std::vector{1, RBuff.batches_per_layer, WeiBuf.bias_stride()}, - std::vector{ - RBuff.batches_per_layer * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); + std::vector{ + 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_stride()}, + std::vector{ + static_cast(RBuff.layer_stride()), RBuff.gemm_write_stride(), 1}); const auto RB_layer_out_off = RBuff.layer_offset(layer_id); @@ -264,7 +256,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, hidden_interim_desc, reserveSpace, RB_layer_out_off, - wei_shift_bias_temp, + WeiBuf.bias_off(layer_id) + WeiBuf.weight_stride, RB_layer_out_off); }; @@ -283,8 +275,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, if(cur_time == 0 && hx == nullptr) return; - const int m = batches.at(cur_time), n = RBuff.gemm_write_size() * WeiBuf.matrixes::Count, - k = hidden_size; + const int m = batches.at(cur_time), n = WeiBuf.weight_stride, k = hidden_size; const int lda = (cur_time == 0) ? 
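// Illustrative note (not from the original patch): the leading dimension is the
// row pitch of whichever buffer supplies h(t-1). At cur_time == 0 the rows come
// from the user hx tensor, densely packed with a pitch of hidden_size; for later
// steps they are the Ht save points of the previous step inside reserveSpace,
// whose rows sit RBuff.gemm_write_stride() (save_point::Count * hidden_size,
// i.e. 4 * hidden_size for GRU) apart. The same distinction drives the
// hidden_offset / ht_ptr selection a few lines below.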
hidden_size : RBuff.gemm_write_stride(); @@ -308,10 +299,10 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, xDesc.GetType(), false}; - const auto ht_offset = (cur_time == 0) - ? get_HxBuff_offset(layer) - : RBuff.gemm_write_offset(layer, bacc_per_time[cur_time - 1]) + - RBuff.hidden_offset(); + const auto hidden_offset = + (cur_time == 0) ? get_HxBuff_offset(layer) + : RBuff.gemm_write_offset(layer, bacc_per_time[cur_time - 1]) + + RBuff.hidden_offset(); const auto ht_ptr = cur_time > 0 ? reserveSpace : hx; @@ -320,7 +311,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, const miopenStatus_t gemm_status = CallGemm(handle, gemm_desc_hx, ht_ptr, - ht_offset, + hidden_offset, w, WeiBuf.hidden_offset(layer), reserveSpace, @@ -776,9 +767,10 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, return layer_id * (static_cast(hidden_size) * max_batch); }; - int bi = dirMode != 0u ? 2 : 1; - ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode, bi); - ReluReserveBufferOffsets RBuff(hidden_size, hidden_size, nLayers, total_batch_size, max_batch); + int bi = dirMode != 0u ? 2 : 1; + int wei_stride = hidden_size * bi * static_cast(nHiddenTensorsPerLayer); + ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2, bi, wei_stride); + ReluReserveBufferOffsets RBuff(in_vec_size, hidden_size, nLayers, total_batch_size, max_batch); ActivationDescriptor activDesc; @@ -795,9 +787,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, [&RBuff, &WeiBuf, &in_vec_size, &handle, &xDesc, reserveSpace, x, w, hidden_size]( int layer, float beta_t = 1) { const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size(), - k = layer > 0 ? hidden_size : in_vec_size; + k = layer > 0 ? RBuff.hidden_size : RBuff.in_vec_size; - const int lda = layer > 0 ? RBuff.gemm_write_stride() : in_vec_size, ldb = k, + const int lda = layer > 0 ? RBuff.gemm_write_stride() : RBuff.in_vec_size, ldb = k, ldc = RBuff.gemm_write_stride(); const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, @@ -818,10 +810,10 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, xDesc.GetType(), false}; - const auto input_weight_offset = WeiBuf.input_offset(layer); + const auto input_weight_offset = WeiBuf.input_weight_offset(layer); const auto output_offset = RBuff.layer_offset(layer); - const auto input_offset = layer > 0 ? RBuff.ht_offset(layer - 1) : 0; + const auto input_offset = layer > 0 ? RBuff.hidden_offset(layer - 1) : 0; const auto input_ptr = layer > 0 ? 
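// Illustrative note (not from the original patch; sizes below are hypothetical):
// layer 0 consumes the packed input tensor x directly, while every higher layer
// consumes the activated hidden outputs of the layer below, which live in the
// second region of reserveSpace addressed by RBuff.hidden_offset(). With
// hidden_size = 3, nLayers = 2 and total_batch_size = 5 (unidirectional):
// layer_stride = 3 * 5 = 15, strides.table = 2 * 15 = 30, so hidden_offset(0)
// is 30 while the raw GEMM outputs of layer 0 occupy offsets 0..14.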
reserveSpace : x; @@ -838,56 +830,54 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_THROW("GEMM execution failure"); }; - auto call_relu_tan_bias_add = [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w]( - int layer, float beta_t = 0) { - float alpha0 = 1; - float alpha1 = 1; - const auto bias_stride = WeiBuf.bias_stride(); + auto call_relu_tan_bias_add = + [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w](int layer, float beta_t = 0) { + float alpha0 = 1; + float alpha1 = 1; - const auto bias_desc = - miopen::TensorDescriptor(wDesc.GetType(), - std::vector{1, 1, bias_stride}, - std::vector{bias_stride, bias_stride, 1}); + const auto bias_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{1, 1, WeiBuf.bias_stride()}, + std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); - const auto hidden_interim_desc = miopen::TensorDescriptor( - wDesc.GetType(), - std::vector{1, RBuff.batches_per_layer, WeiBuf.bias_stride()}, - std::vector{ - RBuff.batches_per_layer * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); + const auto hidden_interim_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{ + 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_stride()}, + std::vector{RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); - const auto RB_layer_out_off = RBuff.layer_offset(layer); - const auto w_bias_layer_start_off = WeiBuf.bias_off(layer, 0); + const auto RB_layer_out_off = RBuff.layer_offset(layer); - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hidden_interim_desc, - reserveSpace, // A - &alpha1, - bias_desc, - w, // B - &beta_t, - hidden_interim_desc, - reserveSpace, // C - RB_layer_out_off, // A offset - w_bias_layer_start_off, // B offset - RB_layer_out_off); // C offset + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hidden_interim_desc, + reserveSpace, // A + &alpha1, + bias_desc, + w, // B + &beta_t, + hidden_interim_desc, + reserveSpace, // C + RB_layer_out_off, // A offset + WeiBuf.bias_off(layer), // B offset + RB_layer_out_off); // C offset - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hidden_interim_desc, - reserveSpace, - &alpha1, - bias_desc, - w, - &beta_t, - hidden_interim_desc, - reserveSpace, - RB_layer_out_off, - w_bias_layer_start_off + bias_stride, - RB_layer_out_off); - }; + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hidden_interim_desc, + reserveSpace, + &alpha1, + bias_desc, + w, + &beta_t, + hidden_interim_desc, + reserveSpace, + RB_layer_out_off, + WeiBuf.bias_off(layer) + WeiBuf.bias_stride(), + RB_layer_out_off); + }; auto call_relu_tan_hidden_gemm = [&RBuff, &WeiBuf, @@ -898,16 +888,15 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &xDesc, reserveSpace, hx, - w, - hidden_size](int layer, int cur_time) { + w](int layer, int cur_time) { if(cur_time == 0 && hx == nullptr) return; - const int m = batches.at(cur_time), n = RBuff.gemm_write_size(), k = hidden_size; + const int m = batches.at(cur_time), n = RBuff.gemm_write_size(), k = RBuff.hidden_size; - const int lda = (cur_time != 0) ? RBuff.gemm_write_stride() : hidden_size; + const int lda = (cur_time != 0) ? 
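// Illustrative note (not from the original patch; the sequence lengths below
// are hypothetical): m is batches.at(cur_time), not the full batch, because the
// packed layout keeps only the sequences still active at this time-step. With
// seq_array = {3, 2, 2, 1}, bacc_per_time is {0, 3, 5, 7, 8}, so the recurrent
// GEMM at time 2 multiplies just 2 rows, starting at packed row
// bacc_per_time[1] = 3 of the previous step's hidden states.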
RBuff.gemm_write_stride() : RBuff.hidden_size; - const int ldb = hidden_size, ldc = RBuff.gemm_write_stride(); + const int ldb = RBuff.hidden_size, ldc = RBuff.gemm_write_stride(); const miopen::GemmDescriptor gemm_desc_hx = GemmDescriptor{false, false, @@ -927,9 +916,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, xDesc.GetType(), false}; - const auto ht_offset = (cur_time == 0) - ? get_HxBuff_offset(layer) - : RBuff.ht_offset(layer, bacc_per_time[cur_time - 1]); + const auto hidden_offset = (cur_time == 0) + ? get_HxBuff_offset(layer) + : RBuff.hidden_offset(layer, bacc_per_time[cur_time - 1]); const auto ht_ptr = cur_time > 0 ? reserveSpace : hx; @@ -939,9 +928,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const miopenStatus_t gemm_status = CallGemm(handle, gemm_desc_hx, ht_ptr, - ht_offset, + hidden_offset, w, - WeiBuf.hidden_offset(layer), + WeiBuf.hidden_weight_offset(layer), reserveSpace, RB_batch_save_points_off, GemmBackend_t::miopengemm); @@ -951,12 +940,13 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, }; auto call_relu_tan_hidden_state_update = - [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &activDesc, hidden_size]( - int layer_id, int time_id) { + [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &activDesc](int layer_id, + int time_id) { float alpha = 1, beta = 0; - const std::vector tensor_size{ - 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; + const std::vector tensor_size{1, + static_cast(batches.at(time_id)), + static_cast(RBuff.hidden_size)}; const std::vector tensor_stride{static_cast(RBuff.layer_stride()), static_cast(RBuff.gemm_write_stride()), @@ -968,7 +958,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const auto RB_layer_save_points_off = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]); - const auto ht_offset = RBuff.ht_offset(layer_id, bacc_per_time[time_id]); + const auto hidden_offset = RBuff.hidden_offset(layer_id, bacc_per_time[time_id]); activDesc.Forward(handle, &alpha, @@ -984,7 +974,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, // input tensor offset RB_layer_save_points_off, // output tensor offset - ht_offset); + hidden_offset); }; auto call_relu_tan_update_output = [&RBuff, @@ -996,7 +986,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, reserveSpace, hy, max_batch, - hidden_size, seq_len](int layer_id) { if(hy == nullptr) return; @@ -1005,8 +994,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const std::vector hcy_src_stride{ RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; - const std::vector hcy_dst_stride{ - static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; + const std::vector hcy_dst_stride{static_cast(RBuff.hidden_size * max_batch), + static_cast(RBuff.hidden_size), + 1}; for(int time_i = seq_len - 1; time_i >= 0; time_i--) { @@ -1017,10 +1007,10 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto batch_id_relative = batches.at(time_i) - copy_batch; auto batch_id_abs = bacc_per_time[time_i] + batch_id_relative; - auto hcy_batch_offset = batch_id_relative * hidden_size; + auto hcy_batch_offset = batch_id_relative * RBuff.hidden_size; const std::vector hcy_copy_size{ - 1, static_cast(copy_batch), static_cast(hidden_size)}; + 1, static_cast(copy_batch), static_cast(RBuff.hidden_size)}; auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); @@ -1032,19 
+1022,18 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, reserveSpace, dst_desc, hy, - RBuff.ht_offset(layer_id, batch_id_abs), + RBuff.hidden_offset(layer_id, batch_id_abs), hcy_layer_offset + hcy_batch_offset); } } }; - if(biasMode != 0u) - for(int layer_id = 0; layer_id < nLayers; layer_id++) - call_relu_tan_bias_add(layer_id); - for(int layer_id = 0; layer_id < nLayers; layer_id++) { call_relu_tan_input_gemm(layer_id); + if(biasMode != 0u) + call_relu_tan_bias_add(layer_id); + for(int time = 0; time < seq_len; time++) { call_relu_tan_hidden_gemm(layer_id, time); @@ -1069,7 +1058,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto y_dst_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_dst_stride); CopyTensor( - handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.ht_offset(nLayers - 1, 0), 0); + handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.hidden_offset(nLayers - 1, 0), 0); } #else (void)handle; @@ -1152,69 +1141,61 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, return layer_id * (static_cast(hidden_size) * max_batch); }; - int gates_cnt = 4; - int save_points_cnt = 6; + int bi = dirMode != 0u ? 2 : 1; - LSTMWeightsBufferHelper WeiBuf(in_vec, hidden_size, nLayers, biasMode * 2, gates_cnt); + LSTMWeightsBufferHelper WeiBuf(in_vec, hidden_size, nLayers, biasMode * 2, bi); - LSTMReserveBufferHelper RBuff( - hidden_size, hidden_size, nLayers, total_batch_size, save_points_cnt, gates_cnt); + LSTMReserveBufferHelper RBuff(hidden_size, nLayers, total_batch_size, in_vec); - auto call_x_gemm = [&RBuff, - &WeiBuf, - &InBuff_strides, - &bacc_per_time, - &handle, - &xDesc, - reserveSpace, - x, - w, - hidden_size, - in_vec](int layer, int start_time, int time_cnt, float beta_t = 1) { - const auto start_b = bacc_per_time[start_time]; - const auto batch_sz = bacc_per_time[start_time + time_cnt] - start_b; - - const int m = batch_sz, n = RBuff.gemm_write_size(), k = layer > 0 ? hidden_size : in_vec; - const int lda = layer > 0 ? RBuff.gemm_write_stride() : InBuff_strides.batch, ldb = k, - ldc = RBuff.gemm_write_stride(); + auto call_x_gemm = + [&RBuff, &WeiBuf, &InBuff_strides, &bacc_per_time, &handle, &xDesc, reserveSpace, x, w]( + int layer, int start_time, int time_cnt, float beta_t = 1) { + const auto start_b = bacc_per_time[start_time]; + const auto batch_sz = bacc_per_time[start_time + time_cnt] - start_b; - const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, - false, - true, - m, - n, - k, - lda, - ldb, - ldc, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - beta_t, // beta - xDesc.GetType(), - false}; + const int m = batch_sz, n = RBuff.gemm_write_size(), + k = layer > 0 ? RBuff.hidden_size : RBuff.in_vec_size; + const int lda = layer > 0 ? RBuff.gemm_write_stride() : InBuff_strides.batch, ldb = k, + ldc = RBuff.gemm_write_stride(); - const auto wx_off = WeiBuf.get_matrix_x_off(layer); - const auto out_offset = RBuff.gemm_write_offset(layer, start_b); + const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + true, + m, + n, + k, + lda, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + beta_t, // beta + xDesc.GetType(), + false}; - const auto x_in_offset = layer > 0 ? RBuff.ht_offset(layer - 1, start_b) - : static_cast(start_b * InBuff_strides.batch); - const auto in_ptr = layer > 0 ? 
reserveSpace : x; + const auto wx_off = WeiBuf.input_weight_offset(layer); + const auto out_offset = RBuff.gemm_write_offset(layer, start_b); - const miopenStatus_t gemm_status = CallGemm(handle, - gemm_desc, - in_ptr, - x_in_offset, - w, - wx_off, - reserveSpace, - out_offset, - GemmBackend_t::miopengemm); - if(gemm_status != miopenStatusSuccess) - MIOPEN_THROW("GEMM execution failure"); - }; + const auto x_in_offset = layer > 0 + ? RBuff.hidden_offset(layer - 1, start_b) + : static_cast(start_b * InBuff_strides.batch); + const auto in_ptr = layer > 0 ? reserveSpace : x; + + const miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc, + in_ptr, + x_in_offset, + w, + wx_off, + reserveSpace, + out_offset, + GemmBackend_t::miopengemm); + if(gemm_status != miopenStatusSuccess) + MIOPEN_THROW("GEMM execution failure"); + }; auto call_bias_add = [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w](int layer, float beta_t = 0) { @@ -1224,17 +1205,17 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const auto bias_desc = miopen::TensorDescriptor(wDesc.GetType(), - std::vector{1, 1, WeiBuf.bias_vector_mul_gate()}, - std::vector{bias_stride, bias_stride, 1}); + std::vector{1, 1, WeiBuf.bias_vector_mul_gate()}, + std::vector{bias_stride, bias_stride, 1}); const auto hidden_interim_desc = miopen::TensorDescriptor( wDesc.GetType(), - std::vector{1, RBuff.batches, WeiBuf.bias_vector_mul_gate()}, - std::vector{ - RBuff.batches * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); + std::vector{ + 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_vector_mul_gate()}, + std::vector{ + RBuff.batches_per_layer * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); - const auto RB_layer_out_off = RBuff.layer_offset(layer); - const auto w_bias_layer_start_off = WeiBuf.get_bias_off(layer, 0); + const auto RB_layer_out_off = RBuff.layer_offset(layer); OpTensor(handle, miopenTensorOpAdd, @@ -1248,7 +1229,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hidden_interim_desc, reserveSpace, // C RB_layer_out_off, // A offset - w_bias_layer_start_off, // B offset + WeiBuf.bias_off(layer), // B offset RB_layer_out_off); // C offset OpTensor(handle, @@ -1263,7 +1244,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hidden_interim_desc, reserveSpace, RB_layer_out_off, - w_bias_layer_start_off + bias_stride, + WeiBuf.bias_off(layer) + bias_stride, RB_layer_out_off); }; @@ -1276,16 +1257,15 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, &xDesc, reserveSpace, hx, - w, - hidden_size](int layer, int cur_time) { - const int m = in_n.at(cur_time), n = RBuff.gemm_write_size(), k = hidden_size; + w](int layer, int cur_time) { + const int m = in_n.at(cur_time), n = RBuff.gemm_write_size(), k = RBuff.hidden_size; - const int lda = (cur_time != 0) ? RBuff.gemm_write_stride() : hidden_size, - ldb = hidden_size, ldc = RBuff.gemm_write_stride(); + const int lda = (cur_time != 0) ? RBuff.gemm_write_stride() : RBuff.hidden_size, + ldb = RBuff.hidden_size, ldc = RBuff.gemm_write_stride(); const auto hx_ptr_offset = (cur_time == 0) ? 
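// Illustrative note (not from the original patch): at the first time-step the
// recurrent GEMM reads the caller-supplied initial state, laid out per layer at
// get_HxBuff_offset(layer) = layer * hidden_size * max_batch; afterwards it
// reads the Ht save points written for the previous step. bacc_per_time[cur_time - 1],
// the running sum of per-step batch sizes, converts that time index into the
// starting packed row inside the reserve buffer.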
get_HxBuff_offset(layer) - : RBuff.ht_offset(layer, bacc_per_time[cur_time - 1]); + : RBuff.hidden_offset(layer, bacc_per_time[cur_time - 1]); if(cur_time == 0) if(hx == nullptr) @@ -1319,7 +1299,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hx_ptr, hx_ptr_offset, w, - WeiBuf.get_matrix_h_off(layer), + WeiBuf.hidden_weight_offset(layer), reserveSpace, RB_layer_save_points_off, GemmBackend_t::miopengemm); @@ -1336,11 +1316,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, &wDesc, reserveSpace, cx, - max_batch, - hidden_size](int layer_id, int time_id) { - auto RB_layer_save_points_off = - RBuff.layer_offset(layer_id) + RBuff.gemm_write_relative_offset(bacc_per_time[time_id]); - + max_batch](int layer_id, int time_id) { auto is_seq_begin = time_id == 0; const int direction = 0; @@ -1351,47 +1327,36 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const size_t cx_offset = get_HxBuff_offset(layer_id); - const size_t i_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(0), - f_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(1), - o_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(2), - c_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(3); - - const size_t cell_offset = RB_layer_save_points_off + RBuff.ct_relative_offset(), - hidden_offset = RB_layer_save_points_off + RBuff.ht_relative_offset(); - - const size_t cell_offset_pre = - (time_id == 0) ? 0 - : RBuff.layer_offset(layer_id) + - RBuff.gemm_write_relative_offset(bacc_per_time[time_id - 1]) + - RBuff.ct_relative_offset(); - const size_t activ_cell_offset = RBuff.extra_save_point_offset(layer_id, bacc_per_time[time_id]); + const size_t cell_offset_pre = + (time_id == 0) ? 0 : RBuff.st_offset(layer_id, bacc_per_time[time_id - 1]); + LSTMForwardHiddenStateUpdate(handle, wDesc.GetType(), false, is_seq_begin, - direction, + rnn_direction::Forward, max_batch, cur_batch, use_batch, - - hidden_size, + RBuff.hidden_size, hy_stride, wei_len, wei_stride, cx, cx_offset, reserveSpace, - i_offset, - f_offset, - o_offset, - c_offset, - cell_offset, + RBuff.i_offset(layer_id, bacc_per_time[time_id]), + RBuff.f_offset(layer_id, bacc_per_time[time_id]), + RBuff.g_offset(layer_id, bacc_per_time[time_id]), + RBuff.o_offset(layer_id, bacc_per_time[time_id]), + RBuff.st_offset(layer_id, bacc_per_time[time_id]), + // RBuff.st_offset(layer_id, bacc_per_time[time_id - 1]), cell_offset_pre, activ_cell_offset, - hidden_offset); + RBuff.hidden_offset(layer_id, bacc_per_time[time_id])); }; auto call_hy_cy_update = [&RBuff, @@ -1404,7 +1369,6 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hy, cy, max_batch, - hidden_size, seq_len](int layer_id) { if(hy != nullptr || (cy != nullptr)) { @@ -1413,7 +1377,9 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const std::vector hcy_src_stride{ RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; const std::vector hcy_dst_stride{ - static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; + static_cast(RBuff.hidden_size * max_batch), + static_cast(RBuff.hidden_size), + 1}; for(int time_i = seq_len - 1; time_i >= 0; time_i--) { @@ -1424,13 +1390,10 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, auto batch_id_relative = in_n.at(time_i) - copy_batch; auto batch_id_abs = bacc_per_time[time_i] + batch_id_relative; - auto hcy_batch_offset = batch_id_relative * hidden_size; - - auto src_batch_offset = RBuff.layer_offset(layer_id) + - 
RBuff.gemm_write_relative_offset(batch_id_abs); + auto hcy_batch_offset = batch_id_relative * RBuff.hidden_size; const std::vector hcy_copy_size{ - 1, static_cast(copy_batch), static_cast(hidden_size)}; + 1, static_cast(copy_batch), static_cast(RBuff.hidden_size)}; auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); @@ -1444,7 +1407,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, reserveSpace, dst_desc, hy, - src_batch_offset + RBuff.ht_relative_offset(), + RBuff.hidden_offset(layer_id, batch_id_abs), hcy_layer_offset + hcy_batch_offset); } @@ -1455,7 +1418,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, reserveSpace, dst_desc, cy, - src_batch_offset + RBuff.ct_relative_offset(), + RBuff.st_offset(layer_id, batch_id_abs), hcy_layer_offset + hcy_batch_offset); } } @@ -1671,7 +1634,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, auto y_dst_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_dst_stride); CopyTensor( - handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.ht_offset(nLayers - 1, 0), 0); + handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.hidden_offset(nLayers - 1, 0), 0); } sync_root_to_all_stream_pull(); @@ -2152,7 +2115,6 @@ void RNNDescriptor::RNNForwardInferencePacked(Handle& handle, sp_size[2] = wei_stride; w_desc = miopen::TensorDescriptor(wDesc.GetType(), w_size, w_stride); sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -2224,7 +2186,6 @@ void RNNDescriptor::RNNForwardInferencePacked(Handle& handle, sp_size[1] = batch_n; sp_size[2] = wei_stride; sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -2250,7 +2211,6 @@ void RNNDescriptor::RNNForwardInferencePacked(Handle& handle, w_size[1] = 1; w_size[2] = wei_len; w_desc = miopen::TensorDescriptor(wDesc.GetType(), w_size, w_stride); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -2302,7 +2262,6 @@ void RNNDescriptor::RNNForwardInferencePacked(Handle& handle, sp_size[2] = wei_len; sp_desc = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -3326,6 +3285,7 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( "! 
Batch size must not ascend!"); } } + in_n.push_back(batchval); batch_n += batchval; } @@ -3353,7 +3313,6 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( cy, reserveSpace, reserveSpaceSize); - if(is_profiling) { float eventTime_mS = RNNProfilingEnd(handle, start, stop); @@ -4919,8 +4878,9 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( activDesc = {miopenActivationTANH, 1, 1, 1}; } - ReluWeightOffsets WeiBuf(input_size, hidden_size, nLayers, biasMode, bi, wei_stride); - ReluReserveBufferOffsets RBuff(hidden_size, + ReluWeightOffsets WeiBuf(input_size, hidden_size, nLayers, biasMode * 2, bi, wei_stride); + + ReluReserveBufferOffsets RBuff(input_size, hidden_size, nLayers, total_batches, @@ -4932,11 +4892,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( }; auto propagate_output = - [&RBuff, out_vec_size, &handle, &rnn_data_type, workSpace, dy, &WeiBuf, w](int nLayers, + [&RBuff, out_vec_size, &handle, &rnn_data_type, workSpace, dy, &WeiBuf, w](int numLayers, int layer) { // Propagate output // - if(layer == nLayers - 1) + if(layer == numLayers - 1) { const std::vector y_src_size{1, static_cast(RBuff.batches_per_layer), @@ -4987,7 +4947,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( workSpace, RBuff.layer_offset(layer + 1), w, - WeiBuf.input_weight_offset(layer), + WeiBuf.input_weight_offset(layer + 1), workSpace, RBuff.layer_offset(layer), GemmBackend_t::miopengemm); @@ -5016,13 +4976,14 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( float alpha1 = 1; float beta_t = 0; - std::vector hx_stride{batches.at(0) * RBuff.h_vec_size, RBuff.h_vec_size, 1}; - std::vector reserve_stride{(int)RBuff.layer_stride(), RBuff.gemm_write_size(), 1}; + std::vector hx_stride{batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; + std::vector reserve_stride{static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hx_size{1, batches.at(cur_time), RBuff.h_vec_size}; - std::vector reserve_size{1, batches.at(cur_time), RBuff.h_vec_size}; + std::vector hx_size{1, batches.at(cur_time), RBuff.hidden_size}; + std::vector reserve_size{1, batches.at(cur_time), RBuff.hidden_size}; auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); + auto workspace_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); @@ -5058,15 +5019,15 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( int reverse) { if(reverse == 0 && dhy != nullptr && batches.at(cur_time) > batches.at(use_time)) { - std::vector hx_stride{batches.at(0) * RBuff.h_vec_size, RBuff.h_vec_size, 1}; + std::vector hx_stride{batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; - std::vector reserve_stride{(int)RBuff.layer_stride(), RBuff.gemm_write_size(), 1}; + std::vector reserve_stride{static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; std::vector hx_size{ - 1, batches.at(cur_time) - batches.at(use_time), RBuff.h_vec_size}; + 1, batches.at(cur_time) - batches.at(use_time), RBuff.hidden_size}; std::vector reserve_size{ - 1, batches.at(cur_time) - batches.at(use_time), RBuff.h_vec_size}; + 1, batches.at(cur_time) - batches.at(use_time), RBuff.hidden_size}; float alpha0 = 1; float alpha1 = 1; @@ -5087,7 +5048,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &beta_t, reserve_desc, workSpace, - (get_HxBuff_offset(layer, reverse) + batches.at(use_time) * RBuff.h_vec_size), + (get_HxBuff_offset(layer, reverse) + batches.at(use_time) * RBuff.hidden_size), RBuff.gemm_write_offset(layer, accumulated_batches + batches.at(use_time)), 
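// Illustrative note (not from the original patch): this OpTensor appears to
// handle sequences that terminate at cur_time. When the packed batch shrinks
// (batches.at(cur_time) > batches.at(use_time)), the trailing
// batches.at(cur_time) - batches.at(use_time) rows have no later time-step, so
// their incoming dhy gradient is accumulated directly into the workspace rows
// starting at packed row accumulated_batches + batches.at(use_time).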
RBuff.gemm_write_offset(layer, accumulated_batches + batches.at(use_time))); } @@ -5099,10 +5060,10 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( false, false, batches.at(use_time), - RBuff.h_vec_size, - RBuff.h_vec_size, + RBuff.hidden_size, + RBuff.hidden_size, RBuff.gemm_write_size(), - RBuff.h_vec_size, + RBuff.hidden_size, RBuff.gemm_write_size(), 1, // batch count 0, // Stride A @@ -5147,7 +5108,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( reserveSpace, &activDesc, propagate_hidden_output, - propagate_hidden_prev](int nLayers, int layer) { + propagate_hidden_prev](int layer) { int accumulated_batches = RBuff.batches_per_layer; int reverse_accumulated_batches = 0; @@ -5174,7 +5135,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( rnn_direction::Forward); } - std::vector reserve_size{1, batches.at(ti), RBuff.h_vec_size}; + std::vector reserve_size{1, batches.at(ti), RBuff.hidden_size}; auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); @@ -5192,7 +5153,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &beta, reserve_desc, workSpace, - RBuff.ht_offset(layer, accumulated_batches, rnn_direction::Forward), + RBuff.hidden_offset(layer, accumulated_batches, rnn_direction::Forward), RBuff.gemm_write_offset(layer, accumulated_batches, rnn_direction::Forward), RBuff.gemm_write_offset(layer, accumulated_batches, rnn_direction::Forward), RBuff.gemm_write_offset(layer, accumulated_batches, rnn_direction::Forward)); @@ -5218,7 +5179,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } std::vector reserve_backward_size{ - 1, batches.at(seqLen - 1 - ti), RBuff.h_vec_size}; + 1, batches.at(seqLen - 1 - ti), RBuff.hidden_size}; auto reserve_backward_desc = miopen::TensorDescriptor(rnn_data_type, reserve_backward_size, reserve_stride); activDesc.Backward( @@ -5233,7 +5194,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &beta, reserve_backward_desc, workSpace, - RBuff.ht_offset(layer, reverse_accumulated_batches, rnn_direction::Backward), + RBuff.hidden_offset(layer, reverse_accumulated_batches, rnn_direction::Backward), RBuff.gemm_write_offset( layer, reverse_accumulated_batches, rnn_direction::Backward), RBuff.gemm_write_offset( @@ -5255,11 +5216,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( false, false, (batches.at(cur_time) - use_batch), - RBuff.h_vec_size, - RBuff.h_vec_size, + RBuff.hidden_size, + RBuff.hidden_size, RBuff.gemm_write_size(), - RBuff.h_vec_size, - RBuff.h_vec_size, + RBuff.hidden_size, + RBuff.hidden_size, 1, // batch count 0, // Stride A 0, // Stride B @@ -5269,7 +5230,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( rnn_data_type, false}; - int hx_shift = get_HxBuff_offset(layer, reverse) + use_batch * RBuff.h_vec_size; + int hx_shift = get_HxBuff_offset(layer, reverse) + use_batch * RBuff.hidden_size; miopenStatus_t gemm_status = CallGemm(handle, gemm_desc, @@ -5322,7 +5283,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( for(int li = static_cast(nLayers) - 1; li >= 0; li--) { propagate_output(nLayers, li); - propagate_hidden(nLayers, li); + propagate_hidden(li); propagate_dhx(li); } @@ -5334,7 +5295,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( false, total_batches, input_size, - RBuff.h_vec_size * bi, + RBuff.hidden_size * bi, hy_stride, in_stride, in_stride, From cdce2f5185a90142a1b4d7a4b4b5f24da827c5bc Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Fri, 27 Oct 2023 16:04:46 +0300 Subject: [PATCH 08/27] Reverted back 
RNNForwardTraining_MS method --- src/ocl/rnnocl.cpp | 388 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 304 insertions(+), 84 deletions(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 8f18171fd4..4f6522b9e9 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -1141,62 +1141,263 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, return layer_id * (static_cast(hidden_size) * max_batch); }; - int bi = dirMode != 0u ? 2 : 1; + int gates_cnt = 4; + int save_points_cnt = 6; - LSTMWeightsBufferHelper WeiBuf(in_vec, hidden_size, nLayers, biasMode * 2, bi); + struct WeightsBufferHelper + { + private: + auto hidden_xinput_size(int hidden_sz, int bidirect_mode) const + { + if(bidirect_mode == 0) + return hidden_sz; + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + } - LSTMReserveBufferHelper RBuff(hidden_size, nLayers, total_batch_size, in_vec); + auto matrix_lin_layer_size(int input_vector_sz, int hidden_vec_sz, int gates) const + { + return (input_vector_sz + hidden_vec_sz) * hidden_vec_sz * gates; + } + size_t bias_start_offset(int input_vector_sz, + int hidden_vec_sz, + int layers_cnt, + int gates, + int bidirect_mode) const + { + if(bidirect_mode == 0) + return matrix_lin_layer_size(input_vector_sz, hidden_vec_sz, gates) + + static_cast(hidden_vec_sz + hidden_xinput_size(hidden_vec_sz, 0)) * + hidden_vec_sz * static_cast(layers_cnt - 1) * gates; - auto call_x_gemm = - [&RBuff, &WeiBuf, &InBuff_strides, &bacc_per_time, &handle, &xDesc, reserveSpace, x, w]( - int layer, int start_time, int time_cnt, float beta_t = 1) { - const auto start_b = bacc_per_time[start_time]; - const auto batch_sz = bacc_per_time[start_time + time_cnt] - start_b; + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + } - const int m = batch_sz, n = RBuff.gemm_write_size(), - k = layer > 0 ? RBuff.hidden_size : RBuff.in_vec_size; - const int lda = layer > 0 ? RBuff.gemm_write_stride() : InBuff_strides.batch, ldb = k, - ldc = RBuff.gemm_write_stride(); + public: + WeightsBufferHelper( + int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode, int gates) + : in_vec(input_vector_sz), + h_vec(hidden_vec_sz), + x_in_vec(hidden_xinput_size(hidden_vec_sz, 0)), + layers(layers_cnt), + gates_cnt(gates), + bias_cnt(bias_mode), + matrix_normal_start_off(matrix_lin_layer_size(input_vector_sz, hidden_vec_sz, gates)), + bias_start_off( + bias_start_offset(input_vector_sz, hidden_vec_sz, layers_cnt, gates, 0)) + { + } - const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, - false, - true, - m, - n, - k, - lda, - ldb, - ldc, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - beta_t, // beta - xDesc.GetType(), - false}; + const int in_vec, h_vec; + const int x_in_vec; // for bidirect TODO - const auto wx_off = WeiBuf.input_weight_offset(layer); - const auto out_offset = RBuff.gemm_write_offset(layer, start_b); + const int layers; + const int gates_cnt; + const int + bias_cnt; // 0 - no bisa; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec + private: + const size_t matrix_normal_start_off; + const size_t bias_start_off; - const auto x_in_offset = layer > 0 - ? RBuff.hidden_offset(layer - 1, start_b) - : static_cast(start_b * InBuff_strides.batch); - const auto in_ptr = layer > 0 ? reserveSpace : x; + public: + auto get_matrix_x_size(int layer_id) const + { + return (layer_id > 0 ? 
x_in_vec : in_vec) * h_vec; + } + auto get_matrix_h_size() const { return h_vec * h_vec; } + auto get_matrix_layer_size(int layer_id) const + { + return get_matrix_x_size(layer_id) * gates_cnt + get_matrix_h_size() * gates_cnt; + } - const miopenStatus_t gemm_status = CallGemm(handle, - gemm_desc, - in_ptr, - x_in_offset, - w, - wx_off, - reserveSpace, - out_offset, - GemmBackend_t::miopengemm); - if(gemm_status != miopenStatusSuccess) - MIOPEN_THROW("GEMM execution failure"); + size_t get_matrix_x_off(int layer_id) const + { + if(layer_id > 0) + return matrix_normal_start_off + + static_cast(layer_id - 1) * get_matrix_layer_size(layer_id); + else + return 0; + }; + + size_t get_matrix_h_off(int layer_id) const + { + if(layer_id > 0) + return get_matrix_x_off(layer_id) + + static_cast(h_vec * x_in_vec * gates_cnt); + else + return get_matrix_x_off(layer_id) + static_cast(h_vec * in_vec) * gates_cnt; + }; + + int bias_vector_size() const { return h_vec; } + int bias_vector_mul_gate() const { return bias_vector_size() * gates_cnt; } + int bias_stride() const { return bias_vector_mul_gate(); } + + size_t bias_relative_off(int layer_id, int bias_id) const + { + return static_cast(layer_id * bias_cnt + bias_id) * gates_cnt * h_vec; + } + + size_t get_bias_off(int layer_id, int bias_id) const + { + return bias_start_off + bias_relative_off(layer_id, bias_id); + } + + } WeiBuf(in_vec, hidden_size, nLayers, biasMode * 2, gates_cnt); + + struct ReserveBufferHelper + { + struct RBuffHelper + { + int element, save_point, batch; + size_t layer; }; + private: + auto Reserve_Buffer_strides(int save_point_sz, + int batches_per_layer, + int save_points, + int bidirect_mode = 0) const + { + const auto element_st = 1; + const auto save_point_st = element_st * save_point_sz; + const auto batch_st = save_point_st * save_points; + const auto layer_st = static_cast(batch_st) * batches_per_layer; + if(bidirect_mode == 0) + return RBuffHelper{element_st, save_point_st, batch_st, layer_st}; + MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); + } + + public: + enum save_point + { + F = 1, + I = 0, + G = 2, + O = 3, + St = 4, + Ht = 5 + }; + + ReserveBufferHelper(int hidden_vec_sz, + int save_point_sz, + int layers_cnt, + int batches_per_layer, + int save_points, + int gates_cnt) + : h_vec(hidden_vec_sz), + save_point_size(save_point_sz), + layers(layers_cnt), + batches(batches_per_layer), + save_points_cnt(save_points), + gates(gates_cnt), + strides(Reserve_Buffer_strides(save_point_sz, batches, save_points, 0)) + { + } + + const int h_vec; + const int save_point_size; // for bidirect TODO + + const int layers; + const int batches; + const int save_points_cnt; + const int gates; + const RBuffHelper strides; + + size_t layer_offset(int layer) const { return static_cast(layer) * strides.layer; } + auto layer_stride() const { return strides.layer; } + + auto gemm_write_size() const { return h_vec * gates; } + auto gemm_write_stride() const + { + return strides.batch; + } // save_point_size * save_points_cnt + + size_t gemm_write_relative_offset(int batch_id) const + { + return static_cast(gemm_write_stride()) * batch_id; + } + + size_t gemm_write_offset(int layer, int batch_id) const + { + return layer_offset(layer) + static_cast(gemm_write_stride()) * batch_id; + } + + auto ht_relative_offset() const { return save_point::Ht * save_point_size; } + + auto ct_relative_offset() const { return save_point::St * save_point_size; } + + auto get_gate_relative_offset(int gate_id) const { return 
gate_id * save_point_size; } + + size_t ht_offset(int layer_id, int batch_id) const + { + return layer_offset(layer_id) + gemm_write_relative_offset(batch_id) + + ht_relative_offset(); + } + + size_t extra_save_point_offset(int layer_id, int batch_id) const + { + return (static_cast(batches) * layers * gemm_write_stride()) // all data offset + + (static_cast(batches) * layer_id) * h_vec + + static_cast(batch_id * h_vec); + } + + } RBuff(hidden_size, hidden_size, nLayers, total_batch_size, save_points_cnt, gates_cnt); + + auto call_x_gemm = [&RBuff, + &WeiBuf, + &InBuff_strides, + &bacc_per_time, + &handle, + &xDesc, + reserveSpace, + x, + w, + hidden_size, + in_vec](int layer, int start_time, int time_cnt, float beta_t = 1) { + const auto start_b = bacc_per_time[start_time]; + const auto batch_sz = bacc_per_time[start_time + time_cnt] - start_b; + + const int m = batch_sz, n = RBuff.gemm_write_size(), k = layer > 0 ? hidden_size : in_vec; + const int lda = layer > 0 ? RBuff.gemm_write_stride() : InBuff_strides.batch, ldb = k, + ldc = RBuff.gemm_write_stride(); + + const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + true, + m, + n, + k, + lda, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + beta_t, // beta + xDesc.GetType(), + false}; + + const auto wx_off = WeiBuf.get_matrix_x_off(layer); + const auto out_offset = RBuff.gemm_write_offset(layer, start_b); + + const auto x_in_offset = layer > 0 ? RBuff.ht_offset(layer - 1, start_b) + : static_cast(start_b * InBuff_strides.batch); + const auto in_ptr = layer > 0 ? reserveSpace : x; + + const miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc, + in_ptr, + x_in_offset, + w, + wx_off, + reserveSpace, + out_offset, + GemmBackend_t::rocblas); + if(gemm_status != miopenStatusSuccess) + MIOPEN_THROW("GEMM execution failure"); + }; + auto call_bias_add = [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w](int layer, float beta_t = 0) { float alpha0 = 1; @@ -1205,17 +1406,17 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const auto bias_desc = miopen::TensorDescriptor(wDesc.GetType(), - std::vector{1, 1, WeiBuf.bias_vector_mul_gate()}, - std::vector{bias_stride, bias_stride, 1}); + std::vector{1, 1, WeiBuf.bias_vector_mul_gate()}, + std::vector{bias_stride, bias_stride, 1}); const auto hidden_interim_desc = miopen::TensorDescriptor( wDesc.GetType(), - std::vector{ - 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_vector_mul_gate()}, - std::vector{ - RBuff.batches_per_layer * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); + std::vector{1, RBuff.batches, WeiBuf.bias_vector_mul_gate()}, + std::vector{ + RBuff.batches * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); - const auto RB_layer_out_off = RBuff.layer_offset(layer); + const auto RB_layer_out_off = RBuff.layer_offset(layer); + const auto w_bias_layer_start_off = WeiBuf.get_bias_off(layer, 0); OpTensor(handle, miopenTensorOpAdd, @@ -1229,8 +1430,9 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hidden_interim_desc, reserveSpace, // C RB_layer_out_off, // A offset - WeiBuf.bias_off(layer), // B offset - RB_layer_out_off); // C offset + w_bias_layer_start_off, // B offset + RB_layer_out_off); // C offset + OpTensor(handle, miopenTensorOpAdd, @@ -1244,7 +1446,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hidden_interim_desc, reserveSpace, RB_layer_out_off, - WeiBuf.bias_off(layer) + bias_stride, + w_bias_layer_start_off + bias_stride, 
RB_layer_out_off); }; @@ -1257,15 +1459,16 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, &xDesc, reserveSpace, hx, - w](int layer, int cur_time) { - const int m = in_n.at(cur_time), n = RBuff.gemm_write_size(), k = RBuff.hidden_size; + w, + hidden_size](int layer, int cur_time) { + const int m = in_n.at(cur_time), n = RBuff.gemm_write_size(), k = hidden_size; - const int lda = (cur_time != 0) ? RBuff.gemm_write_stride() : RBuff.hidden_size, - ldb = RBuff.hidden_size, ldc = RBuff.gemm_write_stride(); + const int lda = (cur_time != 0) ? RBuff.gemm_write_stride() : hidden_size, + ldb = hidden_size, ldc = RBuff.gemm_write_stride(); const auto hx_ptr_offset = (cur_time == 0) ? get_HxBuff_offset(layer) - : RBuff.hidden_offset(layer, bacc_per_time[cur_time - 1]); + : RBuff.ht_offset(layer, bacc_per_time[cur_time - 1]); if(cur_time == 0) if(hx == nullptr) @@ -1299,10 +1502,10 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hx_ptr, hx_ptr_offset, w, - WeiBuf.hidden_weight_offset(layer), + WeiBuf.get_matrix_h_off(layer), reserveSpace, RB_layer_save_points_off, - GemmBackend_t::miopengemm); + GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) MIOPEN_THROW("GEMM execution failure"); @@ -1316,7 +1519,11 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, &wDesc, reserveSpace, cx, - max_batch](int layer_id, int time_id) { + max_batch, + hidden_size](int layer_id, int time_id) { + auto RB_layer_save_points_off = + RBuff.layer_offset(layer_id) + RBuff.gemm_write_relative_offset(bacc_per_time[time_id]); + auto is_seq_begin = time_id == 0; const int direction = 0; @@ -1327,36 +1534,47 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const size_t cx_offset = get_HxBuff_offset(layer_id); - const size_t activ_cell_offset = - RBuff.extra_save_point_offset(layer_id, bacc_per_time[time_id]); + const size_t i_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(0), + f_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(1), + o_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(2), + c_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(3); + + const size_t cell_offset = RB_layer_save_points_off + RBuff.ct_relative_offset(), + hidden_offset = RB_layer_save_points_off + RBuff.ht_relative_offset(); const size_t cell_offset_pre = - (time_id == 0) ? 0 : RBuff.st_offset(layer_id, bacc_per_time[time_id - 1]); + (time_id == 0) ? 
0 + : RBuff.layer_offset(layer_id) + + RBuff.gemm_write_relative_offset(bacc_per_time[time_id - 1]) + + RBuff.ct_relative_offset(); + + const size_t activ_cell_offset = + RBuff.extra_save_point_offset(layer_id, bacc_per_time[time_id]); LSTMForwardHiddenStateUpdate(handle, wDesc.GetType(), false, is_seq_begin, - rnn_direction::Forward, + direction, max_batch, cur_batch, use_batch, - RBuff.hidden_size, + + hidden_size, hy_stride, wei_len, wei_stride, cx, cx_offset, reserveSpace, - RBuff.i_offset(layer_id, bacc_per_time[time_id]), - RBuff.f_offset(layer_id, bacc_per_time[time_id]), - RBuff.g_offset(layer_id, bacc_per_time[time_id]), - RBuff.o_offset(layer_id, bacc_per_time[time_id]), - RBuff.st_offset(layer_id, bacc_per_time[time_id]), - // RBuff.st_offset(layer_id, bacc_per_time[time_id - 1]), + i_offset, + f_offset, + o_offset, + c_offset, + cell_offset, cell_offset_pre, activ_cell_offset, - RBuff.hidden_offset(layer_id, bacc_per_time[time_id])); + hidden_offset); }; auto call_hy_cy_update = [&RBuff, @@ -1369,6 +1587,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hy, cy, max_batch, + hidden_size, seq_len](int layer_id) { if(hy != nullptr || (cy != nullptr)) { @@ -1377,9 +1596,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const std::vector hcy_src_stride{ RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; const std::vector hcy_dst_stride{ - static_cast(RBuff.hidden_size * max_batch), - static_cast(RBuff.hidden_size), - 1}; + static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; for(int time_i = seq_len - 1; time_i >= 0; time_i--) { @@ -1390,10 +1607,13 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, auto batch_id_relative = in_n.at(time_i) - copy_batch; auto batch_id_abs = bacc_per_time[time_i] + batch_id_relative; - auto hcy_batch_offset = batch_id_relative * RBuff.hidden_size; + auto hcy_batch_offset = batch_id_relative * hidden_size; + + auto src_batch_offset = RBuff.layer_offset(layer_id) + + RBuff.gemm_write_relative_offset(batch_id_abs); const std::vector hcy_copy_size{ - 1, static_cast(copy_batch), static_cast(RBuff.hidden_size)}; + 1, static_cast(copy_batch), static_cast(hidden_size)}; auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); @@ -1407,7 +1627,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, reserveSpace, dst_desc, hy, - RBuff.hidden_offset(layer_id, batch_id_abs), + src_batch_offset + RBuff.ht_relative_offset(), hcy_layer_offset + hcy_batch_offset); } @@ -1418,7 +1638,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, reserveSpace, dst_desc, cy, - RBuff.st_offset(layer_id, batch_id_abs), + src_batch_offset + RBuff.ct_relative_offset(), hcy_layer_offset + hcy_batch_offset); } } @@ -1634,7 +1854,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, auto y_dst_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_dst_stride); CopyTensor( - handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.hidden_offset(nLayers - 1, 0), 0); + handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.ht_offset(nLayers - 1, 0), 0); } sync_root_to_all_stream_pull(); From 12e4a433fc9eeb903371090ebd9c133052da147a Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Sat, 4 Nov 2023 00:58:06 +0300 Subject: [PATCH 09/27] Rnn relu bidirectional support draft --- src/ocl/rnnocl.cpp | 654 +++++++++++++++++++++++++++------------------ 1 file changed, 392 insertions(+), 262 deletions(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 
8a988b22c7..06efeb2274 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -87,8 +87,10 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, GRUOffsets RBuff(in_vec_size, hidden_size, nLayers, total_batch_size); - auto get_HxBuff_offset = [&](int layer_id) { - return layer_id * (static_cast(hidden_size) * max_batch); + int bi = dirMode != 0u ? 2 : 1; + + auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int reverse = 0) { + return (static_cast(hidden_size) * max_batch) * (bi * layer_id + reverse); }; auto call_gru_input_gemm = [*this, @@ -743,6 +745,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, return; std::vector batches; + std::vector rbatches; + int in_vec_size = xDesc.GetLengths()[1]; int out_vec_size = yDesc.GetLengths()[1]; @@ -753,6 +757,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, int total_batch_size = 0; // accumulated batches per time std::vector bacc_per_time(seq_len + 1); + std::vector rbacc_per_time(seq_len + 1); for(int i = 0; i < seq_len; i++) { @@ -761,16 +766,27 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, batches.push_back(seq_array[i]); } - bacc_per_time[seq_len] = total_batch_size; + int rtotal_batch_size = 0; + for(int i = seq_len; i > 0; i--) + { + rbacc_per_time[i] = rtotal_batch_size; + rtotal_batch_size += seq_array[i]; + rbatches.push_back(seq_array[i]); + } - auto get_HxBuff_offset = [&](int layer_id) { - return layer_id * (static_cast(hidden_size) * max_batch); - }; + bacc_per_time[seq_len] = total_batch_size; + rbacc_per_time[0] = total_batch_size; int bi = dirMode != 0u ? 2 : 1; int wei_stride = hidden_size * bi * static_cast(nHiddenTensorsPerLayer); + + auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int reverse) { + return (static_cast(hidden_size) * max_batch) * (bi * layer_id + reverse); + }; + ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2, bi, wei_stride); - ReluReserveBufferOffsets RBuff(in_vec_size, hidden_size, nLayers, total_batch_size, max_batch); + ReluReserveBufferOffsets RBuff( + in_vec_size, hidden_size, nLayers, total_batch_size, max_batch, bi); ActivationDescriptor activDesc; @@ -787,7 +803,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, [&RBuff, &WeiBuf, &in_vec_size, &handle, &xDesc, reserveSpace, x, w, hidden_size]( int layer, float beta_t = 1) { const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size(), - k = layer > 0 ? RBuff.hidden_size : RBuff.in_vec_size; + k = layer > 0 ? RBuff.gemm_write_size() : RBuff.in_vec_size; const int lda = layer > 0 ? RBuff.gemm_write_stride() : RBuff.in_vec_size, ldb = k, ldc = RBuff.gemm_write_stride(); @@ -813,7 +829,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const auto input_weight_offset = WeiBuf.input_weight_offset(layer); const auto output_offset = RBuff.layer_offset(layer); - const auto input_offset = layer > 0 ? RBuff.hidden_offset(layer - 1) : 0; + const auto input_offset = layer > 0 ? RBuff.hidden_offset(layer - 1, 0, 0) : 0; const auto input_ptr = layer > 0 ? 
reserveSpace : x; @@ -879,69 +895,118 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, RB_layer_out_off); }; - auto call_relu_tan_hidden_gemm = [&RBuff, - &WeiBuf, - &get_HxBuff_offset, - &bacc_per_time, - &batches, - &handle, - &xDesc, - reserveSpace, - hx, - w](int layer, int cur_time) { - if(cur_time == 0 && hx == nullptr) - return; + auto call_relu_tan_hidden_gemm = + [&RBuff, + &WeiBuf, + &get_HxBuff_offset, + &bacc_per_time, + &rbacc_per_time, + &batches, + &rbatches, + &handle, + &xDesc, + reserveSpace, + hx, + seq_len, + w](int layer, int accumulated_batches, int time, int cur_batch, int direction) { + if(time == 0 && hx == nullptr) + return; - const int m = batches.at(cur_time), n = RBuff.gemm_write_size(), k = RBuff.hidden_size; + // int cur_time = direction == rnn_direction::Forward ? time : seq_len - 1 - time; + // int use_time = direction == rnn_direction::Forward ? cur_time : cur_time + 1; - const int lda = (cur_time != 0) ? RBuff.gemm_write_stride() : RBuff.hidden_size; + const int m = direction == rnn_direction::Forward + ? batches.at(time) + : time == 0 ? batches.at(seq_len - 1) : batches.at(seq_len - time); - const int ldb = RBuff.hidden_size, ldc = RBuff.gemm_write_stride(); + // const int m = time == 0 ? batches.at(cur_time) : batches.at(use_time); + const int n = RBuff.hidden_size, k = RBuff.hidden_size; - const miopen::GemmDescriptor gemm_desc_hx = GemmDescriptor{false, - false, - true, - m, - n, - k, - lda, - ldb, - ldc, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - 1, // beta - xDesc.GetType(), - false}; + const int lda = (time != 0) ? RBuff.gemm_write_stride() : RBuff.hidden_size; - const auto hidden_offset = (cur_time == 0) - ? get_HxBuff_offset(layer) - : RBuff.hidden_offset(layer, bacc_per_time[cur_time - 1]); + const int ldb = RBuff.hidden_size, ldc = RBuff.gemm_write_stride(); - const auto ht_ptr = cur_time > 0 ? reserveSpace : hx; + const auto ht_ptr = time > 0 ? reserveSpace : hx; - const auto RB_batch_save_points_off = - RBuff.gemm_write_offset(layer, bacc_per_time[cur_time]); + if(time != 0 && direction == rnn_direction::Backward && hx != nullptr && + batches.at(seq_len - time - 1) > batches.at(seq_len - time)) + { + miopen::GemmDescriptor gemm_desc = + GemmDescriptor{false, + false, + true, + (batches.at(seq_len - time - 1) - batches.at(seq_len - time)), + n, + k, + RBuff.hidden_size, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + xDesc.GetType(), + false}; + + const miopenStatus_t gemm_status = CallGemm( + handle, + gemm_desc, + hx, + get_HxBuff_offset(layer, direction) + + batches.at(seq_len - time) * RBuff.hidden_size, + w, + WeiBuf.hidden_weight_offset(layer, direction), + reserveSpace, + RBuff.gemm_write_offset( + layer, accumulated_batches + batches.at(seq_len - time), direction), + GemmBackend_t::miopengemm); + } - const miopenStatus_t gemm_status = CallGemm(handle, - gemm_desc_hx, - ht_ptr, - hidden_offset, - w, - WeiBuf.hidden_weight_offset(layer), - reserveSpace, - RB_batch_save_points_off, - GemmBackend_t::miopengemm); + const miopen::GemmDescriptor gemm_desc_hx = GemmDescriptor{false, + false, + true, + m, + n, + k, + lda, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + xDesc.GetType(), + false}; + + const auto hidden_offset = (time == 0) + ? 
get_HxBuff_offset(layer, direction) + : RBuff.hidden_offset(layer, cur_batch, direction); + + const auto RB_batch_save_points_off = + RBuff.gemm_write_offset(layer, accumulated_batches, direction); + + const miopenStatus_t gemm_status = + CallGemm(handle, + gemm_desc_hx, + ht_ptr, + hidden_offset, + w, + WeiBuf.hidden_weight_offset(layer, direction), + reserveSpace, + RB_batch_save_points_off, + GemmBackend_t::miopengemm); - if(gemm_status != miopenStatusSuccess) - MIOPEN_THROW("GEMM execution failure"); - }; + if(gemm_status != miopenStatusSuccess) + MIOPEN_THROW("GEMM execution failure"); + }; auto call_relu_tan_hidden_state_update = - [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &activDesc](int layer_id, - int time_id) { + [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &activDesc, seq_len]( + int layer_id, int time_id, int reverse) { float alpha = 1, beta = 0; const std::vector tensor_size{1, @@ -956,9 +1021,10 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); const auto RB_layer_save_points_off = - RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]); + RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id], reverse); - const auto hidden_offset = RBuff.hidden_offset(layer_id, bacc_per_time[time_id]); + const auto hidden_offset = + RBuff.hidden_offset(layer_id, bacc_per_time[time_id], reverse); activDesc.Forward(handle, &alpha, @@ -979,53 +1045,57 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto call_relu_tan_update_output = [&RBuff, &get_HxBuff_offset, - &bacc_per_time, &batches, &handle, &wDesc, + &bi, reserveSpace, hy, max_batch, - seq_len](int layer_id) { + &bacc_per_time, + seq_len](int layer_id, int time, int reverse) { + int base_time = reverse == 0 ? time : seq_len - 1 - time; + if(hy == nullptr) return; - auto hcy_layer_offset = get_HxBuff_offset(layer_id); - const std::vector hcy_src_stride{ RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; + const std::vector hcy_dst_stride{static_cast(RBuff.hidden_size * max_batch), static_cast(RBuff.hidden_size), 1}; - for(int time_i = seq_len - 1; time_i >= 0; time_i--) - { - auto copy_batch = (time_i == seq_len - 1) ? batches.at(time_i) - : batches.at(time_i) - batches.at(time_i + 1); - if(copy_batch > 0) - { - auto batch_id_relative = batches.at(time_i) - copy_batch; - auto batch_id_abs = bacc_per_time[time_i] + batch_id_relative; - - auto hcy_batch_offset = batch_id_relative * RBuff.hidden_size; + auto hcy_layer_offset = get_HxBuff_offset(layer_id, reverse); - const std::vector hcy_copy_size{ - 1, static_cast(copy_batch), static_cast(RBuff.hidden_size)}; - - auto src_desc = - miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); - auto dst_desc = - miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_dst_stride); - - CopyTensor(handle, - src_desc, - reserveSpace, - dst_desc, - hy, - RBuff.hidden_offset(layer_id, batch_id_abs), - hcy_layer_offset + hcy_batch_offset); - } - } + auto copy_batch = + reverse == 0 + ? (time == seq_len - 1) ? batches.at(time) : batches.at(time) - batches.at(time + 1) + : (time == seq_len - 1) + ? 
batches.at(seq_len - 1 - time) + : batches.at(seq_len - 1 - time) - batches.at(seq_len - 2 - time); + if(copy_batch > 0) + { + auto batch_id_relative = batches.at(base_time) - copy_batch; + auto batch_id_abs = bacc_per_time[base_time] + batch_id_relative; + auto hcy_batch_offset = batch_id_relative * RBuff.hidden_size; + + const std::vector hcy_copy_size{ + 1, static_cast(copy_batch), static_cast(RBuff.hidden_size)}; + + auto src_desc = + miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); + auto dst_desc = + miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_dst_stride); + + CopyTensor(handle, + src_desc, + reserveSpace, + dst_desc, + hy, + RBuff.hidden_offset(layer_id, batch_id_abs, reverse), + hcy_layer_offset + hcy_batch_offset); + }; }; for(int layer_id = 0; layer_id < nLayers; layer_id++) @@ -1034,12 +1104,41 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, if(biasMode != 0u) call_relu_tan_bias_add(layer_id); + int accumulated_batches = 0; + int reverse_accumulated_batches = RBuff.batches_per_layer; + for(int time = 0; time < seq_len; time++) { - call_relu_tan_hidden_gemm(layer_id, time); - call_relu_tan_hidden_state_update(layer_id, time); + reverse_accumulated_batches -= batches.at(seq_len - 1 - time); + call_relu_tan_hidden_gemm(layer_id, + accumulated_batches, + time, + time == 0 ? 0 : accumulated_batches - batches.at(time - 1), + rnn_direction::Forward); + + call_relu_tan_hidden_state_update(layer_id, time, rnn_direction::Forward); + accumulated_batches += batches.at(time); + + if(dirMode == 0u) + continue; + + call_relu_tan_hidden_gemm(layer_id, + reverse_accumulated_batches, + time, + reverse_accumulated_batches + batches.at(seq_len - 1 - time), + rnn_direction::Backward); + + call_relu_tan_hidden_state_update( + layer_id, seq_len - 1 - time, rnn_direction::Backward); + } + + for(int time = seq_len - 1; time >= 0; time--) + { + call_relu_tan_update_output(layer_id, time, rnn_direction::Forward); + if(dirMode == 0u) + continue; + call_relu_tan_update_output(layer_id, time, rnn_direction::Backward); } - call_relu_tan_update_output(layer_id); } // output tensor copy @@ -1057,8 +1156,13 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_src_stride); auto y_dst_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_dst_stride); - CopyTensor( - handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.hidden_offset(nLayers - 1, 0), 0); + CopyTensor(handle, + src_desc, + reserveSpace, + y_dst_desc, + y, + RBuff.hidden_offset(nLayers - 1, 0, rnn_direction::Forward), + 0); } #else (void)handle; @@ -1141,7 +1245,8 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, return layer_id * (static_cast(hidden_size) * max_batch); }; - int bi = dirMode != 0u ? 2 : 1; + int gates_cnt = 4; + int save_points_cnt = 6; struct WeightsBufferHelper { @@ -1341,39 +1446,48 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, } RBuff(hidden_size, hidden_size, nLayers, total_batch_size, save_points_cnt, gates_cnt); - auto call_x_gemm = - [&RBuff, &WeiBuf, &InBuff_strides, &bacc_per_time, &handle, &xDesc, reserveSpace, x, w]( - int layer, int start_time, int time_cnt, float beta_t = 1) { - const auto start_b = bacc_per_time[start_time]; - const auto batch_sz = bacc_per_time[start_time + time_cnt] - start_b; - - const int m = batch_sz, n = RBuff.gemm_write_size(), - k = layer > 0 ? RBuff.hidden_size : RBuff.in_vec_size; - const int lda = layer > 0 ? 
RBuff.gemm_write_stride() : InBuff_strides.batch, ldb = k, - ldc = RBuff.gemm_write_stride(); + auto call_x_gemm = [&RBuff, + &WeiBuf, + &InBuff_strides, + &bacc_per_time, + &handle, + &xDesc, + reserveSpace, + x, + w, + hidden_size, + in_vec](int layer, int start_time, int time_cnt, float beta_t = 1) { + const auto start_b = bacc_per_time[start_time]; + const auto batch_sz = bacc_per_time[start_time + time_cnt] - start_b; + + const int m = batch_sz, n = RBuff.gemm_write_size(), k = layer > 0 ? hidden_size : in_vec; + const int lda = layer > 0 ? RBuff.gemm_write_stride() : InBuff_strides.batch, ldb = k, + ldc = RBuff.gemm_write_stride(); - const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, - false, - true, - m, - n, - k, - lda, - ldb, - ldc, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - beta_t, // beta - xDesc.GetType(), - false}; + const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + true, + m, + n, + k, + lda, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + beta_t, // beta + xDesc.GetType(), + false}; + const auto wx_off = WeiBuf.get_matrix_x_off(layer); + const auto out_offset = RBuff.gemm_write_offset(layer, start_b); const auto x_in_offset = layer > 0 ? RBuff.ht_offset(layer - 1, start_b) : static_cast(start_b * InBuff_strides.batch); - const auto in_ptr = layer > 0 ? reserveSpace : x; + const auto in_ptr = layer > 0 ? reserveSpace : x; const miopenStatus_t gemm_status = CallGemm(handle, gemm_desc, @@ -1396,17 +1510,17 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const auto bias_desc = miopen::TensorDescriptor(wDesc.GetType(), - std::vector{1, 1, WeiBuf.bias_vector_mul_gate()}, - std::vector{bias_stride, bias_stride, 1}); + std::vector{1, 1, WeiBuf.bias_vector_mul_gate()}, + std::vector{bias_stride, bias_stride, 1}); const auto hidden_interim_desc = miopen::TensorDescriptor( wDesc.GetType(), - std::vector{ - 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_vector_mul_gate()}, - std::vector{ - RBuff.batches_per_layer * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); + std::vector{1, RBuff.batches, WeiBuf.bias_vector_mul_gate()}, + std::vector{ + RBuff.batches * RBuff.gemm_write_stride(), RBuff.gemm_write_stride(), 1}); - const auto RB_layer_out_off = RBuff.layer_offset(layer); + const auto RB_layer_out_off = RBuff.layer_offset(layer); + const auto w_bias_layer_start_off = WeiBuf.get_bias_off(layer, 0); OpTensor(handle, miopenTensorOpAdd, @@ -1421,7 +1535,8 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, reserveSpace, // C RB_layer_out_off, // A offset w_bias_layer_start_off, // B offset - RB_layer_out_off); // C offset + RB_layer_out_off // C offset + ); OpTensor(handle, miopenTensorOpAdd, @@ -1435,7 +1550,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hidden_interim_desc, reserveSpace, RB_layer_out_off, - WeiBuf.bias_off(layer) + bias_stride, + w_bias_layer_start_off + bias_stride, RB_layer_out_off); }; @@ -1448,15 +1563,16 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, &xDesc, reserveSpace, hx, - w](int layer, int cur_time) { - const int m = in_n.at(cur_time), n = RBuff.gemm_write_size(), k = RBuff.hidden_size; + w, + hidden_size](int layer, int cur_time) { + const int m = in_n.at(cur_time), n = RBuff.gemm_write_size(), k = hidden_size; - const int lda = (cur_time != 0) ? 
RBuff.gemm_write_stride() : RBuff.hidden_size, - ldb = RBuff.hidden_size, ldc = RBuff.gemm_write_stride(); + const int lda = (cur_time != 0) ? RBuff.gemm_write_stride() : hidden_size, + ldb = hidden_size, ldc = RBuff.gemm_write_stride(); const auto hx_ptr_offset = (cur_time == 0) ? get_HxBuff_offset(layer) - : RBuff.hidden_offset(layer, bacc_per_time[cur_time - 1]); + : RBuff.ht_offset(layer, bacc_per_time[cur_time - 1]); if(cur_time == 0) if(hx == nullptr) @@ -1490,7 +1606,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hx_ptr, hx_ptr_offset, w, - WeiBuf.hidden_weight_offset(layer), + WeiBuf.get_matrix_h_off(layer), reserveSpace, RB_layer_save_points_off, GemmBackend_t::rocblas); @@ -1507,7 +1623,11 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, &wDesc, reserveSpace, cx, - max_batch](int layer_id, int time_id) { + max_batch, + hidden_size](int layer_id, int time_id) { + auto RB_layer_save_points_off = + RBuff.layer_offset(layer_id) + RBuff.gemm_write_relative_offset(bacc_per_time[time_id]); + auto is_seq_begin = time_id == 0; const int direction = 0; @@ -1518,36 +1638,47 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const size_t cx_offset = get_HxBuff_offset(layer_id); - const size_t activ_cell_offset = - RBuff.extra_save_point_offset(layer_id, bacc_per_time[time_id]); + const size_t i_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(0), + f_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(1), + o_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(2), + c_offset = RB_layer_save_points_off + RBuff.get_gate_relative_offset(3); + + const size_t cell_offset = RB_layer_save_points_off + RBuff.ct_relative_offset(), + hidden_offset = RB_layer_save_points_off + RBuff.ht_relative_offset(); const size_t cell_offset_pre = - (time_id == 0) ? 0 : RBuff.st_offset(layer_id, bacc_per_time[time_id - 1]); + (time_id == 0) ? 
0 + : RBuff.layer_offset(layer_id) + + RBuff.gemm_write_relative_offset(bacc_per_time[time_id - 1]) + + RBuff.ct_relative_offset(); + + const size_t activ_cell_offset = + RBuff.extra_save_point_offset(layer_id, bacc_per_time[time_id]); LSTMForwardHiddenStateUpdate(handle, wDesc.GetType(), false, is_seq_begin, - rnn_direction::Forward, + direction, max_batch, cur_batch, use_batch, - RBuff.hidden_size, + + hidden_size, hy_stride, wei_len, wei_stride, cx, cx_offset, reserveSpace, - RBuff.i_offset(layer_id, bacc_per_time[time_id]), - RBuff.f_offset(layer_id, bacc_per_time[time_id]), - RBuff.g_offset(layer_id, bacc_per_time[time_id]), - RBuff.o_offset(layer_id, bacc_per_time[time_id]), - RBuff.st_offset(layer_id, bacc_per_time[time_id]), - // RBuff.st_offset(layer_id, bacc_per_time[time_id - 1]), + i_offset, + f_offset, + o_offset, + c_offset, + cell_offset, cell_offset_pre, activ_cell_offset, - RBuff.hidden_offset(layer_id, bacc_per_time[time_id])); + hidden_offset); }; auto call_hy_cy_update = [&RBuff, @@ -1560,6 +1691,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, hy, cy, max_batch, + hidden_size, seq_len](int layer_id) { if(hy != nullptr || (cy != nullptr)) { @@ -1568,9 +1700,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const std::vector hcy_src_stride{ RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; const std::vector hcy_dst_stride{ - static_cast(RBuff.hidden_size * max_batch), - static_cast(RBuff.hidden_size), - 1}; + static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; for(int time_i = seq_len - 1; time_i >= 0; time_i--) { @@ -1581,10 +1711,13 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, auto batch_id_relative = in_n.at(time_i) - copy_batch; auto batch_id_abs = bacc_per_time[time_i] + batch_id_relative; - auto hcy_batch_offset = batch_id_relative * RBuff.hidden_size; + auto hcy_batch_offset = batch_id_relative * hidden_size; + + auto src_batch_offset = RBuff.layer_offset(layer_id) + + RBuff.gemm_write_relative_offset(batch_id_abs); const std::vector hcy_copy_size{ - 1, static_cast(copy_batch), static_cast(RBuff.hidden_size)}; + 1, static_cast(copy_batch), static_cast(hidden_size)}; auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); @@ -1598,7 +1731,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, reserveSpace, dst_desc, hy, - RBuff.hidden_offset(layer_id, batch_id_abs), + src_batch_offset + RBuff.ht_relative_offset(), hcy_layer_offset + hcy_batch_offset); } @@ -1609,7 +1742,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, reserveSpace, dst_desc, cy, - RBuff.st_offset(layer_id, batch_id_abs), + src_batch_offset + RBuff.ct_relative_offset(), hcy_layer_offset + hcy_batch_offset); } } @@ -1825,7 +1958,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, auto y_dst_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_dst_stride); CopyTensor( - handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.hidden_offset(nLayers - 1, 0), 0); + handle, src_desc, reserveSpace, y_dst_desc, y, RBuff.ht_offset(nLayers - 1, 0), 0); } sync_root_to_all_stream_pull(); @@ -3487,7 +3620,7 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( if(rnnMode == miopenLSTM && algoMode == miopenRNNdefault && !use_dropout && nLayers > 1 && dirMode == miopenRNNunidirection && inputMode != miopenRNNskip && !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{})) && xDesc[0].GetType() == miopenFloat && - seqLen >= 32) + seqLen >= 1) { RNNForwardTraining_MS(handle, in_n, 
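For orientation, the addressing scheme that the restored RNNForwardTraining_MS helpers above are built around can be condensed into a few lines. The sketch below is illustrative only: RBuffSketch and its main() driver are hypothetical names, not MIOpen code; they merely restate the stride arithmetic of ReserveBufferHelper, where every (layer, batch) row of the reserve buffer stores six save points of hidden_size elements each (the four gate pre-activations, then the cell state St and the hidden state Ht).

// Standalone sketch (not part of the patch): mirrors the reserve-buffer layout of
// ReserveBufferHelper. Each (layer, batch) row holds save_points_cnt = 6 save points
// of save_point_size elements: gate pre-activations in save points 0-3, the cell
// state St in save point 4 and the hidden state Ht in save point 5.
#include <cstddef>
#include <iostream>

struct RBuffSketch // hypothetical name, for illustration only
{
    int save_point_size;   // == hidden_size for this solver
    int save_points_cnt;   // 6 save points per batch row
    int batches_per_layer; // number of packed rows per layer (total_batch_size)

    size_t batch_stride() const { return static_cast<size_t>(save_point_size) * save_points_cnt; }
    size_t layer_stride() const { return batch_stride() * batches_per_layer; }

    size_t save_point_offset(int layer, int batch, int sp) const // sp in [0, save_points_cnt)
    {
        return layer * layer_stride() + batch * batch_stride() +
               static_cast<size_t>(sp) * save_point_size;
    }
    size_t ct_offset(int layer, int batch) const { return save_point_offset(layer, batch, 4); } // St
    size_t ht_offset(int layer, int batch) const { return save_point_offset(layer, batch, 5); } // Ht
};

int main()
{
    const RBuffSketch rb{/*save_point_size=*/128, /*save_points_cnt=*/6, /*batches_per_layer=*/96};
    // The hidden state written at (layer 0, batch 10) is what the layer-1 input GEMM
    // reads back; call_x_gemm and call_hy_cy_update resolve it through offsets of this form.
    std::cout << rb.ht_offset(0, 10) << '\n';
    return 0;
}

Keeping Ht and St at fixed offsets inside every row is what lets the next layer's input GEMM and the hy/cy copies read hidden and cell state with plain strided accesses (lda equal to the row stride) over the same reserve buffer.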
@@ -3514,8 +3647,9 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( return; } - if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && nLayers > 1 && - dirMode == miopenRNNunidirection && inputMode != miopenRNNskip && + if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && + // nLayers > 1 && + // dirMode == miopenRNNunidirection && inputMode != miopenRNNskip && !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) { RNNForwardTrainingTanhRelu(handle, @@ -4013,6 +4147,7 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( for(int ti = 0; ti < seqLen; ti++) { baccbi -= in_n.at(seqLen - 1 - ti); + wei_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; int pretime_shift = 0; int use_time = 0; @@ -4022,6 +4157,7 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( int cur_time = ri == 0 ? ti : seqLen - 1 - ti; int cur_batch = ri == 0 ? bacc : baccbi; offset = hid_shift + cur_batch * hy_stride; + if(ti > 0) { pretime_shift = @@ -5071,12 +5207,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( ReluWeightOffsets WeiBuf(input_size, hidden_size, nLayers, biasMode * 2, bi, wei_stride); - ReluReserveBufferOffsets RBuff(input_size, - hidden_size, - nLayers, - total_batches, - max_batch, - dirMode == miopenRNNbidirection); + ReluReserveBufferOffsets RBuff(input_size, hidden_size, nLayers, total_batches, max_batch, bi); auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int reverse) { return (static_cast(hidden_size) * max_batch) * (bi * layer_id + reverse); @@ -5168,7 +5299,8 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( float beta_t = 0; std::vector hx_stride{batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; - std::vector reserve_stride{static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; + std::vector reserve_stride{ + static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; std::vector hx_size{1, batches.at(cur_time), RBuff.hidden_size}; std::vector reserve_size{1, batches.at(cur_time), RBuff.hidden_size}; @@ -5194,100 +5326,98 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( RBuff.gemm_write_offset(layer, accumulated_batches, reverse)); }; - auto propagate_hidden_prev = [&RBuff, - &handle, - batches, - &rnn_data_type, - dhy, - workSpace, - &get_HxBuff_offset, - WeiBuf, - w](int layer, - int accumulated_batches, - int cur_time, - int use_time, - int pre_batch, - int reverse) { - if(reverse == 0 && dhy != nullptr && batches.at(cur_time) > batches.at(use_time)) - { - std::vector hx_stride{batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; - - std::vector reserve_stride{static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - - std::vector hx_size{ - 1, batches.at(cur_time) - batches.at(use_time), RBuff.hidden_size}; - - std::vector reserve_size{ - 1, batches.at(cur_time) - batches.at(use_time), RBuff.hidden_size}; - - float alpha0 = 1; - float alpha1 = 1; - float beta_t = 0; - - auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); - auto reserve_desc = - miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); - - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hx_desc, - dhy, - &alpha1, - reserve_desc, - workSpace, - &beta_t, - reserve_desc, - workSpace, - (get_HxBuff_offset(layer, reverse) + batches.at(use_time) * RBuff.hidden_size), - RBuff.gemm_write_offset(layer, accumulated_batches + batches.at(use_time)), - RBuff.gemm_write_offset(layer, accumulated_batches + batches.at(use_time))); - } + auto 
propagate_hidden_prev = + [&RBuff, &handle, batches, &rnn_data_type, dhy, workSpace, &get_HxBuff_offset, WeiBuf, w]( + int layer, + int accumulated_batches, + int cur_time, + int use_time, + int pre_batch, + int reverse) { + if(reverse == 0 && dhy != nullptr && batches.at(cur_time) > batches.at(use_time)) + { + std::vector hx_stride{batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; + + std::vector reserve_stride{ + static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; + + std::vector hx_size{ + 1, batches.at(cur_time) - batches.at(use_time), RBuff.hidden_size}; + + std::vector reserve_size{ + 1, batches.at(cur_time) - batches.at(use_time), RBuff.hidden_size}; + + float alpha0 = 1; + float alpha1 = 1; + float beta_t = 0; + + auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); + auto reserve_desc = + miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); + + OpTensor( + handle, + miopenTensorOpAdd, + &alpha0, + hx_desc, + dhy, + &alpha1, + reserve_desc, + workSpace, + &beta_t, + reserve_desc, + workSpace, + (get_HxBuff_offset(layer, reverse) + batches.at(use_time) * RBuff.hidden_size), + RBuff.gemm_write_offset( + layer, accumulated_batches + batches.at(use_time), reverse), + RBuff.gemm_write_offset( + layer, accumulated_batches + batches.at(use_time), reverse)); + } - if(batches.at(use_time) <= 0) - return; + if(batches.at(use_time) <= 0) + return; - miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, - false, - false, - batches.at(use_time), - RBuff.hidden_size, - RBuff.hidden_size, - RBuff.gemm_write_size(), - RBuff.hidden_size, - RBuff.gemm_write_size(), - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - 1, // beta - rnn_data_type, - false}; + miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + false, + batches.at(use_time), + RBuff.hidden_size, + RBuff.hidden_size, + RBuff.gemm_write_size(), + RBuff.hidden_size, + RBuff.gemm_write_size(), + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + rnn_data_type, + false}; - miopenStatus_t gemm_status = - CallGemm(handle, - gemm_desc, - workSpace, - RBuff.gemm_write_offset(layer, pre_batch, reverse), - w, - WeiBuf.hidden_weight_offset(layer, reverse), - workSpace, - RBuff.gemm_write_offset(layer, accumulated_batches, reverse), - GemmBackend_t::miopengemm); + miopenStatus_t gemm_status = + CallGemm(handle, + gemm_desc, + workSpace, + RBuff.gemm_write_offset(layer, pre_batch, reverse), + w, + WeiBuf.hidden_weight_offset(layer, reverse), + workSpace, + RBuff.gemm_write_offset(layer, accumulated_batches, reverse), + GemmBackend_t::miopengemm); - if(gemm_status != miopenStatusSuccess) - { - if(gemm_status == miopenStatusNotImplemented) - { - MIOPEN_LOG_E("GEMM not implemented"); - } - else + if(gemm_status != miopenStatusSuccess) { - MIOPEN_LOG_E("GEMM failed"); + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } } - } - }; + }; auto propagate_hidden = [*this, &RBuff, From d2a166b85797d6d0970ebd95e6ff419ca4cf41a7 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Thu, 9 Nov 2023 20:52:41 +0300 Subject: [PATCH 10/27] Refactored Relu forward --- src/ocl/rnnocl.cpp | 323 ++++++++++++++++++++++----------------------- 1 file changed, 161 insertions(+), 162 deletions(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 06efeb2274..7e1b349b81 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp 
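This commit replaces the running reverse_accumulated_batches counters with precomputed rbatches / rbacc_per_time tables. The standalone sketch below (a hypothetical make_offsets helper plus a main() driver, not MIOpen code) restates the bookkeeping introduced in the hunks that follow and the invariant the backward direction relies on: for reverse step t, rbacc_per_time[t + 1] is the packed-buffer row offset of real time step seq_len - 1 - t.

// Standalone sketch (not part of the patch): forward and reverse prefix tables for a
// packed sequence whose per-time batch counts are non-increasing.
#include <cassert>
#include <vector>

struct BatchOffsets // hypothetical helper type, for illustration only
{
    std::vector<int> bacc;  // bacc[t]  = row offset of time step t; bacc[seq_len] = total rows
    std::vector<int> rbacc; // rbacc[t + 1] = row offset of real time step seq_len - 1 - t
};

BatchOffsets make_offsets(const std::vector<int>& seq_array)
{
    const int seq_len = static_cast<int>(seq_array.size());
    BatchOffsets r{std::vector<int>(seq_len + 1, 0), std::vector<int>(seq_len + 1, 0)};

    int total = 0;
    for(int t = 0; t < seq_len; ++t)
    {
        r.bacc[t] = total;
        total += seq_array[t];
    }
    r.bacc[seq_len] = total;

    // Walk time in reverse and subtract, as the refactored loop below does;
    // rbacc[seq_len] stays 0, the offset of the first real time step.
    int rtotal = total;
    for(int t = seq_len - 1; t >= 0; --t)
    {
        r.rbacc[seq_len - 1 - t] = rtotal;
        rtotal -= seq_array[t];
    }
    r.rbacc[0] = total;
    return r;
}

int main()
{
    const std::vector<int> seq_array{4, 3, 3, 1}; // batches per time step
    const auto o      = make_offsets(seq_array);
    const int seq_len = static_cast<int>(seq_array.size());
    for(int t = 0; t < seq_len; ++t)
        assert(o.rbacc[t + 1] == o.bacc[seq_len - 1 - t]); // reverse table = forward table read backwards
    return 0;
}

With these tables in place, call_relu_tan_hidden_gemm and call_relu_tan_update_output can take a plain (layer, time, direction) triple instead of threading accumulated batch counters through the time loop.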
@@ -766,11 +766,11 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, batches.push_back(seq_array[i]); } - int rtotal_batch_size = 0; - for(int i = seq_len; i > 0; i--) + int rtotal_batch_size = total_batch_size; + for(int i = seq_len - 1; i >= 0; i--) { - rbacc_per_time[i] = rtotal_batch_size; - rtotal_batch_size += seq_array[i]; + rbacc_per_time[seq_len - 1 - i] = rtotal_batch_size; + rtotal_batch_size -= seq_array[i]; rbatches.push_back(seq_array[i]); } @@ -780,9 +780,11 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, int bi = dirMode != 0u ? 2 : 1; int wei_stride = hidden_size * bi * static_cast(nHiddenTensorsPerLayer); - auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int reverse) { - return (static_cast(hidden_size) * max_batch) * (bi * layer_id + reverse); - }; + auto get_HxBuff_offset = + [&bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { + return (static_cast(hidden_size) * (max_batch)) * (bi * layer_id + reverse) + + hidden_size * batch_id; + }; ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2, bi, wei_stride); ReluReserveBufferOffsets RBuff( @@ -895,123 +897,131 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, RB_layer_out_off); }; - auto call_relu_tan_hidden_gemm = - [&RBuff, - &WeiBuf, - &get_HxBuff_offset, - &bacc_per_time, - &rbacc_per_time, - &batches, - &rbatches, - &handle, - &xDesc, - reserveSpace, - hx, - seq_len, - w](int layer, int accumulated_batches, int time, int cur_batch, int direction) { - if(time == 0 && hx == nullptr) - return; - - // int cur_time = direction == rnn_direction::Forward ? time : seq_len - 1 - time; - // int use_time = direction == rnn_direction::Forward ? cur_time : cur_time + 1; - - const int m = direction == rnn_direction::Forward - ? batches.at(time) - : time == 0 ? batches.at(seq_len - 1) : batches.at(seq_len - time); - - // const int m = time == 0 ? batches.at(cur_time) : batches.at(use_time); - const int n = RBuff.hidden_size, k = RBuff.hidden_size; + auto call_relu_tan_hidden_gemm = [&RBuff, + &WeiBuf, + &get_HxBuff_offset, + &bacc_per_time, + &rbacc_per_time, + &batches, + &rbatches, + &handle, + &xDesc, + reserveSpace, + hx, + w](int layer, int time, int direction) { + if(time == 0 && hx == nullptr) + return; - const int lda = (time != 0) ? RBuff.gemm_write_stride() : RBuff.hidden_size; + const int m = direction == rnn_direction::Forward + ? batches.at(time) + : time == 0 ? rbatches.at(0) : rbatches.at(time - 1); - const int ldb = RBuff.hidden_size, ldc = RBuff.gemm_write_stride(); + const int n = RBuff.hidden_size, k = RBuff.hidden_size; - const auto ht_ptr = time > 0 ? reserveSpace : hx; + const int lda = (time != 0) ? 
RBuff.gemm_write_stride() : RBuff.hidden_size; - if(time != 0 && direction == rnn_direction::Backward && hx != nullptr && - batches.at(seq_len - time - 1) > batches.at(seq_len - time)) - { - miopen::GemmDescriptor gemm_desc = - GemmDescriptor{false, - false, - true, - (batches.at(seq_len - time - 1) - batches.at(seq_len - time)), - n, - k, - RBuff.hidden_size, - ldb, - ldc, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - 1, // beta - xDesc.GetType(), - false}; + const int ldb = RBuff.hidden_size, ldc = RBuff.gemm_write_stride(); - const miopenStatus_t gemm_status = CallGemm( - handle, - gemm_desc, - hx, - get_HxBuff_offset(layer, direction) + - batches.at(seq_len - time) * RBuff.hidden_size, - w, - WeiBuf.hidden_weight_offset(layer, direction), - reserveSpace, - RBuff.gemm_write_offset( - layer, accumulated_batches + batches.at(seq_len - time), direction), - GemmBackend_t::miopengemm); - } + const auto ht_ptr = time > 0 ? reserveSpace : hx; - const miopen::GemmDescriptor gemm_desc_hx = GemmDescriptor{false, - false, - true, - m, - n, - k, - lda, - ldb, - ldc, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - 1, // beta - xDesc.GetType(), - false}; - - const auto hidden_offset = (time == 0) - ? get_HxBuff_offset(layer, direction) - : RBuff.hidden_offset(layer, cur_batch, direction); - - const auto RB_batch_save_points_off = - RBuff.gemm_write_offset(layer, accumulated_batches, direction); + if(time != 0 && direction == rnn_direction::Backward && hx != nullptr && + rbatches.at(time) > rbatches.at(time - 1)) + { + miopen::GemmDescriptor gemm_desc = + GemmDescriptor{false, + false, + true, + (rbatches.at(time) - rbatches.at(time - 1)), + n, + k, + RBuff.hidden_size, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + xDesc.GetType(), + false}; const miopenStatus_t gemm_status = CallGemm(handle, - gemm_desc_hx, - ht_ptr, - hidden_offset, + gemm_desc, + hx, + get_HxBuff_offset(layer, rbatches.at(time - 1), direction), w, WeiBuf.hidden_weight_offset(layer, direction), reserveSpace, - RB_batch_save_points_off, + RBuff.gemm_write_offset( + layer, rbacc_per_time[time + 1] + rbatches.at(time - 1), direction), GemmBackend_t::miopengemm); + } - if(gemm_status != miopenStatusSuccess) - MIOPEN_THROW("GEMM execution failure"); - }; + const miopen::GemmDescriptor gemm_desc_hx = GemmDescriptor{false, + false, + true, + m, + n, + k, + lda, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + xDesc.GetType(), + false}; + + int cur_batch = direction == rnn_direction::Forward + ? time == 0 ? 0 : bacc_per_time[time - 1] + : rbacc_per_time[time]; + + const auto hidden_offset = (time == 0) ? get_HxBuff_offset(layer, 0, direction) + : RBuff.hidden_offset(layer, cur_batch, direction); + + const int accumulated_batches = direction == rnn_direction::Forward + ? time == 0 ? 
0 : bacc_per_time[time] + : rbacc_per_time[time + 1]; + + const auto RB_batch_save_points_off = + RBuff.gemm_write_offset(layer, accumulated_batches, direction); + + const miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc_hx, + ht_ptr, + hidden_offset, + w, + WeiBuf.hidden_weight_offset(layer, direction), + reserveSpace, + RB_batch_save_points_off, + GemmBackend_t::miopengemm); + + if(gemm_status != miopenStatusSuccess) + MIOPEN_THROW("GEMM execution failure"); + }; auto call_relu_tan_hidden_state_update = - [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &activDesc, seq_len]( - int layer_id, int time_id, int reverse) { + [&RBuff, + &bacc_per_time, + &rbacc_per_time, + &batches, + &rbatches, + &handle, + &wDesc, + reserveSpace, + &activDesc](int layer_id, int time_id, int direction) { float alpha = 1, beta = 0; - const std::vector tensor_size{1, - static_cast(batches.at(time_id)), - static_cast(RBuff.hidden_size)}; + auto cur_size = + direction == rnn_direction::Forward ? batches.at(time_id) : rbatches.at(time_id); + + const std::vector tensor_size{ + 1, static_cast(cur_size), static_cast(RBuff.hidden_size)}; const std::vector tensor_stride{static_cast(RBuff.layer_stride()), static_cast(RBuff.gemm_write_stride()), @@ -1020,11 +1030,13 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + auto cur_batch = direction == rnn_direction::Forward ? bacc_per_time[time_id] + : rbacc_per_time[time_id + 1]; + const auto RB_layer_save_points_off = - RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id], reverse); + RBuff.gemm_write_offset(layer_id, cur_batch, direction); - const auto hidden_offset = - RBuff.hidden_offset(layer_id, bacc_per_time[time_id], reverse); + const auto hidden_offset = RBuff.hidden_offset(layer_id, cur_batch, direction); activDesc.Forward(handle, &alpha, @@ -1046,6 +1058,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto call_relu_tan_update_output = [&RBuff, &get_HxBuff_offset, &batches, + &rbatches, &handle, &wDesc, &bi, @@ -1053,12 +1066,19 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, hy, max_batch, &bacc_per_time, - seq_len](int layer_id, int time, int reverse) { - int base_time = reverse == 0 ? time : seq_len - 1 - time; - + &rbacc_per_time, + seq_len](int layer_id, int time, int direction) { if(hy == nullptr) return; + auto& use_batches = direction == rnn_direction::Forward ? batches : rbatches; + + auto copy_batch = time == seq_len - 1 ? use_batches.at(time) + : use_batches.at(time) - use_batches.at(time + 1); + + if(copy_batch <= 0) + return; + const std::vector hcy_src_stride{ RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; @@ -1066,36 +1086,30 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, static_cast(RBuff.hidden_size), 1}; - auto hcy_layer_offset = get_HxBuff_offset(layer_id, reverse); + auto hcy_layer_offset = get_HxBuff_offset(layer_id, 0, direction); - auto copy_batch = - reverse == 0 - ? (time == seq_len - 1) ? batches.at(time) : batches.at(time) - batches.at(time + 1) - : (time == seq_len - 1) - ? 
batches.at(seq_len - 1 - time) - : batches.at(seq_len - 1 - time) - batches.at(seq_len - 2 - time); - if(copy_batch > 0) - { - auto batch_id_relative = batches.at(base_time) - copy_batch; - auto batch_id_abs = bacc_per_time[base_time] + batch_id_relative; - auto hcy_batch_offset = batch_id_relative * RBuff.hidden_size; - - const std::vector hcy_copy_size{ - 1, static_cast(copy_batch), static_cast(RBuff.hidden_size)}; - - auto src_desc = - miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); - auto dst_desc = - miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_dst_stride); - - CopyTensor(handle, - src_desc, - reserveSpace, - dst_desc, - hy, - RBuff.hidden_offset(layer_id, batch_id_abs, reverse), - hcy_layer_offset + hcy_batch_offset); - }; + auto hcy_batch_offset = + time == seq_len - 1 ? 0 : use_batches.at(time + 1) * RBuff.hidden_size; + + auto accumulated_batch = + direction == rnn_direction::Forward ? bacc_per_time[time] : rbacc_per_time[time + 1]; + + auto batch_id_abs = + time == seq_len - 1 ? accumulated_batch : accumulated_batch + use_batches.at(time + 1); + + const std::vector hcy_copy_size{ + 1, static_cast(copy_batch), static_cast(RBuff.hidden_size)}; + + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); + auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_dst_stride); + + CopyTensor(handle, + src_desc, + reserveSpace, + dst_desc, + hy, + RBuff.hidden_offset(layer_id, batch_id_abs, direction), + hcy_layer_offset + hcy_batch_offset); }; for(int layer_id = 0; layer_id < nLayers; layer_id++) @@ -1104,32 +1118,18 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, if(biasMode != 0u) call_relu_tan_bias_add(layer_id); - int accumulated_batches = 0; - int reverse_accumulated_batches = RBuff.batches_per_layer; - for(int time = 0; time < seq_len; time++) { - reverse_accumulated_batches -= batches.at(seq_len - 1 - time); - call_relu_tan_hidden_gemm(layer_id, - accumulated_batches, - time, - time == 0 ? 
0 : accumulated_batches - batches.at(time - 1), - rnn_direction::Forward); + call_relu_tan_hidden_gemm(layer_id, time, rnn_direction::Forward); call_relu_tan_hidden_state_update(layer_id, time, rnn_direction::Forward); - accumulated_batches += batches.at(time); if(dirMode == 0u) continue; - call_relu_tan_hidden_gemm(layer_id, - reverse_accumulated_batches, - time, - reverse_accumulated_batches + batches.at(seq_len - 1 - time), - rnn_direction::Backward); + call_relu_tan_hidden_gemm(layer_id, time, rnn_direction::Backward); - call_relu_tan_hidden_state_update( - layer_id, seq_len - 1 - time, rnn_direction::Backward); + call_relu_tan_hidden_state_update(layer_id, time, rnn_direction::Backward); } for(int time = seq_len - 1; time >= 0; time--) @@ -3647,10 +3647,9 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( return; } - if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && - // nLayers > 1 && - // dirMode == miopenRNNunidirection && inputMode != miopenRNNskip && - !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) + if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && nLayers > 1 && + dirMode == miopenRNNunidirection && inputMode != miopenRNNskip && + !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) { RNNForwardTrainingTanhRelu(handle, in_n, From f0d1d0ac34ee9e222599b7f27da4c4f70068955d Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Sat, 11 Nov 2023 10:12:39 +0300 Subject: [PATCH 11/27] =?UTF-8?q?RNNBackwardDataPackedTensorsRelu=20refact?= =?UTF-8?q?or=20=20=D0=9F=D0=BE=D0=B6=D0=B0=D0=BB=D1=83=D0=B9=D1=81=D1=82?= =?UTF-8?q?=D0=B0,=20=D0=B2=D0=B2=D0=B5=D0=B4=D0=B8=D1=82=D0=B5=20=D1=81?= =?UTF-8?q?=D0=BE=D0=BE=D0=B1=D1=89=D0=B5=D0=BD=D0=B8=D0=B5=20=D0=BA=D0=BE?= =?UTF-8?q?=D0=BC=D0=BC=D0=B8=D1=82=D0=B0=20=D0=B4=D0=BB=D1=8F=20=D0=B2?= =?UTF-8?q?=D0=B0=D1=88=D0=B8=D1=85=20=D0=B8=D0=B7=D0=BC=D0=B5=D0=BD=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B9.=20=D0=A1=D1=82=D1=80=D0=BE=D0=BA=D0=B8,?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/ocl/rnnocl.cpp | 448 +++++++++++++++++++++++---------------------- 1 file changed, 229 insertions(+), 219 deletions(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 7e1b349b81..29579353ed 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -3648,8 +3648,8 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( } if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && nLayers > 1 && - dirMode == miopenRNNunidirection && inputMode != miopenRNNskip && - !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) + dirMode == miopenRNNunidirection && inputMode != miopenRNNskip && + !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) { RNNForwardTrainingTanhRelu(handle, in_n, @@ -5177,6 +5177,29 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( total_batches += dxDesc[i].GetLengths()[0]; } + int total_batch_size = 0; + // accumulated batches per time + std::vector bacc_per_time(seqLen + 1); + std::vector rbacc_per_time(seqLen + 1); + + for(int i = 0; i < seqLen; i++) + { + bacc_per_time[i] = total_batch_size; + total_batch_size += batches[i]; + } + + int rtotal_batch_size = total_batch_size; + std::vector rbatches; + for(int i = seqLen - 1; i >= 0; i--) + { + rbacc_per_time[seqLen - 1 - i] = rtotal_batch_size; + rtotal_batch_size -= batches[i]; + rbatches.push_back(batches[i]); + } + + bacc_per_time[seqLen] = total_batch_size; + rbacc_per_time[0] = total_batch_size; + if(out_vec_size != (bi * hidden_size)) { MIOPEN_THROW(miopenStatusBadParm, "Output size 
doesn't match hidden state size!"); @@ -5287,248 +5310,239 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } }; - auto propagate_hidden_output = - [&RBuff, &handle, batches, &rnn_data_type, dhy, workSpace, &get_HxBuff_offset]( - int layer, int accumulated_batches, int cur_time, int reverse) { - if(dhy == nullptr) - return; + auto propagate_hidden_output = [&RBuff, + &handle, + &batches, + &rbatches, + &bacc_per_time, + &rbacc_per_time, + &rnn_data_type, + dhy, + workSpace, + &get_HxBuff_offset](int layer, int time, int direction) { + if(dhy == nullptr) + return; - float alpha0 = 1; - float alpha1 = 1; - float beta_t = 0; + std::vector& use_batches = direction == rnn_direction::Forward ? batches : rbatches; + float alpha0 = 1; + float alpha1 = 1; + float beta_t = 0; + + std::vector hx_stride{use_batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; + std::vector reserve_stride{ + static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; + + std::vector hx_size{1, use_batches.at(time), RBuff.hidden_size}; + std::vector reserve_size{1, use_batches.at(time), RBuff.hidden_size}; + + auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); + + auto workspace_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); + + int accumulated_batches = + direction == rnn_direction::Forward ? bacc_per_time[time] : rbacc_per_time[time + 1]; + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hx_desc, + dhy, + &alpha1, + workspace_desc, + workSpace, + &beta_t, + workspace_desc, + workSpace, + get_HxBuff_offset(layer, direction), + RBuff.gemm_write_offset(layer, accumulated_batches, direction), + RBuff.gemm_write_offset(layer, accumulated_batches, direction)); + }; + + auto propagate_hidden_prev = [&RBuff, + &handle, + &batches, + &rbatches, + &bacc_per_time, + &rbacc_per_time, + &rnn_data_type, + dhy, + workSpace, + &get_HxBuff_offset, + WeiBuf, + w](int layer, int time, int direction) { + if(direction == rnn_direction::Forward && dhy != nullptr && + batches.at(time) > batches.at(time + 1)) + { std::vector hx_stride{batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; + std::vector reserve_stride{ static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hx_size{1, batches.at(cur_time), RBuff.hidden_size}; - std::vector reserve_size{1, batches.at(cur_time), RBuff.hidden_size}; + std::vector hx_size{1, batches.at(time) - batches.at(time + 1), RBuff.hidden_size}; - auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); + std::vector reserve_size{ + 1, batches.at(time) - batches.at(time + 1), RBuff.hidden_size}; + + float alpha0 = 1; + float alpha1 = 1; + float beta_t = 0; - auto workspace_desc = + auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); + auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hx_desc, - dhy, - &alpha1, - workspace_desc, - workSpace, - &beta_t, - workspace_desc, - workSpace, - get_HxBuff_offset(layer, reverse), - RBuff.gemm_write_offset(layer, accumulated_batches, reverse), - RBuff.gemm_write_offset(layer, accumulated_batches, reverse)); - }; + OpTensor( + handle, + miopenTensorOpAdd, + &alpha0, + hx_desc, + dhy, + &alpha1, + reserve_desc, + workSpace, + &beta_t, + reserve_desc, + workSpace, + (get_HxBuff_offset(layer, direction) + batches.at(time + 1) * RBuff.hidden_size), + RBuff.gemm_write_offset( + layer, bacc_per_time[time] + batches.at(time + 1), direction), + 
RBuff.gemm_write_offset( + layer, bacc_per_time[time] + batches.at(time + 1), direction)); + } - auto propagate_hidden_prev = - [&RBuff, &handle, batches, &rnn_data_type, dhy, workSpace, &get_HxBuff_offset, WeiBuf, w]( - int layer, - int accumulated_batches, - int cur_time, - int use_time, - int pre_batch, - int reverse) { - if(reverse == 0 && dhy != nullptr && batches.at(cur_time) > batches.at(use_time)) - { - std::vector hx_stride{batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; - - std::vector reserve_stride{ - static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - - std::vector hx_size{ - 1, batches.at(cur_time) - batches.at(use_time), RBuff.hidden_size}; - - std::vector reserve_size{ - 1, batches.at(cur_time) - batches.at(use_time), RBuff.hidden_size}; - - float alpha0 = 1; - float alpha1 = 1; - float beta_t = 0; - - auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); - auto reserve_desc = - miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); - - OpTensor( - handle, - miopenTensorOpAdd, - &alpha0, - hx_desc, - dhy, - &alpha1, - reserve_desc, - workSpace, - &beta_t, - reserve_desc, - workSpace, - (get_HxBuff_offset(layer, reverse) + batches.at(use_time) * RBuff.hidden_size), - RBuff.gemm_write_offset( - layer, accumulated_batches + batches.at(use_time), reverse), - RBuff.gemm_write_offset( - layer, accumulated_batches + batches.at(use_time), reverse)); - } + int use_batch = direction == rnn_direction::Forward ? batches[time + 1] : rbatches[time]; - if(batches.at(use_time) <= 0) - return; + if(use_batch <= 0) + return; - miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, - false, - false, - batches.at(use_time), - RBuff.hidden_size, - RBuff.hidden_size, - RBuff.gemm_write_size(), - RBuff.hidden_size, - RBuff.gemm_write_size(), - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - 1, // beta - rnn_data_type, - false}; + int pre_batch = direction == rnn_direction::Forward ? bacc_per_time[time + 1] + : rbacc_per_time[time + 2]; - miopenStatus_t gemm_status = - CallGemm(handle, - gemm_desc, - workSpace, - RBuff.gemm_write_offset(layer, pre_batch, reverse), - w, - WeiBuf.hidden_weight_offset(layer, reverse), - workSpace, - RBuff.gemm_write_offset(layer, accumulated_batches, reverse), - GemmBackend_t::miopengemm); + int accumulated_batches = + direction == rnn_direction::Forward ? 
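For the sequences whose last step is `time` (the batch count drops from batches.at(time) to batches.at(time + 1)), the OpTensor call above folds the matching slice of dhy into the workspace rows before the recurrent GEMM runs. The loop below is a plain-C++ sketch of that elementwise add; it assumes flat row-major float buffers, and the function and parameter names are illustrative rather than MIOpen API.

#include <cstddef>
#include <vector>

// workspace rows are RBuff-style packed rows (one per active sequence), dhy rows are dense.
void add_dhy_for_finished_sequences(std::vector<float>& workspace,
                                    const std::vector<float>& dhy,
                                    std::size_t ws_offset,  // first packed row of the finished slice
                                    std::size_t ws_stride,  // distance between packed rows
                                    std::size_t dhy_offset, // matching slice inside dhy
                                    int finished,           // batches[time] - batches[time + 1]
                                    int hidden_size)
{
    for(int b = 0; b < finished; ++b)
        for(int h = 0; h < hidden_size; ++h)
            workspace[ws_offset + b * ws_stride + h] += dhy[dhy_offset + b * hidden_size + h];
}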
bacc_per_time[time] : rbacc_per_time[time + 1]; - if(gemm_status != miopenStatusSuccess) + miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + false, + use_batch, + RBuff.hidden_size, + RBuff.hidden_size, + RBuff.gemm_write_size(), + RBuff.hidden_size, + RBuff.gemm_write_size(), + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + rnn_data_type, + false}; + + miopenStatus_t gemm_status = + CallGemm(handle, + gemm_desc, + workSpace, + RBuff.gemm_write_offset(layer, pre_batch, direction), + w, + WeiBuf.hidden_weight_offset(layer, direction), + workSpace, + RBuff.gemm_write_offset(layer, accumulated_batches, direction), + GemmBackend_t::miopengemm); + + if(gemm_status != miopenStatusSuccess) + { + if(gemm_status == miopenStatusNotImplemented) { - if(gemm_status == miopenStatusNotImplemented) - { - MIOPEN_LOG_E("GEMM not implemented"); - } - else - { - MIOPEN_LOG_E("GEMM failed"); - } + MIOPEN_LOG_E("GEMM not implemented"); } - }; - - auto propagate_hidden = [*this, - &RBuff, - &handle, - seqLen, - batches, - &rnn_data_type, - workSpace, - reserveSpace, - &activDesc, - propagate_hidden_output, - propagate_hidden_prev](int layer) { - int accumulated_batches = RBuff.batches_per_layer; - int reverse_accumulated_batches = 0; + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } + }; + auto propagate_hidden_time = [*this, + &RBuff, + &handle, + seqLen, + &batches, + &rbatches, + &bacc_per_time, + &rbacc_per_time, + &rnn_data_type, + workSpace, + reserveSpace, + &activDesc, + propagate_hidden_output, + propagate_hidden_prev](int layer, int time, int direction) { std::vector hx_stride{ batches.at(0) * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; std::vector reserve_stride{ RBuff.batches_per_layer * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; - for(int ti = seqLen - 1; ti >= 0; ti--) + if(time == seqLen - 1) + { + propagate_hidden_output(layer, time, direction); + } + else { - accumulated_batches -= batches.at(ti); + propagate_hidden_prev(layer, time, direction); + } - if(ti == seqLen - 1) - { - propagate_hidden_output(layer, accumulated_batches, ti, rnn_direction::Forward); - } - else - { - propagate_hidden_prev(layer, - accumulated_batches, - ti, - ti + 1, - accumulated_batches + batches.at(ti), - rnn_direction::Forward); - } + auto& use_batches = direction == rnn_direction::Forward ? batches : rbatches; - std::vector reserve_size{1, batches.at(ti), RBuff.hidden_size}; - auto reserve_desc = - miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); + int batch_offset = + direction == rnn_direction::Forward ? 
bacc_per_time[time] : rbacc_per_time[time + 1]; - float alpha = 1, beta = 0; + std::vector reserve_size{1, use_batches.at(time), RBuff.hidden_size}; + auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); - activDesc.Backward( - handle, - &alpha, - reserve_desc, - reserveSpace, - reserve_desc, - workSpace, - reserve_desc, - reserveSpace, - &beta, - reserve_desc, - workSpace, - RBuff.hidden_offset(layer, accumulated_batches, rnn_direction::Forward), - RBuff.gemm_write_offset(layer, accumulated_batches, rnn_direction::Forward), - RBuff.gemm_write_offset(layer, accumulated_batches, rnn_direction::Forward), - RBuff.gemm_write_offset(layer, accumulated_batches, rnn_direction::Forward)); + float alpha = 1, beta = 0; + + activDesc.Backward(handle, + &alpha, + reserve_desc, + reserveSpace, + reserve_desc, + workSpace, + reserve_desc, + reserveSpace, + &beta, + reserve_desc, + workSpace, + RBuff.hidden_offset(layer, batch_offset, direction), + RBuff.gemm_write_offset(layer, batch_offset, direction), + RBuff.gemm_write_offset(layer, batch_offset, direction), + RBuff.gemm_write_offset(layer, batch_offset, direction)); + }; + auto propagate_hidden = [*this, + &RBuff, + &handle, + seqLen, + &batches, + &rbatches, + &bacc_per_time, + &rbacc_per_time, + &rnn_data_type, + workSpace, + reserveSpace, + &activDesc, + propagate_hidden_time](int layer) { + for(int time = seqLen - 1; time >= 0; time--) + { + propagate_hidden_time(layer, time, rnn_direction::Forward); if(dirMode == 0u) continue; - - // Propagate Backward direction - // - if(ti == seqLen - 1) - { - propagate_hidden_output( - layer, reverse_accumulated_batches, seqLen - 1 - ti, rnn_direction::Backward); - } - else - { - propagate_hidden_prev(layer, - reverse_accumulated_batches, - seqLen - 1 - ti, - seqLen - 1 - ti, - reverse_accumulated_batches - batches.at(seqLen - 2 - ti), - rnn_direction::Backward); - } - - std::vector reserve_backward_size{ - 1, batches.at(seqLen - 1 - ti), RBuff.hidden_size}; - auto reserve_backward_desc = - miopen::TensorDescriptor(rnn_data_type, reserve_backward_size, reserve_stride); - activDesc.Backward( - handle, - &alpha, - reserve_backward_desc, - reserveSpace, - reserve_backward_desc, - workSpace, - reserve_backward_desc, - reserveSpace, - &beta, - reserve_backward_desc, - workSpace, - RBuff.hidden_offset(layer, reverse_accumulated_batches, rnn_direction::Backward), - RBuff.gemm_write_offset( - layer, reverse_accumulated_batches, rnn_direction::Backward), - RBuff.gemm_write_offset( - layer, reverse_accumulated_batches, rnn_direction::Backward), - RBuff.gemm_write_offset( - layer, reverse_accumulated_batches, rnn_direction::Backward)); - - reverse_accumulated_batches += batches.at(seqLen - 1 - ti); + propagate_hidden_time(layer, time, rnn_direction::Backward); } }; auto propagate_dhx_prev = [&RBuff, &WeiBuf, &rnn_data_type, batches, &handle, w, dhx, &get_HxBuff_offset, workSpace]( - int layer, int cur_time, int cur_batch, int use_batch, int reverse) { + int layer, int cur_time, int cur_batch, int use_batch, int direction) { if(batches.at(cur_time) <= use_batch) return; @@ -5550,14 +5564,14 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( rnn_data_type, false}; - int hx_shift = get_HxBuff_offset(layer, reverse) + use_batch * RBuff.hidden_size; + int hx_shift = get_HxBuff_offset(layer, direction) + use_batch * RBuff.hidden_size; miopenStatus_t gemm_status = CallGemm(handle, gemm_desc, workSpace, - RBuff.gemm_write_offset(layer, cur_batch + use_batch, reverse), + 
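The activDesc.Backward call above converts the accumulated dh_t in the workspace into a gradient with respect to the pre-activation, using values saved in the reserve space during the forward pass. The sketch below spells out the elementwise rule for the two modes this solver supports, with the alpha/beta scaling and strided offsets left out; the names are illustrative, not MIOpen API.

#include <cstddef>
#include <vector>

enum class Act { Relu, Tanh };

// y  = f(x), saved in the reserve space by the forward pass;
// dy = incoming gradient in the workspace; the result overwrites dy (beta = 0 in the call above).
void activation_backward(Act mode, std::vector<float>& dy, const std::vector<float>& y)
{
    for(std::size_t i = 0; i < dy.size(); ++i)
    {
        const float dfdx = (mode == Act::Relu) ? (y[i] > 0.f ? 1.f : 0.f)  // relu' from the sign of y
                                               : 1.f - y[i] * y[i];        // d tanh(x)/dx = 1 - tanh(x)^2
        dy[i] *= dfdx;
    }
}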
RBuff.gemm_write_offset(layer, cur_batch + use_batch, direction), w, - WeiBuf.hidden_weight_offset(layer, reverse), + WeiBuf.hidden_weight_offset(layer, direction), dhx, hx_shift, GemmBackend_t::miopengemm); @@ -5575,28 +5589,24 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } }; - auto propagate_dhx = [*this, seqLen, &RBuff, batches, &propagate_dhx_prev](int layer) { - int accumulated_batches = 0; - int reverse_accumulated_batches = RBuff.batches_per_layer; + auto propagate_dhx = [*this, seqLen, &RBuff, &batches, &rbatches, &bacc_per_time, &rbacc_per_time, &propagate_dhx_prev](int layer) { for(int ti = 0; ti < seqLen; ti++) { int use_time = ti > 0 ? ti - 1 : 0; int use_batch = ti > 0 ? batches.at(use_time) : 0; - propagate_dhx_prev(layer, ti, accumulated_batches, use_batch, rnn_direction::Forward); + propagate_dhx_prev(layer, ti, bacc_per_time[ti], use_batch, rnn_direction::Forward); if(dirMode != 0u) { - reverse_accumulated_batches -= batches.at(seqLen - 1 - ti); int reverse_use_time = ti > 0 ? seqLen - ti : 0; - int reverse_use_batch = ti > 0 ? batches.at(reverse_use_time) : 0; + int reverse_use_batch = ti > 0 ? rbatches.at(ti - 1 ) : 0; propagate_dhx_prev(layer, seqLen - 1 - ti, - reverse_accumulated_batches, + rbacc_per_time[ti + 1], reverse_use_batch, rnn_direction::Backward); } - accumulated_batches += batches.at(ti); } }; From e8de9355038f035d5b061d69cf1e778f19830cdf Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Wed, 15 Nov 2023 14:04:48 +0300 Subject: [PATCH 12/27] Simplified RNNBackwardDataPackedTensorsRelu --- src/ocl/rnnocl.cpp | 272 ++++++++++++++++++++++++--------------------- 1 file changed, 143 insertions(+), 129 deletions(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 29579353ed..bd2a5a9fa7 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -5134,7 +5134,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto rnn_data_type = dhxDesc.GetType(); - std::vector batches; + std::vector fbatches; int input_size = dxDesc[0].GetLengths()[1]; int max_batch = dhxDesc.GetLengths()[1]; int hidden_size = dhxDesc.GetLengths()[2]; @@ -5166,39 +5166,39 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } else { - if(batchval > batches.back() || batchval < 0) + if(batchval > fbatches.back() || batchval < 0) { MIOPEN_THROW(miopenStatusBadParm, "Incorrect input batch size at time " + std::to_string(i) + "! 
Batch size must not ascend!"); } } - batches.push_back(batchval); + fbatches.push_back(batchval); total_batches += dxDesc[i].GetLengths()[0]; } int total_batch_size = 0; // accumulated batches per time - std::vector bacc_per_time(seqLen + 1); + std::vector fbacc_per_time(seqLen + 1); std::vector rbacc_per_time(seqLen + 1); for(int i = 0; i < seqLen; i++) { - bacc_per_time[i] = total_batch_size; - total_batch_size += batches[i]; + fbacc_per_time[i] = total_batch_size; + total_batch_size += fbatches[i]; } int rtotal_batch_size = total_batch_size; std::vector rbatches; for(int i = seqLen - 1; i >= 0; i--) { + rtotal_batch_size -= fbatches[i]; rbacc_per_time[seqLen - 1 - i] = rtotal_batch_size; - rtotal_batch_size -= batches[i]; - rbatches.push_back(batches[i]); + rbatches.push_back(fbatches[i]); } - bacc_per_time[seqLen] = total_batch_size; - rbacc_per_time[0] = total_batch_size; + fbacc_per_time[seqLen] = total_batch_size; + rbacc_per_time[seqLen] = 0; if(out_vec_size != (bi * hidden_size)) { @@ -5231,9 +5231,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( ReluReserveBufferOffsets RBuff(input_size, hidden_size, nLayers, total_batches, max_batch, bi); - auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int reverse) { - return (static_cast(hidden_size) * max_batch) * (bi * layer_id + reverse); - }; + auto get_HxBuff_offset = + [&bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { + return (static_cast(hidden_size) * (max_batch)) * (bi * layer_id + reverse) + + hidden_size * batch_id; + }; auto propagate_output = [&RBuff, out_vec_size, &handle, &rnn_data_type, workSpace, dy, &WeiBuf, w](int numLayers, @@ -5312,9 +5314,9 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto propagate_hidden_output = [&RBuff, &handle, - &batches, + &fbatches, &rbatches, - &bacc_per_time, + &fbacc_per_time, &rbacc_per_time, &rnn_data_type, dhy, @@ -5323,25 +5325,25 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( if(dhy == nullptr) return; - std::vector& use_batches = direction == rnn_direction::Forward ? batches : rbatches; + auto& batches = direction == rnn_direction::Forward ? fbatches : rbatches; float alpha0 = 1; float alpha1 = 1; float beta_t = 0; - std::vector hx_stride{use_batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; + std::vector hx_stride{batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; std::vector reserve_stride{ static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hx_size{1, use_batches.at(time), RBuff.hidden_size}; - std::vector reserve_size{1, use_batches.at(time), RBuff.hidden_size}; + std::vector hx_size{1, batches.at(time), RBuff.hidden_size}; + std::vector reserve_size{1, batches.at(time), RBuff.hidden_size}; auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); auto workspace_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); - int accumulated_batches = - direction == rnn_direction::Forward ? bacc_per_time[time] : rbacc_per_time[time + 1]; + auto& bacc_per_time = direction == rnn_direction::Forward ? 
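get_HxBuff_offset now resolves a (layer, batch, direction) triple into the hx/hy/dhy/dhx buffers, which are laid out as nLayers * bi planes of max_batch rows of hidden_size values each. Below is a standalone sketch of that indexing with a small worked example; the name hx_buff_offset is hypothetical.

#include <cassert>
#include <cstddef>

std::size_t hx_buff_offset(int layer, int batch, int reverse, int bi, int max_batch, int hidden_size)
{
    return static_cast<std::size_t>(hidden_size) * max_batch * (bi * layer + reverse)
           + static_cast<std::size_t>(hidden_size) * batch;
}

int main()
{
    // bidirectional (bi = 2), max_batch = 3, hidden_size = 4: layer 1, reverse direction,
    // batch 2 sits after 1 * 2 + 1 = 3 full planes plus two rows of the fourth plane.
    assert(hx_buff_offset(1, 2, 1, 2, 3, 4) == 3u * 3 * 4 + 2u * 4);
}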
fbacc_per_time : rbacc_per_time; + OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -5353,16 +5355,16 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &beta_t, workspace_desc, workSpace, - get_HxBuff_offset(layer, direction), - RBuff.gemm_write_offset(layer, accumulated_batches, direction), - RBuff.gemm_write_offset(layer, accumulated_batches, direction)); + get_HxBuff_offset(layer, 0, direction), + RBuff.gemm_write_offset(layer, bacc_per_time[time], direction), + RBuff.gemm_write_offset(layer, bacc_per_time[time], direction)); }; auto propagate_hidden_prev = [&RBuff, &handle, - &batches, + &fbatches, &rbatches, - &bacc_per_time, + &fbacc_per_time, &rbacc_per_time, &rnn_data_type, dhy, @@ -5371,17 +5373,18 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( WeiBuf, w](int layer, int time, int direction) { if(direction == rnn_direction::Forward && dhy != nullptr && - batches.at(time) > batches.at(time + 1)) + fbatches.at(time) > fbatches.at(time + 1)) { - std::vector hx_stride{batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; + std::vector hx_stride{fbatches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; std::vector reserve_stride{ static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hx_size{1, batches.at(time) - batches.at(time + 1), RBuff.hidden_size}; + std::vector hx_size{ + 1, fbatches.at(time) - fbatches.at(time + 1), RBuff.hidden_size}; std::vector reserve_size{ - 1, batches.at(time) - batches.at(time + 1), RBuff.hidden_size}; + 1, fbatches.at(time) - fbatches.at(time + 1), RBuff.hidden_size}; float alpha0 = 1; float alpha1 = 1; @@ -5391,40 +5394,35 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); - OpTensor( - handle, - miopenTensorOpAdd, - &alpha0, - hx_desc, - dhy, - &alpha1, - reserve_desc, - workSpace, - &beta_t, - reserve_desc, - workSpace, - (get_HxBuff_offset(layer, direction) + batches.at(time + 1) * RBuff.hidden_size), - RBuff.gemm_write_offset( - layer, bacc_per_time[time] + batches.at(time + 1), direction), - RBuff.gemm_write_offset( - layer, bacc_per_time[time] + batches.at(time + 1), direction)); + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hx_desc, + dhy, + &alpha1, + reserve_desc, + workSpace, + &beta_t, + reserve_desc, + workSpace, + get_HxBuff_offset(layer, fbatches.at(time + 1), direction), + RBuff.gemm_write_offset( + layer, fbacc_per_time[time] + fbatches.at(time + 1), direction), + RBuff.gemm_write_offset( + layer, fbacc_per_time[time] + fbatches.at(time + 1), direction)); } - int use_batch = direction == rnn_direction::Forward ? batches[time + 1] : rbatches[time]; + int used_batch = direction == rnn_direction::Forward ? fbatches[time + 1] : rbatches[time]; - if(use_batch <= 0) + if(used_batch <= 0) return; - int pre_batch = direction == rnn_direction::Forward ? bacc_per_time[time + 1] - : rbacc_per_time[time + 2]; - - int accumulated_batches = - direction == rnn_direction::Forward ? bacc_per_time[time] : rbacc_per_time[time + 1]; + auto& bacc_per_time = direction == rnn_direction::Forward ? 
fbacc_per_time : rbacc_per_time; miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, false, false, - use_batch, + used_batch, RBuff.hidden_size, RBuff.hidden_size, RBuff.gemm_write_size(), @@ -5443,11 +5441,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( CallGemm(handle, gemm_desc, workSpace, - RBuff.gemm_write_offset(layer, pre_batch, direction), + RBuff.gemm_write_offset(layer, bacc_per_time[time + 1], direction), w, WeiBuf.hidden_weight_offset(layer, direction), workSpace, - RBuff.gemm_write_offset(layer, accumulated_batches, direction), + RBuff.gemm_write_offset(layer, bacc_per_time[time], direction), GemmBackend_t::miopengemm); if(gemm_status != miopenStatusSuccess) @@ -5467,9 +5465,9 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &RBuff, &handle, seqLen, - &batches, + &fbatches, &rbatches, - &bacc_per_time, + &fbacc_per_time, &rbacc_per_time, &rnn_data_type, workSpace, @@ -5478,7 +5476,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( propagate_hidden_output, propagate_hidden_prev](int layer, int time, int direction) { std::vector hx_stride{ - batches.at(0) * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; + fbatches.at(0) * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; std::vector reserve_stride{ RBuff.batches_per_layer * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; @@ -5491,12 +5489,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( propagate_hidden_prev(layer, time, direction); } - auto& use_batches = direction == rnn_direction::Forward ? batches : rbatches; + auto& batches = direction == rnn_direction::Forward ? fbatches : rbatches; - int batch_offset = - direction == rnn_direction::Forward ? bacc_per_time[time] : rbacc_per_time[time + 1]; + auto& bacc_per_time = direction == rnn_direction::Forward ? fbacc_per_time : rbacc_per_time; - std::vector reserve_size{1, use_batches.at(time), RBuff.hidden_size}; + std::vector reserve_size{1, batches.at(time), RBuff.hidden_size}; auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); float alpha = 1, beta = 0; @@ -5512,19 +5509,19 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &beta, reserve_desc, workSpace, - RBuff.hidden_offset(layer, batch_offset, direction), - RBuff.gemm_write_offset(layer, batch_offset, direction), - RBuff.gemm_write_offset(layer, batch_offset, direction), - RBuff.gemm_write_offset(layer, batch_offset, direction)); + RBuff.hidden_offset(layer, bacc_per_time[time], direction), + RBuff.gemm_write_offset(layer, bacc_per_time[time], direction), + RBuff.gemm_write_offset(layer, bacc_per_time[time], direction), + RBuff.gemm_write_offset(layer, bacc_per_time[time], direction)); }; auto propagate_hidden = [*this, &RBuff, &handle, seqLen, - &batches, + &fbatches, &rbatches, - &bacc_per_time, + &fbacc_per_time, &rbacc_per_time, &rnn_data_type, workSpace, @@ -5540,73 +5537,90 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } }; - auto propagate_dhx_prev = - [&RBuff, &WeiBuf, &rnn_data_type, batches, &handle, w, dhx, &get_HxBuff_offset, workSpace]( - int layer, int cur_time, int cur_batch, int use_batch, int direction) { - if(batches.at(cur_time) <= use_batch) - return; + auto propagate_dhx_prev = [&RBuff, + &WeiBuf, + &rnn_data_type, + &fbatches, + &rbatches, + &fbacc_per_time, + &rbacc_per_time, + &handle, + w, + dhx, + &get_HxBuff_offset, + workSpace](int layer, int time, int direction) { + auto& batches = direction == rnn_direction::Forward ? 
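The GEMM above (no transposes, beta = 1) propagates the gradient already computed for step time + 1 back through the hidden-to-hidden weights and accumulates it onto the rows of step time, on top of the dy/dhy contributions stored there earlier. A naive-loop sketch of that accumulation, assuming flat row-major buffers; the names are illustrative only.

#include <cstddef>
#include <vector>

// ws rows are packed per time step: rows for step t start at bacc[t] (see the prefix sums),
// each row holds hidden_size values and consecutive rows are ws_stride floats apart.
void accumulate_dht(std::vector<float>& ws,
                    const std::vector<float>& w_hh, // hidden_size x hidden_size, row-major
                    std::size_t dst_off,            // first row of step time
                    std::size_t src_off,            // first row of step time + 1
                    std::size_t ws_stride,
                    int rows,                       // sequences still active at time + 1
                    int hidden_size)
{
    for(int r = 0; r < rows; ++r)
        for(int j = 0; j < hidden_size; ++j)
        {
            float acc = 0.f;
            for(int k = 0; k < hidden_size; ++k)
                acc += ws[src_off + r * ws_stride + k] * w_hh[k * hidden_size + j];
            ws[dst_off + r * ws_stride + j] += acc; // beta == 1: add on top of what is already there
        }
}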
fbatches : rbatches; + + auto& bacc_per_time = direction == rnn_direction::Forward ? fbacc_per_time : rbacc_per_time; + + int batch_size = time == 0 ? batches.at(time) : batches.at(time) - batches.at(time - 1); + + if(batch_size <= 0) + return; - miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, - false, - false, - (batches.at(cur_time) - use_batch), - RBuff.hidden_size, - RBuff.hidden_size, - RBuff.gemm_write_size(), - RBuff.hidden_size, - RBuff.hidden_size, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - 1, // beta - rnn_data_type, - false}; + miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + false, + batch_size, + RBuff.hidden_size, + RBuff.hidden_size, + RBuff.gemm_write_size(), + RBuff.hidden_size, + RBuff.hidden_size, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + rnn_data_type, + false}; - int hx_shift = get_HxBuff_offset(layer, direction) + use_batch * RBuff.hidden_size; - miopenStatus_t gemm_status = - CallGemm(handle, - gemm_desc, - workSpace, - RBuff.gemm_write_offset(layer, cur_batch + use_batch, direction), - w, - WeiBuf.hidden_weight_offset(layer, direction), - dhx, - hx_shift, - GemmBackend_t::miopengemm); + int use_batch = time == 0 ? 0 : batches.at(time - 1); - if(gemm_status != miopenStatusSuccess) - { - if(gemm_status == miopenStatusNotImplemented) - { - MIOPEN_LOG_E("GEMM not implemented"); - } - else - { - MIOPEN_LOG_E("GEMM failed"); - } - } - }; + int write_offset = + time == 0 ? bacc_per_time[time] : bacc_per_time[time] + batches.at(time - 1); - auto propagate_dhx = [*this, seqLen, &RBuff, &batches, &rbatches, &bacc_per_time, &rbacc_per_time, &propagate_dhx_prev](int layer) { + miopenStatus_t gemm_status = + CallGemm(handle, + gemm_desc, + workSpace, + RBuff.gemm_write_offset(layer, write_offset, direction), + w, + WeiBuf.hidden_weight_offset(layer, direction), + dhx, + get_HxBuff_offset(layer, use_batch, direction), + GemmBackend_t::miopengemm); - for(int ti = 0; ti < seqLen; ti++) + if(gemm_status != miopenStatusSuccess) { - int use_time = ti > 0 ? ti - 1 : 0; - int use_batch = ti > 0 ? batches.at(use_time) : 0; - propagate_dhx_prev(layer, ti, bacc_per_time[ti], use_batch, rnn_direction::Forward); - - if(dirMode != 0u) + if(gemm_status == miopenStatusNotImplemented) { - int reverse_use_time = ti > 0 ? seqLen - ti : 0; - int reverse_use_batch = ti > 0 ? 
rbatches.at(ti - 1 ) : 0; - propagate_dhx_prev(layer, - seqLen - 1 - ti, - rbacc_per_time[ti + 1], - reverse_use_batch, - rnn_direction::Backward); + MIOPEN_LOG_E("GEMM not implemented"); } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } + }; + + auto propagate_dhx = [*this, + seqLen, + &RBuff, + &fbatches, + &rbatches, + &fbacc_per_time, + &rbacc_per_time, + &propagate_dhx_prev](int layer) { + for(int time = 0; time < seqLen; ti++) + { + propagate_dhx_prev(layer, time, rnn_direction::Forward); + + if(dirMode == 0u) + continue; + + propagate_dhx_prev(layer, time, rnn_direction::Backward); } }; From ee524b62e6e331e36e71dc29ebc6ce93a29c3eb8 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Wed, 15 Nov 2023 17:03:17 +0300 Subject: [PATCH 13/27] Relu forward minor refactor --- src/ocl/rnnocl.cpp | 137 ++++++++++++++++++++++----------------------- 1 file changed, 66 insertions(+), 71 deletions(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index bd2a5a9fa7..c53449897b 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -744,7 +744,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, if(seq_len == 0) return; - std::vector batches; + std::vector fbatches; std::vector rbatches; int in_vec_size = xDesc.GetLengths()[1]; @@ -756,26 +756,26 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, int total_batch_size = 0; // accumulated batches per time - std::vector bacc_per_time(seq_len + 1); + std::vector fbacc_per_time(seq_len + 1); std::vector rbacc_per_time(seq_len + 1); for(int i = 0; i < seq_len; i++) { - bacc_per_time[i] = total_batch_size; + fbacc_per_time[i] = total_batch_size; total_batch_size += seq_array[i]; - batches.push_back(seq_array[i]); + fbatches.push_back(seq_array[i]); } int rtotal_batch_size = total_batch_size; for(int i = seq_len - 1; i >= 0; i--) { + rtotal_batch_size -= fbatches[i]; rbacc_per_time[seq_len - 1 - i] = rtotal_batch_size; - rtotal_batch_size -= seq_array[i]; - rbatches.push_back(seq_array[i]); + rbatches.push_back(fbatches[i]); } - bacc_per_time[seq_len] = total_batch_size; - rbacc_per_time[0] = total_batch_size; + fbacc_per_time[seq_len] = total_batch_size; + rbacc_per_time[seq_len] = 0; int bi = dirMode != 0u ? 2 : 1; int wei_stride = hidden_size * bi * static_cast(nHiddenTensorsPerLayer); @@ -900,9 +900,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto call_relu_tan_hidden_gemm = [&RBuff, &WeiBuf, &get_HxBuff_offset, - &bacc_per_time, + &fbacc_per_time, &rbacc_per_time, - &batches, + &fbatches, &rbatches, &handle, &xDesc, @@ -913,7 +913,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, return; const int m = direction == rnn_direction::Forward - ? batches.at(time) + ? fbatches.at(time) : time == 0 ? 
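Both passes now walk the packed sequence data through these prefix sums: fbacc_per_time[t] is the first packed row of forward step t, and rbacc_per_time[r] is the first packed row of the forward step that reverse step r visits. A standalone sketch with a small worked example; build_batch_prefixes and BatchPrefixes are hypothetical names, not from the patch.

#include <cassert>
#include <vector>

struct BatchPrefixes
{
    std::vector<int> fbatches, rbatches; // batch count per forward / reverse step
    std::vector<int> fbacc, rbacc;       // first packed row of each step
};

BatchPrefixes build_batch_prefixes(const std::vector<int>& seq_array)
{
    const int seq_len = static_cast<int>(seq_array.size());
    BatchPrefixes p;
    p.fbacc.resize(seq_len + 1);
    p.rbacc.resize(seq_len + 1);

    int total = 0;
    for(int t = 0; t < seq_len; ++t)
    {
        p.fbacc[t] = total; // rows [fbacc[t], fbacc[t] + fbatches[t]) belong to forward step t
        total += seq_array[t];
        p.fbatches.push_back(seq_array[t]);
    }
    p.fbacc[seq_len] = total;

    int rtotal = total;
    for(int t = seq_len - 1; t >= 0; --t)
    {
        rtotal -= seq_array[t];
        p.rbacc[seq_len - 1 - t] = rtotal; // reverse step r maps to forward time seq_len - 1 - r
        p.rbatches.push_back(seq_array[t]);
    }
    p.rbacc[seq_len] = 0;
    return p;
}

int main()
{
    // four time steps with descending batch sizes 3, 2, 2, 1 -> 8 packed rows
    const auto p = build_batch_prefixes({3, 2, 2, 1});
    assert(p.fbacc == (std::vector<int>{0, 3, 5, 7, 8}));
    assert(p.rbacc == (std::vector<int>{7, 5, 3, 0, 0}));
    assert(p.rbatches == (std::vector<int>{1, 2, 2, 3}));
}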
rbatches.at(0) : rbatches.at(time - 1); const int n = RBuff.hidden_size, k = RBuff.hidden_size; @@ -931,7 +931,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, GemmDescriptor{false, false, true, - (rbatches.at(time) - rbatches.at(time - 1)), + rbatches.at(time) - rbatches.at(time - 1), n, k, RBuff.hidden_size, @@ -955,7 +955,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, WeiBuf.hidden_weight_offset(layer, direction), reserveSpace, RBuff.gemm_write_offset( - layer, rbacc_per_time[time + 1] + rbatches.at(time - 1), direction), + layer, rbacc_per_time[time] + rbatches.at(time - 1), direction), GemmBackend_t::miopengemm); } @@ -976,17 +976,14 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, 1, // beta xDesc.GetType(), false}; + auto& bacc_per_time = rnn_direction::Forward ? fbacc_per_time : rbacc_per_time; - int cur_batch = direction == rnn_direction::Forward - ? time == 0 ? 0 : bacc_per_time[time - 1] - : rbacc_per_time[time]; + int cur_batch = time == 0 ? 0 : bacc_per_time[time - 1]; const auto hidden_offset = (time == 0) ? get_HxBuff_offset(layer, 0, direction) : RBuff.hidden_offset(layer, cur_batch, direction); - const int accumulated_batches = direction == rnn_direction::Forward - ? time == 0 ? 0 : bacc_per_time[time] - : rbacc_per_time[time + 1]; + const int accumulated_batches = time == 0 ? 0 : bacc_per_time[time]; const auto RB_batch_save_points_off = RBuff.gemm_write_offset(layer, accumulated_batches, direction); @@ -1005,59 +1002,57 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_THROW("GEMM execution failure"); }; - auto call_relu_tan_hidden_state_update = - [&RBuff, - &bacc_per_time, - &rbacc_per_time, - &batches, - &rbatches, - &handle, - &wDesc, - reserveSpace, - &activDesc](int layer_id, int time_id, int direction) { - float alpha = 1, beta = 0; - - auto cur_size = - direction == rnn_direction::Forward ? batches.at(time_id) : rbatches.at(time_id); - - const std::vector tensor_size{ - 1, static_cast(cur_size), static_cast(RBuff.hidden_size)}; + auto call_relu_tan_hidden_state_update = [&RBuff, + &fbacc_per_time, + &rbacc_per_time, + &fbatches, + &rbatches, + &handle, + &wDesc, + reserveSpace, + &activDesc](int layer_id, int time, int direction) { + float alpha = 1, beta = 0; - const std::vector tensor_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; + auto cur_size = direction == rnn_direction::Forward ? fbatches.at(time) : rbatches.at(time); - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + const std::vector tensor_size{ + 1, static_cast(cur_size), static_cast(RBuff.hidden_size)}; - auto cur_batch = direction == rnn_direction::Forward ? bacc_per_time[time_id] - : rbacc_per_time[time_id + 1]; + const std::vector tensor_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; - const auto RB_layer_save_points_off = - RBuff.gemm_write_offset(layer_id, cur_batch, direction); + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - const auto hidden_offset = RBuff.hidden_offset(layer_id, cur_batch, direction); + auto cur_batch = + direction == rnn_direction::Forward ? 
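Each of these GEMMs adds h_{t-1}, read from the reserve space or from hx for rows that have no previous step, times the transposed hidden-to-hidden matrix onto the pre-activations that the layer-input GEMM already produced (transB set, beta = 1). A naive-loop sketch of that update with flat row-major buffers and illustrative names:

#include <cstddef>
#include <vector>

void add_hidden_term(std::vector<float>& reserve,      // packed pre-activation rows (beta = 1 target)
                     const std::vector<float>& h_prev, // h_{t-1} rows, from the reserve table or hx
                     const std::vector<float>& w_hh,   // hidden_size x hidden_size, row-major
                     std::size_t dst_off, std::size_t dst_stride,
                     std::size_t src_off, std::size_t src_stride,
                     int rows, int hidden_size)
{
    for(int r = 0; r < rows; ++r)
        for(int j = 0; j < hidden_size; ++j)
        {
            float acc = 0.f;
            for(int k = 0; k < hidden_size; ++k)           // h_{t-1} dot row j of W_hh (transB)
                acc += h_prev[src_off + r * src_stride + k] * w_hh[j * hidden_size + k];
            reserve[dst_off + r * dst_stride + j] += acc;  // x*W_xh^T (+ bias) is already in place
        }
}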
fbacc_per_time[time] : rbacc_per_time[time]; - activDesc.Forward(handle, - &alpha, - // input tensor descriptor - src_desc, - // input pointer - reserveSpace, - &beta, - // output tensor descriptor - dst_desc, - // output pointer - reserveSpace, - // input tensor offset - RB_layer_save_points_off, - // output tensor offset - hidden_offset); - }; + const auto RB_layer_save_points_off = + RBuff.gemm_write_offset(layer_id, cur_batch, direction); + + const auto hidden_offset = RBuff.hidden_offset(layer_id, cur_batch, direction); + + activDesc.Forward(handle, + &alpha, + // input tensor descriptor + src_desc, + // input pointer + reserveSpace, + &beta, + // output tensor descriptor + dst_desc, + // output pointer + reserveSpace, + // input tensor offset + RB_layer_save_points_off, + // output tensor offset + hidden_offset); + }; auto call_relu_tan_update_output = [&RBuff, &get_HxBuff_offset, - &batches, + &fbatches, &rbatches, &handle, &wDesc, @@ -1065,16 +1060,16 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, reserveSpace, hy, max_batch, - &bacc_per_time, + &fbacc_per_time, &rbacc_per_time, seq_len](int layer_id, int time, int direction) { if(hy == nullptr) return; - auto& use_batches = direction == rnn_direction::Forward ? batches : rbatches; + auto& batches = direction == rnn_direction::Forward ? fbatches : rbatches; - auto copy_batch = time == seq_len - 1 ? use_batches.at(time) - : use_batches.at(time) - use_batches.at(time + 1); + auto copy_batch = time == seq_len - 1 ? batches.at(time) + : batches.at(time) - batches.at(time + 1); if(copy_batch <= 0) return; @@ -1089,13 +1084,13 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto hcy_layer_offset = get_HxBuff_offset(layer_id, 0, direction); auto hcy_batch_offset = - time == seq_len - 1 ? 0 : use_batches.at(time + 1) * RBuff.hidden_size; + time == seq_len - 1 ? 0 : batches.at(time + 1) * RBuff.hidden_size; auto accumulated_batch = - direction == rnn_direction::Forward ? bacc_per_time[time] : rbacc_per_time[time + 1]; + direction == rnn_direction::Forward ? fbacc_per_time[time] : rbacc_per_time[time]; auto batch_id_abs = - time == seq_len - 1 ? accumulated_batch : accumulated_batch + use_batches.at(time + 1); + time == seq_len - 1 ? 
accumulated_batch : accumulated_batch + batches.at(time + 1); const std::vector hcy_copy_size{ 1, static_cast(copy_batch), static_cast(RBuff.hidden_size)}; @@ -5613,7 +5608,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &fbacc_per_time, &rbacc_per_time, &propagate_dhx_prev](int layer) { - for(int time = 0; time < seqLen; ti++) + for(int time = 0; time < seqLen; time++) { propagate_dhx_prev(layer, time, rnn_direction::Forward); From 5cc2e095376d34d00f5e8596956bc9ebe081c38d Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Fri, 17 Nov 2023 17:28:25 +0300 Subject: [PATCH 14/27] Removed hidden_size from offset helpers --- src/include/miopen/rnn.hpp | 6 +- src/include/miopen/rnn_util.hpp | 86 +++-- src/ocl/rnnocl.cpp | 578 ++++++++++++++++---------------- 3 files changed, 324 insertions(+), 346 deletions(-) diff --git a/src/include/miopen/rnn.hpp b/src/include/miopen/rnn.hpp index eaf1028cdd..90eeb230be 100644 --- a/src/include/miopen/rnn.hpp +++ b/src/include/miopen/rnn.hpp @@ -222,8 +222,7 @@ struct RNNDescriptor : miopenRNNDescriptor const TensorDescriptor& yDesc, Data_t y, Data_t hy, - Data_t reserveSpace, - size_t reserveSpaceSize) const; + Data_t reserveSpace) const; void RNNForwardTrainingTanhRelu(Handle& handle, std::vector& seq_array, @@ -236,8 +235,7 @@ struct RNNDescriptor : miopenRNNDescriptor const TensorDescriptor& yDesc, Data_t y, Data_t hy, - Data_t reserveSpace, - size_t reserveSpaceSize) const; + Data_t reserveSpace) const; void RNNForwardTrainingPackedTensors(Handle& handle, int seqLen, diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index 4dfdce971c..08b26084a9 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -141,12 +141,12 @@ struct RNNWeightOffsets struct GruWeightOffsets : public RNNWeightOffsets { - GruWeightOffsets(int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_count) + GruWeightOffsets(int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_cnt) : weight_stride(matrixes::Count * hidden_vec_sz), in_vec_sz(input_vector_sz), h_vec_sz(hidden_vec_sz), num_layers(layers_cnt), - bias_count(bias_count) + bias_count(bias_cnt) { } @@ -172,8 +172,8 @@ struct GruWeightOffsets : public RNNWeightOffsets private: const int in_vec_sz, h_vec_sz; const int num_layers; - const int bi_scale = 0; - const int bias_count = 0; + [[maybe_unused]] const int bi_scale = 0; + const int bias_count = 0; enum matrixes { Z = 0, @@ -191,13 +191,13 @@ struct ReluWeightOffsets : public RNNWeightOffsets int hidden_vec_sz, int layers_cnt, int bias_mode, - int bi_scale, + int bi, int wei_stride) : weight_stride(wei_stride), in_vec_sz(input_vector_sz), h_vec_sz(hidden_vec_sz), num_layers(layers_cnt), - bi_scale(bi_scale), + bi_scale(bi), bias_count(bias_mode) { } @@ -230,6 +230,8 @@ struct ReluWeightOffsets : public RNNWeightOffsets private: const int in_vec_sz, h_vec_sz; + +public: const int num_layers; const int bi_scale = 1; const int bias_count = 0; @@ -244,12 +246,12 @@ struct LSTMWeightsBufferHelper : public RNNWeightOffsets public: LSTMWeightsBufferHelper( - int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode, int bi_scale) + int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode, int bi) : weight_stride(hidden_vec_sz * gates_cnt), in_vec_sz(input_vector_sz), h_vec_sz(hidden_vec_sz), num_layers(layers_cnt), - bi_scale(bi_scale), + bi_scale(bi), bias_cnt(bias_mode) { } @@ -310,11 +312,8 @@ struct RNNOffsets struct GRUOffsets : public RNNOffsets { public: - 
GRUOffsets(int in_vec_size, int hidden_size, int num_layers, int total_batch_size) - : hidden_size(hidden_size), - batches_per_layer(total_batch_size), - in_vec_size(in_vec_size), - num_layers(num_layers) + GRUOffsets(int h_vec_size, int layers_cnt, int total_batch_size) + : hidden_size(h_vec_size), batches_per_layer(total_batch_size), num_layers(layers_cnt) { } @@ -326,16 +325,18 @@ struct GRUOffsets : public RNNOffsets size_t gemm_write_stride() const { return save_point::Count * gemm_write_size(); } - size_t gemm_write_offset(int layer_id, int batch_num, int reverse = 0) const + size_t gemm_write_offset(int layer_id, int batch_num) const { return layer_offset(layer_id) + batch_num * gemm_write_stride(); } size_t hidden_offset() const { return save_point::Ht * gemm_write_size(); } +private: const int hidden_size; + +public: const int batches_per_layer; - const int in_vec_size; int r_offset() const { return save_point::R * gemm_write_size(); } @@ -369,35 +370,24 @@ struct ReluReserveBufferOffsets : public RNNOffsets }; private: - auto Reserve_Buffer_strides(int save_point_sz, - int batches_per_layer, - int layers, - int bidirect_mode = 0) const + auto Reserve_Buffer_strides(int save_point_sz, int batches_per_l, int layers_cnt) const { const auto element_st = 1; const auto save_point_st = element_st * save_point_sz; const auto batch_st = save_point_st; - const auto layer_st = static_cast(batch_st) * batches_per_layer; - const auto table_st = layers * layer_st; + const auto layer_st = static_cast(batch_st) * batches_per_l; + const auto table_st = layers_cnt * layer_st; return RBuffHelper{element_st, save_point_st, batch_st, layer_st, table_st}; } public: - ReluReserveBufferOffsets(int in_vec_size, - int hidden_vec_size, - int layers_cnt, - int batches_per_layer, - int max_batch, - bool bidirect_mode = 0) + ReluReserveBufferOffsets(int hidden_vec_size, int layers_cnt, int batches_per_l, int bi_scale) : hidden_size(hidden_vec_size), - batches_per_layer(batches_per_layer), - in_vec_size(in_vec_size), - save_point_size(bidirect_mode ? 
hidden_vec_size * 2 : hidden_vec_size), + batches_per_layer(batches_per_l), + save_point_size(hidden_vec_size * bi_scale), layers(layers_cnt), - max_batch(max_batch), - strides( - Reserve_Buffer_strides(save_point_size, batches_per_layer, layers_cnt, bidirect_mode)) + strides(Reserve_Buffer_strides(save_point_size, batches_per_l, layers_cnt)) { } @@ -412,24 +402,24 @@ struct ReluReserveBufferOffsets : public RNNOffsets size_t gemm_write_stride() const { return strides.batch; } - size_t gemm_write_offset(int layer_id, int batch_id, int reverse = 0) const + size_t gemm_write_offset(int layer_id, int batch_id, int reverse) const { return layer_offset(layer_id) + static_cast(gemm_write_stride()) * batch_id + reverse * hidden_size; } - size_t hidden_offset(int layer_id, int batch_id = 0, int reverse = 0) const + size_t hidden_offset(int layer_id, int batch_id, int reverse) const { - return strides.table + gemm_write_offset(layer_id, batch_id) + reverse * hidden_size; + return strides.table + gemm_write_offset(layer_id, batch_id, reverse); } +private: const int hidden_size; - const int batches_per_layer; - const int in_vec_size; +public: + const int batches_per_layer; const int save_point_size; const int layers; - const int max_batch; const RBuffHelper strides; }; @@ -444,7 +434,7 @@ struct LSTMReserveBufferHelper : public RNNOffsets private: static const int gates_cnt = 4; auto Reserve_Buffer_strides(int save_point_sz, - int batches_per_layer, + int batches_per_l, int save_points, int layers_cnt, int bidirect_mode = 0) const @@ -453,7 +443,7 @@ struct LSTMReserveBufferHelper : public RNNOffsets const auto save_point_st = element_st * save_point_sz; const auto batch_st = save_point_st * save_points; - const auto layer_st = static_cast(batch_st) * batches_per_layer; + const auto layer_st = static_cast(batch_st) * batches_per_l; const auto table_st = layer_st * layers_cnt; if(bidirect_mode == 0) @@ -476,12 +466,12 @@ struct LSTMReserveBufferHelper : public RNNOffsets LSTMReserveBufferHelper(int hidden_vec_size, int layers_cnt, - int batches_per_layer, - int in_vec_size, + int batches_per_l, + int in_vec_sz, int bidirect_mode = 0) : hidden_size(hidden_vec_size), - batches_per_layer(batches_per_layer), - in_vec_size(in_vec_size), + batches_per_layer(batches_per_l), + in_vec_size(in_vec_sz), save_point_size(bidirect_mode ? 
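ReluReserveBufferOffsets above addresses a reserve space made of two equally sized tables: the GEMM targets (pre-activations) come first and the activated hidden states follow at strides.table; within a table the data is laid out layer by layer, then packed row by row, with bi * hidden_size values per row. A condensed sketch of the same arithmetic with a worked example; the struct name ReserveLayout is illustrative.

#include <cassert>
#include <cstddef>

struct ReserveLayout
{
    int hidden_size, bi, layers, batches_per_layer;

    std::size_t row_stride() const { return static_cast<std::size_t>(hidden_size) * bi; }
    std::size_t layer_stride() const { return row_stride() * batches_per_layer; }
    std::size_t table_stride() const { return layer_stride() * layers; }

    std::size_t gemm_write_offset(int layer, int batch, int reverse) const
    {
        return layer * layer_stride() + batch * row_stride() + reverse * hidden_size;
    }
    std::size_t hidden_offset(int layer, int batch, int reverse) const
    {
        return table_stride() + gemm_write_offset(layer, batch, reverse); // the activated copy
    }
};

int main()
{
    const ReserveLayout r{/*hidden_size*/ 4, /*bi*/ 2, /*layers*/ 3, /*batches_per_layer*/ 8};
    assert(r.row_stride() == 8);
    assert(r.gemm_write_offset(1, 2, 1) == 1 * 64 + 2 * 8 + 4);
    assert(r.hidden_offset(0, 0, 0) == 3 * 64);
}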
hidden_vec_size * 2 : hidden_vec_size), layers(layers_cnt), strides(Reserve_Buffer_strides( @@ -495,12 +485,12 @@ struct LSTMReserveBufferHelper : public RNNOffsets int gemm_write_size() const { return save_point_size * gates_cnt; } size_t gemm_write_stride() const { return strides.batch; } - size_t gemm_write_offset(int layer, int batch, int reverse = 0) const + size_t gemm_write_offset(int layer, int batch) const { return layer_offset(layer) + static_cast(gemm_write_stride()) * batch; } - size_t hidden_offset(int layer, int batch, int reverse = 0) const + size_t hidden_offset(int layer, int batch) const { return gemm_write_offset(layer, batch) + save_point::Ht * save_point_size; } @@ -533,7 +523,7 @@ struct LSTMReserveBufferHelper : public RNNOffsets const int layers; const RBuffHelper strides; - auto st_offset(int layer, int batch_num, int reverse = 0) + auto st_offset(int layer, int batch_num) { return gemm_write_offset(layer, batch_num) + save_point::St * save_point_size; } diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index c53449897b..cff925387c 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -51,8 +51,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, const TensorDescriptor& yDesc, Data_t y, Data_t hy, - Data_t reserveSpace, - size_t reserveSpaceSize) const + Data_t reserveSpace) const { #if MIOPEN_USE_GEMM && MIOPEN_BACKEND_HIP int seq_len = seq_array.size(); @@ -85,7 +84,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, bacc_per_time[seq_len] = total_batch_size; - GRUOffsets RBuff(in_vec_size, hidden_size, nLayers, total_batch_size); + GRUOffsets RBuff(hidden_size, nLayers, total_batch_size); int bi = dirMode != 0u ? 2 : 1; @@ -96,7 +95,8 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, auto call_gru_input_gemm = [*this, &RBuff, &WeiBuf, - &in_vec_size, + hidden_size, + in_vec_size, &handle, &xDesc, &wDesc, @@ -105,7 +105,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, w](int layer_id, float beta_t = 1) { // n = Rx,Zx,Cx const int m = RBuff.batches_per_layer, n = WeiBuf.weight_stride, - k = layer_id > 0 ? RBuff.hidden_size : in_vec_size; + k = layer_id > 0 ? hidden_size : in_vec_size; const int lda = layer_id > 0 ? 
RBuff.gemm_write_stride() : in_vec_size, ldb = k, ldc = RBuff.gemm_write_stride(); @@ -146,13 +146,22 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, output_offset, GemmBackend_t::miopengemm); if(gemm_status != miopenStatusSuccess) - MIOPEN_THROW("GEMM execution failure"); + { + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } if(biasMode != 0u) { float alpha0 = 1; float alpha1 = 1; - float beta_t = 0; + beta_t = 0; const std::vector tensor_size{1, static_cast(RBuff.batches_per_layer), @@ -189,9 +198,8 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, output_offset); } - const std::vector tensor_size{1, - static_cast(RBuff.batches_per_layer), - static_cast(RBuff.hidden_size)}; + const std::vector tensor_size{ + 1, static_cast(RBuff.batches_per_layer), static_cast(hidden_size)}; const std::vector tensor_stride{static_cast(RBuff.layer_stride()), static_cast(RBuff.gemm_write_stride()), @@ -262,8 +270,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, RB_layer_out_off); }; - auto call_gru_hidden_gemm = [*this, - &RBuff, + auto call_gru_hidden_gemm = [&RBuff, &WeiBuf, &get_HxBuff_offset, &bacc_per_time, @@ -320,52 +327,55 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, result_offset, GemmBackend_t::miopengemm); if(gemm_status != miopenStatusSuccess) - MIOPEN_THROW("GEMM execution failure"); + { + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } }; - auto call_gru_activate_rz = [*this, - &RBuff, - &bacc_per_time, - &batches, - &handle, - &wDesc, - reserveSpace, - &sigDesc, - hidden_size](int layer_id, int time_id) { - float alpha = 1, beta = 0; - - const std::vector tensor_size{ - 1, static_cast(batches.at(time_id)), static_cast(hidden_size) * 2}; - - const std::vector tensor_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; + auto call_gru_activate_rz = + [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &sigDesc, hidden_size]( + int layer_id, int time_id) { + float alpha = 1, beta = 0; - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + const std::vector tensor_size{ + 1, static_cast(batches.at(time_id)), static_cast(hidden_size) * 2}; - auto r_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]); - auto r_act_offset = r_offset + RBuff.activated_offset(); + const std::vector tensor_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; - sigDesc.Forward(handle, - &alpha, - // input tensor descriptor - src_desc, - // input pointer - reserveSpace, - &beta, - // output tensor descriptor - dst_desc, - // output pointer - reserveSpace, - // input tensor offset - r_offset, - // output tensor offset - r_act_offset); - }; + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + + auto r_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]); + auto r_act_offset = r_offset + RBuff.activated_offset(); + + sigDesc.Forward(handle, + &alpha, + // input tensor descriptor + src_desc, + // input pointer + reserveSpace, + &beta, + // output tensor descriptor + dst_desc, + // output pointer + reserveSpace, + // 
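When biasMode is set, the OpTensor add in this lambda broadcasts a bias row from the weight buffer over every packed batch row the layer-input GEMM just produced. A plain-loop sketch of that broadcast, assuming flat row-major buffers; the names are illustrative only.

#include <cstddef>
#include <vector>

void add_bias_rows(std::vector<float>& out,        // packed rows written by the input GEMM
                   const std::vector<float>& bias, // one row of row_width values
                   std::size_t out_offset, std::size_t out_stride,
                   int rows, int row_width)
{
    for(int r = 0; r < rows; ++r)
        for(int c = 0; c < row_width; ++c)
            out[out_offset + r * out_stride + c] += bias[c];
}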
input tensor offset + r_offset, + // output tensor offset + r_act_offset); + }; auto call_gru_compute_c = - [*this, &RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, hidden_size]( + [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, hidden_size]( int layer_id, int time_id) { auto с_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset(); @@ -422,138 +432,70 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, с_offset); }; - auto call_gru_activate_c_gate = [*this, - &RBuff, - &bacc_per_time, - &batches, - &handle, - &wDesc, - reserveSpace, - &tanhDesc, - hidden_size](int layer_id, int time_id) { - float alpha = 1, beta = 0; - - const std::vector tensor_size{ - 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; - - const std::vector tensor_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; - - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - - auto c_offset = - RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset(); - auto c_act_offset = c_offset + RBuff.activated_offset(); + auto call_gru_activate_c_gate = + [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &tanhDesc, hidden_size]( + int layer_id, int time_id) { + float alpha = 1, beta = 0; - auto z_offset = - RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.z_offset(); - auto z_act_offset = z_offset + RBuff.activated_offset(); + const std::vector tensor_size{ + 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; - tanhDesc.Forward(handle, - &alpha, - // input tensor descriptor - src_desc, - // input pointer - reserveSpace, - &beta, - // output tensor descriptor - dst_desc, - // output pointer - reserveSpace, - // input tensor offset - c_offset, - // output tensor offset - c_act_offset); - }; + const std::vector tensor_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; - auto call_gru_compute_hidden = [*this, - &RBuff, - &bacc_per_time, - &batches, - &handle, - &wDesc, - reserveSpace, - &sigDesc, - hidden_size, - hx](int layer_id, int time_id) { - const std::vector tensor_size{ - 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - const std::vector tensor_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; + auto c_offset = + RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset(); + auto c_act_offset = c_offset + RBuff.activated_offset(); - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + tanhDesc.Forward(handle, + &alpha, + // input tensor descriptor + src_desc, + // input pointer + reserveSpace, + &beta, + // output tensor descriptor + dst_desc, + // output pointer + reserveSpace, + // input tensor offset + c_offset, + // output tensor offset + c_act_offset); + }; - auto hidden_offset = - RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.hidden_offset(); - auto zact_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + - RBuff.z_offset() + RBuff.activated_offset(); - auto cact_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + - RBuff.c_offset() + 
RBuff.activated_offset(); - - const std::vector hidden_tensor_size{ - 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; - const std::vector hidden_tensor_stride{ - static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; - auto hidden_tensor_desc = - miopen::TensorDescriptor(wDesc.GetType(), hidden_tensor_size, hidden_tensor_stride); - float alpha0 = -1, alpha1 = 1, beta = 0; + auto call_gru_compute_hidden = + [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, hidden_size, hx]( + int layer_id, int time_id) { + const std::vector tensor_size{ + 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; - OpTensor(handle, - miopenTensorOpMul, - &alpha0, - hidden_tensor_desc, - reserveSpace, - &alpha1, - hidden_tensor_desc, - reserveSpace, - &beta, - hidden_tensor_desc, - reserveSpace, - zact_offset, - cact_offset, - hidden_offset); + const std::vector tensor_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; - alpha0 = 1; - alpha1 = 1; - beta = 0; + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hidden_tensor_desc, - reserveSpace, - &alpha1, - hidden_tensor_desc, - reserveSpace, - &beta, - hidden_tensor_desc, - reserveSpace, - cact_offset, - hidden_offset, - hidden_offset); + auto hidden_offset = + RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.hidden_offset(); + auto zact_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + + RBuff.z_offset() + RBuff.activated_offset(); + auto cact_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + + RBuff.c_offset() + RBuff.activated_offset(); - if(time_id == 0) - { - const std::vector hx_tensor_size{ + const std::vector hidden_tensor_size{ 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; - const std::vector hx_tensor_stride{ - static_cast(batches.at(time_id) * hidden_size), - static_cast(hidden_size), + const std::vector hidden_tensor_stride{ + static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), 1}; - - auto hx_tensor_desc = - miopen::TensorDescriptor(wDesc.GetType(), hx_tensor_size, hx_tensor_stride); - auto hx_offset = batches.at(time_id) * hidden_size * layer_id; - - alpha0 = 1; - alpha1 = 1; - beta = 1; + auto hidden_tensor_desc = + miopen::TensorDescriptor(wDesc.GetType(), hidden_tensor_size, hidden_tensor_stride); + float alpha0 = -1, alpha1 = 1, beta = 0; OpTensor(handle, miopenTensorOpMul, @@ -561,26 +503,21 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, hidden_tensor_desc, reserveSpace, &alpha1, - hx_tensor_desc, - hx, + hidden_tensor_desc, + reserveSpace, &beta, hidden_tensor_desc, reserveSpace, zact_offset, - hx_offset, + cact_offset, hidden_offset); - } - else - { - auto hidden_prev_offset = - RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id - 1]) + - RBuff.hidden_offset(); + alpha0 = 1; alpha1 = 1; - beta = 1; + beta = 0; OpTensor(handle, - miopenTensorOpMul, + miopenTensorOpAdd, &alpha0, hidden_tensor_desc, reserveSpace, @@ -590,11 +527,67 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, &beta, hidden_tensor_desc, reserveSpace, - zact_offset, - hidden_prev_offset, + cact_offset, + hidden_offset, hidden_offset); - } - }; + + if(time_id == 0) + { + const std::vector hx_tensor_size{ + 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; + const std::vector hx_tensor_stride{ + static_cast(batches.at(time_id) * hidden_size), 
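Taken together, call_gru_activate_rz, call_gru_compute_c, call_gru_activate_c_gate and call_gru_compute_hidden assemble a standard GRU cell out of GEMM, OpTensor and activation calls. The single-row sketch below spells out the arithmetic in its textbook form; how the inputs are pre-summed into rx/zx/cx/ch, the names, and the plain std::vector buffers are assumptions for illustration, not the patch's exact data flow.

#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> gru_cell(const std::vector<float>& rx, // r-gate pre-activation (input + hidden part)
                            const std::vector<float>& zx, // z-gate pre-activation (input + hidden part)
                            const std::vector<float>& cx, // c-gate input part
                            const std::vector<float>& ch, // c-gate hidden part, kept apart for r gating
                            const std::vector<float>& h_prev)
{
    const std::size_t n = h_prev.size();
    std::vector<float> h(n);
    for(std::size_t i = 0; i < n; ++i)
    {
        const float r = 1.f / (1.f + std::exp(-rx[i])); // sigmoid, as in call_gru_activate_rz
        const float z = 1.f / (1.f + std::exp(-zx[i]));
        const float c = std::tanh(cx[i] + r * ch[i]);   // call_gru_compute_c + call_gru_activate_c_gate
        h[i] = (1.f - z) * c + z * h_prev[i];           // call_gru_compute_hidden
    }
    return h;
}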
+ static_cast(hidden_size), + 1}; + + auto hx_tensor_desc = + miopen::TensorDescriptor(wDesc.GetType(), hx_tensor_size, hx_tensor_stride); + auto hx_offset = batches.at(time_id) * hidden_size * layer_id; + + alpha0 = 1; + alpha1 = 1; + beta = 1; + + OpTensor(handle, + miopenTensorOpMul, + &alpha0, + hidden_tensor_desc, + reserveSpace, + &alpha1, + hx_tensor_desc, + hx, + &beta, + hidden_tensor_desc, + reserveSpace, + zact_offset, + hx_offset, + hidden_offset); + } + else + { + auto hidden_prev_offset = + RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id - 1]) + + RBuff.hidden_offset(); + alpha0 = 1; + alpha1 = 1; + beta = 1; + + OpTensor(handle, + miopenTensorOpMul, + &alpha0, + hidden_tensor_desc, + reserveSpace, + &alpha1, + hidden_tensor_desc, + reserveSpace, + &beta, + hidden_tensor_desc, + reserveSpace, + zact_offset, + hidden_prev_offset, + hidden_offset); + } + }; auto call_gru_update_output = [&RBuff, &get_HxBuff_offset, @@ -651,15 +644,7 @@ void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, } }; - auto call_gru_hidden_state_update = [&RBuff, - &bacc_per_time, - &batches, - &handle, - &wDesc, - reserveSpace, - &sigDesc, - hidden_size, - call_gru_activate_rz, + auto call_gru_hidden_state_update = [call_gru_activate_rz, call_gru_compute_c, call_gru_activate_c_gate, call_gru_compute_hidden](int layer_id, int time_id) { @@ -736,8 +721,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const TensorDescriptor& yDesc, Data_t y, Data_t hy, - Data_t reserveSpace, - size_t reserveSpaceSize) const + Data_t reserveSpace) const { #if MIOPEN_USE_GEMM && MIOPEN_BACKEND_HIP int seq_len = seq_array.size(); @@ -787,8 +771,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, }; ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2, bi, wei_stride); - ReluReserveBufferOffsets RBuff( - in_vec_size, hidden_size, nLayers, total_batch_size, max_batch, bi); + ReluReserveBufferOffsets RBuff(hidden_size, nLayers, total_batch_size, bi); ActivationDescriptor activDesc; @@ -802,12 +785,12 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, } auto call_relu_tan_input_gemm = - [&RBuff, &WeiBuf, &in_vec_size, &handle, &xDesc, reserveSpace, x, w, hidden_size]( - int layer, float beta_t = 1) { + [&RBuff, &WeiBuf, &in_vec_size, &handle, &xDesc, reserveSpace, x, w](int layer, + float beta_t = 1) { const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size(), - k = layer > 0 ? RBuff.gemm_write_size() : RBuff.in_vec_size; + k = layer > 0 ? RBuff.gemm_write_size() : in_vec_size; - const int lda = layer > 0 ? RBuff.gemm_write_stride() : RBuff.in_vec_size, ldb = k, + const int lda = layer > 0 ? RBuff.gemm_write_stride() : in_vec_size, ldb = k, ldc = RBuff.gemm_write_stride(); const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, @@ -845,7 +828,16 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, output_offset, GemmBackend_t::miopengemm); if(gemm_status != miopenStatusSuccess) - MIOPEN_THROW("GEMM execution failure"); + { + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } }; auto call_relu_tan_bias_add = @@ -899,6 +891,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto call_relu_tan_hidden_gemm = [&RBuff, &WeiBuf, + hidden_size, &get_HxBuff_offset, &fbacc_per_time, &rbacc_per_time, @@ -916,11 +909,11 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, ? fbatches.at(time) : time == 0 ? 
rbatches.at(0) : rbatches.at(time - 1); - const int n = RBuff.hidden_size, k = RBuff.hidden_size; + const int n = hidden_size, k = hidden_size; - const int lda = (time != 0) ? RBuff.gemm_write_stride() : RBuff.hidden_size; + const int lda = (time != 0) ? RBuff.gemm_write_stride() : hidden_size; - const int ldb = RBuff.hidden_size, ldc = RBuff.gemm_write_stride(); + const int ldb = hidden_size, ldc = RBuff.gemm_write_stride(); const auto ht_ptr = time > 0 ? reserveSpace : hx; @@ -934,7 +927,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, rbatches.at(time) - rbatches.at(time - 1), n, k, - RBuff.hidden_size, + hidden_size, ldb, ldc, 1, // batch count @@ -957,6 +950,17 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, RBuff.gemm_write_offset( layer, rbacc_per_time[time] + rbatches.at(time - 1), direction), GemmBackend_t::miopengemm); + if(gemm_status != miopenStatusSuccess) + { + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } } const miopen::GemmDescriptor gemm_desc_hx = GemmDescriptor{false, @@ -976,14 +980,16 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, 1, // beta xDesc.GetType(), false}; - auto& bacc_per_time = rnn_direction::Forward ? fbacc_per_time : rbacc_per_time; + auto& bacc_per_time = direction == rnn_direction::Forward ? fbacc_per_time : rbacc_per_time; int cur_batch = time == 0 ? 0 : bacc_per_time[time - 1]; const auto hidden_offset = (time == 0) ? get_HxBuff_offset(layer, 0, direction) : RBuff.hidden_offset(layer, cur_batch, direction); - const int accumulated_batches = time == 0 ? 0 : bacc_per_time[time]; + const int accumulated_batches = direction == rnn_direction::Forward + ? time == 0 ? 0 : bacc_per_time[time] + : rbacc_per_time[time]; const auto RB_batch_save_points_off = RBuff.gemm_write_offset(layer, accumulated_batches, direction); @@ -997,12 +1003,21 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, reserveSpace, RB_batch_save_points_off, GemmBackend_t::miopengemm); - if(gemm_status != miopenStatusSuccess) - MIOPEN_THROW("GEMM execution failure"); + { + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } }; auto call_relu_tan_hidden_state_update = [&RBuff, + hidden_size, &fbacc_per_time, &rbacc_per_time, &fbatches, @@ -1016,7 +1031,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto cur_size = direction == rnn_direction::Forward ? fbatches.at(time) : rbatches.at(time); const std::vector tensor_size{ - 1, static_cast(cur_size), static_cast(RBuff.hidden_size)}; + 1, static_cast(cur_size), static_cast(hidden_size)}; const std::vector tensor_stride{static_cast(RBuff.layer_stride()), static_cast(RBuff.gemm_write_stride()), @@ -1052,11 +1067,11 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto call_relu_tan_update_output = [&RBuff, &get_HxBuff_offset, + hidden_size, &fbatches, &rbatches, &handle, &wDesc, - &bi, reserveSpace, hy, max_batch, @@ -1068,8 +1083,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto& batches = direction == rnn_direction::Forward ? fbatches : rbatches; - auto copy_batch = time == seq_len - 1 ? batches.at(time) - : batches.at(time) - batches.at(time + 1); + auto copy_batch = + time == seq_len - 1 ? 
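// copy_batch counts the sequences that finish at this time step and therefore need their
// final hidden state copied into hy: every sequence still alive at the last step, and
// otherwise the difference between this step's batch and the next step's batch. A worked
// example, assuming batches = {4, 3, 3, 1} (descending-length packing):
//   time 0: 4 - 3 = 1 sequence ends,   time 1: 3 - 3 = 0,
//   time 2: 3 - 1 = 2 sequences end,   time 3 (last): all remaining 1.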
batches.at(time) : batches.at(time) - batches.at(time + 1); if(copy_batch <= 0) return; @@ -1077,14 +1092,10 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const std::vector hcy_src_stride{ RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; - const std::vector hcy_dst_stride{static_cast(RBuff.hidden_size * max_batch), - static_cast(RBuff.hidden_size), - 1}; - - auto hcy_layer_offset = get_HxBuff_offset(layer_id, 0, direction); + const std::vector hcy_dst_stride{ + static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; - auto hcy_batch_offset = - time == seq_len - 1 ? 0 : batches.at(time + 1) * RBuff.hidden_size; + auto batch_id_relative = batches.at(time) - copy_batch; auto accumulated_batch = direction == rnn_direction::Forward ? fbacc_per_time[time] : rbacc_per_time[time]; @@ -1093,7 +1104,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, time == seq_len - 1 ? accumulated_batch : accumulated_batch + batches.at(time + 1); const std::vector hcy_copy_size{ - 1, static_cast(copy_batch), static_cast(RBuff.hidden_size)}; + 1, static_cast(copy_batch), static_cast(hidden_size)}; auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_dst_stride); @@ -1104,7 +1115,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, dst_desc, hy, RBuff.hidden_offset(layer_id, batch_id_abs, direction), - hcy_layer_offset + hcy_batch_offset); + get_HxBuff_offset(layer_id, batch_id_relative, direction)); }; for(int layer_id = 0; layer_id < nLayers; layer_id++) @@ -1494,7 +1505,16 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, out_offset, GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) - MIOPEN_THROW("GEMM execution failure"); + { + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } }; auto call_bias_add = [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w](int layer, @@ -1607,7 +1627,16 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) - MIOPEN_THROW("GEMM execution failure"); + { + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } }; auto call_hidden_state_update = [&RBuff, @@ -3643,22 +3672,11 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( } if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && nLayers > 1 && - dirMode == miopenRNNunidirection && inputMode != miopenRNNskip && + // dirMode == miopenRNNunidirection && inputMode != miopenRNNskip && !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) { - RNNForwardTrainingTanhRelu(handle, - in_n, - xDesc[0], - x, - hxDesc, - hx, - wDesc, - w, - yDesc[0], - y, - hy, - reserveSpace, - reserveSpaceSize); + RNNForwardTrainingTanhRelu( + handle, in_n, xDesc[0], x, hxDesc, hx, wDesc, w, yDesc[0], y, hy, reserveSpace); if(is_profiling) { float eventTime_mS = RNNProfilingEnd(handle, start, stop); @@ -3672,19 +3690,8 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( if((rnnMode == miopenGRU) && !use_dropout && nLayers > 0 && dirMode == miopenRNNunidirection && inputMode != miopenRNNskip && !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) { - RNNForwardTrainingGRU(handle, - in_n, - xDesc[0], - x, - hxDesc, - hx, - wDesc, - w, - yDesc[0], - y, - hy, - reserveSpace, - 
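// Both experimental forward paths in this function are gated the same way: they are taken
// only when the MIOPEN_RNNFWD_exp environment switch has not been disabled, and otherwise
// execution falls through to the generic packed-tensor implementation further below.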
reserveSpaceSize); + RNNForwardTrainingGRU( + handle, in_n, xDesc[0], x, hxDesc, hx, wDesc, w, yDesc[0], y, hy, reserveSpace); if(is_profiling) { float eventTime_mS = RNNProfilingEnd(handle, start, stop); @@ -5224,7 +5231,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( ReluWeightOffsets WeiBuf(input_size, hidden_size, nLayers, biasMode * 2, bi, wei_stride); - ReluReserveBufferOffsets RBuff(input_size, hidden_size, nLayers, total_batches, max_batch, bi); + ReluReserveBufferOffsets RBuff(hidden_size, nLayers, total_batches, bi); auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { @@ -5309,6 +5316,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto propagate_hidden_output = [&RBuff, &handle, + hidden_size, &fbatches, &rbatches, &fbacc_per_time, @@ -5326,12 +5334,12 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( float alpha1 = 1; float beta_t = 0; - std::vector hx_stride{batches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; + std::vector hx_stride{batches.at(0) * hidden_size, hidden_size, 1}; std::vector reserve_stride{ static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hx_size{1, batches.at(time), RBuff.hidden_size}; - std::vector reserve_size{1, batches.at(time), RBuff.hidden_size}; + std::vector hx_size{1, batches.at(time), hidden_size}; + std::vector reserve_size{1, batches.at(time), hidden_size}; auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); @@ -5357,6 +5365,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto propagate_hidden_prev = [&RBuff, &handle, + hidden_size, &fbatches, &rbatches, &fbacc_per_time, @@ -5370,16 +5379,15 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( if(direction == rnn_direction::Forward && dhy != nullptr && fbatches.at(time) > fbatches.at(time + 1)) { - std::vector hx_stride{fbatches.at(0) * RBuff.hidden_size, RBuff.hidden_size, 1}; + std::vector hx_stride{fbatches.at(0) * hidden_size, hidden_size, 1}; std::vector reserve_stride{ static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hx_size{ - 1, fbatches.at(time) - fbatches.at(time + 1), RBuff.hidden_size}; + std::vector hx_size{1, fbatches.at(time) - fbatches.at(time + 1), hidden_size}; std::vector reserve_size{ - 1, fbatches.at(time) - fbatches.at(time + 1), RBuff.hidden_size}; + 1, fbatches.at(time) - fbatches.at(time + 1), hidden_size}; float alpha0 = 1; float alpha1 = 1; @@ -5418,10 +5426,10 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( false, false, used_batch, - RBuff.hidden_size, - RBuff.hidden_size, + hidden_size, + hidden_size, RBuff.gemm_write_size(), - RBuff.hidden_size, + hidden_size, RBuff.gemm_write_size(), 1, // batch count 0, // Stride A @@ -5456,9 +5464,9 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } }; - auto propagate_hidden_time = [*this, - &RBuff, + auto propagate_hidden_time = [&RBuff, &handle, + hidden_size, seqLen, &fbatches, &rbatches, @@ -5488,7 +5496,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto& bacc_per_time = direction == rnn_direction::Forward ? 
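// A rough sketch of the recurrence the propagate_hidden_* lambdas appear to implement
// (assuming the standard vanilla-RNN backward pass; stated here as a reading of the GEMM
// and OpTensor calls, not quoted from the patch): the workspace holds dh_t, which starts
// from dy_t, accumulates dhy for sequences that end at t, accumulates dh_{t+1} * W_h via
// the beta = 1 GEMM, and is finally scaled by the activation derivative recovered from the
// values saved in reserveSpace, i.e. roughly
//
//   dh_t = act'(pre_t) .* ( dy_t + dhy_t(ending rows) + dh_{t+1} * W_h )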
fbacc_per_time : rbacc_per_time; - std::vector reserve_size{1, batches.at(time), RBuff.hidden_size}; + std::vector reserve_size{1, batches.at(time), hidden_size}; auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); float alpha = 1, beta = 0; @@ -5510,19 +5518,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( RBuff.gemm_write_offset(layer, bacc_per_time[time], direction)); }; - auto propagate_hidden = [*this, - &RBuff, - &handle, - seqLen, - &fbatches, - &rbatches, - &fbacc_per_time, - &rbacc_per_time, - &rnn_data_type, - workSpace, - reserveSpace, - &activDesc, - propagate_hidden_time](int layer) { + auto propagate_hidden = [*this, seqLen, propagate_hidden_time](int layer) { for(int time = seqLen - 1; time >= 0; time--) { propagate_hidden_time(layer, time, rnn_direction::Forward); @@ -5535,6 +5531,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto propagate_dhx_prev = [&RBuff, &WeiBuf, &rnn_data_type, + hidden_size, &fbatches, &rbatches, &fbacc_per_time, @@ -5557,11 +5554,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( false, false, batch_size, - RBuff.hidden_size, - RBuff.hidden_size, + hidden_size, + hidden_size, RBuff.gemm_write_size(), - RBuff.hidden_size, - RBuff.hidden_size, + hidden_size, + hidden_size, 1, // batch count 0, // Stride A 0, // Stride B @@ -5600,14 +5597,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } }; - auto propagate_dhx = [*this, - seqLen, - &RBuff, - &fbatches, - &rbatches, - &fbacc_per_time, - &rbacc_per_time, - &propagate_dhx_prev](int layer) { + auto propagate_dhx = [*this, seqLen, &propagate_dhx_prev](int layer) { for(int time = 0; time < seqLen; time++) { propagate_dhx_prev(layer, time, rnn_direction::Forward); @@ -5634,7 +5624,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( false, total_batches, input_size, - RBuff.hidden_size * bi, + RBuff.gemm_write_size(), hy_stride, in_stride, in_stride, From 7ca8c32b6d2f87e25c6f689e2fedd0433e1eb744 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Fri, 17 Nov 2023 17:32:38 +0300 Subject: [PATCH 15/27] Enabled RNNForwardTrainingTanhRelu by default --- src/ocl/rnnocl.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index cff925387c..a4992e7f9c 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -3672,8 +3672,7 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( } if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && nLayers > 1 && - // dirMode == miopenRNNunidirection && inputMode != miopenRNNskip && - !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) + inputMode != miopenRNNskip && !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) { RNNForwardTrainingTanhRelu( handle, in_n, xDesc[0], x, hxDesc, hx, wDesc, w, yDesc[0], y, hy, reserveSpace); From e9f20d59c46080167db9291a2ae8093ebd0044b5 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Mon, 20 Nov 2023 21:06:32 +0300 Subject: [PATCH 16/27] RNN Relu and Tanh activation functions refactoring --- src/include/miopen/rnn.hpp | 13 - src/include/miopen/rnn_util.hpp | 273 +----------- src/ocl/rnnocl.cpp | 744 ++------------------------------ 3 files changed, 34 insertions(+), 996 deletions(-) diff --git a/src/include/miopen/rnn.hpp b/src/include/miopen/rnn.hpp index 61ac82e97b..bec3bbf9bb 100644 --- a/src/include/miopen/rnn.hpp +++ b/src/include/miopen/rnn.hpp @@ -261,19 +261,6 @@ struct RNNDescriptor : miopenRNNDescriptor Data_t reserveSpace, size_t reserveSpaceSize) const; - void 
RNNForwardTrainingGRU(Handle& handle, - std::vector& seq_array, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& hxDesc, - ConstData_t hx, - const TensorDescriptor& wDesc, - ConstData_t w, - const TensorDescriptor& yDesc, - Data_t y, - Data_t hy, - Data_t reserveSpace) const; - void RNNForwardTrainingTanhRelu(Handle& handle, std::vector& seq_array, const TensorDescriptor& xDesc, diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index 35d703dd1f..fba6f38a7e 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -140,51 +140,6 @@ struct RNNWeightOffsets int first_layer_offset() const; }; -struct GruWeightOffsets : public RNNWeightOffsets -{ - GruWeightOffsets(int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_cnt) - : weight_stride(matrixes::Count * hidden_vec_sz), - in_vec_sz(input_vector_sz), - h_vec_sz(hidden_vec_sz), - num_layers(layers_cnt), - bias_count(bias_cnt) - { - } - - int input_offset(int layer) - { - return layer == 0 ? 0 : first_layer_offset() + h_vec_sz * 2 * weight_stride * (layer - 1); - } - - int hidden_offset(int layer) - { - return layer == 0 ? input_offset(layer) + in_vec_sz * weight_stride - : input_offset(layer) + h_vec_sz * weight_stride; - } - - size_t bias_stride() { return matrixes::Count * h_vec_sz; } - int bias_off() - { - return (in_vec_sz + h_vec_sz + bias_count * h_vec_sz * (num_layers - 1)) * weight_stride; - } - int bias_off(int layer_id) { return bias_off() + layer_id * bias_count * weight_stride; } - int weight_stride; - -private: - const int in_vec_sz, h_vec_sz; - const int num_layers; - [[maybe_unused]] const int bi_scale = 0; - const int bias_count = 0; - enum matrixes - { - Z = 0, - R = 1, - C = 2, - Count = 3 - }; - int first_layer_offset() { return (in_vec_sz + h_vec_sz) * weight_stride; } -}; - struct ReluWeightOffsets : public RNNWeightOffsets { public: @@ -218,15 +173,15 @@ struct ReluWeightOffsets : public RNNWeightOffsets reverse * h_vec_sz * h_vec_sz; } - size_t bias_stride() { return h_vec_sz; } + size_t bias_stride() const { return h_vec_sz; } - int bias_off() + int bias_off() const { return first_layer_offset() + (h_vec_sz * bi_scale + h_vec_sz) * (num_layers - 1) * weight_stride; } - int bias_off(int layer_id) { return bias_off() + bias_count * layer_id * weight_stride; } + int bias_off(int layer_id) const { return bias_off() + bias_count * layer_id * weight_stride; } int weight_stride; private: @@ -240,61 +195,6 @@ struct ReluWeightOffsets : public RNNWeightOffsets int first_layer_offset() const { return (in_vec_sz + h_vec_sz) * weight_stride; } }; -struct LSTMWeightsBufferHelper : public RNNWeightOffsets -{ -public: - const int first_layer_offset() const { return (in_vec_sz + h_vec_sz) * weight_stride; } - -public: - LSTMWeightsBufferHelper( - int input_vector_sz, int hidden_vec_sz, int layers_cnt, int bias_mode, int bi) - : weight_stride(hidden_vec_sz * gates_cnt), - in_vec_sz(input_vector_sz), - h_vec_sz(hidden_vec_sz), - num_layers(layers_cnt), - bi_scale(bi), - bias_cnt(bias_mode) - { - } - - int input_weight_offset(int layer) const - { - return layer == 0 ? 0 - : first_layer_offset() + - (h_vec_sz + h_vec_sz * bi_scale) * weight_stride * (layer - 1); - } - - int hidden_weight_offset(int layer, int reverse = 0) const - { - return layer == 0 ? 
input_weight_offset(layer) + in_vec_sz * weight_stride + - reverse * h_vec_sz * h_vec_sz - : input_weight_offset(layer) + bi_scale * h_vec_sz * weight_stride + - reverse * h_vec_sz * h_vec_sz; - } - - size_t bias_stride() { return bias_vector_mul_gate(); } - - int bias_off() - { - return first_layer_offset() + - (h_vec_sz * bi_scale + h_vec_sz) * (num_layers - 1) * weight_stride; - } - - int bias_off(int layer_id) { return bias_off() + layer_id * bias_cnt * weight_stride; } - - size_t bias_vector_mul_gate() const { return h_vec_sz * gates_cnt; } - - const int weight_stride; - -private: - static const int gates_cnt = 4; - const int in_vec_sz; - const int h_vec_sz; - const int num_layers; - const int bi_scale = 1; - const int bias_cnt = 0; -}; - struct RNNOffsets { size_t layer_offset(int layer_id) const; @@ -310,58 +210,6 @@ struct RNNOffsets size_t hidden_offset(int layer_id, int batch_id = 0, int reverse = 0) const; }; -struct GRUOffsets : public RNNOffsets -{ -public: - GRUOffsets(int h_vec_size, int layers_cnt, int total_batch_size) - : hidden_size(h_vec_size), batches_per_layer(total_batch_size), num_layers(layers_cnt) - { - } - - size_t layer_offset(int layer_id) const { return layer_id * layer_stride(); } - - size_t layer_stride() const { return gemm_write_stride() * batches_per_layer; } - - int gemm_write_size() const { return hidden_size; } - - size_t gemm_write_stride() const { return save_point::Count * gemm_write_size(); } - - size_t gemm_write_offset(int layer_id, int batch_num) const - { - return layer_offset(layer_id) + batch_num * gemm_write_stride(); - } - - size_t hidden_offset() const { return save_point::Ht * gemm_write_size(); } - -private: - const int hidden_size; - -public: - const int batches_per_layer; - - int r_offset() const { return save_point::R * gemm_write_size(); } - - int z_offset() const { return save_point::Z * gemm_write_size(); } - - int c_offset() const { return save_point::С * gemm_write_size(); } - - int activated_offset() const { return layer_stride() * num_layers; } - - size_t network_stride() { return layer_stride() * num_layers; } - -private: - int num_layers; - - enum save_point - { - Z = 0, - R = 1, - С = 2, - Ht = 3, - Count = 4 - }; -}; - struct ReluReserveBufferOffsets : public RNNOffsets { struct RBuffHelper @@ -406,7 +254,7 @@ struct ReluReserveBufferOffsets : public RNNOffsets size_t gemm_write_offset(int layer_id, int batch_id, int reverse) const { return layer_offset(layer_id) + static_cast(gemm_write_stride()) * batch_id + - reverse * hidden_size; + (size_t)reverse * hidden_size; } size_t hidden_offset(int layer_id, int batch_id, int reverse) const @@ -424,119 +272,6 @@ struct ReluReserveBufferOffsets : public RNNOffsets const RBuffHelper strides; }; -struct LSTMReserveBufferHelper : public RNNOffsets -{ - struct RBuffHelper - { - int element, save_point, batch; - size_t layer, table; - }; - -private: - static const int gates_cnt = 4; - auto Reserve_Buffer_strides(int save_point_sz, - int batches_per_l, - int save_points, - int layers_cnt, - int bidirect_mode = 0) const - { - const auto element_st = bidirect_mode ? 
2 : 1; - - const auto save_point_st = element_st * save_point_sz; - const auto batch_st = save_point_st * save_points; - const auto layer_st = static_cast(batch_st) * batches_per_l; - const auto table_st = layer_st * layers_cnt; - - if(bidirect_mode == 0) - return RBuffHelper{element_st, save_point_st, batch_st, layer_st, table_st}; - - MIOPEN_THROW("execution failure: bidirect is not supported by this solver"); - } - -public: - enum save_point - { - F = 1, - I = 0, - G = 2, - O = 3, - St = 4, - Ht = 5, - Count = 6 - }; - - LSTMReserveBufferHelper(int hidden_vec_size, - int layers_cnt, - int batches_per_l, - int in_vec_sz, - int bidirect_mode = 0) - : hidden_size(hidden_vec_size), - batches_per_layer(batches_per_l), - in_vec_size(in_vec_sz), - save_point_size(bidirect_mode ? hidden_vec_size * 2 : hidden_vec_size), - layers(layers_cnt), - strides(Reserve_Buffer_strides( - save_point_size, batches_per_layer, save_point::Count, layers_cnt, 0)) - { - } - - size_t layer_offset(int layer) const { return static_cast(layer) * strides.layer; } - size_t layer_stride() const { return strides.layer; } - - int gemm_write_size() const { return save_point_size * gates_cnt; } - size_t gemm_write_stride() const { return strides.batch; } - - size_t gemm_write_offset(int layer, int batch) const - { - return layer_offset(layer) + static_cast(gemm_write_stride()) * batch; - } - - size_t hidden_offset(int layer, int batch) const - { - return gemm_write_offset(layer, batch) + save_point::Ht * save_point_size; - } - - const int hidden_size; - const int batches_per_layer; - const int in_vec_size; - - auto f_offset(int layer, int batch_num) const - { - return gemm_write_offset(layer, batch_num) + save_point::F * save_point_size; - } - - auto i_offset(int layer, int batch_num) const - { - return gemm_write_offset(layer, batch_num) + save_point::I * save_point_size; - } - - auto g_offset(int layer, int batch_num) const - { - return gemm_write_offset(layer, batch_num) + save_point::G * save_point_size; - } - - auto o_offset(int layer, int batch_num) const - { - return gemm_write_offset(layer, batch_num) + save_point::O * save_point_size; - } - - const int save_point_size; // for bidirect TODO - const int layers; - const RBuffHelper strides; - - auto st_offset(int layer, int batch_num) - { - return gemm_write_offset(layer, batch_num) + save_point::St * save_point_size; - } - - size_t extra_save_point_offset(int layer, int batch_num) const - { - return strides.table // all data offset - + static_cast(batches_per_layer) * layer * hidden_size + - static_cast(batch_num * hidden_size); - } -}; - struct RNNTensorPaddingConverter { static void ConvertTensorData(const Handle& handle, diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 980cf3d1c7..a33b734f7c 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -40,676 +40,6 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_RNNFWD_exp) namespace miopen { -void RNNDescriptor::RNNForwardTrainingGRU(Handle& handle, - std::vector& seq_array, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& hxDesc, - ConstData_t hx, - const TensorDescriptor& wDesc, - ConstData_t w, - const TensorDescriptor& yDesc, - Data_t y, - Data_t hy, - Data_t reserveSpace) const -{ -#if MIOPEN_USE_GEMM && MIOPEN_BACKEND_HIP - int seq_len = seq_array.size(); - if(seq_len == 0) - return; - - int max_batch = seq_array[0]; - int hidden_size; - std::tie(std::ignore, max_batch, hidden_size) = miopen::tien<3>(hxDesc.GetLengths()); - - int in_vec_size = xDesc.GetLengths()[1]; - int out_vec_size = 
yDesc.GetLengths()[1]; - - GruWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2); - - ActivationDescriptor sigDesc = {miopenActivationLOGISTIC, 1, 0, 1}; - ActivationDescriptor tanhDesc = {miopenActivationTANH, 1, 1, 1}; - - int total_batch_size = 0; - // accumulated batches per time - std::vector bacc_per_time(seq_len + 1); - std::vector batches; - - for(int i = 0; i < seq_len; i++) - { - bacc_per_time[i] = total_batch_size; - total_batch_size += seq_array[i]; - batches.push_back(seq_array[i]); - } - - bacc_per_time[seq_len] = total_batch_size; - - GRUOffsets RBuff(hidden_size, nLayers, total_batch_size); - - int bi = dirMode != 0u ? 2 : 1; - - auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int reverse = 0) { - return (static_cast(hidden_size) * max_batch) * (bi * layer_id + reverse); - }; - - auto call_gru_input_gemm = [*this, - &RBuff, - &WeiBuf, - hidden_size, - in_vec_size, - &handle, - &xDesc, - &wDesc, - reserveSpace, - x, - w](int layer_id, float beta_t = 1) { - // n = Rx,Zx,Cx - const int m = RBuff.batches_per_layer, n = WeiBuf.weight_stride, - k = layer_id > 0 ? hidden_size : in_vec_size; - - const int lda = layer_id > 0 ? RBuff.gemm_write_stride() : in_vec_size, ldb = k, - ldc = RBuff.gemm_write_stride(); - - const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, - false, - true, - m, - n, - k, - lda, - ldb, - ldc, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - beta_t, // beta - xDesc.GetType(), - false}; - - const auto input_weight_offset = WeiBuf.input_offset(layer_id); - const auto output_offset = RBuff.layer_offset(layer_id); - - const auto input_offset = - layer_id > 0 ? RBuff.gemm_write_offset(layer_id - 1, 0) + RBuff.hidden_offset() : 0; - - const auto input_ptr = layer_id > 0 ? 
reserveSpace : x; - - const miopenStatus_t gemm_status = CallGemm(handle, - gemm_desc, - input_ptr, - input_offset, - w, - input_weight_offset, - reserveSpace, - output_offset, - GemmBackend_t::rocblas); - if(gemm_status != miopenStatusSuccess) - { - if(gemm_status == miopenStatusNotImplemented) - { - MIOPEN_LOG_E("GEMM not implemented"); - } - else - { - MIOPEN_LOG_E("GEMM failed"); - } - } - - if(biasMode != 0u) - { - float alpha0 = 1; - float alpha1 = 1; - beta_t = 0; - - const std::vector tensor_size{1, - static_cast(RBuff.batches_per_layer), - static_cast(WeiBuf.weight_stride)}; - - const std::vector tensor_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; - - auto tensor_desc = - miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - - const std::vector weight_size{1, 1, static_cast(WeiBuf.weight_stride)}; - - const std::vector weight_stride{static_cast(WeiBuf.weight_stride), - static_cast(WeiBuf.weight_stride), - 1}; - - auto wei_desc = miopen::TensorDescriptor(wDesc.GetType(), weight_size, weight_stride); - - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - tensor_desc, - reserveSpace, - &alpha1, - wei_desc, - w, - &beta_t, - tensor_desc, - reserveSpace, - output_offset, - WeiBuf.bias_off(layer_id), - output_offset); - } - - const std::vector tensor_size{ - 1, static_cast(RBuff.batches_per_layer), static_cast(hidden_size)}; - - const std::vector tensor_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; - - auto desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - - float alpha0 = 0; - float alpha1 = 0; - beta_t = 0; - - CopyTensor(handle, - desc, - reserveSpace, - desc, - reserveSpace, - RBuff.layer_offset(layer_id) + RBuff.c_offset(), - RBuff.layer_offset(layer_id) + RBuff.hidden_offset()); - - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - desc, - reserveSpace, - &alpha1, - desc, - reserveSpace, - &beta_t, - desc, - reserveSpace, - RBuff.layer_offset(layer_id) + RBuff.c_offset(), - RBuff.layer_offset(layer_id) + RBuff.c_offset(), - RBuff.layer_offset(layer_id) + RBuff.c_offset()); - }; - - auto call_gru_bias_add = [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w](int layer_id, - float beta_t = 0) { - float alpha0 = 1; - float alpha1 = 1; - - const auto bias_desc = miopen::TensorDescriptor( - wDesc.GetType(), - std::vector{1, 1, WeiBuf.bias_stride()}, - std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); - - const auto hidden_interim_desc = miopen::TensorDescriptor( - wDesc.GetType(), - std::vector{ - 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_stride()}, - std::vector{ - static_cast(RBuff.layer_stride()), RBuff.gemm_write_stride(), 1}); - - const auto RB_layer_out_off = RBuff.layer_offset(layer_id); - - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hidden_interim_desc, - reserveSpace, - &alpha1, - bias_desc, - w, - &beta_t, - hidden_interim_desc, - reserveSpace, - RB_layer_out_off, - WeiBuf.bias_off(layer_id) + WeiBuf.weight_stride, - RB_layer_out_off); - }; - - auto call_gru_hidden_gemm = [&RBuff, - &WeiBuf, - &get_HxBuff_offset, - &bacc_per_time, - &batches, - &handle, - &xDesc, - reserveSpace, - hx, - w, - hidden_size](int layer, int cur_time) { - if(cur_time == 0 && hx == nullptr) - return; - - const int m = batches.at(cur_time), n = WeiBuf.weight_stride, k = hidden_size; - - const int lda = (cur_time == 0) ? 
hidden_size : RBuff.gemm_write_stride(); - - const int ldb = hidden_size, ldc = RBuff.gemm_write_stride(); - - const miopen::GemmDescriptor gemm_desc_hx = GemmDescriptor{false, - false, - true, - m, - n, - k, - lda, - ldb, - ldc, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - 1, // beta - xDesc.GetType(), - false}; - - const auto hidden_offset = - (cur_time == 0) ? get_HxBuff_offset(layer) - : RBuff.gemm_write_offset(layer, bacc_per_time[cur_time - 1]) + - RBuff.hidden_offset(); - - const auto ht_ptr = cur_time > 0 ? reserveSpace : hx; - - const auto result_offset = RBuff.gemm_write_offset(layer, bacc_per_time[cur_time]); - - const miopenStatus_t gemm_status = CallGemm(handle, - gemm_desc_hx, - ht_ptr, - hidden_offset, - w, - WeiBuf.hidden_offset(layer), - reserveSpace, - result_offset, - GemmBackend_t::rocblas); - if(gemm_status != miopenStatusSuccess) - { - if(gemm_status == miopenStatusNotImplemented) - { - MIOPEN_LOG_E("GEMM not implemented"); - } - else - { - MIOPEN_LOG_E("GEMM failed"); - } - } - }; - - auto call_gru_activate_rz = - [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &sigDesc, hidden_size]( - int layer_id, int time_id) { - float alpha = 1, beta = 0; - - const std::vector tensor_size{ - 1, static_cast(batches.at(time_id)), static_cast(hidden_size) * 2}; - - const std::vector tensor_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; - - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - - auto r_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]); - auto r_act_offset = r_offset + RBuff.activated_offset(); - - sigDesc.Forward(handle, - &alpha, - // input tensor descriptor - src_desc, - // input pointer - reserveSpace, - &beta, - // output tensor descriptor - dst_desc, - // output pointer - reserveSpace, - // input tensor offset - r_offset, - // output tensor offset - r_act_offset); - }; - - auto call_gru_compute_c = - [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, hidden_size]( - int layer_id, int time_id) { - auto с_offset = - RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset(); - auto hidden_offset = - RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.hidden_offset(); - auto hidden_act_offset = hidden_offset + RBuff.activated_offset(); - auto r_act_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + - RBuff.r_offset() + RBuff.activated_offset(); - - const std::vector tensor_size{ - 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; - - const std::vector tensor_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; - - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - auto desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - - CopyTensor(handle, desc, reserveSpace, desc, reserveSpace, с_offset, hidden_act_offset); - - float alpha0 = 1; - float alpha1 = 1; - float beta = 0; - - OpTensor(handle, - miopenTensorOpMul, - &alpha0, - desc, - reserveSpace, - &alpha1, - desc, - reserveSpace, - &beta, - desc, - reserveSpace, - r_act_offset, - с_offset, - с_offset); - - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - desc, - reserveSpace, - &alpha1, - desc, - reserveSpace, - &beta, - desc, - reserveSpace, - с_offset, - hidden_offset, - с_offset); - }; - - 
auto call_gru_activate_c_gate = - [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, &tanhDesc, hidden_size]( - int layer_id, int time_id) { - float alpha = 1, beta = 0; - - const std::vector tensor_size{ - 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; - - const std::vector tensor_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; - - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - - auto c_offset = - RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.c_offset(); - auto c_act_offset = c_offset + RBuff.activated_offset(); - - tanhDesc.Forward(handle, - &alpha, - // input tensor descriptor - src_desc, - // input pointer - reserveSpace, - &beta, - // output tensor descriptor - dst_desc, - // output pointer - reserveSpace, - // input tensor offset - c_offset, - // output tensor offset - c_act_offset); - }; - - auto call_gru_compute_hidden = - [&RBuff, &bacc_per_time, &batches, &handle, &wDesc, reserveSpace, hidden_size, hx]( - int layer_id, int time_id) { - const std::vector tensor_size{ - 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; - - const std::vector tensor_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; - - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - - auto hidden_offset = - RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + RBuff.hidden_offset(); - auto zact_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + - RBuff.z_offset() + RBuff.activated_offset(); - auto cact_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id]) + - RBuff.c_offset() + RBuff.activated_offset(); - - const std::vector hidden_tensor_size{ - 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; - const std::vector hidden_tensor_stride{ - static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; - auto hidden_tensor_desc = - miopen::TensorDescriptor(wDesc.GetType(), hidden_tensor_size, hidden_tensor_stride); - float alpha0 = -1, alpha1 = 1, beta = 0; - - OpTensor(handle, - miopenTensorOpMul, - &alpha0, - hidden_tensor_desc, - reserveSpace, - &alpha1, - hidden_tensor_desc, - reserveSpace, - &beta, - hidden_tensor_desc, - reserveSpace, - zact_offset, - cact_offset, - hidden_offset); - - alpha0 = 1; - alpha1 = 1; - beta = 0; - - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hidden_tensor_desc, - reserveSpace, - &alpha1, - hidden_tensor_desc, - reserveSpace, - &beta, - hidden_tensor_desc, - reserveSpace, - cact_offset, - hidden_offset, - hidden_offset); - - if(time_id == 0) - { - const std::vector hx_tensor_size{ - 1, static_cast(batches.at(time_id)), static_cast(hidden_size)}; - const std::vector hx_tensor_stride{ - static_cast(batches.at(time_id) * hidden_size), - static_cast(hidden_size), - 1}; - - auto hx_tensor_desc = - miopen::TensorDescriptor(wDesc.GetType(), hx_tensor_size, hx_tensor_stride); - auto hx_offset = batches.at(time_id) * hidden_size * layer_id; - - alpha0 = 1; - alpha1 = 1; - beta = 1; - - OpTensor(handle, - miopenTensorOpMul, - &alpha0, - hidden_tensor_desc, - reserveSpace, - &alpha1, - hx_tensor_desc, - hx, - &beta, - hidden_tensor_desc, - reserveSpace, - zact_offset, - hx_offset, - hidden_offset); - } - else - { - auto hidden_prev_offset = - RBuff.gemm_write_offset(layer_id, bacc_per_time[time_id - 1]) + 
- RBuff.hidden_offset(); - alpha0 = 1; - alpha1 = 1; - beta = 1; - - OpTensor(handle, - miopenTensorOpMul, - &alpha0, - hidden_tensor_desc, - reserveSpace, - &alpha1, - hidden_tensor_desc, - reserveSpace, - &beta, - hidden_tensor_desc, - reserveSpace, - zact_offset, - hidden_prev_offset, - hidden_offset); - } - }; - - auto call_gru_update_output = [&RBuff, - &get_HxBuff_offset, - &bacc_per_time, - &batches, - &handle, - &wDesc, - reserveSpace, - hy, - max_batch, - hidden_size, - seq_len](int layer_id) { - if(hy == nullptr) - return; - - auto hcy_layer_offset = get_HxBuff_offset(layer_id); - - const std::vector hcy_src_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; - const std::vector hcy_dst_stride{ - static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; - - for(int time_i = seq_len - 1; time_i >= 0; time_i--) - { - auto copy_batch = (time_i == seq_len - 1) ? batches.at(time_i) - : batches.at(time_i) - batches.at(time_i + 1); - if(copy_batch > 0) - { - auto batch_id_relative = batches.at(time_i) - copy_batch; - auto batch_id_abs = bacc_per_time[time_i] + batch_id_relative; - - auto hcy_batch_offset = batch_id_relative * hidden_size; - - auto src_batch_offset = - RBuff.gemm_write_offset(layer_id, batch_id_abs) + RBuff.hidden_offset(); - - const std::vector hcy_copy_size{ - 1, static_cast(copy_batch), static_cast(hidden_size)}; - - auto src_desc = - miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); - auto dst_desc = - miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_dst_stride); - - CopyTensor(handle, - src_desc, - reserveSpace, - dst_desc, - hy, - src_batch_offset, - hcy_layer_offset + hcy_batch_offset); - } - } - }; - - auto call_gru_hidden_state_update = [call_gru_activate_rz, - call_gru_compute_c, - call_gru_activate_c_gate, - call_gru_compute_hidden](int layer_id, int time_id) { - call_gru_activate_rz(layer_id, time_id); - call_gru_compute_c(layer_id, time_id); - call_gru_activate_c_gate(layer_id, time_id); - call_gru_compute_hidden(layer_id, time_id); - }; - - for(int layer_id = 0; layer_id < nLayers; layer_id++) - { - call_gru_input_gemm(layer_id); - if(biasMode != 0u) - { - call_gru_bias_add(layer_id); - } - for(int time = 0; time < seq_len; time++) - { - call_gru_hidden_gemm(layer_id, time); - call_gru_hidden_state_update(layer_id, time); - } - call_gru_update_output(layer_id); - } - - // output tensor copy - { - const std::vector y_copy_size{ - 1, static_cast(total_batch_size), static_cast(out_vec_size)}; - - const std::vector y_src_stride{ - RBuff.layer_stride(), static_cast(RBuff.gemm_write_stride()), 1}; - - const std::vector y_dst_stride{static_cast(out_vec_size * total_batch_size), - static_cast(out_vec_size), - 1}; - - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_src_stride); - auto y_dst_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_dst_stride); - - int src_offset = RBuff.layer_offset(nLayers - 1) + RBuff.hidden_offset(); - - CopyTensor(handle, src_desc, reserveSpace, y_dst_desc, y, src_offset, 0); - } - -#else - (void)handle; - (void)seq_array; - (void)xDesc; - (void)x; - (void)hxDesc; - (void)hx; - (void)cx; - (void)wDesc; - (void)w; - (void)yDesc; - (void)y; - (void)hy; - (void)cy; - (void)reserveSpace; - (void)reserveSpaceSize; - - MIOPEN_THROW("GEMM is not supported"); -#endif -} - void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, std::vector& seq_array, const TensorDescriptor& xDesc, @@ -767,7 +97,7 @@ void 
RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { return (static_cast(hidden_size) * (max_batch)) * (bi * layer_id + reverse) + - hidden_size * batch_id; + (size_t)hidden_size * batch_id; }; ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2, bi, wei_stride); @@ -905,9 +235,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, if(time == 0 && hx == nullptr) return; - const int m = direction == rnn_direction::Forward - ? fbatches.at(time) - : time == 0 ? rbatches.at(0) : rbatches.at(time - 1); + const int m = direction == rnn_direction::Forward ? fbatches.at(time) + : time == 0 ? rbatches.at(0) + : rbatches.at(time - 1); const int n = hidden_size, k = hidden_size; @@ -3650,9 +2980,8 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( #if MIOPEN_USE_GEMM && MIOPEN_BACKEND_HIP if(rnnMode == miopenLSTM && algoMode == miopenRNNdefault && !use_dropout && nLayers > 1 && - inputMode != miopenRNNskip && - !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{})) && xDesc[0].GetType() == miopenFloat && - seqLen >= 32) + inputMode != miopenRNNskip && !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{})) && + xDesc[0].GetType() == miopenFloat && seqLen >= 32) { RNNForwardTraining_MS(handle, in_n, @@ -3681,7 +3010,7 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( } if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && nLayers > 1 && - inputMode != miopenRNNskip && !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) + inputMode != miopenRNNskip && !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) { RNNForwardTrainingTanhRelu( handle, in_n, xDesc[0], x, hxDesc, hx, wDesc, w, yDesc[0], y, hy, reserveSpace); @@ -3695,20 +3024,6 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( return; } - if((rnnMode == miopenGRU) && !use_dropout && nLayers > 0 && dirMode == miopenRNNunidirection && - inputMode != miopenRNNskip && !(miopen::IsDisabled(MIOPEN_RNNFWD_exp{}))) - { - RNNForwardTrainingGRU( - handle, in_n, xDesc[0], x, hxDesc, hx, wDesc, w, yDesc[0], y, hy, reserveSpace); - if(is_profiling) - { - float eventTime_mS = RNNProfilingEnd(handle, start, stop); - handle.EnableProfiling(true); - handle.ResetKernelTime(); - handle.AccumKernelTime(eventTime_mS); - } - return; - } #endif // MIOPEN_USE_GEMM&& MIOPEN_BACKEND_HIP int in_stride = xDesc[0].GetLengths()[1]; @@ -5015,27 +4330,29 @@ void RNNDescriptor::RNNBackwardData(Handle& handle, workSpaceSize, reserveSpace, reserveSpaceSize); - } else { + } + else + { - RNNBackwardDataPackedTensors(handle, - seqLen, - dyDesc, - dy, - dhy, - dcy, - w, - hx, - cx, - dxDesc, - dx, - dhxDesc, - dhx, - dcxDesc, - dcx, - workSpace, - workSpaceSize, - reserveSpace, - reserveSpaceSize); + RNNBackwardDataPackedTensors(handle, + seqLen, + dyDesc, + dy, + dhy, + dcy, + w, + hx, + cx, + dxDesc, + dx, + dhxDesc, + dhx, + dcxDesc, + dcx, + workSpace, + workSpaceSize, + reserveSpace, + reserveSpaceSize); } } else @@ -5117,7 +4434,6 @@ void RNNDescriptor::RNNBackwardData(Handle& handle, #endif } - void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( Handle& handle, const int seqLen, @@ -5257,7 +4573,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { return (static_cast(hidden_size) * (max_batch)) * (bi * layer_id + reverse) + - hidden_size * batch_id; + static_cast(hidden_size) * batch_id; }; auto propagate_output = From 
8ebdcf47db11476866f7417c0434c21dcfc3dacf Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Fri, 1 Dec 2023 20:14:32 +0300 Subject: [PATCH 17/27] -no-hx --no-dhy --no-hy --no-dhx modes fix --- src/include/miopen/rnn_util.hpp | 2 +- src/ocl/rnnocl.cpp | 259 ++++++++++++++++++++++++-------- 2 files changed, 198 insertions(+), 63 deletions(-) diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index fba6f38a7e..4df4f0336d 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -173,7 +173,7 @@ struct ReluWeightOffsets : public RNNWeightOffsets reverse * h_vec_sz * h_vec_sz; } - size_t bias_stride() const { return h_vec_sz; } + size_t bias_stride() const { return static_cast(h_vec_sz) * bi_scale; } int bias_off() const { diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index a33b734f7c..d9fe56b017 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -170,39 +170,50 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, } }; - auto call_relu_tan_bias_add = - [&RBuff, &WeiBuf, &handle, &wDesc, reserveSpace, w](int layer, float beta_t = 0) { - float alpha0 = 1; - float alpha1 = 1; + auto call_relu_tan_bias_add = [*this, + &RBuff, + &WeiBuf, + &handle, + &wDesc, + reserveSpace, + w, + &hx, + &hidden_size, + &fbatches, + &seq_len](int layer, float beta_t = 0) { + float alpha0 = 1; + float alpha1 = 1; - const auto bias_desc = miopen::TensorDescriptor( - wDesc.GetType(), - std::vector{1, 1, WeiBuf.bias_stride()}, - std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); + auto bias_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{1, 1, WeiBuf.bias_stride()}, + std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); - const auto hidden_interim_desc = miopen::TensorDescriptor( - wDesc.GetType(), - std::vector{ - 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_stride()}, - std::vector{RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); + const auto hidden_interim_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{ + 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_stride()}, + std::vector{RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); - const auto RB_layer_out_off = RBuff.layer_offset(layer); + const auto RB_layer_out_off = RBuff.layer_offset(layer); - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hidden_interim_desc, - reserveSpace, // A - &alpha1, - bias_desc, - w, // B - &beta_t, - hidden_interim_desc, - reserveSpace, // C - RB_layer_out_off, // A offset - WeiBuf.bias_off(layer), // B offset - RB_layer_out_off); // C offset + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hidden_interim_desc, + reserveSpace, // A + &alpha1, + bias_desc, + w, // B + &beta_t, + hidden_interim_desc, + reserveSpace, // C + RB_layer_out_off, // A offset + WeiBuf.bias_off(layer), // B offset + RB_layer_out_off); // C offset + if(hx != nullptr) + { OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -217,7 +228,93 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, RB_layer_out_off, WeiBuf.bias_off(layer) + WeiBuf.bias_stride(), RB_layer_out_off); - }; + return; + } + + auto reserve_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{1, RBuff.batches_per_layer - fbatches.at(0), hidden_size}, + std::vector{RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); + + bias_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{1, 1, hidden_size}, + std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); + + OpTensor(handle, + miopenTensorOpAdd, 
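// hx == nullptr path: the second (hidden-to-hidden) bias is added with an offset of
// fbatches.at(0) * gemm_write_stride(), i.e. it skips exactly the rows of the first time
// step, and the reverse-direction handling below applies the same idea per time step.
// Presumably this mirrors the non-fused path, where a missing hx contributes neither a
// recurrent GEMM nor a recurrent bias at t == 0.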
+ &alpha0, + reserve_desc, + reserveSpace, + &alpha1, + bias_desc, + w, + &beta_t, + reserve_desc, + reserveSpace, + RB_layer_out_off + fbatches.at(0) * RBuff.gemm_write_stride(), + WeiBuf.bias_off(layer) + WeiBuf.bias_stride(), + RB_layer_out_off + fbatches.at(0) * RBuff.gemm_write_stride(), + true); + // Update time + + if(dirMode != 0u) + { + if(fbatches.at(0) == fbatches.at(seq_len - 1)) + { + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + reserve_desc, + reserveSpace, + &alpha1, + bias_desc, + w, + &beta_t, + reserve_desc, + reserveSpace, + RB_layer_out_off + hidden_size, + WeiBuf.bias_off(layer) + WeiBuf.bias_stride() + hidden_size, + RB_layer_out_off + hidden_size, + true); + // Update time + } + else + { + int cur_batch = 0; + for(int ti = 0; ti < seq_len; ti++) + { + if(ti != (seq_len - 1)) + { + auto offset = RB_layer_out_off + cur_batch * RBuff.gemm_write_stride(); + + reserve_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{1, fbatches.at(ti + 1), hidden_size}, + std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + reserve_desc, + reserveSpace, + &alpha1, + bias_desc, + w, + &beta_t, + reserve_desc, + reserveSpace, + static_cast(offset) + hidden_size, + WeiBuf.bias_off(layer) + WeiBuf.bias_stride() + hidden_size, + static_cast(offset) + hidden_size, + true); + // Update time + } + cur_batch += fbatches.at(ti); + } + } + } + }; auto call_relu_tan_hidden_gemm = [&RBuff, &WeiBuf, @@ -3058,6 +3155,7 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( x_stride[1] = in_stride; y_stride[0] = batch_n * out_stride; y_stride[1] = out_stride; + if(hy != nullptr || (rnnMode == miopenLSTM && cy != nullptr)) { hx_size[2] = hy_d * hy_n * hy_h; @@ -3077,6 +3175,7 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( profileRNNkernels(handle, 1, ctime); } } + hx_stride[0] = in_n.at(0) * uni_stride; hx_stride[1] = uni_stride; @@ -4314,7 +4413,8 @@ void RNNDescriptor::RNNBackwardData(Handle& handle, if(paddingMode == miopenRNNIONotPadded) { - if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH)) + bool use_dropout = !float_equal(miopen::deref(dropoutDesc).dropout, 0); + if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout) { RNNBackwardDataPackedTensorsRelu(handle, seqLen, @@ -4333,7 +4433,6 @@ void RNNDescriptor::RNNBackwardData(Handle& handle, } else { - RNNBackwardDataPackedTensors(handle, seqLen, dyDesc, @@ -4481,6 +4580,9 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( int bi = dirMode != 0u ? 
2 : 1; int wei_stride = hidden_size * bi * static_cast(nHiddenTensorsPerLayer); + int in_stride = input_size; + int out_stride = out_vec_size; + if(input_size <= 0 || hidden_size <= 0 || max_batch <= 0 || out_vec_size <= 0 || seqLen <= 0) { MIOPEN_THROW(miopenStatusBadParm); @@ -4934,7 +5036,10 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } }; - auto propagate_dhx = [*this, seqLen, &propagate_dhx_prev](int layer) { + auto propagate_dhx = [*this, seqLen, &propagate_dhx_prev, &dhx](int layer) { + if(dhx == nullptr) + return; + for(int time = 0; time < seqLen; time++) { propagate_dhx_prev(layer, time, rnn_direction::Forward); @@ -4953,38 +5058,72 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( propagate_dhx(li); } - int in_stride = input_size; int hy_stride = hidden_size * bi * static_cast(workspaceScale); - miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, - false, - false, - total_batches, - input_size, - RBuff.gemm_write_size(), - hy_stride, - in_stride, - in_stride, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - 0, // beta - rnn_data_type, - false}; - miopenStatus_t gemm_status = - CallGemm(handle, gemm_desc, workSpace, 0, w, 0, dx, 0, GemmBackend_t::rocblas); - - if(gemm_status != miopenStatusSuccess) + if(inputMode == miopenRNNskip) { - if(gemm_status == miopenStatusNotImplemented) + std::vector o_size{1, total_batch_size, hidden_size}; + std::vector o_stride{total_batch_size * out_stride, out_stride, 1}; + std::vector r_size{1, total_batch_size, hidden_size}; + std::vector r_stride{total_batch_size * in_stride, in_stride, 1}; + + auto x_desc = miopen::TensorDescriptor(rnn_data_type, r_size, r_stride); + auto sp_desc = miopen::TensorDescriptor(rnn_data_type, o_size, o_stride); + + float alpha0 = 1; + float alpha1 = 1; + float beta_t = 0; + + for(int gi = 0; gi < nHiddenTensorsPerLayer * bi; gi++) { - MIOPEN_LOG_E("GEMM not implemented"); + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + sp_desc, + workSpace, + &alpha1, + x_desc, + dx, + &beta_t, + x_desc, + dx, + static_cast(gi) * hidden_size, + 0, + 0); } - else + } + else + { + miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + false, + total_batches, + input_size, + RBuff.gemm_write_size(), + hy_stride, + in_stride, + in_stride, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 0, // beta + rnn_data_type, + false}; + miopenStatus_t gemm_status = + CallGemm(handle, gemm_desc, workSpace, 0, w, 0, dx, 0, GemmBackend_t::rocblas); + + if(gemm_status != miopenStatusSuccess) { - MIOPEN_LOG_E("GEMM failed"); + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } } } @@ -5357,7 +5496,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensors( sp_size[2] = hy_h; hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); sp_desc = miopen::TensorDescriptor(rnn_data_type, sp_size, sp_stride); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -5390,7 +5528,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensors( sp_size[2] = hy_h; hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); sp_desc = miopen::TensorDescriptor(rnn_data_type, sp_size, sp_stride); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -5482,7 +5619,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensors( 1, // beta rnn_data_type, false}; - miopenStatus_t gemm_status = CallGemm(handle, gemm_desc, @@ -6367,7 +6503,6 @@ void 
RNNDescriptor::RNNBackwardDataPackedTensors( // Update time profileRNNkernels(handle, 1, ctime); } - miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, false, From 174501264f780381d7ea104fb19b4171b35dd94b Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Mon, 18 Dec 2023 18:25:53 +0300 Subject: [PATCH 18/27] Pull request 2541 review fixes --- src/include/miopen/rnn_util.hpp | 8 +- src/ocl/rnnocl.cpp | 434 +++++++++++++++----------------- 2 files changed, 202 insertions(+), 240 deletions(-) diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index 4df4f0336d..42df311ae1 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -148,8 +148,8 @@ struct ReluWeightOffsets : public RNNWeightOffsets int layers_cnt, int bias_mode, int bi, - int wei_stride) - : weight_stride(wei_stride), + int nHiddenTensorsPerLayer) + : weight_stride(hidden_vec_sz * bi * nHiddenTensorsPerLayer), in_vec_sz(input_vector_sz), h_vec_sz(hidden_vec_sz), num_layers(layers_cnt), @@ -231,10 +231,10 @@ struct ReluReserveBufferOffsets : public RNNOffsets } public: - ReluReserveBufferOffsets(int hidden_vec_size, int layers_cnt, int batches_per_l, int bi_scale) + ReluReserveBufferOffsets(int hidden_vec_size, int layers_cnt, int batches_per_l, int bi_scale, int workspace_scale) : hidden_size(hidden_vec_size), batches_per_layer(batches_per_l), - save_point_size(hidden_vec_size * bi_scale), + save_point_size(hidden_vec_size * bi_scale * workspace_scale), layers(layers_cnt), strides(Reserve_Buffer_strides(save_point_size, batches_per_l, layers_cnt)) { diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index fc7fd34f01..be90043040 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -58,8 +58,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, if(seq_len == 0) return; - std::vector fbatches; - std::vector rbatches; + std::vector batches; int in_vec_size = xDesc.GetLengths()[1]; int out_vec_size = yDesc.GetLengths()[1]; @@ -70,29 +69,18 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, int total_batch_size = 0; // accumulated batches per time - std::vector fbacc_per_time(seq_len + 1); - std::vector rbacc_per_time(seq_len + 1); + std::vector bacc_per_time(seq_len + 1); for(int i = 0; i < seq_len; i++) { - fbacc_per_time[i] = total_batch_size; + bacc_per_time[i] = total_batch_size; total_batch_size += seq_array[i]; - fbatches.push_back(seq_array[i]); + batches.push_back(seq_array[i]); } - int rtotal_batch_size = total_batch_size; - for(int i = seq_len - 1; i >= 0; i--) - { - rtotal_batch_size -= fbatches[i]; - rbacc_per_time[seq_len - 1 - i] = rtotal_batch_size; - rbatches.push_back(fbatches[i]); - } - - fbacc_per_time[seq_len] = total_batch_size; - rbacc_per_time[seq_len] = 0; + bacc_per_time[seq_len] = total_batch_size; int bi = dirMode != 0u ? 
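// bacc_per_time[i] is the prefix sum of the per-time batch sizes, i.e. the row at which
// time step i starts inside the packed [total_batch_size x hidden] reserve buffer.
// Worked example, assuming seq_array = {4, 3, 3, 1}:
//   batches       = {4, 3, 3, 1}
//   bacc_per_time = {0, 4, 7, 10, 11}   (last entry == total_batch_size)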
2 : 1; - int wei_stride = hidden_size * bi * static_cast(nHiddenTensorsPerLayer); auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { @@ -100,8 +88,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, (size_t)hidden_size * batch_id; }; - ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2, bi, wei_stride); - ReluReserveBufferOffsets RBuff(hidden_size, nLayers, total_batch_size, bi); + ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2, bi, nHiddenTensorsPerLayer); + ReluReserveBufferOffsets RBuff(hidden_size, nLayers, total_batch_size, bi, workspaceScale); ActivationDescriptor activDesc; @@ -179,7 +167,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, w, &hx, &hidden_size, - &fbatches, + &batches, + &bacc_per_time, &seq_len](int layer, float beta_t = 0) { float alpha0 = 1; float alpha1 = 1; @@ -195,8 +184,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_stride()}, std::vector{RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); - const auto RB_layer_out_off = RBuff.layer_offset(layer); - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -207,10 +194,10 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, w, // B &beta_t, hidden_interim_desc, - reserveSpace, // C - RB_layer_out_off, // A offset - WeiBuf.bias_off(layer), // B offset - RB_layer_out_off); // C offset + reserveSpace, // C + RBuff.layer_offset(layer), // A offset + WeiBuf.bias_off(layer), // B offset + RBuff.layer_offset(layer)); // C offset if(hx != nullptr) { @@ -225,15 +212,15 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &beta_t, hidden_interim_desc, reserveSpace, - RB_layer_out_off, + RBuff.layer_offset(layer), WeiBuf.bias_off(layer) + WeiBuf.bias_stride(), - RB_layer_out_off); + RBuff.layer_offset(layer)); return; } auto reserve_desc = miopen::TensorDescriptor( wDesc.GetType(), - std::vector{1, RBuff.batches_per_layer - fbatches.at(0), hidden_size}, + std::vector{1, RBuff.batches_per_layer - batches.at(0), hidden_size}, std::vector{RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); bias_desc = miopen::TensorDescriptor( @@ -252,67 +239,59 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &beta_t, reserve_desc, reserveSpace, - RB_layer_out_off + fbatches.at(0) * RBuff.gemm_write_stride(), + RBuff.layer_offset(layer) + batches.at(0) * RBuff.gemm_write_stride(), WeiBuf.bias_off(layer) + WeiBuf.bias_stride(), - RB_layer_out_off + fbatches.at(0) * RBuff.gemm_write_stride(), + RBuff.layer_offset(layer) + batches.at(0) * RBuff.gemm_write_stride(), true); - // Update time - if(dirMode != 0u) + if(dirMode == 0u) + return; + + if(batches.at(0) == batches.at(seq_len - 1)) { - if(fbatches.at(0) == fbatches.at(seq_len - 1)) - { - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - reserve_desc, - reserveSpace, - &alpha1, - bias_desc, - w, - &beta_t, - reserve_desc, - reserveSpace, - RB_layer_out_off + hidden_size, - WeiBuf.bias_off(layer) + WeiBuf.bias_stride() + hidden_size, - RB_layer_out_off + hidden_size, - true); - // Update time - } - else - { - int cur_batch = 0; - for(int ti = 0; ti < seq_len; ti++) - { - if(ti != (seq_len - 1)) - { - auto offset = RB_layer_out_off + cur_batch * RBuff.gemm_write_stride(); + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + reserve_desc, + reserveSpace, + &alpha1, + bias_desc, + w, + &beta_t, + reserve_desc, + reserveSpace, + RBuff.layer_offset(layer) + 
hidden_size, + WeiBuf.bias_off(layer) + WeiBuf.bias_stride() + hidden_size, + RBuff.layer_offset(layer) + hidden_size, + true); + return; + } - reserve_desc = miopen::TensorDescriptor( - wDesc.GetType(), - std::vector{1, fbatches.at(ti + 1), hidden_size}, - std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); + for(int ti = 0; ti < seq_len - 1; ti++) + { - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - reserve_desc, - reserveSpace, - &alpha1, - bias_desc, - w, - &beta_t, - reserve_desc, - reserveSpace, - static_cast(offset) + hidden_size, - WeiBuf.bias_off(layer) + WeiBuf.bias_stride() + hidden_size, - static_cast(offset) + hidden_size, - true); - // Update time - } - cur_batch += fbatches.at(ti); - } - } + auto offset = RBuff.layer_offset(layer) + bacc_per_time[ti] * RBuff.gemm_write_stride(); + + reserve_desc = miopen::TensorDescriptor( + wDesc.GetType(), + std::vector{1, batches.at(ti + 1), hidden_size}, + std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); + + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + reserve_desc, + reserveSpace, + &alpha1, + bias_desc, + w, + &beta_t, + reserve_desc, + reserveSpace, + static_cast(offset) + hidden_size, + WeiBuf.bias_off(layer) + WeiBuf.bias_stride() + hidden_size, + static_cast(offset) + hidden_size, + true); } }; @@ -320,21 +299,23 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &WeiBuf, hidden_size, &get_HxBuff_offset, - &fbacc_per_time, - &rbacc_per_time, - &fbatches, - &rbatches, + &bacc_per_time, + &batches, &handle, &xDesc, reserveSpace, hx, + seq_len, w](int layer, int time, int direction) { if(time == 0 && hx == nullptr) return; - const int m = direction == rnn_direction::Forward ? fbatches.at(time) - : time == 0 ? rbatches.at(0) - : rbatches.at(time - 1); + const int cur_time = direction == rnn_direction::Forward ? time : seq_len - 1 - time; + const int prev_time = direction == rnn_direction::Forward ? cur_time - 1 : cur_time + 1; + + const int m = direction == rnn_direction::Forward ? batches.at(cur_time) + : time == 0 ? batches.at(cur_time) + : batches.at(prev_time); const int n = hidden_size, k = hidden_size; @@ -345,13 +326,13 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const auto ht_ptr = time > 0 ? reserveSpace : hx; if(time != 0 && direction == rnn_direction::Backward && hx != nullptr && - rbatches.at(time) > rbatches.at(time - 1)) + batches.at(cur_time) > batches.at(prev_time)) { miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, false, true, - rbatches.at(time) - rbatches.at(time - 1), + batches.at(cur_time) - batches.at(prev_time), n, k, hidden_size, @@ -370,13 +351,14 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, CallGemm(handle, gemm_desc, hx, - get_HxBuff_offset(layer, rbatches.at(time - 1), direction), + get_HxBuff_offset(layer, batches.at(prev_time), direction), w, WeiBuf.hidden_weight_offset(layer, direction), reserveSpace, RBuff.gemm_write_offset( - layer, rbacc_per_time[time] + rbatches.at(time - 1), direction), + layer, bacc_per_time[cur_time] + batches.at(prev_time), direction), GemmBackend_t::rocblas); + if(gemm_status != miopenStatusSuccess) { if(gemm_status == miopenStatusNotImplemented) @@ -407,19 +389,13 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, 1, // beta xDesc.GetType(), false}; - auto& bacc_per_time = direction == rnn_direction::Forward ? fbacc_per_time : rbacc_per_time; - int cur_batch = time == 0 ? 0 : bacc_per_time[time - 1]; + const auto hidden_offset = + (time == 0) ? 
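// At the first step of a direction the recurrent GEMM reads the user-supplied
// initial hidden state (ht_ptr == hx); at every later step it reads the activated
// hidden state of the neighbouring time step (t - 1 for the forward direction,
// t + 1 for the reverse one) from the ht table of the reserve buffer.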
get_HxBuff_offset(layer, 0, direction) + : RBuff.hidden_offset(layer, bacc_per_time[prev_time], direction); - const auto hidden_offset = (time == 0) ? get_HxBuff_offset(layer, 0, direction) - : RBuff.hidden_offset(layer, cur_batch, direction); - - const int accumulated_batches = direction == rnn_direction::Forward - ? time == 0 ? 0 : bacc_per_time[time] - : rbacc_per_time[time]; - - const auto RB_batch_save_points_off = - RBuff.gemm_write_offset(layer, accumulated_batches, direction); + const auto save_point_offset = + RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction); const miopenStatus_t gemm_status = CallGemm(handle, gemm_desc_hx, @@ -428,7 +404,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, w, WeiBuf.hidden_weight_offset(layer, direction), reserveSpace, - RB_batch_save_points_off, + save_point_offset, GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) { @@ -445,20 +421,19 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto call_relu_tan_hidden_state_update = [&RBuff, hidden_size, - &fbacc_per_time, - &rbacc_per_time, - &fbatches, - &rbatches, + &bacc_per_time, + &batches, &handle, &wDesc, reserveSpace, - &activDesc](int layer_id, int time, int direction) { + &activDesc, + seq_len](int layer_id, int time, int direction) { float alpha = 1, beta = 0; - auto cur_size = direction == rnn_direction::Forward ? fbatches.at(time) : rbatches.at(time); + const int cur_time = direction == rnn_direction::Forward ? time : seq_len - time - 1; const std::vector tensor_size{ - 1, static_cast(cur_size), static_cast(hidden_size)}; + 1, static_cast(batches[cur_time]), static_cast(hidden_size)}; const std::vector tensor_stride{static_cast(RBuff.layer_stride()), static_cast(RBuff.gemm_write_stride()), @@ -467,13 +442,11 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - auto cur_batch = - direction == rnn_direction::Forward ? fbacc_per_time[time] : rbacc_per_time[time]; - const auto RB_layer_save_points_off = - RBuff.gemm_write_offset(layer_id, cur_batch, direction); + RBuff.gemm_write_offset(layer_id, bacc_per_time[cur_time], direction); - const auto hidden_offset = RBuff.hidden_offset(layer_id, cur_batch, direction); + const auto hidden_offset = + RBuff.hidden_offset(layer_id, bacc_per_time[cur_time], direction); activDesc.Forward(handle, &alpha, @@ -495,25 +468,24 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto call_relu_tan_update_output = [&RBuff, &get_HxBuff_offset, hidden_size, - &fbatches, - &rbatches, + &batches, &handle, &wDesc, reserveSpace, hy, max_batch, - &fbacc_per_time, - &rbacc_per_time, + &bacc_per_time, seq_len](int layer_id, int time, int direction) { if(hy == nullptr) return; - auto& batches = direction == rnn_direction::Forward ? fbatches : rbatches; + auto cur_time = direction == rnn_direction::Forward ? time : seq_len - 1 - time; + auto next_time = direction == rnn_direction::Forward ? cur_time + 1 : cur_time - 1; - auto copy_batch = - time == seq_len - 1 ? batches.at(time) : batches.at(time) - batches.at(time + 1); + auto copy_batches = time == seq_len - 1 ? 
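// copy_batches below counts the sequences that end at this time step, i.e. the
// rows whose final hidden state has to be written into hy now. With hypothetical
// batches = {3, 3, 2, 1}, the forward pass copies 0, 1, 1 and 1 rows at times
// 0..3, which together covers all three sequences exactly once.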
batches.at(cur_time) + : batches.at(cur_time) - batches.at(next_time); - if(copy_batch <= 0) + if(copy_batches <= 0) return; const std::vector hcy_src_stride{ @@ -522,16 +494,13 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const std::vector hcy_dst_stride{ static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; - auto batch_id_relative = batches.at(time) - copy_batch; + auto batch_id_relative = batches.at(cur_time) - copy_batches; - auto accumulated_batch = - direction == rnn_direction::Forward ? fbacc_per_time[time] : rbacc_per_time[time]; - - auto batch_id_abs = - time == seq_len - 1 ? accumulated_batch : accumulated_batch + batches.at(time + 1); + auto batch_id_abs = time == seq_len - 1 ? bacc_per_time[cur_time] + : bacc_per_time[cur_time] + batches.at(next_time); const std::vector hcy_copy_size{ - 1, static_cast(copy_batch), static_cast(hidden_size)}; + 1, static_cast(copy_batches), static_cast(hidden_size)}; auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_dst_stride); @@ -4550,10 +4519,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( { #if MIOPEN_USE_GEMM - // reset kernel timer - // if projections supported, dcxDesc.GetLengths()[2] should be used for hidden_size, - // dhxDesc.GetLengths()[2] for proj_size. - if(paddingMode != miopenRNNIONotPadded) { MIOPEN_THROW("Padded IO is not supported by this solver"); @@ -4571,13 +4536,15 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto rnn_data_type = dhxDesc.GetType(); - std::vector fbatches; + std::vector batches; + std::vector bacc_per_time(seqLen + 1); + int input_size = dxDesc[0].GetLengths()[1]; - int max_batch = dhxDesc.GetLengths()[1]; + int hy_d = dhxDesc.GetLengths()[0]; + int max_batch = dhxDesc.GetLengths()[1]; int hidden_size = dhxDesc.GetLengths()[2]; int out_vec_size = dyDesc[0].GetLengths()[1]; int bi = dirMode != 0u ? 2 : 1; - int wei_stride = hidden_size * bi * static_cast(nHiddenTensorsPerLayer); int in_stride = input_size; int out_stride = out_vec_size; @@ -4587,7 +4554,25 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( MIOPEN_THROW(miopenStatusBadParm); } - int total_batches = 0; + float beta = 0; + + auto workSpaceDataTypeSize = workSpaceSize / GetTypeSize(rnn_data_type); + auto reservespace_desc = + miopen::TensorDescriptor(rnn_data_type, + std::vector{1, 1, workSpaceDataTypeSize}, + std::vector{workSpaceDataTypeSize, workSpaceDataTypeSize, 1}); + SetTensor(handle, reservespace_desc, workSpace, &beta); + + if(dhx != nullptr) + { + int dhx_size = max_batch * hidden_size * hy_d; + auto hx_desc = miopen::TensorDescriptor(rnn_data_type, + std::vector{1, 1, dhx_size}, + std::vector{dhx_size, dhx_size, 1}); + SetTensor(handle, hx_desc, dhx, &beta); + } + + int total_batch_size = 0; for(int i = 0; i < seqLen; i++) { int batchval, inputvec, batchvalout, outputvec; @@ -4606,39 +4591,19 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } else { - if(batchval > fbatches.back() || batchval < 0) + if(batchval > batches.back() || batchval < 0) { MIOPEN_THROW(miopenStatusBadParm, "Incorrect input batch size at time " + std::to_string(i) + "! 
Batch size must not ascend!"); } } - fbatches.push_back(batchval); - total_batches += dxDesc[i].GetLengths()[0]; - } - - int total_batch_size = 0; - // accumulated batches per time - std::vector fbacc_per_time(seqLen + 1); - std::vector rbacc_per_time(seqLen + 1); - - for(int i = 0; i < seqLen; i++) - { - fbacc_per_time[i] = total_batch_size; - total_batch_size += fbatches[i]; - } - - int rtotal_batch_size = total_batch_size; - std::vector rbatches; - for(int i = seqLen - 1; i >= 0; i--) - { - rtotal_batch_size -= fbatches[i]; - rbacc_per_time[seqLen - 1 - i] = rtotal_batch_size; - rbatches.push_back(fbatches[i]); + batches.push_back(batchval); + bacc_per_time[i] = total_batch_size; + total_batch_size += batchval; } - fbacc_per_time[seqLen] = total_batch_size; - rbacc_per_time[seqLen] = 0; + bacc_per_time[seqLen] = total_batch_size; if(out_vec_size != (bi * hidden_size)) { @@ -4656,7 +4621,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( input_size = 0; } - // Update time ActivationDescriptor activDesc; if(rnnMode == miopenRNNRELU) { @@ -4667,9 +4631,9 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( activDesc = {miopenActivationTANH, 1, 1, 1}; } - ReluWeightOffsets WeiBuf(input_size, hidden_size, nLayers, biasMode * 2, bi, wei_stride); + ReluWeightOffsets WeiBuf(input_size, hidden_size, nLayers, biasMode * 2, bi, nHiddenTensorsPerLayer); - ReluReserveBufferOffsets RBuff(hidden_size, nLayers, total_batches, bi); + ReluReserveBufferOffsets RBuff(hidden_size, nLayers, total_batch_size, bi, workspaceScale); auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { @@ -4678,8 +4642,8 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( }; auto propagate_output = - [&RBuff, out_vec_size, &handle, &rnn_data_type, workSpace, dy, &WeiBuf, w](int numLayers, - int layer) { + [&RBuff, out_vec_size, &handle, rnn_data_type, workSpace, dy, &WeiBuf, w](int numLayers, + int layer) { // Propagate output // if(layer == numLayers - 1) @@ -4755,36 +4719,34 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto propagate_hidden_output = [&RBuff, &handle, hidden_size, - &fbatches, - &rbatches, - &fbacc_per_time, - &rbacc_per_time, - &rnn_data_type, + &batches, + &bacc_per_time, + rnn_data_type, dhy, workSpace, + seqLen, &get_HxBuff_offset](int layer, int time, int direction) { if(dhy == nullptr) return; - auto& batches = direction == rnn_direction::Forward ? fbatches : rbatches; + const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; + const int start_time = direction == rnn_direction::Forward ? 0 : seqLen - time - 1; float alpha0 = 1; float alpha1 = 1; float beta_t = 0; - std::vector hx_stride{batches.at(0) * hidden_size, hidden_size, 1}; + std::vector hx_stride{batches.at(start_time) * hidden_size, hidden_size, 1}; std::vector reserve_stride{ static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hx_size{1, batches.at(time), hidden_size}; - std::vector reserve_size{1, batches.at(time), hidden_size}; + std::vector hx_size{1, batches.at(cur_time), hidden_size}; + std::vector reserve_size{1, batches.at(cur_time), hidden_size}; auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); auto workspace_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); - auto& bacc_per_time = direction == rnn_direction::Forward ? 
fbacc_per_time : rbacc_per_time; - OpTensor(handle, miopenTensorOpAdd, &alpha0, @@ -4797,35 +4759,37 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( workspace_desc, workSpace, get_HxBuff_offset(layer, 0, direction), - RBuff.gemm_write_offset(layer, bacc_per_time[time], direction), - RBuff.gemm_write_offset(layer, bacc_per_time[time], direction)); + RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), + RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction)); }; auto propagate_hidden_prev = [&RBuff, &handle, hidden_size, - &fbatches, - &rbatches, - &fbacc_per_time, - &rbacc_per_time, - &rnn_data_type, + &batches, + &bacc_per_time, + rnn_data_type, dhy, workSpace, &get_HxBuff_offset, WeiBuf, - w](int layer, int time, int direction) { + seqLen, + w, + max_batch](int layer, int time, int direction) { + const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; + const int next_time = direction == rnn_direction::Forward ? cur_time + 1 : cur_time - 1; if(direction == rnn_direction::Forward && dhy != nullptr && - fbatches.at(time) > fbatches.at(time + 1)) + batches.at(cur_time) > batches.at(next_time)) { - std::vector hx_stride{fbatches.at(0) * hidden_size, hidden_size, 1}; + std::vector hx_stride{max_batch * hidden_size, hidden_size, 1}; std::vector reserve_stride{ static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hx_size{1, fbatches.at(time) - fbatches.at(time + 1), hidden_size}; + std::vector hx_size{1, batches.at(cur_time) - batches.at(next_time), hidden_size}; std::vector reserve_size{ - 1, fbatches.at(time) - fbatches.at(time + 1), hidden_size}; + 1, batches.at(cur_time) - batches.at(next_time), hidden_size}; float alpha0 = 1; float alpha1 = 1; @@ -4846,20 +4810,19 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &beta_t, reserve_desc, workSpace, - get_HxBuff_offset(layer, fbatches.at(time + 1), direction), + get_HxBuff_offset(layer, batches.at(next_time), direction), RBuff.gemm_write_offset( - layer, fbacc_per_time[time] + fbatches.at(time + 1), direction), + layer, bacc_per_time[cur_time] + batches.at(next_time), direction), RBuff.gemm_write_offset( - layer, fbacc_per_time[time] + fbatches.at(time + 1), direction)); + layer, bacc_per_time[cur_time] + batches.at(next_time), direction)); } - int used_batch = direction == rnn_direction::Forward ? fbatches[time + 1] : rbatches[time]; + int used_batch = + direction == rnn_direction::Forward ? batches[next_time] : batches[cur_time]; if(used_batch <= 0) return; - auto& bacc_per_time = direction == rnn_direction::Forward ? 
fbacc_per_time : rbacc_per_time; - miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, false, false, @@ -4882,11 +4845,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( CallGemm(handle, gemm_desc, workSpace, - RBuff.gemm_write_offset(layer, bacc_per_time[time + 1], direction), + RBuff.gemm_write_offset(layer, bacc_per_time[next_time], direction), w, WeiBuf.hidden_weight_offset(layer, direction), workSpace, - RBuff.gemm_write_offset(layer, bacc_per_time[time], direction), + RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) @@ -4906,18 +4869,20 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &handle, hidden_size, seqLen, - &fbatches, - &rbatches, - &fbacc_per_time, - &rbacc_per_time, - &rnn_data_type, + &batches, + &bacc_per_time, + rnn_data_type, workSpace, reserveSpace, &activDesc, propagate_hidden_output, - propagate_hidden_prev](int layer, int time, int direction) { + propagate_hidden_prev, + max_batch](int layer, int time, int direction) { + const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; + std::vector hx_stride{ - fbatches.at(0) * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; + max_batch * hidden_size, hidden_size, 1}; + std::vector reserve_stride{ RBuff.batches_per_layer * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; @@ -4930,11 +4895,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( propagate_hidden_prev(layer, time, direction); } - auto& batches = direction == rnn_direction::Forward ? fbatches : rbatches; - - auto& bacc_per_time = direction == rnn_direction::Forward ? fbacc_per_time : rbacc_per_time; - - std::vector reserve_size{1, batches.at(time), hidden_size}; + std::vector reserve_size{1, batches.at(cur_time), hidden_size}; auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); float alpha = 1, beta = 0; @@ -4950,10 +4911,10 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &beta, reserve_desc, workSpace, - RBuff.hidden_offset(layer, bacc_per_time[time], direction), - RBuff.gemm_write_offset(layer, bacc_per_time[time], direction), - RBuff.gemm_write_offset(layer, bacc_per_time[time], direction), - RBuff.gemm_write_offset(layer, bacc_per_time[time], direction)); + RBuff.hidden_offset(layer, bacc_per_time[cur_time], direction), + RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), + RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), + RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction)); }; auto propagate_hidden = [*this, seqLen, propagate_hidden_time](int layer) { @@ -4968,22 +4929,21 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto propagate_dhx_prev = [&RBuff, &WeiBuf, - &rnn_data_type, + rnn_data_type, hidden_size, - &fbatches, - &rbatches, - &fbacc_per_time, - &rbacc_per_time, + &batches, + &bacc_per_time, &handle, w, dhx, &get_HxBuff_offset, - workSpace](int layer, int time, int direction) { - auto& batches = direction == rnn_direction::Forward ? fbatches : rbatches; + workSpace, + seqLen](int layer, int time, int direction) { + const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; + const int prev_time = direction == rnn_direction::Forward ? cur_time - 1 : cur_time + 1; - auto& bacc_per_time = direction == rnn_direction::Forward ? fbacc_per_time : rbacc_per_time; - - int batch_size = time == 0 ? 
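// batch_size is the number of rows whose hidden-state gradient leaves the
// recurrence at this step and therefore belongs in dhx: every active sequence at
// time 0, otherwise only the sequences active at cur_time but not at prev_time.
// With non-increasing batch sizes this is positive for the forward direction only
// at time 0; for the reverse direction it also picks up each shorter sequence at
// the step where that sequence ends.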
batches.at(time) : batches.at(time) - batches.at(time - 1); + int batch_size = + time == 0 ? batches.at(cur_time) : batches.at(cur_time) - batches.at(prev_time); if(batch_size <= 0) return; @@ -5006,20 +4966,20 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( rnn_data_type, false}; - int use_batch = time == 0 ? 0 : batches.at(time - 1); + int output_batch = time == 0 ? 0 : batches.at(prev_time); - int write_offset = - time == 0 ? bacc_per_time[time] : bacc_per_time[time] + batches.at(time - 1); + int input_batch = + time == 0 ? bacc_per_time[cur_time] : bacc_per_time[prev_time] - batch_size; miopenStatus_t gemm_status = CallGemm(handle, gemm_desc, workSpace, - RBuff.gemm_write_offset(layer, write_offset, direction), + RBuff.gemm_write_offset(layer, input_batch, direction), w, WeiBuf.hidden_weight_offset(layer, direction), dhx, - get_HxBuff_offset(layer, use_batch, direction), + get_HxBuff_offset(layer, output_batch, direction), GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) @@ -5096,7 +5056,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, false, false, - total_batches, + total_batch_size, input_size, RBuff.gemm_write_size(), hy_stride, @@ -5286,7 +5246,8 @@ void RNNDescriptor::RNNBackwardDataPackedTensors( x_stride[1] = in_stride; y_stride[0] = batch_n * out_stride; y_stride[1] = out_stride; - if(dhx != nullptr || (rnnMode == miopenLSTM && dcx != nullptr)) + + if(dhx != nullptr) { hx_size[2] = hy_d * hy_n * hy_h; hx_stride[0] = hx_size[2]; @@ -5305,6 +5266,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensors( profileRNNkernels(handle, 1, ctime); } } + hx_stride[0] = in_n.at(0) * uni_stride; hx_stride[1] = uni_stride; From d7958930699c1f123eb9079a9836fdc146716e68 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Mon, 18 Dec 2023 23:42:38 +0300 Subject: [PATCH 19/27] Added profiling --- src/include/miopen/rnn.hpp | 1 + src/ocl/rnnocl.cpp | 105 ++++++++++++++++++++++++++++++------- 2 files changed, 86 insertions(+), 20 deletions(-) diff --git a/src/include/miopen/rnn.hpp b/src/include/miopen/rnn.hpp index bec3bbf9bb..8a7677b08d 100644 --- a/src/include/miopen/rnn.hpp +++ b/src/include/miopen/rnn.hpp @@ -271,6 +271,7 @@ struct RNNDescriptor : miopenRNNDescriptor ConstData_t w, const TensorDescriptor& yDesc, Data_t y, + const TensorDescriptor& hyDesc, Data_t hy, Data_t reserveSpace) const; diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index be90043040..0f0b4aef9e 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -50,18 +50,23 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, ConstData_t w, const TensorDescriptor& yDesc, Data_t y, + const TensorDescriptor& hyDesc, Data_t hy, Data_t reserveSpace) const { #if MIOPEN_USE_GEMM && MIOPEN_BACKEND_HIP + float ctime = 0; + profileRNNkernels(handle, 0, ctime); int seq_len = seq_array.size(); if(seq_len == 0) return; std::vector batches; + float beta = 0; int in_vec_size = xDesc.GetLengths()[1]; int out_vec_size = yDesc.GetLengths()[1]; + int biNumLayers = hyDesc.GetLengths()[0]; int max_batch = seq_array[0]; int hidden_size; @@ -80,7 +85,30 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, bacc_per_time[seq_len] = total_batch_size; - int bi = dirMode != 0u ? 2 : 1; + int bi = dirMode != 0u ? 
2 : 1; + + if(in_vec_size <= 0 || hidden_size <= 0 || max_batch <= 0 || biNumLayers <= 0 || + out_vec_size <= 0 || seq_len <= 0) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + const auto sp_tensor_size = GetParamsSize(xDesc.GetLengths()[1]) / GetTypeSize(wDesc.GetType()); + + auto sp_desc = miopen::TensorDescriptor( + wDesc.GetType(), {1, 1, sp_tensor_size}, {sp_tensor_size, sp_tensor_size, 1}); + + SetTensor(handle, sp_desc, reserveSpace, &beta); + profileRNNkernels(handle, 1, ctime); + + if(hy != nullptr) + { + const auto hy_tensor_size = biNumLayers * max_batch * hidden_size; + auto hy_desc = miopen::TensorDescriptor( + wDesc.GetType(), {1, 1, hy_tensor_size}, {hy_tensor_size, hy_tensor_size, 1}); + SetTensor(handle, hy_desc, hy, &beta); + profileRNNkernels(handle, 1, ctime); + } auto get_HxBuff_offset = [&bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { @@ -88,7 +116,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, (size_t)hidden_size * batch_id; }; - ReluWeightOffsets WeiBuf(in_vec_size, hidden_size, nLayers, biasMode * 2, bi, nHiddenTensorsPerLayer); + ReluWeightOffsets WeiBuf( + in_vec_size, hidden_size, nLayers, biasMode * 2, bi, nHiddenTensorsPerLayer); ReluReserveBufferOffsets RBuff(hidden_size, nLayers, total_batch_size, bi, workspaceScale); ActivationDescriptor activDesc; @@ -103,8 +132,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, } auto call_relu_tan_input_gemm = - [&RBuff, &WeiBuf, &in_vec_size, &handle, &xDesc, reserveSpace, x, w](int layer, - float beta_t = 1) { + [&RBuff, &WeiBuf, &in_vec_size, &handle, &xDesc, reserveSpace, x, w, &ctime]( + int layer, float beta_t = 1) { const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size(), k = layer > 0 ? RBuff.gemm_write_size() : in_vec_size; @@ -156,6 +185,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_LOG_E("GEMM failed"); } } + profileRNNkernels(handle, 1, ctime); }; auto call_relu_tan_bias_add = [*this, @@ -169,7 +199,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &hidden_size, &batches, &bacc_per_time, - &seq_len](int layer, float beta_t = 0) { + &seq_len, + &ctime](int layer, float beta_t = 0) { float alpha0 = 1; float alpha1 = 1; @@ -199,6 +230,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, WeiBuf.bias_off(layer), // B offset RBuff.layer_offset(layer)); // C offset + profileRNNkernels(handle, 1, ctime); + if(hx != nullptr) { OpTensor(handle, @@ -215,6 +248,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, RBuff.layer_offset(layer), WeiBuf.bias_off(layer) + WeiBuf.bias_stride(), RBuff.layer_offset(layer)); + profileRNNkernels(handle, 1, ctime); return; } @@ -244,6 +278,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, RBuff.layer_offset(layer) + batches.at(0) * RBuff.gemm_write_stride(), true); + profileRNNkernels(handle, 1, ctime); + if(dirMode == 0u) return; @@ -264,6 +300,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, WeiBuf.bias_off(layer) + WeiBuf.bias_stride() + hidden_size, RBuff.layer_offset(layer) + hidden_size, true); + profileRNNkernels(handle, 1, ctime); return; } @@ -292,6 +329,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, WeiBuf.bias_off(layer) + WeiBuf.bias_stride() + hidden_size, static_cast(offset) + hidden_size, true); + profileRNNkernels(handle, 1, ctime); } }; @@ -306,7 +344,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, reserveSpace, hx, seq_len, - w](int layer, 
int time, int direction) { + w, + &ctime](int layer, int time, int direction) { if(time == 0 && hx == nullptr) return; @@ -370,6 +409,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_LOG_E("GEMM failed"); } } + profileRNNkernels(handle, 1, ctime); } const miopen::GemmDescriptor gemm_desc_hx = GemmDescriptor{false, @@ -417,6 +457,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_LOG_E("GEMM failed"); } } + profileRNNkernels(handle, 1, ctime); }; auto call_relu_tan_hidden_state_update = [&RBuff, @@ -427,7 +468,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &wDesc, reserveSpace, &activDesc, - seq_len](int layer_id, int time, int direction) { + seq_len, + &ctime](int layer_id, int time, int direction) { float alpha = 1, beta = 0; const int cur_time = direction == rnn_direction::Forward ? time : seq_len - time - 1; @@ -463,6 +505,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, RB_layer_save_points_off, // output tensor offset hidden_offset); + profileRNNkernels(handle, 1, ctime); }; auto call_relu_tan_update_output = [&RBuff, @@ -475,7 +518,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, hy, max_batch, &bacc_per_time, - seq_len](int layer_id, int time, int direction) { + seq_len, + &ctime](int layer_id, int time, int direction) { if(hy == nullptr) return; @@ -512,6 +556,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, hy, RBuff.hidden_offset(layer_id, batch_id_abs, direction), get_HxBuff_offset(layer_id, batch_id_relative, direction)); + profileRNNkernels(handle, 1, ctime); }; for(int layer_id = 0; layer_id < nLayers; layer_id++) @@ -565,6 +610,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, y, RBuff.hidden_offset(nLayers - 1, 0, rnn_direction::Forward), 0); + profileRNNkernels(handle, 1, ctime); } #else (void)handle; @@ -3080,7 +3126,7 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( inputMode != miopenRNNskip && !(miopen::IsDisabled(ENV(MIOPEN_RNNFWD_exp)))) { RNNForwardTrainingTanhRelu( - handle, in_n, xDesc[0], x, hxDesc, hx, wDesc, w, yDesc[0], y, hy, reserveSpace); + handle, in_n, xDesc[0], x, hxDesc, hx, wDesc, w, yDesc[0], y, hyDesc, hy, reserveSpace); if(is_profiling) { float eventTime_mS = RNNProfilingEnd(handle, start, stop); @@ -4519,6 +4565,10 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( { #if MIOPEN_USE_GEMM + float ctime = 0.; + // reset kernel timer + profileRNNkernels(handle, 0, ctime); + if(paddingMode != miopenRNNIONotPadded) { MIOPEN_THROW("Padded IO is not supported by this solver"); @@ -4541,7 +4591,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( int input_size = dxDesc[0].GetLengths()[1]; int hy_d = dhxDesc.GetLengths()[0]; - int max_batch = dhxDesc.GetLengths()[1]; + int max_batch = dhxDesc.GetLengths()[1]; int hidden_size = dhxDesc.GetLengths()[2]; int out_vec_size = dyDesc[0].GetLengths()[1]; int bi = dirMode != 0u ? 
2 : 1; @@ -4562,6 +4612,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( std::vector{1, 1, workSpaceDataTypeSize}, std::vector{workSpaceDataTypeSize, workSpaceDataTypeSize, 1}); SetTensor(handle, reservespace_desc, workSpace, &beta); + profileRNNkernels(handle, 1, ctime); if(dhx != nullptr) { @@ -4570,6 +4621,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( std::vector{1, 1, dhx_size}, std::vector{dhx_size, dhx_size, 1}); SetTensor(handle, hx_desc, dhx, &beta); + profileRNNkernels(handle, 1, ctime); } int total_batch_size = 0; @@ -4631,7 +4683,8 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( activDesc = {miopenActivationTANH, 1, 1, 1}; } - ReluWeightOffsets WeiBuf(input_size, hidden_size, nLayers, biasMode * 2, bi, nHiddenTensorsPerLayer); + ReluWeightOffsets WeiBuf( + input_size, hidden_size, nLayers, biasMode * 2, bi, nHiddenTensorsPerLayer); ReluReserveBufferOffsets RBuff(hidden_size, nLayers, total_batch_size, bi, workspaceScale); @@ -4642,8 +4695,8 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( }; auto propagate_output = - [&RBuff, out_vec_size, &handle, rnn_data_type, workSpace, dy, &WeiBuf, w](int numLayers, - int layer) { + [&RBuff, out_vec_size, &handle, rnn_data_type, workSpace, dy, &WeiBuf, w, &ctime]( + int numLayers, int layer) { // Propagate output // if(layer == numLayers - 1) @@ -4669,6 +4722,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( CopyTensor( handle, y_src_desc, dy, y_dst_desc, workSpace, 0, RBuff.layer_offset(layer)); + profileRNNkernels(handle, 1, ctime); } // Propagate previous layer // @@ -4713,6 +4767,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( MIOPEN_LOG_E("GEMM failed"); } } + profileRNNkernels(handle, 1, ctime); } }; @@ -4725,7 +4780,8 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( dhy, workSpace, seqLen, - &get_HxBuff_offset](int layer, int time, int direction) { + &get_HxBuff_offset, + &ctime](int layer, int time, int direction) { if(dhy == nullptr) return; @@ -4761,6 +4817,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( get_HxBuff_offset(layer, 0, direction), RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction)); + profileRNNkernels(handle, 1, ctime); }; auto propagate_hidden_prev = [&RBuff, @@ -4775,7 +4832,8 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( WeiBuf, seqLen, w, - max_batch](int layer, int time, int direction) { + max_batch, + &ctime](int layer, int time, int direction) { const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; const int next_time = direction == rnn_direction::Forward ? 
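// next_time is the neighbouring step in this direction's own scan order
// (cur_time + 1 for the forward pass, cur_time - 1 for the reverse pass); its
// hidden-state gradient was produced by an earlier loop iteration and is
// propagated through the recurrent weights into the current step by the GEMM
// further below.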
cur_time + 1 : cur_time - 1; if(direction == rnn_direction::Forward && dhy != nullptr && @@ -4815,6 +4873,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( layer, bacc_per_time[cur_time] + batches.at(next_time), direction), RBuff.gemm_write_offset( layer, bacc_per_time[cur_time] + batches.at(next_time), direction)); + profileRNNkernels(handle, 1, ctime); } int used_batch = @@ -4863,6 +4922,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( MIOPEN_LOG_E("GEMM failed"); } } + profileRNNkernels(handle, 1, ctime); }; auto propagate_hidden_time = [&RBuff, @@ -4877,12 +4937,12 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &activDesc, propagate_hidden_output, propagate_hidden_prev, - max_batch](int layer, int time, int direction) { + max_batch, + &ctime](int layer, int time, int direction) { const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; - std::vector hx_stride{ - max_batch * hidden_size, hidden_size, 1}; - + std::vector hx_stride{max_batch * hidden_size, hidden_size, 1}; + std::vector reserve_stride{ RBuff.batches_per_layer * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; @@ -4915,6 +4975,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction)); + profileRNNkernels(handle, 1, ctime); }; auto propagate_hidden = [*this, seqLen, propagate_hidden_time](int layer) { @@ -4938,7 +4999,8 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( dhx, &get_HxBuff_offset, workSpace, - seqLen](int layer, int time, int direction) { + seqLen, + &ctime](int layer, int time, int direction) { const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; const int prev_time = direction == rnn_direction::Forward ? 
cur_time - 1 : cur_time + 1; @@ -4993,6 +5055,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( MIOPEN_LOG_E("GEMM failed"); } } + profileRNNkernels(handle, 1, ctime); }; auto propagate_dhx = [*this, seqLen, &propagate_dhx_prev, &dhx](int layer) { @@ -5049,6 +5112,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( static_cast(gi) * hidden_size, 0, 0); + profileRNNkernels(handle, 1, ctime); } } else @@ -5084,6 +5148,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( MIOPEN_LOG_E("GEMM failed"); } } + profileRNNkernels(handle, 1, ctime); } #else From b93c0d3e77ca82ad2140f129ca91480e64be7624 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Tue, 19 Dec 2023 00:35:29 +0300 Subject: [PATCH 20/27] Removed ocl profiling --- src/include/miopen/rnn.hpp | 3 +- src/ocl/rnnocl.cpp | 86 +++++++++++++------------------------- 2 files changed, 31 insertions(+), 58 deletions(-) diff --git a/src/include/miopen/rnn.hpp b/src/include/miopen/rnn.hpp index 8a7677b08d..c5a29299c1 100644 --- a/src/include/miopen/rnn.hpp +++ b/src/include/miopen/rnn.hpp @@ -273,7 +273,8 @@ struct RNNDescriptor : miopenRNNDescriptor Data_t y, const TensorDescriptor& hyDesc, Data_t hy, - Data_t reserveSpace) const; + Data_t reserveSpace, + size_t reserveSpaceSize) const; void RNNForwardInference(Handle& handle, int seqLen, diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 0f0b4aef9e..5305b561bc 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -52,11 +52,10 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, Data_t y, const TensorDescriptor& hyDesc, Data_t hy, - Data_t reserveSpace) const + Data_t reserveSpace, + size_t reserveSpaceSize) const { #if MIOPEN_USE_GEMM && MIOPEN_BACKEND_HIP - float ctime = 0; - profileRNNkernels(handle, 0, ctime); int seq_len = seq_array.size(); if(seq_len == 0) return; @@ -93,13 +92,12 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_THROW(miopenStatusBadParm); } - const auto sp_tensor_size = GetParamsSize(xDesc.GetLengths()[1]) / GetTypeSize(wDesc.GetType()); + const auto sp_tensor_size = reserveSpaceSize / GetTypeSize(wDesc.GetType()); auto sp_desc = miopen::TensorDescriptor( wDesc.GetType(), {1, 1, sp_tensor_size}, {sp_tensor_size, sp_tensor_size, 1}); SetTensor(handle, sp_desc, reserveSpace, &beta); - profileRNNkernels(handle, 1, ctime); if(hy != nullptr) { @@ -107,7 +105,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, auto hy_desc = miopen::TensorDescriptor( wDesc.GetType(), {1, 1, hy_tensor_size}, {hy_tensor_size, hy_tensor_size, 1}); SetTensor(handle, hy_desc, hy, &beta); - profileRNNkernels(handle, 1, ctime); } auto get_HxBuff_offset = @@ -132,8 +129,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, } auto call_relu_tan_input_gemm = - [&RBuff, &WeiBuf, &in_vec_size, &handle, &xDesc, reserveSpace, x, w, &ctime]( - int layer, float beta_t = 1) { + [&RBuff, &WeiBuf, &in_vec_size, &handle, &xDesc, reserveSpace, x, w](int layer, + float beta_t = 1) { const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size(), k = layer > 0 ? 
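// Layer 0 consumes the packed network input x, so k is the input vector size;
// every deeper layer consumes the previous layer's activated output, read back
// from the ht table of the reserve buffer, which is why k switches to
// RBuff.gemm_write_size() and input_ptr switches to reserveSpace.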
RBuff.gemm_write_size() : in_vec_size; @@ -185,7 +182,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_LOG_E("GEMM failed"); } } - profileRNNkernels(handle, 1, ctime); }; auto call_relu_tan_bias_add = [*this, @@ -199,8 +195,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &hidden_size, &batches, &bacc_per_time, - &seq_len, - &ctime](int layer, float beta_t = 0) { + &seq_len](int layer, float beta_t = 0) { float alpha0 = 1; float alpha1 = 1; @@ -230,8 +225,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, WeiBuf.bias_off(layer), // B offset RBuff.layer_offset(layer)); // C offset - profileRNNkernels(handle, 1, ctime); - if(hx != nullptr) { OpTensor(handle, @@ -248,7 +241,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, RBuff.layer_offset(layer), WeiBuf.bias_off(layer) + WeiBuf.bias_stride(), RBuff.layer_offset(layer)); - profileRNNkernels(handle, 1, ctime); return; } @@ -278,8 +270,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, RBuff.layer_offset(layer) + batches.at(0) * RBuff.gemm_write_stride(), true); - profileRNNkernels(handle, 1, ctime); - if(dirMode == 0u) return; @@ -300,7 +290,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, WeiBuf.bias_off(layer) + WeiBuf.bias_stride() + hidden_size, RBuff.layer_offset(layer) + hidden_size, true); - profileRNNkernels(handle, 1, ctime); return; } @@ -329,7 +318,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, WeiBuf.bias_off(layer) + WeiBuf.bias_stride() + hidden_size, static_cast(offset) + hidden_size, true); - profileRNNkernels(handle, 1, ctime); } }; @@ -344,8 +332,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, reserveSpace, hx, seq_len, - w, - &ctime](int layer, int time, int direction) { + w](int layer, int time, int direction) { if(time == 0 && hx == nullptr) return; @@ -409,7 +396,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_LOG_E("GEMM failed"); } } - profileRNNkernels(handle, 1, ctime); } const miopen::GemmDescriptor gemm_desc_hx = GemmDescriptor{false, @@ -457,7 +443,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_LOG_E("GEMM failed"); } } - profileRNNkernels(handle, 1, ctime); }; auto call_relu_tan_hidden_state_update = [&RBuff, @@ -468,8 +453,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &wDesc, reserveSpace, &activDesc, - seq_len, - &ctime](int layer_id, int time, int direction) { + seq_len](int layer_id, int time, int direction) { float alpha = 1, beta = 0; const int cur_time = direction == rnn_direction::Forward ? 
time : seq_len - time - 1; @@ -505,7 +489,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, RB_layer_save_points_off, // output tensor offset hidden_offset); - profileRNNkernels(handle, 1, ctime); }; auto call_relu_tan_update_output = [&RBuff, @@ -518,8 +501,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, hy, max_batch, &bacc_per_time, - seq_len, - &ctime](int layer_id, int time, int direction) { + seq_len](int layer_id, int time, int direction) { if(hy == nullptr) return; @@ -556,7 +538,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, hy, RBuff.hidden_offset(layer_id, batch_id_abs, direction), get_HxBuff_offset(layer_id, batch_id_relative, direction)); - profileRNNkernels(handle, 1, ctime); }; for(int layer_id = 0; layer_id < nLayers; layer_id++) @@ -610,7 +591,6 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, y, RBuff.hidden_offset(nLayers - 1, 0, rnn_direction::Forward), 0); - profileRNNkernels(handle, 1, ctime); } #else (void)handle; @@ -3125,8 +3105,20 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && nLayers > 1 && inputMode != miopenRNNskip && !(miopen::IsDisabled(ENV(MIOPEN_RNNFWD_exp)))) { - RNNForwardTrainingTanhRelu( - handle, in_n, xDesc[0], x, hxDesc, hx, wDesc, w, yDesc[0], y, hyDesc, hy, reserveSpace); + RNNForwardTrainingTanhRelu(handle, + in_n, + xDesc[0], + x, + hxDesc, + hx, + wDesc, + w, + yDesc[0], + y, + hyDesc, + hy, + reserveSpace, + reserveSpaceSize); if(is_profiling) { float eventTime_mS = RNNProfilingEnd(handle, start, stop); @@ -4564,11 +4556,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( size_t reserveSpaceSize) const { #if MIOPEN_USE_GEMM - - float ctime = 0.; - // reset kernel timer - profileRNNkernels(handle, 0, ctime); - if(paddingMode != miopenRNNIONotPadded) { MIOPEN_THROW("Padded IO is not supported by this solver"); @@ -4612,7 +4599,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( std::vector{1, 1, workSpaceDataTypeSize}, std::vector{workSpaceDataTypeSize, workSpaceDataTypeSize, 1}); SetTensor(handle, reservespace_desc, workSpace, &beta); - profileRNNkernels(handle, 1, ctime); if(dhx != nullptr) { @@ -4621,7 +4607,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( std::vector{1, 1, dhx_size}, std::vector{dhx_size, dhx_size, 1}); SetTensor(handle, hx_desc, dhx, &beta); - profileRNNkernels(handle, 1, ctime); } int total_batch_size = 0; @@ -4695,8 +4680,8 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( }; auto propagate_output = - [&RBuff, out_vec_size, &handle, rnn_data_type, workSpace, dy, &WeiBuf, w, &ctime]( - int numLayers, int layer) { + [&RBuff, out_vec_size, &handle, rnn_data_type, workSpace, dy, &WeiBuf, w](int numLayers, + int layer) { // Propagate output // if(layer == numLayers - 1) @@ -4722,7 +4707,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( CopyTensor( handle, y_src_desc, dy, y_dst_desc, workSpace, 0, RBuff.layer_offset(layer)); - profileRNNkernels(handle, 1, ctime); } // Propagate previous layer // @@ -4767,7 +4751,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( MIOPEN_LOG_E("GEMM failed"); } } - profileRNNkernels(handle, 1, ctime); } }; @@ -4780,8 +4763,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( dhy, workSpace, seqLen, - &get_HxBuff_offset, - &ctime](int layer, int time, int direction) { + &get_HxBuff_offset](int layer, int time, int direction) { if(dhy == nullptr) return; @@ -4817,7 +4799,6 @@ void 
RNNDescriptor::RNNBackwardDataPackedTensorsRelu( get_HxBuff_offset(layer, 0, direction), RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction)); - profileRNNkernels(handle, 1, ctime); }; auto propagate_hidden_prev = [&RBuff, @@ -4832,8 +4813,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( WeiBuf, seqLen, w, - max_batch, - &ctime](int layer, int time, int direction) { + max_batch](int layer, int time, int direction) { const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; const int next_time = direction == rnn_direction::Forward ? cur_time + 1 : cur_time - 1; if(direction == rnn_direction::Forward && dhy != nullptr && @@ -4873,7 +4853,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( layer, bacc_per_time[cur_time] + batches.at(next_time), direction), RBuff.gemm_write_offset( layer, bacc_per_time[cur_time] + batches.at(next_time), direction)); - profileRNNkernels(handle, 1, ctime); } int used_batch = @@ -4922,7 +4901,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( MIOPEN_LOG_E("GEMM failed"); } } - profileRNNkernels(handle, 1, ctime); }; auto propagate_hidden_time = [&RBuff, @@ -4937,8 +4915,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &activDesc, propagate_hidden_output, propagate_hidden_prev, - max_batch, - &ctime](int layer, int time, int direction) { + max_batch](int layer, int time, int direction) { const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; std::vector hx_stride{max_batch * hidden_size, hidden_size, 1}; @@ -4975,7 +4952,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction)); - profileRNNkernels(handle, 1, ctime); }; auto propagate_hidden = [*this, seqLen, propagate_hidden_time](int layer) { @@ -4999,8 +4975,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( dhx, &get_HxBuff_offset, workSpace, - seqLen, - &ctime](int layer, int time, int direction) { + seqLen](int layer, int time, int direction) { const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; const int prev_time = direction == rnn_direction::Forward ? cur_time - 1 : cur_time + 1; @@ -5055,7 +5030,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( MIOPEN_LOG_E("GEMM failed"); } } - profileRNNkernels(handle, 1, ctime); }; auto propagate_dhx = [*this, seqLen, &propagate_dhx_prev, &dhx](int layer) { @@ -5112,7 +5086,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( static_cast(gi) * hidden_size, 0, 0); - profileRNNkernels(handle, 1, ctime); } } else @@ -5148,7 +5121,6 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( MIOPEN_LOG_E("GEMM failed"); } } - profileRNNkernels(handle, 1, ctime); } #else From 87329c60c0387a37373574e5bff0ed54678c6674 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Tue, 19 Dec 2023 14:11:59 +0300 Subject: [PATCH 21/27] review fixes --- src/ocl/rnnocl.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 5305b561bc..03e09e3f7f 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -86,8 +86,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, int bi = dirMode != 0u ? 
2 : 1; - if(in_vec_size <= 0 || hidden_size <= 0 || max_batch <= 0 || biNumLayers <= 0 || - out_vec_size <= 0 || seq_len <= 0) + if (in_vec_size <= 0 || hidden_size <= 0 || max_batch <= 0 || biNumLayers <= 0 || + out_vec_size <= 0 || seq_len == 0) { MIOPEN_THROW(miopenStatusBadParm); } @@ -184,7 +184,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, } }; - auto call_relu_tan_bias_add = [*this, + auto call_relu_tan_bias_add = [this, &RBuff, &WeiBuf, &handle, @@ -244,6 +244,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, return; } + if((RBuff.batches_per_layer - batches.at(0)) <= 0) + return; + auto reserve_desc = miopen::TensorDescriptor( wDesc.GetType(), std::vector{1, RBuff.batches_per_layer - batches.at(0), hidden_size}, @@ -3102,7 +3105,7 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( return; } - if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && nLayers > 1 && + if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && inputMode != miopenRNNskip && !(miopen::IsDisabled(ENV(MIOPEN_RNNFWD_exp)))) { RNNForwardTrainingTanhRelu(handle, @@ -4913,8 +4916,8 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( workSpace, reserveSpace, &activDesc, - propagate_hidden_output, - propagate_hidden_prev, + &propagate_hidden_output, + &propagate_hidden_prev, max_batch](int layer, int time, int direction) { const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; @@ -4954,7 +4957,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction)); }; - auto propagate_hidden = [*this, seqLen, propagate_hidden_time](int layer) { + auto propagate_hidden = [this, seqLen, propagate_hidden_time](int layer) { for(int time = seqLen - 1; time >= 0; time--) { propagate_hidden_time(layer, time, rnn_direction::Forward); @@ -5032,7 +5035,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } }; - auto propagate_dhx = [*this, seqLen, &propagate_dhx_prev, &dhx](int layer) { + auto propagate_dhx = [this, seqLen, &propagate_dhx_prev, dhx](int layer) { if(dhx == nullptr) return; From 7ae3116a6229e25eecc18b30b897f8ff4a730faf Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Wed, 20 Dec 2023 23:30:00 +0300 Subject: [PATCH 22/27] review fixes --- src/ocl/rnnocl.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 03e09e3f7f..e46aa90c08 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -108,7 +108,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, } auto get_HxBuff_offset = - [&bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { + [bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { return (static_cast(hidden_size) * (max_batch)) * (bi * layer_id + reverse) + (size_t)hidden_size * batch_id; }; @@ -4597,11 +4597,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( float beta = 0; auto workSpaceDataTypeSize = workSpaceSize / GetTypeSize(rnn_data_type); - auto reservespace_desc = + auto workSpace_desc = miopen::TensorDescriptor(rnn_data_type, std::vector{1, 1, workSpaceDataTypeSize}, std::vector{workSpaceDataTypeSize, workSpaceDataTypeSize, 1}); - SetTensor(handle, reservespace_desc, workSpace, &beta); + SetTensor(handle, workSpace_desc, workSpace, &beta); if(dhx != nullptr) { @@ -4677,7 +4677,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( ReluReserveBufferOffsets RBuff(hidden_size, 
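// workspaceScale is a per-cell widening factor for the save-point rows;
// for the plain ReLU/tanh cells handled by this solver it is presumably 1,
// leaving each row bi * hidden_size elements wide.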
nLayers, total_batch_size, bi, workspaceScale); auto get_HxBuff_offset = - [&bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { + [bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { return (static_cast(hidden_size) * (max_batch)) * (bi * layer_id + reverse) + static_cast(hidden_size) * batch_id; }; @@ -5287,7 +5287,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensors( y_stride[0] = batch_n * out_stride; y_stride[1] = out_stride; - if(dhx != nullptr) + if(dhx != nullptr || (rnnMode == miopenLSTM && dcx != nullptr)) { hx_size[2] = hy_d * hy_n * hy_h; hx_stride[0] = hx_size[2]; From 15a891fc60b7a69f8e180f2d9e3713e11a45923f Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Thu, 21 Dec 2023 23:04:37 +0300 Subject: [PATCH 23/27] minor fix --- src/ocl/rnnocl.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index e46aa90c08..023c44ba24 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -195,7 +195,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &hidden_size, &batches, &bacc_per_time, - &seq_len](int layer, float beta_t = 0) { + &seq_len, + max_batch](int layer, float beta_t = 0) { float alpha0 = 1; float alpha1 = 1; @@ -244,12 +245,12 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, return; } - if((RBuff.batches_per_layer - batches.at(0)) <= 0) + if((RBuff.batches_per_layer - max_batch) <= 0) return; auto reserve_desc = miopen::TensorDescriptor( wDesc.GetType(), - std::vector{1, RBuff.batches_per_layer - batches.at(0), hidden_size}, + std::vector{1, RBuff.batches_per_layer - max_batch, hidden_size}, std::vector{RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); bias_desc = miopen::TensorDescriptor( @@ -268,15 +269,15 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &beta_t, reserve_desc, reserveSpace, - RBuff.layer_offset(layer) + batches.at(0) * RBuff.gemm_write_stride(), + RBuff.layer_offset(layer) + max_batch * RBuff.gemm_write_stride(), WeiBuf.bias_off(layer) + WeiBuf.bias_stride(), - RBuff.layer_offset(layer) + batches.at(0) * RBuff.gemm_write_stride(), + RBuff.layer_offset(layer) + max_batch * RBuff.gemm_write_stride(), true); if(dirMode == 0u) return; - if(batches.at(0) == batches.at(seq_len - 1)) + if(max_batch == batches.at(seq_len - 1)) { OpTensor(handle, miopenTensorOpAdd, From ac66ac0fde8a05c2b9284fff1f335b1b1257fbfc Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Thu, 11 Jan 2024 17:41:40 +0300 Subject: [PATCH 24/27] Review fixes. Made RnnDirection enum class. Removed RnnOffsets class. 
Added RnnBatches class --- src/include/miopen/rnn_util.hpp | 92 ++++++---- src/ocl/rnnocl.cpp | 305 ++++++++++++++++---------------- 2 files changed, 202 insertions(+), 195 deletions(-) diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index 42df311ae1..ea2b7e065a 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -35,12 +35,55 @@ namespace miopen { -enum rnn_direction +enum class RnnDirection { Forward = 0, Backward = 1 }; +struct RnnBatches +{ + + int at(int time, RnnDirection direction) const { return batches.at(cur_time(time, direction)); } + + int next(int time, RnnDirection direction) const + { + return batches.at(next_time(time, direction)); + } + + int prev(int time, RnnDirection direction) const + { + return batches.at(prev_time(time, direction)); + } + + void push_back(int batch) { batches.push_back(batch); } + + RnnBatches(std::vector& input) : batches(input){}; + RnnBatches(){}; + + int back() const { return batches.back(); } + +private: + int cur_time(int time, RnnDirection direction) const + { + return direction == RnnDirection::Forward ? time : batches.size() - time - 1; + } + + int next_time(int time, RnnDirection direction) const + { + return direction == RnnDirection::Forward ? cur_time(time, direction) + 1 + : cur_time(time, direction) - 1; + } + + int prev_time(int time, RnnDirection direction) const + { + return direction == RnnDirection::Forward ? cur_time(time, direction) - 1 + : cur_time(time, direction) + 1; + } + + std::vector batches; +}; + #if MIOPEN_BACKEND_HIP inline void RNNProfilingBegin(const miopen::Handle& handle, miopen::HipEventPtr& start, @@ -127,20 +170,7 @@ void LSTMBackwardHiddenStateUpdate(const Handle& handle, std::size_t dhidden_offset, std::size_t f_offset_pre); -struct RNNWeightOffsets -{ - -public: - int input_offset(int layer) const; - int hidden_offset(int layer) const; - int bias_off(); - int bias_off(int layer) const; - -private: - int first_layer_offset() const; -}; - -struct ReluWeightOffsets : public RNNWeightOffsets +struct ReluWeightOffsets { public: ReluWeightOffsets(int input_vector_sz, @@ -165,12 +195,12 @@ struct ReluWeightOffsets : public RNNWeightOffsets (h_vec_sz + h_vec_sz * bi_scale) * weight_stride * (layer - 1); } - int hidden_weight_offset(int layer, int reverse = 0) const + int hidden_weight_offset(int layer, RnnDirection reverse) const { return layer == 0 ? 
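// Layout note: for layer 0 the h_vec_sz x h_vec_sz recurrent block sits right
// after the in_vec_sz x weight_stride input block; for deeper layers it sits
// after a (bi_scale * h_vec_sz) x weight_stride input block, and the
// reverse-direction block is one extra h_vec_sz * h_vec_sz further on.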
input_weight_offset(layer) + in_vec_sz * weight_stride + - reverse * h_vec_sz * h_vec_sz + static_cast(reverse) * h_vec_sz * h_vec_sz : input_weight_offset(layer) + bi_scale * h_vec_sz * weight_stride + - reverse * h_vec_sz * h_vec_sz; + static_cast(reverse) * h_vec_sz * h_vec_sz; } size_t bias_stride() const { return static_cast(h_vec_sz) * bi_scale; } @@ -195,22 +225,7 @@ struct ReluWeightOffsets : public RNNWeightOffsets int first_layer_offset() const { return (in_vec_sz + h_vec_sz) * weight_stride; } }; -struct RNNOffsets -{ - size_t layer_offset(int layer_id) const; - - size_t layer_stride() const; - - int gemm_write_size() const; - - size_t gemm_write_stride() const; - - size_t gemm_write_offset(int layer_id, int batch_id = 0, int reverse = 0) const; - - size_t hidden_offset(int layer_id, int batch_id = 0, int reverse = 0) const; -}; - -struct ReluReserveBufferOffsets : public RNNOffsets +struct ReluReserveBufferOffsets { struct RBuffHelper { @@ -231,7 +246,8 @@ struct ReluReserveBufferOffsets : public RNNOffsets } public: - ReluReserveBufferOffsets(int hidden_vec_size, int layers_cnt, int batches_per_l, int bi_scale, int workspace_scale) + ReluReserveBufferOffsets( + int hidden_vec_size, int layers_cnt, int batches_per_l, int bi_scale, int workspace_scale) : hidden_size(hidden_vec_size), batches_per_layer(batches_per_l), save_point_size(hidden_vec_size * bi_scale * workspace_scale), @@ -251,13 +267,13 @@ struct ReluReserveBufferOffsets : public RNNOffsets size_t gemm_write_stride() const { return strides.batch; } - size_t gemm_write_offset(int layer_id, int batch_id, int reverse) const + size_t gemm_write_offset(int layer_id, int batch_id, RnnDirection reverse) const { return layer_offset(layer_id) + static_cast(gemm_write_stride()) * batch_id + - (size_t)reverse * hidden_size; + static_cast(reverse) * hidden_size; } - size_t hidden_offset(int layer_id, int batch_id, int reverse) const + size_t hidden_offset(int layer_id, int batch_id, RnnDirection reverse) const { return strides.table + gemm_write_offset(layer_id, batch_id, reverse); } diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 023c44ba24..747c3bc7f6 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -60,7 +60,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, if(seq_len == 0) return; - std::vector batches; + RnnBatches batches(seq_array); + RnnBatches bacc_per_time; float beta = 0; int in_vec_size = xDesc.GetLengths()[1]; @@ -72,21 +73,16 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, std::tie(std::ignore, max_batch, hidden_size) = miopen::tien<3>(hxDesc.GetLengths()); int total_batch_size = 0; - // accumulated batches per time - std::vector bacc_per_time(seq_len + 1); for(int i = 0; i < seq_len; i++) { - bacc_per_time[i] = total_batch_size; + bacc_per_time.push_back(total_batch_size); total_batch_size += seq_array[i]; - batches.push_back(seq_array[i]); } - bacc_per_time[seq_len] = total_batch_size; - int bi = dirMode != 0u ? 
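// Usage sketch with a hypothetical batches = {3, 3, 2, 1} (seq_len = 4):
//   batches.at(1, RnnDirection::Forward)    == 3  (real time step 1)
//   batches.at(1, RnnDirection::Backward)   == 2  (real time step 4 - 1 - 1 = 2)
//   batches.prev(1, RnnDirection::Backward) == 1  (time step 3, the step handled
//   just before in the reverse scan)
// so callers no longer index two mirrored forward/reverse arrays by hand.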
2 : 1; - if (in_vec_size <= 0 || hidden_size <= 0 || max_batch <= 0 || biNumLayers <= 0 || + if(in_vec_size <= 0 || hidden_size <= 0 || max_batch <= 0 || biNumLayers <= 0 || out_vec_size <= 0 || seq_len == 0) { MIOPEN_THROW(miopenStatusBadParm); @@ -108,13 +104,15 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, } auto get_HxBuff_offset = - [bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { - return (static_cast(hidden_size) * (max_batch)) * (bi * layer_id + reverse) + + [bi, hidden_size, max_batch](int layer_id, int batch_id, RnnDirection reverse) { + return (static_cast(hidden_size) * (max_batch)) * + (bi * layer_id + static_cast(reverse)) + (size_t)hidden_size * batch_id; }; ReluWeightOffsets WeiBuf( in_vec_size, hidden_size, nLayers, biasMode * 2, bi, nHiddenTensorsPerLayer); + ReluReserveBufferOffsets RBuff(hidden_size, nLayers, total_batch_size, bi, workspaceScale); ActivationDescriptor activDesc; @@ -158,7 +156,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const auto input_weight_offset = WeiBuf.input_weight_offset(layer); const auto output_offset = RBuff.layer_offset(layer); - const auto input_offset = layer > 0 ? RBuff.hidden_offset(layer - 1, 0, 0) : 0; + const auto input_offset = + layer > 0 ? RBuff.hidden_offset(layer - 1, 0, RnnDirection::Forward) : 0; const auto input_ptr = layer > 0 ? reserveSpace : x; @@ -193,10 +192,10 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, w, &hx, &hidden_size, - &batches, - &bacc_per_time, &seq_len, - max_batch](int layer, float beta_t = 0) { + max_batch, + &batches, + &bacc_per_time](int layer, float beta_t = 0) { float alpha0 = 1; float alpha1 = 1; @@ -277,7 +276,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, if(dirMode == 0u) return; - if(max_batch == batches.at(seq_len - 1)) + if(max_batch == batches.at(seq_len - 1, RnnDirection::Forward)) { OpTensor(handle, miopenTensorOpAdd, @@ -300,11 +299,12 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, for(int ti = 0; ti < seq_len - 1; ti++) { - auto offset = RBuff.layer_offset(layer) + bacc_per_time[ti] * RBuff.gemm_write_stride(); + auto offset = RBuff.layer_offset(layer) + + bacc_per_time.at(ti, RnnDirection::Forward) * RBuff.gemm_write_stride(); reserve_desc = miopen::TensorDescriptor( wDesc.GetType(), - std::vector{1, batches.at(ti + 1), hidden_size}, + std::vector{1, batches.at(ti + 1, RnnDirection::Forward), hidden_size}, std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); OpTensor(handle, @@ -329,23 +329,19 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &WeiBuf, hidden_size, &get_HxBuff_offset, - &bacc_per_time, - &batches, &handle, &xDesc, reserveSpace, hx, - seq_len, - w](int layer, int time, int direction) { + w, + &batches, + &bacc_per_time](int layer, int time, RnnDirection direction) { if(time == 0 && hx == nullptr) return; - const int cur_time = direction == rnn_direction::Forward ? time : seq_len - 1 - time; - const int prev_time = direction == rnn_direction::Forward ? cur_time - 1 : cur_time + 1; - - const int m = direction == rnn_direction::Forward ? batches.at(cur_time) - : time == 0 ? batches.at(cur_time) - : batches.at(prev_time); + const auto m = direction == RnnDirection::Forward ? batches.at(time, direction) + : time == 0 ? 
batches.at(time, direction) + : batches.prev(time, direction); const int n = hidden_size, k = hidden_size; @@ -355,14 +351,14 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const auto ht_ptr = time > 0 ? reserveSpace : hx; - if(time != 0 && direction == rnn_direction::Backward && hx != nullptr && - batches.at(cur_time) > batches.at(prev_time)) + if(time != 0 && direction == RnnDirection::Backward && hx != nullptr && + batches.at(time, direction) > batches.prev(time, direction)) { miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, false, true, - batches.at(cur_time) - batches.at(prev_time), + batches.at(time, direction) - batches.prev(time, direction), n, k, hidden_size, @@ -381,12 +377,14 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, CallGemm(handle, gemm_desc, hx, - get_HxBuff_offset(layer, batches.at(prev_time), direction), + get_HxBuff_offset(layer, batches.prev(time, direction), direction), w, WeiBuf.hidden_weight_offset(layer, direction), reserveSpace, - RBuff.gemm_write_offset( - layer, bacc_per_time[cur_time] + batches.at(prev_time), direction), + RBuff.gemm_write_offset(layer, + bacc_per_time.at(time, direction) + + batches.prev(time, direction), + direction), GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) @@ -421,11 +419,12 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, false}; const auto hidden_offset = - (time == 0) ? get_HxBuff_offset(layer, 0, direction) - : RBuff.hidden_offset(layer, bacc_per_time[prev_time], direction); + (time == 0) + ? get_HxBuff_offset(layer, 0, direction) + : RBuff.hidden_offset(layer, bacc_per_time.prev(time, direction), direction); const auto save_point_offset = - RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction); + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction); const miopenStatus_t gemm_status = CallGemm(handle, gemm_desc_hx, @@ -449,71 +448,62 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, } }; - auto call_relu_tan_hidden_state_update = [&RBuff, - hidden_size, - &bacc_per_time, - &batches, - &handle, - &wDesc, - reserveSpace, - &activDesc, - seq_len](int layer_id, int time, int direction) { - float alpha = 1, beta = 0; + auto call_relu_tan_hidden_state_update = + [&RBuff, hidden_size, &handle, &wDesc, reserveSpace, &activDesc, &batches, &bacc_per_time]( + int layer_id, int time, RnnDirection direction) { + float alpha = 1, beta = 0; - const int cur_time = direction == rnn_direction::Forward ? 
time : seq_len - time - 1; + const std::vector tensor_size{1, + static_cast(batches.at(time, direction)), + static_cast(hidden_size)}; - const std::vector tensor_size{ - 1, static_cast(batches[cur_time]), static_cast(hidden_size)}; + const std::vector tensor_stride{static_cast(RBuff.layer_stride()), + static_cast(RBuff.gemm_write_stride()), + 1}; - const std::vector tensor_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; + auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + const auto RB_layer_save_points_off = + RBuff.gemm_write_offset(layer_id, bacc_per_time.at(time, direction), direction); - const auto RB_layer_save_points_off = - RBuff.gemm_write_offset(layer_id, bacc_per_time[cur_time], direction); + const auto hidden_offset = + RBuff.hidden_offset(layer_id, bacc_per_time.at(time, direction), direction); - const auto hidden_offset = - RBuff.hidden_offset(layer_id, bacc_per_time[cur_time], direction); - - activDesc.Forward(handle, - &alpha, - // input tensor descriptor - src_desc, - // input pointer - reserveSpace, - &beta, - // output tensor descriptor - dst_desc, - // output pointer - reserveSpace, - // input tensor offset - RB_layer_save_points_off, - // output tensor offset - hidden_offset); - }; + activDesc.Forward(handle, + &alpha, + // input tensor descriptor + src_desc, + // input pointer + reserveSpace, + &beta, + // output tensor descriptor + dst_desc, + // output pointer + reserveSpace, + // input tensor offset + RB_layer_save_points_off, + // output tensor offset + hidden_offset); + }; auto call_relu_tan_update_output = [&RBuff, &get_HxBuff_offset, hidden_size, - &batches, &handle, &wDesc, reserveSpace, hy, max_batch, &bacc_per_time, - seq_len](int layer_id, int time, int direction) { + seq_len, + &batches](int layer_id, int time, RnnDirection direction) { if(hy == nullptr) return; - auto cur_time = direction == rnn_direction::Forward ? time : seq_len - 1 - time; - auto next_time = direction == rnn_direction::Forward ? cur_time + 1 : cur_time - 1; - - auto copy_batches = time == seq_len - 1 ? batches.at(cur_time) - : batches.at(cur_time) - batches.at(next_time); + auto copy_batches = time == seq_len - 1 + ? batches.at(time, direction) + : batches.at(time, direction) - batches.next(time, direction); if(copy_batches <= 0) return; @@ -524,10 +514,11 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const std::vector hcy_dst_stride{ static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; - auto batch_id_relative = batches.at(cur_time) - copy_batches; + auto batch_id_relative = batches.at(time, direction) - copy_batches; - auto batch_id_abs = time == seq_len - 1 ? bacc_per_time[cur_time] - : bacc_per_time[cur_time] + batches.at(next_time); + auto batch_id_abs = time == seq_len - 1 + ? 
bacc_per_time.at(time, direction) + : bacc_per_time.at(time, direction) + batches.next(time, direction); const std::vector hcy_copy_size{ 1, static_cast(copy_batches), static_cast(hidden_size)}; @@ -552,24 +543,26 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, for(int time = 0; time < seq_len; time++) { - call_relu_tan_hidden_gemm(layer_id, time, rnn_direction::Forward); + call_relu_tan_hidden_gemm(layer_id, time, RnnDirection::Forward); - call_relu_tan_hidden_state_update(layer_id, time, rnn_direction::Forward); + call_relu_tan_hidden_state_update(layer_id, time, RnnDirection::Forward); if(dirMode == 0u) continue; - call_relu_tan_hidden_gemm(layer_id, time, rnn_direction::Backward); + call_relu_tan_hidden_gemm(layer_id, time, RnnDirection::Backward); - call_relu_tan_hidden_state_update(layer_id, time, rnn_direction::Backward); + call_relu_tan_hidden_state_update(layer_id, time, RnnDirection::Backward); } for(int time = seq_len - 1; time >= 0; time--) { - call_relu_tan_update_output(layer_id, time, rnn_direction::Forward); + call_relu_tan_update_output(layer_id, time, RnnDirection::Forward); + if(dirMode == 0u) continue; - call_relu_tan_update_output(layer_id, time, rnn_direction::Backward); + + call_relu_tan_update_output(layer_id, time, RnnDirection::Backward); } } @@ -593,7 +586,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, reserveSpace, y_dst_desc, y, - RBuff.hidden_offset(nLayers - 1, 0, rnn_direction::Forward), + RBuff.hidden_offset(nLayers - 1, 0, RnnDirection::Forward), 0); } #else @@ -4577,8 +4570,8 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( auto rnn_data_type = dhxDesc.GetType(); - std::vector batches; - std::vector bacc_per_time(seqLen + 1); + RnnBatches batches; + RnnBatches bacc_per_time; int input_size = dxDesc[0].GetLengths()[1]; int hy_d = dhxDesc.GetLengths()[0]; @@ -4639,13 +4632,12 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( "! Batch size must not ascend!"); } } + batches.push_back(batchval); - bacc_per_time[i] = total_batch_size; + bacc_per_time.push_back(total_batch_size); total_batch_size += batchval; } - bacc_per_time[seqLen] = total_batch_size; - if(out_vec_size != (bi * hidden_size)) { MIOPEN_THROW(miopenStatusBadParm, "Output size doesn't match hidden state size!"); @@ -4678,8 +4670,9 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( ReluReserveBufferOffsets RBuff(hidden_size, nLayers, total_batch_size, bi, workspaceScale); auto get_HxBuff_offset = - [bi, hidden_size, max_batch](int layer_id, int batch_id, int reverse) { - return (static_cast(hidden_size) * (max_batch)) * (bi * layer_id + reverse) + + [bi, hidden_size, max_batch](int layer_id, int batch_id, RnnDirection reverse) { + return (static_cast(hidden_size) * (max_batch)) * + (bi * layer_id + static_cast(reverse)) + static_cast(hidden_size) * batch_id; }; @@ -4767,23 +4760,24 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( dhy, workSpace, seqLen, - &get_HxBuff_offset](int layer, int time, int direction) { + &get_HxBuff_offset]( + int layer, int time, RnnDirection direction) { if(dhy == nullptr) return; - const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; - const int start_time = direction == rnn_direction::Forward ? 0 : seqLen - time - 1; + const int start_time = direction == RnnDirection::Forward ? 
0 : seqLen - time - 1; float alpha0 = 1; float alpha1 = 1; float beta_t = 0; - std::vector hx_stride{batches.at(start_time) * hidden_size, hidden_size, 1}; + std::vector hx_stride{ + batches.at(start_time, RnnDirection::Forward) * hidden_size, hidden_size, 1}; std::vector reserve_stride{ static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hx_size{1, batches.at(cur_time), hidden_size}; - std::vector reserve_size{1, batches.at(cur_time), hidden_size}; + std::vector hx_size{1, batches.at(time, direction), hidden_size}; + std::vector reserve_size{1, batches.at(time, direction), hidden_size}; auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); @@ -4801,8 +4795,8 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( workspace_desc, workSpace, get_HxBuff_offset(layer, 0, direction), - RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), - RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction)); + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction)); }; auto propagate_hidden_prev = [&RBuff, @@ -4815,23 +4809,21 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( workSpace, &get_HxBuff_offset, WeiBuf, - seqLen, w, - max_batch](int layer, int time, int direction) { - const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; - const int next_time = direction == rnn_direction::Forward ? cur_time + 1 : cur_time - 1; - if(direction == rnn_direction::Forward && dhy != nullptr && - batches.at(cur_time) > batches.at(next_time)) + max_batch](int layer, int time, RnnDirection direction) { + if(direction == RnnDirection::Forward && dhy != nullptr && + batches.at(time, direction) > batches.next(time, direction)) { std::vector hx_stride{max_batch * hidden_size, hidden_size, 1}; std::vector reserve_stride{ static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hx_size{1, batches.at(cur_time) - batches.at(next_time), hidden_size}; + std::vector hx_size{ + 1, batches.at(time, direction) - batches.next(time, direction), hidden_size}; std::vector reserve_size{ - 1, batches.at(cur_time) - batches.at(next_time), hidden_size}; + 1, batches.at(time, direction) - batches.next(time, direction), hidden_size}; float alpha0 = 1; float alpha1 = 1; @@ -4852,15 +4844,19 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &beta_t, reserve_desc, workSpace, - get_HxBuff_offset(layer, batches.at(next_time), direction), - RBuff.gemm_write_offset( - layer, bacc_per_time[cur_time] + batches.at(next_time), direction), - RBuff.gemm_write_offset( - layer, bacc_per_time[cur_time] + batches.at(next_time), direction)); + get_HxBuff_offset(layer, batches.next(time, direction), direction), + RBuff.gemm_write_offset(layer, + bacc_per_time.at(time, direction) + + batches.next(time, direction), + direction), + RBuff.gemm_write_offset(layer, + bacc_per_time.at(time, direction) + + batches.next(time, direction), + direction)); } - int used_batch = - direction == rnn_direction::Forward ? batches[next_time] : batches[cur_time]; + int used_batch = direction == RnnDirection::Forward ? 
batches.next(time, direction) + : batches.at(time, direction); if(used_batch <= 0) return; @@ -4887,11 +4883,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( CallGemm(handle, gemm_desc, workSpace, - RBuff.gemm_write_offset(layer, bacc_per_time[next_time], direction), + RBuff.gemm_write_offset(layer, bacc_per_time.next(time, direction), direction), w, WeiBuf.hidden_weight_offset(layer, direction), workSpace, - RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) @@ -4919,9 +4915,7 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( &activDesc, &propagate_hidden_output, &propagate_hidden_prev, - max_batch](int layer, int time, int direction) { - const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; - + max_batch](int layer, int time, RnnDirection direction) { std::vector hx_stride{max_batch * hidden_size, hidden_size, 1}; std::vector reserve_stride{ @@ -4936,35 +4930,36 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( propagate_hidden_prev(layer, time, direction); } - std::vector reserve_size{1, batches.at(cur_time), hidden_size}; + std::vector reserve_size{1, batches.at(time, direction), hidden_size}; auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); float alpha = 1, beta = 0; - activDesc.Backward(handle, - &alpha, - reserve_desc, - reserveSpace, - reserve_desc, - workSpace, - reserve_desc, - reserveSpace, - &beta, - reserve_desc, - workSpace, - RBuff.hidden_offset(layer, bacc_per_time[cur_time], direction), - RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), - RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction), - RBuff.gemm_write_offset(layer, bacc_per_time[cur_time], direction)); + activDesc.Backward( + handle, + &alpha, + reserve_desc, + reserveSpace, + reserve_desc, + workSpace, + reserve_desc, + reserveSpace, + &beta, + reserve_desc, + workSpace, + RBuff.hidden_offset(layer, bacc_per_time.at(time, direction), direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction)); }; auto propagate_hidden = [this, seqLen, propagate_hidden_time](int layer) { for(int time = seqLen - 1; time >= 0; time--) { - propagate_hidden_time(layer, time, rnn_direction::Forward); + propagate_hidden_time(layer, time, RnnDirection::Forward); if(dirMode == 0u) continue; - propagate_hidden_time(layer, time, rnn_direction::Backward); + propagate_hidden_time(layer, time, RnnDirection::Backward); } }; @@ -4978,13 +4973,9 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( w, dhx, &get_HxBuff_offset, - workSpace, - seqLen](int layer, int time, int direction) { - const int cur_time = direction == rnn_direction::Forward ? time : seqLen - time - 1; - const int prev_time = direction == rnn_direction::Forward ? cur_time - 1 : cur_time + 1; - - int batch_size = - time == 0 ? batches.at(cur_time) : batches.at(cur_time) - batches.at(prev_time); + workSpace](int layer, int time, RnnDirection direction) { + int batch_size = time == 0 ? 
batches.at(time, direction) + : batches.at(time, direction) - batches.prev(time, direction); if(batch_size <= 0) return; @@ -5007,10 +4998,10 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( rnn_data_type, false}; - int output_batch = time == 0 ? 0 : batches.at(prev_time); + int output_batch = time == 0 ? 0 : batches.prev(time, direction); - int input_batch = - time == 0 ? bacc_per_time[cur_time] : bacc_per_time[prev_time] - batch_size; + int input_batch = time == 0 ? bacc_per_time.at(time, direction) + : bacc_per_time.prev(time, direction) - batch_size; miopenStatus_t gemm_status = CallGemm(handle, @@ -5042,12 +5033,12 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( for(int time = 0; time < seqLen; time++) { - propagate_dhx_prev(layer, time, rnn_direction::Forward); + propagate_dhx_prev(layer, time, RnnDirection::Forward); if(dirMode == 0u) continue; - propagate_dhx_prev(layer, time, rnn_direction::Backward); + propagate_dhx_prev(layer, time, RnnDirection::Backward); } }; From 6f25e7d287458990027f8e6ce0238ad336280e86 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Thu, 11 Jan 2024 19:13:35 +0300 Subject: [PATCH 25/27] Enabled miopenRNNskip mode in RNNForwardTrainingTanhRelu --- src/ocl/rnnocl.cpp | 135 +++++++++++++++++++++++++++------------------ 1 file changed, 81 insertions(+), 54 deletions(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 747c3bc7f6..b0b1babd0e 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -126,62 +126,89 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, activDesc = {miopenActivationTANH, 1, 1, 1}; } - auto call_relu_tan_input_gemm = - [&RBuff, &WeiBuf, &in_vec_size, &handle, &xDesc, reserveSpace, x, w](int layer, - float beta_t = 1) { - const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size(), - k = layer > 0 ? RBuff.gemm_write_size() : in_vec_size; - - const int lda = layer > 0 ? RBuff.gemm_write_stride() : in_vec_size, ldb = k, - ldc = RBuff.gemm_write_stride(); - - const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, - false, - true, - m, - n, - k, - lda, - ldb, - ldc, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - beta_t, // beta - xDesc.GetType(), - false}; - - const auto input_weight_offset = WeiBuf.input_weight_offset(layer); - const auto output_offset = RBuff.layer_offset(layer); - - const auto input_offset = - layer > 0 ? RBuff.hidden_offset(layer - 1, 0, RnnDirection::Forward) : 0; - - const auto input_ptr = layer > 0 ? 
reserveSpace : x; - - const miopenStatus_t gemm_status = CallGemm(handle, - gemm_desc, - input_ptr, - input_offset, - w, - input_weight_offset, - reserveSpace, - output_offset, - GemmBackend_t::rocblas); - if(gemm_status != miopenStatusSuccess) + auto call_relu_tan_input_gemm = [this, + &RBuff, + &WeiBuf, + &in_vec_size, + &handle, + &xDesc, + reserveSpace, + x, + w, + hidden_size, + wDesc, + bi](int layer, float beta_t = 1) { + if(inputMode == miopenRNNskip && layer == 0) + { + auto input_desc = + miopen::TensorDescriptor(wDesc.GetType(), + {1, RBuff.batches_per_layer, hidden_size}, + {1, RBuff.batches_per_layer * in_vec_size, in_vec_size}); + auto reserve_desc = + miopen::TensorDescriptor(wDesc.GetType(), + {1, RBuff.batches_per_layer, hidden_size}, + {RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); + + for(int gi = 0; gi < nHiddenTensorsPerLayer * bi; gi++) { - if(gemm_status == miopenStatusNotImplemented) - { - MIOPEN_LOG_E("GEMM not implemented"); - } - else - { - MIOPEN_LOG_E("GEMM failed"); - } + CopyTensor(handle, x_desc, x, sp_desc, reserveSpace, 0, gi * hidden_size); } - }; + return; + } + + const int m = RBuff.batches_per_layer, n = RBuff.gemm_write_size(), + k = layer > 0 ? RBuff.gemm_write_size() : in_vec_size; + + const int lda = layer > 0 ? RBuff.gemm_write_stride() : in_vec_size, ldb = k, + ldc = RBuff.gemm_write_stride(); + + const miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + true, + m, + n, + k, + lda, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + beta_t, // beta + xDesc.GetType(), + false}; + + const auto input_weight_offset = WeiBuf.input_weight_offset(layer); + const auto output_offset = RBuff.layer_offset(layer); + + const auto input_offset = + layer > 0 ? RBuff.hidden_offset(layer - 1, 0, RnnDirection::Forward) : 0; + + const auto input_ptr = layer > 0 ? 
reserveSpace : x; + + const miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc, + input_ptr, + input_offset, + w, + input_weight_offset, + reserveSpace, + output_offset, + GemmBackend_t::rocblas); + if(gemm_status != miopenStatusSuccess) + { + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } + } + }; auto call_relu_tan_bias_add = [this, &RBuff, From a145ecb5fa40a2c9ec2bed1934fd370f860d2ad8 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Fri, 12 Jan 2024 19:30:08 +0300 Subject: [PATCH 26/27] Descriptors renaming --- src/ocl/rnnocl.cpp | 436 +++++++++++++++++++++++---------------------- 1 file changed, 222 insertions(+), 214 deletions(-) diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index b0b1babd0e..29c4b0bfd8 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -62,6 +62,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, RnnBatches batches(seq_array); RnnBatches bacc_per_time; + auto rnn_data_type = wDesc.GetType(); + float beta = 0; int in_vec_size = xDesc.GetLengths()[1]; @@ -88,18 +90,22 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, MIOPEN_THROW(miopenStatusBadParm); } - const auto sp_tensor_size = reserveSpaceSize / GetTypeSize(wDesc.GetType()); + const auto sp_tensor_size = reserveSpaceSize / GetTypeSize(rnn_data_type); auto sp_desc = miopen::TensorDescriptor( - wDesc.GetType(), {1, 1, sp_tensor_size}, {sp_tensor_size, sp_tensor_size, 1}); + rnn_data_type, {1, 1, sp_tensor_size}, {sp_tensor_size, sp_tensor_size, 1}); + // Clear reserveSpace buffer + // SetTensor(handle, sp_desc, reserveSpace, &beta); if(hy != nullptr) { const auto hy_tensor_size = biNumLayers * max_batch * hidden_size; auto hy_desc = miopen::TensorDescriptor( - wDesc.GetType(), {1, 1, hy_tensor_size}, {hy_tensor_size, hy_tensor_size, 1}); + rnn_data_type, {1, 1, hy_tensor_size}, {hy_tensor_size, hy_tensor_size, 1}); + // Clear hy buffer + // SetTensor(handle, hy_desc, hy, &beta); } @@ -136,22 +142,22 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, x, w, hidden_size, - wDesc, + rnn_data_type, bi](int layer, float beta_t = 1) { if(inputMode == miopenRNNskip && layer == 0) { auto input_desc = - miopen::TensorDescriptor(wDesc.GetType(), + miopen::TensorDescriptor(rnn_data_type, {1, RBuff.batches_per_layer, hidden_size}, {1, RBuff.batches_per_layer * in_vec_size, in_vec_size}); auto reserve_desc = - miopen::TensorDescriptor(wDesc.GetType(), + miopen::TensorDescriptor(rnn_data_type, {1, RBuff.batches_per_layer, hidden_size}, {RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); for(int gi = 0; gi < nHiddenTensorsPerLayer * bi; gi++) { - CopyTensor(handle, x_desc, x, sp_desc, reserveSpace, 0, gi * hidden_size); + CopyTensor(handle, input_desc, x, reserve_desc, reserveSpace, 0, gi * hidden_size); } return; } @@ -214,7 +220,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &RBuff, &WeiBuf, &handle, - &wDesc, + rnn_data_type, reserveSpace, w, &hx, @@ -226,27 +232,25 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, float alpha0 = 1; float alpha1 = 1; - auto bias_desc = miopen::TensorDescriptor( - wDesc.GetType(), - std::vector{1, 1, WeiBuf.bias_stride()}, - std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); + auto bias_desc = miopen::TensorDescriptor(rnn_data_type, + {1, 1, WeiBuf.bias_stride()}, + {WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); - const auto hidden_interim_desc = miopen::TensorDescriptor( - 
wDesc.GetType(), - std::vector{ - 1, static_cast(RBuff.batches_per_layer), WeiBuf.bias_stride()}, - std::vector{RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); + const auto hidden_desc = + miopen::TensorDescriptor(rnn_data_type, + {1, RBuff.batches_per_layer, WeiBuf.bias_stride()}, + {RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); OpTensor(handle, miopenTensorOpAdd, &alpha0, - hidden_interim_desc, + hidden_desc, reserveSpace, // A &alpha1, bias_desc, w, // B &beta_t, - hidden_interim_desc, + hidden_desc, reserveSpace, // C RBuff.layer_offset(layer), // A offset WeiBuf.bias_off(layer), // B offset @@ -257,13 +261,13 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, OpTensor(handle, miopenTensorOpAdd, &alpha0, - hidden_interim_desc, + hidden_desc, reserveSpace, &alpha1, bias_desc, w, &beta_t, - hidden_interim_desc, + hidden_desc, reserveSpace, RBuff.layer_offset(layer), WeiBuf.bias_off(layer) + WeiBuf.bias_stride(), @@ -271,18 +275,18 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, return; } + // hx == nullptr + // if((RBuff.batches_per_layer - max_batch) <= 0) return; - auto reserve_desc = miopen::TensorDescriptor( - wDesc.GetType(), - std::vector{1, RBuff.batches_per_layer - max_batch, hidden_size}, - std::vector{RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); + auto reserve_desc = + miopen::TensorDescriptor(rnn_data_type, + {1, RBuff.batches_per_layer - max_batch, hidden_size}, + {RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); bias_desc = miopen::TensorDescriptor( - wDesc.GetType(), - std::vector{1, 1, hidden_size}, - std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); + rnn_data_type, {1, 1, hidden_size}, {WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); OpTensor(handle, miopenTensorOpAdd, @@ -330,9 +334,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, bacc_per_time.at(ti, RnnDirection::Forward) * RBuff.gemm_write_stride(); reserve_desc = miopen::TensorDescriptor( - wDesc.GetType(), - std::vector{1, batches.at(ti + 1, RnnDirection::Forward), hidden_size}, - std::vector{WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); + rnn_data_type, + {1, batches.at(ti + 1, RnnDirection::Forward), hidden_size}, + {WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); OpTensor(handle, miopenTensorOpAdd, @@ -476,20 +480,20 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, }; auto call_relu_tan_hidden_state_update = - [&RBuff, hidden_size, &handle, &wDesc, reserveSpace, &activDesc, &batches, &bacc_per_time]( - int layer_id, int time, RnnDirection direction) { + [&RBuff, + hidden_size, + &handle, + rnn_data_type, + reserveSpace, + &activDesc, + &batches, + &bacc_per_time](int layer_id, int time, RnnDirection direction) { float alpha = 1, beta = 0; - const std::vector tensor_size{1, - static_cast(batches.at(time, direction)), - static_cast(hidden_size)}; - - const std::vector tensor_stride{static_cast(RBuff.layer_stride()), - static_cast(RBuff.gemm_write_stride()), - 1}; - - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); - auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), tensor_size, tensor_stride); + auto hidden_desc = + miopen::TensorDescriptor(rnn_data_type, + {1, batches.at(time, direction), hidden_size}, + {RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); const auto RB_layer_save_points_off = RBuff.gemm_write_offset(layer_id, bacc_per_time.at(time, direction), direction); @@ -500,12 +504,12 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& 
handle, activDesc.Forward(handle, &alpha, // input tensor descriptor - src_desc, + hidden_desc, // input pointer reserveSpace, &beta, // output tensor descriptor - dst_desc, + hidden_desc, // output pointer reserveSpace, // input tensor offset @@ -518,7 +522,7 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &get_HxBuff_offset, hidden_size, &handle, - &wDesc, + rnn_data_type, reserveSpace, hy, max_batch, @@ -550,8 +554,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const std::vector hcy_copy_size{ 1, static_cast(copy_batches), static_cast(hidden_size)}; - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_src_stride); - auto dst_desc = miopen::TensorDescriptor(wDesc.GetType(), hcy_copy_size, hcy_dst_stride); + auto src_desc = miopen::TensorDescriptor(rnn_data_type, hcy_copy_size, hcy_src_stride); + auto dst_desc = miopen::TensorDescriptor(rnn_data_type, hcy_copy_size, hcy_dst_stride); CopyTensor(handle, src_desc, @@ -605,8 +609,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, static_cast(out_vec_size), 1}; - auto src_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_src_stride); - auto y_dst_desc = miopen::TensorDescriptor(wDesc.GetType(), y_copy_size, y_dst_stride); + auto src_desc = miopen::TensorDescriptor(rnn_data_type, y_copy_size, y_src_stride); + auto y_dst_desc = miopen::TensorDescriptor(rnn_data_type, y_copy_size, y_dst_stride); CopyTensor(handle, src_desc, @@ -4626,11 +4630,11 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( if(dhx != nullptr) { - int dhx_size = max_batch * hidden_size * hy_d; - auto hx_desc = miopen::TensorDescriptor(rnn_data_type, - std::vector{1, 1, dhx_size}, - std::vector{dhx_size, dhx_size, 1}); - SetTensor(handle, hx_desc, dhx, &beta); + int dhx_size = max_batch * hidden_size * hy_d; + auto dhx_desc = miopen::TensorDescriptor(rnn_data_type, + std::vector{1, 1, dhx_size}, + std::vector{dhx_size, dhx_size, 1}); + SetTensor(handle, dhx_desc, dhx, &beta); } int total_batch_size = 0; @@ -4703,128 +4707,133 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( static_cast(hidden_size) * batch_id; }; - auto propagate_output = - [&RBuff, out_vec_size, &handle, rnn_data_type, workSpace, dy, &WeiBuf, w](int numLayers, - int layer) { - // Propagate output - // - if(layer == numLayers - 1) - { - const std::vector y_src_size{1, - static_cast(RBuff.batches_per_layer), - static_cast(out_vec_size)}; + auto propagate_output = [&RBuff, + out_vec_size, + &handle, + rnn_data_type, + workSpace, + dy, + &WeiBuf, + w](int numLayers, int layer) { + // Propagate output + // + if(layer == numLayers - 1) + { + const std::vector output_size{ + 1, static_cast(RBuff.batches_per_layer), static_cast(out_vec_size)}; - const std::vector y_dst_size{1, + const std::vector workspace_size{1, static_cast(RBuff.batches_per_layer), static_cast(RBuff.gemm_write_size())}; - const std::vector y_dst_stride{ - RBuff.layer_stride(), static_cast(RBuff.gemm_write_size()), 1}; + const std::vector workspace_stride{ + RBuff.layer_stride(), static_cast(RBuff.gemm_write_size()), 1}; - const std::vector y_src_stride{ - static_cast(out_vec_size * RBuff.batches_per_layer), - static_cast(out_vec_size), - 1}; + const std::vector output_stride{ + static_cast(out_vec_size * RBuff.batches_per_layer), + static_cast(out_vec_size), + 1}; - auto y_src_desc = miopen::TensorDescriptor(rnn_data_type, y_src_size, y_src_stride); - auto y_dst_desc = miopen::TensorDescriptor(rnn_data_type, y_dst_size, y_dst_stride); + 
auto y_src_desc = miopen::TensorDescriptor(rnn_data_type, output_size, output_stride); + auto y_dst_desc = + miopen::TensorDescriptor(rnn_data_type, workspace_size, workspace_stride); - CopyTensor( - handle, y_src_desc, dy, y_dst_desc, workSpace, 0, RBuff.layer_offset(layer)); - } - // Propagate previous layer - // - else - { - miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, - false, - false, - RBuff.batches_per_layer, - RBuff.gemm_write_size(), - RBuff.gemm_write_size(), - RBuff.gemm_write_size(), - RBuff.gemm_write_size(), - RBuff.gemm_write_size(), - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - 1, // beta - rnn_data_type, - false}; + CopyTensor(handle, y_src_desc, dy, y_dst_desc, workSpace, 0, RBuff.layer_offset(layer)); + } + // Propagate previous layer + // + else + { + miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + false, + RBuff.batches_per_layer, + RBuff.gemm_write_size(), + RBuff.gemm_write_size(), + RBuff.gemm_write_size(), + RBuff.gemm_write_size(), + RBuff.gemm_write_size(), + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + rnn_data_type, + false}; - miopenStatus_t gemm_status = CallGemm(handle, - gemm_desc, - workSpace, - RBuff.layer_offset(layer + 1), - w, - WeiBuf.input_weight_offset(layer + 1), - workSpace, - RBuff.layer_offset(layer), - GemmBackend_t::rocblas); + miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc, + workSpace, + RBuff.layer_offset(layer + 1), + w, + WeiBuf.input_weight_offset(layer + 1), + workSpace, + RBuff.layer_offset(layer), + GemmBackend_t::rocblas); - if(gemm_status != miopenStatusSuccess) + if(gemm_status != miopenStatusSuccess) + { + if(gemm_status == miopenStatusNotImplemented) { - if(gemm_status == miopenStatusNotImplemented) - { - MIOPEN_LOG_E("GEMM not implemented"); - } - else - { - MIOPEN_LOG_E("GEMM failed"); - } + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); } } - }; + } + }; - auto propagate_hidden_output = [&RBuff, - &handle, - hidden_size, - &batches, - &bacc_per_time, - rnn_data_type, - dhy, - workSpace, - seqLen, - &get_HxBuff_offset]( - int layer, int time, RnnDirection direction) { - if(dhy == nullptr) - return; + auto propagate_hidden_output = + [&RBuff, + &handle, + hidden_size, + &batches, + &bacc_per_time, + rnn_data_type, + dhy, + workSpace, + seqLen, + &get_HxBuff_offset](int layer, int time, RnnDirection direction) { + if(dhy == nullptr) + return; - const int start_time = direction == RnnDirection::Forward ? 0 : seqLen - time - 1; + const int start_time = direction == RnnDirection::Forward ? 
0 : seqLen - time - 1; - float alpha0 = 1; - float alpha1 = 1; - float beta_t = 0; + float alpha0 = 1; + float alpha1 = 1; + float beta_t = 0; - std::vector hx_stride{ - batches.at(start_time, RnnDirection::Forward) * hidden_size, hidden_size, 1}; - std::vector reserve_stride{ - static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; + std::vector hx_stride{ + batches.at(start_time, RnnDirection::Forward) * hidden_size, hidden_size, 1}; + std::vector workspace_stride{ + static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hx_size{1, batches.at(time, direction), hidden_size}; - std::vector reserve_size{1, batches.at(time, direction), hidden_size}; + std::vector hx_size{1, batches.at(time, direction), hidden_size}; + std::vector reserve_size{1, batches.at(time, direction), hidden_size}; - auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); + auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); - auto workspace_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); + auto workspace_desc = + miopen::TensorDescriptor(rnn_data_type, reserve_size, workspace_stride); - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hx_desc, - dhy, - &alpha1, - workspace_desc, - workSpace, - &beta_t, - workspace_desc, - workSpace, - get_HxBuff_offset(layer, 0, direction), - RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), - RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction)); - }; + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + hx_desc, + dhy, + &alpha1, + workspace_desc, + workSpace, + &beta_t, + workspace_desc, + workSpace, + get_HxBuff_offset(layer, 0, direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction)); + }; auto propagate_hidden_prev = [&RBuff, &handle, @@ -4841,35 +4850,35 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( if(direction == RnnDirection::Forward && dhy != nullptr && batches.at(time, direction) > batches.next(time, direction)) { - std::vector hx_stride{max_batch * hidden_size, hidden_size, 1}; + std::vector hy_stride{max_batch * hidden_size, hidden_size, 1}; - std::vector reserve_stride{ + std::vector hidden_tensor_stride{ static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hx_size{ + std::vector hy_size{ 1, batches.at(time, direction) - batches.next(time, direction), hidden_size}; - std::vector reserve_size{ + std::vector hidden_tensor_size{ 1, batches.at(time, direction) - batches.next(time, direction), hidden_size}; float alpha0 = 1; float alpha1 = 1; float beta_t = 0; - auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); - auto reserve_desc = - miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); + auto hy_tensor_desc = miopen::TensorDescriptor(rnn_data_type, hy_size, hy_stride); + auto hidden_tensor_desc = + miopen::TensorDescriptor(rnn_data_type, hidden_tensor_size, hidden_tensor_stride); OpTensor(handle, miopenTensorOpAdd, &alpha0, - hx_desc, + hy_tensor_desc, dhy, &alpha1, - reserve_desc, + hidden_tensor_desc, workSpace, &beta_t, - reserve_desc, + hidden_tensor_desc, workSpace, get_HxBuff_offset(layer, batches.next(time, direction), direction), RBuff.gemm_write_offset(layer, @@ -4930,57 +4939,56 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } }; - auto propagate_hidden_time = [&RBuff, - &handle, - hidden_size, - seqLen, - 
&batches, - &bacc_per_time, - rnn_data_type, - workSpace, - reserveSpace, - &activDesc, - &propagate_hidden_output, - &propagate_hidden_prev, - max_batch](int layer, int time, RnnDirection direction) { - std::vector hx_stride{max_batch * hidden_size, hidden_size, 1}; + auto propagate_hidden_time = + [&RBuff, + &handle, + hidden_size, + seqLen, + &batches, + &bacc_per_time, + rnn_data_type, + workSpace, + reserveSpace, + &activDesc, + &propagate_hidden_output, + &propagate_hidden_prev](int layer, int time, RnnDirection direction) { + if(time == seqLen - 1) + { + propagate_hidden_output(layer, time, direction); + } + else + { + propagate_hidden_prev(layer, time, direction); + } - std::vector reserve_stride{ - RBuff.batches_per_layer * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; + std::vector hidden_tensor_stride{ + RBuff.batches_per_layer * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; + std::vector hidden_tensor_size{1, batches.at(time, direction), hidden_size}; - if(time == seqLen - 1) - { - propagate_hidden_output(layer, time, direction); - } - else - { - propagate_hidden_prev(layer, time, direction); - } + auto hidden_desc = + miopen::TensorDescriptor(rnn_data_type, hidden_tensor_size, hidden_tensor_stride); - std::vector reserve_size{1, batches.at(time, direction), hidden_size}; - auto reserve_desc = miopen::TensorDescriptor(rnn_data_type, reserve_size, reserve_stride); - - float alpha = 1, beta = 0; - - activDesc.Backward( - handle, - &alpha, - reserve_desc, - reserveSpace, - reserve_desc, - workSpace, - reserve_desc, - reserveSpace, - &beta, - reserve_desc, - workSpace, - RBuff.hidden_offset(layer, bacc_per_time.at(time, direction), direction), - RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), - RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), - RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction)); - }; + float alpha = 1, beta = 0; + + activDesc.Backward( + handle, + &alpha, + hidden_desc, + reserveSpace, + hidden_desc, + workSpace, + hidden_desc, + reserveSpace, + &beta, + hidden_desc, + workSpace, + RBuff.hidden_offset(layer, bacc_per_time.at(time, direction), direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction)); + }; - auto propagate_hidden = [this, seqLen, propagate_hidden_time](int layer) { + auto propagate_hidden = [this, seqLen, &propagate_hidden_time](int layer) { for(int time = seqLen - 1; time >= 0; time--) { propagate_hidden_time(layer, time, RnnDirection::Forward); From ade5f3a594547f81508dcb627eacc8cef0f062b0 Mon Sep 17 00:00:00 2001 From: Alexey Akimov Date: Sat, 27 Jan 2024 12:39:31 +0300 Subject: [PATCH 27/27] RNNBackwardData function renaming --- src/include/miopen/rnn_util.hpp | 9 +- src/ocl/rnnocl.cpp | 754 +++++++++++++++++--------------- 2 files changed, 406 insertions(+), 357 deletions(-) diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index ea2b7e065a..5b65de55f1 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -32,6 +32,7 @@ #include #include #include +#include namespace miopen { @@ -43,7 +44,6 @@ enum class RnnDirection struct RnnBatches { - int at(int time, RnnDirection direction) const { return batches.at(cur_time(time, direction)); } int next(int time, RnnDirection direction) const @@ -211,7 
+211,12 @@ struct ReluWeightOffsets (h_vec_sz * bi_scale + h_vec_sz) * (num_layers - 1) * weight_stride; } - int bias_off(int layer_id) const { return bias_off() + bias_count * layer_id * weight_stride; } + int + bias_off(int layer_id, int bias_id, RnnDirection direction) const + { + return bias_off() + bias_count * layer_id * weight_stride + bias_id * bias_stride() + + static_cast(direction) * h_vec_sz; + } int weight_stride; private: diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp index 29c4b0bfd8..89695382b1 100644 --- a/src/ocl/rnnocl.cpp +++ b/src/ocl/rnnocl.cpp @@ -143,21 +143,21 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, w, hidden_size, rnn_data_type, - bi](int layer, float beta_t = 1) { + bi](int layer) { if(inputMode == miopenRNNskip && layer == 0) { - auto input_desc = + auto x_desc = miopen::TensorDescriptor(rnn_data_type, {1, RBuff.batches_per_layer, hidden_size}, - {1, RBuff.batches_per_layer * in_vec_size, in_vec_size}); - auto reserve_desc = + {1, static_cast(RBuff.batches_per_layer ) * in_vec_size, in_vec_size}); + auto ht_desc = miopen::TensorDescriptor(rnn_data_type, {1, RBuff.batches_per_layer, hidden_size}, {RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); for(int gi = 0; gi < nHiddenTensorsPerLayer * bi; gi++) { - CopyTensor(handle, input_desc, x, reserve_desc, reserveSpace, 0, gi * hidden_size); + CopyTensor(handle, x_desc, x, ht_desc, reserveSpace, 0, gi * hidden_size); } return; } @@ -177,31 +177,32 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, lda, ldb, ldc, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - beta_t, // beta + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta xDesc.GetType(), false}; - const auto input_weight_offset = WeiBuf.input_weight_offset(layer); - const auto output_offset = RBuff.layer_offset(layer); + const auto wx_offset = WeiBuf.input_weight_offset(layer); + const auto ht_offset = RBuff.layer_offset(layer); - const auto input_offset = + const auto xt_offset = layer > 0 ? RBuff.hidden_offset(layer - 1, 0, RnnDirection::Forward) : 0; const auto input_ptr = layer > 0 ? 
reserveSpace : x; - + // Ht(t)^ = Whx(t)*x(t), t = 0:seq_len - 1 + // const miopenStatus_t gemm_status = CallGemm(handle, gemm_desc, input_ptr, - input_offset, + xt_offset, w, - input_weight_offset, + wx_offset, reserveSpace, - output_offset, + ht_offset, GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) { @@ -228,49 +229,54 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &seq_len, max_batch, &batches, - &bacc_per_time](int layer, float beta_t = 0) { + &bacc_per_time](int layer) { float alpha0 = 1; float alpha1 = 1; + float beta = 0; auto bias_desc = miopen::TensorDescriptor(rnn_data_type, {1, 1, WeiBuf.bias_stride()}, {WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); - const auto hidden_desc = + auto ht_desc = miopen::TensorDescriptor(rnn_data_type, {1, RBuff.batches_per_layer, WeiBuf.bias_stride()}, {RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); + // Ht(t)^ = Ht(t)^ + b, t = 0:seq_len - 1 + // OpTensor(handle, miopenTensorOpAdd, &alpha0, - hidden_desc, - reserveSpace, // A + ht_desc, + reserveSpace, &alpha1, bias_desc, - w, // B - &beta_t, - hidden_desc, - reserveSpace, // C - RBuff.layer_offset(layer), // A offset - WeiBuf.bias_off(layer), // B offset - RBuff.layer_offset(layer)); // C offset + w, + &beta, + ht_desc, + reserveSpace, + RBuff.layer_offset(layer), + WeiBuf.bias_off(layer, 0, RnnDirection::Forward), + RBuff.layer_offset(layer)); if(hx != nullptr) { + // Ht(t)^ = H(t)^ + hx_bias, t = 0:seq_len - 1 + // OpTensor(handle, miopenTensorOpAdd, &alpha0, - hidden_desc, + ht_desc, reserveSpace, &alpha1, bias_desc, w, - &beta_t, - hidden_desc, + &beta, + ht_desc, reserveSpace, RBuff.layer_offset(layer), - WeiBuf.bias_off(layer) + WeiBuf.bias_stride(), + WeiBuf.bias_off(layer, 1, RnnDirection::Forward), RBuff.layer_offset(layer)); return; } @@ -280,10 +286,9 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, if((RBuff.batches_per_layer - max_batch) <= 0) return; - auto reserve_desc = - miopen::TensorDescriptor(rnn_data_type, - {1, RBuff.batches_per_layer - max_batch, hidden_size}, - {RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); + ht_desc = miopen::TensorDescriptor(rnn_data_type, + {1, RBuff.batches_per_layer - max_batch, hidden_size}, + {RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); bias_desc = miopen::TensorDescriptor( rnn_data_type, {1, 1, hidden_size}, {WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); @@ -291,17 +296,17 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, OpTensor(handle, miopenTensorOpAdd, &alpha0, - reserve_desc, + ht_desc, reserveSpace, &alpha1, bias_desc, w, - &beta_t, - reserve_desc, + &beta, + ht_desc, reserveSpace, - RBuff.layer_offset(layer) + max_batch * RBuff.gemm_write_stride(), - WeiBuf.bias_off(layer) + WeiBuf.bias_stride(), - RBuff.layer_offset(layer) + max_batch * RBuff.gemm_write_stride(), + RBuff.gemm_write_offset(layer, max_batch, RnnDirection::Forward), + WeiBuf.bias_off(layer, 1, RnnDirection::Forward), + RBuff.gemm_write_offset(layer, max_batch, RnnDirection::Forward), true); if(dirMode == 0u) @@ -312,46 +317,46 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, OpTensor(handle, miopenTensorOpAdd, &alpha0, - reserve_desc, + ht_desc, reserveSpace, &alpha1, bias_desc, w, - &beta_t, - reserve_desc, + &beta, + ht_desc, reserveSpace, - RBuff.layer_offset(layer) + hidden_size, - WeiBuf.bias_off(layer) + WeiBuf.bias_stride() + hidden_size, - RBuff.layer_offset(layer) + hidden_size, + RBuff.gemm_write_offset(layer, 0, RnnDirection::Backward), + 
WeiBuf.bias_off(layer, 1, RnnDirection::Backward), + RBuff.gemm_write_offset(layer, 0, RnnDirection::Backward), true); return; } + // Ht(t)^ = Ht(t)^ + bias2, t = 1:seq_len - 1, reverse direction + // for(int ti = 0; ti < seq_len - 1; ti++) { + auto ht_offset = RBuff.gemm_write_offset( + layer, bacc_per_time.at(ti, RnnDirection::Forward), RnnDirection::Backward); - auto offset = RBuff.layer_offset(layer) + - bacc_per_time.at(ti, RnnDirection::Forward) * RBuff.gemm_write_stride(); - - reserve_desc = miopen::TensorDescriptor( + ht_desc = miopen::TensorDescriptor( rnn_data_type, {1, batches.at(ti + 1, RnnDirection::Forward), hidden_size}, {WeiBuf.bias_stride(), WeiBuf.bias_stride(), 1}); - OpTensor(handle, miopenTensorOpAdd, &alpha0, - reserve_desc, + ht_desc, reserveSpace, &alpha1, bias_desc, w, - &beta_t, - reserve_desc, + &beta, + ht_desc, reserveSpace, - static_cast(offset) + hidden_size, - WeiBuf.bias_off(layer) + WeiBuf.bias_stride() + hidden_size, - static_cast(offset) + hidden_size, + ht_offset, + WeiBuf.bias_off(layer, 1, RnnDirection::Backward), + ht_offset, true); } }; @@ -385,25 +390,29 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, if(time != 0 && direction == RnnDirection::Backward && hx != nullptr && batches.at(time, direction) > batches.prev(time, direction)) { - miopen::GemmDescriptor gemm_desc = - GemmDescriptor{false, - false, - true, - batches.at(time, direction) - batches.prev(time, direction), - n, - k, - hidden_size, - ldb, - ldc, - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - 1, // beta - xDesc.GetType(), - false}; + auto dbatches = batches.at(time, direction) - batches.prev(time, direction); + miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + true, + dbatches, + n, + k, + hidden_size, + ldb, + ldc, + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + xDesc.GetType(), + false}; + // Ht(t)^ = Ht(t)^ + Whx * hx, + // for batches = bacc_per_time.prev(time, direction) - dbatches : + // bacc_per_time.prev(time, direction), t = 1:seq_len - 1 + // const miopenStatus_t gemm_status = CallGemm(handle, gemm_desc, @@ -412,10 +421,8 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, w, WeiBuf.hidden_weight_offset(layer, direction), reserveSpace, - RBuff.gemm_write_offset(layer, - bacc_per_time.at(time, direction) + - batches.prev(time, direction), - direction), + RBuff.gemm_write_offset( + layer, bacc_per_time.prev(time, direction) - dbatches, direction), GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) @@ -449,22 +456,24 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, xDesc.GetType(), false}; - const auto hidden_offset = + const auto ht_offset = (time == 0) ? 
get_HxBuff_offset(layer, 0, direction) : RBuff.hidden_offset(layer, bacc_per_time.prev(time, direction), direction); - const auto save_point_offset = + const auto not_activated_ht_offset = RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction); + // Ht(t)^ = Ht(t)^ + Whh * Ht(t-1) + // const miopenStatus_t gemm_status = CallGemm(handle, gemm_desc_hx, ht_ptr, - hidden_offset, + ht_offset, w, WeiBuf.hidden_weight_offset(layer, direction), reserveSpace, - save_point_offset, + not_activated_ht_offset, GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) { @@ -490,30 +499,32 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, &bacc_per_time](int layer_id, int time, RnnDirection direction) { float alpha = 1, beta = 0; - auto hidden_desc = + auto ht_desc = miopen::TensorDescriptor(rnn_data_type, {1, batches.at(time, direction), hidden_size}, {RBuff.layer_stride(), RBuff.gemm_write_stride(), 1}); - const auto RB_layer_save_points_off = + const auto ht_not_activated_offset = RBuff.gemm_write_offset(layer_id, bacc_per_time.at(time, direction), direction); const auto hidden_offset = RBuff.hidden_offset(layer_id, bacc_per_time.at(time, direction), direction); + // Ht(t) = @(Ht^(t)) + // activDesc.Forward(handle, &alpha, // input tensor descriptor - hidden_desc, + ht_desc, // input pointer reserveSpace, &beta, // output tensor descriptor - hidden_desc, + ht_desc, // output pointer reserveSpace, // input tensor offset - RB_layer_save_points_off, + ht_not_activated_offset, // output tensor offset hidden_offset); }; @@ -532,11 +543,11 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, if(hy == nullptr) return; - auto copy_batches = time == seq_len - 1 - ? batches.at(time, direction) - : batches.at(time, direction) - batches.next(time, direction); + auto dbatches = time == seq_len - 1 + ? batches.at(time, direction) + : batches.at(time, direction) - batches.next(time, direction); - if(copy_batches <= 0) + if(dbatches <= 0) return; const std::vector hcy_src_stride{ @@ -545,14 +556,14 @@ void RNNDescriptor::RNNForwardTrainingTanhRelu(Handle& handle, const std::vector hcy_dst_stride{ static_cast(hidden_size * max_batch), static_cast(hidden_size), 1}; - auto batch_id_relative = batches.at(time, direction) - copy_batches; + auto batch_id_relative = batches.at(time, direction) - dbatches; auto batch_id_abs = time == seq_len - 1 ? 
bacc_per_time.at(time, direction) : bacc_per_time.at(time, direction) + batches.next(time, direction); const std::vector hcy_copy_size{ - 1, static_cast(copy_batches), static_cast(hidden_size)}; + 1, static_cast(dbatches), static_cast(hidden_size)}; auto src_desc = miopen::TensorDescriptor(rnn_data_type, hcy_copy_size, hcy_src_stride); auto dst_desc = miopen::TensorDescriptor(rnn_data_type, hcy_copy_size, hcy_dst_stride); @@ -755,7 +766,7 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle, const int layers; const int gates_cnt; const int - bias_cnt; // 0 - no bisa; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec + bias_cnt; // 0 - no bias; 1 - one bias; 2 - separate bias for x_vec and for hidden_vec private: const size_t matrix_normal_start_off; const size_t bias_start_off; @@ -3133,6 +3144,7 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors( if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout && inputMode != miopenRNNskip && !(miopen::IsDisabled(ENV(MIOPEN_RNNFWD_exp)))) { + RNNForwardTrainingTanhRelu(handle, in_n, xDesc[0], @@ -4448,6 +4460,7 @@ void RNNDescriptor::RNNBackwardData(Handle& handle, if(paddingMode == miopenRNNIONotPadded) { bool use_dropout = !float_equal(miopen::deref(dropoutDesc).dropout, 0); + if((rnnMode == miopenRNNRELU || rnnMode == miopenRNNTANH) && !use_dropout) { RNNBackwardDataPackedTensorsRelu(handle, @@ -4707,200 +4720,222 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( static_cast(hidden_size) * batch_id; }; - auto propagate_output = [&RBuff, - out_vec_size, - &handle, - rnn_data_type, - workSpace, - dy, - &WeiBuf, - w](int numLayers, int layer) { - // Propagate output - // - if(layer == numLayers - 1) - { - const std::vector output_size{ - 1, static_cast(RBuff.batches_per_layer), static_cast(out_vec_size)}; - - const std::vector workspace_size{1, - static_cast(RBuff.batches_per_layer), - static_cast(RBuff.gemm_write_size())}; - - const std::vector workspace_stride{ - RBuff.layer_stride(), static_cast(RBuff.gemm_write_size()), 1}; - - const std::vector output_stride{ - static_cast(out_vec_size * RBuff.batches_per_layer), - static_cast(out_vec_size), - 1}; - - auto y_src_desc = miopen::TensorDescriptor(rnn_data_type, output_size, output_stride); - auto y_dst_desc = - miopen::TensorDescriptor(rnn_data_type, workspace_size, workspace_stride); - - CopyTensor(handle, y_src_desc, dy, y_dst_desc, workSpace, 0, RBuff.layer_offset(layer)); - } - // Propagate previous layer - // - else - { - miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, - false, - false, - RBuff.batches_per_layer, - RBuff.gemm_write_size(), - RBuff.gemm_write_size(), - RBuff.gemm_write_size(), - RBuff.gemm_write_size(), - RBuff.gemm_write_size(), - 1, // batch count - 0, // Stride A - 0, // Stride B - 0, // Stride C - 1, // alpha - 1, // beta - rnn_data_type, - false}; + auto back_propagate_dy = + [&RBuff, out_vec_size, &handle, rnn_data_type, workSpace, dy, &WeiBuf, w](int numLayers, + int layer) { + // Propagate dy from output + // + if(layer == numLayers - 1) + { + const std::vector dy_output_size{ + 1, + static_cast(RBuff.batches_per_layer), + static_cast(out_vec_size)}; + + const std::vector dy_output_stride{ + static_cast(out_vec_size * RBuff.batches_per_layer), + static_cast(out_vec_size), + 1}; + + const std::vector dy_workspace_size{ + 1, + static_cast(RBuff.batches_per_layer), + static_cast(RBuff.gemm_write_size())}; + + const std::vector dy_workspace_stride{ + RBuff.layer_stride(), static_cast(RBuff.gemm_write_size()), 1}; + 
+ auto dy_output_desc = + miopen::TensorDescriptor(rnn_data_type, dy_output_size, dy_output_stride); + auto dy_workspace_desc = + miopen::TensorDescriptor(rnn_data_type, dy_workspace_size, dy_workspace_stride); + + int dy_output_offset = 0; + int dy_workspace_offset = RBuff.layer_offset(layer); + + // dY(l,t) = dy(l,t); t = 1:seq_len - 1 + // + CopyTensor(handle, + dy_output_desc, + dy, + dy_workspace_desc, + workSpace, + dy_output_offset, + dy_workspace_offset); + } + // Propagate dy from previous layer + // + else + { + miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, + false, + false, + RBuff.batches_per_layer, + RBuff.gemm_write_size(), + RBuff.gemm_write_size(), + RBuff.gemm_write_size(), + RBuff.gemm_write_size(), + RBuff.gemm_write_size(), + 1, // batch count + 0, // Stride A + 0, // Stride B + 0, // Stride C + 1, // alpha + 1, // beta + rnn_data_type, + false}; - miopenStatus_t gemm_status = CallGemm(handle, - gemm_desc, - workSpace, - RBuff.layer_offset(layer + 1), - w, - WeiBuf.input_weight_offset(layer + 1), - workSpace, - RBuff.layer_offset(layer), - GemmBackend_t::rocblas); + int dy_prev_layer_offset = RBuff.layer_offset(layer + 1); + int dy_current_layer_offset = RBuff.layer_offset(layer); + + // dY(l,t) = dHt(l+1,t)/Why; t = 1:seq_len - 1 + // + miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc, + workSpace, + dy_prev_layer_offset, + w, + WeiBuf.input_weight_offset(layer + 1), + workSpace, + dy_current_layer_offset, + GemmBackend_t::rocblas); - if(gemm_status != miopenStatusSuccess) - { - if(gemm_status == miopenStatusNotImplemented) - { - MIOPEN_LOG_E("GEMM not implemented"); - } - else + if(gemm_status != miopenStatusSuccess) { - MIOPEN_LOG_E("GEMM failed"); + if(gemm_status == miopenStatusNotImplemented) + { + MIOPEN_LOG_E("GEMM not implemented"); + } + else + { + MIOPEN_LOG_E("GEMM failed"); + } } } - } - }; + }; - auto propagate_hidden_output = - [&RBuff, - &handle, - hidden_size, - &batches, - &bacc_per_time, - rnn_data_type, - dhy, - workSpace, - seqLen, - &get_HxBuff_offset](int layer, int time, RnnDirection direction) { - if(dhy == nullptr) - return; + auto back_propagate_dhy_output = [&RBuff, + &handle, + hidden_size, + &batches, + &bacc_per_time, + rnn_data_type, + dhy, + workSpace, + seqLen, + &get_HxBuff_offset]( + int layer, int time, RnnDirection direction) { + if(dhy == nullptr) + return; - const int start_time = direction == RnnDirection::Forward ? 0 : seqLen - time - 1; + const int start_time = direction == RnnDirection::Forward ? 
0 : seqLen - time - 1; - float alpha0 = 1; - float alpha1 = 1; - float beta_t = 0; + float alpha0 = 1; + float alpha1 = 1; + float beta_t = 0; - std::vector hx_stride{ - batches.at(start_time, RnnDirection::Forward) * hidden_size, hidden_size, 1}; - std::vector workspace_stride{ - static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; + std::vector dhy_out_stride{ + batches.at(start_time, RnnDirection::Forward) * hidden_size, hidden_size, 1}; - std::vector hx_size{1, batches.at(time, direction), hidden_size}; - std::vector reserve_size{1, batches.at(time, direction), hidden_size}; + std::vector dhy_workspace_stride{ + static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - auto hx_desc = miopen::TensorDescriptor(rnn_data_type, hx_size, hx_stride); + std::vector dhy_out_size{1, batches.at(time, direction), hidden_size}; + std::vector dhy_workspace_size{1, batches.at(time, direction), hidden_size}; - auto workspace_desc = - miopen::TensorDescriptor(rnn_data_type, reserve_size, workspace_stride); + auto dhy_out_desc = miopen::TensorDescriptor(rnn_data_type, dhy_out_size, dhy_out_stride); - OpTensor(handle, - miopenTensorOpAdd, - &alpha0, - hx_desc, - dhy, - &alpha1, - workspace_desc, - workSpace, - &beta_t, - workspace_desc, - workSpace, - get_HxBuff_offset(layer, 0, direction), - RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), - RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction)); - }; + auto dhy_workspace_desc = + miopen::TensorDescriptor(rnn_data_type, dhy_workspace_size, dhy_workspace_stride); + + auto dhy_out_offset = get_HxBuff_offset(layer, 0, direction); + auto dhy_workspace_offset = + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction); + + // dHt(l,seq_len - 1) = dY(l,seq_len - 1) + dHy(seq_len - 1) + // + OpTensor(handle, + miopenTensorOpAdd, + &alpha0, + dhy_out_desc, + dhy, + &alpha1, + dhy_workspace_desc, + workSpace, + &beta_t, + dhy_workspace_desc, + workSpace, + dhy_out_offset, + dhy_workspace_offset, + dhy_workspace_offset); + }; + + auto back_propagate_dhy_prev = [&RBuff, + &handle, + hidden_size, + &batches, + &bacc_per_time, + rnn_data_type, + dhy, + workSpace, + &get_HxBuff_offset, + WeiBuf, + w, + max_batch](int layer, int time, RnnDirection direction) { + auto dbatches = batches.at(time, direction) - batches.next(time, direction); - auto propagate_hidden_prev = [&RBuff, - &handle, - hidden_size, - &batches, - &bacc_per_time, - rnn_data_type, - dhy, - workSpace, - &get_HxBuff_offset, - WeiBuf, - w, - max_batch](int layer, int time, RnnDirection direction) { - if(direction == RnnDirection::Forward && dhy != nullptr && - batches.at(time, direction) > batches.next(time, direction)) + if(direction == RnnDirection::Forward && dhy != nullptr && dbatches > 0) { - std::vector hy_stride{max_batch * hidden_size, hidden_size, 1}; + std::vector dhy_stride{max_batch * hidden_size, hidden_size, 1}; - std::vector hidden_tensor_stride{ + std::vector dht_tensor_stride{ static_cast(RBuff.layer_stride()), RBuff.gemm_write_size(), 1}; - std::vector hy_size{ - 1, batches.at(time, direction) - batches.next(time, direction), hidden_size}; + std::vector dhy_size{1, dbatches, hidden_size}; - std::vector hidden_tensor_size{ - 1, batches.at(time, direction) - batches.next(time, direction), hidden_size}; + std::vector dht_tensor_size{1, dbatches, hidden_size}; float alpha0 = 1; float alpha1 = 1; float beta_t = 0; - auto hy_tensor_desc = miopen::TensorDescriptor(rnn_data_type, hy_size, hy_stride); - 
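+            // Sketch of the batch-tail update below (illustrative only, flat element
+            // view): dbatches counts the sequences that end exactly at this time step,
+            // so only those rows receive their dhy contribution, using the offsets
+            // dhy_offset / dht_tensor_offset computed below:
+            //
+            //   for(int b = 0; b < dbatches; ++b)
+            //       for(int h = 0; h < hidden_size; ++h)
+            //           workSpace[dht_tensor_offset + b * RBuff.gemm_write_size() + h] +=
+            //               dhy[dhy_offset + b * hidden_size + h];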
auto hidden_tensor_desc = - miopen::TensorDescriptor(rnn_data_type, hidden_tensor_size, hidden_tensor_stride); - + auto dhy_tensor_desc = miopen::TensorDescriptor(rnn_data_type, dhy_size, dhy_stride); + auto dht_tensor_desc = + miopen::TensorDescriptor(rnn_data_type, dht_tensor_size, dht_tensor_stride); + + auto dhy_offset = get_HxBuff_offset(layer, batches.next(time, direction), direction); + auto dht_tensor_offset = RBuff.gemm_write_offset(layer, + bacc_per_time.at(time, direction) + + batches.next(time, direction), + direction); + // dHt(t,l) = dY(t,l) + dHy(t,l) for relative batches = batches(t+1):batches(t) + // OpTensor(handle, miopenTensorOpAdd, &alpha0, - hy_tensor_desc, + dhy_tensor_desc, dhy, &alpha1, - hidden_tensor_desc, + dht_tensor_desc, workSpace, &beta_t, - hidden_tensor_desc, + dht_tensor_desc, workSpace, - get_HxBuff_offset(layer, batches.next(time, direction), direction), - RBuff.gemm_write_offset(layer, - bacc_per_time.at(time, direction) + - batches.next(time, direction), - direction), - RBuff.gemm_write_offset(layer, - bacc_per_time.at(time, direction) + - batches.next(time, direction), - direction)); + dhy_offset, + dht_tensor_offset, + dht_tensor_offset); } - int used_batch = direction == RnnDirection::Forward ? batches.next(time, direction) - : batches.at(time, direction); + int dht_batch_size = direction == RnnDirection::Forward ? batches.next(time, direction) + : batches.at(time, direction); - if(used_batch <= 0) + if(dht_batch_size <= 0) return; miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, false, false, - used_batch, + dht_batch_size, hidden_size, hidden_size, RBuff.gemm_write_size(), @@ -4915,16 +4950,21 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( rnn_data_type, false}; - miopenStatus_t gemm_status = - CallGemm(handle, - gemm_desc, - workSpace, - RBuff.gemm_write_offset(layer, bacc_per_time.next(time, direction), direction), - w, - WeiBuf.hidden_weight_offset(layer, direction), - workSpace, - RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), - GemmBackend_t::rocblas); + // dHt(l,t) = dY(t,l) + dHt^(t+1)/Whh for relative batches = 0:batches(t+1) + // + int dht_next_deactivated_offset = + RBuff.gemm_write_offset(layer, bacc_per_time.next(time, direction), direction); + int dht_offset = + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction); + miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc, + workSpace, + dht_next_deactivated_offset, + w, + WeiBuf.hidden_weight_offset(layer, direction), + workSpace, + dht_offset, + GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) { @@ -4939,86 +4979,86 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } }; - auto propagate_hidden_time = - [&RBuff, - &handle, - hidden_size, - seqLen, - &batches, - &bacc_per_time, - rnn_data_type, - workSpace, - reserveSpace, - &activDesc, - &propagate_hidden_output, - &propagate_hidden_prev](int layer, int time, RnnDirection direction) { - if(time == seqLen - 1) - { - propagate_hidden_output(layer, time, direction); - } - else - { - propagate_hidden_prev(layer, time, direction); - } - - std::vector hidden_tensor_stride{ - RBuff.batches_per_layer * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; - std::vector hidden_tensor_size{1, batches.at(time, direction), hidden_size}; + auto back_propagate_dhy_time = [&RBuff, + &handle, + hidden_size, + seqLen, + &batches, + &bacc_per_time, + rnn_data_type, + workSpace, + reserveSpace, + &activDesc, + &back_propagate_dhy_output, + 
&back_propagate_dhy_prev]( + int layer, int time, RnnDirection direction) { + if(time == seqLen - 1) + { + back_propagate_dhy_output(layer, time, direction); + } + else + { + back_propagate_dhy_prev(layer, time, direction); + } - auto hidden_desc = - miopen::TensorDescriptor(rnn_data_type, hidden_tensor_size, hidden_tensor_stride); + std::vector dht_tensor_stride{ + RBuff.batches_per_layer * RBuff.gemm_write_size(), RBuff.gemm_write_size(), 1}; + std::vector dht_tensor_size{1, batches.at(time, direction), hidden_size}; - float alpha = 1, beta = 0; + auto dht_desc = miopen::TensorDescriptor(rnn_data_type, dht_tensor_size, dht_tensor_stride); - activDesc.Backward( - handle, - &alpha, - hidden_desc, - reserveSpace, - hidden_desc, - workSpace, - hidden_desc, - reserveSpace, - &beta, - hidden_desc, - workSpace, - RBuff.hidden_offset(layer, bacc_per_time.at(time, direction), direction), - RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), - RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), - RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction)); - }; + float alpha = 1, beta = 0; + // dHt^(l,t) = @^-1(dHt(l,t)) + // + activDesc.Backward( + handle, + &alpha, + dht_desc, + reserveSpace, + dht_desc, + workSpace, + dht_desc, + reserveSpace, + &beta, + dht_desc, + workSpace, + RBuff.hidden_offset(layer, bacc_per_time.at(time, direction), direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction), + RBuff.gemm_write_offset(layer, bacc_per_time.at(time, direction), direction)); + }; - auto propagate_hidden = [this, seqLen, &propagate_hidden_time](int layer) { + auto back_propagate_dhy = [this, seqLen, &back_propagate_dhy_time](int layer) { for(int time = seqLen - 1; time >= 0; time--) { - propagate_hidden_time(layer, time, RnnDirection::Forward); + back_propagate_dhy_time(layer, time, RnnDirection::Forward); if(dirMode == 0u) continue; - propagate_hidden_time(layer, time, RnnDirection::Backward); + back_propagate_dhy_time(layer, time, RnnDirection::Backward); } }; - auto propagate_dhx_prev = [&RBuff, - &WeiBuf, - rnn_data_type, - hidden_size, - &batches, - &bacc_per_time, - &handle, - w, - dhx, - &get_HxBuff_offset, - workSpace](int layer, int time, RnnDirection direction) { - int batch_size = time == 0 ? batches.at(time, direction) - : batches.at(time, direction) - batches.prev(time, direction); - - if(batch_size <= 0) + auto forward_propagate_dhx_prev = [&RBuff, + &WeiBuf, + rnn_data_type, + hidden_size, + &batches, + &bacc_per_time, + &handle, + w, + dhx, + &get_HxBuff_offset, + workSpace](int layer, int time, RnnDirection direction) { + int dbatches = time == 0 ? batches.at(time, direction) + : batches.at(time, direction) - batches.prev(time, direction); + + if(dbatches <= 0) return; miopen::GemmDescriptor gemm_desc = GemmDescriptor{false, false, false, - batch_size, + dbatches, hidden_size, hidden_size, RBuff.gemm_write_size(), @@ -5033,21 +5073,25 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( rnn_data_type, false}; - int output_batch = time == 0 ? 0 : batches.prev(time, direction); + int dhx_batch = time == 0 ? 0 : batches.prev(time, direction); - int input_batch = time == 0 ? bacc_per_time.at(time, direction) - : bacc_per_time.prev(time, direction) - batch_size; + int dht_batch = time == 0 ? 
bacc_per_time.at(time, direction) + : bacc_per_time.prev(time, direction) - dbatches; - miopenStatus_t gemm_status = - CallGemm(handle, - gemm_desc, - workSpace, - RBuff.gemm_write_offset(layer, input_batch, direction), - w, - WeiBuf.hidden_weight_offset(layer, direction), - dhx, - get_HxBuff_offset(layer, output_batch, direction), - GemmBackend_t::rocblas); + int dhx_offset = get_HxBuff_offset(layer, dhx_batch, direction); + int dht_prev_offset = RBuff.gemm_write_offset(layer, dht_batch, direction); + + // dhx(l,t) = dHt(l,t-1)/Whh for relative batches = batches(t+1):batches(t) + // + miopenStatus_t gemm_status = CallGemm(handle, + gemm_desc, + workSpace, + dht_prev_offset, + w, + WeiBuf.hidden_weight_offset(layer, direction), + dhx, + dhx_offset, + GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) { @@ -5062,39 +5106,39 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( } }; - auto propagate_dhx = [this, seqLen, &propagate_dhx_prev, dhx](int layer) { + auto forward_propagate_dhx = [this, seqLen, &forward_propagate_dhx_prev, dhx](int layer) { if(dhx == nullptr) return; for(int time = 0; time < seqLen; time++) { - propagate_dhx_prev(layer, time, RnnDirection::Forward); + forward_propagate_dhx_prev(layer, time, RnnDirection::Forward); if(dirMode == 0u) continue; - propagate_dhx_prev(layer, time, RnnDirection::Backward); + forward_propagate_dhx_prev(layer, time, RnnDirection::Backward); } }; for(int li = static_cast(nLayers) - 1; li >= 0; li--) { - propagate_output(nLayers, li); - propagate_hidden(li); - propagate_dhx(li); + back_propagate_dy(nLayers, li); + back_propagate_dhy(li); + forward_propagate_dhx(li); } int hy_stride = hidden_size * bi * static_cast(workspaceScale); if(inputMode == miopenRNNskip) { - std::vector o_size{1, total_batch_size, hidden_size}; - std::vector o_stride{total_batch_size * out_stride, out_stride, 1}; - std::vector r_size{1, total_batch_size, hidden_size}; - std::vector r_stride{total_batch_size * in_stride, in_stride, 1}; - - auto x_desc = miopen::TensorDescriptor(rnn_data_type, r_size, r_stride); - auto sp_desc = miopen::TensorDescriptor(rnn_data_type, o_size, o_stride); + auto workspace_desc = + miopen::TensorDescriptor(rnn_data_type, + {1, total_batch_size, hidden_size}, + {static_cast(total_batch_size) * out_stride, out_stride, 1}); + auto dx_desc = miopen::TensorDescriptor(rnn_data_type, + {1, total_batch_size, hidden_size}, + {static_cast(total_batch_size) * in_stride, in_stride, 1}); float alpha0 = 1; float alpha1 = 1; @@ -5105,13 +5149,13 @@ void RNNDescriptor::RNNBackwardDataPackedTensorsRelu( OpTensor(handle, miopenTensorOpAdd, &alpha0, - sp_desc, + workspace_desc, workSpace, &alpha1, - x_desc, + dx_desc, dx, &beta_t, - x_desc, + dx_desc, dx, static_cast(gi) * hidden_size, 0,