Merge pull request #99 from sharannarang/rnn_fix

Fix RNN benchmark to measure backprop wrt params.
baidu-research · May 25, 2018 · a5a2b60 · a5a2b60
2 parents 3540221 + 2e448e5
commit a5a2b60
Show file tree

Hide file tree

Showing 9 changed files with 104 additions and 50 deletions.
diff --git a/README.md b/README.md
@@ -480,10 +480,10 @@ The recurrent op kernels are only run on NVIDIA hardware.
 
 | Hidden Units   | Batch Size   | TimeSteps   | Recurrent Type   | Application           | Total Time (ms) | Fwd TeraFLOPS   | Processor       |
 | -------------- | ------------ | ----------- | ---------------- | --------------------- | ------------    | --------------- | --------------- |
-| 1760           | 16           | 50          | Vanilla          | Speech Recognition    | 6.75            | 1.46            | Tesla V100 FP32 |
-| 2560           | 32           | 50          | Vanilla          | Speech Recognition    | 11.48           | 3.43            | Tesla V100 Mixed Precision |
-| 1024           | 128          | 25          | LSTM             | Machine Translation   | 6.46            | 12.41           | Tesla V100 Mixed Precision |
-| 2816           | 32           | 1500        | GRU              | Speech Recognition    | 591.02          | 10.45           | Tesla V100 Mixed Precision |
+| 1760           | 16           | 50          | Vanilla          | Speech Recognition    | 8.21            | 1.19            | Tesla V100 Mixed Precision |
+| 2560           | 32           | 50          | Vanilla          | Speech Recognition    | 10.50           | 4.08            | Tesla V100 Mixed Precision |
+| 1024           | 128          | 25          | LSTM             | Machine Translation   | 5.56            | 10.91           | Tesla V100 Mixed Precision |
+| 2816           | 32           | 1500        | GRU              | Speech Recognition    | 380.04          | 11.85           | Tesla V100 Mixed Precision |
 
 ### All-Reduce Results
 

diff --git a/code/nvidia/rnn_bench.cu b/code/nvidia/rnn_bench.cu
@@ -116,6 +116,8 @@ class cudnnRNN {
     size_t train_size_;
 
     Tensor<T> weights_;
+    Tensor<T> dW_;
+
     Tensor<float> workspace_;
     Tensor<float> trainspace_;
 
@@ -169,7 +171,6 @@ class cudnnRNN {
 
             weights_ = rand<T>(std::vector<int>{static_cast<int>(weight_size_ / sizeof(T)), 1}, curand_gen);
 
-
             std::vector<int> dim = {weights_.size(), 1, 1};
             wDesc_ = FilterDescriptorNd<T>(CUDNN_TENSOR_NCHW, dim);
 
@@ -179,6 +180,8 @@ class cudnnRNN {
                                                         xDescArray_.ptr(),
                                                         &workspace_size_) );
 
+            dW_ = zeros<T>(std::vector<int>{static_cast<int>(weight_size_ / sizeof(T)), 1});
+
             workspace_ = zeros<float>(std::vector<int>{static_cast<int>(workspace_size_ / sizeof(float)), 1});
 
             CHECK_CUDNN_ERROR( cudnnGetRNNTrainingReserveSize(cudnn_handle,
@@ -243,14 +246,33 @@ class cudnnRNN {
                                                     (void *)trainspace_.begin(),
                                                     train_size_) );
         }
+
+        void backward_params(Tensor<T> x, Tensor<T> hx, Tensor<T> y) {
+            CHECK_CUDNN_ERROR(cudnnRNNBackwardWeights(cudnn_handle,
+                                                      rnn_desc_.desc(),
+                                                      time_steps_,
+                                                      xDescArray_.ptr(),
+                                                      (void *)x.begin(),
+                                                      hx_desc_.desc(),
+                                                      (void *)hx.begin(),
+                                                      yDescArray_.ptr(),
+                                                      (void *)y.begin(),
+                                                      (void *)workspace_.begin(),
+                                                      workspace_size_,
+                                                      wDesc_.desc(),
+                                                      (void *)dW_.begin(),
+                                                      (void *)trainspace_.begin(),
+                                                      train_size_) );
+        }
+
 };
 
 template <typename T>
-std::tuple<int, int> time_rnn(int hidden_size,
-                              int batch_size,
-                              int time_steps,
-                              const std::string& type,
-                              int inference) {
+std::tuple<int, int, int> time_rnn(int hidden_size,
+                                   int batch_size,
+                                   int time_steps,
+                                   const std::string& type,
+                                   int inference) {
 
     cudnnRNN<T> rnn(hidden_size, batch_size, time_steps, type);
 
@@ -285,7 +307,8 @@ std::tuple<int, int> time_rnn(int hidden_size,
     auto end = std::chrono::steady_clock::now();
 
     auto forward_time = std::chrono::duration<double, std::micro>(end - start).count() / numRepeats;
-    int backward_time = 0;
+    int bwd_data_time = 0;
+    int bwd_params_time = 0;
 
     if (!inference) {
         //Warm up
@@ -303,12 +326,31 @@ std::tuple<int, int> time_rnn(int hidden_size,
         cudaDeviceSynchronize();
 
         end = std::chrono::steady_clock::now();
-        backward_time = std::chrono::duration<double, std::micro>(end - start).count() / numRepeats;
+        bwd_data_time = std::chrono::duration<double, std::micro>(end - start).count() / numRepeats;
+
+        /* Backward wrt params */
+        //Warm up
+        rnn.backward_params(x, hx, y);
+
+        cudaDeviceSynchronize();
+
+        start = std::chrono::steady_clock::now();
+
+        for (int i = 0; i < numRepeats; ++i) {
+            rnn.backward_params(x, hx, y);
+        }
+
+        cudaDeviceSynchronize();
+
+        end = std::chrono::steady_clock::now();
+        bwd_params_time = std::chrono::duration<double, std::micro>(end - start).count() / numRepeats;
+
 
     }
 
     return std::make_tuple(static_cast<int>(forward_time),
-                           static_cast<int>(backward_time));
+                           static_cast<int>(bwd_data_time),
+                           static_cast<int>(bwd_params_time));
 
 }
 
@@ -347,11 +389,14 @@ int main(int argc, char **argv) {
     }
 
     std::cout << std::setw(30) << "Times" << std::endl;
-    std::cout << std::setfill('-') << std::setw(88) << "-" << std::endl;
+    std::cout << std::setfill('-') << std::setw(115) << "-" << std::endl;
     std::cout << std::setfill(' ');
-    std::cout << "    type    hidden   N     timesteps   precision     fwd_time (usec)   ";
-    if (!inference)
-        std::cout << "bwd_time (usec)";
+    std::cout << "    type    hidden   N     timesteps   precision   fwd_time (usec)   ";
+    if (!inference) {
+        std::cout << "bwd_inputs_time (usec)";
+        std::cout << "  bwd_params_time (usec)";
+    }
+
     std::cout << std::endl;
     for (const auto &problem : (inference ? inference_server_set : training_set)) {
         int hidden_state, batch_size, time_steps;
@@ -363,66 +408,75 @@ int main(int argc, char **argv) {
         std::cout << std::setw(8) << batch_size;
         std::cout << std::setw(8) << time_steps;
         std::cout << std::setw(14) << precision;
-        int fwd_time, bwd_time;
+        int fwd_time, bwd_data_time, bwd_params_time;
 
         std::stringstream ss;
         ss << "Unsupported precision requested. Precision: " << precision << " Inference: " << inference;
 
 #if CUDNN_MAJOR >= 6
         if (inference) {
             if (precision == "float") {
-                std::tie(fwd_time, bwd_time) = time_rnn<float>(hidden_state,
-                                                               batch_size,
-                                                               time_steps,
-                                                               type,
-                                                               inference);
+                std::tie(fwd_time, bwd_data_time, bwd_params_time) =
+                    time_rnn<float>(hidden_state,
+                                    batch_size,
+                                    time_steps,
+                                    type,
+                                    inference);
 
             } else if (precision == "half") {
-                std::tie(fwd_time, bwd_time) = time_rnn<uint16_t>(hidden_state,
-                                                                  batch_size,
-                                                                  time_steps,
-                                                                  type,
-                                                                  inference);
+                std::tie(fwd_time, bwd_data_time, bwd_params_time) =
+                    time_rnn<uint16_t>(hidden_state,
+                                       batch_size,
+                                       time_steps,
+                                       type,
+                                       inference);
             } else if (precision == "int8") {
-                std::tie(fwd_time, bwd_time) = time_rnn<uint8_t>(hidden_state,
-                                                                 batch_size,
-                                                                 time_steps,
-                                                                 type,
-                                                                 inference);
+                std::tie(fwd_time, bwd_data_time, bwd_params_time) =
+                    time_rnn<uint8_t>(hidden_state,
+                                      batch_size,
+                                      time_steps,
+                                      type,
+                                      inference);
             } else {
                 throw std::runtime_error(ss.str());
             }
         } else {
             if (precision == "float") {
-                std::tie(fwd_time, bwd_time) = time_rnn<float>(hidden_state,
-                                                               batch_size,
-                                                               time_steps,
-                                                               type,
-                                                               inference);
+                std::tie(fwd_time, bwd_data_time, bwd_params_time) =
+                     time_rnn<float>(hidden_state,
+                                     batch_size,
+                                     time_steps,
+                                     type,
+                                     inference);
 
             } else if (precision == "half") {
-                std::tie(fwd_time, bwd_time) = time_rnn<uint16_t>(hidden_state,
-                                                                  batch_size,
-                                                                  time_steps,
-                                                                  type,
-                                                                  inference);
+                std::tie(fwd_time, bwd_data_time, bwd_params_time) =
+                     time_rnn<uint16_t>(hidden_state,
+                                        batch_size,
+                                        time_steps,
+                                        type,
+                                        inference);
             } else {
                 throw std::runtime_error(ss.str());
             }
         }
 #else
         if (precision != "float")
             throw std::runtime_error(ss.str());
-        std::tie(fwd_time, bwd_time) = time_rnn<float>(hidden_state,
-                                                       batch_size,
-                                                       time_steps,
-                                                       type,
-                                                       inference);
+
+        std::tie(fwd_time, bwd_data_time, bwd_params_time) =
+             time_rnn<float>(hidden_state,
+                             batch_size,
+                             time_steps,
+                             type,
+                             inference);
 #endif
 
         std::cout << std::setw(18) << fwd_time;
-        if (!inference)
-            std::cout << std::setw(18) << bwd_time;
+        if (!inference) {
+            std::cout << std::setw(20) << bwd_data_time;
+            std::cout << std::setw(20) << bwd_params_time;
+        }
         std::cout << std::endl;
     }
 

diff --git a/results/train/DeepBench_NV_1080Ti.xlsx b/results/train/DeepBench_NV_1080Ti.xlsx
diff --git a/results/train/DeepBench_NV_M40.xlsx b/results/train/DeepBench_NV_M40.xlsx
diff --git a/results/train/DeepBench_NV_P100.xlsx b/results/train/DeepBench_NV_P100.xlsx
diff --git a/results/train/DeepBench_NV_TitanX.xlsx b/results/train/DeepBench_NV_TitanX.xlsx
diff --git a/results/train/DeepBench_NV_TitanX_Pascal.xlsx b/results/train/DeepBench_NV_TitanX_Pascal.xlsx
diff --git a/results/train/DeepBench_NV_TitanXp.xlsx b/results/train/DeepBench_NV_TitanXp.xlsx
diff --git a/results/train/DeepBench_NV_V100.xlsx b/results/train/DeepBench_NV_V100.xlsx