Skip to content

Commit

Permalink
Merge pull request #99 from sharannarang/rnn_fix
Browse files Browse the repository at this point in the history
Fix RNN benchmark to measure backprop wrt params.
  • Loading branch information
sharannarang authored May 25, 2018
2 parents 3540221 + 2e448e5 commit a5a2b60
Show file tree
Hide file tree
Showing 9 changed files with 104 additions and 50 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -480,10 +480,10 @@ The recurrent op kernels are only run on NVIDIA hardware.

| Hidden Units | Batch Size | TimeSteps | Recurrent Type | Application | Total Time (ms) | Fwd TeraFLOPS | Processor |
| -------------- | ------------ | ----------- | ---------------- | --------------------- | ------------ | --------------- | --------------- |
| 1760 | 16 | 50 | Vanilla | Speech Recognition | 6.75 | 1.46 | Tesla V100 FP32 |
| 2560 | 32 | 50 | Vanilla | Speech Recognition | 11.48 | 3.43 | Tesla V100 Mixed Precision |
| 1024 | 128 | 25 | LSTM | Machine Translation | 6.46 | 12.41 | Tesla V100 Mixed Precision |
| 2816 | 32 | 1500 | GRU | Speech Recognition | 591.02 | 10.45 | Tesla V100 Mixed Precision |
| 1760 | 16 | 50 | Vanilla | Speech Recognition | 8.21 | 1.19 | Tesla V100 Mixed Precision |
| 2560 | 32 | 50 | Vanilla | Speech Recognition | 10.50 | 4.08 | Tesla V100 Mixed Precision |
| 1024 | 128 | 25 | LSTM | Machine Translation | 5.56 | 10.91 | Tesla V100 Mixed Precision |
| 2816 | 32 | 1500 | GRU | Speech Recognition | 380.04 | 11.85 | Tesla V100 Mixed Precision |

### All-Reduce Results

Expand Down
146 changes: 100 additions & 46 deletions code/nvidia/rnn_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ class cudnnRNN {
size_t train_size_;

Tensor<T> weights_;
Tensor<T> dW_;

Tensor<float> workspace_;
Tensor<float> trainspace_;

Expand Down Expand Up @@ -169,7 +171,6 @@ class cudnnRNN {

weights_ = rand<T>(std::vector<int>{static_cast<int>(weight_size_ / sizeof(T)), 1}, curand_gen);


std::vector<int> dim = {weights_.size(), 1, 1};
wDesc_ = FilterDescriptorNd<T>(CUDNN_TENSOR_NCHW, dim);

Expand All @@ -179,6 +180,8 @@ class cudnnRNN {
xDescArray_.ptr(),
&workspace_size_) );

dW_ = zeros<T>(std::vector<int>{static_cast<int>(weight_size_ / sizeof(T)), 1});

workspace_ = zeros<float>(std::vector<int>{static_cast<int>(workspace_size_ / sizeof(float)), 1});

CHECK_CUDNN_ERROR( cudnnGetRNNTrainingReserveSize(cudnn_handle,
Expand Down Expand Up @@ -243,14 +246,33 @@ class cudnnRNN {
(void *)trainspace_.begin(),
train_size_) );
}

// Runs the backward pass with respect to the RNN parameters by invoking
// cudnnRNNBackwardWeights once for the full sequence of time_steps_ steps.
//
// x  - input sequence tensor (described by xDescArray_)
// hx - initial hidden state tensor (described by hx_desc_)
// y  - output sequence tensor (described by yDescArray_)
//
// The gradient buffer handed to cuDNN is the member dW_ (described by wDesc_).
// NOTE(review): per the cuDNN API the dw argument is accumulated into rather
// than overwritten, and the reserve space (trainspace_) is expected to come
// from a preceding training forward pass -- confirm against the cuDNN
// version in use.
void backward_params(Tensor<T> x, Tensor<T> hx, Tensor<T> y) {
    // cuDNN takes untyped buffer pointers, so erase the element type up front.
    void *x_data       = (void *)x.begin();
    void *hx_data      = (void *)hx.begin();
    void *y_data       = (void *)y.begin();
    void *work_data    = (void *)workspace_.begin();
    void *dw_data      = (void *)dW_.begin();
    void *reserve_data = (void *)trainspace_.begin();

    CHECK_CUDNN_ERROR(
        cudnnRNNBackwardWeights(cudnn_handle,
                                rnn_desc_.desc(),
                                time_steps_,
                                xDescArray_.ptr(), x_data,
                                hx_desc_.desc(),   hx_data,
                                yDescArray_.ptr(), y_data,
                                work_data,         workspace_size_,
                                wDesc_.desc(),     dw_data,
                                reserve_data,      train_size_) );
}

};

template <typename T>
std::tuple<int, int> time_rnn(int hidden_size,
int batch_size,
int time_steps,
const std::string& type,
int inference) {
std::tuple<int, int, int> time_rnn(int hidden_size,
int batch_size,
int time_steps,
const std::string& type,
int inference) {

cudnnRNN<T> rnn(hidden_size, batch_size, time_steps, type);

Expand Down Expand Up @@ -285,7 +307,8 @@ std::tuple<int, int> time_rnn(int hidden_size,
auto end = std::chrono::steady_clock::now();

auto forward_time = std::chrono::duration<double, std::micro>(end - start).count() / numRepeats;
int backward_time = 0;
int bwd_data_time = 0;
int bwd_params_time = 0;

if (!inference) {
//Warm up
Expand All @@ -303,12 +326,31 @@ std::tuple<int, int> time_rnn(int hidden_size,
cudaDeviceSynchronize();

end = std::chrono::steady_clock::now();
backward_time = std::chrono::duration<double, std::micro>(end - start).count() / numRepeats;
bwd_data_time = std::chrono::duration<double, std::micro>(end - start).count() / numRepeats;

/* Backward wrt params */
//Warm up
rnn.backward_params(x, hx, y);

cudaDeviceSynchronize();

start = std::chrono::steady_clock::now();

for (int i = 0; i < numRepeats; ++i) {
rnn.backward_params(x, hx, y);
}

cudaDeviceSynchronize();

end = std::chrono::steady_clock::now();
bwd_params_time = std::chrono::duration<double, std::micro>(end - start).count() / numRepeats;


}

return std::make_tuple(static_cast<int>(forward_time),
static_cast<int>(backward_time));
static_cast<int>(bwd_data_time),
static_cast<int>(bwd_params_time));

}

Expand Down Expand Up @@ -347,11 +389,14 @@ int main(int argc, char **argv) {
}

std::cout << std::setw(30) << "Times" << std::endl;
std::cout << std::setfill('-') << std::setw(88) << "-" << std::endl;
std::cout << std::setfill('-') << std::setw(115) << "-" << std::endl;
std::cout << std::setfill(' ');
std::cout << " type hidden N timesteps precision fwd_time (usec) ";
if (!inference)
std::cout << "bwd_time (usec)";
std::cout << " type hidden N timesteps precision fwd_time (usec) ";
if (!inference) {
std::cout << "bwd_inputs_time (usec)";
std::cout << " bwd_params_time (usec)";
}

std::cout << std::endl;
for (const auto &problem : (inference ? inference_server_set : training_set)) {
int hidden_state, batch_size, time_steps;
Expand All @@ -363,66 +408,75 @@ int main(int argc, char **argv) {
std::cout << std::setw(8) << batch_size;
std::cout << std::setw(8) << time_steps;
std::cout << std::setw(14) << precision;
int fwd_time, bwd_time;
int fwd_time, bwd_data_time, bwd_params_time;

std::stringstream ss;
ss << "Unsupported precision requested. Precision: " << precision << " Inference: " << inference;

#if CUDNN_MAJOR >= 6
if (inference) {
if (precision == "float") {
std::tie(fwd_time, bwd_time) = time_rnn<float>(hidden_state,
batch_size,
time_steps,
type,
inference);
std::tie(fwd_time, bwd_data_time, bwd_params_time) =
time_rnn<float>(hidden_state,
batch_size,
time_steps,
type,
inference);

} else if (precision == "half") {
std::tie(fwd_time, bwd_time) = time_rnn<uint16_t>(hidden_state,
batch_size,
time_steps,
type,
inference);
std::tie(fwd_time, bwd_data_time, bwd_params_time) =
time_rnn<uint16_t>(hidden_state,
batch_size,
time_steps,
type,
inference);
} else if (precision == "int8") {
std::tie(fwd_time, bwd_time) = time_rnn<uint8_t>(hidden_state,
batch_size,
time_steps,
type,
inference);
std::tie(fwd_time, bwd_data_time, bwd_params_time) =
time_rnn<uint8_t>(hidden_state,
batch_size,
time_steps,
type,
inference);
} else {
throw std::runtime_error(ss.str());
}
} else {
if (precision == "float") {
std::tie(fwd_time, bwd_time) = time_rnn<float>(hidden_state,
batch_size,
time_steps,
type,
inference);
std::tie(fwd_time, bwd_data_time, bwd_params_time) =
time_rnn<float>(hidden_state,
batch_size,
time_steps,
type,
inference);

} else if (precision == "half") {
std::tie(fwd_time, bwd_time) = time_rnn<uint16_t>(hidden_state,
batch_size,
time_steps,
type,
inference);
std::tie(fwd_time, bwd_data_time, bwd_params_time) =
time_rnn<uint16_t>(hidden_state,
batch_size,
time_steps,
type,
inference);
} else {
throw std::runtime_error(ss.str());
}
}
#else
if (precision != "float")
throw std::runtime_error(ss.str());
std::tie(fwd_time, bwd_time) = time_rnn<float>(hidden_state,
batch_size,
time_steps,
type,
inference);

std::tie(fwd_time, bwd_data_time, bwd_params_time) =
time_rnn<float>(hidden_state,
batch_size,
time_steps,
type,
inference);
#endif

std::cout << std::setw(18) << fwd_time;
if (!inference)
std::cout << std::setw(18) << bwd_time;
if (!inference) {
std::cout << std::setw(20) << bwd_data_time;
std::cout << std::setw(20) << bwd_params_time;
}
std::cout << std::endl;
}

Expand Down
Binary file modified results/train/DeepBench_NV_1080Ti.xlsx
Binary file not shown.
Binary file modified results/train/DeepBench_NV_M40.xlsx
Binary file not shown.
Binary file modified results/train/DeepBench_NV_P100.xlsx
Binary file not shown.
Binary file modified results/train/DeepBench_NV_TitanX.xlsx
Binary file not shown.
Binary file modified results/train/DeepBench_NV_TitanX_Pascal.xlsx
Binary file not shown.
Binary file modified results/train/DeepBench_NV_TitanXp.xlsx
Binary file not shown.
Binary file modified results/train/DeepBench_NV_V100.xlsx
Binary file not shown.

0 comments on commit a5a2b60

Please sign in to comment.