From e7f554764e0cc9cd77084049c0a96ef91040924e Mon Sep 17 00:00:00 2001
From: Li Zhang
Date: Mon, 13 Nov 2023 06:44:17 +0000
Subject: [PATCH] check for Inf & NaNs

---
 src/turbomind/models/llama/LlamaBatch.cc           |  2 +-
 .../models/llama/LlamaContextAttentionLayer.cc     |  6 ++++++
 src/turbomind/models/llama/LlamaContextDecoder.cc  | 12 +++++++-----
 src/turbomind/models/llama/LlamaDecoder.cc         | 12 +++++++-----
 .../models/llama/LlamaDecoderSelfAttentionLayer.cc |  6 ++++++
 src/turbomind/models/llama/LlamaFfnLayer.cc        |  5 +++++
 src/turbomind/models/llama/LlamaV2.cc              |  2 +-
 7 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc
index 7bcce83a6d..e02c3eaab3 100644
--- a/src/turbomind/models/llama/LlamaBatch.cc
+++ b/src/turbomind/models/llama/LlamaBatch.cc
@@ -507,7 +507,7 @@ bool LlamaBatch<T>::generate()
     // T x = 999999.f;
     // cudaMemcpyAsync(decoder_input_buf_, &x, sizeof(x), cudaMemcpyDefault, stream_);
 
-    // CheckValues(decoder_input_buf_, batch_size_ * llama_->hidden_units_, "embedding_lookup", stream_);
+    CheckValues(decoder_input_buf_, batch_size_ * llama_->hidden_units_, "embedding_lookup", stream_);
 
     // if (compare_mode == kCmpWrite) {
     //     if (rank_ == 0) {
diff --git a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
index 881582acea..02e8a77869 100644
--- a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
@@ -157,6 +157,8 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
     // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim]
     linear_.forward(qkv_buf_, attention_input, num_token, weights->qkv);
 
+    CheckValues(qkv_buf_, num_token * weights->qkv.output_dims, "prefill_qkv", stream_);
+
     //////////////////////////////////////////////
     /// transpose qkv & apply rotary embedding & rebuild padding
     /// qkv [B, s, H + 2kvH, D] -> (q [B, H, s, D], k [B, kvH, s, D], v [B, kvH, s, D])
@@ -237,10 +239,14 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
                                        weights->past_kv_scale.data());
     }
 
+    CheckValues(qkv_buf_3_, num_token * weights->output.input_dims, "prefill_context", stream_);
+
     //////////////////////////////////////////////
     /// output gemm ->
     linear_.forward(attention_out, qkv_buf_3_, num_token, weights->output);
 
+    CheckValues(attention_out, num_token * weights->output.output_dims, "prefill_o", stream_);
+
     if (tensor_para_.world_size_ > 1) {
         NcclGuard nccl_guard(tensor_para_, stream_);
         ftNcclAllReduceSum(attention_out, attention_out, num_token * hidden_units_, tensor_para_, stream_);
diff --git a/src/turbomind/models/llama/LlamaContextDecoder.cc b/src/turbomind/models/llama/LlamaContextDecoder.cc
index 7edf087a8b..9c5fcb7e9c 100644
--- a/src/turbomind/models/llama/LlamaContextDecoder.cc
+++ b/src/turbomind/models/llama/LlamaContextDecoder.cc
@@ -234,6 +234,8 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                              stream_);
     sync_check_cuda_error();
 
+    CheckValues(decoder_input_output, sess.token_num * hidden_units_, Concat("prefill_input", 0), stream_);
+
     /////////////////////////////////////////////
     /// RMSNorm
     invokeRootMeanSquareNorm(decoder_output,
@@ -245,14 +247,14 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                              stream_);
     sync_check_cuda_error();
 
-    // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm", 0), stream_);
+    CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm", 0), stream_);
 
     for (size_t layer = 0; layer < num_layer_; ++layer) {
         /////////////////////////////////////////////
         /// self-attention
         forwardSelfAttn(sess, decoder_output, input_tensors, layer, false);
 
-        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_self_attn", layer), stream_);
+        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_self_attn", layer), stream_);
 
         invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
                                           decoder_output,
@@ -264,7 +266,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                                           stream_);
         sync_check_cuda_error();
 
-        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm1", layer), stream_);
+        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm1", layer), stream_);
 
         ////////////////////////////////////////////
         /// feed-forward network
@@ -273,7 +275,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
             {"ffn_output", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, decoder_output}}};
         silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &decoder_layer_weights->at(layer)->ffn_weights);
 
-        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_ffn", layer), stream_);
+        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_ffn", layer), stream_);
 
         auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
                                                      input_tensors->at("output_norm_weight").getPtr<T>();
@@ -287,7 +289,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                                           stream_);
         sync_check_cuda_error();
 
-        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm2", layer), stream_);
+        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm2", layer), stream_);
     }
 
     if (is_free_buffer_after_forward_) {
diff --git a/src/turbomind/models/llama/LlamaDecoder.cc b/src/turbomind/models/llama/LlamaDecoder.cc
index 7b934f3cb2..791f942c8a 100644
--- a/src/turbomind/models/llama/LlamaDecoder.cc
+++ b/src/turbomind/models/llama/LlamaDecoder.cc
@@ -197,6 +197,8 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
 
     // int step = input_tensors->at("step").getVal<int>();
 
+    CheckValues(decoder_input, sess.batch_size * hidden_units_, Concat("decoder_input", 0), stream_);
+
     ////////////////////////////////////////////
     /// RMSNorm
     invokeRootMeanSquareNorm(decoder_output,
@@ -208,7 +210,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                              stream_);
     sync_check_cuda_error();
 
-    // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm", 0), stream_);
+    CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm", 0), stream_);
 
     // CheckBatchConsistency(decoder_output,
     //                       hidden_units_,
@@ -228,7 +230,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
         //                       tensor_para_.rank_,
         //                       stream_);
 
-        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_self_attn", layer), stream_);
+        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_self_attn", layer), stream_);
 
         invokeFusedAddBiasResidualRMSNorm(decoder_input,
                                           decoder_output,
@@ -240,12 +242,12 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                                           stream_);
         sync_check_cuda_error();
 
-        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm1", layer), stream_);
+        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm1", layer), stream_);
 
         // decoder_layer_output_ = ffn(decoder_normed_input_)
        forwardFfn(sess, decoder_output, layer);
 
-        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_ffn", layer), stream_);
+        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_ffn", layer), stream_);
 
         auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
                                                      input_tensors->at("output_norm_weight").getPtr<T>();
@@ -259,7 +261,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                                           stream_);
         sync_check_cuda_error();
 
-        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm2", layer), stream_);
+        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm2", layer), stream_);
     }
 
     if (is_free_buffer_after_forward_) {
diff --git a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
index 4eb7937ba6..894069b774 100644
--- a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
@@ -248,6 +248,8 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
     linear_.forward(qkv_buf_, input_query_data, batch_size, weights->qkv);
     POP_RANGE;
 
+    CheckValues(qkv_buf_, batch_size * weights->qkv.output_dims, "decode_qkv", stream_);
+
     // CheckBatchConsistency(qkv_buf_,
     //                       (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_,
     //                       batch_size,
@@ -302,6 +304,8 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
                              stream_);
     sync_check_cuda_error();
 
+    CheckValues(context_buf_, batch_size * weights->output.input_dims, "decode_context", stream_);
+
     // CheckBatchConsistency((T*)context_buf_,
     //                       local_hidden_units_,
     //                       batch_size,
@@ -311,6 +315,8 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
 
     linear_.forward(hidden_features_data, context_buf_, batch_size, weights->output);
 
+    CheckValues(hidden_features_data, batch_size * weights->output.output_dims, "decode_o", stream_);
+
     // CheckBatchConsistency(hidden_features_data,
     //                       hidden_units_,
     //                       batch_size,
diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc
index f605d8f27b..678d436b0b 100644
--- a/src/turbomind/models/llama/LlamaFfnLayer.cc
+++ b/src/turbomind/models/llama/LlamaFfnLayer.cc
@@ -20,6 +20,7 @@
 #include "src/turbomind/models/llama/LlamaFfnLayer.h"
 #include "src/turbomind/kernels/activation_kernels.h"
 #include "src/turbomind/models/llama/LlamaNcclGuard.h"
+#include "src/turbomind/models/llama/llama_utils.h"
 #include "src/turbomind/utils/nvtx_utils.h"
 // #include
 
@@ -99,10 +100,14 @@ void LlamaFfnLayer<T>::forward(TensorMap* output_tensors,
         activation(num_token);
     }
 
+    CheckValues(gating_buf_, num_token * weights->output.input_dims, "ffn13", stream_);
+
     // w2(x)
     linear_.forward(ffn_output_data, gating_buf_, num_token, weights->output);
     POP_RANGE;
 
+    CheckValues(ffn_output_data, num_token * weights->output.output_dims, "ffn2", stream_);
+
     if (tensor_para_.world_size_ > 1) {
         NcclGuard nccl_guard(tensor_para_, stream_);
         ftNcclAllReduceSum(ffn_output_data, ffn_output_data, num_token * hidden_units_, tensor_para_, stream_);
diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc
index 97cf0e0a3d..4f798f9db0 100644
--- a/src/turbomind/models/llama/LlamaV2.cc
+++ b/src/turbomind/models/llama/LlamaV2.cc
@@ -441,7 +441,7 @@ void LlamaV2<T>::internalThreadEntry(int device_id)
     model_instance_barrier() = shared_state_->barrier.get();
 
     // initialize global counters
-    // CheckValues((T*)0, 0, {}, 0);
+    CheckValues((T*)0, 0, {}, 0);
     shared_state_->barrier->wait();
 
     auto& request_queue = shared_state_->request_queue;
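
Note: every probe above goes through CheckValues(ptr, count, tag, stream), presumably declared in llama_utils.h (hence the include added to LlamaFfnLayer.cc). For readers without the TurboMind sources at hand, below is a minimal sketch of what such a NaN/Inf check can look like. The count_invalid kernel, the temporary-counter handling, and the early return for the empty warm-up call are illustrative assumptions, not the actual llama_utils.h implementation. Because it synchronizes the stream after every probe, a helper like this is strictly a debugging aid.

// Hypothetical sketch of a CheckValues-style NaN/Inf probe; may differ from
// the real TurboMind implementation in llama_utils.h.
#include <cstdio>
#include <string>

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Count the NaN/Inf elements of a device buffer into `counter`.
template<typename T>
__global__ void count_invalid(const T* data, size_t n, unsigned int* counter)
{
    const size_t i = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
    if (i < n) {
        const float v = static_cast<float>(data[i]);
        if (isnan(v) || isinf(v)) {
            atomicAdd(counter, 1u);
        }
    }
}

template<typename T>
void CheckValues(const T* data, size_t n, const std::string& tag, cudaStream_t stream)
{
    if (data == nullptr || n == 0) {
        return;  // e.g. the warm-up call CheckValues((T*)0, 0, {}, 0) in LlamaV2.cc
    }

    unsigned int* d_counter{};
    cudaMalloc(&d_counter, sizeof(unsigned int));
    cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream);

    const int block = 256;
    const int grid  = static_cast<int>((n + block - 1) / block);
    count_invalid<<<grid, block, 0, stream>>>(data, n, d_counter);

    unsigned int h_counter = 0;
    cudaMemcpyAsync(&h_counter, d_counter, sizeof(h_counter), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);  // blocks the stream; acceptable only while debugging
    cudaFree(d_counter);

    if (h_counter != 0) {
        std::fprintf(stderr, "[%s] %u NaN/Inf values out of %zu elements\n", tag.c_str(), h_counter, n);
    }
}

With a helper shaped like this, the tags in the patch ("embedding_lookup", "prefill_qkv", "decode_o", ...) identify the first operation in the forward pass whose output turns non-finite, which is the point of inserting a probe after every GEMM, attention, norm, and FFN step.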