From e7f554764e0cc9cd77084049c0a96ef91040924e Mon Sep 17 00:00:00 2001
From: Li Zhang
Date: Mon, 13 Nov 2023 06:44:17 +0000
Subject: [PATCH] check for Inf & NaNs

---
 src/turbomind/models/llama/LlamaBatch.cc           |  2 +-
 .../models/llama/LlamaContextAttentionLayer.cc     |  6 ++++++
 src/turbomind/models/llama/LlamaContextDecoder.cc  | 12 +++++++-----
 src/turbomind/models/llama/LlamaDecoder.cc         | 12 +++++++-----
 .../models/llama/LlamaDecoderSelfAttentionLayer.cc |  6 ++++++
 src/turbomind/models/llama/LlamaFfnLayer.cc        |  5 +++++
 src/turbomind/models/llama/LlamaV2.cc              |  2 +-
 7 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc
index 7bcce83a6d..e02c3eaab3 100644
--- a/src/turbomind/models/llama/LlamaBatch.cc
+++ b/src/turbomind/models/llama/LlamaBatch.cc
@@ -507,7 +507,7 @@ bool LlamaBatch<T>::generate()
     // T x = 999999.f;
     // cudaMemcpyAsync(decoder_input_buf_, &x, sizeof(x), cudaMemcpyDefault, stream_);
 
-    // CheckValues(decoder_input_buf_, batch_size_ * llama_->hidden_units_, "embedding_lookup", stream_);
+    CheckValues(decoder_input_buf_, batch_size_ * llama_->hidden_units_, "embedding_lookup", stream_);
 
     // if (compare_mode == kCmpWrite) {
     //     if (rank_ == 0) {
diff --git a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
index 881582acea..02e8a77869 100644
--- a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
@@ -157,6 +157,8 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
     // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim]
     linear_.forward(qkv_buf_, attention_input, num_token, weights->qkv);
 
+    CheckValues(qkv_buf_, num_token * weights->qkv.output_dims, "prefill_qkv", stream_);
+
     //////////////////////////////////////////////
     /// transpose qkv & apply rotary embedding & rebuild padding
     /// qkv [B, s, H + 2kvH, D] -> (q [B, H, s, D], k [B, kvH, s, D], v [B, kvH, s, D])
@@ -237,10 +239,14 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
                                        weights->past_kv_scale.data());
     }
 
+    CheckValues(qkv_buf_3_, num_token * weights->output.input_dims, "prefill_context", stream_);
+
     //////////////////////////////////////////////
     /// output gemm ->
     linear_.forward(attention_out, qkv_buf_3_, num_token, weights->output);
 
+    CheckValues(attention_out, num_token * weights->output.output_dims, "prefill_o", stream_);
+
     if (tensor_para_.world_size_ > 1) {
         NcclGuard nccl_guard(tensor_para_, stream_);
         ftNcclAllReduceSum(attention_out, attention_out, num_token * hidden_units_, tensor_para_, stream_);
diff --git a/src/turbomind/models/llama/LlamaContextDecoder.cc b/src/turbomind/models/llama/LlamaContextDecoder.cc
index 7edf087a8b..9c5fcb7e9c 100644
--- a/src/turbomind/models/llama/LlamaContextDecoder.cc
+++ b/src/turbomind/models/llama/LlamaContextDecoder.cc
@@ -234,6 +234,8 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                              stream_);
     sync_check_cuda_error();
 
+    CheckValues(decoder_input_output, sess.token_num * hidden_units_, Concat("prefill_input", 0), stream_);
+
     /////////////////////////////////////////////
     /// RMSNorm
     invokeRootMeanSquareNorm(decoder_output,
@@ -245,14 +247,14 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                              stream_);
     sync_check_cuda_error();
 
-    // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm", 0), stream_);
+    CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm", 0), stream_);
 
     for (size_t layer = 0; layer < num_layer_; ++layer) {
         /////////////////////////////////////////////
         /// self-attention
         forwardSelfAttn(sess, decoder_output, input_tensors, layer, false);
 
-        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_self_attn", layer), stream_);
+        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_self_attn", layer), stream_);
 
         invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
                                           decoder_output,
@@ -264,7 +266,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                                           stream_);
         sync_check_cuda_error();
 
-        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm1", layer), stream_);
+        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm1", layer), stream_);
 
         ////////////////////////////////////////////
         /// feed-forward network
@@ -273,7 +275,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
             {"ffn_output", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, decoder_output}}};
         silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &decoder_layer_weights->at(layer)->ffn_weights);
 
-        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_ffn", layer), stream_);
+        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_ffn", layer), stream_);
 
         auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
                                                      input_tensors->at("output_norm_weight").getPtr<T>();
@@ -287,7 +289,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
                                           stream_);
         sync_check_cuda_error();
 
-        // CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm2", layer), stream_);
+        CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm2", layer), stream_);
     }
 
     if (is_free_buffer_after_forward_) {
diff --git a/src/turbomind/models/llama/LlamaDecoder.cc b/src/turbomind/models/llama/LlamaDecoder.cc
index 7b934f3cb2..791f942c8a 100644
--- a/src/turbomind/models/llama/LlamaDecoder.cc
+++ b/src/turbomind/models/llama/LlamaDecoder.cc
@@ -197,6 +197,8 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
 
     // int step = input_tensors->at("step").getVal<int>();
 
+    CheckValues(decoder_input, sess.batch_size * hidden_units_, Concat("decoder_input", 0), stream_);
+
     ////////////////////////////////////////////
     /// RMSNorm
     invokeRootMeanSquareNorm(decoder_output,
@@ -208,7 +210,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                              stream_);
     sync_check_cuda_error();
 
-    // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm", 0), stream_);
+    CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm", 0), stream_);
 
     // CheckBatchConsistency(decoder_output,
     //                       hidden_units_,
@@ -228,7 +230,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
         //                       tensor_para_.rank_,
         //                       stream_);
 
-        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_self_attn", layer), stream_);
+        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_self_attn", layer), stream_);
 
         invokeFusedAddBiasResidualRMSNorm(decoder_input,
                                           decoder_output,
@@ -240,12 +242,12 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                                           stream_);
         sync_check_cuda_error();
 
-        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm1", layer), stream_);
+        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm1", layer), stream_);
 
         // decoder_layer_output_ = ffn(decoder_normed_input_)
        forwardFfn(sess, decoder_output, layer);
 
-        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_ffn", layer), stream_);
+        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_ffn", layer), stream_);
 
         auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
                                                      input_tensors->at("output_norm_weight").getPtr<T>();
@@ -259,7 +261,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                                           stream_);
         sync_check_cuda_error();
 
-        // CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm2", layer), stream_);
+        CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm2", layer), stream_);
     }
 
     if (is_free_buffer_after_forward_) {
diff --git a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
index 4eb7937ba6..894069b774 100644
--- a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
@@ -248,6 +248,8 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
     linear_.forward(qkv_buf_, input_query_data, batch_size, weights->qkv);
     POP_RANGE;
 
+    CheckValues(qkv_buf_, batch_size * weights->qkv.output_dims, "decode_qkv", stream_);
+
     // CheckBatchConsistency(qkv_buf_,
     //                       (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_,
     //                       batch_size,
@@ -302,6 +304,8 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
                              stream_);
     sync_check_cuda_error();
 
+    CheckValues(context_buf_, batch_size * weights->output.input_dims, "decode_context", stream_);
+
     // CheckBatchConsistency((T*)context_buf_,
     //                       local_hidden_units_,
     //                       batch_size,
@@ -311,6 +315,8 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
 
     linear_.forward(hidden_features_data, context_buf_, batch_size, weights->output);
 
+    CheckValues(hidden_features_data, batch_size * weights->output.output_dims, "decode_o", stream_);
+
     // CheckBatchConsistency(hidden_features_data,
     //                       hidden_units_,
     //                       batch_size,
diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc
index f605d8f27b..678d436b0b 100644
--- a/src/turbomind/models/llama/LlamaFfnLayer.cc
+++ b/src/turbomind/models/llama/LlamaFfnLayer.cc
@@ -20,6 +20,7 @@
 #include "src/turbomind/models/llama/LlamaFfnLayer.h"
 #include "src/turbomind/kernels/activation_kernels.h"
 #include "src/turbomind/models/llama/LlamaNcclGuard.h"
+#include "src/turbomind/models/llama/llama_utils.h"
 #include "src/turbomind/utils/nvtx_utils.h"
 // #include
 
@@ -99,10 +100,14 @@ void LlamaFfnLayer<T>::forward(TensorMap* output_tensors,
         activation(num_token);
     }
 
+    CheckValues(gating_buf_, num_token * weights->output.input_dims, "ffn13", stream_);
+
     // w2(x)
     linear_.forward(ffn_output_data, gating_buf_, num_token, weights->output);
     POP_RANGE;
 
+    CheckValues(ffn_output_data, num_token * weights->output.output_dims, "ffn2", stream_);
+
     if (tensor_para_.world_size_ > 1) {
         NcclGuard nccl_guard(tensor_para_, stream_);
         ftNcclAllReduceSum(ffn_output_data, ffn_output_data, num_token * hidden_units_, tensor_para_, stream_);
diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc
index 97cf0e0a3d..4f798f9db0 100644
--- a/src/turbomind/models/llama/LlamaV2.cc
+++ b/src/turbomind/models/llama/LlamaV2.cc
@@ -441,7 +441,7 @@ void LlamaV2<T>::internalThreadEntry(int device_id)
     model_instance_barrier() = shared_state_->barrier.get();
 
     // initialize global counters
-    // CheckValues((T*)0, 0, {}, 0);
+    CheckValues((T*)0, 0, {}, 0);
     shared_state_->barrier->wait();
 
     auto& request_queue = shared_state_->request_queue;
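
Note: every probe above goes through CheckValues(ptr, count, tag, stream), presumably declared in llama_utils.h (hence the include added to LlamaFfnLayer.cc). For readers without the TurboMind sources at hand, below is a minimal sketch of what such a NaN/Inf check can look like. The count_invalid kernel, the temporary-counter handling, and the early return for the empty warm-up call are illustrative assumptions, not the actual llama_utils.h implementation. Because it synchronizes the stream after every probe, a helper like this is strictly a debugging aid.

// Hypothetical sketch of a CheckValues-style NaN/Inf probe; may differ from
// the real TurboMind implementation in llama_utils.h.
#include <cstdio>
#include <string>

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Count the NaN/Inf elements of a device buffer into `counter`.
template<typename T>
__global__ void count_invalid(const T* data, size_t n, unsigned int* counter)
{
    const size_t i = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
    if (i < n) {
        const float v = static_cast<float>(data[i]);
        if (isnan(v) || isinf(v)) {
            atomicAdd(counter, 1u);
        }
    }
}

template<typename T>
void CheckValues(const T* data, size_t n, const std::string& tag, cudaStream_t stream)
{
    if (data == nullptr || n == 0) {
        return;  // e.g. the warm-up call CheckValues((T*)0, 0, {}, 0) in LlamaV2.cc
    }

    unsigned int* d_counter{};
    cudaMalloc(&d_counter, sizeof(unsigned int));
    cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream);

    const int block = 256;
    const int grid  = static_cast<int>((n + block - 1) / block);
    count_invalid<<<grid, block, 0, stream>>>(data, n, d_counter);

    unsigned int h_counter = 0;
    cudaMemcpyAsync(&h_counter, d_counter, sizeof(h_counter), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);  // blocks the stream; acceptable only while debugging
    cudaFree(d_counter);

    if (h_counter != 0) {
        std::fprintf(stderr, "[%s] %u NaN/Inf values out of %zu elements\n", tag.c_str(), h_counter, n);
    }
}

With a helper shaped like this, the tags in the patch ("embedding_lookup", "prefill_qkv", "decode_o", ...) identify the first operation in the forward pass whose output turns non-finite, which is the point of inserting a probe after every GEMM, attention, norm, and FFN step.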