check for Inf & NaNs
lzhangzz committed Nov 13, 2023
1 parent 56a1f7b commit e7f5547
Showing 7 changed files with 33 additions and 12 deletions.
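
The diff below enables (and in several layers newly inserts) a CheckValues call after each major stage of the forward pass: embedding lookup, QKV projection, attention, output projection, RMSNorm, and the FFN. With a distinct tag string per call site, the first stage that produces an Inf or NaN can be pinpointed. The helper itself is not part of this diff; judging by the include added to LlamaFfnLayer.cc below, it is declared in llama_utils.h. What follows is only a minimal sketch of what such a check might look like, assuming a single device-side counter and a blocking readback; the names CheckValuesSketch and count_invalid are illustrative, not the project's actual API.

    // Hypothetical sketch (not from this commit): scan a device buffer for Inf/NaN
    // and report the tag of the stage that produced it. Intended for a .cu file.
    #include <algorithm>
    #include <cstdio>
    #include <cuda_fp16.h>
    #include <cuda_runtime.h>

    template<typename T>
    __global__ void count_invalid(const T* data, size_t n, unsigned long long* counter)
    {
        // Grid-stride loop: each thread inspects a strided subset of the buffer.
        for (size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x; i < n;
             i += (size_t)blockDim.x * gridDim.x) {
            const float v = static_cast<float>(data[i]);
            if (isnan(v) || isinf(v)) {
                atomicAdd(counter, 1ULL);
            }
        }
    }

    template<typename T>
    void CheckValuesSketch(const T* data, size_t n, const char* tag, cudaStream_t stream)
    {
        // One counter per process, allocated lazily on first use.
        static unsigned long long* d_counter = nullptr;
        if (d_counter == nullptr) {
            cudaMalloc(&d_counter, sizeof(unsigned long long));
        }
        cudaMemsetAsync(d_counter, 0, sizeof(unsigned long long), stream);
        if (n > 0) {
            const int block = 256;
            const int grid  = (int)std::min<size_t>((n + block - 1) / block, 4096);
            count_invalid<<<grid, block, 0, stream>>>(data, n, d_counter);
        }
        unsigned long long h_count = 0;
        cudaMemcpyAsync(&h_count, d_counter, sizeof(h_count), cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);  // debug-only: deliberately serializes the stream
        if (h_count != 0) {
            fprintf(stderr, "[%s] %llu Inf/NaN values out of %zu elements\n", tag, h_count, n);
        }
    }

Because the readback synchronizes the stream, a check of this kind is a debugging aid rather than something to leave in a serving path; the value of the pattern in the diff is that the tag strings ("prefill_qkv", "decode_norm1", ...) identify exactly which stage first went non-finite.
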
2 changes: 1 addition & 1 deletion src/turbomind/models/llama/LlamaBatch.cc
@@ -507,7 +507,7 @@ bool LlamaBatch<T>::generate()
// T x = 999999.f;
// cudaMemcpyAsync(decoder_input_buf_, &x, sizeof(x), cudaMemcpyDefault, stream_);

// CheckValues(decoder_input_buf_, batch_size_ * llama_->hidden_units_, "embedding_lookup", stream_);
CheckValues(decoder_input_buf_, batch_size_ * llama_->hidden_units_, "embedding_lookup", stream_);

// if (compare_mode == kCmpWrite) {
// if (rank_ == 0) {
6 changes: 6 additions & 0 deletions src/turbomind/models/llama/LlamaContextAttentionLayer.cc
@@ -157,6 +157,8 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
// [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim]
linear_.forward(qkv_buf_, attention_input, num_token, weights->qkv);

CheckValues(qkv_buf_, num_token * weights->qkv.output_dims, "prefill_qkv", stream_);

//////////////////////////////////////////////
/// transpose qkv & apply rotary embedding & rebuild padding
/// qkv [B, s, H + 2kvH, D] -> (q [B, H, s, D], k [B, kvH, s, D], v [B, kvH, s, D])
@@ -237,10 +239,14 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
weights->past_kv_scale.data());
}

CheckValues(qkv_buf_3_, num_token * weights->output.input_dims, "prefill_context", stream_);

//////////////////////////////////////////////
/// output gemm <Bs,HD> -> <Bs,HD>
linear_.forward(attention_out, qkv_buf_3_, num_token, weights->output);

CheckValues(attention_out, num_token * weights->output.output_dims, "prefill_o", stream_);

if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(attention_out, attention_out, num_token * hidden_units_, tensor_para_, stream_);
12 changes: 7 additions & 5 deletions src/turbomind/models/llama/LlamaContextDecoder.cc
@@ -234,6 +234,8 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
stream_);
sync_check_cuda_error();

CheckValues(decoder_input_output, sess.token_num * hidden_units_, Concat("prefill_input", 0), stream_);

/////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(decoder_output,
@@ -245,14 +247,14 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
stream_);
sync_check_cuda_error();

// CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm", 0), stream_);
CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm", 0), stream_);

for (size_t layer = 0; layer < num_layer_; ++layer) {
/////////////////////////////////////////////
/// self-attention
forwardSelfAttn(sess, decoder_output, input_tensors, layer, false);

// CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_self_attn", layer), stream_);
CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_self_attn", layer), stream_);

invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
decoder_output,
@@ -264,7 +266,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
stream_);
sync_check_cuda_error();

// CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm1", layer), stream_);
CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm1", layer), stream_);

////////////////////////////////////////////
/// feed-forward network
@@ -273,7 +275,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
{"ffn_output", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, decoder_output}}};
silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &decoder_layer_weights->at(layer)->ffn_weights);

// CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_ffn", layer), stream_);
CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_ffn", layer), stream_);

auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
@@ -287,7 +289,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
stream_);
sync_check_cuda_error();

// CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm2", layer), stream_);
CheckValues(decoder_output, sess.token_num * hidden_units_, Concat("prefill_norm2", layer), stream_);
}

if (is_free_buffer_after_forward_) {
12 changes: 7 additions & 5 deletions src/turbomind/models/llama/LlamaDecoder.cc
@@ -197,6 +197,8 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou

// int step = input_tensors->at("step").getVal<int>();

CheckValues(decoder_input, sess.batch_size * hidden_units_, Concat("decoder_input", 0), stream_);

////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(decoder_output,
@@ -208,7 +210,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
stream_);
sync_check_cuda_error();

// CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm", 0), stream_);
CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm", 0), stream_);

// CheckBatchConsistency(decoder_output,
// hidden_units_,
@@ -228,7 +230,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
// tensor_para_.rank_,
// stream_);

// CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_self_attn", layer), stream_);
CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_self_attn", layer), stream_);

invokeFusedAddBiasResidualRMSNorm(decoder_input,
decoder_output,
@@ -240,12 +242,12 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
stream_);
sync_check_cuda_error();

// CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm1", layer), stream_);
CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm1", layer), stream_);

// decoder_layer_output_ = ffn(decoder_normed_input_)
forwardFfn(sess, decoder_output, layer);

// CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_ffn", layer), stream_);
CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_ffn", layer), stream_);

auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
@@ -259,7 +261,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
stream_);
sync_check_cuda_error();

// CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm2", layer), stream_);
CheckValues(decoder_output, sess.batch_size * hidden_units_, Concat("decode_norm2", layer), stream_);
}

if (is_free_buffer_after_forward_) {
6 changes: 6 additions & 0 deletions src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
@@ -248,6 +248,8 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
linear_.forward(qkv_buf_, input_query_data, batch_size, weights->qkv);
POP_RANGE;

CheckValues(qkv_buf_, batch_size * weights->qkv.output_dims, "decode_qkv", stream_);

// CheckBatchConsistency(qkv_buf_,
// (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_,
// batch_size,
@@ -302,6 +304,8 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
stream_);
sync_check_cuda_error();

CheckValues(context_buf_, batch_size * weights->output.input_dims, "decode_context", stream_);

// CheckBatchConsistency((T*)context_buf_,
// local_hidden_units_,
// batch_size,
@@ -311,6 +315,8 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o

linear_.forward(hidden_features_data, context_buf_, batch_size, weights->output);

CheckValues(hidden_features_data, batch_size * weights->output.output_dims, "decode_o", stream_);

// CheckBatchConsistency(hidden_features_data,
// hidden_units_,
// batch_size,
5 changes: 5 additions & 0 deletions src/turbomind/models/llama/LlamaFfnLayer.cc
@@ -20,6 +20,7 @@
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/kernels/activation_kernels.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/nvtx_utils.h"
// #include <glog/logging.h>

@@ -99,10 +100,14 @@ void LlamaFfnLayer<T>::forward(TensorMap* output_tensors,
activation(num_token);
}

CheckValues(gating_buf_, num_token * weights->output.input_dims, "ffn13", stream_);

// w2(x)
linear_.forward(ffn_output_data, gating_buf_, num_token, weights->output);
POP_RANGE;

CheckValues(ffn_output_data, num_token * weights->output.output_dims, "ffn2", stream_);

if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(ffn_output_data, ffn_output_data, num_token * hidden_units_, tensor_para_, stream_);
2 changes: 1 addition & 1 deletion src/turbomind/models/llama/LlamaV2.cc
@@ -441,7 +441,7 @@ void LlamaV2<T>::internalThreadEntry(int device_id)
model_instance_barrier() = shared_state_->barrier.get();

// initialize global counters
// CheckValues((T*)0, 0, {}, 0);
CheckValues((T*)0, 0, {}, 0);
shared_state_->barrier->wait();

auto& request_queue = shared_state_->request_queue;
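
The CheckValues((T*)0, 0, {}, 0) call added in LlamaV2.cc above is annotated "initialize global counters": each worker thread issues a zero-length check at startup, before the barrier, apparently so that any lazily created state inside the helper exists before the threads start checking real buffers concurrently. In terms of the sketch near the top of this page (again an assumption about the real implementation, not its actual behavior), the warm-up would amount to:

    // Hypothetical warm-up mirroring the LlamaV2.cc call: with n == 0 nothing is
    // inspected, but the one-time cudaMalloc happens before concurrent use.
    CheckValuesSketch<half>(nullptr, 0, "init", 0);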
