From 38e843800ae400d3654a5bafc069c0dfaeb4551c Mon Sep 17 00:00:00 2001 From: irexyc Date: Tue, 5 Dec 2023 08:23:15 +0000 Subject: [PATCH 01/10] support image_embs input --- lmdeploy/turbomind/turbomind.py | 23 +++++ src/turbomind/models/llama/LlamaBatch.cc | 22 ++++- src/turbomind/models/llama/LlamaV2.cc | 84 ++++++++++++++----- src/turbomind/models/llama/LlamaV2.h | 51 ++++++----- src/turbomind/models/llama/SequenceManager.h | 4 + .../triton_backend/llama/LlamaTritonModel.cc | 3 + .../triton_backend/llama/LlamaTritonModel.h | 1 + 7 files changed, 143 insertions(+), 45 deletions(-) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index ad7c0cb518..9fba482f40 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -459,6 +459,8 @@ async def async_stream_infer(self, *args, **kwargs): def stream_infer(self, session_id, input_ids, + image_embs=None, + image_offsets=None, request_output_len: int = 512, sequence_start: bool = True, sequence_end: bool = False, @@ -544,6 +546,27 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): CORRID=np.array(session_id, dtype=np.uint64), STOP=_broadcast_np((1 if stop else 0), np.int32)) + if image_embs is not None: + assert len(image_offsets) == len(image_embs) + # image_embs Union[List[np.array], List[List[np.array]]] + # image_offsets Union[List[int], List[List[int]]] + if isinstance(image_offsets[0], int): + image_offsets = [image_offsets] + image_embs = [image_embs] + image_embs = [[ + torch.from_numpy(x).squeeze().unsqueeze(0) for x in y + ] for y in image_embs] + image_embs = [torch.cat(x) for x in image_embs] + image_embs = pad_sequence(image_embs, batch_first=True) + image_offsets = [torch.IntTensor(x) for x in image_offsets] + image_offsets = pad_sequence(image_offsets, batch_first=True) + if self.tm_model.config.weight_type == 'fp32': + image_embs = image_embs.float() + else: + image_embs = image_embs.half() + inputs['image_embs'] = image_embs + inputs['image_offsets'] = image_offsets + if ignore_eos: stop_words = None bad_words = torch.tensor([[[self.eos_id], [1]]], dtype=torch.int32) diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 90f303a8bf..fdfa602c25 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -258,6 +258,21 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) output_ids = Copy(input_ids, input_length, output_ids); } + // copy image embeddings + if (model_->image_dim_ > 0 && r->inputs[rank_].isExist("image_embs")) { + T* image_embs = r->inputs[rank_].getPtr("image_embs"); + const int* h_image_offsets = r->inputs[rank_].getPtr("image_offsets"); + const auto n_offsets = r->inputs[rank_].at("image_offsets").shape.back(); + const int count = model_->image_dim_ * model_->hidden_units_; + for (size_t i = 0; i < n_offsets && h_image_offsets[i] > 0; i++) { + seq.image_offsets.push_back(seq.tokens.size() + h_image_offsets[i]); + auto& emb = seq.image_embs.emplace_back(); + emb.resize(count * sizeof(T)); + std::memcpy(emb.data(), image_embs, count * sizeof(T)); + image_embs += count; + } + } + // total context length (history + input) state.h_context_length[idx] = output_ids - output_ids_base; state.h_finished[idx] = false; @@ -1420,6 +1435,8 @@ bool LlamaBatch::Forward(GenerationState& g, int iter) std::vector decode_indices{}; std::vector decode_lengths{}; + std::vector sequences; + BatchedCopy batched_copy; for (int i = first; i < last; ++i) { input_ids = 
batched_copy.Add(input_d_ptrs[i], h_input_length_buf_[i], input_ids); @@ -1436,6 +1453,7 @@ bool LlamaBatch::Forward(GenerationState& g, int iter) } decode_indices.push_back(i); decode_lengths.push_back(h_input_length_buf_[i]); + sequences.push_back(state_->sequences[i]); max_input_len = std::max(max_input_len, h_input_length_buf_[i]); } int token_count = input_ids - context_decoder_ids_buf_; @@ -1482,7 +1500,9 @@ bool LlamaBatch::Forward(GenerationState& g, int iter) pf_batch_size, max_input_len, max_context_cnts[p], - max_context_cnts[p]); + max_context_cnts[p], + decode_lengths.data(), + sequences.data()); if (iter == 0) { // compute logits of inputs if requested diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 12a3bc3cf5..77dae80cbd 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -48,6 +48,7 @@ LlamaV2::LlamaV2(size_t head_num, size_t inter_size, size_t num_layer, size_t vocab_size, + size_t image_dim, float norm_eps, const LlamaAttentionParams& attn_params, int start_id, @@ -69,6 +70,7 @@ LlamaV2::LlamaV2(size_t head_num, inter_size_(inter_size), num_layer_(num_layer), vocab_size_(vocab_size), + image_dim_(image_dim), attn_params_(attn_params), vocab_size_padded_(vocab_size), rmsnorm_eps_(norm_eps), @@ -166,28 +168,63 @@ void LlamaV2::embeddingLookup(T* embeddings, const int* token_ids_buf, int ba } template -void LlamaV2::forwardUnified(T* out, - T* decoder_output, - T* decoder_input, - void** k_block_ptrs, - void** v_block_ptrs, - const int* input_ids, - const int* cu_block_cnts, - const float* rope_theta, - const bool* dc_finished, - const int* pf_input_length, - const int* pf_context_length, - T** pf_tmp_k_ptrs, - T** pf_tmp_v_ptrs, - size_t token_num, - int dc_batch_size, - int dc_step, - int dc_sum_seq_len, - int dc_max_seq_len, - int pf_batch_size, - int pf_max_input_len, - int pf_max_context_len, - int pf_session_len) +void LlamaV2::updateImageEmbedding(T* decoder_input, + const int bsz, + const int* decode_lengths, + const Sequence** sequences) +{ + TM_LOG_DEBUG(__PRETTY_FUNCTION__); + + if (image_dim_ <= 0) { + return; + } + + for (int i = 0; i < bsz; i++) { + decoder_input += ((i > 0) ? 
decode_lengths[i - 1] : 0) * hidden_units_; + if (decode_lengths[i] == 1) { + continue; + } + const auto& seq = *sequences[i]; + for (int j = 0; j < seq.image_offsets.size(); j++) { + if (seq.image_offsets[j] + image_dim_ <= seq.cache_len) { + continue; + } + int off_dst = std::max(0, seq.image_offsets[j] - seq.cache_len); + int off_src = std::max(0, seq.cache_len - seq.image_offsets[j]); + T* dst_ptr = decoder_input + off_dst * hidden_units_; + std::byte* src_ptr = seq.image_embs[j].data() + off_src * hidden_units_; + size_t count = (image_dim_ - off_src) * hidden_units_ * sizeof(T); + cudaMemcpyAsync(dst_ptr, src_ptr, count, cudaMemcpyDefault, stream_); + } + } + sync_check_cuda_error(); +} + +template +void LlamaV2::forwardUnified(T* out, + T* decoder_output, + T* decoder_input, + void** k_block_ptrs, + void** v_block_ptrs, + const int* input_ids, + const int* cu_block_cnts, + const float* rope_theta, + const bool* dc_finished, + const int* pf_input_length, + const int* pf_context_length, + T** pf_tmp_k_ptrs, + T** pf_tmp_v_ptrs, + size_t token_num, + int dc_batch_size, + int dc_step, + int dc_sum_seq_len, + int dc_max_seq_len, + int pf_batch_size, + int pf_max_input_len, + int pf_max_context_len, + int pf_session_len, + const int* decode_lengths, + const Sequence** sequences) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); @@ -203,6 +240,9 @@ void LlamaV2::forwardUnified(T* out, 1, hidden_units_, stream_); + + updateImageEmbedding(decoder_input, dc_batch_size + pf_batch_size, decode_lengths, sequences); + sync_check_cuda_error(); const auto dtype = getTensorType(); diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 19cea4b58e..6eb4580eb4 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -58,6 +58,7 @@ class LlamaV2 { size_t inter_size, size_t num_layer, size_t vocab_size, + size_t image_dim, float norm_eps, const LlamaAttentionParams& attn_params, int start_id, @@ -107,28 +108,32 @@ class LlamaV2 { void embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step); - void forwardUnified(T* out, - T* decoder_output, - T* decoder_input, - void** k_block_ptrs, - void** v_block_ptrs, - const int* input_ids, - const int* cu_block_cnts, - const float* rope_theta, - const bool* dc_finished, - const int* pf_input_length, - const int* pf_context_length, - T** pf_tmp_k_ptrs, - T** pf_tmp_v_ptrs, - size_t token_num, - int dc_batch_size, - int dc_step, - int dc_sum_seq_len, - int dc_max_seq_len, - int pf_batch_size, - int pf_max_input_len, - int pf_max_context_len, - int pf_session_len); + void updateImageEmbedding(T* decoder_input, const int bsz, const int* decode_lengths, const Sequence** sequences); + + void forwardUnified(T* out, + T* decoder_output, + T* decoder_input, + void** k_block_ptrs, + void** v_block_ptrs, + const int* input_ids, + const int* cu_block_cnts, + const float* rope_theta, + const bool* dc_finished, + const int* pf_input_length, + const int* pf_context_length, + T** pf_tmp_k_ptrs, + T** pf_tmp_v_ptrs, + size_t token_num, + int dc_batch_size, + int dc_step, + int dc_sum_seq_len, + int dc_max_seq_len, + int pf_batch_size, + int pf_max_input_len, + int pf_max_context_len, + int pf_session_len, + const int* decode_lengths, + const Sequence** sequences); void postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size); @@ -172,6 +177,8 @@ class LlamaV2 { const size_t local_kv_head_num_; NcclParam tensor_para_; + const size_t image_dim_; + 
cudaStream_t stream_; cublasMMWrapper* cublas_wrapper_; IAllocator* allocator_; diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h index be38d855b2..eb58ad4bdd 100644 --- a/src/turbomind/models/llama/SequenceManager.h +++ b/src/turbomind/models/llama/SequenceManager.h @@ -33,6 +33,10 @@ struct Sequence { mutable float rope_theta = 0.f; + // image data + mutable std::vector> image_embs{}; + mutable std::vector image_offsets{}; + Sequence(uint64_t _id): id(_id) {} friend std::ostream& operator<<(std::ostream& os, const Sequence& seq); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 33711b502d..c018a5e24a 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -172,6 +172,8 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, quant_policy_ = reader.GetInteger("llama", "quant_policy", 0); group_size_ = reader.GetInteger("llama", "group_size", 0); + image_dim_ = reader.GetInteger("llama", "image_dim", image_dim_); + // rotary embedding parameters attn_params_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding"); attn_params_.rotary_embedding_base = reader.GetFloat("llama", "rope_theta", 10000.0f); @@ -273,6 +275,7 @@ std::unique_ptr> LlamaTritonModel::createSh inter_size_, num_layer_, vocab_size_, + image_dim_, norm_eps_, attn_params_, start_id_, diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index ff086a9099..49f6a59a02 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -101,6 +101,7 @@ struct LlamaTritonModel: public AbstractTransformerModel { bool attn_bias_; int quant_policy_; int group_size_; + size_t image_dim_; // shared weights for each device std::vector>> shared_weights_; From 1ab98823accd93bb4d753bf7e4f7c421946e1125 Mon Sep 17 00:00:00 2001 From: irexyc Date: Tue, 5 Dec 2023 08:49:28 +0000 Subject: [PATCH 02/10] add some checks --- lmdeploy/turbomind/turbomind.py | 4 +++- src/turbomind/models/llama/LlamaBatch.cc | 11 ++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 9fba482f40..db04905383 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -559,7 +559,9 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): image_embs = [torch.cat(x) for x in image_embs] image_embs = pad_sequence(image_embs, batch_first=True) image_offsets = [torch.IntTensor(x) for x in image_offsets] - image_offsets = pad_sequence(image_offsets, batch_first=True) + image_offsets = pad_sequence(image_offsets, + batch_first=True, + padding_value=-1) if self.tm_model.config.weight_type == 'fp32': image_embs = image_embs.float() else: diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index fdfa602c25..4c605b920b 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -260,9 +260,18 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) // copy image embeddings if (model_->image_dim_ > 0 && r->inputs[rank_].isExist("image_embs")) { + auto image_embs_tensor = r->inputs[rank_].at("image_embs"); + const auto n_offsets = r->inputs[rank_].at("image_offsets").shape.back(); + if 
(image_embs_tensor.shape.size() != 4 || image_embs_tensor.shape[1] != n_offsets + || image_embs_tensor.shape[2] != model_->image_dim_) { + TM_LOG_WARNING("[ImageFeature] Invalid image feature, id = %ld, info = %s", + (long)seq.id, + image_embs_tensor.toString().c_str()); + continue; + } + T* image_embs = r->inputs[rank_].getPtr("image_embs"); const int* h_image_offsets = r->inputs[rank_].getPtr("image_offsets"); - const auto n_offsets = r->inputs[rank_].at("image_offsets").shape.back(); const int count = model_->image_dim_ * model_->hidden_units_; for (size_t i = 0; i < n_offsets && h_image_offsets[i] > 0; i++) { seq.image_offsets.push_back(seq.tokens.size() + h_image_offsets[i]); From 014a1efa2286c19ab17da986640cdfdddd51ee8b Mon Sep 17 00:00:00 2001 From: irexyc Date: Tue, 5 Dec 2023 08:52:24 +0000 Subject: [PATCH 03/10] update interactive/config.pbtxt && TurbomindModelConfig --- .../turbomind/triton_models/interactive/config.pbtxt | 12 ++++++++++++ lmdeploy/turbomind/deploy/target_model/base.py | 1 + 2 files changed, 13 insertions(+) diff --git a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt index 003881ce43..e3e62c655e 100644 --- a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt +++ b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt @@ -59,6 +59,18 @@ input [ data_type: TYPE_UINT32 dims: [ -1 ] }, + { + name: "image_embs" + data_type: TYPE_FP16 + dims: [ -1, -1, -1 ] + optional: true + }, + { + name: "image_offsets" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + }, { name: "step" data_type: TYPE_INT32 diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 92e6232301..0b9cceff1d 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -58,6 +58,7 @@ class TurbomindModelConfig: max_position_embeddings: int = 0 rope_scaling_factor: float = 0.0 use_logn_attn: int = 0 + image_dim: int = 0 @classmethod def from_dict(cls, env, allow_none=False): From 19c0f5f15127f4cb63cff65fa567d3b10da59403 Mon Sep 17 00:00:00 2001 From: irexyc Date: Thu, 7 Dec 2023 10:36:31 +0800 Subject: [PATCH 04/10] update docstring --- lmdeploy/turbomind/turbomind.py | 2 ++ src/turbomind/models/llama/LlamaBatch.cc | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index db04905383..fb2b4f17fb 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -478,6 +478,8 @@ def stream_infer(self, Args: session_id (int): the id of a session input_ids (numpy.ndarray): the token ids of a prompt + image_embs (List[numpy.ndarray]): the image features + image_offsets (List[int]): image_embs offsets to input_ids request_output_len (int): the max number of to-be-generated tokens sequence_start (bool): indicator for starting a sequence sequence_end (bool): indicator for ending a sequence diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 4c605b920b..ca5717d9bf 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -273,7 +273,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) T* image_embs = r->inputs[rank_].getPtr("image_embs"); const int* h_image_offsets = r->inputs[rank_].getPtr("image_offsets"); const int count = model_->image_dim_ * model_->hidden_units_; - for 
(size_t i = 0; i < n_offsets && h_image_offsets[i] > 0; i++) { + for (size_t i = 0; i < n_offsets && h_image_offsets[i] >= 0; i++) { seq.image_offsets.push_back(seq.tokens.size() + h_image_offsets[i]); auto& emb = seq.image_embs.emplace_back(); emb.resize(count * sizeof(T)); From 1753ead9b60c30d907fb26ab38d87909f8cdff64 Mon Sep 17 00:00:00 2001 From: irexyc Date: Fri, 8 Dec 2023 10:10:13 +0800 Subject: [PATCH 05/10] refactor --- .../triton_models/interactive/config.pbtxt | 23 ++++-- .../turbomind/deploy/target_model/base.py | 1 - lmdeploy/turbomind/turbomind.py | 60 ++++++++------ src/turbomind/models/llama/LlamaBatch.cc | 80 ++++++++++++++----- src/turbomind/models/llama/LlamaV2.cc | 43 ++++------ src/turbomind/models/llama/LlamaV2.h | 7 +- src/turbomind/models/llama/SequenceManager.h | 7 +- .../triton_backend/llama/LlamaTritonModel.cc | 3 - .../triton_backend/llama/LlamaTritonModel.h | 1 - 9 files changed, 137 insertions(+), 88 deletions(-) diff --git a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt index e3e62c655e..806f429da9 100644 --- a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt +++ b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt @@ -60,14 +60,27 @@ input [ dims: [ -1 ] }, { - name: "image_embs" - data_type: TYPE_FP16 - dims: [ -1, -1, -1 ] + name: "embeddings" + data_type: TYPE_BYTES + dims: [ -1 ] optional: true }, { - name: "image_offsets" - data_type: TYPE_INT32 + name: "embedding_counts" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "embedding_begins" + data_type: TYPE_UINT32 + dims: [ -1 ] + optional: true + }, + { + name: "embedding_ends" + data_type: TYPE_UINT32 dims: [ -1 ] optional: true }, diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 0b9cceff1d..92e6232301 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -58,7 +58,6 @@ class TurbomindModelConfig: max_position_embeddings: int = 0 rope_scaling_factor: float = 0.0 use_logn_attn: int = 0 - image_dim: int = 0 @classmethod def from_dict(cls, env, allow_none=False): diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index fb2b4f17fb..84b3caa8ec 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -459,8 +459,9 @@ async def async_stream_infer(self, *args, **kwargs): def stream_infer(self, session_id, input_ids, - image_embs=None, - image_offsets=None, + embeddings=None, + embedding_begins=None, + embedding_ends=None, request_output_len: int = 512, sequence_start: bool = True, sequence_end: bool = False, @@ -478,8 +479,9 @@ def stream_infer(self, Args: session_id (int): the id of a session input_ids (numpy.ndarray): the token ids of a prompt - image_embs (List[numpy.ndarray]): the image features - image_offsets (List[int]): image_embs offsets to input_ids + embeddings (List[numpy.ndarray]): embeddings features + embedding_begins (List[int]): embeddings begin offset to input_ids + embedding_ends (List[int]): embeddings end offset to input_ids request_output_len (int): the max number of to-be-generated tokens sequence_start (bool): indicator for starting a sequence sequence_end (bool): indicator for ending a sequence @@ -548,28 +550,38 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): CORRID=np.array(session_id, dtype=np.uint64), STOP=_broadcast_np((1 if stop 
else 0), np.int32)) - if image_embs is not None: - assert len(image_offsets) == len(image_embs) - # image_embs Union[List[np.array], List[List[np.array]]] - # image_offsets Union[List[int], List[List[int]]] - if isinstance(image_offsets[0], int): - image_offsets = [image_offsets] - image_embs = [image_embs] - image_embs = [[ - torch.from_numpy(x).squeeze().unsqueeze(0) for x in y - ] for y in image_embs] - image_embs = [torch.cat(x) for x in image_embs] - image_embs = pad_sequence(image_embs, batch_first=True) - image_offsets = [torch.IntTensor(x) for x in image_offsets] - image_offsets = pad_sequence(image_offsets, - batch_first=True, - padding_value=-1) + if embeddings is not None: + assert len(embeddings) == len(embedding_begins) == len( + embedding_ends) + if isinstance(embedding_begins[0], int): + embedding_begins = [embedding_begins] + embedding_ends = [embedding_ends] + embeddings = [embeddings] + # convert to lookup table type + # TODO bf16 if self.tm_model.config.weight_type == 'fp32': - image_embs = image_embs.float() + embeddings = [[x.astype(np.float32) for x in y] + for y in embeddings] else: - image_embs = image_embs.half() - inputs['image_embs'] = image_embs - inputs['image_offsets'] = image_offsets + embeddings = [[x.astype(np.float16) for x in y] + for y in embeddings] + + embedding_counts = torch.IntTensor( + [len(embs) for embs in embeddings]) + embeddings = [[torch.from_numpy(x).squeeze() for x in y] + for y in embeddings] + embeddings = [torch.cat(x) for x in embeddings] + embeddings = pad_sequence(embeddings, batch_first=True) + embeddings = embeddings.reshape(embeddings.shape[0], + -1).view(torch.int8) + embedding_begins = [torch.IntTensor(x) for x in embedding_begins] + embedding_begins = pad_sequence(embedding_begins, batch_first=True) + embedding_ends = [torch.IntTensor(x) for x in embedding_ends] + embedding_ends = pad_sequence(embedding_ends, batch_first=True) + inputs['embeddings'] = embeddings + inputs['embedding_counts'] = embedding_counts + inputs['embedding_begins'] = embedding_begins + inputs['embedding_ends'] = embedding_ends if ignore_eos: stop_words = None diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index ca5717d9bf..df631faf18 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -60,6 +60,22 @@ void ClearState(BatchState& s) s.size = s.active_size = 0; } +void DropEmbeddings(const Sequence& seq) +{ + int seq_len = seq.tokens.size(); + int num_emb = seq.embeddings.size(); + size_t sz = num_emb; + for (; sz >= 1; sz--) { + if (seq.embedding_ends[sz - 1] <= seq_len) { + break; + } + } + // should we keep part of embedding? 
+ seq.embeddings.resize(sz); + seq.embedding_begins.resize(sz); + seq.embedding_ends.resize(sz); +} + template void LlamaBatch::RejectInvalidRequests(Requests& stop_reqs, Requests& infer_reqs) { @@ -234,6 +250,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) if (step <= seq.tokens.size()) { seq.tokens.resize(step); seq.cache_len = std::min(seq.cache_len, step); + DropEmbeddings(seq); } else if (rank_ == 0) { TM_LOG_WARNING( @@ -258,27 +275,50 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) output_ids = Copy(input_ids, input_length, output_ids); } - // copy image embeddings - if (model_->image_dim_ > 0 && r->inputs[rank_].isExist("image_embs")) { - auto image_embs_tensor = r->inputs[rank_].at("image_embs"); - const auto n_offsets = r->inputs[rank_].at("image_offsets").shape.back(); - if (image_embs_tensor.shape.size() != 4 || image_embs_tensor.shape[1] != n_offsets - || image_embs_tensor.shape[2] != model_->image_dim_) { - TM_LOG_WARNING("[ImageFeature] Invalid image feature, id = %ld, info = %s", + // copy embeddings + if (r->inputs[rank_].isExist("embedding_counts")) { + int emb_count = r->inputs[rank_].getVal("embedding_counts"); + const auto emb_tensor = r->inputs[rank_].at("embeddings"); + const auto begin_tensor = r->inputs[rank_].at("embedding_begins"); + const auto end_tensor = r->inputs[rank_].at("embedding_ends"); + const int* begin = begin_tensor.getPtr(); + const int* end = end_tensor.getPtr(); + + auto check_embeddings = [&]() { + if (emb_count <= 0 || begin_tensor.shape != end_tensor.shape || emb_tensor.shape.size() != 2) { + return false; + } + int emb_len = 0; + for (size_t i = 0; i < emb_count; i++) { + emb_len += (end[i] - begin[i]); + if (begin[i] < 0 || end[i] < 0 || begin[i] >= end[i] || end[i] > input_length + || emb_len > input_length + || emb_len * model_->hidden_units_ * sizeof(T) > emb_tensor.shape[1]) { + return false; + } + } + return true; + }; + + if (!check_embeddings()) { + TM_LOG_WARNING("[ImageFeature] Skip invalid embeddings, id = %ld, input_length = %d, " + "embeddings = %s, embedding_counts = %d, begins = %s, ends = %s", (long)seq.id, - image_embs_tensor.toString().c_str()); - continue; + input_length, + emb_tensor.toString().c_str(), + emb_count, + begin_tensor.toString().c_str(), + end_tensor.toString().c_str()); } - - T* image_embs = r->inputs[rank_].getPtr("image_embs"); - const int* h_image_offsets = r->inputs[rank_].getPtr("image_offsets"); - const int count = model_->image_dim_ * model_->hidden_units_; - for (size_t i = 0; i < n_offsets && h_image_offsets[i] >= 0; i++) { - seq.image_offsets.push_back(seq.tokens.size() + h_image_offsets[i]); - auto& emb = seq.image_embs.emplace_back(); - emb.resize(count * sizeof(T)); - std::memcpy(emb.data(), image_embs, count * sizeof(T)); - image_embs += count; + else { + char* emb_tensor_ptr = emb_tensor.getPtr(); + for (size_t i = 0; i < emb_count; i++) { + size_t count = (end[i] - begin[i]) * model_->hidden_units_ * sizeof(T); + seq.embeddings.emplace_back((std::byte*)emb_tensor_ptr, (std::byte*)(emb_tensor_ptr + count)); + seq.embedding_begins.emplace_back(begin[i] + seq.tokens.size()); + seq.embedding_ends.emplace_back(end[i] + seq.tokens.size()); + emb_tensor_ptr += count; + } } } @@ -1510,7 +1550,7 @@ bool LlamaBatch::Forward(GenerationState& g, int iter) max_input_len, max_context_cnts[p], max_context_cnts[p], - decode_lengths.data(), + h_input_length_buf_ + first, sequences.data()); if (iter == 0) { diff --git a/src/turbomind/models/llama/LlamaV2.cc 
b/src/turbomind/models/llama/LlamaV2.cc index 77dae80cbd..fd2fd99592 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -48,7 +48,6 @@ LlamaV2::LlamaV2(size_t head_num, size_t inter_size, size_t num_layer, size_t vocab_size, - size_t image_dim, float norm_eps, const LlamaAttentionParams& attn_params, int start_id, @@ -70,7 +69,6 @@ LlamaV2::LlamaV2(size_t head_num, inter_size_(inter_size), num_layer_(num_layer), vocab_size_(vocab_size), - image_dim_(image_dim), attn_params_(attn_params), vocab_size_padded_(vocab_size), rmsnorm_eps_(norm_eps), @@ -168,34 +166,27 @@ void LlamaV2::embeddingLookup(T* embeddings, const int* token_ids_buf, int ba } template -void LlamaV2::updateImageEmbedding(T* decoder_input, - const int bsz, - const int* decode_lengths, - const Sequence** sequences) +void LlamaV2::updateEmbedding(T* decoder_input, const int bsz, const int* h_input_length, const Sequence** sequences) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); - if (image_dim_ <= 0) { - return; - } - for (int i = 0; i < bsz; i++) { - decoder_input += ((i > 0) ? decode_lengths[i - 1] : 0) * hidden_units_; - if (decode_lengths[i] == 1) { - continue; - } - const auto& seq = *sequences[i]; - for (int j = 0; j < seq.image_offsets.size(); j++) { - if (seq.image_offsets[j] + image_dim_ <= seq.cache_len) { - continue; + const auto& seq = *sequences[i]; + const auto& embeddings = seq.embeddings; + const auto& begins = seq.embedding_begins; + const auto& ends = seq.embedding_ends; + for (int j = embeddings.size() - 1; j >= 0; j--) { + if (ends[j] <= seq.cache_len) { + break; } - int off_dst = std::max(0, seq.image_offsets[j] - seq.cache_len); - int off_src = std::max(0, seq.cache_len - seq.image_offsets[j]); - T* dst_ptr = decoder_input + off_dst * hidden_units_; - std::byte* src_ptr = seq.image_embs[j].data() + off_src * hidden_units_; - size_t count = (image_dim_ - off_src) * hidden_units_ * sizeof(T); - cudaMemcpyAsync(dst_ptr, src_ptr, count, cudaMemcpyDefault, stream_); + int off_dst = std::max(0, begins[j] - seq.cache_len); + int off_src = std::max(0, seq.cache_len - begins[j]); + size_t byte_size = (ends[j] - begins[j]) * hidden_units_ * sizeof(T); + T* dst_ptr = decoder_input + off_dst * hidden_units_; + auto src_ptr = embeddings[j].data() + off_src * hidden_units_ * sizeof(T); + cudaMemcpyAsync(dst_ptr, src_ptr, byte_size, cudaMemcpyDefault, stream_); } + decoder_input += h_input_length[i] * hidden_units_; } sync_check_cuda_error(); } @@ -223,7 +214,7 @@ void LlamaV2::forwardUnified(T* out, int pf_max_input_len, int pf_max_context_len, int pf_session_len, - const int* decode_lengths, + const int* h_input_length, const Sequence** sequences) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); @@ -241,7 +232,7 @@ void LlamaV2::forwardUnified(T* out, hidden_units_, stream_); - updateImageEmbedding(decoder_input, dc_batch_size + pf_batch_size, decode_lengths, sequences); + updateEmbedding(decoder_input, dc_batch_size + pf_batch_size, h_input_length, sequences); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 6eb4580eb4..551b7cb121 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -58,7 +58,6 @@ class LlamaV2 { size_t inter_size, size_t num_layer, size_t vocab_size, - size_t image_dim, float norm_eps, const LlamaAttentionParams& attn_params, int start_id, @@ -108,7 +107,7 @@ class LlamaV2 { void embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step); 
- void updateImageEmbedding(T* decoder_input, const int bsz, const int* decode_lengths, const Sequence** sequences); + void updateEmbedding(T* decoder_input, const int bsz, const int* h_input_length, const Sequence** sequences); void forwardUnified(T* out, T* decoder_output, @@ -132,7 +131,7 @@ class LlamaV2 { int pf_max_input_len, int pf_max_context_len, int pf_session_len, - const int* decode_lengths, + const int* h_input_length, const Sequence** sequences); void postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size); @@ -177,8 +176,6 @@ class LlamaV2 { const size_t local_kv_head_num_; NcclParam tensor_para_; - const size_t image_dim_; - cudaStream_t stream_; cublasMMWrapper* cublas_wrapper_; IAllocator* allocator_; diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h index eb58ad4bdd..4902c6321d 100644 --- a/src/turbomind/models/llama/SequenceManager.h +++ b/src/turbomind/models/llama/SequenceManager.h @@ -33,9 +33,10 @@ struct Sequence { mutable float rope_theta = 0.f; - // image data - mutable std::vector> image_embs{}; - mutable std::vector image_offsets{}; + // embedding data + mutable std::vector> embeddings; + mutable std::vector embedding_begins; + mutable std::vector embedding_ends; Sequence(uint64_t _id): id(_id) {} diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index c018a5e24a..33711b502d 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -172,8 +172,6 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, quant_policy_ = reader.GetInteger("llama", "quant_policy", 0); group_size_ = reader.GetInteger("llama", "group_size", 0); - image_dim_ = reader.GetInteger("llama", "image_dim", image_dim_); - // rotary embedding parameters attn_params_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding"); attn_params_.rotary_embedding_base = reader.GetFloat("llama", "rope_theta", 10000.0f); @@ -275,7 +273,6 @@ std::unique_ptr> LlamaTritonModel::createSh inter_size_, num_layer_, vocab_size_, - image_dim_, norm_eps_, attn_params_, start_id_, diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 49f6a59a02..ff086a9099 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -101,7 +101,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { bool attn_bias_; int quant_policy_; int group_size_; - size_t image_dim_; // shared weights for each device std::vector>> shared_weights_; From a6c4977690736cb69e4817b9a78bdbf41be8382e Mon Sep 17 00:00:00 2001 From: irexyc Date: Fri, 8 Dec 2023 10:29:40 +0800 Subject: [PATCH 06/10] support convert embeddings to bf16 --- lmdeploy/turbomind/turbomind.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 84b3caa8ec..144a7978b0 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -558,10 +558,14 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): embedding_ends = [embedding_ends] embeddings = [embeddings] # convert to lookup table type - # TODO bf16 if self.tm_model.config.weight_type == 'fp32': embeddings = [[x.astype(np.float32) for x in y] for y in embeddings] + elif self.tm_model.config.weight_type == 
'bf16': + embeddings = [[ + torch.from_numpy(x).bfloat16().view(torch.half).numpy() + for x in y + ] for y in embeddings] else: embeddings = [[x.astype(np.float16) for x in y] for y in embeddings] From b0e3984ac4a0c0528110c57d07ba20ebae4275aa Mon Sep 17 00:00:00 2001 From: irexyc Date: Fri, 8 Dec 2023 17:10:52 +0800 Subject: [PATCH 07/10] update interactive/config.pbtxt --- .../serve/turbomind/triton_models/interactive/config.pbtxt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt index 806f429da9..7a90fd9d04 100644 --- a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt +++ b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt @@ -61,7 +61,7 @@ input [ }, { name: "embeddings" - data_type: TYPE_BYTES + data_type: TYPE_INT8 dims: [ -1 ] optional: true }, @@ -72,7 +72,7 @@ input [ reshape: { shape: [ ] } optional: true }, - { + { name: "embedding_begins" data_type: TYPE_UINT32 dims: [ -1 ] From e215bbaf9e1b61f8290da463f06849dd7f6c7e02 Mon Sep 17 00:00:00 2001 From: irexyc Date: Tue, 12 Dec 2023 03:30:41 +0000 Subject: [PATCH 08/10] embeddings -> input_embeddings --- .../triton_models/interactive/config.pbtxt | 2 +- lmdeploy/turbomind/turbomind.py | 42 +++++++++---------- src/turbomind/models/llama/LlamaBatch.cc | 14 +++---- src/turbomind/models/llama/LlamaV2.cc | 2 +- src/turbomind/models/llama/SequenceManager.h | 2 +- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt index 7a90fd9d04..2a82f8c250 100644 --- a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt +++ b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt @@ -60,7 +60,7 @@ input [ dims: [ -1 ] }, { - name: "embeddings" + name: "input_embeddings" data_type: TYPE_INT8 dims: [ -1 ] optional: true diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 144a7978b0..10eeeecc8f 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -459,7 +459,7 @@ async def async_stream_infer(self, *args, **kwargs): def stream_infer(self, session_id, input_ids, - embeddings=None, + input_embeddings=None, embedding_begins=None, embedding_ends=None, request_output_len: int = 512, @@ -479,9 +479,9 @@ def stream_infer(self, Args: session_id (int): the id of a session input_ids (numpy.ndarray): the token ids of a prompt - embeddings (List[numpy.ndarray]): embeddings features - embedding_begins (List[int]): embeddings begin offset to input_ids - embedding_ends (List[int]): embeddings end offset to input_ids + input_embeddings (List[numpy.ndarray]): embeddings features + embedding_begins (List[int]): the begin offsets of input_embeddings + embedding_ends (List[int]): the end offset of input_embeddings request_output_len (int): the max number of to-be-generated tokens sequence_start (bool): indicator for starting a sequence sequence_end (bool): indicator for ending a sequence @@ -550,39 +550,39 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): CORRID=np.array(session_id, dtype=np.uint64), STOP=_broadcast_np((1 if stop else 0), np.int32)) - if embeddings is not None: - assert len(embeddings) == len(embedding_begins) == len( + if input_embeddings is not None: + assert len(input_embeddings) == len(embedding_begins) == len( embedding_ends) if 
isinstance(embedding_begins[0], int): embedding_begins = [embedding_begins] embedding_ends = [embedding_ends] - embeddings = [embeddings] + input_embeddings = [input_embeddings] # convert to lookup table type if self.tm_model.config.weight_type == 'fp32': - embeddings = [[x.astype(np.float32) for x in y] - for y in embeddings] + input_embeddings = [[x.astype(np.float32) for x in y] + for y in input_embeddings] elif self.tm_model.config.weight_type == 'bf16': - embeddings = [[ + input_embeddings = [[ torch.from_numpy(x).bfloat16().view(torch.half).numpy() for x in y - ] for y in embeddings] + ] for y in input_embeddings] else: - embeddings = [[x.astype(np.float16) for x in y] - for y in embeddings] + input_embeddings = [[x.astype(np.float16) for x in y] + for y in input_embeddings] embedding_counts = torch.IntTensor( - [len(embs) for embs in embeddings]) - embeddings = [[torch.from_numpy(x).squeeze() for x in y] - for y in embeddings] - embeddings = [torch.cat(x) for x in embeddings] - embeddings = pad_sequence(embeddings, batch_first=True) - embeddings = embeddings.reshape(embeddings.shape[0], - -1).view(torch.int8) + [len(embs) for embs in input_embeddings]) + input_embeddings = [[torch.from_numpy(x).squeeze() for x in y] + for y in input_embeddings] + input_embeddings = [torch.cat(x) for x in input_embeddings] + input_embeddings = pad_sequence(input_embeddings, batch_first=True) + input_embeddings = input_embeddings.reshape( + input_embeddings.shape[0], -1).view(torch.int8) embedding_begins = [torch.IntTensor(x) for x in embedding_begins] embedding_begins = pad_sequence(embedding_begins, batch_first=True) embedding_ends = [torch.IntTensor(x) for x in embedding_ends] embedding_ends = pad_sequence(embedding_ends, batch_first=True) - inputs['embeddings'] = embeddings + inputs['input_embeddings'] = input_embeddings inputs['embedding_counts'] = embedding_counts inputs['embedding_begins'] = embedding_begins inputs['embedding_ends'] = embedding_ends diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index df631faf18..8ba2d3e4a3 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -63,7 +63,7 @@ void ClearState(BatchState& s) void DropEmbeddings(const Sequence& seq) { int seq_len = seq.tokens.size(); - int num_emb = seq.embeddings.size(); + int num_emb = seq.input_embeddings.size(); size_t sz = num_emb; for (; sz >= 1; sz--) { if (seq.embedding_ends[sz - 1] <= seq_len) { @@ -71,7 +71,7 @@ void DropEmbeddings(const Sequence& seq) } } // should we keep part of embedding? 
- seq.embeddings.resize(sz); + seq.input_embeddings.resize(sz); seq.embedding_begins.resize(sz); seq.embedding_ends.resize(sz); } @@ -275,10 +275,10 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) output_ids = Copy(input_ids, input_length, output_ids); } - // copy embeddings + // copy input embeddings if (r->inputs[rank_].isExist("embedding_counts")) { int emb_count = r->inputs[rank_].getVal("embedding_counts"); - const auto emb_tensor = r->inputs[rank_].at("embeddings"); + const auto emb_tensor = r->inputs[rank_].at("input_embeddings"); const auto begin_tensor = r->inputs[rank_].at("embedding_begins"); const auto end_tensor = r->inputs[rank_].at("embedding_ends"); const int* begin = begin_tensor.getPtr(); @@ -301,8 +301,8 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) }; if (!check_embeddings()) { - TM_LOG_WARNING("[ImageFeature] Skip invalid embeddings, id = %ld, input_length = %d, " - "embeddings = %s, embedding_counts = %d, begins = %s, ends = %s", + TM_LOG_WARNING("[ImageFeature] Skip invalid input embeddings, id = %ld, input_length = %d, " + "input embeddings = %s, embedding_counts = %d, begins = %s, ends = %s", (long)seq.id, input_length, emb_tensor.toString().c_str(), @@ -314,7 +314,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) char* emb_tensor_ptr = emb_tensor.getPtr(); for (size_t i = 0; i < emb_count; i++) { size_t count = (end[i] - begin[i]) * model_->hidden_units_ * sizeof(T); - seq.embeddings.emplace_back((std::byte*)emb_tensor_ptr, (std::byte*)(emb_tensor_ptr + count)); + seq.input_embeddings.emplace_back((std::byte*)emb_tensor_ptr, (std::byte*)(emb_tensor_ptr + count)); seq.embedding_begins.emplace_back(begin[i] + seq.tokens.size()); seq.embedding_ends.emplace_back(end[i] + seq.tokens.size()); emb_tensor_ptr += count; diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index fd2fd99592..69bac0d4e8 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -172,7 +172,7 @@ void LlamaV2::updateEmbedding(T* decoder_input, const int bsz, const int* h_i for (int i = 0; i < bsz; i++) { const auto& seq = *sequences[i]; - const auto& embeddings = seq.embeddings; + const auto& embeddings = seq.input_embeddings; const auto& begins = seq.embedding_begins; const auto& ends = seq.embedding_ends; for (int j = embeddings.size() - 1; j >= 0; j--) { diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h index 4902c6321d..0f27c03a3b 100644 --- a/src/turbomind/models/llama/SequenceManager.h +++ b/src/turbomind/models/llama/SequenceManager.h @@ -34,7 +34,7 @@ struct Sequence { mutable float rope_theta = 0.f; // embedding data - mutable std::vector> embeddings; + mutable std::vector> input_embeddings; mutable std::vector embedding_begins; mutable std::vector embedding_ends; From cc01fff5b4882780e80a6819a97e0985b63b94bc Mon Sep 17 00:00:00 2001 From: irexyc Date: Thu, 14 Dec 2023 08:09:50 +0000 Subject: [PATCH 09/10] use input_embedding_ranges --- .../triton_models/interactive/config.pbtxt | 17 +----- lmdeploy/turbomind/turbomind.py | 16 +++--- src/turbomind/models/llama/LlamaBatch.cc | 57 ++++++++++--------- src/turbomind/models/llama/LlamaV2.cc | 13 +++-- src/turbomind/models/llama/SequenceManager.h | 3 +- 5 files changed, 50 insertions(+), 56 deletions(-) diff --git a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt index 
2a82f8c250..0b1e431ea4 100644 --- a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt +++ b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt @@ -66,22 +66,9 @@ input [ optional: true }, { - name: "embedding_counts" + name: "input_embedding_ranges" data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "embedding_begins" - data_type: TYPE_UINT32 - dims: [ -1 ] - optional: true - }, - { - name: "embedding_ends" - data_type: TYPE_UINT32 - dims: [ -1 ] + dims: [ -1, 2 ] optional: true }, { diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 10eeeecc8f..6c2dc4ff53 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -570,8 +570,6 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): input_embeddings = [[x.astype(np.float16) for x in y] for y in input_embeddings] - embedding_counts = torch.IntTensor( - [len(embs) for embs in input_embeddings]) input_embeddings = [[torch.from_numpy(x).squeeze() for x in y] for y in input_embeddings] input_embeddings = [torch.cat(x) for x in input_embeddings] @@ -579,13 +577,17 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): input_embeddings = input_embeddings.reshape( input_embeddings.shape[0], -1).view(torch.int8) embedding_begins = [torch.IntTensor(x) for x in embedding_begins] - embedding_begins = pad_sequence(embedding_begins, batch_first=True) + embedding_begins = pad_sequence(embedding_begins, + batch_first=True, + padding_value=-1) embedding_ends = [torch.IntTensor(x) for x in embedding_ends] - embedding_ends = pad_sequence(embedding_ends, batch_first=True) + embedding_ends = pad_sequence(embedding_ends, + batch_first=True, + padding_value=-1) + input_embedding_ranges = torch.stack( + [embedding_begins, embedding_ends], dim=2) inputs['input_embeddings'] = input_embeddings - inputs['embedding_counts'] = embedding_counts - inputs['embedding_begins'] = embedding_begins - inputs['embedding_ends'] = embedding_ends + inputs['input_embedding_ranges'] = input_embedding_ranges if ignore_eos: stop_words = None diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 01d05b2962..3be0432d26 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -66,14 +66,13 @@ void DropEmbeddings(const Sequence& seq) int num_emb = seq.input_embeddings.size(); size_t sz = num_emb; for (; sz >= 1; sz--) { - if (seq.embedding_ends[sz - 1] <= seq_len) { + if (seq.input_embedding_ranges[sz - 1].second <= seq_len) { break; } } // should we keep part of embedding? 
seq.input_embeddings.resize(sz); - seq.embedding_begins.resize(sz); - seq.embedding_ends.resize(sz); + seq.input_embedding_ranges.resize(sz); } template @@ -276,47 +275,53 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) } // copy input embeddings - if (r->inputs[rank_].isExist("embedding_counts")) { - int emb_count = r->inputs[rank_].getVal("embedding_counts"); + if (r->inputs[rank_].isExist("input_embedding_ranges")) { + const auto range_tensor = r->inputs[rank_].at("input_embedding_ranges"); const auto emb_tensor = r->inputs[rank_].at("input_embeddings"); - const auto begin_tensor = r->inputs[rank_].at("embedding_begins"); - const auto end_tensor = r->inputs[rank_].at("embedding_ends"); - const int* begin = begin_tensor.getPtr(); - const int* end = end_tensor.getPtr(); + const int* ranges = range_tensor.getPtr(); - auto check_embeddings = [&]() { - if (emb_count <= 0 || begin_tensor.shape != end_tensor.shape || emb_tensor.shape.size() != 2) { + auto check_embeddings = [&](int& num_valid_embeddings) { + if (range_tensor.shape.size() != 3 || range_tensor.shape[2] % 2 != 0) { return false; } - int emb_len = 0; - for (size_t i = 0; i < emb_count; i++) { - emb_len += (end[i] - begin[i]); - if (begin[i] < 0 || end[i] < 0 || begin[i] >= end[i] || end[i] > input_length - || emb_len > input_length - || emb_len * model_->hidden_units_ * sizeof(T) > emb_tensor.shape[1]) { + int embedding_count = range_tensor.shape[1]; + int embedding_length = 0; + int pre_end = -1; + + for (size_t i = 0; i < embedding_count; i++) { + int begin = ranges[i * 2]; + int end = ranges[i * 2 + 1]; + embedding_length += (end - begin); + if (begin < 0 || end < 0) { + break; + } + if (begin >= end || end > input_length || begin < pre_end + || embedding_length * model_->hidden_units_ * sizeof(T) > emb_tensor.shape[1]) { return false; } + pre_end = end; + num_valid_embeddings = i + 1; } return true; }; - if (!check_embeddings()) { + int num_valid_embeddings = 0; + if (!check_embeddings(num_valid_embeddings)) { TM_LOG_WARNING("[ImageFeature] Skip invalid input embeddings, id = %ld, input_length = %d, " - "input embeddings = %s, embedding_counts = %d, begins = %s, ends = %s", + "input embeddings = %s, range_tensor = %s", (long)seq.id, input_length, emb_tensor.toString().c_str(), - emb_count, - begin_tensor.toString().c_str(), - end_tensor.toString().c_str()); + range_tensor.toString().c_str()); } else { char* emb_tensor_ptr = emb_tensor.getPtr(); - for (size_t i = 0; i < emb_count; i++) { - size_t count = (end[i] - begin[i]) * model_->hidden_units_ * sizeof(T); + for (size_t i = 0; i < num_valid_embeddings; i++) { + int begin = ranges[i * 2]; + int end = ranges[i * 2 + 1]; + size_t count = (end - begin) * model_->hidden_units_ * sizeof(T); seq.input_embeddings.emplace_back((std::byte*)emb_tensor_ptr, (std::byte*)(emb_tensor_ptr + count)); - seq.embedding_begins.emplace_back(begin[i] + seq.tokens.size()); - seq.embedding_ends.emplace_back(end[i] + seq.tokens.size()); + seq.input_embedding_ranges.emplace_back(begin + seq.tokens.size(), end + seq.tokens.size()); emb_tensor_ptr += count; } } diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 70de38187f..0663df30c0 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -173,15 +173,16 @@ void LlamaV2::updateEmbedding(T* decoder_input, const int bsz, const int* h_i for (int i = 0; i < bsz; i++) { const auto& seq = *sequences[i]; const auto& embeddings = seq.input_embeddings; - 
const auto& begins = seq.embedding_begins; - const auto& ends = seq.embedding_ends; + const auto& ranges = seq.input_embedding_ranges; for (int j = embeddings.size() - 1; j >= 0; j--) { - if (ends[j] <= seq.cache_len) { + int begin = ranges[j].first; + int end = ranges[j].second; + if (end <= seq.cache_len) { break; } - int off_dst = std::max(0, begins[j] - seq.cache_len); - int off_src = std::max(0, seq.cache_len - begins[j]); - size_t byte_size = (ends[j] - begins[j]) * hidden_units_ * sizeof(T); + int off_dst = std::max(0, begin - seq.cache_len); + int off_src = std::max(0, seq.cache_len - begin); + size_t byte_size = (end - begin) * hidden_units_ * sizeof(T); T* dst_ptr = decoder_input + off_dst * hidden_units_; auto src_ptr = embeddings[j].data() + off_src * hidden_units_ * sizeof(T); cudaMemcpyAsync(dst_ptr, src_ptr, byte_size, cudaMemcpyDefault, stream_); diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h index 31a12d113b..1744646395 100644 --- a/src/turbomind/models/llama/SequenceManager.h +++ b/src/turbomind/models/llama/SequenceManager.h @@ -35,8 +35,7 @@ struct Sequence { // embedding data mutable std::vector> input_embeddings; - mutable std::vector embedding_begins; - mutable std::vector embedding_ends; + mutable std::vector> input_embedding_ranges; explicit Sequence(uint64_t _id): id(_id) {} From 7e335d532abf7f7233c34a227ba4af8c1b1732ea Mon Sep 17 00:00:00 2001 From: irexyc Date: Fri, 15 Dec 2023 04:18:21 +0000 Subject: [PATCH 10/10] remove embedding_begins/ends --- lmdeploy/turbomind/turbomind.py | 36 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 6c2dc4ff53..a602f805ca 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -460,8 +460,7 @@ def stream_infer(self, session_id, input_ids, input_embeddings=None, - embedding_begins=None, - embedding_ends=None, + input_embedding_ranges=None, request_output_len: int = 512, sequence_start: bool = True, sequence_end: bool = False, @@ -480,8 +479,8 @@ def stream_infer(self, session_id (int): the id of a session input_ids (numpy.ndarray): the token ids of a prompt input_embeddings (List[numpy.ndarray]): embeddings features - embedding_begins (List[int]): the begin offsets of input_embeddings - embedding_ends (List[int]): the end offset of input_embeddings + input_embedding_ranges (List[Tuple[int,int]]): the begin/end + offsets of input_embeddings to input_ids request_output_len (int): the max number of to-be-generated tokens sequence_start (bool): indicator for starting a sequence sequence_end (bool): indicator for ending a sequence @@ -551,12 +550,10 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): STOP=_broadcast_np((1 if stop else 0), np.int32)) if input_embeddings is not None: - assert len(input_embeddings) == len(embedding_begins) == len( - embedding_ends) - if isinstance(embedding_begins[0], int): - embedding_begins = [embedding_begins] - embedding_ends = [embedding_ends] + assert len(input_embeddings) == len(input_embedding_ranges) + if isinstance(input_embeddings[0], np.ndarray): input_embeddings = [input_embeddings] + input_embedding_ranges = [input_embedding_ranges] # convert to lookup table type if self.tm_model.config.weight_type == 'fp32': input_embeddings = [[x.astype(np.float32) for x in y] @@ -576,16 +573,17 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): input_embeddings = 
pad_sequence(input_embeddings, batch_first=True)
             input_embeddings = input_embeddings.reshape(
                 input_embeddings.shape[0], -1).view(torch.int8)
-            embedding_begins = [torch.IntTensor(x) for x in embedding_begins]
-            embedding_begins = pad_sequence(embedding_begins,
-                                            batch_first=True,
-                                            padding_value=-1)
-            embedding_ends = [torch.IntTensor(x) for x in embedding_ends]
-            embedding_ends = pad_sequence(embedding_ends,
-                                          batch_first=True,
-                                          padding_value=-1)
-            input_embedding_ranges = torch.stack(
-                [embedding_begins, embedding_ends], dim=2)
+
+            _input_embedding_ranges = []
+            for x in input_embedding_ranges:
+                if x is not None and len(x) != 0:
+                    _input_embedding_ranges.append(torch.IntTensor(x))
+                else:
+                    _input_embedding_ranges.append(torch.IntTensor(size=(0, 2)))
+            input_embedding_ranges = pad_sequence(_input_embedding_ranges,
+                                                  batch_first=True,
+                                                  padding_value=-1)
             inputs['input_embeddings'] = input_embeddings
             inputs['input_embedding_ranges'] = input_embedding_ranges
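A minimal usage sketch of the interface this series converges on, where stream_infer takes input_embeddings plus input_embedding_ranges. The TurboMind('./workspace') construction, the create_instance() entry point, the hidden size and the dummy feature tensor are assumptions for illustration only; the stream_infer parameters themselves come from the patches above.

import numpy as np
from lmdeploy.turbomind import TurboMind

tm_model = TurboMind('./workspace')     # placeholder path to a converted model
generator = tm_model.create_instance()  # assumed per-session model instance

hidden_size = 4096                      # must match the target model
num_image_tokens = 576
# Dummy fp16 features standing in for a real vision-encoder output.
image_feats = np.random.rand(num_image_tokens, hidden_size).astype(np.float16)

# Reserve a run of placeholder token ids for the image; the features are
# written over the decoder inputs at positions [1, 1 + num_image_tokens).
input_ids = np.array([1] + [0] * num_image_tokens + [500, 501, 502])
ranges = [(1, 1 + num_image_tokens)]

for outputs in generator.stream_infer(session_id=1,
                                      input_ids=input_ids,
                                      input_embeddings=[image_feats],
                                      input_embedding_ranges=ranges,
                                      request_output_len=64,
                                      sequence_start=True,
                                      sequence_end=True):
    pass  # consume streamed outputs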