build turbomind (#35)

* build turbomind
* change namespace fastertransformer to turbomind
* change logger name
InternLM · Jul 1, 2023 · 35d6446 · 35d6446
1 parent 53d2e42
commit 35d6446
Show file tree

Hide file tree

Showing 196 changed files with 1,292 additions and 1,292 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
     rev: 4.0.1
     hooks:
       - id: flake8
-        args: ["--exclude=llama_service/fastertransformer/triton_model/llama_models/*, configs/*"]
+        args: ["--exclude=lmdeploy/turbomind/triton_models/*"]
   - repo: https://github.com/PyCQA/isort
     rev: 5.11.5
     hooks:

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -53,7 +53,7 @@ set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 FetchContent_MakeAvailable(repo-cutlass)
 
 set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
-set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/fastertransformer/cutlass_extensions/include)
+set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)
 
 option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF)
 

diff --git a/README.md b/README.md
@@ -98,7 +98,7 @@ Run one of the following commands to serve a LLaMA model on NVIDIA GPU server:
 <summary><b>7B</b></summary>
 
 ```shell
-python3 lmdeploy/serve/fastertransformer/deploy.py llama-7B /path/to/llama-7b llama \
+python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \
     --tokenizer_path /path/to/tokenizer/model
 bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
 ```
@@ -109,7 +109,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
 <summary><b>13B</b></summary>
 
 ```shell
-python3 lmdeploy/serve/fastertransformer/deploy.py llama-13B /path/to/llama-13b llama \
+python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \
     --tokenizer_path /path/to/tokenizer/model --tp 2
 bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
 ```
@@ -128,7 +128,7 @@ python3 -m fastchat.model.apply_delta \
   --target-model-path /path/to/vicuna-7b \
   --delta-path lmsys/vicuna-7b-delta-v1.1
 
-python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-7B /path/to/vicuna-7b hf
+python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf
 bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
 ```
 
@@ -144,7 +144,7 @@ python3 -m fastchat.model.apply_delta \
   --target-model-path /path/to/vicuna-13b \
   --delta-path lmsys/vicuna-13b-delta-v1.1
 
-python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-13B /path/to/vicuna-13b hf
+python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf
 bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
 ```
 

diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -96,7 +96,7 @@ make -j$(nproc) && make install
 <summary><b>7B</b></summary>
 
 ```shell
-python3 lmdeploy/serve/fastertransformer/deploy.py llama-7B /path/to/llama-7b llama \
+python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \
     --tokenizer_path /path/to/tokenizer/model
 bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
 ```
@@ -107,7 +107,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
 <summary><b>13B</b></summary>
 
 ```shell
-python3 lmdeploy/serve/fastertransformer/deploy.py llama-13B /path/to/llama-13b llama \
+python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \
     --tokenizer_path /path/to/tokenizer/model --tp 2
 bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
 ```
@@ -126,7 +126,7 @@ python3 -m fastchat.model.apply_delta \
   --target-model-path /path/to/vicuna-7b \
   --delta-path lmsys/vicuna-7b-delta-v1.1
 
-python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-7B /path/to/vicuna-7b hf
+python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf
 bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
 ```
 
@@ -142,7 +142,7 @@ python3 -m fastchat.model.apply_delta \
   --target-model-path /path/to/vicuna-13b \
   --delta-path lmsys/vicuna-13b-delta-v1.1
 
-python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-13B /path/to/vicuna-13b hf
+python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf
 bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
 ```
 

diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc
@@ -21,16 +21,16 @@
 #include <memory>
 #include <thread>
 
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/utils/custom_ar_comm.h"
-#include "src/fastertransformer/utils/mpi_utils.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
-#include "src/fastertransformer/utils/nvtx_utils.h"
-#include "src/fastertransformer/utils/word_list.h"
-
-namespace ft = fastertransformer;
+#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/utils/custom_ar_comm.h"
+#include "src/turbomind/utils/mpi_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
+#include "src/turbomind/utils/nvtx_utils.h"
+#include "src/turbomind/utils/word_list.h"
+
+namespace ft = turbomind;
 
 constexpr const bool kUSE_MPI = true;
 

diff --git a/setup.py b/setup.py
@@ -27,7 +27,7 @@ def get_version():
           author='OpenMMLab',
           author_email='openmmlab@gmail.com',
           packages=find_packages(
-              exclude=('lmdeploy/serve/fastertransformer/triton_models', )),
+              exclude=('lmdeploy/serve/turbomind/triton_models', )),
           classifiers=[
               'Programming Language :: Python :: 3.8',
               'Programming Language :: Python :: 3.9',

diff --git a/src/turbomind/kernels/activation_kernels.cu b/src/turbomind/kernels/activation_kernels.cu
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include "src/fastertransformer/kernels/activation_kernels.h"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/activation_kernels.h"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
 
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #endif
 
-namespace fastertransformer {
+namespace turbomind {
 
 /* Gelu Activation */
 
@@ -255,8 +255,8 @@ void invokeGenericActivation(T*           out,
                              const int    seq_len,
                              cudaStream_t stream)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
-    FT_LOG_DEBUG("invokeGenericActivation %d %d %d", m, n, seq_len);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG("invokeGenericActivation %d %d %d", m, n, seq_len);
     using PT                   = typename packed_type<T>::type;
     constexpr int packed_elems = num_elems<PT>::value;
     using PBT                  = typename packed_as<BT, packed_elems>::type;
@@ -272,7 +272,7 @@ void invokeGenericActivation(T*           out,
         block.x = n_threads;
         grid.x  = ceil(m * n / double(n_threads));
     }
-    FT_LOG_DEBUG("%d %d", grid.x, block.x);
+    TM_LOG_DEBUG("%d %d", grid.x, block.x);
     sync_check_cuda_error();
     generic_activation<Activation><<<grid, block, 0, stream>>>(reinterpret_cast<PT*>(out),
                                                                reinterpret_cast<const PBT*>(bias),
@@ -655,4 +655,4 @@ void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stre
 template void invokeSigmoid(float* data, const int size, const float scale, cudaStream_t stream);
 template void invokeSigmoid(half* data, const int size, const float scale, cudaStream_t stream);
 
-}  // namespace fastertransformer
+}  // namespace turbomind
diff --git a/src/turbomind/kernels/activation_kernels.h b/src/turbomind/kernels/activation_kernels.h
@@ -16,12 +16,12 @@
 
 #pragma once
 
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #include <stdlib.h>
 
-namespace fastertransformer {
+namespace turbomind {
 
 // clang-format off
 template<typename T> struct GeluActivation;
@@ -107,4 +107,4 @@ void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStre
 template<typename T>
 void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream);
 
-}  // namespace fastertransformer
+}  // namespace turbomind
diff --git a/src/turbomind/kernels/ban_bad_words.cu b/src/turbomind/kernels/ban_bad_words.cu
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "src/fastertransformer/kernels/ban_bad_words.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/ban_bad_words.h"
+#include "src/turbomind/utils/cuda_utils.h"
 
-namespace fastertransformer {
+namespace turbomind {
 
 template<typename T>
 __global__ void ban_bad_words(T*         logits,
@@ -161,4 +161,4 @@ template void invokeBanBadWords(float*       logits,
                                 size_t       step,
                                 cudaStream_t stream);
 
-}  // namespace fastertransformer
+}  // namespace turbomind
diff --git a/src/turbomind/kernels/ban_bad_words.h b/src/turbomind/kernels/ban_bad_words.h
@@ -19,7 +19,7 @@
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 
-namespace fastertransformer {
+namespace turbomind {
 
 template<typename T>
 void invokeBanBadWords(T*           logits,
@@ -36,4 +36,4 @@ void invokeBanBadWords(T*           logits,
                        size_t       step,
                        cudaStream_t stream);
 
-}  // namespace fastertransformer
+}  // namespace turbomind
diff --git a/src/turbomind/kernels/beam_search_penalty_kernels.cu b/src/turbomind/kernels/beam_search_penalty_kernels.cu
@@ -16,10 +16,10 @@
 
 #include <assert.h>
 
-#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
 
-namespace fastertransformer {
+namespace turbomind {
 
 template<typename T>
 __global__ void add_bias_temperature(T*          logits,
@@ -310,4 +310,4 @@ template void invokeAddBiasApplyPenalties(int                         step,
                                           const int                   min_length,
                                           cudaStream_t                stream);
 
-}  // namespace fastertransformer
+}  // namespace turbomind
diff --git a/src/turbomind/kernels/beam_search_penalty_kernels.h b/src/turbomind/kernels/beam_search_penalty_kernels.h
@@ -17,10 +17,10 @@
 
 #include <cuda_fp16.h>
 
-#include "src/fastertransformer/kernels/penalty_types.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/penalty_types.h"
+#include "src/turbomind/utils/cuda_utils.h"
 
-namespace fastertransformer {
+namespace turbomind {
 
 template<typename T>
 void invokeAddBiasApplyPenalties(int                         step,
@@ -45,4 +45,4 @@ void invokeAddBiasApplyPenalties(int                         step,
                                  const int                   min_length,
                                  cudaStream_t                stream);
 
-}  // namespace fastertransformer
+}  // namespace turbomind
diff --git a/src/turbomind/kernels/beam_search_topk_kernels.cu b/src/turbomind/kernels/beam_search_topk_kernels.cu
@@ -22,13 +22,13 @@
 #include "3rdparty/cub/cub.cuh"
 #endif
 
-#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/kernels/beam_search_topk_kernels.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/logger.h"
 
-namespace fastertransformer {
+namespace turbomind {
 
 template<typename T>
 __device__ __forceinline__ T apply_length_penalty(T log_prob, int length, float length_penalty)
@@ -595,7 +595,7 @@ void invokeTopkBeamSearch(void*           workspace,
                           const int*      end_ids,
                           cudaStream_t    stream)
 {
-    FT_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
+    TM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
     // log_probs: (batch, beam, vocab) cumulative log_probs of beams ending with a token.
     const int vocab_size = vocab_size_padded_;
     // Beam size should be less than or equal to vocab size.
@@ -842,4 +842,4 @@ void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
     insertUnfinishedPath<<<batch_size, 256, 0, stream>>>(beam_hyps, finished, cum_log_probs, batch_size, beam_width);
 }
 
-}  // namespace fastertransformer
+}  // namespace turbomind
diff --git a/src/turbomind/kernels/beam_search_topk_kernels.h b/src/turbomind/kernels/beam_search_topk_kernels.h
@@ -18,7 +18,7 @@
 
 #pragma once
 
-namespace fastertransformer {
+namespace turbomind {
 
 // In original beam search implementation, if a beam is finished, we set it as finished
 // and only continue to do beam search on remain beams (namely, beam_width - 1 beams in next step)
@@ -91,4 +91,4 @@ void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
                                 const int      beam_width,
                                 cudaStream_t   stream);
 
-}  // namespace fastertransformer
+}  // namespace turbomind
diff --git a/src/turbomind/kernels/bert_preprocess_kernels.cu b/src/turbomind/kernels/bert_preprocess_kernels.cu
@@ -15,11 +15,11 @@
  */
 
 #include "bert_preprocess_kernels.h"
-#include "src/fastertransformer/utils/cuda_bf16_fallbacks.cuh"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
+#include "src/turbomind/utils/cuda_bf16_fallbacks.cuh"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
 
-namespace fastertransformer {
+namespace turbomind {
 
 __global__ void getPaddingOffsetAndCuSeqLensKernel(size_t*    h_valid_word_num,
                                                    int*       tmp_mask_offset,
@@ -467,4 +467,4 @@ template void invokeQuantizeMatrixRebuildPadding<half, __nv_fp8_e4m3, QUANTIZE_M
 
 #endif
 
-}  // namespace fastertransformer
+}  // namespace turbomind
diff --git a/src/turbomind/kernels/bert_preprocess_kernels.h b/src/turbomind/kernels/bert_preprocess_kernels.h
@@ -15,15 +15,15 @@
  */
 
 #pragma once
-#include "src/fastertransformer/kernels/gen_relative_pos_bias.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/gen_relative_pos_bias.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #ifdef ENABLE_FP8
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
 #endif  // ENABLE_FP8
 
-namespace fastertransformer {
+namespace turbomind {
 
 void invokeGetPaddingOffsetAndCuSeqLens(size_t*      h_pinned_token_num,
                                         size_t*      h_token_num,
@@ -111,4 +111,4 @@ template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
 void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param);
 #endif  // ENABLE_FP8
 
-}  // namespace fastertransformer
+}  // namespace turbomind
diff --git a/src/turbomind/kernels/custom_ar_kernels.cu b/src/turbomind/kernels/custom_ar_kernels.cu
@@ -15,9 +15,9 @@
  */
 
 #include "custom_ar_kernels.h"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
 
-namespace fastertransformer {
+namespace turbomind {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -395,4 +395,4 @@ template void invokeOneOrTwoShotAllReduceKernel<__nv_bfloat16>(AllReduceParams<_
                                                                cudaStream_t                    stream);
 #endif
 template void invokeOneOrTwoShotAllReduceKernel<uint32_t>(AllReduceParams<uint32_t>& param, cudaStream_t stream);
-}  // namespace fastertransformer
+}  // namespace turbomind