Skip to content

Commit

Permalink
build turbomind (#35)
Browse files (browse the repository at this point in the history)
* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
  • Loading branch information
lvhan028 authored Jul 1, 2023
1 parent 53d2e42 commit 35d6446
Show file tree
Hide file tree
Showing 196 changed files with 1,292 additions and 1,292 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ repos:
rev: 4.0.1
hooks:
- id: flake8
args: ["--exclude=llama_service/fastertransformer/triton_model/llama_models/*, configs/*"]
args: ["--exclude=lmdeploy/turbomind/triton_models/*"]
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
FetchContent_MakeAvailable(repo-cutlass)

set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/fastertransformer/cutlass_extensions/include)
set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)

option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF)

Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ Run one of the following commands to serve a LLaMA model on NVIDIA GPU server:
<summary><b>7B</b></summary>

```shell
python3 lmdeploy/serve/fastertransformer/deploy.py llama-7B /path/to/llama-7b llama \
python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
Expand All @@ -109,7 +109,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
<summary><b>13B</b></summary>

```shell
python3 lmdeploy/serve/fastertransformer/deploy.py llama-13B /path/to/llama-13b llama \
python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
Expand All @@ -128,7 +128,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1

python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-7B /path/to/vicuna-7b hf
python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```

Expand All @@ -144,7 +144,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1

python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-13B /path/to/vicuna-13b hf
python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```

Expand Down
8 changes: 4 additions & 4 deletions README_zh-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ make -j$(nproc) && make install
<summary><b>7B</b></summary>

```shell
python3 lmdeploy/serve/fastertransformer/deploy.py llama-7B /path/to/llama-7b llama \
python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
Expand All @@ -107,7 +107,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
<summary><b>13B</b></summary>

```shell
python3 lmdeploy/serve/fastertransformer/deploy.py llama-13B /path/to/llama-13b llama \
python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```
Expand All @@ -126,7 +126,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1

python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-7B /path/to/vicuna-7b hf
python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```

Expand All @@ -142,7 +142,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1

python3 lmdeploy/serve/fastertransformer/deploy.py vicuna-13B /path/to/vicuna-13b hf
python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
```

Expand Down
20 changes: 10 additions & 10 deletions examples/cpp/llama/llama_triton_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,16 @@
#include <memory>
#include <thread>

#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/mpi_utils.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
#include "src/fastertransformer/utils/word_list.h"

namespace ft = fastertransformer;
#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/mpi_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
#include "src/turbomind/utils/nvtx_utils.h"
#include "src/turbomind/utils/word_list.h"

namespace ft = turbomind;

constexpr const bool kUSE_MPI = true;

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def get_version():
author='OpenMMLab',
author_email='openmmlab@gmail.com',
packages=find_packages(
exclude=('lmdeploy/serve/fastertransformer/triton_models', )),
exclude=('lmdeploy/serve/turbomind/triton_models', )),
classifiers=[
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
Expand Down
18 changes: 9 additions & 9 deletions src/turbomind/kernels/activation_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,16 @@
* limitations under the License.
*/

#include "src/fastertransformer/kernels/activation_kernels.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/kernels/activation_kernels.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"

#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif

namespace fastertransformer {
namespace turbomind {

/* Gelu Activation */

Expand Down Expand Up @@ -255,8 +255,8 @@ void invokeGenericActivation(T* out,
const int seq_len,
cudaStream_t stream)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_LOG_DEBUG("invokeGenericActivation %d %d %d", m, n, seq_len);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG("invokeGenericActivation %d %d %d", m, n, seq_len);
using PT = typename packed_type<T>::type;
constexpr int packed_elems = num_elems<PT>::value;
using PBT = typename packed_as<BT, packed_elems>::type;
Expand All @@ -272,7 +272,7 @@ void invokeGenericActivation(T* out,
block.x = n_threads;
grid.x = ceil(m * n / double(n_threads));
}
FT_LOG_DEBUG("%d %d", grid.x, block.x);
TM_LOG_DEBUG("%d %d", grid.x, block.x);
sync_check_cuda_error();
generic_activation<Activation><<<grid, block, 0, stream>>>(reinterpret_cast<PT*>(out),
reinterpret_cast<const PBT*>(bias),
Expand Down Expand Up @@ -655,4 +655,4 @@ void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stre
template void invokeSigmoid(float* data, const int size, const float scale, cudaStream_t stream);
template void invokeSigmoid(half* data, const int size, const float scale, cudaStream_t stream);

} // namespace fastertransformer
} // namespace turbomind
6 changes: 3 additions & 3 deletions src/turbomind/kernels/activation_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@

#pragma once

#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdlib.h>

namespace fastertransformer {
namespace turbomind {

// clang-format off
template<typename T> struct GeluActivation;
Expand Down Expand Up @@ -107,4 +107,4 @@ void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStre
template<typename T>
void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream);

} // namespace fastertransformer
} // namespace turbomind
8 changes: 4 additions & 4 deletions src/turbomind/kernels/ban_bad_words.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
* limitations under the License.
*/

#include "src/fastertransformer/kernels/ban_bad_words.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/kernels/ban_bad_words.h"
#include "src/turbomind/utils/cuda_utils.h"

namespace fastertransformer {
namespace turbomind {

template<typename T>
__global__ void ban_bad_words(T* logits,
Expand Down Expand Up @@ -161,4 +161,4 @@ template void invokeBanBadWords(float* logits,
size_t step,
cudaStream_t stream);

} // namespace fastertransformer
} // namespace turbomind
4 changes: 2 additions & 2 deletions src/turbomind/kernels/ban_bad_words.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#include <cuda_fp16.h>
#include <cuda_runtime.h>

namespace fastertransformer {
namespace turbomind {

template<typename T>
void invokeBanBadWords(T* logits,
Expand All @@ -36,4 +36,4 @@ void invokeBanBadWords(T* logits,
size_t step,
cudaStream_t stream);

} // namespace fastertransformer
} // namespace turbomind
8 changes: 4 additions & 4 deletions src/turbomind/kernels/beam_search_penalty_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

#include <assert.h>

#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"

namespace fastertransformer {
namespace turbomind {

template<typename T>
__global__ void add_bias_temperature(T* logits,
Expand Down Expand Up @@ -310,4 +310,4 @@ template void invokeAddBiasApplyPenalties(int step,
const int min_length,
cudaStream_t stream);

} // namespace fastertransformer
} // namespace turbomind
8 changes: 4 additions & 4 deletions src/turbomind/kernels/beam_search_penalty_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@

#include <cuda_fp16.h>

#include "src/fastertransformer/kernels/penalty_types.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/utils/cuda_utils.h"

namespace fastertransformer {
namespace turbomind {

template<typename T>
void invokeAddBiasApplyPenalties(int step,
Expand All @@ -45,4 +45,4 @@ void invokeAddBiasApplyPenalties(int step,
const int min_length,
cudaStream_t stream);

} // namespace fastertransformer
} // namespace turbomind
16 changes: 8 additions & 8 deletions src/turbomind/kernels/beam_search_topk_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@
#include "3rdparty/cub/cub.cuh"
#endif

#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"

namespace fastertransformer {
namespace turbomind {

template<typename T>
__device__ __forceinline__ T apply_length_penalty(T log_prob, int length, float length_penalty)
Expand Down Expand Up @@ -595,7 +595,7 @@ void invokeTopkBeamSearch(void* workspace,
const int* end_ids,
cudaStream_t stream)
{
FT_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
TM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
// log_probs: (batch, beam, vocab) cumulative log_probs of beams ending with a token.
const int vocab_size = vocab_size_padded_;
// Beam size should be less than or equal to vocab size.
Expand Down Expand Up @@ -842,4 +842,4 @@ void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
insertUnfinishedPath<<<batch_size, 256, 0, stream>>>(beam_hyps, finished, cum_log_probs, batch_size, beam_width);
}

} // namespace fastertransformer
} // namespace turbomind
4 changes: 2 additions & 2 deletions src/turbomind/kernels/beam_search_topk_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

#pragma once

namespace fastertransformer {
namespace turbomind {

// In original beam search implementation, if a beam is finished, we set it as finished
// and only continue to do beam search on remain beams (namely, beam_width - 1 beams in next step)
Expand Down Expand Up @@ -91,4 +91,4 @@ void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
const int beam_width,
cudaStream_t stream);

} // namespace fastertransformer
} // namespace turbomind
10 changes: 5 additions & 5 deletions src/turbomind/kernels/bert_preprocess_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
*/

#include "bert_preprocess_kernels.h"
#include "src/fastertransformer/utils/cuda_bf16_fallbacks.cuh"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_bf16_fallbacks.cuh"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"

namespace fastertransformer {
namespace turbomind {

__global__ void getPaddingOffsetAndCuSeqLensKernel(size_t* h_valid_word_num,
int* tmp_mask_offset,
Expand Down Expand Up @@ -467,4 +467,4 @@ template void invokeQuantizeMatrixRebuildPadding<half, __nv_fp8_e4m3, QUANTIZE_M

#endif

} // namespace fastertransformer
} // namespace turbomind
10 changes: 5 additions & 5 deletions src/turbomind/kernels/bert_preprocess_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@
*/

#pragma once
#include "src/fastertransformer/kernels/gen_relative_pos_bias.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/kernels/gen_relative_pos_bias.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifdef ENABLE_FP8
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#endif // ENABLE_FP8

namespace fastertransformer {
namespace turbomind {

void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
size_t* h_token_num,
Expand Down Expand Up @@ -111,4 +111,4 @@ template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param);
#endif // ENABLE_FP8

} // namespace fastertransformer
} // namespace turbomind
6 changes: 3 additions & 3 deletions src/turbomind/kernels/custom_ar_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
*/

#include "custom_ar_kernels.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_type_utils.cuh"

namespace fastertransformer {
namespace turbomind {

////////////////////////////////////////////////////////////////////////////////////////////////////

Expand Down Expand Up @@ -395,4 +395,4 @@ template void invokeOneOrTwoShotAllReduceKernel<__nv_bfloat16>(AllReduceParams<_
cudaStream_t stream);
#endif
template void invokeOneOrTwoShotAllReduceKernel<uint32_t>(AllReduceParams<uint32_t>& param, cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
Loading

0 comments on commit 35d6446

Please sign in to comment.