From 70e6ab26ba3791c351f948a20b86967eb46f693b Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Sat, 1 Jul 2023 22:12:23 +0800 Subject: [PATCH] Change target tritonfastertransformerbackend to trtonturbomindbackend (#36) * change target tritonfastertransformerbackend to tritonturbomindbackend * install targets to backends/turbomind * changge model_dir --- CMakeLists.txt | 4 +- README.md | 8 +-- README_zh-CN.md | 8 +-- ... => TritonTurboMindBackendConfig.cmake.in} | 0 examples/cpp/llama/llama_config.ini | 4 +- lmdeploy/serve/turbomind/deploy.py | 2 +- lmdeploy/serve/turbomind/service_docker_up.sh | 4 +- .../triton_models/interactive/config.pbtxt | 2 +- src/turbomind/models/llama/llama_utils.cu | 2 +- src/turbomind/triton_backend/CMakeLists.txt | 66 +++++++++---------- .../triton_backend/libfastertransformer.cc | 12 ++-- 11 files changed, 56 insertions(+), 56 deletions(-) rename cmake/{TritonFasterTransformerBackendConfig.cmake.in => TritonTurboMindBackendConfig.cmake.in} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index d333e64533..55ec3e8431 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -374,8 +374,8 @@ install( transformer-shared EXPORT transformer-shared-targets - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind + ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind ) install( diff --git a/README.md b/README.md index e1836afa7a..21b9255a7e 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ Run one of the following commands to serve a LLaMA model on NVIDIA GPU server: ```shell python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \ --tokenizer_path /path/to/tokenizer/model -bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer +bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind ``` @@ -111,7 +111,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast ```shell python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \ --tokenizer_path /path/to/tokenizer/model --tp 2 -bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer +bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind ``` @@ -129,7 +129,7 @@ python3 -m fastchat.model.apply_delta \ --delta-path lmsys/vicuna-7b-delta-v1.1 python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf -bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer +bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind ``` @@ -145,7 +145,7 @@ python3 -m fastchat.model.apply_delta \ --delta-path lmsys/vicuna-13b-delta-v1.1 python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf -bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer +bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind ``` diff --git a/README_zh-CN.md b/README_zh-CN.md index dcd053cf8b..c3c7cccbc8 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -98,7 +98,7 @@ make -j$(nproc) && make install ```shell python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \ --tokenizer_path /path/to/tokenizer/model -bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer +bash 
workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind ``` @@ -109,7 +109,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast ```shell python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \ --tokenizer_path /path/to/tokenizer/model --tp 2 -bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer +bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind ``` @@ -127,7 +127,7 @@ python3 -m fastchat.model.apply_delta \ --delta-path lmsys/vicuna-7b-delta-v1.1 python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf -bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer +bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind ``` @@ -143,7 +143,7 @@ python3 -m fastchat.model.apply_delta \ --delta-path lmsys/vicuna-13b-delta-v1.1 python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf -bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer +bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind ``` diff --git a/cmake/TritonFasterTransformerBackendConfig.cmake.in b/cmake/TritonTurboMindBackendConfig.cmake.in similarity index 100% rename from cmake/TritonFasterTransformerBackendConfig.cmake.in rename to cmake/TritonTurboMindBackendConfig.cmake.in diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini index af768f015c..fd05f21799 100644 --- a/examples/cpp/llama/llama_config.ini +++ b/examples/cpp/llama/llama_config.ini @@ -2,8 +2,8 @@ data_type=fp16 enable_custom_all_reduce=0 pipeline_para_size=1 -tensor_para_size=8 -model_dir=/shared_data/chatpjlm-0/v0.2.3/fastertransformer/weights/ +tensor_para_size=1 +model_dir=/workspace/models/triton_models/weights/ [request] diff --git a/lmdeploy/serve/turbomind/deploy.py b/lmdeploy/serve/turbomind/deploy.py index 285cec3c63..26a1db1ad1 100644 --- a/lmdeploy/serve/turbomind/deploy.py +++ b/lmdeploy/serve/turbomind/deploy.py @@ -227,7 +227,7 @@ def get_param(_name, _size): del ckpt for name, param in model_params.items(): - # transpose all weights as FasterTransformer is expecting column-major + # transpose all weights as TurboMind is expecting column-major # weights: (output_dims, input_dims) -> (input_dims, output_dims) key = name.split('.')[-2] if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']: diff --git a/lmdeploy/serve/turbomind/service_docker_up.sh b/lmdeploy/serve/turbomind/service_docker_up.sh index b182fad13d..d45345e616 100644 --- a/lmdeploy/serve/turbomind/service_docker_up.sh +++ b/lmdeploy/serve/turbomind/service_docker_up.sh @@ -5,7 +5,7 @@ show_help() { echo echo "Options:" echo " -h, --help Show this help message and exit" - echo " --lib-dir Specify the directory of fastertransformer libraries" + echo " --lib-dir Specify the directory of turbomind libraries" } # check if '-h' or '--help' in the arguments @@ -64,7 +64,7 @@ for ((i = 1; i <= $#; i++)); do docker run \ --gpus $DEVICES \ --rm \ - -v "${LIB_PATH}":/opt/tritonserver/backends/fastertransformer \ + -v "${LIB_PATH}":/opt/tritonserver/backends/turbomind \ -v ""${SCRIPT_ABS_DIR}"":/workspace/models \ --shm-size 16g \ -p 33336:22 \ diff --git a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt index 4a669e0f5b..003881ce43 100644 --- 
a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt +++ b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. name: "turbomind" -backend: "fastertransformer" +backend: "turbomind" default_model_filename: "weights" max_batch_size: 1 diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index 6d4cdb8113..7050d2d13f 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -148,7 +148,7 @@ size_t curandStateGetSize() bool isDebug() { static const bool is_debug = [] { - const auto level = std::getenv("FT_DEBUG_LEVEL"); + const auto level = std::getenv("TM_DEBUG_LEVEL"); if (level && level == std::string("DEBUG")) { return true; } diff --git a/src/turbomind/triton_backend/CMakeLists.txt b/src/turbomind/triton_backend/CMakeLists.txt index 29779b40e8..4307b88e83 100644 --- a/src/turbomind/triton_backend/CMakeLists.txt +++ b/src/turbomind/triton_backend/CMakeLists.txt @@ -26,7 +26,7 @@ cmake_minimum_required (VERSION 3.18) -project(tritonfastertransformerbackend LANGUAGES C CXX) +project(tritonturbomindbackend LANGUAGES C CXX) # # Options @@ -89,12 +89,12 @@ endif() # TRITON_ENABLE_GPU configure_file(libtriton_fastertransformer.ldscript libtriton_fastertransformer.ldscript COPYONLY) add_library( - triton-fastertransformer-backend SHARED + triton-turbomind-backend SHARED libfastertransformer.cc ) add_library( - TritonFasterTransformerBackend::triton-fastertransformer-backend ALIAS triton-fastertransformer-backend + TritonTurboMindBackend::triton-turbomind-backend ALIAS triton-turbomind-backend ) find_package(CUDAToolkit REQUIRED) @@ -106,13 +106,13 @@ endif() set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}) -target_compile_definitions(triton-fastertransformer-backend +target_compile_definitions(triton-turbomind-backend PUBLIC USE_TRITONSERVER_DATATYPE BUILD_MULTI_GPU) target_include_directories( - triton-fastertransformer-backend + triton-turbomind-backend PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src ${TRITON_PYTORCH_INCLUDE_PATHS} @@ -123,31 +123,31 @@ target_include_directories( ) target_link_directories( - triton-fastertransformer-backend + triton-turbomind-backend PRIVATE ${CUDA_PATH}/lib64 ) -target_compile_features(triton-fastertransformer-backend PRIVATE cxx_std_14) +target_compile_features(triton-turbomind-backend PRIVATE cxx_std_14) target_compile_options( - triton-fastertransformer-backend PRIVATE + triton-turbomind-backend PRIVATE $<$,$,$>: -Wall -Wextra -Wno-unused-parameter -Wno-type-limits >#-Werror> ) if(${TRITON_ENABLE_GPU}) target_compile_definitions( - triton-fastertransformer-backend + triton-turbomind-backend PRIVATE TRITON_ENABLE_GPU=1 ) endif() # TRITON_ENABLE_GPU set_target_properties( - triton-fastertransformer-backend + triton-turbomind-backend PROPERTIES POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_fastertransformer + OUTPUT_NAME triton_turbomind SKIP_BUILD_RPATH TRUE BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH FALSE @@ -159,7 +159,7 @@ set_target_properties( # Need to turn off unused-but-set-variable due to Torchvision # Need to turn off unknown-pragmas due to ATen OpenMP set_target_properties( - triton-fastertransformer-backend + triton-turbomind-backend PROPERTIES COMPILE_FLAGS "-Wno-unknown-pragmas -Wno-unused-but-set-variable" ) @@ -170,7 +170,7 @@ FOREACH(p ${TRITON_PYTORCH_LIB_PATHS}) ENDFOREACH(p) target_link_libraries( - triton-fastertransformer-backend + 
triton-turbomind-backend PRIVATE triton-core-serverapi # from repo-core triton-core-backendapi # from repo-core @@ -186,23 +186,23 @@ target_link_libraries( if (BUILD_MULTI_GPU) target_compile_definitions( - triton-fastertransformer-backend + triton-turbomind-backend PUBLIC BUILD_MULTI_GPU ) target_include_directories( - triton-fastertransformer-backend + triton-turbomind-backend PRIVATE ${MPI_INCLUDE_PATH} ) target_link_directories( - triton-fastertransformer-backend + triton-turbomind-backend PRIVATE ${MPI_Libraries} /usr/local/mpi/lib ) target_link_libraries( - triton-fastertransformer-backend + triton-turbomind-backend PRIVATE ${NCCL_LIBRARIES} ${MPI_LIBRARIES} @@ -211,7 +211,7 @@ endif() if(${TRITON_ENABLE_GPU}) target_link_libraries( - triton-fastertransformer-backend + triton-turbomind-backend PRIVATE CUDA::cudart ) @@ -221,38 +221,38 @@ endif() # TRITON_ENABLE_GPU # Install # include(GNUInstallDirs) -set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonFasterTransformerBackend) +set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TurboMindBackend) install( TARGETS - triton-fastertransformer-backend + triton-turbomind-backend EXPORT - triton-fastertransformer-backend-targets - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer + triton-turbomind-backend-targets + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind + ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind ) install( EXPORT - triton-fastertransformer-backend-targets + triton-turbomind-backend-targets FILE - TritonFasterTransformerBackendTargets.cmake + TritonTurboMindBackendTargets.cmake NAMESPACE - TritonFasterTransformerBackend:: + TritonTurboMindBackend:: DESTINATION ${INSTALL_CONFIGDIR} ) include(CMakePackageConfigHelpers) configure_package_config_file( - ${CMAKE_SOURCE_DIR}/cmake/TritonFasterTransformerBackendConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendConfig.cmake + ${CMAKE_SOURCE_DIR}/cmake/TritonTurboMindBackendConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/TritonTurboMindBackendConfig.cmake INSTALL_DESTINATION ${INSTALL_CONFIGDIR} ) install( FILES - ${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/TritonTurboMindBackendConfig.cmake DESTINATION ${INSTALL_CONFIGDIR} ) @@ -260,12 +260,12 @@ install( # Export from build tree # export( - EXPORT triton-fastertransformer-backend-targets - FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendTargets.cmake - NAMESPACE TritonFasterTransformerBackend:: + EXPORT triton-turbomind-backend-targets + FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonTurboMindBackendTargets.cmake + NAMESPACE TritonTurboMindBackend:: ) -export(PACKAGE TritonFasterTransformerBackend) +export(PACKAGE TritonTurboMindBackend) # Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. diff --git a/src/turbomind/triton_backend/libfastertransformer.cc b/src/turbomind/triton_backend/libfastertransformer.cc index 55b6ab6005..0ffb50863f 100644 --- a/src/turbomind/triton_backend/libfastertransformer.cc +++ b/src/turbomind/triton_backend/libfastertransformer.cc @@ -511,11 +511,11 @@ TRITONSERVER_Error* ModelState::AutoCompleteConfig() } } else { - // Auto-complete configuration is not supported since fastertransformer does + // Auto-complete configuration is not supported since turbomind does // not store/capture sufficient model metadata so just log error instead. 
LOG_MESSAGE(TRITONSERVER_LOG_WARN, (std::string("skipping model configuration auto-complete for '") + Name() - + "': not supported for fastertransformer backend") + + "': not supported for turbomind backend") .c_str()); } @@ -940,7 +940,7 @@ void ModelInstanceState::ProcessRequests(TRITONBACKEND_Request** requests, const request_count, TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, - std::string("null request given to FasterTransformer backend for '" + Name() + "'").c_str())); + std::string("null request given to TurboMind backend for '" + Name() + "'").c_str())); return; } @@ -1115,7 +1115,7 @@ void ModelInstanceState::ProcessRequests(TRITONBACKEND_Request** requests, const for (auto& response : responses) { if (response != nullptr) { LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), - "failed to send FasterTransformer backend response"); + "failed to send TurboMind backend response"); LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("response is sent")).c_str()); } else { @@ -1160,7 +1160,7 @@ void streaming_callback(std::shared_ptr> if (response != nullptr) { LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("start to send streaming response")).c_str()); LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, 0, nullptr), - "failed to send FasterTransformer backend response"); + "failed to send TurboMind backend response"); LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("streaming response is sent")).c_str()); } else { @@ -1358,7 +1358,7 @@ ModelInstanceState::Execute(std::vector* responses, response_count, TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, - ("FasterTransformer execute failure: " + std::string(ex.what())).c_str())); + ("TurboMind execute failure: " + std::string(ex.what())).c_str())); } auto output_tensors = output_tensors_list[0]; return output_tensors;
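With this change, the Triton backend library is installed to `backends/turbomind` (instead of `backends/fastertransformer`), built with `OUTPUT_NAME triton_turbomind`, and the `backend` field in `config.pbtxt` is renamed to match. A minimal usage sketch of the renamed layout, assuming the `build/install` prefix used in the README commands and that the shared library file is named `libtriton_turbomind.so` (inferred from the `OUTPUT_NAME` property, not stated explicitly in the patch):

```shell
# Check that the renamed backend library landed in the expected install tree.
# The library file name below is an assumption derived from
# `OUTPUT_NAME triton_turbomind`; adjust if your build names it differently.
ls $(pwd)/build/install/backends/turbomind/libtriton_turbomind.so

# Point the serving script at the renamed backend directory, as in the updated
# README commands. service_docker_up.sh mounts this directory into the
# container at /opt/tritonserver/backends/turbomind.
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```

The directory name, the `triton_` library prefix, and the `backend: "turbomind"` entry in `config.pbtxt` have to stay consistent, since Triton resolves a backend named `X` by loading `backends/X/libtriton_X.so`; the patch updates all three together.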