From deeb43e76808a6420880795586792af59a64e939 Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Tue, 28 Nov 2023 17:15:13 +0800
Subject: [PATCH 1/5] feat(build): enable ninja and lld

---
 CMakeLists.txt      | 4 +++-
 docs/en/build.md    | 1 +
 docs/zh_cn/build.md | 1 +
 generate.sh         | 2 +-
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 53e0eb2471..e22deb7d94 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,7 +103,9 @@ if(USE_TRITONSERVER_DATATYPE)
 endif()
 
 set(CXX_STD "17" CACHE STRING "C++ standard")
-
+# enable the lld linker and link-time optimization for binaries and shared libraries
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld -flto=auto")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=lld -flto=auto")
 set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
 
 set(TF_PATH "" CACHE STRING "TensorFlow path")
diff --git a/docs/en/build.md b/docs/en/build.md
index cb278073c9..c9a52faf9e 100644
--- a/docs/en/build.md
+++ b/docs/en/build.md
@@ -67,6 +67,7 @@ Then, follow the steps below to set up the compilation environment:
   ```
 - build and install lmdeploy libraries:
   ```shell
+  apt install ninja-build lld # install ninja and lld linker
   cd lmdeploy # the home folder of lmdeploy
   mkdir build && cd build
   sh ../generate.sh
diff --git a/docs/zh_cn/build.md b/docs/zh_cn/build.md
index a73296354b..128d863e33 100644
--- a/docs/zh_cn/build.md
+++ b/docs/zh_cn/build.md
@@ -67,6 +67,7 @@ wheel 文件存放在目录 `builder/manywheel/cuda11.8_dist` 下。
   ```
 - lmdeploy 编译安装:
   ```shell
+  apt install ninja-build lld # 安装更快的 Ninja 和 lld 链接器
   cd lmdeploy # lmdeploy 源码的根目录
   mkdir build && cd build
   sh ../generate.sh
diff --git a/generate.sh b/generate.sh
index 5e09688663..fceebf13ac 100755
--- a/generate.sh
+++ b/generate.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-cmake .. \
+cmake -G Ninja .. \
     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
     -DCMAKE_INSTALL_PREFIX=./install \

From 2612e5e09adddfd615143136fa543646e854c330 Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Tue, 28 Nov 2023 17:27:06 +0800
Subject: [PATCH 2/5] fix(.github): add ninja installation

---
 .github/workflows/linux-x64-gpu.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/linux-x64-gpu.yml b/.github/workflows/linux-x64-gpu.yml
index d940408ce7..41875e2ea3 100644
--- a/.github/workflows/linux-x64-gpu.yml
+++ b/.github/workflows/linux-x64-gpu.yml
@@ -51,6 +51,7 @@ jobs:
           cd /work
           source /opt/conda/bin/activate
           conda activate py38
+          apt install ninja-build lld
           mkdir build && cd build
           bash ../generate.sh
           make -j$(nproc) && make install

From 96c513c503bb5cd4b3d6f83cd73f195b9c87cca2 Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Wed, 29 Nov 2023 13:03:32 +0800
Subject: [PATCH 3/5] fix(CI): remove dimsize=256

---
 .github/workflows/linux-x64-gpu.yml                    | 4 ++--
 CMakeLists.txt                                         | 6 +++---
 docs/en/build.md                                       | 2 +-
 docs/zh_cn/build.md                                    | 2 +-
 .../models/llama/flash_attention2/CMakeLists.txt       | 6 +++---
 .../llama/flash_attention2/flash_fwd_launch_template.h | 4 ++++
 .../models/llama/flash_attention2/static_switch.h      | 8 ++++++++
 7 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/linux-x64-gpu.yml b/.github/workflows/linux-x64-gpu.yml
index 41875e2ea3..3401a8ddd6 100644
--- a/.github/workflows/linux-x64-gpu.yml
+++ b/.github/workflows/linux-x64-gpu.yml
@@ -51,7 +51,7 @@ jobs:
           cd /work
           source /opt/conda/bin/activate
           conda activate py38
-          apt install ninja-build lld
+          apt install -y ninja-build lld
           mkdir build && cd build
           bash ../generate.sh
-          make -j$(nproc) && make install
+          ninja -j$(nproc) && ninja install
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e22deb7d94..27b6b150e7 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,9 +103,9 @@ if(USE_TRITONSERVER_DATATYPE)
 endif()
 
 set(CXX_STD "17" CACHE STRING "C++ standard")
-# enable the lld linker and link-time optimization for binaries and shared libraries
-set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld -flto=auto")
-set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=lld -flto=auto")
+# enable the gold linker for binaries and shared libraries
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=gold")
 set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
 
 set(TF_PATH "" CACHE STRING "TensorFlow path")
diff --git a/docs/en/build.md b/docs/en/build.md
index c9a52faf9e..afce8c4d8d 100644
--- a/docs/en/build.md
+++ b/docs/en/build.md
@@ -71,7 +71,7 @@ Then, follow the steps below to set up the compilation environment:
   cd lmdeploy # the home folder of lmdeploy
   mkdir build && cd build
   sh ../generate.sh
-  make -j$(nproc) && make install
+  ninja -j$(nproc) && ninja install
   ```
 - install lmdeploy python package:
   ```shell
diff --git a/docs/zh_cn/build.md b/docs/zh_cn/build.md
index 128d863e33..ac2f23f168 100644
--- a/docs/zh_cn/build.md
+++ b/docs/zh_cn/build.md
@@ -71,7 +71,7 @@ wheel 文件存放在目录 `builder/manywheel/cuda11.8_dist` 下。
   cd lmdeploy # lmdeploy 源码的根目录
   mkdir build && cd build
   sh ../generate.sh
-  make -j$(nproc) && make install
+  ninja -j$(nproc) && ninja install
   ```
 - 安装 lmdeploy python package:
   ```shell
diff --git a/src/turbomind/models/llama/flash_attention2/CMakeLists.txt b/src/turbomind/models/llama/flash_attention2/CMakeLists.txt
index 9f527d7d1a..1a1fe37eaa 100644
--- a/src/turbomind/models/llama/flash_attention2/CMakeLists.txt
+++ b/src/turbomind/models/llama/flash_attention2/CMakeLists.txt
@@ -4,10 +4,10 @@ project(flash_attention2)
 
 add_library(${PROJECT_NAME} STATIC
     flash_api.cpp
-    flash_fwd_hdim32_fp16_sm80.cu
-    flash_fwd_hdim64_fp16_sm80.cu
+    # flash_fwd_hdim32_fp16_sm80.cu
+    # flash_fwd_hdim64_fp16_sm80.cu
     flash_fwd_hdim128_fp16_sm80.cu
-    flash_fwd_hdim256_fp16_sm80.cu
+    # flash_fwd_hdim256_fp16_sm80.cu
 )
 target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR}/include)
 target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass)
diff --git a/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h b/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h
index 4a94da08b2..c24a9da9e6 100644
--- a/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h
+++ b/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h
@@ -57,6 +57,7 @@ void run_flash_fwd(Flash_fwd_params& params, cudaStream_t stream)
     });
 }
 
+#if 0
 template<typename T>
 void run_mha_fwd_hdim32(Flash_fwd_params& params, cudaStream_t stream)
 {
@@ -94,6 +95,7 @@ void run_mha_fwd_hdim64(Flash_fwd_params& params, cudaStream_t stream)
         }
     });
 }
+#endif
 
 template<typename T>
 void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream)
@@ -139,6 +141,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream)
     });
 }
 
+#if 0
 template<typename T>
 void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream)
 {
@@ -168,3 +171,4 @@ void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream)
         //              Is_causal>(params, stream);
     });
 }
+#endif
diff --git a/src/turbomind/models/llama/flash_attention2/static_switch.h b/src/turbomind/models/llama/flash_attention2/static_switch.h
index bf4a9195ea..fd19a0ea61 100644
--- a/src/turbomind/models/llama/flash_attention2/static_switch.h
+++ b/src/turbomind/models/llama/flash_attention2/static_switch.h
@@ -38,6 +38,7 @@
         }                                                              \
     }()
 
+#if 0
 #define FWD_HEADDIM_SWITCH(HEADDIM, ...)                               \
     [&] {                                                              \
         if (HEADDIM <= 32) {                                           \
@@ -57,3 +58,10 @@
             return __VA_ARGS__();                                      \
         }                                                              \
     }()
+#else
+#define FWD_HEADDIM_SWITCH(HEADDIM, ...)                               \
+    [&] {                                                              \
+        constexpr static int kHeadDim = 128;                           \
+        return __VA_ARGS__();                                          \
+    }()
+#endif

From 66fab2eb24f3e4430f4f244630dab1deb21bf561 Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Wed, 29 Nov 2023 13:49:57 +0800
Subject: [PATCH 4/5] fix(CI): add option for generate.sh

---
 .github/workflows/linux-x64-gpu.yml | 5 ++---
 generate.sh                         | 8 +++++++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/linux-x64-gpu.yml b/.github/workflows/linux-x64-gpu.yml
index 3401a8ddd6..1670c9245a 100644
--- a/.github/workflows/linux-x64-gpu.yml
+++ b/.github/workflows/linux-x64-gpu.yml
@@ -51,7 +51,6 @@ jobs:
           cd /work
           source /opt/conda/bin/activate
           conda activate py38
-          apt install -y ninja-build lld
           mkdir build && cd build
-          bash ../generate.sh
-          ninja -j$(nproc) && ninja install
+          bash ../generate.sh make
+          make -j$(nproc) && make install
diff --git a/generate.sh b/generate.sh
index fceebf13ac..6648d2e22a 100755
--- a/generate.sh
+++ b/generate.sh
@@ -1,6 +1,12 @@
 #!/bin/sh
 
-cmake -G Ninja .. \
+builder="-G Ninja"
+
+if [ "$1" = "make" ]; then
+    builder=""
+fi
+
+cmake ${builder} .. \
     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
     -DCMAKE_INSTALL_PREFIX=./install \

From df928a94e280a5ca89fd972fb192ba3f613b2214 Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Wed, 29 Nov 2023 15:05:19 +0800
Subject: [PATCH 5/5] fix(docs): update

---
 builder/manywheel/entrypoint_build.sh | 2 +-
 docs/en/build.md                      | 2 +-
 docs/zh_cn/build.md                   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/builder/manywheel/entrypoint_build.sh b/builder/manywheel/entrypoint_build.sh
index 8d1eb16de9..c9c2cae6e9 100755
--- a/builder/manywheel/entrypoint_build.sh
+++ b/builder/manywheel/entrypoint_build.sh
@@ -12,7 +12,7 @@ conda activate $PYTHON_VERSION
 
 cd lmdeploy
 mkdir -p build && cd build && rm -rf *
-bash ../generate.sh
+bash ../generate.sh make
 make -j$(nproc) && make install
 if [ $? != 0 ]; then
     echo "build failed"
diff --git a/docs/en/build.md b/docs/en/build.md
index afce8c4d8d..b2de1d34b6 100644
--- a/docs/en/build.md
+++ b/docs/en/build.md
@@ -67,7 +67,7 @@ Then, follow the steps below to set up the compilation environment:
   ```
 - build and install lmdeploy libraries:
   ```shell
-  apt install ninja-build lld # install ninja and lld linker
+  apt install ninja-build # install ninja
   cd lmdeploy # the home folder of lmdeploy
   mkdir build && cd build
   sh ../generate.sh
diff --git a/docs/zh_cn/build.md b/docs/zh_cn/build.md
index ac2f23f168..2d3b329b62 100644
--- a/docs/zh_cn/build.md
+++ b/docs/zh_cn/build.md
@@ -67,7 +67,7 @@ wheel 文件存放在目录 `builder/manywheel/cuda11.8_dist` 下。
   ```
 - lmdeploy 编译安装:
   ```shell
-  apt install ninja-build lld # 安装更快的 Ninja 和 lld 链接器
+  apt install ninja-build # 安装更快的 Ninja
  cd lmdeploy # lmdeploy 源码的根目录
   mkdir build && cd build
   sh ../generate.sh
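
Taken together, the series leaves generate.sh defaulting to the Ninja generator, with a Makefile fallback selected by passing "make"; CI and the manylinux wheel builder use the fallback, while the build docs keep the Ninja path. A minimal sketch of both invocations (assuming an Ubuntu/Debian environment where apt is available):

```shell
# Ninja path (default), as documented in docs/en/build.md
apt install -y ninja-build
cd lmdeploy && mkdir build && cd build
bash ../generate.sh
ninja -j$(nproc) && ninja install

# Makefile fallback, as used by CI and builder/manywheel/entrypoint_build.sh
bash ../generate.sh make
make -j$(nproc) && make install
```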
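
One portability detail in generate.sh: the script declares #!/bin/sh, and inside [ ] the POSIX string-comparison operator is =; == is a bashism that dash (the default /bin/sh on Debian/Ubuntu) rejects, which is why the option check reads [ "$1" = "make" ]. A quick demonstration, assuming dash is installed:

```shell
# '=' is the portable form; dash aborts on '==' with "unexpected operator"
dash -c '[ "make" = "make" ] && echo "= is portable"'
dash -c '[ "make" == "make" ] && echo unreachable' || echo '"==" rejected by dash'
```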
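
Because the linker choice changes mid-series (lld with -flto=auto in PATCH 1, gold in PATCH 3), it can be worth confirming which linker actually produced an artifact. The sketch below uses a placeholder binary path rather than a file named by these patches; GNU gold normally emits a .note.gnu.gold-version section, and ELF lld typically stamps "Linker: LLD" into .comment:

```shell
BIN=./install/bin/some_binary                # placeholder path, not from the patches
readelf -S "$BIN" | grep -i gold-version     # section present if linked by GNU gold
readelf -p .comment "$BIN" | grep -i lld     # "Linker: LLD ..." if linked by lld
```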