diff --git a/.github/workflows/linux-x64-gpu.yml b/.github/workflows/linux-x64-gpu.yml
index d940408ce7..1670c9245a 100644
--- a/.github/workflows/linux-x64-gpu.yml
+++ b/.github/workflows/linux-x64-gpu.yml
@@ -52,5 +52,5 @@ jobs:
           source /opt/conda/bin/activate
           conda activate py38
           mkdir build && cd build
-          bash ../generate.sh
+          bash ../generate.sh make
           make -j$(nproc) && make install
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 53e0eb2471..27b6b150e7 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,7 +103,9 @@ if(USE_TRITONSERVER_DATATYPE)
 endif()
 
 set(CXX_STD "17" CACHE STRING "C++ standard")
-
+# enable gold linker for binary and .so
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=gold")
 set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
 
 set(TF_PATH "" CACHE STRING "TensorFlow path")
diff --git a/builder/manywheel/entrypoint_build.sh b/builder/manywheel/entrypoint_build.sh
index 8d1eb16de9..c9c2cae6e9 100755
--- a/builder/manywheel/entrypoint_build.sh
+++ b/builder/manywheel/entrypoint_build.sh
@@ -12,7 +12,7 @@ conda activate $PYTHON_VERSION
 
 cd lmdeploy
 mkdir -p build && cd build && rm -rf *
-bash ../generate.sh
+bash ../generate.sh make
 make -j$(nproc) && make install
 if [ $? != 0 ]; then
     echo "build failed"
diff --git a/docs/en/build.md b/docs/en/build.md
index cb278073c9..b2de1d34b6 100644
--- a/docs/en/build.md
+++ b/docs/en/build.md
@@ -67,10 +67,11 @@ Then, follow the steps below to set up the compilation environment:
   ```
 - build and install lmdeploy libraries:
   ```shell
+  apt install ninja-build # install ninja
   cd lmdeploy # the home folder of lmdeploy
   mkdir build && cd build
   sh ../generate.sh
-  make -j$(nproc) && make install
+  ninja -j$(nproc) && ninja install
   ```
 - install lmdeploy python package:
   ```shell
diff --git a/docs/zh_cn/build.md b/docs/zh_cn/build.md
index a73296354b..2d3b329b62 100644
--- a/docs/zh_cn/build.md
+++ b/docs/zh_cn/build.md
@@ -67,10 +67,11 @@ wheel 文件存放在目录 `builder/manywheel/cuda11.8_dist` 下。
   ```
 - lmdeploy 编译安装:
   ```shell
+  apt install ninja-build # 安装更快的 Ninja
   cd lmdeploy # lmdeploy 源码的根目录
   mkdir build && cd build
   sh ../generate.sh
-  make -j$(nproc) && make install
+  ninja -j$(nproc) && ninja install
   ```
 - 安装 lmdeploy python package:
   ```shell
diff --git a/generate.sh b/generate.sh
index 5e09688663..6648d2e22a 100755
--- a/generate.sh
+++ b/generate.sh
@@ -1,6 +1,12 @@
 #!/bin/sh
 
-cmake .. \
+builder="-G Ninja"
+
+if [ "$1" = "make" ]; then
+    builder=""
+fi
+
+cmake ${builder} .. \
     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
     -DCMAKE_INSTALL_PREFIX=./install \
diff --git a/src/turbomind/models/llama/flash_attention2/CMakeLists.txt b/src/turbomind/models/llama/flash_attention2/CMakeLists.txt
index 9f527d7d1a..1a1fe37eaa 100644
--- a/src/turbomind/models/llama/flash_attention2/CMakeLists.txt
+++ b/src/turbomind/models/llama/flash_attention2/CMakeLists.txt
@@ -4,10 +4,10 @@ project(flash_attention2)
 
 add_library(${PROJECT_NAME} STATIC
     flash_api.cpp
-    flash_fwd_hdim32_fp16_sm80.cu
-    flash_fwd_hdim64_fp16_sm80.cu
+    # flash_fwd_hdim32_fp16_sm80.cu
+    # flash_fwd_hdim64_fp16_sm80.cu
     flash_fwd_hdim128_fp16_sm80.cu
-    flash_fwd_hdim256_fp16_sm80.cu
+    # flash_fwd_hdim256_fp16_sm80.cu
 )
 target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR}/include)
 target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass)
diff --git a/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h b/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h
index 4a94da08b2..c24a9da9e6 100644
--- a/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h
+++ b/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h
@@ -57,6 +57,7 @@ void run_flash_fwd(Flash_fwd_params& params, cudaStream_t stream)
     });
 }
 
+#if 0
 template<typename T>
 void run_mha_fwd_hdim32(Flash_fwd_params& params, cudaStream_t stream)
 {
@@ -94,6 +95,7 @@ void run_mha_fwd_hdim64(Flash_fwd_params& params, cudaStream_t stream)
         }
     });
 }
+#endif
 
 template<typename T>
 void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream)
@@ -139,6 +141,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream)
     });
 }
 
+#if 0
 template<typename T>
 void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream)
 {
@@ -168,3 +171,4 @@ void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream)
         // Is_causal>(params, stream);
     });
 }
+#endif
diff --git a/src/turbomind/models/llama/flash_attention2/static_switch.h b/src/turbomind/models/llama/flash_attention2/static_switch.h
index bf4a9195ea..fd19a0ea61 100644
--- a/src/turbomind/models/llama/flash_attention2/static_switch.h
+++ b/src/turbomind/models/llama/flash_attention2/static_switch.h
@@ -38,6 +38,7 @@
         } \
     }()
 
+#if 0
 #define FWD_HEADDIM_SWITCH(HEADDIM, ...) \
     [&] { \
         if (HEADDIM <= 32) { \
@@ -57,3 +58,10 @@
             return __VA_ARGS__(); \
         } \
     }()
+#else
+#define FWD_HEADDIM_SWITCH(HEADDIM, ...) \
+    [&] { \
+        constexpr static int kHeadDim = 128; \
+        return __VA_ARGS__(); \
+    }()
+#endif
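Usage sketch of the two build flows set up above, composed only of commands that appear in this diff (docs/en/build.md and the CI workflow); it is an illustration, not an additional change:

    # Ninja flow: generate.sh now defaults to the Ninja generator (docs/en/build.md)
    apt install ninja-build
    mkdir build && cd build
    sh ../generate.sh
    ninja -j$(nproc) && ninja install

    # Makefile flow: pass "make" to keep the previous generator, as done by
    # .github/workflows/linux-x64-gpu.yml and builder/manywheel/entrypoint_build.sh
    mkdir build && cd build
    bash ../generate.sh make
    make -j$(nproc) && make install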