From deeb43e76808a6420880795586792af59a64e939 Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Tue, 28 Nov 2023 17:15:13 +0800
Subject: [PATCH 1/5] feat(build): enable ninja and lld

---
 CMakeLists.txt      | 4 +++-
 docs/en/build.md    | 1 +
 docs/zh_cn/build.md | 1 +
 generate.sh         | 2 +-
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 53e0eb2471..e22deb7d94 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,7 +103,9 @@ if(USE_TRITONSERVER_DATATYPE)
 endif()
 
 set(CXX_STD "17" CACHE STRING "C++ standard")
-
+# enable the lld linker and link-time optimization for binaries and shared libraries
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld -flto=auto")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=lld -flto=auto")
 set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
 
 set(TF_PATH "" CACHE STRING "TensorFlow path")
diff --git a/docs/en/build.md b/docs/en/build.md
index cb278073c9..c9a52faf9e 100644
--- a/docs/en/build.md
+++ b/docs/en/build.md
@@ -67,6 +67,7 @@ Then, follow the steps below to set up the compilation environment:
   ```
 - build and install lmdeploy libraries:
   ```shell
+  apt install ninja-build lld # install ninja and lld linker
   cd lmdeploy # the home folder of lmdeploy
   mkdir build && cd build
   sh ../generate.sh
diff --git a/docs/zh_cn/build.md b/docs/zh_cn/build.md
index a73296354b..128d863e33 100644
--- a/docs/zh_cn/build.md
+++ b/docs/zh_cn/build.md
@@ -67,6 +67,7 @@ wheel 文件存放在目录 `builder/manywheel/cuda11.8_dist` 下。
   ```
 - lmdeploy 编译安装:
   ```shell
+  apt install ninja-build lld # 安装更快的 Ninja 和 lld 链接器
   cd lmdeploy # lmdeploy 源码的根目录
   mkdir build && cd build
   sh ../generate.sh
diff --git a/generate.sh b/generate.sh
index 5e09688663..fceebf13ac 100755
--- a/generate.sh
+++ b/generate.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-cmake .. \
+cmake -G Ninja .. \
     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
     -DCMAKE_INSTALL_PREFIX=./install \

From 2612e5e09adddfd615143136fa543646e854c330 Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Tue, 28 Nov 2023 17:27:06 +0800
Subject: [PATCH 2/5] fix(.github): add ninja installation

---
 .github/workflows/linux-x64-gpu.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/linux-x64-gpu.yml b/.github/workflows/linux-x64-gpu.yml
index d940408ce7..41875e2ea3 100644
--- a/.github/workflows/linux-x64-gpu.yml
+++ b/.github/workflows/linux-x64-gpu.yml
@@ -51,6 +51,7 @@ jobs:
           cd /work
           source /opt/conda/bin/activate
           conda activate py38
+          apt install ninja-build lld
           mkdir build && cd build
           bash ../generate.sh
           make -j$(nproc) && make install

From 96c513c503bb5cd4b3d6f83cd73f195b9c87cca2 Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Wed, 29 Nov 2023 13:03:32 +0800
Subject: [PATCH 3/5] fix(CI): remove dimsize=256

---
 .github/workflows/linux-x64-gpu.yml                    | 4 ++--
 CMakeLists.txt                                         | 6 +++---
 docs/en/build.md                                       | 2 +-
 docs/zh_cn/build.md                                    | 2 +-
 .../models/llama/flash_attention2/CMakeLists.txt       | 6 +++---
 .../llama/flash_attention2/flash_fwd_launch_template.h | 4 ++++
 .../models/llama/flash_attention2/static_switch.h      | 8 ++++++++
 7 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/linux-x64-gpu.yml b/.github/workflows/linux-x64-gpu.yml
index 41875e2ea3..3401a8ddd6 100644
--- a/.github/workflows/linux-x64-gpu.yml
+++ b/.github/workflows/linux-x64-gpu.yml
@@ -51,7 +51,7 @@ jobs:
           cd /work
           source /opt/conda/bin/activate
           conda activate py38
-          apt install ninja-build lld
+          apt install -y ninja-build lld
           mkdir build && cd build
           bash ../generate.sh
-          make -j$(nproc) && make install
+          ninja -j$(nproc) && ninja install
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e22deb7d94..27b6b150e7 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,9 +103,9 @@ if(USE_TRITONSERVER_DATATYPE)
 endif()
 
 set(CXX_STD "17" CACHE STRING "C++ standard")
-# enable the lld linker and link-time optimization for binaries and shared libraries
-set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld -flto=auto")
-set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=lld -flto=auto")
+# enable the gold linker for binaries and shared libraries
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=gold")
 set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
 
 set(TF_PATH "" CACHE STRING "TensorFlow path")
diff --git a/docs/en/build.md b/docs/en/build.md
index c9a52faf9e..afce8c4d8d 100644
--- a/docs/en/build.md
+++ b/docs/en/build.md
@@ -71,7 +71,7 @@ Then, follow the steps below to set up the compilation environment:
   cd lmdeploy # the home folder of lmdeploy
   mkdir build && cd build
   sh ../generate.sh
-  make -j$(nproc) && make install
+  ninja -j$(nproc) && ninja install
   ```
 - install lmdeploy python package:
   ```shell
diff --git a/docs/zh_cn/build.md b/docs/zh_cn/build.md
index 128d863e33..ac2f23f168 100644
--- a/docs/zh_cn/build.md
+++ b/docs/zh_cn/build.md
@@ -71,7 +71,7 @@ wheel 文件存放在目录 `builder/manywheel/cuda11.8_dist` 下。
   cd lmdeploy # lmdeploy 源码的根目录
   mkdir build && cd build
   sh ../generate.sh
-  make -j$(nproc) && make install
+  ninja -j$(nproc) && ninja install
   ```
 - 安装 lmdeploy python package:
   ```shell
diff --git a/src/turbomind/models/llama/flash_attention2/CMakeLists.txt b/src/turbomind/models/llama/flash_attention2/CMakeLists.txt
index 9f527d7d1a..1a1fe37eaa 100644
--- a/src/turbomind/models/llama/flash_attention2/CMakeLists.txt
+++ b/src/turbomind/models/llama/flash_attention2/CMakeLists.txt
@@ -4,10 +4,10 @@ project(flash_attention2)
 
 add_library(${PROJECT_NAME} STATIC
     flash_api.cpp
-    flash_fwd_hdim32_fp16_sm80.cu
-    flash_fwd_hdim64_fp16_sm80.cu
+    # flash_fwd_hdim32_fp16_sm80.cu
+    # flash_fwd_hdim64_fp16_sm80.cu
     flash_fwd_hdim128_fp16_sm80.cu
-    flash_fwd_hdim256_fp16_sm80.cu
+    # flash_fwd_hdim256_fp16_sm80.cu
 )
 target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR}/include)
 target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass)
diff --git a/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h b/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h
index 4a94da08b2..c24a9da9e6 100644
--- a/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h
+++ b/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h
@@ -57,6 +57,7 @@ void run_flash_fwd(Flash_fwd_params& params, cudaStream_t stream)
     });
 }
 
+#if 0
 template<typename T>
 void run_mha_fwd_hdim32(Flash_fwd_params& params, cudaStream_t stream)
 {
@@ -94,6 +95,7 @@ void run_mha_fwd_hdim64(Flash_fwd_params& params, cudaStream_t stream)
         }
     });
 }
+#endif
 
 template<typename T>
 void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream)
@@ -139,6 +141,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream)
     });
 }
 
+#if 0
 template<typename T>
 void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream)
 {
@@ -168,3 +171,4 @@ void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream)
         //              Is_causal>(params, stream);
     });
 }
+#endif
diff --git a/src/turbomind/models/llama/flash_attention2/static_switch.h b/src/turbomind/models/llama/flash_attention2/static_switch.h
index bf4a9195ea..fd19a0ea61 100644
--- a/src/turbomind/models/llama/flash_attention2/static_switch.h
+++ b/src/turbomind/models/llama/flash_attention2/static_switch.h
@@ -38,6 +38,7 @@
         }                                                              \
     }()
 
+#if 0
 #define FWD_HEADDIM_SWITCH(HEADDIM, ...)                               \
     [&] {                                                              \
         if (HEADDIM <= 32) {                                           \
@@ -57,3 +58,10 @@
             return __VA_ARGS__();                                      \
         }                                                              \
     }()
+#else
+#define FWD_HEADDIM_SWITCH(HEADDIM, ...)                               \
+    [&] {                                                              \
+        constexpr static int kHeadDim = 128;                           \
+        return __VA_ARGS__();                                          \
+    }()
+#endif

From 66fab2eb24f3e4430f4f244630dab1deb21bf561 Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Wed, 29 Nov 2023 13:49:57 +0800
Subject: [PATCH 4/5] fix(CI): add option for generate.sh

---
 .github/workflows/linux-x64-gpu.yml | 5 ++---
 generate.sh                         | 8 +++++++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/linux-x64-gpu.yml b/.github/workflows/linux-x64-gpu.yml
index 3401a8ddd6..1670c9245a 100644
--- a/.github/workflows/linux-x64-gpu.yml
+++ b/.github/workflows/linux-x64-gpu.yml
@@ -51,7 +51,6 @@ jobs:
           cd /work
           source /opt/conda/bin/activate
           conda activate py38
-          apt install -y ninja-build lld
           mkdir build && cd build
-          bash ../generate.sh
-          ninja -j$(nproc) && ninja install
+          bash ../generate.sh make
+          make -j$(nproc) && make install
diff --git a/generate.sh b/generate.sh
index fceebf13ac..6648d2e22a 100755
--- a/generate.sh
+++ b/generate.sh
@@ -1,6 +1,12 @@
 #!/bin/sh
 
-cmake -G Ninja .. \
+builder="-G Ninja"
+
+if [ "$1" = "make" ]; then
+    builder=""
+fi
+
+cmake ${builder} .. \
     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
     -DCMAKE_INSTALL_PREFIX=./install \

From df928a94e280a5ca89fd972fb192ba3f613b2214 Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Wed, 29 Nov 2023 15:05:19 +0800
Subject: [PATCH 5/5] fix(docs): update

---
 builder/manywheel/entrypoint_build.sh | 2 +-
 docs/en/build.md                      | 2 +-
 docs/zh_cn/build.md                   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/builder/manywheel/entrypoint_build.sh b/builder/manywheel/entrypoint_build.sh
index 8d1eb16de9..c9c2cae6e9 100755
--- a/builder/manywheel/entrypoint_build.sh
+++ b/builder/manywheel/entrypoint_build.sh
@@ -12,7 +12,7 @@ conda activate $PYTHON_VERSION
 
 cd lmdeploy
 mkdir -p build && cd build && rm -rf *
-bash ../generate.sh
+bash ../generate.sh make
 make -j$(nproc) && make install
 if [ $? != 0 ]; then
     echo "build failed"
diff --git a/docs/en/build.md b/docs/en/build.md
index afce8c4d8d..b2de1d34b6 100644
--- a/docs/en/build.md
+++ b/docs/en/build.md
@@ -67,7 +67,7 @@ Then, follow the steps below to set up the compilation environment:
   ```
 - build and install lmdeploy libraries:
   ```shell
-  apt install ninja-build lld # install ninja and lld linker
+  apt install ninja-build # install ninja
   cd lmdeploy # the home folder of lmdeploy
   mkdir build && cd build
   sh ../generate.sh
diff --git a/docs/zh_cn/build.md b/docs/zh_cn/build.md
index ac2f23f168..2d3b329b62 100644
--- a/docs/zh_cn/build.md
+++ b/docs/zh_cn/build.md
@@ -67,7 +67,7 @@ wheel 文件存放在目录 `builder/manywheel/cuda11.8_dist` 下。
   ```
 - lmdeploy 编译安装:
   ```shell
-  apt install ninja-build lld # 安装更快的 Ninja 和 lld 链接器
+  apt install ninja-build # 安装更快的 Ninja
  cd lmdeploy # lmdeploy 源码的根目录
   mkdir build && cd build
   sh ../generate.sh
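
Taken together, the series leaves generate.sh defaulting to the Ninja generator, with a Makefile fallback selected by passing "make"; CI and the manylinux wheel builder use the fallback, while the build docs keep the Ninja path. A minimal sketch of both invocations (assuming an Ubuntu/Debian environment where apt is available):

```shell
# Ninja path (default), as documented in docs/en/build.md
apt install -y ninja-build
cd lmdeploy && mkdir build && cd build
bash ../generate.sh
ninja -j$(nproc) && ninja install

# Makefile fallback, as used by CI and builder/manywheel/entrypoint_build.sh
bash ../generate.sh make
make -j$(nproc) && make install
```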
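
One portability detail in generate.sh: the script declares #!/bin/sh, and inside [ ] the POSIX string-comparison operator is =; == is a bashism that dash (the default /bin/sh on Debian/Ubuntu) rejects, which is why the option check reads [ "$1" = "make" ]. A quick demonstration, assuming dash is installed:

```shell
# '=' is the portable form; dash aborts on '==' with "unexpected operator"
dash -c '[ "make" = "make" ] && echo "= is portable"'
dash -c '[ "make" == "make" ] && echo unreachable' || echo '"==" rejected by dash'
```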
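
Because the linker choice changes mid-series (lld with -flto=auto in PATCH 1, gold in PATCH 3), it can be worth confirming which linker actually produced an artifact. The sketch below uses a placeholder binary path rather than a file named by these patches; GNU gold normally emits a .note.gnu.gold-version section, and ELF lld typically stamps "Linker: LLD" into .comment:

```shell
BIN=./install/bin/some_binary                # placeholder path, not from the patches
readelf -S "$BIN" | grep -i gold-version     # section present if linked by GNU gold
readelf -p .comment "$BIN" | grep -i lld     # "Linker: LLD ..." if linked by lld
```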