diff --git a/.github/workflows/linux-x64-gpu.yml b/.github/workflows/linux-x64-gpu.yml
index d940408ce7..1670c9245a 100644
--- a/.github/workflows/linux-x64-gpu.yml
+++ b/.github/workflows/linux-x64-gpu.yml
@@ -52,5 +52,5 @@ jobs:
           source /opt/conda/bin/activate
           conda activate py38
           mkdir build && cd build
-          bash ../generate.sh
+          bash ../generate.sh make
           make -j$(nproc) && make install
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 53e0eb2471..27b6b150e7 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,7 +103,9 @@ if(USE_TRITONSERVER_DATATYPE)
 endif()
 
 set(CXX_STD "17" CACHE STRING "C++ standard")
-
+# enable gold linker for binary and .so
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=gold")
 set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
 
 set(TF_PATH "" CACHE STRING "TensorFlow path")
diff --git a/builder/manywheel/entrypoint_build.sh b/builder/manywheel/entrypoint_build.sh
index 8d1eb16de9..c9c2cae6e9 100755
--- a/builder/manywheel/entrypoint_build.sh
+++ b/builder/manywheel/entrypoint_build.sh
@@ -12,7 +12,7 @@ conda activate $PYTHON_VERSION
 
 cd lmdeploy
 mkdir -p build && cd build && rm -rf *
-bash ../generate.sh
+bash ../generate.sh make
 make -j$(nproc) && make install
 if [ $? != 0 ]; then
     echo "build failed"
diff --git a/docs/en/build.md b/docs/en/build.md
index cb278073c9..b2de1d34b6 100644
--- a/docs/en/build.md
+++ b/docs/en/build.md
@@ -67,10 +67,11 @@ Then, follow the steps below to set up the compilation environment:
   ```
 - build and install lmdeploy libraries:
   ```shell
+  apt install ninja-build # install ninja
   cd lmdeploy # the home folder of lmdeploy
   mkdir build && cd build
   sh ../generate.sh
-  make -j$(nproc) && make install
+  ninja -j$(nproc) && ninja install
   ```
 - install lmdeploy python package:
   ```shell
diff --git a/docs/zh_cn/build.md b/docs/zh_cn/build.md
index a73296354b..2d3b329b62 100644
--- a/docs/zh_cn/build.md
+++ b/docs/zh_cn/build.md
@@ -67,10 +67,11 @@ wheel 文件存放在目录 `builder/manywheel/cuda11.8_dist` 下。
   ```
 - lmdeploy 编译安装:
   ```shell
+  apt install ninja-build # 安装更快的 Ninja
   cd lmdeploy # lmdeploy 源码的根目录
   mkdir build && cd build
   sh ../generate.sh
-  make -j$(nproc) && make install
+  ninja -j$(nproc) && ninja install
   ```
 - 安装 lmdeploy python package:
   ```shell
diff --git a/generate.sh b/generate.sh
index 5e09688663..6648d2e22a 100755
--- a/generate.sh
+++ b/generate.sh
@@ -1,6 +1,12 @@
 #!/bin/sh
 
-cmake .. \
+builder="-G Ninja"
+
+if [ "$1" = "make" ]; then
+    builder=""
+fi
+
+cmake ${builder} .. \
     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
     -DCMAKE_INSTALL_PREFIX=./install \
diff --git a/src/turbomind/models/llama/flash_attention2/CMakeLists.txt b/src/turbomind/models/llama/flash_attention2/CMakeLists.txt
index 9f527d7d1a..1a1fe37eaa 100644
--- a/src/turbomind/models/llama/flash_attention2/CMakeLists.txt
+++ b/src/turbomind/models/llama/flash_attention2/CMakeLists.txt
@@ -4,10 +4,10 @@ project(flash_attention2)
 
 add_library(${PROJECT_NAME} STATIC
     flash_api.cpp
-    flash_fwd_hdim32_fp16_sm80.cu
-    flash_fwd_hdim64_fp16_sm80.cu
+    # flash_fwd_hdim32_fp16_sm80.cu
+    # flash_fwd_hdim64_fp16_sm80.cu
     flash_fwd_hdim128_fp16_sm80.cu
-    flash_fwd_hdim256_fp16_sm80.cu
+    # flash_fwd_hdim256_fp16_sm80.cu
 )
 target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR}/include)
 target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass)
diff --git a/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h b/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h
index 4a94da08b2..c24a9da9e6 100644
--- a/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h
+++ b/src/turbomind/models/llama/flash_attention2/flash_fwd_launch_template.h
@@ -57,6 +57,7 @@ void run_flash_fwd(Flash_fwd_params& params, cudaStream_t stream)
     });
 }
 
+#if 0
 template<typename T>
 void run_mha_fwd_hdim32(Flash_fwd_params& params, cudaStream_t stream)
 {
@@ -94,6 +95,7 @@ void run_mha_fwd_hdim64(Flash_fwd_params& params, cudaStream_t stream)
         }
     });
 }
+#endif
 
 template<typename T>
 void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream)
@@ -139,6 +141,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream)
     });
 }
 
+#if 0
 template<typename T>
 void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream)
 {
@@ -168,3 +171,4 @@ void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream)
         // Is_causal>(params, stream);
     });
 }
+#endif
diff --git a/src/turbomind/models/llama/flash_attention2/static_switch.h b/src/turbomind/models/llama/flash_attention2/static_switch.h
index bf4a9195ea..fd19a0ea61 100644
--- a/src/turbomind/models/llama/flash_attention2/static_switch.h
+++ b/src/turbomind/models/llama/flash_attention2/static_switch.h
@@ -38,6 +38,7 @@
         } \
     }()
 
+#if 0
 #define FWD_HEADDIM_SWITCH(HEADDIM, ...) \
     [&] { \
         if (HEADDIM <= 32) { \
@@ -57,3 +58,10 @@
             return __VA_ARGS__(); \
         } \
     }()
+#else
+#define FWD_HEADDIM_SWITCH(HEADDIM, ...) \
+    [&] { \
+        constexpr static int kHeadDim = 128; \
+        return __VA_ARGS__(); \
+    }()
+#endif
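Usage sketch of the two build flows set up above, composed only of commands that appear in this diff (docs/en/build.md and the CI workflow); it is an illustration, not an additional change:

    # Ninja flow: generate.sh now defaults to the Ninja generator (docs/en/build.md)
    apt install ninja-build
    mkdir build && cd build
    sh ../generate.sh
    ninja -j$(nproc) && ninja install

    # Makefile flow: pass "make" to keep the previous generator, as done by
    # .github/workflows/linux-x64-gpu.yml and builder/manywheel/entrypoint_build.sh
    mkdir build && cd build
    bash ../generate.sh make
    make -j$(nproc) && make install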