refactor: arg --no-flash-attn

Signed-off-by: thxCode <[email protected]>
thxCode committed Feb 10, 2025
1 parent 8f5038c commit 201eb57
Showing 2 changed files with 13 additions and 0 deletions.
1 change: 1 addition & 0 deletions cmd/gguf-parser/README.md
@@ -31,6 +31,7 @@ GLOBAL OPTIONS:
 --flash-attention, --flash-attn, --fa, --diffusion-fa    Specify enabling Flash Attention, which is used to estimate the usage. Flash Attention can reduce the usage of RAM/VRAM. (default: false)
 --gpu-layers value, --ngl value, --n-gpu-layers value    Specify how many layers of the main model to offload, which is used to estimate the usage, default is full offloaded. (default: -1)
 --main-gpu value, --mg value                             Specify the GPU to use for the model (with "--split-mode=none") or for intermediate results and KV (with "--split-mode=row"), which is used to estimate the usage. Since gguf-parser cannot recognize the host GPU devices or RPC servers, "--main-gpu" only works when "--tensor-split" is set. (default: 0)
+--no-flash-attention, --no-flash-attn                    Specify disabling Flash Attention. (default: false)
 --parallel-size value, --parallel value, --np value      Specify the number of parallel sequences to decode, which is used to estimate the usage. (default: 1)
 --platform-footprint value                               Specify the platform footprint (RAM,VRAM) of the running host in MiB, which is used to estimate the NonUMA usage, default is "150,250". Different platforms always get different RAM and VRAM footprints, for example, within CUDA, "cudaMemGetInfo" or "cudaSetDevice" would occupy some RAM and VRAM, see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. (default: "150,250")
 --rpc value                                              Specify the RPC servers, which is used to estimate the usage, it is a comma-separated list of host:port. Works with "--tensor-split".
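The new flag pairs with `--flash-attention` above: because the negation is applied through a post-parse hook (see the main.go hunk below), `--no-flash-attn` should take precedence over `--flash-attention` and its aliases regardless of argument order. An illustrative invocation (the model path is hypothetical):

```sh
# Flash Attention gets enabled via the --diffusion-fa alias, then
# explicitly disabled; the usage estimate runs with it off.
gguf-parser --path ./model.gguf --diffusion-fa --no-flash-attn
```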
12 changes: 12 additions & 0 deletions cmd/gguf-parser/main.go
@@ -505,6 +505,18 @@ func main() {
"which is used to estimate the usage. " +
"Flash Attention can reduce the usage of RAM/VRAM.",
},
&cli.BoolFlag{ // LLaMABox compatibility
Category: "Estimate",
Name: "no-flash-attention",
Aliases: []string{
"no-flash-attn",
},
Usage: "Specify disabling Flash Attention.",
Action: func(context *cli.Context, b bool) error {
flashAttention = !b
return nil
},
},
&cli.UintFlag{
Destination: &mainGPU,
Value: mainGPU,
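For context, urfave/cli runs a flag's `Action` hook after parsing, and only when that flag was actually set on the command line; that is what lets the negative flag override the shared `flashAttention` value written by `--flash-attention`. A minimal, self-contained sketch of the same pattern (the demo names are hypothetical, not the project's actual wiring):

```go
package main

import (
	"fmt"
	"log"
	"os"

	"github.com/urfave/cli/v2"
)

func main() {
	var flashAttention bool // shared destination, as in main.go above

	app := &cli.App{
		Name: "flagdemo", // hypothetical demo app
		Flags: []cli.Flag{
			&cli.BoolFlag{
				Name:        "flash-attn",
				Usage:       "enable Flash Attention",
				Destination: &flashAttention, // written during parsing
			},
			&cli.BoolFlag{
				Name:  "no-flash-attn",
				Usage: "disable Flash Attention",
				// Fires after parsing, and only if the flag was set, so it
				// overrides whatever --flash-attn wrote into the variable.
				Action: func(ctx *cli.Context, b bool) error {
					flashAttention = !b
					return nil
				},
			},
		},
		Action: func(ctx *cli.Context) error {
			fmt.Println("flashAttention =", flashAttention)
			return nil
		},
	}

	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}
```

Running `flagdemo --flash-attn --no-flash-attn` should print `flashAttention = false` in either argument order, since the `Action` hook only runs after all flags have been parsed.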