
Commit f9a5600

[Bugfix] Fix GPTQ and GPTQ Marlin CPU Offloading (vllm-project#7225)
Parent: fd95e02

File tree (4 files changed: +33 -14 lines)

tests/basic_correctness/test_cpu_offload.py (+21 -4)
tests/utils.py (+3 -2)
vllm/model_executor/layers/quantization/gptq.py (+9 -7)
vllm/model_executor/layers/quantization/gptq_marlin.py (-1)

tests/basic_correctness/test_cpu_offload.py

+21 -4

@@ -22,11 +22,28 @@ def test_cpu_offload_fp8():
                          ["--cpu-offload-gb", "2"])
 
 
-@pytest.mark.skipif(not is_quant_method_supported("awq"),
-                    reason="awq is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="gptq_marlin is not supported on this GPU type.")
+def test_cpu_offload_gptq():
+    # Test GPTQ Marlin
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test GPTQ
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
+                         ["--quantization", "gptq"],
+                         ["--quantization", "gptq", "--cpu-offload-gb", "1"])
+
+
+@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
+                    reason="awq_marlin is not supported on this GPU type.")
 def test_cpu_offload_awq():
-    compare_two_settings("casperhansen/llama-3-8b-instruct-awq", [],
-                         ["--cpu-offload-gb", "2"])
+    # Test AWQ Marlin
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test AWQ
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ",
+                         ["--quantization", "awq"],
+                         ["--quantization", "awq", "--cpu-offload-gb", "1"])
 
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),

tests/utils.py

+3 -2

@@ -266,8 +266,9 @@ def compare_two_settings(model: str,
     arg1_results = results[:n]
     arg2_results = results[n:]
     for arg1_result, arg2_result in zip(arg1_results, arg2_results):
-        assert arg1_result == arg2_result, \
-            f"Results for {model=} are not the same with {arg1=} and {arg2=}"
+        assert arg1_result == arg2_result, (
+            f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
+            f"{arg1_result=} != {arg2_result=}")
 
 
 def init_test_distributed_environment(

vllm/model_executor/layers/quantization/gptq.py

+9 -7

@@ -204,13 +204,7 @@ def create_weights(
 
         layer.exllama_state = exllama_state
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        qweight = layer.qweight
-        out_shape = x.shape[:-1] + (qweight.shape[-1], )
-        reshaped_x = x.reshape(-1, x.shape[-1])
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # exllama needs to shuffle the weight after the weight is loaded
         # here we do the shuffle on first forward pass
         if layer.exllama_state == ExllamaState.UNINITIALIZED:
@@ -222,6 +216,14 @@ def apply(self,
             layer.exllama_state = ExllamaState.READY
             ops.gptq_shuffle(layer.qweight, layer.g_idx,
                              self.quant_config.weight_bits)
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        out_shape = x.shape[:-1] + (layer.qweight.shape[-1], )
+        reshaped_x = x.reshape(-1, x.shape[-1])
+
         output = ops.gptq_gemm(reshaped_x, layer.qweight, layer.qzeros,
                                layer.scales, layer.g_idx,
                                layer.exllama_state == ExllamaState.READY,
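
The gptq.py change moves the one-time exllama g_idx/shuffle preparation out of apply() and into process_weights_after_loading, which runs once right after the checkpoint weights are loaded. With CPU offloading, offloaded parameters are streamed back to the GPU for each forward pass, so a lazy in-place shuffle on the first forward can end up modifying a temporary copy while the persistent weights stay unshuffled; doing the transformation at load time avoids that. A runnable toy sketch of the pattern (illustrative only, not vLLM's kernels):

import torch

class ToyShuffledLinearMethod:
    """One-time weight preparation at load time; apply() only reads."""

    def create_weights(self, layer: torch.nn.Module, in_features: int,
                       out_features: int) -> None:
        layer.weight = torch.nn.Parameter(
            torch.randn(out_features, in_features), requires_grad=False)
        # Stand-in for GPTQ's g_idx-style column reordering.
        layer.perm = torch.randperm(in_features)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # Reorder once, in place, so every later copy of the parameter
        # (including a CPU-offloaded copy) is already in kernel order.
        layer.weight.data = layer.weight.data[:, layer.perm].contiguous()

    def apply(self, layer: torch.nn.Module, x: torch.Tensor) -> torch.Tensor:
        # Pure read of the prepared weights; no state mutation per forward.
        return torch.nn.functional.linear(x[..., layer.perm], layer.weight)

layer = torch.nn.Module()
method = ToyShuffledLinearMethod()
method.create_weights(layer, in_features=8, out_features=4)
method.process_weights_after_loading(layer)
print(method.apply(layer, torch.randn(2, 8)).shape)  # torch.Size([2, 4])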

vllm/model_executor/layers/quantization/gptq_marlin.py

-1

@@ -251,7 +251,6 @@ def create_weights(
                 scales_and_zp_size,
                 output_size_per_partition // self.quant_config.pack_factor,
                 dtype=torch.int32,
-                device="meta",
             ),
             requires_grad=False,
         )
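
The gptq_marlin.py change drops device="meta" from the qzeros allocation so the parameter gets real storage. A meta tensor carries only shape and dtype with no backing data, so it cannot be copied between devices, which is presumably what broke --cpu-offload-gb on this path. A small standalone PyTorch illustration (independent of this commit):

import torch

# Meta tensors describe shape/dtype only and have no backing storage.
w = torch.empty(4, 4, device="meta")
print(w.is_meta, w.shape)  # True torch.Size([4, 4])

# Copying a meta tensor anywhere fails: there is no data to move.
try:
    w.to("cpu")
except (NotImplementedError, RuntimeError) as exc:
    print(f"cannot offload a meta tensor: {exc}")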
