# Optimize GPTQ algorithm to reduce running time (#13)
## Type of Change

Feature
API changed: no

## Description

Optimize the GPTQ algorithm to reduce running time: replace per-element Python loops with vectorized NumPy operations and allocate work buffers with `np.zeros_like` so they keep the weight tensor's dtype.

Tested with the llama2-7b model on SPR (Intel Sapphire Rapids), calibration data length = 1:

- before: about 210 min
- after: about 36 min (roughly a 5.8x speedup)

## Dependency Change?

Any library dependency introduced or removed: no

Signed-off-by: yuwenzho <[email protected]>
yuwenzho authored May 20, 2024
1 parent c87721d commit c03ae0f
Showing 2 changed files with 15 additions and 15 deletions.
### onnx_neural_compressor/algorithms/weight_only/gptq.py (7 additions, 6 deletions)
```diff
@@ -130,8 +130,8 @@ def find_params(weight):
         perm = np.argsort(np.diag(H))[::-1]
         W = W[perm, :]
         H = H[perm, :][:, perm]
-    Losses = np.zeros(W.shape)
-    Q = np.zeros(W.shape)
+    Losses = np.zeros_like(W)
+    Q = np.zeros_like(W)
     damp = percdamp * np.mean(np.diag(H))
     diag = np.arange(shape[0])
     H[diag, diag] += damp  # add a average value of
@@ -142,9 +142,9 @@ def find_params(weight):
         count = i2 - i1

         W1 = copy.deepcopy(W[i1:i2, :])
-        Q1 = np.zeros(W1.shape)
-        Err1 = np.zeros(W1.shape)
-        Losses1 = np.zeros(W1.shape)
+        Q1 = np.zeros_like(W1)
+        Err1 = np.zeros_like(W1)
+        Losses1 = np.zeros_like(W1)
         Hinv1 = Hinv[i1:i2, i1:i2]

         for i in range(count):  # within a block, channel wise
@@ -155,7 +155,7 @@ def find_params(weight):
            if (i1 + i) % group_size == 0:
                scale, zp = find_params(W[(i1 + i) : (i1 + i + group_size), :])

-           q = (scale * (np.clip(np.round(np.expand_dims(w, axis=1) / scale) + zp, 0, maxq) - zp)).flatten()
+           q = (scale * (np.clip(np.round(w[:, np.newaxis] / scale) + zp, 0, maxq) - zp)).flatten()
            Q1[i, :] = q
            Losses1[i, :] = (w - q) ** 2 / d**2
@@ -345,6 +345,7 @@ def gptq_quantize(
        k_blocks = (org_shape[0] + group_size - 1) // group_size
        q_weight = woq_utility.pad_tensor(q_weight, group_size, k_blocks)
        q_weight, scale, zp = woq_utility.quant_tensor(q_weight.T, num_bits, group_size, scheme, "uint")
+
        q_matmul_node, new_inits = woq_utility.make_matmul_weight_only_node(
            node=node,
            weight_shape=org_shape,
```
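The headline change in gptq.py is allocating work buffers with `np.zeros_like` instead of `np.zeros`. This is more than a style fix: `np.zeros(W.shape)` always allocates float64, while `np.zeros_like(W)` inherits `W`'s dtype, so if `W` is float32 (an assumption about this pipeline, not stated in the PR) the `Q`, `Err1`, and `Losses` buffers and the arithmetic they feed stay in the narrower type. The `np.expand_dims(w, axis=1)` to `w[:, np.newaxis]` change is an equivalent view, just written more directly. A minimal sketch of the dtype behavior (shapes illustrative):

```python
import numpy as np

# Illustrative weight matrix; float32 is assumed, not confirmed by the PR.
W = np.random.rand(8, 8).astype(np.float32)

# np.zeros(shape) ignores W's dtype and always allocates float64.
print(np.zeros(W.shape).dtype)   # float64
# np.zeros_like(W) matches W's dtype, keeping downstream arithmetic in float32.
print(np.zeros_like(W).dtype)    # float32
```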
### onnx_neural_compressor/algorithms/weight_only/utility.py (8 additions, 9 deletions)
```diff
@@ -112,15 +112,14 @@ def make_matmul_weight_only_node(
            packed_zp = np.reshape(zero_point, (1, -1)).astype("uint8")
        else:
            packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
-            for i in range(zero_point.shape[0] // k_blocks):
-                for j in range(k_blocks):
-                    idx = i * k_blocks + j
-                    zp = zero_point[idx]
-                    packed_zp[idx // 2] = (
-                        ((packed_zp[idx // 2] & 0x0F) | (zp << 4))
-                        if (idx & 1)
-                        else ((packed_zp[idx // 2] & 0xF0) | zp)
-                    )
+            # create an index array
+            idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1)
+            # separate odd and even indices
+            even_idx = idx[::2]
+            odd_idx = idx[1::2]
+            # vectorized operation for even and odd indices
+            packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel()
+            packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4)

        zp_tensor = onnx.helper.make_tensor(
            name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True
```
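In utility.py, the nested Python loop that packs two 4-bit zero points into each uint8 byte is replaced by two fancy-indexed NumPy assignments: even indices fill the low nibble, odd indices the high nibble. The loop otherwise executes once per zero point for every quantized MatMul in the model, entirely in the interpreter, so vectorizing it is a plausible contributor to the measured speedup. A self-contained sketch comparing the two versions (the function names and sizes are illustrative, not from the PR):

```python
import numpy as np

def pack_zp_loop(zero_point: np.ndarray, k_blocks: int) -> np.ndarray:
    """Original approach: one Python-level iteration per zero point."""
    packed = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
    for idx in range(zero_point.shape[0] // k_blocks * k_blocks):
        zp = zero_point[idx]
        if idx & 1:  # odd index -> high nibble
            packed[idx // 2] = (packed[idx // 2] & 0x0F) | (zp << 4)
        else:        # even index -> low nibble
            packed[idx // 2] = (packed[idx // 2] & 0xF0) | zp
    return packed

def pack_zp_vectorized(zero_point: np.ndarray, k_blocks: int) -> np.ndarray:
    """PR approach: two fancy-indexed assignments replace the loop."""
    packed = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
    idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks)
    even_idx, odd_idx = idx[::2], idx[1::2]
    packed[even_idx // 2] = (packed[even_idx // 2] & 0xF0) | zero_point[even_idx]
    packed[odd_idx // 2] = (packed[odd_idx // 2] & 0x0F) | (zero_point[odd_idx] << 4)
    return packed

zp = np.random.randint(0, 16, size=4096, dtype=np.uint8)
assert np.array_equal(pack_zp_loop(zp, 32), pack_zp_vectorized(zp, 32))
```

The 136 fill value is 0x88, so any byte the packing never touches defaults both nibbles to 8, the midpoint of the unsigned 4-bit range.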
