# Optimize GPTQ algorithm to reduce running time (#13)
## Type of Change

Feature
API changed: no

## Description

Optimize the GPTQ algorithm to reduce running time: replace per-element Python loops with vectorized NumPy operations and allocate work buffers with `np.zeros_like` so they keep the weight tensor's dtype.

Tested with the llama2-7b model on SPR (Intel Sapphire Rapids), calibration data length = 1:

- before: about 210 min
- after: about 36 min (roughly a 5.8x speedup)

## Dependency Change?

Any library dependency introduced or removed: no

Signed-off-by: yuwenzho <[email protected]>
yuwenzho authored May 20, 2024
1 parent c87721d commit c03ae0f
Showing 2 changed files with 15 additions and 15 deletions.
### onnx_neural_compressor/algorithms/weight_only/gptq.py (7 additions, 6 deletions)
```diff
@@ -130,8 +130,8 @@ def find_params(weight):
         perm = np.argsort(np.diag(H))[::-1]
         W = W[perm, :]
         H = H[perm, :][:, perm]
-    Losses = np.zeros(W.shape)
-    Q = np.zeros(W.shape)
+    Losses = np.zeros_like(W)
+    Q = np.zeros_like(W)
     damp = percdamp * np.mean(np.diag(H))
     diag = np.arange(shape[0])
     H[diag, diag] += damp  # add a average value of
@@ -142,9 +142,9 @@ def find_params(weight):
         count = i2 - i1

         W1 = copy.deepcopy(W[i1:i2, :])
-        Q1 = np.zeros(W1.shape)
-        Err1 = np.zeros(W1.shape)
-        Losses1 = np.zeros(W1.shape)
+        Q1 = np.zeros_like(W1)
+        Err1 = np.zeros_like(W1)
+        Losses1 = np.zeros_like(W1)
         Hinv1 = Hinv[i1:i2, i1:i2]

         for i in range(count):  # within a block, channel wise
@@ -155,7 +155,7 @@ def find_params(weight):
            if (i1 + i) % group_size == 0:
                scale, zp = find_params(W[(i1 + i) : (i1 + i + group_size), :])

-           q = (scale * (np.clip(np.round(np.expand_dims(w, axis=1) / scale) + zp, 0, maxq) - zp)).flatten()
+           q = (scale * (np.clip(np.round(w[:, np.newaxis] / scale) + zp, 0, maxq) - zp)).flatten()
            Q1[i, :] = q
            Losses1[i, :] = (w - q) ** 2 / d**2
@@ -345,6 +345,7 @@ def gptq_quantize(
        k_blocks = (org_shape[0] + group_size - 1) // group_size
        q_weight = woq_utility.pad_tensor(q_weight, group_size, k_blocks)
        q_weight, scale, zp = woq_utility.quant_tensor(q_weight.T, num_bits, group_size, scheme, "uint")
+
        q_matmul_node, new_inits = woq_utility.make_matmul_weight_only_node(
            node=node,
            weight_shape=org_shape,
```
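The headline change in gptq.py is allocating work buffers with `np.zeros_like` instead of `np.zeros`. This is more than a style fix: `np.zeros(W.shape)` always allocates float64, while `np.zeros_like(W)` inherits `W`'s dtype, so if `W` is float32 (an assumption about this pipeline, not stated in the PR) the `Q`, `Err1`, and `Losses` buffers and the arithmetic they feed stay in the narrower type. The `np.expand_dims(w, axis=1)` to `w[:, np.newaxis]` change is an equivalent view, just written more directly. A minimal sketch of the dtype behavior (shapes illustrative):

```python
import numpy as np

# Illustrative weight matrix; float32 is assumed, not confirmed by the PR.
W = np.random.rand(8, 8).astype(np.float32)

# np.zeros(shape) ignores W's dtype and always allocates float64.
print(np.zeros(W.shape).dtype)   # float64
# np.zeros_like(W) matches W's dtype, keeping downstream arithmetic in float32.
print(np.zeros_like(W).dtype)    # float32
```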
### onnx_neural_compressor/algorithms/weight_only/utility.py (8 additions, 9 deletions)
```diff
@@ -112,15 +112,14 @@ def make_matmul_weight_only_node(
            packed_zp = np.reshape(zero_point, (1, -1)).astype("uint8")
        else:
            packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
-            for i in range(zero_point.shape[0] // k_blocks):
-                for j in range(k_blocks):
-                    idx = i * k_blocks + j
-                    zp = zero_point[idx]
-                    packed_zp[idx // 2] = (
-                        ((packed_zp[idx // 2] & 0x0F) | (zp << 4))
-                        if (idx & 1)
-                        else ((packed_zp[idx // 2] & 0xF0) | zp)
-                    )
+            # create an index array
+            idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1)
+            # separate odd and even indices
+            even_idx = idx[::2]
+            odd_idx = idx[1::2]
+            # vectorized operation for even and odd indices
+            packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel()
+            packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4)

        zp_tensor = onnx.helper.make_tensor(
            name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True
```
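In utility.py, the nested Python loop that packs two 4-bit zero points into each uint8 byte is replaced by two fancy-indexed NumPy assignments: even indices fill the low nibble, odd indices the high nibble. The loop otherwise executes once per zero point for every quantized MatMul in the model, entirely in the interpreter, so vectorizing it is a plausible contributor to the measured speedup. A self-contained sketch comparing the two versions (the function names and sizes are illustrative, not from the PR):

```python
import numpy as np

def pack_zp_loop(zero_point: np.ndarray, k_blocks: int) -> np.ndarray:
    """Original approach: one Python-level iteration per zero point."""
    packed = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
    for idx in range(zero_point.shape[0] // k_blocks * k_blocks):
        zp = zero_point[idx]
        if idx & 1:  # odd index -> high nibble
            packed[idx // 2] = (packed[idx // 2] & 0x0F) | (zp << 4)
        else:        # even index -> low nibble
            packed[idx // 2] = (packed[idx // 2] & 0xF0) | zp
    return packed

def pack_zp_vectorized(zero_point: np.ndarray, k_blocks: int) -> np.ndarray:
    """PR approach: two fancy-indexed assignments replace the loop."""
    packed = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
    idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks)
    even_idx, odd_idx = idx[::2], idx[1::2]
    packed[even_idx // 2] = (packed[even_idx // 2] & 0xF0) | zero_point[even_idx]
    packed[odd_idx // 2] = (packed[odd_idx // 2] & 0x0F) | (zero_point[odd_idx] << 4)
    return packed

zp = np.random.randint(0, 16, size=4096, dtype=np.uint8)
assert np.array_equal(pack_zp_loop(zp, 32), pack_zp_vectorized(zp, 32))
```

The 136 fill value is 0x88, so any byte the packing never touches defaults both nibbles to 8, the midpoint of the unsigned 4-bit range.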
