@@ -246,6 +246,7 @@ def quant_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ra
246
246
247
247
return q_weight , scale , zero_point
248
248
249
+
249
250
def quant_tensor_k_quant_cpu (data , num_bits = 4 , group_size = 32 ):
250
251
"""Quantize tensor per group based on k quant.
251
252
Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c
@@ -260,44 +261,44 @@ def quant_tensor_k_quant_cpu(data, num_bits=4, group_size=32):
260
261
scale: scale
261
262
zero_point: zero point
262
263
"""
263
- data = np .reshape (data , (- 1 , group_size )).astype (np .float32 ) # (nb, group_size)
264
+ data = np .reshape (data , (- 1 , group_size )).astype (np .float32 ) # (nb, group_size)
264
265
maxq = 2 ** num_bits - 1
265
266
minq = 0
266
- sum_x2 = np .sum (data ** 2 , axis = 1 , keepdims = True ) # (nb, 1)
267
- av_x = np .sqrt (sum_x2 / group_size ) # (nb, 1)
268
- weights = np .add (av_x , np .abs (data )) # (nb, group_size)
269
- rmin = np .min (data , axis = 1 , keepdims = True ) # (nb, 1)
270
- rmax = np .max (data , axis = 1 , keepdims = True ) # (nb, 1)
271
- sum_w = np .sum (weights , axis = 1 , keepdims = True ) # (nb, 1)
272
- sum_x = np .sum (weights * data , axis = 1 , keepdims = True ) # (nb, group_size)
273
- iscale = np .ones (rmax .shape , dtype = data .dtype ) # (nb, 1)
267
+ sum_x2 = np .sum (data ** 2 , axis = 1 , keepdims = True ) # (nb, 1)
268
+ av_x = np .sqrt (sum_x2 / group_size ) # (nb, 1)
269
+ weights = np .add (av_x , np .abs (data )) # (nb, group_size)
270
+ rmin = np .min (data , axis = 1 , keepdims = True ) # (nb, 1)
271
+ rmax = np .max (data , axis = 1 , keepdims = True ) # (nb, 1)
272
+ sum_w = np .sum (weights , axis = 1 , keepdims = True ) # (nb, 1)
273
+ sum_x = np .sum (weights * data , axis = 1 , keepdims = True ) # (nb, 1)
274
+ iscale = np .ones (rmax .shape , dtype = data .dtype ) # (nb, 1)
274
275
mask = rmin != rmax
275
276
iscale [mask ] = (maxq - minq ) / (rmax [mask ] - rmin [mask ])
276
277
scale = 1 / iscale
277
- quant_data = np .clip (np .round (iscale * (data - rmin )), minq , maxq ) # (nb, group_size)
278
- diff = scale * quant_data + rmin - data # (nb, group_size)
279
- best_mad = np .sum (weights * diff ** 2 , axis = 1 , keepdims = True ) # (nb, 1)
278
+ quant_data = np .clip (np .round (iscale * (data - rmin )), minq , maxq ) # (nb, group_size)
279
+ diff = scale * quant_data + rmin - data # (nb, group_size)
280
+ best_mad = np .sum (weights * diff ** 2 , axis = 1 , keepdims = True ) # (nb, 1)
280
281
nstep = 20
281
282
rdelta = 0.1
282
283
# nstep * rdelta = -2 * rrmin, maxq - minq = 2**num_bits - 1
283
284
rrmin = - 1
284
285
for is_ in range (nstep ):
285
- iscale_new = np .ones (rmax .shape , dtype = data .dtype ) # (nb, 1)
286
+ iscale_new = np .ones (rmax .shape , dtype = data .dtype ) # (nb, 1)
286
287
factor = np .array ([rrmin + rdelta * is_ + maxq - minq ]).astype (data .dtype )[0 ]
287
288
mask = rmin != rmax
288
289
iscale_new [mask ] = factor / (rmax [mask ] - rmin [mask ])
289
- quant_data_new = np .clip (np .round (iscale_new * (data - rmin )), minq , maxq ) # (nb, group_size)
290
+ quant_data_new = np .clip (np .round (iscale_new * (data - rmin )), minq , maxq ) # (nb, group_size)
290
291
mul_weights_quant_data_new = weights * quant_data_new
291
- sum_l = np .sum (mul_weights_quant_data_new , axis = 1 , keepdims = True ) # (nb, 1)
292
- sum_l2 = np .sum (mul_weights_quant_data_new * quant_data_new , axis = 1 , keepdims = True ) # (nb, 1)
293
- sum_xl = np .sum (mul_weights_quant_data_new * data , axis = 1 , keepdims = True ) # (nb, 1)
294
- D = np .subtract (sum_w * sum_l2 , sum_l ** 2 ) # (nb, 1)
292
+ sum_l = np .sum (mul_weights_quant_data_new , axis = 1 , keepdims = True ) # (nb, 1)
293
+ sum_l2 = np .sum (mul_weights_quant_data_new * quant_data_new , axis = 1 , keepdims = True ) # (nb, 1)
294
+ sum_xl = np .sum (mul_weights_quant_data_new * data , axis = 1 , keepdims = True ) # (nb, 1)
295
+ D = np .subtract (sum_w * sum_l2 , sum_l ** 2 ) # (nb, 1)
295
296
296
- this_scale = (sum_w * sum_xl - sum_x * sum_l ) / D # (nb, 1)
297
- this_min = (sum_l2 * sum_x - sum_l * sum_xl ) / D # (nb, 1)
297
+ this_scale = (sum_w * sum_xl - sum_x * sum_l ) / D # (nb, 1)
298
+ this_min = (sum_l2 * sum_x - sum_l * sum_xl ) / D # (nb, 1)
298
299
299
- diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
300
- mad = np .sum (weights * diff ** 2 , axis = 1 , keepdims = True ) # (nb, 1)
300
+ diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
301
+ mad = np .sum (weights * diff ** 2 , axis = 1 , keepdims = True ) # (nb, 1)
301
302
302
303
mad_1 = np .array (mad )
303
304
best_mad_1 = np .array (best_mad )
@@ -307,7 +308,7 @@ def quant_tensor_k_quant_cpu(data, num_bits=4, group_size=32):
307
308
scale [idx_to_replace ] = this_scale [idx_to_replace ]
308
309
rmin [idx_to_replace ] = this_min [idx_to_replace ]
309
310
310
- zero_point = np .clip ((( - rmin ) / scale ).round (), 0 , maxq ).astype ("uint8" )
311
+ zero_point = np .clip (((- rmin ) / scale ).round (), 0 , maxq ).astype ("uint8" )
311
312
scale = scale .astype (np .float64 )
312
313
q_weight = np .empty_like (data , dtype = scale .dtype )
313
314
np .divide (data , scale , out = q_weight )
@@ -317,6 +318,7 @@ def quant_tensor_k_quant_cpu(data, num_bits=4, group_size=32):
317
318
318
319
return q_weight , scale , zero_point
319
320
321
+
320
322
def quant_tensor_k_quant_cuda (data , num_bits = 4 , group_size = 32 ):
321
323
"""Quantize tensor per group based on k quant.
322
324
Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c
@@ -334,45 +336,46 @@ def quant_tensor_k_quant_cuda(data, num_bits=4, group_size=32):
334
336
try :
335
337
import cupy as cp
336
338
import torch
339
+
337
340
if torch .cuda .is_available ():
338
341
data = cp .asarray (data )
339
- data = data .reshape ((- 1 , group_size )).astype (cp .float32 ) # nb = data.shape[0], (nb, group_size)
342
+ data = data .reshape ((- 1 , group_size )).astype (cp .float32 ) # nb = data.shape[0], (nb, group_size)
340
343
maxq = 2 ** num_bits - 1
341
344
minq = 0
342
- sum_x2 = cp .sum (data ** 2 , axis = 1 , keepdims = True ) # (nb, 1)
343
- av_x = cp .sqrt (sum_x2 / group_size ) # (nb, 1)
344
- weights = cp .add (av_x , cp .abs (data )) # (nb, group_size)
345
- rmin = cp .min (data , axis = 1 , keepdims = True ) # (nb, 1)
346
- rmax = cp .max (data , axis = 1 , keepdims = True ) # (nb, 1)
347
- sum_w = cp .sum (weights , axis = 1 , keepdims = True ) # (nb, 1)
348
- sum_x = cp .sum (weights * data , axis = 1 , keepdims = True ) # (nb, group_size)
349
- iscale = cp .ones (rmax .shape , dtype = data .dtype ) # (nb, 1)
345
+ sum_x2 = cp .sum (data ** 2 , axis = 1 , keepdims = True ) # (nb, 1)
346
+ av_x = cp .sqrt (sum_x2 / group_size ) # (nb, 1)
347
+ weights = cp .add (av_x , cp .abs (data )) # (nb, group_size)
348
+ rmin = cp .min (data , axis = 1 , keepdims = True ) # (nb, 1)
349
+ rmax = cp .max (data , axis = 1 , keepdims = True ) # (nb, 1)
350
+ sum_w = cp .sum (weights , axis = 1 , keepdims = True ) # (nb, 1)
351
+ sum_x = cp .sum (weights * data , axis = 1 , keepdims = True ) # (nb, 1)
352
+ iscale = cp .ones (rmax .shape , dtype = data .dtype ) # (nb, 1)
350
353
mask = rmin != rmax
351
354
iscale [mask ] = (maxq - minq ) / (rmax [mask ] - rmin [mask ])
352
355
scale = 1 / iscale
353
- quant_data = cp .clip (cp .round (iscale * (data - rmin )), minq , maxq ) # (nb, group_size)
354
- diff = scale * quant_data + rmin - data # (nb, group_size)
355
- best_mad = cp .sum (weights * diff ** 2 , axis = 1 , keepdims = True ) # (nb, 1)
356
+ quant_data = cp .clip (cp .round (iscale * (data - rmin )), minq , maxq ) # (nb, group_size)
357
+ diff = scale * quant_data + rmin - data # (nb, group_size)
358
+ best_mad = cp .sum (weights * diff ** 2 , axis = 1 , keepdims = True ) # (nb, 1)
356
359
nstep = 20
357
360
rdelta = 0.1
358
361
rrmin = - 1
359
362
for is_ in range (nstep ):
360
- iscale_new = cp .ones (rmax .shape , dtype = data .dtype ) # (nb, 1)
363
+ iscale_new = cp .ones (rmax .shape , dtype = data .dtype ) # (nb, 1)
361
364
factor = cp .array ([rrmin + rdelta * is_ + maxq - minq ]).astype (data .dtype )[0 ]
362
365
mask = rmin != rmax
363
366
iscale_new [mask ] = factor / (rmax [mask ] - rmin [mask ])
364
- quant_data_new = cp .clip (cp .round (iscale_new * (data - rmin )), minq , maxq ) # (nb, group_size)
367
+ quant_data_new = cp .clip (cp .round (iscale_new * (data - rmin )), minq , maxq ) # (nb, group_size)
365
368
mul_weights_quant_data_new = weights * quant_data_new
366
- sum_l = cp .sum (mul_weights_quant_data_new , axis = 1 , keepdims = True ) # (nb, 1)
367
- sum_l2 = cp .sum (mul_weights_quant_data_new * quant_data_new , axis = 1 , keepdims = True ) # (nb, 1)
368
- sum_xl = cp .sum (mul_weights_quant_data_new * data , axis = 1 , keepdims = True ) # (nb, 1)
369
- D = cp .subtract (sum_w * sum_l2 , sum_l ** 2 ) # (nb, 1)
369
+ sum_l = cp .sum (mul_weights_quant_data_new , axis = 1 , keepdims = True ) # (nb, 1)
370
+ sum_l2 = cp .sum (mul_weights_quant_data_new * quant_data_new , axis = 1 , keepdims = True ) # (nb, 1)
371
+ sum_xl = cp .sum (mul_weights_quant_data_new * data , axis = 1 , keepdims = True ) # (nb, 1)
372
+ D = cp .subtract (sum_w * sum_l2 , sum_l ** 2 ) # (nb, 1)
370
373
371
- this_scale = (sum_w * sum_xl - sum_x * sum_l ) / D # (nb, 1)
372
- this_min = (sum_l2 * sum_x - sum_l * sum_xl ) / D # (nb, 1)
374
+ this_scale = (sum_w * sum_xl - sum_x * sum_l ) / D # (nb, 1)
375
+ this_min = (sum_l2 * sum_x - sum_l * sum_xl ) / D # (nb, 1)
373
376
374
- diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
375
- mad = cp .sum (weights * diff ** 2 , axis = 1 , keepdims = True ) # (nb, 1)
377
+ diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
378
+ mad = cp .sum (weights * diff ** 2 , axis = 1 , keepdims = True ) # (nb, 1)
376
379
377
380
mad_1 = cp .array (mad )
378
381
best_mad_1 = cp .array (best_mad )
@@ -382,7 +385,7 @@ def quant_tensor_k_quant_cuda(data, num_bits=4, group_size=32):
382
385
scale [idx_to_replace ] = this_scale [idx_to_replace ]
383
386
rmin [idx_to_replace ] = this_min [idx_to_replace ]
384
387
385
- zero_point = cp .clip ((( - rmin ) / scale ).round (), 0 , maxq ).astype ("uint8" )
388
+ zero_point = cp .clip (((- rmin ) / scale ).round (), 0 , maxq ).astype ("uint8" )
386
389
scale = scale .astype (cp .float64 )
387
390
q_weight = cp .empty_like (data , dtype = scale .dtype )
388
391
cp .divide (data , scale , out = q_weight )
@@ -392,20 +395,18 @@ def quant_tensor_k_quant_cuda(data, num_bits=4, group_size=32):
392
395
393
396
return q_weight .get (), scale .get (), zero_point .get ()
394
397
else :
395
- logger .warning ("Try to use k-quant quantization on CUDA. However, CUDA is not available." \
396
- "Fall back to k-quant quantization on CPU." )
397
- return quant_tensor_k_quant_cpu (
398
- data , num_bits , group_size
398
+ logger .warning (
399
+ "Try to use k-quant quantization on CUDA. However, CUDA is not available."
400
+ "Fall back to k-quant quantization on CPU."
399
401
)
402
+ return quant_tensor_k_quant_cpu (data , num_bits , group_size )
400
403
except ImportError :
401
404
logger .info (
402
- "Now we are using k-quant quantization on cpu, which is time consuming." \
403
- "Please consider install cupy to speed up on CUDA. See https://cupy.dev/" \
404
- "Please also install torch to check CUDA availablity."
405
- )
406
- return quant_tensor_k_quant_cpu (
407
- data , num_bits , group_size
405
+ "Now we are using k-quant quantization on cpu, which is time consuming."
406
+ "Please consider install cupy to speed up on CUDA. See https://cupy.dev/"
407
+ "Please also install torch to check CUDA availability."
408
408
)
409
+ return quant_tensor_k_quant_cpu (data , num_bits , group_size )
409
410
410
411
411
412
def qdq_tensor (data , num_bits = 4 , group_size = 32 , scheme = "asym" , dtype = "int" , ratio = 1.0 ):
0 commit comments