From 223488ee03a0165e79bd04ed3b33cf580851de5d Mon Sep 17 00:00:00 2001
From: Ferdinand Mom
Date: Tue, 2 May 2023 18:17:57 +0000
Subject: [PATCH] fix(sanity-check): reference and custom implementations now yield the same results at every step

---
 quantize/gptq/quant.py              |   9 +-
 quantize/gptq/sanity_check_main.py  | 149 +++++++++++++---------------
 quantize/gptq/sanity_check_utils.py |  75 ++++++++++++--
 3 files changed, 143 insertions(+), 90 deletions(-)

diff --git a/quantize/gptq/quant.py b/quantize/gptq/quant.py
index 00cb2819..897289ff 100644
--- a/quantize/gptq/quant.py
+++ b/quantize/gptq/quant.py
@@ -153,11 +153,10 @@ def make_quant_custom(module, names, bits, groupsize, name=''):
     for attr in dir(module):
         tmp = getattr(module, attr)
         name1 = name + '.' + attr if name != '' else attr
-        if name1 in names:
-
-            bias_name = attr.replace('w', 'b')
+        if name1 in names:
+            bias = getattr(module, attr.replace('w', 'b'))
             layer_name = attr.replace('w', 'quant')
-            setattr(module, layer_name, QuantLinear_custom(bits, groupsize, tmp.shape[0], tmp.shape[1], module.w[bias_name] is not None))
+            setattr(module, layer_name, QuantLinear_custom(bits, groupsize, tmp.shape[0], tmp.shape[1], bias is not None))
 
 class QuantLinear_custom(nn.Module):
@@ -203,7 +202,7 @@ def pack(self, weight, bias, scales, zeros, g_idx = None):
 
         intweight = []
         for idx in range(self.infeatures):
-            intweight.append(torch.round((weight[:,idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[:,None])
+            intweight.append(torch.round((weight.data[:,idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[:,None])
         intweight = torch.cat(intweight,dim=1)
         intweight = intweight.t().contiguous()
         intweight = intweight.numpy().astype(np.uint32)
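Note on the pack() change above: the loop maps each float weight column to its unsigned integer code via the group's scale and zero point, i.e. q = round(w / scale + zero). A standalone sketch of that arithmetic (illustrative 4-bit values, not the patched module itself):

    import torch

    w = torch.randn(8)                   # one group of float weights
    scale, zero = w.abs().max() / 7, 8   # illustrative scale / zero point
    q = torch.round(w / scale + zero).clamp(0, 15).to(torch.int)  # quantize
    w_hat = scale * (q - zero)           # dequantized approximation of w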
diff --git a/quantize/gptq/sanity_check_main.py b/quantize/gptq/sanity_check_main.py
index 31803adf..d96ee24f 100644
--- a/quantize/gptq/sanity_check_main.py
+++ b/quantize/gptq/sanity_check_main.py
@@ -49,7 +49,6 @@ def quantize_gptq(model, train_loader):
     quantizers = {}
     layers = list(model.modules())[1:]
    layers = [l for l in layers if isinstance(l, nn.Linear)]
-    layers = layers[:-1]
     is_last_layer = lambda x: x == (len(layers) - 1)
 
     nsamples = len(train_loader.dataset)
@@ -60,54 +59,50 @@ def quantize_gptq(model, train_loader):
         inps[i*batch_size:(i+1)*batch_size] = inp.view(-1, 32*32)
     outs = torch.zeros_like(inps)
 
     for layer_id in range(len(layers)):
-        layer = layers[layer_id]
-
-        subset = find_layers(layer)
-        gptq = {}
 
-        for name in subset:
-            gptq[name] = GPTQ(subset[name], name)
-            gptq[name].quantizer = Quantizer()
-            gptq[name].quantizer.configure(bits=WBITS, perchannel=True, sym=True, mse=False, trits=False)
+        if not is_last_layer(layer_id):
 
-        def add_batch(name):
-            def tmp(_, inp, out):
-                gptq[name].add_batch(inp[0].data, out.data)
-            return tmp
-
-        handles = []
-
-        for name in subset:
-            handles.append(subset[name].register_forward_hook(add_batch(name)))
+            layer = layers[layer_id]
+
+            subset = find_layers(layer)
+            gptq = {}
+
+            print(f"Quantizing layer {layer_id} ...")
+            for name in subset:
+                gptq[name] = GPTQ(subset[name], name)
+                gptq[name].quantizer = Quantizer()
+                gptq[name].quantizer.configure(bits=WBITS, perchannel=True, sym=False, mse=False, trits=False)
+
+            def add_batch(name):
+                def tmp(_, inp, out):
+                    gptq[name].add_batch(inp[0].data, out.data)
+                return tmp
+
+            handles = []
+
+            for name in subset:
+                handles.append(subset[name].register_forward_hook(add_batch(name)))
 
-        for i in range(nsamples):
-            if not is_last_layer(layer_id):
-                outs[i] = layer(inps[i])
-            else:
-                _ = layer(inps[i])
+            for i in range(nsamples):
+                outs[i] = layer(inps[i])
 
-        for h in handles: h.remove()
-
-        for name in subset:
-            print(i, name)
-            print('Quantizing ...')
-            scale,zero,g_idx = gptq[name].fasterquant(percdamp=0.01, groupsize=GROUPSIZE, actorder=False)
-            quantizers[f"linear{layer_id + 1}"] = (gptq[name].quantizer.cpu(), scale.cpu(), zero.cpu(), g_idx.cpu())
-            gptq[name].free()
+            for h in handles: h.remove()
+
+            for name in subset:
+                print(i, name)
+                print('Quantizing ...')
+                scale,zero,g_idx = gptq[name].fasterquant(percdamp=0.01, groupsize=GROUPSIZE, actorder=False)
+                quantizers[f"linear{layer_id + 1}"] = (gptq[name].quantizer.cpu(), scale.cpu(), zero.cpu(), g_idx.cpu())
+                gptq[name].free()
 
-        for i in range(nsamples):
-            if not is_last_layer(layer_id):
-                outs[i] = layer(inps[i])
-            else:
-                _ = layer(inps[i])
-
-        del layer
-        del gptq
-        torch.cuda.empty_cache()
-        if not is_last_layer(layer_id):
+            for i in range(nsamples):
+                outs[i] = layer(inps[i])
+
+            del layer
+            del gptq
+            torch.cuda.empty_cache()
+
             inps, outs = outs, inps
 
     return quantizers
@@ -132,7 +127,6 @@ def __init__(self, weight, name):
         self.deactivate_add_batch_call = False
 
     def add_batch(self, inp):
-        # After calling fasterquant, we don't want to call add_batch anymore
         if self.deactivate_add_batch_call:
             return
 
@@ -140,14 +134,10 @@ def add_batch(self, inp):
         if len(inp.shape) == 2:
             inp = inp.unsqueeze(0)
 
-        #TODO: is the case with len = 1 still necessary ?
-        tmp = 1 if len(inp.shape) == 1 else inp.shape[0]
+        tmp = inp.shape[0]
 
         # Assume weight comes from nn.Linear
-        if len(inp.shape) == 3:
-            inp = inp.reshape((-1, inp.shape[-1]))
         inp = inp.t()
-
         self.H *= self.nsamples / (self.nsamples + tmp)
         self.nsamples += tmp
         inp = math.sqrt(2 / self.nsamples) * inp.float()
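Note on add_batch: it keeps a running estimate of the Hessian proxy H = (2/n) * X^T X across calibration batches, which fasterquant later dampens and inverts. A minimal standalone sketch of that update rule (illustrative sizes, same arithmetic as the method above):

    import math
    import torch

    cols, H, nsamples = 4, torch.zeros(4, 4), 0
    for inp in (torch.randn(16, cols) for _ in range(3)):  # calibration batches
        b = inp.shape[0]
        H *= nsamples / (nsamples + b)   # downweight previously seen samples
        nsamples += b
        x = math.sqrt(2 / nsamples) * inp.t().float()
        H += x.matmul(x.t())             # accumulate (2/n) * X^T X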
 
     def fasterquant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False):
         W = self.weight.data.clone()
-        # Need to transpose here, same reason as in __init__ with self.columns
-        W = W.t()
+        # OLD: Need to transpose here, same reason as in __init__ with self.columns
+        # UPDATE: no need to transpose as we already transpose in my_linear()
+        # W = W.t()
         W = W.float()
 
         tick = time.time()
@@ -166,6 +157,7 @@ def fasterquant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False):
 
         H = self.H
         del self.H
+
         dead = torch.diag(H) == 0
         H[dead, dead] = 1
         W[:, dead] = 0
@@ -242,9 +234,6 @@ def fasterquant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False):
             Q = Q[:, invperm]
             g_idx = g_idx[invperm]
 
-        #TODO: Do we have to uncomment it ?
-        # if isinstance(self.layer, transformers.Conv1D):
-        #     Q = Q.t()
 
         self.weight.data = Q.reshape(self.weight.shape).to(self.weight.data.dtype)
         if scale == []:
@@ -267,9 +256,10 @@ def _fill_subset(self, layer_id):
             return {}
         # Keep only layers within block layer_id
         is_weight = re.compile(f'^linear{layer_id}_w$')
-        for name in self.w.keys():
-            if is_weight.match(name):
-                self.subset[name] = self.w[name]
+
+        for name in dir(self):
+            if is_weight.match(name):
+                self.subset[name] = getattr(self, name)
         return self.subset
 
     def alloc_gptq(self, layer_id):
         self.gptq = {}
         self.subset = self._fill_subset(layer_id)
-        
+
         for name in self.subset:
             self.gptq[name] = self.GPTQ(self.subset[name], name)
             self.gptq[name].quantizer = Quantizer()
@@ -299,7 +289,8 @@ def fasterquant(self, layer_id, quantizers):
 
     ## Begin SimpleNet_V2
     def my_linear(self, x, weight, bias):
-        out = x @ weight.weight + bias
+        # out = x @ weight.weight.T + bias # Use the version below as it is more stable
+        out = F.linear(x, weight.weight, bias)
         weight.add_batch(x)
         return out
 
@@ -308,6 +299,7 @@ def forward(self, x):
         x = x.view(x.size(0), -1)
         residual = x
 
+        #TODO: maybe we would need to transpose weight when building linear0_quant ?
         x = F.relu(self.linear0_quant(x))
         x = self.linear1_quant(x)
         x = F.relu(x) + residual
@@ -320,7 +312,6 @@ def forward(self, x):
 
 @torch.no_grad()
 def quantize_gptq_custom(model, train_loader):
-
     nb_layers = model.nb_layers
     is_last_layer = lambda x: x == (nb_layers - 1)
 
@@ -333,40 +324,44 @@ def quantize_gptq_custom(model, train_loader):
     outs = torch.zeros_like(inps)
 
     quantizers = {}
-    
+
     for layer_id in range(nb_layers):
         if not is_last_layer(layer_id):
 
             print(f"Quantizing layer {layer_id} ...")
 
+            bias = getattr(model, f"linear{layer_id}_b")
+
             model.alloc_gptq(layer_id)
 
             for i in range(nsamples):
-                outs[i] = model.my_linear(inps[i], model.gptq[f"linear{layer_id}_w"], model.w[f"linear{layer_id}_b"])
-            
+                outs[i] = model.my_linear(inps[i], model.gptq[f"linear{layer_id}_w"], bias)
+
             model.gptq[f"linear{layer_id}_w"].deactivate_add_batch_call = True
 
             model.fasterquant(layer_id, quantizers)
 
             for i in range(nsamples):
-                outs[i] = model.my_linear(inps[i], model.gptq[f"linear{layer_id}_w"], model.w[f"linear{layer_id}_b"])
-            
+                outs[i] = model.my_linear(inps[i], model.gptq[f"linear{layer_id}_w"], bias)
+
+            setattr(model, f"linear{layer_id}_w", nn.Parameter(model.gptq[f"linear{layer_id}_w"].weight))
             model.free_gptq()
 
             inps, outs = outs, inps
-    
+
     return quantizers
 
 def model_pack_custom(model, quantizers, wbits, groupsize):
     # Extract weights and biases from the model
-    is_weight = re.compile(r'^linear\d+_w$')
+    is_weight, is_bias = re.compile(r'^linear\d+_w$'), re.compile(r'^linear\d+_b$')
     weights, bias = OrderedDict(), OrderedDict()
-    for name, param in model.w.items():
-        if is_weight.match(name):
-            weights[name] = param
-        else:
-            bias[name] = param
+
+    for attr in dir(model):
+        if is_weight.match(attr):
+            weights[attr] = getattr(model, attr)
+        elif is_bias.match(attr):
+            bias[attr] = getattr(model, attr)
 
     make_quant_custom(model, quantizers, wbits, groupsize)
     qlayers = find_layers(model, [QuantLinear_custom])
@@ -383,13 +378,13 @@ def load_quant_custom(model, checkpoint, wbits, groupsize):
     print('Loading model ...')
     model = model.eval()
     # Extract weights and biases from the model
-    is_weight = re.compile(r'^linear\d+_w$')
+    is_weight, is_bias = re.compile(r'^linear\d+_w$'), re.compile(r'^linear\d+_b$')
     weights, bias = OrderedDict(), OrderedDict()
-    for name, param in model.w.items():
-        if is_weight.match(name):
-            weights[name] = param
-        else:
-            bias[name] = param
+    for attr in dir(model):
+        if is_weight.match(attr):
+            weights[attr] = getattr(model, attr)
+        elif is_bias.match(attr):
+            bias[attr] = getattr(model, attr)
 
     # Create linear layers out of weights and biases
     layers = {}
@@ -442,10 +437,6 @@ def assert_parameters(model, model_custom):
     criterion = nn.CrossEntropyLoss()
     train_loader, _, _ = MNISTloader(train_val_split=0.95).load()
 
-    #TODO: Do custom eval gptq
-    #TODO: Is reference GPTQ quantizing bias as well ?
-    #TODO: Add seed everywhere in GPTQ for reproducibility
-
     ## ================== REFERENCE ==================
     if args.train:
         model = SimpleNet()
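Note: after this change, quantize_gptq and quantize_gptq_custom follow the same layer-by-layer schedule, which is what makes them comparable at every step: run the layer to collect statistics, quantize, re-run with the quantized weight, then swap buffers. A runnable sketch of just that control flow (plain nn.Linear stand-ins; the GPTQ step itself is omitted):

    import torch
    import torch.nn as nn

    layers = [nn.Linear(8, 8) for _ in range(3)]  # last layer stays unquantized
    inps, outs = torch.randn(16, 8), torch.zeros(16, 8)

    for layer in layers[:-1]:
        with torch.no_grad():
            outs = layer(inps)   # 1) forward pass feeds the Hessian hooks
        # 2) quantize layer.weight in place here (GPTQ step, omitted)
        with torch.no_grad():
            outs = layer(inps)   # 3) recompute outputs with the quantized weight
        inps, outs = outs, inps  # 4) outputs become the next layer's inputs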
diff --git a/quantize/gptq/sanity_check_utils.py b/quantize/gptq/sanity_check_utils.py
index 0c0a6888..22e4a1f3 100644
--- a/quantize/gptq/sanity_check_utils.py
+++ b/quantize/gptq/sanity_check_utils.py
@@ -8,6 +8,7 @@
 from torch.utils.data import DataLoader, random_split
 from torchvision import datasets, transforms
 import math
+import struct
 
 def seed_everything(seed: int):
     random.seed(seed)
@@ -39,7 +40,6 @@ def forward(self, x):
         x = x.view(x.size(0), -1)
 
         residual = x
-
         x = F.relu(self.linear1(x))
         x = self.linear2(x)
         x = F.relu(x) + residual
@@ -95,11 +95,7 @@ def __init__(self, num_classes=10):
         self.linear3_b = nn.Parameter(torch.nn.init.uniform_(torch.empty(num_classes), -bound, bound))
 
         self.w = {}
-        self.nb_layers = 0
-        for i in range(0, 4):
-            self.w[f"linear{i}_w"] = getattr(self, f"linear{i}_w")
-            self.w[f"linear{i}_b"] = getattr(self, f"linear{i}_b")
-            self.nb_layers += 1
+        self.nb_layers = 4
 
     def my_linear(self, x, weight, bias):
         # return x @ weight.t() + bias.
@@ -252,3 +248,70 @@ def train(num_epochs, model, optimizer, criterion, train_loader, device):
         info = "Epoch: {:3}/{} \t train_loss: {:.3f} \t train_acc: {:.3f}"
 
         print(info.format(epoch + 1, num_epochs, train_loss, train_acc))
+
+def write_bin(filename, array):
+    from functools import reduce
+    # Map numpy dtypes to struct format characters. On the endianness struct uses, see:
+    # https://stackoverflow.com/questions/23831422/what-endianness-does-python-use-to-write-into-files
+    dtype_to_format = {
+        np.int8: 'i',
+        np.int16: 'i',
+        np.int32: 'i',
+        np.int64: 'i',
+        np.uint8: 'I',
+        np.uint16: 'I',
+        np.uint32: 'I',
+        np.uint64: 'I',
+        np.float16: 'f',
+        np.float32: 'f',
+        np.float64: 'd'
+    }
+    fmt = dtype_to_format[array.dtype.type]
+    shapes = list(array.shape)
+    with open(filename, "wb") as f:
+        # header: number of dimensions, then each dimension (uint32 each)
+        f.write(struct.pack('I', len(shapes)))
+        for shape in shapes:
+            f.write(struct.pack('I', shape))
+        # one character describing the payload format
+        f.write(struct.pack('c', bytes(fmt, 'utf-8')))
+        # payload: the flat array in row-major (C) order
+        f.write(struct.pack(f"{fmt}"*(reduce(lambda x, y: x * y, shapes)), *array.flatten(order="C").tolist()))
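Note: write_bin lays a tensor out as [ndim: uint32][ndim x uint32 dims][one format char][flat payload]. A standalone sketch of decoding such a header with struct (hand-built bytes, not a real file):

    import struct

    header = struct.pack('I', 2) + struct.pack('II', 2, 3) + b'f'  # ndim=2, dims=(2, 3), float payload
    (ndim,) = struct.unpack('I', header[:4])
    dims = struct.unpack('I' * ndim, header[4:4 + 4 * ndim])
    fmt = chr(header[4 + 4 * ndim])
    assert (ndim, dims, fmt) == (2, (2, 3), 'f')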
+
+def read_bin(filename):
+    # Reading packed structs back element by element:
+    # https://qiita.com/madaikiteruyo/items/dadc99aa29f7eae0cdd0
+    format_to_byte = {
+        'c': 1,
+        'i': 4,
+        'I': 4,
+        'f': 4,
+        'd': 8
+    }
+
+    data = []
+    dims, fmt = None, None
+    with open(filename, "rb") as f:
+        # read the number of dimensions (uint32, 4 bytes)
+        byte = f.read(format_to_byte['I'])
+        if byte == b'':
+            raise Exception("read_bin: Empty binary")
+        else:
+            nb_dim = struct.unpack('I', byte)
+
+        # Read dims
+        byte = f.read(nb_dim[0] * format_to_byte['I'])
+        dims = struct.unpack('I'*nb_dim[0], byte)
+        # Read the format character
+        byte = f.read(1)
+        if byte == b'':
+            raise Exception("read_bin: Empty binary")
+        else:
+            fmt = chr(struct.unpack('c', byte)[0][0])
+
+        if len(fmt) != 1: raise Exception("read_bin: No format dumped in binary")
+
+        # Read the payload one element at a time
+        while True:
+            byte = f.read(format_to_byte[fmt])
+            if byte == b'':
+                break
+            else:
+                data.append(struct.unpack(fmt, byte)[0])
+
+    return np.array(data).reshape(*dims)
\ No newline at end of file
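A quick round-trip check of the two helpers (assuming write_bin and read_bin are importable from sanity_check_utils as added above):

    import numpy as np
    from sanity_check_utils import write_bin, read_bin

    a = np.arange(6, dtype=np.float32).reshape(2, 3)
    write_bin("a.bin", a)
    b = read_bin("a.bin")
    assert b.shape == (2, 3) and np.allclose(a, b)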