From 223488ee03a0165e79bd04ed3b33cf580851de5d Mon Sep 17 00:00:00 2001
From: Ferdinand Mom
Date: Tue, 2 May 2023 18:17:57 +0000
Subject: [PATCH] fix(sanity-check): reference and custom implementations now yield the same results at every step

---
 quantize/gptq/quant.py              |   9 +-
 quantize/gptq/sanity_check_main.py  | 149 +++++++++++++---------------
 quantize/gptq/sanity_check_utils.py |  75 ++++++++++++--
 3 files changed, 143 insertions(+), 90 deletions(-)

diff --git a/quantize/gptq/quant.py b/quantize/gptq/quant.py
index 00cb2819..897289ff 100644
--- a/quantize/gptq/quant.py
+++ b/quantize/gptq/quant.py
@@ -153,11 +153,10 @@ def make_quant_custom(module, names, bits, groupsize, name=''):
     for attr in dir(module):
         tmp = getattr(module, attr)
         name1 = name + '.' + attr if name != '' else attr
-        if name1 in names:
-
-            bias_name = attr.replace('w', 'b')
+        if name1 in names:
+            bias = getattr(module, attr.replace('w', 'b'))
             layer_name = attr.replace('w', 'quant')
-            setattr(module, layer_name, QuantLinear_custom(bits, groupsize, tmp.shape[0], tmp.shape[1], module.w[bias_name] is not None))
+            setattr(module, layer_name, QuantLinear_custom(bits, groupsize, tmp.shape[0], tmp.shape[1], bias is not None))
 
 class QuantLinear_custom(nn.Module):
@@ -203,7 +202,7 @@ def pack(self, weight, bias, scales, zeros, g_idx = None):
 
         intweight = []
         for idx in range(self.infeatures):
-            intweight.append(torch.round((weight[:,idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[:,None])
+            intweight.append(torch.round((weight.data[:,idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[:,None])
         intweight = torch.cat(intweight,dim=1)
         intweight = intweight.t().contiguous()
         intweight = intweight.numpy().astype(np.uint32)
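Note on the pack() change above: the loop maps each float weight column to its unsigned integer code via the group's scale and zero point, i.e. q = round(w / scale + zero). A standalone sketch of that arithmetic (illustrative 4-bit values, not the patched module itself):

    import torch

    w = torch.randn(8)                   # one group of float weights
    scale, zero = w.abs().max() / 7, 8   # illustrative scale / zero point
    q = torch.round(w / scale + zero).clamp(0, 15).to(torch.int)  # quantize
    w_hat = scale * (q - zero)           # dequantized approximation of w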
diff --git a/quantize/gptq/sanity_check_main.py b/quantize/gptq/sanity_check_main.py
index 31803adf..d96ee24f 100644
--- a/quantize/gptq/sanity_check_main.py
+++ b/quantize/gptq/sanity_check_main.py
@@ -49,7 +49,6 @@ def quantize_gptq(model, train_loader):
     quantizers = {}
     layers = list(model.modules())[1:]
    layers = [l for l in layers if isinstance(l, nn.Linear)]
-    layers = layers[:-1]
     is_last_layer = lambda x: x == (len(layers) - 1)
 
     nsamples = len(train_loader.dataset)
@@ -60,54 +59,50 @@ def quantize_gptq(model, train_loader):
         inps[i*batch_size:(i+1)*batch_size] = inp.view(-1, 32*32)
     outs = torch.zeros_like(inps)
 
     for layer_id in range(len(layers)):
-        layer = layers[layer_id]
-
-        subset = find_layers(layer)
-        gptq = {}
 
-        for name in subset:
-            gptq[name] = GPTQ(subset[name], name)
-            gptq[name].quantizer = Quantizer()
-            gptq[name].quantizer.configure(bits=WBITS, perchannel=True, sym=True, mse=False, trits=False)
+        if not is_last_layer(layer_id):
 
-        def add_batch(name):
-            def tmp(_, inp, out):
-                gptq[name].add_batch(inp[0].data, out.data)
-            return tmp
-
-        handles = []
-
-        for name in subset:
-            handles.append(subset[name].register_forward_hook(add_batch(name)))
+            layer = layers[layer_id]
+
+            subset = find_layers(layer)
+            gptq = {}
+
+            print(f"Quantizing layer {layer_id} ...")
+            for name in subset:
+                gptq[name] = GPTQ(subset[name], name)
+                gptq[name].quantizer = Quantizer()
+                gptq[name].quantizer.configure(bits=WBITS, perchannel=True, sym=False, mse=False, trits=False)
+
+            def add_batch(name):
+                def tmp(_, inp, out):
+                    gptq[name].add_batch(inp[0].data, out.data)
+                return tmp
+
+            handles = []
+
+            for name in subset:
+                handles.append(subset[name].register_forward_hook(add_batch(name)))
 
-        for i in range(nsamples):
-            if not is_last_layer(layer_id):
-                outs[i] = layer(inps[i])
-            else:
-                _ = layer(inps[i])
+            for i in range(nsamples):
+                outs[i] = layer(inps[i])
 
-        for h in handles: h.remove()
-
-        for name in subset:
-            print(i, name)
-            print('Quantizing ...')
-            scale,zero,g_idx = gptq[name].fasterquant(percdamp=0.01, groupsize=GROUPSIZE, actorder=False)
-            quantizers[f"linear{layer_id + 1}"] = (gptq[name].quantizer.cpu(), scale.cpu(), zero.cpu(), g_idx.cpu())
-            gptq[name].free()
+            for h in handles: h.remove()
+
+            for name in subset:
+                print(i, name)
+                print('Quantizing ...')
+                scale,zero,g_idx = gptq[name].fasterquant(percdamp=0.01, groupsize=GROUPSIZE, actorder=False)
+                quantizers[f"linear{layer_id + 1}"] = (gptq[name].quantizer.cpu(), scale.cpu(), zero.cpu(), g_idx.cpu())
+                gptq[name].free()
 
-        for i in range(nsamples):
-            if not is_last_layer(layer_id):
-                outs[i] = layer(inps[i])
-            else:
-                _ = layer(inps[i])
-
-        del layer
-        del gptq
-        torch.cuda.empty_cache()
-        if not is_last_layer(layer_id):
+            for i in range(nsamples):
+                outs[i] = layer(inps[i])
+
+            del layer
+            del gptq
+            torch.cuda.empty_cache()
+
             inps, outs = outs, inps
 
     return quantizers
@@ -132,7 +127,6 @@ def __init__(self, weight, name):
         self.deactivate_add_batch_call = False
 
     def add_batch(self, inp):
-        # After calling fasterquant, we don't want to call add_batch anymore
         if self.deactivate_add_batch_call:
             return
 
@@ -140,14 +134,10 @@ def add_batch(self, inp):
         if len(inp.shape) == 2:
             inp = inp.unsqueeze(0)
 
-        #TODO: is the case with len = 1 still necessary ?
-        tmp = 1 if len(inp.shape) == 1 else inp.shape[0]
+        tmp = inp.shape[0]
 
         # Assume weight comes from nn.Linear
-        if len(inp.shape) == 3:
-            inp = inp.reshape((-1, inp.shape[-1]))
         inp = inp.t()
-
         self.H *= self.nsamples / (self.nsamples + tmp)
         self.nsamples += tmp
         inp = math.sqrt(2 / self.nsamples) * inp.float()
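Note on add_batch: it keeps a running estimate of the Hessian proxy H = (2/n) * X^T X across calibration batches, which fasterquant later dampens and inverts. A minimal standalone sketch of that update rule (illustrative sizes, same arithmetic as the method above):

    import math
    import torch

    cols, H, nsamples = 4, torch.zeros(4, 4), 0
    for inp in (torch.randn(16, cols) for _ in range(3)):  # calibration batches
        b = inp.shape[0]
        H *= nsamples / (nsamples + b)   # downweight previously seen samples
        nsamples += b
        x = math.sqrt(2 / nsamples) * inp.t().float()
        H += x.matmul(x.t())             # accumulate (2/n) * X^T X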
 
     def fasterquant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False):
         W = self.weight.data.clone()
-        # Need to transpose here, same reason as in __init__ with self.columns
-        W = W.t()
+        # OLD: Need to transpose here, same reason as in __init__ with self.columns
+        # UPDATE: no need to transpose as we already transpose in my_linear()
+        # W = W.t()
         W = W.float()
 
         tick = time.time()
@@ -166,6 +157,7 @@ def fasterquant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False):
 
         H = self.H
         del self.H
+
         dead = torch.diag(H) == 0
         H[dead, dead] = 1
         W[:, dead] = 0
@@ -242,9 +234,6 @@ def fasterquant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False):
             Q = Q[:, invperm]
             g_idx = g_idx[invperm]
 
-        #TODO: Do we have to uncomment it ?
-        # if isinstance(self.layer, transformers.Conv1D):
-        #     Q = Q.t()
 
         self.weight.data = Q.reshape(self.weight.shape).to(self.weight.data.dtype)
         if scale == []:
@@ -267,9 +256,10 @@ def _fill_subset(self, layer_id):
             return {}
         # Keep only layers within block layer_id
         is_weight = re.compile(f'^linear{layer_id}_w$')
-        for name in self.w.keys():
-            if is_weight.match(name):
-                self.subset[name] = self.w[name]
+
+        for name in dir(self):
+            if is_weight.match(name):
+                self.subset[name] = getattr(self, name)
         return self.subset
 
     def alloc_gptq(self, layer_id):
         self.gptq = {}
         self.subset = self._fill_subset(layer_id)
-        
+
         for name in self.subset:
             self.gptq[name] = self.GPTQ(self.subset[name], name)
             self.gptq[name].quantizer = Quantizer()
@@ -299,7 +289,8 @@ def fasterquant(self, layer_id, quantizers):
 
     ## Begin SimpleNet_V2
     def my_linear(self, x, weight, bias):
-        out = x @ weight.weight + bias
+        # out = x @ weight.weight.T + bias # Use the version below as it is more stable
+        out = F.linear(x, weight.weight, bias)
         weight.add_batch(x)
         return out
 
@@ -308,6 +299,7 @@ def forward(self, x):
         x = x.view(x.size(0), -1)
         residual = x
 
+        #TODO: maybe we would need to transpose weight when building linear0_quant ?
         x = F.relu(self.linear0_quant(x))
         x = self.linear1_quant(x)
         x = F.relu(x) + residual
@@ -320,7 +312,6 @@ def forward(self, x):
 
 @torch.no_grad()
 def quantize_gptq_custom(model, train_loader):
-
     nb_layers = model.nb_layers
     is_last_layer = lambda x: x == (nb_layers - 1)
 
@@ -333,40 +324,44 @@ def quantize_gptq_custom(model, train_loader):
     outs = torch.zeros_like(inps)
 
     quantizers = {}
-    
+
     for layer_id in range(nb_layers):
         if not is_last_layer(layer_id):
 
             print(f"Quantizing layer {layer_id} ...")
 
+            bias = getattr(model, f"linear{layer_id}_b")
+
             model.alloc_gptq(layer_id)
 
             for i in range(nsamples):
-                outs[i] = model.my_linear(inps[i], model.gptq[f"linear{layer_id}_w"], model.w[f"linear{layer_id}_b"])
-            
+                outs[i] = model.my_linear(inps[i], model.gptq[f"linear{layer_id}_w"], bias)
+
             model.gptq[f"linear{layer_id}_w"].deactivate_add_batch_call = True
 
             model.fasterquant(layer_id, quantizers)
 
             for i in range(nsamples):
-                outs[i] = model.my_linear(inps[i], model.gptq[f"linear{layer_id}_w"], model.w[f"linear{layer_id}_b"])
-            
+                outs[i] = model.my_linear(inps[i], model.gptq[f"linear{layer_id}_w"], bias)
+
+            setattr(model, f"linear{layer_id}_w", nn.Parameter(model.gptq[f"linear{layer_id}_w"].weight))
             model.free_gptq()
 
             inps, outs = outs, inps
-    
+
     return quantizers
 
 def model_pack_custom(model, quantizers, wbits, groupsize):
     # Extract weights and biases from the model
-    is_weight = re.compile(r'^linear\d+_w$')
+    is_weight, is_bias = re.compile(r'^linear\d+_w$'), re.compile(r'^linear\d+_b$')
     weights, bias = OrderedDict(), OrderedDict()
-    for name, param in model.w.items():
-        if is_weight.match(name):
-            weights[name] = param
-        else:
-            bias[name] = param
+
+    for attr in dir(model):
+        if is_weight.match(attr):
+            weights[attr] = getattr(model, attr)
+        elif is_bias.match(attr):
+            bias[attr] = getattr(model, attr)
 
     make_quant_custom(model, quantizers, wbits, groupsize)
     qlayers = find_layers(model, [QuantLinear_custom])
@@ -383,13 +378,13 @@ def load_quant_custom(model, checkpoint, wbits, groupsize):
     print('Loading model ...')
     model = model.eval()
     # Extract weights and biases from the model
-    is_weight = re.compile(r'^linear\d+_w$')
+    is_weight, is_bias = re.compile(r'^linear\d+_w$'), re.compile(r'^linear\d+_b$')
     weights, bias = OrderedDict(), OrderedDict()
-    for name, param in model.w.items():
-        if is_weight.match(name):
-            weights[name] = param
-        else:
-            bias[name] = param
+    for attr in dir(model):
+        if is_weight.match(attr):
+            weights[attr] = getattr(model, attr)
+        elif is_bias.match(attr):
+            bias[attr] = getattr(model, attr)
 
     # Create linear layers out of weights and biases
     layers = {}
@@ -442,10 +437,6 @@ def assert_parameters(model, model_custom):
     criterion = nn.CrossEntropyLoss()
     train_loader, _, _ = MNISTloader(train_val_split=0.95).load()
 
-    #TODO: Do custom eval gptq
-    #TODO: Is reference GPTQ quantizing bias as well ?
-    #TODO: Add seed everywhere in GPTQ for reproducibility
-
     ## ================== REFERENCE ==================
     if args.train:
         model = SimpleNet()
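Note: after this change, quantize_gptq and quantize_gptq_custom follow the same layer-by-layer schedule, which is what makes them comparable at every step: run the layer to collect statistics, quantize, re-run with the quantized weight, then swap buffers. A runnable sketch of just that control flow (plain nn.Linear stand-ins; the GPTQ step itself is omitted):

    import torch
    import torch.nn as nn

    layers = [nn.Linear(8, 8) for _ in range(3)]  # last layer stays unquantized
    inps, outs = torch.randn(16, 8), torch.zeros(16, 8)

    for layer in layers[:-1]:
        with torch.no_grad():
            outs = layer(inps)   # 1) forward pass feeds the Hessian hooks
        # 2) quantize layer.weight in place here (GPTQ step, omitted)
        with torch.no_grad():
            outs = layer(inps)   # 3) recompute outputs with the quantized weight
        inps, outs = outs, inps  # 4) outputs become the next layer's inputs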
diff --git a/quantize/gptq/sanity_check_utils.py b/quantize/gptq/sanity_check_utils.py
index 0c0a6888..22e4a1f3 100644
--- a/quantize/gptq/sanity_check_utils.py
+++ b/quantize/gptq/sanity_check_utils.py
@@ -8,6 +8,7 @@
 from torch.utils.data import DataLoader, random_split
 from torchvision import datasets, transforms
 import math
+import struct
 
 def seed_everything(seed: int):
     random.seed(seed)
@@ -39,7 +40,6 @@ def forward(self, x):
         x = x.view(x.size(0), -1)
 
         residual = x
-
         x = F.relu(self.linear1(x))
         x = self.linear2(x)
         x = F.relu(x) + residual
@@ -95,11 +95,7 @@ def __init__(self, num_classes=10):
         self.linear3_b = nn.Parameter(torch.nn.init.uniform_(torch.empty(num_classes), -bound, bound))
 
         self.w = {}
-        self.nb_layers = 0
-        for i in range(0, 4):
-            self.w[f"linear{i}_w"] = getattr(self, f"linear{i}_w")
-            self.w[f"linear{i}_b"] = getattr(self, f"linear{i}_b")
-            self.nb_layers += 1
+        self.nb_layers = 4
 
     def my_linear(self, x, weight, bias):
         # return x @ weight.t() + bias.
@@ -252,3 +248,70 @@ def train(num_epochs, model, optimizer, criterion, train_loader, device):
         info = "Epoch: {:3}/{} \t train_loss: {:.3f} \t train_acc: {:.3f}"
 
         print(info.format(epoch + 1, num_epochs, train_loss, train_acc))
+
+def write_bin(filename, array):
+    from functools import reduce
+    # Map numpy dtypes to struct format characters. On the endianness struct uses, see:
+    # https://stackoverflow.com/questions/23831422/what-endianness-does-python-use-to-write-into-files
+    dtype_to_format = {
+        np.int8: 'i',
+        np.int16: 'i',
+        np.int32: 'i',
+        np.int64: 'i',
+        np.uint8: 'I',
+        np.uint16: 'I',
+        np.uint32: 'I',
+        np.uint64: 'I',
+        np.float16: 'f',
+        np.float32: 'f',
+        np.float64: 'd'
+    }
+    fmt = dtype_to_format[array.dtype.type]
+    shapes = list(array.shape)
+    with open(filename, "wb") as f:
+        # header: number of dimensions, then each dimension (uint32 each)
+        f.write(struct.pack('I', len(shapes)))
+        for shape in shapes:
+            f.write(struct.pack('I', shape))
+        # one character describing the payload format
+        f.write(struct.pack('c', bytes(fmt, 'utf-8')))
+        # payload: the flat array in row-major (C) order
+        f.write(struct.pack(f"{fmt}"*(reduce(lambda x, y: x * y, shapes)), *array.flatten(order="C").tolist()))
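Note: write_bin lays a tensor out as [ndim: uint32][ndim x uint32 dims][one format char][flat payload]. A standalone sketch of decoding such a header with struct (hand-built bytes, not a real file):

    import struct

    header = struct.pack('I', 2) + struct.pack('II', 2, 3) + b'f'  # ndim=2, dims=(2, 3), float payload
    (ndim,) = struct.unpack('I', header[:4])
    dims = struct.unpack('I' * ndim, header[4:4 + 4 * ndim])
    fmt = chr(header[4 + 4 * ndim])
    assert (ndim, dims, fmt) == (2, (2, 3), 'f')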
+
+def read_bin(filename):
+    # Reading packed structs back element by element:
+    # https://qiita.com/madaikiteruyo/items/dadc99aa29f7eae0cdd0
+    format_to_byte = {
+        'c': 1,
+        'i': 4,
+        'I': 4,
+        'f': 4,
+        'd': 8
+    }
+
+    data = []
+    dims, fmt = None, None
+    with open(filename, "rb") as f:
+        # read the number of dimensions (uint32, 4 bytes)
+        byte = f.read(format_to_byte['I'])
+        if byte == b'':
+            raise Exception("read_bin: Empty binary")
+        else:
+            nb_dim = struct.unpack('I', byte)
+
+        # Read dims
+        byte = f.read(nb_dim[0] * format_to_byte['I'])
+        dims = struct.unpack('I'*nb_dim[0], byte)
+        # Read the format character
+        byte = f.read(1)
+        if byte == b'':
+            raise Exception("read_bin: Empty binary")
+        else:
+            fmt = chr(struct.unpack('c', byte)[0][0])
+
+        if len(fmt) != 1: raise Exception("read_bin: No format dumped in binary")
+
+        # Read the payload one element at a time
+        while True:
+            byte = f.read(format_to_byte[fmt])
+            if byte == b'':
+                break
+            else:
+                data.append(struct.unpack(fmt, byte)[0])
+
+    return np.array(data).reshape(*dims)
\ No newline at end of file
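A quick round-trip check of the two helpers (assuming write_bin and read_bin are importable from sanity_check_utils as added above):

    import numpy as np
    from sanity_check_utils import write_bin, read_bin

    a = np.arange(6, dtype=np.float32).reshape(2, 3)
    write_bin("a.bin", a)
    b = read_bin("a.bin")
    assert b.shape == (2, 3) and np.allclose(a, b)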