support for BatchNorm1d
eljanmahammadli committed Nov 19, 2023
1 parent 6067fda commit 9864d65
Showing 3 changed files with 85 additions and 12 deletions.
2 changes: 1 addition & 1 deletion gradipy/nn/__init__.py
@@ -6,4 +6,4 @@
    init_xavier_uniform,
)
from . import optim
from .modules import Linear, Conv2d
from .modules import Linear, Conv2d, BatchNorm1d
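
With this re-export, the new layer sits next to Linear and Conv2d on the package surface. A minimal sketch of the resulting import, assuming the package is installed as gradipy (nothing beyond the re-export above is confirmed by this diff):

```python
# Sketch only: imports assume the package layout shown in this commit.
from gradipy.nn import Linear, Conv2d, BatchNorm1d

bn = BatchNorm1d(10)  # constructed the same way as in the new test below
```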
60 changes: 49 additions & 11 deletions gradipy/nn/modules.py
@@ -6,13 +6,16 @@

class Module(ABC):
    def __init__(self) -> None:
        self.parameters = []
        self.y = None

    @abstractmethod
    def forward(self) -> Tensor:
        pass

    @abstractmethod
    def parameters(self) -> list:
        pass

    def backward(self) -> Tensor:
        self.y.backward()
@@ -25,17 +25,20 @@ def __call__(self, *args) -> Tensor:


class Linear(Module):
    def __init__(self, in_features: int, out_features: int) -> None:
    def __init__(self, in_features: int, out_features: int, bias: bool = False) -> None:
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.bias = bias
        # TODO: this is fixed for ReLU; consider using PyTorch's init
        self.weight = init_kaiming_normal(in_features, out_features)
        self.parameters = [self.weight]
        self.y = None

    def forward(self, x: Tensor) -> Tensor:
        self.y = x.matmul(self.weight)
        return self.y
        # TODO: implement bias
        return x.matmul(self.weight)

    def parameters(self) -> list:
        return [self.weight] + ([] if self.bias is False else [self.bias])


class Conv2d(Module):
@@ -44,19 +44,50 @@ def __init__(
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        padding: int,
        stride: int = 1,
        padding: int = 0,
        bias: bool = False,
    ) -> None:
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.bias = bias
        # TODO: implement better init for conv2d. Is kaiming normal good enough?
        # TODO: implement bias
        self.weight = Tensor(np.random.randn(out_channels, in_channels, kernel_size, kernel_size))
        self.parameters = [self.weight]

    def forward(self, x: Tensor) -> Tensor:
        self.y = x.conv2d(self.weight, None, self.stride, self.padding)
        return self.y
        return x.conv2d(self.weight, None, self.stride, self.padding)

    def parameters(self) -> list:
        return [self.weight] + ([] if self.bias is False else [self.bias])


class BatchNorm1d(Module):
    def __init__(self, num_features: int, eps: float = 1e-5, momentum: float = 0.1) -> None:
        super().__init__()
        self.eps = eps
        self.momentum = momentum
        self.weight = np.ones(num_features, dtype=np.float32)
        self.bias = np.zeros(num_features, dtype=np.float32)
        self.running_mean = np.zeros(num_features, dtype=np.float32)
        self.running_var = np.ones(num_features, dtype=np.float32)
        self.training = True

    def forward(self, x: Tensor) -> Tensor:
        if self.training:
            xmean = x.data.mean(axis=0)
            xvar = x.data.var(axis=0)  # biased variance (np.var default ddof=0)
        else:
            xmean = self.running_mean
            xvar = self.running_var
        if self.training:
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar  # PyTorch tracks the unbiased variance here
        out = self.weight * ((x.data - xmean) / np.sqrt(xvar + self.eps)) + self.bias
        return Tensor(out)  # what are the children of this tensor?

    def parameters(self) -> list:
        return [Tensor(self.weight), Tensor(self.bias)]
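
Since the diff carries no prose, here is a hedged usage sketch of the new BatchNorm1d, mirroring the construction in the test file below. The import paths and the manual bn.training = False toggle are assumptions read off this diff (no eval() helper is shown), not a documented API:

```python
# Hedged sketch of using the BatchNorm1d added in this commit.
import numpy as np
from gradipy.tensor import Tensor  # assumed location of Tensor
import gradipy.nn as nn

x = Tensor(np.random.randn(32, 10).astype(np.float32))
bn = nn.BatchNorm1d(10)

# Training mode (the default): normalize with batch statistics and
# update running_mean / running_var with momentum 0.1.
out = bn(x)
print(out.data.mean(axis=0))  # roughly zero per feature
print(bn.running_mean[:3])    # nudged toward the batch mean

# Inference: no eval() exists in this diff, so the flag is flipped directly
# and the stored running statistics are used instead of the batch ones.
bn.training = False
out_eval = bn(x)
```

The forward returns Tensor(out) built from raw NumPy data, which is presumably what the in-code question about the tensor's children refers to.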
35 changes: 35 additions & 0 deletions test/test_tensor_ops.py
@@ -218,3 +218,38 @@ def test_gradipy():

    for x, y in zip(test_pytorch(), test_gradipy()):
        np.testing.assert_allclose(x, y)


def test_BatchNorm1d():
    ii = np.random.randn(32, 10).astype(np.float32)
    num_features = ii.shape[-1]

    def test_pytorch():
        i = torch.from_numpy(ii)
        bn = ptnn.BatchNorm1d(num_features)
        o = bn(i)
        return (
            o.detach().numpy(),
            bn.weight.detach().numpy(),
            bn.bias.detach().numpy(),
            bn.running_mean.numpy(),
            # bn.running_var.numpy(),
        )

    def test_gradipy():
        i = Tensor(ii)
        bn = nn.BatchNorm1d(num_features)
        o = bn(i)
        return (
            o.data,
            bn.weight.data,
            bn.bias.data,
            bn.running_mean,
            # bn.running_var,
            # running_var differs: PyTorch updates it with the unbiased batch variance.
        )

    for x, y in zip(test_pytorch(), test_gradipy()):
        np.testing.assert_allclose(x, y, atol=1e-5)
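
The commented-out running_var comparison is the one place the two implementations knowingly disagree: PyTorch normalizes with the biased batch variance but updates running_var with the unbiased (ddof=1) estimate, while the gradipy forward reuses the biased np.var value for both. A small standalone check of that fact (the 32×10 shape just mirrors the test above):

```python
# Why the running_var assertion is commented out: PyTorch's BatchNorm1d tracks
# the unbiased batch variance in running_var, not the biased one used to normalize.
import numpy as np
import torch
import torch.nn as ptnn

ii = np.random.randn(32, 10).astype(np.float32)
bn = ptnn.BatchNorm1d(ii.shape[-1])
bn(torch.from_numpy(ii))  # one forward pass in training mode

momentum = 0.1  # PyTorch's default, same as the gradipy layer
unbiased = ii.var(axis=0, ddof=1)
biased = ii.var(axis=0)  # what the gradipy forward feeds into its running_var

# PyTorch's running_var matches the unbiased update...
np.testing.assert_allclose(
    bn.running_var.numpy(), (1 - momentum) * 1.0 + momentum * unbiased, atol=1e-5
)
# ...and not the biased one, hence the skipped assertion in the test.
assert not np.allclose(bn.running_var.numpy(), (1 - momentum) * 1.0 + momentum * biased)
```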
