Added BatchNorm1d Layer | Each layer now has affine bool attribute
rrmina committed Apr 1, 2019
1 parent d2bdd65 commit 7319171
Showing 2 changed files with 96 additions and 14 deletions.
98 changes: 90 additions & 8 deletions eureka/nn.py
@@ -2,6 +2,13 @@
from .initializer import initialize_weight
from .activation import relu, sigmoid, sigmoid_prime, softmax, tanh, tanh_prime, leaky_relu, elu

# Base Class for nn layers/modules
class BaseLayer(object):
def __init__(self):
# Affine means that the layer has learnable parameters
# (i.e. weight and bias for Linear/Conv, or gamma and beta for BatchNorm)
self.affine = False

# Useful object for stacking of layers
class Sequential(object):
# Initialization of Sequential Stack
@@ -34,8 +41,11 @@ def test(self):
self.train_mode = False

# Fully-connected Layer
class Linear(object):
class Linear(BaseLayer):
def __init__(self, in_features, out_features, initializer="xavier"):
super(Linear, self).__init__()
self.affine = True

# Initialize layer type
self.layer_type = "nn.Linear"

@@ -61,8 +71,9 @@ def backward(self, dh):
return np.dot(dh, self.w.T)

# Dropout Layer
class Dropout(object):
class Dropout(BaseLayer):
def __init__(self, drop_prob):
super(Dropout, self).__init__()
# Initialize layer type
self.layer_type = "nn.Dropout"

@@ -83,9 +94,75 @@ def forward(self, x):
def backward(self, da):
return da * self.mask

# Normalization Layers
class BatchNorm1d(BaseLayer):
"""
Reference: https://kevinzakka.github.io/2016/09/14/batch_normalization/
To stay consistent with the naming convention of layers with affine parameters (e.g. nn.Linear),
gamma and beta are stored as w and b.
This lets the optimizers in optim.py reuse the same update code for every affine layer,
instead of special-casing each naming convention.
"""
def __init__(self, num_features, epsilon=1e-8, affine=False):
super(BatchNorm1d, self).__init__()
self.affine = affine

# Layer type
self.layer_type = "nn.BatchNorm1d"

# Hyperparameters
self.epsilon = epsilon

# Class variables
self.x_hat = None
self.u = None
self.std = None
self.m = None # batch size (set in forward)

# Affine (Learnable) parameters
self.w = np.ones((1, num_features)) # gamma
self.b = np.zeros((1, num_features)) # beta

# These gradient and moment buffers are only needed when the affine parameters are learnable;
# vw/vb and sw/sb are the 1st- and 2nd-moment accumulators consumed by the optimizers in optim.py
if (self.affine):
self.dw, self.db = np.zeros((1, num_features)), np.zeros((1, num_features))
self.vw, self.vb = np.zeros((1, num_features)), np.zeros((1, num_features))
self.sw, self.sb = np.zeros((1, num_features)), np.zeros((1, num_features))

def forward(self, x):
# Class variables
self.m = x.shape[0] # batch size

# Mean per feature over minibatch
self.u = np.mean(x, axis=0)

# Standard Deviation per feature over minibatch
self.std = np.sqrt(np.var(x, axis=0) + self.epsilon)

# Normalize
self.x_hat = (x - self.u)/self.std

# Scale and Shift
out = self.x_hat * self.w + self.b

return out

def backward(self, d_bn_out):
# Gradient with respect to affine parameters
if (self.affine):
self.db = np.sum(d_bn_out, axis=0) # dbeta
self.dw = np.sum(d_bn_out*self.x_hat, axis=0) # dgamma

# Gradient of loss with respect to BN-layer input x
dx_hat = d_bn_out * self.w
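# Compact form of the BatchNorm backward pass (see the reference above):
# dx = (1 / (m * std)) * (m*dx_hat - sum(dx_hat) - x_hat * sum(dx_hat * x_hat)),
# where the sums run over the batch dimension; this single expression folds together
# the gradients flowing through the mean and the variance.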
numerator = self.m * dx_hat - np.sum(dx_hat, axis=0) - self.x_hat*np.sum(dx_hat*self.x_hat, axis=0)
dx = numerator/(self.m * self.std)

return dx

# Activation Functions
class ReLU(object):
class ReLU(BaseLayer):
def __init__(self):
super(ReLU, self).__init__()
self.layer_type = "activation.ReLU"
self.relu_prime = None

@@ -98,8 +175,9 @@ def backward(self, da):
self.relu_prime[self.relu_prime > 0] = 1
return da * self.relu_prime

class Sigmoid(object):
class Sigmoid(BaseLayer):
def __init__(self):
super(Sigmoid, self).__init__()
self.layer_type = "activation.Sigmoid"
self.sigmoid_out = None

@@ -110,8 +188,9 @@ def forward(self, x):
def backward(self, da):
return da * sigmoid_prime(self.sigmoid_out)

class Softmax(object):
class Softmax(BaseLayer):
def __init__(self):
super(Softmax, self).__init__()
self.layer_type = "activation.Softmax"
self.softmax_out = None

@@ -122,8 +201,9 @@ def forward(self, x):
def backward(self, y):
return self.softmax_out - y

class Tanh(object):
class Tanh(BaseLayer):
def __init__(self):
super(Tanh, self).__init__()
self.layer_type = "activation.Tanh"
self.tanh_out = None

@@ -134,8 +214,9 @@ def forward(self, x):
def backward(self, da):
return da * tanh_prime(self.tanh_out)

class LeakyReLU(object):
class LeakyReLU(BaseLayer):
def __init__(self):
super(LeakyReLU, self).__init__()
self.layer_type = "activation.LeakyReLU"
self.neg_indices = None
self.pos_indices = None
@@ -152,8 +233,9 @@ def backward(self, da):
self.leaky_relu_prime[self.neg_indices] = 0.01
return da * self.leaky_relu_prime

class ELU(object):
class ELU(BaseLayer):
def __init__(self, alpha=1.0):
super(ELU, self).__init__()
self.layer_type = "activation.ELU"
self.alpha = np.float32(alpha)
self.neg_indices = None
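
A quick sanity check of the new layer, as a minimal sketch that uses only the BatchNorm1d interface shown in this diff (the eureka.nn import path is assumed from the file location): after forward, each output feature should have mean near 0 and standard deviation near 1, and backward should return a gradient with the same shape as the input.

import numpy as np
from eureka.nn import BatchNorm1d            # assumed import path (file: eureka/nn.py)

np.random.seed(0)
x = np.random.randn(32, 10) * 5.0 + 3.0      # batch of 32 samples, 10 features

bn = BatchNorm1d(10, affine=True)
out = bn.forward(x)

print(np.abs(out.mean(axis=0)).max())        # ~0: every feature is centered
print(np.abs(out.std(axis=0) - 1.0).max())   # ~0: every feature has unit std

dx = bn.backward(np.ones_like(out))          # dummy upstream gradient
print(dx.shape)                              # (32, 10), same shape as the input
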
12 changes: 6 additions & 6 deletions eureka/optim.py
@@ -8,7 +8,7 @@ def __init__(self, model_instance, lr=0.001):

def step(self):
for layer in self.model_instance.layers:
if (layer.layer_type == "nn.Linear"):
if (layer.affine):
# Weight Update
layer.w -= self.lr * layer.dw
layer.b -= self.lr * layer.db
@@ -21,7 +21,7 @@ def __init__(self, model_instance, lr=0.01, beta_1=0.9):

def step(self):
for layer in self.model_instance.layers:
if (layer.layer_type == "nn.Linear"):
if (layer.affine):
# Compute 1st moment
layer.vw = self.beta_1 * layer.vw + (1-self.beta_1) * layer.dw
layer.vb = self.beta_1 * layer.vb + (1-self.beta_1) * layer.db
@@ -39,7 +39,7 @@ def __init__(self, model_instance, lr=0.0001, beta_2=0.999, epsilon=1e-8):

def step(self):
for layer in self.model_instance.layers:
if (layer.layer_type == "nn.Linear"):
if (layer.affine):
# Compute 2nd moment
layer.sw = self.beta_2 * layer.sw + (1-self.beta_2) * layer.dw * layer.dw
layer.sb = self.beta_2 * layer.sb + (1-self.beta_2) * layer.db * layer.db
@@ -58,7 +58,7 @@ def __init__(self, model_instance, lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=

def step(self):
for layer in self.model_instance.layers:
if (layer.layer_type == "nn.Linear"):
if (layer.affine):
# Compute 1st moment
layer.vw = self.beta_1 * layer.vw + (1-self.beta_1) * layer.dw
layer.vb = self.beta_1 * layer.vb + (1-self.beta_1) * layer.db
@@ -81,8 +81,8 @@ def __init__(self, model_instance, lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e

def step(self):
for layer in self.model_instance.layers:
if (layer.layer_type == "nn.Linear"):

if (layer.affine):
# Compute 1st moment
# This is a wrong mistake! See what I did there? negative * negative = positive?
# It turns out that this 1st moment computation produces empirically good results WOW Eureka!
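
The point of the affine flag is that step() no longer needs to know which concrete layer it is updating: anything that exposes affine, w/b, and dw/db gets the same treatment, so BatchNorm1d's gamma/beta (stored as w/b) are trained by the existing optimizers without extra code. A minimal sketch of that idea, with stand-in gradients instead of a real backward pass and an assumed eureka.nn import path:

import numpy as np
from eureka.nn import Linear, BatchNorm1d    # assumed import path

layers = [Linear(4, 3), BatchNorm1d(3, affine=True)]

# stand-in gradients of the right shapes, in place of a real backward pass
for layer in layers:
    layer.dw = 0.1 * np.random.randn(*layer.w.shape)
    layer.db = 0.1 * np.random.randn(*layer.b.shape)

lr = 0.01
for layer in layers:
    if layer.affine:                         # replaces the old layer_type == "nn.Linear" check
        layer.w -= lr * layer.dw
        layer.b -= lr * layer.db
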
