Added BatchNorm1d Layer | Each layer now has affine bool attribute
rrmina committed Apr 1, 2019
1 parent d2bdd65 commit 7319171
Showing 2 changed files with 96 additions and 14 deletions.
98 changes: 90 additions & 8 deletions eureka/nn.py
@@ -2,6 +2,13 @@
from .initializer import initialize_weight
from .activation import relu, sigmoid, sigmoid_prime, softmax, tanh, tanh_prime, leaky_relu, elu

# Base Class for nn layers/modules
class BaseLayer(object):
def __init__(self):
# Affine means that the layer has learnable parameters
# (i.e. weight and bias for Linear/Conv, or gamma and beta for BatchNorm)
self.affine = False

# Useful object for stacking of layers
class Sequential(object):
# Initialization of Sequential Stack
@@ -34,8 +41,11 @@ def test(self):
self.train_mode = False

# Fully-connected Layer
class Linear(object):
class Linear(BaseLayer):
def __init__(self, in_features, out_features, initializer="xavier"):
super(Linear, self).__init__()
self.affine = True

# Initialize layer type
self.layer_type = "nn.Linear"

@@ -61,8 +71,9 @@ def backward(self, dh):
return np.dot(dh, self.w.T)

# Dropout Layer
class Dropout(object):
class Dropout(BaseLayer):
def __init__(self, drop_prob):
super(Dropout, self).__init__()
# Initialize layer type
self.layer_type = "nn.Dropout"

@@ -83,9 +94,75 @@ def forward(self, x):
def backward(self, da):
return da * self.mask

# Normalization Layers
class BatchNorm1d(BaseLayer):
"""
Reference: https://kevinzakka.github.io/2016/09/14/batch_normalization/
To stay consistent with the naming convention of layers with affine parameters (e.g. nn.Linear),
gamma and beta are stored as w and b.
This lets the optimizers in optim.py reuse the same update code for every affine layer,
instead of special-casing each naming convention.
"""
def __init__(self, num_features, epsilon=1e-8, affine=False):
super(BatchNorm1d, self).__init__()
self.affine = affine

# Layer type
self.layer_type = "nn.BatchNorm1d"

# Hyperparameters
self.epsilon = epsilon

# Class variables
self.x_hat = None
self.u = None
self.std = None
self.m = None # batch size (set in forward)

# Affine (Learnable) parameters
self.w = np.ones((1, num_features)) # gamma
self.b = np.zeros((1, num_features)) # beta

# These gradient and moment buffers are only needed when the affine parameters are learnable;
# vw/vb and sw/sb are the 1st- and 2nd-moment accumulators consumed by the optimizers in optim.py
if (self.affine):
self.dw, self.db = np.zeros((1, num_features)), np.zeros((1, num_features))
self.vw, self.vb = np.zeros((1, num_features)), np.zeros((1, num_features))
self.sw, self.sb = np.zeros((1, num_features)), np.zeros((1, num_features))

def forward(self, x):
# Class variables
self.m = x.shape[0] # batch size

# Mean per feature over minibatch
self.u = np.mean(x, axis=0)

# Standard Deviation per feature over minibatch
self.std = np.sqrt(np.var(x, axis=0) + self.epsilon)

# Normalize
self.x_hat = (x - self.u)/self.std

# Scale and Shift
out = self.x_hat * self.w + self.b

return out

def backward(self, d_bn_out):
# Gradient with respect to affine parameters
if (self.affine):
self.db = np.sum(d_bn_out, axis=0) # dbeta
self.dw = np.sum(d_bn_out*self.x_hat, axis=0) # dgamma

# Gradient of loss with respect to BN-layer input x
dx_hat = d_bn_out * self.w
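# Compact form of the BatchNorm backward pass (see the reference above):
# dx = (1 / (m * std)) * (m*dx_hat - sum(dx_hat) - x_hat * sum(dx_hat * x_hat)),
# where the sums run over the batch dimension; this single expression folds together
# the gradients flowing through the mean and the variance.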
numerator = self.m * dx_hat - np.sum(dx_hat, axis=0) - self.x_hat*np.sum(dx_hat*self.x_hat, axis=0)
dx = numerator/(self.m * self.std)

return dx

# Activation Functions
class ReLU(object):
class ReLU(BaseLayer):
def __init__(self):
super(ReLU, self).__init__()
self.layer_type = "activation.ReLU"
self.relu_prime = None

@@ -98,8 +175,9 @@ def backward(self, da):
self.relu_prime[self.relu_prime > 0] = 1
return da * self.relu_prime

class Sigmoid(object):
class Sigmoid(BaseLayer):
def __init__(self):
super(Sigmoid, self).__init__()
self.layer_type = "activation.Sigmoid"
self.sigmoid_out = None

@@ -110,8 +188,9 @@ def forward(self, x):
def backward(self, da):
return da * sigmoid_prime(self.sigmoid_out)

class Softmax(object):
class Softmax(BaseLayer):
def __init__(self):
super(Softmax, self).__init__()
self.layer_type = "activation.Softmax"
self.softmax_out = None

@@ -122,8 +201,9 @@ def forward(self, x):
def backward(self, y):
return self.softmax_out - y

class Tanh(object):
class Tanh(BaseLayer):
def __init__(self):
super(Tanh, self).__init__()
self.layer_type = "activation.Tanh"
self.tanh_out = None

@@ -134,8 +214,9 @@ def forward(self, x):
def backward(self, da):
return da * tanh_prime(self.tanh_out)

class LeakyReLU(object):
class LeakyReLU(BaseLayer):
def __init__(self):
super(LeakyReLU, self).__init__()
self.layer_type = "activation.LeakyReLU"
self.neg_indices = None
self.pos_indices = None
@@ -152,8 +233,9 @@ def backward(self, da):
self.leaky_relu_prime[self.neg_indices] = 0.01
return da * self.leaky_relu_prime

class ELU(object):
class ELU(BaseLayer):
def __init__(self, alpha=1.0):
super(ELU, self).__init__()
self.layer_type = "activation.ELU"
self.alpha = np.float32(alpha)
self.neg_indices = None
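
A quick sanity check of the new layer, as a minimal sketch that uses only the BatchNorm1d interface shown in this diff (the eureka.nn import path is assumed from the file location): after forward, each output feature should have mean near 0 and standard deviation near 1, and backward should return a gradient with the same shape as the input.

import numpy as np
from eureka.nn import BatchNorm1d            # assumed import path (file: eureka/nn.py)

np.random.seed(0)
x = np.random.randn(32, 10) * 5.0 + 3.0      # batch of 32 samples, 10 features

bn = BatchNorm1d(10, affine=True)
out = bn.forward(x)

print(np.abs(out.mean(axis=0)).max())        # ~0: every feature is centered
print(np.abs(out.std(axis=0) - 1.0).max())   # ~0: every feature has unit std

dx = bn.backward(np.ones_like(out))          # dummy upstream gradient
print(dx.shape)                              # (32, 10), same shape as the input
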
12 changes: 6 additions & 6 deletions eureka/optim.py
@@ -8,7 +8,7 @@ def __init__(self, model_instance, lr=0.001):

def step(self):
for layer in self.model_instance.layers:
if (layer.layer_type == "nn.Linear"):
if (layer.affine):
# Weight Update
layer.w -= self.lr * layer.dw
layer.b -= self.lr * layer.db
@@ -21,7 +21,7 @@ def __init__(self, model_instance, lr=0.01, beta_1=0.9):

def step(self):
for layer in self.model_instance.layers:
if (layer.layer_type == "nn.Linear"):
if (layer.affine):
# Compute 1st moment
layer.vw = self.beta_1 * layer.vw + (1-self.beta_1) * layer.dw
layer.vb = self.beta_1 * layer.vb + (1-self.beta_1) * layer.db
@@ -39,7 +39,7 @@ def __init__(self, model_instance, lr=0.0001, beta_2=0.999, epsilon=1e-8):

def step(self):
for layer in self.model_instance.layers:
if (layer.layer_type == "nn.Linear"):
if (layer.affine):
# Compute 2nd moment
layer.sw = self.beta_2 * layer.sw + (1-self.beta_2) * layer.dw * layer.dw
layer.sb = self.beta_2 * layer.sb + (1-self.beta_2) * layer.db * layer.db
@@ -58,7 +58,7 @@ def __init__(self, model_instance, lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=

def step(self):
for layer in self.model_instance.layers:
if (layer.layer_type == "nn.Linear"):
if (layer.affine):
# Compute 1st moment
layer.vw = self.beta_1 * layer.vw + (1-self.beta_1) * layer.dw
layer.vb = self.beta_1 * layer.vb + (1-self.beta_1) * layer.db
@@ -81,8 +81,8 @@ def __init__(self, model_instance, lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e

def step(self):
for layer in self.model_instance.layers:
if (layer.layer_type == "nn.Linear"):

if (layer.affine):
# Compute 1st moment
# This is a wrong mistake! See what I did there? negative * negative = positive?
# It turns out that this 1st moment computation produces empirically good results WOW Eureka!
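
The point of the affine flag is that step() no longer needs to know which concrete layer it is updating: anything that exposes affine, w/b, and dw/db gets the same treatment, so BatchNorm1d's gamma/beta (stored as w/b) are trained by the existing optimizers without extra code. A minimal sketch of that idea, with stand-in gradients instead of a real backward pass and an assumed eureka.nn import path:

import numpy as np
from eureka.nn import Linear, BatchNorm1d    # assumed import path

layers = [Linear(4, 3), BatchNorm1d(3, affine=True)]

# stand-in gradients of the right shapes, in place of a real backward pass
for layer in layers:
    layer.dw = 0.1 * np.random.randn(*layer.w.shape)
    layer.db = 0.1 * np.random.randn(*layer.b.shape)

lr = 0.01
for layer in layers:
    if layer.affine:                         # replaces the old layer_type == "nn.Linear" check
        layer.w -= lr * layer.dw
        layer.b -= lr * layer.db
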
