diff --git a/eureka/nn.py b/eureka/nn.py
index cd68eea..4f4d2e2 100644
--- a/eureka/nn.py
+++ b/eureka/nn.py
@@ -2,6 +2,13 @@
 from .initializer import initialize_weight
 from .activation import relu, sigmoid, sigmoid_prime, softmax, tanh, tanh_prime, leaky_relu, elu
 
+# Base class for nn layers/modules
+class BaseLayer(object):
+    def __init__(self):
+        # Affine means that the layer has learnable parameters
+        # (i.e. weight and bias for Linear/Conv, gamma and beta for BatchNorm)
+        self.affine = False
+
 # Useful object for stacking of layers
 class Sequential(object):
     # Initialization of Sequential Stack
@@ -34,8 +41,11 @@ def test(self):
         self.train_mode = False
 
 # Fully-connected Layer
-class Linear(object):
+class Linear(BaseLayer):
     def __init__(self, in_features, out_features, initializer="xavier"):
+        super(Linear, self).__init__()
+        self.affine = True
+
         # Initialize layer type
         self.layer_type = "nn.Linear"
 
@@ -61,8 +71,9 @@ def backward(self, dh):
         return np.dot(dh, self.w.T)
 
# Dropout Layer
-class Dropout(object):
+class Dropout(BaseLayer):
     def __init__(self, drop_prob):
+        super(Dropout, self).__init__()
         # Initialize layer type
         self.layer_type = "nn.Dropout"
 
@@ -83,9 +94,75 @@ def forward(self, x):
     def backward(self, da):
         return da * self.mask
 
+# Normalization Layers
+class BatchNorm1d(BaseLayer):
+    """
+    Reference: https://kevinzakka.github.io/2016/09/14/batch_normalization/
+    To be consistent with the naming convention of layers with affine parameters (e.g. nn.Linear),
+    we rename gamma and beta to w and b.
+    This also avoids duplicating the gradient-descent code in optim for a different naming convention.
+    """
+    def __init__(self, num_features, epsilon=1e-8, affine=False):
+        super(BatchNorm1d, self).__init__()
+        self.affine = affine
+
+        # Layer type
+        self.layer_type = "nn.BatchNorm1d"
+
+        # Hyperparameters
+        self.epsilon = epsilon
+
+        # Class variables
+        self.x_hat = None
+        self.u = None
+        self.std = None
+        self.m = None  # batch size
+
+        # Affine (learnable) parameters
+        self.w = np.ones((1, num_features))   # gamma
+        self.b = np.zeros((1, num_features))  # beta
+
+        # Gradient and moment buffers are only needed when the affine parameters are learnable
+        if (self.affine):
+            self.dw, self.db = np.zeros((1, num_features)), np.zeros((1, num_features))
+            self.vw, self.vb = np.zeros((1, num_features)), np.zeros((1, num_features))
+            self.sw, self.sb = np.zeros((1, num_features)), np.zeros((1, num_features))
+
+    def forward(self, x):
+        # Batch size
+        self.m = x.shape[0]
+
+        # Mean per feature over minibatch
+        self.u = np.mean(x, axis=0)
+
+        # Standard deviation per feature over minibatch
+        self.std = np.sqrt(np.var(x, axis=0) + self.epsilon)
+
+        # Normalize
+        self.x_hat = (x - self.u) / self.std
+
+        # Scale and shift
+        out = self.x_hat * self.w + self.b
+
+        return out
+
+    def backward(self, d_bn_out):
+        # Gradients w.r.t. the affine parameters, stored as dw/db so optim can treat this layer like nn.Linear
+        if (self.affine):
+            self.db = np.sum(d_bn_out, axis=0)               # gradient w.r.t. beta
+            self.dw = np.sum(d_bn_out * self.x_hat, axis=0)  # gradient w.r.t. gamma
+
+        # Gradient of the loss with respect to the BN-layer input x
+        dx_hat = d_bn_out * self.w
+        numerator = self.m * dx_hat - np.sum(dx_hat, axis=0) - self.x_hat * np.sum(dx_hat * self.x_hat, axis=0)
+        dx = numerator / (self.m * self.std)
+
+        return dx
+
 # Activation Functions
-class ReLU(object):
+class ReLU(BaseLayer):
     def __init__(self):
+        super(ReLU, self).__init__()
         self.layer_type = "activation.ReLU"
         self.relu_prime = None
 
@@ -98,8 +175,9 @@ def backward(self, da):
         self.relu_prime[self.relu_prime > 0] = 1
         return da * self.relu_prime
 
-class Sigmoid(object):
+class Sigmoid(BaseLayer):
     def __init__(self):
+        super(Sigmoid, self).__init__()
         self.layer_type = "activation.Sigmoid"
         self.sigmoid_out = None
 
@@ -110,8 +188,9 @@ def forward(self, x):
     def backward(self, da):
         return da * sigmoid_prime(self.sigmoid_out)
 
-class Softmax(object):
+class Softmax(BaseLayer):
     def __init__(self):
+        super(Softmax, self).__init__()
         self.layer_type = "activation.Softmax"
         self.softmax_out = None
 
@@ -122,8 +201,9 @@ def forward(self, x):
     def backward(self, y):
         return self.softmax_out - y
 
-class Tanh(object):
+class Tanh(BaseLayer):
     def __init__(self):
+        super(Tanh, self).__init__()
         self.layer_type = "activation.Tanh"
         self.tanh_out = None
 
@@ -134,8 +214,9 @@ def forward(self, x):
     def backward(self, da):
         return da * tanh_prime(self.tanh_out)
 
-class LeakyReLU(object):
+class LeakyReLU(BaseLayer):
     def __init__(self):
+        super(LeakyReLU, self).__init__()
         self.layer_type = "activation.LeakyReLU"
         self.neg_indices = None
         self.pos_indices = None
@@ -152,8 +233,9 @@ def backward(self, da):
         self.leaky_relu_prime[self.neg_indices] = 0.01
         return da * self.leaky_relu_prime
 
-class ELU(object):
+class ELU(BaseLayer):
     def __init__(self, alpha=1.0):
+        super(ELU, self).__init__()
         self.layer_type = "activation.ELU"
         self.alpha = np.float32(alpha)
         self.neg_indices = None
diff --git a/eureka/optim.py b/eureka/optim.py
index c3da222..a3a44d5 100644
--- a/eureka/optim.py
+++ b/eureka/optim.py
@@ -8,7 +8,7 @@ def __init__(self, model_instance, lr=0.001):
 
     def step(self):
         for layer in self.model_instance.layers:
-            if (layer.layer_type == "nn.Linear"):
+            if (layer.affine):
                 # Weight Update
                 layer.w -= self.lr * layer.dw
                 layer.b -= self.lr * layer.db
@@ -21,7 +21,7 @@ def __init__(self, model_instance, lr=0.01, beta_1=0.9):
 
     def step(self):
         for layer in self.model_instance.layers:
-            if (layer.layer_type == "nn.Linear"):
+            if (layer.affine):
                 # Compute 1st moment
                 layer.vw = self.beta_1 * layer.vw + (1-self.beta_1) * layer.dw
                 layer.vb = self.beta_1 * layer.vb + (1-self.beta_1) * layer.db
@@ -39,7 +39,7 @@ def __init__(self, model_instance, lr=0.0001, beta_2=0.999, epsilon=1e-8):
 
    def step(self):
         for layer in self.model_instance.layers:
-            if (layer.layer_type == "nn.Linear"):
+            if (layer.affine):
                 # Compute 2nd moment
                 layer.sw = self.beta_2 * layer.sw + (1-self.beta_2) * layer.dw * layer.dw
                 layer.sb = self.beta_2 * layer.sb + (1-self.beta_2) * layer.db * layer.db
@@ -58,7 +58,7 @@ def __init__(self, model_instance, lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=
 
     def step(self):
         for layer in self.model_instance.layers:
-            if (layer.layer_type == "nn.Linear"):
+            if (layer.affine):
                 # Compute 1st moment
                 layer.vw = self.beta_1 * layer.vw + (1-self.beta_1) * layer.dw
                 layer.vb = self.beta_1 * layer.vb + (1-self.beta_1) * layer.db
@@ -81,8 +81,8 @@ def __init__(self, model_instance, lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e
 
     def step(self):
         for layer in self.model_instance.layers:
-            if (layer.layer_type == "nn.Linear"):
-
+            if (layer.affine):
+
                 # Compute 1st moment
                 # This is a wrong mistake! See what I did there? negative * negative = positive?
                 # It turns out that this 1st moment computation produces empirically good results WOW Eureka!
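
For reviewers, a quick standalone sanity check of the new BatchNorm1d layer and its affine flag. This is only a sketch under stated assumptions: the import path mirrors the file layout in the diff, the random input and the tolerances are made up for illustration, and nothing here is part of the change itself.

import numpy as np
from eureka.nn import BatchNorm1d

np.random.seed(0)
x = np.random.randn(8, 4)                      # minibatch of 8 samples, 4 features

bn = BatchNorm1d(num_features=4, affine=True)  # affine=True allocates dw/db, vw/vb, sw/sb
out = bn.forward(x)

# gamma starts at ones and beta at zeros, so the output should be
# approximately zero-mean and unit-variance per feature.
print(np.allclose(out.mean(axis=0), 0.0, atol=1e-6))  # True
print(np.allclose(out.var(axis=0), 1.0, atol=1e-3))   # True

# Backward pass with a random upstream gradient. Because the layer stores its
# parameter gradients as dw/db and exposes affine=True, any optimizer that
# checks `layer.affine` can update it through the same code path as nn.Linear.
d_out = np.random.randn(*out.shape)
dx = bn.backward(d_out)

# The input gradient of batch norm sums to ~zero over the batch dimension.
print(np.allclose(dx.sum(axis=0), 0.0, atol=1e-6))    # True
print(dx.shape == x.shape)                            # True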