#!/usr/bin/env python
"""
File: my_lstm.py

Author: Thomas Wood ([email protected])

Description: a quick-and-dirty LSTM layer based on the description of LSTM
networks at http://deeplearning.net/tutorial/lstm.html

"""

import numpy as np
from numpy import tanh
from numpy.random import random
from string import printable

def sigmoid(z):
    """Elementwise logistic sigmoid."""
    return 1./(1.+np.exp(-z))

def rand_mat(nrow, ncol, sigma, mu=0.0):
    """Random (nrow, ncol) matrix with entries uniform in [mu-sigma, mu+sigma]."""
    return sigma*(2*np.random.random((nrow,ncol))-1.) + np.tile(mu,(nrow,ncol))

def gen_bag_hashtable():
    """Map each character in string.printable to its index."""
    N = len(printable)
    table = {}
    for k in range(N):
        table[printable[k]] = k
    return table

def make_wordvector(s, table):
    """One-hot encode the string s, one column per character."""
    N = len(printable)
    L = len(s)
    a = np.zeros((N, L))
    for k in range(L):
        a[table[s[k]], k] = 1
    return a

def make_string(x):
    """Invert make_wordvector by taking the argmax of each column."""
    s = []
    for k in range(x.shape[1]):
        s.append(printable[np.argmax(x[:, k])])
    return ''.join(s)

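# Illustrative sketch of how the helpers above fit together: make_wordvector
# one-hot encodes each character of a string against string.printable, and
# make_string inverts that encoding by taking the argmax of each column, so
# the round trip recovers the original text. For example:
#
#   table = gen_bag_hashtable()
#   x = make_wordvector("hello", table)   # shape (len(printable), 5)
#   make_string(x)                        # -> "hello"
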
class LSTMLayer:
    """
    There are four afferent weight matrices:

    W_i - used to update the input gate
    W_c - used to update the preliminary candidate hidden state
    W_f - used to update the forget gate
    W_o - used to update the output gate

    four recurrent weight matrices (U_i, U_c, U_f, U_o),

    four bias vectors (b_i, b_c, b_f, b_o),

    and a weight matrix for the candidate vector (V_o).

    There are also two persistent values used to step the LSTM layer forward:
    the hidden state -- h_(t-1), and
    the candidate (cell) vector -- C_(t-1).

    """
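    # From the slicing in __init__ below, a flat parameter vector for this
    # layer has length 4*n_in*n_out (W) + 4*n_out*n_out (U) + 4*n_out (biases)
    # + n_out*n_out (V_o).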
    def __init__(self, n_in, n_out, params, eps=0.001):

        self.n_input = n_in   # dimension of the input vector x_t
        self.n_output = n_out # dimension of the hidden state h_t
        ####---- LAYER PARAMETERS

        # W consists of four afferent weight matrices W_i, W_c, W_f, W_o
        ind_W = 4*n_in*n_out
        self.W = params[:ind_W].reshape((4*n_out, n_in))
        # U consists of four recurrent weight matrices U_i, U_c, U_f, U_o
        ind_U = ind_W + 4*n_out*n_out
        self.U = params[ind_W:ind_U].reshape((4*n_out, n_out))
        # bias consists of four biases b_i, b_c, b_f, b_o
        ind_bias = ind_U + 4*n_out
        self.bias = params[ind_U:ind_bias].reshape((4*n_out, ))
        # One more matrix just for the value of the candidate vector
        self.V_o = params[ind_bias:].reshape((n_out, n_out))

        ####---- LAYER STATES - (PERSISTENT)

        # h is the hidden state of the layer; it has the same dimension as the
        # output, so it is initialized with shape (n_out,).
        self.h = eps*(2*random((n_out,))-1.)

        # C is the candidate (cell) value, also of shape (n_out,).
        self.C = eps*(2*random((n_out,))-1.)
    def step(self, x):
        """
        Input Gate update rule:
        i_t = sigmoid(W_i*x_t + U_i*h_(t-1) + b_i)

        Preliminary Candidate hidden state update rule:
        Cprelim_t = tanh(W_c*x_t + U_c*h_(t-1) + b_c)

        Forget Gate update rule:
        f_t = sigmoid(W_f*x_t + U_f*h_(t-1) + b_f)

        Candidate hidden state update rule:
        C_t = i_t*Cprelim_t + f_t*C_(t-1)

        Output Gate update rule:
        o_t = sigmoid(W_o*x_t + U_o*h_(t-1) + V_o*C_t + b_o)

        Hidden state update rule:
        h_t = o_t * tanh(C_t)

        """

        # The afferent and recurrent weight matrices are stacked so that the
        # products of x and h with their respective weight matrices can each
        # be computed with a single matrix-vector multiplication.
        W_x = np.dot(self.W, x)  #.reshape((self.W.shape[0],1))
        U_h = np.dot(self.U, self.h)

        n = self.n_output # for ease of reading and writing

        # The pre-computed products could be split into named slices for
        # easier access; the named variables are kept below, commented out,
        # as documentation of what each slice means.

        # W_i_x = W_x[:n]
        # W_c_x = W_x[n:2*n]
        # W_f_x = W_x[2*n:3*n]
        # W_o_x = W_x[3*n:]
        #
        # U_i_h = U_h[:n]
        # U_c_h = U_h[n:2*n]
        # U_f_h = U_h[2*n:3*n]
        # U_o_h = U_h[3*n:]

        # i_t = sigmoid(W_i_x + U_i_h + self.bias[:n])
        # C_pre = tanh(W_c_x + U_c_h + self.bias[n:2*n])
        # f_t = sigmoid(W_f_x + U_f_h + self.bias[2*n:3*n])
        # self.C = i_t * C_pre + f_t * self.C

        self.C = sigmoid(W_x[:n] + U_h[:n] + self.bias[:n]) \
            * tanh(W_x[n:2*n] + U_h[n:2*n] + self.bias[n:2*n]) \
            + sigmoid(W_x[2*n:3*n] + U_h[2*n:3*n] + self.bias[2*n:3*n]) \
            * self.C

        # o_t = sigmoid(W_o_x + U_o_h + np.dot(self.V_o, self.C) + self.bias[3*n:])
        # self.h = o_t * tanh(self.C)
        self.h = sigmoid(W_x[3*n:] + U_h[3*n:] +
                         np.dot(self.V_o, self.C) + self.bias[3*n:]) * tanh(self.C)

        return self.h
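
    # rudimentary_test below feeds whole sequences through the layer via a
    # gen_sequence method. A minimal sketch, assuming each column of X is one
    # time step: step the layer across the columns of the (n_in, T) input and
    # stack the resulting hidden states as columns of an (n_out, T) output.
    def gen_sequence(self, X):
        Y = np.zeros((self.n_output, X.shape[1]))
        for t in range(X.shape[1]):
            Y[:, t] = self.step(X[:, t])
        return Y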

def rudimentary_test():
    """
    Very simple test of LSTMLayer functionality. I'm training a DQN for
    Space Invaders right now and I don't really want to get into any training
    until my GPU is free for all the matrix multiplication.

    Right now this is just a fun example of how to multiply random numbers
    to get more random numbers. I might add in some objective costs along with
    some optimization routines, but I would likely make a new repository for
    my optimization function.
    """

    s = """0 a is the quick fox who jumped over the lazy brown dog's new sentence."""
    table = gen_bag_hashtable()

    v = make_wordvector(s, table)

    n_in, T = v.shape
    n_out = n_in # each stacked layer maps back to the same dimensionality
    # n_hidden = 100 # Learn a more complex representation? (LSTMLayer above
    # ties its hidden size to n_out, so no separate hidden dimension is used.)
    eps = 0.1

    # Parameter count implied by the slicing in LSTMLayer.__init__:
    # W, U, the four biases, and V_o.
    n_params = 4*n_in*n_out + \
               4*n_out*n_out + \
               4*n_out + \
               n_out*n_out

    params1 = eps*(2*random(n_params,)-1.)
    params2 = eps*(2*random(n_params,)-1.)
    params3 = eps*(2*random(n_params,)-1.)

    lstm1 = LSTMLayer(n_in, n_out, params1, eps)
    lstm2 = LSTMLayer(n_in, n_out, params2, eps)
    lstm3 = LSTMLayer(n_in, n_out, params3, eps)

    y1 = lstm1.gen_sequence(v)
    y2 = lstm2.gen_sequence(y1)
    y3 = lstm3.gen_sequence(y2)

    s1 = make_string(y3)
    print(s1)

if __name__ == "__main__":
    rudimentary_test()