diff --git a/doc/Intro_RNN.pdf b/doc/Intro_RNN.pdf
new file mode 100644
index 0000000..84109df
Binary files /dev/null and b/doc/Intro_RNN.pdf differ
diff --git a/groundhog/layers/cost_layers.py b/groundhog/layers/cost_layers.py
index 0b37760..a0f70cf 100644
--- a/groundhog/layers/cost_layers.py
+++ b/groundhog/layers/cost_layers.py
@@ -409,6 +409,240 @@ def _get_samples(self, model, length=30, temp=1, *inps):
         raise NotImplemented
 
+class LinearLayer(CostLayer):
+    """
+    Linear output layer.
+    """
+
+    def _init_params(self):
+        """
+        Initialize the parameters of the layer, either by using sparse
+        initialization or small isotropic noise.
+        """
+        if self.rank_n_approx:
+            W_em1 = self.init_fn(self.nin,
+                                 self.rank_n_approx,
+                                 self.sparsity,
+                                 self.scale,
+                                 self.rng)
+            W_em2 = self.init_fn(self.rank_n_approx,
+                                 self.nout,
+                                 self.sparsity,
+                                 self.scale,
+                                 self.rng)
+            self.W_em1 = theano.shared(W_em1,
+                                       name='W1_%s'%self.name)
+            self.W_em2 = theano.shared(W_em2,
+                                       name='W2_%s'%self.name)
+            self.b_em = theano.shared(
+                numpy.zeros((self.nout,), dtype=theano.config.floatX),
+                name='b_%s'%self.name)
+            self.params += [self.W_em1, self.W_em2, self.b_em]
+            self.myparams = []  # [self.W_em1, self.W_em2, self.b_em]
+            if self.weight_noise:
+                self.nW_em1 = theano.shared(W_em1*0.,
+                                            name='noise_W1_%s'%self.name)
+                self.nW_em2 = theano.shared(W_em2*0.,
+                                            name='noise_W2_%s'%self.name)
+                self.nb_em = theano.shared(
+                    numpy.zeros((self.nout,), dtype=theano.config.floatX),
+                    name='noise_b_%s'%self.name)
+                self.noise_params = [self.nW_em1, self.nW_em2, self.nb_em]
+                self.noise_params_shape_fn = [
+                    constant_shape(x.get_value().shape)
+                    for x in self.noise_params]
+
+        else:
+            W_em = self.init_fn(self.nin,
+                                self.nout,
+                                self.sparsity,
+                                self.scale,
+                                self.rng)
+            self.W_em = theano.shared(W_em,
+                                      name='W_%s'%self.name)
+            self.b_em = theano.shared(
+                numpy.zeros((self.nout,), dtype=theano.config.floatX),
+                name='b_%s'%self.name)
+            self.add_wghs = []
+            self.n_add_wghs = []
+            if self.additional_inputs:
+                for pos, sz in enumerate(self.additional_inputs):
+                    W_add = self.init_fn(sz,
+                                         self.nout,
+                                         self.sparsity,
+                                         self.scale,
+                                         self.rng)
+                    self.add_wghs += [theano.shared(W_add,
+                                                    name='W_add%d_%s'%(pos, self.name))]
+                    if self.weight_noise:
+                        self.n_add_wghs += [theano.shared(W_add*0.,
+                                                          name='noise_W_add%d_%s'%(pos,
+                                                                                   self.name))]
+
+            self.params += [self.W_em, self.b_em] + self.add_wghs
+            self.myparams = []  # [self.W_em, self.b_em] + self.add_wghs
+            if self.weight_noise:
+                self.nW_em = theano.shared(W_em*0.,
+                                           name='noise_W_%s'%self.name)
+                self.nb_em = theano.shared(numpy.zeros((self.nout,),
+                                                       dtype=theano.config.floatX),
+                                           name='noise_b_%s'%self.name)
+                self.noise_params = [self.nW_em, self.nb_em] + self.n_add_wghs
+                self.noise_params_shape_fn = [
+                    constant_shape(x.get_value().shape)
+                    for x in self.noise_params]
+
+    def _check_dtype(self, matrix, inp):
+        # Applies `matrix` to `inp`: integer inputs index rows of `matrix`,
+        # 3-D float inputs are flattened to 2-D before the dot product.
+        if 'int' in inp.dtype and inp.ndim == 2:
+            return matrix[inp.flatten()]
+        elif 'int' in inp.dtype:
+            return matrix[inp]
+        elif 'float' in inp.dtype and inp.ndim == 3:
+            shape0 = inp.shape[0]
+            shape1 = inp.shape[1]
+            shape2 = inp.shape[2]
+            return TT.dot(inp.reshape((shape0*shape1, shape2)), matrix)
+        else:
+            return TT.dot(inp, matrix)
+
+    def fprop(self, state_below, temp=numpy.float32(1), use_noise=True,
+              additional_inputs=None):
+        """
+        Constructs the computational graph of this layer.
+        """
+
+        if self.rank_n_approx:
+            if use_noise and self.noise_params:
+                emb_val = self._check_dtype(self.W_em1 + self.nW_em1,
+                                            state_below)
+                emb_val = TT.dot(emb_val, self.W_em2 + self.nW_em2)
+            else:
+                emb_val = self._check_dtype(self.W_em1, state_below)
+                emb_val = TT.dot(emb_val, self.W_em2)
+        else:
+            if use_noise and self.noise_params:
+                emb_val = self._check_dtype(self.W_em + self.nW_em, state_below)
+            else:
+                emb_val = self._check_dtype(self.W_em, state_below)
+
+        if additional_inputs:
+            for st, wgs in zip(additional_inputs, self.add_wghs):
+                emb_val += self._check_dtype(wgs, st)
+
+        if use_noise and self.noise_params:
+            emb_val = emb_val + self.b_em + self.nb_em
+        else:
+            emb_val = emb_val + self.b_em
+        self.out = emb_val
+        self.state_below = state_below
+        self.model_output = emb_val
+        return emb_val
+
+    def get_cost(self, state_below, target=None, mask=None, temp=1,
+                 reg=None, scale=None, sum_over_time=True, use_noise=True,
+                 additional_inputs=None):
+        """
+        This function computes the cost of this layer.
+
+        :param state_below: theano variable representing the input to the
+            linear layer
+        :param target: theano variable representing the target for this
+            layer
+        :return: the (optionally masked) squared-error cost
+        """
+        preds = self.fprop(state_below, temp=temp,
+                           use_noise=use_noise,
+                           additional_inputs=additional_inputs)
+        assert target, 'Computing the cost requires a target'
+        if target.ndim == 3:
+            target = target.reshape((target.shape[0]*target.shape[1],
+                                     target.shape[2]))
+        assert 'float' in target.dtype
+        cost = (preds - target)**2
+        if mask:
+            # broadcast the flattened mask over the output dimension
+            mask = mask.flatten()
+            cost = cost * TT.cast(mask.dimshuffle(0, 'x'),
+                                  theano.config.floatX)
+        if sum_over_time is None:
+            sum_over_time = self.sum_over_time
+        if sum_over_time:
+            if state_below.ndim == 3:
+                sh1 = TT.cast(state_below.shape[1],
+                              theano.config.floatX)
+                self.cost = cost.sum()/sh1
+            else:
+                self.cost = cost.sum()
+        else:
+            self.cost = cost.mean()
+        if scale:
+            self.cost = self.cost*scale
+        if reg:
+            self.cost = self.cost + reg
+        self.out = self.cost
+        self.mask = mask
+        self.cost_scale = scale
+        return self.cost
+
+    def get_grads(self, state_below, target, mask=None, reg=None,
+                  scale=None, sum_over_time=True, use_noise=True,
+                  additional_inputs=None):
+        """
+        This function implements both the forward and backward pass of this
+        layer. The reason we do this in a single function is that for the
+        factorized softmax layer it is hard to rely on grad and get an
+        optimized graph.
For uniformity I've implemented this method for
+        this layer as well (though one doesn't need to use it).
+
+        :param state_below: theano variable representing the input to the
+            layer
+        :param target: theano variable representing the target for this
+            layer
+        :return: cost, param_grads
+            cost is the scalar cost computed by `get_cost`
+            param_grads is a list containing the gradients of the cost
+            w.r.t. the different parameters of the layer
+        """
+        cost = self.get_cost(state_below,
+                             target,
+                             mask=mask,
+                             reg=reg,
+                             scale=scale,
+                             sum_over_time=sum_over_time,
+                             use_noise=use_noise,
+                             additional_inputs=additional_inputs)
+        grads = TT.grad(cost, self.params)
+        if self.additional_gradients:
+            for new_grads, to_replace, properties in self.additional_gradients:
+                gparams, params = new_grads
+                prop_expr = [x[1] for x in properties]
+                replace = [(x[0], TT.grad(cost, x[1])) for x in to_replace]
+                rval = theano.clone(gparams + prop_expr,
+                                    replace=replace)
+                gparams = rval[:len(gparams)]
+                prop_expr = rval[len(gparams):]
+                self.properties += [(x[0], y) for x, y in zip(properties,
+                                                              prop_expr)]
+                for gp, p in zip(gparams, params):
+                    grads[self.params.index(p)] += gp
+
+        self.cost = cost
+        self.grads = grads
+
+        def Gvs_fn(*args):
+            # Gauss-Newton (metric) vector product: the R-op pushes the
+            # parameter direction forward through the model output, the
+            # result is rescaled element-wise by w, and the L-op pulls it
+            # back into parameter space.
+            w = (1 - self.model_output) * self.model_output * state_below.shape[1]
+            Gvs = TT.Lop(self.model_output, self.params,
+                         TT.Rop(self.model_output, self.params, args)/w)
+            return Gvs
+        self.Gvs = Gvs_fn
+        return cost, grads
+
+
 class SigmoidLayer(CostLayer):
     """
     Sigmoid output layer.
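
A note on the rank_n_approx branch of _init_params above: it factorizes the
nin x nout output weight matrix into W_em1 (nin x rank) and W_em2 (rank x nout),
trading some expressiveness for a much smaller parameter count. A minimal NumPy
sketch of the idea, with made-up sizes rather than anything taken from the
repository:

    import numpy as np

    rng = np.random.RandomState(0)
    nin, nout, rank = 1000, 500, 50

    W_full = rng.randn(nin, nout) * 0.01   # dense parameterisation
    W1 = rng.randn(nin, rank) * 0.01       # plays the role of W_em1
    W2 = rng.randn(rank, nout) * 0.01      # plays the role of W_em2

    x = rng.randn(32, nin)                 # a batch of 32 inputs
    y_full = x.dot(W_full)                 # nin*nout           = 500,000 weights
    y_low = x.dot(W1).dot(W2)              # nin*rank+rank*nout =  75,000 weights
    print(y_full.shape, y_low.shape)       # both (32, 500)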
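
fprop computes a plain affine map of its input, and get_cost turns the
difference to the target into an (optionally masked) squared-error cost. The
sketch below redoes the same computation on concrete NumPy arrays for the 2-D
input case; all names and sizes are illustrative, not the GroundHog API:

    import numpy as np

    rng = np.random.RandomState(0)
    batch, nin, nout = 4, 8, 3

    W = rng.randn(nin, nout) * 0.01
    b = np.zeros(nout)
    x = rng.randn(batch, nin)
    target = rng.randn(batch, nout)
    mask = np.array([1., 1., 1., 0.])      # last example is padding

    pred = x.dot(W) + b                    # fprop: linear output, no nonlinearity
    cost = (pred - target) ** 2            # element-wise squared error
    cost = cost * mask[:, None]            # masked-out rows contribute nothing
    mse = cost.mean()                      # the sum_over_time=False branch
    print(mse)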
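
Gvs_fn builds a Gauss-Newton vector product symbolically with TT.Rop and
TT.Lop. The toy NumPy example below shows the underlying J^T (J v) structure
with an explicit Jacobian; the symbolic version never materializes J, and the
per-unit weighting w from the code above is omitted here:

    import numpy as np

    rng = np.random.RandomState(0)
    nin, nout, batch = 5, 2, 3

    W = rng.randn(nin, nout)
    x = rng.randn(batch, nin)
    v = rng.randn(nin * nout)              # direction in parameter space

    # Jacobian of the flattened outputs w.r.t. the flattened weights:
    # d(x W)[b, j] / d W[i, j'] = x[b, i] * (j == j')
    J = np.kron(x, np.eye(nout))           # shape (batch*nout, nin*nout)

    Jv = J.dot(v)                          # what TT.Rop computes
    Gv = J.T.dot(Jv)                       # what TT.Lop of that computes
    print(Gv.shape)                        # (nin*nout,)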