
Comparing changes

base repository: lisa-lab/DeepLearningTutorials
base: master
head repository: wuxiyu/DeepLearningTutorials
compare: master
Able to merge. These branches can be automatically merged.
  • 1 commit
  • 1 file changed
  • 1 contributor

Commits on Feb 6, 2018

  1. add comment

    wuxiyu authored Feb 6, 2018

    Verified

    This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
    6cb895a
Showing with 24 additions and 26 deletions.
  1. +24 −26 code/lstm.py
50 changes: 24 additions & 26 deletions code/lstm.py
@@ -98,13 +98,13 @@ def init_params(options):
params['Wemb'] = (0.01 * randn).astype(config.floatX)
params = get_layer(options['encoder'])[0](options,
params,
prefix=options['encoder'])
prefix=options['encoder']) #param_init_lstm
# classifier
params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
options['ydim']).astype(config.floatX)
params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX)

return params
return params #1. word embedding (Wemb: n_words*dim_proj) 2. output weights and bias (U: dim_proj*ydim; b: ydim; y = Ux + b) (see the shape sketch after this hunk)


def load_params(path, params):
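
A minimal numpy sketch of the shapes the new comment describes; n_words=10000, dim_proj=128 and ydim=2 are illustrative sizes, not values taken from the diff:

    import numpy

    n_words, dim_proj, ydim = 10000, 128, 2               # illustrative sizes only
    Wemb = 0.01 * numpy.random.randn(n_words, dim_proj)   # word embedding: n_words x dim_proj
    U = 0.01 * numpy.random.randn(dim_proj, ydim)         # classifier weights: dim_proj x ydim
    b = numpy.zeros((ydim,))                              # classifier bias, used as y = Ux + b
    assert Wemb.shape == (10000, 128) and U.shape == (128, 2) and b.shape == (2,)
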
@@ -138,7 +138,6 @@ def ortho_weight(ndim):
def param_init_lstm(options, params, prefix='lstm'):
"""
Init the LSTM parameter:
:see: init_params
"""
W = numpy.concatenate([ortho_weight(options['dim_proj']),
@@ -154,11 +153,13 @@ def param_init_lstm(options, params, prefix='lstm'):
b = numpy.zeros((4 * options['dim_proj'],))
params[_p(prefix, 'b')] = b.astype(config.floatX)

return params
return params #1. the input (x) weights for the three gates and the cell (input gate, forget gate, output gate, cell) concatenated into one matrix (dim_proj*(dim_proj*4)): lstm_W
#2. the recurrent (previous hidden state) weights for the three gates and the cell, concatenated the same way (dim_proj*(dim_proj*4)): lstm_U
#3. the biases for the three gates and the cell, a vector of length dim_proj*4: lstm_b (see the layout sketch after this hunk)


def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
nsteps = state_below.shape[0]
nsteps = state_below.shape[0] #state_below is the emb from above, a 3-D tensor (n_timesteps*n_samples*dim_proj)
if state_below.ndim == 3:
n_samples = state_below.shape[1]
else:
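
To make the lstm_W / lstm_U / lstm_b layout described above concrete, here is a small numpy sketch; dim_proj=4 is a toy size and ortho is a stand-in for the tutorial's ortho_weight:

    import numpy

    dim_proj = 4

    def ortho(n):                       # stand-in for ortho_weight: a random orthogonal matrix
        u, _, _ = numpy.linalg.svd(numpy.random.randn(n, n))
        return u

    # four blocks side by side: input gate, forget gate, output gate, cell candidate
    W = numpy.concatenate([ortho(dim_proj) for _ in range(4)], axis=1)   # dim_proj x (4*dim_proj)
    U = numpy.concatenate([ortho(dim_proj) for _ in range(4)], axis=1)   # dim_proj x (4*dim_proj)
    b = numpy.zeros((4 * dim_proj,))
    # _slice(preact, n, dim_proj) simply picks out block n; block 0 belongs to the input gate
    i_cols = W[:, 0 * dim_proj:1 * dim_proj]
    assert W.shape == (dim_proj, 4 * dim_proj) and i_cols.shape == (dim_proj, dim_proj)
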
@@ -168,41 +169,44 @@ def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):

def _slice(_x, n, dim):
if _x.ndim == 3:
return _x[:, :, n * dim:(n + 1) * dim]
return _x[:, :, n * dim:(n + 1) * dim] #branch for a 3-D input (inside _step, preact is 2-D, so the line below is the one actually used)
return _x[:, n * dim:(n + 1) * dim]

def _step(m_, x_, h_, c_):
preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
preact += x_
preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) #U (the recurrent weights for the three gates and the cell); h_ is the previous step's output from scan, shape n_samples*dim_proj; U is dim_proj*(dim_proj*4), so the dot gives n_samples*(dim_proj*4)
preact += x_ #x_ has shape n_samples*(dim_proj*4); note that x_ here is the transformed state_below, i.e. Wx+b, so this addition is element-wise

i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
c = tensor.tanh(_slice(preact, 3, options['dim_proj'])) #i, f, o, c each have shape n_samples*dim_proj

c = f * c_ + i * c
c = m_[:, None] * c + (1. - m_)[:, None] * c_
c = f * c_ + i * c #c_ has shape n_samples*dim_proj. Note this is *, element-wise multiplication, not dot; the original LSTM equations also use element-wise products here.
c = m_[:, None] * c + (1. - m_)[:, None] * c_ #[:, None] inserts a new axis, equivalent to numpy.newaxis.

h = o * tensor.tanh(c)
h = m_[:, None] * h + (1. - m_)[:, None] * h_

return h, c

state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
tparams[_p(prefix, 'b')])
tparams[_p(prefix, 'b')]) #Wx+b (the input transform for the three gates and the cell)
#state_below is n_timesteps*n_samples*dim_proj and W is dim_proj*(dim_proj*4), so the product is n_timesteps*n_samples*(dim_proj*4)
#b has length dim_proj*4; broadcasting adds it to every timestep and sample
#after this line, state_below is no longer n_timesteps*n_samples*dim_proj but n_timesteps*n_samples*(dim_proj*4) (see the numpy sketch after this hunk)

dim_proj = options['dim_proj']
rval, updates = theano.scan(_step,
sequences=[mask, state_below],
outputs_info=[tensor.alloc(numpy_floatX(0.),
n_samples,
dim_proj),
dim_proj), #this is h
tensor.alloc(numpy_floatX(0.),
n_samples,
dim_proj)],
dim_proj)], #this is c
name=_p(prefix, '_layers'),
n_steps=nsteps)
return rval[0]
return rval[0] #rval[0] has shape n_timesteps * n_samples * dim_proj


# ff: Feed Forward (normal neural net), only useful to put after lstm
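
As a cross-check on the shapes and the mask trick described in the comments above, here is a hedged numpy version of a single _step call; n_samples=3 and dim_proj=4 are toy sizes and sigmoid is a local helper, not part of the tutorial:

    import numpy

    n_samples, dim_proj = 3, 4
    sigmoid = lambda z: 1.0 / (1.0 + numpy.exp(-z))
    x_ = numpy.random.randn(n_samples, 4 * dim_proj)      # already W.x + b for this timestep
    h_ = numpy.zeros((n_samples, dim_proj))               # previous hidden state
    c_ = numpy.zeros((n_samples, dim_proj))               # previous cell state
    U = numpy.random.randn(dim_proj, 4 * dim_proj)
    m_ = numpy.array([1., 1., 0.])                        # the third sample is padding at this step

    preact = h_.dot(U) + x_                               # n_samples x (4*dim_proj)
    sl = lambda n: preact[:, n * dim_proj:(n + 1) * dim_proj]
    i, f, o = sigmoid(sl(0)), sigmoid(sl(1)), sigmoid(sl(2))
    c = numpy.tanh(sl(3))
    c = f * c_ + i * c                                    # element-wise products, as in the comment
    c = m_[:, None] * c + (1. - m_)[:, None] * c_         # padded samples keep their old cell state
    h = o * numpy.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_         # padded samples keep their old hidden state
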
@@ -212,10 +216,8 @@ def _step(m_, x_, h_, c_):

def sgd(lr, tparams, grads, x, mask, y, cost):
""" Stochastic Gradient Descent
:note: A more complicated version of sgd than needed. This is
done like that for adadelta and rmsprop.
"""
# New set of shared variable that will contain the gradient
# for a mini-batch.
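
Stripped of the shared-variable bookkeeping the docstring mentions, the update this function applies is plain SGD; a minimal numpy sketch with purely illustrative names and sizes:

    import numpy

    lr = 0.01                                             # illustrative learning rate
    params = {'U': numpy.random.randn(4, 2)}              # illustrative parameter dict
    grads = {'U': numpy.random.randn(4, 2)}               # matching gradients for one mini-batch
    for name in params:
        params[name] = params[name] - lr * grads[name]    # plain SGD step
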
@@ -241,7 +243,6 @@ def sgd(lr, tparams, grads, x, mask, y, cost):
def adadelta(lr, tparams, grads, x, mask, y, cost):
"""
An adaptive learning rate optimizer
Parameters
----------
lr : Theano SharedVariable
@@ -258,11 +259,9 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
Targets
cost: Theano variable
Objective function to minimize
Notes
-----
For more information, see [ADADELTA]_.
.. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
Rate Method*, arXiv:1212.5701.
"""
@@ -303,7 +302,6 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
"""
A variant of SGD that scales the step size by running average of the
recent step norms.
Parameters
----------
lr : Theano SharedVariable
@@ -320,11 +318,9 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
Targets
cost: Theano variable
Objective function to minimize
Notes
-----
For more information, see [Hint2014]_.
.. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
lecture 6a,
http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
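
Similarly, a hedged numpy sketch of the basic rmsprop rule from Hinton's lecture 6a; the constants are assumed, and the tutorial's implementation keeps additional running averages on top of this:

    import numpy

    lr, decay, eps = 0.001, 0.9, 1e-4                     # assumed constants
    p = numpy.random.randn(5)                             # one parameter vector (illustrative)
    g = numpy.random.randn(5)                             # its gradient
    Eg2 = numpy.zeros_like(p)                             # running average of squared gradients

    Eg2 = decay * Eg2 + (1. - decay) * g ** 2
    p = p - lr * g / numpy.sqrt(Eg2 + eps)                # step scaled by recent gradient magnitude
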
@@ -379,10 +375,10 @@ def build_model(tparams, options):

emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps,
n_samples,
options['dim_proj']])
options['dim_proj']]) #x is a matrix with one column per sentence (n_timesteps rows, n_samples columns; n_samples is the batch size); emb is a 3-D tensor
proj = get_layer(options['encoder'])[1](tparams, emb, options,
prefix=options['encoder'],
mask=mask)
mask=mask) #lstm_layer returns the hidden states h (a 3-D tensor because scan stacks every timestep; see the numpy sketch after this hunk)
if options['encoder'] == 'lstm':
proj = (proj * mask[:, :, None]).sum(axis=0)
proj = proj / mask.sum(axis=0)[:, None]
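
To make the shapes in these comments concrete, a small numpy sketch of the embedding lookup/reshape and the masked mean pooling; the sizes are toy values and numpy.tanh stands in for the real lstm_layer output:

    import numpy

    n_timesteps, n_samples, dim_proj, n_words = 5, 3, 4, 100           # toy sizes
    x = numpy.random.randint(0, n_words, (n_timesteps, n_samples))     # one column per sentence
    mask = numpy.ones((n_timesteps, n_samples))
    mask[3:, 0] = 0.                                                   # the first sentence is shorter
    Wemb = 0.01 * numpy.random.randn(n_words, dim_proj)

    emb = Wemb[x.flatten()].reshape([n_timesteps, n_samples, dim_proj])  # 3-D tensor of embeddings
    proj = numpy.tanh(emb)                                             # stand-in for the lstm_layer output h
    proj = (proj * mask[:, :, None]).sum(axis=0)                       # sum over the valid timesteps only
    proj = proj / mask.sum(axis=0)[:, None]                            # mean pooling -> n_samples x dim_proj
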
@@ -481,6 +477,7 @@ def train_lstm(
train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
maxlen=maxlen)
if test_size > 0:
# Not entirely sure about the author's logic here: imdb.py sorts the test set, and then it gets shuffled again below.
# The test set is sorted by size, but we want to keep random
# size example. So we must select a random selection of the
# examples.
@@ -517,7 +514,7 @@ def train_lstm(
weight_decay *= decay_c
cost += weight_decay

f_cost = theano.function([x, mask, y], cost, name='f_cost')
f_cost = theano.function([x, mask, y], cost, name='f_cost') #cost is defined redundantly here.

grads = tensor.grad(cost, wrt=list(tparams.values()))
f_grad = theano.function([x, mask, y], grads, name='f_grad')
@@ -654,4 +651,5 @@ def train_lstm(
train_lstm(
max_epochs=100,
test_size=500,
reload_model=True,
)
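
The one functional change in this last hunk is reload_model=True. In the tutorial this flag is expected to make train_lstm restore previously saved parameters before training starts, roughly as in this sketch (which assumes the default saveto path 'lstm_model.npz'):

    if reload_model:
        load_params('lstm_model.npz', params)             # restore saved weights into the fresh params dict
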