
Commit

Merge branch 'attardi-master'
mdelhoneux committed Jul 29, 2019
2 parents c4bb019 + b49e936 commit b4a8391
Showing 7 changed files with 163 additions and 129 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -17,7 +17,7 @@ The techniques behind the original parser are described in the paper [Simple and

#### Required software

-* Python 2.7 interpreter
+* Python 3 (/!\ recent move from python 2.7 which was used for all releases).
* [DyNet library](https://github.com/clab/dynet/tree/master/python)

Note: the current version is Dynet 2.0 but Dynet 1.0 was used in both releases 1.0 and 2.0
@@ -84,6 +84,7 @@ You can also specify the gamma scalar using `--elmo_gamma` or set `--elmo_learn_
to learn the value during training.

Credits to Johannes Gontrum for this addition.
+Credits to Giuseppe Attardi for porting the parser to python 3.

#### Citation

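The requirements change above drops Python 2.7. As a standalone illustration (not part of this commit), a minimal guard an entry script could use to fail fast under the old interpreter; its placement is hypothetical:

    import sys

    # The parser now targets Python 3 only: exit with a clear message
    # rather than failing later on Python-3-only syntax.
    if sys.version_info[0] < 3:
        sys.exit("This parser requires Python 3; Python 2.7 is no longer supported.")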
42 changes: 21 additions & 21 deletions barchybrid/src/arc_hybrid.py
@@ -5,7 +5,7 @@
import numpy as np
from copy import deepcopy
from collections import defaultdict
-import codecs, json
+import json

class ArcHybridLSTM:
def __init__(self, vocab, options):
@@ -65,8 +65,8 @@ def __evaluate(self, stack, buf, train):

#feature rep
empty = self.feature_extractor.empty
-topStack = [ stack.roots[-i-1].lstms if len(stack) > i else [empty] for i in xrange(self.k) ]
-topBuffer = [ buf.roots[i].lstms if len(buf) > i else [empty] for i in xrange(1) ]
+topStack = [ stack.roots[-i-1].lstms if len(stack) > i else [empty] for i in range(self.k) ]
+topBuffer = [ buf.roots[i].lstms if len(buf) > i else [empty] for i in range(1) ]

input = dy.concatenate(list(chain(*(topStack + topBuffer))))
output = self.unlabeled_MLP(input)
@@ -116,11 +116,11 @@ def __evaluate(self, stack, buf, train):


def Save(self, filename):
-print 'Saving model to ' + filename
+print('Saving model to ' + filename)
self.model.save(filename)

def Load(self, filename):
-print 'Loading model from ' + filename
+print('Loading model from ' + filename)
self.model.populate(filename)


@@ -208,7 +208,7 @@ def Predict(self, treebanks, datasplit, options):
reached_max_swap = 0
char_map = {}
if options.char_map_file:
-char_map_fh = codecs.open(options.char_map_file,encoding='utf-8')
+char_map_fh = open(options.char_map_file,encoding='utf-8')
char_map = json.loads(char_map_fh.read())
# should probably use a namedtuple in get_vocab to make this prettier
_, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map)
@@ -218,10 +218,10 @@ def Predict(self, treebanks, datasplit, options):
test_embeddings = defaultdict(lambda: {})
if options.word_emb_size > 0 and options.ext_word_emb_file:
new_test_words = \
-set(test_words) - self.feature_extractor.words.viewkeys()
+set(test_words) - self.feature_extractor.words.keys()

print "Number of OOV word types at test time: %i (out of %i)" % (
len(new_test_words), len(test_words))
print("Number of OOV word types at test time: %i (out of %i)" %
(len(new_test_words), len(test_words)))

if len(new_test_words) > 0:
# no point loading embeddings if there are no words to look for
@@ -234,15 +234,15 @@ def Predict(self, treebanks, datasplit, options):
)
test_embeddings["words"].update(embeddings)
if len(test_langs) > 1 and test_embeddings["words"]:
print "External embeddings found for %i words "\
print("External embeddings found for %i words "\
"(out of %i)" % \
(len(test_embeddings["words"]), len(new_test_words))
(len(test_embeddings["words"]), len(new_test_words)))

if options.char_emb_size > 0:
new_test_chars = \
-set(test_chars) - self.feature_extractor.chars.viewkeys()
-print "Number of OOV char types at test time: %i (out of %i)" % (
-len(new_test_chars), len(test_chars))
+set(test_chars) - self.feature_extractor.chars.keys()
+print("Number of OOV char types at test time: %i (out of %i)" %
+(len(new_test_chars), len(test_chars)))

if len(new_test_chars) > 0:
for lang in test_langs:
@@ -255,9 +255,9 @@ def Predict(self, treebanks, datasplit, options):
)
test_embeddings["chars"].update(embeddings)
if len(test_langs) > 1 and test_embeddings["chars"]:
print "External embeddings found for %i chars "\
print("External embeddings found for %i chars "\
"(out of %i)" % \
(len(test_embeddings["chars"]), len(new_test_chars))
(len(test_embeddings["chars"]), len(new_test_chars)))

data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
for iSentence, osentence in enumerate(data,1):
@@ -286,7 +286,7 @@ def Predict(self, treebanks, datasplit, options):
if iSwap == max_swap and not reached_swap_for_i_sentence:
reached_max_swap += 1
reached_swap_for_i_sentence = True
print "reached max swap in %d out of %d sentences"%(reached_max_swap, iSentence)
print("reached max swap in %d out of %d sentences"%(reached_max_swap, iSentence))
self.apply_transition(best,stack,buf,hoffset)
if best[1] == SWAP:
iSwap += 1
@@ -315,7 +315,7 @@ def Train(self, trainData, options):
start = time.time()

random.shuffle(trainData) # in certain cases the data will already have been shuffled after being read from file or while creating dev data
print "Length of training data: ", len(trainData)
print("Length of training data: ", len(trainData))

errs = []

@@ -328,7 +328,7 @@ def Train(self, trainData, options):
' Errors: %.3f'%((float(eerrors)) / etotal)+\
' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\
' Time: %.2gs'%(time.time()-start)
-print loss_message
+print(loss_message)
start = time.time()
eerrors = 0
eloss = 0.0
@@ -432,5 +432,5 @@ def Train(self, trainData, options):
dy.renew_cg()

self.trainer.update()
print "Loss: ", mloss/iSentence
print "Total Training Time: %.2gs"%(time.time()-beg)
print("Loss: ", mloss/iSentence)
print("Total Training Time: %.2gs" % (time.time()-beg))
27 changes: 14 additions & 13 deletions barchybrid/src/feature_extractor.py
@@ -4,7 +4,7 @@
import numpy as np
import random
from collections import defaultdict
-import codecs, re, os
+import re, os

class FeatureExtractor(object):
def __init__(self, model, options, vocab, nnvecs=1):
@@ -57,7 +57,7 @@ def __init__(self, model, options, vocab, nnvecs=1):
options,
emb_file=options.ext_word_emb_file,
lang=lang,
-words=self.words.viewkeys()
+words=self.words.keys()
)
self.external_embedding["words"].update(embeddings)

@@ -82,7 +82,7 @@ def __init__(self, model, options, vocab, nnvecs=1):
options,
emb_dir=options.ext_emb_dir,
lang=lang,
-words=self.words.viewkeys()
+words=self.words.keys()
)
self.external_embedding["words"].update(embeddings)

@@ -105,7 +105,7 @@ def __init__(self, model, options, vocab, nnvecs=1):
2 * (options.char_lstm_output_size
if options.char_emb_size > 0 else 0)
)
print "Word-level LSTM input size: " + str(self.lstm_input_size)
print("Word-level LSTM input size: " + str(self.lstm_input_size))

self.bilstms = []
if options.no_bilstms > 0:
@@ -136,14 +136,15 @@ def Init(self,options):
paddingTbankVec = self.treebank_lookup[0] if options.tbank_emb_size > 0 else None

self.paddingVec = dy.tanh(self.word2lstm.expr() *\
-dy.concatenate(filter(None,[paddingWordVec,
+dy.concatenate(list(filter(None,[paddingWordVec,
paddingElmoVec,
paddingPosVec,
paddingCharVec,
-paddingTbankVec])) + self.word2lstmbias.expr())
+paddingTbankVec]))) + self.word2lstmbias.expr())

self.empty = self.paddingVec if self.nnvecs == 1 else\
-dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])
+dy.concatenate([self.paddingVec for _ in range(self.nnvecs)])


def getWordEmbeddings(self, sentence, train, options, test_embeddings=defaultdict(lambda:{})):

@@ -197,11 +198,11 @@ def getWordEmbeddings(self, sentence, train, options, test_embeddings=defaultdic
# TODO
root.vecs["elmo"] = dy.zeros(self.elmo.emb_dim)

-root.vec = dy.concatenate(filter(None, [root.vecs["word"],
+root.vec = dy.concatenate(list(filter(None, [root.vecs["word"],
root.vecs["elmo"],
root.vecs["pos"],
root.vecs["char"],
root.vecs["treebank"]]))
root.vecs["treebank"]])))

for bilstm in self.bilstms:
bilstm.set_token_vecs(sentence,train)
@@ -224,19 +225,19 @@ def get_char_vector(self,root,train,test_embeddings_chars={}):
def init_lookups(self,options):

if self.external_embedding["words"]:
-print 'Initialising %i word vectors with external embeddings'%len(self.external_embedding["words"])
+print('Initialising %i word vectors with external embeddings'%len(self.external_embedding["words"]))
for word in self.external_embedding["words"]:
if len(self.external_embedding["words"][word]) != options.word_emb_size:
raise Exception("Size of external embedding does not match specified word embedding size of %s"%(options.word_emb_size))
self.word_lookup.init_row(self.words[word],self.external_embedding["words"][word])
elif options.word_emb_size > 0:
-print 'No word external embeddings found: all vectors initialised randomly'
+print('No word external embeddings found: all vectors initialised randomly')

if self.external_embedding["chars"]:
-print 'Initialising %i char vectors with external embeddings'%len(self.external_embedding["chars"])
+print('Initialising %i char vectors with external embeddings'%len(self.external_embedding["chars"]))
for char in self.external_embedding["chars"]:
if len(self.external_embedding["chars"][char]) != options.char_emb_size:
raise Exception("Size of external embedding does not match specified char embedding size of %s"%(options.char_emb_size))
self.char_lookup.init_row(self.chars[char],self.external_embedding["chars"][char])
elif options.char_emb_size > 0:
-print 'No character external embeddings found: all vectors initialised randomly'
+print('No character external embeddings found: all vectors initialised randomly')
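The list(filter(...)) wrappers introduced above are needed because Python 3's filter() returns a lazy iterator rather than a list, while dy.concatenate is called here with a concrete list of expressions. A standalone sketch of the difference, with plain strings standing in for DyNet expressions:

    vecs = ["word_vec", None, "pos_vec", None, "char_vec"]

    lazy = filter(None, vecs)             # a one-shot filter object in Python 3
    concrete = list(filter(None, vecs))   # ['word_vec', 'pos_vec', 'char_vec']

    # Only the materialised list supports len() and repeated iteration:
    assert len(concrete) == 3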
35 changes: 19 additions & 16 deletions barchybrid/src/mstlstm.py
@@ -37,7 +37,7 @@ def __getExpr(self, sentence, i, j, train):


def __evaluate(self, sentence, train):
-exprs = [ [self.__getExpr(sentence, i, j, train) for j in xrange(len(sentence))] for i in xrange(len(sentence)) ]
+exprs = [ [self.__getExpr(sentence, i, j, train) for j in range(len(sentence))] for i in range(len(sentence)) ]
scores = np.array([ [output.scalar_value() for output in exprsRow] for exprsRow in exprs ])
return scores, exprs

@@ -58,7 +58,7 @@ def Load(self, filename):
def Predict(self, treebanks, datasplit, options):
char_map = {}
if options.char_map_file:
-char_map_fh = codecs.open(options.char_map_file,encoding='utf-8')
+char_map_fh = open(options.char_map_file,encoding='utf-8')
char_map = json.loads(char_map_fh.read())
# should probably use a namedtuple in get_vocab to make this prettier
_, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map)
@@ -68,10 +68,10 @@ def Predict(self, treebanks, datasplit, options):
test_embeddings = defaultdict(lambda: {})
if options.word_emb_size > 0 and options.ext_word_emb_file:
new_test_words = \
-set(test_words) - self.feature_extractor.words.viewkeys()
+set(test_words) - self.feature_extractor.words.keys()

print "Number of OOV word types at test time: %i (out of %i)" % (
len(new_test_words), len(test_words))
print("Number of OOV word types at test time: %i (out of %i)" % (
len(new_test_words), len(test_words)))

if len(new_test_words) > 0:
# no point loading embeddings if there are no words to look for
@@ -84,15 +84,16 @@ def Predict(self, treebanks, datasplit, options):
)
test_embeddings["words"].update(embeddings)
if len(test_langs) > 1 and test_embeddings["words"]:
print "External embeddings found for %i words "\
print("External embeddings found for %i words "\
"(out of %i)" % \
(len(test_embeddings["words"]), len(new_test_words))
(len(test_embeddings["words"]),
len(new_test_words)))

if options.char_emb_size > 0:
new_test_chars = \
-set(test_chars) - self.feature_extractor.chars.viewkeys()
-print "Number of OOV char types at test time: %i (out of %i)" % (
-len(new_test_chars), len(test_chars))
+set(test_chars) - self.feature_extractor.chars.keys()
+print("Number of OOV char types at test time: %i (out of %i)" % (
+len(new_test_chars), len(test_chars)))

if len(new_test_chars) > 0:
for lang in test_langs:
@@ -105,9 +106,10 @@ def Predict(self, treebanks, datasplit, options):
)
test_embeddings["chars"].update(embeddings)
if len(test_langs) > 1 and test_embeddings["chars"]:
print "External embeddings found for %i chars "\
print("External embeddings found for %i chars "\
"(out of %i)" % \
(len(test_embeddings["chars"]), len(new_test_chars))
(len(test_embeddings["chars"]),
len(new_test_chars)))

data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
for iSentence, osentence in enumerate(data,1):
@@ -124,7 +126,8 @@ def Predict(self, treebanks, datasplit, options):
## ADD for handling multi-roots problem
rootHead = [head for head in heads if head==0]
if len(rootHead) != 1:
print "it has multi-root, changing it for heading first root for other roots"
print("it has multi-root, changing it for heading first root\
for other roots")
rootHead = [seq for seq, head in enumerate(heads) if head == 0]
for seq in rootHead[1:]:heads[seq] = rootHead[0]
## finish to multi-roots
@@ -174,7 +177,7 @@ def Train(self, trainData, options):
' Errors: %.3f'%((float(eerrors)) / etotal)+\
' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\
' Time: %.2gs'%(time.time()-start)
-print loss_message
+print(loss_message)
start = time.time()
eerrors = 0
eloss = 0.0
Expand Down Expand Up @@ -244,5 +247,5 @@ def Train(self, trainData, options):
dy.renew_cg()

self.trainer.update()
print "Loss: ", mloss/iSentence
print "Total Training Time: %.2gs"%(time.time()-beg)
print("Loss: ", mloss/iSentence)
print("Total Training Time: %.2gs"%(time.time()-beg))