
Commit

Merge branch 'attardi-master'
mdelhoneux committed Jul 29, 2019
2 parents c4bb019 + b49e936 commit b4a8391
Showing 7 changed files with 163 additions and 129 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -17,7 +17,7 @@ The techniques behind the original parser are described in the paper [Simple and

#### Required software

-* Python 2.7 interpreter
+* Python 3 (/!\ recent move from python 2.7 which was used for all releases).
* [DyNet library](https://github.com/clab/dynet/tree/master/python)

Note: the current version is Dynet 2.0 but Dynet 1.0 was used in both releases 1.0 and 2.0
@@ -84,6 +84,7 @@ You can also specify the gamma scalar using `--elmo_gamma` or set `--elmo_learn_
to learn the value during training.

Credits to Johannes Gontrum for this addition.
+Credits to Giuseppe Attardi for porting the parser to python 3.

#### Citation

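The requirements change above drops Python 2.7. As a standalone illustration (not part of this commit), a minimal guard an entry script could use to fail fast under the old interpreter; its placement is hypothetical:

    import sys

    # The parser now targets Python 3 only: exit with a clear message
    # rather than failing later on Python-3-only syntax.
    if sys.version_info[0] < 3:
        sys.exit("This parser requires Python 3; Python 2.7 is no longer supported.")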
42 changes: 21 additions & 21 deletions barchybrid/src/arc_hybrid.py
@@ -5,7 +5,7 @@
import numpy as np
from copy import deepcopy
from collections import defaultdict
-import codecs, json
+import json

class ArcHybridLSTM:
def __init__(self, vocab, options):
@@ -65,8 +65,8 @@ def __evaluate(self, stack, buf, train):

#feature rep
empty = self.feature_extractor.empty
-topStack = [ stack.roots[-i-1].lstms if len(stack) > i else [empty] for i in xrange(self.k) ]
-topBuffer = [ buf.roots[i].lstms if len(buf) > i else [empty] for i in xrange(1) ]
+topStack = [ stack.roots[-i-1].lstms if len(stack) > i else [empty] for i in range(self.k) ]
+topBuffer = [ buf.roots[i].lstms if len(buf) > i else [empty] for i in range(1) ]

input = dy.concatenate(list(chain(*(topStack + topBuffer))))
output = self.unlabeled_MLP(input)
@@ -116,11 +116,11 @@ def __evaluate(self, stack, buf, train):


def Save(self, filename):
-print 'Saving model to ' + filename
+print('Saving model to ' + filename)
self.model.save(filename)

def Load(self, filename):
-print 'Loading model from ' + filename
+print('Loading model from ' + filename)
self.model.populate(filename)


@@ -208,7 +208,7 @@ def Predict(self, treebanks, datasplit, options):
reached_max_swap = 0
char_map = {}
if options.char_map_file:
-char_map_fh = codecs.open(options.char_map_file,encoding='utf-8')
+char_map_fh = open(options.char_map_file,encoding='utf-8')
char_map = json.loads(char_map_fh.read())
# should probably use a namedtuple in get_vocab to make this prettier
_, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map)
@@ -218,10 +218,10 @@ def Predict(self, treebanks, datasplit, options):
test_embeddings = defaultdict(lambda: {})
if options.word_emb_size > 0 and options.ext_word_emb_file:
new_test_words = \
-set(test_words) - self.feature_extractor.words.viewkeys()
+set(test_words) - self.feature_extractor.words.keys()

print "Number of OOV word types at test time: %i (out of %i)" % (
len(new_test_words), len(test_words))
print("Number of OOV word types at test time: %i (out of %i)" %
(len(new_test_words), len(test_words)))

if len(new_test_words) > 0:
# no point loading embeddings if there are no words to look for
@@ -234,15 +234,15 @@ def Predict(self, treebanks, datasplit, options):
)
test_embeddings["words"].update(embeddings)
if len(test_langs) > 1 and test_embeddings["words"]:
print "External embeddings found for %i words "\
print("External embeddings found for %i words "\
"(out of %i)" % \
(len(test_embeddings["words"]), len(new_test_words))
(len(test_embeddings["words"]), len(new_test_words)))

if options.char_emb_size > 0:
new_test_chars = \
-set(test_chars) - self.feature_extractor.chars.viewkeys()
-print "Number of OOV char types at test time: %i (out of %i)" % (
-len(new_test_chars), len(test_chars))
+set(test_chars) - self.feature_extractor.chars.keys()
+print("Number of OOV char types at test time: %i (out of %i)" %
+(len(new_test_chars), len(test_chars)))

if len(new_test_chars) > 0:
for lang in test_langs:
@@ -255,9 +255,9 @@ def Predict(self, treebanks, datasplit, options):
)
test_embeddings["chars"].update(embeddings)
if len(test_langs) > 1 and test_embeddings["chars"]:
print "External embeddings found for %i chars "\
print("External embeddings found for %i chars "\
"(out of %i)" % \
(len(test_embeddings["chars"]), len(new_test_chars))
(len(test_embeddings["chars"]), len(new_test_chars)))

data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
for iSentence, osentence in enumerate(data,1):
@@ -286,7 +286,7 @@ def Predict(self, treebanks, datasplit, options):
if iSwap == max_swap and not reached_swap_for_i_sentence:
reached_max_swap += 1
reached_swap_for_i_sentence = True
print "reached max swap in %d out of %d sentences"%(reached_max_swap, iSentence)
print("reached max swap in %d out of %d sentences"%(reached_max_swap, iSentence))
self.apply_transition(best,stack,buf,hoffset)
if best[1] == SWAP:
iSwap += 1
@@ -315,7 +315,7 @@ def Train(self, trainData, options):
start = time.time()

random.shuffle(trainData) # in certain cases the data will already have been shuffled after being read from file or while creating dev data
print "Length of training data: ", len(trainData)
print("Length of training data: ", len(trainData))

errs = []

@@ -328,7 +328,7 @@ def Train(self, trainData, options):
' Errors: %.3f'%((float(eerrors)) / etotal)+\
' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\
' Time: %.2gs'%(time.time()-start)
-print loss_message
+print(loss_message)
start = time.time()
eerrors = 0
eloss = 0.0
@@ -432,5 +432,5 @@ def Train(self, trainData, options):
dy.renew_cg()

self.trainer.update()
print "Loss: ", mloss/iSentence
print "Total Training Time: %.2gs"%(time.time()-beg)
print("Loss: ", mloss/iSentence)
print("Total Training Time: %.2gs" % (time.time()-beg))
27 changes: 14 additions & 13 deletions barchybrid/src/feature_extractor.py
@@ -4,7 +4,7 @@
import numpy as np
import random
from collections import defaultdict
-import codecs, re, os
+import re, os

class FeatureExtractor(object):
def __init__(self, model, options, vocab, nnvecs=1):
@@ -57,7 +57,7 @@ def __init__(self, model, options, vocab, nnvecs=1):
options,
emb_file=options.ext_word_emb_file,
lang=lang,
-words=self.words.viewkeys()
+words=self.words.keys()
)
self.external_embedding["words"].update(embeddings)

@@ -82,7 +82,7 @@ def __init__(self, model, options, vocab, nnvecs=1):
options,
emb_dir=options.ext_emb_dir,
lang=lang,
-words=self.words.viewkeys()
+words=self.words.keys()
)
self.external_embedding["words"].update(embeddings)

@@ -105,7 +105,7 @@ def __init__(self, model, options, vocab, nnvecs=1):
2 * (options.char_lstm_output_size
if options.char_emb_size > 0 else 0)
)
print "Word-level LSTM input size: " + str(self.lstm_input_size)
print("Word-level LSTM input size: " + str(self.lstm_input_size))

self.bilstms = []
if options.no_bilstms > 0:
@@ -136,14 +136,15 @@ def Init(self,options):
paddingTbankVec = self.treebank_lookup[0] if options.tbank_emb_size > 0 else None

self.paddingVec = dy.tanh(self.word2lstm.expr() *\
-dy.concatenate(filter(None,[paddingWordVec,
+dy.concatenate(list(filter(None,[paddingWordVec,
paddingElmoVec,
paddingPosVec,
paddingCharVec,
-paddingTbankVec])) + self.word2lstmbias.expr())
+paddingTbankVec]))) + self.word2lstmbias.expr())

self.empty = self.paddingVec if self.nnvecs == 1 else\
-dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])
+dy.concatenate([self.paddingVec for _ in range(self.nnvecs)])


def getWordEmbeddings(self, sentence, train, options, test_embeddings=defaultdict(lambda:{})):

@@ -197,11 +198,11 @@ def getWordEmbeddings(self, sentence, train, options, test_embeddings=defaultdic
# TODO
root.vecs["elmo"] = dy.zeros(self.elmo.emb_dim)

-root.vec = dy.concatenate(filter(None, [root.vecs["word"],
+root.vec = dy.concatenate(list(filter(None, [root.vecs["word"],
root.vecs["elmo"],
root.vecs["pos"],
root.vecs["char"],
root.vecs["treebank"]]))
root.vecs["treebank"]])))

for bilstm in self.bilstms:
bilstm.set_token_vecs(sentence,train)
@@ -224,19 +225,19 @@ def get_char_vector(self,root,train,test_embeddings_chars={}):
def init_lookups(self,options):

if self.external_embedding["words"]:
-print 'Initialising %i word vectors with external embeddings'%len(self.external_embedding["words"])
+print('Initialising %i word vectors with external embeddings'%len(self.external_embedding["words"]))
for word in self.external_embedding["words"]:
if len(self.external_embedding["words"][word]) != options.word_emb_size:
raise Exception("Size of external embedding does not match specified word embedding size of %s"%(options.word_emb_size))
self.word_lookup.init_row(self.words[word],self.external_embedding["words"][word])
elif options.word_emb_size > 0:
-print 'No word external embeddings found: all vectors initialised randomly'
+print('No word external embeddings found: all vectors initialised randomly')

if self.external_embedding["chars"]:
-print 'Initialising %i char vectors with external embeddings'%len(self.external_embedding["chars"])
+print('Initialising %i char vectors with external embeddings'%len(self.external_embedding["chars"]))
for char in self.external_embedding["chars"]:
if len(self.external_embedding["chars"][char]) != options.char_emb_size:
raise Exception("Size of external embedding does not match specified char embedding size of %s"%(options.char_emb_size))
self.char_lookup.init_row(self.chars[char],self.external_embedding["chars"][char])
elif options.char_emb_size > 0:
-print 'No character external embeddings found: all vectors initialised randomly'
+print('No character external embeddings found: all vectors initialised randomly')
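The list(filter(...)) wrappers introduced above are needed because Python 3's filter() returns a lazy iterator rather than a list, while dy.concatenate is called here with a concrete list of expressions. A standalone sketch of the difference, with plain strings standing in for DyNet expressions:

    vecs = ["word_vec", None, "pos_vec", None, "char_vec"]

    lazy = filter(None, vecs)             # a one-shot filter object in Python 3
    concrete = list(filter(None, vecs))   # ['word_vec', 'pos_vec', 'char_vec']

    # Only the materialised list supports len() and repeated iteration:
    assert len(concrete) == 3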
35 changes: 19 additions & 16 deletions barchybrid/src/mstlstm.py
@@ -37,7 +37,7 @@ def __getExpr(self, sentence, i, j, train):


def __evaluate(self, sentence, train):
-exprs = [ [self.__getExpr(sentence, i, j, train) for j in xrange(len(sentence))] for i in xrange(len(sentence)) ]
+exprs = [ [self.__getExpr(sentence, i, j, train) for j in range(len(sentence))] for i in range(len(sentence)) ]
scores = np.array([ [output.scalar_value() for output in exprsRow] for exprsRow in exprs ])
return scores, exprs

@@ -58,7 +58,7 @@ def Load(self, filename):
def Predict(self, treebanks, datasplit, options):
char_map = {}
if options.char_map_file:
-char_map_fh = codecs.open(options.char_map_file,encoding='utf-8')
+char_map_fh = open(options.char_map_file,encoding='utf-8')
char_map = json.loads(char_map_fh.read())
# should probably use a namedtuple in get_vocab to make this prettier
_, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map)
@@ -68,10 +68,10 @@ def Predict(self, treebanks, datasplit, options):
test_embeddings = defaultdict(lambda: {})
if options.word_emb_size > 0 and options.ext_word_emb_file:
new_test_words = \
-set(test_words) - self.feature_extractor.words.viewkeys()
+set(test_words) - self.feature_extractor.words.keys()

print "Number of OOV word types at test time: %i (out of %i)" % (
len(new_test_words), len(test_words))
print("Number of OOV word types at test time: %i (out of %i)" % (
len(new_test_words), len(test_words)))

if len(new_test_words) > 0:
# no point loading embeddings if there are no words to look for
@@ -84,15 +84,16 @@ def Predict(self, treebanks, datasplit, options):
)
test_embeddings["words"].update(embeddings)
if len(test_langs) > 1 and test_embeddings["words"]:
print "External embeddings found for %i words "\
print("External embeddings found for %i words "\
"(out of %i)" % \
(len(test_embeddings["words"]), len(new_test_words))
(len(test_embeddings["words"]),
len(new_test_words)))

if options.char_emb_size > 0:
new_test_chars = \
-set(test_chars) - self.feature_extractor.chars.viewkeys()
-print "Number of OOV char types at test time: %i (out of %i)" % (
-len(new_test_chars), len(test_chars))
+set(test_chars) - self.feature_extractor.chars.keys()
+print("Number of OOV char types at test time: %i (out of %i)" % (
+len(new_test_chars), len(test_chars)))

if len(new_test_chars) > 0:
for lang in test_langs:
@@ -105,9 +106,10 @@ def Predict(self, treebanks, datasplit, options):
)
test_embeddings["chars"].update(embeddings)
if len(test_langs) > 1 and test_embeddings["chars"]:
print "External embeddings found for %i chars "\
print("External embeddings found for %i chars "\
"(out of %i)" % \
(len(test_embeddings["chars"]), len(new_test_chars))
(len(test_embeddings["chars"]),
len(new_test_chars)))

data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
for iSentence, osentence in enumerate(data,1):
@@ -124,7 +126,8 @@ def Predict(self, treebanks, datasplit, options):
## ADD for handling multi-roots problem
rootHead = [head for head in heads if head==0]
if len(rootHead) != 1:
print "it has multi-root, changing it for heading first root for other roots"
print("it has multi-root, changing it for heading first root\
for other roots")
rootHead = [seq for seq, head in enumerate(heads) if head == 0]
for seq in rootHead[1:]:heads[seq] = rootHead[0]
## finish to multi-roots
@@ -174,7 +177,7 @@ def Train(self, trainData, options):
' Errors: %.3f'%((float(eerrors)) / etotal)+\
' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\
' Time: %.2gs'%(time.time()-start)
-print loss_message
+print(loss_message)
start = time.time()
eerrors = 0
eloss = 0.0
Expand Down Expand Up @@ -244,5 +247,5 @@ def Train(self, trainData, options):
dy.renew_cg()

self.trainer.update()
print "Loss: ", mloss/iSentence
print "Total Training Time: %.2gs"%(time.time()-beg)
print("Loss: ", mloss/iSentence)
print("Total Training Time: %.2gs"%(time.time()-beg))