diff --git a/README.md b/README.md index 9b183d0..e5b59e0 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ The techniques behind the original parser are described in the paper [Simple and #### Required software - * Python 2.7 interpreter + * Python 3 (/!\ recent move from Python 2.7, which was used for all previous releases). * [DyNet library](https://github.com/clab/dynet/tree/master/python) Note: the current version is Dynet 2.0 but Dynet 1.0 was used in both releases 1.0 and 2.0 @@ -84,6 +84,7 @@ You can also specify the gamma scalar using `--elmo_gamma` or set `--elmo_learn_ to learn the value during training. Credits to Johannes Gontrum for this addition. +Credits to Giuseppe Attardi for porting the parser to Python 3. #### Citation diff --git a/barchybrid/src/arc_hybrid.py b/barchybrid/src/arc_hybrid.py index 93dd668..76c73f4 100644 --- a/barchybrid/src/arc_hybrid.py +++ b/barchybrid/src/arc_hybrid.py @@ -5,7 +5,7 @@ import numpy as np from copy import deepcopy from collections import defaultdict -import codecs, json +import json class ArcHybridLSTM: def __init__(self, vocab, options): @@ -65,8 +65,8 @@ def __evaluate(self, stack, buf, train): #feature rep empty = self.feature_extractor.empty - topStack = [ stack.roots[-i-1].lstms if len(stack) > i else [empty] for i in xrange(self.k) ] - topBuffer = [ buf.roots[i].lstms if len(buf) > i else [empty] for i in xrange(1) ] + topStack = [ stack.roots[-i-1].lstms if len(stack) > i else [empty] for i in range(self.k) ] + topBuffer = [ buf.roots[i].lstms if len(buf) > i else [empty] for i in range(1) ] input = dy.concatenate(list(chain(*(topStack + topBuffer)))) output = self.unlabeled_MLP(input) @@ -116,11 +116,11 @@ def __evaluate(self, stack, buf, train): def Save(self, filename): - print 'Saving model to ' + filename + print('Saving model to ' + filename) self.model.save(filename) def Load(self, filename): - print 'Loading model from ' + filename + print('Loading model from ' + filename) self.model.populate(filename) @@ -208,7 +208,7 @@ def Predict(self, treebanks, datasplit, options): reached_max_swap = 0 char_map = {} if options.char_map_file: - char_map_fh = codecs.open(options.char_map_file,encoding='utf-8') + char_map_fh = open(options.char_map_file,encoding='utf-8') char_map = json.loads(char_map_fh.read()) # should probably use a namedtuple in get_vocab to make this prettier _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map) @@ -218,10 +218,10 @@ def Predict(self, treebanks, datasplit, options): test_embeddings = defaultdict(lambda: {}) if options.word_emb_size > 0 and options.ext_word_emb_file: new_test_words = \ - set(test_words) - self.feature_extractor.words.viewkeys() + set(test_words) - self.feature_extractor.words.keys() - print "Number of OOV word types at test time: %i (out of %i)" % ( - len(new_test_words), len(test_words)) + print("Number of OOV word types at test time: %i (out of %i)" % + (len(new_test_words), len(test_words))) if len(new_test_words) > 0: # no point loading embeddings if there are no words to look for @@ -234,15 +234,15 @@ def Predict(self, treebanks, datasplit, options): ) test_embeddings["words"].update(embeddings) if len(test_langs) > 1 and test_embeddings["words"]: - print "External embeddings found for %i words "\ + print("External embeddings found for %i words "\ "(out of %i)" % \ - (len(test_embeddings["words"]), len(new_test_words)) + (len(test_embeddings["words"]), len(new_test_words))) if options.char_emb_size > 0: new_test_chars = \
- set(test_chars) - self.feature_extractor.chars.viewkeys() - print "Number of OOV char types at test time: %i (out of %i)" % ( - len(new_test_chars), len(test_chars)) + set(test_chars) - self.feature_extractor.chars.keys() + print("Number of OOV char types at test time: %i (out of %i)" % + (len(new_test_chars), len(test_chars))) if len(new_test_chars) > 0: for lang in test_langs: @@ -255,9 +255,9 @@ def Predict(self, treebanks, datasplit, options): ) test_embeddings["chars"].update(embeddings) if len(test_langs) > 1 and test_embeddings["chars"]: - print "External embeddings found for %i chars "\ + print("External embeddings found for %i chars "\ "(out of %i)" % \ - (len(test_embeddings["chars"]), len(new_test_chars)) + (len(test_embeddings["chars"]), len(new_test_chars))) data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map) for iSentence, osentence in enumerate(data,1): @@ -286,7 +286,7 @@ def Predict(self, treebanks, datasplit, options): if iSwap == max_swap and not reached_swap_for_i_sentence: reached_max_swap += 1 reached_swap_for_i_sentence = True - print "reached max swap in %d out of %d sentences"%(reached_max_swap, iSentence) + print("reached max swap in %d out of %d sentences"%(reached_max_swap, iSentence)) self.apply_transition(best,stack,buf,hoffset) if best[1] == SWAP: iSwap += 1 @@ -315,7 +315,7 @@ def Train(self, trainData, options): start = time.time() random.shuffle(trainData) # in certain cases the data will already have been shuffled after being read from file or while creating dev data - print "Length of training data: ", len(trainData) + print("Length of training data: ", len(trainData)) errs = [] @@ -328,7 +328,7 @@ def Train(self, trainData, options): ' Errors: %.3f'%((float(eerrors)) / etotal)+\ ' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\ ' Time: %.2gs'%(time.time()-start) - print loss_message + print(loss_message) start = time.time() eerrors = 0 eloss = 0.0 @@ -432,5 +432,5 @@ def Train(self, trainData, options): dy.renew_cg() self.trainer.update() - print "Loss: ", mloss/iSentence - print "Total Training Time: %.2gs"%(time.time()-beg) + print("Loss: ", mloss/iSentence) + print("Total Training Time: %.2gs" % (time.time()-beg)) diff --git a/barchybrid/src/feature_extractor.py b/barchybrid/src/feature_extractor.py index 57b3473..5491e35 100644 --- a/barchybrid/src/feature_extractor.py +++ b/barchybrid/src/feature_extractor.py @@ -4,7 +4,7 @@ import numpy as np import random from collections import defaultdict -import codecs, re, os +import re, os class FeatureExtractor(object): def __init__(self, model, options, vocab, nnvecs=1): @@ -57,7 +57,7 @@ def __init__(self, model, options, vocab, nnvecs=1): options, emb_file=options.ext_word_emb_file, lang=lang, - words=self.words.viewkeys() + words=self.words.keys() ) self.external_embedding["words"].update(embeddings) @@ -82,7 +82,7 @@ def __init__(self, model, options, vocab, nnvecs=1): options, emb_dir=options.ext_emb_dir, lang=lang, - words=self.words.viewkeys() + words=self.words.keys() ) self.external_embedding["words"].update(embeddings) @@ -105,7 +105,7 @@ def __init__(self, model, options, vocab, nnvecs=1): 2 * (options.char_lstm_output_size if options.char_emb_size > 0 else 0) ) - print "Word-level LSTM input size: " + str(self.lstm_input_size) + print("Word-level LSTM input size: " + str(self.lstm_input_size)) self.bilstms = [] if options.no_bilstms > 0: @@ -136,14 +136,15 @@ def Init(self,options): paddingTbankVec = self.treebank_lookup[0] if options.tbank_emb_size > 0 else None 
self.paddingVec = dy.tanh(self.word2lstm.expr() *\ - dy.concatenate(filter(None,[paddingWordVec, + dy.concatenate(list(filter(None,[paddingWordVec, paddingElmoVec, paddingPosVec, paddingCharVec, - paddingTbankVec])) + self.word2lstmbias.expr()) + paddingTbankVec]))) + self.word2lstmbias.expr()) self.empty = self.paddingVec if self.nnvecs == 1 else\ - dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)]) + dy.concatenate([self.paddingVec for _ in range(self.nnvecs)]) + def getWordEmbeddings(self, sentence, train, options, test_embeddings=defaultdict(lambda:{})): @@ -197,11 +198,11 @@ def getWordEmbeddings(self, sentence, train, options, test_embeddings=defaultdic # TODO root.vecs["elmo"] = dy.zeros(self.elmo.emb_dim) - root.vec = dy.concatenate(filter(None, [root.vecs["word"], + root.vec = dy.concatenate(list(filter(None, [root.vecs["word"], root.vecs["elmo"], root.vecs["pos"], root.vecs["char"], - root.vecs["treebank"]])) + root.vecs["treebank"]]))) for bilstm in self.bilstms: bilstm.set_token_vecs(sentence,train) @@ -224,19 +225,19 @@ def get_char_vector(self,root,train,test_embeddings_chars={}): def init_lookups(self,options): if self.external_embedding["words"]: - print 'Initialising %i word vectors with external embeddings'%len(self.external_embedding["words"]) + print('Initialising %i word vectors with external embeddings'%len(self.external_embedding["words"])) for word in self.external_embedding["words"]: if len(self.external_embedding["words"][word]) != options.word_emb_size: raise Exception("Size of external embedding does not match specified word embedding size of %s"%(options.word_emb_size)) self.word_lookup.init_row(self.words[word],self.external_embedding["words"][word]) elif options.word_emb_size > 0: - print 'No word external embeddings found: all vectors initialised randomly' + print('No word external embeddings found: all vectors initialised randomly') if self.external_embedding["chars"]: - print 'Initialising %i char vectors with external embeddings'%len(self.external_embedding["chars"]) + print('Initialising %i char vectors with external embeddings'%len(self.external_embedding["chars"])) for char in self.external_embedding["chars"]: if len(self.external_embedding["chars"][char]) != options.char_emb_size: raise Exception("Size of external embedding does not match specified char embedding size of %s"%(options.char_emb_size)) self.char_lookup.init_row(self.chars[char],self.external_embedding["chars"][char]) elif options.char_emb_size > 0: - print 'No character external embeddings found: all vectors initialised randomly' + print('No character external embeddings found: all vectors initialised randomly') diff --git a/barchybrid/src/mstlstm.py b/barchybrid/src/mstlstm.py index bc03ad2..0052774 100644 --- a/barchybrid/src/mstlstm.py +++ b/barchybrid/src/mstlstm.py @@ -37,7 +37,7 @@ def __getExpr(self, sentence, i, j, train): def __evaluate(self, sentence, train): - exprs = [ [self.__getExpr(sentence, i, j, train) for j in xrange(len(sentence))] for i in xrange(len(sentence)) ] + exprs = [ [self.__getExpr(sentence, i, j, train) for j in range(len(sentence))] for i in range(len(sentence)) ] scores = np.array([ [output.scalar_value() for output in exprsRow] for exprsRow in exprs ]) return scores, exprs @@ -58,7 +58,7 @@ def Load(self, filename): def Predict(self, treebanks, datasplit, options): char_map = {} if options.char_map_file: - char_map_fh = codecs.open(options.char_map_file,encoding='utf-8') + char_map_fh = open(options.char_map_file,encoding='utf-8') char_map = 
json.loads(char_map_fh.read()) # should probably use a namedtuple in get_vocab to make this prettier _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map) @@ -68,10 +68,10 @@ def Predict(self, treebanks, datasplit, options): test_embeddings = defaultdict(lambda: {}) if options.word_emb_size > 0 and options.ext_word_emb_file: new_test_words = \ - set(test_words) - self.feature_extractor.words.viewkeys() + set(test_words) - self.feature_extractor.words.keys() - print "Number of OOV word types at test time: %i (out of %i)" % ( - len(new_test_words), len(test_words)) + print("Number of OOV word types at test time: %i (out of %i)" % ( + len(new_test_words), len(test_words))) if len(new_test_words) > 0: # no point loading embeddings if there are no words to look for @@ -84,15 +84,16 @@ def Predict(self, treebanks, datasplit, options): ) test_embeddings["words"].update(embeddings) if len(test_langs) > 1 and test_embeddings["words"]: - print "External embeddings found for %i words "\ + print("External embeddings found for %i words "\ "(out of %i)" % \ - (len(test_embeddings["words"]), len(new_test_words)) + (len(test_embeddings["words"]), + len(new_test_words))) if options.char_emb_size > 0: new_test_chars = \ - set(test_chars) - self.feature_extractor.chars.viewkeys() - print "Number of OOV char types at test time: %i (out of %i)" % ( - len(new_test_chars), len(test_chars)) + set(test_chars) - self.feature_extractor.chars.keys() + print("Number of OOV char types at test time: %i (out of %i)" % ( + len(new_test_chars), len(test_chars))) if len(new_test_chars) > 0: for lang in test_langs: @@ -105,9 +106,10 @@ def Predict(self, treebanks, datasplit, options): ) test_embeddings["chars"].update(embeddings) if len(test_langs) > 1 and test_embeddings["chars"]: - print "External embeddings found for %i chars "\ + print("External embeddings found for %i chars "\ "(out of %i)" % \ - (len(test_embeddings["chars"]), len(new_test_chars)) + (len(test_embeddings["chars"]), + len(new_test_chars))) data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map) for iSentence, osentence in enumerate(data,1): @@ -124,7 +126,8 @@ def Predict(self, treebanks, datasplit, options): ## ADD for handling multi-roots problem rootHead = [head for head in heads if head==0] if len(rootHead) != 1: - print "it has multi-root, changing it for heading first root for other roots" + print("it has multi-root, changing it for heading first root\ + for other roots") rootHead = [seq for seq, head in enumerate(heads) if head == 0] for seq in rootHead[1:]:heads[seq] = rootHead[0] ## finish to multi-roots @@ -174,7 +177,7 @@ def Train(self, trainData, options): ' Errors: %.3f'%((float(eerrors)) / etotal)+\ ' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\ ' Time: %.2gs'%(time.time()-start) - print loss_message + print(loss_message) start = time.time() eerrors = 0 eloss = 0.0 @@ -244,5 +247,5 @@ def Train(self, trainData, options): dy.renew_cg() self.trainer.update() - print "Loss: ", mloss/iSentence - print "Total Training Time: %.2gs"%(time.time()-beg) + print("Loss: ", mloss/iSentence) + print("Total Training Time: %.2gs"%(time.time()-beg)) diff --git a/barchybrid/src/options_manager.py b/barchybrid/src/options_manager.py index f76e241..d405e34 100644 --- a/barchybrid/src/options_manager.py +++ b/barchybrid/src/options_manager.py @@ -42,7 +42,7 @@ def __init__(self,options): if not options.outdir: raise Exception("You must specify an output directory via the --outdir option") 
elif not os.path.exists(options.outdir): # create output directory if it doesn't exist - print "Creating output directory " + options.outdir + print("Creating output directory " + options.outdir) os.mkdir(options.outdir) if not options.graph_based and (not options.predict and not @@ -51,7 +51,7 @@ def __init__(self,options): raise Exception("Must include either head, rl or rlmost (For example, if you specified --disable-head and --disable-rlmost, you must specify --userl)") if not options.graph_based and (options.rlFlag and options.rlMostFlag): - print 'Warning: Switching off rlMostFlag to allow rlFlag to take precedence' + print('Warning: Switching off rlMostFlag to allow rlFlag to take precedence') options.rlMostFlag = False if options.word_emb_size == 0 and options.pos_emb_size == 0 and\ @@ -140,10 +140,10 @@ def create_UD_treebank_list(self,options): else: treebank.outdir = options.outdir if not os.path.exists(treebank.outdir): # create language-specific output folder if it doesn't exist - print "Creating language-specific output directory " + treebank.outdir + print("Creating language-specific output directory " + treebank.outdir) os.mkdir(treebank.outdir) else: - print ("Warning: language-specific subdirectory " + treebank.outdir + print("Warning: language-specific subdirectory " + treebank.outdir + " already exists, contents may be overwritten") if not options.predict: @@ -162,7 +162,7 @@ def create_UD_treebank_list(self,options): treebanks.append(treebank) else: - print "Warning: skipping invalid language code " + iso + print("Warning: skipping invalid language code " + iso) return treebanks @@ -179,7 +179,7 @@ def prepareDev(self,treebank,options): dev_file = os.path.join(treebank.outdir,'dev-split' + '.conllu') # location for the new dev file train_file = os.path.join(treebank.outdir,'train-split' + '.conllu') # location for the new train file dev_len = int(0.01*options.dev_percent*tot_sen) - print ("Taking " + str(dev_len) + " of " + str(tot_sen) + print("Taking " + str(dev_len) + " of " + str(tot_sen) + " sentences from training data as new dev data for " + treebank.name) random.shuffle(train_data) dev_data = train_data[:dev_len] @@ -191,21 +191,21 @@ def prepareDev(self,treebank,options): treebank.devfile = dev_file treebank.trainfile = train_file else: # not enough sentences - print ("Warning: not enough sentences in training data to create dev set for " + print("Warning: not enough sentences in training data to create dev set for " + treebank.name + " (minimum required --min-train-size: " + str(options.min_train_sents) + ")") treebank.pred_dev = False else: # option --create-dev not set - print ("Warning: No dev data for " + treebank.name + print("Warning: No dev data for " + treebank.name + ", consider adding option --create-dev to create dev data from training set") treebank.pred_dev = False if options.model_selection and not treebank.pred_dev: - print "Warning: can't do model selection for " + treebank.name + " as prediction on dev data is off" + print("Warning: can't do model selection for " + treebank.name + " as prediction on dev data is off") # if debug options is set, we read in the training, dev and test files as appropriate, cap the number of sentences and store # new files with these smaller data sets def createDebugData(self,treebank,options): ext = '.conllu' if options.conllu else '.conll' - print 'Creating smaller data sets for debugging' + print('Creating smaller data sets for debugging') if not options.predict: train_data = 
list(utils.read_conll(treebank.trainfile,maxSize=options.debug_train_sents,hard_lim=True)) train_file = os.path.join(treebank.outdir,'train-debug' + ext) # location for the new train file diff --git a/barchybrid/src/parser.py b/barchybrid/src/parser.py index 2f5d5e9..e2d60bf 100644 --- a/barchybrid/src/parser.py +++ b/barchybrid/src/parser.py @@ -2,49 +2,49 @@ from options_manager import OptionsManager import pickle, utils, os, time, sys, copy, itertools, re, random from shutil import copyfile -import codecs + def run(experiment,options): if options.graph_based: from mstlstm import MSTParserLSTM as Parser - print 'Working with a graph-based parser' + print('Working with a graph-based parser') else: from arc_hybrid import ArcHybridLSTM as Parser - print 'Working with a transition-based parser' + print('Working with a transition-based parser') if not options.predict: # training paramsfile = os.path.join(experiment.outdir, options.params) if not options.continueTraining: - print 'Preparing vocab' + print('Preparing vocab') vocab = utils.get_vocab(experiment.treebanks,"train") - print 'Finished collecting vocab' + print('Finished collecting vocab') - with open(paramsfile, 'w') as paramsfp: - print 'Saving params to ' + paramsfile + with open(paramsfile, 'wb') as paramsfp: + print('Saving params to ' + paramsfile) pickle.dump((vocab, options), paramsfp) - print 'Initializing the model' + print('Initializing the model') parser = Parser(vocab, options) else: #continue if options.continueParams: paramsfile = options.continueParams with open(paramsfile, 'r') as paramsfp: stored_vocab, stored_options = pickle.load(paramsfp) - print 'Initializing the model:' + print('Initializing the model:') parser = Parser(stored_vocab, stored_options) parser.Load(options.continueModel) dev_best = [options.epochs,-1.0] # best epoch, best score - for epoch in xrange(options.first_epoch, options.epochs+1): - print 'Starting epoch ' + str(epoch) + for epoch in range(options.first_epoch, options.epochs+1): + print('Starting epoch ' + str(epoch)) traindata = list(utils.read_conll_dir(experiment.treebanks, "train", options.max_sentences)) parser.Train(traindata,options) - print 'Finished epoch ' + str(epoch) + print('Finished epoch ' + str(epoch)) model_file = os.path.join(experiment.outdir, options.model + str(epoch)) parser.Save(model_file) @@ -56,7 +56,7 @@ def run(experiment,options): if pred_treebanks: for treebank in pred_treebanks: treebank.outfilename = os.path.join(treebank.outdir, 'dev_epoch_' + str(epoch) + '.conllu') - print "Predicting on dev data for " + treebank.name + print("Predicting on dev data for " + treebank.name) pred = list(parser.Predict(pred_treebanks,"dev",options)) utils.write_conll_multiling(pred,pred_treebanks) @@ -64,28 +64,28 @@ def run(experiment,options): mean_score = 0.0 for treebank in pred_treebanks: score = utils.evaluate(treebank.dev_gold,treebank.outfilename,options.conllu) - print "Dev score %.2f at epoch %i for %s"%(score,epoch,treebank.name) + print("Dev score %.2f at epoch %i for %s"%(score,epoch,treebank.name)) mean_score += score if len(pred_treebanks) > 1: # multiling case mean_score = mean_score/len(pred_treebanks) - print "Mean dev score %.2f at epoch %i"%(mean_score,epoch) + print("Mean dev score %.2f at epoch %i"%(mean_score,epoch)) if options.model_selection: if mean_score > dev_best[1]: dev_best = [epoch,mean_score] # update best dev score - # hack to print the word "mean" if the dev score is an average + # hack to print the word "mean" if the dev score is an average
mean_string = "mean " if len(pred_treebanks) > 1 else "" - print "Best %sdev score %.2f at epoch %i"%(mean_string,dev_best[1],dev_best[0]) + print("Best %sdev score %.2f at epoch %i"%(mean_string,dev_best[1],dev_best[0])) # at the last epoch choose which model to copy to barchybrid.model if epoch == options.epochs: bestmodel_file = os.path.join(experiment.outdir,"barchybrid.model" + str(dev_best[0])) model_file = os.path.join(experiment.outdir,"barchybrid.model") - print "Copying " + bestmodel_file + " to " + model_file + print("Copying " + bestmodel_file + " to " + model_file) copyfile(bestmodel_file,model_file) best_dev_file = os.path.join(experiment.outdir,"best_dev_epoch.txt") with open (best_dev_file, 'w') as fh: - print "Writing best scores to: " + best_dev_file + print("Writing best scores to: " + best_dev_file) if len(experiment.treebanks) == 1: fh.write("Best dev score %s at epoch %i\n"%(dev_best[1],dev_best[0])) else: @@ -94,7 +94,7 @@ def run(experiment,options): else: #if predict - so params = os.path.join(experiment.modeldir,options.params) - print 'Reading params from ' + params + print('Reading params from ' + params) with open(params, 'r') as paramsfp: stored_vocab, stored_opt = pickle.load(paramsfp) @@ -126,11 +126,12 @@ def run(experiment,options): if options.pred_eval: for treebank in experiment.treebanks: - print "Evaluating on " + treebank.name + print("Evaluating on " + treebank.name) score = utils.evaluate(treebank.test_gold,treebank.outfilename,options.conllu) - print "Obtained LAS F1 score of %.2f on %s" %(score,treebank.name) + print("Obtained LAS F1 score of %.2f on %s" %(score,treebank.name)) + + print('Finished predicting') - print 'Finished predicting' if __name__ == '__main__': diff --git a/barchybrid/src/utils.py b/barchybrid/src/utils.py index 7dbcee2..d694116 100644 --- a/barchybrid/src/utils.py +++ b/barchybrid/src/utils.py @@ -4,12 +4,13 @@ from itertools import chain from operator import itemgetter import random -import codecs, json +import json # a global variable so we don't have to keep loading from file repeatedly iso_dict = {} reverse_iso_dict = {} + class ConllEntry: def __init__(self, id, form, lemma, pos, cpos, feats=None, parent_id=None, relation=None, deps=None, misc=None, treebank_id=None, proxy_tbank=None, language=None, char_rep=None): @@ -48,6 +49,7 @@ def __str__(self): self.deps, self.misc] return '\t'.join(['_' if v is None else v for v in values]) + class Treebank(object): def __init__(self,trainfile,devfile,testfile): self.name = 'noname' @@ -59,6 +61,7 @@ def __init__(self,trainfile,devfile,testfile): self.outfilename = None self.proxy_tbank = None + class UDtreebank(Treebank): def __init__(self, treebank_info, options): """ @@ -101,6 +104,7 @@ def __init__(self, treebank_info, options): self.dev_gold = self.test_gold self.outfilename = self.iso_id + '.conllu' + class ParseForest: def __init__(self, sentence): self.roots = list(sentence) @@ -143,6 +147,7 @@ def isProj(sentence): return len(forest.roots) == 1 + def get_vocab(treebanks,datasplit,char_map={}): """ Collect frequencies of words, cpos, pos and deprels + languages. 
@@ -178,21 +183,24 @@ def get_vocab(treebanks,datasplit,char_map={}): # loads the same when predicting with a saved model later on # this is also another reason not to use sets for everything here as they are unordered # which creates problems when loading from file at predict time - return (wordsCount, wordsCount.keys(), charsCount.keys(), posCount.keys(), - cposCount.keys(), relCount.keys(), tbankCount.keys(), langCount.keys()) + return (wordsCount, list(wordsCount.keys()), list(charsCount.keys()), list(posCount.keys()), + list(cposCount.keys()), list(relCount.keys()), list(tbankCount.keys()), list(langCount.keys())) + def load_iso_dict(json_file='./src/utils/ud_iso.json'): - print "Loading ISO dict from %s"%json_file + print("Loading ISO dict from %s"%json_file) global iso_dict - ud_iso_file = codecs.open(json_file,encoding='utf-8') + ud_iso_file = open(json_file,encoding='utf-8') json_str = ud_iso_file.read() iso_dict = json.loads(json_str) + def load_reverse_iso_dict(json_file='./src/utils/ud_iso.json'): global reverse_iso_dict if not iso_dict: load_iso_dict(json_file=json_file) - reverse_iso_dict = {v: k for k, v in iso_dict.iteritems()} + reverse_iso_dict = {v: k for k, v in iso_dict.items()} + def load_lang_iso_dict(json_file='./src/utils/ud_iso.json'): @@ -208,6 +216,7 @@ def load_lang_iso_dict(json_file='./src/utils/ud_iso.json'): return lang_iso_dict + # convert treebank to language by removing everything after underscore def get_lang_from_tbank_name(tbank_name): @@ -221,6 +230,7 @@ def get_lang_from_tbank_name(tbank_name): return lang + def get_lang_from_tbank_id(tbank_id): if not tbank_id: return None @@ -229,6 +239,7 @@ def get_lang_from_tbank_id(tbank_id): load_reverse_iso_dict() return get_lang_from_tbank_name(reverse_iso_dict[tbank_id]) + # gets everything before the underscore in treebank iso e.g. 
"sv_talbanken" -> "sv" # with an exception for the two Norwegian variants where it's useful to consider them # as separate languages @@ -241,10 +252,12 @@ def get_lang_iso(treebank_iso): m = re.match(r'(.*_(nynorsk|bokmaal)?)',treebank_iso) return m.group(1).rstrip('_') + # from a list of treebanks, return those that match a particular language def get_treebanks_from_lang(treebank_ids,lang): return [treebank_id for treebank_id in treebank_ids if get_lang_from_tbank_id(treebank_id) == lang] + def get_all_treebanks(options): if not iso_dict: @@ -255,8 +268,9 @@ def get_all_treebanks(options): return json_treebanks + def read_conll_dir(treebanks,filetype,maxSize=-1,char_map={}): - #print "Max size for each corpus: ", maxSize + #print("Max size for each corpus: ", maxSize) if filetype == "train": return chain(*(read_conll(treebank.trainfile, treebank.iso_id, treebank.proxy_tbank, maxSize, train=True, char_map=char_map) for treebank in treebanks)) elif filetype == "dev": @@ -268,15 +282,14 @@ def read_conll_dir(treebanks,filetype,maxSize=-1,char_map={}): def generate_root_token(treebank_id, proxy_tbank, language): return ConllEntry(0, '*root*', '*root*', 'ROOT-POS', 'ROOT-CPOS', '_', -1, 'rroot', '_', '_',treebank_id=treebank_id, proxy_tbank=proxy_tbank, - language=language - ) + language=language) def read_conll(filename, treebank_id=None, proxy_tbank=None, maxSize=-1, hard_lim=False, vocab_prep=False, drop_nproj=False, train=True, char_map={}): # hard lim means capping the corpus size across the whole training procedure # soft lim means using a sample of the whole corpus at each epoch - fh = codecs.open(filename,'r',encoding='utf-8') - print "Reading " + filename + fh = open(filename,'r',encoding='utf-8') + print("Reading " + filename) if vocab_prep and not hard_lim: maxSize = -1 # when preparing the vocab with a soft limit we need to use the whole corpus ts = time.time() @@ -312,12 +325,12 @@ def read_conll(filename, treebank_id=None, proxy_tbank=None, maxSize=-1, hard_li yield tokens yield_count += 1 if yield_count == maxSize: - print "Capping size of corpus at " + str(yield_count) + " sentences" + print("Capping size of corpus at " + str(yield_count) + " sentences") break; else: yield tokens else: - #print 'Non-projective sentence dropped' + #print('Non-projective sentence dropped') dropped += 1 tokens = [generate_root_token(treebank_id, proxy_tbank, language)] else: @@ -335,52 +348,53 @@ def read_conll(filename, treebank_id=None, proxy_tbank=None, maxSize=-1, hard_li tokens.append(token) if hard_lim and yield_count < maxSize: - print 'Warning: unable to yield ' + str(maxSize) + ' sentences, only ' + str(yield_count) + ' found' + print('Warning: unable to yield ' + str(maxSize) + ' sentences, only ' + str(yield_count) + ' found') # TODO: deal with case where there are still unyielded tokens # e.g. 
when there is no newline at end of file # if len(tokens) > 1: # yield tokens - print sents_read, 'sentences read' + print(sents_read, 'sentences read') if maxSize > 0 and not hard_lim: if len(sents) > maxSize: sents = random.sample(sents,maxSize) - print "Yielding " + str(len(sents)) + " random sentences" + print("Yielding " + str(len(sents)) + " random sentences") for toks in sents: yield toks te = time.time() - print 'Time: %.2gs'%(te-ts) + print('Time: %.2gs'%(te-ts)) + def write_conll(fn, conll_gen): - print "Writing to " + fn + print("Writing to " + fn) sents = 0 - with codecs.open(fn, 'w', encoding='utf-8') as fh: + with open(fn, 'w', encoding='utf-8') as fh: for sentence in conll_gen: sents += 1 for entry in sentence[1:]: - fh.write(unicode(entry) + '\n') - #print str(entry) + fh.write(str(entry) + '\n') fh.write('\n') - print "Wrote " + str(sents) + " sentences" + print("Wrote " + str(sents) + " sentences") + def write_conll_multiling(conll_gen, treebanks): tbank_dict = {treebank.iso_id:treebank for treebank in treebanks} cur_tbank = conll_gen[0][0].treebank_id outfile = tbank_dict[cur_tbank].outfilename - fh = codecs.open(outfile,'w',encoding='utf-8') - print "Writing to " + outfile + fh = open(outfile,'w',encoding='utf-8') + print("Writing to " + outfile) for sentence in conll_gen: if cur_tbank != sentence[0].treebank_id: fh.close() cur_tbank = sentence[0].treebank_id outfile = tbank_dict[cur_tbank].outfilename - fh = codecs.open(outfile,'w',encoding='utf-8') - print "Writing to " + outfile + fh = open(outfile,'w',encoding='utf-8') + print("Writing to " + outfile) for entry in sentence[1:]: - fh.write(unicode(entry) + '\n') + fh.write(str(entry) + '\n') fh.write('\n') @@ -388,18 +402,20 @@ def parse_list_arg(l): """Return a list of line values if it's a file or a list of values if it is a string""" if os.path.isfile(l): - f = codecs.open(l, 'r', encoding='utf-8') + f = open(l, 'r', encoding='utf-8') return [line.strip("\n").split()[0] for line in f] else: return [el for el in l.split(" ")] + numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+"); def normalize(word): return 'NUM' if numberRegex.match(word) else word.lower() + def evaluate(gold,test,conllu): scoresfile = test + '.txt' - print "Writing to " + scoresfile + print("Writing to " + scoresfile) if not conllu: #os.system('perl src/utils/eval.pl -g ' + gold + ' -s ' + test + ' > ' + scoresfile + ' &') os.system('perl src/utils/eval.pl -g ' + gold + ' -s ' + test + ' > ' + scoresfile ) @@ -408,6 +424,7 @@ def evaluate(gold,test,conllu): score = get_LAS_score(scoresfile,conllu) return score + def inorder(sentence): queue = [sentence[0]] def inorder_helper(sentence,i): @@ -423,18 +440,21 @@ def inorder_helper(sentence,i): return results return inorder_helper(sentence,queue[0].id) + def set_seeds(options): python_seed = 1 if not options.predict and options.dynet_seed: # seeds shouldn't make any difference when predicting - print "Using default Python seed" + print("Using default Python seed") random.seed(python_seed) + def generate_seed(): return random.randint(0,10**9) # this range seems to work for Dynet and Python's random function + def get_LAS_score(filename, conllu=True): score = None - with codecs.open(filename,'r',encoding='utf-8') as fh: + with open(filename,'r',encoding='utf-8') as fh: if conllu: for line in fh: if re.match(r'^LAS',line): @@ -446,44 +466,50 @@ def get_LAS_score(filename, conllu=True): return score +import lzma + def extract_embeddings_from_file(filename, words=None, max_emb=-1, 
filtered_filename=None): # words should be a set used to filter the embeddings - print "Extracting embeddings from", filename + print("Extracting embeddings from", filename) ts = time.time() line_count = 0 error_count = 0 # e.g. invalid utf-8 in embeddings file - with open(filename,'r') as fh: # byte string + #with open(filename,'r') as fh: # byte string + with lzma.open(filename, mode='rt', encoding='utf-8') as fh: - fh.readline() # ignore first line with embedding stats + next(fh) # ignore first line with embedding stats embeddings = OrderedDict() - for line in fh: + while True: if max_emb < 0 or line_count < max_emb: try: + line = next(fh) # only split on normal space, not e.g. non-break space - eles = line.decode('utf-8').strip().split(" ") + eles = line.strip().split(" ") word = re.sub(u"\xa0"," ",eles[0]) # replace non-break space with regular space if not words or word in words: embeddings[word] = [float(f) for f in eles[1:]] + except StopIteration: + break except UnicodeDecodeError: -# print "Unable to read word at line %i: %s"%(line_count, word) +# print("Unable to read word at line %i: %s"%(line_count, word)) error_count += 1 line_count += 1 if line_count % 100000 == 0: - print "Reading line: " + str(line_count) + print("Reading line: " + str(line_count)) else: break - print "Read %i embeddings"%line_count + print("Read %i embeddings"%line_count) te = time.time() - print 'Time: %.2gs'%(te-ts) -# print "%i utf-8 errors"%error_count + print('Time: %.2gs'%(te-ts)) +# print("%i utf-8 errors"%error_count) if words: - print "%i entries found from vocabulary (out of %i)"%(len(embeddings),len(words)) + print("%i entries found from vocabulary (out of %i)"%(len(embeddings),len(words))) if filtered_filename and embeddings: - print "Writing filtered embeddings to " + filtered_filename + print("Writing filtered embeddings to " + filtered_filename) with open(filtered_filename,'w') as fh_filter: no_embeddings = len(embeddings) embedding_size = len(embeddings.itervalues().next()) @@ -496,6 +522,7 @@ def extract_embeddings_from_file(filename, words=None, max_emb=-1, filtered_file return embeddings + def get_external_embeddings(options, emb_file=None, emb_dir=None, lang=None, words=None, chars=False): @@ -528,11 +555,12 @@ def get_external_embeddings(options, emb_file=None, emb_dir=None, emb_file, words, options.max_ext_emb) external_embedding.update(embeddings) else: - print "Warning: %s does not exist, proceeding without" \ - % emb_file + print("Warning: %s does not exist, proceeding without" \ + % emb_file) return external_embedding + # for the most part, we want to send stored options to the parser when in # --predict mode, however we want to allow some of these to be updated # based on the command line options specified by the user at predict time