diff --git a/README.md b/README.md
index 4eea6b8..ef92be8 100644
--- a/README.md
+++ b/README.md
@@ -47,13 +47,13 @@ where each line in the hypothesis file is a generated sentence and the correspon
 lines across the reference files are ground truth reference sentences for the
 corresponding hypothesis.

-### Within a script: for the entire corpus ###
+### Functional API: for the entire corpus ###

     from nlgeval import compute_metrics
     metrics_dict = compute_metrics(hypothesis='examples/hyp.txt',
                                    references=['examples/ref1.txt', 'examples/ref2.txt'])

-### Within a script: for only one sentence ###
+### Functional API: for only one sentence ###

     from nlgeval import compute_individual_metrics
     metrics_dict = compute_individual_metrics(references, hypothesis)
@@ -61,6 +61,15 @@ corresponding hypothesis.
 where `references` is a list of ground truth reference text strings and
 `hypothesis` is the hypothesis text string.

+### Object-oriented API: for repeated calls in a script ###
+
+    from nlgeval import NLGEval
+    nlgeval = NLGEval()  # loads the models
+    metrics_dict = nlgeval.evaluate(references, hypothesis)
+
+where `references` is a list of ground truth reference text strings and
+`hypothesis` is the hypothesis text string.
+
 ## Reference ##

 If you use this code as part of any published research, please cite the following paper:
diff --git a/nlgeval/__init__.py b/nlgeval/__init__.py
index 503e22d..f329b5e 100644
--- a/nlgeval/__init__.py
+++ b/nlgeval/__init__.py
@@ -126,3 +126,84 @@ def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False
         ret_scores[name] = value

     return ret_scores
+
+
+class NLGEval:
+    def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False):
+        self.no_overlap = no_overlap
+        if not no_overlap:
+            self.load_scorers()
+
+        self.no_skipthoughts = no_skipthoughts
+        if not self.no_skipthoughts:
+            self.load_skipthought_model()
+
+        self.no_glove = no_glove
+        if not self.no_glove:
+            self.load_glove()
+
+    def load_scorers(self):
+        self.scorers = [
+            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
+            (Meteor(), "METEOR"),
+            (Rouge(), "ROUGE_L"),
+            (Cider(), "CIDEr")
+        ]
+
+    def load_skipthought_model(self):
+        from nlgeval.skipthoughts import skipthoughts
+        import numpy as np
+        from sklearn.metrics.pairwise import cosine_similarity
+        self.np = np
+        self.cosine_similarity = cosine_similarity
+
+        model = skipthoughts.load_model()
+        self.skipthought_encoder = skipthoughts.Encoder(model)
+
+    def load_glove(self):
+        from nlgeval.word2vec.evaluate import Embedding
+        from nlgeval.word2vec.evaluate import eval_emb_metrics
+        import numpy as np
+        self.eval_emb_metrics = eval_emb_metrics
+        self.np = np
+        self.glove_emb = Embedding()
+
+    def evaluate(self, ref, hyp):
+        assert isinstance(hyp, str)
+        ref = [a.strip() for a in ref]
+        refs = {0: ref}
+        ref_list = [ref]
+
+        hyps = {0: [hyp.strip()]}
+        hyp_list = [hyp]
+
+        ret_scores = {}
+        if not self.no_overlap:
+            for scorer, method in self.scorers:
+                score, scores = scorer.compute_score(refs, hyps)
+                if isinstance(method, list):
+                    for sc, scs, m in zip(score, scores, method):
+                        ret_scores[m] = sc
+                else:
+                    ret_scores[method] = score
+
+        if not self.no_skipthoughts:
+            vector_hyps = self.skipthought_encoder.encode([h.strip() for h in hyp_list], verbose=False)
+            ref_list_T = self.np.array(ref_list).T.tolist()
+            vector_refs = map(lambda refl: self.skipthought_encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
+            cosine_similarity = list(map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
+            cosine_similarity = self.np.max(cosine_similarity, axis=0).mean()
+            ret_scores['SkipThoughtCS'] = cosine_similarity
+
+        if not self.no_glove:
+            glove_hyps = [h.strip() for h in hyp_list]
+            ref_list_T = self.np.array(ref_list).T.tolist()
+            glove_refs = list(map(lambda refl: [r.strip() for r in refl], ref_list_T))
+            scores = self.eval_emb_metrics(glove_hyps, glove_refs, emb=self.glove_emb)
+            scores = scores.split('\n')
+            for score in scores:
+                name, value = score.split(':')
+                value = float(value.strip())
+                ret_scores[name] = value
+
+        return ret_scores
diff --git a/nlgeval/word2vec/evaluate.py b/nlgeval/word2vec/evaluate.py
index cdf8e5a..e7ec00f 100644
--- a/nlgeval/word2vec/evaluate.py
+++ b/nlgeval/word2vec/evaluate.py
@@ -28,11 +28,12 @@ def vec(self, key):
         return self.unk


-def eval_emb_metrics(hypothesis, references):
+def eval_emb_metrics(hypothesis, references, emb=None):
     from sklearn.metrics.pairwise import cosine_similarity
     from nltk.tokenize import word_tokenize
     import numpy as np
-    emb = Embedding()
+    if emb is None:
+        emb = Embedding()

     emb_hyps = []
     avg_emb_hyps = []
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/api.py b/test/api.py
new file mode 100644
index 0000000..5bbe6f0
--- /dev/null
+++ b/test/api.py
@@ -0,0 +1,13 @@
+from nlgeval import NLGEval
+
+def test_oo_api():
+    with open("examples/hyp.txt") as f:
+        hyp = f.readlines()
+    with open("examples/ref1.txt") as f:
+        ref1 = f.readlines()
+    with open("examples/ref2.txt") as f:
+        ref2 = f.readlines()
+
+    nlge = NLGEval()
+    res = nlge.evaluate([ref1[0], ref2[0]], hyp[0])
+    res = nlge.evaluate([ref1[1], ref2[1]], hyp[1])
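
Reviewer note: below is a minimal, hypothetical driver script sketching how the new `NLGEval` class is intended to be used, assuming the `examples/hyp.txt`, `examples/ref1.txt`, and `examples/ref2.txt` files shipped with the repository. The point of the object-oriented API is that the scorers, skip-thought encoder, and GloVe embeddings are loaded once in `__init__` and reused across `evaluate` calls, instead of being reloaded for every sentence.

    from nlgeval import NLGEval

    # Load the overlap scorers and the skip-thought/GloVe models once up
    # front; this is the expensive step the class exists to amortize.
    nlgeval = NLGEval()

    with open('examples/hyp.txt') as f:
        hyps = [line.strip() for line in f]
    with open('examples/ref1.txt') as f:
        refs1 = [line.strip() for line in f]
    with open('examples/ref2.txt') as f:
        refs2 = [line.strip() for line in f]

    # Repeated calls reuse the models already held on the instance.
    for hyp, r1, r2 in zip(hyps, refs1, refs2):
        metrics_dict = nlgeval.evaluate([r1, r2], hyp)
        print(metrics_dict['Bleu_1'], metrics_dict['SkipThoughtCS'])

As with `compute_individual_metrics`, the `no_overlap`, `no_skipthoughts`, and `no_glove` constructor flags skip loading (and computing) the corresponding metric families.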