Make compatible with Python 2 and 3.
kracwarlock authored and Justin Harris committed Jun 26, 2018
1 parent 4d43b65 commit 5908b4c
Showing 19 changed files with 283 additions and 124 deletions.
5 changes: 3 additions & 2 deletions .gitignore
@@ -1,15 +1,16 @@
*.pyc

*.npy

pycocoevalcap/tokenizer/tmp*
delete.py
nlg_eval.egg-info/

bi_skip.npz
bi_skip.npz.pkl
btable.npy
dictionary.txt
uni_skip.npz
uni_skip.npz.pkl
utable.npy

glove.6B.300d.txt
glove.6B.300d.model.bin
24 changes: 15 additions & 9 deletions README.md
@@ -15,20 +15,24 @@ Rows across these files should correspond to the same example.

## Requirements ##
Tested using

- java 1.8.0
- python 2.7
- click 6.3
- nltk 3.1
- numpy 1.11.0
- scikit-learn 0.17
- gensim 0.12.4
- Theano 0.8.1
- scipy 0.17.0
- python 3.6
- click 6.7
- nltk 3.3
- numpy 1.14.5
- scikit-learn 0.19.1
- gensim 3.4.0
- Theano 1.0.2
- scipy 1.1.0
- six>=1.11

Python 2.7 has also been tested, with mostly the same dependencies but an older version of gensim. You can see the version requirements in [requirements_py2.txt](requirements_py2.txt).

## Setup ##

For the initial one-time setup, make sure java 1.8.0 is installed. After that just run:

pip install six

# install the python dependencies
pip install -e .
@@ -116,6 +120,8 @@ gives
VectorExtremaCosineSimilarity: 0.568696
GreedyMatchingScore: 0.784205

Examples of the Python API can be found in [test_nlgeval.py](nlgeval/tests/test_nlgeval.py); a brief usage sketch also follows this file's diff.

## Important Note ##
CIDEr by default (with its idf parameter set to "corpus" mode) computes IDF values using the reference sentences provided. Thus, the
CIDEr score for a reference dataset with only 1 image (or example, for NLG) will be zero. When evaluating using one (or few)
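A minimal usage sketch, not part of this commit: it exercises the Python API using the signatures that appear in nlgeval/__init__.py below. The file paths and example strings are hypothetical.

```python
# Minimal sketch (not from this commit); paths and strings are hypothetical.
from nlgeval import NLGEval, compute_metrics

# File-based API: one hypothesis file plus one or more reference files,
# where line i of every file describes the same example.
metrics = compute_metrics('examples/hyp.txt',
                          ['examples/ref1.txt', 'examples/ref2.txt'])

# Object-based API: load the models once, then score lists of strings.
# The heavy embedding models can be skipped via the constructor flags.
n = NLGEval(no_skipthoughts=True, no_glove=True)
scores = n.compute_metrics(ref_list=[['The cat sat on the mat .']],
                           hyp_list=['A cat is on the mat .'])
print(scores)
```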
2 changes: 1 addition & 1 deletion bin/nlg-eval
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.7
#!/usr/bin/env python
#
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
20 changes: 12 additions & 8 deletions nlgeval/__init__.py
@@ -1,10 +1,14 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
from __future__ import print_function

from six.moves import map

from nlgeval.pycocoevalcap.bleu.bleu import Bleu
from nlgeval.pycocoevalcap.cider.cider import Cider
from nlgeval.pycocoevalcap.meteor.meteor import Meteor
from nlgeval.pycocoevalcap.rouge.rouge import Rouge
from nlgeval.pycocoevalcap.cider.cider import Cider


def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
with open(hypothesis, 'r') as f:
@@ -13,7 +17,7 @@ def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
for iidx, reference in enumerate(references):
with open(reference, 'r') as f:
ref_list.append(f.readlines())
ref_list = [map(str.strip, refs) for refs in zip(*ref_list)]
ref_list = [list(map(str.strip, refs)) for refs in zip(*ref_list)]
refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
assert len(refs) == len(hyps)
@@ -46,7 +50,7 @@ def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
ref_list_T = np.array(ref_list).T.tolist()
vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
cosine_similarity = map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)
cosine_similarity = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
cosine_similarity = np.max(cosine_similarity, axis=0).mean()
print("SkipThoughtsCosineSimilairty: %0.6f" % (cosine_similarity))
ret_scores['SkipThoughtCS'] = cosine_similarity
@@ -107,7 +111,7 @@ def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False):
vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
ref_list_T = np.array(ref_list).T.tolist()
vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
cosine_similarity = map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)
cosine_similarity = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
cosine_similarity = np.max(cosine_similarity, axis=0).mean()
ret_scores['SkipThoughtCS'] = cosine_similarity

@@ -128,7 +132,7 @@ def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False):
return ret_scores


class NLGEval:
class NLGEval(object):
def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False):
self.no_overlap = no_overlap
if not no_overlap:
@@ -191,7 +195,7 @@ def compute_individual_metrics(self, ref, hyp):
vector_hyps = self.skipthought_encoder.encode([h.strip() for h in hyp_list], verbose=False)
ref_list_T = self.np.array(ref_list).T.tolist()
vector_refs = map(lambda refl: self.skipthought_encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
cosine_similarity = map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)
cosine_similarity = list(map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
cosine_similarity = self.np.max(cosine_similarity, axis=0).mean()
ret_scores['SkipThoughtCS'] = cosine_similarity

@@ -209,7 +213,7 @@ def compute_individual_metrics(self, ref, hyp):
return ret_scores

def compute_metrics(self, ref_list, hyp_list):
ref_list = [map(str.strip, refs) for refs in zip(*ref_list)]
ref_list = [list(map(str.strip, refs)) for refs in zip(*ref_list)]
refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
assert len(refs) == len(hyps)
@@ -228,7 +232,7 @@ def compute_metrics(self, ref_list, hyp_list):
vector_hyps = self.skipthought_encoder.encode([h.strip() for h in hyp_list], verbose=False)
ref_list_T = self.np.array(ref_list).T.tolist()
vector_refs = map(lambda refl: self.skipthought_encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
cosine_similarity = map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)
cosine_similarity = list(map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
cosine_similarity = self.np.max(cosine_similarity, axis=0).mean()
ret_scores['SkipThoughtCS'] = cosine_similarity

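A side note, not part of this commit, on the `list(map(...))` pattern that recurs in the hunks above: with `from six.moves import map`, `map` returns a lazy iterator on both Python 2 and 3, so code that indexes the result, takes its length, or iterates it twice would silently misbehave. Wrapping the call in `list(...)` restores the eager semantics the original Python 2 code relied on.

```python
# Illustration (not from this commit) of why map() is wrapped in list() above.
from six.moves import map  # a lazy iterator on both Python 2 and Python 3

squares = map(lambda x: x * x, [1, 2, 3])
print(list(squares))  # [1, 4, 9] -- this pass consumes the iterator
print(list(squares))  # []        -- a second pass sees nothing

# Materializing once restores the old eager-list behavior:
squares = list(map(lambda x: x * x, [1, 2, 3]))
print(squares[0], len(squares))  # indexing and len() work as before
```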
2 changes: 1 addition & 1 deletion nlgeval/pycocoevalcap/bleu/bleu.py
@@ -8,7 +8,7 @@
# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
# Authors : Hao Fang <[email protected]> and Tsung-Yi Lin <[email protected]>

from bleu_scorer import BleuScorer
from .bleu_scorer import BleuScorer


class Bleu:
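For context (not part of the commit): Python 3 dropped implicit relative imports (PEP 328), so inside the package `from bleu_scorer import BleuScorer` no longer resolves to the sibling module; the explicit dotted form works under both versions.

```python
# Illustration (not from this commit): imports inside nlgeval/pycocoevalcap/bleu/.

# Python 2 only -- implicitly searched the containing package first:
# from bleu_scorer import BleuScorer   # ImportError on Python 3

# Explicit relative import, valid on Python 2 and 3:
from .bleu_scorer import BleuScorer
```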
33 changes: 19 additions & 14 deletions nlgeval/pycocoevalcap/bleu/bleu_scorer.py
@@ -20,14 +20,18 @@
import sys, math, re
from collections import defaultdict

import six
from six.moves import xrange as range


def precook(s, n=4, out=False):
"""Takes a string as input and returns an object that can be given to
either cook_refs or cook_test. This is optional: cook_refs and cook_test
can take string arguments as well."""
words = s.split()
counts = defaultdict(int)
for k in xrange(1,n+1):
for i in xrange(len(words)-k+1):
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return (len(words), counts)
@@ -42,7 +46,7 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
for ref in refs:
rl, counts = precook(ref, n)
reflen.append(rl)
for (ngram,count) in counts.iteritems():
for (ngram,count) in six.iteritems(counts):
maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

# Calculate effective reference sentence length.
@@ -57,10 +61,11 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"

return (reflen, maxcounts)

def cook_test(test, (reflen, refmaxcounts), eff=None, n=4):
def cook_test(test, reflen_refmaxcounts, eff=None, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.'''

reflen, refmaxcounts = reflen_refmaxcounts
testlen, counts = precook(test, n, True)

result = {}
@@ -74,10 +79,10 @@ def cook_test(test, (reflen, refmaxcounts), eff=None, n=4):

result["testlen"] = testlen

result["guess"] = [max(0,testlen-k+1) for k in xrange(1,n+1)]
result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]

result['correct'] = [0]*n
for (ngram, count) in counts.iteritems():
for (ngram, count) in six.iteritems(counts):
result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)

return result
@@ -224,40 +229,40 @@ def compute_score(self, option=None, verbose=0):
self._reflen += reflen

for key in ['guess','correct']:
for k in xrange(n):
for k in range(n):
totalcomps[key][k] += comps[key][k]

# append per image bleu score
bleu = 1.
for k in xrange(n):
for k in range(n):
bleu *= (float(comps['correct'][k]) + tiny) \
/(float(comps['guess'][k]) + small)
bleu_list[k].append(bleu ** (1./(k+1)))
ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in xrange(n):
for k in range(n):
bleu_list[k][-1] *= math.exp(1 - 1/ratio)

if verbose > 1:
print comps, reflen
print(comps, reflen)

totalcomps['reflen'] = self._reflen
totalcomps['testlen'] = self._testlen

bleus = []
bleu = 1.
for k in xrange(n):
for k in range(n):
bleu *= float(totalcomps['correct'][k] + tiny) \
/ (totalcomps['guess'][k] + small)
bleus.append(bleu ** (1./(k+1)))
ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in xrange(n):
for k in range(n):
bleus[k] *= math.exp(1 - 1/ratio)

if verbose > 0:
print totalcomps
print "ratio:", ratio
print(totalcomps)
print("ratio:", ratio)

self._score = bleus
return self._score, bleu_list
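The changes in this file bundle several recurring Python 2/3 patterns; a short illustration follows, not part of the commit, assuming only the `six` package:

```python
# Illustration (not from this commit) of the compatibility patterns used above.
from __future__ import print_function  # print() is a function on both versions

import six
from six.moves import xrange as range  # lazy integer range on Python 2 and 3

counts = {'a': 1, 'b': 2}
for ngram, count in six.iteritems(counts):  # dict.iteritems() is gone in Python 3
    print(ngram, count)

for k in range(1, 5):  # maps to xrange on Python 2, range on Python 3
    pass

# PEP 3113: tuple parameters such as `def cook_test(test, (reflen, refmaxcounts)):`
# are a SyntaxError in Python 3, so the pair is passed whole and unpacked in the body.
def cook_test(test, reflen_refmaxcounts):
    reflen, refmaxcounts = reflen_refmaxcounts
    return reflen, refmaxcounts  # stand-in body for illustration
```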
4 changes: 2 additions & 2 deletions nlgeval/pycocoevalcap/cider/cider.py
@@ -7,7 +7,7 @@
#
# Authors: Ramakrishna Vedantam <[email protected]> and Tsung-Yi Lin <[email protected]>

from cider_scorer import CiderScorer
from .cider_scorer import CiderScorer
import pdb

class Cider:
@@ -51,4 +51,4 @@ def compute_score(self, gts, res):
return score, scores

def method(self):
return "CIDEr"
return "CIDEr"
16 changes: 9 additions & 7 deletions nlgeval/pycocoevalcap/cider/cider_scorer.py
@@ -3,10 +3,12 @@
# Ramakrishna Vedantam <[email protected]>

import copy
import math
from collections import defaultdict

import numpy as np
import pdb
import math
from six.moves import xrange as range
import six

def precook(s, n=4, out=False):
"""
@@ -19,8 +21,8 @@
"""
words = s.split()
counts = defaultdict(int)
for k in xrange(1,n+1):
for i in xrange(len(words)-k+1):
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return counts
@@ -99,7 +101,7 @@ def compute_doc_freq(self):
'''
for refs in self.crefs:
# refs, k ref captions of one image
for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
for ngram in set([ngram for ref in refs for (ngram,count) in six.iteritems(ref)]):
self.document_frequency[ngram] += 1
# maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

@@ -115,7 +117,7 @@ def counts2vec(cnts):
vec = [defaultdict(float) for _ in range(self.n)]
length = 0
norm = [0.0 for _ in range(self.n)]
for (ngram,term_freq) in cnts.iteritems():
for (ngram,term_freq) in six.iteritems(cnts):
# give word count 1 if it doesn't appear in reference corpus
df = np.log(max(1.0, self.document_frequency[ngram]))
# ngram index
@@ -146,7 +148,7 @@ def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
val = np.array([0.0 for _ in range(self.n)])
for n in range(self.n):
# ngram
for (ngram,count) in vec_hyp[n].iteritems():
for (ngram,count) in six.iteritems(vec_hyp[n]):
# vrama91 : added clipping
val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
