Make compatible with Python 2 and 3.
kracwarlock authored and Justin Harris committed Jun 26, 2018
1 parent 4d43b65 commit 5908b4c
Showing 19 changed files with 283 additions and 124 deletions.
5 changes: 3 additions & 2 deletions .gitignore
@@ -1,15 +1,16 @@
*.pyc

*.npy

pycocoevalcap/tokenizer/tmp*
delete.py
nlg_eval.egg-info/

bi_skip.npz
bi_skip.npz.pkl
btable.npy
dictionary.txt
uni_skip.npz
uni_skip.npz.pkl
utable.npy

glove.6B.300d.txt
glove.6B.300d.model.bin
24 changes: 15 additions & 9 deletions README.md
@@ -15,20 +15,24 @@ Rows across these files should correspond to the same example.

## Requirements ##
Tested using

- java 1.8.0
- python 2.7
- click 6.3
- nltk 3.1
- numpy 1.11.0
- scikit-learn 0.17
- gensim 0.12.4
- Theano 0.8.1
- scipy 0.17.0
- python 3.6
- click 6.7
- nltk 3.3
- numpy 1.14.5
- scikit-learn 0.19.1
- gensim 3.4.0
- Theano 1.0.2
- scipy 1.1.0
- six>=1.11

Python 2.7 has also been tested, with mostly the same dependencies but an older version of gensim. You can see the version requirements in [requirements_py2.txt](requirements_py2.txt).

## Setup ##

For the initial one-time setup, make sure java 1.8.0 is installed. After that just run:

pip install six

# install the python dependencies
pip install -e .
@@ -116,6 +120,8 @@ gives
VectorExtremaCosineSimilarity: 0.568696
GreedyMatchingScore: 0.784205

Examples of the Python API can be found in [test_nlgeval.py](nlgeval/tests/test_nlgeval.py); a brief usage sketch also follows this file's diff.

## Important Note ##
CIDEr by default (with its idf parameter set to "corpus" mode) computes IDF values using the reference sentences provided. Thus, the
CIDEr score for a reference dataset with only 1 image (or example, for NLG) will be zero. When evaluating using one (or few)
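A minimal usage sketch, not part of this commit: it exercises the Python API using the signatures that appear in nlgeval/__init__.py below. The file paths and example strings are hypothetical.

```python
# Minimal sketch (not from this commit); paths and strings are hypothetical.
from nlgeval import NLGEval, compute_metrics

# File-based API: one hypothesis file plus one or more reference files,
# where line i of every file describes the same example.
metrics = compute_metrics('examples/hyp.txt',
                          ['examples/ref1.txt', 'examples/ref2.txt'])

# Object-based API: load the models once, then score lists of strings.
# The heavy embedding models can be skipped via the constructor flags.
n = NLGEval(no_skipthoughts=True, no_glove=True)
scores = n.compute_metrics(ref_list=[['The cat sat on the mat .']],
                           hyp_list=['A cat is on the mat .'])
print(scores)
```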
2 changes: 1 addition & 1 deletion bin/nlg-eval
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.7
#!/usr/bin/env python
#
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
20 changes: 12 additions & 8 deletions nlgeval/__init__.py
@@ -1,10 +1,14 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
from __future__ import print_function

from six.moves import map

from nlgeval.pycocoevalcap.bleu.bleu import Bleu
from nlgeval.pycocoevalcap.cider.cider import Cider
from nlgeval.pycocoevalcap.meteor.meteor import Meteor
from nlgeval.pycocoevalcap.rouge.rouge import Rouge
from nlgeval.pycocoevalcap.cider.cider import Cider


def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
with open(hypothesis, 'r') as f:
@@ -13,7 +17,7 @@ def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
for iidx, reference in enumerate(references):
with open(reference, 'r') as f:
ref_list.append(f.readlines())
ref_list = [map(str.strip, refs) for refs in zip(*ref_list)]
ref_list = [list(map(str.strip, refs)) for refs in zip(*ref_list)]
refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
assert len(refs) == len(hyps)
@@ -46,7 +50,7 @@ def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
ref_list_T = np.array(ref_list).T.tolist()
vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
cosine_similarity = map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)
cosine_similarity = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
cosine_similarity = np.max(cosine_similarity, axis=0).mean()
print("SkipThoughtsCosineSimilairty: %0.6f" % (cosine_similarity))
ret_scores['SkipThoughtCS'] = cosine_similarity
@@ -107,7 +111,7 @@ def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False):
vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
ref_list_T = np.array(ref_list).T.tolist()
vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
cosine_similarity = map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)
cosine_similarity = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
cosine_similarity = np.max(cosine_similarity, axis=0).mean()
ret_scores['SkipThoughtCS'] = cosine_similarity

@@ -128,7 +132,7 @@ def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False):
return ret_scores


class NLGEval:
class NLGEval(object):
def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False):
self.no_overlap = no_overlap
if not no_overlap:
@@ -191,7 +195,7 @@ def compute_individual_metrics(self, ref, hyp):
vector_hyps = self.skipthought_encoder.encode([h.strip() for h in hyp_list], verbose=False)
ref_list_T = self.np.array(ref_list).T.tolist()
vector_refs = map(lambda refl: self.skipthought_encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
cosine_similarity = map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)
cosine_similarity = list(map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
cosine_similarity = self.np.max(cosine_similarity, axis=0).mean()
ret_scores['SkipThoughtCS'] = cosine_similarity

@@ -209,7 +213,7 @@ def compute_individual_metrics(self, ref, hyp):
return ret_scores

def compute_metrics(self, ref_list, hyp_list):
ref_list = [map(str.strip, refs) for refs in zip(*ref_list)]
ref_list = [list(map(str.strip, refs)) for refs in zip(*ref_list)]
refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
assert len(refs) == len(hyps)
@@ -228,7 +232,7 @@ def compute_metrics(self, ref_list, hyp_list):
vector_hyps = self.skipthought_encoder.encode([h.strip() for h in hyp_list], verbose=False)
ref_list_T = self.np.array(ref_list).T.tolist()
vector_refs = map(lambda refl: self.skipthought_encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
cosine_similarity = map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)
cosine_similarity = list(map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
cosine_similarity = self.np.max(cosine_similarity, axis=0).mean()
ret_scores['SkipThoughtCS'] = cosine_similarity

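A side note, not part of this commit, on the `list(map(...))` pattern that recurs in the hunks above: with `from six.moves import map`, `map` returns a lazy iterator on both Python 2 and 3, so code that indexes the result, takes its length, or iterates it twice would silently misbehave. Wrapping the call in `list(...)` restores the eager semantics the original Python 2 code relied on.

```python
# Illustration (not from this commit) of why map() is wrapped in list() above.
from six.moves import map  # a lazy iterator on both Python 2 and Python 3

squares = map(lambda x: x * x, [1, 2, 3])
print(list(squares))  # [1, 4, 9] -- this pass consumes the iterator
print(list(squares))  # []        -- a second pass sees nothing

# Materializing once restores the old eager-list behavior:
squares = list(map(lambda x: x * x, [1, 2, 3]))
print(squares[0], len(squares))  # indexing and len() work as before
```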
2 changes: 1 addition & 1 deletion nlgeval/pycocoevalcap/bleu/bleu.py
@@ -8,7 +8,7 @@
# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
# Authors : Hao Fang <[email protected]> and Tsung-Yi Lin <[email protected]>

from bleu_scorer import BleuScorer
from .bleu_scorer import BleuScorer


class Bleu:
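For context (not part of the commit): Python 3 dropped implicit relative imports (PEP 328), so inside the package `from bleu_scorer import BleuScorer` no longer resolves to the sibling module; the explicit dotted form works under both versions.

```python
# Illustration (not from this commit): imports inside nlgeval/pycocoevalcap/bleu/.

# Python 2 only -- implicitly searched the containing package first:
# from bleu_scorer import BleuScorer   # ImportError on Python 3

# Explicit relative import, valid on Python 2 and 3:
from .bleu_scorer import BleuScorer
```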
33 changes: 19 additions & 14 deletions nlgeval/pycocoevalcap/bleu/bleu_scorer.py
@@ -20,14 +20,18 @@
import sys, math, re
from collections import defaultdict

import six
from six.moves import xrange as range


def precook(s, n=4, out=False):
"""Takes a string as input and returns an object that can be given to
either cook_refs or cook_test. This is optional: cook_refs and cook_test
can take string arguments as well."""
words = s.split()
counts = defaultdict(int)
for k in xrange(1,n+1):
for i in xrange(len(words)-k+1):
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return (len(words), counts)
@@ -42,7 +46,7 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
for ref in refs:
rl, counts = precook(ref, n)
reflen.append(rl)
for (ngram,count) in counts.iteritems():
for (ngram,count) in six.iteritems(counts):
maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

# Calculate effective reference sentence length.
@@ -57,10 +61,11 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"

return (reflen, maxcounts)

def cook_test(test, (reflen, refmaxcounts), eff=None, n=4):
def cook_test(test, reflen_refmaxcounts, eff=None, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.'''

reflen, refmaxcounts = reflen_refmaxcounts
testlen, counts = precook(test, n, True)

result = {}
@@ -74,10 +79,10 @@ def cook_test(test, (reflen, refmaxcounts), eff=None, n=4):

result["testlen"] = testlen

result["guess"] = [max(0,testlen-k+1) for k in xrange(1,n+1)]
result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]

result['correct'] = [0]*n
for (ngram, count) in counts.iteritems():
for (ngram, count) in six.iteritems(counts):
result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)

return result
@@ -224,40 +229,40 @@ def compute_score(self, option=None, verbose=0):
self._reflen += reflen

for key in ['guess','correct']:
for k in xrange(n):
for k in range(n):
totalcomps[key][k] += comps[key][k]

# append per image bleu score
bleu = 1.
for k in xrange(n):
for k in range(n):
bleu *= (float(comps['correct'][k]) + tiny) \
/(float(comps['guess'][k]) + small)
bleu_list[k].append(bleu ** (1./(k+1)))
ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in xrange(n):
for k in range(n):
bleu_list[k][-1] *= math.exp(1 - 1/ratio)

if verbose > 1:
print comps, reflen
print(comps, reflen)

totalcomps['reflen'] = self._reflen
totalcomps['testlen'] = self._testlen

bleus = []
bleu = 1.
for k in xrange(n):
for k in range(n):
bleu *= float(totalcomps['correct'][k] + tiny) \
/ (totalcomps['guess'][k] + small)
bleus.append(bleu ** (1./(k+1)))
ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in xrange(n):
for k in range(n):
bleus[k] *= math.exp(1 - 1/ratio)

if verbose > 0:
print totalcomps
print "ratio:", ratio
print(totalcomps)
print("ratio:", ratio)

self._score = bleus
return self._score, bleu_list
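The changes in this file bundle several recurring Python 2/3 patterns; a short illustration follows, not part of the commit, assuming only the `six` package:

```python
# Illustration (not from this commit) of the compatibility patterns used above.
from __future__ import print_function  # print() is a function on both versions

import six
from six.moves import xrange as range  # lazy integer range on Python 2 and 3

counts = {'a': 1, 'b': 2}
for ngram, count in six.iteritems(counts):  # dict.iteritems() is gone in Python 3
    print(ngram, count)

for k in range(1, 5):  # maps to xrange on Python 2, range on Python 3
    pass

# PEP 3113: tuple parameters such as `def cook_test(test, (reflen, refmaxcounts)):`
# are a SyntaxError in Python 3, so the pair is passed whole and unpacked in the body.
def cook_test(test, reflen_refmaxcounts):
    reflen, refmaxcounts = reflen_refmaxcounts
    return reflen, refmaxcounts  # stand-in body for illustration
```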
4 changes: 2 additions & 2 deletions nlgeval/pycocoevalcap/cider/cider.py
@@ -7,7 +7,7 @@
#
# Authors: Ramakrishna Vedantam <[email protected]> and Tsung-Yi Lin <[email protected]>

from cider_scorer import CiderScorer
from .cider_scorer import CiderScorer
import pdb

class Cider:
@@ -51,4 +51,4 @@ def compute_score(self, gts, res):
return score, scores

def method(self):
return "CIDEr"
return "CIDEr"
16 changes: 9 additions & 7 deletions nlgeval/pycocoevalcap/cider/cider_scorer.py
@@ -3,10 +3,12 @@
# Ramakrishna Vedantam <[email protected]>

import copy
import math
from collections import defaultdict

import numpy as np
import pdb
import math
from six.moves import xrange as range
import six

def precook(s, n=4, out=False):
"""
@@ -19,8 +21,8 @@
"""
words = s.split()
counts = defaultdict(int)
for k in xrange(1,n+1):
for i in xrange(len(words)-k+1):
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return counts
@@ -99,7 +101,7 @@ def compute_doc_freq(self):
'''
for refs in self.crefs:
# refs, k ref captions of one image
for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
for ngram in set([ngram for ref in refs for (ngram,count) in six.iteritems(ref)]):
self.document_frequency[ngram] += 1
# maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

@@ -115,7 +117,7 @@ def counts2vec(cnts):
vec = [defaultdict(float) for _ in range(self.n)]
length = 0
norm = [0.0 for _ in range(self.n)]
for (ngram,term_freq) in cnts.iteritems():
for (ngram,term_freq) in six.iteritems(cnts):
# give word count 1 if it doesn't appear in reference corpus
df = np.log(max(1.0, self.document_frequency[ngram]))
# ngram index
@@ -146,7 +148,7 @@ def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
val = np.array([0.0 for _ in range(self.n)])
for n in range(self.n):
# ngram
for (ngram,count) in vec_hyp[n].iteritems():
for (ngram,count) in six.iteritems(vec_hyp[n]):
# vrama91 : added clipping
val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
