37 changes: 20 additions & 17 deletions README.md
@@ -3,24 +3,28 @@ Microsoft COCO Caption Evaluation

Evaluation codes for MS COCO caption generation.

## Requirements ##
- java 1.8.0
- python 2.7
## Description ##
This repository provides Python 3 support for the caption evaluation metrics used for the MS COCO dataset.

## Files ##
./
- cocoEvalCapDemo.py (demo script)
The code is derived from the original repository that supports Python 2.7: https://github.com/tylin/coco-caption.
Caption evaluation depends on the COCO API that natively supports Python 3.

./annotation
- captions_val2014.json (MS COCO 2014 caption validation set)
- Visit MS COCO [download](http://mscoco.org/dataset/#download) page for more details.
## Requirements ##
- Java 1.8.0
- Python 3

./results
- captions_val2014_fakecap_results.json (an example of fake results for running demo)
- Visit MS COCO [format](http://mscoco.org/dataset/#format) page for more details.
## Installation ##
To install pycocoevalcap and the pycocotools dependency (https://github.com/cocodataset/cocoapi), run:
```
pip install pycocoevalcap
```

./pycocoevalcap: The folder where all evaluation codes are stored.
- evals.py: The file includes COCOEavlCap class that can be used to evaluate results on COCO.
## Usage ##
See the example script: [example/coco_eval_example.py](example/coco_eval_example.py)

## Files ##
./
- eval.py: The file includes the COCOEvalCap class that can be used to evaluate results on COCO.
- tokenizer: Python wrapper of Stanford CoreNLP PTBTokenizer
- bleu: Bleu evaluation codes
- meteor: Meteor evaluation codes
@@ -30,9 +34,8 @@ Evaluation codes for MS COCO caption generation.

## Setup ##

- You will first need to download the [Stanford CoreNLP 3.6.0](http://stanfordnlp.github.io/CoreNLP/index.html) code and models for use by SPICE. To do this, run:
./get_stanford_models.sh
- Note: SPICE will try to create a cache of parsed sentences in ./pycocoevalcap/spice/cache/. This dramatically speeds up repeated evaluations. The cache directory can be moved by setting 'CACHE_DIR' in ./pycocoevalcap/spice. In the same file, caching can be turned off by removing the '-cache' argument to 'spice_cmd'.
- SPICE requires the download of [Stanford CoreNLP 3.6.0](http://stanfordnlp.github.io/CoreNLP/index.html) code and models. This will be done automatically the first time the SPICE evaluation is performed.
- Note: SPICE will try to create a cache of parsed sentences in ./spice/cache/. This dramatically speeds up repeated evaluations. The cache directory can be moved by setting 'CACHE_DIR' in ./spice. In the same file, caching can be turned off by removing the '-cache' argument to 'spice_cmd'.

## References ##

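The new Usage section points to example/coco_eval_example.py. For orientation, here is a minimal sketch of what such a driver typically looks like with the packaged module; the import paths, file names, and COCOEvalCap attributes shown are assumptions based on the README and the upstream coco-caption API, not taken from this diff.

```
# Minimal sketch of a Python 3 evaluation run (assumed API; paths are placeholders).
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

annotation_file = "annotations/captions_val2014.json"           # ground-truth captions
results_file = "results/captions_val2014_fakecap_results.json"  # generated captions

coco = COCO(annotation_file)              # load ground truth
coco_result = coco.loadRes(results_file)  # load generated captions as a COCO result set

coco_eval = COCOEvalCap(coco, coco_result)
coco_eval.params["image_id"] = coco_result.getImgIds()  # score only images that have results
coco_eval.evaluate()                                    # runs BLEU, METEOR, ROUGE-L, CIDEr, SPICE

for metric, score in coco_eval.eval.items():
    print(f"{metric}: {score:.3f}")
```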
File renamed without changes.
8 changes: 4 additions & 4 deletions pycocoevalcap/bleu/bleu.py → bleu/bleu.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python
#
#
# File Name : bleu.py
#
# Description : Wrapper for BLEU scorer.
@@ -8,7 +8,7 @@
# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
# Authors : Hao Fang <[email protected]> and Tsung-Yi Lin <[email protected]>

from bleu_scorer import BleuScorer
from .bleu_scorer import BleuScorer


class Bleu:
@@ -18,7 +18,7 @@ def __init__(self, n=4):
self._hypo_for_image = {}
self.ref_for_image = {}

def compute_score(self, gts, res):
def compute_score(self, gts, res, verbose=1):

assert(gts.keys() == res.keys())
imgIds = gts.keys()
@@ -37,7 +37,7 @@ def compute_score(self, gts, res):
bleu_scorer += (hypo[0], ref)

#score, scores = bleu_scorer.compute_score(option='shortest')
score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
score, scores = bleu_scorer.compute_score(option='closest', verbose=verbose)
#score, scores = bleu_scorer.compute_score(option='average', verbose=1)

# return (bleu, bleu_info)
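The only behavioral change in bleu.py is the new verbose argument on compute_score, which is forwarded to the underlying BleuScorer. A hedged sketch of the call pattern follows; the import path and captions are illustrative, and inputs are expected to be pre-tokenized, e.g. by the PTBTokenizer.

```
# Illustrative call pattern for the Bleu wrapper (a sketch, not part of this diff).
from pycocoevalcap.bleu.bleu import Bleu  # assumed installed-package path

# image_id -> list of tokenized captions; gts holds references, res one hypothesis each
gts = {1: ["a train traveling down tracks next to lights",
           "a train on the tracks at night"],
       2: ["a large jetliner flying over a traffic filled street"]}
res = {1: ["a train traveling down a track in front of a road"],
       2: ["a plane flying in the sky"]}

score, scores = Bleu(n=4).compute_score(gts, res, verbose=0)  # verbose=0 silences the summary printout
print(score)   # [BLEU-1, BLEU-2, BLEU-3, BLEU-4] corpus-level scores
print(scores)  # per-image score lists, one list per n-gram order
```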
53 changes: 27 additions & 26 deletions pycocoevalcap/bleu/bleu_scorer.py → bleu/bleu_scorer.py
@@ -7,7 +7,7 @@
# reserved. Do not redistribute without permission from the
# author. Not for commercial use.

# Modified by:
# Modified by:
# Hao Fang <[email protected]>
# Tsung-Yi Lin <[email protected]>

@@ -26,8 +26,8 @@ def precook(s, n=4, out=False):
can take string arguments as well."""
words = s.split()
counts = defaultdict(int)
for k in xrange(1,n+1):
for i in xrange(len(words)-k+1):
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return (len(words), counts)
@@ -42,7 +42,7 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
for ref in refs:
rl, counts = precook(ref, n)
reflen.append(rl)
for (ngram,count) in counts.iteritems():
for (ngram,count) in counts.items():
maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

# Calculate effective reference sentence length.
@@ -52,32 +52,33 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
reflen = float(sum(reflen))/len(reflen)

## lhuang: N.B.: leave reflen computaiton to the very end!!

## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)

return (reflen, maxcounts)

def cook_test(test, (reflen, refmaxcounts), eff=None, n=4):
def cook_test(test, refs, eff=None, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.'''

reflen, refmaxcounts = refs
testlen, counts = precook(test, n, True)

result = {}

# Calculate effective reference sentence length.

if eff == "closest":
result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
else: ## i.e., "average" or "shortest" or None
result["reflen"] = reflen

result["testlen"] = testlen

result["guess"] = [max(0,testlen-k+1) for k in xrange(1,n+1)]
result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]

result['correct'] = [0]*n
for (ngram, count) in counts.iteritems():
for (ngram, count) in counts.items():
result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)

return result
@@ -108,7 +109,7 @@ def __init__(self, test=None, refs=None, n=4, special_reflen=None):

def cook_append(self, test, refs):
'''called by constructor and __iadd__ to avoid creating new instances.'''

if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
@@ -136,7 +137,7 @@ def reflen(self, option=None):

def testlen(self, option=None):
self.compute_score(option=option)
return self._testlen
return self._testlen

def retest(self, new_test):
if type(new_test) is str:
@@ -151,7 +152,7 @@ def retest(self, new_test):

def rescore(self, new_test):
''' replace test(s) with new test(s), and returns the new score.'''

return self.retest(new_test).compute_score()

def size(self):
@@ -170,7 +171,7 @@ def __iadd__(self, other):
self.crefs.extend(other.crefs)
self._score = None ## need to recompute

return self
return self

def compatible(self, other):
return isinstance(other, BleuScorer) and self.n == other.n
@@ -179,7 +180,7 @@ def single_reflen(self, option="average"):
return self._single_reflen(self.crefs[0][0], option)

def _single_reflen(self, reflens, option=None, testlen=None):

if option == "shortest":
reflen = min(reflens)
elif option == "average":
@@ -194,7 +195,7 @@ def _single_reflen(self, reflens, option=None, testlen=None):
def recompute_score(self, option=None, verbose=0):
self._score = None
return self.compute_score(option, verbose)

def compute_score(self, option=None, verbose=0):
n = self.n
small = 1e-9
@@ -212,7 +213,7 @@ def compute_score(self, option=None, verbose=0):
totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}

# for each sentence
for comps in self.ctest:
for comps in self.ctest:
testlen = comps['testlen']
self._testlen += testlen

@@ -222,42 +223,42 @@ def compute_score(self, option=None, verbose=0):
reflen = self.special_reflen

self._reflen += reflen

for key in ['guess','correct']:
for k in xrange(n):
for k in range(n):
totalcomps[key][k] += comps[key][k]

# append per image bleu score
bleu = 1.
for k in xrange(n):
for k in range(n):
bleu *= (float(comps['correct'][k]) + tiny) \
/(float(comps['guess'][k]) + small)
/(float(comps['guess'][k]) + small)
bleu_list[k].append(bleu ** (1./(k+1)))
ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in xrange(n):
for k in range(n):
bleu_list[k][-1] *= math.exp(1 - 1/ratio)

if verbose > 1:
print comps, reflen
print(comps, reflen)

totalcomps['reflen'] = self._reflen
totalcomps['testlen'] = self._testlen

bleus = []
bleu = 1.
for k in xrange(n):
for k in range(n):
bleu *= float(totalcomps['correct'][k] + tiny) \
/ (totalcomps['guess'][k] + small)
bleus.append(bleu ** (1./(k+1)))
ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in xrange(n):
for k in range(n):
bleus[k] *= math.exp(1 - 1/ratio)

if verbose > 0:
print totalcomps
print "ratio:", ratio
print(totalcomps)
print("ratio:", ratio)

self._score = bleus
return self._score, bleu_list
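Most of bleu_scorer.py above is mechanical 2-to-3 conversion: xrange becomes range, dict.iteritems() becomes dict.items(), print statements become print() calls, and cook_test loses its tuple parameter because Python 3 removed tuple unpacking in function signatures (PEP 3113), so the (reflen, maxcounts) pair from cook_refs is now unpacked inside the body. Below is a small standalone sketch of the n-gram counting that precook performs, written with the Python 3 idioms adopted here; the function name is hypothetical.

```
# Worked example of precook-style n-gram counting in Python 3 (hypothetical standalone form).
from collections import defaultdict

def ngram_counts(sentence, n=4):
    words = sentence.split()
    counts = defaultdict(int)
    for k in range(1, n + 1):                # range() replaces Python 2's xrange()
        for i in range(len(words) - k + 1):
            counts[tuple(words[i:i + k])] += 1
    return len(words), counts

length, counts = ngram_counts("a cat sat on a mat")
print(length)                  # 6
print(counts[("a",)])          # 2 -- the unigram "a" occurs twice
print(counts[("a", "cat")])    # 1
for ngram, count in counts.items():   # .items() replaces Python 2's .iteritems()
    print(ngram, count)
```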
10 changes: 5 additions & 5 deletions pycocoevalcap/cider/cider.py → cider/cider.py
@@ -1,18 +1,18 @@
# Filename: cider.py
#
# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric
# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric
# by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
#
# Creation Date: Sun Feb 8 14:16:54 2015
#
# Authors: Ramakrishna Vedantam <[email protected]> and Tsung-Yi Lin <[email protected]>

from cider_scorer import CiderScorer
from .cider_scorer import CiderScorer
import pdb

class Cider:
"""
Main Class to compute the CIDEr metric
Main Class to compute the CIDEr metric

"""
def __init__(self, test=None, refs=None, n=4, sigma=6.0):
@@ -26,7 +26,7 @@ def compute_score(self, gts, res):
Main function to compute CIDEr score
:param hypo_for_image (dict) : dictionary with key <image> and value <tokenized hypothesis / candidate sentence>
ref_for_image (dict) : dictionary with key <image> and value <tokenized reference sentence>
:return: cider (float) : computed CIDEr score for the corpus
:return: cider (float) : computed CIDEr score for the corpus
"""

assert(gts.keys() == res.keys())
@@ -51,4 +51,4 @@ def compute_score(self, gts, res):
return score, scores

def method(self):
return "CIDEr"
return "CIDEr"
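As in bleu.py, the change in cider.py is only the relative import; the call pattern is unchanged. A sketch of computing CIDEr on tokenized captions follows; the import paths and the PTBTokenizer input format are assumptions, and note that CIDEr's IDF weights are estimated from the reference corpus itself, so scores over only a handful of images are not meaningful.

```
# Illustrative CIDEr call (a sketch; assumes the packaged import paths and Java for the tokenizer).
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.cider.cider import Cider

# PTBTokenizer is assumed to take image_id -> list of {"caption": ...} dicts
gts = {0: [{"caption": "A dog runs across the grass."}],
       1: [{"caption": "Two people ride bicycles down a street."}]}
res = {0: [{"caption": "a dog running on grass"}],
       1: [{"caption": "people riding bikes on a road"}]}

tokenizer = PTBTokenizer()
gts_tok = tokenizer.tokenize(gts)   # image_id -> list of tokenized caption strings
res_tok = tokenizer.tokenize(res)

score, scores = Cider().compute_score(gts_tok, res_tok)
print(score)    # corpus-level CIDEr score
print(scores)   # numpy array of per-image scores
```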
12 changes: 6 additions & 6 deletions pycocoevalcap/cider/cider_scorer.py → cider/cider_scorer.py
@@ -19,8 +19,8 @@ def precook(s, n=4, out=False):
"""
words = s.split()
counts = defaultdict(int)
for k in xrange(1,n+1):
for i in xrange(len(words)-k+1):
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return counts
@@ -99,7 +99,7 @@ def compute_doc_freq(self):
'''
for refs in self.crefs:
# refs, k ref captions of one image
for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]):
self.document_frequency[ngram] += 1
# maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

@@ -115,7 +115,7 @@ def counts2vec(cnts):
vec = [defaultdict(float) for _ in range(self.n)]
length = 0
norm = [0.0 for _ in range(self.n)]
for (ngram,term_freq) in cnts.iteritems():
for (ngram,term_freq) in cnts.items():
# give word count 1 if it doesn't appear in reference corpus
df = np.log(max(1.0, self.document_frequency[ngram]))
# ngram index
@@ -146,7 +146,7 @@ def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
val = np.array([0.0 for _ in range(self.n)])
for n in range(self.n):
# ngram
for (ngram,count) in vec_hyp[n].iteritems():
for (ngram,count) in vec_hyp[n].items():
# vrama91 : added clipping
val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]

@@ -189,4 +189,4 @@ def compute_score(self, option=None, verbose=0):
score = self.compute_cider()
# debug
# print score
return np.mean(np.array(score)), np.array(score)
return np.mean(np.array(score)), np.array(score)
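The counts2vec helper shown above turns raw n-gram counts into one TF-IDF vector per n-gram order, together with its L2 norm, using the corpus document frequencies. A condensed, hypothetical standalone version of that step, using the .items() idiom adopted in this diff:

```
# Condensed sketch of the TF-IDF weighting inside counts2vec (names are hypothetical).
from collections import defaultdict
import numpy as np

def tfidf_vectors(ngram_counts, document_frequency, num_ref_images, n=4):
    """Map raw n-gram counts to per-order TF-IDF vectors plus their L2 norms."""
    vec = [defaultdict(float) for _ in range(n)]
    norm = [0.0] * n
    log_num_refs = np.log(float(num_ref_images))
    for ngram, term_freq in ngram_counts.items():                   # .items() replaces .iteritems()
        df = np.log(max(1.0, document_frequency.get(ngram, 0.0)))   # unseen n-grams get df = 0
        order = len(ngram) - 1                                      # vectors are indexed by n-gram length
        vec[order][ngram] = float(term_freq) * (log_num_refs - df)  # tf * idf
        norm[order] += vec[order][ngram] ** 2
    return vec, [np.sqrt(x) for x in norm]
```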