37 changes: 20 additions & 17 deletions README.md
@@ -3,24 +3,28 @@ Microsoft COCO Caption Evaluation

Evaluation codes for MS COCO caption generation.

## Requirements ##
- java 1.8.0
- python 2.7
## Description ##
This repository provides Python 3 support for the caption evaluation metrics used for the MS COCO dataset.

## Files ##
./
- cocoEvalCapDemo.py (demo script)
The code is derived from the original repository that supports Python 2.7: https://github.com/tylin/coco-caption.
Caption evaluation depends on the COCO API that natively supports Python 3.

./annotation
- captions_val2014.json (MS COCO 2014 caption validation set)
- Visit MS COCO [download](http://mscoco.org/dataset/#download) page for more details.
## Requirements ##
- Java 1.8.0
- Python 3

./results
- captions_val2014_fakecap_results.json (an example of fake results for running demo)
- Visit MS COCO [format](http://mscoco.org/dataset/#format) page for more details.
## Installation ##
To install pycocoevalcap and the pycocotools dependency (https://github.com/cocodataset/cocoapi), run:
```
pip install pycocoevalcap
```

./pycocoevalcap: The folder where all evaluation codes are stored.
- evals.py: The file includes COCOEavlCap class that can be used to evaluate results on COCO.
## Usage ##
See the example script: [example/coco_eval_example.py](example/coco_eval_example.py)

## Files ##
./
- eval.py: The file includes the COCOEvalCap class that can be used to evaluate results on COCO.
- tokenizer: Python wrapper of Stanford CoreNLP PTBTokenizer
- bleu: Bleu evaluation codes
- meteor: Meteor evaluation codes
@@ -30,9 +34,8 @@ Evaluation codes for MS COCO caption generation.

## Setup ##

- You will first need to download the [Stanford CoreNLP 3.6.0](http://stanfordnlp.github.io/CoreNLP/index.html) code and models for use by SPICE. To do this, run:
./get_stanford_models.sh
- Note: SPICE will try to create a cache of parsed sentences in ./pycocoevalcap/spice/cache/. This dramatically speeds up repeated evaluations. The cache directory can be moved by setting 'CACHE_DIR' in ./pycocoevalcap/spice. In the same file, caching can be turned off by removing the '-cache' argument to 'spice_cmd'.
- SPICE requires the download of [Stanford CoreNLP 3.6.0](http://stanfordnlp.github.io/CoreNLP/index.html) code and models. This will be done automatically the first time the SPICE evaluation is performed.
- Note: SPICE will try to create a cache of parsed sentences in ./spice/cache/. This dramatically speeds up repeated evaluations. The cache directory can be moved by setting 'CACHE_DIR' in ./spice. In the same file, caching can be turned off by removing the '-cache' argument to 'spice_cmd'.

## References ##

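The new Usage section points to example/coco_eval_example.py. For orientation, here is a minimal sketch of what such a driver typically looks like with the packaged module; the import paths, file names, and COCOEvalCap attributes shown are assumptions based on the README and the upstream coco-caption API, not taken from this diff.

```
# Minimal sketch of a Python 3 evaluation run (assumed API; paths are placeholders).
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

annotation_file = "annotations/captions_val2014.json"           # ground-truth captions
results_file = "results/captions_val2014_fakecap_results.json"  # generated captions

coco = COCO(annotation_file)              # load ground truth
coco_result = coco.loadRes(results_file)  # load generated captions as a COCO result set

coco_eval = COCOEvalCap(coco, coco_result)
coco_eval.params["image_id"] = coco_result.getImgIds()  # score only images that have results
coco_eval.evaluate()                                    # runs BLEU, METEOR, ROUGE-L, CIDEr, SPICE

for metric, score in coco_eval.eval.items():
    print(f"{metric}: {score:.3f}")
```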
File renamed without changes.
8 changes: 4 additions & 4 deletions pycocoevalcap/bleu/bleu.py → bleu/bleu.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python
#
#
# File Name : bleu.py
#
# Description : Wrapper for BLEU scorer.
@@ -8,7 +8,7 @@
# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
# Authors : Hao Fang <[email protected]> and Tsung-Yi Lin <[email protected]>

from bleu_scorer import BleuScorer
from .bleu_scorer import BleuScorer


class Bleu:
@@ -18,7 +18,7 @@ def __init__(self, n=4):
self._hypo_for_image = {}
self.ref_for_image = {}

def compute_score(self, gts, res):
def compute_score(self, gts, res, verbose=1):

assert(gts.keys() == res.keys())
imgIds = gts.keys()
@@ -37,7 +37,7 @@ def compute_score(self, gts, res):
bleu_scorer += (hypo[0], ref)

#score, scores = bleu_scorer.compute_score(option='shortest')
score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
score, scores = bleu_scorer.compute_score(option='closest', verbose=verbose)
#score, scores = bleu_scorer.compute_score(option='average', verbose=1)

# return (bleu, bleu_info)
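The only behavioral change in bleu.py is the new verbose argument on compute_score, which is forwarded to the underlying BleuScorer. A hedged sketch of the call pattern follows; the import path and captions are illustrative, and inputs are expected to be pre-tokenized, e.g. by the PTBTokenizer.

```
# Illustrative call pattern for the Bleu wrapper (a sketch, not part of this diff).
from pycocoevalcap.bleu.bleu import Bleu  # assumed installed-package path

# image_id -> list of tokenized captions; gts holds references, res one hypothesis each
gts = {1: ["a train traveling down tracks next to lights",
           "a train on the tracks at night"],
       2: ["a large jetliner flying over a traffic filled street"]}
res = {1: ["a train traveling down a track in front of a road"],
       2: ["a plane flying in the sky"]}

score, scores = Bleu(n=4).compute_score(gts, res, verbose=0)  # verbose=0 silences the summary printout
print(score)   # [BLEU-1, BLEU-2, BLEU-3, BLEU-4] corpus-level scores
print(scores)  # per-image score lists, one list per n-gram order
```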
53 changes: 27 additions & 26 deletions pycocoevalcap/bleu/bleu_scorer.py → bleu/bleu_scorer.py
@@ -7,7 +7,7 @@
# reserved. Do not redistribute without permission from the
# author. Not for commercial use.

# Modified by:
# Modified by:
# Hao Fang <[email protected]>
# Tsung-Yi Lin <[email protected]>

@@ -26,8 +26,8 @@ def precook(s, n=4, out=False):
can take string arguments as well."""
words = s.split()
counts = defaultdict(int)
for k in xrange(1,n+1):
for i in xrange(len(words)-k+1):
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return (len(words), counts)
@@ -42,7 +42,7 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
for ref in refs:
rl, counts = precook(ref, n)
reflen.append(rl)
for (ngram,count) in counts.iteritems():
for (ngram,count) in counts.items():
maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

# Calculate effective reference sentence length.
@@ -52,32 +52,33 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
reflen = float(sum(reflen))/len(reflen)

## lhuang: N.B.: leave reflen computaiton to the very end!!

## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)

return (reflen, maxcounts)

def cook_test(test, (reflen, refmaxcounts), eff=None, n=4):
def cook_test(test, refs, eff=None, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.'''

reflen, refmaxcounts = refs
testlen, counts = precook(test, n, True)

result = {}

# Calculate effective reference sentence length.

if eff == "closest":
result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
else: ## i.e., "average" or "shortest" or None
result["reflen"] = reflen

result["testlen"] = testlen

result["guess"] = [max(0,testlen-k+1) for k in xrange(1,n+1)]
result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]

result['correct'] = [0]*n
for (ngram, count) in counts.iteritems():
for (ngram, count) in counts.items():
result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)

return result
@@ -108,7 +109,7 @@ def __init__(self, test=None, refs=None, n=4, special_reflen=None):

def cook_append(self, test, refs):
'''called by constructor and __iadd__ to avoid creating new instances.'''

if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
@@ -136,7 +137,7 @@ def reflen(self, option=None):

def testlen(self, option=None):
self.compute_score(option=option)
return self._testlen
return self._testlen

def retest(self, new_test):
if type(new_test) is str:
@@ -151,7 +152,7 @@ def retest(self, new_test):

def rescore(self, new_test):
''' replace test(s) with new test(s), and returns the new score.'''

return self.retest(new_test).compute_score()

def size(self):
@@ -170,7 +171,7 @@ def __iadd__(self, other):
self.crefs.extend(other.crefs)
self._score = None ## need to recompute

return self
return self

def compatible(self, other):
return isinstance(other, BleuScorer) and self.n == other.n
@@ -179,7 +180,7 @@ def single_reflen(self, option="average"):
return self._single_reflen(self.crefs[0][0], option)

def _single_reflen(self, reflens, option=None, testlen=None):

if option == "shortest":
reflen = min(reflens)
elif option == "average":
@@ -194,7 +195,7 @@ def _single_reflen(self, reflens, option=None, testlen=None):
def recompute_score(self, option=None, verbose=0):
self._score = None
return self.compute_score(option, verbose)

def compute_score(self, option=None, verbose=0):
n = self.n
small = 1e-9
@@ -212,7 +213,7 @@ def compute_score(self, option=None, verbose=0):
totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}

# for each sentence
for comps in self.ctest:
for comps in self.ctest:
testlen = comps['testlen']
self._testlen += testlen

@@ -222,42 +223,42 @@ def compute_score(self, option=None, verbose=0):
reflen = self.special_reflen

self._reflen += reflen

for key in ['guess','correct']:
for k in xrange(n):
for k in range(n):
totalcomps[key][k] += comps[key][k]

# append per image bleu score
bleu = 1.
for k in xrange(n):
for k in range(n):
bleu *= (float(comps['correct'][k]) + tiny) \
/(float(comps['guess'][k]) + small)
/(float(comps['guess'][k]) + small)
bleu_list[k].append(bleu ** (1./(k+1)))
ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in xrange(n):
for k in range(n):
bleu_list[k][-1] *= math.exp(1 - 1/ratio)

if verbose > 1:
print comps, reflen
print(comps, reflen)

totalcomps['reflen'] = self._reflen
totalcomps['testlen'] = self._testlen

bleus = []
bleu = 1.
for k in xrange(n):
for k in range(n):
bleu *= float(totalcomps['correct'][k] + tiny) \
/ (totalcomps['guess'][k] + small)
bleus.append(bleu ** (1./(k+1)))
ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in xrange(n):
for k in range(n):
bleus[k] *= math.exp(1 - 1/ratio)

if verbose > 0:
print totalcomps
print "ratio:", ratio
print(totalcomps)
print("ratio:", ratio)

self._score = bleus
return self._score, bleu_list
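Most of bleu_scorer.py above is mechanical 2-to-3 conversion: xrange becomes range, dict.iteritems() becomes dict.items(), print statements become print() calls, and cook_test loses its tuple parameter because Python 3 removed tuple unpacking in function signatures (PEP 3113), so the (reflen, maxcounts) pair from cook_refs is now unpacked inside the body. Below is a small standalone sketch of the n-gram counting that precook performs, written with the Python 3 idioms adopted here; the function name is hypothetical.

```
# Worked example of precook-style n-gram counting in Python 3 (hypothetical standalone form).
from collections import defaultdict

def ngram_counts(sentence, n=4):
    words = sentence.split()
    counts = defaultdict(int)
    for k in range(1, n + 1):                # range() replaces Python 2's xrange()
        for i in range(len(words) - k + 1):
            counts[tuple(words[i:i + k])] += 1
    return len(words), counts

length, counts = ngram_counts("a cat sat on a mat")
print(length)                  # 6
print(counts[("a",)])          # 2 -- the unigram "a" occurs twice
print(counts[("a", "cat")])    # 1
for ngram, count in counts.items():   # .items() replaces Python 2's .iteritems()
    print(ngram, count)
```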
10 changes: 5 additions & 5 deletions pycocoevalcap/cider/cider.py → cider/cider.py
@@ -1,18 +1,18 @@
# Filename: cider.py
#
# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric
# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric
# by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
#
# Creation Date: Sun Feb 8 14:16:54 2015
#
# Authors: Ramakrishna Vedantam <[email protected]> and Tsung-Yi Lin <[email protected]>

from cider_scorer import CiderScorer
from .cider_scorer import CiderScorer
import pdb

class Cider:
"""
Main Class to compute the CIDEr metric
Main Class to compute the CIDEr metric

"""
def __init__(self, test=None, refs=None, n=4, sigma=6.0):
@@ -26,7 +26,7 @@ def compute_score(self, gts, res):
Main function to compute CIDEr score
:param hypo_for_image (dict) : dictionary with key <image> and value <tokenized hypothesis / candidate sentence>
ref_for_image (dict) : dictionary with key <image> and value <tokenized reference sentence>
:return: cider (float) : computed CIDEr score for the corpus
:return: cider (float) : computed CIDEr score for the corpus
"""

assert(gts.keys() == res.keys())
@@ -51,4 +51,4 @@ def compute_score(self, gts, res):
return score, scores

def method(self):
return "CIDEr"
return "CIDEr"
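As in bleu.py, the change in cider.py is only the relative import; the call pattern is unchanged. A sketch of computing CIDEr on tokenized captions follows; the import paths and the PTBTokenizer input format are assumptions, and note that CIDEr's IDF weights are estimated from the reference corpus itself, so scores over only a handful of images are not meaningful.

```
# Illustrative CIDEr call (a sketch; assumes the packaged import paths and Java for the tokenizer).
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.cider.cider import Cider

# PTBTokenizer is assumed to take image_id -> list of {"caption": ...} dicts
gts = {0: [{"caption": "A dog runs across the grass."}],
       1: [{"caption": "Two people ride bicycles down a street."}]}
res = {0: [{"caption": "a dog running on grass"}],
       1: [{"caption": "people riding bikes on a road"}]}

tokenizer = PTBTokenizer()
gts_tok = tokenizer.tokenize(gts)   # image_id -> list of tokenized caption strings
res_tok = tokenizer.tokenize(res)

score, scores = Cider().compute_score(gts_tok, res_tok)
print(score)    # corpus-level CIDEr score
print(scores)   # numpy array of per-image scores
```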
12 changes: 6 additions & 6 deletions pycocoevalcap/cider/cider_scorer.py → cider/cider_scorer.py
@@ -19,8 +19,8 @@ def precook(s, n=4, out=False):
"""
words = s.split()
counts = defaultdict(int)
for k in xrange(1,n+1):
for i in xrange(len(words)-k+1):
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return counts
@@ -99,7 +99,7 @@ def compute_doc_freq(self):
'''
for refs in self.crefs:
# refs, k ref captions of one image
for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]):
self.document_frequency[ngram] += 1
# maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

@@ -115,7 +115,7 @@ def counts2vec(cnts):
vec = [defaultdict(float) for _ in range(self.n)]
length = 0
norm = [0.0 for _ in range(self.n)]
for (ngram,term_freq) in cnts.iteritems():
for (ngram,term_freq) in cnts.items():
# give word count 1 if it doesn't appear in reference corpus
df = np.log(max(1.0, self.document_frequency[ngram]))
# ngram index
@@ -146,7 +146,7 @@ def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
val = np.array([0.0 for _ in range(self.n)])
for n in range(self.n):
# ngram
for (ngram,count) in vec_hyp[n].iteritems():
for (ngram,count) in vec_hyp[n].items():
# vrama91 : added clipping
val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]

@@ -189,4 +189,4 @@ def compute_score(self, option=None, verbose=0):
score = self.compute_cider()
# debug
# print score
return np.mean(np.array(score)), np.array(score)
return np.mean(np.array(score)), np.array(score)
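The counts2vec helper shown above turns raw n-gram counts into one TF-IDF vector per n-gram order, together with its L2 norm, using the corpus document frequencies. A condensed, hypothetical standalone version of that step, using the .items() idiom adopted in this diff:

```
# Condensed sketch of the TF-IDF weighting inside counts2vec (names are hypothetical).
from collections import defaultdict
import numpy as np

def tfidf_vectors(ngram_counts, document_frequency, num_ref_images, n=4):
    """Map raw n-gram counts to per-order TF-IDF vectors plus their L2 norms."""
    vec = [defaultdict(float) for _ in range(n)]
    norm = [0.0] * n
    log_num_refs = np.log(float(num_ref_images))
    for ngram, term_freq in ngram_counts.items():                   # .items() replaces .iteritems()
        df = np.log(max(1.0, document_frequency.get(ngram, 0.0)))   # unseen n-grams get df = 0
        order = len(ngram) - 1                                      # vectors are indexed by n-gram length
        vec[order][ngram] = float(term_freq) * (log_num_refs - df)  # tf * idf
        norm[order] += vec[order][ngram] ** 2
    return vec, [np.sqrt(x) for x in norm]
```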