
Add VL-RewardBench #703

Merged: 15 commits (Jan 1, 2025)
2 changes: 1 addition & 1 deletion run.py
@@ -322,7 +322,7 @@ def main():
judge_kwargs['model'] = 'chatgpt-0125'
elif listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name):
judge_kwargs['model'] = 'gpt-4-turbo'
elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath'], dataset_name):
elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench'], dataset_name): # noqa: E501
judge_kwargs['model'] = 'gpt-4o-mini'
elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', 'WildVision'], dataset_name): # noqa: E501
judge_kwargs['model'] = 'gpt-4o'
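For reference, the judge routing in run.py keys off `listinstr`; the hunk above adds VL-RewardBench to the `gpt-4o-mini` branch. A minimal sketch of how that routing behaves (`listinstr` is paraphrased from its usage here, and the branch order is condensed; the real helper lives in `vlmeval.smp`):

```python
# Sketch of the judge-model routing above. `listinstr` is paraphrased from
# its usage here; the real helper lives in vlmeval.smp and may differ.
def listinstr(lst, s):
    # True if any keyword in `lst` occurs as a substring of `s`.
    return any(item in s for item in lst)

def pick_judge_model(dataset_name: str) -> str:
    if listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name):
        return 'gpt-4-turbo'
    if listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench'], dataset_name):
        return 'gpt-4o-mini'
    if listinstr(['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', 'WildVision'], dataset_name):
        return 'gpt-4o'
    return 'chatgpt-0125'  # simplified; run.py's actual branch order and default differ

print(pick_judge_model('VL-RewardBench'))  # -> gpt-4o-mini
```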
2 changes: 1 addition & 1 deletion vlmeval/api/siliconflow.py
@@ -26,7 +26,7 @@ def resize_image(image: Image.Image, max_height: int, max_width: int) -> Image.I
def encode_image(path: str, max_height: int = 1024, max_width: int = 1024) -> str:
image = Image.open(path).convert("RGB")
image = resize_image(image, max_height, max_width)
height, width = image.size
width, height = image.size
if min(height, width) < 50:
scale = 50 / min(width, height)
image = image.resize((int(width * scale), int(height * scale)))
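The siliconflow change fixes an unpacking bug: PIL's `Image.size` is `(width, height)`, so `height, width = image.size` swapped the two dimensions before the min-side check. A quick self-contained check:

```python
# PIL's Image.size is (width, height); unpacking it as (height, width)
# silently swaps the dimensions, which the hunk above corrects.
from PIL import Image

img = Image.new("RGB", (1024, 768))  # width=1024, height=768
width, height = img.size
print(width, height)                 # 1024 768
# With the old unpacking, `height` would have been 1024 and `width` 768,
# so the small-image upscaling check below it operated on swapped values.
```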
3 changes: 2 additions & 1 deletion vlmeval/config.py
@@ -13,7 +13,7 @@
VideoChatGPT_ROOT = None
PLLaVA_ROOT = None
RBDash_ROOT = None
VITA_ROOT = None
VITA_ROOT = '/fs-computility/mllm1/shared/dhd/VITA'
LLAVA_V1_7B_MODEL_PTH = 'Please set your local path to LLaVA-7B-v1.1 here, the model weight is obtained by merging LLaVA delta weight based on vicuna-7b-v1.1 in https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md with vicuna-7b-v1.1. '

video_models = {
@@ -332,6 +332,7 @@
qwen2vl_series = {
'Qwen-VL-Max-0809': partial(Qwen2VLAPI, model='qwen-vl-max-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen-VL-Plus-0809': partial(Qwen2VLAPI, model='qwen-vl-plus-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'QVQ-72B-Preview': partial(Qwen2VLChat, model_path='Qwen/QVQ-72B-Preview', min_pixels=1280*28*28, max_pixels=16384*28*28, system_prompt='You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.', max_new_tokens=8192, post_process=False),
'Qwen2-VL-72B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-72B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28),
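The config entries above register model constructors with `functools.partial`, so nothing is instantiated until the registry entry is called. A minimal sketch of the pattern (`PlaceholderChatModel` is a stand-in for this sketch; the real entries bind `Qwen2VLChat` / `Qwen2VLAPI` as shown):

```python
# Sketch of the partial-based registry used in vlmeval/config.py.
# PlaceholderChatModel is a stand-in; the real entries bind Qwen2VLChat / Qwen2VLAPI.
from functools import partial

class PlaceholderChatModel:
    def __init__(self, model_path, min_pixels=None, max_pixels=None,
                 system_prompt=None, max_new_tokens=2048, post_process=True):
        self.model_path = model_path
        self.max_new_tokens = max_new_tokens

model_registry = {
    'QVQ-72B-Preview': partial(
        PlaceholderChatModel,
        model_path='Qwen/QVQ-72B-Preview',
        min_pixels=1280 * 28 * 28,
        max_pixels=16384 * 28 * 28,
        max_new_tokens=8192,
        post_process=False,
    ),
}

# Construction is deferred until the entry is called:
model = model_registry['QVQ-72B-Preview']()
print(model.model_path)  # Qwen/QVQ-72B-Preview
```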
3 changes: 2 additions & 1 deletion vlmeval/dataset/__init__.py
@@ -20,6 +20,7 @@
from .mmlongbench import MMLongBench
from .dude import DUDE
from .slidevqa import SlideVQA
from .vl_rewardbench import VLRewardBench

from .mmbench_video import MMBenchVideo
from .videomme import VideoMME
@@ -132,7 +133,7 @@ def evaluate(self, eval_file, **judge_kwargs):
MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset, CCOCRDataset,
GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset,
MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH,
CMMMU
CMMMU, VLRewardBench
]

VIDEO_DATASET = [
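Registering `VLRewardBench` in the class list above is what lets the benchmark be resolved by name. A rough sketch of how such a list can be turned into a name-to-class index (the `DATASET_URL` attribute used for matching is an assumption for this sketch; the actual lookup in VLMEvalKit may differ):

```python
# Hedged sketch: build a name -> class index from a list of dataset classes.
# Assumes each class advertises the names it supports via a DATASET_URL dict
# (an assumption; the real mechanism may differ).
def build_dataset_index(dataset_classes):
    index = {}
    for cls in dataset_classes:
        for name in getattr(cls, 'DATASET_URL', {}) or {}:
            index[name] = cls
    return index

# index.get('VL-RewardBench') would then return VLRewardBench, provided the
# class lists that name.
```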
12 changes: 10 additions & 2 deletions vlmeval/dataset/image_ccocr.py
@@ -1,10 +1,18 @@
# flake8: noqa

import os
import re
import tempfile
from functools import partial
import pandas as pd

from .utils import ccocr_evaluator_map
try:
from .utils.ccocr_evaluator import evaluator_map_info as ccocr_evaluator_map
except ImportError as err:
import warnings
warnings.warn('The dependency of CCOCR evaluator is not properly installed')
warnings.warn(f'{type(err)}: {err}')

from .image_base import ImageBaseDataset
from ..smp import *

@@ -157,7 +165,7 @@ def evaluate(self, eval_file, **judge_kwargs):
for data_info in dict_list:
image_name = data_info['image_name']
gt_info[image_name] = data_info['answer']

# warning the FAIL samples
if data_info['prediction'] != FAIL_MSG:
ptd_info[image_name] = data_info['prediction']
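The try/except added around the CC-OCR evaluator import at the top of image_ccocr.py keeps `vlmeval.dataset` importable when the evaluator's extra dependencies are missing. A generic sketch of the same guard, using a hypothetical module name and adding an explicit `None` fallback (the fallback is an addition for this sketch, not part of the patch):

```python
# Optional-dependency import guard, mirroring the one added above.
# `some_optional_package` is a hypothetical module; the explicit None fallback
# is an addition for this sketch so callers can test availability before use.
import warnings

try:
    from some_optional_package import evaluator_map_info
except ImportError as err:
    warnings.warn('Optional evaluator dependency is not properly installed')
    warnings.warn(f'{type(err)}: {err}')
    evaluator_map_info = None

# Later code can then degrade gracefully:
# if evaluator_map_info is None:
#     raise RuntimeError('CC-OCR evaluation requested but its dependencies are missing')
```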
4 changes: 2 additions & 2 deletions vlmeval/dataset/image_mcq.py
@@ -143,7 +143,7 @@ class ImageMCQDataset(ImageBaseDataset):
'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975',
'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be',
'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e',
'RealWorldQA': '92321028d2bc29040284b6674721e48f',
'RealWorldQA': '4de008f55dc4fd008ca9e15321dc44b7',
'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
'BLINK': '3b6649b6a662184ea046908e5506260e',
'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889',
Expand Down Expand Up @@ -286,7 +286,7 @@ class MMMUDataset(ImageMCQDataset):
}

DATASET_MD5 = {
'MMMU_DEV_VAL': '521afc0f3bf341e6654327792781644d',
'MMMU_DEV_VAL': '585e8ad75e73f75dcad265dfd0417d64',
'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d',
}

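The MD5 updates above track refreshed copies of the RealWorldQA and MMMU_DEV_VAL TSVs. A minimal sketch of the checksum validation such entries support (the file path is a placeholder; the real verification logic in VLMEvalKit may differ):

```python
# Checksum validation sketch. The path below is a placeholder and the real
# verification code in VLMEvalKit may differ.
import hashlib

def md5_of_file(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

expected = '585e8ad75e73f75dcad265dfd0417d64'  # new MMMU_DEV_VAL digest from the diff
# if md5_of_file('/path/to/MMMU_DEV_VAL.tsv') != expected:
#     the cached TSV is stale or corrupted and should be re-downloaded.
```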
3 changes: 1 addition & 2 deletions vlmeval/dataset/utils/__init__.py
@@ -1,10 +1,9 @@
from .judge_util import build_judge, DEBUG_MESSAGE
from .multiple_choice import extract_answer_from_item, prefetch_answer
from .vqa_eval import levenshtein_distance
from .ccocr_evaluator import evaluator_map_info as ccocr_evaluator_map


__all__ = [
'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'ccocr_evaluator_map',
'build_judge', 'extract_answer_from_item', 'prefetch_answer',
'levenshtein_distance', 'DEBUG_MESSAGE',
]
8 changes: 4 additions & 4 deletions vlmeval/dataset/utils/ccocr_evaluator/README.md
@@ -1,12 +1,12 @@
# CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy

## Introduction

Please refer to our [GitHub](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/Benchmarks/CC-OCR) for more information.

## Running Scripts

Once the environment is ready, execute the following script from the root directory of VLMEvalKit
Once the environment is ready, execute the following script from the root directory of VLMEvalKit
to perform inference and evaluation tasks in batch.

```shell
@@ -44,13 +44,13 @@ If you find our work helpful, feel free to give us a cite.

```
@misc{yang2024ccocr,
title={CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy},
title={CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy},
author={Zhibo Yang and Jun Tang and Zhaohai Li and Pengfei Wang and Jianqiang Wan and Humen Zhong and Xuejing Liu and Mingkun Yang and Peng Wang and Shuai Bai and LianWen Jin and Junyang Lin},
year={2024},
eprint={2412.02210},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2412.02210},
url={https://arxiv.org/abs/2412.02210},
}
```

2 changes: 1 addition & 1 deletion vlmeval/dataset/utils/ccocr_evaluator/__init__.py
@@ -9,4 +9,4 @@
"doc_parsing": ParsingEvaluator("doc_parsing"),
"multi_lan_ocr": OcrEvaluator("multi_lan_ocr"),
"multi_scene_ocr": OcrEvaluator("multi_scene_ocr")
}
}
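`evaluator_map_info` maps each CC-OCR subtask name to a ready-to-use evaluator instance. A hedged usage sketch, following the `__call__(pdt_res_dir, gt_info, ...)` signature shown in common.py further down (the prediction-directory layout and ground-truth dict are assumptions):

```python
# Hedged usage sketch for the registry above. The call follows the
# __call__(pdt_res_dir, gt_info, ...) signature shown in common.py below;
# the prediction directory layout is an assumption.
from vlmeval.dataset.utils.ccocr_evaluator import evaluator_map_info

evaluator = evaluator_map_info['multi_scene_ocr']
gt_info = {'image_001.jpg': 'ground-truth text'}  # placeholder ground truth

# meta_info, eval_info = evaluator('/path/to/prediction_dir', gt_info)
# print(eval_info.get('summary', {}))
```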
12 changes: 6 additions & 6 deletions vlmeval/dataset/utils/ccocr_evaluator/common.py
@@ -26,7 +26,7 @@ def pick_response_text(json_path):

response_text = None
if model_name.startswith("gpt") or model_name.startswith("o1"):
response_text = model_response.get("data", {}).get("response", {}).get("choices", [{}])[0].get("message", {}).get("content", None)
response_text = model_response.get("data", {}).get("response", {}).get("choices", [{}])[0].get("message", {}).get("content", None) # noqa: E501
elif model_name.startswith("local_"):
response_text = model_response
else:
@@ -35,7 +35,7 @@
elif model_name.startswith("gemini"):
content_list = model_response.get("candidates", [{}])[0].get("content", {}).get("parts", None)
elif model_name.startswith("qwen"):
content_list = model_response.get("output", {}).get("choices", [{}])[0].get("message", {}).get("content", None)
content_list = model_response.get("output", {}).get("choices", [{}])[0].get("message", {}).get("content", None) # noqa: E501
else:
raise NotImplementedError("The pick_response_text NOT implemented for model: {}".format(model_name))

@@ -115,7 +115,7 @@ def __call__(self, pdt_res_dir, gt_info, with_response_ratio=True, **kwargs):
# add response_success_ratio
if "summary" in eval_info and with_response_ratio:
success_ratio = (len(response_info) + len(post_error_list)) / (len(gt_info) + 1e-9)
eval_info["summary"].update({"response_success_ratio": success_ratio })
eval_info["summary"].update({"response_success_ratio": success_ratio})
return meta_info, eval_info


@@ -149,9 +149,9 @@ def summary_multi_exp(exp_dir_base, dataset_list=None, is_weighted_sum=False):
data_status_info = json.load(f)
all_dataset_name.extend(data_status_info.keys())
dataset_list = sorted(set(all_dataset_name))

# summary main code
all_evaluate_info, line_index = {}, 0
all_evaluate_info, _ = {}, 0
for exp_name in os.listdir(exp_dir_base):
dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json")
if not os.path.exists(dir_status_path):
@@ -219,4 +219,4 @@ def summary_multi_exp(exp_dir_base, dataset_list=None, is_weighted_sum=False):

summary_path = summary_multi_exp(exp_base_dir, dataset_list=None, is_weighted_sum=False)
print("--> info: summary saved at : {}".format(summary_path))
print("happy coding.")
print("happy coding.")
@@ -241,7 +241,7 @@ def eval_formula(self, response_info, gt_info, op_name='formula'):
pred = response_info[img_name]

if op_name == 'formula':
pred = pred.replace("\n", " ").replace("```latex", "").replace("```", "").replace("\t", " ").replace(" ", "")
pred = pred.replace("\n", " ").replace("```latex", "").replace("```", "").replace("\t", " ").replace(" ", "") # noqa: E501
gt = gt.replace(" ", "")
elif op_name == 'molecular':
pred = pred.replace("\n", "").replace(" ", "").replace("<smiles>", "").replace("</smiles>", "")
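`eval_formula` strips newlines, tabs, Markdown latex code fences, and spaces from the prediction before comparing it to the ground truth, so only the LaTeX tokens themselves are matched. A small sketch of that normalization (the helper name is ours):

```python
# Normalization applied before comparing formula predictions, mirroring the
# replace-chain in eval_formula above. The helper name is ours.
def normalize_formula(text: str) -> str:
    return (
        text.replace("\n", " ")
            .replace("```latex", "")
            .replace("```", "")
            .replace("\t", " ")
            .replace(" ", "")
    )

pred = "```latex\n\\frac{a}{b} + c\n```"
gt = "\\frac{a}{b} + c"
print(normalize_formula(pred) == gt.replace(" ", ""))  # True
```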
61 changes: 25 additions & 36 deletions vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py
@@ -66,11 +66,11 @@ def update_cost(node1: Node, node2: Node):
label2 = node2.label
label1_leaf = "<leaf>" in label1
label2_leaf = "<leaf>" in label2
if label1_leaf == True and label2_leaf == True:
if label1_leaf and label2_leaf:
return edit_distance(label1.replace("<leaf>", ""), label2.replace("<leaf>", ""))
elif label1_leaf == False and label2_leaf == True:
elif not label1_leaf and label2_leaf:
return 1 + len(label2.replace("<leaf>", ""))
elif label1_leaf == True and label2_leaf == False:
elif label1_leaf and not label2_leaf:
return 1 + len(label1.replace("<leaf>", ""))
else:
return int(label1 != label2)
@@ -121,7 +121,8 @@ def normalize_dict(data: Union[Dict, List, Any]):

def cal_f1_all(preds, answers):
"""
Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives, false negatives and false positives
Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives,
false negatives and false positives
"""
metric_info, error_info = {}, {}
total_tp, total_fn_or_fp = 0, 0
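The reworded docstring describes field-level, micro-averaged F1: true positives, false positives and false negatives are pooled across all samples before a single F1 is computed. A worked sketch of that definition (field matching is simplified to exact key-value pairs):

```python
# Field-level micro-averaged F1, illustrating the docstring above.
# Field matching is simplified to exact (key, value) pairs for this sketch.
def micro_f1(preds, golds):
    tp = fp = fn = 0
    for pred, gold in zip(preds, golds):
        pred_fields = set(pred.items())
        gold_fields = set(gold.items())
        tp += len(pred_fields & gold_fields)
        fp += len(pred_fields - gold_fields)
        fn += len(gold_fields - pred_fields)
    denom = 2 * tp + fp + fn
    return 2 * tp / denom if denom else 1.0

preds = [{'date': '2024-12-01', 'total': '42'}, {'date': '2024-11-30'}]
golds = [{'date': '2024-12-01', 'total': '40'}, {'date': '2024-11-30'}]
print(micro_f1(preds, golds))  # 2*2 / (2*2 + 1 + 1) = 0.666...
```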
@@ -233,35 +234,28 @@ def cal_acc(pred: dict, answer: dict):
"""
pred = construct_tree_from_dict(normalize_dict(pred))
answer = construct_tree_from_dict(normalize_dict(answer))
return max(
0,
1
- (
zss.distance(
pred,
answer,
get_children=zss.Node.get_children,
insert_cost=insert_and_remove_cost,
remove_cost=insert_and_remove_cost,
update_cost=update_cost,
return_operations=False,
)
/ zss.distance(
construct_tree_from_dict(normalize_dict({})),
answer,
get_children=zss.Node.get_children,
insert_cost=insert_and_remove_cost,
remove_cost=insert_and_remove_cost,
update_cost=update_cost,
return_operations=False,
)
),
val1 = zss.distance(
pred,
answer,
get_children=zss.Node.get_children,
insert_cost=insert_and_remove_cost,
remove_cost=insert_and_remove_cost,
update_cost=update_cost,
return_operations=False,
)
val2 = zss.distance(
construct_tree_from_dict(normalize_dict({})),
answer,
get_children=zss.Node.get_children,
insert_cost=insert_and_remove_cost,
remove_cost=insert_and_remove_cost,
update_cost=update_cost,
return_operations=False,
)
return max(0, 1 - val1 / val2)


def cal_acc_all(pred_info, answer_info):
"""
"""
acc_info, error_info = {}, {}
for file_name, answer in answer_info.items():
# if file_name not in pred_info:
@@ -303,13 +297,11 @@ def eval_donut(pdt_info, gt_info, normalize_func=None, data_name=None):
acc_average, acc_error_info = cal_acc_all(pdt_info, gt_info)
eval_info = {"f1_score": f1_score, "acc": acc_average, "class_f1_score": class_eval_info,
"f1_error_info": error_info, "acc_error_info": acc_error_info}
print(data_name, "f1_score", f1_score, "acc", acc_average)
print(data_name, "f1_score", f1_score, "acc", acc_average)
return eval_info


def post_process_to_json(qwen_info_str, file_name=None):
"""
"""
try:
if "```json" in qwen_info_str:
if "```" not in qwen_info_str:
@@ -320,10 +312,7 @@ def post_process_to_json(qwen_info_str, file_name=None):
json_str = qwen_info_str.strip().replace("\n", "")
json_data = json.loads(json_str)
return json_data
except Exception as e:
# print("--> post error: {}, file_name: {}".format(e, file_name))
# print("json_raw", qwen_info_str)
# print("json_str", json_str)
except Exception as err: # noqa: F841
return None


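The refactored `cal_acc` computes a normalized tree edit distance: the zss distance between the prediction and answer trees, divided by the distance between an empty tree and the answer, clamped at zero. A self-contained sketch of that normalization using `zss.simple_distance` with its default unit costs (the toy trees are ours; kie_evaluator uses custom character-level leaf costs, so its values differ):

```python
# Normalized tree edit distance, mirroring the refactored cal_acc above:
#   score = max(0, 1 - d(pred, answer) / d(empty, answer))
# Toy trees are ours; the real code builds trees from nested dicts and uses
# custom insert/remove/update costs rather than simple_distance's defaults.
from zss import Node, simple_distance

answer = Node('root', [Node('total', [Node('<leaf>42')])])
pred = Node('root', [Node('total', [Node('<leaf>41')])])
empty = Node('root')

score = max(0.0, 1 - simple_distance(pred, answer) / simple_distance(empty, answer))
print(score)  # 0.5 with unit costs: one relabel vs. two insertions
```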
6 changes: 4 additions & 2 deletions vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py
@@ -92,8 +92,10 @@ def evaluate(self, response_info, gt_info, **kwargs):
image_pdt_info, image_gt_info = {}, {}
for file_name, gt_src in gt_info.items():
pred_src = response_info.get(file_name, "")
pdt_token_list = text_normalize_and_tokenize(str(pred_src).strip(), is_word_level, is_lower, is_alphanum_only)
gt_token_list = text_normalize_and_tokenize(str(gt_src).strip(), is_word_level, is_lower, is_alphanum_only)
pdt_token_list = text_normalize_and_tokenize(
str(pred_src).strip(), is_word_level, is_lower, is_alphanum_only)
gt_token_list = text_normalize_and_tokenize(
str(gt_src).strip(), is_word_level, is_lower, is_alphanum_only)
image_pdt_info[file_name] = pdt_token_list
image_gt_info[file_name] = gt_token_list
eval_result = calculate_metrics(image_pdt_info, image_gt_info, is_verbose=False)
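The wrapped calls above normalize and tokenize both prediction and ground truth before metrics are computed. A hedged sketch of what such a step typically does, inferred from the flags `is_word_level`, `is_lower`, and `is_alphanum_only` (the helper below is an illustration; the real `text_normalize_and_tokenize` may differ):

```python
# Illustration of normalize-then-tokenize for OCR scoring. This helper is an
# assumption sketched from the flag names above (is_word_level, is_lower,
# is_alphanum_only); the real text_normalize_and_tokenize may differ.
import re

def normalize_and_tokenize(text, is_word_level=True, is_lower=True, is_alphanum_only=False):
    if is_lower:
        text = text.lower()
    if is_alphanum_only:
        text = re.sub(r'[^0-9a-zA-Z\s]', ' ', text)
    # Word-level: split on whitespace; character-level: one token per character.
    return text.split() if is_word_level else list(text.replace(' ', ''))

print(normalize_and_tokenize('Total: 42 USD'))                         # ['total:', '42', 'usd']
print(normalize_and_tokenize('Total: 42 USD', is_alphanum_only=True))  # ['total', '42', 'usd']
```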