
Add VL-RewardBench #703

Merged: 15 commits (Jan 1, 2025)
2 changes: 1 addition & 1 deletion run.py
@@ -322,7 +322,7 @@ def main():
judge_kwargs['model'] = 'chatgpt-0125'
elif listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name):
judge_kwargs['model'] = 'gpt-4-turbo'
elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath'], dataset_name):
elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench'], dataset_name): # noqa: E501
judge_kwargs['model'] = 'gpt-4o-mini'
elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', 'WildVision'], dataset_name): # noqa: E501
judge_kwargs['model'] = 'gpt-4o'
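For reference, the judge routing in run.py keys off `listinstr`; the hunk above adds VL-RewardBench to the `gpt-4o-mini` branch. A minimal sketch of how that routing behaves (`listinstr` is paraphrased from its usage here, and the branch order is condensed; the real helper lives in `vlmeval.smp`):

```python
# Sketch of the judge-model routing above. `listinstr` is paraphrased from
# its usage here; the real helper lives in vlmeval.smp and may differ.
def listinstr(lst, s):
    # True if any keyword in `lst` occurs as a substring of `s`.
    return any(item in s for item in lst)

def pick_judge_model(dataset_name: str) -> str:
    if listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name):
        return 'gpt-4-turbo'
    if listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench'], dataset_name):
        return 'gpt-4o-mini'
    if listinstr(['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', 'WildVision'], dataset_name):
        return 'gpt-4o'
    return 'chatgpt-0125'  # simplified; run.py's actual branch order and default differ

print(pick_judge_model('VL-RewardBench'))  # -> gpt-4o-mini
```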
2 changes: 1 addition & 1 deletion vlmeval/api/siliconflow.py
@@ -26,7 +26,7 @@ def resize_image(image: Image.Image, max_height: int, max_width: int) -> Image.I
def encode_image(path: str, max_height: int = 1024, max_width: int = 1024) -> str:
image = Image.open(path).convert("RGB")
image = resize_image(image, max_height, max_width)
height, width = image.size
width, height = image.size
if min(height, width) < 50:
scale = 50 / min(width, height)
image = image.resize((int(width * scale), int(height * scale)))
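The siliconflow change fixes an unpacking bug: PIL's `Image.size` is `(width, height)`, so `height, width = image.size` swapped the two dimensions before the min-side check. A quick self-contained check:

```python
# PIL's Image.size is (width, height); unpacking it as (height, width)
# silently swaps the dimensions, which the hunk above corrects.
from PIL import Image

img = Image.new("RGB", (1024, 768))  # width=1024, height=768
width, height = img.size
print(width, height)                 # 1024 768
# With the old unpacking, `height` would have been 1024 and `width` 768,
# so the small-image upscaling check below it operated on swapped values.
```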
3 changes: 2 additions & 1 deletion vlmeval/config.py
@@ -13,7 +13,7 @@
VideoChatGPT_ROOT = None
PLLaVA_ROOT = None
RBDash_ROOT = None
VITA_ROOT = None
VITA_ROOT = '/fs-computility/mllm1/shared/dhd/VITA'
LLAVA_V1_7B_MODEL_PTH = 'Please set your local path to LLaVA-7B-v1.1 here, the model weight is obtained by merging LLaVA delta weight based on vicuna-7b-v1.1 in https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md with vicuna-7b-v1.1. '

video_models = {
@@ -332,6 +332,7 @@
qwen2vl_series = {
'Qwen-VL-Max-0809': partial(Qwen2VLAPI, model='qwen-vl-max-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen-VL-Plus-0809': partial(Qwen2VLAPI, model='qwen-vl-plus-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'QVQ-72B-Preview': partial(Qwen2VLChat, model_path='Qwen/QVQ-72B-Preview', min_pixels=1280*28*28, max_pixels=16384*28*28, system_prompt='You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.', max_new_tokens=8192, post_process=False),
'Qwen2-VL-72B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-72B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28),
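The config entries above register model constructors with `functools.partial`, so nothing is instantiated until the registry entry is called. A minimal sketch of the pattern (`PlaceholderChatModel` is a stand-in for this sketch; the real entries bind `Qwen2VLChat` / `Qwen2VLAPI` as shown):

```python
# Sketch of the partial-based registry used in vlmeval/config.py.
# PlaceholderChatModel is a stand-in; the real entries bind Qwen2VLChat / Qwen2VLAPI.
from functools import partial

class PlaceholderChatModel:
    def __init__(self, model_path, min_pixels=None, max_pixels=None,
                 system_prompt=None, max_new_tokens=2048, post_process=True):
        self.model_path = model_path
        self.max_new_tokens = max_new_tokens

model_registry = {
    'QVQ-72B-Preview': partial(
        PlaceholderChatModel,
        model_path='Qwen/QVQ-72B-Preview',
        min_pixels=1280 * 28 * 28,
        max_pixels=16384 * 28 * 28,
        max_new_tokens=8192,
        post_process=False,
    ),
}

# Construction is deferred until the entry is called:
model = model_registry['QVQ-72B-Preview']()
print(model.model_path)  # Qwen/QVQ-72B-Preview
```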
3 changes: 2 additions & 1 deletion vlmeval/dataset/__init__.py
@@ -20,6 +20,7 @@
from .mmlongbench import MMLongBench
from .dude import DUDE
from .slidevqa import SlideVQA
from .vl_rewardbench import VLRewardBench

from .mmbench_video import MMBenchVideo
from .videomme import VideoMME
@@ -132,7 +133,7 @@ def evaluate(self, eval_file, **judge_kwargs):
MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset, CCOCRDataset,
GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset,
MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH,
CMMMU
CMMMU, VLRewardBench
]

VIDEO_DATASET = [
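Registering `VLRewardBench` in the class list above is what lets the benchmark be resolved by name. A rough sketch of how such a list can be turned into a name-to-class index (the `DATASET_URL` attribute used for matching is an assumption for this sketch; the actual lookup in VLMEvalKit may differ):

```python
# Hedged sketch: build a name -> class index from a list of dataset classes.
# Assumes each class advertises the names it supports via a DATASET_URL dict
# (an assumption; the real mechanism may differ).
def build_dataset_index(dataset_classes):
    index = {}
    for cls in dataset_classes:
        for name in getattr(cls, 'DATASET_URL', {}) or {}:
            index[name] = cls
    return index

# index.get('VL-RewardBench') would then return VLRewardBench, provided the
# class lists that name.
```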
12 changes: 10 additions & 2 deletions vlmeval/dataset/image_ccocr.py
@@ -1,10 +1,18 @@
# flake8: noqa

import os
import re
import tempfile
from functools import partial
import pandas as pd

from .utils import ccocr_evaluator_map
try:
from .utils.ccocr_evaluator import evaluator_map_info as ccocr_evaluator_map
except ImportError as err:
import warnings
warnings.warn('The dependency of CCOCR evaluator is not properly installed')
warnings.warn(f'{type(err)}: {err}')

from .image_base import ImageBaseDataset
from ..smp import *

@@ -157,7 +165,7 @@ def evaluate(self, eval_file, **judge_kwargs):
for data_info in dict_list:
image_name = data_info['image_name']
gt_info[image_name] = data_info['answer']

# warning the FAIL samples
if data_info['prediction'] != FAIL_MSG:
ptd_info[image_name] = data_info['prediction']
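The try/except added around the CC-OCR evaluator import at the top of image_ccocr.py keeps `vlmeval.dataset` importable when the evaluator's extra dependencies are missing. A generic sketch of the same guard, using a hypothetical module name and adding an explicit `None` fallback (the fallback is an addition for this sketch, not part of the patch):

```python
# Optional-dependency import guard, mirroring the one added above.
# `some_optional_package` is a hypothetical module; the explicit None fallback
# is an addition for this sketch so callers can test availability before use.
import warnings

try:
    from some_optional_package import evaluator_map_info
except ImportError as err:
    warnings.warn('Optional evaluator dependency is not properly installed')
    warnings.warn(f'{type(err)}: {err}')
    evaluator_map_info = None

# Later code can then degrade gracefully:
# if evaluator_map_info is None:
#     raise RuntimeError('CC-OCR evaluation requested but its dependencies are missing')
```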
4 changes: 2 additions & 2 deletions vlmeval/dataset/image_mcq.py
@@ -143,7 +143,7 @@ class ImageMCQDataset(ImageBaseDataset):
'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975',
'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be',
'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e',
'RealWorldQA': '92321028d2bc29040284b6674721e48f',
'RealWorldQA': '4de008f55dc4fd008ca9e15321dc44b7',
'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
'BLINK': '3b6649b6a662184ea046908e5506260e',
'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889',
Expand Down Expand Up @@ -286,7 +286,7 @@ class MMMUDataset(ImageMCQDataset):
}

DATASET_MD5 = {
'MMMU_DEV_VAL': '521afc0f3bf341e6654327792781644d',
'MMMU_DEV_VAL': '585e8ad75e73f75dcad265dfd0417d64',
'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d',
}

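The MD5 updates above track refreshed copies of the RealWorldQA and MMMU_DEV_VAL TSVs. A minimal sketch of the checksum validation such entries support (the file path is a placeholder; the real verification logic in VLMEvalKit may differ):

```python
# Checksum validation sketch. The path below is a placeholder and the real
# verification code in VLMEvalKit may differ.
import hashlib

def md5_of_file(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

expected = '585e8ad75e73f75dcad265dfd0417d64'  # new MMMU_DEV_VAL digest from the diff
# if md5_of_file('/path/to/MMMU_DEV_VAL.tsv') != expected:
#     the cached TSV is stale or corrupted and should be re-downloaded.
```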
3 changes: 1 addition & 2 deletions vlmeval/dataset/utils/__init__.py
@@ -1,10 +1,9 @@
from .judge_util import build_judge, DEBUG_MESSAGE
from .multiple_choice import extract_answer_from_item, prefetch_answer
from .vqa_eval import levenshtein_distance
from .ccocr_evaluator import evaluator_map_info as ccocr_evaluator_map


__all__ = [
'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'ccocr_evaluator_map',
'build_judge', 'extract_answer_from_item', 'prefetch_answer',
'levenshtein_distance', 'DEBUG_MESSAGE',
]
8 changes: 4 additions & 4 deletions vlmeval/dataset/utils/ccocr_evaluator/README.md
@@ -1,12 +1,12 @@
# CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy

## Introduction

Please refer to our [GitHub](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/Benchmarks/CC-OCR) for more information.

## Running Scripts

Once the environment is ready, execute the following script from the root directory of VLMEvalKit
Once the environment is ready, execute the following script from the root directory of VLMEvalKit
to perform inference and evaluation tasks in batch.

```shell
@@ -44,13 +44,13 @@ If you find our work helpful, feel free to give us a cite.

```
@misc{yang2024ccocr,
title={CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy},
title={CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating Large Multimodal Models in Literacy},
author={Zhibo Yang and Jun Tang and Zhaohai Li and Pengfei Wang and Jianqiang Wan and Humen Zhong and Xuejing Liu and Mingkun Yang and Peng Wang and Shuai Bai and LianWen Jin and Junyang Lin},
year={2024},
eprint={2412.02210},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2412.02210},
url={https://arxiv.org/abs/2412.02210},
}
```

2 changes: 1 addition & 1 deletion vlmeval/dataset/utils/ccocr_evaluator/__init__.py
@@ -9,4 +9,4 @@
"doc_parsing": ParsingEvaluator("doc_parsing"),
"multi_lan_ocr": OcrEvaluator("multi_lan_ocr"),
"multi_scene_ocr": OcrEvaluator("multi_scene_ocr")
}
}
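`evaluator_map_info` maps each CC-OCR subtask name to a ready-to-use evaluator instance. A hedged usage sketch, following the `__call__(pdt_res_dir, gt_info, ...)` signature shown in common.py further down (the prediction-directory layout and ground-truth dict are assumptions):

```python
# Hedged usage sketch for the registry above. The call follows the
# __call__(pdt_res_dir, gt_info, ...) signature shown in common.py below;
# the prediction directory layout is an assumption.
from vlmeval.dataset.utils.ccocr_evaluator import evaluator_map_info

evaluator = evaluator_map_info['multi_scene_ocr']
gt_info = {'image_001.jpg': 'ground-truth text'}  # placeholder ground truth

# meta_info, eval_info = evaluator('/path/to/prediction_dir', gt_info)
# print(eval_info.get('summary', {}))
```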
12 changes: 6 additions & 6 deletions vlmeval/dataset/utils/ccocr_evaluator/common.py
@@ -26,7 +26,7 @@ def pick_response_text(json_path):

response_text = None
if model_name.startswith("gpt") or model_name.startswith("o1"):
response_text = model_response.get("data", {}).get("response", {}).get("choices", [{}])[0].get("message", {}).get("content", None)
response_text = model_response.get("data", {}).get("response", {}).get("choices", [{}])[0].get("message", {}).get("content", None) # noqa: E501
elif model_name.startswith("local_"):
response_text = model_response
else:
@@ -35,7 +35,7 @@
elif model_name.startswith("gemini"):
content_list = model_response.get("candidates", [{}])[0].get("content", {}).get("parts", None)
elif model_name.startswith("qwen"):
content_list = model_response.get("output", {}).get("choices", [{}])[0].get("message", {}).get("content", None)
content_list = model_response.get("output", {}).get("choices", [{}])[0].get("message", {}).get("content", None) # noqa: E501
else:
raise NotImplementedError("The pick_response_text NOT implemented for model: {}".format(model_name))

@@ -115,7 +115,7 @@ def __call__(self, pdt_res_dir, gt_info, with_response_ratio=True, **kwargs):
# add response_success_ratio
if "summary" in eval_info and with_response_ratio:
success_ratio = (len(response_info) + len(post_error_list)) / (len(gt_info) + 1e-9)
eval_info["summary"].update({"response_success_ratio": success_ratio })
eval_info["summary"].update({"response_success_ratio": success_ratio})
return meta_info, eval_info


@@ -149,9 +149,9 @@ def summary_multi_exp(exp_dir_base, dataset_list=None, is_weighted_sum=False):
data_status_info = json.load(f)
all_dataset_name.extend(data_status_info.keys())
dataset_list = sorted(set(all_dataset_name))

# summary main code
all_evaluate_info, line_index = {}, 0
all_evaluate_info, _ = {}, 0
for exp_name in os.listdir(exp_dir_base):
dir_status_path = os.path.join(exp_dir_base, exp_name, "status.json")
if not os.path.exists(dir_status_path):
@@ -219,4 +219,4 @@ def summary_multi_exp(exp_dir_base, dataset_list=None, is_weighted_sum=False):

summary_path = summary_multi_exp(exp_base_dir, dataset_list=None, is_weighted_sum=False)
print("--> info: summary saved at : {}".format(summary_path))
print("happy coding.")
print("happy coding.")
@@ -241,7 +241,7 @@ def eval_formula(self, response_info, gt_info, op_name='formula'):
pred = response_info[img_name]

if op_name == 'formula':
pred = pred.replace("\n", " ").replace("```latex", "").replace("```", "").replace("\t", " ").replace(" ", "")
pred = pred.replace("\n", " ").replace("```latex", "").replace("```", "").replace("\t", " ").replace(" ", "") # noqa: E501
gt = gt.replace(" ", "")
elif op_name == 'molecular':
pred = pred.replace("\n", "").replace(" ", "").replace("<smiles>", "").replace("</smiles>", "")
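`eval_formula` strips newlines, tabs, Markdown latex code fences, and spaces from the prediction before comparing it to the ground truth, so only the LaTeX tokens themselves are matched. A small sketch of that normalization (the helper name is ours):

```python
# Normalization applied before comparing formula predictions, mirroring the
# replace-chain in eval_formula above. The helper name is ours.
def normalize_formula(text: str) -> str:
    return (
        text.replace("\n", " ")
            .replace("```latex", "")
            .replace("```", "")
            .replace("\t", " ")
            .replace(" ", "")
    )

pred = "```latex\n\\frac{a}{b} + c\n```"
gt = "\\frac{a}{b} + c"
print(normalize_formula(pred) == gt.replace(" ", ""))  # True
```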
61 changes: 25 additions & 36 deletions vlmeval/dataset/utils/ccocr_evaluator/kie_evaluator.py
@@ -66,11 +66,11 @@ def update_cost(node1: Node, node2: Node):
label2 = node2.label
label1_leaf = "<leaf>" in label1
label2_leaf = "<leaf>" in label2
if label1_leaf == True and label2_leaf == True:
if label1_leaf and label2_leaf:
return edit_distance(label1.replace("<leaf>", ""), label2.replace("<leaf>", ""))
elif label1_leaf == False and label2_leaf == True:
elif not label1_leaf and label2_leaf:
return 1 + len(label2.replace("<leaf>", ""))
elif label1_leaf == True and label2_leaf == False:
elif label1_leaf and not label2_leaf:
return 1 + len(label1.replace("<leaf>", ""))
else:
return int(label1 != label2)
@@ -121,7 +121,8 @@ def normalize_dict(data: Union[Dict, List, Any]):

def cal_f1_all(preds, answers):
"""
Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives, false negatives and false positives
Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives,
false negatives and false positives
"""
metric_info, error_info = {}, {}
total_tp, total_fn_or_fp = 0, 0
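The reworded docstring describes field-level, micro-averaged F1: true positives, false positives and false negatives are pooled across all samples before a single F1 is computed. A worked sketch of that definition (field matching is simplified to exact key-value pairs):

```python
# Field-level micro-averaged F1, illustrating the docstring above.
# Field matching is simplified to exact (key, value) pairs for this sketch.
def micro_f1(preds, golds):
    tp = fp = fn = 0
    for pred, gold in zip(preds, golds):
        pred_fields = set(pred.items())
        gold_fields = set(gold.items())
        tp += len(pred_fields & gold_fields)
        fp += len(pred_fields - gold_fields)
        fn += len(gold_fields - pred_fields)
    denom = 2 * tp + fp + fn
    return 2 * tp / denom if denom else 1.0

preds = [{'date': '2024-12-01', 'total': '42'}, {'date': '2024-11-30'}]
golds = [{'date': '2024-12-01', 'total': '40'}, {'date': '2024-11-30'}]
print(micro_f1(preds, golds))  # 2*2 / (2*2 + 1 + 1) = 0.666...
```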
@@ -233,35 +234,28 @@ def cal_acc(pred: dict, answer: dict):
"""
pred = construct_tree_from_dict(normalize_dict(pred))
answer = construct_tree_from_dict(normalize_dict(answer))
return max(
0,
1
- (
zss.distance(
pred,
answer,
get_children=zss.Node.get_children,
insert_cost=insert_and_remove_cost,
remove_cost=insert_and_remove_cost,
update_cost=update_cost,
return_operations=False,
)
/ zss.distance(
construct_tree_from_dict(normalize_dict({})),
answer,
get_children=zss.Node.get_children,
insert_cost=insert_and_remove_cost,
remove_cost=insert_and_remove_cost,
update_cost=update_cost,
return_operations=False,
)
),
val1 = zss.distance(
pred,
answer,
get_children=zss.Node.get_children,
insert_cost=insert_and_remove_cost,
remove_cost=insert_and_remove_cost,
update_cost=update_cost,
return_operations=False,
)
val2 = zss.distance(
construct_tree_from_dict(normalize_dict({})),
answer,
get_children=zss.Node.get_children,
insert_cost=insert_and_remove_cost,
remove_cost=insert_and_remove_cost,
update_cost=update_cost,
return_operations=False,
)
return max(0, 1 - val1 / val2)


def cal_acc_all(pred_info, answer_info):
"""
"""
acc_info, error_info = {}, {}
for file_name, answer in answer_info.items():
# if file_name not in pred_info:
@@ -303,13 +297,11 @@ def eval_donut(pdt_info, gt_info, normalize_func=None, data_name=None):
acc_average, acc_error_info = cal_acc_all(pdt_info, gt_info)
eval_info = {"f1_score": f1_score, "acc": acc_average, "class_f1_score": class_eval_info,
"f1_error_info": error_info, "acc_error_info": acc_error_info}
print(data_name, "f1_score", f1_score, "acc", acc_average)
print(data_name, "f1_score", f1_score, "acc", acc_average)
return eval_info


def post_process_to_json(qwen_info_str, file_name=None):
"""
"""
try:
if "```json" in qwen_info_str:
if "```" not in qwen_info_str:
@@ -320,10 +312,7 @@ def post_process_to_json(qwen_info_str, file_name=None):
json_str = qwen_info_str.strip().replace("\n", "")
json_data = json.loads(json_str)
return json_data
except Exception as e:
# print("--> post error: {}, file_name: {}".format(e, file_name))
# print("json_raw", qwen_info_str)
# print("json_str", json_str)
except Exception as err: # noqa: F841
return None


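The refactored `cal_acc` computes a normalized tree edit distance: the zss distance between the prediction and answer trees, divided by the distance between an empty tree and the answer, clamped at zero. A self-contained sketch of that normalization using `zss.simple_distance` with its default unit costs (the toy trees are ours; kie_evaluator uses custom character-level leaf costs, so its values differ):

```python
# Normalized tree edit distance, mirroring the refactored cal_acc above:
#   score = max(0, 1 - d(pred, answer) / d(empty, answer))
# Toy trees are ours; the real code builds trees from nested dicts and uses
# custom insert/remove/update costs rather than simple_distance's defaults.
from zss import Node, simple_distance

answer = Node('root', [Node('total', [Node('<leaf>42')])])
pred = Node('root', [Node('total', [Node('<leaf>41')])])
empty = Node('root')

score = max(0.0, 1 - simple_distance(pred, answer) / simple_distance(empty, answer))
print(score)  # 0.5 with unit costs: one relabel vs. two insertions
```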
6 changes: 4 additions & 2 deletions vlmeval/dataset/utils/ccocr_evaluator/ocr_evaluator.py
@@ -92,8 +92,10 @@ def evaluate(self, response_info, gt_info, **kwargs):
image_pdt_info, image_gt_info = {}, {}
for file_name, gt_src in gt_info.items():
pred_src = response_info.get(file_name, "")
pdt_token_list = text_normalize_and_tokenize(str(pred_src).strip(), is_word_level, is_lower, is_alphanum_only)
gt_token_list = text_normalize_and_tokenize(str(gt_src).strip(), is_word_level, is_lower, is_alphanum_only)
pdt_token_list = text_normalize_and_tokenize(
str(pred_src).strip(), is_word_level, is_lower, is_alphanum_only)
gt_token_list = text_normalize_and_tokenize(
str(gt_src).strip(), is_word_level, is_lower, is_alphanum_only)
image_pdt_info[file_name] = pdt_token_list
image_gt_info[file_name] = gt_token_list
eval_result = calculate_metrics(image_pdt_info, image_gt_info, is_verbose=False)
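The wrapped calls above normalize and tokenize both prediction and ground truth before metrics are computed. A hedged sketch of what such a step typically does, inferred from the flags `is_word_level`, `is_lower`, and `is_alphanum_only` (the helper below is an illustration; the real `text_normalize_and_tokenize` may differ):

```python
# Illustration of normalize-then-tokenize for OCR scoring. This helper is an
# assumption sketched from the flag names above (is_word_level, is_lower,
# is_alphanum_only); the real text_normalize_and_tokenize may differ.
import re

def normalize_and_tokenize(text, is_word_level=True, is_lower=True, is_alphanum_only=False):
    if is_lower:
        text = text.lower()
    if is_alphanum_only:
        text = re.sub(r'[^0-9a-zA-Z\s]', ' ', text)
    # Word-level: split on whitespace; character-level: one token per character.
    return text.split() if is_word_level else list(text.replace(' ', ''))

print(normalize_and_tokenize('Total: 42 USD'))                         # ['total:', '42', 'usd']
print(normalize_and_tokenize('Total: 42 USD', is_alphanum_only=True))  # ['total', '42', 'usd']
```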