[None][test] Add longbench v2 for long context evaluation (#8604)

baize97 · web-flow · commit 0019d99e6d15 · 2025-10-27T20:01:14.000+08:00
Signed-off-by: mni <125171826+baize97@users.noreply.github.com>
diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py
@@ -21,7 +21,7 @@
 from .. import LLM as PyTorchLLM
 from .._tensorrt_engine import LLM
 from ..evaluate import (GSM8K, MMLU, MMMU, CnnDailymail, GPQADiamond,
-                        GPQAExtended, GPQAMain, JsonModeEval)
+                        GPQAExtended, GPQAMain, JsonModeEval, LongBenchV2)
 from ..llmapi import BuildConfig, KvCacheConfig
 from ..llmapi.llm_utils import update_llm_args_with_extra_options
 from ..logger import logger, severity_map
@@ -159,6 +159,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str,
 main.add_command(GPQAExtended.command)
 main.add_command(JsonModeEval.command)
 main.add_command(MMMU.command)
+main.add_command(LongBenchV2.command)
 
 if __name__ == "__main__":
     main()
diff --git a/tensorrt_llm/evaluate/__init__.py b/tensorrt_llm/evaluate/__init__.py
@@ -16,9 +16,10 @@
 from .cnn_dailymail import CnnDailymail
 from .json_mode_eval import JsonModeEval
 from .lm_eval import GSM8K, MMMU, GPQADiamond, GPQAExtended, GPQAMain
+from .longbench_v2 import LongBenchV2
 from .mmlu import MMLU
 
 __all__ = [
     "CnnDailymail", "MMLU", "GSM8K", "GPQADiamond", "GPQAMain", "GPQAExtended",
-    "JsonModeEval", "MMMU"
+    "JsonModeEval", "MMMU", "LongBenchV2"
 ]
diff --git a/tensorrt_llm/evaluate/longbench_v2.py b/tensorrt_llm/evaluate/longbench_v2.py