Merged
5 changes: 1 addition & 4 deletions docs/preset_configs.py
@@ -60,10 +60,7 @@ def _get_level_prefix(level: int) -> str:
 with mkdocs_gen_files.open(Path("preset_configs") / "index.md", "w") as fd:
     fd.write("# Preset Configs\n")
     fd.write(
-        "You can check the config using the following command:\n"
-        "```bash\n"
-        "flexeval_presets <config_name>\n"
-        "```\n",
+        "You can check the config using the following command:\n```bash\nflexeval_presets <config_name>\n```\n",
     )
     fd.write(_nested_dict_to_markdown(all_pages))

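This change only collapses implicitly concatenated string literals into a single literal. Adjacent literals are joined at compile time, so the two spellings are equivalent; a minimal sketch of that equivalence (variable names are illustrative, not from the repo):

```python
multi = (
    "You can check the config using the following command:\n"
    "```bash\n"
)
single = "You can check the config using the following command:\n```bash\n"
assert multi == single  # adjacent literals are concatenated at compile time
```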
4 changes: 1 addition & 3 deletions examples/format_following/src/ifeval/combination.py
@@ -31,9 +31,7 @@ def __init__(self, prompt_to_repeat: str) -> None:
         self.prompt_to_repeat = prompt_to_repeat

     def check(self, response: str) -> bool:
-        if response.strip().lower().startswith(self.prompt_to_repeat.strip().lower()):
-            return True
-        return False
+        return response.strip().lower().startswith(self.prompt_to_repeat.strip().lower())


 class TwoResponses(ResponseConstraint):
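Collapsing `if cond: return True / return False` into `return cond` is the simplification Ruff's SIM103 (needless-bool) autofixes; the same pattern shows up again in pairwise_judge_reward_model.py below. A minimal sketch:

```python
def is_even_verbose(n: int) -> bool:
    if n % 2 == 0:  # flagged by SIM103: the if only wraps a boolean
        return True
    return False

def is_even(n: int) -> bool:
    return n % 2 == 0  # return the condition directly

assert is_even_verbose(4) == is_even(4)
```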
2 changes: 1 addition & 1 deletion examples/format_following/src/ifeval/length_constraints.py
@@ -130,7 +130,7 @@ def check(self, response: str) -> bool:
         return False


-@functools.lru_cache(maxsize=None)
+@functools.lru_cache
 def _get_sentence_tokenizer() -> nltk.tokenize.punkt.PunktSentenceTokenizer:
     nltk.download("punkt_tab", quiet=True)
     return nltk.data.load("nltk:tokenizers/punkt/english.pickle")
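One subtlety worth noting: the bare decorator form (allowed since Python 3.8) uses the default maxsize=128, while maxsize=None is unbounded, so the two are not equivalent in general. For a zero-argument function like _get_sentence_tokenizer the cache holds at most one entry either way, so behavior is unchanged here. A sketch with a hypothetical loader:

```python
import functools

@functools.lru_cache  # bare form, Python 3.8+; default maxsize=128
def load_tokenizer() -> dict:  # hypothetical zero-argument loader
    return {"loaded": True}

# A zero-argument function has a single cache key, so maxsize is moot here.
# The truly unbounded spellings, for reference:
#   @functools.lru_cache(maxsize=None)
#   @functools.cache  # Python 3.9+ alias for the line above
assert load_tokenizer() is load_tokenizer()  # cached: same object returned
```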
@@ -36,11 +36,11 @@ def evaluate(
     ) -> MetricResult:
         is_check_passed_list: list[list[bool]] = []
         is_check_passed_per_checker = defaultdict(list)
-        for lm_output, extra_info in zip(lm_outputs, extra_info_list):
+        for lm_output, extra_info in zip(lm_outputs, extra_info_list, strict=True):
             constraints = [self._instantiate_checker_from_params(params) for params in extra_info["constraints"]]
             is_check_passed = [checker.check(lm_output) for checker in constraints]
             is_check_passed_list.append(is_check_passed)
-            for checker, is_passed in zip(constraints, is_check_passed):
+            for checker, is_passed in zip(constraints, is_check_passed, strict=True):
                 is_check_passed_per_checker[checker.__class__.__name__].append(is_passed)

         num_items = len(is_check_passed_list)
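The strict=True flag (Python 3.10+) makes zip raise when its inputs differ in length instead of silently truncating to the shortest, which is the failure mode this change guards against. A minimal sketch:

```python
# Without strict, a length mismatch is silently swallowed:
assert list(zip([1, 2, 3], ["a", "b"])) == [(1, "a"), (2, "b")]

# With strict=True the mismatch surfaces as an error:
try:
    list(zip([1, 2, 3], ["a", "b"], strict=True))
except ValueError:
    pass  # e.g. "zip() argument 2 is shorter than argument 1"
```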
2 changes: 1 addition & 1 deletion examples/format_following/src/multilingual_ifeval/ja.py
@@ -739,7 +739,7 @@ class KatakanaOnly(ResponseConstraint):

     def check(self, response: str) -> bool:
         def is_katakana(char: str) -> bool:
-            return "ァ" <= char <= "ン" or char == "ー" or char == "・" or "ヲ" <= char <= "゚"
+            return "ァ" <= char <= "ン" or char in {"ー", "・"} or "ヲ" <= char <= "゚"

         def is_ignorable(char: str) -> bool:
             return not unicodedata.category(char).startswith("L")
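Merging repeated equality tests against one variable into a set-membership test is the rewrite Ruff's PLR1714 suggests; in CPython a constant set literal in an `in` test is also folded into a frozenset at compile time, so nothing is rebuilt per call. A minimal sketch:

```python
char = "ー"
old_style = char == "ー" or char == "・"
new_style = char in {"ー", "・"}  # CPython folds this literal to a frozenset
assert old_style == new_style
```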
3 changes: 2 additions & 1 deletion flexeval/core/chat_dataset/base.py
@@ -2,8 +2,9 @@

 import warnings
 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from dataclasses import dataclass, field
-from typing import Any, Sequence
+from typing import Any


 @dataclass
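This import move recurs across most files in the PR: since Python 3.9 the generic container ABCs (Sequence, Iterable, Iterator, Callable) should come from collections.abc, and the typing aliases are deprecated (PEP 585; Ruff's UP035 flags them). A minimal sketch of the before/after:

```python
# Deprecated spelling (still works, but flagged by UP035):
#   from typing import Callable, Sequence

from collections.abc import Callable, Sequence

def apply_all(fns: Sequence[Callable[[int], int]], x: int) -> int:
    for fn in fns:
        x = fn(x)
    return x

assert apply_all([lambda n: n + 1, lambda n: n * 2], 3) == 8
```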
3 changes: 2 additions & 1 deletion flexeval/core/chat_dataset/openai_messages.py
@@ -1,7 +1,8 @@
 from __future__ import annotations

 import json
-from typing import Any, Iterator
+from collections.abc import Iterator
+from typing import Any

 from .base import ChatDataset, ChatInstance
5 changes: 2 additions & 3 deletions flexeval/core/chat_dataset/template_based.py
@@ -8,7 +8,7 @@

 import datasets
 from jinja2 import Template
-from smart_open import open
+from smart_open import open  # noqa: A004

 from flexeval.core.utils.jinja2_utils import JINJA2_ENV

@@ -120,8 +120,7 @@ def __getitem__(self, i: int) -> ChatInstance:
         reference_list_string = self.reference_list_template.render(**item)
         if not (reference_list_string.startswith("[") and reference_list_string.endswith("]")):
             msg = (
-                f"The reference_list_template should render a list of strings "
-                f"but we got `{reference_list_string}`."
+                f"The reference_list_template should render a list of strings but we got `{reference_list_string}`."
             )
             raise ValueError(msg)
         reference_list.extend([str(ref) for ref in literal_eval(reference_list_string)])
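The new suppression is Ruff's A004 (builtin-import-shadowing): `from smart_open import open` rebinds the built-in open for the whole module, which is intentional here since smart_open's open is a drop-in superset. An alias would avoid the shadowing entirely; a sketch of that alternative (the path is hypothetical):

```python
from smart_open import open as smart_open_file  # no builtin shadowed

# Behaves like the regular open for local files, and also accepts remote URIs:
with smart_open_file("data/example.txt", "w") as f:
    f.write("hello\n")
```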
3 changes: 2 additions & 1 deletion flexeval/core/evaluate_from_data.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

-from typing import Any, Iterable
+from collections.abc import Iterable
+from typing import Any

 from loguru import logger
3 changes: 2 additions & 1 deletion flexeval/core/evaluate_generation.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

-from typing import Any, Sequence
+from collections.abc import Sequence
+from typing import Any

 from loguru import logger
 from tqdm import tqdm
2 changes: 1 addition & 1 deletion flexeval/core/evaluate_multiple_choice.py
@@ -59,7 +59,7 @@ def evaluate_multiple_choice(
         if batch_id == 0:
             logger.info("Example of the model inputs and outputs:")
             logger.info(f"prefix: {batch_prefixes[0]}")
-            logger.info(f"choices: {batch_choices[:len(eval_instance.choices)]}")
+            logger.info(f"choices: {batch_choices[: len(eval_instance.choices)]}")

         batch_log_probs = language_model.compute_log_probs(
             text_list=batch_choices,
2 changes: 1 addition & 1 deletion flexeval/core/evaluate_perplexity.py
@@ -2,7 +2,7 @@

 import math
 from collections import defaultdict
-from typing import Sequence
+from collections.abc import Sequence

 from loguru import logger
 from tqdm import tqdm
3 changes: 2 additions & 1 deletion flexeval/core/evaluate_reward_model.py
@@ -1,7 +1,8 @@
 from __future__ import annotations

 from collections import defaultdict
-from typing import Any, Sequence
+from collections.abc import Sequence
+from typing import Any

 from loguru import logger
 from tqdm import tqdm
6 changes: 3 additions & 3 deletions flexeval/core/few_shot_generator/base.py
@@ -1,14 +1,14 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
-from typing import Any, Union
+from typing import Any

 from flexeval.core.chat_dataset import ChatDataset, ChatInstance
 from flexeval.core.generation_dataset import GenerationDataset, GenerationInstance
 from flexeval.core.multiple_choice_dataset import MultipleChoiceDataset, MultipleChoiceInstance

-Dataset = Union[GenerationDataset, MultipleChoiceDataset, ChatDataset]
-Instance = Union[GenerationInstance, MultipleChoiceInstance, ChatInstance]
+Dataset = GenerationDataset | MultipleChoiceDataset | ChatDataset
+Instance = GenerationInstance | MultipleChoiceInstance | ChatInstance


 class FewShotGenerator(ABC):
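The alias rewrite uses PEP 604 union syntax. With `from __future__ import annotations`, `X | Y` inside an annotation is fine on older interpreters because annotations are not evaluated, but module-level aliases like Dataset are evaluated eagerly and therefore need Python 3.10+. A minimal sketch:

```python
from __future__ import annotations

def size(x: int | str) -> int:  # lazily evaluated: parses even pre-3.10
    return x if isinstance(x, int) else len(x)

Number = int | float  # eagerly evaluated: requires Python 3.10+

assert size("abc") == 3 and isinstance(1.5, Number)
```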
3 changes: 2 additions & 1 deletion flexeval/core/generation_dataset/base.py
@@ -1,8 +1,9 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from dataclasses import dataclass, field
-from typing import Any, Sequence
+from typing import Any


 @dataclass
5 changes: 2 additions & 3 deletions flexeval/core/generation_dataset/template_based.py
@@ -6,7 +6,7 @@

 import datasets
 from jinja2 import Template
-from smart_open import open
+from smart_open import open  # noqa: A004

 from flexeval.core.utils.jinja2_utils import JINJA2_ENV

@@ -82,8 +82,7 @@ def __getitem__(self, i: int) -> GenerationInstance:
         reference_list_string = self.reference_list_template.render(**item)
         if not (reference_list_string.startswith("[") and reference_list_string.endswith("]")):
             msg = (
-                f"The reference_list_template should render a list of strings "
-                f"but we got `{reference_list_string}`."
+                f"The reference_list_template should render a list of strings but we got `{reference_list_string}`."
             )
             raise ValueError(msg)
         reference_list.extend([str(ref) for ref in literal_eval(reference_list_string)])
3 changes: 2 additions & 1 deletion flexeval/core/language_model/hf_lm.py
@@ -4,7 +4,8 @@

 import copy
 import gc
 import json
-from typing import Any, Callable, Literal, TypeVar
+from collections.abc import Callable
+from typing import Any, Literal, TypeVar

 import torch
 import torch.nn.functional as F  # noqa: N812
3 changes: 2 additions & 1 deletion flexeval/core/language_model/openai_api.py
@@ -3,8 +3,9 @@

 import itertools
 import os
 import time
+from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Callable, TypeVar
+from typing import Any, TypeVar

 import openai
 import tiktoken
6 changes: 3 additions & 3 deletions flexeval/core/language_model/openai_batch_api.py
@@ -84,7 +84,7 @@ def __init__(
         # convert the flexeval-specific argument name to the OpenAI-specific name
         if "max_new_tokens" in self.default_gen_kwargs:
             self.default_gen_kwargs["max_completion_tokens"] = self.default_gen_kwargs.pop("max_new_tokens")
-        self.temp_jsonl_file = tempfile.NamedTemporaryFile(delete=False, suffix=".jsonl")
+        self.temp_jsonl_file = tempfile.NamedTemporaryFile(delete=False, suffix=".jsonl")  # noqa: SIM115

         self.polling_interval_seconds = polling_interval_seconds
         self.developer_message = developer_message

@@ -146,7 +146,7 @@ async def _post_batch_requests(
         self.create_batch_file(custom_id_2_input, **gen_kwargs)

         # Update batch file
-        with open(self.temp_jsonl_file.name, "rb") as batch_file:  # noqa: ASYNC101
+        with open(self.temp_jsonl_file.name, "rb") as batch_file:  # noqa: ASYNC230
            batch_input_file = await self._client.files.create(file=batch_file, purpose="batch")

         # Run Job

@@ -190,7 +190,7 @@ def _execute_batch_requests(  # noqa: C901
             for messages, tools in zip(messages_list, tools_list)
         }
         # The response will be an empty string if the API produces an error.
-        custom_id_2_response: dict[str, str | list[dict[str, Any]]] = {custom_id: "" for custom_id in custom_id_2_input}
+        custom_id_2_response: dict[str, str | list[dict[str, Any]]] = dict.fromkeys(custom_id_2_input, "")
         exec_cnt = 1

         while len(custom_id_2_input) > 0:
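Three mechanical cleanups here: SIM115 is suppressed because the NamedTemporaryFile is deliberately kept open outside a context manager, the ASYNC101 code appears to have been renamed to ASYNC230 in newer Ruff (blocking open call in an async function), and the comprehension assigning "" to every key becomes dict.fromkeys. The fromkeys form is safe here because the shared default is an immutable string; a sketch of the equivalence and the one caveat:

```python
keys = ["id-1", "id-2"]
assert dict.fromkeys(keys, "") == {k: "" for k in keys}

# Caveat: fromkeys stores the *same* value object under every key,
# so a mutable default would be shared across keys:
shared = dict.fromkeys(keys, [])
shared["id-1"].append(1)
assert shared["id-2"] == [1]  # both keys point at one list
```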
3 changes: 2 additions & 1 deletion flexeval/core/language_model/vllm_model.py
@@ -1,7 +1,8 @@
 from __future__ import annotations

 import time
-from typing import TYPE_CHECKING, Any, Callable, Literal
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any, Literal

 import torch
 from loguru import logger
3 changes: 2 additions & 1 deletion flexeval/core/language_model/vllm_serve_lm.py
@@ -6,7 +6,8 @@

 import subprocess
 import threading
 import time
-from typing import IO, Any, Callable
+from collections.abc import Callable
+from typing import IO, Any

 import requests
 import torch
2 changes: 1 addition & 1 deletion flexeval/core/metric/llm_label.py
@@ -79,7 +79,7 @@ def summarize_evaluator_labels(
         category2mean_score[category] = score
         category2dist[category] = calc_label_dist(valid_labels, label_names)

-    for category in category2mean_score:
+    for category in category2mean_score:  # noqa: PLC0206
         summary[f"{score_key}/{category}"] = category2mean_score[category]
         summary[f"{dist_key}/{category}"] = category2dist[category]
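PLC0206 (dict-index-missing-items) flags loops that iterate a dict's keys and then index back into it, suggesting .items() instead. Here the body also indexes a second dict (category2dist) by the same key, which .items() cannot cover, hence the suppression rather than a rewrite. A sketch of what the rule wants in the simple case:

```python
scores = {"math": 0.9, "code": 0.8}

# Flagged by PLC0206:
flagged = [f"{name}={scores[name]}" for name in scores]

# Suggested form:
suggested = [f"{name}={value}" for name, value in scores.items()]

assert flagged == suggested
```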
5 changes: 3 additions & 2 deletions flexeval/core/metric/perspective_api.py
@@ -2,7 +2,8 @@

 import os
 import time
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any

 import numpy as np
 from googleapiclient import discovery

@@ -83,7 +84,7 @@ def evaluate(
         instance_details = []
         for lm_output in lm_outputs:
             if lm_output == "":
-                instance_details.append({att: 0.0 for att in self.attributes})
+                instance_details.append(dict.fromkeys(self.attributes, 0.0))
                 continue
             analyze_request = {
                 "comment": {"text": lm_output},
2 changes: 1 addition & 1 deletion flexeval/core/metric/substring_match.py
@@ -59,7 +59,7 @@ def evaluate(
         ]

         score = 0.0
-        if len(match_list):
+        if match_list:
             score = sum(match_list) / len(match_list)

         summary = {f"substring_match-{self.mode}": score}
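`if match_list:` is the PEP 8 emptiness test: an empty sequence is already falsy, so calling len() first adds nothing. A two-line sketch:

```python
for sample in ([], [True, False]):
    assert bool(sample) == bool(len(sample))  # empty -> falsy either way
```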
2 changes: 1 addition & 1 deletion flexeval/core/multiple_choice_dataset/base.py
@@ -1,8 +1,8 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Sequence


 @dataclass
2 changes: 1 addition & 1 deletion flexeval/core/multiple_choice_dataset/template_based.py
@@ -78,7 +78,7 @@ def __getitem__(self, i: int) -> MultipleChoiceInstance:

         answer_index = int(self.answer_index_template.render(**item))
         if not (answer_index >= 0 and answer_index < len(choices)):
-            msg = f"at least {answer_index+1} choices required, but got {choices}"
+            msg = f"at least {answer_index + 1} choices required, but got {choices}"
             raise ValueError(msg)

         return MultipleChoiceInstance(
@@ -1,7 +1,7 @@
 from __future__ import annotations

 import itertools
-from typing import Iterable
+from collections.abc import Iterable

 from flexeval.core.pairwise_comparison.match import Match
3 changes: 2 additions & 1 deletion flexeval/core/pairwise_comparison/match_maker/base.py
@@ -1,7 +1,8 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
-from typing import Iterable, TypeVar
+from collections.abc import Iterable
+from typing import TypeVar

 from flexeval.core.pairwise_comparison.match import Match
@@ -2,7 +2,7 @@

 import itertools
 import random
-from typing import Iterable
+from collections.abc import Iterable

 from flexeval.core.pairwise_comparison.match import Match

@@ -25,7 +25,7 @@ def generate_matches(

         cached_matches = cached_matches or []
         cache_dict = {match.get_key_for_cache(): match for match in cached_matches}
-        model_match_counter: dict[str, int] = {name: 0 for name in model_names}
+        model_match_counter: dict[str, int] = dict.fromkeys(model_names, 0)
         possible_new_matches: list[Match] = []
         matches: list[Match] = []
         for m1, m2 in all_permutations:
2 changes: 1 addition & 1 deletion flexeval/core/pairwise_comparison/scorer/win_rate.py
@@ -30,7 +30,7 @@ def compute_scores(
                 win_count_dict[model2] += 0.5

         win_rate_dict = {}
-        for model in match_count_dict:
+        for model in match_count_dict:  # noqa: PLC0206
             win_rate_dict[model] = 100 * win_count_dict.get(model, 0.0) / match_count_dict[model]

         return dict(sorted(win_rate_dict.items(), key=lambda x: -x[1]))
3 changes: 2 additions & 1 deletion flexeval/core/reward_bench_dataset/base.py
@@ -1,8 +1,9 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from dataclasses import dataclass, field
-from typing import Any, Sequence
+from typing import Any


 @dataclass
2 changes: 1 addition & 1 deletion flexeval/core/reward_bench_dataset/template_based.py
@@ -5,7 +5,7 @@

 import datasets
 from jinja2 import Template
-from smart_open import open
+from smart_open import open  # noqa: A004

 from flexeval.core.utils.jinja2_utils import JINJA2_ENV
4 changes: 1 addition & 3 deletions flexeval/core/reward_model/pairwise_judge_reward_model.py
@@ -35,9 +35,7 @@ def evaluate_model_output(model_output: str, gold_label: PairwiseChoice) -> bool:
         return False

     # If only gold label is in model output, then output is **correct**
-    if gold_label.value in model_output:
-        return True
-    return False
+    return gold_label.value in model_output


 def aggregate_judge_results(
2 changes: 1 addition & 1 deletion flexeval/core/text_dataset/base.py
@@ -1,8 +1,8 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Sequence


 @dataclass