Merged
5 changes: 1 addition & 4 deletions docs/preset_configs.py
@@ -60,10 +60,7 @@ def _get_level_prefix(level: int) -> str:
 with mkdocs_gen_files.open(Path("preset_configs") / "index.md", "w") as fd:
     fd.write("# Preset Configs\n")
     fd.write(
-        "You can check the config using the following command:\n"
-        "```bash\n"
-        "flexeval_presets <config_name>\n"
-        "```\n",
+        "You can check the config using the following command:\n```bash\nflexeval_presets <config_name>\n```\n",
     )
     fd.write(_nested_dict_to_markdown(all_pages))

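This change only collapses implicitly concatenated string literals into a single literal. Adjacent literals are joined at compile time, so the two spellings are equivalent; a minimal sketch of that equivalence (variable names are illustrative, not from the repo):

```python
multi = (
    "You can check the config using the following command:\n"
    "```bash\n"
)
single = "You can check the config using the following command:\n```bash\n"
assert multi == single  # adjacent literals are concatenated at compile time
```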
4 changes: 1 addition & 3 deletions examples/format_following/src/ifeval/combination.py
@@ -31,9 +31,7 @@ def __init__(self, prompt_to_repeat: str) -> None:
         self.prompt_to_repeat = prompt_to_repeat

     def check(self, response: str) -> bool:
-        if response.strip().lower().startswith(self.prompt_to_repeat.strip().lower()):
-            return True
-        return False
+        return response.strip().lower().startswith(self.prompt_to_repeat.strip().lower())


 class TwoResponses(ResponseConstraint):
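Collapsing `if cond: return True / return False` into `return cond` is the simplification Ruff's SIM103 (needless-bool) autofixes; the same pattern shows up again in pairwise_judge_reward_model.py below. A minimal sketch:

```python
def is_even_verbose(n: int) -> bool:
    if n % 2 == 0:  # flagged by SIM103: the if only wraps a boolean
        return True
    return False

def is_even(n: int) -> bool:
    return n % 2 == 0  # return the condition directly

assert is_even_verbose(4) == is_even(4)
```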
2 changes: 1 addition & 1 deletion examples/format_following/src/ifeval/length_constraints.py
@@ -130,7 +130,7 @@ def check(self, response: str) -> bool:
         return False


-@functools.lru_cache(maxsize=None)
+@functools.lru_cache
 def _get_sentence_tokenizer() -> nltk.tokenize.punkt.PunktSentenceTokenizer:
     nltk.download("punkt_tab", quiet=True)
     return nltk.data.load("nltk:tokenizers/punkt/english.pickle")
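One subtlety worth noting: the bare decorator form (allowed since Python 3.8) uses the default maxsize=128, while maxsize=None is unbounded, so the two are not equivalent in general. For a zero-argument function like _get_sentence_tokenizer the cache holds at most one entry either way, so behavior is unchanged here. A sketch with a hypothetical loader:

```python
import functools

@functools.lru_cache  # bare form, Python 3.8+; default maxsize=128
def load_tokenizer() -> dict:  # hypothetical zero-argument loader
    return {"loaded": True}

# A zero-argument function has a single cache key, so maxsize is moot here.
# The truly unbounded spellings, for reference:
#   @functools.lru_cache(maxsize=None)
#   @functools.cache  # Python 3.9+ alias for the line above
assert load_tokenizer() is load_tokenizer()  # cached: same object returned
```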
@@ -36,11 +36,11 @@ def evaluate(
     ) -> MetricResult:
         is_check_passed_list: list[list[bool]] = []
         is_check_passed_per_checker = defaultdict(list)
-        for lm_output, extra_info in zip(lm_outputs, extra_info_list):
+        for lm_output, extra_info in zip(lm_outputs, extra_info_list, strict=True):
             constraints = [self._instantiate_checker_from_params(params) for params in extra_info["constraints"]]
             is_check_passed = [checker.check(lm_output) for checker in constraints]
             is_check_passed_list.append(is_check_passed)
-            for checker, is_passed in zip(constraints, is_check_passed):
+            for checker, is_passed in zip(constraints, is_check_passed, strict=True):
                 is_check_passed_per_checker[checker.__class__.__name__].append(is_passed)

         num_items = len(is_check_passed_list)
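The strict=True flag (Python 3.10+) makes zip raise when its inputs differ in length instead of silently truncating to the shortest, which is the failure mode this change guards against. A minimal sketch:

```python
# Without strict, a length mismatch is silently swallowed:
assert list(zip([1, 2, 3], ["a", "b"])) == [(1, "a"), (2, "b")]

# With strict=True the mismatch surfaces as an error:
try:
    list(zip([1, 2, 3], ["a", "b"], strict=True))
except ValueError:
    pass  # e.g. "zip() argument 2 is shorter than argument 1"
```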
2 changes: 1 addition & 1 deletion examples/format_following/src/multilingual_ifeval/ja.py
@@ -739,7 +739,7 @@ class KatakanaOnly(ResponseConstraint):

     def check(self, response: str) -> bool:
         def is_katakana(char: str) -> bool:
-            return "ァ" <= char <= "ン" or char == "ー" or char == "・" or "ヲ" <= char <= "゚"
+            return "ァ" <= char <= "ン" or char in {"ー", "・"} or "ヲ" <= char <= "゚"

         def is_ignorable(char: str) -> bool:
             return not unicodedata.category(char).startswith("L")
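Merging repeated equality tests against one variable into a set-membership test is the rewrite Ruff's PLR1714 suggests; in CPython a constant set literal in an `in` test is also folded into a frozenset at compile time, so nothing is rebuilt per call. A minimal sketch:

```python
char = "ー"
old_style = char == "ー" or char == "・"
new_style = char in {"ー", "・"}  # CPython folds this literal to a frozenset
assert old_style == new_style
```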
3 changes: 2 additions & 1 deletion flexeval/core/chat_dataset/base.py
@@ -2,8 +2,9 @@

 import warnings
 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from dataclasses import dataclass, field
-from typing import Any, Sequence
+from typing import Any


 @dataclass
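This import move recurs across most files in the PR: since Python 3.9 the generic container ABCs (Sequence, Iterable, Iterator, Callable) should come from collections.abc, and the typing aliases are deprecated (PEP 585; Ruff's UP035 flags them). A minimal sketch of the before/after:

```python
# Deprecated spelling (still works, but flagged by UP035):
#   from typing import Callable, Sequence

from collections.abc import Callable, Sequence

def apply_all(fns: Sequence[Callable[[int], int]], x: int) -> int:
    for fn in fns:
        x = fn(x)
    return x

assert apply_all([lambda n: n + 1, lambda n: n * 2], 3) == 8
```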
3 changes: 2 additions & 1 deletion flexeval/core/chat_dataset/openai_messages.py
@@ -1,7 +1,8 @@
 from __future__ import annotations

 import json
-from typing import Any, Iterator
+from collections.abc import Iterator
+from typing import Any

 from .base import ChatDataset, ChatInstance
5 changes: 2 additions & 3 deletions flexeval/core/chat_dataset/template_based.py
@@ -8,7 +8,7 @@

 import datasets
 from jinja2 import Template
-from smart_open import open
+from smart_open import open  # noqa: A004

 from flexeval.core.utils.jinja2_utils import JINJA2_ENV

@@ -120,8 +120,7 @@ def __getitem__(self, i: int) -> ChatInstance:
         reference_list_string = self.reference_list_template.render(**item)
         if not (reference_list_string.startswith("[") and reference_list_string.endswith("]")):
             msg = (
-                f"The reference_list_template should render a list of strings "
-                f"but we got `{reference_list_string}`."
+                f"The reference_list_template should render a list of strings but we got `{reference_list_string}`."
             )
             raise ValueError(msg)
         reference_list.extend([str(ref) for ref in literal_eval(reference_list_string)])
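The new suppression is Ruff's A004 (builtin-import-shadowing): `from smart_open import open` rebinds the built-in open for the whole module, which is intentional here since smart_open's open is a drop-in superset. An alias would avoid the shadowing entirely; a sketch of that alternative (the path is hypothetical):

```python
from smart_open import open as smart_open_file  # no builtin shadowed

# Behaves like the regular open for local files, and also accepts remote URIs:
with smart_open_file("data/example.txt", "w") as f:
    f.write("hello\n")
```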
3 changes: 2 additions & 1 deletion flexeval/core/evaluate_from_data.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

-from typing import Any, Iterable
+from collections.abc import Iterable
+from typing import Any

 from loguru import logger
3 changes: 2 additions & 1 deletion flexeval/core/evaluate_generation.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

-from typing import Any, Sequence
+from collections.abc import Sequence
+from typing import Any

 from loguru import logger
 from tqdm import tqdm
2 changes: 1 addition & 1 deletion flexeval/core/evaluate_multiple_choice.py
@@ -59,7 +59,7 @@ def evaluate_multiple_choice(
         if batch_id == 0:
             logger.info("Example of the model inputs and outputs:")
             logger.info(f"prefix: {batch_prefixes[0]}")
-            logger.info(f"choices: {batch_choices[:len(eval_instance.choices)]}")
+            logger.info(f"choices: {batch_choices[: len(eval_instance.choices)]}")

         batch_log_probs = language_model.compute_log_probs(
             text_list=batch_choices,
2 changes: 1 addition & 1 deletion flexeval/core/evaluate_perplexity.py
@@ -2,7 +2,7 @@

 import math
 from collections import defaultdict
-from typing import Sequence
+from collections.abc import Sequence

 from loguru import logger
 from tqdm import tqdm
3 changes: 2 additions & 1 deletion flexeval/core/evaluate_reward_model.py
@@ -1,7 +1,8 @@
 from __future__ import annotations

 from collections import defaultdict
-from typing import Any, Sequence
+from collections.abc import Sequence
+from typing import Any

 from loguru import logger
 from tqdm import tqdm
6 changes: 3 additions & 3 deletions flexeval/core/few_shot_generator/base.py
@@ -1,14 +1,14 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
-from typing import Any, Union
+from typing import Any

 from flexeval.core.chat_dataset import ChatDataset, ChatInstance
 from flexeval.core.generation_dataset import GenerationDataset, GenerationInstance
 from flexeval.core.multiple_choice_dataset import MultipleChoiceDataset, MultipleChoiceInstance

-Dataset = Union[GenerationDataset, MultipleChoiceDataset, ChatDataset]
-Instance = Union[GenerationInstance, MultipleChoiceInstance, ChatInstance]
+Dataset = GenerationDataset | MultipleChoiceDataset | ChatDataset
+Instance = GenerationInstance | MultipleChoiceInstance | ChatInstance


 class FewShotGenerator(ABC):
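The alias rewrite uses PEP 604 union syntax. With `from __future__ import annotations`, `X | Y` inside an annotation is fine on older interpreters because annotations are not evaluated, but module-level aliases like Dataset are evaluated eagerly and therefore need Python 3.10+. A minimal sketch:

```python
from __future__ import annotations

def size(x: int | str) -> int:  # lazily evaluated: parses even pre-3.10
    return x if isinstance(x, int) else len(x)

Number = int | float  # eagerly evaluated: requires Python 3.10+

assert size("abc") == 3 and isinstance(1.5, Number)
```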
3 changes: 2 additions & 1 deletion flexeval/core/generation_dataset/base.py
@@ -1,8 +1,9 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from dataclasses import dataclass, field
-from typing import Any, Sequence
+from typing import Any


 @dataclass
5 changes: 2 additions & 3 deletions flexeval/core/generation_dataset/template_based.py
@@ -6,7 +6,7 @@

 import datasets
 from jinja2 import Template
-from smart_open import open
+from smart_open import open  # noqa: A004

 from flexeval.core.utils.jinja2_utils import JINJA2_ENV

@@ -82,8 +82,7 @@ def __getitem__(self, i: int) -> GenerationInstance:
         reference_list_string = self.reference_list_template.render(**item)
         if not (reference_list_string.startswith("[") and reference_list_string.endswith("]")):
             msg = (
-                f"The reference_list_template should render a list of strings "
-                f"but we got `{reference_list_string}`."
+                f"The reference_list_template should render a list of strings but we got `{reference_list_string}`."
             )
             raise ValueError(msg)
         reference_list.extend([str(ref) for ref in literal_eval(reference_list_string)])
3 changes: 2 additions & 1 deletion flexeval/core/language_model/hf_lm.py
@@ -4,7 +4,8 @@

 import copy
 import gc
 import json
-from typing import Any, Callable, Literal, TypeVar
+from collections.abc import Callable
+from typing import Any, Literal, TypeVar

 import torch
 import torch.nn.functional as F  # noqa: N812
3 changes: 2 additions & 1 deletion flexeval/core/language_model/openai_api.py
@@ -3,8 +3,9 @@

 import itertools
 import os
 import time
+from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Callable, TypeVar
+from typing import Any, TypeVar

 import openai
 import tiktoken
6 changes: 3 additions & 3 deletions flexeval/core/language_model/openai_batch_api.py
@@ -84,7 +84,7 @@ def __init__(
         # convert the flexeval-specific argument name to the OpenAI-specific name
         if "max_new_tokens" in self.default_gen_kwargs:
             self.default_gen_kwargs["max_completion_tokens"] = self.default_gen_kwargs.pop("max_new_tokens")
-        self.temp_jsonl_file = tempfile.NamedTemporaryFile(delete=False, suffix=".jsonl")
+        self.temp_jsonl_file = tempfile.NamedTemporaryFile(delete=False, suffix=".jsonl")  # noqa: SIM115

         self.polling_interval_seconds = polling_interval_seconds
         self.developer_message = developer_message

@@ -146,7 +146,7 @@ async def _post_batch_requests(
         self.create_batch_file(custom_id_2_input, **gen_kwargs)

         # Update batch file
-        with open(self.temp_jsonl_file.name, "rb") as batch_file:  # noqa: ASYNC101
+        with open(self.temp_jsonl_file.name, "rb") as batch_file:  # noqa: ASYNC230
            batch_input_file = await self._client.files.create(file=batch_file, purpose="batch")

         # Run Job

@@ -190,7 +190,7 @@ def _execute_batch_requests(  # noqa: C901
             for messages, tools in zip(messages_list, tools_list)
         }
         # The response will be an empty string if the API produces an error.
-        custom_id_2_response: dict[str, str | list[dict[str, Any]]] = {custom_id: "" for custom_id in custom_id_2_input}
+        custom_id_2_response: dict[str, str | list[dict[str, Any]]] = dict.fromkeys(custom_id_2_input, "")
         exec_cnt = 1

         while len(custom_id_2_input) > 0:
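Three mechanical cleanups here: SIM115 is suppressed because the NamedTemporaryFile is deliberately kept open outside a context manager, the ASYNC101 code appears to have been renamed to ASYNC230 in newer Ruff (blocking open call in an async function), and the comprehension assigning "" to every key becomes dict.fromkeys. The fromkeys form is safe here because the shared default is an immutable string; a sketch of the equivalence and the one caveat:

```python
keys = ["id-1", "id-2"]
assert dict.fromkeys(keys, "") == {k: "" for k in keys}

# Caveat: fromkeys stores the *same* value object under every key,
# so a mutable default would be shared across keys:
shared = dict.fromkeys(keys, [])
shared["id-1"].append(1)
assert shared["id-2"] == [1]  # both keys point at one list
```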
3 changes: 2 additions & 1 deletion flexeval/core/language_model/vllm_model.py
@@ -1,7 +1,8 @@
 from __future__ import annotations

 import time
-from typing import TYPE_CHECKING, Any, Callable, Literal
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any, Literal

 import torch
 from loguru import logger
3 changes: 2 additions & 1 deletion flexeval/core/language_model/vllm_serve_lm.py
@@ -6,7 +6,8 @@

 import subprocess
 import threading
 import time
-from typing import IO, Any, Callable
+from collections.abc import Callable
+from typing import IO, Any

 import requests
 import torch
2 changes: 1 addition & 1 deletion flexeval/core/metric/llm_label.py
@@ -79,7 +79,7 @@ def summarize_evaluator_labels(
         category2mean_score[category] = score
         category2dist[category] = calc_label_dist(valid_labels, label_names)

-    for category in category2mean_score:
+    for category in category2mean_score:  # noqa: PLC0206
         summary[f"{score_key}/{category}"] = category2mean_score[category]
         summary[f"{dist_key}/{category}"] = category2dist[category]
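PLC0206 (dict-index-missing-items) flags loops that iterate a dict's keys and then index back into it, suggesting .items() instead. Here the body also indexes a second dict (category2dist) by the same key, which .items() cannot cover, hence the suppression rather than a rewrite. A sketch of what the rule wants in the simple case:

```python
scores = {"math": 0.9, "code": 0.8}

# Flagged by PLC0206:
flagged = [f"{name}={scores[name]}" for name in scores]

# Suggested form:
suggested = [f"{name}={value}" for name, value in scores.items()]

assert flagged == suggested
```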
5 changes: 3 additions & 2 deletions flexeval/core/metric/perspective_api.py
@@ -2,7 +2,8 @@

 import os
 import time
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any

 import numpy as np
 from googleapiclient import discovery

@@ -83,7 +84,7 @@ def evaluate(
         instance_details = []
         for lm_output in lm_outputs:
             if lm_output == "":
-                instance_details.append({att: 0.0 for att in self.attributes})
+                instance_details.append(dict.fromkeys(self.attributes, 0.0))
                 continue
             analyze_request = {
                 "comment": {"text": lm_output},
2 changes: 1 addition & 1 deletion flexeval/core/metric/substring_match.py
@@ -59,7 +59,7 @@ def evaluate(
         ]

         score = 0.0
-        if len(match_list):
+        if match_list:
             score = sum(match_list) / len(match_list)

         summary = {f"substring_match-{self.mode}": score}
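`if match_list:` is the PEP 8 emptiness test: an empty sequence is already falsy, so calling len() first adds nothing. A two-line sketch:

```python
for sample in ([], [True, False]):
    assert bool(sample) == bool(len(sample))  # empty -> falsy either way
```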
2 changes: 1 addition & 1 deletion flexeval/core/multiple_choice_dataset/base.py
@@ -1,8 +1,8 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Sequence


 @dataclass
2 changes: 1 addition & 1 deletion flexeval/core/multiple_choice_dataset/template_based.py
@@ -78,7 +78,7 @@ def __getitem__(self, i: int) -> MultipleChoiceInstance:

         answer_index = int(self.answer_index_template.render(**item))
         if not (answer_index >= 0 and answer_index < len(choices)):
-            msg = f"at least {answer_index+1} choices required, but got {choices}"
+            msg = f"at least {answer_index + 1} choices required, but got {choices}"
             raise ValueError(msg)

         return MultipleChoiceInstance(
@@ -1,7 +1,7 @@
 from __future__ import annotations

 import itertools
-from typing import Iterable
+from collections.abc import Iterable

 from flexeval.core.pairwise_comparison.match import Match
3 changes: 2 additions & 1 deletion flexeval/core/pairwise_comparison/match_maker/base.py
@@ -1,7 +1,8 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
-from typing import Iterable, TypeVar
+from collections.abc import Iterable
+from typing import TypeVar

 from flexeval.core.pairwise_comparison.match import Match
@@ -2,7 +2,7 @@

 import itertools
 import random
-from typing import Iterable
+from collections.abc import Iterable

 from flexeval.core.pairwise_comparison.match import Match

@@ -25,7 +25,7 @@ def generate_matches(

         cached_matches = cached_matches or []
         cache_dict = {match.get_key_for_cache(): match for match in cached_matches}
-        model_match_counter: dict[str, int] = {name: 0 for name in model_names}
+        model_match_counter: dict[str, int] = dict.fromkeys(model_names, 0)
         possible_new_matches: list[Match] = []
         matches: list[Match] = []
         for m1, m2 in all_permutations:
2 changes: 1 addition & 1 deletion flexeval/core/pairwise_comparison/scorer/win_rate.py
@@ -30,7 +30,7 @@ def compute_scores(
                 win_count_dict[model2] += 0.5

         win_rate_dict = {}
-        for model in match_count_dict:
+        for model in match_count_dict:  # noqa: PLC0206
             win_rate_dict[model] = 100 * win_count_dict.get(model, 0.0) / match_count_dict[model]

         return dict(sorted(win_rate_dict.items(), key=lambda x: -x[1]))
3 changes: 2 additions & 1 deletion flexeval/core/reward_bench_dataset/base.py
@@ -1,8 +1,9 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from dataclasses import dataclass, field
-from typing import Any, Sequence
+from typing import Any


 @dataclass
2 changes: 1 addition & 1 deletion flexeval/core/reward_bench_dataset/template_based.py
@@ -5,7 +5,7 @@

 import datasets
 from jinja2 import Template
-from smart_open import open
+from smart_open import open  # noqa: A004

 from flexeval.core.utils.jinja2_utils import JINJA2_ENV
4 changes: 1 addition & 3 deletions flexeval/core/reward_model/pairwise_judge_reward_model.py
@@ -35,9 +35,7 @@ def evaluate_model_output(model_output: str, gold_label: PairwiseChoice) -> bool:
         return False

     # If only gold label is in model output, then output is **correct**
-    if gold_label.value in model_output:
-        return True
-    return False
+    return gold_label.value in model_output


 def aggregate_judge_results(
2 changes: 1 addition & 1 deletion flexeval/core/text_dataset/base.py
@@ -1,8 +1,8 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Sequence


 @dataclass