Skip to content

Commit ed7b34d

Browse files
Small fixes and poe checks
1 parent 3ec1441 commit ed7b34d

File tree

4 files changed

+22
-57
lines changed

4 files changed

+22
-57
lines changed

src/magentic_ui/eval/benchmarks/sentinelbench/sentinelbench.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -140,9 +140,16 @@ def load_dataset(self) -> None:
140140

141141
# Add optional fields if they exist and are not empty
142142
if row_dict.get("relative_vs_absolute"):
143-
base_metadata["relative_vs_absolute"] = str(row_dict["relative_vs_absolute"])
144-
if "adversarial_attacks" in row_dict and row_dict["adversarial_attacks"] is not None:
145-
base_metadata["adversarial_attacks"] = str(row_dict["adversarial_attacks"])
143+
base_metadata["relative_vs_absolute"] = str(
144+
row_dict["relative_vs_absolute"]
145+
)
146+
if (
147+
"adversarial_attacks" in row_dict
148+
and row_dict["adversarial_attacks"] is not None
149+
):
150+
base_metadata["adversarial_attacks"] = str(
151+
row_dict["adversarial_attacks"]
152+
)
146153
if row_dict.get("failure_tolerance"):
147154
base_metadata["failure_tolerance"] = str(row_dict["failure_tolerance"])
148155

src/magentic_ui/eval/benchmarks/sentinelbench/task_variants.py

Lines changed: 8 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -4,16 +4,6 @@
44

55
from typing import Dict, List, Any, cast
66

7-
try:
8-
from ...models import BaseTask # type: ignore
9-
except ImportError:
10-
# Handle case when running tools directly (not as module)
11-
import sys
12-
from pathlib import Path
13-
14-
sys.path.append(str(Path(__file__).parent.parent.parent))
15-
from models import BaseTask # type: ignore
16-
177
# Define task variants with different parameter values for SentinelBench
188
SENTINELBENCH_TASK_VARIANTS = {
199
# Time-based variants
@@ -262,18 +252,16 @@ def calculate_sentinelbench_timeout(
262252
default_timeout = 60 * default_timeout_minutes
263253

264254
# Check if this is a SentinelBench task with parameter_value
265-
if (
266-
hasattr(task, "metadata")
267-
and task.metadata
268-
and isinstance(task.metadata, dict)
269-
):
255+
if hasattr(task, "metadata") and task.metadata and isinstance(task.metadata, dict):
270256
task_metadata: Any = getattr(task, "metadata", {})
271257
metadata: Dict[str, Any] = cast(Dict[str, Any], task_metadata)
272258
if "parameter_value" in metadata:
273259
parameter_value: Any = metadata["parameter_value"]
274260

275261
# Get base task ID (remove parameter part if present)
276-
base_task_id = task.id.split("/")[0] if hasattr(task, "id") and task.id else ""
262+
base_task_id = (
263+
task.id.split("/")[0] if hasattr(task, "id") and task.id else ""
264+
)
277265

278266
# Duration-based tasks
279267
if base_task_id in DURATION_TASKS:
@@ -306,16 +294,14 @@ def get_timeout_display_info(task: Any, timeout_seconds: int) -> str:
306294
timeout_minutes = int(timeout_seconds / 60)
307295

308296
# Check if this is a SentinelBench task with parameter_value
309-
if (
310-
hasattr(task, "metadata")
311-
and task.metadata
312-
and isinstance(task.metadata, dict)
313-
):
297+
if hasattr(task, "metadata") and task.metadata and isinstance(task.metadata, dict):
314298
task_metadata: Any = getattr(task, "metadata", {})
315299
metadata: Dict[str, Any] = cast(Dict[str, Any], task_metadata)
316300
if "parameter_value" in metadata:
317301
parameter_value: Any = metadata["parameter_value"]
318-
base_task_id = task.id.split("/")[0] if hasattr(task, "id") and task.id else ""
302+
base_task_id = (
303+
task.id.split("/")[0] if hasattr(task, "id") and task.id else ""
304+
)
319305

320306
# Duration-based tasks
321307
if base_task_id in DURATION_TASKS:

src/magentic_ui/eval/benchmarks/sentinelbench/tools/single_task_performance.py

Lines changed: 3 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -45,41 +45,13 @@
4545
from pathlib import Path
4646
import logging
4747

48+
# Import model pricing from task_variants
49+
from ..task_variants import MODEL_PRICING
50+
4851
# Set up logging
4952
logging.basicConfig(level=logging.INFO)
5053
logger = logging.getLogger(__name__)
5154

52-
# Model pricing (from compare_sentinel_performance.py)
53-
MODEL_PRICING = {
54-
# OpenAI GPT
55-
"gpt-4o": {"input": 0.005, "output": 0.02}, # Standard
56-
"gpt-4o-batch": {"input": 0.0025, "output": 0.01}, # Batch/Azure
57-
"gpt-4o-2024-08-06": {"input": 0.005, "output": 0.02},
58-
"gpt-4o-2024-11-20": {"input": 0.005, "output": 0.02},
59-
"gpt-4o-mini": {
60-
"input": 0.0006,
61-
"output": 0.0024,
62-
}, # Standard (Batch = 0.0003/0.0012)
63-
"gpt-4o-mini-2024-07-18": {"input": 0.0006, "output": 0.0024},
64-
"gpt-4": {"input": 0.03, "output": 0.06},
65-
"gpt-4-turbo": {"input": 0.01, "output": 0.03},
66-
"gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
67-
"gpt-5-mini": {
68-
"input": 0.00025,
69-
"output": 0.002,
70-
}, # GPT-5 mini: $0.25/$2.00 per 1M tokens
71-
# Anthropic Claude
72-
"claude-3-5-sonnet-20241022": {"input": 0.003, "output": 0.015},
73-
"claude-3-5-sonnet-20240620": {"input": 0.003, "output": 0.015},
74-
"claude-3-opus-20240229": {"input": 0.015, "output": 0.075},
75-
"claude-3-haiku-20240307": {"input": 0.00025, "output": 0.00125},
76-
# Google Gemini
77-
"gemini-1.5-pro": {"input": 0.00125, "output": 0.005}, # ≤128k ctx
78-
"gemini-1.5-pro-extended": {"input": 0.0025, "output": 0.01}, # >128k ctx
79-
"gemini-1.5-flash": {"input": 0.000075, "output": 0.0003}, # ≤128k ctx
80-
"gemini-1.5-flash-extended": {"input": 0.00015, "output": 0.0006}, # >128k ctx
81-
}
82-
8355

8456
def format_time_dimension(seconds: int) -> str:
8557
"""Format time dimension for display on plots."""

src/magentic_ui/eval/benchmarks/sentinelbench/tools/task_type_comparison.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
Creates combined plots showing accuracy, latency, and cost scaling averaged across task types.
66
77
Usage:
8-
python analyze_task_types.py --sentinel-csv plots/FINAL/all_tasks_with_sentinel.csv \
8+
python task_type_comparison.py --sentinel-csv plots/FINAL/all_tasks_with_sentinel.csv \
99
--non-sentinel-csv plots/FINAL/all_tasks_without_sentinel.csv \
1010
--model gpt-5-mini \
1111
--output-dir plots/task_types

0 commit comments

Comments
 (0)