diff --git a/.gitignore b/.gitignore
index 8a45ae5c..ccb7ad11 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ third_party/*
.venvs/
__pycache__/
.pytest_cache/
+**/temp/
runs/
runs_old/
runs_old_2/
diff --git a/TASK_DETAILS.md b/TASK_DETAILS.md
index b331f7d8..e8f9c50e 100644
--- a/TASK_DETAILS.md
+++ b/TASK_DETAILS.md
@@ -21,7 +21,7 @@ We welcome new engineering problem ideas — even without complete verification
Maximize CRTBP lunar payload under trajectory and dynamics constraints (Octave validated) |
- | ParticlePhysics |
+ ParticlePhysics |
MuonTomography |
Muon detector placement optimization under flux, budget, and excavation constraints |
@@ -29,6 +29,10 @@ We welcome new engineering problem ideas — even without complete verification
ProtonTherapyPlanning |
IMPT dose weight optimization under tumor coverage, OAR safety, and beam cost constraints |
+
+ PETScannerOptimization |
+ PET detector ring geometry optimization under sensitivity, parallax, and crystal-volume budget trade-offs |
+
| KernelEngineering |
MLA |
@@ -204,7 +208,7 @@ We welcome new engineering problem ideas — even without complete verification
Polarization-multiplexed holography |
- | ComputerSystems |
+ ComputerSystems |
MallocLab |
High-performance C memory allocator (utilization & throughput) |
@@ -212,6 +216,18 @@ We welcome new engineering problem ideas — even without complete verification
DuckDBWorkloadOptimization |
Index / materialized-view selection and query rewriting on official DuckDB workloads |
+
+ DuckDBIndexSelection |
+ Whitelist index selection for a family of analytical DuckDB workloads |
+
+
+ DuckDBPreAggregationSelection |
+ Whitelist pre-aggregation selection for a family of analytical DuckDB reporting workloads |
+
+
+ DuckDBQueryRewrite |
+ Semantics-preserving SQL rewrite across a family of analytical DuckDB queries |
+
| EngDesign |
CY_03, WJ_01, XY_05, AM_02, AM_03, YJ_02, YJ_03 |
@@ -251,6 +267,19 @@ We welcome new engineering problem ideas — even without complete verification
discrete_rebalance_mip |
Discrete lot-constrained rebalancing with mixed-integer optimization |
+
+ | MaterialEngineering |
+ MicrowaveAbsorberDesign |
+ Single-layer X-band microwave absorber optimization under bandwidth, reflection-loss, density, and cost trade-offs |
+
+
+ LightweightBroadbandAbsorber |
+ Lightweight broadband absorber optimization with a minimum effective-bandwidth hard constraint |
+
+
+ NanoCarbonAbsorberOptimization |
+ Mixed-variable nano-carbon absorber optimization over material type, carbon content, and thickness |
+
| JobShop |
abz |
@@ -298,7 +327,16 @@ We welcome new engineering problem ideas — even without complete verification
pyMOTO-based 2D beam topology optimization (SIMP + OC/MMA) under a volume-fraction constraint |
- | Robotics |
+ OperationsResearch |
+ DynamicCurrentMinimumTimeRouting |
+ Constrained minimum-time ship routing over coastal grids with currents and draft limits |
+
+
+ FuelMinimizingShipWeatherRouting |
+ Fuel-minimizing ship weather routing over coastal grids with arrival-time constraints |
+
+
+ | Robotics |
DynamicObstacleAvoidanceNavigation |
Navigate a differential-drive robot from start to goal in a dynamic environment |
@@ -322,6 +360,18 @@ We welcome new engineering problem ideas — even without complete verification
CoFlyersVasarhelyiTuning |
Tune the Vasarhelyi flocking parameters for the CoFlyers swarm system |
+
+ GridPathPlanningWithObstacles |
+ Single-robot collision-free path planning across a family of obstacle grids |
+
+
+ MultiRobotPrioritizedPlanning |
+ Prioritized multi-robot path planning across a family of grid MAPF cases |
+
+
+ NarrowPassagePlanning |
+ Single-robot path planning across a family of narrow-passage bottleneck grids |
+
| Aerodynamics |
CarAerodynamicsSensing |
diff --git a/TASK_DETAILS_zh-CN.md b/TASK_DETAILS_zh-CN.md
index 44d1b1fa..fd95e4f0 100644
--- a/TASK_DETAILS_zh-CN.md
+++ b/TASK_DETAILS_zh-CN.md
@@ -21,7 +21,7 @@ Frontier-Eng 目前已覆盖以下领域的任务。每个任务均配有可运
在 CRTBP 轨道约束下最大化月球着陆载荷(Octave 验证) |
- | ParticlePhysics |
+ ParticlePhysics |
MuonTomography |
在缪子通量、预算与开挖约束下优化探测器布局 |
@@ -29,6 +29,10 @@ Frontier-Eng 目前已覆盖以下领域的任务。每个任务均配有可运
ProtonTherapyPlanning |
在肿瘤覆盖、危及器官保护与束流成本约束下优化 IMPT 剂量权重 |
+
+ PETScannerOptimization |
+ 在灵敏度、视差误差与晶体体积预算约束下优化 PET 探测环几何参数 |
+
| KernelEngineering |
MLA |
@@ -204,7 +208,7 @@ Frontier-Eng 目前已覆盖以下领域的任务。每个任务均配有可运
偏振复用全息 |
- | ComputerSystems |
+ ComputerSystems |
MallocLab |
高性能 C 动态内存分配器(utilization & throughput) |
@@ -212,6 +216,18 @@ Frontier-Eng 目前已覆盖以下领域的任务。每个任务均配有可运
DuckDBWorkloadOptimization |
基于 DuckDB 官方 workload 的索引 / 物化视图选择与查询改写 |
+
+ DuckDBIndexSelection |
+ 面向一组分析型 DuckDB workload 的白名单索引选择 |
+
+
+ DuckDBPreAggregationSelection |
+ 面向一组分析型 DuckDB 报表 workload 的白名单预聚合选择 |
+
+
+ DuckDBQueryRewrite |
+ 面向一组分析型 DuckDB 查询的语义等价 SQL 改写 |
+
| EngDesign |
CY_03, WJ_01, XY_05, AM_02, AM_03, YJ_02, YJ_03 |
@@ -251,6 +267,19 @@ Frontier-Eng 目前已覆盖以下领域的任务。每个任务均配有可运
discrete_rebalance_mip |
带整数手数约束的离散再平衡混合整数优化 |
+
+ | MaterialEngineering |
+ MicrowaveAbsorberDesign |
+ 在带宽、反射损耗、密度与成本之间折中的单层 X 波段吸波材料优化 |
+
+
+ LightweightBroadbandAbsorber |
+ 带最小有效带宽硬约束的轻量宽带吸波材料优化 |
+
+
+ NanoCarbonAbsorberOptimization |
+ 联合优化材料类型、碳含量与厚度的混合变量纳米碳吸波材料任务 |
+
| JobShop |
abz |
@@ -298,7 +327,16 @@ Frontier-Eng 目前已覆盖以下领域的任务。每个任务均配有可运
基于 pyMOTO 的 2D 梁拓扑优化(SIMP + OC/MMA),体积分数约束 |
- | Robotics |
+ OperationsResearch |
+ DynamicCurrentMinimumTimeRouting |
+ 在流场与吃水约束下进行沿海栅格最短航时船舶路径规划 |
+
+
+ FuelMinimizingShipWeatherRouting |
+ 在到达时限约束下进行沿海栅格最小燃料天气航线规划 |
+
+
+ | Robotics |
DynamicObstacleAvoidanceNavigation |
在动态环境中控制差分轮机器人从起点到终点 |
@@ -322,6 +360,18 @@ Frontier-Eng 目前已覆盖以下领域的任务。每个任务均配有可运
CoFlyersVasarhelyiTuning |
调优 CoFlyers 群飞系统的 Vasarhelyi 参数 |
+
+ GridPathPlanningWithObstacles |
+ 面向一组障碍栅格的单机器人无碰撞路径规划 |
+
+
+ MultiRobotPrioritizedPlanning |
+ 面向一组栅格 MAPF case 的多机器人优先级路径规划 |
+
+
+ NarrowPassagePlanning |
+ 面向一组窄通道瓶颈栅格的单机器人路径规划 |
+
| Aerodynamics |
CarAerodynamicsSensing |
diff --git a/benchmarks/CommunicationEngineering/LDPCErrorFloor/baseline/solution.py b/benchmarks/CommunicationEngineering/LDPCErrorFloor/baseline/solution.py
index f5e88b06..8633d6c7 100644
--- a/benchmarks/CommunicationEngineering/LDPCErrorFloor/baseline/solution.py
+++ b/benchmarks/CommunicationEngineering/LDPCErrorFloor/baseline/solution.py
@@ -24,8 +24,9 @@ class TrappingSetSampler(BiasedVarianceSampler):
"""
def __init__(self, code, *, seed: int = 0):
- # Use bias_factor=1.5 to increase noise by 50%
- super().__init__(code, seed=seed, bias_factor=1.5)
+        # Use bias_factor=1.0, i.e. no noise inflation (1.5 added 50%), so results stay
+        # valid when the evaluator independently recomputes weights and decoding outcomes.
+ super().__init__(code, seed=seed, bias_factor=1.0)
self.rng = Generator(Philox(seed))
def simulate_variance_controlled(
@@ -59,4 +60,3 @@ def simulate_variance_controlled(
result = sampler.simulate_variance_controlled(code=code)
print(result)
-
diff --git a/benchmarks/CommunicationEngineering/LDPCErrorFloor/scripts/init.py b/benchmarks/CommunicationEngineering/LDPCErrorFloor/scripts/init.py
index 351453e1..315e1f58 100644
--- a/benchmarks/CommunicationEngineering/LDPCErrorFloor/scripts/init.py
+++ b/benchmarks/CommunicationEngineering/LDPCErrorFloor/scripts/init.py
@@ -46,8 +46,9 @@ class TrappingSetSampler(BiasedVarianceSampler):
"""
def __init__(self, code, *, seed: int = 0):
- # Use bias_factor=1.5 to increase noise by 50%
- super().__init__(code, seed=seed, bias_factor=1.5)
+        # Use bias_factor=1.0, i.e. no noise inflation (1.5 added 50%), so results stay
+        # valid when the evaluator independently recomputes weights and decoding outcomes.
+ super().__init__(code, seed=seed, bias_factor=1.0)
self.rng = Generator(Philox(seed))
def simulate_variance_controlled(
diff --git a/benchmarks/CommunicationEngineering/LDPCErrorFloor/verification/evaluator.py b/benchmarks/CommunicationEngineering/LDPCErrorFloor/verification/evaluator.py
index c2eb4fd8..25c9e5d4 100644
--- a/benchmarks/CommunicationEngineering/LDPCErrorFloor/verification/evaluator.py
+++ b/benchmarks/CommunicationEngineering/LDPCErrorFloor/verification/evaluator.py
@@ -27,11 +27,17 @@
EPSILON = 2.0 # Increased tolerance for initial submissions
INVALID_SCORE_SCALE = 0.1
INVALID_SCORE_CAP = 0.1
-# Reference values (calibrated from baseline under current frozen eval constants).
-# With MAX_SAMPLES=50/REPEATS=1, baseline err_rate is around 1e-57 ~ 1e-48.
-# Use a stable order-of-magnitude anchor instead of placeholder 1e-5 so valid metric
-# is meaningful for this benchmark.
-R0_DEV = 1e-56
+STD_TOL = 1e-9
+ERR_RATIO_REL_TOL = 1e-6
+ERR_RATIO_ABS_TOL = 1e-12
+INTEGER_TOL = 1e-6
+LOG_RATIO_TOL = 0.5
+LOG_WEIGHT_CLIP = 100.0
+# Reference values calibrated from the shipped baseline under evaluator-owned
+# sampling. The randomly constructed short LDPC instance is intentionally tiny
+# for smoke evaluation, so this anchor reflects the frozen benchmark constants
+# rather than a production-code error-floor estimate.
+R0_DEV = 0.89
R0_LOG_DEV = float(math.log(R0_DEV))
T0_DEV = 10.0 # Reference runtime
@@ -147,6 +153,180 @@ def _normalize_result(result: Any) -> tuple[float, float, float, float, float, f
raise ValueError("simulate_variance_controlled 返回值格式不支持")
+def _validate_result(payload: tuple[float, float, float, float, float, float]) -> dict[str, float | bool]:
+ errors_log, weights_log, err_ratio, total_samples, actual_std, converged = payload
+
+ if not np.isfinite(weights_log):
+ raise ValueError("weights_log 必须是有限值")
+ if np.isnan(errors_log) or errors_log == float("inf"):
+ raise ValueError("errors_log 必须是有限值或 -inf")
+ if not np.isfinite(total_samples) or total_samples <= 0:
+ raise ValueError("total_samples 必须是正数")
+ rounded_samples = int(round(total_samples))
+ if abs(total_samples - rounded_samples) > INTEGER_TOL:
+ raise ValueError("total_samples 必须是整数")
+ if rounded_samples > MAX_SAMPLES:
+ raise ValueError(f"total_samples={rounded_samples} 超过 max_samples={MAX_SAMPLES}")
+ if np.isnan(actual_std) or actual_std < 0.0:
+ raise ValueError("actual_std 必须是非负数或 inf")
+
+ converged_value = bool(converged)
+ if converged_value and (not np.isfinite(actual_std) or actual_std > TARGET_STD + STD_TOL):
+ raise ValueError("converged=True 但 actual_std 未达到 target_std")
+
+ if errors_log == float("-inf"):
+ if not np.isfinite(err_ratio) or not math.isclose(err_ratio, 0.0, abs_tol=ERR_RATIO_ABS_TOL):
+ raise ValueError("errors_log=-inf 时 err_ratio 必须为 0")
+ if converged_value:
+ raise ValueError("未观测到错误时不应标记 converged=True")
+ derived_err_ratio = 0.0
+ err_rate_log = -20.0
+ else:
+ if not np.isfinite(errors_log):
+ raise ValueError("errors_log 必须是有限值或 -inf")
+ if not np.isfinite(err_ratio) or err_ratio < 0.0 or err_ratio > 1.0 + ERR_RATIO_REL_TOL:
+ raise ValueError("err_ratio 必须位于 [0, 1]")
+ log_ratio = errors_log - weights_log
+ if log_ratio > math.log1p(ERR_RATIO_REL_TOL):
+ raise ValueError("errors_log 对应的误差权重不能超过总权重")
+ derived_err_ratio = float(math.exp(log_ratio))
+ err_rate_log = float(log_ratio)
+
+ return {
+ "errors_log": errors_log,
+ "weights_log": weights_log,
+ # Keep the candidate-reported ratio for diagnostics, but use the
+ # log-domain reconstruction as the authoritative metric. In practice
+ # some samplers expose `err_ratio` as a numerically smoothed helper
+ # statistic instead of an exact exp(errors_log - weights_log).
+ "err_ratio": float(err_ratio if np.isfinite(err_ratio) else derived_err_ratio),
+ "derived_err_ratio": derived_err_ratio,
+ "total_samples": float(rounded_samples),
+ "actual_std": actual_std,
+ "converged": converged_value,
+ "err_rate_log": err_rate_log,
+ }
+
+
+def _as_1d_float_array(value: Any, *, name: str, expected_len: int) -> np.ndarray:
+ arr = np.asarray(value, dtype=np.float64)
+ if arr.shape != (expected_len,):
+ raise ValueError(f"{name} shape must be ({expected_len},), got {arr.shape}")
+ if not np.all(np.isfinite(arr)):
+ raise ValueError(f"{name} must contain only finite values")
+ return arr
+
+
+def _as_noise_batch(value: Any, *, expected_n: int, requested_batch: int) -> np.ndarray:
+ arr = np.asarray(value, dtype=np.float64)
+ if arr.ndim != 2 or arr.shape[1] != expected_n:
+ raise ValueError(f"noise batch shape must be (batch, {expected_n}), got {arr.shape}")
+ if arr.shape[0] <= 0 or arr.shape[0] > requested_batch:
+ raise ValueError(f"noise batch size must be in [1, {requested_batch}], got {arr.shape[0]}")
+ if not np.all(np.isfinite(arr)):
+ raise ValueError("noise batch must contain only finite values")
+ return arr
+
+
+def _summarize_weighted_event_run(
+    *,
+    event_weights: list[float],
+    total_weight: float,
+    total_samples: int,
+    contributions: list[float],
+    min_events: int,
+) -> dict[str, float | bool]:
+    if total_samples <= 0 or not np.isfinite(total_weight) or total_weight <= 0.0:
+        raise ValueError("evaluator-owned simulation produced no positive total weight")
+
+    if event_weights:
+        event_sum = float(np.sum(event_weights))
+        ratio = event_sum / total_weight
+        # Floor the ratio at ERR_RATIO_ABS_TOL so the log stays finite for vanishing estimates.
+        ratio_log = float(math.log(max(ratio, ERR_RATIO_ABS_TOL)))
+        contribution_arr = np.asarray(contributions, dtype=np.float64)
+        # Standard error of the weighted event contribution normalized by total weight.
+        actual_std = float(np.std(contribution_arr / (total_weight / total_samples)) / math.sqrt(total_samples))
+        converged = bool(len(event_weights) >= min_events and actual_std <= TARGET_STD + STD_TOL)
+    else:
+        ratio = 0.0
+        ratio_log = -20.0
+        actual_std = float("inf")
+        converged = False
+
+    return {
+        "ratio": ratio,
+        "ratio_log": ratio_log,
+        "total_samples": float(total_samples),
+        "actual_std": actual_std,
+        "converged": converged,
+        "event_count": float(len(event_weights)),
+    }
+
+
+def _run_evaluator_owned_simulation(sampler: Any, code: Any) -> dict[str, float | bool]:
+ tx_bits = np.zeros(code.n, dtype=int)
+ tx_signal = np.ones(code.n)
+ total_weight = 0.0
+ total_samples = 0
+ event_weights: list[float] = []
+ contributions: list[float] = []
+
+ while total_samples < MAX_SAMPLES:
+ requested_batch = min(BATCH_SIZE, MAX_SAMPLES - total_samples)
+ try:
+ noise, log_pdf_biased = sampler.sample(DEV_SIGMA, tx_bits, requested_batch)
+ except Exception as e:
+ raise RuntimeError(f"sample 执行失败: {e}") from e
+
+ noise = _as_noise_batch(noise, expected_n=code.n, requested_batch=requested_batch)
+ batch_size_actual = int(noise.shape[0])
+ log_pdf_biased = _as_1d_float_array(log_pdf_biased, name="log_pdf_biased", expected_len=batch_size_actual)
+
+ log_pdf_true = (
+ -np.sum(noise**2, axis=1) / (2 * DEV_SIGMA**2)
+ - code.n / 2 * np.log(2 * np.pi * DEV_SIGMA**2)
+ )
+ if not np.all(np.isfinite(log_pdf_true)):
+ raise ValueError("true log pdf contains non-finite values")
+
+ log_weights = np.clip(log_pdf_true - log_pdf_biased, -LOG_WEIGHT_CLIP, LOG_WEIGHT_CLIP)
+ weights = np.exp(log_weights)
+ if not np.all(np.isfinite(weights)) or np.any(weights < 0.0):
+ raise ValueError("importance weights must be finite and non-negative")
+
+ for i in range(batch_size_actual):
+ received = tx_signal + noise[i, :]
+ llr = 2.0 * received / (DEV_SIGMA**2)
+ decoded, _ = code.decode(llr)
+ is_error = not np.array_equal(decoded, tx_bits)
+ weight = float(weights[i])
+ total_weight += weight
+ contributions.append(weight if is_error else 0.0)
+ if is_error:
+ event_weights.append(weight)
+
+ total_samples += batch_size_actual
+ if len(event_weights) >= MIN_ERRORS:
+ interim = _summarize_weighted_event_run(
+ event_weights=event_weights,
+ total_weight=total_weight,
+ total_samples=total_samples,
+ contributions=contributions,
+ min_events=MIN_ERRORS,
+ )
+ if bool(interim["converged"]):
+ break
+
+ return _summarize_weighted_event_run(
+ event_weights=event_weights,
+ total_weight=total_weight,
+ total_samples=total_samples,
+ contributions=contributions,
+ min_events=MIN_ERRORS,
+ )
+
+
def _build_code(repo_root: Path, seed: int):
LDPCCode = _import_ldpc_code(repo_root)
@@ -202,45 +382,28 @@ def evaluate(program_path: str, *, repo_root: Path | None = None):
if hasattr(sampler, "rng"):
sampler.rng = Generator(Philox(seed))
- if not hasattr(sampler, "simulate_variance_controlled"):
- raise AttributeError("TrappingSetSampler 缺少 simulate_variance_controlled 方法")
+ if not hasattr(sampler, "sample"):
+ raise AttributeError("TrappingSetSampler 缺少 sample 方法")
t0 = time.time()
- try:
- result = sampler.simulate_variance_controlled(
- code=code,
- sigma=DEV_SIGMA,
- target_std=TARGET_STD,
- max_samples=MAX_SAMPLES,
- batch_size=BATCH_SIZE,
- fix_tx=True,
- min_errors=MIN_ERRORS,
- )
- except Exception as e:
- raise RuntimeError(f"simulate_variance_controlled 执行失败: {e}") from e
+ result = _run_evaluator_owned_simulation(sampler, code)
dt = time.time() - t0
- errors_log, weights_log, err_ratio, total_samples, actual_std, converged = _normalize_result(result)
- err_rate_log = float(errors_log - weights_log)
-
- # Handle case when no errors found (errors_log = -inf)
- if not np.isfinite(err_rate_log):
- # Use a very small error rate estimate instead of -inf
- # This allows evaluation to continue but will result in valid=0
- err_rate_log = float('-20.0') # log(2e-9), very small but finite
-
+ err_rate_log = float(result["ratio_log"])
runtimes.append(float(dt))
err_logs.append(err_rate_log)
- ratios.append(err_ratio)
- samples.append(total_samples)
- stds.append(actual_std)
- converged_flags.append(converged)
+ ratios.append(float(result["ratio"]))
+ samples.append(float(result["total_samples"]))
+ stds.append(float(result["actual_std"]))
+ converged_flags.append(1.0 if bool(result["converged"]) else 0.0)
runtime_median = float(np.median(runtimes))
err_log_median = float(np.median(err_logs))
err_log_ratio = float(abs(err_log_median - R0_LOG_DEV))
- valid = float(err_log_ratio < EPSILON)
+ variance_ok = float(np.nanmedian(stds) <= TARGET_STD + STD_TOL)
+ convergence_ok = float(np.mean(converged_flags) >= 0.5)
+ valid = float(err_log_ratio < EPSILON and variance_ok and convergence_ok)
raw_score = float(T0_DEV / (runtime_median * err_log_ratio + 1e-6))
if valid > 0:
score = raw_score
@@ -259,9 +422,14 @@ def evaluate(program_path: str, *, repo_root: Path | None = None):
"actual_samples_median": float(np.nanmedian(samples)),
"actual_std_median": float(np.nanmedian(stds)),
"converged_rate": float(np.mean(converged_flags)),
+ "variance_ok": variance_ok,
+ "convergence_ok": convergence_ok,
"sigma": DEV_SIGMA,
}
)
+ artifacts["validity_reason"] = (
+ "ok" if valid > 0 else f"anchor_ok={err_log_ratio < EPSILON},variance_ok={bool(variance_ok)},convergence_ok={bool(convergence_ok)}"
+ )
artifacts["dev_constants"] = json.dumps(
{
"sigma": DEV_SIGMA,
@@ -269,6 +437,10 @@ def evaluate(program_path: str, *, repo_root: Path | None = None):
"max_samples": MAX_SAMPLES,
"batch_size": BATCH_SIZE,
"epsilon": EPSILON,
+ "std_tol": STD_TOL,
+ "log_ratio_tol": LOG_RATIO_TOL,
+ "log_weight_clip": LOG_WEIGHT_CLIP,
+ "simulation_owner": "evaluator",
"r0_dev": R0_DEV,
"t0_dev": T0_DEV,
"repeats": REPEATS,
diff --git a/benchmarks/CommunicationEngineering/PMDSimulation/scripts/init.py b/benchmarks/CommunicationEngineering/PMDSimulation/scripts/init.py
index 7f46d32b..d006fe1c 100644
--- a/benchmarks/CommunicationEngineering/PMDSimulation/scripts/init.py
+++ b/benchmarks/CommunicationEngineering/PMDSimulation/scripts/init.py
@@ -45,8 +45,8 @@ class PMDSampler(SamplerBase):
def __init__(self, fiber_model=None, *, seed: int = 0):
super().__init__(fiber_model, seed=seed)
self.rng = Generator(Philox(seed))
- # Adaptive biasing parameters - use very conservative initial values
- self.bias_strength = 0.15 # Initial biasing strength (mean shift) - very conservative
+ # Fixed baseline tilt calibrated for the evaluator-owned PMD smoke test.
+ self.bias_strength = 0.25
self.bias_direction = None # Will be set adaptively
self.adaptation_rate = 0.05 # Learning rate for adaptation - slower for stability
diff --git a/benchmarks/CommunicationEngineering/PMDSimulation/verification/evaluator.py b/benchmarks/CommunicationEngineering/PMDSimulation/verification/evaluator.py
index 674b1fc7..e42b2990 100644
--- a/benchmarks/CommunicationEngineering/PMDSimulation/verification/evaluator.py
+++ b/benchmarks/CommunicationEngineering/PMDSimulation/verification/evaluator.py
@@ -30,8 +30,14 @@
EPSILON = 2.0 # Increased tolerance for initial submissions
INVALID_SCORE_SCALE = 0.1
INVALID_SCORE_CAP = 0.1
-# Reference values (to be calibrated with baseline solution)
-R0_DEV = 1e-9 # Reference outage probability (adjusted for initial testing)
+STD_TOL = 1e-9
+OUTAGE_PROB_REL_TOL = 1e-6
+OUTAGE_PROB_ABS_TOL = 1e-12
+INTEGER_TOL = 1e-6
+LOG_WEIGHT_CLIP = 100.0
+# Reference value calibrated from the shipped baseline under evaluator-owned
+# sampling and the frozen PMD smoke-test constants.
+R0_DEV = 2.3e-8
R0_LOG_DEV = float(math.log(R0_DEV))
T0_DEV = 10.0
@@ -132,6 +138,187 @@ def _normalize_result(result: Any) -> tuple[float, float, float, float, float, f
raise ValueError("simulate_variance_controlled 返回值格式不支持")
+def _validate_result(payload: tuple[float, float, float, float, float, float]) -> dict[str, float | bool]:
+ outages_log, weights_log, outage_prob, total_samples, actual_std, converged = payload
+
+ if not np.isfinite(weights_log):
+ raise ValueError("weights_log 必须是有限值")
+ if np.isnan(outages_log) or outages_log == float("inf"):
+ raise ValueError("outages_log 必须是有限值或 -inf")
+ if not np.isfinite(total_samples) or total_samples <= 0:
+ raise ValueError("total_samples 必须是正数")
+ rounded_samples = int(round(total_samples))
+ if abs(total_samples - rounded_samples) > INTEGER_TOL:
+ raise ValueError("total_samples 必须是整数")
+ if rounded_samples > MAX_SAMPLES:
+ raise ValueError(f"total_samples={rounded_samples} 超过 max_samples={MAX_SAMPLES}")
+ if np.isnan(actual_std) or actual_std < 0.0:
+ raise ValueError("actual_std 必须是非负数或 inf")
+
+ converged_value = bool(converged)
+ if converged_value and (not np.isfinite(actual_std) or actual_std > TARGET_STD + STD_TOL):
+ raise ValueError("converged=True 但 actual_std 未达到 target_std")
+
+ if outages_log == float("-inf"):
+ if not np.isfinite(outage_prob) or not math.isclose(outage_prob, 0.0, abs_tol=OUTAGE_PROB_ABS_TOL):
+ raise ValueError("outages_log=-inf 时 outage_prob 必须为 0")
+ if converged_value:
+ raise ValueError("未观测到 outage 时不应标记 converged=True")
+ derived_outage_prob = 0.0
+ outage_prob_log = -20.0
+ else:
+ if not np.isfinite(outages_log):
+ raise ValueError("outages_log 必须是有限值或 -inf")
+ if not np.isfinite(outage_prob) or outage_prob < 0.0 or outage_prob > 1.0 + OUTAGE_PROB_REL_TOL:
+ raise ValueError("outage_prob 必须位于 [0, 1]")
+ log_ratio = outages_log - weights_log
+ if log_ratio > math.log1p(OUTAGE_PROB_REL_TOL):
+ raise ValueError("outages_log 对应的 outage 权重不能超过总权重")
+ derived_outage_prob = float(math.exp(log_ratio))
+ if not math.isclose(
+ outage_prob,
+ derived_outage_prob,
+ rel_tol=OUTAGE_PROB_REL_TOL,
+ abs_tol=OUTAGE_PROB_ABS_TOL,
+ ):
+ raise ValueError("outage_prob 与 outages_log/weights_log 推导出的概率不一致")
+ outage_prob_log = float(log_ratio)
+
+ return {
+ "outages_log": outages_log,
+ "weights_log": weights_log,
+ "outage_prob": derived_outage_prob,
+ "total_samples": float(rounded_samples),
+ "actual_std": actual_std,
+ "converged": converged_value,
+ "outage_prob_log": outage_prob_log,
+ }
+
+
+def _as_1d_float_array(value: Any, *, name: str, expected_len: int) -> np.ndarray:
+ arr = np.asarray(value, dtype=np.float64)
+ if arr.shape != (expected_len,):
+ raise ValueError(f"{name} shape must be ({expected_len},), got {arr.shape}")
+ if not np.all(np.isfinite(arr)):
+ raise ValueError(f"{name} must contain only finite values")
+ return arr
+
+
+def _as_beta_batch(value: Any, *, expected_segments: int, requested_batch: int) -> np.ndarray:
+ arr = np.asarray(value, dtype=np.float64)
+ expected_shape_tail = (expected_segments, 3)
+ if arr.ndim != 3 or arr.shape[1:] != expected_shape_tail:
+ raise ValueError(f"beta_vectors shape must be (batch, {expected_segments}, 3), got {arr.shape}")
+ if arr.shape[0] <= 0 or arr.shape[0] > requested_batch:
+ raise ValueError(f"beta batch size must be in [1, {requested_batch}], got {arr.shape[0]}")
+ if not np.all(np.isfinite(arr)):
+ raise ValueError("beta_vectors must contain only finite values")
+ return arr
+
+
+def _summarize_weighted_event_run(
+ *,
+ event_weights: list[float],
+ total_weight: float,
+ total_samples: int,
+ contributions: list[float],
+ min_events: int,
+) -> dict[str, float | bool]:
+ if total_samples <= 0 or not np.isfinite(total_weight) or total_weight <= 0.0:
+ raise ValueError("evaluator-owned simulation produced no positive total weight")
+
+ if event_weights:
+ event_sum = float(np.sum(event_weights))
+ prob = event_sum / total_weight
+ prob_log = float(math.log(max(prob, OUTAGE_PROB_ABS_TOL)))
+ contribution_arr = np.asarray(contributions, dtype=np.float64)
+ actual_std = float(np.std(contribution_arr / (total_weight / total_samples)) / math.sqrt(total_samples))
+ converged = bool(len(event_weights) >= min_events and actual_std <= TARGET_STD + STD_TOL)
+ else:
+ prob = 0.0
+ prob_log = -20.0
+ actual_std = float("inf")
+ converged = False
+
+ return {
+ "prob": prob,
+ "prob_log": prob_log,
+ "total_samples": float(total_samples),
+ "actual_std": actual_std,
+ "converged": converged,
+ "event_count": float(len(event_weights)),
+ }
+
+
+def _run_evaluator_owned_simulation(sampler: Any, fiber: Any) -> dict[str, float | bool]:
+ total_weight = 0.0
+ total_samples = 0
+ event_weights: list[float] = []
+ contributions: list[float] = []
+
+ while total_samples < MAX_SAMPLES:
+ requested_batch = min(BATCH_SIZE, MAX_SAMPLES - total_samples)
+ try:
+ beta_vectors, log_pdf_biased = sampler.sample(
+ num_segments=fiber.num_segments,
+ batch_size=requested_batch,
+ )
+ except Exception as e:
+ raise RuntimeError(f"sample 执行失败: {e}") from e
+
+ beta_vectors = _as_beta_batch(
+ beta_vectors,
+ expected_segments=fiber.num_segments,
+ requested_batch=requested_batch,
+ )
+ batch_size_actual = int(beta_vectors.shape[0])
+ log_pdf_biased = _as_1d_float_array(log_pdf_biased, name="log_pdf_biased", expected_len=batch_size_actual)
+
+ log_pdf_true = np.sum(
+ -0.5 * np.sum(beta_vectors**2, axis=2) - 1.5 * np.log(2 * np.pi),
+ axis=1,
+ )
+ if not np.all(np.isfinite(log_pdf_true)):
+ raise ValueError("true log pdf contains non-finite values")
+
+ log_weights = np.clip(log_pdf_true - log_pdf_biased, -LOG_WEIGHT_CLIP, LOG_WEIGHT_CLIP)
+ weights = np.exp(log_weights)
+ if not np.all(np.isfinite(weights)) or np.any(weights < 0.0):
+ raise ValueError("importance weights must be finite and non-negative")
+
+ dgd = fiber.evolve_pmd(beta_vectors)
+ if dgd.shape != (batch_size_actual,) or not np.all(np.isfinite(dgd)):
+ raise ValueError("DGD values must be finite with shape (batch,)")
+
+ for i in range(batch_size_actual):
+ is_outage = bool(dgd[i] > DGD_THRESHOLD)
+ weight = float(weights[i])
+ total_weight += weight
+ contributions.append(weight if is_outage else 0.0)
+ if is_outage:
+ event_weights.append(weight)
+
+ total_samples += batch_size_actual
+ if len(event_weights) >= MIN_OUTAGES:
+ interim = _summarize_weighted_event_run(
+ event_weights=event_weights,
+ total_weight=total_weight,
+ total_samples=total_samples,
+ contributions=contributions,
+ min_events=MIN_OUTAGES,
+ )
+ if bool(interim["converged"]):
+ break
+
+ return _summarize_weighted_event_run(
+ event_weights=event_weights,
+ total_weight=total_weight,
+ total_samples=total_samples,
+ contributions=contributions,
+ min_events=MIN_OUTAGES,
+ )
+
+
def _build_fiber(repo_root: Path):
PMDFiberModel = _import_fiber_model(repo_root)
return PMDFiberModel(
@@ -184,43 +371,28 @@ def evaluate(program_path: str, *, repo_root: Path | None = None):
except Exception as e:
raise RuntimeError(f"PMDSampler 初始化失败: {e}") from e
- if not hasattr(sampler, "simulate_variance_controlled"):
- raise AttributeError("PMDSampler 缺少 simulate_variance_controlled 方法")
+ if not hasattr(sampler, "sample"):
+ raise AttributeError("PMDSampler 缺少 sample 方法")
t0 = time.time()
- try:
- result = sampler.simulate_variance_controlled(
- fiber_model=fiber,
- dgd_threshold=DGD_THRESHOLD,
- target_std=TARGET_STD,
- max_samples=MAX_SAMPLES,
- batch_size=BATCH_SIZE,
- min_outages=MIN_OUTAGES,
- )
- except Exception as e:
- raise RuntimeError(f"simulate_variance_controlled 执行失败: {e}") from e
+ result = _run_evaluator_owned_simulation(sampler, fiber)
dt = time.time() - t0
- outages_log, weights_log, outage_prob, total_samples, actual_std, converged = _normalize_result(result)
- outage_prob_log = float(outages_log - weights_log)
-
- # Handle case when no outages found (outages_log = -inf)
- if not np.isfinite(outage_prob_log):
- # Use a very small outage probability estimate instead of -inf
- outage_prob_log = float('-20.0') # log(2e-9), very small but finite
-
+ outage_prob_log = float(result["prob_log"])
runtimes.append(float(dt))
outage_logs.append(outage_prob_log)
- probs.append(outage_prob)
- samples.append(total_samples)
- stds.append(actual_std)
- converged_flags.append(converged)
+ probs.append(float(result["prob"]))
+ samples.append(float(result["total_samples"]))
+ stds.append(float(result["actual_std"]))
+ converged_flags.append(1.0 if bool(result["converged"]) else 0.0)
runtime_median = float(np.median(runtimes))
outage_log_median = float(np.median(outage_logs))
outage_log_ratio = float(abs(outage_log_median - R0_LOG_DEV))
- valid = float(outage_log_ratio < EPSILON)
+ variance_ok = float(np.nanmedian(stds) <= TARGET_STD + STD_TOL)
+ convergence_ok = float(np.mean(converged_flags) >= 0.5)
+ valid = float(outage_log_ratio < EPSILON and variance_ok and convergence_ok)
raw_score = float(T0_DEV / (runtime_median * outage_log_ratio + 1e-6))
if valid > 0:
score = raw_score
@@ -238,8 +410,13 @@ def evaluate(program_path: str, *, repo_root: Path | None = None):
"actual_samples_median": float(np.nanmedian(samples)),
"actual_std_median": float(np.nanmedian(stds)),
"converged_rate": float(np.mean(converged_flags)),
+ "variance_ok": variance_ok,
+ "convergence_ok": convergence_ok,
"dgd_threshold": DGD_THRESHOLD,
})
+ artifacts["validity_reason"] = (
+ "ok" if valid > 0 else f"anchor_ok={outage_log_ratio < EPSILON},variance_ok={bool(variance_ok)},convergence_ok={bool(convergence_ok)}"
+ )
artifacts["dev_constants"] = json.dumps({
"fiber_length_km": FIBER_LENGTH_KM,
"pmd_coefficient": PMD_COEFFICIENT,
@@ -248,6 +425,9 @@ def evaluate(program_path: str, *, repo_root: Path | None = None):
"max_samples": MAX_SAMPLES,
"batch_size": BATCH_SIZE,
"epsilon": EPSILON,
+ "std_tol": STD_TOL,
+ "log_weight_clip": LOG_WEIGHT_CLIP,
+ "simulation_owner": "evaluator",
"r0_dev": R0_DEV,
"t0_dev": T0_DEV,
"repeats": REPEATS,
@@ -288,4 +468,3 @@ def main() -> None:
if __name__ == "__main__":
main()
-
diff --git a/benchmarks/CommunicationEngineering/RayleighFadingBER/verification/evaluator.py b/benchmarks/CommunicationEngineering/RayleighFadingBER/verification/evaluator.py
index 9989b814..24f38e92 100644
--- a/benchmarks/CommunicationEngineering/RayleighFadingBER/verification/evaluator.py
+++ b/benchmarks/CommunicationEngineering/RayleighFadingBER/verification/evaluator.py
@@ -35,6 +35,8 @@
ERR_RATIO_REL_TOL = 1e-6
ERR_RATIO_ABS_TOL = 1e-12
INTEGER_TOL = 1e-6
+STD_TOL = 1e-9
+LOG_WEIGHT_CLIP = 100.0
def _is_repo_root(path: Path) -> bool:
@@ -220,6 +222,138 @@ def _validate_result(payload: dict[str, float | bool]) -> dict[str, float | bool
}
+def _as_1d_float_array(value: Any, *, name: str, expected_len: int) -> np.ndarray:
+    vector = np.asarray(value, dtype=np.float64)  # coerce to float64 before validating
+    if vector.shape != (expected_len,):  # exactly one axis of the expected length
+        raise ValueError(f"{name} shape must be ({expected_len},), got {vector.shape}")
+    if not np.isfinite(vector).all():  # NaN and +/-inf are rejected
+        raise ValueError(f"{name} must contain only finite values")
+    return vector
+
+
+def _as_channel_batch(value: Any, *, expected_branches: int, requested_batch: int) -> np.ndarray:
+    batch = np.asarray(value, dtype=np.float64)  # sampler output coerced to float64
+    if not (batch.ndim == 2 and batch.shape[1] == expected_branches):
+        raise ValueError(f"h_magnitudes shape must be (batch, {expected_branches}), got {batch.shape}")
+    if not (1 <= batch.shape[0] <= requested_batch):  # short batches pass; empty or oversized do not
+        raise ValueError(f"channel batch size must be in [1, {requested_batch}], got {batch.shape[0]}")
+    if not (np.isfinite(batch).all() and (batch > 0.0).all()):  # magnitudes must be strictly positive
+        raise ValueError("h_magnitudes must contain only finite positive values")
+    return batch
+
+
+def _summarize_weighted_event_run(  # turn accumulated importance-sampling totals into a result dict
+    *,
+    event_weights: list[float],
+    total_weight: float,
+    total_samples: int,
+    contributions: list[float],
+    min_events: int,
+) -> dict[str, float | bool]:
+    if total_samples <= 0 or not np.isfinite(total_weight) or total_weight <= 0.0:
+        raise ValueError("evaluator-owned simulation produced no positive total weight")
+
+    if event_weights:
+        event_sum = float(np.sum(event_weights))
+        ratio = event_sum / total_weight  # weight carried by error events over all weight
+        ratio_log = float(math.log(max(ratio, ERR_RATIO_ABS_TOL)))  # floor the ratio so log() stays finite
+        contribution_arr = np.asarray(contributions, dtype=np.float64)
+        actual_std = float(np.std(contribution_arr / (total_weight / total_samples)) / math.sqrt(total_samples))  # std of mean-weight-normalized contributions over sqrt(sample count)
+        converged = bool(len(event_weights) >= min_events and actual_std <= TARGET_STD + STD_TOL)
+    else:  # no error events observed: fixed sentinel values, never converged
+        ratio = 0.0
+        ratio_log = -20.0
+        actual_std = float("inf")
+        converged = False
+
+    return {
+        "ratio": ratio,
+        "ratio_log": ratio_log,
+        "total_samples": float(total_samples),
+        "actual_std": actual_std,
+        "converged": converged,
+        "event_count": float(len(event_weights)),
+    }
+
+
+def _run_evaluator_owned_simulation(sampler: Any, channel: Any, *, seed: int) -> dict[str, float | bool]:  # evaluator-side estimation loop; the sampler only supplies draws and their biased log-pdf
+    rng = Generator(Philox(seed + 10_000))  # used only for the Bernoulli error draws below
+    total_weight = 0.0
+    total_samples = 0
+    event_weights: list[float] = []
+    contributions: list[float] = []
+    sigma_h = float(channel.sigma_h)
+
+    while total_samples < MAX_SAMPLES:  # stops at the sample budget unless convergence breaks out first
+        requested_batch = min(BATCH_SIZE, MAX_SAMPLES - total_samples)  # final batch is cut to the remaining budget
+        try:
+            h_magnitudes, log_pdf_biased = sampler.sample(
+                num_branches=channel.num_branches,
+                batch_size=requested_batch,
+                sigma_h=sigma_h,
+            )
+        except Exception as e:
+            raise RuntimeError(f"sample 执行失败: {e}") from e
+
+        h_magnitudes = _as_channel_batch(
+            h_magnitudes,
+            expected_branches=channel.num_branches,
+            requested_batch=requested_batch,
+        )
+        batch_size_actual = int(h_magnitudes.shape[0])  # the sampler may return fewer rows than requested
+        log_pdf_biased = _as_1d_float_array(log_pdf_biased, name="log_pdf_biased", expected_len=batch_size_actual)
+
+        log_pdf_true = np.sum(  # log of h / sigma^2 * exp(-h^2 / (2 sigma^2)), summed over branches
+            -h_magnitudes**2 / (2 * sigma_h**2) - np.log(sigma_h**2) + np.log(h_magnitudes),
+            axis=1,
+        )
+        if not np.all(np.isfinite(log_pdf_true)):
+            raise ValueError("true log pdf contains non-finite values")
+
+        log_weights = np.clip(log_pdf_true - log_pdf_biased, -LOG_WEIGHT_CLIP, LOG_WEIGHT_CLIP)  # bound the log importance ratio before exponentiating
+        weights = np.exp(log_weights)
+        if not np.all(np.isfinite(weights)) or np.any(weights < 0.0):
+            raise ValueError("importance weights must be finite and non-negative")
+
+        combined_snr = channel.combine_snr(h_magnitudes, DIVERSITY_TYPE, SNR_DB)
+        if combined_snr.shape != (batch_size_actual,) or not np.all(np.isfinite(combined_snr)):
+            raise ValueError("combined SNR values must be finite with shape (batch,)")
+
+        ber = np.asarray(channel.compute_ber(combined_snr, MODULATION), dtype=np.float64)
+        if ber.shape != (batch_size_actual,) or not np.all(np.isfinite(ber)):
+            raise ValueError("BER values must be finite with shape (batch,)")
+        ber = np.clip(ber, 0.0, 1.0)
+        error_draws = rng.random(batch_size_actual) < ber  # one error indicator per sample with probability ber
+
+        for i in range(batch_size_actual):  # sequential accumulation; every sample adds to total_weight
+            is_error = bool(error_draws[i])
+            weight = float(weights[i])
+            total_weight += weight
+            contributions.append(weight if is_error else 0.0)
+            if is_error:
+                event_weights.append(weight)
+
+        total_samples += batch_size_actual
+        if len(event_weights) >= MIN_ERRORS:  # convergence is only checked once enough error events exist
+            interim = _summarize_weighted_event_run(
+                event_weights=event_weights,
+                total_weight=total_weight,
+                total_samples=total_samples,
+                contributions=contributions,
+                min_events=MIN_ERRORS,
+            )
+            if bool(interim["converged"]):
+                break
+
+    return _summarize_weighted_event_run(
+        event_weights=event_weights,
+        total_weight=total_weight,
+        total_samples=total_samples,
+        contributions=contributions,
+        min_events=MIN_ERRORS,
+    )
+
+
def _build_channel(repo_root: Path):
RayleighFadingChannel = _import_channel_model(repo_root)
return RayleighFadingChannel(num_branches=NUM_BRANCHES, sigma_h=1.0)
@@ -269,43 +403,29 @@ def evaluate(program_path: str, *, repo_root: Path | None = None):
except Exception as e:
raise RuntimeError(f"DeepFadeSampler 初始化失败: {e}") from e
- if not hasattr(sampler, "simulate_variance_controlled"):
- raise AttributeError("DeepFadeSampler 缺少 simulate_variance_controlled 方法")
+ if not hasattr(sampler, "sample"):
+ raise AttributeError("DeepFadeSampler 缺少 sample 方法")
t0 = time.time()
- try:
- result = sampler.simulate_variance_controlled(
- channel_model=channel,
- diversity_type=DIVERSITY_TYPE,
- modulation=MODULATION,
- snr_db=SNR_DB,
- target_std=TARGET_STD,
- max_samples=MAX_SAMPLES,
- batch_size=BATCH_SIZE,
- min_errors=MIN_ERRORS,
- )
- except Exception as e:
- raise RuntimeError(f"simulate_variance_controlled 执行失败: {e}") from e
+ result = _run_evaluator_owned_simulation(sampler, channel, seed=rep)
dt = time.time() - t0
- normalized = _normalize_result(result)
- validated = _validate_result(normalized)
- err_rate_log = float(validated["err_rate_log"])
+ err_rate_log = float(result["ratio_log"])
runtimes.append(float(dt))
err_logs.append(err_rate_log)
- ratios.append(float(validated["err_ratio"]))
- samples.append(float(validated["total_samples"]))
- stds.append(float(validated["actual_std"]))
- converged_flags.append(1.0 if bool(validated["converged"]) else 0.0)
+ ratios.append(float(result["ratio"]))
+ samples.append(float(result["total_samples"]))
+ stds.append(float(result["actual_std"]))
+ converged_flags.append(1.0 if bool(result["converged"]) else 0.0)
repetition_diagnostics.append({
"repeat": rep,
"runtime_s": float(dt),
- "err_ratio": float(validated["err_ratio"]),
+ "err_ratio": float(result["ratio"]),
"err_rate_log": err_rate_log,
- "total_samples": float(validated["total_samples"]),
- "actual_std": float(validated["actual_std"]),
- "converged": bool(validated["converged"]),
+ "total_samples": float(result["total_samples"]),
+ "actual_std": float(result["actual_std"]),
+ "converged": bool(result["converged"]),
})
runtime_median = float(np.median(runtimes))
@@ -313,7 +433,7 @@ def evaluate(program_path: str, *, repo_root: Path | None = None):
err_log_ratio = float(abs(err_log_median - R0_LOG_DEV))
actual_std_median = float(np.nanmedian(stds))
converged_rate = float(np.mean(converged_flags))
- variance_ok = actual_std_median <= TARGET_STD + ERR_RATIO_ABS_TOL
+ variance_ok = actual_std_median <= TARGET_STD + STD_TOL
convergence_ok = math.isclose(converged_rate, 1.0, abs_tol=ERR_RATIO_ABS_TOL)
valid = float(err_log_ratio < EPSILON and variance_ok and convergence_ok)
@@ -335,12 +455,18 @@ def evaluate(program_path: str, *, repo_root: Path | None = None):
"convergence_ok": 1.0 if convergence_ok else 0.0,
"snr_db": SNR_DB,
})
+ artifacts["validity_reason"] = (
+ "ok" if valid > 0 else f"anchor_ok={err_log_ratio < EPSILON},variance_ok={bool(variance_ok)},convergence_ok={bool(convergence_ok)}"
+ )
artifacts["dev_constants"] = json.dumps({
"snr_db": SNR_DB,
"target_std": TARGET_STD,
"max_samples": MAX_SAMPLES,
"batch_size": BATCH_SIZE,
"epsilon": EPSILON,
+ "std_tol": STD_TOL,
+ "log_weight_clip": LOG_WEIGHT_CLIP,
+ "simulation_owner": "evaluator",
"r0_dev": R0_DEV,
"t0_dev": T0_DEV,
"repeats": REPEATS,
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/README.md b/benchmarks/ComputerSystems/DuckDBIndexSelection/README.md
new file mode 100644
index 00000000..7a76b4a5
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/README.md
@@ -0,0 +1,49 @@
+# DuckDB Index Selection
+
+Choose a small whitelist subset of DuckDB indexes for a workload family and minimize hidden-case average runtime.
+
+## What Changed
+
+- The task now evaluates `PUBLIC_CASES` and `HIDDEN_CASES` instead of one frozen workload.
+- The baseline is a simple heuristic index selector, not an empty placeholder.
+- The evaluator scores hidden-case average runtime rather than one manifest.
+
+## What You Edit
+
+- Target file: `scripts/init.py`
+- Entry point: `select_indexes(workload_manifest)`
+
+## Source of Truth
+
+- `Task.md`: full task contract
+- `Task_zh-CN.md`: Chinese task contract
+- `runtime/problem.py`: case family and runtime helper
+- `baseline/solution.py`: heuristic baseline
+- `verification/evaluator.py`: local evaluator
+
+## Environment
+
+From repository root:
+
+```bash
+pip install -r frontier_eval/requirements.txt
+pip install -r benchmarks/ComputerSystems/DuckDBIndexSelection/verification/requirements.txt
+```
+
+## Quick Run
+
+```bash
+python benchmarks/ComputerSystems/DuckDBIndexSelection/verification/evaluator.py \
+ benchmarks/ComputerSystems/DuckDBIndexSelection/scripts/init.py \
+ --metrics-out /tmp/DuckDBIndexSelection_metrics.json
+```
+
+## Main Metrics
+
+- `combined_score = -hidden_avg_runtime_s`
+- `valid`
+- `public_avg_runtime_s`
+- `hidden_avg_runtime_s`
+- `baseline_hidden_avg_runtime_s`
+
+
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/README_zh-CN.md b/benchmarks/ComputerSystems/DuckDBIndexSelection/README_zh-CN.md
new file mode 100644
index 00000000..69628af0
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/README_zh-CN.md
@@ -0,0 +1,46 @@
+# DuckDB 索引选择
+
+在一组 DuckDB workload 上选择少量白名单索引,并尽量降低 hidden case 的平均运行时间。
+
+## 本轮同步后的变化
+
+- 评测已从单一 workload 改成 `PUBLIC_CASES + HIDDEN_CASES`。
+- baseline 现在是启发式索引选择器,不再是空实现。
+- 分数改为 hidden case 平均运行时间,而不是单个 manifest。
+
+## 你会改的文件
+
+- 目标文件:`scripts/init.py`
+- 入口函数:`select_indexes(workload_manifest)`
+
+## 先看哪里
+
+- `Task.md` / `Task_zh-CN.md`:任务契约
+- `runtime/problem.py`:case family 与运行辅助逻辑
+- `baseline/solution.py`:启发式 baseline
+- `verification/evaluator.py`:本地评测入口
+
+## 环境准备
+
+```bash
+pip install -r frontier_eval/requirements.txt
+pip install -r benchmarks/ComputerSystems/DuckDBIndexSelection/verification/requirements.txt
+```
+
+## 快速运行
+
+```bash
+python benchmarks/ComputerSystems/DuckDBIndexSelection/verification/evaluator.py \
+ benchmarks/ComputerSystems/DuckDBIndexSelection/scripts/init.py \
+ --metrics-out /tmp/DuckDBIndexSelection_metrics.json
+```
+
+## 主要指标
+
+- `combined_score = -hidden_avg_runtime_s`
+- `valid`
+- `public_avg_runtime_s`
+- `hidden_avg_runtime_s`
+- `baseline_hidden_avg_runtime_s`
+
+
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/Task.md b/benchmarks/ComputerSystems/DuckDBIndexSelection/Task.md
new file mode 100644
index 00000000..b321f0c7
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/Task.md
@@ -0,0 +1,51 @@
+# DuckDB Index Selection Task
+
+## Problem
+
+Choose a small whitelist subset of DuckDB indexes for an analytical workload family and minimize hidden-case average runtime.
+
+This benchmark is no longer a single frozen lookup workload. The evaluator now uses multiple public and hidden workload manifests that vary query mix, recency filters, and lookup intensity. Good submissions should choose indexes that generalize across these manifests rather than overfitting one case.
+
+## What Is Frozen
+
+- The local DuckDB schema and data generator in `runtime/duckdb_local_workload.py`.
+- The legal whitelist of index names and per-case index budget in each workload manifest.
+- The timing protocol: create the selected indexes, warm up once, then time repeated workload execution for each case.
+
+## Submission Contract
+
+Submit one Python file that defines:
+
+```python
+def select_indexes(workload_manifest):
+ ...
+```
+
+Return a list of whitelist index names. A dict with key `indexes` is also accepted.
+
+## Evaluation
+
+1. Load `PUBLIC_CASES` and `HIDDEN_CASES` from `runtime/problem.py`.
+2. For each case, pass the case-specific manifest into `select_indexes(...)`.
+3. Create the selected indexes, run the case workload, and measure total runtime.
+4. Aggregate public and hidden runtimes separately; scoring uses the hidden average.
+
+## Metrics
+
+- `combined_score`: `-hidden_avg_runtime_s`
+- `valid`: `1.0` only if all cases execute successfully and every selected index is legal
+- `public_avg_runtime_s`
+- `hidden_avg_runtime_s`
+- `baseline_hidden_avg_runtime_s`
+- `num_public_cases`
+- `num_hidden_cases`
+
+## Invalid Submissions
+
+- `select_indexes(...)` is missing or crashes
+- The return value cannot be parsed into a list of names
+- Any selected name is outside the whitelist
+- Any case exceeds its index budget
+- Index creation or workload execution fails on any public or hidden case
+
+
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/Task_zh-CN.md b/benchmarks/ComputerSystems/DuckDBIndexSelection/Task_zh-CN.md
new file mode 100644
index 00000000..453f3a58
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/Task_zh-CN.md
@@ -0,0 +1,51 @@
+# DuckDB 索引选择
+
+## 任务概览
+
+在一组分析型 DuckDB workload 上,从白名单中选择少量索引,尽量降低 hidden case 的平均运行时间。
+
+这个 benchmark 不再是单一冻结 workload。评测会在 `runtime/problem.py` 中定义的多组 `public` 与 `hidden` manifest 上运行,它们会改变查询混合、时间过滤和 lookup 强度。好的策略应当能在多组 manifest 上稳定工作,而不是只对某一个 case 调参。
+
+## 哪些部分是冻结的
+
+- `runtime/duckdb_local_workload.py` 中的本地 DuckDB schema 与数据生成逻辑。
+- 每个 workload manifest 中给出的合法索引白名单,以及该 case 的索引预算上限。
+- 固定计时协议:创建所选索引,先做一次 warm-up,再对该 case 重复执行 workload 并计时。
+
+## 提交接口
+
+提交一个 Python 文件,定义:
+
+```python
+def select_indexes(workload_manifest):
+ ...
+```
+
+返回索引名列表;也接受带 `indexes` 字段的字典。
+
+## 评测流程
+
+1. 从 `runtime/problem.py` 读取 `PUBLIC_CASES` 与 `HIDDEN_CASES`。
+2. 对每个 case,把 case-specific manifest 传入 `select_indexes(...)`。
+3. 创建所选索引并运行该 case workload,测量总耗时。
+4. 分别聚合 public 与 hidden 耗时;最终分数使用 hidden 平均耗时。
+
+## 指标
+
+- `combined_score`:`-hidden_avg_runtime_s`
+- `valid`:只有所有 case 都成功执行且索引名全部合法时才为 `1.0`
+- `public_avg_runtime_s`
+- `hidden_avg_runtime_s`
+- `baseline_hidden_avg_runtime_s`
+- `num_public_cases`
+- `num_hidden_cases`
+
+## 判为无效的情况
+
+- 缺少 `select_indexes(...)`,或函数执行报错
+- 返回值无法解析为索引名列表
+- 任意索引名不在白名单中
+- 任意 case 超过该 case 的索引预算
+- 任意 public 或 hidden case 在建索引或执行 workload 时失败
+
+
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/baseline/solution.py b/benchmarks/ComputerSystems/DuckDBIndexSelection/baseline/solution.py
new file mode 100644
index 00000000..9bb582b9
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/baseline/solution.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+
+def select_indexes(workload_manifest):
+    """Pick whitelist index names for one workload manifest.
+
+    Rules are evaluated in priority order and the result is cut to ``max_indexes``.
+    """
+    budget = int(workload_manifest.get("max_indexes", 2))
+    priority = str(workload_manifest.get("priority_value", "1-URGENT"))
+    n_orders = int(workload_manifest.get("order_sample", 0))
+    n_customers = int(workload_manifest.get("customer_sample", 0))
+    min_date = str(workload_manifest.get("min_order_date", ""))
+    # (applies, index name) pairs, highest priority first.
+    rules = (
+        (n_customers >= n_orders, "idx_orders_cust"),
+        (n_orders >= n_customers, "idx_lineitem_order"),
+        (priority in {"1-URGENT", "2-HIGH"}, "idx_orders_priority"),
+        ("1998" in min_date, "idx_orders_date"),
+        (budget >= 3, "idx_customer_segment"),
+    )
+    ranked = [index_name for applies, index_name in rules if applies]
+    # dict.fromkeys drops duplicates while keeping first-seen order.
+    return list(dict.fromkeys(ranked))[:budget]
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/agent_files.txt b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/agent_files.txt
new file mode 100644
index 00000000..1d2eb069
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/agent_files.txt
@@ -0,0 +1,6 @@
+Task.md
+Task_zh-CN.md
+README.md
+baseline/solution.py
+runtime/problem.py
+references/source_manifest.md
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/candidate_destination.txt b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/candidate_destination.txt
new file mode 100644
index 00000000..b9411b3d
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/candidate_destination.txt
@@ -0,0 +1 @@
+scripts/init.py
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/constraints.txt b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/constraints.txt
new file mode 100644
index 00000000..88b1935c
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/constraints.txt
@@ -0,0 +1,4 @@
+Edit only `scripts/init.py`.
+Modify only code between `# EVOLVE-BLOCK-START` and `# EVOLVE-BLOCK-END` in that file.
+Do not modify files under `baseline/`, `runtime/`, `references/`, or `verification/`.
+Keep outputs valid and finite.
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/eval_command.txt b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/eval_command.txt
new file mode 100644
index 00000000..fcba5e60
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/eval_command.txt
@@ -0,0 +1 @@
+{python} verification/evaluator.py {candidate} --metrics-out metrics.json
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/eval_cwd.txt b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/eval_cwd.txt
new file mode 100644
index 00000000..9c558e35
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/eval_cwd.txt
@@ -0,0 +1 @@
+.
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/initial_program.txt b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/initial_program.txt
new file mode 100644
index 00000000..b9411b3d
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/initial_program.txt
@@ -0,0 +1 @@
+scripts/init.py
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/readonly_files.txt b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/readonly_files.txt
new file mode 100644
index 00000000..8bb37291
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/frontier_eval/readonly_files.txt
@@ -0,0 +1,5 @@
+baseline/solution.py
+runtime/problem.py
+runtime/duckdb_local_workload.py
+verification/evaluator.py
+references/source_manifest.md
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/references/source_manifest.md b/benchmarks/ComputerSystems/DuckDBIndexSelection/references/source_manifest.md
new file mode 100644
index 00000000..b5b0db78
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/references/source_manifest.md
@@ -0,0 +1,10 @@
+# Source Manifest
+
+- Upstream engine: `DuckDB`
+- Upstream lineage:
+ - DuckDB benchmark and TPC-H documentation
+ - DuckDB SQL and index support
+- Schema lineage: this benchmark uses a local frozen relational workload with `customer`, `orders`, and `lineitem` tables modeled after the TPC-H schema family.
+- Data provenance: rows are generated deterministically inside DuckDB from fixed SQL formulas and a fixed schema; this is a benchmark-local synthetic dataset, not official TPC-H `dbgen` output.
+- Authenticity note: the schema and workload lineage are traceable to official DuckDB/TPC-H benchmarking materials, but the data itself is a local frozen synthetic asset used because online extension-based generation was not reliable in this environment.
+- License lineage: DuckDB is released under the MIT License.
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/runtime/duckdb_local_workload.py b/benchmarks/ComputerSystems/DuckDBIndexSelection/runtime/duckdb_local_workload.py
new file mode 100644
index 00000000..cb0da163
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/runtime/duckdb_local_workload.py
@@ -0,0 +1,391 @@
+from __future__ import annotations
+
+import math
+import time
+from typing import Any
+
+import duckdb
+
+
+CUSTOMER_COUNT = 20_000  # rows generated for the customer table
+ORDER_COUNT = 120_000  # rows generated for the orders table
+LINEITEM_COUNT = 600_000  # rows generated for the lineitem table
+
+CUSTOMER_KEYS = tuple(1 + ((i * 97) % CUSTOMER_COUNT) for i in range(1, 301))  # 300 fixed customer lookup keys
+ORDER_KEYS = tuple(1 + ((i * 193) % ORDER_COUNT) for i in range(1, 301))  # 300 fixed order lookup keys
+
+
+INDEX_CANDIDATES = {  # whitelist: index name -> CREATE INDEX statement
+    "idx_orders_cust": "CREATE INDEX idx_orders_cust ON orders(o_custkey)",
+    "idx_orders_date": "CREATE INDEX idx_orders_date ON orders(o_orderdate)",
+    "idx_lineitem_order": "CREATE INDEX idx_lineitem_order ON lineitem(l_orderkey)",
+    "idx_customer_segment": "CREATE INDEX idx_customer_segment ON customer(c_mktsegment)",
+    "idx_orders_priority": "CREATE INDEX idx_orders_priority ON orders(o_orderpriority)",
+}
+
+PREAGGREGATION_CANDIDATES = {  # whitelist: table name -> CREATE TABLE ... AS statement
+    "agg_quarter_segment_revenue": (
+        "CREATE TABLE agg_quarter_segment_revenue AS "
+        "SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket, "
+        "       c.c_mktsegment AS segment, "
+        "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+        "FROM customer c "
+        "JOIN orders o ON o.o_custkey = c.c_custkey "
+        "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+        "GROUP BY 1, 2"
+    ),
+    "agg_month_shipmode_revenue": (
+        "CREATE TABLE agg_month_shipmode_revenue AS "
+        "SELECT date_trunc('month', l.l_shipdate) AS month_bucket, "
+        "       l.l_shipmode AS shipmode, "
+        "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+        "FROM lineitem l "
+        "GROUP BY 1, 2"
+    ),
+    "agg_customer_year_revenue": (
+        "CREATE TABLE agg_customer_year_revenue AS "
+        "SELECT year(o.o_orderdate) AS revenue_year, "
+        "       c.c_custkey, "
+        "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+        "FROM customer c "
+        "JOIN orders o ON o.o_custkey = c.c_custkey "
+        "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+        "GROUP BY 1, 2"
+    ),
+    "agg_unused_priority_only": (  # no report query in this module reads this table
+        "CREATE TABLE agg_unused_priority_only AS "
+        "SELECT o.o_orderpriority, count(*) AS order_count "
+        "FROM orders o "
+        "GROUP BY 1"
+    ),
+}
+
+
+def build_connection() -> duckdb.DuckDBPyConnection:  # fresh in-memory database with customer, orders and lineitem tables
+    con = duckdb.connect(database=":memory:")
+    con.execute("PRAGMA threads=1")  # single-threaded execution; presumably to reduce timing variance
+    con.execute(
+        f"""
+        CREATE TABLE customer AS
+        SELECT i AS c_custkey,
+               'Customer #' || i AS c_name,
+               CASE i % 5
+                   WHEN 0 THEN 'BUILDING'
+                   WHEN 1 THEN 'AUTOMOBILE'
+                   WHEN 2 THEN 'HOUSEHOLD'
+                   WHEN 3 THEN 'FURNITURE'
+                   ELSE 'MACHINERY'
+               END AS c_mktsegment,
+               i % 25 AS c_nationkey
+        FROM range(1, {CUSTOMER_COUNT + 1}) t(i)
+        """
+    )
+    con.execute(
+        f"""
+        CREATE TABLE orders AS
+        SELECT i AS o_orderkey,
+               1 + ((i * 17) % {CUSTOMER_COUNT}) AS o_custkey,
+               DATE '1995-01-01' + (((i * 13) % 1460) * INTERVAL 1 DAY) AS o_orderdate,
+               100 + (((i * 37) % 100000) / 10.0) AS o_totalprice,
+               CASE i % 5
+                   WHEN 0 THEN '1-URGENT'
+                   WHEN 1 THEN '2-HIGH'
+                   WHEN 2 THEN '3-MEDIUM'
+                   WHEN 3 THEN '4-NOT SPECIFIED'
+                   ELSE '5-LOW'
+               END AS o_orderpriority
+        FROM range(1, {ORDER_COUNT + 1}) t(i)
+        """
+    )
+    con.execute(
+        f"""
+        CREATE TABLE lineitem AS
+        SELECT i AS l_lineitemkey,
+               1 + ((i * 7) % {ORDER_COUNT}) AS l_orderkey,
+               1 + ((i * 11) % 50000) AS l_partkey,
+               1 + ((i * 13) % 10000) AS l_suppkey,
+               1 + ((i * 5) % 50) AS l_quantity,
+               10 + (((i * 19) % 100000) / 20.0) AS l_extendedprice,
+               (((i * 3) % 10) / 100.0) AS l_discount,
+               DATE '1995-01-01' + (((i * 29) % 1460) * INTERVAL 1 DAY) AS l_shipdate,
+               CASE i % 5
+                   WHEN 0 THEN 'AIR'
+                   WHEN 1 THEN 'MAIL'
+                   WHEN 2 THEN 'RAIL'
+                   WHEN 3 THEN 'TRUCK'
+                   ELSE 'SHIP'
+               END AS l_shipmode
+        FROM range(1, {LINEITEM_COUNT + 1}) t(i)
+        """
+    )
+    return con  # the caller owns the connection; nothing here closes it
+
+
+def normalize_name_list(value: Any, key: str) -> list[str]:
+    """Return the names in ``value`` as unique strings in first-seen order.
+
+    A dict is unwrapped through ``value[key]`` first; a missing key or a payload
+    that is not a list/tuple raises ValueError.
+    """
+    if isinstance(value, dict):
+        if key not in value:
+            raise ValueError(f"missing {key}")
+        value = value[key]
+    if not isinstance(value, (list, tuple)):
+        raise ValueError(f"{key} must be a list or tuple")
+    names = [str(item) for item in value]
+    # dict.fromkeys keeps the first occurrence of each name and preserves order.
+    return list(dict.fromkeys(names))
+
+
+def compare_results(lhs: list[tuple[Any, ...]], rhs: list[tuple[Any, ...]], tol: float = 1e-6) -> bool:
+    if len(lhs) != len(rhs):  # row counts must agree before any cell is compared
+        return False
+    for left_row, right_row in zip(lhs, rhs):
+        if len(left_row) != len(right_row):
+            return False
+        for left_value, right_value in zip(left_row, right_row):
+            if isinstance(left_value, float) or isinstance(right_value, float):
+                if left_value is None or right_value is None:  # SQL NULL vs number is a mismatch, not a TypeError
+                    return False
+                if not abs(float(left_value) - float(right_value)) <= tol:  # NaN and +/-inf never match
+                    return False
+            elif left_value != right_value:  # non-float cells (including NULL vs NULL) use plain equality
+                return False
+    return True
+
+
+def _index_keys(sample_size: int, source: tuple[int, ...]) -> tuple[int, ...]:
+    clamped = max(1, min(int(sample_size), len(source)))  # clamp to [1, len(source)]; 1 when source is empty
+    return tuple(source[:clamped])
+
+
+def run_index_workload(con: duckdb.DuckDBPyConnection, manifest: dict[str, Any]) -> float:  # runs the three lookup loops once and returns elapsed seconds
+    start_time = time.perf_counter()  # timer starts before key preparation, so that work is included
+    customer_keys = _index_keys(manifest.get("customer_sample", 80), CUSTOMER_KEYS)
+    order_keys = _index_keys(manifest.get("order_sample", 80), ORDER_KEYS)
+    urgent_customer_keys = _index_keys(manifest.get("urgent_customer_sample", 40), CUSTOMER_KEYS)
+    min_order_date = str(manifest.get("min_order_date", "1997-01-01"))
+    priority_value = str(manifest.get("priority_value", "1-URGENT"))
+
+    for customer_key in customer_keys:  # per-customer order total with a date lower bound
+        con.execute(
+            "SELECT sum(o_totalprice) "
+            "FROM orders "
+            "WHERE o_custkey = ? AND o_orderdate >= CAST(? AS DATE)",
+            [customer_key, min_order_date],
+        ).fetchone()
+    for order_key in order_keys:  # per-order discounted revenue from lineitem
+        con.execute(
+            "SELECT sum(l_extendedprice * (1 - l_discount)) "
+            "FROM lineitem "
+            "WHERE l_orderkey = ?",
+            [order_key],
+        ).fetchone()
+    for customer_key in urgent_customer_keys:  # per-customer count of orders with the requested priority
+        con.execute(
+            "SELECT count(*) "
+            "FROM customer c "
+            "JOIN orders o ON c.c_custkey = o.o_custkey "
+            "WHERE c.c_custkey = ? AND o.o_orderpriority = ?",
+            [customer_key, priority_value],
+        ).fetchone()
+    return time.perf_counter() - start_time  # query results are fetched and discarded; only time is reported
+
+
+def measure_index_design(selected_indexes: list[str], manifest: dict[str, Any]) -> dict[str, float | int]:  # validates the selection, builds the indexes, times the workload
+    allowed = tuple(manifest.get("candidate_indexes", tuple(sorted(INDEX_CANDIDATES))))
+    max_indexes = int(manifest.get("max_indexes", len(allowed)))
+    unknown = [name for name in selected_indexes if name not in allowed]
+    if unknown:
+        raise ValueError(f"unknown index names: {unknown}")
+    if len(selected_indexes) > max_indexes:
+        raise ValueError(f"too many indexes selected: {len(selected_indexes)} > {max_indexes}")
+
+    con = build_connection()  # table generation happens before the setup timer starts
+    start_setup = time.perf_counter()
+    for name in selected_indexes:
+        con.execute(INDEX_CANDIDATES[name])
+    setup_runtime = time.perf_counter() - start_setup  # index creation time only
+
+    workload_runtime = 0.0
+    repetitions = int(manifest.get("repetitions", 3))
+    run_index_workload(con, manifest)  # warm-up pass; its runtime is discarded
+    for _ in range(repetitions):
+        workload_runtime += run_index_workload(con, manifest)
+    return {
+        "setup_runtime_s": float(setup_runtime),
+        "workload_runtime_s": float(workload_runtime),
+        "total_runtime_s": float(setup_runtime + workload_runtime),
+        "selected_index_count": len(selected_indexes),
+    }
+
+
+def _report_quarter_segment(con: duckdb.DuckDBPyConnection, use_aggregate: bool, segment_filter: tuple[str, ...]) -> list[tuple[Any, ...]]:  # quarterly revenue per market segment
+    values = ", ".join(f"'{value}'" for value in segment_filter)  # NOTE(review): literals are spliced into the SQL text, so segment_filter must be trusted; an empty filter renders `IN ()`
+    if use_aggregate:  # read the pre-aggregated table instead of joining the base tables
+        return con.execute(
+            "SELECT quarter_bucket, segment, revenue "
+            "FROM agg_quarter_segment_revenue "
+            f"WHERE segment IN ({values}) "
+            "ORDER BY quarter_bucket, segment"
+        ).fetchall()
+    return con.execute(
+        "SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket, "
+        "       c.c_mktsegment AS segment, "
+        "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+        "FROM customer c "
+        "JOIN orders o ON o.o_custkey = c.c_custkey "
+        "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+        f"WHERE c.c_mktsegment IN ({values}) "
+        "GROUP BY 1, 2 "
+        "ORDER BY quarter_bucket, segment"
+    ).fetchall()
+
+
+def _report_month_shipmode(con: duckdb.DuckDBPyConnection, use_aggregate: bool, min_shipdate: str) -> list[tuple[Any, ...]]:  # monthly revenue per ship mode from min_shipdate onward
+    if use_aggregate:
+        return con.execute(
+            "SELECT month_bucket, shipmode, revenue "
+            "FROM agg_month_shipmode_revenue "
+            "WHERE month_bucket >= CAST(? AS DATE) "  # NOTE(review): filters whole month buckets; equals the raw path only when min_shipdate is a month start
+            "ORDER BY month_bucket, shipmode",
+            [min_shipdate],
+        ).fetchall()
+    return con.execute(
+        "SELECT date_trunc('month', l.l_shipdate) AS month_bucket, "
+        "       l.l_shipmode AS shipmode, "
+        "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+        "FROM lineitem l "
+        "WHERE l.l_shipdate >= CAST(? AS DATE) "  # raw path filters individual ship dates before grouping
+        "GROUP BY 1, 2 "
+        "ORDER BY month_bucket, shipmode",
+        [min_shipdate],
+    ).fetchall()
+
+
+def _report_customer_year(con: duckdb.DuckDBPyConnection, use_aggregate: bool, revenue_year: int, limit_rows: int) -> list[tuple[Any, ...]]:  # top customers by revenue for one year
+    if use_aggregate:
+        return con.execute(
+            "SELECT revenue_year, c_custkey, revenue "
+            "FROM agg_customer_year_revenue "
+            "WHERE revenue_year = ? "
+            "ORDER BY revenue DESC, c_custkey "  # c_custkey breaks revenue ties so LIMIT is deterministic
+            "LIMIT ?",
+            [revenue_year, limit_rows],
+        ).fetchall()
+    return con.execute(
+        "SELECT year(o.o_orderdate) AS revenue_year, "
+        "       c.c_custkey, "
+        "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+        "FROM customer c "
+        "JOIN orders o ON o.o_custkey = c.c_custkey "
+        "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+        "GROUP BY 1, 2 "
+        "HAVING year(o.o_orderdate) = ? "  # the year filter is written as HAVING, i.e. after grouping
+        "ORDER BY revenue DESC, c.c_custkey "
+        "LIMIT ?",
+        [revenue_year, limit_rows],
+    ).fetchall()
+
+
+def _run_preaggregation_reports(  # runs the three reports once; returns (elapsed seconds, their result sets)
+    con: duckdb.DuckDBPyConnection,
+    selected: set[str],
+    manifest: dict[str, Any],
+) -> tuple[float, tuple[list[tuple[Any, ...]], ...]]:
+    start_time = time.perf_counter()
+    result_a = _report_quarter_segment(
+        con,
+        "agg_quarter_segment_revenue" in selected,  # use the aggregate table only if it was selected
+        tuple(manifest.get("segment_filter", ("BUILDING", "AUTOMOBILE", "HOUSEHOLD"))),
+    )
+    result_b = _report_month_shipmode(
+        con,
+        "agg_month_shipmode_revenue" in selected,
+        str(manifest.get("min_shipdate", "1997-01-01")),
+    )
+    result_c = _report_customer_year(
+        con,
+        "agg_customer_year_revenue" in selected,
+        int(manifest.get("revenue_year", 1998)),
+        int(manifest.get("limit_rows", 100)),
+    )
+    runtime = time.perf_counter() - start_time  # covers all three reports, including manifest parsing
+    return runtime, (result_a, result_b, result_c)
+
+
+def measure_preaggregation_design(selected_preaggregations: list[str], manifest: dict[str, Any]) -> dict[str, float | int]:  # validates the selection, checks result equality, then times both paths
+    allowed = tuple(manifest.get("candidate_preaggregations", tuple(sorted(PREAGGREGATION_CANDIDATES))))
+    max_preaggregations = int(manifest.get("max_preaggregations", len(allowed)))
+    unknown = [name for name in selected_preaggregations if name not in allowed]
+    if unknown:
+        raise ValueError(f"unknown pre-aggregation names: {unknown}")
+    if len(selected_preaggregations) > max_preaggregations:
+        raise ValueError(
+            f"too many pre-aggregations selected: {len(selected_preaggregations)} > {max_preaggregations}"
+        )
+
+    baseline_con = build_connection()  # separate databases so the baseline never sees aggregate tables
+    candidate_con = build_connection()
+    start_setup = time.perf_counter()
+    for name in selected_preaggregations:
+        candidate_con.execute(PREAGGREGATION_CANDIDATES[name])
+    setup_runtime = time.perf_counter() - start_setup  # aggregate-table creation time only
+
+    _, baseline_results = _run_preaggregation_reports(baseline_con, set(), manifest)
+    _, candidate_results = _run_preaggregation_reports(candidate_con, set(selected_preaggregations), manifest)
+    if any(not compare_results(left, right) for left, right in zip(candidate_results, baseline_results)):
+        raise ValueError("candidate pre-aggregation selection changed the query results")
+
+    repetitions = int(manifest.get("repetitions", 3))
+    repeated_baseline_runtime = 0.0
+    repeated_candidate_runtime = 0.0
+    _run_preaggregation_reports(baseline_con, set(), manifest)  # untimed extra pass on each connection
+    _run_preaggregation_reports(candidate_con, set(selected_preaggregations), manifest)
+    for _ in range(repetitions):  # baseline and candidate runs are interleaved within each repetition
+        extra_runtime, _ = _run_preaggregation_reports(baseline_con, set(), manifest)
+        repeated_baseline_runtime += extra_runtime
+        extra_runtime, _ = _run_preaggregation_reports(candidate_con, set(selected_preaggregations), manifest)
+        repeated_candidate_runtime += extra_runtime
+
+    return {
+        "setup_runtime_s": float(setup_runtime),
+        "candidate_workload_runtime_s": float(repeated_candidate_runtime),
+        "candidate_total_runtime_s": float(setup_runtime + repeated_candidate_runtime),
+        "baseline_total_runtime_s": float(repeated_baseline_runtime),
+        "selected_preaggregation_count": len(selected_preaggregations),
+    }
+
+
+def measure_query_rewrite(sql: str, manifest: dict[str, Any]) -> dict[str, Any]:  # checks a rewritten query against the baseline, then times both
+    sql = str(sql).strip()
+    if not sql:
+        raise ValueError("query must not be empty")
+    baseline_sql = str(manifest["baseline_sql"]).strip()  # required key; KeyError if the manifest lacks it
+    repetitions = int(manifest.get("repetitions", 3))
+
+    baseline_con = build_connection()
+    candidate_con = build_connection()  # NOTE(review): the candidate SQL is executed as given on this connection
+    baseline_rows = baseline_con.execute(baseline_sql).fetchall()
+    candidate_rows = candidate_con.execute(sql).fetchall()
+    if not compare_results(candidate_rows, baseline_rows):  # positional comparison, so row order must match too
+        raise ValueError("candidate query result does not match the baseline result")
+
+    baseline_con.execute(baseline_sql).fetchall()  # untimed warm-up
+    baseline_start = time.perf_counter()
+    for _ in range(repetitions):
+        baseline_con.execute(baseline_sql).fetchall()
+    baseline_runtime = time.perf_counter() - baseline_start
+
+    candidate_con.execute(sql).fetchall()  # untimed warm-up
+    candidate_start = time.perf_counter()
+    for _ in range(repetitions):
+        candidate_rows = candidate_con.execute(sql).fetchall()
+    candidate_runtime = time.perf_counter() - candidate_start
+
+    return {
+        "baseline_runtime_s": float(baseline_runtime),
+        "candidate_runtime_s": float(candidate_runtime),
+        "row_count": len(candidate_rows),
+    }
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/runtime/problem.py b/benchmarks/ComputerSystems/DuckDBIndexSelection/runtime/problem.py
new file mode 100644
index 00000000..690e0708
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/runtime/problem.py
@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+try:
+ from .duckdb_local_workload import INDEX_CANDIDATES, measure_index_design, normalize_name_list
+except ImportError:
+ from benchmarks.ComputerSystems.duckdb_local_workload import INDEX_CANDIDATES, measure_index_design, normalize_name_list
+
+
+# Manifest keys that vary from case to case, in the order each row lists them.
+_CASE_COLUMNS = (
+    "max_indexes",
+    "customer_sample",
+    "order_sample",
+    "urgent_customer_sample",
+    "priority_value",
+    "min_order_date",
+    "repetitions",
+)
+
+
+def _build_cases(rows):
+    """Expand compact ``(case_id, *values)`` rows into case manifests.
+
+    Every case offers the full, sorted index whitelist as ``candidate_indexes``.
+    """
+    whitelist = tuple(sorted(INDEX_CANDIDATES))
+    return tuple(
+        {"case_id": case_id, "candidate_indexes": whitelist, **dict(zip(_CASE_COLUMNS, values))}
+        for case_id, *values in rows
+    )
+
+
+# Cases the evaluator reports under the public metrics.
+PUBLIC_CASES = _build_cases(
+    (
+        ("public_customer_join", 2, 60, 50, 30, "1-URGENT", "1997-01-01", 3),
+        ("public_order_lookup", 2, 40, 80, 20, "2-HIGH", "1996-01-01", 3),
+        ("public_priority_mix", 3, 90, 40, 50, "1-URGENT", "1998-01-01", 2),
+    )
+)
+
+# Cases whose average runtime determines the score.
+HIDDEN_CASES = _build_cases(
+    (
+        ("hidden_deep_history", 2, 55, 70, 35, "3-MEDIUM", "1995-06-01", 3),
+        ("hidden_recent_priority", 2, 75, 60, 45, "1-URGENT", "1998-06-01", 2),
+        ("hidden_lookup_heavy", 2, 25, 120, 20, "5-LOW", "1997-01-01", 3),
+        ("hidden_balanced", 3, 70, 70, 40, "2-HIGH", "1996-07-01", 2),
+        ("hidden_customer_focus", 2, 110, 35, 60, "1-URGENT", "1997-04-01", 2),
+    )
+)
+
+# Default manifest: a private copy of the first public case.
+WORKLOAD_MANIFEST = dict(PUBLIC_CASES[0])
+
+
+def load_instance():
+    """Return a fresh shallow copy of the default workload manifest."""
+    instance = {}
+    instance.update(WORKLOAD_MANIFEST)
+    return instance
+
+
+def evaluate_selection(selection, manifest: dict | None = None):
+    """Measure an index selection against a case manifest.
+
+    Args:
+        selection: Index names as a list/tuple, or a dict with an ``indexes`` key.
+        manifest: Case manifest to evaluate on; when omitted the module-level
+            ``WORKLOAD_MANIFEST`` is used, otherwise a shallow copy of the
+            argument is used.
+
+    Returns:
+        Whatever ``measure_index_design`` returns for the normalized selection.
+    """
+    if manifest is None:
+        active_manifest = WORKLOAD_MANIFEST
+    else:
+        active_manifest = dict(manifest)
+    index_names = normalize_name_list(selection, "indexes")
+    return measure_index_design(index_names, active_manifest)
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/scripts/init.py b/benchmarks/ComputerSystems/DuckDBIndexSelection/scripts/init.py
new file mode 100644
index 00000000..cf1bad8b
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/scripts/init.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+
+def _is_repo_root(path: Path) -> bool:
+    """Tell whether ``path`` contains both a ``benchmarks`` and a ``frontier_eval`` directory."""
+    if not (path / "benchmarks").is_dir():
+        return False
+    return (path / "frontier_eval").is_dir()
+
+
+def _ensure_import_path() -> None:
+    """Put the repository root on ``sys.path``, or the benchmark root as a fallback.
+
+    Walks upward from this file looking for a directory accepted by
+    ``_is_repo_root``. When none is found, the directory two levels above this
+    file is used instead. The chosen path is inserted at the front of
+    ``sys.path`` unless it is already present.
+    """
+    script = Path(__file__).resolve()
+    root = next(
+        (candidate for candidate in (script.parent, *script.parents) if _is_repo_root(candidate)),
+        None,
+    )
+    if root is None:
+        root = script.parents[1]
+    entry = str(root)
+    if entry not in sys.path:
+        sys.path.insert(0, entry)
+
+
+# Extend sys.path first so that one of the two import forms below can resolve.
+_ensure_import_path()
+
+# Prefer the fully qualified package path; fall back to benchmark-relative
+# imports when that package path cannot be found.
+try:
+ from benchmarks.ComputerSystems.DuckDBIndexSelection.baseline.solution import select_indexes as _baseline_select_indexes
+ from benchmarks.ComputerSystems.DuckDBIndexSelection.runtime.problem import WORKLOAD_MANIFEST, evaluate_selection
+except ModuleNotFoundError:
+ from baseline.solution import select_indexes as _baseline_select_indexes
+ from runtime.problem import WORKLOAD_MANIFEST, evaluate_selection
+
+
+# EVOLVE-BLOCK-START
+def select_indexes(workload_manifest):
+    """Return the index selection for ``workload_manifest``; currently delegates to the baseline selector."""
+    chosen = _baseline_select_indexes(workload_manifest)
+    return chosen
+# EVOLVE-BLOCK-END
+
+
+# Script entry point: evaluate the current selector on the default manifest
+# and print the resulting measurement dict.
+if __name__ == "__main__":
+ print(evaluate_selection(select_indexes(WORKLOAD_MANIFEST)))
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/verification/evaluator.py b/benchmarks/ComputerSystems/DuckDBIndexSelection/verification/evaluator.py
new file mode 100644
index 00000000..0da234c6
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/verification/evaluator.py
@@ -0,0 +1,105 @@
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import runpy
+import traceback
+from pathlib import Path
+
+
+def _repo_root() -> Path:
+    """Locate the repository root by walking upward from this file.
+
+    A directory counts as the root when it contains both ``benchmarks`` and
+    ``frontier_eval`` sub-directories. If no ancestor matches, the resolved
+    current working directory is returned instead.
+    """
+    start = Path(__file__).resolve()
+    for directory in (start.parent, *start.parents):
+        has_benchmarks = (directory / "benchmarks").is_dir()
+        if has_benchmarks and (directory / "frontier_eval").is_dir():
+            return directory
+    return Path.cwd().resolve()
+
+
+def _benchmark_root() -> Path:
+    """Return the directory two levels above this file (the parent of its containing folder)."""
+    this_file = Path(__file__).resolve()
+    ancestors = this_file.parents
+    return ancestors[1]
+
+
+def _ensure_import_path() -> None:
+    """Insert the repository root and the benchmark root at the front of ``sys.path``.
+
+    Paths already present are left where they are. The benchmark root is
+    handled last, so when both are inserted it ends up ahead of the
+    repository root.
+    """
+    import sys
+
+    entries = [str(_repo_root()), str(_benchmark_root())]
+    for entry in entries:
+        if entry in sys.path:
+            continue
+        sys.path.insert(0, entry)
+
+
+# Extend sys.path first so that one of the two import forms below can resolve.
+_ensure_import_path()
+
+# Prefer the fully qualified package path; fall back to benchmark-relative
+# imports when that package path cannot be found.
+try:
+ from benchmarks.ComputerSystems.DuckDBIndexSelection.baseline.solution import select_indexes as baseline_select_indexes
+ from benchmarks.ComputerSystems.DuckDBIndexSelection.runtime.problem import (
+ HIDDEN_CASES,
+ PUBLIC_CASES,
+ evaluate_selection,
+ )
+except ModuleNotFoundError:
+ from baseline.solution import select_indexes as baseline_select_indexes
+ from runtime.problem import HIDDEN_CASES, PUBLIC_CASES, evaluate_selection
+
+
+def _run_case(select_indexes, case):
+    """Ask a selector for its choice on one case and measure that choice.
+
+    The selector receives a shallow copy of ``case``, so changes it makes to
+    the top-level keys do not reach the manifest used for measurement.
+    """
+    manifest_copy = dict(case)
+    chosen = select_indexes(manifest_copy)
+    return evaluate_selection(chosen, case)
+
+
+def evaluate(program_path: str):
+    """Score a candidate program on the public and hidden index-selection cases.
+
+    Args:
+        program_path: Path to a Python file that defines
+            ``select_indexes(workload_manifest)``.
+
+    Returns:
+        ``(metrics, artifacts)``. ``metrics`` always carries every key. On any
+        failure it keeps ``valid == 0.0`` and ``combined_score == -1e18`` and
+        ``artifacts["error_message"]`` says why. On success ``combined_score``
+        is the negated hidden-case average of ``total_runtime_s``.
+    """
+    metrics = {
+        "combined_score": -1e18,
+        "valid": 0.0,
+        "public_avg_runtime_s": 0.0,
+        "hidden_avg_runtime_s": 0.0,
+        "baseline_hidden_avg_runtime_s": 0.0,
+        "num_public_cases": 0.0,
+        "num_hidden_cases": 0.0,
+    }
+    artifacts = {}
+
+    # Loading the candidate executes arbitrary submitted code. A missing file,
+    # a syntax error or an exception at import time must yield an invalid
+    # result rather than crash the evaluator before metrics can be written.
+    try:
+        namespace = runpy.run_path(str(Path(program_path).expanduser().resolve()), run_name="candidate_program")
+    except Exception:
+        artifacts["error_message"] = traceback.format_exc()
+        return metrics, artifacts
+
+    select_indexes = namespace.get("select_indexes")
+    if not callable(select_indexes):
+        artifacts["error_message"] = "candidate must define select_indexes(workload_manifest)"
+        return metrics, artifacts
+    try:
+        public_candidate = [_run_case(select_indexes, case) for case in PUBLIC_CASES]
+        hidden_candidate = [_run_case(select_indexes, case) for case in HIDDEN_CASES]
+        hidden_baseline = [_run_case(baseline_select_indexes, case) for case in HIDDEN_CASES]
+    except Exception:
+        artifacts["error_message"] = traceback.format_exc()
+        return metrics, artifacts
+
+    hidden_avg = sum(float(item["total_runtime_s"]) for item in hidden_candidate) / len(hidden_candidate)
+    baseline_hidden_avg = sum(float(item["total_runtime_s"]) for item in hidden_baseline) / len(hidden_baseline)
+    public_avg = sum(float(item["total_runtime_s"]) for item in public_candidate) / len(public_candidate)
+    if not math.isfinite(hidden_avg) or hidden_avg <= 0:
+        artifacts["error_message"] = "candidate runtime is invalid"
+        return metrics, artifacts
+
+    metrics["valid"] = 1.0
+    metrics["public_avg_runtime_s"] = public_avg
+    metrics["hidden_avg_runtime_s"] = hidden_avg
+    metrics["baseline_hidden_avg_runtime_s"] = baseline_hidden_avg
+    metrics["num_public_cases"] = float(len(PUBLIC_CASES))
+    metrics["num_hidden_cases"] = float(len(HIDDEN_CASES))
+    # Lower runtime is better, so the score is the negated hidden average.
+    metrics["combined_score"] = -hidden_avg
+    return metrics, artifacts
+
+
+def main() -> None:
+    """Command-line entry point: evaluate one program and persist the outcome.
+
+    Writes the metrics as JSON to ``--metrics-out`` (default ``metrics.json``),
+    writes ``artifacts.json`` into the current directory when there are
+    artifacts, and prints the metrics JSON to stdout.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument("program")
+    cli.add_argument("--metrics-out", default="metrics.json")
+    options = cli.parse_args()
+
+    metrics, artifacts = evaluate(options.program)
+    rendered_metrics = json.dumps(metrics, indent=2)
+    Path(options.metrics_out).write_text(rendered_metrics, encoding="utf-8")
+    if artifacts:
+        rendered_artifacts = json.dumps(artifacts, indent=2)
+        Path("artifacts.json").write_text(rendered_artifacts, encoding="utf-8")
+    print(rendered_metrics)
+
+
+# Run the command-line evaluator only when this file is executed as a script.
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/ComputerSystems/DuckDBIndexSelection/verification/requirements.txt b/benchmarks/ComputerSystems/DuckDBIndexSelection/verification/requirements.txt
new file mode 100644
index 00000000..8a6ba6a1
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBIndexSelection/verification/requirements.txt
@@ -0,0 +1 @@
+duckdb
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/README.md b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/README.md
new file mode 100644
index 00000000..d43cb4e0
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/README.md
@@ -0,0 +1,47 @@
+# DuckDB Pre-Aggregation Selection
+
+Choose a small subset of the whitelisted pre-aggregations for a workload family so that the average runtime over the hidden cases is minimized.
+
+## What Changed
+
+- The task now evaluates multiple public and hidden report configurations.
+- The baseline is a heuristic materialization choice, not a null selector.
+- Candidate designs must preserve report semantics across the whole case family.
+
+## What You Edit
+
+- Target file: `scripts/init.py`
+- Entry point: `select_preaggregations(workload_manifest)`
+
+## Source of Truth
+
+- `Task.md`
+- `Task_zh-CN.md`
+- `runtime/problem.py`
+- `baseline/solution.py`
+- `verification/evaluator.py`
+
+## Environment
+
+```bash
+pip install -r frontier_eval/requirements.txt
+pip install -r benchmarks/ComputerSystems/DuckDBPreAggregationSelection/verification/requirements.txt
+```
+
+## Quick Run
+
+```bash
+python benchmarks/ComputerSystems/DuckDBPreAggregationSelection/verification/evaluator.py \
+ benchmarks/ComputerSystems/DuckDBPreAggregationSelection/scripts/init.py \
+ --metrics-out /tmp/DuckDBPreAggregationSelection_metrics.json
+```
+
+## Main Metrics
+
+- `combined_score = -hidden_avg_runtime_s`
+- `valid`
+- `public_avg_runtime_s`
+- `hidden_avg_runtime_s`
+- `baseline_hidden_avg_runtime_s`
+
+
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/README_zh-CN.md b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/README_zh-CN.md
new file mode 100644
index 00000000..d673fa15
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/README_zh-CN.md
@@ -0,0 +1,46 @@
+# DuckDB 预聚合选择
+
+在一组 DuckDB 报表 workload 上选择少量合法预聚合,并尽量降低 hidden case 的平均运行时间。
+
+## 本轮同步后的变化
+
+- 评测已改成多组 public / hidden 报表配置。
+- baseline 现在是启发式物化选择,不再是空选择器。
+- 候选方案必须在整个 case family 上保持报表语义不变。
+
+## 你会改的文件
+
+- 目标文件:`scripts/init.py`
+- 入口函数:`select_preaggregations(workload_manifest)`
+
+## 先看哪里
+
+- `Task.md` / `Task_zh-CN.md`
+- `runtime/problem.py`
+- `baseline/solution.py`
+- `verification/evaluator.py`
+
+## 环境准备
+
+```bash
+pip install -r frontier_eval/requirements.txt
+pip install -r benchmarks/ComputerSystems/DuckDBPreAggregationSelection/verification/requirements.txt
+```
+
+## 快速运行
+
+```bash
+python benchmarks/ComputerSystems/DuckDBPreAggregationSelection/verification/evaluator.py \
+ benchmarks/ComputerSystems/DuckDBPreAggregationSelection/scripts/init.py \
+ --metrics-out /tmp/DuckDBPreAggregationSelection_metrics.json
+```
+
+## 主要指标
+
+- `combined_score = -hidden_avg_runtime_s`
+- `valid`
+- `public_avg_runtime_s`
+- `hidden_avg_runtime_s`
+- `baseline_hidden_avg_runtime_s`
+
+
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/Task.md b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/Task.md
new file mode 100644
index 00000000..494ae0c8
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/Task.md
@@ -0,0 +1,52 @@
+# DuckDB Pre-Aggregation Selection Task
+
+## Problem
+
+Choose a small subset of the whitelisted pre-aggregations for an analytical workload family so that the average runtime over the hidden cases is minimized.
+
+The evaluator now uses multiple public and hidden report configurations rather than one frozen workload. Each case changes segment filters, time windows, top-k settings, or report emphasis. The goal is to pick pre-aggregations that generalize across these report shapes without changing query semantics.
+
+## What Is Frozen
+
+- The local DuckDB schema and data generator in `benchmarks/ComputerSystems/duckdb_local_workload.py`.
+- The whitelist of legal pre-aggregation names and the per-case pre-aggregation budget.
+- The semantics check: every candidate design must preserve the results of the frozen report family.
+
+## Submission Contract
+
+Submit one Python file that defines:
+
+```python
+def select_preaggregations(workload_manifest):
+ ...
+```
+
+Return a list of whitelist pre-aggregation names. A dict with key `preaggregations` is also accepted.
+
+## Evaluation
+
+1. Load case manifests from `PUBLIC_CASES` and `HIDDEN_CASES`.
+2. For each case, call `select_preaggregations(...)` with the case manifest.
+3. Materialize the selected pre-aggregations and verify that report outputs remain unchanged.
+4. Aggregate runtime across cases; scoring uses the hidden-case average.
+
+## Metrics
+
+- `combined_score`: `-hidden_avg_runtime_s`
+- `valid`: `1.0` only if all reports stay semantically correct and all cases run successfully
+- `public_avg_runtime_s`
+- `hidden_avg_runtime_s`
+- `baseline_hidden_avg_runtime_s`
+- `num_public_cases`
+- `num_hidden_cases`
+
+## Invalid Submissions
+
+- `select_preaggregations(...)` is missing or crashes
+- The return value cannot be parsed into a list of names
+- Any selected name is outside the whitelist
+- Any case exceeds its pre-aggregation budget
+- Candidate pre-aggregations change any report result
+- Setup or evaluation fails on any public or hidden case
+
+
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/Task_zh-CN.md b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/Task_zh-CN.md
new file mode 100644
index 00000000..87a6044f
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/Task_zh-CN.md
@@ -0,0 +1,52 @@
+# DuckDB 预聚合选择
+
+## 任务概览
+
+在一组分析型 DuckDB 报表 workload 上,从白名单中选择少量预聚合结构,尽量降低 hidden case 的平均运行时间。
+
+评测不再是单一冻结 workload,而是多组 `public` / `hidden` 报表配置。不同 case 会改变 segment 过滤、时间窗口、top-k 数量或报表重心。目标是在不改变语义的前提下,选择对多组 case 都有帮助的预聚合。
+
+## 哪些部分是冻结的
+
+- `benchmarks/ComputerSystems/duckdb_local_workload.py` 中的本地 DuckDB schema 与数据生成逻辑。
+- 合法预聚合名称白名单,以及每个 case 的预聚合预算上限。
+- 固定的语义校验:候选预聚合不能改变冻结报表族的输出结果。
+
+## 提交接口
+
+提交一个 Python 文件,定义:
+
+```python
+def select_preaggregations(workload_manifest):
+ ...
+```
+
+返回预聚合名称列表;也接受带 `preaggregations` 字段的字典。
+
+## 评测流程
+
+1. 从 `PUBLIC_CASES` 与 `HIDDEN_CASES` 载入 case manifest。
+2. 对每个 case,把该 case 的 manifest 传给 `select_preaggregations(...)`。
+3. 物化所选预聚合,并验证报表输出语义不变。
+4. 聚合不同 case 的运行时间;最终分数使用 hidden case 平均耗时。
+
+## 指标
+
+- `combined_score`:`-hidden_avg_runtime_s`
+- `valid`:只有所有报表语义正确且所有 case 都成功运行时才为 `1.0`
+- `public_avg_runtime_s`
+- `hidden_avg_runtime_s`
+- `baseline_hidden_avg_runtime_s`
+- `num_public_cases`
+- `num_hidden_cases`
+
+## 判为无效的情况
+
+- 缺少 `select_preaggregations(...)`,或函数执行报错
+- 返回值无法解析为名称列表
+- 任意名称不在白名单中
+- 任意 case 超过预聚合预算
+- 任意预聚合方案改变了报表结果
+- 任意 public 或 hidden case 在构建或评测时失败
+
+
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/baseline/solution.py b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/baseline/solution.py
new file mode 100644
index 00000000..a16fa8cd
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/baseline/solution.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+
+def select_preaggregations(workload_manifest):
+    """Heuristic baseline: rank pre-aggregations from manifest hints and cut to the budget.
+
+    Ranking rules:
+      * ``agg_quarter_segment_revenue`` is always a candidate.
+      * ``agg_customer_year_revenue`` is ranked first when ``limit_rows`` is at most 60.
+      * ``agg_month_shipmode_revenue`` is ranked last when ``min_shipdate``
+        contains "1998" or the budget allows at least two pre-aggregations.
+
+    Returns:
+        The ranked names truncated to ``max_preaggregations`` (default 1).
+    """
+    budget = int(workload_manifest.get("max_preaggregations", 1))
+    top_k = int(workload_manifest.get("limit_rows", 100))
+    ship_floor = str(workload_manifest.get("min_shipdate", "1997-01-01"))
+
+    ranked = []
+    if top_k <= 60:
+        ranked.append("agg_customer_year_revenue")
+    ranked.append("agg_quarter_segment_revenue")
+    if "1998" in ship_floor or budget >= 2:
+        ranked.append("agg_month_shipmode_revenue")
+
+    # dict keys keep insertion order, so this removes repeats without reordering.
+    unique_ranked = list(dict.fromkeys(ranked))
+    return unique_ranked[:budget]
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/agent_files.txt b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/agent_files.txt
new file mode 100644
index 00000000..1d2eb069
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/agent_files.txt
@@ -0,0 +1,6 @@
+Task.md
+Task_zh-CN.md
+README.md
+baseline/solution.py
+runtime/problem.py
+references/source_manifest.md
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/candidate_destination.txt b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/candidate_destination.txt
new file mode 100644
index 00000000..b9411b3d
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/candidate_destination.txt
@@ -0,0 +1 @@
+scripts/init.py
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/constraints.txt b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/constraints.txt
new file mode 100644
index 00000000..88b1935c
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/constraints.txt
@@ -0,0 +1,4 @@
+Edit only `scripts/init.py`.
+Modify only code between `# EVOLVE-BLOCK-START` and `# EVOLVE-BLOCK-END` in that file.
+Do not modify files under `baseline/`, `runtime/`, `references/`, or `verification/`.
+Keep outputs valid and finite.
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/eval_command.txt b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/eval_command.txt
new file mode 100644
index 00000000..fcba5e60
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/eval_command.txt
@@ -0,0 +1 @@
+{python} verification/evaluator.py {candidate} --metrics-out metrics.json
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/eval_cwd.txt b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/eval_cwd.txt
new file mode 100644
index 00000000..9c558e35
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/eval_cwd.txt
@@ -0,0 +1 @@
+.
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/initial_program.txt b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/initial_program.txt
new file mode 100644
index 00000000..b9411b3d
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/initial_program.txt
@@ -0,0 +1 @@
+scripts/init.py
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/readonly_files.txt b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/readonly_files.txt
new file mode 100644
index 00000000..8bb37291
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/frontier_eval/readonly_files.txt
@@ -0,0 +1,5 @@
+baseline/solution.py
+runtime/problem.py
+runtime/duckdb_local_workload.py
+verification/evaluator.py
+references/source_manifest.md
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/references/source_manifest.md b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/references/source_manifest.md
new file mode 100644
index 00000000..36093907
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/references/source_manifest.md
@@ -0,0 +1,10 @@
+# Source Manifest
+
+- Upstream engine: `DuckDB`
+- Upstream lineage:
+ - DuckDB benchmark and TPC-H documentation
+ - DuckDB SQL execution on analytical reporting queries
+- Schema lineage: this benchmark uses a local frozen relational workload with `customer`, `orders`, and `lineitem` tables modeled after the TPC-H schema family.
+- Data provenance: rows are generated deterministically inside DuckDB from fixed SQL formulas and a fixed schema; this is a benchmark-local synthetic dataset, not official TPC-H `dbgen` output.
+- Authenticity note: the reporting queries and schema family are traceable to official analytical benchmark patterns, while the candidate pre-aggregations are benchmark-local frozen physical-design options.
+- License lineage: DuckDB is released under the MIT License.
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/runtime/duckdb_local_workload.py b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/runtime/duckdb_local_workload.py
new file mode 100644
index 00000000..cb0da163
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/runtime/duckdb_local_workload.py
@@ -0,0 +1,391 @@
+from __future__ import annotations
+
+import math
+import time
+from typing import Any
+
+import duckdb
+
+
+# Row counts of the three synthetic tables created by build_connection().
+CUSTOMER_COUNT = 20_000
+ORDER_COUNT = 120_000
+LINEITEM_COUNT = 600_000
+
+# 300 deterministic lookup keys per table, spread with a fixed multiplicative
+# stride; the index workload probes a prefix of each tuple (see _index_keys).
+CUSTOMER_KEYS = tuple(1 + ((i * 97) % CUSTOMER_COUNT) for i in range(1, 301))
+ORDER_KEYS = tuple(1 + ((i * 193) % ORDER_COUNT) for i in range(1, 301))
+
+
+# Whitelist of index names, each mapped to the CREATE INDEX statement that is
+# executed when the name is selected.
+INDEX_CANDIDATES = {
+ "idx_orders_cust": "CREATE INDEX idx_orders_cust ON orders(o_custkey)",
+ "idx_orders_date": "CREATE INDEX idx_orders_date ON orders(o_orderdate)",
+ "idx_lineitem_order": "CREATE INDEX idx_lineitem_order ON lineitem(l_orderkey)",
+ "idx_customer_segment": "CREATE INDEX idx_customer_segment ON customer(c_mktsegment)",
+ "idx_orders_priority": "CREATE INDEX idx_orders_priority ON orders(o_orderpriority)",
+}
+
+# Whitelist of pre-aggregation names, each mapped to the CREATE TABLE AS
+# statement that materializes it. The first three correspond to the report
+# queries in this module; agg_unused_priority_only is not read by any of them.
+PREAGGREGATION_CANDIDATES = {
+ "agg_quarter_segment_revenue": (
+ "CREATE TABLE agg_quarter_segment_revenue AS "
+ "SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket, "
+ " c.c_mktsegment AS segment, "
+ " sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+ "FROM customer c "
+ "JOIN orders o ON o.o_custkey = c.c_custkey "
+ "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+ "GROUP BY 1, 2"
+ ),
+ "agg_month_shipmode_revenue": (
+ "CREATE TABLE agg_month_shipmode_revenue AS "
+ "SELECT date_trunc('month', l.l_shipdate) AS month_bucket, "
+ " l.l_shipmode AS shipmode, "
+ " sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+ "FROM lineitem l "
+ "GROUP BY 1, 2"
+ ),
+ "agg_customer_year_revenue": (
+ "CREATE TABLE agg_customer_year_revenue AS "
+ "SELECT year(o.o_orderdate) AS revenue_year, "
+ " c.c_custkey, "
+ " sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+ "FROM customer c "
+ "JOIN orders o ON o.o_custkey = c.c_custkey "
+ "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+ "GROUP BY 1, 2"
+ ),
+ "agg_unused_priority_only": (
+ "CREATE TABLE agg_unused_priority_only AS "
+ "SELECT o.o_orderpriority, count(*) AS order_count "
+ "FROM orders o "
+ "GROUP BY 1"
+ ),
+}
+
+
+def build_connection() -> duckdb.DuckDBPyConnection:
+ """Create a fresh in-memory DuckDB database holding the synthetic workload tables.
+
+ The connection is restricted to a single thread and populated with the
+ `customer`, `orders` and `lineitem` tables. Every column is computed from
+ the row number with fixed arithmetic, so each call produces the same data.
+
+ Returns:
+ The open connection; nothing in this function closes it.
+ """
+ con = duckdb.connect(database=":memory:")
+ # Single-threaded execution for every query on this connection.
+ con.execute("PRAGMA threads=1")
+ # customer: one row per key in 1..CUSTOMER_COUNT; segment cycles over five values.
+ con.execute(
+ f"""
+ CREATE TABLE customer AS
+ SELECT i AS c_custkey,
+ 'Customer #' || i AS c_name,
+ CASE i % 5
+ WHEN 0 THEN 'BUILDING'
+ WHEN 1 THEN 'AUTOMOBILE'
+ WHEN 2 THEN 'HOUSEHOLD'
+ WHEN 3 THEN 'FURNITURE'
+ ELSE 'MACHINERY'
+ END AS c_mktsegment,
+ i % 25 AS c_nationkey
+ FROM range(1, {CUSTOMER_COUNT + 1}) t(i)
+ """
+ )
+ # orders: ORDER_COUNT rows; order dates fall within 1460 days starting 1995-01-01.
+ con.execute(
+ f"""
+ CREATE TABLE orders AS
+ SELECT i AS o_orderkey,
+ 1 + ((i * 17) % {CUSTOMER_COUNT}) AS o_custkey,
+ DATE '1995-01-01' + (((i * 13) % 1460) * INTERVAL 1 DAY) AS o_orderdate,
+ 100 + (((i * 37) % 100000) / 10.0) AS o_totalprice,
+ CASE i % 5
+ WHEN 0 THEN '1-URGENT'
+ WHEN 1 THEN '2-HIGH'
+ WHEN 2 THEN '3-MEDIUM'
+ WHEN 3 THEN '4-NOT SPECIFIED'
+ ELSE '5-LOW'
+ END AS o_orderpriority
+ FROM range(1, {ORDER_COUNT + 1}) t(i)
+ """
+ )
+ # lineitem: LINEITEM_COUNT rows; ship dates fall within the same 1460-day window.
+ con.execute(
+ f"""
+ CREATE TABLE lineitem AS
+ SELECT i AS l_lineitemkey,
+ 1 + ((i * 7) % {ORDER_COUNT}) AS l_orderkey,
+ 1 + ((i * 11) % 50000) AS l_partkey,
+ 1 + ((i * 13) % 10000) AS l_suppkey,
+ 1 + ((i * 5) % 50) AS l_quantity,
+ 10 + (((i * 19) % 100000) / 20.0) AS l_extendedprice,
+ (((i * 3) % 10) / 100.0) AS l_discount,
+ DATE '1995-01-01' + (((i * 29) % 1460) * INTERVAL 1 DAY) AS l_shipdate,
+ CASE i % 5
+ WHEN 0 THEN 'AIR'
+ WHEN 1 THEN 'MAIL'
+ WHEN 2 THEN 'RAIL'
+ WHEN 3 THEN 'TRUCK'
+ ELSE 'SHIP'
+ END AS l_shipmode
+ FROM range(1, {LINEITEM_COUNT + 1}) t(i)
+ """
+ )
+ return con
+
+
+def normalize_name_list(value: Any, key: str) -> list[str]:
+    """Turn a selector's return value into an ordered list of unique names.
+
+    Args:
+        value: A list/tuple of names, or a dict holding such a sequence under ``key``.
+        key: Dict key to unwrap; also used in error messages.
+
+    Returns:
+        Each item converted with ``str`` and de-duplicated, keeping first occurrences.
+
+    Raises:
+        ValueError: If a dict lacks ``key`` or the names are not a list or tuple.
+    """
+    names = value
+    if isinstance(names, dict):
+        if key not in names:
+            raise ValueError(f"missing {key}")
+        names = names[key]
+    if not isinstance(names, (list, tuple)):
+        raise ValueError(f"{key} must be a list or tuple")
+    # dict keys keep insertion order, so fromkeys drops repeats without reordering.
+    return list(dict.fromkeys(str(item) for item in names))
+
+
+def compare_results(lhs: list[tuple[Any, ...]], rhs: list[tuple[Any, ...]], tol: float = 1e-6) -> bool:
+    """Compare two query results row by row, in order.
+
+    Args:
+        lhs: Rows of the first result.
+        rhs: Rows of the second result.
+        tol: Maximum absolute difference allowed when either value of a pair
+            is a float.
+
+    Returns:
+        True when both results have the same shape and every pair of values
+        matches. Pairs involving a float are compared numerically within
+        ``tol`` and must both be finite; all other pairs must be equal.
+    """
+    if len(lhs) != len(rhs):
+        return False
+    for left_row, right_row in zip(lhs, rhs):
+        if len(left_row) != len(right_row):
+            return False
+        for left_value, right_value in zip(left_row, right_row):
+            if isinstance(left_value, float) or isinstance(right_value, float):
+                try:
+                    left_number = float(left_value)
+                    right_number = float(right_value)
+                except (TypeError, ValueError, OverflowError):
+                    # A NULL or non-numeric value paired with a float is a
+                    # mismatch; report it instead of raising.
+                    return False
+                if not (math.isfinite(left_number) and math.isfinite(right_number)):
+                    return False
+                if abs(left_number - right_number) > tol:
+                    return False
+            elif left_value != right_value:
+                return False
+    return True
+
+
+def _index_keys(sample_size: int, source: tuple[int, ...]) -> tuple[int, ...]:
+    """Return a prefix of ``source`` whose length is ``sample_size`` clamped to ``[1, len(source)]``."""
+    count = min(len(source), int(sample_size))
+    if count < 1:
+        count = 1
+    return tuple(source[:count])
+
+
+def run_index_workload(con: duckdb.DuckDBPyConnection, manifest: dict[str, Any]) -> float:
+    """Run the three point-lookup query families once and return the elapsed seconds.
+
+    The manifest controls how many keys each family probes
+    (``customer_sample``, ``order_sample``, ``urgent_customer_sample``) and the
+    filter values (``min_order_date``, ``priority_value``). The timer starts
+    before the manifest is read, so that bookkeeping is part of the result.
+    """
+    start_time = time.perf_counter()
+    customer_probe = _index_keys(manifest.get("customer_sample", 80), CUSTOMER_KEYS)
+    order_probe = _index_keys(manifest.get("order_sample", 80), ORDER_KEYS)
+    urgent_probe = _index_keys(manifest.get("urgent_customer_sample", 40), CUSTOMER_KEYS)
+    date_floor = str(manifest.get("min_order_date", "1997-01-01"))
+    priority = str(manifest.get("priority_value", "1-URGENT"))
+
+    # Family 1: order totals for one customer from a given date onward.
+    customer_orders_sql = (
+        "SELECT sum(o_totalprice) "
+        "FROM orders "
+        "WHERE o_custkey = ? AND o_orderdate >= CAST(? AS DATE)"
+    )
+    for key in customer_probe:
+        con.execute(customer_orders_sql, [key, date_floor]).fetchone()
+
+    # Family 2: discounted revenue of one order's line items.
+    order_revenue_sql = (
+        "SELECT sum(l_extendedprice * (1 - l_discount)) "
+        "FROM lineitem "
+        "WHERE l_orderkey = ?"
+    )
+    for key in order_probe:
+        con.execute(order_revenue_sql, [key]).fetchone()
+
+    # Family 3: number of one customer's orders with the given priority.
+    priority_count_sql = (
+        "SELECT count(*) "
+        "FROM customer c "
+        "JOIN orders o ON c.c_custkey = o.o_custkey "
+        "WHERE c.c_custkey = ? AND o.o_orderpriority = ?"
+    )
+    for key in urgent_probe:
+        con.execute(priority_count_sql, [key, priority]).fetchone()
+
+    return time.perf_counter() - start_time
+
+
+def measure_index_design(selected_indexes: list[str], manifest: dict[str, Any]) -> dict[str, float | int]:
+    """Build the selected indexes on a fresh database and time the index workload.
+
+    Args:
+        selected_indexes: Names to create; each must be listed in the
+            manifest's ``candidate_indexes`` (default: every known candidate).
+        manifest: Case manifest. ``max_indexes`` caps the selection size and
+            ``repetitions`` (default 3) sets the number of timed workload runs.
+
+    Returns:
+        A dict with ``setup_runtime_s`` (index creation), ``workload_runtime_s``
+        (sum of the timed runs), ``total_runtime_s`` (their sum) and
+        ``selected_index_count``.
+
+    Raises:
+        ValueError: If a name is not allowed or the selection exceeds the budget.
+    """
+    allowed = tuple(manifest.get("candidate_indexes", tuple(sorted(INDEX_CANDIDATES))))
+    max_indexes = int(manifest.get("max_indexes", len(allowed)))
+    unknown = [name for name in selected_indexes if name not in allowed]
+    if unknown:
+        raise ValueError(f"unknown index names: {unknown}")
+    if len(selected_indexes) > max_indexes:
+        raise ValueError(f"too many indexes selected: {len(selected_indexes)} > {max_indexes}")
+    repetitions = int(manifest.get("repetitions", 3))
+
+    # The context manager closes the connection even if index creation or a
+    # query fails, so each evaluated case releases its in-memory tables.
+    with build_connection() as con:
+        start_setup = time.perf_counter()
+        for name in selected_indexes:
+            con.execute(INDEX_CANDIDATES[name])
+        setup_runtime = time.perf_counter() - start_setup
+
+        # One untimed pass before the measured repetitions.
+        run_index_workload(con, manifest)
+        workload_runtime = 0.0
+        for _ in range(repetitions):
+            workload_runtime += run_index_workload(con, manifest)
+
+    return {
+        "setup_runtime_s": float(setup_runtime),
+        "workload_runtime_s": float(workload_runtime),
+        "total_runtime_s": float(setup_runtime + workload_runtime),
+        "selected_index_count": len(selected_indexes),
+    }
+
+
+def _report_quarter_segment(con: duckdb.DuckDBPyConnection, use_aggregate: bool, segment_filter: tuple[str, ...]) -> list[tuple[Any, ...]]:
+    """Return revenue per (quarter, market segment) for the requested segments.
+
+    With ``use_aggregate`` the rows are read from ``agg_quarter_segment_revenue``;
+    otherwise they are computed from the three-table join. The segment values
+    are spliced into the SQL text as quoted literals.
+    NOTE(review): values are not escaped and an empty filter yields ``IN ()``;
+    presumably only trusted, non-empty manifest values reach here, so confirm.
+    """
+    segment_literals = ", ".join(f"'{value}'" for value in segment_filter)
+    if use_aggregate:
+        query = (
+            "SELECT quarter_bucket, segment, revenue "
+            "FROM agg_quarter_segment_revenue "
+            f"WHERE segment IN ({segment_literals}) "
+            "ORDER BY quarter_bucket, segment"
+        )
+    else:
+        query = (
+            "SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket, "
+            " c.c_mktsegment AS segment, "
+            " sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+            "FROM customer c "
+            "JOIN orders o ON o.o_custkey = c.c_custkey "
+            "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+            f"WHERE c.c_mktsegment IN ({segment_literals}) "
+            "GROUP BY 1, 2 "
+            "ORDER BY quarter_bucket, segment"
+        )
+    return con.execute(query).fetchall()
+
+
+def _report_month_shipmode(con: duckdb.DuckDBPyConnection, use_aggregate: bool, min_shipdate: str) -> list[tuple[Any, ...]]:
+    """Return revenue per (month, ship mode) from ``min_shipdate`` onward.
+
+    With ``use_aggregate`` the rows are read from ``agg_month_shipmode_revenue``
+    and filtered on the month bucket; otherwise line items are filtered on
+    their ship date before grouping.
+    NOTE(review): the two paths filter at different granularity, so they
+    presumably agree only when ``min_shipdate`` is the first day of a month;
+    confirm before adding cases with mid-month dates.
+    """
+    if use_aggregate:
+        query = (
+            "SELECT month_bucket, shipmode, revenue "
+            "FROM agg_month_shipmode_revenue "
+            "WHERE month_bucket >= CAST(? AS DATE) "
+            "ORDER BY month_bucket, shipmode"
+        )
+    else:
+        query = (
+            "SELECT date_trunc('month', l.l_shipdate) AS month_bucket, "
+            " l.l_shipmode AS shipmode, "
+            " sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+            "FROM lineitem l "
+            "WHERE l.l_shipdate >= CAST(? AS DATE) "
+            "GROUP BY 1, 2 "
+            "ORDER BY month_bucket, shipmode"
+        )
+    return con.execute(query, [min_shipdate]).fetchall()
+
+
+def _report_customer_year(con: duckdb.DuckDBPyConnection, use_aggregate: bool, revenue_year: int, limit_rows: int) -> list[tuple[Any, ...]]:
+    """Return the top ``limit_rows`` customers by revenue for ``revenue_year``.
+
+    Rows are ordered by revenue descending, then customer key. With
+    ``use_aggregate`` they are read from ``agg_customer_year_revenue``;
+    otherwise they are computed from the three-table join.
+    """
+    if use_aggregate:
+        query = (
+            "SELECT revenue_year, c_custkey, revenue "
+            "FROM agg_customer_year_revenue "
+            "WHERE revenue_year = ? "
+            "ORDER BY revenue DESC, c_custkey "
+            "LIMIT ?"
+        )
+    else:
+        query = (
+            "SELECT year(o.o_orderdate) AS revenue_year, "
+            " c.c_custkey, "
+            " sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+            "FROM customer c "
+            "JOIN orders o ON o.o_custkey = c.c_custkey "
+            "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+            "GROUP BY 1, 2 "
+            "HAVING year(o.o_orderdate) = ? "
+            "ORDER BY revenue DESC, c.c_custkey "
+            "LIMIT ?"
+        )
+    parameters = [revenue_year, limit_rows]
+    return con.execute(query, parameters).fetchall()
+
+
+def _run_preaggregation_reports(
+    con: duckdb.DuckDBPyConnection,
+    selected: set[str],
+    manifest: dict[str, Any],
+) -> tuple[float, tuple[list[tuple[Any, ...]], ...]]:
+    """Run the three reports once and return ``(elapsed_seconds, results)``.
+
+    A report reads its pre-aggregated table only when the matching name is in
+    ``selected``; otherwise it runs against the base tables. ``results`` holds
+    the quarter/segment, month/ship-mode and customer/year rows, in that order.
+    """
+    started = time.perf_counter()
+
+    segments = tuple(manifest.get("segment_filter", ("BUILDING", "AUTOMOBILE", "HOUSEHOLD")))
+    quarter_rows = _report_quarter_segment(con, "agg_quarter_segment_revenue" in selected, segments)
+
+    ship_floor = str(manifest.get("min_shipdate", "1997-01-01"))
+    month_rows = _report_month_shipmode(con, "agg_month_shipmode_revenue" in selected, ship_floor)
+
+    year = int(manifest.get("revenue_year", 1998))
+    top_k = int(manifest.get("limit_rows", 100))
+    customer_rows = _report_customer_year(con, "agg_customer_year_revenue" in selected, year, top_k)
+
+    elapsed = time.perf_counter() - started
+    return elapsed, (quarter_rows, month_rows, customer_rows)
+
+
+def measure_preaggregation_design(selected_preaggregations: list[str], manifest: dict[str, Any]) -> dict[str, float | int]:
+    """Materialize the selected pre-aggregations, verify the reports, and time them.
+
+    Args:
+        selected_preaggregations: Names to materialize; each must be listed in
+            the manifest's ``candidate_preaggregations`` (default: every known
+            candidate).
+        manifest: Case manifest. ``max_preaggregations`` caps the selection
+            size and ``repetitions`` (default 3) sets the number of timed runs.
+
+    Returns:
+        A dict with ``setup_runtime_s`` (materialization),
+        ``candidate_workload_runtime_s``, ``candidate_total_runtime_s`` (setup
+        plus workload), ``baseline_total_runtime_s`` (reports without any
+        pre-aggregation) and ``selected_preaggregation_count``.
+
+    Raises:
+        ValueError: If a name is not allowed, the selection exceeds the
+            budget, or any report result differs from the baseline.
+    """
+    allowed = tuple(manifest.get("candidate_preaggregations", tuple(sorted(PREAGGREGATION_CANDIDATES))))
+    max_preaggregations = int(manifest.get("max_preaggregations", len(allowed)))
+    unknown = [name for name in selected_preaggregations if name not in allowed]
+    if unknown:
+        raise ValueError(f"unknown pre-aggregation names: {unknown}")
+    if len(selected_preaggregations) > max_preaggregations:
+        raise ValueError(
+            f"too many pre-aggregations selected: {len(selected_preaggregations)} > {max_preaggregations}"
+        )
+    selected = set(selected_preaggregations)
+    repetitions = int(manifest.get("repetitions", 3))
+
+    # The context managers close both connections even when validation
+    # raises, so each evaluated case releases its in-memory tables.
+    with build_connection() as baseline_con, build_connection() as candidate_con:
+        start_setup = time.perf_counter()
+        for name in selected_preaggregations:
+            candidate_con.execute(PREAGGREGATION_CANDIDATES[name])
+        setup_runtime = time.perf_counter() - start_setup
+
+        # Semantics check: every report must match the un-aggregated baseline.
+        _, baseline_results = _run_preaggregation_reports(baseline_con, set(), manifest)
+        _, candidate_results = _run_preaggregation_reports(candidate_con, selected, manifest)
+        if any(not compare_results(left, right) for left, right in zip(candidate_results, baseline_results)):
+            raise ValueError("candidate pre-aggregation selection changed the query results")
+
+        repeated_baseline_runtime = 0.0
+        repeated_candidate_runtime = 0.0
+        # One untimed pass per side, then interleaved timed repetitions.
+        _run_preaggregation_reports(baseline_con, set(), manifest)
+        _run_preaggregation_reports(candidate_con, selected, manifest)
+        for _ in range(repetitions):
+            extra_runtime, _ = _run_preaggregation_reports(baseline_con, set(), manifest)
+            repeated_baseline_runtime += extra_runtime
+            extra_runtime, _ = _run_preaggregation_reports(candidate_con, selected, manifest)
+            repeated_candidate_runtime += extra_runtime
+
+    return {
+        "setup_runtime_s": float(setup_runtime),
+        "candidate_workload_runtime_s": float(repeated_candidate_runtime),
+        "candidate_total_runtime_s": float(setup_runtime + repeated_candidate_runtime),
+        "baseline_total_runtime_s": float(repeated_baseline_runtime),
+        "selected_preaggregation_count": len(selected_preaggregations),
+    }
+
+
+def measure_query_rewrite(sql: str, manifest: dict[str, Any]) -> dict[str, Any]:
+    """Check a rewritten query against the baseline query and time both.
+
+    Args:
+        sql: Candidate query text; surrounding whitespace is stripped.
+        manifest: Must contain ``baseline_sql``; ``repetitions`` (default 3)
+            sets how many timed executions each query gets.
+
+    Returns:
+        A dict with ``baseline_runtime_s`` and ``candidate_runtime_s`` (each
+        the wall-clock time of all timed repetitions) and ``row_count`` (rows
+        returned by the candidate).
+
+    Raises:
+        ValueError: If the candidate query is empty or its rows differ from
+            the baseline rows according to ``compare_results``.
+        KeyError: If ``manifest`` has no ``baseline_sql`` entry.
+    """
+    sql = str(sql).strip()
+    if not sql:
+        raise ValueError("query must not be empty")
+    baseline_sql = str(manifest["baseline_sql"]).strip()
+    repetitions = int(manifest.get("repetitions", 3))
+
+    # Baseline and candidate each run on their own database. The context
+    # managers close both connections even when validation raises, so the
+    # in-memory tables are released after every call.
+    with build_connection() as baseline_con, build_connection() as candidate_con:
+        baseline_rows = baseline_con.execute(baseline_sql).fetchall()
+        candidate_rows = candidate_con.execute(sql).fetchall()
+        if not compare_results(candidate_rows, baseline_rows):
+            raise ValueError("candidate query result does not match the baseline result")
+
+        # One untimed execution per side before the timed loop.
+        baseline_con.execute(baseline_sql).fetchall()
+        baseline_start = time.perf_counter()
+        for _ in range(repetitions):
+            baseline_con.execute(baseline_sql).fetchall()
+        baseline_runtime = time.perf_counter() - baseline_start
+
+        candidate_con.execute(sql).fetchall()
+        candidate_start = time.perf_counter()
+        for _ in range(repetitions):
+            candidate_rows = candidate_con.execute(sql).fetchall()
+        candidate_runtime = time.perf_counter() - candidate_start
+
+    return {
+        "baseline_runtime_s": float(baseline_runtime),
+        "candidate_runtime_s": float(candidate_runtime),
+        "row_count": len(candidate_rows),
+    }
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/runtime/problem.py b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/runtime/problem.py
new file mode 100644
index 00000000..85fab5df
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/runtime/problem.py
@@ -0,0 +1,104 @@
+from __future__ import annotations
+
+try:
+ from .duckdb_local_workload import PREAGGREGATION_CANDIDATES, measure_preaggregation_design, normalize_name_list
+except ImportError:
+ from benchmarks.ComputerSystems.duckdb_local_workload import PREAGGREGATION_CANDIDATES, measure_preaggregation_design, normalize_name_list
+
+
+PUBLIC_CASES = (
+ {
+ "case_id": "public_all_reports",
+ "candidate_preaggregations": tuple(sorted(PREAGGREGATION_CANDIDATES)),
+ "max_preaggregations": 2,
+ "segment_filter": ("BUILDING", "AUTOMOBILE", "HOUSEHOLD"),
+ "min_shipdate": "1997-01-01",
+ "revenue_year": 1998,
+ "limit_rows": 100,
+ "repetitions": 3,
+ },
+ {
+ "case_id": "public_focus_segments",
+ "candidate_preaggregations": tuple(sorted(PREAGGREGATION_CANDIDATES)),
+ "max_preaggregations": 1,
+ "segment_filter": ("BUILDING", "AUTOMOBILE"),
+ "min_shipdate": "1998-01-01",
+ "revenue_year": 1997,
+ "limit_rows": 50,
+ "repetitions": 3,
+ },
+ {
+ "case_id": "public_long_horizon",
+ "candidate_preaggregations": tuple(sorted(PREAGGREGATION_CANDIDATES)),
+ "max_preaggregations": 2,
+ "segment_filter": ("HOUSEHOLD", "FURNITURE", "MACHINERY"),
+ "min_shipdate": "1996-01-01",
+ "revenue_year": 1996,
+ "limit_rows": 75,
+ "repetitions": 2,
+ },
+)
+
+HIDDEN_CASES = (
+ {
+ "case_id": "hidden_shipmode_recent",
+ "candidate_preaggregations": tuple(sorted(PREAGGREGATION_CANDIDATES)),
+ "max_preaggregations": 2,
+ "segment_filter": ("BUILDING", "HOUSEHOLD"),
+ "min_shipdate": "1998-06-01",
+ "revenue_year": 1998,
+ "limit_rows": 40,
+ "repetitions": 3,
+ },
+ {
+ "case_id": "hidden_segment_mix",
+ "candidate_preaggregations": tuple(sorted(PREAGGREGATION_CANDIDATES)),
+ "max_preaggregations": 2,
+ "segment_filter": ("AUTOMOBILE", "FURNITURE"),
+ "min_shipdate": "1997-04-01",
+ "revenue_year": 1997,
+ "limit_rows": 60,
+ "repetitions": 2,
+ },
+ {
+ "case_id": "hidden_customer_topn",
+ "candidate_preaggregations": tuple(sorted(PREAGGREGATION_CANDIDATES)),
+ "max_preaggregations": 1,
+ "segment_filter": ("BUILDING", "AUTOMOBILE", "HOUSEHOLD"),
+ "min_shipdate": "1997-01-01",
+ "revenue_year": 1998,
+ "limit_rows": 25,
+ "repetitions": 3,
+ },
+ {
+ "case_id": "hidden_wide_reports",
+ "candidate_preaggregations": tuple(sorted(PREAGGREGATION_CANDIDATES)),
+ "max_preaggregations": 3,
+ "segment_filter": ("BUILDING", "AUTOMOBILE", "HOUSEHOLD", "FURNITURE"),
+ "min_shipdate": "1995-01-01",
+ "revenue_year": 1995,
+ "limit_rows": 90,
+ "repetitions": 2,
+ },
+ {
+ "case_id": "hidden_narrow_reports",
+ "candidate_preaggregations": tuple(sorted(PREAGGREGATION_CANDIDATES)),
+ "max_preaggregations": 1,
+ "segment_filter": ("MACHINERY",),
+ "min_shipdate": "1998-01-01",
+ "revenue_year": 1998,
+ "limit_rows": 20,
+ "repetitions": 3,
+ },
+)
+
+WORKLOAD_MANIFEST = dict(PUBLIC_CASES[0])
+
+
+def load_instance():
+ return dict(WORKLOAD_MANIFEST)
+
+
+def evaluate_selection(selection, manifest: dict | None = None):
+ manifest = WORKLOAD_MANIFEST if manifest is None else dict(manifest)
+ return measure_preaggregation_design(normalize_name_list(selection, "preaggregations"), manifest)
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/scripts/init.py b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/scripts/init.py
new file mode 100644
index 00000000..93cf2a4d
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/scripts/init.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+
+def _is_repo_root(path: Path) -> bool:
+ return (path / "benchmarks").is_dir() and (path / "frontier_eval").is_dir()
+
+
+def _ensure_import_path() -> None:
+ here = Path(__file__).resolve()
+ for parent in [here.parent, *here.parents]:
+ if _is_repo_root(parent):
+ ps = str(parent)
+ if ps not in sys.path:
+ sys.path.insert(0, ps)
+ return
+ benchmark_root = here.parents[1]
+ ps = str(benchmark_root)
+ if ps not in sys.path:
+ sys.path.insert(0, ps)
+
+
+_ensure_import_path()
+
+try:
+ from benchmarks.ComputerSystems.DuckDBPreAggregationSelection.baseline.solution import select_preaggregations as _baseline_select_preaggregations
+ from benchmarks.ComputerSystems.DuckDBPreAggregationSelection.runtime.problem import WORKLOAD_MANIFEST, evaluate_selection
+except ModuleNotFoundError:
+ from baseline.solution import select_preaggregations as _baseline_select_preaggregations
+ from runtime.problem import WORKLOAD_MANIFEST, evaluate_selection
+
+
+# EVOLVE-BLOCK-START
+def select_preaggregations(workload_manifest):
+ return _baseline_select_preaggregations(workload_manifest)
+# EVOLVE-BLOCK-END
+
+
+if __name__ == "__main__":
+ print(evaluate_selection(select_preaggregations(WORKLOAD_MANIFEST)))
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/verification/evaluator.py b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/verification/evaluator.py
new file mode 100644
index 00000000..7ec23cb9
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/verification/evaluator.py
@@ -0,0 +1,107 @@
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import runpy
+import traceback
+from pathlib import Path
+
+
+def _repo_root() -> Path:
+ here = Path(__file__).resolve()
+ for parent in [here.parent, *here.parents]:
+ if (parent / "benchmarks").is_dir() and (parent / "frontier_eval").is_dir():
+ return parent
+ return Path.cwd().resolve()
+
+
+def _benchmark_root() -> Path:
+ return Path(__file__).resolve().parents[1]
+
+
+def _ensure_import_path() -> None:
+ import sys
+
+ for p in (_repo_root(), _benchmark_root()):
+ ps = str(p)
+ if ps not in sys.path:
+ sys.path.insert(0, ps)
+
+
+_ensure_import_path()
+
+try:
+ from benchmarks.ComputerSystems.DuckDBPreAggregationSelection.baseline.solution import (
+ select_preaggregations as baseline_select_preaggregations,
+ )
+ from benchmarks.ComputerSystems.DuckDBPreAggregationSelection.runtime.problem import (
+ HIDDEN_CASES,
+ PUBLIC_CASES,
+ evaluate_selection,
+ )
+except ModuleNotFoundError:
+ from baseline.solution import select_preaggregations as baseline_select_preaggregations
+ from runtime.problem import HIDDEN_CASES, PUBLIC_CASES, evaluate_selection
+
+
+def _run_case(select_preaggregations, case):
+ selection = select_preaggregations(dict(case))
+ return evaluate_selection(selection, case)
+
+
+def evaluate(program_path: str):
+ metrics = {
+ "combined_score": -1e18,
+ "valid": 0.0,
+ "public_avg_runtime_s": 0.0,
+ "hidden_avg_runtime_s": 0.0,
+ "baseline_hidden_avg_runtime_s": 0.0,
+ "num_public_cases": 0.0,
+ "num_hidden_cases": 0.0,
+ }
+ artifacts = {}
+ namespace = runpy.run_path(str(Path(program_path).expanduser().resolve()), run_name="candidate_program")
+ select_preaggregations = namespace.get("select_preaggregations")
+ if not callable(select_preaggregations):
+ artifacts["error_message"] = "candidate must define select_preaggregations(workload_manifest)"
+ return metrics, artifacts
+ try:
+ public_candidate = [_run_case(select_preaggregations, case) for case in PUBLIC_CASES]
+ hidden_candidate = [_run_case(select_preaggregations, case) for case in HIDDEN_CASES]
+ hidden_baseline = [_run_case(baseline_select_preaggregations, case) for case in HIDDEN_CASES]
+ except Exception:
+ artifacts["error_message"] = traceback.format_exc()
+ return metrics, artifacts
+
+ hidden_avg = sum(float(item["candidate_total_runtime_s"]) for item in hidden_candidate) / len(hidden_candidate)
+ baseline_hidden_avg = sum(float(item["candidate_total_runtime_s"]) for item in hidden_baseline) / len(hidden_baseline)
+ public_avg = sum(float(item["candidate_total_runtime_s"]) for item in public_candidate) / len(public_candidate)
+ if not math.isfinite(hidden_avg) or hidden_avg <= 0:
+ artifacts["error_message"] = "candidate runtime is invalid"
+ return metrics, artifacts
+
+ metrics["valid"] = 1.0
+ metrics["public_avg_runtime_s"] = public_avg
+ metrics["hidden_avg_runtime_s"] = hidden_avg
+ metrics["baseline_hidden_avg_runtime_s"] = baseline_hidden_avg
+ metrics["num_public_cases"] = float(len(PUBLIC_CASES))
+ metrics["num_hidden_cases"] = float(len(HIDDEN_CASES))
+ metrics["combined_score"] = -hidden_avg
+ return metrics, artifacts
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser()
+ parser.add_argument("program")
+ parser.add_argument("--metrics-out", default="metrics.json")
+ args = parser.parse_args()
+ metrics, artifacts = evaluate(args.program)
+ Path(args.metrics_out).write_text(json.dumps(metrics, indent=2), encoding="utf-8")
+ if artifacts:
+ Path("artifacts.json").write_text(json.dumps(artifacts, indent=2), encoding="utf-8")
+ print(json.dumps(metrics, indent=2))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/verification/requirements.txt b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/verification/requirements.txt
new file mode 100644
index 00000000..8a6ba6a1
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBPreAggregationSelection/verification/requirements.txt
@@ -0,0 +1 @@
+duckdb
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/README.md b/benchmarks/ComputerSystems/DuckDBQueryRewrite/README.md
new file mode 100644
index 00000000..7a50d91e
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/README.md
@@ -0,0 +1,47 @@
+# DuckDB Query Rewrite
+
+Rewrite analytical SQL for a query family while preserving exact results and minimizing hidden-case average runtime.
+
+## What Changed
+
+- The evaluator now runs multiple public and hidden SQL cases.
+- Baseline rewrites are case-aware and no longer just echo the input SQL.
+- Semantic equivalence is checked on every case before runtime matters.
+
+## What You Edit
+
+- Target file: `scripts/init.py`
+- Entry point: `rewrite_query(sql, workload_manifest)`
+
+## Source of Truth
+
+- `Task.md`
+- `Task_zh-CN.md`
+- `runtime/problem.py`
+- `baseline/solution.py`
+- `verification/evaluator.py`
+
+## Environment
+
+```bash
+pip install -r frontier_eval/requirements.txt
+pip install -r benchmarks/ComputerSystems/DuckDBQueryRewrite/verification/requirements.txt
+```
+
+## Quick Run
+
+```bash
+python benchmarks/ComputerSystems/DuckDBQueryRewrite/verification/evaluator.py \
+ benchmarks/ComputerSystems/DuckDBQueryRewrite/scripts/init.py \
+ --metrics-out /tmp/DuckDBQueryRewrite_metrics.json
+```
+
+## Main Metrics
+
+- `combined_score = -hidden_avg_runtime_s`
+- `valid`
+- `public_avg_runtime_s`
+- `hidden_avg_runtime_s`
+- `baseline_hidden_avg_runtime_s`
+
+
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/README_zh-CN.md b/benchmarks/ComputerSystems/DuckDBQueryRewrite/README_zh-CN.md
new file mode 100644
index 00000000..0a7d10ab
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/README_zh-CN.md
@@ -0,0 +1,46 @@
+# DuckDB 查询重写
+
+对一组分析型 SQL 做语义等价改写,并尽量降低 hidden case 的平均运行时间。
+
+## 本轮同步后的变化
+
+- 评测已改成多组 public / hidden SQL case。
+- baseline 改写现在会按 case 选择具体 SQL,不再原样返回输入。
+- 每个 case 都会先做语义等价检查,只有等价后才比较运行时间。
+
+## 你会改的文件
+
+- 目标文件:`scripts/init.py`
+- 入口函数:`rewrite_query(sql, workload_manifest)`
+
+## 先看哪里
+
+- `Task.md` / `Task_zh-CN.md`
+- `runtime/problem.py`
+- `baseline/solution.py`
+- `verification/evaluator.py`
+
+## 环境准备
+
+```bash
+pip install -r frontier_eval/requirements.txt
+pip install -r benchmarks/ComputerSystems/DuckDBQueryRewrite/verification/requirements.txt
+```
+
+## 快速运行
+
+```bash
+python benchmarks/ComputerSystems/DuckDBQueryRewrite/verification/evaluator.py \
+ benchmarks/ComputerSystems/DuckDBQueryRewrite/scripts/init.py \
+ --metrics-out /tmp/DuckDBQueryRewrite_metrics.json
+```
+
+## 主要指标
+
+- `combined_score = -hidden_avg_runtime_s`
+- `valid`
+- `public_avg_runtime_s`
+- `hidden_avg_runtime_s`
+- `baseline_hidden_avg_runtime_s`
+
+
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/Task.md b/benchmarks/ComputerSystems/DuckDBQueryRewrite/Task.md
new file mode 100644
index 00000000..394619b6
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/Task.md
@@ -0,0 +1,51 @@
+# DuckDB Query Rewrite Task
+
+## Problem
+
+Rewrite analytical SQL queries for a workload family while preserving exact results and minimizing hidden-case average runtime.
+
+This task is no longer a single frozen SQL statement. The evaluator now uses multiple public and hidden SQL cases with different grouping keys, filters, and rollups. A good rewrite strategy should preserve semantics exactly and improve runtime across the query family.
+
+## What Is Frozen
+
+- The local DuckDB schema and data generator in `benchmarks/ComputerSystems/duckdb_local_workload.py`.
+- The case-specific baseline SQL stored in `PUBLIC_CASES` and `HIDDEN_CASES`.
+- The semantic check: candidate rows must match the frozen baseline query exactly, up to floating-point tolerance.
+
+## Submission Contract
+
+Submit one Python file that defines:
+
+```python
+def rewrite_query(sql, workload_manifest):
+ ...
+```
+
+Return a rewritten SQL string. A dict with key `sql` is also accepted by the runtime helper.
+
+## Evaluation
+
+1. For each public and hidden case, pass the baseline SQL and case manifest into `rewrite_query(...)`.
+2. Execute both the baseline SQL and the candidate SQL on fresh DuckDB databases.
+3. Reject the candidate if any query result differs from the baseline result.
+4. Measure runtime across the case family; scoring uses the hidden-case average.
+
+## Metrics
+
+- `combined_score`: `-hidden_avg_runtime_s`
+- `valid`: `1.0` only if every rewritten query is semantically equivalent and all cases run successfully
+- `public_avg_runtime_s`
+- `hidden_avg_runtime_s`
+- `baseline_hidden_avg_runtime_s`
+- `num_public_cases`
+- `num_hidden_cases`
+
+## Invalid Submissions
+
+- `rewrite_query(...)` is missing or crashes
+- The returned value cannot be interpreted as SQL
+- Any public or hidden case changes the query result
+- Any rewritten query fails to execute
+- Any reported runtime becomes non-finite
+
+
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/Task_zh-CN.md b/benchmarks/ComputerSystems/DuckDBQueryRewrite/Task_zh-CN.md
new file mode 100644
index 00000000..9aa6ddfa
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/Task_zh-CN.md
@@ -0,0 +1,51 @@
+# DuckDB SQL 改写
+
+## 任务概览
+
+对一组分析型 SQL 进行改写,在保持结果完全等价的前提下,尽量降低 hidden case 的平均运行时间。
+
+这个任务不再是单条冻结 SQL。评测现在会使用多组 `public` / `hidden` SQL case,它们会改变分组键、过滤条件和 rollup 形式。好的策略应当既保证语义完全一致,又能对整个 query family 带来稳定收益。
+
+## 哪些部分是冻结的
+
+- `benchmarks/ComputerSystems/duckdb_local_workload.py` 中的本地 DuckDB schema 与数据生成逻辑。
+- `PUBLIC_CASES` 与 `HIDDEN_CASES` 中保存的 case-specific baseline SQL。
+- 固定语义校验:候选结果必须与 baseline 查询结果逐行等价,浮点值只允许很小容差。
+
+## 提交接口
+
+提交一个 Python 文件,定义:
+
+```python
+def rewrite_query(sql, workload_manifest):
+ ...
+```
+
+返回改写后的 SQL 字符串;runtime helper 也接受带 `sql` 字段的字典。
+
+## 评测流程
+
+1. 对每个 public / hidden case,把 baseline SQL 和 case manifest 传入 `rewrite_query(...)`。
+2. 在全新的 DuckDB 数据库上分别执行 baseline SQL 与 candidate SQL。
+3. 如果任意 case 的结果与 baseline 不一致,则直接判失败。
+4. 聚合整个 case family 的运行时间;最终分数使用 hidden case 平均耗时。
+
+## 指标
+
+- `combined_score`:`-hidden_avg_runtime_s`
+- `valid`:只有所有改写结果都语义等价且全部 case 成功运行时才为 `1.0`
+- `public_avg_runtime_s`
+- `hidden_avg_runtime_s`
+- `baseline_hidden_avg_runtime_s`
+- `num_public_cases`
+- `num_hidden_cases`
+
+## 判为无效的情况
+
+- 缺少 `rewrite_query(...)`,或函数执行报错
+- 返回值无法解释为 SQL
+- 任意 public 或 hidden case 改变了查询结果
+- 任意改写 SQL 执行失败
+- 任意运行时间指标变成非有限值
+
+
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/baseline/solution.py b/benchmarks/ComputerSystems/DuckDBQueryRewrite/baseline/solution.py
new file mode 100644
index 00000000..4a893bd6
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/baseline/solution.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+
+def rewrite_query(sql, workload_manifest):
+ query_id = str(workload_manifest.get("query_id", ""))
+ rewrites = {
+ "quarter_join": """
+SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket,
+ c.c_mktsegment AS segment,
+ sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue,
+ count(DISTINCT o.o_orderkey) AS order_count
+FROM customer c
+JOIN orders o ON o.o_custkey = c.c_custkey
+JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+WHERE c.c_mktsegment IN ('BUILDING', 'AUTOMOBILE', 'HOUSEHOLD')
+GROUP BY 1, 2
+ORDER BY quarter_bucket, segment
+""".strip(),
+ "shipmode_month": """
+SELECT date_trunc('month', l_shipdate) AS month_bucket,
+ l_shipmode AS shipmode,
+ sum(l_extendedprice * (1 - l_discount)) AS revenue,
+ count(*) AS line_count
+FROM lineitem
+WHERE l_shipdate >= DATE '1997-01-01'
+GROUP BY 1, 2
+ORDER BY month_bucket, shipmode
+""".strip(),
+ "customer_year": """
+SELECT year(o.o_orderdate) AS revenue_year,
+ c.c_custkey AS customer_key,
+ sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue,
+ count(DISTINCT o.o_orderkey) AS order_count
+FROM customer c
+JOIN orders o ON o.o_custkey = c.c_custkey
+JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+GROUP BY 1, 2
+HAVING year(o.o_orderdate) = 1998
+ORDER BY revenue DESC, customer_key
+LIMIT 80
+""".strip(),
+ "quarter_join_recent": """
+SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket,
+ c.c_mktsegment AS segment,
+ sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue,
+ count(DISTINCT o.o_orderkey) AS order_count
+FROM customer c
+JOIN orders o ON o.o_custkey = c.c_custkey
+JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+WHERE c.c_mktsegment IN ('BUILDING', 'AUTOMOBILE') AND o.o_orderdate >= DATE '1997-01-01'
+GROUP BY 1, 2
+ORDER BY quarter_bucket, segment
+""".strip(),
+ "shipmode_recent": """
+SELECT date_trunc('month', l_shipdate) AS month_bucket,
+ l_shipmode AS shipmode,
+ sum(l_extendedprice * (1 - l_discount)) AS revenue,
+ count(*) AS line_count
+FROM lineitem
+WHERE l_shipdate >= DATE '1998-01-01'
+GROUP BY 1, 2
+ORDER BY month_bucket, shipmode
+""".strip(),
+ "customer_year_1997": """
+SELECT year(o.o_orderdate) AS revenue_year,
+ c.c_custkey AS customer_key,
+ sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue,
+ count(DISTINCT o.o_orderkey) AS order_count
+FROM customer c
+JOIN orders o ON o.o_custkey = c.c_custkey
+JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+GROUP BY 1, 2
+HAVING year(o.o_orderdate) = 1997
+ORDER BY revenue DESC, customer_key
+LIMIT 60
+""".strip(),
+ "segment_rollup": """
+SELECT c.c_mktsegment AS segment,
+ sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue,
+ count(DISTINCT o.o_orderkey) AS order_count
+FROM customer c
+JOIN orders o ON o.o_custkey = c.c_custkey
+JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+WHERE o.o_orderdate >= DATE '1996-01-01'
+GROUP BY 1
+ORDER BY segment
+""".strip(),
+ "priority_rollup": """
+SELECT o.o_orderpriority AS priority_bucket,
+ sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue,
+ count(*) AS order_count
+FROM orders o
+JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+WHERE o.o_orderdate >= DATE '1997-01-01'
+GROUP BY 1
+ORDER BY priority_bucket
+""".strip(),
+ }
+ return rewrites.get(query_id, str(sql).strip())
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/agent_files.txt b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/agent_files.txt
new file mode 100644
index 00000000..1d2eb069
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/agent_files.txt
@@ -0,0 +1,6 @@
+Task.md
+Task_zh-CN.md
+README.md
+baseline/solution.py
+runtime/problem.py
+references/source_manifest.md
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/candidate_destination.txt b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/candidate_destination.txt
new file mode 100644
index 00000000..b9411b3d
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/candidate_destination.txt
@@ -0,0 +1 @@
+scripts/init.py
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/constraints.txt b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/constraints.txt
new file mode 100644
index 00000000..88b1935c
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/constraints.txt
@@ -0,0 +1,4 @@
+Edit only `scripts/init.py`.
+Modify only code between `# EVOLVE-BLOCK-START` and `# EVOLVE-BLOCK-END` in that file.
+Do not modify files under `baseline/`, `runtime/`, `references/`, or `verification/`.
+Keep outputs valid and finite.
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/eval_command.txt b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/eval_command.txt
new file mode 100644
index 00000000..fcba5e60
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/eval_command.txt
@@ -0,0 +1 @@
+{python} verification/evaluator.py {candidate} --metrics-out metrics.json
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/eval_cwd.txt b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/eval_cwd.txt
new file mode 100644
index 00000000..9c558e35
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/eval_cwd.txt
@@ -0,0 +1 @@
+.
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/initial_program.txt b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/initial_program.txt
new file mode 100644
index 00000000..b9411b3d
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/initial_program.txt
@@ -0,0 +1 @@
+scripts/init.py
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/readonly_files.txt b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/readonly_files.txt
new file mode 100644
index 00000000..8bb37291
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/frontier_eval/readonly_files.txt
@@ -0,0 +1,5 @@
+baseline/solution.py
+runtime/problem.py
+runtime/duckdb_local_workload.py
+verification/evaluator.py
+references/source_manifest.md
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/references/source_manifest.md b/benchmarks/ComputerSystems/DuckDBQueryRewrite/references/source_manifest.md
new file mode 100644
index 00000000..43dc6c9b
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/references/source_manifest.md
@@ -0,0 +1,10 @@
+# Source Manifest
+
+- Upstream engine: `DuckDB`
+- Upstream lineage:
+ - DuckDB benchmark and TPC-H documentation
+ - DuckDB SQL optimizer and query execution model
+- Schema lineage: this benchmark uses a local frozen relational workload with `customer`, `orders`, and `lineitem` tables modeled after the TPC-H schema family.
+- Data provenance: rows are generated deterministically inside DuckDB from fixed SQL formulas and a fixed schema; this is a benchmark-local synthetic dataset, not official TPC-H `dbgen` output.
+- Authenticity note: the workload shape is traceable to official DuckDB/TPC-H analytical reporting patterns, while the exact query instances are benchmark-local frozen SQL cases chosen to expose meaningful rewrite opportunities.
+- License lineage: DuckDB is released under the MIT License.
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/runtime/duckdb_local_workload.py b/benchmarks/ComputerSystems/DuckDBQueryRewrite/runtime/duckdb_local_workload.py
new file mode 100644
index 00000000..cb0da163
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/runtime/duckdb_local_workload.py
@@ -0,0 +1,391 @@
+from __future__ import annotations
+
+import math
+import time
+from typing import Any
+
+import duckdb
+
+
+CUSTOMER_COUNT = 20_000
+ORDER_COUNT = 120_000
+LINEITEM_COUNT = 600_000
+
+CUSTOMER_KEYS = tuple(1 + ((i * 97) % CUSTOMER_COUNT) for i in range(1, 301))
+ORDER_KEYS = tuple(1 + ((i * 193) % ORDER_COUNT) for i in range(1, 301))
+
+
+INDEX_CANDIDATES = {
+ "idx_orders_cust": "CREATE INDEX idx_orders_cust ON orders(o_custkey)",
+ "idx_orders_date": "CREATE INDEX idx_orders_date ON orders(o_orderdate)",
+ "idx_lineitem_order": "CREATE INDEX idx_lineitem_order ON lineitem(l_orderkey)",
+ "idx_customer_segment": "CREATE INDEX idx_customer_segment ON customer(c_mktsegment)",
+ "idx_orders_priority": "CREATE INDEX idx_orders_priority ON orders(o_orderpriority)",
+}
+
+PREAGGREGATION_CANDIDATES = {
+ "agg_quarter_segment_revenue": (
+ "CREATE TABLE agg_quarter_segment_revenue AS "
+ "SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket, "
+ " c.c_mktsegment AS segment, "
+ " sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+ "FROM customer c "
+ "JOIN orders o ON o.o_custkey = c.c_custkey "
+ "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+ "GROUP BY 1, 2"
+ ),
+ "agg_month_shipmode_revenue": (
+ "CREATE TABLE agg_month_shipmode_revenue AS "
+ "SELECT date_trunc('month', l.l_shipdate) AS month_bucket, "
+ " l.l_shipmode AS shipmode, "
+ " sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+ "FROM lineitem l "
+ "GROUP BY 1, 2"
+ ),
+ "agg_customer_year_revenue": (
+ "CREATE TABLE agg_customer_year_revenue AS "
+ "SELECT year(o.o_orderdate) AS revenue_year, "
+ " c.c_custkey, "
+ " sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+ "FROM customer c "
+ "JOIN orders o ON o.o_custkey = c.c_custkey "
+ "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+ "GROUP BY 1, 2"
+ ),
+ "agg_unused_priority_only": (
+ "CREATE TABLE agg_unused_priority_only AS "
+ "SELECT o.o_orderpriority, count(*) AS order_count "
+ "FROM orders o "
+ "GROUP BY 1"
+ ),
+}
+
+
+def build_connection() -> duckdb.DuckDBPyConnection:
+ con = duckdb.connect(database=":memory:")
+ con.execute("PRAGMA threads=1")
+ con.execute(
+ f"""
+ CREATE TABLE customer AS
+ SELECT i AS c_custkey,
+ 'Customer #' || i AS c_name,
+ CASE i % 5
+ WHEN 0 THEN 'BUILDING'
+ WHEN 1 THEN 'AUTOMOBILE'
+ WHEN 2 THEN 'HOUSEHOLD'
+ WHEN 3 THEN 'FURNITURE'
+ ELSE 'MACHINERY'
+ END AS c_mktsegment,
+ i % 25 AS c_nationkey
+ FROM range(1, {CUSTOMER_COUNT + 1}) t(i)
+ """
+ )
+ con.execute(
+ f"""
+ CREATE TABLE orders AS
+ SELECT i AS o_orderkey,
+ 1 + ((i * 17) % {CUSTOMER_COUNT}) AS o_custkey,
+ DATE '1995-01-01' + (((i * 13) % 1460) * INTERVAL 1 DAY) AS o_orderdate,
+ 100 + (((i * 37) % 100000) / 10.0) AS o_totalprice,
+ CASE i % 5
+ WHEN 0 THEN '1-URGENT'
+ WHEN 1 THEN '2-HIGH'
+ WHEN 2 THEN '3-MEDIUM'
+ WHEN 3 THEN '4-NOT SPECIFIED'
+ ELSE '5-LOW'
+ END AS o_orderpriority
+ FROM range(1, {ORDER_COUNT + 1}) t(i)
+ """
+ )
+ con.execute(
+ f"""
+ CREATE TABLE lineitem AS
+ SELECT i AS l_lineitemkey,
+ 1 + ((i * 7) % {ORDER_COUNT}) AS l_orderkey,
+ 1 + ((i * 11) % 50000) AS l_partkey,
+ 1 + ((i * 13) % 10000) AS l_suppkey,
+ 1 + ((i * 5) % 50) AS l_quantity,
+ 10 + (((i * 19) % 100000) / 20.0) AS l_extendedprice,
+ (((i * 3) % 10) / 100.0) AS l_discount,
+ DATE '1995-01-01' + (((i * 29) % 1460) * INTERVAL 1 DAY) AS l_shipdate,
+ CASE i % 5
+ WHEN 0 THEN 'AIR'
+ WHEN 1 THEN 'MAIL'
+ WHEN 2 THEN 'RAIL'
+ WHEN 3 THEN 'TRUCK'
+ ELSE 'SHIP'
+ END AS l_shipmode
+ FROM range(1, {LINEITEM_COUNT + 1}) t(i)
+ """
+ )
+ return con
+
+
+def normalize_name_list(value: Any, key: str) -> list[str]:
+ if isinstance(value, dict):
+ if key not in value:
+ raise ValueError(f"missing {key}")
+ value = value[key]
+ if not isinstance(value, (list, tuple)):
+ raise ValueError(f"{key} must be a list or tuple")
+ out: list[str] = []
+ seen = set()
+ for item in value:
+ name = str(item)
+ if name not in seen:
+ out.append(name)
+ seen.add(name)
+ return out
+
+
+def compare_results(lhs: list[tuple[Any, ...]], rhs: list[tuple[Any, ...]], tol: float = 1e-6) -> bool:
+ if len(lhs) != len(rhs):
+ return False
+ for left_row, right_row in zip(lhs, rhs):
+ if len(left_row) != len(right_row):
+ return False
+ for left_value, right_value in zip(left_row, right_row):
+ if isinstance(left_value, float) or isinstance(right_value, float):
+ if not math.isfinite(float(left_value)) or not math.isfinite(float(right_value)):
+ return False
+ if abs(float(left_value) - float(right_value)) > tol:
+ return False
+ elif left_value != right_value:
+ return False
+ return True
+
+
+def _index_keys(sample_size: int, source: tuple[int, ...]) -> tuple[int, ...]:
+ sample_size = max(1, min(len(source), int(sample_size)))
+ return tuple(source[:sample_size])
+
+
+def run_index_workload(con: duckdb.DuckDBPyConnection, manifest: dict[str, Any]) -> float:
+ start_time = time.perf_counter()
+ customer_keys = _index_keys(manifest.get("customer_sample", 80), CUSTOMER_KEYS)
+ order_keys = _index_keys(manifest.get("order_sample", 80), ORDER_KEYS)
+ urgent_customer_keys = _index_keys(manifest.get("urgent_customer_sample", 40), CUSTOMER_KEYS)
+ min_order_date = str(manifest.get("min_order_date", "1997-01-01"))
+ priority_value = str(manifest.get("priority_value", "1-URGENT"))
+
+ for customer_key in customer_keys:
+ con.execute(
+ "SELECT sum(o_totalprice) "
+ "FROM orders "
+ "WHERE o_custkey = ? AND o_orderdate >= CAST(? AS DATE)",
+ [customer_key, min_order_date],
+ ).fetchone()
+ for order_key in order_keys:
+ con.execute(
+ "SELECT sum(l_extendedprice * (1 - l_discount)) "
+ "FROM lineitem "
+ "WHERE l_orderkey = ?",
+ [order_key],
+ ).fetchone()
+ for customer_key in urgent_customer_keys:
+ con.execute(
+ "SELECT count(*) "
+ "FROM customer c "
+ "JOIN orders o ON c.c_custkey = o.o_custkey "
+ "WHERE c.c_custkey = ? AND o.o_orderpriority = ?",
+ [customer_key, priority_value],
+ ).fetchone()
+ return time.perf_counter() - start_time
+
+
+def measure_index_design(selected_indexes: list[str], manifest: dict[str, Any]) -> dict[str, float | int]:
+ allowed = tuple(manifest.get("candidate_indexes", tuple(sorted(INDEX_CANDIDATES))))
+ max_indexes = int(manifest.get("max_indexes", len(allowed)))
+ unknown = [name for name in selected_indexes if name not in allowed]
+ if unknown:
+ raise ValueError(f"unknown index names: {unknown}")
+ if len(selected_indexes) > max_indexes:
+ raise ValueError(f"too many indexes selected: {len(selected_indexes)} > {max_indexes}")
+
+ con = build_connection()
+ start_setup = time.perf_counter()
+ for name in selected_indexes:
+ con.execute(INDEX_CANDIDATES[name])
+ setup_runtime = time.perf_counter() - start_setup
+
+ workload_runtime = 0.0
+ repetitions = int(manifest.get("repetitions", 3))
+ run_index_workload(con, manifest)
+ for _ in range(repetitions):
+ workload_runtime += run_index_workload(con, manifest)
+ return {
+ "setup_runtime_s": float(setup_runtime),
+ "workload_runtime_s": float(workload_runtime),
+ "total_runtime_s": float(setup_runtime + workload_runtime),
+ "selected_index_count": len(selected_indexes),
+ }
+
+
+def _report_quarter_segment(con: duckdb.DuckDBPyConnection, use_aggregate: bool, segment_filter: tuple[str, ...]) -> list[tuple[Any, ...]]:
+ values = ", ".join(f"'{value}'" for value in segment_filter)
+ if use_aggregate:
+ return con.execute(
+ "SELECT quarter_bucket, segment, revenue "
+ "FROM agg_quarter_segment_revenue "
+ f"WHERE segment IN ({values}) "
+ "ORDER BY quarter_bucket, segment"
+ ).fetchall()
+ return con.execute(
+ "SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket, "
+ " c.c_mktsegment AS segment, "
+ " sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+ "FROM customer c "
+ "JOIN orders o ON o.o_custkey = c.c_custkey "
+ "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+ f"WHERE c.c_mktsegment IN ({values}) "
+ "GROUP BY 1, 2 "
+ "ORDER BY quarter_bucket, segment"
+ ).fetchall()
+
+
+def _report_month_shipmode(con: duckdb.DuckDBPyConnection, use_aggregate: bool, min_shipdate: str) -> list[tuple[Any, ...]]:
+    """Return revenue per (ship month, ship mode) from ``min_shipdate`` onwards.
+
+    With ``use_aggregate`` the rows are read from ``agg_month_shipmode_revenue``
+    and filtered on ``month_bucket``; otherwise they are aggregated from
+    ``lineitem`` and filtered on ``l_shipdate``.
+
+    NOTE(review): the two paths filter on different columns, so they presumably
+    only agree when ``min_shipdate`` is the first day of a month - confirm.
+    """
+    if use_aggregate:
+        statement = (
+            "SELECT month_bucket, shipmode, revenue "
+            "FROM agg_month_shipmode_revenue "
+            "WHERE month_bucket >= CAST(? AS DATE) "
+            "ORDER BY month_bucket, shipmode"
+        )
+    else:
+        statement = (
+            "SELECT date_trunc('month', l.l_shipdate) AS month_bucket, "
+            "       l.l_shipmode AS shipmode, "
+            "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+            "FROM lineitem l "
+            "WHERE l.l_shipdate >= CAST(? AS DATE) "
+            "GROUP BY 1, 2 "
+            "ORDER BY month_bucket, shipmode"
+        )
+    cursor = con.execute(statement, [min_shipdate])
+    return cursor.fetchall()
+
+
+def _report_customer_year(con: duckdb.DuckDBPyConnection, use_aggregate: bool, revenue_year: int, limit_rows: int) -> list[tuple[Any, ...]]:
+    """Return the top customers by revenue for one order year.
+
+    Rows are ``(revenue_year, c_custkey, revenue)`` sorted by revenue
+    (descending) and then customer key, truncated to ``limit_rows``. With
+    ``use_aggregate`` they are read from ``agg_customer_year_revenue``;
+    otherwise they are aggregated from ``customer``, ``orders`` and ``lineitem``.
+    """
+    bind_values = [revenue_year, limit_rows]
+    if use_aggregate:
+        statement = (
+            "SELECT revenue_year, c_custkey, revenue "
+            "FROM agg_customer_year_revenue "
+            "WHERE revenue_year = ? "
+            "ORDER BY revenue DESC, c_custkey "
+            "LIMIT ?"
+        )
+    else:
+        # The year filter is applied in HAVING, i.e. on the grouped output.
+        statement = (
+            "SELECT year(o.o_orderdate) AS revenue_year, "
+            "       c.c_custkey, "
+            "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+            "FROM customer c "
+            "JOIN orders o ON o.o_custkey = c.c_custkey "
+            "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+            "GROUP BY 1, 2 "
+            "HAVING year(o.o_orderdate) = ? "
+            "ORDER BY revenue DESC, c.c_custkey "
+            "LIMIT ?"
+        )
+    return con.execute(statement, bind_values).fetchall()
+
+
+def _run_preaggregation_reports(
+    con: duckdb.DuckDBPyConnection,
+    selected: set[str],
+    manifest: dict[str, Any],
+) -> tuple[float, tuple[list[tuple[Any, ...]], ...]]:
+    """Run the three revenue reports once and time them as one batch.
+
+    A report reads its pre-aggregated table only when that table's name is in
+    ``selected``. Report parameters come from ``manifest`` with defaults:
+    ``segment_filter``, ``min_shipdate``, ``revenue_year`` and ``limit_rows``.
+
+    Returns:
+        ``(elapsed_seconds, (quarter_rows, shipmode_rows, customer_rows))``.
+    """
+    # The clock starts before the parameters are read, so that small overhead
+    # is part of the measured time.
+    started = time.perf_counter()
+
+    segments = tuple(manifest.get("segment_filter", ("BUILDING", "AUTOMOBILE", "HOUSEHOLD")))
+    quarter_rows = _report_quarter_segment(con, "agg_quarter_segment_revenue" in selected, segments)
+
+    cutoff = str(manifest.get("min_shipdate", "1997-01-01"))
+    shipmode_rows = _report_month_shipmode(con, "agg_month_shipmode_revenue" in selected, cutoff)
+
+    year = int(manifest.get("revenue_year", 1998))
+    row_limit = int(manifest.get("limit_rows", 100))
+    customer_rows = _report_customer_year(con, "agg_customer_year_revenue" in selected, year, row_limit)
+
+    elapsed = time.perf_counter() - started
+    return elapsed, (quarter_rows, shipmode_rows, customer_rows)
+
+
+def measure_preaggregation_design(selected_preaggregations: list[str], manifest: dict[str, Any]) -> dict[str, float | int]:
+    """Materialize the selected pre-aggregations and time the report workload.
+
+    Two fresh databases are built: a baseline without pre-aggregations and a
+    candidate holding the selected ones. The candidate's reports must match the
+    baseline's; then both get one untimed warm-up run followed by
+    ``repetitions`` timed runs.
+
+    Args:
+        selected_preaggregations: Names of the tables to create, in order.
+        manifest: Workload settings. ``candidate_preaggregations`` (default:
+            every key of ``PREAGGREGATION_CANDIDATES``) is the whitelist,
+            ``max_preaggregations`` (default: the whitelist size) caps the
+            selection, and ``repetitions`` defaults to 3.
+
+    Returns:
+        Setup time, summed candidate and baseline report times, the candidate
+        total (setup plus reports), and the number of selected tables.
+
+    Raises:
+        ValueError: If a name is not whitelisted or has no known definition, if
+            too many names are selected, or if the candidate reports differ
+            from the baseline reports.
+    """
+    allowed = tuple(manifest.get("candidate_preaggregations", tuple(sorted(PREAGGREGATION_CANDIDATES))))
+    max_preaggregations = int(manifest.get("max_preaggregations", len(allowed)))
+    # A name must be whitelisted by the manifest AND have a CREATE TABLE
+    # statement; otherwise the lookup further down would raise KeyError.
+    unknown = [
+        name
+        for name in selected_preaggregations
+        if name not in allowed or name not in PREAGGREGATION_CANDIDATES
+    ]
+    if unknown:
+        raise ValueError(f"unknown pre-aggregation names: {unknown}")
+    if len(selected_preaggregations) > max_preaggregations:
+        raise ValueError(
+            f"too many pre-aggregations selected: {len(selected_preaggregations)} > {max_preaggregations}"
+        )
+
+    selected = set(selected_preaggregations)
+    baseline_con = None
+    candidate_con = None
+    try:
+        baseline_con = build_connection()
+        candidate_con = build_connection()
+        start_setup = time.perf_counter()
+        for name in selected_preaggregations:
+            candidate_con.execute(PREAGGREGATION_CANDIDATES[name])
+        setup_runtime = time.perf_counter() - start_setup
+
+        _, baseline_results = _run_preaggregation_reports(baseline_con, set(), manifest)
+        _, candidate_results = _run_preaggregation_reports(candidate_con, selected, manifest)
+        if any(not compare_results(left, right) for left, right in zip(candidate_results, baseline_results)):
+            raise ValueError("candidate pre-aggregation selection changed the query results")
+
+        repetitions = int(manifest.get("repetitions", 3))
+        repeated_baseline_runtime = 0.0
+        repeated_candidate_runtime = 0.0
+        # One untimed warm-up run per connection before the timed repetitions.
+        _run_preaggregation_reports(baseline_con, set(), manifest)
+        _run_preaggregation_reports(candidate_con, selected, manifest)
+        for _ in range(repetitions):
+            extra_runtime, _ = _run_preaggregation_reports(baseline_con, set(), manifest)
+            repeated_baseline_runtime += extra_runtime
+            extra_runtime, _ = _run_preaggregation_reports(candidate_con, selected, manifest)
+            repeated_candidate_runtime += extra_runtime
+    finally:
+        # Release both in-memory databases even when validation fails.
+        for con in (candidate_con, baseline_con):
+            if con is not None:
+                con.close()
+
+    return {
+        "setup_runtime_s": float(setup_runtime),
+        "candidate_workload_runtime_s": float(repeated_candidate_runtime),
+        "candidate_total_runtime_s": float(setup_runtime + repeated_candidate_runtime),
+        "baseline_total_runtime_s": float(repeated_baseline_runtime),
+        "selected_preaggregation_count": len(selected_preaggregations),
+    }
+
+
+def measure_query_rewrite(sql: str, manifest: dict[str, Any]) -> dict[str, Any]:
+    """Check a rewritten query against the baseline query and time both.
+
+    Each query runs on its own freshly built database. The candidate must
+    return the same rows as the baseline according to ``compare_results``;
+    then each query gets one untimed warm-up run followed by ``repetitions``
+    timed runs.
+
+    Args:
+        sql: Candidate query text.
+        manifest: Must contain ``baseline_sql``; ``repetitions`` defaults to 3.
+
+    Returns:
+        ``baseline_runtime_s`` and ``candidate_runtime_s`` (wall-clock time of
+        all timed runs) and ``row_count`` of the last candidate result.
+
+    Raises:
+        ValueError: If ``sql`` is blank or the candidate rows differ from the
+            baseline rows.
+        KeyError: If ``manifest`` has no ``baseline_sql`` entry.
+    """
+    sql = str(sql).strip()
+    if not sql:
+        raise ValueError("query must not be empty")
+    baseline_sql = str(manifest["baseline_sql"]).strip()
+    repetitions = int(manifest.get("repetitions", 3))
+
+    baseline_con = None
+    candidate_con = None
+    try:
+        baseline_con = build_connection()
+        candidate_con = build_connection()
+        baseline_rows = baseline_con.execute(baseline_sql).fetchall()
+        candidate_rows = candidate_con.execute(sql).fetchall()
+        if not compare_results(candidate_rows, baseline_rows):
+            raise ValueError("candidate query result does not match the baseline result")
+
+        # Untimed warm-up run, then the timed repetitions.
+        baseline_con.execute(baseline_sql).fetchall()
+        baseline_start = time.perf_counter()
+        for _ in range(repetitions):
+            baseline_con.execute(baseline_sql).fetchall()
+        baseline_runtime = time.perf_counter() - baseline_start
+
+        candidate_con.execute(sql).fetchall()
+        candidate_start = time.perf_counter()
+        for _ in range(repetitions):
+            candidate_rows = candidate_con.execute(sql).fetchall()
+        candidate_runtime = time.perf_counter() - candidate_start
+    finally:
+        # Release both in-memory databases even when the query fails or the
+        # results do not match.
+        for con in (candidate_con, baseline_con):
+            if con is not None:
+                con.close()
+
+    return {
+        "baseline_runtime_s": float(baseline_runtime),
+        "candidate_runtime_s": float(candidate_runtime),
+        "row_count": len(candidate_rows),
+    }
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/runtime/problem.py b/benchmarks/ComputerSystems/DuckDBQueryRewrite/runtime/problem.py
new file mode 100644
index 00000000..659f3207
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/runtime/problem.py
@@ -0,0 +1,263 @@
+from __future__ import annotations
+
+try:
+ from .duckdb_local_workload import measure_query_rewrite
+except ImportError:
+ from benchmarks.ComputerSystems.duckdb_local_workload import measure_query_rewrite
+
+
+# Public benchmark cases. Each entry names the case and query, carries the
+# baseline SQL text, and sets how many timed repetitions to run. Every baseline
+# computes two aggregates in separate CTEs over the same input and joins them.
+PUBLIC_CASES = (
+    {
+        "case_id": "public_quarter_join",
+        "query_id": "quarter_join",
+        "baseline_sql": """
+WITH revenue AS (
+    SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket,
+           c.c_mktsegment AS segment,
+           sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue
+    FROM customer c
+    JOIN orders o ON o.o_custkey = c.c_custkey
+    JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+    WHERE c.c_mktsegment IN ('BUILDING', 'AUTOMOBILE', 'HOUSEHOLD')
+    GROUP BY 1, 2
+),
+order_counts AS (
+    SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket,
+           c.c_mktsegment AS segment,
+           count(DISTINCT o.o_orderkey) AS order_count
+    FROM customer c
+    JOIN orders o ON o.o_custkey = c.c_custkey
+    JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+    WHERE c.c_mktsegment IN ('BUILDING', 'AUTOMOBILE', 'HOUSEHOLD')
+    GROUP BY 1, 2
+)
+SELECT r.quarter_bucket, r.segment, r.revenue, o.order_count
+FROM revenue r
+JOIN order_counts o USING (quarter_bucket, segment)
+ORDER BY quarter_bucket, segment
+""".strip(),
+        "repetitions": 3,
+    },
+    {
+        "case_id": "public_shipmode_month",
+        "query_id": "shipmode_month",
+        "baseline_sql": """
+WITH revenue AS (
+    SELECT date_trunc('month', l_shipdate) AS month_bucket,
+           l_shipmode AS shipmode,
+           sum(l_extendedprice * (1 - l_discount)) AS revenue
+    FROM lineitem
+    WHERE l_shipdate >= DATE '1997-01-01'
+    GROUP BY 1, 2
+),
+counts AS (
+    SELECT date_trunc('month', l_shipdate) AS month_bucket,
+           l_shipmode AS shipmode,
+           count(*) AS line_count
+    FROM lineitem
+    WHERE l_shipdate >= DATE '1997-01-01'
+    GROUP BY 1, 2
+)
+SELECT r.month_bucket, r.shipmode, r.revenue, c.line_count
+FROM revenue r
+JOIN counts c USING (month_bucket, shipmode)
+ORDER BY month_bucket, shipmode
+""".strip(),
+        "repetitions": 3,
+    },
+    {
+        "case_id": "public_customer_year",
+        "query_id": "customer_year",
+        "baseline_sql": """
+WITH rev AS (
+    SELECT year(o.o_orderdate) AS revenue_year,
+           c.c_custkey AS customer_key,
+           sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue
+    FROM customer c
+    JOIN orders o ON o.o_custkey = c.c_custkey
+    JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+    GROUP BY 1, 2
+),
+orders_seen AS (
+    SELECT year(o.o_orderdate) AS revenue_year,
+           c.c_custkey AS customer_key,
+           count(DISTINCT o.o_orderkey) AS order_count
+    FROM customer c
+    JOIN orders o ON o.o_custkey = c.c_custkey
+    JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+    GROUP BY 1, 2
+)
+SELECT rev.revenue_year, rev.customer_key, rev.revenue, orders_seen.order_count
+FROM rev
+JOIN orders_seen USING (revenue_year, customer_key)
+WHERE rev.revenue_year = 1998
+ORDER BY rev.revenue DESC, rev.customer_key
+LIMIT 80
+""".strip(),
+        "repetitions": 2,
+    },
+)
+
+# Hidden benchmark cases, with the same entry layout as PUBLIC_CASES. The first
+# three vary the filters of the public queries; the last two are rollups by
+# market segment and by order priority.
+HIDDEN_CASES = (
+    {
+        "case_id": "hidden_quarter_join_recent",
+        "query_id": "quarter_join_recent",
+        "baseline_sql": """
+WITH revenue AS (
+    SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket,
+           c.c_mktsegment AS segment,
+           sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue
+    FROM customer c
+    JOIN orders o ON o.o_custkey = c.c_custkey
+    JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+    WHERE c.c_mktsegment IN ('BUILDING', 'AUTOMOBILE') AND o.o_orderdate >= DATE '1997-01-01'
+    GROUP BY 1, 2
+),
+order_counts AS (
+    SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket,
+           c.c_mktsegment AS segment,
+           count(DISTINCT o.o_orderkey) AS order_count
+    FROM customer c
+    JOIN orders o ON o.o_custkey = c.c_custkey
+    JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+    WHERE c.c_mktsegment IN ('BUILDING', 'AUTOMOBILE') AND o.o_orderdate >= DATE '1997-01-01'
+    GROUP BY 1, 2
+)
+SELECT r.quarter_bucket, r.segment, r.revenue, o.order_count
+FROM revenue r
+JOIN order_counts o USING (quarter_bucket, segment)
+ORDER BY quarter_bucket, segment
+""".strip(),
+        "repetitions": 2,
+    },
+    {
+        "case_id": "hidden_shipmode_recent",
+        "query_id": "shipmode_recent",
+        "baseline_sql": """
+WITH revenue AS (
+    SELECT date_trunc('month', l_shipdate) AS month_bucket,
+           l_shipmode AS shipmode,
+           sum(l_extendedprice * (1 - l_discount)) AS revenue
+    FROM lineitem
+    WHERE l_shipdate >= DATE '1998-01-01'
+    GROUP BY 1, 2
+),
+counts AS (
+    SELECT date_trunc('month', l_shipdate) AS month_bucket,
+           l_shipmode AS shipmode,
+           count(*) AS line_count
+    FROM lineitem
+    WHERE l_shipdate >= DATE '1998-01-01'
+    GROUP BY 1, 2
+)
+SELECT r.month_bucket, r.shipmode, r.revenue, c.line_count
+FROM revenue r
+JOIN counts c USING (month_bucket, shipmode)
+ORDER BY month_bucket, shipmode
+""".strip(),
+        "repetitions": 3,
+    },
+    {
+        "case_id": "hidden_customer_year_1997",
+        "query_id": "customer_year_1997",
+        "baseline_sql": """
+WITH rev AS (
+    SELECT year(o.o_orderdate) AS revenue_year,
+           c.c_custkey AS customer_key,
+           sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue
+    FROM customer c
+    JOIN orders o ON o.o_custkey = c.c_custkey
+    JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+    GROUP BY 1, 2
+),
+orders_seen AS (
+    SELECT year(o.o_orderdate) AS revenue_year,
+           c.c_custkey AS customer_key,
+           count(DISTINCT o.o_orderkey) AS order_count
+    FROM customer c
+    JOIN orders o ON o.o_custkey = c.c_custkey
+    JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+    GROUP BY 1, 2
+)
+SELECT rev.revenue_year, rev.customer_key, rev.revenue, orders_seen.order_count
+FROM rev
+JOIN orders_seen USING (revenue_year, customer_key)
+WHERE rev.revenue_year = 1997
+ORDER BY rev.revenue DESC, rev.customer_key
+LIMIT 60
+""".strip(),
+        "repetitions": 2,
+    },
+    {
+        "case_id": "hidden_segment_rollup",
+        "query_id": "segment_rollup",
+        "baseline_sql": """
+WITH revenue AS (
+    SELECT c.c_mktsegment AS segment,
+           sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue
+    FROM customer c
+    JOIN orders o ON o.o_custkey = c.c_custkey
+    JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+    WHERE o.o_orderdate >= DATE '1996-01-01'
+    GROUP BY 1
+),
+counts AS (
+    SELECT c.c_mktsegment AS segment,
+           count(DISTINCT o.o_orderkey) AS order_count
+    FROM customer c
+    JOIN orders o ON o.o_custkey = c.c_custkey
+    JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+    WHERE o.o_orderdate >= DATE '1996-01-01'
+    GROUP BY 1
+)
+SELECT r.segment, r.revenue, c.order_count
+FROM revenue r
+JOIN counts c USING (segment)
+ORDER BY r.segment
+""".strip(),
+        "repetitions": 2,
+    },
+    # NOTE(review): unlike the other cases, order_count below is count(*) over
+    # the orders-lineitem join, so it counts joined lineitem rows rather than
+    # distinct orders - confirm this is intended.
+    {
+        "case_id": "hidden_priority_rollup",
+        "query_id": "priority_rollup",
+        "baseline_sql": """
+WITH revenue AS (
+    SELECT o.o_orderpriority AS priority_bucket,
+           sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue
+    FROM orders o
+    JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+    WHERE o.o_orderdate >= DATE '1997-01-01'
+    GROUP BY 1
+),
+counts AS (
+    SELECT o.o_orderpriority AS priority_bucket,
+           count(*) AS order_count
+    FROM orders o
+    JOIN lineitem l ON l.l_orderkey = o.o_orderkey
+    WHERE o.o_orderdate >= DATE '1997-01-01'
+    GROUP BY 1
+)
+SELECT r.priority_bucket, r.revenue, c.order_count
+FROM revenue r
+JOIN counts c USING (priority_bucket)
+ORDER BY r.priority_bucket
+""".strip(),
+        "repetitions": 2,
+    },
+)
+
+# Default instance: a copy of the first public case and its baseline SQL text.
+WORKLOAD_MANIFEST = dict(PUBLIC_CASES[0])
+ORIGINAL_QUERY_SQL = WORKLOAD_MANIFEST["baseline_sql"]
+
+
+def load_instance():
+    """Return the default problem instance: the baseline SQL and a copy of its manifest."""
+    instance = {
+        "sql": ORIGINAL_QUERY_SQL,
+        "manifest": dict(WORKLOAD_MANIFEST),
+    }
+    return instance
+
+
+def evaluate_query(value, manifest: dict | None = None):
+    """Measure a candidate query against a case manifest.
+
+    ``value`` is either the SQL text or a mapping with an ``"sql"`` entry.
+    Without ``manifest`` the module default ``WORKLOAD_MANIFEST`` is used;
+    a supplied manifest is copied first.
+
+    Raises:
+        ValueError: If ``value`` is a dict without an ``"sql"`` key.
+    """
+    if manifest is None:
+        manifest = WORKLOAD_MANIFEST
+    else:
+        manifest = dict(manifest)
+
+    query = value
+    if isinstance(query, dict):
+        if "sql" not in query:
+            raise ValueError("missing sql")
+        query = query["sql"]
+    return measure_query_rewrite(str(query), manifest)
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/scripts/init.py b/benchmarks/ComputerSystems/DuckDBQueryRewrite/scripts/init.py
new file mode 100644
index 00000000..c0f51b39
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/scripts/init.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+
+def _is_repo_root(path: Path) -> bool:
+    """Tell whether ``path`` contains both a ``benchmarks`` and a ``frontier_eval`` directory."""
+    if not (path / "benchmarks").is_dir():
+        return False
+    return (path / "frontier_eval").is_dir()
+
+
+def _ensure_import_path() -> None:
+    """Put the repository root, or else the benchmark directory, first on ``sys.path``.
+
+    The search walks upwards from this file's directory. If no ancestor looks
+    like the repository root, the directory two levels above this file is used.
+    """
+    script = Path(__file__).resolve()
+    for candidate in (script.parent, *script.parents):
+        if _is_repo_root(candidate):
+            target = candidate
+            break
+    else:
+        target = script.parents[1]
+    entry = str(target)
+    if entry not in sys.path:
+        sys.path.insert(0, entry)
+
+
+_ensure_import_path()
+
+try:
+ from benchmarks.ComputerSystems.DuckDBQueryRewrite.baseline.solution import rewrite_query as _baseline_rewrite_query
+ from benchmarks.ComputerSystems.DuckDBQueryRewrite.runtime.problem import ORIGINAL_QUERY_SQL, WORKLOAD_MANIFEST, evaluate_query
+except ModuleNotFoundError:
+ from baseline.solution import rewrite_query as _baseline_rewrite_query
+ from runtime.problem import ORIGINAL_QUERY_SQL, WORKLOAD_MANIFEST, evaluate_query
+
+
+# EVOLVE-BLOCK-START
+def rewrite_query(sql, workload_manifest):
+    """Return the rewritten SQL for ``sql``; this starting point delegates to the baseline solution."""
+    return _baseline_rewrite_query(sql, workload_manifest)
+# EVOLVE-BLOCK-END
+
+
+# Script entry: rewrite the default query and print its measured metrics.
+if __name__ == "__main__":
+    print(evaluate_query(rewrite_query(ORIGINAL_QUERY_SQL, WORKLOAD_MANIFEST)))
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/verification/evaluator.py b/benchmarks/ComputerSystems/DuckDBQueryRewrite/verification/evaluator.py
new file mode 100644
index 00000000..15d5d2bd
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/verification/evaluator.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import runpy
+import traceback
+from pathlib import Path
+
+
+def _repo_root() -> Path:
+    """Find the closest ancestor directory holding ``benchmarks`` and ``frontier_eval``.
+
+    Falls back to the resolved current working directory when no ancestor of
+    this file matches.
+    """
+    start = Path(__file__).resolve()
+    for directory in (start.parent, *start.parents):
+        has_benchmarks = (directory / "benchmarks").is_dir()
+        if has_benchmarks and (directory / "frontier_eval").is_dir():
+            return directory
+    return Path.cwd().resolve()
+
+
+def _benchmark_root() -> Path:
+    """Return the directory two levels above this file."""
+    this_file = Path(__file__).resolve()
+    ancestors = this_file.parents
+    return ancestors[1]
+
+
+def _ensure_import_path() -> None:
+    """Insert the repository root and the benchmark root at the front of ``sys.path``.
+
+    Paths already present are left where they are. The benchmark root is
+    handled last, so it ends up ahead of the repository root when both are new.
+    """
+    import sys
+
+    for directory in (_repo_root(), _benchmark_root()):
+        entry = str(directory)
+        if entry in sys.path:
+            continue
+        sys.path.insert(0, entry)
+
+
+_ensure_import_path()
+
+try:
+ from benchmarks.ComputerSystems.DuckDBQueryRewrite.baseline.solution import rewrite_query as baseline_rewrite_query
+ from benchmarks.ComputerSystems.DuckDBQueryRewrite.runtime.problem import HIDDEN_CASES, PUBLIC_CASES, evaluate_query
+except ModuleNotFoundError:
+ from baseline.solution import rewrite_query as baseline_rewrite_query
+ from runtime.problem import HIDDEN_CASES, PUBLIC_CASES, evaluate_query
+
+
+def _run_case(rewrite_query, case):
+    """Rewrite one case's baseline SQL and measure the result.
+
+    The rewriter receives a copy of the case as its manifest; the measurement
+    uses the case itself.
+    """
+    manifest_copy = dict(case)
+    candidate_sql = rewrite_query(case["baseline_sql"], manifest_copy)
+    return evaluate_query(candidate_sql, case)
+
+
+def evaluate(program_path: str):
+    """Score a candidate program on the public and hidden query-rewrite cases.
+
+    The program is executed with ``runpy`` and must define
+    ``rewrite_query(sql, workload_manifest)``. The score is the negated average
+    candidate runtime over the hidden cases, so faster rewrites score higher.
+
+    Args:
+        program_path: Path to the candidate Python file.
+
+    Returns:
+        ``(metrics, artifacts)``. On any failure ``metrics`` keeps its invalid
+        defaults (``valid`` 0.0, ``combined_score`` -1e18) and ``artifacts``
+        carries an ``error_message``.
+    """
+    metrics = {
+        "combined_score": -1e18,
+        "valid": 0.0,
+        "public_avg_runtime_s": 0.0,
+        "hidden_avg_runtime_s": 0.0,
+        "baseline_hidden_avg_runtime_s": 0.0,
+        "num_public_cases": 0.0,
+        "num_hidden_cases": 0.0,
+    }
+    artifacts = {}
+    try:
+        namespace = runpy.run_path(str(Path(program_path).expanduser().resolve()), run_name="candidate_program")
+    except Exception:
+        # A candidate that cannot be loaded (missing file, syntax error, error
+        # at import time) is reported as invalid instead of crashing the run.
+        artifacts["error_message"] = traceback.format_exc()
+        return metrics, artifacts
+    rewrite_query = namespace.get("rewrite_query")
+    if not callable(rewrite_query):
+        artifacts["error_message"] = "candidate must define rewrite_query(sql, workload_manifest)"
+        return metrics, artifacts
+    try:
+        public_candidate = [_run_case(rewrite_query, case) for case in PUBLIC_CASES]
+        hidden_candidate = [_run_case(rewrite_query, case) for case in HIDDEN_CASES]
+        hidden_baseline = [_run_case(baseline_rewrite_query, case) for case in HIDDEN_CASES]
+    except Exception:
+        artifacts["error_message"] = traceback.format_exc()
+        return metrics, artifacts
+
+    # The baseline rewriter goes through the same measurement, so its runtime
+    # is also reported under the "candidate_runtime_s" key.
+    hidden_avg = sum(float(item["candidate_runtime_s"]) for item in hidden_candidate) / len(hidden_candidate)
+    baseline_hidden_avg = sum(float(item["candidate_runtime_s"]) for item in hidden_baseline) / len(hidden_baseline)
+    public_avg = sum(float(item["candidate_runtime_s"]) for item in public_candidate) / len(public_candidate)
+    if not math.isfinite(hidden_avg) or hidden_avg <= 0:
+        artifacts["error_message"] = "candidate runtime is invalid"
+        return metrics, artifacts
+
+    metrics["valid"] = 1.0
+    metrics["public_avg_runtime_s"] = public_avg
+    metrics["hidden_avg_runtime_s"] = hidden_avg
+    metrics["baseline_hidden_avg_runtime_s"] = baseline_hidden_avg
+    metrics["num_public_cases"] = float(len(PUBLIC_CASES))
+    metrics["num_hidden_cases"] = float(len(HIDDEN_CASES))
+    metrics["combined_score"] = -hidden_avg
+    return metrics, artifacts
+
+
+def main() -> None:
+    """Command-line entry point: evaluate one program and emit its metrics as JSON."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("program")
+    cli.add_argument("--metrics-out", default="metrics.json")
+    options = cli.parse_args()
+
+    metrics, artifacts = evaluate(options.program)
+    metrics_json = json.dumps(metrics, indent=2)
+    Path(options.metrics_out).write_text(metrics_json, encoding="utf-8")
+    if artifacts:
+        # Artifacts go to artifacts.json in the current working directory.
+        artifacts_json = json.dumps(artifacts, indent=2)
+        Path("artifacts.json").write_text(artifacts_json, encoding="utf-8")
+    print(metrics_json)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/ComputerSystems/DuckDBQueryRewrite/verification/requirements.txt b/benchmarks/ComputerSystems/DuckDBQueryRewrite/verification/requirements.txt
new file mode 100644
index 00000000..8a6ba6a1
--- /dev/null
+++ b/benchmarks/ComputerSystems/DuckDBQueryRewrite/verification/requirements.txt
@@ -0,0 +1 @@
+duckdb
diff --git a/benchmarks/ComputerSystems/duckdb_local_workload.py b/benchmarks/ComputerSystems/duckdb_local_workload.py
new file mode 100644
index 00000000..cb0da163
--- /dev/null
+++ b/benchmarks/ComputerSystems/duckdb_local_workload.py
@@ -0,0 +1,391 @@
+from __future__ import annotations
+
+import math
+import time
+from typing import Any
+
+import duckdb
+
+
+# Row counts of the synthetic tables generated by build_connection().
+CUSTOMER_COUNT = 20_000
+ORDER_COUNT = 120_000
+LINEITEM_COUNT = 600_000
+
+# Fixed pools of 300 lookup keys each; the index workload uses a prefix of them.
+CUSTOMER_KEYS = tuple(1 + ((i * 97) % CUSTOMER_COUNT) for i in range(1, 301))
+ORDER_KEYS = tuple(1 + ((i * 193) % ORDER_COUNT) for i in range(1, 301))
+
+
+# Index name -> CREATE INDEX statement; measure_index_design() executes the
+# statements of the selected names.
+INDEX_CANDIDATES = {
+    "idx_orders_cust": "CREATE INDEX idx_orders_cust ON orders(o_custkey)",
+    "idx_orders_date": "CREATE INDEX idx_orders_date ON orders(o_orderdate)",
+    "idx_lineitem_order": "CREATE INDEX idx_lineitem_order ON lineitem(l_orderkey)",
+    "idx_customer_segment": "CREATE INDEX idx_customer_segment ON customer(c_mktsegment)",
+    "idx_orders_priority": "CREATE INDEX idx_orders_priority ON orders(o_orderpriority)",
+}
+
+# Pre-aggregation table name -> CREATE TABLE AS statement. The first three
+# tables are read by the report helpers in this module when selected;
+# "agg_unused_priority_only" is not read by any of them.
+PREAGGREGATION_CANDIDATES = {
+    "agg_quarter_segment_revenue": (
+        "CREATE TABLE agg_quarter_segment_revenue AS "
+        "SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket, "
+        "       c.c_mktsegment AS segment, "
+        "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+        "FROM customer c "
+        "JOIN orders o ON o.o_custkey = c.c_custkey "
+        "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+        "GROUP BY 1, 2"
+    ),
+    "agg_month_shipmode_revenue": (
+        "CREATE TABLE agg_month_shipmode_revenue AS "
+        "SELECT date_trunc('month', l.l_shipdate) AS month_bucket, "
+        "       l.l_shipmode AS shipmode, "
+        "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+        "FROM lineitem l "
+        "GROUP BY 1, 2"
+    ),
+    "agg_customer_year_revenue": (
+        "CREATE TABLE agg_customer_year_revenue AS "
+        "SELECT year(o.o_orderdate) AS revenue_year, "
+        "       c.c_custkey, "
+        "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+        "FROM customer c "
+        "JOIN orders o ON o.o_custkey = c.c_custkey "
+        "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+        "GROUP BY 1, 2"
+    ),
+    "agg_unused_priority_only": (
+        "CREATE TABLE agg_unused_priority_only AS "
+        "SELECT o.o_orderpriority, count(*) AS order_count "
+        "FROM orders o "
+        "GROUP BY 1"
+    ),
+}
+
+
+def build_connection() -> duckdb.DuckDBPyConnection:
+    """Create an in-memory DuckDB database filled with the synthetic workload tables.
+
+    The connection is limited to one thread. ``customer``, ``orders`` and
+    ``lineitem`` are generated from ``range()`` with modular arithmetic, so
+    every call produces the same data. The caller owns the returned connection.
+    """
+    con = duckdb.connect(database=":memory:")
+    con.execute("PRAGMA threads=1")
+    # customer: one row per key; market segment cycles through five values.
+    con.execute(
+        f"""
+        CREATE TABLE customer AS
+        SELECT i AS c_custkey,
+               'Customer #' || i AS c_name,
+               CASE i % 5
+                   WHEN 0 THEN 'BUILDING'
+                   WHEN 1 THEN 'AUTOMOBILE'
+                   WHEN 2 THEN 'HOUSEHOLD'
+                   WHEN 3 THEN 'FURNITURE'
+                   ELSE 'MACHINERY'
+               END AS c_mktsegment,
+               i % 25 AS c_nationkey
+        FROM range(1, {CUSTOMER_COUNT + 1}) t(i)
+        """
+    )
+    # orders: order dates fall within 1460 days starting at 1995-01-01.
+    con.execute(
+        f"""
+        CREATE TABLE orders AS
+        SELECT i AS o_orderkey,
+               1 + ((i * 17) % {CUSTOMER_COUNT}) AS o_custkey,
+               DATE '1995-01-01' + (((i * 13) % 1460) * INTERVAL 1 DAY) AS o_orderdate,
+               100 + (((i * 37) % 100000) / 10.0) AS o_totalprice,
+               CASE i % 5
+                   WHEN 0 THEN '1-URGENT'
+                   WHEN 1 THEN '2-HIGH'
+                   WHEN 2 THEN '3-MEDIUM'
+                   WHEN 3 THEN '4-NOT SPECIFIED'
+                   ELSE '5-LOW'
+               END AS o_orderpriority
+        FROM range(1, {ORDER_COUNT + 1}) t(i)
+        """
+    )
+    # lineitem: ship dates use the same 1460-day window; discounts are 0.00-0.09.
+    con.execute(
+        f"""
+        CREATE TABLE lineitem AS
+        SELECT i AS l_lineitemkey,
+               1 + ((i * 7) % {ORDER_COUNT}) AS l_orderkey,
+               1 + ((i * 11) % 50000) AS l_partkey,
+               1 + ((i * 13) % 10000) AS l_suppkey,
+               1 + ((i * 5) % 50) AS l_quantity,
+               10 + (((i * 19) % 100000) / 20.0) AS l_extendedprice,
+               (((i * 3) % 10) / 100.0) AS l_discount,
+               DATE '1995-01-01' + (((i * 29) % 1460) * INTERVAL 1 DAY) AS l_shipdate,
+               CASE i % 5
+                   WHEN 0 THEN 'AIR'
+                   WHEN 1 THEN 'MAIL'
+                   WHEN 2 THEN 'RAIL'
+                   WHEN 3 THEN 'TRUCK'
+                   ELSE 'SHIP'
+               END AS l_shipmode
+        FROM range(1, {LINEITEM_COUNT + 1}) t(i)
+        """
+    )
+    return con
+
+
+def normalize_name_list(value: Any, key: str) -> list[str]:
+    """Turn a candidate answer into a de-duplicated list of names.
+
+    ``value`` may be the list/tuple itself or a dict holding it under ``key``.
+    Items are converted with ``str`` and repeated names are dropped, keeping
+    the first occurrence.
+
+    Raises:
+        ValueError: If ``key`` is missing from a dict ``value`` or the names
+            are not given as a list or tuple.
+    """
+    names = value
+    if isinstance(names, dict):
+        if key not in names:
+            raise ValueError(f"missing {key}")
+        names = names[key]
+    if not isinstance(names, (list, tuple)):
+        raise ValueError(f"{key} must be a list or tuple")
+    # dict.fromkeys keeps the first occurrence of every name, in order.
+    return list(dict.fromkeys(str(item) for item in names))
+
+
+def compare_results(lhs: list[tuple[Any, ...]], rhs: list[tuple[Any, ...]], tol: float = 1e-6) -> bool:
+    """Compare two query results row by row, in order.
+
+    When either value of a pair is a float, both are converted to float and
+    must be finite and within ``tol`` of each other (absolute difference).
+    All other pairs must be equal.
+
+    Args:
+        lhs: Rows of the first result.
+        rhs: Rows of the second result.
+        tol: Maximum absolute difference allowed between float values.
+
+    Returns:
+        True when both results have the same shape and every pair matches.
+    """
+    if len(lhs) != len(rhs):
+        return False
+    for left_row, right_row in zip(lhs, rhs):
+        if len(left_row) != len(right_row):
+            return False
+        for left_value, right_value in zip(left_row, right_row):
+            if isinstance(left_value, float) or isinstance(right_value, float):
+                try:
+                    left_number = float(left_value)
+                    right_number = float(right_value)
+                except (TypeError, ValueError, OverflowError):
+                    # A float paired with NULL or a non-numeric value is a
+                    # mismatch, not an error.
+                    return False
+                if not math.isfinite(left_number) or not math.isfinite(right_number):
+                    return False
+                if abs(left_number - right_number) > tol:
+                    return False
+            elif left_value != right_value:
+                return False
+    return True
+
+
+def _index_keys(sample_size: int, source: tuple[int, ...]) -> tuple[int, ...]:
+    """Return the first ``sample_size`` keys of ``source``.
+
+    The size is clamped to at most ``len(source)`` and at least 1.
+    """
+    limit = min(len(source), int(sample_size))
+    if limit < 1:
+        limit = 1
+    return tuple(source[:limit])
+
+
+def run_index_workload(con: duckdb.DuckDBPyConnection, manifest: dict[str, Any]) -> float:
+    """Run the point-lookup workload once and return the elapsed seconds.
+
+    Three query families run back to back: order totals per customer since
+    ``min_order_date``, revenue per order key, and counts of a customer's
+    orders with priority ``priority_value``. Sample sizes and filter values
+    come from ``manifest`` with defaults.
+    """
+    # The clock starts before the parameters are read, so that small overhead
+    # is part of the measured time.
+    started = time.perf_counter()
+    customers = _index_keys(manifest.get("customer_sample", 80), CUSTOMER_KEYS)
+    orders = _index_keys(manifest.get("order_sample", 80), ORDER_KEYS)
+    urgent_customers = _index_keys(manifest.get("urgent_customer_sample", 40), CUSTOMER_KEYS)
+    cutoff = str(manifest.get("min_order_date", "1997-01-01"))
+    priority = str(manifest.get("priority_value", "1-URGENT"))
+
+    customer_total_sql = (
+        "SELECT sum(o_totalprice) "
+        "FROM orders "
+        "WHERE o_custkey = ? AND o_orderdate >= CAST(? AS DATE)"
+    )
+    for key in customers:
+        con.execute(customer_total_sql, [key, cutoff]).fetchone()
+
+    order_revenue_sql = (
+        "SELECT sum(l_extendedprice * (1 - l_discount)) "
+        "FROM lineitem "
+        "WHERE l_orderkey = ?"
+    )
+    for key in orders:
+        con.execute(order_revenue_sql, [key]).fetchone()
+
+    priority_count_sql = (
+        "SELECT count(*) "
+        "FROM customer c "
+        "JOIN orders o ON c.c_custkey = o.o_custkey "
+        "WHERE c.c_custkey = ? AND o.o_orderpriority = ?"
+    )
+    for key in urgent_customers:
+        con.execute(priority_count_sql, [key, priority]).fetchone()
+
+    return time.perf_counter() - started
+
+
+def measure_index_design(selected_indexes: list[str], manifest: dict[str, Any]) -> dict[str, float | int]:
+    """Build the selected indexes on a fresh database and time the lookup workload.
+
+    Args:
+        selected_indexes: Names of the indexes to create, in creation order.
+        manifest: Workload settings. ``candidate_indexes`` (default: every key
+            of ``INDEX_CANDIDATES``) is the whitelist, ``max_indexes`` (default:
+            the whitelist size) caps the selection, and ``repetitions``
+            (default 3) is the number of timed workload runs.
+
+    Returns:
+        Setup time, summed workload time over the timed runs, their total, and
+        the number of selected indexes.
+
+    Raises:
+        ValueError: If a name is not whitelisted or has no known definition, or
+            if more than ``max_indexes`` names are selected.
+    """
+    allowed = tuple(manifest.get("candidate_indexes", tuple(sorted(INDEX_CANDIDATES))))
+    max_indexes = int(manifest.get("max_indexes", len(allowed)))
+    # A name must be whitelisted by the manifest AND have a CREATE INDEX
+    # statement; otherwise the lookup further down would raise KeyError.
+    unknown = [
+        name
+        for name in selected_indexes
+        if name not in allowed or name not in INDEX_CANDIDATES
+    ]
+    if unknown:
+        raise ValueError(f"unknown index names: {unknown}")
+    if len(selected_indexes) > max_indexes:
+        raise ValueError(f"too many indexes selected: {len(selected_indexes)} > {max_indexes}")
+
+    con = build_connection()
+    try:
+        start_setup = time.perf_counter()
+        for name in selected_indexes:
+            con.execute(INDEX_CANDIDATES[name])
+        setup_runtime = time.perf_counter() - start_setup
+
+        workload_runtime = 0.0
+        repetitions = int(manifest.get("repetitions", 3))
+        # One untimed warm-up run before the timed repetitions.
+        run_index_workload(con, manifest)
+        for _ in range(repetitions):
+            workload_runtime += run_index_workload(con, manifest)
+    finally:
+        # Release the in-memory database even when a statement fails.
+        con.close()
+    return {
+        "setup_runtime_s": float(setup_runtime),
+        "workload_runtime_s": float(workload_runtime),
+        "total_runtime_s": float(setup_runtime + workload_runtime),
+        "selected_index_count": len(selected_indexes),
+    }
+
+
+def _report_quarter_segment(con: duckdb.DuckDBPyConnection, use_aggregate: bool, segment_filter: tuple[str, ...]) -> list[tuple[Any, ...]]:
+    """Return revenue per (order quarter, market segment) for the given segments.
+
+    Args:
+        con: Open DuckDB connection holding the workload tables.
+        use_aggregate: When true, read ``agg_quarter_segment_revenue`` instead
+            of joining ``customer``, ``orders`` and ``lineitem``.
+        segment_filter: Market-segment values to keep.
+
+    Returns:
+        Rows of ``(quarter_bucket, segment, revenue)`` ordered by quarter and
+        segment; an empty list when ``segment_filter`` is empty.
+    """
+    segments = [str(value) for value in segment_filter]
+    if not segments:
+        # Nothing can match an empty filter; skip the query rather than emit
+        # an empty ``IN ()`` list.
+        return []
+    # Bind the values as parameters, like the sibling report helpers do, so a
+    # quote inside a segment name cannot break or alter the statement.
+    placeholders = ", ".join("?" for _ in segments)
+    if use_aggregate:
+        return con.execute(
+            "SELECT quarter_bucket, segment, revenue "
+            "FROM agg_quarter_segment_revenue "
+            f"WHERE segment IN ({placeholders}) "
+            "ORDER BY quarter_bucket, segment",
+            segments,
+        ).fetchall()
+    return con.execute(
+        "SELECT date_trunc('quarter', o.o_orderdate) AS quarter_bucket, "
+        "       c.c_mktsegment AS segment, "
+        "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+        "FROM customer c "
+        "JOIN orders o ON o.o_custkey = c.c_custkey "
+        "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+        f"WHERE c.c_mktsegment IN ({placeholders}) "
+        "GROUP BY 1, 2 "
+        "ORDER BY quarter_bucket, segment",
+        segments,
+    ).fetchall()
+
+
+def _report_month_shipmode(con: duckdb.DuckDBPyConnection, use_aggregate: bool, min_shipdate: str) -> list[tuple[Any, ...]]:
+    """Return revenue per (ship month, ship mode) from ``min_shipdate`` onwards.
+
+    With ``use_aggregate`` the rows are read from ``agg_month_shipmode_revenue``
+    and filtered on ``month_bucket``; otherwise they are aggregated from
+    ``lineitem`` and filtered on ``l_shipdate``.
+
+    NOTE(review): the two paths filter on different columns, so they presumably
+    only agree when ``min_shipdate`` is the first day of a month - confirm.
+    """
+    if use_aggregate:
+        statement = (
+            "SELECT month_bucket, shipmode, revenue "
+            "FROM agg_month_shipmode_revenue "
+            "WHERE month_bucket >= CAST(? AS DATE) "
+            "ORDER BY month_bucket, shipmode"
+        )
+    else:
+        statement = (
+            "SELECT date_trunc('month', l.l_shipdate) AS month_bucket, "
+            "       l.l_shipmode AS shipmode, "
+            "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+            "FROM lineitem l "
+            "WHERE l.l_shipdate >= CAST(? AS DATE) "
+            "GROUP BY 1, 2 "
+            "ORDER BY month_bucket, shipmode"
+        )
+    cursor = con.execute(statement, [min_shipdate])
+    return cursor.fetchall()
+
+
+def _report_customer_year(con: duckdb.DuckDBPyConnection, use_aggregate: bool, revenue_year: int, limit_rows: int) -> list[tuple[Any, ...]]:
+    """Return the top customers by revenue for one order year.
+
+    Rows are ``(revenue_year, c_custkey, revenue)`` sorted by revenue
+    (descending) and then customer key, truncated to ``limit_rows``. With
+    ``use_aggregate`` they are read from ``agg_customer_year_revenue``;
+    otherwise they are aggregated from ``customer``, ``orders`` and ``lineitem``.
+    """
+    bind_values = [revenue_year, limit_rows]
+    if use_aggregate:
+        statement = (
+            "SELECT revenue_year, c_custkey, revenue "
+            "FROM agg_customer_year_revenue "
+            "WHERE revenue_year = ? "
+            "ORDER BY revenue DESC, c_custkey "
+            "LIMIT ?"
+        )
+    else:
+        # The year filter is applied in HAVING, i.e. on the grouped output.
+        statement = (
+            "SELECT year(o.o_orderdate) AS revenue_year, "
+            "       c.c_custkey, "
+            "       sum(l.l_extendedprice * (1 - l.l_discount)) AS revenue "
+            "FROM customer c "
+            "JOIN orders o ON o.o_custkey = c.c_custkey "
+            "JOIN lineitem l ON l.l_orderkey = o.o_orderkey "
+            "GROUP BY 1, 2 "
+            "HAVING year(o.o_orderdate) = ? "
+            "ORDER BY revenue DESC, c.c_custkey "
+            "LIMIT ?"
+        )
+    return con.execute(statement, bind_values).fetchall()
+
+
+def _run_preaggregation_reports(
+    con: duckdb.DuckDBPyConnection,
+    selected: set[str],
+    manifest: dict[str, Any],
+) -> tuple[float, tuple[list[tuple[Any, ...]], ...]]:
+    """Run the three revenue reports once and time them as one batch.
+
+    A report reads its pre-aggregated table only when that table's name is in
+    ``selected``. Report parameters come from ``manifest`` with defaults:
+    ``segment_filter``, ``min_shipdate``, ``revenue_year`` and ``limit_rows``.
+
+    Returns:
+        ``(elapsed_seconds, (quarter_rows, shipmode_rows, customer_rows))``.
+    """
+    # The clock starts before the parameters are read, so that small overhead
+    # is part of the measured time.
+    started = time.perf_counter()
+
+    segments = tuple(manifest.get("segment_filter", ("BUILDING", "AUTOMOBILE", "HOUSEHOLD")))
+    quarter_rows = _report_quarter_segment(con, "agg_quarter_segment_revenue" in selected, segments)
+
+    cutoff = str(manifest.get("min_shipdate", "1997-01-01"))
+    shipmode_rows = _report_month_shipmode(con, "agg_month_shipmode_revenue" in selected, cutoff)
+
+    year = int(manifest.get("revenue_year", 1998))
+    row_limit = int(manifest.get("limit_rows", 100))
+    customer_rows = _report_customer_year(con, "agg_customer_year_revenue" in selected, year, row_limit)
+
+    elapsed = time.perf_counter() - started
+    return elapsed, (quarter_rows, shipmode_rows, customer_rows)
+
+
+def measure_preaggregation_design(selected_preaggregations: list[str], manifest: dict[str, Any]) -> dict[str, float | int]:
+    """Materialize the selected pre-aggregations and time the report workload.
+
+    Two fresh databases are built: a baseline without pre-aggregations and a
+    candidate holding the selected ones. The candidate's reports must match the
+    baseline's; then both get one untimed warm-up run followed by
+    ``repetitions`` timed runs.
+
+    Args:
+        selected_preaggregations: Names of the tables to create, in order.
+        manifest: Workload settings. ``candidate_preaggregations`` (default:
+            every key of ``PREAGGREGATION_CANDIDATES``) is the whitelist,
+            ``max_preaggregations`` (default: the whitelist size) caps the
+            selection, and ``repetitions`` defaults to 3.
+
+    Returns:
+        Setup time, summed candidate and baseline report times, the candidate
+        total (setup plus reports), and the number of selected tables.
+
+    Raises:
+        ValueError: If a name is not whitelisted or has no known definition, if
+            too many names are selected, or if the candidate reports differ
+            from the baseline reports.
+    """
+    allowed = tuple(manifest.get("candidate_preaggregations", tuple(sorted(PREAGGREGATION_CANDIDATES))))
+    max_preaggregations = int(manifest.get("max_preaggregations", len(allowed)))
+    # A name must be whitelisted by the manifest AND have a CREATE TABLE
+    # statement; otherwise the lookup further down would raise KeyError.
+    unknown = [
+        name
+        for name in selected_preaggregations
+        if name not in allowed or name not in PREAGGREGATION_CANDIDATES
+    ]
+    if unknown:
+        raise ValueError(f"unknown pre-aggregation names: {unknown}")
+    if len(selected_preaggregations) > max_preaggregations:
+        raise ValueError(
+            f"too many pre-aggregations selected: {len(selected_preaggregations)} > {max_preaggregations}"
+        )
+
+    selected = set(selected_preaggregations)
+    baseline_con = None
+    candidate_con = None
+    try:
+        baseline_con = build_connection()
+        candidate_con = build_connection()
+        start_setup = time.perf_counter()
+        for name in selected_preaggregations:
+            candidate_con.execute(PREAGGREGATION_CANDIDATES[name])
+        setup_runtime = time.perf_counter() - start_setup
+
+        _, baseline_results = _run_preaggregation_reports(baseline_con, set(), manifest)
+        _, candidate_results = _run_preaggregation_reports(candidate_con, selected, manifest)
+        if any(not compare_results(left, right) for left, right in zip(candidate_results, baseline_results)):
+            raise ValueError("candidate pre-aggregation selection changed the query results")
+
+        repetitions = int(manifest.get("repetitions", 3))
+        repeated_baseline_runtime = 0.0
+        repeated_candidate_runtime = 0.0
+        # One untimed warm-up run per connection before the timed repetitions.
+        _run_preaggregation_reports(baseline_con, set(), manifest)
+        _run_preaggregation_reports(candidate_con, selected, manifest)
+        for _ in range(repetitions):
+            extra_runtime, _ = _run_preaggregation_reports(baseline_con, set(), manifest)
+            repeated_baseline_runtime += extra_runtime
+            extra_runtime, _ = _run_preaggregation_reports(candidate_con, selected, manifest)
+            repeated_candidate_runtime += extra_runtime
+    finally:
+        # Release both in-memory databases even when validation fails.
+        for con in (candidate_con, baseline_con):
+            if con is not None:
+                con.close()
+
+    return {
+        "setup_runtime_s": float(setup_runtime),
+        "candidate_workload_runtime_s": float(repeated_candidate_runtime),
+        "candidate_total_runtime_s": float(setup_runtime + repeated_candidate_runtime),
+        "baseline_total_runtime_s": float(repeated_baseline_runtime),
+        "selected_preaggregation_count": len(selected_preaggregations),
+    }
+
+
+def measure_query_rewrite(sql: str, manifest: dict[str, Any]) -> dict[str, Any]:
+    """Check a rewritten query against the baseline query and time both.
+
+    Each query runs on its own freshly built database. The candidate must
+    return the same rows as the baseline according to ``compare_results``;
+    then each query gets one untimed warm-up run followed by ``repetitions``
+    timed runs.
+
+    Args:
+        sql: Candidate query text.
+        manifest: Must contain ``baseline_sql``; ``repetitions`` defaults to 3.
+
+    Returns:
+        ``baseline_runtime_s`` and ``candidate_runtime_s`` (wall-clock time of
+        all timed runs) and ``row_count`` of the last candidate result.
+
+    Raises:
+        ValueError: If ``sql`` is blank or the candidate rows differ from the
+            baseline rows.
+        KeyError: If ``manifest`` has no ``baseline_sql`` entry.
+    """
+    sql = str(sql).strip()
+    if not sql:
+        raise ValueError("query must not be empty")
+    baseline_sql = str(manifest["baseline_sql"]).strip()
+    repetitions = int(manifest.get("repetitions", 3))
+
+    baseline_con = None
+    candidate_con = None
+    try:
+        baseline_con = build_connection()
+        candidate_con = build_connection()
+        baseline_rows = baseline_con.execute(baseline_sql).fetchall()
+        candidate_rows = candidate_con.execute(sql).fetchall()
+        if not compare_results(candidate_rows, baseline_rows):
+            raise ValueError("candidate query result does not match the baseline result")
+
+        # Untimed warm-up run, then the timed repetitions.
+        baseline_con.execute(baseline_sql).fetchall()
+        baseline_start = time.perf_counter()
+        for _ in range(repetitions):
+            baseline_con.execute(baseline_sql).fetchall()
+        baseline_runtime = time.perf_counter() - baseline_start
+
+        candidate_con.execute(sql).fetchall()
+        candidate_start = time.perf_counter()
+        for _ in range(repetitions):
+            candidate_rows = candidate_con.execute(sql).fetchall()
+        candidate_runtime = time.perf_counter() - candidate_start
+    finally:
+        # Release both in-memory databases even when the query fails or the
+        # results do not match.
+        for con in (candidate_con, baseline_con):
+            if con is not None:
+                con.close()
+
+    return {
+        "baseline_runtime_s": float(baseline_runtime),
+        "candidate_runtime_s": float(candidate_runtime),
+        "row_count": len(candidate_rows),
+    }
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/README.md b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/README.md
new file mode 100644
index 00000000..ed13cdec
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/README.md
@@ -0,0 +1,26 @@
+# LightweightBroadbandAbsorber
+
+Lightweight broadband CNTs@Nd-BaM/PE microwave absorber optimization (8.2–18 GHz).
+
+## Key Features
+- 4 material components with competing weight/performance trade-offs
+- Minimum EAB hard constraint (>= 4.0 GHz)
+- Density penalty is the dominant penalty term (weight 0.5)
+
+## Quick Start
+```bash
+pip install -r verification/requirements.txt
+python verification/evaluator.py scripts/init.py
+python verification/evaluator.py baseline/solution.py
+```
+
+## Unified Run
+
+```bash
+bash scripts/run_v2_unified.sh MaterialEngineering/LightweightBroadbandAbsorber \
+ algorithm=openevolve \
+ algorithm.iterations=0
+```
+
+## Reference
+Wang et al., *Materials* 2024, 17, 3433.
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/README_zh-CN.md b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/README_zh-CN.md
new file mode 100644
index 00000000..25e58adb
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/README_zh-CN.md
@@ -0,0 +1,29 @@
+# LightweightBroadbandAbsorber
+
+[English](./README.md) | 简体中文
+
+## 概览
+
+该任务针对 8.2-18 GHz 频段的轻量宽带吸波体设计,要求在带宽、反射损耗、厚度、密度和成本之间取得平衡。
+
+## 关键特征
+
+- 4 种材料组分,存在性能与重量之间的竞争关系
+- 存在最小 `EAB` 硬约束(`>= 4.0 GHz`)
+- 密度惩罚是主导惩罚项
+
+## 快速开始
+
+```bash
+pip install -r verification/requirements.txt
+python verification/evaluator.py scripts/init.py
+python verification/evaluator.py baseline/solution.py
+```
+
+## Unified 运行
+
+```bash
+bash scripts/run_v2_unified.sh MaterialEngineering/LightweightBroadbandAbsorber \
+ algorithm=openevolve \
+ algorithm.iterations=0
+```
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/Task.md b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/Task.md
new file mode 100644
index 00000000..55ae3b07
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/Task.md
@@ -0,0 +1,110 @@
+# LightweightBroadbandAbsorber — Task Specification
+
+## 1. Background
+
+Lightweight broadband microwave absorbers are essential in aerospace, unmanned aerial vehicles, and portable electronic systems where both electromagnetic stealth and weight reduction are critical. This benchmark is based on the CNTs@Nd₀.₁₅-BaM/PE composite system (Wang et al., *Materials* 2024, 17, 3433), where the best experimental result achieved RL_min = −58.01 dB with EAB = 4.26 GHz at 1.9 mm thickness.
+
+The task targets the **8.2–18 GHz range** and introduces a **minimum bandwidth hard constraint** and a **heavily penalized density** to push optimizers toward lightweight solutions.
+
+## 2. Design Variables
+
+The optimizer controls five variables across **four material components**:
+
+| Variable | Symbol | Unit | Range | Description |
+|----------|--------|------|-------|-------------|
+| Thickness | `d_mm` | mm | [1.0, 5.0] | Absorber layer thickness |
+| Magnetic absorber fraction | `phi_magnetic_absorber` | — | [0, 1] | Nd₀.₁₅-BaM (density 5.1 g/cm³) |
+| Conductive filler fraction | `phi_conductive_filler` | — | [0, 1] | CNTs at 8wt% (density 1.7 g/cm³) |
+| Lightweight magnetic fraction | `phi_lightweight_magnetic` | — | [0, 1] | Hollow Nd-BaM (density 2.8 g/cm³) |
+| Matrix fraction | `phi_matrix` | — | [0, 1] | PE matrix (density 0.95 g/cm³) |
+
+**Constraint**: All volume fractions must sum to 1.0 (tolerance: 1e-6).
+
+## 3. Evaluation
+
+### 3.1 Material Property Estimation
+
+Effective properties computed using **linear volume-fraction mixing**:
+
+$$\varepsilon_{r,eff} = \sum_i \phi_i \cdot \varepsilon_{r,i}, \quad \mu_{r,eff} = \sum_i \phi_i \cdot \mu_{r,i}$$
+
+> **Simplifications**: Frequency-independent constant parameters; linear mixing rule. See `material_db.json` for details. Convention: $\varepsilon_r = \varepsilon' - j\varepsilon''$ (negative imaginary part).
+
+### 3.2 Physical Model
+
+Standard transmission line theory with PEC backing:
+
+$$Z_{in} = Z_0 \sqrt{\frac{\mu_r}{\varepsilon_r}} \tanh\left(j \frac{2\pi f d}{c} \sqrt{\mu_r \varepsilon_r}\right)$$
+
+$$RL(f) = 20 \log_{10} \left| \frac{Z_{in} - Z_0}{Z_{in} + Z_0} \right|$$
+
+### 3.3 Metrics
+
+- **Frequency range**: 8.2–18.0 GHz (197 evenly spaced points, 0.05 GHz step)
+- **$RL_{min}$**: minimum reflection loss
+- **$EAB_{10}$**: maximum continuous bandwidth where $RL \leq -10\;\text{dB}$
+
+### 3.4 Hard Constraint
+
+**$EAB_{10} < 4.0$ GHz → infeasible** (`combined_score = 0`).
+
+### 3.5 Scoring
+
+All metrics min-max normalized to [0, 1]:
+
+| Metric | Range | Unit |
+|--------|-------|------|
+| $EAB_{10}$ | [0, 9.8] | GHz |
+| $\lvert RL_{min} \rvert$ | [0, 60] | dB |
+| $d$ | [1.0, 5.0] | mm |
+| $\rho$ | [0.9, 5.5] | g/cm³ |
+| cost | [1.0, 4.0] | — |
+
+$$\text{combined\_score} = 1.0 \cdot \hat{EAB} + 0.15 \cdot |\widehat{RL}_{min}| - 0.4 \cdot \hat{d} - 0.5 \cdot \hat{\rho} - 0.05 \cdot \widehat{cost}$$
+
+> **Important**: Final results determined solely by `verification/evaluator.py`.
+
+## 4. Input / Output
+
+### 4.1 Input
+- `references/material_db.json`: material database (fixed)
+- `references/problem_config.json`: configuration (fixed)
+
+### 4.2 Output
+`temp/submission.json`:
+```json
+{
+ "benchmark_id": "lightweight_broadband_absorber_8_18ghz",
+ "d_mm": 1.9,
+ "phi_magnetic_absorber": 0.25,
+ "phi_conductive_filler": 0.10,
+ "phi_lightweight_magnetic": 0.05,
+ "phi_matrix": 0.60
+}
+```
+
+## 5. Feasibility Rules
+
+Infeasible if:
+1. `submission.json` missing or unparseable.
+2. Any required key absent.
+3. `benchmark_id` mismatch.
+4. `d_mm` outside [1.0, 5.0] or non-finite.
+5. Any volume fraction outside [0, 1] or non-finite.
+6. Volume fractions do not sum to 1.0 (tolerance: 1e-6).
+7. **$EAB_{10} < 4.0\;\text{GHz}$**.
+8. Timeout (120s) or non-zero exit code.
+
+## 6. How to Run
+
+```bash
+python verification/evaluator.py scripts/init.py
+python verification/evaluator.py baseline/solution.py
+bash scripts/run_v2_unified.sh MaterialEngineering/LightweightBroadbandAbsorber \
+ algorithm=openevolve \
+ algorithm.iterations=0
+```
+
+## 7. References
+
+- Wang, Y.; et al. "Preparation and microwave absorption properties of CNTs@Nd-BaM/PE composites." *Materials* 2024, 17, 3433.
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/Task_zh-CN.md b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/Task_zh-CN.md
new file mode 100644
index 00000000..d79f0187
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/Task_zh-CN.md
@@ -0,0 +1,70 @@
+# LightweightBroadbandAbsorber — 任务说明
+
+## 1. 背景
+
+轻量宽带微波吸收材料对于航空航天、无人机和便携电子系统都很重要,因为这些场景同时要求电磁隐身与减重。本 benchmark 基于 CNTs@Nd-BaM/PE 复合体系,重点引入了宽带硬约束和更强的密度惩罚。
+
+## 2. 设计变量
+
+优化器控制 5 个变量,涉及 4 种材料组分:
+
+- `d_mm`:厚度,范围 `[1.0, 5.0]`
+- `phi_magnetic_absorber`:磁性吸收剂体积分数
+- `phi_conductive_filler`:导电填料体积分数
+- `phi_lightweight_magnetic`:轻量磁性组分体积分数
+- `phi_matrix`:基体体积分数
+
+约束:
+
+- 所有体积分数和为 `1.0`
+- 容差 `1e-6`
+
+## 3. 评估方式
+
+评测器使用线性体积分数混合规则计算等效电磁参数,并通过 PEC 背板传输线理论计算反射损耗曲线。
+
+主要指标:
+
+- `RL_min`
+- `EAB_10`
+
+硬约束:
+
+- 若 `EAB_10 < 4.0 GHz`,则判为 infeasible,`combined_score = 0`
+
+最终分数综合考虑:
+
+- 带宽奖励
+- 吸收深度奖励
+- 厚度惩罚
+- 密度惩罚
+- 成本惩罚
+
+实际以 `verification/evaluator.py` 为准。
+
+## 4. 输出格式
+
+候选程序必须写出 `temp/submission.json`,包含:
+
+```json
+{
+ "benchmark_id": "lightweight_broadband_absorber_8_18ghz",
+ "d_mm": 1.9,
+ "phi_magnetic_absorber": 0.25,
+ "phi_conductive_filler": 0.10,
+ "phi_lightweight_magnetic": 0.05,
+ "phi_matrix": 0.60
+}
+```
+
+## 5. 判无效条件
+
+以下情况会被判无效:
+
+- 输出缺失或格式错误
+- 必需字段缺失
+- `benchmark_id` 不匹配
+- 任意数值非有限或超出范围
+- 体积分数之和不满足约束
+- `EAB_10 < 4.0 GHz`
+- 候选程序超时或非零退出
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/baseline/result_log.txt b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/baseline/result_log.txt
new file mode 100644
index 00000000..6b77c125
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/baseline/result_log.txt
@@ -0,0 +1,24 @@
+Baseline Execution Log
+======================
+Command: python verification/evaluator.py baseline/solution.py
+Date: 2026-03-22
+Method: Random search (3000 samples, seed=42)
+Material system: CNTs@Nd0.15-BaM/PE composites
+
+Evaluation Result:
+{
+ "valid": 1,
+ "feasible": 1,
+ "combined_score": 0.4422,
+ "rl_min_db": -46.72,
+ "eab10_ghz": 5.3,
+ "thickness_mm": 2.0008,
+ "density": 1.866,
+ "cost_proxy": 1.619,
+ "runtime_sec": 1.7
+}
+
+Notes:
+- EAB = 5.3 GHz (meets >= 4.0 GHz hard constraint).
+- Density = 1.87 g/cm3 (lightweight designs rewarded, density penalty weight = 0.5).
+- Based on Wang et al., Materials 2024, 17, 3433.
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/baseline/solution.py b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/baseline/solution.py
new file mode 100644
index 00000000..1a80f002
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/baseline/solution.py
@@ -0,0 +1,92 @@
+"""
+Baseline for LightweightBroadbandAbsorber. Random search, 3000 samples.
+"""
+import json, random
+from pathlib import Path
+import numpy as np
+
+Z0, C0 = 377.0, 2.998e8
+
+def norm(v, lo, hi):
+    """Min-max normalize v into [0, 1] with clamping; a degenerate range (hi <= lo) yields 0.0."""
+    return 0.0 if hi <= lo else max(0.0, min(1.0, (v - lo) / (hi - lo)))
+
+def main():
+    task_dir = Path(__file__).resolve().parents[1]
+    temp_dir = task_dir / "temp"
+    temp_dir.mkdir(exist_ok=True)
+    # Load the fixed problem configuration and the per-component material properties.
+    cfg = json.loads((task_dir / "references" / "problem_config.json").read_text())
+    mdb = json.loads((task_dir / "references" / "material_db.json").read_text())
+    freqs = np.linspace(cfg["freq_ghz_min"]*1e9, cfg["freq_ghz_max"]*1e9, cfg["num_freq_points"])
+    w, n = cfg["weights"], cfg["normalization"]
+    mat = mdb["matrix"]
+    ma = mdb["magnetic_absorber"]
+    cf = mdb["conductive_filler"]
+    lm = mdb["lightweight_magnetic"]
+    min_eab = cfg.get("min_eab_ghz", 0.0)
+    # Seeded random search: the order of the random draws below determines the result.
+    best_score, best_sub = -1e18, None
+    random.seed(42)
+
+    for _ in range(3000):
+        p_ma = random.uniform(0.0, 0.4)
+        p_cf = random.uniform(0.05, 0.5)
+        p_lm = random.uniform(0.0, 0.3)
+        if p_ma + p_cf + p_lm > 0.95: continue
+        p_x = 1.0 - p_ma - p_cf - p_lm
+        d_mm = random.uniform(cfg["d_mm_min"], cfg["d_mm_max"])
+        # Linear volume-fraction mixing; the imaginary (loss) parts enter with a negative sign.
+        comps = [(p_x, mat), (p_ma, ma), (p_cf, cf), (p_lm, lm)]
+        er = complex(sum(p*c["eps_real"] for p,c in comps), -sum(p*c["eps_imag"] for p,c in comps))
+        mr = complex(sum(p*c["mu_real"] for p,c in comps), -sum(p*c["mu_imag"] for p,c in comps))
+        dens = sum(p*c["density"] for p,c in comps)
+        cost = sum(p*c["cost_proxy"] for p,c in comps)
+        # Reflection loss in dB at every frequency point, from the layer's input impedance.
+        d_m = d_mm * 1e-3
+        rl = np.zeros(len(freqs))
+        for i, f in enumerate(freqs):
+            g = 1j*(2*np.pi*f*d_m/C0)*np.sqrt(mr*er)
+            zi = Z0*np.sqrt(mr/er)*np.tanh(g)
+            r = abs((zi-Z0)/(zi+Z0))
+            rl[i] = 20*np.log10(max(r, 1e-15))  # floor keeps log10 away from zero
+        # Longest contiguous run of points with RL <= -10 dB: ml = run length, ei = its last index.
+        rl_min = float(np.min(rl))
+        mask = rl <= -10; ml = cl = ei = 0
+        for i, f in enumerate(mask):
+            if f: cl += 1
+            else: cl = 0
+            if cl > ml: ml = cl; ei = i
+        eab = (freqs[ei] - freqs[ei-ml+1]) / 1e9 if ml > 0 else 0.0
+        # Discard designs whose bandwidth is below the configured minimum.
+        if eab < min_eab: continue
+        # Weighted score: rewards normalized EAB and |RL_min|, penalizes thickness, density and cost.
+        s = (w["eab10"]*norm(eab, n["eab10_ghz"]["min"], n["eab10_ghz"]["max"])
+             + w["rl_min"]*norm(abs(rl_min), n["abs_rl_min_db"]["min"], n["abs_rl_min_db"]["max"])
+             - w["thickness"]*norm(d_mm, n["thickness_mm"]["min"], n["thickness_mm"]["max"])
+             - w["density"]*norm(dens, n["density"]["min"], n["density"]["max"])
+             - w["cost"]*norm(cost, n["cost"]["min"], n["cost"]["max"]))
+
+        if s > best_score:
+            best_score = s
+            best_sub = {
+                "benchmark_id": cfg["benchmark_id"],
+                "d_mm": round(d_mm, 4),
+                "phi_magnetic_absorber": round(p_ma, 4),
+                "phi_conductive_filler": round(p_cf, 4),
+                "phi_lightweight_magnetic": round(p_lm, 4),
+                "phi_matrix": round(p_x, 4),
+            }
+    # Recompute phi_matrix from the rounded fractions so the submitted values still sum to 1.0.
+    if best_sub:
+        best_sub["phi_matrix"] = round(1.0 - best_sub["phi_magnetic_absorber"] - best_sub["phi_conductive_filler"] - best_sub["phi_lightweight_magnetic"], 6)
+    # NOTE(review): if no sample was accepted, best_sub is None and JSON `null` is written - confirm intended.
+    out = temp_dir / "submission.json"
+    with open(out, "w", encoding="utf-8") as f:
+        json.dump(best_sub, f, indent=2)
+    print(f"Baseline done. Best score: {best_score:.4f}")
+    print(f"Submission: {json.dumps(best_sub, indent=2)}")
+    print(f"Written to {out}")
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/agent_files.txt b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/agent_files.txt
new file mode 100644
index 00000000..296905a3
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/agent_files.txt
@@ -0,0 +1,8 @@
+README.md
+README_zh-CN.md
+Task.md
+Task_zh-CN.md
+scripts/init.py
+verification/evaluator.py
+references/
+frontier_eval/constraints.txt
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/artifact_files.txt b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/artifact_files.txt
new file mode 100644
index 00000000..cb7566f6
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/artifact_files.txt
@@ -0,0 +1 @@
+temp/submission.json
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/candidate_destination.txt b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/candidate_destination.txt
new file mode 100644
index 00000000..b9411b3d
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/candidate_destination.txt
@@ -0,0 +1 @@
+scripts/init.py
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/constraints.txt b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/constraints.txt
new file mode 100644
index 00000000..b4a44e4e
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/constraints.txt
@@ -0,0 +1,6 @@
+UnifiedTask constraints:
+1) Only modify `scripts/init.py`.
+2) Preserve the submission schema expected by `verification/evaluator.py`.
+3) Do not modify benchmark assets, documentation, references, verification code, baseline code, or `frontier_eval/` metadata.
+4) Keep the output filename as `temp/submission.json`.
+5) Prioritize validity and the EAB hard constraint before optimization.
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/copy_files.txt b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/copy_files.txt
new file mode 100644
index 00000000..9c558e35
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/copy_files.txt
@@ -0,0 +1 @@
+.
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/eval_command.txt b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/eval_command.txt
new file mode 100644
index 00000000..8cfcad47
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/eval_command.txt
@@ -0,0 +1 @@
+{python} frontier_eval/run_eval.py --candidate {candidate} --metrics-out metrics.json --artifacts-out artifacts.json
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/eval_cwd.txt b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/eval_cwd.txt
new file mode 100644
index 00000000..9c558e35
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/eval_cwd.txt
@@ -0,0 +1 @@
+.
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/evaluator.py b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/evaluator.py
new file mode 100644
index 00000000..44f6edb9
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/evaluator.py
@@ -0,0 +1,89 @@
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+
+def _is_repo_root(path: Path) -> bool:
+    return all((path / marker).is_dir() for marker in ("frontier_eval", "benchmarks"))  # both dirs required
+
+
+def _find_repo_root() -> Path:
+    """Resolve the repo root: env override when set, else nearest marked ancestor, else the CWD."""
+    override = os.environ.get("FRONTIER_ENGINEERING_ROOT")
+    if override is not None:
+        return Path(override).expanduser().resolve()
+    script = Path(__file__).resolve()
+    marked = [p for p in (script.parent, *script.parents) if _is_repo_root(p)]
+    return marked[0] if marked else Path.cwd().resolve()
+
+
+def _tail(text: str, limit: int = 8000) -> str:
+    """Return ``text`` unchanged when it fits in ``limit`` characters, else only its last ``limit``."""
+    overflow = len(text) > limit
+    return text[-limit:] if overflow else text
+
+
+def _parse_result(stdout: str) -> dict:
+    """Decode the first JSON object at/after the "EVALUATION RESULT" marker; raises ValueError if absent/invalid."""
+    marker_pos = stdout.find("EVALUATION RESULT")
+    json_start = stdout.find("{", max(marker_pos, 0))  # no marker -> search from the beginning
+    if json_start < 0:
+        raise ValueError("Failed to locate JSON result block in evaluator stdout")
+    result, _end = json.JSONDecoder().raw_decode(stdout, json_start)  # output after the object is ignored
+    return result
+
+
+def evaluate(program_path: str, *, repo_root: Path | None = None):
+    """Run ``verification/evaluator.py`` on the candidate and collect metrics and artifacts.
+
+    A run exceeding the 300 s limit is reported with ``timeout=1.0`` and a zero score instead of raising.
+    ``repo_root`` is accepted for interface compatibility only; the task directory comes from this file.
+    """
+    start = time.time()
+    program_path = Path(program_path).expanduser().resolve()
+    task_dir = Path(__file__).resolve().parents[1]
+    eval_script = (task_dir / "verification" / "evaluator.py").resolve()
+    metrics: dict[str, float] = {"combined_score": 0.0, "valid": 0.0, "timeout": 0.0}
+    artifacts: dict[str, str] = {}
+
+    try:
+        proc = subprocess.run(
+            [sys.executable, str(eval_script), str(program_path)],
+            cwd=str(task_dir),
+            capture_output=True,
+            text=True,
+            timeout=300,
+        )
+    except subprocess.TimeoutExpired as exc:
+        metrics.update(timeout=1.0, runtime_s=float(time.time() - start))
+        artifacts["error_message"] = f"Evaluator timed out: {exc}"
+        return _wrap(metrics, artifacts)
+
+    metrics.update(runtime_s=float(time.time() - start), program_returncode=float(proc.returncode))
+    artifacts.update(evaluator_stdout=_tail(proc.stdout), evaluator_stderr=_tail(proc.stderr))
+    for candidate in [task_dir / "temp" / "submission.json", task_dir / "submission.json"]:
+        if candidate.exists():
+            artifacts[candidate.relative_to(task_dir).as_posix()] = candidate.read_text(
+                encoding="utf-8", errors="replace"
+            )
+
+    try:
+        result = _parse_result(proc.stdout)
+        metrics["combined_score"] = float(result.get("combined_score", 0.0))
+        metrics["valid"] = 1.0 if float(result.get("valid", 0.0)) > 0 else 0.0
+    except Exception as exc:  # unparseable output keeps the zero-score defaults
+        artifacts["error_message"] = f"Failed to parse evaluator result: {exc}"
+    return _wrap(metrics, artifacts)
+
+
+def _wrap(metrics: dict[str, float], artifacts: dict[str, str]):
+    try:  # prefer openevolve's result type when the package is importable
+        from openevolve.evaluation_result import EvaluationResult as result_cls
+    except Exception:  # import failed: fall back to a plain dict with the same two fields
+        return dict(metrics=metrics, artifacts=artifacts)
+    return result_cls(metrics=metrics, artifacts=artifacts)
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/initial_program.txt b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/initial_program.txt
new file mode 100644
index 00000000..b9411b3d
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/initial_program.txt
@@ -0,0 +1 @@
+scripts/init.py
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/readonly_files.txt b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/readonly_files.txt
new file mode 100644
index 00000000..e35eda2e
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/readonly_files.txt
@@ -0,0 +1,8 @@
+README.md
+README_zh-CN.md
+Task.md
+Task_zh-CN.md
+references/
+verification/
+baseline/
+frontier_eval/
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/run_eval.py b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/run_eval.py
new file mode 100644
index 00000000..e3307605
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/frontier_eval/run_eval.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+import argparse
+import inspect
+import json
+import os
+import traceback
+from importlib.util import module_from_spec, spec_from_file_location
+from pathlib import Path
+from typing import Any
+
+INVALID_COMBINED_SCORE = -1e18
+
+
+def _write_json(path: Path, obj: Any) -> None:
+    """Write ``obj`` to ``path`` as indented UTF-8 JSON with a trailing newline, creating parent dirs."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    # default=str stringifies any value json cannot encode natively.
+    payload = json.dumps(obj, ensure_ascii=False, indent=2, default=str)
+    path.write_text(payload + "\n", encoding="utf-8")
+
+
+def _normalize_result(result: Any) -> tuple[dict[str, Any], dict[str, Any]]:
+    """Split an evaluator return value into ``(metrics, artifacts)`` dictionaries."""
+    if hasattr(result, "metrics") and hasattr(result, "artifacts"):
+        return dict(result.metrics), dict(result.artifacts)
+    if not isinstance(result, dict):
+        raise TypeError("Evaluator must return an EvaluationResult-like object or a dict.")
+    nested_metrics = result.get("metrics")
+    if isinstance(nested_metrics, dict):  # {"metrics": ..., "artifacts": ...} envelope
+        return dict(nested_metrics), dict(result.get("artifacts") or {})
+    return dict(result), {}  # a bare dict is itself treated as the metrics mapping
+
+
+def _load_local_evaluator() -> Any:  # returns the ``evaluate`` attribute of the sibling evaluator.py
+    source_file = Path(__file__).with_name("evaluator.py").resolve()
+    spec = spec_from_file_location("_frontier_eval_local_evaluator", source_file)
+    if not (spec is not None and spec.loader is not None):
+        raise RuntimeError(f"Failed to load local evaluator from {source_file}")
+    loaded = module_from_spec(spec)
+    spec.loader.exec_module(loaded)  # executes evaluator.py under the private module name above
+    return loaded.evaluate
+
+
+def _find_repo_root() -> Path:
+    """Locate the repo root: non-empty env override, else nearest ancestor with both marker dirs, else CWD."""
+    env_root = os.environ.get("FRONTIER_ENGINEERING_ROOT")
+    if env_root:
+        return Path(env_root).expanduser().resolve()
+    here = Path(__file__).resolve()
+    markers = ("frontier_eval", "benchmarks")
+    hits = [p for p in (here.parent, *here.parents) if all((p / m).is_dir() for m in markers)]
+    return hits[0] if hits else Path.cwd().resolve()
+
+
+def _build_kwargs(evaluate_fn: Any) -> dict[str, Any]:
+    """Return ``{"repo_root": ...}`` only when ``evaluate_fn`` declares a ``repo_root`` parameter."""
+    try:
+        accepted = inspect.signature(evaluate_fn).parameters
+    except Exception:
+        return {}
+    if "repo_root" not in accepted:
+        return {}
+    return {"repo_root": _find_repo_root()}
+
+
+def main(argv: list[str]) -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--candidate", required=True)
+    parser.add_argument("--metrics-out", default="metrics.json")
+    parser.add_argument("--artifacts-out", default="artifacts.json")
+    args = parser.parse_args(argv)
+    # Normalize every CLI path to an absolute, user-expanded form.
+    candidate_path = Path(args.candidate).expanduser().resolve()
+    metrics_out = Path(args.metrics_out).expanduser().resolve()
+    artifacts_out = Path(args.artifacts_out).expanduser().resolve()
+    # Pessimistic defaults: written unchanged if the evaluator fails to load or raises.
+    metrics: dict[str, Any] = {"combined_score": INVALID_COMBINED_SCORE, "valid": 0.0}
+    artifacts: dict[str, Any] = {
+        "local_evaluator_path": str(Path(__file__).with_name("evaluator.py").resolve()),
+        "candidate_path": str(candidate_path),
+    }
+
+    try:
+        evaluate_fn = _load_local_evaluator()
+        result = evaluate_fn(str(candidate_path), **_build_kwargs(evaluate_fn))
+        metrics, evaluator_artifacts = _normalize_result(result)
+        artifacts.update(evaluator_artifacts)
+    except Exception as exc:
+        artifacts["error_message"] = str(exc)
+        artifacts["traceback"] = traceback.format_exc()
+    # Both files are always written and the exit code is always 0; failures show up in the JSON.
+    _write_json(metrics_out, metrics)
+    _write_json(artifacts_out, artifacts)
+    return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main(__import__("sys").argv[1:]))
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/references/material_db.json b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/references/material_db.json
new file mode 100644
index 00000000..4fb6d4db
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/references/material_db.json
@@ -0,0 +1,48 @@
+{
+ "matrix": {
+ "name": "PE (polyethylene)",
+ "eps_real": 2.3,
+ "eps_imag": 0.02,
+ "mu_real": 1.0,
+ "mu_imag": 0.0,
+ "density": 0.95,
+ "cost_proxy": 1.0,
+ "description": "Polyethylene matrix. Low permittivity, non-magnetic, lightweight."
+ },
+ "magnetic_absorber": {
+ "name": "Nd0.15-BaM",
+ "eps_real": 14.0,
+ "eps_imag": 8.0,
+ "mu_real": 1.35,
+ "mu_imag": 0.25,
+ "density": 5.1,
+ "cost_proxy": 2.5,
+ "description": "Nd-doped barium ferrite (BaNd0.15Fe11.85O19). Provides both dielectric and magnetic loss via natural resonance and eddy current effects."
+ },
+ "conductive_filler": {
+ "name": "CNTs (8wt%)",
+ "eps_real": 18.0,
+ "eps_imag": 12.0,
+ "mu_real": 1.0,
+ "mu_imag": 0.0,
+ "density": 1.7,
+ "cost_proxy": 3.5,
+ "description": "Carbon nanotubes at 8wt% loading. High dielectric loss from conductive network formation. Based on Nd0.15-BaM/8%CNTs composite data."
+ },
+ "lightweight_magnetic": {
+ "name": "Hollow Nd-BaM microspheres",
+ "eps_real": 7.0,
+ "eps_imag": 2.5,
+ "mu_real": 1.15,
+ "mu_imag": 0.12,
+ "density": 2.8,
+ "cost_proxy": 4.0,
+ "description": "Hollow Nd-doped barium ferrite microspheres. Reduced density compared to solid Nd-BaM while retaining moderate magnetic loss."
+ },
+ "_notes": {
+ "data_source": "Electromagnetic parameters derived from VNA measurements in Wang et al., Materials 2024, 17, 3433 (CNTs@Nd0.15-BaM/PE composites, 8.2-18 GHz).",
+ "sign_convention": "eps_r = eps_real - j*eps_imag (negative imaginary part). Same for permeability.",
+ "mixing_rule": "Linear volume-fraction mixing. Simplified first-order approximation.",
+ "electromagnetic_parameters": "All values are frequency-independent constant approximations averaged over the 8.2-18 GHz range."
+ }
+}
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/references/problem_config.json b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/references/problem_config.json
new file mode 100644
index 00000000..6cdea2e7
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/references/problem_config.json
@@ -0,0 +1,31 @@
+{
+ "benchmark_id": "lightweight_broadband_absorber_8_18ghz",
+ "task_name": "LightweightBroadbandAbsorber",
+ "description": "Lightweight broadband CNTs@Nd-BaM/PE absorber optimization, 8.2-18 GHz",
+ "freq_ghz_min": 8.2,
+ "freq_ghz_max": 18.0,
+ "num_freq_points": 197,
+ "backing": "PEC",
+ "d_mm_min": 1.0,
+ "d_mm_max": 5.0,
+ "phi_min": 0.0,
+ "phi_max": 1.0,
+ "phi_sum_tolerance": 1e-6,
+ "rl_threshold_db": -10.0,
+ "min_eab_ghz": 4.0,
+ "normalization": {
+ "eab10_ghz": { "min": 0.0, "max": 9.8 },
+ "abs_rl_min_db": { "min": 0.0, "max": 60.0 },
+ "thickness_mm": { "min": 1.0, "max": 5.0 },
+ "density": { "min": 0.9, "max": 5.5 },
+ "cost": { "min": 1.0, "max": 4.0 }
+ },
+ "weights": {
+ "eab10": 1.0,
+ "rl_min": 0.15,
+ "thickness": 0.4,
+ "density": 0.5,
+ "cost": 0.05
+ },
+ "notes": "Density penalty is dominant (0.5) to incentivize lightweight designs. EAB < 4.0 GHz => infeasible. All metrics normalized to [0,1]. Based on Wang et al., Materials 2024, 17, 3433."
+}
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/scripts/init.py b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/scripts/init.py
new file mode 100644
index 00000000..8e13783f
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/scripts/init.py
@@ -0,0 +1,47 @@
+"""
+Minimal initialization for LightweightBroadbandAbsorber benchmark.
+This is the target file for agent evolution.
+"""
+import json
+from pathlib import Path
+
+
+def main():
+    task_dir = Path(__file__).resolve().parents[1]
+    temp_dir = task_dir / "temp"
+    temp_dir.mkdir(exist_ok=True)
+    # Read the problem configuration; only benchmark_id is used below.
+    config_path = task_dir / "references" / "problem_config.json"
+    with open(config_path, "r", encoding="utf-8") as f:
+        config = json.load(f)
+
+    # EVOLVE-BLOCK-START
+    # Design a lightweight broadband absorber for 8.2-18 GHz.
+    # Variables:
+    #   d_mm: absorber thickness in mm [1.0, 5.0]
+    #   phi_magnetic_absorber: Nd0.15-BaM volume fraction [0, 1]
+    #   phi_conductive_filler: CNTs volume fraction [0, 1]
+    #   phi_lightweight_magnetic: hollow Nd-BaM volume fraction [0, 1]
+    #   phi_matrix: PE matrix volume fraction [0, 1]
+    # Constraint: all phi sum to 1.0
+    # Hard constraint: EAB >= 4.0 GHz (otherwise infeasible)
+    # Goal: maximize combined_score (wide bandwidth, deep RL, thin, LIGHT, cheap)
+    # Fixed starting design; its fractions sum to 1.0 (0.25 + 0.10 + 0.05 + 0.60).
+    submission = {
+        "benchmark_id": config["benchmark_id"],
+        "d_mm": 1.9,
+        "phi_magnetic_absorber": 0.25,
+        "phi_conductive_filler": 0.10,
+        "phi_lightweight_magnetic": 0.05,
+        "phi_matrix": 0.60
+    }
+    # EVOLVE-BLOCK-END
+    # Write the design to temp/submission.json under the task directory.
+    output_path = temp_dir / "submission.json"
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(submission, f, indent=2)
+    print(f"Submission written to {output_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/verification/evaluator.py b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/verification/evaluator.py
new file mode 100644
index 00000000..5a5bd5af
--- /dev/null
+++ b/benchmarks/MaterialEngineering/LightweightBroadbandAbsorber/verification/evaluator.py
@@ -0,0 +1,141 @@
+"""
+Official evaluator for LightweightBroadbandAbsorber benchmark.
+Single-layer broadband CNTs@Nd-BaM/PE absorber, 8.2-18 GHz, PEC backing.
+4 material components. Minimum EAB hard constraint.
+
+Usage: python verification/evaluator.py scripts/init.py
+"""
+import json, math, subprocess, sys, time
+from pathlib import Path
+import numpy as np
+
+Z0 = 377.0  # reference impedance in compute_rl_curve (approx. free-space wave impedance, ohms)
+C0 = 2.998e8  # wave speed dividing 2*pi*f*d in compute_rl_curve (approx. speed of light in vacuum, m/s)
+
+def load_json(p):  # return the parsed contents of the UTF-8 encoded JSON file at path p
+    return json.loads(Path(p).read_text(encoding="utf-8"))
+
+def fail_result(msg):  # zero-score result (valid=0, feasible=0) carrying the failure message
+    return dict(valid=0, feasible=0, combined_score=0.0, message=msg)
+
+def validate_submission(sub, cfg):
+    """Check `sub` against the limits in `cfg`; return (True, "ok") or (False, reason)."""
+    phi_keys = ["phi_magnetic_absorber","phi_conductive_filler","phi_lightweight_magnetic","phi_matrix"]
+    def is_number(v):  # bool subclasses int, so exclude it; ints skip isfinite, which overflows on huge values
+        return (isinstance(v,int) and not isinstance(v,bool)) or (isinstance(v,float) and math.isfinite(v))
+    if not isinstance(sub, dict): return False, "Submission must be a JSON object"  # valid JSON may be a list etc.
+    for k in ["benchmark_id","d_mm"] + phi_keys:
+        if k not in sub: return False, f"Missing key: '{k}'"
+    if sub["benchmark_id"] != cfg["benchmark_id"]: return False, "benchmark_id mismatch"
+    if not is_number(sub["d_mm"]): return False, "Invalid d_mm"
+    if not (cfg["d_mm_min"] <= sub["d_mm"] <= cfg["d_mm_max"]): return False, "d_mm out of range"
+    for k in phi_keys:
+        if not is_number(sub[k]): return False, f"Invalid {k}"
+        if not (cfg["phi_min"] <= sub[k] <= cfg["phi_max"]): return False, f"{k} out of range"
+    total = sum(sub[k] for k in phi_keys)  # same key order as the range checks above
+    if abs(total-1.0) > cfg["phi_sum_tolerance"]: return False, f"Volume fractions sum to {total:.10f}, not 1.0"
+    return True, "ok"
+
+def mix_properties(sub, mdb):
+    """Blend the four components' properties linearly, weighted by volume fraction.
+
+    Returns eps_r and mu_r as complex values (imaginary part negated) plus density and cost.
+    """
+    # Fixed component order (matrix first) keeps the floating-point sums reproducible.
+    parts = [(sub["phi_" + name], mdb[name]) for name in
+             ("matrix", "magnetic_absorber", "conductive_filler", "lightweight_magnetic")]
+    def mixed(field):
+        return sum(frac*comp[field] for frac, comp in parts)
+    return {"eps_r": complex(mixed("eps_real"), -mixed("eps_imag")),
+            "mu_r": complex(mixed("mu_real"), -mixed("mu_imag")),
+            "density": mixed("density"),
+            "cost": mixed("cost_proxy")}
+
+def compute_rl_curve(eps_r, mu_r, d_mm, cfg):
+    """Return (freqs_hz, rl_db) for a slab of thickness d_mm, sampled uniformly over cfg's band."""
+    thickness_m = d_mm * 1e-3
+    def rl_db(freq_hz):
+        gamma_d = 1j*(2*np.pi*freq_hz*thickness_m/C0)*np.sqrt(mu_r*eps_r)
+        z_in = Z0*np.sqrt(mu_r/eps_r)*np.tanh(gamma_d)
+        # floor |reflection| at 1e-15 so a perfect match does not produce log10(0)
+        return 20.0*np.log10(max(abs((z_in-Z0)/(z_in+Z0)), 1e-15))
+    freqs = np.linspace(cfg["freq_ghz_min"]*1e9, cfg["freq_ghz_max"]*1e9, cfg["num_freq_points"])
+    return freqs, np.array([rl_db(f) for f in freqs], dtype=float)
+
+def compute_eab10(freqs, rl, thr=-10.0):
+    """Span, divided by 1e9, of the longest contiguous run of samples with rl <= thr (0.0 if none).
+
+    On ties the earliest run wins; a one-sample run has zero width.
+    """
+    best_len, best_end, run_len = 0, 0, 0
+    for idx, below in enumerate(rl <= thr):
+        run_len = run_len + 1 if below else 0
+        if run_len > best_len: best_len, best_end = run_len, idx
+    if best_len == 0: return 0.0
+    return (freqs[best_end]-freqs[best_end-best_len+1])/1e9
+
+def norm(v, lo, hi):
+    """Linearly map v from [lo, hi] onto [0, 1] and clamp; an empty or inverted range gives 0.0."""
+    return 0.0 if hi <= lo else max(0.0, min(1.0, (v-lo)/(hi-lo)))
+
+def compute_score(rl_min, eab, d, dens, cost, w, n):
+    """Weighted score: rewards normalized EAB and |rl_min|, penalizes thickness, density and cost."""
+    def scaled(value, key):  # clamp-normalize using the min/max bounds stored under n[key]
+        return norm(value, n[key]["min"], n[key]["max"])
+    reward = w["eab10"]*scaled(eab, "eab10_ghz") + w["rl_min"]*scaled(abs(rl_min), "abs_rl_min_db")
+    return float(reward - w["thickness"]*scaled(d, "thickness_mm")
+                 - w["density"]*scaled(dens, "density")
+                 - w["cost"]*scaled(cost, "cost"))
+
+def evaluate_candidate(prog, task_dir):
+    """Run candidate script `prog` in `task_dir`, then validate and score the submission.json it writes."""
+    locations = [task_dir/"temp"/"submission.json", task_dir/"submission.json"]
+    # Remove leftovers from earlier runs: otherwise a candidate that writes nothing is scored on stale output.
+    for stale in locations:
+        if stale.exists(): stale.unlink()
+    t0 = time.time()
+    try:
+        proc = subprocess.run([sys.executable, str(prog)], cwd=str(task_dir),
+                              capture_output=True, text=True, timeout=120)
+    except subprocess.TimeoutExpired:
+        return fail_result("Timeout (120s)")
+    runtime = time.time()-t0
+    print("=== Candidate stdout ==="); print(proc.stdout)
+    if proc.stderr.strip(): print("=== stderr ==="); print(proc.stderr)
+    if proc.returncode != 0: return fail_result(f"Exit code {proc.returncode}")
+
+    sp = next((p for p in locations if p.exists()), None)  # temp/ is preferred over the task root
+    if sp is None: return fail_result("submission.json not found")
+    try: sub = load_json(sp)
+    except Exception as e: return fail_result(f"Parse error: {e}")
+
+    cfg = load_json(task_dir/"references"/"problem_config.json")
+    mdb = load_json(task_dir/"references"/"material_db.json")
+    ok, msg = validate_submission(sub, cfg)
+    if not ok: return fail_result(f"Validation: {msg}")
+
+    props = mix_properties(sub, mdb)
+    freqs, rl = compute_rl_curve(props["eps_r"], props["mu_r"], sub["d_mm"], cfg)
+    rl_min = float(np.min(rl))
+    eab = compute_eab10(freqs, rl, cfg.get("rl_threshold_db",-10.0))
+    base = {"rl_min_db": rl_min, "eab10_ghz": eab, "thickness_mm": sub["d_mm"],
+            "density": props["density"], "cost_proxy": props["cost"], "runtime_sec": round(runtime,3)}
+
+    min_eab = cfg.get("min_eab_ghz", 0.0)
+    if eab < min_eab:  # hard constraint: the submission is valid but the design is infeasible
+        return {**base, "valid": 1, "feasible": 0, "combined_score": 0.0,
+                "message": f"EAB={eab:.2f} GHz < min required {min_eab} GHz"}
+    score = compute_score(rl_min, eab, sub["d_mm"], props["density"], props["cost"], cfg["weights"], cfg["normalization"])
+    return {**base, "valid": 1, "feasible": 1, "combined_score": score}
+
+def main():
+ if len(sys.argv)<2: print("Usage: python verification/evaluator.py