Skip to content

Commit 050f7a5

Browse files
committed
addid
1 parent 9d9b482 commit 050f7a5

File tree

3 files changed

+85
-50
lines changed

3 files changed

+85
-50
lines changed

aws/lambda/benchmark_regression_summary_report/common/config.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,13 @@
4848
),
4949
metrics={
5050
"passrate": RegressionPolicy(
51-
name="passrate", condition="greater_than", threshold=0.9
51+
name="passrate", condition="greater_than", threshold=0.9, baseline_aggregation="max"
5252
),
5353
"geomean": RegressionPolicy(
54-
name="geomean", condition="greater_than", threshold=0.95
54+
name="geomean", condition="greater_than", threshold=0.95,baseline_aggregation="max"
5555
),
56-
"dynamo_peak_mem": RegressionPolicy(
57-
name="dynamo_peak_mem", condition="greater_than", threshold=0.9
56+
"compression_ratio": RegressionPolicy(
57+
name="compression_ratio", condition="greater_than", threshold=0.9, baseline_aggregation="max"
5858
),
5959
},
6060
notification_config={

aws/lambda/benchmark_regression_summary_report/common/config_model.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -97,30 +97,45 @@ def baseline_timedelta(self) -> timedelta:
9797
@dataclass
9898
class RegressionPolicy:
9999
"""
100-
- "greater_than": higher is better; violation if value < baseline * threshold
101-
- "less_than": lower is better; violation if value > baseline * threshold
102-
- "equal_to": value should be ~= baseline * threshold within rel_tol
100+
Defines the policy for a given metric.
101+
- "greater_than": higher is better; violation if new value <= baseline * threshold
102+
- "less_than": lower is better; violation if new value >= baseline * threshold
103+
- "equal_to": new value should be ~= baseline * threshold within rel_tol
104+
- "greater_equal": higher is better; violation if new value < baseline * threshold
105+
- "less_equal": lower is better; violation if new value > baseline * threshold
106+
103107
"""
104108
name: str
105-
condition: Literal["greater_than", "less_than", "equal_to"]
109+
condition: Literal["greater_than", "less_than", "equal_to","greater_equal","less_equal"]
106110
threshold: float
111+
baseline_aggregation: Literal["avg", "max", "min", "p50", "p90", "p95","latest","earliest"] = "max"
107112
rel_tol: float = 1e-3 # used only for "equal_to"
108113

109114
def is_violation(self, value: float, baseline: float) -> bool:
110115
target = baseline * self.threshold
111116

112117
if self.condition == "greater_than":
113-
# value should be >= target
118+
# value must be strictly greater than target
119+
return value <= target
120+
121+
if self.condition == "greater_equal":
122+
# value must be greater or equal to target
114123
return value < target
115124

116125
if self.condition == "less_than":
117-
# value should be <= target
126+
# value must be strictly less than target
127+
return value >= target
128+
129+
if self.condition == "less_equal":
130+
# value must be less or equal to target
118131
return value > target
119132

120-
# equal_to: |value - target| should be within rel_tol * max(1, |target|)
121-
denom = max(1.0, abs(target))
122-
return abs(value - target) > self.rel_tol * denom
133+
if self.condition == "equal_to":
134+
# |value - target| should be within rel_tol * max(1, |target|)
135+
denom = max(1.0, abs(target))
136+
return abs(value - target) > self.rel_tol * denom
123137

138+
raise ValueError(f"Unknown condition: {self.condition}")
124139
class BaseNotificationConfig:
125140
# every subclass must override this
126141
type_tag: ClassVar[str]

aws/lambda/benchmark_regression_summary_report/common/regression_utils.py

Lines changed: 57 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from common.config_model import BenchmarkConfig, RegressionPolicy
77
from common.benchmark_time_series_api_model import (
88
BenchmarkTimeSeriesApiData,
9+
BenchmarkTimeSeriesItem,
910
)
1011
import pprint
1112

@@ -21,7 +22,7 @@ class BaselineItem(TypedDict):
2122
value: float
2223

2324

24-
class LatestItem(TypedDict):
25+
class BenchmarkValueItem(TypedDict):
2526
group_info: Dict[str, Any]
2627
values: List[Dict[str, Any]]
2728

@@ -35,8 +36,6 @@ class PerGroupResult(TypedDict, total=True):
3536

3637

3738
def percentile(values: list[float], q: float):
38-
if not values:
39-
return None
4039
v = sorted(values)
4140
k = (len(v) - 1) * q
4241
f = math.floor(k)
@@ -54,20 +53,20 @@ def __init__(
5453
baseline_ts: BenchmarkTimeSeriesApiData,
5554
) -> None:
5655
self.metric_policies = config.policy.metrics
57-
self.latest_ts = self._to_latest_data_map(latest_ts)
58-
self.baseline_ts = self._to_baseline_map(baseline_ts, mode="max")
56+
self.latest_ts = self._to_data_map(latest_ts)
57+
self.baseline_raw = self._to_data_map(baseline_ts)
5958

6059
def generate(self) -> Tuple[List[PerGroupResult], bool]:
6160
return self.detect_regressions_with_policies(
62-
self.baseline_ts,
61+
self.baseline_raw,
6362
self.latest_ts,
6463
metric_policies=self.metric_policies,
6564
)
6665

6766
def detect_regressions_with_policies(
6867
self,
69-
baseline_map: Dict[tuple, BaselineItem],
70-
dp_map: Dict[tuple, LatestItem],
68+
baseline_map: Dict[tuple, BenchmarkValueItem],
69+
dp_map: Dict[tuple, BenchmarkValueItem],
7170
*,
7271
metric_policies: Dict[str, RegressionPolicy],
7372
min_points: int = 2,
@@ -90,27 +89,41 @@ def detect_regressions_with_policies(
9089
points: List[Any] = cur_item["values"] if cur_item else []
9190

9291
base_item = baseline_map.get(key)
92+
if not base_item:
93+
logger.warning("Skip. No baseline item found for %s", gi)
94+
results.append(
95+
PerGroupResult(
96+
group_info=gi,
97+
baseline=None,
98+
points=[],
99+
label="insufficient_data",
100+
policy=None,
101+
)
102+
)
103+
continue
93104
logger.info("base_item for keys(%s):\n%s ",key, pprint.pformat(base_item))
94-
baseline_value = base_item.get("value") if base_item else None
95105
policy = self._resolve_policy(metric_policies, gi.get("metric", ""))
96106
if not policy:
97107
logger.warning("No policy for %s", gi)
98108
results.append(
99109
PerGroupResult(
100110
group_info=gi,
101-
baseline=baseline_value,
111+
baseline=None,
102112
points=[],
103113
label="insufficient_data",
104114
policy=None,
105115
)
106116
)
107117
continue
118+
119+
baseline_aggre_mode = policy.baseline_aggregation
120+
baseline_value = self._get_baseline(base_item,baseline_aggre_mode)
108121
if baseline_value is None or len(points) == 0:
109122
logger.warning("baseline_value is %s, len(points) == %s", baseline_value,len(points))
110123
results.append(
111124
PerGroupResult(
112125
group_info=gi,
113-
baseline=baseline_value,
126+
baseline=None,
114127
points=[],
115128
label="insufficient_data",
116129
policy=policy,
@@ -120,7 +133,7 @@ def detect_regressions_with_policies(
120133

121134
# Per-point violations (True = regression)
122135
flags: List[bool] = [
123-
policy.is_violation(p["value"], baseline_value) for p in points
136+
policy.is_violation(p["value"], baseline_value["value"]) for p in points
124137
]
125138
label = self.classify_flags(flags, min_points=min_points)
126139

@@ -138,10 +151,10 @@ def detect_regressions_with_policies(
138151
is_any_regression = True
139152
return results, is_any_regression
140153

141-
def _to_latest_data_map(
154+
def _to_data_map(
142155
self, data: "BenchmarkTimeSeriesApiData", field: str = "value"
143-
) -> Dict[tuple, LatestItem]:
144-
result: Dict[tuple, LatestItem] = {}
156+
) -> Dict[tuple, BenchmarkValueItem]:
157+
result: Dict[tuple, BenchmarkValueItem] = {}
145158
for ts_group in data.time_series:
146159
group_keys = tuple(sorted(ts_group.group_info.items()))
147160
points: List[Dict[str, Any]] = []
@@ -164,32 +177,39 @@ def _to_latest_data_map(
164177
}
165178
return result
166179

167-
def _to_baseline_map(
180+
def _get_baseline(
168181
self,
169-
baseline: BenchmarkTimeSeriesApiData,
182+
data: BenchmarkValueItem,
170183
mode: str = "mean",
171184
field: str = "value",
172-
) -> Dict[tuple, BaselineItem]:
173-
result = {}
174-
for ts_group in baseline.time_series:
175-
group_keys = tuple(sorted(ts_group.group_info.items()))
176-
values = [float(d[field]) for d in ts_group.data if field in d]
177-
if not values:
178-
continue
179-
180-
if mode == "mean":
181-
val = statistics.fmean(values)
182-
elif mode == "p90":
183-
val = percentile(values, 0.9)
184-
elif mode == "max":
185-
val = max(values)
186-
else:
187-
raise ValueError("mode must be 'mean' or 'p90'")
185+
) -> Optional[BaselineItem]:
186+
values = [float(d[field]) for d in data["values"] if field in d]
187+
if not values:
188+
return None
188189

189-
result[group_keys] = {
190-
"group_info": ts_group.group_info,
191-
"value": val,
192-
}
190+
if mode == "mean":
191+
val = statistics.fmean(values)
192+
elif mode == "p90":
193+
val = percentile(values, 0.9)
194+
elif mode == "max":
195+
val = max(values)
196+
elif mode == "min":
197+
val = min(values)
198+
elif mode == "latest":
199+
val = values[-1]
200+
elif mode == "earliest":
201+
val = values[0]
202+
elif mode == "p50":
203+
val = percentile(values, 0.5)
204+
elif mode == "p95":
205+
val = percentile(values, 0.95)
206+
else:
207+
logger.warning("Unknown mode: %s", mode)
208+
return None
209+
result:BaselineItem = {
210+
"group_info": data["group_info"],
211+
"value": val,
212+
}
193213
return result
194214

195215
def classify_flags(

0 commit comments

Comments
 (0)