addid

yangw-dev · yangw-dev · commit 050f7a5b1a74 · 2025-09-03T13:02:02.000-07:00
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py
@@ -48,13 +48,13 @@
         ),
         metrics={
             "passrate": RegressionPolicy(
-                name="passrate", condition="greater_than", threshold=0.9
+                name="passrate", condition="greater_than", threshold=0.9, baseline_aggregation="max"
             ),
             "geomean": RegressionPolicy(
-                name="geomean", condition="greater_than", threshold=0.95
+                name="geomean", condition="greater_than", threshold=0.95,baseline_aggregation="max"
             ),
-            "dynamo_peak_mem": RegressionPolicy(
-                name="dynamo_peak_mem", condition="greater_than", threshold=0.9
+            "compression_ratio": RegressionPolicy(
+                name="compression_ratio", condition="greater_than", threshold=0.9, baseline_aggregation="max"
             ),
         },
         notification_config={
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
@@ -97,30 +97,45 @@ def baseline_timedelta(self) -> timedelta:
 @dataclass
 class RegressionPolicy:
     """
-        - "greater_than": higher is better; violation if value < baseline * threshold
-        - "less_than":    lower  is better; violation if value > baseline * threshold
-        - "equal_to":     value should be ~= baseline * threshold within rel_tol
+    Defines the policy for a given metric.
+        - "greater_than": higher is better; violation if new value <= baseline * threshold
+        - "less_than":    lower  is better; violation if new value >= baseline * threshold
+        - "equal_to":     new value should be ~= baseline * threshold within rel_tol
+        - "greater_equal": higher is better; violation if new value < baseline * threshold
+        - "less_equal":    lower  is better; violation if new value > baseline * threshold
+
     """
     name: str
-    condition: Literal["greater_than", "less_than", "equal_to"]
+    condition: Literal["greater_than", "less_than", "equal_to","greater_equal","less_equal"]
     threshold: float
+    baseline_aggregation: Literal["avg", "max", "min", "p50", "p90", "p95","latest","earliest"] = "max"
     rel_tol: float = 1e-3  # used only for "equal_to"
 
     def is_violation(self, value: float, baseline: float) -> bool:
         target = baseline * self.threshold
 
         if self.condition == "greater_than":
-            # value should be >= target
+            # value must be strictly greater than target
+            return value <= target
+
+        if self.condition == "greater_equal":
+            # value must be greater or equal to target
             return value < target
 
         if self.condition == "less_than":
-            # value should be <= target
+            # value must be strictly less than target
+            return value >= target
+
+        if self.condition == "less_equal":
+            # value must be less or equal to target
             return value > target
 
-        # equal_to: |value - target| should be within rel_tol * max(1, |target|)
-        denom = max(1.0, abs(target))
-        return abs(value - target) > self.rel_tol * denom
+        if self.condition == "equal_to":
+            # |value - target| should be within rel_tol * max(1, |target|)
+            denom = max(1.0, abs(target))
+            return abs(value - target) > self.rel_tol * denom
 
+        raise ValueError(f"Unknown condition: {self.condition}")
 class BaseNotificationConfig:
     # every subclass must override this
     type_tag: ClassVar[str]
diff --git a/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py b/aws/lambda/benchmark_regression_summary_report/common/regression_utils.py
@@ -6,6 +6,7 @@
 from common.config_model import BenchmarkConfig, RegressionPolicy
 from common.benchmark_time_series_api_model import (
     BenchmarkTimeSeriesApiData,
+    BenchmarkTimeSeriesItem,
 )
 import pprint
 
@@ -21,7 +22,7 @@ class BaselineItem(TypedDict):
     value: float
 
 
-class LatestItem(TypedDict):
+class BenchmarkValueItem(TypedDict):
     group_info: Dict[str, Any]
     values: List[Dict[str, Any]]
 
@@ -35,8 +36,6 @@ class PerGroupResult(TypedDict, total=True):
 
 
 def percentile(values: list[float], q: float):
-    if not values:
-        return None
     v = sorted(values)
     k = (len(v) - 1) * q
     f = math.floor(k)
@@ -54,20 +53,20 @@ def __init__(
         baseline_ts: BenchmarkTimeSeriesApiData,
     ) -> None:
         self.metric_policies = config.policy.metrics
-        self.latest_ts = self._to_latest_data_map(latest_ts)
-        self.baseline_ts = self._to_baseline_map(baseline_ts, mode="max")
+        self.latest_ts = self._to_data_map(latest_ts)
+        self.baseline_raw = self._to_data_map(baseline_ts)
 
     def generate(self) -> Tuple[List[PerGroupResult], bool]:
         return self.detect_regressions_with_policies(
-            self.baseline_ts,
+            self.baseline_raw,
             self.latest_ts,
             metric_policies=self.metric_policies,
         )
 
     def detect_regressions_with_policies(
         self,
-        baseline_map: Dict[tuple, BaselineItem],
-        dp_map: Dict[tuple, LatestItem],
+        baseline_map: Dict[tuple, BenchmarkValueItem],
+        dp_map: Dict[tuple, BenchmarkValueItem],
         *,
         metric_policies: Dict[str, RegressionPolicy],
         min_points: int = 2,
@@ -90,27 +89,41 @@ def detect_regressions_with_policies(
             points: List[Any] = cur_item["values"] if cur_item else []
 
             base_item = baseline_map.get(key)
+            if not base_item:
+                logger.warning("Skip. No baseline item found for %s", gi)
+                results.append(
+                    PerGroupResult(
+                        group_info=gi,
+                        baseline=None,
+                        points=[],
+                        label="insufficient_data",
+                        policy=None,
+                    )
+                )
+                continue
             logger.info("base_item for keys(%s):\n%s ",key, pprint.pformat(base_item))
-            baseline_value = base_item.get("value") if base_item else None
             policy = self._resolve_policy(metric_policies, gi.get("metric", ""))
             if not policy:
                 logger.warning("No policy for %s", gi)
                 results.append(
                     PerGroupResult(
                         group_info=gi,
-                        baseline=baseline_value,
+                        baseline=None,
                         points=[],
                         label="insufficient_data",
                         policy=None,
                     )
                 )
                 continue
+
+            baseline_aggre_mode = policy.baseline_aggregation
+            baseline_value = self._get_baseline(base_item,baseline_aggre_mode)
             if baseline_value is None or len(points) == 0:
                 logger.warning("baseline_value is %s, len(points) == %s", baseline_value,len(points))
                 results.append(
                     PerGroupResult(
                         group_info=gi,
-                        baseline=baseline_value,
+                        baseline=None,
                         points=[],
                         label="insufficient_data",
                         policy=policy,
@@ -120,7 +133,7 @@ def detect_regressions_with_policies(
 
             # Per-point violations (True = regression)
             flags: List[bool] = [
-                policy.is_violation(p["value"], baseline_value) for p in points
+                policy.is_violation(p["value"], baseline_value["value"]) for p in points
             ]
             label = self.classify_flags(flags, min_points=min_points)
 
@@ -138,10 +151,10 @@ def detect_regressions_with_policies(
                 is_any_regression = True
         return results, is_any_regression
 
-    def _to_latest_data_map(
+    def _to_data_map(
         self, data: "BenchmarkTimeSeriesApiData", field: str = "value"
-    ) -> Dict[tuple, LatestItem]:
-        result: Dict[tuple, LatestItem] = {}
+    ) -> Dict[tuple, BenchmarkValueItem]:
+        result: Dict[tuple, BenchmarkValueItem] = {}
         for ts_group in data.time_series:
             group_keys = tuple(sorted(ts_group.group_info.items()))
             points: List[Dict[str, Any]] = []
@@ -164,32 +177,39 @@ def _to_latest_data_map(
             }
         return result
 
-    def _to_baseline_map(
+    def _get_baseline(
         self,
-        baseline: BenchmarkTimeSeriesApiData,
+        data: BenchmarkValueItem,
         mode: str = "mean",
         field: str = "value",
-    ) -> Dict[tuple, BaselineItem]:
-        result = {}
-        for ts_group in baseline.time_series:
-            group_keys = tuple(sorted(ts_group.group_info.items()))
-            values = [float(d[field]) for d in ts_group.data if field in d]
-            if not values:
-                continue
-
-            if mode == "mean":
-                val = statistics.fmean(values)
-            elif mode == "p90":
-                val = percentile(values, 0.9)
-            elif mode == "max":
-                val = max(values)
-            else:
-                raise ValueError("mode must be 'mean' or 'p90'")
+    ) -> Optional[BaselineItem]:
+        values = [float(d[field]) for d in data["values"] if field in d]
+        if not values:
+            return None
 
-            result[group_keys] = {
-                "group_info": ts_group.group_info,
-                "value": val,
-            }
+        if mode == "mean":
+            val = statistics.fmean(values)
+        elif mode == "p90":
+            val = percentile(values, 0.9)
+        elif mode == "max":
+            val = max(values)
+        elif mode == "min":
+            val = min(values)
+        elif mode == "latest":
+            val = values[-1]
+        elif mode == "earliest":
+            val = values[0]
+        elif mode == "p50":
+            val = percentile(values, 0.5)
+        elif mode == "p95":
+            val = percentile(values, 0.95)
+        else:
+            logger.warning("Unknown mode: %s", mode)
+            return None
+        result:BaselineItem =  {
+            "group_info": data["group_info"],
+            "value": val,
+        }
         return result
 
     def classify_flags(