66from common .config_model import BenchmarkConfig , RegressionPolicy
77from common .benchmark_time_series_api_model import (
88 BenchmarkTimeSeriesApiData ,
9+ BenchmarkTimeSeriesItem ,
910)
1011import pprint
1112
@@ -21,7 +22,7 @@ class BaselineItem(TypedDict):
2122 value : float
2223
2324
class BenchmarkValueItem(TypedDict):
    """Raw time-series points for a single benchmark group.

    The same shape is used for both the latest and the baseline series.
    """

    # Identifying attributes of the group (e.g. the metric name).
    group_info: Dict[str, Any]
    # Individual data points, one dict per point.
    values: List[Dict[str, Any]]
2728
@@ -35,8 +36,6 @@ class PerGroupResult(TypedDict, total=True):
3536
3637
3738def percentile (values : list [float ], q : float ):
38- if not values :
39- return None
4039 v = sorted (values )
4140 k = (len (v ) - 1 ) * q
4241 f = math .floor (k )
@@ -54,20 +53,20 @@ def __init__(
5453 baseline_ts : BenchmarkTimeSeriesApiData ,
5554 ) -> None :
5655 self .metric_policies = config .policy .metrics
57- self .latest_ts = self ._to_latest_data_map (latest_ts )
58- self .baseline_ts = self ._to_baseline_map (baseline_ts , mode = "max" )
56+ self .latest_ts = self ._to_data_map (latest_ts )
57+ self .baseline_raw = self ._to_data_map (baseline_ts )
5958
def generate(self) -> "Tuple[List[PerGroupResult], bool]":
    """Run regression detection over the stored series.

    Passes the un-aggregated baseline series, the latest series and the
    configured metric policies to ``detect_regressions_with_policies``
    and returns its result unchanged.

    Returns:
        A ``(results, is_any_regression)`` tuple.
    """
    return self.detect_regressions_with_policies(
        self.baseline_raw,
        self.latest_ts,
        metric_policies=self.metric_policies,
    )
6665
6766 def detect_regressions_with_policies (
6867 self ,
69- baseline_map : Dict [tuple , BaselineItem ],
70- dp_map : Dict [tuple , LatestItem ],
68+ baseline_map : Dict [tuple , BenchmarkValueItem ],
69+ dp_map : Dict [tuple , BenchmarkValueItem ],
7170 * ,
7271 metric_policies : Dict [str , RegressionPolicy ],
7372 min_points : int = 2 ,
@@ -90,27 +89,41 @@ def detect_regressions_with_policies(
9089 points : List [Any ] = cur_item ["values" ] if cur_item else []
9190
9291 base_item = baseline_map .get (key )
92+ if not base_item :
93+ logger .warning ("Skip. No baseline item found for %s" , gi )
94+ results .append (
95+ PerGroupResult (
96+ group_info = gi ,
97+ baseline = None ,
98+ points = [],
99+ label = "insufficient_data" ,
100+ policy = None ,
101+ )
102+ )
103+ continue
93104 logger .info ("base_item for keys(%s):\n %s " ,key , pprint .pformat (base_item ))
94- baseline_value = base_item .get ("value" ) if base_item else None
95105 policy = self ._resolve_policy (metric_policies , gi .get ("metric" , "" ))
96106 if not policy :
97107 logger .warning ("No policy for %s" , gi )
98108 results .append (
99109 PerGroupResult (
100110 group_info = gi ,
101- baseline = baseline_value ,
111+ baseline = None ,
102112 points = [],
103113 label = "insufficient_data" ,
104114 policy = None ,
105115 )
106116 )
107117 continue
118+
119+ baseline_aggre_mode = policy .baseline_aggregation
120+ baseline_value = self ._get_baseline (base_item ,baseline_aggre_mode )
108121 if baseline_value is None or len (points ) == 0 :
109122 logger .warning ("baseline_value is %s, len(points) == %s" , baseline_value ,len (points ))
110123 results .append (
111124 PerGroupResult (
112125 group_info = gi ,
113- baseline = baseline_value ,
126+ baseline = None ,
114127 points = [],
115128 label = "insufficient_data" ,
116129 policy = policy ,
@@ -120,7 +133,7 @@ def detect_regressions_with_policies(
120133
121134 # Per-point violations (True = regression)
122135 flags : List [bool ] = [
123- policy .is_violation (p ["value" ], baseline_value ) for p in points
136+ policy .is_violation (p ["value" ], baseline_value [ "value" ] ) for p in points
124137 ]
125138 label = self .classify_flags (flags , min_points = min_points )
126139
@@ -138,10 +151,10 @@ def detect_regressions_with_policies(
138151 is_any_regression = True
139152 return results , is_any_regression
140153
141- def _to_latest_data_map (
154+ def _to_data_map (
142155 self , data : "BenchmarkTimeSeriesApiData" , field : str = "value"
143- ) -> Dict [tuple , LatestItem ]:
144- result : Dict [tuple , LatestItem ] = {}
156+ ) -> Dict [tuple , BenchmarkValueItem ]:
157+ result : Dict [tuple , BenchmarkValueItem ] = {}
145158 for ts_group in data .time_series :
146159 group_keys = tuple (sorted (ts_group .group_info .items ()))
147160 points : List [Dict [str , Any ]] = []
@@ -164,32 +177,39 @@ def _to_latest_data_map(
164177 }
165178 return result
166179
167- def _to_baseline_map (
180+ def _get_baseline (
168181 self ,
169- baseline : BenchmarkTimeSeriesApiData ,
182+ data : BenchmarkValueItem ,
170183 mode : str = "mean" ,
171184 field : str = "value" ,
172- ) -> Dict [tuple , BaselineItem ]:
173- result = {}
174- for ts_group in baseline .time_series :
175- group_keys = tuple (sorted (ts_group .group_info .items ()))
176- values = [float (d [field ]) for d in ts_group .data if field in d ]
177- if not values :
178- continue
179-
180- if mode == "mean" :
181- val = statistics .fmean (values )
182- elif mode == "p90" :
183- val = percentile (values , 0.9 )
184- elif mode == "max" :
185- val = max (values )
186- else :
187- raise ValueError ("mode must be 'mean' or 'p90'" )
185+ ) -> Optional [BaselineItem ]:
186+ values = [float (d [field ]) for d in data ["values" ] if field in d ]
187+ if not values :
188+ return None
188189
189- result [group_keys ] = {
190- "group_info" : ts_group .group_info ,
191- "value" : val ,
192- }
190+ if mode == "mean" :
191+ val = statistics .fmean (values )
192+ elif mode == "p90" :
193+ val = percentile (values , 0.9 )
194+ elif mode == "max" :
195+ val = max (values )
196+ elif mode == "min" :
197+ val = min (values )
198+ elif mode == "latest" :
199+ val = values [- 1 ]
200+ elif mode == "earliest" :
201+ val = values [0 ]
202+ elif mode == "p50" :
203+ val = percentile (values , 0.5 )
204+ elif mode == "p95" :
205+ val = percentile (values , 0.95 )
206+ else :
207+ logger .warning ("Unknown mode: %s" , mode )
208+ return None
209+ result :BaselineItem = {
210+ "group_info" : data ["group_info" ],
211+ "value" : val ,
212+ }
193213 return result
194214
195215 def classify_flags (
0 commit comments