//! The core idea is the following: for each key-value pair associated with a record, aggregate so
//! that (key, value) -> count. This gives a count of how frequently each (key, value) pair appears.
//!
//! The statistics executor is incremental: it loads existing counts from the `output_reader`
//! and updates them with new records.
78
89use std:: collections:: { HashMap , HashSet } ;
910use std:: hash:: { Hash , Hasher } ;
@@ -13,8 +14,7 @@ use chroma_error::ChromaError;
1314use chroma_segment:: blockfile_record:: RecordSegmentReader ;
1415use chroma_segment:: types:: HydratedMaterializedLogRecord ;
1516use chroma_types:: {
16- Chunk , LogRecord , MaterializedLogOperation , MetadataValue , Operation , OperationRecord ,
17- UpdateMetadataValue ,
17+ Chunk , LogRecord , MetadataValue , Operation , OperationRecord , UpdateMetadataValue ,
1818} ;
1919use futures:: StreamExt ;
2020
@@ -27,8 +27,10 @@ pub trait StatisticsFunctionFactory: std::fmt::Debug + Send + Sync {
2727
2828/// Accumulate statistics. Must be an associative and commutative over a sequence of `observe` calls.
2929pub trait StatisticsFunction : std:: fmt:: Debug + Send {
30- fn observe ( & mut self , hydrated_record : & HydratedMaterializedLogRecord < ' _ , ' _ > ) ;
30+ fn observe_insert ( & mut self , hydrated_record : & HydratedMaterializedLogRecord < ' _ , ' _ > ) ;
31+ fn observe_delete ( & mut self , hydrated_record : & HydratedMaterializedLogRecord < ' _ , ' _ > ) ;
3132 fn output ( & self ) -> UpdateMetadataValue ;
33+ fn as_any_mut ( & mut self ) -> & mut dyn std:: any:: Any ;
3234}
3335
3436#[ derive( Debug , Default ) ]
@@ -45,14 +47,29 @@ pub struct CounterFunction {
4547 acc : i64 ,
4648}
4749
50+ impl CounterFunction {
51+ /// Create a CounterFunction with an initial value.
52+ pub fn with_initial_value ( value : i64 ) -> Self {
53+ Self { acc : value }
54+ }
55+ }
56+
4857impl StatisticsFunction for CounterFunction {
49- fn observe ( & mut self , _: & HydratedMaterializedLogRecord < ' _ , ' _ > ) {
58+ fn observe_insert ( & mut self , _: & HydratedMaterializedLogRecord < ' _ , ' _ > ) {
5059 self . acc = self . acc . saturating_add ( 1 ) ;
5160 }
5261
62+ fn observe_delete ( & mut self , _: & HydratedMaterializedLogRecord < ' _ , ' _ > ) {
63+ self . acc = self . acc . saturating_sub ( 1 ) ;
64+ }
65+
5366 fn output ( & self ) -> UpdateMetadataValue {
5467 UpdateMetadataValue :: Int ( self . acc )
5568 }
69+
70+ fn as_any_mut ( & mut self ) -> & mut dyn std:: any:: Any {
71+ self
72+ }
5673}
5774
5875/// Canonical representation of metadata values tracked by the statistics executor.
@@ -171,31 +188,153 @@ impl Hash for StatisticsValue {
171188#[ derive( Debug ) ]
172189pub struct StatisticsFunctionExecutor ( pub Box < dyn StatisticsFunctionFactory > ) ;
173190
191+ impl StatisticsFunctionExecutor {
192+ /// Load existing statistics from the output reader.
193+ /// Returns a HashMap with the same structure as the counts HashMap.
194+ async fn load_existing_statistics (
195+ & self ,
196+ output_reader : Option < & RecordSegmentReader < ' _ > > ,
197+ ) -> Result <
198+ HashMap < String , HashMap < StatisticsValue , Box < dyn StatisticsFunction > > > ,
199+ Box < dyn ChromaError > ,
200+ > {
201+ let mut counts: HashMap < String , HashMap < StatisticsValue , Box < dyn StatisticsFunction > > > =
202+ HashMap :: default ( ) ;
203+
204+ let Some ( reader) = output_reader else {
205+ return Ok ( counts) ;
206+ } ;
207+
208+ let max_offset_id = reader. get_max_offset_id ( ) ;
209+ let mut stream = reader. get_data_stream ( 0 ..=max_offset_id) . await ;
210+
211+ while let Some ( record_result) = stream. next ( ) . await {
212+ let ( _, record) = record_result?;
213+
214+ // Parse the record to extract key, value, type, and count
215+ let Some ( metadata) = & record. metadata else {
216+ continue ;
217+ } ;
218+
219+ let key = match metadata. get ( "key" ) {
220+ Some ( MetadataValue :: Str ( k) ) => k. clone ( ) ,
221+ _ => continue ,
222+ } ;
223+
224+ let value_type = match metadata. get ( "type" ) {
225+ Some ( MetadataValue :: Str ( t) ) => t. as_str ( ) ,
226+ _ => continue ,
227+ } ;
228+
229+ let value_str = match metadata. get ( "value" ) {
230+ Some ( MetadataValue :: Str ( v) ) => v. as_str ( ) ,
231+ _ => continue ,
232+ } ;
233+
234+ let count = match metadata. get ( "count" ) {
235+ Some ( MetadataValue :: Int ( c) ) => * c,
236+ _ => continue ,
237+ } ;
238+
239+ // Reconstruct the StatisticsValue from type and value
240+ let stats_value = match value_type {
241+ "bool" => match value_str {
242+ "true" => StatisticsValue :: Bool ( true ) ,
243+ "false" => StatisticsValue :: Bool ( false ) ,
244+ _ => continue ,
245+ } ,
246+ "int" => match value_str. parse :: < i64 > ( ) {
247+ Ok ( i) => StatisticsValue :: Int ( i) ,
248+ _ => continue ,
249+ } ,
250+ "float" => match value_str. parse :: < f64 > ( ) {
251+ Ok ( f) => StatisticsValue :: Float ( f) ,
252+ _ => continue ,
253+ } ,
254+ "str" => StatisticsValue :: Str ( value_str. to_string ( ) ) ,
255+ "sparse" => match value_str. parse :: < u32 > ( ) {
256+ Ok ( index) => StatisticsValue :: SparseVector ( index) ,
257+ _ => continue ,
258+ } ,
259+ _ => continue ,
260+ } ;
261+
262+ // Create a statistics function initialized with the existing count
263+ let stats_function =
264+ Box :: new ( CounterFunction :: with_initial_value ( count) ) as Box < dyn StatisticsFunction > ;
265+
266+ counts
267+ . entry ( key)
268+ . or_default ( )
269+ . insert ( stats_value, stats_function) ;
270+ }
271+
272+ Ok ( counts)
273+ }
274+ }
275+
174276#[ async_trait]
175277impl AttachedFunctionExecutor for StatisticsFunctionExecutor {
176278 async fn execute (
177279 & self ,
178280 input_records : Chunk < HydratedMaterializedLogRecord < ' _ , ' _ > > ,
179281 output_reader : Option < & RecordSegmentReader < ' _ > > ,
180282 ) -> Result < Chunk < LogRecord > , Box < dyn ChromaError > > {
181- let mut counts: HashMap < String , HashMap < StatisticsValue , Box < dyn StatisticsFunction > > > =
182- HashMap :: default ( ) ;
283+ // Load existing statistics from output_reader if available
284+ let mut counts = self . load_existing_statistics ( output_reader) . await ?;
285+
286+ // Process new input records and update counts
183287 for ( hydrated_record, _index) in input_records. iter ( ) {
184- // This is only applicable for non-incremental statistics.
185- // TODO(tanujnay112): Change this when we make incremental statistics work.
186- if hydrated_record. get_operation ( ) == MaterializedLogOperation :: DeleteExisting {
187- continue ;
288+ let metadata_delta = hydrated_record. compute_metadata_delta ( ) ;
289+
290+ // Decrement counts for deleted metadata
291+ for ( key, old_value) in metadata_delta. metadata_to_delete {
292+ for stats_value in StatisticsValue :: from_metadata_value ( old_value) {
293+ let inner_map = counts. entry ( key. to_string ( ) ) . or_default ( ) ;
294+ inner_map
295+ . entry ( stats_value)
296+ . or_insert_with ( || self . 0 . create ( ) )
297+ . observe_delete ( hydrated_record) ;
298+ }
299+ }
300+
301+ // Decrement counts for old values in updates
302+ for ( key, ( old_value, new_value) ) in & metadata_delta. metadata_to_update {
303+ for stats_value in StatisticsValue :: from_metadata_value ( old_value) {
304+ let inner_map = counts. entry ( key. to_string ( ) ) . or_default ( ) ;
305+ inner_map
306+ . entry ( stats_value)
307+ . or_insert_with ( || self . 0 . create ( ) )
308+ . observe_delete ( hydrated_record) ;
309+ }
310+
311+ for stats_value in StatisticsValue :: from_metadata_value ( new_value) {
312+ let inner_map = counts. entry ( key. to_string ( ) ) . or_default ( ) ;
313+ inner_map
314+ . entry ( stats_value)
315+ . or_insert_with ( || self . 0 . create ( ) )
316+ . observe_insert ( hydrated_record) ;
317+ }
188318 }
189319
190- // Use merged_metadata to get the metadata from the hydrated record
191- let metadata = hydrated_record. merged_metadata ( ) ;
192- for ( key, value) in metadata. iter ( ) {
193- let inner_map = counts. entry ( key. clone ( ) ) . or_default ( ) ;
320+ // Increment counts for new values in both updates and inserts
321+ for ( key, value) in metadata_delta
322+ . metadata_to_update
323+ . iter ( )
324+ . map ( |( k, ( _old, new) ) | ( * k, * new) )
325+ . chain (
326+ metadata_delta
327+ . metadata_to_insert
328+ . iter ( )
329+ . map ( |( k, v) | ( * k, * v) ) ,
330+ )
331+ {
194332 for stats_value in StatisticsValue :: from_metadata_value ( value) {
333+ let inner_map = counts. entry ( key. to_string ( ) ) . or_default ( ) ;
195334 inner_map
196335 . entry ( stats_value)
197336 . or_insert_with ( || self . 0 . create ( ) )
198- . observe ( hydrated_record) ;
337+ . observe_insert ( hydrated_record) ;
199338 }
200339 }
201340 }
0 commit comments