Remove MARK_AGGREGATED and INITIALIZE from ReportsProcessed

mendess · mendess · commit 85c9c241e244 · 2023-10-27T15:32:08.000+01:00
diff --git a/daphne/src/lib.rs b/daphne/src/lib.rs
@@ -366,6 +366,32 @@ impl<T> Extend<(DapBatchBucket, (T, Vec<(ReportId, Time)>))> for DapAggregateSpa
     }
 }
 
+impl FromIterator<(DapBatchBucket, (ReportId, Time))> for DapAggregateSpan<()> {
+    fn from_iter<I>(iter: I) -> Self
+    where
+        I: IntoIterator<Item = (DapBatchBucket, (ReportId, Time))>,
+    {
+        let mut this = Self::default();
+        this.extend(iter);
+        this
+    }
+}
+
+impl Extend<(DapBatchBucket, (ReportId, Time))> for DapAggregateSpan<()> {
+    fn extend<I>(&mut self, iter: I)
+    where
+        I: IntoIterator<Item = (DapBatchBucket, (ReportId, Time))>,
+    {
+        for (k, v) in iter {
+            self.span
+                .entry(k)
+                .or_insert_with(|| ((), Vec::new()))
+                .1
+                .push(v);
+        }
+    }
+}
+
 /// Per-task DAP parameters.
 #[derive(Clone, Deserialize, Serialize)]
 pub struct DapTaskConfig {
@@ -484,32 +510,35 @@ impl DapTaskConfig {
         &self,
         part_batch_sel: &'sel PartialBatchSelector,
         consumed_reports: impl Iterator<Item = &'rep EarlyReportStateConsumed<'rep>>,
-    ) -> Result<HashMap<DapBatchBucket, Vec<&'rep EarlyReportStateConsumed<'rep>>>, DapError> {
+    ) -> Result<DapAggregateSpan<()>, DapError> {
         if !self.query.is_valid_part_batch_sel(part_batch_sel) {
             return Err(fatal_error!(
                 err = "partial batch selector not compatible with task",
             ));
         }
+        Ok(consumed_reports
+            .filter(|consumed_report| consumed_report.is_ready())
+            .map(|consumed_report| {
+                let bucket = self.bucket_for(part_batch_sel, consumed_report);
+                let metadata = consumed_report.metadata();
+                (bucket, (metadata.id.clone(), metadata.time))
+            })
+            .collect())
+    }
 
-        let mut span: HashMap<_, Vec<_>> = HashMap::new();
-        for consumed_report in consumed_reports.filter(|consumed_report| consumed_report.is_ready())
-        {
-            let bucket = match part_batch_sel {
-                PartialBatchSelector::TimeInterval => DapBatchBucket::TimeInterval {
-                    batch_window: self.quantized_time_lower_bound(consumed_report.metadata().time),
-                },
-                PartialBatchSelector::FixedSizeByBatchId { batch_id } => {
-                    DapBatchBucket::FixedSize {
-                        batch_id: batch_id.clone(),
-                    }
-                }
-            };
-
-            let consumed_reports_per_bucket = span.entry(bucket).or_default();
-            consumed_reports_per_bucket.push(consumed_report);
+    pub fn bucket_for(
+        &self,
+        part_batch_sel: &PartialBatchSelector,
+        consumed_report: &EarlyReportStateConsumed<'_>,
+    ) -> DapBatchBucket {
+        match part_batch_sel {
+            PartialBatchSelector::TimeInterval => DapBatchBucket::TimeInterval {
+                batch_window: self.quantized_time_lower_bound(consumed_report.metadata().time),
+            },
+            PartialBatchSelector::FixedSizeByBatchId { batch_id } => DapBatchBucket::FixedSize {
+                batch_id: batch_id.clone(),
+            },
         }
-
-        Ok(span)
     }
 
     /// Check if the batch size is too small. Returns an error if the report count is too large.
diff --git a/daphne/src/roles/mod.rs b/daphne/src/roles/mod.rs
@@ -9,7 +9,7 @@ mod leader;
 
 use crate::{
     constants::DapMediaType,
-    messages::{BatchSelector, ReportMetadata, TaskId, Time, TransitionFailure},
+    messages::{BatchSelector, ReportMetadata, TaskId, Time},
     taskprov::{self, TaskprovVersion},
     DapAbort, DapError, DapQueryConfig, DapRequest, DapTaskConfig,
 };
@@ -103,34 +103,6 @@ async fn check_batch<S>(
     Ok(())
 }
 
-/// Check for transition failures due to:
-///
-/// * the report having already been processed
-/// * the report having already been collected
-/// * the report not being within time bounds
-///
-/// Returns `Some(TransitionFailure)` if there is a problem,
-/// or `None` if no transition failure occurred.
-pub fn early_metadata_check(
-    metadata: &ReportMetadata,
-    processed: bool,
-    collected: bool,
-    min_time: u64,
-    max_time: u64,
-) -> Option<TransitionFailure> {
-    if processed {
-        Some(TransitionFailure::ReportReplayed)
-    } else if collected {
-        Some(TransitionFailure::BatchCollected)
-    } else if metadata.time < min_time {
-        Some(TransitionFailure::ReportDropped)
-    } else if metadata.time > max_time {
-        Some(TransitionFailure::ReportTooEarly)
-    } else {
-        None
-    }
-}
-
 fn check_request_content_type<S>(
     req: &DapRequest<S>,
     expected: DapMediaType,
@@ -195,7 +167,7 @@ async fn resolve_taskprov<S>(
 
 #[cfg(test)]
 mod test {
-    use super::{early_metadata_check, DapAggregator, DapAuthorizedSender, DapHelper, DapLeader};
+    use super::{DapAggregator, DapAuthorizedSender, DapHelper, DapLeader};
     use crate::{
         assert_metrics_include, async_test_version, async_test_versions,
         auth::BearerToken,
@@ -234,6 +206,34 @@ mod test {
         }};
     }
 
+    /// Check for transition failures due to:
+    ///
+    /// * the report having already been processed
+    /// * the report having already been collected
+    /// * the report not being within time bounds
+    ///
+    /// Returns `Some(TransitionFailure)` if there is a problem,
+    /// or `None` if no transition failure occurred.
+    pub fn early_metadata_check(
+        metadata: &ReportMetadata,
+        processed: bool,
+        collected: bool,
+        min_time: u64,
+        max_time: u64,
+    ) -> Option<TransitionFailure> {
+        if processed {
+            Some(TransitionFailure::ReportReplayed)
+        } else if collected {
+            Some(TransitionFailure::BatchCollected)
+        } else if metadata.time < min_time {
+            Some(TransitionFailure::ReportDropped)
+        } else if metadata.time > max_time {
+            Some(TransitionFailure::ReportTooEarly)
+        } else {
+            None
+        }
+    }
+
     pub(super) struct TestData {
         pub now: Time,
         global_config: DapGlobalConfig,
diff --git a/daphne/src/testing.rs b/daphne/src/testing.rs
@@ -13,7 +13,7 @@ use crate::{
         AggregationJobContinueReq, AggregationJobId, AggregationJobInitReq, AggregationJobResp,
         BatchId, BatchSelector, Collection, CollectionJobId, CollectionReq,
         Draft02AggregationJobId, HpkeCiphertext, Interval, PartialBatchSelector, Report, ReportId,
-        ReportMetadata, TaskId, Time, TransitionFailure,
+        TaskId, Time, TransitionFailure,
     },
     metrics::DaphneMetrics,
     roles::{DapAggregator, DapAuthorizedSender, DapHelper, DapLeader, DapReportInitializer},
@@ -697,7 +697,7 @@ impl MockAggregator {
         &self,
         task_id: &TaskId,
         bucket: &DapBatchBucket,
-        metadata: &ReportMetadata,
+        id: &ReportId,
     ) -> Option<TransitionFailure> {
         // Check AggStateStore to see whether the report is part of a batch that has already
         // been collected.
@@ -713,7 +713,7 @@ impl MockAggregator {
             .lock()
             .expect("report_store: failed to lock");
         let report_store = guard.entry(task_id.clone()).or_default();
-        if report_store.processed.contains(&metadata.id) {
+        if report_store.processed.contains(id) {
             return Some(TransitionFailure::ReportReplayed);
         }
 
@@ -920,17 +920,13 @@ impl DapReportInitializer for MockAggregator {
         )?;
 
         let mut early_fails = HashMap::new();
-        for (bucket, reports_consumed_per_bucket) in span.iter() {
-            for metadata in reports_consumed_per_bucket
-                .iter()
-                .map(|report| report.metadata())
-            {
+        for (bucket, ((), report_ids_and_time)) in span.iter() {
+            for (id, _) in report_ids_and_time {
                 // Check whether Report has been collected or replayed.
-                if let Some(transition_failure) = self
-                    .check_report_early_fail(task_id, bucket, metadata)
-                    .await
+                if let Some(transition_failure) =
+                    self.check_report_early_fail(task_id, bucket, id).await
                 {
-                    early_fails.insert(metadata.id.clone(), transition_failure);
+                    early_fails.insert(id.clone(), transition_failure);
                 };
             }
         }
@@ -1233,7 +1229,7 @@ impl DapLeader<BearerToken> for MockAggregator {
 
         // Check whether Report has been collected or replayed.
         if let Some(transition_failure) = self
-            .check_report_early_fail(task_id, &bucket, &report.report_metadata)
+            .check_report_early_fail(task_id, &bucket, &report.report_metadata.id)
             .await
         {
             return Err(DapError::Transition(transition_failure));
diff --git a/daphne/src/vdaf/mod.rs b/daphne/src/vdaf/mod.rs
@@ -187,6 +187,9 @@ impl<'req> EarlyReportStateConsumed<'req> {
         })
     }
 
+    /// Convert this EarlyReportStateConsumed into a rejected [EarlyReportStateInitialized] using
+    /// `failure` as the reason. If this is already a rejected report, the passed in `failure`
+    /// value overwrites the previous one.
     pub fn into_initialized_rejected_due_to(
         self,
         failure: TransitionFailure,
diff --git a/daphne_worker/src/durable/reports_processed.rs b/daphne_worker/src/durable/reports_processed.rs
@@ -14,22 +14,19 @@ use daphne::{
     },
     DapError, VdafConfig,
 };
-use futures::{
-    future::{ready, try_join_all},
-    StreamExt, TryStreamExt,
-};
+use futures::{future::try_join_all, StreamExt, TryStreamExt};
 use prio::codec::{CodecError, ParameterizedDecode};
 use serde::{Deserialize, Serialize};
-use std::{borrow::Cow, collections::HashSet, ops::ControlFlow, time::Duration};
+use std::{borrow::Cow, collections::HashSet, future::ready, ops::ControlFlow, time::Duration};
 use tracing::Instrument;
 use worker::*;
 
-use super::{req_parse, Alarmed, DapDurableObject, GarbageCollectable};
+use super::{req_parse, state_set_if_not_exists, Alarmed, DapDurableObject, GarbageCollectable};
 
 pub(crate) const DURABLE_REPORTS_PROCESSED_INITIALIZE: &str =
     "/internal/do/reports_processed/initialize";
-pub(crate) const DURABLE_REPORTS_PROCESSED_MARK_AGGREGATED: &str =
-    "/internal/do/reports_processed/mark_aggregated";
+pub(crate) const DURABLE_REPORTS_PROCESSED_INITIALIZED: &str =
+    "/internal/do/reports_processed/initialized";
 
 /// Durable Object (DO) for tracking which reports have been processed.
 ///
@@ -63,63 +60,6 @@ impl<'id> From<&'id ReportId> for ReportIdKey<'id> {
     }
 }
 
-#[derive(Debug)]
-enum CheckedReplays<'s> {
-    SomeReplayed(Vec<&'s ReportId>),
-    AllFresh(Vec<ReportIdKey<'s>>),
-}
-
-impl<'r> Default for CheckedReplays<'r> {
-    fn default() -> Self {
-        Self::AllFresh(vec![])
-    }
-}
-
-impl<'r> CheckedReplays<'r> {
-    fn add_replay(mut self, id: &'r ReportId) -> Self {
-        match &mut self {
-            Self::SomeReplayed(r) => {
-                r.push(id);
-                self
-            }
-            Self::AllFresh(_) => Self::SomeReplayed(vec![id]),
-        }
-    }
-
-    fn add_fresh(mut self, id: ReportIdKey<'r>) -> Self {
-        match &mut self {
-            Self::SomeReplayed(_) => {}
-            Self::AllFresh(r) => r.push(id),
-        }
-        self
-    }
-}
-
-impl ReportsProcessed {
-    async fn check_replays<'s>(&self, report_ids: &'s [ReportId]) -> Result<CheckedReplays<'s>> {
-        futures::stream::iter(report_ids.iter().map(ReportIdKey::from))
-            .then(|id| {
-                let state = &self.state;
-                async move {
-                    state_get::<bool>(state, &id.1)
-                        .await
-                        .map(|presence| match presence {
-                            // if it's present then it's a replay
-                            Some(true) => Err(id.0),
-                            Some(false) | None => Ok(id),
-                        })
-                }
-            })
-            .try_fold(CheckedReplays::default(), |acc, id| async move {
-                Ok(match id {
-                    Ok(not_replayed) => acc.add_fresh(not_replayed),
-                    Err(replayed) => acc.add_replay(replayed),
-                })
-            })
-            .await
-    }
-}
-
 #[durable_object]
 impl DurableObject for ReportsProcessed {
     fn new(state: State, env: Env) -> Self {
@@ -166,6 +106,22 @@ impl ReportsProcessed {
         .await?;
 
         match (req.path().as_ref(), req.method()) {
+            (DURABLE_REPORTS_PROCESSED_INITIALIZED, Method::Post) => {
+                let to_mark = req_parse::<Vec<ReportId>>(&mut req).await?;
+                let state = &self.state;
+                let replays = futures::stream::iter(&to_mark)
+                    .map(|id| async move {
+                        state_set_if_not_exists(state, &format!("processed/{id}"), &true)
+                            .await
+                            .map(|o| o.is_some().then_some(id))
+                    })
+                    .buffer_unordered(usize::MAX)
+                    .try_filter_map(|replay| ready(Ok(replay)))
+                    .try_collect::<Vec<_>>()
+                    .await?;
+
+                Response::from_json(&replays)
+            }
             // Initialize a report:
             //  * Ensure the report wasn't replayed
             //  * Ensure the report won't be included in a batch that was already collected
@@ -230,31 +186,6 @@ impl ReportsProcessed {
                 })
             }
 
-            // Mark reports as aggregated.
-            //
-            // If there are any replays, no reports are marked as aggregated.
-            //
-            // Idempotent
-            // Input: `Vec<ReportId>`
-            // Output: `Vec<ReportId>`
-            (DURABLE_REPORTS_PROCESSED_MARK_AGGREGATED, Method::Post) => {
-                let report_ids: Vec<ReportId> = req_parse(&mut req).await?;
-                match self.check_replays(&report_ids).await? {
-                    CheckedReplays::SomeReplayed(report_ids) => Response::from_json(&report_ids),
-                    CheckedReplays::AllFresh(report_ids) => {
-                        let state = &self.state;
-                        futures::stream::iter(&report_ids)
-                            .then(|report_id| async move {
-                                state.storage().put(&report_id.1, &true).await
-                            })
-                            .try_for_each(|_| ready(Ok(())))
-                            .await?;
-
-                        Response::from_json(&[(); 0])
-                    }
-                }
-            }
-
             _ => Err(int_err(format!(
                 "ReportsProcessed: unexpected request: method={:?}; path={:?}",
                 req.method(),
diff --git a/daphne_worker/src/roles/aggregator.rs b/daphne_worker/src/roles/aggregator.rs

Original file line number	Diff line number	Diff line change
`@@ -187,6 +187,9 @@ impl<'req> EarlyReportStateConsumed<'req> {`
`187`	`187`	`})`
`188`	`188`	`}`
`189`	`189`
	`190`	`+ /// Convert this EarlyReportStateConsumed into a rejected [EarlyReportStateInitialized] using`
	`191`	`` + /// `failure` as the reason. If this is already a rejected report, the passed in `failure` ``
	`192`	`+ /// value overwrites the previous one.`
`190`	`193`	`pub fn into_initialized_rejected_due_to(`
`191`	`194`	`self,`
`192`	`195`	`failure: TransitionFailure,`