@@ -9,11 +9,36 @@ use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::DataStore;
 use nexus_types::deployment::{Blueprint, BlueprintZoneDisposition};
 use omicron_common::api::external::Error;
-use omicron_uuid_kinds::GenericUuid;
+use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid};
 use slog::{debug, info, warn};
 
-/// For each expunged Nexus zone, re-assign sagas owned by that Nexus to the
-/// specified nexus (`nexus_id`).
+/// For each expunged Nexus zone in the same generation as the current Nexus,
+/// re-assign sagas owned by that Nexus to the specified nexus (`nexus_id`).
+///
+/// Reassigning sagas in this way is how we ensure that sagas complete even
+/// when the Nexus that was running them fails permanently (leading to
+/// expungement).
+///
+/// Reassignment of sagas assigned to any expunged Nexus nodes is a prerequisite
+/// for Nexus handoff (during upgrade). That's because in general, Nexus is
+/// tightly coupled to saga implementations and it's not safe for different
+/// versions of a Nexus to operate on the same saga (even at different times).
+/// For more on this, see RFD 289. As a result, we finish all sagas prior to
+/// handing off from one version to the next.
+///
+/// Strictly speaking, it should not be required to limit re-assignment to Nexus
+/// instances of the same generation. As mentioned above, sagas do not survive
+/// across generations, so the only sagas that exist ought to be from the same
+/// generation that Nexus is running. This is a belt-and-suspenders check put
+/// in place because the impact of getting this wrong (running a saga created by
+/// an older version) could be pretty bad.
+///
+/// We could use the Nexus image for this rather than the generation.
+/// In practice, these should be equivalent here because we bump the Nexus
+/// generation on deployed systems when (and only when) there's a new Nexus
+/// image. But handoff works in terms of generations -- that's the abstraction
+/// we've created for "a group of Nexus instances that speak the same database
+/// version and can exchange sagas". So that's what we use here.
 pub(crate) async fn reassign_sagas_from_expunged(
     opctx: &OpContext,
     datastore: &DataStore,
@@ -22,29 +47,12 @@ pub(crate) async fn reassign_sagas_from_expunged(
 ) -> Result<bool, Error> {
     let log = &opctx.log;
 
-    // Identify any Nexus zones that have been expunged and need to have sagas
-    // re-assigned.
-    //
-    // TODO: Currently, we take any expunged Nexus instances and attempt to
-    // assign all their sagas to ourselves. Per RFD 289, we can only re-assign
-    // sagas between two instances of Nexus that are at the same version. Right
-    // now this can't happen so there's nothing to do here to ensure that
-    // constraint. However, once we support allowing the control plane to be
-    // online _during_ an upgrade, there may be multiple different Nexus
-    // instances running at the same time. At that point, we will need to make
-    // sure that we only ever try to assign ourselves sagas from other Nexus
-    // instances that we know are running the same version as ourselves.
-    let nexus_zone_ids: Vec<_> = blueprint
-        .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup)
-        .filter_map(|(_, z)| {
-            z.zone_type
-                .is_nexus()
-                .then(|| nexus_db_model::SecId(z.id.into_untyped_uuid()))
-        })
-        .collect();
-
-    debug!(log, "re-assign sagas: found Nexus instances";
-        "nexus_zone_ids" => ?nexus_zone_ids);
+    let nexus_zone_ids = find_expunged_same_generation(blueprint, nexus_id)?;
+    debug!(
+        log,
+        "re-assign sagas: found expunged Nexus instances with matching generation";
+        "nexus_zone_ids" => ?nexus_zone_ids
+    );
 
     let result =
         datastore.sagas_reassign_sec(opctx, &nexus_zone_ids, nexus_id).await;
@@ -68,3 +76,204 @@ pub(crate) async fn reassign_sagas_from_expunged(
         }
     }
 }
+
+/// Returns the list of Nexus ids for expunged (and ready-to-cleanup) Nexus
+/// zones in the same generation as the given Nexus id
+fn find_expunged_same_generation(
+    blueprint: &Blueprint,
+    nexus_id: SecId,
+) -> Result<Vec<SecId>, Error> {
+    let nexus_zone_id = OmicronZoneUuid::from_untyped_uuid(nexus_id.0);
+    let active_nexus_generation =
+        blueprint.find_generation_for_self(nexus_zone_id)?;
+    Ok(blueprint
+        .all_nexus_zones(BlueprintZoneDisposition::is_ready_for_cleanup)
+        .filter_map(|(_sled_id, zone_config, nexus_config)| {
+            (nexus_config.nexus_generation == active_nexus_generation)
+                .then_some(zone_config.id)
+        })
+        .map(|id| SecId(id.into_untyped_uuid()))
+        .collect())
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder;
+    use nexus_reconfigurator_planning::example::ExampleSystemBuilder;
+    use nexus_reconfigurator_planning::planner::PlannerRng;
+    use nexus_types::deployment::BlueprintSource;
+    use omicron_common::api::external::Generation;
+    use omicron_test_utils::dev::test_setup_log;
+    use std::collections::BTreeSet;
+    use uuid::Uuid;
+
+    #[test]
+    fn test_find_expunged_same_generation() {
+        const TEST_NAME: &str = "test_find_expunged_same_generation";
+
+        let logctx = test_setup_log(TEST_NAME);
+        let log = &logctx.log;
+
+        // To do an exhaustive test of `find_expunged_same_generation()`, we
+        // want some expunged zones and some non-expunged zones in each of two
+        // different generations.
+
+        // First, create a basic blueprint with several Nexus zones in
+        // generation 1.
+        let (example, blueprint1) =
+            ExampleSystemBuilder::new(log, TEST_NAME).nexus_count(4).build();
+        let g1 = Generation::new();
+        let g1_nexus_ids: Vec<_> = blueprint1
+            .all_nexus_zones(BlueprintZoneDisposition::is_in_service)
+            .map(|(sled_id, zone_config, nexus_config)| {
+                assert_eq!(nexus_config.nexus_generation, g1);
+                (sled_id, zone_config.id, zone_config.image_source.clone())
+            })
+            .collect();
+
+        // Expunge two of these Nexus zones and mark them ready for cleanup
+        // immediately.
+        let (g1_expunge_ids, g1_keep_ids) = g1_nexus_ids.split_at(2);
+        let mut builder = BlueprintBuilder::new_based_on(
+            log,
+            &blueprint1,
+            &example.input,
+            &example.collection,
+            "test suite",
+            PlannerRng::from_entropy(),
+        )
+        .expect("new blueprint builder");
+
+        for (sled_id, expunge_id, _image_source) in g1_expunge_ids {
+            builder
+                .sled_expunge_zone(*sled_id, *expunge_id)
+                .expect("expunge zone");
+            builder
+                .sled_mark_expunged_zone_ready_for_cleanup(
+                    *sled_id,
+                    *expunge_id,
+                )
+                .expect("mark zone for cleanup");
+        }
+
+        // Create the same number of Nexus zones in the next generation.
+        // We'll use the same images.
+        let g2 = g1.next();
+        for (sled_id, _zone_id, image_source) in &g1_nexus_ids {
+            builder
+                .sled_add_zone_nexus(*sled_id, image_source.clone(), g2)
+                .expect("add Nexus zone");
+        }
+
+        let blueprint2 = builder.build(BlueprintSource::Test);
+        let g2_nexus_ids: Vec<_> = blueprint2
+            .all_nexus_zones(BlueprintZoneDisposition::is_in_service)
+            .filter_map(|(sled_id, zone_config, nexus_config)| {
+                (nexus_config.nexus_generation == g2)
+                    .then_some((sled_id, zone_config.id))
+            })
+            .collect();
+
+        // Now expunge a few of those, too. This time, only the first is going
+        // to be marked ready for cleanup.
+        let (g2_expunge_ids, g2_keep_ids) = g2_nexus_ids.split_at(2);
+        let mut builder = BlueprintBuilder::new_based_on(
+            log,
+            &blueprint2,
+            &example.input,
+            &example.collection,
+            "test suite",
+            PlannerRng::from_entropy(),
+        )
+        .expect("new blueprint builder");
+
+        let (sled_id, g2_expunged_cleaned_up) = g2_expunge_ids[0];
+        builder
+            .sled_expunge_zone(sled_id, g2_expunged_cleaned_up)
+            .expect("expunge zone");
+        builder
+            .sled_mark_expunged_zone_ready_for_cleanup(
+                g2_expunge_ids[0].0,
+                g2_expunged_cleaned_up,
+            )
+            .expect("mark zone for cleanup");
+
+        let (sled_id, g2_expunged_not_cleaned_up) = g2_expunge_ids[1];
+        builder
+            .sled_expunge_zone(sled_id, g2_expunged_not_cleaned_up)
+            .expect("expunge zone");
+
+        let blueprint3 = builder.build(BlueprintSource::Test);
+
+        // Finally, we have:
+        //
+        // - g1_keep_ids: two in-service Nexus zones in generation 1
+        // - g1_expunge_ids: two expunged Nexus zones in generation 1,
+        //   both cleaned up
+        // - g2_keep_ids: two in-service Nexus zones in generation 2
+        // - g2_expunge_ids: expunged Nexus zones in generation 2,
+        //   only the first of which is ready for cleanup
+        //
+        // Now we can exhaustively test various cases.
+
+        // For the in-service zones in generation 1, we should find the expunged
+        // zones in generation 1.
+        let g1_matched: BTreeSet<SecId> = g1_expunge_ids
+            .into_iter()
+            .map(|(_sled_id, zone_id, _image_source)| {
+                SecId(zone_id.into_untyped_uuid())
+            })
+            .collect();
+        for (_sled_id, zone_id, _image_source) in g1_keep_ids {
+            let matched = find_expunged_same_generation(
+                &blueprint3,
+                SecId(zone_id.into_untyped_uuid()),
+            )
+            .unwrap();
+            assert_eq!(
+                matched.into_iter().collect::<BTreeSet<_>>(),
+                g1_matched
+            );
+        }
+
+        // It should be impossible in a real system for the
+        // expunged-and-ready-to-cleanup zones to execute this function. Being
+        // ready to cleanup means we know they're gone. So there's nothing to
+        // test here.
+
+        // For the in-service zones in generation 2, we should find the
+        // expunged-and-ready-for-cleanup zone in generation 2.
+        let g2_matched = SecId(g2_expunged_cleaned_up.into_untyped_uuid());
+        for (_sled_id, zone_id) in g2_keep_ids {
+            let matched = find_expunged_same_generation(
+                &blueprint3,
+                SecId(zone_id.into_untyped_uuid()),
+            )
+            .unwrap();
+            assert_eq!(matched.len(), 1);
+            assert_eq!(matched[0], g2_matched);
+        }
+
+        // It is possible for the expunged and not-yet-ready-for-cleanup zone in
+        // generation 2 to wind up calling this function. It should not find
+        // itself!
+        let matched = find_expunged_same_generation(
+            &blueprint3,
+            SecId(g2_expunged_not_cleaned_up.into_untyped_uuid()),
+        )
+        .unwrap();
+        assert_eq!(matched.len(), 1);
+        assert_eq!(matched[0], g2_matched);
+
+        // Test the sole error case: if we cannot figure out which generation we
+        // were given.
+        let error =
+            find_expunged_same_generation(&blueprint3, SecId(Uuid::new_v4()))
+                .expect_err("made-up Nexus should not exist");
+        assert!(matches!(error, Error::InternalError { internal_message }
+            if internal_message.contains("did not find Nexus")));
+
+        logctx.cleanup_successful();
+    }
+}