Skip to content

Commit 97528cd

Browse files
authored
saga re-assignment should only grab sagas from Nexus of same generation (#9270)
1 parent 3ca8ec6 commit 97528cd

File tree

5 files changed

+261
-42
lines changed

5 files changed

+261
-42
lines changed

live-tests/tests/test_nexus_add_remove.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use nexus_types::deployment::BlueprintZoneType;
2525
use nexus_types::deployment::PlannerConfig;
2626
use nexus_types::deployment::SledFilter;
2727
use nexus_types::deployment::blueprint_zone_type;
28-
use omicron_common::address::NEXUS_INTERNAL_PORT;
28+
use omicron_common::address::NEXUS_LOCKSTEP_PORT;
2929
use omicron_test_utils::dev::poll::CondCheckError;
3030
use omicron_test_utils::dev::poll::wait_for_condition;
3131
use slog::{debug, info};
@@ -133,7 +133,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) {
133133
assert_eq!(new_zone.kind(), ZoneKind::Nexus);
134134
let new_zone_addr = new_zone.underlay_ip();
135135
let new_zone_sockaddr =
136-
SocketAddrV6::new(new_zone_addr, NEXUS_INTERNAL_PORT, 0, 0);
136+
SocketAddrV6::new(new_zone_addr, NEXUS_LOCKSTEP_PORT, 0, 0);
137137
let new_zone_client = lc.specific_internal_nexus_client(new_zone_sockaddr);
138138

139139
// Wait for the new Nexus zone to show up and be usable.

live-tests/tests/test_nexus_handoff.rs

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,9 @@ async fn test_nexus_handoff(lc: &LiveTestContext) {
131131
.map(|(id, current)| {
132132
(
133133
id,
134-
lc.specific_internal_nexus_client(current.cfg.internal_address),
134+
lc.specific_internal_nexus_client(
135+
current.cfg.lockstep_address(),
136+
),
135137
)
136138
})
137139
.collect();
@@ -193,18 +195,15 @@ async fn test_nexus_handoff(lc: &LiveTestContext) {
193195

194196
// Find the new Nexus zones and make clients for them.
195197
let new_nexus_clients = blueprint_new_nexus
196-
.all_omicron_zones(BlueprintZoneDisposition::is_in_service)
197-
.filter_map(|(_sled_id, z)| {
198-
let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus {
199-
nexus_generation,
200-
internal_address,
201-
..
202-
}) = &z.zone_type
203-
else {
204-
return None;
205-
};
206-
(*nexus_generation == next_generation).then(|| {
207-
(z.id, lc.specific_internal_nexus_client(*internal_address))
198+
.all_nexus_zones(BlueprintZoneDisposition::is_in_service)
199+
.filter_map(|(_sled_id, zone_cfg, nexus_config)| {
200+
(nexus_config.nexus_generation == next_generation).then(|| {
201+
(
202+
zone_cfg.id,
203+
lc.specific_internal_nexus_client(
204+
nexus_config.lockstep_address(),
205+
),
206+
)
208207
})
209208
})
210209
.collect::<BTreeMap<_, _>>();

nexus/reconfigurator/execution/src/sagas.rs

Lines changed: 235 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,36 @@ use nexus_db_queries::context::OpContext;
99
use nexus_db_queries::db::DataStore;
1010
use nexus_types::deployment::{Blueprint, BlueprintZoneDisposition};
1111
use omicron_common::api::external::Error;
12-
use omicron_uuid_kinds::GenericUuid;
12+
use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid};
1313
use slog::{debug, info, warn};
1414

15-
/// For each expunged Nexus zone, re-assign sagas owned by that Nexus to the
16-
/// specified nexus (`nexus_id`).
15+
/// For each expunged Nexus zone in the same generation as the current Nexus,
16+
/// re-assign sagas owned by that Nexus to the specified nexus (`nexus_id`).
17+
///
18+
/// Reassigning sagas in this way is how we ensure that sagas complete even
19+
/// when the Nexus that was running them fails permanently (leading to
20+
/// expungement).
21+
///
22+
/// Reassignment of sagas assigned to any expunged Nexus nodes is a prerequisite
23+
/// for Nexus handoff (during upgrade). That's because in general, Nexus is
24+
/// tightly coupled to saga implementations and it's not safe for different
25+
/// versions of a Nexus to operate on the same saga (even at different times).
26+
/// For more on this, see RFD 289. As a result, we finish all sagas prior to
27+
/// handing off from one version to the next.
28+
///
29+
/// Strictly speaking, it should not be required to limit re-assignment to Nexus
30+
/// instances of the same generation. As mentioned above, sagas do not survive
31+
/// across generations, so the only sagas that exist ought to be from the same
32+
/// generation that Nexus is running. This is a belt-and-suspenders check put
33+
/// in place because the impact of getting this wrong (running a saga created by
34+
/// an older version) could be pretty bad.
35+
///
36+
/// We could use the Nexus image for this rather than the generation.
37+
/// In practice, these should be equivalent here because we bump the Nexus
38+
/// generation on deployed systems when (and only when) there's a new Nexus
39+
/// image. But handoff works in terms of generations -- that's the abstraction
40+
/// we've created for "a group of Nexus instances that speak the same database
41+
/// version and can exchange sagas". So that's what we use here.
1742
pub(crate) async fn reassign_sagas_from_expunged(
1843
opctx: &OpContext,
1944
datastore: &DataStore,
@@ -22,29 +47,12 @@ pub(crate) async fn reassign_sagas_from_expunged(
2247
) -> Result<bool, Error> {
2348
let log = &opctx.log;
2449

25-
// Identify any Nexus zones that have been expunged and need to have sagas
26-
// re-assigned.
27-
//
28-
// TODO: Currently, we take any expunged Nexus instances and attempt to
29-
// assign all their sagas to ourselves. Per RFD 289, we can only re-assign
30-
// sagas between two instances of Nexus that are at the same version. Right
31-
// now this can't happen so there's nothing to do here to ensure that
32-
// constraint. However, once we support allowing the control plane to be
33-
// online _during_ an upgrade, there may be multiple different Nexus
34-
// instances running at the same time. At that point, we will need to make
35-
// sure that we only ever try to assign ourselves sagas from other Nexus
36-
// instances that we know are running the same version as ourselves.
37-
let nexus_zone_ids: Vec<_> = blueprint
38-
.all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup)
39-
.filter_map(|(_, z)| {
40-
z.zone_type
41-
.is_nexus()
42-
.then(|| nexus_db_model::SecId(z.id.into_untyped_uuid()))
43-
})
44-
.collect();
45-
46-
debug!(log, "re-assign sagas: found Nexus instances";
47-
"nexus_zone_ids" => ?nexus_zone_ids);
50+
let nexus_zone_ids = find_expunged_same_generation(blueprint, nexus_id)?;
51+
debug!(
52+
log,
53+
"re-assign sagas: found expunged Nexus instances with matching generation";
54+
"nexus_zone_ids" => ?nexus_zone_ids
55+
);
4856

4957
let result =
5058
datastore.sagas_reassign_sec(opctx, &nexus_zone_ids, nexus_id).await;
@@ -68,3 +76,204 @@ pub(crate) async fn reassign_sagas_from_expunged(
6876
}
6977
}
7078
}
79+
80+
/// Returns the list of Nexus ids for expunged (and ready-to-cleanup) Nexus
81+
/// zones in the same generation as the given Nexus id
82+
fn find_expunged_same_generation(
83+
blueprint: &Blueprint,
84+
nexus_id: SecId,
85+
) -> Result<Vec<SecId>, Error> {
86+
let nexus_zone_id = OmicronZoneUuid::from_untyped_uuid(nexus_id.0);
87+
let active_nexus_generation =
88+
blueprint.find_generation_for_self(nexus_zone_id)?;
89+
Ok(blueprint
90+
.all_nexus_zones(BlueprintZoneDisposition::is_ready_for_cleanup)
91+
.filter_map(|(_sled_id, zone_config, nexus_config)| {
92+
(nexus_config.nexus_generation == active_nexus_generation)
93+
.then_some(zone_config.id)
94+
})
95+
.map(|id| SecId(id.into_untyped_uuid()))
96+
.collect())
97+
}
98+
99+
#[cfg(test)]
100+
mod test {
101+
use super::*;
102+
use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder;
103+
use nexus_reconfigurator_planning::example::ExampleSystemBuilder;
104+
use nexus_reconfigurator_planning::planner::PlannerRng;
105+
use nexus_types::deployment::BlueprintSource;
106+
use omicron_common::api::external::Generation;
107+
use omicron_test_utils::dev::test_setup_log;
108+
use std::collections::BTreeSet;
109+
use uuid::Uuid;
110+
111+
#[test]
112+
fn test_find_expunged_same_generation() {
113+
const TEST_NAME: &str = "test_find_expunged_same_generation";
114+
115+
let logctx = test_setup_log(TEST_NAME);
116+
let log = &logctx.log;
117+
118+
// To do an exhaustive test of `find_expunged_same_generation()`, we
119+
// want some expunged zones and some non-expunged zones in each of two
120+
// different generations.
121+
122+
// First, create a basic blueprint with several Nexus zones in
123+
// generation 1.
124+
let (example, blueprint1) =
125+
ExampleSystemBuilder::new(log, TEST_NAME).nexus_count(4).build();
126+
let g1 = Generation::new();
127+
let g1_nexus_ids: Vec<_> = blueprint1
128+
.all_nexus_zones(BlueprintZoneDisposition::is_in_service)
129+
.map(|(sled_id, zone_config, nexus_config)| {
130+
assert_eq!(nexus_config.nexus_generation, g1);
131+
(sled_id, zone_config.id, zone_config.image_source.clone())
132+
})
133+
.collect();
134+
135+
// Expunge two of these Nexus zones and mark them ready for cleanup
136+
// immediately.
137+
let (g1_expunge_ids, g1_keep_ids) = g1_nexus_ids.split_at(2);
138+
let mut builder = BlueprintBuilder::new_based_on(
139+
log,
140+
&blueprint1,
141+
&example.input,
142+
&example.collection,
143+
"test suite",
144+
PlannerRng::from_entropy(),
145+
)
146+
.expect("new blueprint builder");
147+
148+
for (sled_id, expunge_id, _image_source) in g1_expunge_ids {
149+
builder
150+
.sled_expunge_zone(*sled_id, *expunge_id)
151+
.expect("expunge zone");
152+
builder
153+
.sled_mark_expunged_zone_ready_for_cleanup(
154+
*sled_id,
155+
*expunge_id,
156+
)
157+
.expect("mark zone for cleanup");
158+
}
159+
160+
// Create the same number of Nexus zones in the next generation.
161+
// We'll use the same images.
162+
let g2 = g1.next();
163+
for (sled_id, _zone_id, image_source) in &g1_nexus_ids {
164+
builder
165+
.sled_add_zone_nexus(*sled_id, image_source.clone(), g2)
166+
.expect("add Nexus zone");
167+
}
168+
169+
let blueprint2 = builder.build(BlueprintSource::Test);
170+
let g2_nexus_ids: Vec<_> = blueprint2
171+
.all_nexus_zones(BlueprintZoneDisposition::is_in_service)
172+
.filter_map(|(sled_id, zone_config, nexus_config)| {
173+
(nexus_config.nexus_generation == g2)
174+
.then_some((sled_id, zone_config.id))
175+
})
176+
.collect();
177+
178+
// Now expunge a few of those, too. This time, only the first is going
179+
// to be marked ready for cleanup.
180+
let (g2_expunge_ids, g2_keep_ids) = g2_nexus_ids.split_at(2);
181+
let mut builder = BlueprintBuilder::new_based_on(
182+
log,
183+
&blueprint2,
184+
&example.input,
185+
&example.collection,
186+
"test suite",
187+
PlannerRng::from_entropy(),
188+
)
189+
.expect("new blueprint builder");
190+
191+
let (sled_id, g2_expunged_cleaned_up) = g2_expunge_ids[0];
192+
builder
193+
.sled_expunge_zone(sled_id, g2_expunged_cleaned_up)
194+
.expect("expunge zone");
195+
builder
196+
.sled_mark_expunged_zone_ready_for_cleanup(
197+
g2_expunge_ids[0].0,
198+
g2_expunged_cleaned_up,
199+
)
200+
.expect("mark zone for cleanup");
201+
202+
let (sled_id, g2_expunged_not_cleaned_up) = g2_expunge_ids[1];
203+
builder
204+
.sled_expunge_zone(sled_id, g2_expunged_not_cleaned_up)
205+
.expect("expunge zone");
206+
207+
let blueprint3 = builder.build(BlueprintSource::Test);
208+
209+
// Finally, we have:
210+
//
211+
// - g1_keep_ids: two in-service Nexus zones in generation 1
212+
// - g1_expunge_ids: two expunged Nexus zones in generation 1,
213+
// both cleaned up
214+
// - g2_keep_ids: two in-service Nexus zones in generation 2
215+
// - g2_expunge_ids: expunged Nexus zones in generation 2,
216+
// only the first of which is ready for cleanup
217+
//
218+
// Now we can exhaustively test various cases.
219+
220+
// For the in-service zones in generation 1, we should find the expunged
221+
// zones in generation 1.
222+
let g1_matched: BTreeSet<SecId> = g1_expunge_ids
223+
.into_iter()
224+
.map(|(_sled_id, zone_id, _image_source)| {
225+
SecId(zone_id.into_untyped_uuid())
226+
})
227+
.collect();
228+
for (_sled_id, zone_id, _image_source) in g1_keep_ids {
229+
let matched = find_expunged_same_generation(
230+
&blueprint3,
231+
SecId(zone_id.into_untyped_uuid()),
232+
)
233+
.unwrap();
234+
assert_eq!(
235+
matched.into_iter().collect::<BTreeSet<_>>(),
236+
g1_matched
237+
);
238+
}
239+
240+
// It should be impossible in a real system for the
241+
// expunged-and-ready-to-cleanup zones to execute this function. Being
242+
// ready to cleanup means we know they're gone. So there's nothing to
243+
// test here.
244+
245+
// For the in-service zones in generation 2, we should find the
246+
// expunged-and-ready-for-cleanup zone in generation 2.
247+
let g2_matched = SecId(g2_expunged_cleaned_up.into_untyped_uuid());
248+
for (_sled_id, zone_id) in g2_keep_ids {
249+
let matched = find_expunged_same_generation(
250+
&blueprint3,
251+
SecId(zone_id.into_untyped_uuid()),
252+
)
253+
.unwrap();
254+
assert_eq!(matched.len(), 1);
255+
assert_eq!(matched[0], g2_matched);
256+
}
257+
258+
// It is possible for the expunged and not-yet-ready-for-cleanup zone in
259+
// generation 2 to wind up calling this function. It should not find
260+
// itself!
261+
let matched = find_expunged_same_generation(
262+
&blueprint3,
263+
SecId(g2_expunged_not_cleaned_up.into_untyped_uuid()),
264+
)
265+
.unwrap();
266+
assert_eq!(matched.len(), 1);
267+
assert_eq!(matched[0], g2_matched);
268+
269+
// Test the sole error case: if we cannot figure out which generation we
270+
// were given.
271+
let error =
272+
find_expunged_same_generation(&blueprint3, SecId(Uuid::new_v4()))
273+
.expect_err("made-up Nexus should not exist");
274+
assert!(matches!(error, Error::InternalError { internal_message }
275+
if internal_message.contains("did not find Nexus")));
276+
277+
logctx.cleanup_successful();
278+
}
279+
}

nexus/types/src/deployment.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,7 @@ impl Blueprint {
479479
nexus_id: OmicronZoneUuid,
480480
) -> Result<Generation, Error> {
481481
for (_sled_id, zone_config, nexus_config) in
482-
self.all_nexus_zones(BlueprintZoneDisposition::is_in_service)
482+
self.all_nexus_zones(BlueprintZoneDisposition::could_be_running)
483483
{
484484
if zone_config.id == nexus_id {
485485
return Ok(nexus_config.nexus_generation);

nexus/types/src/deployment/zone_type.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,17 @@ pub mod blueprint_zone_type {
577577
pub nexus_generation: Generation,
578578
}
579579

580+
impl Nexus {
581+
pub fn lockstep_address(&self) -> SocketAddrV6 {
582+
SocketAddrV6::new(
583+
*self.internal_address.ip(),
584+
self.lockstep_port,
585+
0,
586+
0,
587+
)
588+
}
589+
}
590+
580591
#[derive(
581592
Debug,
582593
Clone,

0 commit comments

Comments
 (0)