
Commit 4b62dfc

Authored by leftwo, Nieuwejaar, KaiSforza, JustinAzoff, and david-crespo
Reduce test flakes (#8989)
Various tuning to reduce test flakes during CI.

Decrease the number of tests running at the same time on buildomat runs. The buildomat AWS instance has 8 cores, so we configure the nextest run to use two fewer than that. This reduces overall system load.

Increase some internal timeouts, and add loops to some tests to allow them to finish even if the system is under heavy load.

Added a few debug messages, but they may be of limited value.

---------

Co-authored-by: Nils Nieuwejaar <[email protected]>
Co-authored-by: Kai Sforza <[email protected]>
Co-authored-by: Justin <[email protected]>
Co-authored-by: David Crespo <[email protected]>
Co-authored-by: Ryan Goodfellow <[email protected]>
1 parent ef0a966 commit 4b62dfc
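
The "add loops to some tests" change follows one pattern throughout: poll the condition on a short fixed interval, log progress, and only give up after a generous deadline so a heavily loaded machine still has time to finish. Below is a minimal, self-contained sketch of that pattern; the poll_until helper and the stand-in condition are illustrative only, not code from this commit (the tests themselves use omicron's existing wait_for_condition helper or the hand-rolled loops visible in the diffs below).

use std::time::{Duration, Instant};

/// Poll `check` every `interval` until it returns true, or give up once
/// `deadline` has elapsed. Illustrative helper, not part of the commit.
async fn poll_until(
    mut check: impl FnMut() -> bool,
    interval: Duration,
    deadline: Duration,
) -> Result<(), &'static str> {
    let start = Instant::now();
    loop {
        if check() {
            return Ok(());
        }
        if start.elapsed() >= deadline {
            return Err("condition not met before deadline");
        }
        tokio::time::sleep(interval).await;
    }
}

#[tokio::main]
async fn main() {
    let mut attempts = 0;
    poll_until(
        || {
            attempts += 1;
            attempts > 3 // stand-in for "the replacement finished"
        },
        Duration::from_millis(50), // poll interval used by these tests
        Duration::from_secs(260),  // generous deadline for a loaded CI machine
    )
    .await
    .expect("condition should be reached before the deadline");
}

Under light load the loop exits on the first few iterations, so the longer deadlines only matter when CI is overloaded.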

File tree (7 files changed: +117 -47 lines changed)

.github/buildomat/build-and-test.sh
nexus/src/app/background/tasks/instance_reincarnation.rs
nexus/src/app/sagas/region_snapshot_replacement_start.rs
nexus/test-utils/src/resource_helpers.rs
nexus/tests/integration_tests/crucible_replacements.rs
nexus/tests/integration_tests/instances.rs
sled-agent/src/sim/http_entrypoints_pantry.rs


.github/buildomat/build-and-test.sh

Lines changed: 6 additions & 2 deletions
@@ -128,9 +128,13 @@ ptime -m cargo build -Z unstable-options --timings=json \
 #
 # We apply our own timeout to ensure that we get a normal failure on timeout
 # rather than a buildomat timeout. See oxidecomputer/buildomat#8.
-#
+# To avoid too many tests running at the same time, we choose a test threads
+# 2 less (negative 2) than the default. This avoids many test flakes where
+# the test would have worked but the system was too overloaded and tests
+# take longer than their default timeouts.
 banner test
-ptime -m timeout 2h cargo nextest run --profile ci --locked --verbose
+ptime -m timeout 2h cargo nextest run --profile ci --locked --verbose \
+    --test-threads -2
 
 #
 # https://github.com/nextest-rs/nextest/issues/16
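
For context: nextest interprets a negative --test-threads value relative to the number of available CPUs, so on the 8-core buildomat instance --test-threads -2 runs at most 6 tests at a time instead of the default 8.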

nexus/src/app/background/tasks/instance_reincarnation.rs

Lines changed: 3 additions & 1 deletion
@@ -910,8 +910,10 @@ mod test {
             .await;
 
         // Activate the background task again. Now, only instance 2 should be
-        // restarted.
+        // restarted. Possible test flake here and this adds a bit more debug
+        // if we see this assertion fail.
        let status = assert_activation_ok!(task.activate(&opctx).await);
+        eprintln!("status: {:?}", status);
        assert_eq!(status.total_instances_found(), 1);
        assert_eq!(
            status.instances_reincarnated,
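
Because the test harness captures output from passing tests, the new eprintln! of the activation status only surfaces in CI logs when one of the following assertions fails, which is exactly when the extra detail is useful.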

nexus/src/app/sagas/region_snapshot_replacement_start.rs

Lines changed: 8 additions & 4 deletions
@@ -1534,17 +1534,21 @@ pub(crate) mod test {
         request: &RegionSnapshotReplacement,
     ) {
         let opctx = test_opctx(cptestctx);
+
         let db_request = datastore
             .get_region_snapshot_replacement_request_by_id(&opctx, request.id)
             .await
             .unwrap();
 
         assert_eq!(db_request.new_region_id, None);
-        assert_eq!(
-            db_request.replacement_state,
-            RegionSnapshotReplacementState::Requested
-        );
         assert_eq!(db_request.operating_saga_id, None);
+
+        match db_request.replacement_state {
+            RegionSnapshotReplacementState::Requested => {}
+            x => {
+                panic!("replacement state {:?} != Requested", x);
+            }
+        }
     }
 
     async fn assert_volume_untouched(

nexus/test-utils/src/resource_helpers.rs

Lines changed: 1 addition & 1 deletion
@@ -1688,7 +1688,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> {
                 }
             },
             &Duration::from_millis(50),
-            &Duration::from_secs(30),
+            &Duration::from_secs(120),
         )
         .await
         .expect("expected to find inventory collection");
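
Only the overall deadline changes here (30 s to 120 s); the 50 ms poll interval is unchanged, so the wait still returns quickly whenever the inventory collection shows up promptly.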

nexus/tests/integration_tests/crucible_replacements.rs

Lines changed: 86 additions & 36 deletions
@@ -202,7 +202,7 @@ pub(crate) async fn wait_for_all_replacements(
             }
         },
         &std::time::Duration::from_millis(50),
-        &std::time::Duration::from_secs(180),
+        &std::time::Duration::from_secs(260),
     )
     .await
     .expect("all replacements finished");
@@ -480,7 +480,7 @@ mod region_replacement {
                 }
             },
             &std::time::Duration::from_millis(50),
-            &std::time::Duration::from_secs(60),
+            &std::time::Duration::from_secs(260),
         )
         .await
         .expect("request transitioned to expected state");
@@ -1070,24 +1070,34 @@ async fn test_racing_replacements_for_soft_deleted_disk_volume(
     activate_background_task(&internal_client, "region_replacement_driver")
         .await;
 
-    assert!(match last_background_task.last {
+    let res = match last_background_task.last {
         LastResult::Completed(last_result_completed) => {
             match serde_json::from_value::<RegionReplacementDriverStatus>(
                 last_result_completed.details,
             ) {
                 Err(e) => {
+                    eprintln!("Json not what we expected");
                     eprintln!("{e}");
                     false
                 }
 
-                Ok(v) => !v.drive_invoked_ok.is_empty(),
+                Ok(v) => {
+                    if !v.drive_invoked_ok.is_empty() {
+                        eprintln!("v.drive_ok: {:?}", v.drive_invoked_ok);
+                        true
+                    } else {
+                        eprintln!("v.drive_ok: {:?} empty", v.drive_invoked_ok);
+                        false
+                    }
+                }
             }
         }
-
-        _ => {
+        x => {
+            eprintln!("Unexpected result here: {:?}", x);
             false
         }
-    });
+    };
+    assert!(res);
 
     // wait for the drive saga to complete here
     wait_for_condition(
@@ -1484,20 +1494,33 @@ mod region_snapshot_replacement {
 
             // Assert no volumes are referencing the snapshot address
 
-            let volumes = self
-                .datastore
-                .find_volumes_referencing_socket_addr(
-                    &self.opctx(),
-                    self.snapshot_socket_addr,
-                )
-                .await
-                .unwrap();
+            let mut counter = 1;
+            loop {
+                let volumes = self
+                    .datastore
+                    .find_volumes_referencing_socket_addr(
+                        &self.opctx(),
+                        self.snapshot_socket_addr,
+                    )
+                    .await
+                    .unwrap();
 
-            if !volumes.is_empty() {
-                eprintln!("{:?}", volumes);
+                if !volumes.is_empty() {
+                    eprintln!(
+                        "Volume should be gone, try {counter} {:?}",
+                        volumes
+                    );
+                    tokio::time::sleep(std::time::Duration::from_secs(5)).await;
+                    counter += 1;
+                    if counter > 200 {
+                        panic!(
+                            "Tried 200 times, and still this did not finish"
+                        );
+                    }
+                } else {
+                    break;
+                }
             }
-
-            assert!(volumes.is_empty());
         }
 
         /// Assert no Crucible resources are leaked
@@ -1648,32 +1671,59 @@ mod region_snapshot_replacement {
             match result {
                 InsertStepResult::Inserted { .. } => {}
 
-                _ => {
-                    assert!(
-                        false,
-                        "bad result from create_region_snapshot_replacement_step"
+                InsertStepResult::AlreadyHandled { existing_step_id } => {
+                    let region_snapshot_replace_request = self
+                        .datastore
+                        .get_region_snapshot_replacement_request_by_id(
+                            &self.opctx(),
+                            existing_step_id,
+                        )
+                        .await;
+                    eprintln!(
+                        "we were suppose to create this: {:?} but found it AlreadyHandled, then got {:?}",
+                        self.replacement_request_id,
+                        region_snapshot_replace_request
                     );
+                    panic!("Something else created our replacement");
                 }
             }
         }
 
        pub async fn assert_read_only_target_gone(&self) {
-            let region_snapshot_replace_request = self
-                .datastore
-                .get_region_snapshot_replacement_request_by_id(
-                    &self.opctx(),
-                    self.replacement_request_id,
-                )
-                .await
-                .unwrap();
+            eprintln!(
+                "Starting, replace_request_id: {:?}",
+                self.replacement_request_id
+            );
+            let mut i = 1;
+            loop {
+                let region_snapshot_replace_request = self
+                    .datastore
+                    .get_region_snapshot_replacement_request_by_id(
+                        &self.opctx(),
+                        self.replacement_request_id,
+                    )
+                    .await
+                    .unwrap();
+                eprintln!(
+                    "In loop {i} with rs_replace_request: {:?}",
+                    region_snapshot_replace_request
+                );
 
-            assert!(
-                self.datastore
+                let res = self
+                    .datastore
                     .read_only_target_addr(&region_snapshot_replace_request)
                     .await
-                    .unwrap()
-                    .is_none()
-            );
+                    .unwrap();
+
+                eprintln!("In loop {i} target that should be gone: {:?}", res);
+                if res.is_none() {
+                    // test pass, move on
+                    break;
+                }
+                eprintln!("loop {i}, snapshot that should be gone: {:?}", res);
+                tokio::time::sleep(std::time::Duration::from_secs(5)).await;
+                i += 1;
+            }
         }
 
        pub async fn remove_disk_from_snapshot_rop(&self) {
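
Note that the two new loops have different escape hatches: the volume check gives up after 200 attempts at 5-second intervals (roughly 17 minutes), while assert_read_only_target_gone polls every 5 seconds with no attempt limit and relies on the surrounding test timeout if the read-only target never goes away.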

nexus/tests/integration_tests/instances.rs

Lines changed: 1 addition & 1 deletion
@@ -7656,7 +7656,7 @@ pub async fn instance_wait_for_state_as(
     instance_id: InstanceUuid,
     state: omicron_common::api::external::InstanceState,
 ) -> Instance {
-    const MAX_WAIT: Duration = Duration::from_secs(120);
+    const MAX_WAIT: Duration = Duration::from_secs(320);
 
     slog::info!(
         &client.client_log,
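
This widens the per-state wait in instance_wait_for_state_as from 2 minutes to just over 5, in line with the other timeout bumps in this commit.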

sled-agent/src/sim/http_entrypoints_pantry.rs

Lines changed: 12 additions & 2 deletions
@@ -399,8 +399,18 @@ mod tests {
         let raw_url = format!(
             "https://raw.githubusercontent.com/oxidecomputer/crucible/{part}/openapi/crucible-pantry.json",
         );
-        let raw_json =
-            reqwest::blocking::get(&raw_url).unwrap().text().unwrap();
+
+        // The default timeout of 30 seconds was sometimes not enough
+        // heavy load.
+        let raw_json = reqwest::blocking::Client::builder()
+            .timeout(std::time::Duration::from_secs(120))
+            .build()
+            .unwrap()
+            .get(&raw_url)
+            .send()
+            .unwrap()
+            .text()
+            .unwrap();
         serde_json::from_str(&raw_json).unwrap()
     }
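
The previous reqwest::blocking::get call used the blocking client's default 30-second request timeout; building the client explicitly raises that to 120 seconds, giving the download of the crucible-pantry OpenAPI document from raw.githubusercontent.com more headroom on a loaded test machine.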
