Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion illumos-utils/src/svcs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ impl SvcsInMaintenanceResult {
));
error!(
log,
"unable to parse; output line missing zone:";
"unable to parse; output line missing zone";
"line" => line,
);
continue;
Expand Down
221 changes: 220 additions & 1 deletion illumos-utils/src/zpool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@

use crate::{ExecutionError, PFEXEC, execute_async};
use camino::{Utf8Path, Utf8PathBuf};
use chrono::DateTime;
use chrono::Utc;
use schemars::JsonSchema;
use serde::Deserialize;
use serde::Serialize;
use slog::Logger;
use slog::error;
use slog::info;
use std::str::FromStr;
use tokio::process::Command;

Expand Down Expand Up @@ -60,7 +68,10 @@ pub struct GetInfoError {
err: Error,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[derive(
Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema,
)]
#[serde(rename_all = "snake_case")]
pub enum ZpoolHealth {
/// The device is online and functioning.
Online,
Expand Down Expand Up @@ -198,6 +209,87 @@ pub struct PathInPool {
pub path: Utf8PathBuf,
}

/// Lists unhealthy zpools, parsing errors if any, and the time the health check
/// for zpools ran.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub struct UnhealthyZpoolsResult {
pub zpools: Vec<String>,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went back and forth between just having a list of unhealthy zpools, or associating each zpool with it's state. In the end I went with listing the zpools only, but I'm not convinced. We'll be including the information of the health checks in the support bundle, and it'd be useful for them to be able to see what state each zpool is in. Thoughts? @davepacheco @jgallagher

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Associating each zpool with its state sounds good to me; having an explicit entry for "this zpool was healthy" seems safer than inferring "any zpool that isn't explicitly listed as unhealthy must have been healthy".

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm, I was thinking of only including the "unhealthy" zpools with their associated statuses in this list. Similarly with the svcs_in_maintenance I only added the services in maintenance with their associated zones. If I were to include the "healthy" zpools then it wouldn't really be consistent with the services in maintenance no?

My take on the health checks is to only report on things that are in an unhealthy state. Thoughts?

pub errors: Vec<String>,
pub time_of_status: Option<DateTime<Utc>>,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need to be optional? If we have a result, it must have been collected at some time, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is meant for when this runs on non-illumos environements

https://github.com/oxidecomputer/omicron/pull/9615/files#diff-07cfb91c0643fac6671e8b81317c92cbe806ab07e2af6fdce16db75315245e32R446-R453

The idea is that there is no time of status because no command was ever run.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also there is a point in time where health monitor may not have run yet -> #9589 (comment)

}

impl UnhealthyZpoolsResult {
pub fn new() -> Self {
Self { zpools: vec![], errors: vec![], time_of_status: None }
}

pub fn is_empty(&self) -> bool {
self.zpools.is_empty()
&& self.errors.is_empty()
&& self.time_of_status == None
}

#[cfg_attr(not(target_os = "illumos"), allow(dead_code))]
fn parse(log: &Logger, data: &[u8]) -> Self {
let mut zpools = vec![];
let mut errors = vec![];
if data.is_empty() {
return Self { zpools, errors, time_of_status: Some(Utc::now()) };
}

// Example of the response from running `zpool list -Hpo health,name`
//
// FAULTED fakepool1
// FAULTED fakepool2
// ONLINE rpool
let s = String::from_utf8_lossy(data);
let lines = s.trim().lines();

for line in lines {
let line = line.trim();
let mut pool = line.split_whitespace();

if let Some(state_str) = pool.next() {
// Only attempt to parse a zpool that is in a non-functional
// state.
match ZpoolHealth::from_str(state_str) {
Ok(ZpoolHealth::Faulted)
| Ok(ZpoolHealth::Offline)
| Ok(ZpoolHealth::Removed)
| Ok(ZpoolHealth::Unavailable) => {
if let Some(name) = pool.next() {
zpools.push(name.to_string());
} else {
errors.push(format!(
"Unexpected output line: {line}"
));
error!(
log,
"unable to parse; output line missing zpool name";
"line" => line,
);
continue;
}
}
// Pool is in a healthy or degraded state, skip it.
Ok(_) => {}
Err(e) => {
errors.push(format!("{e}"));
info!(
log,
"output from 'zpool list' contains a zpool with \
an unknown state: {state_str}",
);
}
}
}
}

Self { zpools, errors, time_of_status: Some(Utc::now()) }
}
}

/// Wraps commands for interacting with ZFS pools.
pub struct Zpool(());

Expand Down Expand Up @@ -330,11 +422,138 @@ impl Zpool {
})?;
Ok(zpool)
}

/// Lists zpools that are in a unhealthy non-functional state. Specifically
/// if they are in the following states:
///
/// - Faulted
/// - Offline
/// - Removed
/// - Unavailable
#[cfg(target_os = "illumos")]
pub async fn status_unhealthy(
log: &Logger,
) -> Result<UnhealthyZpoolsResult, ExecutionError> {
let mut command = Command::new(ZPOOL);
let cmd = command.args(&["list", "-Hpo", "health,name"]);
info!(log, "Retrieving information from zpools");
let output = execute_async(cmd).await?;
let zpool_result = UnhealthyZpoolsResult::parse(&log, &output.stdout);
info!(log, "Successfully retrieved unhealthy zpools");
Ok(zpool_result)
}

#[cfg(not(target_os = "illumos"))]
pub async fn status_unhealthy(
log: &Logger,
) -> Result<UnhealthyZpoolsResult, ExecutionError> {
info!(log, "OS not illumos, will not retrieve zpool information");
let zpool_result = UnhealthyZpoolsResult::new();
Ok(zpool_result)
}
}

#[cfg(test)]
mod test {
use super::*;
use slog::Drain;
use slog::o;
use slog_term::FullFormat;
use slog_term::PlainDecorator;
use slog_term::TestStdoutWriter;

fn log() -> slog::Logger {
let decorator = PlainDecorator::new(TestStdoutWriter);
let drain = FullFormat::new(decorator).build().fuse();
let drain = slog_async::Async::new(drain).build().fuse();
slog::Logger::root(drain, o!())
}

#[test]
fn test_unhealthy_zpool_parse_success() {
let output = r#"FAULTED fakepool1
UNAVAIL fakepool2
ONLINE rpool
"#;

let log = log();
let result = UnhealthyZpoolsResult::parse(&log, output.as_bytes());

// We want to make sure we only have two unhealthy pools
assert_eq!(
result.zpools,
vec!["fakepool1".to_string(), "fakepool2".to_string()]
);
assert_eq!(result.errors.len(), 0);
assert!(result.time_of_status.is_some());
}

#[test]
fn test_unhealthy_zpool_parse_none_success() {
let output = r#"DEGRADED fakepool1
ONLINE fakepool2
ONLINE rpool
"#;

let log = log();
let result = UnhealthyZpoolsResult::parse(&log, output.as_bytes());

// We want to make sure we only have zero unhealthy pools
assert_eq!(result.zpools.len(), 0);
assert_eq!(result.errors.len(), 0);
assert!(result.time_of_status.is_some());
}

#[test]
fn test_unhealthy_zpool_empty_success() {
let output = r#""#;

let log = log();
let result = UnhealthyZpoolsResult::parse(&log, output.as_bytes());

// We want to make sure we only have zero unhealthy pools
assert_eq!(result.zpools.len(), 0);
assert_eq!(result.errors.len(), 0);
assert!(result.time_of_status.is_some());
}

#[test]
fn test_unhealthy_zpool_parse_unknown_status_fail() {
let output = r#"BARNACLES! fakepool1
FAULTED fakepool2
ONLINE rpool
"#;

let log = log();
let result = UnhealthyZpoolsResult::parse(&log, output.as_bytes());

assert_eq!(result.zpools, vec!["fakepool2".to_string()]);
assert_eq!(
result.errors,
vec![
"Failed to parse output: Unrecognized zpool 'health': BARNACLES!"
.to_string(),
]
);
assert!(result.time_of_status.is_some());
}

#[test]
fn test_unhealthy_zpool_parse_zpool_fail() {
let output = r#"FAULTED
ONLINE rpool
"#;

let log = log();
let result = UnhealthyZpoolsResult::parse(&log, output.as_bytes());

assert_eq!(result.zpools.len(), 0);
assert_eq!(
result.errors,
vec!["Unexpected output line: FAULTED".to_string(),],
);
assert!(result.time_of_status.is_some());
}

#[test]
fn test_parse_zpool() {
Expand Down
Loading
Loading