-
Notifications
You must be signed in to change notification settings - Fork 67
[inventory] Add unhealthy zpools from each sled #9615
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 8 commits
ed4d1ee
b30e767
27204f4
b038f6e
25db966
cc6d397
219918c
407a068
c817b9c
d01216d
b1fecc8
8eb5001
7cd8f4b
c5baf76
e6944dc
0458330
4c7cad3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,14 @@ | |
|
|
||
| use crate::{ExecutionError, PFEXEC, execute_async}; | ||
| use camino::{Utf8Path, Utf8PathBuf}; | ||
| use chrono::DateTime; | ||
| use chrono::Utc; | ||
| use schemars::JsonSchema; | ||
| use serde::Deserialize; | ||
| use serde::Serialize; | ||
| use slog::Logger; | ||
| use slog::error; | ||
| use slog::info; | ||
| use std::str::FromStr; | ||
| use tokio::process::Command; | ||
|
|
||
|
|
@@ -60,7 +68,10 @@ pub struct GetInfoError { | |
| err: Error, | ||
| } | ||
|
|
||
| #[derive(Clone, Copy, Debug, PartialEq, Eq)] | ||
| #[derive( | ||
| Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema, | ||
| )] | ||
| #[serde(rename_all = "snake_case")] | ||
| pub enum ZpoolHealth { | ||
| /// The device is online and functioning. | ||
| Online, | ||
|
|
@@ -198,6 +209,87 @@ pub struct PathInPool { | |
| pub path: Utf8PathBuf, | ||
| } | ||
|
|
||
/// Lists unhealthy zpools, parsing errors if any, and the time the health check
/// for zpools ran.
//
// NOTE(review): the field notes below are plain `//` comments on purpose.
// This type derives `JsonSchema`, and `///` doc comments are presumably picked
// up as schema descriptions, which would change the generated schema — confirm
// before promoting them to doc comments.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub struct UnhealthyZpoolsResult {
    // Names of the zpools that `parse` found in a faulted, offline, removed,
    // or unavailable state.
    pub zpools: Vec<String>,
    // One message per output line that `parse` could not interpret (unknown
    // health state, or a missing zpool name).
    pub errors: Vec<String>,
    // Set to the current UTC time when `parse` builds the result; `None` for
    // a result created with `new()`.
    pub time_of_status: Option<DateTime<Utc>>,
}
|
|
||
| impl UnhealthyZpoolsResult { | ||
| pub fn new() -> Self { | ||
| Self { zpools: vec![], errors: vec![], time_of_status: None } | ||
| } | ||
|
|
||
| pub fn is_empty(&self) -> bool { | ||
| self.zpools.is_empty() | ||
| && self.errors.is_empty() | ||
| && self.time_of_status == None | ||
| } | ||
|
|
||
| #[cfg_attr(not(target_os = "illumos"), allow(dead_code))] | ||
| fn parse(log: &Logger, data: &[u8]) -> Self { | ||
| let mut zpools = vec![]; | ||
| let mut errors = vec![]; | ||
| if data.is_empty() { | ||
| return Self { zpools, errors, time_of_status: Some(Utc::now()) }; | ||
| } | ||
|
|
||
| // Example of the response from running `zpool list -Hpo health,name` | ||
| // | ||
| // FAULTED fakepool1 | ||
| // FAULTED fakepool2 | ||
| // ONLINE rpool | ||
| let s = String::from_utf8_lossy(data); | ||
| let lines = s.trim().lines(); | ||
|
|
||
| for line in lines { | ||
| let line = line.trim(); | ||
| let mut pool = line.split_whitespace(); | ||
|
|
||
| if let Some(state_str) = pool.next() { | ||
| // Only attempt to parse a zpool that is in a non-functional | ||
| // state. | ||
| match ZpoolHealth::from_str(state_str) { | ||
| Ok(ZpoolHealth::Faulted) | ||
| | Ok(ZpoolHealth::Offline) | ||
| | Ok(ZpoolHealth::Removed) | ||
| | Ok(ZpoolHealth::Unavailable) => { | ||
| if let Some(name) = pool.next() { | ||
| zpools.push(name.to_string()); | ||
| } else { | ||
| errors.push(format!( | ||
| "Unexpected output line: {line}" | ||
| )); | ||
| error!( | ||
| log, | ||
| "unable to parse; output line missing zpool name"; | ||
| "line" => line, | ||
| ); | ||
| continue; | ||
| } | ||
| } | ||
| // Pool is in a healthy or degraded state, skip it. | ||
| Ok(_) => {} | ||
karencfv marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| Err(e) => { | ||
| errors.push(format!("{e}")); | ||
| info!( | ||
| log, | ||
| "output from 'zpool list' contains a zpool with \ | ||
| an unknown state: {state_str}", | ||
| ); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| Self { zpools, errors, time_of_status: Some(Utc::now()) } | ||
| } | ||
| } | ||
|
|
||
/// Wraps commands for interacting with ZFS pools.
// The only field is a private `()`, so a value of this type cannot be
// constructed outside this module. The functions visible in this file
// (e.g. `status_unhealthy`) are associated functions that take no `self`.
pub struct Zpool(());
|
|
||
|
|
@@ -330,11 +422,138 @@ impl Zpool { | |
| })?; | ||
| Ok(zpool) | ||
| } | ||
|
|
||
/// Lists zpools that are in an unhealthy, non-functional state.
/// Specifically, zpools in any of the following states:
///
/// - Faulted
/// - Offline
/// - Removed
/// - Unavailable
///
/// Runs `zpool list -Hpo health,name` and parses its standard output.
///
/// # Errors
///
/// Returns an [`ExecutionError`] if running the command fails. Output lines
/// that cannot be parsed do not fail the call; they are reported in the
/// `errors` field of the returned [`UnhealthyZpoolsResult`].
#[cfg(target_os = "illumos")]
pub async fn status_unhealthy(
    log: &Logger,
) -> Result<UnhealthyZpoolsResult, ExecutionError> {
    let mut command = Command::new(ZPOOL);
    let cmd = command.args(["list", "-Hpo", "health,name"]);
    info!(log, "Retrieving information from zpools");
    let output = execute_async(cmd).await?;
    // `log` is already a reference; pass it through without re-borrowing.
    let zpool_result = UnhealthyZpoolsResult::parse(log, &output.stdout);
    info!(log, "Successfully retrieved unhealthy zpools");
    Ok(zpool_result)
}
|
|
||
| #[cfg(not(target_os = "illumos"))] | ||
| pub async fn status_unhealthy( | ||
| log: &Logger, | ||
| ) -> Result<UnhealthyZpoolsResult, ExecutionError> { | ||
| info!(log, "OS not illumos, will not retrieve zpool information"); | ||
| let zpool_result = UnhealthyZpoolsResult::new(); | ||
| Ok(zpool_result) | ||
| } | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod test { | ||
| use super::*; | ||
| use slog::Drain; | ||
| use slog::o; | ||
| use slog_term::FullFormat; | ||
| use slog_term::PlainDecorator; | ||
| use slog_term::TestStdoutWriter; | ||
|
|
||
| fn log() -> slog::Logger { | ||
| let decorator = PlainDecorator::new(TestStdoutWriter); | ||
| let drain = FullFormat::new(decorator).build().fuse(); | ||
| let drain = slog_async::Async::new(drain).build().fuse(); | ||
| slog::Logger::root(drain, o!()) | ||
| } | ||
|
|
||
/// A faulted and an unavailable pool are reported; the online pool is not.
#[test]
fn test_unhealthy_zpool_parse_success() {
    let output = r#"FAULTED fakepool1
UNAVAIL fakepool2
ONLINE rpool
"#;

    let logger = log();
    let parsed = UnhealthyZpoolsResult::parse(&logger, output.as_bytes());

    // Exactly the two unhealthy pools, in output order.
    assert_eq!(parsed.zpools, ["fakepool1", "fakepool2"]);
    assert!(parsed.errors.is_empty());
    assert!(parsed.time_of_status.is_some());
}
|
|
||
/// Degraded and online pools are not reported as unhealthy.
#[test]
fn test_unhealthy_zpool_parse_none_success() {
    let output = r#"DEGRADED fakepool1
ONLINE fakepool2
ONLINE rpool
"#;

    let logger = log();
    let parsed = UnhealthyZpoolsResult::parse(&logger, output.as_bytes());

    // No unhealthy pools and no parse errors are expected.
    assert!(parsed.zpools.is_empty());
    assert!(parsed.errors.is_empty());
    assert!(parsed.time_of_status.is_some());
}
|
|
||
/// Empty command output yields no pools and no errors, but still records
/// the time of status.
#[test]
fn test_unhealthy_zpool_empty_success() {
    let output = "";

    let logger = log();
    let parsed = UnhealthyZpoolsResult::parse(&logger, output.as_bytes());

    // No unhealthy pools and no parse errors are expected.
    assert!(parsed.zpools.is_empty());
    assert!(parsed.errors.is_empty());
    assert!(parsed.time_of_status.is_some());
}
|
|
||
/// An unrecognized health state is recorded as an error, while the
/// remaining lines are still parsed.
#[test]
fn test_unhealthy_zpool_parse_unknown_status_fail() {
    let output = r#"BARNACLES! fakepool1
FAULTED fakepool2
ONLINE rpool
"#;

    let logger = log();
    let parsed = UnhealthyZpoolsResult::parse(&logger, output.as_bytes());

    assert_eq!(parsed.zpools, ["fakepool2"]);
    assert_eq!(
        parsed.errors,
        ["Failed to parse output: Unrecognized zpool 'health': BARNACLES!"]
    );
    assert!(parsed.time_of_status.is_some());
}
|
|
||
/// A non-functional state with no zpool name is recorded as an error.
#[test]
fn test_unhealthy_zpool_parse_zpool_fail() {
    let output = r#"FAULTED
ONLINE rpool
"#;

    let logger = log();
    let parsed = UnhealthyZpoolsResult::parse(&logger, output.as_bytes());

    assert!(parsed.zpools.is_empty());
    assert_eq!(parsed.errors, ["Unexpected output line: FAULTED"]);
    assert!(parsed.time_of_status.is_some());
}
|
|
||
| #[test] | ||
| fn test_parse_zpool() { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I went back and forth between just having a list of unhealthy zpools, or associating each zpool with its state. In the end I went with listing the zpools only, but I'm not convinced. We'll be including the information of the health checks in the support bundle, and it'd be useful for them to be able to see what state each zpool is in. Thoughts? @davepacheco @jgallagher
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Associating each zpool with its state sounds good to me; having an explicit entry for "this zpool was healthy" seems safer than inferring "any zpool that isn't explicitly listed as unhealthy must have been healthy".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmmm, I was thinking of only including the "unhealthy" zpools with their associated statuses in this list. Similarly, with `svcs_in_maintenance` I only added the services in maintenance with their associated zones. If I were to include the "healthy" zpools then it wouldn't really be consistent with the services in maintenance, no? My take on the health checks is to only report on things that are in an unhealthy state. Thoughts?