diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index d6d6f89b40b..aa9a4decc04 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -700,20 +700,23 @@ task: "fm_analysis" configured period: every m last completed activation: , triggered by started at (s ago) and ran for ms - parent sitrep ID: None + parent sitrep ID: Some(..................... (sitrep)) current inventory collection ID: Some(..................... (collection)) ereport classes consumed: (none) FAULT MANAGEMENT ANALYSIS SUMMARY ================================= -/!\ analysis failed: FM analysis is not yet implemented + no changes from the current situation report (Some(..................... (sitrep))) fault management analysis inputs -------------------------------- - parent sitrep: + parent sitrep: ..................... inventory collection: ..................... + no new ereports since the parent sitrep no cases copied forward + no in-service control plane disks + fault management analysis report -------------------------------- sitrep ID: ..................... @@ -724,28 +727,28 @@ task: "fm_rendezvous" configured period: every m last completed activation: , triggered by started at (s ago) and ran for ms -(i) no FM situation report loaded, so rendezvous was not performed + current sitrep: ..................... creating requested alerts: -(i) note: this operation was not executed + started at (s ago) and ran for ms alerts requested: 0 requested in this sitrep: 0 created in this activation: 0 already created: 0 errors: 0 creating requested support bundles: -(i) note: this operation was not executed + started at (s ago) and ran for ms support bundles requested: 0 requested in this sitrep: 0 created in this activation: 0 already created: 0 errors: 0 marking ereports as seen: -(i) note: this operation was not executed + started at (s ago) and ran for ms total ereports in sitrep: 0 not marked when the sitrep was loaded: 0 marked seen by this activation: 0 already marked seen: 0 - batch size: 0 + batch size: 1000 batches: 0 errors: 0 @@ -762,6 +765,8 @@ task: "fm_sitrep_gc" batches: 1 orphaned fm_ereport_in_case rows deleted: 0 batches: 1 + orphaned fm_fact rows deleted: 0 + batches: 1 orphaned fm_support_bundle_request rows deleted: 0 batches: 1 orphaned fm_support_bundle_request_data_selection_ereports rows deleted: 0 @@ -775,7 +780,9 @@ task: "fm_sitrep_loader" configured period: every s last completed activation: , triggered by started at (s ago) and ran for ms - no FM situation report available to load + loaded latest FM situation report as of : + sitrep ..................... (sitrep) (v1) + made current at: task: "instance_reincarnation" configured period: every m @@ -1391,20 +1398,23 @@ task: "fm_analysis" configured period: every m last completed activation: , triggered by started at (s ago) and ran for ms - parent sitrep ID: None + parent sitrep ID: Some(..................... (sitrep)) current inventory collection ID: Some(..................... (collection)) ereport classes consumed: (none) FAULT MANAGEMENT ANALYSIS SUMMARY ================================= -/!\ analysis failed: FM analysis is not yet implemented + no changes from the current situation report (Some(..................... (sitrep))) fault management analysis inputs -------------------------------- - parent sitrep: + parent sitrep: ..................... inventory collection: ..................... + no new ereports since the parent sitrep no cases copied forward + no in-service control plane disks + fault management analysis report -------------------------------- sitrep ID: ..................... @@ -1415,28 +1425,28 @@ task: "fm_rendezvous" configured period: every m last completed activation: , triggered by started at (s ago) and ran for ms -(i) no FM situation report loaded, so rendezvous was not performed + current sitrep: ..................... creating requested alerts: -(i) note: this operation was not executed + started at (s ago) and ran for ms alerts requested: 0 requested in this sitrep: 0 created in this activation: 0 already created: 0 errors: 0 creating requested support bundles: -(i) note: this operation was not executed + started at (s ago) and ran for ms support bundles requested: 0 requested in this sitrep: 0 created in this activation: 0 already created: 0 errors: 0 marking ereports as seen: -(i) note: this operation was not executed + started at (s ago) and ran for ms total ereports in sitrep: 0 not marked when the sitrep was loaded: 0 marked seen by this activation: 0 already marked seen: 0 - batch size: 0 + batch size: 1000 batches: 0 errors: 0 @@ -1453,6 +1463,8 @@ task: "fm_sitrep_gc" batches: 1 orphaned fm_ereport_in_case rows deleted: 0 batches: 1 + orphaned fm_fact rows deleted: 0 + batches: 1 orphaned fm_support_bundle_request rows deleted: 0 batches: 1 orphaned fm_support_bundle_request_data_selection_ereports rows deleted: 0 @@ -1466,7 +1478,9 @@ task: "fm_sitrep_loader" configured period: every s last completed activation: , triggered by started at (s ago) and ran for ms - no FM situation report available to load + loaded latest FM situation report as of : + sitrep ..................... (sitrep) (v1) + made current at: task: "instance_reincarnation" configured period: every m diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 92c6e4e186d..34be86b23f1 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -187,6 +187,12 @@ async fn test_omdb_success_cases() { .wait_for_at_least_one_inventory_collection(Duration::from_secs(60)) .await; + // Wait until `fm_analysis` has committed at least one sitrep, so that the + // omdb snapshot for FM tasks is stable. (Otherwise sitrep IDs render as + // `None` or `Some(...)` depending on whether the task's natural cadence + // had landed by the time we sample it.) + cptestctx.wait_for_at_least_one_sitrep(Duration::from_secs(60)).await; + let mut output = String::new(); let invocations: &[&[&str]] = &[ @@ -350,6 +356,15 @@ async fn test_omdb_success_cases() { .field("triggered by", r"[\w ]+") .section(&["task: \"tuf_artifact_replication\"", "request ringbuf:"]); + // The `fm_analysis` task's input report includes a line comparing the + // current inventory collection against the parent sitrep's collection, + // which can be either "same" or "different" depending on whether a new + // inventory was collected between sitreps. Collapse both forms. + redactor.variable_regex( + "fm_input_inv_comparison", + r" --> (same collection as parent sitrep|different from parent sitrep \(collection [-a-f0-9]+\))", + ); + // The `sp_ereport_ingester` task's output depends on how many simulated // sled agents ahppen to register with Nexus before its first execution. // These redactions work around the issue described in diff --git a/nexus/db-model/src/fm/case.rs b/nexus/db-model/src/fm/case.rs index 04776f99539..5ca49b7b187 100644 --- a/nexus/db-model/src/fm/case.rs +++ b/nexus/db-model/src/fm/case.rs @@ -9,10 +9,10 @@ use super::DiagnosisEngine; use super::SupportBundleRequest; use crate::DbTypedUuid; use crate::ereport; -use nexus_db_schema::schema::{fm_case, fm_ereport_in_case}; +use nexus_db_schema::schema::{fm_case, fm_ereport_in_case, fm_fact}; use nexus_types::fm; use omicron_uuid_kinds::{ - CaseEreportKind, CaseKind, EreporterRestartKind, SitrepKind, + CaseEreportKind, CaseKind, EreporterRestartKind, FactKind, SitrepKind, }; /// Metadata describing a fault management case. @@ -64,6 +64,7 @@ impl CaseMetadata { alerts_requested: _, support_bundles_requested: _, ereports: _, + facts: _, } = case; Self { sitrep_id: sitrep_id.into(), @@ -76,6 +77,44 @@ impl CaseMetadata { } } +/// Diesel row for the `fm_fact` table. See +/// [`nexus_types::fm::case::Fact`] for semantics. +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_fact)] +pub struct Fact { + pub id: DbTypedUuid, + /// The sitrep to which this fact belongs. + /// + /// This will change as the fact is carried forward from + /// one sitrep to the next. + pub sitrep_id: DbTypedUuid, + pub case_id: DbTypedUuid, + /// Sitrep in which this fact was first added. + /// + /// Preserved unchanged when the fact is carried forward; debug-only. + pub created_sitrep_id: DbTypedUuid, + pub payload: serde_json::Value, + pub comment: String, +} + +impl Fact { + pub fn from_sitrep( + sitrep_id: impl Into>, + case_id: impl Into>, + fact: &fm::case::Fact, + ) -> Self { + let fm::case::Fact { id, created_sitrep_id, payload, comment } = fact; + Self { + id: (*id).into(), + sitrep_id: sitrep_id.into(), + case_id: case_id.into(), + created_sitrep_id: (*created_sitrep_id).into(), + payload: payload.clone(), + comment: comment.clone(), + } + } +} + /// An association between an ereport and a case. #[derive(Queryable, Insertable, Clone, Debug, Selectable)] #[diesel(table_name = fm_ereport_in_case)] diff --git a/nexus/db-model/src/fm/diagnosis_engine.rs b/nexus/db-model/src/fm/diagnosis_engine.rs index 7d354142bbb..a99ceeac1ad 100644 --- a/nexus/db-model/src/fm/diagnosis_engine.rs +++ b/nexus/db-model/src/fm/diagnosis_engine.rs @@ -24,6 +24,7 @@ impl_enum_type!( pub enum DiagnosisEngine; PowerShelf => b"power_shelf" + PhysicalDisk => b"physical_disk" ); @@ -31,6 +32,9 @@ impl From for fm::DiagnosisEngineKind { fn from(de: DiagnosisEngine) -> Self { match de { DiagnosisEngine::PowerShelf => fm::DiagnosisEngineKind::PowerShelf, + DiagnosisEngine::PhysicalDisk => { + fm::DiagnosisEngineKind::PhysicalDisk + } } } } @@ -39,6 +43,9 @@ impl From for DiagnosisEngine { fn from(fm_de: fm::DiagnosisEngineKind) -> Self { match fm_de { fm::DiagnosisEngineKind::PowerShelf => DiagnosisEngine::PowerShelf, + fm::DiagnosisEngineKind::PhysicalDisk => { + DiagnosisEngine::PhysicalDisk + } } } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 5caf71e9e28..f14fbb76b0d 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(261, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(262, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(262, "fm-disk-de-and-facts"), KnownVersion::new(261, "remove-add-zones-with-mupdate-override"), KnownVersion::new(260, "ereport-trim-serial-trailing-nulls"), KnownVersion::new(259, "vmm-failure-reason"), diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 1552d463dfb..a33d243adfb 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -34,6 +34,7 @@ use nexus_db_schema::schema::ereport::dsl as ereport_dsl; use nexus_db_schema::schema::fm_alert_request::dsl as alert_req_dsl; use nexus_db_schema::schema::fm_case::dsl as case_dsl; use nexus_db_schema::schema::fm_ereport_in_case::dsl as case_ereport_dsl; +use nexus_db_schema::schema::fm_fact::dsl as fact_dsl; use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; use nexus_db_schema::schema::fm_sitrep_history::dsl as history_dsl; use nexus_db_schema::schema::fm_support_bundle_request::dsl as support_bundle_req_dsl; @@ -48,6 +49,8 @@ use omicron_uuid_kinds::AlertUuid; use omicron_uuid_kinds::CaseEreportKind; use omicron_uuid_kinds::CaseKind; use omicron_uuid_kinds::CaseUuid; +use omicron_uuid_kinds::FactKind; +use omicron_uuid_kinds::FactUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SitrepUuid; use omicron_uuid_kinds::SupportBundleKind; @@ -121,6 +124,7 @@ sitrep_child_tables! { SupportBundleRequestDataSelectionEreports => { table: "fm_support_bundle_request_data_selection_ereports" }, SupportBundleRequest => { table: "fm_support_bundle_request" }, Case => { table: "fm_case" }, + Fact => { table: "fm_fact" }, } /// Per-child-table statistics from a single GC pass. @@ -366,6 +370,8 @@ impl DataStore { let mut support_bundle_requests = self.support_bundle_requests_read_on_conn(id, conn).await?; + let mut case_facts = self.fm_facts_read_on_conn(id, conn).await?; + // Next, load the case metadata entries and marry them to the sets of // ereports, alert requests, and support bundle requests for those // cases that we loaded in the previous steps. @@ -408,6 +414,7 @@ impl DataStore { alert_requests.remove(&id).unwrap_or_default(); let support_bundles_requested = support_bundle_requests.remove(&id).unwrap_or_default(); + let facts = case_facts.remove(&id).unwrap_or_default(); fm::Case { id, metadata: fm::case::Metadata { @@ -419,6 +426,7 @@ impl DataStore { alerts_requested, ereports, support_bundles_requested, + facts, } })); } @@ -497,6 +505,61 @@ impl DataStore { Ok(by_case) } + /// Fetch all `fm_fact` rows belonging to cases in the given sitrep, + /// grouped by `case_id`. + async fn fm_facts_read_on_conn( + &self, + id: SitrepUuid, + conn: &async_bb8_diesel::Connection, + ) -> Result>, Error> { + let mut by_case = + HashMap::>::new(); + + let mut paginator: Paginator> = + Paginator::new(SQL_BATCH_SIZE, PaginationOrder::Descending); + while let Some(p) = paginator.next() { + let batch = paginated( + fact_dsl::fm_fact, + fact_dsl::id, + &p.current_pagparams(), + ) + .filter(fact_dsl::sitrep_id.eq(id.into_untyped_uuid())) + .select(model::fm::Fact::as_select()) + .load_async(conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to load case facts") + })?; + + paginator = p.found_batch(&batch, &|f| f.id); + for fact in batch { + let case_id: CaseUuid = fact.case_id.into(); + let id: FactUuid = fact.id.into(); + by_case + .entry(case_id) + .or_default() + .insert_unique(fm::case::Fact { + id, + created_sitrep_id: fact.created_sitrep_id.into(), + payload: fact.payload, + comment: fact.comment, + }) + .map_err(|_| { + let internal_message = format!( + "encountered multiple case facts for case \ + {case_id} with the same fact UUID {id}. this \ + should really not be possible, as the fact \ + UUID is a primary key!", + ); + Error::InternalError { internal_message } + })?; + } + } + + Ok(by_case) + } + /// Fetch all support bundle requests belonging to cases in the given /// sitrep, including their child data selection tables. async fn support_bundle_requests_read_on_conn( @@ -740,6 +803,7 @@ impl DataStore { let mut support_bundles_requested = Vec::new(); let mut bundle_data_selections_requested = Vec::new(); let mut case_ereports = Vec::new(); + let mut case_facts = Vec::new(); for case in sitrep.cases { let case_id = case.id; cases.push(model::fm::CaseMetadata::from_sitrep(sitrep_id, &case)); @@ -763,6 +827,11 @@ impl DataStore { ); bundle_data_selections_requested.push((req_id, data_selection)); } + for fact in case.facts.iter() { + case_facts.push(model::fm::Fact::from_sitrep( + sitrep_id, case_id, fact, + )); + } } if !case_ereports.is_empty() { @@ -797,6 +866,17 @@ impl DataStore { ) .await?; + if !case_facts.is_empty() { + diesel::insert_into(fact_dsl::fm_fact) + .values(case_facts) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to insert case facts") + })?; + } + if !cases.is_empty() { diesel::insert_into(case_dsl::fm_case) .values(cases) @@ -1138,6 +1218,7 @@ impl DataStore { alert_requests_deleted: usize, support_bundle_requests_deleted: usize, cases_deleted: usize, + facts_deleted: usize, } let err = OptionalError::new(); @@ -1147,6 +1228,7 @@ impl DataStore { alert_requests_deleted, support_bundle_requests_deleted, cases_deleted, + facts_deleted, } = self // Sitrep deletion is transactional to prevent a sitrep from being // left in a partially-deleted state should the Nexus instance @@ -1190,6 +1272,15 @@ impl DataStore { Self::support_bundle_requests_delete_on_conn(&conn, ids.clone()) .await?; + // Delete case facts. + let facts_deleted = diesel::delete( + fact_dsl::fm_fact.filter( + fact_dsl::sitrep_id.eq_any(ids.clone()), + ), + ) + .execute_async(&conn) + .await?; + // Delete case metadata records. let cases_deleted = diesel::delete( case_dsl::fm_case @@ -1212,6 +1303,7 @@ impl DataStore { alert_requests_deleted, support_bundle_requests_deleted, case_ereports_deleted, + facts_deleted, }) } }) @@ -1230,6 +1322,7 @@ impl DataStore { "case_ereports_deleted" => case_ereports_deleted, "alert_requests_deleted" => alert_requests_deleted, "support_bundle_requests_deleted" => support_bundle_requests_deleted, + "facts_deleted" => facts_deleted, ); Ok(sitreps_deleted) @@ -2037,6 +2130,7 @@ mod tests { ereports, alerts_requested, support_bundles_requested, + facts, } = case; let case_id = id; let Some(expected) = this.cases.get(&case_id) else { @@ -2068,6 +2162,7 @@ mod tests { &expected.metadata.de, de, "while checking case {case_id}" ); + assert_eq!(&expected.facts, facts, "while checking case {case_id}"); // Now, check that all the ereports are present in both cases. assert_eq!(ereports.len(), expected.ereports.len()); @@ -2241,6 +2336,19 @@ mod tests { .unwrap(); } + let mut facts = iddqd::IdOrdMap::new(); + facts + .insert_unique(fm::case::Fact { + id: FactUuid::new_v4(), + created_sitrep_id: sitrep_id, + payload: serde_json::json!({ + "kind": "representative_fact", + "note": "for round-trip testing", + }), + comment: "a representative fact for case 1".to_string(), + }) + .unwrap(); + fm::Case { id: omicron_uuid_kinds::CaseUuid::new_v4(), metadata: fm::case::Metadata { @@ -2252,6 +2360,7 @@ mod tests { ereports, alerts_requested, support_bundles_requested, + facts, } }; @@ -2288,6 +2397,7 @@ mod tests { ereports, alerts_requested, support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), } }; @@ -2406,6 +2516,7 @@ mod tests { ereports: iddqd::IdOrdMap::new(), alerts_requested: iddqd::IdOrdMap::new(), support_bundles_requested, + facts: iddqd::IdOrdMap::new(), }; let mut cases = iddqd::IdOrdMap::new(); diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 05d6c96b1db..b6aea91e62b 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -3243,6 +3243,17 @@ table! { } } +table! { + fm_fact (sitrep_id, id) { + id -> Uuid, + sitrep_id -> Uuid, + case_id -> Uuid, + created_sitrep_id -> Uuid, + payload -> Jsonb, + comment -> Text, + } +} + table! { fm_ereport_in_case (sitrep_id, id) { id -> Uuid, @@ -3258,6 +3269,8 @@ table! { allow_tables_to_appear_in_same_query!(fm_ereport_in_case, ereport); allow_tables_to_appear_in_same_query!(fm_sitrep, fm_case); +allow_tables_to_appear_in_same_query!(fm_sitrep, fm_fact); +allow_tables_to_appear_in_same_query!(fm_case, fm_fact); table! { fm_alert_request (sitrep_id, id) { diff --git a/nexus/fm/src/analysis_input.rs b/nexus/fm/src/analysis_input.rs index c9d1ae57428..2f8e135cbb2 100644 --- a/nexus/fm/src/analysis_input.rs +++ b/nexus/fm/src/analysis_input.rs @@ -8,6 +8,7 @@ use chrono::{DateTime, Utc}; use iddqd::IdOrdMap; use nexus_types::fm::analysis_reports::ClosedCaseReport; use nexus_types::fm::{self, Sitrep, SitrepVersion}; +use nexus_types::in_service_disk::InServiceDisk; use nexus_types::inventory; use omicron_uuid_kinds::CollectionUuid; use std::collections::BTreeMap; @@ -39,6 +40,8 @@ pub struct Input { new_ereports: IdOrdMap, open_cases: IdOrdMap, closed_cases_copied_forward: IdOrdMap, + /// All control plane managed disks + in_service_disks: Arc>, } impl Input { @@ -65,11 +68,19 @@ impl Input { &self.closed_cases_copied_forward } + /// All control-plane-managed disks (`physical_disk.disk_policy = + /// in_service` in the DB), indexed by `physical_disk_id`. See the + /// field-level documentation on `Input::in_service_disks` for semantics. + pub fn in_service_disks(&self) -> &IdOrdMap { + &self.in_service_disks + } + /// Returns a [`Builder`] for constructing a new `Input` from the provided - /// `parent_sitrep` and inventory collection. + /// `parent_sitrep`, inventory collection, and in-service disks. pub fn builder( parent_sitrep: Option>, inv: Arc, + in_service_disks: Arc>, ) -> Result { // Before preparing analysis inputs, check that the proposed input // inventory collection is at least as new as the parent sitrep's @@ -94,6 +105,7 @@ impl Input { Ok(Builder { parent_sitrep, inv, + in_service_disks, new_ereports: IdOrdMap::default(), unmarked_seen_ereports: BTreeSet::default(), }) @@ -117,6 +129,7 @@ pub enum InvalidInputs { pub struct Builder { parent_sitrep: Option>, inv: Arc, + in_service_disks: Arc>, /// Ereports which are new and should be input to analysis in the next /// sitrep. new_ereports: IdOrdMap, @@ -184,6 +197,11 @@ impl Builder { .collect(), open_cases: BTreeMap::new(), closed_cases_copied_forward: BTreeMap::new(), + in_service_disks: self + .in_service_disks + .iter() + .map(|d| d.physical_disk_id) + .collect(), }; // Determine which cases must be copied forwards into the next sitrep. @@ -234,6 +252,7 @@ impl Builder { new_ereports: self.new_ereports, open_cases, closed_cases_copied_forward, + in_service_disks: self.in_service_disks, }; (input, report) @@ -346,6 +365,7 @@ mod tests { .collect(), alerts_requested: Default::default(), support_bundles_requested: Default::default(), + facts: Default::default(), } }; let open_case2 = { @@ -366,6 +386,7 @@ mod tests { .collect(), alerts_requested: Default::default(), support_bundles_requested: Default::default(), + facts: Default::default(), } }; let closed_case_with_unmarked = { @@ -393,6 +414,7 @@ mod tests { .collect(), alerts_requested: Default::default(), support_bundles_requested: Default::default(), + facts: Default::default(), } }; let closed_case_without_unmarked = { @@ -414,6 +436,7 @@ mod tests { .collect(), alerts_requested: Default::default(), support_bundles_requested: Default::default(), + facts: Default::default(), } }; @@ -463,8 +486,12 @@ mod tests { // Build analysis input let (input, report) = { - let mut builder = Input::builder(Some(parent_sitrep), inv) - .expect("collection start time check should always pass"); + let mut builder = Input::builder( + Some(parent_sitrep), + inv, + Arc::new(IdOrdMap::new()), + ) + .expect("collection start time check should always pass"); // Pass in four ereports: // - two that are in the open cases of the parent sitrep // - one that is in the (to-be-copied-forward) closed case diff --git a/nexus/fm/src/builder/case.rs b/nexus/fm/src/builder/case.rs index d4d8c5e4d4e..810e6648492 100644 --- a/nexus/fm/src/builder/case.rs +++ b/nexus/fm/src/builder/case.rs @@ -10,7 +10,9 @@ use nexus_types::alert::AlertClass; use nexus_types::fm; use nexus_types::support_bundle::BundleDataSelection; use omicron_uuid_kinds::CaseUuid; +use omicron_uuid_kinds::FactUuid; use omicron_uuid_kinds::SitrepUuid; +use std::fmt::Debug; use std::sync::Arc; #[derive(Debug)] @@ -76,6 +78,7 @@ impl AllCases { ereports: Default::default(), alerts_requested: Default::default(), support_bundles_requested: Default::default(), + facts: Default::default(), }; let mut builder = CaseBuilder::new(&self.log, sitrep_id, case, case_rng); @@ -215,6 +218,65 @@ impl CaseBuilder { self.report_log.entry("case closed").comment(comment); } + /// Replace this case's free-form comment string. + pub fn set_comment(&mut self, comment: impl ToString) { + self.case.metadata.comment = comment.to_string(); + } + + /// Emit a new fact under this case. The fact's UUID is freshly + /// allocated from the case's deterministic RNG. + pub fn add_fact( + &mut self, + fact: &T, + comment: impl ToString, + ) -> anyhow::Result { + let id = loop { + let id = self.rng.next_fact(); + if !self.case.facts.contains_key(&id) { + break id; + } + }; + let payload = serde_json::to_value(fact).with_context( + || "failed to serialize case fact payload {fact:?}", + )?; + let comment = comment.to_string(); + slog::info!( + &self.log, + "added a fact"; + "fact_id" => %id, + "payload" => %payload, + "comment" => %comment, + ); + self.report_log + .entry("added fact") + .kv("fact_id", id) + .kv("payload", &payload) + .comment(comment.clone()); + let fact = fm::case::Fact { + id, + created_sitrep_id: self.sitrep_id, + payload, + comment, + }; + self.case.facts.insert_unique(fact).expect("UUID should be unused"); + Ok(id) + } + + /// Remove a fact from this case. The fact will not be carried forward + /// into the next sitrep. + pub fn remove_fact(&mut self, id: FactUuid) { + if self.case.facts.remove(&id).is_some() { + slog::info!(&self.log, "removed a fact"; "fact_id" => %id); + self.report_log.entry("removed fact").kv("fact_id", id); + } + } + + /// Iterate the facts currently attached to this case (including any that + /// were carried forward from the parent sitrep). + pub fn facts(&self) -> impl Iterator { + self.case.facts.iter() + } + pub fn add_ereport( &mut self, report: &Arc, diff --git a/nexus/fm/src/builder/rng.rs b/nexus/fm/src/builder/rng.rs index 5490ef67ae9..9ba02eca5cc 100644 --- a/nexus/fm/src/builder/rng.rs +++ b/nexus/fm/src/builder/rng.rs @@ -14,6 +14,8 @@ use omicron_uuid_kinds::CaseEreportKind; use omicron_uuid_kinds::CaseEreportUuid; use omicron_uuid_kinds::CaseKind; use omicron_uuid_kinds::CaseUuid; +use omicron_uuid_kinds::FactKind; +use omicron_uuid_kinds::FactUuid; use omicron_uuid_kinds::SitrepUuid; use omicron_uuid_kinds::SupportBundleKind; use omicron_uuid_kinds::SupportBundleUuid; @@ -59,11 +61,17 @@ impl SitrepBuilderRng { } } +/// Per-case child RNGs. Each `next_*` returns the next deterministic UUID +/// in its stream; collisions across calls are statistically impossible, but +/// callers in `builder/case.rs` still guard inserts with a `contains_key` +/// loop so that a future change here (e.g., reseeding mid-build) can't +/// silently corrupt a case. #[derive(Clone, Debug)] pub(super) struct CaseBuilderRng { ereport_assignment_rng: TypedUuidRng, alert_rng: TypedUuidRng, support_bundle_rng: TypedUuidRng, + fact_rng: TypedUuidRng, } impl CaseBuilderRng { @@ -85,7 +93,11 @@ impl CaseBuilderRng { &mut sitrep.parent, (case_id, "support-bundle"), ); - Self { alert_rng, ereport_assignment_rng, support_bundle_rng } + let fact_rng = TypedUuidRng::from_parent_rng( + &mut sitrep.parent, + (case_id, "case-fact"), + ); + Self { alert_rng, ereport_assignment_rng, support_bundle_rng, fact_rng } } pub(super) fn next_alert(&mut self) -> AlertUuid { @@ -99,4 +111,8 @@ impl CaseBuilderRng { pub(super) fn next_support_bundle(&mut self) -> SupportBundleUuid { self.support_bundle_rng.next() } + + pub(super) fn next_fact(&mut self) -> FactUuid { + self.fact_rng.next() + } } diff --git a/nexus/fm/src/diagnosis.rs b/nexus/fm/src/diagnosis/mod.rs similarity index 59% rename from nexus/fm/src/diagnosis.rs rename to nexus/fm/src/diagnosis/mod.rs index 96ea491f1b1..9e9f1959c96 100644 --- a/nexus/fm/src/diagnosis.rs +++ b/nexus/fm/src/diagnosis/mod.rs @@ -2,22 +2,32 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +//! Fault management diagnosis engines. +//! +//! Each submodule defines one diagnosis engine (DE). `analyze` dispatches to +//! each engine in turn; engines are deterministic and idempotent per RFD 603, +//! so the dispatch order does not matter. + use crate::SitrepBuilder; use crate::analysis_input::Input; +mod physical_disk; + pub fn analyze( - _input: &Input, - _builder: &mut SitrepBuilder<'_>, + input: &Input, + builder: &mut SitrepBuilder<'_>, ) -> anyhow::Result<()> { - anyhow::bail!("FM analysis is not yet implemented") + physical_disk::analyze(input, builder)?; + Ok(()) } -/// Ereport classes that the diagnosis engine currently understands. -/// Preparation only surfaces ereports whose class is in this set — there is -/// no value in loading ereports FM analysis cannot consume. +/// Ereport classes that any diagnosis engine in this build of Nexus knows +/// how to consume. The background task uses this to filter loaded ereports +/// — there is no value in loading ereports FM analysis cannot consume. /// -/// Empty until [`analyze`] gains real handling. Grow this alongside FM -/// analysis as new classes gain support. +/// Empty today: the only enabled DE is the physical disk DE, which is +/// polling-based and consumes no ereports. Grow this list alongside FM +/// analysis as new classes gain ereport support. /// /// **NULL-class ereports are intentionally excluded by the loader's SQL /// filter** (`class = ANY(...)` never matches NULL). If FM analysis ever diff --git a/nexus/fm/src/diagnosis/physical_disk.rs b/nexus/fm/src/diagnosis/physical_disk.rs new file mode 100644 index 00000000000..3f4b8759531 --- /dev/null +++ b/nexus/fm/src/diagnosis/physical_disk.rs @@ -0,0 +1,943 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Disk diagnosis engine. + +use crate::SitrepBuilder; +use crate::analysis_input::Input; +use chrono::{DateTime, Utc}; +use iddqd::{IdOrdItem, IdOrdMap, id_upcast}; +use nexus_types::fm::DiagnosisEngineKind; +use nexus_types::inventory::ZpoolHealth; +use omicron_uuid_kinds::{ + CaseUuid, CollectionUuid, FactUuid, PhysicalDiskUuid, ZpoolUuid, +}; +use serde::{Deserialize, Serialize}; +use slog_error_chain::InlineErrorChain; +use std::collections::BTreeMap; + +/// Per-fact state for the disk diagnosis engine, serialized into the +/// `fm_fact.payload` JSONB column. Other diagnosis engines must not +/// inspect or modify this; shared FM code treats it as opaque bytes. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum DiskFact { + /// The zpool's most recently observed health is non-`Online`. + ZpoolUnhealthy(ZpoolUnhealthyFactPayload), +} + +/// Payload of a [`DiskFact::ZpoolUnhealthy`] fact. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)] +pub struct ZpoolUnhealthyFactPayload { + /// The physical disk this fact (and its parent case) is about. + /// Every fact on a Disk case must agree on this value. + pub physical_disk_id: PhysicalDiskUuid, + /// The zpool whose health was observed. Kept for provenance — the + /// case is keyed by `physical_disk_id`, but knowing the exact zpool + /// makes the fact self-describing when read in isolation. + pub zpool_id: ZpoolUuid, + pub last_seen_health: ZpoolHealth, + /// Inventory collection that produced this observation. Recorded for + /// provenance: if multiple `ZpoolUnhealthy` facts ever end up on the + /// same case, this lets a human reader see which inventory each came + /// from. + pub observed_in_inv: CollectionUuid, + /// `time_done` of `observed_in_inv`. + pub time_observed: DateTime, +} + +/// A [`DiskFact::ZpoolUnhealthy`] payload paired with the `FactUuid` it +/// lives under. Used to build in-memory indices over facts during +/// analysis; not serialized. +#[derive(Clone, Copy, Debug)] +struct ZpoolUnhealthyFact { + fact_id: FactUuid, + payload: ZpoolUnhealthyFactPayload, +} + +impl IdOrdItem for ZpoolUnhealthyFact { + type Key<'a> = FactUuid; + fn key(&self) -> Self::Key<'_> { + self.fact_id + } + id_upcast!(); +} + +/// One in-service disk paired with its current observed health. +/// `health` is `None` when the disk's zpool was not seen in the current +/// inventory (e.g., sled down, lossy collection). +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct DiskHealthSnapshot { + physical_disk_id: PhysicalDiskUuid, + zpool_id: ZpoolUuid, + health: Option, +} + +impl IdOrdItem for DiskHealthSnapshot { + type Key<'a> = PhysicalDiskUuid; + fn key(&self) -> Self::Key<'_> { + self.physical_disk_id + } + id_upcast!(); +} + +/// Per-case summary built from a case's facts. Each Disk case is about a +/// single physical disk; every fact on the case must reference that disk. +struct ParentCaseSummary { + /// The physical disk this case is about. + physical_disk_id: PhysicalDiskUuid, + /// All `ZpoolUnhealthy` facts on this case. Normally one; pathological + /// cases (e.g., hand-edited DB) may have multiple — the diagnosis + /// engine keeps all of them. + unhealthy_facts: IdOrdMap, +} + +pub(super) fn analyze( + input: &Input, + builder: &mut SitrepBuilder<'_>, +) -> anyhow::Result<()> { + let inv_collection_id = input.inventory().id; + let inv_time_done = input.inventory().time_done; + + // Index every zpool we observed in this inventory, so we can distinguish + // "saw it, it's Online" from "didn't see it at all" when looking up by + // an in-service disk's zpool below. + let observed_health: BTreeMap = input + .inventory() + .sled_agents + .iter() + .flat_map(|sa| sa.zpools.iter()) + .map(|z| (z.id, z.health)) + .collect(); + + // The current health snapshot for every in-service disk, keyed by + // physical_disk_id. Absence from this index is a positive signal that + // the control plane has moved on from the disk (expungement / + // decommissioning); see prepare_inputs in + // nexus/src/app/background/tasks/fm_analysis.rs. + let in_service_health: IdOrdMap = input + .in_service_disks() + .iter() + .map(|d| DiskHealthSnapshot { + physical_disk_id: d.physical_disk_id, + zpool_id: d.zpool_id, + health: observed_health.get(&d.zpool_id).copied(), + }) + .collect(); + + // Index parent-forwarded Disk cases from the input — the state copied + // from the parent sitrep, *not* the in-progress builder we mutate + // below. Every case is about one physical disk; we derive the disk + // from its facts. Skip (with a warning) any case we can't safely + // interpret. + let parent_cases: BTreeMap = input + .open_cases() + .iter() + .filter(|c| c.metadata.de == DiagnosisEngineKind::PhysicalDisk) + .filter_map(|c| { + let case_id = c.id; + let mut unhealthy_facts: IdOrdMap = + IdOrdMap::new(); + let mut case_disk_id: Option = None; + for fact in c.facts.iter() { + match fact.payload_to::() { + Ok(DiskFact::ZpoolUnhealthy(payload)) => { + let topic = *case_disk_id + .get_or_insert(payload.physical_disk_id); + if topic != payload.physical_disk_id { + slog::warn!( + &builder.log, + "skipping Disk case: facts reference \ + different physical disks (1 expected)"; + "case_id" => %case_id, + "expected_physical_disk_id" => %topic, + "fact_physical_disk_id" => + %payload.physical_disk_id, + ); + return None; + } + unhealthy_facts + .insert_unique(ZpoolUnhealthyFact { + fact_id: fact.id, + payload, + }) + .expect("fact ids are unique within a case"); + } + Err(e) => { + slog::warn!( + &builder.log, + "skipping entire Disk case; one of its facts \ + did not deserialize as DiskFact"; + "case_id" => %case_id, + "fact_id" => %fact.id, + "error" => InlineErrorChain::new(&*e).to_string(), + ); + return None; + } + } + } + let Some(physical_disk_id) = case_disk_id else { + slog::warn!( + &builder.log, + "skipping Disk case with no facts; cannot derive disk \ + topic"; + "case_id" => %case_id, + ); + return None; + }; + Some(( + case_id, + ParentCaseSummary { physical_disk_id, unhealthy_facts }, + )) + }) + .collect(); + + // For each parent case, decide what to do based on its disk's current + // state: + // - disk no longer in service → close the case (expungement) + // - disk's zpool back to Online → close the case (recovery) + // - disk still unhealthy → drop any facts whose recorded health no + // longer matches; the matching loop below will re-add a fresh fact + // - disk in service but absent from inventory → leave alone (absence + // is NOT a recovery signal: sled could be powered off, or + // inventory could be lossy) + for (case_id, summary) in &parent_cases { + let mut case_mut = builder + .cases + .case_mut(case_id) + .expect("case_id came from iterating builder.cases"); + match in_service_health.get(&summary.physical_disk_id) { + None => { + case_mut.close(format!( + "disk {} no longer in service", + summary.physical_disk_id, + )); + } + Some(snap) if snap.health == Some(ZpoolHealth::Online) => { + case_mut + .close(format!("zpool {} back to Online", snap.zpool_id,)); + } + Some(snap) => { + let Some(current_health) = snap.health else { + continue; + }; + for fact_ref in summary.unhealthy_facts.iter() { + if fact_ref.payload.last_seen_health != current_health { + case_mut.remove_fact(fact_ref.fact_id); + } + } + } + } + } + + // For each currently-faulty in-service disk: ensure a case exists + // (reusing the parent-forwarded one for this disk if any) and add a + // fresh fact if one with this exact health isn't already present. + for disk in in_service_health.iter() { + let Some(current_health) = disk.health else { + continue; + }; + if current_health == ZpoolHealth::Online { + continue; + } + + let parent_for_disk = + parent_cases.iter().find_map(|(case_id, summary)| { + if summary.physical_disk_id == disk.physical_disk_id { + Some((*case_id, summary)) + } else { + None + } + }); + + let case_id_for_fact = match parent_for_disk { + // Parent case already has an accurate fact — fully covered. + Some((_, summary)) + if summary + .unhealthy_facts + .iter() + .any(|f| f.payload.last_seen_health == current_health) => + { + continue; + } + // Parent case exists; its stale facts were removed above. + // Refresh under the same case. + Some((case_id, _)) => case_id, + // No parent case for this disk — open one. + None => { + let mut new_case = + builder.cases.open_case(DiagnosisEngineKind::PhysicalDisk); + new_case.set_comment(format!( + "physical disk {} unhealthy", + disk.physical_disk_id, + )); + new_case.id + } + }; + + builder + .cases + .case_mut(&case_id_for_fact) + .expect("case_id came from this fn") + .add_fact( + &DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { + physical_disk_id: disk.physical_disk_id, + zpool_id: disk.zpool_id, + last_seen_health: current_health, + observed_in_inv: inv_collection_id, + time_observed: inv_time_done, + }), + format!("zpool {} health={current_health}", disk.zpool_id,), + )?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::analysis_input::Input; + use crate::builder::{SitrepBuilder, SitrepBuilderRng}; + use crate::test_util::FmTest; + use chrono::Utc; + use iddqd::IdOrdMap; + use nexus_types::external_api::physical_disk::PhysicalDiskKind; + use nexus_types::fm::{self, Sitrep, SitrepVersion}; + use nexus_types::in_service_disk::InServiceDisk; + use nexus_types::inventory; + use omicron_test_utils::dev; + use omicron_uuid_kinds::{ + OmicronZoneUuid, PhysicalDiskUuid, SitrepUuid, SledUuid, + }; + use std::sync::Arc; + + /// Synthesize a synthetic in-service disk set from a list of zpool IDs. + /// Each zpool gets its own fresh `PhysicalDiskUuid` and dummy identity + /// facts — tests in this module only care about the zpool dimension. + fn mk_in_service( + zpool_ids: impl IntoIterator, + ) -> IdOrdMap { + zpool_ids + .into_iter() + .map(|zpool_id| InServiceDisk { + physical_disk_id: PhysicalDiskUuid::new_v4(), + zpool_id, + sled_id: SledUuid::new_v4(), + vendor: "test-vendor".to_string(), + serial: format!("test-serial-{zpool_id}"), + model: "test-model".to_string(), + variant: PhysicalDiskKind::U2, + }) + .collect() + } + + /// Find the `physical_disk_id` for the given `zpool_id` in the + /// in-service set, or fabricate a fresh one if not present (e.g., when + /// simulating an expunged disk whose case should still reference some + /// stable PhysicalDiskUuid). + fn disk_id_for( + in_service: &IdOrdMap, + zpool_id: ZpoolUuid, + ) -> PhysicalDiskUuid { + in_service + .iter() + .find(|d| d.zpool_id == zpool_id) + .map(|d| d.physical_disk_id) + .unwrap_or_else(PhysicalDiskUuid::new_v4) + } + + /// Make a synthetic test scenario from the example system: returns a + /// `LogContext` (the caller must `cleanup_successful()` it), the + /// example collection, and every zpool ID in that collection. + fn setup( + test_name: &'static str, + ) -> (dev::LogContext, inventory::Collection, Vec) { + let (fm_test, logctx) = FmTest::new_with_logctx(test_name); + // Build the example system once to get a Collection with zpools. + let (example, _bp) = fm_test.system_builder.build(); + let zpool_ids: Vec = example + .collection + .sled_agents + .iter() + .flat_map(|sa| sa.zpools.iter().map(|z| z.id)) + .collect(); + assert!( + !zpool_ids.is_empty(), + "example system should have at least one zpool" + ); + (logctx, example.collection, zpool_ids) + } + + /// Set the zpool with `zpool_id` to `health`, panicking if not found. + fn set_health( + collection: &mut inventory::Collection, + zpool_id: ZpoolUuid, + health: ZpoolHealth, + ) { + for mut sa in collection.sled_agents.iter_mut() { + for z in sa.zpools.iter_mut() { + if z.id == zpool_id { + z.health = health; + return; + } + } + } + panic!("zpool {zpool_id} not found in collection"); + } + + /// Build an `Input` from a collection, an optional parent sitrep, and a + /// pre-built set of in-service disks. + fn build_input( + collection: inventory::Collection, + parent_sitrep: Option, + in_service: IdOrdMap, + ) -> Input { + let parent = parent_sitrep.map(|s| { + Arc::new(( + SitrepVersion { + id: s.id(), + version: 0, + time_made_current: Utc::now(), + }, + s, + )) + }); + let builder = + Input::builder(parent, Arc::new(collection), Arc::new(in_service)) + .expect("input builder should accept fresh inventory"); + let (input, _report) = builder.build(); + input + } + + /// Run `disk::analyze` over an input and return the resulting Sitrep + /// along with the analysis report (whose log entries the close-comment + /// assertions in `closes_*` tests inspect). + fn run_analyze( + log: &slog::Logger, + input: &Input, + ) -> (Sitrep, fm::analysis_reports::AnalysisReport) { + let mut builder = SitrepBuilder::new_with_rng( + log, + input, + SitrepBuilderRng::from_seed("disk-analyze"), + ); + analyze(input, &mut builder).expect("analyze ok"); + builder.build(OmicronZoneUuid::new_v4(), Utc::now()) + } + + fn make_parent_with_disk_case( + parent_sitrep_id: SitrepUuid, + inv_collection_id: omicron_uuid_kinds::CollectionUuid, + physical_disk_id: PhysicalDiskUuid, + zpool_id: ZpoolUuid, + ) -> Sitrep { + let mut cases = iddqd::IdOrdMap::new(); + let case_id = omicron_uuid_kinds::CaseUuid::new_v4(); + let mut facts = iddqd::IdOrdMap::new(); + facts + .insert_unique(fm::case::Fact { + id: omicron_uuid_kinds::FactUuid::new_v4(), + created_sitrep_id: parent_sitrep_id, + payload: serde_json::to_value(&DiskFact::ZpoolUnhealthy( + ZpoolUnhealthyFactPayload { + physical_disk_id, + zpool_id, + last_seen_health: ZpoolHealth::Degraded, + observed_in_inv: inv_collection_id, + time_observed: Utc::now(), + }, + )) + .unwrap(), + comment: format!("zpool {zpool_id} degraded"), + }) + .unwrap(); + cases + .insert_unique(fm::Case { + id: case_id, + metadata: fm::case::Metadata { + created_sitrep_id: parent_sitrep_id, + closed_sitrep_id: None, + de: DiagnosisEngineKind::PhysicalDisk, + comment: format!("zpool {zpool_id} degraded"), + }, + ereports: Default::default(), + alerts_requested: Default::default(), + support_bundles_requested: Default::default(), + facts, + }) + .unwrap(); + Sitrep { + metadata: fm::SitrepMetadata { + id: parent_sitrep_id, + inv_collection_id, + creator_id: OmicronZoneUuid::new_v4(), + parent_sitrep_id: None, + time_created: Utc::now(), + next_inv_min_time_started: Utc::now(), + comment: String::new(), + }, + cases, + ereports_by_id: Default::default(), + } + } + + /// Helper: collect (case, fact) pairs in a sitrep where the fact parses + /// as `DiskFact`. Optionally filtered to open cases only. + fn disk_facts( + sitrep: &Sitrep, + open_only: bool, + ) -> Vec<(&fm::Case, &fm::case::Fact, DiskFact)> { + sitrep + .cases + .iter() + .filter(|c| c.metadata.de == DiagnosisEngineKind::PhysicalDisk) + .filter(|c| !open_only || c.is_open()) + .flat_map(|c| { + c.facts.iter().filter_map(move |f| { + f.payload_to::().ok().map(|d| (c, f, d)) + }) + }) + .collect() + } + + #[test] + fn opens_on_degraded_in_service() { + let (logctx, mut collection, zpools) = + setup("disk_open_degraded_in_service"); + let target = zpools[0]; + set_health(&mut collection, target, ZpoolHealth::Degraded); + let in_service = mk_in_service(zpools.iter().copied()); + let expected_disk_id = disk_id_for(&in_service, target); + let input = build_input(collection, None, in_service); + + let (sitrep, _report) = run_analyze(&logctx.log, &input); + let facts = disk_facts(&sitrep, true); + assert_eq!(facts.len(), 1); + match &facts[0].2 { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { + physical_disk_id, + zpool_id, + last_seen_health, + .. + }) => { + assert_eq!(*physical_disk_id, expected_disk_id); + assert_eq!(*zpool_id, target); + assert_eq!(*last_seen_health, ZpoolHealth::Degraded); + } + } + logctx.cleanup_successful(); + } + + #[test] + fn skips_degraded_when_expunged() { + let (logctx, mut collection, zpools) = setup("disk_skip_expunged"); + let target = zpools[0]; + set_health(&mut collection, target, ZpoolHealth::Faulted); + // target is *not* in the in-service set. + let in_service = mk_in_service(zpools.iter().copied().skip(1)); + let input = build_input(collection, None, in_service); + + let (sitrep, _report) = run_analyze(&logctx.log, &input); + let cases = disk_facts(&sitrep, false); + assert!( + cases.is_empty(), + "no Disk cases should be opened for expunged zpool, got: {:?}", + cases + ); + logctx.cleanup_successful(); + } + + #[test] + fn idempotent_when_case_already_open() { + let (logctx, mut collection, zpools) = setup("disk_idempotent"); + let target = zpools[0]; + set_health(&mut collection, target, ZpoolHealth::Degraded); + let in_service = mk_in_service(zpools.iter().copied()); + let target_disk_id = disk_id_for(&in_service, target); + let parent_id = SitrepUuid::new_v4(); + let parent = make_parent_with_disk_case( + parent_id, + collection.id, + target_disk_id, + target, + ); + + let input = build_input(collection, Some(parent), in_service); + let (sitrep, _report) = run_analyze(&logctx.log, &input); + let open_cases = disk_facts(&sitrep, true); + assert_eq!(open_cases.len(), 1); + match &open_cases[0].2 { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { + zpool_id, + .. + }) => { + assert_eq!(*zpool_id, target); + } + } + logctx.cleanup_successful(); + } + + #[test] + fn closes_on_recovery() { + let (logctx, collection, zpools) = setup("disk_close_on_recovery"); + let target = zpools[0]; + // The example system reports zpools as Online by default. + let in_service = mk_in_service(zpools.iter().copied()); + let target_disk_id = disk_id_for(&in_service, target); + let parent_id = SitrepUuid::new_v4(); + let parent = make_parent_with_disk_case( + parent_id, + collection.id, + target_disk_id, + target, + ); + + let input = build_input(collection, Some(parent), in_service); + let (sitrep, report) = run_analyze(&logctx.log, &input); + let all = disk_facts(&sitrep, false); + assert_eq!(all.len(), 1); + assert!( + !all[0].0.is_open(), + "case should be closed when zpool returns to Online", + ); + let report_str = format!("{}", report.display_multiline(0)); + assert!( + report_str.contains("back to Online"), + "close comment should call out the recovery cause, got: \ + {report_str}", + ); + logctx.cleanup_successful(); + } + + #[test] + fn closes_on_expungement() { + let (logctx, mut collection, zpools) = + setup("disk_close_on_expungement"); + let target = zpools[0]; + set_health(&mut collection, target, ZpoolHealth::Degraded); + // Target is NOT in-service in this sitrep (just expunged). + let in_service = mk_in_service(zpools.iter().copied().skip(1)); + // Target isn't in the in-service set; fabricate a stable PhysicalDiskUuid. + let target_disk_id = disk_id_for(&in_service, target); + let parent_id = SitrepUuid::new_v4(); + let parent = make_parent_with_disk_case( + parent_id, + collection.id, + target_disk_id, + target, + ); + + let input = build_input(collection, Some(parent), in_service); + let (sitrep, report) = run_analyze(&logctx.log, &input); + let all = disk_facts(&sitrep, false); + assert_eq!(all.len(), 1); + assert!( + !all[0].0.is_open(), + "case should be closed when zpool's disk is expunged", + ); + let report_str = format!("{}", report.display_multiline(0)); + assert!( + report_str.contains("no longer in service"), + "close comment should call out the expungement cause, got: \ + {report_str}", + ); + logctx.cleanup_successful(); + } + + #[test] + fn keeps_open_on_absence_from_inventory() { + // A zpool the case is about does NOT appear in the inventory at all + // (sled powered off, lossy collection, etc.). The case should stay + // open: absence is not a recovery signal. + let (logctx, collection, zpools) = setup("disk_keep_open_on_absence"); + let phantom = ZpoolUuid::new_v4(); + assert!(!zpools.contains(&phantom)); + let in_service = mk_in_service(zpools.iter().copied().chain([phantom])); + let phantom_disk_id = disk_id_for(&in_service, phantom); + let parent_id = SitrepUuid::new_v4(); + let parent = make_parent_with_disk_case( + parent_id, + collection.id, + phantom_disk_id, + phantom, + ); + + let input = build_input(collection, Some(parent), in_service); + let (sitrep, _report) = run_analyze(&logctx.log, &input); + let all = disk_facts(&sitrep, false); + assert_eq!(all.len(), 1); + assert!( + all[0].0.is_open(), + "case should remain open when its zpool is absent from the \ + current inventory collection (sled could be down or inventory \ + is lossy)", + ); + logctx.cleanup_successful(); + } + + /// A parent Disk case carries a fact whose JSON `kind` doesn't match any + /// variant we know. The diagnosis engine should leave that case alone (not + /// touch its open/closed state, not mutate the fact, not count it + /// toward dedup) and freely open a new case for a separate live signal. + #[test] + fn unreadable_fact_is_skipped() { + let (logctx, mut collection, zpools) = setup("disk_unreadable_payload"); + // Live signal on zpools[0]. + let live_target = zpools[0]; + set_health(&mut collection, live_target, ZpoolHealth::Degraded); + let in_service = mk_in_service(zpools.iter().copied()); + + // Build a parent sitrep with one Disk case carrying a payload that + // doesn't match any variant. + let parent_sitrep_id = SitrepUuid::new_v4(); + let unreadable_case_id = omicron_uuid_kinds::CaseUuid::new_v4(); + let mut parent_cases = iddqd::IdOrdMap::new(); + let unreadable_fact_id = omicron_uuid_kinds::FactUuid::new_v4(); + let unreadable_payload = serde_json::json!({ + "kind": "this_variant_does_not_exist", + "mystery": "data", + }); + let mut parent_facts = iddqd::IdOrdMap::new(); + parent_facts + .insert_unique(fm::case::Fact { + id: unreadable_fact_id, + created_sitrep_id: parent_sitrep_id, + payload: unreadable_payload.clone(), + comment: "fact with payload from the future".to_string(), + }) + .unwrap(); + parent_cases + .insert_unique(fm::Case { + id: unreadable_case_id, + metadata: fm::case::Metadata { + created_sitrep_id: parent_sitrep_id, + closed_sitrep_id: None, + de: DiagnosisEngineKind::PhysicalDisk, + comment: "case with fact from the future".to_string(), + }, + ereports: Default::default(), + alerts_requested: Default::default(), + support_bundles_requested: Default::default(), + facts: parent_facts, + }) + .unwrap(); + let parent = Sitrep { + metadata: fm::SitrepMetadata { + id: parent_sitrep_id, + inv_collection_id: collection.id, + creator_id: OmicronZoneUuid::new_v4(), + parent_sitrep_id: None, + time_created: Utc::now(), + next_inv_min_time_started: Utc::now(), + comment: String::new(), + }, + cases: parent_cases, + ereports_by_id: Default::default(), + }; + + let input = build_input(collection, Some(parent), in_service); + let (sitrep, _report) = run_analyze(&logctx.log, &input); + + // The unreadable case must still be present, still open, and its + // fact unchanged. + let unreadable = sitrep + .cases + .iter() + .find(|c| c.id == unreadable_case_id) + .expect("unreadable case should be copied forward"); + assert!( + unreadable.is_open(), + "unreadable case must not be closed by the diagnosis engine", + ); + let kept_fact = unreadable + .facts + .iter() + .find(|f| f.id == unreadable_fact_id) + .expect("unreadable fact should be preserved"); + assert_eq!(kept_fact.payload, unreadable_payload); + + // A fresh ZpoolUnhealthy fact must still open for the live signal + // — the unreadable case did not consume the dedup slot. + let readable = disk_facts(&sitrep, true); + assert_eq!( + readable.len(), + 1, + "expected one readable open ZpoolUnhealthy fact for the live \ + signal", + ); + match &readable[0].2 { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { + zpool_id, + .. + }) => { + assert_eq!(*zpool_id, live_target); + } + } + logctx.cleanup_successful(); + } + + /// A parent Disk case with zero facts has no derivable topic disk, so + /// the diagnosis engine leaves it alone (carried forward unchanged). + #[test] + fn empty_case_is_left_open() { + let (logctx, collection, _zpools) = setup("disk_empty_case_left_open"); + let in_service = mk_in_service(std::iter::empty()); + + let parent_sitrep_id = SitrepUuid::new_v4(); + let empty_case_id = omicron_uuid_kinds::CaseUuid::new_v4(); + let mut parent_cases = iddqd::IdOrdMap::new(); + parent_cases + .insert_unique(fm::Case { + id: empty_case_id, + metadata: fm::case::Metadata { + created_sitrep_id: parent_sitrep_id, + closed_sitrep_id: None, + de: DiagnosisEngineKind::PhysicalDisk, + comment: "an open case with no facts".to_string(), + }, + ereports: Default::default(), + alerts_requested: Default::default(), + support_bundles_requested: Default::default(), + facts: Default::default(), + }) + .unwrap(); + let parent = Sitrep { + metadata: fm::SitrepMetadata { + id: parent_sitrep_id, + inv_collection_id: collection.id, + creator_id: OmicronZoneUuid::new_v4(), + parent_sitrep_id: None, + time_created: Utc::now(), + next_inv_min_time_started: Utc::now(), + comment: String::new(), + }, + cases: parent_cases, + ereports_by_id: Default::default(), + }; + + let input = build_input(collection, Some(parent), in_service); + let (sitrep, _report) = run_analyze(&logctx.log, &input); + + let case = sitrep + .cases + .iter() + .find(|c| c.id == empty_case_id) + .expect("empty case should still be in the output sitrep"); + assert!( + case.is_open(), + "empty case should be left open (no topic disk to verify)", + ); + logctx.cleanup_successful(); + } + + /// When the parent sitrep's fact content matches the diagnosis engine's current + /// observation, the fact carries forward with the same UUID — no + /// remove-and-readd churn. + #[test] + fn fact_uuid_stable_when_observation_unchanged() { + let (logctx, mut collection, zpools) = setup("disk_fact_uuid_stable"); + let target = zpools[0]; + set_health(&mut collection, target, ZpoolHealth::Degraded); + let in_service = mk_in_service(zpools.iter().copied()); + let target_disk_id = disk_id_for(&in_service, target); + let parent_id = SitrepUuid::new_v4(); + let parent = make_parent_with_disk_case( + parent_id, + collection.id, + target_disk_id, + target, + ); + // Capture the parent's fact UUID for the target zpool. + let parent_fact_id = parent + .cases + .iter() + .find(|c| c.metadata.de == DiagnosisEngineKind::PhysicalDisk) + .expect("parent should have one Disk case") + .facts + .iter() + .next() + .expect("parent case should have one fact") + .id; + + let input = build_input(collection, Some(parent), in_service); + let (sitrep, _report) = run_analyze(&logctx.log, &input); + let open = disk_facts(&sitrep, true); + assert_eq!(open.len(), 1, "expected exactly one open Disk fact"); + assert_eq!( + open[0].1.id, parent_fact_id, + "fact UUID should be stable across sitreps when the \ + observation hasn't changed", + ); + match &open[0].2 { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { + zpool_id, + last_seen_health, + .. + }) => { + assert_eq!(*zpool_id, target); + assert_eq!(*last_seen_health, ZpoolHealth::Degraded); + } + } + logctx.cleanup_successful(); + } + + /// When the parent's fact recorded a different `last_seen_health` than + /// what we observe now, the diagnosis engine removes the stale fact and emits + /// a fresh one (new UUID). The case stays open because the zpool is + /// still unhealthy — just with a different value. + #[test] + fn fact_uuid_rotates_when_observation_changes() { + let (logctx, mut collection, zpools) = setup("disk_fact_uuid_rotates"); + let target = zpools[0]; + // Parent recorded Degraded; current inventory shows Faulted. + set_health(&mut collection, target, ZpoolHealth::Faulted); + let in_service = mk_in_service(zpools.iter().copied()); + let target_disk_id = disk_id_for(&in_service, target); + let parent_id = SitrepUuid::new_v4(); + let parent = make_parent_with_disk_case( + parent_id, + collection.id, + target_disk_id, + target, + ); + let parent_fact_id = parent + .cases + .iter() + .find(|c| c.metadata.de == DiagnosisEngineKind::PhysicalDisk) + .expect("parent should have one Disk case") + .facts + .iter() + .next() + .expect("parent case should have one fact") + .id; + + let input = build_input(collection, Some(parent), in_service); + let (sitrep, _report) = run_analyze(&logctx.log, &input); + let open = disk_facts(&sitrep, true); + assert_eq!( + open.len(), + 1, + "expected exactly one open Disk fact (the refreshed one)", + ); + assert_ne!( + open[0].1.id, parent_fact_id, + "fact UUID should rotate because last_seen_health changed", + ); + match &open[0].2 { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { + zpool_id, + last_seen_health, + .. + }) => { + assert_eq!(*zpool_id, target); + assert_eq!(*last_seen_health, ZpoolHealth::Faulted); + } + } + // The case itself should still be the same one that was carried + // forward — only the fact rotated. + assert!(open[0].0.is_open()); + logctx.cleanup_successful(); + } +} diff --git a/nexus/src/app/background/tasks/fm_analysis.rs b/nexus/src/app/background/tasks/fm_analysis.rs index 10173be4021..17cb3fddd48 100644 --- a/nexus/src/app/background/tasks/fm_analysis.rs +++ b/nexus/src/app/background/tasks/fm_analysis.rs @@ -9,11 +9,15 @@ use anyhow::Context; use chrono::Utc; use fm::analysis_input::InvalidInputs; use futures::future::BoxFuture; +use iddqd::IdOrdMap; +use nexus_db_model::PhysicalDiskPolicy; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_db_queries::db::datastore; +use nexus_db_queries::db::identity::Asset; use nexus_db_queries::db::pagination::Paginator; use nexus_fm as fm; +use nexus_types::in_service_disk::InServiceDisk; use nexus_types::internal_api::background::FmAnalysisStatus; use nexus_types::internal_api::background::fm_analysis as status; use nexus_types::inventory; @@ -217,8 +221,48 @@ impl FmAnalysis { (fm::analysis_input::Input, status::PreparationStatus), PreparationError, > { - let mut builder = - fm::analysis_input::Input::builder(parent_sitrep, inv)?; + // Load all external (U.2) zpools and project them down to FM's + // `InServiceDisk` view, filtering on `disk_policy = in_service` and a + // live (non-soft-deleted) physical_disk row. M.2 disks are not + // represented as control plane disks today, so the U.2-only filter + // on the underlying query matches reality. + // + // This is the executed view from the DB — flipped only after sagas / + // cleaners have actually drained resources, not while a planner is + // merely proposing changes. A faulty disk a planner proposes to + // expunge is still the diagnoser's concern until the control plane + // has actually moved on. + let zpools_and_disks = self + .datastore + .zpool_list_all_external_batched(opctx) + .await + .context("failed to load in-service control plane disks")?; + let mut in_service_disks_map = IdOrdMap::new(); + for (zpool, disk) in zpools_and_disks { + if disk.disk_policy != PhysicalDiskPolicy::InService { + continue; + } + in_service_disks_map + .insert_unique(InServiceDisk { + physical_disk_id: disk.id(), + zpool_id: zpool.id(), + sled_id: disk.sled_id.into(), + vendor: disk.vendor, + serial: disk.serial, + model: disk.model, + variant: disk.variant.into(), + }) + .expect( + "physical_disk.id is a primary key, so duplicates are \ + impossible", + ); + } + let in_service_disks = Arc::new(in_service_disks_map); + let mut builder = fm::analysis_input::Input::builder( + parent_sitrep, + inv, + in_service_disks, + )?; let mut errors = Vec::new(); self.load_new_ereports(opctx, &mut builder, &mut errors) .await diff --git a/nexus/src/app/background/tasks/fm_rendezvous.rs b/nexus/src/app/background/tasks/fm_rendezvous.rs index ef76c1db310..0e2696f4778 100644 --- a/nexus/src/app/background/tasks/fm_rendezvous.rs +++ b/nexus/src/app/background/tasks/fm_rendezvous.rs @@ -552,6 +552,7 @@ mod tests { alerts_requested: iddqd::IdOrdMap::new(), ereports: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), }; case1 .alerts_requested @@ -631,6 +632,7 @@ mod tests { alerts_requested: iddqd::IdOrdMap::new(), ereports: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), }; case2 .alerts_requested @@ -933,6 +935,7 @@ mod tests { ereports, alerts_requested: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), } }; @@ -1145,6 +1148,7 @@ mod tests { ereports, alerts_requested: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), } }; @@ -1254,6 +1258,7 @@ mod tests { ereports, alerts_requested: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), } }; @@ -1449,6 +1454,7 @@ mod tests { alerts_requested: iddqd::IdOrdMap::new(), ereports: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), }; case1 .support_bundles_requested @@ -1625,6 +1631,7 @@ mod tests { alerts_requested: iddqd::IdOrdMap::new(), ereports: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), }; case.support_bundles_requested .insert_unique(fm::case::SupportBundleRequest { diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 6778ea72598..5150a7e4834 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -316,7 +316,6 @@ pub struct Nexus { repo_depot_resolver: Box, /// Watch channel containing the currently-loaded fault management sitrep. - #[allow(dead_code)] sitrep_load_rx: watch::Receiver>, /// handle to pull update status data @@ -1148,6 +1147,14 @@ impl Nexus { &self.db_datastore } + pub(crate) fn sitrep_load_rx( + &self, + ) -> watch::Receiver> { + let mut rx = self.sitrep_load_rx.clone(); + rx.mark_unchanged(); + rx + } + pub(crate) fn samael_max_issue_delay(&self) -> Option { let mid = self.samael_max_issue_delay.lock().unwrap(); *mid diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index 929342771a8..5df5b7ef3f3 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -524,6 +524,14 @@ impl nexus_test_interface::NexusServer for Server { self.apictx.context.nexus.inventory_load_rx() } + fn sitrep_load_rx( + &self, + ) -> watch::Receiver< + Option>, + > { + self.apictx.context.nexus.sitrep_load_rx() + } + fn get_http_server_external_address(&self) -> SocketAddr { self.apictx.context.nexus.get_external_server_address().unwrap() } diff --git a/nexus/test-interface/src/lib.rs b/nexus/test-interface/src/lib.rs index 35f726740e1..d2ea6a256cc 100644 --- a/nexus/test-interface/src/lib.rs +++ b/nexus/test-interface/src/lib.rs @@ -88,6 +88,12 @@ pub trait NexusServer: Send + Sync + 'static { fn inventory_load_rx(&self) -> watch::Receiver>>; + fn sitrep_load_rx( + &self, + ) -> watch::Receiver< + Option>, + >; + fn get_http_server_external_address(&self) -> SocketAddr; fn get_http_server_techport_address(&self) -> SocketAddr; fn get_http_server_internal_address(&self) -> SocketAddr; diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs index 693aea88732..745ea7b31bc 100644 --- a/nexus/test-utils/src/nexus_test.rs +++ b/nexus/test-utils/src/nexus_test.rs @@ -200,6 +200,38 @@ impl ControlPlaneTestContext { } } + /// Wait until at least one fault management sitrep has been committed and + /// loaded. + /// + /// # Panics + /// + /// Panics if no sitrep is loaded within `timeout`. + pub async fn wait_for_at_least_one_sitrep(&self, timeout: Duration) { + let mut sitrep_rx = self.server.sitrep_load_rx(); + + match wait_for_watch_channel_condition( + &mut sitrep_rx, + async |sitrep| { + if sitrep.is_some() { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + timeout, + ) + .await + { + Ok(()) => (), + Err(poll::Error::TimedOut(elapsed)) => { + panic!("no sitrep found within {elapsed:?}"); + } + Err(poll::Error::PermanentError(())) => { + unreachable!("check can only fail via timeout") + } + } + } + pub fn internal_client(&self) -> nexus_client::Client { nexus_client::Client::new( &format!("http://{}", self.internal_client.bind_address), diff --git a/nexus/types/output/analysis_input_report_empty.out b/nexus/types/output/analysis_input_report_empty.out index f221b3960a0..a479e87d36c 100644 --- a/nexus/types/output/analysis_input_report_empty.out +++ b/nexus/types/output/analysis_input_report_empty.out @@ -4,3 +4,5 @@ parent sitrep: inventory collection: bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb no new ereports since the parent sitrep no cases copied forward + +no in-service control plane disks diff --git a/nexus/types/output/analysis_input_report_same_inv.out b/nexus/types/output/analysis_input_report_same_inv.out index 5ed6b60d151..25631755f78 100644 --- a/nexus/types/output/analysis_input_report_same_inv.out +++ b/nexus/types/output/analysis_input_report_same_inv.out @@ -5,3 +5,5 @@ inventory collection: bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb --> same collection as parent sitrep no new ereports since the parent sitrep no cases copied forward + +no in-service control plane disks diff --git a/nexus/types/output/analysis_input_report_with_cases.out b/nexus/types/output/analysis_input_report_with_cases.out index f3af0fe55ff..a1fc877ef72 100644 --- a/nexus/types/output/analysis_input_report_with_cases.out +++ b/nexus/types/output/analysis_input_report_with_cases.out @@ -22,3 +22,5 @@ cases (2 total): closed in sitrep: 55555555-5555-5555-5555-555555555555 copied forwards because these ereports haven't been marked seen yet: * ereport dddddddd-dddd-dddd-dddd-dddddddddddd:2 + +no in-service control plane disks diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index cb78bc05f6a..4c4bbfbaa6b 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -12,6 +12,7 @@ pub mod ereport; pub use ereport::{Ereport, EreportId}; pub mod case; pub use case::Case; +mod json_display; use case::AlertRequest; use chrono::{DateTime, Utc}; @@ -216,4 +217,5 @@ pub struct SitrepVersion { #[strum(serialize_all = "snake_case")] pub enum DiagnosisEngineKind { PowerShelf, + PhysicalDisk, } diff --git a/nexus/types/src/fm/analysis_reports.rs b/nexus/types/src/fm/analysis_reports.rs index 839fb7fade8..42b6c6537a4 100644 --- a/nexus/types/src/fm/analysis_reports.rs +++ b/nexus/types/src/fm/analysis_reports.rs @@ -7,8 +7,11 @@ use super::case; use super::ereport::EreportId; +use super::json_display::fmt_json_value; use iddqd::IdOrdMap; -use omicron_uuid_kinds::{CaseUuid, CollectionUuid, SitrepUuid}; +use omicron_uuid_kinds::{ + CaseUuid, CollectionUuid, PhysicalDiskUuid, SitrepUuid, +}; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; use std::collections::BTreeSet; @@ -158,85 +161,6 @@ impl DebugLog { } } -/// Recursively format a JSON value as a bulleted list entry, nesting any -/// object or array children as indented sub-bullets. -fn fmt_json_value( - f: &mut fmt::Formatter<'_>, - key: &str, - value: &serde_json::Value, - indent: usize, -) -> fmt::Result { - match value { - serde_json::Value::Object(map) => { - writeln!(f, "{:indent$}* {key}:", "")?; - for (k, v) in map { - fmt_json_value(f, k, v, indent + 2)?; - } - Ok(()) - } - serde_json::Value::Array(arr) => { - writeln!(f, "{:indent$}* {key}:", "")?; - let indent = indent + 2; - for (i, v) in arr.iter().enumerate() { - fmt_json_array_item(f, i + 1, v, indent)?; - } - Ok(()) - } - serde_json::Value::String(s) => { - writeln!(f, "{:indent$}* {key}: {s}", "") - } - serde_json::Value::Null => { - writeln!(f, "{:indent$}* {key}: ", "") - } - serde_json::Value::Bool(b) => { - writeln!(f, "{:indent$}* {key}: {b}", "") - } - serde_json::Value::Number(n) => { - writeln!(f, "{:indent$}* {key}: {n}", "") - } - } -} - -/// Format a single element of a JSON array as a numbered list item, -/// e.g. `1. value` for scalars or `1.` followed by indented children for -/// objects and nested arrays. -fn fmt_json_array_item( - f: &mut fmt::Formatter<'_>, - n: usize, - value: &serde_json::Value, - indent: usize, -) -> fmt::Result { - match value { - serde_json::Value::Object(map) => { - writeln!(f, "{:indent$}{n}.", "")?; - for (k, v) in map { - fmt_json_value(f, k, v, indent + 2)?; - } - Ok(()) - } - serde_json::Value::Array(arr) => { - writeln!(f, "{:indent$}{n}.", "")?; - let indent = indent + 2; - for (i, v) in arr.iter().enumerate() { - fmt_json_array_item(f, i + 1, v, indent)?; - } - Ok(()) - } - serde_json::Value::String(s) => { - writeln!(f, "{:indent$}{n}. {s}", "") - } - serde_json::Value::Null => { - writeln!(f, "{:indent$}{n}. ", "") - } - serde_json::Value::Bool(b) => { - writeln!(f, "{:indent$}{n}. {b}", "") - } - serde_json::Value::Number(num) => { - writeln!(f, "{:indent$}{n}. {num}", "") - } - } -} - impl LogEntry { pub fn new(event: impl ToString) -> Self { Self { event: event.to_string(), comment: None, kvs: BTreeMap::new() } @@ -305,6 +229,9 @@ pub struct InputReport { /// Cases which have closed, but which have been copied forwards as they /// contain ereports which have not yet been marked seen. pub closed_cases_copied_forward: BTreeMap, + /// All control-plane-managed physical disks visible to the diagnosis + /// engines for this analysis pass. + pub in_service_disks: BTreeSet, } #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] @@ -335,6 +262,7 @@ impl fmt::Display for InputReportMultilineDisplay<'_> { new_ereport_ids, open_cases, closed_cases_copied_forward, + in_service_disks, }, indent, } = self; @@ -435,6 +363,21 @@ impl fmt::Display for InputReportMultilineDisplay<'_> { writeln!(f, "{:indent$}no cases copied forward", "")?; } + if in_service_disks.is_empty() { + writeln!(f, "\n{:indent$}no in-service control plane disks", "")?; + } else { + writeln!( + f, + "\n{:indent$}in-service control plane disks ({} total):", + "", + in_service_disks.len() + )?; + let indent = indent + 2; + for disk_id in in_service_disks { + writeln!(f, "{:indent$}* disk {disk_id}", "")?; + } + } + Ok(()) } } @@ -519,6 +462,7 @@ mod tests { new_ereport_ids, open_cases, closed_cases_copied_forward, + in_service_disks: BTreeSet::new(), } } @@ -534,6 +478,7 @@ mod tests { new_ereport_ids: BTreeSet::new(), open_cases: BTreeMap::new(), closed_cases_copied_forward: BTreeMap::new(), + in_service_disks: BTreeSet::new(), } } @@ -552,6 +497,7 @@ mod tests { new_ereport_ids: BTreeSet::new(), open_cases: BTreeMap::new(), closed_cases_copied_forward: BTreeMap::new(), + in_service_disks: BTreeSet::new(), } } diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index 03e930e4d64..b2a2fdf3f8c 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -2,14 +2,17 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use super::json_display::fmt_json_value; use crate::alert::AlertClass; use crate::fm::DiagnosisEngineKind; use crate::fm::Ereport; use crate::fm::EreportId; use crate::support_bundle::BundleDataSelection; +use anyhow::Context; use iddqd::{IdOrdItem, IdOrdMap}; use omicron_uuid_kinds::{ - AlertUuid, CaseEreportUuid, CaseUuid, SitrepUuid, SupportBundleUuid, + AlertUuid, CaseEreportUuid, CaseUuid, FactUuid, SitrepUuid, + SupportBundleUuid, }; use serde::{Deserialize, Serialize}; use std::fmt; @@ -24,6 +27,9 @@ pub struct Case { pub ereports: IdOrdMap, pub alerts_requested: IdOrdMap, pub support_bundles_requested: IdOrdMap, + /// Diagnosis-engine-derived facts attached to this case. See + /// [`Fact`] for semantics. + pub facts: IdOrdMap, } impl Case { @@ -159,6 +165,98 @@ impl CaseEreport { } } +/// A diagnosis-engine-derived fact attached to a [`Case`]. +/// +/// Facts are **immutable**: to "update" a fact, the diagnosis engine +/// removes the old one and adds a fresh one. As long as a fact's content +/// matches the engine's current view, the same fact is carried forward +/// across sitreps unchanged. +/// +/// The `payload` shape is owned and interpreted only by the case's +/// diagnosis engine. Other engines and shared FM code must treat it as +/// opaque bytes. +/// +/// `Eq`/`PartialEq` derive over all fields, including the raw +/// `serde_json::Value` payload (object key order does not matter; number +/// representation does). This is the equality the DB round-trip test +/// needs. Engine-side comparison should go through [`Fact::payload_to`] +/// and compare typed enum values — never the raw payload. +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct Fact { + pub id: FactUuid, + /// The sitrep in which this fact was first added. Preserved + /// unchanged when the fact is carried forward into a child sitrep. + /// Debug-only. + pub created_sitrep_id: SitrepUuid, + pub payload: serde_json::Value, + pub comment: String, +} + +impl IdOrdItem for Fact { + type Key<'a> = &'a FactUuid; + fn key(&self) -> Self::Key<'_> { + &self.id + } + iddqd::id_upcast!(); +} + +impl Fact { + /// Attempt to deserialize this fact's payload as `T`. + pub fn payload_to( + &self, + ) -> anyhow::Result { + serde_json::from_value(self.payload.clone()).with_context(|| { + format!( + "failed to deserialize case fact payload: {:?}", + self.payload + ) + }) + } + + pub fn display_multiline( + &self, + indent: usize, + sitrep_id: Option, + ) -> impl fmt::Display + '_ { + struct DisplayFact<'a> { + fact: &'a Fact, + indent: usize, + sitrep_id: Option, + } + + impl fmt::Display for DisplayFact<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + const BULLET: &str = "* "; + const ADDED_IN: &str = "added in:"; + const COMMENT: &str = "comment:"; + const WIDTH: usize = const_max_len(&[ADDED_IN, COMMENT]); + + let &Self { + fact: Fact { id, created_sitrep_id, payload, comment }, + indent, + sitrep_id, + } = self; + let this_sitrep = |s| { + if Some(s) == sitrep_id { " <-- this sitrep" } else { "" } + }; + + writeln!(f, "{BULLET:>indent$}fact {id}")?; + writeln!( + f, + "{:>indent$}{ADDED_IN:indent$}{COMMENT: { ereports, alerts_requested, support_bundles_requested, + facts, }, indent, sitrep_id, @@ -286,6 +385,16 @@ impl fmt::Display for DisplayCase<'_> { } } + if !facts.is_empty() { + writeln!(f, "\n{:>indent$}facts:", "")?; + writeln!(f, "{:>indent$}------", "")?; + + let indent = indent + 2; + for fact in facts.iter() { + fact.display_multiline(indent, sitrep_id).fmt(f)?; + } + } + if !alerts_requested.is_empty() { writeln!(f, "\n{:>indent$}alerts requested:", "")?; writeln!(f, "{:>indent$}-----------------", "")?; @@ -377,8 +486,8 @@ mod tests { use crate::support_bundle::BundleDataSelection; use ereport_types::{Ena, EreportId}; use omicron_uuid_kinds::{ - AlertUuid, CaseUuid, EreporterRestartUuid, OmicronZoneUuid, SitrepUuid, - SupportBundleUuid, + AlertUuid, CaseUuid, EreporterRestartUuid, FactUuid, OmicronZoneUuid, + SitrepUuid, SupportBundleUuid, }; use std::str::FromStr; use std::sync::Arc; @@ -514,6 +623,21 @@ mod tests { }) .unwrap(); + let mut facts = IdOrdMap::new(); + facts + .insert_unique(Fact { + id: FactUuid::from_str("f00f00f0-0f00-4f00-8f00-f00f00f00f00") + .unwrap(), + created_sitrep_id, + payload: serde_json::json!({ + "kind": "FactFunLevel", + "fun_level": 3, + "wow": ["what", "a", "fun", "fact"], + }), + comment: "made-up fact for display test".to_string(), + }) + .unwrap(); + // Create the case let case = Case { id: case_id, @@ -527,6 +651,7 @@ mod tests { ereports, alerts_requested, support_bundles_requested, + facts, }; eprintln!("example case display:"); diff --git a/nexus/types/src/fm/json_display.rs b/nexus/types/src/fm/json_display.rs new file mode 100644 index 00000000000..a86b0cc650b --- /dev/null +++ b/nexus/types/src/fm/json_display.rs @@ -0,0 +1,87 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Shared helpers for rendering `serde_json::Value`s as bulleted lists in +//! `Display` impls. + +use std::fmt; + +/// Recursively format a JSON value as a bulleted list entry, nesting any +/// object or array children as indented sub-bullets. +pub(crate) fn fmt_json_value( + f: &mut fmt::Formatter<'_>, + key: &str, + value: &serde_json::Value, + indent: usize, +) -> fmt::Result { + match value { + serde_json::Value::Object(map) => { + writeln!(f, "{:indent$}* {key}:", "")?; + for (k, v) in map { + fmt_json_value(f, k, v, indent + 2)?; + } + Ok(()) + } + serde_json::Value::Array(arr) => { + writeln!(f, "{:indent$}* {key}:", "")?; + let indent = indent + 2; + for (i, v) in arr.iter().enumerate() { + fmt_json_array_item(f, i + 1, v, indent)?; + } + Ok(()) + } + serde_json::Value::String(s) => { + writeln!(f, "{:indent$}* {key}: {s}", "") + } + serde_json::Value::Null => { + writeln!(f, "{:indent$}* {key}: ", "") + } + serde_json::Value::Bool(b) => { + writeln!(f, "{:indent$}* {key}: {b}", "") + } + serde_json::Value::Number(n) => { + writeln!(f, "{:indent$}* {key}: {n}", "") + } + } +} + +/// Format a single element of a JSON array as a numbered list item, +/// e.g. `1. value` for scalars or `1.` followed by indented children for +/// objects and nested arrays. +pub(crate) fn fmt_json_array_item( + f: &mut fmt::Formatter<'_>, + n: usize, + value: &serde_json::Value, + indent: usize, +) -> fmt::Result { + match value { + serde_json::Value::Object(map) => { + writeln!(f, "{:indent$}{n}.", "")?; + for (k, v) in map { + fmt_json_value(f, k, v, indent + 2)?; + } + Ok(()) + } + serde_json::Value::Array(arr) => { + writeln!(f, "{:indent$}{n}.", "")?; + let indent = indent + 2; + for (i, v) in arr.iter().enumerate() { + fmt_json_array_item(f, i + 1, v, indent)?; + } + Ok(()) + } + serde_json::Value::String(s) => { + writeln!(f, "{:indent$}{n}. {s}", "") + } + serde_json::Value::Null => { + writeln!(f, "{:indent$}{n}. ", "") + } + serde_json::Value::Bool(b) => { + writeln!(f, "{:indent$}{n}. {b}", "") + } + serde_json::Value::Number(num) => { + writeln!(f, "{:indent$}{n}. {num}", "") + } + } +} diff --git a/nexus/types/src/in_service_disk.rs b/nexus/types/src/in_service_disk.rs new file mode 100644 index 00000000000..bc7814ce2cb --- /dev/null +++ b/nexus/types/src/in_service_disk.rs @@ -0,0 +1,41 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! "Currently in-service control-plane disks" — the executed view from +//! the `physical_disk` and `zpool` DB tables. +//! +//! This is distinct from the planned view in `BlueprintPhysicalDiskConfig`: +//! a disk is in this set only after the control plane has actually committed +//! to managing it (`physical_disk.disk_policy = 'in_service'`), not while a +//! planner is merely proposing to expunge or adopt it. Consumers that need +//! the *committed* view of which disks are part of the rack — fault +//! management diagnosers in particular — should read this rather than the +//! target blueprint. + +use crate::external_api::physical_disk::PhysicalDiskKind; +use iddqd::{IdOrdItem, id_upcast}; +use omicron_uuid_kinds::PhysicalDiskUuid; +use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::ZpoolUuid; + +/// One control-plane-managed physical disk, joined with its zpool and the +/// stable identity facts the DB already knows about it. +#[derive(Clone, Debug, PartialEq)] +pub struct InServiceDisk { + pub physical_disk_id: PhysicalDiskUuid, + pub zpool_id: ZpoolUuid, + pub sled_id: SledUuid, + pub vendor: String, + pub serial: String, + pub model: String, + pub variant: PhysicalDiskKind, +} + +impl IdOrdItem for InServiceDisk { + type Key<'a> = PhysicalDiskUuid; + fn key(&self) -> Self::Key<'_> { + self.physical_disk_id + } + id_upcast!(); +} diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index 887fb615599..1b71884590e 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -45,7 +45,7 @@ use sled_agent_types_versions::latest::inventory::SingleMeasurementInventory; use sled_agent_types_versions::latest::inventory::SledCpuFamily; use sled_agent_types_versions::latest::inventory::SledRole; use sled_agent_types_versions::latest::inventory::SvcsEnabledNotOnlineResult; -use sled_agent_types_versions::latest::inventory::ZpoolHealth; +pub use sled_agent_types_versions::latest::inventory::ZpoolHealth; use sled_hardware_types::BaseboardId; use std::collections::BTreeMap; use std::collections::BTreeSet; diff --git a/nexus/types/src/lib.rs b/nexus/types/src/lib.rs index 36e945f58eb..9a5eeb77c06 100644 --- a/nexus/types/src/lib.rs +++ b/nexus/types/src/lib.rs @@ -35,6 +35,7 @@ pub mod deployment; pub mod external_api; pub mod fm; pub mod identity; +pub mod in_service_disk; pub mod instance; pub mod internal_api; pub mod inventory; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 308379c85c5..9948d124513 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -7620,7 +7620,8 @@ ON omicron.public.fm_sitrep_history (sitrep_id); CREATE TYPE IF NOT EXISTS omicron.public.diagnosis_engine AS ENUM ( - 'power_shelf' + 'power_shelf', + 'physical_disk' ); CREATE TABLE IF NOT EXISTS omicron.public.fm_case ( @@ -7647,6 +7648,33 @@ CREATE INDEX IF NOT EXISTS lookup_fm_cases_for_sitrep ON omicron.public.fm_case (sitrep_id); +-- Per-engine "facts" attached to a case. A fact's payload is a JSONB blob +-- owned by the case's diagnosis engine (see `fm_case.de`) and opaque to +-- shared FM code. +CREATE TABLE IF NOT EXISTS omicron.public.fm_fact ( + -- Stable UUID for this fact across sitreps. + id UUID NOT NULL, + -- Sitrep this row belongs to. + sitrep_id UUID NOT NULL, + -- UUID of the case this fact attaches to. + case_id UUID NOT NULL, + -- UUID of the sitrep in which this fact was first added. Preserved + -- unchanged when the fact is carried forward into a child sitrep, so + -- this can be used to tell at a glance how long a fact has been + -- attached to its case. Debug-only. + created_sitrep_id UUID NOT NULL, + -- Engine-defined JSONB payload. Opaque to shared FM code. + payload JSONB NOT NULL, + -- Free-form, debug-only comment. + comment TEXT NOT NULL, + + PRIMARY KEY (sitrep_id, id) +); + +CREATE INDEX IF NOT EXISTS + lookup_fm_facts_for_case +ON omicron.public.fm_fact (sitrep_id, case_id); + CREATE TABLE IF NOT EXISTS omicron.public.fm_ereport_in_case ( -- ID of this association. When an ereport is assigned to a case, that -- association is assigned a UUID. These are used primarily to aid in @@ -8623,7 +8651,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '261.0.0', NULL) + (TRUE, NOW(), NOW(), '262.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/fm-disk-de-and-facts/up1.sql b/schema/crdb/fm-disk-de-and-facts/up1.sql new file mode 100644 index 00000000000..602e91c8175 --- /dev/null +++ b/schema/crdb/fm-disk-de-and-facts/up1.sql @@ -0,0 +1 @@ +ALTER TYPE omicron.public.diagnosis_engine ADD VALUE IF NOT EXISTS 'physical_disk'; diff --git a/schema/crdb/fm-disk-de-and-facts/up2.sql b/schema/crdb/fm-disk-de-and-facts/up2.sql new file mode 100644 index 00000000000..64839157e05 --- /dev/null +++ b/schema/crdb/fm-disk-de-and-facts/up2.sql @@ -0,0 +1,9 @@ +CREATE TABLE IF NOT EXISTS omicron.public.fm_fact ( + id UUID NOT NULL, + sitrep_id UUID NOT NULL, + case_id UUID NOT NULL, + created_sitrep_id UUID NOT NULL, + payload JSONB NOT NULL, + comment TEXT NOT NULL, + PRIMARY KEY (sitrep_id, id) +); diff --git a/schema/crdb/fm-disk-de-and-facts/up3.sql b/schema/crdb/fm-disk-de-and-facts/up3.sql new file mode 100644 index 00000000000..02e661dd6d9 --- /dev/null +++ b/schema/crdb/fm-disk-de-and-facts/up3.sql @@ -0,0 +1 @@ +CREATE INDEX IF NOT EXISTS lookup_fm_facts_for_case ON omicron.public.fm_fact (sitrep_id, case_id); diff --git a/schema/crdb/fm-disk-de-and-facts/up3.verify.sql b/schema/crdb/fm-disk-de-and-facts/up3.verify.sql new file mode 100644 index 00000000000..492674a12a3 --- /dev/null +++ b/schema/crdb/fm-disk-de-and-facts/up3.verify.sql @@ -0,0 +1,2 @@ +-- DO NOT EDIT. Generated by test_migration_verification_files. +SELECT CAST(IF((SELECT true WHERE EXISTS (SELECT index_name FROM omicron.crdb_internal.table_indexes WHERE descriptor_name = 'fm_fact' AND index_name = 'lookup_fm_facts_for_case')),'true','Schema change verification failed: index lookup_fm_facts_for_case on table fm_fact does not exist') AS BOOL); diff --git a/test-utils/src/dev/test_cmds.rs b/test-utils/src/dev/test_cmds.rs index f254c06726e..60919f4311f 100644 --- a/test-utils/src/dev/test_cmds.rs +++ b/test-utils/src/dev/test_cmds.rs @@ -244,6 +244,19 @@ impl<'a> Redactor<'a> { self } + /// Replace any text matching `pattern` with ``. + /// + /// More flexible than [`Self::field`], which requires a fixed + /// `namevalue` shape. Use this when the entire variable substring + /// is itself a regex (e.g., the substring may take one of several + /// alternative forms across runs). + pub fn variable_regex(&mut self, name: &str, pattern: &str) -> &mut Self { + let re = regex::Regex::new(pattern).unwrap(); + let replacement = format!("<{}_REDACTED>", name.to_uppercase()); + self.extra_regex.push((re, replacement)); + self + } + /// Redact an entire indented section. /// /// This can be used if the shape of a section might change from run to run. diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 4687e2d5710..304e122201a 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -60,6 +60,7 @@ impl_typed_uuid_kinds! { ExternalIp = {}, ExternalSubnet = {}, ExternalZpool = {}, + Fact = {}, FmdHostCase = {}, FmdResource = {}, Instance = {},