From 793b1ec96e274826fc1471dcd5e321df38b930c7 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 18 May 2026 12:19:47 -0700 Subject: [PATCH 01/15] [fm] Add simple disk diagnoser based on zpool health The first fault management diagnosis engine: opens a case for any non-Online zpool whose backing physical disk is currently in service in the control plane, and closes it on recovery or expungement. Supporting infrastructure introduced along the way: - DiagnosisEngineKind::Disk variant (Rust + DB enum) - fm_case_fact child table for per-engine state (one case has 0..N immutable facts; stable UUIDs across sitreps; participates in copy-forward + GC like other sitrep child tables) - CaseBuilder::{add_fact, remove_fact, facts} API - InServiceDisk nexus-types projection consumed by FM, populated from the existing zpool_list_all_external_batched datastore method with policy filtering done in the background task Schema migration: add-disk-de-and-facts (version 260) adds the 'disk' enum value and creates fm_case_fact. --- dev-tools/omdb/tests/successes.out | 46 +- dev-tools/omdb/tests/test_all_output.rs | 9 + nexus/db-model/src/fm/case.rs | 34 +- nexus/db-model/src/fm/diagnosis_engine.rs | 3 + nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-queries/src/db/datastore/fm.rs | 110 +++ nexus/db-schema/src/schema.rs | 12 + nexus/fm/src/analysis_input.rs | 51 +- nexus/fm/src/builder/case.rs | 53 +- nexus/fm/src/builder/rng.rs | 18 +- nexus/fm/src/diagnosis/disk.rs | 705 ++++++++++++++++++ .../fm/src/{diagnosis.rs => diagnosis/mod.rs} | 26 +- nexus/src/app/background/tasks/fm_analysis.rs | 48 +- .../src/app/background/tasks/fm_rendezvous.rs | 7 + nexus/types/src/fm.rs | 1 + nexus/types/src/fm/case.rs | 67 +- nexus/types/src/in_service_disk.rs | 41 + nexus/types/src/inventory.rs | 2 +- nexus/types/src/lib.rs | 1 + schema/crdb/add-disk-de-and-facts/up1.sql | 1 + schema/crdb/add-disk-de-and-facts/up2.sql | 8 + schema/crdb/add-disk-de-and-facts/up3.sql | 1 + .../crdb/add-disk-de-and-facts/up3.verify.sql | 2 + schema/crdb/dbinit.sql | 27 +- test-utils/src/dev/test_cmds.rs | 17 + uuid-kinds/src/lib.rs | 1 + 26 files changed, 1246 insertions(+), 48 deletions(-) create mode 100644 nexus/fm/src/diagnosis/disk.rs rename nexus/fm/src/{diagnosis.rs => diagnosis/mod.rs} (59%) create mode 100644 nexus/types/src/in_service_disk.rs create mode 100644 schema/crdb/add-disk-de-and-facts/up1.sql create mode 100644 schema/crdb/add-disk-de-and-facts/up2.sql create mode 100644 schema/crdb/add-disk-de-and-facts/up3.sql create mode 100644 schema/crdb/add-disk-de-and-facts/up3.verify.sql diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index daa911498d3..23a1406c802 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -700,17 +700,18 @@ task: "fm_analysis" configured period: every m last completed activation: , triggered by started at (s ago) and ran for ms - parent sitrep ID: None + parent sitrep ID: Some(..................... (sitrep)) current inventory collection ID: Some(..................... (collection)) ereport classes consumed: (none) FAULT MANAGEMENT ANALYSIS SUMMARY ================================= -/!\ analysis failed: FM analysis is not yet implemented + no changes from the current situation report (Some(..................... (sitrep))) fault management analysis inputs -------------------------------- - parent sitrep: + parent sitrep: ..................... inventory collection: ..................... + no new ereports since the parent sitrep no cases copied forward @@ -724,28 +725,28 @@ task: "fm_rendezvous" configured period: every m last completed activation: , triggered by started at (s ago) and ran for ms -(i) no FM situation report loaded, so rendezvous was not performed + current sitrep: ..................... creating requested alerts: -(i) note: this operation was not executed + started at (s ago) and ran for ms alerts requested: 0 requested in this sitrep: 0 created in this activation: 0 already created: 0 errors: 0 creating requested support bundles: -(i) note: this operation was not executed + started at (s ago) and ran for ms support bundles requested: 0 requested in this sitrep: 0 created in this activation: 0 already created: 0 errors: 0 marking ereports as seen: -(i) note: this operation was not executed + started at (s ago) and ran for ms total ereports in sitrep: 0 not marked when the sitrep was loaded: 0 marked seen by this activation: 0 already marked seen: 0 - batch size: 0 + batch size: 1000 batches: 0 errors: 0 @@ -760,6 +761,8 @@ task: "fm_sitrep_gc" batches: 1 orphaned fm_case rows deleted: 0 batches: 1 + orphaned fm_case_fact rows deleted: 0 + batches: 1 orphaned fm_ereport_in_case rows deleted: 0 batches: 1 orphaned fm_support_bundle_request rows deleted: 0 @@ -775,7 +778,9 @@ task: "fm_sitrep_loader" configured period: every s last completed activation: , triggered by started at (s ago) and ran for ms - no FM situation report available to load + loaded latest FM situation report as of : + sitrep ..................... (sitrep) (v1) + made current at: task: "instance_reincarnation" configured period: every m @@ -1391,17 +1396,18 @@ task: "fm_analysis" configured period: every m last completed activation: , triggered by started at (s ago) and ran for ms - parent sitrep ID: None + parent sitrep ID: Some(..................... (sitrep)) current inventory collection ID: Some(..................... (collection)) ereport classes consumed: (none) FAULT MANAGEMENT ANALYSIS SUMMARY ================================= -/!\ analysis failed: FM analysis is not yet implemented + no changes from the current situation report (Some(..................... (sitrep))) fault management analysis inputs -------------------------------- - parent sitrep: + parent sitrep: ..................... inventory collection: ..................... + no new ereports since the parent sitrep no cases copied forward @@ -1415,28 +1421,28 @@ task: "fm_rendezvous" configured period: every m last completed activation: , triggered by started at (s ago) and ran for ms -(i) no FM situation report loaded, so rendezvous was not performed + current sitrep: ..................... creating requested alerts: -(i) note: this operation was not executed + started at (s ago) and ran for ms alerts requested: 0 requested in this sitrep: 0 created in this activation: 0 already created: 0 errors: 0 creating requested support bundles: -(i) note: this operation was not executed + started at (s ago) and ran for ms support bundles requested: 0 requested in this sitrep: 0 created in this activation: 0 already created: 0 errors: 0 marking ereports as seen: -(i) note: this operation was not executed + started at (s ago) and ran for ms total ereports in sitrep: 0 not marked when the sitrep was loaded: 0 marked seen by this activation: 0 already marked seen: 0 - batch size: 0 + batch size: 1000 batches: 0 errors: 0 @@ -1451,6 +1457,8 @@ task: "fm_sitrep_gc" batches: 1 orphaned fm_case rows deleted: 0 batches: 1 + orphaned fm_case_fact rows deleted: 0 + batches: 1 orphaned fm_ereport_in_case rows deleted: 0 batches: 1 orphaned fm_support_bundle_request rows deleted: 0 @@ -1466,7 +1474,9 @@ task: "fm_sitrep_loader" configured period: every s last completed activation: , triggered by started at (s ago) and ran for ms - no FM situation report available to load + loaded latest FM situation report as of : + sitrep ..................... (sitrep) (v1) + made current at: task: "instance_reincarnation" configured period: every m diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index b787dceefd9..ba91844d890 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -358,6 +358,15 @@ async fn test_omdb_success_cases() { .field("triggered by", r"[\w ]+") .section(&["task: \"tuf_artifact_replication\"", "request ringbuf:"]); + // The `fm_analysis` task's input report includes a line comparing the + // current inventory collection against the parent sitrep's collection, + // which can be either "same" or "different" depending on whether a new + // inventory was collected between sitreps. Collapse both forms. + redactor.variable_regex( + "fm_input_inv_comparison", + r" --> (same collection as parent sitrep|different from parent sitrep \(collection [-a-f0-9]+\))", + ); + // The `sp_ereport_ingester` task's output depends on how many simulated // sled agents ahppen to register with Nexus before its first execution. // These redactions work around the issue described in diff --git a/nexus/db-model/src/fm/case.rs b/nexus/db-model/src/fm/case.rs index 04776f99539..1217454bfb1 100644 --- a/nexus/db-model/src/fm/case.rs +++ b/nexus/db-model/src/fm/case.rs @@ -9,10 +9,10 @@ use super::DiagnosisEngine; use super::SupportBundleRequest; use crate::DbTypedUuid; use crate::ereport; -use nexus_db_schema::schema::{fm_case, fm_ereport_in_case}; +use nexus_db_schema::schema::{fm_case, fm_case_fact, fm_ereport_in_case}; use nexus_types::fm; use omicron_uuid_kinds::{ - CaseEreportKind, CaseKind, EreporterRestartKind, SitrepKind, + CaseEreportKind, CaseFactKind, CaseKind, EreporterRestartKind, SitrepKind, }; /// Metadata describing a fault management case. @@ -64,6 +64,7 @@ impl CaseMetadata { alerts_requested: _, support_bundles_requested: _, ereports: _, + facts: _, } = case; Self { sitrep_id: sitrep_id.into(), @@ -76,6 +77,35 @@ impl CaseMetadata { } } +/// Diesel row for the `fm_case_fact` table. See +/// [`nexus_types::fm::case::CaseFact`] for semantics. +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_case_fact)] +pub struct CaseFact { + pub id: DbTypedUuid, + pub sitrep_id: DbTypedUuid, + pub case_id: DbTypedUuid, + pub payload: serde_json::Value, + pub comment: String, +} + +impl CaseFact { + pub fn from_sitrep( + sitrep_id: impl Into>, + case_id: impl Into>, + fact: &fm::case::CaseFact, + ) -> Self { + let fm::case::CaseFact { id, payload, comment } = fact; + Self { + id: (*id).into(), + sitrep_id: sitrep_id.into(), + case_id: case_id.into(), + payload: payload.clone(), + comment: comment.clone(), + } + } +} + /// An association between an ereport and a case. #[derive(Queryable, Insertable, Clone, Debug, Selectable)] #[diesel(table_name = fm_ereport_in_case)] diff --git a/nexus/db-model/src/fm/diagnosis_engine.rs b/nexus/db-model/src/fm/diagnosis_engine.rs index 7d354142bbb..ab3052339e8 100644 --- a/nexus/db-model/src/fm/diagnosis_engine.rs +++ b/nexus/db-model/src/fm/diagnosis_engine.rs @@ -24,6 +24,7 @@ impl_enum_type!( pub enum DiagnosisEngine; PowerShelf => b"power_shelf" + Disk => b"disk" ); @@ -31,6 +32,7 @@ impl From for fm::DiagnosisEngineKind { fn from(de: DiagnosisEngine) -> Self { match de { DiagnosisEngine::PowerShelf => fm::DiagnosisEngineKind::PowerShelf, + DiagnosisEngine::Disk => fm::DiagnosisEngineKind::Disk, } } } @@ -39,6 +41,7 @@ impl From for DiagnosisEngine { fn from(fm_de: fm::DiagnosisEngineKind) -> Self { match fm_de { fm::DiagnosisEngineKind::PowerShelf => DiagnosisEngine::PowerShelf, + fm::DiagnosisEngineKind::Disk => DiagnosisEngine::Disk, } } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 23ad6b94e3c..bc3057784d7 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(259, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(260, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(260, "add-disk-de-and-facts"), KnownVersion::new(259, "vmm-failure-reason"), KnownVersion::new(258, "lookup-unmarked-ereports-by-class"), KnownVersion::new(257, "add-disk-adoption-requests"), diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 1552d463dfb..f600005c9e7 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -33,6 +33,7 @@ use nexus_db_lookup::DbConnection; use nexus_db_schema::schema::ereport::dsl as ereport_dsl; use nexus_db_schema::schema::fm_alert_request::dsl as alert_req_dsl; use nexus_db_schema::schema::fm_case::dsl as case_dsl; +use nexus_db_schema::schema::fm_case_fact::dsl as case_fact_dsl; use nexus_db_schema::schema::fm_ereport_in_case::dsl as case_ereport_dsl; use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; use nexus_db_schema::schema::fm_sitrep_history::dsl as history_dsl; @@ -46,6 +47,8 @@ use omicron_common::api::external::ListResultVec; use omicron_uuid_kinds::AlertKind; use omicron_uuid_kinds::AlertUuid; use omicron_uuid_kinds::CaseEreportKind; +use omicron_uuid_kinds::CaseFactKind; +use omicron_uuid_kinds::CaseFactUuid; use omicron_uuid_kinds::CaseKind; use omicron_uuid_kinds::CaseUuid; use omicron_uuid_kinds::GenericUuid; @@ -121,6 +124,7 @@ sitrep_child_tables! { SupportBundleRequestDataSelectionEreports => { table: "fm_support_bundle_request_data_selection_ereports" }, SupportBundleRequest => { table: "fm_support_bundle_request" }, Case => { table: "fm_case" }, + CaseFact => { table: "fm_case_fact" }, } /// Per-child-table statistics from a single GC pass. @@ -366,6 +370,8 @@ impl DataStore { let mut support_bundle_requests = self.support_bundle_requests_read_on_conn(id, conn).await?; + let mut case_facts = self.fm_case_facts_read_on_conn(id, conn).await?; + // Next, load the case metadata entries and marry them to the sets of // ereports, alert requests, and support bundle requests for those // cases that we loaded in the previous steps. @@ -408,6 +414,7 @@ impl DataStore { alert_requests.remove(&id).unwrap_or_default(); let support_bundles_requested = support_bundle_requests.remove(&id).unwrap_or_default(); + let facts = case_facts.remove(&id).unwrap_or_default(); fm::Case { id, metadata: fm::case::Metadata { @@ -419,6 +426,7 @@ impl DataStore { alerts_requested, ereports, support_bundles_requested, + facts, } })); } @@ -497,6 +505,61 @@ impl DataStore { Ok(by_case) } + /// Fetch all `fm_case_fact` rows belonging to cases in the given sitrep, + /// grouped by `case_id`. + async fn fm_case_facts_read_on_conn( + &self, + id: SitrepUuid, + conn: &async_bb8_diesel::Connection, + ) -> Result>, Error> + { + let mut by_case = + HashMap::>::new(); + + let mut paginator: Paginator> = + Paginator::new(SQL_BATCH_SIZE, PaginationOrder::Descending); + while let Some(p) = paginator.next() { + let batch = paginated( + case_fact_dsl::fm_case_fact, + case_fact_dsl::id, + &p.current_pagparams(), + ) + .filter(case_fact_dsl::sitrep_id.eq(id.into_untyped_uuid())) + .select(model::fm::CaseFact::as_select()) + .load_async(conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to load case facts") + })?; + + paginator = p.found_batch(&batch, &|f| f.id); + for fact in batch { + let case_id: CaseUuid = fact.case_id.into(); + let id: CaseFactUuid = fact.id.into(); + by_case + .entry(case_id) + .or_default() + .insert_unique(fm::case::CaseFact { + id, + payload: fact.payload, + comment: fact.comment, + }) + .map_err(|_| { + let internal_message = format!( + "encountered multiple case facts for case \ + {case_id} with the same fact UUID {id}. this \ + should really not be possible, as the fact \ + UUID is a primary key!", + ); + Error::InternalError { internal_message } + })?; + } + } + + Ok(by_case) + } + /// Fetch all support bundle requests belonging to cases in the given /// sitrep, including their child data selection tables. async fn support_bundle_requests_read_on_conn( @@ -740,6 +803,7 @@ impl DataStore { let mut support_bundles_requested = Vec::new(); let mut bundle_data_selections_requested = Vec::new(); let mut case_ereports = Vec::new(); + let mut case_facts = Vec::new(); for case in sitrep.cases { let case_id = case.id; cases.push(model::fm::CaseMetadata::from_sitrep(sitrep_id, &case)); @@ -763,6 +827,11 @@ impl DataStore { ); bundle_data_selections_requested.push((req_id, data_selection)); } + for fact in case.facts.iter() { + case_facts.push(model::fm::CaseFact::from_sitrep( + sitrep_id, case_id, fact, + )); + } } if !case_ereports.is_empty() { @@ -797,6 +866,17 @@ impl DataStore { ) .await?; + if !case_facts.is_empty() { + diesel::insert_into(case_fact_dsl::fm_case_fact) + .values(case_facts) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to insert case facts") + })?; + } + if !cases.is_empty() { diesel::insert_into(case_dsl::fm_case) .values(cases) @@ -1138,6 +1218,7 @@ impl DataStore { alert_requests_deleted: usize, support_bundle_requests_deleted: usize, cases_deleted: usize, + case_facts_deleted: usize, } let err = OptionalError::new(); @@ -1147,6 +1228,7 @@ impl DataStore { alert_requests_deleted, support_bundle_requests_deleted, cases_deleted, + case_facts_deleted, } = self // Sitrep deletion is transactional to prevent a sitrep from being // left in a partially-deleted state should the Nexus instance @@ -1190,6 +1272,15 @@ impl DataStore { Self::support_bundle_requests_delete_on_conn(&conn, ids.clone()) .await?; + // Delete case facts. + let case_facts_deleted = diesel::delete( + case_fact_dsl::fm_case_fact.filter( + case_fact_dsl::sitrep_id.eq_any(ids.clone()), + ), + ) + .execute_async(&conn) + .await?; + // Delete case metadata records. let cases_deleted = diesel::delete( case_dsl::fm_case @@ -1212,6 +1303,7 @@ impl DataStore { alert_requests_deleted, support_bundle_requests_deleted, case_ereports_deleted, + case_facts_deleted, }) } }) @@ -1230,6 +1322,7 @@ impl DataStore { "case_ereports_deleted" => case_ereports_deleted, "alert_requests_deleted" => alert_requests_deleted, "support_bundle_requests_deleted" => support_bundle_requests_deleted, + "case_facts_deleted" => case_facts_deleted, ); Ok(sitreps_deleted) @@ -2037,6 +2130,7 @@ mod tests { ereports, alerts_requested, support_bundles_requested, + facts, } = case; let case_id = id; let Some(expected) = this.cases.get(&case_id) else { @@ -2068,6 +2162,7 @@ mod tests { &expected.metadata.de, de, "while checking case {case_id}" ); + assert_eq!(&expected.facts, facts, "while checking case {case_id}"); // Now, check that all the ereports are present in both cases. assert_eq!(ereports.len(), expected.ereports.len()); @@ -2241,6 +2336,18 @@ mod tests { .unwrap(); } + let mut facts = iddqd::IdOrdMap::new(); + facts + .insert_unique(fm::case::CaseFact { + id: CaseFactUuid::new_v4(), + payload: serde_json::json!({ + "kind": "representative_fact", + "note": "for round-trip testing", + }), + comment: "a representative fact for case 1".to_string(), + }) + .unwrap(); + fm::Case { id: omicron_uuid_kinds::CaseUuid::new_v4(), metadata: fm::case::Metadata { @@ -2252,6 +2359,7 @@ mod tests { ereports, alerts_requested, support_bundles_requested, + facts, } }; @@ -2288,6 +2396,7 @@ mod tests { ereports, alerts_requested, support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), } }; @@ -2406,6 +2515,7 @@ mod tests { ereports: iddqd::IdOrdMap::new(), alerts_requested: iddqd::IdOrdMap::new(), support_bundles_requested, + facts: iddqd::IdOrdMap::new(), }; let mut cases = iddqd::IdOrdMap::new(); diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index a9b8c880db1..19119861c92 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -3244,6 +3244,16 @@ table! { } } +table! { + fm_case_fact (sitrep_id, id) { + id -> Uuid, + sitrep_id -> Uuid, + case_id -> Uuid, + payload -> Jsonb, + comment -> Text, + } +} + table! { fm_ereport_in_case (sitrep_id, id) { id -> Uuid, @@ -3259,6 +3269,8 @@ table! { allow_tables_to_appear_in_same_query!(fm_ereport_in_case, ereport); allow_tables_to_appear_in_same_query!(fm_sitrep, fm_case); +allow_tables_to_appear_in_same_query!(fm_sitrep, fm_case_fact); +allow_tables_to_appear_in_same_query!(fm_case, fm_case_fact); table! { fm_alert_request (sitrep_id, id) { diff --git a/nexus/fm/src/analysis_input.rs b/nexus/fm/src/analysis_input.rs index 735cf92cef6..2698fdf5c5e 100644 --- a/nexus/fm/src/analysis_input.rs +++ b/nexus/fm/src/analysis_input.rs @@ -8,6 +8,7 @@ use chrono::{DateTime, Utc}; use iddqd::IdOrdMap; use nexus_types::fm::analysis_reports::ClosedCaseReport; use nexus_types::fm::{self, Sitrep, SitrepVersion}; +use nexus_types::in_service_disk::InServiceDisk; use nexus_types::inventory; use omicron_uuid_kinds::CollectionUuid; use std::collections::BTreeMap; @@ -24,8 +25,8 @@ pub use nexus_types::fm::analysis_reports::InputReport as Report; /// - The current [inventory collection](Input::inventory) /// - Any [new ereports](Input::new_ereports) which were received since when /// the parent sitrep was produced -/// - The set of [cases](Input::cases) which must be copied forwards into -/// the next sitrep +/// - The set of [open cases](Input::open_cases) which must be copied +/// forwards into the next sitrep /// /// This type represents the outputs of the analysis preparation phase. Once it /// is constructed, the inputs are immutable and cannot be modified. To @@ -39,6 +40,8 @@ pub struct Input { new_ereports: IdOrdMap, open_cases: IdOrdMap, closed_cases_copied_forward: IdOrdMap, + /// All control plane managed disks + in_service_disks: Arc>, } impl Input { @@ -54,7 +57,10 @@ impl Input { &self.new_ereports } - pub fn cases(&self) -> &IdOrdMap { + /// Open cases from the parent sitrep, copied forward into this analysis + /// input. Closed cases live separately on the (crate-private) + /// `closed_cases_copied_forward` accessor. + pub fn open_cases(&self) -> &IdOrdMap { &self.open_cases } @@ -62,11 +68,19 @@ impl Input { &self.closed_cases_copied_forward } + /// All control-plane-managed disks (`physical_disk.disk_policy = + /// in_service` in the DB), indexed by `physical_disk_id`. See the + /// field-level documentation on `Input::in_service_disks` for semantics. + pub fn in_service_disks(&self) -> &IdOrdMap { + &self.in_service_disks + } + /// Returns a [`Builder`] for constructing a new `Input` from the provided - /// `parent_sitrep` and inventory collection. + /// `parent_sitrep`, inventory collection, and in-service disks. pub fn builder( parent_sitrep: Option>, inv: Arc, + in_service_disks: Arc>, ) -> Result { // Before preparing analysis inputs, check that the proposed input // inventory collection is at least as new as the parent sitrep's @@ -91,6 +105,7 @@ impl Input { Ok(Builder { parent_sitrep, inv, + in_service_disks, new_ereports: IdOrdMap::default(), unmarked_seen_ereports: BTreeSet::default(), }) @@ -114,6 +129,7 @@ pub enum InvalidInputs { pub struct Builder { parent_sitrep: Option>, inv: Arc, + in_service_disks: Arc>, /// Ereports which are new and should be input to analysis in the next /// sitrep. new_ereports: IdOrdMap, @@ -231,6 +247,7 @@ impl Builder { new_ereports: self.new_ereports, open_cases, closed_cases_copied_forward, + in_service_disks: self.in_service_disks, }; (input, report) @@ -343,6 +360,7 @@ mod tests { .collect(), alerts_requested: Default::default(), support_bundles_requested: Default::default(), + facts: Default::default(), } }; let open_case2 = { @@ -363,6 +381,7 @@ mod tests { .collect(), alerts_requested: Default::default(), support_bundles_requested: Default::default(), + facts: Default::default(), } }; let closed_case_with_unmarked = { @@ -390,6 +409,7 @@ mod tests { .collect(), alerts_requested: Default::default(), support_bundles_requested: Default::default(), + facts: Default::default(), } }; let closed_case_without_unmarked = { @@ -411,6 +431,7 @@ mod tests { .collect(), alerts_requested: Default::default(), support_bundles_requested: Default::default(), + facts: Default::default(), } }; @@ -460,8 +481,12 @@ mod tests { // Build analysis input let (input, report) = { - let mut builder = Input::builder(Some(parent_sitrep), inv) - .expect("collection start time check should always pass"); + let mut builder = Input::builder( + Some(parent_sitrep), + inv, + Arc::new(IdOrdMap::new()), + ) + .expect("collection start time check should always pass"); // Pass in four ereports: // - two that are in the open cases of the parent sitrep // - one that is in the (to-be-copied-forward) closed case @@ -531,14 +556,18 @@ mod tests { // Check the contents of open cases. assert!( - input.cases().contains_key(&open_case1_id), - "open case 1 from the parent sitrep should be in input.cases()" + input.open_cases().contains_key(&open_case1_id), + "open case 1 from the parent sitrep should be in input.open_cases()" ); assert!( - input.cases().contains_key(&open_case2_id), - "open case 2 from the parent sitrep should be in input.cases()" + input.open_cases().contains_key(&open_case2_id), + "open case 2 from the parent sitrep should be in input.open_cases()" + ); + assert_eq!( + input.open_cases().len(), + 2, + "exactly two cases should be open" ); - assert_eq!(input.cases().len(), 2, "exactly two cases should be open"); // Start building a sitrep... let mut sitrep_builder = diff --git a/nexus/fm/src/builder/case.rs b/nexus/fm/src/builder/case.rs index d7168bef8f1..a5bd5222522 100644 --- a/nexus/fm/src/builder/case.rs +++ b/nexus/fm/src/builder/case.rs @@ -9,6 +9,7 @@ use iddqd::id_ord_map::{self, IdOrdMap}; use nexus_types::alert::AlertClass; use nexus_types::fm; use nexus_types::support_bundle::BundleDataSelection; +use omicron_uuid_kinds::CaseFactUuid; use omicron_uuid_kinds::CaseUuid; use omicron_uuid_kinds::SitrepUuid; use std::sync::Arc; @@ -38,7 +39,7 @@ impl AllCases { mut rng: rng::SitrepBuilderRng, ) -> Self { let cases = inputs - .cases() + .open_cases() .iter() .map(|case| { let rng = rng::CaseBuilderRng::new(case.id, &mut rng); @@ -76,6 +77,7 @@ impl AllCases { ereports: Default::default(), alerts_requested: Default::default(), support_bundles_requested: Default::default(), + facts: Default::default(), }; let mut builder = CaseBuilder::new(&self.log, sitrep_id, case, case_rng); @@ -215,6 +217,55 @@ impl CaseBuilder { self.report_log.entry("case closed").comment(comment); } + /// Replace this case's free-form comment string. + pub fn set_comment(&mut self, comment: impl ToString) { + self.case.metadata.comment = comment.to_string(); + } + + /// Emit a new fact under this case. The fact's UUID is freshly + /// allocated from the case's deterministic RNG. + pub fn add_fact( + &mut self, + fact: &T, + comment: impl ToString, + ) -> anyhow::Result { + let id = loop { + let id = self.rng.next_fact(); + if !self.case.facts.contains_key(&id) { + break id; + } + }; + let payload = serde_json::to_value(fact) + .context("failed to serialize case fact payload")?; + let comment = comment.to_string(); + let fact = fm::case::CaseFact { id, payload, comment: comment.clone() }; + self.case.facts.insert_unique(fact).expect("UUID should be unused"); + + slog::info!( + &self.log, + "added a fact"; + "fact_id" => %id, + "comment" => %comment, + ); + self.report_log.entry("added fact").kv("fact_id", id).comment(comment); + Ok(id) + } + + /// Remove a fact from this case. The fact will not be carried forward + /// into the next sitrep. + pub fn remove_fact(&mut self, id: CaseFactUuid) { + if self.case.facts.remove(&id).is_some() { + slog::info!(&self.log, "removed a fact"; "fact_id" => %id); + self.report_log.entry("removed fact").kv("fact_id", id); + } + } + + /// Iterate the facts currently attached to this case (including any that + /// were carried forward from the parent sitrep). + pub fn facts(&self) -> impl Iterator { + self.case.facts.iter() + } + pub fn add_ereport( &mut self, report: &Arc, diff --git a/nexus/fm/src/builder/rng.rs b/nexus/fm/src/builder/rng.rs index 5490ef67ae9..01cd04bd7d7 100644 --- a/nexus/fm/src/builder/rng.rs +++ b/nexus/fm/src/builder/rng.rs @@ -12,6 +12,8 @@ use omicron_uuid_kinds::AlertKind; use omicron_uuid_kinds::AlertUuid; use omicron_uuid_kinds::CaseEreportKind; use omicron_uuid_kinds::CaseEreportUuid; +use omicron_uuid_kinds::CaseFactKind; +use omicron_uuid_kinds::CaseFactUuid; use omicron_uuid_kinds::CaseKind; use omicron_uuid_kinds::CaseUuid; use omicron_uuid_kinds::SitrepUuid; @@ -59,11 +61,17 @@ impl SitrepBuilderRng { } } +/// Per-case child RNGs. Each `next_*` returns the next deterministic UUID +/// in its stream; collisions across calls are statistically impossible, but +/// callers in `builder/case.rs` still guard inserts with a `contains_key` +/// loop so that a future change here (e.g., reseeding mid-build) can't +/// silently corrupt a case. #[derive(Clone, Debug)] pub(super) struct CaseBuilderRng { ereport_assignment_rng: TypedUuidRng, alert_rng: TypedUuidRng, support_bundle_rng: TypedUuidRng, + fact_rng: TypedUuidRng, } impl CaseBuilderRng { @@ -85,7 +93,11 @@ impl CaseBuilderRng { &mut sitrep.parent, (case_id, "support-bundle"), ); - Self { alert_rng, ereport_assignment_rng, support_bundle_rng } + let fact_rng = TypedUuidRng::from_parent_rng( + &mut sitrep.parent, + (case_id, "case-fact"), + ); + Self { alert_rng, ereport_assignment_rng, support_bundle_rng, fact_rng } } pub(super) fn next_alert(&mut self) -> AlertUuid { @@ -99,4 +111,8 @@ impl CaseBuilderRng { pub(super) fn next_support_bundle(&mut self) -> SupportBundleUuid { self.support_bundle_rng.next() } + + pub(super) fn next_fact(&mut self) -> CaseFactUuid { + self.fact_rng.next() + } } diff --git a/nexus/fm/src/diagnosis/disk.rs b/nexus/fm/src/diagnosis/disk.rs new file mode 100644 index 00000000000..3d0ed5f8a10 --- /dev/null +++ b/nexus/fm/src/diagnosis/disk.rs @@ -0,0 +1,705 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Disk diagnosis engine. + +use crate::SitrepBuilder; +use crate::analysis_input::Input; +use nexus_types::fm::DiagnosisEngineKind; +use nexus_types::in_service_disk::InServiceDisk; +use nexus_types::inventory::ZpoolHealth; +use omicron_uuid_kinds::{CaseFactUuid, CaseUuid, ZpoolUuid}; +use serde::{Deserialize, Serialize}; +use slog_error_chain::InlineErrorChain; +use std::collections::BTreeMap; + +/// Per-fact state for the disk diagnosis engine, serialized into the +/// `fm_case_fact.payload` JSONB column. Other diagnosis engines must not +/// inspect or modify this; shared FM code treats it as opaque bytes. +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum DiskFact { + /// The zpool's most recently observed health is non-`Online`. + ZpoolUnhealthy { zpool_id: ZpoolUuid, last_seen_health: ZpoolHealth }, +} + +/// Per-case summary built from a case's parent-forwarded facts. +struct ParentCaseSummary { + /// IDs of `ZpoolUnhealthy` facts on this case, grouped by their + /// recorded zpool. There can be multiple facts in pathological cases + /// (e.g., two zpool ids on the same case after a hand-edit); the + /// diagnoser keeps all of them in its accounting. + zpool_unhealthy: BTreeMap>, +} + +pub(super) fn analyze( + input: &Input, + builder: &mut SitrepBuilder<'_>, +) -> anyhow::Result<()> { + // The disk DE's primary key today is `zpool_id`, so we build a local + // index keyed by zpool. Future variants of `DiskFact` are welcome to + // derive their own secondary indices (e.g., by `sled_id` for FMD). + let in_service_by_zpool: BTreeMap = + input.in_service_disks().iter().map(|d| (d.zpool_id, d)).collect(); + + // Index every zpool we observed in this inventory by ID, so we can + // distinguish "saw it, it's Online" from "didn't see it at all" below. + let observed: BTreeMap = input + .inventory() + .sled_agents + .iter() + .flat_map(|sa| sa.zpools.iter()) + .map(|z| (z.id, z.health)) + .collect(); + + // Currently-faulty, control-plane-managed zpools. + // + // Out-of-service zpools are intentionally ignored: a non-`Online` zpool + // whose disk has been expunged is no longer the control plane's concern. + let faulty: BTreeMap = observed + .iter() + .filter(|(id, _)| in_service_by_zpool.contains_key(*id)) + .filter(|(_, h)| **h != ZpoolHealth::Online) + .map(|(id, h)| (*id, *h)) + .collect(); + + // Inspect parent-forwarded Disk cases from the input (i.e., the state + // copied from the parent sitrep — *not* the in-progress builder, which + // we will mutate below). Each case's facts are JSON blobs owned by this + // engine; deserialize each one as DiskFact. Skip (with a warning) any + // fact we can't read. + let parent_summaries: BTreeMap = input + .open_cases() + .iter() + .filter(|c| c.metadata.de == DiagnosisEngineKind::Disk) + .map(|c| { + let case_id = c.id; + let mut summary = + ParentCaseSummary { zpool_unhealthy: BTreeMap::new() }; + for fact in c.facts.iter() { + match fact.payload_as::() { + Ok(DiskFact::ZpoolUnhealthy { + zpool_id, + last_seen_health, + }) => { + summary + .zpool_unhealthy + .entry(zpool_id) + .or_default() + .push((fact.id, last_seen_health)); + } + Err(e) => { + slog::warn!( + &builder.log, + "skipping Disk case fact with unreadable \ + payload; this run will not modify it"; + "case_id" => %case_id, + "fact_id" => %fact.id, + "error" => InlineErrorChain::new(&*e).to_string(), + ); + } + } + } + (case_id, summary) + }) + .collect(); + + // Close any open Disk case whose entire set of (interpretable) facts + // points at zpools that are now Online or expunged. Closed cases are not + // copied forward, so their facts naturally drop with them. + // + // Absence from inventory is NOT a recovery signal: a sled could be + // powered off, or inventory could be lossy. + for (case_id, summary) in &parent_summaries { + if summary.zpool_unhealthy.is_empty() { + continue; + } + let any_still_unhealthy = + summary.zpool_unhealthy.keys().any(|zpool_id| { + in_service_by_zpool.contains_key(zpool_id) + && observed.get(zpool_id) != Some(&ZpoolHealth::Online) + }); + if !any_still_unhealthy { + builder + .cases + .case_mut(case_id) + .expect("case_id came from iterating builder.cases") + .close( + "all ZpoolUnhealthy facts have resolved (zpool back to \ + Online, or disk no longer in service)", + ); + } + } + + // For each parent-forwarded ZpoolUnhealthy fact: if the recorded + // last_seen_health no longer matches current observation, remove the + // stale fact. We'll re-add a fresh one below with current data (new UUID). + for (case_id, summary) in &parent_summaries { + let mut case_mut = builder + .cases + .case_mut(case_id) + .expect("case_id came from iterating builder.cases"); + if !case_mut.is_open() { + continue; + } + for (zpool_id, facts) in &summary.zpool_unhealthy { + let Some(current_health) = faulty.get(zpool_id) else { + continue; + }; + for (fact_id, last_seen_health) in facts { + if *current_health != *last_seen_health { + case_mut.remove_fact(*fact_id); + } + } + } + } + + // For each currently-faulty in-service zpool: if there's no open case + + // accurate fact already covering it, ensure a case exists (reuse the + // parent-forwarded one for this zpool if any) and add a fresh fact. + for (zpool_id, current_health) in faulty { + let parent_for_zpool = + parent_summaries.iter().find_map(|(case_id, summary)| { + summary + .zpool_unhealthy + .get(&zpool_id) + .map(|facts| (*case_id, facts)) + }); + + let case_id_for_fact = match parent_for_zpool { + // Parent case already has an accurate fact — fully covered. + Some((_, facts)) + if facts.iter().any(|(_, h)| *h == current_health) => + { + continue; + } + // Parent case exists; its stale facts were removed above. + // Refresh under the same case. + Some((case_id, _)) => case_id, + // No parent case for this zpool — open one. + None => { + let mut new_case = + builder.cases.open_case(DiagnosisEngineKind::Disk); + new_case.set_comment(format!("zpool {zpool_id} unhealthy")); + new_case.id + } + }; + + builder + .cases + .case_mut(&case_id_for_fact) + .expect("case_id came from this fn") + .add_fact( + &DiskFact::ZpoolUnhealthy { + zpool_id, + last_seen_health: current_health, + }, + format!("zpool {zpool_id} health={current_health}"), + )?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::analysis_input::Input; + use crate::builder::{SitrepBuilder, SitrepBuilderRng}; + use crate::test_util::FmTest; + use chrono::Utc; + use iddqd::IdOrdMap; + use nexus_types::external_api::physical_disk::PhysicalDiskKind; + use nexus_types::fm::{self, Sitrep, SitrepVersion}; + use nexus_types::inventory; + use omicron_test_utils::dev; + use omicron_uuid_kinds::{ + OmicronZoneUuid, PhysicalDiskUuid, SitrepUuid, SledUuid, + }; + use std::sync::Arc; + + /// Synthesize a synthetic in-service disk set from a list of zpool IDs. + /// Each zpool gets its own fresh `PhysicalDiskUuid` and dummy identity + /// facts — tests in this module only care about the zpool dimension. + fn mk_in_service( + zpool_ids: impl IntoIterator, + ) -> IdOrdMap { + zpool_ids + .into_iter() + .map(|zpool_id| InServiceDisk { + physical_disk_id: PhysicalDiskUuid::new_v4(), + zpool_id, + sled_id: SledUuid::new_v4(), + vendor: "test-vendor".to_string(), + serial: format!("test-serial-{zpool_id}"), + model: "test-model".to_string(), + variant: PhysicalDiskKind::U2, + }) + .collect() + } + + /// Make a synthetic test scenario from the example system: returns a + /// `LogContext` (the caller must `cleanup_successful()` it), the + /// example collection, and every zpool ID in that collection. + fn setup( + test_name: &'static str, + ) -> (dev::LogContext, inventory::Collection, Vec) { + let (fm_test, logctx) = FmTest::new_with_logctx(test_name); + // Build the example system once to get a Collection with zpools. + let (example, _bp) = fm_test.system_builder.build(); + let zpool_ids: Vec = example + .collection + .sled_agents + .iter() + .flat_map(|sa| sa.zpools.iter().map(|z| z.id)) + .collect(); + assert!( + !zpool_ids.is_empty(), + "example system should have at least one zpool" + ); + (logctx, example.collection, zpool_ids) + } + + /// Set the zpool with `zpool_id` to `health`, panicking if not found. + fn set_health( + collection: &mut inventory::Collection, + zpool_id: ZpoolUuid, + health: ZpoolHealth, + ) { + for mut sa in collection.sled_agents.iter_mut() { + for z in sa.zpools.iter_mut() { + if z.id == zpool_id { + z.health = health; + return; + } + } + } + panic!("zpool {zpool_id} not found in collection"); + } + + /// Build an `Input` from a collection, an optional parent sitrep, and a + /// pre-built set of in-service disks. + fn build_input( + collection: inventory::Collection, + parent_sitrep: Option, + in_service: IdOrdMap, + ) -> Input { + let parent = parent_sitrep.map(|s| { + Arc::new(( + SitrepVersion { + id: s.id(), + version: 0, + time_made_current: Utc::now(), + }, + s, + )) + }); + let builder = + Input::builder(parent, Arc::new(collection), Arc::new(in_service)) + .expect("input builder should accept fresh inventory"); + let (input, _report) = builder.build(); + input + } + + /// Run `disk::analyze` over an input and return the resulting Sitrep. + fn run_analyze(log: &slog::Logger, input: &Input) -> Sitrep { + let mut builder = SitrepBuilder::new_with_rng( + log, + input, + SitrepBuilderRng::from_seed("disk-analyze"), + ); + analyze(input, &mut builder).expect("analyze ok"); + let (sitrep, _report) = + builder.build(OmicronZoneUuid::new_v4(), Utc::now()); + sitrep + } + + fn make_parent_with_disk_case( + parent_sitrep_id: SitrepUuid, + inv_collection_id: omicron_uuid_kinds::CollectionUuid, + zpool_id: ZpoolUuid, + ) -> Sitrep { + let mut cases = iddqd::IdOrdMap::new(); + let case_id = omicron_uuid_kinds::CaseUuid::new_v4(); + let mut facts = iddqd::IdOrdMap::new(); + facts + .insert_unique(fm::case::CaseFact { + id: omicron_uuid_kinds::CaseFactUuid::new_v4(), + payload: serde_json::to_value(&DiskFact::ZpoolUnhealthy { + zpool_id, + last_seen_health: ZpoolHealth::Degraded, + }) + .unwrap(), + comment: format!("zpool {zpool_id} degraded"), + }) + .unwrap(); + cases + .insert_unique(fm::Case { + id: case_id, + metadata: fm::case::Metadata { + created_sitrep_id: parent_sitrep_id, + closed_sitrep_id: None, + de: DiagnosisEngineKind::Disk, + comment: format!("zpool {zpool_id} degraded"), + }, + ereports: Default::default(), + alerts_requested: Default::default(), + support_bundles_requested: Default::default(), + facts, + }) + .unwrap(); + Sitrep { + metadata: fm::SitrepMetadata { + id: parent_sitrep_id, + inv_collection_id, + creator_id: OmicronZoneUuid::new_v4(), + parent_sitrep_id: None, + time_created: Utc::now(), + next_inv_min_time_started: Utc::now(), + comment: String::new(), + }, + cases, + ereports_by_id: Default::default(), + } + } + + /// Helper: collect (case, fact) pairs in a sitrep where the fact parses + /// as `DiskFact`. Optionally filtered to open cases only. + fn disk_facts( + sitrep: &Sitrep, + open_only: bool, + ) -> Vec<(&fm::Case, &fm::case::CaseFact, DiskFact)> { + sitrep + .cases + .iter() + .filter(|c| c.metadata.de == DiagnosisEngineKind::Disk) + .filter(|c| !open_only || c.is_open()) + .flat_map(|c| { + c.facts.iter().filter_map(move |f| { + f.payload_as::().ok().map(|d| (c, f, d)) + }) + }) + .collect() + } + + #[test] + fn opens_on_degraded_in_service() { + let (logctx, mut collection, zpools) = + setup("disk_open_degraded_in_service"); + let target = zpools[0]; + set_health(&mut collection, target, ZpoolHealth::Degraded); + let in_service = mk_in_service(zpools.iter().copied()); + let input = build_input(collection, None, in_service); + + let sitrep = run_analyze(&logctx.log, &input); + let facts = disk_facts(&sitrep, true); + assert_eq!(facts.len(), 1); + match &facts[0].2 { + DiskFact::ZpoolUnhealthy { zpool_id, last_seen_health } => { + assert_eq!(*zpool_id, target); + assert_eq!(*last_seen_health, ZpoolHealth::Degraded); + } + } + logctx.cleanup_successful(); + } + + #[test] + fn skips_degraded_when_expunged() { + let (logctx, mut collection, zpools) = setup("disk_skip_expunged"); + let target = zpools[0]; + set_health(&mut collection, target, ZpoolHealth::Faulted); + // target is *not* in the in-service set. + let in_service = mk_in_service(zpools.iter().copied().skip(1)); + let input = build_input(collection, None, in_service); + + let sitrep = run_analyze(&logctx.log, &input); + let cases = disk_facts(&sitrep, false); + assert!( + cases.is_empty(), + "no Disk cases should be opened for expunged zpool, got: {:?}", + cases + ); + logctx.cleanup_successful(); + } + + #[test] + fn idempotent_when_case_already_open() { + let (logctx, mut collection, zpools) = setup("disk_idempotent"); + let target = zpools[0]; + set_health(&mut collection, target, ZpoolHealth::Degraded); + let in_service = mk_in_service(zpools.iter().copied()); + let parent_id = SitrepUuid::new_v4(); + let parent = + make_parent_with_disk_case(parent_id, collection.id, target); + + let input = build_input(collection, Some(parent), in_service); + let sitrep = run_analyze(&logctx.log, &input); + let open_cases = disk_facts(&sitrep, true); + assert_eq!(open_cases.len(), 1); + match &open_cases[0].2 { + DiskFact::ZpoolUnhealthy { zpool_id, .. } => { + assert_eq!(*zpool_id, target); + } + } + logctx.cleanup_successful(); + } + + #[test] + fn closes_on_recovery() { + let (logctx, collection, zpools) = setup("disk_close_on_recovery"); + let target = zpools[0]; + // The example system reports zpools as Online by default. + let in_service = mk_in_service(zpools.iter().copied()); + let parent_id = SitrepUuid::new_v4(); + let parent = + make_parent_with_disk_case(parent_id, collection.id, target); + + let input = build_input(collection, Some(parent), in_service); + let sitrep = run_analyze(&logctx.log, &input); + let all = disk_facts(&sitrep, false); + assert_eq!(all.len(), 1); + assert!( + !all[0].0.is_open(), + "case should be closed when zpool returns to Online", + ); + logctx.cleanup_successful(); + } + + #[test] + fn closes_on_expungement() { + let (logctx, mut collection, zpools) = + setup("disk_close_on_expungement"); + let target = zpools[0]; + set_health(&mut collection, target, ZpoolHealth::Degraded); + // Target is NOT in-service in this sitrep (just expunged). + let in_service = mk_in_service(zpools.iter().copied().skip(1)); + let parent_id = SitrepUuid::new_v4(); + let parent = + make_parent_with_disk_case(parent_id, collection.id, target); + + let input = build_input(collection, Some(parent), in_service); + let sitrep = run_analyze(&logctx.log, &input); + let all = disk_facts(&sitrep, false); + assert_eq!(all.len(), 1); + assert!( + !all[0].0.is_open(), + "case should be closed when zpool's disk is expunged", + ); + logctx.cleanup_successful(); + } + + #[test] + fn keeps_open_on_absence_from_inventory() { + // A zpool the case is about does NOT appear in the inventory at all + // (sled powered off, lossy collection, etc.). The case should stay + // open: absence is not a recovery signal. + let (logctx, collection, zpools) = setup("disk_keep_open_on_absence"); + let phantom = ZpoolUuid::new_v4(); + assert!(!zpools.contains(&phantom)); + let in_service = mk_in_service(zpools.iter().copied().chain([phantom])); + let parent_id = SitrepUuid::new_v4(); + let parent = + make_parent_with_disk_case(parent_id, collection.id, phantom); + + let input = build_input(collection, Some(parent), in_service); + let sitrep = run_analyze(&logctx.log, &input); + let all = disk_facts(&sitrep, false); + assert_eq!(all.len(), 1); + assert!( + all[0].0.is_open(), + "case should remain open when its zpool is absent from the \ + current inventory collection (sled could be down or inventory \ + is lossy)", + ); + logctx.cleanup_successful(); + } + + /// A parent Disk case carries a fact whose JSON `kind` doesn't match any + /// variant we know. The diagnoser should leave that case alone (not + /// touch its open/closed state, not mutate the fact, not count it + /// toward dedup) and freely open a new case for a separate live signal. + #[test] + fn unreadable_fact_is_skipped() { + let (logctx, mut collection, zpools) = setup("disk_unreadable_payload"); + // Live signal on zpools[0]. + let live_target = zpools[0]; + set_health(&mut collection, live_target, ZpoolHealth::Degraded); + let in_service = mk_in_service(zpools.iter().copied()); + + // Build a parent sitrep with one Disk case carrying a payload that + // doesn't match any variant. + let parent_sitrep_id = SitrepUuid::new_v4(); + let unreadable_case_id = omicron_uuid_kinds::CaseUuid::new_v4(); + let mut parent_cases = iddqd::IdOrdMap::new(); + let unreadable_fact_id = omicron_uuid_kinds::CaseFactUuid::new_v4(); + let unreadable_payload = serde_json::json!({ + "kind": "this_variant_does_not_exist", + "mystery": "data", + }); + let mut parent_facts = iddqd::IdOrdMap::new(); + parent_facts + .insert_unique(fm::case::CaseFact { + id: unreadable_fact_id, + payload: unreadable_payload.clone(), + comment: "fact with payload from the future".to_string(), + }) + .unwrap(); + parent_cases + .insert_unique(fm::Case { + id: unreadable_case_id, + metadata: fm::case::Metadata { + created_sitrep_id: parent_sitrep_id, + closed_sitrep_id: None, + de: DiagnosisEngineKind::Disk, + comment: "case with fact from the future".to_string(), + }, + ereports: Default::default(), + alerts_requested: Default::default(), + support_bundles_requested: Default::default(), + facts: parent_facts, + }) + .unwrap(); + let parent = Sitrep { + metadata: fm::SitrepMetadata { + id: parent_sitrep_id, + inv_collection_id: collection.id, + creator_id: OmicronZoneUuid::new_v4(), + parent_sitrep_id: None, + time_created: Utc::now(), + next_inv_min_time_started: Utc::now(), + comment: String::new(), + }, + cases: parent_cases, + ereports_by_id: Default::default(), + }; + + let input = build_input(collection, Some(parent), in_service); + let sitrep = run_analyze(&logctx.log, &input); + + // The unreadable case must still be present, still open, and its + // fact unchanged. + let unreadable = sitrep + .cases + .iter() + .find(|c| c.id == unreadable_case_id) + .expect("unreadable case should be copied forward"); + assert!( + unreadable.is_open(), + "unreadable case must not be closed by the diagnoser", + ); + let kept_fact = unreadable + .facts + .iter() + .find(|f| f.id == unreadable_fact_id) + .expect("unreadable fact should be preserved"); + assert_eq!(kept_fact.payload, unreadable_payload); + + // A fresh ZpoolUnhealthy fact must still open for the live signal + // — the unreadable case did not consume the dedup slot. + let readable = disk_facts(&sitrep, true); + assert_eq!( + readable.len(), + 1, + "expected one readable open ZpoolUnhealthy fact for the live \ + signal", + ); + match &readable[0].2 { + DiskFact::ZpoolUnhealthy { zpool_id, .. } => { + assert_eq!(*zpool_id, live_target); + } + } + logctx.cleanup_successful(); + } + + /// When the parent sitrep's fact content matches the diagnoser's current + /// observation, the fact carries forward with the same UUID — no + /// remove-and-readd churn. + #[test] + fn fact_uuid_stable_when_observation_unchanged() { + let (logctx, mut collection, zpools) = setup("disk_fact_uuid_stable"); + let target = zpools[0]; + set_health(&mut collection, target, ZpoolHealth::Degraded); + let in_service = mk_in_service(zpools.iter().copied()); + let parent_id = SitrepUuid::new_v4(); + let parent = + make_parent_with_disk_case(parent_id, collection.id, target); + // Capture the parent's fact UUID for the target zpool. + let parent_fact_id = parent + .cases + .iter() + .find(|c| c.metadata.de == DiagnosisEngineKind::Disk) + .expect("parent should have one Disk case") + .facts + .iter() + .next() + .expect("parent case should have one fact") + .id; + + let input = build_input(collection, Some(parent), in_service); + let sitrep = run_analyze(&logctx.log, &input); + let open = disk_facts(&sitrep, true); + assert_eq!(open.len(), 1, "expected exactly one open Disk fact"); + assert_eq!( + open[0].1.id, parent_fact_id, + "fact UUID should be stable across sitreps when the \ + observation hasn't changed", + ); + match &open[0].2 { + DiskFact::ZpoolUnhealthy { zpool_id, last_seen_health } => { + assert_eq!(*zpool_id, target); + assert_eq!(*last_seen_health, ZpoolHealth::Degraded); + } + } + logctx.cleanup_successful(); + } + + /// When the parent's fact recorded a different `last_seen_health` than + /// what we observe now, the diagnoser removes the stale fact and emits + /// a fresh one (new UUID). The case stays open because the zpool is + /// still unhealthy — just with a different value. + #[test] + fn fact_uuid_rotates_when_observation_changes() { + let (logctx, mut collection, zpools) = setup("disk_fact_uuid_rotates"); + let target = zpools[0]; + // Parent recorded Degraded; current inventory shows Faulted. + set_health(&mut collection, target, ZpoolHealth::Faulted); + let in_service = mk_in_service(zpools.iter().copied()); + let parent_id = SitrepUuid::new_v4(); + let parent = + make_parent_with_disk_case(parent_id, collection.id, target); + let parent_fact_id = parent + .cases + .iter() + .find(|c| c.metadata.de == DiagnosisEngineKind::Disk) + .expect("parent should have one Disk case") + .facts + .iter() + .next() + .expect("parent case should have one fact") + .id; + + let input = build_input(collection, Some(parent), in_service); + let sitrep = run_analyze(&logctx.log, &input); + let open = disk_facts(&sitrep, true); + assert_eq!( + open.len(), + 1, + "expected exactly one open Disk fact (the refreshed one)", + ); + assert_ne!( + open[0].1.id, parent_fact_id, + "fact UUID should rotate because last_seen_health changed", + ); + match &open[0].2 { + DiskFact::ZpoolUnhealthy { zpool_id, last_seen_health } => { + assert_eq!(*zpool_id, target); + assert_eq!(*last_seen_health, ZpoolHealth::Faulted); + } + } + // The case itself should still be the same one that was carried + // forward — only the fact rotated. + assert!(open[0].0.is_open()); + logctx.cleanup_successful(); + } +} diff --git a/nexus/fm/src/diagnosis.rs b/nexus/fm/src/diagnosis/mod.rs similarity index 59% rename from nexus/fm/src/diagnosis.rs rename to nexus/fm/src/diagnosis/mod.rs index 96ea491f1b1..266a5da0809 100644 --- a/nexus/fm/src/diagnosis.rs +++ b/nexus/fm/src/diagnosis/mod.rs @@ -2,22 +2,32 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +//! Fault management diagnosis engines. +//! +//! Each submodule defines one diagnosis engine (DE). `analyze` dispatches to +//! each engine in turn; engines are deterministic and idempotent per RFD 603, +//! so the dispatch order does not matter. + use crate::SitrepBuilder; use crate::analysis_input::Input; +mod disk; + pub fn analyze( - _input: &Input, - _builder: &mut SitrepBuilder<'_>, + input: &Input, + builder: &mut SitrepBuilder<'_>, ) -> anyhow::Result<()> { - anyhow::bail!("FM analysis is not yet implemented") + disk::analyze(input, builder)?; + Ok(()) } -/// Ereport classes that the diagnosis engine currently understands. -/// Preparation only surfaces ereports whose class is in this set — there is -/// no value in loading ereports FM analysis cannot consume. +/// Ereport classes that any diagnosis engine in this build of Nexus knows +/// how to consume. The background task uses this to filter loaded ereports +/// — there is no value in loading ereports FM analysis cannot consume. /// -/// Empty until [`analyze`] gains real handling. Grow this alongside FM -/// analysis as new classes gain support. +/// Empty today: the only enabled DE is the disk DE, which is polling-based +/// and consumes no ereports. Grow this list alongside FM analysis as new +/// classes gain ereport support. /// /// **NULL-class ereports are intentionally excluded by the loader's SQL /// filter** (`class = ANY(...)` never matches NULL). If FM analysis ever diff --git a/nexus/src/app/background/tasks/fm_analysis.rs b/nexus/src/app/background/tasks/fm_analysis.rs index 10173be4021..17cb3fddd48 100644 --- a/nexus/src/app/background/tasks/fm_analysis.rs +++ b/nexus/src/app/background/tasks/fm_analysis.rs @@ -9,11 +9,15 @@ use anyhow::Context; use chrono::Utc; use fm::analysis_input::InvalidInputs; use futures::future::BoxFuture; +use iddqd::IdOrdMap; +use nexus_db_model::PhysicalDiskPolicy; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_db_queries::db::datastore; +use nexus_db_queries::db::identity::Asset; use nexus_db_queries::db::pagination::Paginator; use nexus_fm as fm; +use nexus_types::in_service_disk::InServiceDisk; use nexus_types::internal_api::background::FmAnalysisStatus; use nexus_types::internal_api::background::fm_analysis as status; use nexus_types::inventory; @@ -217,8 +221,48 @@ impl FmAnalysis { (fm::analysis_input::Input, status::PreparationStatus), PreparationError, > { - let mut builder = - fm::analysis_input::Input::builder(parent_sitrep, inv)?; + // Load all external (U.2) zpools and project them down to FM's + // `InServiceDisk` view, filtering on `disk_policy = in_service` and a + // live (non-soft-deleted) physical_disk row. M.2 disks are not + // represented as control plane disks today, so the U.2-only filter + // on the underlying query matches reality. + // + // This is the executed view from the DB — flipped only after sagas / + // cleaners have actually drained resources, not while a planner is + // merely proposing changes. A faulty disk a planner proposes to + // expunge is still the diagnoser's concern until the control plane + // has actually moved on. + let zpools_and_disks = self + .datastore + .zpool_list_all_external_batched(opctx) + .await + .context("failed to load in-service control plane disks")?; + let mut in_service_disks_map = IdOrdMap::new(); + for (zpool, disk) in zpools_and_disks { + if disk.disk_policy != PhysicalDiskPolicy::InService { + continue; + } + in_service_disks_map + .insert_unique(InServiceDisk { + physical_disk_id: disk.id(), + zpool_id: zpool.id(), + sled_id: disk.sled_id.into(), + vendor: disk.vendor, + serial: disk.serial, + model: disk.model, + variant: disk.variant.into(), + }) + .expect( + "physical_disk.id is a primary key, so duplicates are \ + impossible", + ); + } + let in_service_disks = Arc::new(in_service_disks_map); + let mut builder = fm::analysis_input::Input::builder( + parent_sitrep, + inv, + in_service_disks, + )?; let mut errors = Vec::new(); self.load_new_ereports(opctx, &mut builder, &mut errors) .await diff --git a/nexus/src/app/background/tasks/fm_rendezvous.rs b/nexus/src/app/background/tasks/fm_rendezvous.rs index ef76c1db310..0e2696f4778 100644 --- a/nexus/src/app/background/tasks/fm_rendezvous.rs +++ b/nexus/src/app/background/tasks/fm_rendezvous.rs @@ -552,6 +552,7 @@ mod tests { alerts_requested: iddqd::IdOrdMap::new(), ereports: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), }; case1 .alerts_requested @@ -631,6 +632,7 @@ mod tests { alerts_requested: iddqd::IdOrdMap::new(), ereports: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), }; case2 .alerts_requested @@ -933,6 +935,7 @@ mod tests { ereports, alerts_requested: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), } }; @@ -1145,6 +1148,7 @@ mod tests { ereports, alerts_requested: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), } }; @@ -1254,6 +1258,7 @@ mod tests { ereports, alerts_requested: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), } }; @@ -1449,6 +1454,7 @@ mod tests { alerts_requested: iddqd::IdOrdMap::new(), ereports: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), }; case1 .support_bundles_requested @@ -1625,6 +1631,7 @@ mod tests { alerts_requested: iddqd::IdOrdMap::new(), ereports: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), + facts: iddqd::IdOrdMap::new(), }; case.support_bundles_requested .insert_unique(fm::case::SupportBundleRequest { diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index cb78bc05f6a..7f0a4c055b9 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -216,4 +216,5 @@ pub struct SitrepVersion { #[strum(serialize_all = "snake_case")] pub enum DiagnosisEngineKind { PowerShelf, + Disk, } diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index 03e930e4d64..8b36aae7e44 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -7,9 +7,11 @@ use crate::fm::DiagnosisEngineKind; use crate::fm::Ereport; use crate::fm::EreportId; use crate::support_bundle::BundleDataSelection; +use anyhow::Context; use iddqd::{IdOrdItem, IdOrdMap}; use omicron_uuid_kinds::{ - AlertUuid, CaseEreportUuid, CaseUuid, SitrepUuid, SupportBundleUuid, + AlertUuid, CaseEreportUuid, CaseFactUuid, CaseUuid, SitrepUuid, + SupportBundleUuid, }; use serde::{Deserialize, Serialize}; use std::fmt; @@ -24,6 +26,9 @@ pub struct Case { pub ereports: IdOrdMap, pub alerts_requested: IdOrdMap, pub support_bundles_requested: IdOrdMap, + /// Diagnosis-engine-derived facts attached to this case. See + /// [`CaseFact`] for semantics. + pub facts: IdOrdMap, } impl Case { @@ -159,6 +164,48 @@ impl CaseEreport { } } +/// A diagnosis-engine-derived fact attached to a [`Case`]. +/// +/// Facts are **immutable**: to "update" a fact, the diagnosis engine +/// removes the old one and adds a fresh one. As long as a fact's content +/// matches the engine's current view, the same fact is carried forward +/// across sitreps unchanged. +/// +/// The `payload` shape is owned and interpreted only by the case's +/// diagnosis engine. Other engines and shared FM code must treat it as +/// opaque bytes. +/// +/// `Eq`/`PartialEq` compare the raw `serde_json::Value` payloads, so two +/// facts are equal iff their payloads are JSON-value-equal (object key +/// order does not matter; number representation does). This is the +/// equality the DB round-trip needs. Engine-side comparison should go +/// through [`CaseFact::payload_as`] and compare typed enum values — never +/// the raw payload. +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct CaseFact { + pub id: CaseFactUuid, + pub payload: serde_json::Value, + pub comment: String, +} + +impl IdOrdItem for CaseFact { + type Key<'a> = &'a CaseFactUuid; + fn key(&self) -> Self::Key<'_> { + &self.id + } + iddqd::id_upcast!(); +} + +impl CaseFact { + /// Attempt to deserialize this fact's payload as `T`. + pub fn payload_as( + &self, + ) -> anyhow::Result { + serde_json::from_value(self.payload.clone()) + .context("failed to deserialize case fact payload") + } +} + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] pub struct AlertRequest { pub id: AlertUuid, @@ -217,6 +264,7 @@ impl fmt::Display for DisplayCase<'_> { ereports, alerts_requested, support_bundles_requested, + facts, }, indent, sitrep_id, @@ -286,6 +334,22 @@ impl fmt::Display for DisplayCase<'_> { } } + if !facts.is_empty() { + writeln!(f, "\n{:>indent$}facts:", "")?; + writeln!(f, "{:>indent$}------", "")?; + + let indent = indent + 2; + for CaseFact { id, payload, comment } in facts.iter() { + const PAYLOAD: &str = "payload:"; + const COMMENT: &str = "comment:"; + const WIDTH: usize = const_max_len(&[PAYLOAD, COMMENT]); + + writeln!(f, "{BULLET:>indent$}fact {id}")?; + writeln!(f, "{:>indent$}{PAYLOAD:indent$}{COMMENT:indent$}alerts requested:", "")?; writeln!(f, "{:>indent$}-----------------", "")?; @@ -527,6 +591,7 @@ mod tests { ereports, alerts_requested, support_bundles_requested, + facts: IdOrdMap::new(), }; eprintln!("example case display:"); diff --git a/nexus/types/src/in_service_disk.rs b/nexus/types/src/in_service_disk.rs new file mode 100644 index 00000000000..bc7814ce2cb --- /dev/null +++ b/nexus/types/src/in_service_disk.rs @@ -0,0 +1,41 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! "Currently in-service control-plane disks" — the executed view from +//! the `physical_disk` and `zpool` DB tables. +//! +//! This is distinct from the planned view in `BlueprintPhysicalDiskConfig`: +//! a disk is in this set only after the control plane has actually committed +//! to managing it (`physical_disk.disk_policy = 'in_service'`), not while a +//! planner is merely proposing to expunge or adopt it. Consumers that need +//! the *committed* view of which disks are part of the rack — fault +//! management diagnosers in particular — should read this rather than the +//! target blueprint. + +use crate::external_api::physical_disk::PhysicalDiskKind; +use iddqd::{IdOrdItem, id_upcast}; +use omicron_uuid_kinds::PhysicalDiskUuid; +use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::ZpoolUuid; + +/// One control-plane-managed physical disk, joined with its zpool and the +/// stable identity facts the DB already knows about it. +#[derive(Clone, Debug, PartialEq)] +pub struct InServiceDisk { + pub physical_disk_id: PhysicalDiskUuid, + pub zpool_id: ZpoolUuid, + pub sled_id: SledUuid, + pub vendor: String, + pub serial: String, + pub model: String, + pub variant: PhysicalDiskKind, +} + +impl IdOrdItem for InServiceDisk { + type Key<'a> = PhysicalDiskUuid; + fn key(&self) -> Self::Key<'_> { + self.physical_disk_id + } + id_upcast!(); +} diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index 887fb615599..1b71884590e 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -45,7 +45,7 @@ use sled_agent_types_versions::latest::inventory::SingleMeasurementInventory; use sled_agent_types_versions::latest::inventory::SledCpuFamily; use sled_agent_types_versions::latest::inventory::SledRole; use sled_agent_types_versions::latest::inventory::SvcsEnabledNotOnlineResult; -use sled_agent_types_versions::latest::inventory::ZpoolHealth; +pub use sled_agent_types_versions::latest::inventory::ZpoolHealth; use sled_hardware_types::BaseboardId; use std::collections::BTreeMap; use std::collections::BTreeSet; diff --git a/nexus/types/src/lib.rs b/nexus/types/src/lib.rs index 36e945f58eb..9a5eeb77c06 100644 --- a/nexus/types/src/lib.rs +++ b/nexus/types/src/lib.rs @@ -35,6 +35,7 @@ pub mod deployment; pub mod external_api; pub mod fm; pub mod identity; +pub mod in_service_disk; pub mod instance; pub mod internal_api; pub mod inventory; diff --git a/schema/crdb/add-disk-de-and-facts/up1.sql b/schema/crdb/add-disk-de-and-facts/up1.sql new file mode 100644 index 00000000000..296a7cc49a8 --- /dev/null +++ b/schema/crdb/add-disk-de-and-facts/up1.sql @@ -0,0 +1 @@ +ALTER TYPE omicron.public.diagnosis_engine ADD VALUE IF NOT EXISTS 'disk'; diff --git a/schema/crdb/add-disk-de-and-facts/up2.sql b/schema/crdb/add-disk-de-and-facts/up2.sql new file mode 100644 index 00000000000..00ae401a78b --- /dev/null +++ b/schema/crdb/add-disk-de-and-facts/up2.sql @@ -0,0 +1,8 @@ +CREATE TABLE IF NOT EXISTS omicron.public.fm_case_fact ( + id UUID NOT NULL, + sitrep_id UUID NOT NULL, + case_id UUID NOT NULL, + payload JSONB NOT NULL, + comment TEXT NOT NULL, + PRIMARY KEY (sitrep_id, id) +); diff --git a/schema/crdb/add-disk-de-and-facts/up3.sql b/schema/crdb/add-disk-de-and-facts/up3.sql new file mode 100644 index 00000000000..3a1c82db315 --- /dev/null +++ b/schema/crdb/add-disk-de-and-facts/up3.sql @@ -0,0 +1 @@ +CREATE INDEX IF NOT EXISTS lookup_fm_case_facts_for_case ON omicron.public.fm_case_fact (sitrep_id, case_id); diff --git a/schema/crdb/add-disk-de-and-facts/up3.verify.sql b/schema/crdb/add-disk-de-and-facts/up3.verify.sql new file mode 100644 index 00000000000..e23c166614e --- /dev/null +++ b/schema/crdb/add-disk-de-and-facts/up3.verify.sql @@ -0,0 +1,2 @@ +-- DO NOT EDIT. Generated by test_migration_verification_files. +SELECT CAST(IF((SELECT true WHERE EXISTS (SELECT index_name FROM omicron.crdb_internal.table_indexes WHERE descriptor_name = 'fm_case_fact' AND index_name = 'lookup_fm_case_facts_for_case')),'true','Schema change verification failed: index lookup_fm_case_facts_for_case on table fm_case_fact does not exist') AS BOOL); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 707c3270779..b5837073940 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -7623,7 +7623,8 @@ ON omicron.public.fm_sitrep_history (sitrep_id); CREATE TYPE IF NOT EXISTS omicron.public.diagnosis_engine AS ENUM ( - 'power_shelf' + 'power_shelf', + 'disk' ); CREATE TABLE IF NOT EXISTS omicron.public.fm_case ( @@ -7650,6 +7651,28 @@ CREATE INDEX IF NOT EXISTS lookup_fm_cases_for_sitrep ON omicron.public.fm_case (sitrep_id); +-- Per-engine "facts" attached to a case. A fact's payload is a JSONB blob +-- owned by the case's diagnosis engine (see `fm_case.de`) and opaque to +-- shared FM code. +CREATE TABLE IF NOT EXISTS omicron.public.fm_case_fact ( + -- Stable UUID for this fact across sitreps. + id UUID NOT NULL, + -- Sitrep this row belongs to. + sitrep_id UUID NOT NULL, + -- UUID of the case this fact attaches to. + case_id UUID NOT NULL, + -- Engine-defined JSONB payload. Opaque to shared FM code. + payload JSONB NOT NULL, + -- Free-form, debug-only comment. + comment TEXT NOT NULL, + + PRIMARY KEY (sitrep_id, id) +); + +CREATE INDEX IF NOT EXISTS + lookup_fm_case_facts_for_case +ON omicron.public.fm_case_fact (sitrep_id, case_id); + CREATE TABLE IF NOT EXISTS omicron.public.fm_ereport_in_case ( -- ID of this association. When an ereport is assigned to a case, that -- association is assigned a UUID. These are used primarily to aid in @@ -8626,7 +8649,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '259.0.0', NULL) + (TRUE, NOW(), NOW(), '260.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/test-utils/src/dev/test_cmds.rs b/test-utils/src/dev/test_cmds.rs index f254c06726e..bddd0c551f6 100644 --- a/test-utils/src/dev/test_cmds.rs +++ b/test-utils/src/dev/test_cmds.rs @@ -244,6 +244,23 @@ impl<'a> Redactor<'a> { self } + /// Replace any text matching `pattern` with ``. + /// + /// More flexible than [`Self::field`], which requires a fixed + /// `namevalue` shape. Use this when the entire variable substring + /// is itself a regex (e.g., the substring may take one of several + /// alternative forms across runs). + pub fn variable_regex( + &mut self, + name: &str, + pattern: &str, + ) -> &mut Self { + let re = regex::Regex::new(pattern).unwrap(); + let replacement = format!("<{}_REDACTED>", name.to_uppercase()); + self.extra_regex.push((re, replacement)); + self + } + /// Redact an entire indented section. /// /// This can be used if the shape of a section might change from run to run. diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 4687e2d5710..26d9eb33636 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -50,6 +50,7 @@ impl_typed_uuid_kinds! { BuiltInUser = {}, Case = {}, CaseEreport = {}, + CaseFact = {}, Collection = {}, ConsoleSession = {}, Dataset = {}, From dfabfb4795eed0070b9aafce744da33bccf1bea3 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 19 May 2026 12:06:27 -0700 Subject: [PATCH 02/15] Debuggable facts --- nexus/fm/src/builder/case.rs | 8 +++++--- test-utils/src/dev/test_cmds.rs | 6 +----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/nexus/fm/src/builder/case.rs b/nexus/fm/src/builder/case.rs index a5bd5222522..2bc87f7f0e8 100644 --- a/nexus/fm/src/builder/case.rs +++ b/nexus/fm/src/builder/case.rs @@ -12,6 +12,7 @@ use nexus_types::support_bundle::BundleDataSelection; use omicron_uuid_kinds::CaseFactUuid; use omicron_uuid_kinds::CaseUuid; use omicron_uuid_kinds::SitrepUuid; +use std::fmt::Debug; use std::sync::Arc; #[derive(Debug)] @@ -224,7 +225,7 @@ impl CaseBuilder { /// Emit a new fact under this case. The fact's UUID is freshly /// allocated from the case's deterministic RNG. - pub fn add_fact( + pub fn add_fact( &mut self, fact: &T, comment: impl ToString, @@ -235,8 +236,9 @@ impl CaseBuilder { break id; } }; - let payload = serde_json::to_value(fact) - .context("failed to serialize case fact payload")?; + let payload = serde_json::to_value(fact).with_context( + || "failed to serialize case fact payload {fact:?}", + )?; let comment = comment.to_string(); let fact = fm::case::CaseFact { id, payload, comment: comment.clone() }; self.case.facts.insert_unique(fact).expect("UUID should be unused"); diff --git a/test-utils/src/dev/test_cmds.rs b/test-utils/src/dev/test_cmds.rs index bddd0c551f6..60919f4311f 100644 --- a/test-utils/src/dev/test_cmds.rs +++ b/test-utils/src/dev/test_cmds.rs @@ -250,11 +250,7 @@ impl<'a> Redactor<'a> { /// `namevalue` shape. Use this when the entire variable substring /// is itself a regex (e.g., the substring may take one of several /// alternative forms across runs). - pub fn variable_regex( - &mut self, - name: &str, - pattern: &str, - ) -> &mut Self { + pub fn variable_regex(&mut self, name: &str, pattern: &str) -> &mut Self { let re = regex::Regex::new(pattern).unwrap(); let replacement = format!("<{}_REDACTED>", name.to_uppercase()); self.extra_regex.push((re, replacement)); From 7555b0bb53ed8e7242d406128aae62cd09cfa4bf Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 19 May 2026 12:37:10 -0700 Subject: [PATCH 03/15] more logging --- nexus/fm/src/builder/case.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/nexus/fm/src/builder/case.rs b/nexus/fm/src/builder/case.rs index 2bc87f7f0e8..97492db0fef 100644 --- a/nexus/fm/src/builder/case.rs +++ b/nexus/fm/src/builder/case.rs @@ -240,16 +240,20 @@ impl CaseBuilder { || "failed to serialize case fact payload {fact:?}", )?; let comment = comment.to_string(); - let fact = fm::case::CaseFact { id, payload, comment: comment.clone() }; - self.case.facts.insert_unique(fact).expect("UUID should be unused"); - slog::info!( &self.log, "added a fact"; "fact_id" => %id, + "payload" => %payload, "comment" => %comment, ); - self.report_log.entry("added fact").kv("fact_id", id).comment(comment); + self.report_log + .entry("added fact") + .kv("fact_id", id) + .kv("payload", &payload) + .comment(comment.clone()); + let fact = fm::case::CaseFact { id, payload, comment }; + self.case.facts.insert_unique(fact).expect("UUID should be unused"); Ok(id) } From bcf5b68896652daa8058330e183dfc0ca4ca9c35 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 19 May 2026 12:50:43 -0700 Subject: [PATCH 04/15] CaseFact -> Fact, add created_sitrep_id to facts, fm- prefix --- nexus/db-model/src/fm/case.rs | 14 ++++--- nexus/db-model/src/schema_versions.rs | 2 +- nexus/db-queries/src/db/datastore/fm.rs | 15 ++++---- nexus/db-schema/src/schema.rs | 1 + nexus/fm/src/builder/case.rs | 9 ++++- nexus/fm/src/diagnosis/disk.rs | 8 ++-- nexus/types/src/fm/case.rs | 38 ++++++++++++------- schema/crdb/dbinit.sql | 5 +++ .../up1.sql | 0 .../up2.sql | 1 + .../up3.sql | 0 .../up3.verify.sql | 0 12 files changed, 62 insertions(+), 31 deletions(-) rename schema/crdb/{add-disk-de-and-facts => fm-disk-de-and-facts}/up1.sql (100%) rename schema/crdb/{add-disk-de-and-facts => fm-disk-de-and-facts}/up2.sql (85%) rename schema/crdb/{add-disk-de-and-facts => fm-disk-de-and-facts}/up3.sql (100%) rename schema/crdb/{add-disk-de-and-facts => fm-disk-de-and-facts}/up3.verify.sql (100%) diff --git a/nexus/db-model/src/fm/case.rs b/nexus/db-model/src/fm/case.rs index 1217454bfb1..83a237fa78f 100644 --- a/nexus/db-model/src/fm/case.rs +++ b/nexus/db-model/src/fm/case.rs @@ -78,28 +78,32 @@ impl CaseMetadata { } /// Diesel row for the `fm_case_fact` table. See -/// [`nexus_types::fm::case::CaseFact`] for semantics. +/// [`nexus_types::fm::case::Fact`] for semantics. #[derive(Queryable, Insertable, Clone, Debug, Selectable)] #[diesel(table_name = fm_case_fact)] -pub struct CaseFact { +pub struct Fact { pub id: DbTypedUuid, pub sitrep_id: DbTypedUuid, pub case_id: DbTypedUuid, + /// Sitrep in which this fact was first added. Preserved unchanged + /// when the fact is carried forward; debug-only. + pub created_sitrep_id: DbTypedUuid, pub payload: serde_json::Value, pub comment: String, } -impl CaseFact { +impl Fact { pub fn from_sitrep( sitrep_id: impl Into>, case_id: impl Into>, - fact: &fm::case::CaseFact, + fact: &fm::case::Fact, ) -> Self { - let fm::case::CaseFact { id, payload, comment } = fact; + let fm::case::Fact { id, created_sitrep_id, payload, comment } = fact; Self { id: (*id).into(), sitrep_id: sitrep_id.into(), case_id: case_id.into(), + created_sitrep_id: (*created_sitrep_id).into(), payload: payload.clone(), comment: comment.clone(), } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index bc3057784d7..2b6f58facaf 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -28,7 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), - KnownVersion::new(260, "add-disk-de-and-facts"), + KnownVersion::new(260, "fm-disk-de-and-facts"), KnownVersion::new(259, "vmm-failure-reason"), KnownVersion::new(258, "lookup-unmarked-ereports-by-class"), KnownVersion::new(257, "add-disk-adoption-requests"), diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index f600005c9e7..bee38c90be6 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -511,10 +511,9 @@ impl DataStore { &self, id: SitrepUuid, conn: &async_bb8_diesel::Connection, - ) -> Result>, Error> - { + ) -> Result>, Error> { let mut by_case = - HashMap::>::new(); + HashMap::>::new(); let mut paginator: Paginator> = Paginator::new(SQL_BATCH_SIZE, PaginationOrder::Descending); @@ -525,7 +524,7 @@ impl DataStore { &p.current_pagparams(), ) .filter(case_fact_dsl::sitrep_id.eq(id.into_untyped_uuid())) - .select(model::fm::CaseFact::as_select()) + .select(model::fm::Fact::as_select()) .load_async(conn) .await .map_err(|e| { @@ -540,8 +539,9 @@ impl DataStore { by_case .entry(case_id) .or_default() - .insert_unique(fm::case::CaseFact { + .insert_unique(fm::case::Fact { id, + created_sitrep_id: fact.created_sitrep_id.into(), payload: fact.payload, comment: fact.comment, }) @@ -828,7 +828,7 @@ impl DataStore { bundle_data_selections_requested.push((req_id, data_selection)); } for fact in case.facts.iter() { - case_facts.push(model::fm::CaseFact::from_sitrep( + case_facts.push(model::fm::Fact::from_sitrep( sitrep_id, case_id, fact, )); } @@ -2338,8 +2338,9 @@ mod tests { let mut facts = iddqd::IdOrdMap::new(); facts - .insert_unique(fm::case::CaseFact { + .insert_unique(fm::case::Fact { id: CaseFactUuid::new_v4(), + created_sitrep_id: sitrep_id, payload: serde_json::json!({ "kind": "representative_fact", "note": "for round-trip testing", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 19119861c92..e7c7d969700 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -3249,6 +3249,7 @@ table! { id -> Uuid, sitrep_id -> Uuid, case_id -> Uuid, + created_sitrep_id -> Uuid, payload -> Jsonb, comment -> Text, } diff --git a/nexus/fm/src/builder/case.rs b/nexus/fm/src/builder/case.rs index 97492db0fef..dae9d17f352 100644 --- a/nexus/fm/src/builder/case.rs +++ b/nexus/fm/src/builder/case.rs @@ -252,7 +252,12 @@ impl CaseBuilder { .kv("fact_id", id) .kv("payload", &payload) .comment(comment.clone()); - let fact = fm::case::CaseFact { id, payload, comment }; + let fact = fm::case::Fact { + id, + created_sitrep_id: self.sitrep_id, + payload, + comment, + }; self.case.facts.insert_unique(fact).expect("UUID should be unused"); Ok(id) } @@ -268,7 +273,7 @@ impl CaseBuilder { /// Iterate the facts currently attached to this case (including any that /// were carried forward from the parent sitrep). - pub fn facts(&self) -> impl Iterator { + pub fn facts(&self) -> impl Iterator { self.case.facts.iter() } diff --git a/nexus/fm/src/diagnosis/disk.rs b/nexus/fm/src/diagnosis/disk.rs index 3d0ed5f8a10..6e774bca739 100644 --- a/nexus/fm/src/diagnosis/disk.rs +++ b/nexus/fm/src/diagnosis/disk.rs @@ -324,8 +324,9 @@ mod tests { let case_id = omicron_uuid_kinds::CaseUuid::new_v4(); let mut facts = iddqd::IdOrdMap::new(); facts - .insert_unique(fm::case::CaseFact { + .insert_unique(fm::case::Fact { id: omicron_uuid_kinds::CaseFactUuid::new_v4(), + created_sitrep_id: parent_sitrep_id, payload: serde_json::to_value(&DiskFact::ZpoolUnhealthy { zpool_id, last_seen_health: ZpoolHealth::Degraded, @@ -369,7 +370,7 @@ mod tests { fn disk_facts( sitrep: &Sitrep, open_only: bool, - ) -> Vec<(&fm::Case, &fm::case::CaseFact, DiskFact)> { + ) -> Vec<(&fm::Case, &fm::case::Fact, DiskFact)> { sitrep .cases .iter() @@ -539,8 +540,9 @@ mod tests { }); let mut parent_facts = iddqd::IdOrdMap::new(); parent_facts - .insert_unique(fm::case::CaseFact { + .insert_unique(fm::case::Fact { id: unreadable_fact_id, + created_sitrep_id: parent_sitrep_id, payload: unreadable_payload.clone(), comment: "fact with payload from the future".to_string(), }) diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index 8b36aae7e44..b290c30d602 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -27,8 +27,8 @@ pub struct Case { pub alerts_requested: IdOrdMap, pub support_bundles_requested: IdOrdMap, /// Diagnosis-engine-derived facts attached to this case. See - /// [`CaseFact`] for semantics. - pub facts: IdOrdMap, + /// [`Fact`] for semantics. + pub facts: IdOrdMap, } impl Case { @@ -175,20 +175,23 @@ impl CaseEreport { /// diagnosis engine. Other engines and shared FM code must treat it as /// opaque bytes. /// -/// `Eq`/`PartialEq` compare the raw `serde_json::Value` payloads, so two -/// facts are equal iff their payloads are JSON-value-equal (object key -/// order does not matter; number representation does). This is the -/// equality the DB round-trip needs. Engine-side comparison should go -/// through [`CaseFact::payload_as`] and compare typed enum values — never -/// the raw payload. +/// `Eq`/`PartialEq` derive over all fields, including the raw +/// `serde_json::Value` payload (object key order does not matter; number +/// representation does). This is the equality the DB round-trip test +/// needs. Engine-side comparison should go through [`Fact::payload_as`] +/// and compare typed enum values — never the raw payload. #[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] -pub struct CaseFact { +pub struct Fact { pub id: CaseFactUuid, + /// The sitrep in which this fact was first added. Preserved + /// unchanged when the fact is carried forward into a child sitrep. + /// Debug-only. + pub created_sitrep_id: SitrepUuid, pub payload: serde_json::Value, pub comment: String, } -impl IdOrdItem for CaseFact { +impl IdOrdItem for Fact { type Key<'a> = &'a CaseFactUuid; fn key(&self) -> Self::Key<'_> { &self.id @@ -196,7 +199,7 @@ impl IdOrdItem for CaseFact { iddqd::id_upcast!(); } -impl CaseFact { +impl Fact { /// Attempt to deserialize this fact's payload as `T`. pub fn payload_as( &self, @@ -339,12 +342,21 @@ impl fmt::Display for DisplayCase<'_> { writeln!(f, "{:>indent$}------", "")?; let indent = indent + 2; - for CaseFact { id, payload, comment } in facts.iter() { + for Fact { id, created_sitrep_id, payload, comment } in facts.iter() + { + const ADDED_IN: &str = "added in:"; const PAYLOAD: &str = "payload:"; const COMMENT: &str = "comment:"; - const WIDTH: usize = const_max_len(&[PAYLOAD, COMMENT]); + const WIDTH: usize = + const_max_len(&[ADDED_IN, PAYLOAD, COMMENT]); writeln!(f, "{BULLET:>indent$}fact {id}")?; + writeln!( + f, + "{:>indent$}{ADDED_IN:indent$}{PAYLOAD:indent$}{COMMENT: Date: Tue, 19 May 2026 15:35:49 -0700 Subject: [PATCH 05/15] payload_to, deserialization error context --- nexus/fm/src/diagnosis/disk.rs | 4 ++-- nexus/types/src/fm/case.rs | 12 ++++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/nexus/fm/src/diagnosis/disk.rs b/nexus/fm/src/diagnosis/disk.rs index 6e774bca739..d3eb4d97512 100644 --- a/nexus/fm/src/diagnosis/disk.rs +++ b/nexus/fm/src/diagnosis/disk.rs @@ -78,7 +78,7 @@ pub(super) fn analyze( let mut summary = ParentCaseSummary { zpool_unhealthy: BTreeMap::new() }; for fact in c.facts.iter() { - match fact.payload_as::() { + match fact.payload_to::() { Ok(DiskFact::ZpoolUnhealthy { zpool_id, last_seen_health, @@ -378,7 +378,7 @@ mod tests { .filter(|c| !open_only || c.is_open()) .flat_map(|c| { c.facts.iter().filter_map(move |f| { - f.payload_as::().ok().map(|d| (c, f, d)) + f.payload_to::().ok().map(|d| (c, f, d)) }) }) .collect() diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index b290c30d602..278fc343896 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -178,7 +178,7 @@ impl CaseEreport { /// `Eq`/`PartialEq` derive over all fields, including the raw /// `serde_json::Value` payload (object key order does not matter; number /// representation does). This is the equality the DB round-trip test -/// needs. Engine-side comparison should go through [`Fact::payload_as`] +/// needs. Engine-side comparison should go through [`Fact::payload_to`] /// and compare typed enum values — never the raw payload. #[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] pub struct Fact { @@ -201,11 +201,15 @@ impl IdOrdItem for Fact { impl Fact { /// Attempt to deserialize this fact's payload as `T`. - pub fn payload_as( + pub fn payload_to( &self, ) -> anyhow::Result { - serde_json::from_value(self.payload.clone()) - .context("failed to deserialize case fact payload") + serde_json::from_value(self.payload.clone()).with_context(|| { + format!( + "failed to deserialize case fact payload: {:?}", + self.payload + ) + }) } } From 58c780f71f2406eda9b1db6716dc5de1d9d84274 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 19 May 2026 15:48:41 -0700 Subject: [PATCH 06/15] case formatting --- nexus/types/src/fm/case.rs | 93 ++++++++++++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 20 deletions(-) diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index 278fc343896..daa260a8bfa 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -211,6 +211,57 @@ impl Fact { ) }) } + + pub fn display_multiline( + &self, + indent: usize, + sitrep_id: Option, + ) -> impl fmt::Display + '_ { + struct DisplayFact<'a> { + fact: &'a Fact, + indent: usize, + sitrep_id: Option, + } + + impl fmt::Display for DisplayFact<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + const BULLET: &str = "* "; + const ADDED_IN: &str = "added in:"; + const COMMENT: &str = "comment:"; + const PAYLOAD: &str = "payload:"; + const WIDTH: usize = + const_max_len(&[ADDED_IN, COMMENT, PAYLOAD]); + + let &Self { + fact: Fact { id, created_sitrep_id, payload, comment }, + indent, + sitrep_id, + } = self; + let this_sitrep = |s| { + if Some(s) == sitrep_id { " <-- this sitrep" } else { "" } + }; + + writeln!(f, "{BULLET:>indent$}fact {id}")?; + writeln!( + f, + "{:>indent$}{ADDED_IN:indent$}{COMMENT:indent$}{PAYLOAD:payload_indent$}{line}", "")?; + } + writeln!(f) + } + } + + DisplayFact { fact: self, indent, sitrep_id } + } } #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] @@ -346,23 +397,8 @@ impl fmt::Display for DisplayCase<'_> { writeln!(f, "{:>indent$}------", "")?; let indent = indent + 2; - for Fact { id, created_sitrep_id, payload, comment } in facts.iter() - { - const ADDED_IN: &str = "added in:"; - const PAYLOAD: &str = "payload:"; - const COMMENT: &str = "comment:"; - const WIDTH: usize = - const_max_len(&[ADDED_IN, PAYLOAD, COMMENT]); - - writeln!(f, "{BULLET:>indent$}fact {id}")?; - writeln!( - f, - "{:>indent$}{ADDED_IN:indent$}{PAYLOAD:indent$}{COMMENT: Date: Wed, 20 May 2026 11:19:48 -0700 Subject: [PATCH 07/15] s/Disk/PhysicalDisk --- nexus/db-model/src/fm/diagnosis_engine.rs | 10 +++++++--- nexus/fm/src/diagnosis/disk.rs | 14 +++++++------- nexus/types/src/fm.rs | 2 +- schema/crdb/dbinit.sql | 2 +- schema/crdb/fm-disk-de-and-facts/up1.sql | 2 +- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/nexus/db-model/src/fm/diagnosis_engine.rs b/nexus/db-model/src/fm/diagnosis_engine.rs index ab3052339e8..a99ceeac1ad 100644 --- a/nexus/db-model/src/fm/diagnosis_engine.rs +++ b/nexus/db-model/src/fm/diagnosis_engine.rs @@ -24,7 +24,7 @@ impl_enum_type!( pub enum DiagnosisEngine; PowerShelf => b"power_shelf" - Disk => b"disk" + PhysicalDisk => b"physical_disk" ); @@ -32,7 +32,9 @@ impl From for fm::DiagnosisEngineKind { fn from(de: DiagnosisEngine) -> Self { match de { DiagnosisEngine::PowerShelf => fm::DiagnosisEngineKind::PowerShelf, - DiagnosisEngine::Disk => fm::DiagnosisEngineKind::Disk, + DiagnosisEngine::PhysicalDisk => { + fm::DiagnosisEngineKind::PhysicalDisk + } } } } @@ -41,7 +43,9 @@ impl From for DiagnosisEngine { fn from(fm_de: fm::DiagnosisEngineKind) -> Self { match fm_de { fm::DiagnosisEngineKind::PowerShelf => DiagnosisEngine::PowerShelf, - fm::DiagnosisEngineKind::Disk => DiagnosisEngine::Disk, + fm::DiagnosisEngineKind::PhysicalDisk => { + DiagnosisEngine::PhysicalDisk + } } } } diff --git a/nexus/fm/src/diagnosis/disk.rs b/nexus/fm/src/diagnosis/disk.rs index d3eb4d97512..1cd8defad17 100644 --- a/nexus/fm/src/diagnosis/disk.rs +++ b/nexus/fm/src/diagnosis/disk.rs @@ -72,7 +72,7 @@ pub(super) fn analyze( let parent_summaries: BTreeMap = input .open_cases() .iter() - .filter(|c| c.metadata.de == DiagnosisEngineKind::Disk) + .filter(|c| c.metadata.de == DiagnosisEngineKind::PhysicalDisk) .map(|c| { let case_id = c.id; let mut summary = @@ -180,7 +180,7 @@ pub(super) fn analyze( // No parent case for this zpool — open one. None => { let mut new_case = - builder.cases.open_case(DiagnosisEngineKind::Disk); + builder.cases.open_case(DiagnosisEngineKind::PhysicalDisk); new_case.set_comment(format!("zpool {zpool_id} unhealthy")); new_case.id } @@ -341,7 +341,7 @@ mod tests { metadata: fm::case::Metadata { created_sitrep_id: parent_sitrep_id, closed_sitrep_id: None, - de: DiagnosisEngineKind::Disk, + de: DiagnosisEngineKind::PhysicalDisk, comment: format!("zpool {zpool_id} degraded"), }, ereports: Default::default(), @@ -374,7 +374,7 @@ mod tests { sitrep .cases .iter() - .filter(|c| c.metadata.de == DiagnosisEngineKind::Disk) + .filter(|c| c.metadata.de == DiagnosisEngineKind::PhysicalDisk) .filter(|c| !open_only || c.is_open()) .flat_map(|c| { c.facts.iter().filter_map(move |f| { @@ -553,7 +553,7 @@ mod tests { metadata: fm::case::Metadata { created_sitrep_id: parent_sitrep_id, closed_sitrep_id: None, - de: DiagnosisEngineKind::Disk, + de: DiagnosisEngineKind::PhysicalDisk, comment: "case with fact from the future".to_string(), }, ereports: Default::default(), @@ -630,7 +630,7 @@ mod tests { let parent_fact_id = parent .cases .iter() - .find(|c| c.metadata.de == DiagnosisEngineKind::Disk) + .find(|c| c.metadata.de == DiagnosisEngineKind::PhysicalDisk) .expect("parent should have one Disk case") .facts .iter() @@ -673,7 +673,7 @@ mod tests { let parent_fact_id = parent .cases .iter() - .find(|c| c.metadata.de == DiagnosisEngineKind::Disk) + .find(|c| c.metadata.de == DiagnosisEngineKind::PhysicalDisk) .expect("parent should have one Disk case") .facts .iter() diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index 7f0a4c055b9..dadf91efa10 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -216,5 +216,5 @@ pub struct SitrepVersion { #[strum(serialize_all = "snake_case")] pub enum DiagnosisEngineKind { PowerShelf, - Disk, + PhysicalDisk, } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 27ae834f23f..8c715d9ced0 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -7624,7 +7624,7 @@ ON omicron.public.fm_sitrep_history (sitrep_id); CREATE TYPE IF NOT EXISTS omicron.public.diagnosis_engine AS ENUM ( 'power_shelf', - 'disk' + 'physical_disk' ); CREATE TABLE IF NOT EXISTS omicron.public.fm_case ( diff --git a/schema/crdb/fm-disk-de-and-facts/up1.sql b/schema/crdb/fm-disk-de-and-facts/up1.sql index 296a7cc49a8..602e91c8175 100644 --- a/schema/crdb/fm-disk-de-and-facts/up1.sql +++ b/schema/crdb/fm-disk-de-and-facts/up1.sql @@ -1 +1 @@ -ALTER TYPE omicron.public.diagnosis_engine ADD VALUE IF NOT EXISTS 'disk'; +ALTER TYPE omicron.public.diagnosis_engine ADD VALUE IF NOT EXISTS 'physical_disk'; From b016d7c9ce0a80be1caf04fb0dc815a7bd4c4af0 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 20 May 2026 12:27:50 -0700 Subject: [PATCH 08/15] Reuse bulleted JSON formatter for fact payload display --- nexus/types/src/fm.rs | 1 + nexus/types/src/fm/analysis_reports.rs | 80 +---------------------- nexus/types/src/fm/case.rs | 13 +--- nexus/types/src/fm/json_display.rs | 87 ++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 89 deletions(-) create mode 100644 nexus/types/src/fm/json_display.rs diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index dadf91efa10..4c4bbfbaa6b 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -12,6 +12,7 @@ pub mod ereport; pub use ereport::{Ereport, EreportId}; pub mod case; pub use case::Case; +mod json_display; use case::AlertRequest; use chrono::{DateTime, Utc}; diff --git a/nexus/types/src/fm/analysis_reports.rs b/nexus/types/src/fm/analysis_reports.rs index 839fb7fade8..249e26f1b8b 100644 --- a/nexus/types/src/fm/analysis_reports.rs +++ b/nexus/types/src/fm/analysis_reports.rs @@ -7,6 +7,7 @@ use super::case; use super::ereport::EreportId; +use super::json_display::fmt_json_value; use iddqd::IdOrdMap; use omicron_uuid_kinds::{CaseUuid, CollectionUuid, SitrepUuid}; use serde::{Deserialize, Serialize}; @@ -158,85 +159,6 @@ impl DebugLog { } } -/// Recursively format a JSON value as a bulleted list entry, nesting any -/// object or array children as indented sub-bullets. -fn fmt_json_value( - f: &mut fmt::Formatter<'_>, - key: &str, - value: &serde_json::Value, - indent: usize, -) -> fmt::Result { - match value { - serde_json::Value::Object(map) => { - writeln!(f, "{:indent$}* {key}:", "")?; - for (k, v) in map { - fmt_json_value(f, k, v, indent + 2)?; - } - Ok(()) - } - serde_json::Value::Array(arr) => { - writeln!(f, "{:indent$}* {key}:", "")?; - let indent = indent + 2; - for (i, v) in arr.iter().enumerate() { - fmt_json_array_item(f, i + 1, v, indent)?; - } - Ok(()) - } - serde_json::Value::String(s) => { - writeln!(f, "{:indent$}* {key}: {s}", "") - } - serde_json::Value::Null => { - writeln!(f, "{:indent$}* {key}: ", "") - } - serde_json::Value::Bool(b) => { - writeln!(f, "{:indent$}* {key}: {b}", "") - } - serde_json::Value::Number(n) => { - writeln!(f, "{:indent$}* {key}: {n}", "") - } - } -} - -/// Format a single element of a JSON array as a numbered list item, -/// e.g. `1. value` for scalars or `1.` followed by indented children for -/// objects and nested arrays. -fn fmt_json_array_item( - f: &mut fmt::Formatter<'_>, - n: usize, - value: &serde_json::Value, - indent: usize, -) -> fmt::Result { - match value { - serde_json::Value::Object(map) => { - writeln!(f, "{:indent$}{n}.", "")?; - for (k, v) in map { - fmt_json_value(f, k, v, indent + 2)?; - } - Ok(()) - } - serde_json::Value::Array(arr) => { - writeln!(f, "{:indent$}{n}.", "")?; - let indent = indent + 2; - for (i, v) in arr.iter().enumerate() { - fmt_json_array_item(f, i + 1, v, indent)?; - } - Ok(()) - } - serde_json::Value::String(s) => { - writeln!(f, "{:indent$}{n}. {s}", "") - } - serde_json::Value::Null => { - writeln!(f, "{:indent$}{n}. ", "") - } - serde_json::Value::Bool(b) => { - writeln!(f, "{:indent$}{n}. {b}", "") - } - serde_json::Value::Number(num) => { - writeln!(f, "{:indent$}{n}. {num}", "") - } - } -} - impl LogEntry { pub fn new(event: impl ToString) -> Self { Self { event: event.to_string(), comment: None, kvs: BTreeMap::new() } diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index daa260a8bfa..239349de13a 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -2,6 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use super::json_display::fmt_json_value; use crate::alert::AlertClass; use crate::fm::DiagnosisEngineKind; use crate::fm::Ereport; @@ -228,9 +229,7 @@ impl Fact { const BULLET: &str = "* "; const ADDED_IN: &str = "added in:"; const COMMENT: &str = "comment:"; - const PAYLOAD: &str = "payload:"; - const WIDTH: usize = - const_max_len(&[ADDED_IN, COMMENT, PAYLOAD]); + const WIDTH: usize = const_max_len(&[ADDED_IN, COMMENT]); let &Self { fact: Fact { id, created_sitrep_id, payload, comment }, @@ -249,13 +248,7 @@ impl Fact { this_sitrep(*created_sitrep_id), )?; writeln!(f, "{:>indent$}{COMMENT:indent$}{PAYLOAD:payload_indent$}{line}", "")?; - } + fmt_json_value(f, "payload", payload, indent)?; writeln!(f) } } diff --git a/nexus/types/src/fm/json_display.rs b/nexus/types/src/fm/json_display.rs new file mode 100644 index 00000000000..a86b0cc650b --- /dev/null +++ b/nexus/types/src/fm/json_display.rs @@ -0,0 +1,87 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Shared helpers for rendering `serde_json::Value`s as bulleted lists in +//! `Display` impls. + +use std::fmt; + +/// Recursively format a JSON value as a bulleted list entry, nesting any +/// object or array children as indented sub-bullets. +pub(crate) fn fmt_json_value( + f: &mut fmt::Formatter<'_>, + key: &str, + value: &serde_json::Value, + indent: usize, +) -> fmt::Result { + match value { + serde_json::Value::Object(map) => { + writeln!(f, "{:indent$}* {key}:", "")?; + for (k, v) in map { + fmt_json_value(f, k, v, indent + 2)?; + } + Ok(()) + } + serde_json::Value::Array(arr) => { + writeln!(f, "{:indent$}* {key}:", "")?; + let indent = indent + 2; + for (i, v) in arr.iter().enumerate() { + fmt_json_array_item(f, i + 1, v, indent)?; + } + Ok(()) + } + serde_json::Value::String(s) => { + writeln!(f, "{:indent$}* {key}: {s}", "") + } + serde_json::Value::Null => { + writeln!(f, "{:indent$}* {key}: ", "") + } + serde_json::Value::Bool(b) => { + writeln!(f, "{:indent$}* {key}: {b}", "") + } + serde_json::Value::Number(n) => { + writeln!(f, "{:indent$}* {key}: {n}", "") + } + } +} + +/// Format a single element of a JSON array as a numbered list item, +/// e.g. `1. value` for scalars or `1.` followed by indented children for +/// objects and nested arrays. +pub(crate) fn fmt_json_array_item( + f: &mut fmt::Formatter<'_>, + n: usize, + value: &serde_json::Value, + indent: usize, +) -> fmt::Result { + match value { + serde_json::Value::Object(map) => { + writeln!(f, "{:indent$}{n}.", "")?; + for (k, v) in map { + fmt_json_value(f, k, v, indent + 2)?; + } + Ok(()) + } + serde_json::Value::Array(arr) => { + writeln!(f, "{:indent$}{n}.", "")?; + let indent = indent + 2; + for (i, v) in arr.iter().enumerate() { + fmt_json_array_item(f, i + 1, v, indent)?; + } + Ok(()) + } + serde_json::Value::String(s) => { + writeln!(f, "{:indent$}{n}. {s}", "") + } + serde_json::Value::Null => { + writeln!(f, "{:indent$}{n}. ", "") + } + serde_json::Value::Bool(b) => { + writeln!(f, "{:indent$}{n}. {b}", "") + } + serde_json::Value::Number(num) => { + writeln!(f, "{:indent$}{n}. {num}", "") + } + } +} From aafbf9d6cb49b21376b59c7b6c1177654faf92f1 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 20 May 2026 12:37:51 -0700 Subject: [PATCH 09/15] Facts don't case about your prefixes? --- dev-tools/omdb/tests/successes.out | 4 +- nexus/db-model/src/fm/case.rs | 10 ++--- nexus/db-queries/src/db/datastore/fm.rs | 42 +++++++++---------- nexus/db-schema/src/schema.rs | 6 +-- nexus/fm/src/builder/case.rs | 6 +-- nexus/fm/src/builder/rng.rs | 8 ++-- nexus/fm/src/diagnosis/disk.rs | 10 ++--- nexus/types/src/fm/case.rs | 16 ++++--- schema/crdb/dbinit.sql | 6 +-- schema/crdb/fm-disk-de-and-facts/up2.sql | 2 +- schema/crdb/fm-disk-de-and-facts/up3.sql | 2 +- .../crdb/fm-disk-de-and-facts/up3.verify.sql | 2 +- uuid-kinds/src/lib.rs | 2 +- 13 files changed, 57 insertions(+), 59 deletions(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 83305c1708e..52bc5d9992b 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -761,7 +761,7 @@ task: "fm_sitrep_gc" batches: 1 orphaned fm_case rows deleted: 0 batches: 1 - orphaned fm_case_fact rows deleted: 0 + orphaned fm_fact rows deleted: 0 batches: 1 orphaned fm_ereport_in_case rows deleted: 0 batches: 1 @@ -1457,7 +1457,7 @@ task: "fm_sitrep_gc" batches: 1 orphaned fm_case rows deleted: 0 batches: 1 - orphaned fm_case_fact rows deleted: 0 + orphaned fm_fact rows deleted: 0 batches: 1 orphaned fm_ereport_in_case rows deleted: 0 batches: 1 diff --git a/nexus/db-model/src/fm/case.rs b/nexus/db-model/src/fm/case.rs index 83a237fa78f..aa5187a8bd0 100644 --- a/nexus/db-model/src/fm/case.rs +++ b/nexus/db-model/src/fm/case.rs @@ -9,10 +9,10 @@ use super::DiagnosisEngine; use super::SupportBundleRequest; use crate::DbTypedUuid; use crate::ereport; -use nexus_db_schema::schema::{fm_case, fm_case_fact, fm_ereport_in_case}; +use nexus_db_schema::schema::{fm_case, fm_ereport_in_case, fm_fact}; use nexus_types::fm; use omicron_uuid_kinds::{ - CaseEreportKind, CaseFactKind, CaseKind, EreporterRestartKind, SitrepKind, + CaseEreportKind, CaseKind, EreporterRestartKind, FactKind, SitrepKind, }; /// Metadata describing a fault management case. @@ -77,12 +77,12 @@ impl CaseMetadata { } } -/// Diesel row for the `fm_case_fact` table. See +/// Diesel row for the `fm_fact` table. See /// [`nexus_types::fm::case::Fact`] for semantics. #[derive(Queryable, Insertable, Clone, Debug, Selectable)] -#[diesel(table_name = fm_case_fact)] +#[diesel(table_name = fm_fact)] pub struct Fact { - pub id: DbTypedUuid, + pub id: DbTypedUuid, pub sitrep_id: DbTypedUuid, pub case_id: DbTypedUuid, /// Sitrep in which this fact was first added. Preserved unchanged diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index bee38c90be6..a33d243adfb 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -33,8 +33,8 @@ use nexus_db_lookup::DbConnection; use nexus_db_schema::schema::ereport::dsl as ereport_dsl; use nexus_db_schema::schema::fm_alert_request::dsl as alert_req_dsl; use nexus_db_schema::schema::fm_case::dsl as case_dsl; -use nexus_db_schema::schema::fm_case_fact::dsl as case_fact_dsl; use nexus_db_schema::schema::fm_ereport_in_case::dsl as case_ereport_dsl; +use nexus_db_schema::schema::fm_fact::dsl as fact_dsl; use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; use nexus_db_schema::schema::fm_sitrep_history::dsl as history_dsl; use nexus_db_schema::schema::fm_support_bundle_request::dsl as support_bundle_req_dsl; @@ -47,10 +47,10 @@ use omicron_common::api::external::ListResultVec; use omicron_uuid_kinds::AlertKind; use omicron_uuid_kinds::AlertUuid; use omicron_uuid_kinds::CaseEreportKind; -use omicron_uuid_kinds::CaseFactKind; -use omicron_uuid_kinds::CaseFactUuid; use omicron_uuid_kinds::CaseKind; use omicron_uuid_kinds::CaseUuid; +use omicron_uuid_kinds::FactKind; +use omicron_uuid_kinds::FactUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SitrepUuid; use omicron_uuid_kinds::SupportBundleKind; @@ -124,7 +124,7 @@ sitrep_child_tables! { SupportBundleRequestDataSelectionEreports => { table: "fm_support_bundle_request_data_selection_ereports" }, SupportBundleRequest => { table: "fm_support_bundle_request" }, Case => { table: "fm_case" }, - CaseFact => { table: "fm_case_fact" }, + Fact => { table: "fm_fact" }, } /// Per-child-table statistics from a single GC pass. @@ -370,7 +370,7 @@ impl DataStore { let mut support_bundle_requests = self.support_bundle_requests_read_on_conn(id, conn).await?; - let mut case_facts = self.fm_case_facts_read_on_conn(id, conn).await?; + let mut case_facts = self.fm_facts_read_on_conn(id, conn).await?; // Next, load the case metadata entries and marry them to the sets of // ereports, alert requests, and support bundle requests for those @@ -505,9 +505,9 @@ impl DataStore { Ok(by_case) } - /// Fetch all `fm_case_fact` rows belonging to cases in the given sitrep, + /// Fetch all `fm_fact` rows belonging to cases in the given sitrep, /// grouped by `case_id`. - async fn fm_case_facts_read_on_conn( + async fn fm_facts_read_on_conn( &self, id: SitrepUuid, conn: &async_bb8_diesel::Connection, @@ -515,15 +515,15 @@ impl DataStore { let mut by_case = HashMap::>::new(); - let mut paginator: Paginator> = + let mut paginator: Paginator> = Paginator::new(SQL_BATCH_SIZE, PaginationOrder::Descending); while let Some(p) = paginator.next() { let batch = paginated( - case_fact_dsl::fm_case_fact, - case_fact_dsl::id, + fact_dsl::fm_fact, + fact_dsl::id, &p.current_pagparams(), ) - .filter(case_fact_dsl::sitrep_id.eq(id.into_untyped_uuid())) + .filter(fact_dsl::sitrep_id.eq(id.into_untyped_uuid())) .select(model::fm::Fact::as_select()) .load_async(conn) .await @@ -535,7 +535,7 @@ impl DataStore { paginator = p.found_batch(&batch, &|f| f.id); for fact in batch { let case_id: CaseUuid = fact.case_id.into(); - let id: CaseFactUuid = fact.id.into(); + let id: FactUuid = fact.id.into(); by_case .entry(case_id) .or_default() @@ -867,7 +867,7 @@ impl DataStore { .await?; if !case_facts.is_empty() { - diesel::insert_into(case_fact_dsl::fm_case_fact) + diesel::insert_into(fact_dsl::fm_fact) .values(case_facts) .execute_async(&*conn) .await @@ -1218,7 +1218,7 @@ impl DataStore { alert_requests_deleted: usize, support_bundle_requests_deleted: usize, cases_deleted: usize, - case_facts_deleted: usize, + facts_deleted: usize, } let err = OptionalError::new(); @@ -1228,7 +1228,7 @@ impl DataStore { alert_requests_deleted, support_bundle_requests_deleted, cases_deleted, - case_facts_deleted, + facts_deleted, } = self // Sitrep deletion is transactional to prevent a sitrep from being // left in a partially-deleted state should the Nexus instance @@ -1273,9 +1273,9 @@ impl DataStore { .await?; // Delete case facts. - let case_facts_deleted = diesel::delete( - case_fact_dsl::fm_case_fact.filter( - case_fact_dsl::sitrep_id.eq_any(ids.clone()), + let facts_deleted = diesel::delete( + fact_dsl::fm_fact.filter( + fact_dsl::sitrep_id.eq_any(ids.clone()), ), ) .execute_async(&conn) @@ -1303,7 +1303,7 @@ impl DataStore { alert_requests_deleted, support_bundle_requests_deleted, case_ereports_deleted, - case_facts_deleted, + facts_deleted, }) } }) @@ -1322,7 +1322,7 @@ impl DataStore { "case_ereports_deleted" => case_ereports_deleted, "alert_requests_deleted" => alert_requests_deleted, "support_bundle_requests_deleted" => support_bundle_requests_deleted, - "case_facts_deleted" => case_facts_deleted, + "facts_deleted" => facts_deleted, ); Ok(sitreps_deleted) @@ -2339,7 +2339,7 @@ mod tests { let mut facts = iddqd::IdOrdMap::new(); facts .insert_unique(fm::case::Fact { - id: CaseFactUuid::new_v4(), + id: FactUuid::new_v4(), created_sitrep_id: sitrep_id, payload: serde_json::json!({ "kind": "representative_fact", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index ec765db2d86..b6aea91e62b 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -3244,7 +3244,7 @@ table! { } table! { - fm_case_fact (sitrep_id, id) { + fm_fact (sitrep_id, id) { id -> Uuid, sitrep_id -> Uuid, case_id -> Uuid, @@ -3269,8 +3269,8 @@ table! { allow_tables_to_appear_in_same_query!(fm_ereport_in_case, ereport); allow_tables_to_appear_in_same_query!(fm_sitrep, fm_case); -allow_tables_to_appear_in_same_query!(fm_sitrep, fm_case_fact); -allow_tables_to_appear_in_same_query!(fm_case, fm_case_fact); +allow_tables_to_appear_in_same_query!(fm_sitrep, fm_fact); +allow_tables_to_appear_in_same_query!(fm_case, fm_fact); table! { fm_alert_request (sitrep_id, id) { diff --git a/nexus/fm/src/builder/case.rs b/nexus/fm/src/builder/case.rs index dae9d17f352..810e6648492 100644 --- a/nexus/fm/src/builder/case.rs +++ b/nexus/fm/src/builder/case.rs @@ -9,8 +9,8 @@ use iddqd::id_ord_map::{self, IdOrdMap}; use nexus_types::alert::AlertClass; use nexus_types::fm; use nexus_types::support_bundle::BundleDataSelection; -use omicron_uuid_kinds::CaseFactUuid; use omicron_uuid_kinds::CaseUuid; +use omicron_uuid_kinds::FactUuid; use omicron_uuid_kinds::SitrepUuid; use std::fmt::Debug; use std::sync::Arc; @@ -229,7 +229,7 @@ impl CaseBuilder { &mut self, fact: &T, comment: impl ToString, - ) -> anyhow::Result { + ) -> anyhow::Result { let id = loop { let id = self.rng.next_fact(); if !self.case.facts.contains_key(&id) { @@ -264,7 +264,7 @@ impl CaseBuilder { /// Remove a fact from this case. The fact will not be carried forward /// into the next sitrep. - pub fn remove_fact(&mut self, id: CaseFactUuid) { + pub fn remove_fact(&mut self, id: FactUuid) { if self.case.facts.remove(&id).is_some() { slog::info!(&self.log, "removed a fact"; "fact_id" => %id); self.report_log.entry("removed fact").kv("fact_id", id); diff --git a/nexus/fm/src/builder/rng.rs b/nexus/fm/src/builder/rng.rs index 01cd04bd7d7..9ba02eca5cc 100644 --- a/nexus/fm/src/builder/rng.rs +++ b/nexus/fm/src/builder/rng.rs @@ -12,10 +12,10 @@ use omicron_uuid_kinds::AlertKind; use omicron_uuid_kinds::AlertUuid; use omicron_uuid_kinds::CaseEreportKind; use omicron_uuid_kinds::CaseEreportUuid; -use omicron_uuid_kinds::CaseFactKind; -use omicron_uuid_kinds::CaseFactUuid; use omicron_uuid_kinds::CaseKind; use omicron_uuid_kinds::CaseUuid; +use omicron_uuid_kinds::FactKind; +use omicron_uuid_kinds::FactUuid; use omicron_uuid_kinds::SitrepUuid; use omicron_uuid_kinds::SupportBundleKind; use omicron_uuid_kinds::SupportBundleUuid; @@ -71,7 +71,7 @@ pub(super) struct CaseBuilderRng { ereport_assignment_rng: TypedUuidRng, alert_rng: TypedUuidRng, support_bundle_rng: TypedUuidRng, - fact_rng: TypedUuidRng, + fact_rng: TypedUuidRng, } impl CaseBuilderRng { @@ -112,7 +112,7 @@ impl CaseBuilderRng { self.support_bundle_rng.next() } - pub(super) fn next_fact(&mut self) -> CaseFactUuid { + pub(super) fn next_fact(&mut self) -> FactUuid { self.fact_rng.next() } } diff --git a/nexus/fm/src/diagnosis/disk.rs b/nexus/fm/src/diagnosis/disk.rs index 1cd8defad17..ff582bcfd44 100644 --- a/nexus/fm/src/diagnosis/disk.rs +++ b/nexus/fm/src/diagnosis/disk.rs @@ -9,13 +9,13 @@ use crate::analysis_input::Input; use nexus_types::fm::DiagnosisEngineKind; use nexus_types::in_service_disk::InServiceDisk; use nexus_types::inventory::ZpoolHealth; -use omicron_uuid_kinds::{CaseFactUuid, CaseUuid, ZpoolUuid}; +use omicron_uuid_kinds::{CaseUuid, FactUuid, ZpoolUuid}; use serde::{Deserialize, Serialize}; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; /// Per-fact state for the disk diagnosis engine, serialized into the -/// `fm_case_fact.payload` JSONB column. Other diagnosis engines must not +/// `fm_fact.payload` JSONB column. Other diagnosis engines must not /// inspect or modify this; shared FM code treats it as opaque bytes. #[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] #[serde(tag = "kind", rename_all = "snake_case")] @@ -30,7 +30,7 @@ struct ParentCaseSummary { /// recorded zpool. There can be multiple facts in pathological cases /// (e.g., two zpool ids on the same case after a hand-edit); the /// diagnoser keeps all of them in its accounting. - zpool_unhealthy: BTreeMap>, + zpool_unhealthy: BTreeMap>, } pub(super) fn analyze( @@ -325,7 +325,7 @@ mod tests { let mut facts = iddqd::IdOrdMap::new(); facts .insert_unique(fm::case::Fact { - id: omicron_uuid_kinds::CaseFactUuid::new_v4(), + id: omicron_uuid_kinds::FactUuid::new_v4(), created_sitrep_id: parent_sitrep_id, payload: serde_json::to_value(&DiskFact::ZpoolUnhealthy { zpool_id, @@ -533,7 +533,7 @@ mod tests { let parent_sitrep_id = SitrepUuid::new_v4(); let unreadable_case_id = omicron_uuid_kinds::CaseUuid::new_v4(); let mut parent_cases = iddqd::IdOrdMap::new(); - let unreadable_fact_id = omicron_uuid_kinds::CaseFactUuid::new_v4(); + let unreadable_fact_id = omicron_uuid_kinds::FactUuid::new_v4(); let unreadable_payload = serde_json::json!({ "kind": "this_variant_does_not_exist", "mystery": "data", diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index 239349de13a..b2a2fdf3f8c 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -11,7 +11,7 @@ use crate::support_bundle::BundleDataSelection; use anyhow::Context; use iddqd::{IdOrdItem, IdOrdMap}; use omicron_uuid_kinds::{ - AlertUuid, CaseEreportUuid, CaseFactUuid, CaseUuid, SitrepUuid, + AlertUuid, CaseEreportUuid, CaseUuid, FactUuid, SitrepUuid, SupportBundleUuid, }; use serde::{Deserialize, Serialize}; @@ -183,7 +183,7 @@ impl CaseEreport { /// and compare typed enum values — never the raw payload. #[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] pub struct Fact { - pub id: CaseFactUuid, + pub id: FactUuid, /// The sitrep in which this fact was first added. Preserved /// unchanged when the fact is carried forward into a child sitrep. /// Debug-only. @@ -193,7 +193,7 @@ pub struct Fact { } impl IdOrdItem for Fact { - type Key<'a> = &'a CaseFactUuid; + type Key<'a> = &'a FactUuid; fn key(&self) -> Self::Key<'_> { &self.id } @@ -486,8 +486,8 @@ mod tests { use crate::support_bundle::BundleDataSelection; use ereport_types::{Ena, EreportId}; use omicron_uuid_kinds::{ - AlertUuid, CaseFactUuid, CaseUuid, EreporterRestartUuid, - OmicronZoneUuid, SitrepUuid, SupportBundleUuid, + AlertUuid, CaseUuid, EreporterRestartUuid, FactUuid, OmicronZoneUuid, + SitrepUuid, SupportBundleUuid, }; use std::str::FromStr; use std::sync::Arc; @@ -626,10 +626,8 @@ mod tests { let mut facts = IdOrdMap::new(); facts .insert_unique(Fact { - id: CaseFactUuid::from_str( - "f00f00f0-0f00-4f00-8f00-f00f00f00f00", - ) - .unwrap(), + id: FactUuid::from_str("f00f00f0-0f00-4f00-8f00-f00f00f00f00") + .unwrap(), created_sitrep_id, payload: serde_json::json!({ "kind": "FactFunLevel", diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index c5f70f3e246..9948d124513 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -7651,7 +7651,7 @@ ON omicron.public.fm_case (sitrep_id); -- Per-engine "facts" attached to a case. A fact's payload is a JSONB blob -- owned by the case's diagnosis engine (see `fm_case.de`) and opaque to -- shared FM code. -CREATE TABLE IF NOT EXISTS omicron.public.fm_case_fact ( +CREATE TABLE IF NOT EXISTS omicron.public.fm_fact ( -- Stable UUID for this fact across sitreps. id UUID NOT NULL, -- Sitrep this row belongs to. @@ -7672,8 +7672,8 @@ CREATE TABLE IF NOT EXISTS omicron.public.fm_case_fact ( ); CREATE INDEX IF NOT EXISTS - lookup_fm_case_facts_for_case -ON omicron.public.fm_case_fact (sitrep_id, case_id); + lookup_fm_facts_for_case +ON omicron.public.fm_fact (sitrep_id, case_id); CREATE TABLE IF NOT EXISTS omicron.public.fm_ereport_in_case ( -- ID of this association. When an ereport is assigned to a case, that diff --git a/schema/crdb/fm-disk-de-and-facts/up2.sql b/schema/crdb/fm-disk-de-and-facts/up2.sql index b9451c23699..64839157e05 100644 --- a/schema/crdb/fm-disk-de-and-facts/up2.sql +++ b/schema/crdb/fm-disk-de-and-facts/up2.sql @@ -1,4 +1,4 @@ -CREATE TABLE IF NOT EXISTS omicron.public.fm_case_fact ( +CREATE TABLE IF NOT EXISTS omicron.public.fm_fact ( id UUID NOT NULL, sitrep_id UUID NOT NULL, case_id UUID NOT NULL, diff --git a/schema/crdb/fm-disk-de-and-facts/up3.sql b/schema/crdb/fm-disk-de-and-facts/up3.sql index 3a1c82db315..02e661dd6d9 100644 --- a/schema/crdb/fm-disk-de-and-facts/up3.sql +++ b/schema/crdb/fm-disk-de-and-facts/up3.sql @@ -1 +1 @@ -CREATE INDEX IF NOT EXISTS lookup_fm_case_facts_for_case ON omicron.public.fm_case_fact (sitrep_id, case_id); +CREATE INDEX IF NOT EXISTS lookup_fm_facts_for_case ON omicron.public.fm_fact (sitrep_id, case_id); diff --git a/schema/crdb/fm-disk-de-and-facts/up3.verify.sql b/schema/crdb/fm-disk-de-and-facts/up3.verify.sql index e23c166614e..492674a12a3 100644 --- a/schema/crdb/fm-disk-de-and-facts/up3.verify.sql +++ b/schema/crdb/fm-disk-de-and-facts/up3.verify.sql @@ -1,2 +1,2 @@ -- DO NOT EDIT. Generated by test_migration_verification_files. -SELECT CAST(IF((SELECT true WHERE EXISTS (SELECT index_name FROM omicron.crdb_internal.table_indexes WHERE descriptor_name = 'fm_case_fact' AND index_name = 'lookup_fm_case_facts_for_case')),'true','Schema change verification failed: index lookup_fm_case_facts_for_case on table fm_case_fact does not exist') AS BOOL); +SELECT CAST(IF((SELECT true WHERE EXISTS (SELECT index_name FROM omicron.crdb_internal.table_indexes WHERE descriptor_name = 'fm_fact' AND index_name = 'lookup_fm_facts_for_case')),'true','Schema change verification failed: index lookup_fm_facts_for_case on table fm_fact does not exist') AS BOOL); diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 26d9eb33636..304e122201a 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -50,7 +50,6 @@ impl_typed_uuid_kinds! { BuiltInUser = {}, Case = {}, CaseEreport = {}, - CaseFact = {}, Collection = {}, ConsoleSession = {}, Dataset = {}, @@ -61,6 +60,7 @@ impl_typed_uuid_kinds! { ExternalIp = {}, ExternalSubnet = {}, ExternalZpool = {}, + Fact = {}, FmdHostCase = {}, FmdResource = {}, Instance = {}, From d021ea9412ecbd761d237a6535aea75e4f966063 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 20 May 2026 12:45:48 -0700 Subject: [PATCH 10/15] comments --- nexus/db-model/src/fm/case.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/nexus/db-model/src/fm/case.rs b/nexus/db-model/src/fm/case.rs index aa5187a8bd0..5ca49b7b187 100644 --- a/nexus/db-model/src/fm/case.rs +++ b/nexus/db-model/src/fm/case.rs @@ -83,10 +83,15 @@ impl CaseMetadata { #[diesel(table_name = fm_fact)] pub struct Fact { pub id: DbTypedUuid, + /// The sitrep to which this fact belongs. + /// + /// This will change as the fact is carried forward from + /// one sitrep to the next. pub sitrep_id: DbTypedUuid, pub case_id: DbTypedUuid, - /// Sitrep in which this fact was first added. Preserved unchanged - /// when the fact is carried forward; debug-only. + /// Sitrep in which this fact was first added. + /// + /// Preserved unchanged when the fact is carried forward; debug-only. pub created_sitrep_id: DbTypedUuid, pub payload: serde_json::Value, pub comment: String, From d6467a148f3698d60d0bc9eb0068bd74a119ea6f Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 20 May 2026 14:34:31 -0700 Subject: [PATCH 11/15] Guarantee at least one sitrep in omdb tests --- dev-tools/omdb/tests/test_all_output.rs | 6 +++++ nexus/src/app/mod.rs | 9 ++++++- nexus/src/lib.rs | 13 ++++++++++ nexus/test-interface/src/lib.rs | 11 +++++++++ nexus/test-utils/src/nexus_test.rs | 32 +++++++++++++++++++++++++ 5 files changed, 70 insertions(+), 1 deletion(-) diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 52fb598a4c6..34be86b23f1 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -187,6 +187,12 @@ async fn test_omdb_success_cases() { .wait_for_at_least_one_inventory_collection(Duration::from_secs(60)) .await; + // Wait until `fm_analysis` has committed at least one sitrep, so that the + // omdb snapshot for FM tasks is stable. (Otherwise sitrep IDs render as + // `None` or `Some(...)` depending on whether the task's natural cadence + // had landed by the time we sample it.) + cptestctx.wait_for_at_least_one_sitrep(Duration::from_secs(60)).await; + let mut output = String::new(); let invocations: &[&[&str]] = &[ diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 6778ea72598..5150a7e4834 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -316,7 +316,6 @@ pub struct Nexus { repo_depot_resolver: Box, /// Watch channel containing the currently-loaded fault management sitrep. - #[allow(dead_code)] sitrep_load_rx: watch::Receiver>, /// handle to pull update status data @@ -1148,6 +1147,14 @@ impl Nexus { &self.db_datastore } + pub(crate) fn sitrep_load_rx( + &self, + ) -> watch::Receiver> { + let mut rx = self.sitrep_load_rx.clone(); + rx.mark_unchanged(); + rx + } + pub(crate) fn samael_max_issue_delay(&self) -> Option { let mid = self.samael_max_issue_delay.lock().unwrap(); *mid diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index 929342771a8..88a366b9a69 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -524,6 +524,19 @@ impl nexus_test_interface::NexusServer for Server { self.apictx.context.nexus.inventory_load_rx() } + fn sitrep_load_rx( + &self, + ) -> watch::Receiver< + Option< + Arc<( + nexus_types::fm::SitrepVersion, + nexus_types::fm::Sitrep, + )>, + >, + > { + self.apictx.context.nexus.sitrep_load_rx() + } + fn get_http_server_external_address(&self) -> SocketAddr { self.apictx.context.nexus.get_external_server_address().unwrap() } diff --git a/nexus/test-interface/src/lib.rs b/nexus/test-interface/src/lib.rs index 35f726740e1..3d0f9865493 100644 --- a/nexus/test-interface/src/lib.rs +++ b/nexus/test-interface/src/lib.rs @@ -88,6 +88,17 @@ pub trait NexusServer: Send + Sync + 'static { fn inventory_load_rx(&self) -> watch::Receiver>>; + fn sitrep_load_rx( + &self, + ) -> watch::Receiver< + Option< + Arc<( + nexus_types::fm::SitrepVersion, + nexus_types::fm::Sitrep, + )>, + >, + >; + fn get_http_server_external_address(&self) -> SocketAddr; fn get_http_server_techport_address(&self) -> SocketAddr; fn get_http_server_internal_address(&self) -> SocketAddr; diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs index 693aea88732..745ea7b31bc 100644 --- a/nexus/test-utils/src/nexus_test.rs +++ b/nexus/test-utils/src/nexus_test.rs @@ -200,6 +200,38 @@ impl ControlPlaneTestContext { } } + /// Wait until at least one fault management sitrep has been committed and + /// loaded. + /// + /// # Panics + /// + /// Panics if no sitrep is loaded within `timeout`. + pub async fn wait_for_at_least_one_sitrep(&self, timeout: Duration) { + let mut sitrep_rx = self.server.sitrep_load_rx(); + + match wait_for_watch_channel_condition( + &mut sitrep_rx, + async |sitrep| { + if sitrep.is_some() { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + timeout, + ) + .await + { + Ok(()) => (), + Err(poll::Error::TimedOut(elapsed)) => { + panic!("no sitrep found within {elapsed:?}"); + } + Err(poll::Error::PermanentError(())) => { + unreachable!("check can only fail via timeout") + } + } + } + pub fn internal_client(&self) -> nexus_client::Client { nexus_client::Client::new( &format!("http://{}", self.internal_client.bind_address), From b33b0e00835db87d395f09b3430d300fdd0d8757 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 20 May 2026 14:41:34 -0700 Subject: [PATCH 12/15] disk -> physical_disk --- nexus/fm/src/diagnosis/mod.rs | 10 +++++----- nexus/fm/src/diagnosis/{disk.rs => physical_disk.rs} | 0 2 files changed, 5 insertions(+), 5 deletions(-) rename nexus/fm/src/diagnosis/{disk.rs => physical_disk.rs} (100%) diff --git a/nexus/fm/src/diagnosis/mod.rs b/nexus/fm/src/diagnosis/mod.rs index 266a5da0809..9e9f1959c96 100644 --- a/nexus/fm/src/diagnosis/mod.rs +++ b/nexus/fm/src/diagnosis/mod.rs @@ -11,13 +11,13 @@ use crate::SitrepBuilder; use crate::analysis_input::Input; -mod disk; +mod physical_disk; pub fn analyze( input: &Input, builder: &mut SitrepBuilder<'_>, ) -> anyhow::Result<()> { - disk::analyze(input, builder)?; + physical_disk::analyze(input, builder)?; Ok(()) } @@ -25,9 +25,9 @@ pub fn analyze( /// how to consume. The background task uses this to filter loaded ereports /// — there is no value in loading ereports FM analysis cannot consume. /// -/// Empty today: the only enabled DE is the disk DE, which is polling-based -/// and consumes no ereports. Grow this list alongside FM analysis as new -/// classes gain ereport support. +/// Empty today: the only enabled DE is the physical disk DE, which is +/// polling-based and consumes no ereports. Grow this list alongside FM +/// analysis as new classes gain ereport support. /// /// **NULL-class ereports are intentionally excluded by the loader's SQL /// filter** (`class = ANY(...)` never matches NULL). If FM analysis ever diff --git a/nexus/fm/src/diagnosis/disk.rs b/nexus/fm/src/diagnosis/physical_disk.rs similarity index 100% rename from nexus/fm/src/diagnosis/disk.rs rename to nexus/fm/src/diagnosis/physical_disk.rs From 6ae7bc68daff4e02abb8d175d18d8a3732336d73 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 20 May 2026 15:01:33 -0700 Subject: [PATCH 13/15] IdOrdMap, ZpoolUnhealthyFact, adding initial observation, expanded report, comments --- nexus/fm/src/analysis_input.rs | 5 + nexus/fm/src/diagnosis/physical_disk.rs | 250 ++++++++++++------ nexus/src/lib.rs | 7 +- .../output/analysis_input_report_empty.out | 2 + .../output/analysis_input_report_same_inv.out | 2 + .../analysis_input_report_with_cases.out | 2 + nexus/types/src/fm/analysis_reports.rs | 26 +- 7 files changed, 211 insertions(+), 83 deletions(-) diff --git a/nexus/fm/src/analysis_input.rs b/nexus/fm/src/analysis_input.rs index 2698fdf5c5e..2f8e135cbb2 100644 --- a/nexus/fm/src/analysis_input.rs +++ b/nexus/fm/src/analysis_input.rs @@ -197,6 +197,11 @@ impl Builder { .collect(), open_cases: BTreeMap::new(), closed_cases_copied_forward: BTreeMap::new(), + in_service_disks: self + .in_service_disks + .iter() + .map(|d| d.physical_disk_id) + .collect(), }; // Determine which cases must be copied forwards into the next sitrep. diff --git a/nexus/fm/src/diagnosis/physical_disk.rs b/nexus/fm/src/diagnosis/physical_disk.rs index ff582bcfd44..d85e2f92e87 100644 --- a/nexus/fm/src/diagnosis/physical_disk.rs +++ b/nexus/fm/src/diagnosis/physical_disk.rs @@ -6,13 +6,14 @@ use crate::SitrepBuilder; use crate::analysis_input::Input; +use chrono::{DateTime, Utc}; +use iddqd::{IdOrdItem, IdOrdMap, id_upcast}; use nexus_types::fm::DiagnosisEngineKind; -use nexus_types::in_service_disk::InServiceDisk; use nexus_types::inventory::ZpoolHealth; -use omicron_uuid_kinds::{CaseUuid, FactUuid, ZpoolUuid}; +use omicron_uuid_kinds::{CaseUuid, CollectionUuid, FactUuid, ZpoolUuid}; use serde::{Deserialize, Serialize}; use slog_error_chain::InlineErrorChain; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, BTreeSet}; /// Per-fact state for the disk diagnosis engine, serialized into the /// `fm_fact.payload` JSONB column. Other diagnosis engines must not @@ -21,54 +22,116 @@ use std::collections::BTreeMap; #[serde(tag = "kind", rename_all = "snake_case")] pub enum DiskFact { /// The zpool's most recently observed health is non-`Online`. - ZpoolUnhealthy { zpool_id: ZpoolUuid, last_seen_health: ZpoolHealth }, + ZpoolUnhealthy(ZpoolUnhealthyFact), +} + +/// Payload of a [`DiskFact::ZpoolUnhealthy`] fact. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)] +pub struct ZpoolUnhealthyFact { + pub zpool_id: ZpoolUuid, + pub last_seen_health: ZpoolHealth, + /// Inventory collection that produced this observation. Recorded for + /// provenance: if multiple `ZpoolUnhealthy` facts ever end up on the + /// same case, this lets a human reader see which inventory each came + /// from. + pub observed_in_inv: CollectionUuid, + /// `time_done` of `observed_in_inv`. + pub time_observed: DateTime, +} + +/// A parent-forwarded [`DiskFact::ZpoolUnhealthy`] fact paired with the +/// `FactUuid` it lives under. Used to index parent-case facts during +/// analysis; not serialized. +#[derive(Clone, Copy, Debug)] +struct UnhealthyZpoolFact { + fact_id: FactUuid, + fact: ZpoolUnhealthyFact, +} + +impl IdOrdItem for UnhealthyZpoolFact { + type Key<'a> = FactUuid; + fn key(&self) -> Self::Key<'_> { + self.fact_id + } + id_upcast!(); +} + +/// One observed zpool's health, keyed by `zpool_id`. Used for both the +/// inventory snapshot and the filtered "currently faulty" subset. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct ObservedZpool { + zpool_id: ZpoolUuid, + health: ZpoolHealth, +} + +impl IdOrdItem for ObservedZpool { + type Key<'a> = ZpoolUuid; + fn key(&self) -> Self::Key<'_> { + self.zpool_id + } + id_upcast!(); } /// Per-case summary built from a case's parent-forwarded facts. struct ParentCaseSummary { - /// IDs of `ZpoolUnhealthy` facts on this case, grouped by their - /// recorded zpool. There can be multiple facts in pathological cases - /// (e.g., two zpool ids on the same case after a hand-edit); the - /// diagnoser keeps all of them in its accounting. - zpool_unhealthy: BTreeMap>, + /// All `ZpoolUnhealthy` facts on this case. In typical operation there + /// is at most one fact per zpool; pathological cases (e.g., hand-edited + /// DB) can have multiple — the diagnosis engine keeps all of them. + unhealthy_zpools: IdOrdMap, +} + +impl ParentCaseSummary { + fn zpool_ids(&self) -> BTreeSet { + self.unhealthy_zpools.iter().map(|f| f.fact.zpool_id).collect() + } + + fn facts_for_zpool( + &self, + zpool_id: ZpoolUuid, + ) -> impl Iterator { + self.unhealthy_zpools + .iter() + .filter(move |f| f.fact.zpool_id == zpool_id) + } } pub(super) fn analyze( input: &Input, builder: &mut SitrepBuilder<'_>, ) -> anyhow::Result<()> { - // The disk DE's primary key today is `zpool_id`, so we build a local - // index keyed by zpool. Future variants of `DiskFact` are welcome to - // derive their own secondary indices (e.g., by `sled_id` for FMD). - let in_service_by_zpool: BTreeMap = - input.in_service_disks().iter().map(|d| (d.zpool_id, d)).collect(); + let inv_collection_id = input.inventory().id; + let inv_time_done = input.inventory().time_done; + + // The disk DE's lookups today are by `zpool_id`, so we build a local + // set of in-service zpools. Future variants of `DiskFact` are welcome + // to derive their own indices (e.g., by `sled_id` for FMD). + let in_service_zpools: BTreeSet = + input.in_service_disks().iter().map(|d| d.zpool_id).collect(); // Index every zpool we observed in this inventory by ID, so we can // distinguish "saw it, it's Online" from "didn't see it at all" below. - let observed: BTreeMap = input + let observed: IdOrdMap = input .inventory() .sled_agents .iter() .flat_map(|sa| sa.zpools.iter()) - .map(|z| (z.id, z.health)) + .map(|z| ObservedZpool { zpool_id: z.id, health: z.health }) .collect(); // Currently-faulty, control-plane-managed zpools. // // Out-of-service zpools are intentionally ignored: a non-`Online` zpool // whose disk has been expunged is no longer the control plane's concern. - let faulty: BTreeMap = observed + let faulty: IdOrdMap = observed .iter() - .filter(|(id, _)| in_service_by_zpool.contains_key(*id)) - .filter(|(_, h)| **h != ZpoolHealth::Online) - .map(|(id, h)| (*id, *h)) + .filter(|z| in_service_zpools.contains(&z.zpool_id)) + .filter(|z| z.health != ZpoolHealth::Online) + .copied() .collect(); - // Inspect parent-forwarded Disk cases from the input (i.e., the state - // copied from the parent sitrep — *not* the in-progress builder, which - // we will mutate below). Each case's facts are JSON blobs owned by this - // engine; deserialize each one as DiskFact. Skip (with a warning) any - // fact we can't read. + // Index parent-forwarded Disk cases from the input — the state copied + // from the parent sitrep, *not* the in-progress builder we mutate + // below. Skip (with a warning) any fact we can't deserialize. let parent_summaries: BTreeMap = input .open_cases() .iter() @@ -76,24 +139,24 @@ pub(super) fn analyze( .map(|c| { let case_id = c.id; let mut summary = - ParentCaseSummary { zpool_unhealthy: BTreeMap::new() }; + ParentCaseSummary { unhealthy_zpools: IdOrdMap::new() }; for fact in c.facts.iter() { match fact.payload_to::() { - Ok(DiskFact::ZpoolUnhealthy { - zpool_id, - last_seen_health, - }) => { + Ok(DiskFact::ZpoolUnhealthy(inner)) => { summary - .zpool_unhealthy - .entry(zpool_id) - .or_default() - .push((fact.id, last_seen_health)); + .unhealthy_zpools + .insert_unique(UnhealthyZpoolFact { + fact_id: fact.id, + fact: inner, + }) + .expect("fact ids are unique within a case"); } Err(e) => { slog::warn!( &builder.log, - "skipping Disk case fact with unreadable \ - payload; this run will not modify it"; + "skipping Disk case fact whose payload did not \ + deserialize as DiskFact; this run will not \ + modify it"; "case_id" => %case_id, "fact_id" => %fact.id, "error" => InlineErrorChain::new(&*e).to_string(), @@ -106,29 +169,37 @@ pub(super) fn analyze( .collect(); // Close any open Disk case whose entire set of (interpretable) facts - // points at zpools that are now Online or expunged. Closed cases are not - // copied forward, so their facts naturally drop with them. + // points at zpools that are now Online or expunged. // // Absence from inventory is NOT a recovery signal: a sled could be // powered off, or inventory could be lossy. for (case_id, summary) in &parent_summaries { - if summary.zpool_unhealthy.is_empty() { + if summary.unhealthy_zpools.is_empty() { continue; } - let any_still_unhealthy = - summary.zpool_unhealthy.keys().any(|zpool_id| { - in_service_by_zpool.contains_key(zpool_id) - && observed.get(zpool_id) != Some(&ZpoolHealth::Online) - }); + let mut reasons: Vec = Vec::new(); + let mut any_still_unhealthy = false; + for zpool_id in summary.zpool_ids() { + if !in_service_zpools.contains(&zpool_id) { + reasons.push(format!("zpool {zpool_id} no longer in service")); + } else if observed.get(&zpool_id).map(|z| z.health) + == Some(ZpoolHealth::Online) + { + reasons.push(format!("zpool {zpool_id} back to Online")); + } else { + any_still_unhealthy = true; + break; + } + } if !any_still_unhealthy { builder .cases .case_mut(case_id) .expect("case_id came from iterating builder.cases") - .close( - "all ZpoolUnhealthy facts have resolved (zpool back to \ - Online, or disk no longer in service)", - ); + .close(format!( + "all ZpoolUnhealthy facts resolved: {}", + reasons.join("; "), + )); } } @@ -143,14 +214,12 @@ pub(super) fn analyze( if !case_mut.is_open() { continue; } - for (zpool_id, facts) in &summary.zpool_unhealthy { - let Some(current_health) = faulty.get(zpool_id) else { + for fact_ref in summary.unhealthy_zpools.iter() { + let Some(current) = faulty.get(&fact_ref.fact.zpool_id) else { continue; }; - for (fact_id, last_seen_health) in facts { - if *current_health != *last_seen_health { - case_mut.remove_fact(*fact_id); - } + if current.health != fact_ref.fact.last_seen_health { + case_mut.remove_fact(fact_ref.fact_id); } } } @@ -158,19 +227,25 @@ pub(super) fn analyze( // For each currently-faulty in-service zpool: if there's no open case + // accurate fact already covering it, ensure a case exists (reuse the // parent-forwarded one for this zpool if any) and add a fresh fact. - for (zpool_id, current_health) in faulty { + for current in faulty.iter() { + let zpool_id = current.zpool_id; + let current_health = current.health; let parent_for_zpool = parent_summaries.iter().find_map(|(case_id, summary)| { - summary - .zpool_unhealthy - .get(&zpool_id) - .map(|facts| (*case_id, facts)) + let mut facts = summary.facts_for_zpool(zpool_id).peekable(); + if facts.peek().is_some() { + Some((*case_id, summary)) + } else { + None + } }); let case_id_for_fact = match parent_for_zpool { // Parent case already has an accurate fact — fully covered. - Some((_, facts)) - if facts.iter().any(|(_, h)| *h == current_health) => + Some((_, summary)) + if summary.facts_for_zpool(zpool_id).any(|f| { + f.fact.last_seen_health == current_health + }) => { continue; } @@ -191,10 +266,12 @@ pub(super) fn analyze( .case_mut(&case_id_for_fact) .expect("case_id came from this fn") .add_fact( - &DiskFact::ZpoolUnhealthy { + &DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFact { zpool_id, last_seen_health: current_health, - }, + observed_in_inv: inv_collection_id, + time_observed: inv_time_done, + }), format!("zpool {zpool_id} health={current_health}"), )?; } @@ -212,6 +289,7 @@ mod tests { use iddqd::IdOrdMap; use nexus_types::external_api::physical_disk::PhysicalDiskKind; use nexus_types::fm::{self, Sitrep, SitrepVersion}; + use nexus_types::in_service_disk::InServiceDisk; use nexus_types::inventory; use omicron_test_utils::dev; use omicron_uuid_kinds::{ @@ -327,10 +405,14 @@ mod tests { .insert_unique(fm::case::Fact { id: omicron_uuid_kinds::FactUuid::new_v4(), created_sitrep_id: parent_sitrep_id, - payload: serde_json::to_value(&DiskFact::ZpoolUnhealthy { - zpool_id, - last_seen_health: ZpoolHealth::Degraded, - }) + payload: serde_json::to_value(&DiskFact::ZpoolUnhealthy( + ZpoolUnhealthyFact { + zpool_id, + last_seen_health: ZpoolHealth::Degraded, + observed_in_inv: inv_collection_id, + time_observed: Utc::now(), + }, + )) .unwrap(), comment: format!("zpool {zpool_id} degraded"), }) @@ -397,7 +479,11 @@ mod tests { let facts = disk_facts(&sitrep, true); assert_eq!(facts.len(), 1); match &facts[0].2 { - DiskFact::ZpoolUnhealthy { zpool_id, last_seen_health } => { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFact { + zpool_id, + last_seen_health, + .. + }) => { assert_eq!(*zpool_id, target); assert_eq!(*last_seen_health, ZpoolHealth::Degraded); } @@ -439,7 +525,9 @@ mod tests { let open_cases = disk_facts(&sitrep, true); assert_eq!(open_cases.len(), 1); match &open_cases[0].2 { - DiskFact::ZpoolUnhealthy { zpool_id, .. } => { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFact { + zpool_id, .. + }) => { assert_eq!(*zpool_id, target); } } @@ -517,7 +605,7 @@ mod tests { } /// A parent Disk case carries a fact whose JSON `kind` doesn't match any - /// variant we know. The diagnoser should leave that case alone (not + /// variant we know. The diagnosis engine should leave that case alone (not /// touch its open/closed state, not mutate the fact, not count it /// toward dedup) and freely open a new case for a separate live signal. #[test] @@ -588,7 +676,7 @@ mod tests { .expect("unreadable case should be copied forward"); assert!( unreadable.is_open(), - "unreadable case must not be closed by the diagnoser", + "unreadable case must not be closed by the diagnosis engine", ); let kept_fact = unreadable .facts @@ -607,14 +695,16 @@ mod tests { signal", ); match &readable[0].2 { - DiskFact::ZpoolUnhealthy { zpool_id, .. } => { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFact { + zpool_id, .. + }) => { assert_eq!(*zpool_id, live_target); } } logctx.cleanup_successful(); } - /// When the parent sitrep's fact content matches the diagnoser's current + /// When the parent sitrep's fact content matches the diagnosis engine's current /// observation, the fact carries forward with the same UUID — no /// remove-and-readd churn. #[test] @@ -648,7 +738,11 @@ mod tests { observation hasn't changed", ); match &open[0].2 { - DiskFact::ZpoolUnhealthy { zpool_id, last_seen_health } => { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFact { + zpool_id, + last_seen_health, + .. + }) => { assert_eq!(*zpool_id, target); assert_eq!(*last_seen_health, ZpoolHealth::Degraded); } @@ -657,7 +751,7 @@ mod tests { } /// When the parent's fact recorded a different `last_seen_health` than - /// what we observe now, the diagnoser removes the stale fact and emits + /// what we observe now, the diagnosis engine removes the stale fact and emits /// a fresh one (new UUID). The case stays open because the zpool is /// still unhealthy — just with a different value. #[test] @@ -694,7 +788,11 @@ mod tests { "fact UUID should rotate because last_seen_health changed", ); match &open[0].2 { - DiskFact::ZpoolUnhealthy { zpool_id, last_seen_health } => { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFact { + zpool_id, + last_seen_health, + .. + }) => { assert_eq!(*zpool_id, target); assert_eq!(*last_seen_health, ZpoolHealth::Faulted); } diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index 88a366b9a69..5df5b7ef3f3 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -527,12 +527,7 @@ impl nexus_test_interface::NexusServer for Server { fn sitrep_load_rx( &self, ) -> watch::Receiver< - Option< - Arc<( - nexus_types::fm::SitrepVersion, - nexus_types::fm::Sitrep, - )>, - >, + Option>, > { self.apictx.context.nexus.sitrep_load_rx() } diff --git a/nexus/types/output/analysis_input_report_empty.out b/nexus/types/output/analysis_input_report_empty.out index f221b3960a0..a479e87d36c 100644 --- a/nexus/types/output/analysis_input_report_empty.out +++ b/nexus/types/output/analysis_input_report_empty.out @@ -4,3 +4,5 @@ parent sitrep: inventory collection: bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb no new ereports since the parent sitrep no cases copied forward + +no in-service control plane disks diff --git a/nexus/types/output/analysis_input_report_same_inv.out b/nexus/types/output/analysis_input_report_same_inv.out index 5ed6b60d151..25631755f78 100644 --- a/nexus/types/output/analysis_input_report_same_inv.out +++ b/nexus/types/output/analysis_input_report_same_inv.out @@ -5,3 +5,5 @@ inventory collection: bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb --> same collection as parent sitrep no new ereports since the parent sitrep no cases copied forward + +no in-service control plane disks diff --git a/nexus/types/output/analysis_input_report_with_cases.out b/nexus/types/output/analysis_input_report_with_cases.out index f3af0fe55ff..a1fc877ef72 100644 --- a/nexus/types/output/analysis_input_report_with_cases.out +++ b/nexus/types/output/analysis_input_report_with_cases.out @@ -22,3 +22,5 @@ cases (2 total): closed in sitrep: 55555555-5555-5555-5555-555555555555 copied forwards because these ereports haven't been marked seen yet: * ereport dddddddd-dddd-dddd-dddd-dddddddddddd:2 + +no in-service control plane disks diff --git a/nexus/types/src/fm/analysis_reports.rs b/nexus/types/src/fm/analysis_reports.rs index 249e26f1b8b..42b6c6537a4 100644 --- a/nexus/types/src/fm/analysis_reports.rs +++ b/nexus/types/src/fm/analysis_reports.rs @@ -9,7 +9,9 @@ use super::case; use super::ereport::EreportId; use super::json_display::fmt_json_value; use iddqd::IdOrdMap; -use omicron_uuid_kinds::{CaseUuid, CollectionUuid, SitrepUuid}; +use omicron_uuid_kinds::{ + CaseUuid, CollectionUuid, PhysicalDiskUuid, SitrepUuid, +}; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; use std::collections::BTreeSet; @@ -227,6 +229,9 @@ pub struct InputReport { /// Cases which have closed, but which have been copied forwards as they /// contain ereports which have not yet been marked seen. pub closed_cases_copied_forward: BTreeMap, + /// All control-plane-managed physical disks visible to the diagnosis + /// engines for this analysis pass. + pub in_service_disks: BTreeSet, } #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] @@ -257,6 +262,7 @@ impl fmt::Display for InputReportMultilineDisplay<'_> { new_ereport_ids, open_cases, closed_cases_copied_forward, + in_service_disks, }, indent, } = self; @@ -357,6 +363,21 @@ impl fmt::Display for InputReportMultilineDisplay<'_> { writeln!(f, "{:indent$}no cases copied forward", "")?; } + if in_service_disks.is_empty() { + writeln!(f, "\n{:indent$}no in-service control plane disks", "")?; + } else { + writeln!( + f, + "\n{:indent$}in-service control plane disks ({} total):", + "", + in_service_disks.len() + )?; + let indent = indent + 2; + for disk_id in in_service_disks { + writeln!(f, "{:indent$}* disk {disk_id}", "")?; + } + } + Ok(()) } } @@ -441,6 +462,7 @@ mod tests { new_ereport_ids, open_cases, closed_cases_copied_forward, + in_service_disks: BTreeSet::new(), } } @@ -456,6 +478,7 @@ mod tests { new_ereport_ids: BTreeSet::new(), open_cases: BTreeMap::new(), closed_cases_copied_forward: BTreeMap::new(), + in_service_disks: BTreeSet::new(), } } @@ -474,6 +497,7 @@ mod tests { new_ereport_ids: BTreeSet::new(), open_cases: BTreeMap::new(), closed_cases_copied_forward: BTreeMap::new(), + in_service_disks: BTreeSet::new(), } } From 503fe79cd2352e56b50c623ea598c5588794e8d9 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 20 May 2026 15:42:20 -0700 Subject: [PATCH 14/15] omdb output, struct naming --- dev-tools/omdb/tests/successes.out | 12 +++-- nexus/fm/src/diagnosis/physical_disk.rs | 58 +++++++++++++------------ nexus/test-interface/src/lib.rs | 7 +-- 3 files changed, 39 insertions(+), 38 deletions(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 52bc5d9992b..aa9a4decc04 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -715,6 +715,8 @@ task: "fm_analysis" no new ereports since the parent sitrep no cases copied forward + no in-service control plane disks + fault management analysis report -------------------------------- sitrep ID: ..................... @@ -761,10 +763,10 @@ task: "fm_sitrep_gc" batches: 1 orphaned fm_case rows deleted: 0 batches: 1 - orphaned fm_fact rows deleted: 0 - batches: 1 orphaned fm_ereport_in_case rows deleted: 0 batches: 1 + orphaned fm_fact rows deleted: 0 + batches: 1 orphaned fm_support_bundle_request rows deleted: 0 batches: 1 orphaned fm_support_bundle_request_data_selection_ereports rows deleted: 0 @@ -1411,6 +1413,8 @@ task: "fm_analysis" no new ereports since the parent sitrep no cases copied forward + no in-service control plane disks + fault management analysis report -------------------------------- sitrep ID: ..................... @@ -1457,10 +1461,10 @@ task: "fm_sitrep_gc" batches: 1 orphaned fm_case rows deleted: 0 batches: 1 - orphaned fm_fact rows deleted: 0 - batches: 1 orphaned fm_ereport_in_case rows deleted: 0 batches: 1 + orphaned fm_fact rows deleted: 0 + batches: 1 orphaned fm_support_bundle_request rows deleted: 0 batches: 1 orphaned fm_support_bundle_request_data_selection_ereports rows deleted: 0 diff --git a/nexus/fm/src/diagnosis/physical_disk.rs b/nexus/fm/src/diagnosis/physical_disk.rs index d85e2f92e87..ac164f1def2 100644 --- a/nexus/fm/src/diagnosis/physical_disk.rs +++ b/nexus/fm/src/diagnosis/physical_disk.rs @@ -22,12 +22,12 @@ use std::collections::{BTreeMap, BTreeSet}; #[serde(tag = "kind", rename_all = "snake_case")] pub enum DiskFact { /// The zpool's most recently observed health is non-`Online`. - ZpoolUnhealthy(ZpoolUnhealthyFact), + ZpoolUnhealthy(ZpoolUnhealthyFactPayload), } /// Payload of a [`DiskFact::ZpoolUnhealthy`] fact. #[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)] -pub struct ZpoolUnhealthyFact { +pub struct ZpoolUnhealthyFactPayload { pub zpool_id: ZpoolUuid, pub last_seen_health: ZpoolHealth, /// Inventory collection that produced this observation. Recorded for @@ -39,16 +39,16 @@ pub struct ZpoolUnhealthyFact { pub time_observed: DateTime, } -/// A parent-forwarded [`DiskFact::ZpoolUnhealthy`] fact paired with the -/// `FactUuid` it lives under. Used to index parent-case facts during +/// A [`DiskFact::ZpoolUnhealthy`] payload paired with the `FactUuid` it +/// lives under. Used to build in-memory indices over facts during /// analysis; not serialized. #[derive(Clone, Copy, Debug)] -struct UnhealthyZpoolFact { +struct ZpoolUnhealthyFact { fact_id: FactUuid, - fact: ZpoolUnhealthyFact, + payload: ZpoolUnhealthyFactPayload, } -impl IdOrdItem for UnhealthyZpoolFact { +impl IdOrdItem for ZpoolUnhealthyFact { type Key<'a> = FactUuid; fn key(&self) -> Self::Key<'_> { self.fact_id @@ -77,21 +77,21 @@ struct ParentCaseSummary { /// All `ZpoolUnhealthy` facts on this case. In typical operation there /// is at most one fact per zpool; pathological cases (e.g., hand-edited /// DB) can have multiple — the diagnosis engine keeps all of them. - unhealthy_zpools: IdOrdMap, + unhealthy_zpools: IdOrdMap, } impl ParentCaseSummary { fn zpool_ids(&self) -> BTreeSet { - self.unhealthy_zpools.iter().map(|f| f.fact.zpool_id).collect() + self.unhealthy_zpools.iter().map(|f| f.payload.zpool_id).collect() } fn facts_for_zpool( &self, zpool_id: ZpoolUuid, - ) -> impl Iterator { + ) -> impl Iterator { self.unhealthy_zpools .iter() - .filter(move |f| f.fact.zpool_id == zpool_id) + .filter(move |f| f.payload.zpool_id == zpool_id) } } @@ -142,12 +142,12 @@ pub(super) fn analyze( ParentCaseSummary { unhealthy_zpools: IdOrdMap::new() }; for fact in c.facts.iter() { match fact.payload_to::() { - Ok(DiskFact::ZpoolUnhealthy(inner)) => { + Ok(DiskFact::ZpoolUnhealthy(payload)) => { summary .unhealthy_zpools - .insert_unique(UnhealthyZpoolFact { + .insert_unique(ZpoolUnhealthyFact { fact_id: fact.id, - fact: inner, + payload, }) .expect("fact ids are unique within a case"); } @@ -215,10 +215,10 @@ pub(super) fn analyze( continue; } for fact_ref in summary.unhealthy_zpools.iter() { - let Some(current) = faulty.get(&fact_ref.fact.zpool_id) else { + let Some(current) = faulty.get(&fact_ref.payload.zpool_id) else { continue; }; - if current.health != fact_ref.fact.last_seen_health { + if current.health != fact_ref.payload.last_seen_health { case_mut.remove_fact(fact_ref.fact_id); } } @@ -243,9 +243,9 @@ pub(super) fn analyze( let case_id_for_fact = match parent_for_zpool { // Parent case already has an accurate fact — fully covered. Some((_, summary)) - if summary.facts_for_zpool(zpool_id).any(|f| { - f.fact.last_seen_health == current_health - }) => + if summary + .facts_for_zpool(zpool_id) + .any(|f| f.payload.last_seen_health == current_health) => { continue; } @@ -266,7 +266,7 @@ pub(super) fn analyze( .case_mut(&case_id_for_fact) .expect("case_id came from this fn") .add_fact( - &DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFact { + &DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { zpool_id, last_seen_health: current_health, observed_in_inv: inv_collection_id, @@ -406,7 +406,7 @@ mod tests { id: omicron_uuid_kinds::FactUuid::new_v4(), created_sitrep_id: parent_sitrep_id, payload: serde_json::to_value(&DiskFact::ZpoolUnhealthy( - ZpoolUnhealthyFact { + ZpoolUnhealthyFactPayload { zpool_id, last_seen_health: ZpoolHealth::Degraded, observed_in_inv: inv_collection_id, @@ -479,7 +479,7 @@ mod tests { let facts = disk_facts(&sitrep, true); assert_eq!(facts.len(), 1); match &facts[0].2 { - DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFact { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { zpool_id, last_seen_health, .. @@ -525,8 +525,9 @@ mod tests { let open_cases = disk_facts(&sitrep, true); assert_eq!(open_cases.len(), 1); match &open_cases[0].2 { - DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFact { - zpool_id, .. + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { + zpool_id, + .. }) => { assert_eq!(*zpool_id, target); } @@ -695,8 +696,9 @@ mod tests { signal", ); match &readable[0].2 { - DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFact { - zpool_id, .. + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { + zpool_id, + .. }) => { assert_eq!(*zpool_id, live_target); } @@ -738,7 +740,7 @@ mod tests { observation hasn't changed", ); match &open[0].2 { - DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFact { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { zpool_id, last_seen_health, .. @@ -788,7 +790,7 @@ mod tests { "fact UUID should rotate because last_seen_health changed", ); match &open[0].2 { - DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFact { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { zpool_id, last_seen_health, .. diff --git a/nexus/test-interface/src/lib.rs b/nexus/test-interface/src/lib.rs index 3d0f9865493..d2ea6a256cc 100644 --- a/nexus/test-interface/src/lib.rs +++ b/nexus/test-interface/src/lib.rs @@ -91,12 +91,7 @@ pub trait NexusServer: Send + Sync + 'static { fn sitrep_load_rx( &self, ) -> watch::Receiver< - Option< - Arc<( - nexus_types::fm::SitrepVersion, - nexus_types::fm::Sitrep, - )>, - >, + Option>, >; fn get_http_server_external_address(&self) -> SocketAddr; From 5dfc4fd9e3ee5bb539ece58183cf2de5c05d077e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 20 May 2026 17:02:19 -0700 Subject: [PATCH 15/15] Index by physical disk, refactor, check analysis report --- nexus/fm/src/diagnosis/physical_disk.rs | 432 ++++++++++++++++-------- 1 file changed, 284 insertions(+), 148 deletions(-) diff --git a/nexus/fm/src/diagnosis/physical_disk.rs b/nexus/fm/src/diagnosis/physical_disk.rs index ac164f1def2..3f4b8759531 100644 --- a/nexus/fm/src/diagnosis/physical_disk.rs +++ b/nexus/fm/src/diagnosis/physical_disk.rs @@ -10,10 +10,12 @@ use chrono::{DateTime, Utc}; use iddqd::{IdOrdItem, IdOrdMap, id_upcast}; use nexus_types::fm::DiagnosisEngineKind; use nexus_types::inventory::ZpoolHealth; -use omicron_uuid_kinds::{CaseUuid, CollectionUuid, FactUuid, ZpoolUuid}; +use omicron_uuid_kinds::{ + CaseUuid, CollectionUuid, FactUuid, PhysicalDiskUuid, ZpoolUuid, +}; use serde::{Deserialize, Serialize}; use slog_error_chain::InlineErrorChain; -use std::collections::{BTreeMap, BTreeSet}; +use std::collections::BTreeMap; /// Per-fact state for the disk diagnosis engine, serialized into the /// `fm_fact.payload` JSONB column. Other diagnosis engines must not @@ -28,6 +30,12 @@ pub enum DiskFact { /// Payload of a [`DiskFact::ZpoolUnhealthy`] fact. #[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)] pub struct ZpoolUnhealthyFactPayload { + /// The physical disk this fact (and its parent case) is about. + /// Every fact on a Disk case must agree on this value. + pub physical_disk_id: PhysicalDiskUuid, + /// The zpool whose health was observed. Kept for provenance — the + /// case is keyed by `physical_disk_id`, but knowing the exact zpool + /// makes the fact self-describing when read in isolation. pub zpool_id: ZpoolUuid, pub last_seen_health: ZpoolHealth, /// Inventory collection that produced this observation. Recorded for @@ -56,43 +64,33 @@ impl IdOrdItem for ZpoolUnhealthyFact { id_upcast!(); } -/// One observed zpool's health, keyed by `zpool_id`. Used for both the -/// inventory snapshot and the filtered "currently faulty" subset. +/// One in-service disk paired with its current observed health. +/// `health` is `None` when the disk's zpool was not seen in the current +/// inventory (e.g., sled down, lossy collection). #[derive(Clone, Copy, Debug, Eq, PartialEq)] -struct ObservedZpool { +struct DiskHealthSnapshot { + physical_disk_id: PhysicalDiskUuid, zpool_id: ZpoolUuid, - health: ZpoolHealth, + health: Option, } -impl IdOrdItem for ObservedZpool { - type Key<'a> = ZpoolUuid; +impl IdOrdItem for DiskHealthSnapshot { + type Key<'a> = PhysicalDiskUuid; fn key(&self) -> Self::Key<'_> { - self.zpool_id + self.physical_disk_id } id_upcast!(); } -/// Per-case summary built from a case's parent-forwarded facts. +/// Per-case summary built from a case's facts. Each Disk case is about a +/// single physical disk; every fact on the case must reference that disk. struct ParentCaseSummary { - /// All `ZpoolUnhealthy` facts on this case. In typical operation there - /// is at most one fact per zpool; pathological cases (e.g., hand-edited - /// DB) can have multiple — the diagnosis engine keeps all of them. - unhealthy_zpools: IdOrdMap, -} - -impl ParentCaseSummary { - fn zpool_ids(&self) -> BTreeSet { - self.unhealthy_zpools.iter().map(|f| f.payload.zpool_id).collect() - } - - fn facts_for_zpool( - &self, - zpool_id: ZpoolUuid, - ) -> impl Iterator { - self.unhealthy_zpools - .iter() - .filter(move |f| f.payload.zpool_id == zpool_id) - } + /// The physical disk this case is about. + physical_disk_id: PhysicalDiskUuid, + /// All `ZpoolUnhealthy` facts on this case. Normally one; pathological + /// cases (e.g., hand-edited DB) may have multiple — the diagnosis + /// engine keeps all of them. + unhealthy_facts: IdOrdMap, } pub(super) fn analyze( @@ -102,49 +100,64 @@ pub(super) fn analyze( let inv_collection_id = input.inventory().id; let inv_time_done = input.inventory().time_done; - // The disk DE's lookups today are by `zpool_id`, so we build a local - // set of in-service zpools. Future variants of `DiskFact` are welcome - // to derive their own indices (e.g., by `sled_id` for FMD). - let in_service_zpools: BTreeSet = - input.in_service_disks().iter().map(|d| d.zpool_id).collect(); - - // Index every zpool we observed in this inventory by ID, so we can - // distinguish "saw it, it's Online" from "didn't see it at all" below. - let observed: IdOrdMap = input + // Index every zpool we observed in this inventory, so we can distinguish + // "saw it, it's Online" from "didn't see it at all" when looking up by + // an in-service disk's zpool below. + let observed_health: BTreeMap = input .inventory() .sled_agents .iter() .flat_map(|sa| sa.zpools.iter()) - .map(|z| ObservedZpool { zpool_id: z.id, health: z.health }) + .map(|z| (z.id, z.health)) .collect(); - // Currently-faulty, control-plane-managed zpools. - // - // Out-of-service zpools are intentionally ignored: a non-`Online` zpool - // whose disk has been expunged is no longer the control plane's concern. - let faulty: IdOrdMap = observed + // The current health snapshot for every in-service disk, keyed by + // physical_disk_id. Absence from this index is a positive signal that + // the control plane has moved on from the disk (expungement / + // decommissioning); see prepare_inputs in + // nexus/src/app/background/tasks/fm_analysis.rs. + let in_service_health: IdOrdMap = input + .in_service_disks() .iter() - .filter(|z| in_service_zpools.contains(&z.zpool_id)) - .filter(|z| z.health != ZpoolHealth::Online) - .copied() + .map(|d| DiskHealthSnapshot { + physical_disk_id: d.physical_disk_id, + zpool_id: d.zpool_id, + health: observed_health.get(&d.zpool_id).copied(), + }) .collect(); // Index parent-forwarded Disk cases from the input — the state copied // from the parent sitrep, *not* the in-progress builder we mutate - // below. Skip (with a warning) any fact we can't deserialize. - let parent_summaries: BTreeMap = input + // below. Every case is about one physical disk; we derive the disk + // from its facts. Skip (with a warning) any case we can't safely + // interpret. + let parent_cases: BTreeMap = input .open_cases() .iter() .filter(|c| c.metadata.de == DiagnosisEngineKind::PhysicalDisk) - .map(|c| { + .filter_map(|c| { let case_id = c.id; - let mut summary = - ParentCaseSummary { unhealthy_zpools: IdOrdMap::new() }; + let mut unhealthy_facts: IdOrdMap = + IdOrdMap::new(); + let mut case_disk_id: Option = None; for fact in c.facts.iter() { match fact.payload_to::() { Ok(DiskFact::ZpoolUnhealthy(payload)) => { - summary - .unhealthy_zpools + let topic = *case_disk_id + .get_or_insert(payload.physical_disk_id); + if topic != payload.physical_disk_id { + slog::warn!( + &builder.log, + "skipping Disk case: facts reference \ + different physical disks (1 expected)"; + "case_id" => %case_id, + "expected_physical_disk_id" => %topic, + "fact_physical_disk_id" => + %payload.physical_disk_id, + ); + return None; + } + unhealthy_facts .insert_unique(ZpoolUnhealthyFact { fact_id: fact.id, payload, @@ -154,97 +167,96 @@ pub(super) fn analyze( Err(e) => { slog::warn!( &builder.log, - "skipping Disk case fact whose payload did not \ - deserialize as DiskFact; this run will not \ - modify it"; + "skipping entire Disk case; one of its facts \ + did not deserialize as DiskFact"; "case_id" => %case_id, "fact_id" => %fact.id, "error" => InlineErrorChain::new(&*e).to_string(), ); + return None; } } } - (case_id, summary) + let Some(physical_disk_id) = case_disk_id else { + slog::warn!( + &builder.log, + "skipping Disk case with no facts; cannot derive disk \ + topic"; + "case_id" => %case_id, + ); + return None; + }; + Some(( + case_id, + ParentCaseSummary { physical_disk_id, unhealthy_facts }, + )) }) .collect(); - // Close any open Disk case whose entire set of (interpretable) facts - // points at zpools that are now Online or expunged. - // - // Absence from inventory is NOT a recovery signal: a sled could be - // powered off, or inventory could be lossy. - for (case_id, summary) in &parent_summaries { - if summary.unhealthy_zpools.is_empty() { - continue; - } - let mut reasons: Vec = Vec::new(); - let mut any_still_unhealthy = false; - for zpool_id in summary.zpool_ids() { - if !in_service_zpools.contains(&zpool_id) { - reasons.push(format!("zpool {zpool_id} no longer in service")); - } else if observed.get(&zpool_id).map(|z| z.health) - == Some(ZpoolHealth::Online) - { - reasons.push(format!("zpool {zpool_id} back to Online")); - } else { - any_still_unhealthy = true; - break; - } - } - if !any_still_unhealthy { - builder - .cases - .case_mut(case_id) - .expect("case_id came from iterating builder.cases") - .close(format!( - "all ZpoolUnhealthy facts resolved: {}", - reasons.join("; "), - )); - } - } - - // For each parent-forwarded ZpoolUnhealthy fact: if the recorded - // last_seen_health no longer matches current observation, remove the - // stale fact. We'll re-add a fresh one below with current data (new UUID). - for (case_id, summary) in &parent_summaries { + // For each parent case, decide what to do based on its disk's current + // state: + // - disk no longer in service → close the case (expungement) + // - disk's zpool back to Online → close the case (recovery) + // - disk still unhealthy → drop any facts whose recorded health no + // longer matches; the matching loop below will re-add a fresh fact + // - disk in service but absent from inventory → leave alone (absence + // is NOT a recovery signal: sled could be powered off, or + // inventory could be lossy) + for (case_id, summary) in &parent_cases { let mut case_mut = builder .cases .case_mut(case_id) .expect("case_id came from iterating builder.cases"); - if !case_mut.is_open() { - continue; - } - for fact_ref in summary.unhealthy_zpools.iter() { - let Some(current) = faulty.get(&fact_ref.payload.zpool_id) else { - continue; - }; - if current.health != fact_ref.payload.last_seen_health { - case_mut.remove_fact(fact_ref.fact_id); + match in_service_health.get(&summary.physical_disk_id) { + None => { + case_mut.close(format!( + "disk {} no longer in service", + summary.physical_disk_id, + )); + } + Some(snap) if snap.health == Some(ZpoolHealth::Online) => { + case_mut + .close(format!("zpool {} back to Online", snap.zpool_id,)); + } + Some(snap) => { + let Some(current_health) = snap.health else { + continue; + }; + for fact_ref in summary.unhealthy_facts.iter() { + if fact_ref.payload.last_seen_health != current_health { + case_mut.remove_fact(fact_ref.fact_id); + } + } } } } - // For each currently-faulty in-service zpool: if there's no open case + - // accurate fact already covering it, ensure a case exists (reuse the - // parent-forwarded one for this zpool if any) and add a fresh fact. - for current in faulty.iter() { - let zpool_id = current.zpool_id; - let current_health = current.health; - let parent_for_zpool = - parent_summaries.iter().find_map(|(case_id, summary)| { - let mut facts = summary.facts_for_zpool(zpool_id).peekable(); - if facts.peek().is_some() { + // For each currently-faulty in-service disk: ensure a case exists + // (reusing the parent-forwarded one for this disk if any) and add a + // fresh fact if one with this exact health isn't already present. + for disk in in_service_health.iter() { + let Some(current_health) = disk.health else { + continue; + }; + if current_health == ZpoolHealth::Online { + continue; + } + + let parent_for_disk = + parent_cases.iter().find_map(|(case_id, summary)| { + if summary.physical_disk_id == disk.physical_disk_id { Some((*case_id, summary)) } else { None } }); - let case_id_for_fact = match parent_for_zpool { + let case_id_for_fact = match parent_for_disk { // Parent case already has an accurate fact — fully covered. Some((_, summary)) if summary - .facts_for_zpool(zpool_id) + .unhealthy_facts + .iter() .any(|f| f.payload.last_seen_health == current_health) => { continue; @@ -252,11 +264,14 @@ pub(super) fn analyze( // Parent case exists; its stale facts were removed above. // Refresh under the same case. Some((case_id, _)) => case_id, - // No parent case for this zpool — open one. + // No parent case for this disk — open one. None => { let mut new_case = builder.cases.open_case(DiagnosisEngineKind::PhysicalDisk); - new_case.set_comment(format!("zpool {zpool_id} unhealthy")); + new_case.set_comment(format!( + "physical disk {} unhealthy", + disk.physical_disk_id, + )); new_case.id } }; @@ -267,12 +282,13 @@ pub(super) fn analyze( .expect("case_id came from this fn") .add_fact( &DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { - zpool_id, + physical_disk_id: disk.physical_disk_id, + zpool_id: disk.zpool_id, last_seen_health: current_health, observed_in_inv: inv_collection_id, time_observed: inv_time_done, }), - format!("zpool {zpool_id} health={current_health}"), + format!("zpool {} health={current_health}", disk.zpool_id,), )?; } @@ -317,6 +333,21 @@ mod tests { .collect() } + /// Find the `physical_disk_id` for the given `zpool_id` in the + /// in-service set, or fabricate a fresh one if not present (e.g., when + /// simulating an expunged disk whose case should still reference some + /// stable PhysicalDiskUuid). + fn disk_id_for( + in_service: &IdOrdMap, + zpool_id: ZpoolUuid, + ) -> PhysicalDiskUuid { + in_service + .iter() + .find(|d| d.zpool_id == zpool_id) + .map(|d| d.physical_disk_id) + .unwrap_or_else(PhysicalDiskUuid::new_v4) + } + /// Make a synthetic test scenario from the example system: returns a /// `LogContext` (the caller must `cleanup_successful()` it), the /// example collection, and every zpool ID in that collection. @@ -380,22 +411,26 @@ mod tests { input } - /// Run `disk::analyze` over an input and return the resulting Sitrep. - fn run_analyze(log: &slog::Logger, input: &Input) -> Sitrep { + /// Run `disk::analyze` over an input and return the resulting Sitrep + /// along with the analysis report (whose log entries the close-comment + /// assertions in `closes_*` tests inspect). + fn run_analyze( + log: &slog::Logger, + input: &Input, + ) -> (Sitrep, fm::analysis_reports::AnalysisReport) { let mut builder = SitrepBuilder::new_with_rng( log, input, SitrepBuilderRng::from_seed("disk-analyze"), ); analyze(input, &mut builder).expect("analyze ok"); - let (sitrep, _report) = - builder.build(OmicronZoneUuid::new_v4(), Utc::now()); - sitrep + builder.build(OmicronZoneUuid::new_v4(), Utc::now()) } fn make_parent_with_disk_case( parent_sitrep_id: SitrepUuid, inv_collection_id: omicron_uuid_kinds::CollectionUuid, + physical_disk_id: PhysicalDiskUuid, zpool_id: ZpoolUuid, ) -> Sitrep { let mut cases = iddqd::IdOrdMap::new(); @@ -407,6 +442,7 @@ mod tests { created_sitrep_id: parent_sitrep_id, payload: serde_json::to_value(&DiskFact::ZpoolUnhealthy( ZpoolUnhealthyFactPayload { + physical_disk_id, zpool_id, last_seen_health: ZpoolHealth::Degraded, observed_in_inv: inv_collection_id, @@ -473,17 +509,20 @@ mod tests { let target = zpools[0]; set_health(&mut collection, target, ZpoolHealth::Degraded); let in_service = mk_in_service(zpools.iter().copied()); + let expected_disk_id = disk_id_for(&in_service, target); let input = build_input(collection, None, in_service); - let sitrep = run_analyze(&logctx.log, &input); + let (sitrep, _report) = run_analyze(&logctx.log, &input); let facts = disk_facts(&sitrep, true); assert_eq!(facts.len(), 1); match &facts[0].2 { DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { + physical_disk_id, zpool_id, last_seen_health, .. }) => { + assert_eq!(*physical_disk_id, expected_disk_id); assert_eq!(*zpool_id, target); assert_eq!(*last_seen_health, ZpoolHealth::Degraded); } @@ -500,7 +539,7 @@ mod tests { let in_service = mk_in_service(zpools.iter().copied().skip(1)); let input = build_input(collection, None, in_service); - let sitrep = run_analyze(&logctx.log, &input); + let (sitrep, _report) = run_analyze(&logctx.log, &input); let cases = disk_facts(&sitrep, false); assert!( cases.is_empty(), @@ -516,12 +555,17 @@ mod tests { let target = zpools[0]; set_health(&mut collection, target, ZpoolHealth::Degraded); let in_service = mk_in_service(zpools.iter().copied()); + let target_disk_id = disk_id_for(&in_service, target); let parent_id = SitrepUuid::new_v4(); - let parent = - make_parent_with_disk_case(parent_id, collection.id, target); + let parent = make_parent_with_disk_case( + parent_id, + collection.id, + target_disk_id, + target, + ); let input = build_input(collection, Some(parent), in_service); - let sitrep = run_analyze(&logctx.log, &input); + let (sitrep, _report) = run_analyze(&logctx.log, &input); let open_cases = disk_facts(&sitrep, true); assert_eq!(open_cases.len(), 1); match &open_cases[0].2 { @@ -541,18 +585,29 @@ mod tests { let target = zpools[0]; // The example system reports zpools as Online by default. let in_service = mk_in_service(zpools.iter().copied()); + let target_disk_id = disk_id_for(&in_service, target); let parent_id = SitrepUuid::new_v4(); - let parent = - make_parent_with_disk_case(parent_id, collection.id, target); + let parent = make_parent_with_disk_case( + parent_id, + collection.id, + target_disk_id, + target, + ); let input = build_input(collection, Some(parent), in_service); - let sitrep = run_analyze(&logctx.log, &input); + let (sitrep, report) = run_analyze(&logctx.log, &input); let all = disk_facts(&sitrep, false); assert_eq!(all.len(), 1); assert!( !all[0].0.is_open(), "case should be closed when zpool returns to Online", ); + let report_str = format!("{}", report.display_multiline(0)); + assert!( + report_str.contains("back to Online"), + "close comment should call out the recovery cause, got: \ + {report_str}", + ); logctx.cleanup_successful(); } @@ -564,18 +619,30 @@ mod tests { set_health(&mut collection, target, ZpoolHealth::Degraded); // Target is NOT in-service in this sitrep (just expunged). let in_service = mk_in_service(zpools.iter().copied().skip(1)); + // Target isn't in the in-service set; fabricate a stable PhysicalDiskUuid. + let target_disk_id = disk_id_for(&in_service, target); let parent_id = SitrepUuid::new_v4(); - let parent = - make_parent_with_disk_case(parent_id, collection.id, target); + let parent = make_parent_with_disk_case( + parent_id, + collection.id, + target_disk_id, + target, + ); let input = build_input(collection, Some(parent), in_service); - let sitrep = run_analyze(&logctx.log, &input); + let (sitrep, report) = run_analyze(&logctx.log, &input); let all = disk_facts(&sitrep, false); assert_eq!(all.len(), 1); assert!( !all[0].0.is_open(), "case should be closed when zpool's disk is expunged", ); + let report_str = format!("{}", report.display_multiline(0)); + assert!( + report_str.contains("no longer in service"), + "close comment should call out the expungement cause, got: \ + {report_str}", + ); logctx.cleanup_successful(); } @@ -588,12 +655,17 @@ mod tests { let phantom = ZpoolUuid::new_v4(); assert!(!zpools.contains(&phantom)); let in_service = mk_in_service(zpools.iter().copied().chain([phantom])); + let phantom_disk_id = disk_id_for(&in_service, phantom); let parent_id = SitrepUuid::new_v4(); - let parent = - make_parent_with_disk_case(parent_id, collection.id, phantom); + let parent = make_parent_with_disk_case( + parent_id, + collection.id, + phantom_disk_id, + phantom, + ); let input = build_input(collection, Some(parent), in_service); - let sitrep = run_analyze(&logctx.log, &input); + let (sitrep, _report) = run_analyze(&logctx.log, &input); let all = disk_facts(&sitrep, false); assert_eq!(all.len(), 1); assert!( @@ -666,7 +738,7 @@ mod tests { }; let input = build_input(collection, Some(parent), in_service); - let sitrep = run_analyze(&logctx.log, &input); + let (sitrep, _report) = run_analyze(&logctx.log, &input); // The unreadable case must still be present, still open, and its // fact unchanged. @@ -706,6 +778,60 @@ mod tests { logctx.cleanup_successful(); } + /// A parent Disk case with zero facts has no derivable topic disk, so + /// the diagnosis engine leaves it alone (carried forward unchanged). + #[test] + fn empty_case_is_left_open() { + let (logctx, collection, _zpools) = setup("disk_empty_case_left_open"); + let in_service = mk_in_service(std::iter::empty()); + + let parent_sitrep_id = SitrepUuid::new_v4(); + let empty_case_id = omicron_uuid_kinds::CaseUuid::new_v4(); + let mut parent_cases = iddqd::IdOrdMap::new(); + parent_cases + .insert_unique(fm::Case { + id: empty_case_id, + metadata: fm::case::Metadata { + created_sitrep_id: parent_sitrep_id, + closed_sitrep_id: None, + de: DiagnosisEngineKind::PhysicalDisk, + comment: "an open case with no facts".to_string(), + }, + ereports: Default::default(), + alerts_requested: Default::default(), + support_bundles_requested: Default::default(), + facts: Default::default(), + }) + .unwrap(); + let parent = Sitrep { + metadata: fm::SitrepMetadata { + id: parent_sitrep_id, + inv_collection_id: collection.id, + creator_id: OmicronZoneUuid::new_v4(), + parent_sitrep_id: None, + time_created: Utc::now(), + next_inv_min_time_started: Utc::now(), + comment: String::new(), + }, + cases: parent_cases, + ereports_by_id: Default::default(), + }; + + let input = build_input(collection, Some(parent), in_service); + let (sitrep, _report) = run_analyze(&logctx.log, &input); + + let case = sitrep + .cases + .iter() + .find(|c| c.id == empty_case_id) + .expect("empty case should still be in the output sitrep"); + assert!( + case.is_open(), + "empty case should be left open (no topic disk to verify)", + ); + logctx.cleanup_successful(); + } + /// When the parent sitrep's fact content matches the diagnosis engine's current /// observation, the fact carries forward with the same UUID — no /// remove-and-readd churn. @@ -715,9 +841,14 @@ mod tests { let target = zpools[0]; set_health(&mut collection, target, ZpoolHealth::Degraded); let in_service = mk_in_service(zpools.iter().copied()); + let target_disk_id = disk_id_for(&in_service, target); let parent_id = SitrepUuid::new_v4(); - let parent = - make_parent_with_disk_case(parent_id, collection.id, target); + let parent = make_parent_with_disk_case( + parent_id, + collection.id, + target_disk_id, + target, + ); // Capture the parent's fact UUID for the target zpool. let parent_fact_id = parent .cases @@ -731,7 +862,7 @@ mod tests { .id; let input = build_input(collection, Some(parent), in_service); - let sitrep = run_analyze(&logctx.log, &input); + let (sitrep, _report) = run_analyze(&logctx.log, &input); let open = disk_facts(&sitrep, true); assert_eq!(open.len(), 1, "expected exactly one open Disk fact"); assert_eq!( @@ -763,9 +894,14 @@ mod tests { // Parent recorded Degraded; current inventory shows Faulted. set_health(&mut collection, target, ZpoolHealth::Faulted); let in_service = mk_in_service(zpools.iter().copied()); + let target_disk_id = disk_id_for(&in_service, target); let parent_id = SitrepUuid::new_v4(); - let parent = - make_parent_with_disk_case(parent_id, collection.id, target); + let parent = make_parent_with_disk_case( + parent_id, + collection.id, + target_disk_id, + target, + ); let parent_fact_id = parent .cases .iter() @@ -778,7 +914,7 @@ mod tests { .id; let input = build_input(collection, Some(parent), in_service); - let sitrep = run_analyze(&logctx.log, &input); + let (sitrep, _report) = run_analyze(&logctx.log, &input); let open = disk_facts(&sitrep, true); assert_eq!( open.len(),