From f9414d29ae227b4658ef55deed129ac9908c409b Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 23 Sep 2025 14:34:06 -0400 Subject: [PATCH] PlanningReport: first pass at operator notes These should be (a) serialized in the db along with the blueprint they came from and (b) be available in the external API, but I wanted to get a first pass out so folks could look over the way I've pulled these out. I'm not at all sure this is the best way to generate these strings, and even if it is, they undoubtedly need some serious wordsmithing for external consumption and consistency. --- dev-tools/omdb/src/bin/omdb/db/blueprints.rs | 8 + dev-tools/omdb/src/bin/omdb/nexus.rs | 8 + dev-tools/reconfigurator-cli/src/lib.rs | 27 +- .../output/cmds-add-sled-no-disks-stdout | 3 + ...mds-add-zones-with-mupdate-override-stdout | 13 +- ...ds-expunge-newly-added-external-dns-stdout | 3 + .../output/cmds-mupdate-update-flow-stdout | 53 +- .../output/cmds-noop-image-source-stdout | 12 +- .../tests/output/cmds-target-release-stdout | 3 + nexus/reconfigurator/planning/src/planner.rs | 8 +- nexus/types/src/deployment.rs | 1 + nexus/types/src/deployment/planning_report.rs | 52 +- .../planning_report/operator_notes.rs | 462 ++++++++++++++++++ 13 files changed, 624 insertions(+), 29 deletions(-) create mode 100644 nexus/types/src/deployment/planning_report/operator_notes.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/blueprints.rs b/dev-tools/omdb/src/bin/omdb/db/blueprints.rs index a4ee372bc02..8e1e0e2a125 100644 --- a/dev-tools/omdb/src/bin/omdb/db/blueprints.rs +++ b/dev-tools/omdb/src/bin/omdb/db/blueprints.rs @@ -232,5 +232,13 @@ async fn cmd_db_blueprint_planner_report_show( println!("planner report for blueprint {blueprint_id}:"); println!("{report}"); + let operator_notes = report.operator_notes().into_notes(); + if !operator_notes.is_empty() { + println!("\nnotes for customer operator:"); + for note in operator_notes { + println!(" * {note}"); + } + } + Ok(()) } diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index ba395aff074..a851b68683b 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -1309,6 +1309,14 @@ fn print_task_blueprint_planner(details: &serde_json::Value) { but could not make it the target: {error}" ); println!("{report}"); + + let operator_notes = report.operator_notes().into_notes(); + if !operator_notes.is_empty() { + println!("\nnotes for customer operator:"); + for note in operator_notes { + println!(" * {note}"); + } + } } BlueprintPlannerStatus::Targeted { blueprint_id, report, .. 
} => { println!( diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs index 5d03257a923..63519a849f3 100644 --- a/dev-tools/reconfigurator-cli/src/lib.rs +++ b/dev-tools/reconfigurator-cli/src/lib.rs @@ -2106,9 +2106,34 @@ fn cmd_blueprint_plan( .context("creating planner")?; let blueprint = planner.plan().context("generating blueprint")?; + let operator_notes = match &blueprint.source { + BlueprintSource::Planner(report) => { + let mut notes = report + .operator_notes() + .into_notes() + .into_iter() + .map(|note| format!("\n * {note}")) + .peekable(); + if notes.peek().is_some() { + format!( + "\n\nnotes for customer operator:{}", + notes.collect::() + ) + } else { + String::new() + } + } + BlueprintSource::Rss + | BlueprintSource::PlannerLoadedFromDatabase + | BlueprintSource::ReconfiguratorCliEdit + | BlueprintSource::Test => unreachable!( + "unexpected blueprint source {} (just ran planner!)", + blueprint.source + ), + }; let rv = format!( "generated blueprint {} based on parent blueprint {}\n\ - blueprint source: {}", + blueprint source: {}{operator_notes}", blueprint.id, parent_blueprint.id, blueprint.source, ); system.add_blueprint(blueprint)?; diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout index 67bc81ec0a2..586d762b57f 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout @@ -53,6 +53,9 @@ planning report: +notes for customer operator: + * 1 sled with no available disks to host NTP service + > blueprint-show 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 parent: dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-add-zones-with-mupdate-override-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-add-zones-with-mupdate-override-stdout index 040173296fc..11387a7bf00 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-add-zones-with-mupdate-override-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-add-zones-with-mupdate-override-stdout @@ -95,7 +95,7 @@ INFO BootPartitionDetails inventory hash not found in TUF repo, ignoring for noo generated blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 based on parent blueprint dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 blueprint source: planner with report: planning report: -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - current target release generation (2) is lower than minimum required by blueprint (3) - sleds have remove mupdate override set in blueprint: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 @@ -109,6 +109,9 @@ planning report: +notes for customer operator: + * service updates are waiting for system to recover from support-driven update (mupdate) + > # Diff the blueprints. This diff should show "will remove mupdate override" > # and the target release minimum generation being set. 
@@ -220,7 +223,7 @@ INFO BootPartitionDetails inventory hash not found in TUF repo, ignoring for noo generated blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 based on parent blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 blueprint source: planner with report: planning report: -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - current target release generation (2) is lower than minimum required by blueprint (3) - sleds have remove mupdate override set in blueprint: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 @@ -234,6 +237,9 @@ planning report: +notes for customer operator: + * service updates are waiting for system to recover from support-driven update (mupdate) + > blueprint-diff latest from: blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 to: blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 @@ -301,6 +307,9 @@ planner config: +notes for customer operator: + * support override configured: internal services can be placed even if some versions are unknown due to sled recovery operations + > blueprint-diff latest from: blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 to: blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout index 9d953c1aeb7..d15c0d482cc 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout @@ -868,6 +868,9 @@ planner config: +notes for customer operator: + * support override configured: internal services can be placed even if some versions are unknown due to sled recovery operations + > blueprint-diff 366b0b68-d80e-4bc1-abd3-dc69837847e0 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 from: blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0 to: blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout index 51ac8e590b6..53fb87f34ed 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout @@ -505,7 +505,7 @@ INFO skipped noop image source check on sled, sled_id: d81c6a84-79b8-4958-ae41-e generated blueprint a5a8f242-ffa5-473c-8efd-2acf2dc0b736 based on parent blueprint d60afc57-f15d-476c-bd0f-b1071e2bb976 blueprint source: planner with report: planning report: -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - current target release generation (2) is lower than minimum required by blueprint (3) - sleds have remove mupdate override set in blueprint: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, d81c6a84-79b8-4958-ae41-ea46c9b19763 @@ -520,6 +520,9 @@ planning report: +notes for customer operator: + * service updates are waiting for system to recover from support-driven update (mupdate) + > # Diff the blueprints. 
This diff should show: > # @@ -787,7 +790,7 @@ INFO skipped noop image source check on sled, sled_id: d81c6a84-79b8-4958-ae41-e generated blueprint 626487fa-7139-45ec-8416-902271fc730b based on parent blueprint a5a8f242-ffa5-473c-8efd-2acf2dc0b736 blueprint source: planner with report: planning report: -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - sleds have remove mupdate override set in blueprint: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, d81c6a84-79b8-4958-ae41-ea46c9b19763 - sleds have mupdate override errors: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c @@ -801,6 +804,9 @@ planning report: +notes for customer operator: + * service updates are waiting for system to recover from support-driven update (mupdate) + > blueprint-diff latest from: blueprint a5a8f242-ffa5-473c-8efd-2acf2dc0b736 to: blueprint 626487fa-7139-45ec-8416-902271fc730b @@ -920,7 +926,7 @@ generated blueprint c1a0d242-9160-40f4-96ae-61f8f40a0b1b based on parent bluepri blueprint source: planner with report: planning report: * noop converting 6/6 install-dataset zones to artifact store on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - current target release generation (3) is lower than minimum required by blueprint (4) - sleds have remove mupdate override set in blueprint: d81c6a84-79b8-4958-ae41-ea46c9b19763 @@ -934,6 +940,10 @@ planning report: +notes for customer operator: + * across 1 sled, converted 6/6 service zones to known versions + * service updates are waiting for system to recover from support-driven update (mupdate) + > # Diff the blueprints. This diff should show: > # * on sled 0: @@ -1110,7 +1120,7 @@ generated blueprint afb09faf-a586-4483-9289-04d4f1d8ba23 based on parent bluepri blueprint source: planner with report: planning report: * skipping noop zone image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - current target release generation (3) is lower than minimum required by blueprint (4) - sleds have mupdate override errors: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c @@ -1123,6 +1133,9 @@ planning report: +notes for customer operator: + * service updates are waiting for system to recover from support-driven update (mupdate) + > blueprint-show latest blueprint afb09faf-a586-4483-9289-04d4f1d8ba23 parent: c1a0d242-9160-40f4-96ae-61f8f40a0b1b @@ -1292,7 +1305,7 @@ parent: c1a0d242-9160-40f4-96ae-61f8f40a0b1b blueprint source: planner with report: planning report: * skipping noop zone image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - current target release generation (3) is lower than minimum required by blueprint (4) - sleds have mupdate override errors: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c @@ -1440,7 +1453,7 @@ blueprint source: planner with report: planning report: * skipping noop zone image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts * noop converting 6/6 install-dataset zones to artifact store on sled d81c6a84-79b8-4958-ae41-ea46c9b19763 -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - sleds have mupdate 
override errors: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c - sleds have deployment units with image sources not set to Artifact: @@ -1451,6 +1464,10 @@ planning report: +notes for customer operator: + * across 1 sled, converted 6/6 service zones to known versions + * service updates are waiting for system to recover from support-driven update (mupdate) + > blueprint-show latest blueprint ce365dff-2cdb-4f35-a186-b15e20e1e700 parent: afb09faf-a586-4483-9289-04d4f1d8ba23 @@ -1621,7 +1638,7 @@ blueprint source: planner with report: planning report: * skipping noop zone image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts * noop converting 6/6 install-dataset zones to artifact store on sled d81c6a84-79b8-4958-ae41-ea46c9b19763 -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - sleds have mupdate override errors: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c - sleds have deployment units with image sources not set to Artifact: @@ -1740,7 +1757,7 @@ blueprint source: planner with report: planning report: * skipping noop zone image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts * skipping noop zone image source check on sled d81c6a84-79b8-4958-ae41-ea46c9b19763: all 6 zones are already from artifacts -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - sleds have deployment units with image sources not set to Artifact: - sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c: 7 zones @@ -1750,6 +1767,9 @@ planning report: +notes for customer operator: + * service updates are waiting for system to recover from support-driven update (mupdate) + > blueprint-diff latest from: blueprint ce365dff-2cdb-4f35-a186-b15e20e1e700 to: blueprint 8f2d1f39-7c88-4701-aa43-56bf281b28c1 @@ -1818,6 +1838,10 @@ planning report: +notes for customer operator: + * across 1 sled, converted 7/7 service zones to known versions + * out of eligible sleds: only placed 0 services out of 1 desired + > blueprint-show latest blueprint 12d602a6-5ab4-487a-b94e-eb30cdf30300 parent: 8f2d1f39-7c88-4701-aa43-56bf281b28c1 @@ -2191,6 +2215,10 @@ planning report: +notes for customer operator: + * across 3 sleds, converted 3 OS images to known versions + * out of eligible sleds: only placed 0 services out of 1 desired + > blueprint-diff latest from: blueprint 12d602a6-5ab4-487a-b94e-eb30cdf30300 to: blueprint 61a93ea3-c872-48e0-aace-e86b0c52b839 @@ -2423,7 +2451,7 @@ planning report: * skipping noop zone image source check on sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c: all 7 zones are already from artifacts * skipping noop zone image source check on sled c3bc4c6d-fdde-4fc4-8493-89d2a1e5ee6b: all 0 zones are already from artifacts * skipping noop zone image source check on sled d81c6a84-79b8-4958-ae41-ea46c9b19763: all 6 zones are already from artifacts -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - current target release generation (4) is lower than minimum required by blueprint (5) - sleds have remove mupdate override set in blueprint: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 @@ -2435,6 +2463,9 @@ planning report: +notes for customer operator: + * service updates are waiting for system to recover from support-driven update (mupdate) + > blueprint-diff latest from: blueprint 61a93ea3-c872-48e0-aace-e86b0c52b839 to: blueprint 27e755bc-dc10-4647-853c-f89bb3a15a2c @@ -2573,6 +2604,10 @@ 
planner config: +notes for customer operator: + * support override configured: internal services can be placed even if some versions are unknown due to sled recovery operations + * out of eligible sleds: only placed 0 services out of 1 desired + > blueprint-diff latest from: blueprint 27e755bc-dc10-4647-853c-f89bb3a15a2c to: blueprint 9f89efdf-a23e-4137-b7cc-79f4a91cbe1f diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout index 203e65f87ab..444df193911 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout @@ -185,7 +185,7 @@ blueprint source: planner with report: planning report: * noop converting 6/6 install-dataset zones to artifact store on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 * noop converting 5/6 install-dataset zones to artifact store on sled aff6c093-197d-42c5-ad80-9f10ba051a34 -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - sleds have remove mupdate override set in blueprint: d81c6a84-79b8-4958-ae41-ea46c9b19763 - sleds have deployment units with image sources not set to Artifact: @@ -200,6 +200,10 @@ planning report: +notes for customer operator: + * across 2 sleds, converted 11/12 service zones to known versions + * service updates are waiting for system to recover from support-driven update (mupdate) + > # This diff should show expected changes to the blueprint. > blueprint-diff latest @@ -433,7 +437,7 @@ blueprint source: planner with report: planning report: * skipping noop zone image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts * noop converting 2/2 install-dataset zones to artifact store on sled e96e226f-4ed9-4c01-91b9-69a9cd076c9e -* zone adds waiting on blockers +* zone adds waiting on mupdate blockers * zone adds and updates are blocked: - sleds have remove mupdate override set in blueprint: d81c6a84-79b8-4958-ae41-ea46c9b19763 - sleds have deployment units with image sources not set to Artifact: @@ -447,6 +451,10 @@ planning report: +notes for customer operator: + * across 1 sled, converted 2/2 service zones to known versions + * service updates are waiting for system to recover from support-driven update (mupdate) + > # This diff should show changes to the sled that's back in inventory. > blueprint-diff latest diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout index 46533ac676f..0475de80881 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout @@ -152,6 +152,9 @@ planning report: +notes for customer operator: + * across 3 sleds, converted 25/25 service zones to known versions + > blueprint-diff latest from: blueprint dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 to: blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index c9195170ba0..6bf6052ff72 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -259,7 +259,7 @@ impl<'a> Planner<'a> { { self.do_plan_add(&mgs_updates)? 
} else { - PlanningAddStepReport::waiting_on(ZoneAddWaitingOn::Blockers) + PlanningAddStepReport::waiting_on(ZoneAddWaitingOn::MupdateBlockers) }; add.add_update_blocked_reasons = add_update_blocked_reasons; add.add_zones_with_mupdate_override = add_zones_with_mupdate_override; @@ -536,7 +536,11 @@ impl<'a> Planner<'a> { // have been added and then expunged since our // parent blueprint was created). We don't want to // fail in this case, but will report it. - report.orphan_disks.insert(sled_id, disk.disk_id); + report + .orphan_disks + .entry(sled_id) + .or_default() + .push(disk.disk_id); } Err(err) => return Err(err), } diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index f435955bf3a..adac2327399 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -131,6 +131,7 @@ pub use planning_report::PlanningNoopImageSourceSkipSledHostPhase2Reason; pub use planning_report::PlanningNoopImageSourceSkipSledZonesReason; pub use planning_report::PlanningNoopImageSourceSkipZoneReason; pub use planning_report::PlanningNoopImageSourceStepReport; +pub use planning_report::PlanningOperatorNotes; pub use planning_report::PlanningReport; pub use planning_report::PlanningZoneUpdatesStepReport; pub use planning_report::ZoneAddWaitingOn; diff --git a/nexus/types/src/deployment/planning_report.rs b/nexus/types/src/deployment/planning_report.rs index 6cee384b3b1..eb524c6b241 100644 --- a/nexus/types/src/deployment/planning_report.rs +++ b/nexus/types/src/deployment/planning_report.rs @@ -37,6 +37,10 @@ use std::fmt::Write; use std::sync::Arc; use thiserror::Error; +mod operator_notes; + +pub use operator_notes::PlanningOperatorNotes; + /// A full blueprint planning report. Other than the blueprint ID, each /// field corresponds to a step in the update planner, i.e., a subroutine /// of `omicron_nexus::reconfigurator::planning::Planner::do_plan`. @@ -89,14 +93,33 @@ impl PlanningReport { } pub fn is_empty(&self) -> bool { - self.expunge.is_empty() - && self.decommission.is_empty() - && self.noop_image_source.is_empty() - && self.mgs_updates.is_empty() - && self.add.is_empty() - && self.zone_updates.is_empty() - && self.nexus_generation_bump.is_empty() - && self.cockroachdb_settings.is_empty() + let Self { + expunge, + decommission, + noop_image_source, + mgs_updates, + add, + zone_updates, + nexus_generation_bump, + cockroachdb_settings, + // the config is not relevant to whether we have content + planner_config: _, + } = self; + + expunge.is_empty() + && decommission.is_empty() + && noop_image_source.is_empty() + && mgs_updates.is_empty() + && add.is_empty() + && zone_updates.is_empty() + && nexus_generation_bump.is_empty() + && cockroachdb_settings.is_empty() + } + + /// Returns a set of notes condensed from this report that are suitable for + /// display to a rack operator (i.e., not internal debugging). + pub fn operator_notes(&self) -> PlanningOperatorNotes { + PlanningOperatorNotes::new(self) } } @@ -139,7 +162,7 @@ impl fmt::Display for PlanningReport { #[cfg_attr(test, derive(test_strategy::Arbitrary))] pub struct PlanningExpungeStepReport { /// Expunged disks not present in the parent blueprint. 
- pub orphan_disks: BTreeMap, + pub orphan_disks: BTreeMap>, } impl PlanningExpungeStepReport { @@ -161,8 +184,11 @@ impl fmt::Display for PlanningExpungeStepReport { "* planning input contained expunged disks \ not present in parent blueprint:", )?; - for (sled, disk) in orphan_disks.iter() { - writeln!(f, " * sled {sled}, disk {disk}",)?; + for (sled, disks) in orphan_disks.iter() { + writeln!(f, " * sled {sled}:")?; + for disk in disks { + writeln!(f, " * disk {disk}")?; + } } } Ok(()) @@ -675,13 +701,13 @@ pub struct DiscretionaryZonePlacement { pub enum ZoneAddWaitingOn { /// Waiting on one or more blockers (typically MUPdate-related reasons) to /// clear. - Blockers, + MupdateBlockers, } impl ZoneAddWaitingOn { pub fn as_str(&self) -> &'static str { match self { - Self::Blockers => "blockers", + Self::MupdateBlockers => "mupdate blockers", } } } diff --git a/nexus/types/src/deployment/planning_report/operator_notes.rs b/nexus/types/src/deployment/planning_report/operator_notes.rs new file mode 100644 index 00000000000..267ce873136 --- /dev/null +++ b/nexus/types/src/deployment/planning_report/operator_notes.rs @@ -0,0 +1,462 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Summarizing a [`PlanningReport`] for customer operators. + +use super::ZoneAddWaitingOn; +use super::ZoneUnsafeToShutdown; +use crate::deployment::PlannerConfig; +use crate::deployment::PlanningAddStepReport; +use crate::deployment::PlanningDecommissionStepReport; +use crate::deployment::PlanningExpungeStepReport; +use crate::deployment::PlanningMgsUpdatesStepReport; +use crate::deployment::PlanningNoopImageSourceSkipSledHostPhase2Reason; +use crate::deployment::PlanningNoopImageSourceSkipSledZonesReason; +use crate::deployment::PlanningNoopImageSourceSkipZoneReason; +use crate::deployment::PlanningNoopImageSourceStepReport; +use crate::deployment::PlanningReport; +use crate::deployment::PlanningZoneUpdatesStepReport; +use crate::deployment::planning_report::PlanningAddOutOfEligibleSleds; +use crate::deployment::planning_report::PlanningNoopImageSourceConverted; +use slog_error_chain::InlineErrorChain; + +#[derive(Debug, Clone)] +pub struct PlanningOperatorNotes { + notes: Vec, +} + +impl PlanningOperatorNotes { + pub(super) fn new(report: &PlanningReport) -> Self { + let PlanningReport { + planner_config, + expunge, + decommission, + noop_image_source, + mgs_updates, + add, + zone_updates, + // nexus generation bump is a normal part of the end of an update; + // there's nothing an operator can act on here. 
+ nexus_generation_bump: _, + // cockroach cluster settings are not operator-relevant + cockroachdb_settings: _, + } = report; + + let mut me = Self { notes: Vec::new() }; + + me.push_planner_config_notes(planner_config); + me.push_expunge_notes(expunge); + me.push_decommission_notes(decommission); + me.push_noop_image_source_notes(noop_image_source); + me.push_mgs_updates_notes(mgs_updates); + me.push_add_notes(add); + me.push_zone_updates_notes(zone_updates); + + me + } + + pub fn into_notes(self) -> Vec { + self.notes + } + + fn push_planner_config_notes(&mut self, config: &PlannerConfig) { + let PlannerConfig { add_zones_with_mupdate_override } = config; + if *add_zones_with_mupdate_override { + self.notes.push( + "support override configured: internal services can be \ + placed even if some versions are unknown due to sled \ + recovery operations" + .to_string(), + ); + } + } + + fn push_expunge_notes(&mut self, expunge: &PlanningExpungeStepReport) { + let PlanningExpungeStepReport { orphan_disks } = expunge; + + let mut nsleds = 0; + let mut ndisks = 0; + for orphans in orphan_disks.values() { + nsleds += 1; + ndisks += orphans.len(); + } + if nsleds > 0 { + self.notes.push(format!( + "unexpectedly found {ndisks} orphaned disk{} \ + across {nsleds} sled{}", + pluralize_s(ndisks), + pluralize_s(nsleds) + )); + } + } + + fn push_decommission_notes( + &mut self, + decommission: &PlanningDecommissionStepReport, + ) { + let PlanningDecommissionStepReport { zombie_sleds } = decommission; + let nsleds = zombie_sleds.len(); + if nsleds > 0 { + self.notes.push(format!( + "unexpectedly found {nsleds} sled{} that \ + have been decommissioned", + pluralize_s(nsleds), + )); + } + } + + fn push_noop_image_source_notes( + &mut self, + noop_image_source: &PlanningNoopImageSourceStepReport, + ) { + let PlanningNoopImageSourceStepReport { + no_target_release, + skipped_sled_zones, + skipped_sled_host_phase_2, + skipped_zones, + converted, + } = noop_image_source; + + // If there's no target release, we can't do any updates nor do any + // no-op conversions, so just add a single note and bail out. + if *no_target_release { + self.notes.push( + "no update possible: no target release has been set" + .to_string(), + ); + return; + } + + // Report notes for sleds we skipped. + { + let mut nsleds_missing_from_inv = 0; + let mut nsleds_error_manifest = 0; + let mut nsleds_mupdate_override = 0; + for reason in skipped_sled_zones.values() { + use PlanningNoopImageSourceSkipSledZonesReason::*; + + match reason { + AllZonesAlreadyArtifact { .. } => { + // This is the normal steady state; nothing to note. + } + SledNotInInventory => { + nsleds_missing_from_inv += 1; + } + ErrorRetrievingZoneManifest { .. } => { + nsleds_error_manifest += 1; + } + RemoveMupdateOverride { .. } => { + nsleds_mupdate_override += 1; + } + } + } + // This loop does nothing, but we write it explicitly so if we add + // more reasons in the future, we have a chance to update these + // notes. + for reason in skipped_sled_host_phase_2.values() { + use PlanningNoopImageSourceSkipSledHostPhase2Reason::*; + match reason { + BothSlotsAlreadyArtifact => { + // This is the normal steady state; nothing to note. + } + SledNotInInventory => { + // This will already be counted above in + // `nsleds_missing_from_inv`. 
+ } + } + } + + if nsleds_missing_from_inv > 0 { + self.notes.push(format!( + "{nsleds_missing_from_inv} sled{} missing from inventory", + pluralize_s(nsleds_missing_from_inv) + )); + } + + if nsleds_error_manifest > 0 { + self.notes.push(format!( + "{nsleds_error_manifest} sled{} have unexpected errors \ + on their boot disks", + pluralize_s(nsleds_error_manifest) + )); + } + + if nsleds_mupdate_override > 0 { + self.notes.push(format!( + "{nsleds_mupdate_override} sled{} are waiting for a \ + system version that matches their most recent \ + support-driven update (mupdate)", + pluralize_s(nsleds_mupdate_override) + )); + } + } + + // Report notes for zones we skipped. If we skipped the entire sled, + // that will be covered above; this is only for zones on sleds we + // considered. + { + let mut ninvalid_artifact = 0; + let mut nartifact_not_in_repo = 0; + for reason in skipped_zones.values() { + use PlanningNoopImageSourceSkipZoneReason::*; + match reason { + ZoneNotInManifest { .. } => { + // TODO-correctness Should we report anything for this + // reason? It has no constructors at the moment; see + // + } + InvalidArtifact { .. } => { + ninvalid_artifact += 1; + } + ArtifactNotInRepo { .. } => { + nartifact_not_in_repo += 1; + } + } + } + + if ninvalid_artifact > 0 { + self.notes.push(format!( + "{ninvalid_artifact} service zone{} have invalid artifacts", + pluralize_s(ninvalid_artifact) + )); + } + + if nartifact_not_in_repo > 0 { + self.notes.push(format!( + "{nartifact_not_in_repo} service zone{} are waiting for a \ + system version that matches their most recent \ + support-driven update (mupdate)", + pluralize_s(nartifact_not_in_repo) + )); + } + } + + // Report counts of converted artifacts. + { + let mut nsleds = 0; + let mut tot_eligible = 0; + let mut tot_dataset = 0; + let mut tot_eligible_os = 0; + for converted in converted.values() { + let PlanningNoopImageSourceConverted { + num_eligible, + num_dataset, + host_phase_2_slot_a_eligible, + host_phase_2_slot_b_eligible, + } = converted; + + nsleds += 1; + tot_eligible += num_eligible; + tot_dataset += num_dataset; + // We don't want to report 2 OSs updated for each sled, nor do + // we want to complain about failing to convert one slot if we + // were able to convert the other. Count _either_ slot as + // eligible as a success. 
+ if *host_phase_2_slot_a_eligible + || *host_phase_2_slot_b_eligible + { + tot_eligible_os += 1; + } + } + + if nsleds > 0 { + self.notes.push(format!( + "across {nsleds} sled{}, converted {}{}{} to known versions", + pluralize_s(nsleds), + if tot_dataset > 0 { + format_args!( + "{tot_eligible}/{tot_dataset} service zone{}", + pluralize_s(tot_eligible), + ) + } else { + format_args!("") + }, + if tot_dataset > 0 && tot_eligible_os > 0 { + " and " + } else { + "" + }, + if tot_eligible_os > 0 { + format_args!( + "{tot_eligible_os} OS image{}", + pluralize_s(tot_eligible_os), + ) + } else { + format_args!("") + }, + )); + } + } + } + + fn push_mgs_updates_notes( + &mut self, + mgs_updates: &PlanningMgsUpdatesStepReport, + ) { + let PlanningMgsUpdatesStepReport { + blocked_mgs_updates, + // These are covered by the update status API + pending_mgs_updates: _, + } = mgs_updates; + + for update in blocked_mgs_updates { + let serial = &update.baseboard_id.serial_number; + let component = update.component; + let reason = InlineErrorChain::new(&update.reason); + + self.notes.push(format!( + "unable to update {serial} {component}: {reason}", + )); + } + } + + fn push_add_notes(&mut self, add: &PlanningAddStepReport) { + let PlanningAddStepReport { + waiting_on, + sleds_without_zpools_for_ntp_zones, + // summarized by `waiting_on`, so we ignore details + add_update_blocked_reasons: _, + // already included in planner config + add_zones_with_mupdate_override: _, + // not a relevant detail to operators: this is true only for systems + // that have never had a reconfigurator-driven update + target_release_generation_is_one: _, + // sleds waiting for NTP is normal while we're updaing NTP zones + sleds_without_ntp_zones_in_inventory: _, + sleds_waiting_for_ntp_zone: _, + sleds_missing_ntp_zone: _, + // not a relevant detail to operators: these are "operating + // normally" sleds + sleds_getting_ntp_and_discretionary_zones: _, + sleds_missing_crucible_zone, + out_of_eligible_sleds, + // not a relevant detail to operators: this is normal operation + sufficient_zones_exist: _, + // maybe not a relevant detail to operators? 
we're restoring + // redundancy or finishing an expunge+add update + discretionary_zones_placed: _, + } = add; + + if let Some(waiting_on) = waiting_on { + match waiting_on { + ZoneAddWaitingOn::MupdateBlockers => self.notes.push( + "service updates are waiting for system to recover \ + from support-driven update (mupdate)" + .to_string(), + ), + } + } + + if !sleds_without_zpools_for_ntp_zones.is_empty() { + let n = sleds_without_zpools_for_ntp_zones.len(); + self.notes.push(format!( + "{n} sled{} with no available disks to host NTP service", + pluralize_s(n), + )); + } + + { + let mut nsleds = 0; + let mut npools = 0; + for pools in sleds_missing_crucible_zone.values() { + nsleds += 1; + npools += pools.len(); + } + if npools > 0 { + self.notes.push(format!( + "{npools} disk{} are unavailable across {nsleds} sled{}", + pluralize_s(npools), + pluralize_s(nsleds), + )); + } + } + + { + let mut tot_placed = 0; + let mut tot_wanted_to_place = 0; + for out_of_eligible_sleds in out_of_eligible_sleds.values() { + let PlanningAddOutOfEligibleSleds { placed, wanted_to_place } = + out_of_eligible_sleds; + tot_placed += *placed; + tot_wanted_to_place += *wanted_to_place; + } + if tot_placed < tot_wanted_to_place { + self.notes.push(format!( + "out of eligible sleds: only placed \ + {tot_placed} service{} out of \ + {tot_wanted_to_place} desired", + pluralize_s(tot_placed), + )); + } + } + } + + fn push_zone_updates_notes( + &mut self, + zone_updates: &PlanningZoneUpdatesStepReport, + ) { + let PlanningZoneUpdatesStepReport { + unsafe_zones, + // these are normal update details covered by update status + waiting_zones: _, + waiting_on: _, + out_of_date_zones: _, + expunged_zones: _, + updated_zones: _, + } = zone_updates; + + // Report problems with particular zone types. + { + // Only report each kind of problem once (e.g., if 3/5 cockroach + // nodes are healthy, don't emit the same note twice for both + // unhealthy nodes). + let mut emitted_cockroach = false; + let mut emitted_boundary_ntp = false; + let mut emitted_internal_dns = false; + for kind in unsafe_zones.values() { + let emit = match kind { + ZoneUnsafeToShutdown::Cockroachdb { .. } => { + if !emitted_cockroach { + emitted_cockroach = true; + true + } else { + false + } + } + ZoneUnsafeToShutdown::BoundaryNtp { .. } => { + if !emitted_boundary_ntp { + emitted_boundary_ntp = true; + true + } else { + false + } + } + ZoneUnsafeToShutdown::InternalDns { .. } => { + if !emitted_internal_dns { + emitted_internal_dns = true; + true + } else { + false + } + } + }; + if emit { + // TODO Will we emit these notes during a normal update + // (e.g., when we reboot a sled hosting boundary NTP)? If + // so, we should probably not emit these unless we can get + // more confidence there's actually something wrong. + self.notes.push(format!( + "waiting for condition to clear: {kind}" + )); + } + } + } + } +} + +// Very dumb helper to pluralize words by appending an "s". Only used within +// this file to pluralize simple nouns like "sleds" and "disks". +fn pluralize_s(n: usize) -> &'static str { + if n == 1 { "" } else { "s" } +}
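
A possible shape for (a)/(b) from the commit message, for discussion only: a small view type that pairs the flattened notes with the blueprint they came from, usable both for db serialization and as an external API payload. This is a strawman sketch, not part of this patch: `BlueprintPlanningNotes`, `from_report`, and the plain `Uuid` are placeholder names invented here (the real thing would presumably use `BlueprintUuid` and also derive `JsonSchema`); the only API it relies on from this patch is `PlanningReport::operator_notes().into_notes()`.

use nexus_types::deployment::PlanningReport;
use serde::{Deserialize, Serialize};
use uuid::Uuid; // sketch assumes the uuid crate's `serde` feature

/// Sketch only (not part of this patch): operator-facing notes paired with
/// the blueprint they were generated for.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BlueprintPlanningNotes {
    /// The blueprint these notes describe.
    pub blueprint_id: Uuid,
    /// Operator-facing notes, flattened to plain strings in the order the
    /// planner emitted them.
    pub notes: Vec<String>,
}

impl BlueprintPlanningNotes {
    /// Build the view from a freshly generated planning report, using the
    /// `operator_notes()` accessor added in this patch.
    pub fn from_report(blueprint_id: Uuid, report: &PlanningReport) -> Self {
        Self { blueprint_id, notes: report.operator_notes().into_notes() }
    }
}

Whether the db should store these rendered strings or re-derive them from the serialized report on read is an open question; deriving on read would keep the wordsmithing in one place.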