From 1c94bc00e0880710968a470487a1429f1f338f60 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Thu, 15 Jan 2026 12:18:14 -0800
Subject: [PATCH 1/6] Support bundle: automatic deletion to maintain free dataset buffer

---
 dev-tools/omdb/src/bin/omdb/nexus.rs          |  27 ++
 nexus-config/src/nexus_config.rs              |  22 +
 nexus/db-model/src/schema_versions.rs         |   3 +-
 .../src/db/datastore/support_bundle.rs        | 444 ++++++++++++++++++
 nexus/src/app/background/init.rs              |   2 +
 .../tasks/support_bundle_collector.rs         | 150 +++++-
 .../integration_tests/support_bundles.rs      | 120 +++++
 nexus/types/src/internal_api/background.rs    |  15 +
 schema/crdb/bundle-state-index/up1.sql        |   4 +
 schema/crdb/dbinit.sql                        |   7 +-
 smf/nexus/multi-sled/config-partial.toml      |   2 +
 smf/nexus/single-sled/config-partial.toml     |   2 +
 12 files changed, 795 insertions(+), 3 deletions(-)
 create mode 100644 schema/crdb/bundle-state-index/up1.sql

diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs
index c8573b122db..153d76c4a51 100644
--- a/dev-tools/omdb/src/bin/omdb/nexus.rs
+++ b/dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -70,6 +70,7 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus;
 use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus;
 use nexus_types::internal_api::background::SitrepGcStatus;
 use nexus_types::internal_api::background::SitrepLoadStatus;
+use nexus_types::internal_api::background::SupportBundleAutoDeletionReport;
 use nexus_types::internal_api::background::SupportBundleCleanupReport;
 use nexus_types::internal_api::background::SupportBundleCollectionReport;
 use nexus_types::internal_api::background::SupportBundleCollectionStepStatus;
@@ -2557,6 +2558,7 @@ fn print_task_service_firewall_rule_propagation(details: &serde_json::Value) {
 fn print_task_support_bundle_collector(details: &serde_json::Value) {
     #[derive(Deserialize)]
     struct SupportBundleCollectionStatus {
+        auto_deletion_report: Option<SupportBundleAutoDeletionReport>,
         cleanup_report: Option<SupportBundleCleanupReport>,
         cleanup_err: Option<String>,
         collection_report: Option<SupportBundleCollectionReport>,
@@ -2571,11 +2573,36 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) {
             error, details
         ),
         Ok(SupportBundleCollectionStatus {
+            auto_deletion_report,
            cleanup_report,
            cleanup_err,
            collection_report,
            collection_err,
        }) => {
+            // Print the auto-deletion report first (since it runs first)
+            if let Some(SupportBundleAutoDeletionReport {
+                bundles_marked_for_deletion,
+                free_datasets,
+                total_datasets,
+                active_bundles,
+                errors,
+            }) = auto_deletion_report
+            {
+                println!("    Support Bundle Auto-Deletion Report:");
+                println!("        Total debug datasets: {total_datasets}");
+                println!("        Active bundles: {active_bundles}");
+                println!("        Free datasets: {free_datasets}");
+                println!(
+                    "        Bundles marked for deletion: {bundles_marked_for_deletion}"
+                );
+                if !errors.is_empty() {
+                    println!("        Errors:");
+                    for error in errors {
+                        println!("            {error}");
+                    }
+                }
+            }
+
             if let Some(cleanup_err) = cleanup_err {
                 println!("    failed to perform cleanup: {cleanup_err}");
             }
diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs
index cb8022ece6b..934cfacdc51 100644
--- a/nexus-config/src/nexus_config.rs
+++ b/nexus-config/src/nexus_config.rs
@@ -30,6 +30,7 @@ use std::collections::HashMap;
 use std::fmt;
 use std::net::IpAddr;
 use std::net::SocketAddr;
+use std::num::NonZeroU32;
 use std::time::Duration;
 use uuid::Uuid;
 
@@ -489,6 +490,25 @@ pub struct SupportBundleCollectorConfig {
     /// Default: Off
     #[serde(default)]
     pub disable: bool,
+
+    /// Target number of free debug datasets to 
maintain for new allocations.
+    ///
+    /// When the number of free debug datasets drops below this value, the
+    /// oldest support bundles will be automatically deleted to free up space.
+    ///
+    /// Default: None (auto-deletion disabled)
+    #[serde(default)]
+    pub target_free_datasets: Option<NonZeroU32>,
+
+    /// Minimum number of bundles to retain, even if `target_free_datasets`
+    /// would otherwise trigger deletion.
+    ///
+    /// This prevents aggressive cleanup on small systems where the number of
+    /// debug datasets is less than or equal to `target_free_datasets`.
+    ///
+    /// Default: None (no minimum, can delete all bundles)
+    #[serde(default)]
+    pub min_bundles_to_keep: Option<NonZeroU32>,
 }
 
 #[serde_as]
@@ -1407,6 +1427,8 @@ mod test {
                 SupportBundleCollectorConfig {
                     period_secs: Duration::from_secs(30),
                     disable: false,
+                    target_free_datasets: None,
+                    min_bundles_to_keep: None,
                 },
                 physical_disk_adoption: PhysicalDiskAdoptionConfig {
                     period_secs: Duration::from_secs(30),
diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs
index c95ceb556f6..2ed3d3065ad 100644
--- a/nexus/db-model/src/schema_versions.rs
+++ b/nexus/db-model/src/schema_versions.rs
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: Version = Version::new(220, 0, 0);
+pub const SCHEMA_VERSION: Version = Version::new(221, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
     //  | leaving the first copy as an example for the next person.
     //  v
     // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+    KnownVersion::new(221, "bundle-state-index"),
     KnownVersion::new(220, "multicast-implicit-lifecycle"),
     KnownVersion::new(219, "blueprint-sled-last-used-ip"),
     KnownVersion::new(218, "measurements"),
diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs
index afb0557a44b..6ddf29002f8 100644
--- a/nexus/db-queries/src/db/datastore/support_bundle.rs
+++ b/nexus/db-queries/src/db/datastore/support_bundle.rs
@@ -31,6 +31,7 @@ use omicron_common::api::external::LookupResult;
 use omicron_uuid_kinds::GenericUuid;
 use omicron_uuid_kinds::OmicronZoneUuid;
 use omicron_uuid_kinds::SupportBundleUuid;
+use std::num::NonZeroU32;
 use uuid::Uuid;
 
 const CANNOT_ALLOCATE_ERR_MSG: &'static str =
     "Current policy limits support bundle creation to 'one per external disk', and \
@@ -62,6 +63,19 @@ pub struct SupportBundleExpungementReport {
     pub bundles_reassigned: usize,
 }
 
+/// Result of querying for bundles eligible for automatic deletion.
+#[derive(Debug, Clone)]
+pub struct AutoDeletionResult {
+    /// Bundles that should be marked for deletion (oldest first).
+    pub bundles_to_delete: Vec<SupportBundle>,
+    /// Total number of debug datasets available.
+    pub total_datasets: usize,
+    /// Number of active bundles (before any deletions).
+    pub active_bundles: usize,
+    /// Number of free debug datasets (before any deletions).
+    pub free_datasets: usize,
+}
+
 impl DataStore {
     /// Creates a new support bundle.
     ///
@@ -524,6 +538,115 @@ impl DataStore {
 
         Ok(())
     }
+
+    /// Finds support bundles eligible for automatic deletion to maintain
+    /// a buffer of free debug datasets. 
+    ///
+    /// This method checks if there are fewer free debug datasets than
+    /// `target_free_datasets`, and if so, returns the oldest active bundles
+    /// that should be deleted to restore the buffer.
+    ///
+    /// The `min_bundles_to_keep` parameter prevents aggressive cleanup on
+    /// small systems by ensuring we never delete bundles if doing so would
+    /// leave fewer than this many bundles.
+    ///
+    /// Returns an `AutoDeletionResult` containing:
+    /// - The bundles that should be marked for deletion (oldest first)
+    /// - Statistics about the current state of datasets and bundles
+    pub async fn support_bundle_find_auto_deletion_candidates(
+        &self,
+        opctx: &OpContext,
+        target_free_datasets: NonZeroU32,
+        min_bundles_to_keep: Option<NonZeroU32>,
+    ) -> Result<AutoDeletionResult, Error> {
+        opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
+
+        use nexus_db_schema::schema::rendezvous_debug_dataset::dsl as dataset_dsl;
+        use nexus_db_schema::schema::support_bundle::dsl as bundle_dsl;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        // Count total and free datasets using a LEFT JOIN to avoid full table scans.
+        // A dataset is "free" if it has no bundle using it (any state).
+        // This uses the one_bundle_per_dataset unique index on support_bundle.dataset_id.
+        let dataset_counts: Vec<(Uuid, Option<Uuid>)> =
+            dataset_dsl::rendezvous_debug_dataset
+                .filter(dataset_dsl::time_tombstoned.is_null())
+                .left_join(
+                    bundle_dsl::support_bundle
+                        .on(dataset_dsl::id.eq(bundle_dsl::dataset_id)),
+                )
+                .select((dataset_dsl::id, bundle_dsl::dataset_id.nullable()))
+                .load_async(&*conn)
+                .await
+                .map_err(|e| {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                })?;
+
+        let total_datasets = dataset_counts.len();
+        let free_datasets = dataset_counts
+            .iter()
+            .filter(|(_, bundle)| bundle.is_none())
+            .count();
+
+        // Load Active bundles for deletion candidates
+        // Only Active bundles can be auto-deleted (not Collecting, Destroying, etc.) 
+        // Ordered by creation time to get the oldest first for potential deletion
+        let active_bundle_list: Vec<SupportBundle> = bundle_dsl::support_bundle
+            .filter(bundle_dsl::state.eq(SupportBundleState::Active))
+            .order(bundle_dsl::time_created.asc())
+            .select(SupportBundle::as_select())
+            .load_async(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
+        let active_bundles = active_bundle_list.len();
+
+        let target = target_free_datasets.get() as usize;
+
+        // If we have enough free datasets, no deletion needed
+        if free_datasets >= target {
+            return Ok(AutoDeletionResult {
+                bundles_to_delete: Vec::new(),
+                total_datasets,
+                active_bundles,
+                free_datasets,
+            });
+        }
+
+        // Calculate how many bundles we need to delete
+        let mut bundles_to_delete_count = target - free_datasets;
+
+        // Apply min_bundles_to_keep constraint
+        if let Some(min_keep) = min_bundles_to_keep {
+            let min_keep = min_keep.get() as usize;
+            let max_deletable = active_bundles.saturating_sub(min_keep);
+            bundles_to_delete_count =
+                bundles_to_delete_count.min(max_deletable);
+        }
+
+        // If we can't delete any bundles, return empty
+        if bundles_to_delete_count == 0 {
+            return Ok(AutoDeletionResult {
+                bundles_to_delete: Vec::new(),
+                total_datasets,
+                active_bundles,
+                free_datasets,
+            });
+        }
+
+        // Take oldest bundles from the already-sorted list
+        let bundles_to_delete: Vec<SupportBundle> = active_bundle_list
+            .into_iter()
+            .take(bundles_to_delete_count)
+            .collect();
+
+        Ok(AutoDeletionResult {
+            bundles_to_delete,
+            total_datasets,
+            active_bundles,
+            free_datasets,
+        })
+    }
 }
 
 #[cfg(test)]
@@ -1548,4 +1671,325 @@ mod test {
         db.terminate().await;
         logctx.cleanup_successful();
     }
+
+    // Tests for support_bundle_find_auto_deletion_candidates
+
+    #[tokio::test]
+    async fn test_auto_deletion_no_bundles() {
+        let logctx = dev::test_setup_log("test_auto_deletion_no_bundles");
+        let db = TestDatabase::new_with_datastore(&logctx.log).await;
+        let (opctx, datastore) = (db.opctx(), db.datastore());
+
+        // Create 5 debug datasets (pools)
+        let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await;
+
+        // With target_free=3 and 5 datasets, no bundles, free=5 >= 3
+        // Should return no bundles to delete
+        let result = datastore
+            .support_bundle_find_auto_deletion_candidates(
+                &opctx,
+                NonZeroU32::new(3).unwrap(),
+                None,
+            )
+            .await
+            .expect("Should succeed");
+
+        assert_eq!(result.total_datasets, 5);
+        assert_eq!(result.active_bundles, 0);
+        assert_eq!(result.free_datasets, 5);
+        assert!(result.bundles_to_delete.is_empty());
+
+        db.terminate().await;
+        logctx.cleanup_successful();
+    }
+
+    #[tokio::test]
+    async fn test_auto_deletion_enough_free_datasets() {
+        let logctx =
+            dev::test_setup_log("test_auto_deletion_enough_free_datasets");
+        let db = TestDatabase::new_with_datastore(&logctx.log).await;
+        let (opctx, datastore) = (db.opctx(), db.datastore());
+        let nexus_id = OmicronZoneUuid::new_v4();
+
+        // Create 10 debug datasets
+        let _test_sled = create_sled_and_zpools(&datastore, &opctx, 10).await;
+
+        // Create 5 bundles, leaving 5 free
+        for _ in 0..5 {
+            let bundle = datastore
+                .support_bundle_create(&opctx, "for tests", nexus_id, None)
+                .await
+                .expect("Should be able to create bundle");
+
+            // Mark as active
+            let authz_bundle = authz_support_bundle_from_id(bundle.id.into());
+            datastore
+                .support_bundle_update(
+                    &opctx,
+                    &authz_bundle,
+                    SupportBundleState::Active,
+                )
+                .await
+                .expect("Should update state");
+        }
+
+        // With target_free=3 and free=5 >= 3, should return no bundles
+        let result = 
datastore + .support_bundle_find_auto_deletion_candidates( + &opctx, + NonZeroU32::new(3).unwrap(), + None, + ) + .await + .expect("Should succeed"); + + assert_eq!(result.total_datasets, 10); + assert_eq!(result.active_bundles, 5); + assert_eq!(result.free_datasets, 5); + assert!(result.bundles_to_delete.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_auto_deletion_deletes_oldest_first() { + let logctx = + dev::test_setup_log("test_auto_deletion_deletes_oldest_first"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 5 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + // Create 5 bundles (all slots filled) + let mut bundle_ids = Vec::new(); + for _ in 0..5 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + // Mark as active + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + + bundle_ids.push(bundle.id); + + // Small delay to ensure different creation times + tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; + } + + // With target_free=2 and free=0, we need to delete 2 bundles + let result = datastore + .support_bundle_find_auto_deletion_candidates( + &opctx, + NonZeroU32::new(2).unwrap(), + None, + ) + .await + .expect("Should succeed"); + + assert_eq!(result.total_datasets, 5); + assert_eq!(result.active_bundles, 5); + assert_eq!(result.free_datasets, 0); + assert_eq!(result.bundles_to_delete.len(), 2); + + // Verify the oldest bundles are selected (first two created) + let delete_ids: Vec<_> = + result.bundles_to_delete.iter().map(|b| b.id).collect(); + assert!(delete_ids.contains(&bundle_ids[0])); + assert!(delete_ids.contains(&bundle_ids[1])); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_auto_deletion_respects_min_bundles_to_keep() { + let logctx = dev::test_setup_log( + "test_auto_deletion_respects_min_bundles_to_keep", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 5 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + // Create 5 bundles (all slots filled) + for _ in 0..5 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + // Mark as active + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + } + + // With target_free=3 and free=0, we'd want to delete 3 bundles + // But min_bundles_to_keep=4 means we can only delete 1 (5-4=1) + let result = datastore + .support_bundle_find_auto_deletion_candidates( + &opctx, + NonZeroU32::new(3).unwrap(), + Some(NonZeroU32::new(4).unwrap()), + ) + .await + .expect("Should succeed"); + + assert_eq!(result.total_datasets, 5); + assert_eq!(result.active_bundles, 5); + assert_eq!(result.free_datasets, 0); + // Can only delete 1 bundle due to min_bundles_to_keep constraint + 
assert_eq!(result.bundles_to_delete.len(), 1); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_auto_deletion_min_bundles_prevents_all_deletion() { + let logctx = dev::test_setup_log( + "test_auto_deletion_min_bundles_prevents_all_deletion", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 5 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + // Create 3 bundles + for _ in 0..3 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + // Mark as active + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + } + + // With target_free=5 (want all datasets free) and free=2 + // We'd want to delete 3 bundles, but min_bundles_to_keep=5 is > 3 active + // So we can't delete any + let result = datastore + .support_bundle_find_auto_deletion_candidates( + &opctx, + NonZeroU32::new(5).unwrap(), + Some(NonZeroU32::new(5).unwrap()), + ) + .await + .expect("Should succeed"); + + assert_eq!(result.total_datasets, 5); + assert_eq!(result.active_bundles, 3); + assert_eq!(result.free_datasets, 2); + // min_bundles_to_keep (5) > active_bundles (3), so no deletion + assert!(result.bundles_to_delete.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_auto_deletion_only_selects_active_bundles() { + let logctx = dev::test_setup_log( + "test_auto_deletion_only_selects_active_bundles", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 5 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; + + // Create 3 bundles: 1 active, 1 collecting, 1 destroying + let bundle1 = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + let authz_bundle1 = authz_support_bundle_from_id(bundle1.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle1, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + + // Second bundle stays in Collecting + let _bundle2 = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + // Third bundle is Destroying + let bundle3 = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + let authz_bundle3 = authz_support_bundle_from_id(bundle3.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle3, + SupportBundleState::Destroying, + ) + .await + .expect("Should update state"); + + // With target_free=5 and 3 bundles (1 Active, 1 Collecting, 1 Destroying), + // free_datasets = 5 - 3 = 2 (ALL bundles occupy datasets, not just Active) + // We should only delete Active bundles though + let result = datastore + .support_bundle_find_auto_deletion_candidates( + &opctx, + NonZeroU32::new(5).unwrap(), + None, + ) + .await + .expect("Should succeed"); + + assert_eq!(result.total_datasets, 5); + // Only 1 bundle is Active (candidates for deletion) + 
assert_eq!(result.active_bundles, 1);
+        // Free datasets: 5 total - 3 bundles = 2
+        // (All bundles occupy datasets regardless of state)
+        assert_eq!(result.free_datasets, 2);
+        // We want 5 free but only have 2, so we want to delete 3
+        // But we only have 1 Active bundle to delete
+        assert_eq!(result.bundles_to_delete.len(), 1);
+        assert_eq!(result.bundles_to_delete[0].id, bundle1.id);
+
+        db.terminate().await;
+        logctx.cleanup_successful();
+    }
 }
diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs
index 0f2ad163549..e7a406b6ef4 100644
--- a/nexus/src/app/background/init.rs
+++ b/nexus/src/app/background/init.rs
@@ -632,6 +632,8 @@ impl BackgroundTasksInitializer {
                     resolver.clone(),
                     config.support_bundle_collector.disable,
                     nexus_id,
+                    config.support_bundle_collector.target_free_datasets,
+                    config.support_bundle_collector.min_bundles_to_keep,
                 ),
             ),
             opctx: opctx.child(BTreeMap::new()),
diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs
index 887be497a17..56bbc131040 100644
--- a/nexus/src/app/background/tasks/support_bundle_collector.rs
+++ b/nexus/src/app/background/tasks/support_bundle_collector.rs
@@ -13,6 +13,7 @@ use nexus_db_model::SupportBundleState;
 use nexus_db_queries::authz;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::DataStore;
+use nexus_types::internal_api::background::SupportBundleAutoDeletionReport;
 use nexus_types::internal_api::background::SupportBundleCleanupReport;
 use nexus_types::internal_api::background::SupportBundleCollectionReport;
 use omicron_common::api::external::DataPageParams;
@@ -27,6 +28,7 @@ use omicron_uuid_kinds::ZpoolUuid;
 use serde_json::json;
 use sled_agent_types::support_bundle::NESTED_DATASET_NOT_FOUND;
 use slog_error_chain::InlineErrorChain;
+use std::num::NonZeroU32;
 use std::sync::Arc;
 
 use super::support_bundle::collection::BundleCollection;
@@ -56,6 +58,10 @@ pub struct SupportBundleCollector {
     resolver: Resolver,
     disable: bool,
     nexus_id: OmicronZoneUuid,
+    /// Target number of free debug datasets to maintain for new allocations.
+    target_free_datasets: Option<NonZeroU32>,
+    /// Minimum number of bundles to retain to prevent aggressive cleanup.
+    min_bundles_to_keep: Option<NonZeroU32>,
 }
 
 impl SupportBundleCollector {
@@ -64,8 +70,17 @@ impl SupportBundleCollector {
         resolver: Resolver,
         disable: bool,
         nexus_id: OmicronZoneUuid,
+        target_free_datasets: Option<NonZeroU32>,
+        min_bundles_to_keep: Option<NonZeroU32>,
     ) -> Self {
-        SupportBundleCollector { datastore, resolver, disable, nexus_id }
+        SupportBundleCollector {
+            datastore,
+            resolver,
+            disable,
+            nexus_id,
+            target_free_datasets,
+            min_bundles_to_keep,
+        }
     }
 
     // Tells a sled agent to delete a support bundle
@@ -308,6 +323,108 @@ impl SupportBundleCollector {
         Ok(report)
     }
 
+    /// Checks for bundles eligible for automatic deletion and marks them
+    /// for destruction.
+    ///
+    /// This maintains a buffer of free debug datasets for new bundle
+    /// allocations by deleting the oldest bundles when the free dataset
+    /// count drops below `target_free_datasets`.
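+    ///
+    /// A sketch of the sizing arithmetic (hypothetical numbers, not from a
+    /// real deployment):
+    ///
+    /// ```ignore
+    /// // 10 datasets, 9 occupied => 1 free; target_free_datasets = 3.
+    /// let needed = 3usize.saturating_sub(1); // 2 deletions requested
+    /// // 9 Active bundles with min_bundles_to_keep = 8 => 1 deletable.
+    /// let deletable = 9usize.saturating_sub(8);
+    /// assert_eq!(needed.min(deletable), 1); // one oldest bundle is deleted
+    /// ```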
+ async fn auto_delete_bundles( + &self, + opctx: &OpContext, + ) -> SupportBundleAutoDeletionReport { + let mut report = SupportBundleAutoDeletionReport::default(); + + // Skip if auto-deletion is disabled + let Some(target_free) = self.target_free_datasets else { + return report; + }; + + // Query for candidates + let result = self + .datastore + .support_bundle_find_auto_deletion_candidates( + opctx, + target_free, + self.min_bundles_to_keep, + ) + .await; + + let candidates = match result { + Ok(r) => r, + Err(err) => { + warn!( + &opctx.log, + "SupportBundleCollector: Failed to find auto-deletion candidates"; + "err" => %err + ); + report + .errors + .push(format!("Failed to query candidates: {}", err)); + return report; + } + }; + + // Update report with current state + report.total_datasets = candidates.total_datasets; + report.active_bundles = candidates.active_bundles; + report.free_datasets = candidates.free_datasets; + + // Mark each candidate bundle for deletion + for bundle in &candidates.bundles_to_delete { + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + + info!( + &opctx.log, + "SupportBundleCollector: Auto-deleting bundle to free dataset capacity"; + "id" => %bundle.id, + "time_created" => %bundle.time_created, + "free_datasets" => candidates.free_datasets, + "target_free" => target_free.get(), + ); + + // Transition bundle to Destroying state + match self + .datastore + .support_bundle_update( + opctx, + &authz_bundle, + SupportBundleState::Destroying, + ) + .await + { + Ok(()) => { + report.bundles_marked_for_deletion += 1; + } + Err(err) => { + // Handle gracefully - bundle may have been modified + // concurrently by another Nexus or user + if matches!(err, Error::InvalidRequest { .. }) { + info!( + &opctx.log, + "SupportBundleCollector: Concurrent state change during auto-deletion"; + "bundle" => %bundle.id, + "err" => ?err, + ); + } else { + warn!( + &opctx.log, + "SupportBundleCollector: Failed to mark bundle for auto-deletion"; + "bundle" => %bundle.id, + "err" => %err, + ); + report.errors.push(format!( + "Failed to mark bundle {} for deletion: {}", + bundle.id, err + )); + } + } + } + } + + report + } + async fn collect_bundle( &self, opctx: &OpContext, @@ -403,11 +520,16 @@ impl BackgroundTask for SupportBundleCollector { return json!({ "error": "task disabled" }); } + let auto_deletion_report; let mut cleanup_report = None; let mut cleanup_err = None; let mut collection_report = None; let mut collection_err = None; + // Phase 1: Auto-delete eligible bundles to maintain free dataset buffer + auto_deletion_report = self.auto_delete_bundles(&opctx).await; + + // Phase 2: Cleanup destroyed/failing bundles match self.cleanup_destroyed_bundles(&opctx).await { Ok(report) => cleanup_report = Some(report), Err(err) => { @@ -416,6 +538,7 @@ impl BackgroundTask for SupportBundleCollector { } }; + // Phase 3: Collect pending bundles let request = BundleRequest::default(); match self.collect_bundle(&opctx, &request).await { Ok(report) => collection_report = Some(report), @@ -426,6 +549,7 @@ impl BackgroundTask for SupportBundleCollector { }; json!({ + "auto_deletion_report": Some(auto_deletion_report), "cleanup_report": cleanup_report, "cleanup_err": cleanup_err, "collection_report": collection_report, @@ -490,6 +614,8 @@ mod test { resolver.clone(), false, nexus.id(), + None, // target_free_datasets + None, // min_bundles_to_keep ); let report = collector @@ -516,6 +642,8 @@ mod test { resolver.clone(), false, nexus.id(), + None, // 
target_free_datasets
+            None, // min_bundles_to_keep
         );
 
         let request = BundleRequest::default();
@@ -823,6 +951,8 @@ mod test {
             resolver.clone(),
             false,
             nexus.id(),
+            None, // target_free_datasets
+            None, // min_bundles_to_keep
         );
 
         // The bundle collection should complete successfully.
@@ -902,6 +1032,8 @@ mod test {
             resolver.clone(),
             false,
             nexus.id(),
+            None, // target_free_datasets
+            None, // min_bundles_to_keep
         );
 
         // Collect the bundle
@@ -1013,6 +1145,8 @@ mod test {
             resolver.clone(),
             false,
             nexus.id(),
+            None, // target_free_datasets
+            None, // min_bundles_to_keep
         );
 
         // The bundle collection should complete successfully.
@@ -1121,6 +1255,8 @@ mod test {
             resolver.clone(),
             false,
             nexus.id(),
+            None, // target_free_datasets
+            None, // min_bundles_to_keep
         );
 
         // Each time we call "collect_bundle", we collect a SINGLE bundle.
@@ -1235,6 +1371,8 @@ mod test {
             resolver.clone(),
             false,
             nexus.id(),
+            None, // target_free_datasets
+            None, // min_bundles_to_keep
         );
 
         let report = collector
@@ -1288,6 +1426,8 @@ mod test {
             resolver.clone(),
             false,
             nexus.id(),
+            None, // target_free_datasets
+            None, // min_bundles_to_keep
         );
         let mut request = BundleRequest::default();
         request.data_selection.insert(BundleData::HostInfo(HashSet::new()));
@@ -1387,6 +1527,8 @@ mod test {
             resolver.clone(),
             false,
             nexus.id(),
+            None, // target_free_datasets
+            None, // min_bundles_to_keep
         );
 
         let report = collector
@@ -1443,6 +1585,8 @@ mod test {
             resolver.clone(),
             false,
             nexus.id(),
+            None, // target_free_datasets
+            None, // min_bundles_to_keep
         );
         let mut request = BundleRequest::default();
         request.data_selection.insert(BundleData::HostInfo(HashSet::new()));
@@ -1528,6 +1672,8 @@ mod test {
             resolver.clone(),
             false,
             nexus.id(),
+            None, // target_free_datasets
+            None, // min_bundles_to_keep
         );
         let mut request = BundleRequest::default();
         request.data_selection.insert(BundleData::HostInfo(HashSet::new()));
@@ -1612,6 +1758,8 @@ mod test {
             resolver.clone(),
             false,
             nexus.id(),
+            None, // target_free_datasets
+            None, // min_bundles_to_keep
         );
 
         // Collect the bundle
diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs
index 11ce5119ede..78ec7b74f9c 100644
--- a/nexus/tests/integration_tests/support_bundles.rs
+++ b/nexus/tests/integration_tests/support_bundles.rs
@@ -20,6 +20,7 @@ use nexus_test_utils::http_testing::RequestBuilder;
 use nexus_test_utils_macros::nexus_test;
 use nexus_types::external_api::shared::SupportBundleInfo;
 use nexus_types::external_api::shared::SupportBundleState;
+use nexus_types::internal_api::background::SupportBundleAutoDeletionReport;
 use nexus_types::internal_api::background::SupportBundleCleanupReport;
 use nexus_types::internal_api::background::SupportBundleCollectionReport;
 use nexus_types::internal_api::background::SupportBundleCollectionStep;
@@ -28,6 +29,7 @@ use omicron_common::api::external::LookupType;
 use omicron_uuid_kinds::SupportBundleUuid;
 use serde::Deserialize;
 use std::io::Cursor;
+use std::num::NonZeroU32;
 use zip::read::ZipArchive;
 
 type ControlPlaneTestContext =
@@ -331,6 +333,8 @@ async fn bundle_update_comment(
 
 #[derive(Deserialize)]
 struct TaskOutput {
+    auto_deletion_err: Option<String>,
+    auto_deletion_report: Option<SupportBundleAutoDeletionReport>,
     cleanup_err: Option<String>,
     collection_err: Option<String>,
     cleanup_report: Option<SupportBundleCleanupReport>,
     collection_report: Option<SupportBundleCollectionReport>,
@@ -930,3 +934,119 @@ async fn test_support_bundle_delete_failed_bundle(
         "Deleted bundle should not appear in bundle list"
     );
 }
+
+// Test automatic deletion of support bundles to maintain free dataset capacity. 
+//
+// This test verifies that:
+// 1. Auto-deletion kicks in when free_datasets < target_free_datasets
+// 2. The oldest bundles are marked for deletion first
+// 3. min_bundles_to_keep is respected
+//
+// Configuration: target_free_datasets=1, min_bundles_to_keep=1, 5 datasets
+// Create 5 bundles (filling all datasets), and verify auto-deletion happens
+// when we run out of free datasets.
+#[tokio::test]
+async fn test_support_bundle_auto_deletion() {
+    // Create a test context with auto-deletion enabled:
+    // - target_free_datasets=1: try to maintain 1 free dataset
+    // - min_bundles_to_keep=1: always keep at least 1 bundle
+    let cptestctx = nexus_test_utils::ControlPlaneBuilder::new(
+        "test_support_bundle_auto_deletion",
+    )
+    .customize_nexus_config(&|config| {
+        config
+            .pkg
+            .background_tasks
+            .support_bundle_collector
+            .target_free_datasets = Some(NonZeroU32::new(1).unwrap());
+        config
+            .pkg
+            .background_tasks
+            .support_bundle_collector
+            .min_bundles_to_keep = Some(NonZeroU32::new(1).unwrap());
+    })
+    .start::<omicron_nexus::Server>()
+    .await;
+
+    let client = &cptestctx.external_client;
+
+    // Create 5 zpools, giving us 5 debug datasets
+    let _disk_test =
+        DiskTestBuilder::new(&cptestctx).with_zpool_count(5).build().await;
+
+    // Create and activate bundles one by one.
+    // With 5 datasets and target_free=1:
+    // - After bundles 1-4: free >= 1, so no auto-delete needed
+    // - When creating bundle 5:
+    //   - Before collection: 4 Active + 1 Collecting = 5 bundles, free = 0
+    //   - Auto-delete triggers: want 1 free, have 0, delete 1 oldest
+    //   - min_keep=1, active=4, max_deletable=3, so we CAN delete
+    //   - Result: oldest bundle deleted, then bundle 5 gets collected
+    let mut bundle_ids = Vec::new();
+    for i in 0..5 {
+        let bundle = bundle_create(&client).await.unwrap();
+        bundle_ids.push(bundle.id);
+        let output =
+            activate_bundle_collection_background_task(&cptestctx).await;
+        assert_eq!(
+            output.collection_err, None,
+            "Bundle {} collection failed",
+            i
+        );
+
+        // Check auto-deletion report
+        assert_eq!(output.auto_deletion_err, None);
+        let report = output
+            .auto_deletion_report
+            .expect("Should have auto_deletion_report");
+        assert_eq!(report.total_datasets, 5);
+
+        // Small delay to ensure different creation times
+        tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
+    }
+
+    // After creating 5 bundles:
+    // - Bundle 5 creation triggered auto-delete of bundle 1 (oldest)
+    // - Then cleanup removed bundle 1, and collection activated bundle 5
+    // So we should have 4 Active bundles remaining
+    let bundles = bundles_list(&client).await.unwrap();
+    assert_eq!(bundles.len(), 4, "Should have 4 bundles after auto-deletion");
+
+    // All remaining bundles should be Active
+    for bundle in &bundles {
+        assert_eq!(bundle.state, SupportBundleState::Active);
+    }
+
+    // The oldest bundle (first created) should have been deleted
+    assert!(
+        !bundles.iter().any(|b| b.id == bundle_ids[0]),
+        "Oldest bundle (bundle 1) should have been auto-deleted"
+    );
+
+    // Bundles 2-5 should remain
+    for i in 1..5 {
+        assert!(
+            bundles.iter().any(|b| b.id == bundle_ids[i]),
+            "Bundle {} should remain",
+            i + 1
+        );
+    }
+
+    // Now verify the auto-deletion report from the last run
+    // Re-run bg task to get a fresh report (no deletion should happen now)
+    let output = activate_bundle_collection_background_task(&cptestctx).await;
+    assert_eq!(output.auto_deletion_err, None);
+    let report = output.auto_deletion_report.expect("Should have report");
+
+    // Now we have: 4 bundles, 5 datasets, 1 free
+    // free (1) >= target (1), so no deletion needed
+    assert_eq!(report.total_datasets, 5);
+    assert_eq!(report.active_bundles, 4);
+    assert_eq!(report.free_datasets, 1);
+    assert_eq!(
+        report.bundles_marked_for_deletion, 0,
+        "No deletion needed when free >= target"
+    );
+
+    cptestctx.teardown().await;
+}
diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs
index 878c6ee5684..798cef7de26 100644
--- a/nexus/types/src/internal_api/background.rs
+++ b/nexus/types/src/internal_api/background.rs
@@ -268,6 +268,21 @@ pub struct SupportBundleCleanupReport {
     pub db_failing_bundles_updated: usize,
 }
 
+/// Describes what happened during automatic support bundle deletion.
+#[derive(Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct SupportBundleAutoDeletionReport {
+    /// Number of bundles marked for deletion to free up dataset capacity.
+    pub bundles_marked_for_deletion: usize,
+    /// Current count of free debug datasets (before deletions).
+    pub free_datasets: usize,
+    /// Total debug datasets available.
+    pub total_datasets: usize,
+    /// Active bundles count (before deletions).
+    pub active_bundles: usize,
+    /// Errors encountered during auto-deletion.
+    pub errors: Vec<String>,
+}
+
 /// Identifies what we could or could not store within a support bundle.
 ///
 /// This struct will get emitted as part of the background task infrastructure.
diff --git a/schema/crdb/bundle-state-index/up1.sql b/schema/crdb/bundle-state-index/up1.sql
new file mode 100644
index 00000000000..862dc1685f9
--- /dev/null
+++ b/schema/crdb/bundle-state-index/up1.sql
@@ -0,0 +1,4 @@
+CREATE INDEX IF NOT EXISTS lookup_bundle_by_state_and_creation ON omicron.public.support_bundle (
+    state,
+    time_created
+);
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index a956bbd4ec0..16750a16c0a 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -2926,6 +2926,11 @@ CREATE INDEX IF NOT EXISTS lookup_bundle_by_creation ON omicron.public.support_b
     time_created
 );
 
+CREATE INDEX IF NOT EXISTS lookup_bundle_by_state_and_creation ON omicron.public.support_bundle (
+    state,
+    time_created
+);
+
 /*******************************************************************/
 
 /*
@@ -7791,7 +7796,7 @@ INSERT INTO omicron.public.db_metadata (
     version,
     target_version
 ) VALUES
-    (TRUE, NOW(), NOW(), '220.0.0', NULL)
+    (TRUE, NOW(), NOW(), '221.0.0', NULL)
 ON CONFLICT DO NOTHING;
 
 COMMIT;
diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml
index b9b843fbcbd..54b057adbb7 100644
--- a/smf/nexus/multi-sled/config-partial.toml
+++ b/smf/nexus/multi-sled/config-partial.toml
@@ -53,6 +53,8 @@ inventory.disable_collect = false
 phantom_disks.period_secs = 30
 physical_disk_adoption.period_secs = 30
 support_bundle_collector.period_secs = 30
+support_bundle_collector.target_free_datasets = 16
+support_bundle_collector.min_bundles_to_keep = 16
 decommissioned_disk_cleaner.period_secs = 60
 blueprints.period_secs_load = 10
 blueprints.period_secs_plan = 60
diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml
index c7e9e5c4317..cc5601c49dd 100644
--- a/smf/nexus/single-sled/config-partial.toml
+++ b/smf/nexus/single-sled/config-partial.toml
@@ -53,6 +53,8 @@ inventory.disable_collect = false
 phantom_disks.period_secs = 30
 physical_disk_adoption.period_secs = 30
 support_bundle_collector.period_secs = 30
+support_bundle_collector.target_free_datasets = 1
+support_bundle_collector.min_bundles_to_keep = 4
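+# Illustrative sizing for these knobs (hypothetical numbers, not a
+# recommendation): with 5 debug datasets, target_free_datasets = 1 triggers
+# deletion only once all 5 are occupied, and min_bundles_to_keep = 4 caps any
+# single pass at 5 - 4 = 1 deletion of the oldest Active bundle.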
 decommissioned_disk_cleaner.period_secs = 60
 blueprints.period_secs_load = 10
 blueprints.period_secs_plan = 60

From a3ad3d35c6d26a98f0ac8105524ad8b75e30fbfa Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Thu, 15 Jan 2026 14:08:09 -0800
Subject: [PATCH 2/6] support_bundle: optimize auto-deletion query for scale

- Use COUNT queries instead of loading all dataset/bundle rows
- Use GROUP BY to get used_datasets and active_bundles in a single query
- Add LIMIT to the deletion candidates query
- Exclude Failed bundles from used_datasets (their dataset was expunged)

These changes allow the auto-deletion query to scale to systems with
thousands of datasets without loading all rows into memory.
---
 .../src/db/datastore/support_bundle.rs        | 81 +++++++++++--------
 1 file changed, 48 insertions(+), 33 deletions(-)

diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs
index 6ddf29002f8..d9592135c15 100644
--- a/nexus/db-queries/src/db/datastore/support_bundle.rs
+++ b/nexus/db-queries/src/db/datastore/support_bundle.rs
@@ -566,42 +566,52 @@ impl DataStore {
 
         let conn = self.pool_connection_authorized(opctx).await?;
 
-        // Count total and free datasets using a LEFT JOIN to avoid full table scans.
-        // A dataset is "free" if it has no bundle using it (any state).
-        // This uses the one_bundle_per_dataset unique index on support_bundle.dataset_id.
-        let dataset_counts: Vec<(Uuid, Option<Uuid>)> =
-            dataset_dsl::rendezvous_debug_dataset
-                .filter(dataset_dsl::time_tombstoned.is_null())
-                .left_join(
-                    bundle_dsl::support_bundle
-                        .on(dataset_dsl::id.eq(bundle_dsl::dataset_id)),
-                )
-                .select((dataset_dsl::id, bundle_dsl::dataset_id.nullable()))
+        let target = target_free_datasets.get() as usize;
+
+        // Count total non-tombstoned datasets
+        let total_datasets: i64 = dataset_dsl::rendezvous_debug_dataset
+            .filter(dataset_dsl::time_tombstoned.is_null())
+            .count()
+            .get_result_async(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
+        let total_datasets = total_datasets as usize;
+
+        // Count bundles by state to get both used_datasets and active_bundles
+        // in a single query. Uses the lookup_bundle_by_state_and_creation index.
+        //
+        // Bundles that occupy datasets are:
+        // - Collecting: actively writing to dataset
+        // - Active: has complete data on dataset
+        // - Destroying: waiting for storage cleanup
+        // - Failing: transitional, still has storage to clean
+        // Failed bundles do NOT occupy datasets - they're marked Failed when
+        // their dataset is expunged (see support_bundle_fail_expunged).
+        let bundle_counts: Vec<(SupportBundleState, i64)> =
+            bundle_dsl::support_bundle
+                .filter(bundle_dsl::state.eq_any([
+                    SupportBundleState::Collecting,
+                    SupportBundleState::Active,
+                    SupportBundleState::Destroying,
+                    SupportBundleState::Failing,
+                ]))
+                .group_by(bundle_dsl::state)
+                .select((bundle_dsl::state, diesel::dsl::count_star()))
                 .load_async(&*conn)
                 .await
                 .map_err(|e| {
                     public_error_from_diesel(e, ErrorHandler::Server)
                 })?;
 
-        let total_datasets = dataset_counts.len();
-        let free_datasets = dataset_counts
+        let used_datasets: usize =
+            bundle_counts.iter().map(|(_, count)| *count as usize).sum();
+        let active_bundles: usize = bundle_counts
             .iter()
-            .filter(|(_, bundle)| bundle.is_none())
-            .count();
-
-        // Load Active bundles for deletion candidates
-        // Only Active bundles can be auto-deleted (not Collecting, Destroying, etc.) 
-        // Ordered by creation time to get the oldest first for potential deletion
-        let active_bundle_list: Vec<SupportBundle> = bundle_dsl::support_bundle
-            .filter(bundle_dsl::state.eq(SupportBundleState::Active))
-            .order(bundle_dsl::time_created.asc())
-            .select(SupportBundle::as_select())
-            .load_async(&*conn)
-            .await
-            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
-        let active_bundles = active_bundle_list.len();
+            .find(|(state, _)| *state == SupportBundleState::Active)
+            .map(|(_, count)| *count as usize)
+            .unwrap_or(0);
 
-        let target = target_free_datasets.get() as usize;
+        let free_datasets = total_datasets.saturating_sub(used_datasets);
 
         // If we have enough free datasets, no deletion needed
         if free_datasets >= target {
             return Ok(AutoDeletionResult {
@@ -634,11 +644,16 @@ impl DataStore {
             });
         }
 
-        // Take oldest bundles from the already-sorted list
-        let bundles_to_delete: Vec<SupportBundle> = active_bundle_list
-            .into_iter()
-            .take(bundles_to_delete_count)
-            .collect();
+        // Query only the oldest bundles we need to delete (with LIMIT)
+        // Uses the lookup_bundle_by_state_and_creation index for efficient ordering
+        let bundles_to_delete: Vec<SupportBundle> = bundle_dsl::support_bundle
+            .filter(bundle_dsl::state.eq(SupportBundleState::Active))
+            .order(bundle_dsl::time_created.asc())
+            .limit(bundles_to_delete_count as i64)
+            .select(SupportBundle::as_select())
+            .load_async(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
 
         Ok(AutoDeletionResult {
             bundles_to_delete,

From 0290b8b28a763a8dda96ee1e07ca6dc40a703ed4 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Fri, 16 Jan 2026 14:37:03 -0800
Subject: [PATCH 3/6] support_bundle: make auto-deletion atomic to prevent
 TOCTTOU issue

Replace the two-step find-then-update approach with a single atomic CTE
query that calculates how many deletions are needed AND performs them in
one database operation.

The previous approach had a time-of-check to time-of-use (TOCTTOU) issue
where multiple Nexuses running concurrently could over-delete bundles:

1. Nexus A queries: free=2, needs 1 deletion -> gets B1
2. Nexus A transitions B1: Active -> Destroying
3. Nexus B queries: free=2 (unchanged, Destroying still occupies),
   needs 1 -> gets B2
4. Nexus B transitions B2: Active -> Destroying
5. Result: 2 bundles deleted when only 1 was needed
The new atomic query:
- Calculates free datasets and needed deletions
- Respects min_bundles_to_keep constraint
- Finds the N oldest Active bundles
- Transitions them to Destroying state atomically
- Returns the IDs of deleted bundles

Also adds tests verifying:
- Bundles are actually transitioned to Destroying state
- Failed bundles don't count as occupying datasets
- Explain test for SQL validity
- Expectorate test for SQL output
---
 dev-tools/omdb/tests/successes.out            |  10 +
 .../src/db/datastore/support_bundle.rs        | 486 +++++++++++++-----
 .../output/support_bundle_auto_delete.sql     |  52 ++
 .../tasks/support_bundle_collector.rs         |  77 +--
 4 files changed, 446 insertions(+), 179 deletions(-)
 create mode 100644 nexus/db-queries/tests/output/support_bundle_auto_delete.sql

diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out
index 1df742a2a8f..30a6369d249 100644
--- a/dev-tools/omdb/tests/successes.out
+++ b/dev-tools/omdb/tests/successes.out
@@ -844,6 +844,11 @@ task: "support_bundle_collector"
   configured period: every days h m s
   last completed activation: , triggered by
     started at (s ago) and ran for ms
+    Support Bundle Auto-Deletion Report:
+        Total debug datasets: 0
+        Active bundles: 0
+        Free datasets: 0
+        Bundles marked for deletion: 0
     Support Bundle Cleanup Report:
         Bundles deleted from sleds: 0
         Bundles not found on sleds: 0
@@ -1412,6 +1417,11 @@ task: "support_bundle_collector"
   configured period: every days h m s
   last completed activation: , triggered by
     started at (s ago) and ran for ms
+    Support Bundle Auto-Deletion Report:
+        Total debug datasets: 0
+        Active bundles: 0
+        Free datasets: 0
+        Bundles marked for deletion: 0
     Support Bundle Cleanup Report:
         Bundles deleted from sleds: 0
         Bundles not found on sleds: 0
diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs
index d9592135c15..ae327eb437d 100644
--- a/nexus/db-queries/src/db/datastore/support_bundle.rs
+++ b/nexus/db-queries/src/db/datastore/support_bundle.rs
@@ -12,6 +12,7 @@ use crate::db::model::SupportBundle;
 use crate::db::model::SupportBundleState;
 use crate::db::pagination::paginated;
 use crate::db::pagination::paginated_multicolumn;
+use crate::db::raw_query_builder::TypedSqlQuery;
 use crate::db::update_and_check::{UpdateAndCheck, UpdateStatus};
 use async_bb8_diesel::AsyncRunQueryDsl;
 use diesel::prelude::*;
@@ -63,11 +64,11 @@ pub struct SupportBundleExpungementReport {
     pub bundles_reassigned: usize,
 }
 
-/// Result of querying for bundles eligible for automatic deletion.
+/// Result of atomically deleting support bundles for capacity management.
 #[derive(Debug, Clone)]
 pub struct AutoDeletionResult {
-    /// Bundles that should be marked for deletion (oldest first).
-    pub bundles_to_delete: Vec<SupportBundle>,
+    /// IDs of bundles that were transitioned to Destroying state.
+    pub deleted_ids: Vec<SupportBundleUuid>,
     /// Total number of debug datasets available.
     pub total_datasets: usize,
     /// Number of active bundles (before any deletions).
@@ -539,131 +540,163 @@ impl DataStore {
         Ok(())
     }
 
-    /// Finds support bundles eligible for automatic deletion to maintain
-    /// a buffer of free debug datasets.
+    /// Atomically finds and deletes support bundles to maintain free dataset buffer.
     ///
-    /// This method checks if there are fewer free debug datasets than
-    /// `target_free_datasets`, and if so, returns the oldest active bundles
-    /// that should be deleted to restore the buffer.
+    /// This method performs a single atomic operation that:
+    /// 1. Calculates how many deletions are needed (free_datasets < target)
+    /// 2. Applies min_bundles_to_keep constraint (never delete below minimum)
+    /// 3. Finds the N oldest Active bundles
+    /// 4. Transitions them to Destroying state atomically
+    /// 5. Returns what was actually deleted
+    ///
+    /// This prevents over-deletion when multiple Nexuses run concurrently,
+    /// because the calculation and state transitions happen in a single
+    /// database operation.
     ///
     /// The `min_bundles_to_keep` parameter prevents aggressive cleanup on
     /// small systems by ensuring we never delete bundles if doing so would
     /// leave fewer than this many bundles.
-    ///
-    /// Returns an `AutoDeletionResult` containing:
-    /// - The bundles that should be marked for deletion (oldest first)
-    /// - Statistics about the current state of datasets and bundles
-    pub async fn support_bundle_find_auto_deletion_candidates(
+    pub async fn support_bundle_auto_delete(
         &self,
         opctx: &OpContext,
         target_free_datasets: NonZeroU32,
         min_bundles_to_keep: Option<NonZeroU32>,
     ) -> Result<AutoDeletionResult, Error> {
-        opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
-
-        use nexus_db_schema::schema::rendezvous_debug_dataset::dsl as dataset_dsl;
-        use nexus_db_schema::schema::support_bundle::dsl as bundle_dsl;
+        opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
 
         let conn = self.pool_connection_authorized(opctx).await?;
 
-        let target = target_free_datasets.get() as usize;
+        let query = support_bundle_auto_delete_query(
+            target_free_datasets,
+            min_bundles_to_keep,
+        );
 
-        // Count total non-tombstoned datasets
-        let total_datasets: i64 = dataset_dsl::rendezvous_debug_dataset
-            .filter(dataset_dsl::time_tombstoned.is_null())
-            .count()
+        // Return type: (total_datasets, used_datasets, active_bundles, deleted_ids)
+        let result: (i64, i64, i64, Vec<Uuid>) = query
             .get_result_async(&*conn)
             .await
             .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
-        let total_datasets = total_datasets as usize;
 
-        // Count bundles by state to get both used_datasets and active_bundles
-        // in a single query. Uses the lookup_bundle_by_state_and_creation index.
-        //
-        // Bundles that occupy datasets are:
-        // - Collecting: actively writing to dataset
-        // - Active: has complete data on dataset
-        // - Destroying: waiting for storage cleanup
-        // - Failing: transitional, still has storage to clean
-        // Failed bundles do NOT occupy datasets - they're marked Failed when
-        // their dataset is expunged (see support_bundle_fail_expunged).
-        let bundle_counts: Vec<(SupportBundleState, i64)> =
-            bundle_dsl::support_bundle
-                .filter(bundle_dsl::state.eq_any([
-                    SupportBundleState::Collecting,
-                    SupportBundleState::Active,
-                    SupportBundleState::Destroying,
-                    SupportBundleState::Failing,
-                ]))
-                .group_by(bundle_dsl::state)
-                .select((bundle_dsl::state, diesel::dsl::count_star()))
-                .load_async(&*conn)
-                .await
-                .map_err(|e| {
-                    public_error_from_diesel(e, ErrorHandler::Server)
-                })?;
-
-        let used_datasets: usize =
-            bundle_counts.iter().map(|(_, count)| *count as usize).sum();
-        let active_bundles: usize = bundle_counts
-            .iter()
-            .find(|(state, _)| *state == SupportBundleState::Active)
-            .map(|(_, count)| *count as usize)
-            .unwrap_or(0);
+        let total_datasets = result.0 as usize;
+        let used_datasets = result.1 as usize;
+        let active_bundles = result.2 as usize;
+        let deleted_ids: Vec<SupportBundleUuid> = result
+            .3
+            .into_iter()
+            .map(SupportBundleUuid::from_untyped_uuid)
+            .collect();
 
-        let free_datasets = total_datasets.saturating_sub(used_datasets);
-
-        // If we have enough free datasets, no deletion needed
-        if free_datasets >= target {
-            return Ok(AutoDeletionResult {
-                bundles_to_delete: Vec::new(),
-                total_datasets,
-                active_bundles,
-                free_datasets,
-            });
-        }
-
-        // Calculate how many bundles we need to delete
-        let mut bundles_to_delete_count = target - free_datasets;
-
-        // Apply min_bundles_to_keep constraint
-        if let Some(min_keep) = min_bundles_to_keep {
-            let min_keep = min_keep.get() as usize;
-            let max_deletable = active_bundles.saturating_sub(min_keep);
-            bundles_to_delete_count =
-                bundles_to_delete_count.min(max_deletable);
-        }
-
-        // If we can't delete any bundles, return empty
-        if bundles_to_delete_count == 0 {
-            return Ok(AutoDeletionResult {
-                bundles_to_delete: Vec::new(),
-                total_datasets,
-                active_bundles,
-                free_datasets,
-            });
-        }
-
-        // Query only the oldest bundles we need to delete (with LIMIT)
-        // Uses the lookup_bundle_by_state_and_creation index for efficient ordering
-        let bundles_to_delete: Vec<SupportBundle> = bundle_dsl::support_bundle
-            .filter(bundle_dsl::state.eq(SupportBundleState::Active))
-            .order(bundle_dsl::time_created.asc())
-            .limit(bundles_to_delete_count as i64)
-            .select(SupportBundle::as_select())
-            .load_async(&*conn)
-            .await
-            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
 
         Ok(AutoDeletionResult {
-            bundles_to_delete,
+            deleted_ids,
             total_datasets,
             active_bundles,
-            free_datasets,
+            free_datasets: total_datasets.saturating_sub(used_datasets),
         })
     }
 }
+
+/// Builds the CTE query for atomic support bundle auto-deletion.
+///
+/// This query atomically:
+/// 1. Counts datasets and bundles
+/// 2. Calculates how many bundles to delete (respecting min_bundles_to_keep)
+/// 3. Selects the oldest Active bundles
+/// 4. Transitions them to Destroying state
+/// 5. Returns the stats and deleted IDs
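+///
+/// A sketch of intended usage (hypothetical bindings; assumes an async pool
+/// connection `conn`):
+///
+/// ```ignore
+/// let query = support_bundle_auto_delete_query(
+///     NonZeroU32::new(2).unwrap(),       // target_free_datasets
+///     Some(NonZeroU32::new(4).unwrap()), // min_bundles_to_keep
+/// );
+/// let (total, used, active, deleted): (i64, i64, i64, Vec<Uuid>) =
+///     query.get_result_async(&*conn).await?;
+/// ```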
+pub fn support_bundle_auto_delete_query(
+    target_free_datasets: NonZeroU32,
+    min_bundles_to_keep: Option<NonZeroU32>,
+) -> TypedSqlQuery<(
+    diesel::sql_types::BigInt,
+    diesel::sql_types::BigInt,
+    diesel::sql_types::BigInt,
+    diesel::sql_types::Array<diesel::sql_types::Uuid>,
+)> {
+    use crate::db::raw_query_builder::QueryBuilder;
+
+    let min_keep = min_bundles_to_keep.map(|n| i64::from(n.get())).unwrap_or(0);
+
+    let mut query = QueryBuilder::new();
+
+    // Build the CTE query
+    query
+        .sql(
+            "
+WITH
+  -- Count non-tombstoned datasets (uses lookup_usable_rendezvous_debug_dataset index)
+  dataset_count AS (
+    SELECT COUNT(*) as total
+    FROM rendezvous_debug_dataset
+    WHERE time_tombstoned IS NULL
+  ),
+  -- Count bundles occupying datasets.
+  -- Bundles that occupy datasets: Collecting, Active, Destroying, Failing
+  -- Failed bundles do NOT occupy datasets (their dataset was expunged).
+  -- Uses lookup_bundle_by_state_and_creation index.
+  used_count AS (
+    SELECT COUNT(*) as used
+    FROM support_bundle
+    WHERE state IN ('collecting', 'active', 'destroying', 'failing')
+  ),
+  -- Count active bundles (for min_bundles_to_keep check)
+  active_count AS (
+    SELECT COUNT(*) as active
+    FROM support_bundle
+    WHERE state = 'active'
+  ),
+  -- Calculate how many bundles we need AND are allowed to delete.
+  -- min_bundles_to_keep is respected first (max_deletable constraint).
+  deletion_calc AS (
+    SELECT
+      (SELECT total FROM dataset_count) as total_datasets,
+      (SELECT used FROM used_count) as used_datasets,
+      (SELECT active FROM active_count) as active_bundles,
+      GREATEST(0,
+        ",
+        )
+        .param()
+        .sql(
+            " - ((SELECT total FROM dataset_count) - (SELECT used FROM used_count))
+      ) as bundles_needed,
+      GREATEST(0,
+        (SELECT active FROM active_count) - ",
+        )
+        .param()
+        .sql(
+            "
+      ) as max_deletable
+  ),
+  -- Find the N oldest active bundles we're allowed to delete.
+  -- Uses lookup_bundle_by_state_and_creation index for ordering.
+  candidates AS (
+    SELECT id
+    FROM support_bundle
+    WHERE state = 'active'
+    ORDER BY time_created ASC
+    LIMIT (SELECT LEAST(bundles_needed, max_deletable) FROM deletion_calc)
+  ),
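+  -- Worked example (hypothetical numbers, not from a real system): with
+  -- total=5, used=5, active=5, a target of 2, and min_keep of 4, this gives
+  -- bundles_needed = 2 - (5 - 5) = 2 and max_deletable = 5 - 4 = 1, so the
+  -- LIMIT above selects LEAST(2, 1) = 1 candidate.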
+  -- Atomically transition to Destroying (only if still Active).
+  -- The state='active' check handles concurrent user deletions.
+  deleted AS (
+    UPDATE support_bundle
+    SET state = 'destroying'
+    WHERE id IN (SELECT id FROM candidates)
+      AND state = 'active'
+    RETURNING id
+  )
+SELECT
+  (SELECT total_datasets FROM deletion_calc),
+  (SELECT used_datasets FROM deletion_calc),
+  (SELECT active_bundles FROM deletion_calc),
+  ARRAY(SELECT id FROM deleted) as deleted_ids
+",
+        )
+        .bind::<diesel::sql_types::BigInt, _>(i64::from(
+            target_free_datasets.get(),
+        ))
+        .bind::<diesel::sql_types::BigInt, _>(min_keep);
+
+    query.query()
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -1687,7 +1720,7 @@ mod test {
         logctx.cleanup_successful();
     }
 
-    // Tests for support_bundle_find_auto_deletion_candidates
+    // Tests for support_bundle_auto_delete
 
     #[tokio::test]
     async fn test_auto_deletion_no_bundles() {
@@ -1699,9 +1732,9 @@ mod test {
         let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await;
 
         // With target_free=3 and 5 datasets, no bundles, free=5 >= 3
-        // Should return no bundles to delete
+        // Should delete no bundles
         let result = datastore
-            .support_bundle_find_auto_deletion_candidates(
+            .support_bundle_auto_delete(
                 &opctx,
                 NonZeroU32::new(3).unwrap(),
                 None,
@@ -1712,7 +1745,7 @@ mod test {
         assert_eq!(result.total_datasets, 5);
         assert_eq!(result.active_bundles, 0);
         assert_eq!(result.free_datasets, 5);
-        assert!(result.bundles_to_delete.is_empty());
+        assert!(result.deleted_ids.is_empty());
 
         db.terminate().await;
         logctx.cleanup_successful();
@@ -1748,9 +1781,9 @@ mod test {
                 .expect("Should update state");
         }
 
-        // With target_free=3 and free=5 >= 3, should return no bundles
+        // With target_free=3 and free=5 >= 3, should delete no bundles
         let result = datastore
-            .support_bundle_find_auto_deletion_candidates(
+            .support_bundle_auto_delete(
                 &opctx,
                 NonZeroU32::new(3).unwrap(),
                 None,
@@ -1761,7 +1794,7 @@ mod test {
         assert_eq!(result.total_datasets, 10);
         assert_eq!(result.active_bundles, 5);
         assert_eq!(result.free_datasets, 5);
-        assert!(result.bundles_to_delete.is_empty());
+        assert!(result.deleted_ids.is_empty());
 
         db.terminate().await;
         logctx.cleanup_successful();
@@ -1805,7 +1838,7 @@ mod test {
 
         // With target_free=2 and free=0, we need to delete 2 bundles
         let result = datastore
-            .support_bundle_find_auto_deletion_candidates(
+            .support_bundle_auto_delete(
                 &opctx,
                 NonZeroU32::new(2).unwrap(),
                 None,
@@ -1816,13 +1849,11 @@ mod test {
         assert_eq!(result.total_datasets, 5);
         assert_eq!(result.active_bundles, 5);
         assert_eq!(result.free_datasets, 0);
-        assert_eq!(result.bundles_to_delete.len(), 2);
+        assert_eq!(result.deleted_ids.len(), 2);
 
         // Verify the oldest bundles are selected (first two created)
-        let delete_ids: Vec<_> =
-            result.bundles_to_delete.iter().map(|b| b.id).collect();
-        assert!(delete_ids.contains(&bundle_ids[0]));
-        assert!(delete_ids.contains(&bundle_ids[1]));
+        assert!(result.deleted_ids.contains(&bundle_ids[0].into()));
+        assert!(result.deleted_ids.contains(&bundle_ids[1].into()));
 
         db.terminate().await;
         logctx.cleanup_successful();
@@ -1862,7 +1893,7 @@ mod test {
         // With target_free=3 and free=0, we'd want to delete 3 bundles
         // But min_bundles_to_keep=4 means we can only delete 1 (5-4=1)
         let result = datastore
-            .support_bundle_find_auto_deletion_candidates(
+            .support_bundle_auto_delete(
                 &opctx,
                 NonZeroU32::new(3).unwrap(),
                 Some(NonZeroU32::new(4).unwrap()),
@@ -1874,7 +1905,7 @@ mod test {
         assert_eq!(result.total_datasets, 5);
         assert_eq!(result.active_bundles, 5);
         assert_eq!(result.free_datasets, 0);
         // Can only delete 1 bundle due to min_bundles_to_keep constraint
-        assert_eq!(result.bundles_to_delete.len(), 1);
+        assert_eq!(result.deleted_ids.len(), 1);
 
         db.terminate().await;
logctx.cleanup_successful(); @@ -1915,7 +1946,7 @@ mod test { // We'd want to delete 3 bundles, but min_bundles_to_keep=5 is > 3 active // So we can't delete any let result = datastore - .support_bundle_find_auto_deletion_candidates( + .support_bundle_auto_delete( &opctx, NonZeroU32::new(5).unwrap(), Some(NonZeroU32::new(5).unwrap()), @@ -1927,7 +1958,7 @@ mod test { assert_eq!(result.active_bundles, 3); assert_eq!(result.free_datasets, 2); // min_bundles_to_keep (5) > active_bundles (3), so no deletion - assert!(result.bundles_to_delete.is_empty()); + assert!(result.deleted_ids.is_empty()); db.terminate().await; logctx.cleanup_successful(); @@ -1985,7 +2016,7 @@ mod test { // free_datasets = 5 - 3 = 2 (ALL bundles occupy datasets, not just Active) // We should only delete Active bundles though let result = datastore - .support_bundle_find_auto_deletion_candidates( + .support_bundle_auto_delete( &opctx, NonZeroU32::new(5).unwrap(), None, @@ -2001,10 +2032,221 @@ mod test { assert_eq!(result.free_datasets, 2); // We want 5 free but only have 2, so we want to delete 3 // But we only have 1 Active bundle to delete - assert_eq!(result.bundles_to_delete.len(), 1); - assert_eq!(result.bundles_to_delete[0].id, bundle1.id); + assert_eq!(result.deleted_ids.len(), 1); + assert_eq!(result.deleted_ids[0], bundle1.id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_auto_deletion_verifies_state_transition() { + let logctx = + dev::test_setup_log("test_auto_deletion_verifies_state_transition"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 3 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 3).await; + + // Create 3 bundles + let mut bundle_ids = Vec::new(); + for _ in 0..3 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + // Mark as active + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + + bundle_ids.push(bundle.id); + tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; + } + + // Verify bundles start in Active state + for id in &bundle_ids { + let bundle = datastore + .support_bundle_get(&opctx, (*id).into()) + .await + .unwrap(); + assert_eq!(bundle.state, SupportBundleState::Active); + } + + // With target_free=2 and free=0, delete 2 bundles + let result = datastore + .support_bundle_auto_delete( + &opctx, + NonZeroU32::new(2).unwrap(), + None, + ) + .await + .expect("Should succeed"); + + assert_eq!(result.deleted_ids.len(), 2); + + // Verify deleted bundles are now in Destroying state + for id in &result.deleted_ids { + let bundle = + datastore.support_bundle_get(&opctx, *id).await.unwrap(); + assert_eq!( + bundle.state, + SupportBundleState::Destroying, + "Bundle {} should be Destroying", + id + ); + } + + // Verify the remaining bundle is still Active + let remaining_id = bundle_ids + .iter() + .find(|id| { + !result.deleted_ids.contains(&SupportBundleUuid::from(**id)) + }) + .unwrap(); + let remaining_bundle = datastore + .support_bundle_get(&opctx, (*remaining_id).into()) + .await + .unwrap(); + assert_eq!(remaining_bundle.state, SupportBundleState::Active); db.terminate().await; logctx.cleanup_successful(); } + 
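For readers cross-checking these tests, here is the arithmetic that the deletion_calc CTE encodes, as a plain-Rust sketch (illustrative only, not part of the patch; the function name is ad hoc):

    fn bundles_to_delete(
        total_datasets: i64,
        used_datasets: i64,
        active_bundles: i64,
        target_free: i64,
        min_keep: i64,
    ) -> i64 {
        // GREATEST(0, target_free - (total - used))
        let bundles_needed = (target_free - (total_datasets - used_datasets)).max(0);
        // GREATEST(0, active - min_keep)
        let max_deletable = (active_bundles - min_keep).max(0);
        // LIMIT (SELECT LEAST(bundles_needed, max_deletable) ...)
        bundles_needed.min(max_deletable)
    }

    fn main() {
        // Mirrors test_auto_deletion_min_bundles above: want 3 free, allowed 1.
        assert_eq!(bundles_to_delete(5, 5, 5, 3, 4), 1);
        // Mirrors the case where min_keep exceeds the active count: delete none.
        assert_eq!(bundles_to_delete(5, 3, 3, 5, 5), 0);
    }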
+ #[tokio::test] + async fn test_auto_deletion_failed_bundles_dont_occupy_datasets() { + let logctx = dev::test_setup_log( + "test_auto_deletion_failed_bundles_dont_occupy_datasets", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 3 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 3).await; + + // Create 3 bundles, all Active + let mut bundle_ids = Vec::new(); + for _ in 0..3 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + + bundle_ids.push(bundle.id); + } + + // All 3 datasets are used, free=0 + let result = datastore + .support_bundle_auto_delete( + &opctx, + NonZeroU32::new(1).unwrap(), + None, + ) + .await + .expect("Should succeed"); + + assert_eq!(result.free_datasets, 0); + assert_eq!(result.deleted_ids.len(), 1); + + // Manually mark one bundle as Failed (simulating dataset expungement) + // Failed bundles don't occupy datasets + let authz_bundle = authz_support_bundle_from_id(bundle_ids[1].into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Failing, + ) + .await + .expect("Should update state"); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Failed, + ) + .await + .expect("Should update state"); + + // Now we have: 1 Destroying, 1 Failed, 1 Active + // - Total datasets: 3 + // - Used by non-Failed bundles: 2 (Destroying + Active) + // - Free datasets: 3 - 2 = 1 + let result = datastore + .support_bundle_auto_delete( + &opctx, + NonZeroU32::new(1).unwrap(), + None, + ) + .await + .expect("Should succeed"); + + // Free should be 1 now (Failed bundle doesn't count as using a dataset) + assert_eq!(result.free_datasets, 1); + // Since free=1 >= target=1, no deletion needed + assert!(result.deleted_ids.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Tests for the auto_delete CTE query + + #[tokio::test] + async fn test_auto_delete_query_explains() { + use crate::db::explain::ExplainableAsync; + use crate::db::pub_test_utils::TestDatabase; + + let logctx = dev::test_setup_log("test_auto_delete_query_explains"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let conn = db.pool().claim().await.unwrap(); + + let query = support_bundle_auto_delete_query( + NonZeroU32::new(3).unwrap(), + Some(NonZeroU32::new(5).unwrap()), + ); + + let _ = query.explain_async(&conn).await.unwrap_or_else(|e| { + panic!("Failed to explain query, is it valid SQL?\nerror: {e:#?}") + }); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn expectorate_auto_delete_query() { + use crate::db::raw_query_builder::expectorate_query_contents; + + let query = support_bundle_auto_delete_query( + NonZeroU32::new(3).unwrap(), + Some(NonZeroU32::new(5).unwrap()), + ); + expectorate_query_contents( + query, + "tests/output/support_bundle_auto_delete.sql", + ) + .await; + } } diff --git a/nexus/db-queries/tests/output/support_bundle_auto_delete.sql b/nexus/db-queries/tests/output/support_bundle_auto_delete.sql new file mode 100644 index 00000000000..ff34596d66c --- /dev/null +++ 
b/nexus/db-queries/tests/output/support_bundle_auto_delete.sql @@ -0,0 +1,52 @@ +WITH + dataset_count + AS (SELECT count(*) AS total FROM rendezvous_debug_dataset WHERE time_tombstoned IS NULL), + used_count + AS ( + SELECT + count(*) AS used + FROM + support_bundle + WHERE + state IN ('collecting', 'active', 'destroying', 'failing') + ), + active_count AS (SELECT count(*) AS active FROM support_bundle WHERE state = 'active'), + deletion_calc + AS ( + SELECT + (SELECT total FROM dataset_count) AS total_datasets, + (SELECT used FROM used_count) AS used_datasets, + (SELECT active FROM active_count) AS active_bundles, + greatest(0, $1 - ((SELECT total FROM dataset_count) - (SELECT used FROM used_count))) + AS bundles_needed, + greatest(0, (SELECT active FROM active_count) - $2) AS max_deletable + ), + candidates + AS ( + SELECT + id + FROM + support_bundle + WHERE + state = 'active' + ORDER BY + time_created ASC + LIMIT + (SELECT least(bundles_needed, max_deletable) FROM deletion_calc) + ), + deleted + AS ( + UPDATE + support_bundle + SET + state = 'destroying' + WHERE + id IN (SELECT id FROM candidates) AND state = 'active' + RETURNING + id + ) +SELECT + (SELECT total_datasets FROM deletion_calc), + (SELECT used_datasets FROM deletion_calc), + (SELECT active_bundles FROM deletion_calc), + ARRAY (SELECT id FROM deleted) AS deleted_ids diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 56bbc131040..717bd827a19 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -323,12 +323,15 @@ impl SupportBundleCollector { Ok(report) } - /// Checks for bundles eligible for automatic deletion and marks them - /// for destruction. + /// Atomically finds and marks bundles for automatic deletion. /// /// This maintains a buffer of free debug datasets for new bundle /// allocations by deleting the oldest bundles when the free dataset /// count drops below `target_free_datasets`. + /// + /// The operation is atomic: finding candidates and transitioning them + /// to Destroying state happens in a single database query. This prevents + /// over-deletion when multiple Nexuses run concurrently. 
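+ ///
+ /// The counts copied into the returned report are a snapshot of the state
+ /// the query observed before it deleted anything.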
async fn auto_delete_bundles( &self, opctx: &OpContext, @@ -340,86 +343,46 @@ impl SupportBundleCollector { return report; }; - // Query for candidates + // Atomically find and delete bundles in a single query let result = self .datastore - .support_bundle_find_auto_deletion_candidates( + .support_bundle_auto_delete( opctx, target_free, self.min_bundles_to_keep, ) .await; - let candidates = match result { + let auto_deleted = match result { Ok(r) => r, Err(err) => { warn!( &opctx.log, - "SupportBundleCollector: Failed to find auto-deletion candidates"; + "SupportBundleCollector: Failed to auto-delete bundles"; "err" => %err ); report .errors - .push(format!("Failed to query candidates: {}", err)); + .push(format!("Failed to auto-delete bundles: {}", err)); return report; } }; - // Update report with current state - report.total_datasets = candidates.total_datasets; - report.active_bundles = candidates.active_bundles; - report.free_datasets = candidates.free_datasets; - - // Mark each candidate bundle for deletion - for bundle in &candidates.bundles_to_delete { - let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + // Update report with state (as of before any deletions) + report.total_datasets = auto_deleted.total_datasets; + report.active_bundles = auto_deleted.active_bundles; + report.free_datasets = auto_deleted.free_datasets; + report.bundles_marked_for_deletion = auto_deleted.deleted_ids.len(); + // Log each bundle that was marked for deletion + for id in &auto_deleted.deleted_ids { info!( &opctx.log, - "SupportBundleCollector: Auto-deleting bundle to free dataset capacity"; - "id" => %bundle.id, - "time_created" => %bundle.time_created, - "free_datasets" => candidates.free_datasets, + "SupportBundleCollector: Auto-deleted bundle to free dataset capacity"; + "id" => %id, + "free_datasets" => auto_deleted.free_datasets, "target_free" => target_free.get(), ); - - // Transition bundle to Destroying state - match self - .datastore - .support_bundle_update( - opctx, - &authz_bundle, - SupportBundleState::Destroying, - ) - .await - { - Ok(()) => { - report.bundles_marked_for_deletion += 1; - } - Err(err) => { - // Handle gracefully - bundle may have been modified - // concurrently by another Nexus or user - if matches!(err, Error::InvalidRequest { .. 
}) { - info!( - &opctx.log, - "SupportBundleCollector: Concurrent state change during auto-deletion"; - "bundle" => %bundle.id, - "err" => ?err, - ); - } else { - warn!( - &opctx.log, - "SupportBundleCollector: Failed to mark bundle for auto-deletion"; - "bundle" => %bundle.id, - "err" => %err, - ); - report.errors.push(format!( - "Failed to mark bundle {} for deletion: {}", - bundle.id, err - )); - } - } - } } report From 85745292705dfc1dad2d2f4fcc7b79ce81d47010 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 20 Jan 2026 18:08:44 -0800 Subject: [PATCH 4/6] Switch to percentage-based, DB encoded configs --- dev-tools/omdb/src/bin/omdb/nexus.rs | 9 + .../bin/omdb/nexus/support_bundle_config.rs | 150 ++++++++++ dev-tools/omdb/tests/usage_errors.out | 1 + nexus-config/src/nexus_config.rs | 24 +- nexus/db-model/src/support_bundle.rs | 16 + .../src/db/datastore/support_bundle.rs | 278 +++++++++++------- .../output/support_bundle_auto_delete.sql | 25 +- nexus/db-schema/src/schema.rs | 9 + nexus/src/app/background/init.rs | 2 - .../tasks/support_bundle_collector.rs | 63 +--- .../integration_tests/support_bundles.rs | 38 ++- schema/crdb/bundle-state-index/up2.sql | 18 ++ schema/crdb/bundle-state-index/up3.sql | 4 + schema/crdb/dbinit.sql | 30 ++ smf/nexus/multi-sled/config-partial.toml | 2 - smf/nexus/single-sled/config-partial.toml | 2 - 16 files changed, 456 insertions(+), 215 deletions(-) create mode 100644 dev-tools/omdb/src/bin/omdb/nexus/support_bundle_config.rs create mode 100644 schema/crdb/bundle-state-index/up2.sql create mode 100644 schema/crdb/bundle-state-index/up3.sql diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 153d76c4a51..72805cdf01e 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -6,6 +6,7 @@ mod quiesce; mod reconfigurator_config; +mod support_bundle_config; mod update_status; use crate::Omdb; @@ -101,6 +102,8 @@ use std::os::unix::fs::PermissionsExt; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use support_bundle_config::SupportBundleConfigArgs; +use support_bundle_config::cmd_nexus_support_bundle_config; use support_bundle_viewer::LocalFileAccess; use support_bundle_viewer::SupportBundleAccessor; use tabled::Tabled; @@ -160,6 +163,8 @@ enum NexusCommands { ReconfiguratorConfig(ReconfiguratorConfigArgs), /// view sagas, create and complete demo sagas Sagas(SagasArgs), + /// interact with support bundle auto-deletion config + SupportBundleConfig(SupportBundleConfigArgs), /// interact with sleds Sleds(SledsArgs), /// interact with support bundles @@ -783,6 +788,10 @@ impl NexusArgs { cmd_nexus_reconfigurator_config(&omdb, &client, args).await } + NexusCommands::SupportBundleConfig(args) => { + cmd_nexus_support_bundle_config(&omdb, log, args).await + } + NexusCommands::Sagas(SagasArgs { command }) => { if self.nexus_internal_url.is_none() { eprintln!( diff --git a/dev-tools/omdb/src/bin/omdb/nexus/support_bundle_config.rs b/dev-tools/omdb/src/bin/omdb/nexus/support_bundle_config.rs new file mode 100644 index 00000000000..0cf6363c71d --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/nexus/support_bundle_config.rs @@ -0,0 +1,150 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
omdb commands for support bundle auto-deletion configuration + +use crate::Omdb; +use crate::check_allow_destructive::DestructiveOperationToken; +use crate::db::DbUrlOptions; +use anyhow::Context; +use clap::Args; +use clap::Subcommand; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use std::sync::Arc; + +#[derive(Debug, Args)] +pub struct SupportBundleConfigArgs { + #[clap(flatten)] + db_url_opts: DbUrlOptions, + + #[command(subcommand)] + command: SupportBundleConfigCommands, +} + +#[derive(Debug, Subcommand)] +pub enum SupportBundleConfigCommands { + /// Show current support bundle auto-deletion config + Show, + + /// Set support bundle auto-deletion config + Set(SupportBundleConfigSetArgs), +} + +#[derive(Debug, Clone, Args)] +pub struct SupportBundleConfigSetArgs { + /// Target percentage of datasets to keep free (0-100) + #[clap(long)] + target_free_percent: Option<u8>, + + /// Minimum percentage of datasets to keep as bundles (0-100) + #[clap(long)] + min_keep_percent: Option<u8>, +} + +pub async fn cmd_nexus_support_bundle_config( + omdb: &Omdb, + log: &slog::Logger, + args: &SupportBundleConfigArgs, +) -> Result<(), anyhow::Error> { + let datastore = args.db_url_opts.connect(omdb, log).await?; + let opctx = OpContext::for_tests(log.clone(), datastore.clone()); + + let result = match &args.command { + SupportBundleConfigCommands::Show => { + support_bundle_config_show(&opctx, &datastore).await + } + SupportBundleConfigCommands::Set(set_args) => { + let token = omdb.check_allow_destructive()?; + support_bundle_config_set(&opctx, &datastore, set_args, token).await + } + }; + + datastore.terminate().await; + result +} + +async fn support_bundle_config_show( + opctx: &OpContext, + datastore: &Arc<DataStore>, +) -> Result<(), anyhow::Error> { + let config = datastore + .support_bundle_config_get(opctx) + .await + .context("failed to get support bundle config")?; + + println!("Support Bundle Auto-Deletion Config:"); + println!( + " Target free datasets: {}% (CEIL calculation)", + config.target_free_percent + ); + println!( + " Minimum bundles to keep: {}% (CEIL calculation)", + config.min_keep_percent + ); + println!(" Last modified: {}", config.time_modified); + + Ok(()) +} + +async fn support_bundle_config_set( + opctx: &OpContext, + datastore: &Arc<DataStore>, + args: &SupportBundleConfigSetArgs, + _destruction_token: DestructiveOperationToken, +) -> Result<(), anyhow::Error> { + // Get current config + let current = datastore + .support_bundle_config_get(opctx) + .await + .context("failed to get current support bundle config")?; + + // Apply changes, using current values as defaults + let new_target_free = + args.target_free_percent.unwrap_or(current.target_free_percent as u8); + let new_min_keep = + args.min_keep_percent.unwrap_or(current.min_keep_percent as u8); + + // Check if anything changed + if i64::from(new_target_free) == current.target_free_percent + && i64::from(new_min_keep) == current.min_keep_percent + { + println!("No changes to current config:"); + println!( + " Target free datasets: {}% (CEIL calculation)", + current.target_free_percent + ); + println!( + " Minimum bundles to keep: {}% (CEIL calculation)", + current.min_keep_percent + ); + return Ok(()); + } + + // Apply the update + datastore + .support_bundle_config_set(opctx, new_target_free, new_min_keep) + .await + .context("failed to set support bundle config")?; + + println!("Support bundle config updated:"); + if i64::from(new_target_free) != current.target_free_percent { + println!( + " Target free
datasets: {}% -> {}%", + current.target_free_percent, new_target_free + ); + } else { + println!(" Target free datasets: {}% (unchanged)", new_target_free); + } + if i64::from(new_min_keep) != current.min_keep_percent { + println!( + " Minimum bundles to keep: {}% -> {}%", + current.min_keep_percent, new_min_keep + ); + } else { + println!(" Minimum bundles to keep: {}% (unchanged)", new_min_keep); + } + + Ok(()) +} diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index c7177a64abd..2771c31de7f 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -901,6 +901,7 @@ Commands: quiesce view or modify the quiesce status reconfigurator-config interact with reconfigurator config sagas view sagas, create and complete demo sagas + support-bundle-config interact with support bundle auto-deletion config sleds interact with sleds support-bundles interact with support bundles [aliases: sb] update-status show running artifact versions diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 9f113378edc..9cd96788090 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -27,7 +27,6 @@ use std::collections::HashMap; use std::fmt; use std::net::IpAddr; use std::net::SocketAddr; -use std::num::NonZeroU32; use std::time::Duration; use uuid::Uuid; @@ -487,25 +486,8 @@ pub struct SupportBundleCollectorConfig { /// Default: Off #[serde(default)] pub disable: bool, - - /// Target number of free debug datasets to maintain for new allocations. - /// - /// When the number of free debug datasets drops below this value, the - /// oldest support bundles will be automatically deleted to free up space. - /// - /// Default: None (auto-deletion disabled) - #[serde(default)] - pub target_free_datasets: Option<NonZeroU32>, - - /// Minimum number of bundles to retain, even if `target_free_datasets` - /// would otherwise trigger deletion. - /// - /// This prevents aggressive cleanup on small systems where the number of - /// debug datasets is less than or equal to `target_free_datasets`. - /// - /// Default: None (no minimum, can delete all bundles) - #[serde(default)] - pub min_bundles_to_keep: Option<NonZeroU32>, + // NOTE: Auto-deletion configuration (target_free_percent, min_keep_percent) + // is now stored in the database's support_bundle_config table. } @@ -1387,8 +1369,6 @@ mod test { SupportBundleCollectorConfig { period_secs: Duration::from_secs(30), disable: false, - target_free_datasets: None, - min_bundles_to_keep: None, }, physical_disk_adoption: PhysicalDiskAdoptionConfig { period_secs: Duration::from_secs(30), diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs index b9dc9f55b17..6cb08a16b14 100644 --- a/nexus/db-model/src/support_bundle.rs +++ b/nexus/db-model/src/support_bundle.rs @@ -5,6 +5,7 @@ use super::impl_enum_type; use crate::typed_uuid::DbTypedUuid; use nexus_db_schema::schema::support_bundle; +use nexus_db_schema::schema::support_bundle_config; use chrono::{DateTime, Utc}; use nexus_types::external_api::shared::SupportBundleInfo as SupportBundleView; @@ -136,3 +137,18 @@ impl From<SupportBundle> for SupportBundleView { } } + +/// Configuration for automatic support bundle deletion. +/// +/// This table uses a singleton pattern - exactly one row exists, created by +/// the schema migration. The row is only updated, never inserted or deleted.
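+///
+/// The percent fields are `i64` here because they map to CockroachDB `INT8`
+/// columns; valid values are always in the 0-100 range.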
+#[derive(Clone, Debug, Queryable, Selectable, Serialize, Deserialize)] +#[diesel(table_name = support_bundle_config)] +pub struct SupportBundleConfig { + pub singleton: bool, + /// Percentage (0-100) of total datasets to keep free for new allocations. + pub target_free_percent: i64, + /// Percentage (0-100) of total datasets to retain as bundles (minimum). + pub min_keep_percent: i64, + pub time_modified: DateTime<Utc>, +} diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index ae327eb437d..0813e83369b 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -32,7 +32,6 @@ use omicron_common::api::external::LookupResult; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SupportBundleUuid; -use std::num::NonZeroU32; use uuid::Uuid; const CANNOT_ALLOCATE_ERR_MSG: &'static str = "Current policy limits support bundle creation to 'one per external disk', and \ @@ -543,33 +542,29 @@ impl DataStore { /// Atomically finds and deletes support bundles to maintain free dataset buffer. /// /// This method performs a single atomic operation that: - /// 1. Calculates how many deletions are needed (free_datasets < target) - /// 2. Applies min_bundles_to_keep constraint (never delete below minimum) - /// 3. Finds the N oldest Active bundles - /// 4. Transitions them to Destroying state atomically - /// 5. Returns what was actually deleted + /// 1. Reads config from the `support_bundle_config` table + /// 2. Calculates thresholds based on percentage of total datasets + /// 3. Calculates how many deletions are needed (free_datasets < target) + /// 4. Applies min_keep constraint (never delete below minimum) + /// 5. Finds the N oldest Active bundles + /// 6. Transitions them to Destroying state atomically + /// 7. Returns what was actually deleted /// /// This prevents over-deletion when multiple Nexuses run concurrently, /// because the calculation and state transitions happen in a single /// database operation. /// - /// The `min_bundles_to_keep` parameter prevents aggressive cleanup on - /// small systems by ensuring we never delete bundles if doing so would - /// leave fewer than this many bundles. + /// Configuration is read from the database, ensuring all Nexus replicas + /// use consistent values. pub async fn support_bundle_auto_delete( &self, opctx: &OpContext, - target_free_datasets: NonZeroU32, - min_bundles_to_keep: Option<NonZeroU32>, ) -> Result<AutoDeletionResult, Error> { opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; - let query = support_bundle_auto_delete_query( - target_free_datasets, - min_bundles_to_keep, - ); + let query = support_bundle_auto_delete_query(); // Return type: (total_datasets, used_datasets, active_bundles, deleted_ids) let result: (i64, i64, i64, Vec<Uuid>) = query @@ -593,20 +588,81 @@ impl DataStore { free_datasets: total_datasets.saturating_sub(used_datasets), }) } + + /// Get the current support bundle auto-deletion config. + /// + /// The singleton row always exists (created by schema migration).
+ pub async fn support_bundle_config_get( + &self, + opctx: &OpContext, + ) -> Result<SupportBundleConfig, Error> { + use nexus_db_schema::schema::support_bundle_config::dsl; + + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + + let conn = self.pool_connection_authorized(opctx).await?; + + dsl::support_bundle_config + .select(crate::db::model::SupportBundleConfig::as_select()) + .first_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Set support bundle auto-deletion config. + /// + /// Values are percentages (0-100). The singleton row always exists + /// (created by schema migration), so this is an UPDATE operation. + pub async fn support_bundle_config_set( + &self, + opctx: &OpContext, + target_free_percent: u8, + min_keep_percent: u8, + ) -> Result<(), Error> { + use nexus_db_schema::schema::support_bundle_config::dsl; + + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + + // Validate percentages are in range + if target_free_percent > 100 { + return Err(Error::invalid_request( + "target_free_percent must be between 0 and 100", + )); + } + if min_keep_percent > 100 { + return Err(Error::invalid_request( + "min_keep_percent must be between 0 and 100", + )); + } + + let conn = self.pool_connection_authorized(opctx).await?; + + diesel::update(dsl::support_bundle_config) + .filter(dsl::singleton.eq(true)) + .set(( + dsl::target_free_percent.eq(i64::from(target_free_percent)), + dsl::min_keep_percent.eq(i64::from(min_keep_percent)), + dsl::time_modified.eq(chrono::Utc::now()), + )) + .execute_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } } /// Builds the CTE query for atomic support bundle auto-deletion. /// /// This query atomically: -/// 1. Counts datasets and bundles -/// 2. Calculates how many bundles to delete (respecting min_bundles_to_keep) -/// 3. Selects the oldest Active bundles -/// 4. Transitions them to Destroying state -/// 5. Returns the stats and deleted IDs -pub fn support_bundle_auto_delete_query( - target_free_datasets: NonZeroU32, - min_bundles_to_keep: Option<NonZeroU32>, -) -> TypedSqlQuery<( +/// 1. Reads config from support_bundle_config table +/// 2. Calculates thresholds based on percentage of total datasets using CEIL +/// 3. Counts datasets and bundles +/// 4. Calculates how many bundles to delete (respecting min_keep) +/// 5. Selects the oldest Active bundles +/// 6. Transitions them to Destroying state +/// 7.
Returns the stats and deleted IDs +pub fn support_bundle_auto_delete_query() -> TypedSqlQuery<( diesel::sql_types::BigInt, diesel::sql_types::BigInt, diesel::sql_types::BigInt, @@ -614,15 +670,18 @@ pub fn support_bundle_auto_delete_query( )> { use crate::db::raw_query_builder::QueryBuilder; - let min_keep = min_bundles_to_keep.map(|n| i64::from(n.get())).unwrap_or(0); - let mut query = QueryBuilder::new(); // Build the CTE query - query - .sql( - " + query.sql( + " WITH + -- Read config from database (singleton row always exists from migration) + config AS ( + SELECT target_free_percent, min_keep_percent + FROM support_bundle_config + WHERE singleton = true + ), -- Count non-tombstoned datasets (uses lookup_usable_rendezvous_debug_dataset index) dataset_count AS ( SELECT COUNT(*) as total @@ -638,32 +697,32 @@ WITH FROM support_bundle WHERE state IN ('collecting', 'active', 'destroying', 'failing') ), - -- Count active bundles (for min_bundles_to_keep check) + -- Count active bundles (for min_keep check) active_count AS ( SELECT COUNT(*) as active FROM support_bundle WHERE state = 'active' ), + -- Calculate absolute thresholds from percentages using CEIL. + -- CEIL ensures we always round up, so 10% of 5 datasets = 1, not 0. + thresholds AS ( + SELECT + CEIL((SELECT total FROM dataset_count) * (SELECT target_free_percent FROM config) / 100.0)::INT8 as target_free, + CEIL((SELECT total FROM dataset_count) * (SELECT min_keep_percent FROM config) / 100.0)::INT8 as min_keep + ), -- Calculate how many bundles we need AND are allowed to delete. - -- min_bundles_to_keep is respected first (max_deletable constraint). + -- min_keep is respected first (max_deletable constraint). deletion_calc AS ( SELECT (SELECT total FROM dataset_count) as total_datasets, (SELECT used FROM used_count) as used_datasets, (SELECT active FROM active_count) as active_bundles, GREATEST(0, - ", - ) - .param() - .sql( - " - ((SELECT total FROM dataset_count) - (SELECT used FROM used_count)) + (SELECT target_free FROM thresholds) + - ((SELECT total FROM dataset_count) - (SELECT used FROM used_count)) ) as bundles_needed, GREATEST(0, - (SELECT active FROM active_count) - ", - ) - .param() - .sql( - " + (SELECT active FROM active_count) - (SELECT min_keep FROM thresholds) ) as max_deletable ), -- Find the N oldest active bundles we're allowed to delete. 
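As a sanity check on the CEIL thresholds above, here is the same computation in plain Rust (illustrative only; integer ceiling via the usual add-then-divide trick, valid for the non-negative inputs used here):

    fn threshold(total_datasets: i64, percent: i64) -> i64 {
        // Equivalent to CEIL(total_datasets * percent / 100.0)
        (total_datasets * percent + 99) / 100
    }

    fn main() {
        assert_eq!(threshold(5, 10), 1); // 10% of 5 datasets still reserves one
        assert_eq!(threshold(100, 10), 10);
        assert_eq!(threshold(5, 0), 0); // 0% disables the constraint
    }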
@@ -690,9 +749,7 @@ SELECT (SELECT active_bundles FROM deletion_calc), ARRAY(SELECT id FROM deleted) as deleted_ids ", - ) - .bind::<diesel::sql_types::BigInt, _>(i64::from(target_free_datasets.get())) - .bind::<diesel::sql_types::BigInt, _>(min_keep); + ); query.query() } @@ -1731,14 +1788,15 @@ mod test { // Create 5 debug datasets (pools) let _test_sled = create_sled_and_zpools(&datastore, &opctx, 5).await; - // With target_free=3 and 5 datasets, no bundles, free=5 >= 3 - // Should delete no bundles + // Set config: 60% target_free (CEIL(5*60/100)=3), 0% min_keep + // With 5 datasets, no bundles, free=5 >= 3, should delete no bundles + datastore + .support_bundle_config_set(&opctx, 60, 0) + .await + .expect("Should set config"); + let result = datastore - .support_bundle_auto_delete( - &opctx, - NonZeroU32::new(3).unwrap(), - None, - ) + .support_bundle_auto_delete(&opctx) .await .expect("Should succeed"); @@ -1781,13 +1839,15 @@ mod test { .expect("Should update state"); } - // With target_free=3 and free=5 >= 3, should delete no bundles + // Set config: 30% target_free (CEIL(10*30/100)=3), 0% min_keep + // With 10 datasets, 5 bundles, free=5 >= 3, should delete no bundles + datastore + .support_bundle_config_set(&opctx, 30, 0) + .await + .expect("Should set config"); + let result = datastore - .support_bundle_auto_delete( - &opctx, - NonZeroU32::new(3).unwrap(), - None, - ) + .support_bundle_auto_delete(&opctx) .await .expect("Should succeed"); @@ -1836,13 +1896,15 @@ mod test { tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; } - // With target_free=2 and free=0, we need to delete 2 bundles + // Set config: 40% target_free (CEIL(5*40/100)=2), 0% min_keep + // With 5 datasets, 5 bundles, free=0 < 2, need to delete 2 bundles + datastore + .support_bundle_config_set(&opctx, 40, 0) + .await + .expect("Should set config"); + let result = datastore - .support_bundle_auto_delete( - &opctx, - NonZeroU32::new(2).unwrap(), - None, - ) + .support_bundle_auto_delete(&opctx) .await .expect("Should succeed"); @@ -1890,14 +1952,16 @@ mod test { .expect("Should update state"); } - // With target_free=3 and free=0, we'd want to delete 3 bundles - // But min_bundles_to_keep=4 means we can only delete 1 (5-4=1) + // Set config: 60% target_free (CEIL(5*60/100)=3), 80% min_keep (CEIL(5*80/100)=4) + // With free=0, we'd want to delete 3 bundles + // But min_keep=4 means we can only delete 1 (5-4=1) + datastore + .support_bundle_config_set(&opctx, 60, 80) + .await + .expect("Should set config"); + let result = datastore - .support_bundle_auto_delete( - &opctx, - NonZeroU32::new(3).unwrap(), - Some(NonZeroU32::new(4).unwrap()), - ) + .support_bundle_auto_delete(&opctx) .await .expect("Should succeed"); @@ -1942,22 +2006,23 @@ mod test { .expect("Should update state"); } - // With target_free=5 (want all datasets free) and free=2 - // We'd want to delete 3 bundles, but min_bundles_to_keep=5 is > 3 active + // Set config: 100% target_free (CEIL(5*100/100)=5), 100% min_keep (CEIL(5*100/100)=5) + // With free=2, we'd want to delete 3 bundles, but min_keep=5 > active=3 // So we can't delete any + datastore + .support_bundle_config_set(&opctx, 100, 100) + .await + .expect("Should set config"); + let result = datastore - .support_bundle_auto_delete( - &opctx, - NonZeroU32::new(5).unwrap(), - Some(NonZeroU32::new(5).unwrap()), - ) + .support_bundle_auto_delete(&opctx) .await .expect("Should succeed"); assert_eq!(result.total_datasets, 5); assert_eq!(result.active_bundles, 3); assert_eq!(result.free_datasets, 2); - // min_bundles_to_keep (5) > active_bundles
(3), so no deletion + // min_keep (5) > active_bundles (3), so no deletion assert!(result.deleted_ids.is_empty()); db.terminate().await; @@ -2012,15 +2077,17 @@ mod test { .await .expect("Should update state"); - // With target_free=5 and 3 bundles (1 Active, 1 Collecting, 1 Destroying), + // Set config: 100% target_free (CEIL(5*100/100)=5), 0% min_keep + // With 3 bundles (1 Active, 1 Collecting, 1 Destroying), // free_datasets = 5 - 3 = 2 (ALL bundles occupy datasets, not just Active) // We should only delete Active bundles though + datastore + .support_bundle_config_set(&opctx, 100, 0) + .await + .expect("Should set config"); + let result = datastore - .support_bundle_auto_delete( - &opctx, - NonZeroU32::new(5).unwrap(), - None, - ) + .support_bundle_auto_delete(&opctx) .await .expect("Should succeed"); @@ -2082,13 +2149,15 @@ mod test { assert_eq!(bundle.state, SupportBundleState::Active); } - // With target_free=2 and free=0, delete 2 bundles + // Set config: 50% target_free → CEIL(3*50/100)=2, 0% min_keep + datastore + .support_bundle_config_set(&opctx, 50, 0) + .await + .expect("Should set config"); + + // With target_free=2 (from 50% of 3 datasets) and free=0, delete 2 bundles let result = datastore - .support_bundle_auto_delete( - &opctx, - NonZeroU32::new(2).unwrap(), - None, - ) + .support_bundle_auto_delete(&opctx) .await .expect("Should succeed"); @@ -2156,13 +2225,15 @@ mod test { bundle_ids.push(bundle.id); } + // Set config: 20% target_free → CEIL(3*20/100)=1, 0% min_keep + datastore + .support_bundle_config_set(&opctx, 20, 0) + .await + .expect("Should set config"); + // All 3 datasets are used, free=0 let result = datastore - .support_bundle_auto_delete( - &opctx, - NonZeroU32::new(1).unwrap(), - None, - ) + .support_bundle_auto_delete(&opctx) .await .expect("Should succeed"); @@ -2193,12 +2264,9 @@ mod test { // - Total datasets: 3 // - Used by non-Failed bundles: 2 (Destroying + Active) // - Free datasets: 3 - 2 = 1 + // Config still: 20% target_free (CEIL(3*20/100)=1), 0% min_keep let result = datastore - .support_bundle_auto_delete( - &opctx, - NonZeroU32::new(1).unwrap(), - None, - ) + .support_bundle_auto_delete(&opctx) .await .expect("Should succeed"); @@ -2222,10 +2290,7 @@ mod test { let db = TestDatabase::new_with_pool(&logctx.log).await; let conn = db.pool().claim().await.unwrap(); - let query = support_bundle_auto_delete_query( - NonZeroU32::new(3).unwrap(), - Some(NonZeroU32::new(5).unwrap()), - ); + let query = support_bundle_auto_delete_query(); let _ = query.explain_async(&conn).await.unwrap_or_else(|e| { panic!("Failed to explain query, is it valid SQL?\nerror: {e:#?}") @@ -2239,10 +2304,7 @@ mod test { async fn expectorate_auto_delete_query() { use crate::db::raw_query_builder::expectorate_query_contents; - let query = support_bundle_auto_delete_query( - NonZeroU32::new(3).unwrap(), - Some(NonZeroU32::new(5).unwrap()), - ); + let query = support_bundle_auto_delete_query(); expectorate_query_contents( query, "tests/output/support_bundle_auto_delete.sql", diff --git a/nexus/db-queries/tests/output/support_bundle_auto_delete.sql b/nexus/db-queries/tests/output/support_bundle_auto_delete.sql index ff34596d66c..9f9db27054d 100644 --- a/nexus/db-queries/tests/output/support_bundle_auto_delete.sql +++ b/nexus/db-queries/tests/output/support_bundle_auto_delete.sql @@ -1,4 +1,8 @@ WITH + config + AS ( + SELECT target_free_percent, min_keep_percent FROM support_bundle_config WHERE singleton = true + ), dataset_count AS (SELECT count(*) AS total FROM 
rendezvous_debug_dataset WHERE time_tombstoned IS NULL), used_count @@ -11,15 +15,32 @@ WITH state IN ('collecting', 'active', 'destroying', 'failing') ), active_count AS (SELECT count(*) AS active FROM support_bundle WHERE state = 'active'), + thresholds + AS ( + SELECT + ceil( + (SELECT total FROM dataset_count) * (SELECT target_free_percent FROM config) / 100.0 + )::INT8 + AS target_free, + ceil( + (SELECT total FROM dataset_count) * (SELECT min_keep_percent FROM config) / 100.0 + )::INT8 + AS min_keep + ), deletion_calc AS ( SELECT (SELECT total FROM dataset_count) AS total_datasets, (SELECT used FROM used_count) AS used_datasets, (SELECT active FROM active_count) AS active_bundles, - greatest(0, $1 - ((SELECT total FROM dataset_count) - (SELECT used FROM used_count))) + greatest( + 0, + (SELECT target_free FROM thresholds) + - ((SELECT total FROM dataset_count) - (SELECT used FROM used_count)) + ) AS bundles_needed, - greatest(0, (SELECT active FROM active_count) - $2) AS max_deletable + greatest(0, (SELECT active FROM active_count) - (SELECT min_keep FROM thresholds)) + AS max_deletable ), candidates AS ( diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 46658e98c19..f973d61b2ad 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1532,6 +1532,15 @@ table! { } } +table! { + support_bundle_config (singleton) { + singleton -> Bool, + target_free_percent -> Int8, + min_keep_percent -> Int8, + time_modified -> Timestamptz, + } +} + /* hardware inventory */ table! { diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index e7a406b6ef4..0f2ad163549 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -632,8 +632,6 @@ impl BackgroundTasksInitializer { resolver.clone(), config.support_bundle_collector.disable, nexus_id, - config.support_bundle_collector.target_free_datasets, - config.support_bundle_collector.min_bundles_to_keep, ), ), opctx: opctx.child(BTreeMap::new()), diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 717bd827a19..a648760123c 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -28,7 +28,6 @@ use omicron_uuid_kinds::ZpoolUuid; use serde_json::json; use sled_agent_types::support_bundle::NESTED_DATASET_NOT_FOUND; use slog_error_chain::InlineErrorChain; -use std::num::NonZeroU32; use std::sync::Arc; use super::support_bundle::collection::BundleCollection; @@ -58,10 +57,6 @@ pub struct SupportBundleCollector { resolver: Resolver, disable: bool, nexus_id: OmicronZoneUuid, - /// Target number of free debug datasets to maintain for new allocations. - target_free_datasets: Option<NonZeroU32>, - /// Minimum number of bundles to retain to prevent aggressive cleanup. - min_bundles_to_keep: Option<NonZeroU32>, } impl SupportBundleCollector { @@ -70,17 +65,8 @@ impl SupportBundleCollector { resolver: Resolver, disable: bool, nexus_id: OmicronZoneUuid, - target_free_datasets: Option<NonZeroU32>, - min_bundles_to_keep: Option<NonZeroU32>, ) -> Self { - SupportBundleCollector { - datastore, - resolver, - disable, - nexus_id, - target_free_datasets, - min_bundles_to_keep, - } + SupportBundleCollector { datastore, resolver, disable, nexus_id } } // Tells a sled agent to delete a support bundle @@ -326,8 +312,9 @@ impl SupportBundleCollector { /// Atomically finds and marks bundles for automatic deletion.
/// /// This maintains a buffer of free debug datasets for new bundle - /// allocations by deleting the oldest bundles when the free dataset - /// count drops below `target_free_datasets`. + /// allocations. Configuration (target_free_percent, min_keep_percent) + /// is read from the database, ensuring all Nexus replicas use consistent + /// values. /// /// The operation is atomic: finding candidates and transitioning them /// to Destroying state happens in a single database query. This prevents @@ -338,20 +325,9 @@ impl SupportBundleCollector { ) -> SupportBundleAutoDeletionReport { let mut report = SupportBundleAutoDeletionReport::default(); - // Skip if auto-deletion is disabled - let Some(target_free) = self.target_free_datasets else { - return report; - }; - - // Atomically find and delete bundles in a single query - let result = self - .datastore - .support_bundle_auto_delete( - opctx, - target_free, - self.min_bundles_to_keep, - ) - .await; + // Atomically find and delete bundles in a single query. + // Config is read from the database within the query. + let result = self.datastore.support_bundle_auto_delete(opctx).await; let auto_deleted = match result { Ok(r) => r, @@ -381,7 +357,6 @@ impl SupportBundleCollector { "SupportBundleCollector: Auto-deleted bundle to free dataset capacity"; "id" => %id, "free_datasets" => auto_deleted.free_datasets, - "target_free" => target_free.get(), ); } @@ -577,8 +552,6 @@ mod test { resolver.clone(), false, nexus.id(), - None, // target_free_datasets - None, // min_bundles_to_keep ); let report = collector @@ -605,8 +578,6 @@ mod test { resolver.clone(), false, nexus.id(), - None, // target_free_datasets - None, // min_bundles_to_keep ); let request = BundleRequest::default(); @@ -914,8 +885,6 @@ mod test { resolver.clone(), false, nexus.id(), - None, // target_free_datasets - None, // min_bundles_to_keep ); // The bundle collection should complete successfully. @@ -995,8 +964,6 @@ mod test { resolver.clone(), false, nexus.id(), - None, // target_free_datasets - None, // min_bundles_to_keep ); // Collect the bundle @@ -1108,8 +1075,6 @@ mod test { resolver.clone(), false, nexus.id(), - None, // target_free_datasets - None, // min_bundles_to_keep ); // The bundle collection should complete successfully. @@ -1218,8 +1183,6 @@ mod test { resolver.clone(), false, nexus.id(), - None, // target_free_datasets - None, // min_bundles_to_keep ); // Each time we call "collect_bundle", we collect a SINGLE bundle. 
@@ -1334,8 +1297,6 @@ mod test { resolver.clone(), false, nexus.id(), - None, // target_free_datasets - None, // min_bundles_to_keep ); let report = collector @@ -1389,8 +1350,6 @@ mod test { resolver.clone(), false, nexus.id(), - None, // target_free_datasets - None, // min_bundles_to_keep ); let mut request = BundleRequest::default(); request.data_selection.insert(BundleData::HostInfo(HashSet::new())); @@ -1490,8 +1449,6 @@ mod test { resolver.clone(), false, nexus.id(), - None, // target_free_datasets - None, // min_bundles_to_keep ); let report = collector @@ -1548,8 +1505,6 @@ mod test { resolver.clone(), false, nexus.id(), - None, // target_free_datasets - None, // min_bundles_to_keep ); let mut request = BundleRequest::default(); request.data_selection.insert(BundleData::HostInfo(HashSet::new())); @@ -1635,8 +1590,6 @@ mod test { resolver.clone(), false, nexus.id(), - None, // target_free_datasets - None, // min_bundles_to_keep ); let mut request = BundleRequest::default(); request.data_selection.insert(BundleData::HostInfo(HashSet::new())); @@ -1721,8 +1674,6 @@ mod test { resolver.clone(), false, nexus.id(), - None, // target_free_datasets - None, // min_bundles_to_keep ); // Collect the bundle diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 78ec7b74f9c..a4449a9b613 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -29,7 +29,6 @@ use omicron_common::api::external::LookupType; use omicron_uuid_kinds::SupportBundleUuid; use serde::Deserialize; use std::io::Cursor; -use std::num::NonZeroU32; use zip::read::ZipArchive; type ControlPlaneTestContext = @@ -938,49 +937,46 @@ async fn test_support_bundle_delete_failed_bundle( // Test automatic deletion of support bundles to maintain free dataset capacity. // // This test verifies that: -// 1. Auto-deletion kicks in when free_datasets < target_free_datasets +// 1. Auto-deletion kicks in when free_datasets < target threshold // 2. The oldest bundles are marked for deletion first -// 3. min_bundles_to_keep is respected +// 3. min_keep percentage is respected // -// Configuration: target_free_datasets=1, min_bundles_to_keep=1, 5 datasets +// Configuration: 20% target_free (CEIL(5*20/100)=1), 20% min_keep (CEIL(5*20/100)=1), 5 datasets // Create 5 bundles (filling all datasets), and verify auto-deletion happens // when we run out of free datasets. 
#[tokio::test] async fn test_support_bundle_auto_deletion() { - // Create a test context with auto-deletion enabled: - // - target_free_datasets=1: try to maintain 1 free dataset - // - min_bundles_to_keep=1: always keep at least 1 bundle let cptestctx = nexus_test_utils::ControlPlaneBuilder::new( "test_support_bundle_auto_deletion", ) - .customize_nexus_config(&|config| { - config - .pkg - .background_tasks - .support_bundle_collector - .target_free_datasets = Some(NonZeroU32::new(1).unwrap()); - config - .pkg - .background_tasks - .support_bundle_collector - .min_bundles_to_keep = Some(NonZeroU32::new(1).unwrap()); - }) .start::() .await; let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + // Set auto-deletion config in the database: + // - target_free_percent=20: CEIL(5*20/100)=1 free dataset target + // - min_keep_percent=20: CEIL(5*20/100)=1 bundle minimum + datastore + .support_bundle_config_set(&opctx, 20, 20) + .await + .expect("Should be able to set config"); // Create 5 zpools, giving us 5 debug datasets let _disk_test = DiskTestBuilder::new(&cptestctx).with_zpool_count(5).build().await; // Create and activate bundles one by one. - // With 5 datasets and target_free=1: + // With 5 datasets and 20% target_free (CEIL(5*20/100)=1): // - After bundles 1-4: free >= 1, so no auto-delete needed // - When creating bundle 5: // - Before collection: 4 Active + 1 Collecting = 5 bundles, free = 0 // - Auto-delete triggers: want 1 free, have 0, delete 1 oldest - // - min_keep=1, active=4, max_deletable=3, so we CAN delete + // - 20% min_keep (CEIL(5*20/100)=1), active=4, max_deletable=3, so we CAN delete // - Result: oldest bundle deleted, then bundle 5 gets collected let mut bundle_ids = Vec::new(); for i in 0..5 { diff --git a/schema/crdb/bundle-state-index/up2.sql b/schema/crdb/bundle-state-index/up2.sql new file mode 100644 index 00000000000..fa26870b6f2 --- /dev/null +++ b/schema/crdb/bundle-state-index/up2.sql @@ -0,0 +1,18 @@ +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle_config ( + -- Singleton pattern: only one row allowed + singleton BOOL PRIMARY KEY DEFAULT TRUE CHECK (singleton = TRUE), + + -- Percentage (0-100) of total datasets to keep free for new allocations. + -- Calculated as CEIL(total_datasets * target_free_percent / 100). + -- Example: 10% of 100 datasets = 10 free, 10% of 5 datasets = 1 free. + target_free_percent INT8 NOT NULL + CHECK (target_free_percent >= 0 AND target_free_percent <= 100), + + -- Percentage (0-100) of total datasets to retain as bundles (minimum). + -- Calculated as CEIL(total_datasets * min_keep_percent / 100). + -- Prevents aggressive cleanup on small systems. 
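+ -- Example: with 5 datasets, min_keep_percent = 20 keeps CEIL(1.0) = 1 bundle.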
+ min_keep_percent INT8 NOT NULL + CHECK (min_keep_percent >= 0 AND min_keep_percent <= 100), + + time_modified TIMESTAMPTZ NOT NULL DEFAULT NOW() +); diff --git a/schema/crdb/bundle-state-index/up3.sql b/schema/crdb/bundle-state-index/up3.sql new file mode 100644 index 00000000000..458ac3d3354 --- /dev/null +++ b/schema/crdb/bundle-state-index/up3.sql @@ -0,0 +1,4 @@ +-- Default: 10% free datasets, keep at least 10% worth of bundles +INSERT INTO omicron.public.support_bundle_config (singleton, target_free_percent, min_keep_percent, time_modified) +VALUES (TRUE, 10, 10, NOW()) +ON CONFLICT (singleton) DO NOTHING; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 9c560f453e6..9c9c532c265 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2931,6 +2931,36 @@ CREATE INDEX IF NOT EXISTS lookup_bundle_by_state_and_creation ON omicron.public time_created ); +/* + * Support Bundle Config + * + * Configuration for automatic support bundle deletion. This table uses a + * singleton pattern (exactly one row) to store cluster-wide configuration. + */ +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle_config ( + -- Singleton pattern: only one row allowed + singleton BOOL PRIMARY KEY DEFAULT TRUE CHECK (singleton = TRUE), + + -- Percentage (0-100) of total datasets to keep free for new allocations. + -- Calculated as CEIL(total_datasets * target_free_percent / 100). + -- Example: 10% of 100 datasets = 10 free, 10% of 5 datasets = 1 free. + target_free_percent INT8 NOT NULL + CHECK (target_free_percent >= 0 AND target_free_percent <= 100), + + -- Percentage (0-100) of total datasets to retain as bundles (minimum). + -- Calculated as CEIL(total_datasets * min_keep_percent / 100). + -- Prevents aggressive cleanup on small systems. 
+ min_keep_percent INT8 NOT NULL + CHECK (min_keep_percent >= 0 AND min_keep_percent <= 100), + + time_modified TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +-- Default: 10% free datasets, keep at least 10% worth of bundles +INSERT INTO omicron.public.support_bundle_config (singleton, target_free_percent, min_keep_percent, time_modified) +VALUES (TRUE, 10, 10, NOW()) +ON CONFLICT (singleton) DO NOTHING; + /*******************************************************************/ /* diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 54b057adbb7..b9b843fbcbd 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -53,8 +53,6 @@ inventory.disable_collect = false phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 support_bundle_collector.period_secs = 30 -support_bundle_collector.target_free_datasets = 16 -support_bundle_collector.min_bundles_to_keep = 16 decommissioned_disk_cleaner.period_secs = 60 blueprints.period_secs_load = 10 blueprints.period_secs_plan = 60 diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index cc5601c49dd..c7e9e5c4317 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -53,8 +53,6 @@ inventory.disable_collect = false phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 support_bundle_collector.period_secs = 30 -support_bundle_collector.target_free_datasets = 1 -support_bundle_collector.min_bundles_to_keep = 4 decommissioned_disk_cleaner.period_secs = 60 blueprints.period_secs_load = 10 blueprints.period_secs_plan = 60 From 564395cfcea51a19adbec313dcba106f28565aee Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 21 Jan 2026 17:28:34 -0800 Subject: [PATCH 5/6] Add more tests, fix accounting for destroying/failing bundles --- .../src/db/datastore/support_bundle.rs | 224 ++++++++++++++++-- .../output/support_bundle_auto_delete.sql | 11 +- 2 files changed, 210 insertions(+), 25 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 0813e83369b..9d9babeec31 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -688,14 +688,20 @@ WITH FROM rendezvous_debug_dataset WHERE time_tombstoned IS NULL ), - -- Count bundles occupying datasets. - -- Bundles that occupy datasets: Collecting, Active, Destroying, Failing + -- Count bundles occupying datasets for deletion calculation purposes. + -- We only count stable states: Collecting and Active. + -- + -- Why exclude transitional states (Destroying, Failing)? + -- These bundles are already in cleanup transitions and their datasets will + -- be freed soon. Including them would cause concurrent auto-delete operations + -- to each see free=0 and keep deleting more bundles than necessary. + -- + -- Why exclude Failed? -- Failed bundles do NOT occupy datasets (their dataset was expunged). - -- Uses lookup_bundle_by_state_and_creation index. used_count AS ( SELECT COUNT(*) as used FROM support_bundle - WHERE state IN ('collecting', 'active', 'destroying', 'failing') + WHERE state IN ('collecting', 'active') ), -- Count active bundles (for min_keep check) active_count AS ( @@ -727,11 +733,13 @@ WITH ), -- Find the N oldest active bundles we're allowed to delete. -- Uses lookup_bundle_by_state_and_creation index for ordering. 
+ -- Secondary sort on id ensures deterministic selection when timestamps match, + -- which is critical for preventing over-deletion under concurrent execution. candidates AS ( SELECT id FROM support_bundle WHERE state = 'active' - ORDER BY time_created ASC + ORDER BY time_created ASC, id ASC LIMIT (SELECT LEAST(bundles_needed, max_deletable) FROM deletion_calc) ), -- Atomically transition to Destroying (only if still Active). @@ -2078,8 +2086,9 @@ mod test { .expect("Should update state"); // Set config: 100% target_free (CEIL(5*100/100)=5), 0% min_keep - // With 3 bundles (1 Active, 1 Collecting, 1 Destroying), - // free_datasets = 5 - 3 = 2 (ALL bundles occupy datasets, not just Active) + // With 3 bundles (1 Active, 1 Collecting, 1 Destroying): + // - used_count = 2 (Active + Collecting; Destroying not counted for deletion calc) + // - free_datasets = 5 - 2 = 3 // We should only delete Active bundles though datastore .support_bundle_config_set(&opctx, 100, 0) @@ -2094,10 +2103,10 @@ mod test { assert_eq!(result.total_datasets, 5); // Only 1 bundle is Active (candidates for deletion) assert_eq!(result.active_bundles, 1); - // Free datasets: 5 total - 3 bundles = 2 - // (All bundles occupy datasets regardless of state) - assert_eq!(result.free_datasets, 2); - // We want 5 free but only have 2, so we want to delete 3 + // Free datasets: 5 total - 2 used (Active + Collecting) = 3 + // (Destroying bundles are not counted as they're already being freed) + assert_eq!(result.free_datasets, 3); + // We want 5 free but only have 3, so we want to delete 2 // But we only have 1 Active bundle to delete assert_eq!(result.deleted_ids.len(), 1); assert_eq!(result.deleted_ids[0], bundle1.id.into()); @@ -2262,17 +2271,19 @@ mod test { // Now we have: 1 Destroying, 1 Failed, 1 Active // - Total datasets: 3 - // - Used by non-Failed bundles: 2 (Destroying + Active) - // - Free datasets: 3 - 2 = 1 + // - Used for deletion calc: 1 (only Active; Destroying not counted) + // - Free datasets: 3 - 1 = 2 // Config still: 20% target_free (CEIL(3*20/100)=1), 0% min_keep let result = datastore .support_bundle_auto_delete(&opctx) .await .expect("Should succeed"); - // Free should be 1 now (Failed bundle doesn't count as using a dataset) - assert_eq!(result.free_datasets, 1); - // Since free=1 >= target=1, no deletion needed + // Free should be 2 now: + // - Failed bundle doesn't count (dataset was expunged) + // - Destroying bundle doesn't count (already being freed) + assert_eq!(result.free_datasets, 2); + // Since free=2 >= target=1, no deletion needed assert!(result.deleted_ids.is_empty()); db.terminate().await; @@ -2311,4 +2322,185 @@ mod test { ) .await; } + + /// Test that concurrent auto-deletion operations don't over-delete bundles. + /// + /// This verifies the atomic CTE prevents the TOCTTOU issue where multiple + /// Nexuses running concurrently could each decide to delete bundles based + /// on stale state, resulting in more deletions than intended. 
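+ /// The guard is the `state = 'active'` filter on the UPDATE in the `deleted`
+ /// CTE: once one operation claims a bundle, concurrent operations no longer
+ /// see it as a candidate.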
+ #[tokio::test] + async fn test_auto_deletion_concurrent_execution_prevents_over_deletion() { + let logctx = dev::test_setup_log( + "test_auto_deletion_concurrent_execution_prevents_over_deletion", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let nexus_id = OmicronZoneUuid::new_v4(); + + // Create 10 debug datasets + let _test_sled = create_sled_and_zpools(&datastore, &opctx, 10).await; + + // Create 10 bundles (all slots filled) + for _ in 0..10 { + let bundle = datastore + .support_bundle_create(&opctx, "for tests", nexus_id, None) + .await + .expect("Should be able to create bundle"); + + let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); + datastore + .support_bundle_update( + &opctx, + &authz_bundle, + SupportBundleState::Active, + ) + .await + .expect("Should update state"); + } + + // Set config: 20% target_free (CEIL(10*20/100)=2), 0% min_keep + // With 10 datasets, 10 bundles, free=0 < 2, need to delete 2 bundles + datastore + .support_bundle_config_set(&opctx, 20, 0) + .await + .expect("Should set config"); + + // Spawn multiple concurrent auto-delete operations. + // Without the atomic CTE, each would see free=0 and try to delete 2, + // potentially resulting in 10 deletions (5 tasks × 2 each). + // With the atomic CTE, only the first operation(s) should delete, + // and subsequent ones should see the updated state. + let num_concurrent_tasks = 5; + let mut handles = Vec::new(); + + for _ in 0..num_concurrent_tasks { + let datastore = datastore.clone(); + let opctx = opctx.child(std::collections::BTreeMap::new()); + handles.push(tokio::spawn(async move { + datastore.support_bundle_auto_delete(&opctx).await + })); + } + + // Collect all results + let mut total_deleted = 0; + for handle in handles { + let result = handle.await.expect("Task should complete"); + let result = result.expect("Auto-delete should succeed"); + total_deleted += result.deleted_ids.len(); + } + + // The key assertion: we should have deleted exactly 2 bundles total, + // not 2 × num_concurrent_tasks. The atomic CTE ensures that once + // bundles are transitioned to Destroying, they're no longer candidates + // for other concurrent operations (the UPDATE's WHERE state='active' + // clause filters them out). 
+ assert_eq!(
+ total_deleted, 2,
+ "Should delete exactly 2 bundles total across all concurrent \
+ operations, not {} (which would indicate over-deletion)",
+ total_deleted
+ );
+
+ // Verify the final state: 8 Active bundles remain
+ use nexus_db_schema::schema::support_bundle::dsl;
+ let conn = datastore.pool_connection_authorized(&opctx).await.unwrap();
+ let active_count: i64 = dsl::support_bundle
+ .filter(dsl::state.eq(SupportBundleState::Active))
+ .count()
+ .get_result_async(&*conn)
+ .await
+ .expect("Should count active bundles");
+
+ assert_eq!(
+ active_count, 8,
+ "Should have 8 Active bundles remaining after concurrent deletion"
+ );
+
+ // Verify we now have 2 Destroying bundles
+ let destroying_count: i64 = dsl::support_bundle
+ .filter(dsl::state.eq(SupportBundleState::Destroying))
+ .count()
+ .get_result_async(&*conn)
+ .await
+ .expect("Should count destroying bundles");
+
+ assert_eq!(
+ destroying_count, 2,
+ "Should have exactly 2 bundles in Destroying state"
+ );
+
+ db.terminate().await;
+ logctx.cleanup_successful();
+ }
+
+ #[tokio::test]
+ async fn test_config_set_rejects_invalid_target_free_percent() {
+ let logctx = dev::test_setup_log(
+ "test_config_set_rejects_invalid_target_free_percent",
+ );
+ let db = TestDatabase::new_with_datastore(&logctx.log).await;
+ let (opctx, datastore) = (db.opctx(), db.datastore());
+
+ // 101% should be rejected
+ let result = datastore.support_bundle_config_set(&opctx, 101, 10).await;
+
+ assert!(
+ result.is_err(),
+ "Setting target_free_percent > 100 should fail"
+ );
+ let err = result.unwrap_err();
+ assert!(
+ err.to_string().contains("target_free_percent"),
+ "Error message should mention target_free_percent: {}",
+ err
+ );
+
+ // Verify valid values still work
+ datastore
+ .support_bundle_config_set(&opctx, 100, 10)
+ .await
+ .expect("100% should be valid");
+
+ datastore
+ .support_bundle_config_set(&opctx, 0, 10)
+ .await
+ .expect("0% should be valid");
+
+ db.terminate().await;
+ logctx.cleanup_successful();
+ }
+
+ #[tokio::test]
+ async fn test_config_set_rejects_invalid_min_keep_percent() {
+ let logctx = dev::test_setup_log(
+ "test_config_set_rejects_invalid_min_keep_percent",
+ );
+ let db = TestDatabase::new_with_datastore(&logctx.log).await;
+ let (opctx, datastore) = (db.opctx(), db.datastore());
+
+ // 101% should be rejected
+ let result = datastore.support_bundle_config_set(&opctx, 10, 101).await;
+
+ assert!(result.is_err(), "Setting min_keep_percent > 100 should fail");
+ let err = result.unwrap_err();
+ assert!(
+ err.to_string().contains("min_keep_percent"),
+ "Error message should mention min_keep_percent: {}",
+ err
+ );
+
+ // Verify valid values still work
+ datastore
+ .support_bundle_config_set(&opctx, 10, 100)
+ .await
+ .expect("100% should be valid");
+
+ datastore
+ .support_bundle_config_set(&opctx, 10, 0)
+ .await
+ .expect("0% should be valid");
+
+ db.terminate().await;
+ logctx.cleanup_successful();
+ }
 }
diff --git a/nexus/db-queries/tests/output/support_bundle_auto_delete.sql b/nexus/db-queries/tests/output/support_bundle_auto_delete.sql
index 9f9db27054d..d54856c353e 100644
--- a/nexus/db-queries/tests/output/support_bundle_auto_delete.sql
+++ b/nexus/db-queries/tests/output/support_bundle_auto_delete.sql
@@ -6,14 +6,7 @@ WITH
 dataset_count
 AS (SELECT count(*) AS total FROM rendezvous_debug_dataset WHERE time_tombstoned IS NULL),
 used_count
- AS (
- SELECT
- count(*) AS used
- FROM
- support_bundle
- WHERE
- state IN ('collecting', 'active', 'destroying', 'failing')
- ),
+ AS (SELECT count(*) AS used FROM support_bundle WHERE state IN ('collecting', 'active')),
 active_count
 AS (SELECT count(*) AS active FROM support_bundle WHERE state = 'active'),
 thresholds
 AS (
@@ -51,7 +44,7 @@ WITH
 WHERE
 state = 'active'
 ORDER BY
- time_created ASC
+ time_created ASC, id ASC
 LIMIT
 (SELECT least(bundles_needed, max_deletable) FROM deletion_calc)
 ),

From 73d2699bcc9e6b1f8e061be54a558983f53beef6 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Thu, 22 Jan 2026 11:57:17 -0800
Subject: [PATCH 6/6] self-review

---
 nexus-config/src/nexus_config.rs              |  2 -
 .../src/db/datastore/support_bundle.rs        | 68 ++++++++-----------
 .../output/support_bundle_auto_delete.sql     | 36 ++++------
 3 files changed, 41 insertions(+), 65 deletions(-)

diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs
index de6603182cb..d9defb6c1bc 100644
--- a/nexus-config/src/nexus_config.rs
+++ b/nexus-config/src/nexus_config.rs
@@ -488,8 +488,6 @@ pub struct SupportBundleCollectorConfig {
 /// Default: Off
 #[serde(default)]
 pub disable: bool,
- // NOTE: Auto-deletion configuration (target_free_percent, min_keep_percent)
- // is now stored in the database's support_bundle_config table.
 }
 
 #[serde_as]
diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs
index 9d9babeec31..e588eaa142d 100644
--- a/nexus/db-queries/src/db/datastore/support_bundle.rs
+++ b/nexus/db-queries/src/db/datastore/support_bundle.rs
@@ -590,8 +590,6 @@ impl DataStore {
 }
 
 /// Get the current support bundle auto-deletion config.
- ///
- /// The singleton row always exists (created by schema migration).
 pub async fn support_bundle_config_get(
 &self,
 opctx: &OpContext,
@@ -611,8 +609,7 @@ impl DataStore {
 
 /// Set support bundle auto-deletion config.
 ///
- /// Values are percentages (0-100). The singleton row always exists
- /// (created by schema migration), so this is an UPDATE operation.
+ /// Values are percentages (0-100).
 pub async fn support_bundle_config_set(
 &self,
 opctx: &OpContext,
@@ -676,71 +673,66 @@ pub fn support_bundle_auto_delete_query() -> TypedSqlQuery<(
 query.sql(
 "
 WITH
- -- Read config from database (singleton row always exists from migration)
+ -- Read config from database
 config AS (
 SELECT target_free_percent, min_keep_percent
 FROM support_bundle_config
 WHERE singleton = true
 ),
- -- Count non-tombstoned datasets (uses lookup_usable_rendezvous_debug_dataset index)
+ -- Count non-tombstoned datasets
 dataset_count AS (
 SELECT COUNT(*) as total
 FROM rendezvous_debug_dataset
 WHERE time_tombstoned IS NULL
 ),
- -- Count bundles occupying datasets for deletion calculation purposes.
- -- We only count stable states: Collecting and Active.
- --
- -- Why exclude transitional states (Destroying, Failing)?
- -- These bundles are already in cleanup transitions and their datasets will
- -- be freed soon. Including them would cause concurrent auto-delete operations
- -- to each see free=0 and keep deleting more bundles than necessary.
- --
- -- Why exclude Failed?
- -- Failed bundles do NOT occupy datasets (their dataset was expunged).
+ -- Count bundles occupying datasets (stable states: Collecting, Active).
+ -- Bundles in other states (Destroying, Failing, Failed) either do not use
+ -- datasets, or will transition to not using datasets imminently.
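+ -- (A Failed bundle's dataset was expunged, so it is already free.)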
 used_count AS (
 SELECT COUNT(*) as used
 FROM support_bundle
 WHERE state IN ('collecting', 'active')
 ),
- -- Count active bundles (for min_keep check)
+ -- Count only Active bundles (which could be deleted).
+ -- We don't want to auto-delete bundles which are still being collected, so
+ -- this is effectively a count of 'viable targets to be deleted'.
 active_count AS (
 SELECT COUNT(*) as active
 FROM support_bundle
 WHERE state = 'active'
 ),
- -- Calculate absolute thresholds from percentages using CEIL.
+ -- Calculate how many bundles we want to delete AND are allowed to delete.
+ -- Uses CROSS JOIN to combine single-row CTEs, making all columns accessible.
 -- CEIL ensures we always round up, so 10% of 5 datasets = 1, not 0.
- thresholds AS (
- SELECT
- CEIL((SELECT total FROM dataset_count) * (SELECT target_free_percent FROM config) / 100.0)::INT8 as target_free,
- CEIL((SELECT total FROM dataset_count) * (SELECT min_keep_percent FROM config) / 100.0)::INT8 as min_keep
- ),
- -- Calculate how many bundles we need AND are allowed to delete.
- -- min_keep is respected first (max_deletable constraint).
 deletion_calc AS (
 SELECT
- (SELECT total FROM dataset_count) as total_datasets,
- (SELECT used FROM used_count) as used_datasets,
- (SELECT active FROM active_count) as active_bundles,
+ d.total as total_datasets,
+ u.used as used_datasets,
+ a.active as active_bundles,
+ -- 'Count we want free' - 'Count actually free'
 GREATEST(0,
- (SELECT target_free FROM thresholds) -
- ((SELECT total FROM dataset_count) - (SELECT used FROM used_count))
- ) as bundles_needed,
+ CEIL(d.total * c.target_free_percent / 100.0)::INT8 - (d.total - u.used)
+ ) as autodeletion_count,
+ -- 'Count we can delete' - 'Count we must keep'
 GREATEST(0,
- (SELECT active FROM active_count) - (SELECT min_keep FROM thresholds)
+ a.active - CEIL(d.total * c.min_keep_percent / 100.0)::INT8
 ) as max_deletable
+ FROM dataset_count d
+ CROSS JOIN used_count u
+ CROSS JOIN active_count a
+ CROSS JOIN config c
 ),
 -- Find the N oldest active bundles we're allowed to delete.
 -- Uses lookup_bundle_by_state_and_creation index for ordering.
- -- Secondary sort on id ensures deterministic selection when timestamps match,
- -- which is critical for preventing over-deletion under concurrent execution.
 candidates AS (
 SELECT id FROM support_bundle
 WHERE state = 'active'
+ -- Secondary sort on id ensures deterministic selection when timestamps match,
+ -- which helps with test determinism if the timestamps don't have sufficient
+ -- granularity between bundle creation times.
 ORDER BY time_created ASC, id ASC
- LIMIT (SELECT LEAST(bundles_needed, max_deletable) FROM deletion_calc)
+ LIMIT (SELECT LEAST(autodeletion_count, max_deletable) FROM deletion_calc)
 ),
 -- Atomically transition to Destroying (only if still Active).
 -- The state='active' check handles concurrent user deletions.
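To sanity-check the deletion_calc arithmetic above, the following standalone
Rust sketch mirrors the query's computation (the helper name, the integer
ceiling trick, and the main() driver are illustrative only; none of this
exists in the patch):

    // Mirror of deletion_calc: how many bundles the query would mark.
    fn bundles_to_delete(
        total_datasets: i64,
        used_datasets: i64,
        active_bundles: i64,
        target_free_percent: i64,
        min_keep_percent: i64,
    ) -> i64 {
        // Integer equivalent of CEIL(total * pct / 100.0) for non-negative inputs.
        let ceil_pct = |pct: i64| (total_datasets * pct + 99) / 100;
        // 'Count we want free' - 'Count actually free'
        let autodeletion_count =
            (ceil_pct(target_free_percent) - (total_datasets - used_datasets)).max(0);
        // 'Count we can delete' - 'Count we must keep'
        let max_deletable = (active_bundles - ceil_pct(min_keep_percent)).max(0);
        // LEAST(autodeletion_count, max_deletable)
        autodeletion_count.min(max_deletable)
    }

    fn main() {
        // Concurrent-execution test: 10 datasets, 10 Active bundles, 20%/0% => 2.
        assert_eq!(bundles_to_delete(10, 10, 10, 20, 0), 2);
        // Destroying/Failed test: 3 datasets, 1 used, 1 Active, 20%/0% => 0.
        assert_eq!(bundles_to_delete(3, 1, 1, 20, 0), 0);
    }

Running it reproduces the counts asserted in the datastore tests above.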
@@ -1785,8 +1777,6 @@ mod test { logctx.cleanup_successful(); } - // Tests for support_bundle_auto_delete - #[tokio::test] async fn test_auto_deletion_no_bundles() { let logctx = dev::test_setup_log("test_auto_deletion_no_bundles"); @@ -2293,11 +2283,11 @@ mod test { // Tests for the auto_delete CTE query #[tokio::test] - async fn test_auto_delete_query_explains() { + async fn test_auto_delete_query_explain() { use crate::db::explain::ExplainableAsync; use crate::db::pub_test_utils::TestDatabase; - let logctx = dev::test_setup_log("test_auto_delete_query_explains"); + let logctx = dev::test_setup_log("test_auto_delete_query_explain"); let db = TestDatabase::new_with_pool(&logctx.log).await; let conn = db.pool().claim().await.unwrap(); diff --git a/nexus/db-queries/tests/output/support_bundle_auto_delete.sql b/nexus/db-queries/tests/output/support_bundle_auto_delete.sql index d54856c353e..fd38aa1b3cd 100644 --- a/nexus/db-queries/tests/output/support_bundle_auto_delete.sql +++ b/nexus/db-queries/tests/output/support_bundle_auto_delete.sql @@ -8,32 +8,20 @@ WITH used_count AS (SELECT count(*) AS used FROM support_bundle WHERE state IN ('collecting', 'active')), active_count AS (SELECT count(*) AS active FROM support_bundle WHERE state = 'active'), - thresholds - AS ( - SELECT - ceil( - (SELECT total FROM dataset_count) * (SELECT target_free_percent FROM config) / 100.0 - )::INT8 - AS target_free, - ceil( - (SELECT total FROM dataset_count) * (SELECT min_keep_percent FROM config) / 100.0 - )::INT8 - AS min_keep - ), deletion_calc AS ( SELECT - (SELECT total FROM dataset_count) AS total_datasets, - (SELECT used FROM used_count) AS used_datasets, - (SELECT active FROM active_count) AS active_bundles, - greatest( - 0, - (SELECT target_free FROM thresholds) - - ((SELECT total FROM dataset_count) - (SELECT used FROM used_count)) - ) - AS bundles_needed, - greatest(0, (SELECT active FROM active_count) - (SELECT min_keep FROM thresholds)) - AS max_deletable + d.total AS total_datasets, + u.used AS used_datasets, + a.active AS active_bundles, + greatest(0, ceil(d.total * c.target_free_percent / 100.0)::INT8 - (d.total - u.used)) + AS autodeletion_count, + greatest(0, a.active - ceil(d.total * c.min_keep_percent / 100.0)::INT8) AS max_deletable + FROM + dataset_count AS d + CROSS JOIN used_count AS u + CROSS JOIN active_count AS a + CROSS JOIN config AS c ), candidates AS ( @@ -46,7 +34,7 @@ WITH ORDER BY time_created ASC, id ASC LIMIT - (SELECT least(bundles_needed, max_deletable) FROM deletion_calc) + (SELECT least(autodeletion_count, max_deletable) FROM deletion_calc) ), deleted AS (