diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs
index fcd3eb57e25f..2d8b674f861b 100644
--- a/storage_controller/src/id_lock_map.rs
+++ b/storage_controller/src/id_lock_map.rs
@@ -112,6 +112,14 @@ where
         }
     }

+    pub(crate) fn try_exclusive(&self, key: T, operation: I) -> Option<TracingExclusiveGuard<I>> {
+        let mut locked = self.entities.lock().unwrap();
+        let entry = locked.entry(key).or_default().clone();
+        let mut guard = TracingExclusiveGuard::new(entry.try_write_owned().ok()?);
+        *guard.guard = Some(operation);
+        Some(guard)
+    }
+
     /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do
     /// periodic housekeeping to avoid the map growing indefinitely
     pub(crate) fn housekeeping(&self) {
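Unlike the existing blocking `exclusive()` helper on this map, `try_exclusive` gives up immediately when the per-tenant lock is contended and returns `None`, so callers on hot paths can simply skip the drop rather than wait. A minimal sketch of that non-blocking pattern using tokio's owned guards directly (a standalone toy, not the controller's `IdLockMap`; the `entry` binding is purely illustrative):

```rust
use std::sync::Arc;

#[tokio::main]
async fn main() {
    // Stand-in for a single entry of the per-tenant lock map (illustrative only).
    let entry = Arc::new(tokio::sync::RwLock::new(()));

    // Hold a shared (read) lock, as an in-flight request handler would.
    let shared = entry.clone().read_owned().await;

    // A non-blocking attempt at the exclusive (write) lock fails immediately
    // instead of waiting, so the caller can simply skip this tenant for now.
    assert!(entry.clone().try_write_owned().is_err());

    // Once the shared guard is gone, the same call succeeds.
    drop(shared);
    assert!(entry.clone().try_write_owned().is_ok());
}
```

This is why the call sites later in this patch treat a `None` result as "not now" rather than an error.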
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index cc377e606ee8..c5eb106f24ef 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -97,6 +97,7 @@ pub(crate) enum DatabaseOperation {
     TenantGenerations,
     ShardGenerations,
     ListTenantShards,
+    LoadTenant,
     InsertTenantShards,
     UpdateTenantShard,
     DeleteTenant,
@@ -330,11 +331,40 @@ impl Persistence {

     /// At startup, load the high level state for shards, such as their config + policy. This will
     /// be enriched at runtime with state discovered on pageservers.
-    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
+    ///
+    /// We exclude shards configured to be detached. During startup, if we see any attached locations
+    /// for such shards, they will automatically be detached as 'orphans'.
+    pub(crate) async fn load_active_tenant_shards(
+        &self,
+    ) -> DatabaseResult<Vec<TenantShardPersistence>> {
+        use crate::schema::tenant_shards::dsl::*;
         self.with_measured_conn(
             DatabaseOperation::ListTenantShards,
             move |conn| -> DatabaseResult<_> {
-                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
+                let query = tenant_shards.filter(
+                    placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
+                );
+                let result = query.load::<TenantShardPersistence>(conn)?;
+
+                Ok(result)
+            },
+        )
+        .await
+    }
+
+    /// When restoring a previously detached tenant into memory, load it from the database
+    pub(crate) async fn load_tenant(
+        &self,
+        filter_tenant_id: TenantId,
+    ) -> DatabaseResult<Vec<TenantShardPersistence>> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_measured_conn(
+            DatabaseOperation::LoadTenant,
+            move |conn| -> DatabaseResult<_> {
+                let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string()));
+                let result = query.load::<TenantShardPersistence>(conn)?;
+
+                Ok(result)
             },
         )
         .await
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 359fcb3288e3..fd4ee7fd101e 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -155,6 +155,7 @@ enum TenantOperations {
     TimelineArchivalConfig,
     TimelineDetachAncestor,
     TimelineGcBlockUnblock,
+    DropDetached,
 }

 #[derive(Clone, strum_macros::Display)]
@@ -416,8 +417,8 @@ pub struct Service {
     /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile
     /// Send into this queue to promptly attempt to reconcile this shard next time units are available.
     ///
-    /// Note that this state logically lives inside ServiceInner, but carrying Sender here makes the code simpler
-    /// by avoiding needing a &mut ref to something inside the ServiceInner. This could be optimized to
+    /// Note that this state logically lives inside ServiceState, but carrying Sender here makes the code simpler
+    /// by avoiding needing a &mut ref to something inside the ServiceState. This could be optimized to
    /// use a VecDeque instead of a channel to reduce synchronization overhead, at the cost of some code complexity.
     delayed_reconcile_tx: tokio::sync::mpsc::Sender<TenantShardId>,

@@ -1165,6 +1166,20 @@ impl Service {
             }
         }

+        // If we just finished detaching all shards for a tenant, it might be time to drop it from memory.
+        if tenant.policy == PlacementPolicy::Detached {
+            // We may only drop a tenant from memory while holding the exclusive lock on the tenant ID: this protects us
+            // from concurrent execution wrt a request handler that might expect the tenant to remain in memory for the
+            // duration of the request.
+            let guard = self.tenant_op_locks.try_exclusive(
+                tenant.tenant_shard_id.tenant_id,
+                TenantOperations::DropDetached,
+            );
+            if let Some(guard) = guard {
+                self.maybe_drop_tenant(tenant.tenant_shard_id.tenant_id, &mut locked, &guard);
+            }
+        }
+
         // Maybe some other work can proceed now that this job finished.
         if self.reconciler_concurrency.available_permits() > 0 {
             while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() {
@@ -1294,7 +1309,7 @@ impl Service {
             .set(nodes.len() as i64);

         tracing::info!("Loading shards from database...");
-        let mut tenant_shard_persistence = persistence.list_tenant_shards().await?;
+        let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?;
         tracing::info!(
             "Loaded {} shards from database.",
             tenant_shard_persistence.len()
@@ -1546,8 +1561,14 @@ impl Service {
         // the pageserver API (not via this service), we will auto-create any missing tenant
         // shards with default state.
         let insert = {
-            let locked = self.inner.write().unwrap();
-            !locked.tenants.contains_key(&attach_req.tenant_shard_id)
+            match self
+                .maybe_load_tenant(attach_req.tenant_shard_id.tenant_id, &_tenant_lock)
+                .await
+            {
+                Ok(_) => false,
+                Err(ApiError::NotFound(_)) => true,
+                Err(e) => return Err(e.into()),
+            }
         };

         if insert {
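Several of the changes below (for example `maybe_load_tenant` and `maybe_drop_tenant` in the next hunk) find all shards of a tenant by taking a `range` over the in-memory map keyed by `TenantShardId`. A simplified model of that lookup, using a plain tuple key as a stand-in for the real id type:

```rust
use std::collections::BTreeMap;
use std::ops::Bound;

// Simplified stand-in for TenantShardId: (tenant id, shard number).
type Key = (u128, u8);

/// All shards of `tenant` occupy a contiguous key range because the map is
/// ordered by tenant id first, so a bounded range visits exactly those entries.
fn tenant_range(tenant: u128) -> (Bound<Key>, Bound<Key>) {
    (
        Bound::Included((tenant, u8::MIN)),
        Bound::Included((tenant, u8::MAX)),
    )
}

fn main() {
    let mut tenants: BTreeMap<Key, &str> = BTreeMap::new();
    tenants.insert((1, 0), "tenant-1 shard 0");
    tenants.insert((1, 1), "tenant-1 shard 1");
    tenants.insert((2, 0), "tenant-2 shard 0");

    // "Is this tenant present in memory?" is just: does the range yield anything?
    assert!(tenants.range(tenant_range(1)).next().is_some());
    assert_eq!(tenants.range(tenant_range(1)).count(), 2);
    assert!(tenants.range(tenant_range(3)).next().is_none());
}
```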
@@ -2439,6 +2460,99 @@ impl Service {
         }
     }

+    /// For APIs that might act on tenants with [`PlacementPolicy::Detached`], first check if
+    /// the tenant is present in memory. If not, load it from the database. If it is found
+    /// in neither location, return a NotFound error.
+    ///
+    /// Caller must demonstrate they hold a lock guard, as otherwise two callers might try to load
+    /// it at the same time, or we might race with [`Self::maybe_drop_tenant`].
+    async fn maybe_load_tenant(
+        &self,
+        tenant_id: TenantId,
+        _guard: &TracingExclusiveGuard<TenantOperations>,
+    ) -> Result<(), ApiError> {
+        let present_in_memory = {
+            let locked = self.inner.read().unwrap();
+            locked
+                .tenants
+                .range(TenantShardId::tenant_range(tenant_id))
+                .next()
+                .is_some()
+        };
+
+        if present_in_memory {
+            return Ok(());
+        }
+
+        let tenant_shards = self.persistence.load_tenant(tenant_id).await?;
+        if tenant_shards.is_empty() {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant {} not found", tenant_id).into(),
+            ));
+        }
+
+        // TODO: choose a fresh AZ to use for this tenant when un-detaching: there definitely isn't a running
+        // compute, so no benefit to making AZ sticky across detaches.
+
+        let mut locked = self.inner.write().unwrap();
+        tracing::info!(
+            "Loaded {} shards for tenant {}",
+            tenant_shards.len(),
+            tenant_id
+        );
+
+        locked.tenants.extend(tenant_shards.into_iter().map(|p| {
+            let intent = IntentState::new();
+            let shard =
+                TenantShard::from_persistent(p, intent).expect("Corrupt shard row in database");
+
+            // Sanity check: when loading on-demand, we should always be loading something Detached
+            debug_assert!(shard.policy == PlacementPolicy::Detached);
+            if shard.policy != PlacementPolicy::Detached {
+                tracing::error!(
+                    "Tenant shard {} loaded on-demand, but has non-Detached policy {:?}",
+                    shard.tenant_shard_id,
+                    shard.policy
+                );
+            }
+
+            (shard.tenant_shard_id, shard)
+        }));
+
+        Ok(())
+    }
+
+    /// If all shards for a tenant are detached, and in a fully quiescent state (no observed locations on pageservers),
+    /// and have no reconciler running, then we can drop the tenant from memory. It will be reloaded on-demand
+    /// if we are asked to attach it again (see [`Self::maybe_load_tenant`]).
+    ///
+    /// Caller must demonstrate they hold a lock guard, as otherwise it is unsafe to drop a tenant from
+    /// memory while some other function might assume it continues to exist while not holding the lock on Self::inner.
+    fn maybe_drop_tenant(
+        &self,
+        tenant_id: TenantId,
+        locked: &mut std::sync::RwLockWriteGuard<ServiceState>,
+        _guard: &TracingExclusiveGuard<TenantOperations>,
+    ) {
+        let mut tenant_shards = locked.tenants.range(TenantShardId::tenant_range(tenant_id));
+        if tenant_shards.all(|(_id, shard)| {
+            shard.policy == PlacementPolicy::Detached
+                && shard.reconciler.is_none()
+                && shard.observed.is_empty()
+        }) {
+            let keys = locked
+                .tenants
+                .range(TenantShardId::tenant_range(tenant_id))
+                .map(|(id, _)| id)
+                .copied()
+                .collect::<Vec<_>>();
+            for key in keys {
+                tracing::info!("Dropping detached tenant shard {} from memory", key);
+                locked.tenants.remove(&key);
+            }
+        }
+    }
+
     /// This API is used by the cloud control plane to migrate unsharded tenants that it created
     /// directly with pageservers into this service.
     ///
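The drop condition in `maybe_drop_tenant` is deliberately conservative: every shard of the tenant must be Detached, have no reconciler task in flight, and have an empty observed state. A small, self-contained model of that predicate (the field names here are simplified stand-ins, not the controller's actual types):

```rust
#[derive(PartialEq)]
enum Policy {
    Attached,
    Detached,
}

struct ShardState {
    policy: Policy,
    reconciler_running: bool,
    observed_locations: usize,
}

/// A tenant may be dropped from memory only when it is fully quiescent:
/// nothing attached, nothing reconciling, nothing observed on pageservers.
fn can_drop_from_memory(shards: &[ShardState]) -> bool {
    shards.iter().all(|s| {
        s.policy == Policy::Detached && !s.reconciler_running && s.observed_locations == 0
    })
}

fn main() {
    let quiescent = vec![ShardState {
        policy: Policy::Detached,
        reconciler_running: false,
        observed_locations: 0,
    }];
    assert!(can_drop_from_memory(&quiescent));

    let still_busy = vec![ShardState {
        policy: Policy::Detached,
        reconciler_running: true, // reconcile still in flight: keep it in memory
        observed_locations: 0,
    }];
    assert!(!can_drop_from_memory(&still_busy));
}
```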
@@ -2465,14 +2579,26 @@ impl Service {
         )
         .await;

-        if !tenant_shard_id.is_unsharded() {
+        let tenant_id = if !tenant_shard_id.is_unsharded() {
             return Err(ApiError::BadRequest(anyhow::anyhow!(
                 "This API is for importing single-sharded or unsharded tenants"
             )));
-        }
+        } else {
+            tenant_shard_id.tenant_id
+        };
+
+        // In case we are waking up a Detached tenant
+        match self.maybe_load_tenant(tenant_id, &_tenant_lock).await {
+            Ok(()) | Err(ApiError::NotFound(_)) => {
+                // This is a creation or an update
+            }
+            Err(e) => {
+                return Err(e);
+            }
+        };

         // First check if this is a creation or an update
-        let create_or_update = self.tenant_location_config_prepare(tenant_shard_id.tenant_id, req);
+        let create_or_update = self.tenant_location_config_prepare(tenant_id, req);

         let mut result = TenantLocationConfigResponse {
             shards: Vec::new(),
@@ -2600,6 +2726,8 @@ impl Service {
         let tenant_id = req.tenant_id;
         let patch = req.config;

+        self.maybe_load_tenant(tenant_id, &_tenant_lock).await?;
+
         let base = {
             let locked = self.inner.read().unwrap();
             let shards = locked
@@ -2644,19 +2772,7 @@ impl Service {
         )
         .await;

-        let tenant_exists = {
-            let locked = self.inner.read().unwrap();
-            let mut r = locked
-                .tenants
-                .range(TenantShardId::tenant_range(req.tenant_id));
-            r.next().is_some()
-        };
-
-        if !tenant_exists {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(),
-            ));
-        }
+        self.maybe_load_tenant(req.tenant_id, &_tenant_lock).await?;

         self.set_tenant_config_and_reconcile(req.tenant_id, req.config)
             .await
@@ -2949,6 +3065,8 @@ impl Service {
         let _tenant_lock =
             trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await;

+        self.maybe_load_tenant(tenant_id, &_tenant_lock).await?;
+
         // Detach all shards. This also deletes local pageserver shard data.
         let (detach_waiters, node) = {
             let mut detach_waiters = Vec::new();
@@ -3068,6 +3186,8 @@ impl Service {
         )
         .await;

+        self.maybe_load_tenant(tenant_id, &_tenant_lock).await?;
+
         failpoint_support::sleep_millis_async!("tenant-update-policy-exclusive-lock");

         let TenantPolicyRequest {
@@ -5150,11 +5270,13 @@ impl Service {
             )));
         }

-        let mut shards = self.persistence.list_tenant_shards().await?;
-        shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));
+        let mut persistent_shards = self.persistence.load_active_tenant_shards().await?;
+        persistent_shards
+            .sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));
+
         expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));

-        if shards != expect_shards {
+        if persistent_shards != expect_shards {
             tracing::error!("Consistency check failed on shards.");
             tracing::error!(
                 "Shards in memory: {}",
@@ -5163,7 +5285,7 @@ impl Service {
             );
             tracing::error!(
                 "Shards in database: {}",
-                serde_json::to_string(&shards)
+                serde_json::to_string(&persistent_shards)
                     .map_err(|e| ApiError::InternalServerError(e.into()))?
             );
             return Err(ApiError::InternalServerError(anyhow::anyhow!(
@@ -6119,6 +6241,10 @@ impl Service {
         let mut pending_reconciles = 0;
         let mut az_violations = 0;

+        // If we find any tenants to drop from memory, stash them to offload after
+        // we're done traversing the map of tenants.
+        let mut drop_detached_tenants = Vec::new();
+
         let mut reconciles_spawned = 0;
         for shard in tenants.values_mut() {
             // Accumulate scheduling statistics
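The new `drop_detached_tenants` vector exists because the loop above it iterates with `tenants.values_mut()`, so entries cannot be removed from the map mid-iteration; candidates are recorded first and removed once the traversal is done. A minimal illustration of that collect-then-remove pattern:

```rust
use std::collections::BTreeMap;

fn main() {
    // key -> "is detached" (toy stand-in for the tenant map)
    let mut tenants: BTreeMap<u32, bool> = BTreeMap::new();
    tenants.insert(1, false);
    tenants.insert(2, true);
    tenants.insert(3, true);

    // Phase 1: while iterating (and possibly mutating values), only record candidates.
    let mut to_drop = Vec::new();
    for (id, detached) in tenants.iter_mut() {
        if *detached {
            to_drop.push(*id);
        }
    }

    // Phase 2: with the iteration finished, it is safe to remove entries.
    for id in to_drop {
        tenants.remove(&id);
    }

    assert_eq!(tenants.len(), 1);
}
```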
@@ -6152,6 +6278,25 @@ impl Service {
                 // Shard wanted to reconcile but for some reason couldn't.
                 pending_reconciles += 1;
             }
+
+            // If this tenant is detached, try dropping it from memory. This is usually done
+            // proactively in [`Self::process_results`], but we do it here to handle the edge
+            // case where a reconcile completes while someone else is holding an op lock for the tenant.
+            if shard.tenant_shard_id.shard_number == ShardNumber(0)
+                && shard.policy == PlacementPolicy::Detached
+            {
+                if let Some(guard) = self.tenant_op_locks.try_exclusive(
+                    shard.tenant_shard_id.tenant_id,
+                    TenantOperations::DropDetached,
+                ) {
+                    drop_detached_tenants.push((shard.tenant_shard_id.tenant_id, guard));
+                }
+            }
+        }
+
+        // Process any deferred tenant drops
+        for (tenant_id, guard) in drop_detached_tenants {
+            self.maybe_drop_tenant(tenant_id, &mut locked, &guard);
         }

         metrics::METRICS_REGISTRY
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index cba579e8a749..c17989a316c7 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -465,6 +465,10 @@ impl ObservedState {
             locations: HashMap::new(),
         }
     }
+
+    pub(crate) fn is_empty(&self) -> bool {
+        self.locations.is_empty()
+    }
 }

 impl TenantShard {
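`ObservedState::is_empty` is what lets `maybe_drop_tenant` confirm that no pageserver is still known to hold a location for the shard. A simplified model of the idea, assuming observed state is essentially a map from node id to the last-seen location (the `Observed` type and its fields are illustrative, not the real struct):

```rust
use std::collections::HashMap;

/// Simplified stand-in for per-shard observed state: which pageserver nodes
/// were last seen holding a location for this shard.
#[derive(Default)]
struct Observed {
    locations: HashMap<u64, String>, // node id -> last-seen location config (stand-in)
}

impl Observed {
    fn is_empty(&self) -> bool {
        self.locations.is_empty()
    }
}

fn main() {
    let mut observed = Observed::default();
    observed.locations.insert(7, "AttachedSingle".to_string());
    assert!(!observed.is_empty()); // a pageserver still holds a location: not quiescent

    // A successful detach reconcile clears the entry for that node.
    observed.locations.remove(&7);
    assert!(observed.is_empty()); // now, from this shard's point of view, safe to drop
}
```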
+ """ + + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + neon_env_builder.num_pageservers = 1 + + env = neon_env_builder.init_configs() + env.start() + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.storage_controller.tenant_create( + tenant_id, + shard_count=1, + ) + virtual_ps_http.timeline_create(PgVersion.NOT_SET, tenant_id, timeline_id) + + remote_prefix = "/".join( + ( + "tenants", + str(tenant_id), + ) + ) + # We will later check data is gone after deletion, so as a control check that it is present to begin with + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + assert len(env.storage_controller.tenant_list()) == 1 + + # Detach the tenant + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + # Ensure reconciles are done (the one we do inline in location_conf is advisory and if it takes too long that API just succeeds anyway) + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() + + # Confirm the detach happened on pageserver + assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == [] + # Confirm the tenant is not in memory on the controller + assert env.storage_controller.tenant_list() == [] + + # The detached tenant does not get loaded into memory across a controller restart + env.storage_controller.stop() + env.storage_controller.start() + assert env.storage_controller.tenant_list() == [] + env.storage_controller.consistency_check() + + # The detached tenant can be re-attached + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + assert len(env.storage_controller.tenant_list()) == 1 + env.storage_controller.consistency_check() + + # Detach it again before doing deletion + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() + + # A detached tenant can be deleted + virtual_ps_http.tenant_delete(tenant_id) + + # Such deletions really work (empty remote storage) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + @run_only_on_default_postgres("Postgres version makes no difference here") def test_storage_controller_node_flap_detach_race( neon_env_builder: NeonEnvBuilder,