From 375556fc80ef35fc772fcfee5d33dee111490b6e Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 26 Jun 2026 11:50:45 -0700 Subject: [PATCH] fix(controlplane): decouple draining-CP-row reaper from the drain wall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In remote mode HandoverDrainTimeout is intentionally 0 (unbounded session drain) so a rolling control plane never cuts an in-flight customer import. But the janitor's draining-CP-row reaper was wired to the same field (janitor.maxDrainTimeout = cfg.HandoverDrainTimeout), so its gate `if maxDrainTimeout > 0` was never satisfied and ExpireDrainingControlPlaneInstances never ran. A draining old-generation CP heartbeats on a non-cancellable context, so its cp_instances row stays state=draining with a fresh heartbeat and is never expired. ListOrphanedWorkers only lists workers whose owning CP is expired, so every worker owned by that draining CP becomes invisible to the orphan sweep — they leak, holding full per-worker reservations and pinning their nodes via karpenter.sh/do-not-disrupt. Introduce a dedicated, finite DrainingInstanceExpiryTimeout (default 2h) that bounds how long a draining CP row may linger before the leader janitor force-expires it, leaving HandoverDrainTimeout=0 (session drain) untouched. Resolution goes through effectiveDrainingInstanceExpiry, which never returns 0, so the reaper can no longer be silently disabled by an unset knob. Expiring a draining CP row cannot cut a live import: the orphan sweep already spares workers with an active/reconnecting Flight session. Plumbed through file (draining_instance_expiry_timeout), env (DUCKGRES_DRAINING_INSTANCE_EXPIRY_TIMEOUT), and CLI (--draining-instance-expiry-timeout), mirroring HandoverDrainTimeout. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/duckgres-controlplane/main.go | 35 ++--- configloader/file_config.go | 59 ++++---- configresolve/cliflags.go | 2 + configresolve/cliflags_test.go | 1 + configresolve/resolve.go | 143 +++++++++++-------- controlplane/control.go | 11 +- controlplane/multitenant.go | 29 +++- controlplane/multitenant_default_ttl_test.go | 28 ++++ 8 files changed, 201 insertions(+), 107 deletions(-) diff --git a/cmd/duckgres-controlplane/main.go b/cmd/duckgres-controlplane/main.go index a1ce77de..a276d5d9 100644 --- a/cmd/duckgres-controlplane/main.go +++ b/cmd/duckgres-controlplane/main.go @@ -203,22 +203,23 @@ func main() { MinWorkers: resolved.ProcessMinWorkers, MaxWorkers: resolved.ProcessMaxWorkers, }, - SocketDir: *socketDir, - ConfigPath: *configFile, - WorkerQueueTimeout: resolved.WorkerQueueTimeout, - WorkerIdleTimeout: resolved.WorkerIdleTimeout, - RetireOnSessionEnd: resolved.ProcessRetireOnSessionEnd, - HandoverDrainTimeout: resolved.HandoverDrainTimeout, - MetricsServer: metricsSrv, - WorkerBackend: resolved.WorkerBackend, - ConfigStoreConn: resolved.ConfigStoreConn, - ConfigPollInterval: resolved.ConfigPollInterval, - InternalSecret: resolved.InternalSecret, - InternalSecretFallbacks: resolved.InternalSecretFallbacks, - SNIRoutingMode: resolved.SNIRoutingMode, - ManagedHostnameSuffixes: resolved.ManagedHostnameSuffixes, - DucklingBucketSuffix: resolved.DucklingBucketSuffix, - DuckLakeDefaultSpecVersion: resolved.DuckLakeDefaultSpecVersion, + SocketDir: *socketDir, + ConfigPath: *configFile, + WorkerQueueTimeout: resolved.WorkerQueueTimeout, + WorkerIdleTimeout: resolved.WorkerIdleTimeout, + RetireOnSessionEnd: resolved.ProcessRetireOnSessionEnd, + HandoverDrainTimeout: resolved.HandoverDrainTimeout, + DrainingInstanceExpiryTimeout: resolved.DrainingInstanceExpiryTimeout, + MetricsServer: metricsSrv, + WorkerBackend: resolved.WorkerBackend, + ConfigStoreConn: resolved.ConfigStoreConn, + ConfigPollInterval: resolved.ConfigPollInterval, + InternalSecret: resolved.InternalSecret, + InternalSecretFallbacks: resolved.InternalSecretFallbacks, + SNIRoutingMode: resolved.SNIRoutingMode, + ManagedHostnameSuffixes: resolved.ManagedHostnameSuffixes, + DucklingBucketSuffix: resolved.DucklingBucketSuffix, + DuckLakeDefaultSpecVersion: resolved.DuckLakeDefaultSpecVersion, K8s: controlplane.K8sConfig{ WorkerImage: resolved.K8sWorkerImage, WorkerNamespace: resolved.K8sWorkerNamespace, @@ -244,7 +245,7 @@ func main() { WorkerProfileMinMemory: resolved.K8sWorkerProfileMinMemory, WorkerProfileMaxMemory: resolved.K8sWorkerProfileMaxMemory, WorkerMaxTTL: resolved.K8sWorkerMaxTTL, - WorkerDefaultTTL: resolved.K8sWorkerDefaultTTL, + WorkerDefaultTTL: resolved.K8sWorkerDefaultTTL, AWSRegion: resolved.AWSRegion, }, } diff --git a/configloader/file_config.go b/configloader/file_config.go index 8ad43642..fac2936f 100644 --- a/configloader/file_config.go +++ b/configloader/file_config.go @@ -13,35 +13,36 @@ package configloader // mode (e.g., the worker binary reads but ignores ControlPlane fields); // the cost is one parsed-but-unused struct field per binary. type FileConfig struct { - Host string `yaml:"host"` - Port int `yaml:"port"` - FlightPort int `yaml:"flight_port"` // Control-plane Flight SQL ingress port (0 disables) - FlightSessionIdleTTL string `yaml:"flight_session_idle_ttl"` // e.g., "10m" - FlightSessionReapInterval string `yaml:"flight_session_reap_interval"` // e.g., "1m" - FlightHandleIdleTTL string `yaml:"flight_handle_idle_ttl"` // e.g., "15m" - FlightSessionTokenTTL string `yaml:"flight_session_token_ttl"` // e.g., "1h" - DataDir string `yaml:"data_dir"` - TLS TLSConfig `yaml:"tls"` - Users map[string]string `yaml:"users"` - RateLimit RateLimitFileConfig `yaml:"rate_limit"` - Extensions []string `yaml:"extensions"` - DuckLake DuckLakeFileConfig `yaml:"ducklake"` - Iceberg IcebergFileConfig `yaml:"iceberg"` - FilePersistence bool `yaml:"file_persistence"` - ProcessIsolation bool `yaml:"process_isolation"` - IdleTimeout string `yaml:"idle_timeout"` - SessionInitTimeout string `yaml:"session_init_timeout"` - MemoryLimit string `yaml:"memory_limit"` - Threads int `yaml:"threads"` - MemoryBudget string `yaml:"memory_budget"` - MemoryRebalance *bool `yaml:"memory_rebalance"` - Process ProcessFileConfig `yaml:"process"` - WorkerQueueTimeout string `yaml:"worker_queue_timeout"` - WorkerIdleTimeout string `yaml:"worker_idle_timeout"` - HandoverDrainTimeout string `yaml:"handover_drain_timeout"` - PassthroughUsers []string `yaml:"passthrough_users"` - LogLevel string `yaml:"log_level"` - QueryLog QueryLogFileConfig `yaml:"query_log"` + Host string `yaml:"host"` + Port int `yaml:"port"` + FlightPort int `yaml:"flight_port"` // Control-plane Flight SQL ingress port (0 disables) + FlightSessionIdleTTL string `yaml:"flight_session_idle_ttl"` // e.g., "10m" + FlightSessionReapInterval string `yaml:"flight_session_reap_interval"` // e.g., "1m" + FlightHandleIdleTTL string `yaml:"flight_handle_idle_ttl"` // e.g., "15m" + FlightSessionTokenTTL string `yaml:"flight_session_token_ttl"` // e.g., "1h" + DataDir string `yaml:"data_dir"` + TLS TLSConfig `yaml:"tls"` + Users map[string]string `yaml:"users"` + RateLimit RateLimitFileConfig `yaml:"rate_limit"` + Extensions []string `yaml:"extensions"` + DuckLake DuckLakeFileConfig `yaml:"ducklake"` + Iceberg IcebergFileConfig `yaml:"iceberg"` + FilePersistence bool `yaml:"file_persistence"` + ProcessIsolation bool `yaml:"process_isolation"` + IdleTimeout string `yaml:"idle_timeout"` + SessionInitTimeout string `yaml:"session_init_timeout"` + MemoryLimit string `yaml:"memory_limit"` + Threads int `yaml:"threads"` + MemoryBudget string `yaml:"memory_budget"` + MemoryRebalance *bool `yaml:"memory_rebalance"` + Process ProcessFileConfig `yaml:"process"` + WorkerQueueTimeout string `yaml:"worker_queue_timeout"` + WorkerIdleTimeout string `yaml:"worker_idle_timeout"` + HandoverDrainTimeout string `yaml:"handover_drain_timeout"` + DrainingInstanceExpiryTimeout string `yaml:"draining_instance_expiry_timeout"` + PassthroughUsers []string `yaml:"passthrough_users"` + LogLevel string `yaml:"log_level"` + QueryLog QueryLogFileConfig `yaml:"query_log"` // Worker backend configuration WorkerBackend string `yaml:"worker_backend"` // "process" (default) or "remote" diff --git a/configresolve/cliflags.go b/configresolve/cliflags.go index 34308045..431dbed5 100644 --- a/configresolve/cliflags.go +++ b/configresolve/cliflags.go @@ -52,6 +52,7 @@ func RegisterCLIInputsFlags(fs *flag.FlagSet) func() CLIInputs { workerQueueTimeout := fs.String("worker-queue-timeout", "", "How long to wait for an available worker/org connection slot (e.g., '60s') (env: DUCKGRES_WORKER_QUEUE_TIMEOUT)") workerIdleTimeout := fs.String("worker-idle-timeout", "", "How long to keep an idle worker alive (e.g., '5m') (env: DUCKGRES_WORKER_IDLE_TIMEOUT)") handoverDrainTimeout := fs.String("handover-drain-timeout", "", "How long to wait for planned shutdowns/upgrades to drain before forcing exit (default: '24h' in process mode, '15m' in remote mode) (env: DUCKGRES_HANDOVER_DRAIN_TIMEOUT)") + drainingInstanceExpiryTimeout := fs.String("draining-instance-expiry-timeout", "", "How long a draining control-plane row may linger before the leader janitor force-expires it so its orphaned workers become reapable. Independent of --handover-drain-timeout. (default: '2h') (env: DUCKGRES_DRAINING_INSTANCE_EXPIRY_TIMEOUT)") acmeDomain := fs.String("acme-domain", "", "Domain for ACME/Let's Encrypt certificate (env: DUCKGRES_ACME_DOMAIN)") acmeEmail := fs.String("acme-email", "", "Contact email for Let's Encrypt notifications (env: DUCKGRES_ACME_EMAIL)") acmeCacheDir := fs.String("acme-cache-dir", "", "Directory for ACME certificate cache (env: DUCKGRES_ACME_CACHE_DIR)") @@ -112,6 +113,7 @@ func RegisterCLIInputsFlags(fs *flag.FlagSet) func() CLIInputs { cli.WorkerQueueTimeout = *workerQueueTimeout cli.WorkerIdleTimeout = *workerIdleTimeout cli.HandoverDrainTimeout = *handoverDrainTimeout + cli.DrainingInstanceExpiryTimeout = *drainingInstanceExpiryTimeout cli.ACMEDomain = *acmeDomain cli.ACMEEmail = *acmeEmail cli.ACMECacheDir = *acmeCacheDir diff --git a/configresolve/cliflags_test.go b/configresolve/cliflags_test.go index 5dea829b..24c9a595 100644 --- a/configresolve/cliflags_test.go +++ b/configresolve/cliflags_test.go @@ -63,6 +63,7 @@ func fieldNameToFlagName(name string) string { {"WorkerQueueTimeout", "worker-queue-timeout"}, {"WorkerIdleTimeout", "worker-idle-timeout"}, {"HandoverDrainTimeout", "handover-drain-timeout"}, + {"DrainingInstanceExpiryTimeout", "draining-instance-expiry-timeout"}, {"ProcessMinWorkers", "process-min-workers"}, {"ProcessMaxWorkers", "process-max-workers"}, {"ProcessRetireOnSessionEnd", "process-retire-on-session-end"}, diff --git a/configresolve/resolve.go b/configresolve/resolve.go index 241aeef0..708705e0 100644 --- a/configresolve/resolve.go +++ b/configresolve/resolve.go @@ -27,65 +27,66 @@ import ( type CLIInputs struct { Set map[string]bool - Host string - Port int - FlightPort int - FlightSessionIdleTTL string - FlightSessionReapInterval string - FlightHandleIdleTTL string - FlightSessionTokenTTL string - DataDir string - CertFile string - KeyFile string - FilePersistence bool - ProcessIsolation bool - IdleTimeout string - SessionInitTimeout string - MemoryLimit string - Threads int - MemoryBudget string - MemoryRebalance bool - DuckLakeDeltaCatalogEnabled bool - DuckLakeDeltaCatalogPath string - DuckLakeDefaultSpecVersion string - IcebergEnabled bool - IcebergRegion string - IcebergNamespace string - ProcessMinWorkers int - ProcessMaxWorkers int - ProcessRetireOnSessionEnd bool - WorkerQueueTimeout string - WorkerIdleTimeout string - HandoverDrainTimeout string - ACMEDomain string - ACMEEmail string - ACMECacheDir string - ACMEDNSProvider string - ACMEDNSZoneID string - MaxConnections int - ConfigStoreConn string - ConfigPollInterval string - InternalSecret string - InternalSecretFallbacks string - SNIRoutingMode string - ManagedHostnameSuffixes string - WorkerBackend string - K8sWorkerImage string - K8sWorkerNamespace string - K8sControlPlaneID string - K8sWorkerPort int - K8sWorkerSecret string - K8sWorkerConfigMap string - K8sWorkerImagePullPolicy string - K8sWorkerServiceAccount string - K8sMaxWorkers int - K8sWorkerCPURequest string - K8sWorkerMemoryRequest string - K8sWorkerNodeSelector string - K8sWorkerTolerationKey string - K8sWorkerTolerationValue string - AWSRegion string - QueryLog bool + Host string + Port int + FlightPort int + FlightSessionIdleTTL string + FlightSessionReapInterval string + FlightHandleIdleTTL string + FlightSessionTokenTTL string + DataDir string + CertFile string + KeyFile string + FilePersistence bool + ProcessIsolation bool + IdleTimeout string + SessionInitTimeout string + MemoryLimit string + Threads int + MemoryBudget string + MemoryRebalance bool + DuckLakeDeltaCatalogEnabled bool + DuckLakeDeltaCatalogPath string + DuckLakeDefaultSpecVersion string + IcebergEnabled bool + IcebergRegion string + IcebergNamespace string + ProcessMinWorkers int + ProcessMaxWorkers int + ProcessRetireOnSessionEnd bool + WorkerQueueTimeout string + WorkerIdleTimeout string + HandoverDrainTimeout string + DrainingInstanceExpiryTimeout string + ACMEDomain string + ACMEEmail string + ACMECacheDir string + ACMEDNSProvider string + ACMEDNSZoneID string + MaxConnections int + ConfigStoreConn string + ConfigPollInterval string + InternalSecret string + InternalSecretFallbacks string + SNIRoutingMode string + ManagedHostnameSuffixes string + WorkerBackend string + K8sWorkerImage string + K8sWorkerNamespace string + K8sControlPlaneID string + K8sWorkerPort int + K8sWorkerSecret string + K8sWorkerConfigMap string + K8sWorkerImagePullPolicy string + K8sWorkerServiceAccount string + K8sMaxWorkers int + K8sWorkerCPURequest string + K8sWorkerMemoryRequest string + K8sWorkerNodeSelector string + K8sWorkerTolerationKey string + K8sWorkerTolerationValue string + AWSRegion string + QueryLog bool } type Resolved struct { @@ -97,6 +98,7 @@ type Resolved struct { WorkerQueueTimeout time.Duration WorkerIdleTimeout time.Duration HandoverDrainTimeout time.Duration + DrainingInstanceExpiryTimeout time.Duration WorkerBackend string K8sWorkerImage string K8sWorkerNamespace string @@ -196,6 +198,7 @@ func ResolveEffective(fileCfg *configloader.FileConfig, cli CLIInputs, getenv fu workerQueueTimeout := 60 * time.Second var workerIdleTimeout time.Duration var handoverDrainTimeout time.Duration + var drainingInstanceExpiryTimeout time.Duration var processMinWorkers, processMaxWorkers int var processRetireOnSessionEnd bool var workerBackend string @@ -427,6 +430,13 @@ func ResolveEffective(fileCfg *configloader.FileConfig, cli CLIInputs, getenv fu warn("Invalid handover_drain_timeout duration: " + err.Error()) } } + if fileCfg.DrainingInstanceExpiryTimeout != "" { + if d, err := time.ParseDuration(fileCfg.DrainingInstanceExpiryTimeout); err == nil { + drainingInstanceExpiryTimeout = d + } else { + warn("Invalid draining_instance_expiry_timeout duration: " + err.Error()) + } + } if len(fileCfg.PassthroughUsers) > 0 { cfg.PassthroughUsers = make(map[string]bool, len(fileCfg.PassthroughUsers)) for _, u := range fileCfg.PassthroughUsers { @@ -760,6 +770,13 @@ func ResolveEffective(fileCfg *configloader.FileConfig, cli CLIInputs, getenv fu warn("Invalid DUCKGRES_HANDOVER_DRAIN_TIMEOUT duration: " + err.Error()) } } + if v := getenv("DUCKGRES_DRAINING_INSTANCE_EXPIRY_TIMEOUT"); v != "" { + if d, err := time.ParseDuration(v); err == nil { + drainingInstanceExpiryTimeout = d + } else { + warn("Invalid DUCKGRES_DRAINING_INSTANCE_EXPIRY_TIMEOUT duration: " + err.Error()) + } + } if v := getenv("DUCKGRES_ACME_DOMAIN"); v != "" { cfg.ACMEDomain = v } @@ -1101,6 +1118,13 @@ func ResolveEffective(fileCfg *configloader.FileConfig, cli CLIInputs, getenv fu warn("Invalid --handover-drain-timeout duration: " + err.Error()) } } + if cli.Set["draining-instance-expiry-timeout"] { + if d, err := time.ParseDuration(cli.DrainingInstanceExpiryTimeout); err == nil { + drainingInstanceExpiryTimeout = d + } else { + warn("Invalid --draining-instance-expiry-timeout duration: " + err.Error()) + } + } if cli.Set["acme-domain"] { cfg.ACMEDomain = cli.ACMEDomain } @@ -1260,6 +1284,7 @@ func ResolveEffective(fileCfg *configloader.FileConfig, cli CLIInputs, getenv fu WorkerQueueTimeout: workerQueueTimeout, WorkerIdleTimeout: workerIdleTimeout, HandoverDrainTimeout: handoverDrainTimeout, + DrainingInstanceExpiryTimeout: drainingInstanceExpiryTimeout, WorkerBackend: workerBackend, K8sWorkerImage: k8sWorkerImage, K8sWorkerNamespace: k8sWorkerNamespace, diff --git a/controlplane/control.go b/controlplane/control.go index 1a30617c..e34b4068 100644 --- a/controlplane/control.go +++ b/controlplane/control.go @@ -44,7 +44,16 @@ type ControlPlaneConfig struct { WorkerIdleTimeout time.Duration // How long to keep an idle worker alive (default: 5m) RetireOnSessionEnd bool // When true, process workers are retired immediately after their last session ends. HandoverDrainTimeout time.Duration // How long to wait for connections to drain during upgrade. 0 = unbounded (wait until k8s SIGKILL via terminationGracePeriodSeconds). Default: 0 in remote mode (so a CP rolling out doesn't kill in-flight customer queries at a self-imposed wall — see drainAndShutdown), 24h in process mode. - MetricsServer *http.Server // Optional metrics server to shut down during upgrade + // DrainingInstanceExpiryTimeout bounds how long a *draining* control-plane + // row may linger before the leader janitor force-expires it, so its orphaned + // workers become visible to the orphan sweep (ListOrphanedWorkers only lists + // workers whose owning CP is expired). This is deliberately independent of + // HandoverDrainTimeout: session draining stays unbounded (0) in remote mode so + // a rollout never cuts an in-flight import, while the draining *row* still gets + // a finite life so leaked workers can be reaped. 0 = use the built-in default + // (see effectiveDrainingInstanceExpiry); the reaper is never disabled. + DrainingInstanceExpiryTimeout time.Duration + MetricsServer *http.Server // Optional metrics server to shut down during upgrade // WorkerBackend selects the worker management backend. // "process" (default): workers are local child processes communicating over Unix sockets. diff --git a/controlplane/multitenant.go b/controlplane/multitenant.go index d89d9c6d..b385404d 100644 --- a/controlplane/multitenant.go +++ b/controlplane/multitenant.go @@ -41,6 +41,29 @@ func effectiveDefaultWorkerTTL(configured time.Duration) time.Duration { return defaultWorkerTTL } +// defaultDrainingInstanceExpiry is the built-in cap on how long a draining +// control-plane row may linger before the leader janitor force-expires it. It +// is deliberately generous: it only needs to bound leaked-worker dwell, and a +// value shorter than the longest legitimate session drain could expire a CP row +// while it is still serving (its actively-serving workers stay protected by the +// orphan sweep's Flight-session guard, but a finite cap should still not be +// aggressive). Operators tune it via DrainingInstanceExpiryTimeout. +const defaultDrainingInstanceExpiry = 2 * time.Hour + +// effectiveDrainingInstanceExpiry resolves the finite cap the leader janitor +// uses to expire stuck *draining* control-plane rows so their orphaned workers +// become reapable. A non-positive configured value falls back to +// defaultDrainingInstanceExpiry so the reaper is NEVER disabled by an unset +// knob — which is exactly the trap that let HandoverDrainTimeout=0 (unbounded +// session drain, correct) silently turn the draining-row reaper off when the +// two were wired to the same field. +func effectiveDrainingInstanceExpiry(configured time.Duration) time.Duration { + if configured > 0 { + return configured + } + return defaultDrainingInstanceExpiry +} + func (a *orgRouterAdapter) StackForOrg(orgID string) (WorkerPool, *SessionManager, *MemoryRebalancer, bool) { stack, ok := a.router.StackForOrg(orgID) if !ok { @@ -251,7 +274,11 @@ func SetupMultiTenant( 5*time.Second, ) janitor := NewControlPlaneJanitor(store, 5*time.Second, 20*time.Second) - janitor.maxDrainTimeout = cfg.HandoverDrainTimeout + // NOT cfg.HandoverDrainTimeout: that knob is the *session* drain wall and is + // intentionally 0 (unbounded) in remote mode, which would disable the + // draining-CP-row reaper (janitor.go: `if maxDrainTimeout > 0`) and leave a + // draining CP's orphaned workers unreapable. Use the dedicated finite cap. + janitor.maxDrainTimeout = effectiveDrainingInstanceExpiry(cfg.DrainingInstanceExpiryTimeout) janitor.hotIdleTTL = effectiveDefaultWorkerTTL(cfg.K8s.WorkerDefaultTTL) // Per-worker transitions (orphan retire, stuck reaper, hot-idle TTL) // all flow through this lifecycle service. The legacy retireWorker / diff --git a/controlplane/multitenant_default_ttl_test.go b/controlplane/multitenant_default_ttl_test.go index 0d3a6cbb..da819f65 100644 --- a/controlplane/multitenant_default_ttl_test.go +++ b/controlplane/multitenant_default_ttl_test.go @@ -18,3 +18,31 @@ func TestEffectiveDefaultWorkerTTL(t *testing.T) { t.Fatalf("expected negative hot-idle TTL to fall back to default, got %s", got) } } + +// TestEffectiveDrainingInstanceExpiry guards the draining-CP-row reaper against +// ever being disabled by an unset/zero cap — the exact regression where wiring +// janitor.maxDrainTimeout to HandoverDrainTimeout (0 in remote mode) silently +// turned the reaper off (`if maxDrainTimeout > 0` in the janitor), stranding a +// draining CP's orphaned workers. The resolver must NEVER return 0. +func TestEffectiveDrainingInstanceExpiry(t *testing.T) { + cases := []struct { + name string + configured time.Duration + want time.Duration + }{ + {"unset falls back to default", 0, defaultDrainingInstanceExpiry}, + {"negative falls back to default", -time.Minute, defaultDrainingInstanceExpiry}, + {"configured value wins", 6 * time.Hour, 6 * time.Hour}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := effectiveDrainingInstanceExpiry(tc.configured) + if got != tc.want { + t.Fatalf("effectiveDrainingInstanceExpiry(%s) = %s, want %s", tc.configured, got, tc.want) + } + if got <= 0 { + t.Fatalf("effectiveDrainingInstanceExpiry(%s) returned non-positive %s — reaper would be disabled", tc.configured, got) + } + }) + } +}