From 1a11a5fecd8d755cf5e6255cfa7a3196af99e6f0 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Mon, 11 May 2026 00:59:50 +0000 Subject: [PATCH] fix(e2e): Fix flaky Test_VerifyComponentsAreSuccessfullyStarted_WithRuntimeConfigLoad When a service fails during WaitReady (container starts but crashes due to runtime config validation), it remains registered in the scenario's services slice. The next attempt to start a service with the same name then fails with "another service with the same name has already been started". Fix by: 1. Calling s.Stop() after expected StartAndWaitReady failures to unregister the service before retrying with a new instance. 2. Making ConcreteService.Stop() and Kill() tolerant of already-removed containers (started with --rm flag) by treating "No such container" errors as successful stops. Signed-off-by: Ben Ye --- integration/e2e/service.go | 12 ++++++++++++ integration/runtime_config_test.go | 8 ++++++++ 2 files changed, 20 insertions(+) diff --git a/integration/e2e/service.go b/integration/e2e/service.go index 4e2bc86a0b4..c2da00d01da 100644 --- a/integration/e2e/service.go +++ b/integration/e2e/service.go @@ -156,6 +156,12 @@ func (s *ConcreteService) Stop() error { logger.Log("Stopping", s.name) if out, err := RunCommandAndGetOutput("docker", "stop", "--time=30", s.containerName()); err != nil { + // If the container has already exited and been removed (e.g., started + // with --rm), treat it as a successful stop. + if strings.Contains(string(out), "No such container") { + s.usedNetworkName = "" + return nil + } logger.Log(string(out)) return err } @@ -181,6 +187,12 @@ func (s *ConcreteService) Kill() error { logger.Log("Killing", s.name) if out, err := RunCommandAndGetOutput("docker", "kill", s.containerName()); err != nil { + // If the container has already exited and been removed (e.g., started + // with --rm), treat it as a successful kill. + if strings.Contains(string(out), "No such container") { + s.usedNetworkName = "" + return nil + } logger.Log(string(out)) return err } diff --git a/integration/runtime_config_test.go b/integration/runtime_config_test.go index 5762739c797..7ffc43b8d2e 100644 --- a/integration/runtime_config_test.go +++ b/integration/runtime_config_test.go @@ -215,6 +215,10 @@ overrides: "-querier.store-gateway-addresses": strings.Join([]string{storeGateway.NetworkGRPCEndpoint()}, ","), }), "") require.Error(t, s.StartAndWaitReady(querier)) + // Stop the failed service to unregister it before retrying with the same name. + // Ignore error: if the container crashed before Start() completed, the service + // was never registered and Stop() returns "does not exist" which is fine. + _ = s.Stop(querier) // Start Query frontend queryFrontend := e2ecortex.NewQueryFrontendWithConfigFile("query-frontend", "", flags, "") @@ -231,6 +235,8 @@ overrides: // Ruler start, but fail with "-distributor.shard-by-all-labels": "false" ruler := e2ecortex.NewRuler("ruler", consul.NetworkHTTPEndpoint(), mergeFlags(flags, RulerFlags()), "") require.Error(t, s.StartAndWaitReady(ruler)) + // Stop the failed service to unregister it before retrying with the same name. + _ = s.Stop(ruler) // Ruler start, should success with "-distributor.shard-by-all-labels": "true" ruler = e2ecortex.NewRuler("ruler", consul.NetworkHTTPEndpoint(), mergeFlags(flags, RulerFlags(), map[string]string{ @@ -249,6 +255,8 @@ overrides: // Distributor start, but fail with "-distributor.shard-by-all-labels": "false" distributor := e2ecortex.NewQuerier("distributor", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), mergeFlags(flags, map[string]string{}), "") require.Error(t, s.StartAndWaitReady(distributor)) + // Stop the failed service to unregister it before retrying with the same name. + _ = s.Stop(distributor) // Distributor start, should success with "-distributor.shard-by-all-labels": "true" distributor = e2ecortex.NewQuerier("distributor", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), mergeFlags(flags, map[string]string{