diff --git a/cmd/hypercache-server/main.go b/cmd/hypercache-server/main.go index c439a8c..745dfd0 100644 --- a/cmd/hypercache-server/main.go +++ b/cmd/hypercache-server/main.go @@ -48,16 +48,25 @@ import ( // here so operators see one canonical reference and so the magic-number // linter doesn't flag repeated literals at the env-parse sites. const ( - defaultReplication = 3 - defaultCapacity = 100_000 - defaultVirtualNodes = 64 - defaultIndirectK = 2 - suspectMultiplier = 3 // suspect after = N × heartbeat interval - deadMultiplier = 6 // dead after = N × heartbeat interval - defaultHintTTL = 30 * time.Second - defaultHintReplay = 200 * time.Millisecond - defaultHeartbeat = 1 * time.Second - defaultRebalance = 250 * time.Millisecond + defaultReplication = 3 + defaultCapacity = 100_000 + defaultVirtualNodes = 64 + defaultIndirectK = 2 + suspectMultiplier = 3 // suspect after = N × heartbeat interval + deadMultiplier = 6 // dead after = N × heartbeat interval + defaultHintTTL = 30 * time.Second + defaultHintReplay = 200 * time.Millisecond + defaultHeartbeat = 1 * time.Second + defaultRebalance = 250 * time.Millisecond + // Membership gossip cadence. Without an enabled gossip loop the + // cluster has no path to re-introduce a previously-removed node: + // peers' heartbeats only probe nodes already in their membership + // list, and the Health endpoint is one-way. A graceful drain → + // restart (the canonical operator workflow) leaves the restarted + // node invisible to the rest of the cluster forever. Default 1s + // matches the heartbeat cadence — gossip+heartbeat together + // disseminate membership changes within a couple of ticks. + defaultGossip = 1 * time.Second clientAPIReadTimeout = 5 * time.Second clientAPIWriteTimeout = 5 * time.Second clientAPIIdleTimeout = 60 * time.Second @@ -85,6 +94,7 @@ type envConfig struct { Heartbeat time.Duration IndirectK int RebalanceInt time.Duration + GossipInt time.Duration } // loadConfig pulls every knob from the environment and applies sane @@ -122,6 +132,7 @@ func loadConfig() (envConfig, error) { Heartbeat: envDuration("HYPERCACHE_HEARTBEAT", defaultHeartbeat), IndirectK: envInt("HYPERCACHE_INDIRECT_PROBE_K", defaultIndirectK), RebalanceInt: envDuration("HYPERCACHE_REBALANCE_INTERVAL", defaultRebalance), + GossipInt: envDuration("HYPERCACHE_GOSSIP_INTERVAL", defaultGossip), } return cfg, nil @@ -235,6 +246,7 @@ func buildHyperCache(ctx context.Context, cfg envConfig, logger *slog.Logger) (* backend.WithDistWriteConsistency(backend.ConsistencyQuorum), backend.WithDistHeartbeat(cfg.Heartbeat, suspectMultiplier*cfg.Heartbeat, deadMultiplier*cfg.Heartbeat), backend.WithDistIndirectProbes(cfg.IndirectK, cfg.Heartbeat/2), + backend.WithDistGossipInterval(cfg.GossipInt), backend.WithDistHintTTL(cfg.HintTTL), backend.WithDistHintReplayInterval(cfg.HintReplay), backend.WithDistRebalanceInterval(cfg.RebalanceInt), diff --git a/cmd/hypercache-server/main_test.go b/cmd/hypercache-server/main_test.go index 8482aea..6cdc7d9 100644 --- a/cmd/hypercache-server/main_test.go +++ b/cmd/hypercache-server/main_test.go @@ -174,3 +174,49 @@ func TestDecodeBase64Bytes_NotPadded(t *testing.T) { t.Errorf("expected 5-char input to be rejected (len%%4 != 0)") } } + +// TestLoadConfigGossipInterval pins the gossip-interval wiring that +// fixes the "previously-removed node never rejoins the cluster" bug. +// +// Without gossip enabled, dist_memory.startGossipIfEnabled bails on +// gossipInterval <= 0 and no membership state ever propagates beyond +// the initial seed list. After a graceful drain, the peers' heartbeat +// loop removes the drained node; on restart, the rejoining node +// populates ITS own membership from seeds but has no path to +// re-introduce itself to those peers — only gossip carries that +// information. The Health endpoint is one-way ("ok" / "draining"), +// the heartbeat loop only probes peers already in membership, and +// no other propagation mechanism exists. +// +// The regression pins both the unset-env default (must be > 0 so +// gossip starts by default) and the override path. An integration +// test that exercises actual rejoin propagation across an in-process +// cluster is a follow-up; this pins the load-bearing wiring without +// the harness complexity. +func TestLoadConfigGossipInterval(t *testing.T) { + t.Run("default is non-zero so gossip starts", func(t *testing.T) { + t.Setenv("HYPERCACHE_GOSSIP_INTERVAL", "") + + cfg, err := loadConfig() + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + + if cfg.GossipInt <= 0 { + t.Fatalf("GossipInt = %v; default must be > 0 (gossip disabled = silent rejoin breakage)", cfg.GossipInt) + } + }) + + t.Run("env override is honored", func(t *testing.T) { + t.Setenv("HYPERCACHE_GOSSIP_INTERVAL", "750ms") + + cfg, err := loadConfig() + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + + if cfg.GossipInt.String() != "750ms" { + t.Errorf("GossipInt = %v, want 750ms", cfg.GossipInt) + } + }) +} diff --git a/hypercache-server b/hypercache-server index 222b5bc..212b26a 100755 Binary files a/hypercache-server and b/hypercache-server differ diff --git a/pkg/backend/dist_http_server.go b/pkg/backend/dist_http_server.go index 7fd0f4d..a92c8d8 100644 --- a/pkg/backend/dist_http_server.go +++ b/pkg/backend/dist_http_server.go @@ -384,6 +384,7 @@ func (s *distHTTPServer) handleSet(fctx fiber.Ctx, dm *DistMemory) error { Version: req.Version, Origin: req.Origin, LastUpdated: time.Now(), + LastAccess: time.Now(), } dm.applySet(s.ctx, it, req.Replicate)