Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions cmd/hypercache-server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,25 @@ import (
// here so operators see one canonical reference and so the magic-number
// linter doesn't flag repeated literals at the env-parse sites.
const (
defaultReplication = 3
defaultCapacity = 100_000
defaultVirtualNodes = 64
defaultIndirectK = 2
suspectMultiplier = 3 // suspect after = N × heartbeat interval
deadMultiplier = 6 // dead after = N × heartbeat interval
defaultHintTTL = 30 * time.Second
defaultHintReplay = 200 * time.Millisecond
defaultHeartbeat = 1 * time.Second
defaultRebalance = 250 * time.Millisecond
defaultReplication = 3
defaultCapacity = 100_000
defaultVirtualNodes = 64
defaultIndirectK = 2
suspectMultiplier = 3 // suspect after = N × heartbeat interval
deadMultiplier = 6 // dead after = N × heartbeat interval
defaultHintTTL = 30 * time.Second
defaultHintReplay = 200 * time.Millisecond
defaultHeartbeat = 1 * time.Second
defaultRebalance = 250 * time.Millisecond
// Membership gossip cadence. Without an enabled gossip loop the
// cluster has no path to re-introduce a previously-removed node:
// peers' heartbeats only probe nodes already in their membership
// list, and the Health endpoint is one-way. A graceful drain →
// restart (the canonical operator workflow) leaves the restarted
// node invisible to the rest of the cluster forever. Default 1s
// matches the heartbeat cadence — gossip+heartbeat together
// disseminate membership changes within a couple of ticks.
defaultGossip = 1 * time.Second
clientAPIReadTimeout = 5 * time.Second
clientAPIWriteTimeout = 5 * time.Second
clientAPIIdleTimeout = 60 * time.Second
Expand Down Expand Up @@ -85,6 +94,7 @@ type envConfig struct {
Heartbeat time.Duration
IndirectK int
RebalanceInt time.Duration
GossipInt time.Duration
}

// loadConfig pulls every knob from the environment and applies sane
Expand Down Expand Up @@ -122,6 +132,7 @@ func loadConfig() (envConfig, error) {
Heartbeat: envDuration("HYPERCACHE_HEARTBEAT", defaultHeartbeat),
IndirectK: envInt("HYPERCACHE_INDIRECT_PROBE_K", defaultIndirectK),
RebalanceInt: envDuration("HYPERCACHE_REBALANCE_INTERVAL", defaultRebalance),
GossipInt: envDuration("HYPERCACHE_GOSSIP_INTERVAL", defaultGossip),
}

return cfg, nil
Expand Down Expand Up @@ -235,6 +246,7 @@ func buildHyperCache(ctx context.Context, cfg envConfig, logger *slog.Logger) (*
backend.WithDistWriteConsistency(backend.ConsistencyQuorum),
backend.WithDistHeartbeat(cfg.Heartbeat, suspectMultiplier*cfg.Heartbeat, deadMultiplier*cfg.Heartbeat),
backend.WithDistIndirectProbes(cfg.IndirectK, cfg.Heartbeat/2),
backend.WithDistGossipInterval(cfg.GossipInt),
backend.WithDistHintTTL(cfg.HintTTL),
backend.WithDistHintReplayInterval(cfg.HintReplay),
backend.WithDistRebalanceInterval(cfg.RebalanceInt),
Expand Down
46 changes: 46 additions & 0 deletions cmd/hypercache-server/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,3 +174,49 @@ func TestDecodeBase64Bytes_NotPadded(t *testing.T) {
t.Errorf("expected 5-char input to be rejected (len%%4 != 0)")
}
}

// TestLoadConfigGossipInterval guards the environment-to-config wiring
// of the gossip interval, the knob behind the fix for the
// "previously-removed node never rejoins the cluster" bug.
//
// Background: when gossip is disabled, dist_memory.startGossipIfEnabled
// returns early on gossipInterval <= 0, so membership state never
// spreads past the initial seed list. Following a graceful drain the
// peers' heartbeat loop drops the drained node. When that node
// restarts it fills in its OWN membership from the seeds, yet nothing
// tells the peers about it: the Health endpoint is one-way
// ("ok" / "draining"), heartbeats only probe peers that are already
// members, and gossip is the sole remaining propagation channel.
//
// Two properties are pinned here:
//   - the default used when the env var carries no value must be > 0,
//     so gossip is on out of the box;
//   - an explicit env value must override that default.
//
// Exercising real rejoin propagation across an in-process cluster is
// left to a follow-up integration test; this one covers the
// load-bearing wiring without the harness complexity.
func TestLoadConfigGossipInterval(t *testing.T) {
	const gossipEnv = "HYPERCACHE_GOSSIP_INTERVAL"

	t.Run("default is non-zero so gossip starts", func(t *testing.T) {
		// An empty value is meant to exercise the unset-env default path.
		t.Setenv(gossipEnv, "")

		loaded, loadErr := loadConfig()
		if loadErr != nil {
			t.Fatalf("loadConfig: %v", loadErr)
		}

		if interval := loaded.GossipInt; interval <= 0 {
			t.Fatalf("GossipInt = %v; default must be > 0 (gossip disabled = silent rejoin breakage)", interval)
		}
	})

	t.Run("env override is honored", func(t *testing.T) {
		const want = "750ms"

		t.Setenv(gossipEnv, want)

		loaded, loadErr := loadConfig()
		if loadErr != nil {
			t.Fatalf("loadConfig: %v", loadErr)
		}

		// Compare the rendered form so the check needs nothing beyond
		// the value's own String method.
		if got := loaded.GossipInt; got.String() != want {
			t.Errorf("GossipInt = %v, want 750ms", got)
		}
	})
}
Binary file modified hypercache-server
Binary file not shown.
1 change: 1 addition & 0 deletions pkg/backend/dist_http_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ func (s *distHTTPServer) handleSet(fctx fiber.Ctx, dm *DistMemory) error {
Version: req.Version,
Origin: req.Origin,
LastUpdated: time.Now(),
LastAccess: time.Now(),
}

dm.applySet(s.ctx, it, req.Replicate)
Expand Down
Loading