Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 83 additions & 7 deletions tests/systemtests/audit_empty_active_set_bootstrap_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,23 +115,29 @@ func TestAuditEmptyActiveSetBootstrap_HostOnlyReportsRecover(t *testing.T) {

// TestAuditEmptyActiveSetBootstrap_NonCompliantHostStaysPostponed verifies
// the bootstrap-recovery exception still gates on self-compliance. A
// POSTPONED supernode that submits a host report violating a min-free
// threshold MUST remain POSTPONED even when the active set is empty.
// POSTPONED supernode that submits a host report violating a non-storage
// min-free threshold (CPU here) MUST remain POSTPONED even when the active
// set is empty.
//
// This guards against the exception turning into a "free pass" for
// misbehaving SNs and complements the unit-level tests in
// x/audit/v1/keeper/enforcement_empty_active_set_test.go.
//
// Note: per LEP-6 §17 disk pressure is owned exclusively by the STORAGE_FULL
// transition path (audit SetReport) and is no longer a postpone reason, so
// this test exercises the non-storage CPU path. The disk-pressure bootstrap
// case is covered by TestAuditEmptyActiveSetBootstrap_DiskPressureGoesToStorageFull.
func TestAuditEmptyActiveSetBootstrap_NonCompliantHostStaysPostponed(t *testing.T) {
const (
epochLengthBlocks = uint64(10)
originHeight = int64(1)
)

// Set a non-zero MinDiskFreePercent so non-compliant disk usage in the host
// Set a non-zero MinCpuFreePercent so non-compliant CPU usage in the host
// report blocks self-compliance.
sut.ModifyGenesisJSON(t,
setSupernodeParamsForAuditTests(t),
setAuditParamsForFastEpochsWithMinDiskFree(t, epochLengthBlocks, 1, 1, 1, []uint32{4444}, 20),
setAuditParamsForFastEpochsWithMinCpuFree(t, epochLengthBlocks, 1, 1, 1, []uint32{4444}, 20),
)
sut.StartChain(t)

Expand All @@ -153,10 +159,10 @@ func TestAuditEmptyActiveSetBootstrap_NonCompliantHostStaysPostponed(t *testing.
require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n0.valAddr))
require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n1.valAddr))

// Epoch 1: empty active set. Both submit host reports with disk usage 95%
// (5% free, below the 20% MinDiskFreePercent). Self-compliance fails.
// Epoch 1: empty active set. Both submit host reports with CPU usage 95%
// (5% free, below the 20% MinCpuFreePercent). Self-compliance fails.
epochID1 := uint64((epoch1Start - originHeight) / int64(epochLengthBlocks))
hostNonCompliant := auditHostReportWithDiskUsageJSON([]string{"PORT_STATE_OPEN"}, 95.0)
hostNonCompliant := auditHostReportWithCpuUsageJSON([]string{"PORT_STATE_OPEN"}, 95.0)
RequireTxSuccess(t, submitEpochReport(t, cli, n0.nodeName, epochID1, hostNonCompliant, nil))
RequireTxSuccess(t, submitEpochReport(t, cli, n1.nodeName, epochID1, hostNonCompliant, nil))

Expand All @@ -168,3 +174,73 @@ func TestAuditEmptyActiveSetBootstrap_NonCompliantHostStaysPostponed(t *testing.
require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n1.valAddr),
"node1 should remain POSTPONED — self-compliance gate blocks the bootstrap exception")
}

// TestAuditEmptyActiveSetBootstrap_DiskPressureGoesToStorageFull verifies the
// LEP-6 §17 invariant that disk pressure is owned exclusively by the
// STORAGE_FULL transition path, not by audit_host_requirements POSTPONE.
//
// Scenario:
// 1. Two SNs register and miss epoch 0 reports → both POSTPONED for
// audit_missing_reports.
// 2. Epoch 1: empty active set. Both submit host reports with disk usage
// above the supernode module's MaxStorageUsagePercent (default 90).
// 3. Epoch 1 end: the bootstrap-recovery exception fires because
// selfHostCompliant ignores disk (only cpu/mem gate the bootstrap
// exception). The recoverSupernodeFromPostponed helper observes
// disk > MaxStorageUsagePercent in the same epoch's report and steers
// recovery to STORAGE_FULL instead of ACTIVE.
//
// Invariant locked in: disk pressure never produces ACTIVE in this branch,
// never produces POSTPONED via audit_host_requirements, and produces
// STORAGE_FULL exactly when disk > MaxStorageUsagePercent.
func TestAuditEmptyActiveSetBootstrap_DiskPressureGoesToStorageFull(t *testing.T) {
// Short epochs keep the test fast; originHeight is the height that epoch
// IDs are computed relative to further down.
const (
epochLengthBlocks = uint64(10)
originHeight = int64(1)
)

// No MinCpuFreePercent / MinMemFreePercent override → only disk pressure
// is in play. The supernode module's default MaxStorageUsagePercent (90)
// gates the STORAGE_FULL transition; we report 95% to cross it.
sut.ModifyGenesisJSON(t,
setSupernodeParamsForAuditTests(t),
setAuditParamsForFastEpochs(t, epochLengthBlocks, 1, 1, 1, []uint32{4444}),
)
sut.StartChain(t)

cli := NewLumeradCLI(t, sut, true)
n0 := getNodeIdentity(t, cli, "node0")
n1 := getNodeIdentity(t, cli, "node1")

registerSupernode(t, cli, n0, "192.168.1.1")
registerSupernode(t, cli, n1, "192.168.1.2")

// Epoch 0: no reports → both POSTPONED for audit_missing_reports.
// The start heights of epochs 1 and 2 are derived from epoch 0's start by
// adding whole epoch lengths.
currentHeight := sut.AwaitNextBlock(t)
_, epoch0Start := nextEpochAfterHeight(originHeight, epochLengthBlocks, currentHeight)
epoch1Start := epoch0Start + int64(epochLengthBlocks)
epoch2Start := epoch1Start + int64(epochLengthBlocks)

// Let epoch 0 finish without any report being submitted before asserting.
awaitAtLeastHeightWithSlack(t, epoch1Start)

require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n0.valAddr))
require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n1.valAddr))

// Epoch 1: empty active set. Both submit host reports with disk usage
// 95% (> MaxStorageUsagePercent=90). Self-compliance passes (no
// cpu/mem floor configured), so the bootstrap exception fires; the
// recovery helper observes the high disk and steers to STORAGE_FULL.
// The epoch ID is the number of whole epochs between originHeight and
// epoch1Start.
epochID1 := uint64((epoch1Start - originHeight) / int64(epochLengthBlocks))
hostHighDisk := auditHostReportWithDiskUsageJSON([]string{"PORT_STATE_OPEN"}, 95.0)
RequireTxSuccess(t, submitEpochReport(t, cli, n0.nodeName, epochID1, hostHighDisk, nil))
RequireTxSuccess(t, submitEpochReport(t, cli, n1.nodeName, epochID1, hostHighDisk, nil))

// Wait for epoch 2 to begin so that the end of epoch 1 has been processed
// before the final state is queried.
awaitAtLeastHeightWithSlack(t, epoch2Start)

// LEP-6 §17 invariant: disk pressure routes POSTPONED → STORAGE_FULL,
// never POSTPONED → ACTIVE, never stuck POSTPONED on audit_host_requirements.
require.Equal(t, "SUPERNODE_STATE_STORAGE_FULL", querySupernodeLatestState(t, cli, n0.valAddr),
"node0 should transition POSTPONED → STORAGE_FULL via the audit recovery helper (disk > MaxStorageUsagePercent)")
require.Equal(t, "SUPERNODE_STATE_STORAGE_FULL", querySupernodeLatestState(t, cli, n1.valAddr),
"node1 should transition POSTPONED → STORAGE_FULL via the audit recovery helper (disk > MaxStorageUsagePercent)")
}
34 changes: 26 additions & 8 deletions tests/systemtests/audit_test_helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,18 +66,22 @@ func setAuditParamsForFastEpochs(t *testing.T, epochLengthBlocks uint64, peerQuo
}
}

// setAuditParamsForFastEpochsWithMinDiskFree is setAuditParamsForFastEpochs
// plus an explicit MinDiskFreePercent override. Used by tests that need to
// exercise the self-compliance gate against the host report's disk-usage
// field (e.g. the empty-active-set bootstrap exception's self-compliance
// guard).
func setAuditParamsForFastEpochsWithMinDiskFree(t *testing.T, epochLengthBlocks uint64, peerQuorumReports, minTargets, maxTargets uint32, requiredOpenPorts []uint32, minDiskFreePercent uint32) GenesisMutator {
// setAuditParamsForFastEpochsWithMinCpuFree is setAuditParamsForFastEpochs
// plus an explicit MinCpuFreePercent override. Used by tests that need to
// exercise the self-compliance gate against a non-storage host minimum
// (e.g. the empty-active-set bootstrap exception's self-compliance guard).
//
// Note: disk pressure is intentionally NOT used for postpone/self-compliance
// gating — per LEP-6 it is owned exclusively by the STORAGE_FULL transition
// path in audit SetReport. Tests that want to keep an SN out of the recovery
// path must drive non-compliance via a non-storage metric (cpu or mem).
func setAuditParamsForFastEpochsWithMinCpuFree(t *testing.T, epochLengthBlocks uint64, peerQuorumReports, minTargets, maxTargets uint32, requiredOpenPorts []uint32, minCpuFreePercent uint32) GenesisMutator {
base := setAuditParamsForFastEpochs(t, epochLengthBlocks, peerQuorumReports, minTargets, maxTargets, requiredOpenPorts)
return func(genesis []byte) []byte {
t.Helper()
state := base(genesis)
var err error
state, err = sjson.SetRawBytes(state, "app_state.audit.params.min_disk_free_percent", []byte(strconv.FormatUint(uint64(minDiskFreePercent), 10)))
state, err = sjson.SetRawBytes(state, "app_state.audit.params.min_cpu_free_percent", []byte(strconv.FormatUint(uint64(minCpuFreePercent), 10)))
require.NoError(t, err)
return state
}
Expand Down Expand Up @@ -311,7 +315,7 @@ func auditHostReportJSON(inboundPortStates []string) string {

// auditHostReportWithDiskUsageJSON is like auditHostReportJSON but lets the
// caller pin disk_usage_percent. Used by tests that exercise the
// self-compliance gate (e.g. min-free thresholds).
// STORAGE_FULL transition via the SetReport disk-pressure path.
func auditHostReportWithDiskUsageJSON(inboundPortStates []string, diskUsagePercent float64) string {
bz, _ := json.Marshal(map[string]any{
"cpu_usage_percent": 1.0,
Expand All @@ -323,6 +327,20 @@ func auditHostReportWithDiskUsageJSON(inboundPortStates []string, diskUsagePerce
return string(bz)
}

// auditHostReportWithCpuUsageJSON returns a JSON-encoded host report whose
// cpu_usage_percent is set to cpuUsagePercent. Memory and disk usage are fixed
// at 1.0 and failed_actions_count at 0, so CPU is the only metric a caller
// varies. It is meant for tests that drive the self-compliance gate through a
// non-storage host minimum.
func auditHostReportWithCpuUsageJSON(inboundPortStates []string, cpuUsagePercent float64) string {
	report := map[string]any{
		"inbound_port_states":  inboundPortStates,
		"failed_actions_count": 0,
		"cpu_usage_percent":    cpuUsagePercent,
		"mem_usage_percent":    1.0,
		"disk_usage_percent":   1.0,
	}

	// The marshal error is intentionally discarded: with these value types
	// encoding can only fail for a non-finite cpuUsagePercent (NaN or ±Inf),
	// in which case the helper yields an empty string.
	encoded, _ := json.Marshal(report)
	return string(encoded)
}

// storageChallengeObservationJSON builds the JSON payload for --storage-challenge-observations flag.
func storageChallengeObservationJSON(targetSupernodeAccount string, portStates []string) string {
bz, _ := json.Marshal(map[string]any{
Expand Down
15 changes: 10 additions & 5 deletions x/audit/v1/POSTPONEMENT_RULES.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Postponement and Recovery Rules (audit/v1)

This document describes the on-chain rules implemented by the audit module (v1) for switching a supernode between `ACTIVE` and `POSTPONED`, and for recovering back to `ACTIVE`.
This document describes the on-chain rules implemented by the audit module (v1) for switching a supernode between `ACTIVE` and `POSTPONED`, and for recovering out of `POSTPONED`.

## Definitions

Expand Down Expand Up @@ -33,10 +33,11 @@ This is evaluated by checking for a stored report in each of the last `N` epochs

### 2) Host Report requirements

If a submitted host report violates any enabled minimum free% threshold, the supernode is set to `POSTPONED`.
If a submitted host report violates any enabled CPU or memory minimum free% threshold, the supernode is set to `POSTPONED`.

- Params: `min_cpu_free_percent`, `min_mem_free_percent`, `min_disk_free_percent` (`free% = 100 - usage%`).
- Params: `min_cpu_free_percent`, `min_mem_free_percent` (`free% = 100 - usage%`).
- Special case: if `*_usage_percent == 0`, that metric is treated as **unknown** and does not trigger postponement.
- Disk pressure is not a postponement criterion; it is handled by the `STORAGE_FULL` state path.
Comment thread
mateeullahmalik marked this conversation as resolved.
Comment thread
mateeullahmalik marked this conversation as resolved.

The following host-report fields are currently ignored by postponement logic:
- `failed_actions_count`
Expand All @@ -50,8 +51,12 @@ An epoch counts toward the consecutive requirement only if:
- there is at least **1** peer reporter about the target in that epoch, and
- the share of peer reporters about the target in that epoch that report `PORT_STATE_CLOSED` for port index `i` meets or exceeds `peer_port_postpone_threshold_percent`.

## Recovery rule (POSTPONED → ACTIVE)
## Recovery rule (POSTPONED → ACTIVE or STORAGE_FULL)

In a single epoch, a `POSTPONED` supernode becomes `ACTIVE` if:
In a single epoch, a `POSTPONED` supernode recovers if:
- it submits one host report that satisfies the enabled CPU and memory minimums (see Host Report requirements), and
- there exists at least **1** peer report about that supernode in the same epoch where **all** required ports are `PORT_STATE_OPEN`.

The recovery target is determined from the same-epoch self HostReport:
- if `disk_usage_percent` is omitted/zero or is at or below `supernode.max_storage_usage_percent`, the supernode becomes `ACTIVE`;
- if `disk_usage_percent` is above `supernode.max_storage_usage_percent`, or is non-zero but fails host-usage validation, the supernode becomes `STORAGE_FULL`.
8 changes: 5 additions & 3 deletions x/audit/v1/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,18 @@ At epoch end, a supernode can be postponed for:

- **Action-finalization evidence thresholds** (per-epoch counts meeting consecutive-epoch windows),
- **Missing reports** for `consecutive_epochs_to_postpone` consecutive epochs,
- **Self Report minimum failures** (CPU/mem/disk free% thresholds),
- **Self Report minimum failures** (CPU/mem free% thresholds),
- **Peer port thresholds**: a required port is treated as CLOSED if peer observations meet `peer_port_postpone_threshold_percent`, and this happens for `consecutive_epochs_to_postpone` consecutive epochs.

### Recovery (`POSTPONED -> ACTIVE`)
### Recovery (`POSTPONED -> ACTIVE` or `STORAGE_FULL`)

At epoch end, a supernode can recover:

- If postponed due to action-finalization evidence: by the action-finalization recovery window and total-bad-evidence constraint.
- Otherwise: if it has a compliant self report and at least one peer observation in the epoch where all required ports are `OPEN`.

If the same-epoch self HostReport still has `disk_usage_percent` above `supernode.max_storage_usage_percent`, recovery routes to `STORAGE_FULL` instead of `ACTIVE`.

Detailed behavior is implemented in the module's epoch-end enforcement logic.

## Evidence
Expand Down Expand Up @@ -192,7 +194,7 @@ Params are initialized from genesis and may later be updated by governance via `
- Enforcement:
- `min_cpu_free_percent`: `0` (disabled)
- `min_mem_free_percent`: `0` (disabled)
- `min_disk_free_percent`: `0` (disabled)
- `min_disk_free_percent`: `0` (legacy/no-op for epoch-end postponement; disk pressure is handled as `STORAGE_FULL`)
- `consecutive_epochs_to_postpone`: `1`
- `peer_port_postpone_threshold_percent`: `100`
- `keep_last_epoch_entries`: `200`
Expand Down
40 changes: 32 additions & 8 deletions x/audit/v1/keeper/enforcement.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func (k Keeper) EnforceEpochEnd(ctx sdk.Context, epochID uint64, params types.Pa
continue
}

if err := k.recoverSupernodeActive(ctx, sn); err != nil {
if err := k.recoverSupernodeFromPostponed(ctx, sn, epochID); err != nil {
return err
}
k.clearActionFinalizationPostponedAtEpochID(ctx, sn.SupernodeAccount)
Expand Down Expand Up @@ -413,16 +413,14 @@ func (k Keeper) selfHostViolatesMinimums(ctx sdk.Context, supernodeAccount strin
return false, nil
}

// If any known metric is below minimum free%, postpone.
// If any known non-storage metric is below minimum free%, postpone.
// Disk pressure is modeled as STORAGE_FULL by audit SetReport, not POSTPONED.
if violatesMinFree(r.HostReport.CpuUsagePercent, params.MinCpuFreePercent) {
return true, nil
}
if violatesMinFree(r.HostReport.MemUsagePercent, params.MinMemFreePercent) {
return true, nil
}
if violatesMinFree(r.HostReport.DiskUsagePercent, params.MinDiskFreePercent) {
return true, nil
}

return false, nil
}
Expand All @@ -439,9 +437,6 @@ func (k Keeper) selfHostCompliant(ctx sdk.Context, supernodeAccount string, epoc
if !compliesMinFree(r.HostReport.MemUsagePercent, params.MinMemFreePercent) {
return false, nil
}
if !compliesMinFree(r.HostReport.DiskUsagePercent, params.MinDiskFreePercent) {
return false, nil
}

return true, nil
}
Expand Down Expand Up @@ -567,6 +562,35 @@ func (k Keeper) recoverSupernodeActive(ctx sdk.Context, sn sntypes.SuperNode) er
return k.supernodeKeeper.RecoverSuperNodeFromPostponed(ctx, valAddr)
}

// markSupernodeStorageFull decodes the supernode's bech32 validator address
// and hands it to supernodeKeeper.MarkSuperNodeStorageFull.
//
// It returns an error when the supernode carries no validator address, when
// the address cannot be decoded, or when the supernode keeper call fails.
func (k Keeper) markSupernodeStorageFull(ctx sdk.Context, sn sntypes.SuperNode) error {
	bech32Addr := sn.ValidatorAddress
	if len(bech32Addr) == 0 {
		return fmt.Errorf("missing validator address for supernode %q", sn.SupernodeAccount)
	}

	operator, decodeErr := sdk.ValAddressFromBech32(bech32Addr)
	if decodeErr != nil {
		return decodeErr
	}

	return k.supernodeKeeper.MarkSuperNodeStorageFull(ctx, operator)
}

func (k Keeper) recoverSupernodeFromPostponed(ctx sdk.Context, sn sntypes.SuperNode, epochID uint64) error {
r, found := k.GetReport(ctx, epochID, sn.SupernodeAccount)
if !found || r.HostReport.DiskUsagePercent == 0 {
return k.recoverSupernodeActive(ctx, sn)
}

if !isValidHostUsagePercent(r.HostReport.DiskUsagePercent) {
return k.markSupernodeStorageFull(ctx, sn)
}

maxStorage := float64(k.supernodeKeeper.GetParams(ctx).MaxStorageUsagePercent)
if r.HostReport.DiskUsagePercent <= maxStorage {
return k.recoverSupernodeActive(ctx, sn)
}
Comment thread
mateeullahmalik marked this conversation as resolved.
Comment thread
mateeullahmalik marked this conversation as resolved.

return k.markSupernodeStorageFull(ctx, sn)
}

Comment thread
mateeullahmalik marked this conversation as resolved.
// storageTruthBand represents a node suspicion severity level.
type storageTruthBand int

Expand Down
17 changes: 4 additions & 13 deletions x/audit/v1/keeper/enforcement_empty_active_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ func TestEnforceEpochEnd_EmptyActiveSet_NoSelfReport_NoRecover(t *testing.T) {

// TestEnforceEpochEnd_EmptyActiveSet_NonCompliantSelf_NoRecover verifies the
// bootstrap exception does NOT bypass the self-compliance health checks.
// A POSTPONED SN that submits a report violating the disk-usage minimum
// A POSTPONED SN that submits a report violating a non-storage host minimum
// stays POSTPONED even when the active set is empty.
func TestEnforceEpochEnd_EmptyActiveSet_NonCompliantSelf_NoRecover(t *testing.T) {
f := initFixture(t)
Expand All @@ -181,28 +181,19 @@ func TestEnforceEpochEnd_EmptyActiveSet_NonCompliantSelf_NoRecover(t *testing.T)
params := types.DefaultParams()
params.RequiredOpenPorts = []uint32{4444}
params.ConsecutiveEpochsToPostpone = 1
// Require at least 20% disk free; sn0 reports 95% usage → 5% free → not compliant.
params.MinDiskFreePercent = 20
// Require at least 20% CPU free; sn0 reports 95% usage → 5% free → not compliant.
params.MinCpuFreePercent = 20

epochID := uint64(1)

writeEmptyActiveSetAnchor(t, f, epochID)

// SetReport with non-zero DiskUsagePercent invokes the STORAGE_FULL
// transition source path, which queries supernodeKeeper. Stub these
// dependencies so the call lands cleanly without triggering a
// transition (we return "not found" → SetReport short-circuits).
f.supernodeKeeper.EXPECT().
GetSuperNodeByAccount(gomock.AssignableToTypeOf(f.ctx), sn0.SupernodeAccount).
Return(sntypes.SuperNode{}, false, nil).
Times(1)

if err := f.keeper.SetReport(f.ctx, types.EpochReport{
SupernodeAccount: sn0.SupernodeAccount,
EpochId: epochID,
ReportHeight: f.ctx.BlockHeight(),
HostReport: types.HostReport{
DiskUsagePercent: 95.0, // 5% free, below the 20% minimum
CpuUsagePercent: 95.0, // 5% free, below the 20% minimum
},
}); err != nil {
t.Fatalf("failed to set report: %v", err)
Expand Down
Loading
Loading