|
//go:build system_test

package system
| 4 | + |
// This test validates the "empty active set deadlock" bootstrap scenario:
//
// When ALL supernodes are POSTPONED at epoch start, the epoch anchor has an
// empty active_supernode_accounts set. Without active probers, no peer
// observations are generated, and the audit module's recovery rule
// (compliant host report + peer all-ports-OPEN) can never be satisfied.
//
// The fix is to use legacy MsgReportSupernodeMetrics to recover SNs to
// ACTIVE mid-epoch. Combined with audit epoch reports, the SN survives
// the audit EndBlocker and appears in the next epoch's anchor, seeding
// the active set and bootstrapping the peer-observation cycle.
//
// Scenario:
//  1. Two supernodes register and start ACTIVE.
//  2. Neither submits epoch reports for epoch 0 → both POSTPONED at epoch 0 end.
//  3. Epoch 1: empty active set. Both submit host-only audit reports.
//     Verify: audit recovery alone cannot recover them (no peer observations).
//  4. Legacy MsgReportSupernodeMetrics recovers both mid-epoch 2.
//  5. Epoch 2 end: audit enforcement checks them as ACTIVE — they have reports,
//     host minimums disabled, no peer-port streak → they stay ACTIVE.
//  6. Epoch 3: both are in the anchor active set → peer observations flow → self-sustaining.
| 26 | + |
import (
	"testing"

	sntypes "github.com/LumeraProtocol/lumera/x/supernode/v1/types"
	"github.com/stretchr/testify/require"
)
| 33 | + |
| 34 | +func TestAuditEmptyActiveSetBootstrap_LegacyMetricsBreaksDeadlock(t *testing.T) { |
| 35 | + const ( |
| 36 | + epochLengthBlocks = uint64(10) |
| 37 | + originHeight = int64(1) |
| 38 | + ) |
| 39 | + |
| 40 | + sut.ModifyGenesisJSON(t, |
| 41 | + setSupernodeParamsForAuditTests(t), |
| 42 | + setAuditParamsForFastEpochs(t, epochLengthBlocks, 1, 1, 1, []uint32{4444}), |
| 43 | + ) |
| 44 | + sut.StartChain(t) |
| 45 | + |
| 46 | + cli := NewLumeradCLI(t, sut, true) |
| 47 | + n0 := getNodeIdentity(t, cli, "node0") |
| 48 | + n1 := getNodeIdentity(t, cli, "node1") |
| 49 | + |
| 50 | + registerSupernode(t, cli, n0, "192.168.1.1") |
| 51 | + registerSupernode(t, cli, n1, "192.168.1.2") |
| 52 | + |
| 53 | + // Both are ACTIVE after registration. |
| 54 | + require.Equal(t, "SUPERNODE_STATE_ACTIVE", querySupernodeLatestState(t, cli, n0.valAddr)) |
| 55 | + require.Equal(t, "SUPERNODE_STATE_ACTIVE", querySupernodeLatestState(t, cli, n1.valAddr)) |
| 56 | + |
| 57 | + // ── Epoch 0: Do NOT submit any epoch reports. ── |
| 58 | + // This simulates the testnet scenario where SNs were running releases |
| 59 | + // without audit code when the chain upgraded to enable the audit module. |
| 60 | + currentHeight := sut.AwaitNextBlock(t) |
| 61 | + _, epoch0Start := nextEpochAfterHeight(originHeight, epochLengthBlocks, currentHeight) |
| 62 | + epoch1Start := epoch0Start + int64(epochLengthBlocks) |
| 63 | + epoch2Start := epoch1Start + int64(epochLengthBlocks) |
| 64 | + |
| 65 | + // Wait for epoch 0 to end → both get POSTPONED for missing reports. |
| 66 | + awaitAtLeastHeight(t, epoch1Start) |
| 67 | + |
| 68 | + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n0.valAddr), |
| 69 | + "node0 should be POSTPONED after missing epoch 0 report") |
| 70 | + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n1.valAddr), |
| 71 | + "node1 should be POSTPONED after missing epoch 0 report") |
| 72 | + |
| 73 | + // ── Epoch 1: Empty active set — the deadlock. ── |
| 74 | + epochID1 := uint64((epoch1Start - originHeight) / int64(epochLengthBlocks)) |
| 75 | + |
| 76 | + // Both submit host-only audit epoch reports (as POSTPONED reporters, no observations). |
| 77 | + hostOK := auditHostReportJSON([]string{"PORT_STATE_OPEN"}) |
| 78 | + tx0 := submitEpochReport(t, cli, n0.nodeName, epochID1, hostOK, nil) |
| 79 | + RequireTxSuccess(t, tx0) |
| 80 | + tx1 := submitEpochReport(t, cli, n1.nodeName, epochID1, hostOK, nil) |
| 81 | + RequireTxSuccess(t, tx1) |
| 82 | + |
| 83 | + // Wait for epoch 1 to end WITHOUT legacy metrics recovery. |
| 84 | + // Both should remain POSTPONED — audit recovery fails (no peer observations). |
| 85 | + awaitAtLeastHeight(t, epoch2Start) |
| 86 | + |
| 87 | + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n0.valAddr), |
| 88 | + "node0 should still be POSTPONED — audit recovery alone cannot break the deadlock") |
| 89 | + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n1.valAddr), |
| 90 | + "node1 should still be POSTPONED — audit recovery alone cannot break the deadlock") |
| 91 | + |
| 92 | + // ── Epoch 2: Break the deadlock with legacy MsgReportSupernodeMetrics. ── |
| 93 | + epochID2 := epochID1 + 1 |
| 94 | + epoch3Start := epoch2Start + int64(epochLengthBlocks) |
| 95 | + |
| 96 | + // Submit legacy metrics → instant recovery to ACTIVE. |
| 97 | + compliantMetrics := sntypes.SupernodeMetrics{ |
| 98 | + VersionMajor: 2, |
| 99 | + VersionMinor: 4, |
| 100 | + VersionPatch: 5, |
| 101 | + OpenPorts: []sntypes.PortStatus{ |
| 102 | + {Port: 4444, State: sntypes.PortState_PORT_STATE_OPEN}, |
| 103 | + }, |
| 104 | + } |
| 105 | + |
| 106 | + hash0 := reportSupernodeMetrics(t, cli, n0.nodeName, n0.valAddr, n0.accAddr, compliantMetrics) |
| 107 | + txJSON0 := waitForTx(t, cli, hash0) |
| 108 | + resp0 := decodeTxResponse(t, txJSON0) |
| 109 | + require.Equal(t, uint32(0), resp0.Code, "legacy metrics tx for node0 should succeed: %s", resp0.RawLog) |
| 110 | + |
| 111 | + hash1 := reportSupernodeMetrics(t, cli, n1.nodeName, n1.valAddr, n1.accAddr, compliantMetrics) |
| 112 | + txJSON1 := waitForTx(t, cli, hash1) |
| 113 | + resp1 := decodeTxResponse(t, txJSON1) |
| 114 | + require.Equal(t, uint32(0), resp1.Code, "legacy metrics tx for node1 should succeed: %s", resp1.RawLog) |
| 115 | + |
| 116 | + // Both should now be ACTIVE (instant recovery via legacy path). |
| 117 | + require.Equal(t, "SUPERNODE_STATE_ACTIVE", querySupernodeLatestState(t, cli, n0.valAddr), |
| 118 | + "node0 should be ACTIVE after legacy metrics recovery") |
| 119 | + require.Equal(t, "SUPERNODE_STATE_ACTIVE", querySupernodeLatestState(t, cli, n1.valAddr), |
| 120 | + "node1 should be ACTIVE after legacy metrics recovery") |
| 121 | + |
| 122 | + // Also submit audit epoch reports so the audit EndBlocker doesn't re-postpone them. |
| 123 | + tx0e2 := submitEpochReport(t, cli, n0.nodeName, epochID2, hostOK, nil) |
| 124 | + RequireTxSuccess(t, tx0e2) |
| 125 | + tx1e2 := submitEpochReport(t, cli, n1.nodeName, epochID2, hostOK, nil) |
| 126 | + RequireTxSuccess(t, tx1e2) |
| 127 | + |
| 128 | + // Wait for epoch 2 to end. |
| 129 | + awaitAtLeastHeight(t, epoch3Start) |
| 130 | + |
| 131 | + // ── Verify: both survive the audit EndBlocker and remain ACTIVE. ── |
| 132 | + require.Equal(t, "SUPERNODE_STATE_ACTIVE", querySupernodeLatestState(t, cli, n0.valAddr), |
| 133 | + "node0 should remain ACTIVE after epoch 2 enforcement (legacy metrics + audit report)") |
| 134 | + require.Equal(t, "SUPERNODE_STATE_ACTIVE", querySupernodeLatestState(t, cli, n1.valAddr), |
| 135 | + "node1 should remain ACTIVE after epoch 2 enforcement (legacy metrics + audit report)") |
| 136 | +} |
| 137 | + |
| 138 | +// TestAuditEmptyActiveSetDeadlock_HostOnlyReportsCannotRecover verifies that |
| 139 | +// when all supernodes are POSTPONED, submitting host-only epoch reports across |
| 140 | +// multiple epochs is insufficient for recovery — proving the deadlock exists. |
| 141 | +func TestAuditEmptyActiveSetDeadlock_HostOnlyReportsCannotRecover(t *testing.T) { |
| 142 | + const ( |
| 143 | + epochLengthBlocks = uint64(10) |
| 144 | + originHeight = int64(1) |
| 145 | + ) |
| 146 | + |
| 147 | + sut.ModifyGenesisJSON(t, |
| 148 | + setSupernodeParamsForAuditTests(t), |
| 149 | + setAuditParamsForFastEpochs(t, epochLengthBlocks, 1, 1, 1, []uint32{4444}), |
| 150 | + ) |
| 151 | + sut.StartChain(t) |
| 152 | + |
| 153 | + cli := NewLumeradCLI(t, sut, true) |
| 154 | + n0 := getNodeIdentity(t, cli, "node0") |
| 155 | + n1 := getNodeIdentity(t, cli, "node1") |
| 156 | + |
| 157 | + registerSupernode(t, cli, n0, "192.168.1.1") |
| 158 | + registerSupernode(t, cli, n1, "192.168.1.2") |
| 159 | + |
| 160 | + // Epoch 0: no reports → both POSTPONED. |
| 161 | + currentHeight := sut.AwaitNextBlock(t) |
| 162 | + _, epoch0Start := nextEpochAfterHeight(originHeight, epochLengthBlocks, currentHeight) |
| 163 | + epoch1Start := epoch0Start + int64(epochLengthBlocks) |
| 164 | + |
| 165 | + awaitAtLeastHeight(t, epoch1Start) |
| 166 | + |
| 167 | + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n0.valAddr)) |
| 168 | + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n1.valAddr)) |
| 169 | + |
| 170 | + // Submit host-only reports for 3 consecutive epochs. None should recover. |
| 171 | + hostOK := auditHostReportJSON([]string{"PORT_STATE_OPEN"}) |
| 172 | + for i := 0; i < 3; i++ { |
| 173 | + epochStart := epoch1Start + int64(i)*int64(epochLengthBlocks) |
| 174 | + nextEpochStart := epochStart + int64(epochLengthBlocks) |
| 175 | + epochID := uint64((epochStart - originHeight) / int64(epochLengthBlocks)) |
| 176 | + |
| 177 | + awaitAtLeastHeight(t, epochStart) |
| 178 | + |
| 179 | + tx0 := submitEpochReport(t, cli, n0.nodeName, epochID, hostOK, nil) |
| 180 | + RequireTxSuccess(t, tx0) |
| 181 | + tx1 := submitEpochReport(t, cli, n1.nodeName, epochID, hostOK, nil) |
| 182 | + RequireTxSuccess(t, tx1) |
| 183 | + |
| 184 | + awaitAtLeastHeight(t, nextEpochStart) |
| 185 | + |
| 186 | + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n0.valAddr), |
| 187 | + "node0 should remain POSTPONED in epoch %d — no peer observations possible", epochID) |
| 188 | + require.Equal(t, "SUPERNODE_STATE_POSTPONED", querySupernodeLatestState(t, cli, n1.valAddr), |
| 189 | + "node1 should remain POSTPONED in epoch %d — no peer observations possible", epochID) |
| 190 | + } |
| 191 | +} |