From ee0fe9da492888b55fe183cf1a42931ad551ec6b Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 27 Jun 2023 09:22:03 +0000 Subject: [PATCH] add `StepDownOnRemoval` In 72d62a1, we made Raft leaders step down when they removed themselves from the group. However, this broke backwards compatibility. This patch makes the behavior opt-in via a new `StepDownOnRemoval` config option, with the intent to enable it unconditionally in the next major release. In CockroachDB, we campaign a designated voter when the leader is removed from the group. However, during a joint config change, the leader is first demoted to learner before removal. The step-down logic introduced in 72d62a1 triggers on learner demotion (which is the right behavior), but this left the CockroachDB group without a leader, having to wait out an election timeout. While it is straightforward to change this behavior in CockroachDB, we have to maintain backwards compatibility with older nodes during rolling upgrades. Signed-off-by: Erik Grinaker --- raft.go | 15 +- rafttest/interaction_env_handler_add_nodes.go | 2 + testdata/confchange_v1_remove_leader.txt | 87 ++++++-- .../confchange_v1_remove_leader_stepdown.txt | 190 +++++++++++++++++ testdata/confchange_v2_replace_leader.txt | 4 - .../confchange_v2_replace_leader_stepdown.txt | 192 ++++++++++++++++++ 6 files changed, 471 insertions(+), 19 deletions(-) create mode 100644 testdata/confchange_v1_remove_leader_stepdown.txt create mode 100644 testdata/confchange_v2_replace_leader_stepdown.txt diff --git a/raft.go b/raft.go index 45604c37..86916626 100644 --- a/raft.go +++ b/raft.go @@ -276,6 +276,13 @@ type Config struct { // This option may be removed once false positives are no longer possible. // See: https://github.com/etcd-io/raft/issues/80 DisableConfChangeValidation bool + + // StepDownOnRemoval makes the leader step down when it is removed from the + // group or demoted to a learner. + // + // This behavior will become unconditional in the future. See: + // https://github.com/etcd-io/raft/issues/83 + StepDownOnRemoval bool } func (c *Config) validate() error { @@ -408,6 +415,7 @@ type raft struct { // when raft changes its state to follower or candidate. randomizedElectionTimeout int disableProposalForwarding bool + stepDownOnRemoval bool tick func() step stepFunc @@ -447,6 +455,7 @@ func newRaft(c *Config) *raft { readOnly: newReadOnly(c.ReadOnlyOption), disableProposalForwarding: c.DisableProposalForwarding, disableConfChangeValidation: c.DisableConfChangeValidation, + stepDownOnRemoval: c.StepDownOnRemoval, } cfg, prs, err := confchange.Restore(confchange.Changer{ @@ -1918,7 +1927,7 @@ func (r *raft) switchToConfig(cfg tracker.Config, prs tracker.ProgressMap) pb.Co r.isLearner = ok && pr.IsLearner if (!ok || r.isLearner) && r.state == StateLeader { - // This node is leader and was removed or demoted, step down. + // This node is leader and was removed or demoted, step down if requested. // // We prevent demotions at the time writing but hypothetically we handle // them the same way as removing the leader. @@ -1926,7 +1935,9 @@ func (r *raft) switchToConfig(cfg tracker.Config, prs tracker.ProgressMap) pb.Co // TODO(tbg): ask follower with largest Match to TimeoutNow (to avoid // interruption). This might still drop some proposals but it's better than // nothing. - r.becomeFollower(r.Term, None) + if r.stepDownOnRemoval { + r.becomeFollower(r.Term, None) + } return cs } diff --git a/rafttest/interaction_env_handler_add_nodes.go b/rafttest/interaction_env_handler_add_nodes.go index fb17badb..e68a295f 100644 --- a/rafttest/interaction_env_handler_add_nodes.go +++ b/rafttest/interaction_env_handler_add_nodes.go @@ -67,6 +67,8 @@ func (env *InteractionEnv) handleAddNodes(t *testing.T, d datadriven.TestData) e default: return fmt.Errorf("invalid read-only option %q", arg.Vals[i]) } + case "step-down-on-removal": + arg.Scan(t, i, &cfg.StepDownOnRemoval) } } } diff --git a/testdata/confchange_v1_remove_leader.txt b/testdata/confchange_v1_remove_leader.txt index 6efb3921..cc91508a 100644 --- a/testdata/confchange_v1_remove_leader.txt +++ b/testdata/confchange_v1_remove_leader.txt @@ -78,8 +78,10 @@ propose 1 bar ---- ok -# n1 applies the conf change, removing itself and stepping down. But it still -# has an uncommitted 'bar' entry in the log that it sends out appends for first. +# n1 applies the conf change, so it has now removed itself. But it still has +# an uncommitted entry in the log. If the leader unconditionally counted itself +# as part of the commit quorum, we'd be in trouble. In the block below, we see +# it send out appends to the other nodes for the 'bar' entry. stabilize 1 ---- > 1 handling Ready @@ -104,14 +106,10 @@ stabilize 1 1->2 MsgApp Term:1 Log:1/6 Commit:5 1->3 MsgApp Term:1 Log:1/6 Commit:5 INFO 1 switched to configuration voters=(2 3) - INFO 1 became follower at term 1 -> 1 handling Ready - Ready MustSync=false: - Lead:0 State:StateFollower raft-state ---- -1: StateFollower (Non-Voter) Term:1 Lead:0 +1: StateLeader (Non-Voter) Term:1 Lead:1 2: StateFollower (Voter) Term:1 Lead:1 3: StateFollower (Voter) Term:1 Lead:1 @@ -136,7 +134,7 @@ stabilize 2 2->1 MsgAppResp Term:1 Log:0/6 INFO 2 switched to configuration voters=(2 3) -# ...because the old leader n1 ignores the append responses. +# ... which thankfully is what we see on the leader. stabilize 1 ---- > 1 receiving messages @@ -176,14 +174,77 @@ stabilize 3->1 MsgAppResp Term:1 Log:0/6 3->1 MsgAppResp Term:1 Log:0/6 3->1 MsgAppResp Term:1 Log:0/6 +> 1 handling Ready + Ready MustSync=false: + HardState Term:1 Vote:1 Commit:6 + CommittedEntries: + 1/6 EntryNormal "bar" + Messages: + 1->2 MsgApp Term:1 Log:1/6 Commit:6 + 1->3 MsgApp Term:1 Log:1/6 Commit:6 +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/6 Commit:6 +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/6 Commit:6 +> 2 handling Ready + Ready MustSync=false: + HardState Term:1 Vote:1 Commit:6 + CommittedEntries: + 1/6 EntryNormal "bar" + Messages: + 2->1 MsgAppResp Term:1 Log:0/6 +> 3 handling Ready + Ready MustSync=false: + HardState Term:1 Vote:1 Commit:6 + CommittedEntries: + 1/6 EntryNormal "bar" + Messages: + 3->1 MsgAppResp Term:1 Log:0/6 +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/6 + 3->1 MsgAppResp Term:1 Log:0/6 -# n1 can no longer propose. +# However not all is well. n1 is still leader but unconditionally drops all +# proposals on the floor, so we're effectively stuck if it still heartbeats +# its followers... propose 1 baz ---- -INFO 1 no leader at term 1; dropping proposal raft proposal dropped -# Nor can it campaign to become leader. -campaign 1 +tick-heartbeat 1 +---- +ok + +# ... which, uh oh, it does. +# TODO(tbg): change behavior so that a leader that is removed immediately steps +# down, and initiates an optimistic handover. +stabilize ---- -WARN 1 is unpromotable and can not campaign +> 1 handling Ready + Ready MustSync=false: + Messages: + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:6 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:6 +> 2 receiving messages + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:6 +> 3 receiving messages + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:6 +> 2 handling Ready + Ready MustSync=false: + Messages: + 2->1 MsgHeartbeatResp Term:1 Log:0/0 +> 3 handling Ready + Ready MustSync=false: + Messages: + 3->1 MsgHeartbeatResp Term:1 Log:0/0 +> 1 receiving messages + 2->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + +# Just confirming the issue above - leader does not automatically step down. +# Expected behavior: a new leader is elected after an election timeout. +raft-state +---- +1: StateLeader (Non-Voter) Term:1 Lead:1 +2: StateFollower (Voter) Term:1 Lead:1 +3: StateFollower (Voter) Term:1 Lead:1 diff --git a/testdata/confchange_v1_remove_leader_stepdown.txt b/testdata/confchange_v1_remove_leader_stepdown.txt new file mode 100644 index 00000000..fe397650 --- /dev/null +++ b/testdata/confchange_v1_remove_leader_stepdown.txt @@ -0,0 +1,190 @@ +# We'll turn this back on after the boilerplate. +log-level none +---- +ok + +# Run a V1 membership change that removes the leader, asking it +# to step down on removal. +# Bootstrap n1, n2, n3. +add-nodes 3 voters=(1,2,3) index=2 step-down-on-removal=true +---- +ok + +campaign 1 +---- +ok + +stabilize +---- +ok + +log-level debug +---- +ok + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 +2: StateFollower (Voter) Term:1 Lead:1 +3: StateFollower (Voter) Term:1 Lead:1 + +# Start removing n1. +propose-conf-change 1 v1=true +r1 +---- +ok + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 +2: StateFollower (Voter) Term:1 Lead:1 +3: StateFollower (Voter) Term:1 Lead:1 + +# Propose an extra entry which will be sent out together with the conf change. +propose 1 foo +---- +ok + +# Send out the corresponding appends. +process-ready 1 +---- +Ready MustSync=true: +Entries: +1/4 EntryConfChange r1 +1/5 EntryNormal "foo" +Messages: +1->2 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChange r1] +1->3 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChange r1] +1->2 MsgApp Term:1 Log:1/4 Commit:3 Entries:[1/5 EntryNormal "foo"] +1->3 MsgApp Term:1 Log:1/4 Commit:3 Entries:[1/5 EntryNormal "foo"] + +# Send response from n2 (which is enough to commit the entries so far next time +# n1 runs). +stabilize 2 +---- +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChange r1] + 1->2 MsgApp Term:1 Log:1/4 Commit:3 Entries:[1/5 EntryNormal "foo"] +> 2 handling Ready + Ready MustSync=true: + Entries: + 1/4 EntryConfChange r1 + 1/5 EntryNormal "foo" + Messages: + 2->1 MsgAppResp Term:1 Log:0/4 + 2->1 MsgAppResp Term:1 Log:0/5 + +# Put another entry in n1's log. +propose 1 bar +---- +ok + +# n1 applies the conf change, removing itself and stepping down. But it still +# has an uncommitted 'bar' entry in the log that it sends out appends for first. +stabilize 1 +---- +> 1 handling Ready + Ready MustSync=true: + Entries: + 1/6 EntryNormal "bar" + Messages: + 1->2 MsgApp Term:1 Log:1/5 Commit:3 Entries:[1/6 EntryNormal "bar"] + 1->3 MsgApp Term:1 Log:1/5 Commit:3 Entries:[1/6 EntryNormal "bar"] +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/4 + 2->1 MsgAppResp Term:1 Log:0/5 +> 1 handling Ready + Ready MustSync=false: + HardState Term:1 Vote:1 Commit:5 + CommittedEntries: + 1/4 EntryConfChange r1 + 1/5 EntryNormal "foo" + Messages: + 1->2 MsgApp Term:1 Log:1/6 Commit:4 + 1->3 MsgApp Term:1 Log:1/6 Commit:4 + 1->2 MsgApp Term:1 Log:1/6 Commit:5 + 1->3 MsgApp Term:1 Log:1/6 Commit:5 + INFO 1 switched to configuration voters=(2 3) + INFO 1 became follower at term 1 +> 1 handling Ready + Ready MustSync=false: + Lead:0 State:StateFollower + +raft-state +---- +1: StateFollower (Non-Voter) Term:1 Lead:0 +2: StateFollower (Voter) Term:1 Lead:1 +3: StateFollower (Voter) Term:1 Lead:1 + +# n2 responds, n3 doesn't yet. Quorum for 'bar' should not be reached... +stabilize 2 +---- +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/5 Commit:3 Entries:[1/6 EntryNormal "bar"] + 1->2 MsgApp Term:1 Log:1/6 Commit:4 + 1->2 MsgApp Term:1 Log:1/6 Commit:5 +> 2 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:5 + Entries: + 1/6 EntryNormal "bar" + CommittedEntries: + 1/4 EntryConfChange r1 + 1/5 EntryNormal "foo" + Messages: + 2->1 MsgAppResp Term:1 Log:0/6 + 2->1 MsgAppResp Term:1 Log:0/6 + 2->1 MsgAppResp Term:1 Log:0/6 + INFO 2 switched to configuration voters=(2 3) + +# ...because the old leader n1 ignores the append responses. +stabilize 1 +---- +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/6 + 2->1 MsgAppResp Term:1 Log:0/6 + 2->1 MsgAppResp Term:1 Log:0/6 + +# When n3 responds, quorum is reached and everything falls into place. +stabilize +---- +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChange r1] + 1->3 MsgApp Term:1 Log:1/4 Commit:3 Entries:[1/5 EntryNormal "foo"] + 1->3 MsgApp Term:1 Log:1/5 Commit:3 Entries:[1/6 EntryNormal "bar"] + 1->3 MsgApp Term:1 Log:1/6 Commit:4 + 1->3 MsgApp Term:1 Log:1/6 Commit:5 +> 3 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:5 + Entries: + 1/4 EntryConfChange r1 + 1/5 EntryNormal "foo" + 1/6 EntryNormal "bar" + CommittedEntries: + 1/4 EntryConfChange r1 + 1/5 EntryNormal "foo" + Messages: + 3->1 MsgAppResp Term:1 Log:0/4 + 3->1 MsgAppResp Term:1 Log:0/5 + 3->1 MsgAppResp Term:1 Log:0/6 + 3->1 MsgAppResp Term:1 Log:0/6 + 3->1 MsgAppResp Term:1 Log:0/6 + INFO 3 switched to configuration voters=(2 3) +> 1 receiving messages + 3->1 MsgAppResp Term:1 Log:0/4 + 3->1 MsgAppResp Term:1 Log:0/5 + 3->1 MsgAppResp Term:1 Log:0/6 + 3->1 MsgAppResp Term:1 Log:0/6 + 3->1 MsgAppResp Term:1 Log:0/6 + +# n1 can no longer propose. +propose 1 baz +---- +INFO 1 no leader at term 1; dropping proposal +raft proposal dropped + +# Nor can it campaign to become leader. +campaign 1 +---- +WARN 1 is unpromotable and can not campaign diff --git a/testdata/confchange_v2_replace_leader.txt b/testdata/confchange_v2_replace_leader.txt index 27a3e1ab..dfb09505 100644 --- a/testdata/confchange_v2_replace_leader.txt +++ b/testdata/confchange_v2_replace_leader.txt @@ -33,10 +33,6 @@ raft-state 2: StateFollower (Voter) Term:1 Lead:1 3: StateFollower (Voter) Term:1 Lead:1 -log-level info ----- -ok - # create n4 add-nodes 1 ---- diff --git a/testdata/confchange_v2_replace_leader_stepdown.txt b/testdata/confchange_v2_replace_leader_stepdown.txt new file mode 100644 index 00000000..62d01d23 --- /dev/null +++ b/testdata/confchange_v2_replace_leader_stepdown.txt @@ -0,0 +1,192 @@ +# Run a V2 membership change that removes the leader and adds another voter as a +# single operation, using joint consensus and explicitly determining when to +# transition out of the joint config. Leadership is transferred by campaigning a +# designated voter in the new config once the old leader steps down. After the +# reconfiguration completes, we verify that the removed leader cannot campaign +# to become leader. + +# We'll turn this back on after the boilerplate. +log-level none +---- +ok + +# Bootstrap n1, n2, n3. +add-nodes 3 voters=(1,2,3) index=2 step-down-on-removal=true +---- +ok + +# n1 campaigns to become leader. +campaign 1 +---- +ok + +stabilize +---- +ok + +log-level info +---- +ok + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 +2: StateFollower (Voter) Term:1 Lead:1 +3: StateFollower (Voter) Term:1 Lead:1 + +# create n4 +add-nodes 1 +---- +INFO 4 switched to configuration voters=() +INFO 4 became follower at term 0 +INFO newRaft 4 [peers: [], term: 0, commit: 0, applied: 0, lastindex: 0, lastterm: 0] + +# Start reconfiguration to remove n1 and add n4. +propose-conf-change 1 v1=false transition=explicit +r1 v4 +---- +ok + +# Enter joint config. +stabilize log-level=none +---- +ok + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 +2: StateFollower (Voter) Term:1 Lead:1 +3: StateFollower (Voter) Term:1 Lead:1 +4: StateFollower (Voter) Term:1 Lead:1 + +# n4 will propose a transition out of the joint config. +propose-conf-change 4 +---- +ok + +# The group commits the command and everyone switches to the final config. +# n1 steps down as leader. +stabilize +---- +> 4 handling Ready + Ready MustSync=false: + Messages: + 4->1 MsgProp Term:0 Log:0/0 Entries:[0/0 EntryConfChangeV2] +> 1 receiving messages + 4->1 MsgProp Term:0 Log:0/0 Entries:[0/0 EntryConfChangeV2] +> 1 handling Ready + Ready MustSync=true: + Entries: + 1/5 EntryConfChangeV2 + Messages: + 1->2 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] + 1->3 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] + 1->4 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] +> 4 receiving messages + 1->4 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] +> 2 handling Ready + Ready MustSync=true: + Entries: + 1/5 EntryConfChangeV2 + Messages: + 2->1 MsgAppResp Term:1 Log:0/5 +> 3 handling Ready + Ready MustSync=true: + Entries: + 1/5 EntryConfChangeV2 + Messages: + 3->1 MsgAppResp Term:1 Log:0/5 +> 4 handling Ready + Ready MustSync=true: + Entries: + 1/5 EntryConfChangeV2 + Messages: + 4->1 MsgAppResp Term:1 Log:0/5 +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/5 + 3->1 MsgAppResp Term:1 Log:0/5 + 4->1 MsgAppResp Term:1 Log:0/5 +> 1 handling Ready + Ready MustSync=false: + HardState Term:1 Vote:1 Commit:5 + CommittedEntries: + 1/5 EntryConfChangeV2 + Messages: + 1->2 MsgApp Term:1 Log:1/5 Commit:5 + 1->3 MsgApp Term:1 Log:1/5 Commit:5 + 1->4 MsgApp Term:1 Log:1/5 Commit:5 + INFO 1 switched to configuration voters=(2 3 4) + INFO 1 became follower at term 1 +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/5 Commit:5 +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/5 Commit:5 +> 4 receiving messages + 1->4 MsgApp Term:1 Log:1/5 Commit:5 +> 1 handling Ready + Ready MustSync=false: + Lead:0 State:StateFollower +> 2 handling Ready + Ready MustSync=false: + HardState Term:1 Vote:1 Commit:5 + CommittedEntries: + 1/5 EntryConfChangeV2 + Messages: + 2->1 MsgAppResp Term:1 Log:0/5 + INFO 2 switched to configuration voters=(2 3 4) +> 3 handling Ready + Ready MustSync=false: + HardState Term:1 Vote:1 Commit:5 + CommittedEntries: + 1/5 EntryConfChangeV2 + Messages: + 3->1 MsgAppResp Term:1 Log:0/5 + INFO 3 switched to configuration voters=(2 3 4) +> 4 handling Ready + Ready MustSync=false: + HardState Term:1 Commit:5 + CommittedEntries: + 1/5 EntryConfChangeV2 + Messages: + 4->1 MsgAppResp Term:1 Log:0/5 + INFO 4 switched to configuration voters=(2 3 4) +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/5 + 3->1 MsgAppResp Term:1 Log:0/5 + 4->1 MsgAppResp Term:1 Log:0/5 + +# n1 is out of the configuration. +raft-state +---- +1: StateFollower (Non-Voter) Term:1 Lead:0 +2: StateFollower (Voter) Term:1 Lead:1 +3: StateFollower (Voter) Term:1 Lead:1 +4: StateFollower (Voter) Term:1 Lead:1 + +# Make sure n1 cannot campaign to become leader. +campaign 1 +---- +WARN 1 is unpromotable and can not campaign + +# Campaign the dedicated voter n2 to become the new leader. +campaign 2 +---- +INFO 2 is starting a new election at term 1 +INFO 2 became candidate at term 2 +INFO 2 [logterm: 1, index: 5] sent MsgVote request to 3 at term 2 +INFO 2 [logterm: 1, index: 5] sent MsgVote request to 4 at term 2 + +stabilize log-level=none +---- +ok + +raft-state +---- +1: StateFollower (Non-Voter) Term:1 Lead:0 +2: StateLeader (Voter) Term:2 Lead:2 +3: StateFollower (Voter) Term:2 Lead:2 +4: StateFollower (Voter) Term:2 Lead:2