Skip to content

Commit ebc2504

Browse files
hsun324facebook-github-bot
authored and committed
Cleanup snapshot if snapshot loading fails
Summary: Delete a moved-in snapshot if opening the snapshot fails, to avoid situations where a previous failure leaves behind files that prevent future snapshots from being moved to the correct locations. Reviewed By: jaher Differential Revision: D67121203 fbshipit-source-id: ff996dbce15ddb4bc3db85b89c3065eb12ecac4a
1 parent 9722dc8 commit ebc2504

File tree

1 file changed

+21
-15
lines changed

1 file changed

+21
-15
lines changed

src/wa_raft_server.erl

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -767,21 +767,27 @@ stalled({call, From}, ?SNAPSHOT_AVAILABLE_COMMAND(Root, #raft_log_pos{index = Sn
767767
catch filelib:ensure_dir(Path),
768768
case prim_file:rename(Root, Path) of
769769
ok ->
770-
?LOG_NOTICE("Server[~0p, term ~0p, stalled] applying snapshot ~p:~p",
771-
[Name, CurrentTerm, SnapshotIndex, SnapshotTerm], #{domain => [whatsapp, wa_raft]}),
772-
ok = wa_raft_storage:open_snapshot(Storage, SnapshotPos),
773-
{ok, View1} = wa_raft_log:reset(View0, SnapshotPos),
774-
State1 = State0#raft_state{log_view = View1, last_applied = SnapshotIndex, commit_index = SnapshotIndex},
775-
State2 = load_config(State1),
776-
?LOG_NOTICE("Server[~0p, term ~0p, stalled] switching to follower after installing snapshot at ~p:~p.",
777-
[Name, CurrentTerm, SnapshotIndex, SnapshotTerm], #{domain => [whatsapp, wa_raft]}),
778-
State3 = case SnapshotTerm > CurrentTerm of
779-
true -> advance_term(?FUNCTION_NAME, SnapshotTerm, undefined, State2);
780-
false -> State2
781-
end,
782-
% At this point, we assume that we received some cluster membership configuration from
783-
% our peer so it is safe to transition to an operational state.
784-
{next_state, follower, State3, [{reply, From, ok}]};
770+
try
771+
?LOG_NOTICE("Server[~0p, term ~0p, stalled] applying snapshot ~p:~p",
772+
[Name, CurrentTerm, SnapshotIndex, SnapshotTerm], #{domain => [whatsapp, wa_raft]}),
773+
ok = wa_raft_storage:open_snapshot(Storage, SnapshotPos),
774+
{ok, View1} = wa_raft_log:reset(View0, SnapshotPos),
775+
State1 = State0#raft_state{log_view = View1, last_applied = SnapshotIndex, commit_index = SnapshotIndex},
776+
State2 = load_config(State1),
777+
?LOG_NOTICE("Server[~0p, term ~0p, stalled] switching to follower after installing snapshot at ~p:~p.",
778+
[Name, CurrentTerm, SnapshotIndex, SnapshotTerm], #{domain => [whatsapp, wa_raft]}),
779+
State3 = case SnapshotTerm > CurrentTerm of
780+
true -> advance_term(?FUNCTION_NAME, SnapshotTerm, undefined, State2);
781+
false -> State2
782+
end,
783+
% At this point, we assume that we received some cluster membership configuration from
784+
% our peer so it is safe to transition to an operational state.
785+
{next_state, follower, State3, [{reply, From, ok}]}
786+
after
787+
% It is assumed that the loading of the snapshot will move the snapshot away or
788+
% otherwise disassociate the storage state from the snapshot path.
789+
catch file:del_dir_r(Path)
790+
end;
785791
{error, Reason} ->
786792
?LOG_WARNING("Server[~0p, term ~0p, stalled] failed to rename available snapshot ~p to ~p due to ~p",
787793
[Name, CurrentTerm, Root, Path, Reason], #{domain => [whatsapp, wa_raft]}),

0 commit comments

Comments
 (0)