Skip to content

Commit

Permalink
Cleanup snapshot if snapshot loading fails
Browse files Browse the repository at this point in the history
Summary:
Delete a moved-in snapshot if opening the snapshot fails to avoid
situations where a previous failure leaves behind files that prevents
future snapshots from being moved to the correct locations.

Reviewed By: jaher

Differential Revision: D67121203

fbshipit-source-id: ff996dbce15ddb4bc3db85b89c3065eb12ecac4a
  • Loading branch information
hsun324 authored and facebook-github-bot committed Dec 12, 2024
1 parent 9722dc8 commit ebc2504
Showing 1 changed file with 21 additions and 15 deletions.
36 changes: 21 additions & 15 deletions src/wa_raft_server.erl
Original file line number Diff line number Diff line change
Expand Up @@ -767,21 +767,27 @@ stalled({call, From}, ?SNAPSHOT_AVAILABLE_COMMAND(Root, #raft_log_pos{index = Sn
catch filelib:ensure_dir(Path),
case prim_file:rename(Root, Path) of
ok ->
?LOG_NOTICE("Server[~0p, term ~0p, stalled] applying snapshot ~p:~p",
[Name, CurrentTerm, SnapshotIndex, SnapshotTerm], #{domain => [whatsapp, wa_raft]}),
ok = wa_raft_storage:open_snapshot(Storage, SnapshotPos),
{ok, View1} = wa_raft_log:reset(View0, SnapshotPos),
State1 = State0#raft_state{log_view = View1, last_applied = SnapshotIndex, commit_index = SnapshotIndex},
State2 = load_config(State1),
?LOG_NOTICE("Server[~0p, term ~0p, stalled] switching to follower after installing snapshot at ~p:~p.",
[Name, CurrentTerm, SnapshotIndex, SnapshotTerm], #{domain => [whatsapp, wa_raft]}),
State3 = case SnapshotTerm > CurrentTerm of
true -> advance_term(?FUNCTION_NAME, SnapshotTerm, undefined, State2);
false -> State2
end,
% At this point, we assume that we received some cluster membership configuration from
% our peer so it is safe to transition to an operational state.
{next_state, follower, State3, [{reply, From, ok}]};
try
?LOG_NOTICE("Server[~0p, term ~0p, stalled] applying snapshot ~p:~p",
[Name, CurrentTerm, SnapshotIndex, SnapshotTerm], #{domain => [whatsapp, wa_raft]}),
ok = wa_raft_storage:open_snapshot(Storage, SnapshotPos),
{ok, View1} = wa_raft_log:reset(View0, SnapshotPos),
State1 = State0#raft_state{log_view = View1, last_applied = SnapshotIndex, commit_index = SnapshotIndex},
State2 = load_config(State1),
?LOG_NOTICE("Server[~0p, term ~0p, stalled] switching to follower after installing snapshot at ~p:~p.",
[Name, CurrentTerm, SnapshotIndex, SnapshotTerm], #{domain => [whatsapp, wa_raft]}),
State3 = case SnapshotTerm > CurrentTerm of
true -> advance_term(?FUNCTION_NAME, SnapshotTerm, undefined, State2);
false -> State2
end,
% At this point, we assume that we received some cluster membership configuration from
% our peer so it is safe to transition to an operational state.
{next_state, follower, State3, [{reply, From, ok}]}
after
% It is assumed that the loading of the snapshot will move the snapshot away or
% otherwise disassociate the storage state from the snapshot path.
catch file:del_dir_r(Path)
end;
{error, Reason} ->
?LOG_WARNING("Server[~0p, term ~0p, stalled] failed to rename available snapshot ~p to ~p due to ~p",
[Name, CurrentTerm, Root, Path, Reason], #{domain => [whatsapp, wa_raft]}),
Expand Down

0 comments on commit ebc2504

Please sign in to comment.