Skip to content

Commit 5540c69

Browse files
committed
Skip peer discovery cleanup when backend returns error
Previously if the peer discovery backend returned an error from failing to discover nodes, the `service_discovery_nodes/0` helper returned an empty list. During cleanup this would mean that any nodes unreachable during a partition would have destructive action taken against them: `rabbit_db_cluster:forget_member/2` and `rabbit_quorum_queue:shrink_all/1`. The `list_nodes/0` callback can fail transiently, though, and a failure shouldn't mean that the cluster is empty. It's safer to avoid cleaning up any nodes when the peer discovery backend fails to return the intended set of nodes.
1 parent 200127c commit 5540c69

File tree

1 file changed

+23
-36
lines changed

1 file changed

+23
-36
lines changed

deps/rabbitmq_peer_discovery_common/src/rabbit_peer_discovery_cleanup.erl

Lines changed: 23 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -240,19 +240,29 @@ maybe_cleanup(State, UnreachableNodes) ->
240240
?LOG_DEBUG(
241241
"Peer discovery: cleanup discovered unreachable nodes: ~tp",
242242
[UnreachableNodes]),
243-
case lists:subtract(as_list(UnreachableNodes), as_list(service_discovery_nodes())) of
244-
[] ->
245-
?LOG_DEBUG(
246-
"Peer discovery: all unreachable nodes are still "
247-
"registered with the discovery backend ~tp",
248-
[rabbit_peer_discovery:backend()],
249-
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
250-
ok;
251-
Nodes ->
252-
?LOG_DEBUG(
253-
"Peer discovery: unreachable nodes are not registered "
254-
"with the discovery backend ~tp", [Nodes]),
255-
maybe_remove_nodes(Nodes, State#state.warn_only)
243+
Module = rabbit_peer_discovery:backend(),
244+
case rabbit_peer_discovery:normalize(Module:list_nodes()) of
245+
{ok, {OneOrMultipleNodes, _Type}} ->
246+
DiscoveredNodes = as_list(OneOrMultipleNodes),
247+
case lists:subtract(UnreachableNodes, DiscoveredNodes) of
248+
[] ->
249+
?LOG_DEBUG(
250+
"Peer discovery: all unreachable nodes are still "
251+
"registered with the discovery backend ~tp",
252+
[rabbit_peer_discovery:backend()],
253+
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
254+
ok;
255+
Nodes ->
256+
?LOG_DEBUG(
257+
"Peer discovery: unreachable nodes are not registered "
258+
"with the discovery backend ~tp", [Nodes]),
259+
maybe_remove_nodes(Nodes, State#state.warn_only)
260+
end;
261+
{error, Reason} ->
262+
?LOG_INFO(
263+
"Peer discovery cleanup: ~tp returned error ~tp",
264+
[Module, Reason]),
265+
ok
256266
end.
257267

258268
%%--------------------------------------------------------------------
@@ -288,26 +298,3 @@ maybe_remove_nodes([Node | Nodes], false) ->
288298
-spec unreachable_nodes() -> [node()].
289299
unreachable_nodes() ->
290300
rabbit_nodes:list_unreachable().
291-
292-
%%--------------------------------------------------------------------
293-
%% @private
294-
%% @doc Return the nodes that the service discovery backend knows about
295-
%% @spec service_discovery_nodes() -> [node()]
296-
%% @end
297-
%%--------------------------------------------------------------------
298-
-spec service_discovery_nodes() -> [node()].
299-
service_discovery_nodes() ->
300-
Module = rabbit_peer_discovery:backend(),
301-
case rabbit_peer_discovery:normalize(Module:list_nodes()) of
302-
{ok, {OneOrMultipleNodes, _Type}} ->
303-
Nodes = as_list(OneOrMultipleNodes),
304-
?LOG_DEBUG(
305-
"Peer discovery cleanup: ~tp returned ~tp",
306-
[Module, Nodes]),
307-
Nodes;
308-
{error, Reason} ->
309-
?LOG_DEBUG(
310-
"Peer discovery cleanup: ~tp returned error ~tp",
311-
[Module, Reason]),
312-
[]
313-
end.

0 commit comments

Comments
 (0)