Skip to content

Commit

Permalink
diffusion: IOError rethrow policy
Browse files Browse the repository at this point in the history
  • Loading branch information
coot committed Feb 13, 2025
1 parent d0f1f93 commit 4512e4b
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 60 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ import Ouroboros.Network.ConnectionId
import Ouroboros.Network.ConnectionManager.Core qualified as CM
import Ouroboros.Network.ConnectionManager.State qualified as CM
import Ouroboros.Network.ConnectionManager.Types
import Ouroboros.Network.Diffusion.P2P (isFatal)
import Ouroboros.Network.ExitPolicy (RepromoteDelay (..))
import Ouroboros.Network.InboundGovernor qualified as IG
import Ouroboros.Network.Mock.ConcreteBlock (BlockHeader)
Expand Down Expand Up @@ -1731,7 +1730,6 @@ unit_4191 = testWithIOSim prop_diffusion_dns_can_recover 125000 absInfo script
--
prop_connect_failure :: AbsIOError -> Property
prop_connect_failure (AbsIOError ioerr) =
label (if isFatal (ioe_type ioerr) then "fatal IOError" else "non-fatal IOError") $
testWithIOSim
(\trace _ ->
let -- events generated by `nodeAddr`
Expand All @@ -1749,18 +1747,10 @@ prop_connect_failure (AbsIOError ioerr) =
evs = eventsToList (selectDiffusionSimulationTrace events)
in counterexample (Trace.ppTrace show (ppSimEvent 0 0 0) . Trace.take noEvents $ trace)
. counterexample (show evs)
. (if isFatal (ioe_type ioerr)
then -- verify that the node was killed by the right exception
any (\case
TrErrored e | Just (ExceptionInHandler _ e') <- fromException e
, Just e'' <- fromException e'
, e'' == ioerr
-> True
_ -> False)
else -- verify that the ndoe was not killed by the `ioerr` exception
all (\case
TrErrored {} -> False
_ -> True)
. ( -- verify that the node was not killed by the `IOError`
all (\case
TrErrored {} -> False
_ -> True)
)
. map snd
$ evs
Expand Down
68 changes: 22 additions & 46 deletions ouroboros-network/src/Ouroboros/Network/Diffusion/P2P.hs
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ module Ouroboros.Network.Diffusion.P2P
, NodeToNodeConnectionManager
, NodeToNodePeerConnectionHandle
, NodeToNodePeerSelectionActions
-- * fatal io errors
, isFatal
-- * Re-exports
, AbstractTransitionTrace
, IG.RemoteTransitionTrace
Expand Down Expand Up @@ -62,11 +60,10 @@ import Data.Maybe (catMaybes)
import Data.Set (Set)
import Data.Typeable (Proxy (..), Typeable)
import Data.Void (Void)
import Network.Socket (Socket)
import GHC.IO.Exception (IOErrorType (..), IOException (..))
import System.Exit (ExitCode)
import System.Random (StdGen, newStdGen, split)

import Network.Socket (Socket)
import Network.Socket qualified as Socket
import Network.Mux qualified as Mx

Expand Down Expand Up @@ -795,11 +792,28 @@ runM Interfaces
Just (_ :: IOManagerError) -> ShutdownNode
Nothing -> mempty)
<>
-- IOError rethrow-policy
--
-- After a critical bug, we decided that `IOError` policy should only
-- kill the connection which thrown it. `IOError`s are not propagated.
-- There's a risk that one could arm an attack if one discovers
-- a mechanism to trigger fatal `IOError`s, e.g. through a kernel bug.
--
-- It is responsibility for an SPO to monitor the node if it is making
-- progress and have enough resources to do so, e.g. if it has enough
-- memory, file descriptors.
--
-- The `ouroboros-network` guarantees running on a fixed number of file
-- descriptors given a topology file, see
-- https://github.com/IntersectMBO/ouroboros-network/issues/4585#issuecomment-1591777447
-- There's also a calculation for `ouroboros-consensus`, see
-- https://github.com/IntersectMBO/ouroboros-consensus/issues/20#issuecomment-1514554680
-- File descriptors could be drained by the tracing system in
-- `cardano-node` (such a bug existed), or even an external process.
--
RethrowPolicy (\_ctx err ->
case fromException err of
Just IOError { ioe_type } ->
if isFatal ioe_type then ShutdownNode
else mempty
case fromException err :: Maybe IOException of
Just {} -> mempty
Nothing -> mempty)
<>
RethrowPolicy (\ctx err -> case (ctx, fromException err) of
Expand Down Expand Up @@ -1438,44 +1452,6 @@ run sigUSR1Signal tracers tracersExtra args argsExtra apps appsExtra = do
tracers tracersExtra args argsExtra apps appsExtra


--
-- Fatal Errors
--
-- If we are out of file descriptors (either because we exhausted
-- process or system limit) we should shut down the node and let the
-- operator investigate.
--
-- Refs:
-- * https://hackage.haskell.org/package/ghc-internal-9.1001.0/docs/src/GHC.Internal.Foreign.C.Error.html#errnoToIOError
-- * man socket.2
-- * man connect.2
-- * man accept.2
-- NOTE: many `connect` and `accept` exceptions are classified as
-- `OtherError`, here we only distinguish fatal IO errors (e.g.
-- ones that propagate to the main thread).
-- NOTE: we don't use the rethrow policy for `accept` calls, where
-- all but `ECONNABORTED` are fatal exceptions.
--
-- NOTE: UnsupportedOperation error type is not considered fatal since it can
-- happen on a race condition between the connect and accept call between two
-- nodes that have each other as local roots.
--
isFatal :: IOErrorType -> Bool
isFatal ResourceExhausted = True
-- EAGAIN -- connect, accept
-- EMFILE -- socket, accept
-- ENFILE -- socket, accept
-- ENOBUFS -- socket, accept
-- ENOMEM -- socket, accept
isFatal InvalidArgument = True
-- EINVAL -- socket, accept
-- ENOTSOCK -- connect
-- EBADF -- connect, accept
isFatal ProtocolError = True
-- EPROTONOSUPPOPRT -- socket
-- EPROTO -- accept
isFatal _ = False

--
-- Data flow
--
Expand Down

0 comments on commit 4512e4b

Please sign in to comment.