From 335449a210674b7fb4792fb58acda7ab3cb9b8c0 Mon Sep 17 00:00:00 2001 From: Marcin Szamotulski Date: Thu, 6 Feb 2025 12:39:30 +0100 Subject: [PATCH] diffusion: IOError rethrow policy --- .../Network/Diffusion/Testnet/Cardano.hs | 18 ++--- .../src/Ouroboros/Network/Diffusion/P2P.hs | 68 ++++++------------- 2 files changed, 26 insertions(+), 60 deletions(-) diff --git a/ouroboros-network/sim-tests-lib/Test/Ouroboros/Network/Diffusion/Testnet/Cardano.hs b/ouroboros-network/sim-tests-lib/Test/Ouroboros/Network/Diffusion/Testnet/Cardano.hs index 71b6b604a4f..09d2bb26aa7 100644 --- a/ouroboros-network/sim-tests-lib/Test/Ouroboros/Network/Diffusion/Testnet/Cardano.hs +++ b/ouroboros-network/sim-tests-lib/Test/Ouroboros/Network/Diffusion/Testnet/Cardano.hs @@ -64,7 +64,6 @@ import Ouroboros.Network.ConnectionId import Ouroboros.Network.ConnectionManager.Core qualified as CM import Ouroboros.Network.ConnectionManager.State qualified as CM import Ouroboros.Network.ConnectionManager.Types -import Ouroboros.Network.Diffusion.P2P (isFatal) import Ouroboros.Network.ExitPolicy (RepromoteDelay (..)) import Ouroboros.Network.InboundGovernor qualified as IG import Ouroboros.Network.Mock.ConcreteBlock (BlockHeader) @@ -1731,7 +1730,6 @@ unit_4191 = testWithIOSim prop_diffusion_dns_can_recover 125000 absInfo script -- prop_connect_failure :: AbsIOError -> Property prop_connect_failure (AbsIOError ioerr) = - label (if isFatal (ioe_type ioerr) then "fatal IOError" else "non-fatal IOError") $ testWithIOSim (\trace _ -> let -- events generated by `nodeAddr` @@ -1749,18 +1747,10 @@ prop_connect_failure (AbsIOError ioerr) = evs = eventsToList (selectDiffusionSimulationTrace events) in counterexample (Trace.ppTrace show (ppSimEvent 0 0 0) . Trace.take noEvents $ trace) . counterexample (show evs) - . (if isFatal (ioe_type ioerr) - then -- verify that the node was killed by the right exception - any (\case - TrErrored e | Just (ExceptionInHandler _ e') <- fromException e - , Just e'' <- fromException e' - , e'' == ioerr - -> True - _ -> False) - else -- verify that the ndoe was not killed by the `ioerr` exception - all (\case - TrErrored {} -> False - _ -> True) + . ( -- verify that the node was not killed by the `IOError` + all (\case + TrErrored {} -> False + _ -> True) ) . map snd $ evs diff --git a/ouroboros-network/src/Ouroboros/Network/Diffusion/P2P.hs b/ouroboros-network/src/Ouroboros/Network/Diffusion/P2P.hs index fda448b03bc..ad69a746b04 100644 --- a/ouroboros-network/src/Ouroboros/Network/Diffusion/P2P.hs +++ b/ouroboros-network/src/Ouroboros/Network/Diffusion/P2P.hs @@ -30,8 +30,6 @@ module Ouroboros.Network.Diffusion.P2P , NodeToNodeConnectionManager , NodeToNodePeerConnectionHandle , NodeToNodePeerSelectionActions - -- * fatal io errors - , isFatal -- * Re-exports , AbstractTransitionTrace , IG.RemoteTransitionTrace @@ -62,11 +60,10 @@ import Data.Maybe (catMaybes) import Data.Set (Set) import Data.Typeable (Proxy (..), Typeable) import Data.Void (Void) -import Network.Socket (Socket) -import GHC.IO.Exception (IOErrorType (..), IOException (..)) import System.Exit (ExitCode) import System.Random (StdGen, newStdGen, split) +import Network.Socket (Socket) import Network.Socket qualified as Socket import Network.Mux qualified as Mx @@ -795,11 +792,28 @@ runM Interfaces Just (_ :: IOManagerError) -> ShutdownNode Nothing -> mempty) <> + -- IOError rethrow-policy + -- + -- After a critical bug, we decided that `IOError` policy should only + -- kill the connection which thrown it. `IOError`s are not propagated. + -- There's a risk that one could arm an attack if one discovers + -- a mechanism to trigger fatal `IOError`s, e.g. through a kernel bug. + -- + -- It is responsibility for an SPO to monitor the node if it is making + -- progress and have enough resources to do so, e.g. if it has enough + -- memory, file descriptors. + -- + -- The `ouroboros-network` guarantees running on a fixed number of file + -- descriptors given a topology file, see + -- https://github.com/IntersectMBO/ouroboros-network/issues/4585#issuecomment-1591777447 + -- There's also a calculation for `ouroboros-consensus`, see + -- https://github.com/IntersectMBO/ouroboros-consensus/issues/20#issuecomment-1514554680 + -- File descriptors could be drained by the tracing system in + -- `cardano-node` (such a bug existed), or even an external process. + -- RethrowPolicy (\_ctx err -> - case fromException err of - Just IOError { ioe_type } -> - if isFatal ioe_type then ShutdownNode - else mempty + case fromException err :: Maybe IOException of + Just {} -> mempty Nothing -> mempty) <> RethrowPolicy (\ctx err -> case (ctx, fromException err) of @@ -1438,44 +1452,6 @@ run sigUSR1Signal tracers tracersExtra args argsExtra apps appsExtra = do tracers tracersExtra args argsExtra apps appsExtra --- --- Fatal Errors --- --- If we are out of file descriptors (either because we exhausted --- process or system limit) we should shut down the node and let the --- operator investigate. --- --- Refs: --- * https://hackage.haskell.org/package/ghc-internal-9.1001.0/docs/src/GHC.Internal.Foreign.C.Error.html#errnoToIOError --- * man socket.2 --- * man connect.2 --- * man accept.2 --- NOTE: many `connect` and `accept` exceptions are classified as --- `OtherError`, here we only distinguish fatal IO errors (e.g. --- ones that propagate to the main thread). --- NOTE: we don't use the rethrow policy for `accept` calls, where --- all but `ECONNABORTED` are fatal exceptions. --- --- NOTE: UnsupportedOperation error type is not considered fatal since it can --- happen on a race condition between the connect and accept call between two --- nodes that have each other as local roots. --- -isFatal :: IOErrorType -> Bool -isFatal ResourceExhausted = True - -- EAGAIN -- connect, accept - -- EMFILE -- socket, accept - -- ENFILE -- socket, accept - -- ENOBUFS -- socket, accept - -- ENOMEM -- socket, accept -isFatal InvalidArgument = True - -- EINVAL -- socket, accept - -- ENOTSOCK -- connect - -- EBADF -- connect, accept -isFatal ProtocolError = True - -- EPROTONOSUPPOPRT -- socket - -- EPROTO -- accept -isFatal _ = False - -- -- Data flow --