From 4f2e602777bd8f746c16bce61fa75e968889afa8 Mon Sep 17 00:00:00 2001 From: Tavis Rudd Date: Wed, 23 Apr 2014 23:07:45 -0700 Subject: [PATCH 1/4] add logging to failures caught in doRestartChild The `Left` branch of doStartChild is unrecoverable. It currently drops the affected child without logging. --- src/Control/Distributed/Process/Platform/Supervisor.hs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Control/Distributed/Process/Platform/Supervisor.hs b/src/Control/Distributed/Process/Platform/Supervisor.hs index ce2fc3e..a40a18a 100644 --- a/src/Control/Distributed/Process/Platform/Supervisor.hs +++ b/src/Control/Distributed/Process/Platform/Supervisor.hs @@ -1169,13 +1169,18 @@ doRestartChild _ spec _ state = do -- TODO: use ProcessId and DiedReason to log case start' of Right (ref, st') -> do return $ markActive st' ref spec - Left _ -> do -- TODO: handle this by policy + Left err -> do -- TODO: handle this by policy -- All child failures are handled via monitor signals, apart from - -- BadClosure, which comes back from doStartChild as (Left err). + -- BadClosure and UnresolvableAddress from the StarterProcess + -- variants of ChildStart, which both come back from + -- doStartChild as (Left err). -- Since we cannot recover from that, there's no point in trying -- to start this child again (as the closure will never resolve), -- so we remove the child forthwith. We should provide a policy -- for handling this situation though... + sup <- getSelfPid + logEntry Log.error $ + mkReport "Unrecoverable error in child" sup (childKey spec) (show err) return $ ( (active ^: Map.filter (/= chKey)) . (bumpStats Active chType decrement) . (bumpStats Specified chType decrement) From 365c517a764ecc0a26693937a58dd600eb3d5c7e Mon Sep 17 00:00:00 2001 From: Tavis Rudd Date: Fri, 25 Apr 2014 00:00:09 -0700 Subject: [PATCH 2/4] kill supervisor upon unrecoverable errors in child See discussion on PR #87. --- src/Control/Distributed/Process/Platform/Supervisor.hs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Control/Distributed/Process/Platform/Supervisor.hs b/src/Control/Distributed/Process/Platform/Supervisor.hs index a40a18a..9b2d193 100644 --- a/src/Control/Distributed/Process/Platform/Supervisor.hs +++ b/src/Control/Distributed/Process/Platform/Supervisor.hs @@ -1169,19 +1169,21 @@ doRestartChild _ spec _ state = do -- TODO: use ProcessId and DiedReason to log case start' of Right (ref, st') -> do return $ markActive st' ref spec - Left err -> do -- TODO: handle this by policy + Left err -> do -- All child failures are handled via monitor signals, apart from -- BadClosure and UnresolvableAddress from the StarterProcess -- variants of ChildStart, which both come back from -- doStartChild as (Left err). -- Since we cannot recover from that, there's no point in trying -- to start this child again (as the closure will never resolve), - -- so we remove the child forthwith. We should provide a policy - -- for handling this situation though... + -- so we remove the child forthwith. sup <- getSelfPid logEntry Log.error $ mkReport "Unrecoverable error in child" sup (childKey spec) (show err) - return $ ( (active ^: Map.filter (/= chKey)) + if (isTemporary (childRestart spec)) + -- TODO: convert this to a meaningful exception type + then die $ "Unrecoverable error in child " ++ (childKey spec) + else return $ ( (active ^: Map.filter (/= chKey)) . (bumpStats Active chType decrement) . (bumpStats Specified chType decrement) $ removeChild spec st From 6656a924fed1d2b80444a8556f37c9a166f474f9 Mon Sep 17 00:00:00 2001 From: Tavis Rudd Date: Fri, 25 Apr 2014 00:21:03 -0700 Subject: [PATCH 3/4] fix reversed if case + comment in doRestartChild --- .../Distributed/Process/Platform/Supervisor.hs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/Control/Distributed/Process/Platform/Supervisor.hs b/src/Control/Distributed/Process/Platform/Supervisor.hs index 9b2d193..ec44d67 100644 --- a/src/Control/Distributed/Process/Platform/Supervisor.hs +++ b/src/Control/Distributed/Process/Platform/Supervisor.hs @@ -1174,20 +1174,18 @@ doRestartChild _ spec _ state = do -- TODO: use ProcessId and DiedReason to log -- BadClosure and UnresolvableAddress from the StarterProcess -- variants of ChildStart, which both come back from -- doStartChild as (Left err). - -- Since we cannot recover from that, there's no point in trying - -- to start this child again (as the closure will never resolve), - -- so we remove the child forthwith. sup <- getSelfPid logEntry Log.error $ mkReport "Unrecoverable error in child" sup (childKey spec) (show err) - if (isTemporary (childRestart spec)) - -- TODO: convert this to a meaningful exception type - then die $ "Unrecoverable error in child " ++ (childKey spec) - else return $ ( (active ^: Map.filter (/= chKey)) + if isTemporary (childRestart spec) + then return $ ( (active ^: Map.filter (/= chKey)) . (bumpStats Active chType decrement) . (bumpStats Specified chType decrement) $ removeChild spec st ) + else die $ "Unrecoverable error in child " ++ (childKey spec) + -- TODO: convert this to a meaningful exception type + where chKey = childKey spec chType = childType spec From a208ea26914410d0a15c500ba97c489abd6482c7 Mon Sep 17 00:00:00 2001 From: Tavis Rudd Date: Tue, 6 May 2014 12:16:43 -0700 Subject: [PATCH 4/4] replace brutal `die` in doRestartChild with `stopWith` See discussion on 6656a92. --- .../Process/Platform/Supervisor.hs | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/Control/Distributed/Process/Platform/Supervisor.hs b/src/Control/Distributed/Process/Platform/Supervisor.hs index ec44d67..5d1a89b 100644 --- a/src/Control/Distributed/Process/Platform/Supervisor.hs +++ b/src/Control/Distributed/Process/Platform/Supervisor.hs @@ -1149,14 +1149,14 @@ tryRestartChild pid st active' spec reason | True <- isTemporary (childRestart spec) = continue childRemoved | DiedNormal <- reason , True <- isIntrinsic (childRestart spec) = stopWith updateStopped ExitNormal - | otherwise = continue =<< doRestartChild pid spec reason st + | otherwise = doRestartChild pid spec reason st where childDown = (active ^= active') $ updateStopped childRemoved = (active ^= active') $ removeChild spec st updateStopped = maybe st id $ updateChild chKey (setChildStopped False) st chKey = childKey spec -doRestartChild :: ProcessId -> ChildSpec -> DiedReason -> State -> Process State +doRestartChild :: ProcessId -> ChildSpec -> DiedReason -> State -> Process (ProcessAction State) doRestartChild _ spec _ state = do -- TODO: use ProcessId and DiedReason to log state' <- addRestart state case state' of @@ -1167,23 +1167,26 @@ doRestartChild _ spec _ state = do -- TODO: use ProcessId and DiedReason to log Just st -> do start' <- doStartChild spec st case start' of - Right (ref, st') -> do - return $ markActive st' ref spec + Right (ref, st') -> continue $ markActive st' ref spec Left err -> do -- All child failures are handled via monitor signals, apart from -- BadClosure and UnresolvableAddress from the StarterProcess -- variants of ChildStart, which both come back from -- doStartChild as (Left err). sup <- getSelfPid - logEntry Log.error $ - mkReport "Unrecoverable error in child" sup (childKey spec) (show err) if isTemporary (childRestart spec) - then return $ ( (active ^: Map.filter (/= chKey)) + then do + logEntry Log.warning $ + mkReport "Error in temporary child" sup (childKey spec) (show err) + continue $ ( (active ^: Map.filter (/= chKey)) . (bumpStats Active chType decrement) . (bumpStats Specified chType decrement) - $ removeChild spec st - ) - else die $ "Unrecoverable error in child " ++ (childKey spec) + $ removeChild spec st) + else do + logEntry Log.error $ + mkReport "Unrecoverable error in child. Stopping supervisor" + sup (childKey spec) (show err) + stopWith st $ ExitOther $ "Unrecoverable error in child " ++ (childKey spec) -- TODO: convert this to a meaningful exception type where