frankbria · timothy-20 · Feb 28, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -176,7 +176,7 @@ tmux attach -t <session-name>
 
 ### Running Tests
 ```bash
-# Run all tests (568 tests)
+# Run all tests (573 tests)
 npm test
 
 # Run specific test suites
@@ -519,13 +519,13 @@ Ralph uses a multi-layered strategy to prevent Claude from accidentally deleting
 
 ## Test Suite
 
-### Test Files (568 tests total)
+### Test Files (573 tests total)
 
 | File | Tests | Description |
 |------|-------|-------------|
 | `test_circuit_breaker_recovery.bats` | 19 | Cooldown timer, auto-reset, parse_iso_to_epoch, CLI flag (Issue #160) |
 | `test_cli_parsing.bats` | 35 | CLI argument parsing for all flags + monitor parameter forwarding |
-| `test_cli_modern.bats` | 68 | Modern CLI commands (Phase 1.1) + build_claude_command fix + live mode text format fix (#164) + errexit pipeline guard (#175) + ALLOWED_TOOLS tightening (#149) + API limit false positive detection (#183) + Claude CLI command validation (#97) + stale call counter fix (#196) |
+| `test_cli_modern.bats` | 73 | Modern CLI commands (Phase 1.1) + build_claude_command fix + live mode text format fix (#164) + ALLOWED_TOOLS tightening (#149) + API limit false positive detection (#183) + Claude CLI command validation (#97) + stale call counter fix (#196) + set-e removal with explicit error handling (#208) + stderr separation (#190) |
 | `test_json_parsing.bats` | 52 | JSON output format parsing + Claude CLI format + session management + array format |
 | `test_session_continuity.bats` | 44 | Session lifecycle management + expiration + circuit breaker integration + issue #91 fix |
 | `test_exit_detection.bats` | 53 | Exit signal detection + EXIT_SIGNAL-based completion indicators + progress detection |

diff --git a/ralph_loop.sh b/ralph_loop.sh
@@ -3,20 +3,18 @@
 # Claude Code Ralph Loop with Rate Limiting and Documentation
 # Adaptation of the Ralph technique for Claude Code with usage management
 
-set -e  # Exit on any error
-
 # Note: CLAUDE_CODE_ENABLE_DANGEROUS_PERMISSIONS_IN_SANDBOX and IS_SANDBOX
 # environment variables are NOT exported here. Tool restrictions are handled
 # via --allowedTools flag in CLAUDE_CMD_ARGS, which is the proper approach.
 # Exporting sandbox variables without a verified sandbox would be misleading.
 
 # Source library components
 SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
-source "$SCRIPT_DIR/lib/date_utils.sh"
-source "$SCRIPT_DIR/lib/timeout_utils.sh"
-source "$SCRIPT_DIR/lib/response_analyzer.sh"
-source "$SCRIPT_DIR/lib/circuit_breaker.sh"
-source "$SCRIPT_DIR/lib/file_protection.sh"
+source "$SCRIPT_DIR/lib/date_utils.sh" || { echo "FATAL: Failed to source lib/date_utils.sh" >&2; exit 1; }
+source "$SCRIPT_DIR/lib/timeout_utils.sh" || { echo "FATAL: Failed to source lib/timeout_utils.sh" >&2; exit 1; }
+source "$SCRIPT_DIR/lib/response_analyzer.sh" || { echo "FATAL: Failed to source lib/response_analyzer.sh" >&2; exit 1; }
+source "$SCRIPT_DIR/lib/circuit_breaker.sh" || { echo "FATAL: Failed to source lib/circuit_breaker.sh" >&2; exit 1; }
+source "$SCRIPT_DIR/lib/file_protection.sh" || { echo "FATAL: Failed to source lib/file_protection.sh" >&2; exit 1; }
 
 # Configuration
 # Ralph-specific files live in .ralph/ subfolder
@@ -571,10 +569,8 @@ should_exit_gracefully() {
     # Fix #144: Only match valid markdown checkboxes, not date entries like [2026-01-29]
     # Valid patterns: "- [ ]" (uncompleted) and "- [x]" or "- [X]" (completed)
     if [[ -f "$RALPH_DIR/fix_plan.md" ]]; then
-        local uncompleted_items=$(grep -cE "^[[:space:]]*- \[ \]" "$RALPH_DIR/fix_plan.md" 2>/dev/null || true)
-        [[ -z "$uncompleted_items" ]] && uncompleted_items=0
-        local completed_items=$(grep -cE "^[[:space:]]*- \[[xX]\]" "$RALPH_DIR/fix_plan.md" 2>/dev/null || true)
-        [[ -z "$completed_items" ]] && completed_items=0
+        local uncompleted_items=$(grep -cE "^[[:space:]]*- \[ \]" "$RALPH_DIR/fix_plan.md" 2>/dev/null); uncompleted_items=${uncompleted_items:-0}  # grep -c already prints 0 on no match (exit 1); "|| echo 0" would yield "0\n0"
+        local completed_items=$(grep -cE "^[[:space:]]*- \[[xX]\]" "$RALPH_DIR/fix_plan.md" 2>/dev/null); completed_items=${completed_items:-0}  # the :-0 default only covers empty output (e.g. unreadable file)
         local total_items=$((uncompleted_items + completed_items))
 
         if [[ $total_items -gt 0 ]] && [[ $completed_items -eq $total_items ]]; then
@@ -680,8 +676,7 @@ build_loop_context() {
     # Extract incomplete tasks from fix_plan.md
     # Bug #3 Fix: Support indented markdown checkboxes with [[:space:]]* pattern
     if [[ -f "$RALPH_DIR/fix_plan.md" ]]; then
-        local incomplete_tasks=$(grep -cE "^[[:space:]]*- \[ \]" "$RALPH_DIR/fix_plan.md" 2>/dev/null || true)
-        [[ -z "$incomplete_tasks" ]] && incomplete_tasks=0
+        local incomplete_tasks=$(grep -cE "^[[:space:]]*- \[ \]" "$RALPH_DIR/fix_plan.md" 2>/dev/null); incomplete_tasks=${incomplete_tasks:-0}  # grep -c prints 0 itself on no match (exit 1); "|| echo 0" would duplicate it
         context+="Remaining tasks: ${incomplete_tasks}. "
     fi
 
@@ -876,8 +871,8 @@ reset_session() {
     # Clear response analysis to prevent stale EXIT_SIGNAL from previous session
     rm -f "$RESPONSE_ANALYSIS_FILE" 2>/dev/null
 
-    # Log the session transition (non-fatal to prevent script exit under set -e)
-    log_session_transition "active" "reset" "$reason" "${loop_count:-0}" || true
+    # Log the session transition
+    log_session_transition "active" "reset" "$reason" "${loop_count:-0}"
 
     log_status "INFO" "Session reset: $reason"
 }
@@ -1217,17 +1212,14 @@ execute_claude_code() {
         # Capture all pipeline exit codes for proper error handling
         # stdin must be redirected from /dev/null because newer Claude CLI versions
         # read from stdin even in -p (print) mode, causing the process to hang
-        # Disable errexit for pipeline - timeout returns non-zero exit code 124
-        # which would cause set -e to silently kill the entire script (Issue #175)
-        set +e
-        set -o pipefail
+        # Redirect stderr to separate file to prevent Node.js warnings (e.g., UNDICI)
+        # from corrupting the jq JSON pipeline (Issue #190)
+        local stderr_file="${LOG_DIR}/claude_stderr_$(date '+%Y%m%d_%H%M%S').log"
         portable_timeout ${timeout_seconds}s stdbuf -oL "${LIVE_CMD_ARGS[@]}" \
-            < /dev/null 2>&1 | stdbuf -oL tee "$output_file" | stdbuf -oL jq --unbuffered -j "$jq_filter" 2>/dev/null | tee "$LIVE_LOG_FILE"
+            < /dev/null 2>"$stderr_file" | stdbuf -oL tee "$output_file" | stdbuf -oL jq --unbuffered -j "$jq_filter" 2>/dev/null | tee "$LIVE_LOG_FILE"
 
         # Capture exit codes from pipeline
         local -a pipe_status=("${PIPESTATUS[@]}")
-        set +o pipefail
-        set -e  # Re-enable errexit now that exit codes are captured
 
         # Primary exit code is from Claude/timeout (first command in pipeline)
         exit_code=${pipe_status[0]}
@@ -1237,6 +1229,13 @@ execute_claude_code() {
             log_status "WARN" "Claude Code execution timed out after ${CLAUDE_TIMEOUT_MINUTES} minutes"
         fi
 
+        # Log stderr if non-empty, clean up empty stderr files
+        if [[ -s "$stderr_file" ]]; then
+            log_status "WARN" "Claude CLI wrote to stderr (see: $stderr_file)"
+        else
+            rm -f "$stderr_file" 2>/dev/null
+        fi
+
         # Check for tee failures (second command) - could break logging/session
         if [[ ${pipe_status[1]} -ne 0 ]]; then
             log_status "WARN" "Failed to write stream output to log file (exit code ${pipe_status[1]})"
@@ -1407,11 +1406,16 @@ EOF
         analyze_response "$output_file" "$loop_count"
         local analysis_exit_code=$?
 
-        # Update exit signals based on analysis
-        update_exit_signals
+        if [[ $analysis_exit_code -eq 0 ]]; then
+            # Update exit signals based on analysis
+            update_exit_signals
 
-        # Log analysis summary
-        log_analysis_summary
+            # Log analysis summary
+            log_analysis_summary
+        else
+            log_status "WARN" "Response analysis failed (exit $analysis_exit_code); skipping signal updates"
+            rm -f "$RESPONSE_ANALYSIS_FILE"
+        fi
 
         # Get file change count for circuit breaker
         # Fix #141: Detect both uncommitted changes AND committed changes
@@ -1518,10 +1522,20 @@ EOF
 
 # Cleanup function
 cleanup() {
-    log_status "INFO" "Ralph loop interrupted. Cleaning up..."
-    reset_session "manual_interrupt"
-    update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "interrupted" "stopped"
-    exit 0
+    local trap_exit_code=$?  # must stay the first statement: any earlier command would overwrite $?
+
+    # Reentrancy guard — prevent double execution from EXIT + signal combination
+    if [[ "$_CLEANUP_DONE" == "true" ]]; then return; fi
+    _CLEANUP_DONE=true
+
+    # Only record "interrupted" status for abnormal exits (non-zero exit code) once loop_count > 0
+    # Normal exit (code 0) preserves the status already written by the main loop
+    if [[ $loop_count -gt 0 && $trap_exit_code -ne 0 ]]; then
+        log_status "INFO" "Ralph loop interrupted. Cleaning up..."
+        reset_session "manual_interrupt"
+        update_status "$loop_count" "$(cat "$CALL_COUNT_FILE" 2>/dev/null || echo "0")" "interrupted" "stopped"
+    fi
+    # No exit here — NOTE(review): if this is also the SIGINT/SIGTERM handler, returning resumes the script; confirm trap setup
 }
 
 # Set up signal handlers
@@ -1616,7 +1630,7 @@ main() {
         # Verify Ralph's critical files still exist (Issue #149)
         if ! validate_ralph_integrity; then
             # Ensure log directory exists for logging even if .ralph/ was deleted
-            mkdir -p "$LOG_DIR" 2>/dev/null || true
+            mkdir -p "$LOG_DIR" 2>/dev/null
             log_status "ERROR" "Ralph integrity check failed - critical files missing"
             echo ""
             echo "$(get_integrity_report)"

diff --git a/tests/unit/test_cli_modern.bats b/tests/unit/test_cli_modern.bats
@@ -703,7 +703,8 @@ EOF
     local script="${BATS_TEST_DIRNAME}/../../ralph_loop.sh"
 
     # The live mode has LIVE_CMD_ARGS on one line and < /dev/null on the next
-    run grep '< /dev/null 2>&1 |' "$script"
+    # stderr is redirected to a separate file (Issue #190)
+    run grep '< /dev/null 2>"$stderr_file" |' "$script"
 
     assert_success
     [[ "$output" == *'< /dev/null'* ]]
@@ -723,8 +724,8 @@ EOF
     run grep 'portable_timeout.*CLAUDE_CMD_ARGS.*< /dev/null' "$script"
     assert_success
 
-    # Live mode: has < /dev/null on continuation line
-    run grep '< /dev/null 2>&1 |' "$script"
+    # Live mode: has < /dev/null with stderr redirect on continuation line
+    run grep '< /dev/null 2>"$stderr_file" |' "$script"
     assert_success
 
     # Legacy mode: has < "$PROMPT_FILE" on same line
@@ -865,64 +866,24 @@ EOF
 }
 
 # =============================================================================
-# LIVE MODE PIPELINE ERREXIT PROTECTION TESTS (Issue #175)
-# set -e + set -o pipefail caused silent script death when Claude timed out.
-# The fix disables errexit around the pipeline so PIPESTATUS can be captured.
+# LIVE MODE PIPELINE ERROR HANDLING TESTS
+# set -e was removed globally; the live pipeline no longer needs errexit toggles.
+# These tests verify the new explicit error handling approach.
 # =============================================================================
 
-@test "live mode pipeline has set +e before set -o pipefail" {
-    # Verify that errexit is disabled BEFORE pipefail is enabled.
-    # Without this, timeout exit code 124 silently kills the script.
-    # Scoped to the live-mode block to avoid false positives from other sections.
+@test "live mode pipeline does not use set +e/set -e toggles" {
+    # With set -e removed globally, the live mode pipeline no longer needs
+    # to toggle errexit. Verify no set +e/set -e appears in the live block.
     local script="${BATS_TEST_DIRNAME}/../../ralph_loop.sh"
 
-    # Extract only the live-mode section (from "Live output mode enabled" to "End of Output")
     local live_block
     live_block=$(sed -n '/Live output mode enabled/,/End of Output/p' "$script")
 
-    # set +e must appear before set -o pipefail within the live-mode block
-    echo "$live_block" | grep -q 'set +e'
-    echo "$live_block" | grep -q 'set -o pipefail'
-
-    # Verify ordering: set +e comes first
-    local plus_e_line=$(echo "$live_block" | grep -n 'set +e' | head -1 | cut -d: -f1)
-    local pipefail_line=$(echo "$live_block" | grep -n 'set -o pipefail' | head -1 | cut -d: -f1)
-
-    [[ -n "$plus_e_line" ]]
-    [[ -n "$pipefail_line" ]]
-    [[ $plus_e_line -lt $pipefail_line ]]
-}
-
-@test "live mode pipeline re-enables set -e after PIPESTATUS capture" {
-    # Verify that errexit is re-enabled after the pipeline exit codes are captured.
-    # Scoped to the live-mode block to avoid matching the global set -e at line 6.
-    local script="${BATS_TEST_DIRNAME}/../../ralph_loop.sh"
-
-    # Extract only the live-mode section
-    local live_block
-    live_block=$(sed -n '/Live output mode enabled/,/End of Output/p' "$script")
-
-    # set +o pipefail and set -e must both exist in the live block
-    echo "$live_block" | grep -q 'set +o pipefail'
-    echo "$live_block" | grep -q 'set -e'
-
-    # Verify ordering: set -e comes after set +o pipefail
-    local pipefail_off_line=$(echo "$live_block" | grep -n 'set +o pipefail' | head -1 | cut -d: -f1)
-    local re_enable_line=$(echo "$live_block" | grep -n '^\s*set -e' | awk -F: -v threshold="$pipefail_off_line" '$1 > threshold {print $1; exit}')
-
-    [[ -n "$pipefail_off_line" ]]
-    [[ -n "$re_enable_line" ]]
-    [[ $re_enable_line -gt $pipefail_off_line ]]
-}
-
-@test "live mode pipeline has errexit guard comment referencing Issue #175" {
-    # Verify the fix is documented with context about why errexit is disabled
-    local script="${BATS_TEST_DIRNAME}/../../ralph_loop.sh"
-
-    run grep -c 'Issue #175' "$script"
-    assert_success
-    # At least one reference to the issue
-    [[ "${output}" -ge 1 ]]
+    # A bare "! cmd" never fails a bats test (errexit ignores negated pipelines), so use run + assert_failure
+    run grep -q '^[[:space:]]*set +e$' <<<"$live_block"; assert_failure
+    run grep -q '^[[:space:]]*set -e' <<<"$live_block"; assert_failure
+    run grep -q 'set -o pipefail' <<<"$live_block"; assert_failure
+    run grep -q 'set +o pipefail' <<<"$live_block"; assert_failure
 }
 
 @test "live mode pipeline logs timeout events with exit code 124" {
@@ -1213,3 +1174,70 @@ EOF
     run grep 'Only increment counter on successful execution' "$script"
     assert_failure
 }
+
+# ─── set -e removal: explicit error handling (#208) ───
+
+@test "ralph_loop.sh does not use set -e" {
+    local script="${BATS_TEST_DIRNAME}/../../ralph_loop.sh"
+
+    # No column-0 "set -e" may remain; indented or commented occurrences are not matched by this pattern
+    run grep -n '^set -e' "$script"
+    assert_failure
+}
+
+@test "source statements have explicit error guards" {
+    local script="${BATS_TEST_DIRNAME}/../../ralph_loop.sh"
+
+    # All 5 library source lines must have || { echo "FATAL: ..."; exit 1; }
+    local libs=("date_utils.sh" "timeout_utils.sh" "response_analyzer.sh" "circuit_breaker.sh" "file_protection.sh")
+    for lib in "${libs[@]}"; do
+        run grep "source.*${lib}.*|| { echo.*FATAL.*exit 1; }" "$script"
+        assert_success
+    done
+}
+
+@test "cleanup skips interrupt status on normal exit (exit code 0)" {
+    # Verify cleanup captures trap_exit_code and only records interrupt on non-zero
+    local script="${BATS_TEST_DIRNAME}/../../ralph_loop.sh"
+
+    # cleanup() must capture exit code as first statement
+    run bash -c "sed -n '/^cleanup()/,/^}/p' '$script' | head -3 | grep 'trap_exit_code=\$?'"
+    assert_success
+
+    # The condition must check for non-zero exit code
+    run bash -c "sed -n '/^cleanup()/,/^}/p' '$script' | grep 'trap_exit_code -ne 0'"
+    assert_success
+}
+
+@test "analyze_response failure skips signal updates" {
+    # Verify that when analysis fails, stale response_analysis file is removed
+    local script="${BATS_TEST_DIRNAME}/../../ralph_loop.sh"
+
+    # The pattern: analysis failure should remove the response analysis file
+    run bash -c "grep -A 3 'analysis_exit_code' '$script' | grep 'rm -f.*RESPONSE_ANALYSIS_FILE'"
+    assert_success
+}
+
+@test "live mode pipeline does not merge stderr into stdout" {
+    local script="${BATS_TEST_DIRNAME}/../../ralph_loop.sh"
+
+    # The redirect sits on the continuation line after LIVE_CMD_ARGS, so match the old "< /dev/null 2>&1 |" text itself
+    run grep -F '< /dev/null 2>&1 |' "$script"
+    assert_failure
+}
+
+@test "live mode pipeline redirects stderr to separate file" {
+    local script="${BATS_TEST_DIRNAME}/../../ralph_loop.sh"
+
+    # stderr must be redirected to a separate file (continuation line)
+    run grep '2>"$stderr_file"' "$script"
+    assert_success
+}
+
+@test "live mode logs stderr output when non-empty" {
+    local script="${BATS_TEST_DIRNAME}/../../ralph_loop.sh"
+
+    # When stderr file has content, a WARN should be logged
+    run grep 'Claude CLI wrote to stderr' "$script"
+    assert_success
+}