From c69ed8556c9ed104424a2b1f9f0f631a0f64b1ac Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 6 Oct 2025 13:48:27 -0700 Subject: [PATCH 1/2] runc create/run/exec: show fatal errors from init In case early stage of runc init (nsenter) fails for some reason, it logs error(s) with FATAL log level, via bail(). The runc init log is read by a parent (runc create/run/exec) and is logged via normal logrus mechanism, which is all fine and dandy, except when `runc init` fails, we return the error from the parent (which is usually not too helpful, for example): runc run failed: unable to start container process: can't get final child's PID from pipe: EOF Now, the actual underlying error is from runc init and it was logged earlier; here's how full runc output looks like: FATA[0000] nsexec-1[3247792]: failed to unshare remaining namespaces: No space left on device FATA[0000] nsexec-0[3247790]: failed to sync with stage-1: next state: Success ERRO[0000] runc run failed: unable to start container process: can't get final child's PID from pipe: EOF The problem is, upper level runtimes tend to ignore everything except the last line from runc, and thus error reported by e.g. docker is not very helpful. This patch tries to improve the situation by collecting FATAL errors from runc init and appending those to the error returned (instead of logging). With it, the above error will look like this: ERRO[0000] runc run failed: unable to start container process: can't get final child's PID from pipe: EOF; runc init error(s): nsexec-1[141549]: failed to unshare remaining namespaces: No space left on device; nsexec-0[141547]: failed to sync with stage-1: next state: Success Yes, it is long and ugly, but at least the upper level runtime will report it. Signed-off-by: Kir Kolyshkin --- libcontainer/container_linux.go | 17 ++++++++-------- libcontainer/logs/logs.go | 35 +++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 6168854930e..5593caefca2 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -338,8 +338,6 @@ func (c *Container) start(process *Process) (retErr error) { // We do not need the cloned binaries once the process is spawned. defer process.closeClonedExes() - logsDone := parent.forwardChildLogs() - // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC // to make sure we don't leak any files into "runc init". Any files to be // passed to "runc init" through ExtraFiles will get dup2'd by the Go @@ -349,21 +347,24 @@ func (c *Container) start(process *Process) (retErr error) { if err := utils.CloseExecFrom(3); err != nil { return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) } - if err := parent.start(); err != nil { - return fmt.Errorf("unable to start container process: %w", err) - } - if logsDone != nil { + if logsDone := parent.forwardChildLogs(); logsDone != nil { defer func() { // Wait for log forwarder to finish. This depends on // runc init closing the _LIBCONTAINER_LOGPIPE log fd. err := <-logsDone - if err != nil && retErr == nil { - retErr = fmt.Errorf("unable to forward init logs: %w", err) + // An error from child logs is nsexec FATAL messages. + // Always append those to the original error. + if err != nil { + retErr = fmt.Errorf("%w; runc init error(s): %w ", retErr, err) } }() } + if err := parent.start(); err != nil { + return fmt.Errorf("unable to start container process: %w", err) + } + if process.Init { c.fifo.Close() if c.config.HasHook(configs.Poststart) { diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go index db12b851820..8cb2ff9aa8a 100644 --- a/libcontainer/logs/logs.go +++ b/libcontainer/logs/logs.go @@ -4,12 +4,16 @@ package logs import ( "bufio" + "bytes" "encoding/json" + "errors" "io" "github.com/sirupsen/logrus" ) +var fatalsSep = []byte("; ") + func ForwardLogs(logPipe io.ReadCloser) chan error { done := make(chan error, 1) s := bufio.NewScanner(logPipe) @@ -25,24 +29,33 @@ func ForwardLogs(logPipe io.ReadCloser) chan error { } go func() { + fatals := []byte{} for s.Scan() { - processEntry(s.Bytes(), logger) + fatals = processEntry(s.Bytes(), logger, fatals) + } + if err := s.Err(); err != nil { + logrus.Errorf("error reading from log source: %v", err) } if err := logPipe.Close(); err != nil { logrus.Errorf("error closing log source: %v", err) } - // The only error we want to return is when reading from - // logPipe has failed. - done <- s.Err() + // The only error we return is fatal messages from runc init. + var err error + if len(fatals) > 0 { + err = errors.New(string(bytes.TrimSuffix(fatals, fatalsSep))) + } + done <- err close(done) }() return done } -func processEntry(text []byte, logger *logrus.Logger) { +// processEntry parses the error and either logs it via the standard logger or, +// if this is a fatal error, appends its text to fatals. +func processEntry(text []byte, logger *logrus.Logger, fatals []byte) []byte { if len(text) == 0 { - return + return fatals } var jl struct { @@ -51,8 +64,14 @@ func processEntry(text []byte, logger *logrus.Logger) { } if err := json.Unmarshal(text, &jl); err != nil { logrus.Errorf("failed to decode %q to json: %v", text, err) - return + return fatals } - logger.Log(jl.Level, jl.Msg) + if jl.Level == logrus.FatalLevel { + fatals = append(fatals, jl.Msg...) + fatals = append(fatals, fatalsSep...) + } else { + logger.Log(jl.Level, jl.Msg) + } + return fatals } From a2a30fe08d95340e6048dbca012a71835c8a46be Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 5 Nov 2025 19:58:50 -0800 Subject: [PATCH 2/2] ci: disable golangci-lint cache This will result in slower runs but we are having issues with golangci-lint (false positives) that are most probably related to caching. Signed-off-by: Kir Kolyshkin --- .github/workflows/validate.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index 971705fe050..6d37bb4573d 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -42,6 +42,7 @@ jobs: - uses: golangci/golangci-lint-action@v8 with: version: v2.5 + skip-cache: true # Extra linters, only checking new code from a pull request to main. - name: lint-extra if: github.event_name == 'pull_request' && github.base_ref == 'main'