Skip to content

Commit 58c9221

Browse files
committed
feat (postStart) : Allow debugging poststart failures with sleep by trapping errors
Add an optional debug mechanism for postStart lifecycle hooks. When enabled via the `controller.devfile.io/debug-start: "true"` annotation, any failure in a postStart command results in the container sleeping for the duration of the configured progressTimeout, giving developers time to inspect the container state. - Adds a `postStartDebugTrapSleepDuration` parameter to the postStart methods. - Injects `trap ... sleep` into postStart scripts when debug mode is enabled. - Supports both timeout-wrapped (`postStartTimeout`) and non-timeout lifecycle scripts. This feature improves debuggability of DevWorkspaces whose postStart hooks fail and would otherwise cause container crashes/restarts. Signed-off-by: Rohan Kumar <[email protected]>
1 parent 7861627 commit 58c9221

18 files changed

+286
-34
lines changed

controllers/workspace/devworkspace_controller.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,12 +323,17 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request
323323
}
324324
}
325325

326+
postStartDebugTrapSleepDuration := ""
327+
if workspace.Annotations[constants.DevWorkspaceDebugStartAnnotation] == "true" {
328+
postStartDebugTrapSleepDuration = workspace.Config.Workspace.ProgressTimeout
329+
}
326330
devfilePodAdditions, err := containerlib.GetKubeContainersFromDevfile(
327331
&workspace.Spec.Template,
328332
workspace.Config.Workspace.ContainerSecurityContext,
329333
workspace.Config.Workspace.ImagePullPolicy,
330334
workspace.Config.Workspace.DefaultContainerResources,
331335
workspace.Config.Workspace.PostStartTimeout,
336+
postStartDebugTrapSleepDuration,
332337
)
333338
if err != nil {
334339
return r.failWorkspace(workspace, fmt.Sprintf("Error processing devfile: %s", err), metrics.ReasonBadRequest, reqLogger, &reconcileStatus), nil

docs/additional-configuration.adoc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,14 @@ The DevWorkspace Operator sets the `volumeMounts` by default for config files, m
348348
## Debugging a failing workspace
349349
Normally, when a workspace fails to start, the deployment will be scaled down and the workspace will be stopped in a `Failed` state. This can make it difficult to debug misconfiguration errors, so the annotation `controller.devfile.io/debug-start: "true"` can be applied to DevWorkspaces to leave resources for failed workspaces on the cluster. This allows viewing logs from workspace containers.
350350

351+
It also enables a specialized debug mode for `postStart` lifecycle hooks, which are often used for initial setup tasks.
352+
353+
When a postStart command fails:
354+
- The container does not immediately crash or restart. It stays in the `ContainerCreating` phase.
355+
- The command failure is trapped, and the container instead sleeps for the duration of the configured DevWorkspace `progressTimeout` (5 minutes by default).
356+
357+
This pause gives developers a window to connect to the container (e.g., using `kubectl exec`), inspect the file system, and review the logs in `/tmp/poststart-stderr.txt` and `/tmp/poststart-stdout.txt` to diagnose the exact cause of the postStart failure before the workspace ultimately scales down. This applies to both standard and timeout-wrapped postStart scripts.
358+
351359
## Setting RuntimeClass for workspace pods
352360
To run a DevWorkspace with a specific RuntimeClass, the attribute `controller.devfile.io/runtime-class` can be set on the DevWorkspace with the name of the RuntimeClass to be used. If the specified RuntimeClass does not exist, the workspace will fail to start. For example, to run a DevWorkspace using the https://github.com/kata-containers/kata-containers[kata containers] runtime in clusters where this is enabled, the DevWorkspace can be specified:
353361
[source,yaml]

pkg/library/container/container.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ import (
4545
// rewritten as Volumes are added to PodAdditions, in order to support e.g. using one PVC to hold all volumes
4646
//
4747
// Note: Requires DevWorkspace to be flattened (i.e. the DevWorkspace contains no Parent or Components of type Plugin)
48-
func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements, postStartTimeout string) (*v1alpha1.PodAdditions, error) {
48+
func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements, postStartTimeout string, postStartDebugTrapSleepDuration string) (*v1alpha1.PodAdditions, error) {
4949
if !flatten.DevWorkspaceIsFlattened(workspace, nil) {
5050
return nil, fmt.Errorf("devfile is not flattened")
5151
}
@@ -77,7 +77,7 @@ func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securi
7777
podAdditions.Containers = append(podAdditions.Containers, *k8sContainer)
7878
}
7979

80-
if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers, postStartTimeout); err != nil {
80+
if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers, postStartTimeout, postStartDebugTrapSleepDuration); err != nil {
8181
return nil, err
8282
}
8383

pkg/library/container/container_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ func TestGetKubeContainersFromDevfile(t *testing.T) {
8787
t.Run(tt.Name, func(t *testing.T) {
8888
// sanity check that file is read correctly.
8989
assert.True(t, len(tt.Input.Components) > 0, "Input defines no components")
90-
gotPodAdditions, err := GetKubeContainersFromDevfile(tt.Input, nil, testImagePullPolicy, defaultResources, "")
90+
gotPodAdditions, err := GetKubeContainersFromDevfile(tt.Input, nil, testImagePullPolicy, defaultResources, "", "")
9191
if tt.Output.ErrRegexp != nil && assert.Error(t, err) {
9292
assert.Regexp(t, *tt.Output.ErrRegexp, err.Error(), "Error message should match")
9393
} else {

pkg/library/lifecycle/poststart.go

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ package lifecycle
1515

1616
import (
1717
"fmt"
18+
"regexp"
1819
"strings"
1920
"time"
2021

@@ -41,7 +42,9 @@ const (
4142
`
4243
)
4344

44-
func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string) error {
45+
var trapErrRegex = regexp.MustCompile(`\btrap\b.*\bERR\b`)
46+
47+
func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string, postStartDebugTrapSleepDuration string) error {
4548
if wksp.Events == nil || len(wksp.Events.PostStart) == 0 {
4649
return nil
4750
}
@@ -69,7 +72,7 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []
6972
return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err)
7073
}
7174

72-
postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout)
75+
postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout, postStartDebugTrapSleepDuration)
7376
if err != nil {
7477
return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err)
7578
}
@@ -85,10 +88,10 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []
8588

8689
// processCommandsForPostStart processes a list of DevWorkspace commands
8790
// and generates a corev1.LifecycleHandler for the PostStart lifecycle hook.
88-
func processCommandsForPostStart(commands []dw.Command, postStartTimeout string) (*corev1.LifecycleHandler, error) {
91+
func processCommandsForPostStart(commands []dw.Command, postStartTimeout string, postStartDebugTrapSleepDuration string) (*corev1.LifecycleHandler, error) {
8992
if postStartTimeout == "" {
9093
// use the fallback if no timeout propagated
91-
return processCommandsWithoutTimeoutFallback(commands)
94+
return processCommandsWithoutTimeoutFallback(postStartDebugTrapSleepDuration, commands)
9295
}
9396

9497
originalUserScript, err := buildUserScript(commands)
@@ -101,7 +104,7 @@ func processCommandsForPostStart(commands []dw.Command, postStartTimeout string)
101104
scriptToExecute := "set -e\n" + originalUserScript
102105
escapedUserScriptForTimeoutWrapper := strings.ReplaceAll(scriptToExecute, "'", `'\''`)
103106

104-
fullScriptWithTimeout := generateScriptWithTimeout(escapedUserScriptForTimeoutWrapper, postStartTimeout)
107+
fullScriptWithTimeout := generateScriptWithTimeout(postStartDebugTrapSleepDuration, escapedUserScriptForTimeoutWrapper, postStartTimeout)
105108

106109
finalScriptForHook := fmt.Sprintf(redirectOutputFmt, fullScriptWithTimeout)
107110

@@ -128,8 +131,10 @@ func processCommandsForPostStart(commands []dw.Command, postStartTimeout string)
128131
// - |
129132
// cd <workingDir>
130133
// <commandline>
131-
func processCommandsWithoutTimeoutFallback(commands []dw.Command) (*corev1.LifecycleHandler, error) {
134+
func processCommandsWithoutTimeoutFallback(postStartDebugTrapSleepDuration string, commands []dw.Command) (*corev1.LifecycleHandler, error) {
132135
var dwCommands []string
136+
postStartFailureDebugSleepSeconds := parsePostStartFailureDebugSleepDurationToSeconds(postStartDebugTrapSleepDuration)
137+
hasErrTrapInUserScript := false
133138
for _, command := range commands {
134139
execCmd := command.Exec
135140
if len(execCmd.Env) > 0 {
@@ -139,6 +144,21 @@ func processCommandsWithoutTimeoutFallback(commands []dw.Command) (*corev1.Lifec
139144
dwCommands = append(dwCommands, fmt.Sprintf("cd %s", execCmd.WorkingDir))
140145
}
141146
dwCommands = append(dwCommands, execCmd.CommandLine)
147+
if trapErrRegex.MatchString(execCmd.CommandLine) {
148+
hasErrTrapInUserScript = true
149+
}
150+
}
151+
152+
if postStartFailureDebugSleepSeconds > 0 && !hasErrTrapInUserScript {
153+
debugTrap := fmt.Sprintf(`
154+
trap 'echo "[postStart] failure encountered, sleep for debugging"; sleep %d' ERR
155+
`, postStartFailureDebugSleepSeconds)
156+
debugTrapLine := strings.ReplaceAll(strings.TrimSpace(debugTrap), "\n", " ")
157+
158+
dwCommands = append([]string{
159+
"set -e",
160+
debugTrapLine,
161+
}, dwCommands...)
142162
}
143163

144164
joinedCommands := strings.Join(dwCommands, "\n")
@@ -187,7 +207,7 @@ func buildUserScript(commands []dw.Command) (string, error) {
187207
// environment variable exports, and specific exit code handling.
188208
// The killAfterDurationSeconds is hardcoded to 5s within this generated script.
189209
// It conditionally prefixes the user script with the timeout command if available.
190-
func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string) string {
210+
func generateScriptWithTimeout(postStartDebugTrapSleepDuration string, escapedUserScript string, postStartTimeout string) string {
191211
// Convert `postStartTimeout` into the `timeout` format
192212
var timeoutSeconds int64
193213
if postStartTimeout != "" && postStartTimeout != "0" {
@@ -199,10 +219,12 @@ func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string
199219
timeoutSeconds = int64(duration.Seconds())
200220
}
201221
}
222+
postStartFailureDebugSleepSeconds := parsePostStartFailureDebugSleepDurationToSeconds(postStartDebugTrapSleepDuration)
202223

203224
return fmt.Sprintf(`
204225
export POSTSTART_TIMEOUT_DURATION="%d"
205226
export POSTSTART_KILL_AFTER_DURATION="5"
227+
export DEBUG_ENABLED="%t"
206228
207229
_TIMEOUT_COMMAND_PART=""
208230
_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean
@@ -219,6 +241,11 @@ fi
219241
${_TIMEOUT_COMMAND_PART} /bin/sh -c '%s'
220242
exit_code=$?
221243
244+
if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then
245+
echo "[postStart] failure encountered, sleep for debugging" >&2
246+
sleep %d
247+
fi
248+
222249
# Check the exit code based on whether timeout was attempted
223250
if [ "$_WAS_TIMEOUT_USED" = "true" ]; then
224251
if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM)
@@ -239,5 +266,18 @@ else
239266
fi
240267
241268
exit $exit_code
242-
`, timeoutSeconds, escapedUserScript)
269+
`, timeoutSeconds, postStartFailureDebugSleepSeconds > 0, escapedUserScript, postStartFailureDebugSleepSeconds)
270+
}
271+
272+
// parsePostStartFailureDebugSleepDurationToSeconds converts a Go duration
// string (for example "5m" or "90s") into a whole number of seconds to use
// for the debug sleep injected into postStart scripts.
//
// It returns 0 when durationStr is empty, cannot be parsed by
// time.ParseDuration, or does not describe a positive duration. Fractions of
// a second are truncated, so a positive duration shorter than one second also
// yields 0. Callers treat any result that is not greater than zero as
// "debug sleep disabled".
func parsePostStartFailureDebugSleepDurationToSeconds(durationStr string) int {
	if durationStr == "" {
		return 0
	}

	d, err := time.ParseDuration(durationStr)
	if err != nil || d <= 0 {
		// Unparseable or non-positive values must never reach the generated
		// shell script as a negative `sleep` argument.
		return 0
	}

	// Integer division truncates toward zero without a float64 round trip.
	return int(d / time.Second)
}

0 commit comments

Comments
 (0)