Skip to content

Commit 67661ea

Browse files
committed
feat (postStart) : Allow debugging poststart failures with sleep by trapping errors
Add an optional debug mechanism for postStart lifecycle hooks. When enabled via the `controller.devfile.io/debug-start: "true"` annotation, any failure in a postStart command results in the container sleeping for the duration of the configured `progressTimeout`, allowing developers time to inspect the container state. - Adds a `postStartDebugTrapSleepDuration` parameter to the postStart methods. - Injects `trap ... sleep` into postStart scripts when debug mode is enabled. - Supports both timeout-wrapped (`postStartTimeout`) and non-timeout lifecycle scripts. This feature improves debuggability of DevWorkspaces where postStart hooks fail and would otherwise cause container crashes/restarts. Signed-off-by: Rohan Kumar <[email protected]>
1 parent 7861627 commit 67661ea

17 files changed

+248
-34
lines changed

controllers/workspace/devworkspace_controller.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,12 +323,17 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request
323323
}
324324
}
325325

326+
postStartDebugTrapSleepDuration := ""
327+
if workspace.Annotations[constants.DevWorkspaceDebugStartAnnotation] == "true" {
328+
postStartDebugTrapSleepDuration = workspace.Config.Workspace.ProgressTimeout
329+
}
326330
devfilePodAdditions, err := containerlib.GetKubeContainersFromDevfile(
327331
&workspace.Spec.Template,
328332
workspace.Config.Workspace.ContainerSecurityContext,
329333
workspace.Config.Workspace.ImagePullPolicy,
330334
workspace.Config.Workspace.DefaultContainerResources,
331335
workspace.Config.Workspace.PostStartTimeout,
336+
postStartDebugTrapSleepDuration,
332337
)
333338
if err != nil {
334339
return r.failWorkspace(workspace, fmt.Sprintf("Error processing devfile: %s", err), metrics.ReasonBadRequest, reqLogger, &reconcileStatus), nil

docs/additional-configuration.adoc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,14 @@ The DevWorkspace Operator sets the `volumeMounts` by default for config files, m
348348
## Debugging a failing workspace
349349
Normally, when a workspace fails to start, the deployment will be scaled down and the workspace will be stopped in a `Failed` state. This can make it difficult to debug misconfiguration errors, so the annotation `controller.devfile.io/debug-start: "true"` can be applied to DevWorkspaces to leave resources for failed workspaces on the cluster. This allows viewing logs from workspace containers.
350350

351+
It also enables a specialized debug mode for `postStart` lifecycle hooks, which are often used for initial setup tasks.
352+
353+
When a postStart command fails:
354+
- The container does not immediately crash or restart; it stays in the `ContainerCreating` phase.
355+
- The command failure is trapped, and the container is instead made to sleep for the duration of the configured DevWorkspace `progressTimeout` (5 minutes by default).
356+
357+
This sleep gives developers a window to connect to the container (e.g., using `kubectl exec`), inspect the file system, and review the logs in `/tmp/poststart-stderr.txt` and `/tmp/poststart-stdout.txt` to diagnose the exact cause of the postStart failure before the workspace ultimately scales down. This applies to both standard and timeout-wrapped postStart scripts.
358+
351359
## Setting RuntimeClass for workspace pods
352360
To run a DevWorkspace with a specific RuntimeClass, the attribute `controller.devfile.io/runtime-class` can be set on the DevWorkspace with the name of the RuntimeClass to be used. If the specified RuntimeClass does not exist, the workspace will fail to start. For example, to run a DevWorkspace using the https://github.com/kata-containers/kata-containers[kata containers] runtime in clusters where this is enabled, the DevWorkspace can be specified:
353361
[source,yaml]

pkg/library/container/container.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ import (
4545
// rewritten as Volumes are added to PodAdditions, in order to support e.g. using one PVC to hold all volumes
4646
//
4747
// Note: Requires DevWorkspace to be flattened (i.e. the DevWorkspace contains no Parent or Components of type Plugin)
48-
func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements, postStartTimeout string) (*v1alpha1.PodAdditions, error) {
48+
func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements, postStartTimeout string, postStartDebugTrapSleepDuration string) (*v1alpha1.PodAdditions, error) {
4949
if !flatten.DevWorkspaceIsFlattened(workspace, nil) {
5050
return nil, fmt.Errorf("devfile is not flattened")
5151
}
@@ -77,7 +77,7 @@ func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securi
7777
podAdditions.Containers = append(podAdditions.Containers, *k8sContainer)
7878
}
7979

80-
if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers, postStartTimeout); err != nil {
80+
if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers, postStartTimeout, postStartDebugTrapSleepDuration); err != nil {
8181
return nil, err
8282
}
8383

pkg/library/container/container_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ func TestGetKubeContainersFromDevfile(t *testing.T) {
8787
t.Run(tt.Name, func(t *testing.T) {
8888
// sanity check that file is read correctly.
8989
assert.True(t, len(tt.Input.Components) > 0, "Input defines no components")
90-
gotPodAdditions, err := GetKubeContainersFromDevfile(tt.Input, nil, testImagePullPolicy, defaultResources, "")
90+
gotPodAdditions, err := GetKubeContainersFromDevfile(tt.Input, nil, testImagePullPolicy, defaultResources, "", "")
9191
if tt.Output.ErrRegexp != nil && assert.Error(t, err) {
9292
assert.Regexp(t, *tt.Output.ErrRegexp, err.Error(), "Error message should match")
9393
} else {

pkg/library/lifecycle/poststart.go

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ const (
4141
`
4242
)
4343

44-
func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string) error {
44+
func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string, postStartDebugTrapSleepDuration string) error {
4545
if wksp.Events == nil || len(wksp.Events.PostStart) == 0 {
4646
return nil
4747
}
@@ -69,7 +69,7 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []
6969
return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err)
7070
}
7171

72-
postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout)
72+
postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout, postStartDebugTrapSleepDuration)
7373
if err != nil {
7474
return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err)
7575
}
@@ -85,10 +85,10 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []
8585

8686
// processCommandsForPostStart processes a list of DevWorkspace commands
8787
// and generates a corev1.LifecycleHandler for the PostStart lifecycle hook.
88-
func processCommandsForPostStart(commands []dw.Command, postStartTimeout string) (*corev1.LifecycleHandler, error) {
88+
func processCommandsForPostStart(commands []dw.Command, postStartTimeout string, postStartDebugTrapSleepDuration string) (*corev1.LifecycleHandler, error) {
8989
if postStartTimeout == "" {
9090
// use the fallback if no timeout propagated
91-
return processCommandsWithoutTimeoutFallback(commands)
91+
return processCommandsWithoutTimeoutFallback(postStartDebugTrapSleepDuration, commands)
9292
}
9393

9494
originalUserScript, err := buildUserScript(commands)
@@ -101,7 +101,7 @@ func processCommandsForPostStart(commands []dw.Command, postStartTimeout string)
101101
scriptToExecute := "set -e\n" + originalUserScript
102102
escapedUserScriptForTimeoutWrapper := strings.ReplaceAll(scriptToExecute, "'", `'\''`)
103103

104-
fullScriptWithTimeout := generateScriptWithTimeout(escapedUserScriptForTimeoutWrapper, postStartTimeout)
104+
fullScriptWithTimeout := generateScriptWithTimeout(postStartDebugTrapSleepDuration, escapedUserScriptForTimeoutWrapper, postStartTimeout)
105105

106106
finalScriptForHook := fmt.Sprintf(redirectOutputFmt, fullScriptWithTimeout)
107107

@@ -128,8 +128,16 @@ func processCommandsForPostStart(commands []dw.Command, postStartTimeout string)
128128
// - |
129129
// cd <workingDir>
130130
// <commandline>
131-
func processCommandsWithoutTimeoutFallback(commands []dw.Command) (*corev1.LifecycleHandler, error) {
131+
func processCommandsWithoutTimeoutFallback(postStartDebugTrapSleepDuration string, commands []dw.Command) (*corev1.LifecycleHandler, error) {
132132
var dwCommands []string
133+
postStartFailureDebugSleepSeconds := parsePostStartFailureDebugSleepDurationToSeconds(postStartDebugTrapSleepDuration)
134+
if postStartFailureDebugSleepSeconds > 0 {
135+
dwCommands = append(dwCommands, "set -e")
136+
debugTrap := fmt.Sprintf(`
137+
trap 'echo "[postStart] failure encountered, sleep for debugging"; sleep %d' ERR
138+
`, postStartFailureDebugSleepSeconds)
139+
dwCommands = append(dwCommands, strings.ReplaceAll(strings.TrimSpace(debugTrap), "\n", " "))
140+
}
133141
for _, command := range commands {
134142
execCmd := command.Exec
135143
if len(execCmd.Env) > 0 {
@@ -187,7 +195,7 @@ func buildUserScript(commands []dw.Command) (string, error) {
187195
// environment variable exports, and specific exit code handling.
188196
// The killAfterDurationSeconds is hardcoded to 5s within this generated script.
189197
// It conditionally prefixes the user script with the timeout command if available.
190-
func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string) string {
198+
func generateScriptWithTimeout(postStartDebugTrapSleepDuration string, escapedUserScript string, postStartTimeout string) string {
191199
// Convert `postStartTimeout` into the `timeout` format
192200
var timeoutSeconds int64
193201
if postStartTimeout != "" && postStartTimeout != "0" {
@@ -199,10 +207,12 @@ func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string
199207
timeoutSeconds = int64(duration.Seconds())
200208
}
201209
}
210+
postStartFailureDebugSleepSeconds := parsePostStartFailureDebugSleepDurationToSeconds(postStartDebugTrapSleepDuration)
202211

203212
return fmt.Sprintf(`
204213
export POSTSTART_TIMEOUT_DURATION="%d"
205214
export POSTSTART_KILL_AFTER_DURATION="5"
215+
export DEBUG_ENABLED="%t"
206216
207217
_TIMEOUT_COMMAND_PART=""
208218
_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean
@@ -219,6 +229,11 @@ fi
219229
${_TIMEOUT_COMMAND_PART} /bin/sh -c '%s'
220230
exit_code=$?
221231
232+
if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then
233+
echo "[postStart] failure encountered, sleep for debugging" >&2
234+
sleep %d
235+
fi
236+
222237
# Check the exit code based on whether timeout was attempted
223238
if [ "$_WAS_TIMEOUT_USED" = "true" ]; then
224239
if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM)
@@ -239,5 +254,18 @@ else
239254
fi
240255
241256
exit $exit_code
242-
`, timeoutSeconds, escapedUserScript)
257+
`, timeoutSeconds, postStartFailureDebugSleepSeconds > 0, escapedUserScript, postStartFailureDebugSleepSeconds)
258+
}
259+
260+
// parsePostStartFailureDebugSleepDurationToSeconds converts a Go duration
// string (for example "5m" or "90s") into a whole number of seconds to sleep
// after a postStart failure.
//
// It returns 0 when durationStr is empty, cannot be parsed by
// time.ParseDuration, or describes a zero or negative duration. Callers in
// this file only enable the debug sleep when the result is greater than zero,
// so 0 means "debug sleep disabled". Fractional seconds are truncated, which
// means a positive duration shorter than one second also yields 0.
func parsePostStartFailureDebugSleepDurationToSeconds(durationStr string) int {
	if durationStr == "" {
		return 0
	}

	d, err := time.ParseDuration(durationStr)
	if err != nil || d <= 0 {
		// Never hand a negative value back: the result is formatted into a
		// shell `sleep %d` command, and `sleep -N` is not a valid invocation.
		return 0
	}

	return int(d.Seconds())
}

0 commit comments

Comments
 (0)