Skip to content

Commit ea21eb5

Browse files
committed
feat(postStart): Allow debugging postStart failures by trapping errors and sleeping
Add an optional debug mechanism for postStart lifecycle hooks. When enabled via the `controller.devfile.io/debug-start: "true"` annotation, any failure in a postStart command causes the container to sleep for 3600 seconds, giving developers time to inspect the container state.

- Adds an `enableDebugStart` parameter to the postStart methods.
- Injects `trap ... sleep` into postStart scripts when debug mode is enabled.
- Supports both timeout-wrapped (`postStartTimeout`) and non-timeout lifecycle scripts.

This feature improves the debuggability of DevWorkspaces whose postStart hooks fail and would otherwise cause the container to crash or restart.

Signed-off-by: Rohan Kumar <[email protected]>
1 parent e0ed8d2 commit ea21eb5

21 files changed

+414
-25
lines changed

controllers/workspace/devworkspace_controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,7 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request
329329
workspace.Config.Workspace.ImagePullPolicy,
330330
workspace.Config.Workspace.DefaultContainerResources,
331331
workspace.Config.Workspace.PostStartTimeout,
332+
workspace.Annotations[constants.DevWorkspaceDebugStartAnnotation] == "true",
332333
)
333334
if err != nil {
334335
return r.failWorkspace(workspace, fmt.Sprintf("Error processing devfile: %s", err), metrics.ReasonBadRequest, reqLogger, &reconcileStatus), nil

pkg/library/container/container.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ import (
4545
// rewritten as Volumes are added to PodAdditions, in order to support e.g. using one PVC to hold all volumes
4646
//
4747
// Note: Requires DevWorkspace to be flattened (i.e. the DevWorkspace contains no Parent or Components of type Plugin)
48-
func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements, postStartTimeout string) (*v1alpha1.PodAdditions, error) {
48+
func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements, postStartTimeout string, enableDebugStart bool) (*v1alpha1.PodAdditions, error) {
4949
if !flatten.DevWorkspaceIsFlattened(workspace, nil) {
5050
return nil, fmt.Errorf("devfile is not flattened")
5151
}
@@ -77,7 +77,7 @@ func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securi
7777
podAdditions.Containers = append(podAdditions.Containers, *k8sContainer)
7878
}
7979

80-
if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers, postStartTimeout); err != nil {
80+
if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers, postStartTimeout, enableDebugStart); err != nil {
8181
return nil, err
8282
}
8383

pkg/library/container/container_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ func TestGetKubeContainersFromDevfile(t *testing.T) {
8787
t.Run(tt.Name, func(t *testing.T) {
8888
// sanity check that file is read correctly.
8989
assert.True(t, len(tt.Input.Components) > 0, "Input defines no components")
90-
gotPodAdditions, err := GetKubeContainersFromDevfile(tt.Input, nil, testImagePullPolicy, defaultResources, "")
90+
gotPodAdditions, err := GetKubeContainersFromDevfile(tt.Input, nil, testImagePullPolicy, defaultResources, "", false)
9191
if tt.Output.ErrRegexp != nil && assert.Error(t, err) {
9292
assert.Regexp(t, *tt.Output.ErrRegexp, err.Error(), "Error message should match")
9393
} else {

pkg/library/lifecycle/poststart.go

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ const (
4141
`
4242
)
4343

44-
func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string) error {
44+
func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string, enableDebugStart bool) error {
4545
if wksp.Events == nil || len(wksp.Events.PostStart) == 0 {
4646
return nil
4747
}
@@ -69,7 +69,7 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []
6969
return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err)
7070
}
7171

72-
postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout)
72+
postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout, enableDebugStart)
7373
if err != nil {
7474
return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err)
7575
}
@@ -85,10 +85,10 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []
8585

8686
// processCommandsForPostStart processes a list of DevWorkspace commands
8787
// and generates a corev1.LifecycleHandler for the PostStart lifecycle hook.
88-
func processCommandsForPostStart(commands []dw.Command, postStartTimeout string) (*corev1.LifecycleHandler, error) {
88+
func processCommandsForPostStart(commands []dw.Command, postStartTimeout string, enableDebugStart bool) (*corev1.LifecycleHandler, error) {
8989
if postStartTimeout == "" {
9090
// use the fallback if no timeout propagated
91-
return processCommandsWithoutTimeoutFallback(commands)
91+
return processCommandsWithoutTimeoutFallback(enableDebugStart, commands)
9292
}
9393

9494
originalUserScript, err := buildUserScript(commands)
@@ -101,7 +101,7 @@ func processCommandsForPostStart(commands []dw.Command, postStartTimeout string)
101101
scriptToExecute := "set -e\n" + originalUserScript
102102
escapedUserScriptForTimeoutWrapper := strings.ReplaceAll(scriptToExecute, "'", `'\''`)
103103

104-
fullScriptWithTimeout := generateScriptWithTimeout(escapedUserScriptForTimeoutWrapper, postStartTimeout)
104+
fullScriptWithTimeout := generateScriptWithTimeout(enableDebugStart, escapedUserScriptForTimeoutWrapper, postStartTimeout)
105105

106106
finalScriptForHook := fmt.Sprintf(redirectOutputFmt, fullScriptWithTimeout)
107107

@@ -128,8 +128,13 @@ func processCommandsForPostStart(commands []dw.Command, postStartTimeout string)
128128
// - |
129129
// cd <workingDir>
130130
// <commandline>
131-
func processCommandsWithoutTimeoutFallback(commands []dw.Command) (*corev1.LifecycleHandler, error) {
131+
func processCommandsWithoutTimeoutFallback(debugEnabled bool, commands []dw.Command) (*corev1.LifecycleHandler, error) {
132132
var dwCommands []string
133+
if debugEnabled {
134+
dwCommands = append(dwCommands, "set -e")
135+
// TODO: Make sleep configurable?
136+
dwCommands = append(dwCommands, "trap 'echo \"[postStart] failure encountered, sleep for debugging\"; sleep 3600' ERR")
137+
}
133138
for _, command := range commands {
134139
execCmd := command.Exec
135140
if len(execCmd.Env) > 0 {
@@ -142,6 +147,15 @@ func processCommandsWithoutTimeoutFallback(commands []dw.Command) (*corev1.Lifec
142147
}
143148

144149
joinedCommands := strings.Join(dwCommands, "\n")
150+
if debugEnabled {
151+
joinedCommands = fmt.Sprintf(`cat << 'EOF' > /tmp/poststart.sh
152+
#!/bin/sh
153+
%s
154+
EOF
155+
chmod +x /tmp/poststart.sh
156+
/tmp/poststart.sh
157+
`, joinedCommands)
158+
}
145159

146160
handler := &corev1.LifecycleHandler{
147161
Exec: &corev1.ExecAction{
@@ -187,7 +201,7 @@ func buildUserScript(commands []dw.Command) (string, error) {
187201
// environment variable exports, and specific exit code handling.
188202
// The killAfterDurationSeconds is hardcoded to 5s within this generated script.
189203
// It conditionally prefixes the user script with the timeout command if available.
190-
func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string) string {
204+
func generateScriptWithTimeout(debugEnabled bool, escapedUserScript string, postStartTimeout string) string {
191205
// Convert `postStartTimeout` into the `timeout` format
192206
var timeoutSeconds int64
193207
if postStartTimeout != "" && postStartTimeout != "0" {
@@ -203,6 +217,7 @@ func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string
203217
return fmt.Sprintf(`
204218
export POSTSTART_TIMEOUT_DURATION="%d"
205219
export POSTSTART_KILL_AFTER_DURATION="5"
220+
export DEBUG_ENABLED="%t"
206221
207222
_TIMEOUT_COMMAND_PART=""
208223
_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean
@@ -219,6 +234,11 @@ fi
219234
${_TIMEOUT_COMMAND_PART} /bin/sh -c '%s'
220235
exit_code=$?
221236
237+
if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then
238+
echo "[postStart] failure encountered, sleep for debugging" >&2
239+
sleep 3600
240+
fi
241+
222242
# Check the exit code based on whether timeout was attempted
223243
if [ "$_WAS_TIMEOUT_USED" = "true" ]; then
224244
if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM)
@@ -239,5 +259,5 @@ else
239259
fi
240260
241261
exit $exit_code
242-
`, timeoutSeconds, escapedUserScript)
262+
`, timeoutSeconds, debugEnabled, escapedUserScript)
243263
}

pkg/library/lifecycle/poststart_test.go

Lines changed: 94 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,9 @@ type postStartTestCase struct {
3333
}
3434

3535
type postStartTestInput struct {
36-
Devfile *dw.DevWorkspaceTemplateSpec `json:"devfile,omitempty"`
37-
Containers []corev1.Container `json:"containers,omitempty"`
36+
Devfile *dw.DevWorkspaceTemplateSpec `json:"devfile,omitempty"`
37+
DebugEnabled bool `json:"debugEnabled,omitempty"`
38+
Containers []corev1.Container `json:"containers,omitempty"`
3839
}
3940

4041
type postStartTestOutput struct {
@@ -76,7 +77,7 @@ func TestAddPostStartLifecycleHooks(t *testing.T) {
7677
for _, tt := range tests {
7778
t.Run(fmt.Sprintf("%s (%s)", tt.Name, tt.testPath), func(t *testing.T) {
7879
var timeout string
79-
err := AddPostStartLifecycleHooks(tt.Input.Devfile, tt.Input.Containers, timeout)
80+
err := AddPostStartLifecycleHooks(tt.Input.Devfile, tt.Input.Containers, timeout, tt.Input.DebugEnabled)
8081
if tt.Output.ErrRegexp != nil && assert.Error(t, err) {
8182
assert.Regexp(t, *tt.Output.ErrRegexp, err.Error(), "Error message should match")
8283
} else {
@@ -299,15 +300,18 @@ func TestGenerateScriptWithTimeout(t *testing.T) {
299300
name string
300301
escapedUserScript string
301302
timeout string
303+
debugEnabled bool
302304
expectedScript string
303305
}{
304306
{
305307
name: "Basic script with timeout",
306308
escapedUserScript: "echo 'hello world'\nsleep 1",
307309
timeout: "10s",
310+
debugEnabled: false,
308311
expectedScript: `
309312
export POSTSTART_TIMEOUT_DURATION="10"
310313
export POSTSTART_KILL_AFTER_DURATION="5"
314+
export DEBUG_ENABLED="false"
311315
312316
_TIMEOUT_COMMAND_PART=""
313317
_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean
@@ -325,6 +329,11 @@ ${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'hello world'
325329
sleep 1'
326330
exit_code=$?
327331
332+
if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then
333+
echo "[postStart] failure encountered, sleep for debugging" >&2
334+
sleep 3600
335+
fi
336+
328337
# Check the exit code based on whether timeout was attempted
329338
if [ "$_WAS_TIMEOUT_USED" = "true" ]; then
330339
if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM)
@@ -351,9 +360,11 @@ exit $exit_code
351360
name: "Script with zero timeout (no timeout)",
352361
escapedUserScript: "echo 'running indefinitely...'",
353362
timeout: "0s",
363+
debugEnabled: false,
354364
expectedScript: `
355365
export POSTSTART_TIMEOUT_DURATION="0"
356366
export POSTSTART_KILL_AFTER_DURATION="5"
367+
export DEBUG_ENABLED="false"
357368
358369
_TIMEOUT_COMMAND_PART=""
359370
_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean
@@ -370,6 +381,11 @@ fi
370381
${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'running indefinitely...''
371382
exit_code=$?
372383
384+
if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then
385+
echo "[postStart] failure encountered, sleep for debugging" >&2
386+
sleep 3600
387+
fi
388+
373389
# Check the exit code based on whether timeout was attempted
374390
if [ "$_WAS_TIMEOUT_USED" = "true" ]; then
375391
if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM)
@@ -396,9 +412,11 @@ exit $exit_code
396412
name: "Empty user script",
397413
escapedUserScript: "",
398414
timeout: "5s",
415+
debugEnabled: false,
399416
expectedScript: `
400417
export POSTSTART_TIMEOUT_DURATION="5"
401418
export POSTSTART_KILL_AFTER_DURATION="5"
419+
export DEBUG_ENABLED="false"
402420
403421
_TIMEOUT_COMMAND_PART=""
404422
_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean
@@ -415,6 +433,11 @@ fi
415433
${_TIMEOUT_COMMAND_PART} /bin/sh -c ''
416434
exit_code=$?
417435
436+
if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then
437+
echo "[postStart] failure encountered, sleep for debugging" >&2
438+
sleep 3600
439+
fi
440+
418441
# Check the exit code based on whether timeout was attempted
419442
if [ "$_WAS_TIMEOUT_USED" = "true" ]; then
420443
if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM)
@@ -441,9 +464,11 @@ exit $exit_code
441464
name: "User script with already escaped single quotes",
442465
escapedUserScript: "echo 'it'\\''s complex'",
443466
timeout: "30s",
467+
debugEnabled: false,
444468
expectedScript: `
445469
export POSTSTART_TIMEOUT_DURATION="30"
446470
export POSTSTART_KILL_AFTER_DURATION="5"
471+
export DEBUG_ENABLED="false"
447472
448473
_TIMEOUT_COMMAND_PART=""
449474
_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean
@@ -460,6 +485,11 @@ fi
460485
${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'it'\''s complex''
461486
exit_code=$?
462487
488+
if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then
489+
echo "[postStart] failure encountered, sleep for debugging" >&2
490+
sleep 3600
491+
fi
492+
463493
# Check the exit code based on whether timeout was attempted
464494
if [ "$_WAS_TIMEOUT_USED" = "true" ]; then
465495
if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM)
@@ -486,9 +516,11 @@ exit $exit_code
486516
name: "User script with minute timeout",
487517
escapedUserScript: "echo 'wait for it...'",
488518
timeout: "2m",
519+
debugEnabled: false,
489520
expectedScript: `
490521
export POSTSTART_TIMEOUT_DURATION="120"
491522
export POSTSTART_KILL_AFTER_DURATION="5"
523+
export DEBUG_ENABLED="false"
492524
493525
_TIMEOUT_COMMAND_PART=""
494526
_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean
@@ -505,6 +537,64 @@ fi
505537
${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'wait for it...''
506538
exit_code=$?
507539
540+
if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then
541+
echo "[postStart] failure encountered, sleep for debugging" >&2
542+
sleep 3600
543+
fi
544+
545+
# Check the exit code based on whether timeout was attempted
546+
if [ "$_WAS_TIMEOUT_USED" = "true" ]; then
547+
if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM)
548+
echo "[postStart hook] Commands terminated by SIGTERM (likely timed out after ${POSTSTART_TIMEOUT_DURATION}s). Exit code 143." >&2
549+
elif [ $exit_code -eq 137 ]; then # 128 + 9 (SIGKILL)
550+
echo "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ${POSTSTART_KILL_AFTER_DURATION}s expired). Exit code 137." >&2
551+
elif [ $exit_code -ne 0 ]; then # Catches any other non-zero exit code
552+
echo "[postStart hook] Commands failed with exit code $exit_code." >&2
553+
else
554+
echo "[postStart hook] Commands completed successfully within the time limit." >&2
555+
fi
556+
else
557+
if [ $exit_code -ne 0 ]; then
558+
echo "[postStart hook] Commands failed with exit code $exit_code (no timeout)." >&2
559+
else
560+
echo "[postStart hook] Commands completed successfully (no timeout)." >&2
561+
fi
562+
fi
563+
564+
exit $exit_code
565+
`,
566+
},
567+
{
568+
name: "Basic script with timeout and debug enabled",
569+
escapedUserScript: "echo 'hello world'\nsleep 1",
570+
timeout: "10s",
571+
debugEnabled: true,
572+
expectedScript: `
573+
export POSTSTART_TIMEOUT_DURATION="10"
574+
export POSTSTART_KILL_AFTER_DURATION="5"
575+
export DEBUG_ENABLED="true"
576+
577+
_TIMEOUT_COMMAND_PART=""
578+
_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean
579+
580+
if command -v timeout >/dev/null 2>&1; then
581+
echo "[postStart hook] Executing commands with timeout: ${POSTSTART_TIMEOUT_DURATION} seconds, kill after: ${POSTSTART_KILL_AFTER_DURATION} seconds" >&2
582+
_TIMEOUT_COMMAND_PART="timeout --preserve-status --kill-after=${POSTSTART_KILL_AFTER_DURATION} ${POSTSTART_TIMEOUT_DURATION}"
583+
_WAS_TIMEOUT_USED="true"
584+
else
585+
echo "[postStart hook] WARNING: 'timeout' utility not found. Executing commands without timeout." >&2
586+
fi
587+
588+
# Execute the user's script
589+
${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'hello world'
590+
sleep 1'
591+
exit_code=$?
592+
593+
if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then
594+
echo "[postStart] failure encountered, sleep for debugging" >&2
595+
sleep 3600
596+
fi
597+
508598
# Check the exit code based on whether timeout was attempted
509599
if [ "$_WAS_TIMEOUT_USED" = "true" ]; then
510600
if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM)
@@ -531,7 +621,7 @@ exit $exit_code
531621

532622
for _, tt := range tests {
533623
t.Run(tt.name, func(t *testing.T) {
534-
script := generateScriptWithTimeout(tt.escapedUserScript, tt.timeout)
624+
script := generateScriptWithTimeout(tt.debugEnabled, tt.escapedUserScript, tt.timeout)
535625
assert.Equal(t, tt.expectedScript, script)
536626
})
537627
}

pkg/library/lifecycle/testdata/postStart/adds_all_postStart_commands.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
name: "Adds all postStart events to containers"
22

33
input:
4+
debugEnabled: false
45
devfile:
56
commands:
67
- id: test-postStart-1

pkg/library/lifecycle/testdata/postStart/basic_postStart.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
name: "Should add postStart lifecycle hook for basic event"
22

33
input:
4+
debugEnabled: false
45
devfile:
56
commands:
67
- id: test-postStart

0 commit comments

Comments
 (0)