-
Notifications
You must be signed in to change notification settings - Fork 103
OCPBUGS-38120: Improve error messages for project Delete errors #520
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,11 +3,13 @@ package proxy | |
| import ( | ||
| "context" | ||
| "fmt" | ||
| "time" | ||
|
|
||
| kerrors "k8s.io/apimachinery/pkg/api/errors" | ||
| metainternal "k8s.io/apimachinery/pkg/apis/meta/internalversion" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/apimachinery/pkg/runtime" | ||
| "k8s.io/apimachinery/pkg/util/wait" | ||
| "k8s.io/apimachinery/pkg/watch" | ||
| apirequest "k8s.io/apiserver/pkg/endpoints/request" | ||
| "k8s.io/apiserver/pkg/registry/rest" | ||
|
|
@@ -208,11 +210,84 @@ func (s *REST) Update(ctx context.Context, name string, objInfo rest.UpdatedObje | |
|
|
||
| var _ = rest.GracefulDeleter(&REST{}) | ||
|
|
||
| // maxRetriesOnConflict is the maximum retry count for Delete calls which | ||
| // result in resource conflicts. | ||
| const maxRetriesOnConflict = 10 | ||
|
|
||
| // maxDuration set max duration of delete retries. Deleting a project affects apiserver latency, | ||
| // so this should be kept as small as possible | ||
| const maxDuration = time.Second | ||
|
|
||
| // Delete deletes a Project specified by its name | ||
| func (s *REST) Delete(ctx context.Context, name string, objectFunc rest.ValidateObjectFunc, options *metav1.DeleteOptions) (runtime.Object, bool, error) { | ||
| var opts metav1.DeleteOptions | ||
| if options != nil { | ||
| opts = *options | ||
| } | ||
| return &metav1.Status{Status: metav1.StatusSuccess}, false, s.client.Delete(ctx, name, opts) | ||
| var lastErr error | ||
| err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{Steps: maxRetriesOnConflict, Duration: maxDuration}, func(ctx context.Context) (bool, error) { | ||
| var err error | ||
| if objectFunc != nil { | ||
| var obj runtime.Object | ||
| getOpts := metav1.GetOptions{} | ||
| if opts.Preconditions != nil && opts.Preconditions.ResourceVersion != nil { | ||
| getOpts.ResourceVersion = *opts.Preconditions.ResourceVersion | ||
| } | ||
| obj, err = s.Get(ctx, name, &getOpts) | ||
| if err != nil { | ||
| lastErr = fmt.Errorf("unable to get project: %w", err) | ||
| return false, nil | ||
| } | ||
| projectObj, ok := obj.(*projectapi.Project) | ||
| if !ok || projectObj == nil { | ||
| lastErr = fmt.Errorf("not a project: %#v", obj) | ||
| return false, nil | ||
| } | ||
| if opts.Preconditions == nil { | ||
| opts.Preconditions = &metav1.Preconditions{} | ||
| } | ||
| if options.Preconditions != nil { | ||
| // Throw an error if the UID or ResourceVersion preconditions do not match fetched object already | ||
| // This would avoid extra retries when user has provided invalid preconditions | ||
| if opts.Preconditions.UID != nil && projectObj.UID != *options.Preconditions.UID { | ||
| lastErr = fmt.Errorf("precondition UID %s does not match project UID %s", *opts.Preconditions.UID, projectObj.UID) | ||
| return false, nil | ||
| } | ||
| if opts.Preconditions.ResourceVersion != nil && projectObj.ResourceVersion != *options.Preconditions.ResourceVersion { | ||
| lastErr = fmt.Errorf("precondition RV %s does not match project RV %s", *opts.Preconditions.ResourceVersion, projectObj.ResourceVersion) | ||
| return false, nil | ||
| } | ||
| } | ||
| // Make sure the object hasn't changed between Get and Delete - pass UID and RV to delete options | ||
| // unless Precondition is already set | ||
| if opts.Preconditions.UID == nil { | ||
| opts.Preconditions.UID = &projectObj.UID | ||
| } | ||
| if opts.Preconditions.ResourceVersion == nil { | ||
| opts.Preconditions.ResourceVersion = &projectObj.ResourceVersion | ||
| } | ||
|
Comment on lines
+263
to
+268
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We don't want to retry conflicts that are caused by client-provided preconditions (they are probably doomed unless the request changes). If we might have propagated one precondition from the request, and added a second precondition here, it becomes hard to robustly determine which precondition caused a conflict. One way to solve this might be to inspect the fresh namespace returned from Get and enforce any client-provided preconditions immediately. After that, we know that both preconditions passed to the namespace Delete came from this code and that a retry might succeed with a newer UID/RV.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Check that fetched project matches RV and/or UID before proceeding with delete |
||
|
|
||
| if err := objectFunc(ctx, obj); err != nil { | ||
| lastErr = fmt.Errorf("validation func failed: %w", err) | ||
| return false, nil | ||
| } | ||
| } | ||
| err = s.client.Delete(ctx, name, opts) | ||
| switch { | ||
| case err == nil: | ||
| return true, nil | ||
| case kerrors.IsConflict(err): | ||
| lastErr = err | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add tests showing that retry happens on conflict, no retry happens on non-conflict, and one where retries are exhausted please? |
||
| return false, nil | ||
| default: | ||
| return false, err | ||
| } | ||
| }) | ||
| if err != nil { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I don't understand how this helped. Isn't the doc for this function telling me that it would return immediately if there were a delete conflict? |
||
| if wait.Interrupted(err) { | ||
| err = lastErr | ||
| } | ||
| return &metav1.Status{Status: metav1.StatusFailure}, false, err | ||
| } | ||
| return &metav1.Status{Status: metav1.StatusSuccess}, false, nil | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What happens when
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Right, missed that part - we should replace |
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I'm understanding the Godoc for Duration correctly, this means that we will sleep for one second between retries. That seems high to me. I bet it is a lot longer than a typical total latency of both namespace requests combined.
We can configure the other fields for exponential backoff so that initial retry is fairly fast.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, right, somehow I thought "Duration" is max duration we're allowed to spend. I think
`wait.Backoff{Steps: maxRetriesOnConflict, Factor: 1/maxRetriesOnConflict, Cap: maxDuration, Duration: maxDuration/maxRetriesOnConflict}` would make it "up to 1 second" and ensure it has several retries