diff --git a/CHANGELOG.md b/CHANGELOG.md index f92f9e0c6..c81c95450 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). * Delete the CancelLoop function, fixing a cluster status update bug * Correctly detect failed version checker Pods +* retry cluster status updates, reducing test flakes # [v2.7.0](https://github.com/cockroachdb/cockroach-operator/compare/v2.6.0...v2.7.0) diff --git a/pkg/controller/BUILD.bazel b/pkg/controller/BUILD.bazel index feb933dcc..e45d26462 100644 --- a/pkg/controller/BUILD.bazel +++ b/pkg/controller/BUILD.bazel @@ -24,6 +24,7 @@ go_library( "@io_k8s_api//policy/v1beta1:go_default_library", "@io_k8s_apimachinery//pkg/runtime:go_default_library", "@io_k8s_client_go//kubernetes:go_default_library", + "@io_k8s_client_go//util/retry:go_default_library", "@io_k8s_sigs_controller_runtime//:go_default_library", "@io_k8s_sigs_controller_runtime//pkg/client:go_default_library", "@io_k8s_sigs_controller_runtime//pkg/reconcile:go_default_library", diff --git a/pkg/controller/cluster_controller.go b/pkg/controller/cluster_controller.go index 7a99463db..0c89df858 100644 --- a/pkg/controller/cluster_controller.go +++ b/pkg/controller/cluster_controller.go @@ -37,6 +37,7 @@ import ( policy "k8s.io/api/policy/v1beta1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes" + "k8s.io/client-go/util/retry" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -121,8 +122,8 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request // we added a state called Starting for field ClusterStatus to accomplish this if cluster.Status().ClusterStatus == "" { cluster.SetClusterStatusOnFirstReconcile() - if err := r.Client.Status().Update(ctx, cluster.Unwrap()); err != nil { - log.Error(err, "failed to update cluster status on action") + if err := r.updateClusterStatus(ctx, log, &cluster); err != nil { + log.Error(err, "failed to update cluster status") return requeueIfError(err) } return requeueImmediately() @@ -132,8 +133,8 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request if cluster.True(api.CrdbVersionChecked) { if cluster.GetCockroachDBImageName() != cluster.Status().CrdbContainerImage { cluster.SetFalse(api.CrdbVersionChecked) - if err := r.Client.Status().Update(ctx, cluster.Unwrap()); err != nil { - log.Error(err, "failed to update cluster status on action") + if err := r.updateClusterStatus(ctx, log, &cluster); err != nil { + log.Error(err, "failed to update cluster status") return requeueIfError(err) } return requeueImmediately() @@ -148,8 +149,6 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request return noRequeue() } - ctx = context.Background() - log.Info(fmt.Sprintf("Running action with name: %s", actorToExecute.GetActionType())) if err := actorToExecute.Act(ctx, &cluster, log); err != nil { // Save the error on the Status for each action @@ -157,7 +156,7 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request cluster.SetActionFailed(actorToExecute.GetActionType(), err.Error()) defer func(ctx context.Context, cluster *resource.Cluster) { - if err := r.Client.Status().Update(ctx, cluster.Unwrap()); err != nil { + if err := r.updateClusterStatus(ctx, log, cluster); err != nil { log.Error(err, "failed to update cluster status") } }(ctx, &cluster) @@ -193,20 +192,7 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request cluster.SetActionFinished(actorToExecute.GetActionType()) } - // Check if the resource has been updated while the controller worked on it - fresh, err := cluster.IsFresh(fetcher) - if err != nil { - return requeueIfError(err) - } - - // If the resource was updated, it is needed to start all over again - // to ensure that the latest state was reconciled - if !fresh { - log.V(int(zapcore.DebugLevel)).Info("cluster resources is not up to date") - return requeueImmediately() - } - cluster.SetClusterStatus() - if err := r.Client.Status().Update(ctx, cluster.Unwrap()); err != nil { + if err := r.updateClusterStatus(ctx, log, &cluster); err != nil { log.Error(err, "failed to update cluster status") return requeueIfError(err) } @@ -215,6 +201,15 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request return noRequeue() } +// updateClusterStatus preprocesses a cluster's Status and then persists it to +// the Kubernetes API. updateClusterStatus will retry on conflict errors. +func (r *ClusterReconciler) updateClusterStatus(ctx context.Context, log logr.Logger, cluster *resource.Cluster) error { + cluster.SetClusterStatus() + return retry.RetryOnConflict(retry.DefaultBackoff, func() error { + return r.Client.Status().Update(ctx, cluster.Unwrap()) + }) +} + // SetupWithManager registers the controller with the controller.Manager from controller-runtime func (r *ClusterReconciler) SetupWithManager(mgr ctrl.Manager) error { var ingress client.Object diff --git a/pkg/resource/cluster.go b/pkg/resource/cluster.go index bdf900925..8a5f6c263 100644 --- a/pkg/resource/cluster.go +++ b/pkg/resource/cluster.go @@ -319,15 +319,6 @@ func (cluster Cluster) SecureMode() string { return "--insecure" } -func (cluster Cluster) IsFresh(fetcher Fetcher) (bool, error) { - actual := ClusterPlaceholder(cluster.Name()) - if err := fetcher.Fetch(actual); err != nil { - return false, errors.Wrapf(err, "failed to fetch cluster resource") - } - - return cluster.cr.ResourceVersion == actual.ResourceVersion, nil -} - func (cluster Cluster) LoggingConfiguration(fetcher Fetcher) (string, error) { if cluster.Spec().LogConfigMap != "" { cm := &corev1.ConfigMap{