From e5d154b3029b5af530ac13a72a871912918e4ba6 Mon Sep 17 00:00:00 2001 From: raaizik <132667934+raaizik@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:49:47 +0300 Subject: [PATCH 1/2] Available CRDs check feature Reasons for this enhancement: - A controller cannot set up a watch for a CRD that is not installed on the cluster, trying to set up a watch will panic the operator - There is no known way, that we are aware of, to add a watch later without client cache issue How does the enhancement work around the issue: - A new controller to watch creation/deletion for the CRDs of interest to prevent unnecessary reconciles - On start of the operator (main), detect which CRDs are available (out of a fixed list) - At the start of each reconcile of the new controller, we fetch the available CRDs again and compare them with the CRDs fetched in the previous step. If there is any change, we restart the operator process Signed-off-by: raaizik <132667934+raaizik@users.noreply.github.com> Co-Authored-By: Rewant Soni --- .../initialization_reconciler_test.go | 1 + controllers/storagecluster/reconcile.go | 14 +++++++++- .../storagecluster_controller.go | 27 ++++++++++++++++++ .../storagecluster_controller_test.go | 1 + controllers/util/k8sutil.go | 18 ++++++++++++ main.go | 28 +++++++++++-------- 6 files changed, 77 insertions(+), 12 deletions(-) diff --git a/controllers/storagecluster/initialization_reconciler_test.go b/controllers/storagecluster/initialization_reconciler_test.go index dd39da05dd..a872cd5b01 100644 --- a/controllers/storagecluster/initialization_reconciler_test.go +++ b/controllers/storagecluster/initialization_reconciler_test.go @@ -401,5 +401,6 @@ func createFakeInitializationStorageClusterReconciler(t *testing.T, obj ...runti Scheme: scheme, OperatorCondition: newStubOperatorCondition(), Log: logf.Log.WithName("controller_storagecluster_test"), + AvailableCrds: make(map[string]bool), } } diff --git a/controllers/storagecluster/reconcile.go b/controllers/storagecluster/reconcile.go index 
9c8acac146..cfb63aeb01 100644 --- a/controllers/storagecluster/reconcile.go +++ b/controllers/storagecluster/reconcile.go @@ -4,6 +4,8 @@ import ( "context" error1 "errors" "fmt" + "os" + "reflect" "strings" "time" @@ -159,6 +161,17 @@ func (r *StorageClusterReconciler) Reconcile(ctx context.Context, request reconc return reconcile.Result{}, err } + var err error + availableCrds, err := statusutil.MapCRDAvailability(ctx, r.Client, statusutil.GetCrds()...) + if err != nil { + r.Log.Error(err, "error fetching the CRDs") + return reconcile.Result{}, err + } + if !reflect.DeepEqual(availableCrds, r.AvailableCrds) { + r.Log.Info("CustomResourceDefinitions created/deleted. Restarting process.") + os.Exit(42) + } + if err := r.validateStorageClusterSpec(sc); err != nil { return reconcile.Result{}, err } @@ -166,7 +179,6 @@ func (r *StorageClusterReconciler) Reconcile(ctx context.Context, request reconc r.IsNoobaaStandalone = sc.Spec.MultiCloudGateway != nil && ReconcileStrategy(sc.Spec.MultiCloudGateway.ReconcileStrategy) == ReconcileStrategyStandalone - var err error r.clusters, err = statusutil.GetClusters(ctx, r.Client) if err != nil { r.Log.Error(err, "Failed to get clusters") diff --git a/controllers/storagecluster/storagecluster_controller.go b/controllers/storagecluster/storagecluster_controller.go index 11c6564969..a6ced19007 100644 --- a/controllers/storagecluster/storagecluster_controller.go +++ b/controllers/storagecluster/storagecluster_controller.go @@ -96,6 +96,7 @@ type StorageClusterReconciler struct { IsMultipleStorageClusters bool clusters *util.Clusters OperatorNamespace string + AvailableCrds map[string]bool } // SetupWithManager sets up a controller with manager @@ -211,6 +212,31 @@ func (r *StorageClusterReconciler) SetupWithManager(mgr ctrl.Manager) error { }, } + crdPredicate := predicate.Funcs{ + CreateFunc: func(e event.TypedCreateEvent[client.Object]) bool { + crdAvailable, keyExist := r.AvailableCrds[e.Object.GetName()] + if keyExist && 
!crdAvailable { + r.Log.Info("CustomResourceDefinition %s was Created.", e.Object.GetName()) + return true + } + return false + }, + DeleteFunc: func(e event.TypedDeleteEvent[client.Object]) bool { + crdAvailable, keyExist := r.AvailableCrds[e.Object.GetName()] + if keyExist && crdAvailable { + r.Log.Info("CustomResourceDefinition %s was Deleted.", e.Object.GetName()) + return true + } + return false + }, + UpdateFunc: func(e event.TypedUpdateEvent[client.Object]) bool { + return false + }, + GenericFunc: func(e event.TypedGenericEvent[client.Object]) bool { + return false + }, + } + build := ctrl.NewControllerManagedBy(mgr). For(&ocsv1.StorageCluster{}, builder.WithPredicates(scPredicate)). Owns(&cephv1.CephCluster{}, builder.WithPredicates(cephClusterIgnoreTimeUpdatePredicate)). @@ -228,6 +254,7 @@ func (r *StorageClusterReconciler) SetupWithManager(mgr ctrl.Manager) error { Owns(&corev1.Secret{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). Owns(&routev1.Route{}). Owns(&templatev1.Template{}). + Watches(&extv1.CustomResourceDefinition{}, enqueueStorageClusterRequest, builder.WithPredicates(crdPredicate)). Watches(&storagev1.StorageClass{}, enqueueStorageClusterRequest). Watches(&volumesnapshotv1.VolumeSnapshotClass{}, enqueueStorageClusterRequest). Watches(&ocsclientv1a1.StorageClient{}, enqueueStorageClusterRequest). 
diff --git a/controllers/storagecluster/storagecluster_controller_test.go b/controllers/storagecluster/storagecluster_controller_test.go index 46c710e26d..cb84a6e39d 100644 --- a/controllers/storagecluster/storagecluster_controller_test.go +++ b/controllers/storagecluster/storagecluster_controller_test.go @@ -1168,6 +1168,7 @@ func createFakeStorageClusterReconciler(t *testing.T, obj ...runtime.Object) Sto Log: logf.Log.WithName("controller_storagecluster_test"), clusters: clusters, OperatorNamespace: operatorNamespace, + AvailableCrds: make(map[string]bool), } } diff --git a/controllers/util/k8sutil.go b/controllers/util/k8sutil.go index 01a68536c8..f6a0dbea28 100644 --- a/controllers/util/k8sutil.go +++ b/controllers/util/k8sutil.go @@ -11,6 +11,7 @@ import ( ocsv1 "github.com/red-hat-storage/ocs-operator/api/v4/v1" corev1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" @@ -44,6 +45,10 @@ const ( OwnerUIDIndexName = "ownerUID" ) +func GetCrds() []string { + return []string{} +} + // GetWatchNamespace returns the namespace the operator should be watching for changes func GetWatchNamespace() (string, error) { ns, found := os.LookupEnv(WatchNamespaceEnvVar) @@ -149,3 +154,16 @@ func GenerateNameForNonResilientCephBlockPoolSC(initData *ocsv1.StorageCluster) } return fmt.Sprintf("%s-ceph-non-resilient-rbd", initData.Name) } + +func MapCRDAvailability(ctx context.Context, clnt client.Client, crdNames ...string) (map[string]bool, error) { + crdExist := map[string]bool{} + for _, crdName := range crdNames { + crd := &apiextensionsv1.CustomResourceDefinition{} + crd.Name = crdName + if err := clnt.Get(ctx, client.ObjectKeyFromObject(crd), crd); client.IgnoreNotFound(err) != nil { + return nil, fmt.Errorf("error getting CRD, %v", err) + } + crdExist[crdName] = crd.UID != "" + } + 
return crdExist, nil +} diff --git a/main.go b/main.go index 4782f7417d..0308a5f0fe 100644 --- a/main.go +++ b/main.go @@ -166,6 +166,22 @@ func main() { setupLog.Error(err, "Unable to get OperatorCondition") os.Exit(1) } + // apiclient.New() returns a client without cache. + // cache is not initialized before mgr.Start() + // we need this because we need to interact with OperatorCondition + apiClient, err := apiclient.New(mgr.GetConfig(), apiclient.Options{ + Scheme: mgr.GetScheme(), + }) + if err != nil { + setupLog.Error(err, "Unable to get Client") + os.Exit(1) + } + + availCrds, err := util.MapCRDAvailability(context.Background(), apiClient, util.GetCrds()...) + if err != nil { + setupLog.Error(err, "Unable to get CRD") + os.Exit(1) + } if err = (&ocsinitialization.OCSInitializationReconciler{ Client: mgr.GetClient(), @@ -184,6 +200,7 @@ func main() { Scheme: mgr.GetScheme(), OperatorNamespace: operatorNamespace, OperatorCondition: condition, + AvailableCrds: availCrds, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "StorageCluster") os.Exit(1) @@ -241,17 +258,6 @@ func main() { os.Exit(1) } - // apiclient.New() returns a client without cache. 
- // cache is not initialized before mgr.Start() - // we need this because we need to interact with OperatorCondition - apiClient, err := apiclient.New(mgr.GetConfig(), apiclient.Options{ - Scheme: mgr.GetScheme(), - }) - if err != nil { - setupLog.Error(err, "Unable to get Client") - os.Exit(1) - } - // Set OperatorCondition Upgradeable to True // We have to at least default the condition to True or // OLM will use the Readiness condition via our readiness probe instead: From 3617868b7441f8f6d0cf36788714bebbd8438582 Mon Sep 17 00:00:00 2001 From: raaizik <132667934+raaizik@users.noreply.github.com> Date: Thu, 8 Aug 2024 17:23:07 +0300 Subject: [PATCH 2/2] Available CRDs check feature w script Adds a script that bypasses pod restarts Signed-off-by: raaizik <132667934+raaizik@users.noreply.github.com> Co-Authored-By: Rewant Soni --- Dockerfile | 5 +++-- config/manager/manager.yaml | 5 +---- deploy/csv-templates/ocs-operator.csv.yaml.in | 7 ++----- .../manifests/ocs-operator.clusterserviceversion.yaml | 7 ++----- hack/crdavail.sh | 11 +++++++++++ 5 files changed, 19 insertions(+), 16 deletions(-) create mode 100755 hack/crdavail.sh diff --git a/Dockerfile b/Dockerfile index d43476fd75..4faa6dce28 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,9 +24,10 @@ COPY --from=builder workspace/provider-api /usr/local/bin/provider-api COPY --from=builder workspace/onboarding-validation-keys-gen /usr/local/bin/onboarding-validation-keys-gen COPY --from=builder workspace/metrics/deploy/*rules*.yaml /ocs-prometheus-rules/ COPY --from=builder workspace/ux-backend-server /usr/local/bin/ux-backend-server +COPY --from=builder workspace/hack/crdavail.sh /usr/local/bin/crdavail -RUN chmod +x /usr/local/bin/ocs-operator /usr/local/bin/provider-api +RUN chmod +x /usr/local/bin/ocs-operator /usr/local/bin/provider-api /usr/local/bin/crdavail USER operator -ENTRYPOINT ["/usr/local/bin/ocs-operator"] +ENTRYPOINT ["/usr/local/bin/crdavail"] diff --git a/config/manager/manager.yaml 
b/config/manager/manager.yaml index 63babb6d03..b2f1263761 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -23,10 +23,7 @@ spec: serviceAccountName: ocs-operator containers: - command: - - ocs-operator - args: - - --enable-leader-election - - "--health-probe-bind-address=:8081" + - crdavail image: ocs-dev/ocs-operator:latest imagePullPolicy: Always name: ocs-operator diff --git a/deploy/csv-templates/ocs-operator.csv.yaml.in b/deploy/csv-templates/ocs-operator.csv.yaml.in index 0be47ececd..063e4419d4 100644 --- a/deploy/csv-templates/ocs-operator.csv.yaml.in +++ b/deploy/csv-templates/ocs-operator.csv.yaml.in @@ -611,11 +611,8 @@ spec: name: ocs-operator spec: containers: - - args: - - --enable-leader-election - - --health-probe-bind-address=:8081 - command: - - ocs-operator + - command: + - crdavail env: - name: WATCH_NAMESPACE valueFrom: diff --git a/deploy/ocs-operator/manifests/ocs-operator.clusterserviceversion.yaml b/deploy/ocs-operator/manifests/ocs-operator.clusterserviceversion.yaml index 3e0765bb70..977395ba80 100644 --- a/deploy/ocs-operator/manifests/ocs-operator.clusterserviceversion.yaml +++ b/deploy/ocs-operator/manifests/ocs-operator.clusterserviceversion.yaml @@ -620,11 +620,8 @@ spec: name: ocs-operator spec: containers: - - args: - - --enable-leader-election - - --health-probe-bind-address=:8081 - command: - - ocs-operator + - command: + - crdavail env: - name: WATCH_NAMESPACE valueFrom: diff --git a/hack/crdavail.sh b/hack/crdavail.sh new file mode 100755 index 0000000000..031ed2be00 --- /dev/null +++ b/hack/crdavail.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +RESTART_EXIT_CODE=42 + +while true; do + ./usr/local/bin/ocs-operator --enable-leader-election --health-probe-bind-address=:8081 + EXIT_CODE=$? + if [ $EXIT_CODE -ne $RESTART_EXIT_CODE ]; then + exit $EXIT_CODE + fi +done \ No newline at end of file