From 6323f53eadd712cca60ca5e20c9922156ece733c Mon Sep 17 00:00:00 2001 From: Daniel Mellado Date: Fri, 10 Oct 2025 14:39:33 +0200 Subject: [PATCH] operator: implement ClusterMonitoring CRD AlertmanagerConfig support adds ClusterMonitoring controller that watches the CRD and triggers reconciliation. implements merge logic to apply AlertmanagerConfig settings from the CRD over the existing ConfigMap configuration. supports three deployment modes (Disabled, DefaultConfig, CustomConfig) with fields for pod scheduling, resources, secrets, volumeClaimTemplate and logLevel. --- .../cluster-monitoring-operator.libsonnet | 5 + ...0_cluster-monitoring-operator_02-role.yaml | 8 + ...onitoring-operator_06-clusteroperator.yaml | 3 + pkg/client/client.go | 14 ++ pkg/clustermonitoring/controller.go | 165 ++++++++++++++++++ pkg/operator/operator.go | 134 +++++++++++++- pkg/operator/operator_test.go | 142 +++++++++++++++ test/e2e/cluster_monitoring_test.go | 154 ++++++++++++++++ 8 files changed, 617 insertions(+), 8 deletions(-) create mode 100644 pkg/clustermonitoring/controller.go create mode 100644 test/e2e/cluster_monitoring_test.go diff --git a/jsonnet/components/cluster-monitoring-operator.libsonnet b/jsonnet/components/cluster-monitoring-operator.libsonnet index c9d2b9b8f5..da209e5643 100644 --- a/jsonnet/components/cluster-monitoring-operator.libsonnet +++ b/jsonnet/components/cluster-monitoring-operator.libsonnet @@ -247,6 +247,11 @@ function(params) { resources: ['featuregates'], verbs: ['get', 'list', 'watch'], }, + { + apiGroups: ['config.openshift.io'], + resources: ['clustermonitorings'], + verbs: ['get', 'list', 'watch'], + }, { apiGroups: ['certificates.k8s.io'], resources: ['certificatesigningrequests'], diff --git a/manifests/0000_50_cluster-monitoring-operator_02-role.yaml b/manifests/0000_50_cluster-monitoring-operator_02-role.yaml index 58afb2eaee..6cb285a7e7 100644 --- a/manifests/0000_50_cluster-monitoring-operator_02-role.yaml +++ b/manifests/0000_50_cluster-monitoring-operator_02-role.yaml @@ -135,6 +135,14 @@ rules: - get - list - watch +- apiGroups: + - config.openshift.io + resources: + - clustermonitorings + verbs: + - get + - list + - watch - apiGroups: - certificates.k8s.io resources: diff --git a/manifests/0000_50_cluster-monitoring-operator_06-clusteroperator.yaml b/manifests/0000_50_cluster-monitoring-operator_06-clusteroperator.yaml index d76a1747f6..5c3da8a36d 100644 --- a/manifests/0000_50_cluster-monitoring-operator_06-clusteroperator.yaml +++ b/manifests/0000_50_cluster-monitoring-operator_06-clusteroperator.yaml @@ -40,3 +40,6 @@ status: - group: monitoring.coreos.com name: '' resource: alertmanagerconfigs + - group: config.openshift.io + name: cluster + resource: clustermonitorings diff --git a/pkg/client/client.go b/pkg/client/client.go index e9058276f3..806edae140 100644 --- a/pkg/client/client.go +++ b/pkg/client/client.go @@ -27,6 +27,7 @@ import ( "github.com/imdario/mergo" configv1 "github.com/openshift/api/config/v1" + configv1alpha1 "github.com/openshift/api/config/v1alpha1" consolev1 "github.com/openshift/api/console/v1" osmv1 "github.com/openshift/api/monitoring/v1" routev1 "github.com/openshift/api/route/v1" @@ -417,6 +418,15 @@ func (c *Client) ClusterOperatorListWatch(ctx context.Context, name string) *cac } } +func (c *Client) ClusterMonitoringListWatch() *cache.ListWatch { + return cache.NewListWatchFromClient( + c.oscclient.ConfigV1alpha1().RESTClient(), + "clustermonitorings", + "", + fields.Everything(), + ) +} + func (c *Client) HasRouteCapability(ctx context.Context) (bool, error) { _, err := c.oscclient.ConfigV1().ClusterOperators().Get(ctx, "ingress", metav1.GetOptions{}) if apierrors.IsNotFound(err) { @@ -595,6 +605,10 @@ func (c *Client) GetConsoleConfig(ctx context.Context, name string) (*configv1.C return c.oscclient.ConfigV1().Consoles().Get(ctx, name, metav1.GetOptions{}) } +func (c *Client) GetClusterMonitoring(ctx context.Context, name string) (*configv1alpha1.ClusterMonitoring, error) { + return c.oscclient.ConfigV1alpha1().ClusterMonitorings().Get(ctx, name, metav1.GetOptions{}) +} + func (c *Client) GetConfigmap(ctx context.Context, namespace, name string) (*v1.ConfigMap, error) { return c.kclient.CoreV1().ConfigMaps(namespace).Get(ctx, name, metav1.GetOptions{}) } diff --git a/pkg/clustermonitoring/controller.go b/pkg/clustermonitoring/controller.go new file mode 100644 index 0000000000..028668e5eb --- /dev/null +++ b/pkg/clustermonitoring/controller.go @@ -0,0 +1,165 @@ +// Copyright 2025 The Cluster Monitoring Operator Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package clustermonitoring + +import ( + "context" + "fmt" + "time" + + configv1alpha1 "github.com/openshift/api/config/v1alpha1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" + + "github.com/openshift/cluster-monitoring-operator/pkg/client" +) + +const ( + controllerName = "cluster-monitoring" + resyncPeriod = 15 * time.Minute + queueBaseDelay = 50 * time.Millisecond + queueMaxDelay = 3 * time.Minute +) + +// Controller is a controller for ClusterMonitoring resources. +type Controller struct { + client *client.Client + queue workqueue.TypedRateLimitingInterface[string] + informer cache.SharedIndexInformer + triggerReconcile func() +} + +// NewController returns a new ClusterMonitoring controller. +func NewController(ctx context.Context, client *client.Client, version string, triggerReconcile func()) (*Controller, error) { + informer := cache.NewSharedIndexInformer( + client.ClusterMonitoringListWatch(), + &configv1alpha1.ClusterMonitoring{}, + resyncPeriod, + cache.Indexers{}, + ) + + queue := workqueue.NewTypedRateLimitingQueueWithConfig[string]( + workqueue.NewTypedItemExponentialFailureRateLimiter[string](queueBaseDelay, queueMaxDelay), + workqueue.TypedRateLimitingQueueConfig[string]{Name: controllerName}, + ) + + controller := &Controller{ + client: client, + queue: queue, + informer: informer, + triggerReconcile: triggerReconcile, + } + + _, err := informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: controller.handleAdd, + UpdateFunc: controller.handleUpdate, + DeleteFunc: controller.handleDelete, + }) + if err != nil { + return nil, err + } + + return controller, nil +} + +// Run starts the controller. +func (c *Controller) Run(ctx context.Context, workers int) { + klog.Info("Starting ClusterMonitoring controller") + defer c.queue.ShutDown() + + go c.informer.Run(ctx.Done()) + + if !cache.WaitForNamedCacheSync("ClusterMonitoring controller", ctx.Done(), c.informer.HasSynced) { + klog.Error("Failed to sync ClusterMonitoring controller cache") + return + } + + go c.worker(ctx) + + klog.Info("ClusterMonitoring controller started") + <-ctx.Done() + klog.Info("ClusterMonitoring controller stopped") +} + +func (c *Controller) worker(ctx context.Context) { + for c.processNextWorkItem(ctx) { + } +} + +func (c *Controller) processNextWorkItem(ctx context.Context) bool { + key, quit := c.queue.Get() + if quit { + return false + } + defer c.queue.Done(key) + + if err := c.sync(ctx, key); err != nil { + utilruntime.HandleError(fmt.Errorf("error syncing ClusterMonitoring (%s): %w", key, err)) + c.queue.AddRateLimited(key) + return true + } + + klog.V(4).Infof("ClusterMonitoring successfully synced: %s", key) + c.queue.Forget(key) + return true +} + +func (c *Controller) sync(ctx context.Context, key string) error { + klog.V(4).Infof("ClusterMonitoring controller processing: %s", key) + + if c.triggerReconcile != nil { + c.triggerReconcile() + } + + return nil +} + +func (c *Controller) handleAdd(obj interface{}) { + key, ok := c.keyFunc(obj) + if !ok { + return + } + klog.Infof("ClusterMonitoring added: %s", key) + c.queue.Add(key) +} + +func (c *Controller) handleUpdate(oldObj, newObj interface{}) { + key, ok := c.keyFunc(newObj) + if !ok { + return + } + klog.Infof("ClusterMonitoring updated: %s", key) + c.queue.Add(key) +} + +func (c *Controller) handleDelete(obj interface{}) { + key, ok := c.keyFunc(obj) + if !ok { + return + } + klog.Infof("ClusterMonitoring deleted: %s", key) + c.queue.Add(key) +} + +func (c *Controller) keyFunc(obj interface{}) (string, bool) { + key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj) + if err != nil { + klog.Errorf("Creating key for ClusterMonitoring object failed: %v", err) + return key, false + } + return key, true +} diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index 91f8e6ecc7..113be6d6b9 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -24,6 +24,7 @@ import ( "github.com/blang/semver/v4" configv1 "github.com/openshift/api/config/v1" + configv1alpha1 "github.com/openshift/api/config/v1alpha1" "github.com/openshift/api/features" configv1client "github.com/openshift/client-go/config/clientset/versioned" configv1informers "github.com/openshift/client-go/config/informers/externalversions" @@ -31,6 +32,7 @@ import ( "github.com/openshift/library-go/pkg/operator/configobserver/featuregates" "github.com/openshift/library-go/pkg/operator/csr" "github.com/openshift/library-go/pkg/operator/events" + monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" certapiv1 "k8s.io/api/certificates/v1" v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -45,9 +47,11 @@ import ( "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" "k8s.io/utils/clock" + "k8s.io/utils/ptr" "github.com/openshift/cluster-monitoring-operator/pkg/alert" "github.com/openshift/cluster-monitoring-operator/pkg/client" + "github.com/openshift/cluster-monitoring-operator/pkg/clustermonitoring" "github.com/openshift/cluster-monitoring-operator/pkg/manifests" "github.com/openshift/cluster-monitoring-operator/pkg/metrics" "github.com/openshift/cluster-monitoring-operator/pkg/tasks" @@ -177,12 +181,13 @@ const ( type Operator struct { namespace, namespaceUserWorkload string - configMapName string - userWorkloadConfigMapName string - images map[string]string - telemetryMatches []string - remoteWrite bool - CollectionProfilesEnabled bool + configMapName string + userWorkloadConfigMapName string + images map[string]string + telemetryMatches []string + remoteWrite bool + CollectionProfilesEnabled bool + ClusterMonitoringConfigEnabled bool lastKnowInfrastructureConfig *InfrastructureConfig lastKnowProxyConfig *ProxyConfig @@ -202,8 +207,9 @@ type Operator struct { assets *manifests.Assets - ruleController *alert.RuleController - relabelController *alert.RelabelConfigController + ruleController *alert.RuleController + relabelController *alert.RelabelConfigController + clusterMonitoringController *clustermonitoring.Controller } func New( @@ -452,10 +458,23 @@ func New( return nil, err } o.CollectionProfilesEnabled = featureGates.Enabled(features.FeatureGateMetricsCollectionProfiles) + o.ClusterMonitoringConfigEnabled = featureGates.Enabled(features.FeatureGateClusterMonitoringConfig) case <-time.After(1 * time.Minute): return nil, fmt.Errorf("timed out waiting for FeatureGate detection") } + // Only create the ClusterMonitoring controller if the feature gate is enabled + if o.ClusterMonitoringConfigEnabled { + clusterMonitoringController, err := clustermonitoring.NewController(ctx, c, version, func() { + key := o.namespace + "/" + o.configMapName + o.enqueue(key) + }) + if err != nil { + return nil, fmt.Errorf("failed to create cluster monitoring controller: %w", err) + } + o.clusterMonitoringController = clusterMonitoringController + } + // csrController runs a controller that requests a client TLS certificate // for Prometheus k8s. This certificate is used to authenticate against the // /metrics endpoint of the targets. @@ -534,6 +553,10 @@ func New( o.relabelController.Run, ) + if o.clusterMonitoringController != nil { + o.controllersToRunFunc = append(o.controllersToRunFunc, o.clusterMonitoringController.Run) + } + return o, nil } @@ -964,6 +987,96 @@ func (o *Operator) loadUserWorkloadConfig(ctx context.Context) (*manifests.UserW return manifests.NewUserWorkloadConfigFromConfigMap(userCM) } +func (o *Operator) mergeClusterMonitoringCRD(ctx context.Context, c *manifests.Config) error { + if !o.ClusterMonitoringConfigEnabled { + return nil + } + + crd, err := o.client.GetClusterMonitoring(ctx, clusterResourceName) + if err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to get ClusterMonitoring CRD: %w", err) + } + + if err := o.mergeAlertmanagerConfig(c, &crd.Spec.AlertmanagerConfig); err != nil { + return fmt.Errorf("failed to merge AlertmanagerConfig: %w", err) + } + + return nil +} + +func (o *Operator) mergeAlertmanagerConfig(c *manifests.Config, amConfig *configv1alpha1.AlertmanagerConfig) error { + if c.ClusterMonitoringConfiguration.AlertmanagerMainConfig == nil { + c.ClusterMonitoringConfiguration.AlertmanagerMainConfig = &manifests.AlertmanagerMainConfig{} + } + + switch amConfig.DeploymentMode { + case configv1alpha1.AlertManagerDeployModeDisabled: + c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.Enabled = ptr.To(false) + + case configv1alpha1.AlertManagerDeployModeDefaultConfig: + c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.Enabled = ptr.To(true) + + case configv1alpha1.AlertManagerDeployModeCustomConfig: + c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.Enabled = ptr.To(true) + + cfg := &amConfig.CustomConfig + + if cfg.LogLevel != "" { + c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.LogLevel = strings.ToLower(string(cfg.LogLevel)) + } + + if len(cfg.NodeSelector) > 0 { + c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.NodeSelector = cfg.NodeSelector + } + + if len(cfg.Resources) > 0 { + resources := &v1.ResourceRequirements{ + Requests: v1.ResourceList{}, + Limits: v1.ResourceList{}, + } + for _, res := range cfg.Resources { + if !res.Request.IsZero() { + resources.Requests[v1.ResourceName(res.Name)] = res.Request + } + if !res.Limit.IsZero() { + resources.Limits[v1.ResourceName(res.Name)] = res.Limit + } + } + c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.Resources = resources + } + + if len(cfg.Secrets) > 0 { + secrets := make([]string, len(cfg.Secrets)) + for i, s := range cfg.Secrets { + secrets[i] = string(s) + } + c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.Secrets = secrets + } + + if len(cfg.Tolerations) > 0 { + c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.Tolerations = cfg.Tolerations + } + + if len(cfg.TopologySpreadConstraints) > 0 { + c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.TopologySpreadConstraints = cfg.TopologySpreadConstraints + } + + if cfg.VolumeClaimTemplate != nil { + c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.VolumeClaimTemplate = &monv1.EmbeddedPersistentVolumeClaim{ + Spec: cfg.VolumeClaimTemplate.Spec, + } + } + + default: + return fmt.Errorf("unknown DeploymentMode: %s", amConfig.DeploymentMode) + } + + return nil +} + func (o *Operator) loadConfig(key string) (*manifests.Config, error) { obj, found, err := o.cmapInf.GetStore().GetByKey(key) if err != nil { @@ -985,6 +1098,11 @@ func (o *Operator) Config(ctx context.Context, key string) (*manifests.Config, e return nil, err } + err = o.mergeClusterMonitoringCRD(ctx, c) + if err != nil { + klog.Warningf("failed to merge ClusterMonitoring CRD configuration: %v", err) + } + err = c.Precheck() if err != nil { return nil, err diff --git a/pkg/operator/operator_test.go b/pkg/operator/operator_test.go index 2bb994ca96..a9de9861fb 100644 --- a/pkg/operator/operator_test.go +++ b/pkg/operator/operator_test.go @@ -21,7 +21,9 @@ import ( "testing" configv1 "github.com/openshift/api/config/v1" + configv1alpha1 "github.com/openshift/api/config/v1alpha1" "github.com/stretchr/testify/require" + "k8s.io/apimachinery/pkg/api/resource" apiutilerrors "k8s.io/apimachinery/pkg/util/errors" "github.com/openshift/cluster-monitoring-operator/pkg/client" @@ -558,3 +560,143 @@ func TestGenerateRunReportFromTaskErrors(t *testing.T) { }) } } + +func TestMergeAlertmanagerConfig(t *testing.T) { + for _, tc := range []struct { + name string + baseConfig *manifests.Config + crdConfig *configv1alpha1.AlertmanagerConfig + expectError bool + validate func(*testing.T, *manifests.Config) + }{ + { + name: "disabled mode", + baseConfig: &manifests.Config{ + ClusterMonitoringConfiguration: &manifests.ClusterMonitoringConfiguration{ + AlertmanagerMainConfig: &manifests.AlertmanagerMainConfig{}, + }, + }, + crdConfig: &configv1alpha1.AlertmanagerConfig{ + DeploymentMode: configv1alpha1.AlertManagerDeployModeDisabled, + }, + validate: func(t *testing.T, c *manifests.Config) { + if c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.Enabled == nil { + t.Error("expected Enabled to be set") + return + } + if *c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.Enabled { + t.Error("expected Enabled to be false") + } + }, + }, + { + name: "default config mode", + baseConfig: &manifests.Config{ + ClusterMonitoringConfiguration: &manifests.ClusterMonitoringConfiguration{ + AlertmanagerMainConfig: &manifests.AlertmanagerMainConfig{}, + }, + }, + crdConfig: &configv1alpha1.AlertmanagerConfig{ + DeploymentMode: configv1alpha1.AlertManagerDeployModeDefaultConfig, + }, + validate: func(t *testing.T, c *manifests.Config) { + if c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.Enabled == nil { + t.Error("expected Enabled to be set") + return + } + if !*c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.Enabled { + t.Error("expected Enabled to be true") + } + }, + }, + { + name: "custom config with log level", + baseConfig: &manifests.Config{ + ClusterMonitoringConfiguration: &manifests.ClusterMonitoringConfiguration{ + AlertmanagerMainConfig: &manifests.AlertmanagerMainConfig{}, + }, + }, + crdConfig: &configv1alpha1.AlertmanagerConfig{ + DeploymentMode: configv1alpha1.AlertManagerDeployModeCustomConfig, + CustomConfig: configv1alpha1.AlertmanagerCustomConfig{ + LogLevel: configv1alpha1.LogLevelDebug, + }, + }, + validate: func(t *testing.T, c *manifests.Config) { + if c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.LogLevel != "debug" { + t.Errorf("expected LogLevel debug, got %s", c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.LogLevel) + } + }, + }, + { + name: "custom config with resources", + baseConfig: &manifests.Config{ + ClusterMonitoringConfiguration: &manifests.ClusterMonitoringConfiguration{ + AlertmanagerMainConfig: &manifests.AlertmanagerMainConfig{}, + }, + }, + crdConfig: &configv1alpha1.AlertmanagerConfig{ + DeploymentMode: configv1alpha1.AlertManagerDeployModeCustomConfig, + CustomConfig: configv1alpha1.AlertmanagerCustomConfig{ + Resources: []configv1alpha1.ContainerResource{ + { + Name: "cpu", + Request: resource.MustParse("100m"), + }, + }, + }, + }, + validate: func(t *testing.T, c *manifests.Config) { + if c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.Resources == nil { + t.Error("expected Resources to be set") + return + } + if cpu := c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.Resources.Requests.Cpu().String(); cpu != "100m" { + t.Errorf("expected CPU request 100m, got %s", cpu) + } + }, + }, + { + name: "custom config overrides configmap", + baseConfig: &manifests.Config{ + ClusterMonitoringConfiguration: &manifests.ClusterMonitoringConfiguration{ + AlertmanagerMainConfig: &manifests.AlertmanagerMainConfig{ + LogLevel: "info", + }, + }, + }, + crdConfig: &configv1alpha1.AlertmanagerConfig{ + DeploymentMode: configv1alpha1.AlertManagerDeployModeCustomConfig, + CustomConfig: configv1alpha1.AlertmanagerCustomConfig{ + LogLevel: configv1alpha1.LogLevelDebug, + }, + }, + validate: func(t *testing.T, c *manifests.Config) { + if c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.LogLevel != "debug" { + t.Errorf("expected CRD to override ConfigMap, got %s", c.ClusterMonitoringConfiguration.AlertmanagerMainConfig.LogLevel) + } + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + o := &Operator{} + err := o.mergeAlertmanagerConfig(tc.baseConfig, tc.crdConfig) + + if tc.expectError { + if err == nil { + t.Error("expected error, got nil") + } + return + } + + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + + if tc.validate != nil { + tc.validate(t, tc.baseConfig) + } + }) + } +} diff --git a/test/e2e/cluster_monitoring_test.go b/test/e2e/cluster_monitoring_test.go new file mode 100644 index 0000000000..3b5206ad87 --- /dev/null +++ b/test/e2e/cluster_monitoring_test.go @@ -0,0 +1,154 @@ +// Copyright 2025 The Cluster Monitoring Operator Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package e2e + +import ( + "context" + "testing" + "time" + + configv1alpha1 "github.com/openshift/api/config/v1alpha1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" +) + +const ( + clusterMonitoringName = "cluster" +) + +func TestClusterMonitoring(t *testing.T) { + ctx := context.Background() + clusterMonitorings := f.OpenShiftConfigClient.ConfigV1alpha1().ClusterMonitorings() + + _, err := clusterMonitorings.List(ctx, metav1.ListOptions{Limit: 1}) + if err != nil { + if apierrors.IsNotFound(err) { + t.Skip("ClusterMonitoring CRD not available - ClusterMonitoringConfig feature gate may not be enabled") + return + } + t.Fatalf("unexpected error checking ClusterMonitoring CRD availability: %v", err) + } + + cm := &configv1alpha1.ClusterMonitoring{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterMonitoringName, + }, + Spec: configv1alpha1.ClusterMonitoringSpec{ + AlertmanagerConfig: configv1alpha1.AlertmanagerConfig{ + DeploymentMode: configv1alpha1.AlertManagerDeployModeDefaultConfig, + }, + }, + } + + createdCM, err := clusterMonitorings.Create(ctx, cm, metav1.CreateOptions{}) + if err != nil { + if !apierrors.IsAlreadyExists(err) { + t.Fatalf("failed to create ClusterMonitoring: %v", err) + } + existingCM, err := clusterMonitorings.Get(ctx, clusterMonitoringName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("failed to get existing ClusterMonitoring: %v", err) + } + existingCM.Spec = cm.Spec + createdCM, err = clusterMonitorings.Update(ctx, existingCM, metav1.UpdateOptions{}) + if err != nil { + t.Fatalf("failed to update ClusterMonitoring: %v", err) + } + } + + t.Logf("configured ClusterMonitoring resource: %s", createdCM.Name) + + err = wait.PollImmediate(5*time.Second, 30*time.Second, func() (bool, error) { + retrievedCM, err := clusterMonitorings.Get(ctx, clusterMonitoringName, metav1.GetOptions{}) + if err != nil { + t.Logf("waiting for ClusterMonitoring: %v", err) + return false, nil + } + + if retrievedCM.Spec.AlertmanagerConfig.DeploymentMode != configv1alpha1.AlertManagerDeployModeDefaultConfig { + t.Logf("waiting for correct DeploymentMode, got: %s", retrievedCM.Spec.AlertmanagerConfig.DeploymentMode) + return false, nil + } + + return true, nil + }) + + if err != nil { + t.Fatalf("ClusterMonitoring resource not properly created: %v", err) + } + + t.Log("updating ClusterMonitoring resource") + err = wait.PollImmediate(2*time.Second, 30*time.Second, func() (bool, error) { + currentCM, err := clusterMonitorings.Get(ctx, clusterMonitoringName, metav1.GetOptions{}) + if err != nil { + return false, err + } + + currentCM.Spec.AlertmanagerConfig.DeploymentMode = configv1alpha1.AlertManagerDeployModeCustomConfig + currentCM.Spec.AlertmanagerConfig.CustomConfig = configv1alpha1.AlertmanagerCustomConfig{ + LogLevel: configv1alpha1.LogLevelInfo, + } + + _, err = clusterMonitorings.Update(ctx, currentCM, metav1.UpdateOptions{}) + if err != nil { + t.Logf("Retrying update due to: %v", err) + return false, nil // Retry on conflict + } + + return true, nil + }) + + if err != nil { + t.Fatalf("Failed to update ClusterMonitoring: %v", err) + } + + updatedCM, err := clusterMonitorings.Get(ctx, clusterMonitoringName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get updated ClusterMonitoring: %v", err) + } + + if updatedCM.Spec.AlertmanagerConfig.DeploymentMode != configv1alpha1.AlertManagerDeployModeCustomConfig { + t.Errorf("Expected DeploymentMode to be CustomConfig, got: %s", updatedCM.Spec.AlertmanagerConfig.DeploymentMode) + } + + if updatedCM.Spec.AlertmanagerConfig.CustomConfig.LogLevel != configv1alpha1.LogLevelInfo { + t.Errorf("expected LogLevel Info, got: %s", updatedCM.Spec.AlertmanagerConfig.CustomConfig.LogLevel) + } + + t.Log("verifying alertmanager deployment reflects CRD configuration") + err = wait.PollImmediate(5*time.Second, 2*time.Minute, func() (bool, error) { + am, err := f.MonitoringClient.Alertmanagers(f.Ns).Get(ctx, "main", metav1.GetOptions{}) + if err != nil { + t.Logf("waiting for alertmanager: %v", err) + return false, nil + } + + if am.Status.UpdatedReplicas < 1 { + t.Logf("waiting for alertmanager replicas, current: %d", am.Status.UpdatedReplicas) + return false, nil + } + + if am.Spec.LogLevel != "info" { + t.Logf("unexpected log level: %s", am.Spec.LogLevel) + } + + return true, nil + }) + + if err != nil { + t.Fatalf("alertmanager not properly configured from CRD: %v", err) + } +}