diff --git a/cmd/main.go b/cmd/main.go index 3b44745047..5eadaf892a 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -16,12 +16,15 @@ package main import ( "context" + "crypto/tls" + "crypto/x509" "flag" "fmt" "net/url" "os" goruntime "runtime" "strings" + "sync" "time" "github.com/cloudflare/cfssl/log" @@ -34,6 +37,7 @@ import ( "github.com/tigera/operator/pkg/awssgsetup" "github.com/tigera/operator/pkg/common" "github.com/tigera/operator/pkg/components" + "github.com/tigera/operator/pkg/controller/metrics" "github.com/tigera/operator/pkg/controller/options" "github.com/tigera/operator/pkg/controller/utils" "github.com/tigera/operator/pkg/dns" @@ -44,13 +48,17 @@ import ( "github.com/tigera/operator/pkg/render/istio" "github.com/tigera/operator/pkg/render/logstorage" "github.com/tigera/operator/pkg/render/logstorage/eck" + "github.com/tigera/operator/pkg/render/monitor" + "github.com/tigera/operator/pkg/tls/certificatemanagement" "github.com/tigera/operator/version" operatortigeraiov1 "github.com/tigera/operator/api/v1" + corev1 "k8s.io/api/core/v1" "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/kubernetes" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -61,6 +69,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/apiutil" "sigs.k8s.io/controller-runtime/pkg/client/config" "sigs.k8s.io/controller-runtime/pkg/log/zap" + ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" "sigs.k8s.io/yaml" @@ -68,7 +77,7 @@ import ( ) var ( - defaultMetricsPort int32 = 8484 + defaultMetricsPort int32 = 9484 scheme = runtime.NewScheme() setupLog = ctrl.Log.WithName("setup") ) @@ -251,11 +260,33 @@ If a value other than 'all' is specified, the first CRD with a prefix of the spe active.WaitUntilActive(cs, c, sigHandler, setupLog) log.Info("Active operator: proceeding") + metricsOpts := server.Options{ + BindAddress: metricsAddr(), + } + var certLoader *dynamicCertLoader + if metricsTLSEnabled() { + certLoader = newDynamicCertLoader() + metricsOpts.TLSOpts = []func(*tls.Config){ + func(cfg *tls.Config) { + cfg.GetCertificate = certLoader.GetCertificate + cfg.ClientAuth = tls.RequireAndVerifyClientCert + cfg.GetConfigForClient = func(*tls.ClientHelloInfo) (*tls.Config, error) { + pool := certLoader.GetClientCAs() + return &tls.Config{ + GetCertificate: certLoader.GetCertificate, + ClientAuth: tls.RequireAndVerifyClientCert, + ClientCAs: pool, + MinVersion: tls.VersionTLS12, + }, nil + } + cfg.MinVersion = tls.VersionTLS12 + }, + } + } + mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ - Scheme: scheme, - Metrics: server.Options{ - BindAddress: metricsAddr(), - }, + Scheme: scheme, + Metrics: metricsOpts, WebhookServer: webhook.NewServer(webhook.Options{ Port: 9443, }), @@ -474,6 +505,17 @@ If a value other than 'all' is specified, the first CRD with a prefix of the spe os.Exit(1) } + // Register custom Prometheus metrics collector. + if metricsEnabled() { + collector := metrics.NewOperatorCollector(mgr.GetClient()) + ctrlmetrics.Registry.MustRegister(collector) + } + + // Start watching TLS secrets for the mTLS metrics endpoint. + if certLoader != nil { + go watchMetricsTLSSecrets(ctx, mgr, certLoader) + } + setupLog.Info("starting manager") if err := mgr.Start(ctx); err != nil { setupLog.Error(err, "problem running manager") @@ -512,29 +554,34 @@ func setKubernetesServiceEnv(kubeconfigFile string) error { return nil } -// metricsAddr processes user-specified metrics host and port and sets -// default values accordingly. +// metricsAddr returns the bind address for the metrics endpoint. +// When METRICS_ENABLED is not "true", returns "0" to disable metrics. +// Otherwise, defaults to 0.0.0.0:9484 and allows overriding via +// METRICS_HOST and METRICS_PORT. func metricsAddr() string { - metricsHost := os.Getenv("METRICS_HOST") - metricsPort := os.Getenv("METRICS_PORT") - - // if neither are specified, disable metrics. - if metricsHost == "" && metricsPort == "" { + if !metricsEnabled() { // the controller-runtime accepts '0' to denote that metrics should be disabled. return "0" } - // if just a host is specified, listen on port 8484 of that host. - if metricsHost != "" && metricsPort == "" { - // the controller-runtime will choose a random port if none is specified. - // so use the defaultMetricsPort in that case. + + metricsHost := os.Getenv("METRICS_HOST") + if metricsHost == "" { + metricsHost = "0.0.0.0" + } + + metricsPort := os.Getenv("METRICS_PORT") + if metricsPort == "" { return fmt.Sprintf("%s:%d", metricsHost, defaultMetricsPort) } - // finally, handle cases where just a port is specified or both are specified in the same case - // since controller-runtime correctly uses all interfaces if no host is specified. return fmt.Sprintf("%s:%s", metricsHost, metricsPort) } +// metricsEnabled returns true when the operator metrics endpoint is enabled. +func metricsEnabled() bool { + return strings.EqualFold(os.Getenv("METRICS_ENABLED"), "true") +} + func showCRDs(variant operatortigeraiov1.ProductVariant, outputType string) error { first := true for _, v := range crds.GetCRDs(variant, os.Getenv("CALICO_API_GROUP") == "projectcalico.org/v3") { @@ -625,3 +672,137 @@ func verifyConfiguration(ctx context.Context, cs kubernetes.Interface, opts opti return fmt.Errorf("refusing to run: configured as internal-es but secret/%s found which suggests external ES", logstorage.ExternalCertsSecret) } } + +// metricsTLSEnabled returns true when the operator metrics endpoint should use mTLS. +func metricsTLSEnabled() bool { + return strings.EqualFold(os.Getenv("METRICS_SCHEME"), "https") +} + +// dynamicCertLoader dynamically loads TLS certificates from Kubernetes secrets +// for the metrics endpoint. The monitor controller creates the server cert, and +// the client CA is loaded from the Prometheus client TLS secret. +type dynamicCertLoader struct { + mu sync.RWMutex + cert *tls.Certificate + clientCA *x509.CertPool +} + +func newDynamicCertLoader() *dynamicCertLoader { + return &dynamicCertLoader{ + clientCA: x509.NewCertPool(), + } +} + +// GetCertificate returns the current server certificate for the metrics endpoint. +func (d *dynamicCertLoader) GetCertificate(*tls.ClientHelloInfo) (*tls.Certificate, error) { + d.mu.RLock() + defer d.mu.RUnlock() + if d.cert == nil { + return nil, fmt.Errorf("operator metrics TLS certificate not yet available") + } + return d.cert, nil +} + +// GetClientCAs returns the current client CA pool. +func (d *dynamicCertLoader) GetClientCAs() *x509.CertPool { + d.mu.RLock() + defer d.mu.RUnlock() + return d.clientCA +} + +// updateServerCert updates the server certificate from a Kubernetes TLS secret. +func (d *dynamicCertLoader) updateServerCert(secret *corev1.Secret) error { + certPEM, ok := secret.Data[corev1.TLSCertKey] + if !ok { + return fmt.Errorf("secret %s/%s missing %s", secret.Namespace, secret.Name, corev1.TLSCertKey) + } + keyPEM, ok := secret.Data[corev1.TLSPrivateKeyKey] + if !ok { + return fmt.Errorf("secret %s/%s missing %s", secret.Namespace, secret.Name, corev1.TLSPrivateKeyKey) + } + cert, err := tls.X509KeyPair(certPEM, keyPEM) + if err != nil { + return fmt.Errorf("failed to parse TLS keypair from %s/%s: %w", secret.Namespace, secret.Name, err) + } + d.mu.Lock() + defer d.mu.Unlock() + d.cert = &cert + return nil +} + +// updateClientCA updates the client CA pool from a Kubernetes secret containing a certificate. +func (d *dynamicCertLoader) updateClientCA(secrets ...*corev1.Secret) { + pool := x509.NewCertPool() + for _, s := range secrets { + if s == nil { + continue + } + if certPEM, ok := s.Data[corev1.TLSCertKey]; ok { + pool.AppendCertsFromPEM(certPEM) + } + } + d.mu.Lock() + defer d.mu.Unlock() + d.clientCA = pool +} + +// watchMetricsTLSSecrets periodically loads TLS secrets for the metrics endpoint. +// It runs until the context is canceled. +func watchMetricsTLSSecrets(ctx context.Context, mgr ctrl.Manager, loader *dynamicCertLoader) { + logger := ctrl.Log.WithName("metrics-tls") + + // Wait for the cache to start before reading secrets. + if !mgr.GetCache().WaitForCacheSync(ctx) { + logger.Error(fmt.Errorf("cache sync failed"), "Cannot watch metrics TLS secrets") + return + } + + c := mgr.GetClient() + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + + operatorNs := common.OperatorNamespace() + + serverCertLoaded := false + loadSecrets := func() { + // Load operator server TLS secret. + serverSecret := &corev1.Secret{} + if err := c.Get(ctx, types.NamespacedName{Name: monitor.OperatorMetricsSecretName, Namespace: operatorNs}, serverSecret); err != nil { + if !serverCertLoaded { + logger.Info("Metrics mTLS is enabled but the server certificate secret is not yet available. "+ + "Create the secret manually or apply the Monitor CR to have it provisioned automatically.", + "secret", monitor.OperatorMetricsSecretName, "namespace", operatorNs) + } else { + logger.V(2).Info("Operator metrics TLS secret not yet available", "error", err) + } + } else { + if err := loader.updateServerCert(serverSecret); err != nil { + logger.Error(err, "Failed to update operator metrics server cert") + } else { + if !serverCertLoaded { + logger.Info("Operator metrics TLS certificate loaded successfully", "secret", monitor.OperatorMetricsSecretName) + } + serverCertLoaded = true + } + } + + // Load client CA from the tigera-ca-private secret. Any cert signed by this CA + // will be trusted for mTLS client authentication. + caSecret := &corev1.Secret{} + if err := c.Get(ctx, types.NamespacedName{Name: certificatemanagement.CASecretName, Namespace: operatorNs}, caSecret); err == nil { + loader.updateClientCA(caSecret) + } + } + + // Initial load. + loadSecrets() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + loadSecrets() + } + } +} diff --git a/go.mod b/go.mod index d16850a0a0..ef17de58f5 100644 --- a/go.mod +++ b/go.mod @@ -53,6 +53,8 @@ require ( sigs.k8s.io/yaml v1.6.0 ) +require github.com/prometheus/client_golang v1.23.0 + require ( al.essio.dev/pkg/shellescape v1.5.1 // indirect dario.cat/mergo v1.0.2 // indirect @@ -118,6 +120,7 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.18.0 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect github.com/lib/pq v1.10.9 // indirect @@ -143,7 +146,6 @@ require ( github.com/pelletier/go-toml v1.9.5 // indirect github.com/peterbourgon/diskv v2.0.1+incompatible // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.23.0 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.65.0 // indirect github.com/prometheus/procfs v0.17.0 // indirect diff --git a/pkg/controller/metrics/collectors.go b/pkg/controller/metrics/collectors.go new file mode 100644 index 0000000000..f677b6d72f --- /dev/null +++ b/pkg/controller/metrics/collectors.go @@ -0,0 +1,203 @@ +// Copyright (c) 2026 Tigera, Inc. All rights reserved. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics + +import ( + "context" + "time" + + "github.com/prometheus/client_golang/prometheus" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + operatorv1 "github.com/tigera/operator/api/v1" + "github.com/tigera/operator/pkg/controller/utils" +) + +const ( + // Label and annotation keys used by the certificate management system. + signerLabel = "certificates.operator.tigera.io/signer" + expiryAnnotation = "certificates.operator.tigera.io/expiry" + + // Time format used in expiry annotations. + expiryFormat = "2006-01-02T15:04:05Z" +) + +var ( + componentStatusDesc = prometheus.NewDesc( + "tigera_operator_component_status", + "TigeraStatus conditions for operator-managed components. 1 = true, 0 = false.", + []string{"component", "condition"}, + nil, + ) + + tlsCertExpiryDesc = prometheus.NewDesc( + "tigera_operator_tls_certificate_expiry_timestamp_seconds", + "Unix timestamp of certificate expiry for operator-managed TLS secrets.", + []string{"name", "namespace", "issuer"}, + nil, + ) + + licenseExpiryDesc = prometheus.NewDesc( + "tigera_operator_license_expiry_timestamp_seconds", + "Unix timestamp of Tigera license expiry.", + []string{"package"}, + nil, + ) + + licenseValidDesc = prometheus.NewDesc( + "tigera_operator_license_valid", + "Whether the Tigera license is valid (including grace period). 1 = valid, 0 = invalid.", + []string{"package"}, + nil, + ) +) + +// OperatorCollector implements prometheus.Collector and exposes custom operator metrics. +type OperatorCollector struct { + client client.Client +} + +// NewOperatorCollector creates a new OperatorCollector. +func NewOperatorCollector(c client.Client) *OperatorCollector { + return &OperatorCollector{client: c} +} + +// Describe implements prometheus.Collector. +func (c *OperatorCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- componentStatusDesc + ch <- tlsCertExpiryDesc + ch <- licenseExpiryDesc + ch <- licenseValidDesc +} + +// Collect implements prometheus.Collector. +func (c *OperatorCollector) Collect(ch chan<- prometheus.Metric) { + ctx := context.Background() + c.collectComponentStatus(ctx, ch) + c.collectTLSCertExpiry(ctx, ch) + c.collectLicense(ctx, ch) +} + +func (c *OperatorCollector) collectComponentStatus(ctx context.Context, ch chan<- prometheus.Metric) { + statusList := &operatorv1.TigeraStatusList{} + if err := c.client.List(ctx, statusList); err != nil { + return + } + + conditions := []operatorv1.StatusConditionType{ + operatorv1.ComponentAvailable, + operatorv1.ComponentProgressing, + operatorv1.ComponentDegraded, + } + + for _, ts := range statusList.Items { + for _, condType := range conditions { + val := float64(0) + for _, cond := range ts.Status.Conditions { + if cond.Type == condType && cond.Status == operatorv1.ConditionTrue { + val = 1 + break + } + } + ch <- prometheus.MustNewConstMetric( + componentStatusDesc, + prometheus.GaugeValue, + val, + ts.Name, + conditionLabel(condType), + ) + } + } +} + +func conditionLabel(ct operatorv1.StatusConditionType) string { + switch ct { + case operatorv1.ComponentAvailable: + return "available" + case operatorv1.ComponentProgressing: + return "progressing" + case operatorv1.ComponentDegraded: + return "degraded" + default: + return string(ct) + } +} + +func (c *OperatorCollector) collectTLSCertExpiry(ctx context.Context, ch chan<- prometheus.Metric) { + secrets := &corev1.SecretList{} + if err := c.client.List(ctx, secrets, client.HasLabels{signerLabel}); err != nil { + return + } + + for _, s := range secrets.Items { + expiryStr, ok := s.Annotations[expiryAnnotation] + if !ok { + continue + } + + expiry, err := time.Parse(expiryFormat, expiryStr) + if err != nil { + continue + } + + issuer := s.Annotations["certificates.operator.tigera.io/issuer"] + + ch <- prometheus.MustNewConstMetric( + tlsCertExpiryDesc, + prometheus.GaugeValue, + float64(expiry.Unix()), + s.Name, + s.Namespace, + issuer, + ) + } +} + +func (c *OperatorCollector) collectLicense(ctx context.Context, ch chan<- prometheus.Metric) { + license, err := utils.FetchLicenseKey(ctx, c.client) + if err != nil { + // License not available (e.g., Calico OSS). Skip gracefully. + return + } + + pkg := string(license.Status.Package) + if pkg == "" { + pkg = "Enterprise" + } + + if !license.Status.Expiry.IsZero() { + ch <- prometheus.MustNewConstMetric( + licenseExpiryDesc, + prometheus.GaugeValue, + float64(license.Status.Expiry.Unix()), + pkg, + ) + } + + gracePeriod := utils.ParseGracePeriod(license.Status.GracePeriod) + licenseStatus := utils.GetLicenseStatus(license, gracePeriod) + valid := float64(1) + if licenseStatus == utils.LicenseStatusExpired { + valid = 0 + } + + ch <- prometheus.MustNewConstMetric( + licenseValidDesc, + prometheus.GaugeValue, + valid, + pkg, + ) +} diff --git a/pkg/controller/metrics/collectors_test.go b/pkg/controller/metrics/collectors_test.go new file mode 100644 index 0000000000..67799b2cdb --- /dev/null +++ b/pkg/controller/metrics/collectors_test.go @@ -0,0 +1,259 @@ +// Copyright (c) 2026 Tigera, Inc. All rights reserved. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics_test + +import ( + "strings" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus/testutil" + + v3 "github.com/tigera/api/pkg/apis/projectcalico/v3" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + + operatorv1 "github.com/tigera/operator/api/v1" + "github.com/tigera/operator/pkg/apis" + "github.com/tigera/operator/pkg/controller/metrics" + ctrlrfake "github.com/tigera/operator/pkg/ctrlruntime/client/fake" +) + +func TestMetrics(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Metrics Suite") +} + +var _ = Describe("OperatorCollector", func() { + var ( + scheme *runtime.Scheme + ) + + BeforeEach(func() { + scheme = runtime.NewScheme() + Expect(apis.AddToScheme(scheme, false)).NotTo(HaveOccurred()) + Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) + }) + + Context("component status metrics", func() { + It("should emit metrics for TigeraStatus objects", func() { + ts := &operatorv1.TigeraStatus{ + ObjectMeta: metav1.ObjectMeta{Name: "calico"}, + Status: operatorv1.TigeraStatusStatus{ + Conditions: []operatorv1.TigeraStatusCondition{ + {Type: operatorv1.ComponentAvailable, Status: operatorv1.ConditionTrue}, + {Type: operatorv1.ComponentProgressing, Status: operatorv1.ConditionFalse}, + {Type: operatorv1.ComponentDegraded, Status: operatorv1.ConditionFalse}, + }, + }, + } + cli := ctrlrfake.DefaultFakeClientBuilder(scheme).WithObjects(ts).Build() + collector := metrics.NewOperatorCollector(cli) + + expected := ` +# HELP tigera_operator_component_status TigeraStatus conditions for operator-managed components. 1 = true, 0 = false. +# TYPE tigera_operator_component_status gauge +tigera_operator_component_status{component="calico",condition="available"} 1 +tigera_operator_component_status{component="calico",condition="progressing"} 0 +tigera_operator_component_status{component="calico",condition="degraded"} 0 +` + Expect(testutil.CollectAndCompare(collector, strings.NewReader(expected), + "tigera_operator_component_status")).NotTo(HaveOccurred()) + }) + + It("should emit 0 for all conditions when status has no conditions", func() { + ts := &operatorv1.TigeraStatus{ + ObjectMeta: metav1.ObjectMeta{Name: "monitor"}, + Status: operatorv1.TigeraStatusStatus{}, + } + cli := ctrlrfake.DefaultFakeClientBuilder(scheme).WithObjects(ts).Build() + collector := metrics.NewOperatorCollector(cli) + + expected := ` +# HELP tigera_operator_component_status TigeraStatus conditions for operator-managed components. 1 = true, 0 = false. +# TYPE tigera_operator_component_status gauge +tigera_operator_component_status{component="monitor",condition="available"} 0 +tigera_operator_component_status{component="monitor",condition="progressing"} 0 +tigera_operator_component_status{component="monitor",condition="degraded"} 0 +` + Expect(testutil.CollectAndCompare(collector, strings.NewReader(expected), + "tigera_operator_component_status")).NotTo(HaveOccurred()) + }) + + It("should handle multiple TigeraStatus objects", func() { + ts1 := &operatorv1.TigeraStatus{ + ObjectMeta: metav1.ObjectMeta{Name: "calico"}, + Status: operatorv1.TigeraStatusStatus{ + Conditions: []operatorv1.TigeraStatusCondition{ + {Type: operatorv1.ComponentAvailable, Status: operatorv1.ConditionTrue}, + {Type: operatorv1.ComponentProgressing, Status: operatorv1.ConditionFalse}, + {Type: operatorv1.ComponentDegraded, Status: operatorv1.ConditionFalse}, + }, + }, + } + ts2 := &operatorv1.TigeraStatus{ + ObjectMeta: metav1.ObjectMeta{Name: "apiserver"}, + Status: operatorv1.TigeraStatusStatus{ + Conditions: []operatorv1.TigeraStatusCondition{ + {Type: operatorv1.ComponentAvailable, Status: operatorv1.ConditionFalse}, + {Type: operatorv1.ComponentProgressing, Status: operatorv1.ConditionTrue}, + {Type: operatorv1.ComponentDegraded, Status: operatorv1.ConditionTrue}, + }, + }, + } + cli := ctrlrfake.DefaultFakeClientBuilder(scheme).WithObjects(ts1, ts2).Build() + collector := metrics.NewOperatorCollector(cli) + + // Verify each component's metrics individually + count := testutil.CollectAndCount(collector, "tigera_operator_component_status") + Expect(count).To(Equal(6)) // 2 components * 3 conditions + }) + }) + + Context("TLS certificate expiry metrics", func() { + It("should emit metrics for secrets with signer label and expiry annotation", func() { + expiry := time.Date(2027, 6, 15, 12, 0, 0, 0, time.UTC) + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "manager-tls", + Namespace: "calico-system", + Labels: map[string]string{ + "certificates.operator.tigera.io/signer": "tigera-operator-signer", + }, + Annotations: map[string]string{ + "certificates.operator.tigera.io/expiry": expiry.Format("2006-01-02T15:04:05Z"), + "certificates.operator.tigera.io/issuer": "tigera-operator-signer", + }, + }, + } + cli := ctrlrfake.DefaultFakeClientBuilder(scheme).WithObjects(secret).Build() + collector := metrics.NewOperatorCollector(cli) + + count := testutil.CollectAndCount(collector, "tigera_operator_tls_certificate_expiry_timestamp_seconds") + Expect(count).To(Equal(1)) + }) + + It("should skip secrets without signer label", func() { + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-secret", + Namespace: "calico-system", + Annotations: map[string]string{ + "certificates.operator.tigera.io/expiry": "2027-06-15T12:00:00Z", + }, + }, + } + cli := ctrlrfake.DefaultFakeClientBuilder(scheme).WithObjects(secret).Build() + collector := metrics.NewOperatorCollector(cli) + + count := testutil.CollectAndCount(collector, "tigera_operator_tls_certificate_expiry_timestamp_seconds") + Expect(count).To(Equal(0)) + }) + + It("should skip secrets without expiry annotation", func() { + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "manager-tls", + Namespace: "calico-system", + Labels: map[string]string{ + "certificates.operator.tigera.io/signer": "tigera-operator-signer", + }, + }, + } + cli := ctrlrfake.DefaultFakeClientBuilder(scheme).WithObjects(secret).Build() + collector := metrics.NewOperatorCollector(cli) + + count := testutil.CollectAndCount(collector, "tigera_operator_tls_certificate_expiry_timestamp_seconds") + Expect(count).To(Equal(0)) + }) + }) + + Context("license metrics", func() { + It("should emit license metrics when license exists", func() { + license := &v3.LicenseKey{ + ObjectMeta: metav1.ObjectMeta{Name: "default"}, + Status: v3.LicenseKeyStatus{ + Expiry: metav1.Time{Time: time.Now().Add(365 * 24 * time.Hour)}, + GracePeriod: "90d", + Package: "Enterprise", + }, + } + cli := ctrlrfake.DefaultFakeClientBuilder(scheme).WithObjects(license).Build() + collector := metrics.NewOperatorCollector(cli) + + expiryCount := testutil.CollectAndCount(collector, "tigera_operator_license_expiry_timestamp_seconds") + Expect(expiryCount).To(Equal(1)) + + validCount := testutil.CollectAndCount(collector, "tigera_operator_license_valid") + Expect(validCount).To(Equal(1)) + }) + + It("should report valid=0 when license is expired past grace period", func() { + license := &v3.LicenseKey{ + ObjectMeta: metav1.ObjectMeta{Name: "default"}, + Status: v3.LicenseKeyStatus{ + Expiry: metav1.Time{Time: time.Now().Add(-200 * 24 * time.Hour)}, + GracePeriod: "90d", + Package: "Enterprise", + }, + } + cli := ctrlrfake.DefaultFakeClientBuilder(scheme).WithObjects(license).Build() + collector := metrics.NewOperatorCollector(cli) + + expected := ` +# HELP tigera_operator_license_valid Whether the Tigera license is valid (including grace period). 1 = valid, 0 = invalid. +# TYPE tigera_operator_license_valid gauge +tigera_operator_license_valid{package="Enterprise"} 0 +` + Expect(testutil.CollectAndCompare(collector, strings.NewReader(expected), + "tigera_operator_license_valid")).NotTo(HaveOccurred()) + }) + + It("should not emit license metrics when license does not exist", func() { + cli := ctrlrfake.DefaultFakeClientBuilder(scheme).Build() + collector := metrics.NewOperatorCollector(cli) + + expiryCount := testutil.CollectAndCount(collector, "tigera_operator_license_expiry_timestamp_seconds") + Expect(expiryCount).To(Equal(0)) + + validCount := testutil.CollectAndCount(collector, "tigera_operator_license_valid") + Expect(validCount).To(Equal(0)) + }) + + It("should report valid=1 when license is in grace period", func() { + license := &v3.LicenseKey{ + ObjectMeta: metav1.ObjectMeta{Name: "default"}, + Status: v3.LicenseKeyStatus{ + Expiry: metav1.Time{Time: time.Now().Add(-30 * 24 * time.Hour)}, + GracePeriod: "90d", + Package: "Enterprise", + }, + } + cli := ctrlrfake.DefaultFakeClientBuilder(scheme).WithObjects(license).Build() + collector := metrics.NewOperatorCollector(cli) + + expected := ` +# HELP tigera_operator_license_valid Whether the Tigera license is valid (including grace period). 1 = valid, 0 = invalid. +# TYPE tigera_operator_license_valid gauge +tigera_operator_license_valid{package="Enterprise"} 1 +` + Expect(testutil.CollectAndCompare(collector, strings.NewReader(expected), + "tigera_operator_license_valid")).NotTo(HaveOccurred()) + }) + }) +}) diff --git a/pkg/controller/monitor/monitor_controller.go b/pkg/controller/monitor/monitor_controller.go index 8f3038464b..dc6683ca39 100644 --- a/pkg/controller/monitor/monitor_controller.go +++ b/pkg/controller/monitor/monitor_controller.go @@ -18,7 +18,9 @@ import ( "context" _ "embed" "fmt" + "os" "reflect" + "strings" "time" corev1 "k8s.io/api/core/v1" @@ -408,6 +410,20 @@ func (r *ReconcileMonitor) Reconcile(ctx context.Context, request reconcile.Requ return reconcile.Result{}, err } + // Create operator TLS keypair for mTLS metrics if metrics are enabled. + operatorMetricsEnabled := metricsEnabled() + var operatorTLSSecret certificatemanagement.KeyPairInterface + if operatorMetricsEnabled { + operatorMetricsServiceName := common.OperatorName() + "-metrics" + operatorTLSDNSNames := dns.GetServiceDNSNames(operatorMetricsServiceName, common.OperatorNamespace(), r.clusterDomain) + operatorTLSSecret, err = certificateManager.GetOrCreateKeyPair(r.client, monitor.OperatorMetricsSecretName, common.OperatorNamespace(), operatorTLSDNSNames) + if err != nil { + r.status.SetDegraded(operatorv1.ResourceCreateError, "Error creating operator metrics TLS certificate", err, reqLogger) + return reconcile.Result{}, err + } + trustedBundle.AddCertificates(operatorTLSSecret) + } + monitorCfg := &monitor.Config{ Monitor: instance.Spec, Installation: install, @@ -422,6 +438,10 @@ func (r *ReconcileMonitor) Reconcile(ctx context.Context, request reconcile.Requ KubeControllerPort: kubeControllersMetricsPort, FelixPrometheusMetricsEnabled: utils.IsFelixPrometheusMetricsEnabled(felixConfiguration), LicenseExpired: licenseExpired, + OperatorMetricsEnabled: operatorMetricsEnabled, + OperatorNamespace: common.OperatorNamespace(), + OperatorName: common.OperatorName(), + OperatorTLSSecret: operatorTLSSecret, } // Render prometheus component @@ -596,3 +616,8 @@ func (r *ReconcileMonitor) readAlertmanagerConfigSecret(ctx context.Context) (*c // Operator should create a new default secret and set the owner reference. return defaultConfigSecret, true, nil } + +// metricsEnabled returns true when the operator metrics endpoint is enabled. +func metricsEnabled() bool { + return strings.EqualFold(os.Getenv("METRICS_ENABLED"), "true") +} diff --git a/pkg/render/monitor/monitor.go b/pkg/render/monitor/monitor.go index 0f524e2f80..0441a4c55b 100644 --- a/pkg/render/monitor/monitor.go +++ b/pkg/render/monitor/monitor.go @@ -91,6 +91,11 @@ const ( bearerTokenFile = "/var/run/secrets/kubernetes.io/serviceaccount/token" KubeControllerMetrics = "calico-kube-controllers-metrics" + + OperatorMetricsSecretName = "tigera-operator-tls" + OperatorMetricsServiceName = "tigera-operator-metrics" + OperatorMetricsPortName = "tigera-operator-metrics-port" + OperatorMetricsPort = 9484 ) var alertManagerSelector = fmt.Sprintf( @@ -146,6 +151,12 @@ type Config struct { KubeControllerPort int FelixPrometheusMetricsEnabled bool LicenseExpired bool + + // Operator metrics fields. + OperatorMetricsEnabled bool + OperatorNamespace string + OperatorName string + OperatorTLSSecret certificatemanagement.KeyPairInterface } type monitorComponent struct { @@ -265,6 +276,17 @@ func (mc *monitorComponent) Objects() ([]client.Object, []client.Object) { } } + if mc.cfg.OperatorMetricsEnabled { + toCreate = append(toCreate, mc.operatorMetricsService()) + if mc.cfg.LicenseExpired { + toDelete = append(toDelete, mc.serviceMonitorOperator()) + } else { + toCreate = append(toCreate, mc.serviceMonitorOperator()) + } + } else { + toDelete = append(toDelete, mc.operatorMetricsService(), mc.serviceMonitorOperator()) + } + if mc.cfg.Installation.TyphaMetricsPort != nil { toCreate = append(toCreate, mc.typhaServiceMonitor()) } else { @@ -1640,3 +1662,72 @@ func (mc *monitorComponent) typhaServiceMonitor() client.Object { }, } } + +// operatorMetricsService creates a Service for the operator's metrics endpoint in the operator namespace. +func (mc *monitorComponent) operatorMetricsService() *corev1.Service { + return &corev1.Service{ + TypeMeta: metav1.TypeMeta{Kind: "Service", APIVersion: "v1"}, + ObjectMeta: metav1.ObjectMeta{ + Name: OperatorMetricsServiceName, + Namespace: mc.cfg.OperatorNamespace, + Labels: map[string]string{ + "k8s-app": mc.cfg.OperatorName, + }, + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + Ports: []corev1.ServicePort{ + { + Name: OperatorMetricsPortName, + Port: int32(OperatorMetricsPort), + Protocol: corev1.ProtocolTCP, + TargetPort: intstr.FromInt(OperatorMetricsPort), + }, + }, + Selector: map[string]string{ + "k8s-app": mc.cfg.OperatorName, + }, + }, + } +} + +// serviceMonitorOperator creates a ServiceMonitor for the operator's metrics endpoint. +func (mc *monitorComponent) serviceMonitorOperator() *monitoringv1.ServiceMonitor { + return &monitoringv1.ServiceMonitor{ + TypeMeta: metav1.TypeMeta{Kind: monitoringv1.ServiceMonitorsKind, APIVersion: MonitoringAPIVersion}, + ObjectMeta: metav1.ObjectMeta{ + Name: OperatorMetricsServiceName, + Namespace: common.TigeraPrometheusNamespace, + Labels: map[string]string{"team": "network-operators"}, + }, + Spec: monitoringv1.ServiceMonitorSpec{ + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "k8s-app": mc.cfg.OperatorName, + }, + }, + NamespaceSelector: monitoringv1.NamespaceSelector{ + MatchNames: []string{mc.cfg.OperatorNamespace}, + }, + Endpoints: []monitoringv1.Endpoint{ + { + HonorLabels: true, + Interval: "5s", + Port: OperatorMetricsPortName, + ScrapeTimeout: "5s", + RelabelConfigs: []monitoringv1.RelabelConfig{ + { + TargetLabel: "__scheme__", + Replacement: ptr.To("https"), + }, + }, + HTTPConfigWithProxyAndTLSFiles: monitoringv1.HTTPConfigWithProxyAndTLSFiles{ + HTTPConfigWithTLSFiles: monitoringv1.HTTPConfigWithTLSFiles{ + TLSConfig: mc.tlsConfig(OperatorMetricsServiceName), + }, + }, + }, + }, + }, + } +} diff --git a/pkg/render/monitor/monitor_test.go b/pkg/render/monitor/monitor_test.go index 69f76bf78f..4472c6bc26 100644 --- a/pkg/render/monitor/monitor_test.go +++ b/pkg/render/monitor/monitor_test.go @@ -113,7 +113,7 @@ var _ = Describe("monitor rendering tests", func() { expectedResources := expectedBaseResources() rtest.ExpectResources(toCreate, expectedResources) - Expect(toDelete).To(HaveLen(3)) + Expect(toDelete).To(HaveLen(5)) // Check the namespace. namespace := rtest.GetResource(toCreate, "tigera-prometheus", "", "", "v1", "Namespace").(*corev1.Namespace) @@ -169,7 +169,7 @@ var _ = Describe("monitor rendering tests", func() { component := monitor.Monitor(cfg) Expect(component.ResolveImages(nil)).NotTo(HaveOccurred()) toCreate, toDelete := component.Objects() - Expect(toDelete).To(HaveLen(3)) + Expect(toDelete).To(HaveLen(5)) // Prometheus prometheusObj, ok := rtest.GetResource(toCreate, monitor.CalicoNodePrometheus, common.TigeraPrometheusNamespace, "monitoring.coreos.com", "v1", monitoringv1.PrometheusesKind).(*monitoringv1.Prometheus) @@ -675,7 +675,7 @@ var _ = Describe("monitor rendering tests", func() { expectedResources := expectedBaseResources() rtest.ExpectResources(toCreate, expectedResources) - Expect(toDelete).To(HaveLen(3)) + Expect(toDelete).To(HaveLen(5)) // Prometheus prometheusObj, ok := rtest.GetResource(toCreate, monitor.CalicoNodePrometheus, common.TigeraPrometheusNamespace, "monitoring.coreos.com", "v1", monitoringv1.PrometheusesKind).(*monitoringv1.Prometheus) @@ -865,7 +865,7 @@ var _ = Describe("monitor rendering tests", func() { ) rtest.ExpectResources(toCreate, expectedResources) - Expect(toDelete).To(HaveLen(3)) + Expect(toDelete).To(HaveLen(5)) }) It("Should render external prometheus resources with service monitor and custom token", func() { @@ -891,7 +891,7 @@ var _ = Describe("monitor rendering tests", func() { ) rtest.ExpectResources(toCreate, expectedResources) - Expect(toDelete).To(HaveLen(3)) + Expect(toDelete).To(HaveLen(5)) }) It("Should render external prometheus resources without service monitor", func() { @@ -907,7 +907,7 @@ var _ = Describe("monitor rendering tests", func() { ) rtest.ExpectResources(toCreate, expectedResources) - Expect(toDelete).To(HaveLen(3)) + Expect(toDelete).To(HaveLen(5)) }) It("Should render typha service monitor if typha metrics are enabled", func() { @@ -921,7 +921,7 @@ var _ = Describe("monitor rendering tests", func() { ) rtest.ExpectResources(toCreate, expectedResources) - Expect(toDelete).To(HaveLen(2)) + Expect(toDelete).To(HaveLen(4)) sm := rtest.GetResource(toCreate, "calico-typha-metrics", "tigera-prometheus", "monitoring.coreos.com", "v1", "ServiceMonitor").(*monitoringv1.ServiceMonitor) Expect(sm).To(Equal(&monitoringv1.ServiceMonitor{ TypeMeta: metav1.TypeMeta{Kind: monitoringv1.ServiceMonitorsKind, APIVersion: "monitoring.coreos.com/v1"}, @@ -1039,6 +1039,49 @@ var _ = Describe("monitor rendering tests", func() { Expect(found).To(BeTrue(), "Expected ServiceMonitor %s to be in toCreate", name) } }) + + It("Should create operator metrics Service and ServiceMonitor when OperatorMetricsEnabled is true", func() { + cfg.OperatorMetricsEnabled = true + cfg.OperatorNamespace = "tigera-operator" + cfg.OperatorName = "tigera-operator" + component := monitor.Monitor(cfg) + Expect(component.ResolveImages(nil)).NotTo(HaveOccurred()) + toCreate, toDelete := component.Objects() + + // Operator metrics service should be in toCreate. + svc := rtest.GetResource(toCreate, monitor.OperatorMetricsServiceName, "tigera-operator", "", "v1", "Service") + Expect(svc).NotTo(BeNil()) + service := svc.(*corev1.Service) + Expect(service.Spec.Ports[0].Port).To(Equal(int32(monitor.OperatorMetricsPort))) + Expect(service.Spec.Selector["k8s-app"]).To(Equal("tigera-operator")) + + // Operator ServiceMonitor should be in toCreate. + sm := rtest.GetResource(toCreate, monitor.OperatorMetricsServiceName, common.TigeraPrometheusNamespace, "monitoring.coreos.com", "v1", "ServiceMonitor") + Expect(sm).NotTo(BeNil()) + serviceMonitor := sm.(*monitoringv1.ServiceMonitor) + Expect(serviceMonitor.Spec.Endpoints[0].Port).To(Equal(monitor.OperatorMetricsPortName)) + + // Neither should be in toDelete (only PodMonitor, Deployment, typhaServiceMonitor). + Expect(toDelete).To(HaveLen(3)) + }) + + It("Should delete operator metrics resources when OperatorMetricsEnabled is false", func() { + cfg.OperatorMetricsEnabled = false + cfg.OperatorNamespace = "tigera-operator" + cfg.OperatorName = "tigera-operator" + component := monitor.Monitor(cfg) + Expect(component.ResolveImages(nil)).NotTo(HaveOccurred()) + _, toDelete := component.Objects() + + // Both operator metrics resources should be in toDelete. + found := 0 + for _, obj := range toDelete { + if obj.GetName() == monitor.OperatorMetricsServiceName { + found++ + } + } + Expect(found).To(Equal(2)) // Service + ServiceMonitor + }) }) // expectedBaseResources These are the expected resources in the most basic setup.