diff --git a/services/centralized-kubecost/2.5.0/cosi-storage/cosi-bucket.yaml b/services/centralized-kubecost/2.5.0/cosi-storage/cosi-bucket.yaml new file mode 100644 index 000000000..cc1edc91a --- /dev/null +++ b/services/centralized-kubecost/2.5.0/cosi-storage/cosi-bucket.yaml @@ -0,0 +1,46 @@ +--- +# TODO: delete after merging the cosi-bucket-kit chart +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: takirala + namespace: kommander-flux +spec: + interval: 10m + timeout: 1m + url: https://takirala.github.io/charts/stable +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta2 +kind: HelmRelease +metadata: + name: kubecost-cosi-storage + namespace: ${releaseNamespace} +spec: + chart: + spec: + chart: cosi-bucket-kit + sourceRef: + kind: HelmRepository + name: takirala + #name: mesosphere.github.io-charts-stable + namespace: kommander-flux + version: 0.0.1-alpha.0 + interval: 15s + install: + crds: CreateReplace + remediation: + retries: 30 + createNamespace: true + upgrade: + crds: CreateReplace + remediation: + retries: 30 + releaseName: kubecost-cosi-storage + targetNamespace: ${releaseNamespace} + valuesFrom: + - kind: ConfigMap + name: centralized-kubecost-2.5.0-d2iq-defaults + - kind: ConfigMap + name: centralized-kubecost-overrides + optional: true +--- diff --git a/services/centralized-kubecost/2.5.0/cosi-storage/kustomization.yaml b/services/centralized-kubecost/2.5.0/cosi-storage/kustomization.yaml index 97c19c90a..c80c84207 100644 --- a/services/centralized-kubecost/2.5.0/cosi-storage/kustomization.yaml +++ b/services/centralized-kubecost/2.5.0/cosi-storage/kustomization.yaml @@ -1,5 +1,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: -- move-to-rook-ceph-cluster-driver.yaml -- todo-create-a-new-chart-in-mesosphere_charts_stable.yaml +- cosi-bucket.yaml diff --git a/services/centralized-kubecost/2.5.0/cosi-storage/move-to-rook-ceph-cluster-driver.yaml 
b/services/centralized-kubecost/2.5.0/cosi-storage/move-to-rook-ceph-cluster-driver.yaml deleted file mode 100644 index 9b8ec583b..000000000 --- a/services/centralized-kubecost/2.5.0/cosi-storage/move-to-rook-ceph-cluster-driver.yaml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: ceph.rook.io/v1 -kind: CephCOSIDriver -metadata: - name: ceph-cosi-driver - namespace: kommander -spec: - deploymentStrategy: "Auto" ---- -# The Ceph-COSI driver needs a privileged user for each CephObjectStore -# in order to provision buckets and users -apiVersion: ceph.rook.io/v1 -kind: CephObjectStoreUser -metadata: - name: cosi-admin - namespace: kommander -spec: - displayName: "cosi user" - store: dkp-object-store # name of the CephObjectStore - capabilities: - bucket: "*" - user: "*" ---- -# Following are "ADMIN" operations ---- -kind: BucketClass -apiVersion: objectstorage.k8s.io/v1alpha1 -metadata: - name: test-bc - # Cluster scoped resource -driverName: rook-ceph.ceph.objectstorage.k8s.io -deletionPolicy: Delete -parameters: - objectStoreUserSecretName: rook-ceph-object-user-dkp-object-store-cosi-admin - objectStoreUserSecretNamespace: kommander ---- -kind: BucketAccessClass -apiVersion: objectstorage.k8s.io/v1alpha1 -metadata: - name: test-bac - # Cluster scoped resource -driverName: rook-ceph.ceph.objectstorage.k8s.io -authenticationType: KEY -parameters: - objectStoreUserSecretName: rook-ceph-object-user-dkp-object-store-cosi-admin - objectStoreUserSecretNamespace: kommander diff --git a/services/centralized-kubecost/2.5.0/cosi-storage/todo-create-a-new-chart-in-mesosphere_charts_stable.yaml b/services/centralized-kubecost/2.5.0/cosi-storage/todo-create-a-new-chart-in-mesosphere_charts_stable.yaml deleted file mode 100644 index 88ff3b89a..000000000 --- a/services/centralized-kubecost/2.5.0/cosi-storage/todo-create-a-new-chart-in-mesosphere_charts_stable.yaml +++ /dev/null @@ -1,22 +0,0 @@ ---- -kind: BucketClaim -apiVersion: objectstorage.k8s.io/v1alpha1 -metadata: - name: 
test-bclaim - namespace: kubecost -spec: - bucketClassName: test-bc - protocols: - - s3 ---- -kind: BucketAccess -apiVersion: objectstorage.k8s.io/v1alpha1 -metadata: - name: test-ba - namespace: kubecost -spec: - bucketAccessClassName: test-bac - bucketClaimName: test-bclaim - protocol: s3 - credentialsSecretName: federated-store ---- diff --git a/services/centralized-kubecost/2.5.0/defaults/cm.yaml b/services/centralized-kubecost/2.5.0/defaults/cm.yaml index 5b1ddcf56..c8d5b9c5b 100644 --- a/services/centralized-kubecost/2.5.0/defaults/cm.yaml +++ b/services/centralized-kubecost/2.5.0/defaults/cm.yaml @@ -38,7 +38,7 @@ data: # kubecostAggregator.deployMethod: # kA.dM = "singlepod" -> cloudCost is run as container inside cost-analyzer # kA.dM = "statefulset" -> cloudCost is run as single-replica Deployment - enabled: false # TODO: document how to enable here + enabled: false # Log level for the aggregator container. Options are "trace", "debug", "info", "warn", "error", "fatal", "panic" logLevel: info resources: @@ -66,7 +66,6 @@ data: forecasting: # Enable this to use kubecost's cost forecosting model - # TODO(takirala): do we enable this and create yet another pod or disable this but ship the image for airgap bundle? 
enabled: false # Define persistence volume for cost-analyzer, more information at https://github.com/kubecost/docs/blob/master/storage.md @@ -95,129 +94,22 @@ data: enabled: false prometheus: - kube-state-metrics: - fullnameOverride: "kommander-kubecost-prometheus-kube-state-metrics" - priorityClassName: dkp-high-priority - extraScrapeConfigs: | - - job_name: kubecost - honor_labels: true - scrape_interval: 1m - scrape_timeout: 10s - metrics_path: /metrics - scheme: http - dns_sd_configs: - - names: - - {{ .Release.Name }}-cost-analyzer - type: 'A' - port: 9003 - - job_name: kubecost-networking - kubernetes_sd_configs: - - role: pod - relabel_configs: - # Scrape only the the targets matching the following metadata - - source_labels: [__meta_kubernetes_pod_label_app] - action: keep - regex: {{ .Release.Name }}-network-costs - + fullnameOverride: "kommander-kubecost-prometheus" server: fullnameOverride: "kommander-kubecost-prometheus-server" - image: - repository: quay.io/prometheus/prometheus - tag: v2.55.0 - # If clusterIDConfigmap is defined, instead use user-generated configmap with key CLUSTER_ID - # to use as unique cluster ID in kubecost cost-analyzer deployment. - # This overrides the cluster_id set in prometheus.server.global.external_labels. - # NOTE: This does not affect the external_labels set in prometheus config. 
- clusterIDConfigmap: kubecost-cluster-info-configmap - extraFlags: - - web.enable-admin-api - - web.enable-lifecycle - - storage.tsdb.wal-compression - resources: - limits: - cpu: 1000m - memory: 2500Mi - requests: - cpu: 300m - memory: 1500Mi + priorityClassName: dkp-high-priority global: scrape_interval: 1m scrape_timeout: 10s evaluation_interval: 1m external_labels: cluster_id: $CLUSTER_ID - persistentVolume: - size: 32Gi - enabled: true - extraArgs: - log.level: info - log.format: json - storage.tsdb.min-block-duration: 2h - storage.tsdb.max-block-duration: 2h - query.max-concurrency: 1 - query.max-samples: 100000000 - enableAdminApi: true - service: - gRPC: - enabled: true - priorityClassName: dkp-high-priority - configmapReload: - prometheus: - enabled: true - #image: - #repository: ghcr.io/jimmidyson/configmap-reload - #tag: v0.14.0 - alertmanager: - enabled: true - #image: - #repository: ghcr.io/jimmidyson/configmap-reload - #tag: v0.14.0 alertmanager: fullnameOverride: "kommander-kubecost-prometheus-alertmanager" priorityClassName: dkp-high-priority - enabled: true - image: - repository: quay.io/prometheus/alertmanager - tag: v0.27.0 - resources: - limits: - cpu: 50m - memory: 100Mi - requests: - cpu: 10m - memory: 50Mi - persistentVolume: - enabled: true - pushgateway: - enabled: false - persistentVolume: - enabled: false - serverFiles: - alerts: - groups: - - name: Kubecost - rules: - - alert: kubecostDown - expr: up{job="kubecost"} == 0 - annotations: - message: 'Kubecost metrics endpoint is not being scraped successfully.' - for: 10m - labels: - severity: warning - - alert: kubecostMetricsUnavailable - expr: sum(sum_over_time(node_cpu_hourly_cost[5m])) == 0 - annotations: - message: 'Kubecost metrics are not available in Prometheus.' 
- for: 10m - labels: - severity: warning - - alert: kubecostRecordingRulesNotEvaluated - expr: avg_over_time(kubecost_cluster_memory_working_set_bytes[5m]) == 0 - annotations: - message: 'Kubecost recording rules are not being successfully evaluated.' - for: 10m - labels: - severity: warning + kube-state-metrics: + fullnameOverride: "kommander-kubecost-prometheus-kube-state-metrics" + priorityClassName: dkp-high-priority grafana: sidecar: @@ -234,10 +126,56 @@ data: kubecostProductConfigs: grafanaURL: "/dkp/kommander/monitoring/grafana" - # used for display in Kubecost UI clusterName: "" clusterProfile: production - cloudIntegrationSecret: # TODO(takirala): Do we want to enable this by default? + cloudIntegrationSecret: "" productKey: enabled: false #key: YOUR_KEY + + # COSI related resources + bucketClasses: # Cluster scoped resource + - name: kubecost-cosi-storage + driverName: rook-ceph.ceph.objectstorage.k8s.io + deletionPolicy: Delete + parameters: + objectStoreUserSecretName: rook-ceph-object-user-dkp-object-store-cosi-admin + objectStoreUserSecretNamespace: kommander + bucketAccessClasses: # Cluster scoped resource + - name: kubecost-cosi-storage + driverName: rook-ceph.ceph.objectstorage.k8s.io + authenticationType: KEY + parameters: + objectStoreUserSecretName: rook-ceph-object-user-dkp-object-store-cosi-admin + objectStoreUserSecretNamespace: kommander + bucketClaims: # Namespace scoped resource + - name: kubecost-cosi-storage + namespace: kubecost + bucketClassName: kubecost-cosi-storage + protocols: + - s3 + bucketAccesses: # Namespace scoped resource + - name: kubecost-cosi-storage + namespace: kubecost + bucketAccessClassName: kubecost-cosi-storage + bucketClaimName: kubecost-cosi-storage + protocol: s3 + credentialsSecretName: federated-store + cosiProviders: + ceph: + driver: + enabled: true + name: ceph-cosi-driver + namespace: kommander + spec: + deploymentStrategy: Auto + adminuser: + enabled: true + name: cosi-admin + namespace: kommander + 
spec: + displayName: "ceph cosi admin" + store: dkp-object-store # name of the CephObjectStore + capabilities: + bucket: "*" + user: "*" diff --git a/services/centralized-kubecost/2.5.0/move-to-konvoy/move-to-konvoy-cosi-hr.yaml b/services/centralized-kubecost/2.5.0/move-to-konvoy/move-to-konvoy-cosi-hr.yaml index 16917f8e4..d009bcaf6 100644 --- a/services/centralized-kubecost/2.5.0/move-to-konvoy/move-to-konvoy-cosi-hr.yaml +++ b/services/centralized-kubecost/2.5.0/move-to-konvoy/move-to-konvoy-cosi-hr.yaml @@ -1,3 +1,4 @@ +# TODO: https://jira.nutanix.com/browse/NCN-104793 and https://jira.nutanix.com/browse/NCN-104743 apiVersion: helm.toolkit.fluxcd.io/v2beta2 kind: HelmRelease metadata: @@ -9,9 +10,10 @@ spec: chart: cosi sourceRef: kind: HelmRepository - name: mesosphere.github.io-charts-stable + name: takirala + # name: mesosphere.github.io-charts-stable namespace: kommander-flux - version: 0.0.1-alpha.1 + version: 0.0.1-alpha.2 interval: 15s install: crds: CreateReplace diff --git a/services/centralized-kubecost/2.5.0/release/release.yaml b/services/centralized-kubecost/2.5.0/release/release.yaml index 8b8a18dcb..b7daa5833 100644 --- a/services/centralized-kubecost/2.5.0/release/release.yaml +++ b/services/centralized-kubecost/2.5.0/release/release.yaml @@ -28,6 +28,9 @@ spec: valuesFrom: - kind: ConfigMap name: centralized-kubecost-2.5.0-d2iq-defaults + - kind: ConfigMap + name: centralized-kubecost-overrides + optional: true targetNamespace: kubecost --- apiVersion: v1 diff --git a/services/kubecost/2.5.0/defaults/cm.yaml b/services/kubecost/2.5.0/defaults/cm.yaml index 006ddf5e9..a99ffcb37 100644 --- a/services/kubecost/2.5.0/defaults/cm.yaml +++ b/services/kubecost/2.5.0/defaults/cm.yaml @@ -37,15 +37,132 @@ data: name: dkp-high-priority prometheus: + kubeStateMetrics: + enabled: false + kube-state-metrics: + disabled: true + + extraScrapeConfigs: | + - job_name: kubecost + honor_labels: true + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: 
/metrics + scheme: http + dns_sd_configs: + - names: + - {{ .Release.Name }}-cost-analyzer + type: 'A' + port: 9003 + - job_name: kubecost-networking + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Scrape only the targets matching the following metadata + - source_labels: [__meta_kubernetes_pod_label_app] + action: keep + regex: {{ .Release.Name }}-network-costs + server: + priorityClassName: dkp-high-priority retention: 14d + fullnameOverride: "kommander-kubecost-prometheus-server" + image: + repository: quay.io/prometheus/prometheus + tag: v2.55.1 + # If clusterIDConfigmap is defined, instead use user-generated configmap with key CLUSTER_ID + # to use as unique cluster ID in kubecost cost-analyzer deployment. + # This overrides the cluster_id set in prometheus.server.global.external_labels. + # NOTE: This does not affect the external_labels set in prometheus config. clusterIDConfigmap: kubecost-cluster-info-configmap + extraFlags: + - web.enable-admin-api + - web.enable-lifecycle + - storage.tsdb.wal-compression + resources: + limits: + cpu: 1000m + memory: 2500Mi + requests: + cpu: 300m + memory: 1500Mi global: scrape_interval: 1m scrape_timeout: 10s evaluation_interval: 1m external_labels: cluster_id: $CLUSTER_ID + persistentVolume: + size: 32Gi + enabled: true + extraArgs: + log.level: info + log.format: json + storage.tsdb.min-block-duration: 2h + storage.tsdb.max-block-duration: 2h + query.max-concurrency: 1 + query.max-samples: 100000000 + enableAdminApi: true + service: + gRPC: + enabled: true + configmapReload: + prometheus: + enabled: true + #image: + #repository: ghcr.io/jimmidyson/configmap-reload + #tag: v0.14.0 + alertmanager: + enabled: true + #image: + #repository: ghcr.io/jimmidyson/configmap-reload + #tag: v0.14.0 + alertmanager: + fullnameOverride: "kommander-kubecost-prometheus-alertmanager" + priorityClassName: dkp-high-priority + enabled: true + image: + repository: quay.io/prometheus/alertmanager + tag: v0.27.0 + resources: 
+ limits: + cpu: 50m + memory: 100Mi + requests: + cpu: 10m + memory: 50Mi + persistentVolume: + enabled: true + pushgateway: + enabled: false + persistentVolume: + enabled: false + serverFiles: + alerts: + groups: + - name: Kubecost + rules: + - alert: kubecostDown + expr: up{job="kubecost"} == 0 + annotations: + message: 'Kubecost metrics endpoint is not being scraped successfully.' + for: 10m + labels: + severity: warning + - alert: kubecostMetricsUnavailable + expr: sum(sum_over_time(node_cpu_hourly_cost[5m])) == 0 + annotations: + message: 'Kubecost metrics are not available in Prometheus.' + for: 10m + labels: + severity: warning + - alert: kubecostRecordingRulesNotEvaluated + expr: avg_over_time(kubecost_cluster_memory_working_set_bytes[5m]) == 0 + annotations: + message: 'Kubecost recording rules are not being successfully evaluated.' + for: 10m + labels: + severity: warning kubecostProductConfigs: # used for display in Kubecost UI