diff --git a/apps/datalake-metrics/manifests/HelmRelease.yaml b/apps/datalake-metrics/manifests/HelmRelease.yaml
new file mode 100644
index 00000000..9e4c1e12
--- /dev/null
+++ b/apps/datalake-metrics/manifests/HelmRelease.yaml
@@ -0,0 +1,34 @@
+---
+# Flux HelmRelease installing the "thanos" chart (15.8.1) from the "thanos" HelmRepository.
+apiVersion: helm.toolkit.fluxcd.io/v2beta2
+kind: HelmRelease
+metadata:
+  name: thanos
+  namespace: datalake-metrics
+spec:
+  chart:
+    spec:
+      chart: thanos
+      version: "15.8.1"
+      sourceRef:
+        kind: HelmRepository
+        name: thanos
+        namespace: datalake-metrics
+      interval: 5m
+      valuesFiles: [values.yaml]  # list form; replaces the deprecated singular `valuesFile`
+  interval: 5m
+  timeout: 20m
+  install:
+    timeout: 20m
+    disableWait: false
+    crds: CreateReplace
+    createNamespace: true
+    remediation:
+      retries: 3
+  upgrade:
+    timeout: 20m
+    disableWait: false
+    crds: CreateReplace
+  valuesFrom:
+    - kind: ConfigMap
+      name: values  # ConfigMap "values" in datalake-metrics (manifests/values.yaml)
diff --git a/apps/datalake-metrics/manifests/HelmRepository.yaml b/apps/datalake-metrics/manifests/HelmRepository.yaml
new file mode 100644
index 00000000..bbf9198b
--- /dev/null
+++ b/apps/datalake-metrics/manifests/HelmRepository.yaml
@@ -0,0 +1,10 @@
+# OCI Helm repository referenced by the "thanos" HelmRelease in this namespace.
+apiVersion: source.toolkit.fluxcd.io/v1beta2
+kind: HelmRepository
+metadata:
+  name: thanos
+  namespace: datalake-metrics
+spec:
+  interval: 5m
+  type: oci
+  url: oci://registry-1.docker.io/bitnamicharts
diff --git a/apps/datalake-metrics/manifests/values.yaml b/apps/datalake-metrics/manifests/values.yaml
new file mode 100644
index 00000000..6d73b585
--- /dev/null
+++ b/apps/datalake-metrics/manifests/values.yaml
@@ -0,0 +1,141 @@
+# Helm values for the "thanos" HelmRelease, loaded through its spec.valuesFrom entry.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: values
+  namespace: datalake-metrics
+data:
+  values.yaml: |-
+    existingObjstoreSecret: thanos-objectstorage
+
+    minio:
+      enabled: false
+
+    metrics:
+      enabled: true
+      serviceMonitor:
+        enabled: true
+      prometheusRule:
+        default:
+          create: false
+          disabled:
+            # Disabling alerts that are not relevant
+            ThanosSidecarIsDown: true
+            ThanosSidecarBucketOperationsFailed: true
+            ThanosSidecarNoConnectionToStartedPrometheus: true
+            ThanosBucketReplicateErrorRate: true
+            ThanosBucketReplicateRunLatency: true
+            ThanosCompactIsDown: true
+            ThanosQueryHttpRequestQueryErrorRateHigh: true
+
+    query:
+      enabled: false
+      logFormat: json
+      replicaLabel: [prometheus_replica, replica]
+      resources:
+        requests:
+          cpu: 15m
+          memory: 40Mi
+      replicaCount: 2
+      extraFlags:
+        - --query.auto-downsampling
+      ingress:
+        enabled: false
+
+    queryFrontend:
+      enabled: false
+
+    #indexCacheConfig: |
+    #  type: MEMCACHED
+    #  config:
+    #    addresses:
+    #      - memcached-0.memcached.datalake-metrics.svc.cluster.local:11211
+    #      - memcached-1.memcached.datalake-metrics.svc.cluster.local:11211
+    #    max_item_size: 10MiB
+
+    #bucketCacheConfig: |
+    #  type: MEMCACHED
+    #  config:
+    #    addresses:
+    #      - memcached-0.memcached.datalake-metrics.svc.cluster.local:11211
+    #      - memcached-1.memcached.datalake-metrics.svc.cluster.local:11211
+    #    max_item_size: 10MiB
+
+    bucketweb:
+      enabled: false
+
+    compactor:
+      enabled: true
+      logFormat: json
+      resources:
+        requests:
+          cpu: 1
+          #memory: 20Gi
+        limits:
+          cpu: 1
+          #memory: 20Gi
+      persistence:
+        storageClass: longhorn-r2
+        # Note: we already know this cannot be lower than 250Gi
+        size: 7Gi  # NOTE(review): conflicts with the 250Gi minimum stated above - confirm the intended size
+      retentionResolutionRaw: 33d  # Keep raw samples for 33 days
+      retentionResolution5m: 120d  # Keep samples downsampled to 5m for 120 days (note that 5m downsampling kicks in after 40h)
+      retentionResolution1h: 365d  # Keep samples downsampled to 1h for 1 year (note that 1h downsampling kicks in after 10d)
+      extraFlags:
+        # Thanos cannot deduplicate data on write, so it has to be deduplicated by the compactor.
+        # To do this, the compactor needs to know which labels are attached by the Thanos receiver and ruler.
+        - --deduplication.replica-label=replica
+        - --hash-func=SHA256
+
+    storegateway:
+      enabled: false
+      replicaCount: 2
+      logFormat: json
+      resources:
+        requests:
+          cpu: 500m
+          memory: 1Gi
+      pdb:
+        create: true
+      sharded:
+        enabled: false
+      persistence:
+        storageClass: longhorn
+        size: 7Gi
+
+    ruler:
+      enabled: false
+
+    receive:
+      enabled: false
+      logFormat: json
+      # WARNING: when increasing replica count, remember to update the config section with new endpoints!
+      replicaCount: 3
+      replicaLabel: replica
+      replicationFactor: 2
+      tsdbRetention: 2d
+      resources:
+        requests:
+          cpu: 150m
+          memory: 1100Mi
+      extraFlags:
+        - --receive.hashrings-algorithm=ketama
+      ingress:
+        enabled: false  # FIXME: Move ingress to helm chart after migration
+
+      service:
+        # Due to magic of helm charts, enabling additional headless SVC allows for proper receiver identification
+        # Without this, all receivers identify as "127.0.0.1:10901" which can cause issues with data querying
+        additionalHeadless: true
+
+      config:
+        - hashring: default
+          tenants: []
+          endpoints:
+            - thanos-receive-0.thanos-receive-headless.datalake-metrics.svc.cluster.local:10901
+            - thanos-receive-1.thanos-receive-headless.datalake-metrics.svc.cluster.local:10901
+            - thanos-receive-2.thanos-receive-headless.datalake-metrics.svc.cluster.local:10901
+
+      persistence:
+        storageClass: lvm
+        size: 35Gi