Skip to content

Commit 9164415

Browse files
authored
Add failure modes telemetry (#646)
Telemetry to cover failure modes that are not covered by container logs, and metrics for finding resource constraints. ## Motivation and Context When there is any issue with the registry container, we should be notified. ## How Has This Been Tested? - Local setup ## Breaking Changes - No ## Types of changes <!-- What types of changes does your code introduce? Put an `x` in all the boxes that apply: --> - [ ] Bug fix (non-breaking change which fixes an issue) - [X] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update ## Checklist <!-- Go over all the following points, and put an `x` in all the boxes that apply. --> - [ ] I have read the [MCP Documentation](https://modelcontextprotocol.io) - [ ] My code follows the repository's style guidelines - [ ] New and existing tests pass locally - [ ] I have added appropriate error handling - [ ] I have added or updated documentation as needed ## Additional context - No additional exporter is used; this takes advantage of the OpenTelemetry Collector. - It covers metrics related to resource constraints, currently limited to the default namespace. - Takes care of Kubernetes events as logs, which are the source for figuring out any problem with a service; this covers all scenarios where a pod is not able to start yet and would be missed because there are no container logs in such cases. Limited to the default namespace. - Takes care of the DaemonSet deployment, i.e. deploying the OTel Collector as an agent, by using correct filtering. - The only cardinality-contributing factors are pod IDs (but this needs more observation); node IDs will not increase cardinality, as scale-up will lead to a limited number of nodes. 
- Resource metrics are shipped every 60s; the list of metrics that will be emitted is at [https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/metadata.yaml](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/metadata.yaml) - Container errors <img width="1440" height="816" alt="Screenshot 2025-10-10 at 1 21 14 AM" src="https://github.com/user-attachments/assets/ba90a217-2a49-4522-aa44-a98c02adf95b" /> - Resource metrics <img width="1440" height="816" alt="Screenshot 2025-10-10 at 1 23 51 AM" src="https://github.com/user-attachments/assets/3467be96-db3c-4930-afa2-3cbf5f0ced8b" />
1 parent 8f047f3 commit 9164415

File tree

1 file changed

+101
-29
lines changed

1 file changed

+101
-29
lines changed

deploy/pkg/k8s/monitoring.go

Lines changed: 101 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,8 @@ func deployOtelCollectorDaemonSet(ctx *pulumi.Context, cluster *providers.Provid
174174
"repository": pulumi.String("otel/opentelemetry-collector-contrib"),
175175
"tag": pulumi.String("0.133.0"),
176176
},
177+
"hostNetwork": pulumi.Bool(true),
178+
"dnsPolicy": pulumi.String("ClusterFirstWithHostNet"),
177179
"clusterRole": pulumi.Map{
178180
"create": pulumi.Bool(true),
179181
"rules": pulumi.Array{
@@ -183,14 +185,39 @@ func deployOtelCollectorDaemonSet(ctx *pulumi.Context, cluster *providers.Provid
183185
pulumi.String("pods"),
184186
pulumi.String("pods/log"),
185187
pulumi.String("nodes"),
188+
pulumi.String("nodes/stats"),
189+
pulumi.String("nodes/proxy"),
186190
pulumi.String("namespaces"),
191+
pulumi.String("events"),
187192
},
188193
"verbs": pulumi.StringArray{
189194
pulumi.String("get"),
190195
pulumi.String("list"),
191196
pulumi.String("watch"),
192197
},
193198
},
199+
pulumi.Map{
200+
"apiGroups": pulumi.StringArray{pulumi.String("apps")},
201+
"resources": pulumi.StringArray{
202+
pulumi.String("replicasets"),
203+
pulumi.String("deployments"),
204+
pulumi.String("daemonsets"),
205+
},
206+
"verbs": pulumi.StringArray{
207+
pulumi.String("get"),
208+
pulumi.String("list"),
209+
pulumi.String("watch"),
210+
},
211+
},
212+
pulumi.Map{
213+
"nonResourceURLs": pulumi.StringArray{
214+
pulumi.String("/stats/*"),
215+
pulumi.String("/metrics"),
216+
},
217+
"verbs": pulumi.StringArray{
218+
pulumi.String("get"),
219+
},
220+
},
194221
},
195222
},
196223
"config": pulumi.Map{
@@ -205,27 +232,12 @@ func deployOtelCollectorDaemonSet(ctx *pulumi.Context, cluster *providers.Provid
205232
pulumi.Map{
206233
"type": pulumi.String("regex_parser"),
207234
"id": pulumi.String("extract_metadata_from_filepath"),
208-
"regex": pulumi.String(`^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]{36})\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log`),
235+
"regex": pulumi.String(`^.*\/[^_]+_[^_]+_(?P<uid>[a-f0-9\-]{36})\/[^\._]+\/(?P<restart_count>\d+)\.log`),
209236
"parse_from": pulumi.String("attributes[\"log.file.path\"]"),
210237
"cache": pulumi.Map{
211238
"size": pulumi.Int(128),
212239
},
213240
},
214-
pulumi.Map{
215-
"type": pulumi.String("move"),
216-
"from": pulumi.String("attributes.container_name"),
217-
"to": pulumi.String("resource[\"k8s.container.name\"]"),
218-
},
219-
pulumi.Map{
220-
"type": pulumi.String("move"),
221-
"from": pulumi.String("attributes.namespace"),
222-
"to": pulumi.String("resource[\"k8s.namespace.name\"]"),
223-
},
224-
pulumi.Map{
225-
"type": pulumi.String("move"),
226-
"from": pulumi.String("attributes.pod_name"),
227-
"to": pulumi.String("resource[\"k8s.pod.name\"]"),
228-
},
229241
pulumi.Map{
230242
"type": pulumi.String("move"),
231243
"from": pulumi.String("attributes.restart_count"),
@@ -238,9 +250,28 @@ func deployOtelCollectorDaemonSet(ctx *pulumi.Context, cluster *providers.Provid
238250
},
239251
},
240252
},
253+
"kubeletstats": pulumi.Map{
254+
"collection_interval": pulumi.String("60s"),
255+
"auth_type": pulumi.String("serviceAccount"),
256+
"endpoint": pulumi.String("https://${env:KUBERNETES_NODE_NAME}:10250"),
257+
"insecure_skip_verify": pulumi.Bool(true),
258+
},
259+
"k8s_events": pulumi.Map{
260+
"auth_type": pulumi.String("serviceAccount"),
261+
"namespaces": pulumi.StringArray{
262+
pulumi.String("default"),
263+
},
264+
},
241265
},
242266
"processors": pulumi.Map{
243267
"batch": pulumi.Map{},
268+
"filter/kubeletstats_filter": pulumi.Map{
269+
"metrics": pulumi.Map{
270+
"datapoint": pulumi.StringArray{
271+
pulumi.String(`resource.attributes["k8s.namespace.name"] != "default"`),
272+
},
273+
},
274+
},
244275
"k8sattributes": pulumi.Map{
245276
"auth_type": pulumi.String("serviceAccount"),
246277
"passthrough": pulumi.Bool(false),
@@ -254,27 +285,31 @@ func deployOtelCollectorDaemonSet(ctx *pulumi.Context, cluster *providers.Provid
254285
pulumi.String("k8s.deployment.name"),
255286
pulumi.String("k8s.namespace.name"),
256287
pulumi.String("k8s.node.name"),
257-
pulumi.String("k8s.pod.start_time"),
258-
pulumi.String("k8s.cluster.uid"),
259-
},
260-
"labels": pulumi.Array{
261-
pulumi.Map{
262-
"tag_name": pulumi.String("app"),
263-
"key": pulumi.String("app"),
264-
"from": pulumi.String("pod"),
265-
},
288+
pulumi.String("container.image.name"),
289+
pulumi.String("container.image.tag"),
266290
},
267291
},
268292
"pod_association": pulumi.Array{
269293
pulumi.Map{
270294
"sources": pulumi.Array{
271295
pulumi.Map{
272296
"from": pulumi.String("resource_attribute"),
273-
"name": pulumi.String("k8s.pod.name"),
297+
"name": pulumi.String("k8s.pod.ip"),
274298
},
299+
},
300+
},
301+
pulumi.Map{
302+
"sources": pulumi.Array{
275303
pulumi.Map{
276304
"from": pulumi.String("resource_attribute"),
277-
"name": pulumi.String("k8s.namespace.name"),
305+
"name": pulumi.String("k8s.pod.uid"),
306+
},
307+
},
308+
},
309+
pulumi.Map{
310+
"sources": pulumi.Array{
311+
pulumi.Map{
312+
"from": pulumi.String("connection"),
278313
},
279314
},
280315
},
@@ -302,14 +337,29 @@ func deployOtelCollectorDaemonSet(ctx *pulumi.Context, cluster *providers.Provid
302337
"queue_size": pulumi.Int(50),
303338
},
304339
},
340+
"otlphttp/victoriametrics": pulumi.Map{
341+
"metrics_endpoint": pulumi.String("http://victoria-metrics-victoria-metrics-single-server:8428/opentelemetry/v1/metrics"),
342+
"timeout": pulumi.String("10s"),
343+
"retry_on_failure": pulumi.Map{
344+
"enabled": pulumi.Bool(true),
345+
"initial_interval": pulumi.String("5s"),
346+
"max_interval": pulumi.String("30s"),
347+
"max_elapsed_time": pulumi.String("300s"),
348+
},
349+
},
305350
},
306351
"service": pulumi.Map{
307352
"pipelines": pulumi.Map{
308353
"logs": pulumi.Map{
309-
"receivers": pulumi.StringArray{pulumi.String("filelog")},
310-
"processors": pulumi.StringArray{pulumi.String("batch"), pulumi.String("k8sattributes")},
354+
"receivers": pulumi.StringArray{pulumi.String("filelog"), pulumi.String("k8s_events")},
355+
"processors": pulumi.StringArray{pulumi.String("k8sattributes"), pulumi.String("batch")},
311356
"exporters": pulumi.StringArray{pulumi.String("otlphttp/victorialogs")},
312357
},
358+
"metrics": pulumi.Map{
359+
"receivers": pulumi.StringArray{pulumi.String("kubeletstats")},
360+
"processors": pulumi.StringArray{pulumi.String("k8sattributes"), pulumi.String("filter/kubeletstats_filter"), pulumi.String("batch")},
361+
"exporters": pulumi.StringArray{pulumi.String("otlphttp/victoriametrics")},
362+
},
313363
},
314364
},
315365
},
@@ -326,6 +376,18 @@ func deployOtelCollectorDaemonSet(ctx *pulumi.Context, cluster *providers.Provid
326376
"path": pulumi.String("/var/lib/docker/containers"),
327377
},
328378
},
379+
pulumi.Map{
380+
"name": pulumi.String("proc"),
381+
"hostPath": pulumi.Map{
382+
"path": pulumi.String("/proc"),
383+
},
384+
},
385+
pulumi.Map{
386+
"name": pulumi.String("sys"),
387+
"hostPath": pulumi.Map{
388+
"path": pulumi.String("/sys"),
389+
},
390+
},
329391
},
330392
"extraVolumeMounts": pulumi.Array{
331393
pulumi.Map{
@@ -338,6 +400,16 @@ func deployOtelCollectorDaemonSet(ctx *pulumi.Context, cluster *providers.Provid
338400
"mountPath": pulumi.String("/var/lib/docker/containers"),
339401
"readOnly": pulumi.Bool(true),
340402
},
403+
pulumi.Map{
404+
"name": pulumi.String("proc"),
405+
"mountPath": pulumi.String("/host/proc"),
406+
"readOnly": pulumi.Bool(true),
407+
},
408+
pulumi.Map{
409+
"name": pulumi.String("sys"),
410+
"mountPath": pulumi.String("/host/sys"),
411+
"readOnly": pulumi.Bool(true),
412+
},
341413
},
342414
"extraEnvs": pulumi.Array{
343415
pulumi.Map{

0 commit comments

Comments
 (0)