Skip to content

[Karpenter Integration]: Add missing metrics based on the Karpenter v1.4 documentation #20110

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions karpenter/changelog.d/20110.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add missing Karpenter v1.4 metrics
30 changes: 30 additions & 0 deletions karpenter/datadog_checks/karpenter/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,20 @@
# https://karpenter.sh/docs/reference/metrics/

METRIC_MAP = {
'aws_sdk_go_request': 'aws.sdk_go.request',
'aws_sdk_go_request_retry': 'aws.sdk_go.request.retry',
'aws_sdk_go_request_duration_seconds': 'aws.sdk_go.request.duration_seconds',
'aws_sdk_go_request_attempt': 'aws.sdk_go.request_attempt',
'aws_sdk_go_request_attempt_duration_seconds': 'aws.sdk_go.request_attempt.duration_seconds',
'certwatcher_read_certificate': 'certwatcher.read.certificate',
'certwatcher_read_certificate_errors': 'certwatcher.read.certificate.errors',
'controller_runtime_active_workers': 'controller.runtime.active_workers',
'controller_runtime_max_concurrent_reconciles': 'controller.runtime.max.concurrent_reconciles',
'controller_runtime_reconcile': 'controller.runtime.reconcile',
'controller_runtime_reconcile_errors': 'controller.runtime.reconcile_errors',
'controller_runtime_reconcile_panics': 'controller.runtime.reconcile_panics',
'controller_runtime_reconcile_time_seconds': 'controller.runtime.reconcile.time_seconds',
'controller_runtime_terminal_reconcile_errors': 'controller.runtime.terminal.reconcile.errors',
'go_gc_duration_seconds': 'go.gc.duration_seconds',
'go_goroutines': 'go_goroutines',
'go_info': 'go_info',
Expand Down Expand Up @@ -69,6 +76,7 @@
'karpenter_disruption_queue_depth': 'disruption.queue_depth',
'karpenter_disruption_pods_disrupted': 'disruption.pods.disrupted',
'karpenter_disruption_nodes_disrupted': 'disruption.nodes.disrupted',
'karpenter_ignored_pod_count': 'ignored_pod_count',
'karpenter_interruption_actions_performed': 'interruption.actions_performed',
'karpenter_interruption_deleted_messages': 'interruption.deleted_messages',
'karpenter_interruption_message_latency_time_seconds': 'interruption.message.latency.time_seconds',
Expand All @@ -84,9 +92,11 @@
'karpenter_nodeclaims_disrupted': 'nodeclaims_disrupted',
'karpenter_nodeclaims_drifted': 'nodeclaims_drifted',
'karpenter_nodeclaims_initialized': 'nodeclaims_initialized',
'karpenter_nodeclaims_instance_termination_duration_seconds': 'nodeclaims_instance_termination.duration_seconds',
'karpenter_nodeclaims_launched': 'nodeclaims_launched',
'karpenter_nodeclaims_registered': 'nodeclaims_registered',
'karpenter_nodeclaims_terminated': 'nodeclaims_terminated',
'karpenter_nodeclaims_termination_duration_seconds': 'nodeclaims_termination.duration_seconds',
'karpenter_nodepool_limit': 'nodepool_limit',
'karpenter_nodepool_usage': 'nodepool_usage',
'karpenter_nodes_allocatable': 'nodes.allocatable',
Expand All @@ -109,8 +119,11 @@
'karpenter_provisioner_usage': 'provisioner.usage',
'karpenter_provisioner_usage_pct': 'provisioner.usage.pct',
'karpenter_cluster_state_synced': 'cluster_state.synced',
'karpenter_cluster_state_unsynced_time_seconds': 'cluster_state.unsynced.time_seconds',
'karpenter_cluster_state_node_count': 'cluster_state.node_count',
'karpenter_cluster_utilization_percent': 'cluster.utilization.percent',
'leader_election_master_status': 'leader_election.master_status',
'leader_election_slowpath': 'leader_election.slowpath',
'process_cpu_seconds': 'process.cpu_seconds',
'process_max_fds': 'process.max_fds',
'process_open_fds': 'process.open_fds',
Expand Down Expand Up @@ -140,6 +153,23 @@
'karpenter_interruption_message_queue_duration_seconds': 'interruption.message.latency.time_seconds',
'karpenter_nodepools_usage': 'nodepool_usage',
'karpenter_nodepools_limit': 'nodepool_limit',
'operator_ec2nodeclass_status_condition_transitions': 'operator.ec2nodeclass.status_condition.transitions',
'operator_ec2nodeclass_status_condition_current_status_seconds': 'operator.ec2nodeclass.status_condition.current_status.seconds',
'operator_ec2nodeclass_status_condition_count': 'operator.ec2nodeclass.status_condition_count',
'operator_node_event_count': 'operator.node.event_count',
'operator_node_status_condition_transitions': 'operator.node.status_condition.transitions',
'operator_node_status_condition_transition_seconds': 'operator.node.status_condition.transitions.seconds',
'operator_node_status_condition_current_status_seconds': 'operator.node.status_condition.current_status.seconds',
'operator_node_status_condition_count': 'operator.node.status_condition_count',
'operator_node_termination_duration_seconds': 'operator.node.termination.duration_seconds',
'operator_nodeclaim_status_condition_transitions': 'operator.nodeclaim.status_condition.transitions',
'operator_nodeclaim_status_condition_transition_seconds': 'operator.nodeclaim.status_condition.transitions.seconds',
'operator_nodeclaim_status_condition_current_status_seconds': 'operator.nodeclaim.status_condition.current_status.seconds',
'operator_nodeclaim_status_condition_count': 'operator.nodeclaim.status_condition_count',
'operator_nodeclaim_termination_duration_seconds': 'operator.nodeclaim.termination.duration_seconds',
'operator_nodepool_status_condition_transitions': 'operator.nodepool.status_condition.transitions',
'operator_nodepool_status_condition_current_status_seconds': 'operator.nodepool.status_condition.current_status.seconds',
'operator_nodepool_status_condition_count': 'operator.nodepool.status_condition_count',
}

RENAME_LABELS_MAP = {
Expand Down
34 changes: 34 additions & 0 deletions karpenter/metadata.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags
karpenter.aws.sdk_go.request.count,count,,,,The total number of AWS SDK Go requests,0,karpenter,,,
karpenter.aws.sdk_go.request.duration_seconds.bucket,count,,,,Latency of AWS SDK Go requests histogram buckets,0,karpenter,,,
karpenter.aws.sdk_go.request.duration_seconds.count,count,,,,Count of AWS SDK Go request durations,0,karpenter,,,
karpenter.aws.sdk_go.request.duration_seconds.sum,count,,second,,Sum of AWS SDK Go request durations,0,karpenter,,,
karpenter.aws.sdk_go.request_attempt.count,count,,,,The total number of AWS SDK Go request attempts,0,karpenter,,,
karpenter.aws.sdk_go.request_attempt.duration_seconds.bucket,count,,second,,Latency of AWS SDK Go request attempts histogram buckets,0,karpenter,,,
karpenter.aws.sdk_go.request_attempt.duration_seconds.count,count,,request,,Count of AWS SDK Go request attempt durations,0,karpenter,,,
karpenter.aws.sdk_go.request_attempt.duration_seconds.sum,count,,second,,Sum of AWS SDK Go request attempt durations,0,karpenter,,,
karpenter.build_info,gauge,,,,A metric with a constant '1' value labeled by version from which Karpenter was built.,0,karpenter,,,
karpenter.certwatcher.read.certificate.count,count,,read,,The count of certificate reads,0,karpenter,,,
karpenter.certwatcher.read.certificate.errors.count,count,,error,,The count of certificate read errors,0,karpenter,,,
Expand All @@ -16,8 +24,10 @@ karpenter.cloudprovider.instance.type.cpu_cores,gauge,,core,,VCPUs cores for a g
karpenter.cloudprovider.instance.type.memory_bytes,gauge,,byte,,"Memory, in bytes, for a given instance type",0,karpenter,,,
karpenter.cloudprovider.instance.type.offering_available,gauge,,,,"Instance type offering availability, based on instance type, capacity type, and zone",0,karpenter,,,
karpenter.cloudprovider.instance.type.price_estimate,gauge,,,,Estimated hourly price used when making informed decisions on node cost calculation. This is updated once on startup and then every 12 hours,0,karpenter,,,
karpenter.cluster.utilization.percent,gauge,,percent,,Utilization of allocatable resources by pod requests,0,karpenter,,,
karpenter.cluster_state.node_count,gauge,,node,,Current count of nodes in cluster state.,0,karpenter,,,
karpenter.cluster_state.synced,gauge,,,,Returns 1 if cluster state is synced and 0 otherwise. Synced checks that nodeclaims and nodes that are stored in the APIServer have the same representation as Karpenter's cluster state,0,karpenter,,,
karpenter.cluster_state.unsynced.time_seconds,gauge,,second,,The time for which cluster state is not synced,0,karpenter,,,
karpenter.consistency.errors,gauge,,error,,Number of consistency checks that have failed,0,karpenter,,,
karpenter.controller.runtime.active_workers,gauge,,worker,,Number of currently used workers per controller,0,karpenter,,,
karpenter.controller.runtime.max.concurrent_reconciles,gauge,,,,Maximum number of concurrent reconciles per controller,0,karpenter,,,
Expand All @@ -26,6 +36,8 @@ karpenter.controller.runtime.reconcile.time_seconds.bucket,count,,,,The count of
karpenter.controller.runtime.reconcile.time_seconds.count,count,,,,The count of observations in the reconciliation per controller histogram,0,karpenter,,,
karpenter.controller.runtime.reconcile.time_seconds.sum,count,,second,,The sum of time per reconciliation per controller,0,karpenter,,,
karpenter.controller.runtime.reconcile_errors.count,count,,error,,The count of reconciliation errors per controller,0,karpenter,,,
karpenter.controller.runtime.reconcile_panics.count,count,,,,Total number of reconciliation panics per controller,0,karpenter,,,
karpenter.controller.runtime.terminal.reconcile.errors.count,count,,,,Total number of terminal reconciliation errors per controller,0,karpenter,,,
karpenter.deprovisioning.actions_performed.count,count,,execution,,The count of deprovisioning actions performed. Labeled by deprovisioner,0,karpenter,,,
karpenter.deprovisioning.consolidation_timeouts,gauge,,timeout,,Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type,0,karpenter,,,
karpenter.deprovisioning.eligible_machines,gauge,,,,Number of machines eligible for deprovisioning by Karpenter. Labeled by deprovisioner,0,karpenter,,,
Expand Down Expand Up @@ -97,9 +109,15 @@ karpenter.nodeclaims_created,gauge,,,,Number of nodeclaims created in total by K
karpenter.nodeclaims_disrupted,gauge,,,,Number of nodeclaims disrupted in total by Karpenter. Labeled by disruption type of the nodeclaim and the owning nodepool,0,karpenter,,,
karpenter.nodeclaims_drifted,gauge,,,,Number of nodeclaims drifted reasons in total by Karpenter. Labeled by drift type of the nodeclaim and the owning nodepool,0,karpenter,,,
karpenter.nodeclaims_initialized,gauge,,,,Number of nodeclaims initialized in total by Karpenter. Labeled by the owning nodepool,0,karpenter,,,
karpenter.nodeclaims_instance_termination.duration_seconds.bucket,count,,,,Histogram buckets for CloudProvider Instance termination duration,0,karpenter,,,
karpenter.nodeclaims_instance_termination.duration_seconds.count,count,,,,Count of CloudProvider Instance termination observations,0,karpenter,,,
karpenter.nodeclaims_instance_termination.duration_seconds.sum,count,,second,,Sum of CloudProvider Instance termination durations,0,karpenter,,,
karpenter.nodeclaims_launched,gauge,,,,Number of nodeclaims launched in total by Karpenter. Labeled by the owning nodepool,0,karpenter,,,
karpenter.nodeclaims_registered,gauge,,,,Number of nodeclaims registered in total by Karpenter. Labeled by the owning nodepool,0,karpenter,,,
karpenter.nodeclaims_terminated,gauge,,,,Number of nodeclaims terminated in total by Karpenter. Labeled by reason the nodeclaim was terminated and the owning nodepool,0,karpenter,,,
karpenter.nodeclaims_termination.duration_seconds.bucket,count,,,,Histogram buckets for NodeClaim termination duration,0,karpenter,,,
karpenter.nodeclaims_termination.duration_seconds.count,count,,,,Count of NodeClaim termination duration observations,0,karpenter,,,
karpenter.nodeclaims_termination.duration_seconds.sum,count,,second,,Sum of NodeClaim termination durations,0,karpenter,,,
karpenter.nodepool_limit,gauge,,,,The nodepool limits are the limits specified on the provisioner that restrict the quantity of resources provisioned. Labeled by nodepool name and resource type,0,karpenter,,,
karpenter.nodepool_usage,gauge,,,,The nodepool usage is the amount of resources that have been provisioned by a particular nodepool. Labeled by nodepool name and resource type,0,karpenter,,,
karpenter.nodes.allocatable,gauge,,,,The amount of resources allocatable by nodes,0,karpenter,,,
Expand All @@ -115,6 +133,22 @@ karpenter.nodes.total.daemon_limits,gauge,,,,Total resources specified by Daemon
karpenter.nodes.total.daemon_requests,gauge,,,,Total resources requested by DaemonSet pods,0,karpenter,,,
karpenter.nodes.total.pod_limits,gauge,,,,Total pod resources specified by non-DaemonSet pod limits,0,karpenter,,,
karpenter.nodes.total.pod_requests,gauge,,,,Total pod resources requested by non-DaemonSet pods bound,0,karpenter,,,
karpenter.operator.ec2nodeclass.status_condition.current_status.seconds,gauge,,second,,Time current status condition has been active for ec2nodeclass,0,karpenter,,,
karpenter.operator.ec2nodeclass.status_condition.transitions.count,count,,,,Count of status condition transitions for ec2nodeclass,0,karpenter,,,
karpenter.operator.ec2nodeclass.status_condition_count,gauge,,,,Number of conditions for ec2nodeclass,0,karpenter,,,
karpenter.operator.node.status_condition.current_status.seconds,gauge,,second,,Time current status condition has been active for node,0,karpenter,,,
karpenter.operator.node.status_condition.transitions.count,count,,,,Count of status condition transitions for node,0,karpenter,,,
karpenter.operator.node.status_condition.transitions.seconds.bucket,count,,second,,Histogram of condition state durations for node,0,karpenter,,,
karpenter.operator.node.status_condition_count,gauge,,,,Number of conditions for node,0,karpenter,,,
karpenter.operator.node.termination.duration_seconds.bucket,count,,second,,Histogram buckets for node termination durations,0,karpenter,,,
karpenter.operator.nodeclaim.status_condition.current_status.seconds,gauge,,second,,Time current status condition has been active for nodeclaim,0,karpenter,,,
karpenter.operator.nodeclaim.status_condition.transitions.count,count,,,,Count of status condition transitions for nodeclaim,0,karpenter,,,
karpenter.operator.nodeclaim.status_condition.transitions.seconds.bucket,count,,second,,Histogram of condition state durations for nodeclaim,0,karpenter,,,
karpenter.operator.nodeclaim.status_condition_count,gauge,,,,Number of conditions for nodeclaim,0,karpenter,,,
karpenter.operator.nodeclaim.termination.duration_seconds.bucket,count,,,,Histogram buckets for nodeclaim termination durations,0,karpenter,,,
karpenter.operator.nodepool.status_condition.current_status.seconds,gauge,,second,,Time current status condition has been active for nodepool,0,karpenter,,,
karpenter.operator.nodepool.status_condition.transitions.count,count,,,,Count of status condition transitions for nodepool,0,karpenter,,,
karpenter.operator.nodepool.status_condition_count,gauge,,,,Number of conditions for nodepool,0,karpenter,,,
karpenter.pods.startup.time_seconds.count,count,,,,The count of the observations in the pod startup summary,0,karpenter,,,
karpenter.pods.startup.time_seconds.quantile,gauge,,,,The time taken between pod creation and the pod being in a running state by `quantile`,0,karpenter,,,
karpenter.pods.startup.time_seconds.sum,count,,second,,The sum of the time from pod creation and the pod being in a running state,0,karpenter,,,
Expand Down
34 changes: 34 additions & 0 deletions karpenter/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,40 @@ def get_fixture_path(filename):
'karpenter.interruption.message.latency.time_seconds.bucket',
'karpenter.nodepool_usage',
'karpenter.nodepool_limit',
'karpenter.aws.sdk_go.request.count',
'karpenter.aws.sdk_go.request.duration_seconds.bucket',
'karpenter.aws.sdk_go.request.duration_seconds.count',
'karpenter.aws.sdk_go.request.duration_seconds.sum',
'karpenter.aws.sdk_go.request_attempt.count',
'karpenter.aws.sdk_go.request_attempt.duration_seconds.bucket',
'karpenter.aws.sdk_go.request_attempt.duration_seconds.count',
'karpenter.aws.sdk_go.request_attempt.duration_seconds.sum',
'karpenter.cluster.utilization.percent',
'karpenter.cluster_state.unsynced.time_seconds',
'karpenter.controller.runtime.reconcile_panics.count',
'karpenter.controller.runtime.terminal.reconcile.errors.count',
'karpenter.nodeclaims_instance_termination.duration_seconds.bucket',
'karpenter.nodeclaims_instance_termination.duration_seconds.count',
'karpenter.nodeclaims_instance_termination.duration_seconds.sum',
'karpenter.nodeclaims_termination.duration_seconds.bucket',
'karpenter.nodeclaims_termination.duration_seconds.count',
'karpenter.nodeclaims_termination.duration_seconds.sum',
'karpenter.operator.ec2nodeclass.status_condition.current_status.seconds',
'karpenter.operator.ec2nodeclass.status_condition.transitions.count',
'karpenter.operator.ec2nodeclass.status_condition_count',
'karpenter.operator.node.status_condition.current_status.seconds',
'karpenter.operator.node.status_condition.transitions.count',
'karpenter.operator.node.status_condition.transitions.seconds.bucket',
'karpenter.operator.node.status_condition_count',
'karpenter.operator.node.termination.duration_seconds.bucket',
'karpenter.operator.nodeclaim.status_condition.current_status.seconds',
'karpenter.operator.nodeclaim.status_condition.transitions.count',
'karpenter.operator.nodeclaim.status_condition.transitions.seconds.bucket',
'karpenter.operator.nodeclaim.status_condition_count',
'karpenter.operator.nodeclaim.termination.duration_seconds.bucket',
'karpenter.operator.nodepool.status_condition.current_status.seconds',
'karpenter.operator.nodepool.status_condition.transitions.count',
'karpenter.operator.nodepool.status_condition_count',
]
RENAMED_LABELS = [
'go_version:go1.20.6',
Expand Down
Loading
Loading