Skip to content

Commit

Permalink
Merge pull request #23 from msfidelis/feature/karpenter
Browse files Browse the repository at this point in the history
Feature/karpenter
  • Loading branch information
msfidelis authored Oct 24, 2023
2 parents d3f358d + 6d6c5f1 commit d74cde4
Show file tree
Hide file tree
Showing 13 changed files with 699 additions and 75 deletions.
54 changes: 44 additions & 10 deletions README.md

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions data.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@ data "aws_eks_cluster_auth" "default" {
}

# Identity (account ID / ARN) of the caller running Terraform; available for
# IAM policy and ARN construction elsewhere in the module.
data "aws_caller_identity" "current" {}

# Recommended EKS-optimized Amazon Linux 2 AMI for the cluster's Kubernetes
# version, resolved from the public SSM parameter published by AWS.
# Consumed as image_id by the Karpenter launch template.
data "aws_ssm_parameter" "eks" {
  name = format("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", var.k8s_version)
}
39 changes: 39 additions & 0 deletions helm/karpenter/templates/provisioner.yml.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Karpenter Provisioner (karpenter.sh/v1alpha5), rendered by Terraform's
# templatefile(). The %{ ... } directives are Terraform template syntax and are
# expanded before the manifest is applied. The leading-strip form %{~ ... }
# removes the whitespace BEFORE each directive so rendered list items keep
# their YAML indentation.
apiVersion: karpenter.sh/v1alpha5
kind: Provisioner
metadata:
  name: ${EKS_CLUSTER}
spec:
  # consolidation and ttlSecondsAfterEmpty are mutually exclusive — enable one.
  # consolidation:
  #   enabled: true
  # Terminate nodes that have been empty for 30 seconds.
  ttlSecondsAfterEmpty: 30
  # NOTE: topologySpreadConstraints was removed — it is not a field of the
  # v1alpha5 Provisioner spec (it belongs in Pod specs) and was silently
  # pruned by the CRD schema.
  requirements:
    - key: karpenter.k8s.aws/instance-family
      operator: In
      values:
      %{~ for ifm in INSTANCE_FAMILY }
        - ${ifm}
      %{~ endfor }
    - key: karpenter.sh/capacity-type
      operator: In
      values:
      %{~ for cpct in CAPACITY_TYPE }
        - ${cpct}
      %{~ endfor }
    - key: karpenter.k8s.aws/instance-size
      operator: In
      values:
      %{~ for ifs in INSTANCE_SIZES }
        - ${ifs}
      %{~ endfor }
    - key: "topology.kubernetes.io/zone"
      operator: In
      values:
      %{~ for az in AVAILABILITY_ZONES }
        - ${az}
      %{~ endfor }
  providerRef:
    name: ${EKS_CLUSTER}
8 changes: 8 additions & 0 deletions helm/karpenter/templates/template.yml.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Karpenter AWSNodeTemplate (karpenter.k8s.aws/v1alpha1), rendered by
# Terraform's templatefile(); placeholders are substituted before apply.
apiVersion: karpenter.k8s.aws/v1alpha1
kind: AWSNodeTemplate
metadata:
  name: ${EKS_CLUSTER}
spec:
  # Explicit subnet IDs (comma-separated list built in helm_karpenter.tf)
  # where Karpenter may launch nodes.
  subnetSelector:
    aws-ids: ${EKS_SUBNETS}
  # Name of the EC2 launch template Karpenter uses for node instances.
  launchTemplate: ${LAUNCH_TEMPLATE}
4 changes: 4 additions & 0 deletions helm/karpenter/templates/user-data.sh.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# EC2 user-data for Karpenter-launched nodes: joins the instance to the EKS
# cluster. The CLUSTER_ID, B64_CLUSTER_CA and APISERVER_ENDPOINT placeholders
# are substituted by Terraform's templatefile() before the script runs.
set -ex

# Run the standard EKS bootstrap with explicit CA and API endpoint so the node
# does not need to call the EKS DescribeCluster API at boot.
/etc/eks/bootstrap.sh ${CLUSTER_ID} --b64-cluster-ca ${B64_CLUSTER_CA} --apiserver-endpoint ${APISERVER_ENDPOINT}
2 changes: 1 addition & 1 deletion helm_cluster_autoscaler.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ resource "helm_release" "cluster_autoscaler" {

set {
name = "rbac.serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
value = aws_iam_role.cluster_autoscaler_role.arn
value = aws_iam_role.cluster_autoscaler_role[count.index].arn
}

set {
Expand Down
105 changes: 105 additions & 0 deletions helm_karpenter.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Installs the Karpenter controller via Helm into its own namespace.
# Created only when var.karpenter_toggle is true (count-based toggle).
resource "helm_release" "karpenter" {
  count            = var.karpenter_toggle ? 1 : 0
  namespace        = "karpenter"
  create_namespace = true

  name       = "karpenter"
  # NOTE(review): charts.karpenter.sh is the legacy chart repository; newer
  # Karpenter releases ship via oci://public.ecr.aws/karpenter — confirm
  # before upgrading past the pinned v0.15.x.
  repository = "https://charts.karpenter.sh"
  chart      = "karpenter"
  version    = "v0.15.0"

  # IRSA: bind the controller's service account to the Karpenter IAM role
  # (dots in the annotation key must be escaped for Helm --set syntax).
  set {
    name  = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
    value = aws_iam_role.karpenter_role[count.index].arn
  }

  set {
    name  = "clusterName"
    value = var.cluster_name
  }

  set {
    name  = "clusterEndpoint"
    value = aws_eks_cluster.eks_cluster.endpoint
  }

  # Instance profile attached to every node Karpenter launches.
  set {
    name  = "aws.defaultInstanceProfile"
    value = aws_iam_instance_profile.nodes.name
  }

  # Chart installation needs the cluster up and the aws-auth ConfigMap in
  # place so Helm can talk to the API server.
  depends_on = [
    aws_eks_cluster.eks_cluster,
    kubernetes_config_map.aws-auth,
  ]

}

# Applies the rendered Karpenter Provisioner manifest to the cluster.
# Created only when Karpenter itself is enabled.
resource "kubectl_manifest" "karpenter_provisioner" {
  count = var.karpenter_toggle ? 1 : 0

  yaml_body = templatefile("${path.module}/helm/karpenter/templates/provisioner.yml.tpl", {
    AVAILABILITY_ZONES = var.karpenter_availability_zones,
    CAPACITY_TYPE      = var.karpenter_capacity_type,
    EKS_CLUSTER        = var.cluster_name,
    INSTANCE_FAMILY    = var.karpenter_instance_family,
    INSTANCE_SIZES     = var.karpenter_instance_sizes,
  })

  # The Provisioner CRD only exists after the Karpenter chart is installed.
  depends_on = [helm_release.karpenter]
}

# Applies the rendered AWSNodeTemplate manifest referenced by the Provisioner.
resource "kubectl_manifest" "karpenter_template" {
  count = var.karpenter_toggle ? 1 : 0

  yaml_body = templatefile("${path.module}/helm/karpenter/templates/template.yml.tpl", {
    EKS_CLUSTER = var.cluster_name,
    # Comma-separated private subnet IDs for the subnetSelector.
    EKS_SUBNETS = join(", ", [
      aws_subnet.private_subnet_1a.id,
      aws_subnet.private_subnet_1b.id,
      aws_subnet.private_subnet_1c.id,
    ]),
    LAUNCH_TEMPLATE = format("%s-karpenter", var.cluster_name),
  })

  # The AWSNodeTemplate CRD only exists after the Karpenter chart is installed.
  depends_on = [helm_release.karpenter]
}

# EC2 launch template used by Karpenter-provisioned nodes (referenced by the
# AWSNodeTemplate manifest via its name).
resource "aws_launch_template" "karpenter" {
  count = var.karpenter_toggle ? 1 : 0

  # Recommended EKS-optimized AL2 AMI for the cluster's k8s version (SSM lookup).
  image_id = data.aws_ssm_parameter.eks.value
  name     = format("%s-karpenter", var.cluster_name)

  update_default_version = true

  # Reuse the cluster security group so nodes can reach the control plane.
  vpc_security_group_ids = [
    aws_eks_cluster.eks_cluster.vpc_config[0].cluster_security_group_id
  ]

  # Bootstrap script that joins the instance to the cluster. The previously
  # passed CLUSTER_NAME variable was unused by the template and was removed.
  user_data = base64encode(templatefile(
    "${path.module}/helm/karpenter/templates/user-data.sh.tpl",
    {
      CLUSTER_ID         = var.cluster_name,
      APISERVER_ENDPOINT = aws_eks_cluster.eks_cluster.endpoint,
      B64_CLUSTER_CA     = aws_eks_cluster.eks_cluster.certificate_authority.0.data
    }
  ))

  iam_instance_profile {
    name = aws_iam_instance_profile.nodes.name
  }

  tag_specifications {
    resource_type = "instance"

    tags = {
      # Fixed typo: tag value previously read "...-karpanter".
      "Name" : format("%s-karpenter", var.cluster_name)
      # Opt the instance into aws-node-termination-handler management.
      "aws-node-termination-handler/managed" = "true"
    }
  }
}
203 changes: 203 additions & 0 deletions helm_node_termination_handler.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# Installs aws-node-termination-handler (queue-processor mode) via Helm.
# Created only when var.node_termination_handler_toggle is true.
resource "helm_release" "node_termination_handler" {
  count     = var.node_termination_handler_toggle ? 1 : 0
  name      = "aws-node-termination-handler"
  namespace = "kube-system"

  repository = "https://aws.github.io/eks-charts/"
  chart      = "aws-node-termination-handler"
  version    = "0.21.0"

  # IRSA: bind the chart's service account to the NTH IAM role.
  set {
    name  = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
    value = aws_iam_role.aws_node_termination_handler_role[count.index].arn
  }

  set {
    name  = "awsRegion"
    value = var.aws_region
  }

  # SQS queue the controller polls for termination/interruption events.
  set {
    name  = "queueURL"
    value = aws_sqs_queue.node_termination_handler[count.index].url
  }

  # Boolean chart flags, grouped in a single dynamic block for brevity;
  # each entry produces the same set {} the chart received before.
  dynamic "set" {
    for_each = {
      enableSqsTerminationDraining   = true
      enableSpotInterruptionDraining = true
      enableRebalanceMonitoring      = true
      enableRebalanceDraining        = true
      enableScheduledEventDraining   = true
      deleteSqsMsgIfNodeNotFound     = true
      checkTagBeforeDraining         = false
    }

    content {
      name  = set.key
      value = set.value
    }
  }

}

# SQS queue that receives EC2/ASG/Health events from EventBridge and is
# consumed by aws-node-termination-handler in queue-processor mode.
resource "aws_sqs_queue" "node_termination_handler" {
  count                      = var.node_termination_handler_toggle ? 1 : 0
  name                       = format("%s-aws-node-termination-handler", var.cluster_name)
  delay_seconds              = 0
  max_message_size           = 2048
  message_retention_seconds  = 86400 # keep events for one day
  receive_wait_time_seconds  = 10    # long polling to cut empty receives
  visibility_timeout_seconds = 60
}

# Queue policy allowing EventBridge to deliver termination events to the NTH
# queue. Previously Principal "*" allowed ANY AWS principal to SendMessage;
# restricted to the service principals recommended by the NTH documentation.
resource "aws_sqs_queue_policy" "node_termination_handler" {
  count     = var.node_termination_handler_toggle ? 1 : 0
  queue_url = aws_sqs_queue.node_termination_handler[count.index].id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Principal = {
          Service = [
            "events.amazonaws.com",
            "sqs.amazonaws.com"
          ]
        }
        Action   = "sqs:SendMessage"
        Resource = aws_sqs_queue.node_termination_handler[count.index].arn
      }
    ]
  })
}

# Route Auto Scaling "Instance-terminate" lifecycle events to the NTH queue so
# nodes can be cordoned and drained before the ASG completes termination.
resource "aws_cloudwatch_event_rule" "node_termination_handler_instance_terminate" {
  count       = var.node_termination_handler_toggle ? 1 : 0
  name        = format("%s-node-termination-handler-instance-terminate", var.cluster_name)
  description = var.cluster_name

  event_pattern = jsonencode({
    source = ["aws.autoscaling"]
    detail-type = [
      "EC2 Instance-terminate Lifecycle Action"
    ]
  })
}

resource "aws_cloudwatch_event_target" "node_termination_handler_instance_terminate" {
  count     = var.node_termination_handler_toggle ? 1 : 0
  rule      = aws_cloudwatch_event_rule.node_termination_handler_instance_terminate[count.index].name
  target_id = "SendToSQS"
  arn       = aws_sqs_queue.node_termination_handler[count.index].arn
}


# Route AWS Health "scheduledChange" events for EC2 (e.g. scheduled
# maintenance/retirement) to the NTH queue for proactive draining.
resource "aws_cloudwatch_event_rule" "node_termination_handler_scheduled_change" {
  count       = var.node_termination_handler_toggle ? 1 : 0
  name        = format("%s-node-termination-handler-scheduled-change", var.cluster_name)
  description = var.cluster_name

  event_pattern = jsonencode({
    source = ["aws.health"]
    detail-type = [
      "AWS Health Event"
    ]
    detail = {
      service = [
        "EC2"
      ]
      eventTypeCategory = [
        "scheduledChange"
      ]
    }
  })
}

resource "aws_cloudwatch_event_target" "node_termination_handler_scheduled_change" {
  count     = var.node_termination_handler_toggle ? 1 : 0
  rule      = aws_cloudwatch_event_rule.node_termination_handler_scheduled_change[count.index].name
  target_id = "SendToSQS"
  arn       = aws_sqs_queue.node_termination_handler[count.index].arn
}

# Route Spot interruption warnings (2-minute notice) to the NTH queue so spot
# nodes are drained before reclamation.
resource "aws_cloudwatch_event_rule" "node_termination_handler_spot_termination" {
  count       = var.node_termination_handler_toggle ? 1 : 0
  name        = format("%s-node-termination-handler-spot-termination", var.cluster_name)
  description = var.cluster_name

  event_pattern = jsonencode({
    source = ["aws.ec2"]
    detail-type = [
      "EC2 Spot Instance Interruption Warning"
    ]
  })
}

resource "aws_cloudwatch_event_target" "node_termination_handler_spot_termination" {
  count     = var.node_termination_handler_toggle ? 1 : 0
  rule      = aws_cloudwatch_event_rule.node_termination_handler_spot_termination[count.index].name
  target_id = "SendToSQS"
  arn       = aws_sqs_queue.node_termination_handler[count.index].arn
}


# Route EC2 rebalance recommendations (early signal of elevated interruption
# risk for spot capacity) to the NTH queue.
resource "aws_cloudwatch_event_rule" "node_termination_handler_rebalance" {
  count       = var.node_termination_handler_toggle ? 1 : 0
  name        = format("%s-node-termination-handler-rebalance", var.cluster_name)
  description = var.cluster_name

  event_pattern = jsonencode({
    source = ["aws.ec2"]
    detail-type = [
      "EC2 Instance Rebalance Recommendation"
    ]
  })
}

resource "aws_cloudwatch_event_target" "node_termination_handler_rebalance" {
  count     = var.node_termination_handler_toggle ? 1 : 0
  rule      = aws_cloudwatch_event_rule.node_termination_handler_rebalance[count.index].name
  target_id = "SendToSQS"
  arn       = aws_sqs_queue.node_termination_handler[count.index].arn
}


# Route EC2 instance state-change notifications (stopping/terminated etc.) to
# the NTH queue so node objects can be cleaned up promptly.
resource "aws_cloudwatch_event_rule" "node_termination_handler_state_change" {
  count       = var.node_termination_handler_toggle ? 1 : 0
  name        = format("%s-node-termination-handler-state-change", var.cluster_name)
  description = var.cluster_name

  event_pattern = jsonencode({
    source = ["aws.ec2"]
    detail-type = [
      "EC2 Instance State-change Notification"
    ]
  })
}

resource "aws_cloudwatch_event_target" "node_termination_handler_state_change" {
  count     = var.node_termination_handler_toggle ? 1 : 0
  rule      = aws_cloudwatch_event_rule.node_termination_handler_state_change[count.index].name
  target_id = "SendToSQS"
  arn       = aws_sqs_queue.node_termination_handler[count.index].arn
}
Loading

0 comments on commit d74cde4

Please sign in to comment.