diff --git a/deployment/grid/charts/agent-htc-lambda/Chart.yaml b/deployment/grid/charts/agent-htc-lambda/Chart.yaml index 765b1d88..c30dad04 100644 --- a/deployment/grid/charts/agent-htc-lambda/Chart.yaml +++ b/deployment/grid/charts/agent-htc-lambda/Chart.yaml @@ -18,8 +18,8 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. -version: 0.1.0 +version: 0.2.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. -appVersion: 1.16.0 +appVersion: 1.17.0 diff --git a/deployment/grid/charts/agent-htc-lambda/templates/hpa.yaml b/deployment/grid/charts/agent-htc-lambda/templates/hpa.yaml index 3c1f1afb..ca859f58 100644 --- a/deployment/grid/charts/agent-htc-lambda/templates/hpa.yaml +++ b/deployment/grid/charts/agent-htc-lambda/templates/hpa.yaml @@ -17,17 +17,9 @@ spec: minReplicaCount: {{ .Values.hpa.minAgent }} maxReplicaCount: {{ .Values.hpa.maxAgent }} triggers: - - type: aws-cloudwatch + - type: aws-sqs-queue metadata: - identityOwner: operator - namespace: {{ .Values.hpa.metric.namespace }} - dimensionName: {{ .Values.hpa.metric.dimensionName }} - dimensionValue: {{ .Values.hpa.metric.dimensionValue }} - metricName: {{ .Values.hpa.metric.name }} - metricUnit: "Count" - metricStatPeriod: "30" - targetMetricValue: {{ .Values.hpa.metric.targetValue | quote}} - minMetricValue: "0" + queueURL: {{ .Values.hpa.queueURL | quote }} + queueLength: {{ .Values.hpa.metric.targetValue | quote }} awsRegion: {{ .Values.hpa.metric.region }} - # authenticationRef: - # name: htc-agent-scaler-auth + identityOwner: operator diff --git a/deployment/grid/charts/agent-htc-lambda/values.yaml b/deployment/grid/charts/agent-htc-lambda/values.yaml index dad3b896..d0fa83eb 100644 --- a/deployment/grid/charts/agent-htc-lambda/values.yaml +++ 
b/deployment/grid/charts/agent-htc-lambda/values.yaml @@ -111,6 +111,7 @@ testTolerations: testAffinity: {} hpa: + queueURL: "" metric: namespace: "" dimensionName: "" diff --git a/deployment/grid/terraform/compute_plane/aws_iam.tf b/deployment/grid/terraform/compute_plane/aws_iam.tf index c94c6e5e..51834490 100644 --- a/deployment/grid/terraform/compute_plane/aws_iam.tf +++ b/deployment/grid/terraform/compute_plane/aws_iam.tf @@ -34,11 +34,10 @@ resource "aws_iam_policy" "keda_permissions" { "Statement": [ { "Action": [ - "cloudwatch:GetMetricData", - "cloudwatch:GetMetricStatistics", - "cloudwatch:ListMetrics" + "sqs:GetQueueAttributes", + "sqs:GetQueueUrl" ], - "Resource": "*", + "Resource": "${var.sqs_queue_arn}", "Effect": "Allow" } ] diff --git a/deployment/grid/terraform/compute_plane/variables.tf b/deployment/grid/terraform/compute_plane/variables.tf index 7f00f056..c30ebee9 100644 --- a/deployment/grid/terraform/compute_plane/variables.tf +++ b/deployment/grid/terraform/compute_plane/variables.tf @@ -124,3 +124,9 @@ variable "eks_node_volume_size" { type = number default = 50 } + +variable "sqs_queue_arn" { + description = "The ARN of the SQS queue for KEDA" + type = string + default = "" +} diff --git a/deployment/grid/terraform/control_plane/outputs.tf b/deployment/grid/terraform/control_plane/outputs.tf index 6e473816..506abad3 100644 --- a/deployment/grid/terraform/control_plane/outputs.tf +++ b/deployment/grid/terraform/control_plane/outputs.tf @@ -66,3 +66,13 @@ output "cognito_userpool_client_id" { description = "Cognito User Pool Client ID" value = aws_cognito_user_pool_client.user_data_client.id } + +output "sqs_queue_url" { + description = "The URL of the SQS queue" + value = aws_sqs_queue.htc_task_queue["__0"].id +} + +output "sqs_queue_arn" { + description = "The ARN of the SQS queue" + value = aws_sqs_queue.htc_task_queue["__0"].arn +} diff --git a/deployment/grid/terraform/htc-agent/htc_agent.tf 
b/deployment/grid/terraform/htc-agent/htc_agent.tf index a2be855a..27c25175 100644 --- a/deployment/grid/terraform/htc-agent/htc_agent.tf +++ b/deployment/grid/terraform/htc-agent/htc_agent.tf @@ -155,6 +155,10 @@ module "htc-agent" { { name = "hpa.minAgent" value = var.min_htc_agents + }, + { + name = "hpa.queueURL" + value = var.sqs_queue_url } ] diff --git a/deployment/grid/terraform/htc-agent/variables.tf b/deployment/grid/terraform/htc-agent/variables.tf index 09571b19..1e0bb375 100644 --- a/deployment/grid/terraform/htc-agent/variables.tf +++ b/deployment/grid/terraform/htc-agent/variables.tf @@ -202,6 +202,12 @@ variable "lambda_handler_function_name" { type = string } +variable "sqs_queue_url" { + description = "The URL of the SQS queue" + type = string + default = "" +} + variable "namespace_metrics" { description = "NameSpace for metrics" type = string diff --git a/deployment/grid/terraform/main.tf b/deployment/grid/terraform/main.tf index ef702d02..24c9412b 100644 --- a/deployment/grid/terraform/main.tf +++ b/deployment/grid/terraform/main.tf @@ -146,6 +146,7 @@ module "compute_plane" { cognito_userpool_id = module.control_plane.cognito_userpool_id kms_key_admin_roles = var.kms_key_admin_roles kms_deletion_window = var.kms_deletion_window + sqs_queue_arn = module.control_plane.sqs_queue_arn # allowed_access_cidr_blocks = local.allowed_access_cidr_blocks } @@ -257,6 +258,7 @@ module "htc_agent" { test_agent_image_tag = try(var.agent_configuration.test.tag, local.default_agent_configuration.test.tag) test_pull_policy = try(var.agent_configuration.test.pullPolicy, local.default_agent_configuration.test.pullPolicy) test_agent_image_repository = try(var.agent_configuration.test.image, local.default_agent_configuration.test.image) + sqs_queue_url = module.control_plane.sqs_queue_url depends_on = [ module.vpc, diff --git a/docs/workshop/content/10_architecture/70_compute_plane/_index.en.md b/docs/workshop/content/10_architecture/70_compute_plane/_index.en.md 
index 24029346..9fdff289 100644 --- a/docs/workshop/content/10_architecture/70_compute_plane/_index.en.md +++ b/docs/workshop/content/10_architecture/70_compute_plane/_index.en.md @@ -13,7 +13,7 @@ EKS service configured with the default [Horizontal Pod Autoscaler](https://kube {{< img "compute-plane-eks.png" "HTC-compute-plane-eks" >}} -As HTC-Agents are treated as a [Kubernetes deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/), a fixed number of pods that will be guaranteed to run on an inactive cluster. Scaling behaviour is then controlled by the auto scaling lambda which regularly checks the depth of the task queue and triggers the appropriate adjustment to the number of nodes. The CloudWatch Adapter exposes a Kubernetes API so the HPA can access metrics stored in Cloud Watch by the auto scaling Lambda. The Pod Autoscaler (using HPA) adds/removes pods based on these Cloud Watch metrics. Finally, the Node Autoscaler , adds/removes EC2 instances based on the resource reservation or usage. +As HTC-Agents are treated as a [Kubernetes deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/), a fixed number of pods are guaranteed to run on an inactive cluster. Scaling behaviour is managed by [KEDA](https://keda.sh/), which allows the Horizontal Pod Autoscaler (HPA) to scale the number of agent pods based on the number of messages in the Amazon SQS task queue. When the number of messages in the queue (the queue depth) increases, KEDA triggers the HPA to add more pods; as the queue drains, the HPA removes pods again. Finally, the Kubernetes Cluster Autoscaler adds or removes EC2 instances based on the resource reservation or usage. {{< img "scale-up-pods.png" "scale-up-pods" >}}