Skip to content

Commit 98cf5c2

Browse files
jmfiola and Jacob Fiola authored
fix: add support for multiple azs (closes #17) (#18)
* add support for multiple azs * update tests * remove jfiola from defaults * update asg lambda role policy to include all management subnet arns --------- Co-authored-by: Jacob Fiola <[email protected]>
1 parent 5ac0c67 commit 98cf5c2

File tree

11 files changed

+88
-40
lines changed

11 files changed

+88
-40
lines changed

README.md

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ Terraform for Corelight's AWS Cloud Sensor Deployment.
88
```terraform
99
1010
data "aws_subnet" "management" {
11-
id = "<management subnet id>"
11+
for_each = toset(["<management subnet id 1>", "<management subnet id 2>"])
12+
id = each.value
1213
}
1314
1415
module "asg_lambda_role" {
@@ -17,26 +18,31 @@ module "asg_lambda_role" {
1718
lambda_cloudwatch_log_group_arn = module.sensor.cloudwatch_log_group_arn
1819
sensor_autoscaling_group_arn = module.sensor.autoscaling_group_arn
1920
security_group_arn = module.sensor.management_security_group_arn
20-
subnet_arn = data.aws_subnet.management.arn
21+
subnet_arns = [for subnet in data.aws_subnet.management : subnet.arn]
2122
}
2223
2324
module "sensor" {
2425
source = "github.com/corelight/terraform-aws-sensor"
2526
26-
# Recommend deploying a sensor per availability zone. Multiple AZs can
27-
# be set but GWLB cross availability zone support is not recommended.
28-
auto_scaling_availability_zones = ["<availability zone>"]
27+
# Multi-AZ support: provide one subnet per availability zone
28+
# The ASG will automatically distribute instances across AZs
29+
availability_zones = ["us-east-1a", "us-east-1b"]
2930
aws_key_pair_name = "<key pair name>"
3031
3132
# Request access to Corelight sensor AMI from your Account Executive
3233
corelight_sensor_ami_id = "<sensor AMI ID>"
3334
license_key = "<your Corelight sensor license key>"
34-
management_subnet_id = "<management subnet>"
35-
monitoring_subnet_id = "<monitoring subnet>"
35+
36+
# Provide one management subnet per AZ (must match availability_zones order)
37+
management_subnet_ids = ["<management subnet in us-east-1a>", "<management subnet in us-east-1b>"]
38+
39+
# Provide one monitoring subnet per AZ (must match availability_zones order)
40+
monitoring_subnet_ids = ["<monitoring subnet in us-east-1a>", "<monitoring subnet in us-east-1b>"]
41+
3642
community_string = "<password for the sensor api>"
3743
vpc_id = "<vpc where the sensor autoscaling group is deployed>"
3844
asg_lambda_iam_role_arn = module.asg_lambda_role.role_arn
39-
45+
4046
fleet_token = "<the pairing token from the Fleet UI>"
4147
fleet_url = "<the URL of the fleet instance from the Fleet UI>"
4248
fleet_server_sslname = "<the ssl name provided by Fleet>"

autoscaling_group.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ resource "aws_autoscaling_group" "sensor_asg" {
99
version = aws_launch_template.sensor_launch_template.latest_version
1010
}
1111

12-
availability_zones = var.availability_zones
12+
vpc_zone_identifier = var.monitoring_subnet_ids
1313
target_group_arns = [aws_lb_target_group.health_check.arn]
1414
health_check_type = "EC2"
1515
health_check_grace_period = 300

data.tf

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@ data "aws_vpc" "provided" {
22
id = var.vpc_id
33
}
44

5-
data "aws_subnet" "monitoring_subnet" {
6-
id = var.monitoring_subnet_id
5+
data "aws_subnet" "monitoring_subnets" {
6+
for_each = toset(var.monitoring_subnet_ids)
7+
id = each.value
8+
}
9+
10+
data "aws_subnet" "management_subnets" {
11+
for_each = toset(var.management_subnet_ids)
12+
id = each.value
713
}

lambda.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ resource "aws_lambda_function" "auto_scaling_lambda" {
1414

1515
environment {
1616
variables = {
17-
TARGET_SUBNET = var.management_subnet_id
17+
TARGET_SUBNETS = jsonencode({ for subnet in data.aws_subnet.management_subnets : subnet.availability_zone => subnet.id })
1818
TARGET_SECURITY_GROUP_ID = aws_security_group.management.id
1919
}
2020
}

launch_template.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ resource "aws_launch_template" "sensor_launch_template" {
1515
}
1616

1717
network_interfaces {
18-
subnet_id = var.monitoring_subnet_id
18+
device_index = 0
1919
security_groups = [aws_security_group.monitoring.id]
2020
delete_on_termination = true
2121
}

load_balancer.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
resource "aws_lb" "sensor_lb" {
22
name = var.sensor_asg_load_balancer_name
33
load_balancer_type = "gateway"
4-
subnets = [data.aws_subnet.monitoring_subnet.id]
4+
subnets = [for subnet in data.aws_subnet.monitoring_subnets : subnet.id]
55
enable_cross_zone_load_balancing = true
66
}
77

modules/iam/lambda/main.tf

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,11 +71,13 @@ data "aws_iam_policy_document" "lambda_nic_manager_policy" {
7171
"ec2:CreateNetworkInterface",
7272
"ec2:CreateTags",
7373
]
74-
resources = [
75-
var.subnet_arn,
76-
var.security_group_arn,
77-
"arn:aws:ec2:*:*:network-interface/*"
78-
]
74+
resources = concat(
75+
var.subnet_arns,
76+
[
77+
var.security_group_arn,
78+
"arn:aws:ec2:*:*:network-interface/*"
79+
]
80+
)
7981
}
8082
}
8183

modules/iam/lambda/variables.tf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@ variable "sensor_autoscaling_group_arn" {
88
type = string
99
}
1010

11-
variable "subnet_arn" {
12-
description = "ARN of the subnet where new ENIs should be created (management)"
13-
type = string
11+
variable "subnet_arns" {
12+
description = "ARNs of the subnets where new ENIs should be created (management), one per availability zone"
13+
type = list(string)
1414
}
1515

1616
variable "security_group_arn" {

scripts/corelight_sensor_asg_nic_manager.py

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import json
23
from enum import Enum
34

45
import boto3
@@ -9,7 +10,7 @@
910

1011
@dataclass
1112
class EnvironmentConfig:
12-
subnet_id: str
13+
subnet_map: dict # Maps AZ to subnet ID
1314
security_group_id: str
1415

1516

@@ -38,7 +39,7 @@ class LifecycleActionResult(Enum):
3839

3940

4041
class EnvironmentVariables(Enum):
41-
TARGET_SUBNET = "TARGET_SUBNET"
42+
TARGET_SUBNETS = "TARGET_SUBNETS"
4243
TARGET_SECURITY_GROUP_ID = "TARGET_SECURITY_GROUP_ID"
4344

4445

@@ -135,7 +136,18 @@ def __init__(self, config: EnvironmentConfig, aws_client: AwsClient):
135136
self.instance_data = {}
136137

137138
def process_event(self, event: Ec2LifecycleHookEvent):
138-
network_interface_id = self.aws_client.create_interface(self.config.subnet_id, self.config.security_group_id)
139+
# Get the AZ of the instance
140+
instance_az = self.instance_data['Placement']['AvailabilityZone']
141+
logging.info(f"Instance {event.instance_id} is in AZ {instance_az}")
142+
143+
# Find the matching management subnet for this AZ
144+
if instance_az not in self.config.subnet_map:
145+
raise Exception(f"No management subnet configured for AZ {instance_az}. Available AZs: {list(self.config.subnet_map.keys())}")
146+
147+
target_subnet_id = self.config.subnet_map[instance_az]
148+
logging.info(f"Using management subnet {target_subnet_id} for AZ {instance_az}")
149+
150+
network_interface_id = self.aws_client.create_interface(target_subnet_id, self.config.security_group_id)
139151
try:
140152
attachment_resp = self.aws_client.attach_interface(network_interface_id, event.instance_id)
141153
self.aws_client.modify_attachment_to_delete_on_termination(attachment_resp["AttachmentId"], network_interface_id)
@@ -179,21 +191,29 @@ def lambda_handler(event, context):
179191
parsed_event: Ec2LifecycleHookEvent = from_aws_event_bridge_json(event)
180192

181193
try:
194+
if not lifecycle_event_svc.should_process_event(parsed_event):
195+
logging.error("Event validation failed, abandoning lifecycle action")
196+
lifecycle_event_svc.complete_lifecycle_action(parsed_event, LifecycleActionResult.ABANDON)
197+
return
198+
182199
lifecycle_event_svc.process_event(parsed_event)
183200
lifecycle_event_svc.complete_lifecycle_action(parsed_event, LifecycleActionResult.CONTINUE)
184201
logging.info("Lifecycle action completed successfully")
185202
except Exception as e:
186-
logging.info(f"failed to process event: {e}")
187-
lifecycle_event_svc.complete_lifecycle_action(parsed_event, LifecycleActionResult.ABANDON)
203+
logging.error(f"failed to process event: {e}")
204+
try:
205+
lifecycle_event_svc.complete_lifecycle_action(parsed_event, LifecycleActionResult.ABANDON)
206+
except Exception as complete_error:
207+
logging.error(f"Failed to complete lifecycle action with ABANDON: {complete_error}")
188208
raise e
189209

190210

191211
def parse_environment() -> EnvironmentConfig:
192-
subnet = os.getenv(EnvironmentVariables.TARGET_SUBNET.value, "")
212+
subnets_json = os.getenv(EnvironmentVariables.TARGET_SUBNETS.value, "")
193213
security_group_id = os.getenv(EnvironmentVariables.TARGET_SECURITY_GROUP_ID.value, "")
194214

195-
if subnet == "":
196-
msg = f"environment variable ${EnvironmentVariables.TARGET_SUBNET.value} is not defined"
215+
if subnets_json == "":
216+
msg = f"environment variable ${EnvironmentVariables.TARGET_SUBNETS.value} is not defined"
197217
logging.error(msg)
198218
raise Exception(msg)
199219

@@ -202,4 +222,11 @@ def parse_environment() -> EnvironmentConfig:
202222
logging.error(msg)
203223
raise Exception(msg)
204224

205-
return EnvironmentConfig(subnet_id=subnet, security_group_id=security_group_id)
225+
try:
226+
subnet_map = json.loads(subnets_json)
227+
except json.JSONDecodeError as e:
228+
msg = f"Failed to parse TARGET_SUBNETS as JSON: {e}"
229+
logging.error(msg)
230+
raise Exception(msg)
231+
232+
return EnvironmentConfig(subnet_map=subnet_map, security_group_id=security_group_id)

scripts/tests/test_lifecycle_event_service.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
lifecycle_action_token="87654321-4321-4321-4321-210987654321"
1818
)
1919

20-
cfg = EnvironmentConfig("foo", "bar")
20+
cfg = EnvironmentConfig({"us-east-1a": "subnet-foo", "us-east-1b": "subnet-bar"}, "sg-12345")
2121
aws_client = AwsClient("foo", "bar")
2222

2323

@@ -90,6 +90,7 @@ def test_process_event_should_raise_exception_on_nic_creation_client_error(mocke
9090
)
9191

9292
svc = LifecycleEventService(cfg, aws_client)
93+
svc.instance_data = {"Placement": {"AvailabilityZone": "us-east-1a"}}
9394

9495
with pytest.raises(botocore.exceptions.ClientError):
9596
svc.process_event(event)
@@ -108,8 +109,11 @@ def test_process_event_should_raise_exception_and_delete_nic_if_attachment_fails
108109

109110
delete_nic_mocker = mocker.patch.object(aws_client, "delete_interface", return_value=None)
110111

112+
svc = LifecycleEventService(cfg, aws_client)
113+
svc.instance_data = {"Placement": {"AvailabilityZone": "us-east-1a"}}
114+
111115
with pytest.raises(botocore.exceptions.ClientError):
112-
LifecycleEventService(cfg, aws_client).process_event(event)
116+
svc.process_event(event)
113117
assert create_nic_mocker.call_count == 1 and \
114118
attach_interface_mocker.call_count == 1 and \
115119
delete_nic_mocker.call_count == 1
@@ -133,8 +137,11 @@ def test_process_event_should_raise_exception_and_delete_nic_if_attachment_modif
133137

134138
delete_nic_mocker = mocker.patch.object(aws_client, "delete_interface", return_value=None)
135139

140+
svc = LifecycleEventService(cfg, aws_client)
141+
svc.instance_data = {"Placement": {"AvailabilityZone": "us-east-1a"}}
142+
136143
with pytest.raises(botocore.exceptions.ClientError):
137-
LifecycleEventService(cfg, aws_client).process_event(event)
144+
svc.process_event(event)
138145

139146
assert create_nic_mocker.call_count == 1 and \
140147
attach_mocker.call_count == 1 and \

0 commit comments

Comments
 (0)