---
# Kinesis-Setup-CF-template.yaml
# (GitHub page chrome and the scraped line-number gutter removed; the
# CloudFormation template body follows.)
Transform: AWS::SecretsManager-2020-07-23
Description: "Environment Setup for Streaming Data Pipeline - Github TanishkaMarrott"

Parameters:
  # Customizable input parameters supplied when deploying the stack.
  EEAssetsBucket:
    Description: "Region-specific assets"
    Type: String
    Default: "ee-assets-prod-us-east-1"
  EEAssetsKeyPrefix:
    Description: "S3 key prefix for storing module assets"
    Type: String
    Default: "modules/599e7c685a254c2b892cdbf58a7b3b4f/v1/"
  KDADatabaseName:
    Description: "Name of Database for KDA Application."
    Type: String
    MinLength: "1"
    MaxLength: "255"
    Default: "kinesislab"
  KDAKinesisStreamName:
    Description: "Name of Kinesis Stream for ingesting data."
    Type: String
    MinLength: "1"
    MaxLength: "255"
    Default: "input-stream"

Mappings:
  ConfigurationMap:
    Cloud9:
      # IP address range reserved for Cloud9 instances within the VPC.
      CidrBlock: "10.43.0.0/28"
Resources:
  # ---------- S3 buckets ----------
  # Raw taxi-trip landing bucket. Bucket names embed a pseudo-random GUID
  # (last segment of the stack ID) for global uniqueness.
  TaxiTripDataSet:
    Type: AWS::S3::Bucket
    DeletionPolicy: Retain
    Properties:
      BucketName: !Sub
        - 'nyctaxitrips-${AWS::AccountId}-${AWS::Region}-${RandomGUID}'
        - RandomGUID: !Select [0, !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]]]
      BucketEncryption:
        ServerSideEncryptionConfiguration:
          - ServerSideEncryptionByDefault:
              SSEAlgorithm: AES256
      VersioningConfiguration:
        Status: Enabled
      LifecycleConfiguration:
        Rules:
          - Id: ExpireOldObjects
            Status: Enabled
            ExpirationInDays: 90
            NoncurrentVersionExpirationInDays: 30
      Tags:
        - Key: "Project"
          Value: "Kinesis Data Pipeline"

  # Curated (post-transformation) data bucket; same encryption,
  # versioning and lifecycle settings as the raw bucket.
  CuratedDataSet:
    Type: AWS::S3::Bucket
    DeletionPolicy: Retain
    Properties:
      BucketName: !Sub
        - 'curateddata-${AWS::AccountId}-${AWS::Region}-${RandomGUID}'
        - RandomGUID: !Select [0, !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]]]
      BucketEncryption:
        ServerSideEncryptionConfiguration:
          - ServerSideEncryptionByDefault:
              SSEAlgorithm: AES256
      VersioningConfiguration:
        Status: Enabled
      LifecycleConfiguration:
        Rules:
          - Id: ExpireOldObjects
            Status: Enabled
            ExpirationInDays: 90
            NoncurrentVersionExpirationInDays: 30
      Tags:
        - Key: "Project"
          Value: "Kinesis Data Pipeline"
#Cloud9 Instance
Cloud9:
Type: AWS::Cloud9::EnvironmentEC2
Properties:
AutomaticStopTimeMinutes: 30
Description: "Development environment for real-time streaming with Amazon Kinesis. This Cloud9 IDE is optimized for building and testing Kinesis data processing pipelines."
InstanceType: !Sub "t3.medium" # Managed instance types for flexibility and cost-efficiency
Name: !Sub "KinesisRealTimeStreaming-${AWS::StackName}"
ImageId: amazonlinux-2-x86_64
#Glue Database
Database:
Type: AWS::Glue::Database
Properties:
CatalogId: !Ref 'AWS::AccountId'
DatabaseInput:
Name: !Ref KDADatabaseName
Description: Database for KDA Application Source and Target Tables
#Kinesis Analytics IAM Role
KinesisAnalyticsRole:
Type: AWS::IAM::Role
Properties:
RoleName: !Sub 'Kinesis-analytics-KDA-${AWS::StackName}'
AssumeRolePolicyDocument:
Version: '2012-10-17'
Statement:
- Effect: Allow
Principal:
Service:
- kinesisanalytics.amazonaws.com
Action:
- sts:AssumeRole
Policies:
- PolicyName: !Sub 'Kinesis-analytics-KDA-${AWS::StackName}'
PolicyDocument:
Version: '2012-10-17'
Statement:
- Sid: ReadOnlySid
Effect: Allow
Action:
- ec2:DescribeVpcs
- ec2:DescribeDhcpOptions
- ec2:DescribeSubnets
- ec2:DescribeSecurityGroups
Resource:
- "*"
- Sid: LogGroupSid
Effect: Allow
Action:
- logs:CreateLogGroup
- logs:CreateLogStream
- logs:PutLogEvents
- logs:AssociateKmsKey
Resource:
- arn:aws:logs:*:*:/aws-glue/*
- arn:aws:logs:*:*:log-group/aws/kinesis-analytics/*
- Sid: GlueTableSid
Effect: Allow
Action:
- glue:GetConnection
- glue:GetTable
- glue:GetTables
- glue:CreateTable
- glue:UpdateTable
- glue:GetUserDefinedFunction
- glue:GetPartitions
- glue:DeleteTable
- glue:GetDatabase
- glue:GetDatabases
- glue:GetUserDefinedFunction
Resource:
- "*"
- Sid: KinesisEfoConsumer
Effect: Allow
Action:
- kinesis:DescribeStreamConsumer
- kinesis:SubscribeToShard
Resource:
- !Sub 'arn:aws:kinesis:${AWS::Region}:${AWS::AccountId}:stream/${KDAKinesisStreamName}/consumer/*'
- !Sub 'arn:aws:kinesis:${AWS::Region}:${AWS::AccountId}:stream/${KDAKinesisStreamName}/consumer/*'
- Sid: KinesisStreamSid
Effect: Allow
Action:
- kinesis:GetShardIterator
- kinesis:GetRecords
- kinesis:PutRecords
- kinesis:DescribeStream
- kinesis:DescribeStreamSummary
- kinesis:RegisterStreamConsumer
- kinesis:DeregisterStreamConsumer
Resource:
- !Sub 'arn:aws:kinesis:${AWS::Region}:${AWS::AccountId}:stream/${KDAKinesisStreamName}'
- Sid: KinesisStreamListShardsID
Effect: Allow
Action:
- kinesis:*
Resource:
- "*"
- Sid: S3DataAccessSid
Effect: Allow
Action:
- s3:*
Resource:
- !Sub 'arn:aws:s3:::${TaxiTripDataSet}'
- !Sub 'arn:aws:s3:::${CuratedDataSet}'
- !Sub 'arn:aws:s3:::${TaxiTripDataSet}/*'
- !Sub 'arn:aws:s3:::${CuratedDataSet}/*'
- Sid: KinesisAnalyticsSid
Effect: Allow
Action:
- kinesisanalytics:DescribeApplication
Resource:
- !Sub "arn:aws:kinesisanalytics:${AWS::Region}:${AWS::AccountId}:application/KDA-studio-1-*"
- Sid: S3AssetsBucket
Effect: Allow
Action:
- s3:GetObject
- s3:GetObjectVersion
Resource:
- !Sub 'arn:aws:s3:::${EEAssetsBucket}/'
- !Sub 'arn:aws:s3:::${EEAssetsBucket}/*'
#Kinesis Analytics Studio Application
KinesisAnalyticsStudio:
Type: AWS::KinesisAnalyticsV2::Application
Properties:
ApplicationName: !Sub 'KDA-studio-1-${AWS::StackName}'
ApplicationDescription: Kinesis Flink SQL Demo
RuntimeEnvironment: ZEPPELIN-FLINK-2_0
ApplicationMode: INTERACTIVE
ServiceExecutionRole: !GetAtt 'KinesisAnalyticsRole.Arn'
ApplicationConfiguration:
FlinkApplicationConfiguration:
MonitoringConfiguration:
LogLevel: INFO # Saving on Log Costs
MetricsLevel: APPLICATION # Monitoring/ Debugging metrics
ParallelismConfiguration:
ConfigurationType: CUSTOM
Parallelism: 4
ParallelismPerKPU: 1
AutoScalingEnabled: TRUE # We're optimising on resource usage
CheckpointConfiguration: # Enable checkpointing for fault tolerance
ConfigurationType: DEFAULT
CheckpointingEnabled: TRUE
CheckpointInterval: 300000 # Checkpoint interval in milliseconds
MinPauseBetweenCheckpoints: 10000 # Optimising on performance
ZeppelinApplicationConfiguration:
CatalogConfiguration:
GlueDataCatalogConfiguration:
DatabaseARN: !Sub 'arn:aws:glue:${AWS::Region}:${AWS::AccountId}:database/${Database}'
CustomArtifactsConfiguration:
- ArtifactType: DEPENDENCY_JAR
MavenReference:
GroupId: org.apache.flink
ArtifactId: flink-sql-connector-kinesis_2.12
Version: 1.13.2
- ArtifactType: DEPENDENCY_JAR
MavenReference:
GroupId: software.amazon.msk
ArtifactId: aws-msk-iam-auth
Version: 1.1.0
- ArtifactType: DEPENDENCY_JAR
S3ContentLocation:
BucketARN: !Sub 'arn:aws:s3:::${EEAssetsBucket}'
FileKey: !Sub '${EEAssetsKeyPrefix}flink-sql-connector-elasticsearch7_2.11-1.13.2.jar'
# Data Transformation LambdaFunction
LambdaFunction:
Type: AWS::Lambda::Function
Properties:
Handler: "index.lambda_handler"
Role: !GetAtt LambdaRole.Arn
Runtime: "python3.9"
Timeout: 10
MemorySize: 256 # Optimized memory allocation based on profiling
FunctionName: !Sub "NYCTaxiTrips-DataTransformation-${AWS::StackName}"
Code:
ZipFile: |
import base64
import json
from datetime import datetime
import logging
# Setup logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def lambda_handler(event, context):
output = []
for record in event['records']:
logger.info(f"Processing record ID: {record['recordId']}")
try:
payload = base64.b64decode(record['data']).decode('utf-8')
trip = json.loads(payload)
# Perform lightweight validation and transformation
if not validate_data(trip):
logger.error(f"Validation failed for record {record['recordId']}")
continue
# Simple transformation to ensure data format consistency
trip['pickupDate'] = convert_to_iso(trip['pickupDate'])
trip['dropoffDate'] = convert_to_iso(trip['dropoffDate'])
# Serialize for Kinesis Firehose to send to AWS Glue for further processing
output_record = {
'recordId': record['recordId'],
'result': 'Ok',
'data': base64.b64encode(json.dumps(trip).encode('utf-8')).decode('utf-8')
}
output.append(output_record)
except Exception as e:
logger.error(f"Error processing record {record['recordId']}: {str(e)}")
if is_recoverable(e) and should_retry(e):
logger.info("Error is recoverable, retrying...")
continue # Implement retry logic as necessary
else:
continue
logger.info(f'Successfully processed {len(output)} out of {len(event["records"])} records.')
return {'records': output}
def validate_data(trip):
"""Perform simple data validation checks."""
try:
# Example validation: Ensure longitude and latitude are within bounds
if not (-180 <= float(trip['pickupLongitude']) <= 180) and not (-90 <= float(trip['pickupLatitude']) <= 90):
return False
return True
except ValueError:
return False
def convert_to_iso(date_str):
"""Convert date strings to ISO format."""
try:
return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S').isoformat()
except ValueError:
return None
def should_retry(exception):
"""Decide if a retry should be attempted based on the type of error."""
error_str = str(exception).lower()
return 'throttlingexception' in error_str or 'serviceunavailable' in error_str
ReservedConcurrentExecutions: 10 # Reserved provisioned concurrency for critical function
# Enable AWS X-Ray for tracing
TracingConfig:
Mode: Active
LambdaHandlerVersion:
Type: AWS::Lambda::Version
Properties:
FunctionName: !Ref LambdaFunction
LambdaHandlerAlias:
Type: AWS::Lambda::Alias
Properties:
FunctionName: !Ref LambdaFunction
FunctionVersion: !GetAtt LambdaHandlerVersion.Version
Name: "Prod"
ProvisionedConcurrencyConfig:
ProvisionedConcurrentExecutions: 5
# Enables the automated management of Kinesis Data Analytics applications within a CloudFormation stack,
# allowing users to start and stop applications as part of their infrastructure deployment and management processes.
StartKDALambdaFunction:
Type: AWS::Lambda::Function
Properties:
Handler: "index.lambda_handler"
Role: !GetAtt StartKDALambdaRole.Arn
Runtime: "python3.9"
Timeout: 10
FunctionName: !Sub "StartKDA-${AWS::StackName}"
Code:
ZipFile: |
import os
import json
import boto3
import cfnresponse
client = boto3.client('kinesisanalyticsv2')
def lambda_handler(event, context):
print(event)
responseData = {}
if event['RequestType'] == 'Delete':
cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData, "CustomResourcePhysicalID")
return
application_name = event['ResourceProperties']['ApplicationName']
try:
response = client.start_application(ApplicationName=application_name)
print(response)
responseValue = "Started the Application"
responseData['Data'] = responseValue
except Exception as e:
# Log the error
print(f"Error: {str(e)}")
# Set failure response data with error message
responseData['Data'] = f"Failed to start the Application: {str(e)}"
# Send CloudFormation response with appropriate status and data
cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData, "CustomResourcePhysicalID")
# Log function end
print('Function execution completed.')
# Log function execution duration
log_response = lambda_logs.describe_log_streams(
logGroupName=context.log_group_name,
logStreamNamePrefix=context.log_stream_name
)
print(f"Function execution duration: {log_response['logStreams'][0]['storedBytes']} bytes processed.")
StartKDALambdaHandlerVersion:
Type: AWS::Lambda::Version
Properties:
FunctionName: !Ref StartKDALambdaFunction
StartKDALambdaHandlerAlias:
Type: AWS::Lambda::Alias
Properties:
FunctionName: !Ref StartKDALambdaFunction
FunctionVersion: !GetAtt StartKDALambdaHandlerVersion.Version
Name: "Prod"
ProvisionedConcurrencyConfig:
ProvisionedConcurrentExecutions: 5
#StartKDA Lambda Role
StartKDALambdaRole:
Type: AWS::IAM::Role
Properties:
AssumeRolePolicyDocument:
Version: '2012-10-17'
Statement:
- Effect: Allow
Principal:
Service:
- lambda.amazonaws.com
Action:
- sts:AssumeRole
Path: "/"
Policies:
- PolicyName: LambdaFunctionPolicy
PolicyDocument:
Version: '2012-10-17'
Statement:
- Effect: Allow
Action:
- logs:CreateLogGroup
Resource: !Join [ ":" , ["arn:aws:logs",!Ref AWS::Region,!Ref AWS::AccountId,"*" ] ]
- Effect: Allow
Action:
- logs:CreateLogStream
- logs:PutLogEvents
Resource: !Join [ ":" , ["arn:aws:logs",!Ref AWS::Region,!Ref AWS::AccountId,"log-group:/aws/lambda/StartKDA-*:*" ] ]
- Effect: Allow
Action:
- kinesisanalytics:StartApplication
Resource: !Join [ ":" , ["arn:aws:kinesisanalytics",!Ref AWS::Region,!Ref AWS::AccountId,"application/KDA-studio-*" ] ]
#Start KDA Application
StartKDA:
Type: "Custom::StartKDA"
Properties:
ServiceToken: !GetAtt StartKDALambdaFunction.Arn
ApplicationName: !Sub 'KDA-studio-1-${AWS::StackName}'
DependsOn: KinesisAnalyticsStudio
#Admin password for OpenSearch Instance
OpenSearchPassword:
Type: AWS::SecretsManager::Secret
Properties:
GenerateSecretString:
SecretStringTemplate: '{"username": "admin"}'
GenerateStringKey: password
PasswordLength: 16
ExcludeCharacters: "\"@/\\"
#provisions a secure and scalable OpenSearch domain
#with encryption both in transit and at rest, access control policies, and EBS volume configurations.
OpenSearchInstance:
Type: AWS::OpenSearchService::Domain
Properties:
DomainName: "tanishka-domain"
EngineVersion: 'OpenSearch_1.2'
ClusterConfig:
InstanceCount: 2 #Fault Tolerance & Scalabality
InstanceType: "c5.large" # Updated the instance type - this will be compute-optimised
AccessPolicies:
Version: '2012-10-17'
Statement:
- Effect: 'Allow'
Principal:
AWS: '*'
Action: 'es:*'
Resource: !Sub "arn:aws:es:${AWS::Region}:${AWS::AccountId}:domain/tanishka-domain/*"
AdvancedSecurityOptions:
Enabled: true
InternalUserDatabaseEnabled: true
MasterUserOptions:
MasterUserName: !Sub "{{resolve:secretsmanager:${OpenSearchPassword}::username}}"
MasterUserPassword: !Sub "{{resolve:secretsmanager:${OpenSearchPassword}::password}}"
NodeToNodeEncryptionOptions:
Enabled: true
EncryptionAtRestOptions:
Enabled: true
DomainEndpointOptions:
EnforceHTTPS: true
EBSOptions:
EBSEnabled: true
VolumeSize: 20 #Volume Size based on Data Needs
VolumeType: gp2
Outputs:
  TaxiTripsS3Bucket:
    Description: "Taxi Trip Data Set S3 Bucket"
    Value: !Ref TaxiTripDataSet
  CuratedS3Bucket:
    Description: "Curated Data Set S3 Bucket"
    Value: !Ref CuratedDataSet
  Cloud9URL:
    Description: Cloud9 Environment
    # !Sub yields the same URL string as the original Fn::Join of the
    # console prefix and the environment ID.
    Value: !Sub 'https://${AWS::Region}.console.aws.amazon.com/cloud9/ide/${Cloud9}'
  LambdaFunctionArn:
    Description: Lambda Function ARN
    Value: !GetAtt LambdaFunction.Arn
  GlueDatabaseName:
    Description: Name of the Glue Database
    Value: !Ref Database
  KinesisAnalyticsStudio:
    Description: Kinesis Analytics Studio
    Value: !Ref KinesisAnalyticsStudio
  OpenSearchSecretsManagerReference:
    Description: OpenSearch credentials stored in secrets manager
    # Secret ARNs are ':'-separated; index 6 is the secret name segment.
    Value: !Select [6, !Split [":", !Ref OpenSearchPassword]]