Skip to content

Commit 1846a73

Browse files
author
kendavis2
committed
monitor eventless changes
1 parent cf10dec commit 1846a73

File tree

5 files changed

+121
-27
lines changed

5 files changed

+121
-27
lines changed

component/app/model_app.go

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,25 @@ import (
1111
"go.mongodb.org/mongo-driver/bson/primitive"
1212
)
1313

14-
const Undetermined = "undetermined"
14+
const (
15+
AppTypeService = "service"
16+
AppTypeScheduledTask = "scheduled-task"
17+
AppTypeLambda = "lambda"
18+
AppTypeS3 = "s3"
19+
20+
// FailedTaskExpiration is the number of minutes for which a failed AWS ECS task is still considered.
21+
//
22+
// Currently the AWS ECS container restart throttle may wait a maximum of 15 minutes before
23+
// attempting a restart. 20 minutes has been chosen as the time to consider failed tasks.
24+
// This ensures a service does not appear to be healthy due to lack of attempted restarts
25+
// while reducing the time it takes to determine a service is healthy from 1 hour
26+
// down to 20 minutes.
27+
//
28+
// https://docs.aws.amazon.com/AmazonECS/latest/developerguide/service-throttle-logic.html
29+
FailedTaskExpiration = 20
30+
31+
Undetermined = "undetermined"
32+
)
1533

1634
// Application ...
1735
type Application struct {
@@ -56,6 +74,19 @@ func (a Application) GetInstances(filter []string) map[string]Instance {
5674
return instances
5775
}
5876

77+
// GetErrorInstances returns the instances whose current state has a non-nil error, keyed as in a.Instances.
78+
func (a Application) GetErrorInstances() map[string]Instance {
79+
instances := map[string]Instance{}
80+
81+
for k, i := range a.Instances {
82+
if i.CurrentState.Error != nil {
83+
instances[k] = i
84+
}
85+
}
86+
87+
return instances
88+
}
89+
5990
// Matches ...
6091
func (a Application) Matches(filter Filter) bool {
6192
for _, t := range filter.Terms {

component/build/registry.go

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,6 @@ type buildView struct {
2424
Registry bool `json:"registry"`
2525
}
2626

27-
const (
28-
appTypeService = "service"
29-
appTypeScheduledTask = "scheduled-task"
30-
appTypeLambda = "lambda"
31-
appTypeS3 = "s3"
32-
)
33-
3427
// GetBuilds ...
3528
func GetBuilds(c echo.Context) error {
3629
ctx := c.Get("ctx").(mongo.SessionContext)
@@ -52,7 +45,7 @@ func GetBuilds(c echo.Context) error {
5245
builds := map[string]app.Definition{}
5346

5447
switch apps[0].Type {
55-
case appTypeService, appTypeScheduledTask:
48+
case app.AppTypeService, app.AppTypeScheduledTask:
5649
builds, err = task.ListDefinitions(sourceRegistry.Task)
5750
if err != nil {
5851
return err
@@ -83,7 +76,7 @@ func GetBuilds(c echo.Context) error {
8376
builds[sourceRegistry.FormatVersion()] = sourceRegistry.Task.Definition
8477
}
8578

86-
case appTypeLambda:
79+
case app.AppTypeLambda:
8780
if len(sourceRegistry.S3RegistryBucket) > 0 {
8881
builds, err = s3.ListDefinitions(sourceRegistry)
8982
if err != nil {
@@ -95,7 +88,7 @@ func GetBuilds(c echo.Context) error {
9588
return err
9689
}
9790
}
98-
case appTypeS3:
91+
case app.AppTypeS3:
9992
builds, err = s3.ListDefinitions(sourceRegistry)
10093
if err != nil {
10194
return err

component/integration/aws/service/populate.go

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"errors"
55
"fmt"
66
"regexp"
7+
"time"
78

89
"github.com/turnerlabs/udeploy/component/app"
910

@@ -78,7 +79,7 @@ func populateInst(i app.Instance, scalableTargets []*applicationautoscaling.Scal
7879

7980
tasks := append(runningTasks, stoppedTasks...)
8081

81-
if err := checkError(svcs, stoppedTasks, i); err != nil {
82+
if err := checkError(svcs, stoppedTasks, app.FailedTaskExpiration*time.Minute); err != nil {
8283
state.SetError(err)
8384
state.SetPending()
8485
} else if isPending(svcs) {
@@ -108,17 +109,30 @@ func populateInst(i app.Instance, scalableTargets []*applicationautoscaling.Scal
108109
return i, state, err
109110
}
110111

111-
i.Links = append(i.Links, app.Link{
112-
Generated: true,
113-
Description: "AWS Console Service Logs",
114-
Name: "logs",
115-
URL: fmt.Sprintf("https://console.aws.amazon.com/ecs/home?region=%s#/clusters/%s/services/%s/logs",
116-
region, i.Cluster, i.Service),
117-
})
112+
linkName := "logs"
113+
if missingLink(linkName, i.Links) {
114+
i.Links = append(i.Links, app.Link{
115+
Generated: true,
116+
Description: "AWS Console Service Logs",
117+
Name: linkName,
118+
URL: fmt.Sprintf("https://console.aws.amazon.com/ecs/home?region=%s#/clusters/%s/services/%s/logs",
119+
region, i.Cluster, i.Service),
120+
})
121+
}
118122

119123
return i, state, nil
120124
}
121125

126+
func missingLink(name string, links []app.Link) bool {
127+
for _, l := range links {
128+
if l.Generated && l.Name == name {
129+
return false
130+
}
131+
}
132+
133+
return true
134+
}
135+
122136
func getRegion(arn string) (string, error) {
123137
tag := regexp.MustCompile("([a-z]{2}-[a-z]*-[0-9]{1})")
124138

@@ -134,14 +148,14 @@ func getRegion(arn string) (string, error) {
134148
return "", errors.New("failed to get region")
135149
}
136150

137-
func checkError(svcs *ecs.Service, tasks []*ecs.Task, inst app.Instance) error {
151+
func checkError(svcs *ecs.Service, tasks []*ecs.Task, errorExpiration time.Duration) error {
138152

139153
if *svcs.DesiredCount == 0 {
140154
return nil
141155
}
142156

143-
if count, err := getTaskError(tasks); err != nil {
144-
return app.InstanceError{Problem: fmt.Sprintf("%d failed task(s) (%s)", count, err)}
157+
if _, err := getServiceError(tasks, errorExpiration); err != nil {
158+
return app.InstanceError{Problem: err.Error()}
145159
}
146160

147161
return nil
@@ -182,15 +196,18 @@ func getTaskDetails(svc *ecs.ECS, inst app.Instance, tasks []*ecs.Task, status,
182196
return getTaskDetails(svc, inst, tasks, status, nextToken)
183197
}
184198

185-
func getTaskError(tasks []*ecs.Task) (int, error) {
199+
func getServiceError(tasks []*ecs.Task, expiration time.Duration) (int, error) {
186200
var reason error
187201
count := 0
188202

189203
for _, t := range tasks {
190204
if t.StopCode != nil && t.StoppedReason != nil {
191205
if *t.StopCode != ecs.TaskStopCodeUserInitiated {
192-
reason = errors.New(*t.StoppedReason)
193-
count++
206+
207+
if time.Now().Sub(*t.ExecutionStoppedAt) < expiration {
208+
reason = errors.New(*t.StoppedReason)
209+
count++
210+
}
194211
}
195212
}
196213
}
@@ -199,8 +216,6 @@ func getTaskError(tasks []*ecs.Task) (int, error) {
199216
}
200217

201218
func isPending(svc *ecs.Service) bool {
202-
203-
//return (len(svc.Deployments) > 1 && *svc.DesiredCount > 0) || *svc.PendingCount > 0 || *svc.DesiredCount > *svc.RunningCount
204219
return (len(svc.Deployments) > 1 && *svc.DesiredCount > 0) || *svc.DesiredCount > 0 && *svc.RunningCount == 0
205220
}
206221

component/sync/aws.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
package sync
22

33
import (
4+
"log"
5+
"time"
6+
47
"github.com/turnerlabs/udeploy/component/action"
58
"github.com/turnerlabs/udeploy/component/app"
69
"github.com/turnerlabs/udeploy/component/cache"
@@ -9,6 +12,49 @@ import (
912
"go.mongodb.org/mongo-driver/mongo"
1013
)
1114

15+
// AWSPollEventlessChanges monitors and updates the instance state
16+
// for specific changes that do not trigger AWS events. This polling
17+
// technique is only used when changes cannot be detected by AWS
18+
// events. To avoid AWS rate limits, this technique should be used
19+
// SPARINGLY.
20+
//
21+
// Currently AWS does not fire events when errors expire from ECS Task
22+
// history. Since monitored errors in ECS Task history cause an
23+
// application to display an error state, the history must be monitored
24+
// to determine when a service returns to a healthy state.
25+
func AWSPollEventlessChanges(ctx mongo.SessionContext) error {
26+
27+
ticker := time.NewTicker(app.FailedTaskExpiration * time.Minute)
28+
29+
for {
30+
select {
31+
case <-ticker.C:
32+
for _, a := range cache.Apps.GetAll() {
33+
if a.Type != app.AppTypeService {
34+
continue
35+
}
36+
37+
targeted := a.GetErrorInstances()
38+
if len(targeted) == 0 {
39+
continue
40+
}
41+
42+
log.Printf("Updating App: %s\n", a.Name)
43+
44+
supplemented, err := supplement.Instances(ctx, a.Type, targeted, false)
45+
if err != nil {
46+
log.Printf("failed to update %s state (%s)\n", a.Name, err)
47+
continue
48+
}
49+
50+
cache.Apps.UpdateInstances(a.Name, supplemented)
51+
52+
time.Sleep(time.Second)
53+
}
54+
}
55+
}
56+
}
57+
1258
// AWSWatchEvents ...
1359
func AWSWatchEvents(ctx mongo.SessionContext) error {
1460
return sqs.MonitorChanges(ctx, handleChange)

monitor.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,15 @@ func monitorChanges(ctx context.Context, sess mongo.Session) {
4646
}
4747
}()
4848

49+
go func() {
50+
if err := mongo.WithSession(ctx, sess, func(sctx mongo.SessionContext) error {
51+
log.Fatal(sync.AWSPollEventlessChanges(sctx))
52+
return nil
53+
}); err != nil {
54+
log.Fatal(err)
55+
}
56+
}()
57+
4958
//--------------------------------------------------
5059
//- Watch for cloudwatch alarms
5160
//--------------------------------------------------

0 commit comments

Comments
 (0)