Skip to content

Commit 191e351

Browse files
authored
Relaunch workers if they fail (#34)
* Relaunch workers if they fail (a goroutine can fail). WIP
* Deal with connection leaks
* Create sub-workers after the pg connection is established
* Update todo.org
1 parent 14aa4c1 commit 191e351

File tree

3 files changed

+53
-44
lines changed

3 files changed

+53
-44
lines changed

cmd/sandbox-api/main.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ func main() {
153153
// Create AWS STS client
154154
worker := NewWorker(*baseHandler)
155155

156-
go worker.WatchLifecycleDBChannels()
156+
go worker.WatchLifecycleDBChannels(context.Background())
157157

158158
// ---------------------------------------------------------------------
159159
// Middlewares

cmd/sandbox-api/workers.go

+47-39
Original file line numberDiff line numberDiff line change
@@ -90,10 +90,13 @@ func (w Worker) Execute(j *models.LifecycleResourceJob) error {
9090
}
9191

9292
// consumeChannels is a goroutine that listens to the golang channels and processes the events
93-
func (w Worker) consumeChannels(LifecycleResourceJobsStatusChannel chan string, LifecyclePlacementJobsStatusChannel chan string) {
93+
func (w Worker) consumeChannels(ctx context.Context, LifecycleResourceJobsStatusChannel chan string, LifecyclePlacementJobsStatusChannel chan string) {
9494
WorkerLoop:
9595
for {
9696
select {
97+
case <-ctx.Done():
98+
log.Logger.Warn("Context cancelled, exiting consumeChannels worker")
99+
return
97100
case msg := <-LifecycleResourceJobsStatusChannel:
98101
id, err := strconv.Atoi(msg)
99102
if err != nil {
@@ -200,7 +203,7 @@ WorkerLoop:
200203
}
201204
}
202205

203-
func (w Worker) WatchLifecycleDBChannels() error {
206+
func (w Worker) WatchLifecycleDBChannels(ctx context.Context) error {
204207

205208
// Create channels for resource lifecycle events
206209
LifecycleResourceJobsStatusChannel := make(chan string)
@@ -213,55 +216,60 @@ func (w Worker) WatchLifecycleDBChannels() error {
213216
return err
214217
}
215218

219+
ctx, cancel := context.WithCancel(ctx)
220+
// If this goroutine stops, stop all of its workers and restart it
221+
defer func() {
222+
// Log that we are restarting
223+
log.Logger.Warn("Restarting worker WatchLifecycleDBChannels and its workers")
224+
cancel()
225+
// sleep for 5 seconds before restarting
226+
time.Sleep(5 * time.Second)
227+
228+
go w.WatchLifecycleDBChannels(context.Background())
229+
}()
230+
231+
conn, err := w.Dbpool.Acquire(context.Background())
232+
if err != nil {
233+
log.Logger.Error("Error acquiring connection", "error", err)
234+
return err
235+
}
236+
defer conn.Release()
237+
238+
channels := []string{
239+
"lifecycle_placement_jobs_status_channel",
240+
"lifecycle_resource_jobs_status_channel",
241+
}
242+
for _, pgChan := range channels {
243+
_, err = conn.Exec(context.Background(), fmt.Sprintf("LISTEN %s", pgChan))
244+
if err != nil {
245+
log.Logger.Error("Error listening to the channel", "channel", pgChan, "error", err)
246+
return err
247+
}
248+
log.Logger.Info("Listening to channel", "channel", pgChan)
249+
}
250+
216251
// Create goroutines to listen to the Go channels
217252
for i := 0; i < workers; i++ {
218-
go w.consumeChannels(LifecycleResourceJobsStatusChannel, LifecyclePlacementJobsStatusChannel)
253+
go w.consumeChannels(ctx, LifecycleResourceJobsStatusChannel, LifecyclePlacementJobsStatusChannel)
219254
}
220255

221-
// Listen to the DB channels and publish to the Golang channels
222-
MainLoop:
223256
for {
224-
time.Sleep(1 * time.Second)
225-
conn, err := w.Dbpool.Acquire(context.Background())
226-
defer conn.Release()
227-
257+
notification, err := conn.Conn().WaitForNotification(context.Background())
228258
if err != nil {
229-
log.Logger.Error("Error acquiring connection", "error", err)
230-
continue
231-
}
232-
233-
channels := []string{
234-
"lifecycle_placement_jobs_status_channel",
235-
"lifecycle_resource_jobs_status_channel",
236-
}
237-
for _, pgChan := range channels {
238-
_, err = conn.Exec(context.Background(), fmt.Sprintf("LISTEN %s", pgChan))
239-
if err != nil {
240-
log.Logger.Error("Error listening to the channel", "channel", pgChan, "error", err)
241-
continue MainLoop
242-
}
243-
log.Logger.Info("Listening to channel", "channel", pgChan)
259+
log.Logger.Error("Error while listening to the channel", "error", err)
260+
return err
244261
}
245262

246-
for {
247-
notification, err := conn.Conn().WaitForNotification(context.Background())
248-
if err != nil {
249-
log.Logger.Error("Error while listening to the channel", "error", err)
250-
break MainLoop // Restart pool acquisition
251-
}
252-
253-
log.Logger.Debug("Notification received", "PID", notification.PID, "Channel", notification.Channel, "Payload", notification.Payload)
263+
log.Logger.Debug("Notification received", "PID", notification.PID, "Channel", notification.Channel, "Payload", notification.Payload)
254264

255-
switch notification.Channel {
256-
case "lifecycle_placement_jobs_status_channel":
257-
LifecyclePlacementJobsStatusChannel <- notification.Payload
265+
switch notification.Channel {
266+
case "lifecycle_placement_jobs_status_channel":
267+
LifecyclePlacementJobsStatusChannel <- notification.Payload
258268

259-
case "lifecycle_resource_jobs_status_channel":
260-
LifecycleResourceJobsStatusChannel <- notification.Payload
261-
}
269+
case "lifecycle_resource_jobs_status_channel":
270+
LifecycleResourceJobsStatusChannel <- notification.Payload
262271
}
263272
}
264-
return nil
265273
}
266274

267275
// NewWorker creates a new worker

todo.org

+5-4
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414
*** DONE GET one
1515
*** DONE PUT (mark for cleanup)
1616
** Lifecycle
17-
*** TODO STOP
18-
*** TODO START
19-
*** TODO STATUS
17+
*** DONE STOP
18+
*** DONE START
19+
*** DONE STATUS
2020
* DONE validate requests using the openAPI schema
2121
* DONE embed the schema in binary
2222
* DONE add credential type
@@ -55,10 +55,11 @@
5555
*** DONE proper lifecyclePlacementResponse with examples
5656
* DONE OpenShift limit and req for pods
5757
* TODO patch clients (sandbox-list, mark_for_cleanup script, etc) to use the sandbox-API instead of dynamodb
58-
* TODO unit tests and fixture/functional tests
5958
* TODO documentation coverage
6059
* TODO move handlers per version?
6160
* Post MVP
61+
** TODO unit tests and fixture/functional tests
62+
** TODO prometheus endpoint and metrics
6263
** TODO Encrypt IAM secret key using AWS KMS instead of ansible-vault. Use and support both while transitioning
6364
** TODO aws lambda function to replicate changes from dynamoDB to postgresql
6465
** TODO add POST /refresh to get new access token

0 commit comments

Comments
 (0)