Skip to content

Commit

Permalink
Ensure a switch update during allocation of multiple machines works (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
majst01 authored Mar 17, 2022
1 parent 3e95365 commit 69466a9
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 13 deletions.
46 changes: 39 additions & 7 deletions cmd/metal-api/internal/service/machine-service.go
Original file line number Diff line number Diff line change
Expand Up @@ -754,12 +754,22 @@ func (r machineResource) registerMachine(request *restful.Request, response *res
}

old := *m
err = connectMachineWithSwitches(r.ds, m)
if checkError(request, response, utils.CurrentFuncName(), err) {
return
}
err = retry.Do(
func() error {
err := connectMachineWithSwitches(r.ds, m)
if err != nil {
return err
}
return r.ds.UpdateMachine(&old, m)
},
retry.Attempts(10),
retry.RetryIf(func(err error) bool {
return strings.Contains(err.Error(), datastore.EntityAlreadyModifiedErrorMessage)
}),
retry.DelayType(retry.CombineDelay(retry.BackOffDelay, retry.RandomDelay)),
retry.LastErrorOnly(true),
)

err = r.ds.UpdateMachine(&old, m)
if checkError(request, response, utils.CurrentFuncName(), err) {
return
}
Expand Down Expand Up @@ -1712,7 +1722,18 @@ func (r machineResource) finalizeAllocation(request *restful.Request, response *
}
}

_, err = setVrfAtSwitches(r.ds, m, vrf)
err = retry.Do(
func() error {
_, err := setVrfAtSwitches(r.ds, m, vrf)
return err
},
retry.Attempts(10),
retry.RetryIf(func(err error) bool {
return strings.Contains(err.Error(), datastore.EntityAlreadyModifiedErrorMessage)
}),
retry.DelayType(retry.CombineDelay(retry.BackOffDelay, retry.RandomDelay)),
retry.LastErrorOnly(true),
)
if err != nil {
if checkError(request, response, utils.CurrentFuncName(), fmt.Errorf("the machine %q could not be enslaved into the vrf %s, error: %w", id, vrf, err)) {
return
Expand Down Expand Up @@ -1976,7 +1997,18 @@ func (r machineResource) abortReinstallMachine(request *restful.Request, respons

func deleteVRFSwitches(ds *datastore.RethinkStore, m *metal.Machine, logger *zap.Logger) error {
logger.Info("set VRF at switch", zap.String("machineID", m.ID))
_, err := setVrfAtSwitches(ds, m, "")
err := retry.Do(
func() error {
_, err := setVrfAtSwitches(ds, m, "")
return err
},
retry.Attempts(10),
retry.RetryIf(func(err error) bool {
return strings.Contains(err.Error(), datastore.EntityAlreadyModifiedErrorMessage)
}),
retry.DelayType(retry.CombineDelay(retry.BackOffDelay, retry.RandomDelay)),
retry.LastErrorOnly(true),
)
if err != nil {
logger.Error("cannot delete vrf switches", zap.String("machineID", m.ID), zap.Error(err))
return fmt.Errorf("cannot delete vrf switches: %w", err)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//go:build integration
// +build integration

package service
Expand Down Expand Up @@ -38,10 +39,7 @@ var (
)

func TestMachineAllocationIntegration(t *testing.T) {
if testing.Short() {
t.Skip("skipping integration test")
}
machineCount := 50
machineCount := 30

// Setup
rs, container := setupTestEnvironment(machineCount, t)
Expand Down
30 changes: 28 additions & 2 deletions cmd/metal-api/internal/service/switch-service.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ import (
"fmt"
"net/http"
"sort"
"strings"
"time"

"github.com/avast/retry-go/v4"
restfulspec "github.com/emicklei/go-restful-openapi/v2"
restful "github.com/emicklei/go-restful/v3"
"github.com/metal-stack/metal-api/cmd/metal-api/internal/datastore"
Expand Down Expand Up @@ -166,6 +168,7 @@ func (r switchResource) deleteSwitch(request *restful.Request, response *restful
}
}

// notifySwitch is called periodically from every switch to report the last duration and the error, if one occurred
func (r switchResource) notifySwitch(request *restful.Request, response *restful.Response) {
var requestPayload v1.SwitchNotifyRequest
err := request.ReadEntity(&requestPayload)
Expand Down Expand Up @@ -194,6 +197,7 @@ func (r switchResource) notifySwitch(request *restful.Request, response *restful
s.LastSyncError = sync
}

// FIXME needs https://github.com/metal-stack/metal-api/issues/263
err = r.ds.UpdateSwitch(&old, s)
if checkError(request, response, utils.CurrentFuncName(), err) {
return
Expand Down Expand Up @@ -231,7 +235,18 @@ func (r switchResource) updateSwitch(request *restful.Request, response *restful

newSwitch.Mode = metal.SwitchModeFrom(requestPayload.Mode)

err = r.ds.UpdateSwitch(oldSwitch, &newSwitch)
err = retry.Do(
func() error {
err := r.ds.UpdateSwitch(oldSwitch, &newSwitch)
return err
},
retry.Attempts(10),
retry.RetryIf(func(err error) bool {
return strings.Contains(err.Error(), datastore.EntityAlreadyModifiedErrorMessage)
}),
retry.DelayType(retry.CombineDelay(retry.BackOffDelay, retry.RandomDelay)),
retry.LastErrorOnly(true),
)
if checkError(request, response, utils.CurrentFuncName(), err) {
return
}
Expand Down Expand Up @@ -323,7 +338,18 @@ func (r switchResource) registerSwitch(request *restful.Request, response *restf
s.Nics = nics
// Do not replace connections here: We do not want to lose them!

err = r.ds.UpdateSwitch(&old, s)
err = retry.Do(
func() error {
err := r.ds.UpdateSwitch(&old, s)
return err
},
retry.Attempts(10),
retry.RetryIf(func(err error) bool {
return strings.Contains(err.Error(), datastore.EntityAlreadyModifiedErrorMessage)
}),
retry.DelayType(retry.CombineDelay(retry.BackOffDelay, retry.RandomDelay)),
retry.LastErrorOnly(true),
)

if checkError(request, response, utils.CurrentFuncName(), err) {
return
Expand Down

0 comments on commit 69466a9

Please sign in to comment.