Skip to content

Commit

Permalink
[cluster-autoscaler-release-1.29] fix: undefined instance state on pr…
Browse files Browse the repository at this point in the history
…ovisioning state failed (#7755)

* fix: undefined instance state on provisioning state failed

* test: add unit tests for provisioning state failed + fast delete

* test: support both fast/not fast delete on an affected test

---------

Co-authored-by: Robin Deeboonchai <[email protected]>
  • Loading branch information
k8s-infra-cherrypick-robot and comtalyst authored Jan 23, 2025
1 parent d8a6d30 commit 13954c6
Show file tree
Hide file tree
Showing 4 changed files with 348 additions and 22 deletions.
3 changes: 3 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,7 @@ func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *st

// instanceStatusFromProvisioningStateAndPowerState converts the VM provisioning state to cloudprovider.InstanceStatus
// instanceStatusFromProvisioningStateAndPowerState used by orchestrationMode == compute.Flexible
// Suggestion: reunify this with scaleSet.instanceStatusFromVM()
func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisioningState *string, powerState string, enableFastDeleteOnFailedProvisioning bool) *cloudprovider.InstanceStatus {
if provisioningState == nil {
return nil
Expand All @@ -814,6 +815,8 @@ func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisi
case provisioningStateCreating:
status.State = cloudprovider.InstanceCreating
case provisioningStateFailed:
status.State = cloudprovider.InstanceRunning

if enableFastDeleteOnFailedProvisioning {
// Provisioning can fail both during instance creation or after the instance is running.
// Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ func (scaleSet *ScaleSet) setInstanceStatusByProviderID(providerID string, statu
}

// instanceStatusFromVM converts the VM provisioning state to cloudprovider.InstanceStatus.
// Suggestion: reunify this with instanceStatusFromProvisioningStateAndPowerState() in azure_scale_set.go
func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSetVM) *cloudprovider.InstanceStatus {
// Prefer the proactive cache view of the instance state if we aren't in a terminal state
// This is because the power state may be taking longer to update and we don't want
Expand All @@ -224,6 +225,8 @@ func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSe
case string(compute.GalleryProvisioningStateCreating):
status.State = cloudprovider.InstanceCreating
case string(compute.GalleryProvisioningStateFailed):
status.State = cloudprovider.InstanceRunning

klog.V(3).Infof("VM %s reports failed provisioning state with power state: %s, eligible for fast delete: %s", to.String(vm.ID), powerState, strconv.FormatBool(scaleSet.enableFastDeleteOnFailedProvisioning))
if scaleSet.enableFastDeleteOnFailedProvisioning {
// Provisioning can fail both during instance creation or after the instance is running.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,11 @@ package azure
import (
"fmt"
"testing"
"time"

"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute"
"github.com/stretchr/testify/assert"
"go.uber.org/mock/gomock"

"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"

"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssvmclient/mockvmssvmclient"
)

var (
ctrl *gomock.Controller
currentTime, expiredTime time.Time
provider *AzureCloudProvider
scaleSet *ScaleSet
mockVMSSVMClient *mockvmssvmclient.MockInterface
expectedVMSSVMs []compute.VirtualMachineScaleSetVM
expectedStates []cloudprovider.InstanceState
instanceCache, expectedInstanceCache []cloudprovider.Instance
)

func testGetInstanceCacheWithStates(t *testing.T, vms []compute.VirtualMachineScaleSetVM,
Expand All @@ -53,3 +38,128 @@ func testGetInstanceCacheWithStates(t *testing.T, vms []compute.VirtualMachineSc
}
return instanceCacheTest
}

// Suggestion: could populate all combinations, should reunify with TestInstanceStatusFromProvisioningStateAndPowerState
func TestInstanceStatusFromVM(t *testing.T) {
t.Run("fast delete enablement = false", func(t *testing.T) {
provider := newTestProvider(t)
scaleSet := newTestScaleSet(provider.azureManager, "testScaleSet")

t.Run("provisioning state = failed, power state = starting", func(t *testing.T) {
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStarting)

status := scaleSet.instanceStatusFromVM(vm)

assert.NotNil(t, status)
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
})

t.Run("provisioning state = failed, power state = running", func(t *testing.T) {
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateRunning)

status := scaleSet.instanceStatusFromVM(vm)

assert.NotNil(t, status)
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
})

t.Run("provisioning state = failed, power state = stopping", func(t *testing.T) {
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStopping)

status := scaleSet.instanceStatusFromVM(vm)

assert.NotNil(t, status)
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
})

t.Run("provisioning state = failed, power state = stopped", func(t *testing.T) {
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStopped)

status := scaleSet.instanceStatusFromVM(vm)

assert.NotNil(t, status)
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
})

t.Run("provisioning state = failed, power state = deallocated", func(t *testing.T) {
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateDeallocated)

status := scaleSet.instanceStatusFromVM(vm)

assert.NotNil(t, status)
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
})

t.Run("provisioning state = failed, power state = unknown", func(t *testing.T) {
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateUnknown)

status := scaleSet.instanceStatusFromVM(vm)

assert.NotNil(t, status)
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
})
})

t.Run("fast delete enablement = true", func(t *testing.T) {
provider := newTestProvider(t)
scaleSet := newTestScaleSetWithFastDelete(provider.azureManager, "testScaleSet")

t.Run("provisioning state = failed, power state = starting", func(t *testing.T) {
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStarting)

status := scaleSet.instanceStatusFromVM(vm)

assert.NotNil(t, status)
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
})

t.Run("provisioning state = failed, power state = running", func(t *testing.T) {
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateRunning)

status := scaleSet.instanceStatusFromVM(vm)

assert.NotNil(t, status)
assert.Equal(t, cloudprovider.InstanceRunning, status.State)
})

t.Run("provisioning state = failed, power state = stopping", func(t *testing.T) {
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStopping)

status := scaleSet.instanceStatusFromVM(vm)

assert.NotNil(t, status)
assert.Equal(t, cloudprovider.InstanceCreating, status.State)
assert.NotNil(t, status.ErrorInfo)
})

t.Run("provisioning state = failed, power state = stopped", func(t *testing.T) {
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStopped)

status := scaleSet.instanceStatusFromVM(vm)

assert.NotNil(t, status)
assert.Equal(t, cloudprovider.InstanceCreating, status.State)
assert.NotNil(t, status.ErrorInfo)
})

t.Run("provisioning state = failed, power state = deallocated", func(t *testing.T) {
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateDeallocated)

status := scaleSet.instanceStatusFromVM(vm)

assert.NotNil(t, status)
assert.Equal(t, cloudprovider.InstanceCreating, status.State)
assert.NotNil(t, status.ErrorInfo)
})

t.Run("provisioning state = failed, power state = unknown", func(t *testing.T) {
vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateUnknown)

status := scaleSet.instanceStatusFromVM(vm)

assert.NotNil(t, status)
assert.Equal(t, cloudprovider.InstanceCreating, status.State)
assert.NotNil(t, status.ErrorInfo)
})
})
}
Loading

0 comments on commit 13954c6

Please sign in to comment.