diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 1b57dee63e0..e7a9ae8e94f 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -800,6 +800,7 @@ func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *st // instanceStatusFromProvisioningStateAndPowerState converts the VM provisioning state to cloudprovider.InstanceStatus // instanceStatusFromProvisioningStateAndPowerState used by orchestrationMode == compute.Flexible +// Suggestion: reunify this with scaleSet.instanceStatusFromVM() func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisioningState *string, powerState string, enableFastDeleteOnFailedProvisioning bool) *cloudprovider.InstanceStatus { if provisioningState == nil { return nil @@ -814,6 +815,8 @@ func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisi case provisioningStateCreating: status.State = cloudprovider.InstanceCreating case provisioningStateFailed: + status.State = cloudprovider.InstanceRunning + if enableFastDeleteOnFailedProvisioning { // Provisioning can fail both during instance creation or after the instance is running. // Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states, diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache.go index 4bbcdb06f0c..0c21bdc12e5 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache.go @@ -198,6 +198,7 @@ func (scaleSet *ScaleSet) setInstanceStatusByProviderID(providerID string, statu } // instanceStatusFromVM converts the VM provisioning state to cloudprovider.InstanceStatus. +// Suggestion: reunify this with instanceStatusFromProvisioningStateAndPowerState() in azure_scale_set.go func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSetVM) *cloudprovider.InstanceStatus { // Prefer the proactive cache view of the instance state if we aren't in a terminal state // This is because the power state may be taking longer to update and we don't want @@ -224,6 +225,8 @@ func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSe case string(compute.GalleryProvisioningStateCreating): status.State = cloudprovider.InstanceCreating case string(compute.GalleryProvisioningStateFailed): + status.State = cloudprovider.InstanceRunning + klog.V(3).Infof("VM %s reports failed provisioning state with power state: %s, eligible for fast delete: %s", to.String(vm.ID), powerState, strconv.FormatBool(scaleSet.enableFastDeleteOnFailedProvisioning)) if scaleSet.enableFastDeleteOnFailedProvisioning { // Provisioning can fail both during instance creation or after the instance is running. diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache_test.go index f491897a804..4d94556a579 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache_test.go @@ -19,26 +19,11 @@ package azure import ( "fmt" "testing" - "time" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute" "github.com/stretchr/testify/assert" - "go.uber.org/mock/gomock" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" - - "sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssvmclient/mockvmssvmclient" -) - -var ( - ctrl *gomock.Controller - currentTime, expiredTime time.Time - provider *AzureCloudProvider - scaleSet *ScaleSet - mockVMSSVMClient *mockvmssvmclient.MockInterface - expectedVMSSVMs []compute.VirtualMachineScaleSetVM - expectedStates []cloudprovider.InstanceState - instanceCache, expectedInstanceCache []cloudprovider.Instance ) func testGetInstanceCacheWithStates(t *testing.T, vms []compute.VirtualMachineScaleSetVM, @@ -53,3 +38,128 @@ func testGetInstanceCacheWithStates(t *testing.T, vms []compute.VirtualMachineSc } return instanceCacheTest } + +// Suggestion: could populate all combinations, should reunify with TestInstanceStatusFromProvisioningStateAndPowerState +func TestInstanceStatusFromVM(t *testing.T) { + t.Run("fast delete enablement = false", func(t *testing.T) { + provider := newTestProvider(t) + scaleSet := newTestScaleSet(provider.azureManager, "testScaleSet") + + t.Run("provisioning state = failed, power state = starting", func(t *testing.T) { + vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStarting) + + status := scaleSet.instanceStatusFromVM(vm) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = running", func(t *testing.T) { + vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateRunning) + + status := scaleSet.instanceStatusFromVM(vm) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = stopping", func(t *testing.T) { + vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStopping) + + status := scaleSet.instanceStatusFromVM(vm) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = stopped", func(t *testing.T) { + vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStopped) + + status := scaleSet.instanceStatusFromVM(vm) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = deallocated", func(t *testing.T) { + vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateDeallocated) + + status := scaleSet.instanceStatusFromVM(vm) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = unknown", func(t *testing.T) { + vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateUnknown) + + status := scaleSet.instanceStatusFromVM(vm) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + }) + + t.Run("fast delete enablement = true", func(t *testing.T) { + provider := newTestProvider(t) + scaleSet := newTestScaleSetWithFastDelete(provider.azureManager, "testScaleSet") + + t.Run("provisioning state = failed, power state = starting", func(t *testing.T) { + vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStarting) + + status := scaleSet.instanceStatusFromVM(vm) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = running", func(t *testing.T) { + vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateRunning) + + status := scaleSet.instanceStatusFromVM(vm) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = stopping", func(t *testing.T) { + vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStopping) + + status := scaleSet.instanceStatusFromVM(vm) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceCreating, status.State) + assert.NotNil(t, status.ErrorInfo) + }) + + t.Run("provisioning state = failed, power state = stopped", func(t *testing.T) { + vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateStopped) + + status := scaleSet.instanceStatusFromVM(vm) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceCreating, status.State) + assert.NotNil(t, status.ErrorInfo) + }) + + t.Run("provisioning state = failed, power state = deallocated", func(t *testing.T) { + vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateDeallocated) + + status := scaleSet.instanceStatusFromVM(vm) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceCreating, status.State) + assert.NotNil(t, status.ErrorInfo) + }) + + t.Run("provisioning state = failed, power state = unknown", func(t *testing.T) { + vm := newVMObjectWithState(string(compute.GalleryProvisioningStateFailed), vmPowerStateUnknown) + + status := scaleSet.instanceStatusFromVM(vm) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceCreating, status.State) + assert.NotNil(t, status.ErrorInfo) + }) + }) +} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index 8f8f89c73c3..db8d8f2eda0 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -43,21 +43,32 @@ func newTestScaleSet(manager *AzureManager, name string) *ScaleSet { azureRef: azureRef{ Name: name, }, - manager: manager, - minSize: 1, - maxSize: 5, - enableForceDelete: manager.config.EnableForceDelete, - enableFastDeleteOnFailedProvisioning: true, + manager: manager, + minSize: 1, + maxSize: 5, + enableForceDelete: manager.config.EnableForceDelete, } } func newTestScaleSetMinSizeZero(manager *AzureManager, name string) *ScaleSet { + return &ScaleSet{ + azureRef: azureRef{ + Name: name, + }, + manager: manager, + minSize: 0, + maxSize: 5, + enableForceDelete: manager.config.EnableForceDelete, + } +} + +func newTestScaleSetWithFastDelete(manager *AzureManager, name string) *ScaleSet { return &ScaleSet{ azureRef: azureRef{ Name: name, }, manager: manager, - minSize: 0, + minSize: 1, maxSize: 5, enableForceDelete: manager.config.EnableForceDelete, enableFastDeleteOnFailedProvisioning: true, @@ -355,6 +366,95 @@ func TestIncreaseSize(t *testing.T) { // TestIncreaseSizeOnVMProvisioningFailed has been tweeked only for Uniform Orchestration mode. // If ProvisioningState == failed and power state is not running, Status.State == InstanceCreating with errorInfo populated. func TestIncreaseSizeOnVMProvisioningFailed(t *testing.T) { + testCases := map[string]struct { + expectInstanceRunning bool + isMissingInstanceView bool + statuses []compute.InstanceViewStatus + expectErrorInfoPopulated bool + }{ + "out of resources when no power state exists": { + expectErrorInfoPopulated: false, + }, + "out of resources when VM is stopped": { + statuses: []compute.InstanceViewStatus{{Code: to.StringPtr(vmPowerStateStopped)}}, + expectErrorInfoPopulated: false, + }, + "out of resources when VM reports invalid power state": { + statuses: []compute.InstanceViewStatus{{Code: to.StringPtr("PowerState/invalid")}}, + expectErrorInfoPopulated: false, + }, + "instance running when power state is running": { + expectInstanceRunning: true, + statuses: []compute.InstanceViewStatus{{Code: to.StringPtr(vmPowerStateRunning)}}, + expectErrorInfoPopulated: false, + }, + "instance running if instance view cannot be retrieved": { + expectInstanceRunning: true, + isMissingInstanceView: true, + expectErrorInfoPopulated: false, + }, + } + for testName, testCase := range testCases { + t.Run(testName, func(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + manager := newTestAzureManager(t) + vmssName := "vmss-failed-upscale" + + expectedScaleSets := newTestVMSSList(3, "vmss-failed-upscale", "eastus", compute.Uniform) + expectedVMSSVMs := newTestVMSSVMList(3) + // The failed state is important line of code here + expectedVMs := newTestVMList(3) + expectedVMSSVMs[2].ProvisioningState = to.StringPtr(provisioningStateFailed) + if !testCase.isMissingInstanceView { + expectedVMSSVMs[2].InstanceView = &compute.VirtualMachineScaleSetVMInstanceView{Statuses: &testCase.statuses} + } + + mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) + mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil) + mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), manager.config.ResourceGroup, vmssName, gomock.Any()).Return(nil, nil) + mockVMSSClient.EXPECT().WaitForCreateOrUpdateResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() + manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient + mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) + mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "vmss-failed-upscale", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() + manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient + + mockVMClient := mockvmclient.NewMockInterface(ctrl) + mockVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedVMs, nil).AnyTimes() + manager.azClient.virtualMachinesClient = mockVMClient + + manager.explicitlyConfigured["vmss-failed-upscale"] = true + registered := manager.RegisterNodeGroup(newTestScaleSet(manager, vmssName)) + assert.True(t, registered) + manager.Refresh() + + provider, err := BuildAzureCloudProvider(manager, nil) + assert.NoError(t, err) + + scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet) + assert.True(t, ok) + + // Increase size by one, but the new node fails provisioning + err = scaleSet.IncreaseSize(1) + assert.NoError(t, err) + + nodes, err := scaleSet.Nodes() + assert.NoError(t, err) + + assert.Equal(t, 3, len(nodes)) + + assert.Equal(t, testCase.expectErrorInfoPopulated, nodes[2].Status.ErrorInfo != nil) + if testCase.expectErrorInfoPopulated { + assert.Equal(t, cloudprovider.InstanceCreating, nodes[2].Status.State) + } else { + assert.Equal(t, cloudprovider.InstanceRunning, nodes[2].Status.State) + } + }) + } +} + +func TestIncreaseSizeOnVMProvisioningFailedWithFastDelete(t *testing.T) { testCases := map[string]struct { expectInstanceRunning bool isMissingInstanceView bool @@ -414,7 +514,7 @@ func TestIncreaseSizeOnVMProvisioningFailed(t *testing.T) { manager.azClient.virtualMachinesClient = mockVMClient manager.explicitlyConfigured["vmss-failed-upscale"] = true - registered := manager.RegisterNodeGroup(newTestScaleSet(manager, vmssName)) + registered := manager.RegisterNodeGroup(newTestScaleSetWithFastDelete(manager, vmssName)) assert.True(t, registered) manager.Refresh() @@ -1265,3 +1365,113 @@ func TestCseErrors(t *testing.T) { assert.Equal(t, []string(nil), actualCSEErrorMessage) }) } + +func newVMObjectWithState(provisioningState string, powerState string) *compute.VirtualMachineScaleSetVM { + return &compute.VirtualMachineScaleSetVM{ + ID: to.StringPtr("1"), // Beware; refactor if needed + VirtualMachineScaleSetVMProperties: &compute.VirtualMachineScaleSetVMProperties{ + ProvisioningState: to.StringPtr(provisioningState), + InstanceView: &compute.VirtualMachineScaleSetVMInstanceView{ + Statuses: &[]compute.InstanceViewStatus{ + {Code: to.StringPtr(powerState)}, + }, + }, + }, + } +} + +// Suggestion: could populate all combinations, should reunify with TestInstanceStatusFromVM +func TestInstanceStatusFromProvisioningStateAndPowerState(t *testing.T) { + t.Run("fast delete enablement = false", func(t *testing.T) { + t.Run("provisioning state = failed, power state = starting", func(t *testing.T) { + status := instanceStatusFromProvisioningStateAndPowerState("1", to.StringPtr(string(compute.GalleryProvisioningStateFailed)), vmPowerStateStarting, false) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = running", func(t *testing.T) { + status := instanceStatusFromProvisioningStateAndPowerState("1", to.StringPtr(string(compute.GalleryProvisioningStateFailed)), vmPowerStateRunning, false) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = stopping", func(t *testing.T) { + status := instanceStatusFromProvisioningStateAndPowerState("1", to.StringPtr(string(compute.GalleryProvisioningStateFailed)), vmPowerStateStopping, false) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = stopped", func(t *testing.T) { + + status := instanceStatusFromProvisioningStateAndPowerState("1", to.StringPtr(string(compute.GalleryProvisioningStateFailed)), vmPowerStateStopped, false) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = deallocated", func(t *testing.T) { + status := instanceStatusFromProvisioningStateAndPowerState("1", to.StringPtr(string(compute.GalleryProvisioningStateFailed)), vmPowerStateDeallocated, false) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = unknown", func(t *testing.T) { + status := instanceStatusFromProvisioningStateAndPowerState("1", to.StringPtr(string(compute.GalleryProvisioningStateFailed)), vmPowerStateUnknown, false) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + }) + + t.Run("fast delete enablement = true", func(t *testing.T) { + t.Run("provisioning state = failed, power state = starting", func(t *testing.T) { + status := instanceStatusFromProvisioningStateAndPowerState("1", to.StringPtr(string(compute.GalleryProvisioningStateFailed)), vmPowerStateStarting, true) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = running", func(t *testing.T) { + status := instanceStatusFromProvisioningStateAndPowerState("1", to.StringPtr(string(compute.GalleryProvisioningStateFailed)), vmPowerStateRunning, true) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceRunning, status.State) + }) + + t.Run("provisioning state = failed, power state = stopping", func(t *testing.T) { + status := instanceStatusFromProvisioningStateAndPowerState("1", to.StringPtr(string(compute.GalleryProvisioningStateFailed)), vmPowerStateStopping, true) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceCreating, status.State) + assert.NotNil(t, status.ErrorInfo) + }) + + t.Run("provisioning state = failed, power state = stopped", func(t *testing.T) { + status := instanceStatusFromProvisioningStateAndPowerState("1", to.StringPtr(string(compute.GalleryProvisioningStateFailed)), vmPowerStateStopped, true) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceCreating, status.State) + assert.NotNil(t, status.ErrorInfo) + }) + + t.Run("provisioning state = failed, power state = deallocated", func(t *testing.T) { + status := instanceStatusFromProvisioningStateAndPowerState("1", to.StringPtr(string(compute.GalleryProvisioningStateFailed)), vmPowerStateDeallocated, true) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceCreating, status.State) + assert.NotNil(t, status.ErrorInfo) + }) + + t.Run("provisioning state = failed, power state = unknown", func(t *testing.T) { + status := instanceStatusFromProvisioningStateAndPowerState("1", to.StringPtr(string(compute.GalleryProvisioningStateFailed)), vmPowerStateUnknown, true) + + assert.NotNil(t, status) + assert.Equal(t, cloudprovider.InstanceCreating, status.State) + assert.NotNil(t, status.ErrorInfo) + }) + }) +}