diff --git a/api/v1beta1/awsmachine_conversion.go b/api/v1beta1/awsmachine_conversion.go index 85f7a1c36b..f7f55af221 100644 --- a/api/v1beta1/awsmachine_conversion.go +++ b/api/v1beta1/awsmachine_conversion.go @@ -136,6 +136,9 @@ func (r *AWSMachineTemplate) ConvertTo(dstRaw conversion.Hub) error { } } + // Restore Status fields that don't exist in v1beta1. + dst.Status.NodeInfo = restored.Status.NodeInfo + return nil } diff --git a/api/v1beta1/conversion.go b/api/v1beta1/conversion.go index c3000ee97c..0f3833fba8 100644 --- a/api/v1beta1/conversion.go +++ b/api/v1beta1/conversion.go @@ -108,3 +108,8 @@ func Convert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus(in *v1beta2.AW // Note: DedicatedHostID is not present in v1beta1, so it will be dropped during conversion return autoConvert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus(in, out, s) } + +func Convert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in *v1beta2.AWSMachineTemplateStatus, out *AWSMachineTemplateStatus, s conversion.Scope) error { + // NodeInfo field is ignored (dropped) as it doesn't exist in v1beta1 + return autoConvert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in, out, s) +} diff --git a/api/v1beta1/zz_generated.conversion.go b/api/v1beta1/zz_generated.conversion.go index ebd82d07d9..19b919643e 100644 --- a/api/v1beta1/zz_generated.conversion.go +++ b/api/v1beta1/zz_generated.conversion.go @@ -1625,14 +1625,10 @@ func Convert_v1beta1_AWSMachineTemplateStatus_To_v1beta2_AWSMachineTemplateStatu func autoConvert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in *v1beta2.AWSMachineTemplateStatus, out *AWSMachineTemplateStatus, s conversion.Scope) error { out.Capacity = *(*v1.ResourceList)(unsafe.Pointer(&in.Capacity)) + // WARNING: in.NodeInfo requires manual conversion: does not exist in peer-type return nil } -// Convert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus is an autogenerated 
conversion function. -func Convert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in *v1beta2.AWSMachineTemplateStatus, out *AWSMachineTemplateStatus, s conversion.Scope) error { - return autoConvert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in, out, s) -} - func autoConvert_v1beta1_AWSResourceReference_To_v1beta2_AWSResourceReference(in *AWSResourceReference, out *v1beta2.AWSResourceReference, s conversion.Scope) error { out.ID = (*string)(unsafe.Pointer(in.ID)) // WARNING: in.ARN requires manual conversion: does not exist in peer-type diff --git a/api/v1beta2/awsmachinetemplate_types.go b/api/v1beta2/awsmachinetemplate_types.go index 50d8dda22d..e2429573f1 100644 --- a/api/v1beta2/awsmachinetemplate_types.go +++ b/api/v1beta2/awsmachinetemplate_types.go @@ -23,6 +23,38 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" ) +// Architecture represents the CPU architecture of the node. +// Its underlying type is a string and its value can be any of amd64, arm64. +// +kubebuilder:validation:Enum=amd64;arm64 +// +enum +type Architecture string + +// Architecture constants. +const ( + ArchitectureAmd64 Architecture = "amd64" + ArchitectureArm64 Architecture = "arm64" +) + +// Operating system constants. +const ( + // OperatingSystemLinux represents the Linux operating system. + OperatingSystemLinux = "linux" + // OperatingSystemWindows represents the Windows operating system. + OperatingSystemWindows = "windows" +) + +// NodeInfo contains information about the node's architecture and operating system. +type NodeInfo struct { + // Architecture is the CPU architecture of the node. + // Its underlying type is a string and its value can be any of amd64, arm64. + // +optional + Architecture Architecture `json:"architecture,omitempty"` + // OperatingSystem is a string representing the operating system of the node. + // This may be a string like 'linux' or 'windows'. 
+ // +optional + OperatingSystem string `json:"operatingSystem,omitempty"` +} + // AWSMachineTemplateStatus defines a status for an AWSMachineTemplate. type AWSMachineTemplateStatus struct { // Capacity defines the resource capacity for this machine. @@ -30,6 +62,12 @@ type AWSMachineTemplateStatus struct { // https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md // +optional Capacity corev1.ResourceList `json:"capacity,omitempty"` + + // NodeInfo contains information about the node's architecture and operating system. + // This value is used for autoscaling from zero operations as defined in: + // https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md + // +optional + NodeInfo *NodeInfo `json:"nodeInfo,omitempty"` } // AWSMachineTemplateSpec defines the desired state of AWSMachineTemplate. @@ -40,6 +78,7 @@ type AWSMachineTemplateSpec struct { // +kubebuilder:object:root=true // +kubebuilder:resource:path=awsmachinetemplates,scope=Namespaced,categories=cluster-api,shortName=awsmt // +kubebuilder:storageversion +// +kubebuilder:subresource:status // +k8s:defaulter-gen=true // AWSMachineTemplate is the schema for the Amazon EC2 Machine Templates API. diff --git a/api/v1beta2/zz_generated.deepcopy.go b/api/v1beta2/zz_generated.deepcopy.go index 1bd4313128..6a30f71412 100644 --- a/api/v1beta2/zz_generated.deepcopy.go +++ b/api/v1beta2/zz_generated.deepcopy.go @@ -948,6 +948,11 @@ func (in *AWSMachineTemplateStatus) DeepCopyInto(out *AWSMachineTemplateStatus) (*out)[key] = val.DeepCopy() } } + if in.NodeInfo != nil { + in, out := &in.NodeInfo, &out.NodeInfo + *out = new(NodeInfo) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AWSMachineTemplateStatus. 
@@ -2013,6 +2018,21 @@ func (in *NetworkStatus) DeepCopy() *NetworkStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeInfo) DeepCopyInto(out *NodeInfo) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeInfo. +func (in *NodeInfo) DeepCopy() *NodeInfo { + if in == nil { + return nil + } + out := new(NodeInfo) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *PrivateDNSName) DeepCopyInto(out *PrivateDNSName) { *out = *in diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml index 1d3f40efed..1aa707b50c 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml @@ -1156,7 +1156,29 @@ spec: This value is used for autoscaling from zero operations as defined in: https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md type: object + nodeInfo: + description: |- + NodeInfo contains information about the node's architecture and operating system. + This value is used for autoscaling from zero operations as defined in: + https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md + properties: + architecture: + description: |- + Architecture is the CPU architecture of the node. + Its underlying type is a string and its value can be any of amd64, arm64. + enum: + - amd64 + - arm64 + type: string + operatingSystem: + description: |- + OperatingSystem is a string representing the operating system of the node. + This may be a string like 'linux' or 'windows'. 
+ type: string + type: object type: object type: object served: true storage: true + subresources: + status: {} diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 9fc33ff9ea..b127ec6308 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -101,6 +101,7 @@ rules: - controlplane.cluster.x-k8s.io resources: - '*' + - kubeadmcontrolplanes verbs: - get - list @@ -175,6 +176,7 @@ rules: resources: - awsclusters/status - awsfargateprofiles/status + - awsmachinetemplates/status - rosaclusters/status - rosanetworks/status - rosaroleconfigs/status diff --git a/controllers/awsmachinetemplate_controller.go b/controllers/awsmachinetemplate_controller.go new file mode 100644 index 0000000000..58414836cd --- /dev/null +++ b/controllers/awsmachinetemplate_controller.go @@ -0,0 +1,374 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package controllers
+
+import (
+	"context"
+	"strings"
+
+	"github.com/aws/aws-sdk-go-v2/service/ec2"
+	ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
+	"github.com/pkg/errors"
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller"
+
+	infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2"
+	"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/scope"
+	ec2service "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/services/ec2"
+	"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/logger"
+	"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/record"
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
+	"sigs.k8s.io/cluster-api/util"
+	"sigs.k8s.io/cluster-api/util/predicates"
+)
+
+// AWSMachineTemplateReconciler reconciles AWSMachineTemplate objects.
+//
+// This controller automatically populates the status of AWSMachineTemplate resources with
+// capacity (CPU and memory) and node information (architecture and operating system)
+// to enable autoscaling from zero.
+//
+// See: https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md
+type AWSMachineTemplateReconciler struct {
+	// Client is the Kubernetes client used to read objects and to patch the template status.
+	client.Client
+	// WatchFilterValue is the value handed to predicates.ResourceHasFilterLabel in
+	// SetupWithManager to filter the events this controller handles.
+	WatchFilterValue string
+}
+
+// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachinetemplates,verbs=get;list;watch
+// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachinetemplates/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsclusters,verbs=get;list;watch
+// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters,verbs=get;list;watch
+// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinedeployments,verbs=get;list;watch
+// +kubebuilder:rbac:groups=controlplane.cluster.x-k8s.io,resources=kubeadmcontrolplanes,verbs=get;list;watch
+// +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch
+
+// Reconcile populates capacity and node information in the status of an AWSMachineTemplate.
+func (r *AWSMachineTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	log := logger.FromContext(ctx)
+
+	// Fetch the AWSMachineTemplate; a template that no longer exists is not an error.
+	awsMachineTemplate := &infrav1.AWSMachineTemplate{}
+	if err := r.Get(ctx, req.NamespacedName, awsMachineTemplate); err != nil {
+		if apierrors.IsNotFound(err) {
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, err
+	}
+
+	// Skip if capacity and nodeInfo are already set. Both must be present: a template
+	// with only one of them populated is processed again.
+	if len(awsMachineTemplate.Status.Capacity) > 0 && awsMachineTemplate.Status.NodeInfo != nil {
+		return ctrl.Result{}, nil
+	}
+
+	// Get instance type from spec; without it there is nothing to look up.
+	instanceType := awsMachineTemplate.Spec.Template.Spec.InstanceType
+	if instanceType == "" {
+		return ctrl.Result{}, nil
+	}
+
+	// Find the region by checking ownerReferences. An empty region (no AWSCluster
+	// resolved by getRegion) ends the reconcile without an error.
+	region, err := r.getRegion(ctx, awsMachineTemplate)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+	if region == "" {
+		return ctrl.Result{}, nil
+	}
+
+	// Create global scope for this region
+	// Reference: exp/instancestate/awsinstancestate_controller.go:68-76
+	globalScope, err := scope.NewGlobalScope(scope.GlobalScopeParams{
+		ControllerName: "awsmachinetemplate",
+		Region:         region,
+	})
+	if err != nil {
+		// The failure is recorded via record.Warnf and a nil error is returned, so the
+		// reconcile is not retried through an error result.
+		record.Warnf(awsMachineTemplate, "AWSSessionFailed", "Failed to create AWS session for region %q: %v", region, err)
+		return ctrl.Result{}, nil
+	}
+
+	// Create EC2 client from global scope
+	ec2Client := ec2.NewFromConfig(globalScope.Session())
+
+	// Query instance type capacity (same record-and-return-nil handling as above).
+	capacity, err := r.getInstanceTypeCapacity(ctx, ec2Client, instanceType)
+	if err != nil {
+		record.Warnf(awsMachineTemplate, "CapacityQueryFailed", "Failed to query capacity for instance type %q: %v", instanceType, err)
+		return ctrl.Result{}, nil
+	}
+
+	// Query node info (architecture and OS). Note that returning here also means the
+	// capacity fetched above is not written to the status on this path.
+	nodeInfo, err := r.getNodeInfo(ctx, ec2Client, awsMachineTemplate, instanceType)
+	if err != nil {
+		record.Warnf(awsMachineTemplate, "NodeInfoQueryFailed", "Failed to query node info for instance type %q: %v", instanceType, err)
+		return ctrl.Result{}, nil
+	}
+
+	// Save original before modifying, then update all status fields at once.
+	// Empty results never overwrite values that are already present in the status.
+	original := awsMachineTemplate.DeepCopy()
+	if len(capacity) > 0 {
+		awsMachineTemplate.Status.Capacity = capacity
+	}
+	if nodeInfo != nil && (nodeInfo.Architecture != "" || nodeInfo.OperatingSystem != "") {
+		awsMachineTemplate.Status.NodeInfo = nodeInfo
+	}
+	// The patch is computed against the pre-modification copy and sent through the
+	// status writer returned by Status().
+	if err := r.Status().Patch(ctx, awsMachineTemplate, client.MergeFrom(original)); err != nil {
+		return ctrl.Result{}, errors.Wrap(err, "failed to update AWSMachineTemplate status")
+	}
+
+	log.Info("Successfully populated capacity and nodeInfo", "instanceType", instanceType, "region", region, "capacity", capacity, "nodeInfo", nodeInfo)
+	return ctrl.Result{}, nil
+}
+
+// getRegion finds the region by checking the template's owner cluster reference.
+//
+// It returns an error when the owner Cluster cannot be resolved (including the case where
+// the template has no owner Cluster) or when reading the AWSCluster fails for a reason other
+// than NotFound. It returns an empty region and a nil error when the Cluster's infrastructure
+// reference is missing or is not of kind AWSCluster, when the AWSCluster does not exist, or
+// when the AWSCluster has no region set.
+func (r *AWSMachineTemplateReconciler) getRegion(ctx context.Context, template *infrav1.AWSMachineTemplate) (string, error) {
+	// Get the owner cluster
+	cluster, err := util.GetOwnerCluster(ctx, r.Client, template.ObjectMeta)
+	if err != nil {
+		return "", err
+	}
+	if cluster == nil {
+		return "", errors.New("no owner cluster found")
+	}
+
+	// Get region from AWSCluster (standard EC2-based cluster).
+	// The AWSCluster is looked up in the Cluster's namespace; the namespace carried by the
+	// infrastructure reference itself is not consulted.
+	if cluster.Spec.InfrastructureRef != nil && cluster.Spec.InfrastructureRef.Kind == "AWSCluster" {
+		awsCluster := &infrav1.AWSCluster{}
+		if err := r.Get(ctx, client.ObjectKey{
+			Namespace: cluster.Namespace,
+			Name:      cluster.Spec.InfrastructureRef.Name,
+		}, awsCluster); err != nil {
+			// NotFound is tolerated and falls through to the empty-region result below.
+			if !apierrors.IsNotFound(err) {
+				return "", errors.Wrapf(err, "failed to get AWSCluster %s/%s", cluster.Namespace, cluster.Spec.InfrastructureRef.Name)
+			}
+		} else if awsCluster.Spec.Region != "" {
+			return awsCluster.Spec.Region, nil
+		}
+	}
+
+	return "", nil
+}
+
+// getInstanceTypeCapacity queries AWS EC2 API for instance type capacity information.
+// Returns the resource list (CPU, Memory).
+func (r *AWSMachineTemplateReconciler) getInstanceTypeCapacity(ctx context.Context, ec2Client *ec2.Client, instanceType string) (corev1.ResourceList, error) {
+	// Ask EC2 to describe only the requested instance type.
+	output, err := ec2Client.DescribeInstanceTypes(ctx, &ec2.DescribeInstanceTypesInput{
+		InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(instanceType)},
+	})
+	if err != nil {
+		return nil, errors.Wrapf(err, "failed to describe instance type %q", instanceType)
+	}
+	if len(output.InstanceTypes) == 0 {
+		return nil, errors.Errorf("no information found for instance type %q", instanceType)
+	}
+
+	// Only the first entry is consulted; values missing from the response are left out
+	// of the returned list rather than reported as an error.
+	typeInfo := output.InstanceTypes[0]
+	capacity := corev1.ResourceList{}
+
+	// CPU: the default vCPU count, stored as a decimal quantity.
+	if vcpu := typeInfo.VCpuInfo; vcpu != nil && vcpu.DefaultVCpus != nil {
+		capacity[corev1.ResourceCPU] = *resource.NewQuantity(int64(*vcpu.DefaultVCpus), resource.DecimalSI)
+	}
+
+	// Memory: the response carries MiB, the quantity is stored in bytes.
+	if memory := typeInfo.MemoryInfo; memory != nil && memory.SizeInMiB != nil {
+		const bytesPerMiB = 1024 * 1024
+		capacity[corev1.ResourceMemory] = *resource.NewQuantity(*memory.SizeInMiB*bytesPerMiB, resource.BinarySI)
+	}
+
+	return capacity, nil
+}
+
+// getNodeInfo queries node information (architecture and OS) for the AWSMachineTemplate.
+// It uses AMI ID if specified, otherwise attempts AMI lookup or falls back to instance type info.
+func (r *AWSMachineTemplateReconciler) getNodeInfo(ctx context.Context, ec2Client *ec2.Client, template *infrav1.AWSMachineTemplate, instanceType string) (*infrav1.NodeInfo, error) {
+	// Resolution order:
+	//  1. an AMI ID set on the template is described directly;
+	//  2. otherwise the architecture is inferred from the instance type and, when a
+	//     Kubernetes version can be found, refined from the default AMI for that version.
+	//
+	// AMI-related failures are best-effort: they are logged and the information gathered
+	// so far is returned with a nil error. An error is returned only when the instance
+	// type itself cannot be described or reports no architecture at all. The returned
+	// NodeInfo is never nil on success but may be empty.
+	log := logger.FromContext(ctx)
+	nodeInfo := &infrav1.NodeInfo{}
+
+	if amiID := template.Spec.Template.Spec.AMI.ID; amiID != nil && *amiID != "" {
+		// AMI ID is specified, query it directly
+		arch, osName, err := r.getNodeInfoFromAMI(ctx, ec2Client, *amiID)
+		if err != nil {
+			// Deliberately not fatal: the caller can still persist the capacity.
+			log.Info("Unable to read node info from the configured AMI", "amiID", *amiID, "error", err.Error())
+			return nodeInfo, nil
+		}
+		if arch != "" {
+			nodeInfo.Architecture = arch
+		}
+		if osName != "" {
+			nodeInfo.OperatingSystem = osName
+		}
+		return nodeInfo, nil
+	}
+
+	// AMI ID is not specified, query instance type to get architecture
+	result, err := ec2Client.DescribeInstanceTypes(ctx, &ec2.DescribeInstanceTypesInput{
+		InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(instanceType)},
+	})
+	if err != nil {
+		return nil, errors.Wrapf(err, "failed to describe instance type %q", instanceType)
+	}
+	if len(result.InstanceTypes) == 0 {
+		return nil, errors.Errorf("no information found for instance type %q", instanceType)
+	}
+
+	// ProcessorInfo is a pointer and may be absent; read it defensively instead of
+	// dereferencing it on the error path.
+	var supported []ec2types.ArchitectureType
+	if processorInfo := result.InstanceTypes[0].ProcessorInfo; processorInfo != nil {
+		supported = processorInfo.SupportedArchitectures
+	}
+	if len(supported) == 0 {
+		return nil, errors.Errorf("instance type %q does not report any supported architecture", instanceType)
+	}
+
+	// Infer architecture from instance type. Some x86 instance types report both i386 and
+	// x86_64, so the first architecture that maps to a node architecture is used rather
+	// than requiring exactly one entry. When none maps (neither x86_64 nor arm64), the
+	// architecture is left empty.
+	var architecture string
+	for _, candidate := range supported {
+		switch candidate {
+		case ec2types.ArchitectureTypeX8664:
+			architecture = ec2service.Amd64ArchitectureTag
+			nodeInfo.Architecture = infrav1.ArchitectureAmd64
+		case ec2types.ArchitectureTypeArm64:
+			architecture = ec2service.Arm64ArchitectureTag
+			nodeInfo.Architecture = infrav1.ArchitectureArm64
+		}
+		if architecture != "" {
+			break
+		}
+	}
+
+	// Attempt to get Kubernetes version from a MachineDeployment or KubeadmControlPlane
+	kubernetesVersion, err := r.getKubernetesVersion(ctx, template)
+	if err != nil {
+		log.Info("Kubernetes version not available, using node info inferred from the instance type", "instanceType", instanceType, "reason", err.Error())
+		return nodeInfo, nil
+	}
+	if kubernetesVersion == "" {
+		return nodeInfo, nil
+	}
+
+	// Try to look up AMI using the version
+	image, err := ec2service.DefaultAMILookup(
+		ec2Client,
+		template.Spec.Template.Spec.ImageLookupOrg,
+		template.Spec.Template.Spec.ImageLookupBaseOS,
+		kubernetesVersion,
+		architecture,
+		template.Spec.Template.Spec.ImageLookupFormat,
+	)
+	if err != nil {
+		// AMI lookup failed, fall back to what the instance type told us
+		log.Info("Default AMI lookup failed, using node info inferred from the instance type", "instanceType", instanceType, "kubernetesVersion", kubernetesVersion, "error", err.Error())
+		return nodeInfo, nil
+	}
+	if image == nil || image.ImageId == nil {
+		// Nothing usable was found; guard the pointer before dereferencing it below.
+		return nodeInfo, nil
+	}
+
+	// Successfully found AMI, extract accurate nodeInfo from it
+	arch, osName, err := r.getNodeInfoFromAMI(ctx, ec2Client, *image.ImageId)
+	if err != nil {
+		log.Info("Unable to read node info from the looked-up AMI, using node info inferred from the instance type", "amiID", *image.ImageId, "error", err.Error())
+		return nodeInfo, nil
+	}
+	if arch != "" {
+		nodeInfo.Architecture = arch
+	}
+	if osName != "" {
+		nodeInfo.OperatingSystem = osName
+	}
+
+	return nodeInfo, nil
+}
+
+// getNodeInfoFromAMI queries the AMI to determine architecture and operating system.
+//
+// The architecture is empty when the AMI's architecture is neither x86_64 nor arm64.
+// The operating system is "linux" unless the AMI's Platform is Windows or its
+// PlatformDetails mentions "windows" (case-insensitive). An error is returned when the
+// DescribeImages call fails or returns no image.
+func (r *AWSMachineTemplateReconciler) getNodeInfoFromAMI(ctx context.Context, ec2Client *ec2.Client, amiID string) (infrav1.Architecture, string, error) {
+	input := &ec2.DescribeImagesInput{
+		ImageIds: []string{amiID},
+	}
+
+	result, err := ec2Client.DescribeImages(ctx, input)
+	if err != nil {
+		return "", "", errors.Wrapf(err, "failed to describe AMI %q", amiID)
+	}
+
+	if len(result.Images) == 0 {
+		return "", "", errors.Errorf("no information found for AMI %q", amiID)
+	}
+
+	image := result.Images[0]
+
+	// Get architecture from AMI
+	var arch infrav1.Architecture
+	switch image.Architecture {
+	case ec2types.ArchitectureValuesX8664:
+		arch = infrav1.ArchitectureAmd64
+	case ec2types.ArchitectureValuesArm64:
+		arch = infrav1.ArchitectureArm64
+	}
+
+	// Determine OS - default to Linux, change to Windows if detected
+	// Most AMIs are Linux-based, so we initialize with Linux as the default
+	os := infrav1.OperatingSystemLinux
+
+	// 1. Check Platform field (most reliable for Windows detection)
+	if image.Platform == ec2types.PlatformValuesWindows {
+		os = infrav1.OperatingSystemWindows
+	}
+
+	// 2. Check PlatformDetails field for Windows indication
+	if os != infrav1.OperatingSystemWindows && image.PlatformDetails != nil {
+		platformDetails := strings.ToLower(*image.PlatformDetails)
+		if strings.Contains(platformDetails, infrav1.OperatingSystemWindows) {
+			os = infrav1.OperatingSystemWindows
+		}
+	}
+
+	return arch, os, nil
+}
+
+// getKubernetesVersion attempts to find the Kubernetes version by querying MachineDeployments
+// or KubeadmControlPlanes that reference this AWSMachineTemplate.
+//
+// Only objects in the template's namespace are considered, and a reference matches on kind
+// and name. MachineDeployments are checked first; the first match that carries a version
+// wins. An error is returned when listing fails or when no referencing object has a version.
+func (r *AWSMachineTemplateReconciler) getKubernetesVersion(ctx context.Context, template *infrav1.AWSMachineTemplate) (string, error) {
+	// Try to find version from MachineDeployment first
+	machineDeploymentList := &clusterv1.MachineDeploymentList{}
+	if err := r.List(ctx, machineDeploymentList, client.InNamespace(template.Namespace)); err != nil {
+		return "", errors.Wrap(err, "failed to list MachineDeployments")
+	}
+
+	// Find MachineDeployments that reference this AWSMachineTemplate
+	for _, md := range machineDeploymentList.Items {
+		if md.Spec.Template.Spec.InfrastructureRef.Kind == "AWSMachineTemplate" &&
+			md.Spec.Template.Spec.InfrastructureRef.Name == template.Name &&
+			md.Spec.Template.Spec.Version != nil {
+			return *md.Spec.Template.Spec.Version, nil
+		}
+	}
+
+	// If not found in MachineDeployment, try KubeadmControlPlane
+	kcpList := &controlplanev1.KubeadmControlPlaneList{}
+	if err := r.List(ctx, kcpList, client.InNamespace(template.Namespace)); err != nil {
+		return "", errors.Wrap(err, "failed to list KubeadmControlPlanes")
+	}
+
+	// Find KubeadmControlPlanes that reference this AWSMachineTemplate
+	for _, kcp := range kcpList.Items {
+		if kcp.Spec.MachineTemplate.InfrastructureRef.Kind == "AWSMachineTemplate" &&
+			kcp.Spec.MachineTemplate.InfrastructureRef.Name == template.Name &&
+			kcp.Spec.Version != "" {
+			return kcp.Spec.Version, nil
+		}
+	}
+
+	return "", errors.New("no MachineDeployment or KubeadmControlPlane found referencing this AWSMachineTemplate with a version")
+}
+
+// SetupWithManager sets up the controller with the Manager.
+//
+// The controller is registered for AWSMachineTemplate objects only, with the supplied
+// controller options and an event filter built from WatchFilterValue.
+func (r *AWSMachineTemplateReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
+	log := logger.FromContext(ctx)
+
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&infrav1.AWSMachineTemplate{}).
+		WithOptions(options).
+		WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), log.GetLogger(), r.WatchFilterValue)).
+		Complete(r)
+}
diff --git a/controllers/awsmachinetemplate_controller_unit_test.go b/controllers/awsmachinetemplate_controller_unit_test.go
new file mode 100644
index 0000000000..2f446f3823
--- /dev/null
+++ b/controllers/awsmachinetemplate_controller_unit_test.go
@@ -0,0 +1,347 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controllers
+
+import (
+	"context"
+	"testing"
+
+	.
"github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" +) + +func TestAWSMachineTemplateReconciler(t *testing.T) { + setupScheme := func() *runtime.Scheme { + scheme := runtime.NewScheme() + _ = infrav1.AddToScheme(scheme) + _ = clusterv1.AddToScheme(scheme) + _ = corev1.AddToScheme(scheme) + return scheme + } + + newFakeClient := func(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(setupScheme()). + WithObjects(objs...). + WithStatusSubresource(&infrav1.AWSMachineTemplate{}). + Build() + } + + newAWSMachineTemplate := func(name string) *infrav1.AWSMachineTemplate { + return &infrav1.AWSMachineTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "default", + }, + Spec: infrav1.AWSMachineTemplateSpec{ + Template: infrav1.AWSMachineTemplateResource{ + Spec: infrav1.AWSMachineSpec{ + InstanceType: "t3.medium", + }, + }, + }, + } + } + + t.Run("getRegion", func(t *testing.T) { + t.Run("should get region from AWSCluster", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + InfrastructureRef: &corev1.ObjectReference{ + Kind: "AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + awsCluster := &infrav1.AWSCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-aws-cluster", + 
Namespace: "default", + }, + Spec: infrav1.AWSClusterSpec{ + Region: "us-west-2", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster, awsCluster), + } + + region, err := reconciler.getRegion(context.Background(), template) + + g.Expect(err).To(BeNil()) + g.Expect(region).To(Equal("us-west-2")) + }) + + t.Run("should return error when no owner cluster found", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template), + } + + region, err := reconciler.getRegion(context.Background(), template) + + g.Expect(err).ToNot(BeNil()) + g.Expect(err.Error()).To(ContainSubstring("no owner cluster found")) + g.Expect(region).To(Equal("")) + }) + + t.Run("should return empty when cluster has no infrastructure ref", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster), + } + + region, err := reconciler.getRegion(context.Background(), template) + + g.Expect(err).To(BeNil()) + g.Expect(region).To(Equal("")) + }) + + t.Run("should return empty when AWSCluster not found", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + InfrastructureRef: &corev1.ObjectReference{ + Kind: 
"AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster), + } + + region, err := reconciler.getRegion(context.Background(), template) + + g.Expect(err).To(BeNil()) + g.Expect(region).To(Equal("")) + }) + }) + + // Note: getInstanceTypeCapacity and getNodeInfo tests are skipped as they require EC2 client injection, + // which would need significant refactoring. These functions are tested indirectly through + // integration tests. + + t.Run("Reconcile", func(t *testing.T) { + t.Run("should skip when capacity and nodeInfo already set", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.Status.Capacity = corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), + } + template.Status.NodeInfo = &infrav1.NodeInfo{ + Architecture: infrav1.ArchitectureAmd64, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should reconcile when capacity set but nodeInfo is not", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.Status.Capacity = corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), + } + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + InfrastructureRef: &corev1.ObjectReference{ + Kind: "AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + awsCluster := 
&infrav1.AWSCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-aws-cluster", + Namespace: "default", + }, + Spec: infrav1.AWSClusterSpec{ + Region: "us-west-2", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster, awsCluster), + } + + // This will fail at AWS API call, but demonstrates that reconcile proceeds + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should skip when instance type is empty", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.Spec.Template.Spec.InstanceType = "" + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should return error when no owner cluster", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).ToNot(BeNil()) + g.Expect(err.Error()).To(ContainSubstring("no owner cluster found")) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should skip when region is empty", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: 
"default", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should return nil when template not found", func(t *testing.T) { + g := NewWithT(t) + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKey{ + Namespace: "default", + Name: "nonexistent", + }, + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + }) +} diff --git a/main.go b/main.go index 785d1e7969..022b95dffb 100644 --- a/main.go +++ b/main.go @@ -407,6 +407,15 @@ func setupReconcilersAndWebhooks(ctx context.Context, mgr ctrl.Manager, } } + setupLog.Debug("enabling AWSMachineTemplate controller") + if err := (&controllers.AWSMachineTemplateReconciler{ + Client: mgr.GetClient(), + WatchFilterValue: watchFilterValue, + }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: awsClusterConcurrency, RecoverPanic: ptr.To[bool](true)}); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "AWSMachineTemplate") + os.Exit(1) + } + if err := (&infrav1.AWSMachineTemplate{}).SetupWebhookWithManager(mgr); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "AWSMachineTemplate") os.Exit(1) diff --git a/test/e2e/suites/unmanaged/unmanaged_functional_test.go b/test/e2e/suites/unmanaged/unmanaged_functional_test.go index f4d6d42e94..87ebd430b7 100644 --- a/test/e2e/suites/unmanaged/unmanaged_functional_test.go +++ b/test/e2e/suites/unmanaged/unmanaged_functional_test.go @@ -337,7 +337,7 @@ var _ = ginkgo.Context("[unmanaged] [functional]", func() { configCluster.ControlPlaneMachineCount = ptr.To[int64](1) 
configCluster.WorkerMachineCount = ptr.To[int64](1) configCluster.Flavor = shared.SSMFlavor - _, md, _ := createCluster(ctx, configCluster, result) + cluster, md, _ := createCluster(ctx, configCluster, result) workerMachines := framework.GetMachinesByMachineDeployments(ctx, framework.GetMachinesByMachineDeploymentsInput{ Lister: e2eCtx.Environment.BootstrapClusterProxy.GetClient(), @@ -352,6 +352,43 @@ var _ = ginkgo.Context("[unmanaged] [functional]", func() { }) Expect(len(workerMachines)).To(Equal(1)) Expect(len(controlPlaneMachines)).To(Equal(1)) + + ginkgo.By("Verifying AWSMachineTemplate capacity is populated for autoscaling from zero") + awsMachineTemplateList := &infrav1.AWSMachineTemplateList{} + err := e2eCtx.Environment.BootstrapClusterProxy.GetClient().List(ctx, awsMachineTemplateList, client.InNamespace(namespace.Name)) + Expect(err).To(BeNil()) + Expect(len(awsMachineTemplateList.Items)).To(BeNumerically(">", 0), "Expected at least one AWSMachineTemplate") + + ginkgo.By(fmt.Sprintf("Found %d AWSMachineTemplates", len(awsMachineTemplateList.Items))) + ginkgo.By(fmt.Sprintf("Cluster: name=%s, namespace=%s, infrastructureRef=%v", + cluster.Name, cluster.Namespace, cluster.Spec.InfrastructureRef)) + + // Print each AWSMachineTemplate for debugging + for i, template := range awsMachineTemplateList.Items { + ginkgo.By(fmt.Sprintf("AWSMachineTemplate[%d]: %+v", i, template)) + } + + foundTemplateWithCapacity := false + foundTemplateWithNodeInfo := false + for _, template := range awsMachineTemplateList.Items { + if len(template.Status.Capacity) > 0 { + foundTemplateWithCapacity = true + ginkgo.By(fmt.Sprintf("AWSMachineTemplate %s has capacity populated: %v", template.Name, template.Status.Capacity)) + Expect(template.Status.Capacity).To(HaveKey(corev1.ResourceCPU), "Expected CPU to be set in capacity") + Expect(template.Status.Capacity).To(HaveKey(corev1.ResourceMemory), "Expected Memory to be set in capacity") + } + if template.Status.NodeInfo != nil { + 
foundTemplateWithNodeInfo = true + ginkgo.By(fmt.Sprintf("AWSMachineTemplate %s has nodeInfo populated: %v", template.Name, template.Status.NodeInfo)) + // Verify architecture is set (should be either amd64 or arm64 for AWS) + Expect(template.Status.NodeInfo.Architecture).ToNot(BeEmpty(), "Expected architecture to be set in nodeInfo") + Expect(string(template.Status.NodeInfo.Architecture)).To(MatchRegexp("^(amd64|arm64)$"), "Expected architecture to be amd64 or arm64") + // Verify operating system is set + Expect(template.Status.NodeInfo.OperatingSystem).ToNot(BeEmpty(), "Expected operatingSystem to be set in nodeInfo") + } + } + Expect(foundTemplateWithCapacity).To(BeTrue(), "Expected at least one AWSMachineTemplate to have capacity populated") + Expect(foundTemplateWithNodeInfo).To(BeTrue(), "Expected at least one AWSMachineTemplate to have nodeInfo populated") }) })