Skip to content

Commit

Permalink
use machine deletion to clear a failure domain
Browse files Browse the repository at this point in the history
  • Loading branch information
wanyufe committed Sep 21, 2022
1 parent 23d447b commit 6606010
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 12 deletions.
33 changes: 32 additions & 1 deletion config/rbac/role.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
Expand Down Expand Up @@ -68,6 +67,14 @@ rules:
- get
- list
- watch
- apiGroups:
- cluster.x-k8s.io
resources:
- machinesets/status
verbs:
- get
- list
- watch
- apiGroups:
- controlplane.cluster.x-k8s.io
resources:
Expand All @@ -76,6 +83,30 @@ rules:
- get
- list
- watch
- apiGroups:
- controlplane.cluster.x-k8s.io
resources:
- kubeadmcontrolplanes/status
verbs:
- get
- list
- watch
- apiGroups:
- etcdcluster.cluster.x-k8s.io
resources:
- etcdadmclusters
verbs:
- get
- list
- watch
- apiGroups:
- etcdcluster.cluster.x-k8s.io
resources:
- etcdadmclusters/status
verbs:
- get
- list
- watch
- apiGroups:
- infrastructure.cluster.x-k8s.io
resources:
Expand Down
113 changes: 102 additions & 11 deletions controllers/cloudstackfailuredomain_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@ package controllers

import (
"context"

"github.com/pkg/errors"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sort"

infrav1 "sigs.k8s.io/cluster-api-provider-cloudstack/api/v1beta2"
csCtrlrUtils "sigs.k8s.io/cluster-api-provider-cloudstack/controllers/utils"
Expand All @@ -38,13 +40,20 @@ type CloudStackFailureDomainReconciler struct {
//+kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=cloudstackfailuredomains,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=cloudstackfailuredomains/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=cloudstackfailuredomains/finalizers,verbs=update
//+kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinesets,verbs=get;list;watch
//+kubebuilder:rbac:groups=etcdcluster.cluster.x-k8s.io,resources=etcdadmclusters,verbs=get;list;watch
//+kubebuilder:rbac:groups=controlplane.cluster.x-k8s.io,resources=kubeadmcontrolplanes,verbs=get;list;watch
//+kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinesets/status,verbs=get;list;watch
//+kubebuilder:rbac:groups=etcdcluster.cluster.x-k8s.io,resources=etcdadmclusters/status,verbs=get;list;watch
//+kubebuilder:rbac:groups=controlplane.cluster.x-k8s.io,resources=kubeadmcontrolplanes/status,verbs=get;list;watch

// CloudStackFailureDomainReconciliationRunner is a ReconciliationRunner with extensions specific to CloudStackFailureDomains.
// The runner does the actual reconciliation.
type CloudStackFailureDomainReconciliationRunner struct {
	*csCtrlrUtils.ReconciliationRunner
	// ReconciliationSubject is the failure domain being reconciled.
	ReconciliationSubject *infrav1.CloudStackFailureDomain
	// IsoNet is the isolated network associated with the failure domain, if any.
	IsoNet *infrav1.CloudStackIsolatedNetwork
	// Machines holds the CloudStack machines found in this failure domain,
	// populated by GetAllMachinesInFailureDomain (sorted by name).
	Machines []infrav1.CloudStackMachine
}

// Initialize a new CloudStackFailureDomain reconciliation runner with concrete types and initialized member fields.
Expand Down Expand Up @@ -112,6 +121,8 @@ func (r *CloudStackFailureDomainReconciliationRunner) ReconcileDelete() (ctrl.Re
r.Log.Info("Deleting CloudStackFailureDomain")

return r.RunReconciliationStages(
r.GetAllMachinesInFailureDomain,
r.AllMachinesCanBeCleared,
r.ClearMachines,
r.DeleteOwnedObjects(
infrav1.GroupVersion.WithKind("CloudStackAffinityGroup"),
Expand All @@ -123,29 +134,109 @@ func (r *CloudStackFailureDomainReconciliationRunner) ReconcileDelete() (ctrl.Re
)
}

// ClearMachines checks for any machines in failure domain, deletes the CAPI machine for any still in FailureDomain,
// and requeus until all CloudStack machines are cleared from the FailureDomain.
func (r *CloudStackFailureDomainReconciliationRunner) ClearMachines() (ctrl.Result, error) {
// GetAllMachinesInFailureDomain get all cloudstack machines deployed in this failure domain.
// machines are sorted by name so that it can be processed one by one in a determined order.
func (r *CloudStackFailureDomainReconciliationRunner) GetAllMachinesInFailureDomain() (ctrl.Result, error) {
machines := &infrav1.CloudStackMachineList{}
if err := r.K8sClient.List(r.RequestCtx, machines, client.MatchingLabels{infrav1.FailureDomainLabelName: r.ReconciliationSubject.Name}); err != nil {
return ctrl.Result{}, err
}
// Deleted CAPI machines for CloudStack machines found.
for _, machine := range machines.Items {
items := machines.Items
sort.Slice(items, func(i, j int) bool {
return items[i].Name < items[j].Name
})
r.Machines = items
return ctrl.Result{}, nil
}

// AllMachinesCanBeCleared checks, for each machine in the failure domain, whether it is safe
// to delete the machine and let its owner recreate it elsewhere:
//   - if the machine is the only machine in its worker node group, control plane, or
//     etcdadm cluster (owner spec.replicas < 2), it cannot be deleted;
//   - if the owner's spec.replicas differs from status.replicas, or the owner is not
//     ready with all replicas ready, the reconcile is requeued until the owner settles.
func (r *CloudStackFailureDomainReconciliationRunner) AllMachinesCanBeCleared() (ctrl.Result, error) {
	// Check CAPI machine owners for the CloudStack machines found.
	for _, machine := range r.Machines {
		for _, ref := range machine.OwnerReferences {
			// Skip the direct CAPI Machine owner; inspect the higher-level owner
			// (MachineSet, KubeadmControlPlane, EtcdadmCluster, ...).
			if ref.Kind == "Machine" {
				continue
			}
			owner := &unstructured.Unstructured{}
			owner.SetGroupVersionKind(schema.FromAPIVersionAndKind(ref.APIVersion, ref.Kind))
			if err := r.K8sClient.Get(r.RequestCtx, client.ObjectKey{Namespace: machine.Namespace, Name: ref.Name}, owner); err != nil {
				return ctrl.Result{}, err
			}
			specReplicas, statusReplicas, err := replicasLargerThanOne(owner, ref.Name, machine.Name)
			if err != nil {
				return ctrl.Result{}, err
			}
			if specReplicas != statusReplicas {
				// Dropped the stray unpaired "machineOwner" argument: the variadic
				// key/value list must have an even number of elements.
				return r.RequeueWithMessage("spec.replicas <> status.replicas, ", "owner", ref.Name)
			}

			statusReady, found, err := unstructured.NestedBool(owner.Object, "status", "ready")
			// NestedBool reports found=false whenever err != nil, so the previous
			// `found && err != nil` guard silently swallowed type errors.
			if err != nil {
				return ctrl.Result{}, err
			}
			if found && !statusReady {
				return r.RequeueWithMessage("status.ready not true, ", "owner", ref.Name)
			}

			statusReadyReplicas, found, err := unstructured.NestedInt64(owner.Object, "status", "readyReplicas")
			if err != nil {
				return ctrl.Result{}, err
			}
			if found && statusReadyReplicas != statusReplicas {
				return r.RequeueWithMessage("status.replicas <> status.readyReplicas, ", "owner", ref.Name, "status.replicas", statusReplicas, "status.readyReplicas", statusReadyReplicas)
			}
		}
	}
	return ctrl.Result{}, nil
}

// replicasLargerThanOne reads spec.replicas and status.replicas from the given owner object.
// It returns an error when either field is absent, or when spec.replicas is below 2 — a
// machine belonging to a single-replica owner cannot be moved away from the failure domain.
func replicasLargerThanOne(owner *unstructured.Unstructured, ownerName, machineName string) (int64, int64, error) {
	spec, ok, err := unstructured.NestedInt64(owner.Object, "spec", "replicas")
	switch {
	case err != nil:
		return 0, 0, err
	case !ok:
		return 0, 0, errors.Errorf("spec.replicas not found in %s", ownerName)
	}

	status, ok, err := unstructured.NestedInt64(owner.Object, "status", "replicas")
	switch {
	case err != nil:
		return spec, 0, err
	case !ok:
		return spec, 0, errors.Errorf("status.replicas not found in %s", ownerName)
	}

	if spec < 2 {
		return spec, 0, errors.Errorf("spec.replicas < 2 in %s, %s cannot be moved away from failure domain", ownerName, machineName)
	}
	return spec, status, nil
}

// ClearMachines deletes the owning CAPI Machine of each CloudStack machine still in the
// failure domain. At most one deletion is started per reconcile: the runner requeues right
// after issuing a delete, and also while a previously issued delete is still in progress,
// until no machines remain in the domain.
func (r *CloudStackFailureDomainReconciliationRunner) ClearMachines() (ctrl.Result, error) {
	for _, csMachine := range r.Machines {
		for _, ref := range csMachine.OwnerReferences {
			if ref.Kind != "Machine" {
				continue
			}
			// Get populates the machine; the previous pre-Get Name/Namespace
			// assignments were redundant and have been removed.
			machine := &clusterv1.Machine{}
			if err := r.K8sClient.Get(r.RequestCtx, client.ObjectKey{Namespace: r.ReconciliationSubject.Namespace, Name: ref.Name}, machine); err != nil {
				return ctrl.Result{}, err
			}
			if !machine.DeletionTimestamp.IsZero() {
				// A delete from an earlier reconcile is still in flight; wait for it.
				return r.RequeueWithMessage("machine is being deleted, ", "machine", machine.Name)
			}
			if err := r.K8sClient.Delete(r.RequestCtx, machine); err != nil {
				return ctrl.Result{}, err
			}
			return r.RequeueWithMessage("start to delete machine, ", "machine", machine.Name)
		}
	}
	// Removed stale diff residue that referenced an undefined `machines` variable here.
	return ctrl.Result{}, nil
}

Expand Down

0 comments on commit 6606010

Please sign in to comment.