Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support draining multiple nodes #469

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
92 changes: 58 additions & 34 deletions chaoslib/litmus/node-drain/lib/node-drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,16 @@ func PrepareNodeDrain(experimentsDetails *experimentTypes.ExperimentDetails, cli
common.WaitForDuration(experimentsDetails.RampTime)
}

if experimentsDetails.TargetNode == "" {
if experimentsDetails.TargetNodes == "" {
//Select node for kubelet-service-kill
experimentsDetails.TargetNode, err = common.GetNodeName(experimentsDetails.AppNS, experimentsDetails.AppLabel, experimentsDetails.NodeLabel, clients)
experimentsDetails.TargetNodes, err = common.GetNodeName(experimentsDetails.AppNS, experimentsDetails.AppLabel, experimentsDetails.NodeLabel, clients)
if err != nil {
return err
}
}

if experimentsDetails.EngineName != "" {
msg := "Injecting " + experimentsDetails.ExperimentName + " chaos on " + experimentsDetails.TargetNode + " node"
msg := "Injecting " + experimentsDetails.ExperimentName + " chaos on " + experimentsDetails.TargetNodes + " node"
types.SetEngineEventAttributes(eventsDetails, types.ChaosInject, msg, "Normal", chaosDetails)
events.GenerateEvents(eventsDetails, clients, chaosDetails, "ChaosEngine")
}
Expand Down Expand Up @@ -114,45 +114,65 @@ func PrepareNodeDrain(experimentsDetails *experimentTypes.ExperimentDetails, cli
// drainNode drain the application node
func drainNode(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error {

select {
case <-inject:
// stopping the chaos execution, if abort signal received
os.Exit(0)
default:
log.Infof("[Inject]: Draining the %v node", experimentsDetails.TargetNode)
targetNodes := strings.Split(experimentsDetails.TargetNodes, ",")
if len(targetNodes) == 0 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

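Even for an empty string value in experimentsDetails.TargetNodes, len(targetNodes) will be equal to 1, because strings.Split returns a single-element slice containing the empty string when the separator is not found. We can modify the check as follows: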

Suggested change
if len(targetNodes) == 0 {
if experimentsDetails.TargetNodes == "" {

return errors.Errorf("No target nodes provided, expected the comma-separated names of one or more nodes")
}

command := exec.Command("kubectl", "drain", experimentsDetails.TargetNode, "--ignore-daemonsets", "--delete-local-data", "--force", "--timeout", strconv.Itoa(experimentsDetails.ChaosDuration)+"s")
var out, stderr bytes.Buffer
command.Stdout = &out
command.Stderr = &stderr
if err := command.Run(); err != nil {
log.Infof("Error String: %v", stderr.String())
return errors.Errorf("Unable to drain the %v node, err: %v", experimentsDetails.TargetNode, err)
}
log.Infof("Target nodes list: %v", targetNodes)
for _, targetNode := range targetNodes {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also handle the panic case when the length of the list is zero, and return an error message asking the user to provide the target node name?

Copy link
Member

@uditgaurav uditgaurav Jan 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When we run chaos on multiple nodes, we could perform the pre- and post-chaos checks for all of the target nodes.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can also change experimentsDetails.TargetNode to experimentsDetails.TargetNodes in the structure and other places where it is used.


common.SetTargets(experimentsDetails.TargetNode, "injected", "node", chaosDetails)
select {
case <-inject:
// stopping the chaos execution, if abort signal received
os.Exit(0)
default:
log.Infof("[Inject]: Draining the %v node", targetNode)

command := exec.Command("kubectl", "drain", targetNode, "--ignore-daemonsets", "--delete-emptydir-data", "--force", "--timeout", strconv.Itoa(experimentsDetails.ChaosDuration)+"s")
var out, stderr bytes.Buffer
command.Stdout = &out
command.Stderr = &stderr
if err := command.Run(); err != nil {
log.Infof("Error String: %v", stderr.String())
return errors.Errorf("Unable to drain the %v node, err: %v", targetNode, err)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can include the stderr output in the returned error here, since err will only contain the exit status description (e.g. "exit status 1").

}

return retry.
Times(uint(experimentsDetails.Timeout / experimentsDetails.Delay)).
Wait(time.Duration(experimentsDetails.Delay) * time.Second).
Try(func(attempt uint) error {
nodeSpec, err := clients.KubeClient.CoreV1().Nodes().Get(experimentsDetails.TargetNode, v1.GetOptions{})
if err != nil {
return err
}
if !nodeSpec.Spec.Unschedulable {
return errors.Errorf("%v node is not in unschedulable state", experimentsDetails.TargetNode)
}
return nil
})
common.SetTargets(targetNode, "injected", "node", chaosDetails)

err = retry.
Times(uint(experimentsDetails.Timeout / experimentsDetails.Delay)).
Wait(time.Duration(experimentsDetails.Delay) * time.Second).
Try(func(attempt uint) error {
nodeSpec, err := clients.KubeClient.CoreV1().Nodes().Get(targetNode, v1.GetOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
return nil
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we add a log here to specify that the resource was not found?

} else {
return err
}
}
if !nodeSpec.Spec.Unschedulable {
return errors.Errorf("%v node is not in unschedulable state", targetNode)
}
return nil
})
if err != nil {
return err
}
}
}
return nil
}

// uncordonNode uncordon the application node
func uncordonNode(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error {

targetNodes := strings.Split(experimentsDetails.TargetNode, ",")
targetNodes := strings.Split(experimentsDetails.TargetNodes, ",")
if len(targetNodes) == 0 {
return errors.Errorf("No target nodes provided, expected the comma-separated names of one or more nodes")
}

for _, targetNode := range targetNodes {

//Check node exist before uncordon the node
Expand Down Expand Up @@ -183,7 +203,11 @@ func uncordonNode(experimentsDetails *experimentTypes.ExperimentDetails, clients
Times(uint(experimentsDetails.Timeout / experimentsDetails.Delay)).
Wait(time.Duration(experimentsDetails.Delay) * time.Second).
Try(func(attempt uint) error {
targetNodes := strings.Split(experimentsDetails.TargetNode, ",")
targetNodes := strings.Split(experimentsDetails.TargetNodes, ",")
if len(targetNodes) == 0 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as above: this check will never trigger for an empty string, because strings.Split still returns a slice of length 1.

return errors.Errorf("No target nodes provided, expected the comma-separated names of one or more nodes")
}

for _, targetNode := range targetNodes {
nodeSpec, err := clients.KubeClient.CoreV1().Nodes().Get(targetNode, v1.GetOptions{})
if err != nil {
Expand All @@ -194,7 +218,7 @@ func uncordonNode(experimentsDetails *experimentTypes.ExperimentDetails, clients
}
}
if nodeSpec.Spec.Unschedulable {
return errors.Errorf("%v node is in unschedulable state", experimentsDetails.TargetNode)
return errors.Errorf("%v node is in unschedulable state", targetNode)
}
}
return nil
Expand Down
37 changes: 25 additions & 12 deletions experiments/generic/node-drain/experiment/node-drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package experiment

import (
"os"
"strings"

"github.com/litmuschaos/chaos-operator/pkg/apis/litmuschaos/v1alpha1"
litmusLIB "github.com/litmuschaos/litmus-go/chaoslib/litmus/node-drain/lib"
Expand Down Expand Up @@ -64,10 +65,18 @@ func NodeDrain(clients clients.ClientSets) {
//DISPLAY THE APP INFORMATION
log.InfoWithValues("[Info]: The application information is as follows", logrus.Fields{
"Node Label": experimentsDetails.NodeLabel,
"Target Node": experimentsDetails.TargetNode,
"Target Nodes": experimentsDetails.TargetNodes,
"Chaos Duration": experimentsDetails.ChaosDuration,
})

targetNodes := strings.Split(experimentsDetails.TargetNodes, ",")
if len(targetNodes) == 0 {
log.Errorf("No target nodes provided, expected the comma-separated names of one or more nodes")
failStep := "[pre-chaos]: No target nodes provided, expected the comma-separated names of one or more nodes"
result.RecordAfterFailure(&chaosDetails, &resultDetails, failStep, clients, &eventsDetails)
return
}

// Calling AbortWatcher go routine, it will continuously watch for the abort signal and generate the required events and result
go common.AbortWatcherWithoutExit(experimentsDetails.ExperimentName, clients, &resultDetails, &chaosDetails, &eventsDetails)

Expand All @@ -93,13 +102,15 @@ func NodeDrain(clients clients.ClientSets) {

// Checking the status of target nodes
log.Info("[Status]: Getting the status of target nodes")
if err := status.CheckNodeStatus(experimentsDetails.TargetNode, experimentsDetails.Timeout, experimentsDetails.Delay, clients); err != nil {
log.Errorf("Target nodes are not in the ready state, err: %v", err)
failStep := "[pre-chaos]: Failed to verify the status of nodes, err: " + err.Error()
types.SetEngineEventAttributes(&eventsDetails, types.PreChaosCheck, "NUT: Not Ready", "Warning", &chaosDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
result.RecordAfterFailure(&chaosDetails, &resultDetails, failStep, clients, &eventsDetails)
return
for _, targetNode := range targetNodes {
if err := status.CheckNodeStatus(targetNode, experimentsDetails.Timeout, experimentsDetails.Delay, clients); err != nil {
log.Errorf("Target nodes are not in the ready state, err: %v", err)
failStep := "[pre-chaos]: Failed to verify the status of nodes, err: " + err.Error()
types.SetEngineEventAttributes(&eventsDetails, types.PreChaosCheck, "NUT: Not Ready", "Warning", &chaosDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
result.RecordAfterFailure(&chaosDetails, &resultDetails, failStep, clients, &eventsDetails)
return
}
}

if experimentsDetails.EngineName != "" {
Expand Down Expand Up @@ -166,10 +177,12 @@ func NodeDrain(clients clients.ClientSets) {

// Checking the status of target nodes
log.Info("[Status]: Getting the status of target nodes")
if err := status.CheckNodeStatus(experimentsDetails.TargetNode, experimentsDetails.Timeout, experimentsDetails.Delay, clients); err != nil {
log.Warnf("Target nodes are not in the ready state, you may need to manually recover the node, err: %v", err)
types.SetEngineEventAttributes(&eventsDetails, types.PostChaosCheck, "NUT: Not Ready", "Warning", &chaosDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
for _, targetNode := range targetNodes {
if err := status.CheckNodeStatus(targetNode, experimentsDetails.Timeout, experimentsDetails.Delay, clients); err != nil {
log.Warnf("Target nodes are not in the ready state, you may need to manually recover the node, err: %v", err)
types.SetEngineEventAttributes(&eventsDetails, types.PostChaosCheck, "NUT: Not Ready", "Warning", &chaosDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
}
}

if experimentsDetails.EngineName != "" {
Expand Down
2 changes: 1 addition & 1 deletion pkg/generic/node-drain/environment/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func GetENV(experimentDetails *experimentTypes.ExperimentDetails) {
experimentDetails.InstanceID = types.Getenv("INSTANCE_ID", "")
experimentDetails.ChaosPodName = types.Getenv("POD_NAME", "")
experimentDetails.AuxiliaryAppInfo = types.Getenv("AUXILIARY_APPINFO", "")
experimentDetails.TargetNode = types.Getenv("TARGET_NODE", "")
experimentDetails.TargetNodes = types.Getenv("TARGET_NODES", "")
experimentDetails.Delay, _ = strconv.Atoi(types.Getenv("STATUS_CHECK_DELAY", "2"))
experimentDetails.Timeout, _ = strconv.Atoi(types.Getenv("STATUS_CHECK_TIMEOUT", "180"))
experimentDetails.TargetContainer = types.Getenv("TARGET_CONTAINER", "")
Expand Down
2 changes: 1 addition & 1 deletion pkg/generic/node-drain/types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ type ExperimentDetails struct {
InstanceID string
ChaosNamespace string
ChaosPodName string
TargetNode string
TargetNodes string
AuxiliaryAppInfo string
Timeout int
Delay int
Expand Down