diff --git a/cmd/kops/toolbox_dump.go b/cmd/kops/toolbox_dump.go index 2887903b34056..b2e484844ba18 100644 --- a/cmd/kops/toolbox_dump.go +++ b/cmd/kops/toolbox_dump.go @@ -19,12 +19,10 @@ package main import ( "context" "encoding/json" - "errors" "fmt" "io" "os" "path/filepath" - "slices" "strings" "github.com/spf13/cobra" @@ -36,7 +34,6 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" "k8s.io/kops/pkg/apis/kops" - "k8s.io/kops/pkg/apis/kops/util" "k8s.io/kops/pkg/commands/commandutils" "k8s.io/kops/pkg/dump" "k8s.io/kops/pkg/resources" @@ -195,11 +192,6 @@ func RunToolboxDump(ctx context.Context, f commandutils.Factory, out io.Writer, } } - err = truncateNodeList(&nodes, options.MaxNodes) - if err != nil { - klog.Warningf("not limiting number of nodes dumped: %v", err) - } - sshConfig := &ssh.ClientConfig{ Config: ssh.Config{}, User: options.SSHUser, @@ -239,24 +231,19 @@ func RunToolboxDump(ctx context.Context, f commandutils.Factory, out io.Writer, } } } - } - dumper := dump.NewLogDumper(bastionAddress, sshConfig, keyRing, options.Dir) - - var additionalIPs []string - var additionalPrivateIPs []string - if cloudResources != nil { - for _, instance := range cloudResources.Instances { - if len(instance.PublicAddresses) != 0 { - additionalIPs = append(additionalIPs, instance.PublicAddresses[0]) - } else if len(instance.PrivateAddresses) != 0 { - additionalPrivateIPs = append(additionalPrivateIPs, instance.PrivateAddresses[0]) - } else { - klog.Warningf("no IP for instance %q", instance.Name) + // If we don't have a bastion, use a control plane instance that has public IPs + if bastionAddress == "" { + for _, instance := range cloudResources.Instances { + if strings.Contains(instance.Name, "control-plane") && len(instance.PublicAddresses) > 0 { + bastionAddress = instance.PublicAddresses[0] + } } } } - if err := dumper.DumpAllNodes(ctx, nodes, options.MaxNodes, additionalIPs, additionalPrivateIPs); err != nil { + dumper := dump.NewLogDumper(bastionAddress, sshConfig, keyRing, options.Dir) + + if err := dumper.DumpAllNodes(ctx, nodes, options.MaxNodes, cloudResources); err != nil { klog.Warningf("error dumping nodes: %v", err) } @@ -309,20 +296,3 @@ func RunToolboxDump(ctx context.Context, f commandutils.Factory, out io.Writer, } return nil } - -func truncateNodeList(nodes *corev1.NodeList, max int) error { - if max < 0 { - return errors.New("--max-nodes must be greater than zero") - } - // Move control plane nodes to the start of the list and truncate the remainder - slices.SortFunc[[]corev1.Node](nodes.Items, func(a corev1.Node, e corev1.Node) int { - if role := util.GetNodeRole(&a); role == "control-plane" || role == "apiserver" { - return -1 - } - return 1 - }) - if len(nodes.Items) > max { - nodes.Items = nodes.Items[:max] - } - return nil -} diff --git a/cmd/kops/toolbox_dump_test.go b/cmd/kops/toolbox_dump_test.go deleted file mode 100644 index b5e5e7f9a14af..0000000000000 --- a/cmd/kops/toolbox_dump_test.go +++ /dev/null @@ -1,111 +0,0 @@ -/* -Copyright 2023 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package main - -import ( - "testing" - - "github.com/stretchr/testify/assert" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func TestTruncateNodeList(t *testing.T) { - cases := []struct { - name string - input []corev1.Node - max int - expected []corev1.Node - err bool - }{ - { - name: "less than max", - input: []corev1.Node{ - makeNode(), - makeNode(), - makeControlPlaneNode(), - }, - max: 5, - expected: []corev1.Node{ - makeControlPlaneNode(), - makeNode(), - makeNode(), - }, - }, - { - name: "truncate", - input: []corev1.Node{ - makeNode(), - makeNode(), - makeNode(), - makeControlPlaneNode(), - makeNode(), - }, - max: 4, - expected: []corev1.Node{ - makeControlPlaneNode(), - makeNode(), - makeNode(), - makeNode(), - }, - }, - { - name: "less than zero", - input: []corev1.Node{ - makeNode(), - makeNode(), - makeNode(), - makeControlPlaneNode(), - makeNode(), - }, - max: -1, - err: true, - }, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - nodeList := corev1.NodeList{Items: tc.input} - err := truncateNodeList(&nodeList, tc.max) - if tc.err { - assert.Error(t, err) - } else { - assert.NoError(t, err) - assert.Equal(t, tc.expected, nodeList.Items) - } - }) - } -} - -func makeControlPlaneNode() corev1.Node { - return corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - "node-role.kubernetes.io/control-plane": "", - }, - }, - } -} - -func makeNode() corev1.Node { - return corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - "node-role.kubernetes.io/node": "", - }, - }, - } -} diff --git a/pkg/dump/dumper.go b/pkg/dump/dumper.go index db92991183ea0..1bddb76cc6816 100644 --- a/pkg/dump/dumper.go +++ b/pkg/dump/dumper.go @@ -21,11 +21,9 @@ import ( "context" "fmt" "io" - "log" "net" "os" "path/filepath" - "slices" "strings" "time" @@ -33,6 +31,7 @@ import ( "golang.org/x/crypto/ssh/agent" corev1 "k8s.io/api/core/v1" "k8s.io/klog/v2" + "k8s.io/kops/pkg/resources" ) // logDumper gets all the nodes from a kubernetes cluster and dumps a well-known set of logs @@ -53,7 +52,7 @@ func NewLogDumper(bastionAddress string, sshConfig *ssh.ClientConfig, keyRing ag sshConfig: sshConfig, } if bastionAddress != "" { - log.Printf("detected a bastion instance, with the address: %s", bastionAddress) + klog.Infof("detected a bastion instance, with the address: %s", bastionAddress) sshClientFactory.bastion = bastionAddress } @@ -105,78 +104,86 @@ func NewLogDumper(bastionAddress string, sshConfig *ssh.ClientConfig, keyRing ag // if the IPs are not found from kubectl get nodes, then these will be dumped also. // This allows for dumping log on nodes even if they don't register as a kubernetes // node, or if a node fails to register, or if the whole cluster fails to start. -func (d *logDumper) DumpAllNodes(ctx context.Context, nodes corev1.NodeList, maxNodesToDump int, additionalIPs, additionalPrivateIPs []string) error { +func (d *logDumper) DumpAllNodes(ctx context.Context, nodes corev1.NodeList, maxNodesToDump int, cloudResources *resources.Dump) error { var special, regular []*corev1.Node + var missingK8sNodes []*resources.Instance var dumped []string - log.Printf("starting to dump %d nodes fetched through the Kubernetes APIs", len(nodes.Items)) - for i := range nodes.Items { - node := &nodes.Items[i] - - if _, ok := node.Labels["node-role.kubernetes.io/master"]; ok { - special = append(special, node) - continue - } - if _, ok := node.Labels["node-role.kubernetes.io/control-plane"]; ok { - special = append(special, node) - continue + foundInstanceNames := make(map[string]struct{}) + for _, cloudNode := range cloudResources.Instances { + for _, k8sNode := range nodes.Items { + if k8sNode.Name == cloudNode.Name { + foundInstanceNames[cloudNode.Name] = struct{}{} + if _, ok := k8sNode.Labels["node-role.kubernetes.io/master"]; ok { + special = append(special, &k8sNode) + continue + } + if _, ok := k8sNode.Labels["node-role.kubernetes.io/control-plane"]; ok { + special = append(special, &k8sNode) + continue + } + if _, ok := k8sNode.Labels["node-role.kubernetes.io/api-server"]; ok { + special = append(special, &k8sNode) + continue + } + regular = append(regular, &k8sNode) + } } - if _, ok := node.Labels["node-role.kubernetes.io/api-server"]; ok { - special = append(special, node) - continue + } + + for _, cloudNode := range cloudResources.Instances { + if _, found := foundInstanceNames[cloudNode.Name]; !found { + missingK8sNodes = append(missingK8sNodes, cloudNode) } + } - regular = append(regular, node) + if len(missingK8sNodes) > 0 { + klog.V(2).Infof("number of nodes from kubernetes (%d) differs from number of instances from cloud resources (%d)", len(nodes.Items), len(cloudResources.Instances)) } + // Dumping priority + // 1. control plane & special nodes + // 2. IP of nodes that haven't joined the kubernetes cluster aka unregistered nodes + // 3. remaining Kubernetes + + klog.Infof("starting to dump %d control plane nodes fetched through the Kubernetes APIs", len(special)) + for i := range special { node := special[i] ip, err := d.dumpRegistered(ctx, node) if err != nil { - log.Printf("could not dump node %s: %v", node.Name, err) + klog.Infof("could not dump node %s: %v", node.Name, err) } else { dumped = append(dumped, ip) } } - for i := range regular { + for i := range missingK8sNodes { if len(dumped) >= maxNodesToDump { - log.Printf("stopping dumping nodes: %d nodes dumped", maxNodesToDump) + klog.Infof("stopping dumping nodes: %d nodes dumped", maxNodesToDump) return nil } - node := regular[i] - ip, err := d.dumpRegistered(ctx, node) + node := missingK8sNodes[i] + ip, err := d.dumpNotRegistered(ctx, node) if err != nil { - log.Printf("could not dump node %s: %v", node.Name, err) + klog.Infof("could not dump node %s: %v", node.Name, err) } else { dumped = append(dumped, ip) } } - notDumped := findInstancesNotDumped(additionalIPs, dumped) - for _, ip := range notDumped { - if len(dumped) >= maxNodesToDump { - log.Printf("stopping dumping nodes: %d nodes dumped", maxNodesToDump) - return nil - } - err := d.dumpNotRegistered(ctx, ip, false) - if err != nil { - return err - } - dumped = append(dumped, ip) - } - - notDumped = findInstancesNotDumped(additionalPrivateIPs, dumped) - for _, ip := range notDumped { + for i := range regular { if len(dumped) >= maxNodesToDump { - log.Printf("stopping dumping nodes: %d nodes dumped", maxNodesToDump) + klog.Infof("stopping dumping nodes: %d nodes dumped", maxNodesToDump) return nil } - err := d.dumpNotRegistered(ctx, ip, true) + node := regular[i] + ip, err := d.dumpRegistered(ctx, node) if err != nil { - return err + klog.Infof("could not dump node %s: %v", node.Name, err) + } else { + dumped = append(dumped, ip) } - dumped = append(dumped, ip) } return nil @@ -184,7 +191,7 @@ func (d *logDumper) DumpAllNodes(ctx context.Context, nodes corev1.NodeList, max func (d *logDumper) dumpRegistered(ctx context.Context, node *corev1.Node) (string, error) { if ctx.Err() != nil { - log.Printf("stopping dumping nodes: %v", ctx.Err()) + klog.Infof("stopping dumping nodes: %v", ctx.Err()) return "", ctx.Err() } @@ -212,29 +219,21 @@ func (d *logDumper) dumpRegistered(ctx context.Context, node *corev1.Node) (stri } } -func (d *logDumper) dumpNotRegistered(ctx context.Context, ip string, useBastion bool) error { +func (d *logDumper) dumpNotRegistered(ctx context.Context, node *resources.Instance) (string, error) { if ctx.Err() != nil { - log.Printf("stopping dumping nodes: %v", ctx.Err()) - return ctx.Err() + klog.Infof("stopping dumping nodes: %v", ctx.Err()) + return "", ctx.Err() } - log.Printf("dumping node not registered in kubernetes: %s", ip) - err := d.dumpNode(ctx, ip, ip, useBastion) - if err != nil { - log.Printf("error dumping node %s: %v", ip, err) + klog.Infof("dumping node not registered in kubernetes: %s", node.Name) + if len(node.PublicAddresses) > 0 { + return node.PublicAddresses[0], d.dumpNode(ctx, node.PublicAddresses[0], node.PublicAddresses[0], false) } - return nil -} -// findInstancesNotDumped returns ips from the slice that do not appear as any address of the nodes -func findInstancesNotDumped(ips, dumped []string) []string { - var notDumped []string - for _, ip := range ips { - if !slices.Contains(dumped, ip) { - notDumped = append(notDumped, ip) - } + if len(node.PrivateAddresses) > 0 { + return node.PrivateAddresses[0], d.dumpNode(ctx, node.PrivateAddresses[0], node.PrivateAddresses[0], true) } - return notDumped + return "", fmt.Errorf("no known addresses for node %s", node.Name) } // DumpNode connects to a node and dumps the logs. @@ -243,7 +242,7 @@ func (d *logDumper) dumpNode(ctx context.Context, name string, ip string, useBas return fmt.Errorf("could not find address for %v, ", name) } - log.Printf("Dumping node %s", name) + klog.Infof("Dumping node %s", name) n, err := d.connectToNode(ctx, name, ip, useBastion) if err != nil { @@ -256,11 +255,11 @@ func (d *logDumper) dumpNode(ctx context.Context, name string, ip string, useBas // TODO(justinsb): clean up / rationalize errors := n.dump(ctx) for _, e := range errors { - log.Printf("error dumping node %s: %v", name, e) + klog.Warningf("error dumping node %s: %v", name, e) } if err := n.Close(); err != nil { - log.Printf("error closing connection: %v", err) + klog.Warningf("error closing connection: %v", err) } return nil @@ -448,7 +447,7 @@ func (n *logDumperNode) listSystemdUnits(ctx context.Context) ([]string, error) // shellToFile executes a command and copies the output to a file func (n *logDumperNode) shellToFile(ctx context.Context, command string, destPath string) error { if err := os.MkdirAll(filepath.Dir(destPath), 0o755); err != nil { - log.Printf("unable to mkdir on %q: %v", filepath.Dir(destPath), err) + klog.Warningf("unable to mkdir on %q: %v", filepath.Dir(destPath), err) } f, err := os.Create(destPath) @@ -501,7 +500,7 @@ func (s *sshClientImplementation) ExecPiped(ctx context.Context, cmd string, std select { case <-ctx.Done(): - log.Print("closing SSH tcp connection due to context completion") + klog.Infof("closing SSH tcp connection due to context completion") // terminate the TCP connection to force a disconnect - we assume everyone is using the same context. // We could make this better by sending a signal on the session, waiting and then closing the session, @@ -610,7 +609,7 @@ func (f *sshClientFactoryImplementation) Dial(ctx context.Context, host string, select { case <-ctx.Done(): - log.Print("cancelling SSH tcp connection due to context completion") + klog.Infof("cancelling SSH tcp connection due to context completion") conn.Close() // Close the TCP connection to force cancellation <-finished // Wait for cancellation return nil, ctx.Err() diff --git a/pkg/resources/gce/dump.go b/pkg/resources/gce/dump.go index ca42a803c5877..c7b94583e590c 100644 --- a/pkg/resources/gce/dump.go +++ b/pkg/resources/gce/dump.go @@ -19,6 +19,7 @@ package gce import ( "context" "fmt" + "strings" "sync" compute "google.golang.org/api/compute/v1" @@ -77,6 +78,21 @@ func DumpManagedInstance(op *resources.DumpOperation, r *resources.Resource) err } } + isControlPlane := false + for key, value := range instanceDetails.Labels { + if !strings.HasPrefix(key, gce.GceLabelNameRolePrefix) { + continue + } + if value == "control-plane" { + isControlPlane = true + } else { + i.Roles = append(i.Roles, value) + } + } + if isControlPlane { + i.Roles = append(i.Roles, "control-plane") + } + op.Dump.Instances = append(op.Dump.Instances, i) op.Dump.Resources = append(op.Dump.Resources, instanceDetails)