diff --git a/cmd/installer/cli/enable_ha.go b/cmd/installer/cli/enable_ha.go new file mode 100644 index 000000000..8e46f57f0 --- /dev/null +++ b/cmd/installer/cli/enable_ha.go @@ -0,0 +1,86 @@ +package cli + +import ( + "context" + "fmt" + "os" + + "github.com/replicatedhq/embedded-cluster/pkg/addons" + "github.com/replicatedhq/embedded-cluster/pkg/helm" + "github.com/replicatedhq/embedded-cluster/pkg/kubeutils" + "github.com/replicatedhq/embedded-cluster/pkg/runtimeconfig" + rcutil "github.com/replicatedhq/embedded-cluster/pkg/runtimeconfig/util" + "github.com/replicatedhq/embedded-cluster/pkg/versions" + "github.com/sirupsen/logrus" + "github.com/spf13/cobra" +) + +// EnableHACmd is the command for enabling HA mode. +func EnableHACmd(ctx context.Context, name string) *cobra.Command { + cmd := &cobra.Command{ + Use: "enable-ha", + Short: fmt.Sprintf("Enable high availability for the %s cluster", name), + PreRunE: func(cmd *cobra.Command, args []string) error { + if os.Getuid() != 0 { + return fmt.Errorf("enable-ha command must be run as root") + } + + rcutil.InitBestRuntimeConfig(cmd.Context()) + + os.Setenv("KUBECONFIG", runtimeconfig.PathToKubeConfig()) + os.Setenv("TMPDIR", runtimeconfig.EmbeddedClusterTmpSubDir()) + + return nil + }, + PostRun: func(cmd *cobra.Command, args []string) { + runtimeconfig.Cleanup() + }, + RunE: func(cmd *cobra.Command, args []string) error { + if err := runEnableHA(cmd.Context()); err != nil { + return err + } + + return nil + }, + } + + return cmd +} + +func runEnableHA(ctx context.Context) error { + kcli, err := kubeutils.KubeClient() + if err != nil { + return fmt.Errorf("unable to get kube client: %w", err) + } + + canEnableHA, reason, err := addons.CanEnableHA(ctx, kcli) + if err != nil { + return fmt.Errorf("unable to check if HA can be enabled: %w", err) + } + if !canEnableHA { + logrus.Warnf("High availability cannot be enabled: %s", reason) + return NewErrorNothingElseToAdd(fmt.Errorf("high availability cannot be enabled: %s", reason)) + } + + in, err := kubeutils.GetLatestInstallation(ctx, kcli) + if err != nil { + return fmt.Errorf("unable to get latest installation: %w", err) + } + + airgapChartsPath := "" + if in.Spec.AirGap { + airgapChartsPath = runtimeconfig.EmbeddedClusterChartsSubDir() + } + + hcli, err := helm.NewClient(helm.HelmOptions{ + KubeConfig: runtimeconfig.PathToKubeConfig(), + K0sVersion: versions.K0sVersion, + AirgapPath: airgapChartsPath, + }) + if err != nil { + return fmt.Errorf("unable to create helm client: %w", err) + } + defer hcli.Close() + + return addons.EnableHA(ctx, kcli, hcli, in.Spec.AirGap, in.Spec.Network.ServiceCIDR, in.Spec.Proxy, in.Spec.Config) +} diff --git a/cmd/installer/cli/install.go b/cmd/installer/cli/install.go index fb63be628..b426b7274 100644 --- a/cmd/installer/cli/install.go +++ b/cmd/installer/cli/install.go @@ -78,9 +78,6 @@ type InstallCmdFlags struct { } // InstallCmd returns a cobra command for installing the embedded cluster. -// This is the upcoming version of install without the operator and where -// install does all of the work. This is a hidden command until it's tested -// and ready. 
func InstallCmd(ctx context.Context, name string) *cobra.Command { var flags InstallCmdFlags diff --git a/cmd/installer/cli/join.go b/cmd/installer/cli/join.go index b061438dd..650b737f1 100644 --- a/cmd/installer/cli/join.go +++ b/cmd/installer/cli/join.go @@ -41,9 +41,7 @@ type JoinCmdFlags struct { ignoreHostPreflights bool } -// This is the upcoming version of join without the operator and where -// join does all of the work. This is a hidden command until it's tested -// and ready. +// JoinCmd returns a cobra command for joining a node to the cluster. func JoinCmd(ctx context.Context, name string) *cobra.Command { var flags JoinCmdFlags @@ -202,22 +200,22 @@ func runJoin(ctx context.Context, name string, flags JoinCmdFlags, jcmd *kotsadm return nil } - airgapChartsPath := "" - if flags.isAirgap { - airgapChartsPath = runtimeconfig.EmbeddedClusterChartsSubDir() - } + if flags.enableHighAvailability { + airgapChartsPath := "" + if flags.isAirgap { + airgapChartsPath = runtimeconfig.EmbeddedClusterChartsSubDir() + } - hcli, err := helm.NewClient(helm.HelmOptions{ - KubeConfig: runtimeconfig.PathToKubeConfig(), - K0sVersion: versions.K0sVersion, - AirgapPath: airgapChartsPath, - }) - if err != nil { - return fmt.Errorf("unable to create helm client: %w", err) - } - defer hcli.Close() + hcli, err := helm.NewClient(helm.HelmOptions{ + KubeConfig: runtimeconfig.PathToKubeConfig(), + K0sVersion: versions.K0sVersion, + AirgapPath: airgapChartsPath, + }) + if err != nil { + return fmt.Errorf("unable to create helm client: %w", err) + } + defer hcli.Close() - if flags.enableHighAvailability { if err := maybeEnableHA(ctx, kcli, hcli, flags.isAirgap, cidrCfg.ServiceCIDR, jcmd.InstallationSpec.Proxy, jcmd.InstallationSpec.Config); err != nil { return fmt.Errorf("unable to enable high availability: %w", err) } @@ -494,7 +492,7 @@ func waitForNodeToJoin(ctx context.Context, kcli client.Client, hostname string, } func maybeEnableHA(ctx context.Context, kcli client.Client, hcli helm.Client, isAirgap bool, serviceCIDR string, proxy *ecv1beta1.ProxySpec, cfgspec *ecv1beta1.ConfigSpec) error { - canEnableHA, err := addons.CanEnableHA(ctx, kcli) + canEnableHA, _, err := addons.CanEnableHA(ctx, kcli) if err != nil { return fmt.Errorf("unable to check if HA can be enabled: %w", err) } diff --git a/cmd/installer/cli/root.go b/cmd/installer/cli/root.go index 8e03b1f73..41d2a7321 100644 --- a/cmd/installer/cli/root.go +++ b/cmd/installer/cli/root.go @@ -100,6 +100,7 @@ func RootCmd(ctx context.Context, name string) *cobra.Command { cmd.AddCommand(JoinCmd(ctx, name)) cmd.AddCommand(ShellCmd(ctx, name)) cmd.AddCommand(NodeCmd(ctx, name)) + cmd.AddCommand(EnableHACmd(ctx, name)) cmd.AddCommand(VersionCmd(ctx, name)) cmd.AddCommand(ResetCmd(ctx, name)) cmd.AddCommand(MaterializeCmd(ctx, name)) diff --git a/go.mod b/go.mod index dc7afe7b5..7697a85d3 100644 --- a/go.mod +++ b/go.mod @@ -41,7 +41,6 @@ require ( github.com/vmware-tanzu/velero v1.15.2 go.uber.org/multierr v1.11.0 golang.org/x/crypto v0.33.0 - golang.org/x/sync v0.11.0 golang.org/x/term v0.29.0 gopkg.in/yaml.v2 v2.4.0 gopkg.in/yaml.v3 v3.0.1 @@ -272,6 +271,7 @@ require ( go.opentelemetry.io/otel/trace v1.34.0 // indirect golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67 // indirect golang.org/x/mod v0.23.0 // indirect + golang.org/x/sync v0.11.0 // indirect golang.org/x/tools v0.28.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/api v0.197.0 // indirect diff --git a/pkg/addons/embeddedclusteroperator/upgrade.go 
b/pkg/addons/embeddedclusteroperator/upgrade.go index 84369d813..cd2e3e047 100644 --- a/pkg/addons/embeddedclusteroperator/upgrade.go +++ b/pkg/addons/embeddedclusteroperator/upgrade.go @@ -2,10 +2,10 @@ package embeddedclusteroperator import ( "context" - "log/slog" "github.com/pkg/errors" "github.com/replicatedhq/embedded-cluster/pkg/helm" + "github.com/sirupsen/logrus" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -15,7 +15,7 @@ func (e *EmbeddedClusterOperator) Upgrade(ctx context.Context, kcli client.Clien return errors.Wrap(err, "check if release exists") } if !exists { - slog.Info("Release not found, installing", "release", releaseName, "namespace", namespace) + logrus.Debugf("Release not found, installing release %s in namespace %s", releaseName, namespace) if err := e.Install(ctx, kcli, hcli, overrides, nil); err != nil { return errors.Wrap(err, "install") } diff --git a/pkg/addons/highavailability.go b/pkg/addons/highavailability.go index 6457a5e0d..0372b0953 100644 --- a/pkg/addons/highavailability.go +++ b/pkg/addons/highavailability.go @@ -7,6 +7,7 @@ import ( ecv1beta1 "github.com/replicatedhq/embedded-cluster/kinds/apis/v1beta1" "github.com/replicatedhq/embedded-cluster/pkg/addons/adminconsole" "github.com/replicatedhq/embedded-cluster/pkg/addons/registry" + registrymigrate "github.com/replicatedhq/embedded-cluster/pkg/addons/registry/migrate" "github.com/replicatedhq/embedded-cluster/pkg/addons/seaweedfs" "github.com/replicatedhq/embedded-cluster/pkg/constants" "github.com/replicatedhq/embedded-cluster/pkg/helm" @@ -15,32 +16,34 @@ import ( "github.com/replicatedhq/embedded-cluster/pkg/spinner" "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" - k8serrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" ) // CanEnableHA checks if high availability can be enabled in the cluster. -func CanEnableHA(ctx context.Context, kcli client.Client) (bool, error) { +func CanEnableHA(ctx context.Context, kcli client.Client) (bool, string, error) { in, err := kubeutils.GetLatestInstallation(ctx, kcli) if err != nil { - return false, errors.Wrap(err, "get latest installation") + return false, "", errors.Wrap(err, "get latest installation") } if in.Spec.HighAvailability { - return false, nil + return false, "already enabled", nil } if err := kcli.Get(ctx, types.NamespacedName{Name: constants.EcRestoreStateCMName, Namespace: "embedded-cluster"}, &corev1.ConfigMap{}); err == nil { - return false, nil // cannot enable HA during a restore - } else if !k8serrors.IsNotFound(err) { - return false, errors.Wrap(err, "get restore state configmap") + return false, "a restore is in progress", nil + } else if client.IgnoreNotFound(err) != nil { + return false, "", errors.Wrap(err, "get restore state configmap") } ncps, err := kubeutils.NumOfControlPlaneNodes(ctx, kcli) if err != nil { - return false, errors.Wrap(err, "check control plane nodes") + return false, "", errors.Wrap(err, "check control plane nodes") } - return ncps >= 3, nil + if ncps < 3 { + return false, "number of control plane nodes is less than 3", nil + } + return true, "", nil } // EnableHA enables high availability. 
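Note on the CanEnableHA hunk above: callers now receive a human-readable reason alongside the boolean, and the restore-state lookup uses controller-runtime's client.IgnoreNotFound, which returns nil for NotFound errors and the original error otherwise. A minimal sketch of the resulting error-handling pattern, reusing the names from this diff:

	err := kcli.Get(ctx, types.NamespacedName{Name: constants.EcRestoreStateCMName, Namespace: "embedded-cluster"}, &corev1.ConfigMap{})
	if err == nil {
		// the ConfigMap exists, so a restore is in progress and HA cannot be enabled yet
	} else if client.IgnoreNotFound(err) != nil {
		// any error other than NotFound is a real failure
		// (equivalent to: err != nil && !k8serrors.IsNotFound(err))
	} else {
		// NotFound: no restore in progress, continue with the remaining checks
	}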
@@ -60,36 +63,45 @@ func EnableHA(ctx context.Context, kcli client.Client, hcli helm.Client, isAirga ServiceCIDR: serviceCIDR, ProxyRegistryDomain: domains.ProxyRegistryDomain, } - exists, err := hcli.ReleaseExists(ctx, sw.Namespace(), sw.ReleaseName()) - if err != nil { - return errors.Wrap(err, "check if seaweedfs release exists") - } - if !exists { - logrus.Debugf("Installing seaweedfs") - if err := sw.Install(ctx, kcli, hcli, addOnOverrides(sw, cfgspec, nil), nil); err != nil { - return errors.Wrap(err, "install seaweedfs") - } - logrus.Debugf("Seaweedfs installed!") - } else { - logrus.Debugf("Seaweedfs already installed") + + logrus.Debugf("Maybe removing existing seaweedfs") + if err := sw.Uninstall(ctx, kcli); err != nil { + return errors.Wrap(err, "uninstall seaweedfs") } - // TODO (@salah): add support for end user overrides - reg := ®istry.Registry{ - ServiceCIDR: serviceCIDR, - ProxyRegistryDomain: domains.ProxyRegistryDomain, - IsHA: true, + logrus.Debugf("Installing seaweedfs") + if err := sw.Install(ctx, kcli, hcli, addOnOverrides(sw, cfgspec, nil), nil); err != nil { + return errors.Wrap(err, "install seaweedfs") } - logrus.Debugf("Migrating registry data") - if err := reg.Migrate(ctx, kcli, loading); err != nil { - return errors.Wrap(err, "migrate registry data") + logrus.Debugf("Seaweedfs installed!") + + // TODO: timeout + + loading.Infof("Migrating data for high availability") + logrus.Debugf("Migrating data for high availability") + + progressCh := make(chan registrymigrate.Progress) + errCh := make(chan error, 1) + + go func() { + errCh <- registrymigrate.RegistryData(ctx, kcli, progressCh) + close(errCh) + }() + + err := waitForDataMigrationAndLogProgress(loading, progressCh, errCh) + if err != nil { + return errors.Wrap(err, "registry data migration job failed") } - logrus.Debugf("Registry migration complete!") - logrus.Debugf("Upgrading registry") - if err := reg.Upgrade(ctx, kcli, hcli, addOnOverrides(reg, cfgspec, nil)); err != nil { - return errors.Wrap(err, "upgrade registry") + + logrus.Debugf("Data migration complete!") + + loading.Infof("Enabling registry high availability") + logrus.Debugf("Enabling registry high availability") + err = enableRegistryHA(ctx, kcli, hcli, serviceCIDR, cfgspec) + if err != nil { + return errors.Wrap(err, "enable registry high availability") } - logrus.Debugf("Registry upgraded!") + logrus.Debugf("Registry high availability enabled!") } loading.Infof("Updating the Admin Console for high availability") @@ -117,6 +129,23 @@ func EnableHA(ctx context.Context, kcli client.Client, hcli helm.Client, isAirga return nil } +// enableRegistryHA scales the registry deployment to the desired number of replicas. +func enableRegistryHA(ctx context.Context, kcli client.Client, hcli helm.Client, serviceCIDR string, cfgspec *ecv1beta1.ConfigSpec) error { + domains := runtimeconfig.GetDomains(cfgspec) + + // TODO (@salah): add support for end user overrides + r := ®istry.Registry{ + ServiceCIDR: serviceCIDR, + ProxyRegistryDomain: domains.ProxyRegistryDomain, + IsHA: true, + } + if err := r.Upgrade(ctx, kcli, hcli, addOnOverrides(r, cfgspec, nil)); err != nil { + return errors.Wrap(err, "upgrade registry") + } + + return nil +} + // EnableAdminConsoleHA enables high availability for the admin console. 
func EnableAdminConsoleHA(ctx context.Context, kcli client.Client, hcli helm.Client, isAirgap bool, serviceCIDR string, proxy *ecv1beta1.ProxySpec, cfgspec *ecv1beta1.ConfigSpec) error { domains := runtimeconfig.GetDomains(cfgspec) @@ -137,3 +166,16 @@ func EnableAdminConsoleHA(ctx context.Context, kcli client.Client, hcli helm.Cli return nil } + +func waitForDataMigrationAndLogProgress(progressWriter *spinner.MessageWriter, progressCh <-chan registrymigrate.Progress, errCh <-chan error) error { + for { + select { + case err := <-errCh: + return err + case progress := <-progressCh: + percent := progress.Current * 100 / progress.Total + logrus.Debugf("Migrating data for high availability (%d%%)", percent) + progressWriter.Infof("Migrating data for high availability (%d%%)", percent) + } + } +} diff --git a/pkg/addons/highavailability_test.go b/pkg/addons/highavailability_test.go index c1d67ce26..319ab6005 100644 --- a/pkg/addons/highavailability_test.go +++ b/pkg/addons/highavailability_test.go @@ -6,6 +6,7 @@ import ( "github.com/replicatedhq/embedded-cluster/kinds/apis/v1beta1" "github.com/replicatedhq/embedded-cluster/pkg/constants" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" v12 "k8s.io/api/core/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -22,10 +23,11 @@ func Test_canEnableHA(t *testing.T) { kcli client.Client } tests := []struct { - name string - args args - want bool - wantErr bool + name string + args args + want bool + wantReason string + wantErr bool }{ { name: "high availability is not enabled and there is three or more controller nodes", @@ -55,7 +57,8 @@ func Test_canEnableHA(t *testing.T) { &v12.Node{ObjectMeta: v1.ObjectMeta{Name: "node3"}}, ).Build(), }, - want: false, + want: false, + wantReason: "number of control plane nodes is less than 3", }, { name: "high availability is already enabled", @@ -70,7 +73,8 @@ func Test_canEnableHA(t *testing.T) { &v12.Node{ObjectMeta: v1.ObjectMeta{Name: "node3", Labels: controllerLabels}}, ).Build(), }, - want: false, + want: false, + wantReason: "already enabled", }, { name: "high availability is not enabled and there is three or more controller nodes but a restore is in progress", @@ -88,20 +92,23 @@ func Test_canEnableHA(t *testing.T) { &v12.Node{ObjectMeta: v1.ObjectMeta{Name: "node3", Labels: controllerLabels}}, ).Build(), }, - want: false, + want: false, + wantReason: "a restore is in progress", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { req := require.New(t) + assert := assert.New(t) ctx := context.Background() - got, err := CanEnableHA(ctx, tt.args.kcli) + got, reason, err := CanEnableHA(ctx, tt.args.kcli) if tt.wantErr { req.Error(err) return } req.NoError(err) - req.Equal(tt.want, got) + assert.Equal(tt.want, got) + assert.Equal(tt.wantReason, reason) }) } } diff --git a/pkg/addons/openebs/upgrade.go b/pkg/addons/openebs/upgrade.go index 1f9b9a7ed..d6b93b511 100644 --- a/pkg/addons/openebs/upgrade.go +++ b/pkg/addons/openebs/upgrade.go @@ -2,10 +2,10 @@ package openebs import ( "context" - "log/slog" "github.com/pkg/errors" "github.com/replicatedhq/embedded-cluster/pkg/helm" + "github.com/sirupsen/logrus" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -15,7 +15,7 @@ func (o *OpenEBS) Upgrade(ctx context.Context, kcli client.Client, hcli helm.Cli return errors.Wrap(err, "check if release exists") } if !exists { - slog.Info("Release not found, installing", "release", releaseName, "namespace", namespace) + logrus.Debugf("Release not found, installing release %s in 
namespace %s", releaseName, namespace) if err := o.Install(ctx, kcli, hcli, overrides, nil); err != nil { return errors.Wrap(err, "install") } diff --git a/pkg/addons/registry/migrate.go b/pkg/addons/registry/migrate.go deleted file mode 100644 index 6e7e8e4b3..000000000 --- a/pkg/addons/registry/migrate.go +++ /dev/null @@ -1,222 +0,0 @@ -package registry - -import ( - "archive/tar" - "bytes" - "context" - "fmt" - "io" - "path/filepath" - "strconv" - "strings" - - "github.com/aws/aws-sdk-go-v2/aws" - awsconfig "github.com/aws/aws-sdk-go-v2/config" - "github.com/aws/aws-sdk-go-v2/credentials" - s3manager "github.com/aws/aws-sdk-go-v2/feature/s3/manager" - "github.com/aws/aws-sdk-go-v2/service/s3" - "github.com/pkg/errors" - "github.com/replicatedhq/embedded-cluster/pkg/addons/seaweedfs" - "github.com/replicatedhq/embedded-cluster/pkg/spinner" - "github.com/sirupsen/logrus" - "golang.org/x/sync/errgroup" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/tools/remotecommand" - "sigs.k8s.io/controller-runtime/pkg/client" - k8sconfig "sigs.k8s.io/controller-runtime/pkg/client/config" -) - -var ( - s3Bucket = "registry" - s3RootDirectory = "registry" - labelSelector = "app=docker-registry" -) - -// Migrate runs a migration that copies data on disk in the registry PVC to the seaweedfs s3 store. -func (r *Registry) Migrate(ctx context.Context, kcli client.Client, progressWriter *spinner.MessageWriter) error { - s3Client, err := getS3Client(ctx, kcli, r.ServiceCIDR) - if err != nil { - return errors.Wrap(err, "get s3 client") - } - - logrus.Debug("Ensuring registry bucket") - if err := ensureRegistryBucket(ctx, s3Client); err != nil { - return errors.Wrap(err, "ensure registry bucket") - } - logrus.Debug("Registry bucket ensured!") - - pipeReader, pipeWriter := io.Pipe() - g, ctx := errgroup.WithContext(ctx) - - g.Go(func() error { - defer pipeWriter.Close() - return readRegistryData(ctx, pipeWriter) - }) - - g.Go(func() error { - return writeRegistryData(ctx, pipeReader, s3manager.NewUploader(s3Client), progressWriter) - }) - - logrus.Debug("Copying registry data") - if err := g.Wait(); err != nil { - return err - } - logrus.Debug("Registry data copied!") - - return nil -} - -func getS3Client(ctx context.Context, kcli client.Client, serviceCIDR string) (*s3.Client, error) { - accessKey, secretKey, err := seaweedfs.GetS3RWCreds(ctx, kcli) - if err != nil { - return nil, errors.Wrap(err, "get seaweedfs s3 rw creds") - } - - creds := credentials.NewStaticCredentialsProvider(accessKey, secretKey, "") - conf, err := awsconfig.LoadDefaultConfig( - ctx, - awsconfig.WithCredentialsProvider(creds), - awsconfig.WithRegion("us-east-1"), - ) - if err != nil { - return nil, errors.Wrap(err, "load aws config") - } - - s3URL, err := seaweedfs.GetS3URL(serviceCIDR) - if err != nil { - return nil, errors.Wrap(err, "get seaweedfs s3 endpoint") - } - - s3Client := s3.NewFromConfig(conf, func(o *s3.Options) { - o.UsePathStyle = true - o.BaseEndpoint = aws.String(s3URL) - }) - - return s3Client, nil -} - -func ensureRegistryBucket(ctx context.Context, s3Client *s3.Client) error { - _, err := s3Client.CreateBucket(ctx, &s3.CreateBucketInput{ - Bucket: &s3Bucket, - }) - if err != nil { - if !strings.Contains(err.Error(), "BucketAlreadyExists") { - return errors.Wrap(err, "create bucket") - } - } - return nil -} - -func readRegistryData(ctx context.Context, writer io.Writer) error { - return execInPod(ctx, 
[]string{"tar", "-c", "-C", "/var/lib/registry", "."}, writer) -} - -func writeRegistryData(ctx context.Context, reader io.Reader, s3Uploader *s3manager.Uploader, progressWriter *spinner.MessageWriter) error { - total, err := countRegistryFiles(ctx) - if err != nil { - return errors.Wrap(err, "count registry files") - } - - progress := 0 - tr := tar.NewReader(reader) - for { - header, err := tr.Next() - if err == io.EOF { - break - } - if err != nil { - return errors.Wrap(err, "read tar header") - } - - if header.FileInfo().IsDir() { - continue - } - - relPath, err := filepath.Rel("./", header.Name) - if err != nil { - return errors.Wrap(err, "get relative path") - } - - _, err = s3Uploader.Upload(ctx, &s3.PutObjectInput{ - Bucket: &s3Bucket, - Key: aws.String(filepath.Join(s3RootDirectory, relPath)), - Body: tr, - }) - if err != nil { - return errors.Wrap(err, "upload to s3") - } - - progress++ - progressWriter.Infof("Migrating data for high availability (%d%%)", (progress*100)/total) - } - - return nil -} - -func countRegistryFiles(ctx context.Context) (int, error) { - var stdout bytes.Buffer - if err := execInPod(ctx, []string{"sh", "-c", "find /var/lib/registry -type f | wc -l"}, &stdout); err != nil { - return 0, errors.Wrap(err, "exec in pod") - } - return strconv.Atoi(strings.TrimSpace(stdout.String())) -} - -func execInPod(ctx context.Context, command []string, stdout io.Writer) error { - cfg, err := k8sconfig.GetConfig() - if err != nil { - return errors.Wrap(err, "get kubernetes config") - } - - clientSet, err := kubernetes.NewForConfig(cfg) - if err != nil { - return errors.Wrap(err, "create kubernetes clientset") - } - - scheme := runtime.NewScheme() - if err := corev1.AddToScheme(scheme); err != nil { - return errors.Wrap(err, "add corev1 scheme") - } - - pods, err := clientSet.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ - LabelSelector: labelSelector, - }) - if err != nil { - return errors.Wrap(err, "list registry pods") - } - if len(pods.Items) == 0 { - return errors.New("no registry pods found") - } - podName := pods.Items[0].Name - - req := clientSet.CoreV1().RESTClient().Post(). - Resource("pods"). - Name(podName). - Namespace(namespace). 
- SubResource("exec") - - parameterCodec := runtime.NewParameterCodec(scheme) - req.VersionedParams(&corev1.PodExecOptions{ - Command: command, - Container: "docker-registry", - Stdout: true, - Stderr: true, - }, parameterCodec) - - executor, err := remotecommand.NewSPDYExecutor(cfg, "POST", req.URL()) - if err != nil { - return errors.Wrap(err, "create exec") - } - - var stderr bytes.Buffer - if err := executor.StreamWithContext(ctx, remotecommand.StreamOptions{ - Stdout: stdout, - Stderr: &stderr, - }); err != nil { - return fmt.Errorf("stream exec: %v: %s", err, stderr.String()) - } - - return nil -} diff --git a/pkg/addons/registry/migrate/migrate.go b/pkg/addons/registry/migrate/migrate.go new file mode 100644 index 000000000..bece1f37e --- /dev/null +++ b/pkg/addons/registry/migrate/migrate.go @@ -0,0 +1,230 @@ +package migrate + +import ( + "context" + "fmt" + "os" + "path/filepath" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + s3manager "github.com/aws/aws-sdk-go-v2/feature/s3/manager" + "github.com/aws/aws-sdk-go-v2/service/s3" + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" + "github.com/pkg/errors" + "github.com/replicatedhq/embedded-cluster/pkg/runtimeconfig" + "github.com/sirupsen/logrus" + appsv1 "k8s.io/api/apps/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + s3Bucket = "registry" + s3RootDirectory = "registry" + labelSelector = "app=docker-registry" +) + +type Progress struct { + Total int + Current int +} + +// RegistryData runs a migration that copies data on disk in the registry-data PVC to the seaweedfs +// s3 store. If it fails, it will scale the registry deployment back to 1. It takes a progress +// channel as an argument to report progress. +func RegistryData(ctx context.Context, cli client.Client, progressCh chan<- Progress) error { + defer close(progressCh) + + // TODO: should we check seaweedfs health? 
+ + // if the migration fails, we need to scale the registry back to 1 + success := false + + logrus.Debugf("Scaling registry to 0 replicas") + + err := registryScale(ctx, cli, 0) + if err != nil { + return fmt.Errorf("scale registry to 0 replicas: %w", err) + } + + defer func() { + r := recover() + + if !success { + logrus.Debugf("Scaling registry back to 1 replica after migration failure") + + // this should use the background context as we want it to run even if the context expired + err := registryScale(context.Background(), cli, 1) + if err != nil { + logrus.Errorf("Failed to scale registry back to 1 replica: %v", err) + } + } + + if r != nil { + panic(r) + } + }() + + logrus.Debugf("Connecting to s3") + + s3Client, err := getS3Client(ctx) + if err != nil { + return errors.Wrap(err, "get s3 client") + } + + logrus.Debugf("Ensuring registry bucket") + + err = ensureRegistryBucket(ctx, s3Client) + if err != nil { + return errors.Wrap(err, "ensure registry bucket") + } + + logrus.Debugf("Counting registry files") + + total, err := countRegistryFiles() + if err != nil { + return errors.Wrap(err, "count registry files") + } + + logrus.Debugf("Running registry data migration") + + s3Uploader := s3manager.NewUploader(s3Client) + + count := 0 + err = filepath.Walk("/registry", func(path string, info os.FileInfo, err error) error { + if err != nil { + return fmt.Errorf("walk: %w", err) + } + + if info.IsDir() { + return nil + } + + f, err := os.Open(path) + if err != nil { + return fmt.Errorf("open file: %w", err) + } + defer f.Close() + + relPath, err := filepath.Rel("/", path) + if err != nil { + return fmt.Errorf("get relative path: %w", err) + } + + logrus.Debugf("Uploading object: %s", relPath) + + var lasterr error + err = wait.ExponentialBackoffWithContext(ctx, wait.Backoff{ + Duration: 1 * time.Second, + Factor: 2, + Steps: 5, + }, func(ctx context.Context) (bool, error) { + _, err = s3Uploader.Upload(ctx, &s3.PutObjectInput{ + Bucket: ptr.To(s3Bucket), + Key: &relPath, + Body: f, + }) + lasterr = err + return err == nil, nil + }) + if err != nil { + if lasterr == nil { + lasterr = err + } + return fmt.Errorf("upload object: %w", lasterr) + } + + count++ + progressCh <- Progress{ + Total: total, + Current: count, + } + + return nil + }) + if err != nil { + return fmt.Errorf("walk registry data: %w", err) + } + + success = true + + logrus.Debugf("Registry data migration complete") + + return nil +} + +// registryScale scales the registry deployment to the given replica count. +// '0' and '1' are the only acceptable values. 
+func registryScale(ctx context.Context, cli client.Client, scale int32) error { + if scale != 0 && scale != 1 { + return fmt.Errorf("invalid scale: %d", scale) + } + + currentRegistry := &appsv1.Deployment{} + err := cli.Get(ctx, client.ObjectKey{Namespace: runtimeconfig.RegistryNamespace, Name: "registry"}, currentRegistry) + if err != nil { + return fmt.Errorf("get registry deployment: %w", err) + } + + currentRegistry.Spec.Replicas = &scale + + err = cli.Update(ctx, currentRegistry) + if err != nil { + return fmt.Errorf("update registry deployment: %w", err) + } + + return nil +} + +func getS3Client(ctx context.Context) (*s3.Client, error) { + creds := credentials.NewStaticCredentialsProvider(os.Getenv("s3AccessKey"), os.Getenv("s3SecretKey"), "") + conf, err := config.LoadDefaultConfig(ctx, + config.WithCredentialsProvider(creds), + config.WithRegion("us-east-1"), + ) + if err != nil { + return nil, fmt.Errorf("load aws config: %w", err) + } + + s3Client := s3.NewFromConfig(conf, func(o *s3.Options) { + o.UsePathStyle = true + o.BaseEndpoint = aws.String("http://seaweedfs-s3.seaweedfs:8333/") + }) + + return s3Client, nil +} + +func ensureRegistryBucket(ctx context.Context, s3Client *s3.Client) error { + _, err := s3Client.CreateBucket(ctx, &s3.CreateBucketInput{ + Bucket: ptr.To(s3Bucket), + }) + if err != nil { + var bne *s3types.BucketAlreadyExists + if !errors.As(err, &bne) { + return errors.Wrap(err, "create bucket") + } + } + return nil +} + +func countRegistryFiles() (int, error) { + var count int + err := filepath.Walk("/registry", func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if info.IsDir() { + return nil + } + count++ + return nil + }) + if err != nil { + return 0, fmt.Errorf("walk /registry directory: %w", err) + } + return count, nil +} diff --git a/pkg/addons/registry/upgrade.go b/pkg/addons/registry/upgrade.go index 979187d1d..b431a9eae 100644 --- a/pkg/addons/registry/upgrade.go +++ b/pkg/addons/registry/upgrade.go @@ -2,11 +2,11 @@ package registry import ( "context" - "log/slog" "github.com/pkg/errors" "github.com/replicatedhq/embedded-cluster/pkg/addons/seaweedfs" "github.com/replicatedhq/embedded-cluster/pkg/helm" + "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -26,7 +26,7 @@ func (r *Registry) Upgrade(ctx context.Context, kcli client.Client, hcli helm.Cl return errors.Wrap(err, "check if release exists") } if !exists { - slog.Info("Release not found, installing", "release", releaseName, "namespace", namespace) + logrus.Debugf("Release not found, installing release %s in namespace %s", releaseName, namespace) if err := r.Install(ctx, kcli, hcli, overrides, nil); err != nil { return errors.Wrap(err, "install") } @@ -60,7 +60,7 @@ func (r *Registry) Upgrade(ctx context.Context, kcli client.Client, hcli helm.Cl func (r *Registry) createUpgradePreRequisites(ctx context.Context, kcli client.Client) error { if r.IsHA { - if err := createS3Secret(ctx, kcli); err != nil { + if err := ensureS3Secret(ctx, kcli); err != nil { return errors.Wrap(err, "create s3 secret") } } @@ -68,7 +68,7 @@ func (r *Registry) createUpgradePreRequisites(ctx context.Context, kcli client.C return nil } -func createS3Secret(ctx context.Context, kcli client.Client) error { +func ensureS3Secret(ctx context.Context, kcli client.Client) error { accessKey, secretKey, err := seaweedfs.GetS3RWCreds(ctx, kcli) if err != nil { return errors.Wrap(err, "get 
seaweedfs s3 rw creds") diff --git a/pkg/addons/seaweedfs/install.go b/pkg/addons/seaweedfs/install.go index 19297261e..113c56b6c 100644 --- a/pkg/addons/seaweedfs/install.go +++ b/pkg/addons/seaweedfs/install.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "time" "github.com/pkg/errors" "github.com/replicatedhq/embedded-cluster/pkg/helm" @@ -13,36 +14,38 @@ import ( k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/wait" "sigs.k8s.io/controller-runtime/pkg/client" ) func (s *SeaweedFS) Install(ctx context.Context, kcli client.Client, hcli helm.Client, overrides []string, writer *spinner.MessageWriter) error { - if err := s.createPreRequisites(ctx, kcli); err != nil { - return errors.Wrap(err, "create prerequisites") - } + return s.Upgrade(ctx, kcli, hcli, overrides) +} - values, err := s.GenerateHelmValues(ctx, kcli, overrides) - if err != nil { - return errors.Wrap(err, "generate helm values") +func (s *SeaweedFS) Uninstall(ctx context.Context, kcli client.Client) error { + ns := corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespace, + }, + } + err := kcli.Delete(ctx, &ns) + if client.IgnoreNotFound(err) != nil { + return errors.Wrap(err, "delete namespace") } - _, err = hcli.Install(ctx, helm.InstallOptions{ - ReleaseName: releaseName, - ChartPath: s.ChartLocation(), - ChartVersion: Metadata.Version, - Values: values, - Namespace: namespace, - Labels: getBackupLabels(), + err = wait.PollUntilContextTimeout(ctx, 2*time.Second, 1*time.Minute, false, func(ctx context.Context) (bool, error) { + err := kcli.Get(ctx, client.ObjectKey{Name: namespace}, &corev1.Namespace{}) + return err != nil, nil }) if err != nil { - return errors.Wrap(err, "helm install") + return errors.Wrap(err, "wait for namespace to be deleted") } return nil } -func (s *SeaweedFS) createPreRequisites(ctx context.Context, kcli client.Client) error { - if err := createNamespace(ctx, kcli, namespace); err != nil { +func (s *SeaweedFS) ensurePreRequisites(ctx context.Context, kcli client.Client) error { + if err := ensureNamespace(ctx, kcli, namespace); err != nil { return errors.Wrap(err, "create namespace") } @@ -50,14 +53,14 @@ func (s *SeaweedFS) createPreRequisites(ctx context.Context, kcli client.Client) return errors.Wrap(err, "create s3 service") } - if err := createS3Secret(ctx, kcli); err != nil { + if err := ensureS3Secret(ctx, kcli); err != nil { return errors.Wrap(err, "create s3 secret") } return nil } -func createNamespace(ctx context.Context, kcli client.Client, namespace string) error { +func ensureNamespace(ctx context.Context, kcli client.Client, namespace string) error { ns := corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ Name: namespace, @@ -121,7 +124,7 @@ func ensureService(ctx context.Context, kcli client.Client, serviceCIDR string) return nil } -func createS3Secret(ctx context.Context, kcli client.Client) error { +func ensureS3Secret(ctx context.Context, kcli client.Client) error { var config seaweedfsConfig config.Identities = append(config.Identities, seaweedfsIdentity{ Name: "anvAdmin", diff --git a/pkg/addons/seaweedfs/upgrade.go b/pkg/addons/seaweedfs/upgrade.go index 151c75bb6..06b4d3823 100644 --- a/pkg/addons/seaweedfs/upgrade.go +++ b/pkg/addons/seaweedfs/upgrade.go @@ -2,31 +2,43 @@ package seaweedfs import ( "context" - "log/slog" "github.com/pkg/errors" "github.com/replicatedhq/embedded-cluster/pkg/helm" + "github.com/sirupsen/logrus" 
"sigs.k8s.io/controller-runtime/pkg/client" ) func (s *SeaweedFS) Upgrade(ctx context.Context, kcli client.Client, hcli helm.Client, overrides []string) error { + if err := s.ensurePreRequisites(ctx, kcli); err != nil { + return errors.Wrap(err, "create prerequisites") + } + + values, err := s.GenerateHelmValues(ctx, kcli, overrides) + if err != nil { + return errors.Wrap(err, "generate helm values") + } + exists, err := hcli.ReleaseExists(ctx, namespace, releaseName) if err != nil { return errors.Wrap(err, "check if release exists") } if !exists { - slog.Info("Release not found, installing", "release", releaseName, "namespace", namespace) - if err := s.Install(ctx, kcli, hcli, overrides, nil); err != nil { - return errors.Wrap(err, "install") + logrus.Debugf("Release not found, installing release %s in namespace %s", releaseName, namespace) + _, err = hcli.Install(ctx, helm.InstallOptions{ + ReleaseName: releaseName, + ChartPath: s.ChartLocation(), + ChartVersion: Metadata.Version, + Values: values, + Namespace: namespace, + Labels: getBackupLabels(), + }) + if err != nil { + return errors.Wrap(err, "helm install") } return nil } - values, err := s.GenerateHelmValues(ctx, kcli, overrides) - if err != nil { - return errors.Wrap(err, "generate helm values") - } - _, err = hcli.Upgrade(ctx, helm.UpgradeOptions{ ReleaseName: releaseName, ChartPath: s.ChartLocation(), diff --git a/pkg/addons/velero/upgrade.go b/pkg/addons/velero/upgrade.go index 052c4e649..162beb38e 100644 --- a/pkg/addons/velero/upgrade.go +++ b/pkg/addons/velero/upgrade.go @@ -2,10 +2,10 @@ package velero import ( "context" - "log/slog" "github.com/pkg/errors" "github.com/replicatedhq/embedded-cluster/pkg/helm" + "github.com/sirupsen/logrus" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -15,7 +15,7 @@ func (v *Velero) Upgrade(ctx context.Context, kcli client.Client, hcli helm.Clie return errors.Wrap(err, "check if release exists") } if !exists { - slog.Info("Release not found, installing", "release", releaseName, "namespace", namespace) + logrus.Debugf("Release not found, installing release %s in namespace %s", releaseName, namespace) if err := v.Install(ctx, kcli, hcli, overrides, nil); err != nil { return errors.Wrap(err, "install") } diff --git a/pkg/dryrun/kubeutils.go b/pkg/dryrun/kubeutils.go index cf4bb3897..9723fcb82 100644 --- a/pkg/dryrun/kubeutils.go +++ b/pkg/dryrun/kubeutils.go @@ -64,10 +64,6 @@ func (k *KubeUtils) IsDaemonsetReady(ctx context.Context, cli client.Client, ns, return true, nil } -func (k *KubeUtils) IsJobComplete(ctx context.Context, cli client.Client, ns, name string, completions int32) (bool, error) { - return true, nil -} - func (k *KubeUtils) WaitForKubernetes(ctx context.Context, cli client.Client) <-chan error { errCh := make(chan error) close(errCh) diff --git a/pkg/helm/client.go b/pkg/helm/client.go index bce7327b3..7f5c242ef 100644 --- a/pkg/helm/client.go +++ b/pkg/helm/client.go @@ -219,6 +219,24 @@ func (h *HelmClient) Latest(reponame, chart string) (string, error) { return "", fmt.Errorf("repository %s not found", reponame) } +func (h *HelmClient) PullByRefWithRetries(ctx context.Context, ref string, version string, tries int) (string, error) { + for i := 0; ; i++ { + localPath, err := h.PullByRef(ref, version) + if err == nil { + return localPath, nil + } + logrus.Debugf("Failed to pull %s:%v (%d/%d): %v", ref, version, i+1, tries, err) + if i == tries-1 { + return "", err + } + select { + case <-time.After(5 * time.Second): + case <-ctx.Done(): + return "", ctx.Err() + } + } +} 
+ func (h *HelmClient) Pull(reponame, chart string, version string) (string, error) { ref := fmt.Sprintf("%s/%s", reponame, chart) return h.PullByRef(ref, version) @@ -321,7 +339,7 @@ func (h *HelmClient) Install(ctx context.Context, opts InstallOptions) (*release var localPath string if h.airgapPath == "" { // online, pull chart from remote - localPath, err = h.PullByRef(opts.ChartPath, opts.ChartVersion) + localPath, err = h.PullByRefWithRetries(ctx, opts.ChartPath, opts.ChartVersion, 3) if err != nil { return nil, fmt.Errorf("pull: %w", err) } @@ -378,7 +396,7 @@ func (h *HelmClient) Upgrade(ctx context.Context, opts UpgradeOptions) (*release var localPath string if h.airgapPath == "" { // online, pull chart from remote - localPath, err = h.PullByRef(opts.ChartPath, opts.ChartVersion) + localPath, err = h.PullByRefWithRetries(ctx, opts.ChartPath, opts.ChartVersion, 3) if err != nil { return nil, fmt.Errorf("pull: %w", err) } diff --git a/pkg/kubeutils/interface.go b/pkg/kubeutils/interface.go index a97adb5bd..a639c5844 100644 --- a/pkg/kubeutils/interface.go +++ b/pkg/kubeutils/interface.go @@ -33,7 +33,6 @@ type KubeUtilsInterface interface { IsDeploymentReady(ctx context.Context, cli client.Client, ns, name string) (bool, error) IsStatefulSetReady(ctx context.Context, cli client.Client, ns, name string) (bool, error) IsDaemonsetReady(ctx context.Context, cli client.Client, ns, name string) (bool, error) - IsJobComplete(ctx context.Context, cli client.Client, ns, name string, completions int32) (bool, error) WaitForKubernetes(ctx context.Context, cli client.Client) <-chan error WaitForCRDToBeReady(ctx context.Context, cli client.Client, name string) error KubeClient() (client.Client, error) @@ -106,10 +105,6 @@ func IsDaemonsetReady(ctx context.Context, cli client.Client, ns, name string) ( return kb.IsDaemonsetReady(ctx, cli, ns, name) } -func IsJobComplete(ctx context.Context, cli client.Client, ns, name string, completions int32) (bool, error) { - return kb.IsJobComplete(ctx, cli, ns, name, completions) -} - func WaitForKubernetes(ctx context.Context, cli client.Client) <-chan error { return kb.WaitForKubernetes(ctx, cli) } diff --git a/pkg/kubeutils/kubeutils.go b/pkg/kubeutils/kubeutils.go index 15f2811cd..31d9d69e9 100644 --- a/pkg/kubeutils/kubeutils.go +++ b/pkg/kubeutils/kubeutils.go @@ -10,6 +10,7 @@ import ( batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" @@ -117,19 +118,39 @@ func (k *KubeUtils) WaitForJob(ctx context.Context, cli client.Client, ns, name var lasterr error if err := wait.ExponentialBackoffWithContext( ctx, backoff, func(ctx context.Context) (bool, error) { - ready, err := k.IsJobComplete(ctx, cli, ns, name, completions) - if err != nil { - lasterr = fmt.Errorf("unable to get job status: %w", err) + var job batchv1.Job + err := cli.Get(ctx, client.ObjectKey{Namespace: ns, Name: name}, &job) + if k8serrors.IsNotFound(err) { + // exit + lasterr = fmt.Errorf("job not found") + return false, lasterr + } else if err != nil { + lasterr = fmt.Errorf("unable to get job: %w", err) return false, nil } - return ready, nil + + failed := k.isJobFailed(job) + if failed { + // exit + lasterr = fmt.Errorf("job failed") + return false, lasterr + } + + completed := k.isJobCompleted(job, completions) + if completed { + return true, nil + } + + // 
TODO: need to handle the case where the pod gets stuck in pending + // This can happen if nodes are not schedulable or if a volume is not found + + return false, nil }, ); err != nil { if lasterr != nil { - return fmt.Errorf("timed out waiting for job %s: %w", name, lasterr) - } else { - return fmt.Errorf("timed out waiting for job %s", name) + return lasterr } + return fmt.Errorf("timed out waiting for job %s", name) } return nil } @@ -277,17 +298,20 @@ func (k *KubeUtils) IsDaemonsetReady(ctx context.Context, cli client.Client, ns, return false, nil } -// IsJobComplete returns true if the job has been completed successfully. -func (k *KubeUtils) IsJobComplete(ctx context.Context, cli client.Client, ns, name string, completions int32) (bool, error) { - var job batchv1.Job - nsn := types.NamespacedName{Namespace: ns, Name: name} - if err := cli.Get(ctx, nsn, &job); err != nil { - return false, err - } - if job.Status.Succeeded >= completions { - return true, nil +// isJobCompleted returns true if the job has been completed successfully. +func (k *KubeUtils) isJobCompleted(job batchv1.Job, completions int32) bool { + isSucceeded := job.Status.Succeeded >= completions + return isSucceeded +} + +// isJobFailed returns true if the job has exceeded the backoff limit. +func (k *KubeUtils) isJobFailed(job batchv1.Job) bool { + backoffLimit := int32(6) // default + if job.Spec.BackoffLimit != nil { + backoffLimit = *job.Spec.BackoffLimit } - return false, nil + exceedsBackoffLimit := job.Status.Failed > backoffLimit + return exceedsBackoffLimit } // IsPodComplete returns true if the pod has completed.
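The failure check above treats a job as failed once Status.Failed exceeds its BackoffLimit (falling back to 6, the Kubernetes default, when unset). An alternative sketch, not used in this diff, that relies on the JobFailed condition set by the job controller instead of re-deriving failure from the counters:

	// jobHasFailedCondition returns true if the job controller has marked the job as failed
	func jobHasFailedCondition(job batchv1.Job) bool {
		for _, cond := range job.Status.Conditions {
			if cond.Type == batchv1.JobFailed && cond.Status == corev1.ConditionTrue {
				return true
			}
		}
		return false
	}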