Skip to content

Commit

Permalink
Fix the clusterstatus based on last operatorAction
Browse files Browse the repository at this point in the history
  • Loading branch information
prafull01 committed Nov 12, 2024
1 parent e7f898b commit f157001
Show file tree
Hide file tree
Showing 10 changed files with 64 additions and 50 deletions.
2 changes: 1 addition & 1 deletion .bazelrc
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
build --workspace_status_command hack/build/print-workspace-status.sh
test --test_output=errors --test_timeout=-1,-1,-1,2400
test --test_output=all --test_timeout=-1,-1,-1,2400
2 changes: 1 addition & 1 deletion config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,9 @@ rules:
resources:
- persistentvolumeclaims
verbs:
- delete
- list
- update
- delete
- apiGroups:
- ""
resources:
Expand Down
16 changes: 10 additions & 6 deletions e2e/e2e.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,14 @@ var (

// Some common values used in e2e test suites.
const (
MinorVersion1 = "cockroachdb/cockroach:v24.1.0"
MinorVersion2 = "cockroachdb/cockroach:v24.1.2"
MajorVersion = "cockroachdb/cockroach:v24.2.2"
NonExistentVersion = "cockroachdb/cockroach-non-existent:v21.1.999"
SkipFeatureVersion = "cockroachdb/cockroach:v20.1.0"
InvalidImage = "nginx:latest"
MinorVersion1 = "cockroachdb/cockroach:v24.1.0"
MinorVersion2 = "cockroachdb/cockroach:v24.1.2"
MajorVersion = "cockroachdb/cockroach:v24.2.2"
NonExistentVersion = "cockroachdb/cockroach-non-existent:v21.1.999"
SkipFeatureVersion = "cockroachdb/cockroach:v20.1.0"
InvalidImage = "nginx:latest"
DefaultCPULimit = "800m"
DefaultMemoryLimit = "1.5Gi"
DefaultCPURequest = "500m"
DefaultMemoryRequest = "1Gi"
)
2 changes: 2 additions & 0 deletions e2e/upgrades/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ go_test(
"//pkg/testutil/env:go_default_library",
"@com_github_go_logr_zapr//:go_default_library",
"@com_github_stretchr_testify//require:go_default_library",
"@io_k8s_api//core/v1:go_default_library",
"@io_k8s_apimachinery//pkg/api/resource:go_default_library",
"@io_k8s_sigs_controller_runtime//pkg/client:go_default_library",
"@org_uber_go_zap//zaptest:go_default_library",
],
Expand Down
32 changes: 25 additions & 7 deletions e2e/upgrades/upgrades_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,21 @@ import (
"github.com/go-logr/zapr"
"github.com/stretchr/testify/require"
"go.uber.org/zap/zaptest"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)

var (
resRequirements = corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse(e2e.DefaultCPULimit),
corev1.ResourceMemory: resource.MustParse(e2e.DefaultMemoryLimit),
},
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse(e2e.DefaultCPURequest),
corev1.ResourceMemory: resource.MustParse(e2e.DefaultMemoryRequest),
},
}
)

// TestUpgradesMinorVersion tests a minor version bump
Expand All @@ -52,7 +67,7 @@ func TestUpgradesMinorVersion(t *testing.T) {

builder := testutil.NewBuilder("crdb").WithNodeCount(3).WithTLS().
WithImage(e2e.MinorVersion1).
WithPVDataStore("1Gi")
WithPVDataStore("1Gi").WithResources(resRequirements)

steps := testutil.Steps{
{
Expand Down Expand Up @@ -103,7 +118,7 @@ func TestUpgradesMajorVersion20to21(t *testing.T) {

builder := testutil.NewBuilder("crdb").WithNodeCount(3).WithTLS().
WithImage(e2e.MinorVersion2).
WithPVDataStore("1Gi")
WithPVDataStore("1Gi").WithResources(resRequirements)

steps := testutil.Steps{
{
Expand Down Expand Up @@ -152,7 +167,7 @@ func TestUpgradesMajorVersion20_1To20_2(t *testing.T) {

builder := testutil.NewBuilder("crdb").WithNodeCount(3).WithTLS().
WithImage("cockroachdb/cockroach:v20.1.16").
WithPVDataStore("1Gi")
WithPVDataStore("1Gi").WithResources(resRequirements)

steps := testutil.Steps{
{
Expand Down Expand Up @@ -209,7 +224,8 @@ func TestUpgradesMinorVersionThenRollback(t *testing.T) {
WithNodeCount(3).
WithTLS().
WithImage(e2e.MinorVersion1).
WithPVDataStore("1Gi")
WithPVDataStore("1Gi").
WithResources(resRequirements)

steps := testutil.Steps{
{
Expand Down Expand Up @@ -274,7 +290,8 @@ func TestUpgradeWithInvalidVersion(t *testing.T) {

builder := testutil.NewBuilder("crdb").WithNodeCount(3).WithTLS().
WithImage(e2e.MinorVersion1).
WithPVDataStore("1Gi")
WithPVDataStore("1Gi").
WithResources(resRequirements)

steps := testutil.Steps{
{
Expand Down Expand Up @@ -324,7 +341,8 @@ func TestUpgradeWithInvalidImage(t *testing.T) {

builder := testutil.NewBuilder("crdb").WithNodeCount(3).WithTLS().
WithImage(e2e.MinorVersion1).
WithPVDataStore("1Gi")
WithPVDataStore("1Gi").
WithResources(resRequirements)

steps := testutil.Steps{
{
Expand Down Expand Up @@ -374,7 +392,7 @@ func TestUpgradeWithMajorVersionExcludingMajorFeature(t *testing.T) {

builder := testutil.NewBuilder("crdb").WithNodeCount(3).WithTLS().
WithImage(e2e.SkipFeatureVersion).
WithPVDataStore("1Gi")
WithPVDataStore("1Gi").WithResources(resRequirements)

steps := testutil.Steps{
{
Expand Down
2 changes: 1 addition & 1 deletion install/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,9 @@ rules:
resources:
- persistentvolumeclaims
verbs:
- delete
- list
- update
- delete
- apiGroups:
- ""
resources:
Expand Down
27 changes: 13 additions & 14 deletions pkg/actor/validate_version.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
cluster.SetCrdbContainerImage(containerImage)
cluster.SetAnnotationContainerImage(containerImage)
jobName := cluster.JobName()
changed, err := (resource.Reconciler{
_, err := (resource.Reconciler{
ManagedResource: r,
Builder: resource.JobBuilder{
Cluster: cluster,
Expand All @@ -132,17 +132,10 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
log.Error(err, "failed to reconcile job")
return err
}
log.Error(err, "failed to reconcile job only err: ", err.Error())
log.Error(err, "failed to reconcile job only error")
return err
}

if changed {
log.V(int(zapcore.DebugLevel)).Info("created/updated job, stopping request processing")
// Return a non error error here to prevent the controller from
// clearing any previously set Status fields.
return NotReadyErr{errors.New("job changed")}
}

log.V(int(zapcore.DebugLevel)).Info("version checker", "job", jobName)
key := kubetypes.NamespacedName{
Namespace: cluster.Namespace(),
Expand Down Expand Up @@ -184,11 +177,15 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
}
return errors.Wrapf(err, "failed to check the version of the crdb")
}

// Tail only last two lines as one line contains version info and one line has newline character. It is much faster.
podLines := int64(2)
podLogOpts := corev1.PodLogOptions{
Container: resource.JobContainerName,
TailLines: &podLines,
}
//get pod for the job we created

//get pod for the job we created
pods, err := v.clientset.CoreV1().Pods(job.Namespace).List(ctx, metav1.ListOptions{
LabelSelector: labels.Set(job.Spec.Selector.MatchLabels).AsSelector().String(),
})
Expand Down Expand Up @@ -231,11 +228,12 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
}
output := buf.String()

log.Info("Logs of version checker pod", "log", output)
// This is the value from Build Tag taken from the container
calVersion = strings.Replace(output, "\n", "", -1)
// if no image is retrieved we exit
// if no logs are found we will retry later to fetch the logs
if calVersion == "" {
err := PermanentErr{Err: errors.New("failed to check the version of the cluster")}
err := NotReadyErr{Err: errors.New("failed to check the version of the cluster")}
log.Error(err, "crdb version not found")
return err
}
Expand Down Expand Up @@ -366,7 +364,8 @@ func IsJobPodRunning(
}
pod := pods.Items[0]
if !kube.IsPodReady(&pod) {
return LogError("job pod is not ready yet waiting longer", nil, l)
pErr := fmt.Errorf("pod %s is not ready, and in %s phase", pod.Name, pod.Status.Phase)
return LogError("job pod is not ready yet waiting longer", pErr, l)
}
l.V(int(zapcore.DebugLevel)).Info("job pod is ready")
return nil
Expand Down Expand Up @@ -406,7 +405,7 @@ func WaitUntilJobPodIsRunning(ctx context.Context, clientset kubernetes.Interfac
return IsJobPodRunning(ctx, clientset, job, l)
}
b := backoff.NewExponentialBackOff()
b.MaxElapsedTime = 120 * time.Second
b.MaxElapsedTime = 180 * time.Second
b.MaxInterval = 10 * time.Second
if err := backoff.Retry(f, b); err != nil {
return errors.Wrapf(err, "pod is not running for job: %s", job.Name)
Expand Down
24 changes: 9 additions & 15 deletions pkg/clusterstatus/clusterstatus.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,25 +31,19 @@ func SetClusterStatusOnFirstReconcile(status *api.CrdbClusterStatus) {
status.ClusterStatus = api.ActionStatus(api.Starting).String()
}

// TODO: Do we need to check for all OperatorActions or for just the 0th element in SetClusterStatus func
// nolint
// SetClusterStatus will set the status of the cluster based on the status of the operator actions
// operatorActions are not always appended, rather any particular action type is replaced if its status changes.
// The status of the cluster is set to the status of the last operator action based on lastTransitionTime.
func SetClusterStatus(status *api.CrdbClusterStatus) {
InitOperatorActionsIfNeeded(status, metav1.Now())
for _, a := range status.OperatorActions {
if a.Status == api.ActionStatus(api.Failed).String() {
status.ClusterStatus = api.ActionStatus(api.Failed).String()
return
for i := range status.OperatorActions {
if i == 0 || status.OperatorActions[i-1].LastTransitionTime.Before(&status.OperatorActions[i].LastTransitionTime) {
status.ClusterStatus = status.OperatorActions[i].Status
}

if a.Status == api.ActionStatus(api.Unknown).String() {
status.ClusterStatus = api.ActionStatus(api.Unknown).String()
return
}

status.ClusterStatus = api.ActionStatus(api.Finished).String()
return
}
status.ClusterStatus = api.ActionStatus(api.Finished).String()
if status.ClusterStatus == "" {
status.ClusterStatus = api.ActionStatus(api.Starting).String()
}
}

func InitOperatorActionsIfNeeded(status *api.CrdbClusterStatus, now metav1.Time) {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/cluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ type ClusterReconciler struct {
// +kubebuilder:rbac:groups=core,resources=pods/exec,verbs=create
// +kubebuilder:rbac:groups=core,resources=pods/log,verbs=get
// +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list
// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=list;update
// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=list;update;delete
// +kubebuilder:rbac:groups=core,resources=serviceaccounts,verbs=get;list;create;watch
// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles,verbs=get;list;create;watch
// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=rolebindings,verbs=get;list;create;watch
Expand Down
5 changes: 1 addition & 4 deletions pkg/resource/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func (b JobBuilder) Build(obj client.Object) error {
job.Annotations = b.Spec().AdditionalAnnotations

// we recreate spec from ground only if we do not find the container job
if dbContainer, err := kube.FindContainer(JobContainerName, &job.Spec.Template.Spec); err != nil {
if _, err := kube.FindContainer(JobContainerName, &job.Spec.Template.Spec); err != nil {
backoffLimit := int32(2)
job.Spec = kbatch.JobSpec{
// This field is alpha-level and is only honored by servers that enable the TTLAfterFinished feature.
Expand All @@ -71,9 +71,6 @@ func (b JobBuilder) Build(obj client.Object) error {
Template: b.buildPodTemplate(),
BackoffLimit: &backoffLimit,
}
} else {
//if job with the container already exists we update the image only
dbContainer.Image = b.GetCockroachDBImageName()
}

return nil
Expand Down

0 comments on commit f157001

Please sign in to comment.