diff --git a/api/v1beta1/disruption_rollout_types.go b/api/v1beta1/disruption_rollout_types.go deleted file mode 100644 index deecfd2cc3..0000000000 --- a/api/v1beta1/disruption_rollout_types.go +++ /dev/null @@ -1,81 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2025 Datadog, Inc. - -package v1beta1 - -import ( - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func init() { - SchemeBuilder.Register(&DisruptionRollout{}, &DisruptionRolloutList{}) -} - -//+kubebuilder:object:root=true - -// DisruptionRollout is the Schema for the disruptionrollout API -// +kubebuilder:resource:shortName=diroll -// +kubebuilder:subresource:status -type DisruptionRollout struct { - metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata,omitempty"` - Spec DisruptionRolloutSpec `json:"spec,omitempty"` - Status DisruptionRolloutStatus `json:"status,omitempty"` -} - -// +kubebuilder:object:root=true - -// DisruptionRolloutList contains a list of DisruptionRollout -type DisruptionRolloutList struct { - metav1.TypeMeta `json:",inline"` - metav1.ListMeta `json:"metadata,omitempty"` - Items []DisruptionRollout `json:"items"` -} - -// DisruptionRolloutSpec defines the desired state of DisruptionRollout -type DisruptionRolloutSpec struct { - // DelayedStartTolerance specifies the allowed deadline to start the disruption - // after detecting a change in the target resource. If the disruption - // does not start within this duration, the execution is considered failed. - // +nullable - DelayedStartTolerance DisruptionDuration `json:"delayedStartTolerance,omitempty"` - - // +kubebuilder:validation:Required - // TargetResource specifies the resource to run disruptions against. - // It can only be a deployment or statefulset. - TargetResource TargetResourceSpec `json:"targetResource"` - - // +kubebuilder:validation:Required - // Specifies the Disruption that will be created when executing a disruptionrollout. - DisruptionTemplate DisruptionSpec `json:"disruptionTemplate"` -} - -// DisruptionRolloutStatus defines the observed state of DisruptionRollout -type DisruptionRolloutStatus struct { - // LatestInitContainersHash provides a map of the latest observed hashes for - // each InitContainer of the TargetResource. - // The key is the name of the InitContainer, and the value is its MD5 hash. - // +nullable - LatestInitContainersHash map[string]string `json:"latestInitContainersHash,omitempty"` - - // LatestContainersHash provides a map of the latest observed hashes for - // each Container of the TargetResource. - // The key is the name of the Container, and the value is its MD5 hash. - // +nullable - LatestContainersHash map[string]string `json:"latestContainersHash,omitempty"` - - // LastModificationTimestamp captures the time when a change in the containers - // of the TargetResource was detected. - // +nullable - LastContainerChangeTime *metav1.Time `json:"lastContainerChangeTime,omitempty"` - - // The last time when the disruption was last successfully scheduled. - // +nullable - LastScheduleTime *metav1.Time `json:"lastScheduleTime,omitempty"` - - // Time when the target resource was previously missing. - // +nullable - TargetResourcePreviouslyMissing *metav1.Time `json:"targetResourcePreviouslyMissing,omitempty"` -} diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index e45a1666fa..38774020ee 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -430,123 +430,6 @@ func (in *DisruptionPulse) DeepCopy() *DisruptionPulse { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *DisruptionRollout) DeepCopyInto(out *DisruptionRollout) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - in.Spec.DeepCopyInto(&out.Spec) - in.Status.DeepCopyInto(&out.Status) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DisruptionRollout. -func (in *DisruptionRollout) DeepCopy() *DisruptionRollout { - if in == nil { - return nil - } - out := new(DisruptionRollout) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *DisruptionRollout) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *DisruptionRolloutList) DeepCopyInto(out *DisruptionRolloutList) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]DisruptionRollout, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DisruptionRolloutList. -func (in *DisruptionRolloutList) DeepCopy() *DisruptionRolloutList { - if in == nil { - return nil - } - out := new(DisruptionRolloutList) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *DisruptionRolloutList) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *DisruptionRolloutSpec) DeepCopyInto(out *DisruptionRolloutSpec) { - *out = *in - out.TargetResource = in.TargetResource - in.DisruptionTemplate.DeepCopyInto(&out.DisruptionTemplate) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DisruptionRolloutSpec. -func (in *DisruptionRolloutSpec) DeepCopy() *DisruptionRolloutSpec { - if in == nil { - return nil - } - out := new(DisruptionRolloutSpec) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *DisruptionRolloutStatus) DeepCopyInto(out *DisruptionRolloutStatus) { - *out = *in - if in.LatestInitContainersHash != nil { - in, out := &in.LatestInitContainersHash, &out.LatestInitContainersHash - *out = make(map[string]string, len(*in)) - for key, val := range *in { - (*out)[key] = val - } - } - if in.LatestContainersHash != nil { - in, out := &in.LatestContainersHash, &out.LatestContainersHash - *out = make(map[string]string, len(*in)) - for key, val := range *in { - (*out)[key] = val - } - } - if in.LastContainerChangeTime != nil { - in, out := &in.LastContainerChangeTime, &out.LastContainerChangeTime - *out = (*in).DeepCopy() - } - if in.LastScheduleTime != nil { - in, out := &in.LastScheduleTime, &out.LastScheduleTime - *out = (*in).DeepCopy() - } - if in.TargetResourcePreviouslyMissing != nil { - in, out := &in.TargetResourcePreviouslyMissing, &out.TargetResourcePreviouslyMissing - *out = (*in).DeepCopy() - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DisruptionRolloutStatus. -func (in *DisruptionRolloutStatus) DeepCopy() *DisruptionRolloutStatus { - if in == nil { - return nil - } - out := new(DisruptionRolloutStatus) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DisruptionSpec) DeepCopyInto(out *DisruptionSpec) { *out = *in diff --git a/chart/templates/clusterroles.yaml b/chart/templates/clusterroles.yaml index cf3de3bc07..1b099e38ff 100644 --- a/chart/templates/clusterroles.yaml +++ b/chart/templates/clusterroles.yaml @@ -15,7 +15,7 @@ metadata: rbac.authorization.k8s.io/aggregate-to-admin: "true" rules: - apiGroups: ["chaos.datadoghq.com"] - resources: ["disruptions", "disruptioncrons", "disruptionrollouts"] + resources: ["disruptions", "disruptioncrons"] verbs: ["get", "list", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1 @@ -28,6 +28,6 @@ metadata: rbac.authorization.k8s.io/aggregate-to-admin: "true" rules: - apiGroups: ["chaos.datadoghq.com"] - resources: ["disruptions", "disruptioncrons", "disruptionrollouts"] + resources: ["disruptions", "disruptioncrons"] verbs: ["create", "delete", "deletecollection", "patch", "update"] {{- end }} diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 33d9384a38..330c56a3d9 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -85,7 +85,6 @@ data: allowNodeLevel: {{ .Values.controller.safeMode.allowNodeLevel }} allowNodeFailure: {{ .Values.controller.safeMode.allowNodeFailure }} disruptionCronEnabled: {{ .Values.controller.disruptionCronEnabled }} - disruptionRolloutEnabled: {{ .Values.controller.disruptionRolloutEnabled }} disruptionDeletionTimeout: {{ .Values.controller.disruptionDeletionTimeout }} disabledDisruptions: {{- range $index, $kind := .Values.controller.disabledDisruptions }} diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml deleted file mode 100644 index 48937d8183..0000000000 --- a/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml +++ /dev/null @@ -1,689 +0,0 @@ -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.14.0 - name: disruptionrollouts.chaos.datadoghq.com -spec: - group: chaos.datadoghq.com - names: - kind: DisruptionRollout - listKind: DisruptionRolloutList - plural: disruptionrollouts - shortNames: - - diroll - singular: disruptionrollout - scope: Namespaced - versions: - - name: v1beta1 - schema: - openAPIV3Schema: - description: DisruptionRollout is the Schema for the disruptionrollout API - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: DisruptionRolloutSpec defines the desired state of DisruptionRollout - properties: - delayedStartTolerance: - description: |- - DelayedStartTolerance specifies the allowed deadline to start the disruption - after detecting a change in the target resource. If the disruption - does not start within this duration, the execution is considered failed. - nullable: true - type: string - disruptionTemplate: - description: Specifies the Disruption that will be created when executing a disruptionrollout. - properties: - advancedSelector: - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - nullable: true - type: array - allowDisruptedTargets: - description: |- - AllowDisruptedTargets allow pods with one or several other active disruptions, with disruption kinds that does not intersect - with this disruption kinds, to be returned as part of eligible targets for this disruption - - e.g. apply a CPU pressure and later, apply a container failure for a short duration - NB: it's ALWAYS forbidden to apply the same disruption kind to the same target to avoid unreliable effects due to competing interactions - type: boolean - containerFailure: - description: ContainerFailureSpec represents a container failure injection - nullable: true - properties: - forced: - type: boolean - type: object - containers: - items: - type: string - type: array - count: - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - cpuPressure: - description: CPUPressureSpec represents a cpu pressure disruption - nullable: true - properties: - count: - anyOf: - - type: integer - - type: string - description: |- - Count represents the number of cores to target - either an integer form or a percentage form appended with a % - if empty, it will be considered to be 100% - x-kubernetes-int-or-string: true - type: object - diskFailure: - description: DiskFailureSpec represents a disk failure disruption - nullable: true - properties: - openat: - description: OpenatSyscallSpec syscall specs - nullable: true - properties: - exitCode: - description: 'Refer to this documentation: https://linux.die.net/man/2/open' - enum: - - EACCES - - EDQUOT - - EEXIST - - EFAULT - - EFBIG - - EINTR - - EISDIR - - ELOOP - - EMFILE - - ENAMETOOLONG - - ENFILE - - ENODEV - - ENOENT - - ENOMEM - - ENOSPC - - ENOTDIR - - ENXIO - - EOVERFLOW - - EPERM - - EROFS - - ETXTBSY - - EWOULDBLOCK - type: string - required: - - exitCode - type: object - paths: - items: - type: string - type: array - probability: - type: string - required: - - paths - type: object - diskPressure: - description: DiskPressureSpec represents a disk pressure disruption - nullable: true - properties: - path: - type: string - throttling: - description: DiskPressureThrottlingSpec represents a throttle on read and write disk operations - properties: - readBytesPerSec: - type: integer - writeBytesPerSec: - type: integer - type: object - required: - - path - - throttling - type: object - dns: - description: DNSDisruptionSpec represents a dns disruption - items: - description: HostRecordPair represents a hostname and a corresponding dns record override - properties: - hostname: - type: string - record: - description: DNSRecord represents a type of DNS Record, such as A or CNAME, and the value of that record - properties: - type: - type: string - value: - type: string - required: - - type - - value - type: object - required: - - hostname - - record - type: object - nullable: true - type: array - dryRun: - type: boolean - duration: - type: string - filter: - nullable: true - properties: - annotations: - additionalProperties: - type: string - description: Set is a map of label:value. It implements Labels. - type: object - type: object - grpc: - description: GRPCDisruptionSpec represents a gRPC disruption - nullable: true - properties: - endpoints: - items: - description: EndpointAlteration represents an endpoint to disrupt and the corresponding error to return - properties: - endpoint: - type: string - error: - enum: - - OK - - CANCELED - - UNKNOWN - - INVALID_ARGUMENT - - DEADLINE_EXCEEDED - - NOT_FOUND - - ALREADY_EXISTS - - PERMISSION_DENIED - - RESOURCE_EXHAUSTED - - FAILED_PRECONDITION - - ABORTED - - OUT_OF_RANGE - - UNIMPLEMENTED - - INTERNAL - - UNAVAILABLE - - DATA_LOSS - - UNAUTHENTICATED - type: string - override: - type: string - queryPercent: - maximum: 100 - minimum: 0 - type: integer - required: - - endpoint - type: object - type: array - port: - maximum: 65535 - minimum: 1 - type: integer - required: - - endpoints - - port - type: object - level: - default: pod - description: Level defines what the disruption will target, either a pod or a node - enum: - - pod - - node - type: string - network: - description: NetworkDisruptionSpec represents a network disruption injection - nullable: true - properties: - allowedHosts: - items: - properties: - connState: - enum: - - new - - est - - "" - type: string - flow: - enum: - - ingress - - egress - - "" - type: string - host: - type: string - port: - maximum: 65535 - minimum: 0 - type: integer - protocol: - enum: - - tcp - - udp - - "" - type: string - type: object - nullable: true - type: array - bandwidthLimit: - minimum: 0 - type: integer - cloud: - nullable: true - properties: - aws: - items: - properties: - connState: - enum: - - new - - est - - "" - type: string - flow: - enum: - - ingress - - egress - - "" - type: string - protocol: - enum: - - tcp - - udp - - "" - type: string - service: - type: string - required: - - service - type: object - type: array - datadog: - items: - properties: - connState: - enum: - - new - - est - - "" - type: string - flow: - enum: - - ingress - - egress - - "" - type: string - protocol: - enum: - - tcp - - udp - - "" - type: string - service: - type: string - required: - - service - type: object - type: array - gcp: - items: - properties: - connState: - enum: - - new - - est - - "" - type: string - flow: - enum: - - ingress - - egress - - "" - type: string - protocol: - enum: - - tcp - - udp - - "" - type: string - service: - type: string - required: - - service - type: object - type: array - type: object - corrupt: - maximum: 100 - minimum: 0 - type: integer - delay: - maximum: 60000 - minimum: 0 - type: integer - delayJitter: - maximum: 100 - minimum: 0 - type: integer - disableDefaultAllowedHosts: - type: boolean - drop: - maximum: 100 - minimum: 0 - type: integer - duplicate: - maximum: 100 - minimum: 0 - type: integer - hosts: - items: - properties: - connState: - enum: - - new - - est - - "" - type: string - flow: - enum: - - ingress - - egress - - "" - type: string - host: - type: string - port: - maximum: 65535 - minimum: 0 - type: integer - protocol: - enum: - - tcp - - udp - - "" - type: string - type: object - nullable: true - type: array - http: - description: NetworkHTTPFilters contains http filters - nullable: true - properties: - methods: - items: - type: string - type: array - paths: - items: - type: string - type: array - type: object - services: - items: - properties: - name: - type: string - namespace: - type: string - ports: - items: - properties: - name: - type: string - port: - maximum: 65535 - minimum: 0 - type: integer - type: object - type: array - required: - - name - - namespace - type: object - nullable: true - type: array - type: object - nodeFailure: - description: NodeFailureSpec represents a node failure injection - nullable: true - properties: - shutdown: - type: boolean - type: object - onInit: - type: boolean - pulse: - description: DisruptionPulse contains the active disruption duration and the dormant disruption duration - nullable: true - properties: - activeDuration: - type: string - dormantDuration: - type: string - initialDelay: - type: string - type: object - reporting: - description: |- - Reporting provides additional reporting options in order to send a message to a custom slack channel - it expects the main controller to have the slack notifier enabled - it expects a slack bot to be added to the defined slack channel - nullable: true - properties: - minNotificationType: - description: |- - MinNotificationType is the minimal notification type we want to receive informations for - In order of importance it's Info, Success, Warning, Error - Default level is considered Success, meaning all info will be ignored - enum: - - Info - - Success - - Completion - - Warning - - Error - type: string - purpose: - description: |- - Purpose determines contextual informations about the disruption - a brief context to determines disruption goal - minLength: 10 - type: string - slackChannel: - description: |- - SlackChannel is the destination slack channel to send reporting informations to. - It's expected to follow slack naming conventions https://api.slack.com/methods/conversations.create#naming or slack channel ID format - maxLength: 80 - pattern: (^[a-z0-9-_]+$)|(^C[A-Z0-9]+$) - type: string - slackUserEmail: - description: SlackUserEmail is the email of the user to send reporting information to - maxLength: 320 - type: string - type: object - selector: - additionalProperties: - type: string - description: Set is a map of label:value. It implements Labels. - nullable: true - type: object - staticTargeting: - type: boolean - triggers: - description: DisruptionTriggers holds the options for changing when injector pods are created, and the timing of when the injection occurs - nullable: true - properties: - createPods: - properties: - notBefore: - description: |- - inject.notBefore: Normal reconciliation and chaos pod creation will occur, but chaos pods will wait to inject until NotInjectedBefore. Must be after NoPodsBefore if both are specified - createPods.notBefore: Will skip reconciliation until this time, no chaos pods will be created until after NoPodsBefore - format: date-time - nullable: true - type: string - offset: - description: |- - inject.offset: Identical to NotBefore, but specified as an offset from max(CreationTimestamp, NoPodsBefore) instead of as a metav1.Time - pods.offset: Identical to NotBefore, but specified as an offset from CreationTimestamp instead of as a metav1.Time - nullable: true - type: string - type: object - inject: - properties: - notBefore: - description: |- - inject.notBefore: Normal reconciliation and chaos pod creation will occur, but chaos pods will wait to inject until NotInjectedBefore. Must be after NoPodsBefore if both are specified - createPods.notBefore: Will skip reconciliation until this time, no chaos pods will be created until after NoPodsBefore - format: date-time - nullable: true - type: string - offset: - description: |- - inject.offset: Identical to NotBefore, but specified as an offset from max(CreationTimestamp, NoPodsBefore) instead of as a metav1.Time - pods.offset: Identical to NotBefore, but specified as an offset from CreationTimestamp instead of as a metav1.Time - nullable: true - type: string - type: object - type: object - unsafeMode: - description: |- - UnsafemodeSpec represents a spec with parameters to turn off specific safety nets designed to catch common traps or issues running a disruption - All of these are turned off by default, so disabling safety nets requires manually changing these booleans to true - properties: - allowRootDiskFailure: - type: boolean - config: - description: Config represents any configurable parameters for the safetynets, all of which have defaults - properties: - countTooLarge: - description: CountTooLargeConfig represents the configuration for the countTooLarge safetynet - properties: - clusterThreshold: - maximum: 100 - minimum: 1 - type: integer - namespaceThreshold: - maximum: 100 - minimum: 1 - type: integer - type: object - type: object - disableAll: - type: boolean - disableCountTooLarge: - type: boolean - disableNeitherHostNorPort: - type: boolean - disableSpecificContainDisk: - type: boolean - type: object - required: - - count - type: object - targetResource: - description: |- - TargetResource specifies the resource to run disruptions against. - It can only be a deployment or statefulset. - properties: - kind: - description: 'Kind specifies the type of the long-lived resource. Allowed values: "deployment", "statefulset".' - enum: - - deployment - - statefulset - type: string - name: - description: Name specifies the name of the specific instance of the long-lived resource to be targeted. - type: string - required: - - kind - - name - type: object - required: - - disruptionTemplate - - targetResource - type: object - status: - description: DisruptionRolloutStatus defines the observed state of DisruptionRollout - properties: - lastContainerChangeTime: - description: |- - LastModificationTimestamp captures the time when a change in the containers - of the TargetResource was detected. - format: date-time - nullable: true - type: string - lastScheduleTime: - description: The last time when the disruption was last successfully scheduled. - format: date-time - nullable: true - type: string - latestContainersHash: - additionalProperties: - type: string - description: |- - LatestContainersHash provides a map of the latest observed hashes for - each Container of the TargetResource. - The key is the name of the Container, and the value is its MD5 hash. - nullable: true - type: object - latestInitContainersHash: - additionalProperties: - type: string - description: |- - LatestInitContainersHash provides a map of the latest observed hashes for - each InitContainer of the TargetResource. - The key is the name of the InitContainer, and the value is its MD5 hash. - nullable: true - type: object - targetResourcePreviouslyMissing: - description: Time when the target resource was previously missing. - format: date-time - nullable: true - type: string - type: object - type: object - served: true - storage: true - subresources: - status: {} diff --git a/chart/templates/generated/role.yaml b/chart/templates/generated/role.yaml index f073801daf..9668c6cd85 100644 --- a/chart/templates/generated/role.yaml +++ b/chart/templates/generated/role.yaml @@ -15,7 +15,6 @@ rules: - chaos.datadoghq.com resources: - disruptioncrons - - disruptionrollouts - disruptions verbs: - create @@ -29,7 +28,6 @@ rules: - chaos.datadoghq.com resources: - disruptioncrons/finalizers - - disruptionrollouts/finalizers - disruptions/finalizers verbs: - update @@ -37,7 +35,6 @@ rules: - chaos.datadoghq.com resources: - disruptioncrons/status - - disruptionrollouts/status - disruptions/status verbs: - get diff --git a/chart/values.yaml b/chart/values.yaml index a84c7febe3..1e2760c70d 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -72,7 +72,7 @@ controller: defaultCronDelayedStartTolerance: 15m minimumCronFrequency: 15m # a disruption cron with a spec.schedule that runs more often than this will be rejected. finalizerDeletionDelay: 20s - targetResourceMissingThreshold: 24h # duration after a cron or rollout self-delete if target is missing for this long + targetResourceMissingThreshold: 24h # duration after a cron self-delete if target is missing for this long expiredDisruptionGCDelay: 10m # time after a disruption expires before deleting it userInfoHook: true webhook: # admission webhook configuration @@ -95,7 +95,6 @@ controller: memory: 300Mi ephemeralStorage: 1Gi disruptionCronEnabled: true - disruptionRolloutEnabled: false disruptionDeletionTimeout: 15m # The duration after which a disruption will be marked as "stuck on removal" if its removal process exceeds this duration. aggregateToClusterRole: false # If this is true two aggregated cluster roles are created for viewing and editing (https://kubernetes.io/docs/reference/access-authn-authz/rbac/#aggregated-clusterroles) # disabledDisruptions: # List of disruption kinds to disable at admission diff --git a/chart/values/local.yaml b/chart/values/local.yaml index d36685981c..cfa9355cb9 100644 --- a/chart/values/local.yaml +++ b/chart/values/local.yaml @@ -49,7 +49,6 @@ controller: host: "" port: 9443 disruptionCronEnabled: true - disruptionRolloutEnabled: false injector: serviceAccount: chaos-injector chaosNamespace: chaos-engineering diff --git a/config/config.go b/config/config.go index db7a57c0e5..cb288979e6 100644 --- a/config/config.go +++ b/config/config.go @@ -52,7 +52,6 @@ type controllerConfig struct { ProfilerSink string `json:"profilerSink" yaml:"profilerSink"` TracerSink string `json:"tracerSink" yaml:"tracerSink"` DisruptionCronEnabled bool `json:"disruptionCronEnabled" yaml:"disruptionCronEnabled"` - DisruptionRolloutEnabled bool `json:"disruptionRolloutEnabled" yaml:"disruptionRolloutEnabled"` DisruptionDeletionTimeout time.Duration `json:"disruptionDeletionTimeout" yaml:"disruptionDeletionTimeout"` FinalizerDeletionDelay time.Duration `json:"finalizerDeletionDelay" yaml:"finalizerDeletionDelay"` TargetResourceMissingThreshold time.Duration `json:"targetResourceMissingThreshold" yaml:"targetResourceMissingThreshold"` @@ -552,12 +551,6 @@ func New(client corev1client.ConfigMapInterface, logger *zap.SugaredLogger, osAr return cfg, err } - mainFS.BoolVar(&cfg.Controller.DisruptionRolloutEnabled, "disruption-rollout-enabled", false, "Enable the DisruptionRollout CRD and its controller") - - if err := viper.BindPFlag("controller.disruptionRolloutEnabled", mainFS.Lookup("disruption-rollout-enabled")); err != nil { - return cfg, err - } - mainFS.DurationVar(&cfg.Controller.DisruptionDeletionTimeout, "disruption-deletion-timeout", DefaultDisruptionDeletionTimeout, "If the deletion time of the disruption is greater than the delete timeout, the disruption is marked as stuck on removal") if err := viper.BindPFlag("controller.disruptionDeletionTimeout", mainFS.Lookup("disruption-deletion-timeout")); err != nil { @@ -570,7 +563,7 @@ func New(client corev1client.ConfigMapInterface, logger *zap.SugaredLogger, osAr return cfg, err } - mainFS.DurationVar(&cfg.Controller.TargetResourceMissingThreshold, "target-resource-missing-threshold", time.Hour*24, "Define the amount of time a cron or rollout will tolerate its target missing before self-deleting") + mainFS.DurationVar(&cfg.Controller.TargetResourceMissingThreshold, "target-resource-missing-threshold", time.Hour*24, "Define the amount of time a cron will tolerate its target missing before self-deleting") if err := viper.BindPFlag("controller.targetResourceMissingThreshold", mainFS.Lookup("target-resource-missing-threshold")); err != nil { return cfg, err diff --git a/controllers/cron_rollout_helpers.go b/controllers/cron_rollout_helpers.go index 09a3f4db64..69da837de7 100644 --- a/controllers/cron_rollout_helpers.go +++ b/controllers/cron_rollout_helpers.go @@ -26,8 +26,7 @@ import ( ) const ( - DisruptionCronNameLabel = chaosv1beta1.GroupName + "/disruption-cron-name" - DisruptionRolloutNameLabel = chaosv1beta1.GroupName + "/disruption-rollout-name" + DisruptionCronNameLabel = chaosv1beta1.GroupName + "/disruption-cron-name" ) // GetChildDisruptions retrieves disruptions associated with a resource by its label. @@ -209,11 +208,8 @@ func GetMostRecentScheduleTime(log *zap.SugaredLogger, disruptions *chaosv1beta1 // generateDisruptionName produces a disruption name based on the specific CR controller, that's invoking it. // It returns a formatted string name. func generateDisruptionName(owner metav1.Object) string { - switch typedOwner := owner.(type) { - case *chaosv1beta1.DisruptionCron: + if typedOwner, ok := owner.(*chaosv1beta1.DisruptionCron); ok { return fmt.Sprintf("disruption-cron-%s", typedOwner.GetName()) - case *chaosv1beta1.DisruptionRollout: - return fmt.Sprintf("disruption-rollout-%s", typedOwner.GetName()) } return "" @@ -222,11 +218,8 @@ func generateDisruptionName(owner metav1.Object) string { // getOwnerNameLabel derives the appropriate label for the owner CR. // It returns the label string. func getOwnerNameLabel(owner metav1.Object) string { - switch owner.(type) { - case *chaosv1beta1.DisruptionCron: + if _, ok := owner.(*chaosv1beta1.DisruptionCron); ok { return DisruptionCronNameLabel - case *chaosv1beta1.DisruptionRollout: - return DisruptionRolloutNameLabel } return "" diff --git a/controllers/disruption_controller.go b/controllers/disruption_controller.go index f6e5807910..ae298e068d 100644 --- a/controllers/disruption_controller.go +++ b/controllers/disruption_controller.go @@ -5,9 +5,9 @@ package controllers -// +kubebuilder:rbac:groups=chaos.datadoghq.com,resources=disruptions;disruptioncrons;disruptionrollouts,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups=chaos.datadoghq.com,resources=disruptions/status;disruptioncrons/status;disruptionrollouts/status,verbs=get;update;patch -// +kubebuilder:rbac:groups=chaos.datadoghq.com,resources=disruptions/finalizers;disruptioncrons/finalizers;disruptionrollouts/finalizers,verbs=update +// +kubebuilder:rbac:groups=chaos.datadoghq.com,resources=disruptions;disruptioncrons,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=chaos.datadoghq.com,resources=disruptions/status;disruptioncrons/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=chaos.datadoghq.com,resources=disruptions/finalizers;disruptioncrons/finalizers,verbs=update // +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;update;patch diff --git a/controllers/disruption_rollout_controller.go b/controllers/disruption_rollout_controller.go deleted file mode 100644 index 39dd307260..0000000000 --- a/controllers/disruption_rollout_controller.go +++ /dev/null @@ -1,289 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2025 Datadog, Inc. - -package controllers - -import ( - "context" - "fmt" - "math/rand" - "time" - - cLog "github.com/DataDog/chaos-controller/log" - "github.com/DataDog/chaos-controller/o11y/metrics" - - chaosv1beta1 "github.com/DataDog/chaos-controller/api/v1beta1" - "go.uber.org/zap" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -var DisruptionRolloutTags = []string{} - -type DisruptionRolloutReconciler struct { - Client client.Client - Scheme *runtime.Scheme - BaseLog *zap.SugaredLogger - log *zap.SugaredLogger - MetricsSink metrics.Sink - TargetResourceMissingThreshold time.Duration -} - -func (r *DisruptionRolloutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) { - r.log = r.BaseLog.With("disruptionRolloutNamespace", req.Namespace, "disruptionRolloutName", req.Name) - r.log.Info("Reconciling DisruptionRollout") - - instance := &chaosv1beta1.DisruptionRollout{} - randSource := rand.New(rand.NewSource(time.Now().UnixNano())) - - // reconcile metrics - r.handleMetricSinkError(r.MetricsSink.MetricReconcile()) - - defer func(tsStart time.Time) { - tags := []string{} - if instance.Name != "" { - tags = append(tags, "disruptionRolloutName:"+instance.Name, "disruptionRolloutNamespace:"+instance.Namespace) - } - - r.handleMetricSinkError(r.MetricsSink.MetricReconcileDuration(time.Since(tsStart), tags)) - }(time.Now()) - - // Fetch DisruptionRollout instance - if err := r.Client.Get(ctx, req.NamespacedName, instance); err != nil { - return ctrl.Result{}, client.IgnoreNotFound(err) - } - - DisruptionRolloutTags = []string{"disruptionRolloutName:" + instance.Name, "disruptionRolloutNamespace:", instance.Namespace, "targetName:", instance.Spec.TargetResource.Name} - - if !instance.DeletionTimestamp.IsZero() { - // Add finalizer here if required - return ctrl.Result{}, nil - } - - // Update the DisruptionRollout status based on the presence of the target resource - // If the target resource has been missing for longer than the TargetResourceMissingThreshold, delete the instance - targetResourceExists, instanceDeleted, err := r.updateTargetResourcePreviouslyMissing(ctx, instance) - if err != nil { - // Log error and requeue if status update or deletion fails - r.log.Errorw("failed to handle target resource status", "err", err) - return ctrl.Result{}, err - } - - if instanceDeleted { - // Skip reconciliation since the instance has been deleted - return ctrl.Result{}, nil - } - - disruptions, err := GetChildDisruptions(ctx, r.Client, r.log, instance.Namespace, DisruptionRolloutNameLabel, instance.Name) - if err != nil { - return ctrl.Result{}, nil - } - - // Update the DisruptionRollout status with the time when the last disruption was successfully scheduled - if err := r.updateLastScheduleTime(ctx, instance, disruptions); err != nil { - r.log.Errorw("unable to update LastScheduleTime of DisruptionCron status", "err", err) - return ctrl.Result{}, err - } - - // Calculate next requeue time - requeueAfter := time.Duration(randSource.Intn(5)+15) * time.Second //nolint:gosec - requeueTime := requeueAfter.Round(time.Second) - scheduledResult := ctrl.Result{RequeueAfter: requeueAfter} - - // Run a new disruption if the following conditions are met: - // 1. The target resource is available - // 2. The target resource has been updated - // 3. The target resource update has not been tested - // 4. It's not blocked by another disruption already running - // 5. It's not past the deadline - if !targetResourceExists { - r.log.Infow(fmt.Sprintf("target resource is missing, scheduling next check in %s", requeueTime)) - return scheduledResult, nil - } - - if !r.targetResourceUpdated(&instance.Status) { - r.log.Infow("target resource hasn't been modified yet, sleeping") - return ctrl.Result{}, nil - } - - if instance.Status.LastContainerChangeTime.Before(instance.Status.LastScheduleTime) || instance.Status.LastContainerChangeTime.Equal(instance.Status.LastScheduleTime) { - r.log.Debugw("target resource update has already been tested, sleeping", - "LastContainerChangeTime", instance.Status.LastContainerChangeTime, - "LastScheduleTime", instance.Status.LastScheduleTime) - - return ctrl.Result{}, nil - } - - if len(disruptions.Items) > 0 { - r.log.Infow(fmt.Sprintf("cannot start a new disruption as a prior one is still running, scheduling next check in %s", requeueTime), "numActiveDisruptions", len(disruptions.Items)) - return scheduledResult, nil - } - - tooLate := false - if instance.Spec.DelayedStartTolerance.Duration() > 0 && !instance.Status.LastContainerChangeTime.IsZero() { - tooLate = instance.Status.LastContainerChangeTime.Add(instance.Spec.DelayedStartTolerance.Duration()).Before(time.Now()) - } - - if tooLate { - r.handleMetricSinkError(r.MetricsSink.MetricTooLate(DisruptionRolloutTags)) - r.log.Infow("missed schedule to start a disruption, sleeping", - "LastContainerChangeTime", instance.Status.LastContainerChangeTime, - "DelayedStartTolerance", instance.Spec.DelayedStartTolerance) - - return ctrl.Result{}, nil - } - - // Create disruption - scheduledTime := time.Now() - disruption, err := CreateDisruptionFromTemplate(ctx, r.Client, r.Scheme, instance, &instance.Spec.TargetResource, &instance.Spec.DisruptionTemplate, scheduledTime, r.log) - - if err != nil { - r.log.Warnw("unable to construct disruption from template", "err", err) - return scheduledResult, nil - } - - if err := r.Client.Create(ctx, disruption); err != nil { - r.log.Warnw("unable to create Disruption for DisruptionRollout", "disruption", disruption, "err", err) - return ctrl.Result{}, err - } - - r.handleMetricSinkError(r.MetricsSink.MetricDisruptionScheduled(append(DisruptionRolloutTags, "disruptionName:"+disruption.Name))) - - r.log.Infow("created Disruption for DisruptionRollout run", cLog.DisruptionNameKey, disruption.Name) - - // ------------------------------------------------------------------ // - // If this process restarts at this point (after posting a disruption, but - // before updating the status), we might try to start the disruption again - // the next time. To prevent this, we use the same disruption name for every - // execution, acting as a lock to prevent creating the disruption twice. - - // Add the start time of the just initiated disruption to the status - instance.Status.LastScheduleTime = &metav1.Time{Time: scheduledTime} - if err := r.Client.Status().Update(ctx, instance); err != nil { - r.log.Warnw("unable to update LastScheduleTime of DisruptionCron status", "err", err) - return ctrl.Result{}, err - } - - return ctrl.Result{}, nil -} - -// updateLastScheduleTime updates the LastScheduleTime in the status of a DisruptionRollout instance -// based on the most recent schedule time among the given disruptions. -func (r *DisruptionRolloutReconciler) updateLastScheduleTime(ctx context.Context, instance *chaosv1beta1.DisruptionRollout, disruptions *chaosv1beta1.DisruptionList) error { - mostRecentScheduleTime := GetMostRecentScheduleTime(r.log, disruptions) // find the last run so we can update the status - if !mostRecentScheduleTime.IsZero() { - instance.Status.LastScheduleTime = &metav1.Time{Time: mostRecentScheduleTime} - return r.Client.Status().Update(ctx, instance) - } - - return nil // No need to update if mostRecentScheduleTime is nil -} - -// updateTargetResourcePreviouslyMissing updates the status when the target resource was previously missing. -// The function returns three values: -// - bool: Indicates whether the target resource is currently found. -// - bool: Indicates whether the disruptionrollout was deleted due to the target resource being missing for more than the expiration duration. -// - error: Represents any error that occurred during the execution of the function. -func (r *DisruptionRolloutReconciler) updateTargetResourcePreviouslyMissing(ctx context.Context, instance *chaosv1beta1.DisruptionRollout) (bool, bool, error) { - disruptionRolloutDeleted := false - targetResourceExists, err := CheckTargetResourceExists(ctx, r.Client, &instance.Spec.TargetResource, instance.Namespace) - - if err != nil { - return targetResourceExists, disruptionRolloutDeleted, err - } - - if !targetResourceExists { - r.log.Warnw("target does not exist, this disruption rollout will be deleted if that continues", "error", err) - - if instance.Status.TargetResourcePreviouslyMissing == nil { - r.log.Warnw("target is missing for the first time, updating status") - - return targetResourceExists, disruptionRolloutDeleted, r.handleTargetResourceFirstMissing(ctx, instance) - } - - if time.Since(instance.Status.TargetResourcePreviouslyMissing.Time) > r.TargetResourceMissingThreshold { - r.log.Errorw("target has been missing for over one day, deleting this schedule", - "timeMissing", time.Since(instance.Status.TargetResourcePreviouslyMissing.Time)) - - disruptionRolloutDeleted = true - - return targetResourceExists, disruptionRolloutDeleted, r.handleTargetResourceMissingPastExpiration(ctx, instance) - } - - r.handleMetricSinkError(r.MetricsSink.MetricTargetMissing(time.Since(instance.Status.TargetResourcePreviouslyMissing.Time), DisruptionRolloutTags)) - } else if instance.Status.TargetResourcePreviouslyMissing != nil { - r.log.Infow("target was previously missing, but now present. updating the status accordingly") - r.handleMetricSinkError(r.MetricsSink.MetricMissingTargetFound(DisruptionRolloutTags)) - - return targetResourceExists, disruptionRolloutDeleted, r.handleTargetResourceNowPresent(ctx, instance) - } - - return targetResourceExists, disruptionRolloutDeleted, nil -} - -// handleTargetResourceFirstMissing handles the scenario when the target resource is missing for the first time. -// It updates the status of the DisruptionRollout instance. -func (r *DisruptionRolloutReconciler) handleTargetResourceFirstMissing(ctx context.Context, instance *chaosv1beta1.DisruptionRollout) error { - instance.Status.TargetResourcePreviouslyMissing = &metav1.Time{Time: time.Now()} - if err := r.Client.Status().Update(ctx, instance); err != nil { - return fmt.Errorf("failed to update status: %w", err) - } - - return nil -} - -// handleTargetResourceMissingPastExpiration handles the scenario when the target resource has been missing for more than the expiration period. -// It deletes the DisruptionRollout instance. -func (r *DisruptionRolloutReconciler) handleTargetResourceMissingPastExpiration(ctx context.Context, instance *chaosv1beta1.DisruptionRollout) error { - if err := r.Client.Delete(ctx, instance); err != nil { - return fmt.Errorf("failed to delete instance: %w", err) - } - - r.handleMetricSinkError(r.MetricsSink.MetricMissingTargetDeleted(DisruptionRolloutTags)) - - return nil -} - -// handleTargetResourceNowPresent handles the scenario when the target resource was previously missing but is now present. -// It updates the status of the DisruptionRollout instance. -func (r *DisruptionRolloutReconciler) handleTargetResourceNowPresent(ctx context.Context, instance *chaosv1beta1.DisruptionRollout) error { - instance.Status.TargetResourcePreviouslyMissing = nil - if err := r.Client.Status().Update(ctx, instance); err != nil { - return fmt.Errorf("failed to update status: %w", err) - } - - return nil -} - -// handleMetricSinkError logs the given metric sink error if it is not nil -func (r *DisruptionRolloutReconciler) handleMetricSinkError(err error) { - if err != nil { - r.log.Errorw("error sending a metric", "error", err) - } -} - -// targetResourceUpdated checks whether the target resource has been updated or not. -func (r *DisruptionRolloutReconciler) targetResourceUpdated(status *chaosv1beta1.DisruptionRolloutStatus) bool { - if status == nil { - return false - } - - if status.LatestInitContainersHash == nil && - status.LatestContainersHash == nil && - status.LastContainerChangeTime == nil { - return false - } - - return true -} - -// SetupWithManager setups the current reconciler with the given manager -func (r *DisruptionRolloutReconciler) SetupWithManager(mgr ctrl.Manager) error { - return ctrl.NewControllerManagedBy(mgr). - For(&chaosv1beta1.DisruptionRollout{}). - Complete(r) -} diff --git a/examples/disruption_rollouts/disruption_rollout_network_drop.yaml b/examples/disruption_rollouts/disruption_rollout_network_drop.yaml deleted file mode 100644 index db9b04601a..0000000000 --- a/examples/disruption_rollouts/disruption_rollout_network_drop.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# Unless explicitly stated otherwise all files in this repository are licensed -# under the Apache License Version 2.0. -# This product includes software developed at Datadog (https://www.datadoghq.com/). -# Copyright 2025 Datadog, Inc. - -apiVersion: chaos.datadoghq.com/v1beta1 -kind: DisruptionRollout -metadata: - name: network-drop - namespace: chaos-demo -spec: - targetResource: - kind: deployment - name: demo-curl - disruptionTemplate: - level: pod - count: 1 - network: - drop: 100 # percentage of outgoing packets to drop - duration: 10s # disruption will time out after 10 seconds diff --git a/main.go b/main.go index efb4613a31..3dfefcb9d4 100644 --- a/main.go +++ b/main.go @@ -7,7 +7,6 @@ package main import ( "context" - "fmt" "net/http" "os" "time" @@ -45,10 +44,8 @@ import ( "k8s.io/client-go/kubernetes" clientgoscheme "k8s.io/client-go/kubernetes/scheme" _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" - "k8s.io/client-go/tools/cache" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" @@ -276,78 +273,6 @@ func main() { go disruptionReconciler.ReportMetrics(ctx) - if cfg.Controller.DisruptionRolloutEnabled { - // create deployment and statefulset informers - globalInformerFactory := kubeinformers.NewSharedInformerFactory(informerClient, time.Hour*24) - deploymentInformer := globalInformerFactory.Apps().V1().Deployments().Informer() - statefulsetInformer := globalInformerFactory.Apps().V1().StatefulSets().Informer() - - deploymentHandler := watchers.NewDeploymentHandler(mgr.GetClient(), logger) - statefulsetHandler := watchers.NewStatefulSetHandler(mgr.GetClient(), logger) - - _, err = deploymentInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: deploymentHandler.OnAdd, - UpdateFunc: deploymentHandler.OnUpdate, - DeleteFunc: deploymentHandler.OnDelete, - }) - if err != nil { - logger.Fatalw("unable to add event handler for Deployments", "error", err) - } - - _, err = statefulsetInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: statefulsetHandler.OnAdd, - UpdateFunc: statefulsetHandler.OnUpdate, - DeleteFunc: statefulsetHandler.OnDelete, - }) - if err != nil { - logger.Fatalw("unable to add event handler for StatefulSets", "error", err) - } - - // wait for the deployment and statefulset informer caches to be synced - synced := globalInformerFactory.WaitForCacheSync(ctx.Done()) - for informerType, ok := range synced { - if !ok { - logger.Errorw("failed to wait for informer cache to sync", "informer", informerType) - return - } - } - - // start the deployment and statefulset informers - globalInformerFactory.Start(stopCh) - - // create disruption rollout reconciler - disruptionRolloutReconciler := &controllers.DisruptionRolloutReconciler{ - Client: mgr.GetClient(), - BaseLog: logger, - Scheme: mgr.GetScheme(), - // new metrics sink for rollout controller - MetricsSink: initMetricsSink(cfg.Controller.MetricsSink, logger, metricstypes.SinkAppRolloutController), - TargetResourceMissingThreshold: cfg.Controller.TargetResourceMissingThreshold, - } - - defer closeMetricsSink(logger, disruptionRolloutReconciler.MetricsSink) - - if err := disruptionRolloutReconciler.SetupWithManager(mgr); err != nil { - logger.Errorw("unable to create controller", "controller", "DisruptionRollout", "error", err) - os.Exit(1) //nolint:gocritic - } - - // add the indexer on target resource for disruption rollouts - err = mgr.GetCache().IndexField(context.Background(), &chaosv1beta1.DisruptionRollout{}, "targetResource", func(obj client.Object) []string { - dr, ok := obj.(*chaosv1beta1.DisruptionRollout) - if !ok { - return []string{""} - } - - targetResource := fmt.Sprintf("%s-%s-%s", dr.Spec.TargetResource.Kind, dr.GetNamespace(), dr.Spec.TargetResource.Name) - - return []string{targetResource} - }) - if err != nil { - logger.Fatalw("unable to add index", "controller", "DisruptionRollout", "error", err) - } - } - // register disruption validating webhook setupWebhookConfig := utils.SetupWebhookWithManagerConfig{ Manager: mgr, diff --git a/o11y/metrics/datadog/datadog.go b/o11y/metrics/datadog/datadog.go index 70980d3fd8..ce5b1bd26a 100644 --- a/o11y/metrics/datadog/datadog.go +++ b/o11y/metrics/datadog/datadog.go @@ -17,10 +17,9 @@ import ( ) const ( - metricPrefixInjector = "chaos.injector." - metricPrefixController = "chaos.controller." - metricPrefixRolloutController = "chaos.rollout.controller." - metricPrefixCronController = "chaos.cron.controller." + metricPrefixInjector = "chaos.injector." + metricPrefixController = "chaos.controller." + metricPrefixCronController = "chaos.cron.controller." ) // Sink describes a Datadog sink (statsd) @@ -54,8 +53,6 @@ func GetPrefixFromApp(app types.SinkApp) (string, error) { switch app { case types.SinkAppController: return metricPrefixController, nil - case types.SinkAppRolloutController: - return metricPrefixRolloutController, nil case types.SinkAppCronController: return metricPrefixCronController, nil case types.SinkAppInjector: @@ -233,7 +230,7 @@ func (d Sink) MetricWatcherCalls(tags []string) error { } // MetricTooLate reports when a scheduled Disruption misses its configured time to be run, -// specific to cron and rollout controllers +// specific to cron controllers func (d Sink) MetricTooLate(tags []string) error { return d.metricWithStatus(d.prefix+"schedule.too_late", tags) } diff --git a/o11y/metrics/metrics.go b/o11y/metrics/metrics.go index 9a45896b47..2d0bd242d1 100644 --- a/o11y/metrics/metrics.go +++ b/o11y/metrics/metrics.go @@ -89,7 +89,7 @@ type Sink interface { // MetricOrphanFound increments when a chaos pod without a corresponding disruption resource is found MetricOrphanFound(tags []string) error // MetricTooLate reports when a scheduled Disruption misses its configured time to be run, - // specific to cron and rollout controllers + // specific to cron controllers MetricTooLate(tags []string) error // MetricTargetMissing reports anytime scheduled Disruption can not find its specified target MetricTargetMissing(duration time.Duration, tags []string) error @@ -101,7 +101,7 @@ type Sink interface { MetricMissingTargetDeleted(tags []string) error // MetricNextScheduledTime reports the duration until this scheduled Disruption's next scheduled disruption should run MetricNextScheduledTime(time time.Duration, tags []string) error - // MetricDisruptionScheduled reports when a new disruption is scheduled by a cron or rollout + // MetricDisruptionScheduled reports when a new disruption is scheduled by a cron MetricDisruptionScheduled(tags []string) error // MetricPausedCron reports when a disruption cron has reconciled in a paused state MetricPausedCron(tags []string) error diff --git a/o11y/metrics/noop/noop.go b/o11y/metrics/noop/noop.go index 86ab30049b..d0eb9712c0 100644 --- a/o11y/metrics/noop/noop.go +++ b/o11y/metrics/noop/noop.go @@ -221,7 +221,7 @@ func (n Sink) MetricWatcherCalls(tags []string) error { } // MetricTooLate reports when a scheduled Disruption misses its configured time to be run, -// specific to cron and rollout controllers +// specific to cron controllers func (n Sink) MetricTooLate(tags []string) error { n.log.Debugf("NOOP: MetricTooLate %s\n", tags) diff --git a/o11y/metrics/types/types.go b/o11y/metrics/types/types.go index 6cbdb2be9a..2c8410ef07 100644 --- a/o11y/metrics/types/types.go +++ b/o11y/metrics/types/types.go @@ -23,9 +23,6 @@ const ( // SinkAppController is the chaos controller SinkAppController SinkApp = "chaos-controller" - // SinkAppRolloutController is the rollout controller - SinkAppRolloutController SinkApp = "chaos-rollout-controller" - // SinkAppCronController is the cron controller SinkAppCronController SinkApp = "chaos-cron-controller" diff --git a/tasks/header.py b/tasks/header.py index 65ce1d13e4..34e96c32ed 100644 --- a/tasks/header.py +++ b/tasks/header.py @@ -31,7 +31,6 @@ "bin/injector/dns_disruption_resolver.py", "chart/templates/generated/chaos.datadoghq.com_disruptions.yaml", "chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml", - "chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml", "chart/templates/generated/role.yaml", "cpuset/cpuset.go", "grpc/disruptionlistener/disruptionlistener_grpc.pb.go", diff --git a/types/types.go b/types/types.go index 1f77ff8d15..73ec4dc4ed 100644 --- a/types/types.go +++ b/types/types.go @@ -120,12 +120,12 @@ const ( // DisruptionNamespaceLabel is the label used to identify the disruption namespace for a chaos pod. This is used to determine pod ownership. DisruptionNamespaceLabel = GroupName + "/disruption-namespace" - // ScheduledAtAnnotation is the annotation key for the scheduled time of the disruption when it is created from DisruptionCron or DisruptionRollout. + // ScheduledAtAnnotation is the annotation key for the scheduled time of the disruption when it is created from DisruptionCron. ScheduledAtAnnotation = GroupName + "/scheduled-at" - // UserAnnotation is the annotation key that stores the username of the user who created the parent resource (DisruptionCron or DisruptionRollout). + // UserAnnotation is the annotation key that stores the username of the user who created the parent resource (DisruptionCron). UserAnnotation = GroupName + "/user" - // UserGroupsAnnotation is the annotation key that stores the user groups of the individual who created the parent resource (DisruptionCron or DisruptionRollout). + // UserGroupsAnnotation is the annotation key that stores the user groups of the individual who created the parent resource (DisruptionCron). UserGroupsAnnotation = GroupName + "/user-groups" finalizerPrefix = "finalizer." + GroupName diff --git a/watchers/deployment_handler.go b/watchers/deployment_handler.go deleted file mode 100644 index 271bcf8cd1..0000000000 --- a/watchers/deployment_handler.go +++ /dev/null @@ -1,140 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2025 Datadog, Inc. - -package watchers - -import ( - "context" - "time" - - chaosv1beta1 "github.com/DataDog/chaos-controller/api/v1beta1" - "go.uber.org/zap" - appsv1 "k8s.io/api/apps/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -type DeploymentHandler struct { - Client client.Client - log *zap.SugaredLogger -} - -func NewDeploymentHandler(client client.Client, logger *zap.SugaredLogger) DeploymentHandler { - return DeploymentHandler{ - Client: client, - log: logger, - } -} - -// OnAdd is a handler function for the add of a deployment -func (h DeploymentHandler) OnAdd(obj interface{}) { - deployment, ok := obj.(*appsv1.Deployment) - - // If the object is not a deployment, do nothing - if !ok { - return - } - - // If deployment doesn't have associated disruption rollout, do nothing - hasDisruptionRollout, err := h.HasAssociatedDisruptionRollout(deployment) - if err != nil { - return - } - - if !hasDisruptionRollout { - return - } - - initContainersHash, containersHash, err := HashPodSpec(&deployment.Spec.Template.Spec) - if err != nil { - return - } - - err = h.UpdateDisruptionRolloutStatus(deployment, initContainersHash, containersHash) - if err != nil { - return - } -} - -// OnUpdate is a handler function for the update of a deployment -func (h DeploymentHandler) OnUpdate(oldObj, newObj interface{}) { - // Convert oldObj and newObj to Deployment objects - oldDeployment, okOldDeployment := oldObj.(*appsv1.Deployment) - newDeployment, okNewDeployment := newObj.(*appsv1.Deployment) - - // If both old and new are not deployments, do nothing - if !okOldDeployment || !okNewDeployment { - return - } - - // If deployment doesn't have associated disruption rollout, do nothing - hasDisruptionRollout, err := h.HasAssociatedDisruptionRollout(newDeployment) - if !hasDisruptionRollout || err != nil { - return - } - - // If containers have't changed, do nothing - containersChanged, initContainersHash, containersHash, err := ContainersChanged(&oldDeployment.Spec.Template.Spec, &newDeployment.Spec.Template.Spec, h.log) - if !containersChanged || err != nil { - return - } - - err = h.UpdateDisruptionRolloutStatus(newDeployment, initContainersHash, containersHash) - if err != nil { - return - } -} - -// OnDelete is a handler function for the delete of a deployment -func (h DeploymentHandler) OnDelete(_ interface{}) { - // Do nothing on delete event -} - -func (h DeploymentHandler) FetchAssociatedDisruptionRollouts(deployment *appsv1.Deployment) (*chaosv1beta1.DisruptionRolloutList, error) { - indexedValue := "deployment" + "-" + deployment.Namespace + "-" + deployment.Name - - // It would be more efficient to use label selectors, - // however it would require a webhook to add those labels when new rollouts are created - disruptionRollouts := &chaosv1beta1.DisruptionRolloutList{} - err := h.Client.List(context.Background(), disruptionRollouts, client.MatchingFields{"targetResource": indexedValue}) - - if err != nil { - h.log.Errorw("unable to fetch DisruptionRollouts using index", "error", err, "indexedValue", indexedValue) - return nil, err - } - - return disruptionRollouts, nil -} - -func (h DeploymentHandler) HasAssociatedDisruptionRollout(deployment *appsv1.Deployment) (bool, error) { - disruptionRollouts, err := h.FetchAssociatedDisruptionRollouts(deployment) - if err != nil { - h.log.Errorw("unable to check for associated DisruptionRollout", "Deployment", deployment.Name, "error", err) - return false, err - } - - return len(disruptionRollouts.Items) > 0, nil -} - -func (h DeploymentHandler) UpdateDisruptionRolloutStatus(deployment *appsv1.Deployment, initContainersHash, containersHash map[string]string) error { - disruptionRollouts, err := h.FetchAssociatedDisruptionRollouts(deployment) - if err != nil { - return err - } - - for _, dr := range disruptionRollouts.Items { - dr.Status.LatestInitContainersHash = initContainersHash - dr.Status.LatestContainersHash = containersHash - dr.Status.LastContainerChangeTime = &metav1.Time{Time: time.Now()} - - err = h.Client.Status().Update(context.Background(), &dr) - if err != nil { - h.log.Errorw("unable to update DisruptionRollout status", "DisruptionRollout", dr.Name, "error", err) - return err - } - } - - return nil -} diff --git a/watchers/statefulset_handler.go b/watchers/statefulset_handler.go deleted file mode 100644 index c731d2c729..0000000000 --- a/watchers/statefulset_handler.go +++ /dev/null @@ -1,140 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2025 Datadog, Inc. - -package watchers - -import ( - context "context" - "time" - - chaosv1beta1 "github.com/DataDog/chaos-controller/api/v1beta1" - "go.uber.org/zap" - appsv1 "k8s.io/api/apps/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -type StatefulSetHandler struct { - Client client.Client - log *zap.SugaredLogger -} - -func NewStatefulSetHandler(client client.Client, logger *zap.SugaredLogger) StatefulSetHandler { - return StatefulSetHandler{ - Client: client, - log: logger, - } -} - -// OnAdd is a handler function for the add of a statefulset -func (h StatefulSetHandler) OnAdd(obj interface{}) { - statefulset, ok := obj.(*appsv1.StatefulSet) - - // If the object is not a statefulset, do nothing - if !ok { - return - } - - // If statefulset doesn't have associated disruption rollout, do nothing - hasDisruptionRollout, err := h.HasAssociatedDisruptionRollout(statefulset) - if err != nil { - return - } - - if !hasDisruptionRollout { - return - } - - initContainersHash, containersHash, err := HashPodSpec(&statefulset.Spec.Template.Spec) - if err != nil { - return - } - - err = h.UpdateDisruptionRolloutStatus(statefulset, initContainersHash, containersHash) - if err != nil { - return - } -} - -// OnUpdate is a handler function for the update of a statefulset -func (h StatefulSetHandler) OnUpdate(oldObj, newObj interface{}) { - // Convert oldObj and newObj to Deployment objects - oldStatefulSet, okOldStatefulSet := oldObj.(*appsv1.StatefulSet) - newStatefulSet, okNewStatefulSet := newObj.(*appsv1.StatefulSet) - - // If both old and new are not statefulsets, do nothing - if !okOldStatefulSet || !okNewStatefulSet { - return - } - - // If statefulset doesn't have associated disruption rollout, do nothing - hasDisruptionRollout, err := h.HasAssociatedDisruptionRollout(newStatefulSet) - if !hasDisruptionRollout || err != nil { - return - } - - // If containers have't changed, do nothing - containersChanged, initContainersHash, containersHash, err := ContainersChanged(&oldStatefulSet.Spec.Template.Spec, &newStatefulSet.Spec.Template.Spec, h.log) - if !containersChanged || err != nil { - return - } - - err = h.UpdateDisruptionRolloutStatus(newStatefulSet, initContainersHash, containersHash) - if err != nil { - return - } -} - -// OnDelete is a handler function for the delete of a statefulset -func (h StatefulSetHandler) OnDelete(_ interface{}) { - // Do nothing on delete event -} - -func (h StatefulSetHandler) FetchAssociatedDisruptionRollouts(statefulset *appsv1.StatefulSet) (*chaosv1beta1.DisruptionRolloutList, error) { - indexedValue := "statefulset" + "-" + statefulset.Namespace + "-" + statefulset.Name - - // It would be more efficient to use label selectors, - // however it would require a webhook to add those labels when new rollouts are created - disruptionRollouts := &chaosv1beta1.DisruptionRolloutList{} - err := h.Client.List(context.Background(), disruptionRollouts, client.MatchingFields{"targetResource": indexedValue}) - - if err != nil { - h.log.Errorw("unable to fetch DisruptionRollouts using index", "error", err, "indexedValue", indexedValue) - return nil, err - } - - return disruptionRollouts, nil -} - -func (h StatefulSetHandler) HasAssociatedDisruptionRollout(statefulset *appsv1.StatefulSet) (bool, error) { - disruptionRollouts, err := h.FetchAssociatedDisruptionRollouts(statefulset) - if err != nil { - h.log.Errorw("unable to check for associated DisruptionRollout", "StatefulSet", statefulset.Name, "error", err) - return false, err - } - - return len(disruptionRollouts.Items) > 0, nil -} - -func (h StatefulSetHandler) UpdateDisruptionRolloutStatus(statefulset *appsv1.StatefulSet, initContainersHash, containersHash map[string]string) error { - disruptionRollouts, err := h.FetchAssociatedDisruptionRollouts(statefulset) - if err != nil { - return err - } - - for _, dr := range disruptionRollouts.Items { - dr.Status.LatestInitContainersHash = initContainersHash - dr.Status.LatestContainersHash = containersHash - dr.Status.LastContainerChangeTime = &metav1.Time{Time: time.Now()} - - err = h.Client.Status().Update(context.Background(), &dr) - if err != nil { - h.log.Errorw("unable to update DisruptionRollout status", "DisruptionRollout", dr.Name, "error", err) - return err - } - } - - return nil -}