diff --git a/slice/README.md b/slice/README.md
index 9154639af..1b3bd8456 100644
--- a/slice/README.md
+++ b/slice/README.md
@@ -110,6 +110,27 @@ the '--force' flag and manually ensure that any custom configuration
 previously added to 'dist/chart/values.yaml' or 'dist/chart/manager/manager.yaml'
 is manually re-applied afterwards.
 
+#### Installing kueue
+```bash
+helm install kueue oci://registry.k8s.io/kueue/charts/kueue --version="0.14.0" \
+  --create-namespace --namespace=kueue-system
+```
+
+#### Installing slice using Helm
+```bash
+IMAGE_REPO=/slice-controller
+IMAGE_TAG=tag
+helm install slice ./charts/slice --set image.repository=$IMAGE_REPO --set image.tag=$IMAGE_TAG --set kueueResources.create=true --create-namespace --namespace slice-controller-system
+
+```
+
+### Uninstallation
+```bash
+helm uninstall slice --namespace=slice-controller-system
+helm uninstall kueue --namespace=kueue-system
+```
+
+
 ## Contributing
 // TODO(user): Add detailed information on how you would like others to contribute to this project
 
diff --git a/slice/charts/slice/.helmignore b/slice/charts/slice/.helmignore
new file mode 100644
index 000000000..0e8a0eb36
--- /dev/null
+++ b/slice/charts/slice/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/slice/charts/slice/Chart.yaml b/slice/charts/slice/Chart.yaml
new file mode 100644
index 000000000..f4fc14f48
--- /dev/null
+++ b/slice/charts/slice/Chart.yaml
@@ -0,0 +1,24 @@
+apiVersion: v2  # v2 is required for the 'type' field below (Helm 3+)
+name: slice
+description: A Helm chart for slice-controller-system
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.1.0
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "0.1.0"
diff --git a/slice/charts/slice/crds/slice.accelerator.gke.io_slices.yaml b/slice/charts/slice/crds/slice.accelerator.gke.io_slices.yaml
new file mode 100644
index 000000000..f37d252cd
--- /dev/null
+++ b/slice/charts/slice/crds/slice.accelerator.gke.io_slices.yaml
@@ -0,0 +1,158 @@
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.18.0
+  name: slices.slice.accelerator.gke.io
+spec:
+  group: slice.accelerator.gke.io
+  names:
+    kind: Slice
+    listKind: SliceList
+    plural: slices
+    singular: slice
+  scope: Namespaced
+  versions:
+  - additionalPrinterColumns:
+    - jsonPath: .spec.acceleratorType
+      name: Type
+      type: string
+    - jsonPath: .spec.acceleratorTopology
+      name: Topology
+      type: string
+    - jsonPath: .status.conditions[0].type
+      name: Status
+      type: string
+    - jsonPath: .metadata.creationTimestamp
+      name: Age
+      type: date
+    name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        description: Slice is the Schema for the slices API.
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: SliceSpec defines the desired state of Slice.
+            properties:
+              acceleratorTopology:
+                description: Topology represents the network topology of the slice.
+                type: string
+              acceleratorType:
+                description: AcceleratorType specifies the type of accelerator used
+                  in this slice.
+                type: string
+              nodeSelector:
+                additionalProperties:
+                  items:
+                    type: string
+                  type: array
+                description: |-
+                  Required, set of nodes to use to form a slice.
+                  NodeSelector specifies a set of label-based selectors for nodes that can form the
+                  slice. The controller will select nodes where for each key-value pair in the map,
+                  the node's label value for that key is present in the corresponding string slice.
+                  This allows for a flexible "match any of these values for this label" selection.
+                  The nodeSelector will follow an AND over the map entries but an OR within the list
+                  items of the entry.
+                  For example, to select nodes in cubes cube-1 and cube-2, you could use:
+                  {"cloud.google.com/gke-tpu-reservation-subblock": ["cube-1", "cube-2"]}
+                type: object
+            required:
+            - acceleratorTopology
+            - acceleratorType
+            - nodeSelector
+            type: object
+          status:
+            description: SliceStatus defines the observed state of Slice.
+            properties:
+              blockId:
+                description: Populated to match the physical topology of block the
+                  Super-Slice is running on
+                type: string
+              conditions:
+                description: Conditions store the status conditions of the Slice
+                items:
+                  description: Condition contains details for one aspect of the current
+                    state of this API Resource.
+                  properties:
+                    lastTransitionTime:
+                      description: |-
+                        lastTransitionTime is the last time the condition transitioned from one status to another.
+                        This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
+                      format: date-time
+                      type: string
+                    message:
+                      description: |-
+                        message is a human readable message indicating details about the transition.
+                        This may be an empty string.
+                      maxLength: 32768
+                      type: string
+                    observedGeneration:
+                      description: |-
+                        observedGeneration represents the .metadata.generation that the condition was set based upon.
+                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
+                        with respect to the current state of the instance.
+                      format: int64
+                      minimum: 0
+                      type: integer
+                    reason:
+                      description: |-
+                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
+                        Producers of specific condition types may define expected values and meanings for this field,
+                        and whether the values are considered a guaranteed API.
+                        The value should be a CamelCase string.
+                        This field may not be empty.
+                      maxLength: 1024
+                      minLength: 1
+                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
+                      type: string
+                    status:
+                      description: status of the condition, one of True, False, Unknown.
+                      enum:
+                      - "True"
+                      - "False"
+                      - Unknown
+                      type: string
+                    type:
+                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
+                      maxLength: 316
+                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
+                      type: string
+                  required:
+                  - lastTransitionTime
+                  - message
+                  - reason
+                  - status
+                  - type
+                  type: object
+                type: array
+              subBlockIds:
+                description: Populated to list of physical topology of sub-block the
+                  Super-Slice is running on
+                items:
+                  type: string
+                type: array
+            type: object
+        type: object
+    served: true
+    storage: true
+    subresources:
+      status: {}
diff --git a/slice/charts/slice/templates/_helpers.tpl b/slice/charts/slice/templates/_helpers.tpl
new file mode 100644
index 000000000..7774e1809
--- /dev/null
+++ b/slice/charts/slice/templates/_helpers.tpl
@@ -0,0 +1,55 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "slice.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "slice.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "slice.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "slice.labels" -}}
+helm.sh/chart: {{ include "slice.chart" . }}
+app.kubernetes.io/name: {{ include "slice.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create the name of the service account to use
+*/}}
+{{- define "slice.serviceAccountName" -}}
+{{- if .Values.serviceAccount.create }}
+{{- default (include "slice.fullname" .) .Values.serviceAccount.name }}
+{{- else }}
+{{- default "default" .Values.serviceAccount.name }}
+{{- end }}
+{{- end -}}
diff --git a/slice/charts/slice/templates/deployment.yaml b/slice/charts/slice/templates/deployment.yaml
new file mode 100644
index 000000000..f3443e6d8
--- /dev/null
+++ b/slice/charts/slice/templates/deployment.yaml
@@ -0,0 +1,50 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "slice.fullname" . }}
+  labels:
+    {{- include "slice.labels" . | nindent 4 }}
+spec:
+  replicas: {{ .Values.replicaCount }}
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: {{ include "slice.name" . }}
+      app.kubernetes.io/instance: {{ .Release.Name }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: {{ include "slice.name" . }}
+        app.kubernetes.io/instance: {{ .Release.Name }}
+    spec:
+      {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      serviceAccountName: {{ include "slice.serviceAccountName" . }}
+      securityContext:
+        runAsNonRoot: true
+      containers:
+        - name: {{ .Chart.Name }}
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+              - "ALL"
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          ports:
+            - name: webhook-server
+              containerPort: 9443
+              protocol: TCP
+          volumeMounts:
+          - mountPath: /tmp/k8s-webhook-server/serving-certs
+            name: cert
+            readOnly: true
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+      volumes:
+      - name: cert
+        secret:
+          defaultMode: 420
+          secretName: slice-controller-webhook-server-cert
+          optional: true
diff --git a/slice/charts/slice/templates/kueue-setup.yaml b/slice/charts/slice/templates/kueue-setup.yaml
new file mode 100644
index 000000000..0a7220dea
--- /dev/null
+++ b/slice/charts/slice/templates/kueue-setup.yaml
@@ -0,0 +1,63 @@
+{{- if .Values.kueueResources.create -}}
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: Topology
+metadata:
+  name: {{ .Values.kueueResources.topology.name }}
+  labels:
+    {{- include "slice.labels" . | nindent 4 }}
+spec:
+  levels:
+  {{- range .Values.kueueResources.topology.levels }}
+  - nodeLabel: {{ . }}
+  {{- end }}
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ResourceFlavor
+metadata:
+  name: {{ .Values.kueueResources.resourceFlavor.name }}
+  labels:
+    {{- include "slice.labels" . | nindent 4 }}
+spec:
+  nodeLabels: {{- toYaml .Values.kueueResources.resourceFlavor.nodeLabels | nindent 4 }}
+  topologyName: {{ .Values.kueueResources.topology.name }}
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: AdmissionCheck
+metadata:
+  name: {{ .Values.kueueResources.admissionCheck.name }}
+  labels:
+    {{- include "slice.labels" . | nindent 4 }}
+spec:
+  controllerName: accelerator.gke.io/slice
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: {{ .Values.kueueResources.clusterQueue.name }}
+  labels:
+    {{- include "slice.labels" . | nindent 4 }}
+spec:
+  namespaceSelector: {}
+  admissionChecks:
+  - {{ .Values.kueueResources.admissionCheck.name }}
+  resourceGroups:
+  - coveredResources:
+    {{- toYaml .Values.kueueResources.clusterQueue.coveredResources | nindent 4 }}
+    flavors:
+    - name: {{ .Values.kueueResources.resourceFlavor.name }}
+      resources:
+      {{- range .Values.kueueResources.clusterQueue.coveredResources }}
+      - name: {{ . }}
+        nominalQuota: "9999"
+      {{- end }}
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: LocalQueue
+metadata:
+  name: {{ .Values.kueueResources.localQueue.name }}
+  namespace: {{ .Values.kueueResources.localQueue.namespace }}
+  labels:
+    {{- include "slice.labels" . | nindent 4 }}
+spec:
+  clusterQueue: {{ .Values.kueueResources.clusterQueue.name }}
+{{- end }}
diff --git a/slice/charts/slice/templates/rbac.yaml b/slice/charts/slice/templates/rbac.yaml
new file mode 100644
index 000000000..5c9e0699e
--- /dev/null
+++ b/slice/charts/slice/templates/rbac.yaml
@@ -0,0 +1,116 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "slice.fullname" . }}-manager-role
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - events
+  verbs:
+  - create
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  - pods
+  verbs:
+  - get
+  - list
+  - watch
+- apiGroups:
+  - ""
+  resources:
+  - secrets
+  verbs:
+  - get
+  - list
+  - update
+  - watch
+- apiGroups:
+  - admissionregistration.k8s.io
+  resources:
+  - mutatingwebhookconfigurations
+  verbs:
+  - get
+  - list
+  - update
+  - watch
+- apiGroups:
+  - jobset.x-k8s.io
+  resources:
+  - jobsets
+  verbs:
+  - get
+  - list
+  - watch
+- apiGroups:
+  - kueue.x-k8s.io
+  resources:
+  - admissionchecks
+  verbs:
+  - get
+  - list
+  - watch
+- apiGroups:
+  - kueue.x-k8s.io
+  resources:
+  - admissionchecks/status
+  - workloads/status
+  verbs:
+  - get
+  - patch
+  - update
+- apiGroups:
+  - kueue.x-k8s.io
+  resources:
+  - workloads
+  verbs:
+  - create
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - slice.accelerator.gke.io
+  resources:
+  - slices
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - slice.accelerator.gke.io
+  resources:
+  - slices/finalizers
+  verbs:
+  - update
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "slice.fullname" . }}-manager-rolebinding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "slice.fullname" . }}-manager-role
+subjects:
+- kind: ServiceAccount
+  name: {{ include "slice.serviceAccountName" . }}
+  namespace: {{ .Release.Namespace }}
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: slice-controller-webhook-server-cert
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "slice.labels" . | nindent 4 }}
diff --git a/slice/charts/slice/templates/service.yaml b/slice/charts/slice/templates/service.yaml
new file mode 100644
index 000000000..620323352
--- /dev/null
+++ b/slice/charts/slice/templates/service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: slice-controller-webhook-service
+  namespace: {{ .Release.Namespace }}  # follow the release so the Service sits next to the controller pods
+  labels:
+    {{- include "slice.labels" . | nindent 4 }}
+spec:
+  ports:
+  - port: 443
+    protocol: TCP
+    targetPort: 9443
+  selector:
+    app.kubernetes.io/name: {{ include "slice.name" . }}
+    app.kubernetes.io/instance: {{ .Release.Name }}
diff --git a/slice/charts/slice/templates/serviceaccount.yaml b/slice/charts/slice/templates/serviceaccount.yaml
new file mode 100644
index 000000000..1423af2bf
--- /dev/null
+++ b/slice/charts/slice/templates/serviceaccount.yaml
@@ -0,0 +1,8 @@
+{{- if .Values.serviceAccount.create -}}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "slice.serviceAccountName" . }}
+  labels:
+    {{- include "slice.labels" . | nindent 4 }}
+{{- end -}}
diff --git a/slice/charts/slice/templates/webhook.yaml b/slice/charts/slice/templates/webhook.yaml
new file mode 100644
index 000000000..65fe96bfc
--- /dev/null
+++ b/slice/charts/slice/templates/webhook.yaml
@@ -0,0 +1,27 @@
+apiVersion: admissionregistration.k8s.io/v1
+kind: MutatingWebhookConfiguration
+metadata:
+  name: slice-controller-mutating-webhook-configuration
+  labels:
+    {{- include "slice.labels" . | nindent 4 }}
+webhooks:
+- admissionReviewVersions:
+  - v1
+  clientConfig:
+    service:
+      name: slice-controller-webhook-service
+      namespace: {{ .Release.Namespace }}  # must be the namespace slice-controller-webhook-service is installed in
+      path: /mutate-jobset-x-k8s-io-v1alpha2-jobset
+  failurePolicy: Fail
+  name: mjobset.kb.io
+  rules:
+  - apiGroups:
+    - jobset.x-k8s.io
+    apiVersions:
+    - v1alpha2
+    operations:
+    - CREATE
+    - UPDATE
+    resources:
+    - jobsets
+  sideEffects: None
diff --git a/slice/charts/slice/values.yaml b/slice/charts/slice/values.yaml
new file mode 100644
index 000000000..6141a7c16
--- /dev/null
+++ b/slice/charts/slice/values.yaml
@@ -0,0 +1,40 @@
+nameOverride: ""
+fullnameOverride: ""
+
+replicaCount: 1
+
+image:
+  repository: us-central1-docker.pkg.dev/k8s-staging-images/slice/slice
+  pullPolicy: IfNotPresent
+  tag: main
+
+imagePullSecrets: []
+
+serviceAccount:
+  create: true
+  name: ""
+
+resources: {}
+
+kueueResources:
+  create: false
+  topology:
+    name: topology
+    levels:
+      - cloud.google.com/gce-topology-block
+      - cloud.google.com/gke-tpu-slice-4x4x4-id
+      - kubernetes.io/hostname
+  resourceFlavor:
+    name: superslice-flavor
+    nodeLabels:
+      cloud.google.com/gke-tpu-accelerator: tpu-v7x
+      cloud.google.com/gke-tpu-slice-4x4x4-health: "true"
+  admissionCheck:
+    name: admissioncheck
+  clusterQueue:
+    name: superslice-clusterqueue
+    coveredResources:
+      - cloud.google.com/tpu
+  localQueue:
+    name: superslice-queue
+    namespace: default