diff --git a/builtin/files/cluster.yaml.tmpl b/builtin/files/cluster.yaml.tmpl index 0170b35ae..dfe8bd962 100644 --- a/builtin/files/cluster.yaml.tmpl +++ b/builtin/files/cluster.yaml.tmpl @@ -1573,7 +1573,9 @@ kubeAwsPlugins: # See plugins/aws-iam-authenticator/plugin.yaml for more info awsIamAuthenticator: enabled: false - # see plugins/cluster-autoscaler/plugin.yaml for more info + + # clusterAutoscaler provides kubernetes cluster-autoscaler functionality - https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler + # Replaces original built-in functionality with a plugin and upgrades to the latest version clusterAutoscaler: enabled: false replicas: 2 @@ -1604,3 +1606,15 @@ kubeAwsPlugins: # selectors for autodiscovery selector: prometheus: monitoring + + # upgradeHelper - assists when rolling out new versions of kubernetes + # It actively disables old controllers and temporarily removes mutating/validating webhooks whilst + # the upgraded controller is starting up. + # NOTE: You will normally not need this plugin - so ONLY enable it if you are experiencing issues when migrating across versions. + # It will only kill controllers that are on a different release from the version currently spinning up, e.g.
:-
+  # it will kill v1.13.2 controllers when rolling out v1.14.0
+  # it will NOT kill v1.14.0 controllers when rolling out v1.14.3
+  upgradeHelper:
+    enabled: false
+    # disableWebhooks can be used to turn off the webhook feature if required
+    disableWebhooks: true
diff --git a/builtin/files/plugins/upgrade-helper/assets/restore-webhooks.sh b/builtin/files/plugins/upgrade-helper/assets/restore-webhooks.sh
new file mode 100644
index 000000000..de9d359fc
--- /dev/null
+++ b/builtin/files/plugins/upgrade-helper/assets/restore-webhooks.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Restore webhooks that were exported and then deleted by upgrade-helper.sh
+
+retries=5
+hyperkube_image="{{ .Config.HyperkubeImage.RepoWithTag }}"
+disable_webhooks="{{ if .Values.disableWebhooks }}true{{else}}false{{end}}"
+
+# Run kubectl from the hyperkube image with the admin kubeconfig.
+# /srv/kubernetes is mounted read-only so that the saved webhook manifests
+# passed via -f are visible inside the container.
+kubectl() {
+  /usr/bin/docker run -i --rm -v /etc/kubernetes:/etc/kubernetes:ro -v /srv/kubernetes:/srv/kubernetes:ro --net=host "${hyperkube_image}" /hyperkube kubectl --kubeconfig=/etc/kubernetes/kubeconfig/admin.yaml "$@"
+}
+
+# Succeeds only if file $1 exists, is non-empty and does not contain the
+# empty-list marker "items: []".
+list_not_empty() {
+  local file=$1
+  if ! [[ -s "${file}" ]]; then
+    return 1
+  fi
+  # -q: only test for the marker, do not echo the matching line
+  if grep -qse 'items: \[\]' "${file}"; then
+    return 1
+  fi
+  return 0
+}
+
+# kubectl-apply all manifests given as arguments (passed as one comma-joined -f list).
+applyall() {
+  local joined
+  joined=$(IFS=,; printf '%s' "$*")
+  kubectl apply --force -f "${joined}"
+}
+
+# Re-apply the webhook configurations of kind $1 saved in file $2, if there are any.
+restore_webhooks() {
+  local type=$1
+  local file=$2
+
+  if list_not_empty "${file}"; then
+    echo "Restoring all ${type} webhooks from ${file}"
+    applyall "${file}"
+  else
+    echo "no webhooks to restore in $file"
+  fi
+}
+
+if [[ "${disable_webhooks}" == "true" ]]; then
+  echo "Restoring all validating and mutating webhooks..."
+  restore_webhooks validating /srv/kubernetes/validating_webhooks.yaml
+  restore_webhooks mutating /srv/kubernetes/mutating_webhooks.yaml
+fi
+exit 0
\ No newline at end of file
diff --git a/builtin/files/plugins/upgrade-helper/assets/upgrade-helper.sh b/builtin/files/plugins/upgrade-helper/assets/upgrade-helper.sh
new file mode 100644
index 000000000..a61cc28a5
--- /dev/null
+++ b/builtin/files/plugins/upgrade-helper/assets/upgrade-helper.sh
@@ -0,0 +1,227 @@
+#!/bin/bash
+# Smooths upgrades/roll-backs where the release of kubernetes jumps a release
+# It kills old controllers so that this one takes over all api functions, so we don't get an
+# extended period of old and new running side-by-side and the incompatibilities that this can bring.
+# It also removes any mutating and validating webhooks in the system so that install-kube-system can run without interference.
+#
+# A request to disable is a configmap matching the hostname and kubernetes version containing a list of core services to stop: -
+# apiVersion: v1
+# kind: ConfigMap
+# metadata:
+#   name: kube-aws-migration-disable-ip-10-29-26-83.us-west-2.compute.internal
+#   namespace: kube-system
+# data:
+#   kubernetesVersion: v1.9.3
+#   disable: "kube-apiserver kube-controller-manager kube-scheduler"
+
+retries=5
+hyperkube_image="{{ .Config.HyperkubeImage.RepoWithTag }}"
+my_kubernetes_version="{{ .Config.HyperkubeImage.Tag }}"
+myhostname=$(hostname -f)
+disable_webhooks="{{ if .Values.disableWebhooks }}true{{else}}false{{end}}"
+
+# Run kubectl from the hyperkube image with the admin kubeconfig.
+# /srv/kubernetes is mounted read-only so that manifests passed via -f are visible inside the container.
+kubectl() {
+  /usr/bin/docker run -i --rm -v /etc/kubernetes:/etc/kubernetes:ro -v /srv/kubernetes:/srv/kubernetes:ro --net=host "${hyperkube_image}" /hyperkube kubectl --kubeconfig=/etc/kubernetes/kubeconfig/admin.yaml "$@"
+}
+
+# Run kubectl with the given arguments, retrying up to $retries times (10s apart).
+# Prints kubectl's stdout on success and returns the exit code of the last attempt.
+kubectl_with_retries() {
+  local tries=0
+  local result_text=""
+  local return_code=0
+
+  while [ "$tries" -lt "$retries" ]; do
+    result_text=$(kubectl "$@")
+    return_code=$?
+    if [ "$return_code" -eq "0" ]; then
+      echo "${result_text}"
+      break
+    fi
+    sleep 10
+    tries=$((tries+1))
+  done
+  return $return_code
+}
+
+# Write a message to stderr.
+log() {
+  echo "$@" >&2
+}
+
+# Print one "<node-name>:<kubelet-version>" line per node labelled kubernetes.io/role=master.
+get_masters() {
+  kubectl get nodes -l kubernetes.io/role=master --no-headers -o custom-columns=NAME:metadata.name,VERSION:status.nodeInfo.kubeletVersion | awk '{printf "%s:%s\n", $1, $2}'
+}
+
+# Succeeds if $1 starts with a vMAJOR.MINOR.PATCH version number.
+valid_version() {
+  [[ "$1" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]
+}
+
+# Succeeds when versions $1 and $2 differ in their vMAJOR.MINOR prefix;
+# a patch-only difference (e.g. v1.14.0 vs v1.14.3) is NOT a version jump.
+version_jumps() {
+  local mine theirs
+  mine=$(printf '%s\n' "$1" | cut -d. -f1,2)
+  theirs=$(printf '%s\n' "$2" | cut -d. -f1,2)
+  [[ "${mine}" != "${theirs}" ]]
+}
+
+# stop a controller by writing a special kube-aws disable service configmap
+disable_controller() {
+  local controller=$1
+  local version=$2
+
+  local request="$(cat <$file
+    if list_not_empty $file; then
+      echo "deleting $type webhooks..."
+      ensuredelete $file
+    fi
+  fi
+}
+
+# Succeeds only if file $1 exists, is non-empty and does not contain the
+# empty-list marker "items: []".
+list_not_empty() {
+  local file=$1
+  if ! [[ -s "${file}" ]]; then
+    return 1
+  fi
+  # -q: only test for the marker, do not echo the matching line
+  if grep -qse 'items: \[\]' "${file}"; then
+    return 1
+  fi
+  return 0
+}
+
+# kubectl-delete everything defined in the manifests given as arguments (passed as one comma-joined -f list).
+ensuredelete() {
+  local joined
+  joined=$(IFS=,; printf '%s' "$*")
+  kubectl delete --cascade=true --ignore-not-found=true -f "${joined}"
+}
+
+# MAIN
+
+if ! valid_version "${my_kubernetes_version}"; then
+  log "My kubernetes version ${my_kubernetes_version} is invalid - aborting!"
+  exit 1
+fi
+
+while ! kubectl get ns kube-system; do
+  echo "waiting for apiserver to be available..."
+  sleep 3
+done
+
+# Disable all mutating and validating webhooks because they can interfere with the stack migration
+if [[ "${disable_webhooks}" == "true" ]]; then
+  echo "Storing and removing all validating and mutating webhooks..."
+  save_webhooks validating /srv/kubernetes/validating_webhooks.yaml
+  save_webhooks mutating /srv/kubernetes/mutating_webhooks.yaml
+fi
+
+log ""
+log "CHECKING CONTROLLER VERSIONS..."
+log ""
+found=""  # space-separated names of the controllers that were asked to stop
+for controller in $(get_masters); do  # each entry is "<node-name>:<kubelet-version>"
+  controller_name="${controller%%:*}"
+  controller_version="${controller##*:}"
+  if [[ "${controller_name}" != "$myhostname" ]]; then  # never disable ourselves
+    if ! valid_version "${controller_version}"; then
+      log "Controller ${controller_name} has an invalid version number ${controller_version}!"
+      continue
+    fi
+    # only controllers on a different release from ours are disabled
+    if version_jumps "${my_kubernetes_version}" "${controller_version}"; then
+      log "Detected a version jump on ${controller_name}: my version is ${my_kubernetes_version} and theirs is ${controller_version}"
+      log "Disabling kube-apiserver, kube-scheduler and kube-controller-manager..."
+      if [[ -z "${found}" ]]; then
+        found="${controller_name}"
+      else
+        found="${found} ${controller_name}"
+      fi
+      disable_controller "${controller_name}" "${controller_version}"
+    else
+      log "No version jump on ${controller_name}: my version is ${my_kubernetes_version} and theirs is ${controller_version}"
+    fi
+  fi
+done
+
+if [[ -n "${found}" ]]; then
+  log ""
+  log "WAITING FOR FOUND CONTROLLERS TO STOP..."
+  log ""
+  wait_stopped "${found}"
+fi
+exit 0
\ No newline at end of file
diff --git a/builtin/files/plugins/upgrade-helper/plugin.yaml b/builtin/files/plugins/upgrade-helper/plugin.yaml
new file mode 100644
index 000000000..52fcbf4e1
--- /dev/null
+++ b/builtin/files/plugins/upgrade-helper/plugin.yaml
@@ -0,0 +1,54 @@
+metadata:
+  name: upgrade-helper
+  version: 0.1.0
+spec:
+  cluster:
+    values:
+      disableWebhooks: true
+    machine:
+      roles:
+        controller:
+          files:
+          - path: /etc/systemd/system/install-kube-system.service.d/10-upgrade-helper-dependency.conf
+            permissions: 0644
+            content: |
+              [Unit]
+              Requires=kube-aws-upgrade-helper.service
+              After=kube-aws-upgrade-helper.service
+              Before=restore-webhooks.service
+
+              [Service]
+              ExecStartPre=/usr/bin/bash -c "until /usr/bin/systemctl is-active kube-aws-upgrade-helper.service; do echo waiting until kube-aws-upgrade-helper.service starts; sleep 10; done"
+          - path: /opt/bin/upgrade-helper.sh
+            permissions: 0755
+            source:
+              path: assets/upgrade-helper.sh
+          - path: /opt/bin/restore-webhooks.sh
+            permissions: 0755
+            source:
+              path: assets/restore-webhooks.sh
+          systemd:
+            units:
+            - name: kube-aws-upgrade-helper.service
+              content: |
+                [Unit]
+                Requires=kubelet.service
+                After=kubelet.service
+                Before=install-kube-system.service
+
+                [Service]
+                Type=oneshot
+                StartLimitInterval=0
+                RemainAfterExit=true
+                ExecStart=/usr/bin/bash -c '/opt/bin/upgrade-helper.sh'
+            - name: restore-webhooks.service
+              content: |
+                [Unit]
+                Requires=install-kube-system.service
+                After=install-kube-system.service
+
+                [Service]
+                Type=oneshot
+                StartLimitInterval=0
+                RemainAfterExit=true
+                ExecStart=/usr/bin/bash -c '/opt/bin/restore-webhooks.sh'