From 6d87a8f97b4d99c023f2614c091f10abc8d546cd Mon Sep 17 00:00:00 2001
From: Dave McCormick <davidemccormick@gmail.com>
Date: Fri, 12 Jul 2019 13:09:41 +0100
Subject: [PATCH] Implement a simple upgradeHelper plugin which will disable
 existing controllers when a new kubernetes release is being rolled out.
 (#1680)

Save mutating and validating webhooks before install-kube-system runs and restore them again afterwards.

The webhook feature can be toggled via the 'disableWebhooks' boolean plugin config entry.
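
For example, enabling the plugin in cluster.yaml looks like this (a sketch; the
keys match the plugin defaults added in this change):

    kubeAwsPlugins:
      upgradeHelper:
        enabled: true
        # set to false to leave mutating/validating webhooks untouched
        disableWebhooks: true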
---
 builtin/files/cluster.yaml.tmpl               |  16 +-
 .../upgrade-helper/assets/restore-webhooks.sh |  44 ++++
 .../upgrade-helper/assets/upgrade-helper.sh   | 214 ++++++++++++++++++
 .../files/plugins/upgrade-helper/plugin.yaml  |  54 +++++
 4 files changed, 327 insertions(+), 1 deletion(-)
 create mode 100644 builtin/files/plugins/upgrade-helper/assets/restore-webhooks.sh
 create mode 100644 builtin/files/plugins/upgrade-helper/assets/upgrade-helper.sh
 create mode 100644 builtin/files/plugins/upgrade-helper/plugin.yaml

diff --git a/builtin/files/cluster.yaml.tmpl b/builtin/files/cluster.yaml.tmpl
index 0170b35ae..dfe8bd962 100644
--- a/builtin/files/cluster.yaml.tmpl
+++ b/builtin/files/cluster.yaml.tmpl
@@ -1573,7 +1573,9 @@ kubeAwsPlugins:
   # See plugins/aws-iam-authenticator/plugin.yaml for more info
   awsIamAuthenticator:
     enabled: false
-    # see plugins/cluster-autoscaler/plugin.yaml for more info
+
+  # clusterAutoscaler provides kubernetes cluster-autoscaler functionality - https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler
+  # Replaces the original built-in functionality with a plugin and upgrades it to the latest version
   clusterAutoscaler:
     enabled: false
     replicas: 2
@@ -1604,3 +1606,15 @@ kubeAwsPlugins:
       # selectors for autodiscovery
       selector:
         prometheus: monitoring
+
+  # upgradeHelper - assists when rolling out new versions of kubernetes.
+  # It actively disables old controllers and temporarily removes mutating/validating webhooks whilst
+  # the upgraded controller is starting up.
+  # NOTE: You will normally not need this plugin - ONLY enable it if you are experiencing issues when migrating across versions.
+  # It only kills controllers running a different release from the version currently rolling out, e.g.:
+  #   it will kill v1.13.2 controllers when rolling out v1.14.0
+  #   it will NOT kill v1.14.0 controllers when rolling out v1.14.3
+  upgradeHelper:
+    enabled: false
+    # disableWebhooks can be used to turn off the webhook save/restore feature if required
+    disableWebhooks: true
diff --git a/builtin/files/plugins/upgrade-helper/assets/restore-webhooks.sh b/builtin/files/plugins/upgrade-helper/assets/restore-webhooks.sh
new file mode 100644
index 000000000..de9d359fc
--- /dev/null
+++ b/builtin/files/plugins/upgrade-helper/assets/restore-webhooks.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Restore webhooks that were exported and then deleted by upgrade-helper.sh
+
+retries=5
+hyperkube_image="{{ .Config.HyperkubeImage.RepoWithTag }}"
+disable_webhooks="{{ if .Values.disableWebhooks }}true{{else}}false{{end}}"
+
+kubectl() {
+  # /srv/kubernetes is mounted too so that the saved webhook manifests are readable inside the container
+  /usr/bin/docker run -i --rm -v /etc/kubernetes:/etc/kubernetes:ro -v /srv/kubernetes:/srv/kubernetes:ro --net=host ${hyperkube_image} /hyperkube kubectl --kubeconfig=/etc/kubernetes/kubeconfig/admin.yaml "$@"
+}
+
+list_not_empty() {
+  local file=$1
+  if ! [[ -s $file ]]; then
+    return 1
+  fi
+  if grep -qs 'items: \[\]' "$file"; then
+    return 1
+  fi
+  return 0
+}
+
+applyall() {
+  kubectl apply --force -f $(echo "$@" | tr ' ' ',')
+}
+
+restore_webhooks() {
+  local type=$1
+  local file=$2
+
+  if list_not_empty $file; then
+    echo "Restoring all ${type} webhooks from ${file}"
+    applyall $file
+  else
+      echo "no webhooks to restore in $file"
+  fi
+}
+
+if [[ "${disable_webhooks}" == "true" ]]; then
+    echo "Restoring all validating and mutating webhooks..."
+    restore_webhooks validating /srv/kubernetes/validating_webhooks.yaml
+    restore_webhooks mutating /srv/kubernetes/mutating_webhooks.yaml
+fi
+exit 0
\ No newline at end of file
diff --git a/builtin/files/plugins/upgrade-helper/assets/upgrade-helper.sh b/builtin/files/plugins/upgrade-helper/assets/upgrade-helper.sh
new file mode 100644
index 000000000..a61cc28a5
--- /dev/null
+++ b/builtin/files/plugins/upgrade-helper/assets/upgrade-helper.sh
@@ -0,0 +1,214 @@
+#!/bin/bash
+# Smooths upgrades/roll-backs where the kubernetes release jumps a major or minor version.
+# It kills the old controllers so that this one takes over all api functions and we don't get an
+# extended period of old and new running side-by-side, with the incompatibilities that this can bring.
+# It also removes any mutating and validating webhooks in the system so that install-kube-system can run without interference.
+#
+# A request to disable is a configmap matching the hostname and kubernetes version, containing a list of core services to stop, e.g.:
+# apiVersion: v1
+# kind: ConfigMap
+# metadata:
+#   name: kube-aws-migration-disable-ip-10-29-26-83.us-west-2.compute.internal
+#   namespace: kube-system
+# data:
+#   kubernetesVersion: v1.9.3
+#   disable: "kube-apiserver kube-controller-manager kube-scheduler"
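+#
+# For example, to inspect or clean up a disable request by hand (the node name is illustrative):
+#   kubectl -n kube-system get configmap kube-aws-migration-disable-ip-10-29-26-83.us-west-2.compute.internal -o yaml
+#   kubectl -n kube-system delete configmap kube-aws-migration-disable-ip-10-29-26-83.us-west-2.compute.internal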
+
+retries=5
+hyperkube_image="{{ .Config.HyperkubeImage.RepoWithTag }}"
+my_kubernetes_version="{{ .Config.HyperkubeImage.Tag }}"
+myhostname=$(hostname -f)
+disable_webhooks="{{ if .Values.disableWebhooks }}true{{else}}false{{end}}"
+
+kubectl() {
+  # /srv/kubernetes is mounted too so that saved webhook manifests can be read back and deleted from inside the container
+  /usr/bin/docker run -i --rm -v /etc/kubernetes:/etc/kubernetes:ro -v /srv/kubernetes:/srv/kubernetes:ro --net=host ${hyperkube_image} /hyperkube kubectl --kubeconfig=/etc/kubernetes/kubeconfig/admin.yaml "$@"
+}
+
+kubectl_with_retries() {
+  local tries=0
+  local result_text=""
+  local return_code=0
+
+  while [ "$tries" -lt "$retries" ]; do
+    result_text=$(kubectl "$@")
+    return_code=$?
+    if [ "$return_code" -eq "0" ]; then
+      echo "${result_text}"
+      break
+    fi
+    sleep 10
+    tries=$((tries+1))
+  done
+  return $return_code
+}
+
+log() {
+  echo "$@" >&2
+}
+
+get_masters() {
+  kubectl get nodes -l kubernetes.io/role=master --no-headers -o custom-columns=NAME:metadata.name,VERSION:status.nodeInfo.kubeletVersion | awk '{printf "%s:%s\n", $1, $2}'
+}
+
+valid_version() {
+  # accept versions of the form vMAJOR.MINOR.PATCH
+  [[ "$1" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]
+}
+
+version_jumps() {
+  # only a patch release change is NOT a version jump
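+  # ${1%.*} strips the patch component (e.g. v1.14.3 -> v1.14), so only
+  # major/minor differences count as a jump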
+  if [[ "${1%.*}" != "${2%.*}" ]]; then
+    return 0
+  fi
+  return 1
+}
+
+# stop a controller by writing a special kube-aws disable service configmap
+disable_controller() {
+  local controller=$1
+  local version=$2
+
+  local request="$(cat <<EOT
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: kube-aws-migration-disable-${controller}
+  namespace: kube-system
+data:
+  kubernetesVersion: ${version}
+  disable: "kube-controller-manager kube-scheduler kube-apiserver"
+EOT
+)"
+
+  log "Creating disable service configmap kube-system/kube-aws-migration-disable-${controller}"
+  echo "${request}" | kubectl_with_retries -n kube-system apply -f - || return 1
+  return 0
+}
+
+find_pod() {
+  local name=$1
+  local host=$2
+
+  kubectl -n kube-system get pod "${name}-${host}" --no-headers -o wide --ignore-not-found
+}
+
+node_running() {
+  local node=$1
+
+  ready=$(kubectl get node "${node}" --no-headers --ignore-not-found | awk '{print $2}')
+  if [[ "${ready}" == "Ready" ]]; then
+    return 0
+  fi
+
+  return 1
+}
+
+wait_stopped() {
+  local controllers=$1
+  log ""
+  log "WAITING FOR ALL MATCHED CONTROLLERS TO STOP:-"
+  log "${controllers}"
+  log ""
+
+  local test=1
+  while [ "$test" -eq "1" ]; do
+    test=0
+
+    for cont in $controllers; do
+      if node_running $cont; then
+        test=1
+      fi
+    done
+
+    if [ "$test" -eq "1" ]; then
+      log "Controllers still active, waiting 5 seconds..."
+      sleep 5
+    fi
+  done
+}
+
+save_webhooks() {
+  local type=$1
+  local file=$2
+
+  echo "Storing and removing all ${type} webhooks to ${file}"
+  if [[ -s $file ]]; then
+    echo "$file already saved"
+  else
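+    # --export strips cluster-specific metadata so the saved objects can be cleanly re-applied later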
+    kubectl get ${type}webhookconfigurations -o yaml --export >$file
+    if list_not_empty $file; then
+      echo "deleting $type webhooks..."
+      ensuredelete $file
+    fi
+  fi
+}
+
+list_not_empty() {
+  local file=$1
+  if ! [[ -s $file ]]; then
+    return 1
+  fi
+  if grep -qs 'items: \[\]' "$file"; then
+    return 1
+  fi
+  return 0
+}
+
+ensuredelete() {
+  kubectl delete --cascade=true --ignore-not-found=true -f $(echo "$@" | tr ' ' ',')
+}
+
+# MAIN
+
+if ! valid_version "${my_kubernetes_version}"; then
+  log "My kubernetes version ${my_kubernetes_version} is invalid - aborting!"
+  exit 1
+fi
+
+while ! kubectl get ns kube-system; do
+  echo "waiting for apiserver to be available..."
+  sleep 3
+done
+
+# Save and remove all mutating and validating webhooks because they can interfere with the stack migration
+if [[ "${disable_webhooks}" == "true" ]]; then
+  echo "Storing and removing all validating and mutating webhooks..."
+  save_webhooks validating /srv/kubernetes/validating_webhooks.yaml
+  save_webhooks mutating /srv/kubernetes/mutating_webhooks.yaml
+fi
+
+log ""
+log "CHECKING CONTROLLER VERSIONS..."
+log ""
+found=""
+for controller in $(get_masters); do
+  controller_name="${controller%%:*}"
+  controller_version="${controller##*:}"
+  if [[ "${controller_name}" != "$myhostname" ]]; then
+    if ! valid_version "${controller_version}"; then
+      log "Controller ${controller_name} has an invalid version number ${controller_version}!"
+      continue
+    fi
+
+    if version_jumps "${my_kubernetes_version}" "${controller_version}"; then
+      log "Detected a version jump on ${controller_name}: my version is ${my_kubernetes_version} and theirs is ${controller_version}"
+      log "Disabling kube-apiserver, kube-scheduler and kube-controller-manager..."
+      if [[ -z "${found}" ]]; then
+        found="${controller_name}"
+      else
+        found="${found} ${controller_name}"
+      fi
+      disable_controller ${controller_name} ${controller_version}
+    else
+      log "No version jump on ${controller_name}: my version is ${my_kubernetes_version} and theirs is ${controller_version}"
+    fi
+  fi
+done
+
+if [[ -n "${found}" ]]; then
+    log ""
+    log "WAITING FOR FOUND CONTROLLERS TO STOP..."
+    log ""
+    wait_stopped "${found}"
+fi
+exit 0
\ No newline at end of file
diff --git a/builtin/files/plugins/upgrade-helper/plugin.yaml b/builtin/files/plugins/upgrade-helper/plugin.yaml
new file mode 100644
index 000000000..52fcbf4e1
--- /dev/null
+++ b/builtin/files/plugins/upgrade-helper/plugin.yaml
@@ -0,0 +1,54 @@
+metadata:
+  name: upgrade-helper
+  version: 0.1.0
+spec:
+  cluster:
+    values:
+      disableWebhooks: true
+    machine:
+      roles:
+        controller:
+          files:
+          - path: /etc/systemd/system/install-kube-system.service.d/10-upgrade-helper-dependency.conf
+            permissions: 0644
+            content: |
+              [Unit]
+              Requires=kube-aws-upgrade-helper.service
+              After=kube-aws-upgrade-helper.service
+              Before=restore-webhooks.service
+
+              [Service]
+              ExecStartPre=/usr/bin/bash -c "until /usr/bin/systemctl is-active kube-aws-upgrade-helper.service; do echo waiting until kube-aws-upgrade-helper.service starts; sleep 10; done"
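+          # The drop-in above makes install-kube-system wait for upgrade-helper to disable any
+          # old controllers first, and orders it before restore-webhooks so the saved webhooks
+          # are only re-applied once install-kube-system has finished.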
+          - path: /opt/bin/upgrade-helper.sh
+            permissions: 0755
+            source:
+              path: assets/upgrade-helper.sh
+          - path: /opt/bin/restore-webhooks.sh
+            permissions: 0755
+            source:
+              path: assets/restore-webhooks.sh
+          systemd:
+            units:
+            - name: kube-aws-upgrade-helper.service
+              content: |
+                [Unit]
+                Requires=kubelet.service
+                After=kubelet.service
+                Before=install-kube-system.service
+
+                [Service]
+                Type=oneshot
+                StartLimitInterval=0
+                RemainAfterExit=true
+                ExecStart=/usr/bin/bash -c '/opt/bin/upgrade-helper.sh'
+            - name: restore-webhooks.service
+              content: |
+                [Unit]
+                Requires=install-kube-system.service
+                After=install-kube-system.service
+
+                [Service]
+                Type=oneshot
+                StartLimitInterval=0
+                RemainAfterExit=true
+                ExecStart=/usr/bin/bash -c '/opt/bin/restore-webhooks.sh'