rancher-sandbox · mook-as · May 16, 2024 · Jun 18, 2024 · Jun 10, 2024 · Jun 10, 2024
diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
@@ -269,6 +269,7 @@ gcs
 GENERALIZEDTIME
 getwindowid
 ghp
+gitjob
 gitmodules
 gitrepo
 gke

diff --git a/bats/Makefile b/bats/Makefile
@@ -16,12 +16,12 @@ SC_EXCLUDES ?= SC1091,SC2034,SC2154
+# Lint the test sources: bats-lint.pl over the *.bash and *.bats files under
+# tests; shellcheck (bash dialect, minus the checks listed in SC_EXCLUDES) and
+# shfmt in diff mode over those same files plus the *.sh files under scripts.
 lint:
 	find tests -name '*.bash' | xargs ./scripts/bats-lint.pl
 	find tests -name '*.bats' | xargs ./scripts/bats-lint.pl
-	find tests -name '*.bash' | xargs shellcheck -s bash -e $(SC_EXCLUDES)
-	find tests -name '*.bats' | xargs shellcheck -s bash -e $(SC_EXCLUDES)
-	find scripts -name '*.sh' | xargs shellcheck -s bash -e $(SC_EXCLUDES)
-	find tests -name '*.bash' | xargs shfmt -s -d
-	find tests -name '*.bats' | xargs shfmt -s -d
-	find scripts -name '*.sh' | xargs shfmt -s -d
+	find tests -name '*.bash' | xargs shellcheck --shell=bash --exclude=$(SC_EXCLUDES)
+	find tests -name '*.bats' | xargs shellcheck --shell=bash --exclude=$(SC_EXCLUDES)
+	find scripts -name '*.sh' | xargs shellcheck --shell=bash --exclude=$(SC_EXCLUDES)
+	find tests -name '*.bash' | xargs shfmt --simplify --diff --language-dialect bats --indent 4
+	find tests -name '*.bats' | xargs shfmt --simplify --diff --language-dialect bats --indent 4
+	find scripts -name '*.sh' | xargs shfmt --simplify --diff
 
 DEPS = bin/darwin/jq bin/linux/jq
 

diff --git a/bats/tests/helpers/utils.bash b/bats/tests/helpers/utils.bash
@@ -388,7 +388,9 @@ capture_logs() {
         cp -LR "${PATH_LOGS}/" "$logdir"
         echo "${BATS_TEST_DESCRIPTION:-teardown}" >"${logdir}/test_description"
         # Capture settings.json
-        cp "$PATH_CONFIG_FILE" "$logdir"
+        if [[ -f $PATH_CONFIG_FILE ]]; then
+            cp "$PATH_CONFIG_FILE" "$logdir"
+        fi
         foreach_profile export_profile "$logdir"
     fi
 }

diff --git a/bats/tests/k8s/helm-install-rancher.bats b/bats/tests/k8s/helm-install-rancher.bats
@@ -1,7 +1,10 @@
 # Test case 11 & 12
 
 load '../helpers/load'
-RD_FILE_RAMDISK_SIZE=12 # We need more disk to run the Rancher image.
+
+# Turn off the ramdisk for this test file.
+# NOTE(review): presumably invoked once per file by the shared helpers, and
+# presumably replaces the old RD_FILE_RAMDISK_SIZE=12 override because the
+# Rancher image needs more disk -- confirm both.
+local_setup_file() {
+    RD_USE_RAMDISK=false
+}
 
 local_setup() {
     needs_port 443
@@ -84,6 +87,65 @@ determine_chart_version() {
         fail || return
 }
 
+# Run the given command and assert that it succeeds and that
+# `jq_output length` on the result succeeds and does not print 0.
+# Arguments: $@ - the command (and its arguments) to run.
+# NOTE(review): assumes jq_output applies the jq filter to $output of the
+# preceding `run`. If the command prints nothing at all, check whether
+# `length` still yields 0; otherwise the final refute would pass.
+assert_not_empty_list() {
+    run "$@"
+    assert_success || return
+    run jq_output length
+    assert_success || return
+    refute_output 0 || return
+}
+
+# Run the given command and assert that it succeeds and that its output is
+# exactly "True", "true", or "1".
+# Arguments: $@ - the command (and its arguments) to run.
+assert_true() {
+    # Keep stderr out of $output so only stdout is matched against the regexp.
+    run --separate-stderr "$@"
+    assert_success || return
+    assert_output --regexp '^([Tt]rue|1)$' || return
+}
+
+# Given namespace and app name, assert that the pod log contains the given string.
+# Arguments:
+#   $1   - namespace of the pod
+#   $2   - app name; the pod is selected with the label `app=$2`, and the
+#          first matching item is used
+#   $3.. - expected text; the remaining arguments are joined via "$*"
+# Returns: non-zero as soon as any step fails.
+assert_pod_log_line() {
+    local namespace="$1"
+    local selector="app=$2"
+    shift 2
+    local expect="$*"
+    run kubectl get pod --namespace "$namespace" --selector "$selector" --output=jsonpath='{.items[0].metadata.name}'
+    # Stop here if the lookup failed; otherwise whatever kubectl printed
+    # would be used as the pod name below.
+    assert_success || return
+    assert_output || return
+    local name="$output"
+
+    run kubectl logs --namespace "$namespace" "$name"
+    assert_success || return
+    assert_output --partial "$expect" || return
+}
+
+# Pull down the image manually first so we are less likely to time out when
+# deploying rancher.
+# Restores `rancher_chart_version` with load_var and pulls the image
+# rancher/rancher:v<rancher_chart_version>.
+# Returns: non-zero if the version cannot be restored or the pull fails.
+pull_rancher_image() {
+    local rancher_chart_version
+    if ! load_var rancher_chart_version; then
+        # Return explicitly so that we never attempt a pull with an empty
+        # version, even if errexit is not in effect for this call.
+        fail "Could not restore Rancher chart version" || return
+    fi
+    # Not referenced directly in this function; presumably picked up by
+    # ctrctl through dynamic scoping -- confirm.
+    local CONTAINERD_NAMESPACE=k8s.io
+    try ctrctl pull --quiet "rancher/rancher:v$rancher_chart_version"
+}
+
+wait_for_rancher_pod() {
+    try assert_pod_log_line cattle-system rancher Listening on :443
+    try assert_pod_log_line cattle-system rancher Starting catalog controller
-    try assert_pod_log_line cattle-system rancher Listening on :443
-    try assert_pod_log_line cattle-system rancher Starting catalog controller
+    try assert_pod_log_line cattle-system rancher "Listening on :443" || return
+    try assert_pod_log_line cattle-system rancher "Starting catalog controller" || return
-    try assert_pod_log_line cattle-system rancher Listening on :443
-    try assert_pod_log_line cattle-system rancher Starting catalog controller
+    local NAMESPACE=cattle-system
+    local APP=rancher
+    
+    try assert_pod_log_line "Listening on :443" || return
+    try assert_pod_log_line "Starting catalog controller" || return
-    try assert_pod_log_line cattle-system rancher Listening on :443
-    try assert_pod_log_line cattle-system rancher Starting catalog controller
+    try assert_pod_log_line cattle-system rancher "Listening on :443" || return
+    try assert_pod_log_line cattle-system rancher "Starting catalog controller" || return
-    try assert_pod_log_line cattle-system rancher Listening on :443
-    try assert_pod_log_line cattle-system rancher Starting catalog controller
+    local NAMESPACE=cattle-system
+    local APP=rancher
+    
+    try assert_pod_log_line "Listening on :443" || return
+    try assert_pod_log_line "Starting catalog controller" || return
+    try --max 60 --delay 10 assert_pod_log_line cattle-system rancher Watching metadata for rke-machine-config.cattle.io/v1
+    try --max 60 --delay 10 assert_pod_log_line cattle-system rancher 'Creating clusterRole for roleTemplate Cluster Owner (cluster-owner).'
+    try assert_pod_log_line cattle-system rancher Rancher startup complete
+    try assert_pod_log_line cattle-system rancher Created machine for node
+}
+
+wait_for_webhook_pod() {
+    try assert_pod_log_line cattle-system rancher-webhook Rancher-webhook version
+    try assert_pod_log_line cattle-system rancher-webhook Listening on :9443
+    # Depending on version, this is either "cattle-webhook-tls" or "cattle-system/cattle-webhook-tls"
+    try assert_pod_log_line cattle-system rancher-webhook Creating new TLS secret for cattle-
+    try assert_pod_log_line cattle-system rancher-webhook Active TLS secret cattle-
+    try assert_pod_log_line cattle-system rancher-webhook 'Sleeping for 15 seconds then applying webhook config'
+}
+
 deploy_rancher() {
     # TODO remove `skip_unless_host_ip` once `traefik_hostname` no longer needs it
     if is_windows; then
@@ -98,22 +160,70 @@ deploy_rancher() {
     helm upgrade \
         --install cert-manager jetstack/cert-manager \
         --namespace cert-manager \
-        --set installCRDs=true \
+        --set crds.enabled=true \
+        --set crds.keep=true \
+        --set prometheus.enabled=false \
         --set "extraArgs[0]=--enable-certificate-owner-ref=true" \
         --create-namespace
+    try assert_not_empty_list helm list --namespace cert-manager --deployed --output json --selector name=cert-manager
+    wait_for_kube_deployment_available --namespace cert-manager cert-manager
 
     local host
     host=$(traefik_hostname) || return
 
     comment "Installing rancher $rancher_chart_version"
+    # The helm install can take a long time, especially on CI.  Therefore we
+    # avoid using --wait / --timeout, and instead check for forward progress
+    # at each step.
     helm upgrade \
         --install rancher rancher-latest/rancher \
         --version "$rancher_chart_version" \
         --namespace cattle-system \
         --set hostname="$host" \
-        --wait \
-        --timeout=10m \
+        --set replicas=1 \
         --create-namespace
+
+    try assert_not_empty_list helm list --all --output json --namespace cattle-system --selector name=rancher
+    try assert_not_empty_list helm list --deployed --output json --namespace cattle-system --selector name=rancher
+    try kubectl get ingress --namespace cattle-system rancher
+    try assert_not_empty_list kubectl get ingress --namespace cattle-system rancher --output jsonpath='{.status.loadBalancer.ingress}'
+
+    try --max 60 --delay 10 kubectl get namespace fleet-local
+    try --max 60 --delay 10 kubectl get namespace local
+    try --max 60 --delay 10 kubectl get namespace cattle-global-data
+    try --max 60 --delay 10 kubectl get namespace fleet-default
+
+    try assert_not_empty_list kubectl get pods --namespace cattle-system --selector app=rancher --output jsonpath='{.items}'
+
+    # Unfortunately, the Rancher pod could get restarted; this may cause the
+    # wait steps to fail, in which case we need to start again from the top.
+    try wait_for_rancher_pod
+
+    try assert_true kubectl get APIServices v3.project.cattle.io --output=jsonpath='{.status.conditions[?(@.type=="Available")].status}'
+
+    try kubectl get namespace cattle-fleet-system
+    try kubectl get namespace cattle-system
+
+    try --max 48 kubectl get deployment --namespace cattle-fleet-system fleet-controller
+    try assert_kube_deployment_available --namespace cattle-fleet-system gitjob
+    try assert_kube_deployment_available --namespace cattle-fleet-system fleet-controller
+
+    try --max 60 --delay 10 assert_not_empty_list kubectl get pods --namespace cattle-system --selector app=rancher-webhook --output jsonpath='{.items}'
+
+    # Unfortunately, the webhook pod might restart too :(
+    try wait_for_webhook_pod
+
+    try --max 120 assert_kube_deployment_available --namespace cattle-system rancher
+    try --max 120 assert_kube_deployment_available --namespace cattle-fleet-local-system fleet-agent
+    try --max 60 assert_kube_deployment_available --namespace cattle-system rancher-webhook
+
+    # The rancher pod sometimes falls over on its own; retry in a loop to
+    # detect flapping.
+    local i
+    for i in {1..10}; do
+        sleep 1
+        try --max 60 --delay 10 assert_kube_deployment_available --namespace cattle-system rancher
+    done
 }
 
 verify_rancher() {
@@ -122,24 +232,32 @@ verify_rancher() {
         skip_unless_host_ip
     fi
 
+    # Get k3s logs if possible before things fail
+    kubectl get deployments --all-namespaces || :
+    kubectl get pods --all-namespaces || :
+
+    local name
+    name="$(kubectl get pod -n cattle-system --selector app=rancher --output=jsonpath='{.items[].metadata.name}' || echo '')"
+    if [[ -n $name ]]; then
+        kubectl logs -n cattle-system "$name" || :
+    fi
+
+    name="$(kubectl get pod -n cattle-system --selector app=rancher-webhook --output=jsonpath='{.items[].metadata.name}' || echo '')"
+    if [[ -n $name ]]; then
+        kubectl logs -n cattle-system "$name" || :
+    fi
+
     local host
     host=$(traefik_hostname) || return
 
-    run try --max 9 --delay 10 curl --insecure --silent --show-error "https://${host}/dashboard/auth/login"
+    run try --max 9 --delay 10 curl --insecure --show-error "https://${host}/dashboard/auth/login"
     assert_success
     assert_output --partial 'href="/dashboard/'
-    run kubectl get secret --namespace cattle-system bootstrap-secret -o json
+    run try kubectl get secret --namespace cattle-system bootstrap-secret -o json
     assert_success
     assert_output --partial "bootstrapPassword"
 }
 
-uninstall_rancher() {
-    run helm uninstall rancher --namespace cattle-system --wait
-    assert_nothing
-    run helm uninstall cert-manager --namespace cert-manager --wait
-    assert_nothing
-}
-
 @test 'add helm repo' {
     helm repo add jetstack https://charts.jetstack.io
     helm repo add rancher-latest https://releases.rancher.com/server-charts/latest
@@ -152,6 +270,6 @@ foreach_k3s_version \
     start_kubernetes \
     wait_for_kubelet \
     wait_for_traefik \
+    pull_rancher_image \
     deploy_rancher \
-    verify_rancher \
-    uninstall_rancher
+    verify_rancher
-Original file line number
+Diff line change
@@ Expand Up / @@ -269,6 +269,7 @@ gcs @@
     GENERALIZEDTIME
     getwindowid
     ghp
+    gitjob
     gitmodules
     gitrepo
     gke
@@ Expand Down @@