#!/bin/bash
set -euo pipefail

# Available env vars:
#   $TMP_DIR
#   $CLUSTER_NAME
#   $KUBECONFIG
#   $AEMM_URL
#   $AEMM_VERSION
#
# Helper functions such as `retry` and `get_nth_worker_pod` are expected to be
# provided by the e2e test harness that invokes this script.

function fail_and_exit {
    echo "❌ Scheduled Maintenance Event System Reboot Test failed $CLUSTER_NAME ❌"
    exit "${1:-1}"
}

echo "Starting Scheduled Maintenance Event System Reboot Test for Node Termination Handler"

SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"

common_helm_args=()
[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows")
[[ -n "${NTH_WORKER_LABEL-}" ]] && common_helm_args+=(--set nodeSelector."$NTH_WORKER_LABEL")

# Install the AWS Node Termination Handler (NTH), pointed at the metadata mock
anth_helm_args=(
    upgrade
    --install
    --namespace kube-system
    "$CLUSTER_NAME-anth"
    "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/"
    --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}"
    --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO"
    --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG"
    --set enableSpotInterruptionDraining="true"
    --set enableScheduledEventDraining="true"
    --set taintNode="true"
    --wait
    --force
)
[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY")
[[ ${#common_helm_args[@]} -gt 0 ]] && anth_helm_args+=("${common_helm_args[@]}")

set -x
helm "${anth_helm_args[@]}"
set +x

# Install the webhook test proxy
emtp_helm_args=(
    upgrade
    --install
    --namespace default
    "$CLUSTER_NAME-emtp"
    "$SCRIPTPATH/../../config/helm/webhook-test-proxy/"
    --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO"
    --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG"
    --wait
)
[[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY")
[[ ${#common_helm_args[@]} -gt 0 ]] && emtp_helm_args+=("${common_helm_args[@]}")

set -x
helm "${emtp_helm_args[@]}"
set +x

# Install amazon-ec2-metadata-mock (AEMM), configured to serve scheduled maintenance events
aemm_helm_args=(
    upgrade
    --install
    --namespace default
    "$CLUSTER_NAME-aemm"
    "$AEMM_DL_URL"
    --set servicePort="$IMDS_PORT"
    --set arguments='{events}'
    --wait
)
[[ ${#common_helm_args[@]} -gt 0 ]] && aemm_helm_args+=("${common_helm_args[@]}")

set -x
retry 5 helm "${aemm_helm_args[@]}"
set +x

TAINT_CHECK_CYCLES=15
TAINT_CHECK_SLEEP=15

# Wait for the regular-pod-test deployment to become fully available
deployed=0
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
    if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then
        echo "✅ Verified regular-pod-test pod was scheduled and started!"
        deployed=1
        break
    fi
    echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
    sleep $TAINT_CHECK_SLEEP
done

if [[ $deployed -eq 0 ]]; then
    echo "❌ Failed test setup for regular-pod"
    fail_and_exit 2
fi

# Assert that NTH cordons and taints the worker node and evicts the test pod
cordoned=0
tainted=0
test_node="${TEST_NODE:-$CLUSTER_NAME-worker}"
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
    if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then
        echo "✅ Verified the worker node was cordoned for maintenance event reboot!"
        cordoned=1
    fi

    if [[ $cordoned -eq 1 ]] && kubectl get nodes "${test_node}" -o json | grep "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then
        echo "✅ Verified the worker node was tainted!"
        tainted=1
    fi

    if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
        echo "✅ Verified the regular-pod-test pod was evicted!"
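        # All three pre-reboot assertions (cordon, taint, pod eviction) have now been observed.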
        break
    fi
    echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
    sleep $TAINT_CHECK_SLEEP
done

if [[ $cordoned -eq 0 ]]; then
    echo "❌ Failed cordoning node for scheduled maintenance event"
    fail_and_exit 3
fi

if [[ $tainted -eq 0 ]]; then
    echo "❌ Failed tainting node for scheduled maintenance event"
    fail_and_exit 3
fi

# Stage a mock uptime file on the worker node(s); the restarted NTH reads it
# (via procUptimeFile) to detect that the node has rebooted
mock_uptime_filepath="/uptime"
if [[ "${TEST_WINDOWS:-"false"}" != "true" ]]; then
    echo "Copy uptime file to Kind k8s nodes"
    for node in $(kubectl get nodes -o json | jq -r '.items[].metadata.name'); do
        docker exec "$node" sh -c "rm -rf $mock_uptime_filepath"
        docker cp "$SCRIPTPATH/../assets/uptime-reboot" "$node:$mock_uptime_filepath"
        docker exec "$node" sh -c "chmod 0444 $mock_uptime_filepath && chown root $mock_uptime_filepath && chgrp root $mock_uptime_filepath"
    done
else
    echo "Copy uptime file to $TEST_NODE"
    kubectl cp "$SCRIPTPATH/../assets/uptime-reboot" "kube-system/$(get_nth_worker_pod):$mock_uptime_filepath"
fi

echo "Remove amazon-ec2-metadata-mock to prevent another drain event"
deployment=$(kubectl get deployments | grep 'amazon-ec2-metadata-mock' | cut -d' ' -f1)
kubectl delete deployments "$deployment"

## Restart NTH, which simulates a system reboot by mounting the new uptime file
anth_helm_args=(
    upgrade
    --install
    --namespace kube-system
    "$CLUSTER_NAME-anth"
    "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/"
    --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}"
    --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO"
    --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG"
    --set procUptimeFile="$mock_uptime_filepath"
    --set enableSpotInterruptionDraining="true"
    --set enableScheduledEventDraining="true"
    --set taintNode="true"
    --wait
    --force
)
[[ ${#common_helm_args[@]} -gt 0 ]] && anth_helm_args+=("${common_helm_args[@]}")

set -x
helm "${anth_helm_args[@]}"
set +x

# Assert that NTH uncordons and untaints the node and that the test pod is rescheduled
uncordoned=0
untainted=0
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
    NODE_LINE=$(kubectl get nodes "${test_node}" | grep -v 'STATUS')
    if [[ $uncordoned -eq 0 && -z $(echo "${NODE_LINE}" | grep SchedulingDisabled) ]] && [[ ! -z $(echo "${NODE_LINE}" | grep Ready) ]]; then
        echo "✅ Verified the worker node was uncordoned!"
        uncordoned=1
    fi

    if [[ $uncordoned -eq 1 && $untainted -eq 0 ]] && ! kubectl get nodes "${test_node}" -o json | grep -q "aws-node-termination-handler/scheduled-maintenance"; then
        echo "✅ Verified the worker node was untainted!"
        untainted=1
    fi

    if [[ $untainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then
        echo "✅ Verified the regular-pod-test pod was rescheduled"
        echo "✅ Scheduled Maintenance Event System Reboot Test Passed $CLUSTER_NAME! ✅"
        exit 0
    fi
    echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
    sleep $TAINT_CHECK_SLEEP
done

if [[ $uncordoned -eq 0 ]]; then
    echo "❌ Worker node was not UNcordoned"
elif [[ $untainted -eq 0 ]]; then
    echo "❌ Worker node was not UNtainted"
else
    echo "❌ regular-pod-test pod was not rescheduled"
fi
fail_and_exit 1
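
# Reference note: the mock uptime asset mimics the /proc/uptime format,
# "<seconds since boot> <idle seconds>". A hypothetical example with a small
# first field, e.g.
#   90.50 2167.42
# is what makes the restarted NTH see the node as freshly rebooted, so the
# uncordon/untaint assertions above can pass.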