#!/bin/bash
set -euo pipefail

# End-to-end test: installs kube-prometheus-stack, the node termination
# handler (with its Prometheus server enabled), the webhook test proxy and
# the metadata mock, then verifies that
#   1. the regular-pod-test deployment starts,
#   2. the worker node gets cordoned and tainted and the pod evicted,
#   3. the handler's /metrics endpoint exposes the expected metrics.
#
# Exit codes: 0 = metrics verified, 2 = test pod never became available,
#             3 = tainting failed or expected metrics never showed up.
#
# Available env vars:
#   $TMP_DIR
#   $CLUSTER_NAME
#   $KUBECONFIG
#   $NODE_TERMINATION_HANDLER_DOCKER_REPO
#   $NODE_TERMINATION_HANDLER_DOCKER_TAG
#   $WEBHOOK_DOCKER_REPO
#   $WEBHOOK_DOCKER_TAG
#   $AEMM_URL
#   $AEMM_VERSION
#
# Also read below: $IMDS_PORT, $AEMM_DL_URL and the optional
# $INSTANCE_METADATA_URL, $TEST_WINDOWS, $NTH_WORKER_LABEL,
# $NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY, $WEBHOOK_DOCKER_PULL_POLICY.
#
# `retry` and `get_nth_worker_pod` are not defined in this file; presumably
# they are provided by the test harness that launches this script.

echo "Starting Spot Interruption Test for Node Termination Handler with Prometheus server enabled"

SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
PROMETHEUS_HELM_VERSION="41.7.4"

# Helm arguments shared by every chart installed below.
common_helm_args=()
if [[ "${TEST_WINDOWS-}" == "true" ]]; then
  common_helm_args+=(--set targetNodeOs="windows")
fi
if [[ -n "${NTH_WORKER_LABEL-}" ]]; then
  common_helm_args+=(--set nodeSelector."$NTH_WORKER_LABEL")
fi

helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
retry 5 helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \
  --version "${PROMETHEUS_HELM_VERSION}" \
  --set prometheusOperator.admissionWebhooks.enabled="false" \
  --set grafana.enabled="false" \
  --set nodeExporter.enabled="false" \
  --set kubeStateMetrics.enabled="false"

# --- aws-node-termination-handler ------------------------------------------
anth_helm_args=(
  upgrade
  --install
  --namespace kube-system
  "$CLUSTER_NAME-anth"
  "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/"
  --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}"
  --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO"
  --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG"
  --set enableScheduledEventDraining="false"
  --set enableSpotInterruptionDraining="true"
  --set taintNode="true"
  --set enablePrometheusServer="true"
  --set podMonitor.create="true"
  --set daemonsetTolerations=""
  --wait
  --force
)
if [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]]; then
  anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY")
fi
# Guarded expansion: an empty array must not be expanded under `set -u`
# on older bash versions.
if [[ ${#common_helm_args[@]} -gt 0 ]]; then
  anth_helm_args+=("${common_helm_args[@]}")
fi

set -x
helm "${anth_helm_args[@]}"
set +x

# --- webhook test proxy ----------------------------------------------------
emtp_helm_args=(
  upgrade
  --install
  --namespace default
  "$CLUSTER_NAME-emtp"
  "$SCRIPTPATH/../../config/helm/webhook-test-proxy/"
  --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO"
  --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG"
  --wait
)
if [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]]; then
  emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY")
fi
if [[ ${#common_helm_args[@]} -gt 0 ]]; then
  emtp_helm_args+=("${common_helm_args[@]}")
fi

set -x
helm "${emtp_helm_args[@]}"
set +x

# --- metadata mock (chart location comes from $AEMM_DL_URL) ----------------
aemm_helm_args=(
  upgrade
  --install
  --namespace default
  "$CLUSTER_NAME-aemm"
  "$AEMM_DL_URL"
  --set servicePort="$IMDS_PORT"
  --set 'tolerations[0].effect=NoSchedule'
  --set 'tolerations[0].operator=Exists'
  --wait
)
if [[ ${#common_helm_args[@]} -gt 0 ]]; then
  aemm_helm_args+=("${common_helm_args[@]}")
fi

set -x
retry 5 helm "${aemm_helm_args[@]}"
set +x

TAINT_CHECK_CYCLES=15
TAINT_CHECK_SLEEP=15

# --- 1. wait for the test deployment to become available -------------------
DEPLOYED=0
for i in {1..10}; do
  # An empty jsonpath result (field absent) compares equal to 0 inside [[ ]].
  if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then
    echo "✅ Verified regular-pod-test pod was scheduled and started!"
    DEPLOYED=1
    break
  fi
  sleep 5
done

if [[ $DEPLOYED -eq 0 ]]; then
  echo "❌ regular-pod-test pod was not scheduled and started" >&2
  exit 2
fi

# --- 2. wait for cordon + taint + eviction ---------------------------------
EXIT_STATUS=1
for ((i = 1; i <= TAINT_CHECK_CYCLES; i++)); do
  if kubectl get nodes "${CLUSTER_NAME}-worker" | grep SchedulingDisabled; then
    echo "✅ Verified the worker node was cordoned!"
    if kubectl get nodes "${CLUSTER_NAME}-worker" -o json | grep -q "aws-node-termination-handler/spot-itn"; then
      echo "✅ Verified the worked node was tainted!"
    else
      echo "❌ Failed tainting node for spot termination event"
      EXIT_STATUS=3
      break
    fi
    if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
      echo "✅ Verified the regular-pod-test pod was evicted!"
      echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! ✅"
      EXIT_STATUS=0
      break
    fi
  fi
  echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
  sleep "$TAINT_CHECK_SLEEP"
done

# NOTE(review): only the explicit taint failure (3) aborts here. If the loop
# above simply timed out (EXIT_STATUS still 1) the script carries on to the
# metrics check — confirm whether that leniency is intended.
if [[ $EXIT_STATUS -eq 3 ]]; then
  exit 3
fi

# --- 3. scrape the handler's metrics through a port-forward ----------------
POD_NAME=$(get_nth_worker_pod)
echo "✅ Fetched the pod $POD_NAME "

kubectl -n kube-system port-forward "$POD_NAME" 7000:9092 &
PORT_FORWARD_PID=$!

# The trap is registered for several conditions and may therefore run more
# than once, or after the port-forward has already exited; a failing `kill`
# must not turn into a script error.
cleanup_port_forward() {
  kill "${PORT_FORWARD_PID}" 2>/dev/null || true
}
trap cleanup_port_forward EXIT SIGINT SIGTERM ERR
echo "✅ Port-forwarded pod $POD_NAME"

# Give the port-forward time to come up before the first request.
sleep 10

failed=""
for ((i = 1; i <= TAINT_CHECK_CYCLES; i++)); do
  failed=""
  # Test curl inside `if` so that a transient failure is retried on the next
  # cycle instead of aborting the whole script through `set -e`.
  if METRICS_RESPONSE=$(curl -L localhost:7000/metrics); then
    echo "✅ Fetched /metrics."
    for METRIC in cordon-and-drain pre-drain runtime_go_gc runtime_go_goroutines runtime_go_mem; do
      if [[ $METRICS_RESPONSE == *"$METRIC"* ]]; then
        echo "✅ Metric $METRIC!"
      else
        echo "⚠️ Metric $METRIC"
        failed=$METRIC
        break
      fi
    done
    # Every expected metric name was found in the response.
    if [[ -z "$failed" ]]; then
      exit 0
    fi
  else
    echo "⚠️ Could not fetch /metrics"
    failed="/metrics (endpoint unreachable)"
  fi

  echo "Metrics Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
  sleep "$TAINT_CHECK_SLEEP"
done

echo "❌ Failed checking metric for $failed"
exit 3