#!/bin/bash
#
# run-test: provisions (or reuses) a local kind cluster, builds/loads the
# node-termination-handler and webhook-test-proxy images, then runs each
# assertion script against the cluster, resetting the cluster around each one.
#
# Environment:
#   WEBHOOK_URL  optional override for the webhook endpoint handed to tests.
#
# Variables and functions exported for the assertion scripts are listed in the
# "exported vars and funcs" section near the bottom of this file.
set -euo pipefail

START=$(date +%s)
# UTC timestamp used by get_nth_worker_pod to ignore pods created before the
# current test started; refreshed before every assertion script.
START_FOR_QUERYING=$(date -u +"%Y-%m-%dT%TZ")
export START_FOR_QUERYING
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
CLUSTER_NAME_BASE="nth-test"
PRESERVE=false
TMP_DIR=""
# Extra argument lists are arrays so each flag is passed as its own word.
DOCKER_ARGS=()
PROVISION_CLUSTER_ARGS=()
DELETE_CLUSTER_ARGS=()
DEFAULT_NODE_TERMINATION_HANDLER_DOCKER_IMG="node-termination-handler:customtest"
NODE_TERMINATION_HANDLER_DOCKER_IMG=""
DEFAULT_WEBHOOK_DOCKER_IMG="webhook-test-proxy:customtest"
WEBHOOK_DOCKER_IMG=""
OVERRIDE_PATH=0
K8S_VERSION="1.22"
AEMM_URL="amazon-ec2-metadata-mock-service.default.svc.cluster.local"
AEMM_VERSION="1.8.1"
AEMM_DL_URL="https://github.com/aws/amazon-ec2-metadata-mock/releases/download/v$AEMM_VERSION/amazon-ec2-metadata-mock-$AEMM_VERSION.tgz"
WEBHOOK_URL=${WEBHOOK_URL:="http://webhook-test-proxy.default.svc.cluster.local"}

# Newline-separated list of assertion scripts; replaced by the -a option.
ASSERTION_SCRIPTS=$(find "$SCRIPTPATH/../e2e" -type f | sort)

#######################################
# Runs a command under an alarm: perl arms alarm($1) and then execs the
# remaining arguments in its place.
# Arguments: $1 - seconds; $2... - command and its arguments
#######################################
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }

#######################################
# Prints the result of File::Spec->abs2rel($1, $2), i.e. path $1 expressed
# relative to base $2.
#######################################
function relpath() {
  perl -e 'use File::Spec; print File::Spec->abs2rel(@ARGV) . "\n"' "$1" "$2"
}

#######################################
# Runs a command until it succeeds, sleeping 1, 2, 4, ... seconds between
# attempts.
# Arguments: $1 - maximum number of attempts; $2... - command to run
# Returns:   0 on success, otherwise the exit status of the last attempt
# Note:      errexit is switched on again when the function returns.
#######################################
function retry {
  local retries=$1
  shift
  local count=0
  # Locals keep these out of the caller's shell (the function is exported to
  # the assertion scripts).
  local status delay
  set +e
  trap "set -e" RETURN
  until "$@"; do
    status=$?
    set -e
    delay=$((2 ** count))
    count=$((count + 1))
    if [ "$count" -lt "$retries" ]; then
      echo "Retry $count/$retries exited $status, retrying in $delay seconds..."
      sleep "$delay"
    else
      echo "Retry $count/$retries exited $status, no more retries left."
      return "$status"
    fi
    set +e
  done
  return 0
}
export -f retry

#######################################
# EXIT handler: deletes the cluster unless -p was given, in which case it
# prints how to resume against the preserved cluster.
# Globals: PRESERVE, SCRIPTPATH, DELETE_CLUSTER_ARGS, TMP_DIR (read)
#######################################
function clean_up {
  if [[ "$PRESERVE" == false ]]; then
    # The ${arr[@]+...} form expands to nothing for an empty array without
    # tripping `set -u` on older bash releases.
    "$SCRIPTPATH/../k8s-local-cluster-test/delete-cluster" \
      ${DELETE_CLUSTER_ARGS[@]+"${DELETE_CLUSTER_ARGS[@]}"} || :
    return
  fi
  echo "To resume test with the same cluster use: \"-c $TMP_DIR\""
}

#######################################
# INT/TERM/ERR handler: dumps the NTH worker pod logs (best effort), prints the
# elapsed time and exits 1.
# Globals: START, CLUSTER_NAME (read); END (written)
#######################################
function exit_and_fail {
  local pod_id
  pod_id=$(get_nth_worker_pod || :)
  kubectl logs "$pod_id" --namespace kube-system || :
  END=$(date +%s)
  echo "⏰ Took $((END - START))sec"
  echo "❌ NTH Integration Test FAILED $CLUSTER_NAME! ❌"
  exit 1
}

#######################################
# Returns the cluster to a clean state: removes helm releases from the default
# and kube-system namespaces, uncordons every node, drops the NTH taints and
# labels, and deletes all events.
#######################################
function reset_cluster {
  local charts system_charts node
  echo "Resetting cluster"

  charts=$(helm ls --all --short)
  if [[ -n "$charts" ]]; then
    # Intentionally unquoted: one release name per word.
    # shellcheck disable=SC2086
    helm del $charts --no-hooks || :
  fi

  system_charts=$(helm ls --all --short --namespace kube-system)
  if [[ -n "$system_charts" ]]; then
    # shellcheck disable=SC2086
    helm del $system_charts --no-hooks --namespace kube-system || :
  fi

  for node in $(kubectl get nodes -o jsonpath='{.items[*].metadata.name}'); do
    kubectl uncordon "$node"
    # Removing a taint that is not present fails, hence the `|| true`.
    kubectl taint node "$node" aws-node-termination-handler/scheduled-maintenance- || true
    kubectl taint node "$node" aws-node-termination-handler/spot-itn- || true
    kubectl taint node "$node" aws-node-termination-handler/asg-lifecycle-termination- || true
    kubectl taint node "$node" aws-node-termination-handler/rebalance-recommendation- || true
  done

  remove_labels || :
  kubectl delete events --all
  sleep 2
}

#######################################
# Removes every node label whose JSON line mentions
# 'aws-node-termination-handler' from all nodes in the cluster.
#######################################
function remove_labels {
  local line label node
  local -a labels_to_remove
  local -a nodes
  labels_to_remove=()
  nodes=()

  echo "Removing labels from NTH cluster nodes"
  # Each matching line looks like `"key": "value",`; strip blanks and quotes
  # and keep the key. sort -u avoids deleting the same label once per node
  # that carries it.
  while IFS='' read -r line; do
    labels_to_remove+=("$line")
  done < <(kubectl get nodes -o json \
    | jq '.items[].metadata.labels' \
    | grep 'aws-node-termination-handler' \
    | tr -d '[:blank:]' \
    | tr -d '\"' \
    | cut -d':' -f1 \
    | sort -u)

  if [[ "${#labels_to_remove[@]}" -eq 0 ]]; then
    return 0
  fi

  # Fetch the node list once instead of once per label.
  while IFS='' read -r line; do
    nodes+=("$line")
  done < <(kubectl get nodes -o json | jq -r '.items[].metadata.name')

  for label in "${labels_to_remove[@]}"; do
    for node in ${nodes[@]+"${nodes[@]}"}; do
      echo "Deleting label $label on node $node"
      kubectl label node "$node" "$label"-
    done
  done
}

#######################################
# Prints the name of the first Running NTH pod in kube-system (ordered by
# creation time) that was created at or after START_FOR_QUERYING.
# Globals: START_FOR_QUERYING (read)
#######################################
function get_nth_worker_pod {
  kubectl get pods -n kube-system \
    --selector 'app.kubernetes.io/name=aws-node-termination-handler' \
    --field-selector="status.phase=Running" \
    --sort-by=.metadata.creationTimestamp \
    -o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \
    | awk -v start="${START_FOR_QUERYING//+0000/Z}" '$2 >= start { print $1 }' \
    | head -n 1
}

#######################################
# Prints the repository part of an image reference. The tag separator is only
# looked for in the last path component so a registry port such as
# localhost:5000/img:tag is not mistaken for a tag.
# Arguments: $1 - image reference
#######################################
function image_repo {
  local image=$1
  local last_component=${image##*/}
  if [[ "$last_component" == *:* ]]; then
    printf '%s\n' "${image%:*}"
  else
    printf '%s\n' "$image"
  fi
}

#######################################
# Prints the tag part of an image reference, or "latest" (Docker's default tag)
# when the reference carries no tag.
# Arguments: $1 - image reference
#######################################
function image_tag {
  local image=$1
  local last_component=${image##*/}
  if [[ "$last_component" == *:* ]]; then
    printf '%s\n' "${image##*:}"
  else
    printf '%s\n' "latest"
  fi
}

USAGE=$(cat << 'EOM'
Usage: run-test [-p] [-d] [-o] [-a <script-path>] [-b <base-name>] [-c <cluster-context-dir>] [-n <nth-docker-image>] [-v K8S_VERSION] [-w <webhook-docker-image>]
Executes a test within a provisioned kubernetes cluster with NTH and IMDS pre-loaded.

Example: run-test -p -n node-termination-handler:customtest

Optional:
  -a  Assertion script, or a comma-separated list of scripts (default is ALL tests in the e2e dir)
  -b  Base name of test (will be used for cluster too)
  -c  Cluster context directory, if operating on an existing cluster
  -p  Preserve kind k8s cluster for inspection
  -n  Node Termination Handler Docker Image
  -d  use GOPROXY=direct to bypass proxy.golang.org
  -o  Override path w/ your own kubectl and kind binaries
  -v  Kubernetes Version (Default: 1.22) [1.22, 1.23, 1.24, 1.25, and 1.26]
  -w  Webhook Docker Image
EOM
)

# Process our input arguments
while getopts "pdn:oc:a:b:v:w:" opt; do
  case ${opt} in
    p ) # PRESERVE K8s Cluster
      echo "❄️ This run will preserve the cluster as you requested"
      PRESERVE=true
      ;;
    n ) # Node Termination Handler Docker Image
      NODE_TERMINATION_HANDLER_DOCKER_IMG="$OPTARG"
      ;;
    d ) # use GOPROXY=direct
      DOCKER_ARGS=(--build-arg GOPROXY=direct)
      ;;
    o ) # Override path with your own kubectl and kind binaries
      DELETE_CLUSTER_ARGS+=(-o)
      PROVISION_CLUSTER_ARGS+=(-o)
      OVERRIDE_PATH=1
      ;;
    c ) # Cluster context directory to operate on existing cluster
      TMP_DIR=$OPTARG
      ;;
    a ) # Assertion script(s), comma separated
      ASSERTION_SCRIPTS=$(echo "$OPTARG" | tr "," "\n")
      ;;
    b ) # Base cluster name
      CLUSTER_NAME_BASE=$OPTARG
      ;;
    v ) # K8s VERSION
      K8S_VERSION=$OPTARG
      ;;
    w ) # Webhook Docker Image
      WEBHOOK_DOCKER_IMG="$OPTARG"
      ;;
    \? )
      echo "$USAGE" 1>&2
      # Invalid usage must not look like success to the caller.
      exit 1
      ;;
  esac
done

if [[ -z "$TMP_DIR" ]]; then
  # The version and the extra flags are separate words; previously they were
  # fused into a single -v argument.
  TMP_DIR=$("$SCRIPTPATH/../k8s-local-cluster-test/provision-cluster" \
    -b "$CLUSTER_NAME_BASE" \
    -v "$K8S_VERSION" \
    ${PROVISION_CLUSTER_ARGS[@]+"${PROVISION_CLUSTER_ARGS[@]}"})
fi

# With -o the caller's own binaries take precedence over the ones in TMP_DIR.
if [[ $OVERRIDE_PATH -eq 0 ]]; then
  export PATH=$TMP_DIR:$PATH
else
  export PATH=$PATH:$TMP_DIR
fi

CLUSTER_NAME=$(cat "$TMP_DIR/clustername")

## Build and Load Docker Images

if [ -z "$NODE_TERMINATION_HANDLER_DOCKER_IMG" ]; then
  echo "🥑 Building the node-termination-handler docker image"
  docker buildx build --load ${DOCKER_ARGS[@]+"${DOCKER_ARGS[@]}"} \
    -t "$DEFAULT_NODE_TERMINATION_HANDLER_DOCKER_IMG" "$SCRIPTPATH/../../."
  NODE_TERMINATION_HANDLER_DOCKER_IMG="$DEFAULT_NODE_TERMINATION_HANDLER_DOCKER_IMG"
  echo "👍 Built the node-termination-handler docker image"
else
  echo "🥑 Skipping building the node-termination-handler docker image, since one was specified ($NODE_TERMINATION_HANDLER_DOCKER_IMG)"
fi
echo "$NODE_TERMINATION_HANDLER_DOCKER_IMG" > "$TMP_DIR/nth-docker-img"
NODE_TERMINATION_HANDLER_DOCKER_REPO=$(image_repo "$NODE_TERMINATION_HANDLER_DOCKER_IMG")
NODE_TERMINATION_HANDLER_DOCKER_TAG=$(image_tag "$NODE_TERMINATION_HANDLER_DOCKER_IMG")

if [ -z "$WEBHOOK_DOCKER_IMG" ]; then
  echo "🥑 Building the webhook-test-proxy docker image"
  docker buildx build --load ${DOCKER_ARGS[@]+"${DOCKER_ARGS[@]}"} \
    -t "$DEFAULT_WEBHOOK_DOCKER_IMG" "$SCRIPTPATH/../webhook-test-proxy/."
  WEBHOOK_DOCKER_IMG="$DEFAULT_WEBHOOK_DOCKER_IMG"
  echo "👍 Built the webhook-test-proxy docker image"
else
  echo "🥑 Skipping building the webhook-test-proxy docker image, since one was specified ($WEBHOOK_DOCKER_IMG)"
fi
echo "$WEBHOOK_DOCKER_IMG" > "$TMP_DIR/webhook-test-proxy-docker-img"
WEBHOOK_DOCKER_REPO=$(image_repo "$WEBHOOK_DOCKER_IMG")
WEBHOOK_DOCKER_TAG=$(image_tag "$WEBHOOK_DOCKER_IMG")

echo "🥑 Loading both images into the cluster"
kind load docker-image --name "$CLUSTER_NAME" "$NODE_TERMINATION_HANDLER_DOCKER_IMG"
kind load docker-image --name "$CLUSTER_NAME" "$WEBHOOK_DOCKER_IMG"
echo "👍 Loaded both images into the cluster"

export KUBECONFIG="$TMP_DIR/kubeconfig"

trap "exit_and_fail" INT TERM ERR
trap "clean_up" EXIT

# Helper file a developer can source to poke at the cluster by hand. The
# escaped \$ expansions are evaluated when the file is sourced, so the echoes
# report the values actually in effect in that shell.
cat << EOF > "$TMP_DIR/env"
export KUBECONFIG="$TMP_DIR/kubeconfig"
echo "Updated KUBECONFIG=\$KUBECONFIG"
export PATH="$TMP_DIR:\$PATH"
echo "Updated PATH=\$PATH"
EOF

echo "======================================================================================================"
echo "To poke around your test manually:"
echo ". $TMP_DIR/env"
echo "kubectl get pods -A"
echo "======================================================================================================"

### exported vars and funcs that tests can use
export TMP_DIR
export CLUSTER_NAME
export AEMM_URL
export AEMM_VERSION
export AEMM_DL_URL
export WEBHOOK_URL
export -f timeout
export -f relpath
export -f get_nth_worker_pod
export NODE_TERMINATION_HANDLER_DOCKER_IMG
export NODE_TERMINATION_HANDLER_DOCKER_REPO
export NODE_TERMINATION_HANDLER_DOCKER_TAG
export WEBHOOK_DOCKER_IMG
export WEBHOOK_DOCKER_REPO
export WEBHOOK_DOCKER_TAG
export AWS_REGION="us-east-1"
export WORKER_IP="192.168.0.1"
export NTH_CONTROL_LABEL="kubernetes\.io/hostname=$CLUSTER_NAME-control-plane"
export NTH_WORKER_LABEL="kubernetes\.io/hostname=ip-${WORKER_IP//\./-}.ec2.internal"
###

## Need to override hostname label for CTH localstack tests
# The exported label value carries backslash escapes; strip them for kubectl.
kubectl label node "${CLUSTER_NAME}-worker" "${NTH_WORKER_LABEL//\\/}" --overwrite

## Mark worker2 only for Critical Add-Ons like dns
kubectl taint node "${CLUSTER_NAME}-worker2" CriticalAddonsOnly=true:NoSchedule --overwrite

# Split the newline-separated script list into an array so that paths
# containing spaces stay intact.
assertion_scripts=()
while IFS= read -r script_path; do
  if [[ -n "$script_path" ]]; then
    assertion_scripts+=("$script_path")
  fi
done <<< "$ASSERTION_SCRIPTS"

i=0
for assert_script in ${assertion_scripts[@]+"${assertion_scripts[@]}"}; do
  reset_cluster
  START_FOR_QUERYING=$(date -u +"%Y-%m-%dT%TZ")
  # Each assertion script gets its own IMDS port, counting up from 1338.
  IMDS_PORT=$((i + 1338))
  export IMDS_PORT
  i=$((i + 1))
  echo "======================================================================================================"
  echo "🥑 Running assertion script $(basename "$assert_script")"
  echo "======================================================================================================"
  assert_start=$(date +%s)
  "$assert_script"
  assert_end=$(date +%s)
  echo "⏰ Took $((assert_end - assert_start))sec"
  POD_ID=$(get_nth_worker_pod || :)
  kubectl logs "$POD_ID" --namespace kube-system || :
  ## Resets cluster to run another test on the same cluster
  reset_cluster
  echo "✅ Assertion test $assert_script PASSED! ✅"
done

echo "======================================================================================================"
echo "✅ All tests passed! ✅"
echo "======================================================================================================"