#!/bin/bash # This script run integration tests on the EKS VPC Resource Controller # This is not intended to run integration tests when controller is running # on the data plane for development and testing purposes. The script expects # an EKS Cluster with atlest 3 Windows and Linux Nodes to be pre-created to # run the Canary tests. This scrip is invoked from external sources. set -e SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" GINKGO_TEST_BUILD_DIR="$SCRIPT_DIR/../../build" SECONDS=0 VPC_CNI_ADDON_NAME="vpc-cni" source "$SCRIPT_DIR"/lib/cluster.sh echo "Running VPC Resource Controller integration test with the following variables KUBE CONFIG: $KUBE_CONFIG_PATH CLUSTER_NAME: $CLUSTER_NAME REGION: $REGION" # Default Proxy is not allowed in China Region if [[ $REGION == "cn-north-1" || $REGION == "cn-northwest-1" ]]; then go env -w GOPROXY=https://goproxy.cn,direct go env -w GOSUMDB=sum.golang.google.cn fi function load_addon_details() { echo "loading $VPC_CNI_ADDON_NAME addon details" DESCRIBE_ADDON_VERSIONS=$(aws eks describe-addon-versions $ENDPOINT_FLAG --addon-name $VPC_CNI_ADDON_NAME --kubernetes-version "$K8S_VERSION") LATEST_ADDON_VERSION=$(echo "$DESCRIBE_ADDON_VERSIONS" | jq '.addons[0].addonVersions[0].addonVersion' -r) DEFAULT_ADDON_VERSION=$(echo "$DESCRIBE_ADDON_VERSIONS" | jq -r '.addons[].addonVersions[] | select(.compatibilities[0].defaultVersion == true) | .addonVersion') } function wait_for_addon_status() { local expected_status=$1 local retry_attempt=0 if [ "$expected_status" = "DELETED" ]; then while $(aws eks describe-addon $ENDPOINT_FLAG --cluster-name "$CLUSTER_NAME" --addon-name $VPC_CNI_ADDON_NAME --region "$REGION"); do if [ $retry_attempt -ge 30 ]; then echo "failed to delete addon, qutting after too many attempts" exit 1 fi echo "addon is still not deleted" sleep 5 ((retry_attempt=retry_attempt+1)) done echo "addon deleted" # Even after the addon API Returns an error, if you immediately try to create a new addon it sometimes fails sleep 10 return fi retry_attempt=0 while true do STATUS=$(aws eks describe-addon $ENDPOINT_FLAG --cluster-name "$CLUSTER_NAME" --addon-name $VPC_CNI_ADDON_NAME --region "$REGION" | jq -r '.addon.status') if [ "$STATUS" = "$expected_status" ]; then echo "addon status matches expected status" return fi # We have seen the canary test get stuck, we don't know what causes this but most likely suspect is # the addon update/delete attempts. So adding limited retries. if [ $retry_attempt -ge 30 ]; then echo "failed to get desired add-on status: $STATUS, qutting after too many attempts" exit 1 fi echo "addon status is not equal to $expected_status" sleep 10 ((retry_attempt=retry_attempt+1)) done } function install_add_on() { local new_addon_version=$1 if [[ -z "$new_addon_version" || "$new_addon_version" == "null" ]]; then echo "addon information for $VPC_CNI_ADDON_NAME not available, skipping EKS-managed addon installation. Tests will run against self-managed addon." return fi if DESCRIBE_ADDON=$(aws eks describe-addon $ENDPOINT_FLAG --cluster-name "$CLUSTER_NAME" --addon-name $VPC_CNI_ADDON_NAME --region $REGION); then local current_addon_version=$(echo "$DESCRIBE_ADDON" | jq '.addon.addonVersion' -r) if [ "$new_addon_version" != "$current_addon_version" ]; then echo "deleting the $current_addon_version to install $new_addon_version" aws eks delete-addon $ENDPOINT_FLAG --cluster-name "$CLUSTER_NAME" --addon-name "$VPC_CNI_ADDON_NAME" --region $REGION wait_for_addon_status "DELETED" else echo "addon version $current_addon_version already installed" return fi fi echo "installing addon $new_addon_version" aws eks create-addon $ENDPOINT_FLAG --cluster-name "$CLUSTER_NAME" --addon-name $VPC_CNI_ADDON_NAME --resolve-conflicts OVERWRITE --addon-version $new_addon_version --region $REGION wait_for_addon_status "ACTIVE" } function run_canary_tests() { if [[ -z "${SKIP_MAKE_TEST_BINARIES}" ]]; then echo "making ginkgo test binaries" (cd $SCRIPT_DIR/../.. && make build-test-binaries) else echo "skipping making ginkgo test binaries" fi # For each component, we want to cover the most important test cases. We also don't want to take more than 30 minutes # per repository as these tests are run sequentially along with tests from other repositories # Currently the overall execution time is ~50 minutes and we will reduce it in future (CGO_ENABLED=0 ginkgo --no-color --focus="CANARY" $EXTRA_GINKGO_FLAGS -v --timeout 10m $GINKGO_TEST_BUILD_DIR/perpodsg.test -- --cluster-kubeconfig=$KUBE_CONFIG_PATH --cluster-name=$CLUSTER_NAME --aws-region=$REGION --aws-vpc-id=$VPC_ID) if [[ -z "${SKIP_WINDOWS_TEST}" ]]; then (CGO_ENABLED=0 ginkgo --no-color --focus="CANARY" $EXTRA_GINKGO_FLAGS -v --timeout 25m $GINKGO_TEST_BUILD_DIR/windows.test -- --cluster-kubeconfig=$KUBE_CONFIG_PATH --cluster-name=$CLUSTER_NAME --aws-region=$REGION --aws-vpc-id=$VPC_ID) else echo "skipping Windows tests" fi (CGO_ENABLED=0 ginkgo --no-color --focus="CANARY" $EXTRA_GINKGO_FLAGS -v --timeout 5m $GINKGO_TEST_BUILD_DIR/webhook.test -- --cluster-kubeconfig=$KUBE_CONFIG_PATH --cluster-name=$CLUSTER_NAME --aws-region=$REGION --aws-vpc-id=$VPC_ID) } echo "Starting the ginkgo test suite" load_cluster_details # Addons is supported from 1.18 onwards load_addon_details install_add_on "$LATEST_ADDON_VERSION" attach_controller_policy_cluster_role set_env_aws_node "ENABLE_POD_ENI" "true" run_canary_tests set_env_aws_node "ENABLE_POD_ENI" "false" detach_controller_policy_cluster_role echo "Successfully ran all tests in $(($SECONDS / 60)) minutes and $(($SECONDS % 60)) seconds"