#!/usr/bin/env bash

# Builds and installs the controller on the EKS Data Plane. This is for
# testing purposes only and it will not work for accounts that are
# not allowlisted for the ENI Trunking feature.

set -eo pipefail

USAGE=$(cat << 'EOM'
Usage: test-with-eksctl.sh -n [cluster-name] [-i controller-role-arn] [-r aws-region] [-s resource-suffix]

Builds and installs the controller on the EKS Data Plane. This is for testing purposes
only and it will not work for accounts that are not allowlisted for the ENI Trunking feature.

 Required:
 -n   name of the existing EKS cluster

 Optional:
 -i   IAM role ARN that will be assumed by the controller to manage trunk/branch ENIs
 -s   Suffix that will be added to each resource. This is useful when running in a CI setup
      to prevent parallel runs from modifying the same resources.
 -r   AWS Region where the controller image will be hosted. Defaults to us-west-2
EOM
)

SCRIPTS_DIR=$(cd "$(dirname "$0")" || exit 1; pwd)
TEMPLATE_DIR=$SCRIPTS_DIR/template/rbac
# PID of this (parent) script, used by the background log collector to detect when the
# script exits. Note: BASHPID is a bash special variable and assignments to it have no
# effect, so a distinct variable name is used here.
SCRIPT_PID=$$

source "$SCRIPTS_DIR/lib/common.sh"

while getopts "n:i:r:s:" o; do
  case "${o}" in
    n) # Name of the EKS Cluster
      CLUSTER_NAME=${OPTARG}
      ;;
    i) # Role ARN that will be assumed by the VPC Resource Controller
      VPC_RC_ROLE_ARN=${OPTARG}
      ;;
    r) # Region where the ECR image will be hosted
      AWS_REGION=${OPTARG}
      ;;
    s) # Resource suffix attached to AWS resource names
      RESOURCE_SUFFIX=${OPTARG}
      ;;
    *)
      echoerr "${USAGE}"
      exit 1
      ;;
  esac
done
shift $((OPTIND-1))

if [[ -z "$CLUSTER_NAME" ]]; then
  echoerr "${USAGE}\n\nmissing: -n is a required flag\n"
  exit 1
fi

if [[ -z "$AWS_REGION" ]]; then
  AWS_REGION="us-west-2"
  echo "no region defined, falling back to the default region $AWS_REGION"
fi

source "$SCRIPTS_DIR/lib/aws.sh"
source "$SCRIPTS_DIR/lib/k8s.sh"
source "$SCRIPTS_DIR/lib/config.sh"

check_is_installed aws
check_is_installed docker
check_is_installed ginkgo
check_is_installed eksctl
check_is_installed kubectl
check_is_installed kustomize
check_is_installed jq

CLUSTER_NAME=$(add_suffix "$CLUSTER_NAME")

AWS_ACCOUNT_ID="${AWS_ACCOUNT_ID:-$(aws sts get-caller-identity | jq .Account -r)}"
VPC_ID=$(aws eks describe-cluster --name "$CLUSTER_NAME" --region "$AWS_REGION" | jq -r '.cluster.resourcesVpcConfig.vpcId')

KUBE_CONFIG_PATH=~/.kube/config
CONTROLLER_LOG_FILE=/tmp/$CLUSTER_NAME.logs
TEST_FAILED=false

# If the role ARN is not provided, use the default role name
if [[ -z "$VPC_RC_ROLE_ARN" ]]; then
  VPC_RC_ROLE_ARN="arn:aws:iam::$AWS_ACCOUNT_ID:role/$VPC_RC_ROLE_NAME"
fi

ECR_URL=${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
ECR_REPOSITORY=amazon/vpc-resource-controller
ECR_IMAGE_TAG=$(add_suffix "test")
IMAGE=$ECR_URL/$ECR_REPOSITORY:$ECR_IMAGE_TAG

function build_and_push_image() {
  echo "building and pushing controller image to ECR"
  IMAGE=$IMAGE AWS_ACCOUNT=$AWS_ACCOUNT_ID AWS_REGION=$AWS_REGION make docker-build
  IMAGE=$IMAGE AWS_ACCOUNT=$AWS_ACCOUNT_ID AWS_REGION=$AWS_REGION make docker-push
}

function install_controller() {
  echo "installing amazon-vpc-resource-controller-k8s"
  IMAGE=$IMAGE \
  AWS_ACCOUNT=$AWS_ACCOUNT_ID \
  AWS_REGION=$AWS_REGION \
  CLUSTER_NAME=$CLUSTER_NAME \
  USER_ROLE_ARN=$VPC_RC_ROLE_ARN \
  make deploy

  check_deployment_rollout vpc-resource-local-controller kube-system 2m
}

function disable_eks_controller() {
  echo "disabling the default amazon-vpc-resource-controller-k8s controller"

  # Delete the Mutating Webhook Configuration
  kubectl delete mutatingwebhookconfigurations.admissionregistration.k8s.io vpc-resource-mutating-webhook

  # Delete the Validating Webhook Configuration
  kubectl delete validatingwebhookconfigurations.admissionregistration.k8s.io vpc-resource-validating-webhook

  # Remove the patch/update permission on the ConfigMap from the EKS VPC RC leader election Role
  kubectl patch roles -n kube-system vpc-resource-controller-leader-election-role \
    --patch "$(cat "$TEMPLATE_DIR/cp-vpc-leader-election-role-patch.yaml")"
}
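
# Optional sanity check (illustrative only, not part of the test flow): after
# disable_eks_controller runs, the webhook configurations deleted above should no longer
# exist. The resource names are the same ones deleted in the function above.
#
#   kubectl get mutatingwebhookconfigurations.admissionregistration.k8s.io vpc-resource-mutating-webhook
#   kubectl get validatingwebhookconfigurations.admissionregistration.k8s.io vpc-resource-validating-webhook
#
# Both commands are expected to fail with "NotFound" once the control-plane controller is disabled.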

function set_pod_eni_flag_on_ipamd() {
  local flag=$1
  echo "Setting Pod ENI on aws-node/ipamd to $flag"
  kubectl set env daemonset aws-node -n kube-system ENABLE_POD_ENI=$flag
  kubectl rollout status daemonset aws-node -n kube-system
}

function run_integration_test() {
  local additional_ginkgo_params=$1

  # SGP Tests
  (cd test/integration/perpodsg && \
  CGO_ENABLED=0 ginkgo "$additional_ginkgo_params" \
  -v -timeout 40m -- \
  -cluster-kubeconfig=$KUBE_CONFIG_PATH \
  -cluster-name=$CLUSTER_NAME \
  --aws-region=$AWS_REGION \
  --aws-vpc-id=$VPC_ID) || TEST_FAILED=true

  # Windows Test
  (cd test/integration/windows && \
  CGO_ENABLED=0 ginkgo "$additional_ginkgo_params" -v -timeout 85m -- \
  -cluster-kubeconfig=$KUBE_CONFIG_PATH \
  -cluster-name=$CLUSTER_NAME \
  --aws-region=$AWS_REGION \
  --aws-vpc-id=$VPC_ID) || TEST_FAILED=true

  # SGP + Windows Webhook Test
  (cd test/integration/webhook && \
  CGO_ENABLED=0 ginkgo "$additional_ginkgo_params" -v -timeout 10m -- \
  -cluster-kubeconfig=$KUBE_CONFIG_PATH \
  -cluster-name=$CLUSTER_NAME \
  --aws-region=$AWS_REGION \
  --aws-vpc-id=$VPC_ID) || TEST_FAILED=true

  # SGP + Metrics for Regression Test
  if [[ $additional_ginkgo_params == "--focus=LOCAL" ]]; then
    # We need a larger group of nodes to test the regression of controllers
    ng=$(eksctl get nodegroup --cluster=$CLUSTER_NAME --output=json | jq '.[0]')
    ng_name=$(echo $ng | jq -r '.Name')
    desired_size=$(echo $ng | jq '.DesiredCapacity')
    max_size=$(echo $ng | jq '.MaxSize')
    min_size=$(echo $ng | jq '.MinSize')
    eksctl scale nodegroup --cluster=$CLUSTER_NAME --name=$ng_name --nodes=100 --nodes-min=0 --nodes-max=150
    sleep 300
    (cd test/integration/metrics && \
    CGO_ENABLED=0 ginkgo "$additional_ginkgo_params" -v -timeout 20m -- \
    -cluster-kubeconfig=$KUBE_CONFIG_PATH \
    -cluster-name=$CLUSTER_NAME \
    --aws-region=$AWS_REGION \
    --aws-vpc-id=$VPC_ID \
    --latest-released-rc-image-tag=v1.1.3) || TEST_FAILED=true
    eksctl scale nodegroup --cluster=$CLUSTER_NAME --name=$ng_name --nodes=$desired_size --nodes-min=$min_size --nodes-max=$max_size
  fi
}

function verify_controller_has_lease() {
  # Get the names of the VPC Resource Controller Pods
  local controller_pod_names="$(kubectl get pods -n kube-system -l app=vpc-resource-controller \
    --no-headers -o custom-columns=":metadata.name")"

  # Wait till the new controller has acquired the leader lease
  i=0
  while :
  do
    local lease_holder="$(kubectl get configmap -n kube-system cp-vpc-resource-controller -o json \
      | jq '.metadata.annotations."control-plane.alpha.kubernetes.io/leader"' --raw-output \
      | jq .holderIdentity --raw-output)"

    for controller_pod_name in $controller_pod_names
    do
      if [[ $lease_holder == $controller_pod_name* ]]; then
        echo "one of the new controllers has the lease: $lease_holder"
        break 2
      fi
    done

    if [[ $i -ge 20 ]]; then
      echo "new controller failed to acquire the leader lease in $i attempts"
      exit 1
    else
      echo "new controller doesn't have the leader lease yet, will retry"
      sleep 10
      i=$((i+1))
    fi
  done

  # Get the lease transition count at the time the new controller acquired the lease
  LEADER_TRANSITION_BEFORE_TEST=$(get_leader_lease_transition_count)
}
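
# For reference (illustrative values only): the "control-plane.alpha.kubernetes.io/leader"
# annotation parsed above and below is a JSON-encoded leader election record, roughly of
# the form:
#
#   {"holderIdentity":"vpc-resource-local-controller-xxxxxxxxxx-xxxxx_<uuid>",
#    "leaseDurationSeconds":15,"acquireTime":"...","renewTime":"...","leaderTransitions":3}
#
# holderIdentity is prefix-matched against the controller pod names above, and
# leaderTransitions is the counter compared before and after the test run below.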

function verify_leader_lease_didnt_change() {
  # Get the leader lease transition count after running all integration tests
  LEADER_TRANSITION_AFTER_TEST=$(get_leader_lease_transition_count)

  # If the leader transition count increased while the tests were running, assume the tests
  # failed, as it could mean the controller either failed to hold the lease or restarted
  # during test execution, which ideally should not happen
  if [[ $LEADER_TRANSITION_BEFORE_TEST != "$LEADER_TRANSITION_AFTER_TEST" ]]; then
    echo "leader transitioned during the tests, the new controller failed to keep the leader lease"
    exit 1
  fi
}

# Get the number of times the leader lease transitioned between different owners
function get_leader_lease_transition_count() {
  kubectl get configmap -n kube-system cp-vpc-resource-controller -o json \
    | jq '.metadata.annotations."control-plane.alpha.kubernetes.io/leader"' --raw-output \
    | jq .leaderTransitions
}

function output_logs() {
  cat $CONTROLLER_LOG_FILE
}

function clean_up() {
  echo "cleaning up..."
  delete_ecr_image "$ECR_REPOSITORY" "$ECR_IMAGE_TAG"
  output_logs
}

function redirect_vpc_controller_logs() {
  # As long as the parent bash process is running, keep redirecting the controller logs to
  # the log file. The parent process prints the contents of the file on exit
  while ps -p $SCRIPT_PID > /dev/null
  do
    kubectl logs -n kube-system -l app=vpc-resource-controller \
      --tail=-1 -f >> $CONTROLLER_LOG_FILE || echo "LOG COLLECTOR: existing controller killed, will retry"
    # Allow for the new controller to come up
    sleep 10
  done
  echo "LOG COLLECTOR: exiting the process"
}

# On exit, delete the test image from ECR and print the collected controller logs
trap 'clean_up' EXIT

# Cordon the Windows Nodes, as cert-manager and other future 3rd party
# dependencies may not have nodeSelectors to schedule pods on Linux
kubectl cordon -l kubernetes.io/os=windows

# Install the stable version of the VPC CNI
bash "$SCRIPTS_DIR/install-vpc-cni.sh" "1.11"

# Install cert-manager, which is used for generating the
# certificates for the Webhooks
bash "$SCRIPTS_DIR/install-cert-manager.sh"

# Login to ECR to push the controller image
ecr_login "$AWS_REGION" "$ECR_URL"

# Create the repository but don't delete it, as it can hold
# test images from other GitHub runs
create_repository "$ECR_REPOSITORY"

# Push the image to ECR
build_and_push_image

# Install the CRD and the Controller on the Data Plane
install_controller

# Uncordon the Windows Nodes; all new deployments/pods from this
# point on must have nodeSelectors to be scheduled on the right OS
kubectl uncordon -l kubernetes.io/os=windows
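
# Illustrative example (not executed by this script): once the Windows nodes are uncordoned,
# workloads created by the test suites are expected to pin their target OS via a nodeSelector
# in the pod spec, e.g.:
#
#   nodeSelector:
#     kubernetes.io/os: linux    # or "windows" for the Windows test pods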

# Start redirecting the controller logs to the log file, which is printed when the script
# exits. This runs in the background and exits when the bash script exits
redirect_vpc_controller_logs &

# Disable the Controller on the EKS Control Plane
disable_eks_controller

# Verify the Controller on the Data Plane has the leader lease
verify_controller_has_lease

# Enable the SGP feature on IPAMD, which labels the node
# with a feature flag used by the controller to start managing
# the node for ENI Trunking/Branching
set_pod_eni_flag_on_ipamd "true"

# Allow for IPAMD to label the node after startup
# TODO: Handle this in the Test Suite in a more concrete manner
sleep 60

# Run the Ginkgo tests for Security Groups for Pods and skip all the local tests, as
# they require restarts which would lead to the leader lease being switched and the
# next validation step failing
run_integration_test "--skip=LOCAL"

# Verify the leader lease didn't transition during the execution of the test cases
verify_leader_lease_didnt_change

# Run the local Ginkgo tests that require multiple restarts of the controller for
# negative scenario testing
run_integration_test "--focus=LOCAL"

# Revert back to the initial state after the test
set_pod_eni_flag_on_ipamd "false"

# If any of the tests failed, exit with a non-zero exit code
if [ "$TEST_FAILED" = true ]; then
  exit 1
fi
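
# Example invocation (hypothetical cluster name and suffix; flags as documented in USAGE above):
#
#   ./test-with-eksctl.sh -n my-eks-cluster -r us-west-2 -s pr-123
#
# On exit, clean_up prints the collected controller logs; they are also kept at
# /tmp/<cluster-name>.logs (see CONTROLLER_LOG_FILE above) for later inspection.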