#!/bin/bash # # Copyright 2014 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). # You may not use this file except in compliance with the License. # A copy of the License is located at # # http://aws.amazon.com/apache2.0 # # or in the "license" file accompanying this file. This file is distributed # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either # express or implied. See the License for the specific language governing # permissions and limitations under the License. # ELB_LIST defines which Elastic Load Balancers this instance should be part of. # The elements in ELB_LIST should be separated by space. Safe default is "". # Set to "_all_" to automatically find all load balancers the instance is registered to. # Set to "_any_" will work as "_all_" but will not fail if instance is not attached to # any ASG or ELB, giving flexibility. ELB_LIST="" # Under normal circumstances, you shouldn't need to change anything below this line. # ----------------------------------------------------------------------------- export PATH="$PATH:/usr/bin:/usr/local/bin" # If true, all messages will be printed. If false, only fatal errors are printed. DEBUG=true # If true, all commands will have a initial jitter - use this if deploying to significant number of instances only INITIAL_JITTER=false # Number of times to check for a resouce to be in the desired state. WAITER_ATTEMPTS=60 # Number of seconds to wait between attempts for resource to be in a state. WAITER_INTERVAL=3 # AutoScaling Standby features at minimum require this version to work. MIN_CLI_VERSION='1.3.25' # Create a flagfile for each deployment FLAGFILE="/tmp/asg_codedeploy_flags-$DEPLOYMENT_GROUP_ID-$DEPLOYMENT_ID" # Handle ASG processes HANDLE_PROCS=false # # Performs CLI command and provides expotential backoff with Jitter between any failed CLI commands # FullJitter algorithm taken from: https://www.awsarchitectureblog.com/2015/03/backoff.html # Optional pre-jitter can be enabled via GLOBAL var INITIAL_JITTER (set to "true" to enable) # exec_with_fulljitter_retry() { local MAX_RETRIES=${EXPBACKOFF_MAX_RETRIES:-8} # Max number of retries local BASE=${EXPBACKOFF_BASE:-2} # Base value for backoff calculation local MAX=${EXPBACKOFF_MAX:-120} # Max value for backoff calculation local FAILURES=0 local RESP # Perform initial jitter sleep if enabled if [ "$INITIAL_JITTER" = "true" ]; then local SECONDS=$(( $RANDOM % ( ($BASE * 2) ** 2 ) )) sleep $SECONDS fi # Execute Provided Command RESP=$(eval $@) until [ $? -eq 0 ]; do FAILURES=$(( $FAILURES + 1 )) if (( $FAILURES > $MAX_RETRIES )); then echo "$@" >&2 echo " * Failed, max retries exceeded" >&2 return 1 else local SECONDS=$(( $RANDOM % ( ($BASE * 2) ** $FAILURES ) )) if (( $SECONDS > $MAX )); then SECONDS=$MAX fi echo "$@" >&2 echo " * $FAILURES failure(s), retrying in $SECONDS second(s)" >&2 sleep $SECONDS # Re-Execute provided command RESP=$(eval $@) fi done # Echo out CLI response which is captured by calling function echo $RESP return 0 } # Usage: get_instance_region # # Writes to STDOUT the AWS region as known by the local instance. get_instance_region() { if [ -z "$AWS_REGION" ]; then AWS_REGION=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document \ | grep -i region \ | awk -F\" '{print $4}') fi echo $AWS_REGION } AWS_CLI="exec_with_fulljitter_retry aws --region $(get_instance_region)" # Usage: set_flag # # Writes = to FLAGFILE set_flag() { if echo "$1=$2" >> $FLAGFILE; then return 0 else error_exit "Unable to write flag \"$1=$2\" to $FLAGFILE" fi } # Usage: get_flag # # Checks for in FLAGFILE. Echoes it's value and returns 0 on success or non-zero if it fails to read the file. get_flag() { if [ -r $FLAGFILE ]; then local result=$(awk -F= -v flag="$1" '{if ( $1 == flag ) {print $2}}' $FLAGFILE | tail -1) echo "${result}" return 0 else # FLAGFILE doesn't exist return 1 fi } # Usage: check_suspended_processes # # Checks processes suspended on the ASG before beginning and store them in # the FLAGFILE to avoid resuming afterwards. Also abort if Launch process # is suspended. check_suspended_processes() { # Get suspended processes in an array local suspended=($($AWS_CLI autoscaling describe-auto-scaling-groups \ --auto-scaling-group-name \"${asg_name}\" \ --query \'AutoScalingGroups[].SuspendedProcesses\' \ --output text \| awk \'{printf \$1\" \"}\')) if [ ${#suspended[@]} -eq 0 ]; then msg "No processes were suspended on the ASG before starting." else msg "This processes were suspended on the ASG before starting: ${suspended[*]}" fi # If "Launch" process is suspended abort because we will not be able to recover from StandBy. Note the "[[ ... =~" bashism. if [[ "${suspended[@]}" =~ "Launch" ]]; then error_exit "'Launch' process of AutoScaling is suspended which will not allow us to recover the instance from StandBy. Aborting." fi for process in ${suspended[@]}; do set_flag "$process" "true" done } # Usage: suspend_processes # # Suspend processes known to cause problems during deployments. # The API call is idempotent so it doesn't matter if any were previously suspended. suspend_processes() { local -a processes=(AZRebalance AlarmNotification ScheduledActions ReplaceUnhealthy) msg "Suspending ${processes[*]} processes" $AWS_CLI autoscaling suspend-processes \ --auto-scaling-group-name \"${asg_name}\" \ --scaling-processes ${processes[@]} if [ $? != 0 ]; then error_exit "Failed to suspend ${processes[*]} processes for ASG ${asg_name}. Aborting as this may cause issues." fi } # Usage: resume_processes # # Resume processes suspended, except for the one suspended before deployment. resume_processes() { local -a processes=(AZRebalance AlarmNotification ScheduledActions ReplaceUnhealthy) local -a to_resume for p in ${processes[@]}; do if ! local tmp_flag_value=$(get_flag "$p"); then error_exit "$FLAGFILE doesn't exist or is unreadable" elif [ ! "$tmp_flag_value" = "true" ] ; then to_resume=("${to_resume[@]}" "$p") fi done msg "Resuming ${to_resume[*]} processes" $AWS_CLI autoscaling resume-processes \ --auto-scaling-group-name "${asg_name}" \ --scaling-processes ${to_resume[@]} if [ $? != 0 ]; then error_exit "Failed to resume ${to_resume[*]} processes for ASG ${asg_name}. Aborting as this may cause issues." fi } # Usage: remove_flagfile # # Removes FLAGFILE. Returns non-zero if failure. remove_flagfile() { if rm $FLAGFILE; then msg "Successfully removed flagfile $FLAGFILE" return 0 else msg "WARNING: Failed to remove flagfile $FLAGFILE." fi } # Usage: finish_msg # # Prints some finishing statistics finish_msg() { msg "Finished $(basename $0) at $(/bin/date "+%F %T")" end_sec=$(/bin/date +%s.%N) elapsed_seconds=$(echo "$end_sec" "$start_sec" | awk '{ print $1 - $2 }') msg "Elapsed time: $elapsed_seconds" } # Usage: autoscaling_group_name # # Prints to STDOUT the name of the AutoScaling group this instance is a part of and returns 0. If # it is not part of any groups, then it prints nothing. On error calling autoscaling, returns # non-zero. autoscaling_group_name() { local instance_id=$1 # This operates under the assumption that instances are only ever part of a single ASG. local autoscaling_name=$($AWS_CLI autoscaling describe-auto-scaling-instances \ --instance-ids $instance_id \ --output text \ --query AutoScalingInstances[0].AutoScalingGroupName | tr -d '\n\r') if [ $? != 0 ]; then return 1 elif [ "$autoscaling_name" == "None" ]; then echo "" else echo "${autoscaling_name}" fi return 0 } # Usage: autoscaling_enter_standby # # Move into the Standby state in AutoScaling group . Doing so will # pull it out of any Elastic Load Balancer that might be in front of the group. # # Returns 0 if the instance was successfully moved to standby. Non-zero otherwise. autoscaling_enter_standby() { local instance_id=$1 local asg_name=${2} msg "Checking if this instance has already been moved in the Standby state" local instance_state=$(get_instance_state_asg $instance_id) if [ $? != 0 ]; then msg "Unable to get this instance's lifecycle state." return 1 fi if [ "$instance_state" == "Standby" ]; then msg "Instance is already in Standby; nothing to do." return 0 fi if [ "$instance_state" == "Pending:Wait" ]; then msg "Instance is Pending:Wait; nothing to do." return 0 fi if [ "$HANDLE_PROCS" = "true" ]; then msg "Checking ASG ${asg_name} suspended processes" check_suspended_processes # Suspend troublesome processes while deploying suspend_processes fi msg "Checking to see if ASG ${asg_name} will let us decrease desired capacity" local min_desired=$($AWS_CLI autoscaling describe-auto-scaling-groups \ --auto-scaling-group-name "${asg_name}" \ --query \'AutoScalingGroups[0].[MinSize, DesiredCapacity]\' \ --output text) local min_cap=$(echo $min_desired | awk '{print $1}') local desired_cap=$(echo $min_desired | awk '{print $2}') if [ -z "$min_cap" -o -z "$desired_cap" ]; then msg "Unable to determine minimum and desired capacity for ASG ${asg_name}." msg "Attempting to put this instance into standby regardless." set_flag "asgmindecremented" "false" elif [ $min_cap == $desired_cap -a $min_cap -gt 0 ]; then local new_min=$(($min_cap - 1)) msg "Decrementing ASG ${asg_name}'s minimum size to $new_min" msg $($AWS_CLI autoscaling update-auto-scaling-group \ --auto-scaling-group-name \"${asg_name}\" \ --min-size $new_min) if [ $? != 0 ]; then msg "Failed to reduce ASG ${asg_name}'s minimum size to $new_min. Cannot put this instance into Standby." return 1 else msg "ASG ${asg_name}'s minimum size has been decremented, creating flag in file $FLAGFILE" # Create a "flag" denote that the ASG min has been decremented set_flag "asgmindecremented" "true" fi else msg "No need to decrement ASG ${asg_name}'s minimum size" set_flag "asgmindecremented" "false" fi msg "Putting instance $instance_id into Standby" $AWS_CLI autoscaling enter-standby \ --instance-ids $instance_id \ --auto-scaling-group-name \"${asg_name}\" \ --should-decrement-desired-capacity if [ $? != 0 ]; then msg "Failed to put instance $instance_id into Standby for ASG ${asg_name}." return 1 fi msg "Waiting for move to Standby to finish" wait_for_state "autoscaling" $instance_id "Standby" if [ $? != 0 ]; then local wait_timeout=$(($WAITER_INTERVAL * $WAITER_ATTEMPTS)) msg "Instance $instance_id did not make it to standby after $wait_timeout seconds" return 1 fi return 0 } # Usage: autoscaling_exit_standby # # Attempts to move instance out of Standby and into InService. Returns 0 if # successful. autoscaling_exit_standby() { local instance_id=$1 local asg_name=${2} msg "Checking if this instance has already been moved out of Standby state" local instance_state=$(get_instance_state_asg $instance_id) if [ $? != 0 ]; then msg "Unable to get this instance's lifecycle state." return 1 fi if [ "$instance_state" == "InService" ]; then msg "Instance is already InService; nothing to do." return 0 fi if [ "$instance_state" == "Pending:Wait" ]; then msg "Instance is Pending:Wait; nothing to do." return 0 fi msg "Moving instance $instance_id out of Standby" $AWS_CLI autoscaling exit-standby \ --instance-ids $instance_id \ --auto-scaling-group-name \"${asg_name}\" if [ $? != 0 ]; then msg "Failed to put instance $instance_id back into InService for ASG ${asg_name}." return 1 fi msg "Waiting for exit-standby to finish" wait_for_state "autoscaling" $instance_id "InService" if [ $? != 0 ]; then local wait_timeout=$(($WAITER_INTERVAL * $WAITER_ATTEMPTS)) msg "Instance $instance_id did not make it to InService after $wait_timeout seconds" return 1 fi if ! local tmp_flag_value=$(get_flag "asgmindecremented"); then error_exit "$FLAGFILE doesn't exist or is unreadable" elif [ "$tmp_flag_value" = "true" ]; then local min_desired=$($AWS_CLI autoscaling describe-auto-scaling-groups \ --auto-scaling-group-name \"${asg_name}\" \ --query \'AutoScalingGroups[0].[MinSize, DesiredCapacity]\' \ --output text) local min_cap=$(echo $min_desired | awk '{print $1}') local new_min=$(($min_cap + 1)) msg "Incrementing ASG ${asg_name}'s minimum size to $new_min" msg $($AWS_CLI autoscaling update-auto-scaling-group \ --auto-scaling-group-name \"${asg_name}\" \ --min-size $new_min) if [ $? != 0 ]; then msg "Failed to increase ASG ${asg_name}'s minimum size to $new_min." remove_flagfile return 1 else msg "Successfully incremented ASG ${asg_name}'s minimum size" fi else msg "Auto scaling group was not decremented previously, not incrementing min value" fi if [ "$HANDLE_PROCS" = "true" ]; then # Resume processes, except for the ones suspended before deployment resume_processes fi # Clean up the FLAGFILE remove_flagfile return 0 } # Usage: get_instance_state_asg # # Gets the state of the given as known by the AutoScaling group it's a part of. # Health is printed to STDOUT and the function returns 0. Otherwise, no output and return is # non-zero. get_instance_state_asg() { local instance_id=$1 local state=$($AWS_CLI autoscaling describe-auto-scaling-instances \ --instance-ids $instance_id \ --query \"AutoScalingInstances[?InstanceId == \'$instance_id\'].LifecycleState \| [0]\" \ --output text) if [ $? != 0 ]; then return 1 else echo $state return 0 fi } # Usage: reset_waiter_timeout # # Resets the timeout value to account for the ELB timeout and also connection draining. reset_waiter_timeout() { local elb=$1 local state_name=$2 if [ "$state_name" == "InService" ]; then # Wait for a health check to succeed local elb_info=$($AWS_CLI elb describe-load-balancers \ --load-balancer-name $elb \ --query \'LoadBalancerDescriptions[0].HealthCheck.[HealthyThreshold,Interval,Timeout]\' \ --output text) local health_check_threshold=$(echo $elb_info | awk '{print $1}') local health_check_interval=$(echo $elb_info | awk '{print $2}') local health_check_timeout=$(echo $elb_info | awk '{print $3}') local timeout=$((health_check_threshold * (health_check_interval + health_check_timeout))) elif [ "$state_name" == "OutOfService" ]; then # If connection draining is enabled, wait for connections to drain local draining_values=$($AWS_CLI elb describe-load-balancer-attributes \ --load-balancer-name $elb \ --query \'LoadBalancerAttributes.ConnectionDraining.[Enabled,Timeout]\' \ --output text) local draining_enabled=$(echo $draining_values | awk '{print $1}') local timeout=$(echo $draining_values | awk '{print $2}') if [ "$draining_enabled" != "True" ]; then timeout=0 fi else msg "Unknown state name, '$state_name'"; return 1; fi # Base register/deregister action may take up to about 30 seconds timeout=$((timeout + 30)) WAITER_ATTEMPTS=$((timeout / WAITER_INTERVAL)) } # Usage: wait_for_state [ELB name] # # Waits for the state of to be in as seen by . Returns 0 if # it successfully made it to that state; non-zero if not. By default, checks $WAITER_ATTEMPTS # times, every $WAITER_INTERVAL seconds. If giving an [ELB name] to check under, these are reset # to that ELB's timeout values. wait_for_state() { local service=$1 local instance_id=$2 local state_name=$3 local elb=$4 local instance_state_cmd if [ "$service" == "elb" ]; then instance_state_cmd="get_instance_health_elb $instance_id $elb" reset_waiter_timeout $elb $state_name if [ $? != 0 ]; then error_exit "Failed resetting waiter timeout for $elb" fi elif [ "$service" == "autoscaling" ]; then instance_state_cmd="get_instance_state_asg $instance_id" else msg "Cannot wait for instance state; unknown service type, '$service'" return 1 fi msg "Checking $WAITER_ATTEMPTS times, every $WAITER_INTERVAL seconds, for instance $instance_id to be in state $state_name" local instance_state=$($instance_state_cmd | tr -d '\n\r') local count=1 msg "Instance is currently in state: $instance_state" while [ "$instance_state" != "$state_name" ]; do if [ $count -ge $WAITER_ATTEMPTS ]; then local timeout=$(($WAITER_ATTEMPTS * $WAITER_INTERVAL)) msg "Instance failed to reach state, $state_name within $timeout seconds" return 1 fi sleep $WAITER_INTERVAL instance_state=$($instance_state_cmd | tr -d '\n\r') count=$(($count + 1)) msg "Instance is currently in state: $instance_state" done return 0 } # Usage: get_instance_health_elb # # Gets the health of the given as known by . If it's a valid health # status (one of InService|OutOfService|Unknown), then the health is printed to STDOUT and the # function returns 0. Otherwise, no output and return is non-zero. get_instance_health_elb() { local instance_id=$1 local elb_name=$2 msg "Checking status of instance '$instance_id' in load balancer '$elb_name'" # If describe-instance-health for this instance returns an error, then it's not part of # this ELB. But, if the call was successful let's still double check that the status is # valid. local instance_status=$($AWS_CLI elb describe-instance-health \ --load-balancer-name $elb_name \ --instances $instance_id \ --query \'InstanceStates[].State\' \ --output text 2>/dev/null) if [ $? == 0 ]; then case "$instance_status" in InService|OutOfService|Unknown) echo -n $instance_status return 0 ;; *) msg "Instance '$instance_id' not part of ELB '$elb_name'" return 1 esac fi } # Usage: validate_elb # # Validates that the Elastic Load Balancer with name exists, is describable, and # contains as one of its instances. # # If any of these checks are false, the function returns non-zero. validate_elb() { local instance_id=$1 local elb_name=$2 # Get the list of active instances for this LB. local elb_instances=$($AWS_CLI elb describe-load-balancers \ --load-balancer-name $elb_name \ --query \'LoadBalancerDescriptions[*].Instances[*].InstanceId\' \ --output text) if [ $? != 0 ]; then msg "Couldn't describe ELB instance named '$elb_name'" return 1 fi msg "Checking health of '$instance_id' as known by ELB '$elb_name'" local instance_health=$(get_instance_health_elb $instance_id $elb_name) if [ $? != 0 ]; then return 1 fi return 0 } # Usage: get_elb_list # # Finds all the ELBs that this instance is registered to. After execution, the variable # "ELB_LIST" will contain the list of load balancers for the given instance. # # If the given instance ID isn't found registered to any ELBs, the function returns non-zero get_elb_list() { local instance_id=$1 local elb_list="" elb_list=$($AWS_CLI elb describe-load-balancers \ --query \"LoadBalancerDescriptions[].[join\(\',\',Instances[?InstanceId==\'$instance_id\'].InstanceId\),LoadBalancerName]\" \ --output text \| grep $instance_id \| awk \'{ORS=\" \"\;print \$2}\') if [ -z "$elb_list" ]; then return 1 else msg "Got load balancer list of: $elb_list" ELB_LIST=$elb_list return 0 fi } # Usage: deregister_instance # # Deregisters from . deregister_instance() { local instance_id=$1 local elb_name=$2 $AWS_CLI elb deregister-instances-from-load-balancer \ --load-balancer-name $elb_name \ --instances $instance_id 1> /dev/null return $? } # Usage: register_instance # # Registers to . register_instance() { local instance_id=$1 local elb_name=$2 $AWS_CLI elb register-instances-with-load-balancer \ --load-balancer-name $elb_name \ --instances $instance_id 1> /dev/null return $? } # Usage: check_cli_version [version-to-check] [desired version] # # Without any arguments, checks that the installed version of the AWS CLI is at least at version # $MIN_CLI_VERSION. Returns non-zero if the version is not high enough. check_cli_version() { if [ -z $1 ]; then version=$($AWS_CLI --version 2\>\&1 \| cut -f1 -d\' \' \| cut -f2 -d/) else version=$1 fi if [ -z "$2" ]; then min_version=$MIN_CLI_VERSION else min_version=$2 fi x=$(echo $version | cut -f1 -d.) y=$(echo $version | cut -f2 -d.) z=$(echo $version | cut -f3 -d.) min_x=$(echo $min_version | cut -f1 -d.) min_y=$(echo $min_version | cut -f2 -d.) min_z=$(echo $min_version | cut -f3 -d.) msg "Checking minimum required CLI version (${min_version}) against installed version ($version)" if [ $x -lt $min_x ]; then return 1 elif [ $y -lt $min_y ]; then return 1 elif [ $y -gt $min_y ]; then return 0 elif [ $z -ge $min_z ]; then return 0 else return 1 fi } # Usage: msg # # Writes to STDERR only if $DEBUG is true, otherwise has no effect. msg() { local message=$1 $DEBUG && echo $message 1>&2 } # Usage: error_exit # # Writes to STDERR as a "fatal" and immediately exits the currently running script. error_exit() { local message=$1 echo "[FATAL] $message" 1>&2 exit 1 } # Usage: get_instance_id # # Writes to STDOUT the EC2 instance ID for the local instance. Returns non-zero if the local # instance metadata URL is inaccessible. get_instance_id() { curl -s http://169.254.169.254/latest/meta-data/instance-id return $? }