#!/bin/bash
#===============================================================================
# EMR Cluster Checks
#
# The checks in this file read facts about the cluster from variables that are
# not assigned here (cluster_*, nodes_*, subnet_*, INSTANCES_DATA, AWS_CLI) and
# publish their verdict through report_header / report_success /
# report_warning, which are likewise not defined in this section.
#===============================================================================

# number of days before we can consider a cluster long running
LONG_RUNNING_THRESHOLD=1

# Succeeds when $1 is a plain non-negative decimal number (e.g. 12 or 12.5).
_is_number() {
  [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]]
}

# Succeeds when $1 is numerically greater than $2.
# awk does the floating point comparison; non-numeric input counts as 0.
_num_gt() {
  awk -v a="$1" -v b="$2" 'BEGIN { exit !((a + 0) > (b + 0)) }'
}

# Run every basic cluster check under a single report header.
eval_cluster() {
  report_header "Cluster Basic Checks"
  _check_emr_release
  _check_long_running_cluster
  _check_bootstrap_actions
  _check_autoscaling
  _check_aws_tags
}

# Compare the running EMR release with the first release label returned by
# the AWS CLI for the same major branch.
# Globals: AWS_CLI, cluster_emr_release_major, cluster_emr_release_num (read)
#          emr_latest_release (written; left global as in the original)
_check_emr_release() {
  # AWS_CLI is intentionally unquoted: it may carry extra options.
  # cut -c5- drops the first four characters of each label (presumably the
  # "emr-" prefix); head keeps the first match, so this relies on the CLI
  # listing the newest release first.
  emr_latest_release=$(
    $AWS_CLI emr list-release-labels \
      | jq -r '.ReleaseLabels[]' \
      | cut -c5- \
      | grep "^${cluster_emr_release_major}" \
      | head -n 1
  )

  if [[ -z "$emr_latest_release" ]]; then
    report_warning "Unable to determine the latest EMR release for EMR ${cluster_emr_release_major}.x"
    return
  fi

  if [[ "$cluster_emr_release_num" == "$emr_latest_release" ]]; then
    report_success "You're already using the latest EMR release ($emr_latest_release) for EMR ${cluster_emr_release_major}.x"
  else
    report_warning "Your cluster is running $cluster_emr_release_num. Please consider upgrading to the latest EMR release for this branch: $emr_latest_release"
  fi
}

# For clusters older than LONG_RUNNING_THRESHOLD days, recommend Termination
# Protection and Multi Master when they are not enabled.
# Globals: cluster_running_days, cluster_termination_protected (read)
# Calls:   is_multimaster (defined outside this section; expected to print
#          "true" when the feature is on)
_check_long_running_cluster() {
  if [[ $cluster_running_days -gt $LONG_RUNNING_THRESHOLD ]]; then
    if [[ "$cluster_termination_protected" == "true" ]]; then
      report_success "Termination Protection is enabled"
    else
      report_warning "This cluster has been running for $cluster_running_days days and doesn't have Termination Protection enabled. Consider enabling the Termination Protection feature. https://docs.aws.amazon.com/emr/latest/ManagementGuide/UsingEMR_TerminationProtection.html"
    fi

    if [[ "$(is_multimaster)" == "true" ]]; then
      report_success "Multi Master feature enabled"
    else
      report_warning "This cluster has been running for $cluster_running_days days. Consider enabling the EMR Multi Master feature. https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-plan-ha-launch.html"
    fi
  else
    report_success "This cluster has been running for $cluster_running_days days. Skipping long running checks (Long Running if running > $LONG_RUNNING_THRESHOLD days)"
  fi
}

# Recommend Managed Scaling unless cluster_managed_scaling is "true".
_check_autoscaling() {
  if [[ "$cluster_managed_scaling" == "true" ]]; then
    report_success "Managed Scaling is enabled"
  else
    report_warning "Consider enabling Managed Scaling to reduce costs. https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-managed-scaling.html"
  fi
}

# Warn when cluster_tags is empty.
_check_aws_tags() {
  if [[ -n "$cluster_tags" ]]; then
    report_success "Cluster is using AWS Tags"
  else
    report_warning "The Cluster does not have any AWS Tag associated. Consider adding AWS Tags for auditing purposes. https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-plan-tags.html"
  fi
}

# Flag clusters with many Bootstrap Actions, or with a slow one.
# Globals: cluster_ba_count, cluster_ba_longest (read)
_check_bootstrap_actions() {
  local -r BA_THRESHOLD=2
  local -r BA_LONG_THRESHOLD=60 # seconds

  if [[ $cluster_ba_count -gt 0 ]]; then
    if [[ $cluster_ba_count -le $BA_THRESHOLD ]]; then
      report_success "You have only $cluster_ba_count Bootstrap Actions attached to the cluster"
    else
      report_warning "You have $cluster_ba_count Bootstrap Actions attached to the cluster. Consider using a Custom AMI with needed changes to improve startup time. https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-custom-ami.html"
    fi

    # check if BA logs have been deleted by logpusher: the duration is only
    # judged while the controller logs can still be read on this node
    if cat /var/log/bootstrap-actions/**/controller &>/dev/null; then
      if [[ $cluster_ba_longest -le $BA_LONG_THRESHOLD ]]; then
        report_success "Your longest Bootstrap Action took $cluster_ba_longest seconds to complete."
      else
        report_warning "Your longest Bootstrap Action took $cluster_ba_longest seconds to complete. Consider using a Custom AMI with needed changes to improve startup time. https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-custom-ami.html"
      fi
    else
      report_warning "Can't process Bootstrap Logs. Logs have been already pushed to S3 and deleted from the node."
    fi
  else
    report_success "No Bootstrap Action detected"
  fi
}

#===============================================================================
# Cluster Topology Checks
#===============================================================================

# min number of CORE nodes for a production cluster
CORE_NUM_THRESHOLD=2

# Run every topology check under a single report header.
eval_topology() {
  report_header "Cluster Topology Checks"
  _check_nodes_master
  _check_nodes_core
  _check_instances
  _check_nodes_disks
}

# Warn when MASTER nodes are not ON_DEMAND (reads nodes_master_market).
_check_nodes_master() {
  if [[ "$nodes_master_market" == "ON_DEMAND" ]]; then
    report_success "MASTER nodes are running as ON DEMAND instances"
  else
    report_warning "MASTER nodes are running on $nodes_master_market instances. Consider using only ON DEMAND instances for MASTER nodes."
  fi
}

# Check the CORE node count and purchasing option.
# Globals: nodes_core_number, nodes_core_market (read)
_check_nodes_core() {
  # -ge, not -gt: the messages speak of "at least" / "a minimum of" the
  # threshold, so a cluster with exactly that many nodes passes.
  if [[ $nodes_core_number -ge $CORE_NUM_THRESHOLD ]]; then
    report_success "There are at least $CORE_NUM_THRESHOLD CORE nodes in the cluster"
  else
    report_warning "You're using only $nodes_core_number CORE nodes. If this is a production cluster consider using a minimum of $CORE_NUM_THRESHOLD CORE nodes to increase resiliency to HDFS issues."
  fi

  if [[ "$nodes_core_market" == "ON_DEMAND" ]]; then
    report_success "CORE nodes are running as ON DEMAND instances"
  else
    report_warning "CORE nodes are running on $nodes_core_market instances. Consider using only ON DEMAND instances for CORE nodes."
  fi
}

# Placeholder: no TASK node checks yet; only prints an empty line.
_check_nodes_tasks() {
  echo
}

# Inspect the instance types of RUNNING instances in INSTANCES_DATA (JSON).
_check_instances() {
  local graviton_instances i_families
  local -r running='.Instances[] | select(.Status.State=="RUNNING")'

  # check if using graviton instances
  # NOTE(review): only types containing "6g" are recognised; other Graviton
  # generations would be reported as non-Graviton - confirm whether intended.
  graviton_instances=$(
    jq -r "$running | select(.InstanceType | contains(\"6g\"))" <<<"$INSTANCES_DATA"
  )
  if [[ -n "$graviton_instances" ]]; then
    report_success "Cluster using Graviton instances"
  else
    report_warning "Cluster not using Graviton instances. Consider switching to reduce costs and increase performance"
  fi

  # check if cluster is using instance types of different types: a family is
  # identified by the first letter of the instance type (m, r, c, ...)
  i_families=$(
    jq -r "$running | .InstanceType" <<<"$INSTANCES_DATA" \
      | cut -c1 \
      | sort -u \
      | wc -l
  )
  if [[ $i_families -gt 1 ]]; then
    report_warning "Cluster using different families. If this is a transient cluster running a single job in parallel, you might not be able to fully utilize all cluster resources. For this specific use cases select a single instance family."
  else
    report_success "Cluster using a single instance family"
  fi
}

# Compare the number of EBS volumes attached to RUNNING instances with a
# recommended count for the instance size.
# Globals: INSTANCES_DATA (read)
_check_nodes_disks() {
  # recommended number of volumes per instance size
  local -A instance_disk=(
    ['large']="1"
    ['xlarge']="2"
    ['2xlarge']="4"
    ['4xlarge']="4"
    ['8xlarge']="4"
    ['9xlarge']="4"
    ['10xlarge']="4"
    ['12xlarge']="4"
    ['16xlarge']="4"
    ['18xlarge']="4"
    ['24xlarge']="4"
  )
  local group_id i_type i_disks i_size wanted

  # One tab-separated row per distinct (group/fleet id, type, volume count).
  # Iterating over the rows instead of looking each id up again keeps groups
  # or fleets that mix several instance types from being merged into a
  # single garbled entry.
  while IFS=$'\t' read -r group_id i_type i_disks; do
    i_size=${i_type#*.}
    # sizes missing from the table have no recommendation and always pass
    wanted=${instance_disk[$i_size]:-0}

    if ((i_disks >= wanted)); then
      report_success "$group_id - $i_type has $i_disks volumes attached"
    else
      report_warning "$group_id - $i_type has $i_disks volumes attached. Consider using $wanted to increase performance"
    fi
  done < <(
    jq -r '
      [ .Instances[]
        | select(.Status.State=="RUNNING")
        | { Id: (if has("InstanceGroupId") then .InstanceGroupId else .InstanceFleetId end),
            InstanceType: .InstanceType,
            State: .Status.State,
            EbsVolumes: (.EbsVolumes|length) } ]
      | unique
      | .[]
      | "\(.Id)\t\(.InstanceType)\t\(.EbsVolumes)"
    ' <<<"$INSTANCES_DATA"
  )
}

#===============================================================================
# Cluster Networking Checks
#===============================================================================
IP_THRESHOLD=20

# Run every networking check under a single report header.
eval_networking() {
  report_header "Cluster Networking Checks"
  _check_subnet_ip
  _check_s3_gw
}

# Warn when the subnet has fewer than IP_THRESHOLD free addresses.
_check_subnet_ip() {
  if [[ $subnet_available_ip -lt $IP_THRESHOLD ]]; then
    report_warning "Your subnet has only $subnet_available_ip available IP. Consider increasing the subnet mask"
  else
    report_success "You have $subnet_available_ip available IP"
  fi
}

# Recommend an S3 Gateway Endpoint unless subnet_has_s3_gw is "true".
_check_s3_gw() {
  if [[ "$subnet_has_s3_gw" == "true" ]]; then
    report_success "Your subnet has an S3 Gateway Endpoint"
  else
    report_warning "Your subnet doesn't have an S3 Gateway Endpoint. Consider adding one to improve performance. https://docs.aws.amazon.com/vpc/latest/privatelink/vpc-endpoints-s3.html"
  fi
}

#===============================================================================
# Master Node Checks
#===============================================================================
MEM_THRESHOLD=85

# Run every local-node check under a single report header.
eval_system() {
  report_header "Cluster Master Node Checks"
  _check_memory
}

# Check memory pressure on the node running this script: current usage from
# `free` and OOM reaper activity from the kernel log (needs sudo for dmesg).
_check_memory() {
  local used_mem_perc oom_count

  # used / total of the "Mem" row, as a percentage
  used_mem_perc=$(free | awk '/Mem/ { print $3/$2 * 100.0 }')
  if _num_gt "$used_mem_perc" "$MEM_THRESHOLD"; then
    report_warning "Your memory utilization is greater than $MEM_THRESHOLD%. Consider using an instance with more available memory"
  else
    report_success "Memory utilization is lower than $MEM_THRESHOLD%"
  fi

  oom_count=$(sudo dmesg | grep -c oom_reaper)
  if [[ $oom_count -gt 0 ]]; then
    report_warning "Detected $oom_count Out Of Memory issues. Some processes were killed by the kernel due to the lack of memory. Consider using an instance with more RAM"
  else
    report_success "No Out Of Memory issues detected"
  fi
}

#===============================================================================
# Application Frameworks Checks
#===============================================================================

# Run every framework check under a single report header.
eval_frameworks() {
  report_header "Cluster Frameworks Checks"
  _check_hdfs
}

# HDFS
# Check replication factor, space utilization and under replicated files
# using the hdfs command line tool.
# Globals: nodes_core_number (read)
#          cluster_hdfs_replication (written; left global as in the original)
_check_hdfs() {
  local -r HDFS_USAGE_PERCENTAGE=80    # 80%
  local -r HDFS_USAGE_BYTES=536870912000 # 500GB
  local hdfs_utilization hdfs_used_bytes under_replica_files

  cluster_hdfs_replication=$(hdfs getconf -confKey dfs.replication 2>/dev/null)
  if [[ $cluster_hdfs_replication -gt 1 ]]; then
    report_success "You have a minimal HDFS replication factor of $cluster_hdfs_replication"
  else
    report_warning "You have a minimal HDFS replication factor of $cluster_hdfs_replication. Consider increasing it, by tuning the hdfs-site configuration dfs.replication. For more details, see https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hdfs-config.html"
  fi

  if [[ $cluster_hdfs_replication -le $nodes_core_number ]]; then
    report_success "You have sufficient number of CORE nodes to replicate HDFS blocks"
  else
    report_warning "You have a minimal HDFS replication factor of $cluster_hdfs_replication, which is greater than the number of CORE nodes ($nodes_core_number). You will not be able to replicate your data. Consider increasing the number of CORE nodes at least to $cluster_hdfs_replication"
  fi

  # first "DFS Used%" line of the report, without the trailing percent sign
  hdfs_utilization=$(
    hdfs dfsadmin -report 2>/dev/null \
      | sed -nE 's/DFS Used%: (.*)%$/\1/p' \
      | head -n 1
  )
  if ! _is_number "$hdfs_utilization"; then
    report_warning "Unable to determine HDFS utilization. Skipping HDFS utilization check"
  elif _num_gt "$HDFS_USAGE_PERCENTAGE" "$hdfs_utilization"; then
    report_success "HDFS utilization is $hdfs_utilization%, which is lower than alert threshold $HDFS_USAGE_PERCENTAGE%"
  else
    report_warning "HDFS utilization is greater than $HDFS_USAGE_PERCENTAGE%. Consider adding more CORE nodes before running out of space"
  fi

  # first column of `hdfs dfs -du -s /`, compared with HDFS_USAGE_BYTES
  hdfs_used_bytes=$(hdfs dfs -du -s / 2>/dev/null | awk '{ print $1 }')
  if [[ ! "$hdfs_used_bytes" =~ ^[0-9]+$ ]]; then
    report_warning "Unable to determine HDFS usage. Skipping HDFS usage pattern check"
  elif ((hdfs_used_bytes < HDFS_USAGE_BYTES)); then
    report_success "HDFS utilization pattern low ( usage < 500 GB)"
  else
    report_warning "HDFS utilization pattern high ( usage > 500 GB). Consider storing your data on Amazon S3"
  fi

  # Count distinct paths (text before the first ':') on lines mentioning
  # 'Under replicated', so a path that shows up on several lines is counted
  # once, matching the "files" wording of the message.
  under_replica_files=$(
    hdfs fsck / 2>/dev/null \
      | awk -F':' '/Under replicated/ && !seen[$1]++ { n++ } END { print n + 0 }'
  )
  if [[ $under_replica_files -eq 0 ]]; then
    report_success "There are no files with under replicated blocks"
  else
    report_warning "You have $under_replica_files files with under replicated blocks. Use the 'emr hdfs' command to generate a detailed report"
  fi
}