#!/bin/bash # This script is executed every time the NE resources configuration service # is restarted. # Its purpose is to reserve the requested memory and CPUs (specified in # /etc/nitro_enclaves/allocator.yaml). # Set language to English. This way the parsing logic from the current script # works properly when a different language is set in the locale configuration # (e.g. /etc/locale.conf). LANG=en_US.UTF-8 # The file which holds the CPU pool. CPU_POOL_FILE="/sys/module/nitro_enclaves/parameters/ne_cpus" # Flag for deciding whether to print stdout messages or not. VERBOSE="1" # Path to the allocator config file. CONFIG_FILE_PATH="$NITRO_CLI_INSTALL_DIR/etc/nitro_enclaves/allocator.yaml" # Config variables to be populated when parsing the config file. declare -A CONFIG CONFIG[memory_mib]= CONFIG[cpu_count]= CONFIG[cpu_pool]= # Error codes. ERR_EMPTY_CPU_POOL=200 ERR_MISSING_NODE=201 ERR_INVALID_NODE=202 ERR_CLEAR_PAGE=203 ERR_SET_PAGE=204 ERR_ROLLBACK_PAGE=205 ERR_INSUFFICIENT_MEMORY=206 # Print stdout messages only if the `VERBOSE` flag is set. function print { if [ "$VERBOSE" -eq 1 ]; then echo "$@" fi } # Check if a provided string is a positive integer. function check_if_number { [[ "$1" =~ ^[0-9]+$ ]] } # Print an error message and fail. The argument may be: # - A string, which is printed as-is. # - An error code, which indicates a specific error message. function fail { local rc # Check if the argument is a numeric error code or a message string. set +e check_if_number "$1" rc=$? set +e if [ "$rc" -eq 0 ]; then echo -n "Error: " case "$1" in "$ERR_EMPTY_CPU_POOL") echo "No CPUs are off-line. Please configure the CPU pool first." ;; "$ERR_MISSING_NODE") echo "Failed to find NUMA node for a CPU. This indicates an invalid SysFS configuration." ;; "$ERR_INVALID_NODE") echo "Invalid NUMA node for a CPU. This indicates an invalid CPU ID." ;; "$ERR_CLEAR_PAGE") echo "Failed to clear huge page(s). Some pages may be in use." ;; "$ERR_SET_PAGE") echo "Failed to set a number of huge pages. This may indicate insufficient system resources." ;; "$ERR_ROLLBACK_PAGE") echo "Failed to roll back a number of huge pages. Some pages may be in use." ;; "$ERR_INSUFFICIENT_MEMORY") echo "Failed to configure entire amount of requested memory. This indicates insufficient system resources." ;; \?) echo "An unknown error has occurred: $1" ;; esac else echo "Error: $1" fi exit 1 } # Parse the NE allocator configuration file and populate $CONFIG[] # Note: this is a trivial yaml parser, able to recognize only a # root-level object with key-value pairs. function parse_config { local count=0 local skip=true while read line; do count=$((count+1)) # Skip comment lines echo "$line" | egrep -q "^\\s*#" && continue # Skip everything before the YAML data start. [[ "$line" = "---" ]] && { skip=false; continue; } [[ $skip = false ]] || continue local key key=$(echo "$line" | cut -d: -f1 | egrep -io "^([-_a-z0-9])+") [[ -z ${CONFIG[$key]+x} ]] && fail "Error in $CONFIG_FILE_PATH:$count - unexpected: $key" local value value=$(echo "$line" | sed -E "s/^.+://" | sed -E "s/\\s+\"?//" | sed -E "s/\"?\$//") [[ -z $value ]] && fail "Error in $CONFIG_FILE_PATH:$count - missing value for $key" CONFIG[$key]="$value" done < "$CONFIG_FILE_PATH" # Some trivial config validation. [[ -z ${CONFIG[memory_mib]} ]] && \ fail "Config error: missing memory reservation (\`memory_mib\`)." [[ -z ${CONFIG[cpu_count]} ]] && [[ -z ${CONFIG[cpu_pool]} ]] && \ fail "Config error: missing CPU reservation (either \`cpu_count\` or \`cpu_pool\`)." [[ ! -z ${CONFIG[cpu_count]} ]] && [[ ! -z ${CONFIG[cpu_pool]} ]] && \ fail "Config error: \`cpu_count\` conflicts with \`cpu_pool\`." } # Set all CPUs online before trying to reconfigure the CPU pool (in case the service is restarted). function enable_offline_cpus { echo "" > /sys/module/nitro_enclaves/parameters/ne_cpus 2> .tmp_file err_info=$(cat .tmp_file) rm .tmp_file # Check error code in order to determine if there are any enclaves running if [[ $err_info == *"Operation not permitted"* ]]; then fail "Could not re-online CPUs because there is at least one enclave currently running. Make sure you stop all enclaves prior to re-starting the nitro-enclaves-allocator.service." fi } # Configure the CPU pool. function configure_cpu_pool { [ -f "$CPU_POOL_FILE" ] || fail "The CPU pool file is missing. Please make sure the Nitro Enclaves driver is inserted." print "Configuring the enclave CPU pool..." echo $1 > $CPU_POOL_FILE || fail "Failed to configure the CPU pool." print "Done." } # Identify all CPU IDs that have been off-lined for enclave use. function get_enclave_cpus { [ -f "$CPU_POOL_FILE" ] || return # Split the CPU configuration into CPU groups. IFS=',' read -r -a cpu_groups <<< "$(cat "$CPU_POOL_FILE")" for cpu_group in "${cpu_groups[@]}" do # Print each individual CPU from each group. cpu_start=$(echo "$cpu_group" | cut -d'-' -f1) cpu_end=$(echo "$cpu_group" | cut -d'-' -f2) for cpu_id in $(seq "$cpu_start" "$cpu_end") do echo "$cpu_id" done done } # Determine the NUMA node which contains the enclave-available CPUs. If no arguments are given, # the CPU pool is taken from the pool file, which must have been configurated earlier. function get_numa_node_from_cpus { local offline_cpus="$*" local numa_node="" set +u [ -n "$offline_cpus" ] || offline_cpus="$(get_enclave_cpus)" set -u [ -n "$offline_cpus" ] || return $ERR_EMPTY_CPU_POOL # Next, check the NUMA node for each CPU. for cpu_id in $offline_cpus do node=$(basename "$(file /sys/devices/system/cpu/cpu"$cpu_id"/node* | cut -d':' -f1)") [ -n "$node" ] || return $ERR_MISSING_NODE # Ensure the NUMA node is the same for all off-line CPUs. if [ -z "$numa_node" ] then numa_node="$node" else [ "$numa_node" == "$node" ] || return $ERR_INVALID_NODE fi done # Set and validate the target NUMA node. NUMA_NODE="$numa_node" [ -n "$NUMA_NODE" ] || return $ERR_MISSING_NODE } # Obtain a list of supported hugepage sizes, normalized to byte values. function get_hugepage_sizes { # Make sure the NUMA node is set. [ -n "$NUMA_NODE" ] || return hugepage_sizes=$(file /sys/devices/system/node/"$NUMA_NODE"/hugepages/hugepages-* | cut -d':' -f1) for hugepage in $hugepage_sizes do # Retain only the dimension of the hugepage. hugepage=$(basename "$hugepage" | cut -d'-' -f2) # Get the size of the huge page. page_size=$(echo "$hugepage" | tr -dc '0-9') # Get the multiplier (kB, mB etc.) in upper-case page_multiplier=$(echo "$hugepage" | tr -d '0-9') page_multiplier="${page_multiplier^}" case "$page_multiplier" in "KB") page_size=$((page_size * (1 << 10))) ;; "MB") page_size=$((page_size * (1 << 20))) ;; "GB") page_size=$((page_size * (1 << 30))) ;; esac # Export a "key:value" pair. The key is needed for later indexing. echo "$hugepage:$page_size" done } # Set all previous hugepage numbers to 0. This is needed in order to guarantee allocation starts # from a clean state. We also need to avoid the following scenario: assume a previous allocation of # 1 x 1 GB page and an incoming allocation request of only 300 MB in 150 x 2 MB pages. If we don't # clear the 1 GB page first and only attempt to set the 2 MB page count, we may either run out of # memory or end up reserving (far) more than we need (in this case, 1.3 GB instead of 300 MB). function clear_previous_hugepage_reservations { local hugepage_sizes="$1" for hugepage_entry in $hugepage_sizes do page_key=$(echo "$hugepage_entry" | cut -d':' -f1) echo 0 > /sys/devices/system/node/$NUMA_NODE/hugepages/hugepages-$page_key/nr_hugepages || return $ERR_CLEAR_PAGE done } # Partition the requested memory size among the supported hugepages. # The algorithm always picks the largest hugepage size first, selecting as many pages as possible # without exceeding the requested memory. Only the smallest hugepage size is used if it is # necessary to exceed the requested memory. Depending on available resources, it may not always be # possible to actually allocate a desired number of a specific hugepage size; in this case, the # algorithm allocates as may as possible then proceeds to the next smaller size and tries again. function set_required_hugepages { declare -A rollback_sizes local remaining_memory="$(($1 * (1 << 20)))" local hugepage_sizes local rc shift 1 # Sort the hugepage sizes in descending order. hugepage_sizes=$(echo "$@" | tr ' ' '\n' | sort -n -r) # Ensure the requested memory is always a multiple of the smallest hugepage size. smallest_hugepage_size=$(echo "$hugepage_sizes" | tr ' ' '\n' | tail -n1 | cut -d':' -f2) remaining_memory=$(((1 + (remaining_memory - 1) / smallest_hugepage_size) * smallest_hugepage_size)) # Store the existing number of hugepages for the given size, in case roll-back is necessary. for hugepage_entry in $hugepage_sizes do page_key=$(echo "$hugepage_entry" | cut -d':' -f1) rollback_size="$(cat /sys/devices/system/node/"$NUMA_NODE"/hugepages/hugepages-"$page_key"/nr_hugepages)" rollback_sizes["$page_key"]="$rollback_size" done # Clear the previous page reservations. clear_previous_hugepage_reservations "$hugepage_sizes" rc=$? [ "$rc" -eq 0 ] || return $rc # Attempt to reserve the necessary hugepages. for hugepage_entry in $hugepage_sizes do page_key=$(echo "$hugepage_entry" | cut -d':' -f1) page_size=$(echo "$hugepage_entry" | cut -d':' -f2) if [ "$page_size" -le 0 ]; then print "Invalid page size found ($page_size bytes). Skipping pages of type: $page_key." continue fi needed_num_pages=$((remaining_memory / page_size)) # Attempt to set the required number of hugepages of the current size. echo $needed_num_pages > /sys/devices/system/node/$NUMA_NODE/hugepages/hugepages-$page_key/nr_hugepages || return $ERR_SET_PAGE # Read the actual number of pages that have been set (this is dependent on the available system resources). actual_num_pages=$(cat /sys/devices/system/node/"$NUMA_NODE"/hugepages/hugepages-"$page_key"/nr_hugepages) [ "$actual_num_pages" -eq 0 ] || print "- Reserved $actual_num_pages pages of type: $page_key." # Calculate the remaining memory that needs to be handled by smaller hugepage sizes. remaining_memory=$((remaining_memory - page_size * actual_num_pages)) # Break early if there's no remaining memory to configure. [ "$remaining_memory" -ne 0 ] || break done # At this point, we fail if we have allocated too much or too little memory. The latter case indicates that we have ran out of resources. if [ "$remaining_memory" -ne 0 ] then print "Memory configuration failed, rolling back memory reservations..." # Again, clear the previous reservations in order to perform the roll-back. clear_previous_hugepage_reservations "$hugepage_sizes" rc=$? [ "$rc" -eq 0 ] || return $rc # Roll back memory reservations in case of failure. for page_key in "${!rollback_sizes[@]}" do # Set the previous number of hugepages for the current size. echo ${rollback_sizes[$page_key]} > /sys/devices/system/node/$NUMA_NODE/hugepages/hugepages-$page_key/nr_hugepages || return $ERR_SET_PAGE done # Verify that the number of hugepages has actually been set. for page_key in "${!rollback_sizes[@]}" do crt_num_pages=$(cat /sys/devices/system/node/"$NUMA_NODE"/hugepages/hugepages-"$page_key"/nr_hugepages) [ "${rollback_sizes[$page_key]}" -eq "$crt_num_pages" ] || return $ERR_ROLLBACK_PAGE done fi [ "$remaining_memory" -eq 0 ] || return $ERR_INSUFFICIENT_MEMORY } # Configure the needed number of huge pages. function try_configure_huge_pages { local needed_mem local rc print "Configuring the huge page memory..." # Get the requested memory, trimming starting and ending whitespace. needed_mem="$1" shift 1 check_if_number "$needed_mem" || fail "The needed memory amount ($needed_mem) is invalid." # Get the NUMA node which contains the CPUs. get_numa_node_from_cpus "$@" rc=$? [ "$rc" -eq 0 ] || return $rc # Get the available huge page sizes in bytes. set_required_hugepages "$needed_mem" "$(get_hugepage_sizes)" rc=$? [ "$rc" -eq 0 ] || return $rc print "Done." } # Configure the needed number of huge pages on the same NUMA node as the provided CPU list. # $1 - The amount of requested memory, in MB. # $2 - The list of CPUs needed to determine the target NUMA node (optional). function configure_huge_pages { local rc # The "-e" flag is temporarily disabled since we don't always want the entire script to fail # if the memory hasn't been successfully configured. set +e try_configure_huge_pages "$@" rc=$? set -e return $rc } # Configure the CPU pool using the provided CPU count. # Auto-generate a CPU pool given the following conditions: # * All the CPUs need to be from the same NUMA node. # * CPU 0 and its siblings need to remain available to the primary / parent VM. # * Full CPU core(s) need(s) to be included in the CPU pool. function configure_cpu_pool_by_cpu_count { local core_id="" local cpu_0_numa_node="" local cpu_pool="" local cpu_pool_array=() local cpu_pool_count="$1" local cpus_per_numa_node="" local nr_cpus="" local nr_cpus_per_numa_node="" local nr_numa_nodes="" local nr_threads_per_core="" local threads_per_core="" local threads_per_core_count="" local memory_request="$2" local rc_configure_huge_pages="" # Ensure the CPU pool file is present. [ -f "$CPU_POOL_FILE" ] || fail "The CPU pool file is missing. Please make sure the Nitro Enclaves driver is inserted." print "Auto-generating the enclave CPU pool by using the CPU count..." # Get the number of available CPUs, CPU threads (siblings) per core and the NUMA nodes count. nr_cpus="$(lscpu | grep "^CPU(s):" | cut -d ":" -f 2 | tr -d " \t")" [ -z "$nr_cpus" ] && fail "Failed to get the number of CPUs." nr_numa_nodes="$(lscpu | grep "^NUMA node(s):" | cut -d ":" -f 2 | tr -d " \t")" [ -z "$nr_numa_nodes" ] && fail "Failed to get the number of available NUMA nodes." nr_threads_per_core="$(lscpu | grep "^Thread(s) per core:" | cut -d ":" -f 2 | tr -d " \t")" [ -z "$nr_threads_per_core" ] && fail "Failed to get the number of threads per core." # CPU 0 and its siblings need to remain available to the primary / parent VM. # Get its NUMA node to count for remaining CPUs in this NUMA node. cpu_0_numa_node="$(lscpu -p=cpu,node | grep -v "#" | grep "^0," | cut -d "," -f 2)" [ -z "$cpu_0_numa_node" ] && fail "Failed to get the NUMA node of CPU 0." # Sanity check the given CPU count for the NE CPU pool. check_if_number "$cpu_pool_count" || fail "The CPU count value ($cpu_pool_count) is invalid." [ "$cpu_pool_count" -gt 0 ] || fail "Provided CPU count is not higher than 0." [ "$cpu_pool_count" -le "$nr_cpus" ] || \ fail "Provided CPU count is higher than available CPUs - $nr_cpus." [ $((cpu_pool_count % nr_threads_per_core)) -eq 0 ] || \ fail "The CPU count is not multiple of $nr_threads_per_core (threads per core)." # Iterate through each NUMA node and try to get a CPU pool that matches all requirements. # This also includes the amount of memory that has been requested, if any. for (( numa_node=0; numa_node<"$nr_numa_nodes"; numa_node++ )) do cpu_pool_array=() nr_cpus_per_numa_node="$(lscpu -p=node | grep -v "#" | grep -c "^$numa_node$")" if [ -z "$nr_cpus_per_numa_node" ] ; then continue fi # Skip CPU 0 and its siblings. if [ "$numa_node" -eq "$cpu_0_numa_node" ] ; then nr_cpus_per_numa_node=$((nr_cpus_per_numa_node - nr_threads_per_core)) fi if [ "$cpu_pool_count" -gt "$nr_cpus_per_numa_node" ] ; then continue fi cpus_per_numa_node="$(lscpu -p=cpu,node | grep -v "#" | grep ",$numa_node$" | cut -d "," -f 1)" [ -z "$cpus_per_numa_node" ] && \ fail "Failed to get the available CPUs of NUMA node $numa_node." # Iterate through each CPU from the current NUMA node and find full CPU cores # to add to the CPU pool. while read -r cpu_per_numa_node do # Skip CPU 0. if [ "$cpu_per_numa_node" -eq 0 ] ; then continue fi # Get all the CPU threads (siblings) from a CPU core. core_id="$(lscpu -p=cpu,core | grep -v "#" | grep "^$cpu_per_numa_node," | cut -d "," -f 2)" [ -z "$core_id" ] && fail "Failed to get the core id for CPU $cpu_per_numa_node." threads_per_core="$(lscpu -p=cpu,core | grep -v "#" | grep -v "^0," | grep ",$core_id$" | cut -d "," -f 1)" [ -z "$threads_per_core" ] && fail "Failed to get the threads for CPU core $core_id." threads_per_core_count="$(lscpu -p=cpu,core | grep -v "#" | grep -v "^0," | grep -c ",$core_id$")" # Check if full CPU core. if [ "$threads_per_core_count" -ne "$nr_threads_per_core" ] ; then continue fi # Include the CPU core in the CPU pool. while read -r cpu_thread do if [ "${#cpu_pool_array[@]}" -eq 0 ] ; then cpu_pool_array=("$cpu_thread") continue fi for cpu in "${cpu_pool_array[@]}" do if [ "$cpu_thread" -eq "$cpu" ] ; then continue 2 fi done cpu_pool_array=("${cpu_pool_array[@]}" "$cpu_thread") done < <(echo "$threads_per_core") # Found a CPU pool that matches all the necessary conditions. # Exit early only if the memory requirements are also satisfied. If not, continue # and try with the next NUMA node. if [ "${#cpu_pool_array[@]}" -eq "$cpu_pool_count" ]; then [ -n "$memory_request" ] || break 2 print "Will try to reserve $memory_request MB of memory on node $numa_node." if configure_huge_pages "$memory_request" "${cpu_pool_array[@]}"; then rc_configure_huge_pages="$?" break 2 else rc_configure_huge_pages="$?" fi break 1 fi done < <(echo "$cpus_per_numa_node") done # Not enough CPUs found to be added in the NE CPU pool. [ "${#cpu_pool_array[@]}" -ne "$cpu_pool_count" ] && \ fail "Failed to find suitable CPUs for the Nitro Enclaves CPU pool after checking all NUMA nodes." # Hugepages configuration failed. [ "$rc_configure_huge_pages" != "0" ] && fail "$rc_configure_huge_pages" for cpu in "${cpu_pool_array[@]}" do if [ -z "$cpu_pool" ] ; then cpu_pool="$cpu" continue fi cpu_pool="$cpu_pool,$cpu" done print "Auto-generated the enclave CPU pool: $cpu_pool." configure_cpu_pool "$cpu_pool" } function main() { parse_config local cpu_msg if [[ ! -z ${CONFIG[cpu_count]} ]]; then enable_offline_cpus cpu_msg="${CONFIG[cpu_count]} CPUs" configure_cpu_pool_by_cpu_count "${CONFIG[cpu_count]}" "${CONFIG[memory_mib]}" elif [[ ! -z ${CONFIG[cpu_pool]} ]]; then enable_offline_cpus cpu_msg="CPU pool: ${CONFIG[cpu_pool]}" configure_cpu_pool "${CONFIG[cpu_pool]}" if configure_huge_pages "${CONFIG[memory_mib]}"; then true else fail "$?" fi else fail "Config error: missing CPU reservation." fi print "Successfully allocated Nitro Enclaves resources: ${CONFIG[memory_mib]} MiB, $cpu_msg" } main "$@"