#!/usr/bin/env bash
#
# Sets up Amazon EC2 Instance Store NVMe disks, either as a single RAID-0
# array ("raid0") backing the kubelet/containerd state directories, or as
# individually formatted and mounted filesystems ("mount").
#
# Usage: setup-local-disks [-d DIR] <raid0|mount>
# Must be run as root. Exits 0 (no-op) when no instance-store disks exist.

set -o errexit
set -o pipefail
set -o nounset

err_report() {
  echo "Exited with error on line $1" >&2
}
trap 'err_report $LINENO' ERR

print_help() {
  echo "usage: $0 [-d|--dir DIR] <raid0|mount>"
  echo "Sets up Amazon EC2 Instance Store NVMe disks"
  echo ""
  echo "-d, --dir directory to mount the filesystem(s) (default: /mnt/k8s-disks/)"
  echo "-h, --help print this help"
}

# Sets up a RAID-0 of NVMe instance storage disks, moves
# the contents of /var/lib/kubelet and /var/lib/containerd
# to the new mounted RAID, and bind mounts the kubelet and
# containerd state directories.
#
# Globals:   MNT_DIR (read), EPHEMERAL_DISKS (read)
# Outputs:   progress messages to stdout; systemd unit files under /etc/systemd/system
maybe_raid0() {
  local md_name="kubernetes"
  local md_device="/dev/md/${md_name}"
  local md_config="/.aws/mdadm.conf"
  local array_mount_point="${MNT_DIR}/0"
  mkdir -p "$(dirname "${md_config}")"

  # Only create the array on first boot; the saved mdadm config marks it done.
  if [[ ! -s "${md_config}" ]]; then
    mdadm --create --force --verbose \
      "${md_device}" \
      --level=0 \
      --name="${md_name}" \
      --raid-devices="${#EPHEMERAL_DISKS[@]}" \
      "${EPHEMERAL_DISKS[@]}"
    while mdadm --detail "${md_device}" | grep -qioE 'State :.*resyncing'; do
      echo "Raid is resyncing..."
      sleep 1
    done
    mdadm --detail --scan > "${md_config}"
  fi

  ## Check if the device symlink has changed on reboot to include a homehost identifier
  local current_md_device
  current_md_device=$(find /dev/md/ -type l -regex ".*/${md_name}_?[0-9a-z]*$" | tail -n1)
  if [[ -n "${current_md_device}" ]]; then
    md_device="${current_md_device}"
  fi

  # Format the array if not already formatted.
  if [[ -z "$(lsblk "${md_device}" -o fstype --noheadings)" ]]; then
    ## By default, mkfs tries to use the stripe unit of the array (512k)
    ## for the log stripe unit, but the max log stripe unit is 256k.
    ## So instead, we use 32k (8 blocks) to avoid a warning of breaching the max.
    ## mkfs.xfs defaults to 32k after logging the warning since the default log buffer size is 32k.
    mkfs.xfs -l su=8b "${md_device}"
  fi

  ## Create the mount directory
  mkdir -p "${array_mount_point}"

  # Mount by UUID so the unit survives md device renames across reboots.
  local dev_uuid
  dev_uuid=$(blkid -s UUID -o value "${md_device}")
  local mount_unit_name
  mount_unit_name="$(systemd-escape --path --suffix=mount "${array_mount_point}")"
  cat > "/etc/systemd/system/${mount_unit_name}" << EOF
[Unit]
Description=Mount EC2 Instance Store NVMe disk RAID0
[Mount]
What=UUID=${dev_uuid}
Where=${array_mount_point}
Type=xfs
Options=defaults,noatime
[Install]
WantedBy=multi-user.target
EOF
  systemd-analyze verify "${mount_unit_name}"
  systemctl enable "${mount_unit_name}" --now

  prev_running=""
  needs_linked=""
  for unit in "kubelet" "containerd"; do
    ## Check if the bind mount from the RAID already exists
    if [[ "$(systemctl is-active "var-lib-${unit}.mount")" != "active" ]]; then
      # Check if components that depend on the RAID are running and, if so, stop them
      if systemctl is-active "${unit}" > /dev/null 2>&1; then
        prev_running+=" ${unit}"
      fi
      needs_linked+=" /var/lib/${unit}"
    fi
  done

  ## Check if /var/log/pods has been bind mounted and make sure kubelet is stopped.
  ## NOTE: this previously appended "${unit}" — a stale value from the loop above
  ## (always "containerd") — so kubelet was never stopped before the copy.
  if [[ "$(systemctl is-active var-log-pods.mount)" != "active" ]]; then
    if systemctl is-active "kubelet" > /dev/null 2>&1; then
      prev_running+=" kubelet"
    fi
    needs_linked+=" /var/log/pods"
  fi

  if [[ -n "${prev_running}" ]]; then
    # Intentional word-splitting: prev_running is a space-separated unit list.
    # shellcheck disable=SC2086
    systemctl stop ${prev_running}
  fi

  # Transfer state directories to the array, if they exist.
  # Intentional word-splitting: needs_linked is a space-separated path list.
  # shellcheck disable=SC2086
  for mount_point in ${needs_linked}; do
    local unit
    unit="$(basename "${mount_point}")"
    local array_mount_point_unit="${array_mount_point}/${unit}"
    mkdir -p "${mount_point}"
    echo "Copying ${mount_point}/ to ${array_mount_point_unit}/"
    cp -a "${mount_point}/" "${array_mount_point_unit}/"
    local mount_unit_name
    mount_unit_name="$(systemd-escape --path --suffix=mount "${mount_point}")"
    cat > "/etc/systemd/system/${mount_unit_name}" << EOF
[Unit]
Description=Mount ${unit} on EC2 Instance Store NVMe RAID0
[Mount]
What=${array_mount_point_unit}
Where=${mount_point}
Type=none
Options=bind
[Install]
WantedBy=multi-user.target
EOF
    systemd-analyze verify "${mount_unit_name}"
    systemctl enable "${mount_unit_name}" --now
  done

  if [[ -n "${prev_running}" ]]; then
    # shellcheck disable=SC2086
    systemctl start ${prev_running}
  fi
}

# Mounts and creates xfs file systems on all EC2 instance store NVMe disks
# without existing file systems. Mounts in /mnt/k8s-disks/{1..} by default
#
# Globals:   MNT_DIR (read), EPHEMERAL_DISKS (read)
# Outputs:   progress messages to stdout; systemd unit files under /etc/systemd/system
maybe_mount() {
  idx=1
  for dev in "${EPHEMERAL_DISKS[@]}"; do
    # Format only disks with no existing filesystem.
    if [[ -z "$(lsblk "${dev}" -o fstype --noheadings)" ]]; then
      ## 32k log stripe unit: see comment in maybe_raid0 about the 256k maximum.
      mkfs.xfs -l su=8b "${dev}"
    fi
    if [[ -n "$(lsblk "${dev}" -o MOUNTPOINT --noheadings)" ]]; then
      echo "${dev} is already mounted."
      continue
    fi
    local mount_point="${MNT_DIR}/${idx}"
    local mount_unit_name
    mount_unit_name="$(systemd-escape --path --suffix=mount "${mount_point}")"
    mkdir -p "${mount_point}"
    cat > "/etc/systemd/system/${mount_unit_name}" << EOF
[Unit]
Description=Mount EC2 Instance Store NVMe disk ${idx}
[Mount]
What=${dev}
Where=${mount_point}
Type=xfs
Options=defaults,noatime
[Install]
WantedBy=multi-user.target
EOF
    systemd-analyze verify "${mount_unit_name}"
    systemctl enable "${mount_unit_name}" --now
    idx=$((idx + 1))
  done
}

## Main logic
MNT_DIR="/mnt/k8s-disks"

while [[ $# -gt 0 ]]; do
  key="$1"
  case $key in
    -h | --help)
      print_help
      exit 0
      ;;
    -d | --dir)
      MNT_DIR="$2"
      shift
      shift
      ;;
    *) # unknown option
      POSITIONAL+=("$1") # save it in an array for later
      shift              # past argument
      ;;
  esac
done

# POSITIONAL may be unset if no positional args were given; relax nounset
# just long enough to restore them.
set +u
set -- "${POSITIONAL[@]}" # restore positional parameters
DISK_SETUP="$1"
set -u

if [[ "${DISK_SETUP}" != "raid0" && "${DISK_SETUP}" != "mount" ]]; then
  echo "Valid disk setup options are: raid0 or mount"
  exit 1
fi

disks=($(find -L /dev/disk/by-id/ -xtype l -name '*NVMe_Instance_Storage_*'))
## Bail early if there are no ephemeral disks to setup
if [[ "${#disks[@]}" -eq 0 ]]; then
  echo "no ephemeral disks found, skipping disk setup"
  exit 0
fi

if [ "$(id --user)" -ne 0 ]; then
  echo "Must be run as root"
  exit 1
fi

## Get devices of NVMe instance storage ephemeral disks
EPHEMERAL_DISKS=($(realpath "${disks[@]}" | sort -u))

case "${DISK_SETUP}" in
  "raid0")
    maybe_raid0
    echo "Successfully setup RAID-0 consisting of ${EPHEMERAL_DISKS[*]}"
    ;;
  "mount")
    maybe_mount
    echo "Successfully setup disk mounts consisting of ${EPHEMERAL_DISKS[*]}"
    ;;
esac