# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""Performance benchmark for snapshot restore."""

import json
import tempfile
from functools import lru_cache

import pytest

import framework.stats as st
import host_tools.drive as drive_tools
from framework.artifacts import create_net_devices_configuration
from framework.builder import MicrovmBuilder, SnapshotBuilder, SnapshotType
from framework.stats.baseline import Provider as BaselineProvider
from framework.stats.metadata import DictProvider as DictMetadataProvider
from framework.utils import get_kernel_version
from integration_tests.performance.configs import defs

TEST_ID = "snapshot_restore_performance"
WORKLOAD = "restore"
CONFIG_NAME_REL = "test_{}_config_{}.json".format(TEST_ID, get_kernel_version(level=1))
CONFIG_NAME_ABS = defs.CFG_LOCATION / CONFIG_NAME_REL

BASE_VCPU_COUNT = 1
BASE_MEM_SIZE_MIB = 128
BASE_NET_COUNT = 1
BASE_BLOCK_COUNT = 1
USEC_IN_MSEC = 1000

# Measurement tags.
RESTORE_LATENCY = "latency"

# Define 4 net device configurations.
net_ifaces = create_net_devices_configuration(4)


# pylint: disable=R0903
class SnapRestoreBaselinesProvider(BaselineProvider):
    """Baselines provider for snapshot restore latency."""

    def __init__(self, env_id, workload, raw_baselines):
        """Snapshot baseline provider initialization."""
        super().__init__(raw_baselines)
        self._tag = "baselines/{}/" + env_id + "/{}/" + workload

    def get(self, metric_name: str, statistic_name: str) -> dict:
        """Return the baseline value corresponding to the key."""
        key = self._tag.format(metric_name, statistic_name)
        baseline = self._baselines.get(key)
        if baseline:
            target = baseline.get("target")
            delta_percentage = baseline.get("delta_percentage")
            return {
                "target": target,
                "delta": delta_percentage * target / 100,
            }
        return None
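
# For reference, `SnapRestoreBaselinesProvider` resolves a baseline under the
# key path "baselines/<metric>/<env_id>/<statistic>/<workload>" and expects
# each entry to carry a "target" and a "delta_percentage". As a hypothetical
# example (values illustrative, not taken from a real config file), an entry
# with target=150 (milliseconds) and delta_percentage=10 makes get() return
# {"target": 150, "delta": 15.0}, since delta = 10 * 150 / 100, i.e. a
# +/- 15 ms tolerance band around the target.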


@lru_cache
def get_scratch_drives():
    """Create an array of scratch disks."""
    scratchdisks = ["vdb", "vdc", "vdd", "vde"]
    return [
        (drive, drive_tools.FilesystemFile(tempfile.mktemp(), size=64))
        for drive in scratchdisks
    ]


def default_lambda_consumer(env_id, workload):
    """Create a default lambda consumer for the snapshot restore test."""
    raw_baselines = json.loads(CONFIG_NAME_ABS.read_text("utf-8"))

    return st.consumer.LambdaConsumer(
        metadata_provider=DictMetadataProvider(
            raw_baselines["measurements"],
            SnapRestoreBaselinesProvider(env_id, workload, raw_baselines),
        ),
        func=consume_output,
    )


def get_snap_restore_latency(
    vm_builder,
    microvm_factory,
    guest_kernel,
    rootfs,
    vcpus,
    mem_size,
    nets=3,
    blocks=3,
    all_devices=False,
    iterations=30,
):
    """Restore snapshots with various configs to measure latency."""
    scratch_drives = get_scratch_drives()
    ifaces = net_ifaces[:nets]

    vm = microvm_factory.build(guest_kernel, rootfs, monitor_memory=False)
    vm.spawn(use_ramdisk=True, log_level="Info")
    vm.basic_config(
        vcpu_count=vcpus,
        mem_size_mib=mem_size,
        rootfs_io_engine="Sync",
        use_initrd=True,
    )

    for iface in ifaces:
        vm.create_tap_and_ssh_config(
            host_ip=iface.host_ip,
            guest_ip=iface.guest_ip,
            netmask_len=iface.netmask,
            tapname=iface.tap_name,
        )
        response = vm.network.put(
            iface_id=iface.dev_name,
            host_dev_name=iface.tap_name,
            guest_mac=iface.guest_mac,
        )
        assert vm.api_session.is_status_no_content(response.status_code)

    extra_disk_paths = []
    if blocks > 1:
        for name, diskfile in scratch_drives[: (blocks - 1)]:
            vm.add_drive(name, diskfile.path, use_ramdisk=True, io_engine="Sync")
            extra_disk_paths.append(diskfile.path)
        assert len(extra_disk_paths) > 0

    # With `all_devices` set, also attach a balloon device (with stats
    # enabled) and a vsock device, so that the snapshot exercises every
    # restorable device type.
    if all_devices:
        response = vm.balloon.put(
            amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1
        )
        assert vm.api_session.is_status_no_content(response.status_code)

        response = vm.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path="/v.sock")
        assert vm.api_session.is_status_no_content(response.status_code)

    vm.start()

    # Create a snapshot builder from a microvm.
    snapshot_builder = SnapshotBuilder(vm)
    full_snapshot = snapshot_builder.create(
        [vm.rootfs_file] + extra_disk_paths,
        rootfs.ssh_key(),
        SnapshotType.FULL,
        net_ifaces=ifaces,
        use_ramdisk=True,
    )
    vm.kill()

    values = []
    for _ in range(iterations):
        microvm, metrics_fifo = vm_builder.build_from_snapshot(
            full_snapshot, resume=True, use_ramdisk=True
        )
        # Check if the guest still runs commands.
        exit_code, _, _ = microvm.ssh.execute_command("dmesg")
        assert exit_code == 0

        value = 0
        # Parse all metric data points in search of the load_snapshot time.
        metrics = microvm.get_all_metrics(metrics_fifo)
        for data_point in metrics:
            data = json.loads(data_point)
            cur_value = data["latencies_us"]["load_snapshot"]
            if cur_value > 0:
                # `load_snapshot` is reported in microseconds.
                value = cur_value / USEC_IN_MSEC
                break
        values.append(value)
        microvm.kill()
        microvm.jailer.cleanup()

    full_snapshot.cleanup()
    vm.jailer.cleanup()
    return values


def consume_output(cons, latencies):
    """Consumer function."""
    for value in latencies:
        cons.consume_data(RESTORE_LATENCY, value)


@pytest.mark.nonci
@pytest.mark.parametrize(
    "mem, vcpus",
    [
        (128, 1),
        (1024, 1),
        (2048, 2),
        (4096, 3),
        (6144, 4),
        (8192, 5),
        (10240, 6),
        (12288, 7),
    ],
)
def test_snapshot_scaling(
    bin_cloner_path, microvm_factory, rootfs, guest_kernel, st_core, mem, vcpus
):
    """
    Restore snapshots with vcpu/memory configurations scaling roughly as
    mem = (vcpus - 1) * 2048 MB, which resembles Firecracker production setups.
    """
    # The guest kernel does not "participate" in snapshot restore, so just
    # pick some arbitrary one.
    if "4.14" not in guest_kernel.name():
        pytest.skip()

    guest_config = f"{vcpus}vcpu_{mem}mb"
    env_id = f"{guest_kernel.name()}/{rootfs.name()}/{guest_config}"

    st_prod = st.producer.LambdaProducer(
        func=get_snap_restore_latency,
        func_kwargs={
            "vm_builder": MicrovmBuilder(bin_cloner_path),
            "microvm_factory": microvm_factory,
            "guest_kernel": guest_kernel,
            "rootfs": rootfs,
            "vcpus": vcpus,
            "mem_size": mem,
        },
    )
    st_cons = default_lambda_consumer(env_id, WORKLOAD)
    st_core.add_pipe(st_prod, st_cons, f"{env_id}/{WORKLOAD}")
    st_core.name = TEST_ID
    st_core.custom["guest_config"] = guest_config
    st_core.run_exercise()


@pytest.mark.nonci
def test_snapshot_all_devices(
    bin_cloner_path, microvm_factory, rootfs, guest_kernel, st_core
):
    """Restore snapshots from a microvm with one of each device type."""
    # The guest kernel does not "participate" in snapshot restore, so just
    # pick some arbitrary one.
    if "4.14" not in guest_kernel.name():
        pytest.skip()

    guest_config = "all_dev"
    env_id = f"{guest_kernel.name()}/{rootfs.name()}/{guest_config}"

    st_prod = st.producer.LambdaProducer(
        func=get_snap_restore_latency,
        func_kwargs={
            "vm_builder": MicrovmBuilder(bin_cloner_path),
            "microvm_factory": microvm_factory,
            "guest_kernel": guest_kernel,
            "rootfs": rootfs,
            "nets": 1,
            "blocks": 1,
            "vcpus": BASE_VCPU_COUNT,
            "mem_size": BASE_MEM_SIZE_MIB,
            "all_devices": True,
        },
    )
    st_cons = default_lambda_consumer(env_id, WORKLOAD)
    st_core.add_pipe(st_prod, st_cons, f"{env_id}/{WORKLOAD}")
    st_core.name = TEST_ID
    st_core.custom["guest_config"] = guest_config
    st_core.run_exercise()
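
# Note: both tests above are gated behind the `nonci` marker. Under a plain
# pytest invocation they can be selected with a marker expression, e.g.
# (assuming this module lives at its usual path in the tests tree; the exact
# entry point depends on the repository's test tooling):
#
#   pytest -m nonci integration_tests/performance/test_snapshot_restore_performance.py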