Path: tests/integration_tests/performance/test_snapshot_perf.py
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""Basic test scenarios for snapshot save/restore."""

import json
import logging
import os
import platform

from conftest import _test_images_s3_bucket
from framework.artifacts import ArtifactCollection, ArtifactSet
from framework.defs import DEFAULT_TEST_IMAGES_S3_BUCKET
from framework.matrix import TestMatrix, TestContext
from framework.builder import MicrovmBuilder, SnapshotBuilder, SnapshotType
from framework.utils import CpuMap, get_firecracker_version_from_toml
import host_tools.network as net_tools  # pylint: disable=import-error
import host_tools.logging as log_tools

# How many latencies do we sample per test.
SAMPLE_COUNT = 3
USEC_IN_MSEC = 1000
PLATFORM = platform.machine()

# Latencies in milliseconds.
# The latency for snapshot creation has high variance due to scheduler noise.
# The issue is tracked here:
# https://github.com/firecracker-microvm/firecracker/issues/2346
# TODO: Update baseline values after fix.
CREATE_LATENCY_BASELINES = {
    'x86_64': {
        '2vcpu_256mb.json': {
            'FULL': 180,
            'DIFF': 70,
        },
        '2vcpu_512mb.json': {
            'FULL': 280,
            'DIFF': 75,
        }
    },
    'aarch64': {
        '2vcpu_256mb.json': {
            'FULL': 160,
            'DIFF': 70,
        },
        '2vcpu_512mb.json': {
            'FULL': 300,
            'DIFF': 75,
        }
    },
}

# The latencies are pretty high during integration tests and
# this is tracked here:
# https://github.com/firecracker-microvm/firecracker/issues/2027
# TODO: Update the table after fix. Target is < 5ms.
LOAD_LATENCY_BASELINES = {
    'x86_64': {
        '2vcpu_256mb.json': 9,
        '2vcpu_512mb.json': 9,
    },
    'aarch64': {
        '2vcpu_256mb.json': 3,
        '2vcpu_512mb.json': 3,
    }
}
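
# Both baseline tables are keyed by platform (platform.machine()), then by the
# microvm config file name; the helpers below perform lookups such as:
#   CREATE_LATENCY_BASELINES['x86_64']['2vcpu_512mb.json']['FULL']  # 280 ms
#   LOAD_LATENCY_BASELINES['aarch64']['2vcpu_256mb.json']           # 3 ms
# The metrics system reports latencies in microseconds ('latencies_us'), so
# measured values are divided by USEC_IN_MSEC before being compared against
# these millisecond baselines.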


def _test_snapshot_create_latency(context):
    logger = context.custom['logger']
    vm_builder = context.custom['builder']
    snapshot_type = context.custom['snapshot_type']
    diff_snapshots = snapshot_type == SnapshotType.DIFF

    # Create a rw copy artifact.
    rw_disk = context.disk.copy()
    # Get ssh key from read-only artifact.
    ssh_key = context.disk.ssh_key()

    logger.info("Fetching firecracker/jailer versions from {}."
                .format(DEFAULT_TEST_IMAGES_S3_BUCKET))
    artifacts = ArtifactCollection(_test_images_s3_bucket())
    firecracker_versions = artifacts.firecracker_versions(
        older_than=get_firecracker_version_from_toml())
    assert len(firecracker_versions) > 0

    # Test snapshot creation for every supported target version.
    for target_version in firecracker_versions:
        logger.info("""Measuring snapshot create({}) latency for target
                    version: {} and microvm: \"{}\", kernel {}, disk {} """
                    .format(snapshot_type,
                            target_version,
                            context.microvm.name(),
                            context.kernel.name(),
                            context.disk.name()))

        # Measure a burst of snapshot create calls.
        for i in range(SAMPLE_COUNT):
            # Create a fresh microVM from artifacts.
            vm_instance = vm_builder.build(kernel=context.kernel,
                                           disks=[rw_disk],
                                           ssh_key=ssh_key,
                                           config=context.microvm,
                                           diff_snapshots=diff_snapshots,
                                           use_ramdisk=True)
            vm = vm_instance.vm
            # Configure metrics system.
            metrics_fifo_path = os.path.join(vm.path, 'metrics_fifo')
            metrics_fifo = log_tools.Fifo(metrics_fifo_path)

            response = vm.metrics.put(
                metrics_path=vm.create_jailed_resource(metrics_fifo.path)
            )
            assert vm.api_session.is_status_no_content(response.status_code)

            vm.start()

            # Check if the needed CPU cores are available. We have the API
            # thread, VMM thread and then one thread for each configured vCPU.
            assert CpuMap.len() >= 2 + vm.vcpus_count

            # Pin uVM threads to physical cores, one core per thread.
            current_cpu_id = 0
            assert vm.pin_vmm(current_cpu_id), \
                "Failed to pin firecracker thread."
            current_cpu_id += 1
            assert vm.pin_api(current_cpu_id), \
                "Failed to pin fc_api thread."
            for idx_vcpu in range(vm.vcpus_count):
                current_cpu_id += 1
                assert vm.pin_vcpu(idx_vcpu, current_cpu_id), \
                    f"Failed to pin fc_vcpu {idx_vcpu} thread."

            # Create a snapshot builder from a microVM.
            snapshot_builder = SnapshotBuilder(vm)
            snapshot_builder.create(disks=[rw_disk],
                                    ssh_key=ssh_key,
                                    snapshot_type=snapshot_type,
                                    target_version=target_version,
                                    use_ramdisk=True)
            metrics = vm.flush_metrics(metrics_fifo)
            vm_name = context.microvm.name()

            if snapshot_type == SnapshotType.FULL:
                value = metrics['latencies_us']['full_create_snapshot']
                baseline = CREATE_LATENCY_BASELINES[PLATFORM][vm_name]['FULL']
            else:
                value = metrics['latencies_us']['diff_create_snapshot']
                baseline = CREATE_LATENCY_BASELINES[PLATFORM][vm_name]['DIFF']

            value = value / USEC_IN_MSEC

            assert baseline > value, "CreateSnapshot latency degraded."

            logger.info("Latency {}/{}: {} ms"
                        .format(i + 1, SAMPLE_COUNT, value))
            vm.kill()


def _test_snapshot_resume_latency(context):
    logger = context.custom['logger']
    vm_builder = context.custom['builder']
    snapshot_type = context.custom['snapshot_type']
    diff_snapshots = snapshot_type == SnapshotType.DIFF

    logger.info("""Measuring snapshot resume({}) latency for microvm: \"{}\",
                kernel {}, disk {} """.format(snapshot_type,
                                              context.microvm.name(),
                                              context.kernel.name(),
                                              context.disk.name()))

    # Create a rw copy artifact.
    rw_disk = context.disk.copy()
    # Get ssh key from read-only artifact.
    ssh_key = context.disk.ssh_key()
    # Create a fresh microvm from artifacts.
    vm_instance = vm_builder.build(kernel=context.kernel,
                                   disks=[rw_disk],
                                   ssh_key=ssh_key,
                                   config=context.microvm,
                                   diff_snapshots=diff_snapshots,
                                   use_ramdisk=True)
    basevm = vm_instance.vm
    basevm.start()
    ssh_connection = net_tools.SSHConnection(basevm.ssh_config)

    # Check if guest works.
    exit_code, _, _ = ssh_connection.execute_command("ls")
    assert exit_code == 0

    logger.info("Create {}.".format(snapshot_type))
    # Create a snapshot builder from a microvm.
    snapshot_builder = SnapshotBuilder(basevm)

    snapshot = snapshot_builder.create([rw_disk.local_path()],
                                       ssh_key,
                                       snapshot_type,
                                       use_ramdisk=True)

    basevm.kill()

    for i in range(SAMPLE_COUNT):
        microvm, metrics_fifo = vm_builder.build_from_snapshot(
            snapshot,
            True,
            diff_snapshots,
            use_ramdisk=True)

        # Attempt to connect to resumed microvm.
        ssh_connection = net_tools.SSHConnection(microvm.ssh_config)

        # Verify if guest can run commands.
        exit_code, _, _ = ssh_connection.execute_command("ls")
        assert exit_code == 0

        value = 0
        # Parse all metric data points in search of load_snapshot time.
        metrics = microvm.get_all_metrics(metrics_fifo)
        for data_point in metrics:
            metrics = json.loads(data_point)
            cur_value = metrics['latencies_us']['load_snapshot'] / USEC_IN_MSEC
            if cur_value > 0:
                value = cur_value
                break

        baseline = LOAD_LATENCY_BASELINES[PLATFORM][context.microvm.name()]
        logger.info("Latency {}/{}: {} ms".format(i + 1, SAMPLE_COUNT, value))
        assert baseline > value, "LoadSnapshot latency degraded."

        microvm.kill()
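
# The three matrix-driven tests below differ only in the snapshot type they
# put in the test context and in which helper they hand to
# test_matrix.run_test(); the matrix runs that helper for every
# (microvm config, kernel, disk) artifact combination, e.g.:
#   test_context.custom['snapshot_type'] = SnapshotType.DIFF
#   test_matrix.run_test(_test_snapshot_create_latency)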


def test_snapshot_create_full_latency(network_config,
                                      bin_cloner_path):
    """Test scenario: Full snapshot create performance measurement."""
    logger = logging.getLogger("snapshot_sequence")
    artifacts = ArtifactCollection(_test_images_s3_bucket())
    # Testing matrix:
    # - Guest kernel: Linux 4.14
    # - Rootfs: Ubuntu 18.04
    # - Microvm: 2vCPU with 256/512 MB RAM
    # TODO: Multiple microvm sizes must be tested in the async pipeline.
    microvm_artifacts = ArtifactSet(artifacts.microvms(keyword="2vcpu_512mb"))
    microvm_artifacts.insert(artifacts.microvms(keyword="2vcpu_256mb"))
    kernel_artifacts = ArtifactSet(artifacts.kernels(keyword="4.14"))
    disk_artifacts = ArtifactSet(artifacts.disks(keyword="ubuntu"))

    # Create a test context and add builder, logger, network.
    test_context = TestContext()
    test_context.custom = {
        'builder': MicrovmBuilder(bin_cloner_path),
        'network_config': network_config,
        'logger': logger,
        'snapshot_type': SnapshotType.FULL,
    }

    # Create the test matrix.
    test_matrix = TestMatrix(context=test_context,
                             artifact_sets=[
                                 microvm_artifacts,
                                 kernel_artifacts,
                                 disk_artifacts
                             ])

    test_matrix.run_test(_test_snapshot_create_latency)


def test_snapshot_create_diff_latency(network_config,
                                      bin_cloner_path):
    """Test scenario: Diff snapshot create performance measurement."""
    logger = logging.getLogger("snapshot_sequence")
    artifacts = ArtifactCollection(_test_images_s3_bucket())
    # Testing matrix:
    # - Guest kernel: Linux 4.14
    # - Rootfs: Ubuntu 18.04
    # - Microvm: 2vCPU with 256/512 MB RAM
    # TODO: Multiple microvm sizes must be tested in the async pipeline.
    microvm_artifacts = ArtifactSet(artifacts.microvms(keyword="2vcpu_512mb"))
    microvm_artifacts.insert(artifacts.microvms(keyword="2vcpu_256mb"))
    kernel_artifacts = ArtifactSet(artifacts.kernels(keyword="4.14"))
    disk_artifacts = ArtifactSet(artifacts.disks(keyword="ubuntu"))

    # Create a test context and add builder, logger, network.
    test_context = TestContext()
    test_context.custom = {
        'builder': MicrovmBuilder(bin_cloner_path),
        'network_config': network_config,
        'logger': logger,
        'snapshot_type': SnapshotType.DIFF,
    }

    # Create the test matrix.
    test_matrix = TestMatrix(context=test_context,
                             artifact_sets=[
                                 microvm_artifacts,
                                 kernel_artifacts,
                                 disk_artifacts
                             ])

    test_matrix.run_test(_test_snapshot_create_latency)


def test_snapshot_resume_latency(network_config,
                                 bin_cloner_path):
    """Test scenario: Snapshot load performance measurement."""
    logger = logging.getLogger("snapshot_load")

    artifacts = ArtifactCollection(_test_images_s3_bucket())
    # Testing matrix:
    # - Guest kernel: Linux 4.14
    # - Rootfs: Ubuntu 18.04
    # - Microvm: 2vCPU with 256/512 MB RAM
    # TODO: Multiple microvm sizes must be tested in the async pipeline.
    microvm_artifacts = ArtifactSet(artifacts.microvms(keyword="2vcpu_512mb"))
    microvm_artifacts.insert(artifacts.microvms(keyword="2vcpu_256mb"))

    kernel_artifacts = ArtifactSet(artifacts.kernels(keyword="4.14"))
    disk_artifacts = ArtifactSet(artifacts.disks(keyword="ubuntu"))

    # Create a test context and add builder, logger, network.
    test_context = TestContext()
    test_context.custom = {
        'builder': MicrovmBuilder(bin_cloner_path),
        'network_config': network_config,
        'logger': logger,
        'snapshot_type': SnapshotType.FULL,
    }

    # Create the test matrix.
    test_matrix = TestMatrix(context=test_context,
                             artifact_sets=[
                                 microvm_artifacts,
                                 kernel_artifacts,
                                 disk_artifacts
                             ])

    test_matrix.run_test(_test_snapshot_resume_latency)
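
# The cross-version test below restores snapshots produced by older releases:
# for each firecracker/jailer binary older than the current one it boots a
# microvm on that binary, takes a FULL snapshot, restores it with the current
# build via build_from_snapshot(), and checks the reported load_snapshot
# latency against the 2vcpu_512mb baseline.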


def test_older_snapshot_resume_latency(bin_cloner_path):
    """Test scenario: Older snapshot load performance measurement."""
    logger = logging.getLogger("old_snapshot_load")

    builder = MicrovmBuilder(bin_cloner_path)

    artifacts = ArtifactCollection(_test_images_s3_bucket())
    # Fetch all firecracker binaries.
    # With each binary create a snapshot and try to restore in current
    # version.
    firecracker_artifacts = artifacts.firecrackers(
        older_than=get_firecracker_version_from_toml())
    assert len(firecracker_artifacts) > 0

    for firecracker in firecracker_artifacts:
        firecracker.download()
        jailer = firecracker.jailer()
        jailer.download()
        fc_version = firecracker.base_name()[1:]
        logger.info("Firecracker version: %s", fc_version)
        logger.info("Source Firecracker: %s", firecracker.local_path())
        logger.info("Source Jailer: %s", jailer.local_path())

        for i in range(SAMPLE_COUNT):
            # Create a fresh microvm with the binary artifacts.
            vm_instance = builder.build_vm_micro(firecracker.local_path(),
                                                 jailer.local_path())
            basevm = vm_instance.vm
            basevm.start()
            ssh_connection = net_tools.SSHConnection(basevm.ssh_config)

            # Check if guest works.
            exit_code, _, _ = ssh_connection.execute_command("ls")
            assert exit_code == 0

            # The snapshot builder expects disks as paths, not artifacts.
            disks = []
            for disk in vm_instance.disks:
                disks.append(disk.local_path())

            # Create a snapshot builder from a microvm.
            snapshot_builder = SnapshotBuilder(basevm)
            snapshot = snapshot_builder.create(disks,
                                               vm_instance.ssh_key,
                                               SnapshotType.FULL)

            basevm.kill()
            microvm, metrics_fifo = builder.build_from_snapshot(snapshot,
                                                                True,
                                                                False)
            # Attempt to connect to resumed microvm.
            ssh_connection = net_tools.SSHConnection(microvm.ssh_config)
            # Check if guest still runs commands.
            exit_code, _, _ = ssh_connection.execute_command("dmesg")
            assert exit_code == 0

            value = 0
            # Parse all metric data points in search of load_snapshot time.
            metrics = microvm.get_all_metrics(metrics_fifo)
            for data_point in metrics:
                metrics = json.loads(data_point)
                cur_value = metrics['latencies_us']['load_snapshot']
                if cur_value > 0:
                    value = cur_value / USEC_IN_MSEC
                    break

            baseline = LOAD_LATENCY_BASELINES[PLATFORM]['2vcpu_512mb.json']
            logger.info("Latency %s/%s: %s ms", i + 1, SAMPLE_COUNT, value)
            assert baseline > value, "LoadSnapshot latency degraded."
            microvm.kill()