Path: tests/integration_tests/performance/test_snapshot_perf.py
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""Basic test scenarios for snapshot save/restore."""

import json
import logging
import os
import platform

from conftest import _test_images_s3_bucket
from framework.artifacts import ArtifactCollection, ArtifactSet
from framework.defs import DEFAULT_TEST_IMAGES_S3_BUCKET
from framework.matrix import TestMatrix, TestContext
from framework.builder import MicrovmBuilder, SnapshotBuilder, SnapshotType
from framework.utils import CpuMap, get_firecracker_version_from_toml
import host_tools.network as net_tools  # pylint: disable=import-error
import host_tools.logging as log_tools

# How many latencies do we sample per test.
SAMPLE_COUNT = 3
USEC_IN_MSEC = 1000
PLATFORM = platform.machine()

# Latencies in milliseconds.
# The latency for snapshot creation has high variance due to scheduler noise.
# The issue is tracked here:
# https://github.com/firecracker-microvm/firecracker/issues/2346
# TODO: Update baseline values after fix.
CREATE_LATENCY_BASELINES = {
    'x86_64': {
        '2vcpu_256mb.json': {
            'FULL': 180,
            'DIFF': 70,
        },
        '2vcpu_512mb.json': {
            'FULL': 280,
            'DIFF': 75,
        }
    },
    'aarch64': {
        '2vcpu_256mb.json': {
            'FULL': 160,
            'DIFF': 70,
        },
        '2vcpu_512mb.json': {
            'FULL': 300,
            'DIFF': 75,
        }
    },
}

# The latencies are pretty high during integration tests and
# this is tracked here:
# https://github.com/firecracker-microvm/firecracker/issues/2027
# TODO: Update the table after fix. Target is < 5ms.
LOAD_LATENCY_BASELINES = {
    'x86_64': {
        '2vcpu_256mb.json': 9,
        '2vcpu_512mb.json': 9,
    },
    'aarch64': {
        '2vcpu_256mb.json': 3,
        '2vcpu_512mb.json': 3,
    }
}
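
# Both baseline tables are keyed by platform (platform.machine()), then by the
# microvm config file name; the helpers below perform lookups such as:
#   CREATE_LATENCY_BASELINES['x86_64']['2vcpu_512mb.json']['FULL']  # 280 ms
#   LOAD_LATENCY_BASELINES['aarch64']['2vcpu_256mb.json']           # 3 ms
# The metrics system reports latencies in microseconds ('latencies_us'), so
# measured values are divided by USEC_IN_MSEC before being compared against
# these millisecond baselines.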


def _test_snapshot_create_latency(context):
    logger = context.custom['logger']
    vm_builder = context.custom['builder']
    snapshot_type = context.custom['snapshot_type']
    diff_snapshots = snapshot_type == SnapshotType.DIFF

    # Create a rw copy artifact.
    rw_disk = context.disk.copy()
    # Get ssh key from read-only artifact.
    ssh_key = context.disk.ssh_key()

    logger.info("Fetching firecracker/jailer versions from {}."
                .format(DEFAULT_TEST_IMAGES_S3_BUCKET))
    artifacts = ArtifactCollection(_test_images_s3_bucket())
    firecracker_versions = artifacts.firecracker_versions(
        older_than=get_firecracker_version_from_toml())
    assert len(firecracker_versions) > 0

    # Test snapshot creation for every supported target version.
    for target_version in firecracker_versions:
        logger.info("""Measuring snapshot create({}) latency for target
                    version: {} and microvm: \"{}\", kernel {}, disk {} """
                    .format(snapshot_type,
                            target_version,
                            context.microvm.name(),
                            context.kernel.name(),
                            context.disk.name()))

        # Measure a burst of snapshot create calls.
        for i in range(SAMPLE_COUNT):
            # Create a fresh microVM from artifacts.
            vm_instance = vm_builder.build(kernel=context.kernel,
                                           disks=[rw_disk],
                                           ssh_key=ssh_key,
                                           config=context.microvm,
                                           diff_snapshots=diff_snapshots,
                                           use_ramdisk=True)
            vm = vm_instance.vm
            # Configure metrics system.
            metrics_fifo_path = os.path.join(vm.path, 'metrics_fifo')
            metrics_fifo = log_tools.Fifo(metrics_fifo_path)

            response = vm.metrics.put(
                metrics_path=vm.create_jailed_resource(metrics_fifo.path)
            )
            assert vm.api_session.is_status_no_content(response.status_code)

            vm.start()

            # Check if the needed CPU cores are available. We have the API
            # thread, VMM thread and then one thread for each configured vCPU.
            assert CpuMap.len() >= 2 + vm.vcpus_count

            # Pin uVM threads to physical cores, one core per thread.
            current_cpu_id = 0
            assert vm.pin_vmm(current_cpu_id), \
                "Failed to pin firecracker thread."
            current_cpu_id += 1
            assert vm.pin_api(current_cpu_id), \
                "Failed to pin fc_api thread."
            for idx_vcpu in range(vm.vcpus_count):
                current_cpu_id += 1
                assert vm.pin_vcpu(idx_vcpu, current_cpu_id), \
                    f"Failed to pin fc_vcpu {idx_vcpu} thread."

            # Create a snapshot builder from a microVM.
            snapshot_builder = SnapshotBuilder(vm)
            snapshot_builder.create(disks=[rw_disk],
                                    ssh_key=ssh_key,
                                    snapshot_type=snapshot_type,
                                    target_version=target_version,
                                    use_ramdisk=True)
            metrics = vm.flush_metrics(metrics_fifo)
            vm_name = context.microvm.name()

            if snapshot_type == SnapshotType.FULL:
                value = metrics['latencies_us']['full_create_snapshot']
                baseline = CREATE_LATENCY_BASELINES[PLATFORM][vm_name]['FULL']
            else:
                value = metrics['latencies_us']['diff_create_snapshot']
                baseline = CREATE_LATENCY_BASELINES[PLATFORM][vm_name]['DIFF']

            value = value / USEC_IN_MSEC

            assert baseline > value, "CreateSnapshot latency degraded."

            logger.info("Latency {}/{}: {} ms"
                        .format(i + 1, SAMPLE_COUNT, value))
            vm.kill()


def _test_snapshot_resume_latency(context):
    logger = context.custom['logger']
    vm_builder = context.custom['builder']
    snapshot_type = context.custom['snapshot_type']
    diff_snapshots = snapshot_type == SnapshotType.DIFF

    logger.info("""Measuring snapshot resume({}) latency for microvm: \"{}\",
                kernel {}, disk {} """.format(snapshot_type,
                                              context.microvm.name(),
                                              context.kernel.name(),
                                              context.disk.name()))

    # Create a rw copy artifact.
    rw_disk = context.disk.copy()
    # Get ssh key from read-only artifact.
    ssh_key = context.disk.ssh_key()
    # Create a fresh microvm from artifacts.
    vm_instance = vm_builder.build(kernel=context.kernel,
                                   disks=[rw_disk],
                                   ssh_key=ssh_key,
                                   config=context.microvm,
                                   diff_snapshots=diff_snapshots,
                                   use_ramdisk=True)
    basevm = vm_instance.vm
    basevm.start()
    ssh_connection = net_tools.SSHConnection(basevm.ssh_config)

    # Check if guest works.
    exit_code, _, _ = ssh_connection.execute_command("ls")
    assert exit_code == 0

    logger.info("Create {}.".format(snapshot_type))
    # Create a snapshot builder from a microvm.
    snapshot_builder = SnapshotBuilder(basevm)

    snapshot = snapshot_builder.create([rw_disk.local_path()],
                                       ssh_key,
                                       snapshot_type,
                                       use_ramdisk=True)

    basevm.kill()

    for i in range(SAMPLE_COUNT):
        microvm, metrics_fifo = vm_builder.build_from_snapshot(
            snapshot,
            True,
            diff_snapshots,
            use_ramdisk=True)

        # Attempt to connect to resumed microvm.
        ssh_connection = net_tools.SSHConnection(microvm.ssh_config)

        # Verify if guest can run commands.
        exit_code, _, _ = ssh_connection.execute_command("ls")
        assert exit_code == 0

        value = 0
        # Parse all metric data points in search of load_snapshot time.
        metrics = microvm.get_all_metrics(metrics_fifo)
        for data_point in metrics:
            metrics = json.loads(data_point)
            cur_value = metrics['latencies_us']['load_snapshot'] / USEC_IN_MSEC
            if cur_value > 0:
                value = cur_value
                break

        baseline = LOAD_LATENCY_BASELINES[PLATFORM][context.microvm.name()]
        logger.info("Latency {}/{}: {} ms".format(i + 1, SAMPLE_COUNT, value))
        assert baseline > value, "LoadSnapshot latency degraded."

        microvm.kill()
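
# The three matrix-driven tests below differ only in the snapshot type they
# put in the test context and in which helper they hand to
# test_matrix.run_test(); the matrix runs that helper for every
# (microvm config, kernel, disk) artifact combination, e.g.:
#   test_context.custom['snapshot_type'] = SnapshotType.DIFF
#   test_matrix.run_test(_test_snapshot_create_latency)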


def test_snapshot_create_full_latency(network_config,
                                      bin_cloner_path):
    """Test scenario: Full snapshot create performance measurement."""
    logger = logging.getLogger("snapshot_sequence")
    artifacts = ArtifactCollection(_test_images_s3_bucket())
    # Testing matrix:
    # - Guest kernel: Linux 4.14
    # - Rootfs: Ubuntu 18.04
    # - Microvm: 2vCPU with 256/512 MB RAM
    # TODO: Multiple microvm sizes must be tested in the async pipeline.
    microvm_artifacts = ArtifactSet(artifacts.microvms(keyword="2vcpu_512mb"))
    microvm_artifacts.insert(artifacts.microvms(keyword="2vcpu_256mb"))
    kernel_artifacts = ArtifactSet(artifacts.kernels(keyword="4.14"))
    disk_artifacts = ArtifactSet(artifacts.disks(keyword="ubuntu"))

    # Create a test context and add builder, logger, network.
    test_context = TestContext()
    test_context.custom = {
        'builder': MicrovmBuilder(bin_cloner_path),
        'network_config': network_config,
        'logger': logger,
        'snapshot_type': SnapshotType.FULL,
    }

    # Create the test matrix.
    test_matrix = TestMatrix(context=test_context,
                             artifact_sets=[
                                 microvm_artifacts,
                                 kernel_artifacts,
                                 disk_artifacts
                             ])

    test_matrix.run_test(_test_snapshot_create_latency)


def test_snapshot_create_diff_latency(network_config,
                                      bin_cloner_path):
    """Test scenario: Diff snapshot create performance measurement."""
    logger = logging.getLogger("snapshot_sequence")
    artifacts = ArtifactCollection(_test_images_s3_bucket())
    # Testing matrix:
    # - Guest kernel: Linux 4.14
    # - Rootfs: Ubuntu 18.04
    # - Microvm: 2vCPU with 256/512 MB RAM
    # TODO: Multiple microvm sizes must be tested in the async pipeline.
    microvm_artifacts = ArtifactSet(artifacts.microvms(keyword="2vcpu_512mb"))
    microvm_artifacts.insert(artifacts.microvms(keyword="2vcpu_256mb"))
    kernel_artifacts = ArtifactSet(artifacts.kernels(keyword="4.14"))
    disk_artifacts = ArtifactSet(artifacts.disks(keyword="ubuntu"))

    # Create a test context and add builder, logger, network.
    test_context = TestContext()
    test_context.custom = {
        'builder': MicrovmBuilder(bin_cloner_path),
        'network_config': network_config,
        'logger': logger,
        'snapshot_type': SnapshotType.DIFF,
    }

    # Create the test matrix.
    test_matrix = TestMatrix(context=test_context,
                             artifact_sets=[
                                 microvm_artifacts,
                                 kernel_artifacts,
                                 disk_artifacts
                             ])

    test_matrix.run_test(_test_snapshot_create_latency)


def test_snapshot_resume_latency(network_config,
                                 bin_cloner_path):
    """Test scenario: Snapshot load performance measurement."""
    logger = logging.getLogger("snapshot_load")

    artifacts = ArtifactCollection(_test_images_s3_bucket())
    # Testing matrix:
    # - Guest kernel: Linux 4.14
    # - Rootfs: Ubuntu 18.04
    # - Microvm: 2vCPU with 256/512 MB RAM
    # TODO: Multiple microvm sizes must be tested in the async pipeline.
    microvm_artifacts = ArtifactSet(artifacts.microvms(keyword="2vcpu_512mb"))
    microvm_artifacts.insert(artifacts.microvms(keyword="2vcpu_256mb"))

    kernel_artifacts = ArtifactSet(artifacts.kernels(keyword="4.14"))
    disk_artifacts = ArtifactSet(artifacts.disks(keyword="ubuntu"))

    # Create a test context and add builder, logger, network.
    test_context = TestContext()
    test_context.custom = {
        'builder': MicrovmBuilder(bin_cloner_path),
        'network_config': network_config,
        'logger': logger,
        'snapshot_type': SnapshotType.FULL,
    }

    # Create the test matrix.
    test_matrix = TestMatrix(context=test_context,
                             artifact_sets=[
                                 microvm_artifacts,
                                 kernel_artifacts,
                                 disk_artifacts
                             ])

    test_matrix.run_test(_test_snapshot_resume_latency)
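
# The cross-version test below restores snapshots produced by older releases:
# for each firecracker/jailer binary older than the current one it boots a
# microvm on that binary, takes a FULL snapshot, restores it with the current
# build via build_from_snapshot(), and checks the reported load_snapshot
# latency against the 2vcpu_512mb baseline.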


def test_older_snapshot_resume_latency(bin_cloner_path):
    """Test scenario: Older snapshot load performance measurement."""
    logger = logging.getLogger("old_snapshot_load")

    builder = MicrovmBuilder(bin_cloner_path)

    artifacts = ArtifactCollection(_test_images_s3_bucket())
    # Fetch all firecracker binaries.
    # With each binary create a snapshot and try to restore in current
    # version.
    firecracker_artifacts = artifacts.firecrackers(
        older_than=get_firecracker_version_from_toml())
    assert len(firecracker_artifacts) > 0

    for firecracker in firecracker_artifacts:
        firecracker.download()
        jailer = firecracker.jailer()
        jailer.download()
        fc_version = firecracker.base_name()[1:]
        logger.info("Firecracker version: %s", fc_version)
        logger.info("Source Firecracker: %s", firecracker.local_path())
        logger.info("Source Jailer: %s", jailer.local_path())

        for i in range(SAMPLE_COUNT):
            # Create a fresh microvm with the binary artifacts.
            vm_instance = builder.build_vm_micro(firecracker.local_path(),
                                                 jailer.local_path())
            basevm = vm_instance.vm
            basevm.start()
            ssh_connection = net_tools.SSHConnection(basevm.ssh_config)

            # Check if guest works.
            exit_code, _, _ = ssh_connection.execute_command("ls")
            assert exit_code == 0

            # The snapshot builder expects disks as paths, not artifacts.
            disks = []
            for disk in vm_instance.disks:
                disks.append(disk.local_path())

            # Create a snapshot builder from a microvm.
            snapshot_builder = SnapshotBuilder(basevm)
            snapshot = snapshot_builder.create(disks,
                                               vm_instance.ssh_key,
                                               SnapshotType.FULL)

            basevm.kill()
            microvm, metrics_fifo = builder.build_from_snapshot(snapshot,
                                                                True,
                                                                False)
            # Attempt to connect to resumed microvm.
            ssh_connection = net_tools.SSHConnection(microvm.ssh_config)
            # Check if guest still runs commands.
            exit_code, _, _ = ssh_connection.execute_command("dmesg")
            assert exit_code == 0

            value = 0
            # Parse all metric data points in search of load_snapshot time.
            metrics = microvm.get_all_metrics(metrics_fifo)
            for data_point in metrics:
                metrics = json.loads(data_point)
                cur_value = metrics['latencies_us']['load_snapshot']
                if cur_value > 0:
                    value = cur_value / USEC_IN_MSEC
                    break

            baseline = LOAD_LATENCY_BASELINES[PLATFORM]['2vcpu_512mb.json']
            logger.info("Latency %s/%s: %s ms", i + 1, SAMPLE_COUNT, value)
            assert baseline > value, "LoadSnapshot latency degraded."
            microvm.kill()