Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
aos
GitHub Repository: aos/firecracker
Path: blob/main/tests/integration_tests/performance/test_snapshot_perf.py
1958 views
1
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
# SPDX-License-Identifier: Apache-2.0
3
"""Basic tests scenarios for snapshot save/restore."""
4
5
import json
6
import logging
7
import os
8
import platform
9
from conftest import _test_images_s3_bucket
10
from framework.artifacts import ArtifactCollection, ArtifactSet
11
from framework.defs import DEFAULT_TEST_IMAGES_S3_BUCKET
12
from framework.matrix import TestMatrix, TestContext
13
from framework.builder import MicrovmBuilder, SnapshotBuilder, SnapshotType
14
from framework.utils import CpuMap, get_firecracker_version_from_toml
15
import host_tools.network as net_tools # pylint: disable=import-error
16
import host_tools.logging as log_tools
17
18
# How many latencies do we sample per test.
SAMPLE_COUNT = 3
# Conversion factor: metrics report latencies in microseconds, the
# baseline tables below are in milliseconds.
USEC_IN_MSEC = 1000
# Host architecture ('x86_64' or 'aarch64'); selects the baseline row.
PLATFORM = platform.machine()

# Latencies in milliseconds.
# Keyed by host architecture, then microvm config file name, then
# snapshot type ('FULL' or 'DIFF').
# The latency for snapshot creation has high variance due to scheduler noise.
# The issue is tracked here:
# https://github.com/firecracker-microvm/firecracker/issues/2346
# TODO: Update baseline values after fix.
CREATE_LATENCY_BASELINES = {
    'x86_64': {
        '2vcpu_256mb.json': {
            'FULL': 180,
            'DIFF': 70,
        },
        '2vcpu_512mb.json': {
            'FULL': 280,
            'DIFF': 75,
        }
    },
    'aarch64': {
        '2vcpu_256mb.json': {
            'FULL': 160,
            'DIFF': 70,
        },
        '2vcpu_512mb.json': {
            'FULL': 300,
            'DIFF': 75,
        }
    },
}

# Snapshot load (resume) latency baselines in milliseconds, keyed by
# host architecture and microvm config file name.
# The latencies are pretty high during integration tests and
# this is tracked here:
# https://github.com/firecracker-microvm/firecracker/issues/2027
# TODO: Update the table after fix. Target is < 5ms.
LOAD_LATENCY_BASELINES = {
    'x86_64': {
        '2vcpu_256mb.json': 9,
        '2vcpu_512mb.json': 9,
    },
    'aarch64': {
        '2vcpu_256mb.json': 3,
        '2vcpu_512mb.json': 3,
    }
}
65
66
67
def _test_snapshot_create_latency(context):
    """Measure and check the latency of snapshot creation.

    For every Firecracker version that supports snapshots, boots a
    fresh microVM from artifacts, pins its threads to physical cores,
    takes SAMPLE_COUNT snapshots and asserts that each reported create
    latency stays below the per-platform baseline from
    CREATE_LATENCY_BASELINES.

    :param context: TestContext carrying 'logger', 'builder' and
        'snapshot_type' in `custom`, plus microvm/kernel/disk artifacts.
    """
    logger = context.custom['logger']
    vm_builder = context.custom['builder']
    snapshot_type = context.custom['snapshot_type']
    diff_snapshots = snapshot_type == SnapshotType.DIFF

    # Create a rw copy artifact.
    rw_disk = context.disk.copy()
    # Get ssh key from read-only artifact.
    ssh_key = context.disk.ssh_key()

    logger.info("Fetching firecracker/jailer versions from {}."
                .format(DEFAULT_TEST_IMAGES_S3_BUCKET))
    artifacts = ArtifactCollection(_test_images_s3_bucket())
    firecracker_versions = artifacts.firecracker_versions(
        older_than=get_firecracker_version_from_toml())
    assert len(firecracker_versions) > 0

    # Test snapshot creation for every supported target version.
    for target_version in firecracker_versions:
        logger.info("""Measuring snapshot create({}) latency for target
        version: {} and microvm: \"{}\", kernel {}, disk {} """
                    .format(snapshot_type,
                            target_version,
                            context.microvm.name(),
                            context.kernel.name(),
                            context.disk.name()))

        # Measure a burst of snapshot create calls.
        for i in range(SAMPLE_COUNT):
            # Create a fresh microVM from artifacts.
            vm_instance = vm_builder.build(kernel=context.kernel,
                                           disks=[rw_disk],
                                           ssh_key=ssh_key,
                                           config=context.microvm,
                                           diff_snapshots=diff_snapshots,
                                           use_ramdisk=True)
            vm = vm_instance.vm
            # Configure metrics system.
            metrics_fifo_path = os.path.join(vm.path, 'metrics_fifo')
            metrics_fifo = log_tools.Fifo(metrics_fifo_path)

            response = vm.metrics.put(
                metrics_path=vm.create_jailed_resource(metrics_fifo.path)
            )
            assert vm.api_session.is_status_no_content(response.status_code)

            vm.start()

            # Check if the needed CPU cores are available. We have the API
            # thread, VMM thread and then one thread for each configured vCPU.
            assert CpuMap.len() >= 2 + vm.vcpus_count

            # Pin uVM threads to physical cores.
            current_cpu_id = 0
            assert vm.pin_vmm(current_cpu_id), \
                "Failed to pin firecracker thread."
            current_cpu_id += 1
            assert vm.pin_api(current_cpu_id), \
                "Failed to pin fc_api thread."
            # NOTE(review): `current_cpu_id` is incremented each iteration
            # AND offset by `idx_vcpu`, so vCPUs land on cores 2, 4, 6, ...
            # (a double stride). Verify this is intentional; the CpuMap
            # check above only guarantees 2 + vcpus_count cores exist.
            for idx_vcpu in range(vm.vcpus_count):
                current_cpu_id += 1
                assert vm.pin_vcpu(idx_vcpu, current_cpu_id + idx_vcpu), \
                    f"Failed to pin fc_vcpu {idx_vcpu} thread."

            # Create a snapshot builder from a microVM.
            snapshot_builder = SnapshotBuilder(vm)
            snapshot_builder.create(disks=[rw_disk],
                                    ssh_key=ssh_key,
                                    snapshot_type=snapshot_type,
                                    target_version=target_version,
                                    use_ramdisk=True)
            metrics = vm.flush_metrics(metrics_fifo)
            vm_name = context.microvm.name()

            if snapshot_type == SnapshotType.FULL:
                value = metrics['latencies_us']['full_create_snapshot']
                baseline = CREATE_LATENCY_BASELINES[PLATFORM][vm_name]['FULL']
            else:
                value = metrics['latencies_us']['diff_create_snapshot']
                baseline = CREATE_LATENCY_BASELINES[PLATFORM][vm_name]['DIFF']

            # Metrics report microseconds; baselines are milliseconds.
            value = value / USEC_IN_MSEC

            assert baseline > value, "CreateSnapshot latency degraded."

            # Fix: report the configured sample count instead of a
            # hard-coded "3", matching _test_snapshot_resume_latency.
            logger.info("Latency {}/{}: {} ms"
                        .format(i + 1, SAMPLE_COUNT, value))
            vm.kill()
155
156
157
def _test_snapshot_resume_latency(context):
    """Measure and check the latency of resuming from a snapshot.

    Boots a base microVM from artifacts, snapshots it, then restores
    SAMPLE_COUNT fresh microVMs from that snapshot and asserts each
    reported 'load_snapshot' latency stays below the per-platform
    baseline from LOAD_LATENCY_BASELINES.

    :param context: TestContext carrying 'logger', 'builder' and
        'snapshot_type' in `custom`, plus microvm/kernel/disk artifacts.
    """
    logger = context.custom['logger']
    vm_builder = context.custom['builder']
    snapshot_type = context.custom['snapshot_type']
    diff_snapshots = snapshot_type == SnapshotType.DIFF

    logger.info("""Measuring snapshot resume({}) latency for microvm: \"{}\",
    kernel {}, disk {} """.format(snapshot_type,
                                  context.microvm.name(),
                                  context.kernel.name(),
                                  context.disk.name()))

    # Create a rw copy artifact.
    rw_disk = context.disk.copy()
    # Get ssh key from read-only artifact.
    ssh_key = context.disk.ssh_key()
    # Create a fresh microvm from artifacts.
    vm_instance = vm_builder.build(kernel=context.kernel,
                                   disks=[rw_disk],
                                   ssh_key=ssh_key,
                                   config=context.microvm,
                                   diff_snapshots=diff_snapshots,
                                   use_ramdisk=True)
    basevm = vm_instance.vm
    basevm.start()
    ssh_connection = net_tools.SSHConnection(basevm.ssh_config)

    # Check if guest works.
    exit_code, _, _ = ssh_connection.execute_command("ls")
    assert exit_code == 0

    logger.info("Create {}.".format(snapshot_type))
    # Create a snapshot builder from a microvm.
    snapshot_builder = SnapshotBuilder(basevm)

    snapshot = snapshot_builder.create([rw_disk.local_path()],
                                       ssh_key,
                                       snapshot_type,
                                       use_ramdisk=True)

    basevm.kill()

    for i in range(SAMPLE_COUNT):
        microvm, metrics_fifo = vm_builder.build_from_snapshot(
            snapshot,
            True,
            diff_snapshots,
            use_ramdisk=True)

        # Attempt to connect to resumed microvm.
        ssh_connection = net_tools.SSHConnection(microvm.ssh_config)

        # Verify if guest can run commands.
        exit_code, _, _ = ssh_connection.execute_command("ls")
        assert exit_code == 0

        value = 0
        # Parse all metric data points in search of load_snapshot time.
        # Fix: decode into a distinct name instead of rebinding `metrics`
        # while iterating it.
        metrics = microvm.get_all_metrics(metrics_fifo)
        for data_point in metrics:
            data = json.loads(data_point)
            cur_value = data['latencies_us']['load_snapshot'] / USEC_IN_MSEC
            if cur_value > 0:
                value = cur_value
                break

        baseline = LOAD_LATENCY_BASELINES[PLATFORM][context.microvm.name()]
        logger.info("Latency {}/{}: {} ms".format(i + 1, SAMPLE_COUNT, value))
        assert baseline > value, "LoadSnapshot latency degraded."

        microvm.kill()
228
229
230
def test_snapshot_create_full_latency(network_config,
                                      bin_cloner_path):
    """Test scenario: Full snapshot create performance measurement."""
    logger = logging.getLogger("snapshot_sequence")
    artifacts = ArtifactCollection(_test_images_s3_bucket())

    # Matrix inputs: Linux 4.14 guest kernels, Ubuntu 18.04 rootfs,
    # and 2vCPU microvms with 256 MB and 512 MB of RAM.
    # TODO: Multiple microvm sizes must be tested in the async pipeline.
    microvm_artifacts = ArtifactSet(artifacts.microvms(keyword="2vcpu_512mb"))
    microvm_artifacts.insert(artifacts.microvms(keyword="2vcpu_256mb"))
    kernel_artifacts = ArtifactSet(artifacts.kernels(keyword="4.14"))
    disk_artifacts = ArtifactSet(artifacts.disks(keyword="ubuntu"))

    # Context consumed by the measurement routine: builder, logger,
    # network config and the FULL snapshot type.
    test_context = TestContext()
    test_context.custom = {
        'builder': MicrovmBuilder(bin_cloner_path),
        'network_config': network_config,
        'logger': logger,
        'snapshot_type': SnapshotType.FULL,
    }

    # Run the create-latency measurement over every artifact combination.
    all_sets = [microvm_artifacts, kernel_artifacts, disk_artifacts]
    test_matrix = TestMatrix(context=test_context, artifact_sets=all_sets)
    test_matrix.run_test(_test_snapshot_create_latency)
263
264
265
def test_snapshot_create_diff_latency(network_config,
                                      bin_cloner_path):
    """Test scenario: Diff snapshot create performance measurement."""
    logger = logging.getLogger("snapshot_sequence")
    artifacts = ArtifactCollection(_test_images_s3_bucket())

    # Matrix inputs: Linux 4.14 guest kernels, Ubuntu 18.04 rootfs,
    # and 2vCPU microvms with 256 MB and 512 MB of RAM.
    # TODO: Multiple microvm sizes must be tested in the async pipeline.
    microvm_artifacts = ArtifactSet(artifacts.microvms(keyword="2vcpu_512mb"))
    microvm_artifacts.insert(artifacts.microvms(keyword="2vcpu_256mb"))
    kernel_artifacts = ArtifactSet(artifacts.kernels(keyword="4.14"))
    disk_artifacts = ArtifactSet(artifacts.disks(keyword="ubuntu"))

    # Context consumed by the measurement routine; identical to the
    # FULL scenario except for the DIFF snapshot type.
    test_context = TestContext()
    test_context.custom = {
        'builder': MicrovmBuilder(bin_cloner_path),
        'network_config': network_config,
        'logger': logger,
        'snapshot_type': SnapshotType.DIFF,
    }

    # Run the create-latency measurement over every artifact combination.
    all_sets = [microvm_artifacts, kernel_artifacts, disk_artifacts]
    test_matrix = TestMatrix(context=test_context, artifact_sets=all_sets)
    test_matrix.run_test(_test_snapshot_create_latency)
298
299
300
def test_snapshot_resume_latency(network_config,
                                 bin_cloner_path):
    """Test scenario: Snapshot load performance measurement."""
    logger = logging.getLogger("snapshot_load")

    artifacts = ArtifactCollection(_test_images_s3_bucket())
    # Matrix inputs: Linux 4.14 guest kernels, Ubuntu 18.04 rootfs,
    # and 2vCPU microvms with 256 MB and 512 MB of RAM.
    # TODO: Multiple microvm sizes must be tested in the async pipeline.
    microvm_artifacts = ArtifactSet(artifacts.microvms(keyword="2vcpu_512mb"))
    microvm_artifacts.insert(artifacts.microvms(keyword="2vcpu_256mb"))

    kernel_artifacts = ArtifactSet(artifacts.kernels(keyword="4.14"))
    disk_artifacts = ArtifactSet(artifacts.disks(keyword="ubuntu"))

    # Context consumed by the measurement routine: builder, logger,
    # network config and the FULL snapshot type (resume is measured
    # from full snapshots).
    test_context = TestContext()
    test_context.custom = {
        'builder': MicrovmBuilder(bin_cloner_path),
        'network_config': network_config,
        'logger': logger,
        'snapshot_type': SnapshotType.FULL,
    }

    # Run the resume-latency measurement over every artifact combination.
    all_sets = [microvm_artifacts, kernel_artifacts, disk_artifacts]
    test_matrix = TestMatrix(context=test_context, artifact_sets=all_sets)
    test_matrix.run_test(_test_snapshot_resume_latency)
335
336
337
def test_older_snapshot_resume_latency(bin_cloner_path):
    """Test scenario: Older snapshot load performance measurement."""
    logger = logging.getLogger("old_snapshot_load")

    builder = MicrovmBuilder(bin_cloner_path)

    artifacts = ArtifactCollection(_test_images_s3_bucket())
    # Fetch all firecracker binaries.
    # With each binary create a snapshot and try to restore in current
    # version.
    firecracker_artifacts = artifacts.firecrackers(
        older_than=get_firecracker_version_from_toml())
    assert len(firecracker_artifacts) > 0

    for firecracker in firecracker_artifacts:
        # Download the old firecracker/jailer pair under test.
        firecracker.download()
        jailer = firecracker.jailer()
        jailer.download()
        # Strip the leading 'v' from the artifact base name.
        fc_version = firecracker.base_name()[1:]
        logger.info("Firecracker version: %s", fc_version)
        logger.info("Source Firecracker: %s", firecracker.local_path())
        logger.info("Source Jailer: %s", jailer.local_path())

        for sample_idx in range(SAMPLE_COUNT):
            # Boot a fresh microvm with the old binary artifacts.
            vm_instance = builder.build_vm_micro(firecracker.local_path(),
                                                 jailer.local_path())
            source_vm = vm_instance.vm
            source_vm.start()
            ssh_connection = net_tools.SSHConnection(source_vm.ssh_config)

            # Sanity-check that the guest is up and responsive.
            exit_code, _, _ = ssh_connection.execute_command("ls")
            assert exit_code == 0

            # The snapshot builder expects disks as paths, not artifacts.
            disks = [disk.local_path() for disk in vm_instance.disks]

            # Snapshot the running guest with the old binary.
            snapshot_builder = SnapshotBuilder(source_vm)
            snapshot = snapshot_builder.create(disks,
                                               vm_instance.ssh_key,
                                               SnapshotType.FULL)

            source_vm.kill()
            # Restore the snapshot with the current binary under test.
            microvm, metrics_fifo = builder.build_from_snapshot(snapshot,
                                                                True,
                                                                False)
            # Attempt to connect to resumed microvm.
            ssh_connection = net_tools.SSHConnection(microvm.ssh_config)
            # Check if guest still runs commands.
            exit_code, _, _ = ssh_connection.execute_command("dmesg")
            assert exit_code == 0

            # Scan metric data points for the first positive load time.
            load_latency_ms = 0
            for data_point in microvm.get_all_metrics(metrics_fifo):
                decoded = json.loads(data_point)
                raw_usec = decoded['latencies_us']['load_snapshot']
                if raw_usec > 0:
                    load_latency_ms = raw_usec / USEC_IN_MSEC
                    break

            baseline = LOAD_LATENCY_BASELINES[PLATFORM]['2vcpu_512mb.json']
            logger.info("Latency %s/%s: %s ms",
                        sample_idx + 1, SAMPLE_COUNT, load_latency_ms)
            assert baseline > load_latency_ms, \
                "LoadSnapshot latency degraded."
            microvm.kill()
407
408