Path: tools/testing/selftests/kvm/access_tracking_perf_test.c
// SPDX-License-Identifier: GPL-2.0
/*
 * access_tracking_perf_test
 *
 * Copyright (C) 2021, Google, Inc.
 *
 * This test measures the performance effects of KVM's access tracking.
 * Access tracking is driven by the MMU notifiers test_young, clear_young, and
 * clear_flush_young. These notifiers do not have a direct userspace API,
 * however the clear_young notifier can be triggered either by
 * 1. marking pages as idle in /sys/kernel/mm/page_idle/bitmap OR
 * 2. adding a new MGLRU generation using the lru_gen debugfs file.
 * This test leverages page_idle to enable access tracking on guest memory
 * unless MGLRU is enabled, in which case MGLRU is used.
 *
 * To measure performance this test runs a VM with a configurable number of
 * vCPUs that each touch every page in disjoint regions of memory. Performance
 * is measured in the time it takes all vCPUs to finish touching their
 * predefined region.
 *
 * Note that a deterministic correctness test of access tracking is not possible
 * by using page_idle or MGLRU aging as it exists today. This is for a few
 * reasons:
 *
 * 1. page_idle and MGLRU only issue clear_young notifiers, which lack a TLB flush.
 *    This means subsequent guest accesses are not guaranteed to see page table
 *    updates made by KVM until some time in the future.
 *
 * 2. page_idle only operates on LRU pages. Newly allocated pages are not
 *    immediately allocated to LRU lists. Instead they are held in a "pagevec",
 *    which is drained to LRU lists some time in the future. There is no
 *    userspace API to force this drain to occur.
 *
 * These limitations are worked around in this test by using a large enough
 * region of memory for each vCPU such that the number of translations cached in
 * the TLB and the number of pages held in pagevecs are a small fraction of the
 * overall workload. And if either of those conditions is not true (for example
 * in nesting, where TLB size is unlimited) this test will print a warning
 * rather than silently passing.
 */
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>

#include "kvm_util.h"
#include "test_util.h"
#include "memstress.h"
#include "guest_modes.h"
#include "processor.h"
#include "ucall_common.h"

#include "cgroup_util.h"
#include "lru_gen_util.h"

static const char *TEST_MEMCG_NAME = "access_tracking_perf_test";

/* Global variable used to synchronize all of the vCPU threads. */
static int iteration;

/* The cgroup memory controller root. Needed for lru_gen-based aging. */
char cgroup_root[PATH_MAX];

/* Defines what vCPU threads should do during a given iteration. */
static enum {
	/* Run the vCPU to access all its memory. */
	ITERATION_ACCESS_MEMORY,
	/* Mark the vCPU's memory idle in page_idle. */
	ITERATION_MARK_IDLE,
} iteration_work;

/* The iteration that was last completed by each vCPU. */
static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];

/* Whether to overlap the regions of memory vCPUs access. */
static bool overlap_memory_access;

/*
 * If the test should only warn if there are too many idle pages (i.e., it is
 * expected).
 * -1: Not yet set.
 *  0: We do not expect too many idle pages, so FAIL if too many idle pages.
 *  1: Having too many idle pages is expected, so merely print a warning if
 *     too many idle pages are found.
 */
static int idle_pages_warn_only = -1;

/* Whether or not to use MGLRU instead of page_idle for access tracking */
static bool use_lru_gen;

/* Total number of pages to expect in the memcg after touching everything */
static long test_pages;

/* Last generation we found the pages in */
static int lru_gen_last_gen = -1;

struct test_params {
	/* The backing source for the region of memory. */
	enum vm_mem_backing_src_type backing_src;

	/* The amount of memory to allocate for each vCPU. */
	uint64_t vcpu_memory_bytes;

	/* The number of vCPUs to create in the VM. */
	int nr_vcpus;
};

static uint64_t pread_uint64(int fd, const char *filename, uint64_t index)
{
	uint64_t value;
	off_t offset = index * sizeof(value);

	TEST_ASSERT(pread(fd, &value, sizeof(value), offset) == sizeof(value),
		    "pread from %s offset 0x%" PRIx64 " failed!",
		    filename, offset);

	return value;
}

#define PAGEMAP_PRESENT (1ULL << 63)
#define PAGEMAP_PFN_MASK ((1ULL << 55) - 1)

static uint64_t lookup_pfn(int pagemap_fd, struct kvm_vm *vm, uint64_t gva)
{
	uint64_t hva = (uint64_t) addr_gva2hva(vm, gva);
	uint64_t entry;
	uint64_t pfn;

	entry = pread_uint64(pagemap_fd, "pagemap", hva / getpagesize());
	if (!(entry & PAGEMAP_PRESENT))
		return 0;

	pfn = entry & PAGEMAP_PFN_MASK;
	__TEST_REQUIRE(pfn, "Looking up PFNs requires CAP_SYS_ADMIN");

	return pfn;
}

static bool is_page_idle(int page_idle_fd, uint64_t pfn)
{
	uint64_t bits = pread_uint64(page_idle_fd, "page_idle", pfn / 64);

	return !!((bits >> (pfn % 64)) & 1);
}

static void mark_page_idle(int page_idle_fd, uint64_t pfn)
{
	uint64_t bits = 1ULL << (pfn % 64);

	TEST_ASSERT(pwrite(page_idle_fd, &bits, 8, 8 * (pfn / 64)) == 8,
		    "Set page_idle bits for PFN 0x%" PRIx64, pfn);
}

static void too_many_idle_pages(long idle_pages, long total_pages, int vcpu_idx)
{
	char prefix[18] = {};

	if (vcpu_idx >= 0)
		snprintf(prefix, 18, "vCPU%d: ", vcpu_idx);

	TEST_ASSERT(idle_pages_warn_only,
		    "%sToo many pages still idle (%lu out of %lu)",
		    prefix, idle_pages, total_pages);

	printf("WARNING: %sToo many pages still idle (%lu out of %lu), "
	       "this will affect performance results.\n",
	       prefix, idle_pages, total_pages);
}

static void pageidle_mark_vcpu_memory_idle(struct kvm_vm *vm,
					   struct memstress_vcpu_args *vcpu_args)
{
	int vcpu_idx = vcpu_args->vcpu_idx;
	uint64_t base_gva = vcpu_args->gva;
	uint64_t pages = vcpu_args->pages;
	uint64_t page;
	uint64_t still_idle = 0;
	uint64_t no_pfn = 0;
	int page_idle_fd;
	int pagemap_fd;

	/* If vCPUs are using an overlapping region, let vCPU 0 mark it idle. */
	if (overlap_memory_access && vcpu_idx)
		return;

	page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
	TEST_ASSERT(page_idle_fd > 0, "Failed to open page_idle.");

	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	TEST_ASSERT(pagemap_fd > 0, "Failed to open pagemap.");

	for (page = 0; page < pages; page++) {
		uint64_t gva = base_gva + page * memstress_args.guest_page_size;
		uint64_t pfn = lookup_pfn(pagemap_fd, vm, gva);

		if (!pfn) {
			no_pfn++;
			continue;
		}

		if (is_page_idle(page_idle_fd, pfn)) {
			still_idle++;
			continue;
		}

		mark_page_idle(page_idle_fd, pfn);
	}

	/*
	 * Assumption: Less than 1% of pages are going to be swapped out from
	 * under us during this test.
	 */
	TEST_ASSERT(no_pfn < pages / 100,
		    "vCPU %d: No PFN for %" PRIu64 " out of %" PRIu64 " pages.",
		    vcpu_idx, no_pfn, pages);

	/*
	 * Check that at least 90% of memory has been marked idle (the rest
	 * might not be marked idle because the pages have not yet made it to an
	 * LRU list or the translations are still cached in the TLB). 90% is
	 * arbitrary; high enough that we ensure most memory access went through
	 * access tracking but low enough as to not make the test too brittle
	 * over time and across architectures.
	 */
	if (still_idle >= pages / 10)
		too_many_idle_pages(still_idle, pages,
				    overlap_memory_access ? -1 : vcpu_idx);

	close(page_idle_fd);
	close(pagemap_fd);
}

int find_generation(struct memcg_stats *stats, long total_pages)
{
	/*
	 * For finding the generation that contains our pages, use the same
	 * 90% threshold that page_idle uses.
	 */
	int gen = lru_gen_find_generation(stats, total_pages * 9 / 10);

	if (gen >= 0)
		return gen;

	if (!idle_pages_warn_only) {
		TEST_FAIL("Could not find a generation with 90%% of guest memory (%ld pages).",
			  total_pages * 9 / 10);
		return gen;
	}

	/*
	 * We couldn't find a generation with 90% of guest memory, which can
	 * happen if access tracking is unreliable. Simply look for a majority
	 * of pages.
	 */
	puts("WARNING: Couldn't find a generation with 90% of guest memory. "
	     "Performance results may not be accurate.");
	gen = lru_gen_find_generation(stats, total_pages / 2);
	TEST_ASSERT(gen >= 0,
		    "Could not find a generation with 50%% of guest memory (%ld pages).",
		    total_pages / 2);
	return gen;
}

static void lru_gen_mark_memory_idle(struct kvm_vm *vm)
{
	struct timespec ts_start;
	struct timespec ts_elapsed;
	struct memcg_stats stats;
	int new_gen;

	/* Make a new generation */
	clock_gettime(CLOCK_MONOTONIC, &ts_start);
	lru_gen_do_aging(&stats, TEST_MEMCG_NAME);
	ts_elapsed = timespec_elapsed(ts_start);

	/* Check the generation again */
	new_gen = find_generation(&stats, test_pages);

	/*
	 * This function should only be invoked with newly-accessed pages,
	 * so pages should always move to a newer generation.
	 */
	if (new_gen <= lru_gen_last_gen) {
		/* We did not move to a newer generation. */
		long idle_pages = lru_gen_sum_memcg_stats_for_gen(lru_gen_last_gen,
								  &stats);

		too_many_idle_pages(min_t(long, idle_pages, test_pages),
				    test_pages, -1);
	}
	pr_info("%-30s: %ld.%09lds\n",
		"Mark memory idle (lru_gen)", ts_elapsed.tv_sec,
		ts_elapsed.tv_nsec);
	lru_gen_last_gen = new_gen;
}

static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_ucall)
{
	struct ucall uc;
	uint64_t actual_ucall = get_ucall(vcpu, &uc);

	TEST_ASSERT(expected_ucall == actual_ucall,
		    "Guest exited unexpectedly (expected ucall %" PRIu64
		    ", got %" PRIu64 ")",
		    expected_ucall, actual_ucall);
}

static bool spin_wait_for_next_iteration(int *current_iteration)
{
	int last_iteration = *current_iteration;

	do {
		if (READ_ONCE(memstress_args.stop_vcpus))
			return false;

		*current_iteration = READ_ONCE(iteration);
	} while (last_iteration == *current_iteration);

	return true;
}

static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args)
{
	struct kvm_vcpu *vcpu = vcpu_args->vcpu;
	struct kvm_vm *vm = memstress_args.vm;
	int vcpu_idx = vcpu_args->vcpu_idx;
	int current_iteration = 0;

	while (spin_wait_for_next_iteration(&current_iteration)) {
		switch (READ_ONCE(iteration_work)) {
		case ITERATION_ACCESS_MEMORY:
			vcpu_run(vcpu);
			assert_ucall(vcpu, UCALL_SYNC);
			break;
		case ITERATION_MARK_IDLE:
			pageidle_mark_vcpu_memory_idle(vm, vcpu_args);
			break;
		}

		vcpu_last_completed_iteration[vcpu_idx] = current_iteration;
	}
}

static void spin_wait_for_vcpu(int vcpu_idx, int target_iteration)
{
	while (READ_ONCE(vcpu_last_completed_iteration[vcpu_idx]) !=
	       target_iteration) {
		continue;
	}
}

/* The type of memory accesses to perform in the VM. */
enum access_type {
	ACCESS_READ,
	ACCESS_WRITE,
};

static void run_iteration(struct kvm_vm *vm, int nr_vcpus, const char *description)
{
	struct timespec ts_start;
	struct timespec ts_elapsed;
	int next_iteration, i;

	/* Kick off the vCPUs by incrementing iteration. */
	next_iteration = ++iteration;

	clock_gettime(CLOCK_MONOTONIC, &ts_start);

	/* Wait for all vCPUs to finish the iteration. */
	for (i = 0; i < nr_vcpus; i++)
		spin_wait_for_vcpu(i, next_iteration);

	ts_elapsed = timespec_elapsed(ts_start);
	pr_info("%-30s: %ld.%09lds\n",
		description, ts_elapsed.tv_sec, ts_elapsed.tv_nsec);
}

static void access_memory(struct kvm_vm *vm, int nr_vcpus,
			  enum access_type access, const char *description)
{
	memstress_set_write_percent(vm, (access == ACCESS_READ) ? 0 : 100);
	iteration_work = ITERATION_ACCESS_MEMORY;
	run_iteration(vm, nr_vcpus, description);
}

static void mark_memory_idle(struct kvm_vm *vm, int nr_vcpus)
{
	if (use_lru_gen)
		return lru_gen_mark_memory_idle(vm);

	/*
	 * Even though this parallelizes the work across vCPUs, this is still a
	 * very slow operation because page_idle forces the test to mark one pfn
	 * at a time and the clear_young notifier may serialize on the KVM MMU
	 * lock.
	 */
	pr_debug("Marking VM memory idle (slow)...\n");
	iteration_work = ITERATION_MARK_IDLE;
	run_iteration(vm, nr_vcpus, "Mark memory idle (page_idle)");
}

static void run_test(enum vm_guest_mode mode, void *arg)
{
	struct test_params *params = arg;
	struct kvm_vm *vm;
	int nr_vcpus = params->nr_vcpus;

	vm = memstress_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1,
				 params->backing_src, !overlap_memory_access);

	/*
	 * If guest_page_size is larger than the host's page size, the
	 * guest (memstress) will only fault in a subset of the host's pages.
	 */
	test_pages = params->nr_vcpus * params->vcpu_memory_bytes /
		     max(memstress_args.guest_page_size,
			 (uint64_t)getpagesize());

	memstress_start_vcpu_threads(nr_vcpus, vcpu_thread_main);

	pr_info("\n");
	access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory");

	if (use_lru_gen) {
		struct memcg_stats stats;

		/*
		 * Do a page table scan now. Following initial population, aging
		 * may not cause the pages to move to a newer generation. Do
		 * an aging pass now so that future aging passes always move
		 * pages to a newer generation.
		 */
		printf("Initial aging pass (lru_gen)\n");
		lru_gen_do_aging(&stats, TEST_MEMCG_NAME);
		TEST_ASSERT(lru_gen_sum_memcg_stats(&stats) >= test_pages,
			    "Not all pages accounted for (looking for %ld). "
			    "Was the memcg set up correctly?", test_pages);
		access_memory(vm, nr_vcpus, ACCESS_WRITE, "Re-populating memory");
		lru_gen_read_memcg_stats(&stats, TEST_MEMCG_NAME);
		lru_gen_last_gen = find_generation(&stats, test_pages);
	}

	/* As a control, read and write to the populated memory first. */
	access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to populated memory");
	access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from populated memory");

	/* Repeat on memory that has been marked as idle. */
	mark_memory_idle(vm, nr_vcpus);
	access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to idle memory");
	mark_memory_idle(vm, nr_vcpus);
	access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory");

	memstress_join_vcpu_threads(nr_vcpus);
	memstress_destroy_vm(vm);
}

static int access_tracking_unreliable(void)
{
#ifdef __x86_64__
	/*
	 * When running nested, the TLB size may be effectively unlimited (for
	 * example, this is the case when running on KVM L0), and KVM doesn't
	 * explicitly flush the TLB when aging SPTEs.
	 * As a result, more pages are cached and the guest won't see the
	 * "idle" bit cleared.
	 */
	if (this_cpu_has(X86_FEATURE_HYPERVISOR)) {
		puts("Skipping idle page count sanity check, because the test is run nested");
		return 1;
	}
#endif
	/*
	 * When NUMA balancing is enabled, guest memory will be unmapped to get
	 * NUMA faults, dropping the Accessed bits.
	 */
	if (is_numa_balancing_enabled()) {
		puts("Skipping idle page count sanity check, because NUMA balancing is enabled");
		return 1;
	}
	return 0;
}

static int run_test_for_each_guest_mode(const char *cgroup, void *arg)
{
	for_each_guest_mode(run_test, arg);
	return 0;
}

static void help(char *name)
{
	puts("");
	printf("usage: %s [-h] [-m mode] [-b vcpu_bytes] [-v vcpus] [-o] [-s mem_type]\n",
	       name);
	puts("");
	printf(" -h: Display this help message.");
	guest_modes_help();
	printf(" -b: specify the size of the memory region which should be\n"
	       "     dirtied by each vCPU. e.g. 10M or 3G.\n"
	       "     (default: 1G)\n");
	printf(" -v: specify the number of vCPUs to run.\n");
	printf(" -o: Overlap guest memory accesses instead of partitioning\n"
	       "     them into a separate region of memory for each vCPU.\n");
	printf(" -w: Control whether the test warns or fails if more than 10%%\n"
	       "     of pages are still seen as idle/old after accessing guest\n"
	       "     memory. >0 == warn only, 0 == fail, <0 == auto. For auto\n"
	       "     mode, the test fails by default, but switches to warn only\n"
	       "     if NUMA balancing is enabled or the test detects it's running\n"
	       "     in a VM.\n");
	backing_src_help("-s");
	puts("");
	exit(0);
}

void destroy_cgroup(char *cg)
{
	printf("Destroying cgroup: %s\n", cg);
}

int main(int argc, char *argv[])
{
	struct test_params params = {
		.backing_src = DEFAULT_VM_MEM_SRC,
		.vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE,
		.nr_vcpus = 1,
	};
	char *new_cg = NULL;
	int page_idle_fd;
	int opt;

	guest_modes_append_default();

	while ((opt = getopt(argc, argv, "hm:b:v:os:w:")) != -1) {
		switch (opt) {
		case 'm':
			guest_modes_cmdline(optarg);
			break;
		case 'b':
			params.vcpu_memory_bytes = parse_size(optarg);
			break;
		case 'v':
			params.nr_vcpus = atoi_positive("Number of vCPUs", optarg);
			break;
		case 'o':
			overlap_memory_access = true;
			break;
		case 's':
			params.backing_src = parse_backing_src_type(optarg);
			break;
		case 'w':
			idle_pages_warn_only =
				atoi_non_negative("Idle pages warning", optarg);
			break;
		case 'h':
		default:
			help(argv[0]);
			break;
		}
	}

	if (idle_pages_warn_only == -1)
		idle_pages_warn_only = access_tracking_unreliable();

	if (lru_gen_usable()) {
		bool cg_created = true;
		int ret;

		puts("Using lru_gen for aging");
		use_lru_gen = true;

		if (cg_find_controller_root(cgroup_root, sizeof(cgroup_root), "memory"))
			ksft_exit_skip("Cannot find memory cgroup controller\n");

		new_cg = cg_name(cgroup_root, TEST_MEMCG_NAME);
		printf("Creating cgroup: %s\n", new_cg);
		if (cg_create(new_cg)) {
			if (errno == EEXIST) {
				printf("Found existing cgroup");
				cg_created = false;
			} else {
				ksft_exit_skip("could not create new cgroup: %s\n", new_cg);
			}
		}

		/*
		 * This will fork off a new process to run the test within
		 * a new memcg, so we need to properly propagate the return
		 * value up.
		 */
		ret = cg_run(new_cg, &run_test_for_each_guest_mode, &params);
		if (cg_created)
			cg_destroy(new_cg);
		if (ret < 0)
			TEST_FAIL("child did not spawn or was abnormally killed");
		if (ret)
			return ret;
	} else {
		page_idle_fd = __open_path_or_exit("/sys/kernel/mm/page_idle/bitmap", O_RDWR,
						   "Is CONFIG_IDLE_PAGE_TRACKING enabled?");
		close(page_idle_fd);

		puts("Using page_idle for aging");
		run_test_for_each_guest_mode(NULL, &params);
	}

	return 0;
}