Path: blob/master/tools/testing/selftests/kvm/rseq_test.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Include rseq.c without _GNU_SOURCE defined, before including any headers, so
 * that rseq.c is compiled with its configuration, not KVM selftests' config.
 */
#undef _GNU_SOURCE
#include "../rseq/rseq.c"
#define _GNU_SOURCE

#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <syscall.h>
#include <sys/ioctl.h>
#include <sys/sysinfo.h>
#include <asm/barrier.h>
#include <linux/atomic.h>
#include <linux/rseq.h>
#include <linux/unistd.h>

#include "kvm_util.h"
#include "processor.h"
#include "test_util.h"
#include "ucall_common.h"

/*
 * Any bug related to task migration is likely to be timing-dependent; perform
 * a large number of migrations to reduce the odds of a false negative.
 */
#define NR_TASK_MIGRATIONS 100000

static pthread_t migration_thread;
static cpu_set_t possible_mask;
static int min_cpu, max_cpu;
static bool done;
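
/*
 * Sequence count used by the migration thread to signal the reader: the count
 * is bumped before and after each sched_setaffinity() call, so an odd value
 * means a migration may be in-progress, and a changed value means a migration
 * occurred since the count was last read (see migration_worker() and the
 * read-side loop in main()).
 */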
static atomic_t seq_cnt;

static void guest_code(void)
{
        for (;;)
                GUEST_SYNC(0);
}

static int next_cpu(int cpu)
{
        /*
         * Advance to the next CPU, skipping those that weren't in the original
         * affinity set.  Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's
         * data storage is considered opaque.  Note, if this task is pinned to
         * a small set of discontiguous CPUs, e.g. 2 and 1023, this loop will
         * burn a lot of cycles and the test will take longer than normal to
         * complete.
         */
        do {
                cpu++;
                if (cpu > max_cpu) {
                        cpu = min_cpu;
                        TEST_ASSERT(CPU_ISSET(cpu, &possible_mask),
                                    "Min CPU = %d must always be usable", cpu);
                        break;
                }
        } while (!CPU_ISSET(cpu, &possible_mask));

        return cpu;
}

static void *migration_worker(void *__rseq_tid)
{
        pid_t rseq_tid = (pid_t)(unsigned long)__rseq_tid;
        cpu_set_t allowed_mask;
        int r, i, cpu;

        CPU_ZERO(&allowed_mask);

        for (i = 0, cpu = min_cpu; i < NR_TASK_MIGRATIONS; i++, cpu = next_cpu(cpu)) {
                CPU_SET(cpu, &allowed_mask);

                /*
                 * Bump the sequence count twice to allow the reader to detect
                 * that a migration may have occurred in between rseq and sched
                 * CPU ID reads.  An odd sequence count indicates a migration
                 * is in-progress, while a completely different count indicates
                 * a migration occurred since the count was last read.
                 */
                atomic_inc(&seq_cnt);

                /*
                 * Ensure the odd count is visible while getcpu() isn't
                 * stable, i.e. while changing affinity is in-progress.
                 */
                smp_wmb();
                r = sched_setaffinity(rseq_tid, sizeof(allowed_mask), &allowed_mask);
                TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)",
                            errno, strerror(errno));
                smp_wmb();
                atomic_inc(&seq_cnt);

                CPU_CLR(cpu, &allowed_mask);

                /*
                 * Wait 1-10us before proceeding to the next iteration and,
                 * more specifically, before bumping seq_cnt again.  A delay is
                 * needed on three fronts:
                 *
                 *  1. To allow sched_setaffinity() to prompt migration before
                 *     ioctl(KVM_RUN) enters the guest so that TIF_NOTIFY_RESUME
                 *     (or TIF_NEED_RESCHED, which indirectly leads to handling
                 *     NOTIFY_RESUME) is handled in KVM context.
                 *
                 *     If NOTIFY_RESUME/NEED_RESCHED is set after KVM enters
                 *     the guest, the guest will trigger an IO/MMIO exit all the
                 *     way to userspace and the TIF flags will be handled by
                 *     the generic "exit to userspace" logic, not by KVM.  The
                 *     exit to userspace is necessary to give the test a chance
                 *     to check the rseq CPU ID (see #2).
                 *
                 *     Alternatively, guest_code() could include an instruction
                 *     to trigger an exit that is handled by KVM, but any such
                 *     exit requires architecture specific code.
                 *
                 *  2. To let ioctl(KVM_RUN) make its way back to the test
                 *     before the next round of migration.  The test's check on
                 *     the rseq CPU ID must wait for migration to complete in
                 *     order to avoid false positives, thus any kernel rseq bug
                 *     will be missed if the next migration starts before the
                 *     check completes.
                 *
                 *  3. To ensure the read-side makes efficient forward progress,
                 *     e.g. if getcpu() involves a syscall.  Stalling the
                 *     read-side means the test will spend more time waiting for
                 *     getcpu() to stabilize and less time trying to hit the
                 *     timing-dependent bug.
                 *
                 * Because any bug in this area is likely to be timing-dependent,
                 * run with a range of delays at 1us intervals from 1us to 10us
                 * as a best effort to avoid tuning the test to the point where
                 * it can hit _only_ the original bug and not detect future
                 * regressions.
                 *
                 * The original bug can reproduce with a delay up to ~500us on
                 * x86-64, but starts to require more iterations to reproduce
                 * as the delay creeps above ~10us, and the average runtime of
                 * each iteration obviously increases as well.  Cap the delay
                 * at 10us to keep test runtime reasonable while minimizing
                 * potential coverage loss.
                 *
                 * The lower bound for reproducing the bug is likely below 1us,
                 * e.g. failures occur on x86-64 with nanosleep(0), but at that
                 * point the overhead of the syscall likely dominates the delay.
                 * Use usleep() for simplicity and to avoid unnecessary kernel
                 * dependencies.
                 */
                usleep((i % 10) + 1);
        }
        done = true;
        return NULL;
}
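
/*
 * Find the min and max CPUs in this task's original affinity mask, and
 * require at least two usable CPUs so that task migration is possible at all.
 */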
static void calc_min_max_cpu(void)
{
        int i, cnt, nproc;

        TEST_REQUIRE(CPU_COUNT(&possible_mask) >= 2);

        /*
         * CPU_SET doesn't provide a FOR_EACH helper; get the min/max CPU that
         * this task is affined to in order to reduce the time spent querying
         * unusable CPUs, e.g. if this task is pinned to a small percentage of
         * total CPUs.
         */
        nproc = get_nprocs_conf();
        min_cpu = -1;
        max_cpu = -1;
        cnt = 0;

        for (i = 0; i < nproc; i++) {
                if (!CPU_ISSET(i, &possible_mask))
                        continue;
                if (min_cpu == -1)
                        min_cpu = i;
                max_cpu = i;
                cnt++;
        }

        __TEST_REQUIRE(cnt >= 2,
                       "Only one usable CPU, task migration not possible");
}

static void help(const char *name)
{
        puts("");
        printf("usage: %s [-h] [-u] [-l latency]\n", name);
        printf(" -u: Don't sanity check the number of successful KVM_RUNs\n");
        printf(" -l: Set /dev/cpu_dma_latency to suppress deep sleep states\n");
        puts("");
        exit(0);
}

int main(int argc, char *argv[])
{
        int r, i, snapshot, opt, fd = -1, latency = -1;
        bool skip_sanity_check = false;
        struct kvm_vm *vm;
        struct kvm_vcpu *vcpu;
        u32 cpu, rseq_cpu;

        while ((opt = getopt(argc, argv, "hl:u")) != -1) {
                switch (opt) {
                case 'u':
                        skip_sanity_check = true;
                        break;
                case 'l':
                        latency = atoi_paranoid(optarg);
                        break;
                case 'h':
                default:
                        help(argv[0]);
                        break;
                }
        }

        r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
        TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
                    strerror(errno));

        calc_min_max_cpu();

        r = rseq_register_current_thread();
        TEST_ASSERT(!r, "rseq_register_current_thread failed, errno = %d (%s)",
                    errno, strerror(errno));

        /*
         * Create and run a dummy VM that immediately exits to userspace via
         * GUEST_SYNC, while concurrently migrating the process by setting its
         * CPU affinity.
         */
        vm = vm_create_with_one_vcpu(&vcpu, guest_code);
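
        /*
         * Pass this (main) thread's TID to the migration worker so that
         * sched_setaffinity() migrates the task doing KVM_RUN, not the
         * worker itself.
         */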
        pthread_create(&migration_thread, NULL, migration_worker,
                       (void *)(unsigned long)syscall(SYS_gettid));

        if (latency >= 0) {
                /*
                 * Writes to cpu_dma_latency persist only while the file is
                 * open, i.e. it allows userspace to provide guaranteed latency
                 * while running a workload.  Keep the file open until the test
                 * completes, otherwise writing cpu_dma_latency is meaningless.
                 */
                fd = open("/dev/cpu_dma_latency", O_RDWR);
                TEST_ASSERT(fd >= 0, __KVM_SYSCALL_ERROR("open() /dev/cpu_dma_latency", fd));

                r = write(fd, &latency, 4);
                TEST_ASSERT(r >= 1, "Error setting /dev/cpu_dma_latency");
        }

        for (i = 0; !done; i++) {
                vcpu_run(vcpu);
                TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC,
                            "Guest failed?");

                /*
                 * Verify rseq's CPU matches sched's CPU.  Ensure migration
                 * doesn't occur between getcpu() and reading the rseq cpu_id
                 * by rereading both if the sequence count changes, or if the
                 * count is odd (migration in-progress).
                 */
                do {
                        /*
                         * Drop bit 0 to force a mismatch if the count is odd,
                         * i.e. if a migration is in-progress.
                         */
                        snapshot = atomic_read(&seq_cnt) & ~1;

                        /*
                         * Ensure calling getcpu() and reading rseq.cpu_id
                         * complete in a single "no migration" window, i.e. are
                         * not reordered across the seq_cnt reads.
                         */
                        smp_rmb();
                        r = sys_getcpu(&cpu, NULL);
                        TEST_ASSERT(!r, "getcpu failed, errno = %d (%s)",
                                    errno, strerror(errno));
                        rseq_cpu = rseq_current_cpu_raw();
                        smp_rmb();
                } while (snapshot != atomic_read(&seq_cnt));

                TEST_ASSERT(rseq_cpu == cpu,
                            "rseq CPU = %d, sched CPU = %d", rseq_cpu, cpu);
        }

        if (fd > 0)
                close(fd);

        /*
         * Sanity check that the test was able to enter the guest a reasonable
         * number of times, e.g. didn't get stalled too often/long waiting for
         * getcpu() to stabilize.  A 2:1 migration:KVM_RUN ratio is a fairly
         * conservative ratio on x86-64, which can do _more_ KVM_RUNs than
         * migrations given the 1us+ delay in the migration task.
         *
         * Another reason the migration:KVM_RUN ratio may be small is that, on
         * systems with a large low power mode wakeup latency, the scheduler
         * quite often cannot wake up the target CPU before the vCPU thread is
         * scheduled to another CPU.
         */
        TEST_ASSERT(skip_sanity_check || i > (NR_TASK_MIGRATIONS / 2),
                    "Only performed %d KVM_RUNs, task stalled too much?\n\n"
                    "  Try disabling deep sleep states to reduce CPU wakeup latency,\n"
                    "  e.g. via cpuidle.off=1 or via -l <latency>, or run with -u to\n"
                    "  disable this sanity check.", i);

        pthread_join(migration_thread, NULL);

        kvm_vm_free(vm);

        rseq_unregister_current_thread();

        return 0;
}