/* SPDX-License-Identifier: GPL-2.0 */1/*2* A minimal userland scheduler.3*4* In terms of scheduling, this provides two different types of behaviors:5* 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity.6* All such tasks are direct-dispatched from the kernel, and are never7* enqueued in user space.8* 2. A primitive vruntime scheduler that is implemented in user space, for all9* other tasks.10*11* Some parts of this example user space scheduler could be implemented more12* efficiently using more complex and sophisticated data structures. For13* example, rather than using BPF_MAP_TYPE_QUEUE's,14* BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between15* user space and kernel space. Similarly, we use a simple vruntime-sorted list16* in user space, but an rbtree could be used instead.17*18* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.19* Copyright (c) 2022 Tejun Heo <[email protected]>20* Copyright (c) 2022 David Vernet <[email protected]>21*/22#include <scx/common.bpf.h>23#include "scx_userland.h"2425/*26* Maximum amount of tasks enqueued/dispatched between kernel and user-space.27*/28#define MAX_ENQUEUED_TASKS 40962930char _license[] SEC("license") = "GPL";3132const volatile s32 usersched_pid;3334/* !0 for veristat, set during init */35const volatile u32 num_possible_cpus = 64;3637/* Stats that are printed by user space. */38u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues;3940/*41* Number of tasks that are queued for scheduling.42*43* This number is incremented by the BPF component when a task is queued to the44* user-space scheduler and it must be decremented by the user-space scheduler45* when a task is consumed.46*/47volatile u64 nr_queued;4849/*50* Number of tasks that are waiting for scheduling.51*52* This number must be updated by the user-space scheduler to keep track if53* there is still some scheduling work to do.54*/55volatile u64 nr_scheduled;5657UEI_DEFINE(uei);5859/*60* The map containing tasks that are enqueued in user space from the kernel.61*62* This map is drained by the user space scheduler.63*/64struct {65__uint(type, BPF_MAP_TYPE_QUEUE);66__uint(max_entries, MAX_ENQUEUED_TASKS);67__type(value, struct scx_userland_enqueued_task);68} enqueued SEC(".maps");6970/*71* The map containing tasks that are dispatched to the kernel from user space.72*73* Drained by the kernel in userland_dispatch().74*/75struct {76__uint(type, BPF_MAP_TYPE_QUEUE);77__uint(max_entries, MAX_ENQUEUED_TASKS);78__type(value, s32);79} dispatched SEC(".maps");8081/* Per-task scheduling context */82struct task_ctx {83bool force_local; /* Dispatch directly to local DSQ */84};8586/* Map that contains task-local storage. */87struct {88__uint(type, BPF_MAP_TYPE_TASK_STORAGE);89__uint(map_flags, BPF_F_NO_PREALLOC);90__type(key, int);91__type(value, struct task_ctx);92} task_ctx_stor SEC(".maps");9394/*95* Flag used to wake-up the user-space scheduler.96*/97static volatile u32 usersched_needed;9899/*100* Set user-space scheduler wake-up flag (equivalent to an atomic release101* operation).102*/103static void set_usersched_needed(void)104{105__sync_fetch_and_or(&usersched_needed, 1);106}107108/*109* Check and clear user-space scheduler wake-up flag (equivalent to an atomic110* acquire operation).111*/112static bool test_and_clear_usersched_needed(void)113{114return __sync_fetch_and_and(&usersched_needed, 0) == 1;115}116117static bool is_usersched_task(const struct task_struct *p)118{119return p->pid == usersched_pid;120}121122static bool keep_in_kernel(const struct task_struct *p)123{124return p->nr_cpus_allowed < num_possible_cpus;125}126127static struct task_struct *usersched_task(void)128{129struct task_struct *p;130131p = bpf_task_from_pid(usersched_pid);132/*133* Should never happen -- the usersched task should always be managed134* by sched_ext.135*/136if (!p)137scx_bpf_error("Failed to find usersched task %d", usersched_pid);138139return p;140}141142s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p,143s32 prev_cpu, u64 wake_flags)144{145if (keep_in_kernel(p)) {146s32 cpu;147struct task_ctx *tctx;148149tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);150if (!tctx) {151scx_bpf_error("Failed to look up task-local storage for %s", p->comm);152return -ESRCH;153}154155if (p->nr_cpus_allowed == 1 ||156scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {157tctx->force_local = true;158return prev_cpu;159}160161cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);162if (cpu >= 0) {163tctx->force_local = true;164return cpu;165}166}167168return prev_cpu;169}170171static void dispatch_user_scheduler(void)172{173struct task_struct *p;174175p = usersched_task();176if (p) {177scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);178bpf_task_release(p);179}180}181182static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags)183{184struct scx_userland_enqueued_task task = {};185186task.pid = p->pid;187task.sum_exec_runtime = p->se.sum_exec_runtime;188task.weight = p->scx.weight;189190if (bpf_map_push_elem(&enqueued, &task, 0)) {191/*192* If we fail to enqueue the task in user space, put it193* directly on the global DSQ.194*/195__sync_fetch_and_add(&nr_failed_enqueues, 1);196scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);197} else {198__sync_fetch_and_add(&nr_user_enqueues, 1);199set_usersched_needed();200}201}202203void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags)204{205if (keep_in_kernel(p)) {206u64 dsq_id = SCX_DSQ_GLOBAL;207struct task_ctx *tctx;208209tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);210if (!tctx) {211scx_bpf_error("Failed to lookup task ctx for %s", p->comm);212return;213}214215if (tctx->force_local)216dsq_id = SCX_DSQ_LOCAL;217tctx->force_local = false;218scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags);219__sync_fetch_and_add(&nr_kernel_enqueues, 1);220return;221} else if (!is_usersched_task(p)) {222enqueue_task_in_user_space(p, enq_flags);223}224}225226void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev)227{228if (test_and_clear_usersched_needed())229dispatch_user_scheduler();230231bpf_repeat(MAX_ENQUEUED_TASKS) {232s32 pid;233struct task_struct *p;234235if (bpf_map_pop_elem(&dispatched, &pid))236break;237238/*239* The task could have exited by the time we get around to240* dispatching it. Treat this as a normal occurrence, and simply241* move onto the next iteration.242*/243p = bpf_task_from_pid(pid);244if (!p)245continue;246247scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);248bpf_task_release(p);249}250}251252/*253* A CPU is about to change its idle state. If the CPU is going idle, ensure254* that the user-space scheduler has a chance to run if there is any remaining255* work to do.256*/257void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle)258{259/*260* Don't do anything if we exit from and idle state, a CPU owner will261* be assigned in .running().262*/263if (!idle)264return;265/*266* A CPU is now available, notify the user-space scheduler that tasks267* can be dispatched, if there is at least one task waiting to be268* scheduled, either queued (accounted in nr_queued) or scheduled269* (accounted in nr_scheduled).270*271* NOTE: nr_queued is incremented by the BPF component, more exactly in272* enqueue(), when a task is sent to the user-space scheduler, then273* the scheduler drains the queued tasks (updating nr_queued) and adds274* them to its internal data structures / state; at this point tasks275* become "scheduled" and the user-space scheduler will take care of276* updating nr_scheduled accordingly; lastly tasks will be dispatched277* and the user-space scheduler will update nr_scheduled again.278*279* Checking both counters allows to determine if there is still some280* pending work to do for the scheduler: new tasks have been queued281* since last check, or there are still tasks "queued" or "scheduled"282* since the previous user-space scheduler run. If the counters are283* both zero it is pointless to wake-up the scheduler (even if a CPU284* becomes idle), because there is nothing to do.285*286* Keep in mind that update_idle() doesn't run concurrently with the287* user-space scheduler (that is single-threaded): this function is288* naturally serialized with the user-space scheduler code, therefore289* this check here is also safe from a concurrency perspective.290*/291if (nr_queued || nr_scheduled) {292/*293* Kick the CPU to make it immediately ready to accept294* dispatched tasks.295*/296set_usersched_needed();297scx_bpf_kick_cpu(cpu, 0);298}299}300301s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p,302struct scx_init_task_args *args)303{304if (bpf_task_storage_get(&task_ctx_stor, p, 0,305BPF_LOCAL_STORAGE_GET_F_CREATE))306return 0;307else308return -ENOMEM;309}310311s32 BPF_STRUCT_OPS(userland_init)312{313if (num_possible_cpus == 0) {314scx_bpf_error("User scheduler # CPUs uninitialized (%d)",315num_possible_cpus);316return -EINVAL;317}318319if (usersched_pid <= 0) {320scx_bpf_error("User scheduler pid uninitialized (%d)",321usersched_pid);322return -EINVAL;323}324325return 0;326}327328void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei)329{330UEI_RECORD(uei, ei);331}332333SCX_OPS_DEFINE(userland_ops,334.select_cpu = (void *)userland_select_cpu,335.enqueue = (void *)userland_enqueue,336.dispatch = (void *)userland_dispatch,337.update_idle = (void *)userland_update_idle,338.init_task = (void *)userland_init_task,339.init = (void *)userland_init,340.exit = (void *)userland_exit,341.flags = SCX_OPS_ENQ_LAST |342SCX_OPS_KEEP_BUILTIN_IDLE,343.name = "userland");344345346