/* SPDX-License-Identifier: GPL-2.0 */1/*2* A demo sched_ext core-scheduler which always makes every sibling CPU pair3* execute from the same CPU cgroup.4*5* This scheduler is a minimal implementation and would need some form of6* priority handling both inside each cgroup and across the cgroups to be7* practically useful.8*9* Each CPU in the system is paired with exactly one other CPU, according to a10* "stride" value that can be specified when the BPF scheduler program is first11* loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee12* that they will only ever schedule tasks that belong to the same CPU cgroup.13*14* Scheduler Initialization15* ------------------------16*17* The scheduler BPF program is first initialized from user space, before it is18* enabled. During this initialization process, each CPU on the system is19* assigned several values that are constant throughout its runtime:20*21* 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling22* decisions. Paired CPUs always schedule tasks from the same23* CPU cgroup, and synchronize with each other to guarantee24* that this constraint is not violated.25* 2. *Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access26* a struct pair_ctx object that is shared between the pair.27* 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the28* pair. Each struct pair_ctx has an active_mask field,29* which is a bitmap used to indicate whether each core30* in the pair currently has an actively running task.31* This index specifies which entry in the bitmap corresponds32* to each CPU in the pair.33*34* During this initialization, the CPUs are paired according to a "stride" that35* may be specified when invoking the user space program that initializes and36* loads the scheduler. By default, the stride is 1/2 the total number of CPUs.37*38* Tasks and cgroups39* -----------------40*41* Every cgroup in the system is registered with the scheduler using the42* pair_cgroup_init() callback, and every task in the system is associated with43* exactly one cgroup. At a high level, the idea with the pair scheduler is to44* always schedule tasks from the same cgroup within a given CPU pair. When a45* task is enqueued (i.e. passed to the pair_enqueue() callback function), its46* cgroup ID is read from its task struct, and then a corresponding queue map47* is used to FIFO-enqueue the task for that cgroup.48*49* If you look through the implementation of the scheduler, you'll notice that50* there is quite a bit of complexity involved with looking up the per-cgroup51* FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash52* BPF hash map that is used to map a cgroup ID to a globally unique ID that's53* allocated in the BPF program. This is done because we use separate maps to54* store the FIFO queue of tasks, and the length of that map, per cgroup. This55* complexity is only present because of current deficiencies in BPF that will56* soon be addressed. The main point to keep in mind is that newly enqueued57* tasks are added to their cgroup's FIFO queue.58*59* Dispatching tasks60* -----------------61*62* This section will describe how enqueued tasks are dispatched and scheduled.63* Tasks are dispatched in pair_dispatch(), and at a high level the workflow is64* as follows:65*66* 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is67* the structure that's used to synchronize amongst the two pair CPUs in their68* scheduling decisions. After any of the following events have occurred:69*70* - The cgroup's slice run has expired, or71* - The cgroup becomes empty, or72* - Either CPU in the pair is preempted by a higher priority scheduling class73*74* The cgroup transitions to the draining state and stops executing new tasks75* from the cgroup.76*77* 2. If the pair is still executing a task, mark the pair_ctx as draining, and78* wait for the pair CPU to be preempted.79*80* 3. Otherwise, if the pair CPU is not running a task, we can move onto81* scheduling new tasks. Pop the next cgroup id from the top_q queue.82*83* 4. Pop a task from that cgroup's FIFO task queue, and begin executing it.84*85* Note again that this scheduling behavior is simple, but the implementation86* is complex mostly because this it hits several BPF shortcomings and has to87* work around in often awkward ways. Most of the shortcomings are expected to88* be resolved in the near future which should allow greatly simplifying this89* scheduler.90*91* Dealing with preemption92* -----------------------93*94* SCX is the lowest priority sched_class, and could be preempted by them at95* any time. To address this, the scheduler implements pair_cpu_release() and96* pair_cpu_acquire() callbacks which are invoked by the core scheduler when97* the scheduler loses and gains control of the CPU respectively.98*99* In pair_cpu_release(), we mark the pair_ctx as having been preempted, and100* then invoke:101*102* scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT);103*104* This preempts the pair CPU, and waits until it has re-entered the scheduler105* before returning. This is necessary to ensure that the higher priority106* sched_class that preempted our scheduler does not schedule a task107* concurrently with our pair CPU.108*109* When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption110* in the pair_ctx, and send another resched IPI to the pair CPU to re-enable111* pair scheduling.112*113* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.114* Copyright (c) 2022 Tejun Heo <[email protected]>115* Copyright (c) 2022 David Vernet <[email protected]>116*/117#include <scx/common.bpf.h>118#include "scx_pair.h"119120char _license[] SEC("license") = "GPL";121122/* !0 for veristat, set during init */123const volatile u32 nr_cpu_ids = 1;124125/* a pair of CPUs stay on a cgroup for this duration */126const volatile u32 pair_batch_dur_ns;127128/* cpu ID -> pair cpu ID */129const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu);130131/* cpu ID -> pair_id */132const volatile u32 RESIZABLE_ARRAY(rodata, pair_id);133134/* CPU ID -> CPU # in the pair (0 or 1) */135const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx);136137struct pair_ctx {138struct bpf_spin_lock lock;139140/* the cgroup the pair is currently executing */141u64 cgid;142143/* the pair started executing the current cgroup at */144u64 started_at;145146/* whether the current cgroup is draining */147bool draining;148149/* the CPUs that are currently active on the cgroup */150u32 active_mask;151152/*153* the CPUs that are currently preempted and running tasks in a154* different scheduler.155*/156u32 preempted_mask;157};158159struct {160__uint(type, BPF_MAP_TYPE_ARRAY);161__type(key, u32);162__type(value, struct pair_ctx);163} pair_ctx SEC(".maps");164165/* queue of cgrp_q's possibly with tasks on them */166struct {167__uint(type, BPF_MAP_TYPE_QUEUE);168/*169* Because it's difficult to build strong synchronization encompassing170* multiple non-trivial operations in BPF, this queue is managed in an171* opportunistic way so that we guarantee that a cgroup w/ active tasks172* is always on it but possibly multiple times. Once we have more robust173* synchronization constructs and e.g. linked list, we should be able to174* do this in a prettier way but for now just size it big enough.175*/176__uint(max_entries, 4 * MAX_CGRPS);177__type(value, u64);178} top_q SEC(".maps");179180/* per-cgroup q which FIFOs the tasks from the cgroup */181struct cgrp_q {182__uint(type, BPF_MAP_TYPE_QUEUE);183__uint(max_entries, MAX_QUEUED);184__type(value, u32);185};186187/*188* Ideally, we want to allocate cgrp_q and cgrq_q_len in the cgroup local189* storage; however, a cgroup local storage can only be accessed from the BPF190* progs attached to the cgroup. For now, work around by allocating array of191* cgrp_q's and then allocating per-cgroup indices.192*193* Another caveat: It's difficult to populate a large array of maps statically194* or from BPF. Initialize it from userland.195*/196struct {197__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);198__uint(max_entries, MAX_CGRPS);199__type(key, s32);200__array(values, struct cgrp_q);201} cgrp_q_arr SEC(".maps");202203static u64 cgrp_q_len[MAX_CGRPS];204205/*206* This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be207* useful to have as a map type.208*/209static u32 cgrp_q_idx_cursor;210static u64 cgrp_q_idx_busy[MAX_CGRPS];211212/*213* All added up, the following is what we do:214*215* 1. When a cgroup is enabled, RR cgroup_q_idx_busy array doing cmpxchg looking216* for a free ID. If not found, fail cgroup creation with -EBUSY.217*218* 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following219* cgrp_q_idx_hash.220*221* 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from222* cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr.223*224* This is sadly complicated for something pretty simple. Hopefully, we should225* be able to simplify in the future.226*/227struct {228__uint(type, BPF_MAP_TYPE_HASH);229__uint(max_entries, MAX_CGRPS);230__uint(key_size, sizeof(u64)); /* cgrp ID */231__uint(value_size, sizeof(s32)); /* cgrp_q idx */232} cgrp_q_idx_hash SEC(".maps");233234/* statistics */235u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions;236u64 nr_exps, nr_exp_waits, nr_exp_empty;237u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty;238239UEI_DEFINE(uei);240241void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags)242{243struct cgroup *cgrp;244struct cgrp_q *cgq;245s32 pid = p->pid;246u64 cgid;247u32 *q_idx;248u64 *cgq_len;249250__sync_fetch_and_add(&nr_total, 1);251252cgrp = scx_bpf_task_cgroup(p);253cgid = cgrp->kn->id;254bpf_cgroup_release(cgrp);255256/* find the cgroup's q and push @p into it */257q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);258if (!q_idx) {259scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid);260return;261}262263cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx);264if (!cgq) {265scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]",266cgid, *q_idx);267return;268}269270if (bpf_map_push_elem(cgq, &pid, 0)) {271scx_bpf_error("cgroup[%llu] queue overflow", cgid);272return;273}274275/* bump q len, if going 0 -> 1, queue cgroup into the top_q */276cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]);277if (!cgq_len) {278scx_bpf_error("MEMBER_VTPR malfunction");279return;280}281282if (!__sync_fetch_and_add(cgq_len, 1) &&283bpf_map_push_elem(&top_q, &cgid, 0)) {284scx_bpf_error("top_q overflow");285return;286}287}288289static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask)290{291u32 *vptr;292293vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids);294if (!vptr)295return -EINVAL;296297*pairc = bpf_map_lookup_elem(&pair_ctx, vptr);298if (!(*pairc))299return -EINVAL;300301vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids);302if (!vptr)303return -EINVAL;304305*mask = 1U << *vptr;306307return 0;308}309310__attribute__((noinline))311static int try_dispatch(s32 cpu)312{313struct pair_ctx *pairc;314struct bpf_map *cgq_map;315struct task_struct *p;316u64 now = scx_bpf_now();317bool kick_pair = false;318bool expired, pair_preempted;319u32 *vptr, in_pair_mask;320s32 pid, q_idx;321u64 cgid;322int ret;323324ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);325if (ret) {326scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]",327cpu);328return -ENOENT;329}330331bpf_spin_lock(&pairc->lock);332pairc->active_mask &= ~in_pair_mask;333334expired = time_before(pairc->started_at + pair_batch_dur_ns, now);335if (expired || pairc->draining) {336u64 new_cgid = 0;337338__sync_fetch_and_add(&nr_exps, 1);339340/*341* We're done with the current cgid. An obvious optimization342* would be not draining if the next cgroup is the current one.343* For now, be dumb and always expire.344*/345pairc->draining = true;346347pair_preempted = pairc->preempted_mask;348if (pairc->active_mask || pair_preempted) {349/*350* The other CPU is still active, or is no longer under351* our control due to e.g. being preempted by a higher352* priority sched_class. We want to wait until this353* cgroup expires, or until control of our pair CPU has354* been returned to us.355*356* If the pair controls its CPU, and the time already357* expired, kick. When the other CPU arrives at358* dispatch and clears its active mask, it'll push the359* pair to the next cgroup and kick this CPU.360*/361__sync_fetch_and_add(&nr_exp_waits, 1);362bpf_spin_unlock(&pairc->lock);363if (expired && !pair_preempted)364kick_pair = true;365goto out_maybe_kick;366}367368bpf_spin_unlock(&pairc->lock);369370/*371* Pick the next cgroup. It'd be easier / cleaner to not drop372* pairc->lock and use stronger synchronization here especially373* given that we'll be switching cgroups significantly less374* frequently than tasks. Unfortunately, bpf_spin_lock can't375* really protect anything non-trivial. Let's do opportunistic376* operations instead.377*/378bpf_repeat(BPF_MAX_LOOPS) {379u32 *q_idx;380u64 *cgq_len;381382if (bpf_map_pop_elem(&top_q, &new_cgid)) {383/* no active cgroup, go idle */384__sync_fetch_and_add(&nr_exp_empty, 1);385return 0;386}387388q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid);389if (!q_idx)390continue;391392/*393* This is the only place where empty cgroups are taken394* off the top_q.395*/396cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]);397if (!cgq_len || !*cgq_len)398continue;399400/*401* If it has any tasks, requeue as we may race and not402* execute it.403*/404bpf_map_push_elem(&top_q, &new_cgid, 0);405break;406}407408bpf_spin_lock(&pairc->lock);409410/*411* The other CPU may already have started on a new cgroup while412* we dropped the lock. Make sure that we're still draining and413* start on the new cgroup.414*/415if (pairc->draining && !pairc->active_mask) {416__sync_fetch_and_add(&nr_cgrp_next, 1);417pairc->cgid = new_cgid;418pairc->started_at = now;419pairc->draining = false;420kick_pair = true;421} else {422__sync_fetch_and_add(&nr_cgrp_coll, 1);423}424}425426cgid = pairc->cgid;427pairc->active_mask |= in_pair_mask;428bpf_spin_unlock(&pairc->lock);429430/* again, it'd be better to do all these with the lock held, oh well */431vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);432if (!vptr) {433scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid);434return -ENOENT;435}436q_idx = *vptr;437438/* claim one task from cgrp_q w/ q_idx */439bpf_repeat(BPF_MAX_LOOPS) {440u64 *cgq_len, len;441442cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]);443if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) {444/* the cgroup must be empty, expire and repeat */445__sync_fetch_and_add(&nr_cgrp_empty, 1);446bpf_spin_lock(&pairc->lock);447pairc->draining = true;448pairc->active_mask &= ~in_pair_mask;449bpf_spin_unlock(&pairc->lock);450return -EAGAIN;451}452453if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len)454continue;455456break;457}458459cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx);460if (!cgq_map) {461scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]",462cgid, q_idx);463return -ENOENT;464}465466if (bpf_map_pop_elem(cgq_map, &pid)) {467scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]",468cgid, q_idx);469return -ENOENT;470}471472p = bpf_task_from_pid(pid);473if (p) {474__sync_fetch_and_add(&nr_dispatched, 1);475scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);476bpf_task_release(p);477} else {478/* we don't handle dequeues, retry on lost tasks */479__sync_fetch_and_add(&nr_missing, 1);480return -EAGAIN;481}482483out_maybe_kick:484if (kick_pair) {485s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);486if (pair) {487__sync_fetch_and_add(&nr_kicks, 1);488scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT);489}490}491return 0;492}493494void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev)495{496bpf_repeat(BPF_MAX_LOOPS) {497if (try_dispatch(cpu) != -EAGAIN)498break;499}500}501502void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args)503{504int ret;505u32 in_pair_mask;506struct pair_ctx *pairc;507bool kick_pair;508509ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);510if (ret)511return;512513bpf_spin_lock(&pairc->lock);514pairc->preempted_mask &= ~in_pair_mask;515/* Kick the pair CPU, unless it was also preempted. */516kick_pair = !pairc->preempted_mask;517bpf_spin_unlock(&pairc->lock);518519if (kick_pair) {520s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);521522if (pair) {523__sync_fetch_and_add(&nr_kicks, 1);524scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT);525}526}527}528529void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args)530{531int ret;532u32 in_pair_mask;533struct pair_ctx *pairc;534bool kick_pair;535536ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);537if (ret)538return;539540bpf_spin_lock(&pairc->lock);541pairc->preempted_mask |= in_pair_mask;542pairc->active_mask &= ~in_pair_mask;543/* Kick the pair CPU if it's still running. */544kick_pair = pairc->active_mask;545pairc->draining = true;546bpf_spin_unlock(&pairc->lock);547548if (kick_pair) {549s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);550551if (pair) {552__sync_fetch_and_add(&nr_kicks, 1);553scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT);554}555}556__sync_fetch_and_add(&nr_preemptions, 1);557}558559s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp)560{561u64 cgid = cgrp->kn->id;562s32 i, q_idx;563564bpf_for(i, 0, MAX_CGRPS) {565q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS;566if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1))567break;568}569if (i == MAX_CGRPS)570return -EBUSY;571572if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) {573u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]);574if (busy)575*busy = 0;576return -EBUSY;577}578579return 0;580}581582void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp)583{584u64 cgid = cgrp->kn->id;585s32 *q_idx;586587q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);588if (q_idx) {589u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]);590if (busy)591*busy = 0;592bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid);593}594}595596void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei)597{598UEI_RECORD(uei, ei);599}600601SCX_OPS_DEFINE(pair_ops,602.enqueue = (void *)pair_enqueue,603.dispatch = (void *)pair_dispatch,604.cpu_acquire = (void *)pair_cpu_acquire,605.cpu_release = (void *)pair_cpu_release,606.cgroup_init = (void *)pair_cgroup_init,607.cgroup_exit = (void *)pair_cgroup_exit,608.exit = (void *)pair_exit,609.name = "pair");610611612