GitHub Repository: torvalds/linux
Path: blob/master/tools/sched_ext/scx_qmap.bpf.c

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and is unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <[email protected]>
 * Copyright (c) 2022 David Vernet <[email protected]>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS	= 1000000000,
	SHARED_DSQ	= 0,
	HIGHPRI_DSQ	= 1,
	HIGHPRI_WEIGHT	= 8668, /* this is what -20 maps to */
};

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_dsqs_and_events;
const volatile bool print_msgs;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
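
/*
 * The const volatile knobs above live in .rodata and are presumably filled in
 * by the accompanying userspace loader (scx_qmap.c) through the BPF skeleton
 * before the program is loaded, along the lines of (illustrative values only):
 *
 *	skel->rodata->slice_ns = 5000 * 1000;
 *	skel->rodata->dsp_batch = 1;
 *
 * Knobs that are never set stay at zero/false.
 */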

u64 nr_highpri_queued;
u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps"),
  dump_store SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};
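
/*
 * For illustration, this is roughly how the map-in-map above is used by
 * qmap_enqueue() and qmap_dispatch() below (a sketch, not additional
 * functionality):
 *
 *	int idx = 2;
 *	u32 pid = 1234;
 *	void *q = bpf_map_lookup_elem(&queue_arr, &idx);
 *
 *	if (q)
 *		bpf_map_push_elem(q, &pid, 0);
 */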

/*
 * If enabled, the CPU performance target is set according to the queue index,
 * per the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};
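
/*
 * SCX_CPUPERF_ONE is the top of the cpuperf scale, so queue0 requests the
 * minimum performance level, queue2 half of the maximum and queue4 the
 * maximum. See qmap_tick() below for where the target is actually applied
 * via scx_bpf_cpuperf_set().
 */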

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
	bool	highpri;
	u64	core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
	u32	avg_weight;
	u32	cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *tctx;

	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
		scx_bpf_error("task_ctx lookup failed");
		return NULL;
	}
	return tctx;
}

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return -ESRCH;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}
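
/*
 * For example, assuming the usual sched_ext convention that nice 0 corresponds
 * to a weight of 100: a default-weight task falls into queue2, a task with
 * weight 25 or below into queue0, and anything at or above HIGHPRI_WEIGHT
 * (8668) into queue4.
 */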

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (enq_flags & SCX_ENQ_REENQ)
		__sync_fetch_and_add(&nr_reenqueued, 1);

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * If qmap_select_cpu() is telling us to, or this is the last runnable
	 * task on the CPU, enqueue locally.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the global DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a minor
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dsq_move[_vtime]() can be called both from ops.dispatch() and from
 * BPF programs that don't hold an rq lock. As a demonstration, this function is
 * called from qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
			scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
				     SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}
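
	/*
	 * Each pass below hands queue dsp_idx a budget of 1 << dsp_idx slots:
	 * 1 for queue0, 2 for queue1, 4 for queue2, 8 for queue3 and 16 for
	 * queue4. This is what implements the "dispatch more from FIFOs with
	 * higher indices" policy described at the top of the file.
	 */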
	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
		}

		tctx->core_sched_seq =
			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}

void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}
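
/*
 * For illustration: with a steady stream of default-weight (100) tasks, the
 * 3:1 exponential running average above settles at roughly 100, which
 * weight_to_idx() maps to index 2 and therefore to a cpuperf target of half
 * the maximum.
 */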

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As the queue index increments, the priority doubles. The queue w/
	 * index 3 is dispatched twice as frequently as the one with index 2.
	 * Reflect the difference by scaling qdists accordingly. Note that the
	 * shift amount needs to be flipped depending on the sign to avoid
	 * flipping the priority direction.
	 */
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}
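
/*
 * Worked example: a task three entries behind the head of queue0 compares as
 * 3 << 4 == 48 while a task three entries behind the head of queue4 compares
 * as 3 << 0 == 3, so the higher-weight queue's task is ordered first. For
 * negative distances the shift amount is flipped (idx instead of 4 - idx) so
 * that the higher-index queue still ends up with the smaller, i.e. higher
 * priority, value.
 */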

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode roughly the same ordering
 * as the regular scheduling path. Use the priority-scaled distances from the
 * heads of the queues to compare the two tasks, which should be consistent with
 * the dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

SEC("tp_btf/sched_switch")
int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
	     struct task_struct *next, unsigned long prev_state)
{
	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
		return 0;

	/*
	 * If the CPU is taken by a higher priority scheduling class, it is no
	 * longer available for executing sched_ext tasks. As we don't want the
	 * tasks in the CPU's local dsq to sit there until the CPU becomes
	 * available again, re-enqueue them into the global dsq. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	switch (next->policy) {
	case 1: /* SCHED_FIFO */
	case 2: /* SCHED_RR */
	case 6: /* SCHED_DEADLINE */
		scx_bpf_reenqueue_local();
	}

	return 0;
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	/* see qmap_sched_switch() to learn how to do this on newer kernels */
	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
		scx_bpf_reenqueue_local();
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);

		/*
		 * Dump can be invoked anytime and there is no way to iterate in
		 * a non-destructive way. Pop and store in dump_store and then
		 * restore afterwards. If racing against new enqueues, ordering
		 * can get mixed up.
		 */
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			bpf_map_push_elem(&dump_store, &pid, 0);
			scx_bpf_dump(" %d", pid);
		}

		bpf_repeat(4096) {
			if (bpf_map_pop_elem(&dump_store, &pid))
				break;
			bpf_map_push_elem(fifo, &pid, 0);
		}

		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
	if (print_msgs)
		bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, args->weight, args->bw_period_us,
			   args->bw_quota_us, args->bw_burst_us);
	return 0;
}

void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}

void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
		    u64 period_us, u64 quota_us, u64 burst_us)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, period_us, quota_us, burst_us);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}
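
/*
 * Example output on a 16-CPU machine with the last four CPUs offline:
 *
 *	CPUS: |OOOOOOOO|OOOOXXXX|
 *
 * 'O' marks an online CPU, 'X' a possible but offline CPU and ' ' an
 * impossible CPU id; a '|' separator is emitted after every eighth CPU.
 */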

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d coming online", cpu);
		/* @cpu is already online at this point */
		print_cpus();
	}
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d going offline", cpu);
		/* @cpu is still online at this point */
		print_cpus();
	}
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Collect the min, avg and max performance levels of all online CPUs every
 * second to demonstrate the cpuperf interface. The results are exported
 * through the cpuperf_* statistics globals above.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}
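
/*
 * The capacity scaling above means that, for example, a CPU with
 * cap == SCX_CPUPERF_ONE running at half speed and a CPU with half that
 * capacity running at full speed each contribute SCX_CPUPERF_ONE / 2 to
 * cur_sum; with cap_sum == 1.5 * SCX_CPUPERF_ONE the resulting cpuperf_avg
 * comes out to about two thirds of SCX_CPUPERF_ONE.
 */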

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and the DSQ iterator. Raise the dispatch batch count
 * to see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_dsqs_and_events) {
		struct scx_event_stats events;

		dump_shared_dsq();

		__COMPAT_scx_bpf_events(&events, sizeof(events));

		bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
			   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
			   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
			   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
		bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
			   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
		bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
			   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
			   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
			   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
			   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
	}

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	if (print_msgs)
		print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .cpu_release		= (void *)qmap_cpu_release,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .cgroup_init		= (void *)qmap_cgroup_init,
	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
	       .cpu_online		= (void *)qmap_cpu_online,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");