GitHub Repository: torvalds/linux
Path: blob/master/tools/sched_ext/scx_qmap.bpf.c
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <[email protected]>
 * Copyright (c) 2022 David Vernet <[email protected]>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS = 1000000000,
	SHARED_DSQ = 0,
	HIGHPRI_DSQ = 1,
	HIGHPRI_WEIGHT = 8668,	/* this is what -20 maps to */
};

char _license[] SEC("license") = "GPL";

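/*
 * The tunables below are const volatile so they live in rodata and can be
 * overridden before the BPF skeleton is loaded (assumption: the scx_qmap
 * userspace loader sets them from its command-line options).
 */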
const volatile u64 slice_ns;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_shared_dsq;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u64 nr_highpri_queued;
u32 test_error_cnt;

UEI_DEFINE(uei);

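/*
 * Five FIFOs, one per weight band (see weight_to_idx()). They are also
 * collected into queue_arr below so that a FIFO can be selected by index at
 * runtime.
 */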
struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/*
 * If enabled, the CPU performance target is set according to the queue index
 * per the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
	bool	highpri;
	u64	core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
	u32	avg_weight;
	u32	cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

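/*
 * Pick a CPU that @p can be directly dispatched to: @prev_cpu if the task can
 * only run there or @prev_cpu is idle, otherwise any idle CPU in the task's
 * cpumask. Returns -1 if no suitable CPU is found.
 */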
static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *tctx;

	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
		scx_bpf_error("task_ctx lookup failed");
		return NULL;
	}
	return tctx;
}

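/*
 * ops.select_cpu(): if a direct-dispatch CPU is available, note it in
 * tctx->force_local so that qmap_enqueue() inserts the task into the local DSQ
 * of the selected CPU instead of one of the FIFOs.
 */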
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return -ESRCH;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

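/*
 * For example, the default weight of 100 (nice 0) lands in queue 2, while
 * HIGHPRI_WEIGHT (8668, nice -20) lands in queue 4.
 */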
static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

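/*
 * ops.enqueue(): optionally stall every Nth task for testing, then either
 * dispatch directly (local DSQ or an idle CPU), re-enqueue preempted tasks on
 * the shared DSQ, or push the PID onto the FIFO selected by weight_to_idx().
 */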
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * If qmap_select_cpu() is telling us to or this is the last runnable
	 * task on the CPU, enqueue locally.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the global DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dispatch[_vtime]_from_dsq() can be called both from ops.dispatch()
 * and from BPF programs which don't hold an rq lock. As a demonstration, this
 * function is called from qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			__COMPAT_scx_bpf_dsq_move_set_slice(
				BPF_FOR_EACH_ITER, slice_ns * 2);
			__COMPAT_scx_bpf_dsq_move_set_vtime(
				BPF_FOR_EACH_ITER, highpri_seq++);
			__COMPAT_scx_bpf_dsq_move_vtime(
				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
					      SCX_DSQ_LOCAL_ON | cpu,
					      SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}

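/*
 * ops.dispatch(): expedite HIGHPRI_DSQ first, then try to consume SHARED_DSQ.
 * Otherwise, round robin through the five FIFOs, popping up to 2^idx PIDs from
 * queue idx per pass and inserting them into SHARED_DSQ, which is then consumed
 * into the local DSQ. dsp_batch caps how many tasks are moved per invocation.
 */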
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
		}

		tctx->core_sched_seq =
			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}

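/*
 * ops.tick(): runs on each scheduler tick for the running task. avg_weight is
 * a 3/4 old, 1/4 new moving average of the weights seen on this CPU and is
 * used to pick the cpuperf target from qidx_to_cpuperf_target[].
 */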
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As the queue index increments, the priority doubles: the queue with
	 * index 3 is dispatched twice as frequently as the one with index 2.
	 * Reflect the difference by scaling qdists accordingly. Note that the
	 * shift amount needs to be flipped depending on the sign to avoid
	 * flipping the priority direction.
	 */
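	/*
	 * For example, a raw distance of 4 scales to 4 << 4 = 64 in queue 0 but
	 * only 4 << 1 = 8 in queue 3, so for the same raw distance a task in a
	 * higher-indexed queue compares as older and thus higher priority.
	 */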
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode roughly the same ordering
 * as the regular scheduling path. Use the priority-scaled distances from the
 * head of the queues to compare the two tasks, which should be consistent with
 * the dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 cnt;

	/*
	 * Called when @cpu is taken by a higher priority scheduling class. This
	 * makes @cpu no longer available for executing sched_ext tasks. As we
	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
	 * becomes available again, re-enqueue them into the global dsq. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	cnt = scx_bpf_reenqueue_local();
	if (cnt)
		__sync_fetch_and_add(&nr_reenqueued, cnt);
}

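/*
 * ops.init_task(): called as a task enters the scheduler. If the task belongs
 * to disallow_tgid, set p->scx.disallow so it is kept off sched_ext, and create
 * its per-task storage.
 */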
s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

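/*
 * Debug-dump callbacks, invoked when the sched_ext core generates a debug dump
 * (for example on an error exit) and suppressed when suppress_dump is set.
 * Note that qmap_dump() pops the FIFOs destructively while printing them.
 */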
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			scx_bpf_dump(" %d", pid);
		}
		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

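/*
 * Minimal cgroup callbacks which only log the events via bpf_printk() to show
 * the cgroup-related parameters that sched_ext passes in.
 */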
s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
	bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
		   cgrp->kn->id, args->weight, args->bw_period_us,
		   args->bw_quota_us, args->bw_burst_us);
	return 0;
}

void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}

void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
		    u64 period_us, u64 quota_us, u64 burst_us)
{
	bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id,
		   period_us, quota_us, burst_us);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	bpf_printk("CPU %d coming online", cpu);
	/* @cpu is already online at this point */
	print_cpus();
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	bpf_printk("CPU %d going offline", cpu);
	/* @cpu is still online at this point */
	print_cpus();
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 * see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

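/*
 * One-second monitoring timer: expedite any highpri tasks, refresh the cpuperf
 * statistics, optionally dump the shared DSQ, print the sched_ext event
 * counters and re-arm itself.
 */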
static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	struct scx_event_stats events;

	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_shared_dsq)
		dump_shared_dsq();

	__COMPAT_scx_bpf_events(&events, sizeof(events));

	bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
		   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
	bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
		   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
	bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
		   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
	bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
		   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
	bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
		   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
	bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
		   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
	bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
		   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
	bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
		   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

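/*
 * ops.init(): create the two custom DSQs and kick off the one-second
 * monitoring timer.
 */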
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

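/*
 * Tie the callbacks together into the struct_ops that userspace loads and
 * attaches. timeout_ms sets the watchdog timeout; a runnable task stalling
 * longer than this aborts the scheduler.
 */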
SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .cpu_release		= (void *)qmap_cpu_release,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .cgroup_init		= (void *)qmap_cgroup_init,
	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
	       .cpu_online		= (void *)qmap_cpu_online,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");