GitHub Repository: torvalds/linux
Path: blob/master/tools/sched_ext/scx_qmap.bpf.c

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and is unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <[email protected]>
 * Copyright (c) 2022 David Vernet <[email protected]>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS	= 1000000000,
	SHARED_DSQ	= 0,
	HIGHPRI_DSQ	= 1,
	HIGHPRI_WEIGHT	= 8668, /* this is what -20 maps to */
};

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_dsqs_and_events;
const volatile bool print_msgs;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
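
/*
 * The const volatile knobs above live in .rodata and are presumably filled in
 * by the accompanying userspace loader (scx_qmap.c) through the BPF skeleton
 * before the program is loaded, along the lines of (illustrative values only):
 *
 *	skel->rodata->slice_ns = 5000 * 1000;
 *	skel->rodata->dsp_batch = 1;
 *
 * Knobs that are never set stay at zero/false.
 */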

u64 nr_highpri_queued;
u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps"),
  dump_store SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};
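
/*
 * For illustration, this is roughly how the map-in-map above is used by
 * qmap_enqueue() and qmap_dispatch() below (a sketch, not additional
 * functionality):
 *
 *	int idx = 2;
 *	u32 pid = 1234;
 *	void *q = bpf_map_lookup_elem(&queue_arr, &idx);
 *
 *	if (q)
 *		bpf_map_push_elem(q, &pid, 0);
 */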

/*
 * If enabled, the CPU performance target is set according to the queue index,
 * per the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};
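
/*
 * SCX_CPUPERF_ONE is the top of the cpuperf scale, so queue0 requests the
 * minimum performance level, queue2 half of the maximum and queue4 the
 * maximum. See qmap_tick() below for where the target is actually applied
 * via scx_bpf_cpuperf_set().
 */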

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
	bool	highpri;
	u64	core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
	u32	avg_weight;
	u32	cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *tctx;

	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
		scx_bpf_error("task_ctx lookup failed");
		return NULL;
	}
	return tctx;
}

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return -ESRCH;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}
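
/*
 * For example, assuming the usual sched_ext convention that nice 0 corresponds
 * to a weight of 100: a default-weight task falls into queue2, a task with
 * weight 25 or below into queue0, and anything at or above HIGHPRI_WEIGHT
 * (8668) into queue4.
 */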

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (enq_flags & SCX_ENQ_REENQ)
		__sync_fetch_and_add(&nr_reenqueued, 1);

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * If qmap_select_cpu() is telling us to, or this is the last runnable
	 * task on the CPU, enqueue locally.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the global DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a minor
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dsq_move[_vtime]() can be called both from ops.dispatch() and from
 * BPF programs that don't hold an rq lock. As a demonstration, this function is
 * called from qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
			scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
				     SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}
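
	/*
	 * Each pass below hands queue dsp_idx a budget of 1 << dsp_idx slots:
	 * 1 for queue0, 2 for queue1, 4 for queue2, 8 for queue3 and 16 for
	 * queue4. This is what implements the "dispatch more from FIFOs with
	 * higher indices" policy described at the top of the file.
	 */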
	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
		}

		tctx->core_sched_seq =
			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}

void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}
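
/*
 * For illustration: with a steady stream of default-weight (100) tasks, the
 * 3:1 exponential running average above settles at roughly 100, which
 * weight_to_idx() maps to index 2 and therefore to a cpuperf target of half
 * the maximum.
 */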

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As the queue index increments, the priority doubles. The queue w/
	 * index 3 is dispatched twice as frequently as the one with index 2.
	 * Reflect the difference by scaling qdists accordingly. Note that the
	 * shift amount needs to be flipped depending on the sign to avoid
	 * flipping the priority direction.
	 */
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}
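
/*
 * Worked example: a task three entries behind the head of queue0 compares as
 * 3 << 4 == 48 while a task three entries behind the head of queue4 compares
 * as 3 << 0 == 3, so the higher-weight queue's task is ordered first. For
 * negative distances the shift amount is flipped (idx instead of 4 - idx) so
 * that the higher-index queue still ends up with the smaller, i.e. higher
 * priority, value.
 */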

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode roughly the same ordering
 * as the regular scheduling path. Use the priority-scaled distances from the
 * heads of the queues to compare the two tasks, which should be consistent with
 * the dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

SEC("tp_btf/sched_switch")
int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
	     struct task_struct *next, unsigned long prev_state)
{
	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
		return 0;

	/*
	 * If the CPU is taken by a higher priority scheduling class, it is no
	 * longer available for executing sched_ext tasks. As we don't want the
	 * tasks in the CPU's local dsq to sit there until the CPU becomes
	 * available again, re-enqueue them into the global dsq. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	switch (next->policy) {
	case 1: /* SCHED_FIFO */
	case 2: /* SCHED_RR */
	case 6: /* SCHED_DEADLINE */
		scx_bpf_reenqueue_local();
	}

	return 0;
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	/* see qmap_sched_switch() to learn how to do this on newer kernels */
	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
		scx_bpf_reenqueue_local();
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);

		/*
		 * Dump can be invoked anytime and there is no way to iterate in
		 * a non-destructive way. Pop and store in dump_store and then
		 * restore afterwards. If racing against new enqueues, ordering
		 * can get mixed up.
		 */
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			bpf_map_push_elem(&dump_store, &pid, 0);
			scx_bpf_dump(" %d", pid);
		}

		bpf_repeat(4096) {
			if (bpf_map_pop_elem(&dump_store, &pid))
				break;
			bpf_map_push_elem(fifo, &pid, 0);
		}

		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
	if (print_msgs)
		bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, args->weight, args->bw_period_us,
			   args->bw_quota_us, args->bw_burst_us);
	return 0;
}

void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}

void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
		    u64 period_us, u64 quota_us, u64 burst_us)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, period_us, quota_us, burst_us);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}
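
/*
 * Example output on a 16-CPU machine with the last four CPUs offline:
 *
 *	CPUS: |OOOOOOOO|OOOOXXXX|
 *
 * 'O' marks an online CPU, 'X' a possible but offline CPU and ' ' an
 * impossible CPU id; a '|' separator is emitted after every eighth CPU.
 */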

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d coming online", cpu);
		/* @cpu is already online at this point */
		print_cpus();
	}
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d going offline", cpu);
		/* @cpu is still online at this point */
		print_cpus();
	}
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Collect the min, avg and max performance levels of all online CPUs every
 * second to demonstrate the cpuperf interface. The results are exported
 * through the cpuperf_* statistics globals above.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}
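
/*
 * The capacity scaling above means that, for example, a CPU with
 * cap == SCX_CPUPERF_ONE running at half speed and a CPU with half that
 * capacity running at full speed each contribute SCX_CPUPERF_ONE / 2 to
 * cur_sum; with cap_sum == 1.5 * SCX_CPUPERF_ONE the resulting cpuperf_avg
 * comes out to about two thirds of SCX_CPUPERF_ONE.
 */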

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and the DSQ iterator. Raise the dispatch batch count
 * to see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_dsqs_and_events) {
		struct scx_event_stats events;

		dump_shared_dsq();

		__COMPAT_scx_bpf_events(&events, sizeof(events));

		bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
			   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
			   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
			   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
		bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
			   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
		bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
			   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
			   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
			   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
			   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
	}

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	if (print_msgs)
		print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .cpu_release		= (void *)qmap_cpu_release,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .cgroup_init		= (void *)qmap_cgroup_init,
	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
	       .cpu_online		= (void *)qmap_cpu_online,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");