Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/tools/sched_ext/scx_sdt.bpf.c
121797 views
1
/* SPDX-License-Identifier: GPL-2.0 */
2
/*
3
* Arena-based task data scheduler. This is a variation of scx_simple
4
* that uses a combined allocator and indexing structure to organize
5
* task data. Task context allocation is done when a task enters the
6
* scheduler, while freeing is done when it exits. Task contexts are
7
* retrieved from task-local storage, pointing to the allocated memory.
8
*
9
 * The main purpose of this scheduler is to demonstrate arena memory
10
* management.
11
*
12
* Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates.
13
* Copyright (c) 2024-2025 Emil Tsalapatis <[email protected]>
14
* Copyright (c) 2024-2025 Tejun Heo <[email protected]>
15
*
16
*/
17
#include <scx/common.bpf.h>
18
#include <scx/bpf_arena_common.bpf.h>
19
20
#include "scx_sdt.h"
21
22
char _license[] SEC("license") = "GPL";
23
24
UEI_DEFINE(uei);
25
26
/*
 * BPF arena backing every allocation in this scheduler (pools, index
 * chunks, task data). BPF_F_MMAPABLE lets userspace mmap() the region
 * at the address given by map_extra; max_entries is the page budget.
 * NOTE(review): arm64 presumably gets a smaller budget and lower base
 * due to its tighter usable VA range — confirm before changing.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
	__uint(max_entries, 1 << 16); /* number of pages */
	__ulong(map_extra, (1ull << 32)); /* start of mmap() region */
#else
	__uint(max_entries, 1 << 20); /* number of pages */
	__ulong(map_extra, (1ull << 44)); /* start of mmap() region */
#endif
} arena __weak SEC(".maps");
37
38
#define SHARED_DSQ 0
39
40
/*
 * Per-metric statistics plumbing. For each metric this emits:
 *  - stat_inc_<metric>(): bumps the counter in a task's arena-resident
 *    struct scx_stats (cast_kern() converts the arena pointer first);
 *  - a global __u64 stat_<metric> that per-task totals are folded into
 *    on task exit (see scx_stat_global_update()).
 */
#define DEFINE_SDT_STAT(metric) \
static inline void \
stat_inc_##metric(struct scx_stats __arena *stats) \
{ \
	cast_kern(stats); \
	stats->metric += 1; \
} \
__u64 stat_##metric; \

DEFINE_SDT_STAT(enqueue);
DEFINE_SDT_STAT(init);
DEFINE_SDT_STAT(exit);
DEFINE_SDT_STAT(select_idle_cpu);
DEFINE_SDT_STAT(select_busy_cpu);
54
55
/*
56
* Necessary for cond_break/can_loop's semantics. According to kernel commit
57
* 011832b, the loop counter variable must be seen as imprecise and bounded
58
* by the verifier. Initializing it from a constant (e.g., i = 0;), then,
59
* makes it precise and prevents may_goto from helping with converging the
60
* loop. For these loops we must initialize the loop counter from a variable
61
* whose value the verifier cannot reason about when checking the program, so
62
* that the loop counter's value is imprecise.
63
*/
64
static __u64 zero = 0;
65
66
/*
 * XXX Hack to get the verifier to find the arena for sdt_exit_task.
 * As of 6.12-rc5, The verifier associates arenas with programs by
 * checking LD.IMM instruction operands for an arena and populating
 * the program state with the first instance it finds. This requires
 * accessing our global arena variable, but scx methods do not necessarily
 * do so while still using pointers from that arena. Insert a bpf_printk
 * statement that triggers at most once to generate an LD.IMM instruction
 * to access the arena and help the verifier.
 */
static volatile bool scx_arena_verify_once;

/* One-shot: emit an arena-referencing instruction so the verifier can
 * associate the arena with any program that calls into this subprog. */
__hidden void scx_arena_subprog_init(void)
{
	if (scx_arena_verify_once)
		return;

	/* &arena in the format args is the LD.IMM access described above. */
	bpf_printk("%s: arena pointer %p", __func__, &arena);
	scx_arena_verify_once = true;
}
86
87
88
/* Serializes allocator operations (index tree walks, stats updates). */
private(LOCK) struct bpf_spin_lock alloc_lock;
/* Lock for pool slab refills; see scx_alloc_from_pool() callers. */
private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock;

/* allocation pools: descriptors and index chunks come from these. */
struct sdt_pool desc_pool;
struct sdt_pool chunk_pool;

/* Protected by alloc_lock. */
struct scx_alloc_stats alloc_stats;
97
98
99
/*
 * Allocate one element from the pool, bump-allocating out of the
 * current slab and grabbing a fresh slab of arena pages when the
 * current one is spent. Must be called with the pool's lock held.
 * Returns an arena pointer to the element, or NULL if the arena
 * page allocation fails.
 */
static
void __arena *scx_alloc_from_pool(struct sdt_pool *pool)
{
	__u64 elem_size, max_elems;
	void __arena *slab;
	void __arena *ptr;

	elem_size = pool->elem_size;
	max_elems = pool->max_elems;

	/* If the chunk is spent, get a new one. */
	if (pool->idx >= max_elems) {
		slab = bpf_arena_alloc_pages(&arena, NULL,
			div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0);
		if (!slab)
			return NULL;

		pool->slab = slab;
		pool->idx = 0;
	}

	/* Bump allocation: elements are handed out in slab order. */
	ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx);
	pool->idx += 1;

	return ptr;
}
126
127
/*
 * Allocate a descriptor and its associated index chunk, initializing
 * the descriptor as fully free. Called with the allocator spinlock
 * held. Returns the descriptor, or NULL on pool exhaustion.
 */
static sdt_desc_t *scx_alloc_chunk(void)
{
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	sdt_desc_t *out;

	chunk = scx_alloc_from_pool(&chunk_pool);
	if (!chunk)
		return NULL;

	desc = scx_alloc_from_pool(&desc_pool);
	if (!desc) {
		/*
		 * Effectively frees the previous chunk allocation.
		 * Index cannot be 0, so decrementing is always
		 * valid.
		 */
		chunk_pool.idx -= 1;
		return NULL;
	}

	out = desc;

	/* A fresh chunk has every entry available. */
	desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
	desc->chunk = chunk;

	alloc_stats.chunk_allocs += 1;

	return out;
}
158
159
/*
 * Configure a pool to hand out data_size-byte elements carved out of
 * nr_pages-page slabs. Elements must be 8-byte multiples and at least
 * one page must back the pool. Returns 0 on success, -EINVAL on a bad
 * size.
 */
static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages)
{
	/* Reject non-word-aligned elements and empty slabs. */
	if (unlikely(data_size % 8 != 0) || unlikely(nr_pages == 0))
		return -EINVAL;

	pool->elem_size = data_size;
	pool->max_elems = (nr_pages * PAGE_SIZE) / data_size;
	/* Start the pool as spent so the first alloc grabs a fresh slab. */
	pool->idx = pool->max_elems;

	return 0;
}
174
175
/* Initialize both the base pool allocators and the root chunk of the index. */
__hidden int
scx_alloc_init(struct scx_allocator *alloc, __u64 data_size)
{
	size_t min_chunk_size;
	int ret;

	_Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE,
		"chunk size must fit into a page");

	/* Index chunks and descriptors come from single-page slabs. */
	ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1);
	if (ret != 0)
		return ret;

	ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1);
	if (ret != 0)
		return ret;

	/* Wrap data into a descriptor and word align. */
	data_size += sizeof(struct sdt_data);
	data_size = round_up(data_size, 8);

	/*
	 * Ensure we allocate large enough chunks from the arena to avoid
	 * excessive internal fragmentation when carving chunks into
	 * structs. Despite the name, min_chunk_size is a page count.
	 */
	min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE);
	ret = pool_set_size(&alloc->pool, data_size, min_chunk_size);
	if (ret != 0)
		return ret;

	/* Root chunk allocation requires the allocator lock. */
	bpf_spin_lock(&alloc_lock);
	alloc->root = scx_alloc_chunk();
	bpf_spin_unlock(&alloc_lock);
	if (!alloc->root)
		return -ENOMEM;

	return 0;
}
214
215
/*
 * Set (state == true) or clear the bit for slot `pos` in the
 * descriptor's allocation bitmap. Returns -EINVAL if pos is out of
 * range, 0 otherwise.
 */
static
int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state)
{
	__u64 __arena *allocated = desc->allocated;
	__u64 bit;

	if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK))
		return -EINVAL;

	/* Bitmap is an array of 64-bit words; locate word and bit. */
	bit = (__u64)1 << (pos % 64);

	if (state)
		allocated[pos / 64] |= bit;
	else
		allocated[pos / 64] &= ~bit;

	return 0;
}
233
234
/*
 * Walk the recorded tree path from leaf to root, marking the freed
 * slot available at each level. Propagation stops at the first
 * ancestor that already had free entries (nr_free > 1 after the
 * increment), since its own parent already knows it has space.
 * The loop counter starts from the global `zero` to keep it imprecise
 * for the verifier (see comment on `zero`).
 */
static __noinline
int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS])
{
	sdt_desc_t *desc;
	__u64 u, level;
	int ret;

	for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) {
		level = SDT_TASK_LEVELS - 1 - u;

		/* Only propagate upwards if we are the parent's only free chunk. */
		desc = lv_desc[level];

		ret = set_idx_state(desc, lv_pos[level], false);
		if (unlikely(ret != 0))
			return ret;

		desc->nr_free += 1;
		if (desc->nr_free > 1)
			return 0;
	}

	return 0;
}
258
259
/*
 * Free the allocated struct with the given index. Called with the
 * allocator lock taken. The index encodes the path through the
 * SDT_TASK_LEVELS-deep chunk tree, SDT_TASK_ENTS_PER_PAGE_SHIFT bits
 * per level. The data slot itself stays allocated for reuse; only its
 * payload is cleared and its generation counter bumped.
 */
__hidden
int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
{
	const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1;
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	struct sdt_data __arena *data;
	__u64 level, shift, pos;
	__u64 lv_pos[SDT_TASK_LEVELS];
	int ret;
	int i;

	if (!alloc)
		return 0;

	desc = alloc->root;
	if (unlikely(!desc))
		return -EINVAL;

	/* To appease the verifier. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		lv_desc[level] = NULL;
		lv_pos[level] = 0;
	}

	/* Find the leaf node containing the index. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT;
		pos = (idx >> shift) & mask;

		/* Record the path so mark_nodes_avail() can walk it back up. */
		lv_desc[level] = desc;
		lv_pos[level] = pos;

		if (level == SDT_TASK_LEVELS - 1)
			break;

		chunk = desc->chunk;

		desc_children = (sdt_desc_t * __arena *)chunk->descs;
		desc = desc_children[pos];

		if (unlikely(!desc))
			return -EINVAL;
	}

	chunk = desc->chunk;

	pos = idx & mask;
	data = chunk->data[pos];
	if (likely(data)) {
		/* Bump the generation so stale IDs for this slot are detectable. */
		*data = (struct sdt_data) {
			.tid.genn = data->tid.genn + 1,
		};

		/* Zero out one word at a time. */
		for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) {
			data->payload[i] = 0;
		}
	}

	ret = mark_nodes_avail(lv_desc, lv_pos);
	if (unlikely(ret != 0))
		return ret;

	alloc_stats.active_allocs -= 1;
	alloc_stats.free_ops += 1;

	return 0;
}
334
335
/*
 * Find-first-set for a 64-bit word via binary search on the halves,
 * returning the 0-based index of the lowest set bit (unlike the
 * 1-based libc ffs()). Undefined input word == 0 yields 63. Kept
 * branch-unrolled rather than looped to stay verifier-friendly.
 */
static inline
int ffs(__u64 word)
{
	int pos = 0;

	/* Narrow the search by halving the window at each step. */
	if (!(word & 0xffffffffULL)) {
		pos += 32;
		word >>= 32;
	}
	if (!(word & 0xffffULL)) {
		pos += 16;
		word >>= 16;
	}
	if (!(word & 0xffULL)) {
		pos += 8;
		word >>= 8;
	}
	if (!(word & 0xfULL)) {
		pos += 4;
		word >>= 4;
	}
	if (!(word & 0x3ULL)) {
		pos += 2;
		word >>= 2;
	}
	if (!(word & 0x1ULL))
		pos += 1;

	return pos;
}
372
373
374
/*
 * Find the first empty slot in the descriptor's allocation bitmap.
 * Returns the 0-based slot index (local ffs() is 0-based), or
 * SDT_TASK_ENTS_PER_CHUNK if the chunk is full.
 */
__hidden
__u64 chunk_find_empty(sdt_desc_t __arg_arena *desc)
{
	__u64 freeslots;
	__u64 i;

	for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
		/* Invert so free slots become set bits. */
		freeslots = ~desc->allocated[i];
		if (freeslots == (__u64)0)
			continue;

		return (i * 64) + ffs(freeslots);
	}

	return SDT_TASK_ENTS_PER_CHUNK;
}
391
392
/*
 * Find and return an available idx on the allocator.
 * Called with the task spinlock held.
 *
 * Descends the chunk tree toward the first free leaf slot, allocating
 * internal nodes on demand, then walks back up marking the slot used
 * and propagating full-chunk state. On success *idxp holds the
 * composed index (SDT_TASK_ENTS_PER_PAGE_SHIFT bits per level) and the
 * leaf descriptor is returned; NULL is returned when the tree is full
 * or chunk allocation fails.
 */
static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
{
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *tmp;
	__u64 lv_pos[SDT_TASK_LEVELS];
	__u64 u, pos, level;
	__u64 idx = 0;
	int ret;

	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		pos = chunk_find_empty(desc);

		/* If we error out, something has gone very wrong. */
		if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK))
			return NULL;

		/* Chunk full: no free slot anywhere below this node. */
		if (pos == SDT_TASK_ENTS_PER_CHUNK)
			return NULL;

		/* Append this level's slot to the composed index. */
		idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT;
		idx |= pos;

		/* Log the levels to complete allocation. */
		lv_desc[level] = desc;
		lv_pos[level] = pos;

		/* The rest of the loop is for internal node traversal. */
		if (level == SDT_TASK_LEVELS - 1)
			break;

		/* Allocate an internal node if necessary. */
		chunk = desc->chunk;
		desc_children = (sdt_desc_t * __arena *)chunk->descs;

		desc = desc_children[pos];
		if (!desc) {
			desc = scx_alloc_chunk();
			if (!desc)
				return NULL;

			desc_children[pos] = desc;
		}
	}

	/*
	 * Finding the descriptor along with any internal node
	 * allocations was successful. Update all levels with
	 * the new allocation.
	 */
	bpf_for(u, 0, SDT_TASK_LEVELS) {
		level = SDT_TASK_LEVELS - 1 - u;
		tmp = lv_desc[level];

		ret = set_idx_state(tmp, lv_pos[level], true);
		if (ret != 0)
			break;

		tmp->nr_free -= 1;
		/* Parent only needs updating once this chunk goes full. */
		if (tmp->nr_free > 0)
			break;
	}

	*idxp = idx;

	return desc;
}
464
465
/*
 * Allocate an element from the allocator: reserve an index in the
 * chunk tree, then attach (or reuse) the data slot backing it. The
 * whole operation runs under alloc_lock; every error path unlocks
 * before returning. Returns the arena pointer to the element's
 * struct sdt_data, or NULL on exhaustion.
 */
__hidden
void __arena *scx_alloc(struct scx_allocator *alloc)
{
	struct sdt_data __arena *data = NULL;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	__u64 idx, pos;

	if (!alloc)
		return NULL;

	bpf_spin_lock(&alloc_lock);

	/* We unlock if we encounter an error in the function. */
	desc = desc_find_empty(alloc->root, &idx);
	if (unlikely(desc == NULL)) {
		bpf_spin_unlock(&alloc_lock);
		return NULL;
	}

	chunk = desc->chunk;

	/* Populate the leaf node if necessary. */
	pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
	data = chunk->data[pos];
	if (!data) {
		data = scx_alloc_from_pool(&alloc->pool);
		if (!data) {
			/* Give the reserved index back before bailing. */
			scx_alloc_free_idx(alloc, idx);
			bpf_spin_unlock(&alloc_lock);
			return NULL;
		}
	}

	chunk->data[pos] = data;

	/* The data counts as a chunk */
	alloc_stats.data_allocs += 1;
	alloc_stats.alloc_ops += 1;
	alloc_stats.active_allocs += 1;

	/* Record the index so the element can be freed by ID later. */
	data->tid.idx = idx;

	bpf_spin_unlock(&alloc_lock);

	return data;
}
512
513
/*
 * Task BPF map entry recording the task's assigned ID and pointing to the data
 * area allocated in arena.
 */
struct scx_task_map_val {
	union sdt_id tid;	/* allocator-assigned {index, generation} ID */
	__u64 tptr;		/* task_struct pointer recorded at alloc time */
	struct sdt_data __arena *data;	/* arena-resident per-task data */
};

/* Task-local storage: one scx_task_map_val per task in the scheduler. */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct scx_task_map_val);
} scx_task_map SEC(".maps");

/* Arena allocator instance backing all per-task data. */
static struct scx_allocator scx_task_allocator;
531
532
/*
 * Allocate arena-backed data for task p and stash the pointer and ID
 * in its task-local storage. Returns the arena payload pointer, or
 * NULL on failure. NOTE(review): on allocation failure the created
 * storage entry is left in place with stale fields — appears benign
 * since callers treat NULL as fatal, but confirm.
 */
__hidden
void __arena *scx_task_alloc(struct task_struct *p)
{
	struct sdt_data __arena *data = NULL;
	struct scx_task_map_val *mval;

	mval = bpf_task_storage_get(&scx_task_map, p, 0,
		BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!mval)
		return NULL;

	data = scx_alloc(&scx_task_allocator);
	if (unlikely(!data))
		return NULL;

	/* Record ID and pointers so data can be found and freed later. */
	mval->tid = data->tid;
	mval->tptr = (__u64) p;
	mval->data = data;

	return (void __arena *)data->payload;
}
553
554
/* Initialize the per-task allocator for payloads of data_size bytes. */
__hidden
int scx_task_init(__u64 data_size)
{
	return scx_alloc_init(&scx_task_allocator, data_size);
}
559
560
/*
 * Look up the arena payload previously allocated for task p via
 * scx_task_alloc(). Returns NULL if the task has no storage entry.
 * NOTE(review): mval->data is dereferenced without a NULL check —
 * presumably storage entries only exist after a successful alloc;
 * confirm no path creates an entry without setting data.
 */
__hidden
void __arena *scx_task_data(struct task_struct *p)
{
	struct sdt_data __arena *data;
	struct scx_task_map_val *mval;

	/* Ensure the verifier associates the arena with this program. */
	scx_arena_subprog_init();

	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
	if (!mval)
		return NULL;

	data = mval->data;

	return (void __arena *)data->payload;
}
576
577
/*
 * Release task p's arena data back to the allocator and drop its
 * task-local storage entry. No-op if the task has no entry.
 */
__hidden
void scx_task_free(struct task_struct *p)
{
	struct scx_task_map_val *mval;

	/* Ensure the verifier associates the arena with this program. */
	scx_arena_subprog_init();

	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
	if (!mval)
		return;

	/* Freeing by index requires the allocator lock. */
	bpf_spin_lock(&alloc_lock);
	scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx);
	bpf_spin_unlock(&alloc_lock);

	bpf_task_storage_delete(&scx_task_map, p);
}
594
595
/*
 * Fold a task's arena-resident counters into the global stat_*
 * totals. Atomics are used since multiple tasks may exit
 * concurrently.
 */
static inline void
scx_stat_global_update(struct scx_stats __arena *stats)
{
	cast_kern(stats);
	__sync_fetch_and_add(&stat_enqueue, stats->enqueue);
	__sync_fetch_and_add(&stat_init, stats->init);
	__sync_fetch_and_add(&stat_exit, stats->exit);
	__sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
	__sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
}
605
606
/*
 * ops.select_cpu: pick a CPU via the default idle-selection helper.
 * When an idle CPU is found the task is dispatched directly to that
 * CPU's local DSQ, bypassing ops.enqueue. Stats live in the task's
 * arena data.
 */
s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	struct scx_stats __arena *stats;
	bool is_idle = false;
	s32 cpu;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return 0;
	}

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle) {
		stat_inc_select_idle_cpu(stats);
		/* Direct dispatch: skip the shared queue entirely. */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	} else {
		stat_inc_select_busy_cpu(stats);
	}

	return cpu;
}
628
629
/*
 * ops.enqueue: count the enqueue in the task's arena stats and put the
 * task on the shared DSQ with the default slice.
 */
void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct scx_stats __arena *stats;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}

	stat_inc_enqueue(stats);

	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}
643
644
/* ops.dispatch: pull the next task from the shared DSQ onto this CPU. */
void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}
648
649
/*
 * ops.init_task (sleepable): allocate the task's arena stats area when
 * it joins the scheduler. Fails the task's entry with -ENOMEM if the
 * arena allocator is exhausted.
 */
s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
		struct scx_init_task_args *args)
{
	struct scx_stats __arena *stats;

	stats = scx_task_alloc(p);
	if (!stats) {
		scx_bpf_error("arena allocator out of memory");
		return -ENOMEM;
	}

	stats->pid = p->pid;

	stat_inc_init(stats);

	return 0;
}
666
667
/*
 * ops.exit_task: fold the task's per-task counters into the global
 * totals, then free its arena data as it leaves the scheduler.
 */
void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
		struct scx_exit_task_args *args)
{
	struct scx_stats __arena *stats;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}

	stat_inc_exit(stats);
	/* Must happen before scx_task_free() invalidates stats. */
	scx_stat_global_update(stats);

	scx_task_free(p);
}
683
684
/*
 * ops.init (sleepable): set up the per-task allocator for
 * struct scx_stats payloads and create the shared DSQ.
 */
s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
{
	int ret;

	ret = scx_task_init(sizeof(struct scx_stats));
	if (ret < 0) {
		scx_bpf_error("%s: failed with %d", __func__, ret);
		return ret;
	}

	/* -1: no built-in node affinity for the shared queue. */
	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret) {
		scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
		return ret;
	}

	return 0;
}
702
703
/* ops.exit: record exit info for userspace to report. */
void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
707
708
/* struct_ops table registering the callbacks above as scheduler "sdt". */
SCX_OPS_DEFINE(sdt_ops,
	.select_cpu = (void *)sdt_select_cpu,
	.enqueue = (void *)sdt_enqueue,
	.dispatch = (void *)sdt_dispatch,
	.init_task = (void *)sdt_init_task,
	.exit_task = (void *)sdt_exit_task,
	.init = (void *)sdt_init,
	.exit = (void *)sdt_exit,
	.name = "sdt");
717
718