GitHub Repository: torvalds/linux
Path: blob/master/kernel/bpf/ringbuf.c
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <asm/rqspinlock.h>

#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RB_OVERWRITE)

/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
	(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2
#define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES)

#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)

struct bpf_ringbuf {
	wait_queue_head_t waitq;
	struct irq_work work;
	u64 mask;
	struct page **pages;
	int nr_pages;
	bool overwrite_mode;
	rqspinlock_t spinlock ____cacheline_aligned_in_smp;
	/* For user-space producer ring buffers, an atomic_t busy bit is used
	 * to synchronize access to the ring buffers in the kernel, rather than
	 * the spinlock that is used for kernel-producer ring buffers. This is
	 * done because the ring buffer must hold a lock across a BPF program's
	 * callback:
	 *
	 *    __bpf_user_ringbuf_peek() // lock acquired
	 * -> program callback_fn()
	 * -> __bpf_user_ringbuf_sample_release() // lock released
	 *
	 * It is unsafe and incorrect to hold an IRQ spinlock across what could
	 * be a long execution window, so we instead simply disallow concurrent
	 * access to the ring buffer by kernel consumers, and return -EBUSY from
	 * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
	 */
	atomic_t busy ____cacheline_aligned_in_smp;
	/* Consumer and producer counters are put into separate pages to
	 * allow each position to be mapped with different permissions.
	 * This prevents a user-space application from modifying the
	 * position and ruining in-kernel tracking. The permissions of the
	 * pages depend on who is producing samples: user-space or the
	 * kernel. Note that the pending counter is placed in the same
	 * page as the producer, so that it shares the same cache line.
	 *
	 * Kernel-producer
	 * ---------------
	 * The producer position and data pages are mapped as r/o in
	 * userspace. For this approach, bits in the header of samples are
	 * used to signal to user-space, and to other producers, whether a
	 * sample is currently being written.
	 *
	 * User-space producer
	 * -------------------
	 * Only the page containing the consumer position is mapped r/o in
	 * user-space. User-space producers also use bits of the header to
	 * communicate to the kernel, but the kernel must carefully check and
	 * validate each sample to ensure that they're correctly formatted, and
	 * fully contained within the ring buffer.
	 */
	unsigned long consumer_pos __aligned(PAGE_SIZE);
	unsigned long producer_pos __aligned(PAGE_SIZE);
	unsigned long pending_pos;
	unsigned long overwrite_pos; /* position after the last overwritten record */
	char data[] __aligned(PAGE_SIZE);
};
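
/* Illustrative sketch (not part of this file): how user space typically maps
 * a kernel-producer ring buffer, following the permission scheme described
 * above. The consumer page is mapped writable at page offset 0; the producer
 * page plus the double-mapped data area are mapped read-only starting at page
 * offset 1. This roughly mirrors what libbpf's ring_buffer implementation
 * does; map_fd and data_sz (== max_entries) are assumed to come from the
 * caller.
 *
 *	long page_size = sysconf(_SC_PAGESIZE);
 *	void *cons = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, map_fd, 0);
 *	void *prod = mmap(NULL, page_size + 2 * data_sz, PROT_READ,
 *			  MAP_SHARED, map_fd, page_size);
 *	// consumer_pos lives at cons, producer_pos at prod,
 *	// sample data starts at prod + page_size.
 */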

struct bpf_ringbuf_map {
	struct bpf_map map;
	struct bpf_ringbuf *rb;
};

/* 8-byte ring buffer record header structure */
struct bpf_ringbuf_hdr {
	u32 len;
	u32 pg_off;
};

static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
	const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
			    __GFP_NOWARN | __GFP_ZERO;
	int nr_meta_pages = RINGBUF_NR_META_PAGES;
	int nr_data_pages = data_sz >> PAGE_SHIFT;
	int nr_pages = nr_meta_pages + nr_data_pages;
	struct page **pages, *page;
	struct bpf_ringbuf *rb;
	size_t array_size;
	int i;

	/* Each data page is mapped twice to allow "virtual"
	 * continuous read of samples wrapping around the end of ring
	 * buffer area:
	 * ------------------------------------------------------
	 * | meta pages |  real data pages  |  same data pages  |
	 * ------------------------------------------------------
	 * |            | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 |
	 * ------------------------------------------------------
	 * |            | TA             DA | TA             DA |
	 * ------------------------------------------------------
	 *                               ^^^^^^^
	 *                                  |
	 * Here, no need to worry about special handling of wrapped-around
	 * data due to double-mapped data pages. This works both in kernel and
	 * when mmap()'ed in user-space, simplifying both kernel and
	 * user-space implementations significantly.
	 */
	array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
	pages = bpf_map_area_alloc(array_size, numa_node);
	if (!pages)
		return NULL;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(numa_node, flags, 0);
		if (!page) {
			nr_pages = i;
			goto err_free_pages;
		}
		pages[i] = page;
		if (i >= nr_meta_pages)
			pages[nr_data_pages + i] = page;
	}

	rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
		  VM_MAP | VM_USERMAP, PAGE_KERNEL);
	if (rb) {
		kmemleak_not_leak(pages);
		rb->pages = pages;
		rb->nr_pages = nr_pages;
		return rb;
	}

err_free_pages:
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);
	bpf_map_area_free(pages);
	return NULL;
}
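
/* Illustrative sketch (not part of this file): with the double mapping set up
 * above, a consumer can read a sample that wraps past the end of the data
 * area with one contiguous access. Assuming a user-space consumer where data
 * points at the start of the (double-mapped) data area and mask == data_sz - 1:
 *
 *	// hdr points at the record header; even if the record starts near
 *	// the end of the data area, the bytes past data_sz are the same
 *	// physical pages mapped a second time.
 *	__u32 *hdr = data + (cons_pos & mask);
 *	__u32 len = __atomic_load_n(hdr, __ATOMIC_ACQUIRE);
 *	void *sample = (void *)hdr + BPF_RINGBUF_HDR_SZ;
 *	memcpy(out, sample, len);	// no wrap-around handling needed
 */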

static void bpf_ringbuf_notify(struct irq_work *work)
{
	struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);

	wake_up_all(&rb->waitq);
}

/* Maximum size of ring buffer area is limited by 32-bit page offset within
 * record header, counted in pages. Reserve 8 bits for extensibility, and
 * take into account a few extra pages for consumer/producer pages and
 * non-mmap()'able parts; the current maximum size would be:
 *
 *     (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
 *
 * This gives a 64GB limit, which seems plenty for a single ring buffer. Now
 * considering that the maximum value of data_sz is (4GB - 1), there
 * will be no overflow, so just note the size limit in the comments.
 */
static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, bool overwrite_mode)
{
	struct bpf_ringbuf *rb;

	rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
	if (!rb)
		return NULL;

	raw_res_spin_lock_init(&rb->spinlock);
	atomic_set(&rb->busy, 0);
	init_waitqueue_head(&rb->waitq);
	init_irq_work(&rb->work, bpf_ringbuf_notify);

	rb->mask = data_sz - 1;
	rb->consumer_pos = 0;
	rb->producer_pos = 0;
	rb->pending_pos = 0;
	rb->overwrite_mode = overwrite_mode;

	return rb;
}

static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
	bool overwrite_mode = false;
	struct bpf_ringbuf_map *rb_map;

	if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	if (attr->map_flags & BPF_F_RB_OVERWRITE) {
		if (attr->map_type != BPF_MAP_TYPE_RINGBUF)
			return ERR_PTR(-EINVAL);
		overwrite_mode = true;
	}

	if (attr->key_size || attr->value_size ||
	    !is_power_of_2(attr->max_entries) ||
	    !PAGE_ALIGNED(attr->max_entries))
		return ERR_PTR(-EINVAL);

	rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
	if (!rb_map)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&rb_map->map, attr);

	rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, overwrite_mode);
	if (!rb_map->rb) {
		bpf_map_area_free(rb_map);
		return ERR_PTR(-ENOMEM);
	}

	return &rb_map->map;
}
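
/* Illustrative sketch (not part of this file): creating this map type from
 * user space with libbpf. key_size and value_size must be 0 and max_entries
 * must be a power-of-2 multiple of the page size, matching the checks in
 * ringbuf_map_alloc() above; the 256 KiB size below is an arbitrary example.
 * BPF_F_RB_OVERWRITE would be passed via the opts' map_flags.
 *
 *	// key_size = 0, value_size = 0, max_entries = 256 KiB
 *	int map_fd = bpf_map_create(BPF_MAP_TYPE_RINGBUF, "events",
 *				    0, 0, 256 * 1024, NULL);
 *	if (map_fd < 0)
 *		// handle error
 */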

static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
{
	irq_work_sync(&rb->work);

	/* copy pages pointer and nr_pages to local variable, as we are going
	 * to unmap rb itself with vunmap() below
	 */
	struct page **pages = rb->pages;
	int i, nr_pages = rb->nr_pages;

	vunmap(rb);
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);
	bpf_map_area_free(pages);
}

static void ringbuf_map_free(struct bpf_map *map)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	bpf_ringbuf_free(rb_map->rb);
	bpf_map_area_free(rb_map);
}

static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-ENOTSUPP);
}

static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 flags)
{
	return -ENOTSUPP;
}

static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
	return -ENOTSUPP;
}

static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
				    void *next_key)
{
	return -ENOTSUPP;
}

static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	if (vma->vm_flags & VM_WRITE) {
		/* allow writable mapping for the consumer_pos only */
		if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EPERM;
	}
	/* remap_vmalloc_range() checks size and offset constraints */
	return remap_vmalloc_range(vma, rb_map->rb,
				   vma->vm_pgoff + RINGBUF_PGOFF);
}

static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	if (vma->vm_flags & VM_WRITE) {
		if (vma->vm_pgoff == 0)
			/* Disallow writable mappings to the consumer pointer,
			 * and allow writable mappings to both the producer
			 * position and the ring buffer data itself.
			 */
			return -EPERM;
	}
	/* remap_vmalloc_range() checks size and offset constraints */
	return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
}

/*
 * Return an estimate of the available data in the ring buffer.
 * Note: the returned value can exceed the actual ring buffer size because the
 * function is not synchronized with the producer. The producer acquires the
 * ring buffer's spinlock, but this function does not.
 */
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
	unsigned long cons_pos, prod_pos, over_pos;

	cons_pos = smp_load_acquire(&rb->consumer_pos);

	if (unlikely(rb->overwrite_mode)) {
		over_pos = smp_load_acquire(&rb->overwrite_pos);
		prod_pos = smp_load_acquire(&rb->producer_pos);
		return prod_pos - max(cons_pos, over_pos);
	} else {
		prod_pos = smp_load_acquire(&rb->producer_pos);
		return prod_pos - cons_pos;
	}
}

static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
{
	return rb->mask + 1;
}

static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
				      struct poll_table_struct *pts)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	poll_wait(filp, &rb_map->rb->waitq, pts);

	if (ringbuf_avail_data_sz(rb_map->rb))
		return EPOLLIN | EPOLLRDNORM;
	return 0;
}
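
/* Illustrative sketch (not part of this file): the EPOLLIN behaviour above is
 * what lets a user-space consumer block until the kernel producer wakes it up
 * via the irq_work queued in bpf_ringbuf_commit(). With libbpf this is
 * typically driven as below; map_fd is assumed to refer to a
 * BPF_MAP_TYPE_RINGBUF map.
 *
 *	static int handle_event(void *ctx, void *data, size_t len)
 *	{
 *		// process one committed sample
 *		return 0;
 *	}
 *
 *	struct ring_buffer *rb = ring_buffer__new(map_fd, handle_event,
 *						  NULL, NULL);
 *	while (ring_buffer__poll(rb, 100) >= 0)	// 100 ms timeout
 *		;
 *	ring_buffer__free(rb);
 */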

static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
				      struct poll_table_struct *pts)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	poll_wait(filp, &rb_map->rb->waitq, pts);

	if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
		return EPOLLOUT | EPOLLWRNORM;
	return 0;
}

static u64 ringbuf_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_ringbuf *rb;
	int nr_data_pages;
	int nr_meta_pages;
	u64 usage = sizeof(struct bpf_ringbuf_map);

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
	usage += (u64)rb->nr_pages << PAGE_SHIFT;
	nr_meta_pages = RINGBUF_NR_META_PAGES;
	nr_data_pages = map->max_entries >> PAGE_SHIFT;
	usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *);
	return usage;
}

BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = ringbuf_map_alloc,
	.map_free = ringbuf_map_free,
	.map_mmap = ringbuf_map_mmap_kern,
	.map_poll = ringbuf_map_poll_kern,
	.map_lookup_elem = ringbuf_map_lookup_elem,
	.map_update_elem = ringbuf_map_update_elem,
	.map_delete_elem = ringbuf_map_delete_elem,
	.map_get_next_key = ringbuf_map_get_next_key,
	.map_mem_usage = ringbuf_map_mem_usage,
	.map_btf_id = &ringbuf_map_btf_ids[0],
};

BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops user_ringbuf_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = ringbuf_map_alloc,
	.map_free = ringbuf_map_free,
	.map_mmap = ringbuf_map_mmap_user,
	.map_poll = ringbuf_map_poll_user,
	.map_lookup_elem = ringbuf_map_lookup_elem,
	.map_update_elem = ringbuf_map_update_elem,
	.map_delete_elem = ringbuf_map_delete_elem,
	.map_get_next_key = ringbuf_map_get_next_key,
	.map_mem_usage = ringbuf_map_mem_usage,
	.map_btf_id = &user_ringbuf_map_btf_ids[0],
};

/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
 * calculate offset from record metadata to ring buffer in pages, rounded
 * down. This page offset is stored as part of record metadata and allows
 * restoring struct bpf_ringbuf * from record pointer. This page offset is
 * stored at offset 4 of record metadata header.
 */
static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
				     struct bpf_ringbuf_hdr *hdr)
{
	return ((void *)hdr - (void *)rb) >> PAGE_SHIFT;
}

/* Given pointer to ring buffer record header, restore pointer to struct
 * bpf_ringbuf itself by using page offset stored at offset 4
 */
static struct bpf_ringbuf *
bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
{
	unsigned long addr = (unsigned long)(void *)hdr;
	unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;

	return (void*)((addr & PAGE_MASK) - off);
}

static bool bpf_ringbuf_has_space(const struct bpf_ringbuf *rb,
				  unsigned long new_prod_pos,
				  unsigned long cons_pos,
				  unsigned long pend_pos)
{
	/*
	 * No space if the span from the oldest not-yet-committed record to
	 * the newest record exceeds (ringbuf_size - 1).
	 */
	if (new_prod_pos - pend_pos > rb->mask)
		return false;

	/* Ok, we have space in overwrite mode */
	if (unlikely(rb->overwrite_mode))
		return true;

	/*
	 * No space if producer position advances more than (ringbuf_size - 1)
	 * ahead of consumer position when not in overwrite mode.
	 */
	if (new_prod_pos - cons_pos > rb->mask)
		return false;

	return true;
}
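
/* Worked example (illustrative, not part of this file): with a 4096-byte data
 * area, mask == 4095. If pend_pos == 0 (the oldest record was never
 * committed) and a producer wants to advance producer_pos from 4088 to 4104
 * with a 16-byte record, then new_prod_pos - pend_pos == 4104 > mask, so
 * there is no space in either mode. If instead all earlier records were
 * committed (pend_pos == 4088) and cons_pos == 512, both spans (16 and 3592)
 * stay within mask and the reservation succeeds.
 */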

static u32 bpf_ringbuf_round_up_hdr_len(u32 hdr_len)
{
	hdr_len &= ~BPF_RINGBUF_DISCARD_BIT;
	return round_up(hdr_len + BPF_RINGBUF_HDR_SZ, 8);
}

static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
	unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, over_pos, flags;
	struct bpf_ringbuf_hdr *hdr;
	u32 len, pg_off, hdr_len;

	if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
		return NULL;

	len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
	if (len > ringbuf_total_data_sz(rb))
		return NULL;

	cons_pos = smp_load_acquire(&rb->consumer_pos);

	if (raw_res_spin_lock_irqsave(&rb->spinlock, flags))
		return NULL;

	pend_pos = rb->pending_pos;
	prod_pos = rb->producer_pos;
	new_prod_pos = prod_pos + len;

	while (pend_pos < prod_pos) {
		hdr = (void *)rb->data + (pend_pos & rb->mask);
		hdr_len = READ_ONCE(hdr->len);
		if (hdr_len & BPF_RINGBUF_BUSY_BIT)
			break;
		pend_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
	}
	rb->pending_pos = pend_pos;

	if (!bpf_ringbuf_has_space(rb, new_prod_pos, cons_pos, pend_pos)) {
		raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
		return NULL;
	}

	/*
	 * In overwrite mode, advance overwrite_pos when the ring buffer is full.
	 * The key points are to stay on record boundaries and consume enough records
	 * to fit the new one.
	 */
	if (unlikely(rb->overwrite_mode)) {
		over_pos = rb->overwrite_pos;
		while (new_prod_pos - over_pos > rb->mask) {
			hdr = (void *)rb->data + (over_pos & rb->mask);
			hdr_len = READ_ONCE(hdr->len);
			/*
			 * The bpf_ringbuf_has_space() check above ensures we won't
			 * step over a record currently being worked on by another
			 * producer.
			 */
			over_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
		}
		/*
		 * smp_store_release(&rb->producer_pos, new_prod_pos) at
		 * the end of the function ensures that when the consumer sees
		 * the updated rb->producer_pos, it always sees the updated
		 * rb->overwrite_pos, so when the consumer reads overwrite_pos
		 * after smp_load_acquire(&rb->producer_pos), the overwrite_pos
		 * will always be valid.
		 */
		WRITE_ONCE(rb->overwrite_pos, over_pos);
	}

	hdr = (void *)rb->data + (prod_pos & rb->mask);
	pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
	hdr->len = size | BPF_RINGBUF_BUSY_BIT;
	hdr->pg_off = pg_off;

	/* pairs with consumer's smp_load_acquire() */
	smp_store_release(&rb->producer_pos, new_prod_pos);

	raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);

	return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}
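
/* Illustrative sketch (not part of this file): the reserve/commit API as seen
 * from a BPF program. "events" is an assumed BPF_MAP_TYPE_RINGBUF map and
 * struct event is an assumed sample layout; error handling is minimal.
 *
 *	struct event *e;
 *
 *	e = bpf_ringbuf_reserve(&events, sizeof(*e), 0);
 *	if (!e)
 *		return 0;	// ring buffer full, or record too large
 *	e->pid = bpf_get_current_pid_tgid() >> 32;
 *	bpf_ringbuf_submit(e, 0);	// or bpf_ringbuf_discard(e, 0)
 */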

BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
{
	struct bpf_ringbuf_map *rb_map;

	if (unlikely(flags))
		return 0;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
}

const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
	.func		= bpf_ringbuf_reserve,
	.ret_type	= RET_PTR_TO_RINGBUF_MEM_OR_NULL,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_CONST_ALLOC_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
};

static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
{
	unsigned long rec_pos, cons_pos;
	struct bpf_ringbuf_hdr *hdr;
	struct bpf_ringbuf *rb;
	u32 new_len;

	hdr = sample - BPF_RINGBUF_HDR_SZ;
	rb = bpf_ringbuf_restore_from_rec(hdr);
	new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
	if (discard)
		new_len |= BPF_RINGBUF_DISCARD_BIT;

	/* update record header with correct final size prefix */
	xchg(&hdr->len, new_len);

	/* if consumer caught up and is waiting for our record, notify about
	 * new data availability
	 */
	rec_pos = (void *)hdr - (void *)rb->data;
	cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;

	if (flags & BPF_RB_FORCE_WAKEUP)
		irq_work_queue(&rb->work);
	else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
		irq_work_queue(&rb->work);
}

BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
{
	bpf_ringbuf_commit(sample, flags, false /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_proto = {
	.func		= bpf_ringbuf_submit,
	.ret_type	= RET_VOID,
	.arg1_type	= ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
{
	bpf_ringbuf_commit(sample, flags, true /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_proto = {
	.func		= bpf_ringbuf_discard,
	.ret_type	= RET_VOID,
	.arg1_type	= ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size,
	   u64, flags)
{
	struct bpf_ringbuf_map *rb_map;
	void *rec;

	if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP)))
		return -EINVAL;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	rec = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!rec)
		return -EAGAIN;

	memcpy(rec, data, size);
	bpf_ringbuf_commit(rec, flags, false /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_output_proto = {
	.func		= bpf_ringbuf_output,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
{
	struct bpf_ringbuf *rb;

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

	switch (flags) {
	case BPF_RB_AVAIL_DATA:
		return ringbuf_avail_data_sz(rb);
	case BPF_RB_RING_SIZE:
		return ringbuf_total_data_sz(rb);
	case BPF_RB_CONS_POS:
		return smp_load_acquire(&rb->consumer_pos);
	case BPF_RB_PROD_POS:
		return smp_load_acquire(&rb->producer_pos);
	case BPF_RB_OVERWRITE_POS:
		return smp_load_acquire(&rb->overwrite_pos);
	default:
		return 0;
	}
}
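
/* Illustrative sketch (not part of this file): a BPF program can use
 * bpf_ringbuf_query() to apply backpressure before reserving, e.g. by
 * dropping low-priority events once the buffer is more than half full.
 * "events" is an assumed BPF_MAP_TYPE_RINGBUF map, and the value returned
 * for BPF_RB_AVAIL_DATA is only an estimate, as noted above.
 *
 *	__u64 avail = bpf_ringbuf_query(&events, BPF_RB_AVAIL_DATA);
 *	__u64 size  = bpf_ringbuf_query(&events, BPF_RB_RING_SIZE);
 *	if (avail * 2 > size)
 *		return 0;	// skip this event
 */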

const struct bpf_func_proto bpf_ringbuf_query_proto = {
	.func		= bpf_ringbuf_query,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
	   struct bpf_dynptr_kern *, ptr)
{
	struct bpf_ringbuf_map *rb_map;
	void *sample;
	int err;

	if (unlikely(flags)) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	err = bpf_dynptr_check_size(size);
	if (err) {
		bpf_dynptr_set_null(ptr);
		return err;
	}

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	sample = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!sample) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
	.func		= bpf_ringbuf_reserve_dynptr,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE,
};

BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, false /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
	.func		= bpf_ringbuf_submit_dynptr,
	.ret_type	= RET_VOID,
	.arg1_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, true /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
	.func		= bpf_ringbuf_discard_dynptr,
	.ret_type	= RET_VOID,
	.arg1_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type	= ARG_ANYTHING,
};
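
/* Illustrative sketch (not part of this file): the dynptr variants above as
 * used from a BPF program. A verifier-tracked bpf_dynptr replaces the raw
 * sample pointer, and a reserved record must be released through the dynptr
 * even when the reserve call fails. "events" and struct event are assumed,
 * as before.
 *
 *	struct bpf_dynptr ptr;
 *	struct event *e;
 *
 *	if (bpf_ringbuf_reserve_dynptr(&events, sizeof(*e), 0, &ptr)) {
 *		bpf_ringbuf_discard_dynptr(&ptr, 0);
 *		return 0;
 *	}
 *	e = bpf_dynptr_data(&ptr, 0, sizeof(*e));
 *	if (e)
 *		e->pid = bpf_get_current_pid_tgid() >> 32;
 *	bpf_ringbuf_submit_dynptr(&ptr, 0);
 */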

static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
{
	int err;
	u32 hdr_len, sample_len, total_len, flags, *hdr;
	u64 cons_pos, prod_pos;

	/* Synchronizes with smp_store_release() in user-space producer. */
	prod_pos = smp_load_acquire(&rb->producer_pos);
	if (prod_pos % 8)
		return -EINVAL;

	/* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
	cons_pos = smp_load_acquire(&rb->consumer_pos);
	if (cons_pos >= prod_pos)
		return -ENODATA;

	hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
	/* Synchronizes with smp_store_release() in user-space producer. */
	hdr_len = smp_load_acquire(hdr);
	flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
	sample_len = hdr_len & ~flags;
	total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);

	/* The sample must fit within the region advertised by the producer position. */
	if (total_len > prod_pos - cons_pos)
		return -EINVAL;

	/* The sample must fit within the data region of the ring buffer. */
	if (total_len > ringbuf_total_data_sz(rb))
		return -E2BIG;

	/* The sample must fit into a struct bpf_dynptr. */
	err = bpf_dynptr_check_size(sample_len);
	if (err)
		return -E2BIG;

	if (flags & BPF_RINGBUF_DISCARD_BIT) {
		/* If the discard bit is set, the sample should be skipped.
		 *
		 * Update the consumer pos, and return -EAGAIN so the caller
		 * knows to skip this sample and try to read the next one.
		 */
		smp_store_release(&rb->consumer_pos, cons_pos + total_len);
		return -EAGAIN;
	}

	if (flags & BPF_RINGBUF_BUSY_BIT)
		return -ENODATA;

	*sample = (void *)((uintptr_t)rb->data +
			   (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
	*size = sample_len;
	return 0;
}

static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
{
	u64 consumer_pos;
	u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);

	/* Using smp_load_acquire() is unnecessary here, as the busy-bit
	 * prevents another task from writing to consumer_pos after it was read
	 * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
	 */
	consumer_pos = rb->consumer_pos;
	/* Synchronizes with smp_load_acquire() in user-space producer. */
	smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
}

BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
	   void *, callback_fn, void *, callback_ctx, u64, flags)
{
	struct bpf_ringbuf *rb;
	long samples, discarded_samples = 0, ret = 0;
	bpf_callback_t callback = (bpf_callback_t)callback_fn;
	u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
	int busy = 0;

	if (unlikely(flags & ~wakeup_flags))
		return -EINVAL;

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

	/* If another consumer is already consuming a sample, wait for them to finish. */
	if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
		return -EBUSY;

	for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
		int err;
		u32 size;
		void *sample;
		struct bpf_dynptr_kern dynptr;

		err = __bpf_user_ringbuf_peek(rb, &sample, &size);
		if (err) {
			if (err == -ENODATA) {
				break;
			} else if (err == -EAGAIN) {
				discarded_samples++;
				continue;
			} else {
				ret = err;
				goto schedule_work_return;
			}
		}

		bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
		ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
		__bpf_user_ringbuf_sample_release(rb, size, flags);
	}
	ret = samples - discarded_samples;

schedule_work_return:
	/* Prevent the clearing of the busy-bit from being reordered before the
	 * storing of any rb consumer or producer positions.
	 */
	atomic_set_release(&rb->busy, 0);

	if (flags & BPF_RB_FORCE_WAKEUP)
		irq_work_queue(&rb->work);
	else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
		irq_work_queue(&rb->work);
	return ret;
}
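
/* Illustrative sketch (not part of this file): the two ends of a
 * BPF_MAP_TYPE_USER_RINGBUF map. The BPF side drains with a callback that
 * receives each sample as a dynptr; the user side produces with libbpf's
 * user_ring_buffer API. "user_events", struct msg and map_fd are assumed.
 *
 * BPF program:
 *	static long handle_msg(struct bpf_dynptr *dynptr, void *ctx)
 *	{
 *		const struct msg *m = bpf_dynptr_data(dynptr, 0, sizeof(*m));
 *
 *		if (m)
 *			; // consume *m
 *		return 0;	// non-zero stops draining early
 *	}
 *
 *	long n = bpf_user_ringbuf_drain(&user_events, handle_msg, NULL, 0);
 *
 * User space:
 *	struct user_ring_buffer *urb = user_ring_buffer__new(map_fd, NULL);
 *	struct msg *m = user_ring_buffer__reserve(urb, sizeof(*m));
 *
 *	if (m) {
 *		m->value = 42;
 *		user_ring_buffer__submit(urb, m);
 *	}
 */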

const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
	.func		= bpf_user_ringbuf_drain,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_FUNC,
	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
	.arg4_type	= ARG_ANYTHING,
};