GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/net/bpf_zerocopy.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2007 Seccuris Inc.
 * All rights reserved.
 *
 * This software was developed by Robert N. M. Watson under contract to
 * Seccuris Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_bpf.h"

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sf_buf.h>
#include <sys/socket.h>
#include <sys/uio.h>

#include <machine/atomic.h>

#include <net/if.h>
#include <net/bpf.h>
#include <net/bpf_zerocopy.h>
#include <net/bpfdesc.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

/*
 * Zero-copy buffer scheme for BPF: user space "donates" two buffers, which
 * are mapped into the kernel address space using sf_bufs and used directly
 * by BPF.  Memory is wired since page faults cannot be tolerated in the
 * contexts where the buffers are copied to (locks held, interrupt context,
 * etc).  Access to shared memory buffers is synchronized using a header on
 * each buffer, allowing the number of system calls to go to zero as BPF
 * reaches saturation (buffers filled as fast as they can be drained by the
 * user process).  Full details of the protocol for communicating between the
 * user process and BPF may be found in bpf(4).
 */
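/*
 * Illustrative userspace sketch (not compiled as part of this file): one way
 * a process might donate two page-aligned buffers using the ioctls
 * implemented below.  The ioctl names (BIOCSETBUFMODE, BIOCGETZMAX,
 * BIOCSETZBUF, BIOCSETIF) and struct bpf_zbuf are taken from <net/bpf.h> and
 * bpf(4); the interface name is only an example and error handling is
 * omitted for brevity.
 *
 *	int fd = open("/dev/bpf", O_RDWR);
 *
 *	u_int mode = BPF_BUFMODE_ZBUF;
 *	ioctl(fd, BIOCSETBUFMODE, &mode);
 *
 *	size_t zmax;
 *	ioctl(fd, BIOCGETZMAX, &zmax);		// BPF_MAX_PAGES * PAGE_SIZE
 *
 *	size_t len = 4 * getpagesize();		// page multiple, <= zmax
 *	struct bpf_zbuf bz;
 *	bz.bz_bufa = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_SHARED, -1, 0);
 *	bz.bz_bufb = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_SHARED, -1, 0);
 *	bz.bz_buflen = len;
 *	ioctl(fd, BIOCSETZBUF, &bz);	// must precede interface attach
 *
 *	struct ifreq ifr = { .ifr_name = "em0" };
 *	ioctl(fd, BIOCSETIF, &ifr);
 */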

/*
 * Maximum number of pages per buffer.  Since all BPF devices use two, the
 * maximum per device is 2*BPF_MAX_PAGES.  Resource limits on the number of
 * sf_bufs may be an issue, so do not set this too high.  On older systems,
 * kernel address space limits may also be an issue.
 */
#define	BPF_MAX_PAGES	512

/*
 * struct zbuf describes a memory buffer loaned by a user process to the
 * kernel.  We represent this as a series of pages managed using an array of
 * sf_bufs.  Even though the memory is contiguous in user space, it may not
 * be mapped contiguously in the kernel (i.e., a set of physically
 * non-contiguous pages in the direct map region) so we must implement
 * scatter-gather copying.  One significant mitigating factor is that on
 * systems with a direct memory map, we can avoid TLB misses.
 *
 * At the front of the shared memory region is a bpf_zbuf_header, which
 * contains shared control data to allow user space and the kernel to
 * synchronize; this is included in zb_size, but not bpf_bufsize, so that BPF
 * knows that the space is not available.
 */
struct zbuf {
	vm_offset_t	 zb_uaddr;	/* User address at time of setup. */
	size_t		 zb_size;	/* Size of buffer, incl. header. */
	u_int		 zb_numpages;	/* Number of pages. */
	int		 zb_flags;	/* Flags on zbuf. */
	struct sf_buf	**zb_pages;	/* Pages themselves. */
	struct bpf_zbuf_header	*zb_header;	/* Shared header. */
};
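
/*
 * Illustrative sketch of the user-side handshake on the shared header (see
 * bpf(4) for the authoritative protocol).  The kernel bumps bzh_kernel_gen
 * with release semantics once a buffer has been assigned to userspace
 * (bpf_zerocopy_buffull()/bpf_zerocopy_bufheld() below); the user process
 * consumes bzh_kernel_len bytes following the header and then publishes
 * bzh_user_gen = bzh_kernel_gen, which is what bpf_zerocopy_canfreebuf()
 * checks before reclaiming the buffer.  The atomic(9)-style names are
 * assumed to be usable from the user process, and process() is a placeholder
 * for the application's packet loop.
 *
 *	struct bpf_zbuf_header *bzh = buffer;	// start of a donated buffer
 *
 *	if (atomic_load_acq_int(&bzh->bzh_kernel_gen) != bzh->bzh_user_gen) {
 *		// Packet data: bzh_kernel_len bytes after the header.
 *		process((char *)buffer + sizeof(*bzh), bzh->bzh_kernel_len);
 *		// Acknowledge so the kernel may rotate this buffer to free.
 *		atomic_store_rel_int(&bzh->bzh_user_gen,
 *		    bzh->bzh_kernel_gen);
 *	}
 */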

/*
 * When a buffer has been assigned to userspace, flag it as such, as the
 * buffer may remain in the store position as a result of the user process
 * not yet having acknowledged the buffer in the hold position.
 */
#define	ZBUF_FLAG_ASSIGNED	0x00000001	/* Set when owned by user. */

/*
 * Release a page we've previously wired.
 */
static void
zbuf_page_free(vm_page_t pp)
{

	vm_page_unwire(pp, PQ_INACTIVE);
}

/*
 * Free an sf_buf with attached page.
 */
static void
zbuf_sfbuf_free(struct sf_buf *sf)
{
	vm_page_t pp;

	pp = sf_buf_page(sf);
	sf_buf_free(sf);
	zbuf_page_free(pp);
}

/*
 * Free a zbuf, including its page array, sf_bufs, and pages.  Allow
 * partially allocated zbufs to be freed so that this may be used even during
 * zbuf setup.
 */
static void
zbuf_free(struct zbuf *zb)
{
	int i;

	for (i = 0; i < zb->zb_numpages; i++) {
		if (zb->zb_pages[i] != NULL)
			zbuf_sfbuf_free(zb->zb_pages[i]);
	}
	free(zb->zb_pages, M_BPF);
	free(zb, M_BPF);
}

/*
 * Given a user pointer to a page of user memory, return an sf_buf for the
 * page.  Because we may be requesting quite a few sf_bufs, prefer failure to
 * deadlock and use SFB_NOWAIT.
 */
static struct sf_buf *
zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr)
{
	struct sf_buf *sf;
	vm_page_t pp;

	if (vm_fault_quick_hold_pages(map, uaddr, PAGE_SIZE, VM_PROT_READ |
	    VM_PROT_WRITE, &pp, 1) < 0)
		return (NULL);
	sf = sf_buf_alloc(pp, SFB_NOWAIT);
	if (sf == NULL) {
		zbuf_page_free(pp);
		return (NULL);
	}
	return (sf);
}

/*
 * Create a zbuf describing a range of user address space memory.  Validate
 * page alignment, size requirements, etc.
 */
static int
zbuf_setup(struct thread *td, vm_offset_t uaddr, size_t len,
    struct zbuf **zbp)
{
	struct zbuf *zb;
	struct vm_map *map;
	int error, i;

	*zbp = NULL;

	/*
	 * User address must be page-aligned.
	 */
	if (uaddr & PAGE_MASK)
		return (EINVAL);

	/*
	 * Length must be an integer number of full pages.
	 */
	if (len & PAGE_MASK)
		return (EINVAL);

	/*
	 * Length must not exceed per-buffer resource limit.
	 */
	if ((len / PAGE_SIZE) > BPF_MAX_PAGES)
		return (EINVAL);

	/*
	 * Allocate the buffer and set up each page with its own sf_buf.
	 */
	error = 0;
	zb = malloc(sizeof(*zb), M_BPF, M_ZERO | M_WAITOK);
	zb->zb_uaddr = uaddr;
	zb->zb_size = len;
	zb->zb_numpages = len / PAGE_SIZE;
	zb->zb_pages = malloc(sizeof(struct sf_buf *) *
	    zb->zb_numpages, M_BPF, M_ZERO | M_WAITOK);
	map = &td->td_proc->p_vmspace->vm_map;
	for (i = 0; i < zb->zb_numpages; i++) {
		zb->zb_pages[i] = zbuf_sfbuf_get(map,
		    uaddr + (i * PAGE_SIZE));
		if (zb->zb_pages[i] == NULL) {
			error = EFAULT;
			goto error;
		}
	}
	zb->zb_header =
	    (struct bpf_zbuf_header *)sf_buf_kva(zb->zb_pages[0]);
	bzero(zb->zb_header, sizeof(*zb->zb_header));
	*zbp = zb;
	return (0);

error:
	zbuf_free(zb);
	return (error);
}

/*
 * Copy bytes from a source into the specified zbuf.  The caller is
 * responsible for performing bounds checking, etc.
 */
void
bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
    void *src, u_int len)
{
	u_int count, page, poffset;
	u_char *src_bytes;
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_append_bytes: not in zbuf mode"));
	KASSERT(buf != NULL, ("bpf_zerocopy_append_bytes: NULL buf"));

	src_bytes = (u_char *)src;
	zb = (struct zbuf *)buf;

	KASSERT((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0,
	    ("bpf_zerocopy_append_bytes: ZBUF_FLAG_ASSIGNED"));

	/*
	 * Scatter-gather copy to user pages mapped into kernel address space
	 * using sf_bufs: copy up to a page at a time.
	 */
	offset += sizeof(struct bpf_zbuf_header);
	page = offset / PAGE_SIZE;
	poffset = offset % PAGE_SIZE;
	while (len > 0) {
		KASSERT(page < zb->zb_numpages, ("bpf_zerocopy_append_bytes:"
		    " page overflow (%d p %d np)\n", page, zb->zb_numpages));

		count = min(len, PAGE_SIZE - poffset);
		bcopy(src_bytes, ((u_char *)sf_buf_kva(zb->zb_pages[page])) +
		    poffset, count);
		poffset += count;
		if (poffset == PAGE_SIZE) {
			poffset = 0;
			page++;
		}
		KASSERT(poffset < PAGE_SIZE,
		    ("bpf_zerocopy_append_bytes: page offset overflow (%d)",
		    poffset));
		len -= count;
		src_bytes += count;
	}
}

/*
 * Copy bytes from an mbuf chain to the specified zbuf: copying will be
 * scatter-gather both from mbufs, which may be fragmented over memory, and
 * to pages, which may not be contiguously mapped in kernel address space.
 * As with bpf_zerocopy_append_bytes(), the caller is responsible for
 * checking that this will not exceed the buffer limit.
 */
void
bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
    void *src, u_int len)
{
	u_int count, moffset, page, poffset;
	const struct mbuf *m;
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_append_mbuf not in zbuf mode"));
	KASSERT(buf != NULL, ("bpf_zerocopy_append_mbuf: NULL buf"));

	m = (struct mbuf *)src;
	zb = (struct zbuf *)buf;

	KASSERT((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0,
	    ("bpf_zerocopy_append_mbuf: ZBUF_FLAG_ASSIGNED"));

	/*
	 * Scatter gather both from an mbuf chain and to a user page set
	 * mapped into kernel address space using sf_bufs.  If we're lucky,
	 * each mbuf requires one copy operation, but if page alignment and
	 * mbuf alignment work out less well, we'll be doing two copies per
	 * mbuf.
	 */
	offset += sizeof(struct bpf_zbuf_header);
	page = offset / PAGE_SIZE;
	poffset = offset % PAGE_SIZE;
	moffset = 0;
	while (len > 0) {
		KASSERT(page < zb->zb_numpages,
		    ("bpf_zerocopy_append_mbuf: page overflow (%d p %d "
		    "np)\n", page, zb->zb_numpages));
		KASSERT(m != NULL,
		    ("bpf_zerocopy_append_mbuf: end of mbuf chain"));

		count = min(m->m_len - moffset, len);
		count = min(count, PAGE_SIZE - poffset);
		bcopy(mtod(m, u_char *) + moffset,
		    ((u_char *)sf_buf_kva(zb->zb_pages[page])) + poffset,
		    count);
		poffset += count;
		if (poffset == PAGE_SIZE) {
			poffset = 0;
			page++;
		}
		KASSERT(poffset < PAGE_SIZE,
		    ("bpf_zerocopy_append_mbuf: page offset overflow (%d)",
		    poffset));
		moffset += count;
		if (moffset == m->m_len) {
			m = m->m_next;
			moffset = 0;
		}
		len -= count;
	}
}

/*
 * Notification from the BPF framework that a buffer in the store position is
 * rejecting packets and may be considered full.  We mark the buffer as
 * immutable and assign it to userspace so that it is immediately available
 * for the user process to access.
 */
void
bpf_zerocopy_buffull(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_buffull: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_sbuf;
	KASSERT(zb != NULL, ("bpf_zerocopy_buffull: zb == NULL"));

	if ((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0) {
		zb->zb_flags |= ZBUF_FLAG_ASSIGNED;
		zb->zb_header->bzh_kernel_len = d->bd_slen;
		atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
	}
}

/*
 * Notification from the BPF framework that a buffer has moved into the held
 * slot on a descriptor.  Zero-copy BPF will update the shared page to let
 * the user process know and flag the buffer as assigned if it hasn't already
 * been marked assigned due to filling while it was in the store position.
 *
 * Note: identical logic as in bpf_zerocopy_buffull(), except that we operate
 * on bd_hbuf and bd_hlen.
 */
void
bpf_zerocopy_bufheld(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_bufheld: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_hbuf;
	KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL"));

	if ((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0) {
		zb->zb_flags |= ZBUF_FLAG_ASSIGNED;
		zb->zb_header->bzh_kernel_len = d->bd_hlen;
		atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
	}
}

/*
 * Notification from the BPF framework that the free buffer has been rotated
 * out of the held position to the free position.  This happens when the
 * user acknowledges the held buffer.
 */
void
bpf_zerocopy_buf_reclaimed(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_reclaim_buf: not in zbuf mode"));

	KASSERT(d->bd_fbuf != NULL,
	    ("bpf_zerocopy_buf_reclaimed: NULL free buf"));
	zb = (struct zbuf *)d->bd_fbuf;
	zb->zb_flags &= ~ZBUF_FLAG_ASSIGNED;
}

/*
 * Query from the BPF framework regarding whether the buffer currently in the
 * held position can be moved to the free position, which is indicated by the
 * user process setting its generation number equal to the kernel generation
 * number.
 */
int
bpf_zerocopy_canfreebuf(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_canfreebuf: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_hbuf;
	if (zb == NULL)
		return (0);
	if (zb->zb_header->bzh_kernel_gen ==
	    atomic_load_acq_int(&zb->zb_header->bzh_user_gen))
		return (1);
	return (0);
}

/*
 * Query from the BPF framework as to whether or not the buffer currently in
 * the store position can actually be written to.  This may return false if
 * the store buffer is assigned to userspace before the hold buffer is
 * acknowledged.
 */
int
bpf_zerocopy_canwritebuf(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_canwritebuf: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_sbuf;
	KASSERT(zb != NULL, ("bpf_zerocopy_canwritebuf: bd_sbuf NULL"));

	if (zb->zb_flags & ZBUF_FLAG_ASSIGNED)
		return (0);
	return (1);
}

/*
 * Free zero copy buffers at request of descriptor.
 */
void
bpf_zerocopy_free(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_free: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_sbuf;
	if (zb != NULL)
		zbuf_free(zb);
	zb = (struct zbuf *)d->bd_hbuf;
	if (zb != NULL)
		zbuf_free(zb);
	zb = (struct zbuf *)d->bd_fbuf;
	if (zb != NULL)
		zbuf_free(zb);
}

/*
 * Ioctl to return the maximum buffer size.
 */
int
bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
{

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_ioctl_getzmax: not in zbuf mode"));

	*i = BPF_MAX_PAGES * PAGE_SIZE;
	return (0);
}

/*
 * Ioctl to force rotation of the two buffers, if there's any data available.
 * This can be used by user space to implement timeouts when waiting for a
 * buffer to fill.
 */
int
bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
    struct bpf_zbuf *bz)
{
	struct zbuf *bzh;

	bzero(bz, sizeof(*bz));
	BPFD_LOCK(d);
	if (d->bd_hbuf == NULL && d->bd_slen != 0) {
		ROTATE_BUFFERS(d);
		bzh = (struct zbuf *)d->bd_hbuf;
		bz->bz_bufa = (void *)bzh->zb_uaddr;
		bz->bz_buflen = d->bd_hlen;
	}
	BPFD_UNLOCK(d);
	return (0);
}

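/*
 * Illustrative userspace sketch (not part of this file): a process that has
 * waited long enough for a buffer to fill can force rotation with
 * BIOCROTZBUF, the ioctl serviced by the function above and described in
 * bpf(4).  On return, bz_bufa/bz_buflen identify the newly held buffer if a
 * rotation occurred, and are zero otherwise.
 *
 *	struct bpf_zbuf bz;
 *
 *	if (ioctl(fd, BIOCROTZBUF, &bz) == 0 && bz.bz_buflen != 0) {
 *		// bz.bz_bufa is the user address of the buffer now in the
 *		// hold position; consume it via the shared-header handshake.
 *	}
 */
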
/*
 * Ioctl to configure zero-copy buffers -- may be done only once.
 */
int
bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
    struct bpf_zbuf *bz)
{
	struct zbuf *zba, *zbb;
	int error;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_ioctl_setzbuf: not in zbuf mode"));

	/*
	 * Must set both buffers.  Cannot clear them.
	 */
	if (bz->bz_bufa == NULL || bz->bz_bufb == NULL)
		return (EINVAL);

	/*
	 * Buffers must have a size greater than 0.  Alignment and other size
	 * validity checking is done in zbuf_setup().
	 */
	if (bz->bz_buflen == 0)
		return (EINVAL);

	/*
	 * Allocate new buffers.
	 */
	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufa, bz->bz_buflen,
	    &zba);
	if (error)
		return (error);
	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufb, bz->bz_buflen,
	    &zbb);
	if (error) {
		zbuf_free(zba);
		return (error);
	}

	/*
	 * We only allow buffers to be installed once, so atomically check
	 * that no buffers are currently installed and install new buffers.
	 */
	BPFD_LOCK(d);
	if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL ||
	    d->bd_bif != NULL) {
		BPFD_UNLOCK(d);
		zbuf_free(zba);
		zbuf_free(zbb);
		return (EINVAL);
	}

	/*
	 * Point BPF descriptor at buffers; initialize sbuf as zba so that
	 * it is always filled first in the sequence, per bpf(4).
	 */
	d->bd_fbuf = (caddr_t)zbb;
	d->bd_sbuf = (caddr_t)zba;
	d->bd_slen = 0;
	d->bd_hlen = 0;

	/*
	 * We expose only the space left in the buffer after the size of the
	 * shared management region.
	 */
	d->bd_bufsize = bz->bz_buflen - sizeof(struct bpf_zbuf_header);
	BPFD_UNLOCK(d);
	return (0);
}