/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2007 Seccuris Inc.
 * All rights reserved.
 *
 * This software was developed by Robert N. M. Watson under contract to
 * Seccuris Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_bpf.h"

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sf_buf.h>
#include <sys/socket.h>
#include <sys/uio.h>

#include <machine/atomic.h>

#include <net/if.h>
#include <net/bpf.h>
#include <net/bpf_zerocopy.h>
#include <net/bpfdesc.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

/*
 * Zero-copy buffer scheme for BPF: user space "donates" two buffers, which
 * are mapped into the kernel address space using sf_bufs and used directly
 * by BPF.  Memory is wired since page faults cannot be tolerated in the
 * contexts where the buffers are copied to (locks held, interrupt context,
 * etc.).  Access to shared memory buffers is synchronized using a header on
 * each buffer, allowing the number of system calls to go to zero as BPF
 * reaches saturation (buffers filled as fast as they can be drained by the
 * user process).  Full details of the protocol for communicating between
 * the user process and BPF may be found in bpf(4).
 */

/*
 * Maximum number of pages per buffer.  Since all BPF devices use two, the
 * maximum per device is 2*BPF_MAX_PAGES.  Resource limits on the number of
 * sf_bufs may be an issue, so do not set this too high.  On older systems,
 * kernel address space limits may also be an issue.
 */
#define	BPF_MAX_PAGES	512
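
/*
 * For reference, the header shared with user space is struct
 * bpf_zbuf_header from net/bpf.h; its layout is approximately as follows
 * (see net/bpf.h for the authoritative definition, including padding):
 *
 *	struct bpf_zbuf_header {
 *		volatile u_int	bzh_kernel_gen;	-- kernel generation number
 *		volatile u_int	bzh_kernel_len;	-- length of data in buffer
 *		volatile u_int	bzh_user_gen;	-- user generation number
 *	};
 *
 * The kernel hands a buffer to user space by storing the data length in
 * bzh_kernel_len and then incrementing bzh_kernel_gen with release
 * semantics; user space acknowledges the buffer by copying the observed
 * bzh_kernel_gen into bzh_user_gen.
 */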

/*
 * struct zbuf describes a memory buffer loaned by a user process to the
 * kernel.  We represent this as a series of pages managed using an array of
 * sf_bufs.  Even though the memory is contiguous in user space, it may not
 * be mapped contiguously in the kernel (i.e., a set of physically
 * non-contiguous pages in the direct map region), so we must implement
 * scatter-gather copying.  One significant mitigating factor is that on
 * systems with a direct memory map, we can avoid TLB misses.
 *
 * At the front of the shared memory region is a bpf_zbuf_header, which
 * contains shared control data to allow user space and the kernel to
 * synchronize; this is included in zb_size, but not bd_bufsize, so that BPF
 * knows that the space is not available.
 */
struct zbuf {
	vm_offset_t	 zb_uaddr;	/* User address at time of setup. */
	size_t		 zb_size;	/* Size of buffer, incl. header. */
	u_int		 zb_numpages;	/* Number of pages. */
	int		 zb_flags;	/* Flags on zbuf. */
	struct sf_buf	**zb_pages;	/* Pages themselves. */
	struct bpf_zbuf_header	*zb_header;	/* Shared header. */
};

/*
 * When a buffer has been assigned to userspace, flag it as such, as the
 * buffer may remain in the store position as a result of the user process
 * not yet having acknowledged the buffer in the hold position.
 */
#define	ZBUF_FLAG_ASSIGNED	0x00000001	/* Set when owned by user. */

/*
 * Release a page we've previously wired.
 */
static void
zbuf_page_free(vm_page_t pp)
{

	vm_page_unwire(pp, PQ_INACTIVE);
}

/*
 * Free an sf_buf with attached page.
 */
static void
zbuf_sfbuf_free(struct sf_buf *sf)
{
	vm_page_t pp;

	pp = sf_buf_page(sf);
	sf_buf_free(sf);
	zbuf_page_free(pp);
}

/*
 * Free a zbuf, including its page array, sf_bufs, and pages.  Allow
 * partially allocated zbufs to be freed so that this may be used to recover
 * from errors during zbuf setup.
 */
static void
zbuf_free(struct zbuf *zb)
{
	int i;

	for (i = 0; i < zb->zb_numpages; i++) {
		if (zb->zb_pages[i] != NULL)
			zbuf_sfbuf_free(zb->zb_pages[i]);
	}
	free(zb->zb_pages, M_BPF);
	free(zb, M_BPF);
}

/*
 * Given a user pointer to a page of user memory, return an sf_buf for the
 * page.  Because we may be requesting quite a few sf_bufs, prefer failure
 * to deadlock and use SFB_NOWAIT.
 */
static struct sf_buf *
zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr)
{
	struct sf_buf *sf;
	vm_page_t pp;

	if (vm_fault_quick_hold_pages(map, uaddr, PAGE_SIZE, VM_PROT_READ |
	    VM_PROT_WRITE, &pp, 1) < 0)
		return (NULL);
	sf = sf_buf_alloc(pp, SFB_NOWAIT);
	if (sf == NULL) {
		zbuf_page_free(pp);
		return (NULL);
	}
	return (sf);
}

/*
 * Create a zbuf describing a range of user address space memory.  Validate
 * page alignment, size requirements, etc.
 */
static int
zbuf_setup(struct thread *td, vm_offset_t uaddr, size_t len,
    struct zbuf **zbp)
{
	struct zbuf *zb;
	struct vm_map *map;
	int error, i;

	*zbp = NULL;

	/*
	 * User address must be page-aligned.
	 */
	if (uaddr & PAGE_MASK)
		return (EINVAL);

	/*
	 * Length must be an integer number of full pages.
	 */
	if (len & PAGE_MASK)
		return (EINVAL);

	/*
	 * Length must not exceed per-buffer resource limit.
	 */
	if ((len / PAGE_SIZE) > BPF_MAX_PAGES)
		return (EINVAL);

	/*
	 * Allocate the buffer and set up each page with its own sf_buf.
	 */
	error = 0;
	zb = malloc(sizeof(*zb), M_BPF, M_ZERO | M_WAITOK);
	zb->zb_uaddr = uaddr;
	zb->zb_size = len;
	zb->zb_numpages = len / PAGE_SIZE;
	zb->zb_pages = malloc(sizeof(struct sf_buf *) *
	    zb->zb_numpages, M_BPF, M_ZERO | M_WAITOK);
	map = &td->td_proc->p_vmspace->vm_map;
	for (i = 0; i < zb->zb_numpages; i++) {
		zb->zb_pages[i] = zbuf_sfbuf_get(map,
		    uaddr + (i * PAGE_SIZE));
		if (zb->zb_pages[i] == NULL) {
			error = EFAULT;
			goto error;
		}
	}
	zb->zb_header =
	    (struct bpf_zbuf_header *)sf_buf_kva(zb->zb_pages[0]);
	bzero(zb->zb_header, sizeof(*zb->zb_header));
	*zbp = zb;
	return (0);

error:
	zbuf_free(zb);
	return (error);
}
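
/*
 * An illustrative (hypothetical) userspace allocation that satisfies the
 * checks above -- page-aligned, an integral number of pages, and within
 * the BPF_MAX_PAGES limit -- since mmap(2) returns page-aligned memory:
 *
 *	size_t len = 4 * (size_t)getpagesize();
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 */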

/*
 * Copy bytes from a source into the specified zbuf.  The caller is
 * responsible for performing bounds checking, etc.
 */
void
bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
    void *src, u_int len)
{
	u_int count, page, poffset;
	u_char *src_bytes;
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_append_bytes: not in zbuf mode"));
	KASSERT(buf != NULL, ("bpf_zerocopy_append_bytes: NULL buf"));

	src_bytes = (u_char *)src;
	zb = (struct zbuf *)buf;

	KASSERT((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0,
	    ("bpf_zerocopy_append_bytes: ZBUF_FLAG_ASSIGNED"));

	/*
	 * Scatter-gather copy to user pages mapped into kernel address space
	 * using sf_bufs: copy up to a page at a time.
	 */
	offset += sizeof(struct bpf_zbuf_header);
	page = offset / PAGE_SIZE;
	poffset = offset % PAGE_SIZE;
	while (len > 0) {
		KASSERT(page < zb->zb_numpages, ("bpf_zerocopy_append_bytes:"
		    " page overflow (%d p %d np)\n", page, zb->zb_numpages));

		count = min(len, PAGE_SIZE - poffset);
		bcopy(src_bytes, ((u_char *)sf_buf_kva(zb->zb_pages[page])) +
		    poffset, count);
		poffset += count;
		if (poffset == PAGE_SIZE) {
			poffset = 0;
			page++;
		}
		KASSERT(poffset < PAGE_SIZE,
		    ("bpf_zerocopy_append_bytes: page offset overflow (%d)",
		    poffset));
		len -= count;
		src_bytes += count;
	}
}

/*
 * Copy bytes from an mbuf chain to the specified zbuf: copying will be
 * scatter-gather both from mbufs, which may be fragmented over memory, and
 * to pages, which may not be contiguously mapped in kernel address space.
 * As with bpf_zerocopy_append_bytes(), the caller is responsible for
 * checking that this will not exceed the buffer limit.
 */
void
bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
    void *src, u_int len)
{
	u_int count, moffset, page, poffset;
	const struct mbuf *m;
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_append_mbuf: not in zbuf mode"));
	KASSERT(buf != NULL, ("bpf_zerocopy_append_mbuf: NULL buf"));

	m = (struct mbuf *)src;
	zb = (struct zbuf *)buf;

	KASSERT((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0,
	    ("bpf_zerocopy_append_mbuf: ZBUF_FLAG_ASSIGNED"));

	/*
	 * Scatter-gather both from an mbuf chain and to a user page set
	 * mapped into kernel address space using sf_bufs.  If we're lucky,
	 * each mbuf requires one copy operation, but if page alignment and
	 * mbuf alignment work out less well, we'll be doing two copies per
	 * mbuf.
	 */
	offset += sizeof(struct bpf_zbuf_header);
	page = offset / PAGE_SIZE;
	poffset = offset % PAGE_SIZE;
	moffset = 0;
	while (len > 0) {
		KASSERT(page < zb->zb_numpages,
		    ("bpf_zerocopy_append_mbuf: page overflow (%d p %d "
		    "np)\n", page, zb->zb_numpages));
		KASSERT(m != NULL,
		    ("bpf_zerocopy_append_mbuf: end of mbuf chain"));

		count = min(m->m_len - moffset, len);
		count = min(count, PAGE_SIZE - poffset);
		bcopy(mtod(m, u_char *) + moffset,
		    ((u_char *)sf_buf_kva(zb->zb_pages[page])) + poffset,
		    count);
		poffset += count;
		if (poffset == PAGE_SIZE) {
			poffset = 0;
			page++;
		}
		KASSERT(poffset < PAGE_SIZE,
		    ("bpf_zerocopy_append_mbuf: page offset overflow (%d)",
		    poffset));
		moffset += count;
		if (moffset == m->m_len) {
			m = m->m_next;
			moffset = 0;
		}
		len -= count;
	}
}

/*
 * Notification from the BPF framework that a buffer in the store position
 * is rejecting packets and may be considered full.  We mark the buffer as
 * immutable and assign it to userspace so that it is immediately available
 * for the user process to access.
 */
void
bpf_zerocopy_buffull(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_buffull: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_sbuf;
	KASSERT(zb != NULL, ("bpf_zerocopy_buffull: zb == NULL"));

	if ((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0) {
		zb->zb_flags |= ZBUF_FLAG_ASSIGNED;
		zb->zb_header->bzh_kernel_len = d->bd_slen;
		atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
	}
}

/*
 * Notification from the BPF framework that a buffer has moved into the held
 * slot on a descriptor.  Zero-copy BPF will update the shared page to let
 * the user process know, and flag the buffer as assigned if it hasn't
 * already been marked assigned due to filling while it was in the store
 * position.
 *
 * Note: identical logic as in bpf_zerocopy_buffull(), except that we
 * operate on bd_hbuf and bd_hlen.
 */
void
bpf_zerocopy_bufheld(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_bufheld: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_hbuf;
	KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL"));

	if ((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0) {
		zb->zb_flags |= ZBUF_FLAG_ASSIGNED;
		zb->zb_header->bzh_kernel_len = d->bd_hlen;
		atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
	}
}

/*
 * Notification from the BPF framework that a buffer has been rotated out of
 * the held position into the free position.  This happens when the user
 * process acknowledges the held buffer.
 */
void
bpf_zerocopy_buf_reclaimed(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_buf_reclaimed: not in zbuf mode"));

	KASSERT(d->bd_fbuf != NULL,
	    ("bpf_zerocopy_buf_reclaimed: NULL free buf"));
	zb = (struct zbuf *)d->bd_fbuf;
	zb->zb_flags &= ~ZBUF_FLAG_ASSIGNED;
}

/*
 * Query from the BPF framework regarding whether the buffer currently in
 * the held position can be moved to the free position, which is indicated
 * by the user process setting its generation number equal to the kernel
 * generation number.
 */
int
bpf_zerocopy_canfreebuf(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_canfreebuf: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_hbuf;
	if (zb == NULL)
		return (0);
	if (zb->zb_header->bzh_kernel_gen ==
	    atomic_load_acq_int(&zb->zb_header->bzh_user_gen))
		return (1);
	return (0);
}
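
/*
 * A sketch of the matching userspace acknowledgement (illustrative only;
 * bpf(4) documents the authoritative protocol).  buf points at an assigned
 * buffer; the acquire load pairs with the release increment performed in
 * bpf_zerocopy_buffull()/bpf_zerocopy_bufheld() above:
 *
 *	struct bpf_zbuf_header *bzh = (struct bpf_zbuf_header *)buf;
 *	u_int gen = atomic_load_acq_int(&bzh->bzh_kernel_gen);
 *	if (gen != bzh->bzh_user_gen) {
 *		... process bzh->bzh_kernel_len bytes after the header ...
 *		atomic_store_rel_int(&bzh->bzh_user_gen, gen);
 *	}
 */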

/*
 * Query from the BPF framework as to whether or not the buffer currently in
 * the store position can actually be written to.  This may return false if
 * the store buffer is assigned to userspace before the hold buffer is
 * acknowledged.
 */
int
bpf_zerocopy_canwritebuf(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_canwritebuf: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_sbuf;
	KASSERT(zb != NULL, ("bpf_zerocopy_canwritebuf: bd_sbuf NULL"));

	if (zb->zb_flags & ZBUF_FLAG_ASSIGNED)
		return (0);
	return (1);
}

/*
 * Free zero-copy buffers at request of descriptor.
 */
void
bpf_zerocopy_free(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_free: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_sbuf;
	if (zb != NULL)
		zbuf_free(zb);
	zb = (struct zbuf *)d->bd_hbuf;
	if (zb != NULL)
		zbuf_free(zb);
	zb = (struct zbuf *)d->bd_fbuf;
	if (zb != NULL)
		zbuf_free(zb);
}

/*
 * Ioctl to return the maximum buffer size.
 */
int
bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
{

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_ioctl_getzmax: not in zbuf mode"));

	*i = BPF_MAX_PAGES * PAGE_SIZE;
	return (0);
}

/*
 * Ioctl to force rotation of the two buffers, if there's any data
 * available.  This can be used by user space to implement timeouts when
 * waiting for a buffer to fill.
 */
int
bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
    struct bpf_zbuf *bz)
{
	struct zbuf *bzh;

	bzero(bz, sizeof(*bz));
	BPFD_LOCK(d);
	if (d->bd_hbuf == NULL && d->bd_slen != 0) {
		ROTATE_BUFFERS(d);
		bzh = (struct zbuf *)d->bd_hbuf;
		bz->bz_bufa = (void *)bzh->zb_uaddr;
		bz->bz_buflen = d->bd_hlen;
	}
	BPFD_UNLOCK(d);
	return (0);
}
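
/*
 * Illustrative userspace use of BIOCROTZBUF to implement a timeout (a
 * hypothetical snippet; fd is a BPF descriptor already configured for
 * zero-copy and attached to an interface, pfd a pollfd for it):
 *
 *	struct bpf_zbuf bz;
 *
 *	if (poll(&pfd, 1, timeout) == 0 &&
 *	    ioctl(fd, BIOCROTZBUF, &bz) == 0 && bz.bz_bufa != NULL) {
 *		... consume bz.bz_buflen bytes at bz.bz_bufa, then
 *		    acknowledge via the generation numbers as above ...
 *	}
 */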

/*
 * Ioctl to configure zero-copy buffers -- may be done only once.
 */
int
bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
    struct bpf_zbuf *bz)
{
	struct zbuf *zba, *zbb;
	int error;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_ioctl_setzbuf: not in zbuf mode"));

	/*
	 * Must set both buffers.  Cannot clear them.
	 */
	if (bz->bz_bufa == NULL || bz->bz_bufb == NULL)
		return (EINVAL);

	/*
	 * Buffers must have a size greater than 0.  Alignment and other size
	 * validity checking is done in zbuf_setup().
	 */
	if (bz->bz_buflen == 0)
		return (EINVAL);

	/*
	 * Allocate new buffers.
	 */
	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufa, bz->bz_buflen,
	    &zba);
	if (error)
		return (error);
	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufb, bz->bz_buflen,
	    &zbb);
	if (error) {
		zbuf_free(zba);
		return (error);
	}

	/*
	 * We only allow buffers to be installed once, so atomically check
	 * that no buffers are currently installed and install new buffers.
	 */
	BPFD_LOCK(d);
	if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL ||
	    d->bd_bif != NULL) {
		BPFD_UNLOCK(d);
		zbuf_free(zba);
		zbuf_free(zbb);
		return (EINVAL);
	}

	/*
	 * Point BPF descriptor at buffers; initialize sbuf as zba so that
	 * it is always filled first in the sequence, per bpf(4).
	 */
	d->bd_fbuf = (caddr_t)zbb;
	d->bd_sbuf = (caddr_t)zba;
	d->bd_slen = 0;
	d->bd_hlen = 0;

	/*
	 * We expose only the space left in the buffer after the size of the
	 * shared management region.
	 */
	d->bd_bufsize = bz->bz_buflen - sizeof(struct bpf_zbuf_header);
	BPFD_UNLOCK(d);
	return (0);
}
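
/*
 * End-to-end userspace configuration sketch (illustrative only; error
 * handling omitted and the buffer size chosen arbitrarily -- see bpf(4) for
 * the authoritative sequence).  Note that buffers must be installed before
 * the descriptor is attached to an interface, per the bd_bif check above:
 *
 *	u_int bufmode = BPF_BUFMODE_ZBUF;
 *	size_t maxlen, len;
 *	struct bpf_zbuf bz;
 *	int fd;
 *
 *	fd = open("/dev/bpf", O_RDWR);
 *	ioctl(fd, BIOCSETBUFMODE, &bufmode);
 *	ioctl(fd, BIOCGETZMAX, &maxlen);
 *	len = MIN(maxlen, (size_t)8 * getpagesize());
 *	bz.bz_bufa = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	bz.bz_bufb = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	bz.bz_buflen = len;
 *	ioctl(fd, BIOCSETZBUF, &bz);
 */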