Path: blob/main/sys/compat/linuxkpi/common/src/linux_compat.c
102415 views
/*-1* Copyright (c) 2010 Isilon Systems, Inc.2* Copyright (c) 2010 iX Systems, Inc.3* Copyright (c) 2010 Panasas, Inc.4* Copyright (c) 2013-2021 Mellanox Technologies, Ltd.5* All rights reserved.6*7* Redistribution and use in source and binary forms, with or without8* modification, are permitted provided that the following conditions9* are met:10* 1. Redistributions of source code must retain the above copyright11* notice unmodified, this list of conditions, and the following12* disclaimer.13* 2. Redistributions in binary form must reproduce the above copyright14* notice, this list of conditions and the following disclaimer in the15* documentation and/or other materials provided with the distribution.16*17* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR18* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES19* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.20* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,21* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT22* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,23* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY24* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT25* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF26* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.27*/2829#include <sys/cdefs.h>30#include "opt_global.h"31#include "opt_stack.h"3233#include <sys/param.h>34#include <sys/systm.h>35#include <sys/malloc.h>36#include <sys/kernel.h>37#include <sys/sysctl.h>38#include <sys/proc.h>39#include <sys/sglist.h>40#include <sys/sleepqueue.h>41#include <sys/refcount.h>42#include <sys/lock.h>43#include <sys/mutex.h>44#include <sys/bus.h>45#include <sys/eventhandler.h>46#include <sys/fcntl.h>47#include <sys/file.h>48#include <sys/filio.h>49#include <sys/rwlock.h>50#include <sys/mman.h>51#include 
<sys/stack.h>52#include <sys/stdarg.h>53#include <sys/sysent.h>54#include <sys/time.h>55#include <sys/user.h>5657#include <vm/vm.h>58#include <vm/pmap.h>59#include <vm/vm_object.h>60#include <vm/vm_page.h>61#include <vm/vm_pager.h>62#include <vm/vm_radix.h>6364#if defined(__i386__) || defined(__amd64__)65#include <machine/cputypes.h>66#include <machine/md_var.h>67#endif6869#include <linux/kobject.h>70#include <linux/cpu.h>71#include <linux/device.h>72#include <linux/slab.h>73#include <linux/module.h>74#include <linux/moduleparam.h>75#include <linux/cdev.h>76#include <linux/file.h>77#include <linux/fs.h>78#include <linux/sysfs.h>79#include <linux/mm.h>80#include <linux/io.h>81#include <linux/vmalloc.h>82#include <linux/netdevice.h>83#include <linux/timer.h>84#include <linux/interrupt.h>85#include <linux/uaccess.h>86#include <linux/utsname.h>87#include <linux/list.h>88#include <linux/kthread.h>89#include <linux/kernel.h>90#include <linux/compat.h>91#include <linux/io-mapping.h>92#include <linux/poll.h>93#include <linux/smp.h>94#include <linux/wait_bit.h>95#include <linux/rcupdate.h>96#include <linux/interval_tree.h>97#include <linux/interval_tree_generic.h>98#include <linux/printk.h>99#include <linux/seq_file.h>100101#if defined(__i386__) || defined(__amd64__)102#include <asm/smp.h>103#include <asm/processor.h>104#endif105106#include <xen/xen.h>107#ifdef XENHVM108#undef xen_pv_domain109#undef xen_initial_domain110/* xen/xen-os.h redefines __must_check */111#undef __must_check112#include <xen/xen-os.h>113#endif114115SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,116"LinuxKPI parameters");117118int linuxkpi_debug;119SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN,120&linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable.");121122int linuxkpi_rcu_debug;123SYSCTL_INT(_compat_linuxkpi, OID_AUTO, rcu_debug, CTLFLAG_RWTUN,124&linuxkpi_rcu_debug, 0, "Set to enable RCU warning. 
Clear to disable.");125126int linuxkpi_warn_dump_stack = 0;127SYSCTL_INT(_compat_linuxkpi, OID_AUTO, warn_dump_stack, CTLFLAG_RWTUN,128&linuxkpi_warn_dump_stack, 0,129"Set to enable stack traces from WARN_ON(). Clear to disable.");130131static struct timeval lkpi_net_lastlog;132static int lkpi_net_curpps;133static int lkpi_net_maxpps = 99;134SYSCTL_INT(_compat_linuxkpi, OID_AUTO, net_ratelimit, CTLFLAG_RWTUN,135&lkpi_net_maxpps, 0, "Limit number of LinuxKPI net messages per second.");136137MALLOC_DEFINE(M_KMALLOC, "lkpikmalloc", "Linux kmalloc compat");138139#include <linux/rbtree.h>140/* Undo Linux compat changes. */141#undef RB_ROOT142#undef file143#undef cdev144#define RB_ROOT(head) (head)->rbh_root145146static void linux_destroy_dev(struct linux_cdev *);147static void linux_cdev_deref(struct linux_cdev *ldev);148static struct vm_area_struct *linux_cdev_handle_find(void *handle);149150cpumask_t cpu_online_mask;151static cpumask_t **static_single_cpu_mask;152static cpumask_t *static_single_cpu_mask_lcs;153struct kobject linux_class_root;154struct device linux_root_device;155struct class linux_class_misc;156struct list_head pci_drivers;157struct list_head pci_devices;158spinlock_t pci_lock;159struct uts_namespace init_uts_ns;160161unsigned long linux_timer_hz_mask;162163wait_queue_head_t linux_bit_waitq;164wait_queue_head_t linux_var_waitq;165166int167panic_cmp(struct rb_node *one, struct rb_node *two)168{169panic("no cmp");170}171172RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);173174#define START(node) ((node)->start)175#define LAST(node) ((node)->last)176177INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, unsigned long,, START,178LAST,, lkpi_interval_tree)179180static void181linux_device_release(struct device *dev)182{183pr_debug("linux_device_release: %s\n", dev_name(dev));184kfree(dev);185}186187static ssize_t188linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf)189{190struct class_attribute *dattr;191ssize_t error;192193dattr 
= container_of(attr, struct class_attribute, attr);194error = -EIO;195if (dattr->show)196error = dattr->show(container_of(kobj, struct class, kobj),197dattr, buf);198return (error);199}200201static ssize_t202linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf,203size_t count)204{205struct class_attribute *dattr;206ssize_t error;207208dattr = container_of(attr, struct class_attribute, attr);209error = -EIO;210if (dattr->store)211error = dattr->store(container_of(kobj, struct class, kobj),212dattr, buf, count);213return (error);214}215216static void217linux_class_release(struct kobject *kobj)218{219struct class *class;220221class = container_of(kobj, struct class, kobj);222if (class->class_release)223class->class_release(class);224}225226static const struct sysfs_ops linux_class_sysfs = {227.show = linux_class_show,228.store = linux_class_store,229};230231const struct kobj_type linux_class_ktype = {232.release = linux_class_release,233.sysfs_ops = &linux_class_sysfs234};235236static void237linux_dev_release(struct kobject *kobj)238{239struct device *dev;240241dev = container_of(kobj, struct device, kobj);242/* This is the precedence defined by linux. 
*/243if (dev->release)244dev->release(dev);245else if (dev->class && dev->class->dev_release)246dev->class->dev_release(dev);247}248249static ssize_t250linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf)251{252struct device_attribute *dattr;253ssize_t error;254255dattr = container_of(attr, struct device_attribute, attr);256error = -EIO;257if (dattr->show)258error = dattr->show(container_of(kobj, struct device, kobj),259dattr, buf);260return (error);261}262263static ssize_t264linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,265size_t count)266{267struct device_attribute *dattr;268ssize_t error;269270dattr = container_of(attr, struct device_attribute, attr);271error = -EIO;272if (dattr->store)273error = dattr->store(container_of(kobj, struct device, kobj),274dattr, buf, count);275return (error);276}277278static const struct sysfs_ops linux_dev_sysfs = {279.show = linux_dev_show,280.store = linux_dev_store,281};282283const struct kobj_type linux_dev_ktype = {284.release = linux_dev_release,285.sysfs_ops = &linux_dev_sysfs286};287288struct device *289device_create(struct class *class, struct device *parent, dev_t devt,290void *drvdata, const char *fmt, ...)291{292struct device *dev;293va_list args;294295dev = kzalloc(sizeof(*dev), M_WAITOK);296dev->parent = parent;297dev->class = class;298dev->devt = devt;299dev->driver_data = drvdata;300dev->release = linux_device_release;301va_start(args, fmt);302kobject_set_name_vargs(&dev->kobj, fmt, args);303va_end(args);304device_register(dev);305306return (dev);307}308309struct device *310device_create_groups_vargs(struct class *class, struct device *parent,311dev_t devt, void *drvdata, const struct attribute_group **groups,312const char *fmt, va_list args)313{314struct device *dev = NULL;315int retval = -ENODEV;316317if (class == NULL || IS_ERR(class))318goto error;319320dev = kzalloc(sizeof(*dev), GFP_KERNEL);321if (!dev) {322retval = -ENOMEM;323goto error;324}325326dev->devt = 
devt;327dev->class = class;328dev->parent = parent;329dev->groups = groups;330dev->release = device_create_release;331/* device_initialize() needs the class and parent to be set */332device_initialize(dev);333dev_set_drvdata(dev, drvdata);334335retval = kobject_set_name_vargs(&dev->kobj, fmt, args);336if (retval)337goto error;338339retval = device_add(dev);340if (retval)341goto error;342343return dev;344345error:346put_device(dev);347return ERR_PTR(retval);348}349350struct class *351lkpi_class_create(const char *name)352{353struct class *class;354int error;355356class = kzalloc(sizeof(*class), M_WAITOK);357class->name = name;358class->class_release = linux_class_kfree;359error = class_register(class);360if (error) {361kfree(class);362return (NULL);363}364365return (class);366}367368static void369linux_kq_lock(void *arg)370{371spinlock_t *s = arg;372373spin_lock(s);374}375static void376linux_kq_unlock(void *arg)377{378spinlock_t *s = arg;379380spin_unlock(s);381}382383static void384linux_kq_assert_lock(void *arg, int what)385{386#ifdef INVARIANTS387spinlock_t *s = arg;388389if (what == LA_LOCKED)390mtx_assert(s, MA_OWNED);391else392mtx_assert(s, MA_NOTOWNED);393#endif394}395396static void397linux_file_kqfilter_poll(struct linux_file *, int);398399struct linux_file *400linux_file_alloc(void)401{402struct linux_file *filp;403404filp = kzalloc(sizeof(*filp), GFP_KERNEL);405406/* set initial refcount */407filp->f_count = 1;408409/* setup fields needed by kqueue support */410spin_lock_init(&filp->f_kqlock);411knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock,412linux_kq_lock, linux_kq_unlock, linux_kq_assert_lock);413414return (filp);415}416417void418linux_file_free(struct linux_file *filp)419{420if (filp->_file == NULL) {421if (filp->f_op != NULL && filp->f_op->release != NULL)422filp->f_op->release(filp->f_vnode, filp);423if (filp->f_shmem != NULL)424vm_object_deallocate(filp->f_shmem);425kfree_rcu(filp, rcu);426} else {427/*428* The close method of the character 
device or file429* will free the linux_file structure:430*/431_fdrop(filp->_file, curthread);432}433}434435struct linux_cdev *436cdev_alloc(void)437{438struct linux_cdev *cdev;439440cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK);441kobject_init(&cdev->kobj, &linux_cdev_ktype);442cdev->refs = 1;443return (cdev);444}445446static int447linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot,448vm_page_t *mres)449{450struct vm_area_struct *vmap;451452vmap = linux_cdev_handle_find(vm_obj->handle);453454MPASS(vmap != NULL);455MPASS(vmap->vm_private_data == vm_obj->handle);456457if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) {458vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset;459vm_page_t page;460461if (((*mres)->flags & PG_FICTITIOUS) != 0) {462/*463* If the passed in result page is a fake464* page, update it with the new physical465* address.466*/467page = *mres;468vm_page_updatefake(page, paddr, vm_obj->memattr);469} else {470/*471* Replace the passed in "mres" page with our472* own fake page and free up the all of the473* original pages.474*/475VM_OBJECT_WUNLOCK(vm_obj);476page = vm_page_getfake(paddr, vm_obj->memattr);477VM_OBJECT_WLOCK(vm_obj);478479vm_page_replace(page, vm_obj, (*mres)->pindex, *mres);480*mres = page;481}482vm_page_valid(page);483return (VM_PAGER_OK);484}485return (VM_PAGER_FAIL);486}487488static int489linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type,490vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)491{492struct vm_area_struct *vmap;493int err;494495/* get VM area structure */496vmap = linux_cdev_handle_find(vm_obj->handle);497MPASS(vmap != NULL);498MPASS(vmap->vm_private_data == vm_obj->handle);499500VM_OBJECT_WUNLOCK(vm_obj);501502linux_set_current(curthread);503504down_write(&vmap->vm_mm->mmap_sem);505if (unlikely(vmap->vm_ops == NULL)) {506err = VM_FAULT_SIGBUS;507} else {508struct vm_fault vmf;509510/* fill out VM fault structure */511vmf.virtual_address = (void 
*)(uintptr_t)IDX_TO_OFF(pidx);512vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0;513vmf.pgoff = 0;514vmf.page = NULL;515vmf.vma = vmap;516517vmap->vm_pfn_count = 0;518vmap->vm_pfn_pcount = &vmap->vm_pfn_count;519vmap->vm_obj = vm_obj;520521err = vmap->vm_ops->fault(&vmf);522523while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) {524kern_yield(PRI_USER);525err = vmap->vm_ops->fault(&vmf);526}527}528529/* translate return code */530switch (err) {531case VM_FAULT_OOM:532err = VM_PAGER_AGAIN;533break;534case VM_FAULT_SIGBUS:535err = VM_PAGER_BAD;536break;537case VM_FAULT_NOPAGE:538/*539* By contract the fault handler will return having540* busied all the pages itself. If pidx is already541* found in the object, it will simply xbusy the first542* page and return with vm_pfn_count set to 1.543*/544*first = vmap->vm_pfn_first;545*last = *first + vmap->vm_pfn_count - 1;546err = VM_PAGER_OK;547break;548default:549err = VM_PAGER_ERROR;550break;551}552up_write(&vmap->vm_mm->mmap_sem);553VM_OBJECT_WLOCK(vm_obj);554return (err);555}556557static struct rwlock linux_vma_lock;558static TAILQ_HEAD(, vm_area_struct) linux_vma_head =559TAILQ_HEAD_INITIALIZER(linux_vma_head);560561static void562linux_cdev_handle_free(struct vm_area_struct *vmap)563{564/* Drop reference on vm_file */565if (vmap->vm_file != NULL)566fput(vmap->vm_file);567568/* Drop reference on mm_struct */569mmput(vmap->vm_mm);570571kfree(vmap);572}573574static void575linux_cdev_handle_remove(struct vm_area_struct *vmap)576{577rw_wlock(&linux_vma_lock);578TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry);579rw_wunlock(&linux_vma_lock);580}581582static struct vm_area_struct *583linux_cdev_handle_find(void *handle)584{585struct vm_area_struct *vmap;586587rw_rlock(&linux_vma_lock);588TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) {589if (vmap->vm_private_data == handle)590break;591}592rw_runlock(&linux_vma_lock);593return (vmap);594}595596static int597linux_cdev_pager_ctor(void *handle, vm_ooffset_t 
size, vm_prot_t prot,598vm_ooffset_t foff, struct ucred *cred, u_short *color)599{600601MPASS(linux_cdev_handle_find(handle) != NULL);602*color = 0;603return (0);604}605606static void607linux_cdev_pager_dtor(void *handle)608{609const struct vm_operations_struct *vm_ops;610struct vm_area_struct *vmap;611612vmap = linux_cdev_handle_find(handle);613MPASS(vmap != NULL);614615/*616* Remove handle before calling close operation to prevent617* other threads from reusing the handle pointer.618*/619linux_cdev_handle_remove(vmap);620621down_write(&vmap->vm_mm->mmap_sem);622vm_ops = vmap->vm_ops;623if (likely(vm_ops != NULL))624vm_ops->close(vmap);625up_write(&vmap->vm_mm->mmap_sem);626627linux_cdev_handle_free(vmap);628}629630static struct cdev_pager_ops linux_cdev_pager_ops[2] = {631{632/* OBJT_MGTDEVICE */633.cdev_pg_populate = linux_cdev_pager_populate,634.cdev_pg_ctor = linux_cdev_pager_ctor,635.cdev_pg_dtor = linux_cdev_pager_dtor636},637{638/* OBJT_DEVICE */639.cdev_pg_fault = linux_cdev_pager_fault,640.cdev_pg_ctor = linux_cdev_pager_ctor,641.cdev_pg_dtor = linux_cdev_pager_dtor642},643};644645int646zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,647unsigned long size)648{649struct pctrie_iter pages;650vm_object_t obj;651vm_page_t m;652653obj = vma->vm_obj;654if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0)655return (-ENOTSUP);656VM_OBJECT_RLOCK(obj);657vm_page_iter_limit_init(&pages, obj, OFF_TO_IDX(address + size));658VM_RADIX_FOREACH_FROM(m, &pages, OFF_TO_IDX(address))659pmap_remove_all(m);660VM_OBJECT_RUNLOCK(obj);661return (0);662}663664void665vma_set_file(struct vm_area_struct *vma, struct linux_file *file)666{667struct linux_file *tmp;668669/* Changing an anonymous vma with this is illegal */670get_file(file);671tmp = vma->vm_file;672vma->vm_file = file;673fput(tmp);674}675676static struct file_operations dummy_ldev_ops = {677/* XXXKIB */678};679680static struct linux_cdev dummy_ldev = {681.ops = &dummy_ldev_ops,682};683684#define 
LDEV_SI_DTR 0x0001685#define LDEV_SI_REF 0x0002686687static void688linux_get_fop(struct linux_file *filp, const struct file_operations **fop,689struct linux_cdev **dev)690{691struct linux_cdev *ldev;692u_int siref;693694ldev = filp->f_cdev;695*fop = filp->f_op;696if (ldev != NULL) {697if (ldev->kobj.ktype == &linux_cdev_static_ktype) {698refcount_acquire(&ldev->refs);699} else {700for (siref = ldev->siref;;) {701if ((siref & LDEV_SI_DTR) != 0) {702ldev = &dummy_ldev;703*fop = ldev->ops;704siref = ldev->siref;705MPASS((ldev->siref & LDEV_SI_DTR) == 0);706} else if (atomic_fcmpset_int(&ldev->siref,707&siref, siref + LDEV_SI_REF)) {708break;709}710}711}712}713*dev = ldev;714}715716static void717linux_drop_fop(struct linux_cdev *ldev)718{719720if (ldev == NULL)721return;722if (ldev->kobj.ktype == &linux_cdev_static_ktype) {723linux_cdev_deref(ldev);724} else {725MPASS(ldev->kobj.ktype == &linux_cdev_ktype);726MPASS((ldev->siref & ~LDEV_SI_DTR) != 0);727atomic_subtract_int(&ldev->siref, LDEV_SI_REF);728}729}730731#define OPW(fp,td,code) ({ \732struct file *__fpop; \733__typeof(code) __retval; \734\735__fpop = (td)->td_fpop; \736(td)->td_fpop = (fp); \737__retval = (code); \738(td)->td_fpop = __fpop; \739__retval; \740})741742static int743linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td,744struct file *file)745{746struct linux_cdev *ldev;747struct linux_file *filp;748const struct file_operations *fop;749int error;750751ldev = dev->si_drv1;752753filp = linux_file_alloc();754filp->f_dentry = &filp->f_dentry_store;755filp->f_op = ldev->ops;756filp->f_mode = file->f_flag;757filp->f_flags = file->f_flag;758filp->f_vnode = file->f_vnode;759filp->_file = file;760refcount_acquire(&ldev->refs);761filp->f_cdev = ldev;762763linux_set_current(td);764linux_get_fop(filp, &fop, &ldev);765766if (fop->open != NULL) {767error = -fop->open(file->f_vnode, filp);768if (error != 0) {769linux_drop_fop(ldev);770linux_cdev_deref(filp->f_cdev);771kfree(filp);772return 
(error);773}774}775776/* hold on to the vnode - used for fstat() */777vref(filp->f_vnode);778779/* release the file from devfs */780finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);781linux_drop_fop(ldev);782return (ENXIO);783}784785#define LINUX_IOCTL_MIN_PTR 0x10000UL786#define LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX)787788static inline int789linux_remap_address(void **uaddr, size_t len)790{791uintptr_t uaddr_val = (uintptr_t)(*uaddr);792793if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR &&794uaddr_val < LINUX_IOCTL_MAX_PTR)) {795struct task_struct *pts = current;796if (pts == NULL) {797*uaddr = NULL;798return (1);799}800801/* compute data offset */802uaddr_val -= LINUX_IOCTL_MIN_PTR;803804/* check that length is within bounds */805if ((len > IOCPARM_MAX) ||806(uaddr_val + len) > pts->bsd_ioctl_len) {807*uaddr = NULL;808return (1);809}810811/* re-add kernel buffer address */812uaddr_val += (uintptr_t)pts->bsd_ioctl_data;813814/* update address location */815*uaddr = (void *)uaddr_val;816return (1);817}818return (0);819}820821int822linux_copyin(const void *uaddr, void *kaddr, size_t len)823{824if (linux_remap_address(__DECONST(void **, &uaddr), len)) {825if (uaddr == NULL)826return (-EFAULT);827memcpy(kaddr, uaddr, len);828return (0);829}830return (-copyin(uaddr, kaddr, len));831}832833int834linux_copyout(const void *kaddr, void *uaddr, size_t len)835{836if (linux_remap_address(&uaddr, len)) {837if (uaddr == NULL)838return (-EFAULT);839memcpy(uaddr, kaddr, len);840return (0);841}842return (-copyout(kaddr, uaddr, len));843}844845size_t846linux_clear_user(void *_uaddr, size_t _len)847{848uint8_t *uaddr = _uaddr;849size_t len = _len;850851/* make sure uaddr is aligned before going into the fast loop */852while (((uintptr_t)uaddr & 7) != 0 && len > 7) {853if (subyte(uaddr, 0))854return (_len);855uaddr++;856len--;857}858859/* zero 8 bytes at a time */860while (len > 7) {861#ifdef __LP64__862if (suword64(uaddr, 0))863return (_len);864#else865if 
(suword32(uaddr, 0))866return (_len);867if (suword32(uaddr + 4, 0))868return (_len);869#endif870uaddr += 8;871len -= 8;872}873874/* zero fill end, if any */875while (len > 0) {876if (subyte(uaddr, 0))877return (_len);878uaddr++;879len--;880}881return (0);882}883884int885linux_access_ok(const void *uaddr, size_t len)886{887uintptr_t saddr;888uintptr_t eaddr;889890/* get start and end address */891saddr = (uintptr_t)uaddr;892eaddr = (uintptr_t)uaddr + len;893894/* verify addresses are valid for userspace */895return ((saddr == eaddr) ||896(eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS));897}898899/*900* This function should return either EINTR or ERESTART depending on901* the signal type sent to this thread:902*/903static int904linux_get_error(struct task_struct *task, int error)905{906/* check for signal type interrupt code */907if (error == EINTR || error == ERESTARTSYS || error == ERESTART) {908error = -linux_schedule_get_interrupt_value(task);909if (error == 0)910error = EINTR;911}912return (error);913}914915static int916linux_file_ioctl_sub(struct file *fp, struct linux_file *filp,917const struct file_operations *fop, u_long cmd, caddr_t data,918struct thread *td)919{920struct task_struct *task = current;921unsigned size;922int error;923924size = IOCPARM_LEN(cmd);925/* refer to logic in sys_ioctl() */926if (size > 0) {927/*928* Setup hint for linux_copyin() and linux_copyout().929*930* Background: Linux code expects a user-space address931* while FreeBSD supplies a kernel-space address.932*/933task->bsd_ioctl_data = data;934task->bsd_ioctl_len = size;935data = (void *)LINUX_IOCTL_MIN_PTR;936} else {937/* fetch user-space pointer */938data = *(void **)data;939}940#ifdef COMPAT_FREEBSD32941if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {942/* try the compat IOCTL handler first */943if (fop->compat_ioctl != NULL) {944error = -OPW(fp, td, fop->compat_ioctl(filp,945cmd, (u_long)data));946} else {947error = ENOTTY;948}949950/* fallback to the regular IOCTL handler, if any 
*/951if (error == ENOTTY && fop->unlocked_ioctl != NULL) {952error = -OPW(fp, td, fop->unlocked_ioctl(filp,953cmd, (u_long)data));954}955} else956#endif957{958if (fop->unlocked_ioctl != NULL) {959error = -OPW(fp, td, fop->unlocked_ioctl(filp,960cmd, (u_long)data));961} else {962error = ENOTTY;963}964}965if (size > 0) {966task->bsd_ioctl_data = NULL;967task->bsd_ioctl_len = 0;968}969970if (error == EWOULDBLOCK) {971/* update kqfilter status, if any */972linux_file_kqfilter_poll(filp,973LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);974} else {975error = linux_get_error(task, error);976}977return (error);978}979980#define LINUX_POLL_TABLE_NORMAL ((poll_table *)1)981982/*983* This function atomically updates the poll wakeup state and returns984* the previous state at the time of update.985*/986static uint8_t987linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate)988{989int c, old;990991c = v->counter;992993while ((old = atomic_cmpxchg(v, c, pstate[c])) != c)994c = old;995996return (c);997}998999static int1000linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key)1001{1002static const uint8_t state[LINUX_FWQ_STATE_MAX] = {1003[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */1004[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */1005[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY,1006[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */1007};1008struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq);10091010switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {1011case LINUX_FWQ_STATE_QUEUED:1012linux_poll_wakeup(filp);1013return (1);1014default:1015return (0);1016}1017}10181019void1020linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p)1021{1022static const uint8_t state[LINUX_FWQ_STATE_MAX] = {1023[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY,1024[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP 
*/1025[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */1026[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED,1027};10281029/* check if we are called inside the select system call */1030if (p == LINUX_POLL_TABLE_NORMAL)1031selrecord(curthread, &filp->f_selinfo);10321033switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {1034case LINUX_FWQ_STATE_INIT:1035/* NOTE: file handles can only belong to one wait-queue */1036filp->f_wait_queue.wqh = wqh;1037filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback;1038add_wait_queue(wqh, &filp->f_wait_queue.wq);1039atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED);1040break;1041default:1042break;1043}1044}10451046static void1047linux_poll_wait_dequeue(struct linux_file *filp)1048{1049static const uint8_t state[LINUX_FWQ_STATE_MAX] = {1050[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */1051[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT,1052[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT,1053[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT,1054};10551056seldrain(&filp->f_selinfo);10571058switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {1059case LINUX_FWQ_STATE_NOT_READY:1060case LINUX_FWQ_STATE_QUEUED:1061case LINUX_FWQ_STATE_READY:1062remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq);1063break;1064default:1065break;1066}1067}10681069void1070linux_poll_wakeup(struct linux_file *filp)1071{1072/* this function should be NULL-safe */1073if (filp == NULL)1074return;10751076selwakeup(&filp->f_selinfo);10771078spin_lock(&filp->f_kqlock);1079filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ |1080LINUX_KQ_FLAG_NEED_WRITE;10811082/* make sure the "knote" gets woken up */1083KNOTE_LOCKED(&filp->f_selinfo.si_note, 1);1084spin_unlock(&filp->f_kqlock);1085}10861087static struct linux_file *1088__get_file_rcu(struct linux_file **f)1089{1090struct linux_file *file1, *file2;10911092file1 = READ_ONCE(*f);1093if (file1 == NULL)1094return (NULL);10951096if 
(!refcount_acquire_if_not_zero(1097file1->_file == NULL ? &file1->f_count : &file1->_file->f_count))1098return (ERR_PTR(-EAGAIN));10991100file2 = READ_ONCE(*f);1101if (file2 == file1)1102return (file2);11031104fput(file1);1105return (ERR_PTR(-EAGAIN));1106}11071108struct linux_file *1109linux_get_file_rcu(struct linux_file **f)1110{1111struct linux_file *file1;11121113for (;;) {1114file1 = __get_file_rcu(f);1115if (file1 == NULL)1116return (NULL);11171118if (IS_ERR(file1))1119continue;11201121return (file1);1122}1123}11241125struct linux_file *1126get_file_active(struct linux_file **f)1127{1128struct linux_file *file1;11291130rcu_read_lock();1131file1 = __get_file_rcu(f);1132rcu_read_unlock();1133if (IS_ERR(file1))1134file1 = NULL;11351136return (file1);1137}11381139static void1140linux_file_kqfilter_detach(struct knote *kn)1141{1142struct linux_file *filp = kn->kn_hook;11431144spin_lock(&filp->f_kqlock);1145knlist_remove(&filp->f_selinfo.si_note, kn, 1);1146spin_unlock(&filp->f_kqlock);1147}11481149static int1150linux_file_kqfilter_read_event(struct knote *kn, long hint)1151{1152struct linux_file *filp = kn->kn_hook;11531154mtx_assert(&filp->f_kqlock, MA_OWNED);11551156return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0);1157}11581159static int1160linux_file_kqfilter_write_event(struct knote *kn, long hint)1161{1162struct linux_file *filp = kn->kn_hook;11631164mtx_assert(&filp->f_kqlock, MA_OWNED);11651166return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 
1 : 0);1167}11681169static const struct filterops linux_dev_kqfiltops_read = {1170.f_isfd = 1,1171.f_detach = linux_file_kqfilter_detach,1172.f_event = linux_file_kqfilter_read_event,1173.f_copy = knote_triv_copy,1174};11751176static const struct filterops linux_dev_kqfiltops_write = {1177.f_isfd = 1,1178.f_detach = linux_file_kqfilter_detach,1179.f_event = linux_file_kqfilter_write_event,1180.f_copy = knote_triv_copy,1181};11821183static void1184linux_file_kqfilter_poll(struct linux_file *filp, int kqflags)1185{1186struct thread *td;1187const struct file_operations *fop;1188struct linux_cdev *ldev;1189int temp;11901191if ((filp->f_kqflags & kqflags) == 0)1192return;11931194td = curthread;11951196linux_get_fop(filp, &fop, &ldev);1197/* get the latest polling state */1198temp = OPW(filp->_file, td, fop->poll(filp, NULL));1199linux_drop_fop(ldev);12001201spin_lock(&filp->f_kqlock);1202/* clear kqflags */1203filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |1204LINUX_KQ_FLAG_NEED_WRITE);1205/* update kqflags */1206if ((temp & (POLLIN | POLLOUT)) != 0) {1207if ((temp & POLLIN) != 0)1208filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;1209if ((temp & POLLOUT) != 0)1210filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;12111212/* make sure the "knote" gets woken up */1213KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);1214}1215spin_unlock(&filp->f_kqlock);1216}12171218static int1219linux_file_kqfilter(struct file *file, struct knote *kn)1220{1221struct linux_file *filp;1222struct thread *td;1223int error;12241225td = curthread;1226filp = (struct linux_file *)file->f_data;1227filp->f_flags = file->f_flag;1228if (filp->f_op->poll == NULL)1229return (EINVAL);12301231spin_lock(&filp->f_kqlock);1232switch (kn->kn_filter) {1233case EVFILT_READ:1234filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ;1235kn->kn_fop = &linux_dev_kqfiltops_read;1236kn->kn_hook = filp;1237knlist_add(&filp->f_selinfo.si_note, kn, 1);1238error = 0;1239break;1240case EVFILT_WRITE:1241filp->f_kqflags |= 
LINUX_KQ_FLAG_HAS_WRITE;1242kn->kn_fop = &linux_dev_kqfiltops_write;1243kn->kn_hook = filp;1244knlist_add(&filp->f_selinfo.si_note, kn, 1);1245error = 0;1246break;1247default:1248error = EINVAL;1249break;1250}1251spin_unlock(&filp->f_kqlock);12521253if (error == 0) {1254linux_set_current(td);12551256/* update kqfilter status, if any */1257linux_file_kqfilter_poll(filp,1258LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);1259}1260return (error);1261}12621263static int1264linux_file_mmap_single(struct file *fp, const struct file_operations *fop,1265vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,1266int nprot, bool is_shared, struct thread *td)1267{1268struct task_struct *task;1269struct vm_area_struct *vmap;1270struct mm_struct *mm;1271struct linux_file *filp;1272vm_memattr_t attr;1273int error;12741275filp = (struct linux_file *)fp->f_data;1276filp->f_flags = fp->f_flag;12771278if (fop->mmap == NULL)1279return (EOPNOTSUPP);12801281linux_set_current(td);12821283/*1284* The same VM object might be shared by multiple processes1285* and the mm_struct is usually freed when a process exits.1286*1287* The atomic reference below makes sure the mm_struct is1288* available as long as the vmap is in the linux_vma_head.1289*/1290task = current;1291mm = task->mm;1292if (atomic_inc_not_zero(&mm->mm_users) == 0)1293return (EINVAL);12941295vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);1296vmap->vm_start = 0;1297vmap->vm_end = size;1298vmap->vm_pgoff = *offset / PAGE_SIZE;1299vmap->vm_pfn = 0;1300vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL);1301if (is_shared)1302vmap->vm_flags |= VM_SHARED;1303vmap->vm_ops = NULL;1304vmap->vm_file = get_file(filp);1305vmap->vm_mm = mm;13061307if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {1308error = linux_get_error(task, EINTR);1309} else {1310error = -OPW(fp, td, fop->mmap(filp, vmap));1311error = linux_get_error(task, error);1312up_write(&vmap->vm_mm->mmap_sem);1313}13141315if (error != 0) 
/*
 * Create a VM object backing an mmap() of a Linux character device.
 *
 * Calls the driver's mmap() file operation with a freshly allocated
 * vm_area_struct.  Depending on what the driver filled in, the mapping
 * is backed either by a cdev pager (driver installed vm_ops) or by an
 * sglist pager over the physical range the driver set up (vm_pfn).
 * On success *object is set and *offset is reset to 0, since the page
 * offset was already folded into vmap->vm_pgoff.
 *
 * Returns 0 or a positive errno value.
 */
static int
linux_file_mmap_single(struct file *fp, const struct file_operations *fop,
    vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,
    int nprot, bool is_shared, struct thread *td)
{
	struct task_struct *task;
	struct vm_area_struct *vmap;
	struct mm_struct *mm;
	struct linux_file *filp;
	vm_memattr_t attr;
	int error;

	filp = (struct linux_file *)fp->f_data;
	filp->f_flags = fp->f_flag;

	if (fop->mmap == NULL)
		return (EOPNOTSUPP);

	linux_set_current(td);

	/*
	 * The same VM object might be shared by multiple processes
	 * and the mm_struct is usually freed when a process exits.
	 *
	 * The atomic reference below makes sure the mm_struct is
	 * available as long as the vmap is in the linux_vma_head.
	 */
	task = current;
	mm = task->mm;
	if (atomic_inc_not_zero(&mm->mm_users) == 0)
		return (EINVAL);

	vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);
	vmap->vm_start = 0;
	vmap->vm_end = size;
	vmap->vm_pgoff = *offset / PAGE_SIZE;
	vmap->vm_pfn = 0;
	vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL);
	if (is_shared)
		vmap->vm_flags |= VM_SHARED;
	vmap->vm_ops = NULL;
	vmap->vm_file = get_file(filp);
	vmap->vm_mm = mm;

	/* call the driver's mmap handler with mmap_sem write-locked */
	if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {
		error = linux_get_error(task, EINTR);
	} else {
		error = -OPW(fp, td, fop->mmap(filp, vmap));
		error = linux_get_error(task, error);
		up_write(&vmap->vm_mm->mmap_sem);
	}

	if (error != 0) {
		linux_cdev_handle_free(vmap);
		return (error);
	}

	attr = pgprot2cachemode(vmap->vm_page_prot);

	if (vmap->vm_ops != NULL) {
		struct vm_area_struct *ptr;
		void *vm_private_data;
		bool vm_no_fault;

		/*
		 * A driver that installs vm_ops must provide open/close
		 * and a private-data key, which is used below to share
		 * one VM area struct between processes.
		 */
		if (vmap->vm_ops->open == NULL ||
		    vmap->vm_ops->close == NULL ||
		    vmap->vm_private_data == NULL) {
			/* free allocated VM area struct */
			linux_cdev_handle_free(vmap);
			return (EINVAL);
		}

		vm_private_data = vmap->vm_private_data;

		rw_wlock(&linux_vma_lock);
		TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
			if (ptr->vm_private_data == vm_private_data)
				break;
		}
		/* check if there is an existing VM area struct */
		if (ptr != NULL) {
			/* check if the VM area structure is invalid */
			if (ptr->vm_ops == NULL ||
			    ptr->vm_ops->open == NULL ||
			    ptr->vm_ops->close == NULL) {
				error = ESTALE;
				vm_no_fault = 1;
			} else {
				error = EEXIST;
				vm_no_fault = (ptr->vm_ops->fault == NULL);
			}
		} else {
			/* insert VM area structure into list */
			TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry);
			error = 0;
			vm_no_fault = (vmap->vm_ops->fault == NULL);
		}
		rw_wunlock(&linux_vma_lock);

		if (error != 0) {
			/* free allocated VM area struct */
			linux_cdev_handle_free(vmap);
			/* check for stale VM area struct */
			if (error != EEXIST)
				return (error);
		}

		/* check if there is no fault handler */
		if (vm_no_fault) {
			*object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE,
			    &linux_cdev_pager_ops[1], size, nprot, *offset,
			    td->td_ucred);
		} else {
			*object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE,
			    &linux_cdev_pager_ops[0], size, nprot, *offset,
			    td->td_ucred);
		}

		/* check if allocating the VM object failed */
		if (*object == NULL) {
			if (error == 0) {
				/* remove VM area struct from list */
				linux_cdev_handle_remove(vmap);
				/* free allocated VM area struct */
				linux_cdev_handle_free(vmap);
			}
			return (EINVAL);
		}
	} else {
		/*
		 * No vm_ops: the driver set up a contiguous physical
		 * range starting at vm_pfn; back it with an SG pager.
		 */
		struct sglist *sg;

		sg = sglist_alloc(1, M_WAITOK);
		sglist_append_phys(sg,
		    (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len);

		*object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len,
		    nprot, 0, td->td_ucred);

		linux_cdev_handle_free(vmap);

		if (*object == NULL) {
			sglist_free(sg);
			return (EINVAL);
		}
	}

	/* apply the cache mode requested via vm_page_prot, if any */
	if (attr != VM_MEMATTR_DEFAULT) {
		VM_OBJECT_WLOCK(*object);
		vm_object_set_memattr(*object, attr);
		VM_OBJECT_WUNLOCK(*object);
	}
	*offset = 0;
	return (0);
}
struct cdevsw linuxcdevsw = {
	.d_version = D_VERSION,
	.d_fdopen = linux_dev_fdopen,
	.d_name = "lkpidev",
};

/*
 * fo_read implementation: forward a single-iovec uio to the Linux
 * driver's read() file operation and advance the uio by the number of
 * bytes actually transferred.  Negative driver return values are
 * converted to positive errno values via linux_get_error().
 */
static int
linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	ssize_t bytes;
	int error;

	error = 0;
	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	/* XXX no support for I/O vectors currently */
	if (uio->uio_iovcnt != 1)
		return (EOPNOTSUPP);
	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
		return (EINVAL);
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->read != NULL) {
		bytes = OPW(file, td, fop->read(filp,
		    uio->uio_iov->iov_base,
		    uio->uio_iov->iov_len, &uio->uio_offset));
		if (bytes >= 0) {
			/* consume "bytes" bytes from the iovec */
			uio->uio_iov->iov_base =
			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
			uio->uio_iov->iov_len -= bytes;
			uio->uio_resid -= bytes;
		} else {
			error = linux_get_error(current, -bytes);
		}
	} else
		error = ENXIO;

	/* update kqfilter status, if any */
	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ);
	linux_drop_fop(ldev);

	return (error);
}
/*
 * fo_write implementation: forward a single-iovec uio to the Linux
 * driver's write() file operation and advance the uio by the number of
 * bytes actually transferred.  Mirrors linux_file_read().
 */
static int
linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	ssize_t bytes;
	int error;

	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	/* XXX no support for I/O vectors currently */
	if (uio->uio_iovcnt != 1)
		return (EOPNOTSUPP);
	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
		return (EINVAL);
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->write != NULL) {
		bytes = OPW(file, td, fop->write(filp,
		    uio->uio_iov->iov_base,
		    uio->uio_iov->iov_len, &uio->uio_offset));
		if (bytes >= 0) {
			/* consume "bytes" bytes from the iovec */
			uio->uio_iov->iov_base =
			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
			uio->uio_iov->iov_len -= bytes;
			uio->uio_resid -= bytes;
			error = 0;
		} else {
			error = linux_get_error(current, -bytes);
		}
	} else
		error = ENXIO;

	/* update kqfilter status, if any */
	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE);

	linux_drop_fop(ldev);

	return (error);
}

/*
 * fo_poll implementation: delegate to the driver's poll() callback and
 * mask the result with the requested events.  Drivers without poll()
 * report no events.
 */
static int
linux_file_poll(struct file *file, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int revents;

	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->poll != NULL) {
		revents = OPW(file, td, fop->poll(filp,
		    LINUX_POLL_TABLE_NORMAL)) & events;
	} else {
		revents = 0;
	}
	linux_drop_fop(ldev);
	return (revents);
}
/*
 * fo_close implementation: invoke the driver's release() callback,
 * tear down signal ownership, drop the vnode and cdev references and
 * finally free the Linux file after an RCU grace period.
 */
static int
linux_file_close(struct file *file, struct thread *td)
{
	struct linux_file *filp;
	int (*release)(struct inode *, struct linux_file *);
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int error;

	filp = (struct linux_file *)file->f_data;

	KASSERT(file_count(filp) == 0,
	    ("File refcount(%d) is not zero", file_count(filp)));

	if (td == NULL)
		td = curthread;

	error = 0;
	filp->f_flags = file->f_flag;
	linux_set_current(td);
	linux_poll_wait_dequeue(filp);
	linux_get_fop(filp, &fop, &ldev);
	/*
	 * Always use the real release function, if any, to avoid
	 * leaking device resources:
	 */
	release = filp->f_op->release;
	if (release != NULL)
		error = -OPW(file, td, release(filp->f_vnode, filp));
	funsetown(&filp->f_sigio);
	if (filp->f_vnode != NULL)
		vrele(filp->f_vnode);
	linux_drop_fop(ldev);
	ldev = filp->f_cdev;
	if (ldev != NULL)
		linux_cdev_deref(ldev);
	/* wait for any RCU readers of filp before freeing it */
	linux_synchronize_rcu(RCU_TYPE_REGULAR);
	kfree(filp);

	return (error);
}

/*
 * fo_ioctl implementation: handle the generic FreeBSD file ioctls
 * (FIONBIO, FIOASYNC, FIO[SG]ETOWN, FIODGNAME) locally and forward
 * everything else to the Linux driver via linux_file_ioctl_sub().
 */
static int
linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	struct fiodgname_arg *fgn;
	const char *p;
	int error, i;

	error = 0;
	filp = (struct linux_file *)fp->f_data;
	filp->f_flags = fp->f_flag;
	linux_get_fop(filp, &fop, &ldev);

	linux_set_current(td);
	switch (cmd) {
	case FIONBIO:
		break;
	case FIOASYNC:
		if (fop->fasync == NULL)
			break;
		error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC));
		break;
	case FIOSETOWN:
		error = fsetown(*(int *)data, &filp->f_sigio);
		if (error == 0) {
			if (fop->fasync == NULL)
				break;
			error = -OPW(fp, td, fop->fasync(0, filp,
			    fp->f_flag & FASYNC));
		}
		break;
	case FIOGETOWN:
		*(int *)data = fgetown(&filp->f_sigio);
		break;
	case FIODGNAME:
#ifdef COMPAT_FREEBSD32
	case FIODGNAME_32:
#endif
		if (filp->f_cdev == NULL || filp->f_cdev->cdev == NULL) {
			error = ENXIO;
			break;
		}
		fgn = data;
		p = devtoname(filp->f_cdev->cdev);
		i = strlen(p) + 1;
		if (i > fgn->len) {
			error = EINVAL;
			break;
		}
		error = copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i);
		break;
	default:
		error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td);
		break;
	}
	linux_drop_fop(ldev);
	return (error);
}
td);1630break;1631}1632linux_drop_fop(ldev);1633return (error);1634}16351636static int1637linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,1638vm_prot_t maxprot, int flags, struct file *fp,1639vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp)1640{1641/*1642* Character devices do not provide private mappings1643* of any kind:1644*/1645if ((maxprot & VM_PROT_WRITE) == 0 &&1646(prot & VM_PROT_WRITE) != 0)1647return (EACCES);1648if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0)1649return (EINVAL);16501651return (linux_file_mmap_single(fp, fop, foff, objsize, objp,1652(int)prot, (flags & MAP_SHARED) ? true : false, td));1653}16541655static int1656linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,1657vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,1658struct thread *td)1659{1660struct linux_file *filp;1661const struct file_operations *fop;1662struct linux_cdev *ldev;1663struct mount *mp;1664struct vnode *vp;1665vm_object_t object;1666vm_prot_t maxprot;1667int error;16681669filp = (struct linux_file *)fp->f_data;16701671vp = filp->f_vnode;1672if (vp == NULL)1673return (EOPNOTSUPP);16741675/*1676* Ensure that file and memory protections are1677* compatible.1678*/1679mp = vp->v_mount;1680if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {1681maxprot = VM_PROT_NONE;1682if ((prot & VM_PROT_EXECUTE) != 0)1683return (EACCES);1684} else1685maxprot = VM_PROT_EXECUTE;1686if ((fp->f_flag & FREAD) != 0)1687maxprot |= VM_PROT_READ;1688else if ((prot & VM_PROT_READ) != 0)1689return (EACCES);16901691/*1692* If we are sharing potential changes via MAP_SHARED and we1693* are trying to get write permission although we opened it1694* without asking for it, bail out.1695*1696* Note that most character devices always share mappings.1697*1698* Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE1699* requests rather than doing it here.1700*/1701if ((flags & MAP_SHARED) != 0) {1702if 
((fp->f_flag & FWRITE) != 0)1703maxprot |= VM_PROT_WRITE;1704else if ((prot & VM_PROT_WRITE) != 0)1705return (EACCES);1706}1707maxprot &= cap_maxprot;17081709linux_get_fop(filp, &fop, &ldev);1710error = linux_file_mmap_sub(td, size, prot, maxprot, flags, fp,1711&foff, fop, &object);1712if (error != 0)1713goto out;17141715error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,1716foff, FALSE, td);1717if (error != 0)1718vm_object_deallocate(object);1719out:1720linux_drop_fop(ldev);1721return (error);1722}17231724static int1725linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)1726{1727struct linux_file *filp;1728struct vnode *vp;1729int error;17301731filp = (struct linux_file *)fp->f_data;1732if (filp->f_vnode == NULL)1733return (EOPNOTSUPP);17341735vp = filp->f_vnode;17361737vn_lock(vp, LK_SHARED | LK_RETRY);1738error = VOP_STAT(vp, sb, curthread->td_ucred, NOCRED);1739VOP_UNLOCK(vp);17401741return (error);1742}17431744static int1745linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif,1746struct filedesc *fdp)1747{1748struct linux_file *filp;1749struct vnode *vp;1750int error;17511752filp = fp->f_data;1753vp = filp->f_vnode;1754if (vp == NULL) {1755error = 0;1756kif->kf_type = KF_TYPE_DEV;1757} else {1758vref(vp);1759FILEDESC_SUNLOCK(fdp);1760error = vn_fill_kinfo_vnode(vp, kif);1761vrele(vp);1762kif->kf_type = KF_TYPE_VNODE;1763FILEDESC_SLOCK(fdp);1764}1765return (error);1766}17671768unsigned int1769linux_iminor(struct inode *inode)1770{1771struct linux_cdev *ldev;17721773if (inode == NULL || inode->v_rdev == NULL ||1774inode->v_rdev->si_devsw != &linuxcdevsw)1775return (-1U);1776ldev = inode->v_rdev->si_drv1;1777if (ldev == NULL)1778return (-1U);17791780return (minor(ldev->dev));1781}17821783static int1784linux_file_kcmp(struct file *fp1, struct file *fp2, struct thread *td)1785{1786struct linux_file *filp1, *filp2;17871788if (fp2->f_type != DTYPE_DEV)1789return (3);17901791filp1 = fp1->f_data;1792filp2 = 
fp2->f_data;1793return (kcmp_cmp((uintptr_t)filp1->f_cdev, (uintptr_t)filp2->f_cdev));1794}17951796const struct fileops linuxfileops = {1797.fo_read = linux_file_read,1798.fo_write = linux_file_write,1799.fo_truncate = invfo_truncate,1800.fo_kqfilter = linux_file_kqfilter,1801.fo_stat = linux_file_stat,1802.fo_fill_kinfo = linux_file_fill_kinfo,1803.fo_poll = linux_file_poll,1804.fo_close = linux_file_close,1805.fo_ioctl = linux_file_ioctl,1806.fo_mmap = linux_file_mmap,1807.fo_chmod = invfo_chmod,1808.fo_chown = invfo_chown,1809.fo_sendfile = invfo_sendfile,1810.fo_cmp = linux_file_kcmp,1811.fo_flags = DFLAG_PASSABLE,1812};18131814static char *1815devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap)1816{1817unsigned int len;1818char *p;1819va_list aq;18201821va_copy(aq, ap);1822len = vsnprintf(NULL, 0, fmt, aq);1823va_end(aq);18241825if (dev != NULL)1826p = devm_kmalloc(dev, len + 1, gfp);1827else1828p = kmalloc(len + 1, gfp);1829if (p != NULL)1830vsnprintf(p, len + 1, fmt, ap);18311832return (p);1833}18341835char *1836kvasprintf(gfp_t gfp, const char *fmt, va_list ap)1837{18381839return (devm_kvasprintf(NULL, gfp, fmt, ap));1840}18411842char *1843lkpi_devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...)1844{1845va_list ap;1846char *p;18471848va_start(ap, fmt);1849p = devm_kvasprintf(dev, gfp, fmt, ap);1850va_end(ap);18511852return (p);1853}18541855char *1856kasprintf(gfp_t gfp, const char *fmt, ...)1857{1858va_list ap;1859char *p;18601861va_start(ap, fmt);1862p = kvasprintf(gfp, fmt, ap);1863va_end(ap);18641865return (p);1866}18671868int1869__lkpi_hexdump_printf(void *arg1 __unused, const char *fmt, ...)1870{1871va_list ap;1872int result;18731874va_start(ap, fmt);1875result = vprintf(fmt, ap);1876va_end(ap);1877return (result);1878}18791880int1881__lkpi_hexdump_sbuf_printf(void *arg1, const char *fmt, ...)1882{1883va_list ap;1884int result;18851886va_start(ap, fmt);1887result = sbuf_vprintf(arg1, fmt, 
ap);1888va_end(ap);1889return (result);1890}18911892void1893lkpi_hex_dump(int(*_fpf)(void *, const char *, ...), void *arg1,1894const char *level, const char *prefix_str,1895const int prefix_type, const int rowsize, const int groupsize,1896const void *buf, size_t len, const bool ascii, const bool trailing_newline)1897{1898typedef const struct { long long value; } __packed *print_64p_t;1899typedef const struct { uint32_t value; } __packed *print_32p_t;1900typedef const struct { uint16_t value; } __packed *print_16p_t;1901const void *buf_old = buf;1902int row, linelen, ret;19031904while (len > 0) {1905linelen = 0;1906if (level != NULL) {1907ret = _fpf(arg1, "%s", level);1908if (ret < 0)1909break;1910linelen += ret;1911}1912if (prefix_str != NULL) {1913ret = _fpf(1914arg1, "%s%s", linelen ? " " : "", prefix_str);1915if (ret < 0)1916break;1917linelen += ret;1918}19191920switch (prefix_type) {1921case DUMP_PREFIX_ADDRESS:1922ret = _fpf(1923arg1, "%s[%p]", linelen ? " " : "", buf);1924if (ret < 0)1925return;1926linelen += ret;1927break;1928case DUMP_PREFIX_OFFSET:1929ret = _fpf(1930arg1, "%s[%#tx]", linelen ? " " : "",1931((const char *)buf - (const char *)buf_old));1932if (ret < 0)1933return;1934linelen += ret;1935break;1936default:1937break;1938}1939for (row = 0; row != rowsize; row++) {1940if (groupsize == 8 && len > 7) {1941ret = _fpf(1942arg1, "%s%016llx", linelen ? " " : "",1943((print_64p_t)buf)->value);1944if (ret < 0)1945return;1946linelen += ret;1947buf = (const uint8_t *)buf + 8;1948len -= 8;1949} else if (groupsize == 4 && len > 3) {1950ret = _fpf(1951arg1, "%s%08x", linelen ? " " : "",1952((print_32p_t)buf)->value);1953if (ret < 0)1954return;1955linelen += ret;1956buf = (const uint8_t *)buf + 4;1957len -= 4;1958} else if (groupsize == 2 && len > 1) {1959ret = _fpf(1960arg1, "%s%04x", linelen ? 
struct hdtb_context {
	char *linebuf;	/* current write position in the output buffer */
	size_t linebuflen;	/* space remaining at linebuf */
	int written;	/* total bytes that would have been written */
};

/*
 * printf-style callback used by lkpi_hex_dump_to_buffer(): formats
 * into the caller-supplied buffer with vsnprintf() and keeps counting
 * even after the buffer is exhausted, matching snprintf() semantics.
 */
static int
hdtb_cb(void *arg, const char *format, ...)
{
	struct hdtb_context *context;
	int written;
	va_list args;

	context = arg;

	va_start(args, format);
	written = vsnprintf(
	    context->linebuf, context->linebuflen, format, args);
	va_end(args);

	if (written < 0)
		return (written);

	/*
	 * Linux' hex_dump_to_buffer() function has the same behaviour as
	 * snprintf() basically. Therefore, it returns the number of bytes it
	 * would have written if the destination buffer was large enough.
	 *
	 * If the destination buffer was exhausted, lkpi_hex_dump() will
	 * continue to call this callback but it will only compute the bytes it
	 * would have written but write nothing to that buffer.
	 */
	context->written += written;

	/* written >= 0 here, so the signed/unsigned comparison is safe */
	if (written < context->linebuflen) {
		context->linebuf += written;
		context->linebuflen -= written;
	} else {
		/* truncated: pin the cursor at the end of the buffer */
		context->linebuf += context->linebuflen;
		context->linebuflen = 0;
	}

	return (written);
}
&context, NULL, NULL, DUMP_PREFIX_NONE,2051rowsize, groupsize, buf, len, ascii, false);20522053written = context.written;20542055return (written);2056}20572058static void2059linux_timer_callback_wrapper(void *context)2060{2061struct timer_list *timer;20622063timer = context;20642065/* the timer is about to be shutdown permanently */2066if (timer->function == NULL)2067return;20682069if (linux_set_current_flags(curthread, M_NOWAIT)) {2070/* try again later */2071callout_reset(&timer->callout, 1,2072&linux_timer_callback_wrapper, timer);2073return;2074}20752076timer->function(timer->data);2077}20782079static int2080linux_timer_jiffies_until(unsigned long expires)2081{2082unsigned long delta = expires - jiffies;20832084/*2085* Guard against already expired values and make sure that the value can2086* be used as a tick count, rather than a jiffies count.2087*/2088if ((long)delta < 1)2089delta = 1;2090else if (delta > INT_MAX)2091delta = INT_MAX;2092return ((int)delta);2093}20942095int2096mod_timer(struct timer_list *timer, unsigned long expires)2097{2098int ret;20992100timer->expires = expires;2101ret = callout_reset(&timer->callout,2102linux_timer_jiffies_until(expires),2103&linux_timer_callback_wrapper, timer);21042105MPASS(ret == 0 || ret == 1);21062107return (ret == 1);2108}21092110void2111add_timer(struct timer_list *timer)2112{21132114callout_reset(&timer->callout,2115linux_timer_jiffies_until(timer->expires),2116&linux_timer_callback_wrapper, timer);2117}21182119void2120add_timer_on(struct timer_list *timer, int cpu)2121{21222123callout_reset_on(&timer->callout,2124linux_timer_jiffies_until(timer->expires),2125&linux_timer_callback_wrapper, timer, cpu);2126}21272128int2129timer_delete(struct timer_list *timer)2130{21312132if (callout_stop(&(timer)->callout) == -1)2133return (0);2134return (1);2135}21362137int2138timer_delete_sync(struct timer_list *timer)2139{21402141if (callout_drain(&(timer)->callout) == -1)2142return (0);2143return 
/*
 * Greatest common divisor of two 64-bit values using Euclid's
 * algorithm.  lkpi_gcd_64(a, 0) == a and lkpi_gcd_64(0, b) == b.
 */
static uint64_t
lkpi_gcd_64(uint64_t a, uint64_t b)
{
	uint64_t rem;

	while (b != 0) {
		rem = a % b;
		a = b;
		b = rem;
	}
	return (a);
}
/*
 * Indefinite wait for done != 0 with or without signals.
 *
 * flags != 0 selects an interruptible sleep; a delivered signal makes
 * the function return -ERESTARTSYS after recording the interrupt value
 * on the task.  On normal completion the counter is consumed unless it
 * was saturated by complete_all() (done == UINT_MAX).
 */
int
linux_wait_for_common(struct completion *c, int flags)
{
	struct task_struct *task;
	int error;

	/* do not sleep when the scheduler is stopped (e.g. panic) */
	if (SCHEDULER_STOPPED())
		return (0);

	task = current;

	if (flags != 0)
		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
	else
		flags = SLEEPQ_SLEEP;
	error = 0;
	for (;;) {
		sleepq_lock(c);
		if (c->done)
			break;
		/* re-queue and sleep; wakeups may be spurious */
		sleepq_add(c, NULL, "completion", flags, 0);
		if (flags & SLEEPQ_INTERRUPTIBLE) {
			DROP_GIANT();
			error = -sleepq_wait_sig(c, 0);
			PICKUP_GIANT();
			if (error != 0) {
				linux_schedule_save_interrupt_value(task,
				    error);
				error = -ERESTARTSYS;
				goto intr;
			}
		} else {
			DROP_GIANT();
			sleepq_wait(c, 0);
			PICKUP_GIANT();
		}
	}
	/* UINT_MAX means complete_all() was called: do not decrement */
	if (c->done != UINT_MAX)
		c->done--;
	sleepq_release(c);

intr:
	return (error);
}
/*
 * Non-blocking variant of wait_for_completion(): consume one completion
 * if available.  Returns non-zero when the completion was done.  The
 * UINT_MAX sentinel (set by complete_all()) is never decremented.
 */
int
linux_try_wait_for_completion(struct completion *c)
{
	int isdone;

	sleepq_lock(c);
	isdone = (c->done != 0);
	if (c->done != 0 && c->done != UINT_MAX)
		c->done--;
	sleepq_release(c);
	return (isdone);
}

/*
 * Return non-zero when the completion is currently done, without
 * consuming it.
 */
int
linux_completion_done(struct completion *c)
{
	int isdone;

	sleepq_lock(c);
	isdone = (c->done != 0);
	sleepq_release(c);
	return (isdone);
}
/*
 * Register a character device together with its struct device, rolling
 * back the cdev registration if device_add() fails.  A zero dev->devt
 * means no character device node is wanted and only the device itself
 * is added.  Returns 0 or a negative errno value (Linux convention).
 */
int
linux_cdev_device_add(struct linux_cdev *ldev, struct device *dev)
{
	int ret;

	if (dev->devt != 0) {
		/* Set parent kernel object. */
		ldev->kobj.parent = &dev->kobj;

		/*
		 * Unlike Linux we require the kobject of the
		 * character device structure to have a valid name
		 * before calling this function:
		 */
		if (ldev->kobj.name == NULL)
			return (-EINVAL);

		ret = cdev_add(ldev, dev->devt, 1);
		if (ret)
			return (ret);
	}
	ret = device_add(dev);
	/* undo the cdev registration on device_add() failure */
	if (ret != 0 && dev->devt != 0)
		cdev_del(ldev);
	return (ret);
}
ni;24922493nb = arg;2494ni.ifp = ifp;2495ni.dev = (struct net_device *)ifp;2496nb->notifier_call(nb, NETDEV_UNREGISTER, &ni);2497}24982499static void2500linux_handle_iflladdr_event(void *arg, struct ifnet *ifp)2501{2502struct notifier_block *nb;2503struct netdev_notifier_info ni;25042505nb = arg;2506ni.ifp = ifp;2507ni.dev = (struct net_device *)ifp;2508nb->notifier_call(nb, NETDEV_CHANGEADDR, &ni);2509}25102511static void2512linux_handle_ifaddr_event(void *arg, struct ifnet *ifp)2513{2514struct notifier_block *nb;2515struct netdev_notifier_info ni;25162517nb = arg;2518ni.ifp = ifp;2519ni.dev = (struct net_device *)ifp;2520nb->notifier_call(nb, NETDEV_CHANGEIFADDR, &ni);2521}25222523int2524register_netdevice_notifier(struct notifier_block *nb)2525{25262527nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER(2528ifnet_link_event, linux_handle_ifnet_link_event, nb, 0);2529nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER(2530ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0);2531nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(2532ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0);2533nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER(2534iflladdr_event, linux_handle_iflladdr_event, nb, 0);25352536return (0);2537}25382539int2540register_inetaddr_notifier(struct notifier_block *nb)2541{25422543nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER(2544ifaddr_event, linux_handle_ifaddr_event, nb, 0);2545return (0);2546}25472548int2549unregister_netdevice_notifier(struct notifier_block *nb)2550{25512552EVENTHANDLER_DEREGISTER(ifnet_link_event,2553nb->tags[NETDEV_UP]);2554EVENTHANDLER_DEREGISTER(ifnet_arrival_event,2555nb->tags[NETDEV_REGISTER]);2556EVENTHANDLER_DEREGISTER(ifnet_departure_event,2557nb->tags[NETDEV_UNREGISTER]);2558EVENTHANDLER_DEREGISTER(iflladdr_event,2559nb->tags[NETDEV_CHANGEADDR]);25602561return (0);2562}25632564int2565unregister_inetaddr_notifier(struct notifier_block 
struct list_sort_thunk {
	/* user comparison callback and its opaque argument */
	int (*cmp)(void *, struct list_head *, struct list_head *);
	void *priv;
};

/*
 * qsort_r() adapter: unwrap the two list_head pointers and call the
 * user's Linux-style comparison function.
 */
static inline int
linux_le_cmp(const void *d1, const void *d2, void *priv)
{
	struct list_head *le1, *le2;
	struct list_sort_thunk *thunk;

	thunk = priv;
	le1 = *(__DECONST(struct list_head **, d1));
	le2 = *(__DECONST(struct list_head **, d2));
	return ((thunk->cmp)(thunk->priv, le1, le2));
}

/*
 * Sort a Linux linked list by copying the element pointers into a
 * temporary array, sorting the array with qsort_r() and relinking the
 * list in the new order.
 *
 * NOTE(review): Linux's list_sort() is documented as stable; qsort(3)
 * makes no stability guarantee -- confirm no caller relies on stable
 * ordering of equal elements.
 */
void
list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv,
    struct list_head *a, struct list_head *b))
{
	struct list_sort_thunk thunk;
	struct list_head **ar, *le;
	size_t count, i;

	count = 0;
	list_for_each(le, head)
		count++;
	ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK);
	i = 0;
	list_for_each(le, head)
		ar[i++] = le;
	thunk.cmp = cmp;
	thunk.priv = priv;
	qsort_r(ar, count, sizeof(struct list_head *), linux_le_cmp, &thunk);
	/* rebuild the list in sorted order */
	INIT_LIST_HEAD(head);
	for (i = 0; i < count; i++)
		list_add_tail(ar[i], head);
	free(ar, M_KMALLOC);
}
/*
 * Register "count" character devices with consecutive minor numbers
 * starting at baseminor, all sharing the same name and file operations.
 *
 * NOTE(review): on a mid-loop cdev_add() failure the devices created in
 * earlier iterations are left registered and the failing cdev is not
 * released; callers appear to treat any non-zero return as fatal --
 * confirm whether rollback is needed here.
 */
int
__register_chrdev(unsigned int major, unsigned int baseminor,
    unsigned int count, const char *name,
    const struct file_operations *fops)
{
	struct linux_cdev *cdev;
	int ret = 0;
	int i;

	for (i = baseminor; i < baseminor + count; i++) {
		cdev = cdev_alloc();
		cdev->ops = fops;
		kobject_set_name(&cdev->kobj, name);

		ret = cdev_add(cdev, makedev(major, i), 1);
		if (ret != 0)
			break;
	}
	return (ret);
}
linuxkpi_device.c at some point. */2748bool2749device_can_wakeup(struct device *dev)2750{27512752if (dev == NULL)2753return (false);2754/*2755* XXX-BZ iwlwifi queries it as part of enabling WoWLAN.2756* Normally this would be based on a bool in dev->power.XXX.2757* Check such as PCI PCIM_PCAP_*PME. We have no way to enable this yet.2758* We may get away by directly calling into bsddev for as long as2759* we can assume PCI only avoiding changing struct device breaking KBI.2760*/2761pr_debug("%s:%d: not enabled; see comment.\n", __func__, __LINE__);2762return (false);2763}27642765static void2766devm_device_group_remove(struct device *dev, void *p)2767{2768const struct attribute_group **dr = p;2769const struct attribute_group *group = *dr;27702771sysfs_remove_group(&dev->kobj, group);2772}27732774int2775lkpi_devm_device_add_group(struct device *dev,2776const struct attribute_group *group)2777{2778const struct attribute_group **dr;2779int ret;27802781dr = devres_alloc(devm_device_group_remove, sizeof(*dr), GFP_KERNEL);2782if (dr == NULL)2783return (-ENOMEM);27842785ret = sysfs_create_group(&dev->kobj, group);2786if (ret == 0) {2787*dr = group;2788devres_add(dev, dr);2789} else2790devres_free(dr);27912792return (ret);2793}27942795#if defined(__i386__) || defined(__amd64__)2796bool linux_cpu_has_clflush;2797struct cpuinfo_x86 boot_cpu_data;2798struct cpuinfo_x86 *__cpu_data;2799#endif28002801cpumask_t *2802lkpi_get_static_single_cpu_mask(int cpuid)2803{28042805KASSERT((cpuid >= 0 && cpuid <= mp_maxid), ("%s: invalid cpuid %d\n",2806__func__, cpuid));2807KASSERT(!CPU_ABSENT(cpuid), ("%s: cpu with cpuid %d is absent\n",2808__func__, cpuid));28092810return (static_single_cpu_mask[cpuid]);2811}28122813bool2814lkpi_xen_initial_domain(void)2815{2816#ifdef XENHVM2817return (xen_initial_domain());2818#else2819return (false);2820#endif2821}28222823bool2824lkpi_xen_pv_domain(void)2825{2826#ifdef XENHVM2827return (xen_pv_domain());2828#else2829return 
(false);
#endif
}

/*
 * One-time LinuxKPI initialization, run at SI_SUB_DRIVERS/SI_ORDER_SECOND:
 * mirrors x86 CPU identification into the Linux-style cpuinfo shims,
 * creates the sysctl-backed sysfs roots ("sys", "class", "device"),
 * registers the "misc" class, initializes the PCI lists/locks and wait
 * queues, and pre-builds the static single-CPU masks used by cpumask_of().
 */
static void
linux_compat_init(void *arg)
{
	struct sysctl_oid *rootoid;
	int i;

#if defined(__i386__) || defined(__amd64__)
	/* Map FreeBSD CPU vendor IDs onto Linux X86_VENDOR_* indices. */
	static const uint32_t x86_vendors[X86_VENDOR_NUM] = {
		[X86_VENDOR_INTEL] = CPU_VENDOR_INTEL,
		[X86_VENDOR_CYRIX] = CPU_VENDOR_CYRIX,
		[X86_VENDOR_AMD] = CPU_VENDOR_AMD,
		[X86_VENDOR_UMC] = CPU_VENDOR_UMC,
		[X86_VENDOR_CENTAUR] = CPU_VENDOR_CENTAUR,
		[X86_VENDOR_TRANSMETA] = CPU_VENDOR_TRANSMETA,
		[X86_VENDOR_NSC] = CPU_VENDOR_NSC,
		[X86_VENDOR_HYGON] = CPU_VENDOR_HYGON,
	};
	uint8_t x86_vendor = X86_VENDOR_UNKNOWN;

	for (i = 0; i < X86_VENDOR_NUM; i++) {
		if (cpu_vendor_id != 0 && cpu_vendor_id == x86_vendors[i]) {
			x86_vendor = i;
			break;
		}
	}
	linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH);
	boot_cpu_data.x86_clflush_size = cpu_clflush_line_size;
	boot_cpu_data.x86_max_cores = mp_ncpus;
	boot_cpu_data.x86 = CPUID_TO_FAMILY(cpu_id);
	boot_cpu_data.x86_model = CPUID_TO_MODEL(cpu_id);
	boot_cpu_data.x86_vendor = x86_vendor;

	/* Per-CPU copies; all filled from the boot CPU's values. */
	__cpu_data = kmalloc_array(mp_maxid + 1,
	    sizeof(*__cpu_data), M_WAITOK | M_ZERO);
	CPU_FOREACH(i) {
		__cpu_data[i].x86_clflush_size = cpu_clflush_line_size;
		__cpu_data[i].x86_max_cores = mp_ncpus;
		__cpu_data[i].x86 = CPUID_TO_FAMILY(cpu_id);
		__cpu_data[i].x86_model = CPUID_TO_MODEL(cpu_id);
		__cpu_data[i].x86_vendor = x86_vendor;
	}
#endif
	rw_init(&linux_vma_lock, "lkpi-vma-lock");

	/* Wire the sysfs hierarchy onto sysctl nodes. */
	rootoid = SYSCTL_ADD_ROOT_NODE(NULL,
	    OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
	kobject_init(&linux_class_root, &linux_class_ktype);
	kobject_set_name(&linux_class_root, "class");
	linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),
	    OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");
	kobject_init(&linux_root_device.kobj, &linux_dev_ktype);
	kobject_set_name(&linux_root_device.kobj, "device");
	linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL,
	    SYSCTL_CHILDREN(rootoid), OID_AUTO, "device",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "device");
	linux_root_device.bsddev = root_bus;
	linux_class_misc.name = "misc";
	class_register(&linux_class_misc);
	INIT_LIST_HEAD(&pci_drivers);
	INIT_LIST_HEAD(&pci_devices);
	spin_lock_init(&pci_lock);
	init_waitqueue_head(&linux_bit_waitq);
	init_waitqueue_head(&linux_var_waitq);

	CPU_COPY(&all_cpus, &cpu_online_mask);
	/*
	 * Generate a single-CPU cpumask_t for each CPU (possibly) in the system.
	 * CPUs are indexed from 0..(mp_maxid). The entry for cpuid 0 will only
	 * have itself in the cpumask, cpuid 1 only itself on entry 1, and so on.
	 * This is used by cpumask_of() (and possibly others in the future) for,
	 * e.g., drivers to pass hints to irq_set_affinity_hint().
	 */
	/*
	 * NOTE(review): 'sizeof(static_single_cpu_mask)' is the size of the
	 * array pointer, which happens to equal the element (pointer) size
	 * here; 'sizeof(*static_single_cpu_mask)' would be the conventional
	 * spelling -- confirm against the variable's declaration.
	 */
	static_single_cpu_mask = kmalloc_array(mp_maxid + 1,
	    sizeof(static_single_cpu_mask), M_WAITOK | M_ZERO);

	/*
	 * When the number of CPUs reach a threshold, we start to save memory
	 * given the sets are static by overlapping those having their single
	 * bit set at same position in a bitset word. Asymptotically, this
	 * regular scheme is in O(n²) whereas the overlapping one is in O(n)
	 * only with n being the maximum number of CPUs, so the gain will become
	 * huge quite quickly. The threshold for 64-bit architectures is 128
	 * CPUs.
	 */
	if (mp_ncpus < (2 * _BITSET_BITS)) {
		cpumask_t *sscm_ptr;

		/*
		 * This represents 'mp_ncpus * __bitset_words(CPU_SETSIZE) *
		 * (_BITSET_BITS / 8)' bytes (for comparison with the
		 * overlapping scheme).
		 */
		static_single_cpu_mask_lcs = kmalloc_array(mp_ncpus,
		    sizeof(*static_single_cpu_mask_lcs),
		    M_WAITOK | M_ZERO);

		sscm_ptr = static_single_cpu_mask_lcs;
		CPU_FOREACH(i) {
			static_single_cpu_mask[i] = sscm_ptr++;
			CPU_SET(i, static_single_cpu_mask[i]);
		}
	} else {
		/* Pointer to a bitset word. */
		__typeof(((cpuset_t *)NULL)->__bits[0]) *bwp;

		/*
		 * Allocate memory for (static) spans of 'cpumask_t' ('cpuset_t'
		 * really) with a single bit set that can be reused for all
		 * single CPU masks by making them start at different offsets.
		 * We need '__bitset_words(CPU_SETSIZE) - 1' bitset words before
		 * the word having its single bit set, and the same amount
		 * after.
		 */
		static_single_cpu_mask_lcs = mallocarray(_BITSET_BITS,
		    (2 * __bitset_words(CPU_SETSIZE) - 1) * (_BITSET_BITS / 8),
		    M_KMALLOC, M_WAITOK | M_ZERO);

		/*
		 * We rely below on cpuset_t and the bitset generic
		 * implementation assigning words in the '__bits' array in the
		 * same order of bits (i.e., little-endian ordering, not to be
		 * confused with machine endianness, which concerns bits in
		 * words and other integers). This is an imperfect test, but it
		 * will detect a change to big-endian ordering.
		 */
		_Static_assert(
		    __bitset_word(_BITSET_BITS + 1, _BITSET_BITS) == 1,
		    "Assumes a bitset implementation that is little-endian "
		    "on its words");

		/* Initialize the single bit of each static span. */
		bwp = (__typeof(bwp))static_single_cpu_mask_lcs +
		    (__bitset_words(CPU_SETSIZE) - 1);
		for (i = 0; i < _BITSET_BITS; i++) {
			CPU_SET(i, (cpuset_t *)bwp);
			bwp += (2 * __bitset_words(CPU_SETSIZE) - 1);
		}

		/*
		 * Finally set all CPU masks to the proper word in their
		 * relevant span.
		 */
		CPU_FOREACH(i) {
			bwp = (__typeof(bwp))static_single_cpu_mask_lcs;
			/* Find the non-zero word of the relevant span. */
			bwp += (2 * __bitset_words(CPU_SETSIZE) - 1) *
			    (i % _BITSET_BITS) +
			    __bitset_words(CPU_SETSIZE) - 1;
			/* Shift to find the CPU mask start. */
			bwp -= (i / _BITSET_BITS);
			static_single_cpu_mask[i] = (cpuset_t *)bwp;
		}
	}

	strlcpy(init_uts_ns.name.release, osrelease, sizeof(init_uts_ns.name.release));
}
SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);

/*
 * Mirror teardown for linux_compat_init(): releases kobject names,
 * the static CPU mask allocations and the per-CPU cpuinfo array, then
 * destroys the PCI spin lock and the VMA rwlock.
 */
static void
linux_compat_uninit(void *arg)
{
	linux_kobject_kfree_name(&linux_class_root);
	linux_kobject_kfree_name(&linux_root_device.kobj);
	linux_kobject_kfree_name(&linux_class_misc.kobj);

	free(static_single_cpu_mask_lcs, M_KMALLOC);
	free(static_single_cpu_mask, M_KMALLOC);
#if defined(__i386__) || defined(__amd64__)
	free(__cpu_data, M_KMALLOC);
#endif

	spin_lock_destroy(&pci_lock);
	rw_destroy(&linux_vma_lock);
}
SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);

/*
 * NOTE: Linux frequently uses "unsigned long" for pointer to integer
 * conversion and vice versa, where in FreeBSD "uintptr_t" would be
 * used. Assert these types have the same size, else some parts of the
 * LinuxKPI may not work like expected:
 */
CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t));