/* Source: sys/compat/linuxkpi/common/src/linux_compat.c */
/*-
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * Copyright (c) 2013-2021 Mellanox Technologies, Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_global.h"
#include "opt_stack.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/sglist.h>
#include <sys/sleepqueue.h>
#include <sys/refcount.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bus.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/rwlock.h>
#include <sys/mman.h>
#include <sys/stack.h>
#include <sys/stdarg.h>
#include <sys/sysent.h>
#include <sys/time.h>
#include <sys/user.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_radix.h>

#if defined(__i386__) || defined(__amd64__)
#include <machine/cputypes.h>
#include <machine/md_var.h>
#endif

#include <linux/kobject.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/cdev.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
#include <linux/netdevice.h>
#include <linux/timer.h>
#include <linux/interrupt.h>
#include <linux/uaccess.h>
#include <linux/utsname.h>
#include <linux/list.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/compat.h>
#include <linux/io-mapping.h>
#include <linux/poll.h>
#include <linux/smp.h>
#include <linux/wait_bit.h>
#include <linux/rcupdate.h>
#include <linux/interval_tree.h>
#include <linux/interval_tree_generic.h>
#include <linux/printk.h>
#include <linux/seq_file.h>

#if defined(__i386__) || defined(__amd64__)
#include <asm/smp.h>
#include <asm/processor.h>
#endif

#include <xen/xen.h>
#ifdef XENHVM
#undef xen_pv_domain
#undef xen_initial_domain
/* xen/xen-os.h redefines __must_check */
#undef __must_check
#include <xen/xen-os.h>
#endif

SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "LinuxKPI parameters");

int linuxkpi_debug;
SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN,
    &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable.");

int linuxkpi_rcu_debug;
SYSCTL_INT(_compat_linuxkpi, OID_AUTO, rcu_debug, CTLFLAG_RWTUN,
    &linuxkpi_rcu_debug, 0, "Set to enable RCU warning. Clear to disable.");

int linuxkpi_warn_dump_stack = 0;
SYSCTL_INT(_compat_linuxkpi, OID_AUTO, warn_dump_stack, CTLFLAG_RWTUN,
    &linuxkpi_warn_dump_stack, 0,
    "Set to enable stack traces from WARN_ON(). Clear to disable.");

/* Rate-limit state for LinuxKPI network messages. */
static struct timeval lkpi_net_lastlog;
static int lkpi_net_curpps;
static int lkpi_net_maxpps = 99;
SYSCTL_INT(_compat_linuxkpi, OID_AUTO, net_ratelimit, CTLFLAG_RWTUN,
    &lkpi_net_maxpps, 0, "Limit number of LinuxKPI net messages per second.");

MALLOC_DEFINE(M_KMALLOC, "lkpikmalloc", "Linux kmalloc compat");

#include <linux/rbtree.h>
/* Undo Linux compat changes. */
#undef RB_ROOT
#undef file
#undef cdev
#define	RB_ROOT(head)	(head)->rbh_root

static void linux_destroy_dev(struct linux_cdev *);
static void linux_cdev_deref(struct linux_cdev *ldev);
static struct vm_area_struct *linux_cdev_handle_find(void *handle);

cpumask_t cpu_online_mask;
static cpumask_t **static_single_cpu_mask;
static cpumask_t *static_single_cpu_mask_lcs;
struct kobject linux_class_root;
struct device linux_root_device;
struct class linux_class_misc;
struct list_head pci_drivers;
struct list_head pci_devices;
spinlock_t pci_lock;
struct uts_namespace init_uts_ns;

unsigned long linux_timer_hz_mask;

wait_queue_head_t linux_bit_waitq;
wait_queue_head_t linux_var_waitq;

/*
 * Comparison callback for the linux_root RB tree; never expected to be
 * called, so it panics if it ever is.
 */
int
panic_cmp(struct rb_node *one, struct rb_node *two)
{
	panic("no cmp");
}

RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);

#define	START(node)	((node)->start)
#define	LAST(node)	((node)->last)

INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, unsigned long,, START,
    LAST,, lkpi_interval_tree)

/* Default release callback for devices allocated by device_create(). */
static void
linux_device_release(struct device *dev)
{
	pr_debug("linux_device_release: %s\n", dev_name(dev));
	kfree(dev);
}

/* sysfs "show" dispatch for class attributes. */
static ssize_t
linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
	struct class_attribute *dattr;
	ssize_t error;

	dattr = container_of(attr, struct class_attribute, attr);
	error = -EIO;
	if (dattr->show)
		error = dattr->show(container_of(kobj, struct class, kobj),
		    dattr, buf);
	return (error);
}

/* sysfs "store" dispatch for class attributes. */
static ssize_t
linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf,
    size_t count)
{
	struct class_attribute *dattr;
	ssize_t error;

	dattr = container_of(attr, struct class_attribute, attr);
	error = -EIO;
	if (dattr->store)
		error = dattr->store(container_of(kobj, struct class, kobj),
		    dattr, buf, count);
	return (error);
}

/* Invoke the class' release callback when its kobject is dropped. */
static void
linux_class_release(struct kobject *kobj)
{
	struct class *class;

	class = container_of(kobj, struct class, kobj);
	if (class->class_release)
		class->class_release(class);
}

static const struct sysfs_ops linux_class_sysfs = {
	.show  = linux_class_show,
	.store = linux_class_store,
};

const struct kobj_type linux_class_ktype = {
	.release = linux_class_release,
	.sysfs_ops = &linux_class_sysfs
};

static void
linux_dev_release(struct kobject *kobj)
{
	struct device *dev;

	dev = container_of(kobj, struct device, kobj);
	/* This is the precedence defined by linux. */
	if (dev->release)
		dev->release(dev);
	else if (dev->class && dev->class->dev_release)
		dev->class->dev_release(dev);
}

/* sysfs "show" dispatch for device attributes. */
static ssize_t
linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
	struct device_attribute *dattr;
	ssize_t error;

	dattr = container_of(attr, struct device_attribute, attr);
	error = -EIO;
	if (dattr->show)
		error = dattr->show(container_of(kobj, struct device, kobj),
		    dattr, buf);
	return (error);
}

/* sysfs "store" dispatch for device attributes. */
static ssize_t
linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,
    size_t count)
{
	struct device_attribute *dattr;
	ssize_t error;

	dattr = container_of(attr, struct device_attribute, attr);
	error = -EIO;
	if (dattr->store)
		error = dattr->store(container_of(kobj, struct device, kobj),
		    dattr, buf, count);
	return (error);
}

static const struct sysfs_ops linux_dev_sysfs = {
	.show  = linux_dev_show,
	.store = linux_dev_store,
};

const struct kobj_type linux_dev_ktype = {
	.release = linux_dev_release,
	.sysfs_ops = &linux_dev_sysfs
};

/*
 * Allocate, name and register a new device under the given class.
 * The returned device is released through linux_device_release().
 */
struct device *
device_create(struct class *class, struct device *parent, dev_t devt,
    void *drvdata, const char *fmt, ...)
{
	struct device *dev;
	va_list args;

	dev = kzalloc(sizeof(*dev), M_WAITOK);
	dev->parent = parent;
	dev->class = class;
	dev->devt = devt;
	dev->driver_data = drvdata;
	dev->release = linux_device_release;
	va_start(args, fmt);
	kobject_set_name_vargs(&dev->kobj, fmt, args);
	va_end(args);
	device_register(dev);

	return (dev);
}

struct device *
device_create_groups_vargs(struct class *class, struct device *parent,
    dev_t devt, void *drvdata, const struct attribute_group **groups,
    const char *fmt, va_list args)
{
	struct device *dev = NULL;
	int retval = -ENODEV;

	if (class == NULL || IS_ERR(class))
		goto error;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev) {
		retval = -ENOMEM;
		goto error;
	}

	dev->devt = devt;
	dev->class = class;
	dev->parent = parent;
	dev->groups = groups;
	dev->release = device_create_release;
	/* device_initialize() needs the class and parent to be set */
	device_initialize(dev);
	dev_set_drvdata(dev, drvdata);

	retval = kobject_set_name_vargs(&dev->kobj, fmt, args);
	if (retval)
		goto error;

	retval = device_add(dev);
	if (retval)
		goto error;

	return dev;

error:
	/* put_device() is NULL-safe; frees dev on the early-failure path */
	put_device(dev);
	return ERR_PTR(retval);
}

/*
 * Allocate and register a class; returns NULL (not an ERR_PTR) on
 * registration failure.
 */
struct class *
lkpi_class_create(const char *name)
{
	struct class *class;
	int error;

	class = kzalloc(sizeof(*class), M_WAITOK);
	class->name = name;
	class->class_release = linux_class_kfree;
	error = class_register(class);
	if (error) {
		kfree(class);
		return (NULL);
	}

	return (class);
}

/* knlist lock callbacks backed by the per-file f_kqlock spinlock. */
static void
linux_kq_lock(void *arg)
{
	spinlock_t *s = arg;

	spin_lock(s);
}
static void
linux_kq_unlock(void *arg)
{
	spinlock_t *s = arg;

	spin_unlock(s);
}

static void
linux_kq_assert_lock(void *arg, int what)
{
#ifdef INVARIANTS
	spinlock_t *s = arg;

	if (what == LA_LOCKED)
		mtx_assert(s, MA_OWNED);
	else
		mtx_assert(s, MA_NOTOWNED);
#endif
}

static void
linux_file_kqfilter_poll(struct linux_file *, int);

/* Allocate a zeroed linux_file with refcount 1 and kqueue state set up. */
struct linux_file *
linux_file_alloc(void)
{
	struct linux_file *filp;

	filp = kzalloc(sizeof(*filp), GFP_KERNEL);

	/* set initial refcount */
	filp->f_count = 1;

	/* setup fields needed by kqueue support */
	spin_lock_init(&filp->f_kqlock);
	knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock,
	    linux_kq_lock, linux_kq_unlock, linux_kq_assert_lock);

	return (filp);
}

void
linux_file_free(struct linux_file *filp)
{
	if (filp->_file == NULL) {
		if (filp->f_op != NULL && filp->f_op->release != NULL)
			filp->f_op->release(filp->f_vnode, filp);
		if (filp->f_shmem != NULL)
			vm_object_deallocate(filp->f_shmem);
		kfree_rcu(filp, rcu);
	} else {
		/*
		 * The close method of the character
device or file
		 * will free the linux_file structure:
		 */
		_fdrop(filp->_file, curthread);
	}
}

/* Allocate a reference-counted linux_cdev with its kobject initialized. */
struct linux_cdev *
cdev_alloc(void)
{
	struct linux_cdev *cdev;

	cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK);
	kobject_init(&cdev->kobj, &linux_cdev_ktype);
	cdev->refs = 1;
	return (cdev);
}

/*
 * OBJT_DEVICE pager fault handler: map the faulting offset to a
 * fictitious page at the physical address derived from vm_pfn.
 * Called with the VM object write-locked.
 */
static int
linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot,
    vm_page_t *mres)
{
	struct vm_area_struct *vmap;

	vmap = linux_cdev_handle_find(vm_obj->handle);

	MPASS(vmap != NULL);
	MPASS(vmap->vm_private_data == vm_obj->handle);

	if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) {
		vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset;
		vm_page_t page;

		if (((*mres)->flags & PG_FICTITIOUS) != 0) {
			/*
			 * If the passed in result page is a fake
			 * page, update it with the new physical
			 * address.
			 */
			page = *mres;
			vm_page_updatefake(page, paddr, vm_obj->memattr);
		} else {
			/*
			 * Replace the passed in "mres" page with our
			 * own fake page and free up the all of the
			 * original pages.
			 */
			VM_OBJECT_WUNLOCK(vm_obj);
			page = vm_page_getfake(paddr, vm_obj->memattr);
			VM_OBJECT_WLOCK(vm_obj);

			vm_page_replace(page, vm_obj, (*mres)->pindex, *mres);
			*mres = page;
		}
		vm_page_valid(page);
		return (VM_PAGER_OK);
	}
	return (VM_PAGER_FAIL);
}

/*
 * OBJT_MGTDEVICE pager populate handler: run the Linux vm_ops->fault()
 * callback and report the range of pages it installed.
 */
static int
linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type,
    vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
{
	struct vm_area_struct *vmap;
	int err;

	/* get VM area structure */
	vmap = linux_cdev_handle_find(vm_obj->handle);
	MPASS(vmap != NULL);
	MPASS(vmap->vm_private_data == vm_obj->handle);

	VM_OBJECT_WUNLOCK(vm_obj);

	linux_set_current(curthread);

	down_write(&vmap->vm_mm->mmap_sem);
	if (unlikely(vmap->vm_ops == NULL)) {
		err = VM_FAULT_SIGBUS;
	} else {
		struct vm_fault vmf;

		/* fill out VM fault structure */
		vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx);
		vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
		vmf.pgoff = 0;
		vmf.page = NULL;
		vmf.vma = vmap;

		vmap->vm_pfn_count = 0;
		vmap->vm_pfn_pcount = &vmap->vm_pfn_count;
		vmap->vm_obj = vm_obj;

		err = vmap->vm_ops->fault(&vmf);

		/* retry until the handler has installed at least one page */
		while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) {
			kern_yield(PRI_USER);
			err = vmap->vm_ops->fault(&vmf);
		}
	}

	/* translate return code */
	switch (err) {
	case VM_FAULT_OOM:
		err = VM_PAGER_AGAIN;
		break;
	case VM_FAULT_SIGBUS:
		err = VM_PAGER_BAD;
		break;
	case VM_FAULT_NOPAGE:
		/*
		 * By contract the fault handler will return having
		 * busied all the pages itself. If pidx is already
		 * found in the object, it will simply xbusy the first
		 * page and return with vm_pfn_count set to 1.
		 */
		*first = vmap->vm_pfn_first;
		*last = *first + vmap->vm_pfn_count - 1;
		err = VM_PAGER_OK;
		break;
	default:
		err = VM_PAGER_ERROR;
		break;
	}
	up_write(&vmap->vm_mm->mmap_sem);
	VM_OBJECT_WLOCK(vm_obj);
	return (err);
}

/* Global list of active VM area structs, keyed by vm_private_data. */
static struct rwlock linux_vma_lock;
static TAILQ_HEAD(, vm_area_struct) linux_vma_head =
    TAILQ_HEAD_INITIALIZER(linux_vma_head);

static void
linux_cdev_handle_free(struct vm_area_struct *vmap)
{
	/* Drop reference on vm_file */
	if (vmap->vm_file != NULL)
		fput(vmap->vm_file);

	/* Drop reference on mm_struct */
	mmput(vmap->vm_mm);

	kfree(vmap);
}

static void
linux_cdev_handle_remove(struct vm_area_struct *vmap)
{
	rw_wlock(&linux_vma_lock);
	TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry);
	rw_wunlock(&linux_vma_lock);
}

/* Look up the VM area struct registered for the given pager handle. */
static struct vm_area_struct *
linux_cdev_handle_find(void *handle)
{
	struct vm_area_struct *vmap;

	rw_rlock(&linux_vma_lock);
	TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) {
		if (vmap->vm_private_data == handle)
			break;
	}
	rw_runlock(&linux_vma_lock);
	return (vmap);
}

static int
linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t foff, struct ucred *cred, u_short *color)
{

	MPASS(linux_cdev_handle_find(handle) != NULL);
	*color = 0;
	return (0);
}

static void
linux_cdev_pager_dtor(void *handle)
{
	const struct vm_operations_struct *vm_ops;
	struct vm_area_struct *vmap;

	vmap = linux_cdev_handle_find(handle);
	MPASS(vmap != NULL);

	/*
	 * Remove handle before calling close operation to prevent
	 * other threads from reusing the handle pointer.
	 */
	linux_cdev_handle_remove(vmap);

	down_write(&vmap->vm_mm->mmap_sem);
	vm_ops = vmap->vm_ops;
	if (likely(vm_ops != NULL))
		vm_ops->close(vmap);
	up_write(&vmap->vm_mm->mmap_sem);

	linux_cdev_handle_free(vmap);
}

static struct cdev_pager_ops linux_cdev_pager_ops[2] = {
	{
		/* OBJT_MGTDEVICE */
		.cdev_pg_populate = linux_cdev_pager_populate,
		.cdev_pg_ctor = linux_cdev_pager_ctor,
		.cdev_pg_dtor = linux_cdev_pager_dtor
	},
	{
		/* OBJT_DEVICE */
		.cdev_pg_fault = linux_cdev_pager_fault,
		.cdev_pg_ctor = linux_cdev_pager_ctor,
		.cdev_pg_dtor = linux_cdev_pager_dtor
	},
};

/* Unmap all physical mappings in the given range of the vma's VM object. */
int
zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
    unsigned long size)
{
	struct pctrie_iter pages;
	vm_object_t obj;
	vm_page_t m;

	obj = vma->vm_obj;
	if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0)
		return (-ENOTSUP);
	VM_OBJECT_RLOCK(obj);
	vm_page_iter_limit_init(&pages, obj, OFF_TO_IDX(address + size));
	VM_RADIX_FOREACH_FROM(m, &pages, OFF_TO_IDX(address))
		pmap_remove_all(m);
	VM_OBJECT_RUNLOCK(obj);
	return (0);
}

void
vma_set_file(struct vm_area_struct *vma, struct linux_file *file)
{
	struct linux_file *tmp;

	/* Changing an anonymous vma with this is illegal */
	get_file(file);
	tmp = vma->vm_file;
	vma->vm_file = file;
	fput(tmp);
}

static struct file_operations dummy_ldev_ops = {
	/* XXXKIB */
};

static struct linux_cdev dummy_ldev = {
	.ops = &dummy_ldev_ops,
};

/* Flag bits in linux_cdev siref: destroy-in-progress and reference unit. */
#define	LDEV_SI_DTR	0x0001
#define	LDEV_SI_REF	0x0002

/*
 * Resolve the file operations and cdev for a file, taking a reference on
 * the cdev.  If the cdev is being destroyed, fall back to dummy_ldev.
 */
static void
linux_get_fop(struct linux_file *filp, const struct file_operations **fop,
    struct linux_cdev **dev)
{
	struct linux_cdev *ldev;
	u_int siref;

	ldev = filp->f_cdev;
	*fop = filp->f_op;
	if (ldev != NULL) {
		if (ldev->kobj.ktype == &linux_cdev_static_ktype) {
			refcount_acquire(&ldev->refs);
		} else {
			for (siref = ldev->siref;;) {
				if ((siref & LDEV_SI_DTR) != 0) {
					ldev = &dummy_ldev;
					*fop = ldev->ops;
					siref = ldev->siref;
					MPASS((ldev->siref & LDEV_SI_DTR) == 0);
				} else if (atomic_fcmpset_int(&ldev->siref,
				    &siref, siref + LDEV_SI_REF)) {
					break;
				}
			}
		}
	}
	*dev = ldev;
}

/* Release the cdev reference taken by linux_get_fop(). */
static void
linux_drop_fop(struct linux_cdev *ldev)
{

	if (ldev == NULL)
		return;
	if (ldev->kobj.ktype == &linux_cdev_static_ktype) {
		linux_cdev_deref(ldev);
	} else {
		MPASS(ldev->kobj.ktype == &linux_cdev_ktype);
		MPASS((ldev->siref & ~LDEV_SI_DTR) != 0);
		atomic_subtract_int(&ldev->siref, LDEV_SI_REF);
	}
}

/* Run "code" with td_fpop temporarily set to "fp". */
#define	OPW(fp, td, code) ({			\
	struct file *__fpop;			\
	__typeof(code) __retval;		\
						\
	__fpop = (td)->td_fpop;			\
	(td)->td_fpop = (fp);			\
	__retval = (code);			\
	(td)->td_fpop = __fpop;			\
	__retval;				\
})

static int
linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td,
    struct file *file)
{
	struct linux_cdev *ldev;
	struct linux_file *filp;
	const struct file_operations *fop;
	int error;

	ldev = dev->si_drv1;

	filp = linux_file_alloc();
	filp->f_dentry = &filp->f_dentry_store;
	filp->f_op = ldev->ops;
	filp->f_mode = file->f_flag;
	filp->f_flags = file->f_flag;
	filp->f_vnode = file->f_vnode;
	filp->_file = file;
	refcount_acquire(&ldev->refs);
	filp->f_cdev = ldev;

	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);

	if (fop->open != NULL) {
		error = -fop->open(file->f_vnode, filp);
		if (error != 0) {
			linux_drop_fop(ldev);
			linux_cdev_deref(filp->f_cdev);
			kfree(filp);
			return (error);
		}
	}

	/* hold on to the vnode - used for fstat() */
	vref(filp->f_vnode);

	/* release the file from devfs */
	finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
	linux_drop_fop(ldev);
	/*
	 * NOTE(review): ENXIO after a successful finit() — presumably a
	 * sentinel devfs recognizes as "file already installed"; confirm
	 * against devfs fdopen handling.
	 */
	return (ENXIO);
}

/*
 * Window of fake user-space addresses used to pass kernel ioctl buffers
 * through Linux code that expects user pointers; see linux_remap_address().
 */
#define	LINUX_IOCTL_MIN_PTR 0x10000UL
#define	LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX)

static inline int
linux_remap_address(void **uaddr, size_t len)
{
	uintptr_t uaddr_val = (uintptr_t)(*uaddr);

	if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR &&
	    uaddr_val < LINUX_IOCTL_MAX_PTR)) {
		struct task_struct *pts = current;
		if (pts == NULL) {
			*uaddr = NULL;
			return (1);
		}

		/* compute data offset */
		uaddr_val -= LINUX_IOCTL_MIN_PTR;

		/* check that length is within bounds */
		if ((len > IOCPARM_MAX) ||
		    (uaddr_val + len) > pts->bsd_ioctl_len) {
			*uaddr = NULL;
			return (1);
		}

		/* re-add kernel buffer address */
		uaddr_val += (uintptr_t)pts->bsd_ioctl_data;

		/* update address location */
		*uaddr = (void *)uaddr_val;
		return (1);
	}
	return (0);
}

int
linux_copyin(const void *uaddr, void *kaddr, size_t len)
{
	if (linux_remap_address(__DECONST(void **, &uaddr), len)) {
		if (uaddr == NULL)
			return (-EFAULT);
		memcpy(kaddr, uaddr, len);
		return (0);
	}
	return (-copyin(uaddr, kaddr, len));
}

int
linux_copyout(const void *kaddr, void *uaddr, size_t len)
{
	if (linux_remap_address(&uaddr, len)) {
		if (uaddr == NULL)
			return (-EFAULT);
		memcpy(uaddr, kaddr, len);
		return (0);
	}
	return (-copyout(kaddr, uaddr, len));
}

/*
 * Zero a user-space range; returns 0 on success or the original length
 * on fault (Linux clear_user() semantics).
 */
size_t
linux_clear_user(void *_uaddr, size_t _len)
{
	uint8_t *uaddr = _uaddr;
	size_t len = _len;

	/* make sure uaddr is aligned before going into the fast loop */
	while (((uintptr_t)uaddr & 7) != 0 && len > 7) {
		if (subyte(uaddr, 0))
			return (_len);
		uaddr++;
		len--;
	}

	/* zero 8 bytes at a time */
	while (len > 7) {
#ifdef __LP64__
		if (suword64(uaddr, 0))
			return (_len);
#else
		if 
(suword32(uaddr, 0))
			return (_len);
		if (suword32(uaddr + 4, 0))
			return (_len);
#endif
		uaddr += 8;
		len -= 8;
	}

	/* zero fill end, if any */
	while (len > 0) {
		if (subyte(uaddr, 0))
			return (_len);
		uaddr++;
		len--;
	}
	return (0);
}

int
linux_access_ok(const void *uaddr, size_t len)
{
	uintptr_t saddr;
	uintptr_t eaddr;

	/* get start and end address */
	saddr = (uintptr_t)uaddr;
	eaddr = (uintptr_t)uaddr + len;

	/* verify addresses are valid for userspace */
	return ((saddr == eaddr) ||
	    (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS));
}

/*
 * This function should return either EINTR or ERESTART depending on
 * the signal type sent to this thread:
 */
static int
linux_get_error(struct task_struct *task, int error)
{
	/* check for signal type interrupt code */
	if (error == EINTR || error == ERESTARTSYS || error == ERESTART) {
		error = -linux_schedule_get_interrupt_value(task);
		if (error == 0)
			error = EINTR;
	}
	return (error);
}

static int
linux_file_ioctl_sub(struct file *fp, struct linux_file *filp,
    const struct file_operations *fop, u_long cmd, caddr_t data,
    struct thread *td)
{
	struct task_struct *task = current;
	unsigned size;
	int error;

	size = IOCPARM_LEN(cmd);
	/* refer to logic in sys_ioctl() */
	if (size > 0) {
		/*
		 * Setup hint for linux_copyin() and linux_copyout().
		 *
		 * Background: Linux code expects a user-space address
		 * while FreeBSD supplies a kernel-space address.
		 */
		task->bsd_ioctl_data = data;
		task->bsd_ioctl_len = size;
		data = (void *)LINUX_IOCTL_MIN_PTR;
	} else {
		/* fetch user-space pointer */
		data = *(void **)data;
	}
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		/* try the compat IOCTL handler first */
		if (fop->compat_ioctl != NULL) {
			error = -OPW(fp, td, fop->compat_ioctl(filp,
			    cmd, (u_long)data));
		} else {
			error = ENOTTY;
		}

		/* fallback to the regular IOCTL handler, if any */
		if (error == ENOTTY && fop->unlocked_ioctl != NULL) {
			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
			    cmd, (u_long)data));
		}
	} else
#endif
	{
		if (fop->unlocked_ioctl != NULL) {
			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
			    cmd, (u_long)data));
		} else {
			error = ENOTTY;
		}
	}
	if (size > 0) {
		task->bsd_ioctl_data = NULL;
		task->bsd_ioctl_len = 0;
	}

	if (error == EWOULDBLOCK) {
		/* update kqfilter status, if any */
		linux_file_kqfilter_poll(filp,
		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
	} else {
		error = linux_get_error(task, error);
	}
	return (error);
}

/* Sentinel poll_table marking a call from the select/poll system call. */
#define	LINUX_POLL_TABLE_NORMAL ((poll_table *)1)

/*
 * This function atomically updates the poll wakeup state and returns
 * the previous state at the time of update.
 */
static uint8_t
linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate)
{
	int c, old;

	c = v->counter;

	while ((old = atomic_cmpxchg(v, c, pstate[c])) != c)
		c = old;

	return (c);
}

static int
linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key)
{
	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT,	/* NOP */
		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY,
		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */
	};
	struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq);

	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
	case LINUX_FWQ_STATE_QUEUED:
		linux_poll_wakeup(filp);
		return (1);
	default:
		return (0);
	}
}

void
linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p)
{
	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY,
		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */
		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED,
	};

	/* check if we are called inside the select system call */
	if (p == LINUX_POLL_TABLE_NORMAL)
		selrecord(curthread, &filp->f_selinfo);

	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
	case LINUX_FWQ_STATE_INIT:
		/* NOTE: file handles can only belong to one wait-queue */
		filp->f_wait_queue.wqh = wqh;
		filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback;
		add_wait_queue(wqh, &filp->f_wait_queue.wq);
		atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED);
		break;
	default:
		break;
	}
}

static void
linux_poll_wait_dequeue(struct linux_file *filp)
{
	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT,	/* NOP */
		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT,
		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT,
		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT,
	};

	seldrain(&filp->f_selinfo);

	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
	case LINUX_FWQ_STATE_NOT_READY:
	case LINUX_FWQ_STATE_QUEUED:
	case LINUX_FWQ_STATE_READY:
		remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq);
		break;
	default:
		break;
	}
}

void
linux_poll_wakeup(struct linux_file *filp)
{
	/* this function should be NULL-safe */
	if (filp == NULL)
		return;

	selwakeup(&filp->f_selinfo);

	spin_lock(&filp->f_kqlock);
	filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ |
	    LINUX_KQ_FLAG_NEED_WRITE;

	/* make sure the "knote" gets woken up */
	KNOTE_LOCKED(&filp->f_selinfo.si_note, 1);
	spin_unlock(&filp->f_kqlock);
}

/*
 * Try to take a reference on *f; returns NULL if *f is NULL, the file on
 * success, or ERR_PTR(-EAGAIN) if the pointer changed or the refcount
 * was already zero (caller should retry).
 */
static struct linux_file *
__get_file_rcu(struct linux_file **f)
{
	struct linux_file *file1, *file2;

	file1 = READ_ONCE(*f);
	if (file1 == NULL)
		return (NULL);

	if (!refcount_acquire_if_not_zero(
	    file1->_file == NULL ? &file1->f_count : &file1->_file->f_count))
		return (ERR_PTR(-EAGAIN));

	file2 = READ_ONCE(*f);
	if (file2 == file1)
		return (file2);

	fput(file1);
	return (ERR_PTR(-EAGAIN));
}

struct linux_file *
linux_get_file_rcu(struct linux_file **f)
{
	struct linux_file *file1;

	for (;;) {
		file1 = __get_file_rcu(f);
		if (file1 == NULL)
			return (NULL);

		if (IS_ERR(file1))
			continue;

		return (file1);
	}
}

struct linux_file *
get_file_active(struct linux_file **f)
{
	struct linux_file *file1;

	rcu_read_lock();
	file1 = __get_file_rcu(f);
	rcu_read_unlock();
	if (IS_ERR(file1))
		file1 = NULL;

	return (file1);
}

static void
linux_file_kqfilter_detach(struct knote *kn)
{
	struct linux_file *filp = kn->kn_hook;

	spin_lock(&filp->f_kqlock);
	knlist_remove(&filp->f_selinfo.si_note, kn, 1);
	spin_unlock(&filp->f_kqlock);
}

static int
linux_file_kqfilter_read_event(struct knote *kn, long hint)
{
	struct linux_file *filp = kn->kn_hook;

	mtx_assert(&filp->f_kqlock, MA_OWNED);

	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0);
}

static int
linux_file_kqfilter_write_event(struct knote *kn, long hint)
{
	struct linux_file *filp = kn->kn_hook;

	mtx_assert(&filp->f_kqlock, MA_OWNED);

	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0);
}

static const struct filterops linux_dev_kqfiltops_read = {
	.f_isfd = 1,
	.f_detach = linux_file_kqfilter_detach,
	.f_event = linux_file_kqfilter_read_event,
};

static const struct filterops linux_dev_kqfiltops_write = {
	.f_isfd = 1,
	.f_detach = linux_file_kqfilter_detach,
	.f_event = linux_file_kqfilter_write_event,
};

/*
 * Poll the file once and fold the result into the kqueue NEED_READ /
 * NEED_WRITE flags, waking any registered knotes.
 */
static void
linux_file_kqfilter_poll(struct linux_file *filp, int kqflags)
{
	struct thread *td;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int temp;

	if ((filp->f_kqflags & kqflags) == 0)
		return;

	td = curthread;

	linux_get_fop(filp, &fop, &ldev);
	/* get the latest polling state */
	temp = OPW(filp->_file, td, fop->poll(filp, NULL));
	linux_drop_fop(ldev);

	spin_lock(&filp->f_kqlock);
	/* clear kqflags */
	filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |
	    LINUX_KQ_FLAG_NEED_WRITE);
	/* update kqflags */
	if ((temp & (POLLIN | POLLOUT)) != 0) {
		if ((temp & POLLIN) != 0)
			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;
		if ((temp & POLLOUT) != 0)
			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;

		/* make sure the "knote" gets woken up */
		KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);
	}
	spin_unlock(&filp->f_kqlock);
}

static int
linux_file_kqfilter(struct file *file, struct knote *kn)
{
	struct linux_file *filp;
	struct thread *td;
	int error;

	td = curthread;
	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	if (filp->f_op->poll == NULL)
		return (EINVAL);

	spin_lock(&filp->f_kqlock);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ;
		kn->kn_fop = &linux_dev_kqfiltops_read;
		kn->kn_hook = filp;
		knlist_add(&filp->f_selinfo.si_note, kn, 1);
		error = 0;
		break;
	case EVFILT_WRITE:
		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE;
		kn->kn_fop = &linux_dev_kqfiltops_write;
		kn->kn_hook =
filp;1242knlist_add(&filp->f_selinfo.si_note, kn, 1);1243error = 0;1244break;1245default:1246error = EINVAL;1247break;1248}1249spin_unlock(&filp->f_kqlock);12501251if (error == 0) {1252linux_set_current(td);12531254/* update kqfilter status, if any */1255linux_file_kqfilter_poll(filp,1256LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);1257}1258return (error);1259}12601261static int1262linux_file_mmap_single(struct file *fp, const struct file_operations *fop,1263vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,1264int nprot, bool is_shared, struct thread *td)1265{1266struct task_struct *task;1267struct vm_area_struct *vmap;1268struct mm_struct *mm;1269struct linux_file *filp;1270vm_memattr_t attr;1271int error;12721273filp = (struct linux_file *)fp->f_data;1274filp->f_flags = fp->f_flag;12751276if (fop->mmap == NULL)1277return (EOPNOTSUPP);12781279linux_set_current(td);12801281/*1282* The same VM object might be shared by multiple processes1283* and the mm_struct is usually freed when a process exits.1284*1285* The atomic reference below makes sure the mm_struct is1286* available as long as the vmap is in the linux_vma_head.1287*/1288task = current;1289mm = task->mm;1290if (atomic_inc_not_zero(&mm->mm_users) == 0)1291return (EINVAL);12921293vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);1294vmap->vm_start = 0;1295vmap->vm_end = size;1296vmap->vm_pgoff = *offset / PAGE_SIZE;1297vmap->vm_pfn = 0;1298vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL);1299if (is_shared)1300vmap->vm_flags |= VM_SHARED;1301vmap->vm_ops = NULL;1302vmap->vm_file = get_file(filp);1303vmap->vm_mm = mm;13041305if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {1306error = linux_get_error(task, EINTR);1307} else {1308error = -OPW(fp, td, fop->mmap(filp, vmap));1309error = linux_get_error(task, error);1310up_write(&vmap->vm_mm->mmap_sem);1311}13121313if (error != 0) {1314linux_cdev_handle_free(vmap);1315return (error);1316}13171318attr = 
pgprot2cachemode(vmap->vm_page_prot);

	if (vmap->vm_ops != NULL) {
		struct vm_area_struct *ptr;
		void *vm_private_data;
		bool vm_no_fault;

		/* the driver installed vm_ops: require a complete contract */
		if (vmap->vm_ops->open == NULL ||
		    vmap->vm_ops->close == NULL ||
		    vmap->vm_private_data == NULL) {
			/* free allocated VM area struct */
			linux_cdev_handle_free(vmap);
			return (EINVAL);
		}

		vm_private_data = vmap->vm_private_data;

		rw_wlock(&linux_vma_lock);
		TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
			if (ptr->vm_private_data == vm_private_data)
				break;
		}
		/* check if there is an existing VM area struct */
		if (ptr != NULL) {
			/* check if the VM area structure is invalid */
			if (ptr->vm_ops == NULL ||
			    ptr->vm_ops->open == NULL ||
			    ptr->vm_ops->close == NULL) {
				error = ESTALE;
				vm_no_fault = 1;
			} else {
				error = EEXIST;
				vm_no_fault = (ptr->vm_ops->fault == NULL);
			}
		} else {
			/* insert VM area structure into list */
			TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry);
			error = 0;
			vm_no_fault = (vmap->vm_ops->fault == NULL);
		}
		rw_wunlock(&linux_vma_lock);

		if (error != 0) {
			/* free allocated VM area struct */
			linux_cdev_handle_free(vmap);
			/* check for stale VM area struct */
			if (error != EEXIST)
				return (error);
		}

		/* check if there is no fault handler */
		if (vm_no_fault) {
			*object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE,
			    &linux_cdev_pager_ops[1], size, nprot, *offset,
			    td->td_ucred);
		} else {
			*object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE,
			    &linux_cdev_pager_ops[0], size, nprot, *offset,
			    td->td_ucred);
		}

		/* check if allocating the VM object failed */
		if (*object == NULL) {
			if (error == 0) {
				/* remove VM area struct from list */
				linux_cdev_handle_remove(vmap);
				/* free allocated VM area struct */
				linux_cdev_handle_free(vmap);
			}
			return (EINVAL);
		}
	} else {
		struct sglist *sg;

		/* no vm_ops: map the driver-provided PFN range directly */
		sg = sglist_alloc(1,
M_WAITOK);
		sglist_append_phys(sg,
		    (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len);

		*object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len,
		    nprot, 0, td->td_ucred);

		linux_cdev_handle_free(vmap);

		if (*object == NULL) {
			sglist_free(sg);
			return (EINVAL);
		}
	}

	/* propagate the page protection's cache mode to the VM object */
	if (attr != VM_MEMATTR_DEFAULT) {
		VM_OBJECT_WLOCK(*object);
		vm_object_set_memattr(*object, attr);
		VM_OBJECT_WUNLOCK(*object);
	}
	*offset = 0;
	return (0);
}

/* cdevsw used for all LinuxKPI-registered character devices. */
struct cdevsw linuxcdevsw = {
	.d_version = D_VERSION,
	.d_fdopen = linux_dev_fdopen,
	.d_name = "lkpidev",
};

/*
 * fo_read() handler: forward a single-iovec read to the Linux file's
 * read() method, advancing the uio by the number of bytes consumed.
 */
static int
linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	ssize_t bytes;
	int error;

	error = 0;
	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	/* XXX no support for I/O vectors currently */
	if (uio->uio_iovcnt != 1)
		return (EOPNOTSUPP);
	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
		return (EINVAL);
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->read != NULL) {
		bytes = OPW(file, td, fop->read(filp,
		    uio->uio_iov->iov_base,
		    uio->uio_iov->iov_len, &uio->uio_offset));
		if (bytes >= 0) {
			uio->uio_iov->iov_base =
			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
			uio->uio_iov->iov_len -= bytes;
			uio->uio_resid -= bytes;
		} else {
			error = linux_get_error(current, -bytes);
		}
	} else
		error = ENXIO;

	/* update kqfilter status, if any */
	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ);
	linux_drop_fop(ldev);

	return (error);
}

/*
 * fo_write() handler: forward a single-iovec write to the Linux file's
 * write() method, advancing the uio by the number of bytes written.
 */
static int
linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	ssize_t
bytes;
	int error;

	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	/* XXX no support for I/O vectors currently */
	if (uio->uio_iovcnt != 1)
		return (EOPNOTSUPP);
	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
		return (EINVAL);
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->write != NULL) {
		bytes = OPW(file, td, fop->write(filp,
		    uio->uio_iov->iov_base,
		    uio->uio_iov->iov_len, &uio->uio_offset));
		if (bytes >= 0) {
			uio->uio_iov->iov_base =
			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
			uio->uio_iov->iov_len -= bytes;
			uio->uio_resid -= bytes;
			error = 0;
		} else {
			error = linux_get_error(current, -bytes);
		}
	} else
		error = ENXIO;

	/* update kqfilter status, if any */
	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE);

	linux_drop_fop(ldev);

	return (error);
}

/*
 * fo_poll() handler: query the Linux file's poll() method and mask the
 * result with the events the caller asked about.
 */
static int
linux_file_poll(struct file *file, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int revents;

	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->poll != NULL) {
		revents = OPW(file, td, fop->poll(filp,
		    LINUX_POLL_TABLE_NORMAL)) & events;
	} else {
		revents = 0;
	}
	linux_drop_fop(ldev);
	return (revents);
}

/*
 * fo_close() handler: run the Linux release() method, drop the vnode
 * and cdev references, and free the Linux file after an RCU grace
 * period so concurrent lockless readers are safe.
 */
static int
linux_file_close(struct file *file, struct thread *td)
{
	struct linux_file *filp;
	int (*release)(struct inode *, struct linux_file *);
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int error;

	filp = (struct linux_file *)file->f_data;

	KASSERT(file_count(filp) == 0,
	    ("File refcount(%d) is not zero", file_count(filp)));

	if (td == NULL)
		td = curthread;

	error = 0;
	filp->f_flags =
file->f_flag;
	linux_set_current(td);
	linux_poll_wait_dequeue(filp);
	linux_get_fop(filp, &fop, &ldev);
	/*
	 * Always use the real release function, if any, to avoid
	 * leaking device resources:
	 */
	release = filp->f_op->release;
	if (release != NULL)
		error = -OPW(file, td, release(filp->f_vnode, filp));
	funsetown(&filp->f_sigio);
	if (filp->f_vnode != NULL)
		vrele(filp->f_vnode);
	linux_drop_fop(ldev);
	ldev = filp->f_cdev;
	if (ldev != NULL)
		linux_cdev_deref(ldev);
	/* wait for lockless readers before freeing the file */
	linux_synchronize_rcu(RCU_TYPE_REGULAR);
	kfree(filp);

	return (error);
}

/*
 * fo_ioctl() handler: service the generic FreeBSD file ioctls locally
 * and forward everything else to the Linux ioctl path via
 * linux_file_ioctl_sub().
 */
static int
linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	struct fiodgname_arg *fgn;
	const char *p;
	int error, i;

	error = 0;
	filp = (struct linux_file *)fp->f_data;
	filp->f_flags = fp->f_flag;
	linux_get_fop(filp, &fop, &ldev);

	linux_set_current(td);
	switch (cmd) {
	case FIONBIO:
		break;
	case FIOASYNC:
		if (fop->fasync == NULL)
			break;
		error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC));
		break;
	case FIOSETOWN:
		error = fsetown(*(int *)data, &filp->f_sigio);
		if (error == 0) {
			if (fop->fasync == NULL)
				break;
			error = -OPW(fp, td, fop->fasync(0, filp,
			    fp->f_flag & FASYNC));
		}
		break;
	case FIOGETOWN:
		*(int *)data = fgetown(&filp->f_sigio);
		break;
	case FIODGNAME:
#ifdef COMPAT_FREEBSD32
	case FIODGNAME_32:
#endif
		if (filp->f_cdev == NULL || filp->f_cdev->cdev == NULL) {
			error = ENXIO;
			break;
		}
		fgn = data;
		p = devtoname(filp->f_cdev->cdev);
		i = strlen(p) + 1;
		if (i > fgn->len) {
			error = EINVAL;
			break;
		}
		error = copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i);
		break;
	default:
		error = linux_file_ioctl_sub(fp, filp, fop, cmd, data,
td);
		break;
	}
	linux_drop_fop(ldev);
	return (error);
}

/*
 * Validate protection flags for a device mapping and dispatch to
 * linux_file_mmap_single().  Character devices only provide shared
 * mappings, so MAP_PRIVATE/MAP_COPY are rejected.
 */
static int
linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t maxprot, int flags, struct file *fp,
    vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp)
{
	/*
	 * Character devices do not provide private mappings
	 * of any kind:
	 */
	if ((maxprot & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0)
		return (EINVAL);

	return (linux_file_mmap_single(fp, fop, foff, objsize, objp,
	    (int)prot, (flags & MAP_SHARED) ? true : false, td));
}

/*
 * fo_mmap() handler: derive the maximum protection from the open mode
 * and mount flags, then map the device-backed VM object into the
 * requested address space.
 */
static int
linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
    vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	struct mount *mp;
	struct vnode *vp;
	vm_object_t object;
	vm_prot_t maxprot;
	int error;

	filp = (struct linux_file *)fp->f_data;

	vp = filp->f_vnode;
	if (vp == NULL)
		return (EOPNOTSUPP);

	/*
	 * Ensure that file and memory protections are
	 * compatible.
	 */
	mp = vp->v_mount;
	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
		maxprot = VM_PROT_NONE;
		if ((prot & VM_PROT_EXECUTE) != 0)
			return (EACCES);
	} else
		maxprot = VM_PROT_EXECUTE;
	if ((fp->f_flag & FREAD) != 0)
		maxprot |= VM_PROT_READ;
	else if ((prot & VM_PROT_READ) != 0)
		return (EACCES);

	/*
	 * If we are sharing potential changes via MAP_SHARED and we
	 * are trying to get write permission although we opened it
	 * without asking for it, bail out.
	 *
	 * Note that most character devices always share mappings.
	 *
	 * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE
	 * requests rather than doing it here.
	 */
	if ((flags & MAP_SHARED) != 0) {
		if
((fp->f_flag & FWRITE) != 0)
			maxprot |= VM_PROT_WRITE;
		else if ((prot & VM_PROT_WRITE) != 0)
			return (EACCES);
	}
	maxprot &= cap_maxprot;

	linux_get_fop(filp, &fop, &ldev);
	error = linux_file_mmap_sub(td, size, prot, maxprot, flags, fp,
	    &foff, fop, &object);
	if (error != 0)
		goto out;

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, FALSE, td);
	if (error != 0)
		vm_object_deallocate(object);
out:
	linux_drop_fop(ldev);
	return (error);
}

/* fo_stat() handler: stat the backing vnode, if there is one. */
static int
linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
{
	struct linux_file *filp;
	struct vnode *vp;
	int error;

	filp = (struct linux_file *)fp->f_data;
	if (filp->f_vnode == NULL)
		return (EOPNOTSUPP);

	vp = filp->f_vnode;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = VOP_STAT(vp, sb, curthread->td_ucred, NOCRED);
	VOP_UNLOCK(vp);

	return (error);
}

/*
 * fo_fill_kinfo() handler: report the file as a vnode when one is
 * attached, otherwise as a plain device file.  The filedesc lock is
 * dropped around the vnode query since it may sleep.
 */
static int
linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif,
    struct filedesc *fdp)
{
	struct linux_file *filp;
	struct vnode *vp;
	int error;

	filp = fp->f_data;
	vp = filp->f_vnode;
	if (vp == NULL) {
		error = 0;
		kif->kf_type = KF_TYPE_DEV;
	} else {
		vref(vp);
		FILEDESC_SUNLOCK(fdp);
		error = vn_fill_kinfo_vnode(vp, kif);
		vrele(vp);
		kif->kf_type = KF_TYPE_VNODE;
		FILEDESC_SLOCK(fdp);
	}
	return (error);
}

/*
 * Return the Linux minor number of the given inode, or -1U when the
 * inode is not backed by a LinuxKPI character device.
 */
unsigned int
linux_iminor(struct inode *inode)
{
	struct linux_cdev *ldev;

	if (inode == NULL || inode->v_rdev == NULL ||
	    inode->v_rdev->si_devsw != &linuxcdevsw)
		return (-1U);
	ldev = inode->v_rdev->si_drv1;
	if (ldev == NULL)
		return (-1U);

	return (minor(ldev->dev));
}

/* fo_cmp() handler: compare two files by their underlying linux_cdev. */
static int
linux_file_kcmp(struct file *fp1, struct file *fp2, struct thread *td)
{
	struct linux_file *filp1, *filp2;

	if (fp2->f_type != DTYPE_DEV)
		return (3);

	filp1 = fp1->f_data;
	filp2 =
fp2->f_data;1791return (kcmp_cmp((uintptr_t)filp1->f_cdev, (uintptr_t)filp2->f_cdev));1792}17931794const struct fileops linuxfileops = {1795.fo_read = linux_file_read,1796.fo_write = linux_file_write,1797.fo_truncate = invfo_truncate,1798.fo_kqfilter = linux_file_kqfilter,1799.fo_stat = linux_file_stat,1800.fo_fill_kinfo = linux_file_fill_kinfo,1801.fo_poll = linux_file_poll,1802.fo_close = linux_file_close,1803.fo_ioctl = linux_file_ioctl,1804.fo_mmap = linux_file_mmap,1805.fo_chmod = invfo_chmod,1806.fo_chown = invfo_chown,1807.fo_sendfile = invfo_sendfile,1808.fo_cmp = linux_file_kcmp,1809.fo_flags = DFLAG_PASSABLE,1810};18111812/*1813* Hash of vmmap addresses. This is infrequently accessed and does not1814* need to be particularly large. This is done because we must store the1815* caller's idea of the map size to properly unmap.1816*/1817struct vmmap {1818LIST_ENTRY(vmmap) vm_next;1819void *vm_addr;1820unsigned long vm_size;1821};18221823struct vmmaphd {1824struct vmmap *lh_first;1825};1826#define VMMAP_HASH_SIZE 641827#define VMMAP_HASH_MASK (VMMAP_HASH_SIZE - 1)1828#define VM_HASH(addr) ((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK1829static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE];1830static struct mtx vmmaplock;18311832static void1833vmmap_add(void *addr, unsigned long size)1834{1835struct vmmap *vmmap;18361837vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL);1838mtx_lock(&vmmaplock);1839vmmap->vm_size = size;1840vmmap->vm_addr = addr;1841LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next);1842mtx_unlock(&vmmaplock);1843}18441845static struct vmmap *1846vmmap_remove(void *addr)1847{1848struct vmmap *vmmap;18491850mtx_lock(&vmmaplock);1851LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next)1852if (vmmap->vm_addr == addr)1853break;1854if (vmmap)1855LIST_REMOVE(vmmap, vm_next);1856mtx_unlock(&vmmaplock);18571858return (vmmap);1859}18601861#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || 
defined(__riscv)1862void *1863_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr)1864{1865void *addr;18661867addr = pmap_mapdev_attr(phys_addr, size, attr);1868if (addr == NULL)1869return (NULL);1870vmmap_add(addr, size);18711872return (addr);1873}1874#endif18751876void1877iounmap(void *addr)1878{1879struct vmmap *vmmap;18801881vmmap = vmmap_remove(addr);1882if (vmmap == NULL)1883return;1884#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv)1885pmap_unmapdev(addr, vmmap->vm_size);1886#endif1887kfree(vmmap);1888}18891890void *1891vmap(struct page **pages, unsigned int count, unsigned long flags, int prot)1892{1893vm_offset_t off;1894size_t size;18951896size = count * PAGE_SIZE;1897off = kva_alloc(size);1898if (off == 0)1899return (NULL);1900vmmap_add((void *)off, size);1901pmap_qenter(off, pages, count);19021903return ((void *)off);1904}19051906void1907vunmap(void *addr)1908{1909struct vmmap *vmmap;19101911vmmap = vmmap_remove(addr);1912if (vmmap == NULL)1913return;1914pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE);1915kva_free((vm_offset_t)addr, vmmap->vm_size);1916kfree(vmmap);1917}19181919static char *1920devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap)1921{1922unsigned int len;1923char *p;1924va_list aq;19251926va_copy(aq, ap);1927len = vsnprintf(NULL, 0, fmt, aq);1928va_end(aq);19291930if (dev != NULL)1931p = devm_kmalloc(dev, len + 1, gfp);1932else1933p = kmalloc(len + 1, gfp);1934if (p != NULL)1935vsnprintf(p, len + 1, fmt, ap);19361937return (p);1938}19391940char *1941kvasprintf(gfp_t gfp, const char *fmt, va_list ap)1942{19431944return (devm_kvasprintf(NULL, gfp, fmt, ap));1945}19461947char *1948lkpi_devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...)1949{1950va_list ap;1951char *p;19521953va_start(ap, fmt);1954p = devm_kvasprintf(dev, gfp, fmt, ap);1955va_end(ap);19561957return (p);1958}19591960char *1961kasprintf(gfp_t gfp, 
const char *fmt, ...)
{
	va_list ap;
	char *p;

	va_start(ap, fmt);
	p = kvasprintf(gfp, fmt, ap);
	va_end(ap);

	return (p);
}

/* printf-style output callback for lkpi_hex_dump() (console variant). */
int
__lkpi_hexdump_printf(void *arg1 __unused, const char *fmt, ...)
{
	va_list ap;
	int result;

	va_start(ap, fmt);
	result = vprintf(fmt, ap);
	va_end(ap);
	return (result);
}

/* printf-style output callback for lkpi_hex_dump() (sbuf variant). */
int
__lkpi_hexdump_sbuf_printf(void *arg1, const char *fmt, ...)
{
	va_list ap;
	int result;

	va_start(ap, fmt);
	result = sbuf_vprintf(arg1, fmt, ap);
	va_end(ap);
	return (result);
}

/*
 * Dump a buffer in rows of "rowsize" groups of "groupsize" bytes via
 * the supplied printf-like callback, optionally prefixed by the buffer
 * address or offset.
 * NOTE(review): the "ascii" argument is accepted but not acted on here.
 */
void
lkpi_hex_dump(int(*_fpf)(void *, const char *, ...), void *arg1,
    const char *level, const char *prefix_str,
    const int prefix_type, const int rowsize, const int groupsize,
    const void *buf, size_t len, const bool ascii)
{
	/* packed views allow unaligned group-sized loads from the buffer */
	typedef const struct { long long value; } __packed *print_64p_t;
	typedef const struct { uint32_t value; } __packed *print_32p_t;
	typedef const struct { uint16_t value; } __packed *print_16p_t;
	const void *buf_old = buf;
	int row;

	while (len > 0) {
		if (level != NULL)
			_fpf(arg1, "%s", level);
		if (prefix_str != NULL)
			_fpf(arg1, "%s ", prefix_str);

		switch (prefix_type) {
		case DUMP_PREFIX_ADDRESS:
			_fpf(arg1, "[%p] ", buf);
			break;
		case DUMP_PREFIX_OFFSET:
			_fpf(arg1, "[%#tx] ", ((const char *)buf -
			    (const char *)buf_old));
			break;
		default:
			break;
		}
		for (row = 0; row != rowsize; row++) {
			if (groupsize == 8 && len > 7) {
				_fpf(arg1, "%016llx ", ((print_64p_t)buf)->value);
				buf = (const uint8_t *)buf + 8;
				len -= 8;
			} else if (groupsize == 4 && len > 3) {
				_fpf(arg1, "%08x ", ((print_32p_t)buf)->value);
				buf = (const uint8_t *)buf + 4;
				len -= 4;
			} else if (groupsize == 2 && len > 1) {
				_fpf(arg1, "%04x ", ((print_16p_t)buf)->value);
				buf = (const uint8_t *)buf + 2;
				len -= 2;
			} else if (len > 0) {
				_fpf(arg1, "%02x ", *(const uint8_t *)buf);
				buf = (const
uint8_t *)buf + 1;
				len--;
			} else {
				break;
			}
		}
		_fpf(arg1, "\n");
	}
}

/*
 * Callout trampoline: run the Linux timer function with a valid
 * "current" task, retrying one tick later if that cannot be set up
 * without sleeping.
 */
static void
linux_timer_callback_wrapper(void *context)
{
	struct timer_list *timer;

	timer = context;

	/* the timer is about to be shutdown permanently */
	if (timer->function == NULL)
		return;

	if (linux_set_current_flags(curthread, M_NOWAIT)) {
		/* try again later */
		callout_reset(&timer->callout, 1,
		    &linux_timer_callback_wrapper, timer);
		return;
	}

	timer->function(timer->data);
}

/* Convert an absolute jiffies deadline into a positive tick count. */
static int
linux_timer_jiffies_until(unsigned long expires)
{
	unsigned long delta = expires - jiffies;

	/*
	 * Guard against already expired values and make sure that the value can
	 * be used as a tick count, rather than a jiffies count.
	 */
	if ((long)delta < 1)
		delta = 1;
	else if (delta > INT_MAX)
		delta = INT_MAX;
	return ((int)delta);
}

/*
 * (Re)arm a timer; returns non-zero if the timer was already pending,
 * matching Linux mod_timer() semantics.
 */
int
mod_timer(struct timer_list *timer, unsigned long expires)
{
	int ret;

	timer->expires = expires;
	ret = callout_reset(&timer->callout,
	    linux_timer_jiffies_until(expires),
	    &linux_timer_callback_wrapper, timer);

	MPASS(ret == 0 || ret == 1);

	return (ret == 1);
}

void
add_timer(struct timer_list *timer)
{

	callout_reset(&timer->callout,
	    linux_timer_jiffies_until(timer->expires),
	    &linux_timer_callback_wrapper, timer);
}

void
add_timer_on(struct timer_list *timer, int cpu)
{

	callout_reset_on(&timer->callout,
	    linux_timer_jiffies_until(timer->expires),
	    &linux_timer_callback_wrapper, timer, cpu);
}

/* Stop a pending timer; returns non-zero if it was pending. */
int
timer_delete(struct timer_list *timer)
{

	if (callout_stop(&(timer)->callout) == -1)
		return (0);
	return (1);
}

/* Like timer_delete(), but also waits for a running callback to finish. */
int
timer_delete_sync(struct timer_list *timer)
{

	if (callout_drain(&(timer)->callout) == -1)
		return (0);
	return (1);
}

/* Permanently disable the timer, then drain any running callback. */
int
timer_shutdown_sync(struct timer_list
*timer)
{

	timer->function = NULL;
	return (del_timer_sync(timer));
}

/* greatest common divisor, Euclid equation */
static uint64_t
lkpi_gcd_64(uint64_t a, uint64_t b)
{
	uint64_t an;
	uint64_t bn;

	while (b != 0) {
		an = b;
		bn = a % b;
		a = an;
		b = bn;
	}
	return (a);
}

/* Precomputed ratios for nsec/usec/msec to hz conversions. */
uint64_t lkpi_nsec2hz_rem;
uint64_t lkpi_nsec2hz_div = 1000000000ULL;
uint64_t lkpi_nsec2hz_max;

uint64_t lkpi_usec2hz_rem;
uint64_t lkpi_usec2hz_div = 1000000ULL;
uint64_t lkpi_usec2hz_max;

uint64_t lkpi_msec2hz_rem;
uint64_t lkpi_msec2hz_div = 1000ULL;
uint64_t lkpi_msec2hz_max;

/*
 * SYSINIT hook: derive the jiffies mask and reduce the time-unit
 * conversion fractions by their GCD so later multiplies cannot
 * overflow prematurely.
 */
static void
linux_timer_init(void *arg)
{
	uint64_t gcd;

	/*
	 * Compute an internal HZ value which can divide 2**32 to
	 * avoid timer rounding problems when the tick value wraps
	 * around 2**32:
	 */
	linux_timer_hz_mask = 1;
	while (linux_timer_hz_mask < (unsigned long)hz)
		linux_timer_hz_mask *= 2;
	linux_timer_hz_mask--;

	/* compute some internal constants */

	lkpi_nsec2hz_rem = hz;
	lkpi_usec2hz_rem = hz;
	lkpi_msec2hz_rem = hz;

	gcd = lkpi_gcd_64(lkpi_nsec2hz_rem, lkpi_nsec2hz_div);
	lkpi_nsec2hz_rem /= gcd;
	lkpi_nsec2hz_div /= gcd;
	lkpi_nsec2hz_max = -1ULL / lkpi_nsec2hz_rem;

	gcd = lkpi_gcd_64(lkpi_usec2hz_rem, lkpi_usec2hz_div);
	lkpi_usec2hz_rem /= gcd;
	lkpi_usec2hz_div /= gcd;
	lkpi_usec2hz_max = -1ULL / lkpi_usec2hz_rem;

	gcd = lkpi_gcd_64(lkpi_msec2hz_rem, lkpi_msec2hz_div);
	lkpi_msec2hz_rem /= gcd;
	lkpi_msec2hz_div /= gcd;
	lkpi_msec2hz_max = -1ULL / lkpi_msec2hz_rem;
}
SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL);

/*
 * complete()/complete_all() backend: bump the done count (or saturate
 * it for complete_all) and wake one or all sleepers.
 */
void
linux_complete_common(struct completion *c, int all)
{
	sleepq_lock(c);
	if (all) {
		c->done = UINT_MAX;
		sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
	} else {
		if (c->done != UINT_MAX)
			c->done++;
		sleepq_signal(c, SLEEPQ_SLEEP, 0,
0);
	}
	sleepq_release(c);
}

/*
 * Indefinite wait for done != 0 with or without signals.
 */
int
linux_wait_for_common(struct completion *c, int flags)
{
	struct task_struct *task;
	int error;

	if (SCHEDULER_STOPPED())
		return (0);

	task = current;

	if (flags != 0)
		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
	else
		flags = SLEEPQ_SLEEP;
	error = 0;
	for (;;) {
		sleepq_lock(c);
		if (c->done)
			break;
		sleepq_add(c, NULL, "completion", flags, 0);
		if (flags & SLEEPQ_INTERRUPTIBLE) {
			DROP_GIANT();
			error = -sleepq_wait_sig(c, 0);
			PICKUP_GIANT();
			if (error != 0) {
				linux_schedule_save_interrupt_value(task, error);
				error = -ERESTARTSYS;
				goto intr;
			}
		} else {
			DROP_GIANT();
			sleepq_wait(c, 0);
			PICKUP_GIANT();
		}
	}
	/* consume one completion, unless complete_all() saturated it */
	if (c->done != UINT_MAX)
		c->done--;
	sleepq_release(c);

intr:
	return (error);
}

/*
 * Time limited wait for done != 0 with or without signals.
 */
unsigned long
linux_wait_for_timeout_common(struct completion *c, unsigned long timeout,
    int flags)
{
	struct task_struct *task;
	unsigned long end = jiffies + timeout, error;

	if (SCHEDULER_STOPPED())
		return (0);

	task = current;

	if (flags != 0)
		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
	else
		flags = SLEEPQ_SLEEP;

	for (;;) {
		sleepq_lock(c);
		if (c->done)
			break;
		sleepq_add(c, NULL, "completion", flags, 0);
		sleepq_set_timeout(c, linux_timer_jiffies_until(end));

		DROP_GIANT();
		if (flags & SLEEPQ_INTERRUPTIBLE)
			error = -sleepq_timedwait_sig(c, 0);
		else
			error = -sleepq_timedwait(c, 0);
		PICKUP_GIANT();

		if (error != 0) {
			/* check for timeout */
			if (error == -EWOULDBLOCK) {
				error = 0;	/* timeout */
			} else {
				/* signal happened */
				linux_schedule_save_interrupt_value(task, error);
				error = -ERESTARTSYS;
			}
			goto done;
		}
	}
	if (c->done !=
UINT_MAX)
		c->done--;
	sleepq_release(c);

	/* return how many jiffies are left */
	error = linux_timer_jiffies_until(end);
done:
	return (error);
}

/* Non-blocking variant: consume a completion only if one is pending. */
int
linux_try_wait_for_completion(struct completion *c)
{
	int isdone;

	sleepq_lock(c);
	isdone = (c->done != 0);
	if (c->done != 0 && c->done != UINT_MAX)
		c->done--;
	sleepq_release(c);
	return (isdone);
}

/* Query whether the completion is done, without consuming it. */
int
linux_completion_done(struct completion *c)
{
	int isdone;

	sleepq_lock(c);
	isdone = (c->done != 0);
	sleepq_release(c);
	return (isdone);
}

/*
 * Drop one reference on a linux_cdev, freeing it on last release
 * (only dynamically allocated cdevs carry linux_cdev_ktype).
 */
static void
linux_cdev_deref(struct linux_cdev *ldev)
{
	if (refcount_release(&ldev->refs) &&
	    ldev->kobj.ktype == &linux_cdev_ktype)
		kfree(ldev);
}

/* kobject release for dynamically allocated linux_cdevs. */
static void
linux_cdev_release(struct kobject *kobj)
{
	struct linux_cdev *cdev;
	struct kobject *parent;

	cdev = container_of(kobj, struct linux_cdev, kobj);
	parent = kobj->parent;
	linux_destroy_dev(cdev);
	linux_cdev_deref(cdev);
	kobject_put(parent);
}

/* kobject release for statically allocated linux_cdevs (not freed). */
static void
linux_cdev_static_release(struct kobject *kobj)
{
	struct cdev *cdev;
	struct linux_cdev *ldev;

	ldev = container_of(kobj, struct linux_cdev, kobj);
	cdev = ldev->cdev;
	if (cdev != NULL) {
		destroy_dev(cdev);
		ldev->cdev = NULL;
	}
	kobject_put(kobj->parent);
}

/*
 * Register a character device together with its struct device,
 * unwinding the cdev registration if device_add() fails.
 */
int
linux_cdev_device_add(struct linux_cdev *ldev, struct device *dev)
{
	int ret;

	if (dev->devt != 0) {
		/* Set parent kernel object.
*/
		ldev->kobj.parent = &dev->kobj;

		/*
		 * Unlike Linux we require the kobject of the
		 * character device structure to have a valid name
		 * before calling this function:
		 */
		if (ldev->kobj.name == NULL)
			return (-EINVAL);

		ret = cdev_add(ldev, dev->devt, 1);
		if (ret)
			return (ret);
	}
	ret = device_add(dev);
	if (ret != 0 && dev->devt != 0)
		cdev_del(ldev);
	return (ret);
}

/* Inverse of linux_cdev_device_add(). */
void
linux_cdev_device_del(struct linux_cdev *ldev, struct device *dev)
{
	device_del(dev);

	if (dev->devt != 0)
		cdev_del(ldev);
}

/*
 * Tear down the FreeBSD cdev backing a linux_cdev, waiting for any
 * in-flight users (tracked in siref) to drain first.
 */
static void
linux_destroy_dev(struct linux_cdev *ldev)
{

	if (ldev->cdev == NULL)
		return;

	MPASS((ldev->siref & LDEV_SI_DTR) == 0);
	MPASS(ldev->kobj.ktype == &linux_cdev_ktype);

	/* mark the device as being destroyed and wait for users to drain */
	atomic_set_int(&ldev->siref, LDEV_SI_DTR);
	while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0)
		pause("ldevdtr", hz / 4);

	destroy_dev(ldev->cdev);
	ldev->cdev = NULL;
}

const struct kobj_type linux_cdev_ktype = {
	.release = linux_cdev_release,
};

const struct kobj_type linux_cdev_static_ktype = {
	.release = linux_cdev_static_release,
};

/* Translate ifnet link state changes into NETDEV_UP/DOWN notifications. */
static void
linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
{
	struct notifier_block *nb;
	struct netdev_notifier_info ni;

	nb = arg;
	ni.ifp = ifp;
	ni.dev = (struct net_device *)ifp;
	if (linkstate == LINK_STATE_UP)
		nb->notifier_call(nb, NETDEV_UP, &ni);
	else
		nb->notifier_call(nb, NETDEV_DOWN, &ni);
}

/* Translate interface arrival into a NETDEV_REGISTER notification. */
static void
linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;
	struct netdev_notifier_info ni;

	nb = arg;
	ni.ifp = ifp;
	ni.dev = (struct net_device *)ifp;
	nb->notifier_call(nb, NETDEV_REGISTER, &ni);
}

/* Translate interface departure into a NETDEV_UNREGISTER notification. */
static void
linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;
	struct netdev_notifier_info
ni;

	nb = arg;
	ni.ifp = ifp;
	ni.dev = (struct net_device *)ifp;
	nb->notifier_call(nb, NETDEV_UNREGISTER, &ni);
}

/* Translate link-layer address changes into NETDEV_CHANGEADDR. */
static void
linux_handle_iflladdr_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;
	struct netdev_notifier_info ni;

	nb = arg;
	ni.ifp = ifp;
	ni.dev = (struct net_device *)ifp;
	nb->notifier_call(nb, NETDEV_CHANGEADDR, &ni);
}

/* Translate protocol address changes into NETDEV_CHANGEIFADDR. */
static void
linux_handle_ifaddr_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;
	struct netdev_notifier_info ni;

	nb = arg;
	ni.ifp = ifp;
	ni.dev = (struct net_device *)ifp;
	nb->notifier_call(nb, NETDEV_CHANGEIFADDR, &ni);
}

/*
 * Hook the notifier block into the FreeBSD ifnet event handlers; the
 * eventhandler tags are stashed in nb->tags for later deregistration.
 */
int
register_netdevice_notifier(struct notifier_block *nb)
{

	nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER(
	    ifnet_link_event, linux_handle_ifnet_link_event, nb, 0);
	nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER(
	    ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0);
	nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(
	    ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0);
	nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER(
	    iflladdr_event, linux_handle_iflladdr_event, nb, 0);

	return (0);
}

int
register_inetaddr_notifier(struct notifier_block *nb)
{

	nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER(
	    ifaddr_event, linux_handle_ifaddr_event, nb, 0);
	return (0);
}

int
unregister_netdevice_notifier(struct notifier_block *nb)
{

	EVENTHANDLER_DEREGISTER(ifnet_link_event,
	    nb->tags[NETDEV_UP]);
	EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
	    nb->tags[NETDEV_REGISTER]);
	EVENTHANDLER_DEREGISTER(ifnet_departure_event,
	    nb->tags[NETDEV_UNREGISTER]);
	EVENTHANDLER_DEREGISTER(iflladdr_event,
	    nb->tags[NETDEV_CHANGEADDR]);

	return (0);
}

int
unregister_inetaddr_notifier(struct notifier_block
*nb)
{

	EVENTHANDLER_DEREGISTER(ifaddr_event,
	    nb->tags[NETDEV_CHANGEIFADDR]);

	return (0);
}

/* Adapter state passed through qsort_r() to the Linux comparator. */
struct list_sort_thunk {
	int (*cmp)(void *, struct list_head *, struct list_head *);
	void *priv;
};

/* qsort_r() comparator bridging to the Linux list_sort() callback. */
static inline int
linux_le_cmp(const void *d1, const void *d2, void *priv)
{
	struct list_head *le1, *le2;
	struct list_sort_thunk *thunk;

	thunk = priv;
	le1 = *(__DECONST(struct list_head **, d1));
	le2 = *(__DECONST(struct list_head **, d2));
	return ((thunk->cmp)(thunk->priv, le1, le2));
}

/*
 * Sort a Linux list by copying the entries into a temporary array,
 * sorting that with qsort_r(), and relinking the list in order.
 */
void
list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv,
    struct list_head *a, struct list_head *b))
{
	struct list_sort_thunk thunk;
	struct list_head **ar, *le;
	size_t count, i;

	count = 0;
	list_for_each(le, head)
		count++;
	ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK);
	i = 0;
	list_for_each(le, head)
		ar[i++] = le;
	thunk.cmp = cmp;
	thunk.priv = priv;
	qsort_r(ar, count, sizeof(struct list_head *), linux_le_cmp, &thunk);
	INIT_LIST_HEAD(head);
	for (i = 0; i < count; i++)
		list_add_tail(ar[i], head);
	free(ar, M_KMALLOC);
}

#if defined(__i386__) || defined(__amd64__)
/* Flush/invalidate CPU caches on all CPUs (x86 only). */
int
linux_wbinvd_on_all_cpus(void)
{

	pmap_invalidate_cache();
	return (0);
}
#endif

/* Run "callback" on every CPU via an SMP rendezvous. */
int
linux_on_each_cpu(void callback(void *), void *data)
{

	smp_rendezvous(smp_no_rendezvous_barrier, callback,
	    smp_no_rendezvous_barrier, data);
	return (0);
}

int
linux_in_atomic(void)
{

	return ((curthread->td_pflags & TDP_NOFAULTING) != 0);
}

/*
 * Look up a LinuxKPI character device by name and major/minor pair;
 * returns NULL when no matching device is registered.
 */
struct linux_cdev *
linux_find_cdev(const char *name, unsigned major, unsigned minor)
{
	dev_t dev = MKDEV(major, minor);
	struct cdev *cdev;

	dev_lock();
	LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) {
		struct linux_cdev *ldev = cdev->si_drv1;
		if (ldev->dev == dev &&
		    strcmp(kobject_name(&ldev->kobj), name) == 0)
{
			break;
		}
	}
	dev_unlock();

	return (cdev != NULL ? cdev->si_drv1 : NULL);
}

/*
 * Register "count" character devices with consecutive minor numbers
 * starting at "baseminor", all sharing the same name and fops.
 */
int
__register_chrdev(unsigned int major, unsigned int baseminor,
    unsigned int count, const char *name,
    const struct file_operations *fops)
{
	struct linux_cdev *cdev;
	int ret = 0;
	int i;

	for (i = baseminor; i < baseminor + count; i++) {
		cdev = cdev_alloc();
		cdev->ops = fops;
		kobject_set_name(&cdev->kobj, name);

		ret = cdev_add(cdev, makedev(major, i), 1);
		if (ret != 0)
			break;
	}
	return (ret);
}

/* Like __register_chrdev(), but with explicit owner/group/mode. */
int
__register_chrdev_p(unsigned int major, unsigned int baseminor,
    unsigned int count, const char *name,
    const struct file_operations *fops, uid_t uid,
    gid_t gid, int mode)
{
	struct linux_cdev *cdev;
	int ret = 0;
	int i;

	for (i = baseminor; i < baseminor + count; i++) {
		cdev = cdev_alloc();
		cdev->ops = fops;
		kobject_set_name(&cdev->kobj, name);

		ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode);
		if (ret != 0)
			break;
	}
	return (ret);
}

/* Undo __register_chrdev(): delete each matching minor, if present. */
void
__unregister_chrdev(unsigned int major, unsigned int baseminor,
    unsigned int count, const char *name)
{
	struct linux_cdev *cdevp;
	int i;

	for (i = baseminor; i < baseminor + count; i++) {
		cdevp = linux_find_cdev(name, major, i);
		if (cdevp != NULL)
			cdev_del(cdevp);
	}
}

/* Print the current kernel stack trace (requires "options STACK"). */
void
linux_dump_stack(void)
{
#ifdef STACK
	struct stack st;

	stack_save(&st);
	stack_print(&st);
#endif
}

/* Rate limiter backing net_ratelimit(). */
int
linuxkpi_net_ratelimit(void)
{

	return (ppsratecheck(&lkpi_net_lastlog, &lkpi_net_curpps,
	    lkpi_net_maxpps));
}

/* Allocate and initialize a write-combining I/O mapping descriptor. */
struct io_mapping *
io_mapping_create_wc(resource_size_t base, unsigned long size)
{
	struct io_mapping *mapping;

	mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
	if (mapping == NULL)
		return (NULL);
	return (io_mapping_init_wc(mapping, base, size));
}

/* We likely want a
linuxkpi_device.c at some point. */2741bool2742device_can_wakeup(struct device *dev)2743{27442745if (dev == NULL)2746return (false);2747/*2748* XXX-BZ iwlwifi queries it as part of enabling WoWLAN.2749* Normally this would be based on a bool in dev->power.XXX.2750* Check such as PCI PCIM_PCAP_*PME. We have no way to enable this yet.2751* We may get away by directly calling into bsddev for as long as2752* we can assume PCI only avoiding changing struct device breaking KBI.2753*/2754pr_debug("%s:%d: not enabled; see comment.\n", __func__, __LINE__);2755return (false);2756}27572758static void2759devm_device_group_remove(struct device *dev, void *p)2760{2761const struct attribute_group **dr = p;2762const struct attribute_group *group = *dr;27632764sysfs_remove_group(&dev->kobj, group);2765}27662767int2768lkpi_devm_device_add_group(struct device *dev,2769const struct attribute_group *group)2770{2771const struct attribute_group **dr;2772int ret;27732774dr = devres_alloc(devm_device_group_remove, sizeof(*dr), GFP_KERNEL);2775if (dr == NULL)2776return (-ENOMEM);27772778ret = sysfs_create_group(&dev->kobj, group);2779if (ret == 0) {2780*dr = group;2781devres_add(dev, dr);2782} else2783devres_free(dr);27842785return (ret);2786}27872788#if defined(__i386__) || defined(__amd64__)2789bool linux_cpu_has_clflush;2790struct cpuinfo_x86 boot_cpu_data;2791struct cpuinfo_x86 *__cpu_data;2792#endif27932794cpumask_t *2795lkpi_get_static_single_cpu_mask(int cpuid)2796{27972798KASSERT((cpuid >= 0 && cpuid <= mp_maxid), ("%s: invalid cpuid %d\n",2799__func__, cpuid));2800KASSERT(!CPU_ABSENT(cpuid), ("%s: cpu with cpuid %d is absent\n",2801__func__, cpuid));28022803return (static_single_cpu_mask[cpuid]);2804}28052806bool2807lkpi_xen_initial_domain(void)2808{2809#ifdef XENHVM2810return (xen_initial_domain());2811#else2812return (false);2813#endif2814}28152816bool2817lkpi_xen_pv_domain(void)2818{2819#ifdef XENHVM2820return (xen_pv_domain());2821#else2822return 
(false);2823#endif2824}28252826static void2827linux_compat_init(void *arg)2828{2829struct sysctl_oid *rootoid;2830int i;28312832#if defined(__i386__) || defined(__amd64__)2833static const uint32_t x86_vendors[X86_VENDOR_NUM] = {2834[X86_VENDOR_INTEL] = CPU_VENDOR_INTEL,2835[X86_VENDOR_CYRIX] = CPU_VENDOR_CYRIX,2836[X86_VENDOR_AMD] = CPU_VENDOR_AMD,2837[X86_VENDOR_UMC] = CPU_VENDOR_UMC,2838[X86_VENDOR_CENTAUR] = CPU_VENDOR_CENTAUR,2839[X86_VENDOR_TRANSMETA] = CPU_VENDOR_TRANSMETA,2840[X86_VENDOR_NSC] = CPU_VENDOR_NSC,2841[X86_VENDOR_HYGON] = CPU_VENDOR_HYGON,2842};2843uint8_t x86_vendor = X86_VENDOR_UNKNOWN;28442845for (i = 0; i < X86_VENDOR_NUM; i++) {2846if (cpu_vendor_id != 0 && cpu_vendor_id == x86_vendors[i]) {2847x86_vendor = i;2848break;2849}2850}2851linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH);2852boot_cpu_data.x86_clflush_size = cpu_clflush_line_size;2853boot_cpu_data.x86_max_cores = mp_ncpus;2854boot_cpu_data.x86 = CPUID_TO_FAMILY(cpu_id);2855boot_cpu_data.x86_model = CPUID_TO_MODEL(cpu_id);2856boot_cpu_data.x86_vendor = x86_vendor;28572858__cpu_data = kmalloc_array(mp_maxid + 1,2859sizeof(*__cpu_data), M_WAITOK | M_ZERO);2860CPU_FOREACH(i) {2861__cpu_data[i].x86_clflush_size = cpu_clflush_line_size;2862__cpu_data[i].x86_max_cores = mp_ncpus;2863__cpu_data[i].x86 = CPUID_TO_FAMILY(cpu_id);2864__cpu_data[i].x86_model = CPUID_TO_MODEL(cpu_id);2865__cpu_data[i].x86_vendor = x86_vendor;2866}2867#endif2868rw_init(&linux_vma_lock, "lkpi-vma-lock");28692870rootoid = SYSCTL_ADD_ROOT_NODE(NULL,2871OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");2872kobject_init(&linux_class_root, &linux_class_ktype);2873kobject_set_name(&linux_class_root, "class");2874linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),2875OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");2876kobject_init(&linux_root_device.kobj, &linux_dev_ktype);2877kobject_set_name(&linux_root_device.kobj, "device");2878linux_root_device.kobj.oidp = 
SYSCTL_ADD_NODE(NULL,2879SYSCTL_CHILDREN(rootoid), OID_AUTO, "device",2880CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "device");2881linux_root_device.bsddev = root_bus;2882linux_class_misc.name = "misc";2883class_register(&linux_class_misc);2884INIT_LIST_HEAD(&pci_drivers);2885INIT_LIST_HEAD(&pci_devices);2886spin_lock_init(&pci_lock);2887mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF);2888for (i = 0; i < VMMAP_HASH_SIZE; i++)2889LIST_INIT(&vmmaphead[i]);2890init_waitqueue_head(&linux_bit_waitq);2891init_waitqueue_head(&linux_var_waitq);28922893CPU_COPY(&all_cpus, &cpu_online_mask);2894/*2895* Generate a single-CPU cpumask_t for each CPU (possibly) in the system.2896* CPUs are indexed from 0..(mp_maxid). The entry for cpuid 0 will only2897* have itself in the cpumask, cupid 1 only itself on entry 1, and so on.2898* This is used by cpumask_of() (and possibly others in the future) for,2899* e.g., drivers to pass hints to irq_set_affinity_hint().2900*/2901static_single_cpu_mask = kmalloc_array(mp_maxid + 1,2902sizeof(static_single_cpu_mask), M_WAITOK | M_ZERO);29032904/*2905* When the number of CPUs reach a threshold, we start to save memory2906* given the sets are static by overlapping those having their single2907* bit set at same position in a bitset word. Asymptotically, this2908* regular scheme is in O(n²) whereas the overlapping one is in O(n)2909* only with n being the maximum number of CPUs, so the gain will become2910* huge quite quickly. 
The threshold for 64-bit architectures is 1282911* CPUs.2912*/2913if (mp_ncpus < (2 * _BITSET_BITS)) {2914cpumask_t *sscm_ptr;29152916/*2917* This represents 'mp_ncpus * __bitset_words(CPU_SETSIZE) *2918* (_BITSET_BITS / 8)' bytes (for comparison with the2919* overlapping scheme).2920*/2921static_single_cpu_mask_lcs = kmalloc_array(mp_ncpus,2922sizeof(*static_single_cpu_mask_lcs),2923M_WAITOK | M_ZERO);29242925sscm_ptr = static_single_cpu_mask_lcs;2926CPU_FOREACH(i) {2927static_single_cpu_mask[i] = sscm_ptr++;2928CPU_SET(i, static_single_cpu_mask[i]);2929}2930} else {2931/* Pointer to a bitset word. */2932__typeof(((cpuset_t *)NULL)->__bits[0]) *bwp;29332934/*2935* Allocate memory for (static) spans of 'cpumask_t' ('cpuset_t'2936* really) with a single bit set that can be reused for all2937* single CPU masks by making them start at different offsets.2938* We need '__bitset_words(CPU_SETSIZE) - 1' bitset words before2939* the word having its single bit set, and the same amount2940* after.2941*/2942static_single_cpu_mask_lcs = mallocarray(_BITSET_BITS,2943(2 * __bitset_words(CPU_SETSIZE) - 1) * (_BITSET_BITS / 8),2944M_KMALLOC, M_WAITOK | M_ZERO);29452946/*2947* We rely below on cpuset_t and the bitset generic2948* implementation assigning words in the '__bits' array in the2949* same order of bits (i.e., little-endian ordering, not to be2950* confused with machine endianness, which concerns bits in2951* words and other integers). This is an imperfect test, but it2952* will detect a change to big-endian ordering.2953*/2954_Static_assert(2955__bitset_word(_BITSET_BITS + 1, _BITSET_BITS) == 1,2956"Assumes a bitset implementation that is little-endian "2957"on its words");29582959/* Initialize the single bit of each static span. 
*/2960bwp = (__typeof(bwp))static_single_cpu_mask_lcs +2961(__bitset_words(CPU_SETSIZE) - 1);2962for (i = 0; i < _BITSET_BITS; i++) {2963CPU_SET(i, (cpuset_t *)bwp);2964bwp += (2 * __bitset_words(CPU_SETSIZE) - 1);2965}29662967/*2968* Finally set all CPU masks to the proper word in their2969* relevant span.2970*/2971CPU_FOREACH(i) {2972bwp = (__typeof(bwp))static_single_cpu_mask_lcs;2973/* Find the non-zero word of the relevant span. */2974bwp += (2 * __bitset_words(CPU_SETSIZE) - 1) *2975(i % _BITSET_BITS) +2976__bitset_words(CPU_SETSIZE) - 1;2977/* Shift to find the CPU mask start. */2978bwp -= (i / _BITSET_BITS);2979static_single_cpu_mask[i] = (cpuset_t *)bwp;2980}2981}29822983strlcpy(init_uts_ns.name.release, osrelease, sizeof(init_uts_ns.name.release));2984}2985SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);29862987static void2988linux_compat_uninit(void *arg)2989{2990linux_kobject_kfree_name(&linux_class_root);2991linux_kobject_kfree_name(&linux_root_device.kobj);2992linux_kobject_kfree_name(&linux_class_misc.kobj);29932994free(static_single_cpu_mask_lcs, M_KMALLOC);2995free(static_single_cpu_mask, M_KMALLOC);2996#if defined(__i386__) || defined(__amd64__)2997free(__cpu_data, M_KMALLOC);2998#endif29993000mtx_destroy(&vmmaplock);3001spin_lock_destroy(&pci_lock);3002rw_destroy(&linux_vma_lock);3003}3004SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);30053006/*3007* NOTE: Linux frequently uses "unsigned long" for pointer to integer3008* conversion and vice versa, where in FreeBSD "uintptr_t" would be3009* used. Assert these types have the same size, else some parts of the3010* LinuxKPI may not work like expected:3011*/3012CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t));301330143015