Path: blob/master/drivers/infiniband/hw/qib/qib_file_ops.c
/*
 * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
 * All rights reserved.
 * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/pci.h>
#include <linux/poll.h>
#include <linux/cdev.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/jiffies.h>
#include <asm/pgtable.h>
#include <linux/delay.h>

#include "qib.h"
#include "qib_common.h"
#include "qib_user_sdma.h"

static int qib_open(struct inode *, struct file *);
static int qib_close(struct inode *, struct file *);
static ssize_t qib_write(struct file *, const char __user *, size_t, loff_t *);
static ssize_t qib_aio_write(struct kiocb *, const struct iovec *,
			     unsigned long, loff_t);
static unsigned int qib_poll(struct file *, struct poll_table_struct *);
static int qib_mmapf(struct file *, struct vm_area_struct *);

static const struct file_operations qib_file_ops = {
	.owner = THIS_MODULE,
	.write = qib_write,
	.aio_write = qib_aio_write,
	.open = qib_open,
	.release = qib_close,
	.poll = qib_poll,
	.mmap = qib_mmapf,
	.llseek = noop_llseek,
};

/*
 * Convert kernel virtual addresses to physical addresses so they don't
 * potentially conflict with the chip addresses used as mmap offsets.
 * It doesn't really matter what mmap offset we use as long as we can
 * interpret it correctly.
 */
static u64 cvt_kvaddr(void *p)
{
	struct page *page;
	u64 paddr = 0;

	page = vmalloc_to_page(p);
	if (page)
		paddr = page_to_pfn(page) << PAGE_SHIFT;

	return paddr;
}

static int qib_get_base_info(struct file *fp, void __user *ubase,
			     size_t ubase_size)
{
	struct qib_ctxtdata *rcd = ctxt_fp(fp);
	int ret = 0;
	struct qib_base_info *kinfo = NULL;
	struct qib_devdata *dd = rcd->dd;
	struct qib_pportdata *ppd = rcd->ppd;
	unsigned subctxt_cnt;
	int shared, master;
	size_t sz;

	subctxt_cnt = rcd->subctxt_cnt;
	if (!subctxt_cnt) {
		shared = 0;
		master = 0;
		subctxt_cnt = 1;
	} else {
		shared = 1;
		master = !subctxt_fp(fp);
	}

	sz = sizeof(*kinfo);
	/* If context sharing is not requested, allow the old size structure */
	if (!shared)
		sz -= 7 * sizeof(u64);
	if (ubase_size < sz) {
		ret = -EINVAL;
		goto bail;
	}

	kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
	if (kinfo == NULL) {
		ret = -ENOMEM;
		goto bail;
	}

	ret = dd->f_get_base_info(rcd, kinfo);
	if (ret < 0)
		goto bail;

	kinfo->spi_rcvhdr_cnt = dd->rcvhdrcnt;
	kinfo->spi_rcvhdrent_size = dd->rcvhdrentsize;
	kinfo->spi_tidegrcnt = rcd->rcvegrcnt;
	kinfo->spi_rcv_egrbufsize = dd->rcvegrbufsize;
	/*
	 * have to mmap whole thing
	 */
	kinfo->spi_rcv_egrbuftotlen =
		rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size;
	kinfo->spi_rcv_egrperchunk = rcd->rcvegrbufs_perchunk;
	kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
		rcd->rcvegrbuf_chunks;
	kinfo->spi_tidcnt = dd->rcvtidcnt / subctxt_cnt;
	if (master)
		kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt;
	/*
	 * for this use, may be cfgctxts summed over all chips that
	 * are configured and present
	 */
	kinfo->spi_nctxts = dd->cfgctxts;
	/* unit (chip/board) our context is on */
	kinfo->spi_unit = dd->unit;
	kinfo->spi_port = ppd->port;
	/* for now, only a single page */
	kinfo->spi_tid_maxsize = PAGE_SIZE;

	/*
	 * Doing this per context, and based on the skip value, etc.  This has
	 * to be the actual buffer size, since the protocol code treats it
	 * as an array.
	 *
	 * These have to be set to user addresses in the user code via mmap.
	 * These values are used on return to user code for the mmap target
	 * addresses only.  For 32 bit, same 44 bit address problem, so use
	 * the physical address, not virtual.  Before 2.6.11, using the
	 * page_address() macro worked, but in 2.6.11, even that returns the
	 * full 64 bit address (upper bits all 1's).  So far, using the
	 * physical addresses (or chip offsets, for chip mapping) works, but
	 * no doubt some future kernel release will change that, and we'll be
	 * on to yet another method of dealing with this.
	 * Normally only one of rcvhdr_tailaddr or rhf_offset is useful
	 * since the chips with non-zero rhf_offset don't normally
	 * enable tail register updates to host memory, but for testing,
	 * both can be enabled and used.
	 */
	kinfo->spi_rcvhdr_base = (u64) rcd->rcvhdrq_phys;
	kinfo->spi_rcvhdr_tailaddr = (u64) rcd->rcvhdrqtailaddr_phys;
	kinfo->spi_rhf_offset = dd->rhf_offset;
	kinfo->spi_rcv_egrbufs = (u64) rcd->rcvegr_phys;
	kinfo->spi_pioavailaddr = (u64) dd->pioavailregs_phys;
	/* setup per-unit (not port) status area for user programs */
	kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
		(char *) ppd->statusp -
		(char *) dd->pioavailregs_dma;
	kinfo->spi_uregbase = (u64) dd->uregbase + dd->ureg_align * rcd->ctxt;
	if (!shared) {
		kinfo->spi_piocnt = rcd->piocnt;
		kinfo->spi_piobufbase = (u64) rcd->piobufs;
		kinfo->spi_sendbuf_status = cvt_kvaddr(rcd->user_event_mask);
	} else if (master) {
		kinfo->spi_piocnt = (rcd->piocnt / subctxt_cnt) +
			(rcd->piocnt % subctxt_cnt);
		/* Master's PIO buffers are after all the slave's */
		kinfo->spi_piobufbase = (u64) rcd->piobufs +
			dd->palign *
			(rcd->piocnt - kinfo->spi_piocnt);
	} else {
		unsigned slave = subctxt_fp(fp) - 1;

		kinfo->spi_piocnt = rcd->piocnt / subctxt_cnt;
		kinfo->spi_piobufbase = (u64) rcd->piobufs +
			dd->palign * kinfo->spi_piocnt * slave;
	}

	if (shared) {
		kinfo->spi_sendbuf_status =
			cvt_kvaddr(&rcd->user_event_mask[subctxt_fp(fp)]);
		/* only spi_subctxt_* fields should be set in this block! */
		kinfo->spi_subctxt_uregbase = cvt_kvaddr(rcd->subctxt_uregbase);

		kinfo->spi_subctxt_rcvegrbuf =
			cvt_kvaddr(rcd->subctxt_rcvegrbuf);
		kinfo->spi_subctxt_rcvhdr_base =
			cvt_kvaddr(rcd->subctxt_rcvhdr_base);
	}

	/*
	 * All user buffers are 2KB buffers.  If we ever support
	 * giving 4KB buffers to user processes, this will need some
	 * work.  Can't use piobufbase directly, because it has
	 * both 2K and 4K buffer base values.
	 */
	kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->pio2k_bufbase) /
		dd->palign;
	kinfo->spi_pioalign = dd->palign;
	kinfo->spi_qpair = QIB_KD_QP;
	/*
	 * user mode PIO buffers are always 2KB, even when 4KB can
	 * be received, and sent via the kernel; this is ibmaxlen
	 * for 2K MTU.
	 */
	kinfo->spi_piosize = dd->piosize2k - 2 * sizeof(u32);
	kinfo->spi_mtu = ppd->ibmaxlen; /* maxlen, not ibmtu */
	kinfo->spi_ctxt = rcd->ctxt;
	kinfo->spi_subctxt = subctxt_fp(fp);
	kinfo->spi_sw_version = QIB_KERN_SWVERSION;
	kinfo->spi_sw_version |= 1U << 31; /* QLogic-built, not kernel.org */
	kinfo->spi_hw_version = dd->revision;

	if (master)
		kinfo->spi_runtime_flags |= QIB_RUNTIME_MASTER;

	sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
	if (copy_to_user(ubase, kinfo, sz))
		ret = -EFAULT;
bail:
	kfree(kinfo);
	return ret;
}

/**
 * qib_tid_update - update a context TID
 * @rcd: the context
 * @fp: the qib device file
 * @ti: the TID information
 *
 * The new implementation as of Oct 2004 is that the driver assigns
 * the tid and returns it to the caller.  To reduce search time, we
 * keep a cursor for each context, walking the shadow tid array to find
 * one that's not in use.
 *
 * For now, if we can't allocate the full list, we fail, although
 * in the long run, we'll allocate as many as we can, and the
 * caller will deal with that by trying the remaining pages later.
 * That means that when we fail, we have to mark the tids as not in
 * use again, in our shadow copy.
 *
 * It's up to the caller to free the tids when they are done.
 * We'll unlock the pages as they free them.
 *
 * Also, right now we are locking one page at a time, but since
 * the intended use of this routine is for a single group of
 * virtually contiguous pages, that should change to improve
 * performance.
 */
static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp,
			  const struct qib_tid_info *ti)
{
	int ret = 0, ntids;
	u32 tid, ctxttid, cnt, i, tidcnt, tidoff;
	u16 *tidlist;
	struct qib_devdata *dd = rcd->dd;
	u64 physaddr;
	unsigned long vaddr;
	u64 __iomem *tidbase;
	unsigned long tidmap[8];
	struct page **pagep = NULL;
	unsigned subctxt = subctxt_fp(fp);

	if (!dd->pageshadow) {
		ret = -ENOMEM;
		goto done;
	}

	cnt = ti->tidcnt;
	if (!cnt) {
		ret = -EFAULT;
		goto done;
	}
	ctxttid = rcd->ctxt * dd->rcvtidcnt;
	if (!rcd->subctxt_cnt) {
		tidcnt = dd->rcvtidcnt;
		tid = rcd->tidcursor;
		tidoff = 0;
	} else if (!subctxt) {
		tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) +
			 (dd->rcvtidcnt % rcd->subctxt_cnt);
		tidoff = dd->rcvtidcnt - tidcnt;
		ctxttid += tidoff;
		tid = tidcursor_fp(fp);
	} else {
		tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt;
		tidoff = tidcnt * (subctxt - 1);
		ctxttid += tidoff;
		tid = tidcursor_fp(fp);
	}
	if (cnt > tidcnt) {
		/* make sure it all fits in tid_pg_list */
		qib_devinfo(dd->pcidev, "Process tried to allocate %u "
			    "TIDs, only trying max (%u)\n", cnt, tidcnt);
		cnt = tidcnt;
	}
	pagep = (struct page **) rcd->tid_pg_list;
	tidlist = (u16 *) &pagep[dd->rcvtidcnt];
	pagep += tidoff;
	tidlist += tidoff;

	memset(tidmap, 0, sizeof(tidmap));
	/* before decrement; chip actual # */
	ntids = tidcnt;
	tidbase = (u64 __iomem *) (((char __iomem *) dd->kregbase) +
				   dd->rcvtidbase +
				   ctxttid * sizeof(*tidbase));

	/* virtual address of first page in transfer */
	vaddr = ti->tidvaddr;
	if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
		       cnt * PAGE_SIZE)) {
		ret = -EFAULT;
		goto done;
	}
	ret = qib_get_user_pages(vaddr, cnt, pagep);
	if (ret) {
		/*
		 * if (ret == -EBUSY)
		 * We can't continue because the pagep array won't be
		 * initialized. This should never happen,
		 * unless perhaps the user has mpin'ed the pages
		 * themselves.
		 */
		qib_devinfo(dd->pcidev,
			    "Failed to lock addr %p, %u pages: "
			    "errno %d\n", (void *) vaddr, cnt, -ret);
		goto done;
	}
	for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
		for (; ntids--; tid++) {
			if (tid == tidcnt)
				tid = 0;
			if (!dd->pageshadow[ctxttid + tid])
				break;
		}
		if (ntids < 0) {
			/*
			 * Oops, wrapped all the way through their TIDs,
			 * and didn't have enough free; see comments at
			 * start of routine
			 */
			i--;    /* last tidlist[i] not filled in */
			ret = -ENOMEM;
			break;
		}
		tidlist[i] = tid + tidoff;
		/* we "know" system pages and TID pages are same size */
		dd->pageshadow[ctxttid + tid] = pagep[i];
		dd->physshadow[ctxttid + tid] =
			qib_map_page(dd->pcidev, pagep[i], 0, PAGE_SIZE,
				     PCI_DMA_FROMDEVICE);
		/*
		 * don't need atomic or it's overhead
		 */
		__set_bit(tid, tidmap);
		physaddr = dd->physshadow[ctxttid + tid];
		/* PERFORMANCE: below should almost certainly be cached */
		dd->f_put_tid(dd, &tidbase[tid],
			      RCVHQ_RCV_TYPE_EXPECTED, physaddr);
		/*
		 * don't check this tid in qib_ctxtshadow, since we
		 * just filled it in; start with the next one.
		 */
		tid++;
	}

	if (ret) {
		u32 limit;
cleanup:
		/* jump here if copy out of updated info failed... */
		/* same code that's in qib_free_tid() */
		limit = sizeof(tidmap) * BITS_PER_BYTE;
		if (limit > tidcnt)
			/* just in case size changes in future */
			limit = tidcnt;
		tid = find_first_bit((const unsigned long *)tidmap, limit);
		for (; tid < limit; tid++) {
			if (!test_bit(tid, tidmap))
				continue;
			if (dd->pageshadow[ctxttid + tid]) {
				dma_addr_t phys;

				phys = dd->physshadow[ctxttid + tid];
				dd->physshadow[ctxttid + tid] = dd->tidinvalid;
				/* PERFORMANCE: below should almost certainly
				 * be cached
				 */
				dd->f_put_tid(dd, &tidbase[tid],
					      RCVHQ_RCV_TYPE_EXPECTED,
					      dd->tidinvalid);
				pci_unmap_page(dd->pcidev, phys, PAGE_SIZE,
					       PCI_DMA_FROMDEVICE);
				dd->pageshadow[ctxttid + tid] = NULL;
			}
		}
		qib_release_user_pages(pagep, cnt);
	} else {
		/*
		 * Copy the updated array, with qib_tid's filled in, back
		 * to user.  Since we did the copy in already, this "should
		 * never fail" If it does, we have to clean up...
		 */
		if (copy_to_user((void __user *)
				 (unsigned long) ti->tidlist,
				 tidlist, cnt * sizeof(*tidlist))) {
			ret = -EFAULT;
			goto cleanup;
		}
		if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
				 tidmap, sizeof tidmap)) {
			ret = -EFAULT;
			goto cleanup;
		}
		if (tid == tidcnt)
			tid = 0;
		if (!rcd->subctxt_cnt)
			rcd->tidcursor = tid;
		else
			tidcursor_fp(fp) = tid;
	}

done:
	return ret;
}

/**
 * qib_tid_free - free a context TID
 * @rcd: the context
 * @subctxt: the subcontext
 * @ti: the TID info
 *
 * right now we are unlocking one page at a time, but since
 * the intended use of this routine is for a single group of
 * virtually contiguous pages, that should change to improve
 * performance.  We check that the TID is in range for this context
 * but otherwise don't check validity; if user has an error and
 * frees the wrong tid, it's only their own data that can thereby
 * be corrupted.  We do check that the TID was in use, for sanity
 * We always use our idea of the saved address, not the address that
 * they pass in to us.
 */
static int qib_tid_free(struct qib_ctxtdata *rcd, unsigned subctxt,
			const struct qib_tid_info *ti)
{
	int ret = 0;
	u32 tid, ctxttid, cnt, limit, tidcnt;
	struct qib_devdata *dd = rcd->dd;
	u64 __iomem *tidbase;
	unsigned long tidmap[8];

	if (!dd->pageshadow) {
		ret = -ENOMEM;
		goto done;
	}

	if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
			   sizeof tidmap)) {
		ret = -EFAULT;
		goto done;
	}

	ctxttid = rcd->ctxt * dd->rcvtidcnt;
	if (!rcd->subctxt_cnt)
		tidcnt = dd->rcvtidcnt;
	else if (!subctxt) {
		tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) +
			 (dd->rcvtidcnt % rcd->subctxt_cnt);
		ctxttid += dd->rcvtidcnt - tidcnt;
	} else {
		tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt;
		ctxttid += tidcnt * (subctxt - 1);
	}
	tidbase = (u64 __iomem *) ((char __iomem *)(dd->kregbase) +
				   dd->rcvtidbase +
				   ctxttid * sizeof(*tidbase));

	limit = sizeof(tidmap) * BITS_PER_BYTE;
	if (limit > tidcnt)
		/* just in case size changes in future */
		limit = tidcnt;
	tid = find_first_bit(tidmap, limit);
	for (cnt = 0; tid < limit; tid++) {
		/*
		 * small optimization; if we detect a run of 3 or so without
		 * any set, use find_first_bit again.  That's mainly to
		 * accelerate the case where we wrapped, so we have some at
		 * the beginning, and some at the end, and a big gap
		 * in the middle.
		 */
		if (!test_bit(tid, tidmap))
			continue;
		cnt++;
		if (dd->pageshadow[ctxttid + tid]) {
			struct page *p;
			dma_addr_t phys;

			p = dd->pageshadow[ctxttid + tid];
			dd->pageshadow[ctxttid + tid] = NULL;
			phys = dd->physshadow[ctxttid + tid];
			dd->physshadow[ctxttid + tid] = dd->tidinvalid;
			/* PERFORMANCE: below should almost certainly be
			 * cached
			 */
			dd->f_put_tid(dd, &tidbase[tid],
				      RCVHQ_RCV_TYPE_EXPECTED, dd->tidinvalid);
			pci_unmap_page(dd->pcidev, phys, PAGE_SIZE,
				       PCI_DMA_FROMDEVICE);
			qib_release_user_pages(&p, 1);
		}
	}
done:
	return ret;
}

/**
 * qib_set_part_key - set a partition key
 * @rcd: the context
 * @key: the key
 *
 * We can have up to 4 active at a time (other than the default, which is
 * always allowed).  This is somewhat tricky, since multiple contexts may set
 * the same key, so we reference count them, and clean up at exit.  All 4
 * partition keys are packed into a single qlogic_ib register.  It's an
It's an548* error for a process to set the same pkey multiple times. We provide no549* mechanism to de-allocate a pkey at this time, we may eventually need to550* do that. I've used the atomic operations, and no locking, and only make551* a single pass through what's available. This should be more than552* adequate for some time. I'll think about spinlocks or the like if and as553* it's necessary.554*/555static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key)556{557struct qib_pportdata *ppd = rcd->ppd;558int i, any = 0, pidx = -1;559u16 lkey = key & 0x7FFF;560int ret;561562if (lkey == (QIB_DEFAULT_P_KEY & 0x7FFF)) {563/* nothing to do; this key always valid */564ret = 0;565goto bail;566}567568if (!lkey) {569ret = -EINVAL;570goto bail;571}572573/*574* Set the full membership bit, because it has to be575* set in the register or the packet, and it seems576* cleaner to set in the register than to force all577* callers to set it.578*/579key |= 0x8000;580581for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) {582if (!rcd->pkeys[i] && pidx == -1)583pidx = i;584if (rcd->pkeys[i] == key) {585ret = -EEXIST;586goto bail;587}588}589if (pidx == -1) {590ret = -EBUSY;591goto bail;592}593for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {594if (!ppd->pkeys[i]) {595any++;596continue;597}598if (ppd->pkeys[i] == key) {599atomic_t *pkrefs = &ppd->pkeyrefs[i];600601if (atomic_inc_return(pkrefs) > 1) {602rcd->pkeys[pidx] = key;603ret = 0;604goto bail;605} else {606/*607* lost race, decrement count, catch below608*/609atomic_dec(pkrefs);610any++;611}612}613if ((ppd->pkeys[i] & 0x7FFF) == lkey) {614/*615* It makes no sense to have both the limited and616* full membership PKEY set at the same time since617* the unlimited one will disable the limited one.618*/619ret = -EEXIST;620goto bail;621}622}623if (!any) {624ret = -EBUSY;625goto bail;626}627for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {628if (!ppd->pkeys[i] &&629atomic_inc_return(&ppd->pkeyrefs[i]) == 1) {630rcd->pkeys[pidx] = key;631ppd->pkeys[i] = key;632(void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0);633ret = 0;634goto bail;635}636}637ret = -EBUSY;638639bail:640return ret;641}642643/**644* qib_manage_rcvq - manage a context's receive queue645* @rcd: the context646* @subctxt: the subcontext647* @start_stop: action to carry out648*649* start_stop == 0 disables receive on the context, for use in queue650* overflow conditions. start_stop==1 re-enables, to be used to651* re-init the software copy of the head register652*/653static int qib_manage_rcvq(struct qib_ctxtdata *rcd, unsigned subctxt,654int start_stop)655{656struct qib_devdata *dd = rcd->dd;657unsigned int rcvctrl_op;658659if (subctxt)660goto bail;661/* atomically clear receive enable ctxt. */662if (start_stop) {663/*664* On enable, force in-memory copy of the tail register to665* 0, so that protocol code doesn't have to worry about666* whether or not the chip has yet updated the in-memory667* copy or not on return from the system call. 
		 * always resets its tail register back to 0 on a
		 * transition from disabled to enabled.
		 */
		if (rcd->rcvhdrtail_kvaddr)
			qib_clear_rcvhdrtail(rcd);
		rcvctrl_op = QIB_RCVCTRL_CTXT_ENB;
	} else
		rcvctrl_op = QIB_RCVCTRL_CTXT_DIS;
	dd->f_rcvctrl(rcd->ppd, rcvctrl_op, rcd->ctxt);
	/* always; new head should be equal to new tail; see above */
bail:
	return 0;
}

static void qib_clean_part_key(struct qib_ctxtdata *rcd,
			       struct qib_devdata *dd)
{
	int i, j, pchanged = 0;
	u64 oldpkey;
	struct qib_pportdata *ppd = rcd->ppd;

	/* for debugging only */
	oldpkey = (u64) ppd->pkeys[0] |
		((u64) ppd->pkeys[1] << 16) |
		((u64) ppd->pkeys[2] << 32) |
		((u64) ppd->pkeys[3] << 48);

	for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) {
		if (!rcd->pkeys[i])
			continue;
		for (j = 0; j < ARRAY_SIZE(ppd->pkeys); j++) {
			/* check for match independent of the global bit */
			if ((ppd->pkeys[j] & 0x7fff) !=
			    (rcd->pkeys[i] & 0x7fff))
				continue;
			if (atomic_dec_and_test(&ppd->pkeyrefs[j])) {
				ppd->pkeys[j] = 0;
				pchanged++;
			}
			break;
		}
		rcd->pkeys[i] = 0;
	}
	if (pchanged)
		(void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0);
}

/* common code for the mappings on dma_alloc_coherent mem */
static int qib_mmap_mem(struct vm_area_struct *vma, struct qib_ctxtdata *rcd,
			unsigned len, void *kvaddr, u32 write_ok, char *what)
{
	struct qib_devdata *dd = rcd->dd;
	unsigned long pfn;
	int ret;

	if ((vma->vm_end - vma->vm_start) > len) {
		qib_devinfo(dd->pcidev,
			    "FAIL on %s: len %lx > %x\n", what,
			    vma->vm_end - vma->vm_start, len);
		ret = -EFAULT;
		goto bail;
	}

	/*
	 * shared context user code requires rcvhdrq mapped r/w, others
	 * only allowed readonly mapping.
	 */
	if (!write_ok) {
		if (vma->vm_flags & VM_WRITE) {
			qib_devinfo(dd->pcidev,
				    "%s must be mapped readonly\n", what);
			ret = -EPERM;
			goto bail;
		}

		/* don't allow them to later change with mprotect */
		vma->vm_flags &= ~VM_MAYWRITE;
	}

	pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
	ret = remap_pfn_range(vma, vma->vm_start, pfn,
			      len, vma->vm_page_prot);
	if (ret)
		qib_devinfo(dd->pcidev, "%s ctxt%u mmap of %lx, %x "
			    "bytes failed: %d\n", what, rcd->ctxt,
			    pfn, len, ret);
bail:
	return ret;
}

static int mmap_ureg(struct vm_area_struct *vma, struct qib_devdata *dd,
		     u64 ureg)
{
	unsigned long phys;
	unsigned long sz;
	int ret;

	/*
	 * This is real hardware, so use io_remap.  This is the mechanism
	 * for the user process to update the head registers for their ctxt
	 * in the chip.
	 */
	sz = dd->flags & QIB_HAS_HDRSUPP ? 2 * PAGE_SIZE : PAGE_SIZE;
	if ((vma->vm_end - vma->vm_start) > sz) {
		qib_devinfo(dd->pcidev, "FAIL mmap userreg: reqlen "
			    "%lx > PAGE\n", vma->vm_end - vma->vm_start);
		ret = -EFAULT;
	} else {
		phys = dd->physaddr + ureg;
		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
		ret = io_remap_pfn_range(vma, vma->vm_start,
					 phys >> PAGE_SHIFT,
					 vma->vm_end - vma->vm_start,
					 vma->vm_page_prot);
	}
	return ret;
}

static int mmap_piobufs(struct vm_area_struct *vma,
			struct qib_devdata *dd,
			struct qib_ctxtdata *rcd,
			unsigned piobufs, unsigned piocnt)
{
	unsigned long phys;
	int ret;

	/*
	 * When we map the PIO buffers in the chip, we want to map them as
	 * writeonly, no read possible; unfortunately, x86 doesn't allow
	 * for this in hardware, but we still prevent users from asking
	 * for it.
	 */
	if ((vma->vm_end - vma->vm_start) > (piocnt * dd->palign)) {
		qib_devinfo(dd->pcidev, "FAIL mmap piobufs: "
			    "reqlen %lx > PAGE\n",
			    vma->vm_end - vma->vm_start);
		ret = -EINVAL;
		goto bail;
	}

	phys = dd->physaddr + piobufs;

#if defined(__powerpc__)
	/* There isn't a generic way to specify writethrough mappings */
	pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
	pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
	pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
#endif

	/*
	 * don't allow them to later change to readable with mprotect (for when
	 * not initially mapped readable, as is normally the case)
	 */
	vma->vm_flags &= ~VM_MAYREAD;
	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;

	if (qib_wc_pat)
		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);

	ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
				 vma->vm_end - vma->vm_start,
				 vma->vm_page_prot);
bail:
	return ret;
}

static int mmap_rcvegrbufs(struct vm_area_struct *vma,
			   struct qib_ctxtdata *rcd)
{
	struct qib_devdata *dd = rcd->dd;
	unsigned long start, size;
	size_t total_size, i;
	unsigned long pfn;
	int ret;

	size = rcd->rcvegrbuf_size;
	total_size = rcd->rcvegrbuf_chunks * size;
	if ((vma->vm_end - vma->vm_start) > total_size) {
		qib_devinfo(dd->pcidev, "FAIL on egr bufs: "
			    "reqlen %lx > actual %lx\n",
			    vma->vm_end - vma->vm_start,
			    (unsigned long) total_size);
		ret = -EINVAL;
		goto bail;
	}

	if (vma->vm_flags & VM_WRITE) {
		qib_devinfo(dd->pcidev, "Can't map eager buffers as "
			    "writable (flags=%lx)\n", vma->vm_flags);
		ret = -EPERM;
		goto bail;
	}
	/* don't allow them to later change to writeable with mprotect */
	vma->vm_flags &= ~VM_MAYWRITE;

	start = vma->vm_start;

	for (i = 0; i < rcd->rcvegrbuf_chunks; i++, start += size) {
		pfn = virt_to_phys(rcd->rcvegrbuf[i]) >> PAGE_SHIFT;
		ret = remap_pfn_range(vma, start, pfn, size,
				      vma->vm_page_prot);
		if (ret < 0)
			goto bail;
	}
	ret = 0;

bail:
	return ret;
}

/*
 * qib_file_vma_fault - handle a VMA page fault.
 */
static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page;

	page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
	if (!page)
		return VM_FAULT_SIGBUS;

	get_page(page);
	vmf->page = page;

	return 0;
}

static struct vm_operations_struct qib_file_vm_ops = {
	.fault = qib_file_vma_fault,
};

static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
		       struct qib_ctxtdata *rcd, unsigned subctxt)
{
	struct qib_devdata *dd = rcd->dd;
	unsigned subctxt_cnt;
	unsigned long len;
	void *addr;
	size_t size;
	int ret = 0;

	subctxt_cnt = rcd->subctxt_cnt;
	size = rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size;

	/*
	 * Each process has all the subctxt uregbase, rcvhdrq, and
	 * rcvegrbufs mmapped - as an array for all the processes,
	 * and also separately for this process.
	 */
	if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase)) {
		addr = rcd->subctxt_uregbase;
		size = PAGE_SIZE * subctxt_cnt;
	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base)) {
		addr = rcd->subctxt_rcvhdr_base;
		size = rcd->rcvhdrq_size * subctxt_cnt;
	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf)) {
		addr = rcd->subctxt_rcvegrbuf;
		size *= subctxt_cnt;
	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase +
					PAGE_SIZE * subctxt)) {
		addr = rcd->subctxt_uregbase + PAGE_SIZE * subctxt;
		size = PAGE_SIZE;
	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base +
					rcd->rcvhdrq_size * subctxt)) {
		addr = rcd->subctxt_rcvhdr_base +
			rcd->rcvhdrq_size * subctxt;
		size = rcd->rcvhdrq_size;
	} else if (pgaddr == cvt_kvaddr(&rcd->user_event_mask[subctxt])) {
		addr = rcd->user_event_mask;
		size = PAGE_SIZE;
	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf +
					size * subctxt)) {
		addr = rcd->subctxt_rcvegrbuf + size * subctxt;
		/* rcvegrbufs are read-only on the slave */
		if (vma->vm_flags & VM_WRITE) {
			qib_devinfo(dd->pcidev,
				    "Can't map eager buffers as "
				    "writable (flags=%lx)\n", vma->vm_flags);
			ret = -EPERM;
			goto bail;
		}
		/*
		 * Don't allow permission to later change to writeable
		 * with mprotect.
		 */
		vma->vm_flags &= ~VM_MAYWRITE;
	} else
		goto bail;
	len = vma->vm_end - vma->vm_start;
	if (len > size) {
		ret = -EINVAL;
		goto bail;
	}

	vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
	vma->vm_ops = &qib_file_vm_ops;
	vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
	ret = 1;

bail:
	return ret;
}

/**
 * qib_mmapf - mmap various structures into user space
 * @fp: the file pointer
 * @vma: the VM area
 *
 * We use this to have a shared buffer between the kernel and the user code
 * for the rcvhdr queue, egr buffers, and the per-context user regs and pio
 * buffers in the chip.  We have the open and close entries so we can bump
 * the ref count and keep the driver from being unloaded while still mapped.
 */
static int qib_mmapf(struct file *fp, struct vm_area_struct *vma)
{
	struct qib_ctxtdata *rcd;
	struct qib_devdata *dd;
	u64 pgaddr, ureg;
	unsigned piobufs, piocnt;
	int ret, match = 1;

	rcd = ctxt_fp(fp);
	if (!rcd || !(vma->vm_flags & VM_SHARED)) {
		ret = -EINVAL;
		goto bail;
	}
	dd = rcd->dd;

	/*
	 * This is the qib_do_user_init() code, mapping the shared buffers
	 * and per-context user registers into the user process.  The address
	 * referred to by vm_pgoff is the file offset passed via mmap().
	 * For shared contexts, this is the kernel vmalloc() address of the
	 * pages to share with the master.
	 * For non-shared or master ctxts, this is a physical address.
	 * We only do one mmap for each space mapped.
	 */
	pgaddr = vma->vm_pgoff << PAGE_SHIFT;

	/*
	 * Check for 0 in case one of the allocations failed, but user
	 * called mmap anyway.
	 */
	if (!pgaddr) {
		ret = -EINVAL;
		goto bail;
	}

	/*
	 * Physical addresses must fit in 40 bits for our hardware.
	 * Check for kernel virtual addresses first, anything else must
	 * match a HW or memory address.
	 */
	ret = mmap_kvaddr(vma, pgaddr, rcd, subctxt_fp(fp));
	if (ret) {
		if (ret > 0)
			ret = 0;
		goto bail;
	}

	ureg = dd->uregbase + dd->ureg_align * rcd->ctxt;
	if (!rcd->subctxt_cnt) {
		/* ctxt is not shared */
		piocnt = rcd->piocnt;
		piobufs = rcd->piobufs;
	} else if (!subctxt_fp(fp)) {
		/* caller is the master */
		piocnt = (rcd->piocnt / rcd->subctxt_cnt) +
			 (rcd->piocnt % rcd->subctxt_cnt);
		piobufs = rcd->piobufs +
			dd->palign * (rcd->piocnt - piocnt);
	} else {
		unsigned slave = subctxt_fp(fp) - 1;

		/* caller is a slave */
		piocnt = rcd->piocnt / rcd->subctxt_cnt;
		piobufs = rcd->piobufs + dd->palign * piocnt * slave;
	}

	if (pgaddr == ureg)
		ret = mmap_ureg(vma, dd, ureg);
	else if (pgaddr == piobufs)
		ret = mmap_piobufs(vma, dd, rcd, piobufs, piocnt);
	else if (pgaddr == dd->pioavailregs_phys)
		/* in-memory copy of pioavail registers */
		ret = qib_mmap_mem(vma, rcd, PAGE_SIZE,
				   (void *) dd->pioavailregs_dma, 0,
				   "pioavail registers");
	else if (pgaddr == rcd->rcvegr_phys)
		ret = mmap_rcvegrbufs(vma, rcd);
	else if (pgaddr == (u64) rcd->rcvhdrq_phys)
		/*
		 * The rcvhdrq itself; multiple pages, contiguous
		 * from an i/o perspective.  Shared contexts need
		 * to map r/w, so we allow writing.
		 */
		ret = qib_mmap_mem(vma, rcd, rcd->rcvhdrq_size,
				   rcd->rcvhdrq, 1, "rcvhdrq");
	else if (pgaddr == (u64) rcd->rcvhdrqtailaddr_phys)
		/* in-memory copy of rcvhdrq tail register */
		ret = qib_mmap_mem(vma, rcd, PAGE_SIZE,
				   rcd->rcvhdrtail_kvaddr, 0,
				   "rcvhdrq tail");
	else
		match = 0;
	if (!match)
		ret = -EINVAL;

	vma->vm_private_data = NULL;

	if (ret < 0)
		qib_devinfo(dd->pcidev,
			    "mmap Failure %d: off %llx len %lx\n",
			    -ret, (unsigned long long)pgaddr,
			    vma->vm_end - vma->vm_start);
bail:
	return ret;
}

static unsigned int qib_poll_urgent(struct qib_ctxtdata *rcd,
				    struct file *fp,
				    struct poll_table_struct *pt)
{
	struct qib_devdata *dd = rcd->dd;
	unsigned pollflag;

	poll_wait(fp, &rcd->wait, pt);

	spin_lock_irq(&dd->uctxt_lock);
	if (rcd->urgent != rcd->urgent_poll) {
		pollflag = POLLIN | POLLRDNORM;
		rcd->urgent_poll = rcd->urgent;
	} else {
		pollflag = 0;
		set_bit(QIB_CTXT_WAITING_URG, &rcd->flag);
	}
	spin_unlock_irq(&dd->uctxt_lock);

	return pollflag;
}

static unsigned int qib_poll_next(struct qib_ctxtdata *rcd,
				  struct file *fp,
				  struct poll_table_struct *pt)
{
	struct qib_devdata *dd = rcd->dd;
	unsigned pollflag;

	poll_wait(fp, &rcd->wait, pt);

	spin_lock_irq(&dd->uctxt_lock);
	if (dd->f_hdrqempty(rcd)) {
		set_bit(QIB_CTXT_WAITING_RCV, &rcd->flag);
		dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_ENB, rcd->ctxt);
		pollflag = 0;
	} else
		pollflag = POLLIN | POLLRDNORM;
	spin_unlock_irq(&dd->uctxt_lock);

	return pollflag;
}

static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt)
{
	struct qib_ctxtdata *rcd;
	unsigned pollflag;

	rcd = ctxt_fp(fp);
	if (!rcd)
		pollflag = POLLERR;
	else if (rcd->poll_type == QIB_POLL_TYPE_URGENT)
		pollflag = qib_poll_urgent(rcd, fp, pt);
	else if (rcd->poll_type == QIB_POLL_TYPE_ANYRCV)
		pollflag = qib_poll_next(rcd, fp, pt);
	else /* invalid */
		pollflag = POLLERR;

	return pollflag;
}

/*
 * Check that userland and driver are compatible for subcontexts.
 */
static int qib_compatible_subctxts(int user_swmajor, int user_swminor)
{
	/* this code is written long-hand for clarity */
	if (QIB_USER_SWMAJOR != user_swmajor) {
		/* no promise of compatibility if major mismatch */
		return 0;
	}
	if (QIB_USER_SWMAJOR == 1) {
		switch (QIB_USER_SWMINOR) {
		case 0:
		case 1:
		case 2:
			/* no subctxt implementation so cannot be compatible */
			return 0;
		case 3:
			/* 3 is only compatible with itself */
			return user_swminor == 3;
		default:
			/* >= 4 are compatible (or are expected to be) */
			return user_swminor >= 4;
		}
	}
	/* make no promises yet for future major versions */
	return 0;
}

static int init_subctxts(struct qib_devdata *dd,
			 struct qib_ctxtdata *rcd,
			 const struct qib_user_info *uinfo)
{
	int ret = 0;
	unsigned num_subctxts;
	size_t size;

	/*
	 * If the user is requesting zero subctxts,
	 * skip the subctxt allocation.
	 */
	if (uinfo->spu_subctxt_cnt <= 0)
		goto bail;
	num_subctxts = uinfo->spu_subctxt_cnt;

	/* Check for subctxt compatibility */
	if (!qib_compatible_subctxts(uinfo->spu_userversion >> 16,
				     uinfo->spu_userversion & 0xffff)) {
		qib_devinfo(dd->pcidev,
			    "Mismatched user version (%d.%d) and driver "
			    "version (%d.%d) while context sharing. Ensure "
Ensure "1201"that driver and library are from the same "1202"release.\n",1203(int) (uinfo->spu_userversion >> 16),1204(int) (uinfo->spu_userversion & 0xffff),1205QIB_USER_SWMAJOR, QIB_USER_SWMINOR);1206goto bail;1207}1208if (num_subctxts > QLOGIC_IB_MAX_SUBCTXT) {1209ret = -EINVAL;1210goto bail;1211}12121213rcd->subctxt_uregbase = vmalloc_user(PAGE_SIZE * num_subctxts);1214if (!rcd->subctxt_uregbase) {1215ret = -ENOMEM;1216goto bail;1217}1218/* Note: rcd->rcvhdrq_size isn't initialized yet. */1219size = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize *1220sizeof(u32), PAGE_SIZE) * num_subctxts;1221rcd->subctxt_rcvhdr_base = vmalloc_user(size);1222if (!rcd->subctxt_rcvhdr_base) {1223ret = -ENOMEM;1224goto bail_ureg;1225}12261227rcd->subctxt_rcvegrbuf = vmalloc_user(rcd->rcvegrbuf_chunks *1228rcd->rcvegrbuf_size *1229num_subctxts);1230if (!rcd->subctxt_rcvegrbuf) {1231ret = -ENOMEM;1232goto bail_rhdr;1233}12341235rcd->subctxt_cnt = uinfo->spu_subctxt_cnt;1236rcd->subctxt_id = uinfo->spu_subctxt_id;1237rcd->active_slaves = 1;1238rcd->redirect_seq_cnt = 1;1239set_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag);1240goto bail;12411242bail_rhdr:1243vfree(rcd->subctxt_rcvhdr_base);1244bail_ureg:1245vfree(rcd->subctxt_uregbase);1246rcd->subctxt_uregbase = NULL;1247bail:1248return ret;1249}12501251static int setup_ctxt(struct qib_pportdata *ppd, int ctxt,1252struct file *fp, const struct qib_user_info *uinfo)1253{1254struct qib_devdata *dd = ppd->dd;1255struct qib_ctxtdata *rcd;1256void *ptmp = NULL;1257int ret;12581259rcd = qib_create_ctxtdata(ppd, ctxt);12601261/*1262* Allocate memory for use in qib_tid_update() at open to1263* reduce cost of expected send setup per message segment1264*/1265if (rcd)1266ptmp = kmalloc(dd->rcvtidcnt * sizeof(u16) +1267dd->rcvtidcnt * sizeof(struct page **),1268GFP_KERNEL);12691270if (!rcd || !ptmp) {1271qib_dev_err(dd, "Unable to allocate ctxtdata "1272"memory, failing open\n");1273ret = -ENOMEM;1274goto bailerr;1275}1276rcd->userversion = uinfo->spu_userversion;1277ret = init_subctxts(dd, rcd, uinfo);1278if (ret)1279goto bailerr;1280rcd->tid_pg_list = ptmp;1281rcd->pid = current->pid;1282init_waitqueue_head(&dd->rcd[ctxt]->wait);1283strlcpy(rcd->comm, current->comm, sizeof(rcd->comm));1284ctxt_fp(fp) = rcd;1285qib_stats.sps_ctxts++;1286ret = 0;1287goto bail;12881289bailerr:1290dd->rcd[ctxt] = NULL;1291kfree(rcd);1292kfree(ptmp);1293bail:1294return ret;1295}12961297static inline int usable(struct qib_pportdata *ppd)1298{1299struct qib_devdata *dd = ppd->dd;13001301return dd && (dd->flags & QIB_PRESENT) && dd->kregbase && ppd->lid &&1302(ppd->lflags & QIBL_LINKACTIVE);1303}13041305/*1306* Select a context on the given device, either using a requested port1307* or the port based on the context number.1308*/1309static int choose_port_ctxt(struct file *fp, struct qib_devdata *dd, u32 port,1310const struct qib_user_info *uinfo)1311{1312struct qib_pportdata *ppd = NULL;1313int ret, ctxt;13141315if (port) {1316if (!usable(dd->pport + port - 1)) {1317ret = -ENETDOWN;1318goto done;1319} else1320ppd = dd->pport + port - 1;1321}1322for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts && dd->rcd[ctxt];1323ctxt++)1324;1325if (ctxt == dd->cfgctxts) {1326ret = -EBUSY;1327goto done;1328}1329if (!ppd) {1330u32 pidx = ctxt % dd->num_pports;1331if (usable(dd->pport + pidx))1332ppd = dd->pport + pidx;1333else {1334for (pidx = 0; pidx < dd->num_pports && !ppd;1335pidx++)1336if (usable(dd->pport + pidx))1337ppd = dd->pport + pidx;1338}1339}1340ret = ppd ? 
done:
	return ret;
}

static int find_free_ctxt(int unit, struct file *fp,
			  const struct qib_user_info *uinfo)
{
	struct qib_devdata *dd = qib_lookup(unit);
	int ret;

	if (!dd || (uinfo->spu_port && uinfo->spu_port > dd->num_pports))
		ret = -ENODEV;
	else
		ret = choose_port_ctxt(fp, dd, uinfo->spu_port, uinfo);

	return ret;
}

static int get_a_ctxt(struct file *fp, const struct qib_user_info *uinfo,
		      unsigned alg)
{
	struct qib_devdata *udd = NULL;
	int ret = 0, devmax, npresent, nup, ndev, dusable = 0, i;
	u32 port = uinfo->spu_port, ctxt;

	devmax = qib_count_units(&npresent, &nup);
	if (!npresent) {
		ret = -ENXIO;
		goto done;
	}
	if (nup == 0) {
		ret = -ENETDOWN;
		goto done;
	}

	if (alg == QIB_PORT_ALG_ACROSS) {
		unsigned inuse = ~0U;

		/* find device (with ACTIVE ports) with fewest ctxts in use */
		for (ndev = 0; ndev < devmax; ndev++) {
			struct qib_devdata *dd = qib_lookup(ndev);
			unsigned cused = 0, cfree = 0, pusable = 0;

			if (!dd)
				continue;
			if (port && port <= dd->num_pports &&
			    usable(dd->pport + port - 1))
				pusable = 1;
			else
				for (i = 0; i < dd->num_pports; i++)
					if (usable(dd->pport + i))
						pusable++;
			if (!pusable)
				continue;
			for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts;
			     ctxt++)
				if (dd->rcd[ctxt])
					cused++;
				else
					cfree++;
			if (pusable && cfree && cused < inuse) {
				udd = dd;
				inuse = cused;
			}
		}
		if (udd) {
			ret = choose_port_ctxt(fp, udd, port, uinfo);
			goto done;
		}
	} else {
		for (ndev = 0; ndev < devmax; ndev++) {
			struct qib_devdata *dd = qib_lookup(ndev);

			if (dd) {
				ret = choose_port_ctxt(fp, dd, port, uinfo);
				if (!ret)
					goto done;
				if (ret == -EBUSY)
					dusable++;
			}
		}
	}
	ret = dusable ? -EBUSY : -ENETDOWN;

done:
	return ret;
}

static int find_shared_ctxt(struct file *fp,
			    const struct qib_user_info *uinfo)
{
	int devmax, ndev, i;
	int ret = 0;

	devmax = qib_count_units(NULL, NULL);

	for (ndev = 0; ndev < devmax; ndev++) {
		struct qib_devdata *dd = qib_lookup(ndev);

		/* device portion of usable() */
		if (!(dd && (dd->flags & QIB_PRESENT) && dd->kregbase))
			continue;
		for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) {
			struct qib_ctxtdata *rcd = dd->rcd[i];

			/* Skip ctxts which are not yet open */
			if (!rcd || !rcd->cnt)
				continue;
			/* Skip ctxt if it doesn't match the requested one */
			if (rcd->subctxt_id != uinfo->spu_subctxt_id)
				continue;
			/* Verify the sharing process matches the master */
			if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt ||
			    rcd->userversion != uinfo->spu_userversion ||
			    rcd->cnt >= rcd->subctxt_cnt) {
				ret = -EINVAL;
				goto done;
			}
			ctxt_fp(fp) = rcd;
			subctxt_fp(fp) = rcd->cnt++;
			rcd->subpid[subctxt_fp(fp)] = current->pid;
			tidcursor_fp(fp) = 0;
			rcd->active_slaves |= 1 << subctxt_fp(fp);
			ret = 1;
			goto done;
		}
	}

done:
	return ret;
}

static int qib_open(struct inode *in, struct file *fp)
{
	/* The real work is performed later in qib_assign_ctxt() */
	fp->private_data = kzalloc(sizeof(struct qib_filedata), GFP_KERNEL);
	if (fp->private_data) /* no cpu affinity by default */
		((struct qib_filedata *)fp->private_data)->rec_cpu_num = -1;
	return fp->private_data ? 0 : -ENOMEM;
}

/*
 * Get ctxt early, so can set affinity prior to memory allocation.
 */
static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
{
	int ret;
	int i_minor;
	unsigned swmajor, swminor, alg = QIB_PORT_ALG_ACROSS;

	/* Check to be sure we haven't already initialized this file */
	if (ctxt_fp(fp)) {
		ret = -EINVAL;
		goto done;
	}

	/* for now, if major version is different, bail */
	swmajor = uinfo->spu_userversion >> 16;
	if (swmajor != QIB_USER_SWMAJOR) {
		ret = -ENODEV;
		goto done;
	}

	swminor = uinfo->spu_userversion & 0xffff;

	if (swminor >= 11 && uinfo->spu_port_alg < QIB_PORT_ALG_COUNT)
		alg = uinfo->spu_port_alg;

	mutex_lock(&qib_mutex);

	if (qib_compatible_subctxts(swmajor, swminor) &&
	    uinfo->spu_subctxt_cnt) {
		ret = find_shared_ctxt(fp, uinfo);
		if (ret) {
			if (ret > 0)
				ret = 0;
			goto done_chk_sdma;
		}
	}

	i_minor = iminor(fp->f_dentry->d_inode) - QIB_USER_MINOR_BASE;
	if (i_minor)
		ret = find_free_ctxt(i_minor - 1, fp, uinfo);
	else
		ret = get_a_ctxt(fp, uinfo, alg);

done_chk_sdma:
	if (!ret) {
		struct qib_filedata *fd = fp->private_data;
		const struct qib_ctxtdata *rcd = fd->rcd;
		const struct qib_devdata *dd = rcd->dd;

		if (dd->flags & QIB_HAS_SEND_DMA) {
			fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev,
							    dd->unit,
							    rcd->ctxt,
							    fd->subctxt);
			if (!fd->pq)
				ret = -ENOMEM;
		}

		/*
		 * If process has NOT already set its affinity, select and
		 * reserve a processor for it, as a rendezvous for all
		 * users of the driver.  If they don't actually later
		 * set affinity to this cpu, or set it to some other cpu,
		 * it just means that sooner or later we don't recommend
		 * a cpu, and let the scheduler do its best.
		 */
		if (!ret && cpus_weight(current->cpus_allowed) >=
		    qib_cpulist_count) {
			int cpu;

			cpu = find_first_zero_bit(qib_cpulist,
						  qib_cpulist_count);
			if (cpu != qib_cpulist_count) {
				__set_bit(cpu, qib_cpulist);
				fd->rec_cpu_num = cpu;
			}
		} else if (cpus_weight(current->cpus_allowed) == 1 &&
			   test_bit(first_cpu(current->cpus_allowed),
				    qib_cpulist))
			qib_devinfo(dd->pcidev, "%s PID %u affinity "
				    "set to cpu %d; already allocated\n",
				    current->comm, current->pid,
				    first_cpu(current->cpus_allowed));
	}

	mutex_unlock(&qib_mutex);

done:
	return ret;
}


static int qib_do_user_init(struct file *fp,
			    const struct qib_user_info *uinfo)
{
	int ret;
	struct qib_ctxtdata *rcd = ctxt_fp(fp);
	struct qib_devdata *dd;
	unsigned uctxt;

	/* Subctxts don't need to initialize anything since master did it. */
	if (subctxt_fp(fp)) {
		ret = wait_event_interruptible(rcd->wait,
			!test_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag));
		goto bail;
	}

	dd = rcd->dd;

	/* some ctxts may get extra buffers, calculate that here */
	uctxt = rcd->ctxt - dd->first_user_ctxt;
	if (uctxt < dd->ctxts_extrabuf) {
		rcd->piocnt = dd->pbufsctxt + 1;
		rcd->pio_base = rcd->piocnt * uctxt;
	} else {
		rcd->piocnt = dd->pbufsctxt;
		rcd->pio_base = rcd->piocnt * uctxt +
			dd->ctxts_extrabuf;
	}

	/*
	 * All user buffers are 2KB buffers.  If we ever support
	 * giving 4KB buffers to user processes, this will need some
	 * work.  Can't use piobufbase directly, because it has
	 * both 2K and 4K buffer base values.  So check and handle.
	 */
	if ((rcd->pio_base + rcd->piocnt) > dd->piobcnt2k) {
		if (rcd->pio_base >= dd->piobcnt2k) {
			qib_dev_err(dd,
				    "%u:ctxt%u: no 2KB buffers available\n",
				    dd->unit, rcd->ctxt);
			ret = -ENOBUFS;
			goto bail;
		}
		rcd->piocnt = dd->piobcnt2k - rcd->pio_base;
		qib_dev_err(dd, "Ctxt%u: would use 4KB bufs, using %u\n",
			    rcd->ctxt, rcd->piocnt);
	}

	rcd->piobufs = dd->pio2k_bufbase + rcd->pio_base * dd->palign;
	qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt,
			       TXCHK_CHG_TYPE_USER, rcd);
	/*
	 * try to ensure that processes start up with consistent avail update
	 * for their own range, at least.  If system very quiet, it might
	 * have the in-memory copy out of date at startup for this range of
	 * buffers, when a context gets re-used.  Do after the chg_pioavail
	 * and before the rest of setup, so it's "almost certain" the dma
	 * will have occurred (can't 100% guarantee, but should be many
	 * decimals of 9s, with this ordering), given how much else happens
	 * after this.
	 */
	dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);

	/*
	 * Now allocate the rcvhdr Q and eager TIDs; skip the TID
	 * array for time being.  If rcd->ctxt > chip-supported,
	 * we need to do extra stuff here to handle by handling overflow
	 * through ctxt 0, someday
	 */
	ret = qib_create_rcvhdrq(dd, rcd);
	if (!ret)
		ret = qib_setup_eagerbufs(rcd);
	if (ret)
		goto bail_pio;

	rcd->tidcursor = 0; /* start at beginning after open */

	/* initialize poll variables... */
	rcd->urgent = 0;
	rcd->urgent_poll = 0;

	/*
	 * Now enable the ctxt for receive.
	 * For chips that are set to DMA the tail register to memory
	 * when they change (and when the update bit transitions from
	 * 0 to 1).  So for those chips, we turn it off and then back on.
	 * This will (very briefly) affect any other open ctxts, but the
	 * duration is very short, and therefore isn't an issue.  We
We1659* explicitly set the in-memory tail copy to 0 beforehand, so we1660* don't have to wait to be sure the DMA update has happened1661* (chip resets head/tail to 0 on transition to enable).1662*/1663if (rcd->rcvhdrtail_kvaddr)1664qib_clear_rcvhdrtail(rcd);16651666dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_TIDFLOW_ENB,1667rcd->ctxt);16681669/* Notify any waiting slaves */1670if (rcd->subctxt_cnt) {1671clear_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag);1672wake_up(&rcd->wait);1673}1674return 0;16751676bail_pio:1677qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt,1678TXCHK_CHG_TYPE_KERN, rcd);1679bail:1680return ret;1681}16821683/**1684* unlock_exptid - unlock any expected TID entries context still had in use1685* @rcd: ctxt1686*1687* We don't actually update the chip here, because we do a bulk update1688* below, using f_clear_tids.1689*/1690static void unlock_expected_tids(struct qib_ctxtdata *rcd)1691{1692struct qib_devdata *dd = rcd->dd;1693int ctxt_tidbase = rcd->ctxt * dd->rcvtidcnt;1694int i, cnt = 0, maxtid = ctxt_tidbase + dd->rcvtidcnt;16951696for (i = ctxt_tidbase; i < maxtid; i++) {1697struct page *p = dd->pageshadow[i];1698dma_addr_t phys;16991700if (!p)1701continue;17021703phys = dd->physshadow[i];1704dd->physshadow[i] = dd->tidinvalid;1705dd->pageshadow[i] = NULL;1706pci_unmap_page(dd->pcidev, phys, PAGE_SIZE,1707PCI_DMA_FROMDEVICE);1708qib_release_user_pages(&p, 1);1709cnt++;1710}1711}17121713static int qib_close(struct inode *in, struct file *fp)1714{1715int ret = 0;1716struct qib_filedata *fd;1717struct qib_ctxtdata *rcd;1718struct qib_devdata *dd;1719unsigned long flags;1720unsigned ctxt;1721pid_t pid;17221723mutex_lock(&qib_mutex);17241725fd = fp->private_data;1726fp->private_data = NULL;1727rcd = fd->rcd;1728if (!rcd) {1729mutex_unlock(&qib_mutex);1730goto bail;1731}17321733dd = rcd->dd;17341735/* ensure all pio buffer writes in progress are flushed */1736qib_flush_wc();17371738/* drain user sdma queue */1739if (fd->pq) {1740qib_user_sdma_queue_drain(rcd->ppd, fd->pq);1741qib_user_sdma_queue_destroy(fd->pq);1742}17431744if (fd->rec_cpu_num != -1)1745__clear_bit(fd->rec_cpu_num, qib_cpulist);17461747if (--rcd->cnt) {1748/*1749* XXX If the master closes the context before the slave(s),1750* revoke the mmap for the eager receive queue so1751* the slave(s) don't wait for receive data forever.1752*/1753rcd->active_slaves &= ~(1 << fd->subctxt);1754rcd->subpid[fd->subctxt] = 0;1755mutex_unlock(&qib_mutex);1756goto bail;1757}17581759/* early; no interrupt users after this */1760spin_lock_irqsave(&dd->uctxt_lock, flags);1761ctxt = rcd->ctxt;1762dd->rcd[ctxt] = NULL;1763pid = rcd->pid;1764rcd->pid = 0;1765spin_unlock_irqrestore(&dd->uctxt_lock, flags);17661767if (rcd->rcvwait_to || rcd->piowait_to ||1768rcd->rcvnowait || rcd->pionowait) {1769rcd->rcvwait_to = 0;1770rcd->piowait_to = 0;1771rcd->rcvnowait = 0;1772rcd->pionowait = 0;1773}1774if (rcd->flag)1775rcd->flag = 0;17761777if (dd->kregbase) {1778/* atomically clear receive enable ctxt and intr avail. 
		dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_DIS |
			      QIB_RCVCTRL_INTRAVAIL_DIS, ctxt);

		/* clean up the pkeys for this ctxt user */
		qib_clean_part_key(rcd, dd);
		qib_disarm_piobufs(dd, rcd->pio_base, rcd->piocnt);
		qib_chg_pioavailkernel(dd, rcd->pio_base,
				       rcd->piocnt, TXCHK_CHG_TYPE_KERN, NULL);

		dd->f_clear_tids(dd, rcd);

		if (dd->pageshadow)
			unlock_expected_tids(rcd);
		qib_stats.sps_ctxts--;
	}

	mutex_unlock(&qib_mutex);
	qib_free_ctxtdata(dd, rcd); /* after releasing the mutex */

bail:
	kfree(fd);
	return ret;
}

static int qib_ctxt_info(struct file *fp, struct qib_ctxt_info __user *uinfo)
{
	struct qib_ctxt_info info;
	int ret;
	size_t sz;
	struct qib_ctxtdata *rcd = ctxt_fp(fp);
	struct qib_filedata *fd;

	fd = fp->private_data;

	info.num_active = qib_count_active_units();
	info.unit = rcd->dd->unit;
	info.port = rcd->ppd->port;
	info.ctxt = rcd->ctxt;
	info.subctxt = subctxt_fp(fp);
	/* Number of user ctxts available for this device. */
	info.num_ctxts = rcd->dd->cfgctxts - rcd->dd->first_user_ctxt;
	info.num_subctxts = rcd->subctxt_cnt;
	info.rec_cpu = fd->rec_cpu_num;
	sz = sizeof(info);

	if (copy_to_user(uinfo, &info, sz)) {
		ret = -EFAULT;
		goto bail;
	}
	ret = 0;

bail:
	return ret;
}

static int qib_sdma_get_inflight(struct qib_user_sdma_queue *pq,
				 u32 __user *inflightp)
{
	const u32 val = qib_user_sdma_inflight_counter(pq);

	if (put_user(val, inflightp))
		return -EFAULT;

	return 0;
}

static int qib_sdma_get_complete(struct qib_pportdata *ppd,
				 struct qib_user_sdma_queue *pq,
				 u32 __user *completep)
{
	u32 val;
	int err;

	if (!pq)
		return -EINVAL;

	err = qib_user_sdma_make_progress(ppd, pq);
	if (err < 0)
		return err;

	val = qib_user_sdma_complete_counter(pq);
	if (put_user(val, completep))
		return -EFAULT;

	return 0;
}

static int disarm_req_delay(struct qib_ctxtdata *rcd)
{
	int ret = 0;

	if (!usable(rcd->ppd)) {
		int i;

		/*
		 * if link is down, or otherwise not usable, delay
		 * the caller up to 30 seconds, so we don't thrash
		 * in trying to get the chip back to ACTIVE, and
		 * set flag so they make the call again.
		 */
		if (rcd->user_event_mask) {
			/*
			 * subctxt_cnt is 0 if not shared, so do base
			 * separately, first, then remaining subctxt, if any
			 */
			set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
				&rcd->user_event_mask[0]);
			for (i = 1; i < rcd->subctxt_cnt; i++)
				set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
					&rcd->user_event_mask[i]);
		}
		for (i = 0; !usable(rcd->ppd) && i < 300; i++)
			msleep(100);
		ret = -ENETDOWN;
	}
	return ret;
}

/*
 * Find all user contexts in use, and set the specified bit in their
 * event mask.
 * See also find_ctxt() for a similar use, that is specific to send buffers.
 */
int qib_set_uevent_bits(struct qib_pportdata *ppd, const int evtbit)
{
	struct qib_ctxtdata *rcd;
	unsigned ctxt;
	int ret = 0;

	spin_lock(&ppd->dd->uctxt_lock);
	for (ctxt = ppd->dd->first_user_ctxt; ctxt < ppd->dd->cfgctxts;
	     ctxt++) {
		rcd = ppd->dd->rcd[ctxt];
		if (!rcd)
			continue;
		if (rcd->user_event_mask) {
			int i;

			/*
			 * subctxt_cnt is 0 if not shared, so do base
			 * separately, first, then remaining subctxt, if any
			 */
			set_bit(evtbit, &rcd->user_event_mask[0]);
			for (i = 1; i < rcd->subctxt_cnt; i++)
				set_bit(evtbit, &rcd->user_event_mask[i]);
		}
		ret = 1;
		break;
	}
	spin_unlock(&ppd->dd->uctxt_lock);

	return ret;
}

/*
 * clear the event notifier events for this context.
 * For the DISARM_BUFS case, we also take action (this obsoletes
 * the older QIB_CMD_DISARM_BUFS, but we keep it for backwards
 * compatibility).
 * Other bits don't currently require actions, just atomically clear.
 * User process then performs actions appropriate to bit having been
 * set, if desired, and checks again in future.
 */
static int qib_user_event_ack(struct qib_ctxtdata *rcd, int subctxt,
			      unsigned long events)
{
	int ret = 0, i;

	for (i = 0; i <= _QIB_MAX_EVENT_BIT; i++) {
		if (!test_bit(i, &events))
			continue;
		if (i == _QIB_EVENT_DISARM_BUFS_BIT) {
			(void)qib_disarm_piobufs_ifneeded(rcd);
			ret = disarm_req_delay(rcd);
		} else
			clear_bit(i, &rcd->user_event_mask[subctxt]);
	}
	return ret;
}

static ssize_t qib_write(struct file *fp, const char __user *data,
			 size_t count, loff_t *off)
{
	const struct qib_cmd __user *ucmd;
	struct qib_ctxtdata *rcd;
	const void __user *src;
	size_t consumed, copy = 0;
	struct qib_cmd cmd;
	ssize_t ret = 0;
	void *dest;

	if (count < sizeof(cmd.type)) {
		ret = -EINVAL;
		goto bail;
	}

	ucmd = (const struct qib_cmd __user *) data;

	if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
		ret = -EFAULT;
		goto bail;
	}

	consumed = sizeof(cmd.type);

	switch (cmd.type) {
	case QIB_CMD_ASSIGN_CTXT:
	case QIB_CMD_USER_INIT:
		copy = sizeof(cmd.cmd.user_info);
		dest = &cmd.cmd.user_info;
		src = &ucmd->cmd.user_info;
		break;

	case QIB_CMD_RECV_CTRL:
		copy = sizeof(cmd.cmd.recv_ctrl);
		dest = &cmd.cmd.recv_ctrl;
		src = &ucmd->cmd.recv_ctrl;
		break;

	case QIB_CMD_CTXT_INFO:
		copy = sizeof(cmd.cmd.ctxt_info);
		dest = &cmd.cmd.ctxt_info;
		src = &ucmd->cmd.ctxt_info;
		break;

	case QIB_CMD_TID_UPDATE:
	case QIB_CMD_TID_FREE:
		copy = sizeof(cmd.cmd.tid_info);
		dest = &cmd.cmd.tid_info;
		src = &ucmd->cmd.tid_info;
		break;

	case QIB_CMD_SET_PART_KEY:
		copy = sizeof(cmd.cmd.part_key);
		dest = &cmd.cmd.part_key;
		src = &ucmd->cmd.part_key;
		break;

	case QIB_CMD_DISARM_BUFS:
	case QIB_CMD_PIOAVAILUPD: /* force an update of PIOAvail reg */
		copy = 0;
		src = NULL;
		dest = NULL;
		break;

	case QIB_CMD_POLL_TYPE:
		copy = sizeof(cmd.cmd.poll_type);
		dest = &cmd.cmd.poll_type;
		src = &ucmd->cmd.poll_type;
		break;

	case QIB_CMD_ARMLAUNCH_CTRL:
		copy = sizeof(cmd.cmd.armlaunch_ctrl);
		dest = &cmd.cmd.armlaunch_ctrl;
		src = &ucmd->cmd.armlaunch_ctrl;
		break;

	case QIB_CMD_SDMA_INFLIGHT:
		copy = sizeof(cmd.cmd.sdma_inflight);
		dest = &cmd.cmd.sdma_inflight;
		src = &ucmd->cmd.sdma_inflight;
		break;

	case QIB_CMD_SDMA_COMPLETE:
		copy = sizeof(cmd.cmd.sdma_complete);
		dest = &cmd.cmd.sdma_complete;
		src = &ucmd->cmd.sdma_complete;
		break;

	case QIB_CMD_ACK_EVENT:
		copy = sizeof(cmd.cmd.event_mask);
		dest = &cmd.cmd.event_mask;
		src = &ucmd->cmd.event_mask;
		break;

	default:
		ret = -EINVAL;
		goto bail;
	}

	if (copy) {
		if ((count - consumed) < copy) {
			ret = -EINVAL;
			goto bail;
		}
		if (copy_from_user(dest, src, copy)) {
			ret = -EFAULT;
			goto bail;
		}
		consumed += copy;
	}

	rcd = ctxt_fp(fp);
	if (!rcd && cmd.type != QIB_CMD_ASSIGN_CTXT) {
		ret = -EINVAL;
		goto bail;
	}
bail;2073}20742075switch (cmd.type) {2076case QIB_CMD_ASSIGN_CTXT:2077ret = qib_assign_ctxt(fp, &cmd.cmd.user_info);2078if (ret)2079goto bail;2080break;20812082case QIB_CMD_USER_INIT:2083ret = qib_do_user_init(fp, &cmd.cmd.user_info);2084if (ret)2085goto bail;2086ret = qib_get_base_info(fp, (void __user *) (unsigned long)2087cmd.cmd.user_info.spu_base_info,2088cmd.cmd.user_info.spu_base_info_size);2089break;20902091case QIB_CMD_RECV_CTRL:2092ret = qib_manage_rcvq(rcd, subctxt_fp(fp), cmd.cmd.recv_ctrl);2093break;20942095case QIB_CMD_CTXT_INFO:2096ret = qib_ctxt_info(fp, (struct qib_ctxt_info __user *)2097(unsigned long) cmd.cmd.ctxt_info);2098break;20992100case QIB_CMD_TID_UPDATE:2101ret = qib_tid_update(rcd, fp, &cmd.cmd.tid_info);2102break;21032104case QIB_CMD_TID_FREE:2105ret = qib_tid_free(rcd, subctxt_fp(fp), &cmd.cmd.tid_info);2106break;21072108case QIB_CMD_SET_PART_KEY:2109ret = qib_set_part_key(rcd, cmd.cmd.part_key);2110break;21112112case QIB_CMD_DISARM_BUFS:2113(void)qib_disarm_piobufs_ifneeded(rcd);2114ret = disarm_req_delay(rcd);2115break;21162117case QIB_CMD_PIOAVAILUPD:2118qib_force_pio_avail_update(rcd->dd);2119break;21202121case QIB_CMD_POLL_TYPE:2122rcd->poll_type = cmd.cmd.poll_type;2123break;21242125case QIB_CMD_ARMLAUNCH_CTRL:2126rcd->dd->f_set_armlaunch(rcd->dd, cmd.cmd.armlaunch_ctrl);2127break;21282129case QIB_CMD_SDMA_INFLIGHT:2130ret = qib_sdma_get_inflight(user_sdma_queue_fp(fp),2131(u32 __user *) (unsigned long)2132cmd.cmd.sdma_inflight);2133break;21342135case QIB_CMD_SDMA_COMPLETE:2136ret = qib_sdma_get_complete(rcd->ppd,2137user_sdma_queue_fp(fp),2138(u32 __user *) (unsigned long)2139cmd.cmd.sdma_complete);2140break;21412142case QIB_CMD_ACK_EVENT:2143ret = qib_user_event_ack(rcd, subctxt_fp(fp),2144cmd.cmd.event_mask);2145break;2146}21472148if (ret >= 0)2149ret = consumed;21502151bail:2152return ret;2153}21542155static ssize_t qib_aio_write(struct kiocb *iocb, const struct iovec *iov,2156unsigned long dim, loff_t off)2157{2158struct qib_filedata *fp = iocb->ki_filp->private_data;2159struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp);2160struct qib_user_sdma_queue *pq = fp->pq;21612162if (!dim || !pq)2163return -EINVAL;21642165return qib_user_sdma_writev(rcd, pq, iov, dim);2166}21672168static struct class *qib_class;2169static dev_t qib_dev;21702171int qib_cdev_init(int minor, const char *name,2172const struct file_operations *fops,2173struct cdev **cdevp, struct device **devp)2174{2175const dev_t dev = MKDEV(MAJOR(qib_dev), minor);2176struct cdev *cdev;2177struct device *device = NULL;2178int ret;21792180cdev = cdev_alloc();2181if (!cdev) {2182printk(KERN_ERR QIB_DRV_NAME2183": Could not allocate cdev for minor %d, %s\n",2184minor, name);2185ret = -ENOMEM;2186goto done;2187}21882189cdev->owner = THIS_MODULE;2190cdev->ops = fops;2191kobject_set_name(&cdev->kobj, name);21922193ret = cdev_add(cdev, dev, 1);2194if (ret < 0) {2195printk(KERN_ERR QIB_DRV_NAME2196": Could not add cdev for minor %d, %s (err %d)\n",2197minor, name, -ret);2198goto err_cdev;2199}22002201device = device_create(qib_class, NULL, dev, NULL, name);2202if (!IS_ERR(device))2203goto done;2204ret = PTR_ERR(device);2205device = NULL;2206printk(KERN_ERR QIB_DRV_NAME ": Could not create "2207"device for minor %d, %s (err %d)\n",2208minor, name, -ret);2209err_cdev:2210cdev_del(cdev);2211cdev = NULL;2212done:2213*cdevp = cdev;2214*devp = device;2215return ret;2216}22172218void qib_cdev_cleanup(struct cdev **cdevp, struct device **devp)2219{2220struct device *device = *devp;22212222if (device) 
		device_unregister(device);
		*devp = NULL;
	}

	if (*cdevp) {
		cdev_del(*cdevp);
		*cdevp = NULL;
	}
}

static struct cdev *wildcard_cdev;
static struct device *wildcard_device;

int __init qib_dev_init(void)
{
	int ret;

	ret = alloc_chrdev_region(&qib_dev, 0, QIB_NMINORS, QIB_DRV_NAME);
	if (ret < 0) {
		printk(KERN_ERR QIB_DRV_NAME ": Could not allocate "
		       "chrdev region (err %d)\n", -ret);
		goto done;
	}

	qib_class = class_create(THIS_MODULE, "ipath");
	if (IS_ERR(qib_class)) {
		ret = PTR_ERR(qib_class);
		printk(KERN_ERR QIB_DRV_NAME ": Could not create "
		       "device class (err %d)\n", -ret);
		unregister_chrdev_region(qib_dev, QIB_NMINORS);
	}

done:
	return ret;
}

void qib_dev_cleanup(void)
{
	if (qib_class) {
		class_destroy(qib_class);
		qib_class = NULL;
	}

	unregister_chrdev_region(qib_dev, QIB_NMINORS);
}

static atomic_t user_count = ATOMIC_INIT(0);

static void qib_user_remove(struct qib_devdata *dd)
{
	if (atomic_dec_return(&user_count) == 0)
		qib_cdev_cleanup(&wildcard_cdev, &wildcard_device);

	qib_cdev_cleanup(&dd->user_cdev, &dd->user_device);
}

static int qib_user_add(struct qib_devdata *dd)
{
	char name[10];
	int ret;

	if (atomic_inc_return(&user_count) == 1) {
		ret = qib_cdev_init(0, "ipath", &qib_file_ops,
				    &wildcard_cdev, &wildcard_device);
		if (ret)
			goto done;
	}

	snprintf(name, sizeof(name), "ipath%d", dd->unit);
	ret = qib_cdev_init(dd->unit + 1, name, &qib_file_ops,
			    &dd->user_cdev, &dd->user_device);
	if (ret)
		qib_user_remove(dd);
done:
	return ret;
}

/*
 * Create per-unit files in /dev
 */
int qib_device_create(struct qib_devdata *dd)
{
	int r, ret;

	r = qib_user_add(dd);
	ret = qib_diag_add(dd);
	if (r && !ret)
		ret = r;
	return ret;
}

/*
 * Remove per-unit files in /dev
 * void, core kernel returns no errors for this stuff
 */
void qib_device_remove(struct qib_devdata *dd)
{
	qib_user_remove(dd);
	qib_diag_remove(dd);
}