/*
 * Source: arch/powerpc/platforms/pseries/iommu.c
 * (scrape artifact header — page-view counter removed)
 */
// SPDX-License-Identifier: GPL-2.0-or-later1/*2* Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation3*4* Rewrite, cleanup:5*6* Copyright (C) 2004 Olof Johansson <[email protected]>, IBM Corporation7* Copyright (C) 2006 Olof Johansson <[email protected]>8*9* Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.10*/1112#include <linux/init.h>13#include <linux/types.h>14#include <linux/slab.h>15#include <linux/mm.h>16#include <linux/memblock.h>17#include <linux/spinlock.h>18#include <linux/string.h>19#include <linux/pci.h>20#include <linux/dma-mapping.h>21#include <linux/crash_dump.h>22#include <linux/memory.h>23#include <linux/vmalloc.h>24#include <linux/of.h>25#include <linux/of_address.h>26#include <linux/iommu.h>27#include <linux/rculist.h>28#include <asm/io.h>29#include <asm/prom.h>30#include <asm/rtas.h>31#include <asm/iommu.h>32#include <asm/pci-bridge.h>33#include <asm/machdep.h>34#include <asm/firmware.h>35#include <asm/tce.h>36#include <asm/ppc-pci.h>37#include <asm/udbg.h>38#include <asm/mmzone.h>39#include <asm/plpar_wrappers.h>4041#include "pseries.h"4243enum {44DDW_QUERY_PE_DMA_WIN = 0,45DDW_CREATE_PE_DMA_WIN = 1,46DDW_REMOVE_PE_DMA_WIN = 2,4748DDW_APPLICABLE_SIZE49};5051enum {52DDW_EXT_SIZE = 0,53DDW_EXT_RESET_DMA_WIN = 1,54DDW_EXT_QUERY_OUT_SIZE = 2,55DDW_EXT_LIMITED_ADDR_MODE = 356};5758static struct iommu_table *iommu_pseries_alloc_table(int node)59{60struct iommu_table *tbl;6162tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);63if (!tbl)64return NULL;6566INIT_LIST_HEAD_RCU(&tbl->it_group_list);67kref_init(&tbl->it_kref);68return tbl;69}7071#ifdef CONFIG_IOMMU_API72static struct iommu_table_group_ops spapr_tce_table_group_ops;73#endif7475static struct iommu_table_group *iommu_pseries_alloc_group(int node)76{77struct iommu_table_group *table_group;7879table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);80if (!table_group)81return NULL;8283#ifdef CONFIG_IOMMU_API84table_group->ops = 
&spapr_tce_table_group_ops;85table_group->pgsizes = SZ_4K;86#endif8788table_group->tables[0] = iommu_pseries_alloc_table(node);89if (table_group->tables[0])90return table_group;9192kfree(table_group);93return NULL;94}9596static void iommu_pseries_free_group(struct iommu_table_group *table_group,97const char *node_name)98{99if (!table_group)100return;101102#ifdef CONFIG_IOMMU_API103if (table_group->group) {104iommu_group_put(table_group->group);105BUG_ON(table_group->group);106}107#endif108109/* Default DMA window table is at index 0, while DDW at 1. SR-IOV110* adapters only have table on index 0(if not direct mapped).111*/112if (table_group->tables[0])113iommu_tce_table_put(table_group->tables[0]);114115if (table_group->tables[1])116iommu_tce_table_put(table_group->tables[1]);117118kfree(table_group);119}120121static int tce_build_pSeries(struct iommu_table *tbl, long index,122long npages, unsigned long uaddr,123enum dma_data_direction direction,124unsigned long attrs)125{126u64 proto_tce;127__be64 *tcep;128u64 rpn;129const unsigned long tceshift = tbl->it_page_shift;130const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);131132proto_tce = TCE_PCI_READ; // Read allowed133134if (direction != DMA_TO_DEVICE)135proto_tce |= TCE_PCI_WRITE;136137tcep = ((__be64 *)tbl->it_base) + index;138139while (npages--) {140/* can't move this out since we might cross MEMBLOCK boundary */141rpn = __pa(uaddr) >> tceshift;142*tcep = cpu_to_be64(proto_tce | rpn << tceshift);143144uaddr += pagesize;145tcep++;146}147return 0;148}149150151static void tce_clear_pSeries(struct iommu_table *tbl, long index, long npages)152{153__be64 *tcep;154155tcep = ((__be64 *)tbl->it_base) + index;156157while (npages--)158*(tcep++) = 0;159}160161static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)162{163__be64 *tcep;164165tcep = ((__be64 *)tbl->it_base) + index;166167return be64_to_cpu(*tcep);168}169170#ifdef CONFIG_IOMMU_API171static long pseries_tce_iommu_userspace_view_alloc(struct 
iommu_table *tbl)172{173unsigned long cb = ALIGN(sizeof(tbl->it_userspace[0]) * tbl->it_size, PAGE_SIZE);174unsigned long *uas;175176if (tbl->it_indirect_levels) /* Impossible */177return -EPERM;178179WARN_ON(tbl->it_userspace);180181uas = vzalloc(cb);182if (!uas)183return -ENOMEM;184185tbl->it_userspace = (__be64 *) uas;186187return 0;188}189#endif190191static void tce_iommu_userspace_view_free(struct iommu_table *tbl)192{193vfree(tbl->it_userspace);194tbl->it_userspace = NULL;195}196197static void tce_free_pSeries(struct iommu_table *tbl)198{199if (tbl->it_userspace)200tce_iommu_userspace_view_free(tbl);201}202203static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);204static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);205206static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,207long npages, unsigned long uaddr,208enum dma_data_direction direction,209unsigned long attrs)210{211u64 rc = 0;212u64 proto_tce, tce;213u64 rpn;214int ret = 0;215long tcenum_start = tcenum, npages_start = npages;216217rpn = __pa(uaddr) >> tceshift;218proto_tce = TCE_PCI_READ;219if (direction != DMA_TO_DEVICE)220proto_tce |= TCE_PCI_WRITE;221222while (npages--) {223tce = proto_tce | rpn << tceshift;224rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);225226if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {227ret = (int)rc;228tce_free_pSeriesLP(liobn, tcenum_start, tceshift,229(npages_start - (npages + 1)));230break;231}232233if (rc && printk_ratelimit()) {234printk("tce_build_pSeriesLP: plpar_tce_put failed. 
rc=%lld\n", rc);235printk("\tindex = 0x%llx\n", (u64)liobn);236printk("\ttcenum = 0x%llx\n", (u64)tcenum);237printk("\ttce val = 0x%llx\n", tce );238dump_stack();239}240241tcenum++;242rpn++;243}244return ret;245}246247static DEFINE_PER_CPU(__be64 *, tce_page);248249static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,250long npages, unsigned long uaddr,251enum dma_data_direction direction,252unsigned long attrs)253{254u64 rc = 0;255u64 proto_tce;256__be64 *tcep;257u64 rpn;258long l, limit;259long tcenum_start = tcenum, npages_start = npages;260int ret = 0;261unsigned long flags;262const unsigned long tceshift = tbl->it_page_shift;263264if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {265return tce_build_pSeriesLP(tbl->it_index, tcenum,266tceshift, npages, uaddr,267direction, attrs);268}269270local_irq_save(flags); /* to protect tcep and the page behind it */271272tcep = __this_cpu_read(tce_page);273274/* This is safe to do since interrupts are off when we're called275* from iommu_alloc{,_sg}()276*/277if (!tcep) {278tcep = (__be64 *)__get_free_page(GFP_ATOMIC);279/* If allocation fails, fall back to the loop implementation */280if (!tcep) {281local_irq_restore(flags);282return tce_build_pSeriesLP(tbl->it_index, tcenum,283tceshift,284npages, uaddr, direction, attrs);285}286__this_cpu_write(tce_page, tcep);287}288289rpn = __pa(uaddr) >> tceshift;290proto_tce = TCE_PCI_READ;291if (direction != DMA_TO_DEVICE)292proto_tce |= TCE_PCI_WRITE;293294/* We can map max one pageful of TCEs at a time */295do {296/*297* Set up the page with TCE data, looping through and setting298* the values.299*/300limit = min_t(long, npages, 4096 / TCE_ENTRY_SIZE);301302for (l = 0; l < limit; l++) {303tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift);304rpn++;305}306307rc = plpar_tce_put_indirect((u64)tbl->it_index,308(u64)tcenum << tceshift,309(u64)__pa(tcep),310limit);311312npages -= limit;313tcenum += limit;314} while (npages > 0 && 
!rc);315316local_irq_restore(flags);317318if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {319ret = (int)rc;320tce_freemulti_pSeriesLP(tbl, tcenum_start,321(npages_start - (npages + limit)));322return ret;323}324325if (rc && printk_ratelimit()) {326printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);327printk("\tindex = 0x%llx\n", (u64)tbl->it_index);328printk("\tnpages = 0x%llx\n", (u64)npages);329printk("\ttce[0] val = 0x%llx\n", tcep[0]);330dump_stack();331}332return ret;333}334335static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,336long npages)337{338u64 rc;339340while (npages--) {341rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0);342343if (rc && printk_ratelimit()) {344printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);345printk("\tindex = 0x%llx\n", (u64)liobn);346printk("\ttcenum = 0x%llx\n", (u64)tcenum);347dump_stack();348}349350tcenum++;351}352}353354355static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)356{357u64 rc;358long rpages = npages;359unsigned long limit;360361if (!firmware_has_feature(FW_FEATURE_STUFF_TCE))362return tce_free_pSeriesLP(tbl->it_index, tcenum,363tbl->it_page_shift, npages);364365do {366limit = min_t(unsigned long, rpages, 512);367368rc = plpar_tce_stuff((u64)tbl->it_index,369(u64)tcenum << tbl->it_page_shift, 0, limit);370371rpages -= limit;372tcenum += limit;373} while (rpages > 0 && !rc);374375if (rc && printk_ratelimit()) {376printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");377printk("\trc = %lld\n", rc);378printk("\tindex = 0x%llx\n", (u64)tbl->it_index);379printk("\tnpages = 0x%llx\n", (u64)npages);380dump_stack();381}382}383384static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)385{386u64 rc;387unsigned long tce_ret;388389rc = plpar_tce_get((u64)tbl->it_index,390(u64)tcenum << tbl->it_page_shift, &tce_ret);391392if (rc && printk_ratelimit()) {393printk("tce_get_pSeriesLP: plpar_tce_get 
failed. rc=%lld\n", rc);394printk("\tindex = 0x%llx\n", (u64)tbl->it_index);395printk("\ttcenum = 0x%llx\n", (u64)tcenum);396dump_stack();397}398399return tce_ret;400}401402/* this is compatible with cells for the device tree property */403struct dynamic_dma_window_prop {404__be32 liobn; /* tce table number */405__be64 dma_base; /* address hi,lo */406__be32 tce_shift; /* ilog2(tce_page_size) */407__be32 window_shift; /* ilog2(tce_window_size) */408};409410struct dma_win {411struct device_node *device;412const struct dynamic_dma_window_prop *prop;413bool direct;414struct list_head list;415};416417/* Dynamic DMA Window support */418struct ddw_query_response {419u32 windows_available;420u64 largest_available_block;421u32 page_size;422u32 migration_capable;423};424425struct ddw_create_response {426u32 liobn;427u32 addr_hi;428u32 addr_lo;429};430431static LIST_HEAD(dma_win_list);432/* prevents races between memory on/offline and window creation */433static DEFINE_SPINLOCK(dma_win_list_lock);434/* protects initializing window twice for same device */435static DEFINE_MUTEX(dma_win_init_mutex);436437static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,438unsigned long num_pfn, const void *arg)439{440const struct dynamic_dma_window_prop *maprange = arg;441int rc;442u64 tce_size, num_tce, dma_offset, next;443u32 tce_shift;444long limit;445446tce_shift = be32_to_cpu(maprange->tce_shift);447tce_size = 1ULL << tce_shift;448next = start_pfn << PAGE_SHIFT;449num_tce = num_pfn << PAGE_SHIFT;450451/* round back to the beginning of the tce page size */452num_tce += next & (tce_size - 1);453next &= ~(tce_size - 1);454455/* covert to number of tces */456num_tce |= tce_size - 1;457num_tce >>= tce_shift;458459do {460/*461* Set up the page with TCE data, looping through and setting462* the values.463*/464limit = min_t(long, num_tce, 512);465dma_offset = next + be64_to_cpu(maprange->dma_base);466467rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),468dma_offset,4690, 
limit);470next += limit * tce_size;471num_tce -= limit;472} while (num_tce > 0 && !rc);473474return rc;475}476477static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,478unsigned long num_pfn, const void *arg)479{480const struct dynamic_dma_window_prop *maprange = arg;481u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn;482__be64 *tcep;483u32 tce_shift;484u64 rc = 0;485long l, limit;486487if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {488unsigned long tceshift = be32_to_cpu(maprange->tce_shift);489unsigned long dmastart = (start_pfn << PAGE_SHIFT) +490be64_to_cpu(maprange->dma_base);491unsigned long tcenum = dmastart >> tceshift;492unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift;493void *uaddr = __va(start_pfn << PAGE_SHIFT);494495return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn),496tcenum, tceshift, npages, (unsigned long) uaddr,497DMA_BIDIRECTIONAL, 0);498}499500local_irq_disable(); /* to protect tcep and the page behind it */501tcep = __this_cpu_read(tce_page);502503if (!tcep) {504tcep = (__be64 *)__get_free_page(GFP_ATOMIC);505if (!tcep) {506local_irq_enable();507return -ENOMEM;508}509__this_cpu_write(tce_page, tcep);510}511512proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;513514liobn = (u64)be32_to_cpu(maprange->liobn);515tce_shift = be32_to_cpu(maprange->tce_shift);516tce_size = 1ULL << tce_shift;517next = start_pfn << PAGE_SHIFT;518num_tce = num_pfn << PAGE_SHIFT;519520/* round back to the beginning of the tce page size */521num_tce += next & (tce_size - 1);522next &= ~(tce_size - 1);523524/* covert to number of tces */525num_tce |= tce_size - 1;526num_tce >>= tce_shift;527528/* We can map max one pageful of TCEs at a time */529do {530/*531* Set up the page with TCE data, looping through and setting532* the values.533*/534limit = min_t(long, num_tce, 4096 / TCE_ENTRY_SIZE);535dma_offset = next + be64_to_cpu(maprange->dma_base);536537for (l = 0; l < limit; l++) {538tcep[l] = cpu_to_be64(proto_tce | next);539next += 
tce_size;540}541542rc = plpar_tce_put_indirect(liobn,543dma_offset,544(u64)__pa(tcep),545limit);546547num_tce -= limit;548} while (num_tce > 0 && !rc);549550/* error cleanup: caller will clear whole range */551552local_irq_enable();553return rc;554}555556static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,557unsigned long num_pfn, void *arg)558{559return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);560}561562static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno,563unsigned long liobn, unsigned long win_addr,564unsigned long window_size, unsigned long page_shift,565void *base, struct iommu_table_ops *table_ops)566{567tbl->it_busno = busno;568tbl->it_index = liobn;569tbl->it_offset = win_addr >> page_shift;570tbl->it_size = window_size >> page_shift;571tbl->it_page_shift = page_shift;572tbl->it_base = (unsigned long)base;573tbl->it_blocksize = 16;574tbl->it_type = TCE_PCI;575tbl->it_ops = table_ops;576}577578struct iommu_table_ops iommu_table_pseries_ops;579580static void iommu_table_setparms(struct pci_controller *phb,581struct device_node *dn,582struct iommu_table *tbl)583{584struct device_node *node;585const unsigned long *basep;586const u32 *sizep;587588/* Test if we are going over 2GB of DMA space */589if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {590udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");591panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");592}593594node = phb->dn;595basep = of_get_property(node, "linux,tce-base", NULL);596sizep = of_get_property(node, "linux,tce-size", NULL);597if (basep == NULL || sizep == NULL) {598printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF has "599"missing tce entries !\n", dn);600return;601}602603iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur,604phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,605__va(*basep), &iommu_table_pseries_ops);606607if (!is_kdump_kernel())608memset((void 
*)tbl->it_base, 0, *sizep);609610phb->dma_window_base_cur += phb->dma_window_size;611}612613struct iommu_table_ops iommu_table_lpar_multi_ops;614615struct iommu_table_ops iommu_table_pseries_ops = {616.set = tce_build_pSeries,617.clear = tce_clear_pSeries,618.get = tce_get_pseries619};620621static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)622{623struct device_node *dn;624struct iommu_table *tbl;625struct device_node *isa_dn, *isa_dn_orig;626struct device_node *tmp;627struct pci_dn *pci;628int children;629630dn = pci_bus_to_OF_node(bus);631632pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn);633634if (bus->self) {635/* This is not a root bus, any setup will be done for the636* device-side of the bridge in iommu_dev_setup_pSeries().637*/638return;639}640pci = PCI_DN(dn);641642/* Check if the ISA bus on the system is under643* this PHB.644*/645isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");646647while (isa_dn && isa_dn != dn)648isa_dn = isa_dn->parent;649650of_node_put(isa_dn_orig);651652/* Count number of direct PCI children of the PHB. */653for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)654children++;655656pr_debug("Children: %d\n", children);657658/* Calculate amount of DMA window per slot. Each window must be659* a power of two (due to pci_alloc_consistent requirements).660*661* Keep 256MB aside for PHBs with ISA.662*/663664if (!isa_dn) {665/* No ISA/IDE - just set window size and return */666pci->phb->dma_window_size = 0x80000000ul; /* To be divided */667668while (pci->phb->dma_window_size * children > 0x80000000ul)669pci->phb->dma_window_size >>= 1;670pr_debug("No ISA/IDE, window size is 0x%llx\n",671pci->phb->dma_window_size);672pci->phb->dma_window_base_cur = 0;673674return;675}676677/* If we have ISA, then we probably have an IDE678* controller too. 
Allocate a 128MB table but679* skip the first 128MB to avoid stepping on ISA680* space.681*/682pci->phb->dma_window_size = 0x8000000ul;683pci->phb->dma_window_base_cur = 0x8000000ul;684685pci->table_group = iommu_pseries_alloc_group(pci->phb->node);686tbl = pci->table_group->tables[0];687688iommu_table_setparms(pci->phb, dn, tbl);689690if (!iommu_init_table(tbl, pci->phb->node, 0, 0))691panic("Failed to initialize iommu table");692693/* Divide the rest (1.75GB) among the children */694pci->phb->dma_window_size = 0x80000000ul;695while (pci->phb->dma_window_size * children > 0x70000000ul)696pci->phb->dma_window_size >>= 1;697698pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);699}700701#ifdef CONFIG_IOMMU_API702static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned703long *tce, enum dma_data_direction *direction)704{705long rc;706unsigned long ioba = (unsigned long) index << tbl->it_page_shift;707unsigned long flags, oldtce = 0;708u64 proto_tce = iommu_direction_to_tce_perm(*direction);709unsigned long newtce = *tce | proto_tce;710711spin_lock_irqsave(&tbl->large_pool.lock, flags);712713rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);714if (!rc)715rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);716717if (!rc) {718*direction = iommu_tce_direction(oldtce);719*tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);720}721722spin_unlock_irqrestore(&tbl->large_pool.lock, flags);723724return rc;725}726727static __be64 *tce_useraddr_pSeriesLP(struct iommu_table *tbl, long index,728bool __always_unused alloc)729{730return tbl->it_userspace ? 
&tbl->it_userspace[index - tbl->it_offset] : NULL;731}732#endif733734struct iommu_table_ops iommu_table_lpar_multi_ops = {735.set = tce_buildmulti_pSeriesLP,736#ifdef CONFIG_IOMMU_API737.xchg_no_kill = tce_exchange_pseries,738.useraddrptr = tce_useraddr_pSeriesLP,739#endif740.clear = tce_freemulti_pSeriesLP,741.get = tce_get_pSeriesLP,742.free = tce_free_pSeries743};744745#ifdef CONFIG_IOMMU_API746/*747* When the DMA window properties might have been removed,748* the parent node has the table_group setup on it.749*/750static struct device_node *pci_dma_find_parent_node(struct pci_dev *dev,751struct iommu_table_group *table_group)752{753struct device_node *dn = pci_device_to_OF_node(dev);754struct pci_dn *rpdn;755756for (; dn && PCI_DN(dn); dn = dn->parent) {757rpdn = PCI_DN(dn);758759if (table_group == rpdn->table_group)760return dn;761}762763return NULL;764}765#endif766767/*768* Find nearest ibm,dma-window (default DMA window) or direct DMA window or769* dynamic 64bit DMA window, walking up the device tree.770*/771static struct device_node *pci_dma_find(struct device_node *dn,772struct dynamic_dma_window_prop *prop)773{774const __be32 *default_prop = NULL;775const __be32 *ddw_prop = NULL;776struct device_node *rdn = NULL;777bool default_win = false, ddw_win = false;778779for ( ; dn && PCI_DN(dn); dn = dn->parent) {780default_prop = of_get_property(dn, "ibm,dma-window", NULL);781if (default_prop) {782rdn = dn;783default_win = true;784}785ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);786if (ddw_prop) {787rdn = dn;788ddw_win = true;789break;790}791ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL);792if (ddw_prop) {793rdn = dn;794ddw_win = true;795break;796}797798/* At least found default window, which is the case for normal boot */799if (default_win)800break;801}802803/* For PCI devices there will always be a DMA window, either on the device804* or parent bus805*/806WARN_ON(!(default_win | ddw_win));807808/* caller doesn't want to get DMA window property 
*/809if (!prop)810return rdn;811812/* parse DMA window property. During normal system boot, only default813* DMA window is passed in OF. But, for kdump, a dedicated adapter might814* have both default and DDW in FDT. In this scenario, DDW takes precedence815* over default window.816*/817if (ddw_win) {818struct dynamic_dma_window_prop *p;819820p = (struct dynamic_dma_window_prop *)ddw_prop;821prop->liobn = p->liobn;822prop->dma_base = p->dma_base;823prop->tce_shift = p->tce_shift;824prop->window_shift = p->window_shift;825} else if (default_win) {826unsigned long offset, size, liobn;827828of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size);829830prop->liobn = cpu_to_be32((u32)liobn);831prop->dma_base = cpu_to_be64(offset);832prop->tce_shift = cpu_to_be32(IOMMU_PAGE_SHIFT_4K);833prop->window_shift = cpu_to_be32(order_base_2(size));834}835836return rdn;837}838839static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)840{841struct iommu_table *tbl;842struct device_node *dn, *pdn;843struct pci_dn *ppci;844struct dynamic_dma_window_prop prop;845846dn = pci_bus_to_OF_node(bus);847848pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",849dn);850851pdn = pci_dma_find(dn, &prop);852853/* In PPC architecture, there will always be DMA window on bus or one of the854* parent bus. During reboot, there will be ibm,dma-window property to855* define DMA window. For kdump, there will at least be default window or DDW856* or both.857* There is an exception to the above. 
In case the PE goes into frozen858* state, firmware may not provide ibm,dma-window property at the time859* of LPAR boot up.860*/861862if (!pdn) {863pr_debug(" no ibm,dma-window property !\n");864return;865}866867ppci = PCI_DN(pdn);868869pr_debug(" parent is %pOF, iommu_table: 0x%p\n",870pdn, ppci->table_group);871872if (!ppci->table_group) {873ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);874tbl = ppci->table_group->tables[0];875876iommu_table_setparms_common(tbl, ppci->phb->bus->number,877be32_to_cpu(prop.liobn),878be64_to_cpu(prop.dma_base),8791ULL << be32_to_cpu(prop.window_shift),880be32_to_cpu(prop.tce_shift), NULL,881&iommu_table_lpar_multi_ops);882883if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))884panic("Failed to initialize iommu table");885886iommu_register_group(ppci->table_group,887pci_domain_nr(bus), 0);888pr_debug(" created table: %p\n", ppci->table_group);889}890}891892893static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)894{895struct device_node *dn;896struct iommu_table *tbl;897898pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));899900dn = dev->dev.of_node;901902/* If we're the direct child of a root bus, then we need to allocate903* an iommu table ourselves. The bus setup code should have setup904* the window sizes already.905*/906if (!dev->bus->self) {907struct pci_controller *phb = PCI_DN(dn)->phb;908909pr_debug(" --> first child, no bridge. 
Allocating iommu table.\n");910PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);911tbl = PCI_DN(dn)->table_group->tables[0];912iommu_table_setparms(phb, dn, tbl);913914if (!iommu_init_table(tbl, phb->node, 0, 0))915panic("Failed to initialize iommu table");916917set_iommu_table_base(&dev->dev, tbl);918return;919}920921/* If this device is further down the bus tree, search upwards until922* an already allocated iommu table is found and use that.923*/924925while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)926dn = dn->parent;927928if (dn && PCI_DN(dn))929set_iommu_table_base(&dev->dev,930PCI_DN(dn)->table_group->tables[0]);931else932printk(KERN_WARNING "iommu: Device %s has no iommu table\n",933pci_name(dev));934}935936static int __read_mostly disable_ddw;937938static int __init disable_ddw_setup(char *str)939{940disable_ddw = 1;941printk(KERN_INFO "ppc iommu: disabling ddw.\n");942943return 0;944}945946early_param("disable_ddw", disable_ddw_setup);947948static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp)949{950int ret;951952ret = tce_clearrange_multi_pSeriesLP(0,9531ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);954if (ret)955pr_warn("%pOF failed to clear tces in window.\n",956np);957else958pr_debug("%pOF successfully cleared tces in window.\n",959np);960}961962/*963* Call only if DMA window is clean.964*/965static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn)966{967int ret;968969ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);970if (ret)971pr_warn("%pOF: failed to remove DMA window: rtas returned "972"%d to ibm,remove-pe-dma-window(%x) %llx\n",973np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);974else975pr_debug("%pOF: successfully removed DMA window: rtas returned "976"%d to ibm,remove-pe-dma-window(%x) %llx\n",977np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);978}979980static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,981struct property *win, bool cleanup)982{983struct dynamic_dma_window_prop *dwp;984u64 liobn;985986dwp = win->value;987liobn = (u64)be32_to_cpu(dwp->liobn);988989if (cleanup)990clean_dma_window(np, dwp);991__remove_dma_window(np, ddw_avail, liobn);992}993994static void copy_property(struct device_node *pdn, const char *from, const char *to)995{996struct property *src, *dst;997998src = of_find_property(pdn, from, NULL);999if (!src)1000return;10011002dst = kzalloc(sizeof(*dst), GFP_KERNEL);1003if (!dst)1004return;10051006dst->name = kstrdup(to, GFP_KERNEL);1007dst->value = kmemdup(src->value, src->length, GFP_KERNEL);1008dst->length = src->length;1009if (!dst->name || !dst->value)1010return;10111012if (of_add_property(pdn, dst)) {1013pr_err("Unable to add DMA window property for %pOF", pdn);1014goto free_prop;1015}10161017return;10181019free_prop:1020kfree(dst->name);1021kfree(dst->value);1022kfree(dst);1023}10241025static int remove_dma_window_named(struct device_node *np, bool remove_prop, const char *win_name,1026bool cleanup)1027{1028struct property *win;1029u32 ddw_avail[DDW_APPLICABLE_SIZE];1030int ret = 0;10311032win = of_find_property(np, win_name, NULL);1033if (!win)1034return -EINVAL;10351036ret = of_property_read_u32_array(np, "ibm,ddw-applicable",1037&ddw_avail[0], DDW_APPLICABLE_SIZE);1038if (ret)1039return 0;10401041if (win->length >= sizeof(struct dynamic_dma_window_prop))1042remove_dma_window(np, ddw_avail, win, cleanup);10431044if (!remove_prop)1045return 0;10461047/* Default window property if removed is lost as reset-pe doesn't restore it.1048* Though FDT has a copy of it, the DLPAR hotplugged devices will not have a1049* node on FDT until next reboot. 
So, back it up.1050*/1051if ((strcmp(win_name, "ibm,dma-window") == 0) &&1052!of_find_property(np, "ibm,dma-window-saved", NULL))1053copy_property(np, win_name, "ibm,dma-window-saved");10541055ret = of_remove_property(np, win);1056if (ret)1057pr_warn("%pOF: failed to remove DMA window property: %d\n",1058np, ret);1059return 0;1060}10611062static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift,1063bool *direct_mapping)1064{1065struct dma_win *window;1066const struct dynamic_dma_window_prop *dma64;1067bool found = false;10681069spin_lock(&dma_win_list_lock);1070/* check if we already created a window and dupe that config if so */1071list_for_each_entry(window, &dma_win_list, list) {1072if (window->device == pdn) {1073dma64 = window->prop;1074*dma_addr = be64_to_cpu(dma64->dma_base);1075*window_shift = be32_to_cpu(dma64->window_shift);1076*direct_mapping = window->direct;1077found = true;1078break;1079}1080}1081spin_unlock(&dma_win_list_lock);10821083return found;1084}10851086static struct dma_win *ddw_list_new_entry(struct device_node *pdn,1087const struct dynamic_dma_window_prop *dma64)1088{1089struct dma_win *window;10901091window = kzalloc(sizeof(*window), GFP_KERNEL);1092if (!window)1093return NULL;10941095window->device = pdn;1096window->prop = dma64;1097window->direct = false;10981099return window;1100}11011102static void find_existing_ddw_windows_named(const char *name)1103{1104int len;1105struct device_node *pdn;1106struct dma_win *window;1107const struct dynamic_dma_window_prop *dma64;11081109for_each_node_with_property(pdn, name) {1110dma64 = of_get_property(pdn, name, &len);1111if (!dma64 || len < sizeof(*dma64)) {1112remove_dma_window_named(pdn, true, name, true);1113continue;1114}11151116/* If at the time of system initialization, there are DDWs in OF,1117* it means this is during kexec. 
DDW could be direct or dynamic.1118* We will just mark DDWs as "dynamic" since this is kdump path,1119* no need to worry about perforance. ddw_list_new_entry() will1120* set window->direct = false.1121*/1122window = ddw_list_new_entry(pdn, dma64);1123if (!window) {1124of_node_put(pdn);1125break;1126}11271128spin_lock(&dma_win_list_lock);1129list_add(&window->list, &dma_win_list);1130spin_unlock(&dma_win_list_lock);1131}1132}11331134static int find_existing_ddw_windows(void)1135{1136if (!firmware_has_feature(FW_FEATURE_LPAR))1137return 0;11381139find_existing_ddw_windows_named(DIRECT64_PROPNAME);1140find_existing_ddw_windows_named(DMA64_PROPNAME);11411142return 0;1143}1144machine_arch_initcall(pseries, find_existing_ddw_windows);11451146/**1147* ddw_read_ext - Get the value of an DDW extension1148* @np: device node from which the extension value is to be read.1149* @extnum: index number of the extension.1150* @value: pointer to return value, modified when extension is available.1151*1152* Checks if "ibm,ddw-extensions" exists for this node, and get the value1153* on index 'extnum'.1154* It can be used only to check if a property exists, passing value == NULL.1155*1156* Returns:1157* 0 if extension successfully read1158* -EINVAL if the "ibm,ddw-extensions" does not exist,1159* -ENODATA if "ibm,ddw-extensions" does not have a value, and1160* -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.1161*/1162static inline int ddw_read_ext(const struct device_node *np, int extnum,1163u32 *value)1164{1165static const char propname[] = "ibm,ddw-extensions";1166u32 count;1167int ret;11681169ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);1170if (ret)1171return ret;11721173if (count < extnum)1174return -EOVERFLOW;11751176if (!value)1177value = &count;11781179return of_property_read_u32_index(np, propname, extnum, value);1180}11811182static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,1183struct ddw_query_response *query,1184struct 
device_node *parent)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr, ext_query, query_out[5];
	u64 buid;
	int ret, out_sz;

	/*
	 * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many
	 * output parameters ibm,query-pe-dma-windows will have, ranging from
	 * 5 to 6.
	 */
	ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
	if (!ret && ext_query == 1)
		out_sz = 6;
	else
		out_sz = 5;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	/* rtas_call() returns the status word; the remaining out_sz - 1
	 * output cells are stored in query_out[].
	 */
	ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
			cfg_addr, BUID_HI(buid), BUID_LO(buid));

	switch (out_sz) {
	case 5:
		query->windows_available = query_out[0];
		query->largest_available_block = query_out[1];
		query->page_size = query_out[2];
		query->migration_capable = query_out[3];
		break;
	case 6:
		/* The 6-output form reports the largest block as a 64-bit
		 * quantity split across two cells.
		 */
		query->windows_available = query_out[0];
		query->largest_available_block = ((u64)query_out[1] << 32) |
			query_out[2];
		query->page_size = query_out[3];
		query->migration_capable = query_out[4];
		break;
	}

	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d, lb=%llx ps=%x wn=%d\n",
		 ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), ret, query->largest_available_block,
		 query->page_size, query->windows_available);

	return ret;
}

/*
 * Invoke ibm,create-pe-dma-window to create a dynamic DMA window for the PE
 * owning @dev. Retries while RTAS reports busy. On success, @create carries
 * the new window's LIOBN and its starting DMA address (addr_hi/addr_lo).
 */
static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
		      struct ddw_create_response *create, int page_shift,
		      int window_shift)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr;
	u64 buid;
	int ret;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	do {
		/* extra outputs are LIOBN and dma-addr (hi, lo) */
		ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
				(u32 *)create, cfg_addr, BUID_HI(buid),
				BUID_LO(buid), page_shift, window_shift);
	} while (rtas_busy_delay(ret));
	dev_info(&dev->dev,
		 "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
		 "(liobn = 0x%x starting addr = %x %x)\n",
		 ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
		 create->addr_hi, create->addr_lo);

	return ret;
}

/* A PE device node for which DDW setup already failed; kept on
 * failed_ddw_pdn_list so later functions of the same device don't retry.
 */
struct failed_ddw_pdn {
	struct device_node *pdn;
	struct list_head list;
};

static LIST_HEAD(failed_ddw_pdn_list);

/* Upper bound of addressable system memory, accounting for memory
 * hot-add (drconf) when NUMA + MEMORY_HOTPLUG are configured.
 */
static phys_addr_t ddw_memory_hotplug_max(void)
{
	resource_size_t max_addr;

#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
	max_addr = hot_add_drconf_memory_max();
#else
	max_addr = memblock_end_of_DRAM();
#endif

	return max_addr;
}

/*
 * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
 * ibm,ddw-extensions, which carries the rtas token for
 * ibm,reset-pe-dma-windows.
 * That rtas-call can be used to restore the default DMA window for the device.
 */
static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
{
	int ret;
	u32 cfg_addr, reset_dma_win;
	u64 buid;
	struct device_node *dn;
	struct pci_dn *pdn;

	/* Silently give up if the platform doesn't advertise the extension */
	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
	if (ret)
		return;

	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);

	ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
			BUID_LO(buid));
	if (ret)
		dev_info(&dev->dev,
			 "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
			 ret);
}

/*
 * Platforms support placing PHB in limited address mode starting with LoPAR
 * level 2.13 implement. In this mode, the DMA address returned by DDW is over
 * 4GB but, less than 64-bits. This benefits IO adapters that don't support
 * 64-bits for DMA addresses.
 */
static int limited_dma_window(struct pci_dev *dev, struct device_node *par_dn)
{
	int ret;
	u32 cfg_addr, reset_dma_win, las_supported;
	u64 buid;
	struct device_node *dn;
	struct pci_dn *pdn;

	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
	if (ret)
		goto out;

	ret = ddw_read_ext(par_dn, DDW_EXT_LIMITED_ADDR_MODE, &las_supported);

	/* Limited Address Space extension available on the platform but DDW in
	 * limited addressing mode not supported
	 */
	if (!ret && !las_supported)
		ret = -EPROTO;

	if (ret) {
		dev_info(&dev->dev, "Limited Address Space for DDW not Supported, err: %d", ret);
		goto out;
	}

	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);

	/* 4th input (1) requests limited addressing mode for the PHB */
	ret = rtas_call(reset_dma_win, 4, 1, NULL, cfg_addr, BUID_HI(buid),
			BUID_LO(buid), 1);
	if (ret)
		dev_info(&dev->dev,
			 "ibm,reset-pe-dma-windows(%x) for Limited Addr Support: %x %x %x returned %d ",
			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
			 ret);

out:
	return ret;
}

/* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */
static int iommu_get_page_shift(u32 query_page_size)
{
	/* Supported IO page-sizes according to LoPAR, note that 2M is out of order */
	const int shift[] = {
		__builtin_ctzll(SZ_4K),   __builtin_ctzll(SZ_64K), __builtin_ctzll(SZ_16M),
		__builtin_ctzll(SZ_32M),  __builtin_ctzll(SZ_64M), __builtin_ctzll(SZ_128M),
		__builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G), __builtin_ctzll(SZ_2M)
	};

	int i = ARRAY_SIZE(shift) - 1;
	int ret = 0;

	/*
	 * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a bit field:
	 * - bit 31 means 4k pages are supported,
	 * - bit 30 means 64k pages are supported, and so on.
	 * Larger pagesizes map more memory with the same amount of TCEs, so start probing them.
	 */
	for (; i >= 0 ; i--) {
		if (query_page_size & (1 << i))
			ret = max(ret, shift[i]);
	}

	return ret;
}

/*
 * Allocate an OF property describing a dynamic DMA window
 * (dynamic_dma_window_prop encoded big-endian). Returns NULL on allocation
 * failure, in which case everything allocated here has been freed.
 */
static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,
					    u32 page_shift, u32 window_shift)
{
	struct dynamic_dma_window_prop *ddwprop;
	struct property *win64;

	win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
	if (!win64)
		return NULL;

	win64->name = kstrdup(propname, GFP_KERNEL);
	ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
	win64->value = ddwprop;
	win64->length = sizeof(*ddwprop);
	if (!win64->name || !win64->value) {
		kfree(win64->name);
		kfree(win64->value);
		kfree(win64);
		return NULL;
	}

	ddwprop->liobn = cpu_to_be32(liobn);
	ddwprop->dma_base = cpu_to_be64(dma_addr);
	ddwprop->tce_shift = cpu_to_be32(page_shift);
	ddwprop->window_shift = cpu_to_be32(window_shift);

	return win64;
}

/*
 * If the PE supports dynamic dma windows, and there is space for a table
 * that can map all pages in a linear offset, then setup such a table,
 * and record the dma-offset in the struct device.
 *
 * dev: the pci device we are checking
 * pdn: the parent pe node with the ibm,dma_window
property1441* Future: also check if we can remap the base window for our base page size1442*1443* returns true if can map all pages (direct mapping), false otherwise..1444*/1445static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn, u64 dma_mask)1446{1447int len = 0, ret;1448int max_ram_len = order_base_2(ddw_memory_hotplug_max());1449struct ddw_query_response query;1450struct ddw_create_response create;1451int page_shift;1452u64 win_addr, dynamic_offset = 0;1453const char *win_name;1454struct device_node *dn;1455u32 ddw_avail[DDW_APPLICABLE_SIZE];1456struct dma_win *window;1457struct property *win64;1458struct failed_ddw_pdn *fpdn;1459bool default_win_removed = false, direct_mapping = false;1460bool dynamic_mapping = false;1461bool pmem_present;1462struct pci_dn *pci = PCI_DN(pdn);1463struct property *default_win = NULL;1464bool limited_addr_req = false, limited_addr_enabled = false;1465int dev_max_ddw;1466int ddw_sz;14671468dn = of_find_node_by_type(NULL, "ibm,pmemory");1469pmem_present = dn != NULL;1470of_node_put(dn);14711472mutex_lock(&dma_win_init_mutex);14731474if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len, &direct_mapping))1475goto out_unlock;14761477/*1478* If we already went through this for a previous function of1479* the same device and failed, we don't want to muck with the1480* DMA window again, as it will race with in-flight operations1481* and can lead to EEHs. 
The above mutex protects access to the1482* list.1483*/1484list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {1485if (fpdn->pdn == pdn)1486goto out_unlock;1487}14881489/*1490* the ibm,ddw-applicable property holds the tokens for:1491* ibm,query-pe-dma-window1492* ibm,create-pe-dma-window1493* for the given node in that order.1494* the property is actually in the parent, not the PE1495*/1496ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",1497&ddw_avail[0], DDW_APPLICABLE_SIZE);1498if (ret)1499goto out_failed;15001501/*1502* Query if there is a second window of size to map the1503* whole partition. Query returns number of windows, largest1504* block assigned to PE (partition endpoint), and two bitmasks1505* of page sizes: supported and supported for migrate-dma.1506*/1507dn = pci_device_to_OF_node(dev);1508ret = query_ddw(dev, ddw_avail, &query, pdn);1509if (ret != 0)1510goto out_failed;15111512/* DMA Limited Addressing required? This is when the driver has1513* requested to create DDW but supports mask which is less than 64-bits1514*/1515limited_addr_req = (dma_mask != DMA_BIT_MASK(64));15161517/* place the PHB in Limited Addressing mode */1518if (limited_addr_req) {1519if (limited_dma_window(dev, pdn))1520goto out_failed;15211522/* PHB is in Limited address mode */1523limited_addr_enabled = true;1524}15251526/*1527* If there is no window available, remove the default DMA window,1528* if it's present. 
This will make all the resources available to the1529* new DDW window.1530* If anything fails after this, we need to restore it, so also check1531* for extensions presence.1532*/1533if (query.windows_available == 0) {1534int reset_win_ext;15351536/* DDW + IOMMU on single window may fail if there is any allocation */1537if (iommu_table_in_use(pci->table_group->tables[0])) {1538dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");1539goto out_failed;1540}15411542default_win = of_find_property(pdn, "ibm,dma-window", NULL);1543if (!default_win)1544goto out_failed;15451546reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);1547if (reset_win_ext)1548goto out_failed;15491550remove_dma_window(pdn, ddw_avail, default_win, true);1551default_win_removed = true;15521553/* Query again, to check if the window is available */1554ret = query_ddw(dev, ddw_avail, &query, pdn);1555if (ret != 0)1556goto out_failed;15571558if (query.windows_available == 0) {1559/* no windows are available for this device. 
*/1560dev_dbg(&dev->dev, "no free dynamic windows");1561goto out_failed;1562}1563}15641565page_shift = iommu_get_page_shift(query.page_size);1566if (!page_shift) {1567dev_dbg(&dev->dev, "no supported page size in mask %x",1568query.page_size);1569goto out_failed;1570}15711572/* Maximum DMA window size that the device can address (in log2) */1573dev_max_ddw = fls64(dma_mask);15741575/* If the device DMA mask is less than 64-bits, make sure the DMA window1576* size is not bigger than what the device can access1577*/1578ddw_sz = min(order_base_2(query.largest_available_block << page_shift),1579dev_max_ddw);15801581/*1582* The "ibm,pmemory" can appear anywhere in the address space.1583* Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS1584* for the upper limit and fallback to max RAM otherwise but this1585* disables device::dma_ops_bypass.1586*/1587len = max_ram_len;1588if (pmem_present) {1589if (ddw_sz >= MAX_PHYSMEM_BITS)1590len = MAX_PHYSMEM_BITS;1591else1592dev_info(&dev->dev, "Skipping ibm,pmemory");1593}15941595/* check if the available block * number of ptes will map everything */1596if (ddw_sz < len) {1597dev_dbg(&dev->dev,1598"can't map partition max 0x%llx with %llu %llu-sized pages\n",15991ULL << len,1600query.largest_available_block,16011ULL << page_shift);16021603len = ddw_sz;1604dynamic_mapping = true;1605} else {1606direct_mapping = !default_win_removed ||1607(len == MAX_PHYSMEM_BITS) ||1608(!pmem_present && (len == max_ram_len));16091610/* DDW is big enough to direct map RAM. If there is vPMEM, check1611* if enough space is left in DDW where we can dynamically1612* allocate TCEs for vPMEM. 
For now, this Hybrid sharing of DDW1613* is only for SR-IOV devices.1614*/1615if (default_win_removed && pmem_present && !direct_mapping) {1616/* DDW is big enough to be split */1617if ((1ULL << ddw_sz) >=1618MIN_DDW_VPMEM_DMA_WINDOW + (1ULL << max_ram_len)) {16191620direct_mapping = true;16211622/* offset of the Dynamic part of DDW */1623dynamic_offset = 1ULL << max_ram_len;1624}16251626/* DDW will at least have dynamic allocation */1627dynamic_mapping = true;16281629/* create max size DDW possible */1630len = ddw_sz;1631}1632}16331634/* Even if the DDW is split into both direct mapped RAM and dynamically1635* mapped vPMEM, the DDW property in OF will be marked as Direct.1636*/1637win_name = direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME;16381639ret = create_ddw(dev, ddw_avail, &create, page_shift, len);1640if (ret != 0)1641goto out_failed;16421643dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",1644create.liobn, dn);16451646win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;1647win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);16481649if (!win64) {1650dev_info(&dev->dev,1651"couldn't allocate property, property name, or value\n");1652goto out_remove_win;1653}16541655ret = of_add_property(pdn, win64);1656if (ret) {1657dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d",1658pdn, ret);1659goto out_free_prop;1660}16611662window = ddw_list_new_entry(pdn, win64->value);1663if (!window)1664goto out_del_prop;16651666window->direct = direct_mapping;16671668if (direct_mapping) {1669/* DDW maps the whole partition, so enable direct DMA mapping */1670ret = walk_system_ram_range(0, ddw_memory_hotplug_max() >> PAGE_SHIFT,1671win64->value, tce_setrange_multi_pSeriesLP_walk);1672if (ret) {1673dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",1674dn, ret);16751676/* Make sure to clean DDW if any TCE was set*/1677clean_dma_window(pdn, win64->value);1678goto out_del_list;1679}1680if (default_win_removed) 
{1681iommu_tce_table_put(pci->table_group->tables[0]);1682pci->table_group->tables[0] = NULL;1683set_iommu_table_base(&dev->dev, NULL);1684}1685}16861687if (dynamic_mapping) {1688struct iommu_table *newtbl;1689int i;1690unsigned long start = 0, end = 0;1691u64 dynamic_addr, dynamic_len;16921693for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {1694const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;16951696/* Look for MMIO32 */1697if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {1698start = pci->phb->mem_resources[i].start;1699end = pci->phb->mem_resources[i].end;1700break;1701}1702}17031704/* New table for using DDW instead of the default DMA window */1705newtbl = iommu_pseries_alloc_table(pci->phb->node);1706if (!newtbl) {1707dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");1708goto out_del_list;1709}17101711/* If the DDW is split between directly mapped RAM and Dynamic1712* mapped for TCES, offset into the DDW where the dynamic part1713* begins.1714*/1715dynamic_addr = win_addr + dynamic_offset;1716dynamic_len = (1UL << len) - dynamic_offset;1717iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn,1718dynamic_addr, dynamic_len, page_shift, NULL,1719&iommu_table_lpar_multi_ops);1720iommu_init_table(newtbl, pci->phb->node,1721start >> page_shift, end >> page_shift);17221723pci->table_group->tables[default_win_removed ? 
0 : 1] = newtbl;17241725set_iommu_table_base(&dev->dev, newtbl);1726}17271728if (default_win_removed) {1729/* default_win is valid here because default_win_removed == true */1730if (!of_find_property(pdn, "ibm,dma-window-saved", NULL))1731copy_property(pdn, "ibm,dma-window", "ibm,dma-window-saved");1732of_remove_property(pdn, default_win);1733dev_info(&dev->dev, "Removed default DMA window for %pOF\n", pdn);1734}17351736spin_lock(&dma_win_list_lock);1737list_add(&window->list, &dma_win_list);1738spin_unlock(&dma_win_list_lock);17391740dev->dev.archdata.dma_offset = win_addr;1741goto out_unlock;17421743out_del_list:1744kfree(window);17451746out_del_prop:1747of_remove_property(pdn, win64);17481749out_free_prop:1750kfree(win64->name);1751kfree(win64->value);1752kfree(win64);17531754out_remove_win:1755/* DDW is clean, so it's ok to call this directly. */1756__remove_dma_window(pdn, ddw_avail, create.liobn);17571758out_failed:1759if (default_win_removed || limited_addr_enabled)1760reset_dma_window(dev, pdn);17611762fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);1763if (!fpdn)1764goto out_unlock;1765fpdn->pdn = pdn;1766list_add(&fpdn->list, &failed_ddw_pdn_list);17671768out_unlock:1769mutex_unlock(&dma_win_init_mutex);17701771/* If we have persistent memory and the window size is not big enough1772* to directly map both RAM and vPMEM, then we need to set DMA limit.1773*/1774if (pmem_present && direct_mapping && len != MAX_PHYSMEM_BITS)1775dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset +1776(1ULL << max_ram_len);17771778dev_info(&dev->dev, "lsa_required: %x, lsa_enabled: %x, direct mapping: %x\n",1779limited_addr_req, limited_addr_enabled, direct_mapping);17801781return direct_mapping;1782}17831784static __u64 query_page_size_to_mask(u32 query_page_size)1785{1786const long shift[] = {1787(SZ_4K), (SZ_64K), (SZ_16M),1788(SZ_32M), (SZ_64M), (SZ_128M),1789(SZ_256M), (SZ_16G), (SZ_2M)1790};1791int i, ret = 0;17921793for (i = 0; i < ARRAY_SIZE(shift); i++) {1794if 
(query_page_size & (1 << i))1795ret |= shift[i];1796}17971798return ret;1799}18001801static void spapr_tce_init_table_group(struct pci_dev *pdev,1802struct device_node *pdn,1803struct dynamic_dma_window_prop prop)1804{1805struct iommu_table_group *table_group = PCI_DN(pdn)->table_group;1806u32 ddw_avail[DDW_APPLICABLE_SIZE];18071808struct ddw_query_response query;1809int ret;18101811/* Only for normal boot with default window. Doesn't matter during1812* kdump, since these will not be used during kdump.1813*/1814if (is_kdump_kernel())1815return;18161817if (table_group->max_dynamic_windows_supported != 0)1818return; /* already initialized */18191820table_group->tce32_start = be64_to_cpu(prop.dma_base);1821table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift);18221823if (!of_find_property(pdn, "ibm,dma-window", NULL))1824dev_err(&pdev->dev, "default dma window missing!\n");18251826ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",1827&ddw_avail[0], DDW_APPLICABLE_SIZE);1828if (ret) {1829table_group->max_dynamic_windows_supported = -1;1830return;1831}18321833ret = query_ddw(pdev, ddw_avail, &query, pdn);1834if (ret) {1835dev_err(&pdev->dev, "%s: query_ddw failed\n", __func__);1836table_group->max_dynamic_windows_supported = -1;1837return;1838}18391840if (query.windows_available == 0)1841table_group->max_dynamic_windows_supported = 1;1842else1843table_group->max_dynamic_windows_supported = IOMMU_TABLE_GROUP_MAX_TABLES;18441845table_group->max_levels = 1;1846table_group->pgsizes |= query_page_size_to_mask(query.page_size);1847}18481849static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)1850{1851struct device_node *pdn, *dn;1852struct iommu_table *tbl;1853struct pci_dn *pci;1854struct dynamic_dma_window_prop prop;18551856pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));18571858/* dev setup for LPAR is a little tricky, since the device tree might1859* contain the dma-window properties per-device and not necessarily1860* for the bus. 
	 * So we need to search upwards in the tree until we
	 * either hit a dma-window property, OR find a parent with a table
	 * already allocated.
	 */
	dn = pci_device_to_OF_node(dev);
	pr_debug("  node is %pOF\n", dn);

	pdn = pci_dma_find(dn, &prop);
	if (!pdn || !PCI_DN(pdn)) {
		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
		       "no DMA window found for pci dev=%s dn=%pOF\n",
		       pci_name(dev), dn);
		return;
	}
	pr_debug("  parent is %pOF\n", pdn);

	pci = PCI_DN(pdn);
	if (!pci->table_group) {
		/* First device under this PE: build the 32-bit default window
		 * table from the dma-window property found above.
		 */
		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
		tbl = pci->table_group->tables[0];

		iommu_table_setparms_common(tbl, pci->phb->bus->number,
				be32_to_cpu(prop.liobn),
				be64_to_cpu(prop.dma_base),
				1ULL << be32_to_cpu(prop.window_shift),
				be32_to_cpu(prop.tce_shift), NULL,
				&iommu_table_lpar_multi_ops);

		iommu_init_table(tbl, pci->phb->node, 0, 0);
		iommu_register_group(pci->table_group,
				pci_domain_nr(pci->phb->bus), 0);
		pr_debug("  created table: %p\n", pci->table_group);
	} else {
		pr_debug("  found DMA window, table: %p\n", pci->table_group);
	}

	spapr_tce_init_table_group(dev, pdn, prop);

	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
	iommu_add_device(pci->table_group, &dev->dev);
}

/* Decide whether @pdev can bypass the 32-bit IOMMU window by enabling a
 * dynamic DMA window; returns true only when direct mapping was set up.
 */
static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
{
	struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;

	/* For DDW, DMA mask should be more than 32-bits. For mask more then
	 * 32-bits but less then 64-bits, DMA addressing is supported in
	 * Limited Addressing mode.
	 */
	if (dma_mask <= DMA_BIT_MASK(32))
		return false;

	dev_dbg(&pdev->dev, "node is %pOF\n", dn);

	/*
	 * the device tree might contain the dma-window properties
	 * per-device and not necessarily for the bus. So we need to
	 * search upwards in the tree until we either hit a dma-window
	 * property, OR find a parent with a table already allocated.
	 */
	pdn = pci_dma_find(dn, NULL);
	if (pdn && PCI_DN(pdn))
		return enable_ddw(pdev, pdn, dma_mask);

	return false;
}

#ifdef CONFIG_IOMMU_API
/*
 * A simple iommu_table_group_ops which only allows reusing the existing
 * iommu_table. This handles VFIO for POWER7 or the nested KVM.
 * The ops does not allow creating windows and only allows reusing the existing
 * one if it matches table_group->tce32_start/tce32_size/page_shift.
 */
static unsigned long spapr_tce_get_table_size(__u32 page_shift,
					      __u64 window_size, __u32 levels)
{
	unsigned long size;

	/* Multi-level TCE tables are not supported here */
	if (levels > 1)
		return ~0U;
	/* One 8-byte TCE per IOMMU page: window_size / (1 << page_shift) * 8 */
	size = window_size >> (page_shift - 3);
	return size;
}

static struct pci_dev *iommu_group_get_first_pci_dev(struct iommu_group *group)
{
	struct pci_dev *pdev = NULL;
	int ret;

	/* No IOMMU group ? */
	if (!group)
		return NULL;

	/* NOTE(review): assumes dev_has_iommu_table() reports the matching
	 * device back through &pdev — confirm against its definition.
	 */
	ret = iommu_group_for_each_dev(group, &pdev, dev_has_iommu_table);
	if (!ret || !pdev)
		return NULL;
	return pdev;
}

/* Reset the PE's DMA windows via RTAS and re-create the default window
 * property from the saved copy ("ibm,dma-window-saved").
 */
static void restore_default_dma_window(struct pci_dev *pdev, struct device_node *pdn)
{
	reset_dma_window(pdev, pdn);
	copy_property(pdn, "ibm,dma-window-saved", "ibm,dma-window");
}

/* Tear down any existing dynamic DMA window for @pdn: remove its OF property,
 * drop the corresponding iommu_table reference (for non-direct mappings), and
 * unlink the entry from dma_win_list. Always returns 0.
 */
static long remove_dynamic_dma_windows(struct pci_dev *pdev, struct device_node *pdn)
{
	struct pci_dn *pci = PCI_DN(pdn);
	struct dma_win *window;
	bool direct_mapping;
	int len;

	if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len, &direct_mapping)) {
		remove_dma_window_named(pdn, true, direct_mapping ?
					DIRECT64_PROPNAME : DMA64_PROPNAME, true);
		if (!direct_mapping) {
			WARN_ON(!pci->table_group->tables[0] && !pci->table_group->tables[1]);

			if (pci->table_group->tables[1]) {
				iommu_tce_table_put(pci->table_group->tables[1]);
				pci->table_group->tables[1] = NULL;
			} else if (pci->table_group->tables[0]) {
				/* Default window was removed and only the DDW exists */
				iommu_tce_table_put(pci->table_group->tables[0]);
				pci->table_group->tables[0] = NULL;
			}
		}
		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			if (window->device == pdn) {
				list_del(&window->list);
				kfree(window);
				break;
			}
		}
		spin_unlock(&dma_win_list_lock);
	}

	return 0;
}

/* Restore and re-register the default 32-bit DMA window for @dev after the
 * userspace driver (VFIO) released ownership. Returns 0 on success, -1 on
 * failure.
 */
static long pseries_setup_default_iommu_config(struct iommu_table_group *table_group,
					       struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	const __be32 *default_prop;
	long liobn, offset, size;
	struct device_node *pdn;
	struct iommu_table *tbl;
	struct pci_dn *pci;

	pdn = pci_dma_find_parent_node(pdev, table_group);
	if (!pdn || !PCI_DN(pdn)) {
		dev_warn(&pdev->dev, "No table_group configured for the node %pOF\n", pdn);
		return -1;
	}
	pci = PCI_DN(pdn);

	/* The default window is restored if not present already on removal of DDW.
	 * However, if used by VFIO SPAPR sub driver, the user's order of removal of
	 * windows might have been different to not leading to auto restoration,
	 * suppose the DDW was removed first followed by the default one.
	 * So, restore the default window with reset-pe-dma call explicitly.
	 */
	restore_default_dma_window(pdev, pdn);

	default_prop = of_get_property(pdn, "ibm,dma-window", NULL);
	of_parse_dma_window(pdn, default_prop, &liobn, &offset, &size);
	tbl = iommu_pseries_alloc_table(pci->phb->node);
	if (!tbl) {
		dev_err(&pdev->dev, "couldn't create new IOMMU table\n");
		return -1;
	}

	iommu_table_setparms_common(tbl, pci->phb->bus->number, liobn, offset,
				    size, IOMMU_PAGE_SHIFT_4K, NULL,
				    &iommu_table_lpar_multi_ops);
	iommu_init_table(tbl, pci->phb->node, 0, 0);

	pci->table_group->tables[0] = tbl;
	set_iommu_table_base(&pdev->dev, tbl);

	return 0;
}

/* True when the requested window geometry fits the default 32-bit window
 * (size within tce32_size, 4K IOMMU pages).
 */
static bool is_default_window_request(struct iommu_table_group *table_group, __u32 page_shift,
				      __u64 window_size)
{
	if ((window_size <= table_group->tce32_size) &&
	    (page_shift == IOMMU_PAGE_SHIFT_4K))
		return true;

	return false;
}

/*
 * Create DMA window @num for the PE. A num == 0 request with default-window
 * geometry restores the (possibly saved) default window; any other request
 * creates a 64-bit DDW through the ibm,ddw-applicable RTAS calls.
 */
static long spapr_tce_create_table(struct iommu_table_group *table_group, int num,
				   __u32 page_shift, __u64 window_size, __u32 levels,
				   struct iommu_table **ptbl)
{
	struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group);
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	struct ddw_create_response create;
	unsigned long liobn, offset, size;
	unsigned long start = 0, end = 0;
	struct ddw_query_response query;
	const __be32 *default_prop;
	struct failed_ddw_pdn *fpdn;
	unsigned int window_shift;
	struct device_node *pdn;
	struct iommu_table *tbl;
	struct dma_win *window;
	struct property *win64;
	struct pci_dn *pci;
	u64 win_addr;
	int len, i;
	long ret;

	if (!is_power_of_2(window_size) ||
	    levels > 1)
		return -EINVAL;

	window_shift = order_base_2(window_size);

	mutex_lock(&dma_win_init_mutex);

	ret = -ENODEV;

	pdn = pci_dma_find_parent_node(pdev, table_group);
	if (!pdn || !PCI_DN(pdn)) { /* Neither 32-bit nor 64-bit window exists! */
		dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn);
		goto out_failed;
	}
	pci = PCI_DN(pdn);

	/* If the enable DDW failed for the pdn, dont retry! */
	list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
		if (fpdn->pdn == pdn) {
			dev_info(&pdev->dev, "%pOF in failed DDW device list\n", pdn);
			goto out_unlock;
		}
	}

	tbl = iommu_pseries_alloc_table(pci->phb->node);
	if (!tbl) {
		dev_dbg(&pdev->dev, "couldn't create new IOMMU table\n");
		goto out_unlock;
	}

	if (num == 0) {
		bool direct_mapping;
		/* The request is not for default window? Ensure there is no DDW window already */
		if (!is_default_window_request(table_group, page_shift, window_size)) {
			if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len,
					      &direct_mapping)) {
				dev_warn(&pdev->dev, "%pOF: 64-bit window already present.", pdn);
				ret = -EPERM;
				goto out_unlock;
			}
		} else {
			/* Request is for Default window, ensure there is no DDW if there is a
			 * need to reset. reset-pe otherwise removes the DDW also
			 */
			default_prop = of_get_property(pdn, "ibm,dma-window", NULL);
			if (!default_prop) {
				if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len,
						      &direct_mapping)) {
					dev_warn(&pdev->dev, "%pOF: Attempt to create window#0 when 64-bit window is present. Preventing the attempt as that would destroy the 64-bit window",
						 pdn);
					ret = -EPERM;
					goto out_unlock;
				}

				restore_default_dma_window(pdev, pdn);

				default_prop = of_get_property(pdn, "ibm,dma-window", NULL);
				of_parse_dma_window(pdn, default_prop, &liobn, &offset, &size);
				/* Limit the default window size to window_size */
				iommu_table_setparms_common(tbl, pci->phb->bus->number, liobn,
							    offset, 1UL << window_shift,
							    IOMMU_PAGE_SHIFT_4K, NULL,
							    &iommu_table_lpar_multi_ops);
				iommu_init_table(tbl, pci->phb->node,
						 start >> IOMMU_PAGE_SHIFT_4K,
						 end >> IOMMU_PAGE_SHIFT_4K);

				table_group->tables[0] = tbl;

				mutex_unlock(&dma_win_init_mutex);

				goto exit;
			}
		}
	}

	/* From here on: create a fresh 64-bit dynamic window via RTAS */
	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret) {
		dev_info(&pdev->dev, "ibm,ddw-applicable not found\n");
		goto out_failed;
	}
	ret = -ENODEV;

	pr_err("%s: Calling query %pOF\n", __func__, pdn);
	ret = query_ddw(pdev, ddw_avail, &query, pdn);
	if (ret)
		goto out_failed;
	ret = -ENODEV;

	len = window_shift;
	if (query.largest_available_block < (1ULL << (len - page_shift))) {
		dev_dbg(&pdev->dev, "can't map window 0x%llx with %llu %llu-sized pages\n",
			1ULL << len, query.largest_available_block,
			1ULL << page_shift);
		ret = -EINVAL; /* Retry with smaller window size */
		goto out_unlock;
	}

	if (create_ddw(pdev, ddw_avail, &create, page_shift, len)) {
		pr_err("%s: Create ddw failed %pOF\n", __func__, pdn);
		goto out_failed;
	}

	win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
	win64 = ddw_property_create(DMA64_PROPNAME, create.liobn, win_addr, page_shift, len);
	if (!win64)
		goto remove_window;

	ret = of_add_property(pdn, win64);
	if (ret) {
		dev_err(&pdev->dev, "unable to add DMA window property for %pOF: %ld", pdn, ret);
		goto free_property;
	}
	ret = -ENODEV;

	window = ddw_list_new_entry(pdn, win64->value);
	if (!window)
		goto remove_property;

	window->direct = false;

	for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
		const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;

		/* Look for MMIO32 */
		if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
			start = pci->phb->mem_resources[i].start;
			end = pci->phb->mem_resources[i].end;
			break;
		}
	}

	/* New table for using DDW instead of the default DMA window */
	iommu_table_setparms_common(tbl, pci->phb->bus->number, create.liobn, win_addr,
				    1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
	iommu_init_table(tbl, pci->phb->node, start >> page_shift, end >> page_shift);

	pci->table_group->tables[num] = tbl;
	set_iommu_table_base(&pdev->dev, tbl);
	pdev->dev.archdata.dma_offset = win_addr;

	spin_lock(&dma_win_list_lock);
	list_add(&window->list, &dma_win_list);
	spin_unlock(&dma_win_list_lock);

	mutex_unlock(&dma_win_init_mutex);

	goto exit;

	/* Unwind in reverse order of construction */
remove_property:
	of_remove_property(pdn, win64);
free_property:
	kfree(win64->name);
	kfree(win64->value);
	kfree(win64);
remove_window:
	__remove_dma_window(pdn, ddw_avail, create.liobn);

out_failed:
	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
	if (!fpdn)
		goto out_unlock;
	fpdn->pdn = pdn;
	list_add(&fpdn->list, &failed_ddw_pdn_list);

out_unlock:
	mutex_unlock(&dma_win_init_mutex);

	return ret;
exit:
	/* Allocate the userspace view */
	pseries_tce_iommu_userspace_view_alloc(tbl);
	tbl->it_allocated_size = spapr_tce_get_table_size(page_shift, window_size, levels);

	*ptbl = iommu_tce_table_get(tbl);

	return 0;
}

/* True when @tbl has the geometry of the default 32-bit window
 * (size within tce32_size, 4K IOMMU pages).
 */
static bool is_default_window_table(struct iommu_table_group *table_group, struct iommu_table *tbl)
{
	if (((tbl->it_size << tbl->it_page_shift) <= table_group->tce32_size) &&
	    (tbl->it_page_shift ==
	     IOMMU_PAGE_SHIFT_4K))
		return true;

	return false;
}

/* Only allow (re)attaching the table this group already tracks at @num */
static long spapr_tce_set_window(struct iommu_table_group *table_group,
				 int num, struct iommu_table *tbl)
{
	return tbl == table_group->tables[num] ? 0 : -EPERM;
}

/* Remove window @num: delete its OF property (default or 64-bit, chosen by
 * geometry), drop the dma_win_list entry for 64-bit windows, and release the
 * table reference.
 */
static long spapr_tce_unset_window(struct iommu_table_group *table_group, int num)
{
	struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group);
	struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;
	struct iommu_table *tbl = table_group->tables[num];
	struct failed_ddw_pdn *fpdn;
	struct dma_win *window;
	const char *win_name;
	int ret = -ENODEV;

	if (!tbl) /* The table was never created OR window was never opened */
		return 0;

	mutex_lock(&dma_win_init_mutex);

	if ((num == 0) && is_default_window_table(table_group, tbl))
		win_name = "ibm,dma-window";
	else
		win_name = DMA64_PROPNAME;

	pdn = pci_dma_find(dn, NULL);
	if (!pdn || !PCI_DN(pdn)) { /* Neither 32-bit nor 64-bit window exists! */
		dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn);
		goto out_failed;
	}

	/* Don't clear the TCEs, User should have done it */
	if (remove_dma_window_named(pdn, true, win_name, false)) {
		pr_err("%s: The existing DDW removal failed for node %pOF\n", __func__, pdn);
		goto out_failed; /* Could not remove it either! */
	}

	if (strcmp(win_name, DMA64_PROPNAME) == 0) {
		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			if (window->device == pdn) {
				list_del(&window->list);
				kfree(window);
				break;
			}
		}
		spin_unlock(&dma_win_list_lock);
	}

	iommu_tce_table_put(table_group->tables[num]);
	table_group->tables[num] = NULL;

	ret = 0;

	goto out_unlock;

out_failed:
	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
	if (!fpdn)
		goto out_unlock;
	fpdn->pdn = pdn;
	list_add(&fpdn->list, &failed_ddw_pdn_list);

out_unlock:
	mutex_unlock(&dma_win_init_mutex);

	return ret;
}

/* Hand the PE's DMA windows over to userspace (VFIO): tear down any DDW and
 * the default window so the user can create windows as needed. Returns 0 on
 * success, -1 on failure.
 */
static long spapr_tce_take_ownership(struct iommu_table_group *table_group, struct device *dev)
{
	struct iommu_table *tbl = table_group->tables[0];
	struct pci_dev *pdev = to_pci_dev(dev);
	struct device_node *dn = pci_device_to_OF_node(pdev);
	struct device_node *pdn;

	/* SRIOV VFs using direct map by the host driver OR multifunction devices
	 * where the ownership was taken on the attempt by the first function
	 */
	if (!tbl && (table_group->max_dynamic_windows_supported != 1))
		return 0;

	mutex_lock(&dma_win_init_mutex);

	pdn = pci_dma_find(dn, NULL);
	if (!pdn || !PCI_DN(pdn)) { /* Neither 32-bit nor 64-bit window exists! */
		dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn);
		mutex_unlock(&dma_win_init_mutex);
		return -1;
	}

	/*
	 * Though rtas call reset-pe removes the DDW, it doesn't clear the entries on the table
	 * if there are any. In case of direct map, the entries will be left over, which
	 * is fine for PEs with 2 DMA windows where the second window is created with create-pe
	 * at which point the table is cleared. However, on VFs having only one DMA window, the
	 * default window would end up seeing the entries left over from the direct map done
	 * on the second window. So, remove the ddw explicitly so that clean_dma_window()
	 * cleans up the entries if any.
	 */
	if (remove_dynamic_dma_windows(pdev, pdn)) {
		dev_warn(&pdev->dev, "The existing DDW removal failed for node %pOF\n", pdn);
		mutex_unlock(&dma_win_init_mutex);
		return -1;
	}

	/* The table_group->tables[0] is not null now, it must be the default window
	 * Remove it, let the userspace create it as it needs.
	 */
	if (table_group->tables[0]) {
		remove_dma_window_named(pdn, true, "ibm,dma-window", true);
		iommu_tce_table_put(tbl);
		table_group->tables[0] = NULL;
	}
	set_iommu_table_base(dev, NULL);

	mutex_unlock(&dma_win_init_mutex);

	return 0;
}

/* Return ownership to the kernel: rebuild the default window unless it is
 * already in place.
 */
static void spapr_tce_release_ownership(struct iommu_table_group *table_group, struct device *dev)
{
	struct iommu_table *tbl = table_group->tables[0];

	if (tbl) { /* Default window already restored */
		return;
	}

	mutex_lock(&dma_win_init_mutex);

	/* Restore the default window */
	pseries_setup_default_iommu_config(table_group, dev);

	mutex_unlock(&dma_win_init_mutex);

	return;
}

static struct iommu_table_group_ops spapr_tce_table_group_ops = {
	.get_table_size = spapr_tce_get_table_size,
	.create_table = spapr_tce_create_table,
	.set_window = spapr_tce_set_window,
	.unset_window = spapr_tce_unset_window,
	.take_ownership = spapr_tce_take_ownership,
	.release_ownership = spapr_tce_release_ownership,
};
#endif

static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
			      void *data)
{
	struct dma_win *window;
	struct memory_notify *arg = data;
	int ret = 0;

	/* This notifier can get called when onlining persistent memory as well.
	 * TCEs are not pre-mapped for persistent memory.
Persistent memory will2427* always be above ddw_memory_hotplug_max()2428*/24292430switch (action) {2431case MEM_GOING_ONLINE:2432spin_lock(&dma_win_list_lock);2433list_for_each_entry(window, &dma_win_list, list) {2434if (window->direct && (arg->start_pfn << PAGE_SHIFT) <2435ddw_memory_hotplug_max()) {2436ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,2437arg->nr_pages, window->prop);2438}2439/* XXX log error */2440}2441spin_unlock(&dma_win_list_lock);2442break;2443case MEM_CANCEL_ONLINE:2444case MEM_OFFLINE:2445spin_lock(&dma_win_list_lock);2446list_for_each_entry(window, &dma_win_list, list) {2447if (window->direct && (arg->start_pfn << PAGE_SHIFT) <2448ddw_memory_hotplug_max()) {2449ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,2450arg->nr_pages, window->prop);2451}2452/* XXX log error */2453}2454spin_unlock(&dma_win_list_lock);2455break;2456default:2457break;2458}2459if (ret && action != MEM_CANCEL_ONLINE)2460return NOTIFY_BAD;24612462return NOTIFY_OK;2463}24642465static struct notifier_block iommu_mem_nb = {2466.notifier_call = iommu_mem_notifier,2467};24682469static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data)2470{2471int err = NOTIFY_OK;2472struct of_reconfig_data *rd = data;2473struct device_node *np = rd->dn;2474struct pci_dn *pci = PCI_DN(np);2475struct dma_win *window;24762477switch (action) {2478case OF_RECONFIG_DETACH_NODE:2479/*2480* Removing the property will invoke the reconfig2481* notifier again, which causes dead-lock on the2482* read-write semaphore of the notifier chain. 
So2483* we have to remove the property when releasing2484* the device node.2485*/2486if (remove_dma_window_named(np, false, DIRECT64_PROPNAME, true))2487remove_dma_window_named(np, false, DMA64_PROPNAME, true);24882489if (pci && pci->table_group)2490iommu_pseries_free_group(pci->table_group,2491np->full_name);24922493spin_lock(&dma_win_list_lock);2494list_for_each_entry(window, &dma_win_list, list) {2495if (window->device == np) {2496list_del(&window->list);2497kfree(window);2498break;2499}2500}2501spin_unlock(&dma_win_list_lock);2502break;2503default:2504err = NOTIFY_DONE;2505break;2506}2507return err;2508}25092510static struct notifier_block iommu_reconfig_nb = {2511.notifier_call = iommu_reconfig_notifier,2512};25132514/* These are called very early. */2515void __init iommu_init_early_pSeries(void)2516{2517if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))2518return;25192520if (firmware_has_feature(FW_FEATURE_LPAR)) {2521pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;2522pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;2523if (!disable_ddw)2524pseries_pci_controller_ops.iommu_bypass_supported =2525iommu_bypass_supported_pSeriesLP;2526} else {2527pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;2528pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;2529}253025312532of_reconfig_notifier_register(&iommu_reconfig_nb);2533register_memory_notifier(&iommu_mem_nb);25342535set_pci_dma_ops(&dma_iommu_ops);2536}25372538static int __init disable_multitce(char *str)2539{2540if (strcmp(str, "off") == 0 &&2541firmware_has_feature(FW_FEATURE_LPAR) &&2542(firmware_has_feature(FW_FEATURE_PUT_TCE_IND) ||2543firmware_has_feature(FW_FEATURE_STUFF_TCE))) {2544printk(KERN_INFO "Disabling MULTITCE firmware feature\n");2545powerpc_firmware_features &=2546~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE);2547}2548return 1;2549}25502551__setup("multitce=", disable_multitce);25522553#ifdef 
CONFIG_SPAPR_TCE_IOMMU2554struct iommu_group *pSeries_pci_device_group(struct pci_controller *hose,2555struct pci_dev *pdev)2556{2557struct device_node *pdn, *dn = pdev->dev.of_node;2558struct iommu_group *grp;2559struct pci_dn *pci;25602561pdn = pci_dma_find(dn, NULL);2562if (!pdn || !PCI_DN(pdn))2563return ERR_PTR(-ENODEV);25642565pci = PCI_DN(pdn);2566if (!pci->table_group)2567return ERR_PTR(-ENODEV);25682569grp = pci->table_group->group;2570if (!grp)2571return ERR_PTR(-ENODEV);25722573return iommu_group_ref_get(grp);2574}2575#endif257625772578