/* Source: arch/powerpc/platforms/pseries/iommu.c (scraped from blob/master, 51347 views) */
// SPDX-License-Identifier: GPL-2.0-or-later1/*2* Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation3*4* Rewrite, cleanup:5*6* Copyright (C) 2004 Olof Johansson <[email protected]>, IBM Corporation7* Copyright (C) 2006 Olof Johansson <[email protected]>8*9* Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.10*/1112#include <linux/init.h>13#include <linux/types.h>14#include <linux/slab.h>15#include <linux/mm.h>16#include <linux/memblock.h>17#include <linux/spinlock.h>18#include <linux/string.h>19#include <linux/pci.h>20#include <linux/dma-mapping.h>21#include <linux/crash_dump.h>22#include <linux/memory.h>23#include <linux/vmalloc.h>24#include <linux/of.h>25#include <linux/of_address.h>26#include <linux/iommu.h>27#include <linux/rculist.h>28#include <asm/io.h>29#include <asm/prom.h>30#include <asm/rtas.h>31#include <asm/iommu.h>32#include <asm/pci-bridge.h>33#include <asm/machdep.h>34#include <asm/firmware.h>35#include <asm/tce.h>36#include <asm/ppc-pci.h>37#include <asm/udbg.h>38#include <asm/mmzone.h>39#include <asm/plpar_wrappers.h>4041#include "pseries.h"4243enum {44DDW_QUERY_PE_DMA_WIN = 0,45DDW_CREATE_PE_DMA_WIN = 1,46DDW_REMOVE_PE_DMA_WIN = 2,4748DDW_APPLICABLE_SIZE49};5051enum {52DDW_EXT_SIZE = 0,53DDW_EXT_RESET_DMA_WIN = 1,54DDW_EXT_QUERY_OUT_SIZE = 2,55DDW_EXT_LIMITED_ADDR_MODE = 356};5758static struct iommu_table *iommu_pseries_alloc_table(int node)59{60struct iommu_table *tbl;6162tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);63if (!tbl)64return NULL;6566INIT_LIST_HEAD_RCU(&tbl->it_group_list);67kref_init(&tbl->it_kref);68return tbl;69}7071#ifdef CONFIG_IOMMU_API72static struct iommu_table_group_ops spapr_tce_table_group_ops;73#endif7475static struct iommu_table_group *iommu_pseries_alloc_group(int node)76{77struct iommu_table_group *table_group;7879table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);80if (!table_group)81return NULL;8283#ifdef CONFIG_IOMMU_API84table_group->ops = 
&spapr_tce_table_group_ops;85table_group->pgsizes = SZ_4K;86#endif8788table_group->tables[0] = iommu_pseries_alloc_table(node);89if (table_group->tables[0])90return table_group;9192kfree(table_group);93return NULL;94}9596static void iommu_pseries_free_group(struct iommu_table_group *table_group,97const char *node_name)98{99if (!table_group)100return;101102#ifdef CONFIG_IOMMU_API103if (table_group->group) {104iommu_group_put(table_group->group);105BUG_ON(table_group->group);106}107#endif108109/* Default DMA window table is at index 0, while DDW at 1. SR-IOV110* adapters only have table on index 0(if not direct mapped).111*/112if (table_group->tables[0])113iommu_tce_table_put(table_group->tables[0]);114115if (table_group->tables[1])116iommu_tce_table_put(table_group->tables[1]);117118kfree(table_group);119}120121static int tce_build_pSeries(struct iommu_table *tbl, long index,122long npages, unsigned long uaddr,123enum dma_data_direction direction,124unsigned long attrs)125{126u64 proto_tce;127__be64 *tcep;128u64 rpn;129const unsigned long tceshift = tbl->it_page_shift;130const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);131132proto_tce = TCE_PCI_READ; // Read allowed133134if (direction != DMA_TO_DEVICE)135proto_tce |= TCE_PCI_WRITE;136137tcep = ((__be64 *)tbl->it_base) + index;138139while (npages--) {140/* can't move this out since we might cross MEMBLOCK boundary */141rpn = __pa(uaddr) >> tceshift;142*tcep = cpu_to_be64(proto_tce | rpn << tceshift);143144uaddr += pagesize;145tcep++;146}147return 0;148}149150151static void tce_clear_pSeries(struct iommu_table *tbl, long index, long npages)152{153__be64 *tcep;154155tcep = ((__be64 *)tbl->it_base) + index;156157while (npages--)158*(tcep++) = 0;159}160161static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)162{163__be64 *tcep;164165tcep = ((__be64 *)tbl->it_base) + index;166167return be64_to_cpu(*tcep);168}169170#ifdef CONFIG_IOMMU_API171static long pseries_tce_iommu_userspace_view_alloc(struct 
iommu_table *tbl)172{173unsigned long cb = ALIGN(sizeof(tbl->it_userspace[0]) * tbl->it_size, PAGE_SIZE);174unsigned long *uas;175176if (tbl->it_indirect_levels) /* Impossible */177return -EPERM;178179WARN_ON(tbl->it_userspace);180181uas = vzalloc(cb);182if (!uas)183return -ENOMEM;184185tbl->it_userspace = (__be64 *) uas;186187return 0;188}189#endif190191static void tce_iommu_userspace_view_free(struct iommu_table *tbl)192{193vfree(tbl->it_userspace);194tbl->it_userspace = NULL;195}196197static void tce_free_pSeries(struct iommu_table *tbl)198{199if (tbl->it_userspace)200tce_iommu_userspace_view_free(tbl);201}202203static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);204static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);205206static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,207long npages, unsigned long uaddr,208enum dma_data_direction direction,209unsigned long attrs)210{211u64 rc = 0;212u64 proto_tce, tce;213u64 rpn;214int ret = 0;215long tcenum_start = tcenum, npages_start = npages;216217rpn = __pa(uaddr) >> tceshift;218proto_tce = TCE_PCI_READ;219if (direction != DMA_TO_DEVICE)220proto_tce |= TCE_PCI_WRITE;221222while (npages--) {223tce = proto_tce | rpn << tceshift;224rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);225226if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {227ret = (int)rc;228tce_free_pSeriesLP(liobn, tcenum_start, tceshift,229(npages_start - (npages + 1)));230break;231}232233if (rc && printk_ratelimit()) {234printk("tce_build_pSeriesLP: plpar_tce_put failed. 
rc=%lld\n", rc);235printk("\tindex = 0x%llx\n", (u64)liobn);236printk("\ttcenum = 0x%llx\n", (u64)tcenum);237printk("\ttce val = 0x%llx\n", tce );238dump_stack();239}240241tcenum++;242rpn++;243}244return ret;245}246247static DEFINE_PER_CPU(__be64 *, tce_page);248249static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,250long npages, unsigned long uaddr,251enum dma_data_direction direction,252unsigned long attrs)253{254u64 rc = 0;255u64 proto_tce;256__be64 *tcep;257u64 rpn;258long l, limit;259long tcenum_start = tcenum, npages_start = npages;260int ret = 0;261unsigned long flags;262const unsigned long tceshift = tbl->it_page_shift;263264if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {265return tce_build_pSeriesLP(tbl->it_index, tcenum,266tceshift, npages, uaddr,267direction, attrs);268}269270local_irq_save(flags); /* to protect tcep and the page behind it */271272tcep = __this_cpu_read(tce_page);273274/* This is safe to do since interrupts are off when we're called275* from iommu_alloc{,_sg}()276*/277if (!tcep) {278tcep = (__be64 *)__get_free_page(GFP_ATOMIC);279/* If allocation fails, fall back to the loop implementation */280if (!tcep) {281local_irq_restore(flags);282return tce_build_pSeriesLP(tbl->it_index, tcenum,283tceshift,284npages, uaddr, direction, attrs);285}286__this_cpu_write(tce_page, tcep);287}288289rpn = __pa(uaddr) >> tceshift;290proto_tce = TCE_PCI_READ;291if (direction != DMA_TO_DEVICE)292proto_tce |= TCE_PCI_WRITE;293294/* We can map max one pageful of TCEs at a time */295do {296/*297* Set up the page with TCE data, looping through and setting298* the values.299*/300limit = min_t(long, npages, 4096 / TCE_ENTRY_SIZE);301302for (l = 0; l < limit; l++) {303tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift);304rpn++;305}306307rc = plpar_tce_put_indirect((u64)tbl->it_index,308(u64)tcenum << tceshift,309(u64)__pa(tcep),310limit);311312npages -= limit;313tcenum += limit;314} while (npages > 0 && 
!rc);315316local_irq_restore(flags);317318if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {319ret = (int)rc;320tce_freemulti_pSeriesLP(tbl, tcenum_start,321(npages_start - (npages + limit)));322return ret;323}324325if (rc && printk_ratelimit()) {326printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);327printk("\tindex = 0x%llx\n", (u64)tbl->it_index);328printk("\tnpages = 0x%llx\n", (u64)npages);329printk("\ttce[0] val = 0x%llx\n", tcep[0]);330dump_stack();331}332return ret;333}334335static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,336long npages)337{338u64 rc;339340while (npages--) {341rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0);342343if (rc && printk_ratelimit()) {344printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);345printk("\tindex = 0x%llx\n", (u64)liobn);346printk("\ttcenum = 0x%llx\n", (u64)tcenum);347dump_stack();348}349350tcenum++;351}352}353354355static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)356{357u64 rc;358long rpages = npages;359unsigned long limit;360361if (!firmware_has_feature(FW_FEATURE_STUFF_TCE))362return tce_free_pSeriesLP(tbl->it_index, tcenum,363tbl->it_page_shift, npages);364365do {366limit = min_t(unsigned long, rpages, 512);367368rc = plpar_tce_stuff((u64)tbl->it_index,369(u64)tcenum << tbl->it_page_shift, 0, limit);370371rpages -= limit;372tcenum += limit;373} while (rpages > 0 && !rc);374375if (rc && printk_ratelimit()) {376printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");377printk("\trc = %lld\n", rc);378printk("\tindex = 0x%llx\n", (u64)tbl->it_index);379printk("\tnpages = 0x%llx\n", (u64)npages);380dump_stack();381}382}383384static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)385{386u64 rc;387unsigned long tce_ret;388389rc = plpar_tce_get((u64)tbl->it_index,390(u64)tcenum << tbl->it_page_shift, &tce_ret);391392if (rc && printk_ratelimit()) {393printk("tce_get_pSeriesLP: plpar_tce_get 
failed. rc=%lld\n", rc);394printk("\tindex = 0x%llx\n", (u64)tbl->it_index);395printk("\ttcenum = 0x%llx\n", (u64)tcenum);396dump_stack();397}398399return tce_ret;400}401402/* this is compatible with cells for the device tree property */403struct dynamic_dma_window_prop {404__be32 liobn; /* tce table number */405__be64 dma_base; /* address hi,lo */406__be32 tce_shift; /* ilog2(tce_page_size) */407__be32 window_shift; /* ilog2(tce_window_size) */408};409410struct dma_win {411struct device_node *device;412const struct dynamic_dma_window_prop *prop;413bool direct;414struct list_head list;415};416417/* Dynamic DMA Window support */418struct ddw_query_response {419u32 windows_available;420u64 largest_available_block;421u32 page_size;422u32 migration_capable;423};424425struct ddw_create_response {426u32 liobn;427u32 addr_hi;428u32 addr_lo;429};430431static LIST_HEAD(dma_win_list);432/* prevents races between memory on/offline and window creation */433static DEFINE_SPINLOCK(dma_win_list_lock);434/* protects initializing window twice for same device */435static DEFINE_MUTEX(dma_win_init_mutex);436437static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,438unsigned long num_pfn, const void *arg)439{440const struct dynamic_dma_window_prop *maprange = arg;441int rc;442u64 tce_size, num_tce, dma_offset, next;443u32 tce_shift;444long limit;445446tce_shift = be32_to_cpu(maprange->tce_shift);447tce_size = 1ULL << tce_shift;448next = start_pfn << PAGE_SHIFT;449num_tce = num_pfn << PAGE_SHIFT;450451/* round back to the beginning of the tce page size */452num_tce += next & (tce_size - 1);453next &= ~(tce_size - 1);454455/* covert to number of tces */456num_tce |= tce_size - 1;457num_tce >>= tce_shift;458459do {460/*461* Set up the page with TCE data, looping through and setting462* the values.463*/464limit = min_t(long, num_tce, 512);465dma_offset = next + be64_to_cpu(maprange->dma_base);466467rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),468dma_offset,4690, 
limit);470next += limit * tce_size;471num_tce -= limit;472} while (num_tce > 0 && !rc);473474return rc;475}476477static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,478unsigned long num_pfn, const void *arg)479{480const struct dynamic_dma_window_prop *maprange = arg;481u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn;482__be64 *tcep;483u32 tce_shift;484u64 rc = 0;485long l, limit;486487if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {488unsigned long tceshift = be32_to_cpu(maprange->tce_shift);489unsigned long dmastart = (start_pfn << PAGE_SHIFT) +490be64_to_cpu(maprange->dma_base);491unsigned long tcenum = dmastart >> tceshift;492unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift;493void *uaddr = __va(start_pfn << PAGE_SHIFT);494495return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn),496tcenum, tceshift, npages, (unsigned long) uaddr,497DMA_BIDIRECTIONAL, 0);498}499500local_irq_disable(); /* to protect tcep and the page behind it */501tcep = __this_cpu_read(tce_page);502503if (!tcep) {504tcep = (__be64 *)__get_free_page(GFP_ATOMIC);505if (!tcep) {506local_irq_enable();507return -ENOMEM;508}509__this_cpu_write(tce_page, tcep);510}511512proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;513514liobn = (u64)be32_to_cpu(maprange->liobn);515tce_shift = be32_to_cpu(maprange->tce_shift);516tce_size = 1ULL << tce_shift;517next = start_pfn << PAGE_SHIFT;518num_tce = num_pfn << PAGE_SHIFT;519520/* round back to the beginning of the tce page size */521num_tce += next & (tce_size - 1);522next &= ~(tce_size - 1);523524/* covert to number of tces */525num_tce |= tce_size - 1;526num_tce >>= tce_shift;527528/* We can map max one pageful of TCEs at a time */529do {530/*531* Set up the page with TCE data, looping through and setting532* the values.533*/534limit = min_t(long, num_tce, 4096 / TCE_ENTRY_SIZE);535dma_offset = next + be64_to_cpu(maprange->dma_base);536537for (l = 0; l < limit; l++) {538tcep[l] = cpu_to_be64(proto_tce | next);539next += 
tce_size;540}541542rc = plpar_tce_put_indirect(liobn,543dma_offset,544(u64)__pa(tcep),545limit);546547num_tce -= limit;548} while (num_tce > 0 && !rc);549550/* error cleanup: caller will clear whole range */551552local_irq_enable();553return rc;554}555556static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,557unsigned long num_pfn, void *arg)558{559return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);560}561562static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno,563unsigned long liobn, unsigned long win_addr,564unsigned long window_size, unsigned long page_shift,565void *base, struct iommu_table_ops *table_ops)566{567tbl->it_busno = busno;568tbl->it_index = liobn;569tbl->it_offset = win_addr >> page_shift;570tbl->it_size = window_size >> page_shift;571tbl->it_page_shift = page_shift;572tbl->it_base = (unsigned long)base;573tbl->it_blocksize = 16;574tbl->it_type = TCE_PCI;575tbl->it_ops = table_ops;576}577578struct iommu_table_ops iommu_table_pseries_ops;579580static void iommu_table_setparms(struct pci_controller *phb,581struct device_node *dn,582struct iommu_table *tbl)583{584struct device_node *node;585const unsigned long *basep;586const u32 *sizep;587588/* Test if we are going over 2GB of DMA space */589if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {590udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");591panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");592}593594node = phb->dn;595basep = of_get_property(node, "linux,tce-base", NULL);596sizep = of_get_property(node, "linux,tce-size", NULL);597if (basep == NULL || sizep == NULL) {598printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF has "599"missing tce entries !\n", dn);600return;601}602603iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur,604phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,605__va(*basep), &iommu_table_pseries_ops);606607if (!is_kdump_kernel())608memset((void 
*)tbl->it_base, 0, *sizep);609610phb->dma_window_base_cur += phb->dma_window_size;611}612613struct iommu_table_ops iommu_table_lpar_multi_ops;614615struct iommu_table_ops iommu_table_pseries_ops = {616.set = tce_build_pSeries,617.clear = tce_clear_pSeries,618.get = tce_get_pseries619};620621static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)622{623struct device_node *dn;624struct iommu_table *tbl;625struct device_node *isa_dn, *isa_dn_orig;626struct device_node *tmp;627struct pci_dn *pci;628int children;629630dn = pci_bus_to_OF_node(bus);631632pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn);633634if (bus->self) {635/* This is not a root bus, any setup will be done for the636* device-side of the bridge in iommu_dev_setup_pSeries().637*/638return;639}640pci = PCI_DN(dn);641642/* Check if the ISA bus on the system is under643* this PHB.644*/645isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");646647while (isa_dn && isa_dn != dn)648isa_dn = isa_dn->parent;649650of_node_put(isa_dn_orig);651652/* Count number of direct PCI children of the PHB. */653for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)654children++;655656pr_debug("Children: %d\n", children);657658/* Calculate amount of DMA window per slot. Each window must be659* a power of two (due to pci_alloc_consistent requirements).660*661* Keep 256MB aside for PHBs with ISA.662*/663664if (!isa_dn) {665/* No ISA/IDE - just set window size and return */666pci->phb->dma_window_size = 0x80000000ul; /* To be divided */667668while (pci->phb->dma_window_size * children > 0x80000000ul)669pci->phb->dma_window_size >>= 1;670pr_debug("No ISA/IDE, window size is 0x%llx\n",671pci->phb->dma_window_size);672pci->phb->dma_window_base_cur = 0;673674return;675}676677/* If we have ISA, then we probably have an IDE678* controller too. 
Allocate a 128MB table but679* skip the first 128MB to avoid stepping on ISA680* space.681*/682pci->phb->dma_window_size = 0x8000000ul;683pci->phb->dma_window_base_cur = 0x8000000ul;684685pci->table_group = iommu_pseries_alloc_group(pci->phb->node);686tbl = pci->table_group->tables[0];687688iommu_table_setparms(pci->phb, dn, tbl);689690if (!iommu_init_table(tbl, pci->phb->node, 0, 0))691panic("Failed to initialize iommu table");692693/* Divide the rest (1.75GB) among the children */694pci->phb->dma_window_size = 0x80000000ul;695while (pci->phb->dma_window_size * children > 0x70000000ul)696pci->phb->dma_window_size >>= 1;697698pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);699}700701#ifdef CONFIG_IOMMU_API702static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned703long *tce, enum dma_data_direction *direction)704{705long rc;706unsigned long ioba = (unsigned long) index << tbl->it_page_shift;707unsigned long flags, oldtce = 0;708u64 proto_tce = iommu_direction_to_tce_perm(*direction);709unsigned long newtce = *tce | proto_tce;710711spin_lock_irqsave(&tbl->large_pool.lock, flags);712713rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);714if (!rc)715rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);716717if (!rc) {718*direction = iommu_tce_direction(oldtce);719*tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);720}721722spin_unlock_irqrestore(&tbl->large_pool.lock, flags);723724return rc;725}726727static __be64 *tce_useraddr_pSeriesLP(struct iommu_table *tbl, long index,728bool __always_unused alloc)729{730return tbl->it_userspace ? 
&tbl->it_userspace[index - tbl->it_offset] : NULL;731}732#endif733734struct iommu_table_ops iommu_table_lpar_multi_ops = {735.set = tce_buildmulti_pSeriesLP,736#ifdef CONFIG_IOMMU_API737.xchg_no_kill = tce_exchange_pseries,738.useraddrptr = tce_useraddr_pSeriesLP,739#endif740.clear = tce_freemulti_pSeriesLP,741.get = tce_get_pSeriesLP,742.free = tce_free_pSeries743};744745#ifdef CONFIG_IOMMU_API746/*747* When the DMA window properties might have been removed,748* the parent node has the table_group setup on it.749*/750static struct device_node *pci_dma_find_parent_node(struct pci_dev *dev,751struct iommu_table_group *table_group)752{753struct device_node *dn = pci_device_to_OF_node(dev);754struct pci_dn *rpdn;755756for (; dn && PCI_DN(dn); dn = dn->parent) {757rpdn = PCI_DN(dn);758759if (table_group == rpdn->table_group)760return dn;761}762763return NULL;764}765#endif766767/*768* Find nearest ibm,dma-window (default DMA window) or direct DMA window or769* dynamic 64bit DMA window, walking up the device tree.770*/771static struct device_node *pci_dma_find(struct device_node *dn,772struct dynamic_dma_window_prop *prop)773{774const __be32 *default_prop = NULL;775const __be32 *ddw_prop = NULL;776struct device_node *rdn = NULL;777bool default_win = false, ddw_win = false;778779for ( ; dn && PCI_DN(dn); dn = dn->parent) {780default_prop = of_get_property(dn, "ibm,dma-window", NULL);781if (default_prop) {782rdn = dn;783default_win = true;784}785ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);786if (ddw_prop) {787rdn = dn;788ddw_win = true;789break;790}791ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL);792if (ddw_prop) {793rdn = dn;794ddw_win = true;795break;796}797798/* At least found default window, which is the case for normal boot */799if (default_win)800break;801}802803/* For PCI devices there will always be a DMA window, either on the device804* or parent bus805*/806WARN_ON(!(default_win | ddw_win));807808/* caller doesn't want to get DMA window property 
*/809if (!prop)810return rdn;811812/* parse DMA window property. During normal system boot, only default813* DMA window is passed in OF. But, for kdump, a dedicated adapter might814* have both default and DDW in FDT. In this scenario, DDW takes precedence815* over default window.816*/817if (ddw_win) {818struct dynamic_dma_window_prop *p;819820p = (struct dynamic_dma_window_prop *)ddw_prop;821prop->liobn = p->liobn;822prop->dma_base = p->dma_base;823prop->tce_shift = p->tce_shift;824prop->window_shift = p->window_shift;825} else if (default_win) {826unsigned long offset, size, liobn;827828of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size);829830prop->liobn = cpu_to_be32((u32)liobn);831prop->dma_base = cpu_to_be64(offset);832prop->tce_shift = cpu_to_be32(IOMMU_PAGE_SHIFT_4K);833prop->window_shift = cpu_to_be32(order_base_2(size));834}835836return rdn;837}838839static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)840{841struct iommu_table *tbl;842struct device_node *dn, *pdn;843struct pci_dn *ppci;844struct dynamic_dma_window_prop prop;845846dn = pci_bus_to_OF_node(bus);847848pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",849dn);850851pdn = pci_dma_find(dn, &prop);852853/* In PPC architecture, there will always be DMA window on bus or one of the854* parent bus. During reboot, there will be ibm,dma-window property to855* define DMA window. For kdump, there will at least be default window or DDW856* or both.857* There is an exception to the above. 
In case the PE goes into frozen858* state, firmware may not provide ibm,dma-window property at the time859* of LPAR boot up.860*/861862if (!pdn) {863pr_debug(" no ibm,dma-window property !\n");864return;865}866867ppci = PCI_DN(pdn);868869pr_debug(" parent is %pOF, iommu_table: 0x%p\n",870pdn, ppci->table_group);871872if (!ppci->table_group) {873ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);874tbl = ppci->table_group->tables[0];875876iommu_table_setparms_common(tbl, ppci->phb->bus->number,877be32_to_cpu(prop.liobn),878be64_to_cpu(prop.dma_base),8791ULL << be32_to_cpu(prop.window_shift),880be32_to_cpu(prop.tce_shift), NULL,881&iommu_table_lpar_multi_ops);882883if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))884panic("Failed to initialize iommu table");885886iommu_register_group(ppci->table_group,887pci_domain_nr(bus), 0);888pr_debug(" created table: %p\n", ppci->table_group);889}890}891892893static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)894{895struct device_node *dn;896struct iommu_table *tbl;897898pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));899900dn = dev->dev.of_node;901902/* If we're the direct child of a root bus, then we need to allocate903* an iommu table ourselves. The bus setup code should have setup904* the window sizes already.905*/906if (!dev->bus->self) {907struct pci_controller *phb = PCI_DN(dn)->phb;908909pr_debug(" --> first child, no bridge. 
Allocating iommu table.\n");910PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);911tbl = PCI_DN(dn)->table_group->tables[0];912iommu_table_setparms(phb, dn, tbl);913914if (!iommu_init_table(tbl, phb->node, 0, 0))915panic("Failed to initialize iommu table");916917set_iommu_table_base(&dev->dev, tbl);918return;919}920921/* If this device is further down the bus tree, search upwards until922* an already allocated iommu table is found and use that.923*/924925while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)926dn = dn->parent;927928if (dn && PCI_DN(dn))929set_iommu_table_base(&dev->dev,930PCI_DN(dn)->table_group->tables[0]);931else932printk(KERN_WARNING "iommu: Device %s has no iommu table\n",933pci_name(dev));934}935936static int __read_mostly disable_ddw;937938static int __init disable_ddw_setup(char *str)939{940disable_ddw = 1;941printk(KERN_INFO "ppc iommu: disabling ddw.\n");942943return 0;944}945946early_param("disable_ddw", disable_ddw_setup);947948static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp)949{950int ret;951952ret = tce_clearrange_multi_pSeriesLP(0,9531ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);954if (ret)955pr_warn("%pOF failed to clear tces in window.\n",956np);957else958pr_debug("%pOF successfully cleared tces in window.\n",959np);960}961962/*963* Call only if DMA window is clean.964*/965static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn)966{967int ret;968969ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);970if (ret)971pr_warn("%pOF: failed to remove DMA window: rtas returned "972"%d to ibm,remove-pe-dma-window(%x) %llx\n",973np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);974else975pr_debug("%pOF: successfully removed DMA window: rtas returned "976"%d to ibm,remove-pe-dma-window(%x) %llx\n",977np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);978}979980static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,981struct property *win, bool cleanup)982{983struct dynamic_dma_window_prop *dwp;984u64 liobn;985986dwp = win->value;987liobn = (u64)be32_to_cpu(dwp->liobn);988989if (cleanup)990clean_dma_window(np, dwp);991__remove_dma_window(np, ddw_avail, liobn);992}993994static void copy_property(struct device_node *pdn, const char *from, const char *to)995{996struct property *src, *dst;997998src = of_find_property(pdn, from, NULL);999if (!src)1000return;10011002dst = kzalloc(sizeof(*dst), GFP_KERNEL);1003if (!dst)1004return;10051006dst->name = kstrdup(to, GFP_KERNEL);1007dst->value = kmemdup(src->value, src->length, GFP_KERNEL);1008dst->length = src->length;1009if (!dst->name || !dst->value)1010return;10111012if (of_add_property(pdn, dst)) {1013pr_err("Unable to add DMA window property for %pOF", pdn);1014goto free_prop;1015}10161017return;10181019free_prop:1020kfree(dst->name);1021kfree(dst->value);1022kfree(dst);1023}10241025static int remove_dma_window_named(struct device_node *np, bool remove_prop, const char *win_name,1026bool cleanup)1027{1028struct property *win;1029u32 ddw_avail[DDW_APPLICABLE_SIZE];1030int ret = 0;10311032win = of_find_property(np, win_name, NULL);1033if (!win)1034return -EINVAL;10351036ret = of_property_read_u32_array(np, "ibm,ddw-applicable",1037&ddw_avail[0], DDW_APPLICABLE_SIZE);1038if (ret)1039return 0;10401041if (win->length >= sizeof(struct dynamic_dma_window_prop))1042remove_dma_window(np, ddw_avail, win, cleanup);10431044if (!remove_prop)1045return 0;10461047/* Default window property if removed is lost as reset-pe doesn't restore it.1048* Though FDT has a copy of it, the DLPAR hotplugged devices will not have a1049* node on FDT until next reboot. 
So, back it up.1050*/1051if ((strcmp(win_name, "ibm,dma-window") == 0) &&1052!of_find_property(np, "ibm,dma-window-saved", NULL))1053copy_property(np, win_name, "ibm,dma-window-saved");10541055ret = of_remove_property(np, win);1056if (ret)1057pr_warn("%pOF: failed to remove DMA window property: %d\n",1058np, ret);1059return 0;1060}10611062static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift,1063bool *direct_mapping)1064{1065struct dma_win *window;1066const struct dynamic_dma_window_prop *dma64;1067bool found = false;10681069spin_lock(&dma_win_list_lock);1070/* check if we already created a window and dupe that config if so */1071list_for_each_entry(window, &dma_win_list, list) {1072if (window->device == pdn) {1073dma64 = window->prop;1074*dma_addr = be64_to_cpu(dma64->dma_base);1075*window_shift = be32_to_cpu(dma64->window_shift);1076*direct_mapping = window->direct;1077found = true;1078break;1079}1080}1081spin_unlock(&dma_win_list_lock);10821083return found;1084}10851086static struct dma_win *ddw_list_new_entry(struct device_node *pdn,1087const struct dynamic_dma_window_prop *dma64)1088{1089struct dma_win *window;10901091window = kzalloc(sizeof(*window), GFP_KERNEL);1092if (!window)1093return NULL;10941095window->device = pdn;1096window->prop = dma64;1097window->direct = false;10981099return window;1100}11011102static void find_existing_ddw_windows_named(const char *name)1103{1104int len;1105struct device_node *pdn;1106struct dma_win *window;1107const struct dynamic_dma_window_prop *dma64;11081109for_each_node_with_property(pdn, name) {1110dma64 = of_get_property(pdn, name, &len);1111if (!dma64 || len < sizeof(*dma64)) {1112remove_dma_window_named(pdn, true, name, true);1113continue;1114}11151116/* If at the time of system initialization, there are DDWs in OF,1117* it means this is during kexec. 
DDW could be direct or dynamic.1118* We will just mark DDWs as "dynamic" since this is kdump path,1119* no need to worry about perforance. ddw_list_new_entry() will1120* set window->direct = false.1121*/1122window = ddw_list_new_entry(pdn, dma64);1123if (!window) {1124of_node_put(pdn);1125break;1126}11271128spin_lock(&dma_win_list_lock);1129list_add(&window->list, &dma_win_list);1130spin_unlock(&dma_win_list_lock);1131}1132}11331134static int find_existing_ddw_windows(void)1135{1136if (!firmware_has_feature(FW_FEATURE_LPAR))1137return 0;11381139find_existing_ddw_windows_named(DIRECT64_PROPNAME);1140find_existing_ddw_windows_named(DMA64_PROPNAME);11411142return 0;1143}1144machine_arch_initcall(pseries, find_existing_ddw_windows);11451146/**1147* ddw_read_ext - Get the value of an DDW extension1148* @np: device node from which the extension value is to be read.1149* @extnum: index number of the extension.1150* @value: pointer to return value, modified when extension is available.1151*1152* Checks if "ibm,ddw-extensions" exists for this node, and get the value1153* on index 'extnum'.1154* It can be used only to check if a property exists, passing value == NULL.1155*1156* Returns:1157* 0 if extension successfully read1158* -EINVAL if the "ibm,ddw-extensions" does not exist,1159* -ENODATA if "ibm,ddw-extensions" does not have a value, and1160* -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.1161*/1162static inline int ddw_read_ext(const struct device_node *np, int extnum,1163u32 *value)1164{1165static const char propname[] = "ibm,ddw-extensions";1166u32 count;1167int ret;11681169ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);1170if (ret)1171return ret;11721173if (count < extnum)1174return -EOVERFLOW;11751176if (!value)1177value = &count;11781179return of_property_read_u32_index(np, propname, extnum, value);1180}11811182static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,1183struct ddw_query_response *query,1184struct 
device_node *parent)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr, ext_query, query_out[5];
	u64 buid;
	int ret, out_sz;

	/*
	 * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many
	 * output parameters ibm,query-pe-dma-windows will have, ranging from
	 * 5 to 6.
	 */
	ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
	if (!ret && ext_query == 1)
		out_sz = 6;
	else
		out_sz = 5;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
			cfg_addr, BUID_HI(buid), BUID_LO(buid));

	/*
	 * The 6-output form splits largest_available_block across two cells
	 * (hi/lo 32 bits); the legacy 5-output form packs it into one.
	 */
	switch (out_sz) {
	case 5:
		query->windows_available = query_out[0];
		query->largest_available_block = query_out[1];
		query->page_size = query_out[2];
		query->migration_capable = query_out[3];
		break;
	case 6:
		query->windows_available = query_out[0];
		query->largest_available_block = ((u64)query_out[1] << 32) |
			query_out[2];
		query->page_size = query_out[3];
		query->migration_capable = query_out[4];
		break;
	}

	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d, lb=%llx ps=%x wn=%d\n",
		 ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), ret, query->largest_available_block,
		 query->page_size, query->windows_available);

	return ret;
}

/*
 * Call ibm,create-pe-dma-window for the PE that owns @dev, retrying while
 * RTAS reports busy. On success @create carries the new window's LIOBN and
 * starting DMA address (hi/lo).
 */
static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
		      struct ddw_create_response *create, int page_shift,
		      int window_shift)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr;
	u64 buid;
	int ret;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	do {
		/* extra outputs are LIOBN and dma-addr (hi, lo) */
		ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
				(u32 *)create, cfg_addr, BUID_HI(buid),
				BUID_LO(buid), page_shift, window_shift);
	} while (rtas_busy_delay(ret));
	dev_info(&dev->dev,
		 "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
		 "(liobn = 0x%x starting addr = %x %x)\n",
		 ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
		 create->addr_hi, create->addr_lo);

	return ret;
}

/*
 * Record of a PE device node for which DDW setup failed, so later functions
 * of the same device skip the attempt (see enable_ddw()).
 */
struct failed_ddw_pdn {
	struct device_node *pdn;
	struct list_head list;
};

static LIST_HEAD(failed_ddw_pdn_list);

/* Upper bound of addressable RAM, accounting for memory hotplug if enabled. */
static phys_addr_t ddw_memory_hotplug_max(void)
{
	resource_size_t max_addr;

#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
	max_addr = hot_add_drconf_memory_max();
#else
	max_addr = memblock_end_of_DRAM();
#endif

	return max_addr;
}

/*
 * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
 * ibm,ddw-extensions, which carries the rtas token for
 * ibm,reset-pe-dma-windows.
 * That rtas-call can be used to restore the default DMA window for the device.
 */
static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
{
	int ret;
	u32 cfg_addr, reset_dma_win;
	u64 buid;
	struct device_node *dn;
	struct pci_dn *pdn;

	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
	if (ret)
		return;

	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);

	ret = rtas_call(reset_dma_win, 3, 1, NULL, 
cfg_addr, BUID_HI(buid),
			BUID_LO(buid));
	if (ret)
		dev_info(&dev->dev,
			 "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
			 ret);
}

/*
 * Platforms support placing PHB in limited address mode starting with LoPAR
 * level 2.13 implement. In this mode, the DMA address returned by DDW is over
 * 4GB but, less than 64-bits. This benefits IO adapters that don't support
 * 64-bits for DMA addresses.
 */
static int limited_dma_window(struct pci_dev *dev, struct device_node *par_dn)
{
	int ret;
	u32 cfg_addr, reset_dma_win, las_supported;
	u64 buid;
	struct device_node *dn;
	struct pci_dn *pdn;

	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
	if (ret)
		goto out;

	ret = ddw_read_ext(par_dn, DDW_EXT_LIMITED_ADDR_MODE, &las_supported);

	/* Limited Address Space extension available on the platform but DDW in
	 * limited addressing mode not supported
	 */
	if (!ret && !las_supported)
		ret = -EPROTO;

	if (ret) {
		dev_info(&dev->dev, "Limited Address Space for DDW not Supported, err: %d", ret);
		goto out;
	}

	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);

	/* 4th input (value 1) selects limited-address reset per the extension */
	ret = rtas_call(reset_dma_win, 4, 1, NULL, cfg_addr, BUID_HI(buid),
			BUID_LO(buid), 1);
	if (ret)
		dev_info(&dev->dev,
			 "ibm,reset-pe-dma-windows(%x) for Limited Addr Support: %x %x %x returned %d ",
			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
			 ret);

out:
	return ret;
}

/* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */
static int iommu_get_page_shift(u32 query_page_size)
{
	/* Supported IO page-sizes according to LoPAR, note that 2M is out of order */
	const int shift[] = {
		__builtin_ctzll(SZ_4K),   __builtin_ctzll(SZ_64K), __builtin_ctzll(SZ_16M),
		__builtin_ctzll(SZ_32M),  __builtin_ctzll(SZ_64M), __builtin_ctzll(SZ_128M),
		__builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G), __builtin_ctzll(SZ_2M)
	};

	int i = ARRAY_SIZE(shift) - 1;
	int ret = 0;

	/*
	 * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a bit field:
	 * - bit 31 means 4k pages are supported,
	 * - bit 30 means 64k pages are supported, and so on.
	 * Larger pagesizes map more memory with the same amount of TCEs, so start probing them.
	 */
	for (; i >= 0 ; i--) {
		if (query_page_size & (1 << i))
			ret = max(ret, shift[i]);
	}

	return ret;
}

/*
 * Allocate an OF property describing a dynamic DMA window (liobn, base,
 * tce_shift, window_shift), ready to be attached to the PE node with
 * of_add_property(). Returns NULL on allocation failure; caller owns the
 * property and its name/value buffers.
 */
static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,
					    u32 page_shift, u32 window_shift)
{
	struct dynamic_dma_window_prop *ddwprop;
	struct property *win64;

	win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
	if (!win64)
		return NULL;

	win64->name = kstrdup(propname, GFP_KERNEL);
	ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
	win64->value = ddwprop;
	win64->length = sizeof(*ddwprop);
	if (!win64->name || !win64->value) {
		kfree(win64->name);
		kfree(win64->value);
		kfree(win64);
		return NULL;
	}

	/* Device-tree values are stored big-endian */
	ddwprop->liobn = cpu_to_be32(liobn);
	ddwprop->dma_base = cpu_to_be64(dma_addr);
	ddwprop->tce_shift = cpu_to_be32(page_shift);
	ddwprop->window_shift = cpu_to_be32(window_shift);

	return win64;
}

/*
 * If the PE supports dynamic dma windows, and there is space for a table
 * that can map all pages in a linear offset, then setup such a table,
 * and record the dma-offset in the struct device.
 *
 * dev: the pci device we are checking
 * pdn: the parent pe node with the ibm,dma_window 
property
 * Future: also check if we can remap the base window for our base page size
 *
 * returns true if can map all pages (direct mapping), false otherwise..
 */
static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn, u64 dma_mask)
{
	int len = 0, ret;
	int max_ram_len = order_base_2(ddw_memory_hotplug_max());
	struct ddw_query_response query;
	struct ddw_create_response create;
	int page_shift;
	u64 win_addr, dynamic_offset = 0;
	const char *win_name;
	struct device_node *dn;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	struct dma_win *window;
	struct property *win64;
	struct failed_ddw_pdn *fpdn;
	bool default_win_removed = false, direct_mapping = false;
	bool dynamic_mapping = false;
	bool pmem_present;
	struct pci_dn *pci = PCI_DN(pdn);
	struct property *default_win = NULL;
	bool limited_addr_req = false, limited_addr_enabled = false;
	int dev_max_ddw;
	int ddw_sz;

	dn = of_find_node_by_type(NULL, "ibm,pmemory");
	pmem_present = dn != NULL;
	of_node_put(dn);

	mutex_lock(&dma_win_init_mutex);

	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len, &direct_mapping))
		goto out_unlock;

	/*
	 * If we already went through this for a previous function of
	 * the same device and failed, we don't want to muck with the
	 * DMA window again, as it will race with in-flight operations
	 * and can lead to EEHs. The above mutex protects access to the
	 * list.
	 */
	list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
		if (fpdn->pdn == pdn)
			goto out_unlock;
	}

	/*
	 * the ibm,ddw-applicable property holds the tokens for:
	 * ibm,query-pe-dma-window
	 * ibm,create-pe-dma-window
	 * for the given node in that order.
	 * the property is actually in the parent, not the PE
	 */
	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret)
		goto out_failed;

	/*
	 * Query if there is a second window of size to map the
	 * whole partition. Query returns number of windows, largest
	 * block assigned to PE (partition endpoint), and two bitmasks
	 * of page sizes: supported and supported for migrate-dma.
	 */
	dn = pci_device_to_OF_node(dev);
	ret = query_ddw(dev, ddw_avail, &query, pdn);
	if (ret != 0)
		goto out_failed;

	/* DMA Limited Addressing required? This is when the driver has
	 * requested to create DDW but supports mask which is less than 64-bits
	 */
	limited_addr_req = (dma_mask != DMA_BIT_MASK(64));

	/* place the PHB in Limited Addressing mode */
	if (limited_addr_req) {
		if (limited_dma_window(dev, pdn))
			goto out_failed;

		/* PHB is in Limited address mode */
		limited_addr_enabled = true;
	}

	/*
	 * If there is no window available, remove the default DMA window,
	 * if it's present. This will make all the resources available to the
	 * new DDW window.
	 * If anything fails after this, we need to restore it, so also check
	 * for extensions presence.
	 */
	if (query.windows_available == 0) {
		int reset_win_ext;

		/* DDW + IOMMU on single window may fail if there is any allocation */
		if (iommu_table_in_use(pci->table_group->tables[0])) {
			dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
			goto out_failed;
		}

		default_win = of_find_property(pdn, "ibm,dma-window", NULL);
		if (!default_win)
			goto out_failed;

		/* Only proceed if the window can be restored later on failure */
		reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
		if (reset_win_ext)
			goto out_failed;

		remove_dma_window(pdn, ddw_avail, default_win, true);
		default_win_removed = true;

		/* Query again, to check if the window is available */
		ret = query_ddw(dev, ddw_avail, &query, pdn);
		if (ret != 0)
			goto out_failed;

		if (query.windows_available == 0) {
			/* no windows are available for this device. */
			dev_dbg(&dev->dev, "no free dynamic windows");
			goto out_failed;
		}
	}

	page_shift = iommu_get_page_shift(query.page_size);
	if (!page_shift) {
		dev_dbg(&dev->dev, "no supported page size in mask %x",
			query.page_size);
		goto out_failed;
	}

	/* Maximum DMA window size that the device can address (in log2) */
	dev_max_ddw = fls64(dma_mask);

	/* If the device DMA mask is less than 64-bits, make sure the DMA window
	 * size is not bigger than what the device can access
	 */
	ddw_sz = min(order_base_2(query.largest_available_block << page_shift),
			dev_max_ddw);

	/*
	 * The "ibm,pmemory" can appear anywhere in the address space.
	 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
	 * for the upper limit and fallback to max RAM otherwise but this
	 * disables device::dma_ops_bypass.
	 */
	len = max_ram_len;
	if (pmem_present) {
		if (ddw_sz >= MAX_PHYSMEM_BITS)
			len = MAX_PHYSMEM_BITS;
		else
			dev_info(&dev->dev, "Skipping ibm,pmemory");
	}

	/* check if the available block * number of ptes will map everything */
	if (ddw_sz < len) {
		dev_dbg(&dev->dev,
			"can't map partition max 0x%llx with %llu %llu-sized pages\n",
			1ULL << len,
			query.largest_available_block,
			1ULL << page_shift);

		len = ddw_sz;
		dynamic_mapping = true;
	} else {
		direct_mapping = !default_win_removed ||
			(len == MAX_PHYSMEM_BITS) ||
			(!pmem_present && (len == max_ram_len));

		/* DDW is big enough to direct map RAM. If there is vPMEM, check
		 * if enough space is left in DDW where we can dynamically
		 * allocate TCEs for vPMEM. For now, this Hybrid sharing of DDW
		 * is only for SR-IOV devices.
		 */
		if (default_win_removed && pmem_present && !direct_mapping) {
			/* DDW is big enough to be split */
			if ((1ULL << ddw_sz) >=
			    MIN_DDW_VPMEM_DMA_WINDOW + (1ULL << max_ram_len)) {

				direct_mapping = true;

				/* offset of the Dynamic part of DDW */
				dynamic_offset = 1ULL << max_ram_len;
			}

			/* DDW will at least have dynamic allocation */
			dynamic_mapping = true;

			/* create max size DDW possible */
			len = ddw_sz;
		}
	}

	/* Even if the DDW is split into both direct mapped RAM and dynamically
	 * mapped vPMEM, the DDW property in OF will be marked as Direct.
	 */
	win_name = direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME;

	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
	if (ret != 0)
		goto out_failed;

	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
		create.liobn, dn);

	win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
	win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);

	if (!win64) {
		dev_info(&dev->dev,
			 "couldn't allocate property, property name, or value\n");
		goto out_remove_win;
	}

	ret = of_add_property(pdn, win64);
	if (ret) {
		dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d",
			pdn, ret);
		goto out_free_prop;
	}

	window = ddw_list_new_entry(pdn, win64->value);
	if (!window)
		goto out_del_prop;

	window->direct = direct_mapping;

	if (direct_mapping) {
		/* DDW maps the whole partition, so enable direct DMA mapping */
		ret = walk_system_ram_range(0, ddw_memory_hotplug_max() >> PAGE_SHIFT,
					    win64->value, tce_setrange_multi_pSeriesLP_walk);
		if (ret) {
			dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
				 dn, ret);

			/* Make sure to clean DDW if any TCE was set*/
			clean_dma_window(pdn, win64->value);
			goto out_del_list;
		}
		if (default_win_removed) {
			/* Default window replaced by DDW: drop its table */
			iommu_tce_table_put(pci->table_group->tables[0]);
			pci->table_group->tables[0] = NULL;
			set_iommu_table_base(&dev->dev, NULL);
		}
	}

	if (dynamic_mapping) {
		struct iommu_table *newtbl;
		int i;
		unsigned long start = 0, end = 0;
		u64 dynamic_addr, dynamic_len;

		for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
			const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;

			/* Look for MMIO32 */
			if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
				start = pci->phb->mem_resources[i].start;
				end = pci->phb->mem_resources[i].end;
				break;
			}
		}

		/* New table for using DDW instead of the default DMA window */
		newtbl = iommu_pseries_alloc_table(pci->phb->node);
		if (!newtbl) {
			dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
			goto out_del_list;
		}

		/* If the DDW is split between directly mapped RAM and Dynamic
		 * mapped for TCES, offset into the DDW where the dynamic part
		 * begins.
		 */
		dynamic_addr = win_addr + dynamic_offset;
		dynamic_len = (1UL << len) - dynamic_offset;
		iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn,
					    dynamic_addr, dynamic_len, page_shift, NULL,
					    &iommu_table_lpar_multi_ops);
		iommu_init_table(newtbl, pci->phb->node,
				 start >> page_shift, end >> page_shift);

		/* Slot 0 if it replaces the default window, slot 1 otherwise */
		pci->table_group->tables[default_win_removed ? 0 : 1] = newtbl;

		set_iommu_table_base(&dev->dev, newtbl);
	}

	if (default_win_removed) {
		/* default_win is valid here because default_win_removed == true */
		if (!of_find_property(pdn, "ibm,dma-window-saved", NULL))
			copy_property(pdn, "ibm,dma-window", "ibm,dma-window-saved");
		of_remove_property(pdn, default_win);
		dev_info(&dev->dev, "Removed default DMA window for %pOF\n", pdn);
	}

	spin_lock(&dma_win_list_lock);
	list_add(&window->list, &dma_win_list);
	spin_unlock(&dma_win_list_lock);

	dev->dev.archdata.dma_offset = win_addr;
	goto out_unlock;

out_del_list:
	kfree(window);

out_del_prop:
	of_remove_property(pdn, win64);

out_free_prop:
	kfree(win64->name);
	kfree(win64->value);
	kfree(win64);

out_remove_win:
	/* DDW is clean, so it's ok to call this directly. */
	__remove_dma_window(pdn, ddw_avail, create.liobn);

out_failed:
	if (default_win_removed || limited_addr_enabled)
		reset_dma_window(dev, pdn);

	/* Remember the failure so sibling functions skip DDW setup */
	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
	if (!fpdn)
		goto out_unlock;
	fpdn->pdn = pdn;
	list_add(&fpdn->list, &failed_ddw_pdn_list);

out_unlock:
	mutex_unlock(&dma_win_init_mutex);

	/* For pre-mapped memory, set bus_dma_limit to the max RAM */
	if (direct_mapping)
		dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset +
			(1ULL << max_ram_len);

	dev_info(&dev->dev, "lsa_required: %x, lsa_enabled: %x, direct mapping: %x\n",
		 limited_addr_req, limited_addr_enabled, direct_mapping);

	return direct_mapping;
}

/*
 * Translate the "IO Page Sizes" bit field from ibm,query-pe-dma-window into
 * a mask of page-size bytes OR-ed together (same bit order as
 * iommu_get_page_shift() above).
 */
static __u64 query_page_size_to_mask(u32 query_page_size)
{
	const long shift[] = {
		(SZ_4K),   (SZ_64K),  (SZ_16M),
		(SZ_32M),  (SZ_64M),  (SZ_128M),
		(SZ_256M), (SZ_16G),  (SZ_2M)
	};
	int i, ret = 0;

	for (i = 0; i < ARRAY_SIZE(shift); i++) {
		if (query_page_size & (1 << i))
			ret |= shift[i];
	}

	return ret;
}

static void spapr_tce_init_table_group(struct pci_dev 
*pdev,
				       struct device_node *pdn,
				       struct dynamic_dma_window_prop prop)
{
	struct iommu_table_group *table_group = PCI_DN(pdn)->table_group;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];

	struct ddw_query_response query;
	int ret;

	/* Only for normal boot with default window. Doesn't matter during
	 * kdump, since these will not be used during kdump.
	 */
	if (is_kdump_kernel())
		return;

	if (table_group->max_dynamic_windows_supported != 0)
		return; /* already initialized */

	table_group->tce32_start = be64_to_cpu(prop.dma_base);
	table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift);

	if (!of_find_property(pdn, "ibm,dma-window", NULL))
		dev_err(&pdev->dev, "default dma window missing!\n");

	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret) {
		/* -1 marks the group as "DDW not applicable" */
		table_group->max_dynamic_windows_supported = -1;
		return;
	}

	ret = query_ddw(pdev, ddw_avail, &query, pdn);
	if (ret) {
		dev_err(&pdev->dev, "%s: query_ddw failed\n", __func__);
		table_group->max_dynamic_windows_supported = -1;
		return;
	}

	if (query.windows_available == 0)
		table_group->max_dynamic_windows_supported = 1;
	else
		table_group->max_dynamic_windows_supported = IOMMU_TABLE_GROUP_MAX_TABLES;

	table_group->max_levels = 1;
	table_group->pgsizes |= query_page_size_to_mask(query.page_size);
}

/* Set up the per-device IOMMU table/group for an LPAR PCI device. */
static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
{
	struct device_node *pdn, *dn;
	struct iommu_table *tbl;
	struct pci_dn *pci;
	struct dynamic_dma_window_prop prop;

	pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));

	/* dev setup for LPAR is a little tricky, since the device tree might
	 * contain the dma-window properties per-device and not necessarily
	 * for the bus. So we need to search upwards in the tree until we
	 * either hit a dma-window property, OR find a parent with a table
	 * already allocated.
	 */
	dn = pci_device_to_OF_node(dev);
	pr_debug(" node is %pOF\n", dn);

	pdn = pci_dma_find(dn, &prop);
	if (!pdn || !PCI_DN(pdn)) {
		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
		       "no DMA window found for pci dev=%s dn=%pOF\n",
		       pci_name(dev), dn);
		return;
	}
	pr_debug(" parent is %pOF\n", pdn);

	pci = PCI_DN(pdn);
	if (!pci->table_group) {
		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
		tbl = pci->table_group->tables[0];

		iommu_table_setparms_common(tbl, pci->phb->bus->number,
					    be32_to_cpu(prop.liobn),
					    be64_to_cpu(prop.dma_base),
					    1ULL << be32_to_cpu(prop.window_shift),
					    be32_to_cpu(prop.tce_shift), NULL,
					    &iommu_table_lpar_multi_ops);

		iommu_init_table(tbl, pci->phb->node, 0, 0);
		iommu_register_group(pci->table_group,
				     pci_domain_nr(pci->phb->bus), 0);
		pr_debug(" created table: %p\n", pci->table_group);
	} else {
		pr_debug(" found DMA window, table: %p\n", pci->table_group);
	}

	spapr_tce_init_table_group(dev, pdn, prop);

	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
	iommu_add_device(pci->table_group, &dev->dev);
}

/* Can this device bypass the 32-bit IOMMU window via DDW? */
static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
{
	struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;

	/* For DDW, DMA mask should be more than 32-bits. For mask more then
	 * 32-bits but less then 64-bits, DMA addressing is supported in
	 * Limited Addressing mode.
	 */
	if (dma_mask <= DMA_BIT_MASK(32))
		return false;

	dev_dbg(&pdev->dev, "node is %pOF\n", dn);

	/*
	 * the device tree might contain the dma-window properties
	 * per-device and not necessarily for the bus. So we need to
	 * search upwards in the tree until we either hit a dma-window
	 * property, OR find a parent with a table already allocated.
	 */
	pdn = pci_dma_find(dn, NULL);
	if (pdn && PCI_DN(pdn))
		return enable_ddw(pdev, pdn, dma_mask);

	return false;
}

#ifdef CONFIG_IOMMU_API
/*
 * A simple iommu_table_group_ops which only allows reusing the existing
 * iommu_table. This handles VFIO for POWER7 or the nested KVM.
 * The ops does not allow creating windows and only allows reusing the existing
 * one if it matches table_group->tce32_start/tce32_size/page_shift.
 */
static unsigned long spapr_tce_get_table_size(__u32 page_shift,
					      __u64 window_size, __u32 levels)
{
	unsigned long size;

	/* Only single-level tables are supported here */
	if (levels > 1)
		return ~0U;
	/* One 8-byte TCE per IOMMU page in the window */
	size = window_size >> (page_shift - 3);
	return size;
}

static struct pci_dev *iommu_group_get_first_pci_dev(struct iommu_group *group)
{
	struct pci_dev *pdev = NULL;
	int ret;

	/* No IOMMU group ? */
	if (!group)
		return NULL;

	ret = iommu_group_for_each_dev(group, &pdev, dev_has_iommu_table);
	if (!ret || !pdev)
		return NULL;
	return pdev;
}

/* Recreate the default DMA window via reset-pe and restore its saved OF property. */
static void restore_default_dma_window(struct pci_dev *pdev, struct device_node *pdn)
{
	reset_dma_window(pdev, pdn);
	copy_property(pdn, "ibm,dma-window-saved", "ibm,dma-window");
}

/*
 * Tear down any existing DDW for the PE: remove its OF property, drop the
 * associated iommu_table reference and unlink its dma_win list entry.
 * Always returns 0 (also when no DDW was found).
 */
static long remove_dynamic_dma_windows(struct pci_dev *pdev, struct device_node *pdn)
{
	struct pci_dn *pci = PCI_DN(pdn);
	struct dma_win *window;
	bool direct_mapping;
	int len;

	if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len, &direct_mapping)) {
		remove_dma_window_named(pdn, true, direct_mapping ?
					DIRECT64_PROPNAME : DMA64_PROPNAME, true);
		if (!direct_mapping) {
			WARN_ON(!pci->table_group->tables[0] && !pci->table_group->tables[1]);

			if (pci->table_group->tables[1]) {
				iommu_tce_table_put(pci->table_group->tables[1]);
				pci->table_group->tables[1] = NULL;
			} else if (pci->table_group->tables[0]) {
				/* Default window was removed and only the DDW exists */
				iommu_tce_table_put(pci->table_group->tables[0]);
				pci->table_group->tables[0] = NULL;
			}
		}
		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			if (window->device == pdn) {
				list_del(&window->list);
				kfree(window);
				break;
			}
		}
		spin_unlock(&dma_win_list_lock);
	}

	return 0;
}

/* Rebuild the default 32-bit DMA window table for the device (slot 0). */
static long pseries_setup_default_iommu_config(struct iommu_table_group *table_group,
					       struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	const __be32 *default_prop;
	long liobn, offset, size;
	struct device_node *pdn;
	struct iommu_table *tbl;
	struct pci_dn *pci;

	pdn = pci_dma_find_parent_node(pdev, table_group);
	if (!pdn || !PCI_DN(pdn)) {
		dev_warn(&pdev->dev, "No table_group configured for the node %pOF\n", pdn);
		return -1;
	}
	pci = PCI_DN(pdn);

	/* The default window is 
restored if not present already on removal of DDW.2019* However, if used by VFIO SPAPR sub driver, the user's order of removal of2020* windows might have been different to not leading to auto restoration,2021* suppose the DDW was removed first followed by the default one.2022* So, restore the default window with reset-pe-dma call explicitly.2023*/2024restore_default_dma_window(pdev, pdn);20252026default_prop = of_get_property(pdn, "ibm,dma-window", NULL);2027of_parse_dma_window(pdn, default_prop, &liobn, &offset, &size);2028tbl = iommu_pseries_alloc_table(pci->phb->node);2029if (!tbl) {2030dev_err(&pdev->dev, "couldn't create new IOMMU table\n");2031return -1;2032}20332034iommu_table_setparms_common(tbl, pci->phb->bus->number, liobn, offset,2035size, IOMMU_PAGE_SHIFT_4K, NULL,2036&iommu_table_lpar_multi_ops);2037iommu_init_table(tbl, pci->phb->node, 0, 0);20382039pci->table_group->tables[0] = tbl;2040set_iommu_table_base(&pdev->dev, tbl);20412042return 0;2043}20442045static bool is_default_window_request(struct iommu_table_group *table_group, __u32 page_shift,2046__u64 window_size)2047{2048if ((window_size <= table_group->tce32_size) &&2049(page_shift == IOMMU_PAGE_SHIFT_4K))2050return true;20512052return false;2053}20542055static long spapr_tce_create_table(struct iommu_table_group *table_group, int num,2056__u32 page_shift, __u64 window_size, __u32 levels,2057struct iommu_table **ptbl)2058{2059struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group);2060u32 ddw_avail[DDW_APPLICABLE_SIZE];2061struct ddw_create_response create;2062unsigned long liobn, offset, size;2063unsigned long start = 0, end = 0;2064struct ddw_query_response query;2065const __be32 *default_prop;2066struct failed_ddw_pdn *fpdn;2067unsigned int window_shift;2068struct device_node *pdn;2069struct iommu_table *tbl;2070struct dma_win *window;2071struct property *win64;2072struct pci_dn *pci;2073u64 win_addr;2074int len, i;2075long ret;20762077if (!is_power_of_2(window_size) || 
levels > 1)2078return -EINVAL;20792080window_shift = order_base_2(window_size);20812082mutex_lock(&dma_win_init_mutex);20832084ret = -ENODEV;20852086pdn = pci_dma_find_parent_node(pdev, table_group);2087if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */2088dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn);2089goto out_failed;2090}2091pci = PCI_DN(pdn);20922093/* If the enable DDW failed for the pdn, dont retry! */2094list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {2095if (fpdn->pdn == pdn) {2096dev_info(&pdev->dev, "%pOF in failed DDW device list\n", pdn);2097goto out_unlock;2098}2099}21002101tbl = iommu_pseries_alloc_table(pci->phb->node);2102if (!tbl) {2103dev_dbg(&pdev->dev, "couldn't create new IOMMU table\n");2104goto out_unlock;2105}21062107if (num == 0) {2108bool direct_mapping;2109/* The request is not for default window? Ensure there is no DDW window already */2110if (!is_default_window_request(table_group, page_shift, window_size)) {2111if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len,2112&direct_mapping)) {2113dev_warn(&pdev->dev, "%pOF: 64-bit window already present.", pdn);2114ret = -EPERM;2115goto out_unlock;2116}2117} else {2118/* Request is for Default window, ensure there is no DDW if there is a2119* need to reset. reset-pe otherwise removes the DDW also2120*/2121default_prop = of_get_property(pdn, "ibm,dma-window", NULL);2122if (!default_prop) {2123if (find_existing_ddw(pdn, &pdev->dev.archdata.dma_offset, &len,2124&direct_mapping)) {2125dev_warn(&pdev->dev, "%pOF: Attempt to create window#0 when 64-bit window is present. 
Preventing the attempt as that would destroy the 64-bit window",2126pdn);2127ret = -EPERM;2128goto out_unlock;2129}21302131restore_default_dma_window(pdev, pdn);21322133default_prop = of_get_property(pdn, "ibm,dma-window", NULL);2134of_parse_dma_window(pdn, default_prop, &liobn, &offset, &size);2135/* Limit the default window size to window_size */2136iommu_table_setparms_common(tbl, pci->phb->bus->number, liobn,2137offset, 1UL << window_shift,2138IOMMU_PAGE_SHIFT_4K, NULL,2139&iommu_table_lpar_multi_ops);2140iommu_init_table(tbl, pci->phb->node,2141start >> IOMMU_PAGE_SHIFT_4K,2142end >> IOMMU_PAGE_SHIFT_4K);21432144table_group->tables[0] = tbl;21452146mutex_unlock(&dma_win_init_mutex);21472148goto exit;2149}2150}2151}21522153ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",2154&ddw_avail[0], DDW_APPLICABLE_SIZE);2155if (ret) {2156dev_info(&pdev->dev, "ibm,ddw-applicable not found\n");2157goto out_failed;2158}2159ret = -ENODEV;21602161pr_err("%s: Calling query %pOF\n", __func__, pdn);2162ret = query_ddw(pdev, ddw_avail, &query, pdn);2163if (ret)2164goto out_failed;2165ret = -ENODEV;21662167len = window_shift;2168if (query.largest_available_block < (1ULL << (len - page_shift))) {2169dev_dbg(&pdev->dev, "can't map window 0x%llx with %llu %llu-sized pages\n",21701ULL << len, query.largest_available_block,21711ULL << page_shift);2172ret = -EINVAL; /* Retry with smaller window size */2173goto out_unlock;2174}21752176if (create_ddw(pdev, ddw_avail, &create, page_shift, len)) {2177pr_err("%s: Create ddw failed %pOF\n", __func__, pdn);2178goto out_failed;2179}21802181win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;2182win64 = ddw_property_create(DMA64_PROPNAME, create.liobn, win_addr, page_shift, len);2183if (!win64)2184goto remove_window;21852186ret = of_add_property(pdn, win64);2187if (ret) {2188dev_err(&pdev->dev, "unable to add DMA window property for %pOF: %ld", pdn, ret);2189goto free_property;2190}2191ret = -ENODEV;21922193window = 
ddw_list_new_entry(pdn, win64->value);2194if (!window)2195goto remove_property;21962197window->direct = false;21982199for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {2200const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;22012202/* Look for MMIO32 */2203if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {2204start = pci->phb->mem_resources[i].start;2205end = pci->phb->mem_resources[i].end;2206break;2207}2208}22092210/* New table for using DDW instead of the default DMA window */2211iommu_table_setparms_common(tbl, pci->phb->bus->number, create.liobn, win_addr,22121UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);2213iommu_init_table(tbl, pci->phb->node, start >> page_shift, end >> page_shift);22142215pci->table_group->tables[num] = tbl;2216set_iommu_table_base(&pdev->dev, tbl);2217pdev->dev.archdata.dma_offset = win_addr;22182219spin_lock(&dma_win_list_lock);2220list_add(&window->list, &dma_win_list);2221spin_unlock(&dma_win_list_lock);22222223mutex_unlock(&dma_win_init_mutex);22242225goto exit;22262227remove_property:2228of_remove_property(pdn, win64);2229free_property:2230kfree(win64->name);2231kfree(win64->value);2232kfree(win64);2233remove_window:2234__remove_dma_window(pdn, ddw_avail, create.liobn);22352236out_failed:2237fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);2238if (!fpdn)2239goto out_unlock;2240fpdn->pdn = pdn;2241list_add(&fpdn->list, &failed_ddw_pdn_list);22422243out_unlock:2244mutex_unlock(&dma_win_init_mutex);22452246return ret;2247exit:2248/* Allocate the userspace view */2249pseries_tce_iommu_userspace_view_alloc(tbl);2250tbl->it_allocated_size = spapr_tce_get_table_size(page_shift, window_size, levels);22512252*ptbl = iommu_tce_table_get(tbl);22532254return 0;2255}22562257static bool is_default_window_table(struct iommu_table_group *table_group, struct iommu_table *tbl)2258{2259if (((tbl->it_size << tbl->it_page_shift) <= table_group->tce32_size) &&2260(tbl->it_page_shift == 
IOMMU_PAGE_SHIFT_4K))
		return true;

	return false;
}

/* Only accept a window that is already installed at slot @num. */
static long spapr_tce_set_window(struct iommu_table_group *table_group,
				 int num, struct iommu_table *tbl)
{
	return tbl == table_group->tables[num] ? 0 : -EPERM;
}

/*
 * iommu_table_group_ops::unset_window: remove the OF property backing window
 * @num (default window for slot 0 with default geometry, DDW otherwise) and
 * drop the table reference. Caller must have cleared the TCEs already.
 */
static long spapr_tce_unset_window(struct iommu_table_group *table_group, int num)
{
	struct pci_dev *pdev = iommu_group_get_first_pci_dev(table_group->group);
	struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;
	struct iommu_table *tbl = table_group->tables[num];
	struct failed_ddw_pdn *fpdn;
	struct dma_win *window;
	const char *win_name;
	int ret = -ENODEV;

	if (!tbl) /* The table was never created OR window was never opened */
		return 0;

	mutex_lock(&dma_win_init_mutex);

	if ((num == 0) && is_default_window_table(table_group, tbl))
		win_name = "ibm,dma-window";
	else
		win_name = DMA64_PROPNAME;

	pdn = pci_dma_find(dn, NULL);
	if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */
		dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn);
		goto out_failed;
	}

	/* Dont clear the TCEs, User should have done it */
	if (remove_dma_window_named(pdn, true, win_name, false)) {
		pr_err("%s: The existing DDW removal failed for node %pOF\n", __func__, pdn);
		goto out_failed; /* Could not remove it either! */
	}

	/* Only DDWs are tracked on the dma_win_list */
	if (strcmp(win_name, DMA64_PROPNAME) == 0) {
		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			if (window->device == pdn) {
				list_del(&window->list);
				kfree(window);
				break;
			}
		}
		spin_unlock(&dma_win_list_lock);
	}

	iommu_tce_table_put(table_group->tables[num]);
	table_group->tables[num] = NULL;

	ret = 0;

	goto out_unlock;

out_failed:
	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
	if (!fpdn)
		goto out_unlock;
	fpdn->pdn = pdn;
	list_add(&fpdn->list, &failed_ddw_pdn_list);

out_unlock:
	mutex_unlock(&dma_win_init_mutex);

	return ret;
}

/*
 * iommu_table_group_ops::take_ownership: detach the kernel's windows so the
 * user (VFIO) can create its own. Removes any DDW and the default window.
 */
static long spapr_tce_take_ownership(struct iommu_table_group *table_group, struct device *dev)
{
	struct iommu_table *tbl = table_group->tables[0];
	struct pci_dev *pdev = to_pci_dev(dev);
	struct device_node *dn = pci_device_to_OF_node(pdev);
	struct device_node *pdn;

	/* SRIOV VFs using direct map by the host driver OR multifunction devices
	 * where the ownership was taken on the attempt by the first function
	 */
	if (!tbl && (table_group->max_dynamic_windows_supported != 1))
		return 0;

	mutex_lock(&dma_win_init_mutex);

	pdn = pci_dma_find(dn, NULL);
	if (!pdn || !PCI_DN(pdn)) { /* Niether of 32s|64-bit exist! */
		dev_warn(&pdev->dev, "No dma-windows exist for the node %pOF\n", pdn);
		mutex_unlock(&dma_win_init_mutex);
		return -1;
	}

	/*
	 * Though rtas call reset-pe removes the DDW, it doesn't clear the entries on the table
	 * if there are any. In case of direct map, the entries will be left over, which
	 * is fine for PEs with 2 DMA windows where the second window is created with create-pe
	 * at which point the table is cleared. However, on VFs having only one DMA window, the
	 * default window would end up seeing the entries left over from the direct map done
	 * on the second window. So, remove the ddw explicitly so that clean_dma_window()
	 * cleans up the entries if any.
	 */
	if (remove_dynamic_dma_windows(pdev, pdn)) {
		dev_warn(&pdev->dev, "The existing DDW removal failed for node %pOF\n", pdn);
		mutex_unlock(&dma_win_init_mutex);
		return -1;
	}

	/* The table_group->tables[0] is not null now, it must be the default window
	 * Remove it, let the userspace create it as it needs.
	 */
	if (table_group->tables[0]) {
		remove_dma_window_named(pdn, true, "ibm,dma-window", true);
		iommu_tce_table_put(tbl);
		table_group->tables[0] = NULL;
	}
	set_iommu_table_base(dev, NULL);

	mutex_unlock(&dma_win_init_mutex);

	return 0;
}

/*
 * iommu_table_group_ops::release_ownership: hand the group back to the
 * kernel, restoring the default DMA window if the user removed it.
 */
static void spapr_tce_release_ownership(struct iommu_table_group *table_group, struct device *dev)
{
	struct iommu_table *tbl = table_group->tables[0];

	if (tbl) { /* Default window already restored */
		return;
	}

	mutex_lock(&dma_win_init_mutex);

	/* Restore the default window */
	pseries_setup_default_iommu_config(table_group, dev);

	mutex_unlock(&dma_win_init_mutex);

	return;
}

/* VFIO/KVM SPAPR TCE backend ops for pseries table groups. */
static struct iommu_table_group_ops spapr_tce_table_group_ops = {
	.get_table_size = spapr_tce_get_table_size,
	.create_table = spapr_tce_create_table,
	.set_window = spapr_tce_set_window,
	.unset_window = spapr_tce_unset_window,
	.take_ownership = spapr_tce_take_ownership,
	.release_ownership = spapr_tce_release_ownership,
};
#endif

static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
			      void *data)
{
	struct dma_win *window;
	struct memory_notify *arg = data;
	int ret = 0;

	/* This notifier can get called when onlining persistent memory as well.
	 * TCEs are not pre-mapped for persistent memory. 
Persistent memory will2425* always be above ddw_memory_hotplug_max()2426*/24272428switch (action) {2429case MEM_GOING_ONLINE:2430spin_lock(&dma_win_list_lock);2431list_for_each_entry(window, &dma_win_list, list) {2432if (window->direct && (arg->start_pfn << PAGE_SHIFT) <2433ddw_memory_hotplug_max()) {2434ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,2435arg->nr_pages, window->prop);2436}2437/* XXX log error */2438}2439spin_unlock(&dma_win_list_lock);2440break;2441case MEM_CANCEL_ONLINE:2442case MEM_OFFLINE:2443spin_lock(&dma_win_list_lock);2444list_for_each_entry(window, &dma_win_list, list) {2445if (window->direct && (arg->start_pfn << PAGE_SHIFT) <2446ddw_memory_hotplug_max()) {2447ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,2448arg->nr_pages, window->prop);2449}2450/* XXX log error */2451}2452spin_unlock(&dma_win_list_lock);2453break;2454default:2455break;2456}2457if (ret && action != MEM_CANCEL_ONLINE)2458return NOTIFY_BAD;24592460return NOTIFY_OK;2461}24622463static struct notifier_block iommu_mem_nb = {2464.notifier_call = iommu_mem_notifier,2465};24662467static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data)2468{2469int err = NOTIFY_OK;2470struct of_reconfig_data *rd = data;2471struct device_node *np = rd->dn;2472struct pci_dn *pci = PCI_DN(np);2473struct dma_win *window;24742475switch (action) {2476case OF_RECONFIG_DETACH_NODE:2477/*2478* Removing the property will invoke the reconfig2479* notifier again, which causes dead-lock on the2480* read-write semaphore of the notifier chain. 
So2481* we have to remove the property when releasing2482* the device node.2483*/2484if (remove_dma_window_named(np, false, DIRECT64_PROPNAME, true))2485remove_dma_window_named(np, false, DMA64_PROPNAME, true);24862487if (pci && pci->table_group)2488iommu_pseries_free_group(pci->table_group,2489np->full_name);24902491spin_lock(&dma_win_list_lock);2492list_for_each_entry(window, &dma_win_list, list) {2493if (window->device == np) {2494list_del(&window->list);2495kfree(window);2496break;2497}2498}2499spin_unlock(&dma_win_list_lock);2500break;2501default:2502err = NOTIFY_DONE;2503break;2504}2505return err;2506}25072508static struct notifier_block iommu_reconfig_nb = {2509.notifier_call = iommu_reconfig_notifier,2510};25112512/* These are called very early. */2513void __init iommu_init_early_pSeries(void)2514{2515if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))2516return;25172518if (firmware_has_feature(FW_FEATURE_LPAR)) {2519pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;2520pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;2521if (!disable_ddw)2522pseries_pci_controller_ops.iommu_bypass_supported =2523iommu_bypass_supported_pSeriesLP;2524} else {2525pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;2526pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;2527}252825292530of_reconfig_notifier_register(&iommu_reconfig_nb);2531register_memory_notifier(&iommu_mem_nb);25322533set_pci_dma_ops(&dma_iommu_ops);2534}25352536static int __init disable_multitce(char *str)2537{2538if (strcmp(str, "off") == 0 &&2539firmware_has_feature(FW_FEATURE_LPAR) &&2540(firmware_has_feature(FW_FEATURE_PUT_TCE_IND) ||2541firmware_has_feature(FW_FEATURE_STUFF_TCE))) {2542printk(KERN_INFO "Disabling MULTITCE firmware feature\n");2543powerpc_firmware_features &=2544~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE);2545}2546return 1;2547}25482549__setup("multitce=", disable_multitce);25502551#ifdef 
CONFIG_SPAPR_TCE_IOMMU2552struct iommu_group *pSeries_pci_device_group(struct pci_controller *hose,2553struct pci_dev *pdev)2554{2555struct device_node *pdn, *dn = pdev->dev.of_node;2556struct iommu_group *grp;2557struct pci_dn *pci;25582559pdn = pci_dma_find(dn, NULL);2560if (!pdn || !PCI_DN(pdn))2561return ERR_PTR(-ENODEV);25622563pci = PCI_DN(pdn);2564if (!pci->table_group)2565return ERR_PTR(-ENODEV);25662567grp = pci->table_group->group;2568if (!grp)2569return ERR_PTR(-ENODEV);25702571return iommu_group_ref_get(grp);2572}2573#endif257425752576