Path: blob/master/arch/powerpc/platforms/pseries/iommu.c
10818 views
/*1* Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation2*3* Rewrite, cleanup:4*5* Copyright (C) 2004 Olof Johansson <[email protected]>, IBM Corporation6* Copyright (C) 2006 Olof Johansson <[email protected]>7*8* Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.9*10*11* This program is free software; you can redistribute it and/or modify12* it under the terms of the GNU General Public License as published by13* the Free Software Foundation; either version 2 of the License, or14* (at your option) any later version.15*16* This program is distributed in the hope that it will be useful,17* but WITHOUT ANY WARRANTY; without even the implied warranty of18* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the19* GNU General Public License for more details.20*21* You should have received a copy of the GNU General Public License22* along with this program; if not, write to the Free Software23* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA24*/2526#include <linux/init.h>27#include <linux/types.h>28#include <linux/slab.h>29#include <linux/mm.h>30#include <linux/spinlock.h>31#include <linux/string.h>32#include <linux/pci.h>33#include <linux/dma-mapping.h>34#include <linux/crash_dump.h>35#include <linux/memory.h>36#include <asm/io.h>37#include <asm/prom.h>38#include <asm/rtas.h>39#include <asm/iommu.h>40#include <asm/pci-bridge.h>41#include <asm/machdep.h>42#include <asm/abs_addr.h>43#include <asm/pSeries_reconfig.h>44#include <asm/firmware.h>45#include <asm/tce.h>46#include <asm/ppc-pci.h>47#include <asm/udbg.h>48#include <asm/mmzone.h>4950#include "plpar_wrappers.h"515253static int tce_build_pSeries(struct iommu_table *tbl, long index,54long npages, unsigned long uaddr,55enum dma_data_direction direction,56struct dma_attrs *attrs)57{58u64 proto_tce;59u64 *tcep;60u64 rpn;6162proto_tce = TCE_PCI_READ; // Read allowed6364if (direction != DMA_TO_DEVICE)65proto_tce |= TCE_PCI_WRITE;6667tcep = ((u64 *)tbl->it_base) + index;6869while (npages--) {70/* can't move this out since we might cross MEMBLOCK boundary */71rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;72*tcep = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;7374uaddr += TCE_PAGE_SIZE;75tcep++;76}77return 0;78}798081static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)82{83u64 *tcep;8485tcep = ((u64 *)tbl->it_base) + index;8687while (npages--)88*(tcep++) = 0;89}9091static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)92{93u64 *tcep;9495tcep = ((u64 *)tbl->it_base) + index;9697return *tcep;98}99100static void tce_free_pSeriesLP(struct iommu_table*, long, long);101static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);102103static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,104long npages, unsigned long uaddr,105enum dma_data_direction direction,106struct dma_attrs *attrs)107{108u64 rc = 0;109u64 proto_tce, tce;110u64 rpn;111int ret = 0;112long tcenum_start = tcenum, npages_start = npages;113114rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;115proto_tce = TCE_PCI_READ;116if (direction != DMA_TO_DEVICE)117proto_tce |= TCE_PCI_WRITE;118119while (npages--) {120tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;121rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);122123if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {124ret = (int)rc;125tce_free_pSeriesLP(tbl, tcenum_start,126(npages_start - (npages + 1)));127break;128}129130if (rc && printk_ratelimit()) {131printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);132printk("\tindex = 0x%llx\n", (u64)tbl->it_index);133printk("\ttcenum = 0x%llx\n", (u64)tcenum);134printk("\ttce val = 0x%llx\n", tce );135show_stack(current, (unsigned long *)__get_SP());136}137138tcenum++;139rpn++;140}141return ret;142}143144static DEFINE_PER_CPU(u64 *, tce_page);145146static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,147long npages, unsigned long uaddr,148enum dma_data_direction direction,149struct dma_attrs *attrs)150{151u64 rc = 0;152u64 proto_tce;153u64 *tcep;154u64 rpn;155long l, limit;156long tcenum_start = tcenum, npages_start = npages;157int ret = 0;158159if (npages == 1) {160return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,161direction, attrs);162}163164tcep = __get_cpu_var(tce_page);165166/* This is safe to do since interrupts are off when we're called167* from iommu_alloc{,_sg}()168*/169if (!tcep) {170tcep = (u64 *)__get_free_page(GFP_ATOMIC);171/* If allocation fails, fall back to the loop implementation */172if (!tcep) {173return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,174direction, attrs);175}176__get_cpu_var(tce_page) = tcep;177}178179rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;180proto_tce = TCE_PCI_READ;181if (direction != DMA_TO_DEVICE)182proto_tce |= TCE_PCI_WRITE;183184/* We can map max one pageful of TCEs at a time */185do {186/*187* Set up the page with TCE data, looping through and setting188* the values.189*/190limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE);191192for (l = 0; l < limit; l++) {193tcep[l] = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;194rpn++;195}196197rc = plpar_tce_put_indirect((u64)tbl->it_index,198(u64)tcenum << 12,199(u64)virt_to_abs(tcep),200limit);201202npages -= limit;203tcenum += limit;204} while (npages > 0 && !rc);205206if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {207ret = (int)rc;208tce_freemulti_pSeriesLP(tbl, tcenum_start,209(npages_start - (npages + limit)));210return ret;211}212213if (rc && printk_ratelimit()) {214printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);215printk("\tindex = 0x%llx\n", (u64)tbl->it_index);216printk("\tnpages = 0x%llx\n", (u64)npages);217printk("\ttce[0] val = 0x%llx\n", tcep[0]);218show_stack(current, (unsigned long *)__get_SP());219}220return ret;221}222223static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)224{225u64 rc;226227while (npages--) {228rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0);229230if (rc && printk_ratelimit()) {231printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);232printk("\tindex = 0x%llx\n", (u64)tbl->it_index);233printk("\ttcenum = 0x%llx\n", (u64)tcenum);234show_stack(current, (unsigned long *)__get_SP());235}236237tcenum++;238}239}240241242static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)243{244u64 rc;245246rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);247248if (rc && printk_ratelimit()) {249printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");250printk("\trc = %lld\n", rc);251printk("\tindex = 0x%llx\n", (u64)tbl->it_index);252printk("\tnpages = 0x%llx\n", (u64)npages);253show_stack(current, (unsigned long *)__get_SP());254}255}256257static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)258{259u64 rc;260unsigned long tce_ret;261262rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret);263264if (rc && printk_ratelimit()) {265printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);266printk("\tindex = 0x%llx\n", (u64)tbl->it_index);267printk("\ttcenum = 0x%llx\n", (u64)tcenum);268show_stack(current, (unsigned long *)__get_SP());269}270271return tce_ret;272}273274/* this is compatible with cells for the device tree property */275struct dynamic_dma_window_prop {276__be32 liobn; /* tce table number */277__be64 dma_base; /* address hi,lo */278__be32 tce_shift; /* ilog2(tce_page_size) */279__be32 window_shift; /* ilog2(tce_window_size) */280};281282struct direct_window {283struct device_node *device;284const struct dynamic_dma_window_prop *prop;285struct list_head list;286};287288/* Dynamic DMA Window support */289struct ddw_query_response {290u32 windows_available;291u32 largest_available_block;292u32 page_size;293u32 migration_capable;294};295296struct ddw_create_response {297u32 liobn;298u32 addr_hi;299u32 addr_lo;300};301302static LIST_HEAD(direct_window_list);303/* prevents races between memory on/offline and window creation */304static DEFINE_SPINLOCK(direct_window_list_lock);305/* protects initializing window twice for same device */306static DEFINE_MUTEX(direct_window_init_mutex);307#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"308309static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,310unsigned long num_pfn, const void *arg)311{312const struct dynamic_dma_window_prop *maprange = arg;313int rc;314u64 tce_size, num_tce, dma_offset, next;315u32 tce_shift;316long limit;317318tce_shift = be32_to_cpu(maprange->tce_shift);319tce_size = 1ULL << tce_shift;320next = start_pfn << PAGE_SHIFT;321num_tce = num_pfn << PAGE_SHIFT;322323/* round back to the beginning of the tce page size */324num_tce += next & (tce_size - 1);325next &= ~(tce_size - 1);326327/* covert to number of tces */328num_tce |= tce_size - 1;329num_tce >>= tce_shift;330331do {332/*333* Set up the page with TCE data, looping through and setting334* the values.335*/336limit = min_t(long, num_tce, 512);337dma_offset = next + be64_to_cpu(maprange->dma_base);338339rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),340dma_offset,3410, limit);342num_tce -= limit;343} while (num_tce > 0 && !rc);344345return rc;346}347348static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,349unsigned long num_pfn, const void *arg)350{351const struct dynamic_dma_window_prop *maprange = arg;352u64 *tcep, tce_size, num_tce, dma_offset, next, proto_tce, liobn;353u32 tce_shift;354u64 rc = 0;355long l, limit;356357local_irq_disable(); /* to protect tcep and the page behind it */358tcep = __get_cpu_var(tce_page);359360if (!tcep) {361tcep = (u64 *)__get_free_page(GFP_ATOMIC);362if (!tcep) {363local_irq_enable();364return -ENOMEM;365}366__get_cpu_var(tce_page) = tcep;367}368369proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;370371liobn = (u64)be32_to_cpu(maprange->liobn);372tce_shift = be32_to_cpu(maprange->tce_shift);373tce_size = 1ULL << tce_shift;374next = start_pfn << PAGE_SHIFT;375num_tce = num_pfn << PAGE_SHIFT;376377/* round back to the beginning of the tce page size */378num_tce += next & (tce_size - 1);379next &= ~(tce_size - 1);380381/* covert to number of tces */382num_tce |= tce_size - 1;383num_tce >>= tce_shift;384385/* We can map max one pageful of TCEs at a time */386do {387/*388* Set up the page with TCE data, looping through and setting389* the values.390*/391limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE);392dma_offset = next + be64_to_cpu(maprange->dma_base);393394for (l = 0; l < limit; l++) {395tcep[l] = proto_tce | next;396next += tce_size;397}398399rc = plpar_tce_put_indirect(liobn,400dma_offset,401(u64)virt_to_abs(tcep),402limit);403404num_tce -= limit;405} while (num_tce > 0 && !rc);406407/* error cleanup: caller will clear whole range */408409local_irq_enable();410return rc;411}412413static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,414unsigned long num_pfn, void *arg)415{416return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);417}418419420#ifdef CONFIG_PCI421static void iommu_table_setparms(struct pci_controller *phb,422struct device_node *dn,423struct iommu_table *tbl)424{425struct device_node *node;426const unsigned long *basep;427const u32 *sizep;428429node = phb->dn;430431basep = of_get_property(node, "linux,tce-base", NULL);432sizep = of_get_property(node, "linux,tce-size", NULL);433if (basep == NULL || sizep == NULL) {434printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %s has "435"missing tce entries !\n", dn->full_name);436return;437}438439tbl->it_base = (unsigned long)__va(*basep);440441if (!is_kdump_kernel())442memset((void *)tbl->it_base, 0, *sizep);443444tbl->it_busno = phb->bus->number;445446/* Units of tce entries */447tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT;448449/* Test if we are going over 2GB of DMA space */450if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {451udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");452panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");453}454455phb->dma_window_base_cur += phb->dma_window_size;456457/* Set the tce table size - measured in entries */458tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT;459460tbl->it_index = 0;461tbl->it_blocksize = 16;462tbl->it_type = TCE_PCI;463}464465/*466* iommu_table_setparms_lpar467*468* Function: On pSeries LPAR systems, return TCE table info, given a pci bus.469*/470static void iommu_table_setparms_lpar(struct pci_controller *phb,471struct device_node *dn,472struct iommu_table *tbl,473const void *dma_window)474{475unsigned long offset, size;476477of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size);478479tbl->it_busno = phb->bus->number;480tbl->it_base = 0;481tbl->it_blocksize = 16;482tbl->it_type = TCE_PCI;483tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;484tbl->it_size = size >> IOMMU_PAGE_SHIFT;485}486487static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)488{489struct device_node *dn;490struct iommu_table *tbl;491struct device_node *isa_dn, *isa_dn_orig;492struct device_node *tmp;493struct pci_dn *pci;494int children;495496dn = pci_bus_to_OF_node(bus);497498pr_debug("pci_dma_bus_setup_pSeries: setting up bus %s\n", dn->full_name);499500if (bus->self) {501/* This is not a root bus, any setup will be done for the502* device-side of the bridge in iommu_dev_setup_pSeries().503*/504return;505}506pci = PCI_DN(dn);507508/* Check if the ISA bus on the system is under509* this PHB.510*/511isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");512513while (isa_dn && isa_dn != dn)514isa_dn = isa_dn->parent;515516if (isa_dn_orig)517of_node_put(isa_dn_orig);518519/* Count number of direct PCI children of the PHB. */520for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)521children++;522523pr_debug("Children: %d\n", children);524525/* Calculate amount of DMA window per slot. Each window must be526* a power of two (due to pci_alloc_consistent requirements).527*528* Keep 256MB aside for PHBs with ISA.529*/530531if (!isa_dn) {532/* No ISA/IDE - just set window size and return */533pci->phb->dma_window_size = 0x80000000ul; /* To be divided */534535while (pci->phb->dma_window_size * children > 0x80000000ul)536pci->phb->dma_window_size >>= 1;537pr_debug("No ISA/IDE, window size is 0x%llx\n",538pci->phb->dma_window_size);539pci->phb->dma_window_base_cur = 0;540541return;542}543544/* If we have ISA, then we probably have an IDE545* controller too. Allocate a 128MB table but546* skip the first 128MB to avoid stepping on ISA547* space.548*/549pci->phb->dma_window_size = 0x8000000ul;550pci->phb->dma_window_base_cur = 0x8000000ul;551552tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,553pci->phb->node);554555iommu_table_setparms(pci->phb, dn, tbl);556pci->iommu_table = iommu_init_table(tbl, pci->phb->node);557558/* Divide the rest (1.75GB) among the children */559pci->phb->dma_window_size = 0x80000000ul;560while (pci->phb->dma_window_size * children > 0x70000000ul)561pci->phb->dma_window_size >>= 1;562563pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);564}565566567static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)568{569struct iommu_table *tbl;570struct device_node *dn, *pdn;571struct pci_dn *ppci;572const void *dma_window = NULL;573574dn = pci_bus_to_OF_node(bus);575576pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %s\n",577dn->full_name);578579/* Find nearest ibm,dma-window, walking up the device tree */580for (pdn = dn; pdn != NULL; pdn = pdn->parent) {581dma_window = of_get_property(pdn, "ibm,dma-window", NULL);582if (dma_window != NULL)583break;584}585586if (dma_window == NULL) {587pr_debug(" no ibm,dma-window property !\n");588return;589}590591ppci = PCI_DN(pdn);592593pr_debug(" parent is %s, iommu_table: 0x%p\n",594pdn->full_name, ppci->iommu_table);595596if (!ppci->iommu_table) {597tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,598ppci->phb->node);599iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);600ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);601pr_debug(" created table: %p\n", ppci->iommu_table);602}603}604605606static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)607{608struct device_node *dn;609struct iommu_table *tbl;610611pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));612613dn = dev->dev.of_node;614615/* If we're the direct child of a root bus, then we need to allocate616* an iommu table ourselves. The bus setup code should have setup617* the window sizes already.618*/619if (!dev->bus->self) {620struct pci_controller *phb = PCI_DN(dn)->phb;621622pr_debug(" --> first child, no bridge. Allocating iommu table.\n");623tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,624phb->node);625iommu_table_setparms(phb, dn, tbl);626PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);627set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);628return;629}630631/* If this device is further down the bus tree, search upwards until632* an already allocated iommu table is found and use that.633*/634635while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL)636dn = dn->parent;637638if (dn && PCI_DN(dn))639set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);640else641printk(KERN_WARNING "iommu: Device %s has no iommu table\n",642pci_name(dev));643}644645static int __read_mostly disable_ddw;646647static int __init disable_ddw_setup(char *str)648{649disable_ddw = 1;650printk(KERN_INFO "ppc iommu: disabling ddw.\n");651652return 0;653}654655early_param("disable_ddw", disable_ddw_setup);656657static void remove_ddw(struct device_node *np)658{659struct dynamic_dma_window_prop *dwp;660struct property *win64;661const u32 *ddw_avail;662u64 liobn;663int len, ret;664665ddw_avail = of_get_property(np, "ibm,ddw-applicable", &len);666win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);667if (!win64)668return;669670if (!ddw_avail || len < 3 * sizeof(u32) || win64->length < sizeof(*dwp))671goto delprop;672673dwp = win64->value;674liobn = (u64)be32_to_cpu(dwp->liobn);675676/* clear the whole window, note the arg is in kernel pages */677ret = tce_clearrange_multi_pSeriesLP(0,6781ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);679if (ret)680pr_warning("%s failed to clear tces in window.\n",681np->full_name);682else683pr_debug("%s successfully cleared tces in window.\n",684np->full_name);685686ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);687if (ret)688pr_warning("%s: failed to remove direct window: rtas returned "689"%d to ibm,remove-pe-dma-window(%x) %llx\n",690np->full_name, ret, ddw_avail[2], liobn);691else692pr_debug("%s: successfully removed direct window: rtas returned "693"%d to ibm,remove-pe-dma-window(%x) %llx\n",694np->full_name, ret, ddw_avail[2], liobn);695696delprop:697ret = prom_remove_property(np, win64);698if (ret)699pr_warning("%s: failed to remove direct window property: %d\n",700np->full_name, ret);701}702703static u64 find_existing_ddw(struct device_node *pdn)704{705struct direct_window *window;706const struct dynamic_dma_window_prop *direct64;707u64 dma_addr = 0;708709spin_lock(&direct_window_list_lock);710/* check if we already created a window and dupe that config if so */711list_for_each_entry(window, &direct_window_list, list) {712if (window->device == pdn) {713direct64 = window->prop;714dma_addr = direct64->dma_base;715break;716}717}718spin_unlock(&direct_window_list_lock);719720return dma_addr;721}722723static int find_existing_ddw_windows(void)724{725int len;726struct device_node *pdn;727struct direct_window *window;728const struct dynamic_dma_window_prop *direct64;729730if (!firmware_has_feature(FW_FEATURE_LPAR))731return 0;732733for_each_node_with_property(pdn, DIRECT64_PROPNAME) {734direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);735if (!direct64)736continue;737738window = kzalloc(sizeof(*window), GFP_KERNEL);739if (!window || len < sizeof(struct dynamic_dma_window_prop)) {740kfree(window);741remove_ddw(pdn);742continue;743}744745window->device = pdn;746window->prop = direct64;747spin_lock(&direct_window_list_lock);748list_add(&window->list, &direct_window_list);749spin_unlock(&direct_window_list_lock);750}751752return 0;753}754machine_arch_initcall(pseries, find_existing_ddw_windows);755756static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,757struct ddw_query_response *query)758{759struct device_node *dn;760struct pci_dn *pcidn;761u32 cfg_addr;762u64 buid;763int ret;764765/*766* Get the config address and phb buid of the PE window.767* Rely on eeh to retrieve this for us.768* Retrieve them from the pci device, not the node with the769* dma-window property770*/771dn = pci_device_to_OF_node(dev);772pcidn = PCI_DN(dn);773cfg_addr = pcidn->eeh_config_addr;774if (pcidn->eeh_pe_config_addr)775cfg_addr = pcidn->eeh_pe_config_addr;776buid = pcidn->phb->buid;777ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,778cfg_addr, BUID_HI(buid), BUID_LO(buid));779dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"780" returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),781BUID_LO(buid), ret);782return ret;783}784785static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,786struct ddw_create_response *create, int page_shift,787int window_shift)788{789struct device_node *dn;790struct pci_dn *pcidn;791u32 cfg_addr;792u64 buid;793int ret;794795/*796* Get the config address and phb buid of the PE window.797* Rely on eeh to retrieve this for us.798* Retrieve them from the pci device, not the node with the799* dma-window property800*/801dn = pci_device_to_OF_node(dev);802pcidn = PCI_DN(dn);803cfg_addr = pcidn->eeh_config_addr;804if (pcidn->eeh_pe_config_addr)805cfg_addr = pcidn->eeh_pe_config_addr;806buid = pcidn->phb->buid;807808do {809/* extra outputs are LIOBN and dma-addr (hi, lo) */810ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr,811BUID_HI(buid), BUID_LO(buid), page_shift, window_shift);812} while (rtas_busy_delay(ret));813dev_info(&dev->dev,814"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "815"(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1],816cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,817window_shift, ret, create->liobn, create->addr_hi, create->addr_lo);818819return ret;820}821822/*823* If the PE supports dynamic dma windows, and there is space for a table824* that can map all pages in a linear offset, then setup such a table,825* and record the dma-offset in the struct device.826*827* dev: the pci device we are checking828* pdn: the parent pe node with the ibm,dma_window property829* Future: also check if we can remap the base window for our base page size830*831* returns the dma offset for use by dma_set_mask832*/833static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)834{835int len, ret;836struct ddw_query_response query;837struct ddw_create_response create;838int page_shift;839u64 dma_addr, max_addr;840struct device_node *dn;841const u32 *uninitialized_var(ddw_avail);842struct direct_window *window;843struct property *win64;844struct dynamic_dma_window_prop *ddwprop;845846mutex_lock(&direct_window_init_mutex);847848dma_addr = find_existing_ddw(pdn);849if (dma_addr != 0)850goto out_unlock;851852/*853* the ibm,ddw-applicable property holds the tokens for:854* ibm,query-pe-dma-window855* ibm,create-pe-dma-window856* ibm,remove-pe-dma-window857* for the given node in that order.858* the property is actually in the parent, not the PE859*/860ddw_avail = of_get_property(pdn, "ibm,ddw-applicable", &len);861if (!ddw_avail || len < 3 * sizeof(u32))862goto out_unlock;863864/*865* Query if there is a second window of size to map the866* whole partition. Query returns number of windows, largest867* block assigned to PE (partition endpoint), and two bitmasks868* of page sizes: supported and supported for migrate-dma.869*/870dn = pci_device_to_OF_node(dev);871ret = query_ddw(dev, ddw_avail, &query);872if (ret != 0)873goto out_unlock;874875if (query.windows_available == 0) {876/*877* no additional windows are available for this device.878* We might be able to reallocate the existing window,879* trading in for a larger page size.880*/881dev_dbg(&dev->dev, "no free dynamic windows");882goto out_unlock;883}884if (query.page_size & 4) {885page_shift = 24; /* 16MB */886} else if (query.page_size & 2) {887page_shift = 16; /* 64kB */888} else if (query.page_size & 1) {889page_shift = 12; /* 4kB */890} else {891dev_dbg(&dev->dev, "no supported direct page size in mask %x",892query.page_size);893goto out_unlock;894}895/* verify the window * number of ptes will map the partition */896/* check largest block * page size > max memory hotplug addr */897max_addr = memory_hotplug_max();898if (query.largest_available_block < (max_addr >> page_shift)) {899dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u "900"%llu-sized pages\n", max_addr, query.largest_available_block,9011ULL << page_shift);902goto out_unlock;903}904len = order_base_2(max_addr);905win64 = kzalloc(sizeof(struct property), GFP_KERNEL);906if (!win64) {907dev_info(&dev->dev,908"couldn't allocate property for 64bit dma window\n");909goto out_unlock;910}911win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);912win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);913win64->length = sizeof(*ddwprop);914if (!win64->name || !win64->value) {915dev_info(&dev->dev,916"couldn't allocate property name and value\n");917goto out_free_prop;918}919920ret = create_ddw(dev, ddw_avail, &create, page_shift, len);921if (ret != 0)922goto out_free_prop;923924ddwprop->liobn = cpu_to_be32(create.liobn);925ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2));926ddwprop->tce_shift = cpu_to_be32(page_shift);927ddwprop->window_shift = cpu_to_be32(len);928929dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n",930create.liobn, dn->full_name);931932window = kzalloc(sizeof(*window), GFP_KERNEL);933if (!window)934goto out_clear_window;935936ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,937win64->value, tce_setrange_multi_pSeriesLP_walk);938if (ret) {939dev_info(&dev->dev, "failed to map direct window for %s: %d\n",940dn->full_name, ret);941goto out_clear_window;942}943944ret = prom_add_property(pdn, win64);945if (ret) {946dev_err(&dev->dev, "unable to add dma window property for %s: %d",947pdn->full_name, ret);948goto out_clear_window;949}950951window->device = pdn;952window->prop = ddwprop;953spin_lock(&direct_window_list_lock);954list_add(&window->list, &direct_window_list);955spin_unlock(&direct_window_list_lock);956957dma_addr = of_read_number(&create.addr_hi, 2);958goto out_unlock;959960out_clear_window:961remove_ddw(pdn);962963out_free_prop:964kfree(win64->name);965kfree(win64->value);966kfree(win64);967968out_unlock:969mutex_unlock(&direct_window_init_mutex);970return dma_addr;971}972973static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)974{975struct device_node *pdn, *dn;976struct iommu_table *tbl;977const void *dma_window = NULL;978struct pci_dn *pci;979980pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));981982/* dev setup for LPAR is a little tricky, since the device tree might983* contain the dma-window properties per-device and not necessarily984* for the bus. So we need to search upwards in the tree until we985* either hit a dma-window property, OR find a parent with a table986* already allocated.987*/988dn = pci_device_to_OF_node(dev);989pr_debug(" node is %s\n", dn->full_name);990991for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;992pdn = pdn->parent) {993dma_window = of_get_property(pdn, "ibm,dma-window", NULL);994if (dma_window)995break;996}997998if (!pdn || !PCI_DN(pdn)) {999printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "1000"no DMA window found for pci dev=%s dn=%s\n",1001pci_name(dev), dn? dn->full_name : "<null>");1002return;1003}1004pr_debug(" parent is %s\n", pdn->full_name);10051006pci = PCI_DN(pdn);1007if (!pci->iommu_table) {1008tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,1009pci->phb->node);1010iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);1011pci->iommu_table = iommu_init_table(tbl, pci->phb->node);1012pr_debug(" created table: %p\n", pci->iommu_table);1013} else {1014pr_debug(" found DMA window, table: %p\n", pci->iommu_table);1015}10161017set_iommu_table_base(&dev->dev, pci->iommu_table);1018}10191020static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)1021{1022bool ddw_enabled = false;1023struct device_node *pdn, *dn;1024struct pci_dev *pdev;1025const void *dma_window = NULL;1026u64 dma_offset;10271028if (!dev->dma_mask)1029return -EIO;10301031if (!dev_is_pci(dev))1032goto check_mask;10331034pdev = to_pci_dev(dev);10351036/* only attempt to use a new window if 64-bit DMA is requested */1037if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) {1038dn = pci_device_to_OF_node(pdev);1039dev_dbg(dev, "node is %s\n", dn->full_name);10401041/*1042* the device tree might contain the dma-window properties1043* per-device and not necessarily for the bus. So we need to1044* search upwards in the tree until we either hit a dma-window1045* property, OR find a parent with a table already allocated.1046*/1047for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;1048pdn = pdn->parent) {1049dma_window = of_get_property(pdn, "ibm,dma-window", NULL);1050if (dma_window)1051break;1052}1053if (pdn && PCI_DN(pdn)) {1054dma_offset = enable_ddw(pdev, pdn);1055if (dma_offset != 0) {1056dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset);1057set_dma_offset(dev, dma_offset);1058set_dma_ops(dev, &dma_direct_ops);1059ddw_enabled = true;1060}1061}1062}10631064/* fall back on iommu ops, restore table pointer with ops */1065if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) {1066dev_info(dev, "Restoring 32-bit DMA via iommu\n");1067set_dma_ops(dev, &dma_iommu_ops);1068pci_dma_dev_setup_pSeriesLP(pdev);1069}10701071check_mask:1072if (!dma_supported(dev, dma_mask))1073return -EIO;10741075*dev->dma_mask = dma_mask;1076return 0;1077}10781079#else /* CONFIG_PCI */1080#define pci_dma_bus_setup_pSeries NULL1081#define pci_dma_dev_setup_pSeries NULL1082#define pci_dma_bus_setup_pSeriesLP NULL1083#define pci_dma_dev_setup_pSeriesLP NULL1084#define dma_set_mask_pSeriesLP NULL1085#endif /* !CONFIG_PCI */10861087static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,1088void *data)1089{1090struct direct_window *window;1091struct memory_notify *arg = data;1092int ret = 0;10931094switch (action) {1095case MEM_GOING_ONLINE:1096spin_lock(&direct_window_list_lock);1097list_for_each_entry(window, &direct_window_list, list) {1098ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,1099arg->nr_pages, window->prop);1100/* XXX log error */1101}1102spin_unlock(&direct_window_list_lock);1103break;1104case MEM_CANCEL_ONLINE:1105case MEM_OFFLINE:1106spin_lock(&direct_window_list_lock);1107list_for_each_entry(window, &direct_window_list, list) {1108ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,1109arg->nr_pages, window->prop);1110/* XXX log error */1111}1112spin_unlock(&direct_window_list_lock);1113break;1114default:1115break;1116}1117if (ret && action != MEM_CANCEL_ONLINE)1118return NOTIFY_BAD;11191120return NOTIFY_OK;1121}11221123static struct notifier_block iommu_mem_nb = {1124.notifier_call = iommu_mem_notifier,1125};11261127static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)1128{1129int err = NOTIFY_OK;1130struct device_node *np = node;1131struct pci_dn *pci = PCI_DN(np);1132struct direct_window *window;11331134switch (action) {1135case PSERIES_RECONFIG_REMOVE:1136if (pci && pci->iommu_table)1137iommu_free_table(pci->iommu_table, np->full_name);11381139spin_lock(&direct_window_list_lock);1140list_for_each_entry(window, &direct_window_list, list) {1141if (window->device == np) {1142list_del(&window->list);1143kfree(window);1144break;1145}1146}1147spin_unlock(&direct_window_list_lock);11481149/*1150* Because the notifier runs after isolation of the1151* slot, we are guaranteed any DMA window has already1152* been revoked and the TCEs have been marked invalid,1153* so we don't need a call to remove_ddw(np). However,1154* if an additional notifier action is added before the1155* isolate call, we should update this code for1156* completeness with such a call.1157*/1158break;1159default:1160err = NOTIFY_DONE;1161break;1162}1163return err;1164}11651166static struct notifier_block iommu_reconfig_nb = {1167.notifier_call = iommu_reconfig_notifier,1168};11691170/* These are called very early. */1171void iommu_init_early_pSeries(void)1172{1173if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))1174return;11751176if (firmware_has_feature(FW_FEATURE_LPAR)) {1177if (firmware_has_feature(FW_FEATURE_MULTITCE)) {1178ppc_md.tce_build = tce_buildmulti_pSeriesLP;1179ppc_md.tce_free = tce_freemulti_pSeriesLP;1180} else {1181ppc_md.tce_build = tce_build_pSeriesLP;1182ppc_md.tce_free = tce_free_pSeriesLP;1183}1184ppc_md.tce_get = tce_get_pSeriesLP;1185ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP;1186ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP;1187ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;1188} else {1189ppc_md.tce_build = tce_build_pSeries;1190ppc_md.tce_free = tce_free_pSeries;1191ppc_md.tce_get = tce_get_pseries;1192ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeries;1193ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeries;1194}119511961197pSeries_reconfig_notifier_register(&iommu_reconfig_nb);1198register_memory_notifier(&iommu_mem_nb);11991200set_pci_dma_ops(&dma_iommu_ops);1201}12021203static int __init disable_multitce(char *str)1204{1205if (strcmp(str, "off") == 0 &&1206firmware_has_feature(FW_FEATURE_LPAR) &&1207firmware_has_feature(FW_FEATURE_MULTITCE)) {1208printk(KERN_INFO "Disabling MULTITCE firmware feature\n");1209ppc_md.tce_build = tce_build_pSeriesLP;1210ppc_md.tce_free = tce_free_pSeriesLP;1211powerpc_firmware_features &= ~FW_FEATURE_MULTITCE;1212}1213return 1;1214}12151216__setup("multitce=", disable_multitce);121712181219