Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
awilliam
GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/powerpc/platforms/pseries/iommu.c
10818 views
1
/*
2
* Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
3
*
4
* Rewrite, cleanup:
5
*
6
* Copyright (C) 2004 Olof Johansson <[email protected]>, IBM Corporation
7
* Copyright (C) 2006 Olof Johansson <[email protected]>
8
*
9
* Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
10
*
11
*
12
* This program is free software; you can redistribute it and/or modify
13
* it under the terms of the GNU General Public License as published by
14
* the Free Software Foundation; either version 2 of the License, or
15
* (at your option) any later version.
16
*
17
* This program is distributed in the hope that it will be useful,
18
* but WITHOUT ANY WARRANTY; without even the implied warranty of
19
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20
* GNU General Public License for more details.
21
*
22
* You should have received a copy of the GNU General Public License
23
* along with this program; if not, write to the Free Software
24
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
*/
26
27
#include <linux/init.h>
28
#include <linux/types.h>
29
#include <linux/slab.h>
30
#include <linux/mm.h>
31
#include <linux/spinlock.h>
32
#include <linux/string.h>
33
#include <linux/pci.h>
34
#include <linux/dma-mapping.h>
35
#include <linux/crash_dump.h>
36
#include <linux/memory.h>
37
#include <asm/io.h>
38
#include <asm/prom.h>
39
#include <asm/rtas.h>
40
#include <asm/iommu.h>
41
#include <asm/pci-bridge.h>
42
#include <asm/machdep.h>
43
#include <asm/abs_addr.h>
44
#include <asm/pSeries_reconfig.h>
45
#include <asm/firmware.h>
46
#include <asm/tce.h>
47
#include <asm/ppc-pci.h>
48
#include <asm/udbg.h>
49
#include <asm/mmzone.h>
50
51
#include "plpar_wrappers.h"
52
53
54
static int tce_build_pSeries(struct iommu_table *tbl, long index,
55
long npages, unsigned long uaddr,
56
enum dma_data_direction direction,
57
struct dma_attrs *attrs)
58
{
59
u64 proto_tce;
60
u64 *tcep;
61
u64 rpn;
62
63
proto_tce = TCE_PCI_READ; // Read allowed
64
65
if (direction != DMA_TO_DEVICE)
66
proto_tce |= TCE_PCI_WRITE;
67
68
tcep = ((u64 *)tbl->it_base) + index;
69
70
while (npages--) {
71
/* can't move this out since we might cross MEMBLOCK boundary */
72
rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
73
*tcep = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
74
75
uaddr += TCE_PAGE_SIZE;
76
tcep++;
77
}
78
return 0;
79
}
80
81
82
static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
83
{
84
u64 *tcep;
85
86
tcep = ((u64 *)tbl->it_base) + index;
87
88
while (npages--)
89
*(tcep++) = 0;
90
}
91
92
static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
93
{
94
u64 *tcep;
95
96
tcep = ((u64 *)tbl->it_base) + index;
97
98
return *tcep;
99
}
100
101
static void tce_free_pSeriesLP(struct iommu_table*, long, long);
102
static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
103
104
static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
105
long npages, unsigned long uaddr,
106
enum dma_data_direction direction,
107
struct dma_attrs *attrs)
108
{
109
u64 rc = 0;
110
u64 proto_tce, tce;
111
u64 rpn;
112
int ret = 0;
113
long tcenum_start = tcenum, npages_start = npages;
114
115
rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
116
proto_tce = TCE_PCI_READ;
117
if (direction != DMA_TO_DEVICE)
118
proto_tce |= TCE_PCI_WRITE;
119
120
while (npages--) {
121
tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
122
rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);
123
124
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
125
ret = (int)rc;
126
tce_free_pSeriesLP(tbl, tcenum_start,
127
(npages_start - (npages + 1)));
128
break;
129
}
130
131
if (rc && printk_ratelimit()) {
132
printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
133
printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
134
printk("\ttcenum = 0x%llx\n", (u64)tcenum);
135
printk("\ttce val = 0x%llx\n", tce );
136
show_stack(current, (unsigned long *)__get_SP());
137
}
138
139
tcenum++;
140
rpn++;
141
}
142
return ret;
143
}
144
145
static DEFINE_PER_CPU(u64 *, tce_page);
146
147
static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
148
long npages, unsigned long uaddr,
149
enum dma_data_direction direction,
150
struct dma_attrs *attrs)
151
{
152
u64 rc = 0;
153
u64 proto_tce;
154
u64 *tcep;
155
u64 rpn;
156
long l, limit;
157
long tcenum_start = tcenum, npages_start = npages;
158
int ret = 0;
159
160
if (npages == 1) {
161
return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
162
direction, attrs);
163
}
164
165
tcep = __get_cpu_var(tce_page);
166
167
/* This is safe to do since interrupts are off when we're called
168
* from iommu_alloc{,_sg}()
169
*/
170
if (!tcep) {
171
tcep = (u64 *)__get_free_page(GFP_ATOMIC);
172
/* If allocation fails, fall back to the loop implementation */
173
if (!tcep) {
174
return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
175
direction, attrs);
176
}
177
__get_cpu_var(tce_page) = tcep;
178
}
179
180
rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
181
proto_tce = TCE_PCI_READ;
182
if (direction != DMA_TO_DEVICE)
183
proto_tce |= TCE_PCI_WRITE;
184
185
/* We can map max one pageful of TCEs at a time */
186
do {
187
/*
188
* Set up the page with TCE data, looping through and setting
189
* the values.
190
*/
191
limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE);
192
193
for (l = 0; l < limit; l++) {
194
tcep[l] = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
195
rpn++;
196
}
197
198
rc = plpar_tce_put_indirect((u64)tbl->it_index,
199
(u64)tcenum << 12,
200
(u64)virt_to_abs(tcep),
201
limit);
202
203
npages -= limit;
204
tcenum += limit;
205
} while (npages > 0 && !rc);
206
207
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
208
ret = (int)rc;
209
tce_freemulti_pSeriesLP(tbl, tcenum_start,
210
(npages_start - (npages + limit)));
211
return ret;
212
}
213
214
if (rc && printk_ratelimit()) {
215
printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
216
printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
217
printk("\tnpages = 0x%llx\n", (u64)npages);
218
printk("\ttce[0] val = 0x%llx\n", tcep[0]);
219
show_stack(current, (unsigned long *)__get_SP());
220
}
221
return ret;
222
}
223
224
static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
225
{
226
u64 rc;
227
228
while (npages--) {
229
rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0);
230
231
if (rc && printk_ratelimit()) {
232
printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
233
printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
234
printk("\ttcenum = 0x%llx\n", (u64)tcenum);
235
show_stack(current, (unsigned long *)__get_SP());
236
}
237
238
tcenum++;
239
}
240
}
241
242
243
static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
244
{
245
u64 rc;
246
247
rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
248
249
if (rc && printk_ratelimit()) {
250
printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
251
printk("\trc = %lld\n", rc);
252
printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
253
printk("\tnpages = 0x%llx\n", (u64)npages);
254
show_stack(current, (unsigned long *)__get_SP());
255
}
256
}
257
258
static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
259
{
260
u64 rc;
261
unsigned long tce_ret;
262
263
rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret);
264
265
if (rc && printk_ratelimit()) {
266
printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
267
printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
268
printk("\ttcenum = 0x%llx\n", (u64)tcenum);
269
show_stack(current, (unsigned long *)__get_SP());
270
}
271
272
return tce_ret;
273
}
274
275
/* this is compatible with cells for the device tree property */
276
struct dynamic_dma_window_prop {
277
__be32 liobn; /* tce table number */
278
__be64 dma_base; /* address hi,lo */
279
__be32 tce_shift; /* ilog2(tce_page_size) */
280
__be32 window_shift; /* ilog2(tce_window_size) */
281
};
282
283
struct direct_window {
284
struct device_node *device;
285
const struct dynamic_dma_window_prop *prop;
286
struct list_head list;
287
};
288
289
/* Dynamic DMA Window support */
290
struct ddw_query_response {
291
u32 windows_available;
292
u32 largest_available_block;
293
u32 page_size;
294
u32 migration_capable;
295
};
296
297
struct ddw_create_response {
298
u32 liobn;
299
u32 addr_hi;
300
u32 addr_lo;
301
};
302
303
static LIST_HEAD(direct_window_list);
304
/* prevents races between memory on/offline and window creation */
305
static DEFINE_SPINLOCK(direct_window_list_lock);
306
/* protects initializing window twice for same device */
307
static DEFINE_MUTEX(direct_window_init_mutex);
308
#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
309
310
static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
311
unsigned long num_pfn, const void *arg)
312
{
313
const struct dynamic_dma_window_prop *maprange = arg;
314
int rc;
315
u64 tce_size, num_tce, dma_offset, next;
316
u32 tce_shift;
317
long limit;
318
319
tce_shift = be32_to_cpu(maprange->tce_shift);
320
tce_size = 1ULL << tce_shift;
321
next = start_pfn << PAGE_SHIFT;
322
num_tce = num_pfn << PAGE_SHIFT;
323
324
/* round back to the beginning of the tce page size */
325
num_tce += next & (tce_size - 1);
326
next &= ~(tce_size - 1);
327
328
/* covert to number of tces */
329
num_tce |= tce_size - 1;
330
num_tce >>= tce_shift;
331
332
do {
333
/*
334
* Set up the page with TCE data, looping through and setting
335
* the values.
336
*/
337
limit = min_t(long, num_tce, 512);
338
dma_offset = next + be64_to_cpu(maprange->dma_base);
339
340
rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
341
dma_offset,
342
0, limit);
343
num_tce -= limit;
344
} while (num_tce > 0 && !rc);
345
346
return rc;
347
}
348
349
static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
350
unsigned long num_pfn, const void *arg)
351
{
352
const struct dynamic_dma_window_prop *maprange = arg;
353
u64 *tcep, tce_size, num_tce, dma_offset, next, proto_tce, liobn;
354
u32 tce_shift;
355
u64 rc = 0;
356
long l, limit;
357
358
local_irq_disable(); /* to protect tcep and the page behind it */
359
tcep = __get_cpu_var(tce_page);
360
361
if (!tcep) {
362
tcep = (u64 *)__get_free_page(GFP_ATOMIC);
363
if (!tcep) {
364
local_irq_enable();
365
return -ENOMEM;
366
}
367
__get_cpu_var(tce_page) = tcep;
368
}
369
370
proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
371
372
liobn = (u64)be32_to_cpu(maprange->liobn);
373
tce_shift = be32_to_cpu(maprange->tce_shift);
374
tce_size = 1ULL << tce_shift;
375
next = start_pfn << PAGE_SHIFT;
376
num_tce = num_pfn << PAGE_SHIFT;
377
378
/* round back to the beginning of the tce page size */
379
num_tce += next & (tce_size - 1);
380
next &= ~(tce_size - 1);
381
382
/* covert to number of tces */
383
num_tce |= tce_size - 1;
384
num_tce >>= tce_shift;
385
386
/* We can map max one pageful of TCEs at a time */
387
do {
388
/*
389
* Set up the page with TCE data, looping through and setting
390
* the values.
391
*/
392
limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE);
393
dma_offset = next + be64_to_cpu(maprange->dma_base);
394
395
for (l = 0; l < limit; l++) {
396
tcep[l] = proto_tce | next;
397
next += tce_size;
398
}
399
400
rc = plpar_tce_put_indirect(liobn,
401
dma_offset,
402
(u64)virt_to_abs(tcep),
403
limit);
404
405
num_tce -= limit;
406
} while (num_tce > 0 && !rc);
407
408
/* error cleanup: caller will clear whole range */
409
410
local_irq_enable();
411
return rc;
412
}
413
414
/*
 * Adapter with the non-const @arg signature required by
 * walk_system_ram_range().
 */
static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
					unsigned long num_pfn, void *arg)
{
	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
}
419
420
421
#ifdef CONFIG_PCI
422
static void iommu_table_setparms(struct pci_controller *phb,
423
struct device_node *dn,
424
struct iommu_table *tbl)
425
{
426
struct device_node *node;
427
const unsigned long *basep;
428
const u32 *sizep;
429
430
node = phb->dn;
431
432
basep = of_get_property(node, "linux,tce-base", NULL);
433
sizep = of_get_property(node, "linux,tce-size", NULL);
434
if (basep == NULL || sizep == NULL) {
435
printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %s has "
436
"missing tce entries !\n", dn->full_name);
437
return;
438
}
439
440
tbl->it_base = (unsigned long)__va(*basep);
441
442
if (!is_kdump_kernel())
443
memset((void *)tbl->it_base, 0, *sizep);
444
445
tbl->it_busno = phb->bus->number;
446
447
/* Units of tce entries */
448
tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT;
449
450
/* Test if we are going over 2GB of DMA space */
451
if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
452
udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
453
panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
454
}
455
456
phb->dma_window_base_cur += phb->dma_window_size;
457
458
/* Set the tce table size - measured in entries */
459
tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT;
460
461
tbl->it_index = 0;
462
tbl->it_blocksize = 16;
463
tbl->it_type = TCE_PCI;
464
}
465
466
/*
467
* iommu_table_setparms_lpar
468
*
469
* Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
470
*/
471
static void iommu_table_setparms_lpar(struct pci_controller *phb,
472
struct device_node *dn,
473
struct iommu_table *tbl,
474
const void *dma_window)
475
{
476
unsigned long offset, size;
477
478
of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size);
479
480
tbl->it_busno = phb->bus->number;
481
tbl->it_base = 0;
482
tbl->it_blocksize = 16;
483
tbl->it_type = TCE_PCI;
484
tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
485
tbl->it_size = size >> IOMMU_PAGE_SHIFT;
486
}
487
488
static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
489
{
490
struct device_node *dn;
491
struct iommu_table *tbl;
492
struct device_node *isa_dn, *isa_dn_orig;
493
struct device_node *tmp;
494
struct pci_dn *pci;
495
int children;
496
497
dn = pci_bus_to_OF_node(bus);
498
499
pr_debug("pci_dma_bus_setup_pSeries: setting up bus %s\n", dn->full_name);
500
501
if (bus->self) {
502
/* This is not a root bus, any setup will be done for the
503
* device-side of the bridge in iommu_dev_setup_pSeries().
504
*/
505
return;
506
}
507
pci = PCI_DN(dn);
508
509
/* Check if the ISA bus on the system is under
510
* this PHB.
511
*/
512
isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");
513
514
while (isa_dn && isa_dn != dn)
515
isa_dn = isa_dn->parent;
516
517
if (isa_dn_orig)
518
of_node_put(isa_dn_orig);
519
520
/* Count number of direct PCI children of the PHB. */
521
for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
522
children++;
523
524
pr_debug("Children: %d\n", children);
525
526
/* Calculate amount of DMA window per slot. Each window must be
527
* a power of two (due to pci_alloc_consistent requirements).
528
*
529
* Keep 256MB aside for PHBs with ISA.
530
*/
531
532
if (!isa_dn) {
533
/* No ISA/IDE - just set window size and return */
534
pci->phb->dma_window_size = 0x80000000ul; /* To be divided */
535
536
while (pci->phb->dma_window_size * children > 0x80000000ul)
537
pci->phb->dma_window_size >>= 1;
538
pr_debug("No ISA/IDE, window size is 0x%llx\n",
539
pci->phb->dma_window_size);
540
pci->phb->dma_window_base_cur = 0;
541
542
return;
543
}
544
545
/* If we have ISA, then we probably have an IDE
546
* controller too. Allocate a 128MB table but
547
* skip the first 128MB to avoid stepping on ISA
548
* space.
549
*/
550
pci->phb->dma_window_size = 0x8000000ul;
551
pci->phb->dma_window_base_cur = 0x8000000ul;
552
553
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
554
pci->phb->node);
555
556
iommu_table_setparms(pci->phb, dn, tbl);
557
pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
558
559
/* Divide the rest (1.75GB) among the children */
560
pci->phb->dma_window_size = 0x80000000ul;
561
while (pci->phb->dma_window_size * children > 0x70000000ul)
562
pci->phb->dma_window_size >>= 1;
563
564
pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
565
}
566
567
568
static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
569
{
570
struct iommu_table *tbl;
571
struct device_node *dn, *pdn;
572
struct pci_dn *ppci;
573
const void *dma_window = NULL;
574
575
dn = pci_bus_to_OF_node(bus);
576
577
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %s\n",
578
dn->full_name);
579
580
/* Find nearest ibm,dma-window, walking up the device tree */
581
for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
582
dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
583
if (dma_window != NULL)
584
break;
585
}
586
587
if (dma_window == NULL) {
588
pr_debug(" no ibm,dma-window property !\n");
589
return;
590
}
591
592
ppci = PCI_DN(pdn);
593
594
pr_debug(" parent is %s, iommu_table: 0x%p\n",
595
pdn->full_name, ppci->iommu_table);
596
597
if (!ppci->iommu_table) {
598
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
599
ppci->phb->node);
600
iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
601
ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
602
pr_debug(" created table: %p\n", ppci->iommu_table);
603
}
604
}
605
606
607
static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
608
{
609
struct device_node *dn;
610
struct iommu_table *tbl;
611
612
pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));
613
614
dn = dev->dev.of_node;
615
616
/* If we're the direct child of a root bus, then we need to allocate
617
* an iommu table ourselves. The bus setup code should have setup
618
* the window sizes already.
619
*/
620
if (!dev->bus->self) {
621
struct pci_controller *phb = PCI_DN(dn)->phb;
622
623
pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
624
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
625
phb->node);
626
iommu_table_setparms(phb, dn, tbl);
627
PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
628
set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
629
return;
630
}
631
632
/* If this device is further down the bus tree, search upwards until
633
* an already allocated iommu table is found and use that.
634
*/
635
636
while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL)
637
dn = dn->parent;
638
639
if (dn && PCI_DN(dn))
640
set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
641
else
642
printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
643
pci_name(dev));
644
}
645
646
static int __read_mostly disable_ddw;
647
648
static int __init disable_ddw_setup(char *str)
649
{
650
disable_ddw = 1;
651
printk(KERN_INFO "ppc iommu: disabling ddw.\n");
652
653
return 0;
654
}
655
656
early_param("disable_ddw", disable_ddw_setup);
657
658
static void remove_ddw(struct device_node *np)
659
{
660
struct dynamic_dma_window_prop *dwp;
661
struct property *win64;
662
const u32 *ddw_avail;
663
u64 liobn;
664
int len, ret;
665
666
ddw_avail = of_get_property(np, "ibm,ddw-applicable", &len);
667
win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
668
if (!win64)
669
return;
670
671
if (!ddw_avail || len < 3 * sizeof(u32) || win64->length < sizeof(*dwp))
672
goto delprop;
673
674
dwp = win64->value;
675
liobn = (u64)be32_to_cpu(dwp->liobn);
676
677
/* clear the whole window, note the arg is in kernel pages */
678
ret = tce_clearrange_multi_pSeriesLP(0,
679
1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
680
if (ret)
681
pr_warning("%s failed to clear tces in window.\n",
682
np->full_name);
683
else
684
pr_debug("%s successfully cleared tces in window.\n",
685
np->full_name);
686
687
ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
688
if (ret)
689
pr_warning("%s: failed to remove direct window: rtas returned "
690
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
691
np->full_name, ret, ddw_avail[2], liobn);
692
else
693
pr_debug("%s: successfully removed direct window: rtas returned "
694
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
695
np->full_name, ret, ddw_avail[2], liobn);
696
697
delprop:
698
ret = prom_remove_property(np, win64);
699
if (ret)
700
pr_warning("%s: failed to remove direct window property: %d\n",
701
np->full_name, ret);
702
}
703
704
static u64 find_existing_ddw(struct device_node *pdn)
705
{
706
struct direct_window *window;
707
const struct dynamic_dma_window_prop *direct64;
708
u64 dma_addr = 0;
709
710
spin_lock(&direct_window_list_lock);
711
/* check if we already created a window and dupe that config if so */
712
list_for_each_entry(window, &direct_window_list, list) {
713
if (window->device == pdn) {
714
direct64 = window->prop;
715
dma_addr = direct64->dma_base;
716
break;
717
}
718
}
719
spin_unlock(&direct_window_list_lock);
720
721
return dma_addr;
722
}
723
724
static int find_existing_ddw_windows(void)
725
{
726
int len;
727
struct device_node *pdn;
728
struct direct_window *window;
729
const struct dynamic_dma_window_prop *direct64;
730
731
if (!firmware_has_feature(FW_FEATURE_LPAR))
732
return 0;
733
734
for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
735
direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
736
if (!direct64)
737
continue;
738
739
window = kzalloc(sizeof(*window), GFP_KERNEL);
740
if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
741
kfree(window);
742
remove_ddw(pdn);
743
continue;
744
}
745
746
window->device = pdn;
747
window->prop = direct64;
748
spin_lock(&direct_window_list_lock);
749
list_add(&window->list, &direct_window_list);
750
spin_unlock(&direct_window_list_lock);
751
}
752
753
return 0;
754
}
755
machine_arch_initcall(pseries, find_existing_ddw_windows);
756
757
static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
758
struct ddw_query_response *query)
759
{
760
struct device_node *dn;
761
struct pci_dn *pcidn;
762
u32 cfg_addr;
763
u64 buid;
764
int ret;
765
766
/*
767
* Get the config address and phb buid of the PE window.
768
* Rely on eeh to retrieve this for us.
769
* Retrieve them from the pci device, not the node with the
770
* dma-window property
771
*/
772
dn = pci_device_to_OF_node(dev);
773
pcidn = PCI_DN(dn);
774
cfg_addr = pcidn->eeh_config_addr;
775
if (pcidn->eeh_pe_config_addr)
776
cfg_addr = pcidn->eeh_pe_config_addr;
777
buid = pcidn->phb->buid;
778
ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
779
cfg_addr, BUID_HI(buid), BUID_LO(buid));
780
dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
781
" returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),
782
BUID_LO(buid), ret);
783
return ret;
784
}
785
786
static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
787
struct ddw_create_response *create, int page_shift,
788
int window_shift)
789
{
790
struct device_node *dn;
791
struct pci_dn *pcidn;
792
u32 cfg_addr;
793
u64 buid;
794
int ret;
795
796
/*
797
* Get the config address and phb buid of the PE window.
798
* Rely on eeh to retrieve this for us.
799
* Retrieve them from the pci device, not the node with the
800
* dma-window property
801
*/
802
dn = pci_device_to_OF_node(dev);
803
pcidn = PCI_DN(dn);
804
cfg_addr = pcidn->eeh_config_addr;
805
if (pcidn->eeh_pe_config_addr)
806
cfg_addr = pcidn->eeh_pe_config_addr;
807
buid = pcidn->phb->buid;
808
809
do {
810
/* extra outputs are LIOBN and dma-addr (hi, lo) */
811
ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr,
812
BUID_HI(buid), BUID_LO(buid), page_shift, window_shift);
813
} while (rtas_busy_delay(ret));
814
dev_info(&dev->dev,
815
"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
816
"(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1],
817
cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
818
window_shift, ret, create->liobn, create->addr_hi, create->addr_lo);
819
820
return ret;
821
}
822
823
/*
824
* If the PE supports dynamic dma windows, and there is space for a table
825
* that can map all pages in a linear offset, then setup such a table,
826
* and record the dma-offset in the struct device.
827
*
828
* dev: the pci device we are checking
829
* pdn: the parent pe node with the ibm,dma_window property
830
* Future: also check if we can remap the base window for our base page size
831
*
832
* returns the dma offset for use by dma_set_mask
833
*/
834
static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
835
{
836
int len, ret;
837
struct ddw_query_response query;
838
struct ddw_create_response create;
839
int page_shift;
840
u64 dma_addr, max_addr;
841
struct device_node *dn;
842
const u32 *uninitialized_var(ddw_avail);
843
struct direct_window *window;
844
struct property *win64;
845
struct dynamic_dma_window_prop *ddwprop;
846
847
mutex_lock(&direct_window_init_mutex);
848
849
dma_addr = find_existing_ddw(pdn);
850
if (dma_addr != 0)
851
goto out_unlock;
852
853
/*
854
* the ibm,ddw-applicable property holds the tokens for:
855
* ibm,query-pe-dma-window
856
* ibm,create-pe-dma-window
857
* ibm,remove-pe-dma-window
858
* for the given node in that order.
859
* the property is actually in the parent, not the PE
860
*/
861
ddw_avail = of_get_property(pdn, "ibm,ddw-applicable", &len);
862
if (!ddw_avail || len < 3 * sizeof(u32))
863
goto out_unlock;
864
865
/*
866
* Query if there is a second window of size to map the
867
* whole partition. Query returns number of windows, largest
868
* block assigned to PE (partition endpoint), and two bitmasks
869
* of page sizes: supported and supported for migrate-dma.
870
*/
871
dn = pci_device_to_OF_node(dev);
872
ret = query_ddw(dev, ddw_avail, &query);
873
if (ret != 0)
874
goto out_unlock;
875
876
if (query.windows_available == 0) {
877
/*
878
* no additional windows are available for this device.
879
* We might be able to reallocate the existing window,
880
* trading in for a larger page size.
881
*/
882
dev_dbg(&dev->dev, "no free dynamic windows");
883
goto out_unlock;
884
}
885
if (query.page_size & 4) {
886
page_shift = 24; /* 16MB */
887
} else if (query.page_size & 2) {
888
page_shift = 16; /* 64kB */
889
} else if (query.page_size & 1) {
890
page_shift = 12; /* 4kB */
891
} else {
892
dev_dbg(&dev->dev, "no supported direct page size in mask %x",
893
query.page_size);
894
goto out_unlock;
895
}
896
/* verify the window * number of ptes will map the partition */
897
/* check largest block * page size > max memory hotplug addr */
898
max_addr = memory_hotplug_max();
899
if (query.largest_available_block < (max_addr >> page_shift)) {
900
dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u "
901
"%llu-sized pages\n", max_addr, query.largest_available_block,
902
1ULL << page_shift);
903
goto out_unlock;
904
}
905
len = order_base_2(max_addr);
906
win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
907
if (!win64) {
908
dev_info(&dev->dev,
909
"couldn't allocate property for 64bit dma window\n");
910
goto out_unlock;
911
}
912
win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
913
win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
914
win64->length = sizeof(*ddwprop);
915
if (!win64->name || !win64->value) {
916
dev_info(&dev->dev,
917
"couldn't allocate property name and value\n");
918
goto out_free_prop;
919
}
920
921
ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
922
if (ret != 0)
923
goto out_free_prop;
924
925
ddwprop->liobn = cpu_to_be32(create.liobn);
926
ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2));
927
ddwprop->tce_shift = cpu_to_be32(page_shift);
928
ddwprop->window_shift = cpu_to_be32(len);
929
930
dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n",
931
create.liobn, dn->full_name);
932
933
window = kzalloc(sizeof(*window), GFP_KERNEL);
934
if (!window)
935
goto out_clear_window;
936
937
ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
938
win64->value, tce_setrange_multi_pSeriesLP_walk);
939
if (ret) {
940
dev_info(&dev->dev, "failed to map direct window for %s: %d\n",
941
dn->full_name, ret);
942
goto out_clear_window;
943
}
944
945
ret = prom_add_property(pdn, win64);
946
if (ret) {
947
dev_err(&dev->dev, "unable to add dma window property for %s: %d",
948
pdn->full_name, ret);
949
goto out_clear_window;
950
}
951
952
window->device = pdn;
953
window->prop = ddwprop;
954
spin_lock(&direct_window_list_lock);
955
list_add(&window->list, &direct_window_list);
956
spin_unlock(&direct_window_list_lock);
957
958
dma_addr = of_read_number(&create.addr_hi, 2);
959
goto out_unlock;
960
961
out_clear_window:
962
remove_ddw(pdn);
963
964
out_free_prop:
965
kfree(win64->name);
966
kfree(win64->value);
967
kfree(win64);
968
969
out_unlock:
970
mutex_unlock(&direct_window_init_mutex);
971
return dma_addr;
972
}
973
974
static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
975
{
976
struct device_node *pdn, *dn;
977
struct iommu_table *tbl;
978
const void *dma_window = NULL;
979
struct pci_dn *pci;
980
981
pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));
982
983
/* dev setup for LPAR is a little tricky, since the device tree might
984
* contain the dma-window properties per-device and not necessarily
985
* for the bus. So we need to search upwards in the tree until we
986
* either hit a dma-window property, OR find a parent with a table
987
* already allocated.
988
*/
989
dn = pci_device_to_OF_node(dev);
990
pr_debug(" node is %s\n", dn->full_name);
991
992
for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
993
pdn = pdn->parent) {
994
dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
995
if (dma_window)
996
break;
997
}
998
999
if (!pdn || !PCI_DN(pdn)) {
1000
printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
1001
"no DMA window found for pci dev=%s dn=%s\n",
1002
pci_name(dev), dn? dn->full_name : "<null>");
1003
return;
1004
}
1005
pr_debug(" parent is %s\n", pdn->full_name);
1006
1007
pci = PCI_DN(pdn);
1008
if (!pci->iommu_table) {
1009
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
1010
pci->phb->node);
1011
iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
1012
pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
1013
pr_debug(" created table: %p\n", pci->iommu_table);
1014
} else {
1015
pr_debug(" found DMA window, table: %p\n", pci->iommu_table);
1016
}
1017
1018
set_iommu_table_base(&dev->dev, pci->iommu_table);
1019
}
1020
1021
/*
 * dma_set_mask hook for LPAR.  A 64-bit mask request on a PCI device
 * tries to enable a dynamic direct window (unless disable_ddw); anything
 * else, or a DDW failure, falls back to the 32-bit iommu ops.  Returns
 * 0 on success or -EIO when the mask cannot be satisfied.
 */
static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
{
	bool ddw_enabled = false;
	struct device_node *pdn, *dn;
	struct pci_dev *pdev;
	const void *dma_window = NULL;
	u64 dma_offset;

	if (!dev->dma_mask)
		return -EIO;

	if (!dev_is_pci(dev))
		goto check_mask;

	pdev = to_pci_dev(dev);

	/* only attempt to use a new window if 64-bit DMA is requested */
	if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) {
		dn = pci_device_to_OF_node(pdev);
		dev_dbg(dev, "node is %s\n", dn->full_name);

		/*
		 * the device tree might contain the dma-window properties
		 * per-device and not necessarily for the bus. So we need to
		 * search upwards in the tree until we either hit a dma-window
		 * property, OR find a parent with a table already allocated.
		 */
		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
		     pdn = pdn->parent) {
			dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
			if (dma_window)
				break;
		}
		if (pdn && PCI_DN(pdn)) {
			dma_offset = enable_ddw(pdev, pdn);
			if (dma_offset != 0) {
				dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset);
				set_dma_offset(dev, dma_offset);
				set_dma_ops(dev, &dma_direct_ops);
				ddw_enabled = true;
			}
		}
	}

	/* fall back on iommu ops, restore table pointer with ops */
	if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) {
		dev_info(dev, "Restoring 32-bit DMA via iommu\n");
		set_dma_ops(dev, &dma_iommu_ops);
		pci_dma_dev_setup_pSeriesLP(pdev);
	}

check_mask:
	if (!dma_supported(dev, dma_mask))
		return -EIO;

	*dev->dma_mask = dma_mask;
	return 0;
}
1079
1080
#else /* CONFIG_PCI */
/* No PCI support configured: stub out all the hooks installed below. */
#define pci_dma_bus_setup_pSeries	NULL
#define pci_dma_dev_setup_pSeries	NULL
#define pci_dma_bus_setup_pSeriesLP	NULL
#define pci_dma_dev_setup_pSeriesLP	NULL
#define dma_set_mask_pSeriesLP		NULL
#endif /* !CONFIG_PCI */
1087
1088
static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
1089
void *data)
1090
{
1091
struct direct_window *window;
1092
struct memory_notify *arg = data;
1093
int ret = 0;
1094
1095
switch (action) {
1096
case MEM_GOING_ONLINE:
1097
spin_lock(&direct_window_list_lock);
1098
list_for_each_entry(window, &direct_window_list, list) {
1099
ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
1100
arg->nr_pages, window->prop);
1101
/* XXX log error */
1102
}
1103
spin_unlock(&direct_window_list_lock);
1104
break;
1105
case MEM_CANCEL_ONLINE:
1106
case MEM_OFFLINE:
1107
spin_lock(&direct_window_list_lock);
1108
list_for_each_entry(window, &direct_window_list, list) {
1109
ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
1110
arg->nr_pages, window->prop);
1111
/* XXX log error */
1112
}
1113
spin_unlock(&direct_window_list_lock);
1114
break;
1115
default:
1116
break;
1117
}
1118
if (ret && action != MEM_CANCEL_ONLINE)
1119
return NOTIFY_BAD;
1120
1121
return NOTIFY_OK;
1122
}
1123
1124
static struct notifier_block iommu_mem_nb = {
1125
.notifier_call = iommu_mem_notifier,
1126
};
1127
1128
static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
1129
{
1130
int err = NOTIFY_OK;
1131
struct device_node *np = node;
1132
struct pci_dn *pci = PCI_DN(np);
1133
struct direct_window *window;
1134
1135
switch (action) {
1136
case PSERIES_RECONFIG_REMOVE:
1137
if (pci && pci->iommu_table)
1138
iommu_free_table(pci->iommu_table, np->full_name);
1139
1140
spin_lock(&direct_window_list_lock);
1141
list_for_each_entry(window, &direct_window_list, list) {
1142
if (window->device == np) {
1143
list_del(&window->list);
1144
kfree(window);
1145
break;
1146
}
1147
}
1148
spin_unlock(&direct_window_list_lock);
1149
1150
/*
1151
* Because the notifier runs after isolation of the
1152
* slot, we are guaranteed any DMA window has already
1153
* been revoked and the TCEs have been marked invalid,
1154
* so we don't need a call to remove_ddw(np). However,
1155
* if an additional notifier action is added before the
1156
* isolate call, we should update this code for
1157
* completeness with such a call.
1158
*/
1159
break;
1160
default:
1161
err = NOTIFY_DONE;
1162
break;
1163
}
1164
return err;
1165
}
1166
1167
static struct notifier_block iommu_reconfig_nb = {
1168
.notifier_call = iommu_reconfig_notifier,
1169
};
1170
1171
/* These are called very early. */
1172
void iommu_init_early_pSeries(void)
1173
{
1174
if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))
1175
return;
1176
1177
if (firmware_has_feature(FW_FEATURE_LPAR)) {
1178
if (firmware_has_feature(FW_FEATURE_MULTITCE)) {
1179
ppc_md.tce_build = tce_buildmulti_pSeriesLP;
1180
ppc_md.tce_free = tce_freemulti_pSeriesLP;
1181
} else {
1182
ppc_md.tce_build = tce_build_pSeriesLP;
1183
ppc_md.tce_free = tce_free_pSeriesLP;
1184
}
1185
ppc_md.tce_get = tce_get_pSeriesLP;
1186
ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
1187
ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
1188
ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
1189
} else {
1190
ppc_md.tce_build = tce_build_pSeries;
1191
ppc_md.tce_free = tce_free_pSeries;
1192
ppc_md.tce_get = tce_get_pseries;
1193
ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeries;
1194
ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeries;
1195
}
1196
1197
1198
pSeries_reconfig_notifier_register(&iommu_reconfig_nb);
1199
register_memory_notifier(&iommu_mem_nb);
1200
1201
set_pci_dma_ops(&dma_iommu_ops);
1202
}
1203
1204
static int __init disable_multitce(char *str)
1205
{
1206
if (strcmp(str, "off") == 0 &&
1207
firmware_has_feature(FW_FEATURE_LPAR) &&
1208
firmware_has_feature(FW_FEATURE_MULTITCE)) {
1209
printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
1210
ppc_md.tce_build = tce_build_pSeriesLP;
1211
ppc_md.tce_free = tce_free_pSeriesLP;
1212
powerpc_firmware_features &= ~FW_FEATURE_MULTITCE;
1213
}
1214
return 1;
1215
}
1216
1217
__setup("multitce=", disable_multitce);
1218
1219