GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/powerpc/mm/hugetlbpage.c
/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <[email protected]>
 */
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

#define MAX_NUMBER_GPAGES	1024
/* Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready. */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;

/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */

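/* Map a page shift to the corresponding index in mmu_psize_defs[],
 * or -1 if the hardware does not support that page size. */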
static inline int shift_to_mmu_psize(unsigned int shift)
{
	int psize;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
		if (mmu_psize_defs[psize].shift == shift)
			return psize;
	return -1;
}

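/* Inverse of shift_to_mmu_psize(); BUGs if the psize has no shift defined. */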
static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
	if (mmu_psize_defs[mmu_psize].shift)
		return mmu_psize_defs[mmu_psize].shift;
	BUG();
}

#define hugepd_none(hpd) ((hpd).pd == 0)

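/* Accessors for hugepd_t: the low bits of ->pd encode the huge page
 * shift, the remaining bits hold the kernel virtual address of the
 * hugepte table. */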
static inline pte_t *hugepd_page(hugepd_t hpd)
{
	BUG_ON(!hugepd_ok(hpd));
	return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
}

static inline unsigned int hugepd_shift(hugepd_t hpd)
{
	return hpd.pd & HUGEPD_SHIFT_MASK;
}

static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
{
	unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
	pte_t *dir = hugepd_page(*hpdp);

	return dir + idx;
}

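/* Walk the page tables for @ea, stopping early if a hugepd entry is
 * found at any level. Returns the (huge)pte pointer, or NULL if the
 * address is not mapped; if @shift is non-NULL it is set to the huge
 * page shift (0 for a normal base-page mapping). */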
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (shift)
		*shift = 0;

	pg = pgdir + pgd_index(ea);
	if (is_hugepd(pg)) {
		hpdp = (hugepd_t *)pg;
	} else if (!pgd_none(*pg)) {
		pdshift = PUD_SHIFT;
		pu = pud_offset(pg, ea);
		if (is_hugepd(pu))
			hpdp = (hugepd_t *)pu;
		else if (!pud_none(*pu)) {
			pdshift = PMD_SHIFT;
			pm = pmd_offset(pu, ea);
			if (is_hugepd(pm))
				hpdp = (hugepd_t *)pm;
			else if (!pmd_none(*pm)) {
				return pte_offset_map(pm, ea);
			}
		}
	}

	if (!hpdp)
		return NULL;

	if (shift)
		*shift = hugepd_shift(*hpdp);
	return hugepte_offset(hpdp, ea, pdshift);
}

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}

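/* Allocate a hugepte table and install it in *hpdp. Racing allocators
 * are resolved under mm->page_table_lock: if someone else has already
 * populated the entry, the new table is freed and theirs is kept. */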
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
				       GFP_KERNEL|__GFP_REPEAT);

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (! new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (!hugepd_none(*hpdp))
		kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
	else
		hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
	spin_unlock(&mm->page_table_lock);
	return 0;
}

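/* Find, or allocate on demand, the hugepte for @addr. The page-table
 * level that carries the hugepd entry depends on the huge page size:
 * sizes of at least PUD_SIZE hang off the pgd, sizes of at least
 * PMD_SIZE hang off a pud, and smaller sizes hang off a pmd. */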
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);

	pg = pgd_offset(mm, addr);
	if (pshift >= PUD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= PMD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}

	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(hpdp, addr, pdshift);
}

/* Build list of addresses of gigantic pages. This function is used in early
 * boot before the buddy or bootmem allocator is setup.
 */
void add_gpage(unsigned long addr, unsigned long page_size,
	       unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}

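/* PMD sharing of hugepage page tables is not implemented on powerpc. */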
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;
}

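/* Free the hugepte table referenced by *hpdp and clear the hugepd
 * entry, provided the [floor, ceiling) bounds show that the whole
 * range covered by this entry is being torn down. */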
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	unsigned shift = hugepd_shift(*hpdp);
	unsigned long pdmask = ~((1UL << pdshift) - 1);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (! ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	hpdp->pd = 0;
	tlb->need_flush = 1;
	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}

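/* Walk the pmd entries under @pud for [addr, end), freeing any hugepte
 * tables found there, then free the pmd page itself if the bounds
 * allow it. */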
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd))
			continue;
		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}

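/* Same walk one level up: handle both hugepd entries and normal puds
 * under @pgd, then free the pud page if the bounds allow it. */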
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (!is_hugepd(pud)) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above). Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers. That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (!is_hugepd(pgd)) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (pgd++, addr = next, addr != end);
}

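/* Resolve @address to the struct page of the huge page backing it, or
 * ERR_PTR(-EINVAL) if it is not mapped by a huge page. */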
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;
	unsigned shift;
	unsigned long mask;

	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);

	/* Verify it is a huge page else bail. */
	if (!ptep || !shift)
		return ERR_PTR(-EINVAL);

	mask = (1UL << shift) - 1;
	page = pte_page(*ptep);
	if (page)
		page += (address & mask) / PAGE_SIZE;

	return page;
}

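/* On powerpc, huge pages are always reached through hugepd pointers
 * rather than ordinary pmd/pud entries, so these generic hooks are
 * stubs and follow_huge_pmd() should never be reached. */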
int pmd_huge(pmd_t pmd)
{
	return 0;
}

int pud_huge(pud_t pud)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}

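/* get_user_pages_fast() helper: take speculative references on the
 * subpages of a single huge pte covering [addr, end). Returns 0 (fall
 * back to the slow path) if the pte lacks the required access bits or
 * the reference grab fails; drops the references again if the pte
 * changed while we were working. */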
static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
				unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = *ptep;
	mask = _PAGE_PRESENT | _PAGE_USER;
	if (write)
		mask |= _PAGE_RW;

	if ((pte_val(pte) & mask) != mask)
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		while (*nr) {
			put_page(page);
			(*nr)--;
		}
	}

	return 1;
}

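/* Clamp the end of the current hugepte to the end of the walk. */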
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

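/* Iterate over all hugeptes described by one hugepd entry during the
 * fast GUP walk, calling gup_hugepte() on each. */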
int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
	       unsigned long addr, unsigned long end,
	       int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(*hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

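/* Hugepage mappings are tied to the PPC64 address-space slice
 * mechanism: new mappings are placed in a slice of the right page
 * size, and a vma's page size is whatever its slice uses. */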
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

	return 1UL << mmu_psize_to_shift(psize);
}

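/* Validate a candidate huge page size and register an hstate for it. */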
static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (!is_power_of_2(size)
	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
		return -EINVAL;

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_SPU_FS_64K_LS
	/* Disable support for 64K huge pages when 64K SPU local store
	 * support is enabled as the current implementation conflicts.
	 */
	if (shift == PAGE_SHIFT_64K)
		return -EINVAL;
#endif /* CONFIG_SPU_FS_64K_LS */

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

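/* Parse the "hugepagesz=" kernel command line option. */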
static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0)
		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

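/* Register a pagetable cache for every huge page size the MMU supports
 * and pick the default huge page size (16M if available, else 1M). */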
static int __init hugetlbpage_init(void)
{
	int psize;

	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		if (shift < PMD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PUD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;

		pgtable_cache_add(pdshift - shift, NULL);
		if (!PGT_CACHE(pdshift - shift))
			panic("hugetlbpage_init(): could not create "
			      "pgtable cache for %d bit pagesize\n", shift);
	}

	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;

	return 0;
}

module_init(hugetlbpage_init);

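/* Flush the data and instruction caches for every subpage of a
 * compound huge page. */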
void flush_dcache_icache_hugepage(struct page *page)
{
	int i;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++)
		__flush_dcache_icache(page_address(page+i));
}