GitHub Repository: torvalds/linux
Path: blob/master/arch/powerpc/mm/book3s64/radix_pgtable.c
1
// SPDX-License-Identifier: GPL-2.0-or-later
2
/*
3
* Page table handling routines for radix page table.
4
*
5
* Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
6
*/
7
8
#define pr_fmt(fmt) "radix-mmu: " fmt
9
10
#include <linux/io.h>
11
#include <linux/kernel.h>
12
#include <linux/sched/mm.h>
13
#include <linux/memblock.h>
14
#include <linux/of.h>
15
#include <linux/of_fdt.h>
16
#include <linux/mm.h>
17
#include <linux/hugetlb.h>
18
#include <linux/string_helpers.h>
19
#include <linux/memory.h>
20
#include <linux/kfence.h>
21
22
#include <asm/pgalloc.h>
23
#include <asm/mmu_context.h>
24
#include <asm/dma.h>
25
#include <asm/machdep.h>
26
#include <asm/mmu.h>
27
#include <asm/firmware.h>
28
#include <asm/powernv.h>
29
#include <asm/sections.h>
30
#include <asm/smp.h>
31
#include <asm/trace.h>
32
#include <asm/uaccess.h>
33
#include <asm/ultravisor.h>
34
#include <asm/set_memory.h>
35
#include <asm/kfence.h>
36
37
#include <trace/events/thp.h>
38
39
#include <mm/mmu_decl.h>
40
41
unsigned int mmu_base_pid;
42
43
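/*
 * Boot-time page table allocator: grab @size bytes (aligned to @size) from
 * memblock, optionally constrained to [@region_start, @region_end) and to
 * node @nid, and panic if the allocation fails. Only used before the slab
 * allocator is available.
 */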
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
44
unsigned long region_start, unsigned long region_end)
45
{
46
phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
47
phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
48
void *ptr;
49
50
if (region_start)
51
min_addr = region_start;
52
if (region_end)
53
max_addr = region_end;
54
55
ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
56
57
if (!ptr)
58
panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
59
__func__, size, size, nid, &min_addr, &max_addr);
60
61
return ptr;
62
}
63
64
/*
65
* When allocating pud or pmd pointers, we allocate a complete page
66
* of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
67
* is to ensure that the page obtained from the memblock allocator
68
* can be completely used as a page table page and can be freed
69
* correctly when the page table entries are removed.
70
*/
71
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
72
pgprot_t flags,
73
unsigned int map_page_size,
74
int nid,
75
unsigned long region_start, unsigned long region_end)
76
{
77
unsigned long pfn = pa >> PAGE_SHIFT;
78
pgd_t *pgdp;
79
p4d_t *p4dp;
80
pud_t *pudp;
81
pmd_t *pmdp;
82
pte_t *ptep;
83
84
pgdp = pgd_offset_k(ea);
85
p4dp = p4d_offset(pgdp, ea);
86
if (p4d_none(*p4dp)) {
87
pudp = early_alloc_pgtable(PAGE_SIZE, nid,
88
region_start, region_end);
89
p4d_populate(&init_mm, p4dp, pudp);
90
}
91
pudp = pud_offset(p4dp, ea);
92
if (map_page_size == PUD_SIZE) {
93
ptep = (pte_t *)pudp;
94
goto set_the_pte;
95
}
96
if (pud_none(*pudp)) {
97
pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
98
region_end);
99
pud_populate(&init_mm, pudp, pmdp);
100
}
101
pmdp = pmd_offset(pudp, ea);
102
if (map_page_size == PMD_SIZE) {
103
ptep = pmdp_ptep(pmdp);
104
goto set_the_pte;
105
}
106
if (!pmd_present(*pmdp)) {
107
ptep = early_alloc_pgtable(PAGE_SIZE, nid,
108
region_start, region_end);
109
pmd_populate_kernel(&init_mm, pmdp, ptep);
110
}
111
ptep = pte_offset_kernel(pmdp, ea);
112
113
set_the_pte:
114
set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
115
asm volatile("ptesync": : :"memory");
116
return 0;
117
}
118
119
/*
120
* nid, region_start, and region_end are hints to try to place the page
121
* table memory in the same node or region.
122
*/
123
static int __map_kernel_page(unsigned long ea, unsigned long pa,
124
pgprot_t flags,
125
unsigned int map_page_size,
126
int nid,
127
unsigned long region_start, unsigned long region_end)
128
{
129
unsigned long pfn = pa >> PAGE_SHIFT;
130
pgd_t *pgdp;
131
p4d_t *p4dp;
132
pud_t *pudp;
133
pmd_t *pmdp;
134
pte_t *ptep;
135
/*
136
* Make sure task size is correct as per the max addr
137
*/
138
BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
139
140
#ifdef CONFIG_PPC_64K_PAGES
141
BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
142
#endif
143
144
if (unlikely(!slab_is_available()))
145
return early_map_kernel_page(ea, pa, flags, map_page_size,
146
nid, region_start, region_end);
147
148
/*
149
* Should make page table allocation functions be able to take a
150
* node, so we can place kernel page tables on the right nodes after
151
* boot.
152
*/
153
pgdp = pgd_offset_k(ea);
154
p4dp = p4d_offset(pgdp, ea);
155
pudp = pud_alloc(&init_mm, p4dp, ea);
156
if (!pudp)
157
return -ENOMEM;
158
if (map_page_size == PUD_SIZE) {
159
ptep = (pte_t *)pudp;
160
goto set_the_pte;
161
}
162
pmdp = pmd_alloc(&init_mm, pudp, ea);
163
if (!pmdp)
164
return -ENOMEM;
165
if (map_page_size == PMD_SIZE) {
166
ptep = pmdp_ptep(pmdp);
167
goto set_the_pte;
168
}
169
ptep = pte_alloc_kernel(pmdp, ea);
170
if (!ptep)
171
return -ENOMEM;
172
173
set_the_pte:
174
set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
175
asm volatile("ptesync": : :"memory");
176
return 0;
177
}
178
179
int radix__map_kernel_page(unsigned long ea, unsigned long pa,
180
pgprot_t flags,
181
unsigned int map_page_size)
182
{
183
return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
184
}
185
186
#ifdef CONFIG_STRICT_KERNEL_RWX
187
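/*
 * Walk the kernel page tables for [start, end) and clear the @clear PTE
 * bits on every mapping found (leaf PUD/PMD entries are updated in place),
 * then flush the kernel TLB for the range.
 */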
static void radix__change_memory_range(unsigned long start, unsigned long end,
188
unsigned long clear)
189
{
190
unsigned long idx;
191
pgd_t *pgdp;
192
p4d_t *p4dp;
193
pud_t *pudp;
194
pmd_t *pmdp;
195
pte_t *ptep;
196
197
start = ALIGN_DOWN(start, PAGE_SIZE);
198
end = PAGE_ALIGN(end); // aligns up
199
200
pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
201
start, end, clear);
202
203
for (idx = start; idx < end; idx += PAGE_SIZE) {
204
pgdp = pgd_offset_k(idx);
205
p4dp = p4d_offset(pgdp, idx);
206
pudp = pud_alloc(&init_mm, p4dp, idx);
207
if (!pudp)
208
continue;
209
if (pud_leaf(*pudp)) {
210
ptep = (pte_t *)pudp;
211
goto update_the_pte;
212
}
213
pmdp = pmd_alloc(&init_mm, pudp, idx);
214
if (!pmdp)
215
continue;
216
if (pmd_leaf(*pmdp)) {
217
ptep = pmdp_ptep(pmdp);
218
goto update_the_pte;
219
}
220
ptep = pte_alloc_kernel(pmdp, idx);
221
if (!ptep)
222
continue;
223
update_the_pte:
224
radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
225
}
226
227
radix__flush_tlb_kernel_range(start, end);
228
}
229
230
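/*
 * Strip write permission from kernel text and rodata, including any
 * interrupt vector pages that live below _stext when the kernel runs
 * relocated.
 */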
void radix__mark_rodata_ro(void)
231
{
232
unsigned long start, end;
233
234
start = (unsigned long)_stext;
235
end = (unsigned long)__end_rodata;
236
237
radix__change_memory_range(start, end, _PAGE_WRITE);
238
239
for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
240
end = start + PAGE_SIZE;
241
if (overlaps_interrupt_vector_text(start, end))
242
radix__change_memory_range(start, end, _PAGE_WRITE);
243
else
244
break;
245
}
246
}
247
248
void radix__mark_initmem_nx(void)
249
{
250
unsigned long start = (unsigned long)__init_begin;
251
unsigned long end = (unsigned long)__init_end;
252
253
radix__change_memory_range(start, end, _PAGE_EXEC);
254
}
255
#endif /* CONFIG_STRICT_KERNEL_RWX */
256
257
static inline void __meminit
258
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
259
{
260
char buf[10];
261
262
if (end <= start)
263
return;
264
265
string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
266
267
pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
268
exec ? " (exec)" : "");
269
}
270
271
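/*
 * Return the next physical address at which the required linear-map
 * permissions may change (end of the interrupt vectors, start of relocated
 * kernel text, the strict-RWX boundary), so callers never create a large
 * page that straddles such a boundary. Without CONFIG_STRICT_KERNEL_RWX
 * this is simply @end.
 */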
static unsigned long next_boundary(unsigned long addr, unsigned long end)
272
{
273
#ifdef CONFIG_STRICT_KERNEL_RWX
274
unsigned long stext_phys;
275
276
stext_phys = __pa_symbol(_stext);
277
278
// Relocatable kernel running at non-zero real address
279
if (stext_phys != 0) {
280
// The end of interrupts code at zero is a rodata boundary
281
unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
282
if (addr < end_intr)
283
return end_intr;
284
285
// Start of relocated kernel text is a rodata boundary
286
if (addr < stext_phys)
287
return stext_phys;
288
}
289
290
if (addr < __pa_symbol(__srwx_boundary))
291
return __pa_symbol(__srwx_boundary);
292
#endif
293
return end;
294
}
295
296
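/*
 * Create the linear mapping for the physical range [start, end), using the
 * largest page size (1G, 2M or base) that alignment, the distance to the
 * next permission boundary and @mapping_sz_limit allow. Ranges overlapping
 * kernel text or the interrupt vectors are mapped executable.
 */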
static int __meminit create_physical_mapping(unsigned long start,
297
unsigned long end,
298
int nid, pgprot_t _prot,
299
unsigned long mapping_sz_limit)
300
{
301
unsigned long vaddr, addr, mapping_size = 0;
302
bool prev_exec, exec = false;
303
pgprot_t prot;
304
int psize;
305
unsigned long max_mapping_size = memory_block_size;
306
307
if (mapping_sz_limit < max_mapping_size)
308
max_mapping_size = mapping_sz_limit;
309
310
if (debug_pagealloc_enabled())
311
max_mapping_size = PAGE_SIZE;
312
313
start = ALIGN(start, PAGE_SIZE);
314
end = ALIGN_DOWN(end, PAGE_SIZE);
315
for (addr = start; addr < end; addr += mapping_size) {
316
unsigned long gap, previous_size;
317
int rc;
318
319
gap = next_boundary(addr, end) - addr;
320
if (gap > max_mapping_size)
321
gap = max_mapping_size;
322
previous_size = mapping_size;
323
prev_exec = exec;
324
325
if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
326
mmu_psize_defs[MMU_PAGE_1G].shift) {
327
mapping_size = PUD_SIZE;
328
psize = MMU_PAGE_1G;
329
} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
330
mmu_psize_defs[MMU_PAGE_2M].shift) {
331
mapping_size = PMD_SIZE;
332
psize = MMU_PAGE_2M;
333
} else {
334
mapping_size = PAGE_SIZE;
335
psize = mmu_virtual_psize;
336
}
337
338
vaddr = (unsigned long)__va(addr);
339
340
if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
341
overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
342
prot = PAGE_KERNEL_X;
343
exec = true;
344
} else {
345
prot = _prot;
346
exec = false;
347
}
348
349
if (mapping_size != previous_size || exec != prev_exec) {
350
print_mapping(start, addr, previous_size, prev_exec);
351
start = addr;
352
}
353
354
rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
355
if (rc)
356
return rc;
357
358
update_page_count(psize, 1);
359
}
360
361
print_mapping(start, addr, mapping_size, exec);
362
return 0;
363
}
364
365
#ifdef CONFIG_KFENCE
366
static __init phys_addr_t alloc_kfence_pool(void)
367
{
368
phys_addr_t kfence_pool;
369
370
/*
371
* TODO: Support to enable KFENCE after bootup depends on the ability to
372
* split page table mappings. As such support is not currently
373
* implemented for radix pagetables, support enabling KFENCE
374
* only at system startup for now.
375
*
376
* After support for splitting mappings is available on radix,
377
* alloc_kfence_pool() & map_kfence_pool() can be dropped and
378
* mapping for __kfence_pool memory can be
379
* split during arch_kfence_init_pool().
380
*/
381
if (!kfence_early_init)
382
goto no_kfence;
383
384
kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
385
if (!kfence_pool)
386
goto no_kfence;
387
388
memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
389
return kfence_pool;
390
391
no_kfence:
392
disable_kfence();
393
return 0;
394
}
395
396
static __init void map_kfence_pool(phys_addr_t kfence_pool)
397
{
398
if (!kfence_pool)
399
return;
400
401
if (create_physical_mapping(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
402
-1, PAGE_KERNEL, PAGE_SIZE))
403
goto err;
404
405
memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
406
__kfence_pool = __va(kfence_pool);
407
return;
408
409
err:
410
memblock_phys_free(kfence_pool, KFENCE_POOL_SIZE);
411
disable_kfence();
412
}
413
#else
414
static inline phys_addr_t alloc_kfence_pool(void) { return 0; }
415
static inline void map_kfence_pool(phys_addr_t kfence_pool) { }
416
#endif
417
418
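/*
 * Boot-time radix setup: build the linear mapping for every memblock
 * range, map the KFENCE pool if one was reserved, allocate the process
 * table and point entry 0 at the kernel page tables, and reserve the
 * guard PID for init_mm.
 */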
static void __init radix_init_pgtable(void)
419
{
420
phys_addr_t kfence_pool;
421
unsigned long rts_field;
422
phys_addr_t start, end;
423
u64 i;
424
425
/* We don't support slb for radix */
426
slb_set_size(0);
427
428
kfence_pool = alloc_kfence_pool();
429
430
/*
431
* Create the linear mapping
432
*/
433
for_each_mem_range(i, &start, &end) {
434
/*
435
* The memblock allocator is up at this point, so the
436
* page tables will be allocated within the range. No
437
* need for a node (which we don't have yet).
438
*/
439
440
if (end >= RADIX_VMALLOC_START) {
441
pr_warn("Outside the supported range\n");
442
continue;
443
}
444
445
WARN_ON(create_physical_mapping(start, end,
446
-1, PAGE_KERNEL, ~0UL));
447
}
448
449
map_kfence_pool(kfence_pool);
450
451
if (!cpu_has_feature(CPU_FTR_HVMODE) &&
452
cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
453
/*
454
* Older versions of KVM on these machines prefer if the
455
* guest only uses the low 19 PID bits.
456
*/
457
mmu_pid_bits = 19;
458
}
459
mmu_base_pid = 1;
460
461
/*
462
* Allocate Partition table and process table for the
463
* host.
464
*/
465
BUG_ON(PRTB_SIZE_SHIFT > 36);
466
process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
467
/*
468
* Fill in the process table.
469
*/
470
rts_field = radix__get_tree_size();
471
process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
472
473
/*
474
* The init_mm context is given the first available (non-zero) PID,
475
* which is the "guard PID" and contains no page table. PIDR should
476
* never be set to zero because that duplicates the kernel address
477
* space at the 0x0... offset (quadrant 0)!
478
*
479
* An arbitrary PID that may later be allocated by the PID allocator
480
* for userspace processes must not be used either, because that
481
* would cause stale user mappings for that PID on CPUs outside of
482
* the TLB invalidation scheme (because it won't be in mm_cpumask).
483
*
484
* So permanently carve out one PID for the purpose of a guard PID.
485
*/
486
init_mm.context.id = mmu_base_pid;
487
mmu_base_pid++;
488
}
489
490
static void __init radix_init_partition_table(void)
491
{
492
unsigned long rts_field, dw0, dw1;
493
494
mmu_partition_table_init();
495
rts_field = radix__get_tree_size();
496
dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
497
dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
498
mmu_partition_table_set_entry(0, dw0, dw1, false);
499
500
pr_info("Initializing Radix MMU\n");
501
}
502
503
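/*
 * Map a radix page-size shift from the device tree (0xc = 4K, 0x10 = 64K,
 * 0x15 = 2M, 0x1e = 1G) to the corresponding MMU_PAGE_* index, or -1 if
 * the size is not supported.
 */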
static int __init get_idx_from_shift(unsigned int shift)
504
{
505
int idx = -1;
506
507
switch (shift) {
508
case 0xc:
509
idx = MMU_PAGE_4K;
510
break;
511
case 0x10:
512
idx = MMU_PAGE_64K;
513
break;
514
case 0x15:
515
idx = MMU_PAGE_2M;
516
break;
517
case 0x1e:
518
idx = MMU_PAGE_1G;
519
break;
520
}
521
return idx;
522
}
523
524
static int __init radix_dt_scan_page_sizes(unsigned long node,
525
const char *uname, int depth,
526
void *data)
527
{
528
int size = 0;
529
int shift, idx;
530
unsigned int ap;
531
const __be32 *prop;
532
const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
533
534
/* We are scanning "cpu" nodes only */
535
if (type == NULL || strcmp(type, "cpu") != 0)
536
return 0;
537
538
/* Grab page size encodings */
539
prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
540
if (!prop)
541
return 0;
542
543
pr_info("Page sizes from device-tree:\n");
544
for (; size >= 4; size -= 4, ++prop) {
545
546
struct mmu_psize_def *def;
547
548
/* top 3 bits are the AP encoding */
549
shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
550
ap = be32_to_cpu(prop[0]) >> 29;
551
pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
552
553
idx = get_idx_from_shift(shift);
554
if (idx < 0)
555
continue;
556
557
def = &mmu_psize_defs[idx];
558
def->shift = shift;
559
def->ap = ap;
560
def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
561
}
562
563
/* needed ? */
564
cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
565
return 1;
566
}
567
568
void __init radix__early_init_devtree(void)
569
{
570
int rc;
571
572
/*
573
* Try to find the available page sizes in the device-tree
574
*/
575
rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
576
if (!rc) {
577
/*
578
* No page size details found in device tree.
579
* Let's assume we have 4K and 64K page support
580
*/
581
mmu_psize_defs[MMU_PAGE_4K].shift = 12;
582
mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
583
mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
584
psize_to_rpti_pgsize(MMU_PAGE_4K);
585
586
mmu_psize_defs[MMU_PAGE_64K].shift = 16;
587
mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
588
mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
589
psize_to_rpti_pgsize(MMU_PAGE_64K);
590
}
591
return;
592
}
593
594
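/*
 * Boot CPU radix setup: select the page table geometry and kernel virtual
 * address layout, build the kernel page tables, initialise the partition
 * table when running bare metal (otherwise call radix_init_pseries()),
 * and switch to the guard PID with a full TLB flush.
 */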
void __init radix__early_init_mmu(void)
595
{
596
unsigned long lpcr;
597
598
#ifdef CONFIG_PPC_64S_HASH_MMU
599
#ifdef CONFIG_PPC_64K_PAGES
600
/* PAGE_SIZE mappings */
601
mmu_virtual_psize = MMU_PAGE_64K;
602
#else
603
mmu_virtual_psize = MMU_PAGE_4K;
604
#endif
605
#endif
606
/*
607
* initialize page table size
608
*/
609
__pte_index_size = RADIX_PTE_INDEX_SIZE;
610
__pmd_index_size = RADIX_PMD_INDEX_SIZE;
611
__pud_index_size = RADIX_PUD_INDEX_SIZE;
612
__pgd_index_size = RADIX_PGD_INDEX_SIZE;
613
__pud_cache_index = RADIX_PUD_INDEX_SIZE;
614
__pte_table_size = RADIX_PTE_TABLE_SIZE;
615
__pmd_table_size = RADIX_PMD_TABLE_SIZE;
616
__pud_table_size = RADIX_PUD_TABLE_SIZE;
617
__pgd_table_size = RADIX_PGD_TABLE_SIZE;
618
619
__pmd_val_bits = RADIX_PMD_VAL_BITS;
620
__pud_val_bits = RADIX_PUD_VAL_BITS;
621
__pgd_val_bits = RADIX_PGD_VAL_BITS;
622
623
__kernel_virt_start = RADIX_KERN_VIRT_START;
624
__vmalloc_start = RADIX_VMALLOC_START;
625
__vmalloc_end = RADIX_VMALLOC_END;
626
__kernel_io_start = RADIX_KERN_IO_START;
627
__kernel_io_end = RADIX_KERN_IO_END;
628
vmemmap = (struct page *)RADIX_VMEMMAP_START;
629
ioremap_bot = IOREMAP_BASE;
630
631
#ifdef CONFIG_PCI
632
pci_io_base = ISA_IO_BASE;
633
#endif
634
__pte_frag_nr = RADIX_PTE_FRAG_NR;
635
__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
636
__pmd_frag_nr = RADIX_PMD_FRAG_NR;
637
__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
638
639
radix_init_pgtable();
640
641
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
642
lpcr = mfspr(SPRN_LPCR);
643
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
644
radix_init_partition_table();
645
} else {
646
radix_init_pseries();
647
}
648
649
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
650
651
/* Switch to the guard PID before turning on MMU */
652
radix__switch_mmu_context(NULL, &init_mm);
653
tlbiel_all();
654
}
655
656
void radix__early_init_mmu_secondary(void)
657
{
658
unsigned long lpcr;
659
/*
660
* update partition table control register and UPRT
661
*/
662
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
663
lpcr = mfspr(SPRN_LPCR);
664
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
665
666
set_ptcr_when_no_uv(__pa(partition_tb) |
667
(PATB_SIZE_SHIFT - 12));
668
}
669
670
radix__switch_mmu_context(NULL, &init_mm);
671
tlbiel_all();
672
673
/* Make sure userspace can't change the AMR */
674
mtspr(SPRN_UAMOR, 0);
675
}
676
677
/* Called during kexec sequence with MMU off */
678
notrace void radix__mmu_cleanup_all(void)
679
{
680
unsigned long lpcr;
681
682
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
683
lpcr = mfspr(SPRN_LPCR);
684
mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
685
set_ptcr_when_no_uv(0);
686
powernv_set_nmmu_ptcr(0);
687
radix__flush_tlb_all();
688
}
689
}
690
691
#ifdef CONFIG_MEMORY_HOTPLUG
692
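/*
 * free_{pte,pmd,pud}_table(): if every entry in a page table page is none,
 * free the page and clear the entry in the level above. Used by
 * remove_pagetable() when tearing down mappings on memory hot-unplug.
 */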
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
693
{
694
pte_t *pte;
695
int i;
696
697
for (i = 0; i < PTRS_PER_PTE; i++) {
698
pte = pte_start + i;
699
if (!pte_none(*pte))
700
return;
701
}
702
703
pte_free_kernel(&init_mm, pte_start);
704
pmd_clear(pmd);
705
}
706
707
static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
708
{
709
pmd_t *pmd;
710
int i;
711
712
for (i = 0; i < PTRS_PER_PMD; i++) {
713
pmd = pmd_start + i;
714
if (!pmd_none(*pmd))
715
return;
716
}
717
718
pmd_free(&init_mm, pmd_start);
719
pud_clear(pud);
720
}
721
722
static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
723
{
724
pud_t *pud;
725
int i;
726
727
for (i = 0; i < PTRS_PER_PUD; i++) {
728
pud = pud_start + i;
729
if (!pud_none(*pud))
730
return;
731
}
732
733
pud_free(&init_mm, pud_start);
734
p4d_clear(p4d);
735
}
736
737
#ifdef CONFIG_SPARSEMEM_VMEMMAP
738
static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
739
{
740
unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
741
742
return !vmemmap_populated(start, PMD_SIZE);
743
}
744
745
static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
746
{
747
unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
748
749
return !vmemmap_populated(start, PAGE_SIZE);
750
751
}
752
#endif
753
754
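/*
 * Hand vmemmap backing pages back to where they came from: the device
 * altmap when the page lies inside it, memblock for reserved pages, or
 * the buddy allocator otherwise.
 */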
static void __meminit free_vmemmap_pages(struct page *page,
755
struct vmem_altmap *altmap,
756
int order)
757
{
758
unsigned int nr_pages = 1 << order;
759
760
if (altmap) {
761
unsigned long alt_start, alt_end;
762
unsigned long base_pfn = page_to_pfn(page);
763
764
/*
765
* With 2M vmemmap mapping we can have things set up
766
* such that even though altmap is specified we never
767
* use altmap.
768
*/
769
alt_start = altmap->base_pfn;
770
alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
771
772
if (base_pfn >= alt_start && base_pfn < alt_end) {
773
vmem_altmap_free(altmap, nr_pages);
774
return;
775
}
776
}
777
778
if (PageReserved(page)) {
779
/* allocated from memblock */
780
while (nr_pages--)
781
free_reserved_page(page++);
782
} else
783
free_pages((unsigned long)page_address(page), order);
784
}
785
786
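/*
 * remove_{pte,pmd,pud}_table()/remove_pagetable(): unmap [addr, end) and
 * free the intermediate page table pages. For the direct (linear) map only
 * the mapped-page counts are adjusted; for vmemmap mappings the backing
 * pages are freed as well.
 */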
static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
787
unsigned long end, bool direct,
788
struct vmem_altmap *altmap)
789
{
790
unsigned long next, pages = 0;
791
pte_t *pte;
792
793
pte = pte_start + pte_index(addr);
794
for (; addr < end; addr = next, pte++) {
795
next = (addr + PAGE_SIZE) & PAGE_MASK;
796
if (next > end)
797
next = end;
798
799
if (!pte_present(*pte))
800
continue;
801
802
if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
803
if (!direct)
804
free_vmemmap_pages(pte_page(*pte), altmap, 0);
805
pte_clear(&init_mm, addr, pte);
806
pages++;
807
}
808
#ifdef CONFIG_SPARSEMEM_VMEMMAP
809
else if (!direct && vmemmap_page_is_unused(addr, next)) {
810
free_vmemmap_pages(pte_page(*pte), altmap, 0);
811
pte_clear(&init_mm, addr, pte);
812
}
813
#endif
814
}
815
if (direct)
816
update_page_count(mmu_virtual_psize, -pages);
817
}
818
819
static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
820
unsigned long end, bool direct,
821
struct vmem_altmap *altmap)
822
{
823
unsigned long next, pages = 0;
824
pte_t *pte_base;
825
pmd_t *pmd;
826
827
pmd = pmd_start + pmd_index(addr);
828
for (; addr < end; addr = next, pmd++) {
829
next = pmd_addr_end(addr, end);
830
831
if (!pmd_present(*pmd))
832
continue;
833
834
if (pmd_leaf(*pmd)) {
835
if (IS_ALIGNED(addr, PMD_SIZE) &&
836
IS_ALIGNED(next, PMD_SIZE)) {
837
if (!direct)
838
free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
839
pte_clear(&init_mm, addr, (pte_t *)pmd);
840
pages++;
841
}
842
#ifdef CONFIG_SPARSEMEM_VMEMMAP
843
else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
844
free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
845
pte_clear(&init_mm, addr, (pte_t *)pmd);
846
}
847
#endif
848
continue;
849
}
850
851
pte_base = (pte_t *)pmd_page_vaddr(*pmd);
852
remove_pte_table(pte_base, addr, next, direct, altmap);
853
free_pte_table(pte_base, pmd);
854
}
855
if (direct)
856
update_page_count(MMU_PAGE_2M, -pages);
857
}
858
859
static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
860
unsigned long end, bool direct,
861
struct vmem_altmap *altmap)
862
{
863
unsigned long next, pages = 0;
864
pmd_t *pmd_base;
865
pud_t *pud;
866
867
pud = pud_start + pud_index(addr);
868
for (; addr < end; addr = next, pud++) {
869
next = pud_addr_end(addr, end);
870
871
if (!pud_present(*pud))
872
continue;
873
874
if (pud_leaf(*pud)) {
875
if (!IS_ALIGNED(addr, PUD_SIZE) ||
876
!IS_ALIGNED(next, PUD_SIZE)) {
877
WARN_ONCE(1, "%s: unaligned range\n", __func__);
878
continue;
879
}
880
pte_clear(&init_mm, addr, (pte_t *)pud);
881
pages++;
882
continue;
883
}
884
885
pmd_base = pud_pgtable(*pud);
886
remove_pmd_table(pmd_base, addr, next, direct, altmap);
887
free_pmd_table(pmd_base, pud);
888
}
889
if (direct)
890
update_page_count(MMU_PAGE_1G, -pages);
891
}
892
893
static void __meminit
894
remove_pagetable(unsigned long start, unsigned long end, bool direct,
895
struct vmem_altmap *altmap)
896
{
897
unsigned long addr, next;
898
pud_t *pud_base;
899
pgd_t *pgd;
900
p4d_t *p4d;
901
902
spin_lock(&init_mm.page_table_lock);
903
904
for (addr = start; addr < end; addr = next) {
905
next = pgd_addr_end(addr, end);
906
907
pgd = pgd_offset_k(addr);
908
p4d = p4d_offset(pgd, addr);
909
if (!p4d_present(*p4d))
910
continue;
911
912
if (p4d_leaf(*p4d)) {
913
if (!IS_ALIGNED(addr, P4D_SIZE) ||
914
!IS_ALIGNED(next, P4D_SIZE)) {
915
WARN_ONCE(1, "%s: unaligned range\n", __func__);
916
continue;
917
}
918
919
pte_clear(&init_mm, addr, (pte_t *)pgd);
920
continue;
921
}
922
923
pud_base = p4d_pgtable(*p4d);
924
remove_pud_table(pud_base, addr, next, direct, altmap);
925
free_pud_table(pud_base, p4d);
926
}
927
928
spin_unlock(&init_mm.page_table_lock);
929
radix__flush_tlb_kernel_range(start, end);
930
}
931
932
int __meminit radix__create_section_mapping(unsigned long start,
933
unsigned long end, int nid,
934
pgprot_t prot)
935
{
936
if (end >= RADIX_VMALLOC_START) {
937
pr_warn("Outside the supported range\n");
938
return -1;
939
}
940
941
return create_physical_mapping(__pa(start), __pa(end),
942
nid, prot, ~0UL);
943
}
944
945
int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
946
{
947
remove_pagetable(start, end, true, NULL);
948
return 0;
949
}
950
#endif /* CONFIG_MEMORY_HOTPLUG */
951
952
#ifdef CONFIG_SPARSEMEM_VMEMMAP
953
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
954
pgprot_t flags, unsigned int map_page_size,
955
int nid)
956
{
957
return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
958
}
959
960
int __meminit radix__vmemmap_create_mapping(unsigned long start,
961
unsigned long page_size,
962
unsigned long phys)
963
{
964
/* Create a PTE encoding */
965
int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
966
int ret;
967
968
if ((start + page_size) >= RADIX_VMEMMAP_END) {
969
pr_warn("Outside the supported range\n");
970
return -1;
971
}
972
973
ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
974
BUG_ON(ret);
975
976
return 0;
977
}
978
979
#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
980
bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
981
{
982
if (radix_enabled())
983
return __vmemmap_can_optimize(altmap, pgmap);
984
985
return false;
986
}
987
#endif
988
989
int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
990
unsigned long addr, unsigned long next)
991
{
992
int large = pmd_leaf(*pmdp);
993
994
if (large)
995
vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
996
997
return large;
998
}
999
1000
void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
1001
unsigned long addr, unsigned long next)
1002
{
1003
pte_t entry;
1004
pte_t *ptep = pmdp_ptep(pmdp);
1005
1006
VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
1007
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
1008
set_pte_at(&init_mm, addr, ptep, entry);
1009
asm volatile("ptesync": : :"memory");
1010
1011
vmemmap_verify(ptep, node, addr, next);
1012
}
1013
1014
static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
1015
int node,
1016
struct vmem_altmap *altmap,
1017
struct page *reuse)
1018
{
1019
pte_t *pte = pte_offset_kernel(pmdp, addr);
1020
1021
if (pte_none(*pte)) {
1022
pte_t entry;
1023
void *p;
1024
1025
if (!reuse) {
1026
/*
1027
* make sure we don't create altmap mappings
1028
* covering things outside the device.
1029
*/
1030
if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
1031
altmap = NULL;
1032
1033
p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
1034
if (!p && altmap)
1035
p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
1036
if (!p)
1037
return NULL;
1038
pr_debug("PAGE_SIZE vmemmap mapping\n");
1039
} else {
1040
/*
1041
* When a PTE/PMD entry is freed from the init_mm
1042
* there's a free_pages() call to this page allocated
1043
* above. Thus this get_page() is paired with the
1044
* put_page_testzero() on the freeing path.
1045
* This can only be called by certain ZONE_DEVICE paths,
1046
* and through vmemmap_populate_compound_pages() when
1047
* slab is available.
1048
*/
1049
get_page(reuse);
1050
p = page_to_virt(reuse);
1051
pr_debug("Tail page reuse vmemmap mapping\n");
1052
}
1053
1054
VM_BUG_ON(!PAGE_ALIGNED(addr));
1055
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
1056
set_pte_at(&init_mm, addr, pte, entry);
1057
asm volatile("ptesync": : :"memory");
1058
}
1059
return pte;
1060
}
1061
1062
static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
1063
unsigned long address)
1064
{
1065
pud_t *pud;
1066
1067
/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
1068
if (unlikely(p4d_none(*p4dp))) {
1069
if (unlikely(!slab_is_available())) {
1070
pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1071
p4d_populate(&init_mm, p4dp, pud);
1072
/* go to the pud_offset */
1073
} else
1074
return pud_alloc(&init_mm, p4dp, address);
1075
}
1076
return pud_offset(p4dp, address);
1077
}
1078
1079
static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
1080
unsigned long address)
1081
{
1082
pmd_t *pmd;
1083
1084
/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
1085
if (unlikely(pud_none(*pudp))) {
1086
if (unlikely(!slab_is_available())) {
1087
pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1088
pud_populate(&init_mm, pudp, pmd);
1089
} else
1090
return pmd_alloc(&init_mm, pudp, address);
1091
}
1092
return pmd_offset(pudp, address);
1093
}
1094
1095
static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
1096
unsigned long address)
1097
{
1098
pte_t *pte;
1099
1100
/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
1101
if (unlikely(pmd_none(*pmdp))) {
1102
if (unlikely(!slab_is_available())) {
1103
pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1104
pmd_populate(&init_mm, pmdp, pte);
1105
} else
1106
return pte_alloc_kernel(pmdp, address);
1107
}
1108
return pte_offset_kernel(pmdp, address);
1109
}
1110
1111
1112
1113
int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
1114
struct vmem_altmap *altmap)
1115
{
1116
unsigned long addr;
1117
unsigned long next;
1118
pgd_t *pgd;
1119
p4d_t *p4d;
1120
pud_t *pud;
1121
pmd_t *pmd;
1122
pte_t *pte;
1123
1124
/*
1125
* If altmap is present, make sure we align the start vmemmap addr
1126
* to PAGE_SIZE so that we calculate the correct start_pfn in
1127
* altmap boundary check to decide whether we should use altmap or
1128
* RAM based backing memory allocation. Also the address needs to be
1129
* aligned for set_pte operation. If the start addr is already
1130
* PMD_SIZE aligned and within the altmap boundary then we will
1131
* try to use a pmd size altmap mapping, else we go for page size
1132
* mapping.
1133
*
1134
* If altmap is not present, align the vmemmap addr to PMD_SIZE and
1135
* always allocate a PMD size page for vmemmap backing.
1136
*
1137
*/
1138
1139
if (altmap)
1140
start = ALIGN_DOWN(start, PAGE_SIZE);
1141
else
1142
start = ALIGN_DOWN(start, PMD_SIZE);
1143
1144
for (addr = start; addr < end; addr = next) {
1145
next = pmd_addr_end(addr, end);
1146
1147
pgd = pgd_offset_k(addr);
1148
p4d = p4d_offset(pgd, addr);
1149
pud = vmemmap_pud_alloc(p4d, node, addr);
1150
if (!pud)
1151
return -ENOMEM;
1152
pmd = vmemmap_pmd_alloc(pud, node, addr);
1153
if (!pmd)
1154
return -ENOMEM;
1155
1156
if (pmd_none(READ_ONCE(*pmd))) {
1157
void *p;
1158
1159
/*
1160
* Keep it simple by checking addr PMD_SIZE alignment
1161
* and verifying the device boundary condition.
1162
* For us to use a pmd mapping, both addr and pfn should
1163
* be aligned. We skip if addr is not aligned and for
1164
* pfn we hope we have extra area in the altmap that
1165
* can help to find an aligned block. This can result
1166
* in altmap block allocation failures, in which case
1167
* we fall back to RAM for vmemmap allocation.
1168
*/
1169
if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
1170
altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
1171
/*
1172
* make sure we don't create altmap mappings
1173
* covering things outside the device.
1174
*/
1175
goto base_mapping;
1176
}
1177
1178
p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
1179
if (p) {
1180
vmemmap_set_pmd(pmd, p, node, addr, next);
1181
pr_debug("PMD_SIZE vmemmap mapping\n");
1182
continue;
1183
} else {
1184
/*
1185
* A vmemmap block allocation can fail due to
1186
* alignment requirements and we trying to align
1187
* things aggressively there by running out of
1188
* space. Try base mapping on failure.
1189
*/
1190
goto base_mapping;
1191
}
1192
} else if (vmemmap_check_pmd(pmd, node, addr, next)) {
1193
/*
1194
* If a huge mapping exists due to an early call to
1195
* vmemmap_populate, let's try to use that.
1196
*/
1197
continue;
1198
}
1199
base_mapping:
1200
/*
1201
* Not able allocate higher order memory to back memmap
1202
* or we found a pointer to pte page. Allocate base page
1203
* size vmemmap
1204
*/
1205
pte = vmemmap_pte_alloc(pmd, node, addr);
1206
if (!pte)
1207
return -ENOMEM;
1208
1209
pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
1210
if (!pte)
1211
return -ENOMEM;
1212
1213
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1214
next = addr + PAGE_SIZE;
1215
}
1216
return 0;
1217
}
1218
1219
static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
1220
struct vmem_altmap *altmap,
1221
struct page *reuse)
1222
{
1223
pgd_t *pgd;
1224
p4d_t *p4d;
1225
pud_t *pud;
1226
pmd_t *pmd;
1227
pte_t *pte;
1228
1229
pgd = pgd_offset_k(addr);
1230
p4d = p4d_offset(pgd, addr);
1231
pud = vmemmap_pud_alloc(p4d, node, addr);
1232
if (!pud)
1233
return NULL;
1234
pmd = vmemmap_pmd_alloc(pud, node, addr);
1235
if (!pmd)
1236
return NULL;
1237
if (pmd_leaf(*pmd))
1238
/*
1239
* The second page is mapped as a hugepage due to a nearby request.
1240
* Force our mapping to page size without deduplication
1241
*/
1242
return NULL;
1243
pte = vmemmap_pte_alloc(pmd, node, addr);
1244
if (!pte)
1245
return NULL;
1246
radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1247
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1248
1249
return pte;
1250
}
1251
1252
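/*
 * Locate (and populate if needed) the PTE of the second vmemmap page of a
 * compound device page; later tail pages reuse this page so that their
 * struct page area is deduplicated.
 */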
static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
1253
unsigned long pfn_offset, int node)
1254
{
1255
pgd_t *pgd;
1256
p4d_t *p4d;
1257
pud_t *pud;
1258
pmd_t *pmd;
1259
pte_t *pte;
1260
unsigned long map_addr;
1261
1262
/* the second vmemmap page which we use for duplication */
1263
map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
1264
pgd = pgd_offset_k(map_addr);
1265
p4d = p4d_offset(pgd, map_addr);
1266
pud = vmemmap_pud_alloc(p4d, node, map_addr);
1267
if (!pud)
1268
return NULL;
1269
pmd = vmemmap_pmd_alloc(pud, node, map_addr);
1270
if (!pmd)
1271
return NULL;
1272
if (pmd_leaf(*pmd))
1273
/*
1274
* The second page is mapped as a hugepage due to a nearby request.
1275
* Force our mapping to page size without deduplication
1276
*/
1277
return NULL;
1278
pte = vmemmap_pte_alloc(pmd, node, map_addr);
1279
if (!pte)
1280
return NULL;
1281
/*
1282
* Check if there exists a mapping to the left
1283
*/
1284
if (pte_none(*pte)) {
1285
/*
1286
* Populate the head page vmemmap page.
1287
* It can fall in different pmd, hence
1288
* vmemmap_populate_address()
1289
*/
1290
pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
1291
if (!pte)
1292
return NULL;
1293
/*
1294
* Populate the tail pages vmemmap page
1295
*/
1296
pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
1297
if (!pte)
1298
return NULL;
1299
vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
1300
return pte;
1301
}
1302
return pte;
1303
}
1304
1305
int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
1306
unsigned long start,
1307
unsigned long end, int node,
1308
struct dev_pagemap *pgmap)
1309
{
1310
/*
1311
* we want to map things as base page size mapping so that
1312
* we can save space in vmemmap. We could have huge mapping
1313
* covering out both edges.
1314
*/
1315
unsigned long addr;
1316
unsigned long addr_pfn = start_pfn;
1317
unsigned long next;
1318
pgd_t *pgd;
1319
p4d_t *p4d;
1320
pud_t *pud;
1321
pmd_t *pmd;
1322
pte_t *pte;
1323
1324
for (addr = start; addr < end; addr = next) {
1325
1326
pgd = pgd_offset_k(addr);
1327
p4d = p4d_offset(pgd, addr);
1328
pud = vmemmap_pud_alloc(p4d, node, addr);
1329
if (!pud)
1330
return -ENOMEM;
1331
pmd = vmemmap_pmd_alloc(pud, node, addr);
1332
if (!pmd)
1333
return -ENOMEM;
1334
1335
if (pmd_leaf(READ_ONCE(*pmd))) {
1336
/* existing huge mapping. Skip the range */
1337
addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
1338
next = pmd_addr_end(addr, end);
1339
continue;
1340
}
1341
pte = vmemmap_pte_alloc(pmd, node, addr);
1342
if (!pte)
1343
return -ENOMEM;
1344
if (!pte_none(*pte)) {
1345
/*
1346
* This could be because we already have a compound
1347
* page whose VMEMMAP_RESERVE_NR pages were mapped and
1348
* this request falls within those pages.
1349
*/
1350
addr_pfn += 1;
1351
next = addr + PAGE_SIZE;
1352
continue;
1353
} else {
1354
unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
1355
unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
1356
pte_t *tail_page_pte;
1357
1358
/*
1359
* If the address is aligned to the huge page size it is the
1360
* head mapping.
1361
*/
1362
if (pfn_offset == 0) {
1363
/* Populate the head page vmemmap page */
1364
pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1365
if (!pte)
1366
return -ENOMEM;
1367
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1368
1369
/*
1370
* Populate the tail pages vmemmap page
1371
* It can fall in different pmd, hence
1372
* vmemmap_populate_address()
1373
*/
1374
pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
1375
if (!pte)
1376
return -ENOMEM;
1377
1378
addr_pfn += 2;
1379
next = addr + 2 * PAGE_SIZE;
1380
continue;
1381
}
1382
/*
1383
* get the 2nd mapping details
1384
* Also create it if it doesn't exist
1385
*/
1386
tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
1387
if (!tail_page_pte) {
1388
1389
pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1390
if (!pte)
1391
return -ENOMEM;
1392
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1393
1394
addr_pfn += 1;
1395
next = addr + PAGE_SIZE;
1396
continue;
1397
}
1398
1399
pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
1400
if (!pte)
1401
return -ENOMEM;
1402
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1403
1404
addr_pfn += 1;
1405
next = addr + PAGE_SIZE;
1406
continue;
1407
}
1408
}
1409
return 0;
1410
}
1411
1412
1413
#ifdef CONFIG_MEMORY_HOTPLUG
1414
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
1415
{
1416
remove_pagetable(start, start + page_size, true, NULL);
1417
}
1418
1419
void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
1420
struct vmem_altmap *altmap)
1421
{
1422
remove_pagetable(start, end, false, altmap);
1423
}
1424
#endif
1425
#endif
1426
1427
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1428
1429
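/*
 * Atomically clear/set bits in a huge PMD, with tracing. The caller must
 * hold the PMD lock (asserted under CONFIG_DEBUG_VM).
 */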
unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
1430
pmd_t *pmdp, unsigned long clr,
1431
unsigned long set)
1432
{
1433
unsigned long old;
1434
1435
#ifdef CONFIG_DEBUG_VM
1436
WARN_ON(!radix__pmd_trans_huge(*pmdp));
1437
assert_spin_locked(pmd_lockptr(mm, pmdp));
1438
#endif
1439
1440
old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
1441
trace_hugepage_update_pmd(addr, old, clr, set);
1442
1443
return old;
1444
}
1445
1446
unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
1447
pud_t *pudp, unsigned long clr,
1448
unsigned long set)
1449
{
1450
unsigned long old;
1451
1452
#ifdef CONFIG_DEBUG_VM
1453
WARN_ON(!pud_trans_huge(*pudp));
1454
assert_spin_locked(pud_lockptr(mm, pudp));
1455
#endif
1456
1457
old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
1458
trace_hugepage_update_pud(addr, old, clr, set);
1459
1460
return old;
1461
}
1462
1463
pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
1464
pmd_t *pmdp)
1465
1466
{
1467
pmd_t pmd;
1468
1469
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1470
VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
1471
/*
1472
* khugepaged calls this for normal pmd
1473
*/
1474
pmd = *pmdp;
1475
pmd_clear(pmdp);
1476
1477
radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
1478
1479
return pmd;
1480
}
1481
1482
/*
1483
* For us pgtable_t is pte_t *. In order to save the deposited
1484
* page table, we consider the allocated page table as a list
1485
* head. On withdraw we need to make sure we zero out the used
1486
* list_head memory area.
1487
*/
1488
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1489
pgtable_t pgtable)
1490
{
1491
struct list_head *lh = (struct list_head *) pgtable;
1492
1493
assert_spin_locked(pmd_lockptr(mm, pmdp));
1494
1495
/* FIFO */
1496
if (!pmd_huge_pte(mm, pmdp))
1497
INIT_LIST_HEAD(lh);
1498
else
1499
list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1500
pmd_huge_pte(mm, pmdp) = pgtable;
1501
}
1502
1503
pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1504
{
1505
pte_t *ptep;
1506
pgtable_t pgtable;
1507
struct list_head *lh;
1508
1509
assert_spin_locked(pmd_lockptr(mm, pmdp));
1510
1511
/* FIFO */
1512
pgtable = pmd_huge_pte(mm, pmdp);
1513
lh = (struct list_head *) pgtable;
1514
if (list_empty(lh))
1515
pmd_huge_pte(mm, pmdp) = NULL;
1516
else {
1517
pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
1518
list_del(lh);
1519
}
1520
ptep = (pte_t *) pgtable;
1521
*ptep = __pte(0);
1522
ptep++;
1523
*ptep = __pte(0);
1524
return pgtable;
1525
}
1526
1527
pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
1528
unsigned long addr, pmd_t *pmdp)
1529
{
1530
pmd_t old_pmd;
1531
unsigned long old;
1532
1533
old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
1534
old_pmd = __pmd(old);
1535
return old_pmd;
1536
}
1537
1538
pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
1539
unsigned long addr, pud_t *pudp)
1540
{
1541
pud_t old_pud;
1542
unsigned long old;
1543
1544
old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
1545
old_pud = __pud(old);
1546
return old_pud;
1547
}
1548
1549
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1550
1551
void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
1552
pte_t entry, unsigned long address, int psize)
1553
{
1554
struct mm_struct *mm = vma->vm_mm;
1555
unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
1556
_PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
1557
1558
unsigned long change = pte_val(entry) ^ pte_val(*ptep);
1559
/*
1560
* On POWER9, the NMMU is not able to relax PTE access permissions
1561
* for a translation with a TLB. The PTE must be invalidated, TLB
1562
* flushed before the new PTE is installed.
1563
*
1564
* This only needs to be done for radix, because hash translation does
1565
* flush when updating the linux pte (and we don't support NMMU
1566
* accelerators on HPT on POWER9 anyway XXX: do we?).
1567
*
1568
* POWER10 (and P9P) NMMU does behave as per ISA.
1569
*/
1570
if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
1571
atomic_read(&mm->context.copros) > 0) {
1572
unsigned long old_pte, new_pte;
1573
1574
old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
1575
new_pte = old_pte | set;
1576
radix__flush_tlb_page_psize(mm, address, psize);
1577
__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
1578
} else {
1579
__radix_pte_update(ptep, 0, set);
1580
/*
1581
* Book3S does not require a TLB flush when relaxing access
1582
* restrictions when the address space (modulo the POWER9 nest
1583
* MMU issue above) because the MMU will reload the PTE after
1584
* taking an access fault, as defined by the architecture. See
1585
* "Setting a Reference or Change Bit or Upgrading Access
1586
* Authority (PTE Subject to Atomic Hardware Updates)" in
1587
* Power ISA Version 3.1B.
1588
*/
1589
}
1590
/* See ptesync comment in radix__set_pte_at */
1591
}
1592
1593
void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
1594
unsigned long addr, pte_t *ptep,
1595
pte_t old_pte, pte_t pte)
1596
{
1597
struct mm_struct *mm = vma->vm_mm;
1598
1599
/*
1600
* POWER9 NMMU must flush the TLB after clearing the PTE before
1601
* installing a PTE with more relaxed access permissions, see
1602
* radix__ptep_set_access_flags.
1603
*/
1604
if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
1605
is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
1606
(atomic_read(&mm->context.copros) > 0))
1607
radix__flush_tlb_page(vma, addr);
1608
1609
set_pte_at(mm, addr, ptep, pte);
1610
}
1611
1612
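/*
 * pud_set_huge()/pmd_set_huge() and their clear/free counterparts are the
 * arch hooks used by the generic huge-vmap code: on radix a huge kernel
 * mapping is simply a leaf PTE installed at the PUD or PMD level, so they
 * just build a PTE and set it (or clear it and free the lower table).
 */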
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
1613
{
1614
pte_t *ptep = (pte_t *)pud;
1615
pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);
1616
1617
if (!radix_enabled())
1618
return 0;
1619
1620
set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
1621
1622
return 1;
1623
}
1624
1625
int pud_clear_huge(pud_t *pud)
1626
{
1627
if (pud_leaf(*pud)) {
1628
pud_clear(pud);
1629
return 1;
1630
}
1631
1632
return 0;
1633
}
1634
1635
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
1636
{
1637
pmd_t *pmd;
1638
int i;
1639
1640
pmd = pud_pgtable(*pud);
1641
pud_clear(pud);
1642
1643
flush_tlb_kernel_range(addr, addr + PUD_SIZE);
1644
1645
for (i = 0; i < PTRS_PER_PMD; i++) {
1646
if (!pmd_none(pmd[i])) {
1647
pte_t *pte;
1648
pte = (pte_t *)pmd_page_vaddr(pmd[i]);
1649
1650
pte_free_kernel(&init_mm, pte);
1651
}
1652
}
1653
1654
pmd_free(&init_mm, pmd);
1655
1656
return 1;
1657
}
1658
1659
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
1660
{
1661
pte_t *ptep = (pte_t *)pmd;
1662
pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);
1663
1664
if (!radix_enabled())
1665
return 0;
1666
1667
set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
1668
1669
return 1;
1670
}
1671
1672
int pmd_clear_huge(pmd_t *pmd)
1673
{
1674
if (pmd_leaf(*pmd)) {
1675
pmd_clear(pmd);
1676
return 1;
1677
}
1678
1679
return 0;
1680
}
1681
1682
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
1683
{
1684
pte_t *pte;
1685
1686
pte = (pte_t *)pmd_page_vaddr(*pmd);
1687
pmd_clear(pmd);
1688
1689
flush_tlb_kernel_range(addr, addr + PMD_SIZE);
1690
1691
pte_free_kernel(&init_mm, pte);
1692
1693
return 1;
1694
}
1695
1696