Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/powerpc/mm/init_64.c
26424 views
1
// SPDX-License-Identifier: GPL-2.0-or-later
2
/*
3
* PowerPC version
4
* Copyright (C) 1995-1996 Gary Thomas ([email protected])
5
*
6
* Modifications by Paul Mackerras (PowerMac) ([email protected])
7
* and Cort Dougan (PReP) ([email protected])
8
* Copyright (C) 1996 Paul Mackerras
9
*
10
* Derived from "arch/i386/mm/init.c"
11
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
12
*
13
* Dave Engebretsen <[email protected]>
14
* Rework for PPC64 port.
15
*/
16
17
#undef DEBUG
18
19
#include <linux/signal.h>
20
#include <linux/sched.h>
21
#include <linux/kernel.h>
22
#include <linux/errno.h>
23
#include <linux/string.h>
24
#include <linux/types.h>
25
#include <linux/mman.h>
26
#include <linux/mm.h>
27
#include <linux/swap.h>
28
#include <linux/stddef.h>
29
#include <linux/vmalloc.h>
30
#include <linux/init.h>
31
#include <linux/delay.h>
32
#include <linux/highmem.h>
33
#include <linux/idr.h>
34
#include <linux/nodemask.h>
35
#include <linux/module.h>
36
#include <linux/poison.h>
37
#include <linux/memblock.h>
38
#include <linux/hugetlb.h>
39
#include <linux/slab.h>
40
#include <linux/of_fdt.h>
41
#include <linux/libfdt.h>
42
#include <linux/memremap.h>
43
#include <linux/memory.h>
44
#include <linux/bootmem_info.h>
45
46
#include <asm/pgalloc.h>
47
#include <asm/page.h>
48
#include <asm/prom.h>
49
#include <asm/rtas.h>
50
#include <asm/io.h>
51
#include <asm/mmu_context.h>
52
#include <asm/mmu.h>
53
#include <linux/uaccess.h>
54
#include <asm/smp.h>
55
#include <asm/machdep.h>
56
#include <asm/tlb.h>
57
#include <asm/eeh.h>
58
#include <asm/processor.h>
59
#include <asm/mmzone.h>
60
#include <asm/cputable.h>
61
#include <asm/sections.h>
62
#include <asm/iommu.h>
63
#include <asm/vdso.h>
64
#include <asm/hugetlb.h>
65
66
#include <mm/mmu_decl.h>
67
68
#ifdef CONFIG_SPARSEMEM_VMEMMAP
69
/*
70
* Given an address within the vmemmap, determine the page that
71
* represents the start of the subsection it is within. Note that we have to
72
* do this by hand as the proffered address may not be correctly aligned.
73
* Subtraction of non-aligned pointers produces undefined results.
74
*/
75
/*
 * Map an address inside the vmemmap array to the struct page that begins
 * the subsection containing it. The address is turned into an index by
 * integer arithmetic (not pointer subtraction) because it need not be
 * aligned to sizeof(struct page).
 */
static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_addr)
{
	/* Index of the struct page that vmemmap_addr falls into, i.e. its pfn. */
	unsigned long pfn = (vmemmap_addr - (unsigned long)vmemmap) / sizeof(struct page);

	/* Round down to the first pfn of the enclosing subsection. */
	return pfn_to_page(pfn & PAGE_SUBSECTION_MASK);
}
84
85
/*
86
* Since memory is added in sub-section chunks, before creating a new vmemmap
87
* mapping, the kernel should check whether there is an existing memmap mapping
88
* covering the new subsection added. This is needed because kernel can map
89
* vmemmap area using 16MB pages which will cover a memory range of 16G. Such
90
* a range covers multiple subsections (2M)
91
*
92
* If any subsection in the 16G range mapped by vmemmap is valid we consider the
93
* vmemmap populated (There is a page table entry already present). We can't do
94
* a page table lookup here because with the hash translation we don't keep
95
* vmemmap details in linux page table.
96
*/
97
/*
 * Report whether any subsection whose struct pages live in
 * [vmemmap_addr, vmemmap_addr + vmemmap_map_size) is already valid.
 *
 * Returns 1 as soon as one valid subsection is found, 0 if none is.
 */
int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
{
	unsigned long limit = vmemmap_addr + vmemmap_map_size;
	struct page *page = vmemmap_subsection_start(vmemmap_addr);

	/* Step one subsection worth of struct pages at a time. */
	while ((unsigned long)page < limit) {
		/*
		 * pfn_valid() is used here purely to learn whether a
		 * subsection in this range has already been initialized.
		 */
		if (pfn_valid(page_to_pfn(page)))
			return 1;

		page += PAGES_PER_SUBSECTION;
	}

	return 0;
}
114
115
/*
 * vmemmap virtual address space management does not have a traditional page
 * table to track which virtual struct pages are backed by a physical mapping.
 * The virtual to physical mappings are tracked in a simple linked list
 * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at
 * all times, whereas the 'next' list maintains the available
 * vmemmap_backing structures which have been deleted from the
 * 'vmemmap_list' list during system runtime (memory hotplug remove
 * operation). The freed 'vmemmap_backing' structures are reused later when
 * new requests come in, without allocating fresh memory. This pointer also
 * tracks the allocated 'vmemmap_backing' structures, as we allocate one
 * full page of memory at a time when we don't have any.
 */
128
/* Head of the list of vmemmap_backing records currently in use. */
struct vmemmap_backing *vmemmap_list;
/*
 * Either the next unused chunk in the most recently allocated page, or the
 * head of the list of records released by vmemmap_list_free().
 */
static struct vmemmap_backing *next;
130
131
/*
 * The same pointer 'next' tracks individual chunks inside the allocated
 * full page during boot time, and again tracks the freed nodes during
 * runtime. This is racy in principle, but in practice the two uses are
 * separated by the boot process. It will cause problems if we somehow
 * get a memory hotplug operation during boot!
 */
138
/* Unused vmemmap_backing chunks remaining in the page 'next' points into. */
static int num_left;
/* Number of released records chained on 'next' and available for reuse. */
static int num_freed;
140
141
static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
142
{
143
struct vmemmap_backing *vmem_back;
144
/* get from freed entries first */
145
if (num_freed) {
146
num_freed--;
147
vmem_back = next;
148
next = next->list;
149
150
return vmem_back;
151
}
152
153
/* allocate a page when required and hand out chunks */
154
if (!num_left) {
155
next = vmemmap_alloc_block(PAGE_SIZE, node);
156
if (unlikely(!next)) {
157
WARN_ON(1);
158
return NULL;
159
}
160
num_left = PAGE_SIZE / sizeof(struct vmemmap_backing);
161
}
162
163
num_left--;
164
165
return next++;
166
}
167
168
static __meminit int vmemmap_list_populate(unsigned long phys,
169
unsigned long start,
170
int node)
171
{
172
struct vmemmap_backing *vmem_back;
173
174
vmem_back = vmemmap_list_alloc(node);
175
if (unlikely(!vmem_back)) {
176
pr_debug("vmemap list allocation failed\n");
177
return -ENOMEM;
178
}
179
180
vmem_back->phys = phys;
181
vmem_back->virt_addr = start;
182
vmem_back->list = vmemmap_list;
183
184
vmemmap_list = vmem_back;
185
return 0;
186
}
187
188
bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
189
unsigned long page_size)
190
{
191
unsigned long nr_pfn = page_size / sizeof(struct page);
192
unsigned long start_pfn = page_to_pfn((struct page *)start);
193
194
if ((start_pfn + nr_pfn - 1) > altmap->end_pfn)
195
return true;
196
197
if (start_pfn < altmap->base_pfn)
198
return true;
199
200
return false;
201
}
202
203
/*
 * Populate the vmemmap for [start, end) using mappings of the vmemmap MMU
 * page size, tracking every backing block in vmemmap_list.
 *
 * @start: first vmemmap virtual address (aligned down to the mapping size)
 * @end:   end of the vmemmap virtual range (exclusive)
 * @node:  NUMA node to allocate backing memory from
 * @altmap: optional alternate allocator for the backing memory, may be NULL
 *
 * Returns 0 on success, -ENOMEM if backing memory or a tracking record could
 * not be allocated, or -EFAULT if creating the mapping failed.
 */
static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
					struct vmem_altmap *altmap)
{
	/*
	 * Shift in unsigned long: a plain "1 << shift" is evaluated as int and
	 * would overflow for shifts of 31 or more before being widened.
	 */
	unsigned long page_size = 1UL << mmu_psize_defs[mmu_vmemmap_psize].shift;

	/* Align to the page size of the linear mapping. */
	start = ALIGN_DOWN(start, page_size);

	pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);

	for (; start < end; start += page_size) {
		/* Tracks, per iteration, which allocator 'p' came from. */
		bool altmap_alloc = false;
		void *p = NULL;
		int rc;

		/*
		 * This vmemmap range is backing different subsections. If any
		 * of that subsection is marked valid, that means we already
		 * have initialized a page table covering this range and hence
		 * the vmemmap range is populated.
		 */
		if (vmemmap_populated(start, page_size))
			continue;

		/*
		 * Allocate from the altmap first if we have one. This may
		 * fail due to alignment issues when using 16MB hugepages, so
		 * fall back to system memory if the altmap allocation fails.
		 */
		if (altmap && !altmap_cross_boundary(altmap, start, page_size)) {
			p = vmemmap_alloc_block_buf(page_size, node, altmap);
			if (p)
				altmap_alloc = true;
			else
				pr_debug("altmap block allocation failed, falling back to system memory\n");
		}
		if (!p)
			p = vmemmap_alloc_block_buf(page_size, node, NULL);
		if (!p)
			return -ENOMEM;

		if (vmemmap_list_populate(__pa(p), start, node)) {
			/*
			 * If we don't populate vmemmap list, we don't have
			 * the ability to free the allocated vmemmap
			 * pages in section_deactivate. Hence free them
			 * here.
			 */
			int nr_pfns = page_size >> PAGE_SHIFT;
			unsigned long page_order = get_order(page_size);

			if (altmap_alloc)
				vmem_altmap_free(altmap, nr_pfns);
			else
				free_pages((unsigned long)p, page_order);
			return -ENOMEM;
		}

		pr_debug("      * %016lx..%016lx allocated at %p\n",
			 start, start + page_size, p);

		rc = vmemmap_create_mapping(start, page_size, __pa(p));
		if (rc < 0) {
			pr_warn("%s: Unable to create vmemmap mapping: %d\n",
				__func__, rc);
			return -EFAULT;
		}
	}

	return 0;
}
276
277
/*
 * Populate the vmemmap for [start, end) on @node, optionally allocating the
 * backing memory from @altmap.
 *
 * On Book3S-64 with radix enabled the radix implementation is used;
 * otherwise the list-tracked __vmemmap_populate() is used. The callee's
 * return value is passed through unchanged.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (radix_enabled()) {
		int rc = radix__vmemmap_populate(start, end, node, altmap);

		return rc;
	}
#endif

	return __vmemmap_populate(start, end, node, altmap);
}
288
289
#ifdef CONFIG_MEMORY_HOTPLUG
290
static unsigned long vmemmap_list_free(unsigned long start)
291
{
292
struct vmemmap_backing *vmem_back, *vmem_back_prev;
293
294
vmem_back_prev = vmem_back = vmemmap_list;
295
296
/* look for it with prev pointer recorded */
297
for (; vmem_back; vmem_back = vmem_back->list) {
298
if (vmem_back->virt_addr == start)
299
break;
300
vmem_back_prev = vmem_back;
301
}
302
303
if (unlikely(!vmem_back))
304
return 0;
305
306
/* remove it from vmemmap_list */
307
if (vmem_back == vmemmap_list) /* remove head */
308
vmemmap_list = vmem_back->list;
309
else
310
vmem_back_prev->list = vmem_back->list;
311
312
/* next point to this freed entry */
313
vmem_back->list = next;
314
next = vmem_back;
315
num_freed++;
316
317
return vmem_back->phys;
318
}
319
320
/*
 * Tear down the vmemmap for [start, end): for every mapping-sized chunk that
 * no longer backs a valid subsection, drop its vmemmap_list record, release
 * the backing memory to wherever it came from and remove the mapping.
 *
 * @start: first vmemmap virtual address (aligned down to the mapping size)
 * @end:   end of the vmemmap virtual range (exclusive)
 * @altmap: alternate allocator the backing memory may belong to, or NULL
 */
static void __ref __vmemmap_free(unsigned long start, unsigned long end,
				 struct vmem_altmap *altmap)
{
	/*
	 * Shift in unsigned long: a plain "1 << shift" is evaluated as int and
	 * would overflow for shifts of 31 or more before being widened.
	 */
	unsigned long page_size = 1UL << mmu_psize_defs[mmu_vmemmap_psize].shift;
	unsigned long page_order = get_order(page_size);
	/* With no altmap the [alt_start, alt_end) window below matches nothing. */
	unsigned long alt_start = ~0UL, alt_end = ~0UL;

	start = ALIGN_DOWN(start, page_size);
	if (altmap) {
		alt_start = altmap->base_pfn;
		alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
	}

	pr_debug("vmemmap_free %lx...%lx\n", start, end);

	for (; start < end; start += page_size) {
		unsigned long nr_pages, addr, base_pfn;
		struct page *page;

		/*
		 * We have already marked the subsection we are trying to remove
		 * invalid. So if we want to remove the vmemmap range, we
		 * need to make sure there is no subsection marked valid
		 * in this range.
		 */
		if (vmemmap_populated(start, page_size))
			continue;

		/* 0 means no record exists for this chunk: nothing to free. */
		addr = vmemmap_list_free(start);
		if (!addr)
			continue;

		base_pfn = PHYS_PFN(addr);
		page = pfn_to_page(base_pfn);
		nr_pages = 1UL << page_order;

		if (base_pfn >= alt_start && base_pfn < alt_end) {
			/* backing memory lies within the altmap's pfn window */
			vmem_altmap_free(altmap, nr_pages);
		} else if (PageReserved(page)) {
			/* allocated from bootmem */
			if (page_size < PAGE_SIZE) {
				/*
				 * this shouldn't happen, but if it is
				 * the case, leave the memory there
				 */
				WARN_ON_ONCE(1);
			} else {
				while (nr_pages--)
					free_reserved_page(page++);
			}
		} else {
			free_pages((unsigned long)(__va(addr)), page_order);
		}

		vmemmap_remove_mapping(start, page_size);
	}
}
378
379
/*
 * Free the vmemmap for [start, end), returning backing memory to @altmap
 * where applicable.
 *
 * On Book3S-64 with radix enabled the radix implementation is used;
 * otherwise the list-tracked __vmemmap_free() is used.
 */
void __ref vmemmap_free(unsigned long start, unsigned long end,
			struct vmem_altmap *altmap)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (radix_enabled()) {
		radix__vmemmap_free(start, end, altmap);
		return;
	}
#endif
	__vmemmap_free(start, end, altmap);
}
388
389
#endif
390
391
#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
392
/*
 * Intentionally empty: this implementation does nothing with @section_nr,
 * @start_page or @size.
 */
void register_page_bootmem_memmap(unsigned long section_nr,
				  struct page *start_page, unsigned long size)
{
}
396
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
397
398
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
399
400
#ifdef CONFIG_PPC_BOOK3S_64
401
unsigned int mmu_lpid_bits;
402
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
403
EXPORT_SYMBOL_GPL(mmu_lpid_bits);
404
#endif
405
unsigned int mmu_pid_bits;
406
407
static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
408
409
static int __init parse_disable_radix(char *p)
410
{
411
bool val;
412
413
if (!p)
414
val = true;
415
else if (kstrtobool(p, &val))
416
return -EINVAL;
417
418
disable_radix = val;
419
420
return 0;
421
}
422
early_param("disable_radix", parse_disable_radix);
423
424
/*
425
* If we're running under a hypervisor, we need to check the contents of
426
* /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
427
* radix. If not, we clear the radix feature bit so we fall back to hash.
428
*/
429
/*
 * Adjust the radix/GTSE MMU feature bits according to the
 * /chosen/ibm,architecture-vec-5 property of the flat device tree.
 *
 * - If /chosen or the property is missing, or the property is too short to
 *   hold the MMU support byte, radix is cleared.
 * - If the MMU support field says radix only, radix is forced on (warning
 *   if it had been disabled) and GTSE follows the GTSE bit of the property.
 * - If it says hash only, radix and GTSE are cleared.
 * - Any other value leaves the feature bits untouched.
 */
static void __init early_check_vec5(void)
{
	unsigned long root, chosen;
	int size;
	const u8 *vec5;
	u8 mmu_supported;

	root = of_get_flat_dt_root();
	chosen = of_get_flat_dt_subnode_by_name(root, "chosen");
	if (chosen == -FDT_ERR_NOTFOUND)
		goto no_radix;

	vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size);
	if (!vec5)
		goto no_radix;

	/* The property must be long enough to contain the MMU support byte. */
	if (size <= OV5_INDX(OV5_MMU_SUPPORT))
		goto no_radix;

	/* Check for supported configuration */
	mmu_supported = vec5[OV5_INDX(OV5_MMU_SUPPORT)] &
			OV5_FEAT(OV5_MMU_SUPPORT);
	if (mmu_supported == OV5_FEAT(OV5_MMU_RADIX)) {
		/* Hypervisor only supports radix - check enabled && GTSE */
		if (!early_radix_enabled())
			pr_warn("WARNING: Ignoring cmdline option disable_radix\n");

		/*
		 * The length check above only covers the MMU support byte.
		 * Make sure the GTSE byte is inside the property as well
		 * before reading it; a property too short to carry it is
		 * treated as "GTSE not offered".
		 */
		if (size > OV5_INDX(OV5_RADIX_GTSE) &&
		    (vec5[OV5_INDX(OV5_RADIX_GTSE)] & OV5_FEAT(OV5_RADIX_GTSE)))
			cur_cpu_spec->mmu_features |= MMU_FTR_GTSE;
		else
			cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;

		/* Do radix anyway - the hypervisor said we had to */
		cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX;
	} else if (mmu_supported == OV5_FEAT(OV5_MMU_HASH)) {
		/* Hypervisor only supports hash - disable radix */
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
		cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;
	}
	return;

no_radix:
	cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
}
473
474
/*
 * Flat device tree scan callback: read the LPID and PID widths from a "cpu"
 * node into mmu_lpid_bits and mmu_pid_bits. A property is used only if it
 * is present and exactly 4 bytes long.
 *
 * Returns 0 for nodes that are not cpu nodes, or while both widths are
 * still zero; returns 1 once at least one of the widths is non-zero.
 */
static int __init dt_scan_mmu_pid_width(unsigned long node,
					const char *uname, int depth,
					void *data)
{
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
	const __be32 *prop;
	int size = 0;

	/* We are scanning "cpu" nodes only */
	if (!type || strcmp(type, "cpu"))
		return 0;

	/* Find MMU LPID, PID register size */
	prop = of_get_flat_dt_prop(node, "ibm,mmu-lpid-bits", &size);
	if (prop && size == 4)
		mmu_lpid_bits = be32_to_cpup(prop);

	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
	if (prop && size == 4)
		mmu_pid_bits = be32_to_cpup(prop);

	return (mmu_pid_bits || mmu_lpid_bits) ? 1 : 0;
}
500
501
/*
502
* Outside hotplug the kernel uses this value to map the kernel direct map
503
* with radix. To be compatible with older kernels, let's keep this value
504
* as 16M which is also SECTION_SIZE with SPARSEMEM. We can ideally map
505
* things with 1GB size in the case where we don't support hotplug.
506
*/
507
/* Smallest memory block size the probing below will settle on. */
#ifdef CONFIG_MEMORY_HOTPLUG
#define DEFAULT_MEMORY_BLOCK_SIZE	MIN_MEMORY_BLOCK_SIZE
#else
#define DEFAULT_MEMORY_BLOCK_SIZE	SZ_16M
#endif
512
513
static void update_memory_block_size(unsigned long *block_size, unsigned long mem_size)
514
{
515
unsigned long min_memory_block_size = DEFAULT_MEMORY_BLOCK_SIZE;
516
517
for (; *block_size > min_memory_block_size; *block_size >>= 2) {
518
if ((mem_size & *block_size) == 0)
519
break;
520
}
521
}
522
523
/*
 * Flat device tree scan callback that works out the memory block size.
 *
 * @data points at the unsigned long block size being refined. Only nodes at
 * depth 1 are considered.
 *
 * Returns 1 (stop scanning) once the "ibm,dynamic-reconfiguration-memory"
 * node has been handled, 0 (keep scanning) in every other case.
 */
static int __init probe_memory_block_size(unsigned long node, const char *uname, int
					  depth, void *data)
{
	unsigned long *block_size = data;
	const __be32 *reg, *endp;
	const char *type;
	int l;

	if (depth != 1)
		return 0;

	/*
	 * If we have dynamic-reconfiguration-memory node, use the
	 * lmb value.
	 */
	if (!strcmp(uname, "ibm,dynamic-reconfiguration-memory")) {
		const __be32 *prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &l);

		if (prop && l >= dt_root_size_cells * sizeof(__be32))
			*block_size = of_read_number(prop, dt_root_size_cells);
		else
			/* Nothing usable in the device tree */
			*block_size = DEFAULT_MEMORY_BLOCK_SIZE;

		/* We have found the final value. Don't probe further. */
		return 1;
	}

	/*
	 * Find all the device tree nodes of memory type and make sure
	 * the area can be mapped using the memory block size value
	 * we end up using. We start with 1G value and keep reducing
	 * it such that we can map the entire area using memory_block_size.
	 * This will be used on powernv and older pseries that don't
	 * have the ibm,lmb-size property.
	 * For ex: with P5 we can end up with
	 * memory@0 -> 128MB
	 * memory@128M -> 64M
	 * This will end up using 64MB memory block size value.
	 */
	type = of_get_flat_dt_prop(node, "device_type", NULL);
	if (!type || strcmp(type, "memory"))
		return 0;

	/* Prefer "linux,usable-memory", fall back to "reg". */
	reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l);
	if (!reg)
		reg = of_get_flat_dt_prop(node, "reg", &l);
	if (!reg)
		return 0;

	/* Walk every complete (address, size) pair in the property. */
	endp = reg + (l / sizeof(__be32));
	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
		const char *compatible;
		u64 size;

		/* The address cells are consumed but not needed here. */
		dt_mem_next_cell(dt_root_addr_cells, &reg);
		size = dt_mem_next_cell(dt_root_size_cells, &reg);

		if (size != 0) {
			update_memory_block_size(block_size, size);
		} else {
			/*
			 * ibm,coherent-device-memory with linux,usable-memory = 0
			 * Force 256MiB block size. Work around for GPUs on P9 PowerNV
			 * linux,usable-memory == 0 implies driver managed memory and
			 * we can't use large memory block size due to hotplug/unplug
			 * limitations.
			 */
			compatible = of_get_flat_dt_prop(node, "compatible", NULL);
			if (compatible && strcmp(compatible, "ibm,coherent-device-memory") == 0) {
				/* We keep 256M as the upper limit with GPU present. */
				if (*block_size > SZ_256M)
					*block_size = SZ_256M;
				return 0;
			}
		}
	}

	/* continue looking for other memory device types */
	return 0;
}
609
610
/*
611
* start with 1G memory block size. Early init will
612
* fix this with correct value.
613
*/
614
unsigned long memory_block_size __ro_after_init = 1UL << 30;
615
static void __init early_init_memory_block_size(void)
616
{
617
/*
618
* We need to do memory_block_size probe early so that
619
* radix__early_init_mmu() can use this as limit for
620
* mapping page size.
621
*/
622
of_scan_flat_dt(probe_memory_block_size, &memory_block_size);
623
}
624
625
/*
 * Early, device-tree driven MMU setup: honour "disable_radix", determine
 * the LPID/PID widths, consult architecture-vec-5 when not in hypervisor
 * mode, probe the memory block size, run the radix or hash early init and
 * finally panic if neither MMU type is available.
 */
void __init mmu_early_init_devtree(void)
{
	bool hvmode = (mfmsr() & MSR_HV) != 0;

	/* Disable radix mode based on kernel command line. */
	if (disable_radix) {
		if (IS_ENABLED(CONFIG_PPC_64S_HASH_MMU))
			cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
		else
			pr_warn("WARNING: Ignoring cmdline option disable_radix\n");
	}

	of_scan_flat_dt(dt_scan_mmu_pid_width, NULL);

	/* Fall back to fixed widths when the device tree gave none. */
	if (hvmode && !mmu_lpid_bits) {
		/* 12 bits: POWER8-10, 10 bits: POWER7 */
		mmu_lpid_bits = early_cpu_has_feature(CPU_FTR_ARCH_207S) ? 12 : 10;
	}
	if (!mmu_pid_bits && early_cpu_has_feature(CPU_FTR_ARCH_300))
		mmu_pid_bits = 20; /* POWER9-10 */

	/*
	 * Check /chosen/ibm,architecture-vec-5 if running as a guest.
	 * When running bare-metal, we can use radix if we like
	 * even though the ibm,architecture-vec-5 property created by
	 * skiboot doesn't have the necessary bits set.
	 */
	if (!hvmode)
		early_check_vec5();

	early_init_memory_block_size();

	if (!early_radix_enabled()) {
		hash__early_init_devtree();
	} else {
		radix__early_init_devtree();

		/*
		 * We have finalized the translation we are going to use by now.
		 * Radix mode is not limited by RMA / VRMA addressing.
		 * Hence don't limit memblock allocations.
		 */
		ppc64_rma_size = ULONG_MAX;
		memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
	}

	if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
		hugetlbpage_init_defaultsize();

	/* Neither hash (HPTE table) nor radix is left enabled: cannot continue. */
	if (!(cur_cpu_spec->mmu_features & (MMU_FTR_HPTE_TABLE | MMU_FTR_TYPE_RADIX)))
		panic("kernel does not support any MMU type offered by platform");
}
680
#endif /* CONFIG_PPC_BOOK3S_64 */
681
682