/*
 * CoCalc -- mmu.c
 *
 * GitHub Repository: torvalds/linux
 * Path: blob/master/arch/arm64/mm/mmu.c
 * 26424 views
 */
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
 * Based on arch/arm/mm/mmu.c
4
 *
5
 * Copyright (C) 1995-2005 Russell King
6
 * Copyright (C) 2012 ARM Ltd.
7
 */
8

9
#include <linux/cache.h>
10
#include <linux/export.h>
11
#include <linux/kernel.h>
12
#include <linux/errno.h>
13
#include <linux/init.h>
14
#include <linux/ioport.h>
15
#include <linux/kexec.h>
16
#include <linux/libfdt.h>
17
#include <linux/mman.h>
18
#include <linux/nodemask.h>
19
#include <linux/memblock.h>
20
#include <linux/memremap.h>
21
#include <linux/memory.h>
22
#include <linux/fs.h>
23
#include <linux/io.h>
24
#include <linux/mm.h>
25
#include <linux/vmalloc.h>
26
#include <linux/set_memory.h>
27
#include <linux/kfence.h>
28
#include <linux/pkeys.h>
29
#include <linux/mm_inline.h>
30

31
#include <asm/barrier.h>
32
#include <asm/cputype.h>
33
#include <asm/fixmap.h>
34
#include <asm/kasan.h>
35
#include <asm/kernel-pgtable.h>
36
#include <asm/sections.h>
37
#include <asm/setup.h>
38
#include <linux/sizes.h>
39
#include <asm/tlb.h>
40
#include <asm/mmu_context.h>
41
#include <asm/ptdump.h>
42
#include <asm/tlbflush.h>
43
#include <asm/pgalloc.h>
44
#include <asm/kfence.h>
45

46
#define NO_BLOCK_MAPPINGS	BIT(0)
47
#define NO_CONT_MAPPINGS	BIT(1)
48
#define NO_EXEC_MAPPINGS	BIT(2)	/* assumes FEAT_HPDS is not used */
49

50
u64 kimage_voffset __ro_after_init;
51
EXPORT_SYMBOL(kimage_voffset);
52

53
u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };
54

55
static bool rodata_is_rw __ro_after_init = true;
56

57
/*
58
 * The booting CPU updates the failed status @__early_cpu_boot_status,
59
 * with MMU turned off.
60
 */
61
long __section(".mmuoff.data.write") __early_cpu_boot_status;
62

63
/*
64
 * Empty_zero_page is a special page that is used for zero-initialized data
65
 * and COW.
66
 */
67
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
68
EXPORT_SYMBOL(empty_zero_page);
69

70
static DEFINE_SPINLOCK(swapper_pgdir_lock);
71
static DEFINE_MUTEX(fixmap_lock);
72

73
/*
 * Write @pgd into the pgd slot @pgdp.
 *
 * While rodata_is_rw is still set the slot is written directly, followed
 * by dsb(ishst) and isb(). Otherwise the slot is written through a
 * temporary pgd fixmap mapping of its physical address, with
 * swapper_pgdir_lock held for the duration.
 */
void noinstr set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
{
	pgd_t *fixmap_pgdp;

	/*
	 * Don't bother with the fixmap if swapper_pg_dir is still mapped
	 * writable in the kernel mapping.
	 */
	if (rodata_is_rw) {
		WRITE_ONCE(*pgdp, pgd);
		dsb(ishst);
		isb();
		return;
	}

	spin_lock(&swapper_pgdir_lock);
	fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp));
	WRITE_ONCE(*fixmap_pgdp, pgd);
	/*
	 * We need dsb(ishst) here to ensure the page-table-walker sees
	 * our new entry before set_p?d() returns. The fixmap's
	 * flush_tlb_kernel_range() via clear_fixmap() does this for us.
	 */
	pgd_clear_fixmap();
	spin_unlock(&swapper_pgdir_lock);
}
99

100
/*
 * Choose the protection for a mapping of physical page @pfn.
 *
 * Returns pgprot_noncached(@vma_prot) when pfn_is_map_memory(@pfn) is
 * false, pgprot_writecombine(@vma_prot) when it is true and @file was
 * opened with O_SYNC, and @vma_prot unchanged otherwise. @size is not
 * consulted.
 */
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
			      unsigned long size, pgprot_t vma_prot)
{
	if (!pfn_is_map_memory(pfn))
		return pgprot_noncached(vma_prot);
	else if (file->f_flags & O_SYNC)
		return pgprot_writecombine(vma_prot);
	return vma_prot;
}
EXPORT_SYMBOL(phys_mem_access_prot);
110

111
/*
 * Boot-time page-table page allocator.
 *
 * Allocates one PAGE_SIZE-aligned page from memblock with
 * MEMBLOCK_ALLOC_NOLEAKTRACE and returns its physical address. Panics
 * if the allocation fails. @pgtable_type is accepted to match the
 * allocator callback signature but is not used here.
 */
static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type)
{
	phys_addr_t phys;

	phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
					 MEMBLOCK_ALLOC_NOLEAKTRACE);
	if (!phys)
		panic("Failed to allocate page table page\n");

	return phys;
}
122

123
/*
 * Decide whether a live entry may be changed from @old to @new in place.
 *
 * Returns true when either value is not a valid entry. Returns false
 * when the pfn differs, when either value has PTE_CONT set, or when
 * PTE_NG is set in @old but clear in @new. Otherwise returns true only
 * if @old and @new differ solely in bits covered by the mask built below.
 */
bool pgattr_change_is_safe(pteval_t old, pteval_t new)
{
	/*
	 * The following mapping attributes may be updated in live
	 * kernel mappings without the need for break-before-make.
	 */
	pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG |
			PTE_SWBITS_MASK;

	/* creating or taking down mappings is always safe */
	if (!pte_valid(__pte(old)) || !pte_valid(__pte(new)))
		return true;

	/* A live entry's pfn should not change */
	if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
		return false;

	/* live contiguous mappings may not be manipulated at all */
	if ((old | new) & PTE_CONT)
		return false;

	/* Transitioning from Non-Global to Global is unsafe */
	if (old & ~new & PTE_NG)
		return false;

	/*
	 * Changing the memory type between Normal and Normal-Tagged is safe
	 * since Tagged is considered a permission attribute from the
	 * mismatched attribute aliases perspective.
	 */
	if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
	     (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) &&
	    ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
	     (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)))
		mask |= PTE_ATTRINDX_MASK;

	/* Any difference outside the permitted mask makes the change unsafe. */
	return ((old ^ new) & ~mask) == 0;
}
161

162
static void init_clear_pgtable(void *table)
163
{
164
	clear_page(table);
165

166
	/* Ensure the zeroing is observed by page table walks. */
167
	dsb(ishst);
168
}
169

170
static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
171
		     phys_addr_t phys, pgprot_t prot)
172
{
173
	do {
174
		pte_t old_pte = __ptep_get(ptep);
175

176
		/*
177
		 * Required barriers to make this visible to the table walker
178
		 * are deferred to the end of alloc_init_cont_pte().
179
		 */
180
		__set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot));
181

182
		/*
183
		 * After the PTE entry has been populated once, we
184
		 * only allow updates to the permission attributes.
185
		 */
186
		BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
187
					      pte_val(__ptep_get(ptep))));
188

189
		phys += PAGE_SIZE;
190
	} while (ptep++, addr += PAGE_SIZE, addr != end);
191
}
192

193
static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
194
				unsigned long end, phys_addr_t phys,
195
				pgprot_t prot,
196
				phys_addr_t (*pgtable_alloc)(enum pgtable_type),
197
				int flags)
198
{
199
	unsigned long next;
200
	pmd_t pmd = READ_ONCE(*pmdp);
201
	pte_t *ptep;
202

203
	BUG_ON(pmd_sect(pmd));
204
	if (pmd_none(pmd)) {
205
		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
206
		phys_addr_t pte_phys;
207

208
		if (flags & NO_EXEC_MAPPINGS)
209
			pmdval |= PMD_TABLE_PXN;
210
		BUG_ON(!pgtable_alloc);
211
		pte_phys = pgtable_alloc(TABLE_PTE);
212
		ptep = pte_set_fixmap(pte_phys);
213
		init_clear_pgtable(ptep);
214
		ptep += pte_index(addr);
215
		__pmd_populate(pmdp, pte_phys, pmdval);
216
	} else {
217
		BUG_ON(pmd_bad(pmd));
218
		ptep = pte_set_fixmap_offset(pmdp, addr);
219
	}
220

221
	do {
222
		pgprot_t __prot = prot;
223

224
		next = pte_cont_addr_end(addr, end);
225

226
		/* use a contiguous mapping if the range is suitably aligned */
227
		if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
228
		    (flags & NO_CONT_MAPPINGS) == 0)
229
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
230

231
		init_pte(ptep, addr, next, phys, __prot);
232

233
		ptep += pte_index(next) - pte_index(addr);
234
		phys += next - addr;
235
	} while (addr = next, addr != end);
236

237
	/*
238
	 * Note: barriers and maintenance necessary to clear the fixmap slot
239
	 * ensure that all previous pgtable writes are visible to the table
240
	 * walker.
241
	 */
242
	pte_clear_fixmap();
243
}
244

245
static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
246
		     phys_addr_t phys, pgprot_t prot,
247
		     phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags)
248
{
249
	unsigned long next;
250

251
	do {
252
		pmd_t old_pmd = READ_ONCE(*pmdp);
253

254
		next = pmd_addr_end(addr, end);
255

256
		/* try section mapping first */
257
		if (((addr | next | phys) & ~PMD_MASK) == 0 &&
258
		    (flags & NO_BLOCK_MAPPINGS) == 0) {
259
			pmd_set_huge(pmdp, phys, prot);
260

261
			/*
262
			 * After the PMD entry has been populated once, we
263
			 * only allow updates to the permission attributes.
264
			 */
265
			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
266
						      READ_ONCE(pmd_val(*pmdp))));
267
		} else {
268
			alloc_init_cont_pte(pmdp, addr, next, phys, prot,
269
					    pgtable_alloc, flags);
270

271
			BUG_ON(pmd_val(old_pmd) != 0 &&
272
			       pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
273
		}
274
		phys += next - addr;
275
	} while (pmdp++, addr = next, addr != end);
276
}
277

278
static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
279
				unsigned long end, phys_addr_t phys,
280
				pgprot_t prot,
281
				phys_addr_t (*pgtable_alloc)(enum pgtable_type),
282
				int flags)
283
{
284
	unsigned long next;
285
	pud_t pud = READ_ONCE(*pudp);
286
	pmd_t *pmdp;
287

288
	/*
289
	 * Check for initial section mappings in the pgd/pud.
290
	 */
291
	BUG_ON(pud_sect(pud));
292
	if (pud_none(pud)) {
293
		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
294
		phys_addr_t pmd_phys;
295

296
		if (flags & NO_EXEC_MAPPINGS)
297
			pudval |= PUD_TABLE_PXN;
298
		BUG_ON(!pgtable_alloc);
299
		pmd_phys = pgtable_alloc(TABLE_PMD);
300
		pmdp = pmd_set_fixmap(pmd_phys);
301
		init_clear_pgtable(pmdp);
302
		pmdp += pmd_index(addr);
303
		__pud_populate(pudp, pmd_phys, pudval);
304
	} else {
305
		BUG_ON(pud_bad(pud));
306
		pmdp = pmd_set_fixmap_offset(pudp, addr);
307
	}
308

309
	do {
310
		pgprot_t __prot = prot;
311

312
		next = pmd_cont_addr_end(addr, end);
313

314
		/* use a contiguous mapping if the range is suitably aligned */
315
		if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
316
		    (flags & NO_CONT_MAPPINGS) == 0)
317
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
318

319
		init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
320

321
		pmdp += pmd_index(next) - pmd_index(addr);
322
		phys += next - addr;
323
	} while (addr = next, addr != end);
324

325
	pmd_clear_fixmap();
326
}
327

328
static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
329
			   phys_addr_t phys, pgprot_t prot,
330
			   phys_addr_t (*pgtable_alloc)(enum pgtable_type),
331
			   int flags)
332
{
333
	unsigned long next;
334
	p4d_t p4d = READ_ONCE(*p4dp);
335
	pud_t *pudp;
336

337
	if (p4d_none(p4d)) {
338
		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF;
339
		phys_addr_t pud_phys;
340

341
		if (flags & NO_EXEC_MAPPINGS)
342
			p4dval |= P4D_TABLE_PXN;
343
		BUG_ON(!pgtable_alloc);
344
		pud_phys = pgtable_alloc(TABLE_PUD);
345
		pudp = pud_set_fixmap(pud_phys);
346
		init_clear_pgtable(pudp);
347
		pudp += pud_index(addr);
348
		__p4d_populate(p4dp, pud_phys, p4dval);
349
	} else {
350
		BUG_ON(p4d_bad(p4d));
351
		pudp = pud_set_fixmap_offset(p4dp, addr);
352
	}
353

354
	do {
355
		pud_t old_pud = READ_ONCE(*pudp);
356

357
		next = pud_addr_end(addr, end);
358

359
		/*
360
		 * For 4K granule only, attempt to put down a 1GB block
361
		 */
362
		if (pud_sect_supported() &&
363
		   ((addr | next | phys) & ~PUD_MASK) == 0 &&
364
		    (flags & NO_BLOCK_MAPPINGS) == 0) {
365
			pud_set_huge(pudp, phys, prot);
366

367
			/*
368
			 * After the PUD entry has been populated once, we
369
			 * only allow updates to the permission attributes.
370
			 */
371
			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
372
						      READ_ONCE(pud_val(*pudp))));
373
		} else {
374
			alloc_init_cont_pmd(pudp, addr, next, phys, prot,
375
					    pgtable_alloc, flags);
376

377
			BUG_ON(pud_val(old_pud) != 0 &&
378
			       pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
379
		}
380
		phys += next - addr;
381
	} while (pudp++, addr = next, addr != end);
382

383
	pud_clear_fixmap();
384
}
385

386
static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
387
			   phys_addr_t phys, pgprot_t prot,
388
			   phys_addr_t (*pgtable_alloc)(enum pgtable_type),
389
			   int flags)
390
{
391
	unsigned long next;
392
	pgd_t pgd = READ_ONCE(*pgdp);
393
	p4d_t *p4dp;
394

395
	if (pgd_none(pgd)) {
396
		pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF;
397
		phys_addr_t p4d_phys;
398

399
		if (flags & NO_EXEC_MAPPINGS)
400
			pgdval |= PGD_TABLE_PXN;
401
		BUG_ON(!pgtable_alloc);
402
		p4d_phys = pgtable_alloc(TABLE_P4D);
403
		p4dp = p4d_set_fixmap(p4d_phys);
404
		init_clear_pgtable(p4dp);
405
		p4dp += p4d_index(addr);
406
		__pgd_populate(pgdp, p4d_phys, pgdval);
407
	} else {
408
		BUG_ON(pgd_bad(pgd));
409
		p4dp = p4d_set_fixmap_offset(pgdp, addr);
410
	}
411

412
	do {
413
		p4d_t old_p4d = READ_ONCE(*p4dp);
414

415
		next = p4d_addr_end(addr, end);
416

417
		alloc_init_pud(p4dp, addr, next, phys, prot,
418
			       pgtable_alloc, flags);
419

420
		BUG_ON(p4d_val(old_p4d) != 0 &&
421
		       p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));
422

423
		phys += next - addr;
424
	} while (p4dp++, addr = next, addr != end);
425

426
	p4d_clear_fixmap();
427
}
428

429
static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
430
					unsigned long virt, phys_addr_t size,
431
					pgprot_t prot,
432
					phys_addr_t (*pgtable_alloc)(enum pgtable_type),
433
					int flags)
434
{
435
	unsigned long addr, end, next;
436
	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
437

438
	/*
439
	 * If the virtual and physical address don't have the same offset
440
	 * within a page, we cannot map the region as the caller expects.
441
	 */
442
	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
443
		return;
444

445
	phys &= PAGE_MASK;
446
	addr = virt & PAGE_MASK;
447
	end = PAGE_ALIGN(virt + size);
448

449
	do {
450
		next = pgd_addr_end(addr, end);
451
		alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
452
			       flags);
453
		phys += next - addr;
454
	} while (pgdp++, addr = next, addr != end);
455
}
456

457
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
458
				 unsigned long virt, phys_addr_t size,
459
				 pgprot_t prot,
460
				 phys_addr_t (*pgtable_alloc)(enum pgtable_type),
461
				 int flags)
462
{
463
	mutex_lock(&fixmap_lock);
464
	__create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
465
				    pgtable_alloc, flags);
466
	mutex_unlock(&fixmap_lock);
467
}
468

469
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
/*
 * create_kpti_ng_temp_pgd() is declared as an alias of
 * __create_pgd_mapping_locked(), so calling it runs the mapping code
 * without taking fixmap_lock.
 */
extern __alias(__create_pgd_mapping_locked)
void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
			     phys_addr_t size, pgprot_t prot,
			     phys_addr_t (*pgtable_alloc)(enum pgtable_type),
			     int flags);
#endif
476

477
static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
478
				       enum pgtable_type pgtable_type)
479
{
480
	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
481
	struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
482
	phys_addr_t pa;
483

484
	BUG_ON(!ptdesc);
485
	pa = page_to_phys(ptdesc_page(ptdesc));
486

487
	switch (pgtable_type) {
488
	case TABLE_PTE:
489
		BUG_ON(!pagetable_pte_ctor(mm, ptdesc));
490
		break;
491
	case TABLE_PMD:
492
		BUG_ON(!pagetable_pmd_ctor(mm, ptdesc));
493
		break;
494
	case TABLE_PUD:
495
		pagetable_pud_ctor(ptdesc);
496
		break;
497
	case TABLE_P4D:
498
		pagetable_p4d_ctor(ptdesc);
499
		break;
500
	}
501

502
	return pa;
503
}
504

505
static phys_addr_t __maybe_unused
506
pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
507
{
508
	return __pgd_pgtable_alloc(&init_mm, pgtable_type);
509
}
510

511
static phys_addr_t
512
pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
513
{
514
	return __pgd_pgtable_alloc(NULL, pgtable_type);
515
}
516

517
/*
518
 * This function can only be used to modify existing table entries,
519
 * without allocating new levels of table. Note that this permits the
520
 * creation of new section or page entries.
521
 */
522
void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
523
				   phys_addr_t size, pgprot_t prot)
524
{
525
	if (virt < PAGE_OFFSET) {
526
		pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
527
			&phys, virt);
528
		return;
529
	}
530
	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
531
			     NO_CONT_MAPPINGS);
532
}
533

534
/*
 * Create a mapping of @size bytes from @virt to @phys in the page
 * tables of @mm, allocating table pages with
 * pgd_pgtable_alloc_special_mm().
 *
 * @mm must not be &init_mm (BUG otherwise). When @page_mappings_only is
 * true, both block and contiguous mappings are disabled.
 */
void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
			       unsigned long virt, phys_addr_t size,
			       pgprot_t prot, bool page_mappings_only)
{
	int flags = 0;

	BUG_ON(mm == &init_mm);

	if (page_mappings_only)
		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	__create_pgd_mapping(mm->pgd, phys, virt, size, prot,
			     pgd_pgtable_alloc_special_mm, flags);
}
548

549
/*
 * Re-apply protection @prot to the existing init_mm mapping of
 * [@virt, @virt + @size) -> @phys, then flush the kernel TLB for that
 * virtual range.
 *
 * No table pages are allocated (NULL allocator) and contiguous mappings
 * are not used. If @virt is below PAGE_OFFSET a warning is printed and
 * nothing is changed.
 */
static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
				phys_addr_t size, pgprot_t prot)
{
	if (virt < PAGE_OFFSET) {
		pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n",
			&phys, virt);
		return;
	}

	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
			     NO_CONT_MAPPINGS);

	/* flush the TLBs after updating live kernel mappings */
	flush_tlb_kernel_range(virt, virt + size);
}
564

565
/*
 * Map the physical range [@start, @end) at its __phys_to_virt() address
 * in the tables rooted at @pgdp, using early_pgtable_alloc() for any
 * table pages that are needed.
 */
static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
				  phys_addr_t end, pgprot_t prot, int flags)
{
	__create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
			     prot, early_pgtable_alloc, flags);
}
571

572
/*
 * Switch the linear-map alias of [_stext, __init_begin) to
 * PAGE_KERNEL_RO via update_mapping_prot().
 */
void __init mark_linear_text_alias_ro(void)
{
	/*
	 * Remove the write permissions from the linear alias of .text/.rodata
	 */
	update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext),
			    (unsigned long)__init_begin - (unsigned long)_stext,
			    PAGE_KERNEL_RO);
}
581

582
#ifdef CONFIG_KFENCE
583

584
bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;
585

586
/* early_param() will be parsed before map_mem() below. */
587
static int __init parse_kfence_early_init(char *arg)
588
{
589
	int val;
590

591
	if (get_option(&arg, &val))
592
		kfence_early_init = !!val;
593
	return 0;
594
}
595
early_param("kfence.sample_interval", parse_kfence_early_init);
596

597
/*
 * Reserve the KFENCE pool from memblock during early boot.
 *
 * Returns 0 when kfence_early_init is false. On allocation failure an
 * error is logged, kfence_early_init is cleared and 0 is returned.
 * On success the pool is marked NOMAP and its physical address is
 * returned; arm64_kfence_map_pool() later clears the NOMAP mark.
 */
static phys_addr_t __init arm64_kfence_alloc_pool(void)
{
	phys_addr_t kfence_pool;

	if (!kfence_early_init)
		return 0;

	kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
	if (!kfence_pool) {
		pr_err("failed to allocate kfence pool\n");
		kfence_early_init = false;
		return 0;
	}

	/* Temporarily mark as NOMAP. */
	memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);

	return kfence_pool;
}
616

617
/*
 * Map the KFENCE pool at physical address @kfence_pool into the tables
 * rooted at @pgdp using page-granular mappings only, clear its NOMAP
 * mark, and publish its virtual address in __kfence_pool.
 *
 * Does nothing when @kfence_pool is 0.
 */
static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
{
	if (!kfence_pool)
		return;

	/* KFENCE pool needs page-level mapping. */
	__map_memblock(pgdp, kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
			pgprot_tagged(PAGE_KERNEL),
			NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
	memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
	__kfence_pool = phys_to_virt(kfence_pool);
}
629
#else /* CONFIG_KFENCE */
630

631
static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
632
static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { }
633

634
#endif /* CONFIG_KFENCE */
635

636
/*
 * Build the linear mapping of memory in the tables rooted at @pgdp.
 *
 * Every memblock memory range is mapped with pgprot_tagged(PAGE_KERNEL)
 * and NO_EXEC_MAPPINGS; block and contiguous mappings are additionally
 * disabled when can_set_direct_map() is true. The kernel image range
 * [_stext, __init_begin) is marked NOMAP around that loop and then
 * mapped separately with PAGE_KERNEL and NO_CONT_MAPPINGS. The early
 * KFENCE pool, if one was allocated, is mapped last.
 */
static void __init map_mem(pgd_t *pgdp)
{
	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
	phys_addr_t kernel_start = __pa_symbol(_stext);
	phys_addr_t kernel_end = __pa_symbol(__init_begin);
	phys_addr_t start, end;
	phys_addr_t early_kfence_pool;
	int flags = NO_EXEC_MAPPINGS;
	u64 i;

	/*
	 * Setting hierarchical PXNTable attributes on table entries covering
	 * the linear region is only possible if it is guaranteed that no table
	 * entries at any level are being shared between the linear region and
	 * the vmalloc region. Check whether this is true for the PGD level, in
	 * which case it is guaranteed to be true for all other levels as well.
	 * (Unless we are running with support for LPA2, in which case the
	 * entire reduced VA space is covered by a single pgd_t which will have
	 * been populated without the PXNTable attribute by the time we get here.)
	 */
	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end) &&
		     pgd_index(_PAGE_OFFSET(VA_BITS_MIN)) != PTRS_PER_PGD - 1);

	early_kfence_pool = arm64_kfence_alloc_pool();

	if (can_set_direct_map())
		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	/*
	 * Take care not to create a writable alias for the
	 * read-only text and rodata sections of the kernel image.
	 * So temporarily mark them as NOMAP to skip mappings in
	 * the following for-loop
	 */
	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);

	/* map all the memory banks */
	for_each_mem_range(i, &start, &end) {
		if (start >= end)
			break;
		/*
		 * The linear map must allow allocation tags reading/writing
		 * if MTE is present. Otherwise, it has the same attributes as
		 * PAGE_KERNEL.
		 */
		__map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
			       flags);
	}

	/*
	 * Map the linear alias of the [_stext, __init_begin) interval
	 * as non-executable now, and remove the write permission in
	 * mark_linear_text_alias_ro() below (which will be called after
	 * alternative patching has completed). This makes the contents
	 * of the region accessible to subsystems such as hibernate,
	 * but protects it from inadvertent modification or execution.
	 * Note that contiguous mappings cannot be remapped in this way,
	 * so we should avoid them here.
	 */
	__map_memblock(pgdp, kernel_start, kernel_end,
		       PAGE_KERNEL, NO_CONT_MAPPINGS);
	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
	arm64_kfence_map_pool(early_kfence_pool, pgdp);
}
700

701
void mark_rodata_ro(void)
702
{
703
	unsigned long section_size;
704

705
	/*
706
	 * mark .rodata as read only. Use __init_begin rather than __end_rodata
707
	 * to cover NOTES and EXCEPTION_TABLE.
708
	 */
709
	section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
710
	WRITE_ONCE(rodata_is_rw, false);
711
	update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
712
			    section_size, PAGE_KERNEL_RO);
713
}
714

715
/*
 * Fill in @vma to describe the kernel image range [@va_start, @va_end)
 * and register it with vm_area_add_early().
 *
 * The physical start and the size must both be page aligned (BUG
 * otherwise). One extra page is added to the recorded size unless
 * @vm_flags contains VM_NO_GUARD. VM_MAP is always added to the flags.
 */
static void __init declare_vma(struct vm_struct *vma,
			       void *va_start, void *va_end,
			       unsigned long vm_flags)
{
	phys_addr_t pa_start = __pa_symbol(va_start);
	unsigned long size = va_end - va_start;

	BUG_ON(!PAGE_ALIGNED(pa_start));
	BUG_ON(!PAGE_ALIGNED(size));

	if (!(vm_flags & VM_NO_GUARD))
		size += PAGE_SIZE;

	vma->addr	= va_start;
	vma->phys_addr	= pa_start;
	vma->size	= size;
	vma->flags	= VM_MAP | vm_flags;
	vma->caller	= __builtin_return_address(0);

	vm_area_add_early(vma);
}
736

737
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
738
static pgprot_t kernel_exec_prot(void)
739
{
740
	return rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
741
}
742

743
/*
 * core_initcall that sets up the entry trampoline mappings.
 *
 * Does nothing unless arm64_kernel_unmapped_at_el0() is true. Otherwise
 * tramp_pg_dir is cleared and the trampoline text is mapped into it at
 * TRAMP_VALIAS without block mappings, and each page of the text is
 * also installed in the fixmap at descending slots starting from
 * FIX_ENTRY_TRAMP_TEXT1. With CONFIG_RELOCATABLE, the page following
 * the text is mapped read-only in the next fixmap slot. Always
 * returns 0.
 */
static int __init map_entry_trampoline(void)
{
	int i;

	if (!arm64_kernel_unmapped_at_el0())
		return 0;

	pgprot_t prot = kernel_exec_prot();
	phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);

	/* The trampoline is always mapped and can therefore be global */
	pgprot_val(prot) &= ~PTE_NG;

	/* Map only the text into the trampoline page table */
	memset(tramp_pg_dir, 0, PGD_SIZE);
	__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
			     entry_tramp_text_size(), prot,
			     pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS);

	/* Map both the text and data into the kernel page table */
	for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
			     pa_start + i * PAGE_SIZE, prot);

	/* i now indexes the first page after the text. */
	if (IS_ENABLED(CONFIG_RELOCATABLE))
		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
			     pa_start + i * PAGE_SIZE, PAGE_KERNEL_RO);

	return 0;
}
core_initcall(map_entry_trampoline);
774
#endif
775

776
/*
777
 * Declare the VMA areas for the kernel
778
 */
779
static void __init declare_kernel_vmas(void)
780
{
781
	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
782

783
	declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
784
	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
785
	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
786
	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
787
	declare_vma(&vmlinux_seg[4], _data, _end, 0);
788
}
789

790
void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
791
		    int level, pte_t *tbl, bool may_use_cont, u64 va_offset);
792

793
static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
794
	  kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
795

796
/*
 * Populate idmap_pg_dir.
 *
 * [__idmap_text_start, __idmap_text_end) is mapped at its own physical
 * address (pa == start) with PAGE_KERNEL_ROX, using idmap_ptes as table
 * storage. When CONFIG_UNMAP_KERNEL_AT_EL0 is enabled and
 * arm64_use_ng_mappings is false, the u32 __idmap_kpti_flag is
 * additionally mapped with PAGE_KERNEL, using kpti_ptes as table storage.
 */
static void __init create_idmap(void)
{
	u64 start = __pa_symbol(__idmap_text_start);
	u64 end   = __pa_symbol(__idmap_text_end);
	u64 ptep  = __pa_symbol(idmap_ptes);

	__pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX,
		       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
		       __phys_to_virt(ptep) - ptep);

	if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) {
		extern u32 __idmap_kpti_flag;
		u64 pa = __pa_symbol(&__idmap_kpti_flag);

		/*
		 * The KPTI G-to-nG conversion code needs a read-write mapping
		 * of its synchronization flag in the ID map.
		 */
		ptep = __pa_symbol(kpti_ptes);
		__pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
			       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
			       __phys_to_virt(ptep) - ptep);
	}
}
820

821
/*
 * Boot-time paging setup: build the linear map in swapper_pg_dir,
 * allow memblock to resize its arrays, create the ID map and declare
 * the kernel image VMAs, in that order.
 */
void __init paging_init(void)
{
	map_mem(swapper_pg_dir);

	memblock_allow_resize();

	create_idmap();
	declare_kernel_vmas();
}
830

831
#ifdef CONFIG_MEMORY_HOTPLUG
832
/*
 * Release @size bytes of memory starting at @page.
 *
 * With an @altmap the pages are handed back via vmem_altmap_free().
 * Otherwise a warning is raised if @page is reserved, and the pages are
 * freed with free_pages() at the order computed from @size.
 */
static void free_hotplug_page_range(struct page *page, size_t size,
				    struct vmem_altmap *altmap)
{
	if (altmap) {
		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
	} else {
		WARN_ON(PageReserved(page));
		free_pages((unsigned long)page_address(page), get_order(size));
	}
}
842

843
static void free_hotplug_pgtable_page(struct page *page)
844
{
845
	free_hotplug_page_range(page, PAGE_SIZE, NULL);
846
}
847

848
/*
 * Check whether [@start, @end), once @start is rounded down with @mask,
 * stays within [@floor, @ceiling).
 *
 * Returns false if the masked @start is below @floor, if a non-zero
 * @ceiling becomes zero once masked, or if @end lies beyond the masked
 * @ceiling. A @ceiling of 0 places no upper bound: the comparison is
 * done on (value - 1) so that 0 wraps to the maximum unsigned long.
 */
static bool pgtable_range_aligned(unsigned long start, unsigned long end,
				  unsigned long floor, unsigned long ceiling,
				  unsigned long mask)
{
	start &= mask;
	if (start < floor)
		return false;

	if (ceiling) {
		ceiling &= mask;
		if (!ceiling)
			return false;
	}

	if (end - 1 > ceiling - 1)
		return false;
	return true;
}
866

867
/*
 * Tear down the PTEs under @pmdp that cover [@addr, @end).
 *
 * Empty entries are skipped. Each populated entry is cleared and the
 * kernel TLB is flushed for that page; a non-present entry triggers a
 * warning first. When @free_mapped is true the page that was mapped is
 * released through free_hotplug_page_range() with @altmap.
 */
static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	pte_t *ptep, pte;

	do {
		ptep = pte_offset_kernel(pmdp, addr);
		pte = __ptep_get(ptep);
		if (pte_none(pte))
			continue;

		WARN_ON(!pte_present(pte));
		__pte_clear(&init_mm, addr, ptep);
		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
		if (free_mapped)
			free_hotplug_page_range(pte_page(pte),
						PAGE_SIZE, altmap);
	} while (addr += PAGE_SIZE, addr < end);
}
887

888
/*
 * Tear down the PMD entries under @pudp that cover [@addr, @end).
 *
 * Empty entries are skipped. A section (block) entry is cleared, the
 * TLB is flushed, and with @free_mapped the PMD_SIZE region it mapped
 * is released. Any other entry is expected to be a table (warning
 * otherwise) and is descended into with unmap_hotplug_pte_range().
 */
static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	pmd_t *pmdp, pmd;

	do {
		next = pmd_addr_end(addr, end);
		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);
		if (pmd_none(pmd))
			continue;

		WARN_ON(!pmd_present(pmd));
		if (pmd_sect(pmd)) {
			pmd_clear(pmdp);

			/*
			 * One TLBI should be sufficient here as the PMD_SIZE
			 * range is mapped with a single block entry.
			 */
			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
			if (free_mapped)
				free_hotplug_page_range(pmd_page(pmd),
							PMD_SIZE, altmap);
			continue;
		}
		WARN_ON(!pmd_table(pmd));
		unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}
920

921
/*
 * Tear down the PUD entries under @p4dp that cover [@addr, @end).
 *
 * Empty entries are skipped. A section (block) entry is cleared, the
 * TLB is flushed, and with @free_mapped the PUD_SIZE region it mapped
 * is released. Any other entry is expected to be a table (warning
 * otherwise) and is descended into with unmap_hotplug_pmd_range().
 */
static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	pud_t *pudp, pud;

	do {
		next = pud_addr_end(addr, end);
		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);
		if (pud_none(pud))
			continue;

		WARN_ON(!pud_present(pud));
		if (pud_sect(pud)) {
			pud_clear(pudp);

			/*
			 * One TLBI should be sufficient here as the PUD_SIZE
			 * range is mapped with a single block entry.
			 */
			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
			if (free_mapped)
				free_hotplug_page_range(pud_page(pud),
							PUD_SIZE, altmap);
			continue;
		}
		WARN_ON(!pud_table(pud));
		unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}
953

954
/*
 * Walk the P4D entries under @pgdp that cover [@addr, @end) and unmap
 * each populated one via unmap_hotplug_pud_range(). Empty entries are
 * skipped; a populated but non-present entry triggers a warning.
 */
static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	p4d_t *p4dp, p4d;

	do {
		next = p4d_addr_end(addr, end);
		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);
		if (p4d_none(p4d))
			continue;

		WARN_ON(!p4d_present(p4d));
		unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}
972

973
static void unmap_hotplug_range(unsigned long addr, unsigned long end,
974
				bool free_mapped, struct vmem_altmap *altmap)
975
{
976
	unsigned long next;
977
	pgd_t *pgdp, pgd;
978

979
	/*
980
	 * altmap can only be used as vmemmap mapping backing memory.
981
	 * In case the backing memory itself is not being freed, then
982
	 * altmap is irrelevant. Warn about this inconsistency when
983
	 * encountered.
984
	 */
985
	WARN_ON(!free_mapped && altmap);
986

987
	do {
988
		next = pgd_addr_end(addr, end);
989
		pgdp = pgd_offset_k(addr);
990
		pgd = READ_ONCE(*pgdp);
991
		if (pgd_none(pgd))
992
			continue;
993

994
		WARN_ON(!pgd_present(pgd));
995
		unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
996
	} while (addr = next, addr < end);
997
}
998

999
/*
 * Free the PTE table referenced by @pmdp if it is entirely empty.
 *
 * First warns about any PTE in [@addr, @end) that is still populated.
 * The table is only considered for freeing when
 * pgtable_range_aligned() accepts the range against @floor/@ceiling at
 * PMD granularity, and it is only freed when all PTRS_PER_PTE entries
 * are none. Freeing clears the PMD entry, flushes the TLB for the
 * start address and then releases the table page.
 */
static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	pte_t *ptep, pte;
	unsigned long i, start = addr;

	do {
		ptep = pte_offset_kernel(pmdp, addr);
		pte = __ptep_get(ptep);

		/*
		 * This is just a sanity check here which verifies that
		 * pte clearing has been done by earlier unmap loops.
		 */
		WARN_ON(!pte_none(pte));
	} while (addr += PAGE_SIZE, addr < end);

	if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
		return;

	/*
	 * Check whether we can free the pte page if the rest of the
	 * entries are empty. Overlap with other regions have been
	 * handled by the floor/ceiling check.
	 */
	ptep = pte_offset_kernel(pmdp, 0UL);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		if (!pte_none(__ptep_get(&ptep[i])))
			return;
	}

	pmd_clear(pmdp);
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(ptep));
}
1035

1036
/*
 * Visit every populated PMD entry in [addr, end), letting
 * free_empty_pte_table() release empty PTE tables, then free the PMD
 * table itself when it is entirely empty and the range passes the
 * pgtable_range_aligned() check against PUD_MASK.
 */
static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	const unsigned long first = addr;
	unsigned long boundary, idx;
	pmd_t *pmdp, pmd;

	do {
		boundary = pmd_addr_end(addr, end);
		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);

		if (!pmd_none(pmd)) {
			/* Only table entries are expected at this point. */
			WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
			free_empty_pte_table(pmdp, addr, boundary, floor, ceiling);
		}
	} while (addr = boundary, addr < end);

	/* Nothing to free with two or fewer page table levels. */
	if (CONFIG_PGTABLE_LEVELS <= 2 ||
	    !pgtable_range_aligned(first, end, floor, ceiling, PUD_MASK))
		return;

	/*
	 * The pmd page can only go away if all of its entries are
	 * empty. Overlap with other regions has already been handled
	 * by the floor/ceiling check above.
	 */
	pmdp = pmd_offset(pudp, 0UL);
	for (idx = 0; idx < PTRS_PER_PMD; idx++) {
		if (!pmd_none(READ_ONCE(pmdp[idx])))
			return;
	}

	/* Clear the parent entry, flush, and only then free the page. */
	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(first);
	free_hotplug_pgtable_page(virt_to_page(pmdp));
}
1075

1076
/*
 * Visit every populated PUD entry in [addr, end), letting
 * free_empty_pmd_table() release empty PMD tables, then free the PUD
 * table itself when pgtable_l4_enabled() holds, the range passes the
 * pgtable_range_aligned() check against P4D_MASK, and the table is
 * entirely empty.
 */
static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	const unsigned long first = addr;
	unsigned long boundary, idx;
	pud_t *pudp, pud;

	do {
		boundary = pud_addr_end(addr, end);
		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);

		if (!pud_none(pud)) {
			/* Only table entries are expected at this point. */
			WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
			free_empty_pmd_table(pudp, addr, boundary, floor, ceiling);
		}
	} while (addr = boundary, addr < end);

	if (!pgtable_l4_enabled() ||
	    !pgtable_range_aligned(first, end, floor, ceiling, P4D_MASK))
		return;

	/*
	 * The pud page can only go away if all of its entries are
	 * empty. Overlap with other regions has already been handled
	 * by the floor/ceiling check above.
	 */
	pudp = pud_offset(p4dp, 0UL);
	for (idx = 0; idx < PTRS_PER_PUD; idx++) {
		if (!pud_none(READ_ONCE(pudp[idx])))
			return;
	}

	/* Clear the parent entry, flush, and only then free the page. */
	p4d_clear(p4dp);
	__flush_tlb_kernel_pgtable(first);
	free_hotplug_pgtable_page(virt_to_page(pudp));
}
1115

1116
/*
 * Visit every populated P4D entry in [addr, end), letting
 * free_empty_pud_table() release empty PUD tables, then free the P4D
 * table itself when pgtable_l5_enabled() holds, the range passes the
 * pgtable_range_aligned() check against PGDIR_MASK, and the table is
 * entirely empty.
 */
static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	const unsigned long first = addr;
	unsigned long boundary, idx;
	p4d_t *p4dp, p4d;

	do {
		boundary = p4d_addr_end(addr, end);
		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);

		if (!p4d_none(p4d)) {
			WARN_ON(!p4d_present(p4d));
			free_empty_pud_table(p4dp, addr, boundary, floor, ceiling);
		}
	} while (addr = boundary, addr < end);

	if (!pgtable_l5_enabled() ||
	    !pgtable_range_aligned(first, end, floor, ceiling, PGDIR_MASK))
		return;

	/*
	 * The p4d page can only go away if all of its entries are
	 * empty. Overlap with other regions has already been handled
	 * by the floor/ceiling check above.
	 */
	p4dp = p4d_offset(pgdp, 0UL);
	for (idx = 0; idx < PTRS_PER_P4D; idx++) {
		if (!p4d_none(READ_ONCE(p4dp[idx])))
			return;
	}

	/* Clear the parent entry, flush, and only then free the page. */
	pgd_clear(pgdp);
	__flush_tlb_kernel_pgtable(first);
	free_hotplug_pgtable_page(virt_to_page(p4dp));
}
1155

1156
static void free_empty_tables(unsigned long addr, unsigned long end,
1157
			      unsigned long floor, unsigned long ceiling)
1158
{
1159
	unsigned long next;
1160
	pgd_t *pgdp, pgd;
1161

1162
	do {
1163
		next = pgd_addr_end(addr, end);
1164
		pgdp = pgd_offset_k(addr);
1165
		pgd = READ_ONCE(*pgdp);
1166
		if (pgd_none(pgd))
1167
			continue;
1168

1169
		WARN_ON(!pgd_present(pgd));
1170
		free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
1171
	} while (addr = next, addr < end);
1172
}
1173
#endif
1174

1175
/*
 * Install a PROT_SECT_NORMAL huge mapping at @pmdp that points at the
 * physical address of @p. @node, @addr and @next are not used here.
 */
void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
			       unsigned long addr, unsigned long next)
{
	phys_addr_t phys = __pa(p);

	pmd_set_huge(pmdp, phys, __pgprot(PROT_SECT_NORMAL));
}
1180

1181
/*
 * Run vmemmap_verify() on the entry at @pmdp and report whether that
 * entry is currently a section mapping (non-zero) or not (zero).
 */
int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
				unsigned long addr, unsigned long next)
{
	pmd_t pmd;

	vmemmap_verify((pte_t *)pmdp, node, addr, next);

	/* Read the entry only after the verification call, as before. */
	pmd = READ_ONCE(*pmdp);
	return pmd_sect(pmd);
}
1188

1189
/*
 * Populate the vmemmap for [start, end).
 *
 * Huge pages are used only with 4K base pages and when the range spans
 * at least a full section's worth of struct pages; every other case
 * falls back to base pages. Returns whatever the chosen populate
 * helper returns.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
		struct vmem_altmap *altmap)
{
	const unsigned long section_map_size =
		PAGES_PER_SECTION * sizeof(struct page);

	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
	/* [start, end] should be within one section */
	WARN_ON_ONCE(end - start > PAGES_PER_SECTION * sizeof(struct page));

	if (IS_ENABLED(CONFIG_ARM64_4K_PAGES) &&
	    (end - start >= section_map_size))
		return vmemmap_populate_hugepages(start, end, node, altmap);

	return vmemmap_populate_basepages(start, end, node, altmap);
}
1202

1203
#ifdef CONFIG_MEMORY_HOTPLUG
1204
void vmemmap_free(unsigned long start, unsigned long end,
1205
		struct vmem_altmap *altmap)
1206
{
1207
	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
1208

1209
	unmap_hotplug_range(start, end, true, altmap);
1210
	free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
1211
}
1212
#endif /* CONFIG_MEMORY_HOTPLUG */
1213

1214
/*
 * Try to install a section mapping for @phys with protection @prot at
 * @pudp.
 *
 * Returns 1 when the entry was written, 0 when
 * pgattr_change_is_safe() rejects the transition from the current
 * entry value to the new one.
 */
int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
{
	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));

	/* Only allow permission changes for now */
	if (pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
				  pud_val(new_pud))) {
		VM_BUG_ON(phys & ~PUD_MASK);
		set_pud(pudp, new_pud);
		return 1;
	}

	return 0;
}
1227

1228
/*
 * Try to install a section mapping for @phys with protection @prot at
 * @pmdp.
 *
 * Returns 1 when the entry was written, 0 when
 * pgattr_change_is_safe() rejects the transition from the current
 * entry value to the new one.
 */
int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
{
	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));

	/* Only allow permission changes for now */
	if (pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
				  pmd_val(new_pmd))) {
		VM_BUG_ON(phys & ~PMD_MASK);
		set_pmd(pmdp, new_pmd);
		return 1;
	}

	return 0;
}
1241

1242
#ifndef __PAGETABLE_P4D_FOLDED
1243
/* No-op: this implementation does nothing with @p4dp. */
void p4d_clear_huge(p4d_t *p4dp)
{
}
1246
#endif
1247

1248
/*
 * Clear the entry at @pudp if it is a section mapping.
 *
 * Returns 1 when the entry was cleared, 0 when it was not a section
 * mapping and was left untouched.
 */
int pud_clear_huge(pud_t *pudp)
{
	if (pud_sect(READ_ONCE(*pudp))) {
		pud_clear(pudp);
		return 1;
	}

	return 0;
}
1255

1256
/*
 * Clear the entry at @pmdp if it is a section mapping.
 *
 * Returns 1 when the entry was cleared, 0 when it was not a section
 * mapping and was left untouched.
 */
int pmd_clear_huge(pmd_t *pmdp)
{
	if (pmd_sect(READ_ONCE(*pmdp))) {
		pmd_clear(pmdp);
		return 1;
	}

	return 0;
}
1263

1264
/*
 * Detach and free the PTE table referenced by the entry at @pmdp.
 *
 * If the entry is not a table entry, nothing is freed (VM_WARN_ON
 * fires). Always returns 1.
 */
int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
{
	pmd_t entry = READ_ONCE(*pmdp);
	pte_t *pte_page;

	if (!pmd_table(entry)) {
		VM_WARN_ON(1);
		return 1;
	}

	/* Look up the table before the entry pointing to it is cleared. */
	pte_page = pte_offset_kernel(pmdp, addr);

	/* Clear the entry, flush, and only then free the table page. */
	pmd_clear(pmdp);
	__flush_tlb_kernel_pgtable(addr);
	pte_free_kernel(NULL, pte_page);

	return 1;
}
1282

1283
/*
 * Detach and free the PMD table referenced by the entry at @pudp,
 * first releasing the PTE table behind every present PMD entry in the
 * PUD_SIZE span starting at @addr.
 *
 * If the entry is not a table entry, nothing is freed (VM_WARN_ON
 * fires). Always returns 1.
 */
int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
{
	pud_t entry = READ_ONCE(*pudp);
	unsigned long va, limit;
	pmd_t *pmd_page, *cur;

	if (!pud_table(entry)) {
		VM_WARN_ON(1);
		return 1;
	}

	pmd_page = pmd_offset(pudp, addr);
	limit = addr + PUD_SIZE;

	/* Step through the PMD entries in lockstep with the address. */
	for (cur = pmd_page, va = addr; va != limit; cur++, va += PMD_SIZE) {
		if (pmd_present(pmdp_get(cur)))
			pmd_free_pte_page(cur, va);
	}

	/* Clear the entry, flush, and only then free the table page. */
	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(addr);
	pmd_free(NULL, pmd_page);

	return 1;
}
1311

1312
#ifdef CONFIG_MEMORY_HOTPLUG
1313
/*
 * Remove the mapping of [start, start + size): unmap the range without
 * freeing the mapped memory, then reclaim page table pages left empty,
 * with PAGE_OFFSET and PAGE_END as floor and ceiling.
 *
 * @pgdir is only checked against init_mm.pgd; the walkers called here
 * do not take it as an argument.
 */
static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
{
	unsigned long end = start + size;

	/* Warn on an unexpected page table or an out-of-bounds range. */
	WARN_ON(pgdir != init_mm.pgd);
	WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));

	/* free_mapped == false and no altmap for this range. */
	unmap_hotplug_range(start, end, false, NULL);
	free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
}
1323

1324
struct range arch_get_mappable_range(void)
1325
{
1326
	struct range mhp_range;
1327
	u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
1328
	u64 end_linear_pa = __pa(PAGE_END - 1);
1329

1330
	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
1331
		/*
1332
		 * Check for a wrap, it is possible because of randomized linear
1333
		 * mapping the start physical address is actually bigger than
1334
		 * the end physical address. In this case set start to zero
1335
		 * because [0, end_linear_pa] range must still be able to cover
1336
		 * all addressable physical addresses.
1337
		 */
1338
		if (start_linear_pa > end_linear_pa)
1339
			start_linear_pa = 0;
1340
	}
1341

1342
	WARN_ON(start_linear_pa > end_linear_pa);
1343

1344
	/*
1345
	 * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)]
1346
	 * accommodating both its ends but excluding PAGE_END. Max physical
1347
	 * range which can be mapped inside this linear mapping range, must
1348
	 * also be derived from its end points.
1349
	 */
1350
	mhp_range.start = start_linear_pa;
1351
	mhp_range.end =  end_linear_pa;
1352

1353
	return mhp_range;
1354
}
1355

1356
/*
 * Map the physical range [start, start + size) into the linear map and
 * add its pages via __add_pages().
 *
 * On __add_pages() failure the freshly created mapping is removed
 * again and the error is returned. On success max_pfn and max_low_pfn
 * are raised if needed and 0 is returned.
 */
int arch_add_memory(int nid, u64 start, u64 size,
		    struct mhp_params *params)
{
	int map_flags = NO_EXEC_MAPPINGS;
	int err;

	VM_BUG_ON(!mhp_range_allowed(start, size, true));

	/* Avoid block and contiguous mappings if direct map can change. */
	if (can_set_direct_map())
		map_flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
			     size, params->pgprot, pgd_pgtable_alloc_init_mm,
			     map_flags);

	memblock_clear_nomap(start, size);

	err = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
			  params);
	if (err) {
		/* Undo the mapping created above. */
		__remove_pgd_mapping(swapper_pg_dir,
				     __phys_to_virt(start), size);
		return err;
	}

	/* Address of hotplugged memory can be smaller */
	max_pfn = max(max_pfn, PFN_UP(start + size));
	max_low_pfn = max_pfn;

	return 0;
}
1385

1386
/*
 * Remove the pages of the physical range [start, start + size) and
 * then tear down its linear-map mapping.
 */
void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
	unsigned long first_pfn = start >> PAGE_SHIFT;
	unsigned long npages = size >> PAGE_SHIFT;

	/* Pages go first, the mapping afterwards. */
	__remove_pages(first_pfn, npages, altmap);
	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}
1394

1395
/*
1396
 * This memory hotplug notifier helps prevent boot memory from being
1397
 * inadvertently removed as it blocks pfn range offlining process in
1398
 * __offline_pages(). Hence this prevents both offlining as well as
1399
 * removal process for boot memory which is initially always online.
1400
 * In future if and when boot memory could be removed, this notifier
1401
 * should be dropped and free_hotplug_page_range() should handle any
1402
 * reserved pages allocated during boot.
1403
 */
1404
static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
1405
					   unsigned long action, void *data)
1406
{
1407
	struct mem_section *ms;
1408
	struct memory_notify *arg = data;
1409
	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
1410
	unsigned long pfn = arg->start_pfn;
1411

1412
	if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
1413
		return NOTIFY_OK;
1414

1415
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1416
		unsigned long start = PFN_PHYS(pfn);
1417
		unsigned long end = start + (1UL << PA_SECTION_SHIFT);
1418

1419
		ms = __pfn_to_section(pfn);
1420
		if (!early_section(ms))
1421
			continue;
1422

1423
		if (action == MEM_GOING_OFFLINE) {
1424
			/*
1425
			 * Boot memory removal is not supported. Prevent
1426
			 * it via blocking any attempted offline request
1427
			 * for the boot memory and just report it.
1428
			 */
1429
			pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end);
1430
			return NOTIFY_BAD;
1431
		} else if (action == MEM_OFFLINE) {
1432
			/*
1433
			 * This should have never happened. Boot memory
1434
			 * offlining should have been prevented by this
1435
			 * very notifier. Probably some memory removal
1436
			 * procedure might have changed which would then
1437
			 * require further debug.
1438
			 */
1439
			pr_err("Boot memory [%lx %lx] offlined\n", start, end);
1440

1441
			/*
1442
			 * Core memory hotplug does not process a return
1443
			 * code from the notifier for MEM_OFFLINE events.
1444
			 * The error condition has been reported. Return
1445
			 * from here as if ignored.
1446
			 */
1447
			return NOTIFY_DONE;
1448
		}
1449
	}
1450
	return NOTIFY_OK;
1451
}
1452

1453
static struct notifier_block prevent_bootmem_remove_nb = {
1454
	.notifier_call = prevent_bootmem_remove_notifier,
1455
};
1456

1457
/*
1458
 * This ensures that boot memory sections on the platform are online
1459
 * from early boot. Memory sections could not be prevented from being
1460
 * offlined, unless for some reason they are not online to begin with.
1461
 * This helps validate the basic assumption on which the above memory
1462
 * event notifier works to prevent boot memory section offlining and
1463
 * its possible removal.
1464
 */
1465
static void validate_bootmem_online(void)
1466
{
1467
	phys_addr_t start, end, addr;
1468
	struct mem_section *ms;
1469
	u64 i;
1470

1471
	/*
1472
	 * Scanning across all memblock might be expensive
1473
	 * on some big memory systems. Hence enable this
1474
	 * validation only with DEBUG_VM.
1475
	 */
1476
	if (!IS_ENABLED(CONFIG_DEBUG_VM))
1477
		return;
1478

1479
	for_each_mem_range(i, &start, &end) {
1480
		for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) {
1481
			ms = __pfn_to_section(PHYS_PFN(addr));
1482

1483
			/*
1484
			 * All memory ranges in the system at this point
1485
			 * should have been marked as early sections.
1486
			 */
1487
			WARN_ON(!early_section(ms));
1488

1489
			/*
1490
			 * Memory notifier mechanism here to prevent boot
1491
			 * memory offlining depends on the fact that each
1492
			 * early section memory on the system is initially
1493
			 * online. Otherwise a given memory section which
1494
			 * is already offline will be overlooked and can
1495
			 * be removed completely. Call out such sections.
1496
			 */
1497
			if (!online_section(ms))
1498
				pr_err("Boot memory [%llx %llx] is offline, can be removed\n",
1499
					addr, addr + (1UL << PA_SECTION_SHIFT));
1500
		}
1501
	}
1502
}
1503

1504
/*
 * Early initcall: when CONFIG_MEMORY_HOTREMOVE is enabled, validate
 * the boot memory state and register the notifier that blocks boot
 * memory offlining.
 *
 * Returns 0 when hot-remove is disabled, otherwise the result of
 * register_memory_notifier().
 */
static int __init prevent_bootmem_remove_init(void)
{
	int err;

	if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
		return 0;

	validate_bootmem_online();

	err = register_memory_notifier(&prevent_bootmem_remove_nb);
	if (err)
		pr_err("%s: Notifier registration failed %d\n", __func__, err);

	return err;
}
early_initcall(prevent_bootmem_remove_init);
1519
#endif
1520

1521
/*
 * Begin a protection change on @nr PTEs starting at @ptep: fetch and
 * clear them via get_and_clear_ptes() and return the resulting pte.
 *
 * On CPUs with the ARM64_WORKAROUND_2645198 capability the range is
 * additionally flushed from the TLB when the old pte was accessible
 * and user-executable.
 */
pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, unsigned int nr)
{
	pte_t old = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr);

	if (!alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198))
		return old;

	/*
	 * Break-before-make (BBM) is required for all user space mappings
	 * when the permission changes from executable to non-executable
	 * in cases where cpu is affected with errata #2645198.
	 */
	if (pte_accessible(vma->vm_mm, old) && pte_user_exec(old))
		__flush_tlb_range(vma, addr, nr * PAGE_SIZE,
				  PAGE_SIZE, true, 3);

	return old;
}
1539

1540
/* Single-PTE form of modify_prot_start_ptes(): same call with nr == 1. */
pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
	const unsigned int nr_ptes = 1;

	return modify_prot_start_ptes(vma, addr, ptep, nr_ptes);
}
1544

1545
/*
 * Finish a protection change on @nr PTEs starting at @ptep by writing
 * @pte with set_ptes(). @old_pte is not used here.
 */
void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, pte_t old_pte, pte_t pte,
			     unsigned int nr)
{
	struct mm_struct *mm = vma->vm_mm;

	set_ptes(mm, addr, ptep, pte, nr);
}
1551

1552
/* Single-PTE form of modify_prot_commit_ptes(): same call with nr == 1. */
void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
			     pte_t old_pte, pte_t pte)
{
	const unsigned int nr_ptes = 1;

	modify_prot_commit_ptes(vma, addr, ptep, old_pte, pte, nr_ptes);
}
1557

1558
/*
1559
 * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
1560
 * avoiding the possibility of conflicting TLB entries being allocated.
1561
 */
1562
void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
1563
{
1564
	typedef void (ttbr_replace_func)(phys_addr_t);
1565
	extern ttbr_replace_func idmap_cpu_replace_ttbr1;
1566
	ttbr_replace_func *replace_phys;
1567
	unsigned long daif;
1568

1569
	/* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */
1570
	phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));
1571

1572
	if (cnp)
1573
		ttbr1 |= TTBR_CNP_BIT;
1574

1575
	replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
1576

1577
	cpu_install_idmap();
1578

1579
	/*
1580
	 * We really don't want to take *any* exceptions while TTBR1 is
1581
	 * in the process of being replaced so mask everything.
1582
	 */
1583
	daif = local_daif_save();
1584
	replace_phys(ttbr1);
1585
	local_daif_restore(daif);
1586

1587
	cpu_uninstall_idmap();
1588
}
1589

1590
#ifdef CONFIG_ARCH_HAS_PKEYS
1591
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
1592
{
1593
	u64 new_por;
1594
	u64 old_por;
1595

1596
	if (!system_supports_poe())
1597
		return -ENOSPC;
1598

1599
	/*
1600
	 * This code should only be called with valid 'pkey'
1601
	 * values originating from in-kernel users.  Complain
1602
	 * if a bad value is observed.
1603
	 */
1604
	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
1605
		return -EINVAL;
1606

1607
	/* Set the bits we need in POR:  */
1608
	new_por = POE_RWX;
1609
	if (init_val & PKEY_DISABLE_WRITE)
1610
		new_por &= ~POE_W;
1611
	if (init_val & PKEY_DISABLE_ACCESS)
1612
		new_por &= ~POE_RW;
1613
	if (init_val & PKEY_DISABLE_READ)
1614
		new_por &= ~POE_R;
1615
	if (init_val & PKEY_DISABLE_EXECUTE)
1616
		new_por &= ~POE_X;
1617

1618
	/* Shift the bits in to the correct place in POR for pkey: */
1619
	new_por = POR_ELx_PERM_PREP(pkey, new_por);
1620

1621
	/* Get old POR and mask off any old bits in place: */
1622
	old_por = read_sysreg_s(SYS_POR_EL0);
1623
	old_por &= ~(POE_MASK << POR_ELx_PERM_SHIFT(pkey));
1624

1625
	/* Write old part along with new part: */
1626
	write_sysreg_s(old_por | new_por, SYS_POR_EL0);
1627

1628
	return 0;
1629
}
1630
#endif
1631

1632
Product

Resources

Company