GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/mm/pgtable.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
SYM_PIC_ALIAS(physical_mask);
#endif

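/*
 * Allocate one PTE-level page-table page for a user address space.
 * GFP_PGTABLE_USER yields a zeroed, memcg-accounted page.
 */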
pgtable_t pte_alloc_one(struct mm_struct *mm)
{
        return __pte_alloc_one(mm, GFP_PGTABLE_USER);
}

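/*
 * The ___*_free_tlb() helpers below release a page-table page through
 * the mmu_gather: paravirt is told first that the page is no longer a
 * page table, then the page is queued so it is only freed after the
 * TLB has been flushed.
 */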
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
        paravirt_release_pte(page_to_pfn(pte));
        tlb_remove_ptdesc(tlb, page_ptdesc(pte));
}

#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
        /*
         * NOTE! For PAE, any changes to the top page-directory-pointer-table
         * entries need a full cr3 reload to flush.
         */
#ifdef CONFIG_X86_PAE
        tlb->need_flush_all = 1;
#endif
        tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
}

#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
        tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
        paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
        tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 2 */

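/*
 * Every pgd allocated here is kept on the global pgd_list, with the
 * owning mm recorded in its ptdesc, so that updates to the kernel
 * part of the page tables can be synchronized across all of them.
 */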
static inline void pgd_list_add(pgd_t *pgd)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

        list_add(&ptdesc->pt_list, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

        list_del(&ptdesc->pt_list);
}

static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
        virt_to_ptdesc(pgd)->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
        return page_ptdesc(page)->pt_mm;
}

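/*
 * Constructor for a fresh pgd: copy the kernel mappings from
 * swapper_pg_dir (except on PAE, which preallocates its PMDs instead)
 * and register the pgd on pgd_list. Called with pgd_lock held.
 */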
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
        /* PAE preallocates all its PMDs. No cloning needed. */
        if (!IS_ENABLED(CONFIG_X86_PAE))
                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                KERNEL_PGD_PTRS);

        /* List used to sync kernel mapping updates */
        pgd_set_mm(pgd, mm);
        pgd_list_add(pgd);
}

static void pgd_dtor(pgd_t *pgd)
{
        spin_lock(&pgd_lock);
        pgd_list_del(pgd);
        spin_unlock(&pgd_lock);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update. Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 */
#define PREALLOCATED_PMDS       PTRS_PER_PGD

/*
 * "USER_PMDS" are the PMDs for the user copy of the page tables when
 * PTI is enabled. They do not exist when PTI is disabled. Note that
 * this is distinct from the user _portion_ of the kernel page tables
 * which always exists.
 *
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 */
#define PREALLOCATED_USER_PMDS  (boot_cpu_has(X86_FEATURE_PTI) ? \
                                        KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
        paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

        /* Note: almost everything apart from _PAGE_PRESENT is
           reserved at the pmd (PDPT) level. */
        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

        /*
         * According to Intel App note "TLBs, Paging-Structure Caches,
         * and Their Invalidation", April 2007, document 317080-001,
         * section 8.1: in PAE mode we explicitly have to flush the
         * TLB via cr3 if the top-level pgd is changed...
         */
        flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS       0
#define PREALLOCATED_USER_PMDS  0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif /* CONFIG_X86_PAE */

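/*
 * preallocate_pmds() allocates @count PMD pages up front (used for the
 * PAE pgd and for the PTI user page-table); free_pmds() releases them
 * again on failure or teardown and keeps the mm's PMD count in sync.
 */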
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        int i;
        struct ptdesc *ptdesc;

        for (i = 0; i < count; i++)
                if (pmds[i]) {
                        ptdesc = virt_to_ptdesc(pmds[i]);

                        pagetable_dtor(ptdesc);
                        pagetable_free(ptdesc);
                        mm_dec_nr_pmds(mm);
                }
}

static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        int i;
        bool failed = false;
        gfp_t gfp = GFP_PGTABLE_USER;

        if (mm == &init_mm)
                gfp &= ~__GFP_ACCOUNT;
        gfp &= ~__GFP_HIGHMEM;

        for (i = 0; i < count; i++) {
                pmd_t *pmd = NULL;
                struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

                if (!ptdesc)
                        failed = true;
                if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) {
                        pagetable_free(ptdesc);
                        ptdesc = NULL;
                        failed = true;
                }
                if (ptdesc) {
                        mm_inc_nr_pmds(mm);
                        pmd = ptdesc_address(ptdesc);
                }

                pmds[i] = pmd;
        }

        if (failed) {
                free_pmds(mm, pmds, count);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
        pgd_t pgd = *pgdp;

        if (pgd_val(pgd) != 0) {
                pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

                pgd_clear(pgdp);

                paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
                pmd_free(mm, pmd);
                mm_dec_nr_pmds(mm);
        }
}

static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
        int i;

        for (i = 0; i < PREALLOCATED_PMDS; i++)
                mop_up_one_pmd(mm, &pgdp[i]);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

        if (!boot_cpu_has(X86_FEATURE_PTI))
                return;

        pgdp = kernel_to_user_pgdp(pgdp);

        for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
                mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}

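/*
 * Wire the preallocated PMDs into the new pgd's PUD entries, copying
 * the kernel portion of the PMD contents from swapper_pg_dir.
 */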
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
        p4d_t *p4d;
        pud_t *pud;
        int i;

        p4d = p4d_offset(pgd, 0);
        pud = pud_offset(p4d, 0);

        for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
                pmd_t *pmd = pmds[i];

                if (i >= KERNEL_PGD_BOUNDARY)
                        memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
                               sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, pud, pmd);
        }
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
        pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
        pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
        p4d_t *u_p4d;
        pud_t *u_pud;
        int i;

        u_p4d = p4d_offset(u_pgd, 0);
        u_pud = pud_offset(u_p4d, 0);

        s_pgd += KERNEL_PGD_BOUNDARY;
        u_pud += KERNEL_PGD_BOUNDARY;

        for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
                pmd_t *pmd = pmds[i];

                memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
                       sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, u_pud, pmd);
        }

}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif

static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
        /*
         * PTI and Xen need a whole page for the PAE PGD
         * even though the hardware only needs 32 bytes.
         *
         * For simplicity, allocate a page for all users.
         */
        return __pgd_alloc(mm, pgd_allocation_order());
}

static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        __pgd_free(mm, pgd);
}

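/*
 * pgd_alloc() builds the complete top level for a new mm: allocate the
 * pgd, preallocate the PAE/PTI PMDs where the configuration needs them,
 * then populate everything under pgd_lock so that walkers of pgd_list
 * never see a partially constructed pgd.
 */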
pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd;
        pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
        pmd_t *pmds[PREALLOCATED_PMDS];

        pgd = _pgd_alloc(mm);

        if (pgd == NULL)
                goto out;

        mm->pgd = pgd;

        if (sizeof(pmds) != 0 &&
            preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
                goto out_free_pgd;

        if (sizeof(u_pmds) != 0 &&
            preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
                goto out_free_pmds;

        if (paravirt_pgd_alloc(mm) != 0)
                goto out_free_user_pmds;

        /*
         * Make sure that pre-populating the pmds is atomic with
         * respect to anything walking the pgd_list, so that they
         * never see a partially populated pgd.
         */
        spin_lock(&pgd_lock);

        pgd_ctor(mm, pgd);
        if (sizeof(pmds) != 0)
                pgd_prepopulate_pmd(mm, pgd, pmds);

        if (sizeof(u_pmds) != 0)
                pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

        spin_unlock(&pgd_lock);

        return pgd;

out_free_user_pmds:
        if (sizeof(u_pmds) != 0)
                free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
        if (sizeof(pmds) != 0)
                free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
        _pgd_free(mm, pgd);
out:
        return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        pgd_mop_up_pmds(mm, pgd);
        pgd_dtor(pgd);
        paravirt_pgd_free(mm, pgd);
        _pgd_free(mm, pgd);
}

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        int changed = !pte_same(*ptep, entry);

        if (changed && dirty)
                set_pte(ptep, entry);

        return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int changed = !pmd_same(*pmdp, entry);

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (changed && dirty) {
                set_pmd(pmdp, entry);
                /*
                 * We had a write-protection fault here and changed the pmd
                 * to be more permissive. No need to flush the TLB for that,
                 * #PF is architecturally guaranteed to do that and in the
                 * worst-case we'll generate a spurious fault.
                 */
        }

        return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pud_t *pudp, pud_t entry, int dirty)
{
        int changed = !pud_same(*pudp, entry);

        VM_BUG_ON(address & ~HPAGE_PUD_MASK);

        if (changed && dirty) {
                set_pud(pudp, entry);
                /*
                 * We had a write-protection fault here and changed the pud
                 * to be more permissive. No need to flush the TLB for that,
                 * #PF is architecturally guaranteed to do that and in the
                 * worst-case we'll generate a spurious fault.
                 */
        }

        return changed;
}
#endif

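/*
 * The *_test_and_clear_young() helpers atomically clear the Accessed
 * bit and report whether it was set; whether that is worth a TLB flush
 * is decided separately by the callers (see ptep_clear_flush_young()).
 */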
int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
{
        int ret = 0;

        if (pte_young(*ptep))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *) &ptep->pte);

        return ret;
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pmd_t *pmdp)
{
        int ret = 0;

        if (pmd_young(*pmdp))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *)pmdp);

        return ret;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pud_t *pudp)
{
        int ret = 0;

        if (pud_young(*pudp))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *)pudp);

        return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        /*
         * On x86 CPUs, clearing the accessed bit without a TLB flush
         * doesn't cause data corruption. [ It could cause incorrect
         * page aging and the (mistaken) reclaim of hot pages, but the
         * chance of that should be relatively low. ]
         *
         * So as a performance optimization don't flush the TLB when
         * clearing the accessed bit, it will eventually be flushed by
         * a context switch or a VM operation anyway. [ In the rare
         * event of it not getting flushed for a long time the delay
         * shouldn't really matter because there's no real memory
         * pressure for swapout to react to. ]
         */
        return ptep_test_and_clear_young(vma, address, ptep);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int young;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (young)
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return young;
}

pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                         pmd_t *pmdp)
{
        VM_WARN_ON_ONCE(!pmd_present(*pmdp));

        /*
         * No flush is necessary. Once an invalid PTE is established, the PTE's
         * access and dirty bits cannot be updated.
         */
        return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
                      pud_t *pudp)
{
        VM_WARN_ON_ONCE(!pud_present(*pudp));
        pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
        flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
        return old;
}
#endif

/**
 * reserve_top_address - Reserve a hole in the top of the kernel address space
 * @reserve: Size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of the kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
        BUG_ON(fixmaps_set > 0);
        __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
        printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
               -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

int fixmaps_set;

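/*
 * Install a fixmap entry: __native_set_fixmap() writes the PTE for a
 * fixmap slot, native_set_fixmap() additionally masks the requested
 * protection bits against what the kernel actually supports.
 */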
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
        unsigned long address = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
        /*
         * Ensure that the static initial page tables are covering the
         * fixmap completely.
         */
        BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
                     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }
        set_pte_vaddr(address, pte);
        fixmaps_set++;
}

void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
                       phys_addr_t phys, pgprot_t flags)
{
        /* Sanitize 'prot' against any unsupported bits: */
        pgprot_val(flags) &= __default_kernel_pte_mask;

        __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#if CONFIG_PGTABLE_LEVELS > 4
/**
 * p4d_set_huge - Set up kernel P4D mapping
 * @p4d: Pointer to the P4D entry
 * @addr: Virtual address associated with the P4D entry
 * @prot: Protection bits to use
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}

/**
 * p4d_clear_huge - Clear kernel P4D mapping when it is set
 * @p4d: Pointer to the P4D entry to clear
 *
 * No 512GB pages yet -- do nothing
 */
void p4d_clear_huge(p4d_t *p4d)
{
}
#endif

/**
 * pud_set_huge - Set up kernel PUD mapping
 * @pud: Pointer to the PUD entry
 * @addr: Virtual address associated with the PUD entry
 * @prot: Protection bits to use
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if the complete range has the same MTRR
 * caching mode.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        u8 uniform;

        mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
        if (!uniform)
                return 0;

        /* Bail out if we are on a populated non-leaf entry: */
        if (pud_present(*pud) && !pud_leaf(*pud))
                return 0;

        set_pte((pte_t *)pud, pfn_pte(
                (u64)addr >> PAGE_SHIFT,
                __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

        return 1;
}

/**
 * pmd_set_huge - Set up kernel PMD mapping
 * @pmd: Pointer to the PMD entry
 * @addr: Virtual address associated with the PMD entry
 * @prot: Protection bits to use
 *
 * See text over pud_set_huge() above.
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        u8 uniform;

        mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
        if (!uniform) {
                pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
                             __func__, addr, addr + PMD_SIZE);
                return 0;
        }

        /* Bail out if we are on a populated non-leaf entry: */
        if (pmd_present(*pmd) && !pmd_leaf(*pmd))
                return 0;

        set_pte((pte_t *)pmd, pfn_pte(
                (u64)addr >> PAGE_SHIFT,
                __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

        return 1;
}

/**
 * pud_clear_huge - Clear kernel PUD mapping when it is set
 * @pud: Pointer to the PUD entry to clear.
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
        if (pud_leaf(*pud)) {
                pud_clear(pud);
                return 1;
        }

        return 0;
}

/**
 * pmd_clear_huge - Clear kernel PMD mapping when it is set
 * @pmd: Pointer to the PMD entry to clear.
 *
 * Returns 1 on success and 0 on failure (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
        if (pmd_leaf(*pmd)) {
                pmd_clear(pmd);
                return 1;
        }

        return 0;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear PUD entry and free PMD page
 * @pud: Pointer to a PUD
 * @addr: Virtual address associated with PUD
 *
 * Context: The PUD range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        pmd_t *pmd, *pmd_sv;
        pte_t *pte;
        int i;

        pmd = pud_pgtable(*pud);
        pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
        if (!pmd_sv)
                return 0;

        for (i = 0; i < PTRS_PER_PMD; i++) {
                pmd_sv[i] = pmd[i];
                if (!pmd_none(pmd[i]))
                        pmd_clear(&pmd[i]);
        }

        pud_clear(pud);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        for (i = 0; i < PTRS_PER_PMD; i++) {
                if (!pmd_none(pmd_sv[i])) {
                        pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
                        pte_free_kernel(&init_mm, pte);
                }
        }

        free_page((unsigned long)pmd_sv);

        pmd_free(&init_mm, pmd);

        return 1;
}

/**
 * pmd_free_pte_page - Clear PMD entry and free PTE page.
 * @pmd: Pointer to the PMD
 * @addr: Virtual address associated with PMD
 *
 * Context: The PMD range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        pte_t *pte;

        pte = (pte_t *)pmd_page_vaddr(*pmd);
        pmd_clear(pmd);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        pte_free_kernel(&init_mm, pte);

        return 1;
}

#else /* !CONFIG_X86_64 */

/*
 * Disable free page handling on x86-PAE. This ensures that ioremap()
 * does not update sync'd PMD entries. See vmalloc_sync_one().
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        return pmd_none(*pmd);
}

#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */

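/*
 * Shadow-stack aware mkwrite helpers: VMAs with VM_SHADOW_STACK get a
 * shadow-stack entry, all other VMAs get a normal writable entry with
 * the saved-dirty bit cleared.
 */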
pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        if (vma->vm_flags & VM_SHADOW_STACK)
                return pte_mkwrite_shstk(pte);

        pte = pte_mkwrite_novma(pte);

        return pte_clear_saveddirty(pte);
}

pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        if (vma->vm_flags & VM_SHADOW_STACK)
                return pmd_mkwrite_shstk(pmd);

        pmd = pmd_mkwrite_novma(pmd);

        return pmd_clear_saveddirty(pmd);
}

void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
        /*
         * Hardware before shadow stack can (rarely) set Dirty=1
         * on a Write=0 PTE. So the below condition
         * only indicates a software bug when shadow stack is
         * supported by the HW. This checking is covered in
         * pte_shstk().
         */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
                        pte_shstk(pte));
}

void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
        /* See note in arch_check_zapped_pte() */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
                        pmd_shstk(pmd));
}

void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
        /* See note in arch_check_zapped_pte() */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
}