GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/mm/pgtable.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
SYM_PIC_ALIAS(physical_mask);
#endif

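/*
 * Allocate one PTE-level page-table page for a user address space.
 * GFP_PGTABLE_USER yields a zeroed, memcg-accounted page.
 */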
pgtable_t pte_alloc_one(struct mm_struct *mm)
{
        return __pte_alloc_one(mm, GFP_PGTABLE_USER);
}

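/*
 * The ___*_free_tlb() helpers below release a page-table page through
 * the mmu_gather: paravirt is told first that the page is no longer a
 * page table, then the page is queued so it is only freed after the
 * TLB has been flushed.
 */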
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
        paravirt_release_pte(page_to_pfn(pte));
        tlb_remove_ptdesc(tlb, page_ptdesc(pte));
}

#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
        /*
         * NOTE! For PAE, any changes to the top page-directory-pointer-table
         * entries need a full cr3 reload to flush.
         */
#ifdef CONFIG_X86_PAE
        tlb->need_flush_all = 1;
#endif
        tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
}

#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
        tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
        paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
        tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 2 */

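/*
 * Every pgd allocated here is kept on the global pgd_list, with the
 * owning mm recorded in its ptdesc, so that updates to the kernel
 * part of the page tables can be synchronized across all of them.
 */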
static inline void pgd_list_add(pgd_t *pgd)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

        list_add(&ptdesc->pt_list, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

        list_del(&ptdesc->pt_list);
}

static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
        virt_to_ptdesc(pgd)->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
        return page_ptdesc(page)->pt_mm;
}

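/*
 * Constructor for a fresh pgd: copy the kernel mappings from
 * swapper_pg_dir (except on PAE, which preallocates its PMDs instead)
 * and register the pgd on pgd_list. Called with pgd_lock held.
 */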
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
        /* PAE preallocates all its PMDs. No cloning needed. */
        if (!IS_ENABLED(CONFIG_X86_PAE))
                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                KERNEL_PGD_PTRS);

        /* List used to sync kernel mapping updates */
        pgd_set_mm(pgd, mm);
        pgd_list_add(pgd);
}

static void pgd_dtor(pgd_t *pgd)
{
        spin_lock(&pgd_lock);
        pgd_list_del(pgd);
        spin_unlock(&pgd_lock);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update. Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 */
#define PREALLOCATED_PMDS       PTRS_PER_PGD

/*
 * "USER_PMDS" are the PMDs for the user copy of the page tables when
 * PTI is enabled. They do not exist when PTI is disabled. Note that
 * this is distinct from the user _portion_ of the kernel page tables
 * which always exists.
 *
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 */
#define PREALLOCATED_USER_PMDS  (boot_cpu_has(X86_FEATURE_PTI) ? \
                                        KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
        paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

        /* Note: almost everything apart from _PAGE_PRESENT is
           reserved at the pmd (PDPT) level. */
        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

        /*
         * According to Intel App note "TLBs, Paging-Structure Caches,
         * and Their Invalidation", April 2007, document 317080-001,
         * section 8.1: in PAE mode we explicitly have to flush the
         * TLB via cr3 if the top-level pgd is changed...
         */
        flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS       0
#define PREALLOCATED_USER_PMDS  0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif /* CONFIG_X86_PAE */

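/*
 * preallocate_pmds() allocates @count PMD pages up front (used for the
 * PAE pgd and for the PTI user page-table); free_pmds() releases them
 * again on failure or teardown and keeps the mm's PMD count in sync.
 */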
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        int i;
        struct ptdesc *ptdesc;

        for (i = 0; i < count; i++)
                if (pmds[i]) {
                        ptdesc = virt_to_ptdesc(pmds[i]);

                        pagetable_dtor(ptdesc);
                        pagetable_free(ptdesc);
                        mm_dec_nr_pmds(mm);
                }
}

static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        int i;
        bool failed = false;
        gfp_t gfp = GFP_PGTABLE_USER;

        if (mm == &init_mm)
                gfp &= ~__GFP_ACCOUNT;
        gfp &= ~__GFP_HIGHMEM;

        for (i = 0; i < count; i++) {
                pmd_t *pmd = NULL;
                struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

                if (!ptdesc)
                        failed = true;
                if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) {
                        pagetable_free(ptdesc);
                        ptdesc = NULL;
                        failed = true;
                }
                if (ptdesc) {
                        mm_inc_nr_pmds(mm);
                        pmd = ptdesc_address(ptdesc);
                }

                pmds[i] = pmd;
        }

        if (failed) {
                free_pmds(mm, pmds, count);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
        pgd_t pgd = *pgdp;

        if (pgd_val(pgd) != 0) {
                pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

                pgd_clear(pgdp);

                paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
                pmd_free(mm, pmd);
                mm_dec_nr_pmds(mm);
        }
}

static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
        int i;

        for (i = 0; i < PREALLOCATED_PMDS; i++)
                mop_up_one_pmd(mm, &pgdp[i]);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

        if (!boot_cpu_has(X86_FEATURE_PTI))
                return;

        pgdp = kernel_to_user_pgdp(pgdp);

        for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
                mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}

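/*
 * Wire the preallocated PMDs into the new pgd's PUD entries, copying
 * the kernel portion of the PMD contents from swapper_pg_dir.
 */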
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
        p4d_t *p4d;
        pud_t *pud;
        int i;

        p4d = p4d_offset(pgd, 0);
        pud = pud_offset(p4d, 0);

        for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
                pmd_t *pmd = pmds[i];

                if (i >= KERNEL_PGD_BOUNDARY)
                        memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
                               sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, pud, pmd);
        }
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
        pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
        pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
        p4d_t *u_p4d;
        pud_t *u_pud;
        int i;

        u_p4d = p4d_offset(u_pgd, 0);
        u_pud = pud_offset(u_p4d, 0);

        s_pgd += KERNEL_PGD_BOUNDARY;
        u_pud += KERNEL_PGD_BOUNDARY;

        for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
                pmd_t *pmd = pmds[i];

                memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
                       sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, u_pud, pmd);
        }

}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif

static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
        /*
         * PTI and Xen need a whole page for the PAE PGD
         * even though the hardware only needs 32 bytes.
         *
         * For simplicity, allocate a page for all users.
         */
        return __pgd_alloc(mm, pgd_allocation_order());
}

static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        __pgd_free(mm, pgd);
}

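/*
 * pgd_alloc() builds the complete top level for a new mm: allocate the
 * pgd, preallocate the PAE/PTI PMDs where the configuration needs them,
 * then populate everything under pgd_lock so that walkers of pgd_list
 * never see a partially constructed pgd.
 */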
pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd;
        pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
        pmd_t *pmds[PREALLOCATED_PMDS];

        pgd = _pgd_alloc(mm);

        if (pgd == NULL)
                goto out;

        mm->pgd = pgd;

        if (sizeof(pmds) != 0 &&
            preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
                goto out_free_pgd;

        if (sizeof(u_pmds) != 0 &&
            preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
                goto out_free_pmds;

        if (paravirt_pgd_alloc(mm) != 0)
                goto out_free_user_pmds;

        /*
         * Make sure that pre-populating the pmds is atomic with
         * respect to anything walking the pgd_list, so that they
         * never see a partially populated pgd.
         */
        spin_lock(&pgd_lock);

        pgd_ctor(mm, pgd);
        if (sizeof(pmds) != 0)
                pgd_prepopulate_pmd(mm, pgd, pmds);

        if (sizeof(u_pmds) != 0)
                pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

        spin_unlock(&pgd_lock);

        return pgd;

out_free_user_pmds:
        if (sizeof(u_pmds) != 0)
                free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
        if (sizeof(pmds) != 0)
                free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
        _pgd_free(mm, pgd);
out:
        return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        pgd_mop_up_pmds(mm, pgd);
        pgd_dtor(pgd);
        paravirt_pgd_free(mm, pgd);
        _pgd_free(mm, pgd);
}

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        int changed = !pte_same(*ptep, entry);

        if (changed && dirty)
                set_pte(ptep, entry);

        return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int changed = !pmd_same(*pmdp, entry);

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (changed && dirty) {
                set_pmd(pmdp, entry);
                /*
                 * We had a write-protection fault here and changed the pmd
                 * to be more permissive. No need to flush the TLB for that,
                 * #PF is architecturally guaranteed to do that and in the
                 * worst-case we'll generate a spurious fault.
                 */
        }

        return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pud_t *pudp, pud_t entry, int dirty)
{
        int changed = !pud_same(*pudp, entry);

        VM_BUG_ON(address & ~HPAGE_PUD_MASK);

        if (changed && dirty) {
                set_pud(pudp, entry);
                /*
                 * We had a write-protection fault here and changed the pud
                 * to be more permissive. No need to flush the TLB for that,
                 * #PF is architecturally guaranteed to do that and in the
                 * worst-case we'll generate a spurious fault.
                 */
        }

        return changed;
}
#endif

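/*
 * The *_test_and_clear_young() helpers atomically clear the Accessed
 * bit and report whether it was set; whether that is worth a TLB flush
 * is decided separately by the callers (see ptep_clear_flush_young()).
 */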
int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
{
        int ret = 0;

        if (pte_young(*ptep))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *) &ptep->pte);

        return ret;
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pmd_t *pmdp)
{
        int ret = 0;

        if (pmd_young(*pmdp))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *)pmdp);

        return ret;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pud_t *pudp)
{
        int ret = 0;

        if (pud_young(*pudp))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *)pudp);

        return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        /*
         * On x86 CPUs, clearing the accessed bit without a TLB flush
         * doesn't cause data corruption. [ It could cause incorrect
         * page aging and the (mistaken) reclaim of hot pages, but the
         * chance of that should be relatively low. ]
         *
         * So as a performance optimization don't flush the TLB when
         * clearing the accessed bit, it will eventually be flushed by
         * a context switch or a VM operation anyway. [ In the rare
         * event of it not getting flushed for a long time the delay
         * shouldn't really matter because there's no real memory
         * pressure for swapout to react to. ]
         */
        return ptep_test_and_clear_young(vma, address, ptep);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int young;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (young)
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return young;
}

pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                         pmd_t *pmdp)
{
        VM_WARN_ON_ONCE(!pmd_present(*pmdp));

        /*
         * No flush is necessary. Once an invalid PTE is established, the PTE's
         * access and dirty bits cannot be updated.
         */
        return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
                      pud_t *pudp)
{
        VM_WARN_ON_ONCE(!pud_present(*pudp));
        pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
        flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
        return old;
}
#endif

/**
 * reserve_top_address - Reserve a hole in the top of the kernel address space
 * @reserve: Size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of the kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
        BUG_ON(fixmaps_set > 0);
        __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
        printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
               -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

int fixmaps_set;

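/*
 * Install a fixmap entry: __native_set_fixmap() writes the PTE for a
 * fixmap slot, native_set_fixmap() additionally masks the requested
 * protection bits against what the kernel actually supports.
 */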
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
        unsigned long address = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
        /*
         * Ensure that the static initial page tables are covering the
         * fixmap completely.
         */
        BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
                     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }
        set_pte_vaddr(address, pte);
        fixmaps_set++;
}

void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
                       phys_addr_t phys, pgprot_t flags)
{
        /* Sanitize 'prot' against any unsupported bits: */
        pgprot_val(flags) &= __default_kernel_pte_mask;

        __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#if CONFIG_PGTABLE_LEVELS > 4
/**
 * p4d_set_huge - Set up kernel P4D mapping
 * @p4d: Pointer to the P4D entry
 * @addr: Virtual address associated with the P4D entry
 * @prot: Protection bits to use
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}

/**
 * p4d_clear_huge - Clear kernel P4D mapping when it is set
 * @p4d: Pointer to the P4D entry to clear
 *
 * No 512GB pages yet -- do nothing
 */
void p4d_clear_huge(p4d_t *p4d)
{
}
#endif

/**
 * pud_set_huge - Set up kernel PUD mapping
 * @pud: Pointer to the PUD entry
 * @addr: Virtual address associated with the PUD entry
 * @prot: Protection bits to use
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if the complete range has the same MTRR
 * caching mode.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        u8 uniform;

        mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
        if (!uniform)
                return 0;

        /* Bail out if we are on a populated non-leaf entry: */
        if (pud_present(*pud) && !pud_leaf(*pud))
                return 0;

        set_pte((pte_t *)pud, pfn_pte(
                (u64)addr >> PAGE_SHIFT,
                __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

        return 1;
}

/**
 * pmd_set_huge - Set up kernel PMD mapping
 * @pmd: Pointer to the PMD entry
 * @addr: Virtual address associated with the PMD entry
 * @prot: Protection bits to use
 *
 * See text over pud_set_huge() above.
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        u8 uniform;

        mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
        if (!uniform) {
                pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
                             __func__, addr, addr + PMD_SIZE);
                return 0;
        }

        /* Bail out if we are on a populated non-leaf entry: */
        if (pmd_present(*pmd) && !pmd_leaf(*pmd))
                return 0;

        set_pte((pte_t *)pmd, pfn_pte(
                (u64)addr >> PAGE_SHIFT,
                __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

        return 1;
}

/**
 * pud_clear_huge - Clear kernel PUD mapping when it is set
 * @pud: Pointer to the PUD entry to clear.
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
        if (pud_leaf(*pud)) {
                pud_clear(pud);
                return 1;
        }

        return 0;
}

/**
 * pmd_clear_huge - Clear kernel PMD mapping when it is set
 * @pmd: Pointer to the PMD entry to clear.
 *
 * Returns 1 on success and 0 on failure (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
        if (pmd_leaf(*pmd)) {
                pmd_clear(pmd);
                return 1;
        }

        return 0;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear PUD entry and free PMD page
 * @pud: Pointer to a PUD
 * @addr: Virtual address associated with PUD
 *
 * Context: The PUD range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        pmd_t *pmd, *pmd_sv;
        pte_t *pte;
        int i;

        pmd = pud_pgtable(*pud);
        pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
        if (!pmd_sv)
                return 0;

        for (i = 0; i < PTRS_PER_PMD; i++) {
                pmd_sv[i] = pmd[i];
                if (!pmd_none(pmd[i]))
                        pmd_clear(&pmd[i]);
        }

        pud_clear(pud);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        for (i = 0; i < PTRS_PER_PMD; i++) {
                if (!pmd_none(pmd_sv[i])) {
                        pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
                        pte_free_kernel(&init_mm, pte);
                }
        }

        free_page((unsigned long)pmd_sv);

        pmd_free(&init_mm, pmd);

        return 1;
}

/**
 * pmd_free_pte_page - Clear PMD entry and free PTE page.
 * @pmd: Pointer to the PMD
 * @addr: Virtual address associated with PMD
 *
 * Context: The PMD range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        pte_t *pte;

        pte = (pte_t *)pmd_page_vaddr(*pmd);
        pmd_clear(pmd);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        pte_free_kernel(&init_mm, pte);

        return 1;
}

#else /* !CONFIG_X86_64 */

/*
 * Disable free page handling on x86-PAE. This ensures that ioremap()
 * does not update sync'd PMD entries. See vmalloc_sync_one().
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        return pmd_none(*pmd);
}

#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */

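/*
 * Shadow-stack aware mkwrite helpers: VMAs with VM_SHADOW_STACK get a
 * shadow-stack entry, all other VMAs get a normal writable entry with
 * the saved-dirty bit cleared.
 */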
pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        if (vma->vm_flags & VM_SHADOW_STACK)
                return pte_mkwrite_shstk(pte);

        pte = pte_mkwrite_novma(pte);

        return pte_clear_saveddirty(pte);
}

pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        if (vma->vm_flags & VM_SHADOW_STACK)
                return pmd_mkwrite_shstk(pmd);

        pmd = pmd_mkwrite_novma(pmd);

        return pmd_clear_saveddirty(pmd);
}

void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
        /*
         * Hardware before shadow stack can (rarely) set Dirty=1
         * on a Write=0 PTE. So the below condition
         * only indicates a software bug when shadow stack is
         * supported by the HW. This checking is covered in
         * pte_shstk().
         */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
                        pte_shstk(pte));
}

void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
        /* See note in arch_check_zapped_pte() */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
                        pmd_shstk(pmd));
}

void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
        /* See note in arch_check_zapped_pte() */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
}