GitHub Repository: torvalds/linux
Path: blob/master/arch/powerpc/mm/book3s64/hash_pgtable.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2005, Paul Mackerras, IBM Corporation.
 * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/mm.h>
#include <linux/stop_machine.h>

#include <asm/sections.h>
#include <asm/mmu.h>
#include <asm/tlb.h>
#include <asm/firmware.h>

#include <mm/mmu_decl.h>

#include <trace/events/thp.h>

#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
#warning Limited user VSID range means pagetable space is wasted
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * vmemmap is the starting address of the virtual address space where
 * struct pages are allocated for all possible PFNs present on the system
 * including holes and bad memory (hence sparse). These virtual struct
 * pages are stored in sequence in this virtual address space irrespective
 * of whether the corresponding PFN is valid or not. This achieves a
 * constant relationship between the address of a struct page and its PFN.
 *
 * During boot or memory hotplug, when a new memory section is
 * added, physical memory allocation (including hash table bolting) will
 * be performed for the set of struct pages which are part of the memory
 * section. This saves memory by not allocating struct pages for PFNs
 * which are not valid.
 *
 * ----------------------------------------------
 * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES|
 * ----------------------------------------------
 *
 *          f000000000000000                  c000000000000000
 * vmemmap +--------------+                  +--------------+
 *  +      |  page struct | +--------------> |  page struct |
 *  |      +--------------+                  +--------------+
 *  |      |  page struct | +--------------> |  page struct |
 *  |      +--------------+ |                +--------------+
 *  |      |  page struct | +       +------> |  page struct |
 *  |      +--------------+ |       |        +--------------+
 *  |      |  page struct | |           +--> |  page struct |
 *  |      +--------------+ |       |        +--------------+
 *  |      |  page struct | |       |
 *  |      +--------------+ |       |
 *  |      |  page struct | |       |
 *  |      +--------------+ |       |
 *  |      |  page struct | |       |
 *  |      +--------------+ |       |
 *  |      |  page struct | |       |
 *  |      +--------------+ |       |
 *  |      |  page struct | +-------+   |
 *  |      +--------------+             |
 *  |      |  page struct | +-----------+
 *  |      +--------------+
 *  |      |  page struct | No mapping
 *  |      +--------------+
 *  |      |  page struct | No mapping
 *  v      +--------------+
 *
 * -----------------------------------------
 * | RELATION BETWEEN STRUCT PAGES AND PFNS|
 * -----------------------------------------
 *
 * vmemmap +--------------+                 +---------------+
 *  +      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  v      +--------------+                 +---------------+
 */
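/*
 * The constant relationship described above is what allows the generic
 * CONFIG_SPARSEMEM_VMEMMAP helpers (see include/asm-generic/memory_model.h)
 * to reduce to simple pointer arithmetic, roughly:
 *
 *      page = vmemmap + pfn;           // __pfn_to_page()
 *      pfn  = page - vmemmap;          // __page_to_pfn()
 */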
/*
 * On hash-based CPUs, the vmemmap is bolted in the hash table.
 */
int __meminit hash__vmemmap_create_mapping(unsigned long start,
                                           unsigned long page_size,
                                           unsigned long phys)
{
        int rc;

        if ((start + page_size) >= H_VMEMMAP_END) {
                pr_warn("Outside the supported range\n");
                return -1;
        }

        rc = htab_bolt_mapping(start, start + page_size, phys,
                               pgprot_val(PAGE_KERNEL),
                               mmu_vmemmap_psize, mmu_kernel_ssize);
        if (rc < 0) {
                int rc2 = htab_remove_mapping(start, start + page_size,
                                              mmu_vmemmap_psize,
                                              mmu_kernel_ssize);
                BUG_ON(rc2 && (rc2 != -ENOENT));
        }
        return rc;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void hash__vmemmap_remove_mapping(unsigned long start,
                                  unsigned long page_size)
{
        int rc = htab_remove_mapping(start, start + page_size,
                                     mmu_vmemmap_psize,
                                     mmu_kernel_ssize);
        BUG_ON((rc < 0) && (rc != -ENOENT));
        WARN_ON(rc == -ENOENT);
}
#endif
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/*
 * map_kernel_page is currently only called by __ioremap.
 * It adds an entry to the ioremap page table and an entry to
 * the HPT, possibly bolting it.
 */
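/*
 * Once the slab allocator is up, a regular kernel page table entry is
 * created and the HPTE is expected to be installed later by the hash
 * fault path; before that, the translation is bolted directly into the
 * hash page table.
 */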
int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
        pgd_t *pgdp;
        p4d_t *p4dp;
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t *ptep;

        BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
        if (slab_is_available()) {
                pgdp = pgd_offset_k(ea);
                p4dp = p4d_offset(pgdp, ea);
                pudp = pud_alloc(&init_mm, p4dp, ea);
                if (!pudp)
                        return -ENOMEM;
                pmdp = pmd_alloc(&init_mm, pudp, ea);
                if (!pmdp)
                        return -ENOMEM;
                ptep = pte_alloc_kernel(pmdp, ea);
                if (!ptep)
                        return -ENOMEM;
                set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
        } else {
                /*
                 * If the mm subsystem is not fully up, we cannot create a
                 * linux page table entry for this mapping.  Simply bolt an
                 * entry in the hardware page table.
                 */
                if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
                                      mmu_io_psize, mmu_kernel_ssize)) {
                        printk(KERN_ERR "Failed to do bolted mapping IO "
                               "memory at %016lx !\n", pa);
                        return -ENOMEM;
                }
        }

        smp_wmb();
        return 0;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

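/*
 * Atomically clear the bits in 'clr' and set the bits in 'set' in the huge
 * PMD at pmdp, spinning while H_PAGE_BUSY is set.  Returns the old PMD
 * value; if it had H_PAGE_HASHPTE set, the stale hash page table entries
 * for the hugepage are flushed as well.
 */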
unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
                                        pmd_t *pmdp, unsigned long clr,
                                        unsigned long set)
{
        __be64 old_be, tmp;
        unsigned long old;

#ifdef CONFIG_DEBUG_VM
        WARN_ON(!hash__pmd_trans_huge(*pmdp));
        assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

        __asm__ __volatile__(
        "1:     ldarx   %0,0,%3\n\
                and.    %1,%0,%6\n\
                bne-    1b \n\
                andc    %1,%0,%4 \n\
                or      %1,%1,%7\n\
                stdcx.  %1,0,%3 \n\
                bne-    1b"
        : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
        : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
          "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
        : "cc" );

        old = be64_to_cpu(old_be);

        trace_hugepage_update_pmd(addr, old, clr, set);
        if (old & H_PAGE_HASHPTE)
                hpte_do_hugepage_flush(mm, addr, pmdp, old);
        return old;
}

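/*
 * Clear and return the current huge PMD during THP collapse, then make
 * sure no concurrent hash_page() can still be inserting HPTEs for the old
 * subpages before their hash entries are invalidated.
 */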
pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                                pmd_t *pmdp)
{
        pmd_t pmd;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_trans_huge(*pmdp));

        pmd = *pmdp;
        pmd_clear(pmdp);
        /*
         * Wait for all pending hash_page to finish. This is needed
         * in case of subpage collapse. When we collapse normal pages
         * to hugepage, we first clear the pmd, then invalidate all
         * the PTE entries. The assumption here is that any low level
         * page fault will see a none pmd and take the slow path that
         * will wait on mmap_lock. But we could very well be in a
         * hash_page with local ptep pointer value. Such a hash page
         * can result in adding new HPTE entries for normal subpages.
         * That means we could be modifying the page content as we
         * copy them to a huge page. So wait for parallel hash_page
         * to finish before invalidating HPTE entries. We can do this
         * by sending an IPI to all the cpus and executing a dummy
         * function there.
         */
        serialize_against_pte_lookup(vma->vm_mm);
        /*
         * Now invalidate the hpte entries in the range
         * covered by pmd. This makes sure we take a
         * fault and will find the pmd as none, which will
         * result in a major fault which takes mmap_lock and
         * hence waits for collapse to complete. Without this
         * the __collapse_huge_page_copy can result in copying
         * the old content.
         */
        flush_hash_table_pmd_range(vma->vm_mm, &pmd, address);
        return pmd;
}

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes.
 */
void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                      pgtable_t pgtable)
{
        pgtable_t *pgtable_slot;

        assert_spin_locked(pmd_lockptr(mm, pmdp));
        /*
         * We store the pgtable in the second half of the PMD.
         */
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        *pgtable_slot = pgtable;
        /*
         * Expose the deposited pgtable to other cpus before we set
         * the hugepage PTE at the pmd level. The hash fault code looks
         * at the deposited pgtable to store hash index values.
         */
        smp_wmb();
}

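/*
 * Withdraw the pgtable deposited by hash__pgtable_trans_huge_deposit() and
 * clear the hash slot information it was used to track.
 */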
pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pgtable_t pgtable;
        pgtable_t *pgtable_slot;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        pgtable = *pgtable_slot;
        /*
         * Once we withdraw, mark the entry NULL.
         */
        *pgtable_slot = NULL;
        /*
         * We store HPTE information in the deposited PTE fragment.
         * Zero out the content on withdraw.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
        return pgtable;
}

/*
 * A Linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
                            pmd_t *pmdp, unsigned long old_pmd)
{
        int ssize;
        unsigned int psize;
        unsigned long vsid;
        unsigned long flags = 0;

        /* Get the base page size, vsid and segment size. */
#ifdef CONFIG_DEBUG_VM
        psize = get_slice_psize(mm, addr);
        BUG_ON(psize == MMU_PAGE_16M);
#endif
        if (old_pmd & H_PAGE_COMBO)
                psize = MMU_PAGE_4K;
        else
                psize = MMU_PAGE_64K;

        if (!is_kernel_addr(addr)) {
                ssize = user_segment_size(addr);
                vsid = get_user_vsid(&mm->context, addr, ssize);
                WARN_ON(vsid == 0);
        } else {
                vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
                ssize = mmu_kernel_ssize;
        }

        if (mm_is_thread_local(mm))
                flags |= HPTE_LOCAL_UPDATE;

        return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
}

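/*
 * Clear the huge PMD entirely (all bits) and wipe the hash index details
 * kept in the deposited page table, returning the old PMD value.
 */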
pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
                                    unsigned long addr, pmd_t *pmdp)
{
        pmd_t old_pmd;
        pgtable_t pgtable;
        unsigned long old;
        pgtable_t *pgtable_slot;

        old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
        old_pmd = __pmd(old);
        /*
         * We have pmd == none and we are holding page_table_lock.
         * So we can safely go and clear the pgtable hash
         * index info.
         */
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        pgtable = *pgtable_slot;
        /*
         * Zero out the old valid and hash index details; the
         * hash fault code looks at them.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
        return old_pmd;
}

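/*
 * THP on the hash MMU requires 16M pages, a PMD_SIZE of 16M and a valid
 * 16M "penc" encoding for the base page size in use (64K or 4K).
 */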
int hash__has_transparent_hugepage(void)
{

        if (!mmu_has_feature(MMU_FTR_16M_PAGE))
                return 0;
        /*
         * We support THP only if PMD_SIZE is 16MB.
         */
        if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
                return 0;
        /*
         * We need to make sure that we support 16MB hugepage in a segment
         * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
         * of 64K.
         */
        /*
         * If we have 64K HPTE, we will be using that by default
         */
        if (mmu_psize_defs[MMU_PAGE_64K].shift &&
            (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
                return 0;
        /*
         * Ok we only have 4K HPTE
         */
        if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
                return 0;

        return 1;
}
EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_STRICT_KERNEL_RWX

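/*
 * Changing the protection of bolted kernel mappings: under an LPAR this is
 * done via stop_machine(), with one CPU rewriting the bolted HPTEs while
 * every other CPU waits in real mode (see chmem_secondary_loop() below);
 * otherwise change_memory_range() is called directly.
 */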
struct change_memory_parms {
        unsigned long start, end, newpp;
        unsigned int step, nr_cpus;
        atomic_t master_cpu;
        atomic_t cpu_counter;
};

// We'd rather this was on the stack but it has to be in the RMO
static struct change_memory_parms chmem_parms;

// And therefore we need a lock to protect it from concurrent use
static DEFINE_MUTEX(chmem_lock);

static void change_memory_range(unsigned long start, unsigned long end,
                                unsigned int step, unsigned long newpp)
{
        unsigned long idx;

        pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
                 start, end, newpp, step);

        for (idx = start; idx < end; idx += step)
                /* Not sure if we can do much with the return value */
                mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
                                                 mmu_kernel_ssize);
}

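/*
 * Secondary CPUs drop to real mode (MSR[IR/DR] cleared), decrement
 * cpu_counter to signal that they have arrived, spin until the counter
 * reaches zero, then switch back to virtual mode.
 */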
static int notrace chmem_secondary_loop(struct change_memory_parms *parms)
{
        unsigned long msr, tmp, flags;
        int *p;

        p = &parms->cpu_counter.counter;

        local_irq_save(flags);
        hard_irq_disable();

        asm volatile (
        // Switch to real mode and leave interrupts off
        "mfmsr  %[msr]                  ;"
        "li     %[tmp], %[MSR_IR_DR]    ;"
        "andc   %[tmp], %[msr], %[tmp]  ;"
        "mtmsrd %[tmp]                  ;"

        // Tell the master we are in real mode
        "1:                             "
        "lwarx  %[tmp], 0, %[p]         ;"
        "addic  %[tmp], %[tmp], -1      ;"
        "stwcx. %[tmp], 0, %[p]         ;"
        "bne-   1b                      ;"

        // Spin until the counter goes to zero
        "2:                             ;"
        "lwz    %[tmp], 0(%[p])         ;"
        "cmpwi  %[tmp], 0               ;"
        "bne-   2b                      ;"

        // Switch back to virtual mode
        "mtmsrd %[msr]                  ;"

        : // outputs
          [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p)
        : // inputs
          [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR)
        : // clobbers
          "cc", "xer"
        );

        local_irq_restore(flags);

        return 0;
}

static int change_memory_range_fn(void *data)
{
        struct change_memory_parms *parms = data;

        // First CPU goes through, all others wait.
        if (atomic_xchg(&parms->master_cpu, 1) == 1)
                return chmem_secondary_loop(parms);

        // Wait for all but one CPU (this one) to call-in
        while (atomic_read(&parms->cpu_counter) > 1)
                barrier();

        change_memory_range(parms->start, parms->end, parms->step, parms->newpp);

        mb();

        // Signal the other CPUs that we're done
        atomic_dec(&parms->cpu_counter);

        return 0;
}

static bool hash__change_memory_range(unsigned long start, unsigned long end,
                                      unsigned long newpp)
{
        unsigned int step, shift;

        shift = mmu_psize_defs[mmu_linear_psize].shift;
        step = 1 << shift;

        start = ALIGN_DOWN(start, step);
        end = ALIGN(end, step); // aligns up

        if (start >= end)
                return false;

        if (firmware_has_feature(FW_FEATURE_LPAR)) {
                mutex_lock(&chmem_lock);

                chmem_parms.start = start;
                chmem_parms.end = end;
                chmem_parms.step = step;
                chmem_parms.newpp = newpp;
                atomic_set(&chmem_parms.master_cpu, 0);

                cpus_read_lock();

                atomic_set(&chmem_parms.cpu_counter, num_online_cpus());

                // Ensure state is consistent before we call the other CPUs
                mb();

                stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms,
                                        cpu_online_mask);

                cpus_read_unlock();
                mutex_unlock(&chmem_lock);
        } else
                change_memory_range(start, end, step, newpp);

        return true;
}

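/*
 * Mark the kernel text and rodata (from _stext to __end_rodata) read-only
 * and executable in the bolted hash mappings.
 */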
void hash__mark_rodata_ro(void)
{
        unsigned long start, end, pp;

        start = (unsigned long)_stext;
        end = (unsigned long)__end_rodata;

        pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY);

        WARN_ON(!hash__change_memory_range(start, end, pp));
}

void hash__mark_initmem_nx(void)
{
        unsigned long start, end, pp;

        start = (unsigned long)__init_begin;
        end = (unsigned long)__init_end;

        pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);

        WARN_ON(!hash__change_memory_range(start, end, pp));
}
#endif