GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/powerpc/aim/mmu_radix.c
1
/*-
2
* SPDX-License-Identifier: BSD-2-Clause
3
*
4
* Copyright (c) 2018 Matthew Macy
5
*
6
* Redistribution and use in source and binary forms, with or without
7
* modification, are permitted provided that the following conditions
8
* are met:
9
*
10
* 1. Redistributions of source code must retain the above copyright
11
* notice, this list of conditions and the following disclaimer.
12
* 2. Redistributions in binary form must reproduce the above copyright
13
* notice, this list of conditions and the following disclaimer in the
14
* documentation and/or other materials provided with the distribution.
15
*
16
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
*/
27
28
#include "opt_platform.h"
29
30
#include <sys/param.h>
31
#include <sys/kernel.h>
32
#include <sys/systm.h>
33
#include <sys/conf.h>
34
#include <sys/bitstring.h>
35
#include <sys/queue.h>
36
#include <sys/cpuset.h>
37
#include <sys/endian.h>
38
#include <sys/kerneldump.h>
39
#include <sys/ktr.h>
40
#include <sys/lock.h>
41
#include <sys/syslog.h>
42
#include <sys/msgbuf.h>
43
#include <sys/malloc.h>
44
#include <sys/mman.h>
45
#include <sys/mutex.h>
46
#include <sys/proc.h>
47
#include <sys/rwlock.h>
48
#include <sys/sched.h>
49
#include <sys/sysctl.h>
50
#include <sys/systm.h>
51
#include <sys/vmem.h>
52
#include <sys/vmmeter.h>
53
#include <sys/smp.h>
54
55
#include <sys/kdb.h>
56
57
#include <dev/ofw/openfirm.h>
58
59
#include <vm/vm.h>
60
#include <vm/pmap.h>
61
#include <vm/vm_param.h>
62
#include <vm/vm_kern.h>
63
#include <vm/vm_page.h>
64
#include <vm/vm_map.h>
65
#include <vm/vm_object.h>
66
#include <vm/vm_extern.h>
67
#include <vm/vm_pageout.h>
68
#include <vm/vm_phys.h>
69
#include <vm/vm_radix.h>
70
#include <vm/vm_reserv.h>
71
#include <vm/vm_dumpset.h>
72
#include <vm/uma.h>
73
74
#include <machine/_inttypes.h>
75
#include <machine/cpu.h>
76
#include <machine/platform.h>
77
#include <machine/frame.h>
78
#include <machine/md_var.h>
79
#include <machine/psl.h>
80
#include <machine/bat.h>
81
#include <machine/hid.h>
82
#include <machine/pte.h>
83
#include <machine/sr.h>
84
#include <machine/trap.h>
85
#include <machine/mmuvar.h>
86
87
/* For pseries bit. */
88
#include <powerpc/pseries/phyp-hvcall.h>
89
90
#ifdef INVARIANTS
91
#include <vm/uma_dbg.h>
92
#endif
93
94
#define PPC_BITLSHIFT(bit) (sizeof(long)*NBBY - 1 - (bit))
95
#define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit))
96
#define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit))
97
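/*
 * Illustrative checks (added for clarity; not part of the original source):
 * these macros follow the IBM MSB-0 bit numbering, so with 64-bit longs
 * PPC_BITLSHIFT(0) == 63 and PPC_BITLSHIFT(63) == 0.
 */
CTASSERT(PPC_BIT(0) == 0x8000000000000000UL);
CTASSERT(PPC_BIT(63) == 0x1UL);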
98
#include "opt_ddb.h"
99
100
#ifdef DDB
101
static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va);
102
#endif
103
104
#define PG_W RPTE_WIRED
105
#define PG_V RPTE_VALID
106
#define PG_MANAGED RPTE_MANAGED
107
#define PG_PROMOTED RPTE_PROMOTED
108
#define PG_M RPTE_C
109
#define PG_A RPTE_R
110
#define PG_X RPTE_EAA_X
111
#define PG_RW RPTE_EAA_W
112
#define PG_PTE_CACHE RPTE_ATTR_MASK
113
114
#define RPTE_SHIFT 9
115
#define NLS_MASK ((1UL<<5)-1)
116
#define RPTE_ENTRIES (1UL<<RPTE_SHIFT)
117
#define RPTE_MASK (RPTE_ENTRIES-1)
118
119
#define NLB_SHIFT 0
120
#define NLB_MASK (((1UL<<52)-1) << 8)
121
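/*
 * Usage note (illustrative): NLB_MASK selects the next-level-base field of
 * a non-leaf PTE.  The table-walk helpers below extract the physical
 * address of the next-level table with, e.g.,
 *
 *	l3pa = be64toh(*l2e) & NLB_MASK;
 *	l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa);
 *
 * and then index it with the VA bits for that level.
 */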
122
extern int nkpt;
123
extern caddr_t crashdumpmap;
124
125
#define RIC_FLUSH_TLB 0
126
#define RIC_FLUSH_PWC 1
127
#define RIC_FLUSH_ALL 2
128
129
#define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */
130
131
#define PPC_INST_TLBIE 0x7c000264
132
#define PPC_INST_TLBIEL 0x7c000224
133
#define PPC_INST_SLBIA 0x7c0003e4
134
135
#define ___PPC_RA(a) (((a) & 0x1f) << 16)
136
#define ___PPC_RB(b) (((b) & 0x1f) << 11)
137
#define ___PPC_RS(s) (((s) & 0x1f) << 21)
138
#define ___PPC_RT(t) ___PPC_RS(t)
139
#define ___PPC_R(r) (((r) & 0x1) << 16)
140
#define ___PPC_PRS(prs) (((prs) & 0x1) << 17)
141
#define ___PPC_RIC(ric) (((ric) & 0x3) << 18)
142
143
#define PPC_SLBIA(IH) __XSTRING(.long PPC_INST_SLBIA | \
144
((IH & 0x7) << 21))
145
#define PPC_TLBIE_5(rb,rs,ric,prs,r) \
146
__XSTRING(.long PPC_INST_TLBIE | \
147
___PPC_RB(rb) | ___PPC_RS(rs) | \
148
___PPC_RIC(ric) | ___PPC_PRS(prs) | \
149
___PPC_R(r))
150
151
#define PPC_TLBIEL(rb,rs,ric,prs,r) \
152
__XSTRING(.long PPC_INST_TLBIEL | \
153
___PPC_RB(rb) | ___PPC_RS(rs) | \
154
___PPC_RIC(ric) | ___PPC_PRS(prs) | \
155
___PPC_R(r))
156
157
#define PPC_INVALIDATE_ERAT PPC_SLBIA(7)
158
159
static __inline void
160
ttusync(void)
161
{
162
__asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
163
}
164
165
#define TLBIEL_INVAL_SEL_MASK 0xc00 /* invalidation selector */
166
#define TLBIEL_INVAL_PAGE 0x000 /* invalidate a single page */
167
#define TLBIEL_INVAL_SET_PID 0x400 /* invalidate a set for the current PID */
168
#define TLBIEL_INVAL_SET_LPID 0x800 /* invalidate a set for current LPID */
169
#define TLBIEL_INVAL_SET 0xc00 /* invalidate a set for all LPIDs */
170
171
#define TLBIE_ACTUAL_PAGE_MASK 0xe0
172
#define TLBIE_ACTUAL_PAGE_4K 0x00
173
#define TLBIE_ACTUAL_PAGE_64K 0xa0
174
#define TLBIE_ACTUAL_PAGE_2M 0x20
175
#define TLBIE_ACTUAL_PAGE_1G 0x40
176
177
#define TLBIE_PRS_PARTITION_SCOPE 0x0
178
#define TLBIE_PRS_PROCESS_SCOPE 0x1
179
180
#define TLBIE_RIC_INVALIDATE_TLB 0x0 /* Invalidate just TLB */
181
#define TLBIE_RIC_INVALIDATE_PWC 0x1 /* Invalidate just PWC */
182
#define TLBIE_RIC_INVALIDATE_ALL 0x2 /* Invalidate TLB, PWC,
183
* cached {proc, part}tab entries
184
*/
185
#define TLBIE_RIC_INVALIDATE_SEQ 0x3 /* HPT - only:
186
* Invalidate a range of translations
187
*/
188
189
static __always_inline void
190
radix_tlbie(uint8_t ric, uint8_t prs, uint16_t is, uint32_t pid, uint32_t lpid,
191
vm_offset_t va, uint16_t ap)
192
{
193
uint64_t rb, rs;
194
195
MPASS((va & PAGE_MASK) == 0);
196
197
rs = ((uint64_t)pid << 32) | lpid;
198
rb = va | is | ap;
199
__asm __volatile(PPC_TLBIE_5(%0, %1, %2, %3, 1) : :
200
"r" (rb), "r" (rs), "i" (ric), "i" (prs) : "memory");
201
}
202
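/*
 * Example invocation (a sketch of what the wrappers below do): a
 * process-scoped invalidation of a single 4K page for PID "pid" is
 *
 *	radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
 *	    TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K);
 *
 * rb carries the page-aligned VA plus the invalidation selector and the
 * actual-page-size code, while rs carries (pid << 32) | lpid.
 */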
203
static __inline void
204
radix_tlbie_fixup(uint32_t pid, vm_offset_t va, int ap)
205
{
206
207
__asm __volatile("ptesync" ::: "memory");
208
radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
209
TLBIEL_INVAL_PAGE, 0, 0, va, ap);
210
__asm __volatile("ptesync" ::: "memory");
211
radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
212
TLBIEL_INVAL_PAGE, pid, 0, va, ap);
213
}
214
215
static __inline void
216
radix_tlbie_invlpg_user_4k(uint32_t pid, vm_offset_t va)
217
{
218
219
radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
220
TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K);
221
radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_4K);
222
}
223
224
static __inline void
225
radix_tlbie_invlpg_user_2m(uint32_t pid, vm_offset_t va)
226
{
227
228
radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
229
TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_2M);
230
radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_2M);
231
}
232
233
static __inline void
234
radix_tlbie_invlpwc_user(uint32_t pid)
235
{
236
237
radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
238
TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
239
}
240
241
static __inline void
242
radix_tlbie_flush_user(uint32_t pid)
243
{
244
245
radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
246
TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
247
}
248
249
static __inline void
250
radix_tlbie_invlpg_kernel_4k(vm_offset_t va)
251
{
252
253
radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
254
TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_4K);
255
radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_4K);
256
}
257
258
static __inline void
259
radix_tlbie_invlpg_kernel_2m(vm_offset_t va)
260
{
261
262
radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
263
TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_2M);
264
radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_2M);
265
}
266
267
/* 1GB pages aren't currently supported. */
268
static __inline __unused void
269
radix_tlbie_invlpg_kernel_1g(vm_offset_t va)
270
{
271
272
radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
273
TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_1G);
274
radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_1G);
275
}
276
277
static __inline void
278
radix_tlbie_invlpwc_kernel(void)
279
{
280
281
radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
282
TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
283
}
284
285
static __inline void
286
radix_tlbie_flush_kernel(void)
287
{
288
289
radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
290
TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
291
}
292
293
static __inline vm_pindex_t
294
pmap_l3e_pindex(vm_offset_t va)
295
{
296
return ((va & PG_FRAME) >> L3_PAGE_SIZE_SHIFT);
297
}
298
299
static __inline vm_pindex_t
300
pmap_pml3e_index(vm_offset_t va)
301
{
302
303
return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK);
304
}
305
306
static __inline vm_pindex_t
307
pmap_pml2e_index(vm_offset_t va)
308
{
309
return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK);
310
}
311
312
static __inline vm_pindex_t
313
pmap_pml1e_index(vm_offset_t va)
314
{
315
return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT);
316
}
317
318
/* Return various clipped indexes for a given VA */
319
static __inline vm_pindex_t
320
pmap_pte_index(vm_offset_t va)
321
{
322
323
return ((va >> PAGE_SHIFT) & RPTE_MASK);
324
}
325
326
/* Return a pointer to the PT slot that corresponds to a VA */
327
static __inline pt_entry_t *
328
pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va)
329
{
330
pt_entry_t *pte;
331
vm_paddr_t ptepa;
332
333
ptepa = (be64toh(*l3e) & NLB_MASK);
334
pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa);
335
return (&pte[pmap_pte_index(va)]);
336
}
337
338
/* Return a pointer to the PD slot that corresponds to a VA */
339
static __inline pt_entry_t *
340
pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va)
341
{
342
pt_entry_t *l3e;
343
vm_paddr_t l3pa;
344
345
l3pa = (be64toh(*l2e) & NLB_MASK);
346
l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa);
347
return (&l3e[pmap_pml3e_index(va)]);
348
}
349
350
/* Return a pointer to the PD slot that corresponds to a VA */
351
static __inline pt_entry_t *
352
pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va)
353
{
354
pt_entry_t *l2e;
355
vm_paddr_t l2pa;
356
357
l2pa = (be64toh(*l1e) & NLB_MASK);
358
359
l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa);
360
return (&l2e[pmap_pml2e_index(va)]);
361
}
362
363
static __inline pml1_entry_t *
364
pmap_pml1e(pmap_t pmap, vm_offset_t va)
365
{
366
367
return (&pmap->pm_pml1[pmap_pml1e_index(va)]);
368
}
369
370
static pt_entry_t *
371
pmap_pml2e(pmap_t pmap, vm_offset_t va)
372
{
373
pt_entry_t *l1e;
374
375
l1e = pmap_pml1e(pmap, va);
376
if (l1e == NULL || (be64toh(*l1e) & RPTE_VALID) == 0)
377
return (NULL);
378
return (pmap_l1e_to_l2e(l1e, va));
379
}
380
381
static __inline pt_entry_t *
382
pmap_pml3e(pmap_t pmap, vm_offset_t va)
383
{
384
pt_entry_t *l2e;
385
386
l2e = pmap_pml2e(pmap, va);
387
if (l2e == NULL || (be64toh(*l2e) & RPTE_VALID) == 0)
388
return (NULL);
389
return (pmap_l2e_to_l3e(l2e, va));
390
}
391
392
static __inline pt_entry_t *
393
pmap_pte(pmap_t pmap, vm_offset_t va)
394
{
395
pt_entry_t *l3e;
396
397
l3e = pmap_pml3e(pmap, va);
398
if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0)
399
return (NULL);
400
return (pmap_l3e_to_pte(l3e, va));
401
}
402
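/*
 * Usage sketch (illustrative only): the walk resolves one level at a time
 * and may return NULL at any level, so callers must check before
 * dereferencing:
 *
 *	pt_entry_t *pte = pmap_pte(pmap, va);
 *	if (pte != NULL && (be64toh(*pte) & RPTE_VALID) != 0)
 *		pa = (be64toh(*pte) & PG_FRAME) | (va & PAGE_MASK);
 *
 * Entries are stored big-endian, hence the be64toh() before testing bits.
 */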
403
int nkpt = 64;
404
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
405
"Number of kernel page table pages allocated on bootup");
406
407
vm_paddr_t dmaplimit;
408
409
SYSCTL_DECL(_vm_pmap);
410
411
#ifdef INVARIANTS
412
#define VERBOSE_PMAP 0
413
#define VERBOSE_PROTECT 0
414
static int pmap_logging;
415
SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN,
416
&pmap_logging, 0, "verbose debug logging");
417
#endif
418
419
static u_int64_t KPTphys; /* phys addr of kernel level 1 */
420
421
//static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */
422
423
static vm_offset_t qframe = 0;
424
static struct mtx qframe_mtx;
425
426
void mmu_radix_activate(struct thread *);
427
void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int);
428
void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
429
vm_size_t);
430
void mmu_radix_clear_modify(vm_page_t);
431
void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
432
int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *);
433
int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t);
434
void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
435
vm_prot_t);
436
void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
437
vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va);
438
vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
439
void mmu_radix_kenter(vm_offset_t, vm_paddr_t);
440
vm_paddr_t mmu_radix_kextract(vm_offset_t);
441
void mmu_radix_kremove(vm_offset_t);
442
bool mmu_radix_is_modified(vm_page_t);
443
bool mmu_radix_is_prefaultable(pmap_t, vm_offset_t);
444
bool mmu_radix_is_referenced(vm_page_t);
445
void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t,
446
vm_pindex_t, vm_size_t);
447
bool mmu_radix_page_exists_quick(pmap_t, vm_page_t);
448
void mmu_radix_page_init(vm_page_t);
449
bool mmu_radix_page_is_mapped(vm_page_t m);
450
void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t);
451
int mmu_radix_page_wired_mappings(vm_page_t);
452
int mmu_radix_pinit(pmap_t);
453
void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
454
bool mmu_radix_ps_enabled(pmap_t);
455
void mmu_radix_qenter(vm_offset_t, vm_page_t *, int);
456
void mmu_radix_qremove(vm_offset_t, int);
457
vm_offset_t mmu_radix_quick_enter_page(vm_page_t);
458
void mmu_radix_quick_remove_page(vm_offset_t);
459
int mmu_radix_ts_referenced(vm_page_t);
460
void mmu_radix_release(pmap_t);
461
void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t);
462
void mmu_radix_remove_all(vm_page_t);
463
void mmu_radix_remove_pages(pmap_t);
464
void mmu_radix_remove_write(vm_page_t);
465
void mmu_radix_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz);
466
void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t);
467
void mmu_radix_zero_page(vm_page_t);
468
void mmu_radix_zero_page_area(vm_page_t, int, int);
469
int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t);
470
void mmu_radix_page_array_startup(long pages);
471
472
#include "mmu_oea64.h"
473
474
/*
475
* Kernel MMU interface
476
*/
477
478
static void mmu_radix_bootstrap(vm_offset_t, vm_offset_t);
479
480
static void mmu_radix_copy_page(vm_page_t, vm_page_t);
481
static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
482
vm_page_t *mb, vm_offset_t b_offset, int xfersize);
483
static int mmu_radix_growkernel(vm_offset_t);
484
static void mmu_radix_init(void);
485
static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
486
static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
487
static void mmu_radix_pinit0(pmap_t);
488
489
static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t);
490
static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
491
static void mmu_radix_unmapdev(void *, vm_size_t);
492
static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
493
static int mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t);
494
static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va);
495
static void mmu_radix_scan_init(void);
496
static void mmu_radix_cpu_bootstrap(int ap);
497
static void mmu_radix_tlbie_all(void);
498
499
static struct pmap_funcs mmu_radix_methods = {
500
.bootstrap = mmu_radix_bootstrap,
501
.copy_page = mmu_radix_copy_page,
502
.copy_pages = mmu_radix_copy_pages,
503
.cpu_bootstrap = mmu_radix_cpu_bootstrap,
504
.growkernel_nopanic = mmu_radix_growkernel,
505
.init = mmu_radix_init,
506
.map = mmu_radix_map,
507
.mincore = mmu_radix_mincore,
508
.pinit = mmu_radix_pinit,
509
.pinit0 = mmu_radix_pinit0,
510
511
.mapdev = mmu_radix_mapdev,
512
.mapdev_attr = mmu_radix_mapdev_attr,
513
.unmapdev = mmu_radix_unmapdev,
514
.kenter_attr = mmu_radix_kenter_attr,
515
.dev_direct_mapped = mmu_radix_dev_direct_mapped,
516
.dumpsys_pa_init = mmu_radix_scan_init,
517
.dumpsys_map_chunk = mmu_radix_dumpsys_map,
518
.page_is_mapped = mmu_radix_page_is_mapped,
519
.ps_enabled = mmu_radix_ps_enabled,
520
.align_superpage = mmu_radix_align_superpage,
521
.object_init_pt = mmu_radix_object_init_pt,
522
.protect = mmu_radix_protect,
523
/* pmap dispatcher interface */
524
.clear_modify = mmu_radix_clear_modify,
525
.copy = mmu_radix_copy,
526
.enter = mmu_radix_enter,
527
.enter_object = mmu_radix_enter_object,
528
.enter_quick = mmu_radix_enter_quick,
529
.extract = mmu_radix_extract,
530
.extract_and_hold = mmu_radix_extract_and_hold,
531
.is_modified = mmu_radix_is_modified,
532
.is_prefaultable = mmu_radix_is_prefaultable,
533
.is_referenced = mmu_radix_is_referenced,
534
.ts_referenced = mmu_radix_ts_referenced,
535
.page_exists_quick = mmu_radix_page_exists_quick,
536
.page_init = mmu_radix_page_init,
537
.page_wired_mappings = mmu_radix_page_wired_mappings,
538
.qenter = mmu_radix_qenter,
539
.qremove = mmu_radix_qremove,
540
.release = mmu_radix_release,
541
.remove = mmu_radix_remove,
542
.remove_all = mmu_radix_remove_all,
543
.remove_write = mmu_radix_remove_write,
544
.sync_icache = mmu_radix_sync_icache,
545
.unwire = mmu_radix_unwire,
546
.zero_page = mmu_radix_zero_page,
547
.zero_page_area = mmu_radix_zero_page_area,
548
.activate = mmu_radix_activate,
549
.quick_enter_page = mmu_radix_quick_enter_page,
550
.quick_remove_page = mmu_radix_quick_remove_page,
551
.page_set_memattr = mmu_radix_page_set_memattr,
552
.page_array_startup = mmu_radix_page_array_startup,
553
554
/* Internal interfaces */
555
.kenter = mmu_radix_kenter,
556
.kextract = mmu_radix_kextract,
557
.kremove = mmu_radix_kremove,
558
.change_attr = mmu_radix_change_attr,
559
.decode_kernel_ptr = mmu_radix_decode_kernel_ptr,
560
561
.tlbie_all = mmu_radix_tlbie_all,
562
};
563
564
MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods);
565
566
static bool pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
567
struct rwlock **lockp);
568
static bool pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va);
569
static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *);
570
static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
571
struct spglist *free, struct rwlock **lockp);
572
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
573
pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
574
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
575
static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde,
576
struct spglist *free);
577
static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
578
pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp);
579
580
static bool pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e,
581
u_int flags, struct rwlock **lockp);
582
#if VM_NRESERVLEVEL > 0
583
static void pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
584
struct rwlock **lockp);
585
#endif
586
static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
587
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
588
static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
589
vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate);
590
591
static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
592
vm_prot_t prot, struct rwlock **lockp);
593
static int pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde,
594
u_int flags, vm_page_t m, struct rwlock **lockp);
595
596
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
597
static void free_pv_chunk(struct pv_chunk *pc);
598
static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp);
599
static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va,
600
struct rwlock **lockp);
601
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
602
struct rwlock **lockp);
603
static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
604
struct spglist *free);
605
static bool pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free);
606
607
static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start);
608
static void pmap_invalidate_all(pmap_t pmap);
609
static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush);
610
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
611
612
/*
613
* Internal flags for pmap_enter()'s helper functions.
614
*/
615
#define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */
616
#define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */
617
618
#define UNIMPLEMENTED() panic("%s not implemented", __func__)
619
#define UNTESTED() panic("%s not yet tested", __func__)
620
621
/* Number of supported PID bits */
622
static unsigned int isa3_pid_bits;
623
624
/* PID to start allocating from */
625
static unsigned int isa3_base_pid;
626
627
#define PROCTAB_SIZE_SHIFT (isa3_pid_bits + 4)
628
#define PROCTAB_ENTRIES (1ul << isa3_pid_bits)
629
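/*
 * Worked example (hypothetical numbers): if the CPU reported
 * isa3_pid_bits == 20, PROCTAB_ENTRIES would be 1 << 20 and the process
 * table 1 << 24 bytes (16 MB); the "+ 4" in PROCTAB_SIZE_SHIFT accounts
 * for 16-byte process-table entries.
 */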
630
/*
631
* Map of physical memory regions.
632
*/
633
static struct mem_region *regions, *pregions;
634
static struct numa_mem_region *numa_pregions;
635
static int regions_sz, pregions_sz, numa_pregions_sz;
636
static struct pate *isa3_parttab;
637
static struct prte *isa3_proctab;
638
static vmem_t *asid_arena;
639
640
extern void bs_remap_earlyboot(void);
641
642
#define RADIX_PGD_SIZE_SHIFT 16
643
#define RADIX_PGD_SIZE (1UL << RADIX_PGD_SIZE_SHIFT)
644
645
#define RADIX_PGD_INDEX_SHIFT (RADIX_PGD_SIZE_SHIFT-3)
646
#define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t))
647
#define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t))
648
649
#define NUPML1E (RADIX_PGD_SIZE/sizeof(uint64_t)) /* number of userland PML1 pages */
650
#define NUPDPE (NUPML1E * NL2EPG) /* number of userland PDP pages */
651
#define NUPDE (NUPDPE * NL3EPG) /* number of userland PD entries */
652
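/*
 * Worked numbers (illustrative; assumes 4 KB pages and 8-byte entries):
 * RADIX_PGD_SIZE is 64 KB, so the root table holds NUPML1E = 8192 L1
 * entries, while each lower-level page holds NL2EPG = NL3EPG = 512
 * entries, giving NUPDPE = 8192 * 512 and NUPDE = NUPDPE * 512.
 */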
653
/* POWER9 only permits a 64k partition table size. */
654
#define PARTTAB_SIZE_SHIFT 16
655
#define PARTTAB_SIZE (1UL << PARTTAB_SIZE_SHIFT)
656
657
#define PARTTAB_HR (1UL << 63) /* host uses radix */
658
#define PARTTAB_GR (1UL << 63) /* guest uses radix; must match host */
659
660
/* TLB flush actions. Used as argument to mmu_radix_tlbiel_flush(). */
661
enum {
662
TLB_INVAL_SCOPE_LPID = 2, /* invalidate TLBs for current LPID */
663
TLB_INVAL_SCOPE_GLOBAL = 3, /* invalidate all TLBs */
664
};
665
666
#define NPV_LIST_LOCKS MAXCPU
667
static int pmap_initialized;
668
static vm_paddr_t proctab0pa;
669
static vm_paddr_t parttab_phys;
670
CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
671
672
/*
673
* Data for the pv entry allocation mechanism.
674
* Updates to pv_invl_gen are protected by the pv_list_locks[]
675
* elements, but reads are not.
676
*/
677
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
678
static struct mtx __exclusive_cache_line pv_chunks_mutex;
679
static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
680
static struct md_page *pv_table;
681
static struct md_page pv_dummy;
682
683
#ifdef PV_STATS
684
#define PV_STAT(x) do { x ; } while (0)
685
#else
686
#define PV_STAT(x) do { } while (0)
687
#endif
688
689
#define pa_radix_index(pa) ((pa) >> L3_PAGE_SIZE_SHIFT)
690
#define pa_to_pvh(pa) (&pv_table[pa_radix_index(pa)])
691
692
#define PHYS_TO_PV_LIST_LOCK(pa) \
693
(&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS])
694
695
#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \
696
struct rwlock **_lockp = (lockp); \
697
struct rwlock *_new_lock; \
698
\
699
_new_lock = PHYS_TO_PV_LIST_LOCK(pa); \
700
if (_new_lock != *_lockp) { \
701
if (*_lockp != NULL) \
702
rw_wunlock(*_lockp); \
703
*_lockp = _new_lock; \
704
rw_wlock(*_lockp); \
705
} \
706
} while (0)
707
708
#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
709
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
710
711
#define RELEASE_PV_LIST_LOCK(lockp) do { \
712
struct rwlock **_lockp = (lockp); \
713
\
714
if (*_lockp != NULL) { \
715
rw_wunlock(*_lockp); \
716
*_lockp = NULL; \
717
} \
718
} while (0)
719
720
#define VM_PAGE_TO_PV_LIST_LOCK(m) \
721
PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
722
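/*
 * Usage sketch (illustrative): callers that iterate over pv lists keep a
 * single lock cursor and let these macros retarget it as the physical
 * address of interest changes:
 *
 *	struct rwlock *lock = NULL;
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *	... walk or modify m->md.pv_list ...
 *	if (lock != NULL)
 *		rw_wunlock(lock);
 *
 * The macros drop the previously held pv-list lock, if any, before
 * acquiring the new one, so at most one pv-list lock is held at a time.
 */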
723
/*
724
* We support 52 bits, hence:
725
* bits 52 - 31 = 21, 0b10101
726
* RTS encoding details
727
* bits 0 - 3 of rts -> bits 6 - 8 unsigned long
728
* bits 4 - 5 of rts -> bits 62 - 63 of unsigned long
729
*/
730
#define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5))
731
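/*
 * Decoding the constant above (added for clarity): a 52-bit address space
 * is encoded as RTS = 52 - 31 = 21 = 0b10101.  The macro splits that
 * value, placing 0b101 (0x5) at bits 5-7 of the doubleword and 0b10 (0x2)
 * at bits 61-62, i.e. (0x2UL << 61) | (0x5UL << 5).
 */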
732
static int powernv_enabled = 1;
733
734
static __always_inline void
735
tlbiel_radix_set_isa300(uint32_t set, uint32_t is,
736
uint32_t pid, uint32_t ric, uint32_t prs)
737
{
738
uint64_t rb;
739
uint64_t rs;
740
741
rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53);
742
rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31);
743
744
__asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
745
: : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
746
: "memory");
747
}
748
749
static void
750
tlbiel_flush_isa3(uint32_t num_sets, uint32_t is)
751
{
752
uint32_t set;
753
754
__asm __volatile("ptesync": : :"memory");
755
756
/*
757
* Flush the first set of the TLB, and the entire Page Walk Cache
758
* and partition table entries. Then flush the remaining sets of the
759
* TLB.
760
*/
761
if (is == TLB_INVAL_SCOPE_GLOBAL) {
762
tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
763
for (set = 1; set < num_sets; set++)
764
tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
765
}
766
767
/* Do the same for process scoped entries. */
768
tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
769
for (set = 1; set < num_sets; set++)
770
tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
771
772
__asm __volatile("ptesync": : :"memory");
773
}
774
775
static void
776
mmu_radix_tlbiel_flush(int scope)
777
{
778
MPASS(scope == TLB_INVAL_SCOPE_LPID ||
779
scope == TLB_INVAL_SCOPE_GLOBAL);
780
781
tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, scope);
782
__asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
783
}
784
785
static void
786
mmu_radix_tlbie_all(void)
787
{
788
if (powernv_enabled)
789
mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
790
else
791
mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
792
}
793
794
static void
795
mmu_radix_init_amor(void)
796
{
797
/*
798
* In HV mode, we init AMOR (Authority Mask Override Register) so that
799
* the hypervisor and guest can set up IAMR (Instruction Authority Mask
800
* Register), enable key 0 and set it to 1.
801
*
802
* AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
803
*/
804
mtspr(SPR_AMOR, (3ul << 62));
805
}
806
807
static void
808
mmu_radix_init_iamr(void)
809
{
810
/*
811
* Radix always uses key0 of the IAMR to determine if an access is
812
* allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
813
* fetch.
814
*/
815
mtspr(SPR_IAMR, (1ul << 62));
816
}
817
818
static void
819
mmu_radix_pid_set(pmap_t pmap)
820
{
821
822
mtspr(SPR_PID, pmap->pm_pid);
823
isync();
824
}
825
826
/* Quick sort callout for comparing physical addresses. */
827
static int
828
pa_cmp(const void *a, const void *b)
829
{
830
const vm_paddr_t *pa = a, *pb = b;
831
832
if (*pa < *pb)
833
return (-1);
834
else if (*pa > *pb)
835
return (1);
836
else
837
return (0);
838
}
839
840
#define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte)
841
#define pte_load_clear(ptep) atomic_swap_long(ptep, 0)
842
#define pte_store(ptep, pte) do { \
843
MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X)); \
844
*(u_long *)(ptep) = htobe64((u_long)((pte) | PG_V | RPTE_LEAF)); \
845
} while (0)
846
/*
847
* NB: should only be used for adding directories - not for direct mappings
848
*/
849
#define pde_store(ptep, pa) do { \
850
*(u_long *)(ptep) = htobe64((u_long)(pa|RPTE_VALID|RPTE_SHIFT)); \
851
} while (0)
852
853
#define pte_clear(ptep) do { \
854
*(u_long *)(ptep) = (u_long)(0); \
855
} while (0)
856
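/*
 * Usage sketch (illustrative): leaf mappings and directory entries are
 * written with the helpers above, always byte-swapped to big-endian:
 *
 *	pte_store(pte, pa | RPTE_EAA_R | RPTE_EAA_W | PG_A | PG_M);
 *	pde_store(l3e, next_table_pa);
 *
 * pde_store() ORs in RPTE_SHIFT (9) as the next-level-size field, i.e. a
 * 512-entry (RPTE_ENTRIES) lower-level table.  Readers convert with
 * be64toh() before testing bits.
 */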
857
#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */
858
859
/*
860
* Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
861
* (PTE) page mappings have identical settings for the following fields:
862
*/
863
#define PG_PTE_PROMOTE (PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \
864
PG_M | PG_A | RPTE_EAA_MASK | PG_V)
865
866
static __inline void
867
pmap_resident_count_inc(pmap_t pmap, int count)
868
{
869
870
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
871
pmap->pm_stats.resident_count += count;
872
}
873
874
static __inline void
875
pmap_resident_count_dec(pmap_t pmap, int count)
876
{
877
878
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
879
KASSERT(pmap->pm_stats.resident_count >= count,
880
("pmap %p resident count underflow %ld %d", pmap,
881
pmap->pm_stats.resident_count, count));
882
pmap->pm_stats.resident_count -= count;
883
}
884
885
static void
886
pagezero(vm_offset_t va)
887
{
888
va = trunc_page(va);
889
890
bzero((void *)va, PAGE_SIZE);
891
}
892
893
static uint64_t
894
allocpages(int n)
895
{
896
u_int64_t ret;
897
898
ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE);
899
for (int i = 0; i < n; i++)
900
pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE));
901
return (ret);
902
}
903
904
static pt_entry_t *
905
kvtopte(vm_offset_t va)
906
{
907
pt_entry_t *l3e;
908
909
l3e = pmap_pml3e(kernel_pmap, va);
910
if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0)
911
return (NULL);
912
return (pmap_l3e_to_pte(l3e, va));
913
}
914
915
void
916
mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa)
917
{
918
pt_entry_t *pte;
919
920
pte = kvtopte(va);
921
MPASS(pte != NULL);
922
*pte = htobe64(pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | \
923
RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A);
924
}
925
926
bool
927
mmu_radix_ps_enabled(pmap_t pmap)
928
{
929
return (superpages_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
930
}
931
932
static pt_entry_t *
933
pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e)
934
{
935
pml3_entry_t *l3e;
936
pt_entry_t *pte;
937
938
va &= PG_PS_FRAME;
939
l3e = pmap_pml3e(pmap, va);
940
if (l3e == NULL || (be64toh(*l3e) & PG_V) == 0)
941
return (NULL);
942
943
if (be64toh(*l3e) & RPTE_LEAF) {
944
*is_l3e = 1;
945
return (l3e);
946
}
947
*is_l3e = 0;
948
va &= PG_FRAME;
949
pte = pmap_l3e_to_pte(l3e, va);
950
if (pte == NULL || (be64toh(*pte) & PG_V) == 0)
951
return (NULL);
952
return (pte);
953
}
954
955
int
956
pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags)
957
{
958
pt_entry_t *pte;
959
pt_entry_t startpte, origpte, newpte;
960
vm_page_t m;
961
int is_l3e;
962
963
startpte = 0;
964
retry:
965
if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL)
966
return (KERN_INVALID_ADDRESS);
967
origpte = newpte = be64toh(*pte);
968
if (startpte == 0) {
969
startpte = origpte;
970
if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) ||
971
((flags & VM_PROT_READ) && (startpte & PG_A))) {
972
pmap_invalidate_all(pmap);
973
#ifdef INVARIANTS
974
if (VERBOSE_PMAP || pmap_logging)
975
printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n",
976
__func__, pmap, va, flags, origpte);
977
#endif
978
return (KERN_FAILURE);
979
}
980
}
981
#ifdef INVARIANTS
982
if (VERBOSE_PMAP || pmap_logging)
983
printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va,
984
flags, origpte);
985
#endif
986
PMAP_LOCK(pmap);
987
if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL ||
988
be64toh(*pte) != origpte) {
989
PMAP_UNLOCK(pmap);
990
return (KERN_FAILURE);
991
}
992
m = PHYS_TO_VM_PAGE(newpte & PG_FRAME);
993
MPASS(m != NULL);
994
switch (flags) {
995
case VM_PROT_READ:
996
if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0)
997
goto protfail;
998
newpte |= PG_A;
999
vm_page_aflag_set(m, PGA_REFERENCED);
1000
break;
1001
case VM_PROT_WRITE:
1002
if ((newpte & RPTE_EAA_W) == 0)
1003
goto protfail;
1004
if (is_l3e)
1005
goto protfail;
1006
newpte |= PG_M;
1007
vm_page_dirty(m);
1008
break;
1009
case VM_PROT_EXECUTE:
1010
if ((newpte & RPTE_EAA_X) == 0)
1011
goto protfail;
1012
newpte |= PG_A;
1013
vm_page_aflag_set(m, PGA_REFERENCED);
1014
break;
1015
}
1016
1017
if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
1018
goto retry;
1019
ptesync();
1020
PMAP_UNLOCK(pmap);
1021
if (startpte == newpte)
1022
return (KERN_FAILURE);
1023
return (0);
1024
protfail:
1025
PMAP_UNLOCK(pmap);
1026
return (KERN_PROTECTION_FAILURE);
1027
}
1028
1029
/*
1030
* Returns true if the given page is mapped individually or as part of
1031
* a 2mpage. Otherwise, returns false.
1032
*/
1033
bool
1034
mmu_radix_page_is_mapped(vm_page_t m)
1035
{
1036
struct rwlock *lock;
1037
bool rv;
1038
1039
if ((m->oflags & VPO_UNMANAGED) != 0)
1040
return (false);
1041
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
1042
rw_rlock(lock);
1043
rv = !TAILQ_EMPTY(&m->md.pv_list) ||
1044
((m->flags & PG_FICTITIOUS) == 0 &&
1045
!TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
1046
rw_runlock(lock);
1047
return (rv);
1048
}
1049
1050
/*
1051
* Determine the appropriate bits to set in a PTE or PDE for a specified
1052
* caching mode.
1053
*/
1054
static int
1055
pmap_cache_bits(vm_memattr_t ma)
1056
{
1057
if (ma != VM_MEMATTR_DEFAULT) {
1058
switch (ma) {
1059
case VM_MEMATTR_UNCACHEABLE:
1060
return (RPTE_ATTR_GUARDEDIO);
1061
case VM_MEMATTR_CACHEABLE:
1062
return (RPTE_ATTR_MEM);
1063
case VM_MEMATTR_WRITE_BACK:
1064
case VM_MEMATTR_PREFETCHABLE:
1065
case VM_MEMATTR_WRITE_COMBINING:
1066
return (RPTE_ATTR_UNGUARDEDIO);
1067
}
1068
}
1069
return (0);
1070
}
1071
1072
static void
1073
pmap_invalidate_page(pmap_t pmap, vm_offset_t start)
1074
{
1075
ptesync();
1076
if (pmap == kernel_pmap)
1077
radix_tlbie_invlpg_kernel_4k(start);
1078
else
1079
radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
1080
ttusync();
1081
}
1082
1083
static void
1084
pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start)
1085
{
1086
ptesync();
1087
if (pmap == kernel_pmap)
1088
radix_tlbie_invlpg_kernel_2m(start);
1089
else
1090
radix_tlbie_invlpg_user_2m(pmap->pm_pid, start);
1091
ttusync();
1092
}
1093
1094
static void
1095
pmap_invalidate_pwc(pmap_t pmap)
1096
{
1097
ptesync();
1098
if (pmap == kernel_pmap)
1099
radix_tlbie_invlpwc_kernel();
1100
else
1101
radix_tlbie_invlpwc_user(pmap->pm_pid);
1102
ttusync();
1103
}
1104
1105
static void
1106
pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end)
1107
{
1108
if (((end - start) >> PAGE_SHIFT) > 8) {
1109
pmap_invalidate_all(pmap);
1110
return;
1111
}
1112
ptesync();
1113
if (pmap == kernel_pmap) {
1114
while (start < end) {
1115
radix_tlbie_invlpg_kernel_4k(start);
1116
start += PAGE_SIZE;
1117
}
1118
} else {
1119
while (start < end) {
1120
radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
1121
start += PAGE_SIZE;
1122
}
1123
}
1124
ttusync();
1125
}
1126
1127
static void
1128
pmap_invalidate_all(pmap_t pmap)
1129
{
1130
ptesync();
1131
if (pmap == kernel_pmap)
1132
radix_tlbie_flush_kernel();
1133
else
1134
radix_tlbie_flush_user(pmap->pm_pid);
1135
ttusync();
1136
}
1137
1138
static void
1139
pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e)
1140
{
1141
1142
/*
1143
* When the PDE has PG_PROMOTED set, the 2MB page mapping was created
1144
* by a promotion that did not invalidate the 512 4KB page mappings
1145
* that might exist in the TLB. Consequently, at this point, the TLB
1146
* may hold both 4KB and 2MB page mappings for the address range [va,
1147
* va + L3_PAGE_SIZE). Therefore, the entire range must be invalidated here.
1148
* In contrast, when PG_PROMOTED is clear, the TLB will not hold any
1149
* 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a
1150
* single tlbie suffices to invalidate the 2MB page mapping from the
1151
* TLB.
1152
*/
1153
ptesync();
1154
if ((l3e & PG_PROMOTED) != 0)
1155
pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1);
1156
else
1157
pmap_invalidate_page_2m(pmap, va);
1158
1159
pmap_invalidate_pwc(pmap);
1160
}
1161
1162
static __inline struct pv_chunk *
1163
pv_to_chunk(pv_entry_t pv)
1164
{
1165
1166
return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1167
}
1168
1169
#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1170
1171
#define PC_FREE0 0xfffffffffffffffful
1172
#define PC_FREE1 ((1ul << (_NPCPV % 64)) - 1)
1173
1174
static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 };
1175
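/*
 * Illustrative note: each 4 KB pv_chunk carries _NPCPV pv entries and a
 * two-word free bitmap.  An entry's slot maps to the bitmap as in
 * free_pv_entry() below:
 *
 *	idx = pv - &pc->pc_pventry[0];
 *	field = idx / 64;
 *	bit = idx % 64;
 *
 * PC_FREE0/PC_FREE1 are the all-free masks, with the unused high bits of
 * the last word left clear.
 */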
1176
/*
1177
* Ensure that the number of spare PV entries in the specified pmap meets or
1178
* exceeds the given count, "needed".
1179
*
1180
* The given PV list lock may be released.
1181
*/
1182
static void
1183
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
1184
{
1185
struct pch new_tail;
1186
struct pv_chunk *pc;
1187
vm_page_t m;
1188
int avail, free;
1189
bool reclaimed;
1190
1191
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1192
KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
1193
1194
/*
1195
* Newly allocated PV chunks must be stored in a private list until
1196
* the required number of PV chunks have been allocated. Otherwise,
1197
* reclaim_pv_chunk() could recycle one of these chunks. In
1198
* contrast, these chunks must be added to the pmap upon allocation.
1199
*/
1200
TAILQ_INIT(&new_tail);
1201
retry:
1202
avail = 0;
1203
TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
1204
// if ((cpu_feature2 & CPUID2_POPCNT) == 0)
1205
bit_count((bitstr_t *)pc->pc_map, 0,
1206
sizeof(pc->pc_map) * NBBY, &free);
1207
#if 0
1208
free = popcnt_pc_map_pq(pc->pc_map);
1209
#endif
1210
if (free == 0)
1211
break;
1212
avail += free;
1213
if (avail >= needed)
1214
break;
1215
}
1216
for (reclaimed = false; avail < needed; avail += _NPCPV) {
1217
m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
1218
if (m == NULL) {
1219
m = reclaim_pv_chunk(pmap, lockp);
1220
if (m == NULL)
1221
goto retry;
1222
reclaimed = true;
1223
}
1224
PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1225
PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1226
dump_add_page(m->phys_addr);
1227
pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1228
pc->pc_pmap = pmap;
1229
pc->pc_map[0] = PC_FREE0;
1230
pc->pc_map[1] = PC_FREE1;
1231
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1232
TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
1233
PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
1234
1235
/*
1236
* The reclaim might have freed a chunk from the current pmap.
1237
* If that chunk contained available entries, we need to
1238
* re-count the number of available entries.
1239
*/
1240
if (reclaimed)
1241
goto retry;
1242
}
1243
if (!TAILQ_EMPTY(&new_tail)) {
1244
mtx_lock(&pv_chunks_mutex);
1245
TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
1246
mtx_unlock(&pv_chunks_mutex);
1247
}
1248
}
1249
1250
/*
1251
* First find and then remove the pv entry for the specified pmap and virtual
1252
* address from the specified pv list. Returns the pv entry if found and NULL
1253
* otherwise. This operation can be performed on pv lists for either 4KB or
1254
* 2MB page mappings.
1255
*/
1256
static __inline pv_entry_t
1257
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1258
{
1259
pv_entry_t pv;
1260
1261
TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
1262
#ifdef INVARIANTS
1263
if (PV_PMAP(pv) == NULL) {
1264
printf("corrupted pv_chunk/pv %p\n", pv);
1265
printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":");
1266
}
1267
MPASS(PV_PMAP(pv) != NULL);
1268
MPASS(pv->pv_va != 0);
1269
#endif
1270
if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
1271
TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
1272
pvh->pv_gen++;
1273
break;
1274
}
1275
}
1276
return (pv);
1277
}
1278
1279
/*
1280
* After demotion from a 2MB page mapping to 512 4KB page mappings,
1281
* destroy the pv entry for the 2MB page mapping and reinstantiate the pv
1282
* entries for each of the 4KB page mappings.
1283
*/
1284
static void
1285
pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1286
struct rwlock **lockp)
1287
{
1288
struct md_page *pvh;
1289
struct pv_chunk *pc;
1290
pv_entry_t pv;
1291
vm_offset_t va_last;
1292
vm_page_t m;
1293
int bit, field;
1294
1295
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1296
KASSERT((pa & L3_PAGE_MASK) == 0,
1297
("pmap_pv_demote_pde: pa is not 2mpage aligned"));
1298
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1299
1300
/*
1301
* Transfer the 2mpage's pv entry for this mapping to the first
1302
* page's pv list. Once this transfer begins, the pv list lock
1303
* must not be released until the last pv entry is reinstantiated.
1304
*/
1305
pvh = pa_to_pvh(pa);
1306
va = trunc_2mpage(va);
1307
pv = pmap_pvh_remove(pvh, pmap, va);
1308
KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
1309
m = PHYS_TO_VM_PAGE(pa);
1310
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1311
1312
m->md.pv_gen++;
1313
/* Instantiate the remaining NPTEPG - 1 pv entries. */
1314
PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
1315
va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
1316
for (;;) {
1317
pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1318
KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0
1319
, ("pmap_pv_demote_pde: missing spare"));
1320
for (field = 0; field < _NPCM; field++) {
1321
while (pc->pc_map[field]) {
1322
bit = cnttzd(pc->pc_map[field]);
1323
pc->pc_map[field] &= ~(1ul << bit);
1324
pv = &pc->pc_pventry[field * 64 + bit];
1325
va += PAGE_SIZE;
1326
pv->pv_va = va;
1327
m++;
1328
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1329
("pmap_pv_demote_pde: page %p is not managed", m));
1330
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1331
1332
m->md.pv_gen++;
1333
if (va == va_last)
1334
goto out;
1335
}
1336
}
1337
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1338
TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1339
}
1340
out:
1341
if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
1342
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1343
TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1344
}
1345
PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
1346
PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
1347
}
1348
1349
static void
1350
reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap)
1351
{
1352
1353
if (pmap == NULL)
1354
return;
1355
pmap_invalidate_all(pmap);
1356
if (pmap != locked_pmap)
1357
PMAP_UNLOCK(pmap);
1358
}
1359
1360
/*
1361
* We are in a serious low memory condition. Resort to
1362
* drastic measures to free some pages so we can allocate
1363
* another pv entry chunk.
1364
*
1365
* Returns NULL if PV entries were reclaimed from the specified pmap.
1366
*
1367
* We do not, however, unmap 2mpages because subsequent accesses will
1368
* allocate per-page pv entries until repromotion occurs, thereby
1369
* exacerbating the shortage of free pv entries.
1370
*/
1371
static int active_reclaims = 0;
1372
static vm_page_t
1373
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1374
{
1375
struct pv_chunk *pc, *pc_marker, *pc_marker_end;
1376
struct pv_chunk_header pc_marker_b, pc_marker_end_b;
1377
struct md_page *pvh;
1378
pml3_entry_t *l3e;
1379
pmap_t next_pmap, pmap;
1380
pt_entry_t *pte, tpte;
1381
pv_entry_t pv;
1382
vm_offset_t va;
1383
vm_page_t m, m_pc;
1384
struct spglist free;
1385
uint64_t inuse;
1386
int bit, field, freed;
1387
1388
PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1389
KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
1390
pmap = NULL;
1391
m_pc = NULL;
1392
SLIST_INIT(&free);
1393
bzero(&pc_marker_b, sizeof(pc_marker_b));
1394
bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
1395
pc_marker = (struct pv_chunk *)&pc_marker_b;
1396
pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
1397
1398
mtx_lock(&pv_chunks_mutex);
1399
active_reclaims++;
1400
TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
1401
TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
1402
while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
1403
SLIST_EMPTY(&free)) {
1404
next_pmap = pc->pc_pmap;
1405
if (next_pmap == NULL) {
1406
/*
1407
* The next chunk is a marker. However, it is
1408
* not our marker, so active_reclaims must be
1409
* > 1. Consequently, the next_chunk code
1410
* will not rotate the pv_chunks list.
1411
*/
1412
goto next_chunk;
1413
}
1414
mtx_unlock(&pv_chunks_mutex);
1415
1416
/*
1417
* A pv_chunk can only be removed from the pc_lru list
1418
* when both pv_chunks_mutex is owned and the
1419
* corresponding pmap is locked.
1420
*/
1421
if (pmap != next_pmap) {
1422
reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
1423
pmap = next_pmap;
1424
/* Avoid deadlock and lock recursion. */
1425
if (pmap > locked_pmap) {
1426
RELEASE_PV_LIST_LOCK(lockp);
1427
PMAP_LOCK(pmap);
1428
mtx_lock(&pv_chunks_mutex);
1429
continue;
1430
} else if (pmap != locked_pmap) {
1431
if (PMAP_TRYLOCK(pmap)) {
1432
mtx_lock(&pv_chunks_mutex);
1433
continue;
1434
} else {
1435
pmap = NULL; /* pmap is not locked */
1436
mtx_lock(&pv_chunks_mutex);
1437
pc = TAILQ_NEXT(pc_marker, pc_lru);
1438
if (pc == NULL ||
1439
pc->pc_pmap != next_pmap)
1440
continue;
1441
goto next_chunk;
1442
}
1443
}
1444
}
1445
1446
/*
1447
* Destroy every non-wired, 4 KB page mapping in the chunk.
1448
*/
1449
freed = 0;
1450
for (field = 0; field < _NPCM; field++) {
1451
for (inuse = ~pc->pc_map[field] & pc_freemask[field];
1452
inuse != 0; inuse &= ~(1UL << bit)) {
1453
bit = cnttzd(inuse);
1454
pv = &pc->pc_pventry[field * 64 + bit];
1455
va = pv->pv_va;
1456
l3e = pmap_pml3e(pmap, va);
1457
if ((be64toh(*l3e) & RPTE_LEAF) != 0)
1458
continue;
1459
pte = pmap_l3e_to_pte(l3e, va);
1460
if ((be64toh(*pte) & PG_W) != 0)
1461
continue;
1462
tpte = be64toh(pte_load_clear(pte));
1463
m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
1464
if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
1465
vm_page_dirty(m);
1466
if ((tpte & PG_A) != 0)
1467
vm_page_aflag_set(m, PGA_REFERENCED);
1468
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1469
TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
1470
1471
m->md.pv_gen++;
1472
if (TAILQ_EMPTY(&m->md.pv_list) &&
1473
(m->flags & PG_FICTITIOUS) == 0) {
1474
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
1475
if (TAILQ_EMPTY(&pvh->pv_list)) {
1476
vm_page_aflag_clear(m,
1477
PGA_WRITEABLE);
1478
}
1479
}
1480
pc->pc_map[field] |= 1UL << bit;
1481
pmap_unuse_pt(pmap, va, be64toh(*l3e), &free);
1482
freed++;
1483
}
1484
}
1485
if (freed == 0) {
1486
mtx_lock(&pv_chunks_mutex);
1487
goto next_chunk;
1488
}
1489
/* Every freed mapping is for a 4 KB page. */
1490
pmap_resident_count_dec(pmap, freed);
1491
PV_STAT(atomic_add_long(&pv_entry_frees, freed));
1492
PV_STAT(atomic_add_int(&pv_entry_spare, freed));
1493
PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
1494
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1495
if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) {
1496
PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1497
PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1498
PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1499
/* Entire chunk is free; return it. */
1500
m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1501
dump_drop_page(m_pc->phys_addr);
1502
mtx_lock(&pv_chunks_mutex);
1503
TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1504
break;
1505
}
1506
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1507
mtx_lock(&pv_chunks_mutex);
1508
/* One freed pv entry in locked_pmap is sufficient. */
1509
if (pmap == locked_pmap)
1510
break;
1511
next_chunk:
1512
TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1513
TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
1514
if (active_reclaims == 1 && pmap != NULL) {
1515
/*
1516
* Rotate the pv chunks list so that we do not
1517
* scan the same pv chunks that could not be
1518
* freed (because they contained a wired
1519
* and/or superpage mapping) on every
1520
* invocation of reclaim_pv_chunk().
1521
*/
1522
while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
1523
MPASS(pc->pc_pmap != NULL);
1524
TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1525
TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1526
}
1527
}
1528
}
1529
TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1530
TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
1531
active_reclaims--;
1532
mtx_unlock(&pv_chunks_mutex);
1533
reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
1534
if (m_pc == NULL && !SLIST_EMPTY(&free)) {
1535
m_pc = SLIST_FIRST(&free);
1536
SLIST_REMOVE_HEAD(&free, plinks.s.ss);
1537
/* Recycle a freed page table page. */
1538
m_pc->ref_count = 1;
1539
}
1540
vm_page_free_pages_toq(&free, true);
1541
return (m_pc);
1542
}
1543
1544
/*
1545
* free the pv_entry back to the free list
1546
*/
1547
static void
1548
free_pv_entry(pmap_t pmap, pv_entry_t pv)
1549
{
1550
struct pv_chunk *pc;
1551
int idx, field, bit;
1552
1553
#ifdef VERBOSE_PV
1554
if (pmap != kernel_pmap)
1555
printf("%s(%p, %p)\n", __func__, pmap, pv);
1556
#endif
1557
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1558
PV_STAT(atomic_add_long(&pv_entry_frees, 1));
1559
PV_STAT(atomic_add_int(&pv_entry_spare, 1));
1560
PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
1561
pc = pv_to_chunk(pv);
1562
idx = pv - &pc->pc_pventry[0];
1563
field = idx / 64;
1564
bit = idx % 64;
1565
pc->pc_map[field] |= 1ul << bit;
1566
if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) {
1567
/* 98% of the time, pc is already at the head of the list. */
1568
if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
1569
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1570
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1571
}
1572
return;
1573
}
1574
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1575
free_pv_chunk(pc);
1576
}
1577
1578
static void
1579
free_pv_chunk(struct pv_chunk *pc)
1580
{
1581
vm_page_t m;
1582
1583
mtx_lock(&pv_chunks_mutex);
1584
TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1585
mtx_unlock(&pv_chunks_mutex);
1586
PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1587
PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1588
PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1589
/* entire chunk is free, return it */
1590
m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1591
dump_drop_page(m->phys_addr);
1592
vm_page_unwire_noq(m);
1593
vm_page_free(m);
1594
}
1595
1596
/*
1597
* Returns a new PV entry, allocating a new PV chunk from the system when
1598
* needed. If this PV chunk allocation fails and a PV list lock pointer was
1599
* given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
1600
* returned.
1601
*
1602
* The given PV list lock may be released.
1603
*/
1604
static pv_entry_t
1605
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
1606
{
1607
int bit, field;
1608
pv_entry_t pv;
1609
struct pv_chunk *pc;
1610
vm_page_t m;
1611
1612
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1613
PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
1614
retry:
1615
pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1616
if (pc != NULL) {
1617
for (field = 0; field < _NPCM; field++) {
1618
if (pc->pc_map[field]) {
1619
bit = cnttzd(pc->pc_map[field]);
1620
break;
1621
}
1622
}
1623
if (field < _NPCM) {
1624
pv = &pc->pc_pventry[field * 64 + bit];
1625
pc->pc_map[field] &= ~(1ul << bit);
1626
/* If this was the last item, move it to tail */
1627
if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
1628
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1629
TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
1630
pc_list);
1631
}
1632
PV_STAT(atomic_add_long(&pv_entry_count, 1));
1633
PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
1634
MPASS(PV_PMAP(pv) != NULL);
1635
return (pv);
1636
}
1637
}
1638
/* No free items, allocate another chunk */
1639
m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
1640
if (m == NULL) {
1641
if (lockp == NULL) {
1642
PV_STAT(pc_chunk_tryfail++);
1643
return (NULL);
1644
}
1645
m = reclaim_pv_chunk(pmap, lockp);
1646
if (m == NULL)
1647
goto retry;
1648
}
1649
PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1650
PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1651
dump_add_page(m->phys_addr);
1652
pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1653
pc->pc_pmap = pmap;
1654
pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */
1655
pc->pc_map[1] = PC_FREE1;
1656
mtx_lock(&pv_chunks_mutex);
1657
TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1658
mtx_unlock(&pv_chunks_mutex);
1659
pv = &pc->pc_pventry[0];
1660
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1661
PV_STAT(atomic_add_long(&pv_entry_count, 1));
1662
PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
1663
MPASS(PV_PMAP(pv) != NULL);
1664
return (pv);
1665
}
1666
1667
#if VM_NRESERVLEVEL > 0
1668
/*
1669
* After promotion from 512 4KB page mappings to a single 2MB page mapping,
1670
* replace the many pv entries for the 4KB page mappings by a single pv entry
1671
* for the 2MB page mapping.
1672
*/
1673
static void
1674
pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1675
struct rwlock **lockp)
1676
{
1677
struct md_page *pvh;
1678
pv_entry_t pv;
1679
vm_offset_t va_last;
1680
vm_page_t m;
1681
1682
KASSERT((pa & L3_PAGE_MASK) == 0,
1683
("pmap_pv_promote_pde: pa is not 2mpage aligned"));
1684
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1685
1686
/*
1687
* Transfer the first page's pv entry for this mapping to the 2mpage's
1688
* pv list. Aside from avoiding the cost of a call to get_pv_entry(),
1689
* a transfer avoids the possibility that get_pv_entry() calls
1690
* reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
1691
* mappings that is being promoted.
1692
*/
1693
m = PHYS_TO_VM_PAGE(pa);
1694
va = trunc_2mpage(va);
1695
pv = pmap_pvh_remove(&m->md, pmap, va);
1696
KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
1697
pvh = pa_to_pvh(pa);
1698
TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
1699
pvh->pv_gen++;
1700
/* Free the remaining NPTEPG - 1 pv entries. */
1701
va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
1702
do {
1703
m++;
1704
va += PAGE_SIZE;
1705
pmap_pvh_free(&m->md, pmap, va);
1706
} while (va < va_last);
1707
}
1708
#endif /* VM_NRESERVLEVEL > 0 */
1709
1710
/*
1711
* First find and then destroy the pv entry for the specified pmap and virtual
1712
* address. This operation can be performed on pv lists for either 4KB or 2MB
1713
* page mappings.
1714
*/
1715
static void
1716
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1717
{
1718
pv_entry_t pv;
1719
1720
pv = pmap_pvh_remove(pvh, pmap, va);
1721
KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
1722
free_pv_entry(pmap, pv);
1723
}
1724
1725
/*
1726
* Conditionally create the PV entry for a 4KB page mapping if the required
1727
* memory can be allocated without resorting to reclamation.
1728
*/
1729
static bool
1730
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
1731
struct rwlock **lockp)
1732
{
1733
pv_entry_t pv;
1734
1735
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1736
/* Pass NULL instead of the lock pointer to disable reclamation. */
1737
if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
1738
pv->pv_va = va;
1739
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1740
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1741
m->md.pv_gen++;
1742
return (true);
1743
} else
1744
return (false);
1745
}
1746
1747
vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX];
1748
#ifdef INVARIANTS
1749
static void
1750
validate_addr(vm_paddr_t addr, vm_size_t size)
1751
{
1752
vm_paddr_t end = addr + size;
1753
bool found = false;
1754
1755
for (int i = 0; i < 2 * phys_avail_count; i += 2) {
1756
if (addr >= phys_avail_debug[i] &&
1757
end <= phys_avail_debug[i + 1]) {
1758
found = true;
1759
break;
1760
}
1761
}
1762
KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array",
1763
addr, end));
1764
}
1765
#else
1766
static void validate_addr(vm_paddr_t addr, vm_size_t size) {}
1767
#endif
1768
#define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A)
1769
1770
static vm_paddr_t
1771
alloc_pt_page(void)
1772
{
1773
vm_paddr_t page;
1774
1775
page = allocpages(1);
1776
pagezero(PHYS_TO_DMAP(page));
1777
return (page);
1778
}
1779
1780
static void
1781
mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end)
1782
{
1783
pt_entry_t *pte, pteval;
1784
vm_paddr_t page;
1785
1786
if (bootverbose)
1787
printf("%s %lx -> %lx\n", __func__, start, end);
1788
while (start < end) {
1789
pteval = start | DMAP_PAGE_BITS;
1790
pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start));
1791
if ((be64toh(*pte) & RPTE_VALID) == 0) {
1792
page = alloc_pt_page();
1793
pde_store(pte, page);
1794
}
1795
pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start));
1796
if ((start & L2_PAGE_MASK) == 0 &&
1797
end - start >= L2_PAGE_SIZE) {
1798
start += L2_PAGE_SIZE;
1799
goto done;
1800
} else if ((be64toh(*pte) & RPTE_VALID) == 0) {
1801
page = alloc_pt_page();
1802
pde_store(pte, page);
1803
}
1804
1805
pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start));
1806
if ((start & L3_PAGE_MASK) == 0 &&
1807
end - start >= L3_PAGE_SIZE) {
1808
start += L3_PAGE_SIZE;
1809
goto done;
1810
} else if ((be64toh(*pte) & RPTE_VALID) == 0) {
1811
page = alloc_pt_page();
1812
pde_store(pte, page);
1813
}
1814
pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start));
1815
start += PAGE_SIZE;
1816
done:
1817
pte_store(pte, pteval);
1818
}
1819
}
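/*
 * Illustrative sketch (not compiled): mmu_radix_dmap_range() above maps each
 * chunk of the direct map with the largest page size whose alignment and
 * remaining length allow it.  The hypothetical helper below restates that
 * size selection on its own; it is an explanatory example, not used by the
 * code.
 */
#if 0
static vm_size_t
dmap_step_size(vm_paddr_t start, vm_paddr_t end)
{
	if ((start & L2_PAGE_MASK) == 0 && end - start >= L2_PAGE_SIZE)
		return (L2_PAGE_SIZE);	/* 1GB leaf at the L2 level */
	if ((start & L3_PAGE_MASK) == 0 && end - start >= L3_PAGE_SIZE)
		return (L3_PAGE_SIZE);	/* 2MB leaf at the L3 level */
	return (PAGE_SIZE);		/* 4KB leaf PTE */
}
#endif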
1820
1821
static void
mmu_radix_dmap_populate(vm_size_t hwphyssz)
{
        vm_paddr_t start, end;

        for (int i = 0; i < pregions_sz; i++) {
                start = pregions[i].mr_start;
                end = start + pregions[i].mr_size;
                if (hwphyssz && start >= hwphyssz)
                        break;
                if (hwphyssz && hwphyssz < end)
                        end = hwphyssz;
                mmu_radix_dmap_range(start, end);
        }
}
1836
1837
static void
1838
mmu_radix_setup_pagetables(vm_size_t hwphyssz)
1839
{
1840
vm_paddr_t ptpages, pages;
1841
pt_entry_t *pte;
1842
vm_paddr_t l1phys;
1843
1844
bzero(kernel_pmap, sizeof(struct pmap));
1845
PMAP_LOCK_INIT(kernel_pmap);
1846
vm_radix_init(&kernel_pmap->pm_radix);
1847
1848
ptpages = allocpages(3);
1849
l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE);
1850
validate_addr(l1phys, RADIX_PGD_SIZE);
1851
if (bootverbose)
1852
printf("l1phys=%lx\n", l1phys);
1853
MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0);
1854
for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++)
1855
pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE));
1856
kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys);
1857
1858
mmu_radix_dmap_populate(hwphyssz);
1859
1860
/*
1861
* Create page tables for first 128MB of KVA
1862
*/
1863
pages = ptpages;
1864
pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS);
1865
*pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
1866
pages += PAGE_SIZE;
1867
pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS);
1868
*pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
1869
pages += PAGE_SIZE;
1870
pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS);
1871
/*
 * The kernel page table pages need to be preserved in phys_avail and
 * must not overlap with previous allocations.
 */
1875
pages = allocpages(nkpt);
1876
if (bootverbose) {
1877
printf("phys_avail after dmap populate and nkpt allocation\n");
1878
for (int j = 0; j < 2 * phys_avail_count; j+=2)
1879
printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
1880
j, phys_avail[j], j + 1, phys_avail[j + 1]);
1881
}
1882
KPTphys = pages;
1883
for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE)
1884
*pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
1885
kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE;
1886
if (bootverbose)
1887
printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1);
1888
/*
1889
* Add a physical memory segment (vm_phys_seg) corresponding to the
1890
* preallocated kernel page table pages so that vm_page structures
1891
* representing these pages will be created. The vm_page structures
1892
* are required for promotion of the corresponding kernel virtual
1893
* addresses to superpage mappings.
1894
*/
1895
vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
1896
}
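/*
 * Illustrative sketch (not compiled): the "first 128MB of KVA" comment above
 * follows from the number of preallocated kernel page table pages.  Each
 * preallocated page table page maps L3_PAGE_SIZE (2MB) of KVA, so the
 * initial window is nkpt * 2MB; a default of nkpt == 64 (an assumption
 * consistent with that comment) gives 128MB.
 */
#if 0
	vm_offset_t kva_window = (vm_offset_t)nkpt * L3_PAGE_SIZE;
	/* covers VM_MIN_KERNEL_ADDRESS .. VM_MIN_KERNEL_ADDRESS + kva_window */
#endif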
1897
1898
static void
1899
mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end)
1900
{
1901
vm_paddr_t kpstart, kpend;
1902
vm_size_t physsz, hwphyssz;
1903
//uint64_t l2virt;
1904
int rm_pavail, proctab_size;
1905
int i, j;
1906
1907
kpstart = start & ~DMAP_BASE_ADDRESS;
1908
kpend = end & ~DMAP_BASE_ADDRESS;
1909
1910
/* Get physical memory regions from firmware */
1911
mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
1912
CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory");
1913
1914
if (2 * VM_PHYSSEG_MAX < regions_sz)
1915
panic("mmu_radix_early_bootstrap: phys_avail too small");
1916
1917
if (bootverbose)
1918
for (int i = 0; i < regions_sz; i++)
1919
printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n",
1920
i, regions[i].mr_start, i, regions[i].mr_size);
1921
/*
1922
* XXX workaround a simulator bug
1923
*/
1924
for (int i = 0; i < regions_sz; i++)
1925
if (regions[i].mr_start & PAGE_MASK) {
1926
regions[i].mr_start += PAGE_MASK;
1927
regions[i].mr_start &= ~PAGE_MASK;
1928
regions[i].mr_size &= ~PAGE_MASK;
1929
}
1930
if (bootverbose)
1931
for (int i = 0; i < pregions_sz; i++)
1932
printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n",
1933
i, pregions[i].mr_start, i, pregions[i].mr_size);
1934
1935
phys_avail_count = 0;
1936
physsz = 0;
1937
hwphyssz = 0;
1938
TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
1939
for (i = 0, j = 0; i < regions_sz; i++) {
1940
if (bootverbose)
1941
printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n",
1942
i, regions[i].mr_start, i, regions[i].mr_size);
1943
1944
if (regions[i].mr_size < PAGE_SIZE)
1945
continue;
1946
1947
if (hwphyssz != 0 &&
1948
(physsz + regions[i].mr_size) >= hwphyssz) {
1949
if (physsz < hwphyssz) {
1950
phys_avail[j] = regions[i].mr_start;
1951
phys_avail[j + 1] = regions[i].mr_start +
1952
(hwphyssz - physsz);
1953
physsz = hwphyssz;
1954
phys_avail_count++;
1955
dump_avail[j] = phys_avail[j];
1956
dump_avail[j + 1] = phys_avail[j + 1];
1957
}
1958
break;
1959
}
1960
phys_avail[j] = regions[i].mr_start;
1961
phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
1962
dump_avail[j] = phys_avail[j];
1963
dump_avail[j + 1] = phys_avail[j + 1];
1964
1965
phys_avail_count++;
1966
physsz += regions[i].mr_size;
1967
j += 2;
1968
}
1969
1970
/* Check for overlap with the kernel and exception vectors */
1971
rm_pavail = 0;
1972
for (j = 0; j < 2 * phys_avail_count; j+=2) {
1973
if (phys_avail[j] < EXC_LAST)
1974
phys_avail[j] += EXC_LAST;
1975
1976
if (phys_avail[j] >= kpstart &&
1977
phys_avail[j + 1] <= kpend) {
1978
phys_avail[j] = phys_avail[j + 1] = ~0;
1979
rm_pavail++;
1980
continue;
1981
}
1982
1983
if (kpstart >= phys_avail[j] &&
1984
kpstart < phys_avail[j + 1]) {
1985
if (kpend < phys_avail[j + 1]) {
1986
phys_avail[2 * phys_avail_count] =
1987
(kpend & ~PAGE_MASK) + PAGE_SIZE;
1988
phys_avail[2 * phys_avail_count + 1] =
1989
phys_avail[j + 1];
1990
phys_avail_count++;
1991
}
1992
1993
phys_avail[j + 1] = kpstart & ~PAGE_MASK;
1994
}
1995
1996
if (kpend >= phys_avail[j] &&
1997
kpend < phys_avail[j + 1]) {
1998
if (kpstart > phys_avail[j]) {
1999
phys_avail[2 * phys_avail_count] = phys_avail[j];
2000
phys_avail[2 * phys_avail_count + 1] =
2001
kpstart & ~PAGE_MASK;
2002
phys_avail_count++;
2003
}
2004
2005
phys_avail[j] = (kpend & ~PAGE_MASK) +
2006
PAGE_SIZE;
2007
}
2008
}
2009
qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp);
2010
for (i = 0; i < 2 * phys_avail_count; i++)
2011
phys_avail_debug[i] = phys_avail[i];
2012
2013
/* Remove physical available regions marked for removal (~0) */
2014
if (rm_pavail) {
2015
phys_avail_count -= rm_pavail;
2016
for (i = 2 * phys_avail_count;
2017
i < 2*(phys_avail_count + rm_pavail); i+=2)
2018
phys_avail[i] = phys_avail[i + 1] = 0;
2019
}
2020
if (bootverbose) {
2021
printf("phys_avail ranges after filtering:\n");
2022
for (j = 0; j < 2 * phys_avail_count; j+=2)
2023
printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
2024
j, phys_avail[j], j + 1, phys_avail[j + 1]);
2025
}
2026
physmem = btoc(physsz);
2027
2028
/* XXX assume we're running non-virtualized and
2029
* we don't support BHYVE
2030
*/
2031
if (isa3_pid_bits == 0)
2032
isa3_pid_bits = 20;
2033
if (powernv_enabled) {
2034
parttab_phys =
2035
moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE);
2036
validate_addr(parttab_phys, PARTTAB_SIZE);
2037
for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++)
2038
pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE));
2039
2040
}
2041
proctab_size = 1UL << PROCTAB_SIZE_SHIFT;
2042
proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size);
2043
validate_addr(proctab0pa, proctab_size);
2044
for (int i = 0; i < proctab_size/PAGE_SIZE; i++)
2045
pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE));
2046
2047
mmu_radix_setup_pagetables(hwphyssz);
2048
}
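/*
 * Illustrative worked example (invented numbers, not firmware data): the
 * loop above clips the kernel image [kpstart, kpend) out of each phys_avail
 * pair.  When the kernel sits in the middle of a range, the range is split
 * in two; the second half is appended at the end of the array and the table
 * is re-sorted by the qsort() call.
 *
 *	before:  phys_avail[j], phys_avail[j+1] = 0x00000000, 0x40000000
 *	kernel:  kpstart, kpend                 = 0x00100000, 0x01500000
 *	after:   phys_avail[j], phys_avail[j+1] = 0x00000000, 0x00100000
 *	         appended pair                  = 0x01501000, 0x40000000
 */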
2049
2050
static void
2051
mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end)
2052
{
2053
int i;
2054
vm_paddr_t pa;
2055
void *dpcpu;
2056
vm_offset_t va;
2057
2058
/*
2059
* Set up the Open Firmware pmap and add its mappings if not in real
2060
* mode.
2061
*/
2062
if (bootverbose)
2063
printf("%s enter\n", __func__);
2064
2065
/*
2066
* Calculate the last available physical address, and reserve the
2067
* vm_page_array (upper bound).
2068
*/
2069
Maxmem = 0;
2070
for (i = 0; phys_avail[i + 1] != 0; i += 2)
2071
Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1]));
2072
2073
/*
2074
* Remap any early IO mappings (console framebuffer, etc.)
2075
*/
2076
bs_remap_earlyboot();
2077
2078
/*
2079
* Allocate a kernel stack with a guard page for thread0 and map it
2080
* into the kernel page map.
2081
*/
2082
pa = allocpages(kstack_pages);
2083
va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
2084
virtual_avail = va + kstack_pages * PAGE_SIZE;
2085
CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va);
2086
thread0.td_kstack = va;
2087
for (i = 0; i < kstack_pages; i++) {
2088
mmu_radix_kenter(va, pa);
2089
pa += PAGE_SIZE;
2090
va += PAGE_SIZE;
2091
}
2092
thread0.td_kstack_pages = kstack_pages;
2093
2094
/*
2095
* Allocate virtual address space for the message buffer.
2096
*/
2097
pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK) >> PAGE_SHIFT);
2098
msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa);
2099
2100
/*
2101
* Allocate virtual address space for the dynamic percpu area.
2102
*/
2103
pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT);
2104
dpcpu = (void *)PHYS_TO_DMAP(pa);
2105
dpcpu_init(dpcpu, curcpu);
2106
2107
crashdumpmap = (caddr_t)virtual_avail;
2108
virtual_avail += MAXDUMPPGS * PAGE_SIZE;
2109
2110
/*
2111
* Reserve some special page table entries/VA space for temporary
2112
* mapping of pages.
2113
*/
2114
}
2115
2116
static void
2117
mmu_parttab_init(void)
2118
{
2119
uint64_t ptcr;
2120
2121
isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys);
2122
2123
if (bootverbose)
2124
printf("%s parttab: %p\n", __func__, isa3_parttab);
2125
ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
2126
if (bootverbose)
2127
printf("setting ptcr %lx\n", ptcr);
2128
mtspr(SPR_PTCR, ptcr);
2129
}
2130
2131
static void
2132
mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab)
2133
{
2134
uint64_t prev;
2135
2136
if (bootverbose)
2137
printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab,
2138
lpid, pagetab, proctab);
2139
prev = be64toh(isa3_parttab[lpid].pagetab);
2140
isa3_parttab[lpid].pagetab = htobe64(pagetab);
2141
isa3_parttab[lpid].proctab = htobe64(proctab);
2142
2143
if (prev & PARTTAB_HR) {
2144
__asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
2145
"r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2146
__asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
2147
"r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2148
} else {
2149
__asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
2150
"r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2151
}
2152
ttusync();
2153
}
2154
2155
static void
2156
mmu_radix_parttab_init(void)
2157
{
2158
uint64_t pagetab;
2159
2160
mmu_parttab_init();
2161
pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \
2162
RADIX_PGD_INDEX_SHIFT | PARTTAB_HR;
2163
mmu_parttab_update(0, pagetab, 0);
2164
}
2165
2166
static void
2167
mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size)
2168
{
2169
uint64_t pagetab, proctab;
2170
2171
pagetab = be64toh(isa3_parttab[0].pagetab);
2172
proctab = proctabpa | table_size | PARTTAB_GR;
2173
mmu_parttab_update(0, pagetab, proctab);
2174
}
2175
2176
static void
2177
mmu_radix_proctab_init(void)
2178
{
2179
2180
isa3_base_pid = 1;
2181
2182
isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa);
2183
isa3_proctab->proctab0 =
2184
htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) |
2185
RADIX_PGD_INDEX_SHIFT);
2186
2187
if (powernv_enabled) {
2188
mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12);
2189
__asm __volatile("ptesync" : : : "memory");
2190
__asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
2191
"r" (TLBIEL_INVAL_SET_LPID), "r" (0));
2192
__asm __volatile("eieio; tlbsync; ptesync" : : : "memory");
2193
#ifdef PSERIES
2194
} else {
2195
int64_t rc;
2196
2197
rc = phyp_hcall(H_REGISTER_PROC_TBL,
2198
PROC_TABLE_NEW | PROC_TABLE_RADIX | PROC_TABLE_GTSE,
2199
proctab0pa, 0, PROCTAB_SIZE_SHIFT - 12);
2200
if (rc != H_SUCCESS)
2201
panic("mmu_radix_proctab_init: "
2202
"failed to register process table: rc=%jd",
2203
(intmax_t)rc);
2204
#endif
2205
}
2206
2207
if (bootverbose)
2208
printf("process table %p and kernel radix PDE: %p\n",
2209
isa3_proctab, kernel_pmap->pm_pml1);
2210
mtmsr(mfmsr() | PSL_DR);
2211
mtmsr(mfmsr() & ~PSL_DR);
2212
kernel_pmap->pm_pid = isa3_base_pid;
2213
isa3_base_pid++;
2214
}
2215
2216
void
2217
mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2218
int advice)
2219
{
2220
struct rwlock *lock;
2221
pml1_entry_t *l1e;
2222
pml2_entry_t *l2e;
2223
pml3_entry_t oldl3e, *l3e;
2224
pt_entry_t *pte;
2225
vm_offset_t va, va_next;
2226
vm_page_t m;
2227
bool anychanged;
2228
2229
if (advice != MADV_DONTNEED && advice != MADV_FREE)
2230
return;
2231
anychanged = false;
2232
PMAP_LOCK(pmap);
2233
for (; sva < eva; sva = va_next) {
2234
l1e = pmap_pml1e(pmap, sva);
2235
if ((be64toh(*l1e) & PG_V) == 0) {
2236
va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
2237
if (va_next < sva)
2238
va_next = eva;
2239
continue;
2240
}
2241
l2e = pmap_l1e_to_l2e(l1e, sva);
2242
if ((be64toh(*l2e) & PG_V) == 0) {
2243
va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
2244
if (va_next < sva)
2245
va_next = eva;
2246
continue;
2247
}
2248
va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
2249
if (va_next < sva)
2250
va_next = eva;
2251
l3e = pmap_l2e_to_l3e(l2e, sva);
2252
oldl3e = be64toh(*l3e);
2253
if ((oldl3e & PG_V) == 0)
2254
continue;
2255
else if ((oldl3e & RPTE_LEAF) != 0) {
2256
if ((oldl3e & PG_MANAGED) == 0)
2257
continue;
2258
lock = NULL;
2259
if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) {
2260
if (lock != NULL)
2261
rw_wunlock(lock);
2262
2263
/*
2264
* The large page mapping was destroyed.
2265
*/
2266
continue;
2267
}
2268
2269
/*
2270
* Unless the page mappings are wired, remove the
2271
* mapping to a single page so that a subsequent
2272
* access may repromote. Choosing the last page
2273
* within the address range [sva, min(va_next, eva))
2274
* generally results in more repromotions. Since the
2275
* underlying page table page is fully populated, this
2276
* removal never frees a page table page.
2277
*/
2278
if ((oldl3e & PG_W) == 0) {
2279
va = eva;
2280
if (va > va_next)
2281
va = va_next;
2282
va -= PAGE_SIZE;
2283
KASSERT(va >= sva,
2284
("mmu_radix_advise: no address gap"));
2285
pte = pmap_l3e_to_pte(l3e, va);
2286
KASSERT((be64toh(*pte) & PG_V) != 0,
2287
("pmap_advise: invalid PTE"));
2288
pmap_remove_pte(pmap, pte, va, be64toh(*l3e), NULL,
2289
&lock);
2290
anychanged = true;
2291
}
2292
if (lock != NULL)
2293
rw_wunlock(lock);
2294
}
2295
if (va_next > eva)
2296
va_next = eva;
2297
va = va_next;
2298
for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next;
2299
pte++, sva += PAGE_SIZE) {
2300
MPASS(pte == pmap_pte(pmap, sva));
2301
2302
if ((be64toh(*pte) & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
2303
goto maybe_invlrng;
2304
else if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2305
if (advice == MADV_DONTNEED) {
2306
/*
2307
* Future calls to pmap_is_modified()
2308
* can be avoided by making the page
2309
* dirty now.
2310
*/
2311
m = PHYS_TO_VM_PAGE(be64toh(*pte) & PG_FRAME);
2312
vm_page_dirty(m);
2313
}
2314
atomic_clear_long(pte, htobe64(PG_M | PG_A));
2315
} else if ((be64toh(*pte) & PG_A) != 0)
2316
atomic_clear_long(pte, htobe64(PG_A));
2317
else
2318
goto maybe_invlrng;
2319
anychanged = true;
2320
continue;
2321
maybe_invlrng:
2322
if (va != va_next) {
2323
anychanged = true;
2324
va = va_next;
2325
}
2326
}
2327
if (va != va_next)
2328
anychanged = true;
2329
}
2330
if (anychanged)
2331
pmap_invalidate_all(pmap);
2332
PMAP_UNLOCK(pmap);
2333
}
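/*
 * Illustrative sketch (not compiled): when an unwired 2MB mapping is demoted
 * above, the single 4KB page chosen for removal is the last page of
 * [sva, min(va_next, eva)), computed as below.  The variables are the ones
 * used in mmu_radix_advise() above; this only restates the address
 * arithmetic.
 */
#if 0
	vm_offset_t last;

	last = (eva < va_next ? eva : va_next) - PAGE_SIZE;
	KASSERT(last >= sva, ("mmu_radix_advise: no address gap"));
#endif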
2334
2335
/*
2336
* Routines used in machine-dependent code
2337
*/
2338
static void
2339
mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end)
2340
{
2341
uint64_t lpcr;
2342
2343
if (bootverbose)
2344
printf("%s\n", __func__);
2345
hw_direct_map = 1;
2346
powernv_enabled = (mfmsr() & PSL_HV) ? 1 : 0;
2347
mmu_radix_early_bootstrap(start, end);
2348
if (bootverbose)
2349
printf("early bootstrap complete\n");
2350
if (powernv_enabled) {
2351
lpcr = mfspr(SPR_LPCR);
2352
mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
2353
mmu_radix_parttab_init();
2354
mmu_radix_init_amor();
2355
if (bootverbose)
2356
printf("powernv init complete\n");
2357
}
2358
mmu_radix_init_iamr();
2359
mmu_radix_proctab_init();
2360
mmu_radix_pid_set(kernel_pmap);
2361
if (powernv_enabled)
2362
mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
2363
else
2364
mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
2365
2366
mmu_radix_late_bootstrap(start, end);
2367
numa_mem_regions(&numa_pregions, &numa_pregions_sz);
2368
if (bootverbose)
2369
printf("%s done\n", __func__);
2370
pmap_bootstrapped = 1;
2371
dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE);
2372
PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS);
2373
}
2374
2375
static void
2376
mmu_radix_cpu_bootstrap(int ap)
2377
{
2378
uint64_t lpcr;
2379
uint64_t ptcr;
2380
2381
if (powernv_enabled) {
2382
lpcr = mfspr(SPR_LPCR);
2383
mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
2384
2385
ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
2386
mtspr(SPR_PTCR, ptcr);
2387
mmu_radix_init_amor();
2388
}
2389
mmu_radix_init_iamr();
2390
mmu_radix_pid_set(kernel_pmap);
2391
if (powernv_enabled)
2392
mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
2393
else
2394
mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
2395
}
2396
2397
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0,
2398
"2MB page mapping counters");
2399
2400
static COUNTER_U64_DEFINE_EARLY(pmap_l3e_demotions);
2401
SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD,
2402
&pmap_l3e_demotions, "2MB page demotions");
2403
2404
static COUNTER_U64_DEFINE_EARLY(pmap_l3e_mappings);
2405
SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD,
2406
&pmap_l3e_mappings, "2MB page mappings");
2407
2408
static COUNTER_U64_DEFINE_EARLY(pmap_l3e_p_failures);
2409
SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD,
2410
&pmap_l3e_p_failures, "2MB page promotion failures");
2411
2412
static COUNTER_U64_DEFINE_EARLY(pmap_l3e_promotions);
2413
SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD,
2414
&pmap_l3e_promotions, "2MB page promotions");
2415
2416
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0,
2417
"1GB page mapping counters");
2418
2419
static COUNTER_U64_DEFINE_EARLY(pmap_l2e_demotions);
2420
SYSCTL_COUNTER_U64(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD,
2421
&pmap_l2e_demotions, "1GB page demotions");
2422
2423
void
2424
mmu_radix_clear_modify(vm_page_t m)
2425
{
2426
struct md_page *pvh;
2427
pmap_t pmap;
2428
pv_entry_t next_pv, pv;
2429
pml3_entry_t oldl3e, *l3e;
2430
pt_entry_t oldpte, *pte;
2431
struct rwlock *lock;
2432
vm_offset_t va;
2433
int md_gen, pvh_gen;
2434
2435
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2436
("pmap_clear_modify: page %p is not managed", m));
2437
vm_page_assert_busied(m);
2438
CTR2(KTR_PMAP, "%s(%p)", __func__, m);
2439
2440
/*
2441
* If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
2442
* If the object containing the page is locked and the page is not
2443
* exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
2444
*/
2445
if ((m->a.flags & PGA_WRITEABLE) == 0)
2446
return;
2447
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2448
pa_to_pvh(VM_PAGE_TO_PHYS(m));
2449
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2450
rw_wlock(lock);
2451
restart:
2452
TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
2453
pmap = PV_PMAP(pv);
2454
if (!PMAP_TRYLOCK(pmap)) {
2455
pvh_gen = pvh->pv_gen;
2456
rw_wunlock(lock);
2457
PMAP_LOCK(pmap);
2458
rw_wlock(lock);
2459
if (pvh_gen != pvh->pv_gen) {
2460
PMAP_UNLOCK(pmap);
2461
goto restart;
2462
}
2463
}
2464
va = pv->pv_va;
2465
l3e = pmap_pml3e(pmap, va);
2466
oldl3e = be64toh(*l3e);
2467
if ((oldl3e & PG_RW) != 0 &&
2468
pmap_demote_l3e_locked(pmap, l3e, va, &lock) &&
2469
(oldl3e & PG_W) == 0) {
2470
/*
2471
* Write protect the mapping to a
2472
* single page so that a subsequent
2473
* write access may repromote.
2474
*/
2475
va += VM_PAGE_TO_PHYS(m) - (oldl3e &
2476
PG_PS_FRAME);
2477
pte = pmap_l3e_to_pte(l3e, va);
2478
oldpte = be64toh(*pte);
2479
while (!atomic_cmpset_long(pte,
2480
htobe64(oldpte),
2481
htobe64((oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW))))
2482
oldpte = be64toh(*pte);
2483
vm_page_dirty(m);
2484
pmap_invalidate_page(pmap, va);
2485
}
2486
PMAP_UNLOCK(pmap);
2487
}
2488
TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
2489
pmap = PV_PMAP(pv);
2490
if (!PMAP_TRYLOCK(pmap)) {
2491
md_gen = m->md.pv_gen;
2492
pvh_gen = pvh->pv_gen;
2493
rw_wunlock(lock);
2494
PMAP_LOCK(pmap);
2495
rw_wlock(lock);
2496
if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
2497
PMAP_UNLOCK(pmap);
2498
goto restart;
2499
}
2500
}
2501
l3e = pmap_pml3e(pmap, pv->pv_va);
2502
KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_clear_modify: found"
2503
" a 2mpage in page %p's pv list", m));
2504
pte = pmap_l3e_to_pte(l3e, pv->pv_va);
2505
if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2506
atomic_clear_long(pte, htobe64(PG_M));
2507
pmap_invalidate_page(pmap, pv->pv_va);
2508
}
2509
PMAP_UNLOCK(pmap);
2510
}
2511
rw_wunlock(lock);
2512
}
2513
2514
void
2515
mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
2516
vm_size_t len, vm_offset_t src_addr)
2517
{
2518
struct rwlock *lock;
2519
struct spglist free;
2520
vm_offset_t addr;
2521
vm_offset_t end_addr = src_addr + len;
2522
vm_offset_t va_next;
2523
vm_page_t dst_pdpg, dstmpte, srcmpte;
2524
bool invalidate_all;
2525
2526
CTR6(KTR_PMAP,
2527
"%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n",
2528
__func__, dst_pmap, src_pmap, dst_addr, len, src_addr);
2529
2530
if (dst_addr != src_addr)
2531
return;
2532
lock = NULL;
2533
invalidate_all = false;
2534
if (dst_pmap < src_pmap) {
2535
PMAP_LOCK(dst_pmap);
2536
PMAP_LOCK(src_pmap);
2537
} else {
2538
PMAP_LOCK(src_pmap);
2539
PMAP_LOCK(dst_pmap);
2540
}
2541
2542
for (addr = src_addr; addr < end_addr; addr = va_next) {
2543
pml1_entry_t *l1e;
2544
pml2_entry_t *l2e;
2545
pml3_entry_t srcptepaddr, *l3e;
2546
pt_entry_t *src_pte, *dst_pte;
2547
2548
l1e = pmap_pml1e(src_pmap, addr);
2549
if ((be64toh(*l1e) & PG_V) == 0) {
2550
va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
2551
if (va_next < addr)
2552
va_next = end_addr;
2553
continue;
2554
}
2555
2556
l2e = pmap_l1e_to_l2e(l1e, addr);
2557
if ((be64toh(*l2e) & PG_V) == 0) {
2558
va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
2559
if (va_next < addr)
2560
va_next = end_addr;
2561
continue;
2562
}
2563
2564
va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
2565
if (va_next < addr)
2566
va_next = end_addr;
2567
2568
l3e = pmap_l2e_to_l3e(l2e, addr);
2569
srcptepaddr = be64toh(*l3e);
2570
if (srcptepaddr == 0)
2571
continue;
2572
2573
if (srcptepaddr & RPTE_LEAF) {
2574
if ((addr & L3_PAGE_MASK) != 0 ||
2575
addr + L3_PAGE_SIZE > end_addr)
2576
continue;
2577
dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL);
2578
if (dst_pdpg == NULL)
2579
break;
2580
l3e = (pml3_entry_t *)
2581
PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
2582
l3e = &l3e[pmap_pml3e_index(addr)];
2583
if (be64toh(*l3e) == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
2584
pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr,
2585
PMAP_ENTER_NORECLAIM, &lock))) {
2586
*l3e = htobe64(srcptepaddr & ~PG_W);
2587
pmap_resident_count_inc(dst_pmap,
2588
L3_PAGE_SIZE / PAGE_SIZE);
2589
counter_u64_add(pmap_l3e_mappings, 1);
2590
} else
2591
dst_pdpg->ref_count--;
2592
continue;
2593
}
2594
2595
srcptepaddr &= PG_FRAME;
2596
srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2597
KASSERT(srcmpte->ref_count > 0,
2598
("pmap_copy: source page table page is unused"));
2599
2600
if (va_next > end_addr)
2601
va_next = end_addr;
2602
2603
src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
2604
src_pte = &src_pte[pmap_pte_index(addr)];
2605
dstmpte = NULL;
2606
while (addr < va_next) {
2607
pt_entry_t ptetemp;
2608
ptetemp = be64toh(*src_pte);
2609
/*
 * We only virtual-copy managed pages.
 */
2612
if ((ptetemp & PG_MANAGED) != 0) {
2613
if (dstmpte != NULL &&
2614
dstmpte->pindex == pmap_l3e_pindex(addr))
2615
dstmpte->ref_count++;
2616
else if ((dstmpte = pmap_allocpte(dst_pmap,
2617
addr, NULL)) == NULL)
2618
goto out;
2619
dst_pte = (pt_entry_t *)
2620
PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
2621
dst_pte = &dst_pte[pmap_pte_index(addr)];
2622
if (be64toh(*dst_pte) == 0 &&
2623
pmap_try_insert_pv_entry(dst_pmap, addr,
2624
PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
2625
&lock)) {
2626
/*
2627
* Clear the wired, modified, and
2628
* accessed (referenced) bits
2629
* during the copy.
2630
*/
2631
*dst_pte = htobe64(ptetemp & ~(PG_W | PG_M |
2632
PG_A));
2633
pmap_resident_count_inc(dst_pmap, 1);
2634
} else {
2635
SLIST_INIT(&free);
2636
if (pmap_unwire_ptp(dst_pmap, addr,
2637
dstmpte, &free)) {
2638
/*
2639
* Although "addr" is not
2640
* mapped, paging-structure
2641
* caches could nonetheless
2642
* have entries that refer to
2643
* the freed page table pages.
2644
* Invalidate those entries.
2645
*/
2646
invalidate_all = true;
2647
vm_page_free_pages_toq(&free,
2648
true);
2649
}
2650
goto out;
2651
}
2652
if (dstmpte->ref_count >= srcmpte->ref_count)
2653
break;
2654
}
2655
addr += PAGE_SIZE;
2656
if (__predict_false((addr & L3_PAGE_MASK) == 0))
2657
src_pte = pmap_pte(src_pmap, addr);
2658
else
2659
src_pte++;
2660
}
2661
}
2662
out:
2663
if (invalidate_all)
2664
pmap_invalidate_all(dst_pmap);
2665
if (lock != NULL)
2666
rw_wunlock(lock);
2667
PMAP_UNLOCK(src_pmap);
2668
PMAP_UNLOCK(dst_pmap);
2669
}
2670
2671
static void
mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst)
{
        vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
        vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));

        CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst);
        /*
         * XXX slow
         */
        bcopy((void *)src, (void *)dst, PAGE_SIZE);
}
2683
2684
static void
2685
mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
2686
vm_offset_t b_offset, int xfersize)
2687
{
2688
void *a_cp, *b_cp;
2689
vm_offset_t a_pg_offset, b_pg_offset;
2690
int cnt;
2691
2692
CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma,
2693
a_offset, mb, b_offset, xfersize);
2694
2695
while (xfersize > 0) {
2696
a_pg_offset = a_offset & PAGE_MASK;
2697
cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
2698
a_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
2699
VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) +
2700
a_pg_offset;
2701
b_pg_offset = b_offset & PAGE_MASK;
2702
cnt = min(cnt, PAGE_SIZE - b_pg_offset);
2703
b_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
2704
VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) +
2705
b_pg_offset;
2706
bcopy(a_cp, b_cp, cnt);
2707
a_offset += cnt;
2708
b_offset += cnt;
2709
xfersize -= cnt;
2710
}
2711
}
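/*
 * Illustrative worked example (invented offsets): each iteration above copies
 * at most the smaller of what remains in the current source page and the
 * current destination page.  With a_offset == 0xf00, b_offset == 0x080 and
 * PAGE_SIZE == 4096, the first chunk is min(4096 - 0xf00, 4096 - 0x080) ==
 * 256 bytes, after which the source crosses into its next page.
 */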
2712
2713
#if VM_NRESERVLEVEL > 0
2714
/*
2715
* Tries to promote the 512, contiguous 4KB page mappings that are within a
2716
* single page table page (PTP) to a single 2MB page mapping. For promotion
2717
* to occur, two conditions must be met: (1) the 4KB page mappings must map
2718
* aligned, contiguous physical memory and (2) the 4KB page mappings must have
2719
* identical characteristics.
2720
*/
2721
static int
2722
pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va,
2723
struct rwlock **lockp)
2724
{
2725
pml3_entry_t newpde;
2726
pt_entry_t *firstpte, oldpte, pa, *pte;
2727
vm_page_t mpte;
2728
2729
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2730
2731
/*
2732
* Examine the first PTE in the specified PTP. Abort if this PTE is
2733
* either invalid, unused, or does not map the first 4KB physical page
2734
* within a 2MB page.
2735
*/
2736
firstpte = (pt_entry_t *)PHYS_TO_DMAP(be64toh(*pde) & PG_FRAME);
2737
setpde:
2738
newpde = be64toh(*firstpte);
2739
if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
2740
CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2741
" in pmap %p", va, pmap);
2742
goto fail;
2743
}
2744
if ((newpde & (PG_M | PG_RW)) == PG_RW) {
2745
/*
2746
* When PG_M is already clear, PG_RW can be cleared without
2747
* a TLB invalidation.
2748
*/
2749
if (!atomic_cmpset_long(firstpte, htobe64(newpde), htobe64((newpde | RPTE_EAA_R) & ~RPTE_EAA_W)))
2750
goto setpde;
2751
newpde &= ~RPTE_EAA_W;
2752
}
2753
2754
/*
2755
* Examine each of the other PTEs in the specified PTP. Abort if this
2756
* PTE maps an unexpected 4KB physical page or does not have identical
2757
* characteristics to the first PTE.
2758
*/
2759
pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE;
2760
for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
2761
setpte:
2762
oldpte = be64toh(*pte);
2763
if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
2764
CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2765
" in pmap %p", va, pmap);
2766
goto fail;
2767
}
2768
if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
2769
/*
2770
* When PG_M is already clear, PG_RW can be cleared
2771
* without a TLB invalidation.
2772
*/
2773
if (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~RPTE_EAA_W)))
2774
goto setpte;
2775
oldpte &= ~RPTE_EAA_W;
2776
CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx"
2777
" in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) |
2778
(va & ~L3_PAGE_MASK), pmap);
2779
}
2780
if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
2781
CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2782
" in pmap %p", va, pmap);
2783
goto fail;
2784
}
2785
pa -= PAGE_SIZE;
2786
}
2787
2788
/*
2789
* Save the page table page in its current state until the PDE
2790
* mapping the superpage is demoted by pmap_demote_pde() or
2791
* destroyed by pmap_remove_pde().
2792
*/
2793
mpte = PHYS_TO_VM_PAGE(be64toh(*pde) & PG_FRAME);
2794
KASSERT(mpte >= vm_page_array &&
2795
mpte < &vm_page_array[vm_page_array_size],
2796
("pmap_promote_l3e: page table page is out of range"));
2797
KASSERT(mpte->pindex == pmap_l3e_pindex(va),
2798
("pmap_promote_l3e: page table page's pindex is wrong"));
2799
if (pmap_insert_pt_page(pmap, mpte)) {
2800
CTR2(KTR_PMAP,
2801
"pmap_promote_l3e: failure for va %#lx in pmap %p", va,
2802
pmap);
2803
goto fail;
2804
}
2805
2806
/*
2807
* Promote the pv entries.
2808
*/
2809
if ((newpde & PG_MANAGED) != 0)
2810
pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp);
2811
2812
pte_store(pde, PG_PROMOTED | newpde);
2813
ptesync();
2814
counter_u64_add(pmap_l3e_promotions, 1);
2815
CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx"
2816
" in pmap %p", va, pmap);
2817
return (0);
2818
fail:
2819
counter_u64_add(pmap_l3e_p_failures, 1);
2820
return (KERN_FAILURE);
2821
}
2822
#endif /* VM_NRESERVLEVEL > 0 */
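/*
 * Illustrative sketch (not compiled, not used by the pmap): the promotion
 * test in pmap_promote_l3e() above amounts to the predicate below.  The
 * first PTE must map the first 4KB page of a 2MB-aligned physical range,
 * and every following PTE must map the next physical page with identical
 * attribute bits.  "ptes" stands for the 512 PTEs of one page table page;
 * the name l3e_promotable is hypothetical.
 */
#if 0
static bool
l3e_promotable(pt_entry_t *ptes)
{
	pt_entry_t first = be64toh(ptes[0]);
	int i;

	if ((first & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V))
		return (false);
	for (i = 1; i < NPTEPG; i++) {
		pt_entry_t pte = be64toh(ptes[i]);

		if ((pte & (PG_A | PG_V)) != (PG_A | PG_V))
			return (false);
		if ((pte & PG_FRAME) != (first & PG_FRAME) + i * PAGE_SIZE)
			return (false);
		if ((pte & PG_PTE_PROMOTE) != (first & PG_PTE_PROMOTE))
			return (false);
	}
	return (true);
}
#endif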
2823
2824
int
2825
mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
2826
vm_prot_t prot, u_int flags, int8_t psind)
2827
{
2828
struct rwlock *lock;
2829
pml3_entry_t *l3e;
2830
pt_entry_t *pte;
2831
pt_entry_t newpte, origpte;
2832
pv_entry_t pv;
2833
vm_paddr_t opa, pa;
2834
vm_page_t mpte, om;
2835
int rv, retrycount;
2836
bool nosleep, invalidate_all, invalidate_page;
2837
2838
va = trunc_page(va);
2839
retrycount = 0;
2840
invalidate_page = invalidate_all = false;
2841
CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va,
2842
m, prot, flags, psind);
2843
KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
2844
KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va),
2845
("pmap_enter: managed mapping within the clean submap"));
2846
if ((m->oflags & VPO_UNMANAGED) == 0)
2847
VM_PAGE_OBJECT_BUSY_ASSERT(m);
2848
2849
KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
2850
("pmap_enter: flags %u has reserved bits set", flags));
2851
pa = VM_PAGE_TO_PHYS(m);
2852
newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF);
2853
if ((flags & VM_PROT_WRITE) != 0)
2854
newpte |= PG_M;
2855
if ((flags & VM_PROT_READ) != 0)
2856
newpte |= PG_A;
2857
if (prot & VM_PROT_READ)
2858
newpte |= RPTE_EAA_R;
2859
if ((prot & VM_PROT_WRITE) != 0)
2860
newpte |= RPTE_EAA_W;
2861
KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
2862
("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
2863
2864
if (prot & VM_PROT_EXECUTE)
2865
newpte |= PG_X;
2866
if ((flags & PMAP_ENTER_WIRED) != 0)
2867
newpte |= PG_W;
2868
if (va >= DMAP_MIN_ADDRESS)
2869
newpte |= RPTE_EAA_P;
2870
newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs);
2871
/*
2872
* Set modified bit gratuitously for writeable mappings if
2873
* the page is unmanaged. We do not want to take a fault
2874
* to do the dirty bit accounting for these mappings.
2875
*/
2876
if ((m->oflags & VPO_UNMANAGED) != 0) {
2877
if ((newpte & PG_RW) != 0)
2878
newpte |= PG_M;
2879
} else
2880
newpte |= PG_MANAGED;
2881
2882
lock = NULL;
2883
PMAP_LOCK(pmap);
2884
if (psind == 1) {
2885
/* Assert the required virtual and physical alignment. */
2886
KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned"));
2887
KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
2888
rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock);
2889
goto out;
2890
}
2891
mpte = NULL;
2892
2893
/*
2894
* In the case that a page table page is not
2895
* resident, we are creating it here.
2896
*/
2897
retry:
2898
l3e = pmap_pml3e(pmap, va);
2899
if (l3e != NULL && (be64toh(*l3e) & PG_V) != 0 && ((be64toh(*l3e) & RPTE_LEAF) == 0 ||
2900
pmap_demote_l3e_locked(pmap, l3e, va, &lock))) {
2901
pte = pmap_l3e_to_pte(l3e, va);
2902
if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
2903
mpte = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
2904
mpte->ref_count++;
2905
}
2906
} else if (va < VM_MAXUSER_ADDRESS) {
2907
/*
2908
* Here if the pte page isn't mapped, or if it has been
2909
* deallocated.
2910
*/
2911
nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
2912
mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va),
2913
nosleep ? NULL : &lock);
2914
if (mpte == NULL && nosleep) {
2915
rv = KERN_RESOURCE_SHORTAGE;
2916
goto out;
2917
}
2918
if (__predict_false(retrycount++ == 6))
2919
panic("too many retries");
2920
invalidate_all = true;
2921
goto retry;
2922
} else
2923
panic("pmap_enter: invalid page directory va=%#lx", va);
2924
2925
origpte = be64toh(*pte);
2926
pv = NULL;
2927
2928
/*
2929
* Is the specified virtual address already mapped?
2930
*/
2931
if ((origpte & PG_V) != 0) {
2932
#ifdef INVARIANTS
2933
if (VERBOSE_PMAP || pmap_logging) {
2934
printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --"
2935
" asid=%lu curpid=%d name=%s origpte0x%lx\n",
2936
pmap, va, m, prot, flags, psind, pmap->pm_pid,
2937
curproc->p_pid, curproc->p_comm, origpte);
2938
#ifdef DDB
2939
pmap_pte_walk(pmap->pm_pml1, va);
2940
#endif
2941
}
2942
#endif
2943
/*
2944
* Wiring change, just update stats. We don't worry about
2945
* wiring PT pages as they remain resident as long as there
2946
* are valid mappings in them. Hence, if a user page is wired,
2947
* the PT page will be also.
2948
*/
2949
if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
2950
pmap->pm_stats.wired_count++;
2951
else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
2952
pmap->pm_stats.wired_count--;
2953
2954
/*
2955
* Remove the extra PT page reference.
2956
*/
2957
if (mpte != NULL) {
2958
mpte->ref_count--;
2959
KASSERT(mpte->ref_count > 0,
2960
("pmap_enter: missing reference to page table page,"
2961
" va: 0x%lx", va));
2962
}
2963
2964
/*
2965
* Has the physical page changed?
2966
*/
2967
opa = origpte & PG_FRAME;
2968
if (opa == pa) {
2969
/*
2970
* No, might be a protection or wiring change.
2971
*/
2972
if ((origpte & PG_MANAGED) != 0 &&
2973
(newpte & PG_RW) != 0)
2974
vm_page_aflag_set(m, PGA_WRITEABLE);
2975
if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) {
2976
if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) {
2977
if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
2978
goto retry;
2979
if ((newpte & PG_M) != (origpte & PG_M))
2980
vm_page_dirty(m);
2981
if ((newpte & PG_A) != (origpte & PG_A))
2982
vm_page_aflag_set(m, PGA_REFERENCED);
2983
ptesync();
2984
} else
2985
invalidate_all = true;
2986
if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
2987
goto unchanged;
2988
}
2989
goto validate;
2990
}
2991
2992
/*
2993
* The physical page has changed. Temporarily invalidate
2994
* the mapping. This ensures that all threads sharing the
2995
* pmap keep a consistent view of the mapping, which is
2996
* necessary for the correct handling of COW faults. It
2997
* also permits reuse of the old mapping's PV entry,
2998
* avoiding an allocation.
2999
*
3000
* For consistency, handle unmanaged mappings the same way.
3001
*/
3002
origpte = be64toh(pte_load_clear(pte));
3003
KASSERT((origpte & PG_FRAME) == opa,
3004
("pmap_enter: unexpected pa update for %#lx", va));
3005
if ((origpte & PG_MANAGED) != 0) {
3006
om = PHYS_TO_VM_PAGE(opa);
3007
3008
/*
3009
* The pmap lock is sufficient to synchronize with
3010
* concurrent calls to pmap_page_test_mappings() and
3011
* pmap_ts_referenced().
3012
*/
3013
if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3014
vm_page_dirty(om);
3015
if ((origpte & PG_A) != 0)
3016
vm_page_aflag_set(om, PGA_REFERENCED);
3017
CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3018
pv = pmap_pvh_remove(&om->md, pmap, va);
3019
if ((newpte & PG_MANAGED) == 0)
3020
free_pv_entry(pmap, pv);
3021
#ifdef INVARIANTS
3022
else if (origpte & PG_MANAGED) {
3023
if (pv == NULL) {
3024
#ifdef DDB
3025
pmap_page_print_mappings(om);
3026
#endif
3027
MPASS(pv != NULL);
3028
}
3029
}
3030
#endif
3031
if ((om->a.flags & PGA_WRITEABLE) != 0 &&
3032
TAILQ_EMPTY(&om->md.pv_list) &&
3033
((om->flags & PG_FICTITIOUS) != 0 ||
3034
TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3035
vm_page_aflag_clear(om, PGA_WRITEABLE);
3036
}
3037
if ((origpte & PG_A) != 0)
3038
invalidate_page = true;
3039
origpte = 0;
3040
} else {
3041
if (pmap != kernel_pmap) {
3042
#ifdef INVARIANTS
3043
if (VERBOSE_PMAP || pmap_logging)
3044
printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n",
3045
pmap, va, m, prot, flags, psind,
3046
pmap->pm_pid, curproc->p_pid,
3047
curproc->p_comm);
3048
#endif
3049
}
3050
3051
/*
3052
* Increment the counters.
3053
*/
3054
if ((newpte & PG_W) != 0)
3055
pmap->pm_stats.wired_count++;
3056
pmap_resident_count_inc(pmap, 1);
3057
}
3058
3059
/*
3060
* Enter on the PV list if part of our managed memory.
3061
*/
3062
if ((newpte & PG_MANAGED) != 0) {
3063
if (pv == NULL) {
3064
pv = get_pv_entry(pmap, &lock);
3065
pv->pv_va = va;
3066
}
3067
#ifdef VERBOSE_PV
3068
else
3069
printf("reassigning pv: %p to pmap: %p\n",
3070
pv, pmap);
3071
#endif
3072
CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3073
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
3074
m->md.pv_gen++;
3075
if ((newpte & PG_RW) != 0)
3076
vm_page_aflag_set(m, PGA_WRITEABLE);
3077
}
3078
3079
/*
3080
* Update the PTE.
3081
*/
3082
if ((origpte & PG_V) != 0) {
3083
validate:
3084
origpte = be64toh(pte_load_store(pte, htobe64(newpte)));
3085
KASSERT((origpte & PG_FRAME) == pa,
3086
("pmap_enter: unexpected pa update for %#lx", va));
3087
if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
3088
(PG_M | PG_RW)) {
3089
if ((origpte & PG_MANAGED) != 0)
3090
vm_page_dirty(m);
3091
invalidate_page = true;
3092
3093
/*
3094
* Although the PTE may still have PG_RW set, TLB
3095
* invalidation may nonetheless be required because
3096
* the PTE no longer has PG_M set.
3097
*/
3098
} else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) {
3099
/*
3100
* Removing capabilities requires invalidation on POWER
3101
*/
3102
invalidate_page = true;
3103
goto unchanged;
3104
}
3105
if ((origpte & PG_A) != 0)
3106
invalidate_page = true;
3107
} else {
3108
pte_store(pte, newpte);
3109
ptesync();
3110
}
3111
unchanged:
3112
3113
#if VM_NRESERVLEVEL > 0
3114
/*
3115
* If both the page table page and the reservation are fully
3116
* populated, then attempt promotion.
3117
*/
3118
if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
3119
mmu_radix_ps_enabled(pmap) &&
3120
(m->flags & PG_FICTITIOUS) == 0 &&
3121
vm_reserv_level_iffullpop(m) == 0 &&
3122
pmap_promote_l3e(pmap, l3e, va, &lock) == 0)
3123
invalidate_all = true;
3124
#endif
3125
if (invalidate_all)
3126
pmap_invalidate_all(pmap);
3127
else if (invalidate_page)
3128
pmap_invalidate_page(pmap, va);
3129
3130
rv = KERN_SUCCESS;
3131
out:
3132
if (lock != NULL)
3133
rw_wunlock(lock);
3134
PMAP_UNLOCK(pmap);
3135
3136
return (rv);
3137
}
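/*
 * Illustrative sketch (not compiled): the PTE built at the top of
 * mmu_radix_enter() is the page's physical address "pa" plus permission and
 * status bits derived from "prot" and "flags".  As an invented example, a
 * wired, writeable, non-executable managed mapping would be composed roughly
 * as follows.
 */
#if 0
	pt_entry_t example;

	example = pa | PG_V | RPTE_LEAF | PG_A |	/* valid 4KB leaf */
	    RPTE_EAA_R | RPTE_EAA_W | PG_M |		/* read/write, pre-dirtied */
	    PG_W | PG_MANAGED;				/* wired, on the pv lists */
#endif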
3138
3139
/*
 * Release a page table page reference after a failed attempt to create a
 * mapping.
 */
static void
pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t pdpg)
{
        struct spglist free;

        SLIST_INIT(&free);
        if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
                /*
                 * Although "va" is not mapped, paging-
                 * structure caches could nonetheless have
                 * entries that refer to the freed page table
                 * pages. Invalidate those entries.
                 */
                pmap_invalidate_page(pmap, va);
                vm_page_free_pages_toq(&free, true);
        }
}
3160
3161
/*
 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true
 * if successful. Returns false if (1) a page table page cannot be allocated
 * without sleeping, (2) a mapping already exists at the specified virtual
 * address, or (3) a PV entry cannot be allocated without reclaiming another
 * PV entry.
 */
static bool
pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    struct rwlock **lockp)
{
        pml3_entry_t newpde;

        PMAP_LOCK_ASSERT(pmap, MA_OWNED);
        newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) |
            RPTE_LEAF | PG_V;
        if ((m->oflags & VPO_UNMANAGED) == 0)
                newpde |= PG_MANAGED;
        if (prot & VM_PROT_EXECUTE)
                newpde |= PG_X;
        if (prot & VM_PROT_READ)
                newpde |= RPTE_EAA_R;
        if (va >= DMAP_MIN_ADDRESS)
                newpde |= RPTE_EAA_P;
        return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
            PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
            KERN_SUCCESS);
}
3189
3190
/*
3191
* Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if
3192
* the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
3193
* otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
3194
* a mapping already exists at the specified virtual address. Returns
3195
* KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
3196
* page allocation failed. Returns KERN_RESOURCE_SHORTAGE if
3197
* PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
3198
*
3199
* The parameter "m" is only used when creating a managed, writeable mapping.
3200
*/
3201
static int
3202
pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags,
3203
vm_page_t m, struct rwlock **lockp)
3204
{
3205
struct spglist free;
3206
pml3_entry_t oldl3e, *l3e;
3207
vm_page_t mt, pdpg;
3208
vm_page_t uwptpg;
3209
3210
KASSERT((newpde & (PG_M | PG_RW)) != PG_RW,
3211
("pmap_enter_pde: newpde is missing PG_M"));
3212
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3213
3214
if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
3215
NULL : lockp)) == NULL) {
3216
CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3217
" in pmap %p", va, pmap);
3218
return (KERN_RESOURCE_SHORTAGE);
3219
}
3220
l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
3221
l3e = &l3e[pmap_pml3e_index(va)];
3222
oldl3e = be64toh(*l3e);
3223
if ((oldl3e & PG_V) != 0) {
3224
KASSERT(pdpg->ref_count > 1,
3225
("pmap_enter_pde: pdpg's wire count is too low"));
3226
if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
3227
pdpg->ref_count--;
3228
CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3229
" in pmap %p", va, pmap);
3230
return (KERN_FAILURE);
3231
}
3232
/* Break the existing mapping(s). */
3233
SLIST_INIT(&free);
3234
if ((oldl3e & RPTE_LEAF) != 0) {
3235
/*
3236
* The reference to the PD page that was acquired by
3237
* pmap_allocl3e() ensures that it won't be freed.
3238
* However, if the PDE resulted from a promotion, then
3239
* a reserved PT page could be freed.
3240
*/
3241
(void)pmap_remove_l3e(pmap, l3e, va, &free, lockp);
3242
pmap_invalidate_l3e_page(pmap, va, oldl3e);
3243
} else {
3244
if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e,
3245
&free, lockp))
3246
pmap_invalidate_all(pmap);
3247
}
3248
vm_page_free_pages_toq(&free, true);
3249
if (va >= VM_MAXUSER_ADDRESS) {
3250
mt = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
3251
if (pmap_insert_pt_page(pmap, mt)) {
3252
/*
3253
* XXX Currently, this can't happen because
3254
* we do not perform pmap_enter(psind == 1)
3255
* on the kernel pmap.
3256
*/
3257
panic("pmap_enter_pde: trie insert failed");
3258
}
3259
} else
3260
KASSERT(be64toh(*l3e) == 0, ("pmap_enter_pde: non-zero pde %p",
3261
l3e));
3262
}
3263
3264
/*
3265
* Allocate leaf ptpage for wired userspace pages.
3266
*/
3267
uwptpg = NULL;
3268
if ((newpde & PG_W) != 0 && pmap != kernel_pmap) {
3269
uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3270
if (uwptpg == NULL) {
3271
pmap_abort_ptp(pmap, va, pdpg);
3272
return (KERN_RESOURCE_SHORTAGE);
3273
}
3274
uwptpg->pindex = pmap_l3e_pindex(va);
3275
if (pmap_insert_pt_page(pmap, uwptpg)) {
3276
vm_page_unwire_noq(uwptpg);
3277
vm_page_free(uwptpg);
3278
pmap_abort_ptp(pmap, va, pdpg);
3279
return (KERN_RESOURCE_SHORTAGE);
3280
}
3281
pmap_resident_count_inc(pmap, 1);
3282
uwptpg->ref_count = NPTEPG;
3283
pmap_fill_ptp((pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(uwptpg)),
3284
newpde);
3285
}
3286
if ((newpde & PG_MANAGED) != 0) {
3287
/*
3288
* Abort this mapping if its PV entry could not be created.
3289
*/
3290
if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) {
3291
pmap_abort_ptp(pmap, va, pdpg);
3292
if (uwptpg != NULL) {
3293
mt = pmap_remove_pt_page(pmap, va);
3294
KASSERT(mt == uwptpg,
3295
("removed pt page %p, expected %p", mt,
3296
uwptpg));
3297
pmap_resident_count_dec(pmap, 1);
3298
uwptpg->ref_count = 1;
3299
vm_page_unwire_noq(uwptpg);
3300
vm_page_free(uwptpg);
3301
}
3302
CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3303
" in pmap %p", va, pmap);
3304
return (KERN_RESOURCE_SHORTAGE);
3305
}
3306
if ((newpde & PG_RW) != 0) {
3307
for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
3308
vm_page_aflag_set(mt, PGA_WRITEABLE);
3309
}
3310
}
3311
3312
/*
3313
* Increment counters.
3314
*/
3315
if ((newpde & PG_W) != 0)
3316
pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE;
3317
pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
3318
3319
/*
3320
* Map the superpage. (This is not a promoted mapping; there will not
3321
* be any lingering 4KB page mappings in the TLB.)
3322
*/
3323
pte_store(l3e, newpde);
3324
ptesync();
3325
3326
counter_u64_add(pmap_l3e_mappings, 1);
3327
CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3328
" in pmap %p", va, pmap);
3329
return (KERN_SUCCESS);
3330
}
3331
3332
void
3333
mmu_radix_enter_object(pmap_t pmap, vm_offset_t start,
3334
vm_offset_t end, vm_page_t m_start, vm_prot_t prot)
3335
{
3336
struct pctrie_iter pages;
3337
struct rwlock *lock;
3338
vm_offset_t va;
3339
vm_page_t m, mpte;
3340
bool invalidate;
3341
3342
VM_OBJECT_ASSERT_LOCKED(m_start->object);
3343
3344
CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start,
3345
end, m_start, prot);
3346
invalidate = false;
3347
mpte = NULL;
3348
vm_page_iter_limit_init(&pages, m_start->object,
3349
m_start->pindex + atop(end - start));
3350
m = vm_radix_iter_lookup(&pages, m_start->pindex);
3351
lock = NULL;
3352
PMAP_LOCK(pmap);
3353
while (m != NULL) {
3354
va = start + ptoa(m->pindex - m_start->pindex);
3355
if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end &&
3356
m->psind == 1 && mmu_radix_ps_enabled(pmap) &&
3357
pmap_enter_2mpage(pmap, va, m, prot, &lock)) {
3358
m = vm_radix_iter_jump(&pages, L3_PAGE_SIZE / PAGE_SIZE);
3359
} else {
3360
mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot,
3361
mpte, &lock, &invalidate);
3362
m = vm_radix_iter_step(&pages);
3363
}
3364
}
3365
ptesync();
3366
if (lock != NULL)
3367
rw_wunlock(lock);
3368
if (invalidate)
3369
pmap_invalidate_all(pmap);
3370
PMAP_UNLOCK(pmap);
3371
}
3372
3373
static vm_page_t
3374
mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3375
vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate)
3376
{
3377
struct spglist free;
3378
pt_entry_t *pte;
3379
vm_paddr_t pa;
3380
3381
KASSERT(!VA_IS_CLEANMAP(va) ||
3382
(m->oflags & VPO_UNMANAGED) != 0,
3383
("mmu_radix_enter_quick_locked: managed mapping within the clean submap"));
3384
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3385
3386
/*
3387
* In the case that a page table page is not
3388
* resident, we are creating it here.
3389
*/
3390
if (va < VM_MAXUSER_ADDRESS) {
3391
vm_pindex_t ptepindex;
3392
pml3_entry_t *ptepa;
3393
3394
/*
3395
* Calculate pagetable page index
3396
*/
3397
ptepindex = pmap_l3e_pindex(va);
3398
if (mpte && (mpte->pindex == ptepindex)) {
3399
mpte->ref_count++;
3400
} else {
3401
/*
3402
* Get the page directory entry
3403
*/
3404
ptepa = pmap_pml3e(pmap, va);
3405
3406
/*
3407
* If the page table page is mapped, we just increment
3408
* the hold count, and activate it. Otherwise, we
3409
* attempt to allocate a page table page. If this
3410
* attempt fails, we don't retry. Instead, we give up.
3411
*/
3412
if (ptepa && (be64toh(*ptepa) & PG_V) != 0) {
3413
if (be64toh(*ptepa) & RPTE_LEAF)
3414
return (NULL);
3415
mpte = PHYS_TO_VM_PAGE(be64toh(*ptepa) & PG_FRAME);
3416
mpte->ref_count++;
3417
} else {
3418
/*
3419
* Pass NULL instead of the PV list lock
3420
* pointer, because we don't intend to sleep.
3421
*/
3422
mpte = _pmap_allocpte(pmap, ptepindex, NULL);
3423
if (mpte == NULL)
3424
return (mpte);
3425
}
3426
}
3427
pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3428
pte = &pte[pmap_pte_index(va)];
3429
} else {
3430
mpte = NULL;
3431
pte = pmap_pte(pmap, va);
3432
}
3433
if (be64toh(*pte)) {
3434
if (mpte != NULL) {
3435
mpte->ref_count--;
3436
mpte = NULL;
3437
}
3438
return (mpte);
3439
}
3440
3441
/*
3442
* Enter on the PV list if part of our managed memory.
3443
*/
3444
if ((m->oflags & VPO_UNMANAGED) == 0 &&
3445
!pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3446
if (mpte != NULL) {
3447
SLIST_INIT(&free);
3448
if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
3449
/*
3450
* Although "va" is not mapped, paging-
3451
* structure caches could nonetheless have
3452
* entries that refer to the freed page table
3453
* pages. Invalidate those entries.
3454
*/
3455
*invalidate = true;
3456
vm_page_free_pages_toq(&free, true);
3457
}
3458
mpte = NULL;
3459
}
3460
return (mpte);
3461
}
3462
3463
/*
3464
* Increment counters
3465
*/
3466
pmap_resident_count_inc(pmap, 1);
3467
3468
pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs);
3469
if (prot & VM_PROT_EXECUTE)
3470
pa |= PG_X;
3471
else
3472
pa |= RPTE_EAA_R;
3473
if ((m->oflags & VPO_UNMANAGED) == 0)
3474
pa |= PG_MANAGED;
3475
3476
pte_store(pte, pa);
3477
return (mpte);
3478
}
3479
3480
void
3481
mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
3482
vm_prot_t prot)
3483
{
3484
struct rwlock *lock;
3485
bool invalidate;
3486
3487
lock = NULL;
3488
invalidate = false;
3489
PMAP_LOCK(pmap);
3490
mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock,
3491
&invalidate);
3492
ptesync();
3493
if (lock != NULL)
3494
rw_wunlock(lock);
3495
if (invalidate)
3496
pmap_invalidate_all(pmap);
3497
PMAP_UNLOCK(pmap);
3498
}
3499
3500
vm_paddr_t
3501
mmu_radix_extract(pmap_t pmap, vm_offset_t va)
3502
{
3503
pml3_entry_t *l3e;
3504
pt_entry_t *pte;
3505
vm_paddr_t pa;
3506
3507
l3e = pmap_pml3e(pmap, va);
3508
if (__predict_false(l3e == NULL))
3509
return (0);
3510
if (be64toh(*l3e) & RPTE_LEAF) {
3511
pa = (be64toh(*l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK);
3512
pa |= (va & L3_PAGE_MASK);
3513
} else {
3514
/*
3515
* Beware of a concurrent promotion that changes the
3516
* PDE at this point! For example, vtopte() must not
3517
* be used to access the PTE because it would use the
3518
* new PDE. It is, however, safe to use the old PDE
3519
* because the page table page is preserved by the
3520
* promotion.
3521
*/
3522
pte = pmap_l3e_to_pte(l3e, va);
3523
if (__predict_false(pte == NULL))
3524
return (0);
3525
pa = be64toh(*pte);
3526
pa = (pa & PG_FRAME) | (va & PAGE_MASK);
3527
pa |= (va & PAGE_MASK);
3528
}
3529
return (pa);
3530
}
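/*
 * Illustrative note: the two cases in mmu_radix_extract() above differ only
 * in which frame mask and which offset-within-page mask apply.  For a 2MB
 * leaf the physical address is
 *	(l3e & PG_PS_FRAME) | (va & L3_PAGE_MASK)
 * and for a 4KB PTE it is
 *	(pte & PG_FRAME) | (va & PAGE_MASK).
 */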
3531
3532
vm_page_t
3533
mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
3534
{
3535
pml3_entry_t l3e, *l3ep;
3536
pt_entry_t pte;
3537
vm_page_t m;
3538
3539
m = NULL;
3540
CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot);
3541
PMAP_LOCK(pmap);
3542
l3ep = pmap_pml3e(pmap, va);
3543
if (l3ep != NULL && (l3e = be64toh(*l3ep))) {
3544
if (l3e & RPTE_LEAF) {
3545
if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0)
3546
m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) |
3547
(va & L3_PAGE_MASK));
3548
} else {
3549
/* Native endian PTE, do not pass to pmap functions */
3550
pte = be64toh(*pmap_l3e_to_pte(l3ep, va));
3551
if ((pte & PG_V) &&
3552
((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0))
3553
m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
3554
}
3555
if (m != NULL && !vm_page_wire_mapped(m))
3556
m = NULL;
3557
}
3558
PMAP_UNLOCK(pmap);
3559
return (m);
3560
}
3561
3562
static int
3563
mmu_radix_growkernel(vm_offset_t addr)
3564
{
3565
vm_paddr_t paddr;
3566
vm_page_t nkpg;
3567
pml3_entry_t *l3e;
3568
pml2_entry_t *l2e;
3569
3570
CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
3571
if (VM_MIN_KERNEL_ADDRESS < addr &&
3572
addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE))
3573
return (KERN_SUCCESS);
3574
3575
addr = roundup2(addr, L3_PAGE_SIZE);
3576
if (addr - 1 >= vm_map_max(kernel_map))
3577
addr = vm_map_max(kernel_map);
3578
while (kernel_vm_end < addr) {
3579
l2e = pmap_pml2e(kernel_pmap, kernel_vm_end);
3580
if ((be64toh(*l2e) & PG_V) == 0) {
3581
/* We need a new PDP entry */
3582
nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
3583
VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3584
if (nkpg == NULL)
3585
return (KERN_RESOURCE_SHORTAGE);
3586
nkpg->pindex = kernel_vm_end >> L2_PAGE_SIZE_SHIFT;
3587
paddr = VM_PAGE_TO_PHYS(nkpg);
3588
pde_store(l2e, paddr);
3589
continue; /* try again */
3590
}
3591
l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end);
3592
if ((be64toh(*l3e) & PG_V) != 0) {
3593
kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
3594
if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3595
kernel_vm_end = vm_map_max(kernel_map);
3596
break;
3597
}
3598
continue;
3599
}
3600
3601
nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
3602
VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3603
if (nkpg == NULL)
3604
return (KERN_RESOURCE_SHORTAGE);
3605
nkpg->pindex = pmap_l3e_pindex(kernel_vm_end);
3606
paddr = VM_PAGE_TO_PHYS(nkpg);
3607
pde_store(l3e, paddr);
3608
3609
kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
3610
if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3611
kernel_vm_end = vm_map_max(kernel_map);
3612
break;
3613
}
3614
}
3615
ptesync();
3616
return (KERN_SUCCESS);
3617
}
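/*
 * Illustrative worked example (invented address): kernel KVA is grown in
 * whole 2MB steps, so the target address is first rounded up with
 * roundup2().  With L3_PAGE_SIZE == 0x200000, roundup2(0x1f3000,
 * L3_PAGE_SIZE) yields 0x200000; an address already on a 2MB boundary is
 * returned unchanged.
 */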
3618
3619
static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory");
3620
static uma_zone_t zone_radix_pgd;
3621
3622
static int
3623
radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused,
3624
int flags)
3625
{
3626
int req;
3627
3628
req = VM_ALLOC_WIRED | malloc2vm_flags(flags);
3629
for (int i = 0; i < count; i++) {
3630
vm_page_t m = vm_page_alloc_noobj_contig(req,
3631
RADIX_PGD_SIZE / PAGE_SIZE,
3632
0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE,
3633
VM_MEMATTR_DEFAULT);
3634
store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3635
}
3636
return (count);
3637
}
3638
3639
static void
3640
radix_pgd_release(void *arg __unused, void **store, int count)
3641
{
3642
vm_page_t m;
3643
struct spglist free;
3644
int page_count;
3645
3646
SLIST_INIT(&free);
3647
page_count = RADIX_PGD_SIZE/PAGE_SIZE;
3648
3649
for (int i = 0; i < count; i++) {
3650
/*
3651
* XXX selectively remove dmap and KVA entries so we don't
3652
* need to bzero
3653
*/
3654
m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i]));
3655
for (int j = page_count-1; j >= 0; j--) {
3656
vm_page_unwire_noq(&m[j]);
3657
SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss);
3658
}
3659
vm_page_free_pages_toq(&free, false);
3660
}
3661
}
3662
3663
static void
3664
mmu_radix_init(void)
3665
{
3666
vm_page_t mpte;
3667
vm_size_t s;
3668
int error, i, pv_npg;
3669
3670
/* XXX is this really needed for POWER? */
3671
/* L1TF, reserve page @0 unconditionally */
3672
vm_page_blacklist_add(0, bootverbose);
3673
3674
zone_radix_pgd = uma_zcache_create("radix_pgd_cache",
3675
RADIX_PGD_SIZE, NULL, NULL,
3676
#ifdef INVARIANTS
3677
trash_init, trash_fini,
3678
#else
3679
NULL, NULL,
3680
#endif
3681
radix_pgd_import, radix_pgd_release,
3682
NULL, UMA_ZONE_NOBUCKET);
3683
3684
/*
3685
* Initialize the vm page array entries for the kernel pmap's
3686
* page table pages.
3687
*/
3688
PMAP_LOCK(kernel_pmap);
3689
for (i = 0; i < nkpt; i++) {
3690
mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
3691
KASSERT(mpte >= vm_page_array &&
3692
mpte < &vm_page_array[vm_page_array_size],
3693
("pmap_init: page table page is out of range size: %lu",
3694
vm_page_array_size));
3695
mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i;
3696
mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
3697
MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte);
3698
//pmap_insert_pt_page(kernel_pmap, mpte);
3699
mpte->ref_count = 1;
3700
}
3701
PMAP_UNLOCK(kernel_pmap);
3702
vm_wire_add(nkpt);
3703
3704
CTR1(KTR_PMAP, "%s()", __func__);
3705
TAILQ_INIT(&pv_dummy.pv_list);
3706
3707
/*
3708
* Are large page mappings enabled?
3709
*/
3710
TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
3711
if (superpages_enabled) {
3712
KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
3713
("pmap_init: can't assign to pagesizes[1]"));
3714
pagesizes[1] = L3_PAGE_SIZE;
3715
}
3716
3717
/*
3718
* Initialize the pv chunk list mutex.
3719
*/
3720
mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
3721
3722
/*
3723
* Initialize the pool of pv list locks.
3724
*/
3725
for (i = 0; i < NPV_LIST_LOCKS; i++)
3726
rw_init(&pv_list_locks[i], "pmap pv list");
3727
3728
/*
3729
* Calculate the size of the pv head table for superpages.
3730
*/
3731
pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE);
3732
3733
/*
3734
* Allocate memory for the pv head table for superpages.
3735
*/
3736
s = (vm_size_t)(pv_npg * sizeof(struct md_page));
3737
s = round_page(s);
3738
pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
3739
for (i = 0; i < pv_npg; i++)
3740
TAILQ_INIT(&pv_table[i].pv_list);
3741
TAILQ_INIT(&pv_dummy.pv_list);
3742
3743
pmap_initialized = 1;
3744
mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
3745
error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
3746
(vmem_addr_t *)&qframe);
3747
3748
if (error != 0)
3749
panic("qframe allocation failed");
3750
asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<<isa3_pid_bits),
3751
1, 1, M_WAITOK);
3752
}
3753
3754
static bool
3755
pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
3756
{
3757
struct rwlock *lock;
3758
pv_entry_t pv;
3759
struct md_page *pvh;
3760
pt_entry_t *pte, mask;
3761
pmap_t pmap;
3762
int md_gen, pvh_gen;
3763
bool rv;
3764
3765
rv = false;
3766
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3767
rw_rlock(lock);
3768
restart:
3769
TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
3770
pmap = PV_PMAP(pv);
3771
if (!PMAP_TRYLOCK(pmap)) {
3772
md_gen = m->md.pv_gen;
3773
rw_runlock(lock);
3774
PMAP_LOCK(pmap);
3775
rw_rlock(lock);
3776
if (md_gen != m->md.pv_gen) {
3777
PMAP_UNLOCK(pmap);
3778
goto restart;
3779
}
3780
}
3781
pte = pmap_pte(pmap, pv->pv_va);
3782
mask = 0;
3783
if (modified)
3784
mask |= PG_RW | PG_M;
3785
if (accessed)
3786
mask |= PG_V | PG_A;
3787
rv = (be64toh(*pte) & mask) == mask;
3788
PMAP_UNLOCK(pmap);
3789
if (rv)
3790
goto out;
3791
}
3792
if ((m->flags & PG_FICTITIOUS) == 0) {
3793
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3794
TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
3795
pmap = PV_PMAP(pv);
3796
if (!PMAP_TRYLOCK(pmap)) {
3797
md_gen = m->md.pv_gen;
3798
pvh_gen = pvh->pv_gen;
3799
rw_runlock(lock);
3800
PMAP_LOCK(pmap);
3801
rw_rlock(lock);
3802
if (md_gen != m->md.pv_gen ||
3803
pvh_gen != pvh->pv_gen) {
3804
PMAP_UNLOCK(pmap);
3805
goto restart;
3806
}
3807
}
3808
pte = pmap_pml3e(pmap, pv->pv_va);
3809
mask = 0;
3810
if (modified)
3811
mask |= PG_RW | PG_M;
3812
if (accessed)
3813
mask |= PG_V | PG_A;
3814
rv = (be64toh(*pte) & mask) == mask;
3815
PMAP_UNLOCK(pmap);
3816
if (rv)
3817
goto out;
3818
}
3819
}
3820
out:
3821
rw_runlock(lock);
3822
return (rv);
3823
}
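/*
 * Illustrative sketch, not part of the build: the bit-mask test used by
 * pmap_page_test_mappings() above.  A mapping counts as modified only
 * when PG_RW and PG_M are both set, and as accessed only when PG_V and
 * PG_A are both set, so a mask is built and every bit in it must be
 * present.  The PG_*_SKETCH values are stand-ins, not the RPTE_* bits.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

#define	PG_V_SKETCH	0x1UL
#define	PG_A_SKETCH	0x2UL
#define	PG_RW_SKETCH	0x4UL
#define	PG_M_SKETCH	0x8UL

static bool
pte_test_sketch(uint64_t pte, bool accessed, bool modified)
{
	uint64_t mask = 0;

	if (modified)
		mask |= PG_RW_SKETCH | PG_M_SKETCH;
	if (accessed)
		mask |= PG_V_SKETCH | PG_A_SKETCH;
	/* True only if *all* of the requested bits are set. */
	return ((pte & mask) == mask);
}
#endif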
3824
3825
/*
3826
* pmap_is_modified:
3827
*
3828
* Return whether or not the specified physical page was modified
3829
* in any physical maps.
3830
*/
3831
bool
3832
mmu_radix_is_modified(vm_page_t m)
3833
{
3834
3835
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3836
("pmap_is_modified: page %p is not managed", m));
3837
3838
CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3839
/*
3840
* If the page is not busied then this check is racy.
3841
*/
3842
if (!pmap_page_is_write_mapped(m))
3843
return (false);
3844
return (pmap_page_test_mappings(m, false, true));
3845
}
3846
3847
bool
3848
mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3849
{
3850
pml3_entry_t *l3e;
3851
pt_entry_t *pte;
3852
bool rv;
3853
3854
CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
3855
rv = false;
3856
PMAP_LOCK(pmap);
3857
l3e = pmap_pml3e(pmap, addr);
3858
if (l3e != NULL && (be64toh(*l3e) & (RPTE_LEAF | PG_V)) == PG_V) {
3859
pte = pmap_l3e_to_pte(l3e, addr);
3860
rv = (be64toh(*pte) & PG_V) == 0;
3861
}
3862
PMAP_UNLOCK(pmap);
3863
return (rv);
3864
}
3865
3866
bool
3867
mmu_radix_is_referenced(vm_page_t m)
3868
{
3869
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3870
("pmap_is_referenced: page %p is not managed", m));
3871
CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3872
return (pmap_page_test_mappings(m, true, false));
3873
}
3874
3875
/*
3876
* pmap_ts_referenced:
3877
*
3878
* Return a count of reference bits for a page, clearing those bits.
3879
* It is not necessary for every reference bit to be cleared, but it
3880
* is necessary that 0 only be returned when there are truly no
3881
* reference bits set.
3882
*
3883
* As an optimization, update the page's dirty field if a modified bit is
3884
* found while counting reference bits. This opportunistic update can be
3885
* performed at low cost and can eliminate the need for some future calls
3886
* to pmap_is_modified(). However, since this function stops after
3887
* finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
3888
* dirty pages. Those dirty pages will only be detected by a future call
3889
* to pmap_is_modified().
3890
*
3891
* A DI block is not needed within this function, because
3892
* invalidations are performed before the PV list lock is
3893
* released.
3894
*/
3895
int
3896
mmu_radix_ts_referenced(vm_page_t m)
3897
{
3898
struct md_page *pvh;
3899
pv_entry_t pv, pvf;
3900
pmap_t pmap;
3901
struct rwlock *lock;
3902
pml3_entry_t oldl3e, *l3e;
3903
pt_entry_t *pte;
3904
vm_paddr_t pa;
3905
int cleared, md_gen, not_cleared, pvh_gen;
3906
struct spglist free;
3907
3908
CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3909
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3910
("pmap_ts_referenced: page %p is not managed", m));
3911
SLIST_INIT(&free);
3912
cleared = 0;
3913
pa = VM_PAGE_TO_PHYS(m);
3914
lock = PHYS_TO_PV_LIST_LOCK(pa);
3915
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
3916
rw_wlock(lock);
3917
retry:
3918
not_cleared = 0;
3919
if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
3920
goto small_mappings;
3921
pv = pvf;
3922
do {
3923
if (pvf == NULL)
3924
pvf = pv;
3925
pmap = PV_PMAP(pv);
3926
if (!PMAP_TRYLOCK(pmap)) {
3927
pvh_gen = pvh->pv_gen;
3928
rw_wunlock(lock);
3929
PMAP_LOCK(pmap);
3930
rw_wlock(lock);
3931
if (pvh_gen != pvh->pv_gen) {
3932
PMAP_UNLOCK(pmap);
3933
goto retry;
3934
}
3935
}
3936
l3e = pmap_pml3e(pmap, pv->pv_va);
3937
oldl3e = be64toh(*l3e);
3938
if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3939
/*
3940
* Although "oldpde" is mapping a 2MB page, because
3941
* this function is called at a 4KB page granularity,
3942
* we only update the 4KB page under test.
3943
*/
3944
vm_page_dirty(m);
3945
}
3946
if ((oldl3e & PG_A) != 0) {
3947
/*
3948
* Since this reference bit is shared by 512 4KB
3949
* pages, it should not be cleared every time it is
3950
* tested. Apply a simple "hash" function on the
3951
* physical page number, the virtual superpage number,
3952
* and the pmap address to select one 4KB page out of
3953
* the 512 on which testing the reference bit will
3954
* result in clearing that reference bit. This
3955
* function is designed to avoid the selection of the
3956
* same 4KB page for every 2MB page mapping.
3957
*
3958
* On demotion, a mapping that hasn't been referenced
3959
* is simply destroyed. To avoid the possibility of a
3960
* subsequent page fault on a demoted wired mapping,
3961
* always leave its reference bit set. Moreover,
3962
* since the superpage is wired, the current state of
3963
* its reference bit won't affect page replacement.
3964
*/
3965
if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^
3966
(uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
3967
(oldl3e & PG_W) == 0) {
3968
atomic_clear_long(l3e, htobe64(PG_A));
3969
pmap_invalidate_page(pmap, pv->pv_va);
3970
cleared++;
3971
KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
3972
("inconsistent pv lock %p %p for page %p",
3973
lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
3974
} else
3975
not_cleared++;
3976
}
3977
PMAP_UNLOCK(pmap);
3978
/* Rotate the PV list if it has more than one entry. */
3979
if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
3980
TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
3981
TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
3982
pvh->pv_gen++;
3983
}
3984
if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
3985
goto out;
3986
} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
3987
small_mappings:
3988
if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
3989
goto out;
3990
pv = pvf;
3991
do {
3992
if (pvf == NULL)
3993
pvf = pv;
3994
pmap = PV_PMAP(pv);
3995
if (!PMAP_TRYLOCK(pmap)) {
3996
pvh_gen = pvh->pv_gen;
3997
md_gen = m->md.pv_gen;
3998
rw_wunlock(lock);
3999
PMAP_LOCK(pmap);
4000
rw_wlock(lock);
4001
if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4002
PMAP_UNLOCK(pmap);
4003
goto retry;
4004
}
4005
}
4006
l3e = pmap_pml3e(pmap, pv->pv_va);
4007
KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0,
4008
("pmap_ts_referenced: found a 2mpage in page %p's pv list",
4009
m));
4010
pte = pmap_l3e_to_pte(l3e, pv->pv_va);
4011
if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW))
4012
vm_page_dirty(m);
4013
if ((be64toh(*pte) & PG_A) != 0) {
4014
atomic_clear_long(pte, htobe64(PG_A));
4015
pmap_invalidate_page(pmap, pv->pv_va);
4016
cleared++;
4017
}
4018
PMAP_UNLOCK(pmap);
4019
/* Rotate the PV list if it has more than one entry. */
4020
if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
4021
TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
4022
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
4023
m->md.pv_gen++;
4024
}
4025
} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4026
not_cleared < PMAP_TS_REFERENCED_MAX);
4027
out:
4028
rw_wunlock(lock);
4029
vm_page_free_pages_toq(&free, true);
4030
return (cleared + not_cleared);
4031
}
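/*
 * Illustrative sketch, not part of the build: the "hash" used above to
 * decide whether a given call clears the shared PG_A bit of a 2MB
 * mapping.  XORing the physical page number, the virtual superpage
 * number and the pmap pointer picks one of the NPTEPG (512) 4KB pages;
 * the reference bit is cleared only when the page under test is the
 * selected one.  The shift amounts below assume 4KB/2MB pages and are
 * stand-ins, not the kernel's constants.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

#define	PAGE_SHIFT_SKETCH		12	/* 4KB base pages */
#define	L3_PAGE_SIZE_SHIFT_SKETCH	21	/* 2MB superpages */
#define	NPTEPG_SKETCH			512	/* PTEs per PT page */

static bool
ts_ref_clears_sketch(uint64_t pa, uint64_t va, uintptr_t pmap_addr)
{
	uint64_t h;

	h = (pa >> PAGE_SHIFT_SKETCH) ^ (va >> L3_PAGE_SIZE_SHIFT_SKETCH) ^
	    (uint64_t)pmap_addr;
	return ((h & (NPTEPG_SKETCH - 1)) == 0);
}
#endif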
4032
4033
static vm_offset_t
4034
mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start,
4035
vm_paddr_t end, int prot __unused)
4036
{
4037
4038
CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end,
4039
prot);
4040
return (PHYS_TO_DMAP(start));
4041
}
4042
4043
void
4044
mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr,
4045
vm_object_t object, vm_pindex_t pindex, vm_size_t size)
4046
{
4047
struct pctrie_iter pages;
4048
pml3_entry_t *l3e;
4049
vm_paddr_t pa, ptepa;
4050
vm_page_t p, pdpg;
4051
vm_memattr_t ma;
4052
4053
CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr,
4054
object, pindex, size);
4055
VM_OBJECT_ASSERT_WLOCKED(object);
4056
KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4057
("pmap_object_init_pt: non-device object"));
4058
/* NB: size can be logically ored with addr here */
4059
if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) {
4060
if (!mmu_radix_ps_enabled(pmap))
4061
return;
4062
if (!vm_object_populate(object, pindex, pindex + atop(size)))
4063
return;
4064
vm_page_iter_init(&pages, object);
4065
p = vm_radix_iter_lookup(&pages, pindex);
4066
4067
KASSERT(p->valid == VM_PAGE_BITS_ALL,
4068
("pmap_object_init_pt: invalid page %p", p));
4069
ma = p->md.mdpg_cache_attrs;
4070
4071
/*
4072
* Abort the mapping if the first page is not physically
4073
* aligned to a 2MB page boundary.
4074
*/
4075
ptepa = VM_PAGE_TO_PHYS(p);
4076
if (ptepa & L3_PAGE_MASK)
4077
return;
4078
4079
/*
4080
* Skip the first page. Abort the mapping if the rest of
4081
* the pages are not physically contiguous or have differing
4082
* memory attributes.
4083
*/
4084
for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
4085
pa += PAGE_SIZE) {
4086
p = vm_radix_iter_next(&pages);
4087
KASSERT(p->valid == VM_PAGE_BITS_ALL,
4088
("pmap_object_init_pt: invalid page %p", p));
4089
if (pa != VM_PAGE_TO_PHYS(p) ||
4090
ma != p->md.mdpg_cache_attrs)
4091
return;
4092
}
4093
4094
PMAP_LOCK(pmap);
4095
for (pa = ptepa | pmap_cache_bits(ma);
4096
pa < ptepa + size; pa += L3_PAGE_SIZE) {
4097
pdpg = pmap_allocl3e(pmap, addr, NULL);
4098
if (pdpg == NULL) {
4099
/*
4100
* The creation of mappings below is only an
4101
* optimization. If a page directory page
4102
* cannot be allocated without blocking,
4103
* continue on to the next mapping rather than
4104
* blocking.
4105
*/
4106
addr += L3_PAGE_SIZE;
4107
continue;
4108
}
4109
l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4110
l3e = &l3e[pmap_pml3e_index(addr)];
4111
if ((be64toh(*l3e) & PG_V) == 0) {
4112
pa |= PG_M | PG_A | PG_RW;
4113
pte_store(l3e, pa);
4114
pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
4115
counter_u64_add(pmap_l3e_mappings, 1);
4116
} else {
4117
/* Continue on if the PDE is already valid. */
4118
pdpg->ref_count--;
4119
KASSERT(pdpg->ref_count > 0,
4120
("pmap_object_init_pt: missing reference "
4121
"to page directory page, va: 0x%lx", addr));
4122
}
4123
addr += L3_PAGE_SIZE;
4124
}
4125
ptesync();
4126
PMAP_UNLOCK(pmap);
4127
}
4128
}
4129
4130
bool
4131
mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m)
4132
{
4133
struct md_page *pvh;
4134
struct rwlock *lock;
4135
pv_entry_t pv;
4136
int loops = 0;
4137
bool rv;
4138
4139
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4140
("pmap_page_exists_quick: page %p is not managed", m));
4141
CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m);
4142
rv = false;
4143
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4144
rw_rlock(lock);
4145
TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
4146
if (PV_PMAP(pv) == pmap) {
4147
rv = true;
4148
break;
4149
}
4150
loops++;
4151
if (loops >= 16)
4152
break;
4153
}
4154
if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4155
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4156
TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
4157
if (PV_PMAP(pv) == pmap) {
4158
rv = true;
4159
break;
4160
}
4161
loops++;
4162
if (loops >= 16)
4163
break;
4164
}
4165
}
4166
rw_runlock(lock);
4167
return (rv);
4168
}
4169
4170
void
4171
mmu_radix_page_init(vm_page_t m)
4172
{
4173
4174
CTR2(KTR_PMAP, "%s(%p)", __func__, m);
4175
TAILQ_INIT(&m->md.pv_list);
4176
m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
4177
}
4178
4179
int
4180
mmu_radix_page_wired_mappings(vm_page_t m)
4181
{
4182
struct rwlock *lock;
4183
struct md_page *pvh;
4184
pmap_t pmap;
4185
pt_entry_t *pte;
4186
pv_entry_t pv;
4187
int count, md_gen, pvh_gen;
4188
4189
if ((m->oflags & VPO_UNMANAGED) != 0)
4190
return (0);
4191
CTR2(KTR_PMAP, "%s(%p)", __func__, m);
4192
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4193
rw_rlock(lock);
4194
restart:
4195
count = 0;
4196
TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
4197
pmap = PV_PMAP(pv);
4198
if (!PMAP_TRYLOCK(pmap)) {
4199
md_gen = m->md.pv_gen;
4200
rw_runlock(lock);
4201
PMAP_LOCK(pmap);
4202
rw_rlock(lock);
4203
if (md_gen != m->md.pv_gen) {
4204
PMAP_UNLOCK(pmap);
4205
goto restart;
4206
}
4207
}
4208
pte = pmap_pte(pmap, pv->pv_va);
4209
if ((be64toh(*pte) & PG_W) != 0)
4210
count++;
4211
PMAP_UNLOCK(pmap);
4212
}
4213
if ((m->flags & PG_FICTITIOUS) == 0) {
4214
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4215
TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
4216
pmap = PV_PMAP(pv);
4217
if (!PMAP_TRYLOCK(pmap)) {
4218
md_gen = m->md.pv_gen;
4219
pvh_gen = pvh->pv_gen;
4220
rw_runlock(lock);
4221
PMAP_LOCK(pmap);
4222
rw_rlock(lock);
4223
if (md_gen != m->md.pv_gen ||
4224
pvh_gen != pvh->pv_gen) {
4225
PMAP_UNLOCK(pmap);
4226
goto restart;
4227
}
4228
}
4229
pte = pmap_pml3e(pmap, pv->pv_va);
4230
if ((be64toh(*pte) & PG_W) != 0)
4231
count++;
4232
PMAP_UNLOCK(pmap);
4233
}
4234
}
4235
rw_runlock(lock);
4236
return (count);
4237
}
4238
4239
static void
4240
mmu_radix_update_proctab(int pid, pml1_entry_t l1pa)
4241
{
4242
isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT);
4243
}
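/*
 * Illustrative sketch, not part of the build: the process-table entry
 * written above is the page-table root's physical address OR'd with
 * size/geometry fields and stored big-endian.  The field arguments here
 * are placeholders for illustration; they are not the ISA 3.0 radix
 * encodings of RTS_SIZE or RADIX_PGD_INDEX_SHIFT.
 */
#if 0
#include <sys/endian.h>
#include <stdint.h>

static uint64_t
proctab0_sketch(uint64_t root_pa, uint64_t rts_bits, uint64_t index_bits)
{
	/* Compose in host order, then store big-endian as the MMU expects. */
	return (htobe64(rts_bits | root_pa | index_bits));
}
#endif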
4244
4245
int
4246
mmu_radix_pinit(pmap_t pmap)
4247
{
4248
vmem_addr_t pid;
4249
vm_paddr_t l1pa;
4250
4251
CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4252
4253
/*
4254
* allocate the page directory page
4255
*/
4256
pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK);
4257
4258
for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++)
4259
pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE);
4260
vm_radix_init(&pmap->pm_radix);
4261
TAILQ_INIT(&pmap->pm_pvchunk);
4262
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4263
pmap->pm_flags = PMAP_PDE_SUPERPAGE;
4264
vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid);
4265
4266
pmap->pm_pid = pid;
4267
l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1);
4268
mmu_radix_update_proctab(pid, l1pa);
4269
__asm __volatile("ptesync;isync" : : : "memory");
4270
4271
return (1);
4272
}
4273
4274
/*
4275
* This routine is called if the desired page table page does not exist.
4276
*
4277
* If page table page allocation fails, this routine may sleep before
4278
* returning NULL. It sleeps only if a lock pointer was given.
4279
*
4280
* Note: If a page allocation fails at page table level two or three,
4281
* one or two pages may be held during the wait, only to be released
4282
* afterwards. This conservative approach is easily argued to avoid
4283
* race conditions.
4284
*/
4285
static vm_page_t
4286
_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
4287
{
4288
vm_page_t m, pdppg, pdpg;
4289
4290
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4291
4292
/*
4293
* Allocate a page table page.
4294
*/
4295
if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
4296
if (lockp != NULL) {
4297
RELEASE_PV_LIST_LOCK(lockp);
4298
PMAP_UNLOCK(pmap);
4299
vm_wait(NULL);
4300
PMAP_LOCK(pmap);
4301
}
4302
/*
4303
* Indicate the need to retry. While waiting, the page table
4304
* page may have been allocated.
4305
*/
4306
return (NULL);
4307
}
4308
m->pindex = ptepindex;
4309
4310
/*
4311
* Map the pagetable page into the process address space, if
4312
* it isn't already there.
4313
*/
4314
4315
if (ptepindex >= (NUPDE + NUPDPE)) {
4316
pml1_entry_t *l1e;
4317
vm_pindex_t pml1index;
4318
4319
/* Wire up a new PDPE page */
4320
pml1index = ptepindex - (NUPDE + NUPDPE);
4321
l1e = &pmap->pm_pml1[pml1index];
4322
KASSERT((be64toh(*l1e) & PG_V) == 0,
4323
("%s: L1 entry %#lx is valid", __func__, *l1e));
4324
pde_store(l1e, VM_PAGE_TO_PHYS(m));
4325
} else if (ptepindex >= NUPDE) {
4326
vm_pindex_t pml1index;
4327
vm_pindex_t pdpindex;
4328
pml1_entry_t *l1e;
4329
pml2_entry_t *l2e;
4330
4331
/* Wire up a new l2e page */
4332
pdpindex = ptepindex - NUPDE;
4333
pml1index = pdpindex >> RPTE_SHIFT;
4334
4335
l1e = &pmap->pm_pml1[pml1index];
4336
if ((be64toh(*l1e) & PG_V) == 0) {
4337
/* Have to allocate a new pdp, recurse */
4338
if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index,
4339
lockp) == NULL) {
4340
vm_page_unwire_noq(m);
4341
vm_page_free_zero(m);
4342
return (NULL);
4343
}
4344
} else {
4345
/* Add reference to l2e page */
4346
pdppg = PHYS_TO_VM_PAGE(be64toh(*l1e) & PG_FRAME);
4347
pdppg->ref_count++;
4348
}
4349
l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
4350
4351
/* Now find the pdp page */
4352
l2e = &l2e[pdpindex & RPTE_MASK];
4353
KASSERT((be64toh(*l2e) & PG_V) == 0,
4354
("%s: L2 entry %#lx is valid", __func__, *l2e));
4355
pde_store(l2e, VM_PAGE_TO_PHYS(m));
4356
} else {
4357
vm_pindex_t pml1index;
4358
vm_pindex_t pdpindex;
4359
pml1_entry_t *l1e;
4360
pml2_entry_t *l2e;
4361
pml3_entry_t *l3e;
4362
4363
/* Wire up a new PTE page */
4364
pdpindex = ptepindex >> RPTE_SHIFT;
4365
pml1index = pdpindex >> RPTE_SHIFT;
4366
4367
/* First, find the pdp and check that it's valid. */
4368
l1e = &pmap->pm_pml1[pml1index];
4369
if ((be64toh(*l1e) & PG_V) == 0) {
4370
/* Have to allocate a new pd, recurse */
4371
if (_pmap_allocpte(pmap, NUPDE + pdpindex,
4372
lockp) == NULL) {
4373
vm_page_unwire_noq(m);
4374
vm_page_free_zero(m);
4375
return (NULL);
4376
}
4377
l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
4378
l2e = &l2e[pdpindex & RPTE_MASK];
4379
} else {
4380
l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
4381
l2e = &l2e[pdpindex & RPTE_MASK];
4382
if ((be64toh(*l2e) & PG_V) == 0) {
4383
/* Have to allocate a new pd, recurse */
4384
if (_pmap_allocpte(pmap, NUPDE + pdpindex,
4385
lockp) == NULL) {
4386
vm_page_unwire_noq(m);
4387
vm_page_free_zero(m);
4388
return (NULL);
4389
}
4390
} else {
4391
/* Add reference to the pd page */
4392
pdpg = PHYS_TO_VM_PAGE(be64toh(*l2e) & PG_FRAME);
4393
pdpg->ref_count++;
4394
}
4395
}
4396
l3e = (pml3_entry_t *)PHYS_TO_DMAP(be64toh(*l2e) & PG_FRAME);
4397
4398
/* Now we know where the page directory page is */
4399
l3e = &l3e[ptepindex & RPTE_MASK];
4400
KASSERT((be64toh(*l3e) & PG_V) == 0,
4401
("%s: L3 entry %#lx is valid", __func__, *l3e));
4402
pde_store(l3e, VM_PAGE_TO_PHYS(m));
4403
}
4404
4405
pmap_resident_count_inc(pmap, 1);
4406
return (m);
4407
}
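/*
 * Illustrative sketch, not part of the build: how _pmap_allocpte()
 * partitions the page-table-page index space.  Indexes below NUPDE name
 * leaf page-table pages, the next NUPDPE indexes name page-directory
 * pages, and anything above that names a page-directory-pointer page;
 * shifting by RPTE_SHIFT walks one level up the tree.  The constants
 * below assume 512 entries per level and are stand-ins, not the
 * kernel's definitions.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define	RPTE_SHIFT_SKETCH	9			/* 512 entries/level */
#define	NUPDPE_SKETCH		(512UL * 512)		/* PD pages */
#define	NUPDE_SKETCH		(512UL * 512 * 512)	/* leaf PT pages */

static void
classify_pindex_sketch(uint64_t ptepindex)
{
	uint64_t pdpindex;

	if (ptepindex >= NUPDE_SKETCH + NUPDPE_SKETCH) {
		printf("PDP page, L1 slot %ju\n",
		    (uintmax_t)(ptepindex - (NUPDE_SKETCH + NUPDPE_SKETCH)));
	} else if (ptepindex >= NUPDE_SKETCH) {
		pdpindex = ptepindex - NUPDE_SKETCH;
		printf("PD page, parent L1 slot %ju\n",
		    (uintmax_t)(pdpindex >> RPTE_SHIFT_SKETCH));
	} else {
		pdpindex = ptepindex >> RPTE_SHIFT_SKETCH;
		printf("PT page, parent PD index %ju, grandparent %ju\n",
		    (uintmax_t)pdpindex,
		    (uintmax_t)(pdpindex >> RPTE_SHIFT_SKETCH));
	}
}
#endif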
4408
static vm_page_t
4409
pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
4410
{
4411
vm_pindex_t pdpindex, ptepindex;
4412
pml2_entry_t *pdpe;
4413
vm_page_t pdpg;
4414
4415
retry:
4416
pdpe = pmap_pml2e(pmap, va);
4417
if (pdpe != NULL && (be64toh(*pdpe) & PG_V) != 0) {
4418
/* Add a reference to the pd page. */
4419
pdpg = PHYS_TO_VM_PAGE(be64toh(*pdpe) & PG_FRAME);
4420
pdpg->ref_count++;
4421
} else {
4422
/* Allocate a pd page. */
4423
ptepindex = pmap_l3e_pindex(va);
4424
pdpindex = ptepindex >> RPTE_SHIFT;
4425
pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
4426
if (pdpg == NULL && lockp != NULL)
4427
goto retry;
4428
}
4429
return (pdpg);
4430
}
4431
4432
static vm_page_t
4433
pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
4434
{
4435
vm_pindex_t ptepindex;
4436
pml3_entry_t *pd;
4437
vm_page_t m;
4438
4439
/*
4440
* Calculate pagetable page index
4441
*/
4442
ptepindex = pmap_l3e_pindex(va);
4443
retry:
4444
/*
4445
* Get the page directory entry
4446
*/
4447
pd = pmap_pml3e(pmap, va);
4448
4449
/*
4450
* This supports switching from a 2MB page to a
4451
* normal 4K page.
4452
*/
4453
if (pd != NULL && (be64toh(*pd) & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) {
4454
if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) {
4455
/*
4456
* Invalidation of the 2MB page mapping may have caused
4457
* the deallocation of the underlying PD page.
4458
*/
4459
pd = NULL;
4460
}
4461
}
4462
4463
/*
4464
* If the page table page is mapped, we just increment the
4465
* hold count, and activate it.
4466
*/
4467
if (pd != NULL && (be64toh(*pd) & PG_V) != 0) {
4468
m = PHYS_TO_VM_PAGE(be64toh(*pd) & PG_FRAME);
4469
m->ref_count++;
4470
} else {
4471
/*
4472
* Here if the pte page isn't mapped, or if it has been
4473
* deallocated.
4474
*/
4475
m = _pmap_allocpte(pmap, ptepindex, lockp);
4476
if (m == NULL && lockp != NULL)
4477
goto retry;
4478
}
4479
return (m);
4480
}
4481
4482
static void
4483
mmu_radix_pinit0(pmap_t pmap)
4484
{
4485
4486
CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4487
PMAP_LOCK_INIT(pmap);
4488
pmap->pm_pml1 = kernel_pmap->pm_pml1;
4489
pmap->pm_pid = kernel_pmap->pm_pid;
4490
4491
vm_radix_init(&pmap->pm_radix);
4492
TAILQ_INIT(&pmap->pm_pvchunk);
4493
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4494
kernel_pmap->pm_flags =
4495
pmap->pm_flags = PMAP_PDE_SUPERPAGE;
4496
}
4497
/*
4498
* pmap_protect_l3e: do the things to protect a 2mpage in a process
4499
*/
4500
static bool
4501
pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot)
4502
{
4503
pt_entry_t newpde, oldpde;
4504
vm_offset_t eva, va;
4505
vm_page_t m;
4506
bool anychanged;
4507
4508
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4509
KASSERT((sva & L3_PAGE_MASK) == 0,
4510
("pmap_protect_l3e: sva is not 2mpage aligned"));
4511
anychanged = false;
4512
retry:
4513
oldpde = newpde = be64toh(*l3e);
4514
if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
4515
(PG_MANAGED | PG_M | PG_RW)) {
4516
eva = sva + L3_PAGE_SIZE;
4517
for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4518
va < eva; va += PAGE_SIZE, m++)
4519
vm_page_dirty(m);
4520
}
4521
if ((prot & VM_PROT_WRITE) == 0) {
4522
newpde &= ~(PG_RW | PG_M);
4523
newpde |= RPTE_EAA_R;
4524
}
4525
if (prot & VM_PROT_EXECUTE)
4526
newpde |= PG_X;
4527
if (newpde != oldpde) {
4528
/*
4529
* As an optimization to future operations on this PDE, clear
4530
* PG_PROMOTED. The impending invalidation will remove any
4531
* lingering 4KB page mappings from the TLB.
4532
*/
4533
if (!atomic_cmpset_long(l3e, htobe64(oldpde), htobe64(newpde & ~PG_PROMOTED)))
4534
goto retry;
4535
anychanged = true;
4536
}
4537
return (anychanged);
4538
}
4539
4540
void
4541
mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
4542
vm_prot_t prot)
4543
{
4544
vm_offset_t va_next;
4545
pml1_entry_t *l1e;
4546
pml2_entry_t *l2e;
4547
pml3_entry_t ptpaddr, *l3e;
4548
pt_entry_t *pte;
4549
bool anychanged;
4550
4551
CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva,
4552
prot);
4553
4554
KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4555
if (prot == VM_PROT_NONE) {
4556
mmu_radix_remove(pmap, sva, eva);
4557
return;
4558
}
4559
4560
if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
4561
(VM_PROT_WRITE|VM_PROT_EXECUTE))
4562
return;
4563
4564
#ifdef INVARIANTS
4565
if (VERBOSE_PROTECT || pmap_logging)
4566
printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n",
4567
pmap, sva, eva, prot, pmap->pm_pid);
4568
#endif
4569
anychanged = false;
4570
4571
PMAP_LOCK(pmap);
4572
for (; sva < eva; sva = va_next) {
4573
l1e = pmap_pml1e(pmap, sva);
4574
if ((be64toh(*l1e) & PG_V) == 0) {
4575
va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
4576
if (va_next < sva)
4577
va_next = eva;
4578
continue;
4579
}
4580
4581
l2e = pmap_l1e_to_l2e(l1e, sva);
4582
if ((be64toh(*l2e) & PG_V) == 0) {
4583
va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
4584
if (va_next < sva)
4585
va_next = eva;
4586
continue;
4587
}
4588
4589
va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
4590
if (va_next < sva)
4591
va_next = eva;
4592
4593
l3e = pmap_l2e_to_l3e(l2e, sva);
4594
ptpaddr = be64toh(*l3e);
4595
4596
/*
4597
* Weed out invalid mappings.
4598
*/
4599
if (ptpaddr == 0)
4600
continue;
4601
4602
/*
4603
* Check for large page.
4604
*/
4605
if ((ptpaddr & RPTE_LEAF) != 0) {
4606
/*
4607
* Are we protecting the entire large page? If not,
4608
* demote the mapping and fall through.
4609
*/
4610
if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
4611
if (pmap_protect_l3e(pmap, l3e, sva, prot))
4612
anychanged = true;
4613
continue;
4614
} else if (!pmap_demote_l3e(pmap, l3e, sva)) {
4615
/*
4616
* The large page mapping was destroyed.
4617
*/
4618
continue;
4619
}
4620
}
4621
4622
if (va_next > eva)
4623
va_next = eva;
4624
4625
for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
4626
sva += PAGE_SIZE) {
4627
pt_entry_t obits, pbits;
4628
vm_page_t m;
4629
4630
retry:
4631
MPASS(pte == pmap_pte(pmap, sva));
4632
obits = pbits = be64toh(*pte);
4633
if ((pbits & PG_V) == 0)
4634
continue;
4635
4636
if ((prot & VM_PROT_WRITE) == 0) {
4637
if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
4638
(PG_MANAGED | PG_M | PG_RW)) {
4639
m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
4640
vm_page_dirty(m);
4641
}
4642
pbits &= ~(PG_RW | PG_M);
4643
pbits |= RPTE_EAA_R;
4644
}
4645
if (prot & VM_PROT_EXECUTE)
4646
pbits |= PG_X;
4647
4648
if (pbits != obits) {
4649
if (!atomic_cmpset_long(pte, htobe64(obits), htobe64(pbits)))
4650
goto retry;
4651
if (obits & (PG_A|PG_M)) {
4652
anychanged = true;
4653
#ifdef INVARIANTS
4654
if (VERBOSE_PROTECT || pmap_logging)
4655
printf("%#lx %#lx -> %#lx\n",
4656
sva, obits, pbits);
4657
#endif
4658
}
4659
}
4660
}
4661
}
4662
if (anychanged)
4663
pmap_invalidate_all(pmap);
4664
PMAP_UNLOCK(pmap);
4665
}
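/*
 * Illustrative sketch, not part of the build: the lock-free update loop
 * used by mmu_radix_protect() above.  The PTE is read, a modified copy
 * is built (write permission and the dirty bit cleared, read access
 * kept), and compare-and-swap installs it; if the entry changed in the
 * meantime (e.g. the hardware set PG_A or PG_M) the CAS fails and the
 * loop retries.  The real code additionally byte-swaps the entry with
 * be64toh()/htobe64().  The bit values are stand-ins.
 */
#if 0
#include <stdatomic.h>
#include <stdint.h>

#define	PG_RW_SKETCH	0x4UL
#define	PG_M_SKETCH	0x8UL
#define	EAA_R_SKETCH	0x10UL

static void
write_protect_pte_sketch(_Atomic uint64_t *pte)
{
	uint64_t obits, pbits;

	do {
		obits = atomic_load(pte);
		pbits = (obits & ~(PG_RW_SKETCH | PG_M_SKETCH)) |
		    EAA_R_SKETCH;
	} while (!atomic_compare_exchange_weak(pte, &obits, pbits));
}
#endif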
4666
4667
void
4668
mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count)
4669
{
4670
4671
CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count);
4672
pt_entry_t oldpte, pa, *pte;
4673
vm_page_t m;
4674
uint64_t cache_bits, attr_bits;
4675
vm_offset_t va;
4676
4677
oldpte = 0;
4678
attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
4679
va = sva;
4680
pte = kvtopte(va);
4681
while (va < sva + PAGE_SIZE * count) {
4682
if (__predict_false((va & L3_PAGE_MASK) == 0))
4683
pte = kvtopte(va);
4684
MPASS(pte == pmap_pte(kernel_pmap, va));
4685
4686
/*
4687
* XXX there has to be a more efficient way than traversing
4688
* the page table every time - but go for correctness for
4689
* today
4690
*/
4691
4692
m = *ma++;
4693
cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs);
4694
pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits;
4695
if (be64toh(*pte) != pa) {
4696
oldpte |= be64toh(*pte);
4697
pte_store(pte, pa);
4698
}
4699
va += PAGE_SIZE;
4700
pte++;
4701
}
4702
if (__predict_false((oldpte & RPTE_VALID) != 0))
4703
pmap_invalidate_range(kernel_pmap, sva, sva + count *
4704
PAGE_SIZE);
4705
else
4706
ptesync();
4707
}
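/*
 * Illustrative sketch, not part of the build: why the loop above calls
 * kvtopte() again whenever (va & L3_PAGE_MASK) == 0.  PTEs are only
 * contiguous within a single page-table page, which covers L3_PAGE_SIZE
 * (2MB) of virtual address space, so crossing that boundary requires a
 * fresh page-table walk.  Constants are stand-ins.
 */
#if 0
#include <stdint.h>

#define	PAGE_SIZE_SKETCH	4096UL
#define	L3_PAGE_MASK_SKETCH	((2UL * 1024 * 1024) - 1)

static unsigned int
pt_page_rewalks_sketch(uint64_t sva, unsigned int count)
{
	unsigned int rewalks = 0;
	uint64_t va;

	for (va = sva; va < sva + count * PAGE_SIZE_SKETCH;
	    va += PAGE_SIZE_SKETCH) {
		if ((va & L3_PAGE_MASK_SKETCH) == 0)
			rewalks++;	/* would re-run kvtopte(va) here */
	}
	return (rewalks);
}
#endif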
4708
4709
void
4710
mmu_radix_qremove(vm_offset_t sva, int count)
4711
{
4712
vm_offset_t va;
4713
pt_entry_t *pte;
4714
4715
CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count);
4716
KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva));
4717
4718
va = sva;
4719
pte = kvtopte(va);
4720
while (va < sva + PAGE_SIZE * count) {
4721
if (__predict_false((va & L3_PAGE_MASK) == 0))
4722
pte = kvtopte(va);
4723
pte_clear(pte);
4724
pte++;
4725
va += PAGE_SIZE;
4726
}
4727
pmap_invalidate_range(kernel_pmap, sva, va);
4728
}
4729
4730
/***************************************************
4731
* Page table page management routines.....
4732
***************************************************/
4733
/*
4734
* Schedule the specified unused page table page to be freed. Specifically,
4735
* add the page to the specified list of pages that will be released to the
4736
* physical memory manager after the TLB has been updated.
4737
*/
4738
static __inline void
4739
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
4740
{
4741
4742
if (set_PG_ZERO)
4743
m->flags |= PG_ZERO;
4744
else
4745
m->flags &= ~PG_ZERO;
4746
SLIST_INSERT_HEAD(free, m, plinks.s.ss);
4747
}
4748
4749
/*
4750
* Inserts the specified page table page into the specified pmap's collection
4751
* of idle page table pages. Each of a pmap's page table pages is responsible
4752
* for mapping a distinct range of virtual addresses. The pmap's collection is
4753
* ordered by this virtual address range.
4754
*/
4755
static __inline int
4756
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
4757
{
4758
4759
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4760
return (vm_radix_insert(&pmap->pm_radix, mpte));
4761
}
4762
4763
/*
4764
* Removes the page table page mapping the specified virtual address from the
4765
* specified pmap's collection of idle page table pages, and returns it.
4766
* Otherwise, returns NULL if there is no page table page corresponding to the
4767
* specified virtual address.
4768
*/
4769
static __inline vm_page_t
4770
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4771
{
4772
4773
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4774
return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va)));
4775
}
4776
4777
/*
4778
* Decrements a page table page's wire count, which is used to record the
4779
* number of valid page table entries within the page. If the wire count
4780
* drops to zero, then the page table page is unmapped. Returns true if the
4781
* page table page was unmapped and false otherwise.
4782
*/
4783
static inline bool
4784
pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4785
{
4786
4787
--m->ref_count;
4788
if (m->ref_count == 0) {
4789
_pmap_unwire_ptp(pmap, va, m, free);
4790
return (true);
4791
} else
4792
return (false);
4793
}
4794
4795
static void
4796
_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4797
{
4798
4799
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4800
/*
4801
* unmap the page table page
4802
*/
4803
if (m->pindex >= NUPDE + NUPDPE) {
4804
/* PDP page */
4805
pml1_entry_t *pml1;
4806
pml1 = pmap_pml1e(pmap, va);
4807
*pml1 = 0;
4808
} else if (m->pindex >= NUPDE) {
4809
/* PD page */
4810
pml2_entry_t *l2e;
4811
l2e = pmap_pml2e(pmap, va);
4812
*l2e = 0;
4813
} else {
4814
/* PTE page */
4815
pml3_entry_t *l3e;
4816
l3e = pmap_pml3e(pmap, va);
4817
*l3e = 0;
4818
}
4819
pmap_resident_count_dec(pmap, 1);
4820
if (m->pindex < NUPDE) {
4821
/* We just released a PT, unhold the matching PD */
4822
vm_page_t pdpg;
4823
4824
pdpg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml2e(pmap, va)) & PG_FRAME);
4825
pmap_unwire_ptp(pmap, va, pdpg, free);
4826
}
4827
else if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
4828
/* We just released a PD, unhold the matching PDP */
4829
vm_page_t pdppg;
4830
4831
pdppg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml1e(pmap, va)) & PG_FRAME);
4832
pmap_unwire_ptp(pmap, va, pdppg, free);
4833
}
4834
4835
/*
4836
* Put page on a list so that it is released after
4837
* *ALL* TLB shootdown is done
4838
*/
4839
pmap_add_delayed_free_list(m, free, true);
4840
}
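/*
 * Illustrative sketch, not part of the build: the recursive teardown
 * performed by pmap_unwire_ptp()/_pmap_unwire_ptp() above.  Each
 * page-table page counts its valid entries; when the count reaches zero
 * the page is unlinked from its parent, which in turn loses one
 * reference and may itself be freed, and so on up the tree.  The real
 * code also defers freeing the pages until after the TLB shootdown.
 */
#if 0
#include <stdbool.h>
#include <stddef.h>

struct ptpage_sketch {
	int			ref_count;
	struct ptpage_sketch	*parent;	/* NULL at the root level */
};

static bool
unwire_ptp_sketch(struct ptpage_sketch *m)
{
	if (--m->ref_count != 0)
		return (false);
	/* Clear the parent's entry pointing at 'm', then unhold it too. */
	if (m->parent != NULL)
		(void)unwire_ptp_sketch(m->parent);
	return (true);
}
#endif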
4841
4842
/*
4843
* After removing a page table entry, this routine is used to
4844
* conditionally free the page, and manage the hold/wire counts.
4845
*/
4846
static int
4847
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde,
4848
struct spglist *free)
4849
{
4850
vm_page_t mpte;
4851
4852
if (va >= VM_MAXUSER_ADDRESS)
4853
return (0);
4854
KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
4855
mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
4856
return (pmap_unwire_ptp(pmap, va, mpte, free));
4857
}
4858
4859
void
4860
mmu_radix_release(pmap_t pmap)
4861
{
4862
4863
CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4864
KASSERT(pmap->pm_stats.resident_count == 0,
4865
("pmap_release: pmap resident count %ld != 0",
4866
pmap->pm_stats.resident_count));
4867
KASSERT(vm_radix_is_empty(&pmap->pm_radix),
4868
("pmap_release: pmap has reserved page table page(s)"));
4869
4870
pmap_invalidate_all(pmap);
4871
isa3_proctab[pmap->pm_pid].proctab0 = 0;
4872
uma_zfree(zone_radix_pgd, pmap->pm_pml1);
4873
vmem_free(asid_arena, pmap->pm_pid, 1);
4874
}
4875
4876
/*
4877
* Create the PV entry for a 2MB page mapping. Always returns true unless the
4878
* flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
4879
* false if the PV entry cannot be allocated without resorting to reclamation.
4880
*/
4881
static bool
4882
pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags,
4883
struct rwlock **lockp)
4884
{
4885
struct md_page *pvh;
4886
pv_entry_t pv;
4887
vm_paddr_t pa;
4888
4889
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4890
/* Pass NULL instead of the lock pointer to disable reclamation. */
4891
if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
4892
NULL : lockp)) == NULL)
4893
return (false);
4894
pv->pv_va = va;
4895
pa = pde & PG_PS_FRAME;
4896
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4897
pvh = pa_to_pvh(pa);
4898
TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
4899
pvh->pv_gen++;
4900
return (true);
4901
}
4902
4903
/*
4904
* Fills a page table page with mappings to consecutive physical pages.
4905
*/
4906
static void
4907
pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
4908
{
4909
pt_entry_t *pte;
4910
4911
for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
4912
*pte = htobe64(newpte);
4913
newpte += PAGE_SIZE;
4914
}
4915
}
4916
4917
static bool
4918
pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va)
4919
{
4920
struct rwlock *lock;
4921
bool rv;
4922
4923
lock = NULL;
4924
rv = pmap_demote_l3e_locked(pmap, pde, va, &lock);
4925
if (lock != NULL)
4926
rw_wunlock(lock);
4927
return (rv);
4928
}
4929
4930
static bool
4931
pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
4932
struct rwlock **lockp)
4933
{
4934
pml3_entry_t oldpde;
4935
pt_entry_t *firstpte;
4936
vm_paddr_t mptepa;
4937
vm_page_t mpte;
4938
struct spglist free;
4939
vm_offset_t sva;
4940
4941
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4942
oldpde = be64toh(*l3e);
4943
KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
4944
("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx",
4945
oldpde));
4946
if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
4947
NULL) {
4948
KASSERT((oldpde & PG_W) == 0,
4949
("pmap_demote_l3e: page table page for a wired mapping"
4950
" is missing"));
4951
4952
/*
4953
* Invalidate the 2MB page mapping and return "failure" if the
4954
* mapping was never accessed or the allocation of the new
4955
* page table page fails. If the 2MB page mapping belongs to
4956
* the direct map region of the kernel's address space, then
4957
* the page allocation request specifies the highest possible
4958
* priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is
4959
* normal. Page table pages are preallocated for every other
4960
* part of the kernel address space, so the direct map region
4961
* is the only part of the kernel address space that must be
4962
* handled here.
4963
*/
4964
if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc_noobj(
4965
(va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS ?
4966
VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED)) == NULL) {
4967
SLIST_INIT(&free);
4968
sva = trunc_2mpage(va);
4969
pmap_remove_l3e(pmap, l3e, sva, &free, lockp);
4970
pmap_invalidate_l3e_page(pmap, sva, oldpde);
4971
vm_page_free_pages_toq(&free, true);
4972
CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx"
4973
" in pmap %p", va, pmap);
4974
return (false);
4975
}
4976
mpte->pindex = pmap_l3e_pindex(va);
4977
if (va < VM_MAXUSER_ADDRESS)
4978
pmap_resident_count_inc(pmap, 1);
4979
}
4980
mptepa = VM_PAGE_TO_PHYS(mpte);
4981
firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
4982
KASSERT((oldpde & PG_A) != 0,
4983
("pmap_demote_l3e: oldpde is missing PG_A"));
4984
KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
4985
("pmap_demote_l3e: oldpde is missing PG_M"));
4986
4987
/*
4988
* If the page table page is new, initialize it.
4989
*/
4990
if (mpte->ref_count == 1) {
4991
mpte->ref_count = NPTEPG;
4992
pmap_fill_ptp(firstpte, oldpde);
4993
}
4994
4995
KASSERT((be64toh(*firstpte) & PG_FRAME) == (oldpde & PG_FRAME),
4996
("pmap_demote_l3e: firstpte and newpte map different physical"
4997
" addresses"));
4998
4999
/*
5000
* If the mapping has changed attributes, update the page table
5001
* entries.
5002
*/
5003
if ((be64toh(*firstpte) & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE))
5004
pmap_fill_ptp(firstpte, oldpde);
5005
5006
/*
5007
* The spare PV entries must be reserved prior to demoting the
5008
* mapping, that is, prior to changing the PDE. Otherwise, the state
5009
* of the PDE and the PV lists will be inconsistent, which can result
5010
* in reclaim_pv_chunk() attempting to remove a PV entry from the
5011
* wrong PV list and pmap_pv_demote_l3e() failing to find the expected
5012
* PV entry for the 2MB page mapping that is being demoted.
5013
*/
5014
if ((oldpde & PG_MANAGED) != 0)
5015
reserve_pv_entries(pmap, NPTEPG - 1, lockp);
5016
5017
/*
5018
* Demote the mapping. This pmap is locked. The old PDE has
5019
* PG_A set. If the old PDE has PG_RW set, it also has PG_M
5020
* set. Thus, there is no danger of a race with another
5021
* processor changing the setting of PG_A and/or PG_M between
5022
* the read above and the store below.
5023
*/
5024
pde_store(l3e, mptepa);
5025
pmap_invalidate_l3e_page(pmap, trunc_2mpage(va), oldpde);
5026
/*
5027
* Demote the PV entry.
5028
*/
5029
if ((oldpde & PG_MANAGED) != 0)
5030
pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp);
5031
5032
counter_u64_add(pmap_l3e_demotions, 1);
5033
CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx"
5034
" in pmap %p", va, pmap);
5035
return (true);
5036
}
5037
5038
/*
5039
* pmap_remove_kernel_l3e: Remove a kernel superpage mapping.
5040
*/
5041
static void
5042
pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va)
5043
{
5044
vm_paddr_t mptepa;
5045
vm_page_t mpte;
5046
5047
KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
5048
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5049
mpte = pmap_remove_pt_page(pmap, va);
5050
if (mpte == NULL)
5051
panic("pmap_remove_kernel_pde: Missing pt page.");
5052
5053
mptepa = VM_PAGE_TO_PHYS(mpte);
5054
5055
/*
5056
* Initialize the page table page.
5057
*/
5058
pagezero(PHYS_TO_DMAP(mptepa));
5059
5060
/*
5061
* Demote the mapping.
5062
*/
5063
pde_store(l3e, mptepa);
5064
ptesync();
5065
}
5066
5067
/*
5068
* pmap_remove_l3e: do the things to unmap a superpage in a process
5069
*/
5070
static int
5071
pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
5072
struct spglist *free, struct rwlock **lockp)
5073
{
5074
struct md_page *pvh;
5075
pml3_entry_t oldpde;
5076
vm_offset_t eva, va;
5077
vm_page_t m, mpte;
5078
5079
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5080
KASSERT((sva & L3_PAGE_MASK) == 0,
5081
("pmap_remove_l3e: sva is not 2mpage aligned"));
5082
oldpde = be64toh(pte_load_clear(pdq));
5083
if (oldpde & PG_W)
5084
pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE);
5085
pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
5086
if (oldpde & PG_MANAGED) {
5087
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
5088
pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
5089
pmap_pvh_free(pvh, pmap, sva);
5090
eva = sva + L3_PAGE_SIZE;
5091
for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
5092
va < eva; va += PAGE_SIZE, m++) {
5093
if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
5094
vm_page_dirty(m);
5095
if (oldpde & PG_A)
5096
vm_page_aflag_set(m, PGA_REFERENCED);
5097
if (TAILQ_EMPTY(&m->md.pv_list) &&
5098
TAILQ_EMPTY(&pvh->pv_list))
5099
vm_page_aflag_clear(m, PGA_WRITEABLE);
5100
}
5101
}
5102
if (pmap == kernel_pmap) {
5103
pmap_remove_kernel_l3e(pmap, pdq, sva);
5104
} else {
5105
mpte = pmap_remove_pt_page(pmap, sva);
5106
if (mpte != NULL) {
5107
pmap_resident_count_dec(pmap, 1);
5108
KASSERT(mpte->ref_count == NPTEPG,
5109
("pmap_remove_l3e: pte page wire count error"));
5110
mpte->ref_count = 0;
5111
pmap_add_delayed_free_list(mpte, free, false);
5112
}
5113
}
5114
return (pmap_unuse_pt(pmap, sva, be64toh(*pmap_pml2e(pmap, sva)), free));
5115
}
5116
5117
/*
5118
* pmap_remove_pte: do the things to unmap a page in a process
5119
*/
5120
static int
5121
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
5122
pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
5123
{
5124
struct md_page *pvh;
5125
pt_entry_t oldpte;
5126
vm_page_t m;
5127
5128
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5129
oldpte = be64toh(pte_load_clear(ptq));
5130
if (oldpte & RPTE_WIRED)
5131
pmap->pm_stats.wired_count -= 1;
5132
pmap_resident_count_dec(pmap, 1);
5133
if (oldpte & RPTE_MANAGED) {
5134
m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
5135
if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5136
vm_page_dirty(m);
5137
if (oldpte & PG_A)
5138
vm_page_aflag_set(m, PGA_REFERENCED);
5139
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
5140
pmap_pvh_free(&m->md, pmap, va);
5141
if (TAILQ_EMPTY(&m->md.pv_list) &&
5142
(m->flags & PG_FICTITIOUS) == 0) {
5143
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5144
if (TAILQ_EMPTY(&pvh->pv_list))
5145
vm_page_aflag_clear(m, PGA_WRITEABLE);
5146
}
5147
}
5148
return (pmap_unuse_pt(pmap, va, ptepde, free));
5149
}
5150
5151
/*
5152
* Remove a single page from a process address space
5153
*/
5154
static bool
5155
pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e,
5156
struct spglist *free)
5157
{
5158
struct rwlock *lock;
5159
pt_entry_t *pte;
5160
bool invalidate_all;
5161
5162
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5163
if ((be64toh(*l3e) & RPTE_VALID) == 0) {
5164
return (false);
5165
}
5166
pte = pmap_l3e_to_pte(l3e, va);
5167
if ((be64toh(*pte) & RPTE_VALID) == 0) {
5168
return (false);
5169
}
5170
lock = NULL;
5171
5172
invalidate_all = pmap_remove_pte(pmap, pte, va, be64toh(*l3e), free, &lock);
5173
if (lock != NULL)
5174
rw_wunlock(lock);
5175
if (!invalidate_all)
5176
pmap_invalidate_page(pmap, va);
5177
return (invalidate_all);
5178
}
5179
5180
/*
5181
* Removes the specified range of addresses from the page table page.
5182
*/
5183
static bool
5184
pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
5185
pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp)
5186
{
5187
pt_entry_t *pte;
5188
vm_offset_t va;
5189
bool anyvalid;
5190
5191
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5192
anyvalid = false;
5193
va = eva;
5194
for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++,
5195
sva += PAGE_SIZE) {
5196
MPASS(pte == pmap_pte(pmap, sva));
5197
if (*pte == 0) {
5198
if (va != eva) {
5199
anyvalid = true;
5200
va = eva;
5201
}
5202
continue;
5203
}
5204
if (va == eva)
5205
va = sva;
5206
if (pmap_remove_pte(pmap, pte, sva, be64toh(*l3e), free, lockp)) {
5207
anyvalid = true;
5208
sva += PAGE_SIZE;
5209
break;
5210
}
5211
}
5212
if (anyvalid)
5213
pmap_invalidate_all(pmap);
5214
else if (va != eva)
5215
pmap_invalidate_range(pmap, va, sva);
5216
return (anyvalid);
5217
}
5218
5219
void
5220
mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5221
{
5222
struct rwlock *lock;
5223
vm_offset_t va_next;
5224
pml1_entry_t *l1e;
5225
pml2_entry_t *l2e;
5226
pml3_entry_t ptpaddr, *l3e;
5227
struct spglist free;
5228
bool anyvalid;
5229
5230
CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
5231
5232
/*
5233
* Perform an unsynchronized read. This is, however, safe.
5234
*/
5235
if (pmap->pm_stats.resident_count == 0)
5236
return;
5237
5238
anyvalid = false;
5239
SLIST_INIT(&free);
5240
5241
/* XXX something fishy here */
5242
sva = (sva + PAGE_MASK) & ~PAGE_MASK;
5243
eva = (eva + PAGE_MASK) & ~PAGE_MASK;
5244
5245
PMAP_LOCK(pmap);
5246
5247
/*
5248
* Special handling for removing a single page: a very
5249
* common operation for which the general removal path
5250
* can be short-circuited.
5251
*/
5252
if (sva + PAGE_SIZE == eva) {
5253
l3e = pmap_pml3e(pmap, sva);
5254
if (l3e && (be64toh(*l3e) & RPTE_LEAF) == 0) {
5255
anyvalid = pmap_remove_page(pmap, sva, l3e, &free);
5256
goto out;
5257
}
5258
}
5259
5260
lock = NULL;
5261
for (; sva < eva; sva = va_next) {
5262
if (pmap->pm_stats.resident_count == 0)
5263
break;
5264
l1e = pmap_pml1e(pmap, sva);
5265
if (l1e == NULL || (be64toh(*l1e) & PG_V) == 0) {
5266
va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
5267
if (va_next < sva)
5268
va_next = eva;
5269
continue;
5270
}
5271
5272
l2e = pmap_l1e_to_l2e(l1e, sva);
5273
if (l2e == NULL || (be64toh(*l2e) & PG_V) == 0) {
5274
va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
5275
if (va_next < sva)
5276
va_next = eva;
5277
continue;
5278
}
5279
5280
/*
5281
* Calculate index for next page table.
5282
*/
5283
va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
5284
if (va_next < sva)
5285
va_next = eva;
5286
5287
l3e = pmap_l2e_to_l3e(l2e, sva);
5288
ptpaddr = be64toh(*l3e);
5289
5290
/*
5291
* Weed out invalid mappings.
5292
*/
5293
if (ptpaddr == 0)
5294
continue;
5295
5296
/*
5297
* Check for large page.
5298
*/
5299
if ((ptpaddr & RPTE_LEAF) != 0) {
5300
/*
5301
* Are we removing the entire large page? If not,
5302
* demote the mapping and fall through.
5303
*/
5304
if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
5305
pmap_remove_l3e(pmap, l3e, sva, &free, &lock);
5306
anyvalid = true;
5307
continue;
5308
} else if (!pmap_demote_l3e_locked(pmap, l3e, sva,
5309
&lock)) {
5310
/* The large page mapping was destroyed. */
5311
continue;
5312
} else
5313
ptpaddr = be64toh(*l3e);
5314
}
5315
5316
/*
5317
* Limit our scan to either the end of the va represented
5318
* by the current page table page, or to the end of the
5319
* range being removed.
5320
*/
5321
if (va_next > eva)
5322
va_next = eva;
5323
5324
if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock))
5325
anyvalid = true;
5326
}
5327
if (lock != NULL)
5328
rw_wunlock(lock);
5329
out:
5330
if (anyvalid)
5331
pmap_invalidate_all(pmap);
5332
PMAP_UNLOCK(pmap);
5333
vm_page_free_pages_toq(&free, true);
5334
}
5335
5336
void
5337
mmu_radix_remove_all(vm_page_t m)
5338
{
5339
struct md_page *pvh;
5340
pv_entry_t pv;
5341
pmap_t pmap;
5342
struct rwlock *lock;
5343
pt_entry_t *pte, tpte;
5344
pml3_entry_t *l3e;
5345
vm_offset_t va;
5346
struct spglist free;
5347
int pvh_gen, md_gen;
5348
5349
CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5350
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5351
("pmap_remove_all: page %p is not managed", m));
5352
SLIST_INIT(&free);
5353
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5354
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5355
pa_to_pvh(VM_PAGE_TO_PHYS(m));
5356
retry:
5357
rw_wlock(lock);
5358
while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
5359
pmap = PV_PMAP(pv);
5360
if (!PMAP_TRYLOCK(pmap)) {
5361
pvh_gen = pvh->pv_gen;
5362
rw_wunlock(lock);
5363
PMAP_LOCK(pmap);
5364
rw_wlock(lock);
5365
if (pvh_gen != pvh->pv_gen) {
5366
rw_wunlock(lock);
5367
PMAP_UNLOCK(pmap);
5368
goto retry;
5369
}
5370
}
5371
va = pv->pv_va;
5372
l3e = pmap_pml3e(pmap, va);
5373
(void)pmap_demote_l3e_locked(pmap, l3e, va, &lock);
5374
PMAP_UNLOCK(pmap);
5375
}
5376
while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
5377
pmap = PV_PMAP(pv);
5378
if (!PMAP_TRYLOCK(pmap)) {
5379
pvh_gen = pvh->pv_gen;
5380
md_gen = m->md.pv_gen;
5381
rw_wunlock(lock);
5382
PMAP_LOCK(pmap);
5383
rw_wlock(lock);
5384
if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5385
rw_wunlock(lock);
5386
PMAP_UNLOCK(pmap);
5387
goto retry;
5388
}
5389
}
5390
pmap_resident_count_dec(pmap, 1);
5391
l3e = pmap_pml3e(pmap, pv->pv_va);
5392
KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_remove_all: found"
5393
" a 2mpage in page %p's pv list", m));
5394
pte = pmap_l3e_to_pte(l3e, pv->pv_va);
5395
tpte = be64toh(pte_load_clear(pte));
5396
if (tpte & PG_W)
5397
pmap->pm_stats.wired_count--;
5398
if (tpte & PG_A)
5399
vm_page_aflag_set(m, PGA_REFERENCED);
5400
5401
/*
5402
* Update the vm_page_t clean and reference bits.
5403
*/
5404
if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5405
vm_page_dirty(m);
5406
pmap_unuse_pt(pmap, pv->pv_va, be64toh(*l3e), &free);
5407
pmap_invalidate_page(pmap, pv->pv_va);
5408
TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
5409
m->md.pv_gen++;
5410
free_pv_entry(pmap, pv);
5411
PMAP_UNLOCK(pmap);
5412
}
5413
vm_page_aflag_clear(m, PGA_WRITEABLE);
5414
rw_wunlock(lock);
5415
vm_page_free_pages_toq(&free, true);
5416
}
5417
5418
/*
5419
* Destroy all managed, non-wired mappings in the given user-space
5420
* pmap. This pmap cannot be active on any processor besides the
5421
* caller.
5422
*
5423
* This function cannot be applied to the kernel pmap. Moreover, it
5424
* is not intended for general use. It is only to be used during
5425
* process termination. Consequently, it can be implemented in ways
5426
* that make it faster than pmap_remove(). First, it can more quickly
5427
* destroy mappings by iterating over the pmap's collection of PV
5428
* entries, rather than searching the page table. Second, it doesn't
5429
* have to test and clear the page table entries atomically, because
5430
* no processor is currently accessing the user address space. In
5431
* particular, a page table entry's dirty bit won't change state once
5432
* this function starts.
5433
*
5434
* Although this function destroys all of the pmap's managed,
5435
* non-wired mappings, it can delay and batch the invalidation of TLB
5436
* entries without calling pmap_delayed_invl_started() and
5437
* pmap_delayed_invl_finished(). Because the pmap is not active on
5438
* any other processor, none of these TLB entries will ever be used
5439
* before their eventual invalidation. Consequently, there is no need
5440
* for either pmap_remove_all() or pmap_remove_write() to wait for
5441
* that eventual TLB invalidation.
5442
*/
5443
5444
void
5445
mmu_radix_remove_pages(pmap_t pmap)
5446
{
5447
5448
CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
5449
pml3_entry_t ptel3e;
5450
pt_entry_t *pte, tpte;
5451
struct spglist free;
5452
vm_page_t m, mpte, mt;
5453
pv_entry_t pv;
5454
struct md_page *pvh;
5455
struct pv_chunk *pc, *npc;
5456
struct rwlock *lock;
5457
int64_t bit;
5458
uint64_t inuse, bitmask;
5459
int allfree, field, idx;
5460
#ifdef PV_STATS
5461
int freed;
5462
#endif
5463
bool superpage;
5464
vm_paddr_t pa;
5465
5466
/*
5467
* Assert that the given pmap is only active on the current
5468
* CPU. Unfortunately, we cannot block another CPU from
5469
* activating the pmap while this function is executing.
5470
*/
5471
KASSERT(pmap->pm_pid == mfspr(SPR_PID),
5472
("non-current asid %lu - expected %lu", pmap->pm_pid,
5473
mfspr(SPR_PID)));
5474
5475
lock = NULL;
5476
5477
SLIST_INIT(&free);
5478
PMAP_LOCK(pmap);
5479
TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5480
allfree = 1;
5481
#ifdef PV_STATS
5482
freed = 0;
5483
#endif
5484
for (field = 0; field < _NPCM; field++) {
5485
inuse = ~pc->pc_map[field] & pc_freemask[field];
5486
while (inuse != 0) {
5487
bit = cnttzd(inuse);
5488
bitmask = 1UL << bit;
5489
idx = field * 64 + bit;
5490
pv = &pc->pc_pventry[idx];
5491
inuse &= ~bitmask;
5492
5493
pte = pmap_pml2e(pmap, pv->pv_va);
5494
ptel3e = be64toh(*pte);
5495
pte = pmap_l2e_to_l3e(pte, pv->pv_va);
5496
tpte = be64toh(*pte);
5497
if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) {
5498
superpage = false;
5499
ptel3e = tpte;
5500
pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5501
PG_FRAME);
5502
pte = &pte[pmap_pte_index(pv->pv_va)];
5503
tpte = be64toh(*pte);
5504
} else {
5505
/*
5506
* Keep track of whether 'tpte' is a
5507
* superpage explicitly instead of
5508
* relying on RPTE_LEAF being set.
5509
*
5510
* This is because RPTE_LEAF is numerically
5511
* identical to PG_PTE_PAT and thus a
5512
* regular page could be mistaken for
5513
* a superpage.
5514
*/
5515
superpage = true;
5516
}
5517
5518
if ((tpte & PG_V) == 0) {
5519
panic("bad pte va %lx pte %lx",
5520
pv->pv_va, tpte);
5521
}
5522
5523
/*
5524
* We cannot remove wired pages from a process' mapping at this time
5525
*/
5526
if (tpte & PG_W) {
5527
allfree = 0;
5528
continue;
5529
}
5530
5531
if (superpage)
5532
pa = tpte & PG_PS_FRAME;
5533
else
5534
pa = tpte & PG_FRAME;
5535
5536
m = PHYS_TO_VM_PAGE(pa);
5537
KASSERT(m->phys_addr == pa,
5538
("vm_page_t %p phys_addr mismatch %016jx %016jx",
5539
m, (uintmax_t)m->phys_addr,
5540
(uintmax_t)tpte));
5541
5542
KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5543
m < &vm_page_array[vm_page_array_size],
5544
("pmap_remove_pages: bad tpte %#jx",
5545
(uintmax_t)tpte));
5546
5547
pte_clear(pte);
5548
5549
/*
5550
* Update the vm_page_t clean/reference bits.
5551
*/
5552
if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5553
if (superpage) {
5554
for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
5555
vm_page_dirty(mt);
5556
} else
5557
vm_page_dirty(m);
5558
}
5559
5560
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5561
5562
/* Mark free */
5563
pc->pc_map[field] |= bitmask;
5564
if (superpage) {
5565
pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
5566
pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5567
TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
5568
pvh->pv_gen++;
5569
if (TAILQ_EMPTY(&pvh->pv_list)) {
5570
for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
5571
if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
5572
TAILQ_EMPTY(&mt->md.pv_list))
5573
vm_page_aflag_clear(mt, PGA_WRITEABLE);
5574
}
5575
mpte = pmap_remove_pt_page(pmap, pv->pv_va);
5576
if (mpte != NULL) {
5577
pmap_resident_count_dec(pmap, 1);
5578
KASSERT(mpte->ref_count == NPTEPG,
5579
("pmap_remove_pages: pte page wire count error"));
5580
mpte->ref_count = 0;
5581
pmap_add_delayed_free_list(mpte, &free, false);
5582
}
5583
} else {
5584
pmap_resident_count_dec(pmap, 1);
5585
#ifdef VERBOSE_PV
5586
printf("freeing pv (%p, %p)\n",
5587
pmap, pv);
5588
#endif
5589
TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
5590
m->md.pv_gen++;
5591
if ((m->a.flags & PGA_WRITEABLE) != 0 &&
5592
TAILQ_EMPTY(&m->md.pv_list) &&
5593
(m->flags & PG_FICTITIOUS) == 0) {
5594
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5595
if (TAILQ_EMPTY(&pvh->pv_list))
5596
vm_page_aflag_clear(m, PGA_WRITEABLE);
5597
}
5598
}
5599
pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free);
5600
#ifdef PV_STATS
5601
freed++;
5602
#endif
5603
}
5604
}
5605
PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5606
PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5607
PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5608
if (allfree) {
5609
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5610
free_pv_chunk(pc);
5611
}
5612
}
5613
if (lock != NULL)
5614
rw_wunlock(lock);
5615
pmap_invalidate_all(pmap);
5616
PMAP_UNLOCK(pmap);
5617
vm_page_free_pages_toq(&free, true);
5618
}
5619
5620
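/*
 * Clear the write and modified bits in each of the given page's managed
 * mappings.  2MB mappings on the page's pv-header list are demoted first
 * so that the underlying 4KB PTEs can be updated individually; any pending
 * modification recorded in PG_M is transferred to the vm_page before
 * PGA_WRITEABLE is cleared.
 */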
void
mmu_radix_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	struct rwlock *lock;
	pv_entry_t next_pv, pv;
	pml3_entry_t *l3e;
	pt_entry_t oldpte, *pte;
	int pvh_gen, md_gen;

	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
retry_pv_loop:
	rw_wlock(lock);
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				rw_wunlock(lock);
				goto retry_pv_loop;
			}
		}
		l3e = pmap_pml3e(pmap, pv->pv_va);
		if ((be64toh(*l3e) & PG_RW) != 0)
			(void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock);
		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
		    ("inconsistent pv lock %p %p for page %p",
		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen ||
			    md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				rw_wunlock(lock);
				goto retry_pv_loop;
			}
		}
		l3e = pmap_pml3e(pmap, pv->pv_va);
		KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0,
		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
		    m));
		pte = pmap_l3e_to_pte(l3e, pv->pv_va);
retry:
		oldpte = be64toh(*pte);
		if (oldpte & PG_RW) {
			if (!atomic_cmpset_long(pte, htobe64(oldpte),
			    htobe64((oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M))))
				goto retry;
			if ((oldpte & PG_M) != 0)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	vm_page_aflag_clear(m, PGA_WRITEABLE);
}

/*
 * Clear the wired attribute from the mappings for the specified range of
 * addresses in the given pmap.  Every valid mapping within that range
 * must have the wired attribute set.  In contrast, invalid mappings
 * cannot have the wired attribute set, so they are ignored.
 *
 * The wired attribute of the page table entry is not a hardware
 * feature, so there is no need to invalidate any TLB entries.
 * Since pmap_demote_l3e() for the wired entry must never fail,
 * pmap_delayed_invl_started()/finished() calls around the
 * function are not needed.
 */
void
mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t va_next;
	pml1_entry_t *l1e;
	pml2_entry_t *l2e;
	pml3_entry_t *l3e;
	pt_entry_t *pte;

	CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		l1e = pmap_pml1e(pmap, sva);
		if ((be64toh(*l1e) & PG_V) == 0) {
			va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}
		l2e = pmap_l1e_to_l2e(l1e, sva);
		if ((be64toh(*l2e) & PG_V) == 0) {
			va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}
		va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
		if (va_next < sva)
			va_next = eva;
		l3e = pmap_l2e_to_l3e(l2e, sva);
		if ((be64toh(*l3e) & PG_V) == 0)
			continue;
		if ((be64toh(*l3e) & RPTE_LEAF) != 0) {
			if ((be64toh(*l3e) & PG_W) == 0)
				panic("pmap_unwire: pde %#jx is missing PG_W",
				    (uintmax_t)(be64toh(*l3e)));

			/*
			 * Are we unwiring the entire large page?  If not,
			 * demote the mapping and fall through.
			 */
			if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
				atomic_clear_long(l3e, htobe64(PG_W));
				pmap->pm_stats.wired_count -= L3_PAGE_SIZE /
				    PAGE_SIZE;
				continue;
			} else if (!pmap_demote_l3e(pmap, l3e, sva))
				panic("pmap_unwire: demotion failed");
		}
		if (va_next > eva)
			va_next = eva;
		for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
		    sva += PAGE_SIZE) {
			MPASS(pte == pmap_pte(pmap, sva));
			if ((be64toh(*pte) & PG_V) == 0)
				continue;
			if ((be64toh(*pte) & PG_W) == 0)
				panic("pmap_unwire: pte %#jx is missing PG_W",
				    (uintmax_t)(be64toh(*pte)));

			/*
			 * PG_W must be cleared atomically.  Although the pmap
			 * lock synchronizes access to PG_W, another processor
			 * could be setting PG_M and/or PG_A concurrently.
			 */
			atomic_clear_long(pte, htobe64(PG_W));
			pmap->pm_stats.wired_count--;
		}
	}
	PMAP_UNLOCK(pmap);
}

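/*
 * Zero the specified page of physical memory through its permanent
 * direct-map address; no temporary mapping is required.
 */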
void
mmu_radix_zero_page(vm_page_t m)
{
	vm_offset_t addr;

	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
	addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
	pagezero(addr);
}

void
mmu_radix_zero_page_area(vm_page_t m, int off, int size)
{
	caddr_t addr;

	CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size);
	MPASS(off + size <= PAGE_SIZE);
	addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
	memset(addr + off, 0, size);
}

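/*
 * Return the mincore(2) status bits for the page mapped at the given
 * address in the given pmap: residency, 2MB page size, and the
 * modified/referenced state visible in this mapping.  For a valid,
 * managed mapping whose "other" modified/referenced state is not already
 * known, the physical address is handed back in '*locked_pa' so that the
 * caller can examine the vm_page itself.
 */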
static int
mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
{
	pml3_entry_t *l3ep;
	pt_entry_t pte;
	vm_paddr_t pa;
	int val;

	CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
	PMAP_LOCK(pmap);

	l3ep = pmap_pml3e(pmap, addr);
	if (l3ep != NULL && (be64toh(*l3ep) & PG_V)) {
		if (be64toh(*l3ep) & RPTE_LEAF) {
			pte = be64toh(*l3ep);
			/* Compute the physical address of the 4KB page. */
			pa = ((be64toh(*l3ep) & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) &
			    PG_FRAME;
			val = MINCORE_PSIND(1);
		} else {
			/* Native endian PTE, do not pass to functions */
			pte = be64toh(*pmap_l3e_to_pte(l3ep, addr));
			pa = pte & PG_FRAME;
			val = 0;
		}
	} else {
		pte = 0;
		pa = 0;
		val = 0;
	}
	if ((pte & PG_V) != 0) {
		val |= MINCORE_INCORE;
		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		if ((pte & PG_A) != 0)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
	}
	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
		*locked_pa = pa;
	}
	PMAP_UNLOCK(pmap);
	return (val);
}

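/*
 * Make the given thread's pmap the one that is active on the current CPU
 * by loading its address space identifier into SPR_PID.  The PID is only
 * switched for pmaps with a user address-space ID (pm_pid above
 * isa3_base_pid) and only when it differs from the PID that is already
 * loaded.
 */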
void
mmu_radix_activate(struct thread *td)
{
	pmap_t pmap;
	uint32_t curpid;

	CTR2(KTR_PMAP, "%s(%p)", __func__, td);
	critical_enter();
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	curpid = mfspr(SPR_PID);
	if (pmap->pm_pid > isa3_base_pid &&
	    curpid != pmap->pm_pid) {
		mmu_radix_pid_set(pmap);
	}
	critical_exit();
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr,
	    size);
	if (size < L3_PAGE_SIZE)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & L3_PAGE_MASK;
	if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE ||
	    (*addr & L3_PAGE_MASK) == superpage_offset)
		return;
	if ((*addr & L3_PAGE_MASK) < superpage_offset)
		*addr = (*addr & ~L3_PAGE_MASK) + superpage_offset;
	else
		*addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset;
}

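/*
 * Map a range of device (non-system) physical memory into kernel virtual
 * address space with the requested memory attribute.  The range must lie
 * above the last physical memory address (Maxmem); RAM addresses panic
 * here, presumably because they are already reachable through the direct
 * map.
 *
 * This is normally reached through the machine-independent pmap_mapdev()
 * and pmap_mapdev_attr() interfaces rather than called directly.  A rough,
 * illustrative sketch only (the 'bar_pa', 'bar_size', and 'regs' names are
 * hypothetical):
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev_attr(bar_pa, bar_size, VM_MEMATTR_UNCACHEABLE);
 *	...
 *	pmap_unmapdev(regs, bar_size);
 */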
static void *
mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr)
{
	vm_offset_t va, tmpva, ppa, offset;

	ppa = trunc_page(pa);
	offset = pa & PAGE_MASK;
	size = roundup2(offset + size, PAGE_SIZE);
	if (pa < powerpc_ptob(Maxmem))
		panic("bad pa: %#lx less than Maxmem %#lx\n",
		    pa, powerpc_ptob(Maxmem));
	va = kva_alloc(size);
	if (bootverbose)
		printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr);
	KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr));

	if (!va)
		panic("%s: Couldn't alloc kernel virtual memory", __func__);

	for (tmpva = va; size > 0;) {
		mmu_radix_kenter_attr(tmpva, ppa, attr);
		size -= PAGE_SIZE;
		tmpva += PAGE_SIZE;
		ppa += PAGE_SIZE;
	}
	ptesync();

	return ((void *)(va + offset));
}

static void *
mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size)
{

	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);

	return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT));
}

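/*
 * Record the requested memory attribute for the given page and, for a
 * normal (non-fictitious) page, update its direct-map mapping to match so
 * that kernel accesses through the DMAP use the new attribute.
 */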
void
mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{

	CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma);

	if (m->md.mdpg_cache_attrs == ma)
		return;

	m->md.mdpg_cache_attrs = ma;

	/*
	 * If "m" is a normal page, update its direct mapping.  This update
	 * can be relied upon to perform any cache operations that are
	 * required for data coherence.
	 */
	if ((m->flags & PG_FICTITIOUS) == 0 &&
	    mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
	    PAGE_SIZE, m->md.mdpg_cache_attrs))
		panic("memory attribute change on the direct map failed");
}

static void
mmu_radix_unmapdev(void *p, vm_size_t size)
{
	vm_offset_t offset, va;

	CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, p, size);

	/* If we gave a direct map region in pmap_mapdev, do nothing */
	va = (vm_offset_t)p;
	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
		return;

	offset = va & PAGE_MASK;
	size = round_page(offset + size);
	va = trunc_page(va);

	if (pmap_initialized) {
		mmu_radix_qremove(va, atop(size));
		kva_free(va, size);
	}
}

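/*
 * Synchronize the instruction cache with stores to the given range of the
 * given pmap, one page at a time, using the direct map to reach the
 * backing physical pages.  Unmapped pages within the range are skipped.
 */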
void
mmu_radix_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
{
	vm_paddr_t pa = 0;
	int sync_sz;

	if (__predict_false(pm == NULL))
		pm = &curthread->td_proc->p_vmspace->vm_pmap;

	while (sz > 0) {
		pa = pmap_extract(pm, va);
		sync_sz = PAGE_SIZE - (va & PAGE_MASK);
		sync_sz = min(sync_sz, sz);
		if (pa != 0) {
			pa += (va & PAGE_MASK);
			__syncicache((void *)PHYS_TO_DMAP(pa), sync_sz);
		}
		va += sync_sz;
		sz -= sync_sz;
	}
}

static __inline void
pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask)
{
	uint64_t opte, npte;

	/*
	 * The cache mode bits are all in the low 32-bits of the
	 * PTE, so we can just spin on updating the low 32-bits.
	 */
	do {
		opte = be64toh(*pte);
		npte = opte & ~mask;
		npte |= cache_bits;
	} while (npte != opte && !atomic_cmpset_long(pte, htobe64(opte), htobe64(npte)));
}

/*
 * Tries to demote a 1GB page mapping.
 */
static bool
pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va)
{
	pml2_entry_t oldpdpe;
	pml3_entry_t *firstpde, newpde, *pde;
	vm_paddr_t pdpgpa;
	vm_page_t pdpg;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpdpe = be64toh(*l2e);
	KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
	pdpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
	if (pdpg == NULL) {
		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (false);
	}
	pdpg->pindex = va >> L2_PAGE_SIZE_SHIFT;
	pdpgpa = VM_PAGE_TO_PHYS(pdpg);
	firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa);
	KASSERT((oldpdpe & PG_A) != 0,
	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
	newpde = oldpdpe;

	/*
	 * Initialize the page directory page.
	 */
	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
		*pde = htobe64(newpde);
		newpde += L3_PAGE_SIZE;
	}

	/*
	 * Demote the mapping.
	 */
	pde_store(l2e, pdpgpa);

	/*
	 * Flush PWC --- XXX revisit
	 */
	pmap_invalidate_all(pmap);

	counter_u64_add(pmap_l2e_demotions, 1);
	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
	    " in pmap %p", va, pmap);
	return (true);
}

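/*
 * Extract the physical address backing the given kernel virtual address.
 * Direct-map addresses are translated arithmetically; other kernel
 * addresses are looked up in the kernel page table, handling both 2MB
 * leaf entries and 4KB PTEs.
 */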
vm_paddr_t
mmu_radix_kextract(vm_offset_t va)
{
	pml3_entry_t l3e;
	vm_paddr_t pa;

	CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		/* Big-endian PTE on stack */
		l3e = *pmap_pml3e(kernel_pmap, va);
		if (be64toh(l3e) & RPTE_LEAF) {
			pa = (be64toh(l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK);
			pa |= (va & L3_PAGE_MASK);
		} else {
			/*
			 * Beware of a concurrent promotion that changes the
			 * PDE at this point!  For example, vtopte() must not
			 * be used to access the PTE because it would use the
			 * new PDE.  It is, however, safe to use the old PDE
			 * because the page table page is preserved by the
			 * promotion.
			 */
			pa = be64toh(*pmap_l3e_to_pte(&l3e, va));
			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
			pa |= (va & PAGE_MASK);
		}
	}
	return (pa);
}

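/*
 * Compute the cache-attribute PTE bits for the given physical address:
 * an explicit memory attribute is translated directly, addresses that
 * fall within a known physical memory region are treated as ordinary
 * cacheable memory, and anything else is treated as guarded,
 * cache-inhibited I/O space.
 */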
static pt_entry_t
mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
{

	if (ma != VM_MEMATTR_DEFAULT) {
		return pmap_cache_bits(ma);
	}

	/*
	 * Assume the page is cache inhibited and access is guarded unless
	 * it's in our available memory array.
	 */
	for (int i = 0; i < pregions_sz; i++) {
		if ((pa >= pregions[i].mr_start) &&
		    (pa < (pregions[i].mr_start + pregions[i].mr_size)))
			return (RPTE_ATTR_MEM);
	}
	return (RPTE_ATTR_GUARDEDIO);
}

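/*
 * Install a kernel mapping of the given physical address at the given
 * virtual address, with read/write/privileged permissions and the cache
 * attribute computed by mmu_radix_calc_wimg().  The PTE is written
 * directly; callers such as mmu_radix_mapdev_attr() follow up with a
 * ptesync() to order the update.
 */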
static void
mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
{
	pt_entry_t *pte, pteval;
	uint64_t cache_bits;

	pte = kvtopte(va);
	MPASS(pte != NULL);
	pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
	cache_bits = mmu_radix_calc_wimg(pa, ma);
	pte_store(pte, pteval | cache_bits);
}

void
mmu_radix_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	CTR2(KTR_PMAP, "%s(%#x)", __func__, va);

	pte = kvtopte(va);
	pte_clear(pte);
}

int
mmu_radix_decode_kernel_ptr(vm_offset_t addr,
    int *is_user, vm_offset_t *decoded)
{

	CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr);
	*decoded = addr;
	*is_user = (addr < VM_MAXUSER_ADDRESS);
	return (0);
}

static int
mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
{

	CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
	return (mem_valid(pa, size));
}

static void
mmu_radix_scan_init(void)
{

	CTR1(KTR_PMAP, "%s()", __func__);
	UNIMPLEMENTED();
}

static void
mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz,
    void **va)
{
	CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va);
	UNIMPLEMENTED();
}

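/*
 * "Quick" page mappings: because all of physical memory is reachable
 * through the direct map, quick_enter simply returns the page's DMAP
 * address and quick_remove has nothing to undo.
 */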
vm_offset_t
mmu_radix_quick_enter_page(vm_page_t m)
{
	vm_paddr_t paddr;

	CTR2(KTR_PMAP, "%s(%p)", __func__, m);
	paddr = VM_PAGE_TO_PHYS(m);
	return (PHYS_TO_DMAP(paddr));
}

void
mmu_radix_quick_remove_page(vm_offset_t addr __unused)
{
	/* no work to do here */
	CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
}

static void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
{
	cpu_flush_dcache((void *)sva, eva - sva);
}

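/*
 * Change the memory attribute of an existing range of kernel virtual
 * memory.  This takes the kernel pmap lock and defers the real work to
 * pmap_change_attr_locked(); it is typically reached through the
 * machine-independent pmap_change_attr() interface.
 */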
int
mmu_radix_change_attr(vm_offset_t va, vm_size_t size,
    vm_memattr_t mode)
{
	int error;

	CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode);
	PMAP_LOCK(kernel_pmap);
	error = pmap_change_attr_locked(va, size, mode, true);
	PMAP_UNLOCK(kernel_pmap);
	return (error);
}

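/*
 * Worker for mmu_radix_change_attr(): walk the kernel page tables for the
 * given range, demoting 1GB and 2MB leaf mappings as needed so that the
 * requested cache attribute can be applied at the smallest required
 * granularity, and recursively apply the same change to the direct-map
 * aliases of any affected physical memory.  The kernel pmap lock must be
 * held.
 */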
static int
pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush)
{
	vm_offset_t base, offset, tmpva;
	vm_paddr_t pa_start, pa_end, pa_end1;
	pml2_entry_t *l2e;
	pml3_entry_t *l3e;
	pt_entry_t *pte;
	int cache_bits, error;
	bool changed;

	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	/*
	 * Only supported on kernel virtual addresses, including the direct
	 * map but excluding the recursive map.
	 */
	if (base < DMAP_MIN_ADDRESS)
		return (EINVAL);

	cache_bits = pmap_cache_bits(mode);
	changed = false;

	/*
	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
	 * into 4KB pages if required.
	 */
	for (tmpva = base; tmpva < base + size; ) {
		l2e = pmap_pml2e(kernel_pmap, tmpva);
		if (l2e == NULL || *l2e == 0)
			return (EINVAL);
		if (be64toh(*l2e) & RPTE_LEAF) {
			/*
			 * If the current 1GB page already has the required
			 * memory type, then we need not demote this page.  Just
			 * increment tmpva to the next 1GB page frame.
			 */
			if ((be64toh(*l2e) & RPTE_ATTR_MASK) == cache_bits) {
				tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
				continue;
			}

			/*
			 * If the current offset aligns with a 1GB page frame
			 * and there is at least 1GB left within the range, then
			 * we need not break down this page into 2MB pages.
			 */
			if ((tmpva & L2_PAGE_MASK) == 0 &&
			    tmpva + L2_PAGE_MASK < base + size) {
				tmpva += L2_PAGE_SIZE;
				continue;
			}
			if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva))
				return (ENOMEM);
		}
		l3e = pmap_l2e_to_l3e(l2e, tmpva);
		KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n",
		    tmpva, l2e));
		if (*l3e == 0)
			return (EINVAL);
		if (be64toh(*l3e) & RPTE_LEAF) {
			/*
			 * If the current 2MB page already has the required
			 * memory type, then we need not demote this page.  Just
			 * increment tmpva to the next 2MB page frame.
			 */
			if ((be64toh(*l3e) & RPTE_ATTR_MASK) == cache_bits) {
				tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
				continue;
			}

			/*
			 * If the current offset aligns with a 2MB page frame
			 * and there is at least 2MB left within the range, then
			 * we need not break down this page into 4KB pages.
			 */
			if ((tmpva & L3_PAGE_MASK) == 0 &&
			    tmpva + L3_PAGE_MASK < base + size) {
				tmpva += L3_PAGE_SIZE;
				continue;
			}
			if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva))
				return (ENOMEM);
		}
		pte = pmap_l3e_to_pte(l3e, tmpva);
		if (*pte == 0)
			return (EINVAL);
		tmpva += PAGE_SIZE;
	}
	error = 0;

	/*
	 * Ok, all the pages exist, so run through them updating their
	 * cache mode if required.
	 */
	pa_start = pa_end = 0;
	for (tmpva = base; tmpva < base + size; ) {
		l2e = pmap_pml2e(kernel_pmap, tmpva);
		if (be64toh(*l2e) & RPTE_LEAF) {
			if ((be64toh(*l2e) & RPTE_ATTR_MASK) != cache_bits) {
				pmap_pte_attr(l2e, cache_bits,
				    RPTE_ATTR_MASK);
				changed = true;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (be64toh(*l2e) & PG_PS_FRAME) < dmaplimit) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = be64toh(*l2e) & PG_PS_FRAME;
					pa_end = pa_start + L2_PAGE_SIZE;
				} else if (pa_end == (be64toh(*l2e) & PG_PS_FRAME))
					pa_end += L2_PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode, flush);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = be64toh(*l2e) & PG_PS_FRAME;
					pa_end = pa_start + L2_PAGE_SIZE;
				}
			}
			tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
			continue;
		}
		l3e = pmap_l2e_to_l3e(l2e, tmpva);
		if (be64toh(*l3e) & RPTE_LEAF) {
			if ((be64toh(*l3e) & RPTE_ATTR_MASK) != cache_bits) {
				pmap_pte_attr(l3e, cache_bits,
				    RPTE_ATTR_MASK);
				changed = true;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (be64toh(*l3e) & PG_PS_FRAME) < dmaplimit) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = be64toh(*l3e) & PG_PS_FRAME;
					pa_end = pa_start + L3_PAGE_SIZE;
				} else if (pa_end == (be64toh(*l3e) & PG_PS_FRAME))
					pa_end += L3_PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode, flush);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = be64toh(*l3e) & PG_PS_FRAME;
					pa_end = pa_start + L3_PAGE_SIZE;
				}
			}
			tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
		} else {
			pte = pmap_l3e_to_pte(l3e, tmpva);
			if ((be64toh(*pte) & RPTE_ATTR_MASK) != cache_bits) {
				pmap_pte_attr(pte, cache_bits,
				    RPTE_ATTR_MASK);
				changed = true;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
			    (be64toh(*pte) & PG_FRAME) < dmaplimit) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = be64toh(*pte) & PG_FRAME;
					pa_end = pa_start + PAGE_SIZE;
				} else if (pa_end == (be64toh(*pte) & PG_FRAME))
					pa_end += PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode, flush);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = be64toh(*pte) & PG_FRAME;
					pa_end = pa_start + PAGE_SIZE;
				}
			}
			tmpva += PAGE_SIZE;
		}
	}
	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
		pa_end1 = MIN(pa_end, dmaplimit);
		if (pa_start != pa_end1)
			error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
			    pa_end1 - pa_start, mode, flush);
	}

	/*
	 * Flush CPU caches if required to make sure any data isn't cached that
	 * shouldn't be, etc.
	 */
	if (changed) {
		pmap_invalidate_all(kernel_pmap);

		if (flush)
			pmap_invalidate_cache_range(base, tmpva);
	}
	return (error);
}

/*
 * Allocate physical memory for the vm_page array and map it into KVA,
 * attempting to back the vm_pages with domain-local memory.
 */
void
mmu_radix_page_array_startup(long pages)
{
#ifdef notyet
	pml2_entry_t *l2e;
	pml3_entry_t *pde;
	pml3_entry_t newl3;
	vm_offset_t va;
	long pfn;
	int domain, i;
#endif
	vm_paddr_t pa;
	vm_offset_t start, end;

	vm_page_array_size = pages;

	start = VM_MIN_KERNEL_ADDRESS;
	end = start + pages * sizeof(struct vm_page);

	pa = vm_phys_early_alloc(-1, end - start);

	start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT);
#ifdef notyet
	/* TODO: NUMA vm_page_array.  Blocked out until then (copied from amd64). */
	for (va = start; va < end; va += L3_PAGE_SIZE) {
		pfn = first_page + (va - start) / sizeof(struct vm_page);
		domain = vm_phys_domain(ptoa(pfn));
		l2e = pmap_pml2e(kernel_pmap, va);
		if ((be64toh(*l2e) & PG_V) == 0) {
			pa = vm_phys_early_alloc(domain, PAGE_SIZE);
			dump_add_page(pa);
			pagezero(PHYS_TO_DMAP(pa));
			pde_store(l2e, (pml2_entry_t)pa);
		}
		pde = pmap_l2e_to_l3e(l2e, va);
		if ((be64toh(*pde) & PG_V) != 0)
			panic("Unexpected pde %p", pde);
		pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE);
		for (i = 0; i < NPDEPG; i++)
			dump_add_page(pa + i * PAGE_SIZE);
		newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W);
		pte_store(pde, newl3);
	}
#endif
	vm_page_array = (vm_page_t)start;
}

#ifdef DDB
#include <sys/kdb.h>
#include <ddb/ddb.h>

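/*
 * DDB helper: walk the radix page-table hierarchy rooted at 'l1' for the
 * given virtual address and print each level's entry.
 */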
static void
pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va)
{
	pml1_entry_t *l1e;
	pml2_entry_t *l2e;
	pml3_entry_t *l3e;
	pt_entry_t *pte;

	l1e = &l1[pmap_pml1e_index(va)];
	db_printf("VA %#016lx l1e %#016lx", va, be64toh(*l1e));
	if ((be64toh(*l1e) & PG_V) == 0) {
		db_printf("\n");
		return;
	}
	l2e = pmap_l1e_to_l2e(l1e, va);
	db_printf(" l2e %#016lx", be64toh(*l2e));
	if ((be64toh(*l2e) & PG_V) == 0 || (be64toh(*l2e) & RPTE_LEAF) != 0) {
		db_printf("\n");
		return;
	}
	l3e = pmap_l2e_to_l3e(l2e, va);
	db_printf(" l3e %#016lx", be64toh(*l3e));
	if ((be64toh(*l3e) & PG_V) == 0 || (be64toh(*l3e) & RPTE_LEAF) != 0) {
		db_printf("\n");
		return;
	}
	pte = pmap_l3e_to_pte(l3e, va);
	db_printf(" pte %#016lx\n", be64toh(*pte));
}

void
pmap_page_print_mappings(vm_page_t m)
{
	pmap_t pmap;
	pv_entry_t pv;

	db_printf("page %p(%lx)\n", m, m->phys_addr);
	/* need to elide locks if running in ddb */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
		db_printf("pv: %p ", pv);
		db_printf("va: %#016lx ", pv->pv_va);
		pmap = PV_PMAP(pv);
		db_printf("pmap %p ", pmap);
		if (pmap != NULL) {
			db_printf("asid: %lu\n", pmap->pm_pid);
			pmap_pte_walk(pmap->pm_pml1, pv->pv_va);
		}
	}
}

DB_SHOW_COMMAND(pte, pmap_print_pte)
{
	vm_offset_t va;
	pmap_t pmap;

	if (!have_addr) {
		db_printf("show pte addr\n");
		return;
	}
	va = (vm_offset_t)addr;

	if (va >= DMAP_MIN_ADDRESS)
		pmap = kernel_pmap;
	else if (kdb_thread != NULL)
		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
	else
		pmap = vmspace_pmap(curthread->td_proc->p_vmspace);

	pmap_pte_walk(pmap->pm_pml1, va);
}

#endif